// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <boost/container/flat_set.hpp>
#include "boost/algorithm/string.hpp"

#include "include/cpp-btree/btree_set.h"

#include "bluestore_common.h"
#include "BlueStore.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/RWLock.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_cache_other);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);


// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value (for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block. always use this size. note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE 4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED 8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8 // has spanning blob id
#define BLOBID_SHIFT_BITS      4

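// Illustration (not taken from encode_some() itself): a spanning blob with
// id 5 whose extent starts where the previous one ended could be encoded as
//   (5 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_SPANNING | BLOBID_FLAG_CONTIGUOUS
// == 0x59; the decoder recovers the id via (blobid >> BLOBID_SHIFT_BITS) and
// the flags from the low 4 bits.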
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
 *         we are done. otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does. We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 */
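// Worked example (illustrative): append_escaped("a#b", &out) appends
// "a#23b!" -- '#' (0x23) is escaped as "#23" and '!' terminates the string.
// decode_escaped() below reverses this, returning the number of encoded
// bytes consumed before the terminating '!'.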
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') {
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}

// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i=0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
        (mode == HEX && in.length() - i >= 4 &&
         ((in[i] < 32 || (unsigned char)in[i] > 126) ||
          (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
          (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
          (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
        out.append(in.c_str() + from, i - from);
        out.push_back('\'');
      }
      if (mode != HEX) {
        out.append("0x");
        mode = HEX;
      }
      if (in.length() - i >= 4) {
        // print a whole u32 at once
        snprintf(buf, sizeof(buf), "%08x",
                 (uint32_t)(((unsigned char)in[i] << 24) |
                            ((unsigned char)in[i+1] << 16) |
                            ((unsigned char)in[i+2] << 8) |
                            ((unsigned char)in[i+3] << 0)));
        i += 3;
      } else {
        snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
        out.push_back('\'');
        mode = STRING;
        from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}
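
// e.g. the bytes { 0x01, 0x02, 0x03, 0x04, 'a', 'b' } render as
// "0x01020304'ab'": unprintable runs are shown in hex (grouped into whole
// u32s where at least 4 bytes remain) and printable runs are quoted.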

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}
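
// The 0x80 bias keeps signed shard ids sorting correctly as unsigned bytes:
// e.g. shard_id_t::NO_SHARD (-1) encodes to 0x7f and therefore sorts before
// shard 0 (0x80), shard 1 (0x81), and so on.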

static void get_coll_key_range(const coll_t& cid, int bits,
                               string *temp_start, string *temp_end,
                               string *start, string *end)
{
  temp_start->clear();
  temp_end->clear();
  start->clear();
  end->clear();

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    _key_encode_shard(pgid.shard, start);
    *temp_start = *start;

    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    _key_encode_u32(reverse_hash, start);
    _key_encode_u32(reverse_hash, temp_start);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    _key_encode_u32(end_hash, end);
    _key_encode_u32(end_hash, temp_end);
  } else {
    _key_encode_shard(shard_id_t::NO_SHARD, start);
    _key_encode_u64(-1ull + 0x8000000000000000ull, start);
    *end = *start;
    _key_encode_u32(0, start);
    _key_encode_u32(0xffffffff, end);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }
}
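
// e.g. for pool 3 the temp keys are filed under pool id -5 (-2 - 3): with
// the +2^63 bias applied above, every temp pool sorts below every regular
// pool while temp objects remain segregated per source pool.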

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < 1 + 8 + 4)
    return -1;
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something went wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = 1 + 8 + 4 +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << " r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << " t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}
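
// Schematically (illustrative), an object "foo" with no distinct key in
// namespace "ns" of pool 2, shard NO_SHARD, encodes as
//   <shard 0x7f><u64 2 + 2^63><u32 bit-reversed hash>ns!foo!=<snap><gen>o
// where each escaped string ends in '!' and '=' records key == name.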


// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}
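
// e.g. the shard starting at logical offset 0x30000 of an onode is keyed as
// <onode key><encoded u32 0x30000>'x'; is_extent_shard_key() only needs to
// inspect that final byte to tell it apart from the onode key itself.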

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << " shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << " " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << " attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// merge operators

struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    ceph_assert(llen == rlen);
    ceph_assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const ceph_le64* lv = (const ceph_le64*)ldata;
    const ceph_le64* rv = (const ceph_le64*)rdata;
    ceph_le64* nv = &(ceph_le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  const char *name() const override {
    return "int64_array";
  }
};
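
// e.g. merging the serialized arrays {1, 2} and {10, 20} yields {11, 22};
// this is what lets BlueStore accumulate int64-array stats (such as the
// statfs values under PREFIX_STAT) without read-modify-write round trips.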


// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{

  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
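
// e.g. a fully dereferenced compressed blob occupying 4 allocation units on
// disk, whose rewritten data is expected to consume 1 newly allocated AU,
// scores a benefit of 4 - 1 = 3; its extents are queued for collection once
// that meets bluestore_gc_enable_blob_threshold.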

// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
  }

  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << " rm " << o->oid << " "
               << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
        lru.erase(p--);
      } else {
        ceph_assert(n == 0);
        lru.erase(p);
      }
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};

// OnodeCacheShard
BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}

// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
        // stop if lru is now empty
        break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }
#ifdef DEBUG_CACHE
  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
           << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
        derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted
  uint64_t buffer_bytes = 0; ///< bytes

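  // 2Q in brief: buffers are first admitted to warm_in; when trimmed out of
  // warm_in their data is dropped but a dataless "ghost" entry lingers in
  // warm_out, and a buffer re-populated while its ghost is still present is
  // promoted to hot. One-shot reads thus wash through warm_in without
  // displacing the hot working set.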
  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in. move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot. fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on average buffer size too,
      // which is typical (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// BufferCacheShard

BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}

// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
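
// e.g. discarding 0x1000~0x1000 from a clean buffer covering 0x0~0x3000
// truncates it in place to 0x0~0x1000 and re-inserts the tail as a new
// buffer at 0x2000~0x1000; only the middle range is dropped.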

void BlueStore::BufferSpace::read(
  BufferCacheShard* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}

void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }
  cache->_trim();
  cache->_audit("finish_write end");
}

void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
                      0, p->second.get());
      }
      cache->_adjust_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
  cache->_trim();
}
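
// e.g. split at pos=0x2000: a buffer at 0x3000~0x1000 moves into r rebased
// to 0x1000~0x1000, while one straddling 0x1800~0x1000 is cut into
// 0x1800~0x800 (kept here) and 0x0~0x800 (moved into r).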
1618
1619// OnodeSpace
1620
1621#undef dout_prefix
1622#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1623
f6b5b4d7
TL
1624BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
1625 OnodeRef& o)
7c673cae 1626{
11fdf7f2 1627 std::lock_guard l(cache->lock);
7c673cae
FG
1628 auto p = onode_map.find(oid);
1629 if (p != onode_map.end()) {
1630 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1631 << " raced, returning existing " << p->second
1632 << dendl;
1633 return p->second;
1634 }
f6b5b4d7 1635 ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
7c673cae 1636 onode_map[oid] = o;
f6b5b4d7 1637 cache->_add(o.get(), 1);
9f95a23c 1638 cache->_trim();
7c673cae
FG
1639 return o;
1640}
1641
f6b5b4d7
TL
1642void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
1643{
1644 ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
1645 onode_map.erase(oid);
1646}
1647
7c673cae
FG
1648BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1649{
7c673cae 1650 ldout(cache->cct, 30) << __func__ << dendl;
224ce89b
WB
1651 OnodeRef o;
1652 bool hit = false;
1653
1654 {
11fdf7f2 1655 std::lock_guard l(cache->lock);
224ce89b
WB
1656 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1657 if (p == onode_map.end()) {
1658 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1659 } else {
1660 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
f6b5b4d7
TL
1661 << " " << p->second->nref
1662 << " " << p->second->cached
1663 << " " << p->second->pinned
224ce89b 1664 << dendl;
f6b5b4d7
TL
1665 // This will pin onode and implicitly touch the cache when Onode
1666 // eventually will become unpinned
224ce89b 1667 o = p->second;
f6b5b4d7
TL
1668 ceph_assert(!o->cached || o->pinned);
1669
1670 hit = true;
224ce89b
WB
1671 }
1672 }
1673
1674 if (hit) {
1675 cache->logger->inc(l_bluestore_onode_hits);
1676 } else {
7c673cae 1677 cache->logger->inc(l_bluestore_onode_misses);
7c673cae 1678 }
224ce89b 1679 return o;
7c673cae
FG
1680}
1681
1682void BlueStore::OnodeSpace::clear()
1683{
11fdf7f2 1684 std::lock_guard l(cache->lock);
f6b5b4d7 1685 ldout(cache->cct, 10) << __func__ << " " << onode_map.size()<< dendl;
7c673cae 1686 for (auto &p : onode_map) {
f6b5b4d7 1687 cache->_rm(p.second.get());
7c673cae
FG
1688 }
1689 onode_map.clear();
1690}
1691
1692bool BlueStore::OnodeSpace::empty()
1693{
11fdf7f2 1694 std::lock_guard l(cache->lock);
7c673cae
FG
1695 return onode_map.empty();
1696}
1697
1698void BlueStore::OnodeSpace::rename(
1699 OnodeRef& oldo,
1700 const ghobject_t& old_oid,
1701 const ghobject_t& new_oid,
31f18b77 1702 const mempool::bluestore_cache_other::string& new_okey)
7c673cae 1703{
11fdf7f2 1704 std::lock_guard l(cache->lock);
7c673cae
FG
1705 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1706 << dendl;
1707 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1708 po = onode_map.find(old_oid);
1709 pn = onode_map.find(new_oid);
11fdf7f2 1710 ceph_assert(po != pn);
7c673cae 1711
11fdf7f2 1712 ceph_assert(po != onode_map.end());
7c673cae
FG
1713 if (pn != onode_map.end()) {
1714 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1715 << dendl;
f6b5b4d7 1716 cache->_rm(pn->second.get());
7c673cae
FG
1717 onode_map.erase(pn);
1718 }
1719 OnodeRef o = po->second;
1720
1721 // install a non-existent onode at old location
1722 oldo.reset(new Onode(o->c, old_oid, o->key));
1723 po->second = oldo;
f6b5b4d7
TL
1724 cache->_add(oldo.get(), 1);
1725 // add at new position and fix oid, key.
1726 // This will pin 'o' and implicitly touch cache
1727 // when it will eventually become unpinned
7c673cae 1728 onode_map.insert(make_pair(new_oid, o));
f6b5b4d7
TL
1729 ceph_assert(o->pinned);
1730
7c673cae
FG
1731 o->oid = new_oid;
1732 o->key = new_okey;
9f95a23c 1733 cache->_trim();
7c673cae
FG
1734}
1735
1736bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1737{
11fdf7f2 1738 std::lock_guard l(cache->lock);
7c673cae
FG
1739 ldout(cache->cct, 20) << __func__ << dendl;
1740 for (auto& i : onode_map) {
1741 if (f(i.second)) {
1742 return true;
1743 }
1744 }
1745 return false;
1746}
1747
11fdf7f2
TL
1748template <int LogLevelV = 30>
1749void BlueStore::OnodeSpace::dump(CephContext *cct)
3efd9988
FG
1750{
1751 for (auto& i : onode_map) {
f6b5b4d7
TL
1752 ldout(cct, LogLevelV) << i.first << " : " << i.second
1753 << " " << i.second->nref
1754 << " " << i.second->cached
1755 << " " << i.second->pinned
1756 << dendl;
3efd9988
FG
1757 }
1758}
7c673cae
FG
1759
1760// SharedBlob
1761
1762#undef dout_prefix
1763#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
9f95a23c
TL
1764#undef dout_context
1765#define dout_context coll->store->cct
7c673cae 1766
9f95a23c 1767void BlueStore::SharedBlob::dump(Formatter* f) const
7c673cae 1768{
9f95a23c
TL
1769 f->dump_bool("loaded", loaded);
1770 if (loaded) {
1771 persistent->dump(f);
1772 } else {
1773 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
1774 }
1775}
1776
1777ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1778{
1779 out << "SharedBlob(" << &sb;
1780
7c673cae
FG
1781 if (sb.loaded) {
1782 out << " loaded " << *sb.persistent;
1783 } else {
1784 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1785 }
1786 return out << ")";
1787}
1788
1789BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1790 : coll(_coll), sbid_unloaded(i)
1791{
11fdf7f2 1792 ceph_assert(sbid_unloaded > 0);
7c673cae
FG
1793 if (get_cache()) {
1794 get_cache()->add_blob();
1795 }
1796}
1797
1798BlueStore::SharedBlob::~SharedBlob()
1799{
7c673cae
FG
1800 if (loaded && persistent) {
1801 delete persistent;
1802 }
1803}
1804
1805void BlueStore::SharedBlob::put()
1806{
1807 if (--nref == 0) {
9f95a23c
TL
1808 dout(20) << __func__ << " " << this
1809 << " removing self from set " << get_parent()
1810 << dendl;
1adf2230
AA
1811 again:
1812 auto coll_snap = coll;
1813 if (coll_snap) {
11fdf7f2 1814 std::lock_guard l(coll_snap->cache->lock);
1adf2230
AA
1815 if (coll_snap != coll) {
1816 goto again;
1817 }
91327a77
AA
1818 if (!coll_snap->shared_blob_set.remove(this, true)) {
1819 // race with lookup
1820 return;
1821 }
1adf2230
AA
1822 bc._clear(coll_snap->cache);
1823 coll_snap->cache->rm_blob();
7c673cae 1824 }
28e407b8 1825 delete this;
7c673cae
FG
1826 }
1827}
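// A minimal standalone sketch of the snapshot/lock/recheck pattern used by
// SharedBlob::put() above. Owner and Member are hypothetical stand-ins for
// Collection and SharedBlob; the point is that 'owner' may be repointed by a
// concurrent thread, so we lock a snapshot and retry if it went stale.
#include <mutex>

struct Owner { std::mutex lock; };

struct Member {
  Owner* owner = nullptr;            // may be repointed by another thread
  void detach() {
  again:
    Owner* snap = owner;             // unlocked snapshot
    if (snap) {
      std::lock_guard<std::mutex> l(snap->lock);
      if (snap != owner) {           // owner changed while we waited
        goto again;                  // retry against the new owner
      }
      // ... safe to unlink from snap here, under snap->lock ...
    }
  }
};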
1828
1829void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1830{
11fdf7f2 1831 ceph_assert(persistent);
7c673cae
FG
1832 persistent->ref_map.get(offset, length);
1833}
1834
1835void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
31f18b77 1836 PExtentVector *r,
11fdf7f2 1837 bool *unshare)
7c673cae 1838{
11fdf7f2
TL
1839 ceph_assert(persistent);
1840 persistent->ref_map.put(offset, length, r,
1841 unshare && !*unshare ? unshare : nullptr);
7c673cae
FG
1842}
1843
f64942e4
AA
1844void BlueStore::SharedBlob::finish_write(uint64_t seq)
1845{
1846 while (true) {
9f95a23c 1847 BufferCacheShard *cache = coll->cache;
11fdf7f2 1848 std::lock_guard l(cache->lock);
f64942e4 1849 if (coll->cache != cache) {
9f95a23c
TL
1850 dout(20) << __func__
1851 << " raced with sb cache update, was " << cache
1852 << ", now " << coll->cache << ", retrying"
1853 << dendl;
f64942e4
AA
1854 continue;
1855 }
1856 bc._finish_write(cache, seq);
1857 break;
1858 }
1859}
1860
3efd9988
FG
1861// SharedBlobSet
1862
1863#undef dout_prefix
1864#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
1865
11fdf7f2
TL
1866template <int LogLevelV = 30>
1867void BlueStore::SharedBlobSet::dump(CephContext *cct)
3efd9988 1868{
11fdf7f2 1869 std::lock_guard l(lock);
3efd9988 1870 for (auto& i : sb_map) {
11fdf7f2 1871 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
3efd9988
FG
1872 }
1873}
1874
7c673cae
FG
1875// Blob
1876
1877#undef dout_prefix
1878#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1879
9f95a23c
TL
1880void BlueStore::Blob::dump(Formatter* f) const
1881{
1882 if (is_spanning()) {
1883 f->dump_unsigned("spanning_id ", id);
1884 }
1885 blob.dump(f);
1886 if (shared_blob) {
1887 f->dump_object("shared", *shared_blob);
1888 }
1889}
1890
7c673cae
FG
1891ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1892{
1893 out << "Blob(" << &b;
1894 if (b.is_spanning()) {
1895 out << " spanning " << b.id;
1896 }
35e4c445
FG
1897 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1898 if (b.shared_blob) {
1899 out << " " << *b.shared_blob;
1900 } else {
1901 out << " (shared_blob=NULL)";
1902 }
1903 out << ")";
7c673cae
FG
1904 return out;
1905}
1906
1907void BlueStore::Blob::discard_unallocated(Collection *coll)
1908{
224ce89b 1909 if (get_blob().is_shared()) {
7c673cae
FG
1910 return;
1911 }
224ce89b 1912 if (get_blob().is_compressed()) {
7c673cae
FG
1913 bool discard = false;
1914 bool all_invalid = true;
224ce89b 1915 for (auto e : get_blob().get_extents()) {
7c673cae
FG
1916 if (!e.is_valid()) {
1917 discard = true;
1918 } else {
1919 all_invalid = false;
1920 }
1921 }
11fdf7f2 1922 ceph_assert(discard == all_invalid); // in a compressed blob either all
7c673cae
FG
1923 // or none of the pextents are invalid.
1924 if (discard) {
224ce89b
WB
1925 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1926 get_blob().get_logical_length());
7c673cae
FG
1927 }
1928 } else {
1929 size_t pos = 0;
224ce89b 1930 for (auto e : get_blob().get_extents()) {
7c673cae 1931 if (!e.is_valid()) {
9f95a23c
TL
1932 dout(20) << __func__ << " 0x" << std::hex << pos
1933 << "~" << e.length
1934 << std::dec << dendl;
7c673cae
FG
1935 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1936 }
1937 pos += e.length;
1938 }
224ce89b
WB
1939 if (get_blob().can_prune_tail()) {
1940 dirty_blob().prune_tail();
1941 used_in_blob.prune_tail(get_blob().get_ondisk_length());
224ce89b 1942 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
7c673cae
FG
1943 }
1944 }
1945}
1946
1947void BlueStore::Blob::get_ref(
1948 Collection *coll,
1949 uint32_t offset,
1950 uint32_t length)
1951{
1952 // The caller has to initialize the Blob's logical length before incrementing
1953 // references. Otherwise it is impossible to determine the required
1954 // number of counters for per-au tracking, or to obtain min_release_size
1955 // for single-counter mode.
11fdf7f2 1956 ceph_assert(get_blob().get_logical_length() != 0);
7c673cae
FG
1957 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1958 << std::dec << " " << *this << dendl;
1959
1960 if (used_in_blob.is_empty()) {
1961 uint32_t min_release_size =
224ce89b
WB
1962 get_blob().get_release_size(coll->store->min_alloc_size);
1963 uint64_t l = get_blob().get_logical_length();
1964 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1965 << min_release_size << std::dec << dendl;
7c673cae
FG
1966 used_in_blob.init(l, min_release_size);
1967 }
1968 used_in_blob.get(
1969 offset,
1970 length);
1971}
1972
1973bool BlueStore::Blob::put_ref(
1974 Collection *coll,
1975 uint32_t offset,
1976 uint32_t length,
1977 PExtentVector *r)
1978{
1979 PExtentVector logical;
1980
7c673cae
FG
1981 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1982 << std::dec << " " << *this << dendl;
1983
1984 bool empty = used_in_blob.put(
1985 offset,
1986 length,
1987 &logical);
1988 r->clear();
1989 // nothing to release
1990 if (!empty && logical.empty()) {
1991 return false;
1992 }
1993
1994 bluestore_blob_t& b = dirty_blob();
1995 return b.release_extents(empty, logical, r);
1996}
1997
224ce89b 1998bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
1999 uint32_t target_blob_size,
2000 uint32_t b_offset,
2001 uint32_t *length0) {
11fdf7f2
TL
2002 ceph_assert(min_alloc_size);
2003 ceph_assert(target_blob_size);
7c673cae
FG
2004 if (!get_blob().is_mutable()) {
2005 return false;
2006 }
2007
2008 uint32_t length = *length0;
2009 uint32_t end = b_offset + length;
2010
2011 // Currently, for the sake of simplicity, we omit blob reuse if the data is
2012 // not aligned with the csum chunk. Later we can perform padding if needed.
2013 if (get_blob().has_csum() &&
2014 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2015 (end % get_blob().get_csum_chunk_size()) != 0)) {
2016 return false;
2017 }
2018
2019 auto blen = get_blob().get_logical_length();
2020 uint32_t new_blen = blen;
2021
2022 // make sure target_blob_size isn't less than current blob len
11fdf7f2 2023 target_blob_size = std::max(blen, target_blob_size);
7c673cae
FG
2024
2025 if (b_offset >= blen) {
224ce89b
WB
2026 // new data totally stands out of the existing blob
2027 new_blen = end;
7c673cae 2028 } else {
224ce89b 2029 // new data overlaps with the existing blob
11fdf7f2 2030 new_blen = std::max(blen, end);
224ce89b
WB
2031
2032 uint32_t overlap = 0;
2033 if (new_blen > blen) {
2034 overlap = blen - b_offset;
2035 } else {
2036 overlap = length;
2037 }
2038
2039 if (!get_blob().is_unallocated(b_offset, overlap)) {
2040 // abort if any piece of the overlap has already been allocated
2041 return false;
7c673cae
FG
2042 }
2043 }
224ce89b 2044
7c673cae
FG
2045 if (new_blen > blen) {
2046 int64_t overflow = int64_t(new_blen) - target_blob_size;
2047 // Unable to decrease the provided length to fit into target_blob_size
2048 if (overflow >= length) {
2049 return false;
2050 }
2051
2052 // FIXME: in some cases we could reduce unused resolution
2053 if (get_blob().has_unused()) {
2054 return false;
2055 }
2056
2057 if (overflow > 0) {
2058 new_blen -= overflow;
2059 length -= overflow;
2060 *length0 = length;
2061 }
224ce89b 2062
7c673cae
FG
2063 if (new_blen > blen) {
2064 dirty_blob().add_tail(new_blen);
2065 used_in_blob.add_tail(new_blen,
224ce89b 2066 get_blob().get_release_size(min_alloc_size));
7c673cae
FG
2067 }
2068 }
2069 return true;
2070}
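// Worked example (hypothetical numbers) of the tail-growth clamp above.
// Assume blen = 0x10000, target_blob_size = 0x18000, b_offset = 0x14000,
// length = 0x8000:
//   end      = 0x1c000, and since b_offset >= blen, new_blen = 0x1c000
//   overflow = 0x1c000 - 0x18000 = 0x4000, which is < length, so reuse
//              is still possible
//   new_blen -= overflow -> 0x18000;  length -= overflow -> 0x4000
// i.e. *length0 tells the caller to put only the first 0x4000 bytes of its
// write into this blob; the remainder will land in a new blob.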
2071
2072void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2073{
7c673cae
FG
2074 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2075 << " start " << *this << dendl;
11fdf7f2
TL
2076 ceph_assert(blob.can_split());
2077 ceph_assert(used_in_blob.can_split());
7c673cae
FG
2078 bluestore_blob_t &lb = dirty_blob();
2079 bluestore_blob_t &rb = r->dirty_blob();
2080
2081 used_in_blob.split(
2082 blob_offset,
2083 &(r->used_in_blob));
2084
2085 lb.split(blob_offset, rb);
2086 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2087
2088 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2089 << " finish " << *this << dendl;
2090 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2091 << " and " << *r << dendl;
2092}
2093
2094#ifndef CACHE_BLOB_BL
2095void BlueStore::Blob::decode(
2096 Collection *coll,
11fdf7f2 2097 bufferptr::const_iterator& p,
7c673cae
FG
2098 uint64_t struct_v,
2099 uint64_t* sbid,
2100 bool include_ref_map)
2101{
2102 denc(blob, p, struct_v);
2103 if (blob.is_shared()) {
2104 denc(*sbid, p);
2105 }
2106 if (include_ref_map) {
2107 if (struct_v > 1) {
2108 used_in_blob.decode(p);
2109 } else {
2110 used_in_blob.clear();
2111 bluestore_extent_ref_map_t legacy_ref_map;
2112 legacy_ref_map.decode(p);
2113 for (auto r : legacy_ref_map.ref_map) {
2114 get_ref(
2115 coll,
2116 r.first,
2117 r.second.refs * r.second.length);
2118 }
2119 }
2120 }
2121}
2122#endif
2123
2124// Extent
2125
9f95a23c
TL
2126void BlueStore::Extent::dump(Formatter* f) const
2127{
2128 f->dump_unsigned("logical_offset", logical_offset);
2129 f->dump_unsigned("length", length);
2130 f->dump_unsigned("blob_offset", blob_offset);
2131 f->dump_object("blob", *blob);
2132}
2133
7c673cae
FG
2134ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2135{
2136 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2137 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2138 << " " << *e.blob;
2139}
2140
2141// OldExtent
2142BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2143 uint32_t lo,
2144 uint32_t o,
2145 uint32_t l,
2146 BlobRef& b) {
2147 OldExtent* oe = new OldExtent(lo, o, l, b);
2148 b->put_ref(c.get(), o, l, &(oe->r));
2149 oe->blob_empty = b->get_referenced_bytes() == 0;
2150 return oe;
2151}
2152
2153// ExtentMap
2154
2155#undef dout_prefix
2156#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
9f95a23c
TL
2157#undef dout_context
2158#define dout_context onode->c->store->cct
7c673cae
FG
2159
2160BlueStore::ExtentMap::ExtentMap(Onode *o)
2161 : onode(o),
2162 inline_bl(
2163 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2164}
2165
9f95a23c
TL
2166void BlueStore::ExtentMap::dump(Formatter* f) const
2167{
2168 f->open_array_section("extents");
2169
2170 for (auto& e : extent_map) {
2171 f->dump_object("extent", e);
2172 }
2173 f->close_section();
2174}
2175
11fdf7f2
TL
2176void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2177 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2178 uint64_t& length, uint64_t& dstoff) {
2179
2180 auto cct = onode->c->store->cct;
2181 bool inject_21040 =
2182 cct->_conf->bluestore_debug_inject_bug21040;
2183 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2184 for (auto& e : oldo->extent_map.extent_map) {
2185 e.blob->last_encoded_id = -1;
2186 }
2187
2188 int n = 0;
2189 uint64_t end = srcoff + length;
2190 uint32_t dirty_range_begin = 0;
2191 uint32_t dirty_range_end = 0;
2192 bool src_dirty = false;
2193 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2194 ep != oldo->extent_map.extent_map.end();
2195 ++ep) {
2196 auto& e = *ep;
2197 if (e.logical_offset >= end) {
2198 break;
2199 }
2200 dout(20) << __func__ << " src " << e << dendl;
2201 BlobRef cb;
2202 bool blob_duped = true;
2203 if (e.blob->last_encoded_id >= 0) {
2204 cb = id_to_blob[e.blob->last_encoded_id];
2205 blob_duped = false;
2206 } else {
2207 // dup the blob
2208 const bluestore_blob_t& blob = e.blob->get_blob();
2209 // make sure it is shared
2210 if (!blob.is_shared()) {
2211 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2212 if (!inject_21040 && !src_dirty) {
2213 src_dirty = true;
2214 dirty_range_begin = e.logical_offset;
2215 } else if (inject_21040 &&
2216 dirty_range_begin == 0 && dirty_range_end == 0) {
2217 dirty_range_begin = e.logical_offset;
2218 }
2219 ceph_assert(e.logical_end() > 0);
2220 // -1 to exclude next potential shard
2221 dirty_range_end = e.logical_end() - 1;
2222 } else {
2223 c->load_shared_blob(e.blob->shared_blob);
2224 }
2225 cb = new Blob();
2226 e.blob->last_encoded_id = n;
2227 id_to_blob[n] = cb;
2228 e.blob->dup(*cb);
2229 // bump the extent refs on the copied blob's extents
2230 for (auto p : blob.get_extents()) {
2231 if (p.is_valid()) {
2232 e.blob->shared_blob->get_ref(p.offset, p.length);
2233 }
2234 }
2235 txc->write_shared_blob(e.blob->shared_blob);
2236 dout(20) << __func__ << " new " << *cb << dendl;
2237 }
2238
2239 int skip_front, skip_back;
2240 if (e.logical_offset < srcoff) {
2241 skip_front = srcoff - e.logical_offset;
2242 } else {
2243 skip_front = 0;
2244 }
2245 if (e.logical_end() > end) {
2246 skip_back = e.logical_end() - end;
2247 } else {
2248 skip_back = 0;
2249 }
2250
2251 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2252 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2253 newo->extent_map.extent_map.insert(*ne);
2254 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2255 // fixme: we may leave parts of new blob unreferenced that could
2256 // be freed (relative to the shared_blob).
2257 txc->statfs_delta.stored() += ne->length;
2258 if (e.blob->get_blob().is_compressed()) {
2259 txc->statfs_delta.compressed_original() += ne->length;
2260 if (blob_duped) {
2261 txc->statfs_delta.compressed() +=
2262 cb->get_blob().get_compressed_payload_length();
2263 }
2264 }
2265 dout(20) << __func__ << " dst " << *ne << dendl;
2266 ++n;
2267 }
2268 if ((!inject_21040 && src_dirty) ||
2269 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2270 oldo->extent_map.dirty_range(dirty_range_begin,
2271 dirty_range_end - dirty_range_begin);
2272 txc->write_onode(oldo);
2273 }
2274 txc->write_onode(newo);
2275
2276 if (dstoff + length > newo->onode.size) {
2277 newo->onode.size = dstoff + length;
2278 }
2279 newo->extent_map.dirty_range(dstoff, length);
2280}
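// Net effect of dup() for a clone of [srcoff, srcoff + length) to dstoff:
// every source blob touched by the range is promoted to shared (acquiring
// an sbid and one ref_map entry per valid pextent), and newo gets lextents
// pointing at duplicated Blob instances, so both onodes reference the same
// physical extents until one of them is overwritten.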
7c673cae
FG
2281void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2282 bool force)
2283{
2284 auto cct = onode->c->store->cct; //used by dout
2285 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2286 if (onode->onode.extent_map_shards.empty()) {
2287 if (inline_bl.length() == 0) {
2288 unsigned n;
2289 // we need to encode inline_bl to measure encoded length
2290 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
3efd9988 2291 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11fdf7f2 2292 ceph_assert(!never_happen);
7c673cae
FG
2293 size_t len = inline_bl.length();
2294 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2295 << " extents" << dendl;
2296 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2297 request_reshard(0, OBJECT_MAX_SIZE);
2298 return;
2299 }
2300 }
2301 // will persist in the onode key.
2302 } else {
2303 // pending shard update
2304 struct dirty_shard_t {
2305 Shard *shard;
2306 bufferlist bl;
2307 dirty_shard_t(Shard *s) : shard(s) {}
2308 };
2309 vector<dirty_shard_t> encoded_shards;
2310 // allocate slots for all shards in a single call instead of
2311 // doing multiple allocations - one for each dirty shard
2312 encoded_shards.reserve(shards.size());
2313
2314 auto p = shards.begin();
2315 auto prev_p = p;
2316 while (p != shards.end()) {
11fdf7f2 2317 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2318 auto n = p;
2319 ++n;
2320 if (p->dirty) {
2321 uint32_t endoff;
2322 if (n == shards.end()) {
2323 endoff = OBJECT_MAX_SIZE;
2324 } else {
2325 endoff = n->shard_info->offset;
2326 }
2327 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2328 bufferlist& bl = encoded_shards.back().bl;
2329 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2330 bl, &p->extents)) {
2331 if (force) {
2332 derr << __func__ << " encode_some needs reshard" << dendl;
11fdf7f2 2333 ceph_assert(!force);
7c673cae
FG
2334 }
2335 }
2336 size_t len = bl.length();
2337
2338 dout(20) << __func__ << " shard 0x" << std::hex
2339 << p->shard_info->offset << std::dec << " is " << len
2340 << " bytes (was " << p->shard_info->bytes << ") from "
2341 << p->extents << " extents" << dendl;
2342
2343 if (!force) {
2344 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2345 // we are big; reshard ourselves
2346 request_reshard(p->shard_info->offset, endoff);
2347 }
2348 // avoid resharding the trailing shard, even if it is small
2349 else if (n != shards.end() &&
11fdf7f2
TL
2350 len < g_conf()->bluestore_extent_map_shard_min_size) {
2351 ceph_assert(endoff != OBJECT_MAX_SIZE);
31f18b77
FG
2352 if (p == shards.begin()) {
2353 // we are the first shard, combine with next shard
7c673cae 2354 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2355 } else {
31f18b77
FG
2356 // combine either with the previous shard or the next,
2357 // whichever is smaller
7c673cae
FG
2358 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2359 request_reshard(p->shard_info->offset, endoff + 1);
2360 } else {
2361 request_reshard(prev_p->shard_info->offset, endoff);
2362 }
2363 }
2364 }
2365 }
2366 }
2367 prev_p = p;
2368 p = n;
2369 }
2370 if (needs_reshard()) {
2371 return;
2372 }
2373
2374 // schedule DB update for dirty shards
2375 string key;
2376 for (auto& it : encoded_shards) {
2377 it.shard->dirty = false;
2378 it.shard->shard_info->bytes = it.bl.length();
2379 generate_extent_shard_key_and_apply(
2380 onode->key,
2381 it.shard->shard_info->offset,
2382 &key,
2383 [&](const string& final_key) {
2384 t->set(PREFIX_OBJ, final_key, it.bl);
2385 }
2386 );
2387 }
2388 }
2389}
2390
31f18b77
FG
2391bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2392{
2393 if (spanning_blob_map.empty())
2394 return 0;
2395 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2396 // bid is valid and available.
2397 if (bid >= 0)
2398 return bid;
2399 // Find the next unused bid.
2400 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2401 const auto begin_bid = bid;
2402 do {
2403 if (!spanning_blob_map.count(bid))
2404 return bid;
2405 else {
2406 bid++;
2407 if (bid < 0) bid = 0;
2408 }
2409 } while (bid != begin_bid);
81eedcae
TL
2410 auto cct = onode->c->store->cct; // used by dout
2411 _dump_onode<0>(cct, *onode);
11fdf7f2 2412 ceph_abort_msg("no available blob id");
31f18b77
FG
2413}
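// A standalone sketch of the wrap-around linear probe above, with assumed
// simplified types (BlueStore's bid_t is a small signed integer): start at
// a random non-negative id and scan forward, wrapping at the top of the
// range, until a free id is found or the scan returns to its start.
#include <cstdint>
#include <cstdlib>
#include <set>

using bid_t = int64_t;                      // stand-in for the real bid_t
const bid_t BID_MAX = 0x7fff;               // assumed id range for the sketch

bid_t probe_free_id(const std::set<bid_t>& used) {
  bid_t bid = std::rand() % (BID_MAX + 1);  // random non-negative start
  const bid_t begin = bid;
  do {
    if (!used.count(bid))
      return bid;                           // found an unused id
    if (++bid > BID_MAX)
      bid = 0;                              // wrap around to the bottom
  } while (bid != begin);
  return -1;                                // every id in range is taken
}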
2414
7c673cae
FG
2415void BlueStore::ExtentMap::reshard(
2416 KeyValueDB *db,
2417 KeyValueDB::Transaction t)
2418{
2419 auto cct = onode->c->store->cct; // used by dout
2420
2421 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2422 << needs_reshard_end << ")" << std::dec
2423 << " of " << onode->onode.extent_map_shards.size()
2424 << " shards on " << onode->oid << dendl;
2425 for (auto& p : spanning_blob_map) {
2426 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2427 << dendl;
2428 }
2429 // determine shard index range
2430 unsigned si_begin = 0, si_end = 0;
2431 if (!shards.empty()) {
2432 while (si_begin + 1 < shards.size() &&
2433 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2434 ++si_begin;
2435 }
2436 needs_reshard_begin = shards[si_begin].shard_info->offset;
2437 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2438 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2439 needs_reshard_end = shards[si_end].shard_info->offset;
2440 break;
2441 }
2442 }
2443 if (si_end == shards.size()) {
2444 needs_reshard_end = OBJECT_MAX_SIZE;
2445 }
2446 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2447 << " over 0x[" << std::hex << needs_reshard_begin << ","
2448 << needs_reshard_end << ")" << std::dec << dendl;
2449 }
2450
181888fb 2451 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
7c673cae
FG
2452
2453 // we may need to fault in a larger interval later; we must have all
2454 // referring extents for spanning blobs loaded in order to have
2455 // accurate use_tracker values.
2456 uint32_t spanning_scan_begin = needs_reshard_begin;
2457 uint32_t spanning_scan_end = needs_reshard_end;
2458
2459 // remove old keys
2460 string key;
2461 for (unsigned i = si_begin; i < si_end; ++i) {
2462 generate_extent_shard_key_and_apply(
2463 onode->key, shards[i].shard_info->offset, &key,
2464 [&](const string& final_key) {
2465 t->rmkey(PREFIX_OBJ, final_key);
2466 }
2467 );
2468 }
2469
2470 // calculate average extent size
2471 unsigned bytes = 0;
2472 unsigned extents = 0;
2473 if (onode->onode.extent_map_shards.empty()) {
2474 bytes = inline_bl.length();
2475 extents = extent_map.size();
2476 } else {
2477 for (unsigned i = si_begin; i < si_end; ++i) {
2478 bytes += shards[i].shard_info->bytes;
2479 extents += shards[i].extents;
2480 }
2481 }
2482 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2483 unsigned slop = target *
2484 cct->_conf->bluestore_extent_map_shard_target_size_slop;
11fdf7f2 2485 unsigned extent_avg = bytes / std::max(1u, extents);
7c673cae
FG
2486 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2487 << ", slop " << slop << dendl;
2488
2489 // reshard
2490 unsigned estimate = 0;
31f18b77 2491 unsigned offset = needs_reshard_begin;
7c673cae
FG
2492 vector<bluestore_onode_t::shard_info> new_shard_info;
2493 unsigned max_blob_end = 0;
2494 Extent dummy(needs_reshard_begin);
2495 for (auto e = extent_map.lower_bound(dummy);
2496 e != extent_map.end();
2497 ++e) {
2498 if (e->logical_offset >= needs_reshard_end) {
2499 break;
2500 }
2501 dout(30) << " extent " << *e << dendl;
2502
2503 // disfavor shard boundaries that span a blob
2504 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2505 if (estimate &&
2506 estimate + extent_avg > target + (would_span ? slop : 0)) {
2507 // new shard
31f18b77 2508 if (offset == needs_reshard_begin) {
7c673cae
FG
2509 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2510 new_shard_info.back().offset = offset;
2511 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2512 << std::dec << dendl;
7c673cae
FG
2513 }
2514 offset = e->logical_offset;
2515 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2516 new_shard_info.back().offset = offset;
2517 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2518 << std::dec << dendl;
2519 estimate = 0;
2520 }
2521 estimate += extent_avg;
31f18b77
FG
2522 unsigned bs = e->blob_start();
2523 if (bs < spanning_scan_begin) {
2524 spanning_scan_begin = bs;
7c673cae
FG
2525 }
2526 uint32_t be = e->blob_end();
2527 if (be > max_blob_end) {
2528 max_blob_end = be;
2529 }
2530 if (be > spanning_scan_end) {
2531 spanning_scan_end = be;
2532 }
2533 }
2534 if (new_shard_info.empty() && (si_begin > 0 ||
2535 si_end < shards.size())) {
2536 // we resharded a partial range; we must produce at least one output
2537 // shard
2538 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2539 new_shard_info.back().offset = needs_reshard_begin;
2540 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2541 << std::dec << " (singleton degenerate case)" << dendl;
2542 }
2543
2544 auto& sv = onode->onode.extent_map_shards;
2545 dout(20) << __func__ << " new " << new_shard_info << dendl;
2546 dout(20) << __func__ << " old " << sv << dendl;
2547 if (sv.empty()) {
2548 // no old shards to keep
2549 sv.swap(new_shard_info);
2550 init_shards(true, true);
2551 } else {
2552 // splice in new shards
2553 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2554 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2555 sv.insert(
2556 sv.begin() + si_begin,
2557 new_shard_info.begin(),
2558 new_shard_info.end());
2559 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2560 si_end = si_begin + new_shard_info.size();
31f18b77 2561
11fdf7f2 2562 ceph_assert(sv.size() == shards.size());
31f18b77
FG
2563
2564 // note that we need to update every shard_info of shards here,
2565 // as sv might have been totally re-allocated above
2566 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2567 shards[i].shard_info = &sv[i];
31f18b77
FG
2568 }
2569
2570 // mark newly added shards as dirty
2571 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2572 shards[i].loaded = true;
2573 shards[i].dirty = true;
2574 }
7c673cae
FG
2575 }
2576 dout(20) << __func__ << " fin " << sv << dendl;
2577 inline_bl.clear();
2578
2579 if (sv.empty()) {
2580 // no more shards; unspan all previously spanning blobs
2581 auto p = spanning_blob_map.begin();
2582 while (p != spanning_blob_map.end()) {
2583 p->second->id = -1;
2584 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2585 p = spanning_blob_map.erase(p);
2586 }
2587 } else {
2588 // identify new spanning blobs
2589 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2590 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2591 if (spanning_scan_begin < needs_reshard_begin) {
2592 fault_range(db, spanning_scan_begin,
2593 needs_reshard_begin - spanning_scan_begin);
2594 }
2595 if (spanning_scan_end > needs_reshard_end) {
2596 fault_range(db, needs_reshard_end,
31f18b77 2597 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2598 }
2599 auto sp = sv.begin() + si_begin;
2600 auto esp = sv.end();
2601 unsigned shard_start = sp->offset;
2602 unsigned shard_end;
2603 ++sp;
2604 if (sp == esp) {
2605 shard_end = OBJECT_MAX_SIZE;
2606 } else {
2607 shard_end = sp->offset;
2608 }
7c673cae 2609 Extent dummy(needs_reshard_begin);
9f95a23c
TL
2610
2611 bool was_too_many_blobs_check = false;
2612 auto too_many_blobs_threshold =
2613 g_conf()->bluestore_debug_too_many_blobs_threshold;
2614 auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
2615 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2616 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
2617
7c673cae
FG
2618 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2619 if (e->logical_offset >= needs_reshard_end) {
2620 break;
2621 }
2622 dout(30) << " extent " << *e << dendl;
2623 while (e->logical_offset >= shard_end) {
2624 shard_start = shard_end;
11fdf7f2 2625 ceph_assert(sp != esp);
7c673cae
FG
2626 ++sp;
2627 if (sp == esp) {
2628 shard_end = OBJECT_MAX_SIZE;
2629 } else {
2630 shard_end = sp->offset;
2631 }
2632 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2633 << " to 0x" << shard_end << std::dec << dendl;
2634 }
9f95a23c 2635
7c673cae
FG
2636 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2637 if (!e->blob->is_spanning()) {
2638 // We have two options: (1) split the blob into pieces at the
2639 // shard boundaries (and adjust extents accordingly), or (2)
2640 // mark it spanning. We prefer to cut the blob if we can. Note that
2641 // we may have to split it multiple times--potentially at every
2642 // shard boundary.
2643 bool must_span = false;
2644 BlobRef b = e->blob;
2645 if (b->can_split()) {
2646 uint32_t bstart = e->blob_start();
2647 uint32_t bend = e->blob_end();
2648 for (const auto& sh : shards) {
2649 if (bstart < sh.shard_info->offset &&
2650 bend > sh.shard_info->offset) {
2651 uint32_t blob_offset = sh.shard_info->offset - bstart;
2652 if (b->can_split_at(blob_offset)) {
2653 dout(20) << __func__ << " splitting blob, bstart 0x"
2654 << std::hex << bstart << " blob_offset 0x"
2655 << blob_offset << std::dec << " " << *b << dendl;
2656 b = split_blob(b, blob_offset, sh.shard_info->offset);
2657 // switch b to the new right-hand side, in case it
2658 // *also* has to get split.
2659 bstart += blob_offset;
2660 onode->c->store->logger->inc(l_bluestore_blob_split);
2661 } else {
2662 must_span = true;
2663 break;
2664 }
2665 }
2666 }
2667 } else {
2668 must_span = true;
2669 }
2670 if (must_span) {
31f18b77
FG
2671 auto bid = allocate_spanning_blob_id();
2672 b->id = bid;
7c673cae
FG
2673 spanning_blob_map[b->id] = b;
2674 dout(20) << __func__ << " adding spanning " << *b << dendl;
9f95a23c
TL
2675 if (!was_too_many_blobs_check &&
2676 too_many_blobs_threshold &&
2677 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2678
2679 was_too_many_blobs_check = true;
2680 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2681 if (dumped_onodes[i].first == onode->oid) {
2682 oid_slot = &dumped_onodes[i];
2683 break;
2684 }
2685 if (!oldest_slot || (oldest_slot &&
2686 dumped_onodes[i].second < oldest_slot->second)) {
2687 oldest_slot = &dumped_onodes[i];
2688 }
2689 }
2690 }
7c673cae
FG
2691 }
2692 }
2693 } else {
2694 if (e->blob->is_spanning()) {
2695 spanning_blob_map.erase(e->blob->id);
2696 e->blob->id = -1;
2697 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2698 }
2699 }
2700 }
9f95a23c
TL
2701 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
2702 (oid_slot &&
2703 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
2704 if (do_dump) {
2705 dout(0) << __func__
2706 << " spanning blob count exceeds threshold, "
2707 << spanning_blob_map.size() << " spanning blobs"
2708 << dendl;
2709 _dump_onode<0>(cct, *onode);
2710 if (oid_slot) {
2711 oid_slot->second = mono_clock::now();
2712 } else {
2713 ceph_assert(oldest_slot);
2714 oldest_slot->first = onode->oid;
2715 oldest_slot->second = mono_clock::now();
2716 }
2717 }
7c673cae
FG
2718 }
2719
2720 clear_needs_reshard();
2721}
2722
2723bool BlueStore::ExtentMap::encode_some(
2724 uint32_t offset,
2725 uint32_t length,
2726 bufferlist& bl,
2727 unsigned *pn)
2728{
7c673cae
FG
2729 Extent dummy(offset);
2730 auto start = extent_map.lower_bound(dummy);
2731 uint32_t end = offset + length;
2732
2733 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2734 // serialization only. Hence there is no specific
2735 // handling at ExtentMap level.
2736
2737 unsigned n = 0;
2738 size_t bound = 0;
7c673cae
FG
2739 bool must_reshard = false;
2740 for (auto p = start;
2741 p != extent_map.end() && p->logical_offset < end;
2742 ++p, ++n) {
11fdf7f2 2743 ceph_assert(p->logical_offset >= offset);
7c673cae
FG
2744 p->blob->last_encoded_id = -1;
2745 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2746 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2747 << std::dec << " hit new spanning blob " << *p << dendl;
2748 request_reshard(p->blob_start(), p->blob_end());
2749 must_reshard = true;
2750 }
31f18b77
FG
2751 if (!must_reshard) {
2752 denc_varint(0, bound); // blobid
2753 denc_varint(0, bound); // logical_offset
2754 denc_varint(0, bound); // len
2755 denc_varint(0, bound); // blob_offset
7c673cae 2756
31f18b77
FG
2757 p->blob->bound_encode(
2758 bound,
2759 struct_v,
2760 p->blob->shared_blob->get_sbid(),
2761 false);
2762 }
7c673cae
FG
2763 }
2764 if (must_reshard) {
2765 return true;
2766 }
2767
31f18b77
FG
2768 denc(struct_v, bound);
2769 denc_varint(0, bound); // number of extents
2770
7c673cae
FG
2771 {
2772 auto app = bl.get_contiguous_appender(bound);
2773 denc(struct_v, app);
2774 denc_varint(n, app);
2775 if (pn) {
2776 *pn = n;
2777 }
2778
2779 n = 0;
2780 uint64_t pos = 0;
2781 uint64_t prev_len = 0;
2782 for (auto p = start;
2783 p != extent_map.end() && p->logical_offset < end;
2784 ++p, ++n) {
2785 unsigned blobid;
2786 bool include_blob = false;
2787 if (p->blob->is_spanning()) {
2788 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2789 blobid |= BLOBID_FLAG_SPANNING;
2790 } else if (p->blob->last_encoded_id < 0) {
2791 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2792 include_blob = true;
2793 blobid = 0; // the decoder will infer the id from n
2794 } else {
2795 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2796 }
2797 if (p->logical_offset == pos) {
2798 blobid |= BLOBID_FLAG_CONTIGUOUS;
2799 }
2800 if (p->blob_offset == 0) {
2801 blobid |= BLOBID_FLAG_ZEROOFFSET;
2802 }
2803 if (p->length == prev_len) {
2804 blobid |= BLOBID_FLAG_SAMELENGTH;
2805 } else {
2806 prev_len = p->length;
2807 }
2808 denc_varint(blobid, app);
2809 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2810 denc_varint_lowz(p->logical_offset - pos, app);
2811 }
2812 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2813 denc_varint_lowz(p->blob_offset, app);
2814 }
2815 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2816 denc_varint_lowz(p->length, app);
2817 }
2818 pos = p->logical_end();
2819 if (include_blob) {
2820 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2821 }
2822 }
2823 }
2824 /*derr << __func__ << bl << dendl;
2825 derr << __func__ << ":";
2826 bl.hexdump(*_dout);
2827 *_dout << dendl;
2828 */
2829 return false;
2830}
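// Sketch of the per-extent varint layout used by encode_some()/decode_some()
// above. The concrete flag values are assumed here for illustration; the low
// bits of the single "blobid" varint carry elision flags and the remaining
// bits carry the blob reference, so a fully contiguous run of extents costs
// only one small varint per extent.
#include <cstdint>

enum : uint64_t {
  FLAG_CONTIGUOUS = 0x1,   // logical_offset == previous extent's end: omit gap
  FLAG_ZEROOFFSET = 0x2,   // blob_offset == 0: omit it
  FLAG_SAMELENGTH = 0x4,   // length == previous extent's length: omit it
  FLAG_SPANNING   = 0x8,   // id indexes the spanning-blob map
  SHIFT_BITS      = 4,
};

inline uint64_t pack_blobid(uint64_t id, uint64_t flags) {
  return (id << SHIFT_BITS) | flags;
}
inline uint64_t unpack_id(uint64_t v) { return v >> SHIFT_BITS; }
inline bool test_flag(uint64_t v, uint64_t f) { return (v & f) != 0; }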
2831
2832unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2833{
7c673cae
FG
2834 /*
2835 derr << __func__ << ":";
2836 bl.hexdump(*_dout);
2837 *_dout << dendl;
2838 */
2839
11fdf7f2 2840 ceph_assert(bl.get_num_buffers() <= 1);
7c673cae
FG
2841 auto p = bl.front().begin_deep();
2842 __u8 struct_v;
2843 denc(struct_v, p);
2844 // Version 2 differs from v1 in blob's ref_map
2845 // serialization only. Hence there is no specific
2846 // handling at ExtentMap level below.
11fdf7f2 2847 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2848
2849 uint32_t num;
2850 denc_varint(num, p);
2851 vector<BlobRef> blobs(num);
2852 uint64_t pos = 0;
2853 uint64_t prev_len = 0;
2854 unsigned n = 0;
2855
2856 while (!p.end()) {
2857 Extent *le = new Extent();
2858 uint64_t blobid;
2859 denc_varint(blobid, p);
2860 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2861 uint64_t gap;
2862 denc_varint_lowz(gap, p);
2863 pos += gap;
2864 }
2865 le->logical_offset = pos;
2866 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2867 denc_varint_lowz(le->blob_offset, p);
2868 } else {
2869 le->blob_offset = 0;
2870 }
2871 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2872 denc_varint_lowz(prev_len, p);
2873 }
2874 le->length = prev_len;
2875
2876 if (blobid & BLOBID_FLAG_SPANNING) {
2877 dout(30) << __func__ << " getting spanning blob "
2878 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2879 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2880 } else {
2881 blobid >>= BLOBID_SHIFT_BITS;
2882 if (blobid) {
2883 le->assign_blob(blobs[blobid - 1]);
11fdf7f2 2884 ceph_assert(le->blob);
7c673cae
FG
2885 } else {
2886 Blob *b = new Blob();
2887 uint64_t sbid = 0;
2888 b->decode(onode->c, p, struct_v, &sbid, false);
2889 blobs[n] = b;
2890 onode->c->open_shared_blob(sbid, b);
2891 le->assign_blob(b);
2892 }
2893 // we build ref_map dynamically for non-spanning blobs
2894 le->blob->get_ref(
2895 onode->c,
2896 le->blob_offset,
2897 le->length);
2898 }
2899 pos += prev_len;
2900 ++n;
2901 extent_map.insert(*le);
2902 }
2903
11fdf7f2 2904 ceph_assert(n == num);
7c673cae
FG
2905 return num;
2906}
2907
2908void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2909{
2910 // Version 2 differs from v1 in blob's ref_map
2911 // serialization only. Hence there is no specific
2912 // handling at ExtentMap level.
2913 __u8 struct_v = 2;
2914
2915 denc(struct_v, p);
2916 denc_varint((uint32_t)0, p);
2917 size_t key_size = 0;
2918 denc_varint((uint32_t)0, key_size);
2919 p += spanning_blob_map.size() * key_size;
2920 for (const auto& i : spanning_blob_map) {
2921 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2922 }
2923}
2924
2925void BlueStore::ExtentMap::encode_spanning_blobs(
2926 bufferlist::contiguous_appender& p)
2927{
2928 // Version 2 differs from v1 in blob's ref_map
2929 // serialization only. Hence there is no specific
2930 // handling at ExtentMap level.
2931 __u8 struct_v = 2;
2932
2933 denc(struct_v, p);
2934 denc_varint(spanning_blob_map.size(), p);
2935 for (auto& i : spanning_blob_map) {
2936 denc_varint(i.second->id, p);
2937 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2938 }
2939}
2940
2941void BlueStore::ExtentMap::decode_spanning_blobs(
11fdf7f2 2942 bufferptr::const_iterator& p)
7c673cae
FG
2943{
2944 __u8 struct_v;
2945 denc(struct_v, p);
2946 // Version 2 differs from v1 in blob's ref_map
2947 // serialization only. Hence there is no specific
2948 // handling at ExtentMap level.
11fdf7f2 2949 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2950
2951 unsigned n;
2952 denc_varint(n, p);
2953 while (n--) {
2954 BlobRef b(new Blob());
2955 denc_varint(b->id, p);
2956 spanning_blob_map[b->id] = b;
2957 uint64_t sbid = 0;
2958 b->decode(onode->c, p, struct_v, &sbid, true);
2959 onode->c->open_shared_blob(sbid, b);
2960 }
2961}
2962
2963void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2964{
2965 shards.resize(onode->onode.extent_map_shards.size());
2966 unsigned i = 0;
2967 for (auto &s : onode->onode.extent_map_shards) {
2968 shards[i].shard_info = &s;
2969 shards[i].loaded = loaded;
2970 shards[i].dirty = dirty;
2971 ++i;
2972 }
2973}
2974
2975void BlueStore::ExtentMap::fault_range(
2976 KeyValueDB *db,
2977 uint32_t offset,
2978 uint32_t length)
2979{
7c673cae
FG
2980 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2981 << std::dec << dendl;
2982 auto start = seek_shard(offset);
2983 auto last = seek_shard(offset + length);
2984
2985 if (start < 0)
2986 return;
2987
11fdf7f2 2988 ceph_assert(last >= start);
7c673cae
FG
2989 string key;
2990 while (start <= last) {
11fdf7f2 2991 ceph_assert((size_t)start < shards.size());
7c673cae
FG
2992 auto p = &shards[start];
2993 if (!p->loaded) {
2994 dout(30) << __func__ << " opening shard 0x" << std::hex
2995 << p->shard_info->offset << std::dec << dendl;
2996 bufferlist v;
2997 generate_extent_shard_key_and_apply(
2998 onode->key, p->shard_info->offset, &key,
2999 [&](const string& final_key) {
3000 int r = db->get(PREFIX_OBJ, final_key, &v);
3001 if (r < 0) {
3002 derr << __func__ << " missing shard 0x" << std::hex
3003 << p->shard_info->offset << std::dec << " for " << onode->oid
3004 << dendl;
11fdf7f2 3005 ceph_assert(r >= 0);
7c673cae
FG
3006 }
3007 }
3008 );
3009 p->extents = decode_some(v);
3010 p->loaded = true;
3011 dout(20) << __func__ << " open shard 0x" << std::hex
81eedcae
TL
3012 << p->shard_info->offset
3013 << " for range 0x" << offset << "~" << length << std::dec
7c673cae 3014 << " (" << v.length() << " bytes)" << dendl;
11fdf7f2
TL
3015 ceph_assert(p->dirty == false);
3016 ceph_assert(v.length() == p->shard_info->bytes);
7c673cae
FG
3017 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3018 } else {
3019 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3020 }
3021 ++start;
3022 }
3023}
3024
3025void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
3026 uint32_t offset,
3027 uint32_t length)
3028{
7c673cae
FG
3029 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3030 << std::dec << dendl;
3031 if (shards.empty()) {
3032 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3033 inline_bl.clear();
3034 return;
3035 }
3036 auto start = seek_shard(offset);
11fdf7f2
TL
3037 if (length == 0) {
3038 length = 1;
3039 }
3040 auto last = seek_shard(offset + length - 1);
7c673cae
FG
3041 if (start < 0)
3042 return;
3043
11fdf7f2 3044 ceph_assert(last >= start);
7c673cae 3045 while (start <= last) {
11fdf7f2 3046 ceph_assert((size_t)start < shards.size());
7c673cae
FG
3047 auto p = &shards[start];
3048 if (!p->loaded) {
11fdf7f2
TL
3049 derr << __func__ << " on write 0x" << std::hex << offset
3050 << "~" << length << " shard 0x" << p->shard_info->offset
3051 << std::dec << " is not loaded, can't mark dirty" << dendl;
3052 ceph_abort_msg("can't mark unloaded shard dirty");
7c673cae
FG
3053 }
3054 if (!p->dirty) {
3055 dout(20) << __func__ << " mark shard 0x" << std::hex
3056 << p->shard_info->offset << std::dec << " dirty" << dendl;
3057 p->dirty = true;
3058 }
3059 ++start;
3060 }
3061}
3062
3063BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3064 uint64_t offset)
3065{
3066 Extent dummy(offset);
3067 return extent_map.find(dummy);
3068}
3069
7c673cae
FG
3070BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3071 uint64_t offset)
3072{
3073 Extent dummy(offset);
3074 auto fp = extent_map.lower_bound(dummy);
3075 if (fp != extent_map.begin()) {
3076 --fp;
3077 if (fp->logical_end() <= offset) {
3078 ++fp;
3079 }
3080 }
3081 return fp;
3082}
3083
3084BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3085 uint64_t offset) const
3086{
3087 Extent dummy(offset);
3088 auto fp = extent_map.lower_bound(dummy);
3089 if (fp != extent_map.begin()) {
3090 --fp;
3091 if (fp->logical_end() <= offset) {
3092 ++fp;
3093 }
3094 }
3095 return fp;
3096}
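// seek_lextent(offset) behavior by example: with lextents 0x0~0x1000 and
// 0x2000~0x1000 in the map,
//   seek_lextent(0x800)  -> the 0x0 extent    (it contains the offset)
//   seek_lextent(0x1800) -> the 0x2000 extent (first one past the offset)
// i.e. it returns the first lextent that overlaps or follows 'offset'.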
3097
3098bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3099{
3100 auto fp = seek_lextent(offset);
3101 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3102 return false;
3103 }
3104 return true;
3105}
3106
3107int BlueStore::ExtentMap::compress_extent_map(
3108 uint64_t offset,
3109 uint64_t length)
3110{
7c673cae
FG
3111 if (extent_map.empty())
3112 return 0;
3113 int removed = 0;
3114 auto p = seek_lextent(offset);
3115 if (p != extent_map.begin()) {
3116 --p; // start to the left of offset
3117 }
3118 // the caller should have just written to this region
11fdf7f2 3119 ceph_assert(p != extent_map.end());
7c673cae
FG
3120
3121 // identify the *next* shard
3122 auto pshard = shards.begin();
3123 while (pshard != shards.end() &&
3124 p->logical_offset >= pshard->shard_info->offset) {
3125 ++pshard;
3126 }
3127 uint64_t shard_end;
3128 if (pshard != shards.end()) {
3129 shard_end = pshard->shard_info->offset;
3130 } else {
3131 shard_end = OBJECT_MAX_SIZE;
3132 }
3133
3134 auto n = p;
3135 for (++n; n != extent_map.end(); p = n++) {
3136 if (n->logical_offset > offset + length) {
3137 break; // stop after end
3138 }
3139 while (n != extent_map.end() &&
3140 p->logical_end() == n->logical_offset &&
3141 p->blob == n->blob &&
3142 p->blob_offset + p->length == n->blob_offset &&
3143 n->logical_offset < shard_end) {
3144 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3145 << " next shard 0x" << shard_end << std::dec
3146 << " merging " << *p << " and " << *n << dendl;
3147 p->length += n->length;
3148 rm(n++);
3149 ++removed;
3150 }
3151 if (n == extent_map.end()) {
3152 break;
3153 }
3154 if (n->logical_offset >= shard_end) {
11fdf7f2 3155 ceph_assert(pshard != shards.end());
7c673cae
FG
3156 ++pshard;
3157 if (pshard != shards.end()) {
3158 shard_end = pshard->shard_info->offset;
3159 } else {
3160 shard_end = OBJECT_MAX_SIZE;
3161 }
3162 }
3163 }
11fdf7f2 3164 if (removed) {
7c673cae
FG
3165 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3166 }
3167 return removed;
3168}
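// Example of a merge performed by compress_extent_map() above: lextents
//   0x0~0x1000    -> blob X @ 0x0
//   0x1000~0x1000 -> blob X @ 0x1000
// are logically and physically contiguous, share a blob, and sit in the
// same shard, so they collapse into a single 0x0~0x2000 -> blob X @ 0x0.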
3169
3170void BlueStore::ExtentMap::punch_hole(
3171 CollectionRef &c,
3172 uint64_t offset,
3173 uint64_t length,
3174 old_extent_map_t *old_extents)
3175{
3176 auto p = seek_lextent(offset);
3177 uint64_t end = offset + length;
3178 while (p != extent_map.end()) {
3179 if (p->logical_offset >= end) {
3180 break;
3181 }
3182 if (p->logical_offset < offset) {
3183 if (p->logical_end() > end) {
3184 // split and deref middle
3185 uint64_t front = offset - p->logical_offset;
3186 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3187 length, p->blob);
3188 old_extents->push_back(*oe);
3189 add(end,
3190 p->blob_offset + front + length,
3191 p->length - front - length,
3192 p->blob);
3193 p->length = front;
3194 break;
3195 } else {
3196 // deref tail
11fdf7f2 3197 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
7c673cae
FG
3198 uint64_t keep = offset - p->logical_offset;
3199 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3200 p->length - keep, p->blob);
3201 old_extents->push_back(*oe);
3202 p->length = keep;
3203 ++p;
3204 continue;
3205 }
3206 }
3207 if (p->logical_offset + p->length <= end) {
3208 // deref whole lextent
3209 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3210 p->length, p->blob);
3211 old_extents->push_back(*oe);
3212 rm(p++);
3213 continue;
3214 }
3215 // deref head
3216 uint64_t keep = p->logical_end() - end;
3217 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3218 p->length - keep, p->blob);
3219 old_extents->push_back(*oe);
3220
3221 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3222 rm(p);
3223 break;
3224 }
3225}
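// The four cases punch_hole() handles for a hole [offset, end) against an
// existing lextent, in the order tested above:
//   1. lextent straddles the whole hole -> keep the front, deref the middle,
//      re-add the tail beyond 'end'
//   2. lextent overlaps only the hole's start -> deref its tail, keep front
//   3. lextent lies entirely inside the hole  -> deref and remove it whole
//   4. lextent overlaps only the hole's end   -> deref its head, re-add the
//      kept tail at 'end'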
3226
3227BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3228 CollectionRef &c,
3229 uint64_t logical_offset,
3230 uint64_t blob_offset, uint64_t length, BlobRef b,
3231 old_extent_map_t *old_extents)
3232{
3233 // We need a completely initialized Blob to increment its ref counters.
11fdf7f2 3234 ceph_assert(b->get_blob().get_logical_length() != 0);
7c673cae
FG
3235
3236 // Do get_ref prior to punch_hole to prevent putting a reused blob into the
3237 // old_extents list if we overwrite the blob totally.
3238 // This might happen during a WAL overwrite.
3239 b->get_ref(onode->c, blob_offset, length);
3240
3241 if (old_extents) {
3242 punch_hole(c, logical_offset, length, old_extents);
3243 }
3244
3245 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3246 extent_map.insert(*le);
3247 if (spans_shard(logical_offset, length)) {
3248 request_reshard(logical_offset, logical_offset + length);
3249 }
3250 return le;
3251}
3252
3253BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3254 BlobRef lb,
3255 uint32_t blob_offset,
3256 uint32_t pos)
3257{
7c673cae
FG
3258 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3259 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3260 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3261 << dendl;
3262 BlobRef rb = onode->c->new_blob();
3263 lb->split(onode->c, blob_offset, rb.get());
3264
3265 for (auto ep = seek_lextent(pos);
3266 ep != extent_map.end() && ep->logical_offset < end_pos;
3267 ++ep) {
3268 if (ep->blob != lb) {
3269 continue;
3270 }
3271 if (ep->logical_offset < pos) {
3272 // split extent
3273 size_t left = pos - ep->logical_offset;
3274 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3275 extent_map.insert(*ne);
3276 ep->length = left;
3277 dout(30) << __func__ << " split " << *ep << dendl;
3278 dout(30) << __func__ << " to " << *ne << dendl;
3279 } else {
3280 // switch blob
11fdf7f2 3281 ceph_assert(ep->blob_offset >= blob_offset);
7c673cae
FG
3282
3283 ep->blob = rb;
3284 ep->blob_offset -= blob_offset;
3285 dout(30) << __func__ << " adjusted " << *ep << dendl;
3286 }
3287 }
3288 return rb;
3289}
3290
3291// Onode
3292
3293#undef dout_prefix
3294#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3295
f6b5b4d7
TL
3296//
3297 // A tricky thing about the Onode's ref counter is that we do an additional
3298 // increment when a newly pinned instance is detected, and a -1 on unpin.
3299 // This prevents a conflict with a delete call (when nref == 0).
3300 // The latter might happen while a thread is in the unpin() function
3301 // (e.g. waiting for lock acquisition) since nref has already been
3302 // decremented, and another thread 'putting' the instance would release it.
3303//
3304void BlueStore::Onode::get() {
3305 if (++nref == 2) {
3306 c->get_onode_cache()->pin(this, [&]() {
3307 bool was_pinned = pinned;
3308 pinned = nref >= 2;
3309 // additional increment for newly pinned instance
3310 bool r = !was_pinned && pinned;
3311 if (r) {
3312 ++nref;
3313 }
3314 return cached && r;
3315 });
3316 }
3317}
3318void BlueStore::Onode::put() {
3319 if (--nref == 2) {
3320 c->get_onode_cache()->unpin(this, [&]() {
3321 bool was_pinned = pinned;
3322 pinned = pinned && nref > 2; // intentionally use > not >= as we have
3323 // +1 due to pinned state
3324 bool r = was_pinned && !pinned;
3325 // additional decrement for newly unpinned instance
3326 if (r) {
3327 --nref;
3328 }
3329 return cached && r;
3330 });
3331 }
3332 if (nref == 0) {
3333 delete this;
3334 }
3335}
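// Worked nref trace for the pin/unpin trick above, assuming the onode map
// holds one reference and two handles A and B are taken and dropped:
//   cached, unpinned   nref == 1
//   A.get()            nref 1 -> 2, pin() fires, extra ref -> 3
//   B.get()            nref 3 -> 4
//   B.put()            nref 4 -> 3
//   A.put()            nref 3 -> 2, unpin() fires, extra ref dropped -> 1
// The pinned state itself holds a reference, so the delete (at nref == 0)
// cannot race with a thread that is still inside unpin().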
3336
eafe8130
TL
3337BlueStore::Onode* BlueStore::Onode::decode(
3338 CollectionRef c,
3339 const ghobject_t& oid,
3340 const string& key,
3341 const bufferlist& v)
3342{
3343 Onode* on = new Onode(c.get(), oid, key);
3344 on->exists = true;
3345 auto p = v.front().begin_deep();
3346 on->onode.decode(p);
3347 for (auto& i : on->onode.attrs) {
3348 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3349 }
3350
3351 // initialize extent_map
3352 on->extent_map.decode_spanning_blobs(p);
3353 if (on->onode.extent_map_shards.empty()) {
3354 denc(on->extent_map.inline_bl, p);
3355 on->extent_map.decode_some(on->extent_map.inline_bl);
3356 on->extent_map.inline_bl.reassign_to_mempool(
3357 mempool::mempool_bluestore_cache_other);
3358 }
3359 else {
3360 on->extent_map.init_shards(false, false);
3361 }
3362 return on;
3363}
3364
7c673cae
FG
3365void BlueStore::Onode::flush()
3366{
3367 if (flushing_count.load()) {
3368 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
9f95a23c 3369 waiting_count++;
11fdf7f2 3370 std::unique_lock l(flush_lock);
7c673cae
FG
3371 while (flushing_count.load()) {
3372 flush_cond.wait(l);
3373 }
9f95a23c 3374 waiting_count--;
7c673cae
FG
3375 }
3376 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3377}
3378
9f95a23c
TL
3379void BlueStore::Onode::dump(Formatter* f) const
3380{
3381 onode.dump(f);
3382 extent_map.dump(f);
3383}
3384
3385
3386const string& BlueStore::Onode::get_omap_prefix()
3387{
3388 if (onode.is_pgmeta_omap()) {
3389 return PREFIX_PGMETA_OMAP;
3390 }
3391 if (onode.is_perpool_omap()) {
3392 return PREFIX_PERPOOL_OMAP;
3393 }
3394 return PREFIX_OMAP;
3395}
3396
3397// '-' < '.' < '~'
3398
3399void BlueStore::Onode::get_omap_header(string *out)
3400{
3401 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3402 _key_encode_u64(c->pool(), out);
3403 }
3404 _key_encode_u64(onode.nid, out);
3405 out->push_back('-');
3406}
3407
3408void BlueStore::Onode::get_omap_key(const string& key, string *out)
3409{
3410 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3411 _key_encode_u64(c->pool(), out);
3412 }
3413 _key_encode_u64(onode.nid, out);
3414 out->push_back('.');
3415 out->append(key);
3416}
3417
3418void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3419{
3420 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3421 _key_encode_u64(c->pool(), out);
3422 }
3423 _key_encode_u64(onode.nid, out);
3424 out->append(old.c_str() + out->length(), old.size() - out->length());
3425}
3426
3427void BlueStore::Onode::get_omap_tail(string *out)
3428{
3429 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3430 _key_encode_u64(c->pool(), out);
3431 }
3432 _key_encode_u64(onode.nid, out);
3433 out->push_back('~');
3434}
3435
3436void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3437{
3438 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3439 *user_key = key.substr(sizeof(uint64_t)*2 + 1);
3440 } else {
3441 *user_key = key.substr(sizeof(uint64_t) + 1);
3442 }
3443}
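// Key layout produced by the omap helpers above (per-pool omap prepends an
// encoded pool id before the nid; both use _key_encode_u64, so keys sort
// numerically):
//
//   <nid> '-'              omap header
//   <nid> '.' <user key>   one entry per user key
//   <nid> '~'              tail / end marker
//
// Since '-' < '.' < '~' in ASCII (0x2d < 0x2e < 0x7e), every entry for a
// given nid sorts strictly between its header and tail, so a single KV
// range scan over [header, tail] visits exactly one object's omap.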
3444
3445
7c673cae
FG
3446// =======================================================
3447// WriteContext
3448
3449/// Checks for writes to the same pextent within a blob
3450bool BlueStore::WriteContext::has_conflict(
3451 BlobRef b,
3452 uint64_t loffs,
3453 uint64_t loffs_end,
3454 uint64_t min_alloc_size)
3455{
11fdf7f2
TL
3456 ceph_assert((loffs % min_alloc_size) == 0);
3457 ceph_assert((loffs_end % min_alloc_size) == 0);
7c673cae
FG
3458 for (auto w : writes) {
3459 if (b == w.b) {
11fdf7f2
TL
3460 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3461 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
7c673cae 3462 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 3463 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
3464 return true;
3465 }
3466 }
3467 }
3468 return false;
3469}
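// A sketch of the alignment math used by has_conflict() above, with
// hypothetical helpers mirroring p2align/p2roundup (power-of-two sizes
// assumed): each write is widened to min_alloc_size boundaries before the
// half-open interval overlap test.
#include <cstdint>

inline uint64_t align_down(uint64_t x, uint64_t a) { return x & ~(a - 1); }
inline uint64_t align_up(uint64_t x, uint64_t a) { return (x + a - 1) & ~(a - 1); }

// e.g. with min_alloc_size = 0x1000, a write at 0x1800~0x400 widens to
// [0x1000, 0x2000), so it conflicts with any other write touching that AU.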
3470
3471// =======================================================
3472
3473// DeferredBatch
3474#undef dout_prefix
3475#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
9f95a23c
TL
3476#undef dout_context
3477#define dout_context cct
7c673cae
FG
3478
3479void BlueStore::DeferredBatch::prepare_write(
3480 CephContext *cct,
3481 uint64_t seq, uint64_t offset, uint64_t length,
3482 bufferlist::const_iterator& blp)
3483{
3484 _discard(cct, offset, length);
3485 auto i = iomap.insert(make_pair(offset, deferred_io()));
11fdf7f2 3486 ceph_assert(i.second); // this should be a new insertion
7c673cae
FG
3487 i.first->second.seq = seq;
3488 blp.copy(length, i.first->second.bl);
31f18b77
FG
3489 i.first->second.bl.reassign_to_mempool(
3490 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3491 dout(20) << __func__ << " seq " << seq
3492 << " 0x" << std::hex << offset << "~" << length
3493 << " crc " << i.first->second.bl.crc32c(-1)
3494 << std::dec << dendl;
3495 seq_bytes[seq] += length;
3496#ifdef DEBUG_DEFERRED
3497 _audit(cct);
3498#endif
3499}
3500
3501void BlueStore::DeferredBatch::_discard(
3502 CephContext *cct, uint64_t offset, uint64_t length)
3503{
3504 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3505 << std::dec << dendl;
3506 auto p = iomap.lower_bound(offset);
3507 if (p != iomap.begin()) {
3508 --p;
3509 auto end = p->first + p->second.bl.length();
3510 if (end > offset) {
3511 bufferlist head;
3512 head.substr_of(p->second.bl, 0, offset - p->first);
3513 dout(20) << __func__ << " keep head " << p->second.seq
3514 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3515 << " -> 0x" << head.length() << std::dec << dendl;
3516 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3517 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3518 if (end > offset + length) {
3519 bufferlist tail;
3520 tail.substr_of(p->second.bl, offset + length - p->first,
3521 end - (offset + length));
3522 dout(20) << __func__ << " keep tail " << p->second.seq
3523 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3524 << " -> 0x" << tail.length() << std::dec << dendl;
3525 auto &n = iomap[offset + length];
3526 n.bl.swap(tail);
3527 n.seq = p->second.seq;
3528 i->second -= length;
3529 } else {
3530 i->second -= end - offset;
3531 }
11fdf7f2 3532 ceph_assert(i->second >= 0);
7c673cae
FG
3533 p->second.bl.swap(head);
3534 }
3535 ++p;
3536 }
3537 while (p != iomap.end()) {
3538 if (p->first >= offset + length) {
3539 break;
3540 }
3541 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3542 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3543 auto end = p->first + p->second.bl.length();
3544 if (end > offset + length) {
3545 unsigned drop_front = offset + length - p->first;
3546 unsigned keep_tail = end - (offset + length);
3547 dout(20) << __func__ << " truncate front " << p->second.seq
3548 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3549 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3550 << " to 0x" << (offset + length) << "~" << keep_tail
3551 << std::dec << dendl;
3552 auto &s = iomap[offset + length];
3553 s.seq = p->second.seq;
3554 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3555 i->second -= drop_front;
3556 } else {
3557 dout(20) << __func__ << " drop " << p->second.seq
3558 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3559 << std::dec << dendl;
3560 i->second -= p->second.bl.length();
3561 }
11fdf7f2 3562 ceph_assert(i->second >= 0);
7c673cae
FG
3563 p = iomap.erase(p);
3564 }
3565}
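// Worked example of the trimming above: given a buffered write at
// 0x0~0x4000 under seq 5, _discard(0x1000, 0x2000) produces
//   head: the 0x0 entry is cut down to 0x0~0x1000
//   tail: a new entry is inserted at 0x3000~0x1000, still under seq 5
//   seq_bytes[5] -= 0x2000
// so iomap never holds overlapping ranges.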
3566
3567void BlueStore::DeferredBatch::_audit(CephContext *cct)
3568{
3569 map<uint64_t,int> sb;
3570 for (auto p : seq_bytes) {
3571 sb[p.first] = 0; // make sure we have the same set of keys
3572 }
3573 uint64_t pos = 0;
3574 for (auto& p : iomap) {
11fdf7f2 3575 ceph_assert(p.first >= pos);
7c673cae
FG
3576 sb[p.second.seq] += p.second.bl.length();
3577 pos = p.first + p.second.bl.length();
3578 }
11fdf7f2 3579 ceph_assert(sb == seq_bytes);
7c673cae
FG
3580}
3581
3582
// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
  : CollectionImpl(store_->cct, cid),
    store(store_),
    cache(bc),
    exists(true),
    onode_map(oc),
    commit_queue(nullptr)
{
}

bool BlueStore::Collection::flush_commit(Context *c)
{
  return osr->flush_commit(c);
}

void BlueStore::Collection::flush()
{
  osr->flush();
}

void BlueStore::Collection::flush_all_but_last()
{
  osr->flush_all_but_last();
}

void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  ceph_assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}

void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {
    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      ceph_abort_msg("uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    auto p = v.cbegin();
    decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}

void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  ceph_assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}

uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  ceph_assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}

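// Look up (and optionally instantiate) the in-memory Onode for an object.
// The caller must hold the collection lock: a write lock when create=true,
// at least a read lock otherwise.  When is_createop is set the kv lookup
// is skipped entirely since the object is known not to exist yet;
// otherwise an onode-cache miss falls through to a PREFIX_OBJ read.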
BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create,
  bool is_createop)
{
  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = -ENOENT;
  Onode *on;
  if (!is_createop) {
    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  }
  if (v.length() == 0) {
    ceph_assert(r == -ENOENT);
    if (!store->cct->_conf->bluestore_debug_misc &&
        !create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    ceph_assert(r >= 0);
    on = Onode::decode(this, oid, key, v);
  }
  o.reset(on);
  return onode_map.add(oid, o);
}

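// After a PG split, migrate the onodes, shared blobs, and buffers that now
// belong to the child collection into dest's cache shards.  Both shard
// locks are taken together via std::lock to avoid deadlock when two splits
// race in opposite directions; membership in the child is decided by
// matching each oid against the child's cnode.bits and pgid.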
void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  // lock (one or both) cache shards
  std::lock(cache->lock, dest->cache->lock);
  std::lock_guard l(cache->lock, std::adopt_lock);
  std::lock_guard l2(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  ceph_assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    OnodeRef o = p->second;
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
                            << dendl;
      ++p;
    } else {
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      // ensuring that nref is always >= 2 and hence onode is pinned and
      // physically out of cache during the transition
      OnodeRef o_pin = o;
      ceph_assert(o->pinned);

      p = onode_map.onode_map.erase(p);
      dest->onode_map.onode_map[o->oid] = o;
      if (get_onode_cache() != dest->get_onode_cache()) {
        get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
      }
      o->c = dest;

      // move over shared blobs and buffers.  cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << " moving " << *i.second
                                    << dendl;
              dest->cache->_move(cache, i.second.get());
            }
          }
        }
      }
    }
  }
  dest->cache->_trim();
}

// =======================================================

// MempoolThread

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
#undef dout_context
#define dout_context store->cct

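// The autotuned cache ceiling computed below reserves osd_memory_base for
// the OSD itself plus a fragmentation allowance that scales with the
// target:
//
//   max = (1 - osd_memory_expected_fragmentation) * osd_memory_target
//         - osd_memory_base   (when that exceeds base + cache_min)
//
// e.g., with illustrative values of target = 4 GiB, base = 768 MiB and
// expected fragmentation 0.15, ltarget is ~3.4 GiB and max is ~2.65 GiB.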
void *BlueStore::MempoolThread::entry()
{
  std::unique_lock l{lock};

  uint32_t prev_config_change = store->config_changed.load();
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t target = store->osd_memory_target;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;

  // When setting the maximum amount of memory to use for cache, first
  // assume some base amount of memory for the OSD and then fudge in
  // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  binned_kv_cache = store->db->get_priority_cache();
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true);
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();
  utime_t next_deferred_force_submit = ceph_clock_now();
  utime_t alloc_stats_dump_clock = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;
    double max_defer_interval = store->max_defer_interval;

    double alloc_stats_dump_interval =
      store->cct->_conf->bluestore_alloc_stats_dump_interval;

    if (alloc_stats_dump_interval > 0 &&
        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
      store->_record_allocation_stats();
      alloc_stats_dump_clock = ceph_clock_now();
    }
    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      _adjust_cache_settings();

      // Log events at 5 instead of 20 when balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    if (max_defer_interval > 0 &&
        next_deferred_force_submit < ceph_clock_now()) {
      if (store->get_deferred_last_submitted() + max_defer_interval <
          ceph_clock_now()) {
        store->deferred_try_submit();
      }
      next_deferred_force_submit = ceph_clock_now();
      next_deferred_force_submit += max_defer_interval/3;
    }

    // Now resize the shards
    _resize_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  // do final dump
  store->_record_allocation_stats();
  stop = false;
  return NULL;
}

void BlueStore::MempoolThread::_adjust_cache_settings()
{
  if (binned_kv_cache != nullptr) {
    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
  }
  meta_cache->set_cache_ratio(store->cache_meta_ratio);
  data_cache->set_cache_ratio(store->cache_data_ratio);
}

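// Translate the current per-cache byte allocations into per-shard limits.
// Onode shards are capped by onode count, so the meta budget is divided
// across the shards and then by meta_cache's reported bytes-per-onode;
// buffer shards are capped in bytes directly.  e.g., an illustrative 1 GiB
// meta allocation across 8 shards at ~4 KiB per onode caps each shard at
// roughly 32k onodes.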
void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
  size_t onode_shards = store->onode_cache_shards.size();
  size_t buffer_shards = store->buffer_cache_shards.size();
  int64_t kv_used = store->db->get_cache_usage();
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
  }

  if (interval_stats) {
    dout(5) << __func__ << " cache_size: " << cache_size
            << " kv_alloc: " << kv_alloc
            << " kv_used: " << kv_used
            << " meta_alloc: " << meta_alloc
            << " meta_used: " << meta_used
            << " data_alloc: " << data_alloc
            << " data_used: " << data_used << dendl;
  } else {
    dout(20) << __func__ << " cache_size: " << cache_size
             << " kv_alloc: " << kv_alloc
             << " kv_used: " << kv_used
             << " meta_alloc: " << meta_alloc
             << " meta_used: " << meta_used
             << " data_alloc: " << data_alloc
             << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);

  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
           << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->onode_cache_shards) {
    i->set_max(max_shard_onodes);
  }
  for (auto i : store->buffer_cache_shards) {
    i->set_max(max_shard_buffer);
  }
}
void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  dout(5) << __func__ << " updated pcm target: " << target
          << " pcm min: " << min
          << " pcm max: " << max
          << dendl;
}

// =======================================================

// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  std::shared_lock l(c->lock);
  if (o->onode.has_omap()) {
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
  }
}
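// The underlying kv iterator is confined to this onode's slice of the omap
// keyspace: head is the encoded key for an empty user key and tail is the
// onode's end-of-omap sentinel, so valid() only reports true while the raw
// key is still below tail.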

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after +
        _stringify();
    }
  );
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to +
        _stringify();
    }
  );
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  std::shared_lock l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  o->decode_omap_key(db_key, &user_key);

  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}


// =====================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "
#undef dout_context
#define dout_context cct


static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

static void discard_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  store->handle_discard(*tmp);
}

void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << dendl;
  ceph_assert(alloc);
  alloc->release(to_release);
}

BlueStore::BlueStore(CephContext *cct, const string& path)
  : BlueStore(cct, path, 0) {}

BlueStore::BlueStore(CephContext *cct,
                     const string& path,
                     uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle(cct),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);
  _shutdown_logger();
  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : onode_cache_shards) {
    delete i;
  }
  for (auto i : buffer_cache_shards) {
    delete i;
  }
  onode_cache_shards.clear();
  buffer_cache_shards.clear();
}

const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_warn_on_legacy_statfs",
    "bluestore_warn_on_no_per_pool_omap",
    "bluestore_max_defer_interval",
    NULL
  };
  return KEYS;
}

void BlueStore::handle_conf_change(const ConfigProxy& conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }
  if (changed.count("bluestore_warn_on_no_per_pool_omap")) {
    _check_no_per_pool_omap_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes") ||
      changed.count("bluestore_throttle_deferred_bytes") ||
      changed.count("bluestore_throttle_trace_rate")) {
    throttle.reset_throttle(conf);
  }
  if (changed.count("bluestore_max_defer_interval")) {
    if (bdev) {
      _set_max_defer_interval();
    }
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}

void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
           << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << " min_blob " << comp_min_blob_size
           << " max_blob " << comp_max_blob_size
           << dendl;
}

void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}

void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
           << dendl;
}

void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}

void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  config_changed++;
  dout(10) << __func__
           << " osd_memory_target " << osd_memory_target
           << " osd_memory_base " << osd_memory_base
           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
           << " osd_memory_cache_min " << osd_memory_cache_min
           << dendl;
}

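// Carve the global cache_size into kv, meta, and data slices.  Each
// configured ratio must lie in [0, 1] and the two together must not exceed
// 1; whatever remains goes to data.  e.g., with illustrative ratios of
// meta 0.5 and kv 0.25, data receives the remaining 0.25.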
int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }

  cache_data_ratio =
    (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}

int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}

void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
                        l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
                 "Average kv_thread flush latency",
                 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
                 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
                 "Average kv_sync thread latency",
                 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
                 "Average kv_finalize thread latency",
                 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
                 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
                 "Average aio_wait state latency",
                 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
                 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
                 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
                 "Average kv_committing state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
                 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
                 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
                 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
                 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
                 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
                 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
                 "Average submit throttle latency",
                 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
                 "Average submit latency",
                 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
                 "Average commit latency",
                 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
                 "Average read latency",
                 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
                 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
                 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
                 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
                 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
                 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
                    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
                    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
                    "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
                    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
                    "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
                    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
            "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
            "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
            "Sum for stored compressed bytes",
            "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
            "Sum for bytes allocated for compressed data",
            "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
            "Sum for original bytes that were compressed",
            "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
            "Number of onodes in cache");
  b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
            "Number of pinned onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
                    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
                    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
                    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
                    "bluestore_onode_shard_misses",
                    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
            "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
            "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
            "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
            "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
                    "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
                    "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
                    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
                    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
                    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
                    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
                    "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
                    "bluestore_write_small_unused",
                    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_small_deferred,
                    "bluestore_write_small_deferred",
                    "Small overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
                    "bluestore_write_small_pre_read",
                    "Small writes that required we read some data (possibly "
                    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
                    "Small write into new (sparse) blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
                    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
                    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
                    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
                    "Sum for extents that have been merged due to garbage "
                    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
                    "Read EIO errors propagated to high level callers");
  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
                    "Read operations that required at least one retry due to failed checksum validation");
  b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
            "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
                 "Average omap iterator seek_to_first call latency");
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
                 "Average omap iterator upper_bound call latency");
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
                 "Average omap iterator lower_bound call latency");
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
                 "Average omap iterator next call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
                 "Average collection listing latency");
  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}

int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}

int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}

int BlueStore::_open_path()
{
  // sanity check(s)
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}

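// The bdev label occupies the first BDEV_LABEL_BLOCK_SIZE bytes of the
// device: the encoded bluestore_bdev_label_t followed by a crc32c (seeded
// with -1) over the encoded bytes, zero-padded to the full block.  The
// reader below recomputes the crc over the decoded span and returns -EIO
// on a mismatch.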
int BlueStore::_write_bdev_label(CephContext *cct,
                                 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
         << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}

int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
            << ": " << e.what()
            << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}

int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
               << " and fsid " << fsid << " check bypassed" << dendl;
    } else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
           << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}

void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
           << std::dec << " order " << (int)min_alloc_size_order
           << " max_alloc_size 0x" << std::hex << max_alloc_size
           << " prefer_deferred_size 0x" << prefer_deferred_size
           << std::dec
           << " deferred_batch_ops " << deferred_batch_ops
           << dendl;
}

int BlueStore::_open_bdev(bool create)
{
  ceph_assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (create && cct->_conf->bdev_enable_discard) {
    bdev->discard(0, bdev->get_size());
  }

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  ceph_assert(block_size == 1u << block_size_order);
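  // e.g., an illustrative 4 KiB block size yields order 12 and a mask that
  // clears the low 12 bits; the assert guarantees block_size is a power of
  // two, which the mask/order arithmetic above relies on.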
  _set_max_defer_interval();
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0) {
    goto fail_close;
  }
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}

void BlueStore::_validate_bdev()
{
  ceph_assert(bdev);
  ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
  uint64_t dev_size = bdev->get_size();
  if (dev_size <
      _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
    dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
            << " is too small, disable bluestore_bluefs_min for now"
            << dendl;
    ceph_assert(dev_size >= _get_ondisk_reserved());

    int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
    ceph_assert(r == 0);
  }
}

void BlueStore::_close_bdev()
{
  ceph_assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}

int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
{
  int r;
  bluestore_bdev_label_t label;

  ceph_assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
  ceph_assert(fm);
  if (t) {
    // create mode.  initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    ceph_assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
    fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);

    // allocate superblock reserved space.  note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs_extents.
    auto reserved = _get_ondisk_reserved();
    fm->allocate(0, reserved, t);

    if (cct->_conf->bluestore_bluefs) {
      ceph_assert(bluefs_extents.num_intervals() == 1);
      interval_set<uint64_t>::iterator p = bluefs_extents.begin();
      reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
      dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
               << " for bluefs" << dendl;
    }

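    // bluestore_debug_prefill deliberately fragments a fresh device for
    // testing: walking from the reserved area to the end of the device it
    // alternates a random free extent l (up to
    // bluestore_debug_prefragment_max) with an allocated extent u sized so
    // that used/(used+free) approximates the requested fill ratio, i.e.
    // u ~= l * r/(1-r) rounded up to min_alloc_size.  e.g., an illustrative
    // prefill of 0.2 makes each used extent about a quarter of the
    // preceding free one.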
    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
              << cct->_conf->bluestore_debug_prefill << " with max free extent "
              << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = p2roundup(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
        uint64_t l = (rand() % max_b + 1) * min_alloc_size;
        if (start + l > end) {
          l = end - start;
          l = p2align(l, min_alloc_size);
        }
        ceph_assert(start + l <= end);

        uint64_t u = 1 + (uint64_t)(r * (double)l);
        u = p2roundup(u, min_alloc_size);
        if (start + l + u > end) {
          u = end - (start + l);
          // trim to align so we don't overflow again
          u = p2align(u, min_alloc_size);
          stop = true;
        }
        ceph_assert(start + l + u <= end);

        dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
                 << " use 0x" << u << std::dec << dendl;

        if (u == 0) {
          // break if u has been trimmed to nothing
          break;
        }

        fm->allocate(start + l, u, t);
        start += l + u;
      }
    }
    r = _write_out_fm_meta(0, false, &label);
    ceph_assert(r == 0);
  } else {
    string p = path + "/block";
    r = _read_bdev_label(cct, p, &label);
    if (r < 0) {
      derr << __func__ << " freelist init failed, error reading bdev label: " << cpp_strerror(r) << dendl;
      delete fm;
      fm = NULL;
      return r;
    }
  }
  r = fm->init(label, db, read_only);
  if (r < 0) {
    derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
    delete fm;
    fm = NULL;
    return r;
  }
  // if the space size tracked by the freelist manager is higher than the
  // actual device size one can hit an out-of-space allocation which will
  // result in data loss and/or assertions.
  // Probably the user altered the device size somehow.
  // The only fix for now is to redeploy the OSD.
  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
    ostringstream ss;
    ss << "slow device size mismatch detected, "
       << " fm size(" << fm->get_size()
       << ") > slow device size(" << bdev->get_size()
       << "), Please stop using this OSD as it might cause data loss.";
    _set_disk_size_mismatch_alert(ss.str());
  }
  return 0;
}

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  ceph_assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}

1911f103
TL
5105int BlueStore::_write_out_fm_meta(uint64_t target_size,
5106 bool update_root_size,
5107 bluestore_bdev_label_t* res_label)
5108{
5109 string p = path + "/block";
5110
5111 std::vector<std::pair<string, string>> fm_meta;
5112 fm->get_meta(target_size, &fm_meta);
5113
5114 bluestore_bdev_label_t label;
5115 int r = _read_bdev_label(cct, p, &label);
5116 if (r < 0)
5117 return r;
5118
5119 for (auto& m : fm_meta) {
5120 label.meta[m.first] = m.second;
5121 }
5122 if (update_root_size) {
5123 label.size = target_size;
5124 }
5125 r = _write_bdev_label(cct, p, label);
5126 if (res_label) {
5127 *res_label = label;
5128 }
5129
5130 return r;
5131}
5132
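// Annotation: _open_alloc() rebuilds the in-memory allocator from the
// persistent freelist: every extent the FreelistManager enumerates is added
// as free, then the extents currently owned by BlueFS are taken back out,
// since BlueFS manages those itself.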
5133int BlueStore::_open_alloc()
5134{
5135 ceph_assert(alloc == NULL);
5136 ceph_assert(bdev->get_size());
5137
5138 if (bluefs) {
5139 bluefs_extents.clear();
5140 auto r = bluefs->get_block_extents(bluefs_layout.shared_bdev,
5141 &bluefs_extents);
5142 if (r < 0) {
5143 lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
5144 << cpp_strerror(r) << dendl;
5145
5146 return r;
5147 }
5148 dout(10) << __func__ << " bluefs extents 0x"
5149 << std::hex << bluefs_extents << std::dec
5150 << dendl;
5151 }
5152
5153 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
5154 bdev->get_size(),
eafe8130 5155 min_alloc_size, "block");
5156 if (!alloc) {
5157 lderr(cct) << __func__ << " Allocator::unknown alloc type "
5158 << cct->_conf->bluestore_allocator
5159 << dendl;
5160 return -EINVAL;
5161 }
5162
5163 uint64_t num = 0, bytes = 0;
5164
5165 dout(1) << __func__ << " opening allocation metadata" << dendl;
5166 // initialize from freelist
5167 fm->enumerate_reset();
5168 uint64_t offset, length;
11fdf7f2 5169 while (fm->enumerate_next(db, &offset, &length)) {
5170 alloc->init_add_free(offset, length);
5171 ++num;
5172 bytes += length;
5173 }
224ce89b 5174 fm->enumerate_reset();
5175
5176 // also mark bluefs space as allocated
5177 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5178 alloc->init_rm_free(e.get_start(), e.get_len());
5179 }
7c673cae 5180
5181 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
5182 << " in " << num << " extents"
5183 << " available " << byte_u_t(alloc->get_free())
5184 << dendl;
5185
5186 return 0;
5187}
5188
5189void BlueStore::_close_alloc()
5190{
5191 ceph_assert(bdev);
5192 bdev->discard_drain();
5193
5194 ceph_assert(alloc);
5195 alloc->shutdown();
5196 delete alloc;
5197 alloc = NULL;
11fdf7f2 5198 bluefs_extents.clear();
5199}
5200
5201int BlueStore::_open_fsid(bool create)
5202{
11fdf7f2 5203 ceph_assert(fsid_fd < 0);
91327a77 5204 int flags = O_RDWR|O_CLOEXEC;
5205 if (create)
5206 flags |= O_CREAT;
5207 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
5208 if (fsid_fd < 0) {
5209 int err = -errno;
5210 derr << __func__ << " " << cpp_strerror(err) << dendl;
5211 return err;
5212 }
5213 return 0;
5214}
5215
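// Annotation: the canonical text form of a uuid is 36 characters and
// _write_fsid() below appends a newline, hence the 40-byte buffer and the
// truncation at offset 36 before parsing.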
5216int BlueStore::_read_fsid(uuid_d *uuid)
5217{
5218 char fsid_str[40];
5219 memset(fsid_str, 0, sizeof(fsid_str));
5220 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5221 if (ret < 0) {
5222 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5223 return ret;
5224 }
5225 if (ret > 36)
5226 fsid_str[36] = 0;
5227 else
5228 fsid_str[ret] = 0;
5229 if (!uuid->parse(fsid_str)) {
5230 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5231 return -EINVAL;
5232 }
5233 return 0;
5234}
5235
5236int BlueStore::_write_fsid()
5237{
5238 int r = ::ftruncate(fsid_fd, 0);
5239 if (r < 0) {
5240 r = -errno;
5241 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5242 return r;
5243 }
5244 string str = stringify(fsid) + "\n";
5245 r = safe_write(fsid_fd, str.c_str(), str.length());
5246 if (r < 0) {
5247 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5248 return r;
5249 }
5250 r = ::fsync(fsid_fd);
5251 if (r < 0) {
5252 r = -errno;
5253 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5254 return r;
5255 }
5256 return 0;
5257}
5258
5259void BlueStore::_close_fsid()
5260{
5261 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5262 fsid_fd = -1;
5263}
5264
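// Annotation: this takes an advisory write lock (fcntl F_SETLK) on the fsid
// file.  The kernel releases the lock automatically when the process exits,
// which is how a second ceph-osd opening the same store is detected and
// refused.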
5265int BlueStore::_lock_fsid()
5266{
5267 struct flock l;
5268 memset(&l, 0, sizeof(l));
5269 l.l_type = F_WRLCK;
5270 l.l_whence = SEEK_SET;
5271 int r = ::fcntl(fsid_fd, F_SETLK, &l);
5272 if (r < 0) {
5273 int err = errno;
5274 derr << __func__ << " failed to lock " << path << "/fsid"
5275 << " (is another ceph-osd still running?) "
5276 << cpp_strerror(err) << dendl;
5277 return -err;
5278 }
5279 return 0;
5280}
5281
5282bool BlueStore::is_rotational()
5283{
5284 if (bdev) {
5285 return bdev->is_rotational();
5286 }
5287
5288 bool rotational = true;
5289 int r = _open_path();
5290 if (r < 0)
5291 goto out;
5292 r = _open_fsid(false);
5293 if (r < 0)
5294 goto out_path;
5295 r = _read_fsid(&fsid);
5296 if (r < 0)
5297 goto out_fsid;
5298 r = _lock_fsid();
5299 if (r < 0)
5300 goto out_fsid;
5301 r = _open_bdev(false);
5302 if (r < 0)
5303 goto out_fsid;
5304 rotational = bdev->is_rotational();
5305 _close_bdev();
5306 out_fsid:
5307 _close_fsid();
5308 out_path:
5309 _close_path();
5310 out:
5311 return rotational;
5312}
5313
5314bool BlueStore::is_journal_rotational()
5315{
5316 if (!bluefs) {
5317 dout(5) << __func__ << " bluefs disabled, default to store media type"
5318 << dendl;
5319 return is_rotational();
5320 }
5321 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
5322 return bluefs->wal_is_rotational();
5323}
5324
5325bool BlueStore::_use_rotational_settings()
5326{
5327 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
5328 return true;
5329 }
5330 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
5331 return false;
5332 }
5333 return bdev->is_rotational();
5334}
5335
5336bool BlueStore::test_mount_in_use()
5337{
5338 // most error conditions mean the mount is not in use (e.g., because
5339 // it doesn't exist). only if we fail to lock do we conclude it is
5340 // in use.
5341 bool ret = false;
5342 int r = _open_path();
5343 if (r < 0)
5344 return false;
5345 r = _open_fsid(false);
5346 if (r < 0)
5347 goto out_path;
5348 r = _lock_fsid();
5349 if (r < 0)
5350 ret = true; // if we can't lock, it is in use
5351 _close_fsid();
5352 out_path:
5353 _close_path();
5354 return ret;
5355}
5356
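// Annotation: _minimal_open_bluefs() opens just enough of BlueFS to reach
// its block devices: it probes the optional block.db and block.wal symlinks,
// always adds the shared "block" device, and at mkfs time carves out the
// initial BlueFS extents.  bluefs_layout.shared_bdev records which BlueFS
// device aliases the main BlueStore device.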
11fdf7f2 5357int BlueStore::_minimal_open_bluefs(bool create)
5358{
5359 int r;
11fdf7f2 5360 bluefs = new BlueFS(cct);
7c673cae 5361
5362 string bfn;
5363 struct stat st;
5364
5365 bfn = path + "/block.db";
5366 if (::stat(bfn.c_str(), &st) == 0) {
5367 r = bluefs->add_block_device(
5368 BlueFS::BDEV_DB, bfn,
5369 create && cct->_conf->bdev_enable_discard);
7c673cae 5370 if (r < 0) {
5371 derr << __func__ << " add block device(" << bfn << ") returned: "
5372 << cpp_strerror(r) << dendl;
5373 goto free_bluefs;
7c673cae 5374 }
7c673cae 5375
5376 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
5377 r = _check_or_set_bdev_label(
5378 bfn,
5379 bluefs->get_block_device_size(BlueFS::BDEV_DB),
5380 "bluefs db", create);
5381 if (r < 0) {
5382 derr << __func__
5383 << " check block device(" << bfn << ") label returned: "
5384 << cpp_strerror(r) << dendl;
5385 goto free_bluefs;
5386 }
7c673cae 5387 }
5388 if (create) {
5389 bluefs->add_block_extent(
5390 BlueFS::BDEV_DB,
5391 SUPER_RESERVED,
5392 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
5393 }
5394 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
5395 bluefs_layout.dedicated_db = true;
5396 } else {
5397 r = -errno;
5398 if (::lstat(bfn.c_str(), &st) == -1) {
5399 r = 0;
9f95a23c 5400 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7c673cae 5401 } else {
5402 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5403 << cpp_strerror(r) << dendl;
5404 goto free_bluefs;
5405 }
5406 }
7c673cae 5407
5408 // shared device
5409 bfn = path + "/block";
5410 // never trim here
9f95a23c 5411 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
5412 true /* shared with bluestore */);
5413 if (r < 0) {
5414 derr << __func__ << " add block device(" << bfn << ") returned: "
5415 << cpp_strerror(r) << dendl;
5416 goto free_bluefs;
5417 }
5418 if (create) {
5419 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
5420 uint64_t initial =
5421 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
5422 cct->_conf->bluestore_bluefs_gift_ratio);
5423 initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
5424 uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
5425 if (alloc_size % min_alloc_size) {
5426 derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
5427 << alloc_size << " is not a multiple of "
5428 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
5429 r = -EINVAL;
5430 goto free_bluefs;
7c673cae 5431 }
11fdf7f2 5432 // align to bluefs's alloc_size
eafe8130 5433 initial = p2roundup(initial, alloc_size);
11fdf7f2 5434 // put bluefs in the middle of the device in case it is an HDD
eafe8130 5435 uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
11fdf7f2 5436 // avoid overwriting the superblock
5437 start = std::max(alloc_size, start);
5438 ceph_assert(start >=_get_ondisk_reserved());
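// Annotation (illustrative numbers, not a fixed layout): a 1 TiB shared
// device with initial = 8 GiB and alloc_size = 64 KiB yields
// start = p2align((1 TiB - 8 GiB) / 2, 64 KiB), i.e. bluefs lands near the
// midpoint of the device, keeping average seek distance low on an HDD.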
7c673cae 5439
9f95a23c 5440 bluefs->add_block_extent(bluefs_layout.shared_bdev, start, initial);
5441 bluefs_extents.insert(start, initial);
5442 ++out_of_sync_fm;
5443 }
5444
5445 bfn = path + "/block.wal";
5446 if (::stat(bfn.c_str(), &st) == 0) {
5447 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
eafe8130 5448 create && cct->_conf->bdev_enable_discard);
5449 if (r < 0) {
5450 derr << __func__ << " add block device(" << bfn << ") returned: "
5451 << cpp_strerror(r) << dendl;
5452 goto free_bluefs;
5453 }
7c673cae 5454
5455 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
5456 r = _check_or_set_bdev_label(
5457 bfn,
5458 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
5459 "bluefs wal", create);
7c673cae 5460 if (r < 0) {
5461 derr << __func__ << " check block device(" << bfn
5462 << ") label returned: " << cpp_strerror(r) << dendl;
5463 goto free_bluefs;
5464 }
5465 }
5466
5467 if (create) {
5468 bluefs->add_block_extent(
5469 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
5470 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
5471 BDEV_LABEL_BLOCK_SIZE);
5472 }
9f95a23c 5473 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
5474 } else {
5475 r = 0;
5476 if (::lstat(bfn.c_str(), &st) != -1) {
5477 r = -errno;
5478 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5479 << cpp_strerror(r) << dendl;
5480 goto free_bluefs;
5481 }
5482 }
5483 return 0;
7c673cae 5484
5485free_bluefs:
5486 ceph_assert(bluefs);
5487 delete bluefs;
5488 bluefs = NULL;
5489 return r;
5490}
7c673cae 5491
5492int BlueStore::_open_bluefs(bool create)
5493{
5494 int r = _minimal_open_bluefs(create);
5495 if (r < 0) {
5496 return r;
5497 }
5498 RocksDBBlueFSVolumeSelector* vselector = nullptr;
5499 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
5500
5501 string options = cct->_conf->bluestore_rocksdb_options;
5502
5503 rocksdb::Options rocks_opts;
5504 int r = RocksDBStore::ParseOptionsFromStringStatic(
5505 cct,
5506 options,
5507 rocks_opts,
5508 nullptr);
5509 if (r < 0) {
5510 return r;
5511 }
5512
5513 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
5514 vselector =
5515 new RocksDBBlueFSVolumeSelector(
5516 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
5517 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
5518 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
5519 1024 * 1024 * 1024, //FIXME: set expected l0 size here
5520 rocks_opts.max_bytes_for_level_base,
5521 rocks_opts.max_bytes_for_level_multiplier,
5522 reserved_factor,
5523 cct->_conf->bluestore_volume_selection_reserved,
5524 cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
5525 }
11fdf7f2 5526 if (create) {
9f95a23c 5527 bluefs->mkfs(fsid, bluefs_layout);
11fdf7f2 5528 }
9f95a23c 5529 bluefs->set_volume_selector(vselector);
5530 r = bluefs->mount();
5531 if (r < 0) {
5532 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
5533 }
9f95a23c 5534 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
5535 return r;
5536}
5537
1911f103 5538void BlueStore::_close_bluefs(bool cold_close)
11fdf7f2 5539{
1911f103 5540 bluefs->umount(cold_close);
5541 _minimal_close_bluefs();
5542}
5543
5544void BlueStore::_minimal_close_bluefs()
5545{
5546 delete bluefs;
5547 bluefs = NULL;
5548}
5549
5550int BlueStore::_is_bluefs(bool create, bool* ret)
5551{
5552 if (create) {
5553 *ret = cct->_conf->bluestore_bluefs;
5554 } else {
5555 string s;
5556 int r = read_meta("bluefs", &s);
5557 if (r < 0) {
5558 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
5559 return -EIO;
5560 }
5561 if (s == "1") {
5562 *ret = true;
5563 } else if (s == "0") {
5564 *ret = false;
31f18b77 5565 } else {
5566 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
5567 << dendl;
5568 return -EIO;
5569 }
5570 }
5571 return 0;
5572}
5573
5574/*
5575* opens both DB and dependent super_meta, FreelistManager and allocator
5576* in the proper order
5577*/
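// Annotation: in the bluefs case the rough sequence below is: open the db
// read-only -> _open_super_meta() -> _open_fm() -> _open_alloc() -> and,
// unless a read-only open was requested, close and reopen the db
// read/write followed by fm->sync(db).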
5578int BlueStore::_open_db_and_around(bool read_only)
5579{
5580 int r;
5581 bool do_bluefs = false;
5582 _is_bluefs(false, &do_bluefs); // ignore err code
5583 if (do_bluefs) {
5584 // open in read-only first to read FM list and init allocator
5585 // as they might be needed for some BlueFS procedures
5586 r = _open_db(false, false, true);
5587 if (r < 0)
5588 return r;
5589
5590 r = _open_super_meta();
5591 if (r < 0) {
5592 goto out_db;
5593 }
5594
1911f103 5595 r = _open_fm(nullptr, true);
5596 if (r < 0)
5597 goto out_db;
5598
5599 r = _open_alloc();
5600 if (r < 0)
5601 goto out_fm;
5602
5603 // now open in R/W mode
5604 if (!read_only) {
1911f103 5605 _close_db(true);
5606
5607 r = _open_db(false, false, false);
5608 if (r < 0) {
5609 _close_alloc();
5610 _close_fm();
5611 return r;
28e407b8 5612 }
1911f103 5613 fm->sync(db);
7c673cae 5614 }
5615 } else {
5616 r = _open_db(false, false);
5617 if (r < 0) {
5618 return r;
5619 }
5620 r = _open_super_meta();
5621 if (r < 0) {
5622 goto out_db;
5623 }
7c673cae 5624
1911f103 5625 r = _open_fm(nullptr, false);
5626 if (r < 0)
5627 goto out_db;
5628
5629 r = _open_alloc();
5630 if (r < 0)
5631 goto out_fm;
5632 }
5633 return 0;
5634
5635 out_fm:
5636 _close_fm();
5637 out_db:
1911f103 5638 _close_db(read_only);
5639 return r;
5640}
5641
1911f103 5642void BlueStore::_close_db_and_around(bool read_only)
5643{
5644 if (bluefs) {
1911f103 5645 if (!read_only && out_of_sync_fm.fetch_and(0)) {
5646 _sync_bluefs_and_fm();
5647 }
5648 _close_db(read_only);
5649 while(!read_only && out_of_sync_fm.fetch_and(0)) {
5650 // if we saw some allocations during close, repeat: open_db, sync fm, close
5651 dout(0) << __func__ << " syncing FreelistManager" << dendl;
5652 int r = _open_db(false, false, false);
5653 if (r < 0) {
5654 derr << __func__
5655 << " unable to open db, FreelistManager is probably out of sync"
5656 << dendl;
5657 break;
5658 }
5659 _sync_bluefs_and_fm();
1911f103 5660 _close_db(false);
7c673cae 5661 }
5662 if (!_kv_only) {
5663 _close_alloc();
5664 _close_fm();
5665 }
5666 } else {
5667 _close_alloc();
5668 _close_fm();
1911f103 5669 _close_db(read_only);
5670 }
5671}
5672
5673// updates legacy bluefs-related records in the DB to a state valid for
5674// downgrades from nautilus.
5675void BlueStore::_sync_bluefs_and_fm()
5676{
5677 if (cct->_conf->bluestore_bluefs_db_compatibility) {
5678 bufferlist bl;
5679 encode(bluefs_extents, bl);
5680 dout(20) << __func__ << " bluefs_extents at KV is now 0x"
5681 << std::hex << bluefs_extents << std::dec
5682 << dendl;
5683 KeyValueDB::Transaction synct = db->get_transaction();
5684 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
5685 synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
5686
5687 // The nice thing is that we don't need to update the FreelistManager here.
5688 // It always has the corresponding bits set to 'Free' for both Nautilus+ and
5689 // pre-Nautilus releases.
5690 // So once an extent gets into bluefs_extents, this means it has
5691 // been freed in the allocator and hence it's free in the FM too.
5692
5693 db->submit_transaction_sync(synct);
5694 }
5695}
5696
5697int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
5698{
5699 int r;
5700 ceph_assert(!db);
5701 ceph_assert(!(create && read_only));
5702 string fn = path + "/db";
5703 string options;
5704 stringstream err;
5705 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
5706
5707 string kv_backend;
5708 std::vector<KeyValueDB::ColumnFamily> cfs;
5709
5710 if (create) {
5711 kv_backend = cct->_conf->bluestore_kvbackend;
5712 } else {
5713 r = read_meta("kv_backend", &kv_backend);
7c673cae 5714 if (r < 0) {
5715 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
5716 return -EIO;
5717 }
5718 }
5719 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
5720
5721 bool do_bluefs;
5722 r = _is_bluefs(create, &do_bluefs);
5723 if (r < 0) {
5724 return r;
5725 }
5726 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
5727
5728 map<string,string> kv_options;
5729 // force separate wal dir for all new deployments.
5730 kv_options["separate_wal_dir"] = 1;
5731 rocksdb::Env *env = NULL;
5732 if (do_bluefs) {
5733 dout(10) << __func__ << " initializing bluefs" << dendl;
5734 if (kv_backend != "rocksdb") {
5735 derr << " backend must be rocksdb to use bluefs" << dendl;
5736 return -EINVAL;
7c673cae 5737 }
5738
5739 r = _open_bluefs(create);
5740 if (r < 0) {
5741 return r;
5742 }
11fdf7f2 5743
7c673cae 5744 if (cct->_conf->bluestore_bluefs_env_mirror) {
5745 rocksdb::Env* a = new BlueRocksEnv(bluefs);
5746 rocksdb::Env* b = rocksdb::Env::Default();
7c673cae 5747 if (create) {
5748 string cmd = "rm -rf " + path + "/db " +
5749 path + "/db.slow " +
5750 path + "/db.wal";
5751 int r = system(cmd.c_str());
5752 (void)r;
5753 }
5754 env = new rocksdb::EnvMirror(b, a, false, true);
1911f103 5755 } else {
5756 env = new BlueRocksEnv(bluefs);
5757
5758 // simplify the dir names, too, as "seen" by rocksdb
5759 fn = "db";
5760 }
5761 bluefs->set_slow_device_expander(this);
5762 BlueFSVolumeSelector::paths paths;
5763 bluefs->get_vselector_paths(fn, paths);
7c673cae 5764
9f95a23c 5765 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
5766 // we have both block.db and block; tell rocksdb!
5767 // note: the second (last) size value doesn't really matter
5768 ostringstream db_paths;
5769 bool first = true;
5770 for (auto& p : paths) {
5771 if (!first) {
5772 db_paths << " ";
5773 }
5774 first = false;
5775 db_paths << p.first << "," << p.second;
5776
5777 }
11fdf7f2 5778 kv_options["db_paths"] = db_paths.str();
9f95a23c 5779 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
5780 }
5781
5782 if (create) {
5783 for (auto& p : paths) {
5784 env->CreateDir(p.first);
5785 }
5786 // selectors don't provide a wal path so far, hence create it explicitly
11fdf7f2 5787 env->CreateDir(fn + ".wal");
5788 } else {
5789 std::vector<std::string> res;
5790 // check for dir presence
5791 auto r = env->GetChildren(fn+".wal", &res);
5792 if (r.IsNotFound()) {
5793 kv_options.erase("separate_wal_dir");
5794 }
7c673cae 5795 }
5796 } else {
5797 string walfn = path + "/db.wal";
7c673cae 5798
5799 if (create) {
5800 int r = ::mkdir(fn.c_str(), 0755);
5801 if (r < 0)
5802 r = -errno;
5803 if (r < 0 && r != -EEXIST) {
5804 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
5805 << dendl;
5806 return r;
5807 }
5808
5809 // wal_dir, too!
5810 r = ::mkdir(walfn.c_str(), 0755);
5811 if (r < 0)
5812 r = -errno;
5813 if (r < 0 && r != -EEXIST) {
5814 derr << __func__ << " failed to create " << walfn
5815 << ": " << cpp_strerror(r)
5816 << dendl;
5817 return r;
5818 }
5819 } else {
5820 struct stat st;
5821 r = ::stat(walfn.c_str(), &st);
5822 if (r < 0 && errno == ENOENT) {
5823 kv_options.erase("separate_wal_dir");
5824 }
5825 }
5826 }
5827
91327a77 5828
5829 db = KeyValueDB::create(cct,
5830 kv_backend,
5831 fn,
11fdf7f2 5832 kv_options,
5833 static_cast<void*>(env));
5834 if (!db) {
5835 derr << __func__ << " error creating db" << dendl;
5836 if (bluefs) {
1911f103 5837 _close_bluefs(read_only);
5838 }
5839 // delete env manually here since we can't depend on db to do this
5840 // under this case
5841 delete env;
5842 env = NULL;
5843 return -EIO;
5844 }
5845
5846 FreelistManager::setup_merge_operators(db);
5847 db->set_merge_operator(PREFIX_STAT, merge_op);
91327a77 5848 db->set_cache_size(cache_kv_ratio * cache_size);
31f18b77 5849
11fdf7f2 5850 if (kv_backend == "rocksdb") {
7c673cae 5851 options = cct->_conf->bluestore_rocksdb_options;
5852
5853 map<string,string> cf_map;
5854 cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
5855 get_str_map,
5856 &cf_map,
5857 " \t");
5858 for (auto& i : cf_map) {
5859 dout(10) << "column family " << i.first << ": " << i.second << dendl;
5860 cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
5861 }
5862 }
5863
7c673cae 5864 db->init(options);
5865 if (to_repair_db)
5866 return 0;
5867 if (create) {
5868 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
5869 r = db->create_and_open(err, cfs);
5870 } else {
5871 r = db->create_and_open(err);
5872 }
5873 } else {
5874 // we pass in cf list here, but it is only used if the db already has
5875 // column families created.
5876 r = read_only ?
5877 db->open_read_only(err, cfs) :
5878 db->open(err, cfs);
5879 }
5880 if (r) {
5881 derr << __func__ << " error opening db: " << err.str() << dendl;
1911f103 5882 _close_db(read_only);
5883 return -EIO;
5884 }
5885 dout(1) << __func__ << " opened " << kv_backend
5886 << " path " << fn << " options " << options << dendl;
5887 return 0;
5888}
5889
1911f103 5890void BlueStore::_close_db(bool cold_close)
7c673cae 5891{
11fdf7f2 5892 ceph_assert(db);
5893 delete db;
5894 db = NULL;
5895 if (bluefs) {
1911f103 5896 _close_bluefs(cold_close);
5897 }
5898}
5899
11fdf7f2 5900void BlueStore::_dump_alloc_on_failure()
7c673cae 5901{
5902 auto dump_interval =
5903 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
5904 if (dump_interval > 0 &&
5905 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
5906 alloc->dump();
5907 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
5908 next_dump_on_bluefs_alloc_failure += dump_interval;
7c673cae 5909 }
11fdf7f2 5910}
7c673cae 5911
7c673cae 5912
5913int BlueStore::allocate_bluefs_freespace(
5914 uint64_t min_size,
5915 uint64_t size,
5916 PExtentVector* extents_out)
5917{
5918 ceph_assert(min_size <= size);
5919 if (size) {
5920 // round up to alloc size
9f95a23c 5921 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
5922 min_size = p2roundup(min_size, alloc_size);
5923 size = p2roundup(size, alloc_size);
5924
5925 PExtentVector extents_local;
5926 PExtentVector* extents = extents_out ? extents_out : &extents_local;
5927
5928
5929 uint64_t gift;
5930 uint64_t allocated = 0;
5931 int64_t alloc_len;
5932 auto need = size;
5933 auto extent_count0 = extents->size();
5934 do {
5935 // hard cap to fit into 32 bits
9f95a23c 5936 gift = std::min<uint64_t>(size, 1ull << 30);
5937 dout(10) << __func__ << " gifting " << gift
5938 << " (" << byte_u_t(gift) << ")" << dendl;
5939
5940 alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
5941 if (alloc_len > 0) {
5942 allocated += alloc_len;
5943 size -= alloc_len;
5944 }
5945
5946 if (alloc_len < 0 ||
5947 (alloc_len < (int64_t)gift && (min_size > allocated))) {
5948 derr << __func__
5949 << " failed to allocate on 0x" << std::hex << gift
5950 << " min_size 0x" << min_size
5951 << " > allocated total 0x" << allocated
5952 << " bluefs_shared_alloc_size 0x" << alloc_size
5953 << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
5954 << " available 0x " << alloc->get_free()
5955 << std::dec << dendl;
7c673cae 5956
494da23a 5957 _dump_alloc_on_failure();
5958 alloc->release(*extents);
5959 extents->clear();
5960 return -ENOSPC;
5961 }
5962 } while (size && alloc_len > 0);
5963 _collect_allocation_stats(need, alloc_size, extents->size() - extent_count0);
5964
5965 for (auto& e : *extents) {
5966 dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
5967 bluefs_extents.insert(e.offset, e.length);
5968 ++out_of_sync_fm;
5969 // apply to bluefs if not requested from outside
5970 if (!extents_out) {
9f95a23c 5971 bluefs->add_block_extent(bluefs_layout.shared_bdev, e.offset, e.length);
11fdf7f2 5972 }
5973 }
5974 }
5975 return 0;
5976}
5977
5978uint64_t BlueStore::available_freespace(uint64_t alloc_size) {
5979 uint64_t total = 0;
5980 auto iterated_allocation = [&](uint64_t off, uint64_t len) {
eafe8130 5981 // only count the portion that is alloc_size-aligned
5982 uint64_t dist_to_alignment;
5983 uint64_t offset_in_block = off & (alloc_size - 1);
5984 if (offset_in_block == 0)
5985 dist_to_alignment = 0;
5986 else
5987 dist_to_alignment = alloc_size - offset_in_block;
5988 if (dist_to_alignment >= len)
5989 return;
5990 len -= dist_to_alignment;
5991 total += p2align(len, alloc_size);
5992 };
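// Annotation (worked example): for off = 0x1800, len = 0x2800 and
// alloc_size = 0x1000: offset_in_block = 0x800, dist_to_alignment = 0x800,
// len shrinks to 0x2000 and total grows by p2align(0x2000, 0x1000) = 0x2000;
// the misaligned 0x800 head is not counted.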
5993 alloc->dump(iterated_allocation);
5994 return total;
5995}
5996
11fdf7f2 5997int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
f64942e4 5998{
5999 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
6000
6001 uint64_t my_free = alloc->get_free();
6002 uint64_t total = bdev->get_size();
6003 float my_free_ratio = (float)my_free / (float)total;
6004
6005 uint64_t total_free = bluefs_free + my_free;
6006
6007 float bluefs_ratio = (float)bluefs_free / (float)total_free;
6008
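// Annotation (illustrative numbers): bluefs_free = 1 GiB and my_free = 9 GiB
// give total_free = 10 GiB and bluefs_ratio = 0.1; if that falls below
// bluestore_bluefs_min_ratio a gift of gift_ratio * total_free is proposed
// below, and if it exceeds bluestore_bluefs_max_ratio a reclaim is proposed
// instead.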
6009 dout(10) << __func__
1adf2230 6010 << " bluefs " << byte_u_t(bluefs_free)
7c673cae 6011 << " free (" << bluefs_free_ratio
1adf2230 6012 << ") bluestore " << byte_u_t(my_free)
6013 << " free (" << my_free_ratio
6014 << "), bluefs_ratio " << bluefs_ratio
6015 << dendl;
6016
6017 uint64_t gift = 0;
6018 uint64_t reclaim = 0;
6019 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
6020 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
6021 if (gift >= my_free)
6022 gift = my_free / 2;
6023 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
6024 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
1adf2230 6025 << ", should gift " << byte_u_t(gift) << dendl;
6026 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
6027 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
6028 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
6029 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
6030 if (reclaim >= bluefs_free)
6031 reclaim = bluefs_free / 2;
6032 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
6033 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
1adf2230 6034 << ", should reclaim " << byte_u_t(reclaim) << dendl;
7c673cae 6035 }
6036
6037 // don't take over too much of the freespace
6038 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
7c673cae 6039 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
3efd9988 6040 cct->_conf->bluestore_bluefs_min < free_cap) {
6041 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
6042 dout(10) << __func__ << " bluefs_total " << bluefs_total
6043 << " < min " << cct->_conf->bluestore_bluefs_min
1adf2230 6044 << ", should gift " << byte_u_t(g) << dendl;
6045 if (g > gift)
6046 gift = g;
6047 reclaim = 0;
6048 }
6049 uint64_t min_free =
6050 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6051 if (bluefs_free < min_free &&
6052 min_free < free_cap) {
6053 uint64_t g = min_free - bluefs_free;
11fdf7f2 6054 dout(10) << __func__ << " bluefs_free " << bluefs_free
3efd9988 6055 << " < min " << min_free
1adf2230 6056 << ", should gift " << byte_u_t(g) << dendl;
6057 if (g > gift)
6058 gift = g;
6059 reclaim = 0;
6060 }
6061 uint64_t max_free =
6062 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_max_free");
6063 if (bluefs_free > max_free) {
6064 dout(10) << __func__ << " bluefs_free " << bluefs_free
6065 << " > max " << max_free
6066 << ", stop gifting for now" << dendl;
6067 gift = 0;
6068 }
6069 ceph_assert((int64_t)gift >= 0);
6070 ceph_assert((int64_t)reclaim >= 0);
6071 return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
6072}
7c673cae 6073
6074int BlueStore::_balance_bluefs_freespace()
6075{
6076 int ret = 0;
6077 ceph_assert(bluefs);
7c673cae 6078
6079 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
6080 bluefs->get_usage(&bluefs_usage);
9f95a23c 6081 ceph_assert(bluefs_usage.size() > bluefs_layout.shared_bdev);
7c673cae 6082
11fdf7f2 6083 bool clear_alert = true;
6084 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
6085 auto& p = bluefs_usage[bluefs_layout.shared_bdev];
6086 if (p.first != p.second) {
6087 auto& db = bluefs_usage[BlueFS::BDEV_DB];
6088 ostringstream ss;
6089 ss << "spilled over " << byte_u_t(p.second - p.first)
6090 << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
6091 << " used of " << byte_u_t(db.second) << ") to slow device";
6092 _set_spillover_alert(ss.str());
6093 clear_alert = false;
6094 }
6095 }
6096 if (clear_alert) {
6097 _clear_spillover_alert();
6098 }
6099
6100 // fixme: look at primary bdev only for now
6101 int64_t delta = _get_bluefs_size_delta(
6102 bluefs_usage[bluefs_layout.shared_bdev].first,
6103 bluefs_usage[bluefs_layout.shared_bdev].second);
11fdf7f2 6104
7c673cae 6105 // reclaim from bluefs?
11fdf7f2 6106 if (delta < 0) {
7c673cae 6107 // round up to alloc size
9f95a23c 6108 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
eafe8130 6109 auto reclaim = p2roundup(uint64_t(-delta), alloc_size);
6110
6111 // hard cap to fit into 32 bits
9f95a23c 6112 reclaim = std::min<uint64_t>(reclaim, 1ull << 30);
7c673cae 6113 dout(10) << __func__ << " reclaiming " << reclaim
1adf2230 6114 << " (" << byte_u_t(reclaim) << ")" << dendl;
6115
6116 while (reclaim > 0) {
6117 // NOTE: this will block and do IO.
a8e16298 6118 PExtentVector extents;
9f95a23c 6119 int r = bluefs->reclaim_blocks(bluefs_layout.shared_bdev, reclaim,
6120 &extents);
6121 if (r < 0) {
6122 derr << __func__ << " failed to reclaim space from bluefs"
6123 << dendl;
6124 break;
6125 }
6126 for (auto e : extents) {
11fdf7f2 6127 ++out_of_sync_fm;
6128 bluefs_extents.erase(e.offset, e.length);
6129 bluefs_extents_reclaiming.insert(e.offset, e.length);
6130 reclaim -= e.length;
6131 }
6132 }
6133
6134 ret = 1;
6135 }
6136
6137 return ret;
6138}
6139
eafe8130 6140int BlueStore::_open_collections()
7c673cae 6141{
28e407b8 6142 dout(10) << __func__ << dendl;
eafe8130 6143 collections_had_errors = false;
11fdf7f2 6144 ceph_assert(coll_map.empty());
6145 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6146 for (it->upper_bound(string());
6147 it->valid();
6148 it->next()) {
6149 coll_t cid;
6150 if (cid.parse(it->key())) {
9f95a23c 6151 auto c = ceph::make_ref<Collection>(
7c673cae 6152 this,
6153 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6154 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6155 cid);
7c673cae 6156 bufferlist bl = it->value();
11fdf7f2 6157 auto p = bl.cbegin();
7c673cae 6158 try {
11fdf7f2 6159 decode(c->cnode, p);
6160 } catch (buffer::error& e) {
6161 derr << __func__ << " failed to decode cnode, key:"
6162 << pretty_binary_string(it->key()) << dendl;
6163 return -EIO;
6164 }
6165 dout(20) << __func__ << " opened " << cid << " " << c
6166 << " " << c->cnode << dendl;
11fdf7f2 6167 _osr_attach(c.get());
7c673cae 6168 coll_map[cid] = c;
11fdf7f2 6169
6170 } else {
6171 derr << __func__ << " unrecognized collection " << it->key() << dendl;
eafe8130 6172 collections_had_errors = true;
6173 }
6174 }
6175 return 0;
6176}
6177
6178void BlueStore::_fsck_collections(int64_t* errors)
6179{
6180 if (collections_had_errors) {
6181 dout(10) << __func__ << dendl;
6182 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6183 for (it->upper_bound(string());
6184 it->valid();
6185 it->next()) {
6186 coll_t cid;
6187 if (!cid.parse(it->key())) {
6188 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6189 if (errors) {
6190 (*errors)++;
6191 }
6192 }
6193 }
6194 }
6195}
6196
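// Annotation: per-pool omap is detected purely by the presence of the
// PREFIX_SUPER "per_pool_omap" key (written at mkfs, or set by a repair);
// if it is absent, the legacy-omap alert is raised via
// _check_no_per_pool_omap_alert().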
6197void BlueStore::_set_per_pool_omap()
6198{
6199 per_pool_omap = false;
6200 bufferlist bl;
6201 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6202 if (bl.length()) {
6203 per_pool_omap = true;
6204 dout(10) << __func__ << " per_pool_omap=1" << dendl;
6205 } else {
6206 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6207 }
6208 _check_no_per_pool_omap_alert();
6209}
6210
224ce89b 6211void BlueStore::_open_statfs()
31f18b77 6212{
6213 osd_pools.clear();
6214 vstatfs.reset();
6215
31f18b77 6216 bufferlist bl;
11fdf7f2 6217 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
31f18b77 6218 if (r >= 0) {
11fdf7f2 6219 per_pool_stat_collection = false;
31f18b77 6220 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
11fdf7f2 6221 auto it = bl.cbegin();
31f18b77 6222 vstatfs.decode(it);
11fdf7f2 6223 dout(10) << __func__ << " store_statfs found" << dendl;
224ce89b 6224 } else {
6225 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6226 }
81eedcae 6227 _check_legacy_statfs_alert();
6228 } else {
6229 per_pool_stat_collection = true;
6230 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
6231 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
6232 for (it->upper_bound(string());
6233 it->valid();
6234 it->next()) {
6235
6236 uint64_t pool_id;
6237 int r = get_key_pool_stat(it->key(), &pool_id);
6238 ceph_assert(r == 0);
6239
6240 bufferlist bl;
6241 bl = it->value();
6242 auto p = bl.cbegin();
6243 auto& st = osd_pools[pool_id];
6244 try {
6245 st.decode(p);
6246 vstatfs += st;
6247
6248 dout(30) << __func__ << " pool " << pool_id
6249 << " statfs " << st << dendl;
6250 } catch (buffer::error& e) {
6251 derr << __func__ << " failed to decode pool stats, key:"
6252 << pretty_binary_string(it->key()) << dendl;
6253 }
6254 }
31f18b77 6255 }
6256 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6257
6258}
6259
6260int BlueStore::_setup_block_symlink_or_file(
6261 string name,
6262 string epath,
6263 uint64_t size,
6264 bool create)
6265{
6266 dout(20) << __func__ << " name " << name << " path " << epath
6267 << " size " << size << " create=" << (int)create << dendl;
6268 int r = 0;
91327a77 6269 int flags = O_RDWR|O_CLOEXEC;
6270 if (create)
6271 flags |= O_CREAT;
6272 if (epath.length()) {
6273 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6274 if (r < 0) {
6275 r = -errno;
6276 derr << __func__ << " failed to create " << name << " symlink to "
6277 << epath << ": " << cpp_strerror(r) << dendl;
6278 return r;
6279 }
6280
6281 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6282 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6283 if (fd < 0) {
6284 r = -errno;
6285 derr << __func__ << " failed to open " << epath << " file: "
6286 << cpp_strerror(r) << dendl;
6287 return r;
6288 }
6289 // write the Transport ID of the NVMe device
6290 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6291 // where "0000:02:00.0" is the selector of a PCI device, see
6292 // the first column of "lspci -mm -n -D"
6293 string trid{"trtype:PCIe "};
6294 trid += "traddr:";
6295 trid += epath.substr(strlen(SPDK_PREFIX));
6296 r = ::write(fd, trid.c_str(), trid.size());
6297 ceph_assert(r == static_cast<int>(trid.size()));
6298 dout(1) << __func__ << " created " << name << " symlink to "
6299 << epath << dendl;
6300 VOID_TEMP_FAILURE_RETRY(::close(fd));
6301 }
6302 }
6303 if (size) {
6304 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6305 if (fd >= 0) {
6306 // block file is present
6307 struct stat st;
6308 int r = ::fstat(fd, &st);
6309 if (r == 0 &&
6310 S_ISREG(st.st_mode) && // if it is a regular file
6311 st.st_size == 0) { // and is 0 bytes
6312 r = ::ftruncate(fd, size);
6313 if (r < 0) {
6314 r = -errno;
6315 derr << __func__ << " failed to resize " << name << " file to "
6316 << size << ": " << cpp_strerror(r) << dendl;
6317 VOID_TEMP_FAILURE_RETRY(::close(fd));
6318 return r;
6319 }
6320
6321 if (cct->_conf->bluestore_block_preallocate_file) {
6322 r = ::ceph_posix_fallocate(fd, 0, size);
6323 if (r > 0) {
6324 derr << __func__ << " failed to preallocate " << name << " file to "
6325 << size << ": " << cpp_strerror(r) << dendl;
6326 VOID_TEMP_FAILURE_RETRY(::close(fd));
6327 return -r;
6328 }
6329 }
6330 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 6331 << byte_u_t(size) << dendl;
6332 }
6333 VOID_TEMP_FAILURE_RETRY(::close(fd));
6334 } else {
6335 int r = -errno;
6336 if (r != -ENOENT) {
6337 derr << __func__ << " failed to open " << name << " file: "
6338 << cpp_strerror(r) << dendl;
6339 return r;
6340 }
6341 }
6342 }
6343 return 0;
6344}
6345
6346int BlueStore::mkfs()
6347{
6348 dout(1) << __func__ << " path " << path << dendl;
6349 int r;
6350 uuid_d old_fsid;
6351
6352 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6353 derr << __func__ << " osd_max_object_size "
6354 << cct->_conf->osd_max_object_size << " > bluestore max "
6355 << OBJECT_MAX_SIZE << dendl;
6356 return -EINVAL;
6357 }
6358
6359 {
6360 string done;
6361 r = read_meta("mkfs_done", &done);
6362 if (r == 0) {
6363 dout(1) << __func__ << " already created" << dendl;
6364 if (cct->_conf->bluestore_fsck_on_mkfs) {
6365 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6366 if (r < 0) {
6367 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6368 << dendl;
6369 return r;
6370 }
6371 if (r > 0) {
6372 derr << __func__ << " fsck found " << r << " errors" << dendl;
6373 r = -EIO;
6374 }
6375 }
6376 return r; // idempotent
6377 }
6378 }
6379
6380 {
6381 string type;
6382 r = read_meta("type", &type);
6383 if (r == 0) {
6384 if (type != "bluestore") {
6385 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6386 return -EIO;
6387 }
6388 } else {
6389 r = write_meta("type", "bluestore");
6390 if (r < 0)
6391 return r;
6392 }
6393 }
6394
6395 freelist_type = "bitmap";
6396
6397 r = _open_path();
6398 if (r < 0)
6399 return r;
6400
6401 r = _open_fsid(true);
6402 if (r < 0)
6403 goto out_path_fd;
6404
6405 r = _lock_fsid();
6406 if (r < 0)
6407 goto out_close_fsid;
6408
6409 r = _read_fsid(&old_fsid);
6410 if (r < 0 || old_fsid.is_zero()) {
6411 if (fsid.is_zero()) {
6412 fsid.generate_random();
6413 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6414 } else {
6415 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6416 }
6417 // we'll write it later.
6418 } else {
6419 if (!fsid.is_zero() && fsid != old_fsid) {
6420 derr << __func__ << " on-disk fsid " << old_fsid
6421 << " != provided " << fsid << dendl;
6422 r = -EINVAL;
6423 goto out_close_fsid;
6424 }
6425 fsid = old_fsid;
6426 }
6427
6428 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6429 cct->_conf->bluestore_block_size,
6430 cct->_conf->bluestore_block_create);
6431 if (r < 0)
6432 goto out_close_fsid;
6433 if (cct->_conf->bluestore_bluefs) {
6434 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6435 cct->_conf->bluestore_block_wal_size,
6436 cct->_conf->bluestore_block_wal_create);
6437 if (r < 0)
6438 goto out_close_fsid;
6439 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6440 cct->_conf->bluestore_block_db_size,
6441 cct->_conf->bluestore_block_db_create);
6442 if (r < 0)
6443 goto out_close_fsid;
6444 }
6445
6446 r = _open_bdev(true);
6447 if (r < 0)
6448 goto out_close_fsid;
6449
6450 // choose min_alloc_size
6451 if (cct->_conf->bluestore_min_alloc_size) {
6452 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6453 } else {
11fdf7f2 6454 ceph_assert(bdev);
6455 if (bdev->is_rotational()) {
6456 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6457 } else {
6458 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6459 }
6460 }
11fdf7f2 6461 _validate_bdev();
6462
6463 // make sure min_alloc_size is power of 2 aligned.
11fdf7f2 6464 if (!isp2(min_alloc_size)) {
6465 derr << __func__ << " min_alloc_size 0x"
6466 << std::hex << min_alloc_size << std::dec
6467 << " is not power of 2 aligned!"
6468 << dendl;
6469 r = -EINVAL;
6470 goto out_close_bdev;
6471 }
6472
6473 r = _open_db(true);
6474 if (r < 0)
6475 goto out_close_bdev;
6476
6477 {
6478 KeyValueDB::Transaction t = db->get_transaction();
1911f103 6479 r = _open_fm(t, true);
6480 if (r < 0)
6481 goto out_close_db;
6482 {
6483 bufferlist bl;
11fdf7f2 6484 encode((uint64_t)0, bl);
6485 t->set(PREFIX_SUPER, "nid_max", bl);
6486 t->set(PREFIX_SUPER, "blobid_max", bl);
6487 }
6488
6489 {
6490 bufferlist bl;
11fdf7f2 6491 encode((uint64_t)min_alloc_size, bl);
6492 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6493 }
6494 {
6495 bufferlist bl;
6496 bl.append("1");
6497 t->set(PREFIX_SUPER, "per_pool_omap", bl);
6498 }
6499 ondisk_format = latest_ondisk_format;
6500 _prepare_ondisk_format_super(t);
6501 db->submit_transaction_sync(t);
6502 }
6503
6504 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6505 if (r < 0)
6506 goto out_close_fm;
6507
3efd9988 6508 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 6509 if (r < 0)
224ce89b 6510 goto out_close_fm;
6511
6512 if (fsid != old_fsid) {
6513 r = _write_fsid();
6514 if (r < 0) {
6515 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 6516 goto out_close_fm;
6517 }
6518 }
6519
6520 if (out_of_sync_fm.fetch_and(0)) {
6521 _sync_bluefs_and_fm();
6522 }
6523
6524 out_close_fm:
6525 _close_fm();
6526 out_close_db:
1911f103 6527 _close_db(false);
6528 out_close_bdev:
6529 _close_bdev();
6530 out_close_fsid:
6531 _close_fsid();
6532 out_path_fd:
6533 _close_path();
6534
6535 if (r == 0 &&
6536 cct->_conf->bluestore_fsck_on_mkfs) {
6537 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6538 if (rc < 0)
6539 return rc;
6540 if (rc > 0) {
6541 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6542 r = -EIO;
6543 }
6544 }
6545
6546 if (r == 0) {
6547 // indicate success by writing the 'mkfs_done' file
6548 r = write_meta("mkfs_done", "yes");
6549 }
6550
6551 if (r < 0) {
6552 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6553 } else {
6554 dout(0) << __func__ << " success" << dendl;
6555 }
6556 return r;
6557}
6558
6559int BlueStore::_mount_for_bluefs()
6560{
6561 int r = _open_path();
6562 ceph_assert(r == 0);
6563 r = _open_fsid(false);
6564 ceph_assert(r == 0);
6565 r = _read_fsid(&fsid);
6566 ceph_assert(r == 0);
6567 r = _lock_fsid();
6568 ceph_assert(r == 0);
6569 r = _open_bluefs(false);
6570 ceph_assert(r == 0);
6571 return r;
6572}
6573
6574void BlueStore::_umount_for_bluefs()
6575{
1911f103 6576 _close_bluefs(false);
6577 _close_fsid();
6578 _close_path();
6579}
6580
6581int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6582{
6583 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6584 int r;
6585 ceph_assert(path_fd < 0);
6586
6587 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6588
6589 if (!cct->_conf->bluestore_bluefs) {
6590 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6591 return -EIO;
6592 }
6593
6594 r = _mount_for_bluefs();
6595
6596 int reserved = 0;
6597 if (id == BlueFS::BDEV_NEWWAL) {
6598 string p = path + "/block.wal";
6599 r = _setup_block_symlink_or_file("block.wal", dev_path,
6600 cct->_conf->bluestore_block_wal_size,
6601 true);
6602 ceph_assert(r == 0);
6603
6604 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
eafe8130 6605 cct->_conf->bdev_enable_discard);
6606 ceph_assert(r == 0);
6607
6608 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6609 r = _check_or_set_bdev_label(
6610 p,
6611 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6612 "bluefs wal",
6613 true);
6614 ceph_assert(r == 0);
6615 }
6616
6617 reserved = BDEV_LABEL_BLOCK_SIZE;
9f95a23c 6618 bluefs_layout.dedicated_wal = true;
6619 } else if (id == BlueFS::BDEV_NEWDB) {
6620 string p = path + "/block.db";
6621 r = _setup_block_symlink_or_file("block.db", dev_path,
6622 cct->_conf->bluestore_block_db_size,
6623 true);
6624 ceph_assert(r == 0);
6625
6626 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
eafe8130 6627 cct->_conf->bdev_enable_discard);
6628 ceph_assert(r == 0);
6629
6630 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6631 r = _check_or_set_bdev_label(
6632 p,
6633 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6634 "bluefs db",
6635 true);
6636 ceph_assert(r == 0);
6637 }
6638 reserved = SUPER_RESERVED;
6639 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6640 bluefs_layout.dedicated_db = true;
6641 }
6642
6643 bluefs->umount();
6644 bluefs->mount();
6645
6646 bluefs->add_block_extent(
6647 id,
6648 reserved,
1911f103 6649 bluefs->get_block_device_size(id) - reserved, true);
11fdf7f2 6650
9f95a23c 6651 r = bluefs->prepare_new_device(id, bluefs_layout);
6652 ceph_assert(r == 0);
6653
6654 if (r < 0) {
6655 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6656 } else {
6657 dout(0) << __func__ << " success" << dendl;
6658 }
6659
6660 _umount_for_bluefs();
6661 return r;
6662}
6663
6664int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6665 int id)
6666{
6667 dout(10) << __func__ << " id:" << id << dendl;
6668 ceph_assert(path_fd < 0);
6669
6670 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6671
6672 if (!cct->_conf->bluestore_bluefs) {
6673 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6674 return -EIO;
6675 }
6676
6677 int r = _mount_for_bluefs();
6678
6679 // require bluestore_bluefs_min_free to be free at target device!
6680 uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6681 for(auto src_id : devs_source) {
6682 used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
6683 }
6684 uint64_t target_free = bluefs->get_free(id);
6685 if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
6686 // will need to remount full BlueStore instance to allocate more space
6687 _umount_for_bluefs();
6688
6689 r = mount();
6690 ceph_assert(r == 0);
6691 dout(1) << __func__
6692 << " Allocating more space at slow device for BlueFS: +"
6693 << used_space - target_free << " bytes" << dendl;
6694 r = allocate_bluefs_freespace(
6695 used_space - target_free,
6696 used_space - target_free,
6697 nullptr);
6698
6699 umount();
6700 if (r != 0) {
6701 derr << __func__
6702 << " can't migrate, unable to allocate extra space: "
6703 << used_space - target_free << " at target:" << id
6704 << dendl;
6705 return -ENOSPC;
6706 }
6707
6708 r = _mount_for_bluefs();
6709 ceph_assert(r == 0);
6710 } else if (target_free < used_space) {
6711 derr << __func__
6712 << " can't migrate, free space at target: " << target_free
6713 << " is less than required space: " << used_space
6714 << dendl;
6715 return -ENOSPC;
6716 }
6717 if (devs_source.count(BlueFS::BDEV_DB)) {
6718 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6719 bluefs_layout.dedicated_db = false;
6720 }
6721 if (devs_source.count(BlueFS::BDEV_WAL)) {
6722 bluefs_layout.dedicated_wal = false;
6723 }
6724 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
6725 if (r < 0) {
6726 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6727 goto shutdown;
6728 }
6729
6730 if (devs_source.count(BlueFS::BDEV_DB)) {
6731 r = unlink(string(path + "/block.db").c_str());
6732 ceph_assert(r == 0);
6733 }
6734 if (devs_source.count(BlueFS::BDEV_WAL)) {
6735 r = unlink(string(path + "/block.wal").c_str());
6736 ceph_assert(r == 0);
6737 }
6738
6739shutdown:
6740 _umount_for_bluefs();
6741 return r;
6742}
6743
6744int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
6745 int id,
6746 const string& dev_path)
6747{
6748 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6749 int r;
6750 ceph_assert(path_fd < 0);
6751
6752 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6753
6754 if (!cct->_conf->bluestore_bluefs) {
6755 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6756 return -EIO;
6757 }
6758
6759 r = _mount_for_bluefs();
6760
6761 int reserved = 0;
6762 string link_db;
6763 string link_wal;
6764 if (devs_source.count(BlueFS::BDEV_DB) &&
9f95a23c 6765 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 6766 link_db = path + "/block.db";
6767 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6768 bluefs_layout.dedicated_db = false;
6769 }
6770 if (devs_source.count(BlueFS::BDEV_WAL)) {
6771 link_wal = path + "/block.wal";
9f95a23c 6772 bluefs_layout.dedicated_wal = false;
11fdf7f2
TL
6773 }
6774
6775 size_t target_size;
6776 string target_name;
6777 if (id == BlueFS::BDEV_NEWWAL) {
6778 target_name = "block.wal";
6779 target_size = cct->_conf->bluestore_block_wal_size;
9f95a23c 6780 bluefs_layout.dedicated_wal = true;
6781
6782 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
eafe8130 6783 cct->_conf->bdev_enable_discard);
6784 ceph_assert(r == 0);
6785
6786 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6787 r = _check_or_set_bdev_label(
6788 dev_path,
6789 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6790 "bluefs wal",
6791 true);
6792 ceph_assert(r == 0);
6793 }
6794 reserved = BDEV_LABEL_BLOCK_SIZE;
6795 } else if (id == BlueFS::BDEV_NEWDB) {
6796 target_name = "block.db";
6797 target_size = cct->_conf->bluestore_block_db_size;
6798 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6799 bluefs_layout.dedicated_db = true;
31f18b77 6800
11fdf7f2 6801 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
eafe8130 6802 cct->_conf->bdev_enable_discard);
6803 ceph_assert(r == 0);
6804
6805 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6806 r = _check_or_set_bdev_label(
6807 dev_path,
6808 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6809 "bluefs db",
6810 true);
6811 ceph_assert(r == 0);
6812 }
6813 reserved = SUPER_RESERVED;
6814 }
6815
6816 bluefs->umount();
6817 bluefs->mount();
6818
6819 bluefs->add_block_extent(
6820 id, reserved, bluefs->get_block_device_size(id) - reserved);
6821
9f95a23c 6822 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
11fdf7f2 6823
7c673cae 6824 if (r < 0) {
6825 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6826 goto shutdown;
6827 }
6828
6829 if (!link_db.empty()) {
6830 r = unlink(link_db.c_str());
6831 ceph_assert(r == 0);
6832 }
6833 if (!link_wal.empty()) {
6834 r = unlink(link_wal.c_str());
6835 ceph_assert(r == 0);
6836 }
6837 r = _setup_block_symlink_or_file(
6838 target_name,
6839 dev_path,
6840 target_size,
6841 true);
6842 ceph_assert(r == 0);
6843 dout(0) << __func__ << " success" << dendl;
6844
6845shutdown:
6846 _umount_for_bluefs();
6847 return r;
6848}
6849
6850string BlueStore::get_device_path(unsigned id)
6851{
6852 string res;
6853 if (id < BlueFS::MAX_BDEV) {
6854 switch (id) {
6855 case BlueFS::BDEV_WAL:
6856 res = path + "/block.wal";
6857 break;
6858 case BlueFS::BDEV_DB:
9f95a23c 6859 if (id == bluefs_layout.shared_bdev) {
6860 res = path + "/block";
6861 } else {
6862 res = path + "/block.db";
6863 }
6864 break;
6865 case BlueFS::BDEV_SLOW:
6866 res = path + "/block";
6867 break;
6868 }
6869 }
6870 return res;
6871}
6872
6873int BlueStore::expand_devices(ostream& out)
6874{
1911f103 6875 int r = cold_open();
6876 ceph_assert(r == 0);
6877 bluefs->dump_block_extents(out);
1911f103 6878 out << "Expanding DB/WAL..." << std::endl;
11fdf7f2 6879 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
9f95a23c 6880 if (devid == bluefs_layout.shared_bdev ) {
6881 continue;
6882 }
6883 uint64_t size = bluefs->get_block_device_size(devid);
6884 if (size == 0) {
6885 // no bdev
6886 continue;
6887 }
6888
6889 interval_set<uint64_t> before;
6890 bluefs->get_block_extents(devid, &before);
6891 ceph_assert(!before.empty());
6892 uint64_t end = before.range_end();
6893 if (end < size) {
6894 out << devid
6895 << " : expanding from 0x" << std::hex
6896 << end << " to 0x" << size << std::dec << std::endl;
6897 bluefs->add_block_extent(devid, end, size-end);
6898 string p = get_device_path(devid);
6899 const char* path = p.c_str();
6900 if (path == nullptr) {
6901 derr << devid
6902 <<": can't find device path " << dendl;
6903 continue;
6904 }
6905 bluestore_bdev_label_t label;
6906 int r = _read_bdev_label(cct, path, &label);
6907 if (r < 0) {
6908 derr << "unable to read label for " << path << ": "
6909 << cpp_strerror(r) << dendl;
6910 continue;
6911 }
6912 label.size = size;
6913 r = _write_bdev_label(cct, path, label);
6914 if (r < 0) {
6915 derr << "unable to write label for " << path << ": "
6916 << cpp_strerror(r) << dendl;
6917 continue;
6918 }
6919 out << devid
6920 <<" : size label updated to " << size
6921 << std::endl;
6922 }
6923 }
6924 uint64_t size0 = fm->get_size();
6925 uint64_t size = bdev->get_size();
6926 if (size0 < size) {
9f95a23c 6927 out << bluefs_layout.shared_bdev
6928 << " : expanding from 0x" << std::hex
6929 << size0 << " to 0x" << size << std::dec << std::endl;
6930 _write_out_fm_meta(size, true);
6931 cold_close();
6932
6933 // mount in read/write to sync expansion changes
6934 r = _mount(false);
11fdf7f2 6935 ceph_assert(r == 0);
6936 umount();
6937 } else {
6938 cold_close();
7c673cae 6939 }
6940 return r;
6941}
6942
6943int BlueStore::dump_bluefs_sizes(ostream& out)
6944{
6945 int r = cold_open();
6946 ceph_assert(r == 0);
6947 bluefs->dump_block_extents(out);
6948 cold_close();
6949 return r;
6950}
6951
6952void BlueStore::set_cache_shards(unsigned num)
6953{
6954 dout(10) << __func__ << " " << num << dendl;
6955 size_t oold = onode_cache_shards.size();
6956 size_t bold = buffer_cache_shards.size();
6957 ceph_assert(num >= oold && num >= bold);
6958 onode_cache_shards.resize(num);
6959 buffer_cache_shards.resize(num);
6960 for (unsigned i = oold; i < num; ++i) {
6961 onode_cache_shards[i] =
6962 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
6963 logger);
6964 }
6965 for (unsigned i = bold; i < num; ++i) {
6966 buffer_cache_shards[i] =
6967 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
6968 logger);
6969 }
6970}
6971
int BlueStore::_mount(bool kv_only, bool open_db)
{
  dout(1) << __func__ << " path " << path << dendl;

  _kv_only = kv_only;

  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
           << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->bluestore_fsck_on_mount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
         << cct->_conf->osd_max_object_size << " > bluestore max "
         << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  if (open_db) {
    r = _open_db_and_around(false);
  } else {
    // bypassing the full db open is allowed only in kv_only mode
    ceph_assert(kv_only);
    r = _open_db(false, true);
  }
  if (r < 0) {
    goto out_bdev;
  }

  if (kv_only)
    return 0;

  r = _upgrade_super();
  if (r < 0) {
    goto out_db;
  }

  r = _open_collections();
  if (r < 0)
    goto out_db;

  r = _reload_logger();
  if (r < 0)
    goto out_coll;

  _kv_start();

  r = _deferred_replay();
  if (r < 0)
    goto out_stop;

  mempool_thread.init();

  if ((!per_pool_stat_collection || !per_pool_omap) &&
      cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {

    bool was_per_pool_omap = per_pool_omap;

    dout(1) << __func__ << " quick-fix on mount" << dendl;
    _fsck_on_open(FSCK_SHALLOW, true);

    // reread statfs
    // FIXME minor: replace with actual open/close?
    _open_statfs();
    _check_legacy_statfs_alert();

    // set again as hopefully it has been fixed
    if (!was_per_pool_omap) {
      _set_per_pool_omap();
    }
  }

  mounted = true;
  return 0;

 out_stop:
  _kv_stop();
 out_coll:
  _shutdown_cache();
 out_db:
  _close_db_and_around(false);
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}

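// Illustrative lifecycle sketch (the wrapper names are an assumption based
// on the usual ObjectStore interface, not confirmed by this file): callers
// normally reach _mount() through the public mount()/umount() pair:
//
//   BlueStore store(cct, path);
//   int r = store.mount();   // presumably forwards to _mount(false)
//   ceph_assert(r == 0);
//   // ... serve I/O ...
//   store.umount();
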
int BlueStore::umount()
{
  ceph_assert(_kv_only || mounted);
  dout(1) << __func__ << dendl;

  _osr_drain_all();

  mounted = false;
  if (!_kv_only) {
    mempool_thread.shutdown();
    dout(20) << __func__ << " stopping kv thread" << dendl;
    _kv_stop();
    _shutdown_cache();
    dout(20) << __func__ << " closing" << dendl;
  }
  _close_db_and_around(false);
  _close_bdev();
  _close_fsid();
  _close_path();

  if (cct->_conf->bluestore_fsck_on_umount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }
  return 0;
}

int BlueStore::cold_open()
{
  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  r = _open_db_and_around(true);
  if (r < 0) {
    goto out_bdev;
  }
  return 0;
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}

int BlueStore::cold_close()
{
  _close_db_and_around(true);
  _close_bdev();
  _close_fsid();
  _close_path();
  return 0;
}

// derr wrapper to limit enormous output and avoid log flooding.
// For now this is only of use where such voluminous output is expected.
#define fsck_derr(err_cnt, threshold) \
  if (err_cnt <= threshold) {         \
    bool need_skip_print = err_cnt == threshold; \
    derr

#define fsck_dendl \
  dendl;           \
  if (need_skip_print) \
    derr << "more error lines skipped..." << dendl; \
  }

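// Usage sketch (informational): the two macros must appear as a matched
// pair, since fsck_derr opens an if-scope that only fsck_dendl closes,
// e.g. as done by the stray-omap checks further below:
//
//   fsck_derr(errors, MAX_FSCK_ERROR_LINES)
//     << "fsck error: found stray omap data on omap_head "
//     << omap_head << fsck_dendl;
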
int _fsck_sum_extents(
  const PExtentVector& extents,
  bool compressed,
  store_statfs_t& expected_statfs)
{
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
  }
  return 0;
}

int BlueStore::_fsck_check_extents(
  const coll_t& cid,
  const ghobject_t& oid,
  const PExtentVector& extents,
  bool compressed,
  mempool_dynamic_bitset &used_blocks,
  uint64_t granularity,
  BlueStoreRepairer* repairer,
  store_statfs_t& expected_statfs,
  FSCKDepth depth)
{
  dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
  int errors = 0;
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
    if (depth != FSCK_SHALLOW) {
      bool already = false;
      apply_for_bitset_range(
        e.offset, e.length, granularity, used_blocks,
        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
          if (bs.test(pos)) {
            if (repairer) {
              repairer->note_misreference(
                pos * min_alloc_size, min_alloc_size, !already);
            }
            if (!already) {
              derr << "fsck error: " << oid << " extent " << e
                   << " or a subset is already allocated (misreferenced)" << dendl;
              ++errors;
              already = true;
            }
          } else {
            bs.set(pos);
          }
        });
      if (repairer) {
        repairer->get_space_usage_tracker().set_used(e.offset, e.length, cid, oid);
      }

      if (e.end() > bdev->get_size()) {
        derr << "fsck error: " << oid << " extent " << e
             << " past end of block device" << dendl;
        ++errors;
      }
    }
  }
  return errors;
}

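// Worked example for the bitset bookkeeping above (illustrative numbers):
// with granularity == min_alloc_size == 0x10000, an extent
// {offset=0x30000, length=0x20000} covers bit positions 3 and 4 of
// used_blocks. If a later object maps any of those positions again,
// bs.test(pos) fires and the extent is reported as misreferenced.
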
void BlueStore::_fsck_check_pool_statfs(
  BlueStore::per_pool_statfs& expected_pool_statfs,
  int64_t& errors,
  int64_t& warnings,
  BlueStoreRepairer* repairer)
{
  auto it = db->get_iterator(PREFIX_STAT);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
        if (repairer) {
          ++errors;
          repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
          derr << "fsck error: " << "legacy statfs record found, removing"
               << dendl;
        }
        continue;
      }
      uint64_t pool_id;
      if (get_key_pool_stat(key, &pool_id) < 0) {
        derr << "fsck error: bad key " << key
             << " in statfs namespace" << dendl;
        if (repairer) {
          repairer->remove_key(db, PREFIX_STAT, key);
        }
        ++errors;
        continue;
      }

      volatile_statfs vstatfs;
      bufferlist bl = it->value();
      auto blp = bl.cbegin();
      try {
        vstatfs.decode(blp);
      } catch (buffer::error& e) {
        derr << "fsck error: failed to decode Pool StatFS record "
             << pretty_binary_string(key) << dendl;
        if (repairer) {
          dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
                   << pretty_binary_string(key)
                   << "', removing" << dendl;
          repairer->remove_key(db, PREFIX_STAT, key);
        }
        ++errors;
        vstatfs.reset();
      }
      auto stat_it = expected_pool_statfs.find(pool_id);
      if (stat_it == expected_pool_statfs.end()) {
        if (vstatfs.is_empty()) {
          // we don't consider that an error since empty pool statfs
          // records are left in the DB for now
          dout(20) << "fsck info: found empty stray Pool StatFS record for pool id 0x"
                   << std::hex << pool_id << std::dec << dendl;
          if (repairer) {
            // but we need to increment the error count in case of repair
            // to have proper counters at the end
            // (as the repairer increments the recovery counter anyway).
            ++errors;
          }
        } else {
          derr << "fsck error: found stray Pool StatFS record for pool id 0x"
               << std::hex << pool_id << std::dec << dendl;
          ++errors;
        }
        if (repairer) {
          repairer->remove_key(db, PREFIX_STAT, key);
        }
        continue;
      }
      store_statfs_t statfs;
      vstatfs.publish(&statfs);
      if (!(stat_it->second == statfs)) {
        derr << "fsck error: actual " << statfs
             << " != expected " << stat_it->second
             << " for pool "
             << std::hex << pool_id << std::dec << dendl;
        if (repairer) {
          repairer->fix_statfs(db, key, stat_it->second);
        }
        ++errors;
      }
      expected_pool_statfs.erase(stat_it);
    }
  } // if (it)
  for (auto& s : expected_pool_statfs) {
    if (s.second.is_zero()) {
      // we might lack empty statfs recs in DB
      continue;
    }
    derr << "fsck error: missing Pool StatFS record for pool "
         << std::hex << s.first << std::dec << dendl;
    if (repairer) {
      string key;
      get_pool_stat_key(s.first, &key);
      repairer->fix_statfs(db, key, s.second);
    }
    ++errors;
  }
  if (!per_pool_stat_collection &&
      repairer) {
    // by virtue of running this method, we correct the top-level
    // error of having global stats
    repairer->inc_repaired();
  }
}

BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
  BlueStore::FSCKDepth depth,
  int64_t pool_id,
  BlueStore::CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& value,
  mempool::bluestore_fsck::list<string>* expecting_shards,
  map<BlobRef, bluestore_blob_t::unused_t>* referenced,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& num_objects = ctx.num_objects;
  auto& num_extents = ctx.num_extents;
  auto& num_blobs = ctx.num_blobs;
  auto& num_sharded_objects = ctx.num_sharded_objects;
  auto& num_spanning_blobs = ctx.num_spanning_blobs;
  auto used_blocks = ctx.used_blocks;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
    &ctx.expected_pool_statfs[pool_id] :
    &ctx.expected_store_statfs;

  dout(10) << __func__ << " " << oid << dendl;
  OnodeRef o;
  o.reset(Onode::decode(c, oid, key, value));
  ++num_objects;

  num_spanning_blobs += o->extent_map.spanning_blob_map.size();

  o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  _dump_onode<30>(cct, *o);
  // shards
  if (!o->extent_map.shards.empty()) {
    ++num_sharded_objects;
    if (depth != FSCK_SHALLOW) {
      ceph_assert(expecting_shards);
      for (auto& s : o->extent_map.shards) {
        dout(20) << __func__ << " shard " << *s.shard_info << dendl;
        expecting_shards->push_back(string());
        get_extent_shard_key(o->key, s.shard_info->offset,
                             &expecting_shards->back());
        if (s.shard_info->offset >= o->onode.size) {
          derr << "fsck error: " << oid << " shard 0x" << std::hex
               << s.shard_info->offset << " past EOF at 0x" << o->onode.size
               << std::dec << dendl;
          ++errors;
        }
      }
    }
  }

  // lextents
  uint64_t pos = 0;
  mempool::bluestore_fsck::map<BlobRef,
                               bluestore_blob_use_tracker_t> ref_map;
  for (auto& l : o->extent_map.extent_map) {
    dout(20) << __func__ << " " << l << dendl;
    if (l.logical_offset < pos) {
      derr << "fsck error: " << oid << " lextent at 0x"
           << std::hex << l.logical_offset
           << " overlaps with the previous, which ends at 0x" << pos
           << std::dec << dendl;
      ++errors;
    }
    if (depth != FSCK_SHALLOW &&
        o->extent_map.spans_shard(l.logical_offset, l.length)) {
      derr << "fsck error: " << oid << " lextent at 0x"
           << std::hex << l.logical_offset << "~" << l.length
           << " spans a shard boundary"
           << std::dec << dendl;
      ++errors;
    }
    pos = l.logical_offset + l.length;
    res_statfs->data_stored += l.length;
    ceph_assert(l.blob);
    const bluestore_blob_t& blob = l.blob->get_blob();

    auto& ref = ref_map[l.blob];
    if (ref.is_empty()) {
      uint32_t min_release_size = blob.get_release_size(min_alloc_size);
      uint32_t l = blob.get_logical_length();
      ref.init(l, min_release_size);
    }
    ref.get(
      l.blob_offset,
      l.length);
    ++num_extents;
    if (depth != FSCK_SHALLOW &&
        blob.has_unused()) {
      ceph_assert(referenced);
      auto p = referenced->find(l.blob);
      bluestore_blob_t::unused_t* pu;
      if (p == referenced->end()) {
        pu = &(*referenced)[l.blob];
      } else {
        pu = &p->second;
      }
      uint64_t blob_len = blob.get_logical_length();
      ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
      ceph_assert(l.blob_offset + l.length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
      uint64_t start = l.blob_offset / chunk_size;
      uint64_t end =
        round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
        (*pu) |= (1u << i);
      }
    }
  } // for (auto& l : o->extent_map.extent_map)

  for (auto& i : ref_map) {
    ++num_blobs;
    const bluestore_blob_t& blob = i.first->get_blob();
    bool equal =
      depth == FSCK_SHALLOW ? true :
      i.first->get_blob_use_tracker().equal(i.second);
    if (!equal) {
      derr << "fsck error: " << oid << " blob " << *i.first
           << " doesn't match expected ref_map " << i.second << dendl;
      ++errors;
    }
    if (blob.is_compressed()) {
      res_statfs->data_compressed += blob.get_compressed_payload_length();
      res_statfs->data_compressed_original +=
        i.first->get_referenced_bytes();
    }
    if (blob.is_shared()) {
      if (i.first->shared_blob->get_sbid() > blobid_max) {
        derr << "fsck error: " << oid << " blob " << blob
             << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
             << blobid_max << dendl;
        ++errors;
      } else if (i.first->shared_blob->get_sbid() == 0) {
        derr << "fsck error: " << oid << " blob " << blob
             << " marked as shared but has uninitialized sbid"
             << dendl;
        ++errors;
      }
      // the below lock is optional and provided in multithreading mode only
      if (sb_info_lock) {
        sb_info_lock->lock();
      }
      sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
      ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
      ceph_assert(sbi.pool_id == INT64_MIN ||
                  sbi.pool_id == oid.hobj.get_logical_pool());
      sbi.cid = c->cid;
      sbi.pool_id = oid.hobj.get_logical_pool();
      sbi.sb = i.first->shared_blob;
      sbi.oids.push_back(oid);
      sbi.compressed = blob.is_compressed();
      for (auto e : blob.get_extents()) {
        if (e.is_valid()) {
          sbi.ref_map.get(e.offset, e.length);
        }
      }
      if (sb_info_lock) {
        sb_info_lock->unlock();
      }
    } else if (depth != FSCK_SHALLOW) {
      ceph_assert(used_blocks);
      errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
                                    blob.is_compressed(),
                                    *used_blocks,
                                    fm->get_alloc_size(),
                                    repairer,
                                    *res_statfs,
                                    depth);
    } else {
      errors += _fsck_sum_extents(
        blob.get_extents(),
        blob.is_compressed(),
        *res_statfs);
    }
  } // for (auto& i : ref_map)

  if (o->onode.has_omap()) {
    _fsck_check_object_omap(depth, o, ctx);
  }

  return o;
}

#include "common/WorkQueue.h"

class ShallowFSCKThreadPool : public ThreadPool
{
public:
  ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
    ThreadPool(cct_, nm, tn, n) {
  }
  void worker(ThreadPool::WorkThread* wt) override {
    int next_wq = 0;
    while (!_stop) {
      next_wq %= work_queues.size();
      WorkQueue_ *wq = work_queues[next_wq++];

      void* item = wq->_void_dequeue();
      if (item) {
        processing++;
        TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
        wq->_void_process(item, tp_handle);
        processing--;
      }
    }
  }
  template <size_t BatchLen>
  struct FSCKWorkQueue : public ThreadPool::WorkQueue_
  {
    struct Entry {
      int64_t pool_id;
      BlueStore::CollectionRef c;
      ghobject_t oid;
      string key;
      bufferlist value;
    };
    struct Batch {
      std::atomic<size_t> running = { 0 };
      size_t entry_count = 0;
      std::array<Entry, BatchLen> entries;

      int64_t errors = 0;
      int64_t warnings = 0;
      uint64_t num_objects = 0;
      uint64_t num_extents = 0;
      uint64_t num_blobs = 0;
      uint64_t num_sharded_objects = 0;
      uint64_t num_spanning_blobs = 0;
      store_statfs_t expected_store_statfs;
      BlueStore::per_pool_statfs expected_pool_statfs;
    };

    size_t batchCount;
    BlueStore* store = nullptr;

    ceph::mutex* sb_info_lock = nullptr;
    BlueStore::sb_info_map_t* sb_info = nullptr;
    BlueStoreRepairer* repairer = nullptr;

    Batch* batches = nullptr;
    size_t last_batch_pos = 0;
    bool batch_acquired = false;

    FSCKWorkQueue(std::string n,
                  size_t _batchCount,
                  BlueStore* _store,
                  ceph::mutex* _sb_info_lock,
                  BlueStore::sb_info_map_t& _sb_info,
                  BlueStoreRepairer* _repairer) :
      WorkQueue_(n, time_t(), time_t()),
      batchCount(_batchCount),
      store(_store),
      sb_info_lock(_sb_info_lock),
      sb_info(&_sb_info),
      repairer(_repairer)
    {
      batches = new Batch[batchCount];
    }
    ~FSCKWorkQueue() {
      delete[] batches;
    }

    /// Remove all work items from the queue.
    void _clear() override {
      // do nothing
    }
    /// Check whether there is anything to do.
    bool _empty() override {
      ceph_assert(false);
    }

    /// Get the next work item to process.
    void* _void_dequeue() override {
      size_t pos = rand() % batchCount;
      size_t pos0 = pos;
      do {
        auto& batch = batches[pos];
        if (batch.running.fetch_add(1) == 0) {
          if (batch.entry_count) {
            return &batch;
          }
        }
        batch.running--;
        pos++;
        pos %= batchCount;
      } while (pos != pos0);
      return nullptr;
    }
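    // Acquisition protocol note (informational): each Batch is guarded by
    // its atomic 'running' counter rather than a mutex. A thread claims a
    // batch with fetch_add(1); a prior value of 0 means exclusive ownership,
    // anything else means contention and the claim is undone:
    //
    //   if (batch.running.fetch_add(1) == 0) { /* owned */ }
    //   else { batch.running--; /* back off, try the next slot */ }
    //
    // Starting the probe at a random position spreads threads across slots.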
    /** @brief Process the work item.
     * This function will be called several times in parallel
     * and must therefore be thread-safe. */
    void _void_process(void* item, TPHandle& handle) override {
      Batch* batch = (Batch*)item;

      BlueStore::FSCK_ObjectCtx ctx(
        batch->errors,
        batch->warnings,
        batch->num_objects,
        batch->num_extents,
        batch->num_blobs,
        batch->num_sharded_objects,
        batch->num_spanning_blobs,
        nullptr, // used_blocks
        nullptr, // used_omap_head
        sb_info_lock,
        *sb_info,
        batch->expected_store_statfs,
        batch->expected_pool_statfs,
        repairer);

      for (size_t i = 0; i < batch->entry_count; i++) {
        auto& entry = batch->entries[i];

        store->fsck_check_objects_shallow(
          BlueStore::FSCK_SHALLOW,
          entry.pool_id,
          entry.c,
          entry.oid,
          entry.key,
          entry.value,
          nullptr, // expecting_shards - this would need protection if passed
          nullptr, // referenced
          ctx);
      }
      //std::cout << "processed " << batch << std::endl;
      batch->entry_count = 0;
      batch->running--;
    }
    /** @brief Synchronously finish processing a work item.
     * This function is called after _void_process with the global thread pool lock held,
     * so at most one copy will execute simultaneously for a given thread pool.
     * It can be used for non-thread-safe finalization. */
    void _void_process_finish(void*) override {
      ceph_assert(false);
    }

    bool queue(
      int64_t pool_id,
      BlueStore::CollectionRef c,
      const ghobject_t& oid,
      const string& key,
      const bufferlist& value) {
      bool res = false;
      size_t pos0 = last_batch_pos;
      if (!batch_acquired) {
        do {
          auto& batch = batches[last_batch_pos];
          if (batch.running.fetch_add(1) == 0) {
            if (batch.entry_count < BatchLen) {
              batch_acquired = true;
              break;
            }
          }
          batch.running.fetch_sub(1);
          last_batch_pos++;
          last_batch_pos %= batchCount;
        } while (last_batch_pos != pos0);
      }
      if (batch_acquired) {
        auto& batch = batches[last_batch_pos];
        ceph_assert(batch.running);
        ceph_assert(batch.entry_count < BatchLen);

        auto& entry = batch.entries[batch.entry_count];
        entry.pool_id = pool_id;
        entry.c = c;
        entry.oid = oid;
        entry.key = key;
        entry.value = value;

        ++batch.entry_count;
        if (batch.entry_count == BatchLen) {
          batch_acquired = false;
          batch.running.fetch_sub(1);
          last_batch_pos++;
          last_batch_pos %= batchCount;
        }
        res = true;
      }
      return res;
    }

    void finalize(ThreadPool& tp,
                  BlueStore::FSCK_ObjectCtx& ctx) {
      if (batch_acquired) {
        auto& batch = batches[last_batch_pos];
        ceph_assert(batch.running);
        batch.running.fetch_sub(1);
      }
      tp.stop();

      for (size_t i = 0; i < batchCount; i++) {
        auto& batch = batches[i];

        // process leftovers if any
        if (batch.entry_count) {
          TPHandle tp_handle(store->cct,
                             nullptr,
                             timeout_interval,
                             suicide_interval);
          ceph_assert(batch.running == 0);

          batch.running++; // just to be on par with the regular call
          _void_process(&batch, tp_handle);
        }
        ceph_assert(batch.entry_count == 0);

        ctx.errors += batch.errors;
        ctx.warnings += batch.warnings;
        ctx.num_objects += batch.num_objects;
        ctx.num_extents += batch.num_extents;
        ctx.num_blobs += batch.num_blobs;
        ctx.num_sharded_objects += batch.num_sharded_objects;
        ctx.num_spanning_blobs += batch.num_spanning_blobs;

        ctx.expected_store_statfs.add(batch.expected_store_statfs);

        for (auto it = batch.expected_pool_statfs.begin();
             it != batch.expected_pool_statfs.end();
             it++) {
          ctx.expected_pool_statfs[it->first].add(it->second);
        }
      }
    }
  };
};

void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
  OnodeRef& o,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& warnings = ctx.warnings;
  auto repairer = ctx.repairer;

  ceph_assert(o->onode.has_omap());
  if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
    if (per_pool_omap) {
      fsck_derr(errors, MAX_FSCK_ERROR_LINES)
        << "fsck error: " << o->oid
        << " has omap that is not per-pool or pgmeta"
        << fsck_dendl;
      ++errors;
    } else {
      const char* w;
      int64_t num;
      if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
        ++errors;
        num = errors;
        w = "error";
      } else {
        ++warnings;
        num = warnings;
        w = "warning";
      }
      fsck_derr(num, MAX_FSCK_ERROR_LINES)
        << "fsck " << w << ": " << o->oid
        << " has omap that is not per-pool or pgmeta"
        << fsck_dendl;
    }
  }
  if (repairer &&
      !o->onode.is_perpool_omap() &&
      !o->onode.is_pgmeta_omap()) {
    dout(10) << "fsck converting " << o->oid << " omap to per-pool" << dendl;
    bufferlist h;
    map<string, bufferlist> kv;
    int r = _onode_omap_get(o, &h, &kv);
    if (r < 0) {
      derr << " got " << r << " " << cpp_strerror(r) << dendl;
    } else {
      KeyValueDB::Transaction txn = db->get_transaction();
      // remove old keys
      const string& old_omap_prefix = o->get_omap_prefix();
      string old_head, old_tail;
      o->get_omap_header(&old_head);
      o->get_omap_tail(&old_tail);
      txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
      txn->rmkey(old_omap_prefix, old_tail);
      // set flag
      o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
      _record_onode(o, txn);
      const string& new_omap_prefix = o->get_omap_prefix();
      // head
      if (h.length()) {
        string new_head;
        o->get_omap_header(&new_head);
        txn->set(new_omap_prefix, new_head, h);
      }
      // tail
      string new_tail;
      o->get_omap_tail(&new_tail);
      bufferlist empty;
      txn->set(new_omap_prefix, new_tail, empty);
      // values
      string final_key;
      o->get_omap_key(string(), &final_key);
      size_t base_key_len = final_key.size();
      for (auto& i : kv) {
        final_key.resize(base_key_len);
        final_key += i.first;
        txn->set(new_omap_prefix, final_key, i.second);
      }
      db->submit_transaction_sync(txn);
      repairer->inc_repaired();
    }
  }
}

void BlueStore::_fsck_check_objects(FSCKDepth depth,
  BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  uint64_t_btree_t used_nids;

  size_t processed_myself = 0;

  auto it = db->get_iterator(PREFIX_OBJ);
  mempool::bluestore_fsck::list<string> expecting_shards;
  if (it) {
    const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
    typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
    std::unique_ptr<WQ> wq(
      new WQ(
        "FSCKWorkQueue",
        (thread_count ? : 1) * 32,
        this,
        sb_info_lock,
        sb_info,
        repairer));

    ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);

    thread_pool.add_work_queue(wq.get());
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      // not the best place, but let's check anyway
      ceph_assert(sb_info_lock);
      thread_pool.start();
    }

    // fill global if not overridden below
    CollectionRef c;
    int64_t pool_id = -1;
    spg_t pgid;
    for (it->lower_bound(string()); it->valid(); it->next()) {
      dout(30) << __func__ << " key "
               << pretty_binary_string(it->key()) << dendl;
      if (is_extent_shard_key(it->key())) {
        if (depth == FSCK_SHALLOW) {
          continue;
        }
        while (!expecting_shards.empty() &&
               expecting_shards.front() < it->key()) {
          derr << "fsck error: missing shard key "
               << pretty_binary_string(expecting_shards.front())
               << dendl;
          ++errors;
          expecting_shards.pop_front();
        }
        if (!expecting_shards.empty() &&
            expecting_shards.front() == it->key()) {
          // all good
          expecting_shards.pop_front();
          continue;
        }

        uint32_t offset;
        string okey;
        get_key_extent_shard(it->key(), &okey, &offset);
        derr << "fsck error: stray shard 0x" << std::hex << offset
             << std::dec << dendl;
        if (expecting_shards.empty()) {
          derr << "fsck error: " << pretty_binary_string(it->key())
               << " is unexpected" << dendl;
          ++errors;
          continue;
        }
        while (expecting_shards.front() > it->key()) {
          derr << "fsck error: saw " << pretty_binary_string(it->key())
               << dendl;
          derr << "fsck error: exp "
               << pretty_binary_string(expecting_shards.front()) << dendl;
          ++errors;
          expecting_shards.pop_front();
          if (expecting_shards.empty()) {
            break;
          }
        }
        continue;
      }

      ghobject_t oid;
      int r = get_key_object(it->key(), &oid);
      if (r < 0) {
        derr << "fsck error: bad object key "
             << pretty_binary_string(it->key()) << dendl;
        ++errors;
        continue;
      }
      if (!c ||
          oid.shard_id != pgid.shard ||
          oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
          !c->contains(oid)) {
        c = nullptr;
        for (auto& p : coll_map) {
          if (p.second->contains(oid)) {
            c = p.second;
            break;
          }
        }
        if (!c) {
          derr << "fsck error: stray object " << oid
               << " not owned by any collection" << dendl;
          ++errors;
          continue;
        }
        pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
        dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
                 << dendl;
      }

      if (depth != FSCK_SHALLOW &&
          !expecting_shards.empty()) {
        for (auto& k : expecting_shards) {
          derr << "fsck error: missing shard key "
               << pretty_binary_string(k) << dendl;
        }
        ++errors;
        expecting_shards.clear();
      }

      bool queued = false;
      if (depth == FSCK_SHALLOW && thread_count > 0) {
        queued = wq->queue(
          pool_id,
          c,
          oid,
          it->key(),
          it->value());
      }
      OnodeRef o;
      map<BlobRef, bluestore_blob_t::unused_t> referenced;

      if (!queued) {
        ++processed_myself;

        o = fsck_check_objects_shallow(
          depth,
          pool_id,
          c,
          oid,
          it->key(),
          it->value(),
          &expecting_shards,
          &referenced,
          ctx);
      }

      if (depth != FSCK_SHALLOW) {
        ceph_assert(o != nullptr);
        if (o->onode.nid) {
          if (o->onode.nid > nid_max) {
            derr << "fsck error: " << oid << " nid " << o->onode.nid
                 << " > nid_max " << nid_max << dendl;
            ++errors;
          }
          if (used_nids.count(o->onode.nid)) {
            derr << "fsck error: " << oid << " nid " << o->onode.nid
                 << " already in use" << dendl;
            ++errors;
            continue; // go for next object
          }
          used_nids.insert(o->onode.nid);
        }
        for (auto& i : referenced) {
          dout(20) << __func__ << " referenced 0x" << std::hex << i.second
                   << std::dec << " for " << *i.first << dendl;
          const bluestore_blob_t& blob = i.first->get_blob();
          if (i.second & blob.unused) {
            derr << "fsck error: " << oid << " blob claims unused 0x"
                 << std::hex << blob.unused
                 << " but extents reference 0x" << i.second << std::dec
                 << " on blob " << *i.first << dendl;
            ++errors;
          }
          if (blob.has_csum()) {
            uint64_t blob_len = blob.get_logical_length();
            uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
            unsigned csum_count = blob.get_csum_count();
            unsigned csum_chunk_size = blob.get_csum_chunk_size();
            for (unsigned p = 0; p < csum_count; ++p) {
              unsigned pos = p * csum_chunk_size;
              unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
              unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
              unsigned mask = 1u << firstbit;
              for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
                mask |= 1u << b;
              }
              if ((blob.unused & mask) == mask) {
                // this csum chunk region is marked unused
                if (blob.get_csum_item(p) != 0) {
                  derr << "fsck error: " << oid
                       << " blob claims csum chunk 0x" << std::hex << pos
                       << "~" << csum_chunk_size
                       << " is unused (mask 0x" << mask << " of unused 0x"
                       << blob.unused << ") but csum is non-zero 0x"
                       << blob.get_csum_item(p) << std::dec << " on blob "
                       << *i.first << dendl;
                  ++errors;
                }
              }
            }
          }
        }
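        // Worked example for the unused-bitmap checks above (illustrative
        // numbers): blob.unused has 64 bits, so a 0x40000-byte blob yields
        // unused_chunk_size = 0x1000. A csum chunk at pos 0x2000 of size
        // 0x1000 maps to bits [2,2]; if bit 2 is set in blob.unused yet the
        // stored checksum for that chunk is non-zero, the blob is flagged.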
        // omap
        if (o->onode.has_omap()) {
          ceph_assert(ctx.used_omap_head);
          if (ctx.used_omap_head->count(o->onode.nid)) {
            derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
                 << " already in use" << dendl;
            ++errors;
          } else {
            ctx.used_omap_head->insert(o->onode.nid);
          }
        } // if (o->onode.has_omap())
        if (depth == FSCK_DEEP) {
          bufferlist bl;
          uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
          uint64_t offset = 0;
          do {
            uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
            int r = _do_read(c.get(), o, offset, l, bl,
                             CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
            if (r < 0) {
              ++errors;
              derr << "fsck error: " << oid << std::hex
                   << " error during read: "
                   << " " << offset << "~" << l
                   << " " << cpp_strerror(r) << std::dec
                   << dendl;
              break;
            }
            offset += l;
          } while (offset < o->onode.size);
        } // deep
      } // if (depth != FSCK_SHALLOW)
    } // for (it->lower_bound(string()); it->valid(); it->next())
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      wq->finalize(thread_pool, ctx);
      if (processed_myself) {
        // maybe we need more threads?
        dout(0) << __func__ << " partial offload"
                << ", done myself " << processed_myself
                << " of " << ctx.num_objects
                << " objects, threads " << thread_count
                << dendl;
      }
    }
  } // if (it)
}
/**
An overview of the currently implemented repair logic,
performed by fsck in two stages: detection (+preparation) and commit.
Detection stage (in processing order):
  (Issue -> Repair action to schedule)
  - Detect undecodable keys for Shared Blobs -> Remove
  - Detect undecodable records for Shared Blobs -> Remove
    (might trigger missed Shared Blob detection below)
  - Detect stray records for Shared Blobs -> Remove
  - Detect misreferenced pextents -> Fix
    Prepare Bloom-like filter to track cid/oid -> pextent
    Prepare list of extents that are improperly referenced
    Enumerate Onode records that might use 'misreferenced' pextents
    (Bloom-like filter applied to reduce computation)
    For each questionable Onode enumerate all blobs and identify broken ones
    (i.e. blobs having 'misreferences')
    Rewrite each broken blob's data by allocating other extents and
    copying the data there
    If a blob is shared - unshare it and mark the corresponding Shared Blob
    for removal
    Release previously allocated space
    Update Extent Map
  - Detect missed Shared Blobs -> Recreate
  - Detect undecodable deferred transactions -> Remove
  - Detect Freelist Manager's 'false free' entries -> Mark as used
  - Detect Freelist Manager's leaked entries -> Mark as free
  - Detect statfs inconsistency -> Update
Commit stage (separate DB commit per each step):
  - Apply leaked FM entries fix
  - Apply 'false free' FM entries fix
  - Apply 'Remove' actions
  - Apply fix for misreferenced pextents
  - Apply Shared Blob recreate
    (can be merged with the step above if misreferences were detected)
  - Apply StatFS update
*/
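// Minimal shape of the detect/commit split described above (illustrative
// pseudo-code; BlueStoreRepairer accumulates scheduled fixes and flushes
// them in apply(), as used throughout _fsck_on_open below):
//
//   BlueStoreRepairer repairer;
//   // detection: walk the metadata and only *schedule* fixes
//   if (bad_shared_blob_key)
//     repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
//   if (statfs_mismatch)
//     repairer.fix_statfs(db, key, expected_statfs);
//   // commit: batched DB transactions, one batch per repair class
//   unsigned repaired = repairer.apply(db);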
int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
{
  dout(1) << __func__
          << (repair ? " repair" : " check")
          << (depth == FSCK_DEEP ? " (deep)" :
              depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
          << dendl;

  // in deep mode we need R/W access to be able to replay deferred ops
  bool read_only = !(repair || depth == FSCK_DEEP);

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  r = _open_db_and_around(read_only);
  if (r < 0)
    goto out_bdev;

  if (!read_only) {
    r = _upgrade_super();
    if (r < 0) {
      goto out_db;
    }
  }

  r = _open_collections();
  if (r < 0)
    goto out_db;

  mempool_thread.init();

  // we need the finisher and kv_{sync,finalize}_thread *just* for replay;
  // enable them in repair or deep mode only
  if (!read_only) {
    _kv_start();
    r = _deferred_replay();
    _kv_stop();
  }
  if (r < 0)
    goto out_scan;

  r = _fsck_on_open(depth, repair);

out_scan:
  mempool_thread.shutdown();
  _shutdown_cache();
out_db:
  _close_db_and_around(false);
out_bdev:
  _close_bdev();
out_fsid:
  _close_fsid();
out_path:
  _close_path();

  return r;
}

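// Entry-point sketch (illustrative; the exact wrappers live in BlueStore.h
// and may differ): the public fsck/repair entry points are assumed to be
// thin forwarders onto _fsck(), e.g.
//
//   int fsck(bool deep)   { return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false); }
//   int repair(bool deep) { return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true); }
//   int quick_fix()       { return _fsck(FSCK_SHALLOW, true); }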
int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
{
  dout(1) << __func__
          << " <<<START>>>"
          << (repair ? " repair" : " check")
          << (depth == FSCK_DEEP ? " (deep)" :
              depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
          << " start" << dendl;
  int64_t errors = 0;
  int64_t warnings = 0;
  unsigned repaired = 0;

  uint64_t_btree_t used_omap_head;
  uint64_t_btree_t used_sbids;

  mempool_dynamic_bitset used_blocks;
  KeyValueDB::Iterator it;
  store_statfs_t expected_store_statfs, actual_statfs;
  per_pool_statfs expected_pool_statfs;

  sb_info_map_t sb_info;

  uint64_t num_objects = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_spanning_blobs = 0;
  uint64_t num_shared_blobs = 0;
  uint64_t num_sharded_objects = 0;
  BlueStoreRepairer repairer;

  utime_t start = ceph_clock_now();

  _fsck_collections(&errors);
  used_blocks.resize(fm->get_alloc_units());
  apply_for_bitset_range(
    0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
      bs.set(pos);
    }
  );
  if (repair) {
    repairer.get_space_usage_tracker().init(
      bdev->get_size(),
      min_alloc_size);
  }

  if (bluefs) {
    if (cct->_conf->bluestore_bluefs_db_compatibility) {
      interval_set<uint64_t> bluefs_extents_db;
      bufferlist bl;
      db->get(PREFIX_SUPER, "bluefs_extents", &bl);
      auto p = bl.cbegin();
      auto prev_errors = errors;
      try {
        decode(bluefs_extents_db, p);
        bluefs_extents_db.union_of(bluefs_extents);
        bluefs_extents_db.subtract(bluefs_extents);
        if (!bluefs_extents_db.empty()) {
          derr << "fsck error: bluefs_extents inconsistency, "
               << "downgrade to previous releases might be broken."
               << dendl;
          ++errors;
        }
      } catch (buffer::error& e) {
        derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
        ++errors;
      }
      if (errors != prev_errors && repair) {
        repairer.fix_bluefs_extents(out_of_sync_fm);
      }
    }

    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply_for_bitset_range(
        e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
          bs.set(pos);
        });
    }
    int r = bluefs->fsck();
    if (r < 0) {
      return r;
    }
    if (r > 0)
      errors += r;
  }

  if (!per_pool_stat_collection) {
    const char *w;
    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
      w = "error";
      ++errors;
    } else {
      w = "warning";
      ++warnings;
    }
    derr << "fsck " << w << ": store not yet converted to per-pool stats"
         << dendl;
  }
  if (!per_pool_omap) {
    const char *w;
    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
      w = "error";
      ++errors;
    } else {
      w = "warning";
      ++warnings;
    }
    derr << "fsck " << w << ": store not yet converted to per-pool omap"
         << dendl;
  }

  // get expected statfs; reset unaffected fields to be able to compare
  // structs
  statfs(&actual_statfs);
  actual_statfs.total = 0;
  actual_statfs.internally_reserved = 0;
  actual_statfs.available = 0;
  actual_statfs.internal_metadata = 0;
  actual_statfs.omap_allocated = 0;

  if (g_conf()->bluestore_debug_fsck_abort) {
    dout(1) << __func__ << " debug abort" << dendl;
    goto out_scan;
  }
  // walk PREFIX_OBJ
  {
    dout(1) << __func__ << " walking object keyspace" << dendl;
    ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
    BlueStore::FSCK_ObjectCtx ctx(
      errors,
      warnings,
      num_objects,
      num_extents,
      num_blobs,
      num_sharded_objects,
      num_spanning_blobs,
      &used_blocks,
      &used_omap_head,
      // no need for the below lock when in non-shallow mode as
      // there is no multithreading in this case
      depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
      sb_info,
      expected_store_statfs,
      expected_pool_statfs,
      repair ? &repairer : nullptr);

    _fsck_check_objects(depth, ctx);
  }

  dout(1) << __func__ << " checking shared_blobs" << dendl;
  it = db->get_iterator(PREFIX_SHARED_BLOB);
  if (it) {
    // FIXME minor: perhaps simplify for shallow mode?
    // fill global if not overridden below
    auto expected_statfs = &expected_store_statfs;

    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      uint64_t sbid;
      if (get_key_shared_blob(key, &sbid)) {
        derr << "fsck error: bad key '" << key
             << "' in shared blob namespace" << dendl;
        if (repair) {
          repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
        }
        ++errors;
        continue;
      }
      auto p = sb_info.find(sbid);
      if (p == sb_info.end()) {
        derr << "fsck error: found stray shared blob data for sbid 0x"
             << std::hex << sbid << std::dec << dendl;
        if (repair) {
          repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
        }
        ++errors;
      } else {
        ++num_shared_blobs;
        sb_info_t& sbi = p->second;
        bluestore_shared_blob_t shared_blob(sbid);
        bufferlist bl = it->value();
        auto blp = bl.cbegin();
        try {
          decode(shared_blob, blp);
        } catch (buffer::error& e) {
          ++errors;
          // force an update and don't report as missing
          sbi.updated = sbi.passed = true;

          derr << "fsck error: failed to decode Shared Blob "
               << pretty_binary_string(it->key()) << dendl;
          if (repair) {
            dout(20) << __func__ << " undecodable Shared Blob, key:'"
                     << pretty_binary_string(it->key())
                     << "', removing" << dendl;
            repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
          }
          continue;
        }
        dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
        if (shared_blob.ref_map != sbi.ref_map) {
          derr << "fsck error: shared blob 0x" << std::hex << sbid
               << std::dec << " ref_map " << shared_blob.ref_map
               << " != expected " << sbi.ref_map << dendl;
          sbi.updated = true; // will update later in repair mode only!
          ++errors;
        }
        PExtentVector extents;
        for (auto &r : shared_blob.ref_map.ref_map) {
          extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
        }
        if (per_pool_stat_collection || repair) {
          expected_statfs = &expected_pool_statfs[sbi.pool_id];
        }
        errors += _fsck_check_extents(sbi.cid,
                                      p->second.oids.front(),
                                      extents,
                                      p->second.compressed,
                                      used_blocks,
                                      fm->get_alloc_size(),
                                      repair ? &repairer : nullptr,
                                      *expected_statfs,
                                      depth);
        sbi.passed = true;
      }
    }
  } // if (it)

  if (repair && repairer.preprocess_misreference(db)) {

    dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
    auto& space_tracker = repairer.get_space_usage_tracker();
    auto& misref_extents = repairer.get_misreferences();
    interval_set<uint64_t> to_release;
    it = db->get_iterator(PREFIX_OBJ);
    if (it) {
      // fill global if not overridden below
      auto expected_statfs = &expected_store_statfs;

      CollectionRef c;
      spg_t pgid;
      KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
      bool bypass_rest = false;
      for (it->lower_bound(string()); it->valid() && !bypass_rest;
           it->next()) {
        dout(30) << __func__ << " key "
                 << pretty_binary_string(it->key()) << dendl;
        if (is_extent_shard_key(it->key())) {
          continue;
        }

        ghobject_t oid;
        int r = get_key_object(it->key(), &oid);
        if (r < 0 || !space_tracker.is_used(oid)) {
          continue;
        }

        if (!c ||
            oid.shard_id != pgid.shard ||
            oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
            !c->contains(oid)) {
          c = nullptr;
          for (auto& p : coll_map) {
            if (p.second->contains(oid)) {
              c = p.second;
              break;
            }
          }
          if (!c) {
            continue;
          }
          if (per_pool_stat_collection || repair) {
            auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
            expected_statfs = &expected_pool_statfs[pool_id];
          }
        }
        if (!space_tracker.is_used(c->cid)) {
          continue;
        }

        dout(20) << __func__ << " check misreference for col:" << c->cid
                 << " obj:" << oid << dendl;

        OnodeRef o;
        o.reset(Onode::decode(c, oid, it->key(), it->value()));
        o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
        mempool::bluestore_fsck::set<BlobRef> blobs;

        for (auto& e : o->extent_map.extent_map) {
          blobs.insert(e.blob);
        }
        bool need_onode_update = false;
        bool first_dump = true;
        for (auto b : blobs) {
          bool broken_blob = false;
          auto& pextents = b->dirty_blob().dirty_extents();
          for (auto& e : pextents) {
            if (!e.is_valid()) {
              continue;
            }
            // for the sake of simplicity and proper shared blob handling
            // always rewrite the whole blob even when it's partially
            // misreferenced.
            if (misref_extents.intersects(e.offset, e.length)) {
              if (first_dump) {
                first_dump = false;
                _dump_onode<10>(cct, *o);
              }
              broken_blob = true;
              break;
            }
          }
          if (!broken_blob)
            continue;
          bool compressed = b->get_blob().is_compressed();
          need_onode_update = true;
          dout(10) << __func__
                   << " fix misreferences in oid:" << oid
                   << " " << *b << dendl;
          uint64_t b_off = 0;
          PExtentVector pext_to_release;
          pext_to_release.reserve(pextents.size());
          // rewriting all valid pextents
          for (auto e = pextents.begin(); e != pextents.end();
               b_off += e->length, e++) {
            if (!e->is_valid()) {
              continue;
            }
            PExtentVector exts;
            int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
                                                0, 0, &exts);
            if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
              derr << __func__
                   << " failed to allocate 0x" << std::hex << e->length
                   << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
                   << " min_alloc_size 0x" << min_alloc_size
                   << " available 0x " << alloc->get_free()
                   << std::dec << dendl;
              if (alloc_len > 0) {
                alloc->release(exts);
              }
              bypass_rest = true;
              break;
            }
            expected_statfs->allocated += e->length;
            if (compressed) {
              expected_statfs->data_compressed_allocated += e->length;
            }

            bufferlist bl;
            IOContext ioc(cct, NULL, true); // allow EIO
            r = bdev->read(e->offset, e->length, &bl, &ioc, false);
            if (r < 0) {
              derr << __func__ << " failed to read from 0x" << std::hex << e->offset
                   << "~" << e->length << std::dec << dendl;
              ceph_abort_msg("read failed, wtf");
            }
            pext_to_release.push_back(*e);
            e = pextents.erase(e);
            e = pextents.insert(e, exts.begin(), exts.end());
            b->get_blob().map_bl(
              b_off, bl,
              [&](uint64_t offset, bufferlist& t) {
                int r = bdev->write(offset, t, false);
                ceph_assert(r == 0);
              });
            e += exts.size() - 1;
            for (auto& p : exts) {
              fm->allocate(p.offset, p.length, txn);
            }
          } // for (auto e = pextents.begin(); e != pextents.end(); e++)

          if (b->get_blob().is_shared()) {
            b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);

            auto sb_it = sb_info.find(b->shared_blob->get_sbid());
            ceph_assert(sb_it != sb_info.end());
            sb_info_t& sbi = sb_it->second;

            for (auto& r : sbi.ref_map.ref_map) {
              expected_statfs->allocated -= r.second.length;
              if (sbi.compressed) {
                // NB: it's crucial to use the compressed flag from sb_info_t
                // as we originally used that value while accumulating
                // expected_statfs
                expected_statfs->data_compressed_allocated -= r.second.length;
              }
            }
            sbi.updated = sbi.passed = true;
            sbi.ref_map.clear();

            // relying on the blob's pextents to decide what to release.
            for (auto& p : pext_to_release) {
              to_release.union_insert(p.offset, p.length);
            }
          } else {
            for (auto& p : pext_to_release) {
              expected_statfs->allocated -= p.length;
              if (compressed) {
                expected_statfs->data_compressed_allocated -= p.length;
              }
              to_release.union_insert(p.offset, p.length);
            }
          }
          if (bypass_rest) {
            break;
          }
        } // for (auto b : blobs)
        if (need_onode_update) {
          o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
          _record_onode(o, txn);
        }
      } // for (it->lower_bound(string()); it->valid(); it->next())

      for (auto it = to_release.begin(); it != to_release.end(); ++it) {
        dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
                 << "~" << it.get_len() << std::dec << dendl;
        fm->release(it.get_start(), it.get_len(), txn);
      }
      alloc->release(to_release);
      to_release.clear();
    } // if (it)
  } // if (repair && repairer.preprocess_misreference())

  if (depth != FSCK_SHALLOW) {
    for (auto &p : sb_info) {
      sb_info_t& sbi = p.second;
      if (!sbi.passed) {
        derr << "fsck error: missing " << *sbi.sb << dendl;
        ++errors;
      }
      if (repair && (!sbi.passed || sbi.updated)) {
        auto sbid = p.first;
        if (sbi.ref_map.empty()) {
          ceph_assert(sbi.passed);
          dout(20) << __func__ << " " << *sbi.sb
                   << " is empty, removing" << dendl;
          repairer.fix_shared_blob(db, sbid, nullptr);
        } else {
          bufferlist bl;
          bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
          encode(persistent, bl);
          dout(20) << __func__ << " " << *sbi.sb
                   << " is " << bl.length() << " bytes, updating" << dendl;

          repairer.fix_shared_blob(db, sbid, &bl);
        }
      }
    }
  }
  sb_info.clear();

  // check global stats only if fscking (not repairing) w/o per-pool stats
  if (!per_pool_stat_collection &&
      !repair &&
      !(actual_statfs == expected_store_statfs)) {
    derr << "fsck error: actual " << actual_statfs
         << " != expected " << expected_store_statfs << dendl;
    if (repair) {
      repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
                          expected_store_statfs);
    }
    ++errors;
  }

  dout(1) << __func__ << " checking pool_statfs" << dendl;
  _fsck_check_pool_statfs(expected_pool_statfs,
                          errors, warnings, repair ? &repairer : nullptr);

  if (depth != FSCK_SHALLOW) {
    dout(1) << __func__ << " checking for stray omap data " << dendl;
    it = db->get_iterator(PREFIX_OMAP);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
        uint64_t omap_head;
        _key_decode_u64(it->key().c_str(), &omap_head);
        if (used_omap_head.count(omap_head) == 0 &&
            omap_head != last_omap_head) {
          fsck_derr(errors, MAX_FSCK_ERROR_LINES)
            << "fsck error: found stray omap data on omap_head "
            << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
          ++errors;
          last_omap_head = omap_head;
        }
      }
    }
    it = db->get_iterator(PREFIX_PGMETA_OMAP);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
        uint64_t omap_head;
        _key_decode_u64(it->key().c_str(), &omap_head);
        if (used_omap_head.count(omap_head) == 0 &&
            omap_head != last_omap_head) {
          fsck_derr(errors, MAX_FSCK_ERROR_LINES)
            << "fsck error: found stray (pgmeta) omap data on omap_head "
            << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
          last_omap_head = omap_head;
          ++errors;
        }
      }
    }
    it = db->get_iterator(PREFIX_PERPOOL_OMAP);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
        uint64_t pool;
        uint64_t omap_head;
        string k = it->key();
        const char *c = k.c_str();
        c = _key_decode_u64(c, &pool);
        c = _key_decode_u64(c, &omap_head);
        if (used_omap_head.count(omap_head) == 0 &&
            omap_head != last_omap_head) {
          fsck_derr(errors, MAX_FSCK_ERROR_LINES)
            << "fsck error: found stray (per-pool) omap data on omap_head "
            << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
          ++errors;
          last_omap_head = omap_head;
        }
      }
    }
    dout(1) << __func__ << " checking deferred events" << dendl;
    it = db->get_iterator(PREFIX_DEFERRED);
    if (it) {
      for (it->lower_bound(string()); it->valid(); it->next()) {
        bufferlist bl = it->value();
        auto p = bl.cbegin();
        bluestore_deferred_transaction_t wt;
        try {
          decode(wt, p);
        } catch (buffer::error& e) {
          derr << "fsck error: failed to decode deferred txn "
               << pretty_binary_string(it->key()) << dendl;
          if (repair) {
            dout(20) << __func__ << " undecodable deferred TXN record, key: '"
                     << pretty_binary_string(it->key())
                     << "', removing" << dendl;
            repairer.remove_key(db, PREFIX_DEFERRED, it->key());
          }
          continue;
        }
        dout(20) << __func__ << " deferred " << wt.seq
                 << " ops " << wt.ops.size()
                 << " released 0x" << std::hex << wt.released << std::dec << dendl;
        for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
          apply_for_bitset_range(
            e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
            [&](uint64_t pos, mempool_dynamic_bitset &bs) {
              bs.set(pos);
            }
          );
        }
      }
    }
8804
8805 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
8806 {
8807 // remove bluefs_extents from used set since the freelist doesn't
8808 // know they are allocated.
8809 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
9f95a23c 8810 apply_for_bitset_range(
b32b8144 8811 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 8812 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130 8813 bs.reset(pos);
7c673cae
FG
8814 }
8815 );
8816 }
eafe8130
TL
8817 fm->enumerate_reset();
8818 uint64_t offset, length;
8819 while (fm->enumerate_next(db, &offset, &length)) {
8820 bool intersects = false;
9f95a23c 8821 apply_for_bitset_range(
eafe8130
TL
8822 offset, length, fm->get_alloc_size(), used_blocks,
8823 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
8824 if (bs.test(pos)) {
8825 if (offset == SUPER_RESERVED &&
8826 length == min_alloc_size - SUPER_RESERVED) {
8827 // this is due to the change just after luminous to min_alloc_size
8828 // granularity allocations, and our baked in assumption at the top
8829 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
8830 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
8831 // since we will never allocate this region below min_alloc_size.
8832 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
8833 << " and min_alloc_size, 0x" << std::hex << offset << "~"
8834 << length << std::dec << dendl;
8835 } else {
8836 intersects = true;
8837 if (repair) {
8838 repairer.fix_false_free(db, fm,
8839 pos * min_alloc_size,
8840 min_alloc_size);
8841 }
11fdf7f2 8842 }
eafe8130
TL
8843 } else {
8844 bs.set(pos);
8845 }
7c673cae 8846 }
eafe8130
TL
8847 );
8848 if (intersects) {
8849 derr << "fsck error: free extent 0x" << std::hex << offset
8850 << "~" << length << std::dec
8851 << " intersects allocated blocks" << dendl;
8852 ++errors;
7c673cae 8853 }
b5b8bbf5 8854 }
eafe8130
TL
8855 fm->enumerate_reset();
8856 size_t count = used_blocks.count();
8857 if (used_blocks.size() != count) {
8858 ceph_assert(used_blocks.size() > count);
8859 used_blocks.flip();
8860 size_t start = used_blocks.find_first();
8861 while (start != decltype(used_blocks)::npos) {
8862 size_t cur = start;
8863 while (true) {
8864 size_t next = used_blocks.find_next(cur);
8865 if (next != cur + 1) {
8866 ++errors;
8867 derr << "fsck error: leaked extent 0x" << std::hex
8868 << ((uint64_t)start * fm->get_alloc_size()) << "~"
8869 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
8870 << dendl;
8871 if (repair) {
8872 repairer.fix_leaked(db,
8873 fm,
8874 start * min_alloc_size,
8875 (cur + 1 - start) * min_alloc_size);
8876 }
8877 start = next;
8878 break;
11fdf7f2 8879 }
eafe8130 8880 cur = next;
b5b8bbf5 8881 }
eafe8130
TL
8882 }
8883 used_blocks.flip();
b5b8bbf5 8884 }
7c673cae
FG
8885 }
8886 }
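// Editor's note: a minimal, self-contained sketch of the leak scan
// above, using boost::dynamic_bitset (which, like the
// mempool_dynamic_bitset used here, offers find_first/find_next).
// Flipping the "used" bitmap turns allocated-but-unreferenced units
// into set bits, which are then reported as contiguous runs.
#include <cstdint>
#include <iostream>
#include <boost/dynamic_bitset.hpp>

static void report_leaks_sketch(boost::dynamic_bitset<>& used,
                                uint64_t alloc_size)
{
  used.flip();  // set bits now mark units the freelist never covered
  for (auto start = used.find_first();
       start != boost::dynamic_bitset<>::npos; ) {
    auto cur = start;
    auto next = used.find_next(cur);
    while (next == cur + 1) {   // extend the contiguous run
      cur = next;
      next = used.find_next(cur);
    }
    std::cout << "leaked extent 0x" << std::hex << start * alloc_size
              << "~" << (cur + 1 - start) * alloc_size << std::dec << "\n";
    start = next;               // npos terminates the outer loop
  }
  used.flip();  // restore the bitmap for later checks
}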
11fdf7f2 8887 if (repair) {
9f95a23c
TL
8888 if (!per_pool_omap) {
8889 dout(5) << __func__ << " marking per_pool_omap=1" << dendl;
8890 repairer.fix_per_pool_omap(db);
8891 }
8892
11fdf7f2
TL
8893 dout(5) << __func__ << " applying repair results" << dendl;
8894 repaired = repairer.apply(db);
8895 dout(5) << __func__ << " repair applied" << dendl;
8896 }
7c673cae 8897
eafe8130 8898out_scan:
7c673cae
FG
8899 dout(2) << __func__ << " " << num_objects << " objects, "
8900 << num_sharded_objects << " of them sharded. "
8901 << dendl;
8902 dout(2) << __func__ << " " << num_extents << " extents to "
8903 << num_blobs << " blobs, "
8904 << num_spanning_blobs << " spanning, "
8905 << num_shared_blobs << " shared."
8906 << dendl;
8907
8908 utime_t duration = ceph_clock_now() - start;
9f95a23c
TL
8909 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
8910 << warnings << " warnings, "
8911 << repaired << " repaired, "
8912 << (errors + warnings - (int)repaired) << " remaining in "
7c673cae 8913 << duration << " seconds" << dendl;
9f95a23c
TL
8914
8915 // In non-repair mode we should return error count only as
8916 // it indicates if store status is OK.
8917 // In repair mode both errors and warnings are taken into account
8918 // since repaired counter relates to them both.
8919 return repair ? errors + warnings - (int)repaired : errors;
11fdf7f2
TL
8920}
8921
8922/// methods to inject various errors fsck can repair
8923void BlueStore::inject_broken_shared_blob_key(const string& key,
8924 const bufferlist& bl)
8925{
8926 KeyValueDB::Transaction txn;
8927 txn = db->get_transaction();
8928 txn->set(PREFIX_SHARED_BLOB, key, bl);
8929 db->submit_transaction_sync(txn);
8930};
8931
8932void BlueStore::inject_leaked(uint64_t len)
8933{
8934 KeyValueDB::Transaction txn;
8935 txn = db->get_transaction();
8936
8937 PExtentVector exts;
8938 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
8939 min_alloc_size * 256, 0, &exts);
8940 ceph_assert(alloc_len >= (int64_t)len);
8941 for (auto& p : exts) {
8942 fm->allocate(p.offset, p.length, txn);
8943 }
8944 db->submit_transaction_sync(txn);
8945}
8946
8947void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
8948{
8949 KeyValueDB::Transaction txn;
8950 OnodeRef o;
8951 CollectionRef c = _get_collection(cid);
8952 ceph_assert(c);
8953 {
9f95a23c 8954 std::unique_lock l{c->lock}; // just to avoid internal asserts
11fdf7f2
TL
8955 o = c->get_onode(oid, false);
8956 ceph_assert(o);
8957 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8958 }
8959
8960 bool injected = false;
8961 txn = db->get_transaction();
8962 auto& em = o->extent_map.extent_map;
8963 std::vector<const PExtentVector*> v;
8964 if (em.size()) {
8965 v.push_back(&em.begin()->blob->get_blob().get_extents());
8966 }
8967 if (em.size() > 1) {
8968 auto it = em.end();
8969 --it;
8970 v.push_back(&(it->blob->get_blob().get_extents()));
8971 }
8972 for (auto pext : v) {
8973 if (pext->size()) {
8974 auto p = pext->begin();
8975 while (p != pext->end()) {
8976 if (p->is_valid()) {
8977 dout(20) << __func__ << " release 0x" << std::hex << p->offset
8978 << "~" << p->length << std::dec << dendl;
8979 fm->release(p->offset, p->length, txn);
8980 injected = true;
8981 break;
8982 }
8983 ++p;
8984 }
8985 }
8986 }
8987 ceph_assert(injected);
8988 db->submit_transaction_sync(txn);
8989}
8990
9f95a23c
TL
8991void BlueStore::inject_legacy_omap()
8992{
8993 dout(1) << __func__ << dendl;
8994 per_pool_omap = false;
8995 KeyValueDB::Transaction txn;
8996 txn = db->get_transaction();
8997 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
8998 db->submit_transaction_sync(txn);
8999}
9000
9001void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
9002{
9003 dout(1) << __func__ << " "
9004 << cid << " " << oid
9005 << dendl;
9006 KeyValueDB::Transaction txn;
9007 OnodeRef o;
9008 CollectionRef c = _get_collection(cid);
9009 ceph_assert(c);
9010 {
9011 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9012 o = c->get_onode(oid, false);
9013 ceph_assert(o);
9014 }
9015 o->onode.clear_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PGMETA_OMAP);
9016 txn = db->get_transaction();
9017 _record_onode(o, txn);
9018 db->submit_transaction_sync(txn);
9019}
9020
9021
11fdf7f2
TL
9022void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
9023{
9024 BlueStoreRepairer repairer;
9025 repairer.fix_statfs(db, key, new_statfs);
9026 repairer.apply(db);
9027}
9028
eafe8130
TL
9029void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
9030{
9031 KeyValueDB::Transaction t = db->get_transaction();
9032 volatile_statfs v;
9033 v = new_statfs;
9034 bufferlist bl;
9035 v.encode(bl);
9036 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
9037 db->submit_transaction_sync(t);
9038}
9039
11fdf7f2
TL
9040void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
9041 coll_t cid2, ghobject_t oid2,
9042 uint64_t offset)
9043{
9044 OnodeRef o1;
9045 CollectionRef c1 = _get_collection(cid1);
9046 ceph_assert(c1);
9047 {
9f95a23c 9048 std::unique_lock l{c1->lock}; // just to avoid internal asserts
11fdf7f2
TL
9049 o1 = c1->get_onode(oid1, false);
9050 ceph_assert(o1);
9051 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9052 }
9053 OnodeRef o2;
9054 CollectionRef c2 = _get_collection(cid2);
9055 ceph_assert(c2);
9056 {
9f95a23c 9057 std::unique_lock l{c2->lock}; // just to avoid internal asserts
11fdf7f2
TL
9058 o2 = c2->get_onode(oid2, false);
9059 ceph_assert(o2);
9060 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9061 }
9062 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
9063 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
9064
9065 // require onode/extent layout to be the same (and simple)
9066 // to make things easier
9067 ceph_assert(o1->onode.extent_map_shards.empty());
9068 ceph_assert(o2->onode.extent_map_shards.empty());
9069 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
9070 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
9071 ceph_assert(e1.logical_offset == e2.logical_offset);
9072 ceph_assert(e1.length == e2.length);
9073 ceph_assert(e1.blob_offset == e2.blob_offset);
9074
9075 KeyValueDB::Transaction txn;
9076 txn = db->get_transaction();
9077
9078 // along with misreference error this will create space leaks errors
9079 e2.blob->dirty_blob() = e1.blob->get_blob();
9080 o2->extent_map.dirty_range(offset, e2.length);
9081 o2->extent_map.update(txn, false);
9082
9083 _record_onode(o2, txn);
9084 db->submit_transaction_sync(txn);
7c673cae
FG
9085}
9086
9087void BlueStore::collect_metadata(map<string,string> *pm)
9088{
9089 dout(10) << __func__ << dendl;
9090 bdev->collect_metadata("bluestore_bdev_", pm);
9091 if (bluefs) {
9092 (*pm)["bluefs"] = "1";
9f95a23c
TL
9093 // this value is for backward compatibility only
9094 (*pm)["bluefs_single_shared_device"] = \
9095 stringify((int)bluefs_layout.single_shared_device());
9096 (*pm)["bluefs_dedicated_db"] = \
9097 stringify((int)bluefs_layout.dedicated_db);
9098 (*pm)["bluefs_dedicated_wal"] = \
9099 stringify((int)bluefs_layout.dedicated_wal);
9100 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
7c673cae
FG
9101 } else {
9102 (*pm)["bluefs"] = "0";
9103 }
11fdf7f2
TL
9104
9105 // report numa mapping for underlying devices
9106 int node = -1;
9107 set<int> nodes;
9108 set<string> failed;
9109 int r = get_numa_node(&node, &nodes, &failed);
9110 if (r >= 0) {
9111 if (!failed.empty()) {
9112 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
9113 }
9114 if (!nodes.empty()) {
9115 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
9116 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
9117 }
9118 if (node >= 0) {
9119 (*pm)["objectstore_numa_node"] = stringify(node);
9120 }
9121 }
9122}
9123
9124int BlueStore::get_numa_node(
9125 int *final_node,
9126 set<int> *out_nodes,
9127 set<string> *out_failed)
9128{
9129 int node = -1;
9130 set<string> devices;
9131 get_devices(&devices);
9132 set<int> nodes;
9133 set<string> failed;
9134 for (auto& devname : devices) {
9135 int n;
9136 BlkDev bdev(devname);
9137 int r = bdev.get_numa_node(&n);
9138 if (r < 0) {
9139 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
9140 << dendl;
9141 failed.insert(devname);
9142 continue;
9143 }
9144 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
9145 << dendl;
9146 nodes.insert(n);
9147 if (node < 0) {
9148 node = n;
9149 }
9150 }
9151 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
9152 *final_node = node;
9153 }
9154 if (out_nodes) {
9155 *out_nodes = nodes;
9156 }
9157 if (out_failed) {
9158 *out_failed = failed;
9159 }
9160 return 0;
9161}
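// Editor's note: hypothetical usage of get_numa_node() above ("store"
// and the pinning step are placeholders). final_node is only filled in
// when every backing device reported the same node and none failed
// detection, so a single affine node can be trusted.
#include <set>
#include <string>

void pin_to_numa_sketch(BlueStore* store)
{
  int node = -1;
  std::set<int> nodes;
  std::set<std::string> failed;
  if (store->get_numa_node(&node, &nodes, &failed) == 0 && node >= 0) {
    // all devices sit on one NUMA node; callers may pin worker threads
    // and allocate memory on `node` here.
  }
}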
9162
9163int BlueStore::get_devices(set<string> *ls)
9164{
9165 if (bdev) {
9166 bdev->get_devices(ls);
9167 if (bluefs) {
9168 bluefs->get_devices(ls);
9169 }
9170 return 0;
9171 }
9172
9173 // grumble, we haven't started up yet.
9174 int r = _open_path();
9175 if (r < 0)
9176 goto out;
9177 r = _open_fsid(false);
9178 if (r < 0)
9179 goto out_path;
9180 r = _read_fsid(&fsid);
9181 if (r < 0)
9182 goto out_fsid;
9183 r = _lock_fsid();
9184 if (r < 0)
9185 goto out_fsid;
9186 r = _open_bdev(false);
9187 if (r < 0)
9188 goto out_fsid;
9189 r = _minimal_open_bluefs(false);
9190 if (r < 0)
9191 goto out_bdev;
9192 bdev->get_devices(ls);
9193 if (bluefs) {
9194 bluefs->get_devices(ls);
9195 }
9196 r = 0;
9197 _minimal_close_bluefs();
9198 out_bdev:
9199 _close_bdev();
9200 out_fsid:
9201 _close_fsid();
9202 out_path:
9203 _close_path();
9204 out:
9205 return r;
7c673cae
FG
9206}
9207
11fdf7f2 9208void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
7c673cae
FG
9209{
9210 buf->reset();
11fdf7f2 9211
9f95a23c
TL
9212 buf->omap_allocated =
9213 db->estimate_prefix_size(PREFIX_OMAP, string()) +
9214 db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string());
11fdf7f2
TL
9215
9216 uint64_t bfree = alloc->get_free();
7c673cae
FG
9217
9218 if (bluefs) {
9f95a23c
TL
9219 int64_t bluefs_total = bluefs->get_total(bluefs_layout.shared_bdev);
9220 int64_t bluefs_free = bluefs->get_free(bluefs_layout.shared_bdev);
94b18763
FG
9221 // part of our shared device is "free" according to BlueFS, but we
9222 // can't touch bluestore_bluefs_min of it.
9223 int64_t shared_available = std::min(
11fdf7f2
TL
9224 bluefs_free,
9225 int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
9226 buf->internally_reserved = bluefs_total - shared_available;
94b18763 9227 if (shared_available > 0) {
11fdf7f2
TL
9228 bfree += shared_available;
9229 }
9230 // include dedicated db, too, if that isn't the shared device.
9f95a23c 9231 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 9232 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 9233 }
11fdf7f2
TL
9234 // call any non-omap bluefs space "internal metadata"
9235 buf->internal_metadata =
9236 std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
9237 - buf->omap_allocated;
7c673cae
FG
9238 }
9239
11fdf7f2
TL
9240 uint64_t thin_total, thin_avail;
9241 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
9242 buf->total += thin_total;
9243
9244 // we are limited by both the size of the virtual device and the
9245 // underlying physical device.
9246 bfree = std::min(bfree, thin_avail);
9247
9248 buf->allocated = thin_total - thin_avail;
9249 } else {
9250 buf->total += bdev->get_size();
9251 }
9252 buf->available = bfree;
9253}
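// Editor's note: a worked example (hypothetical numbers) of the shared
// device arithmetic above. With bluefs_total = 10 GiB, bluefs_free =
// 4 GiB and bluestore_bluefs_min = 1 GiB:
//   shared_available    = min(4 GiB, 10 GiB - 1 GiB) = 4 GiB
//   internally_reserved = 10 GiB - 4 GiB             = 6 GiB
// so BlueFS's 4 GiB of slack is re-advertised to BlueStore as free,
// while the remainder of the BlueFS share counts as reserved.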
9254
9255int BlueStore::statfs(struct store_statfs_t *buf,
9256 osd_alert_list_t* alerts)
9257{
9258 if (alerts) {
9259 alerts->clear();
9260 _log_alerts(*alerts);
9261 }
9262 _get_statfs_overall(buf);
31f18b77 9263 {
11fdf7f2 9264 std::lock_guard l(vstatfs_lock);
31f18b77 9265 buf->allocated = vstatfs.allocated();
11fdf7f2
TL
9266 buf->data_stored = vstatfs.stored();
9267 buf->data_compressed = vstatfs.compressed();
9268 buf->data_compressed_original = vstatfs.compressed_original();
9269 buf->data_compressed_allocated = vstatfs.compressed_allocated();
9270 }
9271
9272 dout(20) << __func__ << " " << *buf << dendl;
9273 return 0;
9274}
9275
9f95a23c
TL
9276int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
9277 bool *out_per_pool_omap)
11fdf7f2
TL
9278{
9279 dout(20) << __func__ << " pool " << pool_id << dendl;
81eedcae 9280
11fdf7f2
TL
9281 if (!per_pool_stat_collection) {
9282 dout(20) << __func__ << " not supported in legacy mode " << dendl;
9283 return -ENOTSUP;
7c673cae 9284 }
11fdf7f2 9285 buf->reset();
7c673cae 9286
11fdf7f2
TL
9287 {
9288 std::lock_guard l(vstatfs_lock);
9289 osd_pools[pool_id].publish(buf);
9290 }
9f95a23c
TL
9291
9292 string key_prefix;
9293 _key_encode_u64(pool_id, &key_prefix);
9294 buf->omap_allocated = db->estimate_prefix_size(PREFIX_PERPOOL_OMAP,
9295 key_prefix);
9296 *out_per_pool_omap = per_pool_omap;
9297
11fdf7f2 9298 dout(10) << __func__ << " " << *buf << dendl;
7c673cae
FG
9299 return 0;
9300}
9301
81eedcae
TL
9302void BlueStore::_check_legacy_statfs_alert()
9303{
9304 string s;
9305 if (!per_pool_stat_collection &&
eafe8130 9306 cct->_conf->bluestore_warn_on_legacy_statfs) {
81eedcae
TL
9307 s = "legacy statfs reporting detected, "
9308 "suggest to run store repair to get consistent statistic reports";
9309 }
9310 std::lock_guard l(qlock);
9311 legacy_statfs_alert = s;
9312}
9313
9f95a23c
TL
9314void BlueStore::_check_no_per_pool_omap_alert()
9315{
9316 string s;
9317 if (!per_pool_omap &&
9318 cct->_conf->bluestore_warn_on_no_per_pool_omap) {
9319 s = "legacy (not per-pool) omap detected, "
9320 "suggest to run store repair to measure per-pool omap usage";
9321 }
9322 std::lock_guard l(qlock);
9323 no_per_pool_omap_alert = s;
9324}
9325
7c673cae
FG
9326// ---------------
9327// cache
9328
9329BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
9330{
9f95a23c 9331 std::shared_lock l(coll_lock);
7c673cae
FG
9332 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
9333 if (cp == coll_map.end())
9334 return CollectionRef();
9335 return cp->second;
9336}
9337
9338void BlueStore::_queue_reap_collection(CollectionRef& c)
9339{
9340 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
9341 // _reap_collections runs in the same thread as this,
9342 // so no lock is needed.
7c673cae
FG
9343 removed_collections.push_back(c);
9344}
9345
9346void BlueStore::_reap_collections()
9347{
94b18763 9348
7c673cae
FG
9349 list<CollectionRef> removed_colls;
9350 {
94b18763
FG
9351 // _queue_reap_collection runs in the same thread as this,
9352 // so no lock is needed.
9353 if (!removed_collections.empty())
9354 removed_colls.swap(removed_collections);
9355 else
9356 return;
7c673cae
FG
9357 }
9358
94b18763
FG
9359 list<CollectionRef>::iterator p = removed_colls.begin();
9360 while (p != removed_colls.end()) {
7c673cae
FG
9361 CollectionRef c = *p;
9362 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9363 if (c->onode_map.map_any([&](OnodeRef o) {
11fdf7f2 9364 ceph_assert(!o->exists);
7c673cae
FG
9365 if (o->flushing_count.load()) {
9366 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
9367 << " flush_txns " << o->flushing_count << dendl;
94b18763 9368 return true;
7c673cae 9369 }
94b18763 9370 return false;
7c673cae 9371 })) {
94b18763 9372 ++p;
7c673cae
FG
9373 continue;
9374 }
9375 c->onode_map.clear();
94b18763 9376 p = removed_colls.erase(p);
7c673cae
FG
9377 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
9378 }
94b18763 9379 if (removed_colls.empty()) {
7c673cae 9380 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
9381 } else {
9382 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
9383 }
9384}
9385
9386void BlueStore::_update_cache_logger()
9387{
9388 uint64_t num_onodes = 0;
9f95a23c 9389 uint64_t num_pinned_onodes = 0;
7c673cae
FG
9390 uint64_t num_extents = 0;
9391 uint64_t num_blobs = 0;
9392 uint64_t num_buffers = 0;
9393 uint64_t num_buffer_bytes = 0;
9f95a23c
TL
9394 for (auto c : onode_cache_shards) {
9395 c->add_stats(&num_onodes, &num_pinned_onodes);
9396 }
9397 for (auto c : buffer_cache_shards) {
9398 c->add_stats(&num_extents, &num_blobs,
9399 &num_buffers, &num_buffer_bytes);
7c673cae
FG
9400 }
9401 logger->set(l_bluestore_onodes, num_onodes);
9f95a23c 9402 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
7c673cae
FG
9403 logger->set(l_bluestore_extents, num_extents);
9404 logger->set(l_bluestore_blobs, num_blobs);
9405 logger->set(l_bluestore_buffers, num_buffers);
9406 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
9407}
9408
9409// ---------------
9410// read operations
9411
9412ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
9413{
9414 return _get_collection(cid);
9415}
9416
11fdf7f2
TL
9417ObjectStore::CollectionHandle BlueStore::create_new_collection(
9418 const coll_t& cid)
7c673cae 9419{
9f95a23c
TL
9420 std::unique_lock l{coll_lock};
9421 auto c = ceph::make_ref<Collection>(
11fdf7f2 9422 this,
9f95a23c
TL
9423 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
9424 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
11fdf7f2
TL
9425 cid);
9426 new_coll_map[cid] = c;
9f95a23c 9427 _osr_attach(c.get());
11fdf7f2
TL
9428 return c;
9429}
9430
9431void BlueStore::set_collection_commit_queue(
9432 const coll_t& cid,
9433 ContextQueue *commit_queue)
9434{
9435 if (commit_queue) {
9f95a23c 9436 std::shared_lock l(coll_lock);
11fdf7f2
TL
9437 if (coll_map.count(cid)) {
9438 coll_map[cid]->commit_queue = commit_queue;
9439 } else if (new_coll_map.count(cid)) {
9440 new_coll_map[cid]->commit_queue = commit_queue;
9441 }
9442 }
7c673cae
FG
9443}
9444
11fdf7f2 9445
7c673cae
FG
9446bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
9447{
9448 Collection *c = static_cast<Collection *>(c_.get());
9449 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
9450 if (!c->exists)
9451 return false;
9452
9453 bool r = true;
9454
9455 {
9f95a23c 9456 std::shared_lock l(c->lock);
7c673cae
FG
9457 OnodeRef o = c->get_onode(oid, false);
9458 if (!o || !o->exists)
9459 r = false;
9460 }
9461
7c673cae
FG
9462 return r;
9463}
9464
7c673cae
FG
9465int BlueStore::stat(
9466 CollectionHandle &c_,
9467 const ghobject_t& oid,
9468 struct stat *st,
9469 bool allow_eio)
9470{
9471 Collection *c = static_cast<Collection *>(c_.get());
9472 if (!c->exists)
9473 return -ENOENT;
9474 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9475
9476 {
9f95a23c 9477 std::shared_lock l(c->lock);
7c673cae
FG
9478 OnodeRef o = c->get_onode(oid, false);
9479 if (!o || !o->exists)
9480 return -ENOENT;
9481 st->st_size = o->onode.size;
9482 st->st_blksize = 4096;
9483 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
9484 st->st_nlink = 1;
9485 }
9486
7c673cae
FG
9487 int r = 0;
9488 if (_debug_mdata_eio(oid)) {
9489 r = -EIO;
9490 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9491 }
9492 return r;
9493}
9494int BlueStore::set_collection_opts(
11fdf7f2 9495 CollectionHandle& ch,
7c673cae
FG
9496 const pool_opts_t& opts)
9497{
7c673cae 9498 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 9499 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
7c673cae
FG
9500 if (!c->exists)
9501 return -ENOENT;
9f95a23c 9502 std::unique_lock l{c->lock};
7c673cae
FG
9503 c->pool_opts = opts;
9504 return 0;
9505}
9506
7c673cae
FG
9507int BlueStore::read(
9508 CollectionHandle &c_,
9509 const ghobject_t& oid,
9510 uint64_t offset,
9511 size_t length,
9512 bufferlist& bl,
224ce89b 9513 uint32_t op_flags)
7c673cae 9514{
11fdf7f2 9515 auto start = mono_clock::now();
7c673cae
FG
9516 Collection *c = static_cast<Collection *>(c_.get());
9517 const coll_t &cid = c->get_cid();
9518 dout(15) << __func__ << " " << cid << " " << oid
9519 << " 0x" << std::hex << offset << "~" << length << std::dec
9520 << dendl;
9521 if (!c->exists)
9522 return -ENOENT;
9523
9524 bl.clear();
9525 int r;
9526 {
9f95a23c 9527 std::shared_lock l(c->lock);
11fdf7f2 9528 auto start1 = mono_clock::now();
7c673cae 9529 OnodeRef o = c->get_onode(oid, false);
494da23a
TL
9530 log_latency("get_onode@read",
9531 l_bluestore_read_onode_meta_lat,
9532 mono_clock::now() - start1,
9533 cct->_conf->bluestore_log_op_age);
7c673cae
FG
9534 if (!o || !o->exists) {
9535 r = -ENOENT;
9536 goto out;
9537 }
9538
9539 if (offset == length && offset == 0)
9540 length = o->onode.size;
9541
9542 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
9543 if (r == -EIO) {
9544 logger->inc(l_bluestore_read_eio);
9545 }
7c673cae
FG
9546 }
9547
9548 out:
28e407b8 9549 if (r >= 0 && _debug_data_eio(oid)) {
7c673cae
FG
9550 r = -EIO;
9551 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11fdf7f2
TL
9552 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
9553 cct->_conf->bluestore_debug_random_read_err &&
9554 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
9555 100.0)) == 0) {
224ce89b
WB
9556 dout(0) << __func__ << ": inject random EIO" << dendl;
9557 r = -EIO;
7c673cae
FG
9558 }
9559 dout(10) << __func__ << " " << cid << " " << oid
9560 << " 0x" << std::hex << offset << "~" << length << std::dec
9561 << " = " << r << dendl;
494da23a
TL
9562 log_latency(__func__,
9563 l_bluestore_read_lat,
9564 mono_clock::now() - start,
9565 cct->_conf->bluestore_log_op_age);
7c673cae
FG
9566 return r;
9567}
9568
9f95a23c 9569void BlueStore::_read_cache(
7c673cae
FG
9570 OnodeRef o,
9571 uint64_t offset,
9572 size_t length,
9f95a23c
TL
9573 int read_cache_policy,
9574 ready_regions_t& ready_regions,
9575 blobs2read_t& blobs2read)
7c673cae 9576{
7c673cae 9577 // build a blob-wise list of stuff to read (that isn't cached)
7c673cae
FG
9578 unsigned left = length;
9579 uint64_t pos = offset;
7c673cae
FG
9580 auto lp = o->extent_map.seek_lextent(offset);
9581 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9582 if (pos < lp->logical_offset) {
9583 unsigned hole = lp->logical_offset - pos;
9584 if (hole >= left) {
9f95a23c 9585 break;
7c673cae
FG
9586 }
9587 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9f95a23c 9588 << std::dec << dendl;
7c673cae
FG
9589 pos += hole;
9590 left -= hole;
9591 }
94b18763 9592 BlobRef& bptr = lp->blob;
7c673cae
FG
9593 unsigned l_off = pos - lp->logical_offset;
9594 unsigned b_off = l_off + lp->blob_offset;
9595 unsigned b_len = std::min(left, lp->length - l_off);
9596
9597 ready_regions_t cache_res;
9598 interval_set<uint32_t> cache_interval;
9599 bptr->shared_blob->bc.read(
91327a77
AA
9600 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9601 read_cache_policy);
7c673cae 9602 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c
TL
9603 << " need 0x" << b_off << "~" << b_len
9604 << " cache has 0x" << cache_interval
9605 << std::dec << dendl;
7c673cae
FG
9606
9607 auto pc = cache_res.begin();
11fdf7f2 9608 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
7c673cae
FG
9609 while (b_len > 0) {
9610 unsigned l;
9611 if (pc != cache_res.end() &&
9f95a23c
TL
9612 pc->first == b_off) {
9613 l = pc->second.length();
9614 ready_regions[pos].claim(pc->second);
9615 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9616 << b_off << "~" << l << std::dec << dendl;
9617 ++pc;
7c673cae 9618 } else {
9f95a23c
TL
9619 l = b_len;
9620 if (pc != cache_res.end()) {
9621 ceph_assert(pc->first > b_off);
9622 l = pc->first - b_off;
9623 }
9624 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9625 << b_off << "~" << l << std::dec << dendl;
9626 // merge regions
9627 {
9628 uint64_t r_off = b_off;
9629 uint64_t r_len = l;
9630 uint64_t front = r_off % chunk_size;
9631 if (front) {
9632 r_off -= front;
9633 r_len += front;
9634 }
9635 unsigned tail = r_len % chunk_size;
9636 if (tail) {
9637 r_len += chunk_size - tail;
9638 }
9639 bool merged = false;
9640 regions2read_t& r2r = blobs2read[bptr];
9641 if (r2r.size()) {
9642 read_req_t& pre = r2r.back();
9643 if (r_off <= (pre.r_off + pre.r_len)) {
9644 front += (r_off - pre.r_off);
9645 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
9646 pre.regs.emplace_back(region_t(pos, b_off, l, front));
9647 merged = true;
9648 }
9649 }
9650 if (!merged) {
9651 read_req_t req(r_off, r_len);
9652 req.regs.emplace_back(region_t(pos, b_off, l, front));
9653 r2r.emplace_back(std::move(req));
9654 }
9655 }
7c673cae
FG
9656 }
9657 pos += l;
9658 b_off += l;
9659 left -= l;
9660 b_len -= l;
9661 }
9662 ++lp;
9663 }
9f95a23c 9664}
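// Editor's note: a self-contained sketch (hypothetical names) of the
// chunk alignment performed in the "merge regions" step above: a
// sub-blob read [b_off, b_off+l) is widened to device chunk boundaries,
// remembering how far the wanted data sits from the widened start
// ("front") so the result can be pruned back later.
#include <cstdint>

struct AlignedRead {
  uint64_t r_off;  // chunk-aligned read offset
  uint64_t r_len;  // chunk-aligned read length
  uint64_t front;  // wanted data starts this far into the read
};

static AlignedRead align_to_chunk(uint64_t b_off, uint64_t l,
                                  uint64_t chunk_size)
{
  AlignedRead a{b_off, l, b_off % chunk_size};
  a.r_off -= a.front;              // round the start down
  a.r_len += a.front;
  if (uint64_t tail = a.r_len % chunk_size; tail)
    a.r_len += chunk_size - tail;  // round the length up
  return a;
}
// e.g. chunk_size=4096, b_off=5000, l=100
//      -> r_off=4096, r_len=4096, front=904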
7c673cae 9665
9f95a23c
TL
9666int BlueStore::_prepare_read_ioc(
9667 blobs2read_t& blobs2read,
9668 vector<bufferlist>* compressed_blob_bls,
9669 IOContext* ioc)
9670{
7c673cae 9671 for (auto& p : blobs2read) {
94b18763 9672 const BlobRef& bptr = p.first;
11fdf7f2 9673 regions2read_t& r2r = p.second;
7c673cae 9674 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9675 << " need " << r2r << std::dec << dendl;
7c673cae
FG
9676 if (bptr->get_blob().is_compressed()) {
9677 // read the whole thing
9f95a23c
TL
9678 if (compressed_blob_bls->empty()) {
9679 // ensure we avoid any reallocation on subsequent blobs
9680 compressed_blob_bls->reserve(blobs2read.size());
9681 }
9682 compressed_blob_bls->push_back(bufferlist());
9683 bufferlist& bl = compressed_blob_bls->back();
9684 auto r = bptr->get_blob().map(
9685 0, bptr->get_blob().get_ondisk_length(),
9686 [&](uint64_t offset, uint64_t length) {
9687 int r = bdev->aio_read(offset, length, &bl, ioc);
9688 if (r < 0)
7c673cae
FG
9689 return r;
9690 return 0;
9f95a23c 9691 });
b32b8144
FG
9692 if (r < 0) {
9693 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
9694 if (r == -EIO) {
9695 // propagate EIO to caller
9696 return r;
9697 }
11fdf7f2 9698 ceph_assert(r == 0);
b32b8144 9699 }
7c673cae
FG
9700 } else {
9701 // read the pieces
11fdf7f2 9702 for (auto& req : r2r) {
9f95a23c
TL
9703 dout(20) << __func__ << " region 0x" << std::hex
9704 << req.regs.front().logical_offset
9705 << ": 0x" << req.regs.front().blob_xoffset
9706 << " reading 0x" << req.r_off
9707 << "~" << req.r_len << std::dec
9708 << dendl;
7c673cae 9709
9f95a23c
TL
9710 // read it
9711 auto r = bptr->get_blob().map(
9712 req.r_off, req.r_len,
9713 [&](uint64_t offset, uint64_t length) {
9714 int r = bdev->aio_read(offset, length, &req.bl, ioc);
9715 if (r < 0)
7c673cae
FG
9716 return r;
9717 return 0;
9f95a23c 9718 });
b32b8144
FG
9719 if (r < 0) {
9720 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
9721 << dendl;
9722 if (r == -EIO) {
9723 // propagate EIO to caller
9724 return r;
9725 }
11fdf7f2 9726 ceph_assert(r == 0);
b32b8144 9727 }
9f95a23c 9728 ceph_assert(req.bl.length() == req.r_len);
7c673cae
FG
9729 }
9730 }
9731 }
9f95a23c
TL
9732 return 0;
9733}
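// Editor's note: a sketch of the map-with-callback pattern used above
// (hypothetical types -- the real Blob::map() walks bluestore_pextent_t
// entries): a blob-relative range is translated into one or more device
// extents and the lambda is invoked per extent, letting the caller
// queue one aio per physical fragment.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <vector>

struct DevExtent { uint64_t offset, length; };  // hypothetical

static int map_sketch(const std::vector<DevExtent>& extents,
                      uint64_t x_off, uint64_t x_len,
                      const std::function<int(uint64_t, uint64_t)>& f)
{
  for (const auto& e : extents) {
    if (x_len == 0)
      break;
    if (x_off >= e.length) {      // range starts past this extent
      x_off -= e.length;
      continue;
    }
    uint64_t l = std::min(x_len, e.length - x_off);
    if (int r = f(e.offset + x_off, l); r < 0)
      return r;                   // propagate the callback's error
    x_off = 0;
    x_len -= l;
  }
  return 0;
}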
11fdf7f2 9734
9f95a23c
TL
9735int BlueStore::_generate_read_result_bl(
9736 OnodeRef o,
9737 uint64_t offset,
9738 size_t length,
9739 ready_regions_t& ready_regions,
9740 vector<bufferlist>& compressed_blob_bls,
9741 blobs2read_t& blobs2read,
9742 bool buffered,
9743 bool* csum_error,
9744 bufferlist& bl)
9745{
9746 // enumerate and decompress desired blobs
7c673cae
FG
9747 auto p = compressed_blob_bls.begin();
9748 blobs2read_t::iterator b2r_it = blobs2read.begin();
9749 while (b2r_it != blobs2read.end()) {
94b18763 9750 const BlobRef& bptr = b2r_it->first;
11fdf7f2 9751 regions2read_t& r2r = b2r_it->second;
7c673cae 9752 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9753 << " need 0x" << r2r << std::dec << dendl;
7c673cae 9754 if (bptr->get_blob().is_compressed()) {
11fdf7f2 9755 ceph_assert(p != compressed_blob_bls.end());
7c673cae
FG
9756 bufferlist& compressed_bl = *p++;
9757 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
9f95a23c
TL
9758 r2r.front().regs.front().logical_offset) < 0) {
9759 *csum_error = true;
9760 return -EIO;
7c673cae
FG
9761 }
9762 bufferlist raw_bl;
9f95a23c 9763 auto r = _decompress(compressed_bl, &raw_bl);
7c673cae 9764 if (r < 0)
9f95a23c 9765 return r;
7c673cae 9766 if (buffered) {
9f95a23c
TL
9767 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
9768 raw_bl);
7c673cae 9769 }
11fdf7f2
TL
9770 for (auto& req : r2r) {
9771 for (auto& r : req.regs) {
9772 ready_regions[r.logical_offset].substr_of(
9773 raw_bl, r.blob_xoffset, r.length);
9774 }
7c673cae
FG
9775 }
9776 } else {
11fdf7f2 9777 for (auto& req : r2r) {
9f95a23c
TL
9778 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
9779 req.regs.front().logical_offset) < 0) {
9780 *csum_error = true;
9781 return -EIO;
9782 }
9783 if (buffered) {
9784 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
9785 req.r_off, req.bl);
9786 }
7c673cae 9787
9f95a23c
TL
9788 // prune and keep result
9789 for (const auto& r : req.regs) {
9790 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
11fdf7f2 9791 }
7c673cae
FG
9792 }
9793 }
9794 ++b2r_it;
9795 }
9796
9797 // generate a resulting buffer
9798 auto pr = ready_regions.begin();
9799 auto pr_end = ready_regions.end();
9f95a23c 9800 uint64_t pos = 0;
7c673cae
FG
9801 while (pos < length) {
9802 if (pr != pr_end && pr->first == pos + offset) {
9803 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
9804 << ": data from 0x" << pr->first << "~" << pr->second.length()
9805 << std::dec << dendl;
7c673cae
FG
9806 pos += pr->second.length();
9807 bl.claim_append(pr->second);
9808 ++pr;
9809 } else {
9810 uint64_t l = length - pos;
9811 if (pr != pr_end) {
11fdf7f2 9812 ceph_assert(pr->first > pos + offset);
9f95a23c 9813 l = pr->first - (pos + offset);
7c673cae
FG
9814 }
9815 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
9816 << ": zeros for 0x" << (pos + offset) << "~" << l
9817 << std::dec << dendl;
7c673cae
FG
9818 bl.append_zero(l);
9819 pos += l;
9820 }
9821 }
11fdf7f2
TL
9822 ceph_assert(bl.length() == length);
9823 ceph_assert(pos == length);
9824 ceph_assert(pr == pr_end);
9f95a23c
TL
9825 return 0;
9826}
9827
9828int BlueStore::_do_read(
9829 Collection *c,
9830 OnodeRef o,
9831 uint64_t offset,
9832 size_t length,
9833 bufferlist& bl,
9834 uint32_t op_flags,
9835 uint64_t retry_count)
9836{
9837 FUNCTRACE(cct);
9838 int r = 0;
9839 int read_cache_policy = 0; // do not bypass clean or dirty cache
9840
9841 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9842 << " size 0x" << o->onode.size << " (" << std::dec
9843 << o->onode.size << ")" << dendl;
9844 bl.clear();
9845
9846 if (offset >= o->onode.size) {
9847 return r;
9848 }
9849
9850 // generally, don't buffer anything, unless the client explicitly requests
9851 // it.
9852 bool buffered = false;
9853 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
9854 dout(20) << __func__ << " will do buffered read" << dendl;
9855 buffered = true;
9856 } else if (cct->_conf->bluestore_default_buffered_read &&
9857 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
9858 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
9859 dout(20) << __func__ << " defaulting to buffered read" << dendl;
9860 buffered = true;
9861 }
9862
9863 if (offset + length > o->onode.size) {
9864 length = o->onode.size - offset;
9865 }
9866
9867 auto start = mono_clock::now();
9868 o->extent_map.fault_range(db, offset, length);
9869 log_latency(__func__,
9870 l_bluestore_read_onode_meta_lat,
9871 mono_clock::now() - start,
9872 cct->_conf->bluestore_log_op_age);
9873 _dump_onode<30>(cct, *o);
9874
9875 // for deep-scrub, we only read dirty cache and bypass clean cache in
9876 // order to read underlying block device in case there are silent disk errors.
9877 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
9878 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
9879 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
9880 }
9881
9882 // build a blob-wise list of stuff to read (that isn't cached)
9883 ready_regions_t ready_regions;
9884 blobs2read_t blobs2read;
9885 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
9886
9887
9888 // read raw blob data.
9889 start = mono_clock::now(); // for the sake of simplicity,
9890 // measure the whole block below;
9891 // the error this introduces is small.
9892 vector<bufferlist> compressed_blob_bls;
9893 IOContext ioc(cct, NULL, true); // allow EIO
9894 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
9895 // we always issue aio for reading, so errors other than EIO are not allowed
9896 if (r < 0)
9897 return r;
9898
9899 int64_t num_ios = length;
9900 if (ioc.has_pending_aios()) {
9901 num_ios = -ioc.get_num_ios();
9902 bdev->aio_submit(&ioc);
9903 dout(20) << __func__ << " waiting for aio" << dendl;
9904 ioc.aio_wait();
9905 r = ioc.get_return_value();
9906 if (r < 0) {
9907 ceph_assert(r == -EIO); // no other errors allowed
9908 return -EIO;
9909 }
9910 }
9911 log_latency_fn(__func__,
9912 l_bluestore_read_wait_aio_lat,
9913 mono_clock::now() - start,
9914 cct->_conf->bluestore_log_op_age,
9915 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
9916 );
9917
9918 bool csum_error = false;
9919 r = _generate_read_result_bl(o, offset, length, ready_regions,
9920 compressed_blob_bls, blobs2read,
9921 buffered, &csum_error, bl);
9922 if (csum_error) {
9923 // Handles spurious read errors caused by a kernel bug.
9924 // We sometimes get all-zero pages as a result of the read under
9925 // high memory pressure. Retrying the failing read succeeds in most
9926 // cases.
9927 // See also: http://tracker.ceph.com/issues/22464
9928 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
9929 return -EIO;
9930 }
9931 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
9932 }
7c673cae 9933 r = bl.length();
f64942e4
AA
9934 if (retry_count) {
9935 logger->inc(l_bluestore_reads_with_retries);
9936 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
9937 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
9938 }
7c673cae
FG
9939 return r;
9940}
9941
9942int BlueStore::_verify_csum(OnodeRef& o,
9943 const bluestore_blob_t* blob, uint64_t blob_xoffset,
9944 const bufferlist& bl,
9945 uint64_t logical_offset) const
9946{
9947 int bad;
9948 uint64_t bad_csum;
11fdf7f2 9949 auto start = mono_clock::now();
7c673cae 9950 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
f64942e4
AA
9951 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
9952 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
9953 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
9954 bad = blob_xoffset;
9955 r = -1;
9956 bad_csum = 0xDEADBEEF;
9957 }
7c673cae
FG
9958 if (r < 0) {
9959 if (r == -1) {
9960 PExtentVector pex;
9961 blob->map(
9962 bad,
9963 blob->get_csum_chunk_size(),
9964 [&](uint64_t offset, uint64_t length) {
9965 pex.emplace_back(bluestore_pextent_t(offset, length));
9966 return 0;
9967 });
9968 derr << __func__ << " bad "
9969 << Checksummer::get_csum_type_string(blob->csum_type)
9970 << "/0x" << std::hex << blob->get_csum_chunk_size()
9971 << " checksum at blob offset 0x" << bad
9972 << ", got 0x" << bad_csum << ", expected 0x"
9973 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
9974 << ", device location " << pex
9975 << ", logical extent 0x" << std::hex
9976 << (logical_offset + bad - blob_xoffset) << "~"
9977 << blob->get_csum_chunk_size() << std::dec
9978 << ", object " << o->oid
9979 << dendl;
9980 } else {
9981 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
9982 }
9983 }
494da23a
TL
9984 log_latency(__func__,
9985 l_bluestore_csum_lat,
9986 mono_clock::now() - start,
9987 cct->_conf->bluestore_log_op_age);
11fdf7f2
TL
9988 if (cct->_conf->bluestore_ignore_data_csum) {
9989 return 0;
9990 }
7c673cae
FG
9991 return r;
9992}
9993
9994int BlueStore::_decompress(bufferlist& source, bufferlist* result)
9995{
9996 int r = 0;
11fdf7f2
TL
9997 auto start = mono_clock::now();
9998 auto i = source.cbegin();
7c673cae 9999 bluestore_compression_header_t chdr;
11fdf7f2 10000 decode(chdr, i);
7c673cae
FG
10001 int alg = int(chdr.type);
10002 CompressorRef cp = compressor;
10003 if (!cp || (int)cp->get_type() != alg) {
10004 cp = Compressor::create(cct, alg);
10005 }
10006
10007 if (!cp.get()) {
10008 // if compressor isn't available - error, because cannot return
10009 // decompressed data?
11fdf7f2
TL
10010
10011 const char* alg_name = Compressor::get_comp_alg_name(alg);
10012 derr << __func__ << " can't load decompressor " << alg_name << dendl;
10013 _set_compression_alert(false, alg_name);
7c673cae
FG
10014 r = -EIO;
10015 } else {
10016 r = cp->decompress(i, chdr.length, *result);
10017 if (r < 0) {
10018 derr << __func__ << " decompression failed with exit code " << r << dendl;
10019 r = -EIO;
10020 }
10021 }
494da23a
TL
10022 log_latency(__func__,
10023 l_bluestore_decompress_lat,
10024 mono_clock::now() - start,
10025 cct->_conf->bluestore_log_op_age);
7c673cae
FG
10026 return r;
10027}
10028
10029// this stores fiemap into interval_set, other variations
10030// use it internally
10031int BlueStore::_fiemap(
10032 CollectionHandle &c_,
10033 const ghobject_t& oid,
10034 uint64_t offset,
10035 size_t length,
10036 interval_set<uint64_t>& destset)
10037{
10038 Collection *c = static_cast<Collection *>(c_.get());
10039 if (!c->exists)
10040 return -ENOENT;
10041 {
9f95a23c 10042 std::shared_lock l(c->lock);
7c673cae
FG
10043
10044 OnodeRef o = c->get_onode(oid, false);
10045 if (!o || !o->exists) {
10046 return -ENOENT;
10047 }
81eedcae 10048 _dump_onode<30>(cct, *o);
7c673cae
FG
10049
10050 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10051 << " size 0x" << o->onode.size << std::dec << dendl;
10052
10053 boost::intrusive::set<Extent>::iterator ep, eend;
10054 if (offset >= o->onode.size)
10055 goto out;
10056
10057 if (offset + length > o->onode.size) {
10058 length = o->onode.size - offset;
10059 }
10060
10061 o->extent_map.fault_range(db, offset, length);
10062 eend = o->extent_map.extent_map.end();
10063 ep = o->extent_map.seek_lextent(offset);
10064 while (length > 0) {
10065 dout(20) << __func__ << " offset " << offset << dendl;
10066 if (ep != eend && ep->logical_offset + ep->length <= offset) {
10067 ++ep;
10068 continue;
10069 }
10070
10071 uint64_t x_len = length;
10072 if (ep != eend && ep->logical_offset <= offset) {
10073 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 10074 x_len = std::min(x_len, ep->length - x_off);
7c673cae
FG
10075 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
10076 << x_len << std::dec << " blob " << ep->blob << dendl;
10077 destset.insert(offset, x_len);
10078 length -= x_len;
10079 offset += x_len;
10080 if (x_off + x_len == ep->length)
10081 ++ep;
10082 continue;
10083 }
10084 if (ep != eend &&
10085 ep->logical_offset > offset &&
10086 ep->logical_offset - offset < x_len) {
10087 x_len = ep->logical_offset - offset;
10088 }
10089 offset += x_len;
10090 length -= x_len;
10091 }
10092 }
9f95a23c
TL
10093
10094 out:
10095 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10096 << " size = 0x(" << destset << ")" << std::dec << dendl;
10097 return 0;
10098}
10099
10100int BlueStore::fiemap(
10101 CollectionHandle &c_,
10102 const ghobject_t& oid,
10103 uint64_t offset,
10104 size_t length,
10105 bufferlist& bl)
10106{
10107 interval_set<uint64_t> m;
10108 int r = _fiemap(c_, oid, offset, length, m);
10109 if (r >= 0) {
10110 encode(m, bl);
10111 }
10112 return r;
10113}
10114
10115int BlueStore::fiemap(
10116 CollectionHandle &c_,
10117 const ghobject_t& oid,
10118 uint64_t offset,
10119 size_t length,
10120 map<uint64_t, uint64_t>& destmap)
10121{
10122 interval_set<uint64_t> m;
10123 int r = _fiemap(c_, oid, offset, length, m);
10124 if (r >= 0) {
10125 destmap = std::move(m).detach();
10126 }
10127 return r;
10128}
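// Editor's note: hypothetical usage of the map-returning fiemap()
// overload above ("store", "ch", "oid" and "obj_size" are
// placeholders). Logical holes simply do not appear as entries.
#include <cstdint>
#include <map>

uint64_t count_data_bytes_sketch(BlueStore* store,
                                 ObjectStore::CollectionHandle& ch,
                                 const ghobject_t& oid, uint64_t obj_size)
{
  std::map<uint64_t, uint64_t> extents;
  uint64_t data_bytes = 0;
  if (store->fiemap(ch, oid, 0, obj_size, extents) >= 0) {
    for (const auto& [off, len] : extents) {
      (void)off;          // [off, off+len) is backed by data;
      data_bytes += len;  // gaps between entries are holes
    }
  }
  return data_bytes;
}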
10129
10130int BlueStore::readv(
10131 CollectionHandle &c_,
10132 const ghobject_t& oid,
10133 interval_set<uint64_t>& m,
10134 bufferlist& bl,
10135 uint32_t op_flags)
10136{
10137 auto start = mono_clock::now();
10138 Collection *c = static_cast<Collection *>(c_.get());
10139 const coll_t &cid = c->get_cid();
10140 dout(15) << __func__ << " " << cid << " " << oid
10141 << " fiemap " << m
10142 << dendl;
10143 if (!c->exists)
10144 return -ENOENT;
10145
10146 bl.clear();
10147 int r;
10148 {
10149 std::shared_lock l(c->lock);
10150 auto start1 = mono_clock::now();
10151 OnodeRef o = c->get_onode(oid, false);
10152 log_latency("get_onode@read",
10153 l_bluestore_read_onode_meta_lat,
10154 mono_clock::now() - start1,
10155 cct->_conf->bluestore_log_op_age);
10156 if (!o || !o->exists) {
10157 r = -ENOENT;
10158 goto out;
10159 }
10160
10161 if (m.empty()) {
10162 r = 0;
10163 goto out;
10164 }
10165
10166 r = _do_readv(c, o, m, bl, op_flags);
10167 if (r == -EIO) {
10168 logger->inc(l_bluestore_read_eio);
10169 }
10170 }
10171
10172 out:
10173 if (r >= 0 && _debug_data_eio(oid)) {
10174 r = -EIO;
10175 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10176 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10177 cct->_conf->bluestore_debug_random_read_err &&
10178 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10179 100.0)) == 0) {
10180 dout(0) << __func__ << ": inject random EIO" << dendl;
10181 r = -EIO;
10182 }
10183 dout(10) << __func__ << " " << cid << " " << oid
10184 << " fiemap " << m << std::dec
10185 << " = " << r << dendl;
10186 log_latency(__func__,
10187 l_bluestore_read_lat,
10188 mono_clock::now() - start,
10189 cct->_conf->bluestore_log_op_age);
10190 return r;
10191}
10192
10193int BlueStore::_do_readv(
10194 Collection *c,
10195 OnodeRef o,
10196 const interval_set<uint64_t>& m,
10197 bufferlist& bl,
10198 uint32_t op_flags,
10199 uint64_t retry_count)
10200{
10201 FUNCTRACE(cct);
10202 int r = 0;
10203 int read_cache_policy = 0; // do not bypass clean or dirty cache
10204
10205 dout(20) << __func__ << " fiemap " << m << std::hex
10206 << " size 0x" << o->onode.size << " (" << std::dec
10207 << o->onode.size << ")" << dendl;
10208
10209 // generally, don't buffer anything, unless the client explicitly requests
10210 // it.
10211 bool buffered = false;
10212 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10213 dout(20) << __func__ << " will do buffered read" << dendl;
10214 buffered = true;
10215 } else if (cct->_conf->bluestore_default_buffered_read &&
10216 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10217 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10218 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10219 buffered = true;
10220 }
10221 // this method must be idempotent since we may call it several times
10222 // before we finally read the expected result.
10223 bl.clear();
10224
10225 // call fiemap first!
10226 ceph_assert(m.range_start() <= o->onode.size);
10227 ceph_assert(m.range_end() <= o->onode.size);
10228 auto start = mono_clock::now();
10229 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
10230 log_latency(__func__,
10231 l_bluestore_read_onode_meta_lat,
10232 mono_clock::now() - start,
10233 cct->_conf->bluestore_log_op_age);
10234 _dump_onode<30>(cct, *o);
10235
10236 IOContext ioc(cct, NULL, true); // allow EIO
10237 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
10238 raw_results.reserve(m.num_intervals());
10239 int i = 0;
10240 for (auto p = m.begin(); p != m.end(); p++, i++) {
10241 raw_results.push_back({});
10242 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
10243 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
10244 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
10245 // we always issue aio for reading, so errors other than EIO are not allowed
10246 if (r < 0)
10247 return r;
10248 }
10249
10250 auto num_ios = m.size();
10251 if (ioc.has_pending_aios()) {
10252 num_ios = ioc.get_num_ios();
10253 bdev->aio_submit(&ioc);
10254 dout(20) << __func__ << " waiting for aio" << dendl;
10255 ioc.aio_wait();
10256 r = ioc.get_return_value();
10257 if (r < 0) {
10258 ceph_assert(r == -EIO); // no other errors allowed
10259 return -EIO;
10260 }
10261 }
10262 log_latency_fn(__func__,
10263 l_bluestore_read_wait_aio_lat,
10264 mono_clock::now() - start,
10265 cct->_conf->bluestore_log_op_age,
10266 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10267 );
10268
10269 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
10270 i = 0;
10271 for (auto p = m.begin(); p != m.end(); p++, i++) {
10272 bool csum_error = false;
10273 bufferlist t;
10274 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
10275 std::get<0>(raw_results[i]),
10276 std::get<1>(raw_results[i]),
10277 std::get<2>(raw_results[i]),
10278 buffered, &csum_error, t);
10279 if (csum_error) {
10280 // Handles spurious read errors caused by a kernel bug.
10281 // We sometimes get all-zero pages as a result of the read under
10282 // high memory pressure. Retrying the failing read succeeds in most
10283 // cases.
10284 // See also: http://tracker.ceph.com/issues/22464
10285 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10286 return -EIO;
10287 }
10288 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
10289 }
10290 bl.claim_append(t);
10291 }
10292 if (retry_count) {
10293 logger->inc(l_bluestore_reads_with_retries);
10294 dout(5) << __func__ << " read fiemap " << m
10295 << " failed " << retry_count << " times before succeeding"
10296 << dendl;
10297 }
10298 return bl.length();
7c673cae
FG
10299}
10300
9f95a23c 10301int BlueStore::dump_onode(CollectionHandle &c_,
7c673cae 10302 const ghobject_t& oid,
9f95a23c
TL
10303 const string& section_name,
10304 Formatter *f)
7c673cae 10305{
9f95a23c
TL
10306 Collection *c = static_cast<Collection *>(c_.get());
10307 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10308 if (!c->exists)
10309 return -ENOENT;
7c673cae 10310
9f95a23c
TL
10311 int r;
10312 {
10313 std::shared_lock l(c->lock);
10314
10315 OnodeRef o = c->get_onode(oid, false);
10316 if (!o || !o->exists) {
10317 r = -ENOENT;
10318 goto out;
10319 }
10320 // FIXME minor: the next line isn't actually enough to
10321 // load shared blobs; leaving as-is for now.
10322 //
10323 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10324
10325 _dump_onode<0>(cct, *o);
10326 f->open_object_section(section_name.c_str());
10327 o->dump(f);
10328 f->close_section();
10329 r = 0;
7c673cae 10330 }
9f95a23c
TL
10331 out:
10332 dout(10) << __func__ << " " << c->cid << " " << oid
10333 << " = " << r << dendl;
7c673cae
FG
10334 return r;
10335}
10336
7c673cae
FG
10337int BlueStore::getattr(
10338 CollectionHandle &c_,
10339 const ghobject_t& oid,
10340 const char *name,
10341 bufferptr& value)
10342{
10343 Collection *c = static_cast<Collection *>(c_.get());
10344 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
10345 if (!c->exists)
10346 return -ENOENT;
10347
10348 int r;
10349 {
9f95a23c 10350 std::shared_lock l(c->lock);
31f18b77 10351 mempool::bluestore_cache_other::string k(name);
7c673cae
FG
10352
10353 OnodeRef o = c->get_onode(oid, false);
10354 if (!o || !o->exists) {
10355 r = -ENOENT;
10356 goto out;
10357 }
10358
10359 if (!o->onode.attrs.count(k)) {
10360 r = -ENODATA;
10361 goto out;
10362 }
10363 value = o->onode.attrs[k];
10364 r = 0;
10365 }
10366 out:
7c673cae
FG
10367 if (r == 0 && _debug_mdata_eio(oid)) {
10368 r = -EIO;
10369 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10370 }
10371 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
10372 << " = " << r << dendl;
10373 return r;
10374}
10375
7c673cae
FG
10376int BlueStore::getattrs(
10377 CollectionHandle &c_,
10378 const ghobject_t& oid,
10379 map<string,bufferptr>& aset)
10380{
10381 Collection *c = static_cast<Collection *>(c_.get());
10382 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10383 if (!c->exists)
10384 return -ENOENT;
10385
10386 int r;
10387 {
9f95a23c 10388 std::shared_lock l(c->lock);
7c673cae
FG
10389
10390 OnodeRef o = c->get_onode(oid, false);
10391 if (!o || !o->exists) {
10392 r = -ENOENT;
10393 goto out;
10394 }
10395 for (auto& i : o->onode.attrs) {
10396 aset.emplace(i.first.c_str(), i.second);
10397 }
10398 r = 0;
10399 }
10400
10401 out:
7c673cae
FG
10402 if (r == 0 && _debug_mdata_eio(oid)) {
10403 r = -EIO;
10404 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10405 }
10406 dout(10) << __func__ << " " << c->cid << " " << oid
10407 << " = " << r << dendl;
10408 return r;
10409}
10410
10411int BlueStore::list_collections(vector<coll_t>& ls)
10412{
9f95a23c 10413 std::shared_lock l(coll_lock);
11fdf7f2 10414 ls.reserve(coll_map.size());
7c673cae
FG
10415 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
10416 p != coll_map.end();
10417 ++p)
10418 ls.push_back(p->first);
10419 return 0;
10420}
10421
10422bool BlueStore::collection_exists(const coll_t& c)
10423{
9f95a23c 10424 std::shared_lock l(coll_lock);
7c673cae
FG
10425 return coll_map.count(c);
10426}
10427
11fdf7f2 10428int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 10429{
11fdf7f2 10430 dout(15) << __func__ << " " << ch->cid << dendl;
7c673cae
FG
10431 vector<ghobject_t> ls;
10432 ghobject_t next;
11fdf7f2 10433 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
7c673cae
FG
10434 &ls, &next);
10435 if (r < 0) {
10436 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
10437 << dendl;
10438 return r;
10439 }
10440 *empty = ls.empty();
11fdf7f2 10441 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
7c673cae
FG
10442 return 0;
10443}
10444
11fdf7f2 10445int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 10446{
11fdf7f2
TL
10447 dout(15) << __func__ << " " << ch->cid << dendl;
10448 Collection *c = static_cast<Collection*>(ch.get());
9f95a23c 10449 std::shared_lock l(c->lock);
11fdf7f2 10450 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
7c673cae
FG
10451 return c->cnode.bits;
10452}
10453
7c673cae
FG
10454int BlueStore::collection_list(
10455 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10456 vector<ghobject_t> *ls, ghobject_t *pnext)
10457{
10458 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 10459 c->flush();
7c673cae
FG
10460 dout(15) << __func__ << " " << c->cid
10461 << " start " << start << " end " << end << " max " << max << dendl;
10462 int r;
10463 {
9f95a23c 10464 std::shared_lock l(c->lock);
7c673cae
FG
10465 r = _collection_list(c, start, end, max, ls, pnext);
10466 }
10467
7c673cae
FG
10468 dout(10) << __func__ << " " << c->cid
10469 << " start " << start << " end " << end << " max " << max
10470 << " = " << r << ", ls.size() = " << ls->size()
10471 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10472 return r;
10473}
10474
10475int BlueStore::_collection_list(
10476 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
10477 vector<ghobject_t> *ls, ghobject_t *pnext)
10478{
10479
10480 if (!c->exists)
10481 return -ENOENT;
10482
494da23a 10483 auto start_time = mono_clock::now();
7c673cae
FG
10484 int r = 0;
10485 ghobject_t static_next;
10486 KeyValueDB::Iterator it;
10487 string temp_start_key, temp_end_key;
10488 string start_key, end_key;
10489 bool set_next = false;
10490 string pend;
10491 bool temp;
10492
10493 if (!pnext)
10494 pnext = &static_next;
10495
11fdf7f2 10496 if (start.is_max() || start.hobj.is_max()) {
7c673cae
FG
10497 goto out;
10498 }
10499 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
10500 &start_key, &end_key);
10501 dout(20) << __func__
10502 << " range " << pretty_binary_string(temp_start_key)
10503 << " to " << pretty_binary_string(temp_end_key)
10504 << " and " << pretty_binary_string(start_key)
10505 << " to " << pretty_binary_string(end_key)
10506 << " start " << start << dendl;
10507 it = db->get_iterator(PREFIX_OBJ);
10508 if (start == ghobject_t() ||
10509 start.hobj == hobject_t() ||
10510 start == c->cid.get_min_hobj()) {
10511 it->upper_bound(temp_start_key);
10512 temp = true;
10513 } else {
10514 string k;
10515 get_object_key(cct, start, &k);
10516 if (start.hobj.is_temp()) {
10517 temp = true;
11fdf7f2 10518 ceph_assert(k >= temp_start_key && k < temp_end_key);
7c673cae
FG
10519 } else {
10520 temp = false;
11fdf7f2 10521 ceph_assert(k >= start_key && k < end_key);
7c673cae 10522 }
11fdf7f2 10523 dout(20) << __func__ << " start from " << pretty_binary_string(k)
7c673cae
FG
10524 << " temp=" << (int)temp << dendl;
10525 it->lower_bound(k);
10526 }
10527 if (end.hobj.is_max()) {
10528 pend = temp ? temp_end_key : end_key;
10529 } else {
10530 get_object_key(cct, end, &end_key);
10531 if (end.hobj.is_temp()) {
10532 if (temp)
10533 pend = end_key;
10534 else
10535 goto out;
10536 } else {
10537 pend = temp ? temp_end_key : end_key;
10538 }
10539 }
10540 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
10541 while (true) {
10542 if (!it->valid() || it->key() >= pend) {
10543 if (!it->valid())
10544 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
10545 else
10546 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
10547 << " >= " << end << dendl;
10548 if (temp) {
10549 if (end.hobj.is_temp()) {
10550 break;
10551 }
10552 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
10553 temp = false;
10554 it->upper_bound(start_key);
10555 pend = end_key;
10556 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
10557 continue;
10558 }
10559 break;
10560 }
10561 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
10562 if (is_extent_shard_key(it->key())) {
10563 it->next();
10564 continue;
10565 }
10566 ghobject_t oid;
10567 int r = get_key_object(it->key(), &oid);
11fdf7f2 10568 ceph_assert(r == 0);
7c673cae
FG
10569 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
10570 if (ls->size() >= (unsigned)max) {
10571 dout(20) << __func__ << " reached max " << max << dendl;
10572 *pnext = oid;
10573 set_next = true;
10574 break;
10575 }
10576 ls->push_back(oid);
10577 it->next();
10578 }
10579out:
10580 if (!set_next) {
10581 *pnext = ghobject_t::get_max();
10582 }
10583 log_latency_fn(
10584 __func__,
10585 l_bluestore_clist_lat,
10586 mono_clock::now() - start_time,
10587 cct->_conf->bluestore_log_collection_list_age,
10588 [&] (const ceph::timespan& lat) {
10589 ostringstream ostr;
10590 ostr << ", lat = " << timespan_str(lat)
10591 << " cid =" << c->cid
10592 << " start " << start << " end " << end
10593 << " max " << max;
10594 return ostr.str();
10595 }
10596 );
10597 return r;
10598}
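// Illustrative only (not part of the build): a hedged sketch of how a
// caller pages through a collection with the *pnext cursor maintained
// above. The handle `ch`, the `store` pointer and the batch size of 100
// are assumptions for the example.
//
//   ghobject_t cursor;  // default-constructed: start of the collection
//   while (!cursor.is_max()) {
//     vector<ghobject_t> batch;
//     ghobject_t next;
//     int r = store->collection_list(ch, cursor, ghobject_t::get_max(),
//                                    100, &batch, &next);
//     if (r < 0)
//       break;                 // error; batch may be partially filled
//     for (auto& oid : batch) {
//       // ... process oid ...
//     }
//     cursor = next;           // becomes ghobject_t::get_max() at the end
//   }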
10599
10600int BlueStore::omap_get(
10601 CollectionHandle &c_, ///< [in] Collection containing oid
10602 const ghobject_t &oid, ///< [in] Object containing omap
10603 bufferlist *header, ///< [out] omap header
10604  map<string, bufferlist> *out ///< [out] Key to value map
10605 )
10606{
10607 Collection *c = static_cast<Collection *>(c_.get());
10608 return _omap_get(c, oid, header, out);
10609}
10610
10611int BlueStore::_omap_get(
10612 Collection *c, ///< [in] Collection containing oid
10613 const ghobject_t &oid, ///< [in] Object containing omap
10614 bufferlist *header, ///< [out] omap header
10615  map<string, bufferlist> *out ///< [out] Key to value map
10616 )
10617{
10618 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10619 if (!c->exists)
10620 return -ENOENT;
9f95a23c 10621 std::shared_lock l(c->lock);
10622 int r = 0;
10623 OnodeRef o = c->get_onode(oid, false);
10624 if (!o || !o->exists) {
10625 r = -ENOENT;
10626 goto out;
10627 }
10628 r = _onode_omap_get(o, header, out);
10629 out:
10630 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10631 << dendl;
10632 return r;
10633}
10634
10635int BlueStore::_onode_omap_get(
10636 const OnodeRef &o, ///< [in] Object containing omap
10637 bufferlist *header, ///< [out] omap header
10638  map<string, bufferlist> *out ///< [out] Key to value map
10639)
10640{
10641 int r = 0;
10642 if (!o || !o->exists) {
10643 r = -ENOENT;
10644 goto out;
10645 }
10646 if (!o->onode.has_omap())
10647 goto out;
10648 o->flush();
10649 {
9f95a23c 10650 const string& prefix = o->get_omap_prefix();
11fdf7f2 10651 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10652 string head, tail;
10653 o->get_omap_header(&head);
10654 o->get_omap_tail(&tail);
10655 it->lower_bound(head);
10656 while (it->valid()) {
10657 if (it->key() == head) {
10658 dout(30) << __func__ << " got header" << dendl;
10659 *header = it->value();
7c673cae 10660 } else if (it->key() >= tail) {
10661 dout(30) << __func__ << " reached tail" << dendl;
10662 break;
7c673cae 10663 } else {
10664 string user_key;
10665 o->decode_omap_key(it->key(), &user_key);
10666 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
10667 << " -> " << user_key << dendl;
10668 (*out)[user_key] = it->value();
10669 }
10670 it->next();
10671 }
10672 }
9f95a23c 10673out:
10674 return r;
10675}
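// Key layout assumed by the scan above (illustrative): within the onode's
// omap prefix, the header key sorts before every user key and the tail
// key sorts after them all, so one lower_bound(head) pass yields the
// header followed by all user keys in order:
//
//   <prefix> | header(o)     -> omap header bufferlist (it->key() == head)
//   <prefix> | key(o, "k1")  -> value for "k1"
//   <prefix> | key(o, "k2")  -> value for "k2"
//   <prefix> | tail(o)       -> loop stops here (it->key() >= tail)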
10676
10677int BlueStore::omap_get_header(
10678 CollectionHandle &c_, ///< [in] Collection containing oid
10679 const ghobject_t &oid, ///< [in] Object containing omap
10680 bufferlist *header, ///< [out] omap header
10681 bool allow_eio ///< [in] don't assert on eio
10682 )
10683{
10684 Collection *c = static_cast<Collection *>(c_.get());
10685 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10686 if (!c->exists)
10687 return -ENOENT;
9f95a23c 10688 std::shared_lock l(c->lock);
10689 int r = 0;
10690 OnodeRef o = c->get_onode(oid, false);
10691 if (!o || !o->exists) {
10692 r = -ENOENT;
10693 goto out;
10694 }
10695 if (!o->onode.has_omap())
10696 goto out;
10697 o->flush();
10698 {
10699 string head;
10700 o->get_omap_header(&head);
10701 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
10702 dout(30) << __func__ << " got header" << dendl;
10703 } else {
10704 dout(30) << __func__ << " no header" << dendl;
10705 }
10706 }
10707 out:
10708 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10709 << dendl;
10710 return r;
10711}
10712
10713int BlueStore::omap_get_keys(
10714 CollectionHandle &c_, ///< [in] Collection containing oid
10715 const ghobject_t &oid, ///< [in] Object containing omap
10716 set<string> *keys ///< [out] Keys defined on oid
10717 )
10718{
10719 Collection *c = static_cast<Collection *>(c_.get());
10720 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10721 if (!c->exists)
10722 return -ENOENT;
9f95a23c 10723 std::shared_lock l(c->lock);
10724 int r = 0;
10725 OnodeRef o = c->get_onode(oid, false);
10726 if (!o || !o->exists) {
10727 r = -ENOENT;
10728 goto out;
10729 }
10730 if (!o->onode.has_omap())
10731 goto out;
10732 o->flush();
10733 {
9f95a23c 10734 const string& prefix = o->get_omap_prefix();
11fdf7f2 10735 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10736 string head, tail;
10737 o->get_omap_key(string(), &head);
10738 o->get_omap_tail(&tail);
10739 it->lower_bound(head);
10740 while (it->valid()) {
10741 if (it->key() >= tail) {
10742 dout(30) << __func__ << " reached tail" << dendl;
10743 break;
10744 }
10745 string user_key;
9f95a23c 10746 o->decode_omap_key(it->key(), &user_key);
11fdf7f2 10747 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
10748 << " -> " << user_key << dendl;
10749 keys->insert(user_key);
10750 it->next();
10751 }
10752 }
10753 out:
10754 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10755 << dendl;
10756 return r;
10757}
10758
10759int BlueStore::omap_get_values(
10760 CollectionHandle &c_, ///< [in] Collection containing oid
10761 const ghobject_t &oid, ///< [in] Object containing omap
10762 const set<string> &keys, ///< [in] Keys to get
10763 map<string, bufferlist> *out ///< [out] Returned keys and values
10764 )
10765{
10766 Collection *c = static_cast<Collection *>(c_.get());
10767 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10768 if (!c->exists)
10769 return -ENOENT;
9f95a23c 10770 std::shared_lock l(c->lock);
10771 int r = 0;
10772 string final_key;
10773 OnodeRef o = c->get_onode(oid, false);
10774 if (!o || !o->exists) {
10775 r = -ENOENT;
10776 goto out;
10777 }
9f95a23c 10778 if (!o->onode.has_omap()) {
7c673cae 10779 goto out;
10780 }
10781 o->flush();
11fdf7f2 10782 {
10783 const string& prefix = o->get_omap_prefix();
10784 o->get_omap_key(string(), &final_key);
10785 size_t base_key_len = final_key.size();
11fdf7f2 10786 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 10787 final_key.resize(base_key_len); // keep prefix
10788 final_key += *p;
10789 bufferlist val;
10790 if (db->get(prefix, final_key, &val) >= 0) {
10791 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
10792 << " -> " << *p << dendl;
10793 out->insert(make_pair(*p, val));
10794 }
10795 }
10796 }
10797 out:
10798 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10799 << dendl;
10800 return r;
10801}
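// Sketch of the final_key reuse above (illustrative): the object's omap
// key prefix is generated once, and each lookup truncates back to it
// before appending the next user key, so no per-key prefix rebuild:
//
//   const string& prefix = o->get_omap_prefix();
//   string final_key;
//   o->get_omap_key(string(), &final_key);  // final_key = <object prefix>
//   size_t base_key_len = final_key.size();
//   final_key.resize(base_key_len);         // drop previous user key
//   final_key += "foo";                     // <object prefix> + "foo"
//   db->get(prefix, final_key, &val);       // point lookup, no iterator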
10802
10803#ifdef WITH_SEASTAR
10804int BlueStore::omap_get_values(
10805 CollectionHandle &c_, ///< [in] Collection containing oid
10806 const ghobject_t &oid, ///< [in] Object containing omap
10807 const std::optional<string> &start_after, ///< [in] Keys to get
10808 map<string, bufferlist> *output ///< [out] Returned keys and values
10809 )
10810{
10811 Collection *c = static_cast<Collection *>(c_.get());
10812 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10813 if (!c->exists)
10814 return -ENOENT;
10815 std::shared_lock l(c->lock);
10816 int r = 0;
10817 OnodeRef o = c->get_onode(oid, false);
10818 if (!o || !o->exists) {
10819 r = -ENOENT;
10820 goto out;
10821 }
10822 if (!o->onode.has_omap()) {
10823 goto out;
10824 }
10825 o->flush();
10826 {
10827 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
10828 if (!iter) {
10829 r = -ENOENT;
10830 goto out;
10831 }
10832 iter->upper_bound(*start_after);
10833 for (; iter->valid(); iter->next()) {
10834 output->insert(make_pair(iter->key(), iter->value()));
10835 }
10836 }
10837
10838out:
10839 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10840 << dendl;
10841 return r;
10842}
10843#endif
10844
10845int BlueStore::omap_check_keys(
10846 CollectionHandle &c_, ///< [in] Collection containing oid
10847 const ghobject_t &oid, ///< [in] Object containing omap
10848 const set<string> &keys, ///< [in] Keys to check
10849 set<string> *out ///< [out] Subset of keys defined on oid
10850 )
10851{
10852 Collection *c = static_cast<Collection *>(c_.get());
10853 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10854 if (!c->exists)
10855 return -ENOENT;
9f95a23c 10856 std::shared_lock l(c->lock);
10857 int r = 0;
10858 string final_key;
10859 OnodeRef o = c->get_onode(oid, false);
10860 if (!o || !o->exists) {
10861 r = -ENOENT;
10862 goto out;
10863 }
9f95a23c 10864 if (!o->onode.has_omap()) {
7c673cae 10865 goto out;
10866 }
10867 o->flush();
11fdf7f2 10868 {
10869 const string& prefix = o->get_omap_prefix();
10870 o->get_omap_key(string(), &final_key);
10871 size_t base_key_len = final_key.size();
11fdf7f2 10872 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 10873 final_key.resize(base_key_len); // keep prefix
10874 final_key += *p;
10875 bufferlist val;
10876 if (db->get(prefix, final_key, &val) >= 0) {
10877 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
10878 << " -> " << *p << dendl;
10879 out->insert(*p);
10880 } else {
10881 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
10882 << " -> " << *p << dendl;
10883 }
10884 }
10885 }
10886 out:
10887 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10888 << dendl;
10889 return r;
10890}
10891
10892ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
10893 CollectionHandle &c_, ///< [in] collection
10894 const ghobject_t &oid ///< [in] object
10895 )
10896{
10897 Collection *c = static_cast<Collection *>(c_.get());
10898 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10899 if (!c->exists) {
10900 return ObjectMap::ObjectMapIterator();
10901 }
9f95a23c 10902 std::shared_lock l(c->lock);
10903 OnodeRef o = c->get_onode(oid, false);
10904 if (!o || !o->exists) {
10905     dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
10906     return ObjectMap::ObjectMapIterator();
10907   }
10908   o->flush();
10909   dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
9f95a23c 10910 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
10911 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
10912}
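// Illustrative usage of the iterator returned above (hedged; error
// handling elided, `store`, `ch` and `oid` assumed to exist):
//
//   ObjectMap::ObjectMapIterator it = store->get_omap_iterator(ch, oid);
//   if (it) {
//     for (it->seek_to_first(); it->valid(); it->next()) {
//       // it->key() is the decoded user key, it->value() the bufferlist
//     }
//   }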
10913
10914// -----------------
10915// write helpers
10916
10917uint64_t BlueStore::_get_ondisk_reserved() const {
10918 return round_up_to(
10919 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
10920}
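// Worked example (values illustrative): with SUPER_RESERVED = 8192 and
// min_alloc_size = 0x1000 (4K), max(8192, 4096) = 8192, already 4K-aligned,
// so 8192 bytes stay reserved. With min_alloc_size = 0x10000 (the 64K HDD
// default in this release), max() = 0x10000 and exactly one 64K allocation
// unit is reserved.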
10921
10922void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
10923{
10924 dout(10) << __func__ << " ondisk_format " << ondisk_format
10925 << " min_compat_ondisk_format " << min_compat_ondisk_format
10926 << dendl;
11fdf7f2 10927 ceph_assert(ondisk_format == latest_ondisk_format);
10928 {
10929 bufferlist bl;
11fdf7f2 10930 encode(ondisk_format, bl);
10931 t->set(PREFIX_SUPER, "ondisk_format", bl);
10932 }
10933 {
10934 bufferlist bl;
11fdf7f2 10935 encode(min_compat_ondisk_format, bl);
10936 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
10937 }
10938}
10939
10940int BlueStore::_open_super_meta()
10941{
10942 // nid
10943 {
10944 nid_max = 0;
10945 bufferlist bl;
10946 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 10947 auto p = bl.cbegin();
10948 try {
10949 uint64_t v;
11fdf7f2 10950 decode(v, p);
10951 nid_max = v;
10952 } catch (buffer::error& e) {
10953 derr << __func__ << " unable to read nid_max" << dendl;
10954 return -EIO;
10955 }
10956 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
10957 nid_last = nid_max.load();
10958 }
10959
10960 // blobid
10961 {
10962 blobid_max = 0;
10963 bufferlist bl;
10964 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 10965 auto p = bl.cbegin();
10966 try {
10967 uint64_t v;
11fdf7f2 10968 decode(v, p);
10969 blobid_max = v;
10970 } catch (buffer::error& e) {
10971 derr << __func__ << " unable to read blobid_max" << dendl;
10972 return -EIO;
10973 }
10974 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
10975 blobid_last = blobid_max.load();
10976 }
10977
10978 // freelist
10979 {
10980 bufferlist bl;
10981 db->get(PREFIX_SUPER, "freelist_type", &bl);
10982 if (bl.length()) {
10983 freelist_type = std::string(bl.c_str(), bl.length());
10984 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
10985 } else {
11fdf7f2 10986      ceph_abort_msg("unsupported freelist manager type");
7c673cae 10987 }
10988 }
10989
10990 // ondisk format
10991 int32_t compat_ondisk_format = 0;
10992 {
10993 bufferlist bl;
10994 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
10995 if (r < 0) {
10996 // base case: kraken bluestore is v1 and readable by v1
10997 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
10998 << dendl;
10999 ondisk_format = 1;
11000 compat_ondisk_format = 1;
11001 } else {
11fdf7f2 11002 auto p = bl.cbegin();
7c673cae 11003 try {
11fdf7f2 11004 decode(ondisk_format, p);
11005 } catch (buffer::error& e) {
11006 derr << __func__ << " unable to read ondisk_format" << dendl;
11007 return -EIO;
11008 }
11009 bl.clear();
11010 {
11011 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11012 ceph_assert(!r);
11013 auto p = bl.cbegin();
7c673cae 11014 try {
11fdf7f2 11015 decode(compat_ondisk_format, p);
11016 } catch (buffer::error& e) {
11017 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
11018 return -EIO;
11019 }
11020 }
11021 }
11022 dout(10) << __func__ << " ondisk_format " << ondisk_format
11023 << " compat_ondisk_format " << compat_ondisk_format
11024 << dendl;
11025 }
11026
11027 if (latest_ondisk_format < compat_ondisk_format) {
11028 derr << __func__ << " compat_ondisk_format is "
11029 << compat_ondisk_format << " but we only understand version "
11030 << latest_ondisk_format << dendl;
11031 return -EPERM;
11032 }
11033
11034 {
11035 bufferlist bl;
11036 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 11037 auto p = bl.cbegin();
11038 try {
11039 uint64_t val;
11fdf7f2 11040 decode(val, p);
7c673cae 11041 min_alloc_size = val;
224ce89b 11042 min_alloc_size_order = ctz(val);
11fdf7f2 11043 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
11044 } catch (buffer::error& e) {
11045 derr << __func__ << " unable to read min_alloc_size" << dendl;
11046 return -EIO;
11047 }
11048 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11049 << std::dec << dendl;
11050 }
11051
11052 _set_per_pool_omap();
11053
224ce89b 11054 _open_statfs();
11055 _set_alloc_sizes();
11056 _set_throttle_params();
11057
11058 _set_csum();
11059 _set_compression();
11060 _set_blob_size();
11061
11fdf7f2 11062 _validate_bdev();
11063 return 0;
11064}
11065
11066int BlueStore::_upgrade_super()
11067{
11068 dout(1) << __func__ << " from " << ondisk_format << ", latest "
11069 << latest_ondisk_format << dendl;
11070 if (ondisk_format < latest_ondisk_format) {
11071 ceph_assert(ondisk_format > 0);
11072 ceph_assert(ondisk_format < latest_ondisk_format);
11073
1911f103 11074 KeyValueDB::Transaction t = db->get_transaction();
11075 if (ondisk_format == 1) {
11076 // changes:
11077 // - super: added ondisk_format
11078 // - super: added min_readable_ondisk_format
11079 // - super: added min_compat_ondisk_format
11080 // - super: added min_alloc_size
11081 // - super: removed min_min_alloc_size
11082 {
11083 bufferlist bl;
11084 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
11085 auto p = bl.cbegin();
11086 try {
11087 uint64_t val;
11088 decode(val, p);
11089 min_alloc_size = val;
11090 } catch (buffer::error& e) {
11091 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
11092 return -EIO;
11093 }
11094 t->set(PREFIX_SUPER, "min_alloc_size", bl);
11095 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 11096 }
11fdf7f2 11097 ondisk_format = 2;
7c673cae 11098 }
11099 if (ondisk_format == 2) {
11100 // changes:
11101 // - onode has FLAG_PER_POOL_OMAP. Note that we do not know that *all*
11102       //   onodes are using the per-pool prefix until a repair is run; at that
11103 // point the per_pool_omap=1 key will be set.
11104 // - super: added per_pool_omap key, which indicates that *all* objects
11105 // are using the new prefix and key format
11106 ondisk_format = 3;
11107 }
11108 if (ondisk_format == 3) {
11109 // changes:
11110 // - FreelistManager keeps meta within bdev label
11111 int r = _write_out_fm_meta(0);
9f95a23c 11112 ceph_assert(r == 0);
1911f103 11113 ondisk_format = 4;
9f95a23c 11114 }
11115     // This must be the last operation
11116 _prepare_ondisk_format_super(t);
11117 int r = db->submit_transaction_sync(t);
11118 ceph_assert(r == 0);
7c673cae 11119 }
11120 // done
11121 dout(1) << __func__ << " done" << dendl;
11122 return 0;
11123}
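// The blocks above form an upgrade ladder, not a switch: each
// "if (ondisk_format == N)" step migrates N -> N+1 and falls through to
// the next test, so a store several releases behind walks 1 -> 2 -> 3 -> 4
// in a single pass, and the final format is persisted once via
// _prepare_ondisk_format_super() before the sync commit.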
11124
11125void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
11126{
224ce89b 11127 if (o->onode.nid) {
11fdf7f2 11128 ceph_assert(o->exists);
7c673cae 11129 return;
224ce89b 11130 }
11131 uint64_t nid = ++nid_last;
11132 dout(20) << __func__ << " " << nid << dendl;
11133 o->onode.nid = nid;
11134 txc->last_nid = nid;
224ce89b 11135 o->exists = true;
11136}
11137
11138uint64_t BlueStore::_assign_blobid(TransContext *txc)
11139{
11140 uint64_t bid = ++blobid_last;
11141 dout(20) << __func__ << " " << bid << dendl;
11142 txc->last_blobid = bid;
11143 return bid;
11144}
11145
11146void BlueStore::get_db_statistics(Formatter *f)
11147{
11148 db->get_statistics(f);
11149}
11150
11151BlueStore::TransContext *BlueStore::_txc_create(
11152 Collection *c, OpSequencer *osr,
11153 list<Context*> *on_commits)
7c673cae 11154{
11fdf7f2 11155 TransContext *txc = new TransContext(cct, c, osr, on_commits);
11156 txc->t = db->get_transaction();
11157 osr->queue_new(txc);
11158 dout(20) << __func__ << " osr " << osr << " = " << txc
11159 << " seq " << txc->seq << dendl;
11160 return txc;
11161}
11162
11163void BlueStore::_txc_calc_cost(TransContext *txc)
11164{
11165 // one "io" for the kv commit
11166 auto ios = 1 + txc->ioc.get_num_ios();
11167 auto cost = throttle_cost_per_io.load();
11168 txc->cost = ios * cost + txc->bytes;
9f95a23c 11169 txc->ios = ios;
11170 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
11171 << ios << " ios * " << cost << " + " << txc->bytes
11172 << " bytes)" << dendl;
11173}
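// Worked example (illustrative numbers, using the HDD default
// bluestore_throttle_cost_per_io = 670000): a txc with 3 pending aios and
// 16384 dirty bytes costs
//   (3 + 1) * 670000 + 16384 = 2696384
// throttle units; the +1 io accounts for the final kv commit.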
11174
11175void BlueStore::_txc_update_store_statfs(TransContext *txc)
11176{
11177 if (txc->statfs_delta.is_empty())
11178 return;
11179
11180 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
11181 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
11182 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
11183 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
11184 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
11185
11186 bufferlist bl;
11187 txc->statfs_delta.encode(bl);
11188 if (per_pool_stat_collection) {
11189 string key;
11190 get_pool_stat_key(txc->osd_pool_id, &key);
11191 txc->t->merge(PREFIX_STAT, key, bl);
11192
11193 std::lock_guard l(vstatfs_lock);
11194 auto& stats = osd_pools[txc->osd_pool_id];
11195 stats += txc->statfs_delta;
11196
11197 vstatfs += txc->statfs_delta; //non-persistent in this mode
11198
11199 } else {
11200 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
7c673cae 11201
11202 std::lock_guard l(vstatfs_lock);
11203 vstatfs += txc->statfs_delta;
11204 }
11205 txc->statfs_delta.reset();
11206}
11207
11208void BlueStore::_txc_state_proc(TransContext *txc)
11209{
11210 while (true) {
11211 dout(10) << __func__ << " txc " << txc
11212 << " " << txc->get_state_name() << dendl;
11213 switch (txc->state) {
11214 case TransContext::STATE_PREPARE:
9f95a23c 11215 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
11216 if (txc->ioc.has_pending_aios()) {
11217 txc->state = TransContext::STATE_AIO_WAIT;
11218 txc->had_ios = true;
11219 _txc_aio_submit(txc);
11220 return;
11221 }
11222 // ** fall-thru **
11223
11224 case TransContext::STATE_AIO_WAIT:
11fdf7f2 11225 {
11226 mono_clock::duration lat = throttle.log_state_latency(
11227 *txc, logger, l_bluestore_state_aio_wait_lat);
11228 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11229 dout(0) << __func__ << " slow aio_wait, txc = " << txc
11230 << ", latency = " << lat
11231 << dendl;
11232 }
11233 }
11234
11235 _txc_finish_io(txc); // may trigger blocked txc's too
11236 return;
11237
11238 case TransContext::STATE_IO_DONE:
11fdf7f2 11239 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
11240 if (txc->had_ios) {
11241 ++txc->osr->txc_with_unstable_io;
11242 }
9f95a23c 11243 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
11244 txc->state = TransContext::STATE_KV_QUEUED;
11245 if (cct->_conf->bluestore_sync_submit_transaction) {
11246 if (txc->last_nid >= nid_max ||
11247 txc->last_blobid >= blobid_max) {
11248 dout(20) << __func__
11249 << " last_{nid,blobid} exceeds max, submit via kv thread"
11250 << dendl;
11251 } else if (txc->osr->kv_committing_serially) {
11252 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
11253 << dendl;
11254 // note: this is starvation-prone. once we have a txc in a busy
11255 // sequencer that is committing serially it is possible to keep
11256 // submitting new transactions fast enough that we get stuck doing
11257 // so. the alternative is to block here... fixme?
11258 } else if (txc->osr->txc_with_unstable_io) {
11259 dout(20) << __func__ << " prior txc(s) with unstable ios "
11260 << txc->osr->txc_with_unstable_io.load() << dendl;
11261 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
11262 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
11263 == 0) {
11264 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
11265 << dendl;
11266 } else {
9f95a23c 11267 _txc_apply_kv(txc, true);
11268 }
11269 }
11270 {
11fdf7f2 11271 std::lock_guard l(kv_lock);
7c673cae 11272 kv_queue.push_back(txc);
11273 if (!kv_sync_in_progress) {
11274 kv_sync_in_progress = true;
11275 kv_cond.notify_one();
11276 }
11277 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
11278 kv_queue_unsubmitted.push_back(txc);
11279 ++txc->osr->kv_committing_serially;
11280 }
11281 if (txc->had_ios)
11282 kv_ios++;
11283 kv_throttle_costs += txc->cost;
11284 }
11285 return;
11286 case TransContext::STATE_KV_SUBMITTED:
11287 _txc_committed_kv(txc);
11288 // ** fall-thru **
11289
11290 case TransContext::STATE_KV_DONE:
9f95a23c 11291 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
11292 if (txc->deferred_txn) {
11293 txc->state = TransContext::STATE_DEFERRED_QUEUED;
11294 _deferred_queue(txc);
11295 return;
11296 }
11297 txc->state = TransContext::STATE_FINISHING;
11298 break;
11299
11300 case TransContext::STATE_DEFERRED_CLEANUP:
9f95a23c 11301 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
11302 txc->state = TransContext::STATE_FINISHING;
11303 // ** fall-thru **
11304
11305 case TransContext::STATE_FINISHING:
9f95a23c 11306 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
11307 _txc_finish(txc);
11308 return;
11309
11310 default:
11311 derr << __func__ << " unexpected txc " << txc
11312 << " state " << txc->get_state_name() << dendl;
11fdf7f2 11313 ceph_abort_msg("unexpected txc state");
11314 return;
11315 }
11316 }
11317}
11318
11319void BlueStore::_txc_finish_io(TransContext *txc)
11320{
11321 dout(20) << __func__ << " " << txc << dendl;
11322
11323 /*
11324 * we need to preserve the order of kv transactions,
11325 * even though aio will complete in any order.
11326 */
11327
11328 OpSequencer *osr = txc->osr.get();
11fdf7f2 11329 std::lock_guard l(osr->qlock);
7c673cae 11330 txc->state = TransContext::STATE_IO_DONE;
11fdf7f2 11331 txc->ioc.release_running_aios();
11332 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
11333 while (p != osr->q.begin()) {
11334 --p;
11335 if (p->state < TransContext::STATE_IO_DONE) {
11336 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
11337 << p->get_state_name() << dendl;
11338 return;
11339 }
11340 if (p->state > TransContext::STATE_IO_DONE) {
11341 ++p;
11342 break;
11343 }
11344 }
11345 do {
11346 _txc_state_proc(&*p++);
11347 } while (p != osr->q.end() &&
11348 p->state == TransContext::STATE_IO_DONE);
11349
11fdf7f2 11350 if (osr->kv_submitted_waiters) {
11351 osr->qcond.notify_all();
11352 }
11353}
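// Ordering scenario for the backward scan above (illustrative): with the
// sequencer queue [t1, t2, t3] and t3's aio finishing first, t3 is marked
// IO_DONE but not advanced, because t1/t2 are still below IO_DONE. When
// t1's io later completes, the scan stops at the queue head and t1 (plus
// any immediately following IO_DONE txcs) proceeds, so kv submission
// order always matches queue order even though aio completion does not.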
11354
11355void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
11356{
11357 dout(20) << __func__ << " txc " << txc
11358 << " onodes " << txc->onodes
11359 << " shared_blobs " << txc->shared_blobs
11360 << dendl;
11361
11362 // finalize onodes
11363 for (auto o : txc->onodes) {
11fdf7f2 11364 _record_onode(o, t);
11365 o->flushing_count++;
11366 }
11367
11368 // objects we modified but didn't affect the onode
11369 auto p = txc->modified_objects.begin();
11370 while (p != txc->modified_objects.end()) {
11371 if (txc->onodes.count(*p) == 0) {
11372 (*p)->flushing_count++;
11373 ++p;
11374 } else {
11375 // remove dups with onodes list to avoid problems in _txc_finish
11376 p = txc->modified_objects.erase(p);
11377 }
11378 }
11379
11380 // finalize shared_blobs
11381 for (auto sb : txc->shared_blobs) {
11382 string key;
11383 auto sbid = sb->get_sbid();
11384 get_shared_blob_key(sbid, &key);
11385 if (sb->persistent->empty()) {
11386 dout(20) << __func__ << " shared_blob 0x"
11387 << std::hex << sbid << std::dec
11388 << " is empty" << dendl;
11389 t->rmkey(PREFIX_SHARED_BLOB, key);
11390 } else {
11391 bufferlist bl;
11392 encode(*(sb->persistent), bl);
11393 dout(20) << __func__ << " shared_blob 0x"
11394 << std::hex << sbid << std::dec
31f18b77 11395 << " is " << bl.length() << " " << *sb << dendl;
11396 t->set(PREFIX_SHARED_BLOB, key, bl);
11397 }
11398 }
11399}
11400
11401void BlueStore::BSPerfTracker::update_from_perfcounters(
11402 PerfCounters &logger)
11403{
11404 os_commit_latency_ns.consume_next(
11405 logger.get_tavg_ns(
7c673cae 11406 l_bluestore_commit_lat));
11407 os_apply_latency_ns.consume_next(
11408 logger.get_tavg_ns(
11409 l_bluestore_commit_lat));
11410}
11411
11412void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
11413{
11414 dout(20) << __func__ << " txc " << txc << std::hex
11415 << " allocated 0x" << txc->allocated
11416 << " released 0x" << txc->released
11417 << std::dec << dendl;
11418
11419 // We have to handle the case where we allocate *and* deallocate the
11420 // same region in this transaction. The freelist doesn't like that.
11421 // (Actually, the only thing that cares is the BitmapFreelistManager
11422 // debug check. But that's important.)
11423 interval_set<uint64_t> tmp_allocated, tmp_released;
11424 interval_set<uint64_t> *pallocated = &txc->allocated;
11425 interval_set<uint64_t> *preleased = &txc->released;
11426 if (!txc->allocated.empty() && !txc->released.empty()) {
11427 interval_set<uint64_t> overlap;
11428 overlap.intersection_of(txc->allocated, txc->released);
11429 if (!overlap.empty()) {
11430 tmp_allocated = txc->allocated;
11431 tmp_allocated.subtract(overlap);
11432 tmp_released = txc->released;
11433 tmp_released.subtract(overlap);
11434 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
11435 << ", new allocated 0x" << tmp_allocated
11436 << " released 0x" << tmp_released << std::dec
11437 << dendl;
11438 pallocated = &tmp_allocated;
11439 preleased = &tmp_released;
11440 }
11441 }
11442
11443 // update freelist with non-overlap sets
11444 for (interval_set<uint64_t>::iterator p = pallocated->begin();
11445 p != pallocated->end();
11446 ++p) {
11447 fm->allocate(p.get_start(), p.get_len(), t);
11448 }
11449 for (interval_set<uint64_t>::iterator p = preleased->begin();
11450 p != preleased->end();
11451 ++p) {
11452 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
11453 << "~" << p.get_len() << std::dec << dendl;
11454 fm->release(p.get_start(), p.get_len(), t);
11455 }
11456
11457 _txc_update_store_statfs(txc);
11458}
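// Worked example of the overlap handling above (illustrative extents):
//   allocated = [0x0000, 0x10000)   released = [0x8000, 0x18000)
//   overlap   = [0x8000, 0x10000)
// After subtracting the overlap from both sides, the freelist sees only
//   allocate 0x0000~0x8000   and   release 0x10000~0x8000
// so a region both allocated and freed within one txc never reaches fm.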
11459
9f95a23c 11460void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
7c673cae 11461{
11462 ceph_assert(txc->state == TransContext::STATE_KV_QUEUED);
11463 {
11464#if defined(WITH_LTTNG)
11465 auto start = mono_clock::now();
11466#endif
11467
11468 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11469 ceph_assert(r == 0);
11470 txc->state = TransContext::STATE_KV_SUBMITTED;
11471 if (txc->osr->kv_submitted_waiters) {
11472 std::lock_guard l(txc->osr->qlock);
11473 txc->osr->qcond.notify_all();
11474 }
11475
11476#if defined(WITH_LTTNG)
11477 if (txc->tracing) {
11478 tracepoint(
11479 bluestore,
11480 transaction_kv_submit_latency,
11481 txc->osr->get_sequencer_id(),
11482 txc->seq,
11483 sync_submit_transaction,
11484 ceph::to_seconds<double>(mono_clock::now() - start));
11485 }
11486#endif
11487 }
11488
11489 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
11490 for (auto& o : *ls) {
11491 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
11492 << dendl;
9f95a23c 11493 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11fdf7f2 11494 std::lock_guard l(o->flush_lock);
7c673cae
FG
11495 o->flush_cond.notify_all();
11496 }
11497 }
11498 }
11499}
11500
11501void BlueStore::_txc_committed_kv(TransContext *txc)
11502{
11503 dout(20) << __func__ << " txc " << txc << dendl;
9f95a23c 11504 throttle.complete_kv(*txc);
1adf2230 11505 {
11fdf7f2 11506 std::lock_guard l(txc->osr->qlock);
1adf2230 11507 txc->state = TransContext::STATE_KV_DONE;
11508 if (txc->ch->commit_queue) {
11509 txc->ch->commit_queue->queue(txc->oncommits);
11510 } else {
11511 finisher.queue(txc->oncommits);
1adf2230 11512 }
7c673cae 11513 }
9f95a23c 11514 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
11515 log_latency_fn(
11516 __func__,
11517 l_bluestore_commit_lat,
9f95a23c 11518 mono_clock::now() - txc->start,
11519 cct->_conf->bluestore_log_op_age,
11520 [&](auto lat) {
11521 return ", txc = " + stringify(txc);
11522 }
11fdf7f2 11523 );
11524}
11525
11526void BlueStore::_txc_finish(TransContext *txc)
11527{
11528 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
11fdf7f2 11529 ceph_assert(txc->state == TransContext::STATE_FINISHING);
11530
11531 for (auto& sb : txc->shared_blobs_written) {
f64942e4 11532 sb->finish_write(txc->seq);
11533 }
11534 txc->shared_blobs_written.clear();
11535
11536 while (!txc->removed_collections.empty()) {
11537 _queue_reap_collection(txc->removed_collections.front());
11538 txc->removed_collections.pop_front();
11539 }
11540
11541 OpSequencerRef osr = txc->osr;
7c673cae 11542 bool empty = false;
31f18b77 11543 bool submit_deferred = false;
11544 OpSequencer::q_list_t releasing_txc;
11545 {
11fdf7f2 11546 std::lock_guard l(osr->qlock);
11547 txc->state = TransContext::STATE_DONE;
11548 bool notify = false;
11549 while (!osr->q.empty()) {
11550 TransContext *txc = &osr->q.front();
11551 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
11552 << dendl;
11553 if (txc->state != TransContext::STATE_DONE) {
11554 if (txc->state == TransContext::STATE_PREPARE &&
11555 deferred_aggressive) {
11556 // for _osr_drain_preceding()
11557 notify = true;
11558 }
31f18b77 11559 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 11560 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
11561 submit_deferred = true;
11562 }
11563 break;
11564 }
11565
11566 osr->q.pop_front();
11567 releasing_txc.push_back(*txc);
7c673cae 11568 }
9f95a23c 11569
11570 if (osr->q.empty()) {
11571 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
11572 empty = true;
11573 }
11574
11575 // only drain()/drain_preceding() need wakeup,
11576 // other cases use kv_submitted_waiters
11577 if (notify || empty) {
11578 osr->qcond.notify_all();
11579 }
7c673cae 11580 }
9f95a23c 11581
11582 while (!releasing_txc.empty()) {
11583 // release to allocator only after all preceding txc's have also
11584 // finished any deferred writes that potentially land in these
11585 // blocks
11586 auto txc = &releasing_txc.front();
11587 _txc_release_alloc(txc);
11588 releasing_txc.pop_front();
11589 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
11590 throttle.complete(*txc);
11591 delete txc;
11592 }
11593
11594 if (submit_deferred) {
11595 // we're pinning memory; flush! we could be more fine-grained here but
11596 // i'm not sure it's worth the bother.
11597 deferred_try_submit();
11598 }
11599
7c673cae 11600 if (empty && osr->zombie) {
11601 std::lock_guard l(zombie_osr_lock);
11602 if (zombie_osr_set.erase(osr->cid)) {
11603 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11604 } else {
11605 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
11606 << dendl;
11607 }
7c673cae 11608 }
9f95a23c 11609}
11610
11611void BlueStore::_txc_release_alloc(TransContext *txc)
11612{
a8e16298 11613 // it's expected we're called with lazy_release_lock already taken!
11614 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
11615 int r = 0;
11616 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
11617 r = bdev->queue_discard(txc->released);
11618 if (r == 0) {
11619 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
11620 << txc->released << std::dec << dendl;
11621 goto out;
11622 }
11623 } else if (cct->_conf->bdev_enable_discard) {
11624 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
11625 bdev->discard(p.get_start(), p.get_len());
11626 }
11627 }
11628 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
94b18763 11629 << txc->released << std::dec << dendl;
11fdf7f2 11630 alloc->release(txc->released);
11631 }
11632
11fdf7f2 11633out:
11634 txc->allocated.clear();
11635 txc->released.clear();
11636}
11637
11638void BlueStore::_osr_attach(Collection *c)
11639{
11640 // note: caller has RWLock on coll_map
11641 auto q = coll_map.find(c->cid);
11642 if (q != coll_map.end()) {
11643 c->osr = q->second->osr;
11644 ldout(cct, 10) << __func__ << " " << c->cid
11645 << " reusing osr " << c->osr << " from existing coll "
11646 << q->second << dendl;
11647 } else {
11648 std::lock_guard l(zombie_osr_lock);
11649 auto p = zombie_osr_set.find(c->cid);
11650 if (p == zombie_osr_set.end()) {
9f95a23c 11651 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
11652 ldout(cct, 10) << __func__ << " " << c->cid
11653 << " fresh osr " << c->osr << dendl;
11654 } else {
11655 c->osr = p->second;
11656 zombie_osr_set.erase(p);
11657 ldout(cct, 10) << __func__ << " " << c->cid
11658 << " resurrecting zombie osr " << c->osr << dendl;
11659 c->osr->zombie = false;
11660 }
11661 }
11662}
11663
11664void BlueStore::_osr_register_zombie(OpSequencer *osr)
11665{
11666 std::lock_guard l(zombie_osr_lock);
11667 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
11668 osr->zombie = true;
11669 auto i = zombie_osr_set.emplace(osr->cid, osr);
11670 // this is either a new insertion or the same osr is already there
11671 ceph_assert(i.second || i.first->second == osr);
11672}
11673
11674void BlueStore::_osr_drain_preceding(TransContext *txc)
11675{
11676 OpSequencer *osr = txc->osr.get();
11677 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
11678 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11679 {
11680 // submit anything pending
224ce89b 11681 deferred_lock.lock();
11fdf7f2 11682 if (osr->deferred_pending && !osr->deferred_running) {
11683 _deferred_submit_unlock(osr);
11684 } else {
11685 deferred_lock.unlock();
11686 }
11687 }
11688 {
11689 // wake up any previously finished deferred events
11fdf7f2 11690 std::lock_guard l(kv_lock);
11691 if (!kv_sync_in_progress) {
11692 kv_sync_in_progress = true;
11693 kv_cond.notify_one();
11694 }
11695 }
11696 osr->drain_preceding(txc);
11697 --deferred_aggressive;
11698 dout(10) << __func__ << " " << osr << " done" << dendl;
11699}
11700
11701void BlueStore::_osr_drain(OpSequencer *osr)
11702{
11703 dout(10) << __func__ << " " << osr << dendl;
11704 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11705 {
11706 // submit anything pending
11707 deferred_lock.lock();
11708 if (osr->deferred_pending && !osr->deferred_running) {
11709 _deferred_submit_unlock(osr);
11710 } else {
11711 deferred_lock.unlock();
11712 }
11713 }
11714 {
11715 // wake up any previously finished deferred events
11716 std::lock_guard l(kv_lock);
11717 if (!kv_sync_in_progress) {
11718 kv_sync_in_progress = true;
11719 kv_cond.notify_one();
11720 }
11721 }
11722 osr->drain();
11723 --deferred_aggressive;
11724 dout(10) << __func__ << " " << osr << " done" << dendl;
11725}
11726
11727void BlueStore::_osr_drain_all()
11728{
11729 dout(10) << __func__ << dendl;
11730
11731 set<OpSequencerRef> s;
11732 vector<OpSequencerRef> zombies;
11733 {
9f95a23c 11734 std::shared_lock l(coll_lock);
11735 for (auto& i : coll_map) {
11736 s.insert(i.second->osr);
11737 }
11738 }
7c673cae 11739 {
11740 std::lock_guard l(zombie_osr_lock);
11741 for (auto& i : zombie_osr_set) {
11742 s.insert(i.second);
11743 zombies.push_back(i.second);
11744 }
11745 }
11746 dout(20) << __func__ << " osr_set " << s << dendl;
11747
11748 ++deferred_aggressive;
11749 {
11750 // submit anything pending
224ce89b 11751 deferred_try_submit();
11752 }
11753 {
11754 // wake up any previously finished deferred events
11fdf7f2 11755 std::lock_guard l(kv_lock);
11756 kv_cond.notify_one();
11757 }
31f18b77 11758 {
11fdf7f2 11759 std::lock_guard l(kv_finalize_lock);
11760 kv_finalize_cond.notify_one();
11761 }
11762 for (auto osr : s) {
11763 dout(20) << __func__ << " drain " << osr << dendl;
11764 osr->drain();
11765 }
11766 --deferred_aggressive;
11767
7c673cae 11768 {
11769 std::lock_guard l(zombie_osr_lock);
11770 for (auto& osr : zombies) {
11771 if (zombie_osr_set.erase(osr->cid)) {
11772 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11773 ceph_assert(osr->q.empty());
11774 } else if (osr->zombie) {
11775 dout(10) << __func__ << " empty zombie osr " << osr
11776 << " already reaped" << dendl;
11777 ceph_assert(osr->q.empty());
11778 } else {
11779 dout(10) << __func__ << " empty zombie osr " << osr
11780 << " resurrected" << dendl;
11781 }
11782 }
11783 }
11784
11785 dout(10) << __func__ << " done" << dendl;
11786}
11787
11fdf7f2 11788
11789void BlueStore::_kv_start()
11790{
11791 dout(10) << __func__ << dendl;
11792
11fdf7f2 11793 finisher.start();
11794 kv_sync_thread.create("bstore_kv_sync");
11795 kv_finalize_thread.create("bstore_kv_final");
11796}
11797
11798void BlueStore::_kv_stop()
11799{
11800 dout(10) << __func__ << dendl;
11801 {
9f95a23c 11802 std::unique_lock l{kv_lock};
11803 while (!kv_sync_started) {
11804 kv_cond.wait(l);
11805 }
11806 kv_stop = true;
11807 kv_cond.notify_all();
11808 }
11809 {
9f95a23c 11810 std::unique_lock l{kv_finalize_lock};
11811 while (!kv_finalize_started) {
11812 kv_finalize_cond.wait(l);
11813 }
11814 kv_finalize_stop = true;
11815 kv_finalize_cond.notify_all();
11816 }
11817 kv_sync_thread.join();
11818 kv_finalize_thread.join();
11fdf7f2 11819 ceph_assert(removed_collections.empty());
31f18b77 11820 {
11fdf7f2 11821 std::lock_guard l(kv_lock);
11822 kv_stop = false;
11823 }
11824 {
11fdf7f2 11825 std::lock_guard l(kv_finalize_lock);
11826 kv_finalize_stop = false;
11827 }
11828 dout(10) << __func__ << " stopping finishers" << dendl;
11829 finisher.wait_for_empty();
11830 finisher.stop();
11831 dout(10) << __func__ << " stopped" << dendl;
11832}
11833
11834void BlueStore::_kv_sync_thread()
11835{
11836 dout(10) << __func__ << " start" << dendl;
11fdf7f2 11837 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
9f95a23c 11838 std::unique_lock l{kv_lock};
11fdf7f2 11839 ceph_assert(!kv_sync_started);
11840 kv_sync_started = true;
11841 kv_cond.notify_all();
7c673cae 11842 while (true) {
11fdf7f2 11843 ceph_assert(kv_committing.empty());
11844 if (kv_queue.empty() &&
11845 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 11846 !deferred_aggressive)) {
11847 if (kv_stop)
11848 break;
11849 dout(20) << __func__ << " sleep" << dendl;
9f95a23c 11850 kv_sync_in_progress = false;
11fdf7f2 11851 kv_cond.wait(l);
11852 dout(20) << __func__ << " wake" << dendl;
11853 } else {
11854 deque<TransContext*> kv_submitting;
11855 deque<DeferredBatch*> deferred_done, deferred_stable;
11856 uint64_t aios = 0, costs = 0;
11857
11858 dout(20) << __func__ << " committing " << kv_queue.size()
11859 << " submitting " << kv_queue_unsubmitted.size()
11860 << " deferred done " << deferred_done_queue.size()
11861 << " stable " << deferred_stable_queue.size()
11862 << dendl;
11863 kv_committing.swap(kv_queue);
11864 kv_submitting.swap(kv_queue_unsubmitted);
11865 deferred_done.swap(deferred_done_queue);
11866 deferred_stable.swap(deferred_stable_queue);
11867 aios = kv_ios;
11868 costs = kv_throttle_costs;
11869 kv_ios = 0;
11870 kv_throttle_costs = 0;
11871 l.unlock();
11872
11873 dout(30) << __func__ << " committing " << kv_committing << dendl;
11874 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
11875 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
11876 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
11877
11878 auto start = mono_clock::now();
11879
11880 bool force_flush = false;
11881 // if bluefs is sharing the same device as data (only), then we
11882 // can rely on the bluefs commit to flush the device and make
11883 // deferred aios stable. that means that if we do have done deferred
11884 // txcs AND we are not on a single device, we need to force a flush.
9f95a23c 11885 if (bluefs && bluefs_layout.single_shared_device()) {
31f18b77 11886 if (aios) {
7c673cae 11887 force_flush = true;
11fdf7f2 11888 } else if (kv_committing.empty() && deferred_stable.empty()) {
11889 force_flush = true; // there's nothing else to commit!
11890 } else if (deferred_aggressive) {
11891 force_flush = true;
11892 }
11893 } else {
11894 if (aios || !deferred_done.empty()) {
11895 force_flush = true;
11896 } else {
11897 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
11898 }
11899 }
11900
11901 if (force_flush) {
31f18b77 11902 dout(20) << __func__ << " num_aios=" << aios
11903 << " force_flush=" << (int)force_flush
11904 << ", flushing, deferred done->stable" << dendl;
11905 // flush/barrier on block device
11906 bdev->flush();
11907
11908 // if we flush then deferred done are now deferred stable
11909 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
11910 deferred_done.end());
11911 deferred_done.clear();
11912 }
11fdf7f2 11913 auto after_flush = mono_clock::now();
11914
11915 // we will use one final transaction to force a sync
11916 KeyValueDB::Transaction synct = db->get_transaction();
11917
11918 // increase {nid,blobid}_max? note that this covers both the
11919 // case where we are approaching the max and the case we passed
11920 // it. in either case, we increase the max in the earlier txn
11921 // we submit.
11922 uint64_t new_nid_max = 0, new_blobid_max = 0;
11923 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
11924 KeyValueDB::Transaction t =
11925 kv_submitting.empty() ? synct : kv_submitting.front()->t;
11926 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
11927 bufferlist bl;
11fdf7f2 11928 encode(new_nid_max, bl);
11929 t->set(PREFIX_SUPER, "nid_max", bl);
11930 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
11931 }
11932 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
11933 KeyValueDB::Transaction t =
11934 kv_submitting.empty() ? synct : kv_submitting.front()->t;
11935 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
11936 bufferlist bl;
11fdf7f2 11937 encode(new_blobid_max, bl);
11938 t->set(PREFIX_SUPER, "blobid_max", bl);
11939 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
11940 }
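      // Worked example of the prealloc trigger above (illustrative values,
      // bluestore_nid_prealloc = 1024; blobids work the same way): with
      // nid_max = 2048 and nid_last = 1600, 1600 + 512 > 2048, so this
      // cycle persists nid_max = 1600 + 1024 = 2624 in the earliest
      // transaction it submits, while _assign_nid() keeps handing out ids
      // from the in-memory nid_last.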
11941
11942 for (auto txc : kv_committing) {
9f95a23c 11943 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
c07f9fc5 11944 if (txc->state == TransContext::STATE_KV_QUEUED) {
9f95a23c 11945 _txc_apply_kv(txc, false);
c07f9fc5 11946 --txc->osr->kv_committing_serially;
c07f9fc5 11947 } else {
11fdf7f2 11948 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
7c673cae 11949 }
11950 if (txc->had_ios) {
11951 --txc->osr->txc_with_unstable_io;
11952 }
11953 }
11954
11955 // release throttle *before* we commit. this allows new ops
11956 // to be prepared and enter pipeline while we are waiting on
11957 // the kv commit sync/flush. then hopefully on the next
11958 // iteration there will already be ops awake. otherwise, we
11959 // end up going to sleep, and then wake up when the very first
11960 // transaction is ready for commit.
9f95a23c 11961 throttle.release_kv_throttle(costs);
31f18b77 11962
11963 if (bluefs &&
11964 after_flush - bluefs_last_balance >
11fdf7f2 11965 ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
7c673cae 11966 bluefs_last_balance = after_flush;
11967 int r = _balance_bluefs_freespace();
11968 ceph_assert(r >= 0);
11969 }
11970
11971 // cleanup sync deferred keys
11972 for (auto b : deferred_stable) {
11973 for (auto& txc : b->txcs) {
11974 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 11975 ceph_assert(wt.released.empty()); // only kraken did this
11976 string key;
11977 get_deferred_key(wt.seq, &key);
11978 synct->rm_single_key(PREFIX_DEFERRED, key);
11979 }
11980 }
11981
11982#if defined(WITH_LTTNG)
11983 auto sync_start = mono_clock::now();
11984#endif
7c673cae 11985 // submit synct synchronously (block and wait for it to commit)
31f18b77 11986 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
11987 ceph_assert(r == 0);
11988
11989 int committing_size = kv_committing.size();
11990 int deferred_size = deferred_stable.size();
11991
11992#if defined(WITH_LTTNG)
11993 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
11994 for (auto txc: kv_committing) {
11995 if (txc->tracing) {
11996 tracepoint(
11997 bluestore,
11998 transaction_kv_sync_latency,
11999 txc->osr->get_sequencer_id(),
12000 txc->seq,
12001 kv_committing.size(),
12002 deferred_done.size(),
12003 deferred_stable.size(),
12004 sync_latency);
12005 }
12006 }
12007#endif
12008
11fdf7f2 12009 {
9f95a23c 12010 std::unique_lock m{kv_finalize_lock};
12011 if (kv_committing_to_finalize.empty()) {
12012 kv_committing_to_finalize.swap(kv_committing);
12013 } else {
12014 kv_committing_to_finalize.insert(
12015 kv_committing_to_finalize.end(),
12016 kv_committing.begin(),
12017 kv_committing.end());
12018 kv_committing.clear();
12019 }
12020 if (deferred_stable_to_finalize.empty()) {
12021 deferred_stable_to_finalize.swap(deferred_stable);
12022 } else {
12023 deferred_stable_to_finalize.insert(
12024 deferred_stable_to_finalize.end(),
12025 deferred_stable.begin(),
12026 deferred_stable.end());
12027 deferred_stable.clear();
12028 }
12029 if (!kv_finalize_in_progress) {
12030 kv_finalize_in_progress = true;
12031 kv_finalize_cond.notify_one();
12032 }
11fdf7f2 12033 }
12034
12035 if (new_nid_max) {
12036 nid_max = new_nid_max;
12037 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
12038 }
12039 if (new_blobid_max) {
12040 blobid_max = new_blobid_max;
12041 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
12042 }
12043
224ce89b 12044 {
12045 auto finish = mono_clock::now();
12046 ceph::timespan dur_flush = after_flush - start;
12047 ceph::timespan dur_kv = finish - after_flush;
12048 ceph::timespan dur = finish - start;
12049 dout(20) << __func__ << " committed " << committing_size
12050 << " cleaned " << deferred_size
12051 << " in " << dur
12052 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
12053 << dendl;
12054 log_latency("kv_flush",
12055 l_bluestore_kv_flush_lat,
12056 dur_flush,
12057 cct->_conf->bluestore_log_op_age);
12058 log_latency("kv_commit",
12059 l_bluestore_kv_commit_lat,
12060 dur_kv,
12061 cct->_conf->bluestore_log_op_age);
12062 log_latency("kv_sync",
12063 l_bluestore_kv_sync_lat,
12064 dur,
12065 cct->_conf->bluestore_log_op_age);
7c673cae 12066 }
12067
12068 if (bluefs) {
12069 if (!bluefs_extents_reclaiming.empty()) {
12070 dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
12071 << bluefs_extents_reclaiming << std::dec << dendl;
12072 int r = 0;
12073 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
12074 r = bdev->queue_discard(bluefs_extents_reclaiming);
12075 if (r == 0) {
12076 goto clear;
12077 }
12078 } else if (cct->_conf->bdev_enable_discard) {
12079 for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
12080 bdev->discard(p.get_start(), p.get_len());
12081 }
12082 }
12083
11fdf7f2 12084 alloc->release(bluefs_extents_reclaiming);
81eedcae 12085clear:
11fdf7f2 12086 bluefs_extents_reclaiming.clear();
31f18b77 12087 }
12088 }
12089
12090 l.lock();
12091 // previously deferred "done" are now "stable" by virtue of this
12092 // commit cycle.
12093 deferred_stable_queue.swap(deferred_done);
12094 }
12095 }
12096 dout(10) << __func__ << " finish" << dendl;
12097 kv_sync_started = false;
12098}
12099
12100void BlueStore::_kv_finalize_thread()
12101{
12102 deque<TransContext*> kv_committed;
12103 deque<DeferredBatch*> deferred_stable;
12104 dout(10) << __func__ << " start" << dendl;
12105 std::unique_lock l(kv_finalize_lock);
12106 ceph_assert(!kv_finalize_started);
12107 kv_finalize_started = true;
12108 kv_finalize_cond.notify_all();
12109 while (true) {
12110 ceph_assert(kv_committed.empty());
12111 ceph_assert(deferred_stable.empty());
12112 if (kv_committing_to_finalize.empty() &&
12113 deferred_stable_to_finalize.empty()) {
12114 if (kv_finalize_stop)
12115 break;
12116 dout(20) << __func__ << " sleep" << dendl;
9f95a23c 12117 kv_finalize_in_progress = false;
12118 kv_finalize_cond.wait(l);
12119 dout(20) << __func__ << " wake" << dendl;
12120 } else {
12121 kv_committed.swap(kv_committing_to_finalize);
12122 deferred_stable.swap(deferred_stable_to_finalize);
12123 l.unlock();
12124 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
12125 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
12126
12127 auto start = mono_clock::now();
12128
12129 while (!kv_committed.empty()) {
12130 TransContext *txc = kv_committed.front();
11fdf7f2 12131 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
7c673cae 12132 _txc_state_proc(txc);
31f18b77 12133 kv_committed.pop_front();
7c673cae 12134 }
31f18b77 12135
12136 for (auto b : deferred_stable) {
12137 auto p = b->txcs.begin();
12138 while (p != b->txcs.end()) {
12139 TransContext *txc = &*p;
12140 p = b->txcs.erase(p); // unlink here because
12141 _txc_state_proc(txc); // this may destroy txc
12142 }
12143 delete b;
12144 }
31f18b77 12145 deferred_stable.clear();
12146
12147 if (!deferred_aggressive) {
31f18b77 12148 if (deferred_queue_size >= deferred_batch_ops.load() ||
9f95a23c 12149 throttle.should_submit_deferred()) {
224ce89b 12150 deferred_try_submit();
12151 }
12152 }
12153
12154 // this is as good a place as any ...
12155 _reap_collections();
12156
11fdf7f2 12157 logger->set(l_bluestore_fragmentation,
9f95a23c 12158 (uint64_t)(alloc->get_fragmentation() * 1000));
11fdf7f2 12159
12160 log_latency("kv_final",
12161 l_bluestore_kv_final_lat,
12162 mono_clock::now() - start,
12163 cct->_conf->bluestore_log_op_age);
11fdf7f2 12164
7c673cae 12165 l.lock();
12166 }
12167 }
12168 dout(10) << __func__ << " finish" << dendl;
31f18b77 12169 kv_finalize_started = false;
12170}
12171
12172bluestore_deferred_op_t *BlueStore::_get_deferred_op(
9f95a23c 12173 TransContext *txc)
12174{
12175 if (!txc->deferred_txn) {
12176 txc->deferred_txn = new bluestore_deferred_transaction_t;
12177 }
12178 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
12179 return &txc->deferred_txn->ops.back();
12180}
12181
12182void BlueStore::_deferred_queue(TransContext *txc)
12183{
12184 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
224ce89b 12185 deferred_lock.lock();
12186 if (!txc->osr->deferred_pending &&
12187 !txc->osr->deferred_running) {
12188 deferred_queue.push_back(*txc->osr);
12189 }
12190 if (!txc->osr->deferred_pending) {
12191 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
12192 }
12193 ++deferred_queue_size;
12194 txc->osr->deferred_pending->txcs.push_back(*txc);
12195 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
12196 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
12197 const auto& op = *opi;
11fdf7f2 12198 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
12199 bufferlist::const_iterator p = op.data.begin();
12200 for (auto e : op.extents) {
12201 txc->osr->deferred_pending->prepare_write(
12202 cct, wt.seq, e.offset, e.length, p);
12203 }
12204 }
12205 if (deferred_aggressive &&
12206 !txc->osr->deferred_running) {
12207 _deferred_submit_unlock(txc->osr.get());
12208 } else {
12209 deferred_lock.unlock();
12210 }
12211}
12212
224ce89b 12213void BlueStore::deferred_try_submit()
12214{
12215 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
12216 << deferred_queue_size << " txcs" << dendl;
12217 std::lock_guard l(deferred_lock);
12218 vector<OpSequencerRef> osrs;
12219 osrs.reserve(deferred_queue.size());
12220 for (auto& osr : deferred_queue) {
12221 osrs.push_back(&osr);
12222 }
12223 for (auto& osr : osrs) {
12224 if (osr->deferred_pending) {
12225 if (!osr->deferred_running) {
12226 _deferred_submit_unlock(osr.get());
12227 deferred_lock.lock();
12228 } else {
12229 dout(20) << __func__ << " osr " << osr << " already has running"
12230 << dendl;
12231 }
12232 } else {
12233 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
12234 }
12235 }
12236
12237 deferred_last_submitted = ceph_clock_now();
12238}
12239
12240void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
12241{
12242 dout(10) << __func__ << " osr " << osr
12243 << " " << osr->deferred_pending->iomap.size() << " ios pending "
12244 << dendl;
12245 ceph_assert(osr->deferred_pending);
12246 ceph_assert(!osr->deferred_running);
12247
12248 auto b = osr->deferred_pending;
12249 deferred_queue_size -= b->seq_bytes.size();
12250 ceph_assert(deferred_queue_size >= 0);
12251
12252 osr->deferred_running = osr->deferred_pending;
12253 osr->deferred_pending = nullptr;
12254
12255 deferred_lock.unlock();
12256
12257 for (auto& txc : b->txcs) {
12258 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
12259 }
12260 uint64_t start = 0, pos = 0;
12261 bufferlist bl;
12262 auto i = b->iomap.begin();
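// iomap is keyed by disk offset, so a single pass can coalesce adjacent
// extents: keep appending while each entry starts exactly at 'pos', and
// flush one aio_write whenever contiguity breaks or the map is exhausted.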
12263 while (true) {
12264 if (i == b->iomap.end() || i->first != pos) {
12265 if (bl.length()) {
12266 dout(20) << __func__ << " write 0x" << std::hex
12267 << start << "~" << bl.length()
12268 << " crc " << bl.crc32c(-1) << std::dec << dendl;
12269 if (!g_conf()->bluestore_debug_omit_block_device_write) {
12270 logger->inc(l_bluestore_deferred_write_ops);
12271 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
12272 int r = bdev->aio_write(start, bl, &b->ioc, false);
12273 ceph_assert(r == 0);
12274 }
12275 }
12276 if (i == b->iomap.end()) {
12277 break;
12278 }
12279 start = 0;
12280 pos = i->first;
12281 bl.clear();
12282 }
12283 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
12284 << std::hex << pos << "~" << i->second.bl.length() << std::dec
12285 << dendl;
12286 if (!bl.length()) {
12287 start = pos;
12288 }
12289 pos += i->second.bl.length();
12290 bl.claim_append(i->second.bl);
12291 ++i;
12292 }
12293
12294 bdev->aio_submit(&b->ioc);
12295}
12296
12297struct C_DeferredTrySubmit : public Context {
12298 BlueStore *store;
12299 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
12300 void finish(int r) {
12301 store->deferred_try_submit();
12302 }
12303};
12304
12305void BlueStore::_deferred_aio_finish(OpSequencer *osr)
12306{
12307 dout(10) << __func__ << " osr " << osr << dendl;
12308 ceph_assert(osr->deferred_running);
12309 DeferredBatch *b = osr->deferred_running;
12310
12311 {
12312 deferred_lock.lock();
12313 ceph_assert(osr->deferred_running == b);
12314 osr->deferred_running = nullptr;
12315 if (!osr->deferred_pending) {
12316 dout(20) << __func__ << " dequeueing" << dendl;
12317 auto q = deferred_queue.iterator_to(*osr);
12318 deferred_queue.erase(q);
12319 deferred_lock.unlock();
12320 } else {
12321 deferred_lock.unlock();
12322 if (deferred_aggressive) {
12323 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
12324 finisher.queue(new C_DeferredTrySubmit(this));
12325 } else {
12326 dout(20) << __func__ << " leaving queued, more pending" << dendl;
12327 }
12328 }
12329 }
12330
12331 {
12332 uint64_t costs = 0;
12333 {
12334 for (auto& i : b->txcs) {
12335 TransContext *txc = &i;
12336 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
12337 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
12338 costs += txc->cost;
12339 }
12340 }
12341 throttle.release_deferred_throttle(costs);
12342 }
12343
12344 {
12345 std::lock_guard l(kv_lock);
12346 deferred_done_queue.emplace_back(b);
12347
12348 // in the normal case, do not bother waking up the kv thread; it will
12349 // catch us on the next commit anyway.
12350 if (deferred_aggressive && !kv_sync_in_progress) {
12351 kv_sync_in_progress = true;
12352 kv_cond.notify_one();
12353 }
12354 }
12355}
12356
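// mount-time replay: any PREFIX_DEFERRED records still present describe
// deferred writes that were journaled but possibly never applied. decode
// each into a fresh txc that starts at STATE_KV_DONE so only the block
// device writes (and the subsequent record cleanup) are redone.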
12357int BlueStore::_deferred_replay()
12358{
12359 dout(10) << __func__ << " start" << dendl;
12360 int count = 0;
12361 int r = 0;
12362 CollectionRef ch = _get_collection(coll_t::meta());
12363 bool fake_ch = false;
12364 if (!ch) {
12365 // hmm, replaying initial mkfs?
12366 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
12367 fake_ch = true;
12368 }
12369 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
12370 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
12371 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
12372 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
12373 << dendl;
12374 bluestore_deferred_transaction_t *deferred_txn =
12375 new bluestore_deferred_transaction_t;
12376 bufferlist bl = it->value();
12377 auto p = bl.cbegin();
12378 try {
12379 decode(*deferred_txn, p);
12380 } catch (buffer::error& e) {
12381 derr << __func__ << " failed to decode deferred txn "
12382 << pretty_binary_string(it->key()) << dendl;
12383 delete deferred_txn;
12384 r = -EIO;
12385 goto out;
12386 }
12387 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
12388 txc->deferred_txn = deferred_txn;
12389 txc->state = TransContext::STATE_KV_DONE;
12390 _txc_state_proc(txc);
12391 }
12392 out:
12393 dout(20) << __func__ << " draining osr" << dendl;
12394 _osr_register_zombie(osr);
12395 _osr_drain_all();
12396 if (fake_ch) {
12397 new_coll_map.clear();
12398 }
12399 dout(10) << __func__ << " completed " << count << " events" << dendl;
12400 return r;
12401}
12402
12403// ---------------------------
12404// transactions
12405
12406int BlueStore::queue_transactions(
12407 CollectionHandle& ch,
12408 vector<Transaction>& tls,
12409 TrackedOpRef op,
12410 ThreadPool::TPHandle *handle)
12411{
12412 FUNCTRACE(cct);
12413 list<Context *> on_applied, on_commit, on_applied_sync;
12414 ObjectStore::Transaction::collect_contexts(
12415 tls, &on_applied, &on_commit, &on_applied_sync);
12416
12417 auto start = mono_clock::now();
12418
12419 Collection *c = static_cast<Collection*>(ch.get());
12420 OpSequencer *osr = c->osr.get();
12421 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
12422
12423 // prepare
12424 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
12425 &on_commit);
12426
12427 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
12428 txc->bytes += (*p).get_num_bytes();
12429 _txc_add_transaction(txc, &(*p));
12430 }
12431 _txc_calc_cost(txc);
12432
12433 _txc_write_nodes(txc, txc->t);
12434
12435 // journal deferred items
12436 if (txc->deferred_txn) {
12437 txc->deferred_txn->seq = ++deferred_seq;
12438 bufferlist bl;
12439 encode(*txc->deferred_txn, bl);
12440 string key;
12441 get_deferred_key(txc->deferred_txn->seq, &key);
12442 txc->t->set(PREFIX_DEFERRED, key, bl);
12443 }
12444
12445 _txc_finalize_kv(txc, txc->t);
12446 if (handle)
12447 handle->suspend_tp_timeout();
12448
12449 auto tstart = mono_clock::now();
12450
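// admission throttle. if we cannot start the transaction without
// blocking on deferred budget, temporarily go aggressive: flush every
// pending deferred batch, kick the kv thread so finished batches get
// cleaned up, and then wait for budget in finish_start_transaction.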
12451 if (!throttle.try_start_transaction(
12452 *db,
12453 *txc,
12454 tstart)) {
12455 // ensure we do not block here because of deferred writes
12456 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
12457 << dendl;
12458 ++deferred_aggressive;
12459 deferred_try_submit();
12460 {
12461 // wake up any previously finished deferred events
12462 std::lock_guard l(kv_lock);
12463 if (!kv_sync_in_progress) {
12464 kv_sync_in_progress = true;
12465 kv_cond.notify_one();
12466 }
12467 }
12468 throttle.finish_start_transaction(*db, *txc, tstart);
12469 --deferred_aggressive;
12470 }
12471 auto tend = mono_clock::now();
12472
12473 if (handle)
12474 handle->reset_tp_timeout();
12475
12476 logger->inc(l_bluestore_txc);
12477
12478 // execute (start)
12479 _txc_state_proc(txc);
12480
12481 // we're immediately readable (unlike FileStore)
12482 for (auto c : on_applied_sync) {
12483 c->complete(0);
12484 }
12485 if (!on_applied.empty()) {
12486 if (c->commit_queue) {
12487 c->commit_queue->queue(on_applied);
12488 } else {
12489 finisher.queue(on_applied);
12490 }
12491 }
12492
12493 log_latency("submit_transact",
12494 l_bluestore_submit_lat,
12495 mono_clock::now() - start,
12496 cct->_conf->bluestore_log_op_age);
12497 log_latency("throttle_transact",
12498 l_bluestore_throttle_lat,
12499 tend - tstart,
12500 cct->_conf->bluestore_log_op_age);
12501 return 0;
12502}
12503
12504void BlueStore::_txc_aio_submit(TransContext *txc)
12505{
12506 dout(10) << __func__ << " txc " << txc << dendl;
12507 bdev->aio_submit(&txc->ioc);
12508}
12509
12510void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
12511{
12512 Transaction::iterator i = t->begin();
12513
12514 _dump_transaction<30>(cct, t);
12515
12516 vector<CollectionRef> cvec(i.colls.size());
12517 unsigned j = 0;
12518 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
12519 ++p, ++j) {
12520 cvec[j] = _get_collection(*p);
12521 }
12522
12523 vector<OnodeRef> ovec(i.objects.size());
12524
12525 for (int pos = 0; i.have_op(); ++pos) {
12526 Transaction::Op *op = i.decode_op();
12527 int r = 0;
12528
12529 // no coll or obj
12530 if (op->op == Transaction::OP_NOP)
12531 continue;
12532
12533
12534 // collection operations
12535 CollectionRef &c = cvec[op->cid];
12536
12537 // initialize osd_pool_id and do a smoke test that all collections belong
12538 // to the same pool
12539 spg_t pgid;
12540 if (!!c ? c->cid.is_pg(&pgid) : false) {
12541 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
12542 txc->osd_pool_id == pgid.pool());
12543 txc->osd_pool_id = pgid.pool();
12544 }
12545
12546 switch (op->op) {
12547 case Transaction::OP_RMCOLL:
12548 {
12549 const coll_t &cid = i.get_cid(op->cid);
12550 r = _remove_collection(txc, cid, &c);
12551 if (!r)
12552 continue;
12553 }
12554 break;
12555
12556 case Transaction::OP_MKCOLL:
12557 {
12558 ceph_assert(!c);
12559 const coll_t &cid = i.get_cid(op->cid);
12560 r = _create_collection(txc, cid, op->split_bits, &c);
12561 if (!r)
12562 continue;
12563 }
12564 break;
12565
12566 case Transaction::OP_SPLIT_COLLECTION:
12567 ceph_abort_msg("deprecated");
12568 break;
12569
12570 case Transaction::OP_SPLIT_COLLECTION2:
12571 {
12572 uint32_t bits = op->split_bits;
12573 uint32_t rem = op->split_rem;
12574 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
12575 if (!r)
12576 continue;
12577 }
12578 break;
12579
12580 case Transaction::OP_MERGE_COLLECTION:
12581 {
12582 uint32_t bits = op->split_bits;
12583 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
12584 if (!r)
12585 continue;
12586 }
12587 break;
12588
12589 case Transaction::OP_COLL_HINT:
12590 {
12591 uint32_t type = op->hint_type;
12592 bufferlist hint;
12593 i.decode_bl(hint);
12594 auto hiter = hint.cbegin();
12595 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
12596 uint32_t pg_num;
12597 uint64_t num_objs;
12598 decode(pg_num, hiter);
12599 decode(num_objs, hiter);
12600 dout(10) << __func__ << " collection hint objects is a no-op, "
12601 << " pg_num " << pg_num << " num_objects " << num_objs
12602 << dendl;
12603 } else {
12604 // Ignore the hint
12605 dout(10) << __func__ << " unknown collection hint " << type << dendl;
12606 }
12607 continue;
12608 }
12609 break;
12610
12611 case Transaction::OP_COLL_SETATTR:
12612 r = -EOPNOTSUPP;
12613 break;
12614
12615 case Transaction::OP_COLL_RMATTR:
12616 r = -EOPNOTSUPP;
12617 break;
12618
12619 case Transaction::OP_COLL_RENAME:
12620 ceph_abort_msg("not implemented");
12621 break;
12622 }
12623 if (r < 0) {
12624 derr << __func__ << " error " << cpp_strerror(r)
12625 << " not handled on operation " << op->op
12626 << " (op " << pos << ", counting from 0)" << dendl;
12627 _dump_transaction<0>(cct, t);
12628 ceph_abort_msg("unexpected error");
12629 }
12630
12631 // these operations implicitly create the object
12632 bool create = false;
12633 if (op->op == Transaction::OP_TOUCH ||
12634 op->op == Transaction::OP_CREATE ||
12635 op->op == Transaction::OP_WRITE ||
12636 op->op == Transaction::OP_ZERO) {
12637 create = true;
12638 }
12639
12640 // object operations
12641 std::unique_lock l(c->lock);
12642 OnodeRef &o = ovec[op->oid];
12643 if (!o) {
12644 ghobject_t oid = i.get_oid(op->oid);
12645 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
12646 }
12647 if (!create && (!o || !o->exists)) {
12648 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
12649 << i.get_oid(op->oid) << dendl;
12650 r = -ENOENT;
12651 goto endop;
12652 }
12653
12654 switch (op->op) {
12655 case Transaction::OP_CREATE:
12656 case Transaction::OP_TOUCH:
12657 r = _touch(txc, c, o);
12658 break;
12659
12660 case Transaction::OP_WRITE:
12661 {
12662 uint64_t off = op->off;
12663 uint64_t len = op->len;
12664 uint32_t fadvise_flags = i.get_fadvise_flags();
12665 bufferlist bl;
12666 i.decode_bl(bl);
12667 r = _write(txc, c, o, off, len, bl, fadvise_flags);
12668 }
12669 break;
12670
12671 case Transaction::OP_ZERO:
12672 {
12673 uint64_t off = op->off;
12674 uint64_t len = op->len;
12675 r = _zero(txc, c, o, off, len);
12676 }
12677 break;
12678
12679 case Transaction::OP_TRIMCACHE:
12680 {
12681 // deprecated, no-op
12682 }
12683 break;
12684
12685 case Transaction::OP_TRUNCATE:
12686 {
12687 uint64_t off = op->off;
12688 r = _truncate(txc, c, o, off);
12689 }
12690 break;
12691
12692 case Transaction::OP_REMOVE:
12693 {
12694 r = _remove(txc, c, o);
12695 }
12696 break;
12697
12698 case Transaction::OP_SETATTR:
12699 {
12700 string name = i.decode_string();
12701 bufferptr bp;
12702 i.decode_bp(bp);
12703 r = _setattr(txc, c, o, name, bp);
12704 }
12705 break;
12706
12707 case Transaction::OP_SETATTRS:
12708 {
12709 map<string, bufferptr> aset;
12710 i.decode_attrset(aset);
12711 r = _setattrs(txc, c, o, aset);
12712 }
12713 break;
12714
12715 case Transaction::OP_RMATTR:
12716 {
12717 string name = i.decode_string();
12718 r = _rmattr(txc, c, o, name);
12719 }
12720 break;
12721
12722 case Transaction::OP_RMATTRS:
12723 {
12724 r = _rmattrs(txc, c, o);
12725 }
12726 break;
12727
12728 case Transaction::OP_CLONE:
12729 {
12730 OnodeRef& no = ovec[op->dest_oid];
12731 if (!no) {
12732 const ghobject_t& noid = i.get_oid(op->dest_oid);
12733 no = c->get_onode(noid, true);
12734 }
12735 r = _clone(txc, c, o, no);
12736 }
12737 break;
12738
12739 case Transaction::OP_CLONERANGE:
12740 ceph_abort_msg("deprecated");
12741 break;
12742
12743 case Transaction::OP_CLONERANGE2:
12744 {
12745 OnodeRef& no = ovec[op->dest_oid];
12746 if (!no) {
12747 const ghobject_t& noid = i.get_oid(op->dest_oid);
12748 no = c->get_onode(noid, true);
12749 }
12750 uint64_t srcoff = op->off;
12751 uint64_t len = op->len;
12752 uint64_t dstoff = op->dest_off;
12753 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
12754 }
12755 break;
12756
12757 case Transaction::OP_COLL_ADD:
12758 ceph_abort_msg("not implemented");
12759 break;
12760
12761 case Transaction::OP_COLL_REMOVE:
12762 ceph_abort_msg("not implemented");
12763 break;
12764
12765 case Transaction::OP_COLL_MOVE:
12766 ceph_abort_msg("deprecated");
12767 break;
12768
12769 case Transaction::OP_COLL_MOVE_RENAME:
12770 case Transaction::OP_TRY_RENAME:
12771 {
12772 ceph_assert(op->cid == op->dest_cid);
12773 const ghobject_t& noid = i.get_oid(op->dest_oid);
12774 OnodeRef& no = ovec[op->dest_oid];
12775 if (!no) {
12776 no = c->get_onode(noid, false);
12777 }
12778 r = _rename(txc, c, o, no, noid);
12779 }
12780 break;
12781
12782 case Transaction::OP_OMAP_CLEAR:
12783 {
12784 r = _omap_clear(txc, c, o);
12785 }
12786 break;
12787 case Transaction::OP_OMAP_SETKEYS:
12788 {
12789 bufferlist aset_bl;
12790 i.decode_attrset_bl(&aset_bl);
12791 r = _omap_setkeys(txc, c, o, aset_bl);
12792 }
12793 break;
12794 case Transaction::OP_OMAP_RMKEYS:
12795 {
12796 bufferlist keys_bl;
12797 i.decode_keyset_bl(&keys_bl);
12798 r = _omap_rmkeys(txc, c, o, keys_bl);
12799 }
12800 break;
12801 case Transaction::OP_OMAP_RMKEYRANGE:
12802 {
12803 string first, last;
12804 first = i.decode_string();
12805 last = i.decode_string();
12806 r = _omap_rmkey_range(txc, c, o, first, last);
12807 }
12808 break;
12809 case Transaction::OP_OMAP_SETHEADER:
12810 {
12811 bufferlist bl;
12812 i.decode_bl(bl);
12813 r = _omap_setheader(txc, c, o, bl);
12814 }
12815 break;
12816
12817 case Transaction::OP_SETALLOCHINT:
12818 {
12819 r = _set_alloc_hint(txc, c, o,
12820 op->expected_object_size,
12821 op->expected_write_size,
12822 op->alloc_hint_flags);
12823 }
12824 break;
12825
12826 default:
12827 derr << __func__ << " bad op " << op->op << dendl;
12828 ceph_abort();
12829 }
12830
12831 endop:
12832 if (r < 0) {
12833 bool ok = false;
12834
12835 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
12836 op->op == Transaction::OP_CLONE ||
12837 op->op == Transaction::OP_CLONERANGE2 ||
12838 op->op == Transaction::OP_COLL_ADD ||
12839 op->op == Transaction::OP_SETATTR ||
12840 op->op == Transaction::OP_SETATTRS ||
12841 op->op == Transaction::OP_RMATTR ||
12842 op->op == Transaction::OP_OMAP_SETKEYS ||
12843 op->op == Transaction::OP_OMAP_RMKEYS ||
12844 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
12845 op->op == Transaction::OP_OMAP_SETHEADER))
12846 // -ENOENT is usually okay
12847 ok = true;
12848 if (r == -ENODATA)
12849 ok = true;
12850
12851 if (!ok) {
12852 const char *msg = "unexpected error code";
12853
12854 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
12855 op->op == Transaction::OP_CLONE ||
12856 op->op == Transaction::OP_CLONERANGE2))
12857 msg = "ENOENT on clone suggests osd bug";
12858
12859 if (r == -ENOSPC)
12860 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
12861 // by partially applying transactions.
12862 msg = "ENOSPC from bluestore, misconfigured cluster";
12863
12864 if (r == -ENOTEMPTY) {
12865 msg = "ENOTEMPTY suggests garbage data in osd data dir";
12866 }
12867
12868 derr << __func__ << " error " << cpp_strerror(r)
12869 << " not handled on operation " << op->op
12870 << " (op " << pos << ", counting from 0)"
12871 << dendl;
12872 derr << msg << dendl;
12873 _dump_transaction<0>(cct, t);
12874 ceph_abort_msg("unexpected error");
12875 }
12876 }
12877 }
12878}
12879
12880
12881
12882// -----------------
12883// write operations
12884
12885int BlueStore::_touch(TransContext *txc,
12886 CollectionRef& c,
12887 OnodeRef &o)
12888{
12889 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
12890 int r = 0;
12891 _assign_nid(txc, o);
12892 txc->write_onode(o);
12893 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
12894 return r;
12895}
12896
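// pads a write out to chunk_size boundaries with zeros. illustrative
// example (values are not defaults): chunk_size 0x1000, a 0x200-byte
// write at offset 0x1200 gets front_pad 0x200 and back_pad 0xc00,
// yielding one aligned chunk 0x1000~0x1000 with the payload at offset
// 0x200 within it.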
12897void BlueStore::_pad_zeros(
12898 bufferlist *bl, uint64_t *offset,
12899 uint64_t chunk_size)
12900{
12901 auto length = bl->length();
12902 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
12903 << " chunk_size 0x" << chunk_size << std::dec << dendl;
12904 dout(40) << "before:\n";
12905 bl->hexdump(*_dout);
12906 *_dout << dendl;
12907 // front
12908 size_t front_pad = *offset % chunk_size;
12909 size_t back_pad = 0;
12910 size_t pad_count = 0;
12911 if (front_pad) {
12912 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
12913 bufferptr z = buffer::create_small_page_aligned(chunk_size);
12914 z.zero(0, front_pad, false);
12915 pad_count += front_pad;
12916 bl->begin().copy(front_copy, z.c_str() + front_pad);
12917 if (front_copy + front_pad < chunk_size) {
12918 back_pad = chunk_size - (length + front_pad);
12919 z.zero(front_pad + length, back_pad, false);
12920 pad_count += back_pad;
12921 }
12922 bufferlist old, t;
12923 old.swap(*bl);
12924 t.substr_of(old, front_copy, length - front_copy);
12925 bl->append(z);
12926 bl->claim_append(t);
12927 *offset -= front_pad;
12928 length += pad_count;
12929 }
12930
12931 // back
12932 uint64_t end = *offset + length;
12933 unsigned back_copy = end % chunk_size;
12934 if (back_copy) {
12935 ceph_assert(back_pad == 0);
12936 back_pad = chunk_size - back_copy;
12937 ceph_assert(back_copy <= length);
12938 bufferptr tail(chunk_size);
12939 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
12940 tail.zero(back_copy, back_pad, false);
12941 bufferlist old;
12942 old.swap(*bl);
12943 bl->substr_of(old, 0, length - back_copy);
12944 bl->append(tail);
12945 length += back_pad;
12946 pad_count += back_pad;
12947 }
12948 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
12949 << back_pad << " on front/back, now 0x" << *offset << "~"
12950 << length << std::dec << dendl;
12951 dout(40) << "after:\n";
12952 bl->hexdump(*_dout);
12953 *_dout << dendl;
12954 if (pad_count)
12955 logger->inc(l_bluestore_write_pad_bytes, pad_count);
12956 ceph_assert(bl->length() == length);
12957}
12958
12959void BlueStore::_do_write_small(
12960 TransContext *txc,
12961 CollectionRef &c,
12962 OnodeRef o,
12963 uint64_t offset, uint64_t length,
12964 bufferlist::iterator& blp,
12965 WriteContext *wctx)
12966{
12967 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
12968 << std::dec << dendl;
12969 ceph_assert(length < min_alloc_size);
12970 uint64_t end_offs = offset + length;
12971
12972 logger->inc(l_bluestore_write_small);
12973 logger->inc(l_bluestore_write_small_bytes, length);
12974
12975 bufferlist bl;
12976 blp.copy(length, bl);
12977
12978 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
12979 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
12980 uint32_t alloc_len = min_alloc_size;
12981 auto offset0 = p2align<uint64_t>(offset, alloc_len);
12982
12983 bool any_change;
12984
12985 // search suitable extent in both forward and reverse direction in
12986 // [offset - target_max_blob_size, offset + target_max_blob_size] range
12987 // then check if blob can be reused via can_reuse_blob func or apply
12988 // direct/deferred write (the latter for extents including or higher
12989 // than 'offset' only).
12990 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
12991
12992 // Look for an existing mutable blob we can use.
12993 auto begin = o->extent_map.extent_map.begin();
12994 auto end = o->extent_map.extent_map.end();
12995 auto ep = o->extent_map.seek_lextent(offset);
12996 if (ep != begin) {
12997 --ep;
12998 if (ep->blob_end() <= offset) {
12999 ++ep;
13000 }
13001 }
13002 auto prev_ep = ep;
13003 if (prev_ep != begin) {
13004 --prev_ep;
13005 } else {
13006 prev_ep = end; // to avoid this extent check as it's a duplicate
13007 }
13008
13009 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
13010 // We don't want to have more blobs than min alloc units fit
13011 // into 2 max blobs
13012 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
13013 bool above_blob_threshold = false;
13014
13015 inspected_blobs.reserve(blob_threshold);
13016
13017 uint64_t max_off = 0;
13018 auto start_ep = ep;
13019 auto end_ep = ep; // exclusively
13020 do {
13021 any_change = false;
13022
13023 if (ep != end && ep->logical_offset < offset + max_bsize) {
13024 BlobRef b = ep->blob;
13025 if (!above_blob_threshold) {
13026 inspected_blobs.insert(&b->get_blob());
13027 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13028 }
13029 max_off = ep->logical_end();
13030 auto bstart = ep->blob_start();
13031
13032 dout(20) << __func__ << " considering " << *b
13033 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13034 if (bstart >= end_offs) {
13035 dout(20) << __func__ << " ignoring distant " << *b << dendl;
13036 } else if (!b->get_blob().is_mutable()) {
13037 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
13038 } else if (ep->logical_offset % min_alloc_size !=
13039 ep->blob_offset % min_alloc_size) {
13040 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
13041 } else {
13042 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13043 // can we pad our head/tail out with zeros?
13044 uint64_t head_pad, tail_pad;
13045 head_pad = p2phase(offset, chunk_size);
13046 tail_pad = p2nphase(end_offs, chunk_size);
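// p2phase(x, a) is x's distance past the previous a-boundary and
// p2nphase(x, a) the distance to the next one (0 if aligned). e.g. with
// an illustrative chunk_size of 0x1000, offset 0x1100 -> head_pad 0x100
// and end_offs 0x1300 -> tail_pad 0xd00.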
13047 if (head_pad || tail_pad) {
13048 o->extent_map.fault_range(db, offset - head_pad,
13049 end_offs - offset + head_pad + tail_pad);
13050 }
13051 if (head_pad &&
13052 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
13053 head_pad = 0;
13054 }
13055 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
13056 tail_pad = 0;
13057 }
13058
13059 uint64_t b_off = offset - head_pad - bstart;
13060 uint64_t b_len = length + head_pad + tail_pad;
13061
13062 // direct write into unused blocks of an existing mutable blob?
13063 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
13064 b->get_blob().get_ondisk_length() >= b_off + b_len &&
13065 b->get_blob().is_unused(b_off, b_len) &&
13066 b->get_blob().is_allocated(b_off, b_len)) {
13067 _apply_padding(head_pad, tail_pad, bl);
13068
13069 dout(20) << __func__ << " write to unused 0x" << std::hex
13070 << b_off << "~" << b_len
13071 << " pad 0x" << head_pad << " + 0x" << tail_pad
13072 << std::dec << " of mutable " << *b << dendl;
13073 _buffer_cache_write(txc, b, b_off, bl,
13074 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13075
13076 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13077 if (b_len <= prefer_deferred_size) {
13078 dout(20) << __func__ << " deferring small 0x" << std::hex
13079 << b_len << std::dec << " unused write via deferred" << dendl;
13080 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13081 op->op = bluestore_deferred_op_t::OP_WRITE;
13082 b->get_blob().map(
13083 b_off, b_len,
13084 [&](uint64_t offset, uint64_t length) {
13085 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13086 return 0;
13087 });
13088 op->data = bl;
13089 } else {
13090 b->get_blob().map_bl(
13091 b_off, bl,
13092 [&](uint64_t offset, bufferlist& t) {
13093 bdev->aio_write(offset, t,
13094 &txc->ioc, wctx->buffered);
13095 });
13096 }
13097 }
13098 b->dirty_blob().calc_csum(b_off, bl);
13099 dout(20) << __func__ << " lex old " << *ep << dendl;
13100 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
13101 b,
13102 &wctx->old_extents);
13103 b->dirty_blob().mark_used(le->blob_offset, le->length);
13104 txc->statfs_delta.stored() += le->length;
13105 dout(20) << __func__ << " lex " << *le << dendl;
13106 logger->inc(l_bluestore_write_small_unused);
13107 return;
13108 }
13109 // read some data to fill out the chunk?
13110 uint64_t head_read = p2phase(b_off, chunk_size);
13111 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
13112 if ((head_read || tail_read) &&
13113 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
13114 head_read + tail_read < min_alloc_size) {
13115 b_off -= head_read;
13116 b_len += head_read + tail_read;
13117
13118 } else {
13119 head_read = tail_read = 0;
13120 }
13121
13122 // chunk-aligned deferred overwrite?
13123 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
13124 b_off % chunk_size == 0 &&
13125 b_len % chunk_size == 0 &&
13126 b->get_blob().is_allocated(b_off, b_len)) {
13127
13128 _apply_padding(head_pad, tail_pad, bl);
13129
13130 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
13131 << " and tail 0x" << tail_read << std::dec << dendl;
13132 if (head_read) {
13133 bufferlist head_bl;
13134 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
13135 head_bl, 0);
13136 ceph_assert(r >= 0 && r <= (int)head_read);
13137 size_t zlen = head_read - r;
13138 if (zlen) {
13139 head_bl.append_zero(zlen);
13140 logger->inc(l_bluestore_write_pad_bytes, zlen);
13141 }
13142 head_bl.claim_append(bl);
13143 bl.swap(head_bl);
13144 logger->inc(l_bluestore_write_penalty_read_ops);
13145 }
13146 if (tail_read) {
13147 bufferlist tail_bl;
13148 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
13149 tail_bl, 0);
13150 ceph_assert(r >= 0 && r <= (int)tail_read);
13151 size_t zlen = tail_read - r;
13152 if (zlen) {
13153 tail_bl.append_zero(zlen);
13154 logger->inc(l_bluestore_write_pad_bytes, zlen);
13155 }
13156 bl.claim_append(tail_bl);
13157 logger->inc(l_bluestore_write_penalty_read_ops);
13158 }
13159 logger->inc(l_bluestore_write_small_pre_read);
13160
13161 _buffer_cache_write(txc, b, b_off, bl,
13162 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13163
13164 if (b->get_blob().csum_type) {
13165 b->dirty_blob().calc_csum(b_off, bl);
13166 }
13167
13168 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13169 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13170 op->op = bluestore_deferred_op_t::OP_WRITE;
13171 int r = b->get_blob().map(
13172 b_off, b_len,
13173 [&](uint64_t offset, uint64_t length) {
13174 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13175 return 0;
13176 });
13177 ceph_assert(r == 0);
13178 op->data.claim(bl);
13179 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
13180 << b_len << std::dec << " of mutable " << *b
13181 << " at " << op->extents << dendl;
13182 }
13183
13184 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
13185 b, &wctx->old_extents);
13186 b->dirty_blob().mark_used(le->blob_offset, le->length);
13187 txc->statfs_delta.stored() += le->length;
13188 dout(20) << __func__ << " lex " << *le << dendl;
13189 logger->inc(l_bluestore_write_small_deferred);
13190 return;
13191 }
13192 // try to reuse blob if we can
13193 if (b->can_reuse_blob(min_alloc_size,
13194 max_bsize,
13195 offset0 - bstart,
13196 &alloc_len)) {
13197 ceph_assert(alloc_len == min_alloc_size); // expecting data always
13198 // fit into reused blob
13199 // Need to check for pending writes desiring to
13200 // reuse the same pextent. The rationale is that during GC two chunks
13201 // from garbage blobs (compressed?) can share logical space within the same
13202 // AU. That in turn might be caused by an unaligned len in clone_range2.
13203 // Hence the second write will fail in an attempt to reuse blob at
13204 // do_alloc_write().
13205 if (!wctx->has_conflict(b,
13206 offset0,
13207 offset0 + alloc_len,
13208 min_alloc_size)) {
13209
13210 // we can't reuse pad_head/pad_tail since they might be truncated
13211 // due to existent extents
13212 uint64_t b_off = offset - bstart;
13213 uint64_t b_off0 = b_off;
13214 _pad_zeros(&bl, &b_off0, chunk_size);
13215
13216 dout(20) << __func__ << " reuse blob " << *b << std::hex
13217 << " (0x" << b_off0 << "~" << bl.length() << ")"
13218 << " (0x" << b_off << "~" << length << ")"
13219 << std::dec << dendl;
13220
13221 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13222 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13223 false, false);
13224 logger->inc(l_bluestore_write_small_unused);
13225 return;
13226 }
13227 }
13228 }
13229 ++ep;
13230 end_ep = ep;
13231 any_change = true;
13232 } // if (ep != end && ep->logical_offset < offset + max_bsize)
13233
13234 // check extent for reuse in reverse order
13235 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13236 BlobRef b = prev_ep->blob;
13237 if (!above_blob_threshold) {
13238 inspected_blobs.insert(&b->get_blob());
13239 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13240 }
13241 start_ep = prev_ep;
13242 auto bstart = prev_ep->blob_start();
13243 dout(20) << __func__ << " considering " << *b
13244 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13245 if (b->can_reuse_blob(min_alloc_size,
13246 max_bsize,
13247 offset0 - bstart,
13248 &alloc_len)) {
13249 ceph_assert(alloc_len == min_alloc_size); // expecting data always
13250 // fit into reused blob
13251 // Need to check for pending writes desiring to
13252 // reuse the same pextent. The rationale is that during GC two chunks
13253 // from garbage blobs (compressed?) can share logical space within the same
13254 // AU. That in turn might be caused by an unaligned len in clone_range2.
13255 // Hence the second write will fail in an attempt to reuse blob at
13256 // do_alloc_write().
13257 if (!wctx->has_conflict(b,
13258 offset0,
13259 offset0 + alloc_len,
13260 min_alloc_size)) {
13261
13262 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13263 uint64_t b_off = offset - bstart;
13264 uint64_t b_off0 = b_off;
13265 _pad_zeros(&bl, &b_off0, chunk_size);
13266
13267 dout(20) << __func__ << " reuse blob " << *b << std::hex
13268 << " (0x" << b_off0 << "~" << bl.length() << ")"
13269 << " (0x" << b_off << "~" << length << ")"
13270 << std::dec << dendl;
13271
13272 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13273 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13274 false, false);
13275 logger->inc(l_bluestore_write_small_unused);
13276 return;
13277 }
13278 }
13279 if (prev_ep != begin) {
13280 --prev_ep;
13281 any_change = true;
13282 } else {
13283 prev_ep = end; // to avoid useless first extent re-check
13284 }
13285 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
13286 } while (any_change);
13287
13288 if (above_blob_threshold) {
13289 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
13290 << " " << std::hex << min_off << "~" << max_off << std::dec
13291 << dendl;
13292 ceph_assert(start_ep != end_ep);
13293 for (auto ep = start_ep; ep != end_ep; ++ep) {
13294 dout(20) << __func__ << " inserting for GC "
13295 << std::hex << ep->logical_offset << "~" << ep->length
13296 << std::dec << dendl;
13297
13298 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
13299 }
13300 // also insert the newly written extent into the GC set
13301 wctx->extents_to_gc.union_insert(offset, length);
13302 dout(20) << __func__ << " inserting (last) for GC "
13303 << std::hex << offset << "~" << length
13304 << std::dec << dendl;
13305 }
13306 // new blob.
13307 BlobRef b = c->new_blob();
13308 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
13309 uint64_t b_off0 = b_off;
13310 _pad_zeros(&bl, &b_off0, block_size);
13311 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13312 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13313 min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity
13314 // doesn't match disk one only
13315 true);
13316
13317 return;
13318}
13319
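// min_alloc_size-aligned middle of a write: carve [offset, offset+length)
// into chunks of at most target_blob_size, preferring to append to an
// existing mutable blob found near the write; when compressing, reuse is
// skipped and fresh blobs are always cut.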
13320void BlueStore::_do_write_big(
13321 TransContext *txc,
13322 CollectionRef &c,
13323 OnodeRef o,
13324 uint64_t offset, uint64_t length,
13325 bufferlist::iterator& blp,
13326 WriteContext *wctx)
13327{
13328 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13329 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
13330 << " compress " << (int)wctx->compress
13331 << dendl;
13332 logger->inc(l_bluestore_write_big);
13333 logger->inc(l_bluestore_write_big_bytes, length);
13334 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13335 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13336 while (length > 0) {
13337 bool new_blob = false;
13338 uint32_t l = std::min(max_bsize, length);
13339 BlobRef b;
13340 uint32_t b_off = 0;
13341
13342 //attempting to reuse existing blob
13343 if (!wctx->compress) {
13344 // look for an existing mutable blob we can reuse
13345 auto begin = o->extent_map.extent_map.begin();
13346 auto end = o->extent_map.extent_map.end();
13347 auto ep = o->extent_map.seek_lextent(offset);
13348 auto prev_ep = ep;
13349 if (prev_ep != begin) {
13350 --prev_ep;
13351 } else {
13352 prev_ep = end; // to avoid this extent check as it's a duplicate
13353 }
13354 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13355 // search suitable extent in both forward and reverse direction in
13356 // [offset - target_max_blob_size, offset + target_max_blob_size] range
13357 // then check if blob can be reused via can_reuse_blob func.
13358 bool any_change;
13359 do {
13360 any_change = false;
13361 if (ep != end && ep->logical_offset < offset + max_bsize) {
13362 if (offset >= ep->blob_start() &&
13363 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13364 offset - ep->blob_start(),
13365 &l)) {
13366 b = ep->blob;
13367 b_off = offset - ep->blob_start();
13368 prev_ep = end; // to avoid check below
13369 dout(20) << __func__ << " reuse blob " << *b << std::hex
13370 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13371 } else {
13372 ++ep;
13373 any_change = true;
13374 }
13375 }
13376
13377 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13378 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13379 offset - prev_ep->blob_start(),
13380 &l)) {
13381 b = prev_ep->blob;
13382 b_off = offset - prev_ep->blob_start();
13383 dout(20) << __func__ << " reuse blob " << *b << std::hex
13384 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13385 } else if (prev_ep != begin) {
13386 --prev_ep;
13387 any_change = true;
13388 } else {
13389 prev_ep = end; // to avoid useless first extent re-check
13390 }
13391 }
13392 } while (b == nullptr && any_change);
13393 }
13394 if (b == nullptr) {
13395 b = c->new_blob();
13396 b_off = 0;
13397 new_blob = true;
13398 }
13399
13400 bufferlist t;
13401 blp.copy(l, t);
13402 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
13403 offset += l;
13404 length -= l;
13405 logger->inc(l_bluestore_write_big_blobs);
13406 }
13407}
13408
13409int BlueStore::_do_alloc_write(
13410 TransContext *txc,
13411 CollectionRef coll,
13412 OnodeRef o,
13413 WriteContext *wctx)
13414{
13415 dout(20) << __func__ << " txc " << txc
13416 << " " << wctx->writes.size() << " blobs"
13417 << dendl;
13418 if (wctx->writes.empty()) {
13419 return 0;
13420 }
13421
13422 CompressorRef c;
13423 double crr = 0;
13424 if (wctx->compress) {
13425 c = select_option(
13426 "compression_algorithm",
13427 compressor,
13428 [&]() {
13429 string val;
13430 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
13431 CompressorRef cp = compressor;
13432 if (!cp || cp->get_type_name() != val) {
13433 cp = Compressor::create(cct, val);
13434 if (!cp) {
13435 if (_set_compression_alert(false, val.c_str())) {
13436 derr << __func__ << " unable to initialize " << val.c_str()
13437 << " compressor" << dendl;
13438 }
13439 }
13440 }
13441 return boost::optional<CompressorRef>(cp);
13442 }
13443 return boost::optional<CompressorRef>();
13444 }
13445 );
13446
13447 crr = select_option(
13448 "compression_required_ratio",
13449 cct->_conf->bluestore_compression_required_ratio,
13450 [&]() {
13451 double val;
13452 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
13453 return boost::optional<double>(val);
13454 }
13455 return boost::optional<double>();
13456 }
13457 );
13458 }
13459
13460 // checksum
13461 int64_t csum = csum_type.load();
13462 csum = select_option(
13463 "csum_type",
13464 csum,
13465 [&]() {
13466 int64_t val;
13467 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
13468 return boost::optional<int64_t>(val);
13469 }
13470 return boost::optional<int64_t>();
13471 }
13472 );
13473
13474 // compress (as needed) and calc needed space
13475 uint64_t need = 0;
13476 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13477 for (auto& wi : wctx->writes) {
13478 if (c && wi.blob_length > min_alloc_size) {
13479 auto start = mono_clock::now();
13480
13481 // compress
13482 ceph_assert(wi.b_off == 0);
13483 ceph_assert(wi.blob_length == wi.bl.length());
13484
13485 // FIXME: memory alignment here is bad
13486 bufferlist t;
13487 int r = c->compress(wi.bl, t);
13488 uint64_t want_len_raw = wi.blob_length * crr;
13489 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
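// keep the compressed result only if, rounded up to min_alloc_size, it
// still beats the required ratio crr. illustrative numbers: min_alloc_size
// 0x1000 and crr 0.875 on a 0x10000 blob give want_len 0xe000, so a result
// rounding to 0x8000 is accepted while one rounding to 0xf000 is rejected
// and the blob is left uncompressed.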
13490 bool rejected = false;
13491 uint64_t compressed_len = t.length();
13492 // do an approximate (fast) estimation for resulting blob size
13493 // that doesn't take header overhead into account
11fdf7f2 13494 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
13495 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
13496 bluestore_compression_header_t chdr;
13497 chdr.type = c->get_type();
13498 chdr.length = t.length();
13499 encode(chdr, wi.compressed_bl);
13500 wi.compressed_bl.claim_append(t);
13501
13502 compressed_len = wi.compressed_bl.length();
13503 result_len = p2roundup(compressed_len, min_alloc_size);
13504 if (result_len <= want_len && result_len < wi.blob_length) {
13505 // Cool. We compressed at least as much as we were hoping to.
13506 // pad out to min_alloc_size
13507 wi.compressed_bl.append_zero(result_len - compressed_len);
13508 wi.compressed_len = compressed_len;
13509 wi.compressed = true;
13510 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
13511 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
13512 << " -> 0x" << compressed_len << " => 0x" << result_len
13513 << " with " << c->get_type()
13514 << std::dec << dendl;
13515 txc->statfs_delta.compressed() += compressed_len;
13516 txc->statfs_delta.compressed_original() += wi.blob_length;
13517 txc->statfs_delta.compressed_allocated() += result_len;
13518 logger->inc(l_bluestore_compress_success_count);
13519 need += result_len;
13520 } else {
13521 rejected = true;
13522 }
13523 } else if (r != 0) {
13524 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
13525 << " bytes compressed using " << c->get_type_name()
13526 << std::dec
13527 << " failed with errcode = " << r
13528 << ", leaving uncompressed"
13529 << dendl;
13530 logger->inc(l_bluestore_compress_rejected_count);
13531 need += wi.blob_length;
13532 } else {
13533 rejected = true;
13534 }
13535
13536 if (rejected) {
13537 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
13538 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
13539 << " with " << c->get_type()
13540 << ", which is more than required 0x" << want_len_raw
13541 << " -> 0x" << want_len
13542 << ", leaving uncompressed"
13543 << std::dec << dendl;
13544 logger->inc(l_bluestore_compress_rejected_count);
13545 need += wi.blob_length;
13546 }
13547 log_latency("compress@_do_alloc_write",
13548 l_bluestore_compress_lat,
13549 mono_clock::now() - start,
13550 cct->_conf->bluestore_log_op_age );
13551 } else {
13552 need += wi.blob_length;
13553 }
13554 }
13555 PExtentVector prealloc;
13556 prealloc.reserve(2 * wctx->writes.size());
13557 int64_t prealloc_left = 0;
13558 prealloc_left = alloc->allocate(
13559 need, min_alloc_size, need,
13560 0, &prealloc);
13561 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
13562 derr << __func__ << " failed to allocate 0x" << std::hex << need
13563 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
13564 << " min_alloc_size 0x" << min_alloc_size
13565 << " available 0x " << alloc->get_free()
13566 << std::dec << dendl;
13567 if (prealloc.size()) {
13568 alloc->release(prealloc);
13569 }
13570 return -ENOSPC;
13571 }
13572 _collect_allocation_stats(need, min_alloc_size, prealloc.size());
13573
13574 dout(20) << __func__ << " prealloc " << prealloc << dendl;
13575 auto prealloc_pos = prealloc.begin();
13576
13577 for (auto& wi : wctx->writes) {
13578 BlobRef b = wi.b;
13579 bluestore_blob_t& dblob = b->dirty_blob();
13580 uint64_t b_off = wi.b_off;
13581 bufferlist *l = &wi.bl;
13582 uint64_t final_length = wi.blob_length;
13583 uint64_t csum_length = wi.blob_length;
13584 if (wi.compressed) {
13585 final_length = wi.compressed_bl.length();
13586 csum_length = final_length;
13587 l = &wi.compressed_bl;
13588 dblob.set_compressed(wi.blob_length, wi.compressed_len);
13589 } else if (wi.new_blob) {
13590 // initialize newly created blob only
13591 ceph_assert(dblob.is_mutable());
13592 unsigned csum_order;
13593 if (l->length() != wi.blob_length) {
13594 // hrm, maybe we could do better here, but let's not bother.
13595 dout(20) << __func__ << " forcing csum_order to block_size_order "
13596 << block_size_order << dendl;
13597 csum_order = block_size_order;
13598 } else {
13599 csum_order = std::min(wctx->csum_order, ctz(l->length()));
13600 }
13601 // try to align blob with max_blob_size to improve
13602 // its reuse ratio, e.g. in case of reverse write
13603 uint32_t suggested_boff =
13604 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
13605 if ((suggested_boff % (1 << csum_order)) == 0 &&
13606 suggested_boff + final_length <= max_bsize &&
13607 suggested_boff > b_off) {
13608 dout(20) << __func__ << " forcing blob_offset to 0x"
13609 << std::hex << suggested_boff << std::dec << dendl;
13610 ceph_assert(suggested_boff >= b_off);
13611 csum_length += suggested_boff - b_off;
13612 b_off = suggested_boff;
13613 }
13614 if (csum != Checksummer::CSUM_NONE) {
13615 dout(20) << __func__ << " initialize csum setting for new blob " << *b
13616 << " csum_type " << Checksummer::get_csum_type_string(csum)
13617 << " csum_order " << csum_order
13618 << " csum_length 0x" << std::hex << csum_length << std::dec
13619 << dendl;
13620 dblob.init_csum(csum, csum_order, csum_length);
13621 }
13622 }
13623
13624 PExtentVector extents;
13625 int64_t left = final_length;
13626 while (left > 0) {
13627 ceph_assert(prealloc_left > 0);
13628 if (prealloc_pos->length <= left) {
13629 prealloc_left -= prealloc_pos->length;
13630 left -= prealloc_pos->length;
13631 txc->statfs_delta.allocated() += prealloc_pos->length;
13632 extents.push_back(*prealloc_pos);
13633 ++prealloc_pos;
13634 } else {
13635 extents.emplace_back(prealloc_pos->offset, left);
13636 prealloc_pos->offset += left;
13637 prealloc_pos->length -= left;
13638 prealloc_left -= left;
13639 txc->statfs_delta.allocated() += left;
13640 left = 0;
13641 break;
13642 }
13643 }
13644 for (auto& p : extents) {
13645 txc->allocated.insert(p.offset, p.length);
13646 }
13647 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
13648
13649 dout(20) << __func__ << " blob " << *b << dendl;
13650 if (dblob.has_csum()) {
13651 dblob.calc_csum(b_off, *l);
13652 }
13653
13654 if (wi.mark_unused) {
13655 ceph_assert(!dblob.is_compressed());
13656 auto b_end = b_off + wi.bl.length();
13657 if (b_off) {
13658 dblob.add_unused(0, b_off);
13659 }
13660 uint64_t llen = dblob.get_logical_length();
13661 if (b_end < llen) {
13662 dblob.add_unused(b_end, llen - b_end);
13663 }
13664 }
13665
13666 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
13667 b_off + (wi.b_off0 - wi.b_off),
13668 wi.length0,
13669 wi.b,
13670 nullptr);
13671 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
13672 txc->statfs_delta.stored() += le->length;
13673 dout(20) << __func__ << " lex " << *le << dendl;
13674 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
13675 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13676
13677 // queue io
13678 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13679 if (l->length() <= prefer_deferred_size.load()) {
13680 dout(20) << __func__ << " deferring small 0x" << std::hex
13681 << l->length() << std::dec << " write via deferred" << dendl;
13682 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13683 op->op = bluestore_deferred_op_t::OP_WRITE;
13684 int r = b->get_blob().map(
13685 b_off, l->length(),
13686 [&](uint64_t offset, uint64_t length) {
13687 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13688 return 0;
13689 });
13690 ceph_assert(r == 0);
13691 op->data = *l;
13692 logger->inc(l_bluestore_write_small_deferred);
13693 } else {
13694 b->get_blob().map_bl(
13695 b_off, *l,
13696 [&](uint64_t offset, bufferlist& t) {
13697 bdev->aio_write(offset, t, &txc->ioc, false);
13698 });
13699 logger->inc(l_bluestore_write_small_new);
13700 }
13701 }
13702 }
13703 ceph_assert(prealloc_pos == prealloc.end());
13704 ceph_assert(prealloc_left == 0);
13705 return 0;
13706}
13707
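// retire the logical extents this write displaced: shared blobs get their
// per-extent refcounts dropped (collecting blobs that may no longer need
// to be shared into maybe_unshared_blobs), and physical extents with no
// remaining references are queued for release via txc->released.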
13708void BlueStore::_wctx_finish(
13709 TransContext *txc,
13710 CollectionRef& c,
13711 OnodeRef o,
13712 WriteContext *wctx,
13713 set<SharedBlob*> *maybe_unshared_blobs)
13714{
13715 auto oep = wctx->old_extents.begin();
13716 while (oep != wctx->old_extents.end()) {
13717 auto &lo = *oep;
13718 oep = wctx->old_extents.erase(oep);
13719 dout(20) << __func__ << " lex_old " << lo.e << dendl;
13720 BlobRef b = lo.e.blob;
13721 const bluestore_blob_t& blob = b->get_blob();
13722 if (blob.is_compressed()) {
13723 if (lo.blob_empty) {
13724 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
13725 }
13726 txc->statfs_delta.compressed_original() -= lo.e.length;
13727 }
13728 auto& r = lo.r;
13729 txc->statfs_delta.stored() -= lo.e.length;
13730 if (!r.empty()) {
13731 dout(20) << __func__ << " blob release " << r << dendl;
13732 if (blob.is_shared()) {
13733 PExtentVector final;
13734 c->load_shared_blob(b->shared_blob);
13735 bool unshare = false;
13736 bool* unshare_ptr =
13737 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
13738 for (auto e : r) {
13739 b->shared_blob->put_ref(
13740 e.offset, e.length, &final,
13741 unshare_ptr);
13742 }
13743 if (unshare) {
13744 ceph_assert(maybe_unshared_blobs);
13745 maybe_unshared_blobs->insert(b->shared_blob.get());
13746 }
13747 dout(20) << __func__ << " shared_blob release " << final
13748 << " from " << *b->shared_blob << dendl;
13749 txc->write_shared_blob(b->shared_blob);
13750 r.clear();
13751 r.swap(final);
13752 }
13753 }
13754 // we can't invalidate our logical extents as we drop them because
13755 // other lextents (either in our onode or others) may still
13756 // reference them. but we can throw out anything that is no
13757 // longer allocated. Note that this will leave behind edge bits
13758 // that are no longer referenced but not deallocated (until they
13759 // age out of the cache naturally).
13760 b->discard_unallocated(c.get());
13761 for (auto e : r) {
13762 dout(20) << __func__ << " release " << e << dendl;
13763 txc->released.insert(e.offset, e.length);
13764 txc->statfs_delta.allocated() -= e.length;
13765 if (blob.is_compressed()) {
13766 txc->statfs_delta.compressed_allocated() -= e.length;
13767 }
13768 }
13769
13770 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
13771 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
13772 << dendl;
13773 o->extent_map.spanning_blob_map.erase(b->id);
13774 }
13775 delete &lo;
13776 }
13777}
13778
13779void BlueStore::_do_write_data(
13780 TransContext *txc,
13781 CollectionRef& c,
13782 OnodeRef o,
13783 uint64_t offset,
13784 uint64_t length,
13785 bufferlist& bl,
13786 WriteContext *wctx)
13787{
13788 uint64_t end = offset + length;
13789 bufferlist::iterator p = bl.begin();
13790
13791 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
13792 (length != min_alloc_size)) {
13793 // we fall within the same block
13794 _do_write_small(txc, c, o, offset, length, p, wctx);
13795 } else {
13796 uint64_t head_offset, head_length;
13797 uint64_t middle_offset, middle_length;
13798 uint64_t tail_offset, tail_length;
13799
13800 head_offset = offset;
13801 head_length = p2nphase(offset, min_alloc_size);
13802
13803 tail_offset = p2align(end, min_alloc_size);
13804 tail_length = p2phase(end, min_alloc_size);
13805
13806 middle_offset = head_offset + head_length;
13807 middle_length = length - head_length - tail_length;
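// illustrative split with min_alloc_size 0x10000: a write 0x4000~0x30000
// becomes head 0x4000~0xc000, middle 0x10000~0x20000, tail 0x30000~0x4000;
// head and tail take the small-write path, the aligned middle the big one.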
13808
13809 if (head_length) {
13810 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
13811 }
13812
13813 if (middle_length) {
13814 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
13815 }
13816
13817 if (tail_length) {
13818 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
13819 }
13820 }
13821}
13822
13823void BlueStore::_choose_write_options(
13824 CollectionRef& c,
13825 OnodeRef o,
13826 uint32_t fadvise_flags,
13827 WriteContext *wctx)
13828{
13829 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
13830 dout(20) << __func__ << " will do buffered write" << dendl;
13831 wctx->buffered = true;
13832 } else if (cct->_conf->bluestore_default_buffered_write &&
13833 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
13834 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
13835 dout(20) << __func__ << " defaulting to buffered write" << dendl;
13836 wctx->buffered = true;
13837 }
13838
13839 // apply basic csum block size
13840 wctx->csum_order = block_size_order;
13841
13842 // compression parameters
13843 unsigned alloc_hints = o->onode.alloc_hint_flags;
13844 auto cm = select_option(
13845 "compression_mode",
13846 comp_mode.load(),
13847 [&]() {
13848 string val;
13849 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
13850 return boost::optional<Compressor::CompressionMode>(
13851 Compressor::get_comp_mode_type(val));
13852 }
13853 return boost::optional<Compressor::CompressionMode>();
13854 }
13855 );
13856
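// compress when the mode allows it: COMP_FORCE always, COMP_AGGRESSIVE
// unless the client hinted INCOMPRESSIBLE, COMP_PASSIVE only when the
// client hinted COMPRESSIBLE.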
13857 wctx->compress = (cm != Compressor::COMP_NONE) &&
13858 ((cm == Compressor::COMP_FORCE) ||
13859 (cm == Compressor::COMP_AGGRESSIVE &&
13860 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
13861 (cm == Compressor::COMP_PASSIVE &&
13862 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
13863
13864 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
13865 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
13866 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
13867 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 13868 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 13869
7c673cae 13870 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 13871
7c673cae 13872 if (o->onode.expected_write_size) {
224ce89b 13873 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 13874 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 13875 } else {
224ce89b 13876 wctx->csum_order = min_alloc_size_order;
13877 }
13878
13879 if (wctx->compress) {
13880 wctx->target_blob_size = select_option(
7c673cae 13881 "compression_max_blob_size",
31f18b77 13882 comp_max_blob_size.load(),
7c673cae 13883 [&]() {
13884 int64_t val;
13885 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
13886 return boost::optional<uint64_t>((uint64_t)val);
13887 }
13888 return boost::optional<uint64_t>();
13889 }
13890 );
13891 }
13892 } else {
13893 if (wctx->compress) {
13894 wctx->target_blob_size = select_option(
7c673cae 13895 "compression_min_blob_size",
31f18b77 13896 comp_min_blob_size.load(),
7c673cae 13897 [&]() {
13898 int64_t val;
13899 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
13900 return boost::optional<uint64_t>((uint64_t)val);
13901 }
13902 return boost::optional<uint64_t>();
13903 }
13904 );
13905 }
13906 }
31f18b77 13907
7c673cae 13908 uint64_t max_bsize = max_blob_size.load();
13909 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
13910 wctx->target_blob_size = max_bsize;
7c673cae 13911 }
31f18b77 13912
13913 // set the min blob size floor at 2x the min_alloc_size, or else we
13914 // won't be able to allocate a smaller extent for the compressed
13915 // data.
13916 if (wctx->compress &&
13917 wctx->target_blob_size < min_alloc_size * 2) {
13918 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 13919 }
13920
13921 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
13922 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
13923 << " compress=" << (int)wctx->compress
13924 << " buffered=" << (int)wctx->buffered
13925 << std::dec << dendl;
13926}
13927
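// Garbage collection pass: re-read every extent selected for collection
// and rewrite it through a cloned WriteContext so that partially
// referenced (typically compressed) blobs can be released, widening the
// caller's dirty range to cover whatever was rewritten.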
13928int BlueStore::_do_gc(
13929 TransContext *txc,
13930 CollectionRef& c,
13931 OnodeRef o,
13932 const WriteContext& wctx,
13933 uint64_t *dirty_start,
13934 uint64_t *dirty_end)
13935{
31f18b77 13936
1adf2230 13937 bool dirty_range_updated = false;
31f18b77 13938 WriteContext wctx_gc;
7c673cae 13939 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 13940
eafe8130 13941 auto & extents_to_collect = wctx.extents_to_gc;
13942 for (auto it = extents_to_collect.begin();
13943 it != extents_to_collect.end();
13944 ++it) {
13945 bufferlist bl;
13946 auto offset = (*it).first;
13947 auto length = (*it).second;
13948 dout(20) << __func__ << " processing " << std::hex
13949 << offset << "~" << length << std::dec
13950 << dendl;
13951 int r = _do_read(c.get(), o, offset, length, bl, 0);
13952 ceph_assert(r == (int)length);
31f18b77 13953
13954 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
13955 logger->inc(l_bluestore_gc_merged, length);
31f18b77 13956
13957 if (*dirty_start > offset) {
13958 *dirty_start = offset;
1adf2230 13959 dirty_range_updated = true;
13960 }
13961
13962 if (*dirty_end < offset + length) {
13963 *dirty_end = offset + length;
1adf2230 13964 dirty_range_updated = true;
13965 }
13966 }
13967 if (dirty_range_updated) {
13968 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
13969 }
13970
13971 dout(30) << __func__ << " alloc write" << dendl;
13972 int r = _do_alloc_write(txc, c, o, &wctx_gc);
13973 if (r < 0) {
13974 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
13975 << dendl;
13976 return r;
13977 }
13978
13979 _wctx_finish(txc, c, o, &wctx_gc);
13980 return 0;
13981}
13982
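// Top-level write path:
//   1. _choose_write_options - buffered/compress/csum/blob-size policy
//   2. fault_range           - load the affected extent map shards
//   3. _do_write_data        - stage small/big writes into wctx
//   4. _do_alloc_write       - allocate space, compress, queue the I/O
//   5. gc estimate + _do_gc  - optionally rewrite neighbouring extents
//   6. compress/dirty the extent map over the final dirty range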
13983int BlueStore::_do_write(
13984 TransContext *txc,
13985 CollectionRef& c,
13986 OnodeRef o,
13987 uint64_t offset,
13988 uint64_t length,
13989 bufferlist& bl,
13990 uint32_t fadvise_flags)
13991{
13992 int r = 0;
13993
13994 dout(20) << __func__
13995 << " " << o->oid
13996 << " 0x" << std::hex << offset << "~" << length
13997 << " - have 0x" << o->onode.size
13998 << " (" << std::dec << o->onode.size << ")"
13999 << " bytes"
14000 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
14001 << dendl;
81eedcae 14002 _dump_onode<30>(cct, *o);
14003
14004 if (length == 0) {
14005 return 0;
14006 }
14007
14008 uint64_t end = offset + length;
14009
14010 GarbageCollector gc(c->store->cct);
eafe8130 14011 int64_t benefit = 0;
14012 auto dirty_start = offset;
14013 auto dirty_end = end;
14014
14015 WriteContext wctx;
14016 _choose_write_options(c, o, fadvise_flags, &wctx);
14017 o->extent_map.fault_range(db, offset, length);
14018 _do_write_data(txc, c, o, offset, length, bl, &wctx);
14019 r = _do_alloc_write(txc, c, o, &wctx);
14020 if (r < 0) {
14021 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14022 << dendl;
14023 goto out;
14024 }
14025
14026 if (wctx.extents_to_gc.empty() ||
14027 wctx.extents_to_gc.range_start() > offset ||
14028 wctx.extents_to_gc.range_end() < offset + length) {
14029 benefit = gc.estimate(offset,
14030 length,
14031 o->extent_map,
14032 wctx.old_extents,
14033 min_alloc_size);
14034 }
14035
14036 // NB: _wctx_finish() will empty old_extents
14037 // so we must do gc estimation before that
14038 _wctx_finish(txc, c, o, &wctx);
14039 if (end > o->onode.size) {
14040 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 14041 << std::dec << dendl;
14042 o->onode.size = end;
14043 }
14044
11fdf7f2 14045 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
14046 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
14047 dout(20) << __func__
14048 << " perform garbage collection for compressed extents, "
14049 << "expected benefit = " << benefit << " AUs" << dendl;
14050 }
14051 if (!wctx.extents_to_gc.empty()) {
14052 dout(20) << __func__ << " perform garbage collection" << dendl;
14053
14054 r = _do_gc(txc, c, o,
14055 wctx,
14056 &dirty_start, &dirty_end);
14057 if (r < 0) {
14058 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
14059 << dendl;
14060 goto out;
7c673cae 14061 }
14062 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
14063 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae 14064 }
7c673cae 14065 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
14066 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
14067
14068 r = 0;
14069
14070 out:
14071 return r;
14072}
14073
14074int BlueStore::_write(TransContext *txc,
14075 CollectionRef& c,
14076 OnodeRef& o,
14077 uint64_t offset, size_t length,
14078 bufferlist& bl,
14079 uint32_t fadvise_flags)
14080{
14081 dout(15) << __func__ << " " << c->cid << " " << o->oid
14082 << " 0x" << std::hex << offset << "~" << length << std::dec
14083 << dendl;
14084 int r = 0;
14085 if (offset + length >= OBJECT_MAX_SIZE) {
14086 r = -E2BIG;
14087 } else {
14088 _assign_nid(txc, o);
14089 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
14090 txc->write_onode(o);
14091 }
14092 dout(10) << __func__ << " " << c->cid << " " << o->oid
14093 << " 0x" << std::hex << offset << "~" << length << std::dec
14094 << " = " << r << dendl;
14095 return r;
14096}
14097
14098int BlueStore::_zero(TransContext *txc,
14099 CollectionRef& c,
14100 OnodeRef& o,
14101 uint64_t offset, size_t length)
14102{
14103 dout(15) << __func__ << " " << c->cid << " " << o->oid
14104 << " 0x" << std::hex << offset << "~" << length << std::dec
14105 << dendl;
14106 int r = 0;
14107 if (offset + length >= OBJECT_MAX_SIZE) {
14108 r = -E2BIG;
14109 } else {
14110 _assign_nid(txc, o);
14111 r = _do_zero(txc, c, o, offset, length);
14112 }
14113 dout(10) << __func__ << " " << c->cid << " " << o->oid
14114 << " 0x" << std::hex << offset << "~" << length << std::dec
14115 << " = " << r << dendl;
14116 return r;
14117}
14118
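// Zero a range by punching a hole in the extent map rather than writing
// zeros; the object size is extended if the zeroed range ends past EOF.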
14119int BlueStore::_do_zero(TransContext *txc,
14120 CollectionRef& c,
14121 OnodeRef& o,
14122 uint64_t offset, size_t length)
14123{
14124 dout(15) << __func__ << " " << c->cid << " " << o->oid
14125 << " 0x" << std::hex << offset << "~" << length << std::dec
14126 << dendl;
14127 int r = 0;
14128
81eedcae 14129 _dump_onode<30>(cct, *o);
14130
14131 WriteContext wctx;
14132 o->extent_map.fault_range(db, offset, length);
14133 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 14134 o->extent_map.dirty_range(offset, length);
14135 _wctx_finish(txc, c, o, &wctx);
14136
b32b8144 14137 if (length > 0 && offset + length > o->onode.size) {
14138 o->onode.size = offset + length;
14139 dout(20) << __func__ << " extending size to " << offset + length
14140 << dendl;
14141 }
14142 txc->write_onode(o);
14143
14144 dout(10) << __func__ << " " << c->cid << " " << o->oid
14145 << " 0x" << std::hex << offset << "~" << length << std::dec
14146 << " = " << r << dendl;
14147 return r;
14148}
14149
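// Truncate to `offset`: punch out everything past the new size and, if
// extent map shards now sit entirely past EOF, request a reshard so the
// shard layout can shrink along with the object.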
14150void BlueStore::_do_truncate(
14151 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
14152 set<SharedBlob*> *maybe_unshared_blobs)
14153{
14154 dout(15) << __func__ << " " << c->cid << " " << o->oid
14155 << " 0x" << std::hex << offset << std::dec << dendl;
14156
81eedcae 14157 _dump_onode<30>(cct, *o);
14158
14159 if (offset == o->onode.size)
31f18b77 14160 return;
14161
14162 if (offset < o->onode.size) {
14163 WriteContext wctx;
14164 uint64_t length = o->onode.size - offset;
14165 o->extent_map.fault_range(db, offset, length);
14166 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
14167 o->extent_map.dirty_range(offset, length);
14168 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
14169
14170 // if we have shards past EOF, ask for a reshard
14171 if (!o->onode.extent_map_shards.empty() &&
14172 o->onode.extent_map_shards.back().offset >= offset) {
14173 dout(10) << __func__ << " request reshard past EOF" << dendl;
14174 if (offset) {
14175 o->extent_map.request_reshard(offset - 1, offset + length);
14176 } else {
14177 o->extent_map.request_reshard(0, length);
14178 }
14179 }
14180 }
14181
14182 o->onode.size = offset;
14183
14184 txc->write_onode(o);
14185}
14186
35e4c445 14187int BlueStore::_truncate(TransContext *txc,
14188 CollectionRef& c,
14189 OnodeRef& o,
14190 uint64_t offset)
14191{
14192 dout(15) << __func__ << " " << c->cid << " " << o->oid
14193 << " 0x" << std::hex << offset << std::dec
14194 << dendl;
14195 int r = 0;
14196 if (offset >= OBJECT_MAX_SIZE) {
14197 r = -E2BIG;
14198 } else {
14199 _do_truncate(txc, c, o, offset);
14200 }
14201 dout(10) << __func__ << " " << c->cid << " " << o->oid
14202 << " 0x" << std::hex << offset << std::dec
14203 << " = " << r << dendl;
14204 return r;
14205}
14206
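// Remove an object: truncate to zero, clear omap, and delete the extent
// shard and onode keys. For generation (clone) objects, also check
// whether any shared blobs are now referenced only by the head object;
// if the head holds the sole remaining references, those blobs are
// unshared and their PREFIX_SHARED_BLOB records removed.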
14207int BlueStore::_do_remove(
14208 TransContext *txc,
14209 CollectionRef& c,
14210 OnodeRef o)
14211{
31f18b77 14212 set<SharedBlob*> maybe_unshared_blobs;
14213 bool is_gen = !o->oid.is_no_gen();
14214 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
14215 if (o->onode.has_omap()) {
14216 o->flush();
9f95a23c 14217 _do_omap_clear(txc, o);
14218 }
14219 o->exists = false;
14220 string key;
14221 for (auto &s : o->extent_map.shards) {
14222 dout(20) << __func__ << " removing shard 0x" << std::hex
14223 << s.shard_info->offset << std::dec << dendl;
14224 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
14225 [&](const string& final_key) {
14226 txc->t->rmkey(PREFIX_OBJ, final_key);
14227 }
14228 );
14229 }
14230 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 14231 txc->note_removed_object(o);
14232 o->extent_map.clear();
14233 o->onode = bluestore_onode_t();
14234 _debug_obj_on_delete(o->oid);
31f18b77 14235
14236 if (!is_gen || maybe_unshared_blobs.empty()) {
14237 return 0;
14238 }
31f18b77 14239
14240 // see if we can unshare blobs still referenced by the head
14241 dout(10) << __func__ << " gen and maybe_unshared_blobs "
14242 << maybe_unshared_blobs << dendl;
14243 ghobject_t nogen = o->oid;
14244 nogen.generation = ghobject_t::NO_GEN;
14245 OnodeRef h = c->onode_map.lookup(nogen);
14246
14247 if (!h || !h->exists) {
14248 return 0;
14249 }
14250
14251 dout(20) << __func__ << " checking for unshareable blobs on " << h
14252 << " " << h->oid << dendl;
14253 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
14254 for (auto& e : h->extent_map.extent_map) {
14255 const bluestore_blob_t& b = e.blob->get_blob();
14256 SharedBlob *sb = e.blob->shared_blob.get();
14257 if (b.is_shared() &&
14258 sb->loaded &&
14259 maybe_unshared_blobs.count(sb)) {
14260 if (b.is_compressed()) {
14261 expect[sb].get(0, b.get_ondisk_length());
14262 } else {
14263 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
14264 expect[sb].get(off, len);
14265 return 0;
14266 });
14267 }
14268 }
14269 }
31f18b77 14270
14271 vector<SharedBlob*> unshared_blobs;
14272 unshared_blobs.reserve(maybe_unshared_blobs.size());
14273 for (auto& p : expect) {
14274 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
14275 if (p.first->persistent->ref_map == p.second) {
14276 SharedBlob *sb = p.first;
14277 dout(20) << __func__ << " unsharing " << *sb << dendl;
14278 unshared_blobs.push_back(sb);
14279 txc->unshare_blob(sb);
14280 uint64_t sbid = c->make_blob_unshared(sb);
14281 string key;
14282 get_shared_blob_key(sbid, &key);
14283 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
14284 }
14285 }
14286
14287 if (unshared_blobs.empty()) {
14288 return 0;
14289 }
14290
14291 for (auto& e : h->extent_map.extent_map) {
14292 const bluestore_blob_t& b = e.blob->get_blob();
14293 SharedBlob *sb = e.blob->shared_blob.get();
14294 if (b.is_shared() &&
14295 std::find(unshared_blobs.begin(), unshared_blobs.end(),
14296 sb) != unshared_blobs.end()) {
14297 dout(20) << __func__ << " unsharing " << e << dendl;
14298 bluestore_blob_t& blob = e.blob->dirty_blob();
14299 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 14300 h->extent_map.dirty_range(e.logical_offset, 1);
14301 }
14302 }
14303 txc->write_onode(h);
14304
14305 return 0;
14306}
14307
14308int BlueStore::_remove(TransContext *txc,
14309 CollectionRef& c,
14310 OnodeRef &o)
14311{
14312 dout(15) << __func__ << " " << c->cid << " " << o->oid
14313 << " onode " << o.get()
14314 << " txc "<< txc << dendl;
14315 int r = _do_remove(txc, c, o);
14316 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14317 return r;
14318}
14319
14320int BlueStore::_setattr(TransContext *txc,
14321 CollectionRef& c,
14322 OnodeRef& o,
14323 const string& name,
14324 bufferptr& val)
14325{
14326 dout(15) << __func__ << " " << c->cid << " " << o->oid
14327 << " " << name << " (" << val.length() << " bytes)"
14328 << dendl;
14329 int r = 0;
14330 if (val.is_partial()) {
14331 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
14332 val.length());
14333 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
14334 } else {
14335 auto& b = o->onode.attrs[name.c_str()] = val;
14336 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
14337 }
14338 txc->write_onode(o);
14339 dout(10) << __func__ << " " << c->cid << " " << o->oid
14340 << " " << name << " (" << val.length() << " bytes)"
14341 << " = " << r << dendl;
14342 return r;
14343}
14344
14345int BlueStore::_setattrs(TransContext *txc,
14346 CollectionRef& c,
14347 OnodeRef& o,
14348 const map<string,bufferptr>& aset)
14349{
14350 dout(15) << __func__ << " " << c->cid << " " << o->oid
14351 << " " << aset.size() << " keys"
14352 << dendl;
14353 int r = 0;
14354 for (map<string,bufferptr>::const_iterator p = aset.begin();
14355 p != aset.end(); ++p) {
14356 if (p->second.is_partial()) {
14357 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 14358 bufferptr(p->second.c_str(), p->second.length());
14359 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
14360 } else {
14361 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
14362 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
14363 }
14364 }
14365 txc->write_onode(o);
14366 dout(10) << __func__ << " " << c->cid << " " << o->oid
14367 << " " << aset.size() << " keys"
14368 << " = " << r << dendl;
14369 return r;
14370}
14371
14372
14373int BlueStore::_rmattr(TransContext *txc,
14374 CollectionRef& c,
14375 OnodeRef& o,
14376 const string& name)
14377{
14378 dout(15) << __func__ << " " << c->cid << " " << o->oid
14379 << " " << name << dendl;
14380 int r = 0;
14381 auto it = o->onode.attrs.find(name.c_str());
14382 if (it == o->onode.attrs.end())
14383 goto out;
14384
14385 o->onode.attrs.erase(it);
14386 txc->write_onode(o);
14387
14388 out:
14389 dout(10) << __func__ << " " << c->cid << " " << o->oid
14390 << " " << name << " = " << r << dendl;
14391 return r;
14392}
14393
14394int BlueStore::_rmattrs(TransContext *txc,
14395 CollectionRef& c,
14396 OnodeRef& o)
14397{
14398 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14399 int r = 0;
14400
14401 if (o->onode.attrs.empty())
14402 goto out;
14403
14404 o->onode.attrs.clear();
14405 txc->write_onode(o);
14406
14407 out:
14408 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14409 return r;
14410}
14411
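// Remove all omap data for an onode by deleting the whole
// [header, tail] key range under the object's omap prefix, including
// the tail sentinel key itself.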
9f95a23c 14412void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
7c673cae 14413{
9f95a23c 14414 const string& omap_prefix = o->get_omap_prefix();
7c673cae 14415 string prefix, tail;
14416 o->get_omap_header(&prefix);
14417 o->get_omap_tail(&tail);
11fdf7f2 14418 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 14419 txc->t->rmkey(omap_prefix, tail);
14420 dout(20) << __func__ << " remove range start: "
14421 << pretty_binary_string(prefix) << " end: "
14422 << pretty_binary_string(tail) << dendl;
14423}
14424
14425int BlueStore::_omap_clear(TransContext *txc,
14426 CollectionRef& c,
14427 OnodeRef& o)
14428{
14429 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14430 int r = 0;
14431 if (o->onode.has_omap()) {
14432 o->flush();
9f95a23c 14433 _do_omap_clear(txc, o);
14434 o->onode.clear_omap_flag();
14435 txc->write_onode(o);
14436 }
14437 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14438 return r;
14439}
14440
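// Set omap keys from a bufferlist encoded as a __u32 count followed by
// that many (key, value) pairs. Each user key is appended to the
// object's fixed omap key prefix; on first use the omap flag is set and
// an empty "tail" sentinel key is written to bound later range scans
// and range deletes.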
14441int BlueStore::_omap_setkeys(TransContext *txc,
14442 CollectionRef& c,
14443 OnodeRef& o,
14444 bufferlist &bl)
14445{
14446 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14447 int r;
11fdf7f2 14448 auto p = bl.cbegin();
14449 __u32 num;
14450 if (!o->onode.has_omap()) {
11fdf7f2 14451 if (o->oid.is_pgmeta()) {
14452 o->onode.set_omap_flags_pgmeta();
14453 } else {
14454 o->onode.set_omap_flags();
11fdf7f2 14455 }
7c673cae 14456 txc->write_onode(o);
494da23a 14457
9f95a23c 14458 const string& prefix = o->get_omap_prefix();
14459 string key_tail;
14460 bufferlist tail;
9f95a23c 14461 o->get_omap_tail(&key_tail);
494da23a 14462 txc->t->set(prefix, key_tail, tail);
14463 } else {
14464 txc->note_modified_object(o);
14465 }
9f95a23c 14466 const string& prefix = o->get_omap_prefix();
7c673cae 14467 string final_key;
14468 o->get_omap_key(string(), &final_key);
14469 size_t base_key_len = final_key.size();
11fdf7f2 14470 decode(num, p);
14471 while (num--) {
14472 string key;
14473 bufferlist value;
14474 decode(key, p);
14475 decode(value, p);
9f95a23c 14476 final_key.resize(base_key_len); // keep prefix
7c673cae 14477 final_key += key;
11fdf7f2 14478 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 14479 << " <- " << key << dendl;
11fdf7f2 14480 txc->t->set(prefix, final_key, value);
14481 }
14482 r = 0;
14483 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14484 return r;
14485}
14486
14487int BlueStore::_omap_setheader(TransContext *txc,
14488 CollectionRef& c,
14489 OnodeRef &o,
14490 bufferlist& bl)
14491{
14492 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14493 int r;
14494 string key;
14495 if (!o->onode.has_omap()) {
11fdf7f2 14496 if (o->oid.is_pgmeta()) {
14497 o->onode.set_omap_flags_pgmeta();
14498 } else {
14499 o->onode.set_omap_flags();
11fdf7f2 14500 }
7c673cae 14501 txc->write_onode(o);
494da23a 14502
9f95a23c 14503 const string& prefix = o->get_omap_prefix();
14504 string key_tail;
14505 bufferlist tail;
9f95a23c 14506 o->get_omap_tail(&key_tail);
494da23a 14507 txc->t->set(prefix, key_tail, tail);
14508 } else {
14509 txc->note_modified_object(o);
14510 }
14511 const string& prefix = o->get_omap_prefix();
14512 o->get_omap_header(&key);
11fdf7f2 14513 txc->t->set(prefix, key, bl);
14514 r = 0;
14515 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14516 return r;
14517}
14518
14519int BlueStore::_omap_rmkeys(TransContext *txc,
14520 CollectionRef& c,
14521 OnodeRef& o,
14522 bufferlist& bl)
14523{
14524 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14525 int r = 0;
11fdf7f2 14526 auto p = bl.cbegin();
14527 __u32 num;
14528 string final_key;
14529
14530 if (!o->onode.has_omap()) {
14531 goto out;
14532 }
11fdf7f2 14533 {
14534 const string& prefix = o->get_omap_prefix();
14535 o->get_omap_key(string(), &final_key);
14536 size_t base_key_len = final_key.size();
14537 decode(num, p);
14538 while (num--) {
14539 string key;
14540 decode(key, p);
9f95a23c 14541 final_key.resize(base_key_len); // keep prefix
14542 final_key += key;
14543 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
14544 << " <- " << key << dendl;
14545 txc->t->rmkey(prefix, final_key);
14546 }
14547 }
14548 txc->note_modified_object(o);
14549
14550 out:
14551 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14552 return r;
14553}
14554
14555int BlueStore::_omap_rmkey_range(TransContext *txc,
14556 CollectionRef& c,
14557 OnodeRef& o,
14558 const string& first, const string& last)
14559{
14560 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14561 string key_first, key_last;
14562 int r = 0;
14563 if (!o->onode.has_omap()) {
14564 goto out;
14565 }
11fdf7f2 14566 {
9f95a23c 14567 const string& prefix = o->get_omap_prefix();
11fdf7f2 14568 o->flush();
14569 o->get_omap_key(first, &key_first);
14570 o->get_omap_key(last, &key_last);
14571 txc->t->rm_range_keys(prefix, key_first, key_last);
14572 dout(20) << __func__ << " remove range start: "
14573 << pretty_binary_string(key_first) << " end: "
14574 << pretty_binary_string(key_last) << dendl;
14575 }
14576 txc->note_modified_object(o);
14577
14578 out:
14579 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14580 return r;
14581}
14582
14583int BlueStore::_set_alloc_hint(
14584 TransContext *txc,
14585 CollectionRef& c,
14586 OnodeRef& o,
14587 uint64_t expected_object_size,
14588 uint64_t expected_write_size,
14589 uint32_t flags)
14590{
14591 dout(15) << __func__ << " " << c->cid << " " << o->oid
14592 << " object_size " << expected_object_size
14593 << " write_size " << expected_write_size
14594 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
14595 << dendl;
14596 int r = 0;
14597 o->onode.expected_object_size = expected_object_size;
14598 o->onode.expected_write_size = expected_write_size;
14599 o->onode.alloc_hint_flags = flags;
14600 txc->write_onode(o);
14601 dout(10) << __func__ << " " << c->cid << " " << o->oid
14602 << " object_size " << expected_object_size
14603 << " write_size " << expected_write_size
14604 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
14605 << " = " << r << dendl;
14606 return r;
14607}
14608
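// Clone oldo into newo (same hash required): either share extents
// copy-on-write via _do_clone_range (bluestore_clone_cow) or fall back
// to a full read/rewrite. Attrs are copied wholesale; omap keys are
// iterated and rewritten under the destination's omap prefix.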
14609int BlueStore::_clone(TransContext *txc,
14610 CollectionRef& c,
14611 OnodeRef& oldo,
14612 OnodeRef& newo)
14613{
14614 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14615 << newo->oid << dendl;
14616 int r = 0;
14617 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
14618 derr << __func__ << " mismatched hash on " << oldo->oid
14619 << " and " << newo->oid << dendl;
14620 return -EINVAL;
14621 }
14622
14623 _assign_nid(txc, newo);
14624
14625 // clone data
14626 oldo->flush();
14627 _do_truncate(txc, c, newo, 0);
14628 if (cct->_conf->bluestore_clone_cow) {
14629 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
14630 } else {
14631 bufferlist bl;
14632 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
14633 if (r < 0)
14634 goto out;
14635 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
14636 if (r < 0)
14637 goto out;
14638 }
14639
14640 // clone attrs
14641 newo->onode.attrs = oldo->onode.attrs;
14642
14643 // clone omap
14644 if (newo->onode.has_omap()) {
14645 dout(20) << __func__ << " clearing old omap data" << dendl;
14646 newo->flush();
9f95a23c 14647 _do_omap_clear(txc, newo);
494da23a 14648 newo->onode.clear_omap_flag();
14649 }
14650 if (oldo->onode.has_omap()) {
14651 dout(20) << __func__ << " copying omap data" << dendl;
494da23a 14652 if (newo->oid.is_pgmeta()) {
14653 newo->onode.set_omap_flags_pgmeta();
14654 } else {
14655 newo->onode.set_omap_flags();
7c673cae 14656 }
9f95a23c 14657 const string& prefix = newo->get_omap_prefix();
11fdf7f2 14658 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 14659 string head, tail;
14660 oldo->get_omap_header(&head);
14661 oldo->get_omap_tail(&tail);
14662 it->lower_bound(head);
14663 while (it->valid()) {
14664 if (it->key() >= tail) {
14665 dout(30) << __func__ << " reached tail" << dendl;
14666 break;
14667 } else {
14668 dout(30) << __func__ << " got header/data "
14669 << pretty_binary_string(it->key()) << dendl;
14670 string key;
9f95a23c 14671 newo->rewrite_omap_key(it->key(), &key);
11fdf7f2 14672 txc->t->set(prefix, key, it->value());
14673 }
14674 it->next();
14675 }
14676 string new_tail;
14677 bufferlist new_tail_value;
9f95a23c 14678 newo->get_omap_tail(&new_tail);
494da23a 14679 txc->t->set(prefix, new_tail, new_tail_value);
14680 }
14681
14682 txc->write_onode(newo);
14683 r = 0;
14684
14685 out:
14686 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14687 << newo->oid << " = " << r << dendl;
14688 return r;
14689}
14690
14691int BlueStore::_do_clone_range(
14692 TransContext *txc,
14693 CollectionRef& c,
14694 OnodeRef& oldo,
14695 OnodeRef& newo,
14696 uint64_t srcoff,
14697 uint64_t length,
14698 uint64_t dstoff)
14699{
14700 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14701 << newo->oid
14702 << " 0x" << std::hex << srcoff << "~" << length << " -> "
14703 << " 0x" << dstoff << "~" << length << std::dec << dendl;
14704 oldo->extent_map.fault_range(db, srcoff, length);
14705 newo->extent_map.fault_range(db, dstoff, length);
14706 _dump_onode<30>(cct, *oldo);
14707 _dump_onode<30>(cct, *newo);
7c673cae 14708
11fdf7f2 14709 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
14710 _dump_onode<30>(cct, *oldo);
14711 _dump_onode<30>(cct, *newo);
14712 return 0;
14713}
14714
14715int BlueStore::_clone_range(TransContext *txc,
14716 CollectionRef& c,
14717 OnodeRef& oldo,
14718 OnodeRef& newo,
14719 uint64_t srcoff, uint64_t length, uint64_t dstoff)
14720{
14721 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14722 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
14723 << " to offset 0x" << dstoff << std::dec << dendl;
14724 int r = 0;
14725
14726 if (srcoff + length >= OBJECT_MAX_SIZE ||
14727 dstoff + length >= OBJECT_MAX_SIZE) {
14728 r = -E2BIG;
14729 goto out;
14730 }
14731 if (srcoff + length > oldo->onode.size) {
14732 r = -EINVAL;
14733 goto out;
14734 }
14735
14736 _assign_nid(txc, newo);
14737
14738 if (length > 0) {
14739 if (cct->_conf->bluestore_clone_cow) {
14740 _do_zero(txc, c, newo, dstoff, length);
14741 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
14742 } else {
14743 bufferlist bl;
14744 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
14745 if (r < 0)
14746 goto out;
14747 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
14748 if (r < 0)
14749 goto out;
14750 }
14751 }
14752
14753 txc->write_onode(newo);
14754 r = 0;
14755
14756 out:
14757 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14758 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
14759 << " to offset 0x" << dstoff << std::dec
14760 << " = " << r << dendl;
14761 return r;
14762}
14763
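// Rename: delete the old onode and extent-shard keys, mark all shards
// dirty so they are rewritten under the new key, and move the cached
// Onode to its new name.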
14764int BlueStore::_rename(TransContext *txc,
14765 CollectionRef& c,
14766 OnodeRef& oldo,
14767 OnodeRef& newo,
14768 const ghobject_t& new_oid)
14769{
14770 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14771 << new_oid << dendl;
14772 int r;
14773 ghobject_t old_oid = oldo->oid;
31f18b77 14774 mempool::bluestore_cache_other::string new_okey;
14775
14776 if (newo) {
14777 if (newo->exists) {
14778 r = -EEXIST;
14779 goto out;
14780 }
11fdf7f2 14781 ceph_assert(txc->onodes.count(newo) == 0);
14782 }
14783
14784 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
14785
14786 // rewrite shards
14787 {
14788 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
14789 get_object_key(cct, new_oid, &new_okey);
14790 string key;
14791 for (auto &s : oldo->extent_map.shards) {
14792 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
14793 [&](const string& final_key) {
14794 txc->t->rmkey(PREFIX_OBJ, final_key);
14795 }
14796 );
14797 s.dirty = true;
14798 }
14799 }
14800
14801 newo = oldo;
14802 txc->write_onode(newo);
14803
14804 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
14805 // Onode in the old slot
14806 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
14807 r = 0;
14808
14809 // hold a ref to new Onode in old name position, to ensure we don't drop
14810 // it from the cache before this txc commits (or else someone may come along
14811 // and read newo's metadata via the old name).
14812 txc->note_modified_object(oldo);
14813
14814 out:
14815 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
14816 << new_oid << " = " << r << dendl;
14817 return r;
14818}
14819
14820// collections
14821
14822int BlueStore::_create_collection(
14823 TransContext *txc,
14824 const coll_t &cid,
14825 unsigned bits,
14826 CollectionRef *c)
14827{
14828 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
14829 int r;
14830 bufferlist bl;
14831
14832 {
9f95a23c 14833 std::unique_lock l(coll_lock);
14834 if (*c) {
14835 r = -EEXIST;
14836 goto out;
14837 }
14838 auto p = new_coll_map.find(cid);
14839 ceph_assert(p != new_coll_map.end());
14840 *c = p->second;
14841 (*c)->cnode.bits = bits;
14842 coll_map[cid] = *c;
11fdf7f2 14843 new_coll_map.erase(p);
7c673cae 14844 }
11fdf7f2 14845 encode((*c)->cnode, bl);
14846 txc->t->set(PREFIX_COLL, stringify(cid), bl);
14847 r = 0;
14848
14849 out:
14850 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
14851 return r;
14852}
14853
14854int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
14855 CollectionRef *c)
14856{
14857 dout(15) << __func__ << " " << cid << dendl;
14858 int r;
14859
11fdf7f2 14860 (*c)->flush_all_but_last();
7c673cae 14861 {
9f95a23c 14862 std::unique_lock l(coll_lock);
14863 if (!*c) {
14864 r = -ENOENT;
14865 goto out;
14866 }
14867 size_t nonexistent_count = 0;
11fdf7f2 14868 ceph_assert((*c)->exists);
14869 if ((*c)->onode_map.map_any([&](OnodeRef o) {
14870 if (o->exists) {
14871 dout(1) << __func__ << " " << o->oid << " " << o
14872 << " exists in onode_map" << dendl;
14873 return true;
14874 }
14875 ++nonexistent_count;
14876 return false;
14877 })) {
14878 r = -ENOTEMPTY;
14879 goto out;
14880 }
14881
14882 vector<ghobject_t> ls;
14883 ghobject_t next;
14884 // Enumerate onodes in db, up to nonexistent_count + 1,
14885 // then check if all of them are marked as non-existent.
11fdf7f2 14886 // Bypass the check if (next != ghobject_t::get_max())
14887 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
14888 nonexistent_count + 1, &ls, &next);
14889 if (r >= 0) {
14890 // If true, the collection has more objects than nonexistent_count,
14891 // so bypass the check.
14892 bool exists = (!next.is_max());
14893 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
14894 dout(10) << __func__ << " oid " << *it << dendl;
14895 auto onode = (*c)->onode_map.lookup(*it);
14896 exists = !onode || onode->exists;
14897 if (exists) {
14898 dout(1) << __func__ << " " << *it
14899 << " exists in db, "
14900 << (!onode ? "not present in ram" : "present in ram")
14901 << dendl;
14902 }
14903 }
14904 if (!exists) {
11fdf7f2 14905 _do_remove_collection(txc, c);
14906 r = 0;
14907 } else {
14908 dout(10) << __func__ << " " << cid
14909 << " is non-empty" << dendl;
14910 r = -ENOTEMPTY;
14911 }
14912 }
14913 }
14914
14915 out:
14916 dout(10) << __func__ << " " << cid << " = " << r << dendl;
14917 return r;
14918}
14919
14920void BlueStore::_do_remove_collection(TransContext *txc,
14921 CollectionRef *c)
14922{
14923 coll_map.erase((*c)->cid);
14924 txc->removed_collections.push_back(*c);
14925 (*c)->exists = false;
14926 _osr_register_zombie((*c)->osr.get());
14927 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
14928 c->reset();
14929}
14930
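// PG split: drain deferred writes, move cached onodes and shared blobs
// that hash into the child over to the new collection, and persist the
// updated bits on the parent's cnode.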
14931int BlueStore::_split_collection(TransContext *txc,
14932 CollectionRef& c,
14933 CollectionRef& d,
14934 unsigned bits, int rem)
14935{
14936 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
14937 << " bits " << bits << dendl;
14938 std::unique_lock l(c->lock);
14939 std::unique_lock l2(d->lock);
14940 int r;
14941
14942 // flush all previous deferred writes on this sequencer. this is a bit
14943 // heavyweight, but we need to make sure all deferred writes complete
14944 // before we split as the new collection's sequencer may need to order
14945 // this after those writes, and we don't bother with the complexity of
14946 // moving those TransContexts over to the new osr.
14947 _osr_drain_preceding(txc);
14948
14949 // move any cached items (onodes and referenced shared blobs) that will
14950 // belong to the child collection post-split. leave everything else behind.
14951 // this may include things that don't strictly belong to the now-smaller
14952 // parent split, but the OSD will always send us a split for every new
14953 // child.
14954
14955 spg_t pgid, dest_pgid;
14956 bool is_pg = c->cid.is_pg(&pgid);
11fdf7f2 14957 ceph_assert(is_pg);
7c673cae 14958 is_pg = d->cid.is_pg(&dest_pgid);
11fdf7f2 14959 ceph_assert(is_pg);
14960
14961 // the destination should initially be empty.
14962 ceph_assert(d->onode_map.empty());
14963 ceph_assert(d->shared_blob_set.empty());
14964 ceph_assert(d->cnode.bits == bits);
14965
14966 c->split_cache(d.get());
14967
14968 // adjust bits. note that this will be redundant for all but the first
14969 // split call for this parent (first child).
14970 c->cnode.bits = bits;
11fdf7f2 14971 ceph_assert(d->cnode.bits == bits);
14972 r = 0;
14973
14974 bufferlist bl;
11fdf7f2 14975 encode(c->cnode, bl);
14976 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
14977
14978 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
14979 << " bits " << bits << " = " << r << dendl;
14980 return r;
14981}
14982
14983int BlueStore::_merge_collection(
14984 TransContext *txc,
14985 CollectionRef *c,
14986 CollectionRef& d,
14987 unsigned bits)
14988{
14989 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
14990 << " bits " << bits << dendl;
14991 std::unique_lock l((*c)->lock);
14992 std::unique_lock l2(d->lock);
14993 int r;
14994
14995 coll_t cid = (*c)->cid;
14996
14997 // flush all previous deferred writes on the source collection to ensure
14998 // that all deferred writes complete before we merge as the target collection's
14999 // sequencer may need to order new ops after those writes.
15000
15001 _osr_drain((*c)->osr.get());
15002
15003 // move the cached items (onodes and referenced shared blobs) from the
15004 // source collection into the target collection, where they will all
15005 // belong once the merge completes. nothing is left behind in the
15006 // source, which is removed below after its cache has been handed
15007 // over.
15008
15009 spg_t pgid, dest_pgid;
15010 bool is_pg = cid.is_pg(&pgid);
15011 ceph_assert(is_pg);
15012 is_pg = d->cid.is_pg(&dest_pgid);
15013 ceph_assert(is_pg);
15014
15015 // adjust bits. note that this will be redundant for all but the first
15016 // merge call for the parent/target.
15017 d->cnode.bits = bits;
15018
15019 // behavior depends on target (d) bits, so do this after those are updated.
15020 (*c)->split_cache(d.get());
15021
15022 // remove source collection
15023 {
9f95a23c 15024 std::unique_lock l3(coll_lock);
15025 _do_remove_collection(txc, c);
15026 }
15027
15028 r = 0;
15029
15030 bufferlist bl;
15031 encode(d->cnode, bl);
15032 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
15033
15034 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
15035 << " bits " << bits << " = " << r << dendl;
15036 return r;
15037}
15038
15039void BlueStore::log_latency(
15040 const char* name,
15041 int idx,
15042 const ceph::timespan& l,
15043 double lat_threshold,
15044 const char* info) const
15045{
15046 logger->tinc(idx, l);
15047 if (lat_threshold > 0.0 &&
15048 l >= make_timespan(lat_threshold)) {
15049 dout(0) << __func__ << " slow operation observed for " << name
15050 << ", latency = " << l
15051 << info
15052 << dendl;
15053 }
15054}
15055
11fdf7f2 15056void BlueStore::log_latency_fn(
494da23a 15057 const char* name,
15058 int idx,
15059 const ceph::timespan& l,
15060 double lat_threshold,
15061 std::function<string (const ceph::timespan& lat)> fn) const
11fdf7f2 15062{
15063 logger->tinc(idx, l);
15064 if (lat_threshold > 0.0 &&
15065 l >= make_timespan(lat_threshold)) {
15066 dout(0) << __func__ << " slow operation observed for " << name
15067 << ", latency = " << l
15068 << fn(l)
15069 << dendl;
15070 }
15071}
15072
15073#if defined(WITH_LTTNG)
15074void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
15075 KeyValueDB &db,
15076 TransContext &txc,
15077 mono_clock::time_point start_throttle_acquire)
15078{
15079 pending_kv_ios += txc.ios;
15080 if (txc.deferred_txn) {
15081 pending_deferred_ios += txc.ios;
15082 }
15083
15084 uint64_t started = 0;
15085 uint64_t completed = 0;
15086 if (should_trace(&started, &completed)) {
15087 txc.tracing = true;
15088 uint64_t rocksdb_base_level,
15089 rocksdb_estimate_pending_compaction_bytes,
15090 rocksdb_cur_size_all_mem_tables,
15091 rocksdb_compaction_pending,
15092 rocksdb_mem_table_flush_pending,
15093 rocksdb_num_running_compactions,
15094 rocksdb_num_running_flushes,
15095 rocksdb_actual_delayed_write_rate;
15096 db.get_property(
15097 "rocksdb.base-level",
15098 &rocksdb_base_level);
15099 db.get_property(
15100 "rocksdb.estimate-pending-compaction-bytes",
15101 &rocksdb_estimate_pending_compaction_bytes);
15102 db.get_property(
15103 "rocksdb.cur-size-all-mem-tables",
15104 &rocksdb_cur_size_all_mem_tables);
15105 db.get_property(
15106 "rocksdb.compaction-pending",
15107 &rocksdb_compaction_pending);
15108 db.get_property(
15109 "rocksdb.mem-table-flush-pending",
15110 &rocksdb_mem_table_flush_pending);
15111 db.get_property(
15112 "rocksdb.num-running-compactions",
15113 &rocksdb_num_running_compactions);
15114 db.get_property(
15115 "rocksdb.num-running-flushes",
15116 &rocksdb_num_running_flushes);
15117 db.get_property(
15118 "rocksdb.actual-delayed-write-rate",
15119 &rocksdb_actual_delayed_write_rate);
15120
15121
15122 tracepoint(
15123 bluestore,
15124 transaction_initial_state,
15125 txc.osr->get_sequencer_id(),
15126 txc.seq,
15127 throttle_bytes.get_current(),
15128 throttle_deferred_bytes.get_current(),
15129 pending_kv_ios,
15130 pending_deferred_ios,
15131 started,
15132 completed,
15133 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
15134
15135 tracepoint(
15136 bluestore,
15137 transaction_initial_state_rocksdb,
15138 txc.osr->get_sequencer_id(),
15139 txc.seq,
15140 rocksdb_base_level,
15141 rocksdb_estimate_pending_compaction_bytes,
15142 rocksdb_cur_size_all_mem_tables,
15143 rocksdb_compaction_pending,
15144 rocksdb_mem_table_flush_pending,
15145 rocksdb_num_running_compactions,
15146 rocksdb_num_running_flushes,
15147 rocksdb_actual_delayed_write_rate);
15148 }
15149}
15150#endif
15151
15152mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
15153 TransContext &txc, PerfCounters *logger, int state)
15154{
15155 mono_clock::time_point now = mono_clock::now();
15156 mono_clock::duration lat = now - txc.last_stamp;
15157 logger->tinc(state, lat);
15158#if defined(WITH_LTTNG)
15159 if (txc.tracing &&
15160 state >= l_bluestore_state_prepare_lat &&
15161 state <= l_bluestore_state_done_lat) {
15162 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
15163 tracepoint(
15164 bluestore,
15165 transaction_state_duration,
15166 txc.osr->get_sequencer_id(),
15167 txc.seq,
15168 state,
15169 ceph::to_seconds<double>(lat));
15170 }
15171#endif
15172 txc.last_stamp = now;
15173 return lat;
15174}
15175
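// Transaction admission: always take `cost` from the byte throttle
// (blocking), then for deferred transactions additionally try the
// deferred throttle without blocking. A false return means the deferred
// budget is exhausted; the caller must finish admission later via
// finish_start_transaction(), which takes the deferred throttle
// blocking.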
15176bool BlueStore::BlueStoreThrottle::try_start_transaction(
15177 KeyValueDB &db,
15178 TransContext &txc,
15179 mono_clock::time_point start_throttle_acquire)
15180{
15181 throttle_bytes.get(txc.cost);
15182
15183 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
15184 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15185 return true;
15186 } else {
15187 return false;
15188 }
15189}
15190
15191void BlueStore::BlueStoreThrottle::finish_start_transaction(
15192 KeyValueDB &db,
15193 TransContext &txc,
15194 mono_clock::time_point start_throttle_acquire)
15195{
15196 ceph_assert(txc.deferred_txn);
15197 throttle_deferred_bytes.get(txc.cost);
15198 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15199}
15200
15201#if defined(WITH_LTTNG)
15202void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
15203{
15204 pending_kv_ios -= 1;
15205 ios_completed_since_last_traced++;
15206 if (txc.tracing) {
15207 tracepoint(
15208 bluestore,
15209 transaction_commit_latency,
15210 txc.osr->get_sequencer_id(),
15211 txc.seq,
15212 ceph::to_seconds<double>(mono_clock::now() - txc.start));
15213 }
15214}
15215#endif
15216
15217#if defined(WITH_LTTNG)
15218void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
15219{
15220 if (txc.deferred_txn) {
15221 pending_deferred_ios -= 1;
15222 }
15223 if (txc.tracing) {
15224 mono_clock::time_point now = mono_clock::now();
15225 mono_clock::duration lat = now - txc.start;
15226 tracepoint(
15227 bluestore,
15228 transaction_total_duration,
15229 txc.osr->get_sequencer_id(),
15230 txc.seq,
15231 ceph::to_seconds<double>(lat));
15232 }
15233}
15234#endif
11fdf7f2 15235
15236// DB key value Histogram
15237#define KEY_SLAB 32
15238#define VALUE_SLAB 64
15239
15240const string prefix_onode = "o";
15241const string prefix_onode_shard = "x";
15242const string prefix_other = "Z";
15243
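// Keys and values are bucketed into fixed-width slabs for the
// histogram; e.g. with KEY_SLAB = 32, a 70-byte key lands in slab
// 70 / 32 = 2, reported as the range "[64,96)".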
15244int BlueStore::DBHistogram::get_key_slab(size_t sz)
15245{
15246 return (sz/KEY_SLAB);
15247}
15248
15249string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
15250{
15251 int lower_bound = slab * KEY_SLAB;
15252 int upper_bound = (slab + 1) * KEY_SLAB;
15253 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15254 return ret;
15255}
15256
15257int BlueStore::DBHistogram::get_value_slab(size_t sz)
15258{
15259 return (sz/VALUE_SLAB);
15260}
15261
15262string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
15263{
15264 int lower_bound = slab * VALUE_SLAB;
15265 int upper_bound = (slab + 1) * VALUE_SLAB;
15266 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15267 return ret;
15268}
15269
15270void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
15271 const string &prefix, size_t key_size, size_t value_size)
15272{
15273 uint32_t key_slab = get_key_slab(key_size);
15274 uint32_t value_slab = get_value_slab(value_size);
15275 key_hist[prefix][key_slab].count++;
15276 key_hist[prefix][key_slab].max_len =
15277 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
15278 key_hist[prefix][key_slab].val_map[value_slab].count++;
15279 key_hist[prefix][key_slab].val_map[value_slab].max_len =
15280 std::max<size_t>(value_size,
15281 key_hist[prefix][key_slab].val_map[value_slab].max_len);
15282}
15283
15284void BlueStore::DBHistogram::dump(Formatter *f)
15285{
15286 f->open_object_section("rocksdb_value_distribution");
15287 for (auto i : value_hist) {
15288 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
15289 }
15290 f->close_section();
15291
15292 f->open_object_section("rocksdb_key_value_histogram");
15293 for (auto i : key_hist) {
15294 f->dump_string("prefix", i.first);
15295 f->open_object_section("key_hist");
15296 for ( auto k : i.second) {
15297 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
15298 f->dump_unsigned("max_len", k.second.max_len);
15299 f->open_object_section("value_hist");
15300 for ( auto j : k.second.val_map) {
15301 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
15302 f->dump_unsigned("max_len", j.second.max_len);
15303 }
15304 f->close_section();
15305 }
15306 f->close_section();
15307 }
15308 f->close_section();
15309}
15310
15311// Iterates through the db and collects the stats
15312void BlueStore::generate_db_histogram(Formatter *f)
15313{
15314 //globals
15315 uint64_t num_onodes = 0;
15316 uint64_t num_shards = 0;
15317 uint64_t num_super = 0;
15318 uint64_t num_coll = 0;
15319 uint64_t num_omap = 0;
11fdf7f2 15320 uint64_t num_pgmeta_omap = 0;
15321 uint64_t num_deferred = 0;
15322 uint64_t num_alloc = 0;
15323 uint64_t num_stat = 0;
15324 uint64_t num_others = 0;
15325 uint64_t num_shared_shards = 0;
15326 size_t max_key_size =0, max_value_size = 0;
15327 uint64_t total_key_size = 0, total_value_size = 0;
15328 size_t key_size = 0, value_size = 0;
15329 DBHistogram hist;
15330
11fdf7f2 15331 auto start = coarse_mono_clock::now();
7c673cae 15332
11fdf7f2 15333 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
15334 iter->seek_to_first();
15335 while (iter->valid()) {
15336 dout(30) << __func__ << " Key: " << iter->key() << dendl;
15337 key_size = iter->key_size();
15338 value_size = iter->value_size();
15339 hist.value_hist[hist.get_value_slab(value_size)]++;
15340 max_key_size = std::max(max_key_size, key_size);
15341 max_value_size = std::max(max_value_size, value_size);
15342 total_key_size += key_size;
15343 total_value_size += value_size;
15344
15345 pair<string,string> key(iter->raw_key());
15346
15347 if (key.first == PREFIX_SUPER) {
15348 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
15349 num_super++;
15350 } else if (key.first == PREFIX_STAT) {
15351 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
15352 num_stat++;
15353 } else if (key.first == PREFIX_COLL) {
15354 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
15355 num_coll++;
15356 } else if (key.first == PREFIX_OBJ) {
15357 if (key.second.back() == ONODE_KEY_SUFFIX) {
15358 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
15359 num_onodes++;
15360 } else {
15361 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
15362 num_shards++;
15363 }
15364 } else if (key.first == PREFIX_OMAP) {
15365 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
15366 num_omap++;
15367 } else if (key.first == PREFIX_PGMETA_OMAP) {
15368 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
15369 num_pgmeta_omap++;
15370 } else if (key.first == PREFIX_DEFERRED) {
15371 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
15372 num_deferred++;
11fdf7f2 15373 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
15374 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
15375 num_alloc++;
15376 } else if (key.first == PREFIX_SHARED_BLOB) {
15377 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
15378 num_shared_shards++;
15379 } else {
15380 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
15381 num_others++;
15382 }
15383 iter->next();
15384 }
15385
11fdf7f2 15386 ceph::timespan duration = coarse_mono_clock::now() - start;
15387 f->open_object_section("rocksdb_key_value_stats");
15388 f->dump_unsigned("num_onodes", num_onodes);
15389 f->dump_unsigned("num_shards", num_shards);
15390 f->dump_unsigned("num_super", num_super);
15391 f->dump_unsigned("num_coll", num_coll);
15392 f->dump_unsigned("num_omap", num_omap);
11fdf7f2 15393 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
15394 f->dump_unsigned("num_deferred", num_deferred);
15395 f->dump_unsigned("num_alloc", num_alloc);
15396 f->dump_unsigned("num_stat", num_stat);
15397 f->dump_unsigned("num_shared_shards", num_shared_shards);
15398 f->dump_unsigned("num_others", num_others);
15399 f->dump_unsigned("max_key_size", max_key_size);
15400 f->dump_unsigned("max_value_size", max_value_size);
15401 f->dump_unsigned("total_key_size", total_key_size);
15402 f->dump_unsigned("total_value_size", total_value_size);
15403 f->close_section();
15404
15405 hist.dump(f);
15406
15407 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
15408
15409}
15410
f6b5b4d7 15411void BlueStore::_shutdown_cache()
15412{
15413 dout(10) << __func__ << dendl;
15414 for (auto i : buffer_cache_shards) {
15415 i->flush();
11fdf7f2 15416 ceph_assert(i->empty());
15417 }
15418 for (auto& p : coll_map) {
f6b5b4d7 15419 p.second->onode_map.clear();
15420 if (!p.second->shared_blob_set.empty()) {
15421 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11fdf7f2 15422 p.second->shared_blob_set.dump<0>(cct);
3efd9988 15423 }
15424 ceph_assert(p.second->onode_map.empty());
15425 ceph_assert(p.second->shared_blob_set.empty());
15426 }
15427 coll_map.clear();
15428 for (auto i : onode_cache_shards) {
15429 ceph_assert(i->empty());
15430 }
15431}
15432
15433 // For external callers.
15434 // We use a best-effort policy here, e.g.,
15435 // we don't care if there are still some pinned onodes/data in the cache
15436 // after this command is completed.
11fdf7f2 15437int BlueStore::flush_cache(ostream *os)
15438{
15439 dout(10) << __func__ << dendl;
15440 for (auto i : onode_cache_shards) {
15441 i->flush();
15442 }
15443 for (auto i : buffer_cache_shards) {
15444 i->flush();
31f18b77 15445 }
15446
15447 return 0;
15448}
15449
15450void BlueStore::_apply_padding(uint64_t head_pad,
15451 uint64_t tail_pad,
15452 bufferlist& padded)
15453{
7c673cae 15454 if (head_pad) {
224ce89b 15455 padded.prepend_zero(head_pad);
15456 }
15457 if (tail_pad) {
15458 padded.append_zero(tail_pad);
15459 }
15460 if (head_pad || tail_pad) {
15461 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
15462 << " tail 0x" << tail_pad << std::dec << dendl;
15463 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
15464 }
15465}
15466
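// Persist an onode: finalize (and if necessary reshard) the extent map,
// then bound-encode and encode the onode, its spanning blobs, and, for
// unsharded objects, the inline extent map into a single PREFIX_OBJ key.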
15467void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
15468{
15469 // finalize extent_map shards
15470 o->extent_map.update(txn, false);
15471 if (o->extent_map.needs_reshard()) {
15472 o->extent_map.reshard(db, txn);
15473 o->extent_map.update(txn, true);
15474 if (o->extent_map.needs_reshard()) {
15475 dout(20) << __func__ << " warning: still wants reshard, check options?"
15476 << dendl;
15477 o->extent_map.clear_needs_reshard();
15478 }
15479 logger->inc(l_bluestore_onode_reshard);
15480 }
15481
15482 // bound encode
15483 size_t bound = 0;
15484 denc(o->onode, bound);
15485 o->extent_map.bound_encode_spanning_blobs(bound);
15486 if (o->onode.extent_map_shards.empty()) {
15487 denc(o->extent_map.inline_bl, bound);
15488 }
15489
15490 // encode
15491 bufferlist bl;
15492 unsigned onode_part, blob_part, extent_part;
15493 {
15494 auto p = bl.get_contiguous_appender(bound, true);
15495 denc(o->onode, p);
15496 onode_part = p.get_logical_offset();
15497 o->extent_map.encode_spanning_blobs(p);
15498 blob_part = p.get_logical_offset() - onode_part;
15499 if (o->onode.extent_map_shards.empty()) {
15500 denc(o->extent_map.inline_bl, p);
15501 }
15502 extent_part = p.get_logical_offset() - onode_part - blob_part;
15503 }
15504
15505 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
15506 << " (" << onode_part << " bytes onode + "
15507 << blob_part << " bytes spanning blobs + "
15508 << extent_part << " bytes inline extents)"
15509 << dendl;
15510
15511
15512 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
15513}
15514
15515void BlueStore::_log_alerts(osd_alert_list_t& alerts)
15516{
15517 std::lock_guard l(qlock);
15518
15519 if (!disk_size_mismatch_alert.empty()) {
15520 alerts.emplace(
15521 "BLUESTORE_DISK_SIZE_MISMATCH",
15522 disk_size_mismatch_alert);
15523 }
15524 if (!legacy_statfs_alert.empty()) {
15525 alerts.emplace(
15526 "BLUESTORE_LEGACY_STATFS",
15527 legacy_statfs_alert);
15528 }
15529 if (!spillover_alert.empty() &&
15530 cct->_conf->bluestore_warn_on_bluefs_spillover) {
15531 alerts.emplace(
15532 "BLUEFS_SPILLOVER",
15533 spillover_alert);
15534 }
15535 if (!no_per_pool_omap_alert.empty()) {
15536 alerts.emplace(
15537 "BLUESTORE_NO_PER_POOL_OMAP",
15538 no_per_pool_omap_alert);
15539 }
15540 string s0(failed_cmode);
15541
15542 if (!failed_compressors.empty()) {
15543 if (!s0.empty()) {
15544 s0 += ", ";
15545 }
15546 s0 += "unable to load:";
15547 bool first = true;
15548 for (auto& s : failed_compressors) {
15549 if (first) {
15550 first = false;
15551 } else {
15552 s0 += ", ";
15553 }
15554 s0 += s;
15555 }
15556 alerts.emplace(
15557 "BLUESTORE_NO_COMPRESSION",
15558 s0);
15559 }
15560}
15561
15562void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
15563 size_t extents)
15564{
15565 alloc_stats_count++;
15566 alloc_stats_fragments += extents;
15567 alloc_stats_size += need;
15568}

void BlueStore::_record_allocation_stats()
{
  // we don't care about strict consistency here; fields can be
  // partially modified while the tuple is being built
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
          << probe_count << ":"
          << " cnt: " << std::get<0>(t0)
          << " frags: " << std::get<1>(t0)
          << " size: " << std::get<2>(t0)
          << dendl;

  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
            << base + (probe_count % base) << ": "
            << std::get<0>(t)
            << ", " << std::get<1>(t)
            << ", " << std::get<2>(t)
            << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  auto prev = probe_count++;
  auto mask = (1 << alloc_stats_history.size()) - 1;
  probe_count &= mask;

  for (size_t i = cbits(prev ^ probe_count) - 1; i > 0; --i) {
    alloc_stats_history[i] = alloc_stats_history[i - 1];
  }
  alloc_stats_history[0].swap(t0);
}
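
// Rotation sketch (illustrative, not from the original source): with a
// 5-slot history the mask is 0b11111, so probe_count cycles 0..31.
// Stepping from prev = 7 (0b00111) to probe_count = 8 (0b01000) gives
// cbits(7 ^ 8) = cbits(0b01111) = 4, so the loop shifts slots 3..1:
//
//   alloc_stats_history[3] = alloc_stats_history[2];
//   alloc_stats_history[2] = alloc_stats_history[1];
//   alloc_stats_history[1] = alloc_stats_history[0];
//   alloc_stats_history[0].swap(t0);  // freshest probe into slot 0
//
// which is how probes age toward the -2/-4/-8/-16 slots printed above.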

// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't be called a second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
                  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}
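
// Worked example (illustrative, not from the original source): with
// granularity = 0x100000 (1 MiB), an extent { offset = 0x180000,
// length = 0x200000 } yields pos = 1 and
// end_pos = 1 + 0x37ffff / 0x100000 = 4, so the bloom-filter pairs for
// 1 MiB chunks 1..3 survive (when non-empty) while chunks outside every
// extent are dropped from collections_bfs/objects_bfs.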

bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}

void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db)
{
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append("1");
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}

bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB *db,
  uint64_t sbid,
  const bufferlist* bl)
{
  KeyValueDB::Transaction txn;
  if (fix_misreferences_txn) { // reuse this txn
    txn = fix_misreferences_txn;
  } else {
    if (!fix_shared_blob_txn) {
      fix_shared_blob_txn = db->get_transaction();
    }
    txn = fix_shared_blob_txn;
  }
  string key;
  get_shared_blob_key(sbid, &key);

  ++to_repair_cnt;
  if (bl) {
    txn->set(PREFIX_SHARED_BLOB, key, *bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  return true;
}

bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}

bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}

bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}
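
// Contrast sketch (illustrative, not from the original source): the two
// fixes are mirror images -- fix_leaked() releases space fsck found
// allocated but referenced by nothing, while fix_false_free()
// re-allocates space that is referenced but marked free, e.g.:
//
//   repairer.fix_leaked(db, fm, 0x10000, 0x1000);     // orphaned extent
//   repairer.fix_false_free(db, fm, 0x20000, 0x1000); // in-use extent
//
// (`repairer`, offsets, and lengths are placeholders.)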

bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
{
  // this is just a stub to count the number of repairs properly;
  // the actual repair happens in BlueStore::_close_db_and_around()
  // while doing _sync_bluefs_and_fm
  ++out_of_sync_flag;
  ++to_repair_cnt;
  return true;
}

bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
    return true;
  }
  return false;
}

unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  if (fix_per_pool_omap_txn) {
    db->submit_transaction_sync(fix_per_pool_omap_txn);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }

  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}
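
// Usage sketch (illustrative, not from the original source): fsck in
// repair mode stages fixes into the per-category transactions above and
// flushes them all in one shot at the end, roughly:
//
//   BlueStoreRepairer repairer;
//   repairer.fix_statfs(db, key, expected_statfs);   // staged
//   repairer.remove_key(db, PREFIX_OBJ, stray_key);  // staged
//   unsigned repaired = repairer.apply(db);          // all submitted here
//
// (`key`, `expected_statfs`, and `stray_key` are placeholders.)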

// =======================================================
// RocksDBBlueFSVolumeSelector

uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
      // DB-level data that spilled to the slow device could end up on
      // the db device as well, hence include it in the estimate
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
        db_avail4slow,
        max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
        res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}
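
// Worked example (illustrative, not from the original source): say
// db_total = 30 GiB, the observed LOG/WAL/DB maxima on the DB device
// plus DB-level spillover add up to max_db_use = 12 GiB, and
// db_avail4slow = 10 GiB.  Then avail = min(10 GiB, 30 GiB - 12 GiB)
// = 10 GiB, so a LEVEL_SLOW hint keeps resolving to BDEV_DB until SLOW
// data on the DB device reaches that 10 GiB budget.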

void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
{
  res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
  res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
}

void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to
    // match up with bluestore. the slow device is always the second
    // one (when a dedicated block.db device is present and used at
    // bdev 0). the wal device is always last.
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    }
    else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}
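
// Round-trip sketch (illustrative, not from the original source):
// BlueFS feeds the hint returned here back into select_prefer_bdev()
// when placing a file, so directory names steer device choice:
//
//   void* h = selector.get_hint_by_dir("db.wal");   // -> LEVEL_WAL
//   uint8_t bdev = selector.select_prefer_bdev(h);  // -> BlueFS::BDEV_WAL
//
// (`selector` is a placeholder RocksDBBlueFSVolumeSelector instance.)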

void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
  auto max_x = per_level_per_dev_usage.get_max_x();
  auto max_y = per_level_per_dev_usage.get_max_y();
  sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
       << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
       << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
       << ", db_avail:" << db_avail4slow << std::endl
       << "Usage matrix:" << std::endl;
  constexpr std::array<const char*, 8> names{ {
    "DEV/LEV",
    "WAL",
    "DB",
    "SLOW",
    "*",
    "*",
    "REAL",
    "FILES",
  } };
  const size_t width = 12;
  for (size_t i = 0; i < names.size(); ++i) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << names[i];
  }
  sout << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(per_level_files[l]) << std::endl;
  }
  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
  sout << "MAXIMUMS:" << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x - 1; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
    if (l < max_y - 1) {
      sout << std::endl;
    }
  }
}

// =======================================================