1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <unistd.h>
16 #include <stdlib.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <fcntl.h>
20
21 #include <boost/container/flat_set.hpp>
22 #include "boost/algorithm/string.hpp"
23
24 #include "include/cpp-btree/btree_set.h"
25
26 #include "BlueStore.h"
27 #include "bluestore_common.h"
28 #include "os/kv.h"
29 #include "include/compat.h"
30 #include "include/intarith.h"
31 #include "include/stringify.h"
32 #include "include/str_map.h"
33 #include "include/util.h"
34 #include "common/errno.h"
35 #include "common/safe_io.h"
36 #include "common/PriorityCache.h"
37 #include "common/RWLock.h"
38 #include "Allocator.h"
39 #include "FreelistManager.h"
40 #include "BlueFS.h"
41 #include "BlueRocksEnv.h"
42 #include "auth/Crypto.h"
43 #include "common/EventTrace.h"
44 #include "perfglue/heap_profiler.h"
45 #include "common/blkdev.h"
46 #include "common/numa.h"
47 #include "common/pretty_binary.h"
48
49 #if defined(WITH_LTTNG)
50 #define TRACEPOINT_DEFINE
51 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
52 #include "tracing/bluestore.h"
53 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
54 #undef TRACEPOINT_DEFINE
55 #else
56 #define tracepoint(...)
57 #endif
58
59 #define dout_context cct
60 #define dout_subsys ceph_subsys_bluestore
61
62 using bid_t = decltype(BlueStore::Blob::id);
63
64 // bluestore_cache_onode
65 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
66 bluestore_cache_onode);
67
68 // bluestore_cache_other
69 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
70 bluestore_Buffer);
71 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
72 bluestore_Extent);
73 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
74 bluestore_Blob);
75 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
76 bluestore_SharedBlob);
77
78 // bluestore_txc
79 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
80 bluestore_txc);
81 using std::deque;
82 using std::min;
83 using std::make_pair;
84 using std::numeric_limits;
85 using std::pair;
86 using std::list;
87 using std::map;
88 using std::max;
89 using std::ostream;
90 using std::ostringstream;
91 using std::set;
92 using std::string;
93 using std::stringstream;
94 using std::vector;
95
96 using ceph::bufferlist;
97 using ceph::bufferptr;
98 using ceph::coarse_mono_clock;
99 using ceph::decode;
100 using ceph::encode;
101 using ceph::Formatter;
102 using ceph::JSONFormatter;
103 using ceph::make_timespan;
104 using ceph::mono_clock;
105 using ceph::mono_time;
106 using ceph::timespan_str;
107
108 // kv store prefixes
109 const string PREFIX_SUPER = "S"; // field -> value
110 const string PREFIX_STAT = "T"; // field -> value(int64 array)
111 const string PREFIX_COLL = "C"; // collection name -> cnode_t
112 const string PREFIX_OBJ = "O"; // object name -> onode_t
113 const string PREFIX_OMAP = "M"; // u64 + keyname -> value
114 const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
115 const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
116 const string PREFIX_PERPG_OMAP = "p"; // u64(pool) + u32(hash) + u64(id) + keyname -> value
117 const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
118 const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
119 const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
120 const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
121 const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
122 const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
123 const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
124
125 const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
126
127 // write a label in the first block. always use this size. note that
128 // bluefs makes a matching assumption about the location of its
129 // superblock (always the second block of the device).
130 #define BDEV_LABEL_BLOCK_SIZE 4096
131
132 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
133 #define SUPER_RESERVED 8192
134
135 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
136
137
138 /*
139 * extent map blob encoding
140 *
141 * we use the low bits of the blobid field to indicate some common scenarios
142 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
143 */
144 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
145 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
146 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
147 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
148 #define BLOBID_SHIFT_BITS 4
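// Illustrative example (see ExtentMap::{encode,decode}_some() for the
// authoritative logic): a spanning blob with id 3 whose extent is contiguous
// with the previous one, has blob_offset 0 and the same length as the
// previous extent would encode its blobid field as
//   (3 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_SPANNING | BLOBID_FLAG_SAMELENGTH |
//   BLOBID_FLAG_ZEROOFFSET | BLOBID_FLAG_CONTIGUOUS  == 0x3f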
149
150 /*
151 * object name key structure
152 *
153 * encoded u8: shard + 2^7 (so that it sorts properly)
154 * encoded u64: poolid + 2^63 (so that it sorts properly)
155 * encoded u32: hash (bit reversed)
156 *
157 * escaped string: namespace
158 *
159 * escaped string: key or object name
160 * 1 char: '<', '=', or '>'. if '=', then object key == object name, and
161 * we are done. otherwise, the object name follows.
162 * escaped string: object name (unless '=' above)
163 *
164 * encoded u64: snap
165 * encoded u64: generation
166 * 'o'
167 */
168 #define ONODE_KEY_SUFFIX 'o'
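// As a rough illustration (see get_object_key() below for the real encoding),
// an object named "foo" with an empty namespace and no separate key is keyed
// conceptually as
//   <shard><pool><hash> "!" "foo!" "=" <snap><generation> 'o'
// where '!' is the escaped-string terminator described further down.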
169
170 /*
171 * extent shard key
172 *
173 * object prefix key
174 * u32
175 * 'x'
176 */
177 #define EXTENT_SHARD_KEY_SUFFIX 'x'
178
179 /*
180 * string encoding in the key
181 *
182 * The key string needs to lexicographically sort the same way that
183 * ghobject_t does. We do this by escaping anything <= '#' with '#'
184 * plus a 2-digit hex string, and anything >= '~' with '~' plus the
185 * two hex digits.
186 *
187 * We use ! as a terminator for strings; this works because it is < #
188 * and will get escaped if it is present in the string.
189 *
190 * NOTE: There is a bug in this implementation: due to implicit
191 * character type conversion in comparison it may produce unexpected
192 * ordering. Unfortunately fixing the bug would mean invalidating the
193 * keys in existing deployments. Instead we do additional sorting
194 * where it is needed.
195 */
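// For example, with this scheme "a#b" is escaped to "a#23b!" ('#' == 0x23)
// and "p~q" to "p~7eq!" ('~' == 0x7e); the trailing '!' terminates the string.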
196 template<typename S>
197 static void append_escaped(const string &in, S *out)
198 {
199 char hexbyte[in.length() * 3 + 1];
200 char* ptr = &hexbyte[0];
201 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
202 if (*i <= '#') { // bug: unexpected result for *i > 0x7f
203 *ptr++ = '#';
204 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
205 *ptr++ = "0123456789abcdef"[*i & 0x0f];
206 } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
207 *ptr++ = '~';
208 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
209 *ptr++ = "0123456789abcdef"[*i & 0x0f];
210 } else {
211 *ptr++ = *i;
212 }
213 }
214 *ptr++ = '!';
215 out->append(hexbyte, ptr - &hexbyte[0]);
216 }
217
218 inline unsigned h2i(char c)
219 {
220 if ((c >= '0') && (c <= '9')) {
221 return c - 0x30;
222 } else if ((c >= 'a') && (c <= 'f')) {
223 return c - 'a' + 10;
224 } else if ((c >= 'A') && (c <= 'F')) {
225 return c - 'A' + 10;
226 } else {
227 return 256; // make it always larger than 255
228 }
229 }
230
231 static int decode_escaped(const char *p, string *out)
232 {
233 char buff[256];
234 char* ptr = &buff[0];
235 char* max = &buff[252];
236 const char *orig_p = p;
237 while (*p && *p != '!') {
238 if (*p == '#' || *p == '~') {
239 unsigned hex = 0;
240 p++;
241 hex = h2i(*p++) << 4;
242 if (hex > 255) {
243 return -EINVAL;
244 }
245 hex |= h2i(*p++);
246 if (hex > 255) {
247 return -EINVAL;
248 }
249 *ptr++ = hex;
250 } else {
251 *ptr++ = *p++;
252 }
253 if (ptr > max) {
254 out->append(buff, ptr-buff);
255 ptr = &buff[0];
256 }
257 }
258 if (ptr != buff) {
259 out->append(buff, ptr-buff);
260 }
261 return p - orig_p;
262 }
263
264 template<typename T>
265 static void _key_encode_shard(shard_id_t shard, T *key)
266 {
267 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
268 }
269
270 static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
271 {
272 pshard->id = (uint8_t)*key - (uint8_t)0x80;
273 return key + 1;
274 }
275
276 static void get_coll_range(const coll_t& cid, int bits,
277 ghobject_t *temp_start, ghobject_t *temp_end,
278 ghobject_t *start, ghobject_t *end)
279 {
280 spg_t pgid;
281 if (cid.is_pg(&pgid)) {
282 start->shard_id = pgid.shard;
283 *temp_start = *start;
284
285 start->hobj.pool = pgid.pool();
286 temp_start->hobj.pool = -2ll - pgid.pool();
287
288 *end = *start;
289 *temp_end = *temp_start;
290
291 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
292 start->hobj.set_bitwise_key_u32(reverse_hash);
293 temp_start->hobj.set_bitwise_key_u32(reverse_hash);
294
295 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
296 if (end_hash > 0xffffffffull)
297 end_hash = 0xffffffffull;
298
299 end->hobj.set_bitwise_key_u32(end_hash);
300 temp_end->hobj.set_bitwise_key_u32(end_hash);
301 } else {
302 start->shard_id = shard_id_t::NO_SHARD;
303 start->hobj.pool = -1ull;
304
305 *end = *start;
306 start->hobj.set_bitwise_key_u32(0);
307 end->hobj.set_bitwise_key_u32(0xffffffff);
308
309 // no separate temp section
310 *temp_start = *end;
311 *temp_end = *end;
312 }
313
314 start->generation = 0;
315 end->generation = 0;
316 temp_start->generation = 0;
317 temp_end->generation = 0;
318 }
319
320 static void get_shared_blob_key(uint64_t sbid, string *key)
321 {
322 key->clear();
323 _key_encode_u64(sbid, key);
324 }
325
326 static int get_key_shared_blob(const string& key, uint64_t *sbid)
327 {
328 const char *p = key.c_str();
329 if (key.length() < sizeof(uint64_t))
330 return -1;
331 _key_decode_u64(p, sbid);
332 return 0;
333 }
334
335 template<typename S>
336 static void _key_encode_prefix(const ghobject_t& oid, S *key)
337 {
338 _key_encode_shard(oid.shard_id, key);
339 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
340 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
341 }
342
343 static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
344 {
345 p = _key_decode_shard(p, &oid->shard_id);
346
347 uint64_t pool;
348 p = _key_decode_u64(p, &pool);
349 oid->hobj.pool = pool - 0x8000000000000000ull;
350
351 unsigned hash;
352 p = _key_decode_u32(p, &hash);
353
354 oid->hobj.set_bitwise_key_u32(hash);
355
356 return p;
357 }
358
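// 13 bytes: shard (1) + pool (8) + bit-reversed hash (4), matching
// _key_encode_prefix() above.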
359 #define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
360
361 template<typename S>
362 static int get_key_object(const S& key, ghobject_t *oid)
363 {
364 int r;
365 const char *p = key.c_str();
366
367 if (key.length() < ENCODED_KEY_PREFIX_LEN)
368 return -1;
369
370 p = _key_decode_prefix(p, oid);
371
372 if (key.length() == ENCODED_KEY_PREFIX_LEN)
373 return -2;
374
375 r = decode_escaped(p, &oid->hobj.nspace);
376 if (r < 0)
377 return -2;
378 p += r + 1;
379
380 string k;
381 r = decode_escaped(p, &k);
382 if (r < 0)
383 return -3;
384 p += r + 1;
385 if (*p == '=') {
386 // no key
387 ++p;
388 oid->hobj.oid.name = k;
389 } else if (*p == '<' || *p == '>') {
390 // key + name
391 ++p;
392 r = decode_escaped(p, &oid->hobj.oid.name);
393 if (r < 0)
394 return -5;
395 p += r + 1;
396 oid->hobj.set_key(k);
397 } else {
398 // malformed
399 return -6;
400 }
401
402 p = _key_decode_u64(p, &oid->hobj.snap.val);
403 p = _key_decode_u64(p, &oid->generation);
404
405 if (*p != ONODE_KEY_SUFFIX) {
406 return -7;
407 }
408 p++;
409 if (*p) {
410 // if we get something other than a null terminator here,
411 // something is wrong.
412 return -8;
413 }
414
415 return 0;
416 }
417
418 template<typename S>
419 static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
420 {
421 key->clear();
422
423 size_t max_len = ENCODED_KEY_PREFIX_LEN +
424 (oid.hobj.nspace.length() * 3 + 1) +
425 (oid.hobj.get_key().length() * 3 + 1) +
426 1 + // for '<', '=', or '>'
427 (oid.hobj.oid.name.length() * 3 + 1) +
428 8 + 8 + 1;
429 key->reserve(max_len);
430
431 _key_encode_prefix(oid, key);
432
433 append_escaped(oid.hobj.nspace, key);
434
435 if (oid.hobj.get_key().length()) {
436 // is a key... could be < = or >.
437 append_escaped(oid.hobj.get_key(), key);
438 // (ASCII chars < = and > sort in that order, yay)
439 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
440 if (r) {
441 key->append(r > 0 ? ">" : "<");
442 append_escaped(oid.hobj.oid.name, key);
443 } else {
444 // same as no key
445 key->append("=");
446 }
447 } else {
448 // no key
449 append_escaped(oid.hobj.oid.name, key);
450 key->append("=");
451 }
452
453 _key_encode_u64(oid.hobj.snap, key);
454 _key_encode_u64(oid.generation, key);
455
456 key->push_back(ONODE_KEY_SUFFIX);
457
458 // sanity check
459 if (true) {
460 ghobject_t t;
461 int r = get_key_object(*key, &t);
462 if (r || t != oid) {
463 derr << " r " << r << dendl;
464 derr << "key " << pretty_binary_string(*key) << dendl;
465 derr << "oid " << oid << dendl;
466 derr << " t " << t << dendl;
467 ceph_assert(r == 0 && t == oid);
468 }
469 }
470 }
471
472 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
473 // char lets us quickly test whether it is a shard key without decoding any
474 // of the prefix bytes.
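// For example, the shard covering logical offset 0x30000 of an onode with key
// K is stored under K + <u32 0x30000, encoded so keys sort by offset> + 'x'.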
475 template<typename S>
476 static void get_extent_shard_key(const S& onode_key, uint32_t offset,
477 string *key)
478 {
479 key->clear();
480 key->reserve(onode_key.length() + 4 + 1);
481 key->append(onode_key.c_str(), onode_key.size());
482 _key_encode_u32(offset, key);
483 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
484 }
485
486 static void rewrite_extent_shard_key(uint32_t offset, string *key)
487 {
488 ceph_assert(key->size() > sizeof(uint32_t) + 1);
489 ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
490 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
491 }
492
493 template<typename S>
494 static void generate_extent_shard_key_and_apply(
495 const S& onode_key,
496 uint32_t offset,
497 string *key,
498 std::function<void(const string& final_key)> apply)
499 {
500 if (key->empty()) { // make full key
501 ceph_assert(!onode_key.empty());
502 get_extent_shard_key(onode_key, offset, key);
503 } else {
504 rewrite_extent_shard_key(offset, key);
505 }
506 apply(*key);
507 }
508
509 int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
510 {
511 ceph_assert(key.size() > sizeof(uint32_t) + 1);
512 ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
513 int okey_len = key.size() - sizeof(uint32_t) - 1;
514 *onode_key = key.substr(0, okey_len);
515 const char *p = key.data() + okey_len;
516 _key_decode_u32(p, offset);
517 return 0;
518 }
519
520 static bool is_extent_shard_key(const string& key)
521 {
522 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
523 }
524
525 static void get_deferred_key(uint64_t seq, string *out)
526 {
527 _key_encode_u64(seq, out);
528 }
529
530 static void get_pool_stat_key(int64_t pool_id, string *key)
531 {
532 key->clear();
533 _key_encode_u64(pool_id, key);
534 }
535
536 static int get_key_pool_stat(const string& key, uint64_t* pool_id)
537 {
538 const char *p = key.c_str();
539 if (key.length() < sizeof(uint64_t))
540 return -1;
541 _key_decode_u64(p, pool_id);
542 return 0;
543 }
544
545
546 template <int LogLevelV>
547 void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
548 {
549 uint64_t pos = 0;
550 for (auto& s : em.shards) {
551 dout(LogLevelV) << __func__ << " shard " << *s.shard_info
552 << (s.loaded ? " (loaded)" : "")
553 << (s.dirty ? " (dirty)" : "")
554 << dendl;
555 }
556 for (auto& e : em.extent_map) {
557 dout(LogLevelV) << __func__ << " " << e << dendl;
558 ceph_assert(e.logical_offset >= pos);
559 pos = e.logical_offset + e.length;
560 const bluestore_blob_t& blob = e.blob->get_blob();
561 if (blob.has_csum()) {
562 vector<uint64_t> v;
563 unsigned n = blob.get_csum_count();
564 for (unsigned i = 0; i < n; ++i)
565 v.push_back(blob.get_csum_item(i));
566 dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
567 << dendl;
568 }
569 std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
570 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
571 dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
572 << "~" << i.second->length << std::dec
573 << " " << *i.second << dendl;
574 }
575 }
576 }
577
578 template <int LogLevelV>
579 void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
580 {
581 if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
582 return;
583 dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
584 << " nid " << o.onode.nid
585 << " size 0x" << std::hex << o.onode.size
586 << " (" << std::dec << o.onode.size << ")"
587 << " expected_object_size " << o.onode.expected_object_size
588 << " expected_write_size " << o.onode.expected_write_size
589 << " in " << o.onode.extent_map_shards.size() << " shards"
590 << ", " << o.extent_map.spanning_blob_map.size()
591 << " spanning blobs"
592 << dendl;
593 for (auto p = o.onode.attrs.begin();
594 p != o.onode.attrs.end();
595 ++p) {
596 dout(LogLevelV) << __func__ << " attr " << p->first
597 << " len " << p->second.length() << dendl;
598 }
599 _dump_extent_map<LogLevelV>(cct, o.extent_map);
600 }
601
602 template <int LogLevelV>
603 void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
604 {
605 dout(LogLevelV) << __func__ << " transaction dump:\n";
606 JSONFormatter f(true);
607 f.open_object_section("transaction");
608 t->dump(&f);
609 f.close_section();
610 f.flush(*_dout);
611 *_dout << dendl;
612 }
613
614 // Buffer
615
616 ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
617 {
618 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
619 << b.offset << "~" << b.length << std::dec
620 << " " << BlueStore::Buffer::get_state_name(b.state);
621 if (b.flags)
622 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
623 return out << ")";
624 }
625
626 namespace {
627
628 /*
629 * Due to a bug in the key string encoding (see the comment for append_escaped)
630 * the KeyValueDB iterator does not sort lexicographically the same way
631 * that ghobject_t does: objects with the same hash may be in the wrong order.
632 *
633 * This is the iterator wrapper that fixes the key order.
634 */
635
636 class CollectionListIterator {
637 public:
638 CollectionListIterator(const KeyValueDB::Iterator &it)
639 : m_it(it) {
640 }
641 virtual ~CollectionListIterator() {
642 }
643
644 virtual bool valid() const = 0;
645 virtual const ghobject_t &oid() const = 0;
646 virtual void lower_bound(const ghobject_t &oid) = 0;
647 virtual void upper_bound(const ghobject_t &oid) = 0;
648 virtual void next() = 0;
649
650 virtual int cmp(const ghobject_t &oid) const = 0;
651
652 bool is_ge(const ghobject_t &oid) const {
653 return cmp(oid) >= 0;
654 }
655
656 bool is_lt(const ghobject_t &oid) const {
657 return cmp(oid) < 0;
658 }
659
660 protected:
661 KeyValueDB::Iterator m_it;
662 };
663
664 class SimpleCollectionListIterator : public CollectionListIterator {
665 public:
666 SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
667 : CollectionListIterator(it), m_cct(cct) {
668 }
669
670 bool valid() const override {
671 return m_it->valid();
672 }
673
674 const ghobject_t &oid() const override {
675 ceph_assert(valid());
676
677 return m_oid;
678 }
679
680 void lower_bound(const ghobject_t &oid) override {
681 string key;
682 get_object_key(m_cct, oid, &key);
683
684 m_it->lower_bound(key);
685 get_oid();
686 }
687
688 void upper_bound(const ghobject_t &oid) override {
689 string key;
690 get_object_key(m_cct, oid, &key);
691
692 m_it->upper_bound(key);
693 get_oid();
694 }
695
696 void next() override {
697 ceph_assert(valid());
698
699 m_it->next();
700 get_oid();
701 }
702
703 int cmp(const ghobject_t &oid) const override {
704 ceph_assert(valid());
705
706 string key;
707 get_object_key(m_cct, oid, &key);
708
709 return m_it->key().compare(key);
710 }
711
712 private:
713 CephContext *m_cct;
714 ghobject_t m_oid;
715
716 void get_oid() {
717 m_oid = ghobject_t();
718 while (m_it->valid() && is_extent_shard_key(m_it->key())) {
719 m_it->next();
720 }
721 if (!valid()) {
722 return;
723 }
724
725 int r = get_key_object(m_it->key(), &m_oid);
726 ceph_assert(r == 0);
727 }
728 };
729
730 class SortedCollectionListIterator : public CollectionListIterator {
731 public:
732 SortedCollectionListIterator(const KeyValueDB::Iterator &it)
733 : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
734 }
735
736 bool valid() const override {
737 return m_chunk_iter != m_chunk.end();
738 }
739
740 const ghobject_t &oid() const override {
741 ceph_assert(valid());
742
743 return m_chunk_iter->first;
744 }
745
746 void lower_bound(const ghobject_t &oid) override {
747 std::string key;
748 _key_encode_prefix(oid, &key);
749
750 m_it->lower_bound(key);
751 m_chunk_iter = m_chunk.end();
752 if (!get_next_chunk()) {
753 return;
754 }
755
756 if (this->oid().shard_id != oid.shard_id ||
757 this->oid().hobj.pool != oid.hobj.pool ||
758 this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
759 return;
760 }
761
762 m_chunk_iter = m_chunk.lower_bound(oid);
763 if (m_chunk_iter == m_chunk.end()) {
764 get_next_chunk();
765 }
766 }
767
768 void upper_bound(const ghobject_t &oid) override {
769 lower_bound(oid);
770
771 if (valid() && this->oid() == oid) {
772 next();
773 }
774 }
775
776 void next() override {
777 ceph_assert(valid());
778
779 m_chunk_iter++;
780 if (m_chunk_iter == m_chunk.end()) {
781 get_next_chunk();
782 }
783 }
784
785 int cmp(const ghobject_t &oid) const override {
786 ceph_assert(valid());
787
788 if (this->oid() < oid) {
789 return -1;
790 }
791 if (this->oid() > oid) {
792 return 1;
793 }
794 return 0;
795 }
796
797 private:
798 std::map<ghobject_t, std::string> m_chunk;
799 std::map<ghobject_t, std::string>::iterator m_chunk_iter;
800
801 bool get_next_chunk() {
802 while (m_it->valid() && is_extent_shard_key(m_it->key())) {
803 m_it->next();
804 }
805
806 if (!m_it->valid()) {
807 return false;
808 }
809
810 ghobject_t oid;
811 int r = get_key_object(m_it->key(), &oid);
812 ceph_assert(r == 0);
813
814 m_chunk.clear();
815 while (true) {
816 m_chunk.insert({oid, m_it->key()});
817
818 do {
819 m_it->next();
820 } while (m_it->valid() && is_extent_shard_key(m_it->key()));
821
822 if (!m_it->valid()) {
823 break;
824 }
825
826 ghobject_t next;
827 r = get_key_object(m_it->key(), &next);
828 ceph_assert(r == 0);
829 if (next.shard_id != oid.shard_id ||
830 next.hobj.pool != oid.hobj.pool ||
831 next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
832 break;
833 }
834 oid = next;
835 }
836
837 m_chunk_iter = m_chunk.begin();
838 return true;
839 }
840 };
841
842 } // anonymous namespace
843
844 // Garbage Collector
845
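// Roughly speaking: walk the extents that protrude beyond the region touched
// by the current write and, for each compressed blob referenced there,
// compare how many allocation units the blob would release if collected
// against how many would be needed to rewrite its still-referenced data;
// blobs where the release outweighs the rewrite cost become GC candidates.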
846 void BlueStore::GarbageCollector::process_protrusive_extents(
847 const BlueStore::ExtentMap& extent_map,
848 uint64_t start_offset,
849 uint64_t end_offset,
850 uint64_t start_touch_offset,
851 uint64_t end_touch_offset,
852 uint64_t min_alloc_size)
853 {
854 ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
855
856 uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
857 uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
858
859 dout(30) << __func__ << " (hex): [" << std::hex
860 << lookup_start_offset << ", " << lookup_end_offset
861 << ")" << std::dec << dendl;
862
863 for (auto it = extent_map.seek_lextent(lookup_start_offset);
864 it != extent_map.extent_map.end() &&
865 it->logical_offset < lookup_end_offset;
866 ++it) {
867 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
868 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
869
870 dout(30) << __func__ << " " << *it
871 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
872 << dendl;
873
874 Blob* b = it->blob.get();
875
876 if (it->logical_offset >= start_touch_offset &&
877 it->logical_end() <= end_touch_offset) {
878 // Process extents within the range affected by
879 // the current write request.
880 // Need to take into account if existing extents
881 // can be merged with them (uncompressed case)
882 if (!b->get_blob().is_compressed()) {
883 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
884 --blob_info_counted->expected_allocations; // don't need to allocate
885 // new AU for compressed
886 // data since another
887 // collocated uncompressed
888 // blob already exists
889 dout(30) << __func__ << " --expected:"
890 << alloc_unit_start << dendl;
891 }
892 used_alloc_unit = alloc_unit_end;
893 blob_info_counted = nullptr;
894 }
895 } else if (b->get_blob().is_compressed()) {
896
897 // additionally we take compressed blobs that were not impacted
898 // by the write into account too
899 BlobInfo& bi =
900 affected_blobs.emplace(
901 b, BlobInfo(b->get_referenced_bytes())).first->second;
902
903 int adjust =
904 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
905 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
906 dout(30) << __func__ << " expected_allocations="
907 << bi.expected_allocations << " end_au:"
908 << alloc_unit_end << dendl;
909
910 blob_info_counted = &bi;
911 used_alloc_unit = alloc_unit_end;
912
913 ceph_assert(it->length <= bi.referenced_bytes);
914 bi.referenced_bytes -= it->length;
915 dout(30) << __func__ << " affected_blob:" << *b
916 << " unref 0x" << std::hex << it->length
917 << " referenced = 0x" << bi.referenced_bytes
918 << std::dec << dendl;
919 // NOTE: we can't move a specific blob to the resulting GC list here
920 // when its reference counter reaches 0, since subsequent extents might
921 // decrement its expected_allocations.
922 // Hence we need to enumerate all the extents first.
923 if (!bi.collect_candidate) {
924 bi.first_lextent = it;
925 bi.collect_candidate = true;
926 }
927 bi.last_lextent = it;
928 } else {
929 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
930 // don't need to allocate new AU for compressed data since another
931 // collocated uncompressed blob already exists
932 --blob_info_counted->expected_allocations;
933 dout(30) << __func__ << " --expected_allocations:"
934 << alloc_unit_start << dendl;
935 }
936 used_alloc_unit = alloc_unit_end;
937 blob_info_counted = nullptr;
938 }
939 }
940
941 for (auto b_it = affected_blobs.begin();
942 b_it != affected_blobs.end();
943 ++b_it) {
944 Blob* b = b_it->first;
945 BlobInfo& bi = b_it->second;
946 if (bi.referenced_bytes == 0) {
947 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
948 int64_t blob_expected_for_release =
949 round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
950
951 dout(30) << __func__ << " " << *(b_it->first)
952 << " expected4release=" << blob_expected_for_release
953 << " expected_allocations=" << bi.expected_allocations
954 << dendl;
955 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
956 if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
957 if (bi.collect_candidate) {
958 auto it = bi.first_lextent;
959 bool bExit = false;
960 do {
961 if (it->blob.get() == b) {
962 extents_to_collect.insert(it->logical_offset, it->length);
963 }
964 bExit = it == bi.last_lextent;
965 ++it;
966 } while (!bExit);
967 }
968 expected_for_release += blob_expected_for_release;
969 expected_allocations += bi.expected_allocations;
970 }
971 }
972 }
973 }
974
975 int64_t BlueStore::GarbageCollector::estimate(
976 uint64_t start_offset,
977 uint64_t length,
978 const BlueStore::ExtentMap& extent_map,
979 const BlueStore::old_extent_map_t& old_extents,
980 uint64_t min_alloc_size)
981 {
982
983 affected_blobs.clear();
984 extents_to_collect.clear();
985 used_alloc_unit = boost::optional<uint64_t>();
986 blob_info_counted = nullptr;
987
988 uint64_t gc_start_offset = start_offset;
989 uint64_t gc_end_offset = start_offset + length;
990
991 uint64_t end_offset = start_offset + length;
992
993 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
994 Blob* b = it->e.blob.get();
995 if (b->get_blob().is_compressed()) {
996
997 // update gc_start_offset/gc_end_offset if needed
998 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
999 gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());
1000
1001 auto o = it->e.logical_offset;
1002 auto l = it->e.length;
1003
1004 uint64_t ref_bytes = b->get_referenced_bytes();
1005 // micro optimization to bypass blobs that have no more references
1006 if (ref_bytes != 0) {
1007 dout(30) << __func__ << " affected_blob:" << *b
1008 << " unref 0x" << std::hex << o << "~" << l
1009 << std::dec << dendl;
1010 affected_blobs.emplace(b, BlobInfo(ref_bytes));
1011 }
1012 }
1013 }
1014 dout(30) << __func__ << " gc range(hex): [" << std::hex
1015 << gc_start_offset << ", " << gc_end_offset
1016 << ")" << std::dec << dendl;
1017
1018 // enumerate preceding extents to check if they reference affected blobs
1019 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
1020 process_protrusive_extents(extent_map,
1021 gc_start_offset,
1022 gc_end_offset,
1023 start_offset,
1024 end_offset,
1025 min_alloc_size);
1026 }
1027 return expected_for_release - expected_allocations;
1028 }
1029
1030 // LruOnodeCacheShard
1031 struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
1032 typedef boost::intrusive::list<
1033 BlueStore::Onode,
1034 boost::intrusive::member_hook<
1035 BlueStore::Onode,
1036 boost::intrusive::list_member_hook<>,
1037 &BlueStore::Onode::lru_item> > list_t;
1038
1039 list_t lru;
1040
1041 explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
1042
1043 void _add(BlueStore::Onode* o, int level) override
1044 {
1045 if (o->put_cache()) {
1046 (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
1047 } else {
1048 ++num_pinned;
1049 }
1050 ++num; // we count both pinned and unpinned entries
1051 dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
1052 }
1053 void _rm(BlueStore::Onode* o) override
1054 {
1055 if (o->pop_cache()) {
1056 lru.erase(lru.iterator_to(*o));
1057 } else {
1058 ceph_assert(num_pinned);
1059 --num_pinned;
1060 }
1061 ceph_assert(num);
1062 --num;
1063 dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
1064 }
1065 void _pin(BlueStore::Onode* o) override
1066 {
1067 lru.erase(lru.iterator_to(*o));
1068 ++num_pinned;
1069 dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
1070 }
1071 void _unpin(BlueStore::Onode* o) override
1072 {
1073 lru.push_front(*o);
1074 ceph_assert(num_pinned);
1075 --num_pinned;
1076 dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
1077 }
1078 void _unpin_and_rm(BlueStore::Onode* o) override
1079 {
1080 o->pop_cache();
1081 ceph_assert(num_pinned);
1082 --num_pinned;
1083 ceph_assert(num);
1084 --num;
1085 }
1086 void _trim_to(uint64_t new_size) override
1087 {
1088 if (new_size >= lru.size()) {
1089 return; // don't even try
1090 }
1091 uint64_t n = lru.size() - new_size;
1092 auto p = lru.end();
1093 ceph_assert(p != lru.begin());
1094 --p;
1095 ceph_assert(num >= n);
1096 num -= n;
1097 while (n-- > 0) {
1098 BlueStore::Onode *o = &*p;
1099 dout(20) << __func__ << " rm " << o->oid << " "
1100 << o->nref << " " << o->cached << " " << o->pinned << dendl;
1101 if (p != lru.begin()) {
1102 lru.erase(p--);
1103 } else {
1104 ceph_assert(n == 0);
1105 lru.erase(p);
1106 }
1107 auto pinned = !o->pop_cache();
1108 ceph_assert(!pinned);
1109 o->c->onode_map._remove(o->oid);
1110 }
1111 }
1112 void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
1113 {
1114 if (to == this) {
1115 return;
1116 }
1117 ceph_assert(o->cached);
1118 ceph_assert(o->pinned);
1119 ceph_assert(num);
1120 ceph_assert(num_pinned);
1121 --num_pinned;
1122 --num;
1123 ++to->num_pinned;
1124 ++to->num;
1125 }
1126 void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
1127 {
1128 *onodes += num;
1129 *pinned_onodes += num_pinned;
1130 }
1131 };
1132
1133 // OnodeCacheShard
1134 BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
1135 CephContext* cct,
1136 string type,
1137 PerfCounters *logger)
1138 {
1139 BlueStore::OnodeCacheShard *c = nullptr;
1140 // Currently we only implement an LRU cache for onodes
1141 c = new LruOnodeCacheShard(cct);
1142 c->logger = logger;
1143 return c;
1144 }
1145
1146 // LruBufferCacheShard
1147 struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
1148 typedef boost::intrusive::list<
1149 BlueStore::Buffer,
1150 boost::intrusive::member_hook<
1151 BlueStore::Buffer,
1152 boost::intrusive::list_member_hook<>,
1153 &BlueStore::Buffer::lru_item> > list_t;
1154 list_t lru;
1155
1156 explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
1157
1158 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
1159 if (near) {
1160 auto q = lru.iterator_to(*near);
1161 lru.insert(q, *b);
1162 } else if (level > 0) {
1163 lru.push_front(*b);
1164 } else {
1165 lru.push_back(*b);
1166 }
1167 buffer_bytes += b->length;
1168 num = lru.size();
1169 }
1170 void _rm(BlueStore::Buffer *b) override {
1171 ceph_assert(buffer_bytes >= b->length);
1172 buffer_bytes -= b->length;
1173 auto q = lru.iterator_to(*b);
1174 lru.erase(q);
1175 num = lru.size();
1176 }
1177 void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
1178 src->_rm(b);
1179 _add(b, 0, nullptr);
1180 }
1181 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
1182 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1183 buffer_bytes += delta;
1184 }
1185 void _touch(BlueStore::Buffer *b) override {
1186 auto p = lru.iterator_to(*b);
1187 lru.erase(p);
1188 lru.push_front(*b);
1189 num = lru.size();
1190 _audit("_touch_buffer end");
1191 }
1192
1193 void _trim_to(uint64_t max) override
1194 {
1195 while (buffer_bytes > max) {
1196 auto i = lru.rbegin();
1197 if (i == lru.rend()) {
1198 // stop if lru is now empty
1199 break;
1200 }
1201
1202 BlueStore::Buffer *b = &*i;
1203 ceph_assert(b->is_clean());
1204 dout(20) << __func__ << " rm " << *b << dendl;
1205 b->space->_rm_buffer(this, b);
1206 }
1207 num = lru.size();
1208 }
1209
1210 void add_stats(uint64_t *extents,
1211 uint64_t *blobs,
1212 uint64_t *buffers,
1213 uint64_t *bytes) override {
1214 *extents += num_extents;
1215 *blobs += num_blobs;
1216 *buffers += num;
1217 *bytes += buffer_bytes;
1218 }
1219 #ifdef DEBUG_CACHE
1220 void _audit(const char *when) override
1221 {
1222 dout(10) << __func__ << " " << when << " start" << dendl;
1223 uint64_t s = 0;
1224 for (auto i = lru.begin(); i != lru.end(); ++i) {
1225 s += i->length;
1226 }
1227 if (s != buffer_bytes) {
1228 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1229 << dendl;
1230 for (auto i = lru.begin(); i != lru.end(); ++i) {
1231 derr << __func__ << " " << *i << dendl;
1232 }
1233 ceph_assert(s == buffer_bytes);
1234 }
1235 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1236 << " ok" << dendl;
1237 }
1238 #endif
1239 };
1240
1241 // TwoQBufferCacheShard
1242
1243 struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
1244 typedef boost::intrusive::list<
1245 BlueStore::Buffer,
1246 boost::intrusive::member_hook<
1247 BlueStore::Buffer,
1248 boost::intrusive::list_member_hook<>,
1249 &BlueStore::Buffer::lru_item> > list_t;
1250 list_t hot; ///< "Am" hot buffers
1251 list_t warm_in; ///< "A1in" newly warm buffers
1252 list_t warm_out; ///< "A1out" empty buffers we've evicted
1253
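// 2Q admission, as implemented by _add()/_trim_to() below: buffers seen for
// the first time enter warm_in; when trimmed from warm_in their data is
// dropped and they move to warm_out; a warm_out buffer that gets re-added
// (via the discard hint) is promoted to hot.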
1254 enum {
1255 BUFFER_NEW = 0,
1256 BUFFER_WARM_IN, ///< in warm_in
1257 BUFFER_WARM_OUT, ///< in warm_out
1258 BUFFER_HOT, ///< in hot
1259 BUFFER_TYPE_MAX
1260 };
1261
1262 uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
1263
1264 public:
1265 explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
1266
1267 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
1268 {
1269 dout(20) << __func__ << " level " << level << " near " << near
1270 << " on " << *b
1271 << " which has cache_private " << b->cache_private << dendl;
1272 if (near) {
1273 b->cache_private = near->cache_private;
1274 switch (b->cache_private) {
1275 case BUFFER_WARM_IN:
1276 warm_in.insert(warm_in.iterator_to(*near), *b);
1277 break;
1278 case BUFFER_WARM_OUT:
1279 ceph_assert(b->is_empty());
1280 warm_out.insert(warm_out.iterator_to(*near), *b);
1281 break;
1282 case BUFFER_HOT:
1283 hot.insert(hot.iterator_to(*near), *b);
1284 break;
1285 default:
1286 ceph_abort_msg("bad cache_private");
1287 }
1288 } else if (b->cache_private == BUFFER_NEW) {
1289 b->cache_private = BUFFER_WARM_IN;
1290 if (level > 0) {
1291 warm_in.push_front(*b);
1292 } else {
1293 // take caller hint to start at the back of the warm queue
1294 warm_in.push_back(*b);
1295 }
1296 } else {
1297 // we got a hint from discard
1298 switch (b->cache_private) {
1299 case BUFFER_WARM_IN:
1300 // stay in warm_in. move to front, even though 2Q doesn't actually
1301 // do this.
1302 dout(20) << __func__ << " move to front of warm " << *b << dendl;
1303 warm_in.push_front(*b);
1304 break;
1305 case BUFFER_WARM_OUT:
1306 b->cache_private = BUFFER_HOT;
1307 // move to hot. fall-thru
1308 case BUFFER_HOT:
1309 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1310 hot.push_front(*b);
1311 break;
1312 default:
1313 ceph_abort_msg("bad cache_private");
1314 }
1315 }
1316 if (!b->is_empty()) {
1317 buffer_bytes += b->length;
1318 list_bytes[b->cache_private] += b->length;
1319 }
1320 num = hot.size() + warm_in.size();
1321 }
1322
1323 void _rm(BlueStore::Buffer *b) override
1324 {
1325 dout(20) << __func__ << " " << *b << dendl;
1326 if (!b->is_empty()) {
1327 ceph_assert(buffer_bytes >= b->length);
1328 buffer_bytes -= b->length;
1329 ceph_assert(list_bytes[b->cache_private] >= b->length);
1330 list_bytes[b->cache_private] -= b->length;
1331 }
1332 switch (b->cache_private) {
1333 case BUFFER_WARM_IN:
1334 warm_in.erase(warm_in.iterator_to(*b));
1335 break;
1336 case BUFFER_WARM_OUT:
1337 warm_out.erase(warm_out.iterator_to(*b));
1338 break;
1339 case BUFFER_HOT:
1340 hot.erase(hot.iterator_to(*b));
1341 break;
1342 default:
1343 ceph_abort_msg("bad cache_private");
1344 }
1345 num = hot.size() + warm_in.size();
1346 }
1347
1348 void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
1349 {
1350 TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
1351 src->_rm(b);
1352
1353 // preserve which list we're on (even if we can't preserve the order!)
1354 switch (b->cache_private) {
1355 case BUFFER_WARM_IN:
1356 ceph_assert(!b->is_empty());
1357 warm_in.push_back(*b);
1358 break;
1359 case BUFFER_WARM_OUT:
1360 ceph_assert(b->is_empty());
1361 warm_out.push_back(*b);
1362 break;
1363 case BUFFER_HOT:
1364 ceph_assert(!b->is_empty());
1365 hot.push_back(*b);
1366 break;
1367 default:
1368 ceph_abort_msg("bad cache_private");
1369 }
1370 if (!b->is_empty()) {
1371 buffer_bytes += b->length;
1372 list_bytes[b->cache_private] += b->length;
1373 }
1374 num = hot.size() + warm_in.size();
1375 }
1376
1377 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
1378 {
1379 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1380 if (!b->is_empty()) {
1381 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1382 buffer_bytes += delta;
1383 ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
1384 list_bytes[b->cache_private] += delta;
1385 }
1386 }
1387
1388 void _touch(BlueStore::Buffer *b) override {
1389 switch (b->cache_private) {
1390 case BUFFER_WARM_IN:
1391 // do nothing (somewhat counter-intuitively!)
1392 break;
1393 case BUFFER_WARM_OUT:
1394 // move from warm_out to hot LRU
1395 ceph_abort_msg("this happens via discard hint");
1396 break;
1397 case BUFFER_HOT:
1398 // move to front of hot LRU
1399 hot.erase(hot.iterator_to(*b));
1400 hot.push_front(*b);
1401 break;
1402 }
1403 num = hot.size() + warm_in.size();
1404 _audit("_touch_buffer end");
1405 }
1406
1407 void _trim_to(uint64_t max) override
1408 {
1409 if (buffer_bytes > max) {
1410 uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
1411 uint64_t khot = max - kin;
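// e.g. with max = 64MB and a kin ratio of 0.5 (an illustrative value),
// kin = 32MB is budgeted for warm_in and khot = 32MB for hot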
1412
1413 // pre-calculate kout based on the current average buffer size, which is
1414 // typically representative (the warm_in and hot lists may change later)
1415 uint64_t kout = 0;
1416 uint64_t buffer_num = hot.size() + warm_in.size();
1417 if (buffer_num) {
1418 uint64_t avg_size = buffer_bytes / buffer_num;
1419 ceph_assert(avg_size);
1420 uint64_t calculated_num = max / avg_size;
1421 kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1422 }
1423
1424 if (list_bytes[BUFFER_HOT] < khot) {
1425 // hot is small, give slack to warm_in
1426 kin += khot - list_bytes[BUFFER_HOT];
1427 } else if (list_bytes[BUFFER_WARM_IN] < kin) {
1428 // warm_in is small, give slack to hot
1429 khot += kin - list_bytes[BUFFER_WARM_IN];
1430 }
1431
1432 // adjust warm_in list
1433 int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
1434 uint64_t evicted = 0;
1435
1436 while (to_evict_bytes > 0) {
1437 auto p = warm_in.rbegin();
1438 if (p == warm_in.rend()) {
1439 // stop if warm_in list is now empty
1440 break;
1441 }
1442
1443 BlueStore::Buffer *b = &*p;
1444 ceph_assert(b->is_clean());
1445 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1446 ceph_assert(buffer_bytes >= b->length);
1447 buffer_bytes -= b->length;
1448 ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
1449 list_bytes[BUFFER_WARM_IN] -= b->length;
1450 to_evict_bytes -= b->length;
1451 evicted += b->length;
1452 b->state = BlueStore::Buffer::STATE_EMPTY;
1453 b->data.clear();
1454 warm_in.erase(warm_in.iterator_to(*b));
1455 warm_out.push_front(*b);
1456 b->cache_private = BUFFER_WARM_OUT;
1457 }
1458
1459 if (evicted > 0) {
1460 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1461 << " from warm_in list, done evicting warm_in buffers"
1462 << dendl;
1463 }
1464
1465 // adjust hot list
1466 to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
1467 evicted = 0;
1468
1469 while (to_evict_bytes > 0) {
1470 auto p = hot.rbegin();
1471 if (p == hot.rend()) {
1472 // stop if hot list is now empty
1473 break;
1474 }
1475
1476 BlueStore::Buffer *b = &*p;
1477 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1478 ceph_assert(b->is_clean());
1479 // adjust evict size before buffer goes invalid
1480 to_evict_bytes -= b->length;
1481 evicted += b->length;
1482 b->space->_rm_buffer(this, b);
1483 }
1484
1485 if (evicted > 0) {
1486 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1487 << " from hot list, done evicting hot buffers"
1488 << dendl;
1489 }
1490
1491 // adjust warm out list too, if necessary
1492 int64_t n = warm_out.size() - kout;
1493 while (n-- > 0) {
1494 BlueStore::Buffer *b = &*warm_out.rbegin();
1495 ceph_assert(b->is_empty());
1496 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1497 b->space->_rm_buffer(this, b);
1498 }
1499 }
1500 num = hot.size() + warm_in.size();
1501 }
1502
1503 void add_stats(uint64_t *extents,
1504 uint64_t *blobs,
1505 uint64_t *buffers,
1506 uint64_t *bytes) override {
1507 *extents += num_extents;
1508 *blobs += num_blobs;
1509 *buffers += num;
1510 *bytes += buffer_bytes;
1511 }
1512
1513 #ifdef DEBUG_CACHE
1514 void _audit(const char *when) override
1515 {
1516 dout(10) << __func__ << " " << when << " start" << dendl;
1517 uint64_t s = 0;
1518 for (auto i = hot.begin(); i != hot.end(); ++i) {
1519 s += i->length;
1520 }
1521
1522 uint64_t hot_bytes = s;
1523 if (hot_bytes != list_bytes[BUFFER_HOT]) {
1524 derr << __func__ << " hot_list_bytes "
1525 << list_bytes[BUFFER_HOT]
1526 << " != actual " << hot_bytes
1527 << dendl;
1528 ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
1529 }
1530
1531 for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
1532 s += i->length;
1533 }
1534
1535 uint64_t warm_in_bytes = s - hot_bytes;
1536 if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
1537 derr << __func__ << " warm_in_list_bytes "
1538 << list_bytes[BUFFER_WARM_IN]
1539 << " != actual " << warm_in_bytes
1540 << dendl;
1541 ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
1542 }
1543
1544 if (s != buffer_bytes) {
1545 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1546 << dendl;
1547 ceph_assert(s == buffer_bytes);
1548 }
1549
1550 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1551 << " ok" << dendl;
1552 }
1553 #endif
1554 };
1555
1556 // BufferCacheShard
1557
1558 BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
1559 CephContext* cct,
1560 string type,
1561 PerfCounters *logger)
1562 {
1563 BufferCacheShard *c = nullptr;
1564 if (type == "lru")
1565 c = new LruBufferCacheShard(cct);
1566 else if (type == "2q")
1567 c = new TwoQBufferCacheShard(cct);
1568 else
1569 ceph_abort_msg("unrecognized cache type");
1570 c->logger = logger;
1571 return c;
1572 }
1573
1574 // BufferSpace
1575
1576 #undef dout_prefix
1577 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1578
1579 void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
1580 {
1581 // note: we already hold cache->lock
1582 ldout(cache->cct, 20) << __func__ << dendl;
1583 while (!buffer_map.empty()) {
1584 _rm_buffer(cache, buffer_map.begin());
1585 }
1586 }
1587
1588 int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
1589 {
1590 // note: we already hold cache->lock
1591 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1592 << std::dec << dendl;
1593 int cache_private = 0;
1594 cache->_audit("discard start");
1595 auto i = _data_lower_bound(offset);
1596 uint32_t end = offset + length;
1597 while (i != buffer_map.end()) {
1598 Buffer *b = i->second.get();
1599 if (b->offset >= end) {
1600 break;
1601 }
1602 if (b->cache_private > cache_private) {
1603 cache_private = b->cache_private;
1604 }
1605 if (b->offset < offset) {
1606 int64_t front = offset - b->offset;
1607 if (b->end() > end) {
1608 // drop middle (split)
1609 uint32_t tail = b->end() - end;
1610 if (b->data.length()) {
1611 bufferlist bl;
1612 bl.substr_of(b->data, b->length - tail, tail);
1613 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
1614 nb->maybe_rebuild();
1615 _add_buffer(cache, nb, 0, b);
1616 } else {
1617 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
1618 b->flags),
1619 0, b);
1620 }
1621 if (!b->is_writing()) {
1622 cache->_adjust_size(b, front - (int64_t)b->length);
1623 }
1624 b->truncate(front);
1625 b->maybe_rebuild();
1626 cache->_audit("discard end 1");
1627 break;
1628 } else {
1629 // drop tail
1630 if (!b->is_writing()) {
1631 cache->_adjust_size(b, front - (int64_t)b->length);
1632 }
1633 b->truncate(front);
1634 b->maybe_rebuild();
1635 ++i;
1636 continue;
1637 }
1638 }
1639 if (b->end() <= end) {
1640 // drop entire buffer
1641 _rm_buffer(cache, i++);
1642 continue;
1643 }
1644 // drop front
1645 uint32_t keep = b->end() - end;
1646 if (b->data.length()) {
1647 bufferlist bl;
1648 bl.substr_of(b->data, b->length - keep, keep);
1649 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
1650 nb->maybe_rebuild();
1651 _add_buffer(cache, nb, 0, b);
1652 } else {
1653 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
1654 b->flags),
1655 0, b);
1656 }
1657 _rm_buffer(cache, i);
1658 cache->_audit("discard end 2");
1659 break;
1660 }
1661 return cache_private;
1662 }
1663
1664 void BlueStore::BufferSpace::read(
1665 BufferCacheShard* cache,
1666 uint32_t offset,
1667 uint32_t length,
1668 BlueStore::ready_regions_t& res,
1669 interval_set<uint32_t>& res_intervals,
1670 int flags)
1671 {
1672 res.clear();
1673 res_intervals.clear();
1674 uint32_t want_bytes = length;
1675 uint32_t end = offset + length;
1676
1677 {
1678 std::lock_guard l(cache->lock);
1679 for (auto i = _data_lower_bound(offset);
1680 i != buffer_map.end() && offset < end && i->first < end;
1681 ++i) {
1682 Buffer *b = i->second.get();
1683 ceph_assert(b->end() > offset);
1684
1685 bool val = false;
1686 if (flags & BYPASS_CLEAN_CACHE)
1687 val = b->is_writing();
1688 else
1689 val = b->is_writing() || b->is_clean();
1690 if (val) {
1691 if (b->offset < offset) {
1692 uint32_t skip = offset - b->offset;
1693 uint32_t l = min(length, b->length - skip);
1694 res[offset].substr_of(b->data, skip, l);
1695 res_intervals.insert(offset, l);
1696 offset += l;
1697 length -= l;
1698 if (!b->is_writing()) {
1699 cache->_touch(b);
1700 }
1701 continue;
1702 }
1703 if (b->offset > offset) {
1704 uint32_t gap = b->offset - offset;
1705 if (length <= gap) {
1706 break;
1707 }
1708 offset += gap;
1709 length -= gap;
1710 }
1711 if (!b->is_writing()) {
1712 cache->_touch(b);
1713 }
1714 if (b->length > length) {
1715 res[offset].substr_of(b->data, 0, length);
1716 res_intervals.insert(offset, length);
1717 break;
1718 } else {
1719 res[offset].append(b->data);
1720 res_intervals.insert(offset, b->length);
1721 if (b->length == length)
1722 break;
1723 offset += b->length;
1724 length -= b->length;
1725 }
1726 }
1727 }
1728 }
1729
1730 uint64_t hit_bytes = res_intervals.size();
1731 ceph_assert(hit_bytes <= want_bytes);
1732 uint64_t miss_bytes = want_bytes - hit_bytes;
1733 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1734 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1735 }
1736
1737 void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
1738 {
1739 auto i = writing.begin();
1740 while (i != writing.end()) {
1741 if (i->seq > seq) {
1742 break;
1743 }
1744 if (i->seq < seq) {
1745 ++i;
1746 continue;
1747 }
1748
1749 Buffer *b = &*i;
1750 ceph_assert(b->is_writing());
1751
1752 if (b->flags & Buffer::FLAG_NOCACHE) {
1753 writing.erase(i++);
1754 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1755 buffer_map.erase(b->offset);
1756 } else {
1757 b->state = Buffer::STATE_CLEAN;
1758 writing.erase(i++);
1759 b->maybe_rebuild();
1760 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1761 cache->_add(b, 1, nullptr);
1762 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1763 }
1764 }
1765 cache->_trim();
1766 cache->_audit("finish_write end");
1767 }
1768
1769 void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
1770 {
1771 std::lock_guard lk(cache->lock);
1772 if (buffer_map.empty())
1773 return;
1774
1775 auto p = --buffer_map.end();
1776 while (true) {
1777 if (p->second->end() <= pos)
1778 break;
1779
1780 if (p->second->offset < pos) {
1781 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1782 size_t left = pos - p->second->offset;
1783 size_t right = p->second->length - left;
1784 if (p->second->data.length()) {
1785 bufferlist bl;
1786 bl.substr_of(p->second->data, left, right);
1787 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1788 0, bl, p->second->flags),
1789 0, p->second.get());
1790 } else {
1791 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1792 0, right, p->second->flags),
1793 0, p->second.get());
1794 }
1795 cache->_adjust_size(p->second.get(), -right);
1796 p->second->truncate(left);
1797 break;
1798 }
1799
1800 ceph_assert(p->second->end() > pos);
1801 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1802 if (p->second->data.length()) {
1803 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1804 p->second->offset - pos, p->second->data, p->second->flags),
1805 0, p->second.get());
1806 } else {
1807 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1808 p->second->offset - pos, p->second->length, p->second->flags),
1809 0, p->second.get());
1810 }
1811 if (p == buffer_map.begin()) {
1812 _rm_buffer(cache, p);
1813 break;
1814 } else {
1815 _rm_buffer(cache, p--);
1816 }
1817 }
1818 ceph_assert(writing.empty());
1819 cache->_trim();
1820 }
1821
1822 // OnodeSpace
1823
1824 #undef dout_prefix
1825 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1826
1827 BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
1828 OnodeRef& o)
1829 {
1830 std::lock_guard l(cache->lock);
1831 auto p = onode_map.find(oid);
1832 if (p != onode_map.end()) {
1833 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1834 << " raced, returning existing " << p->second
1835 << dendl;
1836 return p->second;
1837 }
1838 ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
1839 onode_map[oid] = o;
1840 cache->_add(o.get(), 1);
1841 cache->_trim();
1842 return o;
1843 }
1844
1845 void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
1846 {
1847 ldout(cache->cct, 20) << __func__ << " " << oid << dendl;
1848 onode_map.erase(oid);
1849 }
1850
1851 BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1852 {
1853 ldout(cache->cct, 30) << __func__ << dendl;
1854 OnodeRef o;
1855 bool hit = false;
1856
1857 {
1858 std::lock_guard l(cache->lock);
1859 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1860 if (p == onode_map.end()) {
1861 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1862 } else {
1863 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1864 << " " << p->second->nref
1865 << " " << p->second->cached
1866 << " " << p->second->pinned
1867 << dendl;
1868 // This will pin the onode and implicitly touch the cache when the Onode
1869 // eventually becomes unpinned
1870 o = p->second;
1871 ceph_assert(!o->cached || o->pinned);
1872
1873 hit = true;
1874 }
1875 }
1876
1877 if (hit) {
1878 cache->logger->inc(l_bluestore_onode_hits);
1879 } else {
1880 cache->logger->inc(l_bluestore_onode_misses);
1881 }
1882 return o;
1883 }
1884
1885 void BlueStore::OnodeSpace::clear()
1886 {
1887 std::lock_guard l(cache->lock);
1888 ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
1889 for (auto &p : onode_map) {
1890 cache->_rm(p.second.get());
1891 }
1892 onode_map.clear();
1893 }
1894
1895 bool BlueStore::OnodeSpace::empty()
1896 {
1897 std::lock_guard l(cache->lock);
1898 return onode_map.empty();
1899 }
1900
1901 void BlueStore::OnodeSpace::rename(
1902 OnodeRef& oldo,
1903 const ghobject_t& old_oid,
1904 const ghobject_t& new_oid,
1905 const mempool::bluestore_cache_meta::string& new_okey)
1906 {
1907 std::lock_guard l(cache->lock);
1908 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1909 << dendl;
1910 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1911 po = onode_map.find(old_oid);
1912 pn = onode_map.find(new_oid);
1913 ceph_assert(po != pn);
1914
1915 ceph_assert(po != onode_map.end());
1916 if (pn != onode_map.end()) {
1917 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1918 << dendl;
1919 cache->_rm(pn->second.get());
1920 onode_map.erase(pn);
1921 }
1922 OnodeRef o = po->second;
1923
1924 // install a non-existent onode at old location
1925 oldo.reset(new Onode(o->c, old_oid, o->key));
1926 po->second = oldo;
1927 cache->_add(oldo.get(), 1);
1928 // add at the new position and fix oid, key.
1929 // This will pin 'o' and implicitly touch the cache
1930 // when it eventually becomes unpinned
1931 onode_map.insert(make_pair(new_oid, o));
1932 ceph_assert(o->pinned);
1933
1934 o->oid = new_oid;
1935 o->key = new_okey;
1936 cache->_trim();
1937 }
1938
1939 bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
1940 {
1941 std::lock_guard l(cache->lock);
1942 ldout(cache->cct, 20) << __func__ << dendl;
1943 for (auto& i : onode_map) {
1944 if (f(i.second.get())) {
1945 return true;
1946 }
1947 }
1948 return false;
1949 }
1950
1951 template <int LogLevelV = 30>
1952 void BlueStore::OnodeSpace::dump(CephContext *cct)
1953 {
1954 for (auto& i : onode_map) {
1955 ldout(cct, LogLevelV) << i.first << " : " << i.second
1956 << " " << i.second->nref
1957 << " " << i.second->cached
1958 << " " << i.second->pinned
1959 << dendl;
1960 }
1961 }
1962
1963 // SharedBlob
1964
1965 #undef dout_prefix
1966 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1967 #undef dout_context
1968 #define dout_context coll->store->cct
1969
1970 void BlueStore::SharedBlob::dump(Formatter* f) const
1971 {
1972 f->dump_bool("loaded", loaded);
1973 if (loaded) {
1974 persistent->dump(f);
1975 } else {
1976 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
1977 }
1978 }
1979
1980 ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1981 {
1982 out << "SharedBlob(" << &sb;
1983
1984 if (sb.loaded) {
1985 out << " loaded " << *sb.persistent;
1986 } else {
1987 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1988 }
1989 return out << ")";
1990 }
1991
1992 BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1993 : coll(_coll), sbid_unloaded(i)
1994 {
1995 ceph_assert(sbid_unloaded > 0);
1996 if (get_cache()) {
1997 get_cache()->add_blob();
1998 }
1999 }
2000
2001 BlueStore::SharedBlob::~SharedBlob()
2002 {
2003 if (loaded && persistent) {
2004 delete persistent;
2005 }
2006 }
2007
2008 void BlueStore::SharedBlob::put()
2009 {
2010 if (--nref == 0) {
2011 dout(20) << __func__ << " " << this
2012 << " removing self from set " << get_parent()
2013 << dendl;
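// Note: 'coll' may be changed concurrently (e.g. by split_cache()) while we
// wait for the cache lock, so snapshot it, lock that cache, and retry if the
// collection pointer moved underneath us.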
2014 again:
2015 auto coll_snap = coll;
2016 if (coll_snap) {
2017 std::lock_guard l(coll_snap->cache->lock);
2018 if (coll_snap != coll) {
2019 goto again;
2020 }
2021 if (!coll_snap->shared_blob_set.remove(this, true)) {
2022 // race with lookup
2023 return;
2024 }
2025 bc._clear(coll_snap->cache);
2026 coll_snap->cache->rm_blob();
2027 }
2028 delete this;
2029 }
2030 }
2031
2032 void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
2033 {
2034 ceph_assert(persistent);
2035 persistent->ref_map.get(offset, length);
2036 }
2037
2038 void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
2039 PExtentVector *r,
2040 bool *unshare)
2041 {
2042 ceph_assert(persistent);
2043 persistent->ref_map.put(offset, length, r,
2044 unshare && !*unshare ? unshare : nullptr);
2045 }
2046
2047 void BlueStore::SharedBlob::finish_write(uint64_t seq)
2048 {
2049 while (true) {
2050 BufferCacheShard *cache = coll->cache;
2051 std::lock_guard l(cache->lock);
2052 if (coll->cache != cache) {
2053 dout(20) << __func__
2054 << " raced with sb cache update, was " << cache
2055 << ", now " << coll->cache << ", retrying"
2056 << dendl;
2057 continue;
2058 }
2059 bc._finish_write(cache, seq);
2060 break;
2061 }
2062 }
2063
2064 // SharedBlobSet
2065
2066 #undef dout_prefix
2067 #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2068
2069 template <int LogLevelV = 30>
2070 void BlueStore::SharedBlobSet::dump(CephContext *cct)
2071 {
2072 std::lock_guard l(lock);
2073 for (auto& i : sb_map) {
2074 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
2075 }
2076 }
2077
2078 // Blob
2079
2080 #undef dout_prefix
2081 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2082
2083 void BlueStore::Blob::dump(Formatter* f) const
2084 {
2085 if (is_spanning()) {
2086 f->dump_unsigned("spanning_id", id);
2087 }
2088 blob.dump(f);
2089 if (shared_blob) {
2090 f->dump_object("shared", *shared_blob);
2091 }
2092 }
2093
2094 ostream& operator<<(ostream& out, const BlueStore::Blob& b)
2095 {
2096 out << "Blob(" << &b;
2097 if (b.is_spanning()) {
2098 out << " spanning " << b.id;
2099 }
2100 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
2101 if (b.shared_blob) {
2102 out << " " << *b.shared_blob;
2103 } else {
2104 out << " (shared_blob=NULL)";
2105 }
2106 out << ")";
2107 return out;
2108 }
2109
2110 void BlueStore::Blob::discard_unallocated(Collection *coll)
2111 {
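// Drop cached buffers for regions of this blob that no longer have valid
// (allocated) pextents behind them. For compressed blobs this is all or
// nothing; otherwise it is done per invalid pextent, and the blob tail may
// be pruned afterwards.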
2112 if (get_blob().is_shared()) {
2113 return;
2114 }
2115 if (get_blob().is_compressed()) {
2116 bool discard = false;
2117 bool all_invalid = true;
2118 for (auto e : get_blob().get_extents()) {
2119 if (!e.is_valid()) {
2120 discard = true;
2121 } else {
2122 all_invalid = false;
2123 }
2124 }
2125 ceph_assert(discard == all_invalid); // for a compressed blob either all
2126 // or none of the pextents are invalid.
2127 if (discard) {
2128 shared_blob->bc.discard(shared_blob->get_cache(), 0,
2129 get_blob().get_logical_length());
2130 }
2131 } else {
2132 size_t pos = 0;
2133 for (auto e : get_blob().get_extents()) {
2134 if (!e.is_valid()) {
2135 dout(20) << __func__ << " 0x" << std::hex << pos
2136 << "~" << e.length
2137 << std::dec << dendl;
2138 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
2139 }
2140 pos += e.length;
2141 }
2142 if (get_blob().can_prune_tail()) {
2143 dirty_blob().prune_tail();
2144 used_in_blob.prune_tail(get_blob().get_ondisk_length());
2145 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
2146 }
2147 }
2148 }
2149
2150 void BlueStore::Blob::get_ref(
2151 Collection *coll,
2152 uint32_t offset,
2153 uint32_t length)
2154 {
2155 // The caller has to initialize the Blob's logical length prior to incrementing
2156 // references. Otherwise one can neither determine the required
2157 // number of counters in case of per-au tracking nor obtain min_release_size
2158 // for single-counter mode.
2159 ceph_assert(get_blob().get_logical_length() != 0);
2160 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2161 << std::dec << " " << *this << dendl;
2162
2163 if (used_in_blob.is_empty()) {
2164 uint32_t min_release_size =
2165 get_blob().get_release_size(coll->store->min_alloc_size);
2166 uint64_t l = get_blob().get_logical_length();
2167 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
2168 << min_release_size << std::dec << dendl;
2169 used_in_blob.init(l, min_release_size);
2170 }
2171 used_in_blob.get(
2172 offset,
2173 length);
2174 }
2175
2176 bool BlueStore::Blob::put_ref(
2177 Collection *coll,
2178 uint32_t offset,
2179 uint32_t length,
2180 PExtentVector *r)
2181 {
2182 PExtentVector logical;
2183
2184 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2185 << std::dec << " " << *this << dendl;
2186
2187 bool empty = used_in_blob.put(
2188 offset,
2189 length,
2190 &logical);
2191 r->clear();
2192 // nothing to release
2193 if (!empty && logical.empty()) {
2194 return false;
2195 }
2196
2197 bluestore_blob_t& b = dirty_blob();
2198 return b.release_extents(empty, logical, r);
2199 }
2200
2201 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
2202 uint32_t target_blob_size,
2203 uint32_t b_offset,
2204 uint32_t *length0) {
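// Returns true if a write of *length0 bytes at blob offset b_offset can be
// placed in this blob, possibly growing the blob (but not beyond
// target_blob_size). If only a prefix of the data fits, *length0 is reduced
// to the length that can actually be reused here.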
2205 ceph_assert(min_alloc_size);
2206 ceph_assert(target_blob_size);
2207 if (!get_blob().is_mutable()) {
2208 return false;
2209 }
2210
2211 uint32_t length = *length0;
2212 uint32_t end = b_offset + length;
2213
2214 // Currently for the sake of simplicity we omit blob reuse if data is
2215 // unaligned with csum chunk. Later we can perform padding if needed.
2216 if (get_blob().has_csum() &&
2217 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2218 (end % get_blob().get_csum_chunk_size()) != 0)) {
2219 return false;
2220 }
2221
2222 auto blen = get_blob().get_logical_length();
2223 uint32_t new_blen = blen;
2224
2225 // make sure target_blob_size isn't less than current blob len
2226 target_blob_size = std::max(blen, target_blob_size);
2227
2228 if (b_offset >= blen) {
2229 // new data totally stands out of the existing blob
2230 new_blen = end;
2231 } else {
2232 // new data overlaps with the existing blob
2233 new_blen = std::max(blen, end);
2234
2235 uint32_t overlap = 0;
2236 if (new_blen > blen) {
2237 overlap = blen - b_offset;
2238 } else {
2239 overlap = length;
2240 }
2241
2242 if (!get_blob().is_unallocated(b_offset, overlap)) {
2243 // abort if any piece of the overlap has already been allocated
2244 return false;
2245 }
2246 }
2247
2248 if (new_blen > blen) {
2249 int64_t overflow = int64_t(new_blen) - target_blob_size;
2250 // Unable to decrease the provided length enough to fit into target_blob_size
2251 if (overflow >= length) {
2252 return false;
2253 }
2254
2255 // FIXME: in some cases we could reduce unused resolution
2256 if (get_blob().has_unused()) {
2257 return false;
2258 }
2259
2260 if (overflow > 0) {
2261 new_blen -= overflow;
2262 length -= overflow;
2263 *length0 = length;
2264 }
2265
2266 if (new_blen > blen) {
2267 dirty_blob().add_tail(new_blen);
2268 used_in_blob.add_tail(new_blen,
2269 get_blob().get_release_size(min_alloc_size));
2270 }
2271 }
2272 return true;
2273 }
2274
2275 void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2276 {
2277 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2278 << " start " << *this << dendl;
2279 ceph_assert(blob.can_split());
2280 ceph_assert(used_in_blob.can_split());
2281 bluestore_blob_t &lb = dirty_blob();
2282 bluestore_blob_t &rb = r->dirty_blob();
2283
2284 used_in_blob.split(
2285 blob_offset,
2286 &(r->used_in_blob));
2287
2288 lb.split(blob_offset, rb);
2289 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2290
2291 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2292 << " finish " << *this << dendl;
2293 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2294 << " and " << *r << dendl;
2295 }
2296
2297 #ifndef CACHE_BLOB_BL
2298 void BlueStore::Blob::decode(
2299 Collection *coll,
2300 bufferptr::const_iterator& p,
2301 uint64_t struct_v,
2302 uint64_t* sbid,
2303 bool include_ref_map)
2304 {
2305 denc(blob, p, struct_v);
2306 if (blob.is_shared()) {
2307 denc(*sbid, p);
2308 }
2309 if (include_ref_map) {
2310 if (struct_v > 1) {
2311 used_in_blob.decode(p);
2312 } else {
2313 used_in_blob.clear();
2314 bluestore_extent_ref_map_t legacy_ref_map;
2315 legacy_ref_map.decode(p);
2316 for (auto r : legacy_ref_map.ref_map) {
2317 get_ref(
2318 coll,
2319 r.first,
2320 r.second.refs * r.second.length);
2321 }
2322 }
2323 }
2324 }
2325 #endif
2326
2327 // Extent
2328
2329 void BlueStore::Extent::dump(Formatter* f) const
2330 {
2331 f->dump_unsigned("logical_offset", logical_offset);
2332 f->dump_unsigned("length", length);
2333 f->dump_unsigned("blob_offset", blob_offset);
2334 f->dump_object("blob", *blob);
2335 }
2336
2337 ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2338 {
2339 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2340 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2341 << " " << *e.blob;
2342 }
2343
2344 // OldExtent
2345 BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2346 uint32_t lo,
2347 uint32_t o,
2348 uint32_t l,
2349 BlobRef& b) {
2350 OldExtent* oe = new OldExtent(lo, o, l, b);
2351 b->put_ref(c.get(), o, l, &(oe->r));
2352 oe->blob_empty = !b->is_referenced();
2353 return oe;
2354 }
2355
2356 // ExtentMap
2357
2358 #undef dout_prefix
2359 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2360 #undef dout_context
2361 #define dout_context onode->c->store->cct
2362
2363 BlueStore::ExtentMap::ExtentMap(Onode *o)
2364 : onode(o),
2365 inline_bl(
2366 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2367 }
2368
2369 void BlueStore::ExtentMap::dump(Formatter* f) const
2370 {
2371 f->open_array_section("extents");
2372
2373 for (auto& e : extent_map) {
2374 f->dump_object("extent", e);
2375 }
2376 f->close_section();
2377 }
2378
2379 void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2380 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2381 uint64_t& length, uint64_t& dstoff) {
2382
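// Clone the extent range [srcoff, srcoff+length) of oldo into newo at dstoff.
// Every blob referenced by the copied extents is converted to (or loaded as)
// a shared blob so that both onodes can reference the same physical extents;
// the affected source range is then marked dirty so the sharing is persisted.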
2383 auto cct = onode->c->store->cct;
2384 bool inject_21040 =
2385 cct->_conf->bluestore_debug_inject_bug21040;
2386 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2387 for (auto& e : oldo->extent_map.extent_map) {
2388 e.blob->last_encoded_id = -1;
2389 }
2390
2391 int n = 0;
2392 uint64_t end = srcoff + length;
2393 uint32_t dirty_range_begin = 0;
2394 uint32_t dirty_range_end = 0;
2395 bool src_dirty = false;
2396 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2397 ep != oldo->extent_map.extent_map.end();
2398 ++ep) {
2399 auto& e = *ep;
2400 if (e.logical_offset >= end) {
2401 break;
2402 }
2403 dout(20) << __func__ << " src " << e << dendl;
2404 BlobRef cb;
2405 bool blob_duped = true;
2406 if (e.blob->last_encoded_id >= 0) {
2407 cb = id_to_blob[e.blob->last_encoded_id];
2408 blob_duped = false;
2409 } else {
2410 // dup the blob
2411 const bluestore_blob_t& blob = e.blob->get_blob();
2412 // make sure it is shared
2413 if (!blob.is_shared()) {
2414 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2415 if (!inject_21040 && !src_dirty) {
2416 src_dirty = true;
2417 dirty_range_begin = e.logical_offset;
2418 } else if (inject_21040 &&
2419 dirty_range_begin == 0 && dirty_range_end == 0) {
2420 dirty_range_begin = e.logical_offset;
2421 }
2422 ceph_assert(e.logical_end() > 0);
2423 // -1 to exclude next potential shard
2424 dirty_range_end = e.logical_end() - 1;
2425 } else {
2426 c->load_shared_blob(e.blob->shared_blob);
2427 }
2428 cb = new Blob();
2429 e.blob->last_encoded_id = n;
2430 id_to_blob[n] = cb;
2431 e.blob->dup(*cb);
2432 // bump the extent refs on the copied blob's extents
2433 for (auto p : blob.get_extents()) {
2434 if (p.is_valid()) {
2435 e.blob->shared_blob->get_ref(p.offset, p.length);
2436 }
2437 }
2438 txc->write_shared_blob(e.blob->shared_blob);
2439 dout(20) << __func__ << " new " << *cb << dendl;
2440 }
2441
2442 int skip_front, skip_back;
2443 if (e.logical_offset < srcoff) {
2444 skip_front = srcoff - e.logical_offset;
2445 } else {
2446 skip_front = 0;
2447 }
2448 if (e.logical_end() > end) {
2449 skip_back = e.logical_end() - end;
2450 } else {
2451 skip_back = 0;
2452 }
2453
2454 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2455 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2456 newo->extent_map.extent_map.insert(*ne);
2457 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2458 // fixme: we may leave parts of new blob unreferenced that could
2459 // be freed (relative to the shared_blob).
2460 txc->statfs_delta.stored() += ne->length;
2461 if (e.blob->get_blob().is_compressed()) {
2462 txc->statfs_delta.compressed_original() += ne->length;
2463 if (blob_duped) {
2464 txc->statfs_delta.compressed() +=
2465 cb->get_blob().get_compressed_payload_length();
2466 }
2467 }
2468 dout(20) << __func__ << " dst " << *ne << dendl;
2469 ++n;
2470 }
2471 if ((!inject_21040 && src_dirty) ||
2472 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2473 oldo->extent_map.dirty_range(dirty_range_begin,
2474 dirty_range_end - dirty_range_begin);
2475 txc->write_onode(oldo);
2476 }
2477 txc->write_onode(newo);
2478
2479 if (dstoff + length > newo->onode.size) {
2480 newo->onode.size = dstoff + length;
2481 }
2482 newo->extent_map.dirty_range(dstoff, length);
2483 }
2484 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2485 bool force)
2486 {
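// Encode and persist the extent map: either the single inline shard stored
// in the onode key, or each dirty shard as its own PREFIX_OBJ key. If a
// dirty shard grows past the configured max size (or a non-trailing shard
// shrinks below the min size), request a reshard instead; 'force' skips
// these size checks.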
2487 auto cct = onode->c->store->cct; //used by dout
2488 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2489 if (onode->onode.extent_map_shards.empty()) {
2490 if (inline_bl.length() == 0) {
2491 unsigned n;
2492 // we need to encode inline_bl to measure encoded length
2493 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
2494 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
2495 ceph_assert(!never_happen);
2496 size_t len = inline_bl.length();
2497 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2498 << " extents" << dendl;
2499 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2500 request_reshard(0, OBJECT_MAX_SIZE);
2501 return;
2502 }
2503 }
2504 // will persist in the onode key.
2505 } else {
2506 // pending shard update
2507 struct dirty_shard_t {
2508 Shard *shard;
2509 bufferlist bl;
2510 dirty_shard_t(Shard *s) : shard(s) {}
2511 };
2512 vector<dirty_shard_t> encoded_shards;
2513 // allocate slots for all shards in a single call instead of
2514 // doing multiple allocations - one per dirty shard
2515 encoded_shards.reserve(shards.size());
2516
2517 auto p = shards.begin();
2518 auto prev_p = p;
2519 while (p != shards.end()) {
2520 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
2521 auto n = p;
2522 ++n;
2523 if (p->dirty) {
2524 uint32_t endoff;
2525 if (n == shards.end()) {
2526 endoff = OBJECT_MAX_SIZE;
2527 } else {
2528 endoff = n->shard_info->offset;
2529 }
2530 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2531 bufferlist& bl = encoded_shards.back().bl;
2532 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2533 bl, &p->extents)) {
2534 if (force) {
2535 derr << __func__ << " encode_some needs reshard" << dendl;
2536 ceph_assert(!force);
2537 }
2538 }
2539 size_t len = bl.length();
2540
2541 dout(20) << __func__ << " shard 0x" << std::hex
2542 << p->shard_info->offset << std::dec << " is " << len
2543 << " bytes (was " << p->shard_info->bytes << ") from "
2544 << p->extents << " extents" << dendl;
2545
2546 if (!force) {
2547 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2548 // we are big; reshard ourselves
2549 request_reshard(p->shard_info->offset, endoff);
2550 }
2551 // avoid resharding the trailing shard, even if it is small
2552 else if (n != shards.end() &&
2553 len < g_conf()->bluestore_extent_map_shard_min_size) {
2554 ceph_assert(endoff != OBJECT_MAX_SIZE);
2555 if (p == shards.begin()) {
2556 // we are the first shard, combine with next shard
2557 request_reshard(p->shard_info->offset, endoff + 1);
2558 } else {
2559 // combine either with the previous shard or the next,
2560 // whichever is smaller
2561 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2562 request_reshard(p->shard_info->offset, endoff + 1);
2563 } else {
2564 request_reshard(prev_p->shard_info->offset, endoff);
2565 }
2566 }
2567 }
2568 }
2569 }
2570 prev_p = p;
2571 p = n;
2572 }
2573 if (needs_reshard()) {
2574 return;
2575 }
2576
2577 // schedule DB update for dirty shards
2578 string key;
2579 for (auto& it : encoded_shards) {
2580 it.shard->dirty = false;
2581 it.shard->shard_info->bytes = it.bl.length();
2582 generate_extent_shard_key_and_apply(
2583 onode->key,
2584 it.shard->shard_info->offset,
2585 &key,
2586 [&](const string& final_key) {
2587 t->set(PREFIX_OBJ, final_key, it.bl);
2588 }
2589 );
2590 }
2591 }
2592 }
2593
2594 bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2595 {
2596 if (spanning_blob_map.empty())
2597 return 0;
2598 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2599 // if it did not overflow, bid is valid and available.
2600 if (bid >= 0)
2601 return bid;
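// The sequential id overflowed the signed bid_t range, so probe for an
// unused id starting from a random point, wrapping at the end of the
// non-negative range; only if every id is taken do we dump the onode and
// abort below.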
2602 // Find the next unused bid.
2603 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2604 const auto begin_bid = bid;
2605 do {
2606 if (!spanning_blob_map.count(bid))
2607 return bid;
2608 else {
2609 bid++;
2610 if (bid < 0) bid = 0;
2611 }
2612 } while (bid != begin_bid);
2613 auto cct = onode->c->store->cct; // used by dout
2614 _dump_onode<0>(cct, *onode);
2615 ceph_abort_msg("no available blob id");
2616 }
2617
2618 void BlueStore::ExtentMap::reshard(
2619 KeyValueDB *db,
2620 KeyValueDB::Transaction t)
2621 {
2622 auto cct = onode->c->store->cct; // used by dout
2623
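// Reshard algorithm: fault in the affected range, drop the old shard keys,
// estimate the average encoded extent size, then greedily start a new shard
// whenever the running size estimate exceeds the target (with some slop when
// a boundary would cut through a blob). Finally splice the new shard_info
// entries into the onode and re-evaluate which blobs must become spanning.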
2624 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2625 << needs_reshard_end << ")" << std::dec
2626 << " of " << onode->onode.extent_map_shards.size()
2627 << " shards on " << onode->oid << dendl;
2628 for (auto& p : spanning_blob_map) {
2629 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2630 << dendl;
2631 }
2632 // determine shard index range
2633 unsigned si_begin = 0, si_end = 0;
2634 if (!shards.empty()) {
2635 while (si_begin + 1 < shards.size() &&
2636 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2637 ++si_begin;
2638 }
2639 needs_reshard_begin = shards[si_begin].shard_info->offset;
2640 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2641 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2642 needs_reshard_end = shards[si_end].shard_info->offset;
2643 break;
2644 }
2645 }
2646 if (si_end == shards.size()) {
2647 needs_reshard_end = OBJECT_MAX_SIZE;
2648 }
2649 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2650 << " over 0x[" << std::hex << needs_reshard_begin << ","
2651 << needs_reshard_end << ")" << std::dec << dendl;
2652 }
2653
2654 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2655
2656 // we may need to fault in a larger interval later; we must have all
2657 // referring extents for spanning blobs loaded in order to have
2658 // accurate use_tracker values.
2659 uint32_t spanning_scan_begin = needs_reshard_begin;
2660 uint32_t spanning_scan_end = needs_reshard_end;
2661
2662 // remove old keys
2663 string key;
2664 for (unsigned i = si_begin; i < si_end; ++i) {
2665 generate_extent_shard_key_and_apply(
2666 onode->key, shards[i].shard_info->offset, &key,
2667 [&](const string& final_key) {
2668 t->rmkey(PREFIX_OBJ, final_key);
2669 }
2670 );
2671 }
2672
2673 // calculate average extent size
2674 unsigned bytes = 0;
2675 unsigned extents = 0;
2676 if (onode->onode.extent_map_shards.empty()) {
2677 bytes = inline_bl.length();
2678 extents = extent_map.size();
2679 } else {
2680 for (unsigned i = si_begin; i < si_end; ++i) {
2681 bytes += shards[i].shard_info->bytes;
2682 extents += shards[i].extents;
2683 }
2684 }
2685 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2686 unsigned slop = target *
2687 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2688 unsigned extent_avg = bytes / std::max(1u, extents);
2689 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2690 << ", slop " << slop << dendl;
2691
2692 // reshard
2693 unsigned estimate = 0;
2694 unsigned offset = needs_reshard_begin;
2695 vector<bluestore_onode_t::shard_info> new_shard_info;
2696 unsigned max_blob_end = 0;
2697 Extent dummy(needs_reshard_begin);
2698 for (auto e = extent_map.lower_bound(dummy);
2699 e != extent_map.end();
2700 ++e) {
2701 if (e->logical_offset >= needs_reshard_end) {
2702 break;
2703 }
2704 dout(30) << " extent " << *e << dendl;
2705
2706 // disfavor shard boundaries that span a blob
2707 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2708 if (estimate &&
2709 estimate + extent_avg > target + (would_span ? slop : 0)) {
2710 // new shard
2711 if (offset == needs_reshard_begin) {
2712 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2713 new_shard_info.back().offset = offset;
2714 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2715 << std::dec << dendl;
2716 }
2717 offset = e->logical_offset;
2718 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2719 new_shard_info.back().offset = offset;
2720 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2721 << std::dec << dendl;
2722 estimate = 0;
2723 }
2724 estimate += extent_avg;
2725 unsigned bs = e->blob_start();
2726 if (bs < spanning_scan_begin) {
2727 spanning_scan_begin = bs;
2728 }
2729 uint32_t be = e->blob_end();
2730 if (be > max_blob_end) {
2731 max_blob_end = be;
2732 }
2733 if (be > spanning_scan_end) {
2734 spanning_scan_end = be;
2735 }
2736 }
2737 if (new_shard_info.empty() && (si_begin > 0 ||
2738 si_end < shards.size())) {
2739 // we resharded a partial range; we must produce at least one output
2740 // shard
2741 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2742 new_shard_info.back().offset = needs_reshard_begin;
2743 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2744 << std::dec << " (singleton degenerate case)" << dendl;
2745 }
2746
2747 auto& sv = onode->onode.extent_map_shards;
2748 dout(20) << __func__ << " new " << new_shard_info << dendl;
2749 dout(20) << __func__ << " old " << sv << dendl;
2750 if (sv.empty()) {
2751 // no old shards to keep
2752 sv.swap(new_shard_info);
2753 init_shards(true, true);
2754 } else {
2755 // splice in new shards
2756 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2757 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2758 sv.insert(
2759 sv.begin() + si_begin,
2760 new_shard_info.begin(),
2761 new_shard_info.end());
2762 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2763 si_end = si_begin + new_shard_info.size();
2764
2765 ceph_assert(sv.size() == shards.size());
2766
2767 // note that we need to update every shard_info of shards here,
2768 // as sv might have been totally re-allocated above
2769 for (unsigned i = 0; i < shards.size(); i++) {
2770 shards[i].shard_info = &sv[i];
2771 }
2772
2773 // mark newly added shards as dirty
2774 for (unsigned i = si_begin; i < si_end; ++i) {
2775 shards[i].loaded = true;
2776 shards[i].dirty = true;
2777 }
2778 }
2779 dout(20) << __func__ << " fin " << sv << dendl;
2780 inline_bl.clear();
2781
2782 if (sv.empty()) {
2783 // no more shards; unspan all previously spanning blobs
2784 auto p = spanning_blob_map.begin();
2785 while (p != spanning_blob_map.end()) {
2786 p->second->id = -1;
2787 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2788 p = spanning_blob_map.erase(p);
2789 }
2790 } else {
2791 // identify new spanning blobs
2792 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2793 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2794 if (spanning_scan_begin < needs_reshard_begin) {
2795 fault_range(db, spanning_scan_begin,
2796 needs_reshard_begin - spanning_scan_begin);
2797 }
2798 if (spanning_scan_end > needs_reshard_end) {
2799 fault_range(db, needs_reshard_end,
2800 spanning_scan_end - needs_reshard_end);
2801 }
2802 auto sp = sv.begin() + si_begin;
2803 auto esp = sv.end();
2804 unsigned shard_start = sp->offset;
2805 unsigned shard_end;
2806 ++sp;
2807 if (sp == esp) {
2808 shard_end = OBJECT_MAX_SIZE;
2809 } else {
2810 shard_end = sp->offset;
2811 }
2812 Extent dummy(needs_reshard_begin);
2813
2814 bool was_too_many_blobs_check = false;
2815 auto too_many_blobs_threshold =
2816 g_conf()->bluestore_debug_too_many_blobs_threshold;
2817 auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
2818 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2819 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
2820
2821 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2822 if (e->logical_offset >= needs_reshard_end) {
2823 break;
2824 }
2825 dout(30) << " extent " << *e << dendl;
2826 while (e->logical_offset >= shard_end) {
2827 shard_start = shard_end;
2828 ceph_assert(sp != esp);
2829 ++sp;
2830 if (sp == esp) {
2831 shard_end = OBJECT_MAX_SIZE;
2832 } else {
2833 shard_end = sp->offset;
2834 }
2835 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2836 << " to 0x" << shard_end << std::dec << dendl;
2837 }
2838
2839 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2840 if (!e->blob->is_spanning()) {
2841 // We have two options: (1) split the blob into pieces at the
2842 // shard boundaries (and adjust extents accordingly), or (2)
2843 // mark it spanning. We prefer to cut the blob if we can. Note that
2844 // we may have to split it multiple times--potentially at every
2845 // shard boundary.
2846 bool must_span = false;
2847 BlobRef b = e->blob;
2848 if (b->can_split()) {
2849 uint32_t bstart = e->blob_start();
2850 uint32_t bend = e->blob_end();
2851 for (const auto& sh : shards) {
2852 if (bstart < sh.shard_info->offset &&
2853 bend > sh.shard_info->offset) {
2854 uint32_t blob_offset = sh.shard_info->offset - bstart;
2855 if (b->can_split_at(blob_offset)) {
2856 dout(20) << __func__ << " splitting blob, bstart 0x"
2857 << std::hex << bstart << " blob_offset 0x"
2858 << blob_offset << std::dec << " " << *b << dendl;
2859 b = split_blob(b, blob_offset, sh.shard_info->offset);
2860 // switch b to the new right-hand side, in case it
2861 // *also* has to get split.
2862 bstart += blob_offset;
2863 onode->c->store->logger->inc(l_bluestore_blob_split);
2864 } else {
2865 must_span = true;
2866 break;
2867 }
2868 }
2869 }
2870 } else {
2871 must_span = true;
2872 }
2873 if (must_span) {
2874 auto bid = allocate_spanning_blob_id();
2875 b->id = bid;
2876 spanning_blob_map[b->id] = b;
2877 dout(20) << __func__ << " adding spanning " << *b << dendl;
2878 if (!was_too_many_blobs_check &&
2879 too_many_blobs_threshold &&
2880 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2881
2882 was_too_many_blobs_check = true;
2883 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2884 if (dumped_onodes[i].first == onode->oid) {
2885 oid_slot = &dumped_onodes[i];
2886 break;
2887 }
2888 if (!oldest_slot ||
2889 dumped_onodes[i].second < oldest_slot->second) {
2890 oldest_slot = &dumped_onodes[i];
2891 }
2892 }
2893 }
2894 }
2895 }
2896 } else {
2897 if (e->blob->is_spanning()) {
2898 spanning_blob_map.erase(e->blob->id);
2899 e->blob->id = -1;
2900 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2901 }
2902 }
2903 }
2904 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
2905 (oid_slot &&
2906 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
2907 if (do_dump) {
2908 dout(0) << __func__
2909 << " spanning blob count exceeds threshold, "
2910 << spanning_blob_map.size() << " spanning blobs"
2911 << dendl;
2912 _dump_onode<0>(cct, *onode);
2913 if (oid_slot) {
2914 oid_slot->second = mono_clock::now();
2915 } else {
2916 ceph_assert(oldest_slot);
2917 oldest_slot->first = onode->oid;
2918 oldest_slot->second = mono_clock::now();
2919 }
2920 }
2921 }
2922
2923 clear_needs_reshard();
2924 }
2925
2926 bool BlueStore::ExtentMap::encode_some(
2927 uint32_t offset,
2928 uint32_t length,
2929 bufferlist& bl,
2930 unsigned *pn)
2931 {
2932 Extent dummy(offset);
2933 auto start = extent_map.lower_bound(dummy);
2934 uint32_t end = offset + length;
2935
2936 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2937 // serialization only. Hence there is no specific
2938 // handling at ExtentMap level.
2939
2940 unsigned n = 0;
2941 size_t bound = 0;
2942 bool must_reshard = false;
2943 for (auto p = start;
2944 p != extent_map.end() && p->logical_offset < end;
2945 ++p, ++n) {
2946 ceph_assert(p->logical_offset >= offset);
2947 p->blob->last_encoded_id = -1;
2948 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2949 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2950 << std::dec << " hit new spanning blob " << *p << dendl;
2951 request_reshard(p->blob_start(), p->blob_end());
2952 must_reshard = true;
2953 }
2954 if (!must_reshard) {
2955 denc_varint(0, bound); // blobid
2956 denc_varint(0, bound); // logical_offset
2957 denc_varint(0, bound); // len
2958 denc_varint(0, bound); // blob_offset
2959
2960 p->blob->bound_encode(
2961 bound,
2962 struct_v,
2963 p->blob->shared_blob->get_sbid(),
2964 false);
2965 }
2966 }
2967 if (must_reshard) {
2968 return true;
2969 }
2970
2971 denc(struct_v, bound);
2972 denc_varint(0, bound); // number of extents
2973
2974 {
2975 auto app = bl.get_contiguous_appender(bound);
2976 denc(struct_v, app);
2977 denc_varint(n, app);
2978 if (pn) {
2979 *pn = n;
2980 }
2981
2982 n = 0;
2983 uint64_t pos = 0;
2984 uint64_t prev_len = 0;
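// Each extent is encoded as a varint 'blobid' whose low bits are flags:
// SPANNING (blob referenced by its spanning id), CONTIGUOUS (logical_offset
// follows the previous extent, so the gap is omitted), ZEROOFFSET and
// SAMELENGTH (blob_offset / length omitted). A non-spanning blob is encoded
// inline the first time it is seen and referenced by its last_encoded_id
// afterwards.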
2985 for (auto p = start;
2986 p != extent_map.end() && p->logical_offset < end;
2987 ++p, ++n) {
2988 unsigned blobid;
2989 bool include_blob = false;
2990 if (p->blob->is_spanning()) {
2991 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2992 blobid |= BLOBID_FLAG_SPANNING;
2993 } else if (p->blob->last_encoded_id < 0) {
2994 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2995 include_blob = true;
2996 blobid = 0; // the decoder will infer the id from n
2997 } else {
2998 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2999 }
3000 if (p->logical_offset == pos) {
3001 blobid |= BLOBID_FLAG_CONTIGUOUS;
3002 }
3003 if (p->blob_offset == 0) {
3004 blobid |= BLOBID_FLAG_ZEROOFFSET;
3005 }
3006 if (p->length == prev_len) {
3007 blobid |= BLOBID_FLAG_SAMELENGTH;
3008 } else {
3009 prev_len = p->length;
3010 }
3011 denc_varint(blobid, app);
3012 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3013 denc_varint_lowz(p->logical_offset - pos, app);
3014 }
3015 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3016 denc_varint_lowz(p->blob_offset, app);
3017 }
3018 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3019 denc_varint_lowz(p->length, app);
3020 }
3021 pos = p->logical_end();
3022 if (include_blob) {
3023 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
3024 }
3025 }
3026 }
3027 /*derr << __func__ << bl << dendl;
3028 derr << __func__ << ":";
3029 bl.hexdump(*_dout);
3030 *_dout << dendl;
3031 */
3032 return false;
3033 }
3034
3035 unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
3036 {
3037 /*
3038 derr << __func__ << ":";
3039 bl.hexdump(*_dout);
3040 *_dout << dendl;
3041 */
3042
3043 ceph_assert(bl.get_num_buffers() <= 1);
3044 auto p = bl.front().begin_deep();
3045 __u8 struct_v;
3046 denc(struct_v, p);
3047 // Version 2 differs from v1 in blob's ref_map
3048 // serialization only. Hence there is no specific
3049 // handling at ExtentMap level below.
3050 ceph_assert(struct_v == 1 || struct_v == 2);
3051
3052 uint32_t num;
3053 denc_varint(num, p);
3054 vector<BlobRef> blobs(num);
3055 uint64_t pos = 0;
3056 uint64_t prev_len = 0;
3057 unsigned n = 0;
3058
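// Mirror of encode_some(): each record starts with a varint blobid whose
// flag bits tell us which of the gap, blob_offset and length fields follow,
// and whether the blob is spanning, encoded inline, or a back-reference to
// an already-decoded blob in this shard.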
3059 while (!p.end()) {
3060 Extent *le = new Extent();
3061 uint64_t blobid;
3062 denc_varint(blobid, p);
3063 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3064 uint64_t gap;
3065 denc_varint_lowz(gap, p);
3066 pos += gap;
3067 }
3068 le->logical_offset = pos;
3069 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3070 denc_varint_lowz(le->blob_offset, p);
3071 } else {
3072 le->blob_offset = 0;
3073 }
3074 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3075 denc_varint_lowz(prev_len, p);
3076 }
3077 le->length = prev_len;
3078
3079 if (blobid & BLOBID_FLAG_SPANNING) {
3080 dout(30) << __func__ << " getting spanning blob "
3081 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
3082 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
3083 } else {
3084 blobid >>= BLOBID_SHIFT_BITS;
3085 if (blobid) {
3086 le->assign_blob(blobs[blobid - 1]);
3087 ceph_assert(le->blob);
3088 } else {
3089 Blob *b = new Blob();
3090 uint64_t sbid = 0;
3091 b->decode(onode->c, p, struct_v, &sbid, false);
3092 blobs[n] = b;
3093 onode->c->open_shared_blob(sbid, b);
3094 le->assign_blob(b);
3095 }
3096 // we build ref_map dynamically for non-spanning blobs
3097 le->blob->get_ref(
3098 onode->c,
3099 le->blob_offset,
3100 le->length);
3101 }
3102 pos += prev_len;
3103 ++n;
3104 extent_map.insert(*le);
3105 }
3106
3107 ceph_assert(n == num);
3108 return num;
3109 }
3110
3111 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
3112 {
3113 // Version 2 differs from v1 in blob's ref_map
3114 // serialization only. Hence there is no specific
3115 // handling at ExtentMap level.
3116 __u8 struct_v = 2;
3117
3118 denc(struct_v, p);
3119 denc_varint((uint32_t)0, p);
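// Bound the per-blob id cost: measure the worst-case size of one varint id
// once and charge it per spanning blob; each blob then bounds its own
// payload in the loop below.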
3120 size_t key_size = 0;
3121 denc_varint((uint32_t)0, key_size);
3122 p += spanning_blob_map.size() * key_size;
3123 for (const auto& i : spanning_blob_map) {
3124 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3125 }
3126 }
3127
3128 void BlueStore::ExtentMap::encode_spanning_blobs(
3129 bufferlist::contiguous_appender& p)
3130 {
3131 // Version 2 differs from v1 in blob's ref_map
3132 // serialization only. Hence there is no specific
3133 // handling at ExtentMap level.
3134 __u8 struct_v = 2;
3135
3136 denc(struct_v, p);
3137 denc_varint(spanning_blob_map.size(), p);
3138 for (auto& i : spanning_blob_map) {
3139 denc_varint(i.second->id, p);
3140 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3141 }
3142 }
3143
3144 void BlueStore::ExtentMap::decode_spanning_blobs(
3145 bufferptr::const_iterator& p)
3146 {
3147 __u8 struct_v;
3148 denc(struct_v, p);
3149 // Version 2 differs from v1 in blob's ref_map
3150 // serialization only. Hence there is no specific
3151 // handling at ExtentMap level.
3152 ceph_assert(struct_v == 1 || struct_v == 2);
3153
3154 unsigned n;
3155 denc_varint(n, p);
3156 while (n--) {
3157 BlobRef b(new Blob());
3158 denc_varint(b->id, p);
3159 spanning_blob_map[b->id] = b;
3160 uint64_t sbid = 0;
3161 b->decode(onode->c, p, struct_v, &sbid, true);
3162 onode->c->open_shared_blob(sbid, b);
3163 }
3164 }
3165
3166 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
3167 {
3168 shards.resize(onode->onode.extent_map_shards.size());
3169 unsigned i = 0;
3170 for (auto &s : onode->onode.extent_map_shards) {
3171 shards[i].shard_info = &s;
3172 shards[i].loaded = loaded;
3173 shards[i].dirty = dirty;
3174 ++i;
3175 }
3176 }
3177
3178 void BlueStore::ExtentMap::fault_range(
3179 KeyValueDB *db,
3180 uint32_t offset,
3181 uint32_t length)
3182 {
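// Load any shards overlapping [offset, offset+length) that are not in
// memory yet: fetch each shard's PREFIX_OBJ key from the KV store and decode
// its extents into the in-memory extent_map.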
3183 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3184 << std::dec << dendl;
3185 auto start = seek_shard(offset);
3186 auto last = seek_shard(offset + length);
3187
3188 if (start < 0)
3189 return;
3190
3191 ceph_assert(last >= start);
3192 string key;
3193 while (start <= last) {
3194 ceph_assert((size_t)start < shards.size());
3195 auto p = &shards[start];
3196 if (!p->loaded) {
3197 dout(30) << __func__ << " opening shard 0x" << std::hex
3198 << p->shard_info->offset << std::dec << dendl;
3199 bufferlist v;
3200 generate_extent_shard_key_and_apply(
3201 onode->key, p->shard_info->offset, &key,
3202 [&](const string& final_key) {
3203 int r = db->get(PREFIX_OBJ, final_key, &v);
3204 if (r < 0) {
3205 derr << __func__ << " missing shard 0x" << std::hex
3206 << p->shard_info->offset << std::dec << " for " << onode->oid
3207 << dendl;
3208 ceph_assert(r >= 0);
3209 }
3210 }
3211 );
3212 p->extents = decode_some(v);
3213 p->loaded = true;
3214 dout(20) << __func__ << " open shard 0x" << std::hex
3215 << p->shard_info->offset
3216 << " for range 0x" << offset << "~" << length << std::dec
3217 << " (" << v.length() << " bytes)" << dendl;
3218 ceph_assert(p->dirty == false);
3219 ceph_assert(v.length() == p->shard_info->bytes);
3220 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3221 } else {
3222 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3223 }
3224 ++start;
3225 }
3226 }
3227
3228 void BlueStore::ExtentMap::dirty_range(
3229 uint32_t offset,
3230 uint32_t length)
3231 {
3232 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3233 << std::dec << dendl;
3234 if (shards.empty()) {
3235 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3236 inline_bl.clear();
3237 return;
3238 }
3239 auto start = seek_shard(offset);
3240 if (length == 0) {
3241 length = 1;
3242 }
3243 auto last = seek_shard(offset + length - 1);
3244 if (start < 0)
3245 return;
3246
3247 ceph_assert(last >= start);
3248 while (start <= last) {
3249 ceph_assert((size_t)start < shards.size());
3250 auto p = &shards[start];
3251 if (!p->loaded) {
3252 derr << __func__ << "on write 0x" << std::hex << offset
3253 << "~" << length << " shard 0x" << p->shard_info->offset
3254 << std::dec << " is not loaded, can't mark dirty" << dendl;
3255 ceph_abort_msg("can't mark unloaded shard dirty");
3256 }
3257 if (!p->dirty) {
3258 dout(20) << __func__ << " mark shard 0x" << std::hex
3259 << p->shard_info->offset << std::dec << " dirty" << dendl;
3260 p->dirty = true;
3261 }
3262 ++start;
3263 }
3264 }
3265
3266 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3267 uint64_t offset)
3268 {
3269 Extent dummy(offset);
3270 return extent_map.find(dummy);
3271 }
3272
3273 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3274 uint64_t offset)
3275 {
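// lower_bound() gives the first extent starting at or after 'offset'; step
// back one entry to check whether the previous extent still covers 'offset',
// and step forward again if it ends before it.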
3276 Extent dummy(offset);
3277 auto fp = extent_map.lower_bound(dummy);
3278 if (fp != extent_map.begin()) {
3279 --fp;
3280 if (fp->logical_end() <= offset) {
3281 ++fp;
3282 }
3283 }
3284 return fp;
3285 }
3286
3287 BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3288 uint64_t offset) const
3289 {
3290 Extent dummy(offset);
3291 auto fp = extent_map.lower_bound(dummy);
3292 if (fp != extent_map.begin()) {
3293 --fp;
3294 if (fp->logical_end() <= offset) {
3295 ++fp;
3296 }
3297 }
3298 return fp;
3299 }
3300
3301 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3302 {
3303 auto fp = seek_lextent(offset);
3304 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3305 return false;
3306 }
3307 return true;
3308 }
3309
3310 int BlueStore::ExtentMap::compress_extent_map(
3311 uint64_t offset,
3312 uint64_t length)
3313 {
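// Merge adjacent lextents that map consecutive logical offsets onto
// consecutive offsets of the same blob, taking care never to merge across a
// shard boundary.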
3314 if (extent_map.empty())
3315 return 0;
3316 int removed = 0;
3317 auto p = seek_lextent(offset);
3318 if (p != extent_map.begin()) {
3319 --p; // start to the left of offset
3320 }
3321 // the caller should have just written to this region
3322 ceph_assert(p != extent_map.end());
3323
3324 // identify the *next* shard
3325 auto pshard = shards.begin();
3326 while (pshard != shards.end() &&
3327 p->logical_offset >= pshard->shard_info->offset) {
3328 ++pshard;
3329 }
3330 uint64_t shard_end;
3331 if (pshard != shards.end()) {
3332 shard_end = pshard->shard_info->offset;
3333 } else {
3334 shard_end = OBJECT_MAX_SIZE;
3335 }
3336
3337 auto n = p;
3338 for (++n; n != extent_map.end(); p = n++) {
3339 if (n->logical_offset > offset + length) {
3340 break; // stop after end
3341 }
3342 while (n != extent_map.end() &&
3343 p->logical_end() == n->logical_offset &&
3344 p->blob == n->blob &&
3345 p->blob_offset + p->length == n->blob_offset &&
3346 n->logical_offset < shard_end) {
3347 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3348 << " next shard 0x" << shard_end << std::dec
3349 << " merging " << *p << " and " << *n << dendl;
3350 p->length += n->length;
3351 rm(n++);
3352 ++removed;
3353 }
3354 if (n == extent_map.end()) {
3355 break;
3356 }
3357 if (n->logical_offset >= shard_end) {
3358 ceph_assert(pshard != shards.end());
3359 ++pshard;
3360 if (pshard != shards.end()) {
3361 shard_end = pshard->shard_info->offset;
3362 } else {
3363 shard_end = OBJECT_MAX_SIZE;
3364 }
3365 }
3366 }
3367 if (removed) {
3368 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3369 }
3370 return removed;
3371 }
3372
3373 void BlueStore::ExtentMap::punch_hole(
3374 CollectionRef &c,
3375 uint64_t offset,
3376 uint64_t length,
3377 old_extent_map_t *old_extents)
3378 {
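// Remove (and record in old_extents) all logical references in
// [offset, offset+length). Four cases per extent: split an extent that
// straddles both ends of the hole, trim the tail of one that starts before
// the hole, drop one entirely contained in the hole, or trim the head of
// one that extends past the end.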
3379 auto p = seek_lextent(offset);
3380 uint64_t end = offset + length;
3381 while (p != extent_map.end()) {
3382 if (p->logical_offset >= end) {
3383 break;
3384 }
3385 if (p->logical_offset < offset) {
3386 if (p->logical_end() > end) {
3387 // split and deref middle
3388 uint64_t front = offset - p->logical_offset;
3389 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3390 length, p->blob);
3391 old_extents->push_back(*oe);
3392 add(end,
3393 p->blob_offset + front + length,
3394 p->length - front - length,
3395 p->blob);
3396 p->length = front;
3397 break;
3398 } else {
3399 // deref tail
3400 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
3401 uint64_t keep = offset - p->logical_offset;
3402 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3403 p->length - keep, p->blob);
3404 old_extents->push_back(*oe);
3405 p->length = keep;
3406 ++p;
3407 continue;
3408 }
3409 }
3410 if (p->logical_offset + p->length <= end) {
3411 // deref whole lextent
3412 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3413 p->length, p->blob);
3414 old_extents->push_back(*oe);
3415 rm(p++);
3416 continue;
3417 }
3418 // deref head
3419 uint64_t keep = p->logical_end() - end;
3420 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3421 p->length - keep, p->blob);
3422 old_extents->push_back(*oe);
3423
3424 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3425 rm(p);
3426 break;
3427 }
3428 }
3429
3430 BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3431 CollectionRef &c,
3432 uint64_t logical_offset,
3433 uint64_t blob_offset, uint64_t length, BlobRef b,
3434 old_extent_map_t *old_extents)
3435 {
3436 // We need a completely initialized Blob to increment its ref counters.
3437 ceph_assert(b->get_blob().get_logical_length() != 0);
3438
3439 // Do get_ref prior to punch_hole to prevent putting a reused blob into
3440 // the old_extents list if we overwrite the blob completely.
3441 // This might happen during a WAL overwrite.
3442 b->get_ref(onode->c, blob_offset, length);
3443
3444 if (old_extents) {
3445 punch_hole(c, logical_offset, length, old_extents);
3446 }
3447
3448 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3449 extent_map.insert(*le);
3450 if (spans_shard(logical_offset, length)) {
3451 request_reshard(logical_offset, logical_offset + length);
3452 }
3453 return le;
3454 }
3455
3456 BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3457 BlobRef lb,
3458 uint32_t blob_offset,
3459 uint32_t pos)
3460 {
3461 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3462 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3463 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3464 << dendl;
3465 BlobRef rb = onode->c->new_blob();
3466 lb->split(onode->c, blob_offset, rb.get());
3467
3468 for (auto ep = seek_lextent(pos);
3469 ep != extent_map.end() && ep->logical_offset < end_pos;
3470 ++ep) {
3471 if (ep->blob != lb) {
3472 continue;
3473 }
3474 if (ep->logical_offset < pos) {
3475 // split extent
3476 size_t left = pos - ep->logical_offset;
3477 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3478 extent_map.insert(*ne);
3479 ep->length = left;
3480 dout(30) << __func__ << " split " << *ep << dendl;
3481 dout(30) << __func__ << " to " << *ne << dendl;
3482 } else {
3483 // switch blob
3484 ceph_assert(ep->blob_offset >= blob_offset);
3485
3486 ep->blob = rb;
3487 ep->blob_offset -= blob_offset;
3488 dout(30) << __func__ << " adjusted " << *ep << dendl;
3489 }
3490 }
3491 return rb;
3492 }
3493
3494 // Onode
3495
3496 #undef dout_prefix
3497 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3498
3499 //
3500 // A tricky thing about the Onode's ref counter is that we do an additional
3501 // increment when a newly pinned instance is detected, and -1 on unpin.
3502 // This prevents a conflict with a delete call (when nref == 0).
3503 // The latter might happen while a thread is still in the unpin() function
3504 // (e.g. waiting for lock acquisition) since nref has already been
3505 // decremented, and another thread 'putting' the instance would then release it.
3506 //
3507 void BlueStore::Onode::get() {
3508 if (++nref >= 2 && !pinned) {
3509 OnodeCacheShard* ocs = c->get_onode_cache();
3510 ocs->lock.lock();
3511 // It is possible that, while we were waiting, split_cache moved us to a different OnodeCacheShard.
3512 while (ocs != c->get_onode_cache()) {
3513 ocs->lock.unlock();
3514 ocs = c->get_onode_cache();
3515 ocs->lock.lock();
3516 }
3517 bool was_pinned = pinned;
3518 pinned = nref >= 2;
3519 // additional increment for newly pinned instance
3520 bool r = !was_pinned && pinned;
3521 if (r) {
3522 ++nref;
3523 }
3524 if (cached && r) {
3525 ocs->_pin(this);
3526 }
3527 ocs->lock.unlock();
3528 }
3529 }
3530 void BlueStore::Onode::put() {
3531 int n = --nref;
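// With nref down to 2, typically only the OnodeSpace map reference and the
// extra reference taken for the pinned state remain, i.e. no external users;
// check whether the onode should be unpinned.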
3532 if (n == 2) {
3533 OnodeCacheShard* ocs = c->get_onode_cache();
3534 ocs->lock.lock();
3535 // It is possible that, while we were waiting, split_cache moved us to a different OnodeCacheShard.
3536 while (ocs != c->get_onode_cache()) {
3537 ocs->lock.unlock();
3538 ocs = c->get_onode_cache();
3539 ocs->lock.lock();
3540 }
3541 bool need_unpin = pinned;
3542 pinned = pinned && nref > 2; // intentionally use > not >= as we have
3543 // +1 due to pinned state
3544 need_unpin = need_unpin && !pinned;
3545 if (cached && need_unpin) {
3546 if (exists) {
3547 ocs->_unpin(this);
3548 } else {
3549 ocs->_unpin_and_rm(this);
3550 // remove will also decrement nref and delete Onode
3551 c->onode_map._remove(oid);
3552 }
3553 }
3554 // additional decrement for newly unpinned instance
3555 // should be the last action since Onode can be released
3556 // at any point after this decrement
3557 if (need_unpin) {
3558 n = --nref;
3559 }
3560 ocs->lock.unlock();
3561 }
3562 if (n == 0) {
3563 delete this;
3564 }
3565 }
3566
3567 BlueStore::Onode* BlueStore::Onode::decode(
3568 CollectionRef c,
3569 const ghobject_t& oid,
3570 const string& key,
3571 const bufferlist& v)
3572 {
3573 Onode* on = new Onode(c.get(), oid, key);
3574 on->exists = true;
3575 auto p = v.front().begin_deep();
3576 on->onode.decode(p);
3577 for (auto& i : on->onode.attrs) {
3578 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3579 }
3580
3581 // initialize extent_map
3582 on->extent_map.decode_spanning_blobs(p);
3583 if (on->onode.extent_map_shards.empty()) {
3584 denc(on->extent_map.inline_bl, p);
3585 on->extent_map.decode_some(on->extent_map.inline_bl);
3586 on->extent_map.inline_bl.reassign_to_mempool(
3587 mempool::mempool_bluestore_cache_data);
3588 }
3589 else {
3590 on->extent_map.init_shards(false, false);
3591 }
3592 return on;
3593 }
3594
3595 void BlueStore::Onode::flush()
3596 {
3597 if (flushing_count.load()) {
3598 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
3599 waiting_count++;
3600 std::unique_lock l(flush_lock);
3601 while (flushing_count.load()) {
3602 flush_cond.wait(l);
3603 }
3604 waiting_count--;
3605 }
3606 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3607 }
3608
3609 void BlueStore::Onode::dump(Formatter* f) const
3610 {
3611 onode.dump(f);
3612 extent_map.dump(f);
3613 }
3614
3615 const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
3616 {
3617 if (bluestore_onode_t::is_pgmeta_omap(flags)) {
3618 return PREFIX_PGMETA_OMAP;
3619 }
3620 if (bluestore_onode_t::is_perpg_omap(flags)) {
3621 return PREFIX_PERPG_OMAP;
3622 }
3623 if (bluestore_onode_t::is_perpool_omap(flags)) {
3624 return PREFIX_PERPOOL_OMAP;
3625 }
3626 return PREFIX_OMAP;
3627 }
3628
3629 // '-' < '.' < '~' in ASCII, so the omap header sorts before all omap keys, which sort before the omap tail
3630 void BlueStore::Onode::calc_omap_header(
3631 uint8_t flags,
3632 const Onode* o,
3633 std::string* out)
3634 {
3635 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3636 if (bluestore_onode_t::is_perpg_omap(flags)) {
3637 _key_encode_u64(o->c->pool(), out);
3638 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3639 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3640 _key_encode_u64(o->c->pool(), out);
3641 }
3642 }
3643 _key_encode_u64(o->onode.nid, out);
3644 out->push_back('-');
3645 }
3646
3647 void BlueStore::Onode::calc_omap_key(uint8_t flags,
3648 const Onode* o,
3649 const std::string& key,
3650 std::string* out)
3651 {
3652 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3653 if (bluestore_onode_t::is_perpg_omap(flags)) {
3654 _key_encode_u64(o->c->pool(), out);
3655 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3656 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3657 _key_encode_u64(o->c->pool(), out);
3658 }
3659 }
3660 _key_encode_u64(o->onode.nid, out);
3661 out->push_back('.');
3662 out->append(key);
3663 }
3664
3665 void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3666 {
3667 if (!onode.is_pgmeta_omap()) {
3668 if (onode.is_perpg_omap()) {
3669 _key_encode_u64(c->pool(), out);
3670 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3671 } else if (onode.is_perpool_omap()) {
3672 _key_encode_u64(c->pool(), out);
3673 }
3674 }
3675 _key_encode_u64(onode.nid, out);
3676 out->append(old.c_str() + out->length(), old.size() - out->length());
3677 }
3678
3679 void BlueStore::Onode::calc_omap_tail(
3680 uint8_t flags,
3681 const Onode* o,
3682 std::string* out)
3683 {
3684 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3685 if (bluestore_onode_t::is_perpg_omap(flags)) {
3686 _key_encode_u64(o->c->pool(), out);
3687 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3688 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3689 _key_encode_u64(o->c->pool(), out);
3690 }
3691 }
3692 _key_encode_u64(o->onode.nid, out);
3693 out->push_back('~');
3694 }
3695
3696 void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3697 {
3698 size_t pos = sizeof(uint64_t) + 1;
3699 if (!onode.is_pgmeta_omap()) {
3700 if (onode.is_perpg_omap()) {
3701 pos += sizeof(uint64_t) + sizeof(uint32_t);
3702 } else if (onode.is_perpool_omap()) {
3703 pos += sizeof(uint64_t);
3704 }
3705 }
3706 *user_key = key.substr(pos);
3707 }
3708
3709 // =======================================================
3710 // WriteContext
3711
3712 /// Checks for writes to the same pextent within a blob
3713 bool BlueStore::WriteContext::has_conflict(
3714 BlobRef b,
3715 uint64_t loffs,
3716 uint64_t loffs_end,
3717 uint64_t min_alloc_size)
3718 {
3719 ceph_assert((loffs % min_alloc_size) == 0);
3720 ceph_assert((loffs_end % min_alloc_size) == 0);
3721 for (auto w : writes) {
3722 if (b == w.b) {
3723 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3724 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
3725 if ((loffs <= loffs2 && loffs_end > loffs2) ||
3726 (loffs >= loffs2 && loffs < loffs2_end)) {
3727 return true;
3728 }
3729 }
3730 }
3731 return false;
3732 }
3733
3734 // =======================================================
3735
3736 // DeferredBatch
3737 #undef dout_prefix
3738 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3739 #undef dout_context
3740 #define dout_context cct
3741
3742 void BlueStore::DeferredBatch::prepare_write(
3743 CephContext *cct,
3744 uint64_t seq, uint64_t offset, uint64_t length,
3745 bufferlist::const_iterator& blp)
3746 {
3747 _discard(cct, offset, length);
3748 auto i = iomap.insert(make_pair(offset, deferred_io()));
3749 ceph_assert(i.second); // this should be a new insertion
3750 i.first->second.seq = seq;
3751 blp.copy(length, i.first->second.bl);
3752 i.first->second.bl.reassign_to_mempool(
3753 mempool::mempool_bluestore_writing_deferred);
3754 dout(20) << __func__ << " seq " << seq
3755 << " 0x" << std::hex << offset << "~" << length
3756 << " crc " << i.first->second.bl.crc32c(-1)
3757 << std::dec << dendl;
3758 seq_bytes[seq] += length;
3759 #ifdef DEBUG_DEFERRED
3760 _audit(cct);
3761 #endif
3762 }
3763
3764 void BlueStore::DeferredBatch::_discard(
3765 CephContext *cct, uint64_t offset, uint64_t length)
3766 {
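// Drop any queued deferred I/O overlapping [offset, offset+length): trim the
// head and/or tail of partially overlapping entries, erase fully covered
// ones, and keep the per-sequence byte counts in seq_bytes consistent.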
3767 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3768 << std::dec << dendl;
3769 auto p = iomap.lower_bound(offset);
3770 if (p != iomap.begin()) {
3771 --p;
3772 auto end = p->first + p->second.bl.length();
3773 if (end > offset) {
3774 bufferlist head;
3775 head.substr_of(p->second.bl, 0, offset - p->first);
3776 dout(20) << __func__ << " keep head " << p->second.seq
3777 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3778 << " -> 0x" << head.length() << std::dec << dendl;
3779 auto i = seq_bytes.find(p->second.seq);
3780 ceph_assert(i != seq_bytes.end());
3781 if (end > offset + length) {
3782 bufferlist tail;
3783 tail.substr_of(p->second.bl, offset + length - p->first,
3784 end - (offset + length));
3785 dout(20) << __func__ << " keep tail " << p->second.seq
3786 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3787 << " -> 0x" << tail.length() << std::dec << dendl;
3788 auto &n = iomap[offset + length];
3789 n.bl.swap(tail);
3790 n.seq = p->second.seq;
3791 i->second -= length;
3792 } else {
3793 i->second -= end - offset;
3794 }
3795 ceph_assert(i->second >= 0);
3796 p->second.bl.swap(head);
3797 }
3798 ++p;
3799 }
3800 while (p != iomap.end()) {
3801 if (p->first >= offset + length) {
3802 break;
3803 }
3804 auto i = seq_bytes.find(p->second.seq);
3805 ceph_assert(i != seq_bytes.end());
3806 auto end = p->first + p->second.bl.length();
3807 if (end > offset + length) {
3808 unsigned drop_front = offset + length - p->first;
3809 unsigned keep_tail = end - (offset + length);
3810 dout(20) << __func__ << " truncate front " << p->second.seq
3811 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3812 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3813 << " to 0x" << (offset + length) << "~" << keep_tail
3814 << std::dec << dendl;
3815 auto &s = iomap[offset + length];
3816 s.seq = p->second.seq;
3817 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3818 i->second -= drop_front;
3819 } else {
3820 dout(20) << __func__ << " drop " << p->second.seq
3821 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3822 << std::dec << dendl;
3823 i->second -= p->second.bl.length();
3824 }
3825 ceph_assert(i->second >= 0);
3826 p = iomap.erase(p);
3827 }
3828 }
3829
3830 void BlueStore::DeferredBatch::_audit(CephContext *cct)
3831 {
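// Sanity check: rebuild the per-seq byte totals from iomap, verify they
// match seq_bytes exactly, and confirm that entries do not overlap.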
3832 map<uint64_t,int> sb;
3833 for (auto p : seq_bytes) {
3834 sb[p.first] = 0; // make sure we have the same set of keys
3835 }
3836 uint64_t pos = 0;
3837 for (auto& p : iomap) {
3838 ceph_assert(p.first >= pos);
3839 sb[p.second.seq] += p.second.bl.length();
3840 pos = p.first + p.second.bl.length();
3841 }
3842 ceph_assert(sb == seq_bytes);
3843 }
3844
3845
3846 // Collection
3847
3848 #undef dout_prefix
3849 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3850
3851 BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
3852 : CollectionImpl(store_->cct, cid),
3853 store(store_),
3854 cache(bc),
3855 exists(true),
3856 onode_map(oc),
3857 commit_queue(nullptr)
3858 {
3859 }
3860
3861 bool BlueStore::Collection::flush_commit(Context *c)
3862 {
3863 return osr->flush_commit(c);
3864 }
3865
3866 void BlueStore::Collection::flush()
3867 {
3868 osr->flush();
3869 }
3870
3871 void BlueStore::Collection::flush_all_but_last()
3872 {
3873 osr->flush_all_but_last();
3874 }
3875
3876 void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3877 {
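// A non-shared blob still gets a fresh, unregistered SharedBlob for cache
// bookkeeping; a shared blob is looked up by sbid in shared_blob_set and
// registered there the first time it is opened.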
3878 ceph_assert(!b->shared_blob);
3879 const bluestore_blob_t& blob = b->get_blob();
3880 if (!blob.is_shared()) {
3881 b->shared_blob = new SharedBlob(this);
3882 return;
3883 }
3884
3885 b->shared_blob = shared_blob_set.lookup(sbid);
3886 if (b->shared_blob) {
3887 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3888 << std::dec << " had " << *b->shared_blob << dendl;
3889 } else {
3890 b->shared_blob = new SharedBlob(sbid, this);
3891 shared_blob_set.add(this, b->shared_blob.get());
3892 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3893 << std::dec << " opened " << *b->shared_blob
3894 << dendl;
3895 }
3896 }
3897
3898 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3899 {
3900 if (!sb->is_loaded()) {
3901
3902 bufferlist v;
3903 string key;
3904 auto sbid = sb->get_sbid();
3905 get_shared_blob_key(sbid, &key);
3906 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3907 if (r < 0) {
3908 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3909 << std::dec << " not found at key "
3910 << pretty_binary_string(key) << dendl;
3911 ceph_abort_msg("uh oh, missing shared_blob");
3912 }
3913
3914 sb->loaded = true;
3915 sb->persistent = new bluestore_shared_blob_t(sbid);
3916 auto p = v.cbegin();
3917 decode(*(sb->persistent), p);
3918 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3919 << std::dec << " loaded shared_blob " << *sb << dendl;
3920 }
3921 }
3922
3923 void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3924 {
3925 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
3926 ceph_assert(!b->shared_blob->is_loaded());
3927
3928 // update blob
3929 bluestore_blob_t& blob = b->dirty_blob();
3930 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
3931
3932 // update shared blob
3933 b->shared_blob->loaded = true;
3934 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3935 shared_blob_set.add(this, b->shared_blob.get());
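// seed the new shared blob's reference map with one ref per allocated extent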
3936 for (auto p : blob.get_extents()) {
3937 if (p.is_valid()) {
3938 b->shared_blob->get_ref(
3939 p.offset,
3940 p.length);
3941 }
3942 }
3943 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3944 }
3945
3946 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3947 {
3948 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3949 ceph_assert(sb->is_loaded());
3950
3951 uint64_t sbid = sb->get_sbid();
3952 shared_blob_set.remove(sb);
3953 sb->loaded = false;
3954 delete sb->persistent;
3955 sb->sbid_unloaded = 0;
3956 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3957 return sbid;
3958 }
3959
3960 BlueStore::OnodeRef BlueStore::Collection::get_onode(
3961 const ghobject_t& oid,
3962 bool create,
3963 bool is_createop)
3964 {
3965 ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
3966
3967 spg_t pgid;
3968 if (cid.is_pg(&pgid)) {
3969 if (!oid.match(cnode.bits, pgid.ps())) {
3970 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3971 << pgid << " bits " << cnode.bits << dendl;
3972 ceph_abort();
3973 }
3974 }
3975
3976 OnodeRef o = onode_map.lookup(oid);
3977 if (o)
3978 return o;
3979
3980 string key;
3981 get_object_key(store->cct, oid, &key);
3982
3983 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3984 << pretty_binary_string(key) << dendl;
3985
3986 bufferlist v;
3987 int r = -ENOENT;
3988 Onode *on;
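// For explicit create ops the kv lookup is skipped as an optimization and a
// fresh in-memory onode is built below.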
3989 if (!is_createop) {
3990 r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3991 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3992 }
3993 if (v.length() == 0) {
3994 ceph_assert(r == -ENOENT);
3995 if (!create)
3996 return OnodeRef();
3997
3998 // new object, new onode
3999 on = new Onode(this, oid, key);
4000 } else {
4001 // loaded
4002 ceph_assert(r >= 0);
4003 on = Onode::decode(this, oid, key, v);
4004 }
4005 o.reset(on);
4006 return onode_map.add(oid, o);
4007 }
4008
4009 void BlueStore::Collection::split_cache(
4010 Collection *dest)
4011 {
4012 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
4013
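// Move every onode that now maps to the child collection (per the child's
// pg bits) into dest's cache, along with the shared blobs and any buffers
// not currently being written that hang off those onodes.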
4014 auto *ocache = get_onode_cache();
4015 auto *ocache_dest = dest->get_onode_cache();
4016
4017 // lock cache shards
4018 std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
4019 std::lock_guard l(ocache->lock, std::adopt_lock);
4020 std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
4021 std::lock_guard l3(cache->lock, std::adopt_lock);
4022 std::lock_guard l4(dest->cache->lock, std::adopt_lock);
4023
4024 int destbits = dest->cnode.bits;
4025 spg_t destpg;
4026 bool is_pg = dest->cid.is_pg(&destpg);
4027 ceph_assert(is_pg);
4028
4029 auto p = onode_map.onode_map.begin();
4030 while (p != onode_map.onode_map.end()) {
4031 OnodeRef o = p->second;
4032 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
4033 // onode does not belong to this child
4034 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
4035 << dendl;
4036 ++p;
4037 } else {
4038 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
4039 << dendl;
4040
4041 // ensuring that nref is always >= 2 and hence onode is pinned and
4042 // physically out of cache during the transition
4043 OnodeRef o_pin = o;
4044 ceph_assert(o->pinned);
4045
4046 p = onode_map.onode_map.erase(p);
4047 dest->onode_map.onode_map[o->oid] = o;
4048 if (o->cached) {
4049 get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
4050 }
4051 o->c = dest;
4052
4053 // move over shared blobs and buffers. cover shared blobs from
4054 // both extent map and spanning blob map (the full extent map
4055 // may not be faulted in)
4056 vector<SharedBlob*> sbvec;
4057 for (auto& e : o->extent_map.extent_map) {
4058 sbvec.push_back(e.blob->shared_blob.get());
4059 }
4060 for (auto& b : o->extent_map.spanning_blob_map) {
4061 sbvec.push_back(b.second->shared_blob.get());
4062 }
4063 for (auto sb : sbvec) {
4064 if (sb->coll == dest) {
4065 ldout(store->cct, 20) << __func__ << " already moved " << *sb
4066 << dendl;
4067 continue;
4068 }
4069 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
4070 if (sb->get_sbid()) {
4071 ldout(store->cct, 20) << __func__
4072 << " moving registration " << *sb << dendl;
4073 shared_blob_set.remove(sb);
4074 dest->shared_blob_set.add(dest, sb);
4075 }
4076 sb->coll = dest;
4077 if (dest->cache != cache) {
4078 for (auto& i : sb->bc.buffer_map) {
4079 if (!i.second->is_writing()) {
4080 ldout(store->cct, 20) << __func__ << " moving " << *i.second
4081 << dendl;
4082 dest->cache->_move(cache, i.second.get());
4083 }
4084 }
4085 }
4086 }
4087 }
4088 }
4089 dest->cache->_trim();
4090 }
4091
4092 // =======================================================
4093
4094 // MempoolThread
4095
4096 #undef dout_prefix
4097 #define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
4098 #undef dout_context
4099 #define dout_context store->cct
4100
4101 void *BlueStore::MempoolThread::entry()
4102 {
4103 std::unique_lock l{lock};
4104
4105 uint32_t prev_config_change = store->config_changed.load();
4106 uint64_t base = store->osd_memory_base;
4107 double fragmentation = store->osd_memory_expected_fragmentation;
4108 uint64_t target = store->osd_memory_target;
4109 uint64_t min = store->osd_memory_cache_min;
4110 uint64_t max = min;
4111
4112 // When setting the maximum amount of memory to use for cache, first
4113 // assume some base amount of memory for the OSD and then fudge in
4114 // some overhead for fragmentation that scales with cache usage.
4115 uint64_t ltarget = (1.0 - fragmentation) * target;
4116 if (ltarget > base + min) {
4117 max = ltarget - base;
4118 }
4119
4120 binned_kv_cache = store->db->get_priority_cache();
4121 binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
4122 if (store->cache_autotune && binned_kv_cache != nullptr) {
4123 pcm = std::make_shared<PriorityCache::Manager>(
4124 store->cct, min, max, target, true, "bluestore-pricache");
4125 pcm->insert("kv", binned_kv_cache, true);
4126 pcm->insert("meta", meta_cache, true);
4127 pcm->insert("data", data_cache, true);
4128 if (binned_kv_onode_cache != nullptr) {
4129 pcm->insert("kv_onode", binned_kv_onode_cache, true);
4130 }
4131 }
4132
4133 utime_t next_balance = ceph_clock_now();
4134 utime_t next_resize = ceph_clock_now();
4135 utime_t next_deferred_force_submit = ceph_clock_now();
4136 utime_t alloc_stats_dump_clock = ceph_clock_now();
4137
4138 bool interval_stats_trim = false;
4139 while (!stop) {
4140 // Update pcm cache settings if related configuration was changed
4141 uint32_t cur_config_change = store->config_changed.load();
4142 if (cur_config_change != prev_config_change) {
4143 _update_cache_settings();
4144 prev_config_change = cur_config_change;
4145 }
4146
4147 // Before we trim, check and see if it's time to rebalance/resize.
4148 double autotune_interval = store->cache_autotune_interval;
4149 double resize_interval = store->osd_memory_cache_resize_interval;
4150 double max_defer_interval = store->max_defer_interval;
4151
4152 double alloc_stats_dump_interval =
4153 store->cct->_conf->bluestore_alloc_stats_dump_interval;
4154
4155 if (alloc_stats_dump_interval > 0 &&
4156 alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
4157 store->_record_allocation_stats();
4158 alloc_stats_dump_clock = ceph_clock_now();
4159 }
4160 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
4161 _adjust_cache_settings();
4162
4163 // Log events at 5 instead of 20 when balance happens.
4164 interval_stats_trim = true;
4165
4166 if (pcm != nullptr) {
4167 pcm->balance();
4168 }
4169
4170 next_balance = ceph_clock_now();
4171 next_balance += autotune_interval;
4172 }
4173 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
4174 if (ceph_using_tcmalloc() && pcm != nullptr) {
4175 pcm->tune_memory();
4176 }
4177 next_resize = ceph_clock_now();
4178 next_resize += resize_interval;
4179 }
4180
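// Force-submit deferred writes if nothing has been submitted within
// max_defer_interval.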
4181 if (max_defer_interval > 0 &&
4182 next_deferred_force_submit < ceph_clock_now()) {
4183 if (store->get_deferred_last_submitted() + max_defer_interval <
4184 ceph_clock_now()) {
4185 store->deferred_try_submit();
4186 }
4187 next_deferred_force_submit = ceph_clock_now();
4188 next_deferred_force_submit += max_defer_interval/3;
4189 }
4190
4191 // Now resize the cache shards
4192 _resize_shards(interval_stats_trim);
4193 interval_stats_trim = false;
4194
4195 store->_update_cache_logger();
4196 auto wait = ceph::make_timespan(
4197 store->cct->_conf->bluestore_cache_trim_interval);
4198 cond.wait_for(l, wait);
4199 }
4200 // do final dump
4201 store->_record_allocation_stats();
4202 stop = false;
4203 pcm = nullptr;
4204 return NULL;
4205 }
4206
4207 void BlueStore::MempoolThread::_adjust_cache_settings()
4208 {
4209 if (binned_kv_cache != nullptr) {
4210 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
4211 }
4212 if (binned_kv_onode_cache != nullptr) {
4213 binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
4214 }
4215 meta_cache->set_cache_ratio(store->cache_meta_ratio);
4216 data_cache->set_cache_ratio(store->cache_data_ratio);
4217 }
4218
4219 void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
4220 {
4221 size_t onode_shards = store->onode_cache_shards.size();
4222 size_t buffer_shards = store->buffer_cache_shards.size();
4223 int64_t kv_used = store->db->get_cache_usage();
4224 int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
4225 int64_t meta_used = meta_cache->_get_used_bytes();
4226 int64_t data_used = data_cache->_get_used_bytes();
4227
4228 uint64_t cache_size = store->cache_size;
4229 int64_t kv_alloc =
4230 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
4231 int64_t kv_onode_alloc =
4232 static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
4233 int64_t meta_alloc =
4234 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
4235 int64_t data_alloc =
4236 static_cast<int64_t>(store->cache_data_ratio * cache_size);
4237
4238 if (pcm != nullptr && binned_kv_cache != nullptr) {
4239 cache_size = pcm->get_tuned_mem();
4240 kv_alloc = binned_kv_cache->get_committed_size();
4241 meta_alloc = meta_cache->get_committed_size();
4242 data_alloc = data_cache->get_committed_size();
4243 if (binned_kv_onode_cache != nullptr) {
4244 kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
4245 }
4246 }
4247
4248 if (interval_stats) {
4249 dout(5) << __func__ << " cache_size: " << cache_size
4250 << " kv_alloc: " << kv_alloc
4251 << " kv_used: " << kv_used
4252 << " kv_onode_alloc: " << kv_onode_alloc
4253 << " kv_onode_used: " << kv_onode_used
4254 << " meta_alloc: " << meta_alloc
4255 << " meta_used: " << meta_used
4256 << " data_alloc: " << data_alloc
4257 << " data_used: " << data_used << dendl;
4258 } else {
4259 dout(20) << __func__ << " cache_size: " << cache_size
4260 << " kv_alloc: " << kv_alloc
4261 << " kv_used: " << kv_used
4262 << " kv_onode_alloc: " << kv_onode_alloc
4263 << " kv_onode_used: " << kv_onode_used
4264 << " meta_alloc: " << meta_alloc
4265 << " meta_used: " << meta_used
4266 << " data_alloc: " << data_alloc
4267 << " data_used: " << data_used << dendl;
4268 }
4269
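// Split the meta and data budgets evenly across the cache shards; the meta
// share is converted to an onode count via the average bytes-per-onode.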
4270 uint64_t max_shard_onodes = static_cast<uint64_t>(
4271 (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
4272 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
4273
4274 dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
4275 << " max_shard_buffer: " << max_shard_buffer << dendl;
4276
4277 for (auto i : store->onode_cache_shards) {
4278 i->set_max(max_shard_onodes);
4279 }
4280 for (auto i : store->buffer_cache_shards) {
4281 i->set_max(max_shard_buffer);
4282 }
4283 }
4284
4285 void BlueStore::MempoolThread::_update_cache_settings()
4286 {
4287 // Nothing to do if pcm is not used.
4288 if (pcm == nullptr) {
4289 return;
4290 }
4291
4292 uint64_t target = store->osd_memory_target;
4293 uint64_t base = store->osd_memory_base;
4294 uint64_t min = store->osd_memory_cache_min;
4295 uint64_t max = min;
4296 double fragmentation = store->osd_memory_expected_fragmentation;
4297
4298 uint64_t ltarget = (1.0 - fragmentation) * target;
4299 if (ltarget > base + min) {
4300 max = ltarget - base;
4301 }
4302
4303 // set pcm cache levels
4304 pcm->set_target_memory(target);
4305 pcm->set_min_memory(min);
4306 pcm->set_max_memory(max);
4307
4308 dout(5) << __func__ << " updated pcm target: " << target
4309 << " pcm min: " << min
4310 << " pcm max: " << max
4311 << dendl;
4312 }
4313
4314 // =======================================================
4315
4316 // OmapIteratorImpl
4317
4318 #undef dout_prefix
4319 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
4320
4321 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
4322 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
4323 : c(c), o(o), it(it)
4324 {
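// head/tail bound this object's keys within the shared omap keyspace: the
// iterator starts at head and is considered exhausted once raw keys reach
// tail (see valid()).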
4325 std::shared_lock l(c->lock);
4326 if (o->onode.has_omap()) {
4327 o->get_omap_key(string(), &head);
4328 o->get_omap_tail(&tail);
4329 it->lower_bound(head);
4330 }
4331 }
4332
4333 string BlueStore::OmapIteratorImpl::_stringify() const
4334 {
4335 stringstream s;
4336 s << " omap_iterator(cid = " << c->cid
4337 <<", oid = " << o->oid << ")";
4338 return s.str();
4339 }
4340
4341 int BlueStore::OmapIteratorImpl::seek_to_first()
4342 {
4343 std::shared_lock l(c->lock);
4344 auto start1 = mono_clock::now();
4345 if (o->onode.has_omap()) {
4346 it->lower_bound(head);
4347 } else {
4348 it = KeyValueDB::Iterator();
4349 }
4350 c->store->log_latency(
4351 __func__,
4352 l_bluestore_omap_seek_to_first_lat,
4353 mono_clock::now() - start1,
4354 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4355
4356 return 0;
4357 }
4358
4359 int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
4360 {
4361 std::shared_lock l(c->lock);
4362 auto start1 = mono_clock::now();
4363 if (o->onode.has_omap()) {
4364 string key;
4365 o->get_omap_key(after, &key);
4366 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
4367 << pretty_binary_string(key) << dendl;
4368 it->upper_bound(key);
4369 } else {
4370 it = KeyValueDB::Iterator();
4371 }
4372 c->store->log_latency_fn(
4373 __func__,
4374 l_bluestore_omap_upper_bound_lat,
4375 mono_clock::now() - start1,
4376 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4377 [&] (const ceph::timespan& lat) {
4378 return ", after = " + after +
4379 _stringify();
4380 }
4381 );
4382 return 0;
4383 }
4384
4385 int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
4386 {
4387 std::shared_lock l(c->lock);
4388 auto start1 = mono_clock::now();
4389 if (o->onode.has_omap()) {
4390 string key;
4391 o->get_omap_key(to, &key);
4392 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
4393 << pretty_binary_string(key) << dendl;
4394 it->lower_bound(key);
4395 } else {
4396 it = KeyValueDB::Iterator();
4397 }
4398 c->store->log_latency_fn(
4399 __func__,
4400 l_bluestore_omap_lower_bound_lat,
4401 mono_clock::now() - start1,
4402 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4403 [&] (const ceph::timespan& lat) {
4404 return ", to = " + to +
4405 _stringify();
4406 }
4407 );
4408 return 0;
4409 }
4410
4411 bool BlueStore::OmapIteratorImpl::valid()
4412 {
4413 std::shared_lock l(c->lock);
4414 bool r = o->onode.has_omap() && it && it->valid() &&
4415 it->raw_key().second < tail;
4416 if (it && it->valid()) {
4417 ldout(c->store->cct,20) << __func__ << " is at "
4418 << pretty_binary_string(it->raw_key().second)
4419 << dendl;
4420 }
4421 return r;
4422 }
4423
4424 int BlueStore::OmapIteratorImpl::next()
4425 {
4426 int r = -1;
4427 std::shared_lock l(c->lock);
4428 auto start1 = mono_clock::now();
4429 if (o->onode.has_omap()) {
4430 it->next();
4431 r = 0;
4432 }
4433 c->store->log_latency(
4434 __func__,
4435 l_bluestore_omap_next_lat,
4436 mono_clock::now() - start1,
4437 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4438
4439 return r;
4440 }
4441
4442 string BlueStore::OmapIteratorImpl::key()
4443 {
4444 std::shared_lock l(c->lock);
4445 ceph_assert(it->valid());
4446 string db_key = it->raw_key().second;
4447 string user_key;
4448 o->decode_omap_key(db_key, &user_key);
4449
4450 return user_key;
4451 }
4452
4453 bufferlist BlueStore::OmapIteratorImpl::value()
4454 {
4455 std::shared_lock l(c->lock);
4456 ceph_assert(it->valid());
4457 return it->value();
4458 }
4459
4460
4461 // =====================================
4462
4463 #undef dout_prefix
4464 #define dout_prefix *_dout << "bluestore(" << path << ") "
4465 #undef dout_context
4466 #define dout_context cct
4467
4468
4469 static void aio_cb(void *priv, void *priv2)
4470 {
4471 BlueStore *store = static_cast<BlueStore*>(priv);
4472 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4473 c->aio_finish(store);
4474 }
4475
4476 static void discard_cb(void *priv, void *priv2)
4477 {
4478 BlueStore *store = static_cast<BlueStore*>(priv);
4479 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4480 store->handle_discard(*tmp);
4481 }
4482
4483 void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4484 {
4485 dout(10) << __func__ << dendl;
4486 ceph_assert(shared_alloc.a);
4487 shared_alloc.a->release(to_release);
4488 }
4489
4490 BlueStore::BlueStore(CephContext *cct, const string& path)
4491 : BlueStore(cct, path, 0) {}
4492
4493 BlueStore::BlueStore(CephContext *cct,
4494 const string& path,
4495 uint64_t _min_alloc_size)
4496 : ObjectStore(cct, path),
4497 throttle(cct),
4498 finisher(cct, "commit_finisher", "cfin"),
4499 kv_sync_thread(this),
4500 kv_finalize_thread(this),
4501 zoned_cleaner_thread(this),
4502 min_alloc_size(_min_alloc_size),
4503 min_alloc_size_order(ctz(_min_alloc_size)),
4504 mempool_thread(this)
4505 {
4506 _init_logger();
4507 cct->_conf.add_observer(this);
4508 set_cache_shards(1);
4509 }
4510
4511 BlueStore::~BlueStore()
4512 {
4513 cct->_conf.remove_observer(this);
4514 _shutdown_logger();
4515 ceph_assert(!mounted);
4516 ceph_assert(db == NULL);
4517 ceph_assert(bluefs == NULL);
4518 ceph_assert(fsid_fd < 0);
4519 ceph_assert(path_fd < 0);
4520 for (auto i : onode_cache_shards) {
4521 delete i;
4522 }
4523 for (auto i : buffer_cache_shards) {
4524 delete i;
4525 }
4526 onode_cache_shards.clear();
4527 buffer_cache_shards.clear();
4528 }
4529
4530 const char **BlueStore::get_tracked_conf_keys() const
4531 {
4532 static const char* KEYS[] = {
4533 "bluestore_csum_type",
4534 "bluestore_compression_mode",
4535 "bluestore_compression_algorithm",
4536 "bluestore_compression_min_blob_size",
4537 "bluestore_compression_min_blob_size_ssd",
4538 "bluestore_compression_min_blob_size_hdd",
4539 "bluestore_compression_max_blob_size",
4540 "bluestore_compression_max_blob_size_ssd",
4541 "bluestore_compression_max_blob_size_hdd",
4542 "bluestore_compression_required_ratio",
4543 "bluestore_max_alloc_size",
4544 "bluestore_prefer_deferred_size",
4545 "bluestore_prefer_deferred_size_hdd",
4546 "bluestore_prefer_deferred_size_ssd",
4547 "bluestore_deferred_batch_ops",
4548 "bluestore_deferred_batch_ops_hdd",
4549 "bluestore_deferred_batch_ops_ssd",
4550 "bluestore_throttle_bytes",
4551 "bluestore_throttle_deferred_bytes",
4552 "bluestore_throttle_cost_per_io_hdd",
4553 "bluestore_throttle_cost_per_io_ssd",
4554 "bluestore_throttle_cost_per_io",
4555 "bluestore_max_blob_size",
4556 "bluestore_max_blob_size_ssd",
4557 "bluestore_max_blob_size_hdd",
4558 "osd_memory_target",
4559 "osd_memory_target_cgroup_limit_ratio",
4560 "osd_memory_base",
4561 "osd_memory_cache_min",
4562 "osd_memory_expected_fragmentation",
4563 "bluestore_cache_autotune",
4564 "bluestore_cache_autotune_interval",
4565 "bluestore_warn_on_legacy_statfs",
4566 "bluestore_warn_on_no_per_pool_omap",
4567 "bluestore_max_defer_interval",
4568 NULL
4569 };
4570 return KEYS;
4571 }
4572
4573 void BlueStore::handle_conf_change(const ConfigProxy& conf,
4574 const std::set<std::string> &changed)
4575 {
4576 if (changed.count("bluestore_warn_on_legacy_statfs")) {
4577 _check_legacy_statfs_alert();
4578 }
4579 if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
4580 changed.count("bluestore_warn_on_no_per_pg_omap")) {
4581 _check_no_per_pg_or_pool_omap_alert();
4582 }
4583
4584 if (changed.count("bluestore_csum_type")) {
4585 _set_csum();
4586 }
4587 if (changed.count("bluestore_compression_mode") ||
4588 changed.count("bluestore_compression_algorithm") ||
4589 changed.count("bluestore_compression_min_blob_size") ||
4590 changed.count("bluestore_compression_max_blob_size")) {
4591 if (bdev) {
4592 _set_compression();
4593 }
4594 }
4595 if (changed.count("bluestore_max_blob_size") ||
4596 changed.count("bluestore_max_blob_size_ssd") ||
4597 changed.count("bluestore_max_blob_size_hdd")) {
4598 if (bdev) {
4599 // only after startup
4600 _set_blob_size();
4601 }
4602 }
4603 if (changed.count("bluestore_prefer_deferred_size") ||
4604 changed.count("bluestore_prefer_deferred_size_hdd") ||
4605 changed.count("bluestore_prefer_deferred_size_ssd") ||
4606 changed.count("bluestore_max_alloc_size") ||
4607 changed.count("bluestore_deferred_batch_ops") ||
4608 changed.count("bluestore_deferred_batch_ops_hdd") ||
4609 changed.count("bluestore_deferred_batch_ops_ssd")) {
4610 if (bdev) {
4611 // only after startup
4612 _set_alloc_sizes();
4613 }
4614 }
4615 if (changed.count("bluestore_throttle_cost_per_io") ||
4616 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4617 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4618 if (bdev) {
4619 _set_throttle_params();
4620 }
4621 }
4622 if (changed.count("bluestore_throttle_bytes") ||
4623 changed.count("bluestore_throttle_deferred_bytes") ||
4624 changed.count("bluestore_throttle_trace_rate")) {
4625 throttle.reset_throttle(conf);
4626 }
4627 if (changed.count("bluestore_max_defer_interval")) {
4628 if (bdev) {
4629 _set_max_defer_interval();
4630 }
4631 }
4632 if (changed.count("osd_memory_target") ||
4633 changed.count("osd_memory_base") ||
4634 changed.count("osd_memory_cache_min") ||
4635 changed.count("osd_memory_expected_fragmentation")) {
4636 _update_osd_memory_options();
4637 }
4638 }
4639
4640 void BlueStore::_set_compression()
4641 {
4642 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4643 if (m) {
4644 _clear_compression_alert();
4645 comp_mode = *m;
4646 } else {
4647 derr << __func__ << " unrecognized value '"
4648 << cct->_conf->bluestore_compression_mode
4649 << "' for bluestore_compression_mode, reverting to 'none'"
4650 << dendl;
4651 comp_mode = Compressor::COMP_NONE;
4652 string s("unknown mode: ");
4653 s += cct->_conf->bluestore_compression_mode;
4654 _set_compression_alert(true, s.c_str());
4655 }
4656
4657 compressor = nullptr;
4658
4659 if (cct->_conf->bluestore_compression_min_blob_size) {
4660 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
4661 } else {
4662 ceph_assert(bdev);
4663 if (_use_rotational_settings()) {
4664 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4665 } else {
4666 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4667 }
4668 }
4669
4670 if (cct->_conf->bluestore_compression_max_blob_size) {
4671 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4672 } else {
4673 ceph_assert(bdev);
4674 if (_use_rotational_settings()) {
4675 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4676 } else {
4677 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4678 }
4679 }
4680
4681 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4682 if (!alg_name.empty()) {
4683 compressor = Compressor::create(cct, alg_name);
4684 if (!compressor) {
4685 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4686 << dendl;
4687 _set_compression_alert(false, alg_name.c_str());
4688 }
4689 }
4690
4691 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4692 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
4693 << " min_blob " << comp_min_blob_size
4694 << " max_blob " << comp_max_blob_size
4695 << dendl;
4696 }
4697
4698 void BlueStore::_set_csum()
4699 {
4700 csum_type = Checksummer::CSUM_NONE;
4701 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4702 if (t > Checksummer::CSUM_NONE)
4703 csum_type = t;
4704
4705 dout(10) << __func__ << " csum_type "
4706 << Checksummer::get_csum_type_string(csum_type)
4707 << dendl;
4708 }
4709
4710 void BlueStore::_set_throttle_params()
4711 {
4712 if (cct->_conf->bluestore_throttle_cost_per_io) {
4713 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4714 } else {
4715 ceph_assert(bdev);
4716 if (_use_rotational_settings()) {
4717 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4718 } else {
4719 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4720 }
4721 }
4722
4723 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4724 << dendl;
4725 }

4726 void BlueStore::_set_blob_size()
4727 {
4728 if (cct->_conf->bluestore_max_blob_size) {
4729 max_blob_size = cct->_conf->bluestore_max_blob_size;
4730 } else {
4731 ceph_assert(bdev);
4732 if (_use_rotational_settings()) {
4733 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4734 } else {
4735 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4736 }
4737 }
4738 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4739 << std::dec << dendl;
4740 }
4741
4742 void BlueStore::_update_osd_memory_options()
4743 {
4744 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4745 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4746 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4747 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4748 config_changed++;
4749 dout(10) << __func__
4750 << " osd_memory_target " << osd_memory_target
4751 << " osd_memory_base " << osd_memory_base
4752 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4753 << " osd_memory_cache_min " << osd_memory_cache_min
4754 << dendl;
4755 }
4756
4757 int BlueStore::_set_cache_sizes()
4758 {
4759 ceph_assert(bdev);
4760 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
4761 cache_autotune_interval =
4762 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
4763 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4764 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4765 osd_memory_expected_fragmentation =
4766 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4767 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4768 osd_memory_cache_resize_interval =
4769 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
4770
4771 if (cct->_conf->bluestore_cache_size) {
4772 cache_size = cct->_conf->bluestore_cache_size;
4773 } else {
4774 // choose global cache size based on backend type
4775 if (_use_rotational_settings()) {
4776 cache_size = cct->_conf->bluestore_cache_size_hdd;
4777 } else {
4778 cache_size = cct->_conf->bluestore_cache_size_ssd;
4779 }
4780 }
4781
4782 cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
4783 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
4784 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4785 << ") must be in range [0,1.0]" << dendl;
4786 return -EINVAL;
4787 }
4788
4789 cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
4790 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
4791 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
4792 << ") must be in range [0,1.0]" << dendl;
4793 return -EINVAL;
4794 }
4795
4796 cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
4797 if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
4798 derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
4799 << ") must be in range [0,1.0]" << dendl;
4800 return -EINVAL;
4801 }
4802
4803 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
4804 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4805 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4806 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4807 << dendl;
4808 return -EINVAL;
4809 }
4810
4811 cache_data_ratio = (double)1.0 -
4812 (double)cache_meta_ratio -
4813 (double)cache_kv_ratio -
4814 (double)cache_kv_onode_ratio;
4815 if (cache_data_ratio < 0) {
4816 // deal with floating point imprecision
4817 cache_data_ratio = 0;
4818 }
4819
4820 dout(1) << __func__ << " cache_size " << cache_size
4821 << " meta " << cache_meta_ratio
4822 << " kv " << cache_kv_ratio
4823 << " data " << cache_data_ratio
4824 << dendl;
4825 return 0;
4826 }
4827
4828 int BlueStore::write_meta(const std::string& key, const std::string& value)
4829 {
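// Mirror the key/value into the main block device label when one is present,
// then always persist it via the regular meta file as well.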
4830 bluestore_bdev_label_t label;
4831 string p = path + "/block";
4832 int r = _read_bdev_label(cct, p, &label);
4833 if (r < 0) {
4834 return ObjectStore::write_meta(key, value);
4835 }
4836 label.meta[key] = value;
4837 r = _write_bdev_label(cct, p, label);
4838 ceph_assert(r == 0);
4839 return ObjectStore::write_meta(key, value);
4840 }
4841
4842 int BlueStore::read_meta(const std::string& key, std::string *value)
4843 {
4844 bluestore_bdev_label_t label;
4845 string p = path + "/block";
4846 int r = _read_bdev_label(cct, p, &label);
4847 if (r < 0) {
4848 return ObjectStore::read_meta(key, value);
4849 }
4850 auto i = label.meta.find(key);
4851 if (i == label.meta.end()) {
4852 return ObjectStore::read_meta(key, value);
4853 }
4854 *value = i->second;
4855 return 0;
4856 }
4857
4858 void BlueStore::_init_logger()
4859 {
4860 PerfCountersBuilder b(cct, "bluestore",
4861 l_bluestore_first, l_bluestore_last);
4862 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
4863 "Average kv_thread flush latency",
4864 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
4865 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
4866 "Average kv_thread commit latency");
4867 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
4868 "Average kv_sync thread latency",
4869 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
4870 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
4871 "Average kv_finalize thread latency",
4872 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
4873 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
4874 "Average prepare state latency");
4875 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
4876 "Average aio_wait state latency",
4877 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
4878 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
4879 "Average io_done state latency");
4880 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
4881 "Average kv_queued state latency");
4882 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
4883 "Average kv_commiting state latency");
4884 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
4885 "Average kv_done state latency");
4886 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
4887 "Average deferred_queued state latency");
4888 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
4889 "Average aio_wait state latency");
4890 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
4891 "Average cleanup state latency");
4892 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
4893 "Average finishing state latency");
4894 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
4895 "Average done state latency");
4896 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
4897 "Average submit throttle latency",
4898 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
4899 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
4900 "Average submit latency",
4901 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
4902 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
4903 "Average commit latency",
4904 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
4905 b.add_time_avg(l_bluestore_read_lat, "read_lat",
4906 "Average read latency",
4907 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
4908 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
4909 "Average read onode metadata latency");
4910 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
4911 "Average read latency");
4912 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
4913 "Average compress latency");
4914 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
4915 "Average decompress latency");
4916 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
4917 "Average checksum latency");
4918 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
4919 "Sum for beneficial compress ops");
4920 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
4921 "Sum for compress ops rejected due to low net gain of space");
4922 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
4923 "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
4924 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
4925 "Sum for deferred write op");
4926 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
4927 "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
4928 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
4929 "Sum for write penalty read ops");
4930 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
4931 "Sum for allocated bytes");
4932 b.add_u64(l_bluestore_stored, "bluestore_stored",
4933 "Sum for stored bytes");
4934 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
4935 "Sum for stored compressed bytes",
4936 "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
4937 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
4938 "Sum for bytes allocated for compressed data",
4939 "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
4940 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
4941 "Sum for original bytes that were compressed",
4942 "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
4943 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
4944 "Number of onodes in cache");
4945 b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
4946 "Number of pinned onodes in cache");
4947 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
4948 "Sum for onode-lookups hit in the cache");
4949 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
4950 "Sum for onode-lookups missed in the cache");
4951 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
4952 "Sum for onode-shard lookups hit in the cache");
4953 b.add_u64_counter(l_bluestore_onode_shard_misses,
4954 "bluestore_onode_shard_misses",
4955 "Sum for onode-shard lookups missed in the cache");
4956 b.add_u64(l_bluestore_extents, "bluestore_extents",
4957 "Number of extents in cache");
4958 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
4959 "Number of blobs in cache");
4960 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
4961 "Number of buffers in cache");
4962 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
4963 "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
4964 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
4965 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
4966 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
4967 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));
4968
4969 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
4970 "Large aligned writes into fresh blobs");
4971 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
4972 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
4973 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
4974 "Large aligned writes into fresh blobs (blobs)");
4975 b.add_u64_counter(l_bluestore_write_big_deferred,
4976 "bluestore_write_big_deferred",
4977 "Big overwrites using deferred");
4978 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
4979 "Small writes into existing or sparse small blobs");
4980 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
4981 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
4982 b.add_u64_counter(l_bluestore_write_small_unused,
4983 "bluestore_write_small_unused",
4984 "Small writes into unused portion of existing blob");
4985 b.add_u64_counter(l_bluestore_write_deferred,
4986 "bluestore_write_deferred",
4987 "Total deferred writes submitted");
4988 b.add_u64_counter(l_bluestore_write_deferred_bytes,
4989 "bluestore_write_deferred_bytes",
4990 "Total bytes submitted as deferred writes");
4991 b.add_u64_counter(l_bluestore_write_small_pre_read,
4992 "bluestore_write_small_pre_read",
4993 "Small writes that required we read some data (possibly "
4994 "cached) to fill out the block");
4995 b.add_u64_counter(l_bluestore_write_new, "bluestore_write_new",
4996 "Write into new blob");
4997
4998 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
4999 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
5000 "Onode extent map reshard events");
5001 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
5002 "Sum for blob splitting due to resharding");
5003 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
5004 "Sum for extents that have been removed due to compression");
5005 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
5006 "Sum for extents that have been merged due to garbage "
5007 "collection");
5008 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
5009 "Read EIO errors propagated to high level callers");
5010 b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
5011 "Read operations that required at least one retry due to failed checksum validation");
5012 b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
5013 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
5014 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
5015 "Average omap iterator seek_to_first call latency");
5016 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
5017 "Average omap iterator upper_bound call latency");
5018 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
5019 "Average omap iterator lower_bound call latency");
5020 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
5021 "Average omap iterator next call latency");
5022 b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
5023 "Average omap get_keys call latency");
5024 b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
5025 "Average omap get_values call latency");
5026 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
5027 "Average collection listing latency");
5028 b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
5029 "Average removal latency");
5030
5031 logger = b.create_perf_counters();
5032 cct->get_perfcounters_collection()->add(logger);
5033 }
5034
5035 int BlueStore::_reload_logger()
5036 {
5037 struct store_statfs_t store_statfs;
5038 int r = statfs(&store_statfs);
5039 if (r >= 0) {
5040 logger->set(l_bluestore_allocated, store_statfs.allocated);
5041 logger->set(l_bluestore_stored, store_statfs.data_stored);
5042 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
5043 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
5044 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
5045 }
5046 return r;
5047 }
5048
5049 void BlueStore::_shutdown_logger()
5050 {
5051 cct->get_perfcounters_collection()->remove(logger);
5052 delete logger;
5053 }
5054
5055 int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
5056 uuid_d *fsid)
5057 {
5058 bluestore_bdev_label_t label;
5059 int r = _read_bdev_label(cct, path, &label);
5060 if (r < 0)
5061 return r;
5062 *fsid = label.osd_uuid;
5063 return 0;
5064 }
5065
5066 int BlueStore::_open_path()
5067 {
5068 // sanity check(s)
5069 ceph_assert(path_fd < 0);
5070 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
5071 if (path_fd < 0) {
5072 int r = -errno;
5073 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
5074 << dendl;
5075 return r;
5076 }
5077 return 0;
5078 }
5079
5080 void BlueStore::_close_path()
5081 {
5082 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
5083 path_fd = -1;
5084 }
5085
5086 int BlueStore::_write_bdev_label(CephContext *cct,
5087 string path, bluestore_bdev_label_t label)
5088 {
5089 dout(10) << __func__ << " path " << path << " label " << label << dendl;
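// The label is encoded with a trailing crc32c and zero-padded out to a full
// BDEV_LABEL_BLOCK_SIZE block at the start of the device.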
5090 bufferlist bl;
5091 encode(label, bl);
5092 uint32_t crc = bl.crc32c(-1);
5093 encode(crc, bl);
5094 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
5095 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
5096 z.zero();
5097 bl.append(std::move(z));
5098
5099 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
5100 if (fd < 0) {
5101 fd = -errno;
5102 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5103 << dendl;
5104 return fd;
5105 }
5106 int r = bl.write_fd(fd);
5107 if (r < 0) {
5108 derr << __func__ << " failed to write to " << path
5109 << ": " << cpp_strerror(r) << dendl;
5110 goto out;
5111 }
5112 r = ::fsync(fd);
5113 if (r < 0) {
5114 derr << __func__ << " failed to fsync " << path
5115 << ": " << cpp_strerror(r) << dendl;
5116 }
5117 out:
5118 VOID_TEMP_FAILURE_RETRY(::close(fd));
5119 return r;
5120 }
5121
5122 int BlueStore::_read_bdev_label(CephContext* cct, string path,
5123 bluestore_bdev_label_t *label)
5124 {
5125 dout(10) << __func__ << dendl;
5126 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
5127 if (fd < 0) {
5128 fd = -errno;
5129 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5130 << dendl;
5131 return fd;
5132 }
5133 bufferlist bl;
5134 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
5135 VOID_TEMP_FAILURE_RETRY(::close(fd));
5136 if (r < 0) {
5137 derr << __func__ << " failed to read from " << path
5138 << ": " << cpp_strerror(r) << dendl;
5139 return r;
5140 }
5141
5142 uint32_t crc, expected_crc;
5143 auto p = bl.cbegin();
5144 try {
5145 decode(*label, p);
5146 bufferlist t;
5147 t.substr_of(bl, 0, p.get_off());
5148 crc = t.crc32c(-1);
5149 decode(expected_crc, p);
5150 }
5151 catch (ceph::buffer::error& e) {
5152 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
5153 << ": " << e.what()
5154 << dendl;
5155 return -ENOENT;
5156 }
5157 if (crc != expected_crc) {
5158 derr << __func__ << " bad crc on label, expected " << expected_crc
5159 << " != actual " << crc << dendl;
5160 return -EIO;
5161 }
5162 dout(10) << __func__ << " got " << *label << dendl;
5163 return 0;
5164 }
5165
5166 int BlueStore::_check_or_set_bdev_label(
5167 string path, uint64_t size, string desc, bool create)
5168 {
5169 bluestore_bdev_label_t label;
5170 if (create) {
5171 label.osd_uuid = fsid;
5172 label.size = size;
5173 label.btime = ceph_clock_now();
5174 label.description = desc;
5175 int r = _write_bdev_label(cct, path, label);
5176 if (r < 0)
5177 return r;
5178 } else {
5179 int r = _read_bdev_label(cct, path, &label);
5180 if (r < 0)
5181 return r;
5182 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
5183 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5184 << " and fsid " << fsid << " check bypassed" << dendl;
5185 } else if (label.osd_uuid != fsid) {
5186 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5187 << " does not match our fsid " << fsid << dendl;
5188 return -EIO;
5189 }
5190 }
5191 return 0;
5192 }
5193
5194 void BlueStore::_set_alloc_sizes(void)
5195 {
5196 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
5197
5198 if (cct->_conf->bluestore_prefer_deferred_size) {
5199 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
5200 } else {
5201 ceph_assert(bdev);
5202 if (_use_rotational_settings()) {
5203 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
5204 } else {
5205 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
5206 }
5207 }
5208
5209 if (cct->_conf->bluestore_deferred_batch_ops) {
5210 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
5211 } else {
5212 ceph_assert(bdev);
5213 if (_use_rotational_settings()) {
5214 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
5215 } else {
5216 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
5217 }
5218 }
5219
5220 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
5221 << std::dec << " order " << (int)min_alloc_size_order
5222 << " max_alloc_size 0x" << std::hex << max_alloc_size
5223 << " prefer_deferred_size 0x" << prefer_deferred_size
5224 << std::dec
5225 << " deferred_batch_ops " << deferred_batch_ops
5226 << dendl;
5227 }
5228
5229 int BlueStore::_open_bdev(bool create)
5230 {
5231 ceph_assert(bdev == NULL);
5232 string p = path + "/block";
5233 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
5234 int r = bdev->open(p);
5235 if (r < 0)
5236 goto fail;
5237
5238 if (create && cct->_conf->bdev_enable_discard) {
5239 bdev->discard(0, bdev->get_size());
5240 }
5241
5242 if (bdev->supported_bdev_label()) {
5243 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
5244 if (r < 0)
5245 goto fail_close;
5246 }
5247
5248 // initialize global block parameters
5249 block_size = bdev->get_block_size();
5250 block_mask = ~(block_size - 1);
5251 block_size_order = ctz(block_size);
5252 ceph_assert(block_size == 1u << block_size_order);
5253 _set_max_defer_interval();
5254 // and set cache_size based on device type
5255 r = _set_cache_sizes();
5256 if (r < 0) {
5257 goto fail_close;
5258 }
5259
5260 if (bdev->is_smr()) {
5261 freelist_type = "zoned";
5262 }
5263 return 0;
5264
5265 fail_close:
5266 bdev->close();
5267 fail:
5268 delete bdev;
5269 bdev = NULL;
5270 return r;
5271 }
5272
5273 void BlueStore::_validate_bdev()
5274 {
5275 ceph_assert(bdev);
5276 uint64_t dev_size = bdev->get_size();
5277 ceph_assert(dev_size > _get_ondisk_reserved());
5278 }
5279
5280 void BlueStore::_close_bdev()
5281 {
5282 ceph_assert(bdev);
5283 bdev->close();
5284 delete bdev;
5285 bdev = NULL;
5286 }
5287
5288 int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
5289 {
5290 int r;
5291
5292 ceph_assert(fm == NULL);
5293 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
5294 ceph_assert(fm);
5295 if (t) {
5296 // create mode. initialize freespace
5297 dout(20) << __func__ << " initializing freespace" << dendl;
5298 {
5299 bufferlist bl;
5300 bl.append(freelist_type);
5301 t->set(PREFIX_SUPER, "freelist_type", bl);
5302 }
5303 // being able to allocate in units less than bdev block size
5304 // seems to be a bad idea.
5305 ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
5306
5307 uint64_t alloc_size = min_alloc_size;
5308 if (bdev->is_smr()) {
5309 alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
5310 }
5311
5312 fm->create(bdev->get_size(), alloc_size, t);
5313
5314 // allocate superblock reserved space. note that we do not mark
5315 // bluefs space as allocated in the freelist; we instead rely on
5316 // bluefs doing that itself.
5317 auto reserved = _get_ondisk_reserved();
5318 fm->allocate(0, reserved, t);
5319
5320 if (cct->_conf->bluestore_debug_prefill > 0) {
5321 uint64_t end = bdev->get_size() - reserved;
5322 dout(1) << __func__ << " pre-fragmenting freespace, using "
5323 << cct->_conf->bluestore_debug_prefill << " with max free extent "
5324 << cct->_conf->bluestore_debug_prefragment_max << dendl;
5325 uint64_t start = p2roundup(reserved, min_alloc_size);
5326 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
5327 float r = cct->_conf->bluestore_debug_prefill;
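// convert the target used fraction into a used:free ratio, so that
// allocating ~r*l of space after each free run of length l yields the
// requested overall utilization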
5328 r /= 1.0 - r;
5329 bool stop = false;
5330
5331 while (!stop && start < end) {
5332 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
5333 if (start + l > end) {
5334 l = end - start;
5335 l = p2align(l, min_alloc_size);
5336 }
5337 ceph_assert(start + l <= end);
5338
5339 uint64_t u = 1 + (uint64_t)(r * (double)l);
5340 u = p2roundup(u, min_alloc_size);
5341 if (start + l + u > end) {
5342 u = end - (start + l);
5343 // trim to align so we don't overflow again
5344 u = p2align(u, min_alloc_size);
5345 stop = true;
5346 }
5347 ceph_assert(start + l + u <= end);
5348
5349 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
5350 << " use 0x" << u << std::dec << dendl;
5351
5352 if (u == 0) {
5353 // break if u has been trimmed to nothing
5354 break;
5355 }
5356
5357 fm->allocate(start + l, u, t);
5358 start += l + u;
5359 }
5360 }
5361 r = _write_out_fm_meta(0);
5362 ceph_assert(r == 0);
5363 } else {
5364 r = fm->init(db, read_only,
5365 [&](const std::string& key, std::string* result) {
5366 return read_meta(key, result);
5367 });
5368 if (r < 0) {
5369 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
5370 delete fm;
5371 fm = NULL;
5372 return r;
5373 }
5374 }
5375 // If the space size tracked by the freelist manager is larger than the
5376 // actual device size, we can hit out-of-space allocations, which will
5377 // result in data loss and/or assertions.
5378 // The user probably altered the device size somehow.
5379 // The only fix for now is to redeploy the OSD.
5380 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5381 ostringstream ss;
5382 ss << "slow device size mismatch detected, "
5383 << " fm size(" << fm->get_size()
5384 << ") > slow device size(" << bdev->get_size()
5385 << "), Please stop using this OSD as it might cause data loss.";
5386 _set_disk_size_mismatch_alert(ss.str());
5387 }
5388 return 0;
5389 }
5390
5391 void BlueStore::_close_fm()
5392 {
5393 dout(10) << __func__ << dendl;
5394 ceph_assert(fm);
5395 fm->shutdown();
5396 delete fm;
5397 fm = NULL;
5398 }
5399
5400 int BlueStore::_write_out_fm_meta(uint64_t target_size)
5401 {
5402 int r = 0;
5403 string p = path + "/block";
5404
5405 std::vector<std::pair<string, string>> fm_meta;
5406 fm->get_meta(target_size, &fm_meta);
5407
5408 for (auto& m : fm_meta) {
5409 r = write_meta(m.first, m.second);
5410 ceph_assert(r == 0);
5411 }
5412 return r;
5413 }
5414
5415 int BlueStore::_create_alloc()
5416 {
5417 ceph_assert(shared_alloc.a == NULL);
5418 ceph_assert(bdev->get_size());
5419
5420 uint64_t alloc_size = min_alloc_size;
5421 if (bdev->is_smr()) {
5422 int r = _zoned_check_config_settings();
5423 if (r < 0)
5424 return r;
5425 alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
5426 }
5427
5428 shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator,
5429 bdev->get_size(),
5430 alloc_size, "block"));
5431
5432 if (!shared_alloc.a) {
5433 lderr(cct) << __func__ << "Failed to create allocator:: "
5434 << cct->_conf->bluestore_allocator
5435 << dendl;
5436 return -EINVAL;
5437 }
5438 return 0;
5439 }
5440
5441 int BlueStore::_init_alloc()
5442 {
5443 int r = _create_alloc();
5444 if (r < 0) {
5445 return r;
5446 }
5447 ceph_assert(shared_alloc.a != NULL);
5448
5449 if (bdev->is_smr()) {
5450 shared_alloc.a->zoned_set_zone_states(fm->get_zone_states(db));
5451 }
5452
5453 uint64_t num = 0, bytes = 0;
5454
5455 dout(1) << __func__ << " opening allocation metadata" << dendl;
5456 // initialize from freelist
5457 fm->enumerate_reset();
5458 uint64_t offset, length;
5459 while (fm->enumerate_next(db, &offset, &length)) {
5460 shared_alloc.a->init_add_free(offset, length);
5461 ++num;
5462 bytes += length;
5463 }
5464 fm->enumerate_reset();
5465
5466 dout(1) << __func__
5467 << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
5468 << std::hex
5469 << ", allocator type " << shared_alloc.a->get_type()
5470 << ", capacity 0x" << shared_alloc.a->get_capacity()
5471 << ", block size 0x" << shared_alloc.a->get_block_size()
5472 << ", free 0x" << shared_alloc.a->get_free()
5473 << ", fragmentation " << shared_alloc.a->get_fragmentation()
5474 << std::dec << dendl;
5475
5476 return 0;
5477 }
5478
5479 void BlueStore::_close_alloc()
5480 {
5481 ceph_assert(bdev);
5482 bdev->discard_drain();
5483
5484 ceph_assert(shared_alloc.a);
5485 shared_alloc.a->shutdown();
5486 delete shared_alloc.a;
5487 shared_alloc.reset();
5488 }
5489
5490 int BlueStore::_open_fsid(bool create)
5491 {
5492 ceph_assert(fsid_fd < 0);
5493 int flags = O_RDWR|O_CLOEXEC;
5494 if (create)
5495 flags |= O_CREAT;
5496 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
5497 if (fsid_fd < 0) {
5498 int err = -errno;
5499 derr << __func__ << " " << cpp_strerror(err) << dendl;
5500 return err;
5501 }
5502 return 0;
5503 }
5504
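// The fsid file contains the textual UUID (36 characters, e.g.
// "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"), optionally followed by a newline;
// the 40-byte buffer below leaves room for that plus a terminating NUL.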
5505 int BlueStore::_read_fsid(uuid_d *uuid)
5506 {
5507 char fsid_str[40];
5508 memset(fsid_str, 0, sizeof(fsid_str));
5509 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5510 if (ret < 0) {
5511 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5512 return ret;
5513 }
5514 if (ret > 36)
5515 fsid_str[36] = 0;
5516 else
5517 fsid_str[ret] = 0;
5518 if (!uuid->parse(fsid_str)) {
5519 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5520 return -EINVAL;
5521 }
5522 return 0;
5523 }
5524
5525 int BlueStore::_write_fsid()
5526 {
5527 int r = ::ftruncate(fsid_fd, 0);
5528 if (r < 0) {
5529 r = -errno;
5530 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5531 return r;
5532 }
5533 string str = stringify(fsid) + "\n";
5534 r = safe_write(fsid_fd, str.c_str(), str.length());
5535 if (r < 0) {
5536 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5537 return r;
5538 }
5539 r = ::fsync(fsid_fd);
5540 if (r < 0) {
5541 r = -errno;
5542 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5543 return r;
5544 }
5545 return 0;
5546 }
5547
5548 void BlueStore::_close_fsid()
5549 {
5550 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5551 fsid_fd = -1;
5552 }
5553
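// Takes an advisory write lock (F_SETLK) on the fsid file; with the struct
// zeroed, l_start == 0 and l_len == 0 cover the whole file, and the lock is
// released automatically when the descriptor is closed or the process exits,
// so there is no explicit unlock path.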
5554 int BlueStore::_lock_fsid()
5555 {
5556 struct flock l;
5557 memset(&l, 0, sizeof(l));
5558 l.l_type = F_WRLCK;
5559 l.l_whence = SEEK_SET;
5560 int r = ::fcntl(fsid_fd, F_SETLK, &l);
5561 if (r < 0) {
5562 int err = errno;
5563 derr << __func__ << " failed to lock " << path << "/fsid"
5564 << " (is another ceph-osd still running?)"
5565 << cpp_strerror(err) << dendl;
5566 return -err;
5567 }
5568 return 0;
5569 }
5570
5571 bool BlueStore::is_rotational()
5572 {
5573 if (bdev) {
5574 return bdev->is_rotational();
5575 }
5576
5577 bool rotational = true;
5578 int r = _open_path();
5579 if (r < 0)
5580 goto out;
5581 r = _open_fsid(false);
5582 if (r < 0)
5583 goto out_path;
5584 r = _read_fsid(&fsid);
5585 if (r < 0)
5586 goto out_fsid;
5587 r = _lock_fsid();
5588 if (r < 0)
5589 goto out_fsid;
5590 r = _open_bdev(false);
5591 if (r < 0)
5592 goto out_fsid;
5593 rotational = bdev->is_rotational();
5594 _close_bdev();
5595 out_fsid:
5596 _close_fsid();
5597 out_path:
5598 _close_path();
5599 out:
5600 return rotational;
5601 }
5602
5603 bool BlueStore::is_journal_rotational()
5604 {
5605 if (!bluefs) {
5606 dout(5) << __func__ << " bluefs disabled, default to store media type"
5607 << dendl;
5608 return is_rotational();
5609 }
5610 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
5611 return bluefs->wal_is_rotational();
5612 }
5613
5614 bool BlueStore::_use_rotational_settings()
5615 {
5616 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
5617 return true;
5618 }
5619 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
5620 return false;
5621 }
5622 return bdev->is_rotational();
5623 }
5624
5625 bool BlueStore::test_mount_in_use()
5626 {
5627 // most error conditions mean the mount is not in use (e.g., because
5628 // it doesn't exist). only if we fail to lock do we conclude it is
5629 // in use.
5630 bool ret = false;
5631 int r = _open_path();
5632 if (r < 0)
5633 return false;
5634 r = _open_fsid(false);
5635 if (r < 0)
5636 goto out_path;
5637 r = _lock_fsid();
5638 if (r < 0)
5639 ret = true; // if we can't lock, it is in use
5640 _close_fsid();
5641 out_path:
5642 _close_path();
5643 return ret;
5644 }
5645
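// Probes the OSD directory for optional block.db / block.wal symlinks.
// If block.db is present it becomes the dedicated DB device and the main
// block device is demoted to BDEV_SLOW; otherwise the main device is shared
// as BDEV_DB. block.wal, if present, becomes the dedicated WAL device.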
5646 int BlueStore::_minimal_open_bluefs(bool create)
5647 {
5648 int r;
5649 bluefs = new BlueFS(cct);
5650
5651 string bfn;
5652 struct stat st;
5653
5654 bfn = path + "/block.db";
5655 if (::stat(bfn.c_str(), &st) == 0) {
5656 r = bluefs->add_block_device(
5657 BlueFS::BDEV_DB, bfn,
5658 create && cct->_conf->bdev_enable_discard,
5659 SUPER_RESERVED);
5660 if (r < 0) {
5661 derr << __func__ << " add block device(" << bfn << ") returned: "
5662 << cpp_strerror(r) << dendl;
5663 goto free_bluefs;
5664 }
5665
5666 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
5667 r = _check_or_set_bdev_label(
5668 bfn,
5669 bluefs->get_block_device_size(BlueFS::BDEV_DB),
5670 "bluefs db", create);
5671 if (r < 0) {
5672 derr << __func__
5673 << " check block device(" << bfn << ") label returned: "
5674 << cpp_strerror(r) << dendl;
5675 goto free_bluefs;
5676 }
5677 }
5678 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
5679 bluefs_layout.dedicated_db = true;
5680 } else {
5681 r = -errno;
5682 if (::lstat(bfn.c_str(), &st) == -1) {
5683 r = 0;
5684 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
5685 } else {
5686 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5687 << cpp_strerror(r) << dendl;
5688 goto free_bluefs;
5689 }
5690 }
5691
5692 // shared device
5693 bfn = path + "/block";
5694 // never trim here
5695 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
5696 0, // no need to provide valid 'reserved' for shared dev
5697 &shared_alloc);
5698 if (r < 0) {
5699 derr << __func__ << " add block device(" << bfn << ") returned: "
5700 << cpp_strerror(r) << dendl;
5701 goto free_bluefs;
5702 }
5703
5704 bfn = path + "/block.wal";
5705 if (::stat(bfn.c_str(), &st) == 0) {
5706 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
5707 create && cct->_conf->bdev_enable_discard,
5708 BDEV_LABEL_BLOCK_SIZE);
5709 if (r < 0) {
5710 derr << __func__ << " add block device(" << bfn << ") returned: "
5711 << cpp_strerror(r) << dendl;
5712 goto free_bluefs;
5713 }
5714
5715 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
5716 r = _check_or_set_bdev_label(
5717 bfn,
5718 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
5719 "bluefs wal", create);
5720 if (r < 0) {
5721 derr << __func__ << " check block device(" << bfn
5722 << ") label returned: " << cpp_strerror(r) << dendl;
5723 goto free_bluefs;
5724 }
5725 }
5726
5727 bluefs_layout.dedicated_wal = true;
5728 } else {
5729 r = 0;
5730 if (::lstat(bfn.c_str(), &st) != -1) {
5731 r = -errno;
5732 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5733 << cpp_strerror(r) << dendl;
5734 goto free_bluefs;
5735 }
5736 }
5737 return 0;
5738
5739 free_bluefs:
5740 ceph_assert(bluefs);
5741 delete bluefs;
5742 bluefs = NULL;
5743 return r;
5744 }
5745
5746 int BlueStore::_open_bluefs(bool create, bool read_only)
5747 {
5748 int r = _minimal_open_bluefs(create);
5749 if (r < 0) {
5750 return r;
5751 }
5752 BlueFSVolumeSelector* vselector = nullptr;
5753 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
5754
5755 string options = cct->_conf->bluestore_rocksdb_options;
5756 string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
5757 if (!options_annex.empty()) {
5758 if (!options.empty() &&
5759 *options.rbegin() != ',') {
5760 options += ',';
5761 }
5762 options += options_annex;
5763 }
5764
5765 rocksdb::Options rocks_opts;
5766 r = RocksDBStore::ParseOptionsFromStringStatic(
5767 cct,
5768 options,
5769 rocks_opts,
5770 nullptr);
5771 if (r < 0) {
5772 return r;
5773 }
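// Device sizes handed to the volume selector are derated to 95% to leave
// some headroom; the selector then decides on which BlueFS device (WAL, DB
// or the slow/shared one) each RocksDB file should land.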
5774 if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
5775 vselector = new FitToFastVolumeSelector(
5776 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
5777 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
5778 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
5779 } else {
5780 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
5781 vselector =
5782 new RocksDBBlueFSVolumeSelector(
5783 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
5784 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
5785 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
5786 1024 * 1024 * 1024, //FIXME: set expected l0 size here
5787 rocks_opts.max_bytes_for_level_base,
5788 rocks_opts.max_bytes_for_level_multiplier,
5789 reserved_factor,
5790 cct->_conf->bluestore_volume_selection_reserved,
5791 cct->_conf->bluestore_volume_selection_policy == "use_some_extra");
5792 }
5793 }
5794 if (create) {
5795 bluefs->mkfs(fsid, bluefs_layout);
5796 }
5797 bluefs->set_volume_selector(vselector);
5798 r = bluefs->mount();
5799 if (r < 0) {
5800 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
5801 }
5802 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
5803 return r;
5804 }
5805
5806 void BlueStore::_close_bluefs(bool cold_close)
5807 {
5808 bluefs->umount(cold_close);
5809 _minimal_close_bluefs();
5810 }
5811
5812 void BlueStore::_minimal_close_bluefs()
5813 {
5814 delete bluefs;
5815 bluefs = NULL;
5816 }
5817
5818 int BlueStore::_is_bluefs(bool create, bool* ret)
5819 {
5820 if (create) {
5821 *ret = cct->_conf->bluestore_bluefs;
5822 } else {
5823 string s;
5824 int r = read_meta("bluefs", &s);
5825 if (r < 0) {
5826 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
5827 return -EIO;
5828 }
5829 if (s == "1") {
5830 *ret = true;
5831 } else if (s == "0") {
5832 *ret = false;
5833 } else {
5834 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
5835 << dendl;
5836 return -EIO;
5837 }
5838 }
5839 return 0;
5840 }
5841
5842 /*
5843 * Opens the DB and the dependent super_meta, FreelistManager and allocator
5844 * in the proper order.
5845 */
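// Rough sequence: verify the "type" meta, open path/fsid/bdev, open the DB
// read-only, load super meta, freelist and allocator (BlueFS may need them),
// then close the DB and re-open it in the requested read-only/repair mode.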
5846 int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
5847 {
5848 dout(0) << __func__ << " read-only:" << read_only
5849 << " repair:" << to_repair << dendl;
5850 {
5851 string type;
5852 int r = read_meta("type", &type);
5853 if (r < 0) {
5854 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5855 << dendl;
5856 return r;
5857 }
5858
5859 if (type != "bluestore") {
5860 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5861 return -EIO;
5862 }
5863 }
5864
5865 int r = _open_path();
5866 if (r < 0)
5867 return r;
5868 r = _open_fsid(false);
5869 if (r < 0)
5870 goto out_path;
5871
5872 r = _read_fsid(&fsid);
5873 if (r < 0)
5874 goto out_fsid;
5875
5876 r = _lock_fsid();
5877 if (r < 0)
5878 goto out_fsid;
5879
5880 r = _open_bdev(false);
5881 if (r < 0)
5882 goto out_fsid;
5883
5884 // open in read-only first to read FM list and init allocator
5885 // as they might be needed for some BlueFS procedures
5886 r = _open_db(false, false, true);
5887 if (r < 0)
5888 goto out_bdev;
5889
5890 r = _open_super_meta();
5891 if (r < 0) {
5892 goto out_db;
5893 }
5894
5895 r = _open_fm(nullptr, true);
5896 if (r < 0)
5897 goto out_db;
5898
5899 r = _init_alloc();
5900 if (r < 0)
5901 goto out_fm;
5902
5903 // Re-open in the proper mode(s).
5904
5905 // We can't simply skip the second open even in read-only mode, as we
5906 // still need to load the extents allocated to bluefs into the allocator
5907 // before proceeding.
5908 //
5909 _close_db(true);
5910
5911 r = _open_db(false, to_repair, read_only);
5912 if (r < 0) {
5913 goto out_alloc;
5914 }
5915 return 0;
5916
5917 out_alloc:
5918 _close_alloc();
5919 out_fm:
5920 _close_fm();
5921 out_db:
5922 _close_db(read_only);
5923 out_bdev:
5924 _close_bdev();
5925 out_fsid:
5926 _close_fsid();
5927 out_path:
5928 _close_path();
5929 return r;
5930 }
5931
5932 void BlueStore::_close_db_and_around(bool read_only)
5933 {
5934 _close_db(read_only);
5935 _close_fm();
5936 _close_alloc();
5937 _close_bdev();
5938 _close_fsid();
5939 _close_path();
5940 }
5941
5942 int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
5943 {
5944 _kv_only = true;
5945 int r = _open_db_and_around(false, to_repair);
5946 if (r == 0) {
5947 *pdb = db;
5948 } else {
5949 *pdb = nullptr;
5950 }
5951 return r;
5952 }
5953
5954 int BlueStore::close_db_environment()
5955 {
5956 _close_db_and_around(false);
5957 return 0;
5958 }
5959
5960 int BlueStore::_prepare_db_environment(bool create, bool read_only,
5961 std::string* _fn, std::string* _kv_backend)
5962 {
5963 int r;
5964 ceph_assert(!db);
5965 std::string& fn=*_fn;
5966 std::string& kv_backend=*_kv_backend;
5967 fn = path + "/db";
5968 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
5969
5970 if (create) {
5971 kv_backend = cct->_conf->bluestore_kvbackend;
5972 } else {
5973 r = read_meta("kv_backend", &kv_backend);
5974 if (r < 0) {
5975 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
5976 return -EIO;
5977 }
5978 }
5979 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
5980
5981 bool do_bluefs;
5982 r = _is_bluefs(create, &do_bluefs);
5983 if (r < 0) {
5984 return r;
5985 }
5986 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
5987
5988 map<string,string> kv_options;
5989 // force separate wal dir for all new deployments.
5990 kv_options["separate_wal_dir"] = "1";
5991 rocksdb::Env *env = NULL;
5992 if (do_bluefs) {
5993 dout(10) << __func__ << " initializing bluefs" << dendl;
5994 if (kv_backend != "rocksdb") {
5995 derr << " backend must be rocksdb to use bluefs" << dendl;
5996 return -EINVAL;
5997 }
5998
5999 r = _open_bluefs(create, read_only);
6000 if (r < 0) {
6001 return r;
6002 }
6003
6004 if (cct->_conf->bluestore_bluefs_env_mirror) {
6005 rocksdb::Env* a = new BlueRocksEnv(bluefs);
6006 rocksdb::Env* b = rocksdb::Env::Default();
6007 if (create) {
6008 string cmd = "rm -rf " + path + "/db " +
6009 path + "/db.slow " +
6010 path + "/db.wal";
6011 int r = system(cmd.c_str());
6012 (void)r;
6013 }
6014 env = new rocksdb::EnvMirror(b, a, false, true);
6015 } else {
6016 env = new BlueRocksEnv(bluefs);
6017
6018 // simplify the dir names, too, as "seen" by rocksdb
6019 fn = "db";
6020 }
6021 BlueFSVolumeSelector::paths paths;
6022 bluefs->get_vselector_paths(fn, paths);
6023
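// Build the rocksdb "db_paths" option as space-separated "<dir>,<bytes>"
// pairs, e.g. (hypothetical sizes) "db,64424509440 db.slow,960197124096".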
6024 {
6025 ostringstream db_paths;
6026 bool first = true;
6027 for (auto& p : paths) {
6028 if (!first) {
6029 db_paths << " ";
6030 }
6031 first = false;
6032 db_paths << p.first << "," << p.second;
6033
6034 }
6035 kv_options["db_paths"] = db_paths.str();
6036 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
6037 }
6038
6039 if (create) {
6040 for (auto& p : paths) {
6041 env->CreateDir(p.first);
6042 }
6043 // Selectors don't provide a wal path so far, hence create it explicitly
6044 env->CreateDir(fn + ".wal");
6045 } else {
6046 std::vector<std::string> res;
6047 // check for dir presence
6048 auto r = env->GetChildren(fn+".wal", &res);
6049 if (r.IsNotFound()) {
6050 kv_options.erase("separate_wal_dir");
6051 }
6052 }
6053 } else {
6054 string walfn = path + "/db.wal";
6055
6056 if (create) {
6057 int r = ::mkdir(fn.c_str(), 0755);
6058 if (r < 0)
6059 r = -errno;
6060 if (r < 0 && r != -EEXIST) {
6061 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
6062 << dendl;
6063 return r;
6064 }
6065
6066 // wal_dir, too!
6067 r = ::mkdir(walfn.c_str(), 0755);
6068 if (r < 0)
6069 r = -errno;
6070 if (r < 0 && r != -EEXIST) {
6071 derr << __func__ << " failed to create " << walfn
6072 << ": " << cpp_strerror(r)
6073 << dendl;
6074 return r;
6075 }
6076 } else {
6077 struct stat st;
6078 r = ::stat(walfn.c_str(), &st);
6079 if (r < 0 && errno == ENOENT) {
6080 kv_options.erase("separate_wal_dir");
6081 }
6082 }
6083 }
6084
6085
6086 db = KeyValueDB::create(cct,
6087 kv_backend,
6088 fn,
6089 kv_options,
6090 static_cast<void*>(env));
6091 if (!db) {
6092 derr << __func__ << " error creating db" << dendl;
6093 if (bluefs) {
6094 _close_bluefs(read_only);
6095 }
6096 // delete env manually here since we can't depend on db to do this
6097 // in this case
6098 delete env;
6099 env = NULL;
6100 return -EIO;
6101 }
6102
6103 FreelistManager::setup_merge_operators(db, freelist_type);
6104 db->set_merge_operator(PREFIX_STAT, merge_op);
6105 db->set_cache_size(cache_kv_ratio * cache_size);
6106 return 0;
6107 }
6108
6109 int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
6110 {
6111 int r;
6112 ceph_assert(!(create && read_only));
6113 string options;
6114 string options_annex;
6115 stringstream err;
6116 string kv_dir_fn;
6117 string kv_backend;
6118 std::string sharding_def;
6119 r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
6120 if (r < 0) {
6121 derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
6122 return -EIO;
6123 }
6124 if (kv_backend == "rocksdb") {
6125 options = cct->_conf->bluestore_rocksdb_options;
6126 options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6127 if (!options_annex.empty()) {
6128 if (!options.empty() &&
6129 *options.rbegin() != ',') {
6130 options += ',';
6131 }
6132 options += options_annex;
6133 }
6134
6135 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
6136 sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
6137 }
6138 }
6139
6140 db->init(options);
6141 if (to_repair_db)
6142 return 0;
6143 if (create) {
6144 r = db->create_and_open(err, sharding_def);
6145 } else {
6146 // we pass in cf list here, but it is only used if the db already has
6147 // column families created.
6148 r = read_only ?
6149 db->open_read_only(err, sharding_def) :
6150 db->open(err, sharding_def);
6151 }
6152 if (r) {
6153 derr << __func__ << " error opening db: " << err.str() << dendl;
6154 _close_db(read_only);
6155 return -EIO;
6156 }
6157 dout(1) << __func__ << " opened " << kv_backend
6158 << " path " << kv_dir_fn << " options " << options << dendl;
6159 return 0;
6160 }
6161
6162 void BlueStore::_close_db(bool cold_close)
6163 {
6164 ceph_assert(db);
6165 delete db;
6166 db = NULL;
6167 if (bluefs) {
6168 _close_bluefs(cold_close);
6169 }
6170 }
6171
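// Dumps the allocator state after a BlueFS allocation failure, but at most
// once per bluestore_bluefs_alloc_failure_dump_interval seconds to keep the
// log readable.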
6172 void BlueStore::_dump_alloc_on_failure()
6173 {
6174 auto dump_interval =
6175 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
6176 if (dump_interval > 0 &&
6177 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
6178 shared_alloc.a->dump();
6179 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
6180 next_dump_on_bluefs_alloc_failure += dump_interval;
6181 }
6182 }
6183
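// Keys under PREFIX_COLL are stringified coll_t names (e.g. "meta" or
// "<pgid>_head"); entries that fail coll_t::parse() are only flagged here
// and reported/counted later by _fsck_collections().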
6184 int BlueStore::_open_collections()
6185 {
6186 dout(10) << __func__ << dendl;
6187 collections_had_errors = false;
6188 ceph_assert(coll_map.empty());
6189 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6190 for (it->upper_bound(string());
6191 it->valid();
6192 it->next()) {
6193 coll_t cid;
6194 if (cid.parse(it->key())) {
6195 auto c = ceph::make_ref<Collection>(
6196 this,
6197 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6198 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6199 cid);
6200 bufferlist bl = it->value();
6201 auto p = bl.cbegin();
6202 try {
6203 decode(c->cnode, p);
6204 } catch (ceph::buffer::error& e) {
6205 derr << __func__ << " failed to decode cnode, key:"
6206 << pretty_binary_string(it->key()) << dendl;
6207 return -EIO;
6208 }
6209 dout(20) << __func__ << " opened " << cid << " " << c
6210 << " " << c->cnode << dendl;
6211 _osr_attach(c.get());
6212 coll_map[cid] = c;
6213
6214 } else {
6215 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6216 collections_had_errors = true;
6217 }
6218 }
6219 return 0;
6220 }
6221
6222 void BlueStore::_fsck_collections(int64_t* errors)
6223 {
6224 if (collections_had_errors) {
6225 dout(10) << __func__ << dendl;
6226 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
6227 for (it->upper_bound(string());
6228 it->valid();
6229 it->next()) {
6230 coll_t cid;
6231 if (!cid.parse(it->key())) {
6232 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6233 if (errors) {
6234 (*errors)++;
6235 }
6236 }
6237 }
6238 }
6239 }
6240
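// The "per_pool_omap" super key records which omap key-naming scheme is in
// use: per-PG, per-pool, or, when the key is absent, the legacy bulk scheme.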
6241 void BlueStore::_set_per_pool_omap()
6242 {
6243 per_pool_omap = OMAP_BULK;
6244 bufferlist bl;
6245 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6246 if (bl.length()) {
6247 auto s = bl.to_str();
6248 if (s == stringify(OMAP_PER_POOL)) {
6249 per_pool_omap = OMAP_PER_POOL;
6250 } else {
6251 ceph_assert(s == stringify(OMAP_PER_PG));
6252 per_pool_omap = OMAP_PER_PG;
6253 }
6254 dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
6255 } else {
6256 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6257 }
6258 _check_no_per_pg_or_pool_omap_alert();
6259 }
6260
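// Loads space-usage statistics: a single legacy record under
// BLUESTORE_GLOBAL_STATFS_KEY means global accounting; otherwise one record
// per pool is decoded and aggregated into vstatfs.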
6261 void BlueStore::_open_statfs()
6262 {
6263 osd_pools.clear();
6264 vstatfs.reset();
6265
6266 bufferlist bl;
6267 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
6268 if (r >= 0) {
6269 per_pool_stat_collection = false;
6270 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
6271 auto it = bl.cbegin();
6272 vstatfs.decode(it);
6273 dout(10) << __func__ << " store_statfs is found" << dendl;
6274 } else {
6275 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6276 }
6277 _check_legacy_statfs_alert();
6278 } else {
6279 per_pool_stat_collection = true;
6280 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
6281 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
6282 for (it->upper_bound(string());
6283 it->valid();
6284 it->next()) {
6285
6286 uint64_t pool_id;
6287 int r = get_key_pool_stat(it->key(), &pool_id);
6288 ceph_assert(r == 0);
6289
6290 bufferlist bl;
6291 bl = it->value();
6292 auto p = bl.cbegin();
6293 auto& st = osd_pools[pool_id];
6294 try {
6295 st.decode(p);
6296 vstatfs += st;
6297
6298 dout(30) << __func__ << " pool " << pool_id
6299 << " statfs " << st << dendl;
6300 } catch (ceph::buffer::error& e) {
6301 derr << __func__ << " failed to decode pool stats, key:"
6302 << pretty_binary_string(it->key()) << dendl;
6303 }
6304 }
6305 }
6306 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6307
6308 }
6309
6310 int BlueStore::_setup_block_symlink_or_file(
6311 string name,
6312 string epath,
6313 uint64_t size,
6314 bool create)
6315 {
6316 dout(20) << __func__ << " name " << name << " path " << epath
6317 << " size " << size << " create=" << (int)create << dendl;
6318 int r = 0;
6319 int flags = O_RDWR|O_CLOEXEC;
6320 if (create)
6321 flags |= O_CREAT;
6322 if (epath.length()) {
6323 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6324 if (r < 0) {
6325 r = -errno;
6326 derr << __func__ << " failed to create " << name << " symlink to "
6327 << epath << ": " << cpp_strerror(r) << dendl;
6328 return r;
6329 }
6330
6331 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6332 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6333 if (fd < 0) {
6334 r = -errno;
6335 derr << __func__ << " failed to open " << epath << " file: "
6336 << cpp_strerror(r) << dendl;
6337 return r;
6338 }
6339 // write the Transport ID of the NVMe device
6340 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6341 // where "0000:02:00.0" is the selector of a PCI device, see
6342 // the first column of "lspci -mm -n -D"
6343 string trid{"trtype:PCIe "};
6344 trid += "traddr:";
6345 trid += epath.substr(strlen(SPDK_PREFIX));
6346 r = ::write(fd, trid.c_str(), trid.size());
6347 ceph_assert(r == static_cast<int>(trid.size()));
6348 dout(1) << __func__ << " created " << name << " symlink to "
6349 << epath << dendl;
6350 VOID_TEMP_FAILURE_RETRY(::close(fd));
6351 }
6352 }
6353 if (size) {
6354 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6355 if (fd >= 0) {
6356 // block file is present
6357 struct stat st;
6358 int r = ::fstat(fd, &st);
6359 if (r == 0 &&
6360 S_ISREG(st.st_mode) && // if it is a regular file
6361 st.st_size == 0) { // and is 0 bytes
6362 r = ::ftruncate(fd, size);
6363 if (r < 0) {
6364 r = -errno;
6365 derr << __func__ << " failed to resize " << name << " file to "
6366 << size << ": " << cpp_strerror(r) << dendl;
6367 VOID_TEMP_FAILURE_RETRY(::close(fd));
6368 return r;
6369 }
6370
6371 if (cct->_conf->bluestore_block_preallocate_file) {
6372 r = ::ceph_posix_fallocate(fd, 0, size);
6373 if (r > 0) {
6374 derr << __func__ << " failed to preallocate " << name << " file to "
6375 << size << ": " << cpp_strerror(r) << dendl;
6376 VOID_TEMP_FAILURE_RETRY(::close(fd));
6377 return -r;
6378 }
6379 }
6380 dout(1) << __func__ << " resized " << name << " file to "
6381 << byte_u_t(size) << dendl;
6382 }
6383 VOID_TEMP_FAILURE_RETRY(::close(fd));
6384 } else {
6385 int r = -errno;
6386 if (r != -ENOENT) {
6387 derr << __func__ << " failed to open " << name << " file: "
6388 << cpp_strerror(r) << dendl;
6389 return r;
6390 }
6391 }
6392 }
6393 return 0;
6394 }
6395
6396 int BlueStore::mkfs()
6397 {
6398 dout(1) << __func__ << " path " << path << dendl;
6399 int r;
6400 uuid_d old_fsid;
6401 uint64_t reserved;
6402 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6403 derr << __func__ << " osd_max_object_size "
6404 << cct->_conf->osd_max_object_size << " > bluestore max "
6405 << OBJECT_MAX_SIZE << dendl;
6406 return -EINVAL;
6407 }
6408
6409 {
6410 string done;
6411 r = read_meta("mkfs_done", &done);
6412 if (r == 0) {
6413 dout(1) << __func__ << " already created" << dendl;
6414 if (cct->_conf->bluestore_fsck_on_mkfs) {
6415 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6416 if (r < 0) {
6417 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6418 << dendl;
6419 return r;
6420 }
6421 if (r > 0) {
6422 derr << __func__ << " fsck found " << r << " errors" << dendl;
6423 r = -EIO;
6424 }
6425 }
6426 return r; // idempotent
6427 }
6428 }
6429
6430 {
6431 string type;
6432 r = read_meta("type", &type);
6433 if (r == 0) {
6434 if (type != "bluestore") {
6435 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6436 return -EIO;
6437 }
6438 } else {
6439 r = write_meta("type", "bluestore");
6440 if (r < 0)
6441 return r;
6442 }
6443 }
6444
6445 freelist_type = "bitmap";
6446
6447 r = _open_path();
6448 if (r < 0)
6449 return r;
6450
6451 r = _open_fsid(true);
6452 if (r < 0)
6453 goto out_path_fd;
6454
6455 r = _lock_fsid();
6456 if (r < 0)
6457 goto out_close_fsid;
6458
6459 r = _read_fsid(&old_fsid);
6460 if (r < 0 || old_fsid.is_zero()) {
6461 if (fsid.is_zero()) {
6462 fsid.generate_random();
6463 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6464 } else {
6465 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6466 }
6467 // we'll write it later.
6468 } else {
6469 if (!fsid.is_zero() && fsid != old_fsid) {
6470 derr << __func__ << " on-disk fsid " << old_fsid
6471 << " != provided " << fsid << dendl;
6472 r = -EINVAL;
6473 goto out_close_fsid;
6474 }
6475 fsid = old_fsid;
6476 }
6477
6478 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6479 cct->_conf->bluestore_block_size,
6480 cct->_conf->bluestore_block_create);
6481 if (r < 0)
6482 goto out_close_fsid;
6483 if (cct->_conf->bluestore_bluefs) {
6484 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6485 cct->_conf->bluestore_block_wal_size,
6486 cct->_conf->bluestore_block_wal_create);
6487 if (r < 0)
6488 goto out_close_fsid;
6489 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6490 cct->_conf->bluestore_block_db_size,
6491 cct->_conf->bluestore_block_db_create);
6492 if (r < 0)
6493 goto out_close_fsid;
6494 }
6495
6496 r = _open_bdev(true);
6497 if (r < 0)
6498 goto out_close_fsid;
6499
6500 // choose min_alloc_size
6501 if (cct->_conf->bluestore_min_alloc_size) {
6502 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6503 } else {
6504 ceph_assert(bdev);
6505 if (_use_rotational_settings()) {
6506 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6507 } else {
6508 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6509 }
6510 }
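// The chosen value is persisted in the "min_alloc_size" super key further
// below, so the HDD/SSD-specific defaults only matter at mkfs time.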
6511 _validate_bdev();
6512
6513 // make sure min_alloc_size is a power of 2.
6514 if (!isp2(min_alloc_size)) {
6515 derr << __func__ << " min_alloc_size 0x"
6516 << std::hex << min_alloc_size << std::dec
6517 << " is not power of 2 aligned!"
6518 << dendl;
6519 r = -EINVAL;
6520 goto out_close_bdev;
6521 }
6522
6523 r = _create_alloc();
6524 if (r < 0) {
6525 goto out_close_bdev;
6526 }
6527
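// The first `reserved` bytes of the device (the on-disk label/superblock
// reservation) are never handed to the allocator; only the remainder,
// aligned down to min_alloc_size, is registered as free space.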
6528 reserved = _get_ondisk_reserved();
6529 shared_alloc.a->init_add_free(reserved,
6530 p2align(bdev->get_size(), min_alloc_size) - reserved);
6531
6532 r = _open_db(true);
6533 if (r < 0)
6534 goto out_close_alloc;
6535
6536 {
6537 KeyValueDB::Transaction t = db->get_transaction();
6538 r = _open_fm(t, true);
6539 if (r < 0)
6540 goto out_close_db;
6541 {
6542 bufferlist bl;
6543 encode((uint64_t)0, bl);
6544 t->set(PREFIX_SUPER, "nid_max", bl);
6545 t->set(PREFIX_SUPER, "blobid_max", bl);
6546 }
6547
6548 {
6549 bufferlist bl;
6550 encode((uint64_t)min_alloc_size, bl);
6551 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6552 }
6553 {
6554 bufferlist bl;
6555 bl.append(stringify(OMAP_PER_PG));
6556 t->set(PREFIX_SUPER, "per_pool_omap", bl);
6557 }
6558 ondisk_format = latest_ondisk_format;
6559 _prepare_ondisk_format_super(t);
6560 db->submit_transaction_sync(t);
6561 }
6562
6563 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6564 if (r < 0)
6565 goto out_close_fm;
6566
6567 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
6568 if (r < 0)
6569 goto out_close_fm;
6570
6571 if (fsid != old_fsid) {
6572 r = _write_fsid();
6573 if (r < 0) {
6574 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
6575 goto out_close_fm;
6576 }
6577 }
6578
6579 out_close_fm:
6580 _close_fm();
6581 out_close_db:
6582 _close_db(false);
6583 out_close_alloc:
6584 _close_alloc();
6585 out_close_bdev:
6586 _close_bdev();
6587 out_close_fsid:
6588 _close_fsid();
6589 out_path_fd:
6590 _close_path();
6591
6592 if (r == 0 &&
6593 cct->_conf->bluestore_fsck_on_mkfs) {
6594 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6595 if (rc < 0)
6596 return rc;
6597 if (rc > 0) {
6598 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6599 r = -EIO;
6600 }
6601 }
6602
6603 if (r == 0) {
6604 // indicate success by writing the 'mkfs_done' file
6605 r = write_meta("mkfs_done", "yes");
6606 }
6607
6608 if (r < 0) {
6609 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6610 } else {
6611 dout(0) << __func__ << " success" << dendl;
6612 }
6613 return r;
6614 }
6615
6616 int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6617 {
6618 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6619 int r;
6620 ceph_assert(path_fd < 0);
6621
6622 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6623
6624 if (!cct->_conf->bluestore_bluefs) {
6625 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6626 return -EIO;
6627 }
6628
6629 r = _open_db_and_around(true);
6630
6631 if (id == BlueFS::BDEV_NEWWAL) {
6632 string p = path + "/block.wal";
6633 r = _setup_block_symlink_or_file("block.wal", dev_path,
6634 cct->_conf->bluestore_block_wal_size,
6635 true);
6636 ceph_assert(r == 0);
6637
6638 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
6639 cct->_conf->bdev_enable_discard,
6640 BDEV_LABEL_BLOCK_SIZE);
6641 ceph_assert(r == 0);
6642
6643 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6644 r = _check_or_set_bdev_label(
6645 p,
6646 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6647 "bluefs wal",
6648 true);
6649 ceph_assert(r == 0);
6650 }
6651
6652 bluefs_layout.dedicated_wal = true;
6653 } else if (id == BlueFS::BDEV_NEWDB) {
6654 string p = path + "/block.db";
6655 r = _setup_block_symlink_or_file("block.db", dev_path,
6656 cct->_conf->bluestore_block_db_size,
6657 true);
6658 ceph_assert(r == 0);
6659
6660 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
6661 cct->_conf->bdev_enable_discard,
6662 SUPER_RESERVED);
6663 ceph_assert(r == 0);
6664
6665 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6666 r = _check_or_set_bdev_label(
6667 p,
6668 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6669 "bluefs db",
6670 true);
6671 ceph_assert(r == 0);
6672 }
6673 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6674 bluefs_layout.dedicated_db = true;
6675 }
6676
6677 bluefs->umount();
6678 bluefs->mount();
6679
6680 r = bluefs->prepare_new_device(id, bluefs_layout);
6681 ceph_assert(r == 0);
6682
6683 if (r < 0) {
6684 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6685 } else {
6686 dout(0) << __func__ << " success" << dendl;
6687 }
6688
6689 _close_db_and_around(true);
6690 return r;
6691 }
6692
6693 int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6694 int id)
6695 {
6696 dout(10) << __func__ << " id:" << id << dendl;
6697 ceph_assert(path_fd < 0);
6698
6699 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6700
6701 if (!cct->_conf->bluestore_bluefs) {
6702 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6703 return -EIO;
6704 }
6705
6706 int r = _open_db_and_around(true);
6707
6708 uint64_t used_space = 0;
6709 for(auto src_id : devs_source) {
6710 used_space += bluefs->get_used(src_id);
6711 }
6712 uint64_t target_free = bluefs->get_free(id);
6713 if (target_free < used_space) {
6714 derr << __func__
6715 << " can't migrate, free space at target: " << target_free
6716 << " is less than required space: " << used_space
6717 << dendl;
6718 r = -ENOSPC;
6719 goto shutdown;
6720 }
6721 if (devs_source.count(BlueFS::BDEV_DB)) {
6722 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6723 bluefs_layout.dedicated_db = false;
6724 }
6725 if (devs_source.count(BlueFS::BDEV_WAL)) {
6726 bluefs_layout.dedicated_wal = false;
6727 }
6728 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
6729 if (r < 0) {
6730 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6731 goto shutdown;
6732 }
6733
6734 if (devs_source.count(BlueFS::BDEV_DB)) {
6735 r = unlink(string(path + "/block.db").c_str());
6736 ceph_assert(r == 0);
6737 }
6738 if (devs_source.count(BlueFS::BDEV_WAL)) {
6739 r = unlink(string(path + "/block.wal").c_str());
6740 ceph_assert(r == 0);
6741 }
6742
6743 shutdown:
6744 _close_db_and_around(true);
6745 return r;
6746 }
6747
6748 int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
6749 int id,
6750 const string& dev_path)
6751 {
6752 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6753 int r;
6754 ceph_assert(path_fd < 0);
6755
6756 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6757
6758 if (!cct->_conf->bluestore_bluefs) {
6759 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6760 return -EIO;
6761 }
6762
6763 r = _open_db_and_around(true);
6764
6765 string link_db;
6766 string link_wal;
6767 if (devs_source.count(BlueFS::BDEV_DB) &&
6768 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
6769 link_db = path + "/block.db";
6770 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6771 bluefs_layout.dedicated_db = false;
6772 }
6773 if (devs_source.count(BlueFS::BDEV_WAL)) {
6774 link_wal = path + "/block.wal";
6775 bluefs_layout.dedicated_wal = false;
6776 }
6777
6778 size_t target_size;
6779 string target_name;
6780 if (id == BlueFS::BDEV_NEWWAL) {
6781 target_name = "block.wal";
6782 target_size = cct->_conf->bluestore_block_wal_size;
6783 bluefs_layout.dedicated_wal = true;
6784
6785 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
6786 cct->_conf->bdev_enable_discard,
6787 BDEV_LABEL_BLOCK_SIZE);
6788 ceph_assert(r == 0);
6789
6790 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6791 r = _check_or_set_bdev_label(
6792 dev_path,
6793 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6794 "bluefs wal",
6795 true);
6796 ceph_assert(r == 0);
6797 }
6798 } else if (id == BlueFS::BDEV_NEWDB) {
6799 target_name = "block.db";
6800 target_size = cct->_conf->bluestore_block_db_size;
6801 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6802 bluefs_layout.dedicated_db = true;
6803
6804 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
6805 cct->_conf->bdev_enable_discard,
6806 SUPER_RESERVED);
6807 ceph_assert(r == 0);
6808
6809 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6810 r = _check_or_set_bdev_label(
6811 dev_path,
6812 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6813 "bluefs db",
6814 true);
6815 ceph_assert(r == 0);
6816 }
6817 }
6818
6819 bluefs->umount();
6820 bluefs->mount();
6821
6822 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
6823
6824 if (r < 0) {
6825 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6826 goto shutdown;
6827 }
6828
6829 if (!link_db.empty()) {
6830 r = unlink(link_db.c_str());
6831 ceph_assert(r == 0);
6832 }
6833 if (!link_wal.empty()) {
6834 r = unlink(link_wal.c_str());
6835 ceph_assert(r == 0);
6836 }
6837 r = _setup_block_symlink_or_file(
6838 target_name,
6839 dev_path,
6840 target_size,
6841 true);
6842 ceph_assert(r == 0);
6843 dout(0) << __func__ << " success" << dendl;
6844
6845 shutdown:
6846 _close_db_and_around(true);
6847
6848 return r;
6849 }
6850
6851 string BlueStore::get_device_path(unsigned id)
6852 {
6853 string res;
6854 if (id < BlueFS::MAX_BDEV) {
6855 switch (id) {
6856 case BlueFS::BDEV_WAL:
6857 res = path + "/block.wal";
6858 break;
6859 case BlueFS::BDEV_DB:
6860 if (id == bluefs_layout.shared_bdev) {
6861 res = path + "/block";
6862 } else {
6863 res = path + "/block.db";
6864 }
6865 break;
6866 case BlueFS::BDEV_SLOW:
6867 res = path + "/block";
6868 break;
6869 }
6870 }
6871 return res;
6872 }
6873
6874 int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
6875 {
6876 bluestore_bdev_label_t label;
6877 int r = _read_bdev_label(cct, path, &label);
6878 if (r < 0) {
6879 derr << "unable to read label for " << path << ": "
6880 << cpp_strerror(r) << dendl;
6881 } else {
6882 label.size = size;
6883 r = _write_bdev_label(cct, path, label);
6884 if (r < 0) {
6885 derr << "unable to write label for " << path << ": "
6886 << cpp_strerror(r) << dendl;
6887 }
6888 }
6889 return r;
6890 }
6891
6892 int BlueStore::expand_devices(ostream& out)
6893 {
6894 int r = _open_db_and_around(true);
6895 ceph_assert(r == 0);
6896 bluefs->dump_block_extents(out);
6897 out << "Expanding DB/WAL..." << std::endl;
6898 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
6899 if (devid == bluefs_layout.shared_bdev ) {
6900 continue;
6901 }
6902 uint64_t size = bluefs->get_block_device_size(devid);
6903 if (size == 0) {
6904 // no bdev
6905 continue;
6906 }
6907
6908 out << devid
6909 <<" : expanding " << " to 0x" << size << std::dec << std::endl;
6910 string p = get_device_path(devid);
6911 // get_device_path() returns an empty string for an unknown id
6912 if (p.empty()) {
6913 derr << devid
6914 << ": can't find device path" << dendl;
6915 continue;
6916 }
6917 if (bluefs->bdev_support_label(devid)) {
6918 if (_set_bdev_label_size(p, size) >= 0) {
6919 out << devid
6920 << " : size label updated to " << size
6921 << std::endl;
6922 }
6923 }
6924 }
6925 uint64_t size0 = fm->get_size();
6926 uint64_t size = bdev->get_size();
6927 if (size0 < size) {
6928 out << bluefs_layout.shared_bdev
6929 << " : expanding " << " from 0x" << std::hex
6930 << size0 << " to 0x" << size << std::dec << std::endl;
6931 _write_out_fm_meta(size);
6932 if (bdev->supported_bdev_label()) {
6933 if (_set_bdev_label_size(path, size) >= 0) {
6934 out << bluefs_layout.shared_bdev
6935 << " : size label updated to " << size
6936 << std::endl;
6937 }
6938 }
6939 _close_db_and_around(true);
6940
6941 // mount in read/write to sync expansion changes
6942 r = _mount();
6943 ceph_assert(r == 0);
6944 umount();
6945 } else {
6946 _close_db_and_around(true);
6947 }
6948 return r;
6949 }
6950
6951 int BlueStore::dump_bluefs_sizes(ostream& out)
6952 {
6953 int r = _open_db_and_around(true);
6954 ceph_assert(r == 0);
6955 bluefs->dump_block_extents(out);
6956 _close_db_and_around(true);
6957 return r;
6958 }
6959
6960 void BlueStore::set_cache_shards(unsigned num)
6961 {
6962 dout(10) << __func__ << " " << num << dendl;
6963 size_t oold = onode_cache_shards.size();
6964 size_t bold = buffer_cache_shards.size();
6965 ceph_assert(num >= oold && num >= bold);
6966 onode_cache_shards.resize(num);
6967 buffer_cache_shards.resize(num);
6968 for (unsigned i = oold; i < num; ++i) {
6969 onode_cache_shards[i] =
6970 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
6971 logger);
6972 }
6973 for (unsigned i = bold; i < num; ++i) {
6974 buffer_cache_shards[i] =
6975 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
6976 logger);
6977 }
6978 }
6979
6980 int BlueStore::_mount()
6981 {
6982 dout(1) << __func__ << " path " << path << dendl;
6983
6984 _kv_only = false;
6985 if (cct->_conf->bluestore_fsck_on_mount) {
6986 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
6987 if (rc < 0)
6988 return rc;
6989 if (rc > 0) {
6990 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6991 return -EIO;
6992 }
6993 }
6994
6995 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6996 derr << __func__ << " osd_max_object_size "
6997 << cct->_conf->osd_max_object_size << " > bluestore max "
6998 << OBJECT_MAX_SIZE << dendl;
6999 return -EINVAL;
7000 }
7001
7002 int r = _open_db_and_around(false);
7003 if (r < 0) {
7004 return r;
7005 }
7006
7007 r = _upgrade_super();
7008 if (r < 0) {
7009 goto out_db;
7010 }
7011
7012 r = _open_collections();
7013 if (r < 0)
7014 goto out_db;
7015
7016 r = _reload_logger();
7017 if (r < 0)
7018 goto out_coll;
7019
7020 _kv_start();
7021
7022 if (bdev->is_smr()) {
7023 _zoned_cleaner_start();
7024 }
7025
7026 r = _deferred_replay();
7027 if (r < 0)
7028 goto out_stop;
7029
7030 mempool_thread.init();
7031
7032 if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
7033 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
7034
7035 auto was_per_pool_omap = per_pool_omap;
7036
7037 dout(1) << __func__ << " quick-fix on mount" << dendl;
7038 _fsck_on_open(FSCK_SHALLOW, true);
7039
7040 //reread statfs
7041 //FIXME minor: replace with actual open/close?
7042 _open_statfs();
7043 _check_legacy_statfs_alert();
7044
7045 //set again as hopefully it has been fixed
7046 if (was_per_pool_omap != OMAP_PER_PG) {
7047 _set_per_pool_omap();
7048 }
7049 }
7050
7051 mounted = true;
7052 return 0;
7053
7054 out_stop:
7055 if (bdev->is_smr()) {
7056 _zoned_cleaner_stop();
7057 }
7058 _kv_stop();
7059 out_coll:
7060 _shutdown_cache();
7061 out_db:
7062 _close_db_and_around(false);
7063 return r;
7064 }
7065
7066 int BlueStore::umount()
7067 {
7068 ceph_assert(_kv_only || mounted);
7069 dout(1) << __func__ << dendl;
7070
7071 _osr_drain_all();
7072
7073 mounted = false;
7074 if (!_kv_only) {
7075 mempool_thread.shutdown();
7076 if (bdev->is_smr()) {
7077 dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
7078 _zoned_cleaner_stop();
7079 }
7080 dout(20) << __func__ << " stopping kv thread" << dendl;
7081 _kv_stop();
7082 _shutdown_cache();
7083 dout(20) << __func__ << " closing" << dendl;
7084
7085 }
7086 _close_db_and_around(false);
7087
7088 if (cct->_conf->bluestore_fsck_on_umount) {
7089 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7090 if (rc < 0)
7091 return rc;
7092 if (rc > 0) {
7093 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7094 return -EIO;
7095 }
7096 }
7097 return 0;
7098 }
7099
7100 int BlueStore::cold_open()
7101 {
7102 return _open_db_and_around(true);
7103 }
7104
7105 int BlueStore::cold_close()
7106 {
7107 _close_db_and_around(true);
7108 return 0;
7109 }
7110
7111 // derr wrapper that limits enormous output to avoid log flooding.
7112 // For now it is only used in places where such output is expected.
7113 #define fsck_derr(err_cnt, threshold) \
7114 if (err_cnt <= threshold) { \
7115 bool need_skip_print = err_cnt == threshold; \
7116 derr
7117
7118 #define fsck_dendl \
7119 dendl; \
7120 if (need_skip_print) \
7121 derr << "more error lines skipped..." << dendl; \
7122 }
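// Typical (hypothetical) use:
//   fsck_derr(err_count, MAX_FSCK_ERROR_LINES)
//     << "fsck error: ..." << fsck_dendl;
// fsck_derr opens the guarded derr statement and fsck_dendl closes it,
// emitting a one-time "more error lines skipped..." note at the threshold.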
7123
7124 int _fsck_sum_extents(
7125 const PExtentVector& extents,
7126 bool compressed,
7127 store_statfs_t& expected_statfs)
7128 {
7129 for (auto e : extents) {
7130 if (!e.is_valid())
7131 continue;
7132 expected_statfs.allocated += e.length;
7133 if (compressed) {
7134 expected_statfs.data_compressed_allocated += e.length;
7135 }
7136 }
7137 return 0;
7138 }
7139
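// Unlike _fsck_sum_extents() above, this also (for non-shallow fsck) marks
// every extent in the used_blocks bitset (one bit per `granularity` bytes),
// so that doubly-referenced ranges are reported as misreferenced.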
7140 int BlueStore::_fsck_check_extents(
7141 const coll_t& cid,
7142 const ghobject_t& oid,
7143 const PExtentVector& extents,
7144 bool compressed,
7145 mempool_dynamic_bitset &used_blocks,
7146 uint64_t granularity,
7147 BlueStoreRepairer* repairer,
7148 store_statfs_t& expected_statfs,
7149 FSCKDepth depth)
7150 {
7151 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
7152 int errors = 0;
7153 for (auto e : extents) {
7154 if (!e.is_valid())
7155 continue;
7156 expected_statfs.allocated += e.length;
7157 if (compressed) {
7158 expected_statfs.data_compressed_allocated += e.length;
7159 }
7160 if (depth != FSCK_SHALLOW) {
7161 bool already = false;
7162 apply_for_bitset_range(
7163 e.offset, e.length, granularity, used_blocks,
7164 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
7165 if (bs.test(pos)) {
7166 if (repairer) {
7167 repairer->note_misreference(
7168 pos * min_alloc_size, min_alloc_size, !already);
7169 }
7170 if (!already) {
7171 derr << "fsck error: " << oid << " extent " << e
7172 << " or a subset is already allocated (misreferenced)" << dendl;
7173 ++errors;
7174 already = true;
7175 }
7176 }
7177 else
7178 bs.set(pos);
7179 });
7180 if (repairer) {
7181 repairer->set_space_used(e.offset, e.length, cid, oid);
7182 }
7183
7184 if (e.end() > bdev->get_size()) {
7185 derr << "fsck error: " << oid << " extent " << e
7186 << " past end of block device" << dendl;
7187 ++errors;
7188 }
7189 }
7190 }
7191 return errors;
7192 }
7193
7194 void BlueStore::_fsck_check_pool_statfs(
7195 BlueStore::per_pool_statfs& expected_pool_statfs,
7196 int64_t& errors,
7197 int64_t& warnings,
7198 BlueStoreRepairer* repairer)
7199 {
7200 auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
7201 if (it) {
7202 for (it->lower_bound(string()); it->valid(); it->next()) {
7203 string key = it->key();
7204 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7205 if (repairer) {
7206 ++errors;
7207 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7208 derr << "fsck error: " << "legacy statfs record found, removing"
7209 << dendl;
7210 }
7211 continue;
7212 }
7213 uint64_t pool_id;
7214 if (get_key_pool_stat(key, &pool_id) < 0) {
7215 derr << "fsck error: bad key " << key
7216 << "in statfs namespece" << dendl;
7217 if (repairer) {
7218 repairer->remove_key(db, PREFIX_STAT, key);
7219 }
7220 ++errors;
7221 continue;
7222 }
7223
7224 volatile_statfs vstatfs;
7225 bufferlist bl = it->value();
7226 auto blp = bl.cbegin();
7227 try {
7228 vstatfs.decode(blp);
7229 } catch (ceph::buffer::error& e) {
7230 derr << "fsck error: failed to decode Pool StatFS record"
7231 << pretty_binary_string(key) << dendl;
7232 if (repairer) {
7233 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7234 << pretty_binary_string(key)
7235 << "', removing" << dendl;
7236 repairer->remove_key(db, PREFIX_STAT, key);
7237 }
7238 ++errors;
7239 vstatfs.reset();
7240 }
7241 auto stat_it = expected_pool_statfs.find(pool_id);
7242 if (stat_it == expected_pool_statfs.end()) {
7243 if (vstatfs.is_empty()) {
7244 // we don't consider that as an error since empty pool statfs
7245 // are left in DB for now
7246 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
7247 << std::hex << pool_id << std::dec << dendl;
7248 if (repairer) {
7249 // but we need to increment error count in case of repair
7250 // to have proper counters at the end
7251 // (as repairer increments recovery counter anyway).
7252 ++errors;
7253 }
7254 } else {
7255 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7256 << std::hex << pool_id << std::dec << dendl;
7257 ++errors;
7258 }
7259 if (repairer) {
7260 repairer->remove_key(db, PREFIX_STAT, key);
7261 }
7262 continue;
7263 }
7264 store_statfs_t statfs;
7265 vstatfs.publish(&statfs);
7266 if (!(stat_it->second == statfs)) {
7267 derr << "fsck error: actual " << statfs
7268 << " != expected " << stat_it->second
7269 << " for pool "
7270 << std::hex << pool_id << std::dec << dendl;
7271 if (repairer) {
7272 repairer->fix_statfs(db, key, stat_it->second);
7273 }
7274 ++errors;
7275 }
7276 expected_pool_statfs.erase(stat_it);
7277 }
7278 } // if (it)
7279 for (auto& s : expected_pool_statfs) {
7280 if (s.second.is_zero()) {
7281 // we might lack empty statfs recs in DB
7282 continue;
7283 }
7284 derr << "fsck error: missing Pool StatFS record for pool "
7285 << std::hex << s.first << std::dec << dendl;
7286 if (repairer) {
7287 string key;
7288 get_pool_stat_key(s.first, &key);
7289 repairer->fix_statfs(db, key, s.second);
7290 }
7291 ++errors;
7292 }
7293 if (!per_pool_stat_collection &&
7294 repairer) {
7295 // by virtue of running this method, we correct the top-level
7296 // error of having global stats
7297 repairer->inc_repaired();
7298 }
7299 }
7300
7301 BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
7302 BlueStore::FSCKDepth depth,
7303 int64_t pool_id,
7304 BlueStore::CollectionRef c,
7305 const ghobject_t& oid,
7306 const string& key,
7307 const bufferlist& value,
7308 mempool::bluestore_fsck::list<string>* expecting_shards,
7309 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
7310 const BlueStore::FSCK_ObjectCtx& ctx)
7311 {
7312 auto& errors = ctx.errors;
7313 auto& num_objects = ctx.num_objects;
7314 auto& num_extents = ctx.num_extents;
7315 auto& num_blobs = ctx.num_blobs;
7316 auto& num_sharded_objects = ctx.num_sharded_objects;
7317 auto& num_spanning_blobs = ctx.num_spanning_blobs;
7318 auto used_blocks = ctx.used_blocks;
7319 auto sb_info_lock = ctx.sb_info_lock;
7320 auto& sb_info = ctx.sb_info;
7321 auto repairer = ctx.repairer;
7322
7323 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
7324 &ctx.expected_pool_statfs[pool_id] :
7325 &ctx.expected_store_statfs;
7326
7327 dout(10) << __func__ << " " << oid << dendl;
7328 OnodeRef o;
7329 o.reset(Onode::decode(c, oid, key, value));
7330 ++num_objects;
7331
7332 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7333
7334 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7335 _dump_onode<30>(cct, *o);
7336 // shards
7337 if (!o->extent_map.shards.empty()) {
7338 ++num_sharded_objects;
7339 if (depth != FSCK_SHALLOW) {
7340 ceph_assert(expecting_shards);
7341 for (auto& s : o->extent_map.shards) {
7342 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
7343 expecting_shards->push_back(string());
7344 get_extent_shard_key(o->key, s.shard_info->offset,
7345 &expecting_shards->back());
7346 if (s.shard_info->offset >= o->onode.size) {
7347 derr << "fsck error: " << oid << " shard 0x" << std::hex
7348 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7349 << std::dec << dendl;
7350 ++errors;
7351 }
7352 }
7353 }
7354 }
7355
7356 // lextents
7357 uint64_t pos = 0;
7358 mempool::bluestore_fsck::map<BlobRef,
7359 bluestore_blob_use_tracker_t> ref_map;
7360 for (auto& l : o->extent_map.extent_map) {
7361 dout(20) << __func__ << " " << l << dendl;
7362 if (l.logical_offset < pos) {
7363 derr << "fsck error: " << oid << " lextent at 0x"
7364 << std::hex << l.logical_offset
7365 << " overlaps with the previous, which ends at 0x" << pos
7366 << std::dec << dendl;
7367 ++errors;
7368 }
7369 if (depth != FSCK_SHALLOW &&
7370 o->extent_map.spans_shard(l.logical_offset, l.length)) {
7371 derr << "fsck error: " << oid << " lextent at 0x"
7372 << std::hex << l.logical_offset << "~" << l.length
7373 << " spans a shard boundary"
7374 << std::dec << dendl;
7375 ++errors;
7376 }
7377 pos = l.logical_offset + l.length;
7378 res_statfs->data_stored += l.length;
7379 ceph_assert(l.blob);
7380 const bluestore_blob_t& blob = l.blob->get_blob();
7381
7382 auto& ref = ref_map[l.blob];
7383 if (ref.is_empty()) {
7384 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7385 uint32_t l = blob.get_logical_length();
7386 ref.init(l, min_release_size);
7387 }
7388 ref.get(
7389 l.blob_offset,
7390 l.length);
7391 ++num_extents;
7392 if (depth != FSCK_SHALLOW &&
7393 blob.has_unused()) {
7394 ceph_assert(referenced);
7395 auto p = referenced->find(l.blob);
7396 bluestore_blob_t::unused_t* pu;
7397 if (p == referenced->end()) {
7398 pu = &(*referenced)[l.blob];
7399 }
7400 else {
7401 pu = &p->second;
7402 }
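// The unused_t value is a bitmap with one bit per blob chunk
// (chunk_size = blob_len / number-of-bits-in-unused_t); mark every chunk
// this lextent touches as referenced so it can later be cross-checked
// against the blob's own "unused" markers.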
7403 uint64_t blob_len = blob.get_logical_length();
7404 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
7405 ceph_assert(l.blob_offset + l.length <= blob_len);
7406 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
7407 uint64_t start = l.blob_offset / chunk_size;
7408 uint64_t end =
7409 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7410 for (auto i = start; i < end; ++i) {
7411 (*pu) |= (1u << i);
7412 }
7413 }
7414 } //for (auto& l : o->extent_map.extent_map)
7415
7416 for (auto& i : ref_map) {
7417 ++num_blobs;
7418 const bluestore_blob_t& blob = i.first->get_blob();
7419 bool equal =
7420 depth == FSCK_SHALLOW ? true :
7421 i.first->get_blob_use_tracker().equal(i.second);
7422 if (!equal) {
7423 derr << "fsck error: " << oid << " blob " << *i.first
7424 << " doesn't match expected ref_map " << i.second << dendl;
7425 ++errors;
7426 }
7427 if (blob.is_compressed()) {
7428 res_statfs->data_compressed += blob.get_compressed_payload_length();
7429 res_statfs->data_compressed_original +=
7430 i.first->get_referenced_bytes();
7431 }
7432 if (blob.is_shared()) {
7433 if (i.first->shared_blob->get_sbid() > blobid_max) {
7434 derr << "fsck error: " << oid << " blob " << blob
7435 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7436 << blobid_max << dendl;
7437 ++errors;
7438 }
7439 else if (i.first->shared_blob->get_sbid() == 0) {
7440 derr << "fsck error: " << oid << " blob " << blob
7441 << " marked as shared but has uninitialized sbid"
7442 << dendl;
7443 ++errors;
7444 }
7445 // the below lock is optional and provided in multithreading mode only
7446 if (sb_info_lock) {
7447 sb_info_lock->lock();
7448 }
7449 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
7450 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7451 ceph_assert(sbi.pool_id == INT64_MIN ||
7452 sbi.pool_id == oid.hobj.get_logical_pool());
7453 sbi.cid = c->cid;
7454 sbi.pool_id = oid.hobj.get_logical_pool();
7455 sbi.sb = i.first->shared_blob;
7456 sbi.oids.push_back(oid);
7457 sbi.compressed = blob.is_compressed();
7458 for (auto e : blob.get_extents()) {
7459 if (e.is_valid()) {
7460 sbi.ref_map.get(e.offset, e.length);
7461 }
7462 }
7463 if (sb_info_lock) {
7464 sb_info_lock->unlock();
7465 }
7466 } else if (depth != FSCK_SHALLOW) {
7467 ceph_assert(used_blocks);
7468 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7469 blob.is_compressed(),
7470 *used_blocks,
7471 fm->get_alloc_size(),
7472 repairer,
7473 *res_statfs,
7474 depth);
7475 } else {
7476 errors += _fsck_sum_extents(
7477 blob.get_extents(),
7478 blob.is_compressed(),
7479 *res_statfs);
7480 }
7481 } // for (auto& i : ref_map)
7482
7483 {
7484 auto &sbm = o->extent_map.spanning_blob_map;
7485 size_t broken = 0;
7486 BlobRef first_broken;
7487 for (auto it = sbm.begin(); it != sbm.end();) {
7488 auto it1 = it++;
7489 if (ref_map.count(it1->second) == 0) {
7490 if (!broken) {
7491 first_broken = it1->second;
7492 ++errors;
7493 }
7494 broken++;
7495 if (repairer) {
7496 sbm.erase(it1);
7497 }
7498 }
7499 }
7500 if (broken) {
7501 derr << "fsck error: " << oid << " - " << broken
7502 << " zombie spanning blob(s) found, the first one: "
7503 << *first_broken << dendl;
7504 if(repairer) {
7505 repairer->fix_spanning_blobs(
7506 db,
7507 [&](KeyValueDB::Transaction txn) {
7508 _record_onode(o, txn);
7509 });
7510 }
7511 }
7512 }
7513
7514 if (o->onode.has_omap()) {
7515 _fsck_check_object_omap(depth, o, ctx);
7516 }
7517
7518 return o;
7519 }
7520
7521 #include "common/WorkQueue.h"
7522
7523 class ShallowFSCKThreadPool : public ThreadPool
7524 {
7525 public:
7526 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
7527 ThreadPool(cct_, nm, tn, n) {
7528 }
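// Simplified worker loop for this fsck-only thread pool: it round-robins
// over the registered work queues, processing whatever _void_dequeue()
// returns, and polls without sleeping until _stop is set.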
7529 void worker(ThreadPool::WorkThread* wt) override {
7530 int next_wq = 0;
7531 while (!_stop) {
7532 next_wq %= work_queues.size();
7533 WorkQueue_ *wq = work_queues[next_wq++];
7534
7535 void* item = wq->_void_dequeue();
7536 if (item) {
7537 processing++;
7538 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
7539 wq->_void_process(item, tp_handle);
7540 processing--;
7541 }
7542 }
7543 }
7544 template <size_t BatchLen>
7545 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
7546 {
7547 struct Entry {
7548 int64_t pool_id;
7549 BlueStore::CollectionRef c;
7550 ghobject_t oid;
7551 string key;
7552 bufferlist value;
7553 };
7554 struct Batch {
7555 std::atomic<size_t> running = { 0 };
7556 size_t entry_count = 0;
7557 std::array<Entry, BatchLen> entries;
7558
7559 int64_t errors = 0;
7560 int64_t warnings = 0;
7561 uint64_t num_objects = 0;
7562 uint64_t num_extents = 0;
7563 uint64_t num_blobs = 0;
7564 uint64_t num_sharded_objects = 0;
7565 uint64_t num_spanning_blobs = 0;
7566 store_statfs_t expected_store_statfs;
7567 BlueStore::per_pool_statfs expected_pool_statfs;
7568 };
7569
7570 size_t batchCount;
7571 BlueStore* store = nullptr;
7572
7573 ceph::mutex* sb_info_lock = nullptr;
7574 BlueStore::sb_info_map_t* sb_info = nullptr;
7575 BlueStoreRepairer* repairer = nullptr;
7576
7577 Batch* batches = nullptr;
7578 size_t last_batch_pos = 0;
7579 bool batch_acquired = false;
7580
7581 FSCKWorkQueue(std::string n,
7582 size_t _batchCount,
7583 BlueStore* _store,
7584 ceph::mutex* _sb_info_lock,
7585 BlueStore::sb_info_map_t& _sb_info,
7586 BlueStoreRepairer* _repairer) :
7587 WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
7588 batchCount(_batchCount),
7589 store(_store),
7590 sb_info_lock(_sb_info_lock),
7591 sb_info(&_sb_info),
7592 repairer(_repairer)
7593 {
7594 batches = new Batch[batchCount];
7595 }
7596 ~FSCKWorkQueue() {
7597 delete[] batches;
7598 }
7599
7600 /// Remove all work items from the queue.
7601 void _clear() override {
7602 //do nothing
7603 }
7604 /// Check whether there is anything to do.
7605 bool _empty() override {
7606 ceph_assert(false);
7607 }
7608
7609 /// Get the next work item to process.
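/// Starts at a random batch and scans round-robin; batch.running acts as a
/// try-lock: fetch_add(1) returning 0 means the batch was idle, and it is
/// handed to the worker only if it already holds queued entries, otherwise
/// the claim is dropped again with running--.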
7610 void* _void_dequeue() override {
7611 size_t pos = rand() % batchCount;
7612 size_t pos0 = pos;
7613 do {
7614 auto& batch = batches[pos];
7615 if (batch.running.fetch_add(1) == 0) {
7616 if (batch.entry_count) {
7617 return &batch;
7618 }
7619 }
7620 batch.running--;
7621 pos++;
7622 pos %= batchCount;
7623 } while (pos != pos0);
7624 return nullptr;
7625 }
7626 /** @brief Process the work item.
7627 * This function will be called several times in parallel
7628 * and must therefore be thread-safe. */
7629 void _void_process(void* item, TPHandle& handle) override {
7630 Batch* batch = (Batch*)item;
7631
7632 BlueStore::FSCK_ObjectCtx ctx(
7633 batch->errors,
7634 batch->warnings,
7635 batch->num_objects,
7636 batch->num_extents,
7637 batch->num_blobs,
7638 batch->num_sharded_objects,
7639 batch->num_spanning_blobs,
7640 nullptr, // used_blocks
7641 nullptr, //used_omap_head
7642 sb_info_lock,
7643 *sb_info,
7644 batch->expected_store_statfs,
7645 batch->expected_pool_statfs,
7646 repairer);
7647
7648 for (size_t i = 0; i < batch->entry_count; i++) {
7649 auto& entry = batch->entries[i];
7650
7651 store->fsck_check_objects_shallow(
7652 BlueStore::FSCK_SHALLOW,
7653 entry.pool_id,
7654 entry.c,
7655 entry.oid,
7656 entry.key,
7657 entry.value,
7658 nullptr, // expecting_shards - this will need protection if passed
7659 nullptr, // referenced
7660 ctx);
7661 }
7662 //std::cout << "processed " << batch << std::endl;
7663 batch->entry_count = 0;
7664 batch->running--;
7665 }
7666 /** @brief Synchronously finish processing a work item.
7667 * This function is called after _void_process with the global thread pool lock held,
7668 * so at most one copy will execute simultaneously for a given thread pool.
7669 * It can be used for non-thread-safe finalization. */
7670 void _void_process_finish(void*) override {
7671 ceph_assert(false);
7672 }
7673
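// queue() is called from the single enumeration thread only: it keeps the
// currently acquired batch (batch_acquired/last_batch_pos) across calls,
// appends the entry, and releases the batch to the workers once it holds
// BatchLen entries. It returns false when every batch is busy or full, in
// which case the caller processes the object itself.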
7674 bool queue(
7675 int64_t pool_id,
7676 BlueStore::CollectionRef c,
7677 const ghobject_t& oid,
7678 const string& key,
7679 const bufferlist& value) {
7680 bool res = false;
7681 size_t pos0 = last_batch_pos;
7682 if (!batch_acquired) {
7683 do {
7684 auto& batch = batches[last_batch_pos];
7685 if (batch.running.fetch_add(1) == 0) {
7686 if (batch.entry_count < BatchLen) {
7687 batch_acquired = true;
7688 break;
7689 }
7690 }
7691 batch.running.fetch_sub(1);
7692 last_batch_pos++;
7693 last_batch_pos %= batchCount;
7694 } while (last_batch_pos != pos0);
7695 }
7696 if (batch_acquired) {
7697 auto& batch = batches[last_batch_pos];
7698 ceph_assert(batch.running);
7699 ceph_assert(batch.entry_count < BatchLen);
7700
7701 auto& entry = batch.entries[batch.entry_count];
7702 entry.pool_id = pool_id;
7703 entry.c = c;
7704 entry.oid = oid;
7705 entry.key = key;
7706 entry.value = value;
7707
7708 ++batch.entry_count;
7709 if (batch.entry_count == BatchLen) {
7710 batch_acquired = false;
7711 batch.running.fetch_sub(1);
7712 last_batch_pos++;
7713 last_batch_pos %= batchCount;
7714 }
7715 res = true;
7716 }
7717 return res;
7718 }
7719
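// finalize() releases any partially filled batch, stops the thread pool, then
// drains the leftovers in the calling thread before folding every batch's
// error/warning/statfs counters back into the shared ctx.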
7720 void finalize(ThreadPool& tp,
7721 BlueStore::FSCK_ObjectCtx& ctx) {
7722 if (batch_acquired) {
7723 auto& batch = batches[last_batch_pos];
7724 ceph_assert(batch.running);
7725 batch.running.fetch_sub(1);
7726 }
7727 tp.stop();
7728
7729 for (size_t i = 0; i < batchCount; i++) {
7730 auto& batch = batches[i];
7731
7732 //process leftovers if any
7733 if (batch.entry_count) {
7734 TPHandle tp_handle(store->cct,
7735 nullptr,
7736 timeout_interval,
7737 suicide_interval);
7738 ceph_assert(batch.running == 0);
7739
7740 batch.running++; // just to be on par with the regular call
7741 _void_process(&batch, tp_handle);
7742 }
7743 ceph_assert(batch.entry_count == 0);
7744
7745 ctx.errors += batch.errors;
7746 ctx.warnings += batch.warnings;
7747 ctx.num_objects += batch.num_objects;
7748 ctx.num_extents += batch.num_extents;
7749 ctx.num_blobs += batch.num_blobs;
7750 ctx.num_sharded_objects += batch.num_sharded_objects;
7751 ctx.num_spanning_blobs += batch.num_spanning_blobs;
7752
7753 ctx.expected_store_statfs.add(batch.expected_store_statfs);
7754
7755 for (auto it = batch.expected_pool_statfs.begin();
7756 it != batch.expected_pool_statfs.end();
7757 it++) {
7758 ctx.expected_pool_statfs[it->first].add(it->second);
7759 }
7760 }
7761 }
7762 };
7763 };
7764
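// Verify that an onode's omap keying scheme matches the store-wide
// per-pool/per-pg setting; report a mismatch as an error or warning
// (depending on the bluestore_fsck_error_on_no_per_pool_omap /
// bluestore_fsck_error_on_no_per_pg_omap options) and, when a repairer is
// provided, convert the object's omap to the per-pg layout in place.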
7765 void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
7766 OnodeRef& o,
7767 const BlueStore::FSCK_ObjectCtx& ctx)
7768 {
7769 auto& errors = ctx.errors;
7770 auto& warnings = ctx.warnings;
7771 auto repairer = ctx.repairer;
7772
7773 ceph_assert(o->onode.has_omap());
7774 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
7775 if (per_pool_omap == OMAP_PER_POOL) {
7776 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
7777 << "fsck error: " << o->oid
7778 << " has omap that is not per-pool or pgmeta"
7779 << fsck_dendl;
7780 ++errors;
7781 } else {
7782 const char* w;
7783 int64_t num;
7784 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
7785 ++errors;
7786 num = errors;
7787 w = "error";
7788 } else {
7789 ++warnings;
7790 num = warnings;
7791 w = "warning";
7792 }
7793 fsck_derr(num, MAX_FSCK_ERROR_LINES)
7794 << "fsck " << w << ": " << o->oid
7795 << " has omap that is not per-pool or pgmeta"
7796 << fsck_dendl;
7797 }
7798 } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
7799 if (per_pool_omap == OMAP_PER_PG) {
7800 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
7801 << "fsck error: " << o->oid
7802 << " has omap that is not per-pg or pgmeta"
7803 << fsck_dendl;
7804 ++errors;
7805 } else {
7806 const char* w;
7807 int64_t num;
7808 if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
7809 ++errors;
7810 num = errors;
7811 w = "error";
7812 } else {
7813 ++warnings;
7814 num = warnings;
7815 w = "warning";
7816 }
7817 fsck_derr(num, MAX_FSCK_ERROR_LINES)
7818 << "fsck " << w << ": " << o->oid
7819 << " has omap that is not per-pg or pgmeta"
7820 << fsck_dendl;
7821 }
7822 }
7823 if (repairer &&
7824 !o->onode.is_perpg_omap() &&
7825 !o->onode.is_pgmeta_omap()) {
7826 dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
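// Conversion is two-phased: first copy the header, tail and all values under
// the new per-pg prefix (flushing the transaction roughly every 16MB of
// key+value data), then in a second transaction drop the legacy keys and
// persist the updated onode flags.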
7827 bufferlist header;
7828 map<string, bufferlist> kv;
7829 {
7830 KeyValueDB::Transaction txn = db->get_transaction();
7831 uint64_t txn_cost = 0;
7832 const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
7833 uint8_t new_flags = o->onode.flags |
7834 bluestore_onode_t::FLAG_PERPOOL_OMAP |
7835 bluestore_onode_t::FLAG_PERPG_OMAP;
7836 const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
7837
7838 KeyValueDB::Iterator it = db->get_iterator(prefix);
7839 string head, tail;
7840 o->get_omap_header(&head);
7841 o->get_omap_tail(&tail);
7842 it->lower_bound(head);
7843 // head
7844 if (it->valid() && it->key() == head) {
7845 dout(30) << __func__ << " got header" << dendl;
7846 header = it->value();
7847 if (header.length()) {
7848 string new_head;
7849 Onode::calc_omap_header(new_flags, o.get(), &new_head);
7850 txn->set(new_omap_prefix, new_head, header);
7851 txn_cost += new_head.length() + header.length();
7852 }
7853 }
7854 // tail
7855 {
7856 string new_tail;
7857 Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
7858 bufferlist empty;
7859 txn->set(new_omap_prefix, new_tail, empty);
7860 txn_cost += new_tail.length() + empty.length();
7861 }
7862 // values
7863 string final_key;
7864 Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
7865 size_t base_key_len = final_key.size();
7866 while (it->valid() && it->key() < tail) {
7867 string user_key;
7868 o->decode_omap_key(it->key(), &user_key);
7869 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
7870 << " -> " << user_key << dendl;
7871
7872 final_key.resize(base_key_len);
7873 final_key += user_key;
7874 auto v = it->value();
7875 txn->set(new_omap_prefix, final_key, v);
7876 txn_cost += final_key.length() + v.length();
7877
7878 // submit a portion if cost exceeds 16MB
7879 if (txn_cost >= 16 * (1 << 20) ) {
7880 db->submit_transaction_sync(txn);
7881 txn = db->get_transaction();
7882 txn_cost = 0;
7883 }
7884 it->next();
7885 }
7886 if (txn_cost > 0) {
7887 db->submit_transaction_sync(txn);
7888 }
7889 }
7890 // finalize: remove legacy data
7891 {
7892 KeyValueDB::Transaction txn = db->get_transaction();
7893 // remove old keys
7894 const string& old_omap_prefix = o->get_omap_prefix();
7895 string old_head, old_tail;
7896 o->get_omap_header(&old_head);
7897 o->get_omap_tail(&old_tail);
7898 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
7899 txn->rmkey(old_omap_prefix, old_tail);
7900 // set flag
7901 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
7902 _record_onode(o, txn);
7903 db->submit_transaction_sync(txn);
7904 repairer->inc_repaired();
7905 repairer->request_compaction();
7906 }
7907 }
7908 }
7909
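// Walk the whole PREFIX_OBJ keyspace: cross-check extent shard keys against
// the shards each onode expects, resolve every object to its owning
// collection, and run the per-object checks. In shallow mode with
// bluestore_fsck_quick_fix_threads > 0 the per-object work is offloaded to
// the ShallowFSCKThreadPool above; otherwise it runs inline.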
7910 void BlueStore::_fsck_check_objects(FSCKDepth depth,
7911 BlueStore::FSCK_ObjectCtx& ctx)
7912 {
7913 auto& errors = ctx.errors;
7914 auto sb_info_lock = ctx.sb_info_lock;
7915 auto& sb_info = ctx.sb_info;
7916 auto repairer = ctx.repairer;
7917
7918 uint64_t_btree_t used_nids;
7919
7920 size_t processed_myself = 0;
7921
7922 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
7923 mempool::bluestore_fsck::list<string> expecting_shards;
7924 if (it) {
7925 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
7926 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
7927 std::unique_ptr<WQ> wq(
7928 new WQ(
7929 "FSCKWorkQueue",
7930 (thread_count ? : 1) * 32,
7931 this,
7932 sb_info_lock,
7933 sb_info,
7934 repairer));
7935
7936 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
7937
7938 thread_pool.add_work_queue(wq.get());
7939 if (depth == FSCK_SHALLOW && thread_count > 0) {
7940 //not the best place but let's check anyway
7941 ceph_assert(sb_info_lock);
7942 thread_pool.start();
7943 }
7944
7945 //fill global if not overridden below
7946 CollectionRef c;
7947 int64_t pool_id = -1;
7948 spg_t pgid;
7949 for (it->lower_bound(string()); it->valid(); it->next()) {
7950 dout(30) << __func__ << " key "
7951 << pretty_binary_string(it->key()) << dendl;
7952 if (is_extent_shard_key(it->key())) {
7953 if (depth == FSCK_SHALLOW) {
7954 continue;
7955 }
7956 while (!expecting_shards.empty() &&
7957 expecting_shards.front() < it->key()) {
7958 derr << "fsck error: missing shard key "
7959 << pretty_binary_string(expecting_shards.front())
7960 << dendl;
7961 ++errors;
7962 expecting_shards.pop_front();
7963 }
7964 if (!expecting_shards.empty() &&
7965 expecting_shards.front() == it->key()) {
7966 // all good
7967 expecting_shards.pop_front();
7968 continue;
7969 }
7970
7971 uint32_t offset;
7972 string okey;
7973 get_key_extent_shard(it->key(), &okey, &offset);
7974 derr << "fsck error: stray shard 0x" << std::hex << offset
7975 << std::dec << dendl;
7976 if (expecting_shards.empty()) {
7977 derr << "fsck error: " << pretty_binary_string(it->key())
7978 << " is unexpected" << dendl;
7979 ++errors;
7980 continue;
7981 }
7982 while (expecting_shards.front() > it->key()) {
7983 derr << "fsck error: saw " << pretty_binary_string(it->key())
7984 << dendl;
7985 derr << "fsck error: exp "
7986 << pretty_binary_string(expecting_shards.front()) << dendl;
7987 ++errors;
7988 expecting_shards.pop_front();
7989 if (expecting_shards.empty()) {
7990 break;
7991 }
7992 }
7993 continue;
7994 }
7995
7996 ghobject_t oid;
7997 int r = get_key_object(it->key(), &oid);
7998 if (r < 0) {
7999 derr << "fsck error: bad object key "
8000 << pretty_binary_string(it->key()) << dendl;
8001 ++errors;
8002 continue;
8003 }
8004 if (!c ||
8005 oid.shard_id != pgid.shard ||
8006 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8007 !c->contains(oid)) {
8008 c = nullptr;
8009 for (auto& p : coll_map) {
8010 if (p.second->contains(oid)) {
8011 c = p.second;
8012 break;
8013 }
8014 }
8015 if (!c) {
8016 derr << "fsck error: stray object " << oid
8017 << " not owned by any collection" << dendl;
8018 ++errors;
8019 continue;
8020 }
8021 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8022 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8023 << dendl;
8024 }
8025
8026 if (depth != FSCK_SHALLOW &&
8027 !expecting_shards.empty()) {
8028 for (auto& k : expecting_shards) {
8029 derr << "fsck error: missing shard key "
8030 << pretty_binary_string(k) << dendl;
8031 }
8032 ++errors;
8033 expecting_shards.clear();
8034 }
8035
8036 bool queued = false;
8037 if (depth == FSCK_SHALLOW && thread_count > 0) {
8038 queued = wq->queue(
8039 pool_id,
8040 c,
8041 oid,
8042 it->key(),
8043 it->value());
8044 }
8045 OnodeRef o;
8046 map<BlobRef, bluestore_blob_t::unused_t> referenced;
8047
8048 if (!queued) {
8049 ++processed_myself;
8050
8051 o = fsck_check_objects_shallow(
8052 depth,
8053 pool_id,
8054 c,
8055 oid,
8056 it->key(),
8057 it->value(),
8058 &expecting_shards,
8059 &referenced,
8060 ctx);
8061 }
8062
8063 if (depth != FSCK_SHALLOW) {
8064 ceph_assert(o != nullptr);
8065 if (o->onode.nid) {
8066 if (o->onode.nid > nid_max) {
8067 derr << "fsck error: " << oid << " nid " << o->onode.nid
8068 << " > nid_max " << nid_max << dendl;
8069 ++errors;
8070 }
8071 if (used_nids.count(o->onode.nid)) {
8072 derr << "fsck error: " << oid << " nid " << o->onode.nid
8073 << " already in use" << dendl;
8074 ++errors;
8075 continue; // go for next object
8076 }
8077 used_nids.insert(o->onode.nid);
8078 }
8079 for (auto& i : referenced) {
8080 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8081 << std::dec << " for " << *i.first << dendl;
8082 const bluestore_blob_t& blob = i.first->get_blob();
8083 if (i.second & blob.unused) {
8084 derr << "fsck error: " << oid << " blob claims unused 0x"
8085 << std::hex << blob.unused
8086 << " but extents reference 0x" << i.second << std::dec
8087 << " on blob " << *i.first << dendl;
8088 ++errors;
8089 }
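// Cross-check the 'unused' bitmap against checksums: for every csum chunk
// whose covering unused bits are all set, the stored checksum is expected to
// be zero (the chunk was never written), so a non-zero csum is an error.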
8090 if (blob.has_csum()) {
8091 uint64_t blob_len = blob.get_logical_length();
8092 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8093 unsigned csum_count = blob.get_csum_count();
8094 unsigned csum_chunk_size = blob.get_csum_chunk_size();
8095 for (unsigned p = 0; p < csum_count; ++p) {
8096 unsigned pos = p * csum_chunk_size;
8097 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8098 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8099 unsigned mask = 1u << firstbit;
8100 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8101 mask |= 1u << b;
8102 }
8103 if ((blob.unused & mask) == mask) {
8104 // this csum chunk region is marked unused
8105 if (blob.get_csum_item(p) != 0) {
8106 derr << "fsck error: " << oid
8107 << " blob claims csum chunk 0x" << std::hex << pos
8108 << "~" << csum_chunk_size
8109 << " is unused (mask 0x" << mask << " of unused 0x"
8110 << blob.unused << ") but csum is non-zero 0x"
8111 << blob.get_csum_item(p) << std::dec << " on blob "
8112 << *i.first << dendl;
8113 ++errors;
8114 }
8115 }
8116 }
8117 }
8118 }
8119 // omap
8120 if (o->onode.has_omap()) {
8121 ceph_assert(ctx.used_omap_head);
8122 if (ctx.used_omap_head->count(o->onode.nid)) {
8123 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8124 << " already in use" << dendl;
8125 ++errors;
8126 } else {
8127 ctx.used_omap_head->insert(o->onode.nid);
8128 }
8129 } // if (o->onode.has_omap())
8130 if (depth == FSCK_DEEP) {
8131 bufferlist bl;
8132 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8133 uint64_t offset = 0;
8134 do {
8135 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8136 int r = _do_read(c.get(), o, offset, l, bl,
8137 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8138 if (r < 0) {
8139 ++errors;
8140 derr << "fsck error: " << oid << std::hex
8141 << " error during read: "
8142 << " " << offset << "~" << l
8143 << " " << cpp_strerror(r) << std::dec
8144 << dendl;
8145 break;
8146 }
8147 offset += l;
8148 } while (offset < o->onode.size);
8149 } // deep
8150 } //if (depth != FSCK_SHALLOW)
8151 } // for (it->lower_bound(string()); it->valid(); it->next())
8152 if (depth == FSCK_SHALLOW && thread_count > 0) {
8153 wq->finalize(thread_pool, ctx);
8154 if (processed_myself) {
8155 // maybe we need more threads?
8156 dout(0) << __func__ << " partial offload"
8157 << ", done myself " << processed_myself
8158 << " of " << ctx.num_objects
8159 << " objects, threads " << thread_count
8160 << dendl;
8161 }
8162 }
8163 } // if (it)
8164 }
8165 /**
8166 An overview of the currently implemented repair logic,
8167 performed by fsck in two stages: detection (+ preparation) and commit.
8168 Detection stage (in processing order):
8169 (Issue -> Repair action to schedule)
8170 - Detect undecodable keys for Shared Blobs -> Remove
8171 - Detect undecodable records for Shared Blobs -> Remove
8172 (might trigger missed Shared Blob detection below)
8173 - Detect stray records for Shared Blobs -> Remove
8174 - Detect misreferenced pextents -> Fix
8175 Prepare Bloom-like filter to track cid/oid -> pextent
8176 Prepare list of extents that are improperly referenced
8177 Enumerate Onode records that might use 'misreferenced' pextents
8178 (Bloom-like filter applied to reduce computation)
8179 For each questionable Onode enumerate all blobs and identify broken ones
8180 (i.e. blobs having 'misreferences')
8181 Rewrite each broken blob data by allocating another extents and
8182 copying data there
8183 If blob is shared - unshare it and mark corresponding Shared Blob
8184 for removal
8185 Release previously allocated space
8186 Update Extent Map
8187 - Detect missed Shared Blobs -> Recreate
8188 - Detect undecodable deferred transaction -> Remove
8189 - Detect Freelist Manager's 'false free' entries -> Mark as used
8190 - Detect Freelist Manager's leaked entries -> Mark as free
8191 - Detect statfs inconsistency - Update
8192 Commit stage (separate DB commit per each step):
8193 - Apply leaked FM entries fix
8194 - Apply 'false free' FM entries fix
8195 - Apply 'Remove' actions
8196 - Apply fix for misreference pextents
8197 - Apply Shared Blob recreate
8198 (can be merged with the step above if misreferences were detected)
8199 - Apply StatFS update
8200 */
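// These repair paths are exercised via _fsck()/_fsck_on_open() below; they
// are presumably reached externally through BlueStore::fsck()/repair()/
// quick_fix() (e.g. from ceph-bluestore-tool's fsck, repair and quick-fix
// commands), though the exact callers live outside this section.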
8201 int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
8202 {
8203 dout(1) << __func__
8204 << (repair ? " repair" : " check")
8205 << (depth == FSCK_DEEP ? " (deep)" :
8206 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8207 << dendl;
8208
8209 // in deep mode we need R/W write access to be able to replay deferred ops
8210 bool read_only = !(repair || depth == FSCK_DEEP);
8211
8212 int r = _open_db_and_around(read_only);
8213 if (r < 0)
8214 return r;
8215
8216 if (!read_only) {
8217 r = _upgrade_super();
8218 if (r < 0) {
8219 goto out_db;
8220 }
8221 }
8222
8223 r = _open_collections();
8224 if (r < 0)
8225 goto out_db;
8226
8227 mempool_thread.init();
8228
8229 // we need finisher and kv_{sync,finalize}_thread *just* for replay
8230 // enable in repair or deep modes only
8231 if (!read_only) {
8232 _kv_start();
8233 r = _deferred_replay();
8234 _kv_stop();
8235 }
8236 if (r < 0)
8237 goto out_scan;
8238
8239 r = _fsck_on_open(depth, repair);
8240
8241 out_scan:
8242 mempool_thread.shutdown();
8243 _shutdown_cache();
8244 out_db:
8245 _close_db_and_around(false);
8246
8247 return r;
8248 }
8249
8250 int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
8251 {
8252 dout(1) << __func__
8253 << " <<<START>>>"
8254 << (repair ? " repair" : " check")
8255 << (depth == FSCK_DEEP ? " (deep)" :
8256 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8257 << " start" << dendl;
8258 int64_t errors = 0;
8259 int64_t warnings = 0;
8260 unsigned repaired = 0;
8261
8262 uint64_t_btree_t used_omap_head;
8263 uint64_t_btree_t used_sbids;
8264
8265 mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
8266 KeyValueDB::Iterator it;
8267 store_statfs_t expected_store_statfs, actual_statfs;
8268 per_pool_statfs expected_pool_statfs;
8269
8270 sb_info_map_t sb_info;
8271
8272 uint64_t num_objects = 0;
8273 uint64_t num_extents = 0;
8274 uint64_t num_blobs = 0;
8275 uint64_t num_spanning_blobs = 0;
8276 uint64_t num_shared_blobs = 0;
8277 uint64_t num_sharded_objects = 0;
8278 BlueStoreRepairer repairer;
8279
8280 auto alloc_size = fm->get_alloc_size();
8281
8282 utime_t start = ceph_clock_now();
8283
8284 _fsck_collections(&errors);
8285 used_blocks.resize(fm->get_alloc_units());
8286
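// Pre-mark allocation units that are legitimately in use but never referenced
// by onodes: BlueFS extents on the shared device and the
// 0..max(min_alloc_size, SUPER_RESERVED) reservation at the start of the
// device.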
8287 if (bluefs) {
8288 interval_set<uint64_t> bluefs_extents;
8289
8290 int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
8291 ceph_assert(r == 0);
8292 for (auto [start, len] : bluefs_extents) {
8293 apply_for_bitset_range(start, len, alloc_size, used_blocks,
8294 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
8295 ceph_assert(pos < bs.size());
8296 bs.set(pos);
8297 }
8298 );
8299 }
8300 }
8301
8302 bluefs_used_blocks = used_blocks;
8303
8304 apply_for_bitset_range(
8305 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
8306 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8307 bs.set(pos);
8308 }
8309 );
8310
8311
8312 if (repair) {
8313 repairer.init_space_usage_tracker(
8314 bdev->get_size(),
8315 min_alloc_size);
8316 }
8317
8318 if (bluefs) {
8319 int r = bluefs->fsck();
8320 if (r < 0) {
8321 return r;
8322 }
8323 if (r > 0)
8324 errors += r;
8325 }
8326
8327 if (!per_pool_stat_collection) {
8328 const char *w;
8329 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
8330 w = "error";
8331 ++errors;
8332 } else {
8333 w = "warning";
8334 ++warnings;
8335 }
8336 derr << "fsck " << w << ": store not yet converted to per-pool stats"
8337 << dendl;
8338 }
8339 if (per_pool_omap != OMAP_PER_PG) {
8340 const char *w;
8341 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8342 w = "error";
8343 ++errors;
8344 } else {
8345 w = "warning";
8346 ++warnings;
8347 }
8348 derr << "fsck " << w << ": store not yet converted to per-pg omap"
8349 << dendl;
8350 }
8351
8352 // get expected statfs; reset unaffected fields to be able to compare
8353 // structs
8354 statfs(&actual_statfs);
8355 actual_statfs.total = 0;
8356 actual_statfs.internally_reserved = 0;
8357 actual_statfs.available = 0;
8358 actual_statfs.internal_metadata = 0;
8359 actual_statfs.omap_allocated = 0;
8360
8361 if (g_conf()->bluestore_debug_fsck_abort) {
8362 dout(1) << __func__ << " debug abort" << dendl;
8363 goto out_scan;
8364 }
8365 // walk PREFIX_OBJ
8366 {
8367 dout(1) << __func__ << " walking object keyspace" << dendl;
8368 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8369 BlueStore::FSCK_ObjectCtx ctx(
8370 errors,
8371 warnings,
8372 num_objects,
8373 num_extents,
8374 num_blobs,
8375 num_sharded_objects,
8376 num_spanning_blobs,
8377 &used_blocks,
8378 &used_omap_head,
8379 //no need for the below lock when in non-shallow mode as
8380 // there is no multithreading in this case
8381 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
8382 sb_info,
8383 expected_store_statfs,
8384 expected_pool_statfs,
8385 repair ? &repairer : nullptr);
8386
8387 _fsck_check_objects(depth, ctx);
8388 }
8389
8390 dout(1) << __func__ << " checking shared_blobs" << dendl;
8391 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
8392 if (it) {
8393 // FIXME minor: perhaps simplify for shallow mode?
8394 // fill global if not overridden below
8395 auto expected_statfs = &expected_store_statfs;
8396
8397 for (it->lower_bound(string()); it->valid(); it->next()) {
8398 string key = it->key();
8399 uint64_t sbid;
8400 if (get_key_shared_blob(key, &sbid)) {
8401 derr << "fsck error: bad key '" << key
8402 << "' in shared blob namespace" << dendl;
8403 if (repair) {
8404 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8405 }
8406 ++errors;
8407 continue;
8408 }
8409 auto p = sb_info.find(sbid);
8410 if (p == sb_info.end()) {
8411 derr << "fsck error: found stray shared blob data for sbid 0x"
8412 << std::hex << sbid << std::dec << dendl;
8413 if (repair) {
8414 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8415 }
8416 ++errors;
8417 } else {
8418 ++num_shared_blobs;
8419 sb_info_t& sbi = p->second;
8420 bluestore_shared_blob_t shared_blob(sbid);
8421 bufferlist bl = it->value();
8422 auto blp = bl.cbegin();
8423 try {
8424 decode(shared_blob, blp);
8425 } catch (ceph::buffer::error& e) {
8426 ++errors;
8427 // Force update and don't report as missing
8428 sbi.updated = sbi.passed = true;
8429
8430 derr << "fsck error: failed to decode Shared Blob "
8431 << pretty_binary_string(it->key()) << dendl;
8432 if (repair) {
8433 dout(20) << __func__ << " undecodable Shared Blob, key:'"
8434 << pretty_binary_string(it->key())
8435 << "', removing" << dendl;
8436 repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
8437 }
8438 continue;
8439 }
8440 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
8441 if (shared_blob.ref_map != sbi.ref_map) {
8442 derr << "fsck error: shared blob 0x" << std::hex << sbid
8443 << std::dec << " ref_map " << shared_blob.ref_map
8444 << " != expected " << sbi.ref_map << dendl;
8445 sbi.updated = true; // will update later in repair mode only!
8446 ++errors;
8447 }
8448 PExtentVector extents;
8449 for (auto &r : shared_blob.ref_map.ref_map) {
8450 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
8451 }
8452 if (per_pool_stat_collection || repair) {
8453 expected_statfs = &expected_pool_statfs[sbi.pool_id];
8454 }
8455 errors += _fsck_check_extents(sbi.cid,
8456 p->second.oids.front(),
8457 extents,
8458 p->second.compressed,
8459 used_blocks,
8460 fm->get_alloc_size(),
8461 repair ? &repairer : nullptr,
8462 *expected_statfs,
8463 depth);
8464 sbi.passed = true;
8465 }
8466 }
8467 } // if (it)
8468
8469 if (repair && repairer.preprocess_misreference(db)) {
8470
8471 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
8472 auto& misref_extents = repairer.get_misreferences();
8473 interval_set<uint64_t> to_release;
8474 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
8475 if (it) {
8476 // fill global if not overridden below
8477 auto expected_statfs = &expected_store_statfs;
8478
8479 CollectionRef c;
8480 spg_t pgid;
8481 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
8482 bool bypass_rest = false;
8483 for (it->lower_bound(string()); it->valid() && !bypass_rest;
8484 it->next()) {
8485 dout(30) << __func__ << " key "
8486 << pretty_binary_string(it->key()) << dendl;
8487 if (is_extent_shard_key(it->key())) {
8488 continue;
8489 }
8490
8491 ghobject_t oid;
8492 int r = get_key_object(it->key(), &oid);
8493 if (r < 0 || !repairer.is_used(oid)) {
8494 continue;
8495 }
8496
8497 if (!c ||
8498 oid.shard_id != pgid.shard ||
8499 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8500 !c->contains(oid)) {
8501 c = nullptr;
8502 for (auto& p : coll_map) {
8503 if (p.second->contains(oid)) {
8504 c = p.second;
8505 break;
8506 }
8507 }
8508 if (!c) {
8509 continue;
8510 }
8511 if (per_pool_stat_collection || repair) {
8512 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8513 expected_statfs = &expected_pool_statfs[pool_id];
8514 }
8515 }
8516 if (!repairer.is_used(c->cid)) {
8517 continue;
8518 }
8519
8520 dout(20) << __func__ << " check misreference for col:" << c->cid
8521 << " obj:" << oid << dendl;
8522
8523 OnodeRef o;
8524 o.reset(Onode::decode(c, oid, it->key(), it->value()));
8525 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8526 mempool::bluestore_fsck::set<BlobRef> blobs;
8527
8528 for (auto& e : o->extent_map.extent_map) {
8529 blobs.insert(e.blob);
8530 }
8531 bool need_onode_update = false;
8532 bool first_dump = true;
8533 for(auto b : blobs) {
8534 bool broken_blob = false;
8535 auto& pextents = b->dirty_blob().dirty_extents();
8536 for (auto& e : pextents) {
8537 if (!e.is_valid()) {
8538 continue;
8539 }
8540 // for the sake of simplicity and proper shared blob handling
8541 // always rewrite the whole blob even when it's partially
8542 // misreferenced.
8543 if (misref_extents.intersects(e.offset, e.length)) {
8544 if (first_dump) {
8545 first_dump = false;
8546 _dump_onode<10>(cct, *o);
8547 }
8548 broken_blob = true;
8549 break;
8550 }
8551 }
8552 if (!broken_blob)
8553 continue;
8554 bool compressed = b->get_blob().is_compressed();
8555 need_onode_update = true;
8556 dout(10) << __func__
8557 << " fix misreferences in oid:" << oid
8558 << " " << *b << dendl;
8559 uint64_t b_off = 0;
8560 PExtentVector pext_to_release;
8561 pext_to_release.reserve(pextents.size());
8562 // rewriting all valid pextents
8563 for (auto e = pextents.begin(); e != pextents.end();
8564 b_off += e->length, e++) {
8565 if (!e->is_valid()) {
8566 continue;
8567 }
8568 PExtentVector exts;
8569 int64_t alloc_len =
8570 shared_alloc.a->allocate(e->length, min_alloc_size,
8571 0, 0, &exts);
8572 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
8573 derr << __func__
8574 << " failed to allocate 0x" << std::hex << e->length
8575 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
8576 << " min_alloc_size 0x" << min_alloc_size
8577 << " available 0x " << shared_alloc.a->get_free()
8578 << std::dec << dendl;
8579 if (alloc_len > 0) {
8580 shared_alloc.a->release(exts);
8581 }
8582 bypass_rest = true;
8583 break;
8584 }
8585 expected_statfs->allocated += e->length;
8586 if (compressed) {
8587 expected_statfs->data_compressed_allocated += e->length;
8588 }
8589
8590 bufferlist bl;
8591 IOContext ioc(cct, NULL, true); // allow EIO
8592 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
8593 if (r < 0) {
8594 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
8595 <<"~" << e->length << std::dec << dendl;
8596 ceph_abort_msg("read failed, wtf");
8597 }
8598 pext_to_release.push_back(*e);
8599 e = pextents.erase(e);
8600 e = pextents.insert(e, exts.begin(), exts.end());
8601 b->get_blob().map_bl(
8602 b_off, bl,
8603 [&](uint64_t offset, bufferlist& t) {
8604 int r = bdev->write(offset, t, false);
8605 ceph_assert(r == 0);
8606 });
8607 e += exts.size() - 1;
8608 for (auto& p : exts) {
8609 fm->allocate(p.offset, p.length, txn);
8610 }
8611 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8612
8613 if (b->get_blob().is_shared()) {
8614 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
8615
8616 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
8617 ceph_assert(sb_it != sb_info.end());
8618 sb_info_t& sbi = sb_it->second;
8619
8620 for (auto& r : sbi.ref_map.ref_map) {
8621 expected_statfs->allocated -= r.second.length;
8622 if (sbi.compressed) {
8623 // NB: it's crucial to use compressed flag from sb_info_t
8624 // as we originally used that value while accumulating
8625 // expected_statfs
8626 expected_statfs->data_compressed_allocated -= r.second.length;
8627 }
8628 }
8629 sbi.updated = sbi.passed = true;
8630 sbi.ref_map.clear();
8631
8632 // relying on blob's pextents to decide what to release.
8633 for (auto& p : pext_to_release) {
8634 to_release.union_insert(p.offset, p.length);
8635 }
8636 } else {
8637 for (auto& p : pext_to_release) {
8638 expected_statfs->allocated -= p.length;
8639 if (compressed) {
8640 expected_statfs->data_compressed_allocated -= p.length;
8641 }
8642 to_release.union_insert(p.offset, p.length);
8643 }
8644 }
8645 if (bypass_rest) {
8646 break;
8647 }
8648 } // for(auto b : blobs)
8649 if (need_onode_update) {
8650 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
8651 _record_onode(o, txn);
8652 }
8653 } // for (it->lower_bound(string()); it->valid(); it->next())
8654
8655 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
8656 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
8657 << "~" << it.get_len() << std::dec << dendl;
8658 fm->release(it.get_start(), it.get_len(), txn);
8659 }
8660 shared_alloc.a->release(to_release);
8661 to_release.clear();
8662 } // if (it) {
8663 } //if (repair && repairer.preprocess_misreference()) {
8664
8665 if (depth != FSCK_SHALLOW) {
8666 for (auto &p : sb_info) {
8667 sb_info_t& sbi = p.second;
8668 if (!sbi.passed) {
8669 derr << "fsck error: missing " << *sbi.sb << dendl;
8670 ++errors;
8671 }
8672 if (repair && (!sbi.passed || sbi.updated)) {
8673 auto sbid = p.first;
8674 if (sbi.ref_map.empty()) {
8675 ceph_assert(sbi.passed);
8676 dout(20) << __func__ << " " << *sbi.sb
8677 << " is empty, removing" << dendl;
8678 repairer.fix_shared_blob(db, sbid, nullptr);
8679 } else {
8680 bufferlist bl;
8681 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
8682 encode(persistent, bl);
8683 dout(20) << __func__ << " " << *sbi.sb
8684 << " is " << bl.length() << " bytes, updating" << dendl;
8685
8686 repairer.fix_shared_blob(db, sbid, &bl);
8687 }
8688 }
8689 }
8690 }
8691 sb_info.clear();
8692
8693 // check global stats only if fscking (not repairing) w/o per-pool stats
8694 if (!per_pool_stat_collection &&
8695 !repair &&
8696 !(actual_statfs == expected_store_statfs)) {
8697 derr << "fsck error: actual " << actual_statfs
8698 << " != expected " << expected_store_statfs << dendl;
8699 if (repair) {
8700 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
8701 expected_store_statfs);
8702 }
8703 ++errors;
8704 }
8705
8706 dout(1) << __func__ << " checking pool_statfs" << dendl;
8707 _fsck_check_pool_statfs(expected_pool_statfs,
8708 errors, warnings, repair ? &repairer : nullptr);
8709
8710 if (depth != FSCK_SHALLOW) {
8711 dout(1) << __func__ << " checking for stray omap data " << dendl;
8712 it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
8713 if (it) {
8714 uint64_t last_omap_head = 0;
8715 for (it->lower_bound(string()); it->valid(); it->next()) {
8716 uint64_t omap_head;
8717
8718 _key_decode_u64(it->key().c_str(), &omap_head);
8719
8720 if (used_omap_head.count(omap_head) == 0 &&
8721 omap_head != last_omap_head) {
8722 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8723 << "fsck error: found stray omap data on omap_head "
8724 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8725 ++errors;
8726 last_omap_head = omap_head;
8727 }
8728 }
8729 }
8730 it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
8731 if (it) {
8732 uint64_t last_omap_head = 0;
8733 for (it->lower_bound(string()); it->valid(); it->next()) {
8734 uint64_t omap_head;
8735 _key_decode_u64(it->key().c_str(), &omap_head);
8736 if (used_omap_head.count(omap_head) == 0 &&
8737 omap_head != last_omap_head) {
8738 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8739 << "fsck error: found stray (pgmeta) omap data on omap_head "
8740 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8741 last_omap_head = omap_head;
8742 ++errors;
8743 }
8744 }
8745 }
8746 it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
8747 if (it) {
8748 uint64_t last_omap_head = 0;
8749 for (it->lower_bound(string()); it->valid(); it->next()) {
8750 uint64_t pool;
8751 uint64_t omap_head;
8752 string k = it->key();
8753 const char *c = k.c_str();
8754 c = _key_decode_u64(c, &pool);
8755 c = _key_decode_u64(c, &omap_head);
8756 if (used_omap_head.count(omap_head) == 0 &&
8757 omap_head != last_omap_head) {
8758 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8759 << "fsck error: found stray (per-pool) omap data on omap_head "
8760 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8761 ++errors;
8762 last_omap_head = omap_head;
8763 }
8764 }
8765 }
8766 it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
8767 if (it) {
8768 uint64_t last_omap_head = 0;
8769 for (it->lower_bound(string()); it->valid(); it->next()) {
8770 uint64_t pool;
8771 uint32_t hash;
8772 uint64_t omap_head;
8773 string k = it->key();
8774 const char* c = k.c_str();
8775 c = _key_decode_u64(c, &pool);
8776 c = _key_decode_u32(c, &hash);
8777 c = _key_decode_u64(c, &omap_head);
8778 if (used_omap_head.count(omap_head) == 0 &&
8779 omap_head != last_omap_head) {
8780 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8781 << "fsck error: found stray (per-pg) omap data on omap_head "
8782 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8783 ++errors;
8784 last_omap_head = omap_head;
8785 }
8786 }
8787 }
8788 dout(1) << __func__ << " checking deferred events" << dendl;
8789 it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
8790 if (it) {
8791 for (it->lower_bound(string()); it->valid(); it->next()) {
8792 bufferlist bl = it->value();
8793 auto p = bl.cbegin();
8794 bluestore_deferred_transaction_t wt;
8795 try {
8796 decode(wt, p);
8797 } catch (ceph::buffer::error& e) {
8798 derr << "fsck error: failed to decode deferred txn "
8799 << pretty_binary_string(it->key()) << dendl;
8800 if (repair) {
8801 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
8802 << pretty_binary_string(it->key())
8803 << "', removing" << dendl;
8804 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
8805 }
8806 continue;
8807 }
8808 dout(20) << __func__ << " deferred " << wt.seq
8809 << " ops " << wt.ops.size()
8810 << " released 0x" << std::hex << wt.released << std::dec << dendl;
8811 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
8812 apply_for_bitset_range(
8813 e.get_start(), e.get_len(), alloc_size, used_blocks,
8814 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8815 bs.set(pos);
8816 }
8817 );
8818 }
8819 }
8820 }
8821
8822 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
8823 {
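// Two-way comparison between the FreelistManager and the used_blocks bitmap
// built above: an extent reported free while its bits are already set (and
// not owned by BlueFS) is a 'false free'; after folding the freelist into
// the bitmap, any bit still clear is a leaked extent. Both are fixable in
// repair mode.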
8824 fm->enumerate_reset();
8825 uint64_t offset, length;
8826 while (fm->enumerate_next(db, &offset, &length)) {
8827 bool intersects = false;
8828 apply_for_bitset_range(
8829 offset, length, alloc_size, used_blocks,
8830 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8831 ceph_assert(pos < bs.size());
8832 if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
8833 if (offset == SUPER_RESERVED &&
8834 length == min_alloc_size - SUPER_RESERVED) {
8835 // this is due to the change just after luminous to min_alloc_size
8836 // granularity allocations, and our baked in assumption at the top
8837 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
8838 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
8839 // since we will never allocate this region below min_alloc_size.
8840 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
8841 << " and min_alloc_size, 0x" << std::hex << offset << "~"
8842 << length << std::dec << dendl;
8843 } else {
8844 intersects = true;
8845 if (repair) {
8846 repairer.fix_false_free(db, fm,
8847 pos * min_alloc_size,
8848 min_alloc_size);
8849 }
8850 }
8851 } else {
8852 bs.set(pos);
8853 }
8854 }
8855 );
8856 if (intersects) {
8857 derr << "fsck error: free extent 0x" << std::hex << offset
8858 << "~" << length << std::dec
8859 << " intersects allocated blocks" << dendl;
8860 ++errors;
8861 }
8862 }
8863 fm->enumerate_reset();
8864 size_t count = used_blocks.count();
8865 if (used_blocks.size() != count) {
8866 ceph_assert(used_blocks.size() > count);
8867 used_blocks.flip();
8868 size_t start = used_blocks.find_first();
8869 while (start != decltype(used_blocks)::npos) {
8870 size_t cur = start;
8871 while (true) {
8872 size_t next = used_blocks.find_next(cur);
8873 if (next != cur + 1) {
8874 ++errors;
8875 derr << "fsck error: leaked extent 0x" << std::hex
8876 << ((uint64_t)start * fm->get_alloc_size()) << "~"
8877 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
8878 << dendl;
8879 if (repair) {
8880 repairer.fix_leaked(db,
8881 fm,
8882 start * min_alloc_size,
8883 (cur + 1 - start) * min_alloc_size);
8884 }
8885 start = next;
8886 break;
8887 }
8888 cur = next;
8889 }
8890 }
8891 used_blocks.flip();
8892 }
8893 }
8894 }
8895 if (repair) {
8896 if (per_pool_omap != OMAP_PER_PG) {
8897 dout(5) << __func__ << " fixing per_pg_omap" << dendl;
8898 repairer.fix_per_pool_omap(db, OMAP_PER_PG);
8899 }
8900
8901 dout(5) << __func__ << " applying repair results" << dendl;
8902 repaired = repairer.apply(db);
8903 dout(5) << __func__ << " repair applied" << dendl;
8904 }
8905
8906 out_scan:
8907 dout(2) << __func__ << " " << num_objects << " objects, "
8908 << num_sharded_objects << " of them sharded. "
8909 << dendl;
8910 dout(2) << __func__ << " " << num_extents << " extents to "
8911 << num_blobs << " blobs, "
8912 << num_spanning_blobs << " spanning, "
8913 << num_shared_blobs << " shared."
8914 << dendl;
8915
8916 utime_t duration = ceph_clock_now() - start;
8917 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
8918 << warnings << " warnings, "
8919 << repaired << " repaired, "
8920 << (errors + warnings - (int)repaired) << " remaining in "
8921 << duration << " seconds" << dendl;
8922
8923 // In non-repair mode we return the error count only, as
8924 // it indicates whether the store status is OK.
8925 // In repair mode both errors and warnings are taken into account
8926 // since the repaired counter relates to both of them.
8927 return repair ? errors + warnings - (int)repaired : errors;
8928 }
8929
8930 /// methods to inject various errors fsck can repair
8931 void BlueStore::inject_broken_shared_blob_key(const string& key,
8932 const bufferlist& bl)
8933 {
8934 KeyValueDB::Transaction txn;
8935 txn = db->get_transaction();
8936 txn->set(PREFIX_SHARED_BLOB, key, bl);
8937 db->submit_transaction_sync(txn);
8938 };
8939
8940 void BlueStore::inject_leaked(uint64_t len)
8941 {
8942 KeyValueDB::Transaction txn;
8943 txn = db->get_transaction();
8944
8945 PExtentVector exts;
8946 int64_t alloc_len = shared_alloc.a->allocate(len, min_alloc_size,
8947 min_alloc_size * 256, 0, &exts);
8948 ceph_assert(alloc_len >= (int64_t)len);
8949 for (auto& p : exts) {
8950 fm->allocate(p.offset, p.length, txn);
8951 }
8952 db->submit_transaction_sync(txn);
8953 }
8954
8955 void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
8956 {
8957 KeyValueDB::Transaction txn;
8958 OnodeRef o;
8959 CollectionRef c = _get_collection(cid);
8960 ceph_assert(c);
8961 {
8962 std::unique_lock l{c->lock}; // just to avoid internal asserts
8963 o = c->get_onode(oid, false);
8964 ceph_assert(o);
8965 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8966 }
8967
8968 bool injected = false;
8969 txn = db->get_transaction();
8970 auto& em = o->extent_map.extent_map;
8971 std::vector<const PExtentVector*> v;
8972 if (em.size()) {
8973 v.push_back(&em.begin()->blob->get_blob().get_extents());
8974 }
8975 if (em.size() > 1) {
8976 auto it = em.end();
8977 --it;
8978 v.push_back(&(it->blob->get_blob().get_extents()));
8979 }
8980 for (auto pext : v) {
8981 if (pext->size()) {
8982 auto p = pext->begin();
8983 while (p != pext->end()) {
8984 if (p->is_valid()) {
8985 dout(20) << __func__ << " release 0x" << std::hex << p->offset
8986 << "~" << p->length << std::dec << dendl;
8987 fm->release(p->offset, p->length, txn);
8988 injected = true;
8989 break;
8990 }
8991 ++p;
8992 }
8993 }
8994 }
8995 ceph_assert(injected);
8996 db->submit_transaction_sync(txn);
8997 }
8998
8999 void BlueStore::inject_legacy_omap()
9000 {
9001 dout(1) << __func__ << dendl;
9002 per_pool_omap = OMAP_BULK;
9003 KeyValueDB::Transaction txn;
9004 txn = db->get_transaction();
9005 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
9006 db->submit_transaction_sync(txn);
9007 }
9008
9009 void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
9010 {
9011 dout(1) << __func__ << " "
9012 << cid << " " << oid
9013 <<dendl;
9014 KeyValueDB::Transaction txn;
9015 OnodeRef o;
9016 CollectionRef c = _get_collection(cid);
9017 ceph_assert(c);
9018 {
9019 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9020 o = c->get_onode(oid, false);
9021 ceph_assert(o);
9022 }
9023 o->onode.clear_flag(
9024 bluestore_onode_t::FLAG_PERPG_OMAP |
9025 bluestore_onode_t::FLAG_PERPOOL_OMAP |
9026 bluestore_onode_t::FLAG_PGMETA_OMAP);
9027 txn = db->get_transaction();
9028 _record_onode(o, txn);
9029 db->submit_transaction_sync(txn);
9030 }
9031
9032
9033 void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
9034 {
9035 BlueStoreRepairer repairer;
9036 repairer.fix_statfs(db, key, new_statfs);
9037 repairer.apply(db);
9038 }
9039
9040 void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
9041 {
9042 KeyValueDB::Transaction t = db->get_transaction();
9043 volatile_statfs v;
9044 v = new_statfs;
9045 bufferlist bl;
9046 v.encode(bl);
9047 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
9048 db->submit_transaction_sync(t);
9049 }
9050
9051 void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
9052 coll_t cid2, ghobject_t oid2,
9053 uint64_t offset)
9054 {
9055 OnodeRef o1;
9056 CollectionRef c1 = _get_collection(cid1);
9057 ceph_assert(c1);
9058 {
9059 std::unique_lock l{c1->lock}; // just to avoid internal asserts
9060 o1 = c1->get_onode(oid1, false);
9061 ceph_assert(o1);
9062 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9063 }
9064 OnodeRef o2;
9065 CollectionRef c2 = _get_collection(cid2);
9066 ceph_assert(c2);
9067 {
9068 std::unique_lock l{c2->lock}; // just to avoid internal asserts
9069 o2 = c2->get_onode(oid2, false);
9070 ceph_assert(o2);
9071 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9072 }
9073 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
9074 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
9075
9076 // require onode/extent layout to be the same (and simple)
9077 // to make things easier
9078 ceph_assert(o1->onode.extent_map_shards.empty());
9079 ceph_assert(o2->onode.extent_map_shards.empty());
9080 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
9081 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
9082 ceph_assert(e1.logical_offset == e2.logical_offset);
9083 ceph_assert(e1.length == e2.length);
9084 ceph_assert(e1.blob_offset == e2.blob_offset);
9085
9086 KeyValueDB::Transaction txn;
9087 txn = db->get_transaction();
9088
9089 // along with the misreference error this will create space leak errors
9090 e2.blob->dirty_blob() = e1.blob->get_blob();
9091 o2->extent_map.dirty_range(offset, e2.length);
9092 o2->extent_map.update(txn, false);
9093
9094 _record_onode(o2, txn);
9095 db->submit_transaction_sync(txn);
9096 }
9097
9098 void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
9099 int16_t blob_id)
9100 {
9101 OnodeRef o;
9102 CollectionRef c = _get_collection(cid);
9103 ceph_assert(c);
9104 {
9105 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9106 o = c->get_onode(oid, false);
9107 ceph_assert(o);
9108 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9109 }
9110
9111 BlobRef b = c->new_blob();
9112 b->id = blob_id;
9113 o->extent_map.spanning_blob_map[blob_id] = b;
9114
9115 KeyValueDB::Transaction txn;
9116 txn = db->get_transaction();
9117
9118 _record_onode(o, txn);
9119 db->submit_transaction_sync(txn);
9120 }
9121
9122 void BlueStore::collect_metadata(map<string,string> *pm)
9123 {
9124 dout(10) << __func__ << dendl;
9125 bdev->collect_metadata("bluestore_bdev_", pm);
9126 if (bluefs) {
9127 (*pm)["bluefs"] = "1";
9128 // this value is for backward compatibility only
9129 (*pm)["bluefs_single_shared_device"] = \
9130 stringify((int)bluefs_layout.single_shared_device());
9131 (*pm)["bluefs_dedicated_db"] = \
9132 stringify((int)bluefs_layout.dedicated_db);
9133 (*pm)["bluefs_dedicated_wal"] = \
9134 stringify((int)bluefs_layout.dedicated_wal);
9135 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
9136 } else {
9137 (*pm)["bluefs"] = "0";
9138 }
9139
9140 // report numa mapping for underlying devices
9141 int node = -1;
9142 set<int> nodes;
9143 set<string> failed;
9144 int r = get_numa_node(&node, &nodes, &failed);
9145 if (r >= 0) {
9146 if (!failed.empty()) {
9147 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
9148 }
9149 if (!nodes.empty()) {
9150 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
9151 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
9152 }
9153 if (node >= 0) {
9154 (*pm)["objectstore_numa_node"] = stringify(node);
9155 }
9156 }
9157 }
9158
9159 int BlueStore::get_numa_node(
9160 int *final_node,
9161 set<int> *out_nodes,
9162 set<string> *out_failed)
9163 {
9164 int node = -1;
9165 set<string> devices;
9166 get_devices(&devices);
9167 set<int> nodes;
9168 set<string> failed;
9169 for (auto& devname : devices) {
9170 int n;
9171 BlkDev bdev(devname);
9172 int r = bdev.get_numa_node(&n);
9173 if (r < 0) {
9174 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
9175 << dendl;
9176 failed.insert(devname);
9177 continue;
9178 }
9179 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
9180 << dendl;
9181 nodes.insert(n);
9182 if (node < 0) {
9183 node = n;
9184 }
9185 }
9186 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
9187 *final_node = node;
9188 }
9189 if (out_nodes) {
9190 *out_nodes = nodes;
9191 }
9192 if (out_failed) {
9193 *out_failed = failed;
9194 }
9195 return 0;
9196 }
9197
9198 int BlueStore::get_devices(set<string> *ls)
9199 {
9200 if (bdev) {
9201 bdev->get_devices(ls);
9202 if (bluefs) {
9203 bluefs->get_devices(ls);
9204 }
9205 return 0;
9206 }
9207
9208 // grumble, we haven't started up yet.
9209 int r = _open_path();
9210 if (r < 0)
9211 goto out;
9212 r = _open_fsid(false);
9213 if (r < 0)
9214 goto out_path;
9215 r = _read_fsid(&fsid);
9216 if (r < 0)
9217 goto out_fsid;
9218 r = _lock_fsid();
9219 if (r < 0)
9220 goto out_fsid;
9221 r = _open_bdev(false);
9222 if (r < 0)
9223 goto out_fsid;
9224 r = _minimal_open_bluefs(false);
9225 if (r < 0)
9226 goto out_bdev;
9227 bdev->get_devices(ls);
9228 if (bluefs) {
9229 bluefs->get_devices(ls);
9230 }
9231 r = 0;
9232 _minimal_close_bluefs();
9233 out_bdev:
9234 _close_bdev();
9235 out_fsid:
9236 _close_fsid();
9237 out_path:
9238 _close_path();
9239 out:
9240 return r;
9241 }
9242
9243 void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
9244 {
9245 buf->reset();
9246
9247 auto prefix = per_pool_omap == OMAP_BULK ?
9248 PREFIX_OMAP :
9249 per_pool_omap == OMAP_PER_POOL ?
9250 PREFIX_PERPOOL_OMAP :
9251 PREFIX_PERPG_OMAP;
9252 buf->omap_allocated =
9253 db->estimate_prefix_size(prefix, string());
9254
9255 uint64_t bfree = shared_alloc.a->get_free();
9256
9257 if (bluefs) {
9258 buf->internally_reserved = 0;
9259 // include dedicated db, too, if that isn't the shared device.
9260 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
9261 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
9262 }
9263 // call any non-omap bluefs space "internal metadata"
9264 buf->internal_metadata =
9265 bluefs->get_used()
9266 - buf->omap_allocated;
9267 }
9268
9269 uint64_t thin_total, thin_avail;
9270 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
9271 buf->total += thin_total;
9272
9273 // we are limited by both the size of the virtual device and the
9274 // underlying physical device.
9275 bfree = std::min(bfree, thin_avail);
9276
9277 buf->allocated = thin_total - thin_avail;
9278 } else {
9279 buf->total += bdev->get_size();
9280 }
9281 buf->available = bfree;
9282 }
9283
9284 int BlueStore::statfs(struct store_statfs_t *buf,
9285 osd_alert_list_t* alerts)
9286 {
9287 if (alerts) {
9288 alerts->clear();
9289 _log_alerts(*alerts);
9290 }
9291 _get_statfs_overall(buf);
9292 {
9293 std::lock_guard l(vstatfs_lock);
9294 buf->allocated = vstatfs.allocated();
9295 buf->data_stored = vstatfs.stored();
9296 buf->data_compressed = vstatfs.compressed();
9297 buf->data_compressed_original = vstatfs.compressed_original();
9298 buf->data_compressed_allocated = vstatfs.compressed_allocated();
9299 }
9300
9301 dout(20) << __func__ << " " << *buf << dendl;
9302 return 0;
9303 }
9304
9305 int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
9306 bool *out_per_pool_omap)
9307 {
9308 dout(20) << __func__ << " pool " << pool_id << dendl;
9309
9310 if (!per_pool_stat_collection) {
9311 dout(20) << __func__ << " not supported in legacy mode " << dendl;
9312 return -ENOTSUP;
9313 }
9314 buf->reset();
9315
9316 {
9317 std::lock_guard l(vstatfs_lock);
9318 osd_pools[pool_id].publish(buf);
9319 }
9320
9321 string key_prefix;
9322 _key_encode_u64(pool_id, &key_prefix);
9323 *out_per_pool_omap = per_pool_omap != OMAP_BULK;
9324 if (*out_per_pool_omap) {
9325 auto prefix = per_pool_omap == OMAP_PER_POOL ?
9326 PREFIX_PERPOOL_OMAP :
9327 PREFIX_PERPG_OMAP;
9328 buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
9329 }
9330
9331 dout(10) << __func__ << " " << *buf << dendl;
9332 return 0;
9333 }
9334
9335 void BlueStore::_check_legacy_statfs_alert()
9336 {
9337 string s;
9338 if (!per_pool_stat_collection &&
9339 cct->_conf->bluestore_warn_on_legacy_statfs) {
9340 s = "legacy statfs reporting detected, "
9341 "suggest to run store repair to get consistent statistic reports";
9342 }
9343 std::lock_guard l(qlock);
9344 legacy_statfs_alert = s;
9345 }
9346
9347 void BlueStore::_check_no_per_pg_or_pool_omap_alert()
9348 {
9349 string per_pg, per_pool;
9350 if (per_pool_omap != OMAP_PER_PG) {
9351 if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
9352 per_pg = "legacy (not per-pg) omap detected, "
9353 "suggest to run store repair to benefit from faster PG removal";
9354 }
9355 if (per_pool_omap != OMAP_PER_POOL) {
9356 if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
9357 per_pool = "legacy (not per-pool) omap detected, "
9358 "suggest to run store repair to benefit from per-pool omap usage statistics";
9359 }
9360 }
9361 }
9362 std::lock_guard l(qlock);
9363 no_per_pg_omap_alert = per_pg;
9364 no_per_pool_omap_alert = per_pool;
9365 }
9366
9367 // ---------------
9368 // cache
9369
9370 BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
9371 {
9372 std::shared_lock l(coll_lock);
9373 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
9374 if (cp == coll_map.end())
9375 return CollectionRef();
9376 return cp->second;
9377 }
9378
9379 void BlueStore::_queue_reap_collection(CollectionRef& c)
9380 {
9381 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9382 // _reap_collections and this run in the same thread,
9383 // so no lock is needed.
9384 removed_collections.push_back(c);
9385 }
9386
9387 void BlueStore::_reap_collections()
9388 {
9389
9390 list<CollectionRef> removed_colls;
9391 {
9392 // _queue_reap_collection and this run in the same thread,
9393 // so no lock is needed.
9394 if (!removed_collections.empty())
9395 removed_colls.swap(removed_collections);
9396 else
9397 return;
9398 }
9399
9400 list<CollectionRef>::iterator p = removed_colls.begin();
9401 while (p != removed_colls.end()) {
9402 CollectionRef c = *p;
9403 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9404 if (c->onode_map.map_any([&](Onode* o) {
9405 ceph_assert(!o->exists);
9406 if (o->flushing_count.load()) {
9407 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
9408 << " flush_txns " << o->flushing_count << dendl;
9409 return true;
9410 }
9411 return false;
9412 })) {
9413 ++p;
9414 continue;
9415 }
9416 c->onode_map.clear();
9417 p = removed_colls.erase(p);
9418 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
9419 }
9420 if (removed_colls.empty()) {
9421 dout(10) << __func__ << " all reaped" << dendl;
9422 } else {
9423 removed_collections.splice(removed_collections.begin(), removed_colls);
9424 }
9425 }
9426
9427 void BlueStore::_update_cache_logger()
9428 {
9429 uint64_t num_onodes = 0;
9430 uint64_t num_pinned_onodes = 0;
9431 uint64_t num_extents = 0;
9432 uint64_t num_blobs = 0;
9433 uint64_t num_buffers = 0;
9434 uint64_t num_buffer_bytes = 0;
9435 for (auto c : onode_cache_shards) {
9436 c->add_stats(&num_onodes, &num_pinned_onodes);
9437 }
9438 for (auto c : buffer_cache_shards) {
9439 c->add_stats(&num_extents, &num_blobs,
9440 &num_buffers, &num_buffer_bytes);
9441 }
9442 logger->set(l_bluestore_onodes, num_onodes);
9443 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
9444 logger->set(l_bluestore_extents, num_extents);
9445 logger->set(l_bluestore_blobs, num_blobs);
9446 logger->set(l_bluestore_buffers, num_buffers);
9447 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
9448 }
9449
9450 // ---------------
9451 // read operations
9452
9453 ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
9454 {
9455 return _get_collection(cid);
9456 }
9457
9458 ObjectStore::CollectionHandle BlueStore::create_new_collection(
9459 const coll_t& cid)
9460 {
9461 std::unique_lock l{coll_lock};
9462 auto c = ceph::make_ref<Collection>(
9463 this,
9464 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
9465 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
9466 cid);
9467 new_coll_map[cid] = c;
9468 _osr_attach(c.get());
9469 return c;
9470 }
9471
9472 void BlueStore::set_collection_commit_queue(
9473 const coll_t& cid,
9474 ContextQueue *commit_queue)
9475 {
9476 if (commit_queue) {
9477 std::shared_lock l(coll_lock);
9478 if (coll_map.count(cid)) {
9479 coll_map[cid]->commit_queue = commit_queue;
9480 } else if (new_coll_map.count(cid)) {
9481 new_coll_map[cid]->commit_queue = commit_queue;
9482 }
9483 }
9484 }
9485
9486
9487 bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
9488 {
9489 Collection *c = static_cast<Collection *>(c_.get());
9490 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
9491 if (!c->exists)
9492 return false;
9493
9494 bool r = true;
9495
9496 {
9497 std::shared_lock l(c->lock);
9498 OnodeRef o = c->get_onode(oid, false);
9499 if (!o || !o->exists)
9500 r = false;
9501 }
9502
9503 return r;
9504 }
9505
9506 int BlueStore::stat(
9507 CollectionHandle &c_,
9508 const ghobject_t& oid,
9509 struct stat *st,
9510 bool allow_eio)
9511 {
9512 Collection *c = static_cast<Collection *>(c_.get());
9513 if (!c->exists)
9514 return -ENOENT;
9515 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9516
9517 {
9518 std::shared_lock l(c->lock);
9519 OnodeRef o = c->get_onode(oid, false);
9520 if (!o || !o->exists)
9521 return -ENOENT;
9522 st->st_size = o->onode.size;
9523 st->st_blksize = 4096;
9524 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
9525 st->st_nlink = 1;
9526 }
9527
9528 int r = 0;
9529 if (_debug_mdata_eio(oid)) {
9530 r = -EIO;
9531 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9532 }
9533 return r;
9534 }
9535 int BlueStore::set_collection_opts(
9536 CollectionHandle& ch,
9537 const pool_opts_t& opts)
9538 {
9539 Collection *c = static_cast<Collection *>(ch.get());
9540 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
9541 if (!c->exists)
9542 return -ENOENT;
9543 std::unique_lock l{c->lock};
9544 c->pool_opts = opts;
9545 return 0;
9546 }
9547
9548 int BlueStore::read(
9549 CollectionHandle &c_,
9550 const ghobject_t& oid,
9551 uint64_t offset,
9552 size_t length,
9553 bufferlist& bl,
9554 uint32_t op_flags)
9555 {
9556 auto start = mono_clock::now();
9557 Collection *c = static_cast<Collection *>(c_.get());
9558 const coll_t &cid = c->get_cid();
9559 dout(15) << __func__ << " " << cid << " " << oid
9560 << " 0x" << std::hex << offset << "~" << length << std::dec
9561 << dendl;
9562 if (!c->exists)
9563 return -ENOENT;
9564
9565 bl.clear();
9566 int r;
9567 {
9568 std::shared_lock l(c->lock);
9569 auto start1 = mono_clock::now();
9570 OnodeRef o = c->get_onode(oid, false);
9571 log_latency("get_onode@read",
9572 l_bluestore_read_onode_meta_lat,
9573 mono_clock::now() - start1,
9574 cct->_conf->bluestore_log_op_age);
9575 if (!o || !o->exists) {
9576 r = -ENOENT;
9577 goto out;
9578 }
9579
9580 if (offset == length && offset == 0)
9581 length = o->onode.size; // offset == 0 && length == 0 means read the whole object
9582
9583 r = _do_read(c, o, offset, length, bl, op_flags);
9584 if (r == -EIO) {
9585 logger->inc(l_bluestore_read_eio);
9586 }
9587 }
9588
9589 out:
9590 if (r >= 0 && _debug_data_eio(oid)) {
9591 r = -EIO;
9592 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9593 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
9594 cct->_conf->bluestore_debug_random_read_err &&
9595 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
9596 100.0)) == 0) {
9597 dout(0) << __func__ << ": inject random EIO" << dendl;
9598 r = -EIO;
9599 }
9600 dout(10) << __func__ << " " << cid << " " << oid
9601 << " 0x" << std::hex << offset << "~" << length << std::dec
9602 << " = " << r << dendl;
9603 log_latency(__func__,
9604 l_bluestore_read_lat,
9605 mono_clock::now() - start,
9606 cct->_conf->bluestore_log_op_age);
9607 return r;
9608 }
9609
9610 void BlueStore::_read_cache(
9611 OnodeRef o,
9612 uint64_t offset,
9613 size_t length,
9614 int read_cache_policy,
9615 ready_regions_t& ready_regions,
9616 blobs2read_t& blobs2read)
9617 {
9618 // build blob-wise list of stuff to read (that isn't cached)
9619 unsigned left = length;
9620 uint64_t pos = offset;
9621 auto lp = o->extent_map.seek_lextent(offset);
9622 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9623 if (pos < lp->logical_offset) {
9624 unsigned hole = lp->logical_offset - pos;
9625 if (hole >= left) {
9626 break;
9627 }
9628 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9629 << std::dec << dendl;
9630 pos += hole;
9631 left -= hole;
9632 }
9633 BlobRef& bptr = lp->blob;
9634 unsigned l_off = pos - lp->logical_offset;
9635 unsigned b_off = l_off + lp->blob_offset;
9636 unsigned b_len = std::min(left, lp->length - l_off);
9637
9638 ready_regions_t cache_res;
9639 interval_set<uint32_t> cache_interval;
9640 bptr->shared_blob->bc.read(
9641 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9642 read_cache_policy);
9643 dout(20) << __func__ << " blob " << *bptr << std::hex
9644 << " need 0x" << b_off << "~" << b_len
9645 << " cache has 0x" << cache_interval
9646 << std::dec << dendl;
9647
9648 auto pc = cache_res.begin();
9649 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
9650 while (b_len > 0) {
9651 unsigned l;
9652 if (pc != cache_res.end() &&
9653 pc->first == b_off) {
9654 l = pc->second.length();
9655 ready_regions[pos] = std::move(pc->second);
9656 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9657 << b_off << "~" << l << std::dec << dendl;
9658 ++pc;
9659 } else {
9660 l = b_len;
9661 if (pc != cache_res.end()) {
9662 ceph_assert(pc->first > b_off);
9663 l = pc->first - b_off;
9664 }
9665 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9666 << b_off << "~" << l << std::dec << dendl;
9667 // merge regions
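// round the read down/up to chunk_size boundaries so the device read is
// aligned, then merge it into the previous read_req_t if the two ranges
// touch or overlap; e.g. with chunk_size 4096, b_off 5000 and l 1000 we
// issue r_off 4096, r_len 4096 and remember front = 904 so that only the
// requested bytes are copied out when the result is assembled later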
9668 {
9669 uint64_t r_off = b_off;
9670 uint64_t r_len = l;
9671 uint64_t front = r_off % chunk_size;
9672 if (front) {
9673 r_off -= front;
9674 r_len += front;
9675 }
9676 unsigned tail = r_len % chunk_size;
9677 if (tail) {
9678 r_len += chunk_size - tail;
9679 }
9680 bool merged = false;
9681 regions2read_t& r2r = blobs2read[bptr];
9682 if (r2r.size()) {
9683 read_req_t& pre = r2r.back();
9684 if (r_off <= (pre.r_off + pre.r_len)) {
9685 front += (r_off - pre.r_off);
9686 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
9687 pre.regs.emplace_back(region_t(pos, b_off, l, front));
9688 merged = true;
9689 }
9690 }
9691 if (!merged) {
9692 read_req_t req(r_off, r_len);
9693 req.regs.emplace_back(region_t(pos, b_off, l, front));
9694 r2r.emplace_back(std::move(req));
9695 }
9696 }
9697 }
9698 pos += l;
9699 b_off += l;
9700 left -= l;
9701 b_len -= l;
9702 }
9703 ++lp;
9704 }
9705 }
9706
9707 int BlueStore::_prepare_read_ioc(
9708 blobs2read_t& blobs2read,
9709 vector<bufferlist>* compressed_blob_bls,
9710 IOContext* ioc)
9711 {
9712 for (auto& p : blobs2read) {
9713 const BlobRef& bptr = p.first;
9714 regions2read_t& r2r = p.second;
9715 dout(20) << __func__ << " blob " << *bptr << std::hex
9716 << " need " << r2r << std::dec << dendl;
9717 if (bptr->get_blob().is_compressed()) {
9718 // read the whole thing
9719 if (compressed_blob_bls->empty()) {
9720 // ensure we avoid any reallocation on subsequent blobs
9721 compressed_blob_bls->reserve(blobs2read.size());
9722 }
9723 compressed_blob_bls->push_back(bufferlist());
9724 bufferlist& bl = compressed_blob_bls->back();
9725 auto r = bptr->get_blob().map(
9726 0, bptr->get_blob().get_ondisk_length(),
9727 [&](uint64_t offset, uint64_t length) {
9728 int r = bdev->aio_read(offset, length, &bl, ioc);
9729 if (r < 0)
9730 return r;
9731 return 0;
9732 });
9733 if (r < 0) {
9734 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
9735 if (r == -EIO) {
9736 // propagate EIO to caller
9737 return r;
9738 }
9739 ceph_assert(r == 0);
9740 }
9741 } else {
9742 // read the pieces
9743 for (auto& req : r2r) {
9744 dout(20) << __func__ << " region 0x" << std::hex
9745 << req.regs.front().logical_offset
9746 << ": 0x" << req.regs.front().blob_xoffset
9747 << " reading 0x" << req.r_off
9748 << "~" << req.r_len << std::dec
9749 << dendl;
9750
9751 // read it
9752 auto r = bptr->get_blob().map(
9753 req.r_off, req.r_len,
9754 [&](uint64_t offset, uint64_t length) {
9755 int r = bdev->aio_read(offset, length, &req.bl, ioc);
9756 if (r < 0)
9757 return r;
9758 return 0;
9759 });
9760 if (r < 0) {
9761 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
9762 << dendl;
9763 if (r == -EIO) {
9764 // propagate EIO to caller
9765 return r;
9766 }
9767 ceph_assert(r == 0);
9768 }
9769 ceph_assert(req.bl.length() == req.r_len);
9770 }
9771 }
9772 }
9773 return 0;
9774 }
9775
9776 int BlueStore::_generate_read_result_bl(
9777 OnodeRef o,
9778 uint64_t offset,
9779 size_t length,
9780 ready_regions_t& ready_regions,
9781 vector<bufferlist>& compressed_blob_bls,
9782 blobs2read_t& blobs2read,
9783 bool buffered,
9784 bool* csum_error,
9785 bufferlist& bl)
9786 {
9787 // enumerate and decompress desired blobs
9788 auto p = compressed_blob_bls.begin();
9789 blobs2read_t::iterator b2r_it = blobs2read.begin();
9790 while (b2r_it != blobs2read.end()) {
9791 const BlobRef& bptr = b2r_it->first;
9792 regions2read_t& r2r = b2r_it->second;
9793 dout(20) << __func__ << " blob " << *bptr << std::hex
9794 << " need 0x" << r2r << std::dec << dendl;
9795 if (bptr->get_blob().is_compressed()) {
9796 ceph_assert(p != compressed_blob_bls.end());
9797 bufferlist& compressed_bl = *p++;
9798 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
9799 r2r.front().regs.front().logical_offset) < 0) {
9800 *csum_error = true;
9801 return -EIO;
9802 }
9803 bufferlist raw_bl;
9804 auto r = _decompress(compressed_bl, &raw_bl);
9805 if (r < 0)
9806 return r;
9807 if (buffered) {
9808 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
9809 raw_bl);
9810 }
9811 for (auto& req : r2r) {
9812 for (auto& r : req.regs) {
9813 ready_regions[r.logical_offset].substr_of(
9814 raw_bl, r.blob_xoffset, r.length);
9815 }
9816 }
9817 } else {
9818 for (auto& req : r2r) {
9819 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
9820 req.regs.front().logical_offset) < 0) {
9821 *csum_error = true;
9822 return -EIO;
9823 }
9824 if (buffered) {
9825 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
9826 req.r_off, req.bl);
9827 }
9828
9829 // prune and keep result
9830 for (const auto& r : req.regs) {
9831 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
9832 }
9833 }
9834 }
9835 ++b2r_it;
9836 }
9837
9838 // generate a resulting buffer
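// walk ready_regions in offset order: claim the data for covered ranges
// and append zeros for any gap, so the resulting bufferlist is exactly
// `length` bytes long (see the asserts below)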
9839 auto pr = ready_regions.begin();
9840 auto pr_end = ready_regions.end();
9841 uint64_t pos = 0;
9842 while (pos < length) {
9843 if (pr != pr_end && pr->first == pos + offset) {
9844 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9845 << ": data from 0x" << pr->first << "~" << pr->second.length()
9846 << std::dec << dendl;
9847 pos += pr->second.length();
9848 bl.claim_append(pr->second);
9849 ++pr;
9850 } else {
9851 uint64_t l = length - pos;
9852 if (pr != pr_end) {
9853 ceph_assert(pr->first > pos + offset);
9854 l = pr->first - (pos + offset);
9855 }
9856 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9857 << ": zeros for 0x" << (pos + offset) << "~" << l
9858 << std::dec << dendl;
9859 bl.append_zero(l);
9860 pos += l;
9861 }
9862 }
9863 ceph_assert(bl.length() == length);
9864 ceph_assert(pos == length);
9865 ceph_assert(pr == pr_end);
9866 return 0;
9867 }
9868
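// Read path overview (a summary of the helpers above):
//  1. _read_cache() walks the logical extent map, satisfies whatever it can
//     from the blob buffer cache and collects the missing ranges per blob
//     into blobs2read.
//  2. _prepare_read_ioc() turns those ranges into aio requests: compressed
//     blobs are read in full, uncompressed blobs are read piecewise with
//     chunk-aligned offsets/lengths.
//  3. After the aio completes, _generate_read_result_bl() verifies checksums,
//     decompresses where needed and assembles the final bufferlist,
//     zero-filling any holes in the requested range.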
9869 int BlueStore::_do_read(
9870 Collection *c,
9871 OnodeRef o,
9872 uint64_t offset,
9873 size_t length,
9874 bufferlist& bl,
9875 uint32_t op_flags,
9876 uint64_t retry_count)
9877 {
9878 FUNCTRACE(cct);
9879 int r = 0;
9880 int read_cache_policy = 0; // do not bypass clean or dirty cache
9881
9882 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9883 << " size 0x" << o->onode.size << " (" << std::dec
9884 << o->onode.size << ")" << dendl;
9885 bl.clear();
9886
9887 if (offset >= o->onode.size) {
9888 return r;
9889 }
9890
9891 // generally, don't buffer anything, unless the client explicitly requests
9892 // it.
9893 bool buffered = false;
9894 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
9895 dout(20) << __func__ << " will do buffered read" << dendl;
9896 buffered = true;
9897 } else if (cct->_conf->bluestore_default_buffered_read &&
9898 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
9899 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
9900 dout(20) << __func__ << " defaulting to buffered read" << dendl;
9901 buffered = true;
9902 }
9903
9904 if (offset + length > o->onode.size) {
9905 length = o->onode.size - offset;
9906 }
9907
9908 auto start = mono_clock::now();
9909 o->extent_map.fault_range(db, offset, length);
9910 log_latency(__func__,
9911 l_bluestore_read_onode_meta_lat,
9912 mono_clock::now() - start,
9913 cct->_conf->bluestore_log_op_age);
9914 _dump_onode<30>(cct, *o);
9915
9916 // for deep-scrub, we only read dirty cache and bypass clean cache in
9917 // order to read underlying block device in case there are silent disk errors.
9918 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
9919 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
9920 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
9921 }
9922
9923 // build blob-wise list of stuff to read (that isn't cached)
9924 ready_regions_t ready_regions;
9925 blobs2read_t blobs2read;
9926 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
9927
9928
9929 // read raw blob data.
9930 start = mono_clock::now(); // for the sake of simplicity we measure
9931 // the whole block below; the resulting
9932 // error is negligible.
9933 vector<bufferlist> compressed_blob_bls;
9934 IOContext ioc(cct, NULL, true); // allow EIO
9935 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
9936 // we always issue aio for reading, so errors other than EIO are not allowed
9937 if (r < 0)
9938 return r;
9939
9940 int64_t num_ios = blobs2read.size();
9941 if (ioc.has_pending_aios()) {
9942 num_ios = ioc.get_num_ios();
9943 bdev->aio_submit(&ioc);
9944 dout(20) << __func__ << " waiting for aio" << dendl;
9945 ioc.aio_wait();
9946 r = ioc.get_return_value();
9947 if (r < 0) {
9948 ceph_assert(r == -EIO); // no other errors allowed
9949 return -EIO;
9950 }
9951 }
9952 log_latency_fn(__func__,
9953 l_bluestore_read_wait_aio_lat,
9954 mono_clock::now() - start,
9955 cct->_conf->bluestore_log_op_age,
9956 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
9957 );
9958
9959 bool csum_error = false;
9960 r = _generate_read_result_bl(o, offset, length, ready_regions,
9961 compressed_blob_bls, blobs2read,
9962 buffered, &csum_error, bl);
9963 if (csum_error) {
9964 // Handles spurious read errors caused by a kernel bug.
9965 // We sometimes get all-zero pages as a result of the read under
9966 // high memory pressure. Retrying the failing read succeeds in most
9967 // cases.
9968 // See also: http://tracker.ceph.com/issues/22464
9969 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
9970 return -EIO;
9971 }
9972 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
9973 }
9974 r = bl.length();
9975 if (retry_count) {
9976 logger->inc(l_bluestore_reads_with_retries);
9977 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
9978 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
9979 stringstream s;
9980 s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
9981 _set_spurious_read_errors_alert(s.str());
9982 }
9983 return r;
9984 }
9985
9986 int BlueStore::_verify_csum(OnodeRef& o,
9987 const bluestore_blob_t* blob, uint64_t blob_xoffset,
9988 const bufferlist& bl,
9989 uint64_t logical_offset) const
9990 {
9991 int bad;
9992 uint64_t bad_csum;
9993 auto start = mono_clock::now();
9994 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
9995 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
9996 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
9997 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
9998 bad = blob_xoffset;
9999 r = -1;
10000 bad_csum = 0xDEADBEEF;
10001 }
10002 if (r < 0) {
10003 if (r == -1) {
10004 PExtentVector pex;
10005 blob->map(
10006 bad,
10007 blob->get_csum_chunk_size(),
10008 [&](uint64_t offset, uint64_t length) {
10009 pex.emplace_back(bluestore_pextent_t(offset, length));
10010 return 0;
10011 });
10012 derr << __func__ << " bad "
10013 << Checksummer::get_csum_type_string(blob->csum_type)
10014 << "/0x" << std::hex << blob->get_csum_chunk_size()
10015 << " checksum at blob offset 0x" << bad
10016 << ", got 0x" << bad_csum << ", expected 0x"
10017 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
10018 << ", device location " << pex
10019 << ", logical extent 0x" << std::hex
10020 << (logical_offset + bad - blob_xoffset) << "~"
10021 << blob->get_csum_chunk_size() << std::dec
10022 << ", object " << o->oid
10023 << dendl;
10024 } else {
10025 derr << __func__ << " failed with error: " << cpp_strerror(r) << dendl;
10026 }
10027 }
10028 log_latency(__func__,
10029 l_bluestore_csum_lat,
10030 mono_clock::now() - start,
10031 cct->_conf->bluestore_log_op_age);
10032 if (cct->_conf->bluestore_ignore_data_csum) {
10033 return 0;
10034 }
10035 return r;
10036 }
10037
10038 int BlueStore::_decompress(bufferlist& source, bufferlist* result)
10039 {
10040 int r = 0;
10041 auto start = mono_clock::now();
10042 auto i = source.cbegin();
10043 bluestore_compression_header_t chdr;
10044 decode(chdr, i);
10045 int alg = int(chdr.type);
10046 CompressorRef cp = compressor;
10047 if (!cp || (int)cp->get_type() != alg) {
10048 cp = Compressor::create(cct, alg);
10049 }
10050
10051 if (!cp.get()) {
10052 // if the decompressor isn't available we have to return an error,
10053 // since we cannot hand back the decompressed data
10054
10055 const char* alg_name = Compressor::get_comp_alg_name(alg);
10056 derr << __func__ << " can't load decompressor " << alg_name << dendl;
10057 _set_compression_alert(false, alg_name);
10058 r = -EIO;
10059 } else {
10060 r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
10061 if (r < 0) {
10062 derr << __func__ << " decompression failed with error code " << r << dendl;
10063 r = -EIO;
10064 }
10065 }
10066 log_latency(__func__,
10067 l_bluestore_decompress_lat,
10068 mono_clock::now() - start,
10069 cct->_conf->bluestore_log_op_age);
10070 return r;
10071 }
10072
10073 // this variant stores the fiemap result into an interval_set; the other
10074 // variants use it internally
10075 int BlueStore::_fiemap(
10076 CollectionHandle &c_,
10077 const ghobject_t& oid,
10078 uint64_t offset,
10079 size_t length,
10080 interval_set<uint64_t>& destset)
10081 {
10082 Collection *c = static_cast<Collection *>(c_.get());
10083 if (!c->exists)
10084 return -ENOENT;
10085 {
10086 std::shared_lock l(c->lock);
10087
10088 OnodeRef o = c->get_onode(oid, false);
10089 if (!o || !o->exists) {
10090 return -ENOENT;
10091 }
10092 _dump_onode<30>(cct, *o);
10093
10094 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10095 << " size 0x" << o->onode.size << std::dec << dendl;
10096
10097 boost::intrusive::set<Extent>::iterator ep, eend;
10098 if (offset >= o->onode.size)
10099 goto out;
10100
10101 if (offset + length > o->onode.size) {
10102 length = o->onode.size - offset;
10103 }
10104
10105 o->extent_map.fault_range(db, offset, length);
10106 eend = o->extent_map.extent_map.end();
10107 ep = o->extent_map.seek_lextent(offset);
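// walk the logical extents: ranges backed by an lextent are inserted into
// destset, while holes simply advance offset/length without contributing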
10108 while (length > 0) {
10109 dout(20) << __func__ << " offset " << offset << dendl;
10110 if (ep != eend && ep->logical_offset + ep->length <= offset) {
10111 ++ep;
10112 continue;
10113 }
10114
10115 uint64_t x_len = length;
10116 if (ep != eend && ep->logical_offset <= offset) {
10117 uint64_t x_off = offset - ep->logical_offset;
10118 x_len = std::min(x_len, ep->length - x_off);
10119 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
10120 << x_len << std::dec << " blob " << ep->blob << dendl;
10121 destset.insert(offset, x_len);
10122 length -= x_len;
10123 offset += x_len;
10124 if (x_off + x_len == ep->length)
10125 ++ep;
10126 continue;
10127 }
10128 if (ep != eend &&
10129 ep->logical_offset > offset &&
10130 ep->logical_offset - offset < x_len) {
10131 x_len = ep->logical_offset - offset;
10132 }
10133 offset += x_len;
10134 length -= x_len;
10135 }
10136 }
10137
10138 out:
10139 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10140 << " size = 0x(" << destset << ")" << std::dec << dendl;
10141 return 0;
10142 }
10143
10144 int BlueStore::fiemap(
10145 CollectionHandle &c_,
10146 const ghobject_t& oid,
10147 uint64_t offset,
10148 size_t length,
10149 bufferlist& bl)
10150 {
10151 interval_set<uint64_t> m;
10152 int r = _fiemap(c_, oid, offset, length, m);
10153 if (r >= 0) {
10154 encode(m, bl);
10155 }
10156 return r;
10157 }
10158
10159 int BlueStore::fiemap(
10160 CollectionHandle &c_,
10161 const ghobject_t& oid,
10162 uint64_t offset,
10163 size_t length,
10164 map<uint64_t, uint64_t>& destmap)
10165 {
10166 interval_set<uint64_t> m;
10167 int r = _fiemap(c_, oid, offset, length, m);
10168 if (r >= 0) {
10169 destmap = std::move(m).detach();
10170 }
10171 return r;
10172 }
10173
10174 int BlueStore::readv(
10175 CollectionHandle &c_,
10176 const ghobject_t& oid,
10177 interval_set<uint64_t>& m,
10178 bufferlist& bl,
10179 uint32_t op_flags)
10180 {
10181 auto start = mono_clock::now();
10182 Collection *c = static_cast<Collection *>(c_.get());
10183 const coll_t &cid = c->get_cid();
10184 dout(15) << __func__ << " " << cid << " " << oid
10185 << " fiemap " << m
10186 << dendl;
10187 if (!c->exists)
10188 return -ENOENT;
10189
10190 bl.clear();
10191 int r;
10192 {
10193 std::shared_lock l(c->lock);
10194 auto start1 = mono_clock::now();
10195 OnodeRef o = c->get_onode(oid, false);
10196 log_latency("get_onode@read",
10197 l_bluestore_read_onode_meta_lat,
10198 mono_clock::now() - start1,
10199 cct->_conf->bluestore_log_op_age);
10200 if (!o || !o->exists) {
10201 r = -ENOENT;
10202 goto out;
10203 }
10204
10205 if (m.empty()) {
10206 r = 0;
10207 goto out;
10208 }
10209
10210 r = _do_readv(c, o, m, bl, op_flags);
10211 if (r == -EIO) {
10212 logger->inc(l_bluestore_read_eio);
10213 }
10214 }
10215
10216 out:
10217 if (r >= 0 && _debug_data_eio(oid)) {
10218 r = -EIO;
10219 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10220 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10221 cct->_conf->bluestore_debug_random_read_err &&
10222 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10223 100.0)) == 0) {
10224 dout(0) << __func__ << ": inject random EIO" << dendl;
10225 r = -EIO;
10226 }
10227 dout(10) << __func__ << " " << cid << " " << oid
10228 << " fiemap " << m << std::dec
10229 << " = " << r << dendl;
10230 log_latency(__func__,
10231 l_bluestore_read_lat,
10232 mono_clock::now() - start,
10233 cct->_conf->bluestore_log_op_age);
10234 return r;
10235 }
10236
10237 int BlueStore::_do_readv(
10238 Collection *c,
10239 OnodeRef o,
10240 const interval_set<uint64_t>& m,
10241 bufferlist& bl,
10242 uint32_t op_flags,
10243 uint64_t retry_count)
10244 {
10245 FUNCTRACE(cct);
10246 int r = 0;
10247 int read_cache_policy = 0; // do not bypass clean or dirty cache
10248
10249 dout(20) << __func__ << " fiemap " << m << std::hex
10250 << " size 0x" << o->onode.size << " (" << std::dec
10251 << o->onode.size << ")" << dendl;
10252
10253 // generally, don't buffer anything, unless the client explicitly requests
10254 // it.
10255 bool buffered = false;
10256 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10257 dout(20) << __func__ << " will do buffered read" << dendl;
10258 buffered = true;
10259 } else if (cct->_conf->bluestore_default_buffered_read &&
10260 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10261 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10262 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10263 buffered = true;
10264 }
10265 // this method must be idempotent since we may call it several times
10266 // before we finally read the expected result.
10267 bl.clear();
10268
10269 // the interval set m is expected to come from fiemap, so it must lie
10270 // entirely within the object
10270 ceph_assert(m.range_start() <= o->onode.size);
10271 ceph_assert(m.range_end() <= o->onode.size);
10272 auto start = mono_clock::now();
10273 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
10274 log_latency(__func__,
10275 l_bluestore_read_onode_meta_lat,
10276 mono_clock::now() - start,
10277 cct->_conf->bluestore_log_op_age);
10278 _dump_onode<30>(cct, *o);
10279
10280 IOContext ioc(cct, NULL, true); // allow EIO
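// one (ready_regions, compressed blob buffers, blobs2read) tuple per
// requested interval, so all the aio can be submitted in a single batch
// below and the per-interval results assembled afterwards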
10281 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
10282 raw_results.reserve(m.num_intervals());
10283 int i = 0;
10284 for (auto p = m.begin(); p != m.end(); p++, i++) {
10285 raw_results.push_back({});
10286 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
10287 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
10288 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
10289 // we always issue aio for reading, so errors other than EIO are not allowed
10290 if (r < 0)
10291 return r;
10292 }
10293
10294 auto num_ios = m.size();
10295 if (ioc.has_pending_aios()) {
10296 num_ios = ioc.get_num_ios();
10297 bdev->aio_submit(&ioc);
10298 dout(20) << __func__ << " waiting for aio" << dendl;
10299 ioc.aio_wait();
10300 r = ioc.get_return_value();
10301 if (r < 0) {
10302 ceph_assert(r == -EIO); // no other errors allowed
10303 return -EIO;
10304 }
10305 }
10306 log_latency_fn(__func__,
10307 l_bluestore_read_wait_aio_lat,
10308 mono_clock::now() - start,
10309 cct->_conf->bluestore_log_op_age,
10310 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10311 );
10312
10313 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
10314 i = 0;
10315 for (auto p = m.begin(); p != m.end(); p++, i++) {
10316 bool csum_error = false;
10317 bufferlist t;
10318 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
10319 std::get<0>(raw_results[i]),
10320 std::get<1>(raw_results[i]),
10321 std::get<2>(raw_results[i]),
10322 buffered, &csum_error, t);
10323 if (csum_error) {
10324 // Handles spurious read errors caused by a kernel bug.
10325 // We sometimes get all-zero pages as a result of the read under
10326 // high memory pressure. Retrying the failing read succeeds in most
10327 // cases.
10328 // See also: http://tracker.ceph.com/issues/22464
10329 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10330 return -EIO;
10331 }
10332 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
10333 }
10334 bl.claim_append(t);
10335 }
10336 if (retry_count) {
10337 logger->inc(l_bluestore_reads_with_retries);
10338 dout(5) << __func__ << " read fiemap " << m
10339 << " failed " << retry_count << " times before succeeding"
10340 << dendl;
10341 }
10342 return bl.length();
10343 }
10344
10345 int BlueStore::dump_onode(CollectionHandle &c_,
10346 const ghobject_t& oid,
10347 const string& section_name,
10348 Formatter *f)
10349 {
10350 Collection *c = static_cast<Collection *>(c_.get());
10351 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10352 if (!c->exists)
10353 return -ENOENT;
10354
10355 int r;
10356 {
10357 std::shared_lock l(c->lock);
10358
10359 OnodeRef o = c->get_onode(oid, false);
10360 if (!o || !o->exists) {
10361 r = -ENOENT;
10362 goto out;
10363 }
10364 // FIXME minor: the next line isn't actually enough to
10365 // load shared blobs. Leaving as is for now.
10366 //
10367 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10368
10369 _dump_onode<0>(cct, *o);
10370 f->open_object_section(section_name.c_str());
10371 o->dump(f);
10372 f->close_section();
10373 r = 0;
10374 }
10375 out:
10376 dout(10) << __func__ << " " << c->cid << " " << oid
10377 << " = " << r << dendl;
10378 return r;
10379 }
10380
10381 int BlueStore::getattr(
10382 CollectionHandle &c_,
10383 const ghobject_t& oid,
10384 const char *name,
10385 bufferptr& value)
10386 {
10387 Collection *c = static_cast<Collection *>(c_.get());
10388 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
10389 if (!c->exists)
10390 return -ENOENT;
10391
10392 int r;
10393 {
10394 std::shared_lock l(c->lock);
10395 mempool::bluestore_cache_meta::string k(name);
10396
10397 OnodeRef o = c->get_onode(oid, false);
10398 if (!o || !o->exists) {
10399 r = -ENOENT;
10400 goto out;
10401 }
10402
10403 if (!o->onode.attrs.count(k)) {
10404 r = -ENODATA;
10405 goto out;
10406 }
10407 value = o->onode.attrs[k];
10408 r = 0;
10409 }
10410 out:
10411 if (r == 0 && _debug_mdata_eio(oid)) {
10412 r = -EIO;
10413 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10414 }
10415 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
10416 << " = " << r << dendl;
10417 return r;
10418 }
10419
10420 int BlueStore::getattrs(
10421 CollectionHandle &c_,
10422 const ghobject_t& oid,
10423 map<string,bufferptr>& aset)
10424 {
10425 Collection *c = static_cast<Collection *>(c_.get());
10426 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10427 if (!c->exists)
10428 return -ENOENT;
10429
10430 int r;
10431 {
10432 std::shared_lock l(c->lock);
10433
10434 OnodeRef o = c->get_onode(oid, false);
10435 if (!o || !o->exists) {
10436 r = -ENOENT;
10437 goto out;
10438 }
10439 for (auto& i : o->onode.attrs) {
10440 aset.emplace(i.first.c_str(), i.second);
10441 }
10442 r = 0;
10443 }
10444
10445 out:
10446 if (r == 0 && _debug_mdata_eio(oid)) {
10447 r = -EIO;
10448 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10449 }
10450 dout(10) << __func__ << " " << c->cid << " " << oid
10451 << " = " << r << dendl;
10452 return r;
10453 }
10454
10455 int BlueStore::list_collections(vector<coll_t>& ls)
10456 {
10457 std::shared_lock l(coll_lock);
10458 ls.reserve(coll_map.size());
10459 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
10460 p != coll_map.end();
10461 ++p)
10462 ls.push_back(p->first);
10463 return 0;
10464 }
10465
10466 bool BlueStore::collection_exists(const coll_t& c)
10467 {
10468 std::shared_lock l(coll_lock);
10469 return coll_map.count(c);
10470 }
10471
10472 int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
10473 {
10474 dout(15) << __func__ << " " << ch->cid << dendl;
10475 vector<ghobject_t> ls;
10476 ghobject_t next;
10477 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
10478 &ls, &next);
10479 if (r < 0) {
10480 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
10481 << dendl;
10482 return r;
10483 }
10484 *empty = ls.empty();
10485 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
10486 return 0;
10487 }
10488
10489 int BlueStore::collection_bits(CollectionHandle& ch)
10490 {
10491 dout(15) << __func__ << " " << ch->cid << dendl;
10492 Collection *c = static_cast<Collection*>(ch.get());
10493 std::shared_lock l(c->lock);
10494 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
10495 return c->cnode.bits;
10496 }
10497
10498 int BlueStore::collection_list(
10499 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10500 vector<ghobject_t> *ls, ghobject_t *pnext)
10501 {
10502 Collection *c = static_cast<Collection *>(c_.get());
10503 c->flush();
10504 dout(15) << __func__ << " " << c->cid
10505 << " start " << start << " end " << end << " max " << max << dendl;
10506 int r;
10507 {
10508 std::shared_lock l(c->lock);
10509 r = _collection_list(c, start, end, max, false, ls, pnext);
10510 }
10511
10512 dout(10) << __func__ << " " << c->cid
10513 << " start " << start << " end " << end << " max " << max
10514 << " = " << r << ", ls.size() = " << ls->size()
10515 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10516 return r;
10517 }
10518
10519 int BlueStore::collection_list_legacy(
10520 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10521 vector<ghobject_t> *ls, ghobject_t *pnext)
10522 {
10523 Collection *c = static_cast<Collection *>(c_.get());
10524 c->flush();
10525 dout(15) << __func__ << " " << c->cid
10526 << " start " << start << " end " << end << " max " << max << dendl;
10527 int r;
10528 {
10529 std::shared_lock l(c->lock);
10530 r = _collection_list(c, start, end, max, true, ls, pnext);
10531 }
10532
10533 dout(10) << __func__ << " " << c->cid
10534 << " start " << start << " end " << end << " max " << max
10535 << " = " << r << ", ls.size() = " << ls->size()
10536 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10537 return r;
10538 }
10539
10540 int BlueStore::_collection_list(
10541 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
10542 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
10543 {
10544
10545 if (!c->exists)
10546 return -ENOENT;
10547
10548 auto start_time = mono_clock::now();
10549 int r = 0;
10550 ghobject_t static_next;
10551 std::unique_ptr<CollectionListIterator> it;
10552 ghobject_t coll_range_temp_start, coll_range_temp_end;
10553 ghobject_t coll_range_start, coll_range_end;
10554 bool set_next = false;
10555 ghobject_t pend;
10556 bool temp;
10557
10558 if (!pnext)
10559 pnext = &static_next;
10560
10561 if (start.is_max() || start.hobj.is_max()) {
10562 goto out;
10563 }
10564 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
10565 &coll_range_temp_end, &coll_range_start, &coll_range_end);
10566 dout(20) << __func__
10567 << " range " << coll_range_temp_start
10568 << " to " << coll_range_temp_end
10569 << " and " << coll_range_start
10570 << " to " << coll_range_end
10571 << " start " << start << dendl;
10572 if (legacy) {
10573 it = std::make_unique<SimpleCollectionListIterator>(
10574 cct, db->get_iterator(PREFIX_OBJ));
10575 } else {
10576 it = std::make_unique<SortedCollectionListIterator>(
10577 db->get_iterator(PREFIX_OBJ));
10578 }
10579 if (start == ghobject_t() ||
10580 start.hobj == hobject_t() ||
10581 start == c->cid.get_min_hobj()) {
10582 it->upper_bound(coll_range_temp_start);
10583 temp = true;
10584 } else {
10585 if (start.hobj.is_temp()) {
10586 temp = true;
10587 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
10588 } else {
10589 temp = false;
10590 ceph_assert(start >= coll_range_start && start < coll_range_end);
10591 }
10592 dout(20) << __func__ << " temp=" << (int)temp << dendl;
10593 it->lower_bound(start);
10594 }
10595 if (end.hobj.is_max()) {
10596 pend = temp ? coll_range_temp_end : coll_range_end;
10597 } else {
10598 if (end.hobj.is_temp()) {
10599 if (temp)
10600 pend = end;
10601 else
10602 goto out;
10603 } else {
10604 pend = temp ? coll_range_temp_end : end;
10605 }
10606 }
10607 dout(20) << __func__ << " pend " << pend << dendl;
10608 while (true) {
10609 if (!it->valid() || it->is_ge(pend)) {
10610 if (!it->valid())
10611 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
10612 else
10613 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
10614 if (temp) {
10615 if (end.hobj.is_temp()) {
10616 if (it->valid() && it->is_lt(coll_range_temp_end)) {
10617 *pnext = it->oid();
10618 set_next = true;
10619 }
10620 break;
10621 }
10622 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
10623 temp = false;
10624 it->upper_bound(coll_range_start);
10625 if (end.hobj.is_max())
10626 pend = coll_range_end;
10627 else
10628 pend = end;
10629 dout(30) << __func__ << " pend " << pend << dendl;
10630 continue;
10631 }
10632 if (it->valid() && it->is_lt(coll_range_end)) {
10633 *pnext = it->oid();
10634 set_next = true;
10635 }
10636 break;
10637 }
10638 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
10639 if (ls->size() >= (unsigned)max) {
10640 dout(20) << __func__ << " reached max " << max << dendl;
10641 *pnext = it->oid();
10642 set_next = true;
10643 break;
10644 }
10645 ls->push_back(it->oid());
10646 it->next();
10647 }
10648 out:
10649 if (!set_next) {
10650 *pnext = ghobject_t::get_max();
10651 }
10652 log_latency_fn(
10653 __func__,
10654 l_bluestore_clist_lat,
10655 mono_clock::now() - start_time,
10656 cct->_conf->bluestore_log_collection_list_age,
10657 [&] (const ceph::timespan& lat) {
10658 ostringstream ostr;
10659 ostr << ", lat = " << timespan_str(lat)
10660 << " cid =" << c->cid
10661 << " start " << start << " end " << end
10662 << " max " << max;
10663 return ostr.str();
10664 }
10665 );
10666 return r;
10667 }
10668
10669 int BlueStore::omap_get(
10670 CollectionHandle &c_, ///< [in] Collection containing oid
10671 const ghobject_t &oid, ///< [in] Object containing omap
10672 bufferlist *header, ///< [out] omap header
10673 map<string, bufferlist> *out ///< [out] Key to value map
10674 )
10675 {
10676 Collection *c = static_cast<Collection *>(c_.get());
10677 return _omap_get(c, oid, header, out);
10678 }
10679
10680 int BlueStore::_omap_get(
10681 Collection *c, ///< [in] Collection containing oid
10682 const ghobject_t &oid, ///< [in] Object containing omap
10683 bufferlist *header, ///< [out] omap header
10684 map<string, bufferlist> *out ///< [out] Key to value map
10685 )
10686 {
10687 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10688 if (!c->exists)
10689 return -ENOENT;
10690 std::shared_lock l(c->lock);
10691 int r = 0;
10692 OnodeRef o = c->get_onode(oid, false);
10693 if (!o || !o->exists) {
10694 r = -ENOENT;
10695 goto out;
10696 }
10697 r = _onode_omap_get(o, header, out);
10698 out:
10699 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10700 << dendl;
10701 return r;
10702 }
10703
10704 int BlueStore::_onode_omap_get(
10705 const OnodeRef &o, ///< [in] Object containing omap
10706 bufferlist *header, ///< [out] omap header
10707 map<string, bufferlist> *out ///< [out] Key to value map
10708 )
10709 {
10710 int r = 0;
10711 if (!o || !o->exists) {
10712 r = -ENOENT;
10713 goto out;
10714 }
10715 if (!o->onode.has_omap())
10716 goto out;
10717 o->flush();
10718 {
10719 const string& prefix = o->get_omap_prefix();
10720 KeyValueDB::Iterator it = db->get_iterator(prefix);
10721 string head, tail;
10722 o->get_omap_header(&head);
10723 o->get_omap_tail(&tail);
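// omap keys for this onode are laid out as: header key < user keys < tail
// sentinel, all under one prefix, so a single scan from the header up to
// the tail returns the header plus every user key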
10724 it->lower_bound(head);
10725 while (it->valid()) {
10726 if (it->key() == head) {
10727 dout(30) << __func__ << " got header" << dendl;
10728 *header = it->value();
10729 } else if (it->key() >= tail) {
10730 dout(30) << __func__ << " reached tail" << dendl;
10731 break;
10732 } else {
10733 string user_key;
10734 o->decode_omap_key(it->key(), &user_key);
10735 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
10736 << " -> " << user_key << dendl;
10737 (*out)[user_key] = it->value();
10738 }
10739 it->next();
10740 }
10741 }
10742 out:
10743 return r;
10744 }
10745
10746 int BlueStore::omap_get_header(
10747 CollectionHandle &c_, ///< [in] Collection containing oid
10748 const ghobject_t &oid, ///< [in] Object containing omap
10749 bufferlist *header, ///< [out] omap header
10750 bool allow_eio ///< [in] don't assert on eio
10751 )
10752 {
10753 Collection *c = static_cast<Collection *>(c_.get());
10754 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10755 if (!c->exists)
10756 return -ENOENT;
10757 std::shared_lock l(c->lock);
10758 int r = 0;
10759 OnodeRef o = c->get_onode(oid, false);
10760 if (!o || !o->exists) {
10761 r = -ENOENT;
10762 goto out;
10763 }
10764 if (!o->onode.has_omap())
10765 goto out;
10766 o->flush();
10767 {
10768 string head;
10769 o->get_omap_header(&head);
10770 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
10771 dout(30) << __func__ << " got header" << dendl;
10772 } else {
10773 dout(30) << __func__ << " no header" << dendl;
10774 }
10775 }
10776 out:
10777 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10778 << dendl;
10779 return r;
10780 }
10781
10782 int BlueStore::omap_get_keys(
10783 CollectionHandle &c_, ///< [in] Collection containing oid
10784 const ghobject_t &oid, ///< [in] Object containing omap
10785 set<string> *keys ///< [out] Keys defined on oid
10786 )
10787 {
10788 Collection *c = static_cast<Collection *>(c_.get());
10789 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10790 if (!c->exists)
10791 return -ENOENT;
10792 auto start1 = mono_clock::now();
10793 std::shared_lock l(c->lock);
10794 int r = 0;
10795 OnodeRef o = c->get_onode(oid, false);
10796 if (!o || !o->exists) {
10797 r = -ENOENT;
10798 goto out;
10799 }
10800 if (!o->onode.has_omap())
10801 goto out;
10802 o->flush();
10803 {
10804 const string& prefix = o->get_omap_prefix();
10805 KeyValueDB::Iterator it = db->get_iterator(prefix);
10806 string head, tail;
10807 o->get_omap_key(string(), &head);
10808 o->get_omap_tail(&tail);
10809 it->lower_bound(head);
10810 while (it->valid()) {
10811 if (it->key() >= tail) {
10812 dout(30) << __func__ << " reached tail" << dendl;
10813 break;
10814 }
10815 string user_key;
10816 o->decode_omap_key(it->key(), &user_key);
10817 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
10818 << " -> " << user_key << dendl;
10819 keys->insert(user_key);
10820 it->next();
10821 }
10822 }
10823 out:
10824 c->store->log_latency(
10825 __func__,
10826 l_bluestore_omap_get_keys_lat,
10827 mono_clock::now() - start1,
10828 c->store->cct->_conf->bluestore_log_omap_iterator_age);
10829
10830 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10831 << dendl;
10832 return r;
10833 }
10834
10835 int BlueStore::omap_get_values(
10836 CollectionHandle &c_, ///< [in] Collection containing oid
10837 const ghobject_t &oid, ///< [in] Object containing omap
10838 const set<string> &keys, ///< [in] Keys to get
10839 map<string, bufferlist> *out ///< [out] Returned keys and values
10840 )
10841 {
10842 Collection *c = static_cast<Collection *>(c_.get());
10843 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10844 if (!c->exists)
10845 return -ENOENT;
10846 std::shared_lock l(c->lock);
10847 auto start1 = mono_clock::now();
10848 int r = 0;
10849 string final_key;
10850 OnodeRef o = c->get_onode(oid, false);
10851 if (!o || !o->exists) {
10852 r = -ENOENT;
10853 goto out;
10854 }
10855 if (!o->onode.has_omap()) {
10856 goto out;
10857 }
10858 o->flush();
10859 {
10860 const string& prefix = o->get_omap_prefix();
10861 o->get_omap_key(string(), &final_key);
10862 size_t base_key_len = final_key.size();
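// final_key currently holds just the per-object omap key prefix; each
// iteration below truncates back to that prefix and appends the user key,
// avoiding a fresh key allocation per lookup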
10863 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
10864 final_key.resize(base_key_len); // keep prefix
10865 final_key += *p;
10866 bufferlist val;
10867 if (db->get(prefix, final_key, &val) >= 0) {
10868 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
10869 << " -> " << *p << dendl;
10870 out->insert(make_pair(*p, val));
10871 }
10872 }
10873 }
10874 out:
10875 c->store->log_latency(
10876 __func__,
10877 l_bluestore_omap_get_values_lat,
10878 mono_clock::now() - start1,
10879 c->store->cct->_conf->bluestore_log_omap_iterator_age);
10880
10881 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10882 << dendl;
10883 return r;
10884 }
10885
10886 #ifdef WITH_SEASTAR
10887 int BlueStore::omap_get_values(
10888 CollectionHandle &c_, ///< [in] Collection containing oid
10889 const ghobject_t &oid, ///< [in] Object containing omap
10890 const std::optional<string> &start_after, ///< [in] Keys to get
10891 map<string, bufferlist> *output ///< [out] Returned keys and values
10892 )
10893 {
10894 Collection *c = static_cast<Collection *>(c_.get());
10895 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10896 if (!c->exists)
10897 return -ENOENT;
10898 std::shared_lock l(c->lock);
10899 int r = 0;
10900 OnodeRef o = c->get_onode(oid, false);
10901 if (!o || !o->exists) {
10902 r = -ENOENT;
10903 goto out;
10904 }
10905 if (!o->onode.has_omap()) {
10906 goto out;
10907 }
10908 o->flush();
10909 {
10910 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
10911 if (!iter) {
10912 r = -ENOENT;
10913 goto out;
10914 }
10915 iter->upper_bound(*start_after);
10916 for (; iter->valid(); iter->next()) {
10917 output->insert(make_pair(iter->key(), iter->value()));
10918 }
10919 }
10920
10921 out:
10922 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10923 << dendl;
10924 return r;
10925 }
10926 #endif
10927
10928 int BlueStore::omap_check_keys(
10929 CollectionHandle &c_, ///< [in] Collection containing oid
10930 const ghobject_t &oid, ///< [in] Object containing omap
10931 const set<string> &keys, ///< [in] Keys to check
10932 set<string> *out ///< [out] Subset of keys defined on oid
10933 )
10934 {
10935 Collection *c = static_cast<Collection *>(c_.get());
10936 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10937 if (!c->exists)
10938 return -ENOENT;
10939 std::shared_lock l(c->lock);
10940 int r = 0;
10941 string final_key;
10942 OnodeRef o = c->get_onode(oid, false);
10943 if (!o || !o->exists) {
10944 r = -ENOENT;
10945 goto out;
10946 }
10947 if (!o->onode.has_omap()) {
10948 goto out;
10949 }
10950 o->flush();
10951 {
10952 const string& prefix = o->get_omap_prefix();
10953 o->get_omap_key(string(), &final_key);
10954 size_t base_key_len = final_key.size();
10955 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
10956 final_key.resize(base_key_len); // keep prefix
10957 final_key += *p;
10958 bufferlist val;
10959 if (db->get(prefix, final_key, &val) >= 0) {
10960 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
10961 << " -> " << *p << dendl;
10962 out->insert(*p);
10963 } else {
10964 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
10965 << " -> " << *p << dendl;
10966 }
10967 }
10968 }
10969 out:
10970 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10971 << dendl;
10972 return r;
10973 }
10974
10975 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
10976 CollectionHandle &c_, ///< [in] collection
10977 const ghobject_t &oid ///< [in] object
10978 )
10979 {
10980 Collection *c = static_cast<Collection *>(c_.get());
10981 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10982 if (!c->exists) {
10983 return ObjectMap::ObjectMapIterator();
10984 }
10985 std::shared_lock l(c->lock);
10986 OnodeRef o = c->get_onode(oid, false);
10987 if (!o || !o->exists) {
10988 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
10989 return ObjectMap::ObjectMapIterator();
10990 }
10991 o->flush();
10992 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
10993 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
10994 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
10995 }
10996
10997 // -----------------
10998 // write helpers
10999
11000 uint64_t BlueStore::_get_ondisk_reserved() const {
11001 ceph_assert(min_alloc_size);
11002 return round_up_to(
11003 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
11004 }
11005
11006 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
11007 {
11008 dout(10) << __func__ << " ondisk_format " << ondisk_format
11009 << " min_compat_ondisk_format " << min_compat_ondisk_format
11010 << dendl;
11011 ceph_assert(ondisk_format == latest_ondisk_format);
11012 {
11013 bufferlist bl;
11014 encode(ondisk_format, bl);
11015 t->set(PREFIX_SUPER, "ondisk_format", bl);
11016 }
11017 {
11018 bufferlist bl;
11019 encode(min_compat_ondisk_format, bl);
11020 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
11021 }
11022 }
11023
11024 int BlueStore::_open_super_meta()
11025 {
11026 // nid
11027 {
11028 nid_max = 0;
11029 bufferlist bl;
11030 db->get(PREFIX_SUPER, "nid_max", &bl);
11031 auto p = bl.cbegin();
11032 try {
11033 uint64_t v;
11034 decode(v, p);
11035 nid_max = v;
11036 } catch (ceph::buffer::error& e) {
11037 derr << __func__ << " unable to read nid_max" << dendl;
11038 return -EIO;
11039 }
11040 dout(1) << __func__ << " old nid_max " << nid_max << dendl;
11041 nid_last = nid_max.load();
11042 }
11043
11044 // blobid
11045 {
11046 blobid_max = 0;
11047 bufferlist bl;
11048 db->get(PREFIX_SUPER, "blobid_max", &bl);
11049 auto p = bl.cbegin();
11050 try {
11051 uint64_t v;
11052 decode(v, p);
11053 blobid_max = v;
11054 } catch (ceph::buffer::error& e) {
11055 derr << __func__ << " unable to read blobid_max" << dendl;
11056 return -EIO;
11057 }
11058 dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
11059 blobid_last = blobid_max.load();
11060 }
11061
11062 // freelist
11063 {
11064 bufferlist bl;
11065 db->get(PREFIX_SUPER, "freelist_type", &bl);
11066 if (bl.length()) {
11067 freelist_type = std::string(bl.c_str(), bl.length());
11068 dout(1) << __func__ << " freelist_type " << freelist_type << dendl;
11069 } else {
11070 ceph_abort_msg("Unsupported extent freelist manager");
11071 }
11072 }
11073
11074 // ondisk format
11075 int32_t compat_ondisk_format = 0;
11076 {
11077 bufferlist bl;
11078 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
11079 if (r < 0) {
11080 // base case: kraken bluestore is v1 and readable by v1
11081 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
11082 << dendl;
11083 ondisk_format = 1;
11084 compat_ondisk_format = 1;
11085 } else {
11086 auto p = bl.cbegin();
11087 try {
11088 decode(ondisk_format, p);
11089 } catch (ceph::buffer::error& e) {
11090 derr << __func__ << " unable to read ondisk_format" << dendl;
11091 return -EIO;
11092 }
11093 bl.clear();
11094 {
11095 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11096 ceph_assert(!r);
11097 auto p = bl.cbegin();
11098 try {
11099 decode(compat_ondisk_format, p);
11100 } catch (ceph::buffer::error& e) {
11101 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
11102 return -EIO;
11103 }
11104 }
11105 }
11106 dout(1) << __func__ << " ondisk_format " << ondisk_format
11107 << " compat_ondisk_format " << compat_ondisk_format
11108 << dendl;
11109 }
11110
11111 if (latest_ondisk_format < compat_ondisk_format) {
11112 derr << __func__ << " compat_ondisk_format is "
11113 << compat_ondisk_format << " but we only understand version "
11114 << latest_ondisk_format << dendl;
11115 return -EPERM;
11116 }
11117
11118 {
11119 bufferlist bl;
11120 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11121 auto p = bl.cbegin();
11122 try {
11123 uint64_t val;
11124 decode(val, p);
11125 min_alloc_size = val;
11126 min_alloc_size_order = ctz(val);
11127 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
11128 } catch (ceph::buffer::error& e) {
11129 derr << __func__ << " unable to read min_alloc_size" << dendl;
11130 return -EIO;
11131 }
11132 dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11133 << std::dec << dendl;
11134 }
11135
11136 _set_per_pool_omap();
11137
11138 _open_statfs();
11139 _set_alloc_sizes();
11140 _set_throttle_params();
11141
11142 _set_csum();
11143 _set_compression();
11144 _set_blob_size();
11145
11146 _validate_bdev();
11147 return 0;
11148 }
11149
11150 int BlueStore::_upgrade_super()
11151 {
11152 dout(1) << __func__ << " from " << ondisk_format << ", latest "
11153 << latest_ondisk_format << dendl;
11154 if (ondisk_format < latest_ondisk_format) {
11155 ceph_assert(ondisk_format > 0);
11156 ceph_assert(ondisk_format < latest_ondisk_format);
11157
11158 KeyValueDB::Transaction t = db->get_transaction();
11159 if (ondisk_format == 1) {
11160 // changes:
11161 // - super: added ondisk_format
11162 // - super: added min_readable_ondisk_format
11163 // - super: added min_compat_ondisk_format
11164 // - super: added min_alloc_size
11165 // - super: removed min_min_alloc_size
11166 {
11167 bufferlist bl;
11168 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
11169 auto p = bl.cbegin();
11170 try {
11171 uint64_t val;
11172 decode(val, p);
11173 min_alloc_size = val;
11174 } catch (ceph::buffer::error& e) {
11175 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
11176 return -EIO;
11177 }
11178 t->set(PREFIX_SUPER, "min_alloc_size", bl);
11179 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
11180 }
11181 ondisk_format = 2;
11182 }
11183 if (ondisk_format == 2) {
11184 // changes:
11185 // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
11186 // onodes are using the per-pool prefix until a repair is run; at that
11187 // point the per_pool_omap=1 key will be set.
11188 // - super: added per_pool_omap key, which indicates that *all* objects
11189 // are using the new prefix and key format
11190 ondisk_format = 3;
11191 }
11192 if (ondisk_format == 3) {
11193 // changes:
11194 // - FreelistManager keeps meta within bdev label
11195 int r = _write_out_fm_meta(0);
11196 ceph_assert(r == 0);
11197 ondisk_format = 4;
11198 }
11199 // This must be the last operation
11200 _prepare_ondisk_format_super(t);
11201 int r = db->submit_transaction_sync(t);
11202 ceph_assert(r == 0);
11203 }
11204 // done
11205 dout(1) << __func__ << " done" << dendl;
11206 return 0;
11207 }
11208
11209 void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
11210 {
11211 if (o->onode.nid) {
11212 ceph_assert(o->exists);
11213 return;
11214 }
11215 uint64_t nid = ++nid_last;
11216 dout(20) << __func__ << " " << nid << dendl;
11217 o->onode.nid = nid;
11218 txc->last_nid = nid;
11219 o->exists = true;
11220 }
11221
11222 uint64_t BlueStore::_assign_blobid(TransContext *txc)
11223 {
11224 uint64_t bid = ++blobid_last;
11225 dout(20) << __func__ << " " << bid << dendl;
11226 txc->last_blobid = bid;
11227 return bid;
11228 }
11229
11230 void BlueStore::get_db_statistics(Formatter *f)
11231 {
11232 db->get_statistics(f);
11233 }
11234
11235 BlueStore::TransContext *BlueStore::_txc_create(
11236 Collection *c, OpSequencer *osr,
11237 list<Context*> *on_commits,
11238 TrackedOpRef osd_op)
11239 {
11240 TransContext *txc = new TransContext(cct, c, osr, on_commits);
11241 txc->t = db->get_transaction();
11242
11243 #ifdef WITH_BLKIN
11244 if (osd_op && osd_op->pg_trace) {
11245 txc->trace.init("TransContext", &trace_endpoint,
11246 &osd_op->pg_trace);
11247 txc->trace.event("txc create");
11248 txc->trace.keyval("txc seq", txc->seq);
11249 }
11250 #endif
11251
11252 osr->queue_new(txc);
11253 dout(20) << __func__ << " osr " << osr << " = " << txc
11254 << " seq " << txc->seq << dendl;
11255 return txc;
11256 }
11257
11258 void BlueStore::_txc_calc_cost(TransContext *txc)
11259 {
11260 // one "io" for the kv commit
11261 auto ios = 1 + txc->ioc.get_num_ios();
11262 auto cost = throttle_cost_per_io.load();
11263 txc->cost = ios * cost + txc->bytes;
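// e.g. a txc with 3 data aios writing 64 KiB costs
// (1 + 3) * throttle_cost_per_io + 65536 throttle units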
11264 txc->ios = ios;
11265 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
11266 << ios << " ios * " << cost << " + " << txc->bytes
11267 << " bytes)" << dendl;
11268 }
11269
11270 void BlueStore::_txc_update_store_statfs(TransContext *txc)
11271 {
11272 if (txc->statfs_delta.is_empty())
11273 return;
11274
11275 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
11276 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
11277 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
11278 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
11279 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
11280
11281 bufferlist bl;
11282 txc->statfs_delta.encode(bl);
11283 if (per_pool_stat_collection) {
11284 string key;
11285 get_pool_stat_key(txc->osd_pool_id, &key);
11286 txc->t->merge(PREFIX_STAT, key, bl);
11287
11288 std::lock_guard l(vstatfs_lock);
11289 auto& stats = osd_pools[txc->osd_pool_id];
11290 stats += txc->statfs_delta;
11291
11292 vstatfs += txc->statfs_delta; //non-persistent in this mode
11293
11294 } else {
11295 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
11296
11297 std::lock_guard l(vstatfs_lock);
11298 vstatfs += txc->statfs_delta;
11299 }
11300 txc->statfs_delta.reset();
11301 }
11302
11303 void BlueStore::_txc_state_proc(TransContext *txc)
11304 {
11305 while (true) {
11306 dout(10) << __func__ << " txc " << txc
11307 << " " << txc->get_state_name() << dendl;
11308 switch (txc->get_state()) {
11309 case TransContext::STATE_PREPARE:
11310 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
11311 if (txc->ioc.has_pending_aios()) {
11312 txc->set_state(TransContext::STATE_AIO_WAIT);
11313 #ifdef WITH_BLKIN
11314 if (txc->trace) {
11315 txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
11316 }
11317 #endif
11318 txc->had_ios = true;
11319 _txc_aio_submit(txc);
11320 return;
11321 }
11322 // ** fall-thru **
11323
11324 case TransContext::STATE_AIO_WAIT:
11325 {
11326 mono_clock::duration lat = throttle.log_state_latency(
11327 *txc, logger, l_bluestore_state_aio_wait_lat);
11328 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11329 dout(0) << __func__ << " slow aio_wait, txc = " << txc
11330 << ", latency = " << lat
11331 << dendl;
11332 }
11333 }
11334
11335 _txc_finish_io(txc); // may trigger blocked txc's too
11336 return;
11337
11338 case TransContext::STATE_IO_DONE:
11339 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
11340 if (txc->had_ios) {
11341 ++txc->osr->txc_with_unstable_io;
11342 }
11343 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
11344 txc->set_state(TransContext::STATE_KV_QUEUED);
11345 if (cct->_conf->bluestore_sync_submit_transaction) {
11346 if (txc->last_nid >= nid_max ||
11347 txc->last_blobid >= blobid_max) {
11348 dout(20) << __func__
11349 << " last_{nid,blobid} exceeds max, submit via kv thread"
11350 << dendl;
11351 } else if (txc->osr->kv_committing_serially) {
11352 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
11353 << dendl;
11354 // note: this is starvation-prone. once we have a txc in a busy
11355 // sequencer that is committing serially it is possible to keep
11356 // submitting new transactions fast enough that we get stuck doing
11357 // so. the alternative is to block here... fixme?
11358 } else if (txc->osr->txc_with_unstable_io) {
11359 dout(20) << __func__ << " prior txc(s) with unstable ios "
11360 << txc->osr->txc_with_unstable_io.load() << dendl;
11361 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
11362 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
11363 == 0) {
11364 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
11365 << dendl;
11366 } else {
11367 _txc_apply_kv(txc, true);
11368 }
11369 }
11370 {
11371 std::lock_guard l(kv_lock);
11372 kv_queue.push_back(txc);
11373 if (!kv_sync_in_progress) {
11374 kv_sync_in_progress = true;
11375 kv_cond.notify_one();
11376 }
11377 if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
11378 kv_queue_unsubmitted.push_back(txc);
11379 ++txc->osr->kv_committing_serially;
11380 }
11381 if (txc->had_ios)
11382 kv_ios++;
11383 kv_throttle_costs += txc->cost;
11384 }
11385 return;
11386 case TransContext::STATE_KV_SUBMITTED:
11387 _txc_committed_kv(txc);
11388 // ** fall-thru **
11389
11390 case TransContext::STATE_KV_DONE:
11391 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
11392 if (txc->deferred_txn) {
11393 txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
11394 _deferred_queue(txc);
11395 return;
11396 }
11397 txc->set_state(TransContext::STATE_FINISHING);
11398 break;
11399
11400 case TransContext::STATE_DEFERRED_CLEANUP:
11401 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
11402 txc->set_state(TransContext::STATE_FINISHING);
11403 // ** fall-thru **
11404
11405 case TransContext::STATE_FINISHING:
11406 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
11407 _txc_finish(txc);
11408 return;
11409
11410 default:
11411 derr << __func__ << " unexpected txc " << txc
11412 << " state " << txc->get_state_name() << dendl;
11413 ceph_abort_msg("unexpected txc state");
11414 return;
11415 }
11416 }
11417 }
11418
11419 void BlueStore::_txc_finish_io(TransContext *txc)
11420 {
11421 dout(20) << __func__ << " " << txc << dendl;
11422
11423 /*
11424 * we need to preserve the order of kv transactions,
11425 * even though aio will complete in any order.
11426 */
11427
11428 OpSequencer *osr = txc->osr.get();
11429 std::lock_guard l(osr->qlock);
11430 txc->set_state(TransContext::STATE_IO_DONE);
11431 txc->ioc.release_running_aios();
11432 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
11433 while (p != osr->q.begin()) {
11434 --p;
11435 if (p->get_state() < TransContext::STATE_IO_DONE) {
11436 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
11437 << p->get_state_name() << dendl;
11438 return;
11439 }
11440 if (p->get_state() > TransContext::STATE_IO_DONE) {
11441 ++p;
11442 break;
11443 }
11444 }
11445 do {
11446 _txc_state_proc(&*p++);
11447 } while (p != osr->q.end() &&
11448 p->get_state() == TransContext::STATE_IO_DONE);
11449
11450 if (osr->kv_submitted_waiters) {
11451 osr->qcond.notify_all();
11452 }
11453 }
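// Ordering example for the scan above (hypothetical txcs): with osr->q
// holding txc1 (STATE_AIO_WAIT) and txc2 whose aio just completed, txc2 is
// marked STATE_IO_DONE but the backward scan sees txc1 still in flight and
// returns without advancing anything; when txc1's aio later completes, the
// scan stops at the queue head and both txc1 and txc2 are pushed through
// _txc_state_proc() in queue order, preserving the kv commit order.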
11454
11455 void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
11456 {
11457 dout(20) << __func__ << " txc " << txc
11458 << " onodes " << txc->onodes
11459 << " shared_blobs " << txc->shared_blobs
11460 << dendl;
11461
11462 // finalize onodes
11463 for (auto o : txc->onodes) {
11464 _record_onode(o, t);
11465 o->flushing_count++;
11466 }
11467
11468 // objects we modified but didn't affect the onode
11469 auto p = txc->modified_objects.begin();
11470 while (p != txc->modified_objects.end()) {
11471 if (txc->onodes.count(*p) == 0) {
11472 (*p)->flushing_count++;
11473 ++p;
11474 } else {
11475 // remove dups with onodes list to avoid problems in _txc_finish
11476 p = txc->modified_objects.erase(p);
11477 }
11478 }
11479
11480 // finalize shared_blobs
11481 for (auto sb : txc->shared_blobs) {
11482 string key;
11483 auto sbid = sb->get_sbid();
11484 get_shared_blob_key(sbid, &key);
11485 if (sb->persistent->empty()) {
11486 dout(20) << __func__ << " shared_blob 0x"
11487 << std::hex << sbid << std::dec
11488 << " is empty" << dendl;
11489 t->rmkey(PREFIX_SHARED_BLOB, key);
11490 } else {
11491 bufferlist bl;
11492 encode(*(sb->persistent), bl);
11493 dout(20) << __func__ << " shared_blob 0x"
11494 << std::hex << sbid << std::dec
11495 << " is " << bl.length() << " " << *sb << dendl;
11496 t->set(PREFIX_SHARED_BLOB, key, bl);
11497 }
11498 }
11499 }
11500
11501 void BlueStore::BSPerfTracker::update_from_perfcounters(
11502 PerfCounters &logger)
11503 {
11504 os_commit_latency_ns.consume_next(
11505 logger.get_tavg_ns(
11506 l_bluestore_commit_lat));
11507 os_apply_latency_ns.consume_next(
11508 logger.get_tavg_ns(
11509 l_bluestore_commit_lat));
11510 }
11511
11512 // For every object we maintain a <zone_num+oid, offset> tuple in the key-value
11513 // store. When a new object is written to a zone, we insert the corresponding
11514 // tuple to the database. When an object is truncated, we remove the
11515 // corresponding tuple. When an object is overwritten, we remove the old tuple
11516 // and insert a new tuple corresponding to the new location of the object. The
11517 // cleaner can now identify live objects within the zone <zone_num> by
11518 // enumerating all the keys starting with <zone_num> prefix.
11519 void BlueStore::_zoned_update_cleaning_metadata(TransContext *txc) {
11520 for (const auto &[o, offsets] : txc->zoned_onode_to_offset_map) {
11521 std::string key;
11522 get_object_key(cct, o->oid, &key);
11523 for (auto offset : offsets) {
11524 if (offset > 0) {
11525 bufferlist offset_bl;
11526 encode(offset, offset_bl);
11527 txc->t->set(_zoned_get_prefix(offset), key, offset_bl);
11528 } else {
11529 txc->t->rmkey(_zoned_get_prefix(-offset), key);
11530 }
11531 }
11532 }
11533 }
11534
11535 std::string BlueStore::_zoned_get_prefix(uint64_t offset) {
11536 uint64_t zone_num = offset / bdev->get_zone_size();
11537 std::string zone_key;
11538 _key_encode_u64(zone_num, &zone_key);
11539 return PREFIX_ZONED_CL_INFO + zone_key;
11540 }
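// Illustrative example (hypothetical geometry): with a 256 MiB zone size, an
// object written at device offset 0x30000000 (768 MiB) falls in zone 3, so
// its cleaning record is stored under the prefix
// PREFIX_ZONED_CL_INFO + _key_encode_u64(3) with the object key appended,
// and the value holds the encoded offset.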
11541
11542 // For now, to avoid interface changes we piggyback zone_size (in MiB) and the
11543 // first sequential zone number onto min_alloc_size and pass it to functions
11544 // Allocator::create and FreelistManager::create.
11545 uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size) {
11546 uint64_t zone_size = bdev->get_zone_size();
11547 uint64_t zone_size_mb = zone_size / (1024 * 1024);
11548 uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size;
11549 min_alloc_size |= (zone_size_mb << 32);
11550 min_alloc_size |= (first_seq_zone << 48);
11551 return min_alloc_size;
11552 }
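// Minimal sketch (not part of BlueStore, never called): undo the packing
// above. The 16-bit width assumed for zone_size_mb is inferred from the
// shifts in _zoned_piggyback_device_parameters_onto(), not guaranteed by the
// interface, and the helper name is illustrative only.
static inline void _zoned_unpack_sketch(uint64_t packed,
                                        uint64_t *min_alloc_size,
                                        uint64_t *zone_size_mb,
                                        uint64_t *first_seq_zone)
{
  *min_alloc_size = packed & 0xffffffffull;      // low 32 bits
  *zone_size_mb   = (packed >> 32) & 0xffffull;  // bits 32..47, zone size in MiB
  *first_seq_zone = packed >> 48;                // bits 48..63
}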
11553
11554 int BlueStore::_zoned_check_config_settings() {
11555 if (cct->_conf->bluestore_allocator != "zoned") {
11556 dout(1) << __func__ << " The drive is HM-SMR but "
11557 << cct->_conf->bluestore_allocator << " allocator is specified. "
11558 << "Only zoned allocator can be used with HM-SMR drive." << dendl;
11559 return -EINVAL;
11560 }
11561
11562 // At least for now we want to use large min_alloc_size with HM-SMR drives.
11563 // Populating used_blocks bitset on a debug build of ceph-osd takes about 5
11564 // minutes with a 14 TB HM-SMR drive and 4 KiB min_alloc_size.
11565 if (min_alloc_size < 64 * 1024) {
11566 dout(1) << __func__ << " The drive is HM-SMR but min_alloc_size is "
11567 << min_alloc_size << ". "
11568 << "Please set to at least 64 KiB." << dendl;
11569 return -EINVAL;
11570 }
11571
11572 // We don't want to defer writes with HM-SMR because it violates the sequential
11573 // write requirement.
11574 if (prefer_deferred_size) {
11575 dout(1) << __func__ << " The drive is HM-SMR but prefer_deferred_size is "
11576 << prefer_deferred_size << ". "
11577 << "Please set to 0." << dendl;
11578 return -EINVAL;
11579 }
11580 return 0;
11581 }
11582
11583 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
11584 {
11585 dout(20) << __func__ << " txc " << txc << std::hex
11586 << " allocated 0x" << txc->allocated
11587 << " released 0x" << txc->released
11588 << std::dec << dendl;
11589
11590 // We have to handle the case where we allocate *and* deallocate the
11591 // same region in this transaction. The freelist doesn't like that.
11592 // (Actually, the only thing that cares is the BitmapFreelistManager
11593 // debug check. But that's important.)
11594 interval_set<uint64_t> tmp_allocated, tmp_released;
11595 interval_set<uint64_t> *pallocated = &txc->allocated;
11596 interval_set<uint64_t> *preleased = &txc->released;
11597 if (!txc->allocated.empty() && !txc->released.empty()) {
11598 interval_set<uint64_t> overlap;
11599 overlap.intersection_of(txc->allocated, txc->released);
11600 if (!overlap.empty()) {
11601 tmp_allocated = txc->allocated;
11602 tmp_allocated.subtract(overlap);
11603 tmp_released = txc->released;
11604 tmp_released.subtract(overlap);
11605 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
11606 << ", new allocated 0x" << tmp_allocated
11607 << " released 0x" << tmp_released << std::dec
11608 << dendl;
11609 pallocated = &tmp_allocated;
11610 preleased = &tmp_released;
11611 }
11612 }
11613
11614 // update freelist with non-overlap sets
11615 for (interval_set<uint64_t>::iterator p = pallocated->begin();
11616 p != pallocated->end();
11617 ++p) {
11618 fm->allocate(p.get_start(), p.get_len(), t);
11619 }
11620 for (interval_set<uint64_t>::iterator p = preleased->begin();
11621 p != preleased->end();
11622 ++p) {
11623 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
11624 << "~" << p.get_len() << std::dec << dendl;
11625 fm->release(p.get_start(), p.get_len(), t);
11626 }
11627
11628 if (bdev->is_smr()) {
11629 _zoned_update_cleaning_metadata(txc);
11630 }
11631
11632 _txc_update_store_statfs(txc);
11633 }
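// Worked example of the overlap trimming above (made-up extents): if a txc
// allocates 0x1000~0x2000 and releases 0x2000~0x2000, the two interval_sets
// intersect in 0x2000~0x1000; subtracting that overlap from both sides
// leaves allocate 0x1000~0x1000 and release 0x3000~0x1000, which is what the
// freelist ends up seeing:
//
//   interval_set<uint64_t> alloc, rel, overlap;
//   alloc.insert(0x1000, 0x2000);          // offset~length
//   rel.insert(0x2000, 0x2000);
//   overlap.intersection_of(alloc, rel);   // 0x2000~0x1000
//   alloc.subtract(overlap);               // 0x1000~0x1000
//   rel.subtract(overlap);                 // 0x3000~0x1000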
11634
11635 void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
11636 {
11637 ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
11638 {
11639 #if defined(WITH_LTTNG)
11640 auto start = mono_clock::now();
11641 #endif
11642
11643 #ifdef WITH_BLKIN
11644 if (txc->trace) {
11645 txc->trace.event("db async submit");
11646 }
11647 #endif
11648
11649 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11650 ceph_assert(r == 0);
11651 txc->set_state(TransContext::STATE_KV_SUBMITTED);
11652 if (txc->osr->kv_submitted_waiters) {
11653 std::lock_guard l(txc->osr->qlock);
11654 txc->osr->qcond.notify_all();
11655 }
11656
11657 #if defined(WITH_LTTNG)
11658 if (txc->tracing) {
11659 tracepoint(
11660 bluestore,
11661 transaction_kv_submit_latency,
11662 txc->osr->get_sequencer_id(),
11663 txc->seq,
11664 sync_submit_transaction,
11665 ceph::to_seconds<double>(mono_clock::now() - start));
11666 }
11667 #endif
11668 }
11669
11670 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
11671 for (auto& o : *ls) {
11672 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
11673 << dendl;
11674 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11675 std::lock_guard l(o->flush_lock);
11676 o->flush_cond.notify_all();
11677 }
11678 }
11679 }
11680 }
11681
11682 void BlueStore::_txc_committed_kv(TransContext *txc)
11683 {
11684 dout(20) << __func__ << " txc " << txc << dendl;
11685 throttle.complete_kv(*txc);
11686 {
11687 std::lock_guard l(txc->osr->qlock);
11688 txc->set_state(TransContext::STATE_KV_DONE);
11689 if (txc->ch->commit_queue) {
11690 txc->ch->commit_queue->queue(txc->oncommits);
11691 } else {
11692 finisher.queue(txc->oncommits);
11693 }
11694 }
11695 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
11696 log_latency_fn(
11697 __func__,
11698 l_bluestore_commit_lat,
11699 mono_clock::now() - txc->start,
11700 cct->_conf->bluestore_log_op_age,
11701 [&](auto lat) {
11702 return ", txc = " + stringify(txc);
11703 }
11704 );
11705 }
11706
11707 void BlueStore::_txc_finish(TransContext *txc)
11708 {
11709 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
11710 ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
11711
11712 for (auto& sb : txc->shared_blobs_written) {
11713 sb->finish_write(txc->seq);
11714 }
11715 txc->shared_blobs_written.clear();
11716
11717 while (!txc->removed_collections.empty()) {
11718 _queue_reap_collection(txc->removed_collections.front());
11719 txc->removed_collections.pop_front();
11720 }
11721
11722 OpSequencerRef osr = txc->osr;
11723 bool empty = false;
11724 bool submit_deferred = false;
11725 OpSequencer::q_list_t releasing_txc;
11726 {
11727 std::lock_guard l(osr->qlock);
11728 txc->set_state(TransContext::STATE_DONE);
11729 bool notify = false;
11730 while (!osr->q.empty()) {
11731 TransContext *txc = &osr->q.front();
11732 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
11733 << dendl;
11734 if (txc->get_state() != TransContext::STATE_DONE) {
11735 if (txc->get_state() == TransContext::STATE_PREPARE &&
11736 deferred_aggressive) {
11737 // for _osr_drain_preceding()
11738 notify = true;
11739 }
11740 if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
11741 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
11742 submit_deferred = true;
11743 }
11744 break;
11745 }
11746
11747 osr->q.pop_front();
11748 releasing_txc.push_back(*txc);
11749 }
11750
11751 if (osr->q.empty()) {
11752 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
11753 empty = true;
11754 }
11755
11756 // only drain()/drain_preceding() need wakeup,
11757 // other cases use kv_submitted_waiters
11758 if (notify || empty) {
11759 osr->qcond.notify_all();
11760 }
11761 }
11762
11763 while (!releasing_txc.empty()) {
11764 // release to allocator only after all preceding txc's have also
11765 // finished any deferred writes that potentially land in these
11766 // blocks
11767 auto txc = &releasing_txc.front();
11768 _txc_release_alloc(txc);
11769 releasing_txc.pop_front();
11770 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
11771 throttle.complete(*txc);
11772 delete txc;
11773 }
11774
11775 if (submit_deferred) {
11776 // we're pinning memory; flush! we could be more fine-grained here but
11777 // i'm not sure it's worth the bother.
11778 deferred_try_submit();
11779 }
11780
11781 if (empty && osr->zombie) {
11782 std::lock_guard l(zombie_osr_lock);
11783 if (zombie_osr_set.erase(osr->cid)) {
11784 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11785 } else {
11786 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
11787 << dendl;
11788 }
11789 }
11790 }
11791
11792 void BlueStore::_txc_release_alloc(TransContext *txc)
11793 {
11794 // it's expected we're called with lazy_release_lock already taken!
11795 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
11796 int r = 0;
11797 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
11798 r = bdev->queue_discard(txc->released);
11799 if (r == 0) {
11800 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
11801 << txc->released << std::dec << dendl;
11802 goto out;
11803 }
11804 } else if (cct->_conf->bdev_enable_discard) {
11805 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
11806 bdev->discard(p.get_start(), p.get_len());
11807 }
11808 }
11809 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
11810 << txc->released << std::dec << dendl;
11811 shared_alloc.a->release(txc->released);
11812 }
11813
11814 out:
11815 txc->allocated.clear();
11816 txc->released.clear();
11817 }
11818
11819 void BlueStore::_osr_attach(Collection *c)
11820 {
11821 // note: caller has RWLock on coll_map
11822 auto q = coll_map.find(c->cid);
11823 if (q != coll_map.end()) {
11824 c->osr = q->second->osr;
11825 ldout(cct, 10) << __func__ << " " << c->cid
11826 << " reusing osr " << c->osr << " from existing coll "
11827 << q->second << dendl;
11828 } else {
11829 std::lock_guard l(zombie_osr_lock);
11830 auto p = zombie_osr_set.find(c->cid);
11831 if (p == zombie_osr_set.end()) {
11832 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
11833 ldout(cct, 10) << __func__ << " " << c->cid
11834 << " fresh osr " << c->osr << dendl;
11835 } else {
11836 c->osr = p->second;
11837 zombie_osr_set.erase(p);
11838 ldout(cct, 10) << __func__ << " " << c->cid
11839 << " resurrecting zombie osr " << c->osr << dendl;
11840 c->osr->zombie = false;
11841 }
11842 }
11843 }
11844
11845 void BlueStore::_osr_register_zombie(OpSequencer *osr)
11846 {
11847 std::lock_guard l(zombie_osr_lock);
11848 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
11849 osr->zombie = true;
11850 auto i = zombie_osr_set.emplace(osr->cid, osr);
11851 // this is either a new insertion or the same osr is already there
11852 ceph_assert(i.second || i.first->second == osr);
11853 }
11854
11855 void BlueStore::_osr_drain_preceding(TransContext *txc)
11856 {
11857 OpSequencer *osr = txc->osr.get();
11858 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
11859 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11860 {
11861 // submit anything pending
11862 osr->deferred_lock.lock();
11863 if (osr->deferred_pending && !osr->deferred_running) {
11864 _deferred_submit_unlock(osr);
11865 } else {
11866 osr->deferred_lock.unlock();
11867 }
11868 }
11869 {
11870 // wake up any previously finished deferred events
11871 std::lock_guard l(kv_lock);
11872 if (!kv_sync_in_progress) {
11873 kv_sync_in_progress = true;
11874 kv_cond.notify_one();
11875 }
11876 }
11877 osr->drain_preceding(txc);
11878 --deferred_aggressive;
11879 dout(10) << __func__ << " " << osr << " done" << dendl;
11880 }
11881
11882 void BlueStore::_osr_drain(OpSequencer *osr)
11883 {
11884 dout(10) << __func__ << " " << osr << dendl;
11885 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11886 {
11887 // submit anything pending
11888 osr->deferred_lock.lock();
11889 if (osr->deferred_pending && !osr->deferred_running) {
11890 _deferred_submit_unlock(osr);
11891 } else {
11892 osr->deferred_lock.unlock();
11893 }
11894 }
11895 {
11896 // wake up any previously finished deferred events
11897 std::lock_guard l(kv_lock);
11898 if (!kv_sync_in_progress) {
11899 kv_sync_in_progress = true;
11900 kv_cond.notify_one();
11901 }
11902 }
11903 osr->drain();
11904 --deferred_aggressive;
11905 dout(10) << __func__ << " " << osr << " done" << dendl;
11906 }
11907
11908 void BlueStore::_osr_drain_all()
11909 {
11910 dout(10) << __func__ << dendl;
11911
11912 set<OpSequencerRef> s;
11913 vector<OpSequencerRef> zombies;
11914 {
11915 std::shared_lock l(coll_lock);
11916 for (auto& i : coll_map) {
11917 s.insert(i.second->osr);
11918 }
11919 }
11920 {
11921 std::lock_guard l(zombie_osr_lock);
11922 for (auto& i : zombie_osr_set) {
11923 s.insert(i.second);
11924 zombies.push_back(i.second);
11925 }
11926 }
11927 dout(20) << __func__ << " osr_set " << s << dendl;
11928
11929 ++deferred_aggressive;
11930 {
11931 // submit anything pending
11932 deferred_try_submit();
11933 }
11934 {
11935 // wake up any previously finished deferred events
11936 std::lock_guard l(kv_lock);
11937 kv_cond.notify_one();
11938 }
11939 {
11940 std::lock_guard l(kv_finalize_lock);
11941 kv_finalize_cond.notify_one();
11942 }
11943 for (auto osr : s) {
11944 dout(20) << __func__ << " drain " << osr << dendl;
11945 osr->drain();
11946 }
11947 --deferred_aggressive;
11948
11949 {
11950 std::lock_guard l(zombie_osr_lock);
11951 for (auto& osr : zombies) {
11952 if (zombie_osr_set.erase(osr->cid)) {
11953 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11954 ceph_assert(osr->q.empty());
11955 } else if (osr->zombie) {
11956 dout(10) << __func__ << " empty zombie osr " << osr
11957 << " already reaped" << dendl;
11958 ceph_assert(osr->q.empty());
11959 } else {
11960 dout(10) << __func__ << " empty zombie osr " << osr
11961 << " resurrected" << dendl;
11962 }
11963 }
11964 }
11965
11966 dout(10) << __func__ << " done" << dendl;
11967 }
11968
11969
11970 void BlueStore::_kv_start()
11971 {
11972 dout(10) << __func__ << dendl;
11973
11974 finisher.start();
11975 kv_sync_thread.create("bstore_kv_sync");
11976 kv_finalize_thread.create("bstore_kv_final");
11977 }
11978
11979 void BlueStore::_kv_stop()
11980 {
11981 dout(10) << __func__ << dendl;
11982 {
11983 std::unique_lock l{kv_lock};
11984 while (!kv_sync_started) {
11985 kv_cond.wait(l);
11986 }
11987 kv_stop = true;
11988 kv_cond.notify_all();
11989 }
11990 {
11991 std::unique_lock l{kv_finalize_lock};
11992 while (!kv_finalize_started) {
11993 kv_finalize_cond.wait(l);
11994 }
11995 kv_finalize_stop = true;
11996 kv_finalize_cond.notify_all();
11997 }
11998 kv_sync_thread.join();
11999 kv_finalize_thread.join();
12000 ceph_assert(removed_collections.empty());
12001 {
12002 std::lock_guard l(kv_lock);
12003 kv_stop = false;
12004 }
12005 {
12006 std::lock_guard l(kv_finalize_lock);
12007 kv_finalize_stop = false;
12008 }
12009 dout(10) << __func__ << " stopping finishers" << dendl;
12010 finisher.wait_for_empty();
12011 finisher.stop();
12012 dout(10) << __func__ << " stopped" << dendl;
12013 }
12014
12015 void BlueStore::_kv_sync_thread()
12016 {
12017 dout(10) << __func__ << " start" << dendl;
12018 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
12019 std::unique_lock l{kv_lock};
12020 ceph_assert(!kv_sync_started);
12021 kv_sync_started = true;
12022 kv_cond.notify_all();
12023
12024 auto t0 = mono_clock::now();
12025 timespan twait = ceph::make_timespan(0);
12026 size_t kv_submitted = 0;
12027
12028 while (true) {
12029 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
12030 auto observation_period =
12031 ceph::make_timespan(period);
12032 auto elapsed = mono_clock::now() - t0;
12033 if (period && elapsed >= observation_period) {
12034 dout(5) << __func__ << " utilization: idle "
12035 << twait << " of " << elapsed
12036 << ", submitted: " << kv_submitted
12037 <<dendl;
12038 t0 = mono_clock::now();
12039 twait = ceph::make_timespan(0);
12040 kv_submitted = 0;
12041 }
12042 ceph_assert(kv_committing.empty());
12043 if (kv_queue.empty() &&
12044 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
12045 !deferred_aggressive)) {
12046 if (kv_stop)
12047 break;
12048 dout(20) << __func__ << " sleep" << dendl;
12049 auto t = mono_clock::now();
12050 kv_sync_in_progress = false;
12051 kv_cond.wait(l);
12052 twait += mono_clock::now() - t;
12053
12054 dout(20) << __func__ << " wake" << dendl;
12055 } else {
12056 deque<TransContext*> kv_submitting;
12057 deque<DeferredBatch*> deferred_done, deferred_stable;
12058 uint64_t aios = 0, costs = 0;
12059
12060 dout(20) << __func__ << " committing " << kv_queue.size()
12061 << " submitting " << kv_queue_unsubmitted.size()
12062 << " deferred done " << deferred_done_queue.size()
12063 << " stable " << deferred_stable_queue.size()
12064 << dendl;
12065 kv_committing.swap(kv_queue);
12066 kv_submitting.swap(kv_queue_unsubmitted);
12067 deferred_done.swap(deferred_done_queue);
12068 deferred_stable.swap(deferred_stable_queue);
12069 aios = kv_ios;
12070 costs = kv_throttle_costs;
12071 kv_ios = 0;
12072 kv_throttle_costs = 0;
12073 l.unlock();
12074
12075 dout(30) << __func__ << " committing " << kv_committing << dendl;
12076 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
12077 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
12078 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
12079
12080 auto start = mono_clock::now();
12081
12082 bool force_flush = false;
12083 // if bluefs is sharing the same device as data (only), then we
12084 // can rely on the bluefs commit to flush the device and make
12085 // deferred aios stable. that means that if we do have completed deferred
12086 // txcs AND we are not on a single device, we need to force a flush.
12087 if (bluefs && bluefs_layout.single_shared_device()) {
12088 if (aios) {
12089 force_flush = true;
12090 } else if (kv_committing.empty() && deferred_stable.empty()) {
12091 force_flush = true; // there's nothing else to commit!
12092 } else if (deferred_aggressive) {
12093 force_flush = true;
12094 }
12095 } else {
12096 if (aios || !deferred_done.empty()) {
12097 force_flush = true;
12098 } else {
12099 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
12100 }
12101 }
12102
12103 if (force_flush) {
12104 dout(20) << __func__ << " num_aios=" << aios
12105 << " force_flush=" << (int)force_flush
12106 << ", flushing, deferred done->stable" << dendl;
12107 // flush/barrier on block device
12108 bdev->flush();
12109
12110 // if we flush then deferred done are now deferred stable
12111 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
12112 deferred_done.end());
12113 deferred_done.clear();
12114 }
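// Example of the flush decision above: with bluefs sharing the single data
// device, a batch that has no new data aios but does have kv transactions to
// commit skips the explicit bdev flush, because the synchronous kv commit
// below already flushes that same device and thereby makes any previously
// completed deferred writes stable.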
12115 auto after_flush = mono_clock::now();
12116
12117 // we will use one final transaction to force a sync
12118 KeyValueDB::Transaction synct = db->get_transaction();
12119
12120 // increase {nid,blobid}_max? note that this covers both the
12121 // case where we are approaching the max and the case we passed
12122 // it. in either case, we increase the max in the earlier txn
12123 // we submit.
12124 uint64_t new_nid_max = 0, new_blobid_max = 0;
12125 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
12126 KeyValueDB::Transaction t =
12127 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12128 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
12129 bufferlist bl;
12130 encode(new_nid_max, bl);
12131 t->set(PREFIX_SUPER, "nid_max", bl);
12132 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
12133 }
12134 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
12135 KeyValueDB::Transaction t =
12136 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12137 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
12138 bufferlist bl;
12139 encode(new_blobid_max, bl);
12140 t->set(PREFIX_SUPER, "blobid_max", bl);
12141 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
12142 }
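// Illustration of the watermark scheme above (assuming the default
// bluestore_nid_prealloc of 1024): while nid_last stays below nid_max - 512,
// nids are handed out from memory without touching the kv store; once that
// halfway mark is crossed, this cycle persists nid_max = nid_last + 1024, so
// a crash can at worst waste the unissued ids below the persisted max, never
// hand the same nid out twice.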
12143
12144 for (auto txc : kv_committing) {
12145 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
12146 if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
12147 ++kv_submitted;
12148 _txc_apply_kv(txc, false);
12149 --txc->osr->kv_committing_serially;
12150 } else {
12151 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
12152 }
12153 if (txc->had_ios) {
12154 --txc->osr->txc_with_unstable_io;
12155 }
12156 }
12157
12158 // release throttle *before* we commit. this allows new ops
12159 // to be prepared and enter pipeline while we are waiting on
12160 // the kv commit sync/flush. then hopefully on the next
12161 // iteration there will already be ops awake. otherwise, we
12162 // end up going to sleep, and then wake up when the very first
12163 // transaction is ready for commit.
12164 throttle.release_kv_throttle(costs);
12165
12166 // cleanup sync deferred keys
12167 for (auto b : deferred_stable) {
12168 for (auto& txc : b->txcs) {
12169 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
12170 ceph_assert(wt.released.empty()); // only kraken did this
12171 string key;
12172 get_deferred_key(wt.seq, &key);
12173 synct->rm_single_key(PREFIX_DEFERRED, key);
12174 }
12175 }
12176
12177 #if defined(WITH_LTTNG)
12178 auto sync_start = mono_clock::now();
12179 #endif
12180 // submit synct synchronously (block and wait for it to commit)
12181 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
12182 ceph_assert(r == 0);
12183
12184 #ifdef WITH_BLKIN
12185 for (auto txc : kv_committing) {
12186 if (txc->trace) {
12187 txc->trace.event("db sync submit");
12188 txc->trace.keyval("kv_committing size", kv_committing.size());
12189 }
12190 }
12191 #endif
12192
12193 int committing_size = kv_committing.size();
12194 int deferred_size = deferred_stable.size();
12195
12196 #if defined(WITH_LTTNG)
12197 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
12198 for (auto txc: kv_committing) {
12199 if (txc->tracing) {
12200 tracepoint(
12201 bluestore,
12202 transaction_kv_sync_latency,
12203 txc->osr->get_sequencer_id(),
12204 txc->seq,
12205 kv_committing.size(),
12206 deferred_done.size(),
12207 deferred_stable.size(),
12208 sync_latency);
12209 }
12210 }
12211 #endif
12212
12213 {
12214 std::unique_lock m{kv_finalize_lock};
12215 if (kv_committing_to_finalize.empty()) {
12216 kv_committing_to_finalize.swap(kv_committing);
12217 } else {
12218 kv_committing_to_finalize.insert(
12219 kv_committing_to_finalize.end(),
12220 kv_committing.begin(),
12221 kv_committing.end());
12222 kv_committing.clear();
12223 }
12224 if (deferred_stable_to_finalize.empty()) {
12225 deferred_stable_to_finalize.swap(deferred_stable);
12226 } else {
12227 deferred_stable_to_finalize.insert(
12228 deferred_stable_to_finalize.end(),
12229 deferred_stable.begin(),
12230 deferred_stable.end());
12231 deferred_stable.clear();
12232 }
12233 if (!kv_finalize_in_progress) {
12234 kv_finalize_in_progress = true;
12235 kv_finalize_cond.notify_one();
12236 }
12237 }
12238
12239 if (new_nid_max) {
12240 nid_max = new_nid_max;
12241 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
12242 }
12243 if (new_blobid_max) {
12244 blobid_max = new_blobid_max;
12245 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
12246 }
12247
12248 {
12249 auto finish = mono_clock::now();
12250 ceph::timespan dur_flush = after_flush - start;
12251 ceph::timespan dur_kv = finish - after_flush;
12252 ceph::timespan dur = finish - start;
12253 dout(20) << __func__ << " committed " << committing_size
12254 << " cleaned " << deferred_size
12255 << " in " << dur
12256 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
12257 << dendl;
12258 log_latency("kv_flush",
12259 l_bluestore_kv_flush_lat,
12260 dur_flush,
12261 cct->_conf->bluestore_log_op_age);
12262 log_latency("kv_commit",
12263 l_bluestore_kv_commit_lat,
12264 dur_kv,
12265 cct->_conf->bluestore_log_op_age);
12266 log_latency("kv_sync",
12267 l_bluestore_kv_sync_lat,
12268 dur,
12269 cct->_conf->bluestore_log_op_age);
12270 }
12271
12272 l.lock();
12273 // previously deferred "done" are now "stable" by virtue of this
12274 // commit cycle.
12275 deferred_stable_queue.swap(deferred_done);
12276 }
12277 }
12278 dout(10) << __func__ << " finish" << dendl;
12279 kv_sync_started = false;
12280 }
12281
12282 void BlueStore::_kv_finalize_thread()
12283 {
12284 deque<TransContext*> kv_committed;
12285 deque<DeferredBatch*> deferred_stable;
12286 dout(10) << __func__ << " start" << dendl;
12287 std::unique_lock l(kv_finalize_lock);
12288 ceph_assert(!kv_finalize_started);
12289 kv_finalize_started = true;
12290 kv_finalize_cond.notify_all();
12291 while (true) {
12292 ceph_assert(kv_committed.empty());
12293 ceph_assert(deferred_stable.empty());
12294 if (kv_committing_to_finalize.empty() &&
12295 deferred_stable_to_finalize.empty()) {
12296 if (kv_finalize_stop)
12297 break;
12298 dout(20) << __func__ << " sleep" << dendl;
12299 kv_finalize_in_progress = false;
12300 kv_finalize_cond.wait(l);
12301 dout(20) << __func__ << " wake" << dendl;
12302 } else {
12303 kv_committed.swap(kv_committing_to_finalize);
12304 deferred_stable.swap(deferred_stable_to_finalize);
12305 l.unlock();
12306 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
12307 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
12308
12309 auto start = mono_clock::now();
12310
12311 while (!kv_committed.empty()) {
12312 TransContext *txc = kv_committed.front();
12313 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
12314 _txc_state_proc(txc);
12315 kv_committed.pop_front();
12316 }
12317
12318 for (auto b : deferred_stable) {
12319 auto p = b->txcs.begin();
12320 while (p != b->txcs.end()) {
12321 TransContext *txc = &*p;
12322 p = b->txcs.erase(p); // unlink here because
12323 _txc_state_proc(txc); // this may destroy txc
12324 }
12325 delete b;
12326 }
12327 deferred_stable.clear();
12328
12329 if (!deferred_aggressive) {
12330 if (deferred_queue_size >= deferred_batch_ops.load() ||
12331 throttle.should_submit_deferred()) {
12332 deferred_try_submit();
12333 }
12334 }
12335
12336 // this is as good a place as any ...
12337 _reap_collections();
12338
12339 logger->set(l_bluestore_fragmentation,
12340 (uint64_t)(shared_alloc.a->get_fragmentation() * 1000));
12341
12342 log_latency("kv_final",
12343 l_bluestore_kv_final_lat,
12344 mono_clock::now() - start,
12345 cct->_conf->bluestore_log_op_age);
12346
12347 l.lock();
12348 }
12349 }
12350 dout(10) << __func__ << " finish" << dendl;
12351 kv_finalize_started = false;
12352 }
12353
12354 void BlueStore::_zoned_cleaner_start() {
12355 dout(10) << __func__ << dendl;
12356
12357 zoned_cleaner_thread.create("bstore_zcleaner");
12358 }
12359
12360 void BlueStore::_zoned_cleaner_stop() {
12361 dout(10) << __func__ << dendl;
12362 {
12363 std::unique_lock l{zoned_cleaner_lock};
12364 while (!zoned_cleaner_started) {
12365 zoned_cleaner_cond.wait(l);
12366 }
12367 zoned_cleaner_stop = true;
12368 zoned_cleaner_cond.notify_all();
12369 }
12370 zoned_cleaner_thread.join();
12371 {
12372 std::lock_guard l{zoned_cleaner_lock};
12373 zoned_cleaner_stop = false;
12374 }
12375 dout(10) << __func__ << " done" << dendl;
12376 }
12377
12378 void BlueStore::_zoned_cleaner_thread() {
12379 dout(10) << __func__ << " start" << dendl;
12380 std::unique_lock l{zoned_cleaner_lock};
12381 ceph_assert(!zoned_cleaner_started);
12382 zoned_cleaner_started = true;
12383 zoned_cleaner_cond.notify_all();
12384 std::deque<uint64_t> zones_to_clean;
12385 while (true) {
12386 if (zoned_cleaner_queue.empty()) {
12387 if (zoned_cleaner_stop) {
12388 break;
12389 }
12390 dout(20) << __func__ << " sleep" << dendl;
12391 zoned_cleaner_cond.wait(l);
12392 dout(20) << __func__ << " wake" << dendl;
12393 } else {
12394 zones_to_clean.swap(zoned_cleaner_queue);
12395 l.unlock();
12396 while (!zones_to_clean.empty()) {
12397 _zoned_clean_zone(zones_to_clean.front());
12398 zones_to_clean.pop_front();
12399 }
12400 l.lock();
12401 }
12402 }
12403 dout(10) << __func__ << " finish" << dendl;
12404 zoned_cleaner_started = false;
12405 }
12406
12407 void BlueStore::_zoned_clean_zone(uint64_t zone_num) {
12408 dout(10) << __func__ << " cleaning zone " << zone_num << dendl;
12409 }
12410
12411 bluestore_deferred_op_t *BlueStore::_get_deferred_op(
12412 TransContext *txc, uint64_t len)
12413 {
12414 if (!txc->deferred_txn) {
12415 txc->deferred_txn = new bluestore_deferred_transaction_t;
12416 }
12417 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
12418 logger->inc(l_bluestore_write_deferred);
12419 logger->inc(l_bluestore_write_deferred_bytes, len);
12420 return &txc->deferred_txn->ops.back();
12421 }
12422
12423 void BlueStore::_deferred_queue(TransContext *txc)
12424 {
12425 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
12426
12427 DeferredBatch *tmp;
12428 txc->osr->deferred_lock.lock();
12429 {
12430 if (!txc->osr->deferred_pending) {
12431 tmp = new DeferredBatch(cct, txc->osr.get());
12432 } else {
12433 tmp = txc->osr->deferred_pending;
12434 }
12435 }
12436
12437 tmp->txcs.push_back(*txc);
12438 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
12439 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
12440 const auto& op = *opi;
12441 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
12442 bufferlist::const_iterator p = op.data.begin();
12443 for (auto e : op.extents) {
12444 tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
12445 }
12446 }
12447
12448 {
12449 ++deferred_queue_size;
12450 txc->osr->deferred_pending = tmp;
12451 // condition "tmp->txcs.size() == 1" means deferred_pending was originally empty.
12452 // So we should add the osr to deferred_queue.
12453 if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
12454 deferred_lock.lock();
12455 deferred_queue.push_back(*txc->osr);
12456 deferred_lock.unlock();
12457 }
12458
12459 if (deferred_aggressive &&
12460 !txc->osr->deferred_running) {
12461 _deferred_submit_unlock(txc->osr.get());
12462 } else {
12463 txc->osr->deferred_lock.unlock();
12464 }
12465 }
12466 }
12467
12468 void BlueStore::deferred_try_submit()
12469 {
12470 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
12471 << deferred_queue_size << " txcs" << dendl;
12472 vector<OpSequencerRef> osrs;
12473
12474 {
12475 std::lock_guard l(deferred_lock);
12476 osrs.reserve(deferred_queue.size());
12477 for (auto& osr : deferred_queue) {
12478 osrs.push_back(&osr);
12479 }
12480 }
12481
12482 for (auto& osr : osrs) {
12483 osr->deferred_lock.lock();
12484 if (osr->deferred_pending) {
12485 if (!osr->deferred_running) {
12486 _deferred_submit_unlock(osr.get());
12487 } else {
12488 osr->deferred_lock.unlock();
12489 dout(20) << __func__ << " osr " << osr << " already has running"
12490 << dendl;
12491 }
12492 } else {
12493 osr->deferred_lock.unlock();
12494 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
12495 }
12496 }
12497
12498 {
12499 std::lock_guard l(deferred_lock);
12500 deferred_last_submitted = ceph_clock_now();
12501 }
12502 }
12503
12504 void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
12505 {
12506 dout(10) << __func__ << " osr " << osr
12507 << " " << osr->deferred_pending->iomap.size() << " ios pending "
12508 << dendl;
12509 ceph_assert(osr->deferred_pending);
12510 ceph_assert(!osr->deferred_running);
12511
12512 auto b = osr->deferred_pending;
12513 deferred_queue_size -= b->seq_bytes.size();
12514 ceph_assert(deferred_queue_size >= 0);
12515
12516 osr->deferred_running = osr->deferred_pending;
12517 osr->deferred_pending = nullptr;
12518
12519 osr->deferred_lock.unlock();
12520
12521 for (auto& txc : b->txcs) {
12522 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
12523 }
12524 uint64_t start = 0, pos = 0;
12525 bufferlist bl;
12526 auto i = b->iomap.begin();
12527 while (true) {
12528 if (i == b->iomap.end() || i->first != pos) {
12529 if (bl.length()) {
12530 dout(20) << __func__ << " write 0x" << std::hex
12531 << start << "~" << bl.length()
12532 << " crc " << bl.crc32c(-1) << std::dec << dendl;
12533 if (!g_conf()->bluestore_debug_omit_block_device_write) {
12534 logger->inc(l_bluestore_deferred_write_ops);
12535 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
12536 int r = bdev->aio_write(start, bl, &b->ioc, false);
12537 ceph_assert(r == 0);
12538 }
12539 }
12540 if (i == b->iomap.end()) {
12541 break;
12542 }
12543 start = 0;
12544 pos = i->first;
12545 bl.clear();
12546 }
12547 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
12548 << std::hex << pos << "~" << i->second.bl.length() << std::dec
12549 << dendl;
12550 if (!bl.length()) {
12551 start = pos;
12552 }
12553 pos += i->second.bl.length();
12554 bl.claim_append(i->second.bl);
12555 ++i;
12556 }
12557
12558 bdev->aio_submit(&b->ioc);
12559 }
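// Minimal sketch of the run-coalescing loop above, kept separate from the
// real DeferredBatch plumbing: adjacent entries of an offset->data map are
// merged so each physically contiguous run goes to the device as a single
// write. The helper name and callback shape are illustrative only.
template <typename F>
static void _coalesce_runs_sketch(const std::map<uint64_t, bufferlist>& iomap,
                                  F&& submit_write)
{
  uint64_t start = 0, pos = 0;
  bufferlist run;
  for (auto& [off, data] : iomap) {
    if (run.length() && off != pos) {
      submit_write(start, run);   // gap: flush the run accumulated so far
      run.clear();
    }
    if (!run.length()) {
      start = pos = off;
    }
    bufferlist tmp = data;        // copy; the real loop claims the buffers
    run.claim_append(tmp);
    pos += data.length();
  }
  if (run.length()) {
    submit_write(start, run);     // final run
  }
}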
12560
12561 struct C_DeferredTrySubmit : public Context {
12562 BlueStore *store;
12563 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
12564 void finish(int r) {
12565 store->deferred_try_submit();
12566 }
12567 };
12568
12569 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
12570 {
12571 dout(10) << __func__ << " osr " << osr << dendl;
12572 ceph_assert(osr->deferred_running);
12573 DeferredBatch *b = osr->deferred_running;
12574
12575 {
12576 osr->deferred_lock.lock();
12577 ceph_assert(osr->deferred_running == b);
12578 osr->deferred_running = nullptr;
12579 if (!osr->deferred_pending) {
12580 dout(20) << __func__ << " dequeueing" << dendl;
12581 {
12582 deferred_lock.lock();
12583 auto q = deferred_queue.iterator_to(*osr);
12584 deferred_queue.erase(q);
12585 deferred_lock.unlock();
12586 }
12587 osr->deferred_lock.unlock();
12588 } else {
12589 osr->deferred_lock.unlock();
12590 if (deferred_aggressive) {
12591 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
12592 finisher.queue(new C_DeferredTrySubmit(this));
12593 } else {
12594 dout(20) << __func__ << " leaving queued, more pending" << dendl;
12595 }
12596 }
12597 }
12598
12599 {
12600 uint64_t costs = 0;
12601 {
12602 for (auto& i : b->txcs) {
12603 TransContext *txc = &i;
12604 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
12605 txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
12606 costs += txc->cost;
12607 }
12608 }
12609 throttle.release_deferred_throttle(costs);
12610 }
12611
12612 {
12613 std::lock_guard l(kv_lock);
12614 deferred_done_queue.emplace_back(b);
12615
12616 // in the normal case, do not bother waking up the kv thread; it will
12617 // catch us on the next commit anyway.
12618 if (deferred_aggressive && !kv_sync_in_progress) {
12619 kv_sync_in_progress = true;
12620 kv_cond.notify_one();
12621 }
12622 }
12623 }
12624
12625 int BlueStore::_deferred_replay()
12626 {
12627 dout(10) << __func__ << " start" << dendl;
12628 int count = 0;
12629 int r = 0;
12630 CollectionRef ch = _get_collection(coll_t::meta());
12631 bool fake_ch = false;
12632 if (!ch) {
12633 // hmm, replaying initial mkfs?
12634 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
12635 fake_ch = true;
12636 }
12637 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
12638 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
12639 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
12640 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
12641 << dendl;
12642 bluestore_deferred_transaction_t *deferred_txn =
12643 new bluestore_deferred_transaction_t;
12644 bufferlist bl = it->value();
12645 auto p = bl.cbegin();
12646 try {
12647 decode(*deferred_txn, p);
12648 } catch (ceph::buffer::error& e) {
12649 derr << __func__ << " failed to decode deferred txn "
12650 << pretty_binary_string(it->key()) << dendl;
12651 delete deferred_txn;
12652 r = -EIO;
12653 goto out;
12654 }
12655 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
12656 txc->deferred_txn = deferred_txn;
12657 txc->set_state(TransContext::STATE_KV_DONE);
12658 _txc_state_proc(txc);
12659 }
12660 out:
12661 dout(20) << __func__ << " draining osr" << dendl;
12662 _osr_register_zombie(osr);
12663 _osr_drain_all();
12664 if (fake_ch) {
12665 new_coll_map.clear();
12666 }
12667 dout(10) << __func__ << " completed " << count << " events" << dendl;
12668 return r;
12669 }
12670
12671 // ---------------------------
12672 // transactions
12673
12674 int BlueStore::queue_transactions(
12675 CollectionHandle& ch,
12676 vector<Transaction>& tls,
12677 TrackedOpRef op,
12678 ThreadPool::TPHandle *handle)
12679 {
12680 FUNCTRACE(cct);
12681 list<Context *> on_applied, on_commit, on_applied_sync;
12682 ObjectStore::Transaction::collect_contexts(
12683 tls, &on_applied, &on_commit, &on_applied_sync);
12684
12685 auto start = mono_clock::now();
12686
12687 Collection *c = static_cast<Collection*>(ch.get());
12688 OpSequencer *osr = c->osr.get();
12689 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
12690
12691 // prepare
12692 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
12693 &on_commit, op);
12694
12695 // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
12696 // submission to happen atomically because if I/O submission happens in a
12697 // different order than I/O allocation, we end up issuing non-sequential
12698 // writes to the drive. This is a temporary solution until ZONE APPEND
12699 // support matures in the kernel. For more information please see:
12700 // https://www.usenix.org/conference/vault20/presentation/bjorling
12701 if (bdev->is_smr()) {
12702 atomic_alloc_and_submit_lock.lock();
12703 }
12704 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
12705 txc->bytes += (*p).get_num_bytes();
12706 _txc_add_transaction(txc, &(*p));
12707 }
12708 _txc_calc_cost(txc);
12709
12710 _txc_write_nodes(txc, txc->t);
12711
12712 // journal deferred items
12713 if (txc->deferred_txn) {
12714 txc->deferred_txn->seq = ++deferred_seq;
12715 bufferlist bl;
12716 encode(*txc->deferred_txn, bl);
12717 string key;
12718 get_deferred_key(txc->deferred_txn->seq, &key);
12719 txc->t->set(PREFIX_DEFERRED, key, bl);
12720 }
12721
12722 _txc_finalize_kv(txc, txc->t);
12723
12724 #ifdef WITH_BLKIN
12725 if (txc->trace) {
12726 txc->trace.event("txc encode finished");
12727 }
12728 #endif
12729
12730 if (handle)
12731 handle->suspend_tp_timeout();
12732
12733 auto tstart = mono_clock::now();
12734
12735 if (!throttle.try_start_transaction(
12736 *db,
12737 *txc,
12738 tstart)) {
12739 // ensure we do not block here because of deferred writes
12740 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
12741 << dendl;
12742 ++deferred_aggressive;
12743 deferred_try_submit();
12744 {
12745 // wake up any previously finished deferred events
12746 std::lock_guard l(kv_lock);
12747 if (!kv_sync_in_progress) {
12748 kv_sync_in_progress = true;
12749 kv_cond.notify_one();
12750 }
12751 }
12752 throttle.finish_start_transaction(*db, *txc, tstart);
12753 --deferred_aggressive;
12754 }
12755 auto tend = mono_clock::now();
12756
12757 if (handle)
12758 handle->reset_tp_timeout();
12759
12760 logger->inc(l_bluestore_txc);
12761
12762 // execute (start)
12763 _txc_state_proc(txc);
12764
12765 if (bdev->is_smr()) {
12766 atomic_alloc_and_submit_lock.unlock();
12767 }
12768
12769 // we're immediately readable (unlike FileStore)
12770 for (auto c : on_applied_sync) {
12771 c->complete(0);
12772 }
12773 if (!on_applied.empty()) {
12774 if (c->commit_queue) {
12775 c->commit_queue->queue(on_applied);
12776 } else {
12777 finisher.queue(on_applied);
12778 }
12779 }
12780
12781 #ifdef WITH_BLKIN
12782 if (txc->trace) {
12783 txc->trace.event("txc applied");
12784 }
12785 #endif
12786
12787 log_latency("submit_transact",
12788 l_bluestore_submit_lat,
12789 mono_clock::now() - start,
12790 cct->_conf->bluestore_log_op_age);
12791 log_latency("throttle_transact",
12792 l_bluestore_throttle_lat,
12793 tend - tstart,
12794 cct->_conf->bluestore_log_op_age);
12795 return 0;
12796 }
12797
12798 void BlueStore::_txc_aio_submit(TransContext *txc)
12799 {
12800 dout(10) << __func__ << " txc " << txc << dendl;
12801 bdev->aio_submit(&txc->ioc);
12802 }
12803
12804 void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
12805 {
12806 Transaction::iterator i = t->begin();
12807
12808 _dump_transaction<30>(cct, t);
12809
12810 vector<CollectionRef> cvec(i.colls.size());
12811 unsigned j = 0;
12812 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
12813 ++p, ++j) {
12814 cvec[j] = _get_collection(*p);
12815 }
12816
12817 vector<OnodeRef> ovec(i.objects.size());
12818
12819 for (int pos = 0; i.have_op(); ++pos) {
12820 Transaction::Op *op = i.decode_op();
12821 int r = 0;
12822
12823 // no coll or obj
12824 if (op->op == Transaction::OP_NOP)
12825 continue;
12826
12827
12828 // collection operations
12829 CollectionRef &c = cvec[op->cid];
12830
12831 // initialize osd_pool_id and do a smoke test that all collections belong
12832 // to the same pool
12833 spg_t pgid;
12834 if (!!c ? c->cid.is_pg(&pgid) : false) {
12835 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
12836 txc->osd_pool_id == pgid.pool());
12837 txc->osd_pool_id = pgid.pool();
12838 }
12839
12840 switch (op->op) {
12841 case Transaction::OP_RMCOLL:
12842 {
12843 const coll_t &cid = i.get_cid(op->cid);
12844 r = _remove_collection(txc, cid, &c);
12845 if (!r)
12846 continue;
12847 }
12848 break;
12849
12850 case Transaction::OP_MKCOLL:
12851 {
12852 ceph_assert(!c);
12853 const coll_t &cid = i.get_cid(op->cid);
12854 r = _create_collection(txc, cid, op->split_bits, &c);
12855 if (!r)
12856 continue;
12857 }
12858 break;
12859
12860 case Transaction::OP_SPLIT_COLLECTION:
12861 ceph_abort_msg("deprecated");
12862 break;
12863
12864 case Transaction::OP_SPLIT_COLLECTION2:
12865 {
12866 uint32_t bits = op->split_bits;
12867 uint32_t rem = op->split_rem;
12868 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
12869 if (!r)
12870 continue;
12871 }
12872 break;
12873
12874 case Transaction::OP_MERGE_COLLECTION:
12875 {
12876 uint32_t bits = op->split_bits;
12877 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
12878 if (!r)
12879 continue;
12880 }
12881 break;
12882
12883 case Transaction::OP_COLL_HINT:
12884 {
12885 uint32_t type = op->hint;
12886 bufferlist hint;
12887 i.decode_bl(hint);
12888 auto hiter = hint.cbegin();
12889 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
12890 uint32_t pg_num;
12891 uint64_t num_objs;
12892 decode(pg_num, hiter);
12893 decode(num_objs, hiter);
12894 dout(10) << __func__ << " collection hint objects is a no-op, "
12895 << " pg_num " << pg_num << " num_objects " << num_objs
12896 << dendl;
12897 } else {
12898 // Ignore the hint
12899 dout(10) << __func__ << " unknown collection hint " << type << dendl;
12900 }
12901 continue;
12902 }
12903 break;
12904
12905 case Transaction::OP_COLL_SETATTR:
12906 r = -EOPNOTSUPP;
12907 break;
12908
12909 case Transaction::OP_COLL_RMATTR:
12910 r = -EOPNOTSUPP;
12911 break;
12912
12913 case Transaction::OP_COLL_RENAME:
12914 ceph_abort_msg("not implemented");
12915 break;
12916 }
12917 if (r < 0) {
12918 derr << __func__ << " error " << cpp_strerror(r)
12919 << " not handled on operation " << op->op
12920 << " (op " << pos << ", counting from 0)" << dendl;
12921 _dump_transaction<0>(cct, t);
12922 ceph_abort_msg("unexpected error");
12923 }
12924
12925 // these operations implicitly create the object
12926 bool create = false;
12927 if (op->op == Transaction::OP_TOUCH ||
12928 op->op == Transaction::OP_CREATE ||
12929 op->op == Transaction::OP_WRITE ||
12930 op->op == Transaction::OP_ZERO) {
12931 create = true;
12932 }
12933
12934 // object operations
12935 std::unique_lock l(c->lock);
12936 OnodeRef &o = ovec[op->oid];
12937 if (!o) {
12938 ghobject_t oid = i.get_oid(op->oid);
12939 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
12940 }
12941 if (!create && (!o || !o->exists)) {
12942 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
12943 << i.get_oid(op->oid) << dendl;
12944 r = -ENOENT;
12945 goto endop;
12946 }
12947
12948 switch (op->op) {
12949 case Transaction::OP_CREATE:
12950 case Transaction::OP_TOUCH:
12951 r = _touch(txc, c, o);
12952 break;
12953
12954 case Transaction::OP_WRITE:
12955 {
12956 uint64_t off = op->off;
12957 uint64_t len = op->len;
12958 uint32_t fadvise_flags = i.get_fadvise_flags();
12959 bufferlist bl;
12960 i.decode_bl(bl);
12961 r = _write(txc, c, o, off, len, bl, fadvise_flags);
12962 }
12963 break;
12964
12965 case Transaction::OP_ZERO:
12966 {
12967 uint64_t off = op->off;
12968 uint64_t len = op->len;
12969 r = _zero(txc, c, o, off, len);
12970 }
12971 break;
12972
12973 case Transaction::OP_TRIMCACHE:
12974 {
12975 // deprecated, no-op
12976 }
12977 break;
12978
12979 case Transaction::OP_TRUNCATE:
12980 {
12981 uint64_t off = op->off;
12982 r = _truncate(txc, c, o, off);
12983 }
12984 break;
12985
12986 case Transaction::OP_REMOVE:
12987 {
12988 r = _remove(txc, c, o);
12989 }
12990 break;
12991
12992 case Transaction::OP_SETATTR:
12993 {
12994 string name = i.decode_string();
12995 bufferptr bp;
12996 i.decode_bp(bp);
12997 r = _setattr(txc, c, o, name, bp);
12998 }
12999 break;
13000
13001 case Transaction::OP_SETATTRS:
13002 {
13003 map<string, bufferptr> aset;
13004 i.decode_attrset(aset);
13005 r = _setattrs(txc, c, o, aset);
13006 }
13007 break;
13008
13009 case Transaction::OP_RMATTR:
13010 {
13011 string name = i.decode_string();
13012 r = _rmattr(txc, c, o, name);
13013 }
13014 break;
13015
13016 case Transaction::OP_RMATTRS:
13017 {
13018 r = _rmattrs(txc, c, o);
13019 }
13020 break;
13021
13022 case Transaction::OP_CLONE:
13023 {
13024 OnodeRef& no = ovec[op->dest_oid];
13025 if (!no) {
13026 const ghobject_t& noid = i.get_oid(op->dest_oid);
13027 no = c->get_onode(noid, true);
13028 }
13029 r = _clone(txc, c, o, no);
13030 }
13031 break;
13032
13033 case Transaction::OP_CLONERANGE:
13034 ceph_abort_msg("deprecated");
13035 break;
13036
13037 case Transaction::OP_CLONERANGE2:
13038 {
13039 OnodeRef& no = ovec[op->dest_oid];
13040 if (!no) {
13041 const ghobject_t& noid = i.get_oid(op->dest_oid);
13042 no = c->get_onode(noid, true);
13043 }
13044 uint64_t srcoff = op->off;
13045 uint64_t len = op->len;
13046 uint64_t dstoff = op->dest_off;
13047 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
13048 }
13049 break;
13050
13051 case Transaction::OP_COLL_ADD:
13052 ceph_abort_msg("not implemented");
13053 break;
13054
13055 case Transaction::OP_COLL_REMOVE:
13056 ceph_abort_msg("not implemented");
13057 break;
13058
13059 case Transaction::OP_COLL_MOVE:
13060 ceph_abort_msg("deprecated");
13061 break;
13062
13063 case Transaction::OP_COLL_MOVE_RENAME:
13064 case Transaction::OP_TRY_RENAME:
13065 {
13066 ceph_assert(op->cid == op->dest_cid);
13067 const ghobject_t& noid = i.get_oid(op->dest_oid);
13068 OnodeRef& no = ovec[op->dest_oid];
13069 if (!no) {
13070 no = c->get_onode(noid, false);
13071 }
13072 r = _rename(txc, c, o, no, noid);
13073 }
13074 break;
13075
13076 case Transaction::OP_OMAP_CLEAR:
13077 {
13078 r = _omap_clear(txc, c, o);
13079 }
13080 break;
13081 case Transaction::OP_OMAP_SETKEYS:
13082 {
13083 bufferlist aset_bl;
13084 i.decode_attrset_bl(&aset_bl);
13085 r = _omap_setkeys(txc, c, o, aset_bl);
13086 }
13087 break;
13088 case Transaction::OP_OMAP_RMKEYS:
13089 {
13090 bufferlist keys_bl;
13091 i.decode_keyset_bl(&keys_bl);
13092 r = _omap_rmkeys(txc, c, o, keys_bl);
13093 }
13094 break;
13095 case Transaction::OP_OMAP_RMKEYRANGE:
13096 {
13097 string first, last;
13098 first = i.decode_string();
13099 last = i.decode_string();
13100 r = _omap_rmkey_range(txc, c, o, first, last);
13101 }
13102 break;
13103 case Transaction::OP_OMAP_SETHEADER:
13104 {
13105 bufferlist bl;
13106 i.decode_bl(bl);
13107 r = _omap_setheader(txc, c, o, bl);
13108 }
13109 break;
13110
13111 case Transaction::OP_SETALLOCHINT:
13112 {
13113 r = _set_alloc_hint(txc, c, o,
13114 op->expected_object_size,
13115 op->expected_write_size,
13116 op->hint);
13117 }
13118 break;
13119
13120 default:
13121 derr << __func__ << " bad op " << op->op << dendl;
13122 ceph_abort();
13123 }
13124
13125 endop:
13126 if (r < 0) {
13127 bool ok = false;
13128
13129 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
13130 op->op == Transaction::OP_CLONE ||
13131 op->op == Transaction::OP_CLONERANGE2 ||
13132 op->op == Transaction::OP_COLL_ADD ||
13133 op->op == Transaction::OP_SETATTR ||
13134 op->op == Transaction::OP_SETATTRS ||
13135 op->op == Transaction::OP_RMATTR ||
13136 op->op == Transaction::OP_OMAP_SETKEYS ||
13137 op->op == Transaction::OP_OMAP_RMKEYS ||
13138 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
13139 op->op == Transaction::OP_OMAP_SETHEADER))
13140 // -ENOENT is usually okay
13141 ok = true;
13142 if (r == -ENODATA)
13143 ok = true;
13144
13145 if (!ok) {
13146 const char *msg = "unexpected error code";
13147
13148 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
13149 op->op == Transaction::OP_CLONE ||
13150 op->op == Transaction::OP_CLONERANGE2))
13151 msg = "ENOENT on clone suggests osd bug";
13152
13153 if (r == -ENOSPC)
13154 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
13155 // by partially applying transactions.
13156 msg = "ENOSPC from bluestore, misconfigured cluster";
13157
13158 if (r == -ENOTEMPTY) {
13159 msg = "ENOTEMPTY suggests garbage data in osd data dir";
13160 }
13161
13162 derr << __func__ << " error " << cpp_strerror(r)
13163 << " not handled on operation " << op->op
13164 << " (op " << pos << ", counting from 0)"
13165 << dendl;
13166 derr << msg << dendl;
13167 _dump_transaction<0>(cct, t);
13168 ceph_abort_msg("unexpected error");
13169 }
13170 }
13171 }
13172 }
13173
13174
13175
13176 // -----------------
13177 // write operations
13178
13179 int BlueStore::_touch(TransContext *txc,
13180 CollectionRef& c,
13181 OnodeRef &o)
13182 {
13183 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13184 int r = 0;
13185 _assign_nid(txc, o);
13186 txc->write_onode(o);
13187 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13188 return r;
13189 }
13190
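// _pad_zeros expands *bl and rewinds *offset so that both ends land on
// chunk_size boundaries, zero-filling the added bytes.  A worked example with
// hypothetical values: chunk_size = 0x1000, *offset = 0x1100, bl->length() =
// 0xe00.  The front pad is 0x100 and, since the padded data still ends short
// of the chunk boundary, the back pad is another 0x100; afterwards
// *offset == 0x1000 and bl->length() == 0x1000.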
13191 void BlueStore::_pad_zeros(
13192 bufferlist *bl, uint64_t *offset,
13193 uint64_t chunk_size)
13194 {
13195 auto length = bl->length();
13196 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
13197 << " chunk_size 0x" << chunk_size << std::dec << dendl;
13198 dout(40) << "before:\n";
13199 bl->hexdump(*_dout);
13200 *_dout << dendl;
13201 // front
13202 size_t front_pad = *offset % chunk_size;
13203 size_t back_pad = 0;
13204 size_t pad_count = 0;
13205 if (front_pad) {
13206 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
13207 bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
13208 z.zero(0, front_pad, false);
13209 pad_count += front_pad;
13210 bl->begin().copy(front_copy, z.c_str() + front_pad);
13211 if (front_copy + front_pad < chunk_size) {
13212 back_pad = chunk_size - (length + front_pad);
13213 z.zero(front_pad + length, back_pad, false);
13214 pad_count += back_pad;
13215 }
13216 bufferlist old, t;
13217 old.swap(*bl);
13218 t.substr_of(old, front_copy, length - front_copy);
13219 bl->append(z);
13220 bl->claim_append(t);
13221 *offset -= front_pad;
13222 length += pad_count;
13223 }
13224
13225 // back
13226 uint64_t end = *offset + length;
13227 unsigned back_copy = end % chunk_size;
13228 if (back_copy) {
13229 ceph_assert(back_pad == 0);
13230 back_pad = chunk_size - back_copy;
13231 ceph_assert(back_copy <= length);
13232 bufferptr tail(chunk_size);
13233 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
13234 tail.zero(back_copy, back_pad, false);
13235 bufferlist old;
13236 old.swap(*bl);
13237 bl->substr_of(old, 0, length - back_copy);
13238 bl->append(tail);
13239 length += back_pad;
13240 pad_count += back_pad;
13241 }
13242 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
13243 << back_pad << " on front/back, now 0x" << *offset << "~"
13244 << length << std::dec << dendl;
13245 dout(40) << "after:\n";
13246 bl->hexdump(*_dout);
13247 *_dout << dendl;
13248 if (pad_count)
13249 logger->inc(l_bluestore_write_pad_bytes, pad_count);
13250 ceph_assert(bl->length() == length);
13251 }
13252
13253 void BlueStore::_do_write_small(
13254 TransContext *txc,
13255 CollectionRef &c,
13256 OnodeRef o,
13257 uint64_t offset, uint64_t length,
13258 bufferlist::iterator& blp,
13259 WriteContext *wctx)
13260 {
13261 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13262 << std::dec << dendl;
13263 ceph_assert(length < min_alloc_size);
13264
13265 uint64_t end_offs = offset + length;
13266
13267 logger->inc(l_bluestore_write_small);
13268 logger->inc(l_bluestore_write_small_bytes, length);
13269
13270 bufferlist bl;
13271 blp.copy(length, bl);
13272
13273 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13274 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13275 uint32_t alloc_len = min_alloc_size;
13276 auto offset0 = p2align<uint64_t>(offset, alloc_len);
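// The p2* helpers (include/intarith.h) assume power-of-two alignments:
// p2align(x, a) rounds x down to a multiple of a, p2phase(x, a) returns
// x % a, and p2nphase(x, a) returns the distance from x up to the next
// multiple of a.  With illustrative values a = 0x10000 and x = 0x12345:
// p2align -> 0x10000, p2phase -> 0x2345, p2nphase -> 0xdcbb.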
13277
13278 bool any_change;
13279
13280 // search for a suitable extent in both forward and reverse directions in
13281 // the [offset - target_max_blob_size, offset + target_max_blob_size] range,
13282 // then check if the blob can be reused via can_reuse_blob() or apply a
13283 // direct/deferred write (the latter for extents including or higher
13284 // than 'offset' only).
13285 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
13286
13287 // On zoned devices, the first goal is to support non-overwrite workloads,
13288 // such as RGW, with large, aligned objects. Therefore, for user writes
13289 // _do_write_small should not trigger. OSDs, however, write and update a tiny
13290 // amount of metadata, such as OSD maps, to disk. For those cases, we
13291 // temporarily just pad them to min_alloc_size and write them to a new place
13292 // on every update.
13293 if (bdev->is_smr()) {
13294 BlobRef b = c->new_blob();
13295 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
13296 uint64_t b_off0 = b_off;
13297 _pad_zeros(&bl, &b_off0, min_alloc_size);
13298 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13299 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
13300 return;
13301 }
13302
13303 // Look for an existing mutable blob we can use.
13304 auto begin = o->extent_map.extent_map.begin();
13305 auto end = o->extent_map.extent_map.end();
13306 auto ep = o->extent_map.seek_lextent(offset);
13307 if (ep != begin) {
13308 --ep;
13309 if (ep->blob_end() <= offset) {
13310 ++ep;
13311 }
13312 }
13313 auto prev_ep = end;
13314 if (ep != begin) {
13315 prev_ep = ep;
13316 --prev_ep;
13317 }
13318
13319 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
13320 // We don't want to have more blobs than the number of min alloc units
13321 // that fit into 2 max-size blobs
13322 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
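// e.g. with illustrative values max_blob_size = 512 KiB and
// min_alloc_size = 64 KiB, blob_threshold = 512/64 * 2 + 1 = 17 blobs.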
13323 bool above_blob_threshold = false;
13324
13325 inspected_blobs.reserve(blob_threshold);
13326
13327 uint64_t max_off = 0;
13328 auto start_ep = ep;
13329 auto end_ep = ep; // exclusively
13330 do {
13331 any_change = false;
13332
13333 if (ep != end && ep->logical_offset < offset + max_bsize) {
13334 BlobRef b = ep->blob;
13335 if (!above_blob_threshold) {
13336 inspected_blobs.insert(&b->get_blob());
13337 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13338 }
13339 max_off = ep->logical_end();
13340 auto bstart = ep->blob_start();
13341
13342 dout(20) << __func__ << " considering " << *b
13343 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13344 if (bstart >= end_offs) {
13345 dout(20) << __func__ << " ignoring distant " << *b << dendl;
13346 } else if (!b->get_blob().is_mutable()) {
13347 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
13348 } else if (ep->logical_offset % min_alloc_size !=
13349 ep->blob_offset % min_alloc_size) {
13350 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
13351 } else {
13352 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13353 // can we pad our head/tail out with zeros?
13354 uint64_t head_pad, tail_pad;
13355 head_pad = p2phase(offset, chunk_size);
13356 tail_pad = p2nphase(end_offs, chunk_size);
13357 if (head_pad || tail_pad) {
13358 o->extent_map.fault_range(db, offset - head_pad,
13359 end_offs - offset + head_pad + tail_pad);
13360 }
13361 if (head_pad &&
13362 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
13363 head_pad = 0;
13364 }
13365 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
13366 tail_pad = 0;
13367 }
13368
13369 uint64_t b_off = offset - head_pad - bstart;
13370 uint64_t b_len = length + head_pad + tail_pad;
13371
13372 // direct write into unused blocks of an existing mutable blob?
13373 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
13374 b->get_blob().get_ondisk_length() >= b_off + b_len &&
13375 b->get_blob().is_unused(b_off, b_len) &&
13376 b->get_blob().is_allocated(b_off, b_len)) {
13377 _apply_padding(head_pad, tail_pad, bl);
13378
13379 dout(20) << __func__ << " write to unused 0x" << std::hex
13380 << b_off << "~" << b_len
13381 << " pad 0x" << head_pad << " + 0x" << tail_pad
13382 << std::dec << " of mutable " << *b << dendl;
13383 _buffer_cache_write(txc, b, b_off, bl,
13384 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13385
13386 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13387 if (b_len < prefer_deferred_size) {
13388 dout(20) << __func__ << " deferring small 0x" << std::hex
13389 << b_len << std::dec << " unused write via deferred" << dendl;
13390 bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
13391 op->op = bluestore_deferred_op_t::OP_WRITE;
13392 b->get_blob().map(
13393 b_off, b_len,
13394 [&](uint64_t offset, uint64_t length) {
13395 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13396 return 0;
13397 });
13398 op->data = bl;
13399 } else {
13400 b->get_blob().map_bl(
13401 b_off, bl,
13402 [&](uint64_t offset, bufferlist& t) {
13403 bdev->aio_write(offset, t,
13404 &txc->ioc, wctx->buffered);
13405 });
13406 }
13407 }
13408 b->dirty_blob().calc_csum(b_off, bl);
13409 dout(20) << __func__ << " lex old " << *ep << dendl;
13410 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
13411 b,
13412 &wctx->old_extents);
13413 b->dirty_blob().mark_used(le->blob_offset, le->length);
13414
13415 txc->statfs_delta.stored() += le->length;
13416 dout(20) << __func__ << " lex " << *le << dendl;
13417 logger->inc(l_bluestore_write_small_unused);
13418 return;
13419 }
13420 // read some data to fill out the chunk?
13421 uint64_t head_read = p2phase(b_off, chunk_size);
13422 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
13423 if ((head_read || tail_read) &&
13424 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
13425 head_read + tail_read < min_alloc_size) {
13426 b_off -= head_read;
13427 b_len += head_read + tail_read;
13428
13429 } else {
13430 head_read = tail_read = 0;
13431 }
13432
13433 // chunk-aligned deferred overwrite?
13434 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
13435 b_off % chunk_size == 0 &&
13436 b_len % chunk_size == 0 &&
13437 b->get_blob().is_allocated(b_off, b_len)) {
13438
13439 _apply_padding(head_pad, tail_pad, bl);
13440
13441 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
13442 << " and tail 0x" << tail_read << std::dec << dendl;
13443 if (head_read) {
13444 bufferlist head_bl;
13445 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
13446 head_bl, 0);
13447 ceph_assert(r >= 0 && r <= (int)head_read);
13448 size_t zlen = head_read - r;
13449 if (zlen) {
13450 head_bl.append_zero(zlen);
13451 logger->inc(l_bluestore_write_pad_bytes, zlen);
13452 }
13453 head_bl.claim_append(bl);
13454 bl.swap(head_bl);
13455 logger->inc(l_bluestore_write_penalty_read_ops);
13456 }
13457 if (tail_read) {
13458 bufferlist tail_bl;
13459 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
13460 tail_bl, 0);
13461 ceph_assert(r >= 0 && r <= (int)tail_read);
13462 size_t zlen = tail_read - r;
13463 if (zlen) {
13464 tail_bl.append_zero(zlen);
13465 logger->inc(l_bluestore_write_pad_bytes, zlen);
13466 }
13467 bl.claim_append(tail_bl);
13468 logger->inc(l_bluestore_write_penalty_read_ops);
13469 }
13470 logger->inc(l_bluestore_write_small_pre_read);
13471
13472 _buffer_cache_write(txc, b, b_off, bl,
13473 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13474
13475 b->dirty_blob().calc_csum(b_off, bl);
13476
13477 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13478 bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
13479 op->op = bluestore_deferred_op_t::OP_WRITE;
13480 int r = b->get_blob().map(
13481 b_off, b_len,
13482 [&](uint64_t offset, uint64_t length) {
13483 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13484 return 0;
13485 });
13486 ceph_assert(r == 0);
13487 op->data = std::move(bl);
13488 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
13489 << b_len << std::dec << " of mutable " << *b
13490 << " at " << op->extents << dendl;
13491 }
13492
13493 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
13494 b, &wctx->old_extents);
13495 b->dirty_blob().mark_used(le->blob_offset, le->length);
13496 txc->statfs_delta.stored() += le->length;
13497 dout(20) << __func__ << " lex " << *le << dendl;
13498 return;
13499 }
13500 // try to reuse blob if we can
13501 if (b->can_reuse_blob(min_alloc_size,
13502 max_bsize,
13503 offset0 - bstart,
13504 &alloc_len)) {
13505 ceph_assert(alloc_len == min_alloc_size); // expecting the data to always
13506 // fit into the reused blob
13507 // Need to check for pending writes that want to
13508 // reuse the same pextent. The rationale is that during GC two chunks
13509 // from garbage (compressed?) blobs can share logical space within the same
13510 // AU. That in turn might be caused by an unaligned len in clone_range2.
13511 // Hence the second write would fail when attempting to reuse the blob in
13512 // _do_alloc_write().
13513 if (!wctx->has_conflict(b,
13514 offset0,
13515 offset0 + alloc_len,
13516 min_alloc_size)) {
13517
13518 // we can't reuse head_pad/tail_pad since they might be truncated
13519 // due to existing extents
13520 uint64_t b_off = offset - bstart;
13521 uint64_t b_off0 = b_off;
13522 _pad_zeros(&bl, &b_off0, chunk_size);
13523
13524 dout(20) << __func__ << " reuse blob " << *b << std::hex
13525 << " (0x" << b_off0 << "~" << bl.length() << ")"
13526 << " (0x" << b_off << "~" << length << ")"
13527 << std::dec << dendl;
13528
13529 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13530 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13531 false, false);
13532 logger->inc(l_bluestore_write_small_unused);
13533 return;
13534 }
13535 }
13536 }
13537 ++ep;
13538 end_ep = ep;
13539 any_change = true;
13540 } // if (ep != end && ep->logical_offset < offset + max_bsize)
13541
13542 // check extent for reuse in reverse order
13543 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13544 BlobRef b = prev_ep->blob;
13545 if (!above_blob_threshold) {
13546 inspected_blobs.insert(&b->get_blob());
13547 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13548 }
13549 start_ep = prev_ep;
13550 auto bstart = prev_ep->blob_start();
13551 dout(20) << __func__ << " considering " << *b
13552 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13553 if (b->can_reuse_blob(min_alloc_size,
13554 max_bsize,
13555 offset0 - bstart,
13556 &alloc_len)) {
13557 ceph_assert(alloc_len == min_alloc_size); // expecting the data to always
13558 // fit into the reused blob
13559 // Need to check for pending writes that want to
13560 // reuse the same pextent. The rationale is that during GC two chunks
13561 // from garbage (compressed?) blobs can share logical space within the same
13562 // AU. That in turn might be caused by an unaligned len in clone_range2.
13563 // Hence the second write would fail when attempting to reuse the blob in
13564 // _do_alloc_write().
13565 if (!wctx->has_conflict(b,
13566 offset0,
13567 offset0 + alloc_len,
13568 min_alloc_size)) {
13569
13570 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13571 uint64_t b_off = offset - bstart;
13572 uint64_t b_off0 = b_off;
13573 _pad_zeros(&bl, &b_off0, chunk_size);
13574
13575 dout(20) << __func__ << " reuse blob " << *b << std::hex
13576 << " (0x" << b_off0 << "~" << bl.length() << ")"
13577 << " (0x" << b_off << "~" << length << ")"
13578 << std::dec << dendl;
13579
13580 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13581 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13582 false, false);
13583 logger->inc(l_bluestore_write_small_unused);
13584 return;
13585 }
13586 }
13587 if (prev_ep != begin) {
13588 --prev_ep;
13589 any_change = true;
13590 } else {
13591 prev_ep = end; // to avoid useless first extent re-check
13592 }
13593 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
13594 } while (any_change);
13595
13596 if (above_blob_threshold) {
13597 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
13598 << " " << std::hex << min_off << "~" << max_off << std::dec
13599 << dendl;
13600 ceph_assert(start_ep != end_ep);
13601 for (auto ep = start_ep; ep != end_ep; ++ep) {
13602 dout(20) << __func__ << " inserting for GC "
13603 << std::hex << ep->logical_offset << "~" << ep->length
13604 << std::dec << dendl;
13605
13606 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
13607 }
13608 // insert newly written extent to GC
13609 wctx->extents_to_gc.union_insert(offset, length);
13610 dout(20) << __func__ << " inserting (last) for GC "
13611 << std::hex << offset << "~" << length
13612 << std::dec << dendl;
13613 }
13614 // new blob.
13615 BlobRef b = c->new_blob();
13616 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
13617 uint64_t b_off0 = b_off;
13618 _pad_zeros(&bl, &b_off0, block_size);
13619 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13620 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13621 min_alloc_size != block_size, // use the 'unused' bitmap only when the alloc
13622 // granularity doesn't match the disk block size
13623 true);
13624
13625 return;
13626 }
13627
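// BigDeferredWriteContext::can_defer decides whether a big-write chunk landing
// on an existing mutable blob may be turned into a deferred read-modify-write
// instead of allocating new space.  Sketch with hypothetical values: for a blob
// starting at logical offset 0x0 with 0x20000 bytes on disk and chunk_size
// 0x1000, a write at offset 0x1800 of length 0x3000 yields b_off = 0x1800,
// head_read = 0x800, tail_read = 0x800, so the chunk-aligned span 0x1000~0x4000
// must be fully allocated and shorter than prefer_deferred_size to qualify.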
13628 bool BlueStore::BigDeferredWriteContext::can_defer(
13629 BlueStore::extent_map_t::iterator ep,
13630 uint64_t prefer_deferred_size,
13631 uint64_t block_size,
13632 uint64_t offset,
13633 uint64_t l)
13634 {
13635 bool res = false;
13636 auto& blob = ep->blob->get_blob();
13637 if (offset >= ep->blob_start() &&
13638 blob.is_mutable()) {
13639 off = offset;
13640 b_off = offset - ep->blob_start();
13641 uint64_t chunk_size = blob.get_chunk_size(block_size);
13642 uint64_t ondisk = blob.get_ondisk_length();
13643 used = std::min(l, ondisk - b_off);
13644
13645 // will we need to read some data to fill out the chunk?
13646 head_read = p2phase<uint64_t>(b_off, chunk_size);
13647 tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
13648 b_off -= head_read;
13649
13650 ceph_assert(b_off % chunk_size == 0);
13651 ceph_assert(blob_aligned_len() % chunk_size == 0);
13652
13653 res = blob_aligned_len() < prefer_deferred_size &&
13654 blob_aligned_len() <= ondisk &&
13655 blob.is_allocated(b_off, blob_aligned_len());
13656 if (res) {
13657 blob_ref = ep->blob;
13658 blob_start = ep->blob_start();
13659 }
13660 }
13661 return res;
13662 }
13663
13664 bool BlueStore::BigDeferredWriteContext::apply_defer()
13665 {
13666 int r = blob_ref->get_blob().map(
13667 b_off, blob_aligned_len(),
13668 [&](const bluestore_pextent_t& pext,
13669 uint64_t offset,
13670 uint64_t length) {
13671 // apply a deferred write only if the overwrite covers a pextent partially;
13672 // if it totally overlaps some pextent, fall back to a regular write
13673 if (pext.offset < offset ||
13674 pext.end() > offset + length) {
13675 res_extents.emplace_back(bluestore_pextent_t(offset, length));
13676 return 0;
13677 }
13678 return -1;
13679 });
13680 return r >= 0;
13681 }
13682
13683 void BlueStore::_do_write_big_apply_deferred(
13684 TransContext* txc,
13685 CollectionRef& c,
13686 OnodeRef o,
13687 BlueStore::BigDeferredWriteContext& dctx,
13688 bufferlist::iterator& blp,
13689 WriteContext* wctx)
13690 {
13691 bufferlist bl;
13692 dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
13693 << " and tail 0x" << dctx.tail_read << std::dec << dendl;
13694 if (dctx.head_read) {
13695 int r = _do_read(c.get(), o,
13696 dctx.off - dctx.head_read,
13697 dctx.head_read,
13698 bl,
13699 0);
13700 ceph_assert(r >= 0 && r <= (int)dctx.head_read);
13701 size_t zlen = dctx.head_read - r;
13702 if (zlen) {
13703 bl.append_zero(zlen);
13704 logger->inc(l_bluestore_write_pad_bytes, zlen);
13705 }
13706 logger->inc(l_bluestore_write_penalty_read_ops);
13707 }
13708 blp.copy(dctx.used, bl);
13709
13710 if (dctx.tail_read) {
13711 bufferlist tail_bl;
13712 int r = _do_read(c.get(), o,
13713 dctx.off + dctx.used, dctx.tail_read,
13714 tail_bl, 0);
13715 ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
13716 size_t zlen = dctx.tail_read - r;
13717 if (zlen) {
13718 tail_bl.append_zero(zlen);
13719 logger->inc(l_bluestore_write_pad_bytes, zlen);
13720 }
13721 bl.claim_append(tail_bl);
13722 logger->inc(l_bluestore_write_penalty_read_ops);
13723 }
13724 auto& b0 = dctx.blob_ref;
13725 _buffer_cache_write(txc, b0, dctx.b_off, bl,
13726 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13727
13728 b0->dirty_blob().calc_csum(dctx.b_off, bl);
13729
13730 Extent* le = o->extent_map.set_lextent(c, dctx.off,
13731 dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);
13732
13733 // in fact this is a no-op for big writes, but it is left here to maintain
13734 // uniformity and to avoid it being missed after some future refactor.
13735 b0->dirty_blob().mark_used(le->blob_offset, le->length);
13736 txc->statfs_delta.stored() += le->length;
13737
13738 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13739 bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length());
13740 op->op = bluestore_deferred_op_t::OP_WRITE;
13741 op->extents.swap(dctx.res_extents);
13742 op->data = std::move(bl);
13743 }
13744 }
13745
13746 void BlueStore::_do_write_big(
13747 TransContext *txc,
13748 CollectionRef &c,
13749 OnodeRef o,
13750 uint64_t offset, uint64_t length,
13751 bufferlist::iterator& blp,
13752 WriteContext *wctx)
13753 {
13754 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13755 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
13756 << " compress " << (int)wctx->compress
13757 << dendl;
13758 logger->inc(l_bluestore_write_big);
13759 logger->inc(l_bluestore_write_big_bytes, length);
13760 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13761 uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
13762 while (length > 0) {
13763 bool new_blob = false;
13764 BlobRef b;
13765 uint32_t b_off = 0;
13766 uint32_t l = 0;
13767
13768 // attempt to reuse an existing blob
13769 if (!wctx->compress) {
13770 // enforce target blob alignment with max_bsize
13771 l = max_bsize - p2phase(offset, max_bsize);
13772 l = std::min(uint64_t(l), length);
13773
13774 auto end = o->extent_map.extent_map.end();
13775
13776 dout(20) << __func__ << " may be defer: 0x" << std::hex
13777 << offset << "~" << l
13778 << std::dec << dendl;
13779
13780 if (prefer_deferred_size_snapshot &&
13781 l <= prefer_deferred_size_snapshot * 2) {
13782 // A single write that spans two adjacent existing blobs can result
13783 // in up to two deferred blocks of 'prefer_deferred_size'.
13784 // So we try to minimize the number of resulting blobs
13785 // and preserve the 2 existing blobs rather than insert one more in between.
13786 // E.g. writing 0x10000~20000 over existing blobs
13787 // (0x0~20000 and 0x20000~20000) is better (from a subsequent read
13788 // performance point of view) handled as two deferred writes to the
13789 // existing blobs than as 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
13790
13791 // look for an existing mutable blob we can write into
13792 auto ep = o->extent_map.seek_lextent(offset);
13793 auto ep_next = end;
13794 BigDeferredWriteContext head_info, tail_info;
13795
13796 bool will_defer = ep != end ?
13797 head_info.can_defer(ep,
13798 prefer_deferred_size_snapshot,
13799 block_size,
13800 offset,
13801 l) :
13802 false;
13803 auto offset_next = offset + head_info.used;
13804 auto remaining = l - head_info.used;
13805 if (will_defer && remaining) {
13806 will_defer = false;
13807 if (remaining <= prefer_deferred_size_snapshot) {
13808 ep_next = o->extent_map.seek_lextent(offset_next);
13809 // check if we can defer the remaining part entirely
13810 will_defer = ep_next == end ?
13811 false :
13812 tail_info.can_defer(ep_next,
13813 prefer_deferred_size_snapshot,
13814 block_size,
13815 offset_next,
13816 remaining);
13817 will_defer = will_defer && remaining == tail_info.used;
13818 }
13819 }
13820 if (will_defer) {
13821 dout(20) << __func__ << " " << *(head_info.blob_ref)
13822 << " deferring big " << std::hex
13823 << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
13824 << std::dec << " write via deferred"
13825 << dendl;
13826 if (remaining) {
13827 dout(20) << __func__ << " " << *(tail_info.blob_ref)
13828 << " deferring big " << std::hex
13829 << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
13830 << std::dec << " write via deferred"
13831 << dendl;
13832 }
13833
13834 will_defer = head_info.apply_defer();
13835 if (!will_defer) {
13836 dout(20) << __func__
13837 << " deferring big fell back, head isn't continuous"
13838 << dendl;
13839 } else if (remaining) {
13840 will_defer = tail_info.apply_defer();
13841 if (!will_defer) {
13842 dout(20) << __func__
13843 << " deferring big fell back, tail isn't continuous"
13844 << dendl;
13845 }
13846 }
13847 }
13848 if (will_defer) {
13849 _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
13850 if (remaining) {
13851 _do_write_big_apply_deferred(txc, c, o, tail_info,
13852 blp, wctx);
13853 }
13854 dout(20) << __func__ << " defer big: 0x" << std::hex
13855 << offset << "~" << l
13856 << std::dec << dendl;
13857 offset += l;
13858 length -= l;
13859 logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
13860 logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
13861 continue;
13862 }
13863 }
13864 dout(20) << __func__ << " lookup for blocks to reuse..." << dendl;
13865
13866 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
13867
13868 // seek again as punch_hole could invalidate ep
13869 auto ep = o->extent_map.seek_lextent(offset);
13870 auto begin = o->extent_map.extent_map.begin();
13871 auto prev_ep = end;
13872 if (ep != begin) {
13873 prev_ep = ep;
13874 --prev_ep;
13875 }
13876
13877 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13878 // search for a suitable extent in both forward and reverse directions in
13879 // the [offset - target_max_blob_size, offset + target_max_blob_size] range,
13880 // then check if the blob can be reused via can_reuse_blob().
13881 bool any_change;
13882 do {
13883 any_change = false;
13884 if (ep != end && ep->logical_offset < offset + max_bsize) {
13885 dout(20) << __func__ << " considering " << *ep
13886 << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
13887
13888 if (offset >= ep->blob_start() &&
13889 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13890 offset - ep->blob_start(),
13891 &l)) {
13892 b = ep->blob;
13893 b_off = offset - ep->blob_start();
13894 prev_ep = end; // to avoid check below
13895 dout(20) << __func__ << " reuse blob " << *b << std::hex
13896 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13897 } else {
13898 ++ep;
13899 any_change = true;
13900 }
13901 }
13902
13903 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13904 dout(20) << __func__ << " considering rev " << *prev_ep
13905 << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
13906 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13907 offset - prev_ep->blob_start(),
13908 &l)) {
13909 b = prev_ep->blob;
13910 b_off = offset - prev_ep->blob_start();
13911 dout(20) << __func__ << " reuse blob " << *b << std::hex
13912 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13913 } else if (prev_ep != begin) {
13914 --prev_ep;
13915 any_change = true;
13916 } else {
13917 prev_ep = end; // to avoid useless first extent re-check
13918 }
13919 }
13920 } while (b == nullptr && any_change);
13921 } else {
13922 // try to use as long a chunk as permitted in the compression case.
13923 l = std::min(max_bsize, length);
13924 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
13925 } // if (!wctx->compress)
13926
13927 if (b == nullptr) {
13928 b = c->new_blob();
13929 b_off = 0;
13930 new_blob = true;
13931 }
13932 bufferlist t;
13933 blp.copy(l, t);
13934 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
13935 dout(20) << __func__ << " schedule write big: 0x"
13936 << std::hex << offset << "~" << l << std::dec
13937 << (new_blob ? " new " : " reuse ")
13938 << *b << dendl;
13939 offset += l;
13940 length -= l;
13941 logger->inc(l_bluestore_write_big_blobs);
13942 }
13943 }
13944
13945 int BlueStore::_do_alloc_write(
13946 TransContext *txc,
13947 CollectionRef coll,
13948 OnodeRef o,
13949 WriteContext *wctx)
13950 {
13951 dout(20) << __func__ << " txc " << txc
13952 << " " << wctx->writes.size() << " blobs"
13953 << dendl;
13954 if (wctx->writes.empty()) {
13955 return 0;
13956 }
13957
13958 CompressorRef c;
13959 double crr = 0;
13960 if (wctx->compress) {
13961 c = select_option(
13962 "compression_algorithm",
13963 compressor,
13964 [&]() {
13965 string val;
13966 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
13967 CompressorRef cp = compressor;
13968 if (!cp || cp->get_type_name() != val) {
13969 cp = Compressor::create(cct, val);
13970 if (!cp) {
13971 if (_set_compression_alert(false, val.c_str())) {
13972 derr << __func__ << " unable to initialize " << val.c_str()
13973 << " compressor" << dendl;
13974 }
13975 }
13976 }
13977 return boost::optional<CompressorRef>(cp);
13978 }
13979 return boost::optional<CompressorRef>();
13980 }
13981 );
13982
13983 crr = select_option(
13984 "compression_required_ratio",
13985 cct->_conf->bluestore_compression_required_ratio,
13986 [&]() {
13987 double val;
13988 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
13989 return boost::optional<double>(val);
13990 }
13991 return boost::optional<double>();
13992 }
13993 );
13994 }
13995
13996 // checksum
13997 int64_t csum = csum_type.load();
13998 csum = select_option(
13999 "csum_type",
14000 csum,
14001 [&]() {
14002 int64_t val;
14003 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
14004 return boost::optional<int64_t>(val);
14005 }
14006 return boost::optional<int64_t>();
14007 }
14008 );
14009
14010 // compress (as needed) and calc needed space
14011 uint64_t need = 0;
14012 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
14013 for (auto& wi : wctx->writes) {
14014 if (c && wi.blob_length > min_alloc_size) {
14015 auto start = mono_clock::now();
14016
14017 // compress
14018 ceph_assert(wi.b_off == 0);
14019 ceph_assert(wi.blob_length == wi.bl.length());
14020
14021 // FIXME: memory alignment here is bad
14022 bufferlist t;
14023 boost::optional<int32_t> compressor_message;
14024 int r = c->compress(wi.bl, t, compressor_message);
14025 uint64_t want_len_raw = wi.blob_length * crr;
14026 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
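// e.g. with hypothetical values blob_length = 0x10000, crr = 0.875 and
// min_alloc_size = 0x1000: want_len = 0xe000, so a compressed result of
// 0xa123 bytes (rounded up to 0xb000) is accepted, while anything that
// rounds up above 0xe000 is rejected and stored uncompressed.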
14027 bool rejected = false;
14028 uint64_t compressed_len = t.length();
14029 // do an approximate (fast) estimation of the resulting blob size
14030 // that doesn't take the header overhead into account
14031 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
14032 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
14033 bluestore_compression_header_t chdr;
14034 chdr.type = c->get_type();
14035 chdr.length = t.length();
14036 chdr.compressor_message = compressor_message;
14037 encode(chdr, wi.compressed_bl);
14038 wi.compressed_bl.claim_append(t);
14039
14040 compressed_len = wi.compressed_bl.length();
14041 result_len = p2roundup(compressed_len, min_alloc_size);
14042 if (result_len <= want_len && result_len < wi.blob_length) {
14043 // Cool. We compressed at least as much as we were hoping to.
14044 // pad out to min_alloc_size
14045 wi.compressed_bl.append_zero(result_len - compressed_len);
14046 wi.compressed_len = compressed_len;
14047 wi.compressed = true;
14048 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
14049 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
14050 << " -> 0x" << compressed_len << " => 0x" << result_len
14051 << " with " << c->get_type()
14052 << std::dec << dendl;
14053 txc->statfs_delta.compressed() += compressed_len;
14054 txc->statfs_delta.compressed_original() += wi.blob_length;
14055 txc->statfs_delta.compressed_allocated() += result_len;
14056 logger->inc(l_bluestore_compress_success_count);
14057 need += result_len;
14058 } else {
14059 rejected = true;
14060 }
14061 } else if (r != 0) {
14062 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
14063 << " bytes compressed using " << c->get_type_name()
14064 << std::dec
14065 << " failed with errcode = " << r
14066 << ", leaving uncompressed"
14067 << dendl;
14068 logger->inc(l_bluestore_compress_rejected_count);
14069 need += wi.blob_length;
14070 } else {
14071 rejected = true;
14072 }
14073
14074 if (rejected) {
14075 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
14076 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
14077 << " with " << c->get_type()
14078 << ", which is more than required 0x" << want_len_raw
14079 << " -> 0x" << want_len
14080 << ", leaving uncompressed"
14081 << std::dec << dendl;
14082 logger->inc(l_bluestore_compress_rejected_count);
14083 need += wi.blob_length;
14084 }
14085 log_latency("compress@_do_alloc_write",
14086 l_bluestore_compress_lat,
14087 mono_clock::now() - start,
14088 cct->_conf->bluestore_log_op_age );
14089 } else {
14090 need += wi.blob_length;
14091 }
14092 }
14093 PExtentVector prealloc;
14094 prealloc.reserve(2 * wctx->writes.size());
14095 int64_t prealloc_left = 0;
14096 prealloc_left = shared_alloc.a->allocate(
14097 need, min_alloc_size, need,
14098 0, &prealloc);
14099 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
14100 derr << __func__ << " failed to allocate 0x" << std::hex << need
14101 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
14102 << " min_alloc_size 0x" << min_alloc_size
14103 << " available 0x " << shared_alloc.a->get_free()
14104 << std::dec << dendl;
14105 if (prealloc.size()) {
14106 shared_alloc.a->release(prealloc);
14107 }
14108 return -ENOSPC;
14109 }
14110 _collect_allocation_stats(need, min_alloc_size, prealloc.size());
14111
14112 if (bdev->is_smr()) {
14113 std::deque<uint64_t> zones_to_clean;
14114 if (shared_alloc.a->zoned_get_zones_to_clean(&zones_to_clean)) {
14115 std::lock_guard l{zoned_cleaner_lock};
14116 zoned_cleaner_queue.swap(zones_to_clean);
14117 zoned_cleaner_cond.notify_one();
14118 }
14119 }
14120
14121 dout(20) << __func__ << " prealloc " << prealloc << dendl;
14122 auto prealloc_pos = prealloc.begin();
14123 ceph_assert(prealloc_pos != prealloc.end());
14124 uint64_t prealloc_pos_length = prealloc_pos->length;
14125
14126 for (auto& wi : wctx->writes) {
14127 bluestore_blob_t& dblob = wi.b->dirty_blob();
14128 uint64_t b_off = wi.b_off;
14129 bufferlist *l = &wi.bl;
14130 uint64_t final_length = wi.blob_length;
14131 uint64_t csum_length = wi.blob_length;
14132 if (wi.compressed) {
14133 final_length = wi.compressed_bl.length();
14134 csum_length = final_length;
14135 unsigned csum_order = ctz(csum_length);
14136 l = &wi.compressed_bl;
14137 dblob.set_compressed(wi.blob_length, wi.compressed_len);
14138 if (csum != Checksummer::CSUM_NONE) {
14139 dout(20) << __func__
14140 << " initialize csum setting for compressed blob " << *wi.b
14141 << " csum_type " << Checksummer::get_csum_type_string(csum)
14142 << " csum_order " << csum_order
14143 << " csum_length 0x" << std::hex << csum_length
14144 << " blob_length 0x" << wi.blob_length
14145 << " compressed_length 0x" << wi.compressed_len << std::dec
14146 << dendl;
14147 dblob.init_csum(csum, csum_order, csum_length);
14148 }
14149 } else if (wi.new_blob) {
14150 unsigned csum_order;
14151 // initialize newly created blob only
14152 ceph_assert(dblob.is_mutable());
14153 if (l->length() != wi.blob_length) {
14154 // hrm, maybe we could do better here, but let's not bother.
14155 dout(20) << __func__ << " forcing csum_order to block_size_order "
14156 << block_size_order << dendl;
14157 csum_order = block_size_order;
14158 } else {
14159 csum_order = std::min(wctx->csum_order, ctz(l->length()));
14160 }
14161 // try to align blob with max_blob_size to improve
14162 // its reuse ratio, e.g. in case of reverse write
14163 uint32_t suggested_boff =
14164 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
14165 if ((suggested_boff % (1 << csum_order)) == 0 &&
14166 suggested_boff + final_length <= max_bsize &&
14167 suggested_boff > b_off) {
14168 dout(20) << __func__ << " forcing blob_offset to 0x"
14169 << std::hex << suggested_boff << std::dec << dendl;
14170 ceph_assert(suggested_boff >= b_off);
14171 csum_length += suggested_boff - b_off;
14172 b_off = suggested_boff;
14173 }
14174 if (csum != Checksummer::CSUM_NONE) {
14175 dout(20) << __func__
14176 << " initialize csum setting for new blob " << *wi.b
14177 << " csum_type " << Checksummer::get_csum_type_string(csum)
14178 << " csum_order " << csum_order
14179 << " csum_length 0x" << std::hex << csum_length << std::dec
14180 << dendl;
14181 dblob.init_csum(csum, csum_order, csum_length);
14182 }
14183 }
14184
14185 PExtentVector extents;
14186 int64_t left = final_length;
14187 bool has_chunk2defer = false;
14188 auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
14189 while (left > 0) {
14190 ceph_assert(prealloc_left > 0);
14191 has_chunk2defer |= (prealloc_pos_length < prefer_deferred_size_snapshot);
14192 if (prealloc_pos->length <= left) {
14193 prealloc_left -= prealloc_pos->length;
14194 left -= prealloc_pos->length;
14195 txc->statfs_delta.allocated() += prealloc_pos->length;
14196 extents.push_back(*prealloc_pos);
14197 ++prealloc_pos;
14198 if (prealloc_pos != prealloc.end()) {
14199 prealloc_pos_length = prealloc_pos->length;
14200 }
14201 } else {
14202 extents.emplace_back(prealloc_pos->offset, left);
14203 prealloc_pos->offset += left;
14204 prealloc_pos->length -= left;
14205 prealloc_left -= left;
14206 txc->statfs_delta.allocated() += left;
14207 left = 0;
14208 break;
14209 }
14210 }
14211 for (auto& p : extents) {
14212 txc->allocated.insert(p.offset, p.length);
14213 }
14214 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
14215
14216 dout(20) << __func__ << " blob " << *wi.b << dendl;
14217 if (dblob.has_csum()) {
14218 dblob.calc_csum(b_off, *l);
14219 }
14220
14221 if (wi.mark_unused) {
14222 ceph_assert(!dblob.is_compressed());
14223 auto b_end = b_off + wi.bl.length();
14224 if (b_off) {
14225 dblob.add_unused(0, b_off);
14226 }
14227 uint64_t llen = dblob.get_logical_length();
14228 if (b_end < llen) {
14229 dblob.add_unused(b_end, llen - b_end);
14230 }
14231 }
14232
14233 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
14234 b_off + (wi.b_off0 - wi.b_off),
14235 wi.length0,
14236 wi.b,
14237 nullptr);
14238 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
14239 txc->statfs_delta.stored() += le->length;
14240 dout(20) << __func__ << " lex " << *le << dendl;
14241 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
14242 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14243
14244 // queue io
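// e.g. with a hypothetical prefer_deferred_size of 0x10000, a 0x4000 write
// whose allocation includes chunks smaller than that limit goes through the
// deferred path (recorded in the KV transaction and applied to the device
// later), while larger writes are submitted directly via aio_write.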
14245 if (!g_conf()->bluestore_debug_omit_block_device_write) {
14246 if (has_chunk2defer && l->length() < prefer_deferred_size_snapshot) {
14247 dout(20) << __func__ << " deferring 0x" << std::hex
14248 << l->length() << std::dec << " write via deferred" << dendl;
14249 bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
14250 op->op = bluestore_deferred_op_t::OP_WRITE;
14251 int r = wi.b->get_blob().map(
14252 b_off, l->length(),
14253 [&](uint64_t offset, uint64_t length) {
14254 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14255 return 0;
14256 });
14257 ceph_assert(r == 0);
14258 op->data = *l;
14259 } else {
14260 wi.b->get_blob().map_bl(
14261 b_off, *l,
14262 [&](uint64_t offset, bufferlist& t) {
14263 bdev->aio_write(offset, t, &txc->ioc, false);
14264 });
14265 logger->inc(l_bluestore_write_new);
14266 }
14267 }
14268 }
14269 ceph_assert(prealloc_pos == prealloc.end());
14270 ceph_assert(prealloc_left == 0);
14271 return 0;
14272 }
14273
14274 void BlueStore::_wctx_finish(
14275 TransContext *txc,
14276 CollectionRef& c,
14277 OnodeRef o,
14278 WriteContext *wctx,
14279 set<SharedBlob*> *maybe_unshared_blobs)
14280 {
14281 auto oep = wctx->old_extents.begin();
14282 while (oep != wctx->old_extents.end()) {
14283 auto &lo = *oep;
14284 oep = wctx->old_extents.erase(oep);
14285 dout(20) << __func__ << " lex_old " << lo.e << dendl;
14286 BlobRef b = lo.e.blob;
14287 const bluestore_blob_t& blob = b->get_blob();
14288 if (blob.is_compressed()) {
14289 if (lo.blob_empty) {
14290 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
14291 }
14292 txc->statfs_delta.compressed_original() -= lo.e.length;
14293 }
14294 auto& r = lo.r;
14295 txc->statfs_delta.stored() -= lo.e.length;
14296 if (!r.empty()) {
14297 dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
14298 if (blob.is_shared()) {
14299 PExtentVector final;
14300 c->load_shared_blob(b->shared_blob);
14301 bool unshare = false;
14302 bool* unshare_ptr =
14303 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
14304 for (auto e : r) {
14305 b->shared_blob->put_ref(
14306 e.offset, e.length, &final,
14307 unshare_ptr);
14308 }
14309 if (unshare) {
14310 ceph_assert(maybe_unshared_blobs);
14311 maybe_unshared_blobs->insert(b->shared_blob.get());
14312 }
14313 dout(20) << __func__ << " shared_blob release " << final
14314 << " from " << *b->shared_blob << dendl;
14315 txc->write_shared_blob(b->shared_blob);
14316 r.clear();
14317 r.swap(final);
14318 }
14319 }
14320 // we can't invalidate our logical extents as we drop them because
14321 // other lextents (either in our onode or others) may still
14322 // reference them. but we can throw out anything that is no
14323 // longer allocated. Note that this will leave behind edge bits
14324 // that are no longer referenced but not deallocated (until they
14325 // age out of the cache naturally).
14326 b->discard_unallocated(c.get());
14327 for (auto e : r) {
14328 dout(20) << __func__ << " release " << e << dendl;
14329 txc->released.insert(e.offset, e.length);
14330 txc->statfs_delta.allocated() -= e.length;
14331 if (blob.is_compressed()) {
14332 txc->statfs_delta.compressed_allocated() -= e.length;
14333 }
14334 }
14335
14336 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
14337 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
14338 << dendl;
14339 o->extent_map.spanning_blob_map.erase(b->id);
14340 }
14341 delete &lo;
14342 }
14343 }
14344
14345 void BlueStore::_do_write_data(
14346 TransContext *txc,
14347 CollectionRef& c,
14348 OnodeRef o,
14349 uint64_t offset,
14350 uint64_t length,
14351 bufferlist& bl,
14352 WriteContext *wctx)
14353 {
14354 uint64_t end = offset + length;
14355 bufferlist::iterator p = bl.begin();
14356
14357 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
14358 (length != min_alloc_size)) {
14359 // we fall within the same block
14360 _do_write_small(txc, c, o, offset, length, p, wctx);
14361 } else {
14362 uint64_t head_offset, head_length;
14363 uint64_t middle_offset, middle_length;
14364 uint64_t tail_offset, tail_length;
14365
14366 head_offset = offset;
14367 head_length = p2nphase(offset, min_alloc_size);
14368
14369 tail_offset = p2align(end, min_alloc_size);
14370 tail_length = p2phase(end, min_alloc_size);
14371
14372 middle_offset = head_offset + head_length;
14373 middle_length = length - head_length - tail_length;
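// e.g. with illustrative values min_alloc_size = 0x10000, offset = 0x1c000,
// length = 0x2a000: head = 0x1c000~0x4000 and tail = 0x40000~0x6000 go through
// _do_write_small, while the aligned middle 0x20000~0x20000 goes through
// _do_write_big.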
14374
14375 if (head_length) {
14376 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
14377 }
14378
14379 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
14380
14381 if (tail_length) {
14382 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
14383 }
14384 }
14385 }
14386
14387 void BlueStore::_choose_write_options(
14388 CollectionRef& c,
14389 OnodeRef o,
14390 uint32_t fadvise_flags,
14391 WriteContext *wctx)
14392 {
14393 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
14394 dout(20) << __func__ << " will do buffered write" << dendl;
14395 wctx->buffered = true;
14396 } else if (cct->_conf->bluestore_default_buffered_write &&
14397 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
14398 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
14399 dout(20) << __func__ << " defaulting to buffered write" << dendl;
14400 wctx->buffered = true;
14401 }
14402
14403 // apply basic csum block size
14404 wctx->csum_order = block_size_order;
14405
14406 // compression parameters
14407 unsigned alloc_hints = o->onode.alloc_hint_flags;
14408 auto cm = select_option(
14409 "compression_mode",
14410 comp_mode.load(),
14411 [&]() {
14412 string val;
14413 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
14414 return boost::optional<Compressor::CompressionMode>(
14415 Compressor::get_comp_mode_type(val));
14416 }
14417 return boost::optional<Compressor::CompressionMode>();
14418 }
14419 );
14420
14421 wctx->compress = (cm != Compressor::COMP_NONE) &&
14422 ((cm == Compressor::COMP_FORCE) ||
14423 (cm == Compressor::COMP_AGGRESSIVE &&
14424 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
14425 (cm == Compressor::COMP_PASSIVE &&
14426 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
14427
14428 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
14429 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
14430 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
14431 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
14432 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
14433
14434 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
14435
14436 if (o->onode.expected_write_size) {
14437 wctx->csum_order = std::max(min_alloc_size_order,
14438 (uint8_t)ctz(o->onode.expected_write_size));
14439 } else {
14440 wctx->csum_order = min_alloc_size_order;
14441 }
14442
14443 if (wctx->compress) {
14444 wctx->target_blob_size = select_option(
14445 "compression_max_blob_size",
14446 comp_max_blob_size.load(),
14447 [&]() {
14448 int64_t val;
14449 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
14450 return boost::optional<uint64_t>((uint64_t)val);
14451 }
14452 return boost::optional<uint64_t>();
14453 }
14454 );
14455 }
14456 } else {
14457 if (wctx->compress) {
14458 wctx->target_blob_size = select_option(
14459 "compression_min_blob_size",
14460 comp_min_blob_size.load(),
14461 [&]() {
14462 int64_t val;
14463 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
14464 return boost::optional<uint64_t>((uint64_t)val);
14465 }
14466 return boost::optional<uint64_t>();
14467 }
14468 );
14469 }
14470 }
14471
14472 uint64_t max_bsize = max_blob_size.load();
14473 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
14474 wctx->target_blob_size = max_bsize;
14475 }
14476
14477 // set the min blob size floor at 2x the min_alloc_size, or else we
14478 // won't be able to allocate a smaller extent for the compressed
14479 // data.
14480 if (wctx->compress &&
14481 wctx->target_blob_size < min_alloc_size * 2) {
14482 wctx->target_blob_size = min_alloc_size * 2;
14483 }
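// e.g. with an illustrative min_alloc_size of 0x10000 the floor above forces
// target_blob_size to at least 0x20000, so a blob that compresses 2:1 still
// shrinks to a single allocation unit instead of two.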
14484
14485 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
14486 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
14487 << " compress=" << (int)wctx->compress
14488 << " buffered=" << (int)wctx->buffered
14489 << std::dec << dendl;
14490 }
14491
14492 int BlueStore::_do_gc(
14493 TransContext *txc,
14494 CollectionRef& c,
14495 OnodeRef o,
14496 const WriteContext& wctx,
14497 uint64_t *dirty_start,
14498 uint64_t *dirty_end)
14499 {
14500
14501 bool dirty_range_updated = false;
14502 WriteContext wctx_gc;
14503 wctx_gc.fork(wctx); // make a clone for garbage collection
14504
14505 auto & extents_to_collect = wctx.extents_to_gc;
14506 for (auto it = extents_to_collect.begin();
14507 it != extents_to_collect.end();
14508 ++it) {
14509 bufferlist bl;
14510 auto offset = (*it).first;
14511 auto length = (*it).second;
14512 dout(20) << __func__ << " processing " << std::hex
14513 << offset << "~" << length << std::dec
14514 << dendl;
14515 int r = _do_read(c.get(), o, offset, length, bl, 0);
14516 ceph_assert(r == (int)length);
14517
14518 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
14519 logger->inc(l_bluestore_gc_merged, length);
14520
14521 if (*dirty_start > offset) {
14522 *dirty_start = offset;
14523 dirty_range_updated = true;
14524 }
14525
14526 if (*dirty_end < offset + length) {
14527 *dirty_end = offset + length;
14528 dirty_range_updated = true;
14529 }
14530 }
14531 if (dirty_range_updated) {
14532 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
14533 }
14534
14535 dout(30) << __func__ << " alloc write" << dendl;
14536 int r = _do_alloc_write(txc, c, o, &wctx_gc);
14537 if (r < 0) {
14538 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14539 << dendl;
14540 return r;
14541 }
14542
14543 _wctx_finish(txc, c, o, &wctx_gc);
14544 return 0;
14545 }
14546
14547 int BlueStore::_do_write(
14548 TransContext *txc,
14549 CollectionRef& c,
14550 OnodeRef o,
14551 uint64_t offset,
14552 uint64_t length,
14553 bufferlist& bl,
14554 uint32_t fadvise_flags)
14555 {
14556 int r = 0;
14557
14558 dout(20) << __func__
14559 << " " << o->oid
14560 << " 0x" << std::hex << offset << "~" << length
14561 << " - have 0x" << o->onode.size
14562 << " (" << std::dec << o->onode.size << ")"
14563 << " bytes" << std::hex
14564 << " fadvise_flags 0x" << fadvise_flags
14565 << " alloc_hint 0x" << o->onode.alloc_hint_flags
14566 << " expected_object_size " << o->onode.expected_object_size
14567 << " expected_write_size " << o->onode.expected_write_size
14568 << std::dec
14569 << dendl;
14570 _dump_onode<30>(cct, *o);
14571
14572 if (length == 0) {
14573 return 0;
14574 }
14575
14576 uint64_t end = offset + length;
14577
14578 GarbageCollector gc(c->store->cct);
14579 int64_t benefit = 0;
14580 auto dirty_start = offset;
14581 auto dirty_end = end;
14582
14583 WriteContext wctx;
14584 _choose_write_options(c, o, fadvise_flags, &wctx);
14585 o->extent_map.fault_range(db, offset, length);
14586 _do_write_data(txc, c, o, offset, length, bl, &wctx);
14587 r = _do_alloc_write(txc, c, o, &wctx);
14588 if (r < 0) {
14589 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14590 << dendl;
14591 goto out;
14592 }
14593
14594 if (wctx.extents_to_gc.empty() ||
14595 wctx.extents_to_gc.range_start() > offset ||
14596 wctx.extents_to_gc.range_end() < offset + length) {
14597 benefit = gc.estimate(offset,
14598 length,
14599 o->extent_map,
14600 wctx.old_extents,
14601 min_alloc_size);
14602 }
14603
14604 if (bdev->is_smr()) {
14605 if (wctx.old_extents.empty()) {
14606 txc->zoned_note_new_object(o);
14607 } else {
14608 int64_t old_ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
14609 txc->zoned_note_updated_object(o, old_ondisk_offset);
14610 }
14611 }
14612
14613 // NB: _wctx_finish() will empty old_extents
14614 // so we must do gc estimation before that
14615 _wctx_finish(txc, c, o, &wctx);
14616 if (end > o->onode.size) {
14617 dout(20) << __func__ << " extending size to 0x" << std::hex << end
14618 << std::dec << dendl;
14619 o->onode.size = end;
14620 }
14621
14622 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
14623 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
14624 dout(20) << __func__
14625 << " perform garbage collection for compressed extents, "
14626 << "expected benefit = " << benefit << " AUs" << dendl;
14627 }
14628 if (!wctx.extents_to_gc.empty()) {
14629 dout(20) << __func__ << " perform garbage collection" << dendl;
14630
14631 r = _do_gc(txc, c, o,
14632 wctx,
14633 &dirty_start, &dirty_end);
14634 if (r < 0) {
14635 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
14636 << dendl;
14637 goto out;
14638 }
14639 dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
14640 << "~" << dirty_end - dirty_start << std::dec << dendl;
14641 }
14642 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
14643 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
14644
14645 r = 0;
14646
14647 out:
14648 return r;
14649 }
14650
14651 int BlueStore::_write(TransContext *txc,
14652 CollectionRef& c,
14653 OnodeRef& o,
14654 uint64_t offset, size_t length,
14655 bufferlist& bl,
14656 uint32_t fadvise_flags)
14657 {
14658 dout(15) << __func__ << " " << c->cid << " " << o->oid
14659 << " 0x" << std::hex << offset << "~" << length << std::dec
14660 << dendl;
14661 int r = 0;
14662 if (offset + length >= OBJECT_MAX_SIZE) {
14663 r = -E2BIG;
14664 } else {
14665 _assign_nid(txc, o);
14666 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
14667 txc->write_onode(o);
14668 }
14669 dout(10) << __func__ << " " << c->cid << " " << o->oid
14670 << " 0x" << std::hex << offset << "~" << length << std::dec
14671 << " = " << r << dendl;
14672 return r;
14673 }
14674
14675 int BlueStore::_zero(TransContext *txc,
14676 CollectionRef& c,
14677 OnodeRef& o,
14678 uint64_t offset, size_t length)
14679 {
14680 dout(15) << __func__ << " " << c->cid << " " << o->oid
14681 << " 0x" << std::hex << offset << "~" << length << std::dec
14682 << dendl;
14683 int r = 0;
14684 if (offset + length >= OBJECT_MAX_SIZE) {
14685 r = -E2BIG;
14686 } else {
14687 _assign_nid(txc, o);
14688 r = _do_zero(txc, c, o, offset, length);
14689 }
14690 dout(10) << __func__ << " " << c->cid << " " << o->oid
14691 << " 0x" << std::hex << offset << "~" << length << std::dec
14692 << " = " << r << dendl;
14693 return r;
14694 }
14695
14696 int BlueStore::_do_zero(TransContext *txc,
14697 CollectionRef& c,
14698 OnodeRef& o,
14699 uint64_t offset, size_t length)
14700 {
14701 dout(15) << __func__ << " " << c->cid << " " << o->oid
14702 << " 0x" << std::hex << offset << "~" << length << std::dec
14703 << dendl;
14704 int r = 0;
14705
14706 _dump_onode<30>(cct, *o);
14707
14708 WriteContext wctx;
14709 o->extent_map.fault_range(db, offset, length);
14710 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
14711 o->extent_map.dirty_range(offset, length);
14712 _wctx_finish(txc, c, o, &wctx);
14713
14714 if (length > 0 && offset + length > o->onode.size) {
14715 o->onode.size = offset + length;
14716 dout(20) << __func__ << " extending size to " << offset + length
14717 << dendl;
14718 }
14719 txc->write_onode(o);
14720
14721 dout(10) << __func__ << " " << c->cid << " " << o->oid
14722 << " 0x" << std::hex << offset << "~" << length << std::dec
14723 << " = " << r << dendl;
14724 return r;
14725 }
14726
14727 void BlueStore::_do_truncate(
14728 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
14729 set<SharedBlob*> *maybe_unshared_blobs)
14730 {
14731 dout(15) << __func__ << " " << c->cid << " " << o->oid
14732 << " 0x" << std::hex << offset << std::dec << dendl;
14733
14734 _dump_onode<30>(cct, *o);
14735
14736 if (offset == o->onode.size)
14737 return;
14738
14739 WriteContext wctx;
14740 if (offset < o->onode.size) {
14741 uint64_t length = o->onode.size - offset;
14742 o->extent_map.fault_range(db, offset, length);
14743 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
14744 o->extent_map.dirty_range(offset, length);
14745 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
14746
14747 // if we have shards past EOF, ask for a reshard
14748 if (!o->onode.extent_map_shards.empty() &&
14749 o->onode.extent_map_shards.back().offset >= offset) {
14750 dout(10) << __func__ << " request reshard past EOF" << dendl;
14751 if (offset) {
14752 o->extent_map.request_reshard(offset - 1, offset + length);
14753 } else {
14754 o->extent_map.request_reshard(0, length);
14755 }
14756 }
14757 }
14758
14759 o->onode.size = offset;
14760
14761 if (bdev->is_smr()) {
14762 // On zoned devices, we currently support only removing an object or
14763 // truncating it to zero size, both of which fall through this code path.
14764 ceph_assert(offset == 0 && !wctx.old_extents.empty());
14765 int64_t ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
14766 txc->zoned_note_truncated_object(o, ondisk_offset);
14767 }
14768
14769 txc->write_onode(o);
14770 }
14771
14772 int BlueStore::_truncate(TransContext *txc,
14773 CollectionRef& c,
14774 OnodeRef& o,
14775 uint64_t offset)
14776 {
14777 dout(15) << __func__ << " " << c->cid << " " << o->oid
14778 << " 0x" << std::hex << offset << std::dec
14779 << dendl;
14780 int r = 0;
14781 if (offset >= OBJECT_MAX_SIZE) {
14782 r = -E2BIG;
14783 } else {
14784 _do_truncate(txc, c, o, offset);
14785 }
14786 dout(10) << __func__ << " " << c->cid << " " << o->oid
14787 << " 0x" << std::hex << offset << std::dec
14788 << " = " << r << dendl;
14789 return r;
14790 }
14791
14792 int BlueStore::_do_remove(
14793 TransContext *txc,
14794 CollectionRef& c,
14795 OnodeRef o)
14796 {
14797 set<SharedBlob*> maybe_unshared_blobs;
14798 bool is_gen = !o->oid.is_no_gen();
14799 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
14800 if (o->onode.has_omap()) {
14801 o->flush();
14802 _do_omap_clear(txc, o);
14803 }
14804 o->exists = false;
14805 string key;
14806 for (auto &s : o->extent_map.shards) {
14807 dout(20) << __func__ << " removing shard 0x" << std::hex
14808 << s.shard_info->offset << std::dec << dendl;
14809 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
14810 [&](const string& final_key) {
14811 txc->t->rmkey(PREFIX_OBJ, final_key);
14812 }
14813 );
14814 }
14815 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
14816 txc->note_removed_object(o);
14817 o->extent_map.clear();
14818 o->onode = bluestore_onode_t();
14819 _debug_obj_on_delete(o->oid);
14820
14821 if (!is_gen || maybe_unshared_blobs.empty()) {
14822 return 0;
14823 }
14824
14825 // see if we can unshare blobs still referenced by the head
14826 dout(10) << __func__ << " gen and maybe_unshared_blobs "
14827 << maybe_unshared_blobs << dendl;
14828 ghobject_t nogen = o->oid;
14829 nogen.generation = ghobject_t::NO_GEN;
14830 OnodeRef h = c->get_onode(nogen, false);
14831
14832 if (!h || !h->exists) {
14833 return 0;
14834 }
14835
14836 dout(20) << __func__ << " checking for unshareable blobs on " << h
14837 << " " << h->oid << dendl;
14838 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
14839 for (auto& e : h->extent_map.extent_map) {
14840 const bluestore_blob_t& b = e.blob->get_blob();
14841 SharedBlob *sb = e.blob->shared_blob.get();
14842 if (b.is_shared() &&
14843 sb->loaded &&
14844 maybe_unshared_blobs.count(sb)) {
14845 if (b.is_compressed()) {
14846 expect[sb].get(0, b.get_ondisk_length());
14847 } else {
14848 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
14849 expect[sb].get(off, len);
14850 return 0;
14851 });
14852 }
14853 }
14854 }
14855
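// a shared blob can be unshared only if the references contributed by the
// head object alone account for its entire persistent ref_map, i.e. no other
// object still references any part of it (checked in the loop below)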
14856 vector<SharedBlob*> unshared_blobs;
14857 unshared_blobs.reserve(maybe_unshared_blobs.size());
14858 for (auto& p : expect) {
14859 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
14860 if (p.first->persistent->ref_map == p.second) {
14861 SharedBlob *sb = p.first;
14862 dout(20) << __func__ << " unsharing " << *sb << dendl;
14863 unshared_blobs.push_back(sb);
14864 txc->unshare_blob(sb);
14865 uint64_t sbid = c->make_blob_unshared(sb);
14866 string key;
14867 get_shared_blob_key(sbid, &key);
14868 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
14869 }
14870 }
14871
14872 if (unshared_blobs.empty()) {
14873 return 0;
14874 }
14875
14876 for (auto& e : h->extent_map.extent_map) {
14877 const bluestore_blob_t& b = e.blob->get_blob();
14878 SharedBlob *sb = e.blob->shared_blob.get();
14879 if (b.is_shared() &&
14880 std::find(unshared_blobs.begin(), unshared_blobs.end(),
14881 sb) != unshared_blobs.end()) {
14882 dout(20) << __func__ << " unsharing " << e << dendl;
14883 bluestore_blob_t& blob = e.blob->dirty_blob();
14884 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
14885 h->extent_map.dirty_range(e.logical_offset, 1);
14886 }
14887 }
14888 txc->write_onode(h);
14889
14890 return 0;
14891 }
14892
14893 int BlueStore::_remove(TransContext *txc,
14894 CollectionRef& c,
14895 OnodeRef &o)
14896 {
14897 dout(15) << __func__ << " " << c->cid << " " << o->oid
14898 << " onode " << o.get()
14899 << " txc "<< txc << dendl;
14900
14901 auto start_time = mono_clock::now();
14902 int r = _do_remove(txc, c, o);
14903 log_latency_fn(
14904 __func__,
14905 l_bluestore_remove_lat,
14906 mono_clock::now() - start_time,
14907 cct->_conf->bluestore_log_op_age,
14908 [&](const ceph::timespan& lat) {
14909 ostringstream ostr;
14910 ostr << ", lat = " << timespan_str(lat)
14911 << " cid =" << c->cid
14912 << " oid =" << o->oid;
14913 return ostr.str();
14914 }
14915 );
14916
14917 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14918 return r;
14919 }
14920
14921 int BlueStore::_setattr(TransContext *txc,
14922 CollectionRef& c,
14923 OnodeRef& o,
14924 const string& name,
14925 bufferptr& val)
14926 {
14927 dout(15) << __func__ << " " << c->cid << " " << o->oid
14928 << " " << name << " (" << val.length() << " bytes)"
14929 << dendl;
14930 int r = 0;
14931 if (val.is_partial()) {
14932 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
14933 val.length());
14934 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14935 } else {
14936 auto& b = o->onode.attrs[name.c_str()] = val;
14937 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14938 }
14939 txc->write_onode(o);
14940 dout(10) << __func__ << " " << c->cid << " " << o->oid
14941 << " " << name << " (" << val.length() << " bytes)"
14942 << " = " << r << dendl;
14943 return r;
14944 }
14945
14946 int BlueStore::_setattrs(TransContext *txc,
14947 CollectionRef& c,
14948 OnodeRef& o,
14949 const map<string,bufferptr>& aset)
14950 {
14951 dout(15) << __func__ << " " << c->cid << " " << o->oid
14952 << " " << aset.size() << " keys"
14953 << dendl;
14954 int r = 0;
14955 for (map<string,bufferptr>::const_iterator p = aset.begin();
14956 p != aset.end(); ++p) {
14957 if (p->second.is_partial()) {
14958 auto& b = o->onode.attrs[p->first.c_str()] =
14959 bufferptr(p->second.c_str(), p->second.length());
14960 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14961 } else {
14962 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
14963 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14964 }
14965 }
14966 txc->write_onode(o);
14967 dout(10) << __func__ << " " << c->cid << " " << o->oid
14968 << " " << aset.size() << " keys"
14969 << " = " << r << dendl;
14970 return r;
14971 }
14972
14973
14974 int BlueStore::_rmattr(TransContext *txc,
14975 CollectionRef& c,
14976 OnodeRef& o,
14977 const string& name)
14978 {
14979 dout(15) << __func__ << " " << c->cid << " " << o->oid
14980 << " " << name << dendl;
14981 int r = 0;
14982 auto it = o->onode.attrs.find(name.c_str());
14983 if (it == o->onode.attrs.end())
14984 goto out;
14985
14986 o->onode.attrs.erase(it);
14987 txc->write_onode(o);
14988
14989 out:
14990 dout(10) << __func__ << " " << c->cid << " " << o->oid
14991 << " " << name << " = " << r << dendl;
14992 return r;
14993 }
14994
14995 int BlueStore::_rmattrs(TransContext *txc,
14996 CollectionRef& c,
14997 OnodeRef& o)
14998 {
14999 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15000 int r = 0;
15001
15002 if (o->onode.attrs.empty())
15003 goto out;
15004
15005 o->onode.attrs.clear();
15006 txc->write_onode(o);
15007
15008 out:
15009 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15010 return r;
15011 }
15012
15013 void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
15014 {
15015 const string& omap_prefix = o->get_omap_prefix();
15016 string prefix, tail;
15017 o->get_omap_header(&prefix);
15018 o->get_omap_tail(&tail);
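// remove everything from the omap header key up to (but not including) the
// tail, then drop the tail sentinel key explicitly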
15019 txc->t->rm_range_keys(omap_prefix, prefix, tail);
15020 txc->t->rmkey(omap_prefix, tail);
15021 dout(20) << __func__ << " remove range start: "
15022 << pretty_binary_string(prefix) << " end: "
15023 << pretty_binary_string(tail) << dendl;
15024 }
15025
15026 int BlueStore::_omap_clear(TransContext *txc,
15027 CollectionRef& c,
15028 OnodeRef& o)
15029 {
15030 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15031 int r = 0;
15032 if (o->onode.has_omap()) {
15033 o->flush();
15034 _do_omap_clear(txc, o);
15035 o->onode.clear_omap_flag();
15036 txc->write_onode(o);
15037 }
15038 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15039 return r;
15040 }
15041
15042 int BlueStore::_omap_setkeys(TransContext *txc,
15043 CollectionRef& c,
15044 OnodeRef& o,
15045 bufferlist &bl)
15046 {
15047 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15048 int r;
15049 auto p = bl.cbegin();
15050 __u32 num;
15051 if (!o->onode.has_omap()) {
15052 if (o->oid.is_pgmeta()) {
15053 o->onode.set_omap_flags_pgmeta();
15054 } else {
15055 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
15056 }
15057 txc->write_onode(o);
15058
15059 const string& prefix = o->get_omap_prefix();
15060 string key_tail;
15061 bufferlist tail;
15062 o->get_omap_tail(&key_tail);
15063 txc->t->set(prefix, key_tail, tail);
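// the (empty) tail key bounds this object's omap keyspace; range removals
// and clone iteration use it as their end marker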
15064 } else {
15065 txc->note_modified_object(o);
15066 }
15067 const string& prefix = o->get_omap_prefix();
15068 string final_key;
15069 o->get_omap_key(string(), &final_key);
15070 size_t base_key_len = final_key.size();
15071 decode(num, p);
15072 while (num--) {
15073 string key;
15074 bufferlist value;
15075 decode(key, p);
15076 decode(value, p);
15077 final_key.resize(base_key_len); // keep prefix
15078 final_key += key;
15079 dout(20) << __func__ << " " << pretty_binary_string(final_key)
15080 << " <- " << key << dendl;
15081 txc->t->set(prefix, final_key, value);
15082 }
15083 r = 0;
15084 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15085 return r;
15086 }
15087
15088 int BlueStore::_omap_setheader(TransContext *txc,
15089 CollectionRef& c,
15090 OnodeRef &o,
15091 bufferlist& bl)
15092 {
15093 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15094 int r;
15095 string key;
15096 if (!o->onode.has_omap()) {
15097 if (o->oid.is_pgmeta()) {
15098 o->onode.set_omap_flags_pgmeta();
15099 } else {
15100 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
15101 }
15102 txc->write_onode(o);
15103
15104 const string& prefix = o->get_omap_prefix();
15105 string key_tail;
15106 bufferlist tail;
15107 o->get_omap_tail(&key_tail);
15108 txc->t->set(prefix, key_tail, tail);
15109 } else {
15110 txc->note_modified_object(o);
15111 }
15112 const string& prefix = o->get_omap_prefix();
15113 o->get_omap_header(&key);
15114 txc->t->set(prefix, key, bl);
15115 r = 0;
15116 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15117 return r;
15118 }
15119
15120 int BlueStore::_omap_rmkeys(TransContext *txc,
15121 CollectionRef& c,
15122 OnodeRef& o,
15123 bufferlist& bl)
15124 {
15125 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15126 int r = 0;
15127 auto p = bl.cbegin();
15128 __u32 num;
15129 string final_key;
15130
15131 if (!o->onode.has_omap()) {
15132 goto out;
15133 }
15134 {
15135 const string& prefix = o->get_omap_prefix();
15136 o->get_omap_key(string(), &final_key);
15137 size_t base_key_len = final_key.size();
15138 decode(num, p);
15139 while (num--) {
15140 string key;
15141 decode(key, p);
15142 final_key.resize(base_key_len); // keep prefix
15143 final_key += key;
15144 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
15145 << " <- " << key << dendl;
15146 txc->t->rmkey(prefix, final_key);
15147 }
15148 }
15149 txc->note_modified_object(o);
15150
15151 out:
15152 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15153 return r;
15154 }
15155
15156 int BlueStore::_omap_rmkey_range(TransContext *txc,
15157 CollectionRef& c,
15158 OnodeRef& o,
15159 const string& first, const string& last)
15160 {
15161 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15162 string key_first, key_last;
15163 int r = 0;
15164 if (!o->onode.has_omap()) {
15165 goto out;
15166 }
15167 {
15168 const string& prefix = o->get_omap_prefix();
15169 o->flush();
15170 o->get_omap_key(first, &key_first);
15171 o->get_omap_key(last, &key_last);
15172 txc->t->rm_range_keys(prefix, key_first, key_last);
15173 dout(20) << __func__ << " remove range start: "
15174 << pretty_binary_string(key_first) << " end: "
15175 << pretty_binary_string(key_last) << dendl;
15176 }
15177 txc->note_modified_object(o);
15178
15179 out:
15180 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15181 return r;
15182 }
15183
15184 int BlueStore::_set_alloc_hint(
15185 TransContext *txc,
15186 CollectionRef& c,
15187 OnodeRef& o,
15188 uint64_t expected_object_size,
15189 uint64_t expected_write_size,
15190 uint32_t flags)
15191 {
15192 dout(15) << __func__ << " " << c->cid << " " << o->oid
15193 << " object_size " << expected_object_size
15194 << " write_size " << expected_write_size
15195 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15196 << dendl;
15197 int r = 0;
15198 o->onode.expected_object_size = expected_object_size;
15199 o->onode.expected_write_size = expected_write_size;
15200 o->onode.alloc_hint_flags = flags;
15201 txc->write_onode(o);
15202 dout(10) << __func__ << " " << c->cid << " " << o->oid
15203 << " object_size " << expected_object_size
15204 << " write_size " << expected_write_size
15205 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15206 << " = " << r << dendl;
15207 return r;
15208 }
15209
15210 int BlueStore::_clone(TransContext *txc,
15211 CollectionRef& c,
15212 OnodeRef& oldo,
15213 OnodeRef& newo)
15214 {
15215 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15216 << newo->oid << dendl;
15217 int r = 0;
15218 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
15219 derr << __func__ << " mismatched hash on " << oldo->oid
15220 << " and " << newo->oid << dendl;
15221 return -EINVAL;
15222 }
15223
15224 _assign_nid(txc, newo);
15225
15226 // clone data
15227 oldo->flush();
15228 _do_truncate(txc, c, newo, 0);
15229 if (cct->_conf->bluestore_clone_cow) {
15230 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
15231 } else {
15232 bufferlist bl;
15233 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
15234 if (r < 0)
15235 goto out;
15236 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
15237 if (r < 0)
15238 goto out;
15239 }
15240
15241 // clone attrs
15242 newo->onode.attrs = oldo->onode.attrs;
15243
15244 // clone omap
15245 if (newo->onode.has_omap()) {
15246 dout(20) << __func__ << " clearing old omap data" << dendl;
15247 newo->flush();
15248 _do_omap_clear(txc, newo);
15249 newo->onode.clear_omap_flag();
15250 }
15251 if (oldo->onode.has_omap()) {
15252 dout(20) << __func__ << " copying omap data" << dendl;
15253 if (newo->oid.is_pgmeta()) {
15254 newo->onode.set_omap_flags_pgmeta();
15255 } else {
15256 newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
15257 }
15258 const string& prefix = newo->get_omap_prefix();
15259 KeyValueDB::Iterator it = db->get_iterator(prefix);
15260 string head, tail;
15261 oldo->get_omap_header(&head);
15262 oldo->get_omap_tail(&tail);
15263 it->lower_bound(head);
15264 while (it->valid()) {
15265 if (it->key() >= tail) {
15266 dout(30) << __func__ << " reached tail" << dendl;
15267 break;
15268 } else {
15269 dout(30) << __func__ << " got header/data "
15270 << pretty_binary_string(it->key()) << dendl;
15271 string key;
15272 newo->rewrite_omap_key(it->key(), &key);
15273 txc->t->set(prefix, key, it->value());
15274 }
15275 it->next();
15276 }
15277 string new_tail;
15278 bufferlist new_tail_value;
15279 newo->get_omap_tail(&new_tail);
15280 txc->t->set(prefix, new_tail, new_tail_value);
15281 }
15282
15283 txc->write_onode(newo);
15284 r = 0;
15285
15286 out:
15287 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15288 << newo->oid << " = " << r << dendl;
15289 return r;
15290 }
15291
15292 int BlueStore::_do_clone_range(
15293 TransContext *txc,
15294 CollectionRef& c,
15295 OnodeRef& oldo,
15296 OnodeRef& newo,
15297 uint64_t srcoff,
15298 uint64_t length,
15299 uint64_t dstoff)
15300 {
15301 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15302 << newo->oid
15303 << " 0x" << std::hex << srcoff << "~" << length << " -> "
15304 << " 0x" << dstoff << "~" << length << std::dec << dendl;
15305 oldo->extent_map.fault_range(db, srcoff, length);
15306 newo->extent_map.fault_range(db, dstoff, length);
15307 _dump_onode<30>(cct, *oldo);
15308 _dump_onode<30>(cct, *newo);
15309
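// dup() clones oldo's extents over the given range into newo, sharing the
// underlying blobs (copy-on-write) rather than rewriting the data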
15310 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
15311 _dump_onode<30>(cct, *oldo);
15312 _dump_onode<30>(cct, *newo);
15313 return 0;
15314 }
15315
15316 int BlueStore::_clone_range(TransContext *txc,
15317 CollectionRef& c,
15318 OnodeRef& oldo,
15319 OnodeRef& newo,
15320 uint64_t srcoff, uint64_t length, uint64_t dstoff)
15321 {
15322 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15323 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15324 << " to offset 0x" << dstoff << std::dec << dendl;
15325 int r = 0;
15326
15327 if (srcoff + length >= OBJECT_MAX_SIZE ||
15328 dstoff + length >= OBJECT_MAX_SIZE) {
15329 r = -E2BIG;
15330 goto out;
15331 }
15332 if (srcoff + length > oldo->onode.size) {
15333 r = -EINVAL;
15334 goto out;
15335 }
15336
15337 _assign_nid(txc, newo);
15338
15339 if (length > 0) {
15340 if (cct->_conf->bluestore_clone_cow) {
15341 _do_zero(txc, c, newo, dstoff, length);
15342 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
15343 } else {
15344 bufferlist bl;
15345 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
15346 if (r < 0)
15347 goto out;
15348 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
15349 if (r < 0)
15350 goto out;
15351 }
15352 }
15353
15354 txc->write_onode(newo);
15355 r = 0;
15356
15357 out:
15358 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15359 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15360 << " to offset 0x" << dstoff << std::dec
15361 << " = " << r << dendl;
15362 return r;
15363 }
15364
15365 int BlueStore::_rename(TransContext *txc,
15366 CollectionRef& c,
15367 OnodeRef& oldo,
15368 OnodeRef& newo,
15369 const ghobject_t& new_oid)
15370 {
15371 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15372 << new_oid << dendl;
15373 int r;
15374 ghobject_t old_oid = oldo->oid;
15375 mempool::bluestore_cache_meta::string new_okey;
15376
15377 if (newo) {
15378 if (newo->exists) {
15379 r = -EEXIST;
15380 goto out;
15381 }
15382 ceph_assert(txc->onodes.count(newo) == 0);
15383 }
15384
15385 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
15386
15387 // rewrite shards
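// drop each shard keyed under the old object key and mark the shards dirty
// so write_onode() below re-emits them under the new key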
15388 {
15389 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
15390 get_object_key(cct, new_oid, &new_okey);
15391 string key;
15392 for (auto &s : oldo->extent_map.shards) {
15393 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
15394 [&](const string& final_key) {
15395 txc->t->rmkey(PREFIX_OBJ, final_key);
15396 }
15397 );
15398 s.dirty = true;
15399 }
15400 }
15401
15402 newo = oldo;
15403 txc->write_onode(newo);
15404
15405 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
15406 // Onode in the old slot
15407 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
15408 r = 0;
15409
15410 // hold a ref to new Onode in old name position, to ensure we don't drop
15411 // it from the cache before this txc commits (or else someone may come along
15412 // and read newo's metadata via the old name).
15413 txc->note_modified_object(oldo);
15414
15415 out:
15416 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
15417 << new_oid << " = " << r << dendl;
15418 return r;
15419 }
15420
15421 // collections
15422
15423 int BlueStore::_create_collection(
15424 TransContext *txc,
15425 const coll_t &cid,
15426 unsigned bits,
15427 CollectionRef *c)
15428 {
15429 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
15430 int r;
15431 bufferlist bl;
15432
15433 {
15434 std::unique_lock l(coll_lock);
15435 if (*c) {
15436 r = -EEXIST;
15437 goto out;
15438 }
15439 auto p = new_coll_map.find(cid);
15440 ceph_assert(p != new_coll_map.end());
15441 *c = p->second;
15442 (*c)->cnode.bits = bits;
15443 coll_map[cid] = *c;
15444 new_coll_map.erase(p);
15445 }
15446 encode((*c)->cnode, bl);
15447 txc->t->set(PREFIX_COLL, stringify(cid), bl);
15448 r = 0;
15449
15450 out:
15451 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
15452 return r;
15453 }
15454
15455 int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
15456 CollectionRef *c)
15457 {
15458 dout(15) << __func__ << " " << cid << dendl;
15459 int r;
15460
15461 (*c)->flush_all_but_last();
15462 {
15463 std::unique_lock l(coll_lock);
15464 if (!*c) {
15465 r = -ENOENT;
15466 goto out;
15467 }
15468 size_t nonexistent_count = 0;
15469 ceph_assert((*c)->exists);
15470 if ((*c)->onode_map.map_any([&](Onode* o) {
15471 if (o->exists) {
15472 dout(1) << __func__ << " " << o->oid << " " << o
15473 << " exists in onode_map" << dendl;
15474 return true;
15475 }
15476 ++nonexistent_count;
15477 return false;
15478 })) {
15479 r = -ENOTEMPTY;
15480 goto out;
15481 }
15482 vector<ghobject_t> ls;
15483 ghobject_t next;
15484 // Enumerate onodes in db, up to nonexistent_count + 1
15485 // then check if all of them are marked as non-existent.
15486 // Bypass the check if (next != ghobject_t::get_max())
15487 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
15488 nonexistent_count + 1, false, &ls, &next);
15489 if (r >= 0) {
15490 // If true, the collection has more objects than nonexistent_count,
15491 // so bypass the check.
15492 bool exists = (!next.is_max());
15493 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
15494 dout(10) << __func__ << " oid " << *it << dendl;
15495 auto onode = (*c)->onode_map.lookup(*it);
15496 exists = !onode || onode->exists;
15497 if (exists) {
15498 dout(1) << __func__ << " " << *it
15499 << " exists in db, "
15500 << (!onode ? "not present in ram" : "present in ram")
15501 << dendl;
15502 }
15503 }
15504 if (!exists) {
15505 _do_remove_collection(txc, c);
15506 r = 0;
15507 } else {
15508 dout(10) << __func__ << " " << cid
15509 << " is non-empty" << dendl;
15510 r = -ENOTEMPTY;
15511 }
15512 }
15513 }
15514 out:
15515 dout(10) << __func__ << " " << cid << " = " << r << dendl;
15516 return r;
15517 }
15518
15519 void BlueStore::_do_remove_collection(TransContext *txc,
15520 CollectionRef *c)
15521 {
15522 coll_map.erase((*c)->cid);
15523 txc->removed_collections.push_back(*c);
15524 (*c)->exists = false;
15525 _osr_register_zombie((*c)->osr.get());
15526 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
15527 c->reset();
15528 }
15529
15530 int BlueStore::_split_collection(TransContext *txc,
15531 CollectionRef& c,
15532 CollectionRef& d,
15533 unsigned bits, int rem)
15534 {
15535 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
15536 << " bits " << bits << dendl;
15537 std::unique_lock l(c->lock);
15538 std::unique_lock l2(d->lock);
15539 int r;
15540
15541 // flush all previous deferred writes on this sequencer. this is a bit
15542 // heavyweight, but we need to make sure all deferred writes complete
15543 // before we split as the new collection's sequencer may need to order
15544 // this after those writes, and we don't bother with the complexity of
15545 // moving those TransContexts over to the new osr.
15546 _osr_drain_preceding(txc);
15547
15548 // move any cached items (onodes and referenced shared blobs) that will
15549 // belong to the child collection post-split. leave everything else behind.
15550 // this may include things that don't strictly belong to the now-smaller
15551 // parent split, but the OSD will always send us a split for every new
15552 // child.
15553
15554 spg_t pgid, dest_pgid;
15555 bool is_pg = c->cid.is_pg(&pgid);
15556 ceph_assert(is_pg);
15557 is_pg = d->cid.is_pg(&dest_pgid);
15558 ceph_assert(is_pg);
15559
15560 // the destination should initially be empty.
15561 ceph_assert(d->onode_map.empty());
15562 ceph_assert(d->shared_blob_set.empty());
15563 ceph_assert(d->cnode.bits == bits);
15564
15565 c->split_cache(d.get());
15566
15567 // adjust bits. note that this will be redundant for all but the first
15568 // split call for this parent (first child).
15569 c->cnode.bits = bits;
15570 ceph_assert(d->cnode.bits == bits);
15571 r = 0;
15572
15573 bufferlist bl;
15574 encode(c->cnode, bl);
15575 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
15576
15577 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
15578 << " bits " << bits << " = " << r << dendl;
15579 return r;
15580 }
15581
15582 int BlueStore::_merge_collection(
15583 TransContext *txc,
15584 CollectionRef *c,
15585 CollectionRef& d,
15586 unsigned bits)
15587 {
15588 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
15589 << " bits " << bits << dendl;
15590 std::unique_lock l((*c)->lock);
15591 std::unique_lock l2(d->lock);
15592 int r;
15593
15594 coll_t cid = (*c)->cid;
15595
15596 // flush all previous deferred writes on the source collection to ensure
15597 // that all deferred writes complete before we merge as the target collection's
15598 // sequencer may need to order new ops after those writes.
15599
15600 _osr_drain((*c)->osr.get());
15601
15602 // move any cached items (onodes and referenced shared blobs) that now
15603 // belong to the merge target into it. the source collection is
15604 // removed below, so anything it still has cached must be relocated
15605 // into the target first rather than left behind on a collection that
15606 // is about to disappear.
15607
15608 spg_t pgid, dest_pgid;
15609 bool is_pg = cid.is_pg(&pgid);
15610 ceph_assert(is_pg);
15611 is_pg = d->cid.is_pg(&dest_pgid);
15612 ceph_assert(is_pg);
15613
15614 // adjust bits. note that this will be redundant for all but the first
15615 // merge call for the parent/target.
15616 d->cnode.bits = bits;
15617
15618 // split_cache() behavior depends on the target's (d) bits, so do this after they are updated.
15619 (*c)->split_cache(d.get());
15620
15621 // remove source collection
15622 {
15623 std::unique_lock l3(coll_lock);
15624 _do_remove_collection(txc, c);
15625 }
15626
15627 r = 0;
15628
15629 bufferlist bl;
15630 encode(d->cnode, bl);
15631 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
15632
15633 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
15634 << " bits " << bits << " = " << r << dendl;
15635 return r;
15636 }
15637
15638 void BlueStore::log_latency(
15639 const char* name,
15640 int idx,
15641 const ceph::timespan& l,
15642 double lat_threshold,
15643 const char* info) const
15644 {
15645 logger->tinc(idx, l);
15646 if (lat_threshold > 0.0 &&
15647 l >= make_timespan(lat_threshold)) {
15648 dout(0) << __func__ << " slow operation observed for " << name
15649 << ", latency = " << l
15650 << info
15651 << dendl;
15652 }
15653 }
15654
15655 void BlueStore::log_latency_fn(
15656 const char* name,
15657 int idx,
15658 const ceph::timespan& l,
15659 double lat_threshold,
15660 std::function<string (const ceph::timespan& lat)> fn) const
15661 {
15662 logger->tinc(idx, l);
15663 if (lat_threshold > 0.0 &&
15664 l >= make_timespan(lat_threshold)) {
15665 dout(0) << __func__ << " slow operation observed for " << name
15666 << ", latency = " << l
15667 << fn(l)
15668 << dendl;
15669 }
15670 }
15671
15672 #if defined(WITH_LTTNG)
15673 void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
15674 KeyValueDB &db,
15675 TransContext &txc,
15676 mono_clock::time_point start_throttle_acquire)
15677 {
15678 pending_kv_ios += txc.ios;
15679 if (txc.deferred_txn) {
15680 pending_deferred_ios += txc.ios;
15681 }
15682
15683 uint64_t started = 0;
15684 uint64_t completed = 0;
15685 if (should_trace(&started, &completed)) {
15686 txc.tracing = true;
15687 uint64_t rocksdb_base_level,
15688 rocksdb_estimate_pending_compaction_bytes,
15689 rocksdb_cur_size_all_mem_tables,
15690 rocksdb_compaction_pending,
15691 rocksdb_mem_table_flush_pending,
15692 rocksdb_num_running_compactions,
15693 rocksdb_num_running_flushes,
15694 rocksdb_actual_delayed_write_rate;
15695 db.get_property(
15696 "rocksdb.base-level",
15697 &rocksdb_base_level);
15698 db.get_property(
15699 "rocksdb.estimate-pending-compaction-bytes",
15700 &rocksdb_estimate_pending_compaction_bytes);
15701 db.get_property(
15702 "rocksdb.cur-size-all-mem-tables",
15703 &rocksdb_cur_size_all_mem_tables);
15704 db.get_property(
15705 "rocksdb.compaction-pending",
15706 &rocksdb_compaction_pending);
15707 db.get_property(
15708 "rocksdb.mem-table-flush-pending",
15709 &rocksdb_mem_table_flush_pending);
15710 db.get_property(
15711 "rocksdb.num-running-compactions",
15712 &rocksdb_num_running_compactions);
15713 db.get_property(
15714 "rocksdb.num-running-flushes",
15715 &rocksdb_num_running_flushes);
15716 db.get_property(
15717 "rocksdb.actual-delayed-write-rate",
15718 &rocksdb_actual_delayed_write_rate);
15719
15720
15721 tracepoint(
15722 bluestore,
15723 transaction_initial_state,
15724 txc.osr->get_sequencer_id(),
15725 txc.seq,
15726 throttle_bytes.get_current(),
15727 throttle_deferred_bytes.get_current(),
15728 pending_kv_ios,
15729 pending_deferred_ios,
15730 started,
15731 completed,
15732 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
15733
15734 tracepoint(
15735 bluestore,
15736 transaction_initial_state_rocksdb,
15737 txc.osr->get_sequencer_id(),
15738 txc.seq,
15739 rocksdb_base_level,
15740 rocksdb_estimate_pending_compaction_bytes,
15741 rocksdb_cur_size_all_mem_tables,
15742 rocksdb_compaction_pending,
15743 rocksdb_mem_table_flush_pending,
15744 rocksdb_num_running_compactions,
15745 rocksdb_num_running_flushes,
15746 rocksdb_actual_delayed_write_rate);
15747 }
15748 }
15749 #endif
15750
15751 mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
15752 TransContext &txc, PerfCounters *logger, int state)
15753 {
15754 mono_clock::time_point now = mono_clock::now();
15755 mono_clock::duration lat = now - txc.last_stamp;
15756 logger->tinc(state, lat);
15757 #if defined(WITH_LTTNG)
15758 if (txc.tracing &&
15759 state >= l_bluestore_state_prepare_lat &&
15760 state <= l_bluestore_state_done_lat) {
15761 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
15762 tracepoint(
15763 bluestore,
15764 transaction_state_duration,
15765 txc.osr->get_sequencer_id(),
15766 txc.seq,
15767 state,
15768 ceph::to_seconds<double>(lat));
15769 }
15770 #endif
15771 txc.last_stamp = now;
15772 return lat;
15773 }
15774
15775 bool BlueStore::BlueStoreThrottle::try_start_transaction(
15776 KeyValueDB &db,
15777 TransContext &txc,
15778 mono_clock::time_point start_throttle_acquire)
15779 {
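// charge the main bytes throttle unconditionally; for deferred transactions
// also attempt a non-blocking reservation against the deferred throttle.
// a false return means the caller must complete the deferred reservation
// via finish_start_transaction() below.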
15780 throttle_bytes.get(txc.cost);
15781
15782 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
15783 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15784 return true;
15785 } else {
15786 return false;
15787 }
15788 }
15789
15790 void BlueStore::BlueStoreThrottle::finish_start_transaction(
15791 KeyValueDB &db,
15792 TransContext &txc,
15793 mono_clock::time_point start_throttle_acquire)
15794 {
15795 ceph_assert(txc.deferred_txn);
15796 throttle_deferred_bytes.get(txc.cost);
15797 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15798 }
15799
15800 #if defined(WITH_LTTNG)
15801 void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
15802 {
15803 pending_kv_ios -= 1;
15804 ios_completed_since_last_traced++;
15805 if (txc.tracing) {
15806 tracepoint(
15807 bluestore,
15808 transaction_commit_latency,
15809 txc.osr->get_sequencer_id(),
15810 txc.seq,
15811 ceph::to_seconds<double>(mono_clock::now() - txc.start));
15812 }
15813 }
15814 #endif
15815
15816 #if defined(WITH_LTTNG)
15817 void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
15818 {
15819 if (txc.deferred_txn) {
15820 pending_deferred_ios -= 1;
15821 }
15822 if (txc.tracing) {
15823 mono_clock::time_point now = mono_clock::now();
15824 mono_clock::duration lat = now - txc.start;
15825 tracepoint(
15826 bluestore,
15827 transaction_total_duration,
15828 txc.osr->get_sequencer_id(),
15829 txc.seq,
15830 ceph::to_seconds<double>(lat));
15831 }
15832 }
15833 #endif
15834
15835 // DB key value Histogram
15836 #define KEY_SLAB 32
15837 #define VALUE_SLAB 64
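// keys are bucketed into 32-byte slabs and values into 64-byte slabs;
// get_*_slab_to_range() renders a slab index as its "[lower,upper)" byte range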
15838
15839 const string prefix_onode = "o";
15840 const string prefix_onode_shard = "x";
15841 const string prefix_other = "Z";
15842
15843 int BlueStore::DBHistogram::get_key_slab(size_t sz)
15844 {
15845 return (sz/KEY_SLAB);
15846 }
15847
15848 string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
15849 {
15850 int lower_bound = slab * KEY_SLAB;
15851 int upper_bound = (slab + 1) * KEY_SLAB;
15852 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15853 return ret;
15854 }
15855
15856 int BlueStore::DBHistogram::get_value_slab(size_t sz)
15857 {
15858 return (sz/VALUE_SLAB);
15859 }
15860
15861 string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
15862 {
15863 int lower_bound = slab * VALUE_SLAB;
15864 int upper_bound = (slab + 1) * VALUE_SLAB;
15865 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15866 return ret;
15867 }
15868
15869 void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
15870 const string &prefix, size_t key_size, size_t value_size)
15871 {
15872 uint32_t key_slab = get_key_slab(key_size);
15873 uint32_t value_slab = get_value_slab(value_size);
15874 key_hist[prefix][key_slab].count++;
15875 key_hist[prefix][key_slab].max_len =
15876 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
15877 key_hist[prefix][key_slab].val_map[value_slab].count++;
15878 key_hist[prefix][key_slab].val_map[value_slab].max_len =
15879 std::max<size_t>(value_size,
15880 key_hist[prefix][key_slab].val_map[value_slab].max_len);
15881 }
15882
15883 void BlueStore::DBHistogram::dump(Formatter *f)
15884 {
15885 f->open_object_section("rocksdb_value_distribution");
15886 for (auto i : value_hist) {
15887 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
15888 }
15889 f->close_section();
15890
15891 f->open_object_section("rocksdb_key_value_histogram");
15892 for (auto i : key_hist) {
15893 f->dump_string("prefix", i.first);
15894 f->open_object_section("key_hist");
15895 for ( auto k : i.second) {
15896 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
15897 f->dump_unsigned("max_len", k.second.max_len);
15898 f->open_object_section("value_hist");
15899 for ( auto j : k.second.val_map) {
15900 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
15901 f->dump_unsigned("max_len", j.second.max_len);
15902 }
15903 f->close_section();
15904 }
15905 f->close_section();
15906 }
15907 f->close_section();
15908 }
15909
15910 // Iterates through the db and collects the stats
15911 void BlueStore::generate_db_histogram(Formatter *f)
15912 {
15913 //globals
15914 uint64_t num_onodes = 0;
15915 uint64_t num_shards = 0;
15916 uint64_t num_super = 0;
15917 uint64_t num_coll = 0;
15918 uint64_t num_omap = 0;
15919 uint64_t num_pgmeta_omap = 0;
15920 uint64_t num_deferred = 0;
15921 uint64_t num_alloc = 0;
15922 uint64_t num_stat = 0;
15923 uint64_t num_others = 0;
15924 uint64_t num_shared_shards = 0;
15925 size_t max_key_size =0, max_value_size = 0;
15926 uint64_t total_key_size = 0, total_value_size = 0;
15927 size_t key_size = 0, value_size = 0;
15928 DBHistogram hist;
15929
15930 auto start = coarse_mono_clock::now();
15931
15932 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
15933 iter->seek_to_first();
15934 while (iter->valid()) {
15935 dout(30) << __func__ << " Key: " << iter->key() << dendl;
15936 key_size = iter->key_size();
15937 value_size = iter->value_size();
15938 hist.value_hist[hist.get_value_slab(value_size)]++;
15939 max_key_size = std::max(max_key_size, key_size);
15940 max_value_size = std::max(max_value_size, value_size);
15941 total_key_size += key_size;
15942 total_value_size += value_size;
15943
15944 pair<string,string> key(iter->raw_key());
15945
15946 if (key.first == PREFIX_SUPER) {
15947 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
15948 num_super++;
15949 } else if (key.first == PREFIX_STAT) {
15950 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
15951 num_stat++;
15952 } else if (key.first == PREFIX_COLL) {
15953 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
15954 num_coll++;
15955 } else if (key.first == PREFIX_OBJ) {
15956 if (key.second.back() == ONODE_KEY_SUFFIX) {
15957 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
15958 num_onodes++;
15959 } else {
15960 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
15961 num_shards++;
15962 }
15963 } else if (key.first == PREFIX_OMAP) {
15964 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
15965 num_omap++;
15966 } else if (key.first == PREFIX_PERPOOL_OMAP) {
15967 hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
15968 num_omap++;
15969 } else if (key.first == PREFIX_PERPG_OMAP) {
15970 hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
15971 num_omap++;
15972 } else if (key.first == PREFIX_PGMETA_OMAP) {
15973 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
15974 num_pgmeta_omap++;
15975 } else if (key.first == PREFIX_DEFERRED) {
15976 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
15977 num_deferred++;
15978 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
15979 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
15980 num_alloc++;
15981 } else if (key.first == PREFIX_SHARED_BLOB) {
15982 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
15983 num_shared_shards++;
15984 } else {
15985 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
15986 num_others++;
15987 }
15988 iter->next();
15989 }
15990
15991 ceph::timespan duration = coarse_mono_clock::now() - start;
15992 f->open_object_section("rocksdb_key_value_stats");
15993 f->dump_unsigned("num_onodes", num_onodes);
15994 f->dump_unsigned("num_shards", num_shards);
15995 f->dump_unsigned("num_super", num_super);
15996 f->dump_unsigned("num_coll", num_coll);
15997 f->dump_unsigned("num_omap", num_omap);
15998 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
15999 f->dump_unsigned("num_deferred", num_deferred);
16000 f->dump_unsigned("num_alloc", num_alloc);
16001 f->dump_unsigned("num_stat", num_stat);
16002 f->dump_unsigned("num_shared_shards", num_shared_shards);
16003 f->dump_unsigned("num_others", num_others);
16004 f->dump_unsigned("max_key_size", max_key_size);
16005 f->dump_unsigned("max_value_size", max_value_size);
16006 f->dump_unsigned("total_key_size", total_key_size);
16007 f->dump_unsigned("total_value_size", total_value_size);
16008 f->close_section();
16009
16010 hist.dump(f);
16011
16012 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
16013
16014 }
16015
16016 void BlueStore::_shutdown_cache()
16017 {
16018 dout(10) << __func__ << dendl;
16019 for (auto i : buffer_cache_shards) {
16020 i->flush();
16021 ceph_assert(i->empty());
16022 }
16023 for (auto& p : coll_map) {
16024 p.second->onode_map.clear();
16025 if (!p.second->shared_blob_set.empty()) {
16026 derr << __func__ << " stray shared blobs on " << p.first << dendl;
16027 p.second->shared_blob_set.dump<0>(cct);
16028 }
16029 ceph_assert(p.second->onode_map.empty());
16030 ceph_assert(p.second->shared_blob_set.empty());
16031 }
16032 coll_map.clear();
16033 for (auto i : onode_cache_shards) {
16034 ceph_assert(i->empty());
16035 }
16036 }
16037
16038 // For external callers.
16039 // We use a best-effort policy instead, e.g.,
16040 // we don't care if there are still some pinned onodes/data in the cache
16041 // after this command completes.
16042 int BlueStore::flush_cache(ostream *os)
16043 {
16044 dout(10) << __func__ << dendl;
16045 for (auto i : onode_cache_shards) {
16046 i->flush();
16047 }
16048 for (auto i : buffer_cache_shards) {
16049 i->flush();
16050 }
16051
16052 return 0;
16053 }
16054
16055 void BlueStore::_apply_padding(uint64_t head_pad,
16056 uint64_t tail_pad,
16057 bufferlist& padded)
16058 {
16059 if (head_pad) {
16060 padded.prepend_zero(head_pad);
16061 }
16062 if (tail_pad) {
16063 padded.append_zero(tail_pad);
16064 }
16065 if (head_pad || tail_pad) {
16066 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
16067 << " tail 0x" << tail_pad << std::dec << dendl;
16068 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
16069 }
16070 }
16071
16072 void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
16073 {
16074 // finalize extent_map shards
16075 o->extent_map.update(txn, false);
16076 if (o->extent_map.needs_reshard()) {
16077 o->extent_map.reshard(db, txn);
16078 o->extent_map.update(txn, true);
16079 if (o->extent_map.needs_reshard()) {
16080 dout(20) << __func__ << " warning: still wants reshard, check options?"
16081 << dendl;
16082 o->extent_map.clear_needs_reshard();
16083 }
16084 logger->inc(l_bluestore_onode_reshard);
16085 }
16086
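// two-pass encode: first compute an upper bound on the encoded size so a
// contiguous appender can be sized appropriately, then encode into it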
16087 // bound encode
16088 size_t bound = 0;
16089 denc(o->onode, bound);
16090 o->extent_map.bound_encode_spanning_blobs(bound);
16091 if (o->onode.extent_map_shards.empty()) {
16092 denc(o->extent_map.inline_bl, bound);
16093 }
16094
16095 // encode
16096 bufferlist bl;
16097 unsigned onode_part, blob_part, extent_part;
16098 {
16099 auto p = bl.get_contiguous_appender(bound, true);
16100 denc(o->onode, p);
16101 onode_part = p.get_logical_offset();
16102 o->extent_map.encode_spanning_blobs(p);
16103 blob_part = p.get_logical_offset() - onode_part;
16104 if (o->onode.extent_map_shards.empty()) {
16105 denc(o->extent_map.inline_bl, p);
16106 }
16107 extent_part = p.get_logical_offset() - onode_part - blob_part;
16108 }
16109
16110 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
16111 << " (" << onode_part << " bytes onode + "
16112 << blob_part << " bytes spanning blobs + "
16113 << extent_part << " bytes inline extents)"
16114 << dendl;
16115
16116
16117 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
16118 }
16119
16120 void BlueStore::_log_alerts(osd_alert_list_t& alerts)
16121 {
16122 std::lock_guard l(qlock);
16123
16124 if (!spurious_read_errors_alert.empty() &&
16125 cct->_conf->bluestore_warn_on_spurious_read_errors) {
16126 alerts.emplace(
16127 "BLUESTORE_SPURIOUS_READ_ERRORS",
16128 spurious_read_errors_alert);
16129 }
16130 if (!disk_size_mismatch_alert.empty()) {
16131 alerts.emplace(
16132 "BLUESTORE_DISK_SIZE_MISMATCH",
16133 disk_size_mismatch_alert);
16134 }
16135 if (!legacy_statfs_alert.empty()) {
16136 alerts.emplace(
16137 "BLUESTORE_LEGACY_STATFS",
16138 legacy_statfs_alert);
16139 }
16140 if (!spillover_alert.empty() &&
16141 cct->_conf->bluestore_warn_on_bluefs_spillover) {
16142 alerts.emplace(
16143 "BLUEFS_SPILLOVER",
16144 spillover_alert);
16145 }
16146 if (!no_per_pg_omap_alert.empty()) {
16147 alerts.emplace(
16148 "BLUESTORE_NO_PER_PG_OMAP",
16149 no_per_pg_omap_alert);
16150 }
16151 if (!no_per_pool_omap_alert.empty()) {
16152 alerts.emplace(
16153 "BLUESTORE_NO_PER_POOL_OMAP",
16154 no_per_pool_omap_alert);
16155 }
16156 string s0(failed_cmode);
16157
16158 if (!failed_compressors.empty()) {
16159 if (!s0.empty()) {
16160 s0 += ", ";
16161 }
16162 s0 += "unable to load:";
16163 bool first = true;
16164 for (auto& s : failed_compressors) {
16165 if (first) {
16166 first = false;
16167 } else {
16168 s0 += ", ";
16169 }
16170 s0 += s;
16171 }
16172 alerts.emplace(
16173 "BLUESTORE_NO_COMPRESSION",
16174 s0);
16175 }
16176 }
16177
16178 void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
16179 size_t extents)
16180 {
16181 alloc_stats_count++;
16182 alloc_stats_fragments += extents;
16183 alloc_stats_size += need;
16184 }
16185
16186 void BlueStore::_record_allocation_stats()
16187 {
16188 // don't care about data consistency,
16189 // fields can be partially modified while making the tuple
16190 auto t0 = std::make_tuple(
16191 alloc_stats_count.exchange(0),
16192 alloc_stats_fragments.exchange(0),
16193 alloc_stats_size.exchange(0));
16194
16195 dout(0) << " allocation stats probe "
16196 << probe_count << ":"
16197 << " cnt: " << std::get<0>(t0)
16198 << " frags: " << std::get<1>(t0)
16199 << " size: " << std::get<2>(t0)
16200 << dendl;
16201
16202
16203 //
16204 // Keep the history for probes from the power-of-two sequence:
16205 // -1, -2, -4, -8, -16
16206 //
16207 size_t base = 1;
16208 for (auto& t : alloc_stats_history) {
16209 dout(0) << " probe -"
16210 << base + (probe_count % base) << ": "
16211 << std::get<0>(t)
16212 << ", " << std::get<1>(t)
16213 << ", " << std::get<2>(t)
16214 << dendl;
16215 base <<= 1;
16216 }
16217 dout(0) << "------------" << dendl;
16218
16219 ++ probe_count;
16220
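// every 2^i probes, slot i inherits slot i-1's totals, so the history holds
// samples at roughly exponentially increasing ages (matching the -1, -2, -4,
// -8, -16 sequence noted above)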
16221 for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
16222 if ((probe_count % (1 << i)) == 0) {
16223 alloc_stats_history[i] = alloc_stats_history[i - 1];
16224 }
16225 }
16226 alloc_stats_history[0].swap(t0);
16227 }
16228
16229 // ===========================================
16230 // BlueStoreRepairer
16231
16232 size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
16233 const interval_set<uint64_t>& extents)
16234 {
16235 ceph_assert(granularity); // initialized
16236 // can't call for the second time
16237 ceph_assert(!was_filtered_out);
16238 ceph_assert(collections_bfs.size() == objects_bfs.size());
16239
16240 uint64_t prev_pos = 0;
16241 uint64_t npos = collections_bfs.size();
16242
16243 bloom_vector collections_reduced;
16244 bloom_vector objects_reduced;
16245
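// keep only the non-empty bloom filter buckets (granularity-sized regions)
// that overlap the given extents; all other buckets are dropped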
16246 for (auto e : extents) {
16247 if (e.second == 0) {
16248 continue;
16249 }
16250 uint64_t pos = max(e.first / granularity, prev_pos);
16251 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
16252 while (pos != npos && pos < end_pos) {
16253 ceph_assert( collections_bfs[pos].element_count() ==
16254 objects_bfs[pos].element_count());
16255 if (collections_bfs[pos].element_count()) {
16256 collections_reduced.push_back(std::move(collections_bfs[pos]));
16257 objects_reduced.push_back(std::move(objects_bfs[pos]));
16258 }
16259 ++pos;
16260 }
16261 prev_pos = end_pos;
16262 }
16263 collections_reduced.swap(collections_bfs);
16264 objects_reduced.swap(objects_bfs);
16265 was_filtered_out = true;
16266 return collections_bfs.size();
16267 }
16268
16269 bool BlueStoreRepairer::remove_key(KeyValueDB *db,
16270 const string& prefix,
16271 const string& key)
16272 {
16273 std::lock_guard l(lock);
16274 if (!remove_key_txn) {
16275 remove_key_txn = db->get_transaction();
16276 }
16277 ++to_repair_cnt;
16278 remove_key_txn->rmkey(prefix, key);
16279
16280 return true;
16281 }
16282
16283 void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
16284 {
16285 std::lock_guard l(lock); // possibly redundant
16286 ceph_assert(fix_per_pool_omap_txn == nullptr);
16287 fix_per_pool_omap_txn = db->get_transaction();
16288 ++to_repair_cnt;
16289 bufferlist bl;
16290 bl.append(stringify(val));
16291 fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
16292 }
16293
16294 bool BlueStoreRepairer::fix_shared_blob(
16295 KeyValueDB *db,
16296 uint64_t sbid,
16297 const bufferlist* bl)
16298 {
16299 std::lock_guard l(lock); // possibly redundant
16300 KeyValueDB::Transaction txn;
16301 if (fix_misreferences_txn) { // reuse this txn
16302 txn = fix_misreferences_txn;
16303 } else {
16304 if (!fix_shared_blob_txn) {
16305 fix_shared_blob_txn = db->get_transaction();
16306 }
16307 txn = fix_shared_blob_txn;
16308 }
16309 string key;
16310 get_shared_blob_key(sbid, &key);
16311
16312 ++to_repair_cnt;
16313 if (bl) {
16314 txn->set(PREFIX_SHARED_BLOB, key, *bl);
16315 } else {
16316 txn->rmkey(PREFIX_SHARED_BLOB, key);
16317 }
16318 return true;
16319 }
16320
16321 bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
16322 const string& key,
16323 const store_statfs_t& new_statfs)
16324 {
16325 std::lock_guard l(lock);
16326 if (!fix_statfs_txn) {
16327 fix_statfs_txn = db->get_transaction();
16328 }
16329 BlueStore::volatile_statfs vstatfs;
16330 vstatfs = new_statfs;
16331 bufferlist bl;
16332 vstatfs.encode(bl);
16333 ++to_repair_cnt;
16334 fix_statfs_txn->set(PREFIX_STAT, key, bl);
16335 return true;
16336 }
16337
16338 bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
16339 FreelistManager* fm,
16340 uint64_t offset, uint64_t len)
16341 {
16342 std::lock_guard l(lock);
16343 if (!fix_fm_leaked_txn) {
16344 fix_fm_leaked_txn = db->get_transaction();
16345 }
16346 ++to_repair_cnt;
16347 fm->release(offset, len, fix_fm_leaked_txn);
16348 return true;
16349 }
16350 bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
16351 FreelistManager* fm,
16352 uint64_t offset, uint64_t len)
16353 {
16354 std::lock_guard l(lock);
16355 if (!fix_fm_false_free_txn) {
16356 fix_fm_false_free_txn = db->get_transaction();
16357 }
16358 ++to_repair_cnt;
16359 fm->allocate(offset, len, fix_fm_false_free_txn);
16360 return true;
16361 }
16362
16363 bool BlueStoreRepairer::fix_spanning_blobs(
16364 KeyValueDB* db,
16365 std::function<void(KeyValueDB::Transaction)> f)
16366 {
16367 std::lock_guard l(lock);
16368 if (!fix_onode_txn) {
16369 fix_onode_txn = db->get_transaction();
16370 }
16371 f(fix_onode_txn);
16372 ++to_repair_cnt;
16373 return true;
16374 }
16375
16376 bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
16377 {
16378 //NB: not for use in multithreading mode!!!
16379 if (misreferenced_extents.size()) {
16380 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
16381 ceph_assert(n > 0);
16382 if (!fix_misreferences_txn) {
16383 fix_misreferences_txn = db->get_transaction();
16384 }
16385 return true;
16386 }
16387 return false;
16388 }
16389
16390 unsigned BlueStoreRepairer::apply(KeyValueDB* db)
16391 {
16392 //NB: not for use in multithreading mode!!!
16393 if (fix_per_pool_omap_txn) {
16394 db->submit_transaction_sync(fix_per_pool_omap_txn);
16395 fix_per_pool_omap_txn = nullptr;
16396 }
16397 if (fix_fm_leaked_txn) {
16398 db->submit_transaction_sync(fix_fm_leaked_txn);
16399 fix_fm_leaked_txn = nullptr;
16400 }
16401 if (fix_fm_false_free_txn) {
16402 db->submit_transaction_sync(fix_fm_false_free_txn);
16403 fix_fm_false_free_txn = nullptr;
16404 }
16405 if (remove_key_txn) {
16406 db->submit_transaction_sync(remove_key_txn);
16407 remove_key_txn = nullptr;
16408 }
16409 if (fix_misreferences_txn) {
16410 db->submit_transaction_sync(fix_misreferences_txn);
16411 fix_misreferences_txn = nullptr;
16412 }
16413 if (fix_onode_txn) {
16414 db->submit_transaction_sync(fix_onode_txn);
16415 fix_onode_txn = nullptr;
16416 }
16417 if (fix_shared_blob_txn) {
16418 db->submit_transaction_sync(fix_shared_blob_txn);
16419 fix_shared_blob_txn = nullptr;
16420 }
16421
16422 if (fix_statfs_txn) {
16423 db->submit_transaction_sync(fix_statfs_txn);
16424 fix_statfs_txn = nullptr;
16425 }
16426 if (need_compact) {
16427 db->compact();
16428 need_compact = false;
16429 }
16430 unsigned repaired = to_repair_cnt;
16431 to_repair_cnt = 0;
16432 return repaired;
16433 }
16434
16435 // =======================================================
16436 // RocksDBBlueFSVolumeSelector
16437
16438 uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
16439 ceph_assert(h != nullptr);
16440 uint64_t hint = reinterpret_cast<uint64_t>(h);
16441 uint8_t res;
16442 switch (hint) {
16443 case LEVEL_SLOW:
16444 res = BlueFS::BDEV_SLOW;
16445 if (db_avail4slow > 0) {
16446 // considering statically available db space vs.
16447 // - observed maximums on DB dev for DB/WAL/UNSORTED data
16448 // - observed maximum spillovers
16449 uint64_t max_db_use = 0; // max db usage we potentially observed
16450 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
16451 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
16452 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
16453 // this could end up on the db device as well, hence include it in the estimate
16454 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
16455
16456 auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
16457 uint64_t avail = min(
16458 db_avail4slow,
16459 max_db_use < db_total ? db_total - max_db_use : 0);
16460
16461 // considering current DB dev usage for SLOW data
16462 if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
16463 res = BlueFS::BDEV_DB;
16464 }
16465 }
16466 break;
16467 case LEVEL_LOG:
16468 case LEVEL_WAL:
16469 res = BlueFS::BDEV_WAL;
16470 break;
16471 case LEVEL_DB:
16472 default:
16473 res = BlueFS::BDEV_DB;
16474 break;
16475 }
16476 return res;
16477 }
16478
16479 void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
16480 {
16481 res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
16482 res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
16483 }
16484
16485 void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
16486 uint8_t res = LEVEL_DB;
16487 if (dirname.length() > 5) {
16488 // the "db.slow" and "db.wal" directory names are hard-coded to
16489 // match up with bluestore. the slow device is always the second
16490 // one (when a dedicated block.db device is present and used at
16491 // bdev 0). the wal device is always last.
16492 if (boost::algorithm::ends_with(dirname, ".slow")) {
16493 res = LEVEL_SLOW;
16494 }
16495 else if (boost::algorithm::ends_with(dirname, ".wal")) {
16496 res = LEVEL_WAL;
16497 }
16498 }
16499 return reinterpret_cast<void*>(res);
16500 }
16501
16502 void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
16503 auto max_x = per_level_per_dev_usage.get_max_x();
16504 auto max_y = per_level_per_dev_usage.get_max_y();
16505 sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
16506 << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
16507 << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
16508 << ", db_avail:" << db_avail4slow << std::endl
16509 << "Usage matrix:" << std::endl;
16510 constexpr std::array<const char*, 8> names{ {
16511 "DEV/LEV",
16512 "WAL",
16513 "DB",
16514 "SLOW",
16515 "*",
16516 "*",
16517 "REAL",
16518 "FILES",
16519 } };
16520 const size_t width = 12;
16521 for (size_t i = 0; i < names.size(); ++i) {
16522 sout.setf(std::ios::left, std::ios::adjustfield);
16523 sout.width(width);
16524 sout << names[i];
16525 }
16526 sout << std::endl;
16527 for (size_t l = 0; l < max_y; l++) {
16528 sout.setf(std::ios::left, std::ios::adjustfield);
16529 sout.width(width);
16530 switch (l + LEVEL_FIRST) {
16531 case LEVEL_LOG:
16532 sout << "LOG"; break;
16533 case LEVEL_WAL:
16534 sout << "WAL"; break;
16535 case LEVEL_DB:
16536 sout << "DB"; break;
16537 case LEVEL_SLOW:
16538 sout << "SLOW"; break;
16539 case LEVEL_MAX:
16540 sout << "TOTALS"; break;
16541 }
16542 for (size_t d = 0; d < max_x; d++) {
16543 sout.setf(std::ios::left, std::ios::adjustfield);
16544 sout.width(width);
16545 sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
16546 }
16547 sout.setf(std::ios::left, std::ios::adjustfield);
16548 sout.width(width);
16549 sout << stringify(per_level_files[l]) << std::endl;
16550 }
16551 ceph_assert(max_x == per_level_per_dev_max.get_max_x());
16552 ceph_assert(max_y == per_level_per_dev_max.get_max_y());
16553 sout << "MAXIMUMS:" << std::endl;
16554 for (size_t l = 0; l < max_y; l++) {
16555 sout.setf(std::ios::left, std::ios::adjustfield);
16556 sout.width(width);
16557 switch (l + LEVEL_FIRST) {
16558 case LEVEL_LOG:
16559 sout << "LOG"; break;
16560 case LEVEL_WAL:
16561 sout << "WAL"; break;
16562 case LEVEL_DB:
16563 sout << "DB"; break;
16564 case LEVEL_SLOW:
16565 sout << "SLOW"; break;
16566 case LEVEL_MAX:
16567 sout << "TOTALS"; break;
16568 }
16569 for (size_t d = 0; d < max_x - 1; d++) {
16570 sout.setf(std::ios::left, std::ios::adjustfield);
16571 sout.width(width);
16572 sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
16573 }
16574 sout.setf(std::ios::left, std::ios::adjustfield);
16575 sout.width(width);
16576 sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
16577 if (l < max_y - 1) {
16578 sout << std::endl;
16579 }
16580 }
16581 }
16582
16583 // =======================================================