1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <unistd.h>
16 #include <stdlib.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <fcntl.h>
20
21 #include <boost/container/flat_set.hpp>
22 #include "boost/algorithm/string.hpp"
23
24 #include "include/cpp-btree/btree_set.h"
25
26 #include "bluestore_common.h"
27 #include "BlueStore.h"
28 #include "os/kv.h"
29 #include "include/compat.h"
30 #include "include/intarith.h"
31 #include "include/stringify.h"
32 #include "include/str_map.h"
33 #include "include/util.h"
34 #include "common/errno.h"
35 #include "common/safe_io.h"
36 #include "common/PriorityCache.h"
37 #include "common/RWLock.h"
38 #include "Allocator.h"
39 #include "FreelistManager.h"
40 #include "BlueFS.h"
41 #include "BlueRocksEnv.h"
42 #include "auth/Crypto.h"
43 #include "common/EventTrace.h"
44 #include "perfglue/heap_profiler.h"
45 #include "common/blkdev.h"
46 #include "common/numa.h"
47
48 #if defined(WITH_LTTNG)
49 #define TRACEPOINT_DEFINE
50 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
51 #include "tracing/bluestore.h"
52 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
53 #undef TRACEPOINT_DEFINE
54 #else
55 #define tracepoint(...)
56 #endif
57
58 #define dout_context cct
59 #define dout_subsys ceph_subsys_bluestore
60
61 using bid_t = decltype(BlueStore::Blob::id);
62
63 // bluestore_cache_onode
64 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
65 bluestore_cache_onode);
66
67 // bluestore_cache_other
68 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
69 bluestore_Buffer);
70 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
71 bluestore_Extent);
72 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
73 bluestore_Blob);
74 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
75 bluestore_SharedBlob);
76
77 // bluestore_txc
78 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
79 bluestore_txc);
80
81
82 // kv store prefixes
83 const string PREFIX_SUPER = "S"; // field -> value
84 const string PREFIX_STAT = "T"; // field -> value(int64 array)
85 const string PREFIX_COLL = "C"; // collection name -> cnode_t
86 const string PREFIX_OBJ = "O"; // object name -> onode_t
87 const string PREFIX_OMAP = "M"; // u64 + keyname -> value
88 const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
89 const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
90 const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
91 const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
92 const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
93 const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
94
95 const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
96
97 // write a label in the first block. always use this size. note that
98 // bluefs makes a matching assumption about the location of its
99 // superblock (always the second block of the device).
100 #define BDEV_LABEL_BLOCK_SIZE 4096
101
102 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
103 #define SUPER_RESERVED 8192
104
105 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
106
107
108 /*
109 * extent map blob encoding
110 *
111 * we use the low bits of the blobid field to indicate some common scenarios
112 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
113 */
114 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
115 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
116 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
117 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
118 #define BLOBID_SHIFT_BITS 4
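// Illustrative example (the authoritative encoding lives in
// ExtentMap::{encode,decode}_some()): the blob id is shifted left by
// BLOBID_SHIFT_BITS and the flags occupy the low nibble, so a local blob id
// of 7 for an extent that is contiguous with the previous one and has
// blob_offset == 0 would be emitted as
// (7 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET == 0x73.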
119
120 /*
121 * object name key structure
122 *
123 * encoded u8: shard + 2^7 (so that it sorts properly)
124 * encoded u64: poolid + 2^63 (so that it sorts properly)
125 * encoded u32: hash (bit reversed)
126 *
127 * escaped string: namespace
128 *
129 * escaped string: key or object name
130 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
131 * we are done. otherwise, the object name follows.
132 * escaped string: object name (unless '=' above)
133 *
134 * encoded u64: snap
135 * encoded u64: generation
136 * 'o'
137 */
138 #define ONODE_KEY_SUFFIX 'o'
139
140 /*
141 * extent shard key
142 *
143 * object prefix key
144 * u32
145 * 'x'
146 */
147 #define EXTENT_SHARD_KEY_SUFFIX 'x'
148
149 /*
150 * string encoding in the key
151 *
152 * The key string needs to lexicographically sort the same way that
153 * ghobject_t does. We do this by escaping anything <= '#' with #
154 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
155 * hex digits.
156 *
157 * We use ! as a terminator for strings; this works because it is < #
158 * and will get escaped if it is present in the string.
159 *
160 * NOTE: There is a bug in this implementation: due to implicit
161 * character type conversion in comparison it may produce unexpected
162 * ordering. Unfortunately fixing the bug would mean invalidating the
163 * keys in existing deployments. Instead we do additional sorting
164 * where it is needed.
165 */
166 template<typename S>
167 static void append_escaped(const string &in, S *out)
168 {
169 char hexbyte[in.length() * 3 + 1];
170 char* ptr = &hexbyte[0];
171 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
172 if (*i <= '#') { // bug: unexpected result for *i > 0x7f
173 *ptr++ = '#';
174 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
175 *ptr++ = "0123456789abcdef"[*i & 0x0f];
176 } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
177 *ptr++ = '~';
178 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
179 *ptr++ = "0123456789abcdef"[*i & 0x0f];
180 } else {
181 *ptr++ = *i;
182 }
183 }
184 *ptr++ = '!';
185 out->append(hexbyte, ptr - &hexbyte[0]);
186 }
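// A minimal worked example of the escaping above (7-bit input, so the noted
// bug does not apply): append_escaped("a#b", &k) appends "a#23b!" -- 'a' and
// 'b' pass through, '#' (0x23) is escaped as "#23", and '!' terminates the
// string.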
187
188 inline unsigned h2i(char c)
189 {
190 if ((c >= '0') && (c <= '9')) {
191 return c - 0x30;
192 } else if ((c >= 'a') && (c <= 'f')) {
193 return c - 'a' + 10;
194 } else if ((c >= 'A') && (c <= 'F')) {
195 return c - 'A' + 10;
196 } else {
197 return 256; // make it always larger than 255
198 }
199 }
200
201 static int decode_escaped(const char *p, string *out)
202 {
203 char buff[256];
204 char* ptr = &buff[0];
205 char* max = &buff[252];
206 const char *orig_p = p;
207 while (*p && *p != '!') {
208 if (*p == '#' || *p == '~') {
209 unsigned hex = 0;
210 p++;
211 hex = h2i(*p++) << 4;
212 if (hex > 255) {
213 return -EINVAL;
214 }
215 hex |= h2i(*p++);
216 if (hex > 255) {
217 return -EINVAL;
218 }
219 *ptr++ = hex;
220 } else {
221 *ptr++ = *p++;
222 }
223 if (ptr > max) {
224 out->append(buff, ptr-buff);
225 ptr = &buff[0];
226 }
227 }
228 if (ptr != buff) {
229 out->append(buff, ptr-buff);
230 }
231 return p - orig_p;
232 }
233
234 // some things we encode in binary (as le32 or le64); print the
235 // resulting key strings nicely
236 template<typename S>
237 static string pretty_binary_string(const S& in)
238 {
239 char buf[10];
240 string out;
241 out.reserve(in.length() * 3);
242 enum { NONE, HEX, STRING } mode = NONE;
243 unsigned from = 0, i;
244 for (i=0; i < in.length(); ++i) {
245 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
246 (mode == HEX && in.length() - i >= 4 &&
247 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
248 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
249 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
250 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
251 if (mode == STRING) {
252 out.append(in.c_str() + from, i - from);
253 out.push_back('\'');
254 }
255 if (mode != HEX) {
256 out.append("0x");
257 mode = HEX;
258 }
259 if (in.length() - i >= 4) {
260 // print a whole u32 at once
261 snprintf(buf, sizeof(buf), "%08x",
262 (uint32_t)(((unsigned char)in[i] << 24) |
263 ((unsigned char)in[i+1] << 16) |
264 ((unsigned char)in[i+2] << 8) |
265 ((unsigned char)in[i+3] << 0)));
266 i += 3;
267 } else {
268 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
269 }
270 out.append(buf);
271 } else {
272 if (mode != STRING) {
273 out.push_back('\'');
274 mode = STRING;
275 from = i;
276 }
277 }
278 }
279 if (mode == STRING) {
280 out.append(in.c_str() + from, i - from);
281 out.push_back('\'');
282 }
283 return out;
284 }
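// Illustrative example: the four bytes 01 02 03 04 followed by "ab" render as
// "0x01020304'ab'" -- unprintable bytes are shown in hex (a whole u32 at a
// time when at least four bytes remain), printable runs are quoted.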
285
286 template<typename T>
287 static void _key_encode_shard(shard_id_t shard, T *key)
288 {
289 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
290 }
291
292 static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
293 {
294 pshard->id = (uint8_t)*key - (uint8_t)0x80;
295 return key + 1;
296 }
297
298 static void get_coll_range(const coll_t& cid, int bits,
299 ghobject_t *temp_start, ghobject_t *temp_end,
300 ghobject_t *start, ghobject_t *end)
301 {
302 spg_t pgid;
303 if (cid.is_pg(&pgid)) {
304 start->shard_id = pgid.shard;
305 *temp_start = *start;
306
307 start->hobj.pool = pgid.pool();
308 temp_start->hobj.pool = -2ll - pgid.pool();
309
310 *end = *start;
311 *temp_end = *temp_start;
312
313 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
314 start->hobj.set_bitwise_key_u32(reverse_hash);
315 temp_start->hobj.set_bitwise_key_u32(reverse_hash);
316
317 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
318 if (end_hash > 0xffffffffull)
319 end_hash = 0xffffffffull;
320
321 end->hobj.set_bitwise_key_u32(end_hash);
322 temp_end->hobj.set_bitwise_key_u32(end_hash);
323 } else {
324 start->shard_id = shard_id_t::NO_SHARD;
325 start->hobj.pool = -1ull;
326
327 *end = *start;
328 start->hobj.set_bitwise_key_u32(0);
329 end->hobj.set_bitwise_key_u32(0xffffffff);
330
331 // no separate temp section
332 *temp_start = *end;
333 *temp_end = *end;
334 }
335
336 start->generation = 0;
337 end->generation = 0;
338 temp_start->generation = 0;
339 temp_end->generation = 0;
340 }
341
342 static void get_shared_blob_key(uint64_t sbid, string *key)
343 {
344 key->clear();
345 _key_encode_u64(sbid, key);
346 }
347
348 static int get_key_shared_blob(const string& key, uint64_t *sbid)
349 {
350 const char *p = key.c_str();
351 if (key.length() < sizeof(uint64_t))
352 return -1;
353 _key_decode_u64(p, sbid);
354 return 0;
355 }
356
357 template<typename S>
358 static void _key_encode_prefix(const ghobject_t& oid, S *key)
359 {
360 _key_encode_shard(oid.shard_id, key);
361 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
362 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
363 }
364
365 static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
366 {
367 p = _key_decode_shard(p, &oid->shard_id);
368
369 uint64_t pool;
370 p = _key_decode_u64(p, &pool);
371 oid->hobj.pool = pool - 0x8000000000000000ull;
372
373 unsigned hash;
374 p = _key_decode_u32(p, &hash);
375
376 oid->hobj.set_bitwise_key_u32(hash);
377
378 return p;
379 }
380
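// encoded prefix = 1 byte (shard) + 8 bytes (pool) + 4 bytes (hash);
// see _key_encode_prefix() above.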
381 #define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
382
383 template<typename S>
384 static int get_key_object(const S& key, ghobject_t *oid)
385 {
386 int r;
387 const char *p = key.c_str();
388
389 if (key.length() < ENCODED_KEY_PREFIX_LEN)
390 return -1;
391
392 p = _key_decode_prefix(p, oid);
393
394 if (key.length() == ENCODED_KEY_PREFIX_LEN)
395 return -2;
396
397 r = decode_escaped(p, &oid->hobj.nspace);
398 if (r < 0)
399 return -2;
400 p += r + 1;
401
402 string k;
403 r = decode_escaped(p, &k);
404 if (r < 0)
405 return -3;
406 p += r + 1;
407 if (*p == '=') {
408 // no key
409 ++p;
410 oid->hobj.oid.name = k;
411 } else if (*p == '<' || *p == '>') {
412 // key + name
413 ++p;
414 r = decode_escaped(p, &oid->hobj.oid.name);
415 if (r < 0)
416 return -5;
417 p += r + 1;
418 oid->hobj.set_key(k);
419 } else {
420 // malformed
421 return -6;
422 }
423
424 p = _key_decode_u64(p, &oid->hobj.snap.val);
425 p = _key_decode_u64(p, &oid->generation);
426
427 if (*p != ONODE_KEY_SUFFIX) {
428 return -7;
429 }
430 p++;
431 if (*p) {
432 // if we get something other than a null terminator here,
433 // something is wrong.
434 return -8;
435 }
436
437 return 0;
438 }
439
440 template<typename S>
441 static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
442 {
443 key->clear();
444
445 size_t max_len = ENCODED_KEY_PREFIX_LEN +
446 (oid.hobj.nspace.length() * 3 + 1) +
447 (oid.hobj.get_key().length() * 3 + 1) +
448 1 + // for '<', '=', or '>'
449 (oid.hobj.oid.name.length() * 3 + 1) +
450 8 + 8 + 1;
451 key->reserve(max_len);
452
453 _key_encode_prefix(oid, key);
454
455 append_escaped(oid.hobj.nspace, key);
456
457 if (oid.hobj.get_key().length()) {
458 // is a key... could be < = or >.
459 append_escaped(oid.hobj.get_key(), key);
460 // (ASCII chars < = and > sort in that order, yay)
461 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
462 if (r) {
463 key->append(r > 0 ? ">" : "<");
464 append_escaped(oid.hobj.oid.name, key);
465 } else {
466 // same as no key
467 key->append("=");
468 }
469 } else {
470 // no key
471 append_escaped(oid.hobj.oid.name, key);
472 key->append("=");
473 }
474
475 _key_encode_u64(oid.hobj.snap, key);
476 _key_encode_u64(oid.generation, key);
477
478 key->push_back(ONODE_KEY_SUFFIX);
479
480 // sanity check
481 if (true) {
482 ghobject_t t;
483 int r = get_key_object(*key, &t);
484 if (r || t != oid) {
485 derr << " r " << r << dendl;
486 derr << "key " << pretty_binary_string(*key) << dendl;
487 derr << "oid " << oid << dendl;
488 derr << " t " << t << dendl;
489 ceph_assert(r == 0 && t == oid);
490 }
491 }
492 }
493
494
495 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
496 // char lets us quickly test whether it is a shard key without decoding any
497 // of the prefix bytes.
498 template<typename S>
499 static void get_extent_shard_key(const S& onode_key, uint32_t offset,
500 string *key)
501 {
502 key->clear();
503 key->reserve(onode_key.length() + 4 + 1);
504 key->append(onode_key.c_str(), onode_key.size());
505 _key_encode_u32(offset, key);
506 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
507 }
508
509 static void rewrite_extent_shard_key(uint32_t offset, string *key)
510 {
511 ceph_assert(key->size() > sizeof(uint32_t) + 1);
512 ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
513 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
514 }
515
516 template<typename S>
517 static void generate_extent_shard_key_and_apply(
518 const S& onode_key,
519 uint32_t offset,
520 string *key,
521 std::function<void(const string& final_key)> apply)
522 {
523 if (key->empty()) { // make full key
524 ceph_assert(!onode_key.empty());
525 get_extent_shard_key(onode_key, offset, key);
526 } else {
527 rewrite_extent_shard_key(offset, key);
528 }
529 apply(*key);
530 }
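// Hypothetical usage sketch (the names below are illustrative, not taken from
// this file): callers can reuse one key string across a loop so that only the
// u32 offset is rewritten after the first iteration:
//
//   string final_key;
//   for (auto& shard : shards_to_update) {
//     generate_extent_shard_key_and_apply(onode_key, shard.offset, &final_key,
//       [&](const string& k) { txn->set(PREFIX_OBJ, k, shard.bl); });
//   }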
531
532 int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
533 {
534 ceph_assert(key.size() > sizeof(uint32_t) + 1);
535 ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
536 int okey_len = key.size() - sizeof(uint32_t) - 1;
537 *onode_key = key.substr(0, okey_len);
538 const char *p = key.data() + okey_len;
539 _key_decode_u32(p, offset);
540 return 0;
541 }
542
543 static bool is_extent_shard_key(const string& key)
544 {
545 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
546 }
547
548 static void get_deferred_key(uint64_t seq, string *out)
549 {
550 _key_encode_u64(seq, out);
551 }
552
553 static void get_pool_stat_key(int64_t pool_id, string *key)
554 {
555 key->clear();
556 _key_encode_u64(pool_id, key);
557 }
558
559 static int get_key_pool_stat(const string& key, uint64_t* pool_id)
560 {
561 const char *p = key.c_str();
562 if (key.length() < sizeof(uint64_t))
563 return -1;
564 _key_decode_u64(p, pool_id);
565 return 0;
566 }
567
568 template <int LogLevelV>
569 void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
570 {
571 uint64_t pos = 0;
572 for (auto& s : em.shards) {
573 dout(LogLevelV) << __func__ << " shard " << *s.shard_info
574 << (s.loaded ? " (loaded)" : "")
575 << (s.dirty ? " (dirty)" : "")
576 << dendl;
577 }
578 for (auto& e : em.extent_map) {
579 dout(LogLevelV) << __func__ << " " << e << dendl;
580 ceph_assert(e.logical_offset >= pos);
581 pos = e.logical_offset + e.length;
582 const bluestore_blob_t& blob = e.blob->get_blob();
583 if (blob.has_csum()) {
584 vector<uint64_t> v;
585 unsigned n = blob.get_csum_count();
586 for (unsigned i = 0; i < n; ++i)
587 v.push_back(blob.get_csum_item(i));
588 dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
589 << dendl;
590 }
591 std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
592 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
593 dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
594 << "~" << i.second->length << std::dec
595 << " " << *i.second << dendl;
596 }
597 }
598 }
599
600 template <int LogLevelV>
601 void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
602 {
603 if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
604 return;
605 dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
606 << " nid " << o.onode.nid
607 << " size 0x" << std::hex << o.onode.size
608 << " (" << std::dec << o.onode.size << ")"
609 << " expected_object_size " << o.onode.expected_object_size
610 << " expected_write_size " << o.onode.expected_write_size
611 << " in " << o.onode.extent_map_shards.size() << " shards"
612 << ", " << o.extent_map.spanning_blob_map.size()
613 << " spanning blobs"
614 << dendl;
615 for (auto p = o.onode.attrs.begin();
616 p != o.onode.attrs.end();
617 ++p) {
618 dout(LogLevelV) << __func__ << " attr " << p->first
619 << " len " << p->second.length() << dendl;
620 }
621 _dump_extent_map<LogLevelV>(cct, o.extent_map);
622 }
623
624 template <int LogLevelV>
625 void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
626 {
627 dout(LogLevelV) << __func__ << " transaction dump:\n";
628 JSONFormatter f(true);
629 f.open_object_section("transaction");
630 t->dump(&f);
631 f.close_section();
632 f.flush(*_dout);
633 *_dout << dendl;
634 }
635
636 // merge operators
637
638 struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
639 void merge_nonexistent(
640 const char *rdata, size_t rlen, std::string *new_value) override {
641 *new_value = std::string(rdata, rlen);
642 }
643 void merge(
644 const char *ldata, size_t llen,
645 const char *rdata, size_t rlen,
646 std::string *new_value) override {
647 ceph_assert(llen == rlen);
648 ceph_assert((rlen % 8) == 0);
649 new_value->resize(rlen);
650 const ceph_le64* lv = (const ceph_le64*)ldata;
651 const ceph_le64* rv = (const ceph_le64*)rdata;
652 ceph_le64* nv = &(ceph_le64&)new_value->at(0);
653 for (size_t i = 0; i < rlen >> 3; ++i) {
654 nv[i] = lv[i] + rv[i];
655 }
656 }
657 // We use each operator name and each prefix to construct the
658 // overall RocksDB operator name for the consistency check at open time.
659 const char *name() const override {
660 return "int64_array";
661 }
662 };
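// For example: merging an existing value {1, 2} with an operand {10, 20}
// (each encoded as two little-endian u64s, 16 bytes total) produces {11, 22},
// i.e. an element-wise sum.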
663
664
665 // Buffer
666
667 ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
668 {
669 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
670 << b.offset << "~" << b.length << std::dec
671 << " " << BlueStore::Buffer::get_state_name(b.state);
672 if (b.flags)
673 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
674 return out << ")";
675 }
676
677 namespace {
678
679 /*
680 * Due to a bug in key string encoding (see the comment above append_escaped)
681 * the KeyValueDB iterator does not sort lexicographically the same way
682 * that ghobject_t does: objects with the same hash may appear in the wrong order.
683 *
684 * This iterator wrapper fixes the key order.
685 */
686
687 class CollectionListIterator {
688 public:
689 CollectionListIterator(const KeyValueDB::Iterator &it)
690 : m_it(it) {
691 }
692 virtual ~CollectionListIterator() {
693 }
694
695 virtual bool valid() const = 0;
696 virtual const ghobject_t &oid() const = 0;
697 virtual void lower_bound(const ghobject_t &oid) = 0;
698 virtual void upper_bound(const ghobject_t &oid) = 0;
699 virtual void next() = 0;
700
701 virtual int cmp(const ghobject_t &oid) const = 0;
702
703 bool is_ge(const ghobject_t &oid) const {
704 return cmp(oid) >= 0;
705 }
706
707 bool is_lt(const ghobject_t &oid) const {
708 return cmp(oid) < 0;
709 }
710
711 protected:
712 KeyValueDB::Iterator m_it;
713 };
714
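// Walks keys in raw KeyValueDB order, skipping extent shard keys. Cheap, but
// subject to the ordering bug described above for objects sharing a hash.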
715 class SimpleCollectionListIterator : public CollectionListIterator {
716 public:
717 SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
718 : CollectionListIterator(it), m_cct(cct) {
719 }
720
721 bool valid() const override {
722 return m_it->valid();
723 }
724
725 const ghobject_t &oid() const override {
726 ceph_assert(valid());
727
728 return m_oid;
729 }
730
731 void lower_bound(const ghobject_t &oid) override {
732 string key;
733 get_object_key(m_cct, oid, &key);
734
735 m_it->lower_bound(key);
736 get_oid();
737 }
738
739 void upper_bound(const ghobject_t &oid) override {
740 string key;
741 get_object_key(m_cct, oid, &key);
742
743 m_it->upper_bound(key);
744 get_oid();
745 }
746
747 void next() override {
748 ceph_assert(valid());
749
750 m_it->next();
751 get_oid();
752 }
753
754 int cmp(const ghobject_t &oid) const override {
755 ceph_assert(valid());
756
757 string key;
758 get_object_key(m_cct, oid, &key);
759
760 return m_it->key().compare(key);
761 }
762
763 private:
764 CephContext *m_cct;
765 ghobject_t m_oid;
766
767 void get_oid() {
768 if (!valid()) {
769 return;
770 }
771
772 if (is_extent_shard_key(m_it->key())) {
773 next();
774 return;
775 }
776
777 m_oid = ghobject_t();
778 int r = get_key_object(m_it->key(), &m_oid);
779 ceph_assert(r == 0);
780 }
781 };
782
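// Buffers all onode keys sharing one (shard, pool, reversed-hash) prefix into
// an in-memory map keyed by ghobject_t, so that objects with the same hash are
// returned in true ghobject_t order despite the key-encoding bug.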
783 class SortedCollectionListIterator : public CollectionListIterator {
784 public:
785 SortedCollectionListIterator(const KeyValueDB::Iterator &it)
786 : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
787 }
788
789 bool valid() const override {
790 return m_chunk_iter != m_chunk.end();
791 }
792
793 const ghobject_t &oid() const override {
794 ceph_assert(valid());
795
796 return m_chunk_iter->first;
797 }
798
799 void lower_bound(const ghobject_t &oid) override {
800 std::string key;
801 _key_encode_prefix(oid, &key);
802
803 m_it->lower_bound(key);
804 m_chunk_iter = m_chunk.end();
805 if (!get_next_chunk()) {
806 return;
807 }
808
809 if (this->oid().shard_id != oid.shard_id ||
810 this->oid().hobj.pool != oid.hobj.pool ||
811 this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
812 return;
813 }
814
815 m_chunk_iter = m_chunk.lower_bound(oid);
816 if (m_chunk_iter == m_chunk.end()) {
817 get_next_chunk();
818 }
819 }
820
821 void upper_bound(const ghobject_t &oid) override {
822 lower_bound(oid);
823
824 if (valid() && this->oid() == oid) {
825 next();
826 }
827 }
828
829 void next() override {
830 ceph_assert(valid());
831
832 m_chunk_iter++;
833 if (m_chunk_iter == m_chunk.end()) {
834 get_next_chunk();
835 }
836 }
837
838 int cmp(const ghobject_t &oid) const override {
839 ceph_assert(valid());
840
841 if (this->oid() < oid) {
842 return -1;
843 }
844 if (this->oid() > oid) {
845 return 1;
846 }
847 return 0;
848 }
849
850 private:
851 std::map<ghobject_t, std::string> m_chunk;
852 std::map<ghobject_t, std::string>::iterator m_chunk_iter;
853
854 bool get_next_chunk() {
855 while (m_it->valid() && is_extent_shard_key(m_it->key())) {
856 m_it->next();
857 }
858
859 if (!m_it->valid()) {
860 return false;
861 }
862
863 ghobject_t oid;
864 int r = get_key_object(m_it->key(), &oid);
865 ceph_assert(r == 0);
866
867 m_chunk.clear();
868 while (true) {
869 m_chunk.insert({oid, m_it->key()});
870
871 do {
872 m_it->next();
873 } while (m_it->valid() && is_extent_shard_key(m_it->key()));
874
875 if (!m_it->valid()) {
876 break;
877 }
878
879 ghobject_t next;
880 r = get_key_object(m_it->key(), &next);
881 ceph_assert(r == 0);
882 if (next.shard_id != oid.shard_id ||
883 next.hobj.pool != oid.hobj.pool ||
884 next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
885 break;
886 }
887 oid = next;
888 }
889
890 m_chunk_iter = m_chunk.begin();
891 return true;
892 }
893 };
894
895 } // anonymous namespace
896
897 // Garbage Collector
898
899 void BlueStore::GarbageCollector::process_protrusive_extents(
900 const BlueStore::ExtentMap& extent_map,
901 uint64_t start_offset,
902 uint64_t end_offset,
903 uint64_t start_touch_offset,
904 uint64_t end_touch_offset,
905 uint64_t min_alloc_size)
906 {
907 ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
908
909 uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
910 uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
911
912 dout(30) << __func__ << " (hex): [" << std::hex
913 << lookup_start_offset << ", " << lookup_end_offset
914 << ")" << std::dec << dendl;
915
916 for (auto it = extent_map.seek_lextent(lookup_start_offset);
917 it != extent_map.extent_map.end() &&
918 it->logical_offset < lookup_end_offset;
919 ++it) {
920 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
921 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
922
923 dout(30) << __func__ << " " << *it
924 << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
925 << dendl;
926
927 Blob* b = it->blob.get();
928
929 if (it->logical_offset >= start_touch_offset &&
930 it->logical_end() <= end_touch_offset) {
931 // Process extents within the range affected by
932 // the current write request.
933 // Need to take into account if existing extents
934 // can be merged with them (uncompressed case)
935 if (!b->get_blob().is_compressed()) {
936 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
937 --blob_info_counted->expected_allocations; // don't need to allocate
938 // new AU for compressed
939 // data since another
940 // collocated uncompressed
941 // blob already exists
942 dout(30) << __func__ << " --expected:"
943 << alloc_unit_start << dendl;
944 }
945 used_alloc_unit = alloc_unit_end;
946 blob_info_counted = nullptr;
947 }
948 } else if (b->get_blob().is_compressed()) {
949
950 // additionally we take compressed blobs that were not impacted
951 // by the write into account too
952 BlobInfo& bi =
953 affected_blobs.emplace(
954 b, BlobInfo(b->get_referenced_bytes())).first->second;
955
956 int adjust =
957 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
958 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
959 dout(30) << __func__ << " expected_allocations="
960 << bi.expected_allocations << " end_au:"
961 << alloc_unit_end << dendl;
962
963 blob_info_counted = &bi;
964 used_alloc_unit = alloc_unit_end;
965
966 ceph_assert(it->length <= bi.referenced_bytes);
967 bi.referenced_bytes -= it->length;
968 dout(30) << __func__ << " affected_blob:" << *b
969 << " unref 0x" << std::hex << it->length
970 << " referenced = 0x" << bi.referenced_bytes
971 << std::dec << dendl;
972 // NOTE: we can't move specific blob to resulting GC list here
973 // when reference counter == 0 since subsequent extents might
974 // decrement its expected_allocation.
975 // Hence need to enumerate all the extents first.
976 if (!bi.collect_candidate) {
977 bi.first_lextent = it;
978 bi.collect_candidate = true;
979 }
980 bi.last_lextent = it;
981 } else {
982 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
983 // don't need to allocate new AU for compressed data since another
984 // collocated uncompressed blob already exists
985 --blob_info_counted->expected_allocations;
986 dout(30) << __func__ << " --expected_allocations:"
987 << alloc_unit_start << dendl;
988 }
989 used_alloc_unit = alloc_unit_end;
990 blob_info_counted = nullptr;
991 }
992 }
993
994 for (auto b_it = affected_blobs.begin();
995 b_it != affected_blobs.end();
996 ++b_it) {
997 Blob* b = b_it->first;
998 BlobInfo& bi = b_it->second;
999 if (bi.referenced_bytes == 0) {
1000 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
1001 int64_t blob_expected_for_release =
1002 round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
1003
1004 dout(30) << __func__ << " " << *(b_it->first)
1005 << " expected4release=" << blob_expected_for_release
1006 << " expected_allocations=" << bi.expected_allocations
1007 << dendl;
1008 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
1009 if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
1010 if (bi.collect_candidate) {
1011 auto it = bi.first_lextent;
1012 bool bExit = false;
1013 do {
1014 if (it->blob.get() == b) {
1015 extents_to_collect.insert(it->logical_offset, it->length);
1016 }
1017 bExit = it == bi.last_lextent;
1018 ++it;
1019 } while (!bExit);
1020 }
1021 expected_for_release += blob_expected_for_release;
1022 expected_allocations += bi.expected_allocations;
1023 }
1024 }
1025 }
1026 }
1027
1028 int64_t BlueStore::GarbageCollector::estimate(
1029 uint64_t start_offset,
1030 uint64_t length,
1031 const BlueStore::ExtentMap& extent_map,
1032 const BlueStore::old_extent_map_t& old_extents,
1033 uint64_t min_alloc_size)
1034 {
1035
1036 affected_blobs.clear();
1037 extents_to_collect.clear();
1038 used_alloc_unit = boost::optional<uint64_t>();
1039 blob_info_counted = nullptr;
1040
1041 uint64_t gc_start_offset = start_offset;
1042 uint64_t gc_end_offset = start_offset + length;
1043
1044 uint64_t end_offset = start_offset + length;
1045
1046 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
1047 Blob* b = it->e.blob.get();
1048 if (b->get_blob().is_compressed()) {
1049
1050 // update gc_start_offset/gc_end_offset if needed
1051 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
1052 gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());
1053
1054 auto o = it->e.logical_offset;
1055 auto l = it->e.length;
1056
1057 uint64_t ref_bytes = b->get_referenced_bytes();
1058 // micro optimization to bypass blobs that have no more references
1059 if (ref_bytes != 0) {
1060 dout(30) << __func__ << " affected_blob:" << *b
1061 << " unref 0x" << std::hex << o << "~" << l
1062 << std::dec << dendl;
1063 affected_blobs.emplace(b, BlobInfo(ref_bytes));
1064 }
1065 }
1066 }
1067 dout(30) << __func__ << " gc range(hex): [" << std::hex
1068 << gc_start_offset << ", " << gc_end_offset
1069 << ")" << std::dec << dendl;
1070
1071 // enumerate preceding extents to check if they reference affected blobs
1072 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
1073 process_protrusive_extents(extent_map,
1074 gc_start_offset,
1075 gc_end_offset,
1076 start_offset,
1077 end_offset,
1078 min_alloc_size);
1079 }
1080 return expected_for_release - expected_allocations;
1081 }
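// A rough worked example of the heuristic above (numbers are illustrative):
// if a compressed blob occupies 64 KiB on disk with min_alloc_size = 16 KiB,
// releasing it frees 4 allocation units; if rewriting the data it still
// references is expected to consume 1 new unit, the benefit is 4 - 1 = 3,
// and the blob is collected once that benefit reaches
// bluestore_gc_enable_blob_threshold.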
1082
1083 // LruOnodeCacheShard
1084 struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
1085 typedef boost::intrusive::list<
1086 BlueStore::Onode,
1087 boost::intrusive::member_hook<
1088 BlueStore::Onode,
1089 boost::intrusive::list_member_hook<>,
1090 &BlueStore::Onode::lru_item> > list_t;
1091
1092 list_t lru;
1093
1094 explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
1095
1096 void _add(BlueStore::Onode* o, int level) override
1097 {
1098 if (o->put_cache()) {
1099 (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
1100 } else {
1101 ++num_pinned;
1102 }
1103 ++num; // we count both pinned and unpinned entries
1104 dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
1105 }
1106 void _rm(BlueStore::Onode* o) override
1107 {
1108 if (o->pop_cache()) {
1109 lru.erase(lru.iterator_to(*o));
1110 } else {
1111 ceph_assert(num_pinned);
1112 --num_pinned;
1113 }
1114 ceph_assert(num);
1115 --num;
1116 dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
1117 }
1118 void _pin(BlueStore::Onode* o) override
1119 {
1120 lru.erase(lru.iterator_to(*o));
1121 ++num_pinned;
1122 dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
1123 }
1124 void _unpin(BlueStore::Onode* o) override
1125 {
1126 lru.push_front(*o);
1127 ceph_assert(num_pinned);
1128 --num_pinned;
1129 dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
1130 }
1131 void _unpin_and_rm(BlueStore::Onode* o) override
1132 {
1133 o->pop_cache();
1134 ceph_assert(num_pinned);
1135 --num_pinned;
1136 ceph_assert(num);
1137 --num;
1138 }
1139 void _trim_to(uint64_t new_size) override
1140 {
1141 if (new_size >= lru.size()) {
1142 return; // don't even try
1143 }
1144 uint64_t n = lru.size() - new_size;
1145 auto p = lru.end();
1146 ceph_assert(p != lru.begin());
1147 --p;
1148 ceph_assert(num >= n);
1149 num -= n;
1150 while (n-- > 0) {
1151 BlueStore::Onode *o = &*p;
1152 dout(20) << __func__ << " rm " << o->oid << " "
1153 << o->nref << " " << o->cached << " " << o->pinned << dendl;
1154 if (p != lru.begin()) {
1155 lru.erase(p--);
1156 } else {
1157 ceph_assert(n == 0);
1158 lru.erase(p);
1159 }
1160 auto pinned = !o->pop_cache();
1161 ceph_assert(!pinned);
1162 o->c->onode_map._remove(o->oid);
1163 }
1164 }
1165 void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
1166 {
1167 if (to == this) {
1168 return;
1169 }
1170 ceph_assert(o->cached);
1171 ceph_assert(o->pinned);
1172 ceph_assert(num);
1173 ceph_assert(num_pinned);
1174 --num_pinned;
1175 --num;
1176 ++to->num_pinned;
1177 ++to->num;
1178 }
1179 void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
1180 {
1181 *onodes += num;
1182 *pinned_onodes += num_pinned;
1183 }
1184 };
1185
1186 // OnodeCacheShard
1187 BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
1188 CephContext* cct,
1189 string type,
1190 PerfCounters *logger)
1191 {
1192 BlueStore::OnodeCacheShard *c = nullptr;
1193 // Currently we only implement an LRU cache for onodes
1194 c = new LruOnodeCacheShard(cct);
1195 c->logger = logger;
1196 return c;
1197 }
1198
1199 // LruBufferCacheShard
1200 struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
1201 typedef boost::intrusive::list<
1202 BlueStore::Buffer,
1203 boost::intrusive::member_hook<
1204 BlueStore::Buffer,
1205 boost::intrusive::list_member_hook<>,
1206 &BlueStore::Buffer::lru_item> > list_t;
1207 list_t lru;
1208
1209 explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
1210
1211 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
1212 if (near) {
1213 auto q = lru.iterator_to(*near);
1214 lru.insert(q, *b);
1215 } else if (level > 0) {
1216 lru.push_front(*b);
1217 } else {
1218 lru.push_back(*b);
1219 }
1220 buffer_bytes += b->length;
1221 num = lru.size();
1222 }
1223 void _rm(BlueStore::Buffer *b) override {
1224 ceph_assert(buffer_bytes >= b->length);
1225 buffer_bytes -= b->length;
1226 auto q = lru.iterator_to(*b);
1227 lru.erase(q);
1228 num = lru.size();
1229 }
1230 void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
1231 src->_rm(b);
1232 _add(b, 0, nullptr);
1233 }
1234 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
1235 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1236 buffer_bytes += delta;
1237 }
1238 void _touch(BlueStore::Buffer *b) override {
1239 auto p = lru.iterator_to(*b);
1240 lru.erase(p);
1241 lru.push_front(*b);
1242 num = lru.size();
1243 _audit("_touch_buffer end");
1244 }
1245
1246 void _trim_to(uint64_t max) override
1247 {
1248 while (buffer_bytes > max) {
1249 auto i = lru.rbegin();
1250 if (i == lru.rend()) {
1251 // stop if lru is now empty
1252 break;
1253 }
1254
1255 BlueStore::Buffer *b = &*i;
1256 ceph_assert(b->is_clean());
1257 dout(20) << __func__ << " rm " << *b << dendl;
1258 b->space->_rm_buffer(this, b);
1259 }
1260 num = lru.size();
1261 }
1262
1263 void add_stats(uint64_t *extents,
1264 uint64_t *blobs,
1265 uint64_t *buffers,
1266 uint64_t *bytes) override {
1267 *extents += num_extents;
1268 *blobs += num_blobs;
1269 *buffers += num;
1270 *bytes += buffer_bytes;
1271 }
1272 #ifdef DEBUG_CACHE
1273 void _audit(const char *when) override
1274 {
1275 dout(10) << __func__ << " " << when << " start" << dendl;
1276 uint64_t s = 0;
1277 for (auto i = lru.begin(); i != lru.end(); ++i) {
1278 s += i->length;
1279 }
1280 if (s != buffer_bytes) {
1281 derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
1282 << dendl;
1283 for (auto i = lru.begin(); i != lru.end(); ++i) {
1284 derr << __func__ << " " << *i << dendl;
1285 }
1286 ceph_assert(s == buffer_bytes);
1287 }
1288 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1289 << " ok" << dendl;
1290 }
1291 #endif
1292 };
1293
1294 // TwoQBufferCacheShard
1295
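// A short sketch of the 2Q scheme as implemented below: new buffers enter
// warm_in ("A1in"); when warm_in is trimmed, the buffer's data is dropped and
// the now-empty buffer is remembered in warm_out ("A1out"); when data for a
// buffer tracked in warm_out is added again (via the discard hint in _add),
// it is promoted to hot ("Am"). Touching a warm_in buffer deliberately does
// not promote it.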
1296 struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
1297 typedef boost::intrusive::list<
1298 BlueStore::Buffer,
1299 boost::intrusive::member_hook<
1300 BlueStore::Buffer,
1301 boost::intrusive::list_member_hook<>,
1302 &BlueStore::Buffer::lru_item> > list_t;
1303 list_t hot; ///< "Am" hot buffers
1304 list_t warm_in; ///< "A1in" newly warm buffers
1305 list_t warm_out; ///< "A1out" empty buffers we've evicted
1306 uint64_t buffer_bytes = 0; ///< bytes
1307
1308 enum {
1309 BUFFER_NEW = 0,
1310 BUFFER_WARM_IN, ///< in warm_in
1311 BUFFER_WARM_OUT, ///< in warm_out
1312 BUFFER_HOT, ///< in hot
1313 BUFFER_TYPE_MAX
1314 };
1315
1316 uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
1317
1318 public:
1319 explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
1320
1321 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
1322 {
1323 dout(20) << __func__ << " level " << level << " near " << near
1324 << " on " << *b
1325 << " which has cache_private " << b->cache_private << dendl;
1326 if (near) {
1327 b->cache_private = near->cache_private;
1328 switch (b->cache_private) {
1329 case BUFFER_WARM_IN:
1330 warm_in.insert(warm_in.iterator_to(*near), *b);
1331 break;
1332 case BUFFER_WARM_OUT:
1333 ceph_assert(b->is_empty());
1334 warm_out.insert(warm_out.iterator_to(*near), *b);
1335 break;
1336 case BUFFER_HOT:
1337 hot.insert(hot.iterator_to(*near), *b);
1338 break;
1339 default:
1340 ceph_abort_msg("bad cache_private");
1341 }
1342 } else if (b->cache_private == BUFFER_NEW) {
1343 b->cache_private = BUFFER_WARM_IN;
1344 if (level > 0) {
1345 warm_in.push_front(*b);
1346 } else {
1347 // take caller hint to start at the back of the warm queue
1348 warm_in.push_back(*b);
1349 }
1350 } else {
1351 // we got a hint from discard
1352 switch (b->cache_private) {
1353 case BUFFER_WARM_IN:
1354 // stay in warm_in. move to front, even though 2Q doesn't actually
1355 // do this.
1356 dout(20) << __func__ << " move to front of warm " << *b << dendl;
1357 warm_in.push_front(*b);
1358 break;
1359 case BUFFER_WARM_OUT:
1360 b->cache_private = BUFFER_HOT;
1361 // move to hot. fall-thru
1362 case BUFFER_HOT:
1363 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1364 hot.push_front(*b);
1365 break;
1366 default:
1367 ceph_abort_msg("bad cache_private");
1368 }
1369 }
1370 if (!b->is_empty()) {
1371 buffer_bytes += b->length;
1372 list_bytes[b->cache_private] += b->length;
1373 }
1374 num = hot.size() + warm_in.size();
1375 }
1376
1377 void _rm(BlueStore::Buffer *b) override
1378 {
1379 dout(20) << __func__ << " " << *b << dendl;
1380 if (!b->is_empty()) {
1381 ceph_assert(buffer_bytes >= b->length);
1382 buffer_bytes -= b->length;
1383 ceph_assert(list_bytes[b->cache_private] >= b->length);
1384 list_bytes[b->cache_private] -= b->length;
1385 }
1386 switch (b->cache_private) {
1387 case BUFFER_WARM_IN:
1388 warm_in.erase(warm_in.iterator_to(*b));
1389 break;
1390 case BUFFER_WARM_OUT:
1391 warm_out.erase(warm_out.iterator_to(*b));
1392 break;
1393 case BUFFER_HOT:
1394 hot.erase(hot.iterator_to(*b));
1395 break;
1396 default:
1397 ceph_abort_msg("bad cache_private");
1398 }
1399 num = hot.size() + warm_in.size();
1400 }
1401
1402 void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
1403 {
1404 TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
1405 src->_rm(b);
1406
1407 // preserve which list we're on (even if we can't preserve the order!)
1408 switch (b->cache_private) {
1409 case BUFFER_WARM_IN:
1410 ceph_assert(!b->is_empty());
1411 warm_in.push_back(*b);
1412 break;
1413 case BUFFER_WARM_OUT:
1414 ceph_assert(b->is_empty());
1415 warm_out.push_back(*b);
1416 break;
1417 case BUFFER_HOT:
1418 ceph_assert(!b->is_empty());
1419 hot.push_back(*b);
1420 break;
1421 default:
1422 ceph_abort_msg("bad cache_private");
1423 }
1424 if (!b->is_empty()) {
1425 buffer_bytes += b->length;
1426 list_bytes[b->cache_private] += b->length;
1427 }
1428 num = hot.size() + warm_in.size();
1429 }
1430
1431 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
1432 {
1433 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1434 if (!b->is_empty()) {
1435 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1436 buffer_bytes += delta;
1437 ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
1438 list_bytes[b->cache_private] += delta;
1439 }
1440 }
1441
1442 void _touch(BlueStore::Buffer *b) override {
1443 switch (b->cache_private) {
1444 case BUFFER_WARM_IN:
1445 // do nothing (somewhat counter-intuitively!)
1446 break;
1447 case BUFFER_WARM_OUT:
1448 // move from warm_out to hot LRU
1449 ceph_abort_msg("this happens via discard hint");
1450 break;
1451 case BUFFER_HOT:
1452 // move to front of hot LRU
1453 hot.erase(hot.iterator_to(*b));
1454 hot.push_front(*b);
1455 break;
1456 }
1457 num = hot.size() + warm_in.size();
1458 _audit("_touch_buffer end");
1459 }
1460
1461 void _trim_to(uint64_t max) override
1462 {
1463 if (buffer_bytes > max) {
1464 uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
1465 uint64_t khot = max - kin;
1466
1467 // pre-calculate kout based on the average buffer size too, since that is
1468 // typically representative (the warm_in and hot lists may change later)
1469 uint64_t kout = 0;
1470 uint64_t buffer_num = hot.size() + warm_in.size();
1471 if (buffer_num) {
1472 uint64_t avg_size = buffer_bytes / buffer_num;
1473 ceph_assert(avg_size);
1474 uint64_t calculated_num = max / avg_size;
1475 kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1476 }
1477
1478 if (list_bytes[BUFFER_HOT] < khot) {
1479 // hot is small, give slack to warm_in
1480 kin += khot - list_bytes[BUFFER_HOT];
1481 } else if (list_bytes[BUFFER_WARM_IN] < kin) {
1482 // warm_in is small, give slack to hot
1483 khot += kin - list_bytes[BUFFER_WARM_IN];
1484 }
1485
1486 // adjust warm_in list
1487 int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
1488 uint64_t evicted = 0;
1489
1490 while (to_evict_bytes > 0) {
1491 auto p = warm_in.rbegin();
1492 if (p == warm_in.rend()) {
1493 // stop if warm_in list is now empty
1494 break;
1495 }
1496
1497 BlueStore::Buffer *b = &*p;
1498 ceph_assert(b->is_clean());
1499 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1500 ceph_assert(buffer_bytes >= b->length);
1501 buffer_bytes -= b->length;
1502 ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
1503 list_bytes[BUFFER_WARM_IN] -= b->length;
1504 to_evict_bytes -= b->length;
1505 evicted += b->length;
1506 b->state = BlueStore::Buffer::STATE_EMPTY;
1507 b->data.clear();
1508 warm_in.erase(warm_in.iterator_to(*b));
1509 warm_out.push_front(*b);
1510 b->cache_private = BUFFER_WARM_OUT;
1511 }
1512
1513 if (evicted > 0) {
1514 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1515 << " from warm_in list, done evicting warm_in buffers"
1516 << dendl;
1517 }
1518
1519 // adjust hot list
1520 to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
1521 evicted = 0;
1522
1523 while (to_evict_bytes > 0) {
1524 auto p = hot.rbegin();
1525 if (p == hot.rend()) {
1526 // stop if hot list is now empty
1527 break;
1528 }
1529
1530 BlueStore::Buffer *b = &*p;
1531 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1532 ceph_assert(b->is_clean());
1533 // adjust evict size before buffer goes invalid
1534 to_evict_bytes -= b->length;
1535 evicted += b->length;
1536 b->space->_rm_buffer(this, b);
1537 }
1538
1539 if (evicted > 0) {
1540 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1541 << " from hot list, done evicting hot buffers"
1542 << dendl;
1543 }
1544
1545 // adjust warm out list too, if necessary
1546 int64_t n = warm_out.size() - kout;
1547 while (n-- > 0) {
1548 BlueStore::Buffer *b = &*warm_out.rbegin();
1549 ceph_assert(b->is_empty());
1550 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1551 b->space->_rm_buffer(this, b);
1552 }
1553 }
1554 num = hot.size() + warm_in.size();
1555 }
1556
1557 void add_stats(uint64_t *extents,
1558 uint64_t *blobs,
1559 uint64_t *buffers,
1560 uint64_t *bytes) override {
1561 *extents += num_extents;
1562 *blobs += num_blobs;
1563 *buffers += num;
1564 *bytes += buffer_bytes;
1565 }
1566
1567 #ifdef DEBUG_CACHE
1568 void _audit(const char *when) override
1569 {
1570 dout(10) << __func__ << " " << when << " start" << dendl;
1571 uint64_t s = 0;
1572 for (auto i = hot.begin(); i != hot.end(); ++i) {
1573 s += i->length;
1574 }
1575
1576 uint64_t hot_bytes = s;
1577 if (hot_bytes != list_bytes[BUFFER_HOT]) {
1578 derr << __func__ << " hot_list_bytes "
1579 << list_bytes[BUFFER_HOT]
1580 << " != actual " << hot_bytes
1581 << dendl;
1582 ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
1583 }
1584
1585 for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
1586 s += i->length;
1587 }
1588
1589 uint64_t warm_in_bytes = s - hot_bytes;
1590 if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
1591 derr << __func__ << " warm_in_list_bytes "
1592 << list_bytes[BUFFER_WARM_IN]
1593 << " != actual " << warm_in_bytes
1594 << dendl;
1595 ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
1596 }
1597
1598 if (s != buffer_bytes) {
1599 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1600 << dendl;
1601 ceph_assert(s == buffer_bytes);
1602 }
1603
1604 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1605 << " ok" << dendl;
1606 }
1607 #endif
1608 };
1609
1610 // BufferCacheShard
1611
1612 BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
1613 CephContext* cct,
1614 string type,
1615 PerfCounters *logger)
1616 {
1617 BufferCacheShard *c = nullptr;
1618 if (type == "lru")
1619 c = new LruBufferCacheShard(cct);
1620 else if (type == "2q")
1621 c = new TwoQBufferCacheShard(cct);
1622 else
1623 ceph_abort_msg("unrecognized cache type");
1624 c->logger = logger;
1625 return c;
1626 }
1627
1628 // BufferSpace
1629
1630 #undef dout_prefix
1631 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1632
1633 void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
1634 {
1635 // note: we already hold cache->lock
1636 ldout(cache->cct, 20) << __func__ << dendl;
1637 while (!buffer_map.empty()) {
1638 _rm_buffer(cache, buffer_map.begin());
1639 }
1640 }
1641
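// Drop any cached data overlapping [offset, offset+length): overlapping
// buffers are truncated, split or removed as needed. Returns the highest
// cache_private value seen among the overlapping buffers, so a caller can
// carry it over as a placement hint when re-inserting data for the same range
// (see the "hint from discard" path in TwoQBufferCacheShard::_add).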
1642 int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
1643 {
1644 // note: we already hold cache->lock
1645 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1646 << std::dec << dendl;
1647 int cache_private = 0;
1648 cache->_audit("discard start");
1649 auto i = _data_lower_bound(offset);
1650 uint32_t end = offset + length;
1651 while (i != buffer_map.end()) {
1652 Buffer *b = i->second.get();
1653 if (b->offset >= end) {
1654 break;
1655 }
1656 if (b->cache_private > cache_private) {
1657 cache_private = b->cache_private;
1658 }
1659 if (b->offset < offset) {
1660 int64_t front = offset - b->offset;
1661 if (b->end() > end) {
1662 // drop middle (split)
1663 uint32_t tail = b->end() - end;
1664 if (b->data.length()) {
1665 bufferlist bl;
1666 bl.substr_of(b->data, b->length - tail, tail);
1667 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1668 nb->maybe_rebuild();
1669 _add_buffer(cache, nb, 0, b);
1670 } else {
1671 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1672 0, b);
1673 }
1674 if (!b->is_writing()) {
1675 cache->_adjust_size(b, front - (int64_t)b->length);
1676 }
1677 b->truncate(front);
1678 b->maybe_rebuild();
1679 cache->_audit("discard end 1");
1680 break;
1681 } else {
1682 // drop tail
1683 if (!b->is_writing()) {
1684 cache->_adjust_size(b, front - (int64_t)b->length);
1685 }
1686 b->truncate(front);
1687 b->maybe_rebuild();
1688 ++i;
1689 continue;
1690 }
1691 }
1692 if (b->end() <= end) {
1693 // drop entire buffer
1694 _rm_buffer(cache, i++);
1695 continue;
1696 }
1697 // drop front
1698 uint32_t keep = b->end() - end;
1699 if (b->data.length()) {
1700 bufferlist bl;
1701 bl.substr_of(b->data, b->length - keep, keep);
1702 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1703 nb->maybe_rebuild();
1704 _add_buffer(cache, nb, 0, b);
1705 } else {
1706 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1707 }
1708 _rm_buffer(cache, i);
1709 cache->_audit("discard end 2");
1710 break;
1711 }
1712 return cache_private;
1713 }
1714
1715 void BlueStore::BufferSpace::read(
1716 BufferCacheShard* cache,
1717 uint32_t offset,
1718 uint32_t length,
1719 BlueStore::ready_regions_t& res,
1720 interval_set<uint32_t>& res_intervals,
1721 int flags)
1722 {
1723 res.clear();
1724 res_intervals.clear();
1725 uint32_t want_bytes = length;
1726 uint32_t end = offset + length;
1727
1728 {
1729 std::lock_guard l(cache->lock);
1730 for (auto i = _data_lower_bound(offset);
1731 i != buffer_map.end() && offset < end && i->first < end;
1732 ++i) {
1733 Buffer *b = i->second.get();
1734 ceph_assert(b->end() > offset);
1735
1736 bool val = false;
1737 if (flags & BYPASS_CLEAN_CACHE)
1738 val = b->is_writing();
1739 else
1740 val = b->is_writing() || b->is_clean();
1741 if (val) {
1742 if (b->offset < offset) {
1743 uint32_t skip = offset - b->offset;
1744 uint32_t l = min(length, b->length - skip);
1745 res[offset].substr_of(b->data, skip, l);
1746 res_intervals.insert(offset, l);
1747 offset += l;
1748 length -= l;
1749 if (!b->is_writing()) {
1750 cache->_touch(b);
1751 }
1752 continue;
1753 }
1754 if (b->offset > offset) {
1755 uint32_t gap = b->offset - offset;
1756 if (length <= gap) {
1757 break;
1758 }
1759 offset += gap;
1760 length -= gap;
1761 }
1762 if (!b->is_writing()) {
1763 cache->_touch(b);
1764 }
1765 if (b->length > length) {
1766 res[offset].substr_of(b->data, 0, length);
1767 res_intervals.insert(offset, length);
1768 break;
1769 } else {
1770 res[offset].append(b->data);
1771 res_intervals.insert(offset, b->length);
1772 if (b->length == length)
1773 break;
1774 offset += b->length;
1775 length -= b->length;
1776 }
1777 }
1778 }
1779 }
1780
1781 uint64_t hit_bytes = res_intervals.size();
1782 ceph_assert(hit_bytes <= want_bytes);
1783 uint64_t miss_bytes = want_bytes - hit_bytes;
1784 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1785 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1786 }
1787
1788 void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
1789 {
1790 auto i = writing.begin();
1791 while (i != writing.end()) {
1792 if (i->seq > seq) {
1793 break;
1794 }
1795 if (i->seq < seq) {
1796 ++i;
1797 continue;
1798 }
1799
1800 Buffer *b = &*i;
1801 ceph_assert(b->is_writing());
1802
1803 if (b->flags & Buffer::FLAG_NOCACHE) {
1804 writing.erase(i++);
1805 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1806 buffer_map.erase(b->offset);
1807 } else {
1808 b->state = Buffer::STATE_CLEAN;
1809 writing.erase(i++);
1810 b->maybe_rebuild();
1811 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1812 cache->_add(b, 1, nullptr);
1813 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1814 }
1815 }
1816 cache->_trim();
1817 cache->_audit("finish_write end");
1818 }
1819
1820 void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
1821 {
1822 std::lock_guard lk(cache->lock);
1823 if (buffer_map.empty())
1824 return;
1825
1826 auto p = --buffer_map.end();
1827 while (true) {
1828 if (p->second->end() <= pos)
1829 break;
1830
1831 if (p->second->offset < pos) {
1832 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1833 size_t left = pos - p->second->offset;
1834 size_t right = p->second->length - left;
1835 if (p->second->data.length()) {
1836 bufferlist bl;
1837 bl.substr_of(p->second->data, left, right);
1838 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1839 0, p->second.get());
1840 } else {
1841 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1842 0, p->second.get());
1843 }
1844 cache->_adjust_size(p->second.get(), -right);
1845 p->second->truncate(left);
1846 break;
1847 }
1848
1849 ceph_assert(p->second->end() > pos);
1850 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1851 if (p->second->data.length()) {
1852 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1853 p->second->offset - pos, p->second->data),
1854 0, p->second.get());
1855 } else {
1856 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1857 p->second->offset - pos, p->second->length),
1858 0, p->second.get());
1859 }
1860 if (p == buffer_map.begin()) {
1861 _rm_buffer(cache, p);
1862 break;
1863 } else {
1864 _rm_buffer(cache, p--);
1865 }
1866 }
1867 ceph_assert(writing.empty());
1868 cache->_trim();
1869 }
1870
1871 // OnodeSpace
1872
1873 #undef dout_prefix
1874 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1875
1876 BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
1877 OnodeRef& o)
1878 {
1879 std::lock_guard l(cache->lock);
1880 auto p = onode_map.find(oid);
1881 if (p != onode_map.end()) {
1882 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1883 << " raced, returning existing " << p->second
1884 << dendl;
1885 return p->second;
1886 }
1887 ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
1888 onode_map[oid] = o;
1889 cache->_add(o.get(), 1);
1890 cache->_trim();
1891 return o;
1892 }
1893
1894 void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
1895 {
1896 ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
1897 onode_map.erase(oid);
1898 }
1899
1900 BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1901 {
1902 ldout(cache->cct, 30) << __func__ << dendl;
1903 OnodeRef o;
1904 bool hit = false;
1905
1906 {
1907 std::lock_guard l(cache->lock);
1908 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1909 if (p == onode_map.end()) {
1910 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1911 } else {
1912 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1913 << " " << p->second->nref
1914 << " " << p->second->cached
1915 << " " << p->second->pinned
1916 << dendl;
1917 // This will pin the onode and implicitly touch the cache when the
1918 // Onode eventually becomes unpinned
1919 o = p->second;
1920 ceph_assert(!o->cached || o->pinned);
1921
1922 hit = true;
1923 }
1924 }
1925
1926 if (hit) {
1927 cache->logger->inc(l_bluestore_onode_hits);
1928 } else {
1929 cache->logger->inc(l_bluestore_onode_misses);
1930 }
1931 return o;
1932 }
1933
1934 void BlueStore::OnodeSpace::clear()
1935 {
1936 std::lock_guard l(cache->lock);
1937 ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
1938 for (auto &p : onode_map) {
1939 cache->_rm(p.second.get());
1940 }
1941 onode_map.clear();
1942 }
1943
1944 bool BlueStore::OnodeSpace::empty()
1945 {
1946 std::lock_guard l(cache->lock);
1947 return onode_map.empty();
1948 }
1949
1950 void BlueStore::OnodeSpace::rename(
1951 OnodeRef& oldo,
1952 const ghobject_t& old_oid,
1953 const ghobject_t& new_oid,
1954 const mempool::bluestore_cache_meta::string& new_okey)
1955 {
1956 std::lock_guard l(cache->lock);
1957 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1958 << dendl;
1959 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1960 po = onode_map.find(old_oid);
1961 pn = onode_map.find(new_oid);
1962 ceph_assert(po != pn);
1963
1964 ceph_assert(po != onode_map.end());
1965 if (pn != onode_map.end()) {
1966 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1967 << dendl;
1968 cache->_rm(pn->second.get());
1969 onode_map.erase(pn);
1970 }
1971 OnodeRef o = po->second;
1972
1973 // install a non-existent onode at old location
1974 oldo.reset(new Onode(o->c, old_oid, o->key));
1975 po->second = oldo;
1976 cache->_add(oldo.get(), 1);
1977 // add at new position and fix oid, key.
1978   // This will pin 'o' and implicitly touch the cache
1979   // when it eventually becomes unpinned
1980 onode_map.insert(make_pair(new_oid, o));
1981 ceph_assert(o->pinned);
1982
1983 o->oid = new_oid;
1984 o->key = new_okey;
1985 cache->_trim();
1986 }
1987
1988 bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
1989 {
1990 std::lock_guard l(cache->lock);
1991 ldout(cache->cct, 20) << __func__ << dendl;
1992 for (auto& i : onode_map) {
1993 if (f(i.second.get())) {
1994 return true;
1995 }
1996 }
1997 return false;
1998 }
1999
2000 template <int LogLevelV = 30>
2001 void BlueStore::OnodeSpace::dump(CephContext *cct)
2002 {
2003 for (auto& i : onode_map) {
2004 ldout(cct, LogLevelV) << i.first << " : " << i.second
2005 << " " << i.second->nref
2006 << " " << i.second->cached
2007 << " " << i.second->pinned
2008 << dendl;
2009 }
2010 }
2011
2012 // SharedBlob
2013
2014 #undef dout_prefix
2015 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
2016 #undef dout_context
2017 #define dout_context coll->store->cct
2018
2019 void BlueStore::SharedBlob::dump(Formatter* f) const
2020 {
2021 f->dump_bool("loaded", loaded);
2022 if (loaded) {
2023 persistent->dump(f);
2024 } else {
2025 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
2026 }
2027 }
2028
2029 ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
2030 {
2031 out << "SharedBlob(" << &sb;
2032
2033 if (sb.loaded) {
2034 out << " loaded " << *sb.persistent;
2035 } else {
2036 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
2037 }
2038 return out << ")";
2039 }
2040
2041 BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
2042 : coll(_coll), sbid_unloaded(i)
2043 {
2044 ceph_assert(sbid_unloaded > 0);
2045 if (get_cache()) {
2046 get_cache()->add_blob();
2047 }
2048 }
2049
2050 BlueStore::SharedBlob::~SharedBlob()
2051 {
2052 if (loaded && persistent) {
2053 delete persistent;
2054 }
2055 }
2056
2057 void BlueStore::SharedBlob::put()
2058 {
2059 if (--nref == 0) {
2060 dout(20) << __func__ << " " << this
2061 << " removing self from set " << get_parent()
2062 << dendl;
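    // Note: the collection pointer can change concurrently (e.g. when the
    // cache is split), so we snapshot it, take that cache's lock, and retry
    // from 'again' if it moved underneath us; removal from shared_blob_set
    // may also race with a concurrent lookup, in which case we must not
    // delete ourselves.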
2063 again:
2064 auto coll_snap = coll;
2065 if (coll_snap) {
2066 std::lock_guard l(coll_snap->cache->lock);
2067 if (coll_snap != coll) {
2068 goto again;
2069 }
2070 if (!coll_snap->shared_blob_set.remove(this, true)) {
2071 // race with lookup
2072 return;
2073 }
2074 bc._clear(coll_snap->cache);
2075 coll_snap->cache->rm_blob();
2076 }
2077 delete this;
2078 }
2079 }
2080
2081 void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
2082 {
2083 ceph_assert(persistent);
2084 persistent->ref_map.get(offset, length);
2085 }
2086
2087 void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
2088 PExtentVector *r,
2089 bool *unshare)
2090 {
2091 ceph_assert(persistent);
2092 persistent->ref_map.put(offset, length, r,
2093 unshare && !*unshare ? unshare : nullptr);
2094 }
2095
2096 void BlueStore::SharedBlob::finish_write(uint64_t seq)
2097 {
2098 while (true) {
2099 BufferCacheShard *cache = coll->cache;
2100 std::lock_guard l(cache->lock);
2101 if (coll->cache != cache) {
2102 dout(20) << __func__
2103 << " raced with sb cache update, was " << cache
2104 << ", now " << coll->cache << ", retrying"
2105 << dendl;
2106 continue;
2107 }
2108 bc._finish_write(cache, seq);
2109 break;
2110 }
2111 }
2112
2113 // SharedBlobSet
2114
2115 #undef dout_prefix
2116 #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2117
2118 template <int LogLevelV = 30>
2119 void BlueStore::SharedBlobSet::dump(CephContext *cct)
2120 {
2121 std::lock_guard l(lock);
2122 for (auto& i : sb_map) {
2123 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
2124 }
2125 }
2126
2127 // Blob
2128
2129 #undef dout_prefix
2130 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2131
2132 void BlueStore::Blob::dump(Formatter* f) const
2133 {
2134 if (is_spanning()) {
2135 f->dump_unsigned("spanning_id ", id);
2136 }
2137 blob.dump(f);
2138 if (shared_blob) {
2139 f->dump_object("shared", *shared_blob);
2140 }
2141 }
2142
2143 ostream& operator<<(ostream& out, const BlueStore::Blob& b)
2144 {
2145 out << "Blob(" << &b;
2146 if (b.is_spanning()) {
2147 out << " spanning " << b.id;
2148 }
2149 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
2150 if (b.shared_blob) {
2151 out << " " << *b.shared_blob;
2152 } else {
2153 out << " (shared_blob=NULL)";
2154 }
2155 out << ")";
2156 return out;
2157 }
2158
2159 void BlueStore::Blob::discard_unallocated(Collection *coll)
2160 {
2161 if (get_blob().is_shared()) {
2162 return;
2163 }
2164 if (get_blob().is_compressed()) {
2165 bool discard = false;
2166 bool all_invalid = true;
2167 for (auto e : get_blob().get_extents()) {
2168 if (!e.is_valid()) {
2169 discard = true;
2170 } else {
2171 all_invalid = false;
2172 }
2173 }
2174     ceph_assert(discard == all_invalid); // for a compressed blob either all
2175                                          // or none of the pextents are invalid.
2176 if (discard) {
2177 shared_blob->bc.discard(shared_blob->get_cache(), 0,
2178 get_blob().get_logical_length());
2179 }
2180 } else {
2181 size_t pos = 0;
2182 for (auto e : get_blob().get_extents()) {
2183 if (!e.is_valid()) {
2184 dout(20) << __func__ << " 0x" << std::hex << pos
2185 << "~" << e.length
2186 << std::dec << dendl;
2187 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
2188 }
2189 pos += e.length;
2190 }
2191 if (get_blob().can_prune_tail()) {
2192 dirty_blob().prune_tail();
2193 used_in_blob.prune_tail(get_blob().get_ondisk_length());
2194 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
2195 }
2196 }
2197 }
2198
2199 void BlueStore::Blob::get_ref(
2200 Collection *coll,
2201 uint32_t offset,
2202 uint32_t length)
2203 {
2204   // The caller has to initialize the Blob's logical length before incrementing
2205   // references. Otherwise one is unable to determine the required
2206   // number of counters for per-au tracking, or to obtain min_release_size
2207   // for single-counter mode.
2208 ceph_assert(get_blob().get_logical_length() != 0);
2209 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2210 << std::dec << " " << *this << dendl;
2211
2212 if (used_in_blob.is_empty()) {
2213 uint32_t min_release_size =
2214 get_blob().get_release_size(coll->store->min_alloc_size);
2215 uint64_t l = get_blob().get_logical_length();
2216 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
2217 << min_release_size << std::dec << dendl;
2218 used_in_blob.init(l, min_release_size);
2219 }
2220 used_in_blob.get(
2221 offset,
2222 length);
2223 }
2224
2225 bool BlueStore::Blob::put_ref(
2226 Collection *coll,
2227 uint32_t offset,
2228 uint32_t length,
2229 PExtentVector *r)
2230 {
2231 PExtentVector logical;
2232
2233 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2234 << std::dec << " " << *this << dendl;
2235
2236 bool empty = used_in_blob.put(
2237 offset,
2238 length,
2239 &logical);
2240 r->clear();
2241 // nothing to release
2242 if (!empty && logical.empty()) {
2243 return false;
2244 }
2245
2246 bluestore_blob_t& b = dirty_blob();
2247 return b.release_extents(empty, logical, r);
2248 }
2249
2250 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
2251 uint32_t target_blob_size,
2252 uint32_t b_offset,
2253 uint32_t *length0) {
2254 ceph_assert(min_alloc_size);
2255 ceph_assert(target_blob_size);
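  // Note: this decides whether the tail of this (mutable) blob can absorb a
  // new write of *length0 bytes at b_offset. The write must be csum-chunk
  // aligned (when csum is enabled), may only overlap currently unallocated
  // space within the blob, and may be shortened (via *length0) so the blob
  // stays within target_blob_size.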
2256 if (!get_blob().is_mutable()) {
2257 return false;
2258 }
2259
2260 uint32_t length = *length0;
2261 uint32_t end = b_offset + length;
2262
2263   // Currently, for the sake of simplicity, we omit blob reuse if the data is
2264   // unaligned with the csum chunk. Later we can perform padding if needed.
2265 if (get_blob().has_csum() &&
2266 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2267 (end % get_blob().get_csum_chunk_size()) != 0)) {
2268 return false;
2269 }
2270
2271 auto blen = get_blob().get_logical_length();
2272 uint32_t new_blen = blen;
2273
2274 // make sure target_blob_size isn't less than current blob len
2275 target_blob_size = std::max(blen, target_blob_size);
2276
2277 if (b_offset >= blen) {
2278 // new data totally stands out of the existing blob
2279 new_blen = end;
2280 } else {
2281 // new data overlaps with the existing blob
2282 new_blen = std::max(blen, end);
2283
2284 uint32_t overlap = 0;
2285 if (new_blen > blen) {
2286 overlap = blen - b_offset;
2287 } else {
2288 overlap = length;
2289 }
2290
2291 if (!get_blob().is_unallocated(b_offset, overlap)) {
2292 // abort if any piece of the overlap has already been allocated
2293 return false;
2294 }
2295 }
2296
2297 if (new_blen > blen) {
2298 int64_t overflow = int64_t(new_blen) - target_blob_size;
2299     // Unable to decrease the provided length enough to fit into target_blob_size
2300 if (overflow >= length) {
2301 return false;
2302 }
2303
2304 // FIXME: in some cases we could reduce unused resolution
2305 if (get_blob().has_unused()) {
2306 return false;
2307 }
2308
2309 if (overflow > 0) {
2310 new_blen -= overflow;
2311 length -= overflow;
2312 *length0 = length;
2313 }
2314
2315 if (new_blen > blen) {
2316 dirty_blob().add_tail(new_blen);
2317 used_in_blob.add_tail(new_blen,
2318 get_blob().get_release_size(min_alloc_size));
2319 }
2320 }
2321 return true;
2322 }
2323
2324 void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2325 {
2326 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2327 << " start " << *this << dendl;
2328 ceph_assert(blob.can_split());
2329 ceph_assert(used_in_blob.can_split());
2330 bluestore_blob_t &lb = dirty_blob();
2331 bluestore_blob_t &rb = r->dirty_blob();
2332
2333 used_in_blob.split(
2334 blob_offset,
2335 &(r->used_in_blob));
2336
2337 lb.split(blob_offset, rb);
2338 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2339
2340 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2341 << " finish " << *this << dendl;
2342 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2343 << " and " << *r << dendl;
2344 }
2345
2346 #ifndef CACHE_BLOB_BL
2347 void BlueStore::Blob::decode(
2348 Collection *coll,
2349 bufferptr::const_iterator& p,
2350 uint64_t struct_v,
2351 uint64_t* sbid,
2352 bool include_ref_map)
2353 {
2354 denc(blob, p, struct_v);
2355 if (blob.is_shared()) {
2356 denc(*sbid, p);
2357 }
2358 if (include_ref_map) {
2359 if (struct_v > 1) {
2360 used_in_blob.decode(p);
2361 } else {
2362 used_in_blob.clear();
2363 bluestore_extent_ref_map_t legacy_ref_map;
2364 legacy_ref_map.decode(p);
2365 for (auto r : legacy_ref_map.ref_map) {
2366 get_ref(
2367 coll,
2368 r.first,
2369 r.second.refs * r.second.length);
2370 }
2371 }
2372 }
2373 }
2374 #endif
2375
2376 // Extent
2377
2378 void BlueStore::Extent::dump(Formatter* f) const
2379 {
2380 f->dump_unsigned("logical_offset", logical_offset);
2381 f->dump_unsigned("length", length);
2382 f->dump_unsigned("blob_offset", blob_offset);
2383 f->dump_object("blob", *blob);
2384 }
2385
2386 ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2387 {
2388 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2389 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2390 << " " << *e.blob;
2391 }
2392
2393 // OldExtent
2394 BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2395 uint32_t lo,
2396 uint32_t o,
2397 uint32_t l,
2398 BlobRef& b) {
2399 OldExtent* oe = new OldExtent(lo, o, l, b);
2400 b->put_ref(c.get(), o, l, &(oe->r));
2401 oe->blob_empty = !b->is_referenced();
2402 return oe;
2403 }
2404
2405 // ExtentMap
2406
2407 #undef dout_prefix
2408 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2409 #undef dout_context
2410 #define dout_context onode->c->store->cct
2411
2412 BlueStore::ExtentMap::ExtentMap(Onode *o)
2413 : onode(o),
2414 inline_bl(
2415 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2416 }
2417
2418 void BlueStore::ExtentMap::dump(Formatter* f) const
2419 {
2420 f->open_array_section("extents");
2421
2422 for (auto& e : extent_map) {
2423 f->dump_object("extent", e);
2424 }
2425 f->close_section();
2426 }
2427
2428 void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2429 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2430 uint64_t& length, uint64_t& dstoff) {
2431
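  // Note: dup() clones the lextents of oldo in [srcoff, srcoff+length) into
  // newo at dstoff. Rather than copying data, each source blob is marked
  // shared (if it is not already) and its physical extents gain an extra
  // reference in the shared blob's ref_map, so both onodes reference the same
  // pextents until one of them is overwritten.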
2432 auto cct = onode->c->store->cct;
2433 bool inject_21040 =
2434 cct->_conf->bluestore_debug_inject_bug21040;
2435 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2436 for (auto& e : oldo->extent_map.extent_map) {
2437 e.blob->last_encoded_id = -1;
2438 }
2439
2440 int n = 0;
2441 uint64_t end = srcoff + length;
2442 uint32_t dirty_range_begin = 0;
2443 uint32_t dirty_range_end = 0;
2444 bool src_dirty = false;
2445 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2446 ep != oldo->extent_map.extent_map.end();
2447 ++ep) {
2448 auto& e = *ep;
2449 if (e.logical_offset >= end) {
2450 break;
2451 }
2452 dout(20) << __func__ << " src " << e << dendl;
2453 BlobRef cb;
2454 bool blob_duped = true;
2455 if (e.blob->last_encoded_id >= 0) {
2456 cb = id_to_blob[e.blob->last_encoded_id];
2457 blob_duped = false;
2458 } else {
2459 // dup the blob
2460 const bluestore_blob_t& blob = e.blob->get_blob();
2461 // make sure it is shared
2462 if (!blob.is_shared()) {
2463 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2464 if (!inject_21040 && !src_dirty) {
2465 src_dirty = true;
2466 dirty_range_begin = e.logical_offset;
2467 } else if (inject_21040 &&
2468 dirty_range_begin == 0 && dirty_range_end == 0) {
2469 dirty_range_begin = e.logical_offset;
2470 }
2471 ceph_assert(e.logical_end() > 0);
2472 // -1 to exclude next potential shard
2473 dirty_range_end = e.logical_end() - 1;
2474 } else {
2475 c->load_shared_blob(e.blob->shared_blob);
2476 }
2477 cb = new Blob();
2478 e.blob->last_encoded_id = n;
2479 id_to_blob[n] = cb;
2480 e.blob->dup(*cb);
2481 // bump the extent refs on the copied blob's extents
2482 for (auto p : blob.get_extents()) {
2483 if (p.is_valid()) {
2484 e.blob->shared_blob->get_ref(p.offset, p.length);
2485 }
2486 }
2487 txc->write_shared_blob(e.blob->shared_blob);
2488 dout(20) << __func__ << " new " << *cb << dendl;
2489 }
2490
2491 int skip_front, skip_back;
2492 if (e.logical_offset < srcoff) {
2493 skip_front = srcoff - e.logical_offset;
2494 } else {
2495 skip_front = 0;
2496 }
2497 if (e.logical_end() > end) {
2498 skip_back = e.logical_end() - end;
2499 } else {
2500 skip_back = 0;
2501 }
2502
2503 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2504 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2505 newo->extent_map.extent_map.insert(*ne);
2506 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2507 // fixme: we may leave parts of new blob unreferenced that could
2508 // be freed (relative to the shared_blob).
2509 txc->statfs_delta.stored() += ne->length;
2510 if (e.blob->get_blob().is_compressed()) {
2511 txc->statfs_delta.compressed_original() += ne->length;
2512 if (blob_duped) {
2513 txc->statfs_delta.compressed() +=
2514 cb->get_blob().get_compressed_payload_length();
2515 }
2516 }
2517 dout(20) << __func__ << " dst " << *ne << dendl;
2518 ++n;
2519 }
2520 if ((!inject_21040 && src_dirty) ||
2521 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2522 oldo->extent_map.dirty_range(dirty_range_begin,
2523 dirty_range_end - dirty_range_begin);
2524 txc->write_onode(oldo);
2525 }
2526 txc->write_onode(newo);
2527
2528 if (dstoff + length > newo->onode.size) {
2529 newo->onode.size = dstoff + length;
2530 }
2531 newo->extent_map.dirty_range(dstoff, length);
2532 }
2533 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2534 bool force)
2535 {
2536 auto cct = onode->c->store->cct; //used by dout
2537 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2538 if (onode->onode.extent_map_shards.empty()) {
2539 if (inline_bl.length() == 0) {
2540 unsigned n;
2541 // we need to encode inline_bl to measure encoded length
2542 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
2543 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
2544 ceph_assert(!never_happen);
2545 size_t len = inline_bl.length();
2546 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2547 << " extents" << dendl;
2548 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2549 request_reshard(0, OBJECT_MAX_SIZE);
2550 return;
2551 }
2552 }
2553 // will persist in the onode key.
2554 } else {
2555 // pending shard update
2556 struct dirty_shard_t {
2557 Shard *shard;
2558 bufferlist bl;
2559 dirty_shard_t(Shard *s) : shard(s) {}
2560 };
2561 vector<dirty_shard_t> encoded_shards;
2562     // allocate slots for all shards in a single call instead of
2563     // doing multiple allocations - one per dirty shard
2564 encoded_shards.reserve(shards.size());
2565
2566 auto p = shards.begin();
2567 auto prev_p = p;
2568 while (p != shards.end()) {
2569 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
2570 auto n = p;
2571 ++n;
2572 if (p->dirty) {
2573 uint32_t endoff;
2574 if (n == shards.end()) {
2575 endoff = OBJECT_MAX_SIZE;
2576 } else {
2577 endoff = n->shard_info->offset;
2578 }
2579 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2580 bufferlist& bl = encoded_shards.back().bl;
2581 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2582 bl, &p->extents)) {
2583 if (force) {
2584 derr << __func__ << " encode_some needs reshard" << dendl;
2585 ceph_assert(!force);
2586 }
2587 }
2588 size_t len = bl.length();
2589
2590 dout(20) << __func__ << " shard 0x" << std::hex
2591 << p->shard_info->offset << std::dec << " is " << len
2592 << " bytes (was " << p->shard_info->bytes << ") from "
2593 << p->extents << " extents" << dendl;
2594
2595 if (!force) {
2596 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2597 // we are big; reshard ourselves
2598 request_reshard(p->shard_info->offset, endoff);
2599 }
2600 // avoid resharding the trailing shard, even if it is small
2601 else if (n != shards.end() &&
2602 len < g_conf()->bluestore_extent_map_shard_min_size) {
2603 ceph_assert(endoff != OBJECT_MAX_SIZE);
2604 if (p == shards.begin()) {
2605 // we are the first shard, combine with next shard
2606 request_reshard(p->shard_info->offset, endoff + 1);
2607 } else {
2608 // combine either with the previous shard or the next,
2609 // whichever is smaller
2610 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2611 request_reshard(p->shard_info->offset, endoff + 1);
2612 } else {
2613 request_reshard(prev_p->shard_info->offset, endoff);
2614 }
2615 }
2616 }
2617 }
2618 }
2619 prev_p = p;
2620 p = n;
2621 }
2622 if (needs_reshard()) {
2623 return;
2624 }
2625
2626 // schedule DB update for dirty shards
2627 string key;
2628 for (auto& it : encoded_shards) {
2629 it.shard->dirty = false;
2630 it.shard->shard_info->bytes = it.bl.length();
2631 generate_extent_shard_key_and_apply(
2632 onode->key,
2633 it.shard->shard_info->offset,
2634 &key,
2635 [&](const string& final_key) {
2636 t->set(PREFIX_OBJ, final_key, it.bl);
2637 }
2638 );
2639 }
2640 }
2641 }
2642
2643 bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2644 {
2645 if (spanning_blob_map.empty())
2646 return 0;
2647 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2648   // if the id did not overflow, it is valid and unused
2649   if (bid >= 0)
2650     return bid;
2651   // otherwise find the next unused bid
2652 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2653 const auto begin_bid = bid;
2654 do {
2655 if (!spanning_blob_map.count(bid))
2656 return bid;
2657 else {
2658 bid++;
2659 if (bid < 0) bid = 0;
2660 }
2661 } while (bid != begin_bid);
2662 auto cct = onode->c->store->cct; // used by dout
2663 _dump_onode<0>(cct, *onode);
2664 ceph_abort_msg("no available blob id");
2665 }
2666
2667 void BlueStore::ExtentMap::reshard(
2668 KeyValueDB *db,
2669 KeyValueDB::Transaction t)
2670 {
2671 auto cct = onode->c->store->cct; // used by dout
2672
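  // Note: resharding rewrites the extent map shards covering
  // [needs_reshard_begin, needs_reshard_end). Roughly: drop the old shard
  // keys, estimate a new shard layout from the average encoded extent size
  // versus bluestore_extent_map_shard_target_size, splice the new shard_info
  // entries in, and then walk the affected extents, either splitting blobs at
  // the new shard boundaries or marking them as spanning.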
2673 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2674 << needs_reshard_end << ")" << std::dec
2675 << " of " << onode->onode.extent_map_shards.size()
2676 << " shards on " << onode->oid << dendl;
2677 for (auto& p : spanning_blob_map) {
2678 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2679 << dendl;
2680 }
2681 // determine shard index range
2682 unsigned si_begin = 0, si_end = 0;
2683 if (!shards.empty()) {
2684 while (si_begin + 1 < shards.size() &&
2685 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2686 ++si_begin;
2687 }
2688 needs_reshard_begin = shards[si_begin].shard_info->offset;
2689 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2690 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2691 needs_reshard_end = shards[si_end].shard_info->offset;
2692 break;
2693 }
2694 }
2695 if (si_end == shards.size()) {
2696 needs_reshard_end = OBJECT_MAX_SIZE;
2697 }
2698 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2699 << " over 0x[" << std::hex << needs_reshard_begin << ","
2700 << needs_reshard_end << ")" << std::dec << dendl;
2701 }
2702
2703 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2704
2705   // we may need to fault in a larger interval later; we must have all
2706 // referring extents for spanning blobs loaded in order to have
2707 // accurate use_tracker values.
2708 uint32_t spanning_scan_begin = needs_reshard_begin;
2709 uint32_t spanning_scan_end = needs_reshard_end;
2710
2711 // remove old keys
2712 string key;
2713 for (unsigned i = si_begin; i < si_end; ++i) {
2714 generate_extent_shard_key_and_apply(
2715 onode->key, shards[i].shard_info->offset, &key,
2716 [&](const string& final_key) {
2717 t->rmkey(PREFIX_OBJ, final_key);
2718 }
2719 );
2720 }
2721
2722 // calculate average extent size
2723 unsigned bytes = 0;
2724 unsigned extents = 0;
2725 if (onode->onode.extent_map_shards.empty()) {
2726 bytes = inline_bl.length();
2727 extents = extent_map.size();
2728 } else {
2729 for (unsigned i = si_begin; i < si_end; ++i) {
2730 bytes += shards[i].shard_info->bytes;
2731 extents += shards[i].extents;
2732 }
2733 }
2734 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2735 unsigned slop = target *
2736 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2737 unsigned extent_avg = bytes / std::max(1u, extents);
2738 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2739 << ", slop " << slop << dendl;
2740
2741 // reshard
2742 unsigned estimate = 0;
2743 unsigned offset = needs_reshard_begin;
2744 vector<bluestore_onode_t::shard_info> new_shard_info;
2745 unsigned max_blob_end = 0;
2746 Extent dummy(needs_reshard_begin);
2747 for (auto e = extent_map.lower_bound(dummy);
2748 e != extent_map.end();
2749 ++e) {
2750 if (e->logical_offset >= needs_reshard_end) {
2751 break;
2752 }
2753 dout(30) << " extent " << *e << dendl;
2754
2755 // disfavor shard boundaries that span a blob
2756 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2757 if (estimate &&
2758 estimate + extent_avg > target + (would_span ? slop : 0)) {
2759 // new shard
2760 if (offset == needs_reshard_begin) {
2761 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2762 new_shard_info.back().offset = offset;
2763 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2764 << std::dec << dendl;
2765 }
2766 offset = e->logical_offset;
2767 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2768 new_shard_info.back().offset = offset;
2769 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2770 << std::dec << dendl;
2771 estimate = 0;
2772 }
2773 estimate += extent_avg;
2774 unsigned bs = e->blob_start();
2775 if (bs < spanning_scan_begin) {
2776 spanning_scan_begin = bs;
2777 }
2778 uint32_t be = e->blob_end();
2779 if (be > max_blob_end) {
2780 max_blob_end = be;
2781 }
2782 if (be > spanning_scan_end) {
2783 spanning_scan_end = be;
2784 }
2785 }
2786 if (new_shard_info.empty() && (si_begin > 0 ||
2787 si_end < shards.size())) {
2788 // we resharded a partial range; we must produce at least one output
2789 // shard
2790 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2791 new_shard_info.back().offset = needs_reshard_begin;
2792 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2793 << std::dec << " (singleton degenerate case)" << dendl;
2794 }
2795
2796 auto& sv = onode->onode.extent_map_shards;
2797 dout(20) << __func__ << " new " << new_shard_info << dendl;
2798 dout(20) << __func__ << " old " << sv << dendl;
2799 if (sv.empty()) {
2800 // no old shards to keep
2801 sv.swap(new_shard_info);
2802 init_shards(true, true);
2803 } else {
2804 // splice in new shards
2805 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2806 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2807 sv.insert(
2808 sv.begin() + si_begin,
2809 new_shard_info.begin(),
2810 new_shard_info.end());
2811 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2812 si_end = si_begin + new_shard_info.size();
2813
2814 ceph_assert(sv.size() == shards.size());
2815
2816 // note that we need to update every shard_info of shards here,
2817 // as sv might have been totally re-allocated above
2818 for (unsigned i = 0; i < shards.size(); i++) {
2819 shards[i].shard_info = &sv[i];
2820 }
2821
2822 // mark newly added shards as dirty
2823 for (unsigned i = si_begin; i < si_end; ++i) {
2824 shards[i].loaded = true;
2825 shards[i].dirty = true;
2826 }
2827 }
2828 dout(20) << __func__ << " fin " << sv << dendl;
2829 inline_bl.clear();
2830
2831 if (sv.empty()) {
2832 // no more shards; unspan all previously spanning blobs
2833 auto p = spanning_blob_map.begin();
2834 while (p != spanning_blob_map.end()) {
2835 p->second->id = -1;
2836 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2837 p = spanning_blob_map.erase(p);
2838 }
2839 } else {
2840 // identify new spanning blobs
2841 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2842 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2843 if (spanning_scan_begin < needs_reshard_begin) {
2844 fault_range(db, spanning_scan_begin,
2845 needs_reshard_begin - spanning_scan_begin);
2846 }
2847 if (spanning_scan_end > needs_reshard_end) {
2848 fault_range(db, needs_reshard_end,
2849 spanning_scan_end - needs_reshard_end);
2850 }
2851 auto sp = sv.begin() + si_begin;
2852 auto esp = sv.end();
2853 unsigned shard_start = sp->offset;
2854 unsigned shard_end;
2855 ++sp;
2856 if (sp == esp) {
2857 shard_end = OBJECT_MAX_SIZE;
2858 } else {
2859 shard_end = sp->offset;
2860 }
2861 Extent dummy(needs_reshard_begin);
2862
2863 bool was_too_many_blobs_check = false;
2864 auto too_many_blobs_threshold =
2865 g_conf()->bluestore_debug_too_many_blobs_threshold;
2866 auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
2867 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2868 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
2869
2870 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2871 if (e->logical_offset >= needs_reshard_end) {
2872 break;
2873 }
2874 dout(30) << " extent " << *e << dendl;
2875 while (e->logical_offset >= shard_end) {
2876 shard_start = shard_end;
2877 ceph_assert(sp != esp);
2878 ++sp;
2879 if (sp == esp) {
2880 shard_end = OBJECT_MAX_SIZE;
2881 } else {
2882 shard_end = sp->offset;
2883 }
2884 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2885 << " to 0x" << shard_end << std::dec << dendl;
2886 }
2887
2888 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2889 if (!e->blob->is_spanning()) {
2890 // We have two options: (1) split the blob into pieces at the
2891 // shard boundaries (and adjust extents accordingly), or (2)
2892 // mark it spanning. We prefer to cut the blob if we can. Note that
2893 // we may have to split it multiple times--potentially at every
2894 // shard boundary.
2895 bool must_span = false;
2896 BlobRef b = e->blob;
2897 if (b->can_split()) {
2898 uint32_t bstart = e->blob_start();
2899 uint32_t bend = e->blob_end();
2900 for (const auto& sh : shards) {
2901 if (bstart < sh.shard_info->offset &&
2902 bend > sh.shard_info->offset) {
2903 uint32_t blob_offset = sh.shard_info->offset - bstart;
2904 if (b->can_split_at(blob_offset)) {
2905 dout(20) << __func__ << " splitting blob, bstart 0x"
2906 << std::hex << bstart << " blob_offset 0x"
2907 << blob_offset << std::dec << " " << *b << dendl;
2908 b = split_blob(b, blob_offset, sh.shard_info->offset);
2909 // switch b to the new right-hand side, in case it
2910 // *also* has to get split.
2911 bstart += blob_offset;
2912 onode->c->store->logger->inc(l_bluestore_blob_split);
2913 } else {
2914 must_span = true;
2915 break;
2916 }
2917 }
2918 }
2919 } else {
2920 must_span = true;
2921 }
2922 if (must_span) {
2923 auto bid = allocate_spanning_blob_id();
2924 b->id = bid;
2925 spanning_blob_map[b->id] = b;
2926 dout(20) << __func__ << " adding spanning " << *b << dendl;
2927 if (!was_too_many_blobs_check &&
2928 too_many_blobs_threshold &&
2929 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2930
2931 was_too_many_blobs_check = true;
2932 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2933 if (dumped_onodes[i].first == onode->oid) {
2934 oid_slot = &dumped_onodes[i];
2935 break;
2936 }
2937 if (!oldest_slot || (oldest_slot &&
2938 dumped_onodes[i].second < oldest_slot->second)) {
2939 oldest_slot = &dumped_onodes[i];
2940 }
2941 }
2942 }
2943 }
2944 }
2945 } else {
2946 if (e->blob->is_spanning()) {
2947 spanning_blob_map.erase(e->blob->id);
2948 e->blob->id = -1;
2949 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2950 }
2951 }
2952 }
2953 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
2954 (oid_slot &&
2955 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
2956 if (do_dump) {
2957 dout(0) << __func__
2958 << " spanning blob count exceeds threshold, "
2959 << spanning_blob_map.size() << " spanning blobs"
2960 << dendl;
2961 _dump_onode<0>(cct, *onode);
2962 if (oid_slot) {
2963 oid_slot->second = mono_clock::now();
2964 } else {
2965 ceph_assert(oldest_slot);
2966 oldest_slot->first = onode->oid;
2967 oldest_slot->second = mono_clock::now();
2968 }
2969 }
2970 }
2971
2972 clear_needs_reshard();
2973 }
2974
2975 bool BlueStore::ExtentMap::encode_some(
2976 uint32_t offset,
2977 uint32_t length,
2978 bufferlist& bl,
2979 unsigned *pn)
2980 {
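  // Note: returns true when some extent references a non-spanning blob that
  // escapes [offset, offset+length); in that case a reshard has been
  // requested and nothing is encoded. Otherwise the encoded shard is appended
  // to bl, the number of encoded extents is stored in *pn (if provided), and
  // false is returned.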
2981 Extent dummy(offset);
2982 auto start = extent_map.lower_bound(dummy);
2983 uint32_t end = offset + length;
2984
2985 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2986 // serialization only. Hence there is no specific
2987 // handling at ExtentMap level.
2988
2989 unsigned n = 0;
2990 size_t bound = 0;
2991 bool must_reshard = false;
2992 for (auto p = start;
2993 p != extent_map.end() && p->logical_offset < end;
2994 ++p, ++n) {
2995 ceph_assert(p->logical_offset >= offset);
2996 p->blob->last_encoded_id = -1;
2997 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2998 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2999 << std::dec << " hit new spanning blob " << *p << dendl;
3000 request_reshard(p->blob_start(), p->blob_end());
3001 must_reshard = true;
3002 }
3003 if (!must_reshard) {
3004 denc_varint(0, bound); // blobid
3005 denc_varint(0, bound); // logical_offset
3006 denc_varint(0, bound); // len
3007 denc_varint(0, bound); // blob_offset
3008
3009 p->blob->bound_encode(
3010 bound,
3011 struct_v,
3012 p->blob->shared_blob->get_sbid(),
3013 false);
3014 }
3015 }
3016 if (must_reshard) {
3017 return true;
3018 }
3019
3020 denc(struct_v, bound);
3021 denc_varint(0, bound); // number of extents
3022
3023 {
3024 auto app = bl.get_contiguous_appender(bound);
3025 denc(struct_v, app);
3026 denc_varint(n, app);
3027 if (pn) {
3028 *pn = n;
3029 }
3030
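    // Note: each extent is encoded as a varint 'blobid' whose low bits are
    // flags (BLOBID_FLAG_SPANNING / CONTIGUOUS / ZEROOFFSET / SAMELENGTH) and
    // whose upper bits (shifted by BLOBID_SHIFT_BITS) hold the spanning blob
    // id or the 1-based index of a blob already encoded in this shard, with 0
    // meaning the blob itself follows inline; the flags let the common cases
    // (contiguous logical offset, zero blob_offset, repeated length) omit
    // their varints entirely.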
3031 n = 0;
3032 uint64_t pos = 0;
3033 uint64_t prev_len = 0;
3034 for (auto p = start;
3035 p != extent_map.end() && p->logical_offset < end;
3036 ++p, ++n) {
3037 unsigned blobid;
3038 bool include_blob = false;
3039 if (p->blob->is_spanning()) {
3040 blobid = p->blob->id << BLOBID_SHIFT_BITS;
3041 blobid |= BLOBID_FLAG_SPANNING;
3042 } else if (p->blob->last_encoded_id < 0) {
3043 p->blob->last_encoded_id = n + 1; // so it is always non-zero
3044 include_blob = true;
3045 blobid = 0; // the decoder will infer the id from n
3046 } else {
3047 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
3048 }
3049 if (p->logical_offset == pos) {
3050 blobid |= BLOBID_FLAG_CONTIGUOUS;
3051 }
3052 if (p->blob_offset == 0) {
3053 blobid |= BLOBID_FLAG_ZEROOFFSET;
3054 }
3055 if (p->length == prev_len) {
3056 blobid |= BLOBID_FLAG_SAMELENGTH;
3057 } else {
3058 prev_len = p->length;
3059 }
3060 denc_varint(blobid, app);
3061 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3062 denc_varint_lowz(p->logical_offset - pos, app);
3063 }
3064 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3065 denc_varint_lowz(p->blob_offset, app);
3066 }
3067 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3068 denc_varint_lowz(p->length, app);
3069 }
3070 pos = p->logical_end();
3071 if (include_blob) {
3072 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
3073 }
3074 }
3075 }
3076 /*derr << __func__ << bl << dendl;
3077 derr << __func__ << ":";
3078 bl.hexdump(*_dout);
3079 *_dout << dendl;
3080 */
3081 return false;
3082 }
3083
3084 unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
3085 {
3086 /*
3087 derr << __func__ << ":";
3088 bl.hexdump(*_dout);
3089 *_dout << dendl;
3090 */
3091
3092 ceph_assert(bl.get_num_buffers() <= 1);
3093 auto p = bl.front().begin_deep();
3094 __u8 struct_v;
3095 denc(struct_v, p);
3096 // Version 2 differs from v1 in blob's ref_map
3097 // serialization only. Hence there is no specific
3098 // handling at ExtentMap level below.
3099 ceph_assert(struct_v == 1 || struct_v == 2);
3100
3101 uint32_t num;
3102 denc_varint(num, p);
3103 vector<BlobRef> blobs(num);
3104 uint64_t pos = 0;
3105 uint64_t prev_len = 0;
3106 unsigned n = 0;
3107
3108 while (!p.end()) {
3109 Extent *le = new Extent();
3110 uint64_t blobid;
3111 denc_varint(blobid, p);
3112 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3113 uint64_t gap;
3114 denc_varint_lowz(gap, p);
3115 pos += gap;
3116 }
3117 le->logical_offset = pos;
3118 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3119 denc_varint_lowz(le->blob_offset, p);
3120 } else {
3121 le->blob_offset = 0;
3122 }
3123 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3124 denc_varint_lowz(prev_len, p);
3125 }
3126 le->length = prev_len;
3127
3128 if (blobid & BLOBID_FLAG_SPANNING) {
3129 dout(30) << __func__ << " getting spanning blob "
3130 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
3131 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
3132 } else {
3133 blobid >>= BLOBID_SHIFT_BITS;
3134 if (blobid) {
3135 le->assign_blob(blobs[blobid - 1]);
3136 ceph_assert(le->blob);
3137 } else {
3138 Blob *b = new Blob();
3139 uint64_t sbid = 0;
3140 b->decode(onode->c, p, struct_v, &sbid, false);
3141 blobs[n] = b;
3142 onode->c->open_shared_blob(sbid, b);
3143 le->assign_blob(b);
3144 }
3145 // we build ref_map dynamically for non-spanning blobs
3146 le->blob->get_ref(
3147 onode->c,
3148 le->blob_offset,
3149 le->length);
3150 }
3151 pos += prev_len;
3152 ++n;
3153 extent_map.insert(*le);
3154 }
3155
3156 ceph_assert(n == num);
3157 return num;
3158 }
3159
3160 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
3161 {
3162 // Version 2 differs from v1 in blob's ref_map
3163 // serialization only. Hence there is no specific
3164 // handling at ExtentMap level.
3165 __u8 struct_v = 2;
3166
3167 denc(struct_v, p);
3168 denc_varint((uint32_t)0, p);
3169 size_t key_size = 0;
3170 denc_varint((uint32_t)0, key_size);
3171 p += spanning_blob_map.size() * key_size;
3172 for (const auto& i : spanning_blob_map) {
3173 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3174 }
3175 }
3176
3177 void BlueStore::ExtentMap::encode_spanning_blobs(
3178 bufferlist::contiguous_appender& p)
3179 {
3180 // Version 2 differs from v1 in blob's ref_map
3181 // serialization only. Hence there is no specific
3182 // handling at ExtentMap level.
3183 __u8 struct_v = 2;
3184
3185 denc(struct_v, p);
3186 denc_varint(spanning_blob_map.size(), p);
3187 for (auto& i : spanning_blob_map) {
3188 denc_varint(i.second->id, p);
3189 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3190 }
3191 }
3192
3193 void BlueStore::ExtentMap::decode_spanning_blobs(
3194 bufferptr::const_iterator& p)
3195 {
3196 __u8 struct_v;
3197 denc(struct_v, p);
3198 // Version 2 differs from v1 in blob's ref_map
3199 // serialization only. Hence there is no specific
3200 // handling at ExtentMap level.
3201 ceph_assert(struct_v == 1 || struct_v == 2);
3202
3203 unsigned n;
3204 denc_varint(n, p);
3205 while (n--) {
3206 BlobRef b(new Blob());
3207 denc_varint(b->id, p);
3208 spanning_blob_map[b->id] = b;
3209 uint64_t sbid = 0;
3210 b->decode(onode->c, p, struct_v, &sbid, true);
3211 onode->c->open_shared_blob(sbid, b);
3212 }
3213 }
3214
3215 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
3216 {
3217 shards.resize(onode->onode.extent_map_shards.size());
3218 unsigned i = 0;
3219 for (auto &s : onode->onode.extent_map_shards) {
3220 shards[i].shard_info = &s;
3221 shards[i].loaded = loaded;
3222 shards[i].dirty = dirty;
3223 ++i;
3224 }
3225 }
3226
3227 void BlueStore::ExtentMap::fault_range(
3228 KeyValueDB *db,
3229 uint32_t offset,
3230 uint32_t length)
3231 {
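  // Note: lazily loads any extent map shards overlapping
  // [offset, offset+length) from the PREFIX_OBJ keyspace, decoding them with
  // decode_some(); shards that are already loaded are left untouched and
  // counted as hits.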
3232 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3233 << std::dec << dendl;
3234 auto start = seek_shard(offset);
3235 auto last = seek_shard(offset + length);
3236
3237 if (start < 0)
3238 return;
3239
3240 ceph_assert(last >= start);
3241 string key;
3242 while (start <= last) {
3243 ceph_assert((size_t)start < shards.size());
3244 auto p = &shards[start];
3245 if (!p->loaded) {
3246 dout(30) << __func__ << " opening shard 0x" << std::hex
3247 << p->shard_info->offset << std::dec << dendl;
3248 bufferlist v;
3249 generate_extent_shard_key_and_apply(
3250 onode->key, p->shard_info->offset, &key,
3251 [&](const string& final_key) {
3252 int r = db->get(PREFIX_OBJ, final_key, &v);
3253 if (r < 0) {
3254 derr << __func__ << " missing shard 0x" << std::hex
3255 << p->shard_info->offset << std::dec << " for " << onode->oid
3256 << dendl;
3257 ceph_assert(r >= 0);
3258 }
3259 }
3260 );
3261 p->extents = decode_some(v);
3262 p->loaded = true;
3263 dout(20) << __func__ << " open shard 0x" << std::hex
3264 << p->shard_info->offset
3265 << " for range 0x" << offset << "~" << length << std::dec
3266 << " (" << v.length() << " bytes)" << dendl;
3267 ceph_assert(p->dirty == false);
3268 ceph_assert(v.length() == p->shard_info->bytes);
3269 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3270 } else {
3271 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3272 }
3273 ++start;
3274 }
3275 }
3276
3277 void BlueStore::ExtentMap::dirty_range(
3278 uint32_t offset,
3279 uint32_t length)
3280 {
3281 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3282 << std::dec << dendl;
3283 if (shards.empty()) {
3284 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3285 inline_bl.clear();
3286 return;
3287 }
3288 auto start = seek_shard(offset);
3289 if (length == 0) {
3290 length = 1;
3291 }
3292 auto last = seek_shard(offset + length - 1);
3293 if (start < 0)
3294 return;
3295
3296 ceph_assert(last >= start);
3297 while (start <= last) {
3298 ceph_assert((size_t)start < shards.size());
3299 auto p = &shards[start];
3300 if (!p->loaded) {
3301       derr << __func__ << " on write 0x" << std::hex << offset
3302 << "~" << length << " shard 0x" << p->shard_info->offset
3303 << std::dec << " is not loaded, can't mark dirty" << dendl;
3304 ceph_abort_msg("can't mark unloaded shard dirty");
3305 }
3306 if (!p->dirty) {
3307 dout(20) << __func__ << " mark shard 0x" << std::hex
3308 << p->shard_info->offset << std::dec << " dirty" << dendl;
3309 p->dirty = true;
3310 }
3311 ++start;
3312 }
3313 }
3314
3315 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3316 uint64_t offset)
3317 {
3318 Extent dummy(offset);
3319 return extent_map.find(dummy);
3320 }
3321
3322 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3323 uint64_t offset)
3324 {
3325 Extent dummy(offset);
3326 auto fp = extent_map.lower_bound(dummy);
3327 if (fp != extent_map.begin()) {
3328 --fp;
3329 if (fp->logical_end() <= offset) {
3330 ++fp;
3331 }
3332 }
3333 return fp;
3334 }
3335
3336 BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3337 uint64_t offset) const
3338 {
3339 Extent dummy(offset);
3340 auto fp = extent_map.lower_bound(dummy);
3341 if (fp != extent_map.begin()) {
3342 --fp;
3343 if (fp->logical_end() <= offset) {
3344 ++fp;
3345 }
3346 }
3347 return fp;
3348 }
3349
3350 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3351 {
3352 auto fp = seek_lextent(offset);
3353 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3354 return false;
3355 }
3356 return true;
3357 }
3358
3359 int BlueStore::ExtentMap::compress_extent_map(
3360 uint64_t offset,
3361 uint64_t length)
3362 {
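  // Note: merges logically adjacent extents that reference the same blob at
  // contiguous blob offsets, never merging across a shard boundary; returns
  // the number of extents removed.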
3363 if (extent_map.empty())
3364 return 0;
3365 int removed = 0;
3366 auto p = seek_lextent(offset);
3367 if (p != extent_map.begin()) {
3368 --p; // start to the left of offset
3369 }
3370 // the caller should have just written to this region
3371 ceph_assert(p != extent_map.end());
3372
3373 // identify the *next* shard
3374 auto pshard = shards.begin();
3375 while (pshard != shards.end() &&
3376 p->logical_offset >= pshard->shard_info->offset) {
3377 ++pshard;
3378 }
3379 uint64_t shard_end;
3380 if (pshard != shards.end()) {
3381 shard_end = pshard->shard_info->offset;
3382 } else {
3383 shard_end = OBJECT_MAX_SIZE;
3384 }
3385
3386 auto n = p;
3387 for (++n; n != extent_map.end(); p = n++) {
3388 if (n->logical_offset > offset + length) {
3389 break; // stop after end
3390 }
3391 while (n != extent_map.end() &&
3392 p->logical_end() == n->logical_offset &&
3393 p->blob == n->blob &&
3394 p->blob_offset + p->length == n->blob_offset &&
3395 n->logical_offset < shard_end) {
3396 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3397 << " next shard 0x" << shard_end << std::dec
3398 << " merging " << *p << " and " << *n << dendl;
3399 p->length += n->length;
3400 rm(n++);
3401 ++removed;
3402 }
3403 if (n == extent_map.end()) {
3404 break;
3405 }
3406 if (n->logical_offset >= shard_end) {
3407 ceph_assert(pshard != shards.end());
3408 ++pshard;
3409 if (pshard != shards.end()) {
3410 shard_end = pshard->shard_info->offset;
3411 } else {
3412 shard_end = OBJECT_MAX_SIZE;
3413 }
3414 }
3415 }
3416 if (removed) {
3417 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3418 }
3419 return removed;
3420 }
3421
3422 void BlueStore::ExtentMap::punch_hole(
3423 CollectionRef &c,
3424 uint64_t offset,
3425 uint64_t length,
3426 old_extent_map_t *old_extents)
3427 {
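  // Note: carves [offset, offset+length) out of the lextent map. Depending on
  // how an extent overlaps the hole it is split in the middle, trimmed at its
  // head or tail, or removed entirely; the dereferenced pieces are collected
  // in old_extents for later release.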
3428 auto p = seek_lextent(offset);
3429 uint64_t end = offset + length;
3430 while (p != extent_map.end()) {
3431 if (p->logical_offset >= end) {
3432 break;
3433 }
3434 if (p->logical_offset < offset) {
3435 if (p->logical_end() > end) {
3436 // split and deref middle
3437 uint64_t front = offset - p->logical_offset;
3438 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3439 length, p->blob);
3440 old_extents->push_back(*oe);
3441 add(end,
3442 p->blob_offset + front + length,
3443 p->length - front - length,
3444 p->blob);
3445 p->length = front;
3446 break;
3447 } else {
3448 // deref tail
3449 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
3450 uint64_t keep = offset - p->logical_offset;
3451 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3452 p->length - keep, p->blob);
3453 old_extents->push_back(*oe);
3454 p->length = keep;
3455 ++p;
3456 continue;
3457 }
3458 }
3459 if (p->logical_offset + p->length <= end) {
3460 // deref whole lextent
3461 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3462 p->length, p->blob);
3463 old_extents->push_back(*oe);
3464 rm(p++);
3465 continue;
3466 }
3467 // deref head
3468 uint64_t keep = p->logical_end() - end;
3469 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3470 p->length - keep, p->blob);
3471 old_extents->push_back(*oe);
3472
3473 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3474 rm(p);
3475 break;
3476 }
3477 }
3478
3479 BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3480 CollectionRef &c,
3481 uint64_t logical_offset,
3482 uint64_t blob_offset, uint64_t length, BlobRef b,
3483 old_extent_map_t *old_extents)
3484 {
3485   // We need a completely initialized Blob to increment its ref counters.
3486 ceph_assert(b->get_blob().get_logical_length() != 0);
3487
3488   // Do get_ref prior to punch_hole to prevent putting a reused blob into the
3489   // old_extents list if we overwrite the blob completely.
3490   // This might happen during a WAL overwrite.
3491 b->get_ref(onode->c, blob_offset, length);
3492
3493 if (old_extents) {
3494 punch_hole(c, logical_offset, length, old_extents);
3495 }
3496
3497 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3498 extent_map.insert(*le);
3499 if (spans_shard(logical_offset, length)) {
3500 request_reshard(logical_offset, logical_offset + length);
3501 }
3502 return le;
3503 }
3504
3505 BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3506 BlobRef lb,
3507 uint32_t blob_offset,
3508 uint32_t pos)
3509 {
3510 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3511 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3512 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3513 << dendl;
3514 BlobRef rb = onode->c->new_blob();
3515 lb->split(onode->c, blob_offset, rb.get());
3516
3517 for (auto ep = seek_lextent(pos);
3518 ep != extent_map.end() && ep->logical_offset < end_pos;
3519 ++ep) {
3520 if (ep->blob != lb) {
3521 continue;
3522 }
3523 if (ep->logical_offset < pos) {
3524 // split extent
3525 size_t left = pos - ep->logical_offset;
3526 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3527 extent_map.insert(*ne);
3528 ep->length = left;
3529 dout(30) << __func__ << " split " << *ep << dendl;
3530 dout(30) << __func__ << " to " << *ne << dendl;
3531 } else {
3532 // switch blob
3533 ceph_assert(ep->blob_offset >= blob_offset);
3534
3535 ep->blob = rb;
3536 ep->blob_offset -= blob_offset;
3537 dout(30) << __func__ << " adjusted " << *ep << dendl;
3538 }
3539 }
3540 return rb;
3541 }
3542
3543 // Onode
3544
3545 #undef dout_prefix
3546 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3547
3548 //
3549 // A tricky aspect of the Onode's ref counter is that we do an additional
3550 // increment when a newly pinned instance is detected, and a matching -1 on unpin.
3551 // This prevents a conflict with a delete call (when nref == 0).
3552 // The latter might happen while a thread is still in the unpin() path
3553 // (e.g. waiting for lock acquisition) after nref has already been
3554 // decremented, and another 'putting' thread then releases the instance.
3555 //
3556 void BlueStore::Onode::get() {
3557 if (++nref >= 2 && !pinned) {
3558 OnodeCacheShard* ocs = c->get_onode_cache();
3559 ocs->lock.lock();
3560     // It is possible that while we were waiting, split_cache moved us to a different OnodeCacheShard.
3561 while (ocs != c->get_onode_cache()) {
3562 ocs->lock.unlock();
3563 ocs = c->get_onode_cache();
3564 ocs->lock.lock();
3565 }
3566 bool was_pinned = pinned;
3567 pinned = nref >= 2;
3568 // additional increment for newly pinned instance
3569 bool r = !was_pinned && pinned;
3570 if (r) {
3571 ++nref;
3572 }
3573 if (cached && r) {
3574 ocs->_pin(this);
3575 }
3576 ocs->lock.unlock();
3577 }
3578 }
3579 void BlueStore::Onode::put() {
3580 int n = --nref;
3581 if (n == 2) {
3582 OnodeCacheShard* ocs = c->get_onode_cache();
3583 ocs->lock.lock();
3584     // It is possible that while we were waiting, split_cache moved us to a different OnodeCacheShard.
3585 while (ocs != c->get_onode_cache()) {
3586 ocs->lock.unlock();
3587 ocs = c->get_onode_cache();
3588 ocs->lock.lock();
3589 }
3590 bool need_unpin = pinned;
3591 pinned = pinned && nref > 2; // intentionally use > not >= as we have
3592 // +1 due to pinned state
3593 need_unpin = need_unpin && !pinned;
3594 if (cached && need_unpin) {
3595 if (exists) {
3596 ocs->_unpin(this);
3597 } else {
3598 ocs->_unpin_and_rm(this);
3599 // remove will also decrement nref and delete Onode
3600 c->onode_map._remove(oid);
3601 }
3602 }
3603 // additional decrement for newly unpinned instance
3604 // should be the last action since Onode can be released
3605 // at any point after this decrement
3606 if (need_unpin) {
3607 n = --nref;
3608 }
3609 ocs->lock.unlock();
3610 }
3611 if (n == 0) {
3612 delete this;
3613 }
3614 }
3615
3616 BlueStore::Onode* BlueStore::Onode::decode(
3617 CollectionRef c,
3618 const ghobject_t& oid,
3619 const string& key,
3620 const bufferlist& v)
3621 {
3622 Onode* on = new Onode(c.get(), oid, key);
3623 on->exists = true;
3624 auto p = v.front().begin_deep();
3625 on->onode.decode(p);
3626 for (auto& i : on->onode.attrs) {
3627 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3628 }
3629
3630 // initialize extent_map
3631 on->extent_map.decode_spanning_blobs(p);
3632 if (on->onode.extent_map_shards.empty()) {
3633 denc(on->extent_map.inline_bl, p);
3634 on->extent_map.decode_some(on->extent_map.inline_bl);
3635 on->extent_map.inline_bl.reassign_to_mempool(
3636 mempool::mempool_bluestore_cache_data);
3637 }
3638 else {
3639 on->extent_map.init_shards(false, false);
3640 }
3641 return on;
3642 }
3643
3644 void BlueStore::Onode::flush()
3645 {
3646 if (flushing_count.load()) {
3647 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
3648 waiting_count++;
3649 std::unique_lock l(flush_lock);
3650 while (flushing_count.load()) {
3651 flush_cond.wait(l);
3652 }
3653 waiting_count--;
3654 }
3655 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3656 }
3657
3658 void BlueStore::Onode::dump(Formatter* f) const
3659 {
3660 onode.dump(f);
3661 extent_map.dump(f);
3662 }
3663
3664
3665 const string& BlueStore::Onode::get_omap_prefix()
3666 {
3667 if (onode.is_pgmeta_omap()) {
3668 return PREFIX_PGMETA_OMAP;
3669 }
3670 if (onode.is_perpool_omap()) {
3671 return PREFIX_PERPOOL_OMAP;
3672 }
3673 return PREFIX_OMAP;
3674 }
3675
3676 // '-' < '.' < '~'
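// Note: omap keys are laid out as
//   [u64 pool id (per-pool omap only)] [u64 nid] [separator] [user key]
// where the separator is '-' for the header, '.' for user keys and '~' for
// the tail, so for a given object the header sorts first, user keys in the
// middle, and the tail last.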
3677
3678 void BlueStore::Onode::get_omap_header(string *out)
3679 {
3680 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3681 _key_encode_u64(c->pool(), out);
3682 }
3683 _key_encode_u64(onode.nid, out);
3684 out->push_back('-');
3685 }
3686
3687 void BlueStore::Onode::get_omap_key(const string& key, string *out)
3688 {
3689 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3690 _key_encode_u64(c->pool(), out);
3691 }
3692 _key_encode_u64(onode.nid, out);
3693 out->push_back('.');
3694 out->append(key);
3695 }
3696
3697 void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3698 {
3699 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3700 _key_encode_u64(c->pool(), out);
3701 }
3702 _key_encode_u64(onode.nid, out);
3703 out->append(old.c_str() + out->length(), old.size() - out->length());
3704 }
3705
3706 void BlueStore::Onode::get_omap_tail(string *out)
3707 {
3708 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3709 _key_encode_u64(c->pool(), out);
3710 }
3711 _key_encode_u64(onode.nid, out);
3712 out->push_back('~');
3713 }
3714
3715 void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3716 {
3717 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3718 *user_key = key.substr(sizeof(uint64_t)*2 + 1);
3719 } else {
3720 *user_key = key.substr(sizeof(uint64_t) + 1);
3721 }
3722 }
3723
3724
3725 // =======================================================
3726 // WriteContext
3727
3728 /// Checks for writes to the same pextent within a blob
3729 bool BlueStore::WriteContext::has_conflict(
3730 BlobRef b,
3731 uint64_t loffs,
3732 uint64_t loffs_end,
3733 uint64_t min_alloc_size)
3734 {
3735 ceph_assert((loffs % min_alloc_size) == 0);
3736 ceph_assert((loffs_end % min_alloc_size) == 0);
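  // Note: each prior write to the same blob is rounded out to min_alloc_size
  // boundaries and tested for interval overlap with [loffs, loffs_end); any
  // overlap means both writes touch the same allocation unit and therefore
  // conflict.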
3737 for (auto w : writes) {
3738 if (b == w.b) {
3739 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3740 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
3741 if ((loffs <= loffs2 && loffs_end > loffs2) ||
3742 (loffs >= loffs2 && loffs < loffs2_end)) {
3743 return true;
3744 }
3745 }
3746 }
3747 return false;
3748 }
3749
3750 // =======================================================
3751
3752 // DeferredBatch
3753 #undef dout_prefix
3754 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3755 #undef dout_context
3756 #define dout_context cct
3757
3758 void BlueStore::DeferredBatch::prepare_write(
3759 CephContext *cct,
3760 uint64_t seq, uint64_t offset, uint64_t length,
3761 bufferlist::const_iterator& blp)
3762 {
3763 _discard(cct, offset, length);
3764 auto i = iomap.insert(make_pair(offset, deferred_io()));
3765 ceph_assert(i.second); // this should be a new insertion
3766 i.first->second.seq = seq;
3767 blp.copy(length, i.first->second.bl);
3768 i.first->second.bl.reassign_to_mempool(
3769 mempool::mempool_bluestore_writing_deferred);
3770 dout(20) << __func__ << " seq " << seq
3771 << " 0x" << std::hex << offset << "~" << length
3772 << " crc " << i.first->second.bl.crc32c(-1)
3773 << std::dec << dendl;
3774 seq_bytes[seq] += length;
3775 #ifdef DEBUG_DEFERRED
3776 _audit(cct);
3777 #endif
3778 }
3779
3780 void BlueStore::DeferredBatch::_discard(
3781 CephContext *cct, uint64_t offset, uint64_t length)
3782 {
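  // Note: drops any queued deferred I/O overlapping [offset, offset+length).
  // An overlapping entry is trimmed: the part before the range is kept as a
  // head, the part after it is re-inserted as a tail at offset+length, and
  // seq_bytes is adjusted so the per-seq byte accounting stays consistent
  // (checked by _audit when DEBUG_DEFERRED is defined).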
3783 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3784 << std::dec << dendl;
3785 auto p = iomap.lower_bound(offset);
3786 if (p != iomap.begin()) {
3787 --p;
3788 auto end = p->first + p->second.bl.length();
3789 if (end > offset) {
3790 bufferlist head;
3791 head.substr_of(p->second.bl, 0, offset - p->first);
3792 dout(20) << __func__ << " keep head " << p->second.seq
3793 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3794 << " -> 0x" << head.length() << std::dec << dendl;
3795 auto i = seq_bytes.find(p->second.seq);
3796 ceph_assert(i != seq_bytes.end());
3797 if (end > offset + length) {
3798 bufferlist tail;
3799 tail.substr_of(p->second.bl, offset + length - p->first,
3800 end - (offset + length));
3801 dout(20) << __func__ << " keep tail " << p->second.seq
3802 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3803 << " -> 0x" << tail.length() << std::dec << dendl;
3804 auto &n = iomap[offset + length];
3805 n.bl.swap(tail);
3806 n.seq = p->second.seq;
3807 i->second -= length;
3808 } else {
3809 i->second -= end - offset;
3810 }
3811 ceph_assert(i->second >= 0);
3812 p->second.bl.swap(head);
3813 }
3814 ++p;
3815 }
3816 while (p != iomap.end()) {
3817 if (p->first >= offset + length) {
3818 break;
3819 }
3820 auto i = seq_bytes.find(p->second.seq);
3821 ceph_assert(i != seq_bytes.end());
3822 auto end = p->first + p->second.bl.length();
3823 if (end > offset + length) {
3824 unsigned drop_front = offset + length - p->first;
3825 unsigned keep_tail = end - (offset + length);
3826 dout(20) << __func__ << " truncate front " << p->second.seq
3827 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3828 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3829 << " to 0x" << (offset + length) << "~" << keep_tail
3830 << std::dec << dendl;
3831 auto &s = iomap[offset + length];
3832 s.seq = p->second.seq;
3833 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3834 i->second -= drop_front;
3835 } else {
3836 dout(20) << __func__ << " drop " << p->second.seq
3837 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3838 << std::dec << dendl;
3839 i->second -= p->second.bl.length();
3840 }
3841 ceph_assert(i->second >= 0);
3842 p = iomap.erase(p);
3843 }
3844 }
3845
3846 void BlueStore::DeferredBatch::_audit(CephContext *cct)
3847 {
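// Debug-only consistency check: recompute per-sequence byte totals from
// iomap, verify queued extents do not overlap (each entry starts at or past
// the previous extent's end), and compare the totals against seq_bytes.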
3848 map<uint64_t,int> sb;
3849 for (auto p : seq_bytes) {
3850 sb[p.first] = 0; // make sure we have the same set of keys
3851 }
3852 uint64_t pos = 0;
3853 for (auto& p : iomap) {
3854 ceph_assert(p.first >= pos);
3855 sb[p.second.seq] += p.second.bl.length();
3856 pos = p.first + p.second.bl.length();
3857 }
3858 ceph_assert(sb == seq_bytes);
3859 }
3860
3861
3862 // Collection
3863
3864 #undef dout_prefix
3865 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3866
3867 BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
3868 : CollectionImpl(store_->cct, cid),
3869 store(store_),
3870 cache(bc),
3871 exists(true),
3872 onode_map(oc),
3873 commit_queue(nullptr)
3874 {
3875 }
3876
3877 bool BlueStore::Collection::flush_commit(Context *c)
3878 {
3879 return osr->flush_commit(c);
3880 }
3881
3882 void BlueStore::Collection::flush()
3883 {
3884 osr->flush();
3885 }
3886
3887 void BlueStore::Collection::flush_all_but_last()
3888 {
3889 osr->flush_all_but_last();
3890 }
3891
3892 void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3893 {
3894 ceph_assert(!b->shared_blob);
3895 const bluestore_blob_t& blob = b->get_blob();
3896 if (!blob.is_shared()) {
3897 b->shared_blob = new SharedBlob(this);
3898 return;
3899 }
3900
3901 b->shared_blob = shared_blob_set.lookup(sbid);
3902 if (b->shared_blob) {
3903 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3904 << std::dec << " had " << *b->shared_blob << dendl;
3905 } else {
3906 b->shared_blob = new SharedBlob(sbid, this);
3907 shared_blob_set.add(this, b->shared_blob.get());
3908 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3909 << std::dec << " opened " << *b->shared_blob
3910 << dendl;
3911 }
3912 }
3913
3914 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3915 {
3916 if (!sb->is_loaded()) {
3917
3918 bufferlist v;
3919 string key;
3920 auto sbid = sb->get_sbid();
3921 get_shared_blob_key(sbid, &key);
3922 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3923 if (r < 0) {
3924 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3925 << std::dec << " not found at key "
3926 << pretty_binary_string(key) << dendl;
3927 ceph_abort_msg("uh oh, missing shared_blob");
3928 }
3929
3930 sb->loaded = true;
3931 sb->persistent = new bluestore_shared_blob_t(sbid);
3932 auto p = v.cbegin();
3933 decode(*(sb->persistent), p);
3934 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3935 << std::dec << " loaded shared_blob " << *sb << dendl;
3936 }
3937 }
3938
3939 void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3940 {
3941 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
3942 ceph_assert(!b->shared_blob->is_loaded());
3943
3944 // update blob
3945 bluestore_blob_t& blob = b->dirty_blob();
3946 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
3947
3948 // update shared blob
3949 b->shared_blob->loaded = true;
3950 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3951 shared_blob_set.add(this, b->shared_blob.get());
3952 for (auto p : blob.get_extents()) {
3953 if (p.is_valid()) {
3954 b->shared_blob->get_ref(
3955 p.offset,
3956 p.length);
3957 }
3958 }
3959 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3960 }
3961
3962 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3963 {
3964 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3965 ceph_assert(sb->is_loaded());
3966
3967 uint64_t sbid = sb->get_sbid();
3968 shared_blob_set.remove(sb);
3969 sb->loaded = false;
3970 delete sb->persistent;
3971 sb->sbid_unloaded = 0;
3972 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3973 return sbid;
3974 }
3975
3976 BlueStore::OnodeRef BlueStore::Collection::get_onode(
3977 const ghobject_t& oid,
3978 bool create,
3979 bool is_createop)
3980 {
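// Lookup order: consult the in-memory onode cache first; on a miss, read the
// object key from the kv store (PREFIX_OBJ) and decode it, or create a fresh
// in-memory onode when the object does not exist and 'create' allows it.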
3981 ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
3982
3983 spg_t pgid;
3984 if (cid.is_pg(&pgid)) {
3985 if (!oid.match(cnode.bits, pgid.ps())) {
3986 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3987 << pgid << " bits " << cnode.bits << dendl;
3988 ceph_abort();
3989 }
3990 }
3991
3992 OnodeRef o = onode_map.lookup(oid);
3993 if (o)
3994 return o;
3995
3996 string key;
3997 get_object_key(store->cct, oid, &key);
3998
3999 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
4000 << pretty_binary_string(key) << dendl;
4001
4002 bufferlist v;
4003 int r = -ENOENT;
4004 Onode *on;
4005 if (!is_createop) {
4006 r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
4007 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
4008 }
4009 if (v.length() == 0) {
4010 ceph_assert(r == -ENOENT);
4011 if (!store->cct->_conf->bluestore_debug_misc &&
4012 !create)
4013 return OnodeRef();
4014
4015 // new object, new onode
4016 on = new Onode(this, oid, key);
4017 } else {
4018 // loaded
4019 ceph_assert(r >= 0);
4020 on = Onode::decode(this, oid, key, v);
4021 }
4022 o.reset(on);
4023 return onode_map.add(oid, o);
4024 }
4025
4026 void BlueStore::Collection::split_cache(
4027 Collection *dest)
4028 {
4029 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
4030
4031 auto *ocache = get_onode_cache();
4032 auto *ocache_dest = dest->get_onode_cache();
4033
4034 // lock cache shards
4035 std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
4036 std::lock_guard l(ocache->lock, std::adopt_lock);
4037 std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
4038 std::lock_guard l3(cache->lock, std::adopt_lock);
4039 std::lock_guard l4(dest->cache->lock, std::adopt_lock);
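// std::lock acquires all four shard locks atomically (deadlock-free); the
// lock_guards above merely adopt them for scoped release.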
4040
4041 int destbits = dest->cnode.bits;
4042 spg_t destpg;
4043 bool is_pg = dest->cid.is_pg(&destpg);
4044 ceph_assert(is_pg);
4045
4046 auto p = onode_map.onode_map.begin();
4047 while (p != onode_map.onode_map.end()) {
4048 OnodeRef o = p->second;
4049 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
4050 // onode does not belong to this child
4051 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
4052 << dendl;
4053 ++p;
4054 } else {
4055 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid

4056 << dendl;
4057
4058 // ensure that nref is always >= 2 so the onode stays pinned (and hence
4059 // cannot be trimmed from the cache) during the transition
4060 OnodeRef o_pin = o;
4061 ceph_assert(o->pinned);
4062
4063 p = onode_map.onode_map.erase(p);
4064 dest->onode_map.onode_map[o->oid] = o;
4065 if (o->cached) {
4066 get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
4067 }
4068 o->c = dest;
4069
4070 // move over shared blobs and buffers. cover shared blobs from
4071 // both extent map and spanning blob map (the full extent map
4072 // may not be faulted in)
4073 vector<SharedBlob*> sbvec;
4074 for (auto& e : o->extent_map.extent_map) {
4075 sbvec.push_back(e.blob->shared_blob.get());
4076 }
4077 for (auto& b : o->extent_map.spanning_blob_map) {
4078 sbvec.push_back(b.second->shared_blob.get());
4079 }
4080 for (auto sb : sbvec) {
4081 if (sb->coll == dest) {
4082 ldout(store->cct, 20) << __func__ << " already moved " << *sb
4083 << dendl;
4084 continue;
4085 }
4086 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
4087 if (sb->get_sbid()) {
4088 ldout(store->cct, 20) << __func__
4089 << " moving registration " << *sb << dendl;
4090 shared_blob_set.remove(sb);
4091 dest->shared_blob_set.add(dest, sb);
4092 }
4093 sb->coll = dest;
4094 if (dest->cache != cache) {
4095 for (auto& i : sb->bc.buffer_map) {
4096 if (!i.second->is_writing()) {
4097 ldout(store->cct, 20) << __func__ << " moving " << *i.second
4098 << dendl;
4099 dest->cache->_move(cache, i.second.get());
4100 }
4101 }
4102 }
4103 }
4104 }
4105 }
4106 dest->cache->_trim();
4107 }
4108
4109 // =======================================================
4110
4111 // MempoolThread
4112
4113 #undef dout_prefix
4114 #define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
4115 #undef dout_context
4116 #define dout_context store->cct
4117
4118 void *BlueStore::MempoolThread::entry()
4119 {
4120 std::unique_lock l{lock};
4121
4122 uint32_t prev_config_change = store->config_changed.load();
4123 uint64_t base = store->osd_memory_base;
4124 double fragmentation = store->osd_memory_expected_fragmentation;
4125 uint64_t target = store->osd_memory_target;
4126 uint64_t min = store->osd_memory_cache_min;
4127 uint64_t max = min;
4128
4129 // When setting the maximum amount of memory to use for cache, first
4130 // assume some base amount of memory for the OSD and then fudge in
4131 // some overhead for fragmentation that scales with cache usage.
4132 uint64_t ltarget = (1.0 - fragmentation) * target;
4133 if (ltarget > base + min) {
4134 max = ltarget - base;
4135 }
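// Illustrative example (assumed values, not necessarily this cluster's
// settings): with osd_memory_target = 4 GiB, expected fragmentation = 0.15,
// osd_memory_base = 768 MiB and osd_memory_cache_min = 128 MiB,
// ltarget = 0.85 * 4096 MiB ~= 3481 MiB, which exceeds base + min (896 MiB),
// so max = 3481 - 768 ~= 2713 MiB of cache.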
4136
4137 binned_kv_cache = store->db->get_priority_cache();
4138 if (store->cache_autotune && binned_kv_cache != nullptr) {
4139 pcm = std::make_shared<PriorityCache::Manager>(
4140 store->cct, min, max, target, true);
4141 pcm->insert("kv", binned_kv_cache, true);
4142 pcm->insert("meta", meta_cache, true);
4143 pcm->insert("data", data_cache, true);
4144 }
4145
4146 utime_t next_balance = ceph_clock_now();
4147 utime_t next_resize = ceph_clock_now();
4148 utime_t next_deferred_force_submit = ceph_clock_now();
4149 utime_t alloc_stats_dump_clock = ceph_clock_now();
4150
4151 bool interval_stats_trim = false;
4152 while (!stop) {
4153 // Update pcm cache settings if related configuration was changed
4154 uint32_t cur_config_change = store->config_changed.load();
4155 if (cur_config_change != prev_config_change) {
4156 _update_cache_settings();
4157 prev_config_change = cur_config_change;
4158 }
4159
4160 // Before we trim, check and see if it's time to rebalance/resize.
4161 double autotune_interval = store->cache_autotune_interval;
4162 double resize_interval = store->osd_memory_cache_resize_interval;
4163 double max_defer_interval = store->max_defer_interval;
4164
4165 double alloc_stats_dump_interval =
4166 store->cct->_conf->bluestore_alloc_stats_dump_interval;
4167
4168 if (alloc_stats_dump_interval > 0 &&
4169 alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
4170 store->_record_allocation_stats();
4171 alloc_stats_dump_clock = ceph_clock_now();
4172 }
4173 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
4174 _adjust_cache_settings();
4175
4176 // Log cache stats at debug level 5 instead of 20 when a balance happens.
4177 interval_stats_trim = true;
4178
4179 if (pcm != nullptr) {
4180 pcm->balance();
4181 }
4182
4183 next_balance = ceph_clock_now();
4184 next_balance += autotune_interval;
4185 }
4186 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
4187 if (ceph_using_tcmalloc() && pcm != nullptr) {
4188 pcm->tune_memory();
4189 }
4190 next_resize = ceph_clock_now();
4191 next_resize += resize_interval;
4192 }
4193
4194 if (max_defer_interval > 0 &&
4195 next_deferred_force_submit < ceph_clock_now()) {
4196 if (store->get_deferred_last_submitted() + max_defer_interval <
4197 ceph_clock_now()) {
4198 store->deferred_try_submit();
4199 }
4200 next_deferred_force_submit = ceph_clock_now();
4201 next_deferred_force_submit += max_defer_interval/3;
4202 }
4203
4204 // Now resize the shards
4205 _resize_shards(interval_stats_trim);
4206 interval_stats_trim = false;
4207
4208 store->_update_cache_logger();
4209 auto wait = ceph::make_timespan(
4210 store->cct->_conf->bluestore_cache_trim_interval);
4211 cond.wait_for(l, wait);
4212 }
4213 // do final dump
4214 store->_record_allocation_stats();
4215 stop = false;
4216 return NULL;
4217 }
4218
4219 void BlueStore::MempoolThread::_adjust_cache_settings()
4220 {
4221 if (binned_kv_cache != nullptr) {
4222 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
4223 }
4224 meta_cache->set_cache_ratio(store->cache_meta_ratio);
4225 data_cache->set_cache_ratio(store->cache_data_ratio);
4226 }
4227
4228 void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
4229 {
4230 size_t onode_shards = store->onode_cache_shards.size();
4231 size_t buffer_shards = store->buffer_cache_shards.size();
4232 int64_t kv_used = store->db->get_cache_usage();
4233 int64_t meta_used = meta_cache->_get_used_bytes();
4234 int64_t data_used = data_cache->_get_used_bytes();
4235
4236 uint64_t cache_size = store->cache_size;
4237 int64_t kv_alloc =
4238 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
4239 int64_t meta_alloc =
4240 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
4241 int64_t data_alloc =
4242 static_cast<int64_t>(store->cache_data_ratio * cache_size);
4243
4244 if (pcm != nullptr && binned_kv_cache != nullptr) {
4245 cache_size = pcm->get_tuned_mem();
4246 kv_alloc = binned_kv_cache->get_committed_size();
4247 meta_alloc = meta_cache->get_committed_size();
4248 data_alloc = data_cache->get_committed_size();
4249 }
4250
4251 if (interval_stats) {
4252 dout(5) << __func__ << " cache_size: " << cache_size
4253 << " kv_alloc: " << kv_alloc
4254 << " kv_used: " << kv_used
4255 << " meta_alloc: " << meta_alloc
4256 << " meta_used: " << meta_used
4257 << " data_alloc: " << data_alloc
4258 << " data_used: " << data_used << dendl;
4259 } else {
4260 dout(20) << __func__ << " cache_size: " << cache_size
4261 << " kv_alloc: " << kv_alloc
4262 << " kv_used: " << kv_used
4263 << " meta_alloc: " << meta_alloc
4264 << " meta_used: " << meta_used
4265 << " data_alloc: " << data_alloc
4266 << " data_used: " << data_used << dendl;
4267 }
4268
4269 uint64_t max_shard_onodes = static_cast<uint64_t>(
4270 (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
4271 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
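// Illustrative example (assumed numbers): with meta_alloc = 1 GiB spread
// over 8 onode shards and ~6 KiB accounted per onode, each shard is capped
// at roughly (1 GiB / 8) / 6 KiB ~= 21.8k onodes; buffer shards simply split
// data_alloc evenly in bytes.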
4272
4273 dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
4274 << " max_shard_buffer: " << max_shard_buffer << dendl;
4275
4276 for (auto i : store->onode_cache_shards) {
4277 i->set_max(max_shard_onodes);
4278 }
4279 for (auto i : store->buffer_cache_shards) {
4280 i->set_max(max_shard_buffer);
4281 }
4282 }
4283
4284 void BlueStore::MempoolThread::_update_cache_settings()
4285 {
4286 // Nothing to do if pcm is not used.
4287 if (pcm == nullptr) {
4288 return;
4289 }
4290
4291 uint64_t target = store->osd_memory_target;
4292 uint64_t base = store->osd_memory_base;
4293 uint64_t min = store->osd_memory_cache_min;
4294 uint64_t max = min;
4295 double fragmentation = store->osd_memory_expected_fragmentation;
4296
4297 uint64_t ltarget = (1.0 - fragmentation) * target;
4298 if (ltarget > base + min) {
4299 max = ltarget - base;
4300 }
4301
4302 // set pcm cache levels
4303 pcm->set_target_memory(target);
4304 pcm->set_min_memory(min);
4305 pcm->set_max_memory(max);
4306
4307 dout(5) << __func__ << " updated pcm target: " << target
4308 << " pcm min: " << min
4309 << " pcm max: " << max
4310 << dendl;
4311 }
4312
4313 // =======================================================
4314
4315 // OmapIteratorImpl
4316
4317 #undef dout_prefix
4318 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
4319
4320 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
4321 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
4322 : c(c), o(o), it(it)
4323 {
4324 std::shared_lock l(c->lock);
4325 if (o->onode.has_omap()) {
4326 o->get_omap_key(string(), &head);
4327 o->get_omap_tail(&tail);
4328 it->lower_bound(head);
4329 }
4330 }
4331
4332 string BlueStore::OmapIteratorImpl::_stringify() const
4333 {
4334 stringstream s;
4335 s << " omap_iterator(cid = " << c->cid
4336 <<", oid = " << o->oid << ")";
4337 return s.str();
4338 }
4339
4340 int BlueStore::OmapIteratorImpl::seek_to_first()
4341 {
4342 std::shared_lock l(c->lock);
4343 auto start1 = mono_clock::now();
4344 if (o->onode.has_omap()) {
4345 it->lower_bound(head);
4346 } else {
4347 it = KeyValueDB::Iterator();
4348 }
4349 c->store->log_latency(
4350 __func__,
4351 l_bluestore_omap_seek_to_first_lat,
4352 mono_clock::now() - start1,
4353 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4354
4355 return 0;
4356 }
4357
4358 int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
4359 {
4360 std::shared_lock l(c->lock);
4361 auto start1 = mono_clock::now();
4362 if (o->onode.has_omap()) {
4363 string key;
4364 o->get_omap_key(after, &key);
4365 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
4366 << pretty_binary_string(key) << dendl;
4367 it->upper_bound(key);
4368 } else {
4369 it = KeyValueDB::Iterator();
4370 }
4371 c->store->log_latency_fn(
4372 __func__,
4373 l_bluestore_omap_upper_bound_lat,
4374 mono_clock::now() - start1,
4375 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4376 [&] (const ceph::timespan& lat) {
4377 return ", after = " + after +
4378 _stringify();
4379 }
4380 );
4381 return 0;
4382 }
4383
4384 int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
4385 {
4386 std::shared_lock l(c->lock);
4387 auto start1 = mono_clock::now();
4388 if (o->onode.has_omap()) {
4389 string key;
4390 o->get_omap_key(to, &key);
4391 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
4392 << pretty_binary_string(key) << dendl;
4393 it->lower_bound(key);
4394 } else {
4395 it = KeyValueDB::Iterator();
4396 }
4397 c->store->log_latency_fn(
4398 __func__,
4399 l_bluestore_omap_lower_bound_lat,
4400 mono_clock::now() - start1,
4401 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4402 [&] (const ceph::timespan& lat) {
4403 return ", to = " + to +
4404 _stringify();
4405 }
4406 );
4407 return 0;
4408 }
4409
4410 bool BlueStore::OmapIteratorImpl::valid()
4411 {
4412 std::shared_lock l(c->lock);
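// The iterator is considered valid only while it remains inside this
// object's omap key range, i.e. strictly before the precomputed tail key.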
4413 bool r = o->onode.has_omap() && it && it->valid() &&
4414 it->raw_key().second < tail;
4415 if (it && it->valid()) {
4416 ldout(c->store->cct,20) << __func__ << " is at "
4417 << pretty_binary_string(it->raw_key().second)
4418 << dendl;
4419 }
4420 return r;
4421 }
4422
4423 int BlueStore::OmapIteratorImpl::next()
4424 {
4425 int r = -1;
4426 std::shared_lock l(c->lock);
4427 auto start1 = mono_clock::now();
4428 if (o->onode.has_omap()) {
4429 it->next();
4430 r = 0;
4431 }
4432 c->store->log_latency(
4433 __func__,
4434 l_bluestore_omap_next_lat,
4435 mono_clock::now() - start1,
4436 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4437
4438 return r;
4439 }
4440
4441 string BlueStore::OmapIteratorImpl::key()
4442 {
4443 std::shared_lock l(c->lock);
4444 ceph_assert(it->valid());
4445 string db_key = it->raw_key().second;
4446 string user_key;
4447 o->decode_omap_key(db_key, &user_key);
4448
4449 return user_key;
4450 }
4451
4452 bufferlist BlueStore::OmapIteratorImpl::value()
4453 {
4454 std::shared_lock l(c->lock);
4455 ceph_assert(it->valid());
4456 return it->value();
4457 }
4458
4459
4460 // =====================================
4461
4462 #undef dout_prefix
4463 #define dout_prefix *_dout << "bluestore(" << path << ") "
4464 #undef dout_context
4465 #define dout_context cct
4466
4467
4468 static void aio_cb(void *priv, void *priv2)
4469 {
4470 BlueStore *store = static_cast<BlueStore*>(priv);
4471 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4472 c->aio_finish(store);
4473 }
4474
4475 static void discard_cb(void *priv, void *priv2)
4476 {
4477 BlueStore *store = static_cast<BlueStore*>(priv);
4478 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4479 store->handle_discard(*tmp);
4480 }
4481
4482 void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4483 {
4484 dout(10) << __func__ << dendl;
4485 ceph_assert(alloc);
4486 alloc->release(to_release);
4487 }
4488
4489 BlueStore::BlueStore(CephContext *cct, const string& path)
4490 : BlueStore(cct, path, 0) {}
4491
4492 BlueStore::BlueStore(CephContext *cct,
4493 const string& path,
4494 uint64_t _min_alloc_size)
4495 : ObjectStore(cct, path),
4496 throttle(cct),
4497 finisher(cct, "commit_finisher", "cfin"),
4498 kv_sync_thread(this),
4499 kv_finalize_thread(this),
4500 min_alloc_size(_min_alloc_size),
4501 min_alloc_size_order(ctz(_min_alloc_size)),
4502 mempool_thread(this)
4503 {
4504 _init_logger();
4505 cct->_conf.add_observer(this);
4506 set_cache_shards(1);
4507 }
4508
4509 BlueStore::~BlueStore()
4510 {
4511 cct->_conf.remove_observer(this);
4512 _shutdown_logger();
4513 ceph_assert(!mounted);
4514 ceph_assert(db == NULL);
4515 ceph_assert(bluefs == NULL);
4516 ceph_assert(fsid_fd < 0);
4517 ceph_assert(path_fd < 0);
4518 for (auto i : onode_cache_shards) {
4519 delete i;
4520 }
4521 for (auto i : buffer_cache_shards) {
4522 delete i;
4523 }
4524 onode_cache_shards.clear();
4525 buffer_cache_shards.clear();
4526 }
4527
4528 const char **BlueStore::get_tracked_conf_keys() const
4529 {
4530 static const char* KEYS[] = {
4531 "bluestore_csum_type",
4532 "bluestore_compression_mode",
4533 "bluestore_compression_algorithm",
4534 "bluestore_compression_min_blob_size",
4535 "bluestore_compression_min_blob_size_ssd",
4536 "bluestore_compression_min_blob_size_hdd",
4537 "bluestore_compression_max_blob_size",
4538 "bluestore_compression_max_blob_size_ssd",
4539 "bluestore_compression_max_blob_size_hdd",
4540 "bluestore_compression_required_ratio",
4541 "bluestore_max_alloc_size",
4542 "bluestore_prefer_deferred_size",
4543 "bluestore_prefer_deferred_size_hdd",
4544 "bluestore_prefer_deferred_size_ssd",
4545 "bluestore_deferred_batch_ops",
4546 "bluestore_deferred_batch_ops_hdd",
4547 "bluestore_deferred_batch_ops_ssd",
4548 "bluestore_throttle_bytes",
4549 "bluestore_throttle_deferred_bytes",
4550 "bluestore_throttle_cost_per_io_hdd",
4551 "bluestore_throttle_cost_per_io_ssd",
4552 "bluestore_throttle_cost_per_io",
4553 "bluestore_max_blob_size",
4554 "bluestore_max_blob_size_ssd",
4555 "bluestore_max_blob_size_hdd",
4556 "osd_memory_target",
4557 "osd_memory_target_cgroup_limit_ratio",
4558 "osd_memory_base",
4559 "osd_memory_cache_min",
4560 "osd_memory_expected_fragmentation",
4561 "bluestore_cache_autotune",
4562 "bluestore_cache_autotune_interval",
4563 "bluestore_warn_on_legacy_statfs",
4564 "bluestore_warn_on_no_per_pool_omap",
4565 "bluestore_max_defer_interval",
4566 NULL
4567 };
4568 return KEYS;
4569 }
4570
4571 void BlueStore::handle_conf_change(const ConfigProxy& conf,
4572 const std::set<std::string> &changed)
4573 {
4574 if (changed.count("bluestore_warn_on_legacy_statfs")) {
4575 _check_legacy_statfs_alert();
4576 }
4577 if (changed.count("bluestore_warn_on_no_per_pool_omap")) {
4578 _check_no_per_pool_omap_alert();
4579 }
4580
4581 if (changed.count("bluestore_csum_type")) {
4582 _set_csum();
4583 }
4584 if (changed.count("bluestore_compression_mode") ||
4585 changed.count("bluestore_compression_algorithm") ||
4586 changed.count("bluestore_compression_min_blob_size") ||
4587 changed.count("bluestore_compression_max_blob_size")) {
4588 if (bdev) {
4589 _set_compression();
4590 }
4591 }
4592 if (changed.count("bluestore_max_blob_size") ||
4593 changed.count("bluestore_max_blob_size_ssd") ||
4594 changed.count("bluestore_max_blob_size_hdd")) {
4595 if (bdev) {
4596 // only after startup
4597 _set_blob_size();
4598 }
4599 }
4600 if (changed.count("bluestore_prefer_deferred_size") ||
4601 changed.count("bluestore_prefer_deferred_size_hdd") ||
4602 changed.count("bluestore_prefer_deferred_size_ssd") ||
4603 changed.count("bluestore_max_alloc_size") ||
4604 changed.count("bluestore_deferred_batch_ops") ||
4605 changed.count("bluestore_deferred_batch_ops_hdd") ||
4606 changed.count("bluestore_deferred_batch_ops_ssd")) {
4607 if (bdev) {
4608 // only after startup
4609 _set_alloc_sizes();
4610 }
4611 }
4612 if (changed.count("bluestore_throttle_cost_per_io") ||
4613 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4614 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4615 if (bdev) {
4616 _set_throttle_params();
4617 }
4618 }
4619 if (changed.count("bluestore_throttle_bytes") ||
4620 changed.count("bluestore_throttle_deferred_bytes") ||
4621 changed.count("bluestore_throttle_trace_rate")) {
4622 throttle.reset_throttle(conf);
4623 }
4624 if (changed.count("bluestore_max_defer_interval")) {
4625 if (bdev) {
4626 _set_max_defer_interval();
4627 }
4628 }
4629 if (changed.count("osd_memory_target") ||
4630 changed.count("osd_memory_base") ||
4631 changed.count("osd_memory_cache_min") ||
4632 changed.count("osd_memory_expected_fragmentation")) {
4633 _update_osd_memory_options();
4634 }
4635 }
4636
4637 void BlueStore::_set_compression()
4638 {
4639 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4640 if (m) {
4641 _clear_compression_alert();
4642 comp_mode = *m;
4643 } else {
4644 derr << __func__ << " unrecognized value '"
4645 << cct->_conf->bluestore_compression_mode
4646 << "' for bluestore_compression_mode, reverting to 'none'"
4647 << dendl;
4648 comp_mode = Compressor::COMP_NONE;
4649 string s("unknown mode: ");
4650 s += cct->_conf->bluestore_compression_mode;
4651 _set_compression_alert(true, s.c_str());
4652 }
4653
4654 compressor = nullptr;
4655
4656 if (cct->_conf->bluestore_compression_min_blob_size) {
4657 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
4658 } else {
4659 ceph_assert(bdev);
4660 if (_use_rotational_settings()) {
4661 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4662 } else {
4663 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4664 }
4665 }
4666
4667 if (cct->_conf->bluestore_compression_max_blob_size) {
4668 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4669 } else {
4670 ceph_assert(bdev);
4671 if (_use_rotational_settings()) {
4672 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4673 } else {
4674 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4675 }
4676 }
4677
4678 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4679 if (!alg_name.empty()) {
4680 compressor = Compressor::create(cct, alg_name);
4681 if (!compressor) {
4682 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4683 << dendl;
4684 _set_compression_alert(false, alg_name.c_str());
4685 }
4686 }
4687
4688 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4689 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
4690 << " min_blob " << comp_min_blob_size
4691 << " max_blob " << comp_max_blob_size
4692 << dendl;
4693 }
4694
4695 void BlueStore::_set_csum()
4696 {
4697 csum_type = Checksummer::CSUM_NONE;
4698 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4699 if (t > Checksummer::CSUM_NONE)
4700 csum_type = t;
4701
4702 dout(10) << __func__ << " csum_type "
4703 << Checksummer::get_csum_type_string(csum_type)
4704 << dendl;
4705 }
4706
4707 void BlueStore::_set_throttle_params()
4708 {
4709 if (cct->_conf->bluestore_throttle_cost_per_io) {
4710 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4711 } else {
4712 ceph_assert(bdev);
4713 if (_use_rotational_settings()) {
4714 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4715 } else {
4716 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4717 }
4718 }
4719
4720 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4721 << dendl;
4722 }
4723 void BlueStore::_set_blob_size()
4724 {
4725 if (cct->_conf->bluestore_max_blob_size) {
4726 max_blob_size = cct->_conf->bluestore_max_blob_size;
4727 } else {
4728 ceph_assert(bdev);
4729 if (_use_rotational_settings()) {
4730 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4731 } else {
4732 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4733 }
4734 }
4735 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4736 << std::dec << dendl;
4737 }
4738
4739 void BlueStore::_update_osd_memory_options()
4740 {
4741 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4742 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4743 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4744 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4745 config_changed++;
4746 dout(10) << __func__
4747 << " osd_memory_target " << osd_memory_target
4748 << " osd_memory_base " << osd_memory_base
4749 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4750 << " osd_memory_cache_min " << osd_memory_cache_min
4751 << dendl;
4752 }
4753
4754 int BlueStore::_set_cache_sizes()
4755 {
4756 ceph_assert(bdev);
4757 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
4758 cache_autotune_interval =
4759 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
4760 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4761 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4762 osd_memory_expected_fragmentation =
4763 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4764 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4765 osd_memory_cache_resize_interval =
4766 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
4767
4768 if (cct->_conf->bluestore_cache_size) {
4769 cache_size = cct->_conf->bluestore_cache_size;
4770 } else {
4771 // choose global cache size based on backend type
4772 if (_use_rotational_settings()) {
4773 cache_size = cct->_conf->bluestore_cache_size_hdd;
4774 } else {
4775 cache_size = cct->_conf->bluestore_cache_size_ssd;
4776 }
4777 }
4778
4779 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
4780 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
4781 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4782 << ") must be in range [0,1.0]" << dendl;
4783 return -EINVAL;
4784 }
4785
4786 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
4787 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
4788 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
4789 << ") must be in range [0,1.0]" << dendl;
4790 return -EINVAL;
4791 }
4792
4793 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
4794 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4795 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4796 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4797 << dendl;
4798 return -EINVAL;
4799 }
4800
4801 cache_data_ratio =
4802 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
4803 if (cache_data_ratio < 0) {
4804 // deal with floating point imprecision
4805 cache_data_ratio = 0;
4806 }
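// Illustrative example (assumed ratios): with bluestore_cache_meta_ratio =
// 0.45 and bluestore_cache_kv_ratio = 0.45, the remainder
// cache_data_ratio = 1.0 - 0.45 - 0.45 = 0.10 of cache_size goes to data.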
4807
4808 dout(1) << __func__ << " cache_size " << cache_size
4809 << " meta " << cache_meta_ratio
4810 << " kv " << cache_kv_ratio
4811 << " data " << cache_data_ratio
4812 << dendl;
4813 return 0;
4814 }
4815
4816 int BlueStore::write_meta(const std::string& key, const std::string& value)
4817 {
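// Mirror the key/value into the block device label (when one is present) in
// addition to the regular meta file, so read_meta() can prefer the copy
// stored in the label.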
4818 bluestore_bdev_label_t label;
4819 string p = path + "/block";
4820 int r = _read_bdev_label(cct, p, &label);
4821 if (r < 0) {
4822 return ObjectStore::write_meta(key, value);
4823 }
4824 label.meta[key] = value;
4825 r = _write_bdev_label(cct, p, label);
4826 ceph_assert(r == 0);
4827 return ObjectStore::write_meta(key, value);
4828 }
4829
4830 int BlueStore::read_meta(const std::string& key, std::string *value)
4831 {
4832 bluestore_bdev_label_t label;
4833 string p = path + "/block";
4834 int r = _read_bdev_label(cct, p, &label);
4835 if (r < 0) {
4836 return ObjectStore::read_meta(key, value);
4837 }
4838 auto i = label.meta.find(key);
4839 if (i == label.meta.end()) {
4840 return ObjectStore::read_meta(key, value);
4841 }
4842 *value = i->second;
4843 return 0;
4844 }
4845
4846 void BlueStore::_init_logger()
4847 {
4848 PerfCountersBuilder b(cct, "bluestore",
4849 l_bluestore_first, l_bluestore_last);
4850 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
4851 "Average kv_thread flush latency",
4852 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
4853 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
4854 "Average kv_thread commit latency");
4855 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
4856 "Average kv_sync thread latency",
4857 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
4858 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
4859 "Average kv_finalize thread latency",
4860 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
4861 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
4862 "Average prepare state latency");
4863 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
4864 "Average aio_wait state latency",
4865 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
4866 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
4867 "Average io_done state latency");
4868 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
4869 "Average kv_queued state latency");
4870 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
4871 "Average kv_commiting state latency");
4872 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
4873 "Average kv_done state latency");
4874 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
4875 "Average deferred_queued state latency");
4876 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
4877 "Average aio_wait state latency");
4878 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
4879 "Average cleanup state latency");
4880 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
4881 "Average finishing state latency");
4882 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
4883 "Average done state latency");
4884 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
4885 "Average submit throttle latency",
4886 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
4887 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
4888 "Average submit latency",
4889 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
4890 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
4891 "Average commit latency",
4892 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
4893 b.add_time_avg(l_bluestore_read_lat, "read_lat",
4894 "Average read latency",
4895 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
4896 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
4897 "Average read onode metadata latency");
4898 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
4899 "Average read latency");
4900 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
4901 "Average compress latency");
4902 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
4903 "Average decompress latency");
4904 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
4905 "Average checksum latency");
4906 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
4907 "Sum for beneficial compress ops");
4908 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
4909 "Sum for compress ops rejected due to low net gain of space");
4910 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
4911 "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
4912 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
4913 "Sum for deferred write op");
4914 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
4915 "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
4916 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
4917 "Sum for write penalty read ops");
4918 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
4919 "Sum for allocated bytes");
4920 b.add_u64(l_bluestore_stored, "bluestore_stored",
4921 "Sum for stored bytes");
4922 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
4923 "Sum for stored compressed bytes",
4924 "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
4925 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
4926 "Sum for bytes allocated for compressed data",
4927 "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
4928 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
4929 "Sum for original bytes that were compressed",
4930 "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
4931 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
4932 "Number of onodes in cache");
4933 b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
4934 "Number of pinned onodes in cache");
4935 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
4936 "Sum for onode-lookups hit in the cache");
4937 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
4938 "Sum for onode-lookups missed in the cache");
4939 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
4940 "Sum for onode-shard lookups hit in the cache");
4941 b.add_u64_counter(l_bluestore_onode_shard_misses,
4942 "bluestore_onode_shard_misses",
4943 "Sum for onode-shard lookups missed in the cache");
4944 b.add_u64(l_bluestore_extents, "bluestore_extents",
4945 "Number of extents in cache");
4946 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
4947 "Number of blobs in cache");
4948 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
4949 "Number of buffers in cache");
4950 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
4951 "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
4952 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
4953 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
4954 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
4955 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));
4956
4957 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
4958 "Large aligned writes into fresh blobs");
4959 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
4960 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
4961 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
4962 "Large aligned writes into fresh blobs (blobs)");
4963 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
4964 "Small writes into existing or sparse small blobs");
4965 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
4966 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
4967 b.add_u64_counter(l_bluestore_write_small_unused,
4968 "bluestore_write_small_unused",
4969 "Small writes into unused portion of existing blob");
4970 b.add_u64_counter(l_bluestore_write_small_deferred,
4971 "bluestore_write_small_deferred",
4972 "Small overwrites using deferred");
4973 b.add_u64_counter(l_bluestore_write_small_pre_read,
4974 "bluestore_write_small_pre_read",
4975 "Small writes that required we read some data (possibly "
4976 "cached) to fill out the block");
4977 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
4978 "Small write into new (sparse) blob");
4979
4980 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
4981 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
4982 "Onode extent map reshard events");
4983 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
4984 "Sum for blob splitting due to resharding");
4985 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
4986 "Sum for extents that have been removed due to compression");
4987 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
4988 "Sum for extents that have been merged due to garbage "
4989 "collection");
4990 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
4991 "Read EIO errors propagated to high level callers");
4992 b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
4993 "Read operations that required at least one retry due to failed checksum validation");
4994 b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
4995 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
4996 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
4997 "Average omap iterator seek_to_first call latency");
4998 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
4999 "Average omap iterator upper_bound call latency");
5000 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
5001 "Average omap iterator lower_bound call latency");
5002 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
5003 "Average omap iterator next call latency");
5004 b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
5005 "Average omap get_keys call latency");
5006 b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
5007 "Average omap get_values call latency");
5008 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
5009 "Average collection listing latency");
5010 b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
5011 "Average removal latency");
5012
5013 logger = b.create_perf_counters();
5014 cct->get_perfcounters_collection()->add(logger);
5015 }
5016
5017 int BlueStore::_reload_logger()
5018 {
5019 struct store_statfs_t store_statfs;
5020 int r = statfs(&store_statfs);
5021 if (r >= 0) {
5022 logger->set(l_bluestore_allocated, store_statfs.allocated);
5023 logger->set(l_bluestore_stored, store_statfs.data_stored);
5024 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
5025 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
5026 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
5027 }
5028 return r;
5029 }
5030
5031 void BlueStore::_shutdown_logger()
5032 {
5033 cct->get_perfcounters_collection()->remove(logger);
5034 delete logger;
5035 }
5036
5037 int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
5038 uuid_d *fsid)
5039 {
5040 bluestore_bdev_label_t label;
5041 int r = _read_bdev_label(cct, path, &label);
5042 if (r < 0)
5043 return r;
5044 *fsid = label.osd_uuid;
5045 return 0;
5046 }
5047
5048 int BlueStore::_open_path()
5049 {
5050 // sanity check(s)
5051 ceph_assert(path_fd < 0);
5052 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
5053 if (path_fd < 0) {
5054 int r = -errno;
5055 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
5056 << dendl;
5057 return r;
5058 }
5059 return 0;
5060 }
5061
5062 void BlueStore::_close_path()
5063 {
5064 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
5065 path_fd = -1;
5066 }
5067
5068 int BlueStore::_write_bdev_label(CephContext *cct,
5069 string path, bluestore_bdev_label_t label)
5070 {
5071 dout(10) << __func__ << " path " << path << " label " << label << dendl;
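// On-disk label layout: the encoded bluestore_bdev_label_t, followed by a
// crc32c of those encoded bytes, zero-padded up to BDEV_LABEL_BLOCK_SIZE and
// written at offset 0 of the device.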
5072 bufferlist bl;
5073 encode(label, bl);
5074 uint32_t crc = bl.crc32c(-1);
5075 encode(crc, bl);
5076 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
5077 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
5078 z.zero();
5079 bl.append(std::move(z));
5080
5081 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
5082 if (fd < 0) {
5083 fd = -errno;
5084 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5085 << dendl;
5086 return fd;
5087 }
5088 int r = bl.write_fd(fd);
5089 if (r < 0) {
5090 derr << __func__ << " failed to write to " << path
5091 << ": " << cpp_strerror(r) << dendl;
5092 goto out;
5093 }
5094 r = ::fsync(fd);
5095 if (r < 0) {
5096 derr << __func__ << " failed to fsync " << path
5097 << ": " << cpp_strerror(r) << dendl;
5098 }
5099 out:
5100 VOID_TEMP_FAILURE_RETRY(::close(fd));
5101 return r;
5102 }
5103
5104 int BlueStore::_read_bdev_label(CephContext* cct, string path,
5105 bluestore_bdev_label_t *label)
5106 {
5107 dout(10) << __func__ << dendl;
5108 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
5109 if (fd < 0) {
5110 fd = -errno;
5111 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5112 << dendl;
5113 return fd;
5114 }
5115 bufferlist bl;
5116 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
5117 VOID_TEMP_FAILURE_RETRY(::close(fd));
5118 if (r < 0) {
5119 derr << __func__ << " failed to read from " << path
5120 << ": " << cpp_strerror(r) << dendl;
5121 return r;
5122 }
5123
5124 uint32_t crc, expected_crc;
5125 auto p = bl.cbegin();
5126 try {
5127 decode(*label, p);
5128 bufferlist t;
5129 t.substr_of(bl, 0, p.get_off());
5130 crc = t.crc32c(-1);
5131 decode(expected_crc, p);
5132 }
5133 catch (buffer::error& e) {
5134 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
5135 << ": " << e.what()
5136 << dendl;
5137 return -ENOENT;
5138 }
5139 if (crc != expected_crc) {
5140 derr << __func__ << " bad crc on label, expected " << expected_crc
5141 << " != actual " << crc << dendl;
5142 return -EIO;
5143 }
5144 dout(10) << __func__ << " got " << *label << dendl;
5145 return 0;
5146 }
5147
5148 int BlueStore::_check_or_set_bdev_label(
5149 string path, uint64_t size, string desc, bool create)
5150 {
5151 bluestore_bdev_label_t label;
5152 if (create) {
5153 label.osd_uuid = fsid;
5154 label.size = size;
5155 label.btime = ceph_clock_now();
5156 label.description = desc;
5157 int r = _write_bdev_label(cct, path, label);
5158 if (r < 0)
5159 return r;
5160 } else {
5161 int r = _read_bdev_label(cct, path, &label);
5162 if (r < 0)
5163 return r;
5164 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
5165 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5166 << " and fsid " << fsid << " check bypassed" << dendl;
5167 } else if (label.osd_uuid != fsid) {
5168 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5169 << " does not match our fsid " << fsid << dendl;
5170 return -EIO;
5171 }
5172 }
5173 return 0;
5174 }
5175
5176 void BlueStore::_set_alloc_sizes(void)
5177 {
5178 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
5179
5180 if (cct->_conf->bluestore_prefer_deferred_size) {
5181 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
5182 } else {
5183 ceph_assert(bdev);
5184 if (_use_rotational_settings()) {
5185 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
5186 } else {
5187 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
5188 }
5189 }
5190
5191 if (cct->_conf->bluestore_deferred_batch_ops) {
5192 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
5193 } else {
5194 ceph_assert(bdev);
5195 if (_use_rotational_settings()) {
5196 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
5197 } else {
5198 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
5199 }
5200 }
5201
5202 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
5203 << std::dec << " order " << (int)min_alloc_size_order
5204 << " max_alloc_size 0x" << std::hex << max_alloc_size
5205 << " prefer_deferred_size 0x" << prefer_deferred_size
5206 << std::dec
5207 << " deferred_batch_ops " << deferred_batch_ops
5208 << dendl;
5209 }
5210
5211 int BlueStore::_open_bdev(bool create)
5212 {
5213 ceph_assert(bdev == NULL);
5214 string p = path + "/block";
5215 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
5216 int r = bdev->open(p);
5217 if (r < 0)
5218 goto fail;
5219
5220 if (create && cct->_conf->bdev_enable_discard) {
5221 bdev->discard(0, bdev->get_size());
5222 }
5223
5224 if (bdev->supported_bdev_label()) {
5225 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
5226 if (r < 0)
5227 goto fail_close;
5228 }
5229
5230 // initialize global block parameters
5231 block_size = bdev->get_block_size();
5232 block_mask = ~(block_size - 1);
5233 block_size_order = ctz(block_size);
5234 ceph_assert(block_size == 1u << block_size_order);
5235 _set_max_defer_interval();
5236 // and set cache_size based on device type
5237 r = _set_cache_sizes();
5238 if (r < 0) {
5239 goto fail_close;
5240 }
5241 return 0;
5242
5243 fail_close:
5244 bdev->close();
5245 fail:
5246 delete bdev;
5247 bdev = NULL;
5248 return r;
5249 }
5250
5251 void BlueStore::_validate_bdev()
5252 {
5253 ceph_assert(bdev);
5254 ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
5255 uint64_t dev_size = bdev->get_size();
5256 if (dev_size <
5257 _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
5258 dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
5259 << " is too small, disable bluestore_bluefs_min for now"
5260 << dendl;
5261 ceph_assert(dev_size >= _get_ondisk_reserved());
5262
5263 int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
5264 ceph_assert(r == 0);
5265 }
5266 }
5267
5268 void BlueStore::_close_bdev()
5269 {
5270 ceph_assert(bdev);
5271 bdev->close();
5272 delete bdev;
5273 bdev = NULL;
5274 }
5275
5276 int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
5277 {
5278 int r;
5279 bluestore_bdev_label_t label;
5280
5281 ceph_assert(fm == NULL);
5282 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
5283 ceph_assert(fm);
5284 if (t) {
5285 // create mode. initialize freespace
5286 dout(20) << __func__ << " initializing freespace" << dendl;
5287 {
5288 bufferlist bl;
5289 bl.append(freelist_type);
5290 t->set(PREFIX_SUPER, "freelist_type", bl);
5291 }
5292 // being able to allocate in units less than bdev block size
5293 // seems to be a bad idea.
5294 ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
5295 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
5296
5297 // allocate superblock reserved space. note that we do not mark
5298 // bluefs space as allocated in the freelist; we instead rely on
5299 // bluefs_extents.
5300 auto reserved = _get_ondisk_reserved();
5301 fm->allocate(0, reserved, t);
5302
5303 if (cct->_conf->bluestore_bluefs) {
5304 ceph_assert(bluefs_extents.num_intervals() == 1);
5305 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
5306 reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
5307 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
5308 << " for bluefs" << dendl;
5309 }
5310
5311 if (cct->_conf->bluestore_debug_prefill > 0) {
5312 uint64_t end = bdev->get_size() - reserved;
5313 dout(1) << __func__ << " pre-fragmenting freespace, using "
5314 << cct->_conf->bluestore_debug_prefill << " with max free extent "
5315 << cct->_conf->bluestore_debug_prefragment_max << dendl;
5316 uint64_t start = p2roundup(reserved, min_alloc_size);
5317 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
5318 float r = cct->_conf->bluestore_debug_prefill;
5319 r /= 1.0 - r;
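// To end up with a used fraction of roughly bluestore_debug_prefill, free
// extents of length l alternate with allocated chunks of length
// u ~= l * prefill / (1 - prefill), so u / (l + u) ~= prefill. E.g. an
// (assumed) prefill of 0.2 gives r = 0.25, allocating about one quarter of
// each free extent's length after it.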
5320 bool stop = false;
5321
5322 while (!stop && start < end) {
5323 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
5324 if (start + l > end) {
5325 l = end - start;
5326 l = p2align(l, min_alloc_size);
5327 }
5328 ceph_assert(start + l <= end);
5329
5330 uint64_t u = 1 + (uint64_t)(r * (double)l);
5331 u = p2roundup(u, min_alloc_size);
5332 if (start + l + u > end) {
5333 u = end - (start + l);
5334 // trim to align so we don't overflow again
5335 u = p2align(u, min_alloc_size);
5336 stop = true;
5337 }
5338 ceph_assert(start + l + u <= end);
5339
5340 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
5341 << " use 0x" << u << std::dec << dendl;
5342
5343 if (u == 0) {
5344 // break if u has been trimmed to nothing
5345 break;
5346 }
5347
5348 fm->allocate(start + l, u, t);
5349 start += l + u;
5350 }
5351 }
5352 r = _write_out_fm_meta(0, false, &label);
5353 ceph_assert(r == 0);
5354 } else {
5355 string p = path + "/block";
5356 r = _read_bdev_label(cct, p, &label);
5357 if (r < 0) {
5358 derr << __func__ << " freelist init failed, error reading bdev label: " << cpp_strerror(r) << dendl;
5359 delete fm;
5360 fm = NULL;
5361 return r;
5362 }
5363 }
5364 r = fm->init(label, db, read_only);
5365 if (r < 0) {
5366 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
5367 delete fm;
5368 fm = NULL;
5369 return r;
5370 }
5371 // If the space size tracked by the freelist manager is higher than the
5372 // actual device size, allocations can land beyond the end of the device,
5373 // which would result in data loss and/or assertions.
5374 // Most likely the user altered the device size somehow.
5375 // The only fix for now is to redeploy the OSD.
5376 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5377 ostringstream ss;
5378 ss << "slow device size mismatch detected, "
5379 << " fm size(" << fm->get_size()
5380 << ") > slow device size(" << bdev->get_size()
5381 << "), Please stop using this OSD as it might cause data loss.";
5382 _set_disk_size_mismatch_alert(ss.str());
5383 }
5384 return 0;
5385 }
5386
5387 void BlueStore::_close_fm()
5388 {
5389 dout(10) << __func__ << dendl;
5390 ceph_assert(fm);
5391 fm->shutdown();
5392 delete fm;
5393 fm = NULL;
5394 }
5395
5396 int BlueStore::_write_out_fm_meta(uint64_t target_size,
5397 bool update_root_size,
5398 bluestore_bdev_label_t* res_label)
5399 {
5400 string p = path + "/block";
5401
5402 std::vector<std::pair<string, string>> fm_meta;
5403 fm->get_meta(target_size, &fm_meta);
5404
5405 bluestore_bdev_label_t label;
5406 int r = _read_bdev_label(cct, p, &label);
5407 if (r < 0)
5408 return r;
5409
5410 for (auto& m : fm_meta) {
5411 label.meta[m.first] = m.second;
5412 }
5413 if (update_root_size) {
5414 label.size = target_size;
5415 }
5416 r = _write_bdev_label(cct, p, label);
5417 if (res_label) {
5418 *res_label = label;
5419 }
5420
5421 return r;
5422 }
5423
5424 int BlueStore::_open_alloc()
5425 {
5426 ceph_assert(alloc == NULL);
5427 ceph_assert(bdev->get_size());
5428
5429 if (bluefs) {
5430 bluefs_extents.clear();
5431 auto r = bluefs->get_block_extents(bluefs_layout.shared_bdev,
5432 &bluefs_extents);
5433 if (r < 0) {
5434 lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
5435 << cpp_strerror(r) << dendl;
5436
5437 return r;
5438 }
5439 dout(10) << __func__ << " bluefs extents 0x"
5440 << std::hex << bluefs_extents << std::dec
5441 << dendl;
5442 }
5443
5444 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
5445 bdev->get_size(),
5446 min_alloc_size, "block");
5447 if (!alloc) {
5448 lderr(cct) << __func__ << " Allocator::unknown alloc type "
5449 << cct->_conf->bluestore_allocator
5450 << dendl;
5451 return -EINVAL;
5452 }
5453
5454 uint64_t num = 0, bytes = 0;
5455
5456 dout(1) << __func__ << " opening allocation metadata" << dendl;
5457 // initialize from freelist
5458 fm->enumerate_reset();
5459 uint64_t offset, length;
5460 while (fm->enumerate_next(db, &offset, &length)) {
5461 alloc->init_add_free(offset, length);
5462 ++num;
5463 bytes += length;
5464 }
5465 fm->enumerate_reset();
5466
5467 // also mark bluefs space as allocated
5468 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5469 alloc->init_rm_free(e.get_start(), e.get_len());
5470 }
5471
5472 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
5473 << " in " << num << " extents"
5474 << " available " << byte_u_t(alloc->get_free())
5475 << dendl;
5476
5477 return 0;
5478 }
5479
5480 void BlueStore::_close_alloc()
5481 {
5482 ceph_assert(bdev);
5483 bdev->discard_drain();
5484
5485 ceph_assert(alloc);
5486 alloc->shutdown();
5487 delete alloc;
5488 alloc = NULL;
5489 bluefs_extents.clear();
5490 }
5491
5492 int BlueStore::_open_fsid(bool create)
5493 {
5494 ceph_assert(fsid_fd < 0);
5495 int flags = O_RDWR|O_CLOEXEC;
5496 if (create)
5497 flags |= O_CREAT;
5498 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
5499 if (fsid_fd < 0) {
5500 int err = -errno;
5501 derr << __func__ << " " << cpp_strerror(err) << dendl;
5502 return err;
5503 }
5504 return 0;
5505 }
5506
5507 int BlueStore::_read_fsid(uuid_d *uuid)
5508 {
5509 char fsid_str[40];
5510 memset(fsid_str, 0, sizeof(fsid_str));
5511 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5512 if (ret < 0) {
5513 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5514 return ret;
5515 }
5516 if (ret > 36)
5517 fsid_str[36] = 0;
5518 else
5519 fsid_str[ret] = 0;
5520 if (!uuid->parse(fsid_str)) {
5521 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5522 return -EINVAL;
5523 }
5524 return 0;
5525 }
5526
5527 int BlueStore::_write_fsid()
5528 {
5529 int r = ::ftruncate(fsid_fd, 0);
5530 if (r < 0) {
5531 r = -errno;
5532 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5533 return r;
5534 }
5535 string str = stringify(fsid) + "\n";
5536 r = safe_write(fsid_fd, str.c_str(), str.length());
5537 if (r < 0) {
5538 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5539 return r;
5540 }
5541 r = ::fsync(fsid_fd);
5542 if (r < 0) {
5543 r = -errno;
5544 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5545 return r;
5546 }
5547 return 0;
5548 }
5549
5550 void BlueStore::_close_fsid()
5551 {
5552 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5553 fsid_fd = -1;
5554 }
5555
5556 int BlueStore::_lock_fsid()
5557 {
5558 struct flock l;
5559 memset(&l, 0, sizeof(l));
5560 l.l_type = F_WRLCK;
5561 l.l_whence = SEEK_SET;
5562 int r = ::fcntl(fsid_fd, F_SETLK, &l);
5563 if (r < 0) {
5564 int err = errno;
5565 derr << __func__ << " failed to lock " << path << "/fsid"
5566 << " (is another ceph-osd still running?)"
5567 << ": " << cpp_strerror(err) << dendl;
5568 return -err;
5569 }
5570 return 0;
5571 }
5572
5573 bool BlueStore::is_rotational()
5574 {
5575 if (bdev) {
5576 return bdev->is_rotational();
5577 }
5578
5579 bool rotational = true;
5580 int r = _open_path();
5581 if (r < 0)
5582 goto out;
5583 r = _open_fsid(false);
5584 if (r < 0)
5585 goto out_path;
5586 r = _read_fsid(&fsid);
5587 if (r < 0)
5588 goto out_fsid;
5589 r = _lock_fsid();
5590 if (r < 0)
5591 goto out_fsid;
5592 r = _open_bdev(false);
5593 if (r < 0)
5594 goto out_fsid;
5595 rotational = bdev->is_rotational();
5596 _close_bdev();
5597 out_fsid:
5598 _close_fsid();
5599 out_path:
5600 _close_path();
5601 out:
5602 return rotational;
5603 }
5604
5605 bool BlueStore::is_journal_rotational()
5606 {
5607 if (!bluefs) {
5608 dout(5) << __func__ << " bluefs disabled, defaulting to the store's media type"
5609 << dendl;
5610 return is_rotational();
5611 }
5612 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
5613 return bluefs->wal_is_rotational();
5614 }
5615
5616 bool BlueStore::_use_rotational_settings()
5617 {
5618 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
5619 return true;
5620 }
5621 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
5622 return false;
5623 }
5624 return bdev->is_rotational();
5625 }
5626
5627 bool BlueStore::test_mount_in_use()
5628 {
5629 // most error conditions mean the mount is not in use (e.g., because
5630 // it doesn't exist). only if we fail to lock do we conclude it is
5631 // in use.
5632 bool ret = false;
5633 int r = _open_path();
5634 if (r < 0)
5635 return false;
5636 r = _open_fsid(false);
5637 if (r < 0)
5638 goto out_path;
5639 r = _lock_fsid();
5640 if (r < 0)
5641 ret = true; // if we can't lock, it is in use
5642 _close_fsid();
5643 out_path:
5644 _close_path();
5645 return ret;
5646 }
5647
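// _minimal_open_bluefs: probe for the optional block.db and block.wal
// symlinks and register whatever devices exist with BlueFS; the main "block"
// device is always added as the shared (slow) device. bluefs_layout records
// which roles ended up on dedicated devices, and on create the initial BlueFS
// extents are carved out here as well.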
5648 int BlueStore::_minimal_open_bluefs(bool create)
5649 {
5650 int r;
5651 bluefs = new BlueFS(cct);
5652
5653 string bfn;
5654 struct stat st;
5655
5656 bfn = path + "/block.db";
5657 if (::stat(bfn.c_str(), &st) == 0) {
5658 r = bluefs->add_block_device(
5659 BlueFS::BDEV_DB, bfn,
5660 create && cct->_conf->bdev_enable_discard);
5661 if (r < 0) {
5662 derr << __func__ << " add block device(" << bfn << ") returned: "
5663 << cpp_strerror(r) << dendl;
5664 goto free_bluefs;
5665 }
5666
5667 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
5668 r = _check_or_set_bdev_label(
5669 bfn,
5670 bluefs->get_block_device_size(BlueFS::BDEV_DB),
5671 "bluefs db", create);
5672 if (r < 0) {
5673 derr << __func__
5674 << " check block device(" << bfn << ") label returned: "
5675 << cpp_strerror(r) << dendl;
5676 goto free_bluefs;
5677 }
5678 }
5679 if (create) {
5680 bluefs->add_block_extent(
5681 BlueFS::BDEV_DB,
5682 SUPER_RESERVED,
5683 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
5684 }
5685 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
5686 bluefs_layout.dedicated_db = true;
5687 } else {
5688 r = -errno;
5689 if (::lstat(bfn.c_str(), &st) == -1) {
5690 r = 0;
5691 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
5692 } else {
5693 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5694 << cpp_strerror(r) << dendl;
5695 goto free_bluefs;
5696 }
5697 }
5698
5699 // shared device
5700 bfn = path + "/block";
5701 // never trim here
5702 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
5703 true /* shared with bluestore */);
5704 if (r < 0) {
5705 derr << __func__ << " add block device(" << bfn << ") returned: "
5706 << cpp_strerror(r) << dendl;
5707 goto free_bluefs;
5708 }
5709 if (create) {
5710 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
5711 uint64_t initial =
5712 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
5713 cct->_conf->bluestore_bluefs_gift_ratio);
5714 initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
5715 uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
5716 if (alloc_size % min_alloc_size) {
5717 derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
5718 << alloc_size << " is not a multiple of "
5719 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
5720 r = -EINVAL;
5721 goto free_bluefs;
5722 }
5723 // align to bluefs's alloc_size
5724 initial = p2roundup(initial, alloc_size);
5725 // put bluefs in the middle of the device in case it is an HDD
5726 uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
5727 // avoid overwriting the superblock
5728 start = std::max(alloc_size, start);
5729 ceph_assert(start >= _get_ondisk_reserved());
5730
5731 bluefs->add_block_extent(bluefs_layout.shared_bdev, start, initial);
5732 bluefs_extents.insert(start, initial);
5733 ++out_of_sync_fm;
5734 }
5735
5736 bfn = path + "/block.wal";
5737 if (::stat(bfn.c_str(), &st) == 0) {
5738 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
5739 create && cct->_conf->bdev_enable_discard);
5740 if (r < 0) {
5741 derr << __func__ << " add block device(" << bfn << ") returned: "
5742 << cpp_strerror(r) << dendl;
5743 goto free_bluefs;
5744 }
5745
5746 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
5747 r = _check_or_set_bdev_label(
5748 bfn,
5749 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
5750 "bluefs wal", create);
5751 if (r < 0) {
5752 derr << __func__ << " check block device(" << bfn
5753 << ") label returned: " << cpp_strerror(r) << dendl;
5754 goto free_bluefs;
5755 }
5756 }
5757
5758 if (create) {
5759 bluefs->add_block_extent(
5760 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
5761 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
5762 BDEV_LABEL_BLOCK_SIZE);
5763 }
5764 bluefs_layout.dedicated_wal = true;
5765 } else {
5766 r = 0;
5767 if (::lstat(bfn.c_str(), &st) != -1) {
5768 r = -errno;
5769 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5770 << cpp_strerror(r) << dendl;
5771 goto free_bluefs;
5772 }
5773 }
5774 return 0;
5775
5776 free_bluefs:
5777 ceph_assert(bluefs);
5778 delete bluefs;
5779 bluefs = NULL;
5780 return r;
5781 }
5782
5783 int BlueStore::_open_bluefs(bool create)
5784 {
5785 int r = _minimal_open_bluefs(create);
5786 if (r < 0) {
5787 return r;
5788 }
5789 RocksDBBlueFSVolumeSelector* vselector = nullptr;
5790 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
5791
5792 string options = cct->_conf->bluestore_rocksdb_options;
5793 string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
5794 if (!options_annex.empty()) {
5795 if (!options.empty() &&
5796 *options.rbegin() != ',') {
5797 options += ',';
5798 }
5799 options += options_annex;
5800 }
5801
5802 rocksdb::Options rocks_opts;
5803 int r = RocksDBStore::ParseOptionsFromStringStatic(
5804 cct,
5805 options,
5806 rocks_opts,
5807 nullptr);
5808 if (r < 0) {
5809 return r;
5810 }
5811
5812 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
5813 vselector =
5814 new RocksDBBlueFSVolumeSelector(
5815 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
5816 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
5817 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
5818 1024 * 1024 * 1024, //FIXME: set expected l0 size here
5819 rocks_opts.max_bytes_for_level_base,
5820 rocks_opts.max_bytes_for_level_multiplier,
5821 reserved_factor,
5822 cct->_conf->bluestore_volume_selection_reserved,
5823 cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
5824 }
5825 if (create) {
5826 bluefs->mkfs(fsid, bluefs_layout);
5827 }
5828 bluefs->set_volume_selector(vselector);
5829 r = bluefs->mount();
5830 if (r < 0) {
5831 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
5832 }
5833 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
5834 return r;
5835 }
5836
5837 void BlueStore::_close_bluefs(bool cold_close)
5838 {
5839 bluefs->umount(cold_close);
5840 _minimal_close_bluefs();
5841 }
5842
5843 void BlueStore::_minimal_close_bluefs()
5844 {
5845 delete bluefs;
5846 bluefs = NULL;
5847 }
5848
5849 int BlueStore::_is_bluefs(bool create, bool* ret)
5850 {
5851 if (create) {
5852 *ret = cct->_conf->bluestore_bluefs;
5853 } else {
5854 string s;
5855 int r = read_meta("bluefs", &s);
5856 if (r < 0) {
5857 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
5858 return -EIO;
5859 }
5860 if (s == "1") {
5861 *ret = true;
5862 } else if (s == "0") {
5863 *ret = false;
5864 } else {
5865 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
5866 << dendl;
5867 return -EIO;
5868 }
5869 }
5870 return 0;
5871 }
5872
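// Rough sketch of how the open/close helpers in this file pair up for a
// read-only maintenance open (see cold_open()/cold_close() further down for
// the authoritative sequence; this is only an illustration):
//
//   _open_path();                  // open the OSD data directory
//   _open_fsid(false);             // open (but do not create) the fsid file
//   _read_fsid(&fsid);
//   _lock_fsid();                  // fails if another ceph-osd holds the store
//   _open_bdev(false);
//   _open_db_and_around(true);     // db + super meta + freelist + allocator
//   // ... use the store ...
//   _close_db_and_around(true);
//   _close_bdev();
//   _close_fsid();
//   _close_path();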
5873 /*
5874 * opens both DB and the dependent super_meta, FreelistManager, and allocator
5875 * in the proper order
5876 */
5877 int BlueStore::_open_db_and_around(bool read_only)
5878 {
5879 int r;
5880 bool do_bluefs = false;
5881 _is_bluefs(false, &do_bluefs); // ignore err code
5882 if (do_bluefs) {
5883 // open in read-only first to read FM list and init allocator
5884 // as they might be needed for some BlueFS procedures
5885 r = _open_db(false, false, true);
5886 if (r < 0)
5887 return r;
5888
5889 r = _open_super_meta();
5890 if (r < 0) {
5891 goto out_db;
5892 }
5893
5894 r = _open_fm(nullptr, true);
5895 if (r < 0)
5896 goto out_db;
5897
5898 r = _open_alloc();
5899 if (r < 0)
5900 goto out_fm;
5901
5902 // now open in R/W mode
5903 if (!read_only) {
5904 _close_db(true);
5905
5906 r = _open_db(false, false, false);
5907 if (r < 0) {
5908 _close_alloc();
5909 _close_fm();
5910 return r;
5911 }
5912 fm->sync(db);
5913 }
5914 } else {
5915 r = _open_db(false, false);
5916 if (r < 0) {
5917 return r;
5918 }
5919 r = _open_super_meta();
5920 if (r < 0) {
5921 goto out_db;
5922 }
5923
5924 r = _open_fm(nullptr, false);
5925 if (r < 0)
5926 goto out_db;
5927
5928 r = _open_alloc();
5929 if (r < 0)
5930 goto out_fm;
5931 }
5932 return 0;
5933
5934 out_fm:
5935 _close_fm();
5936 out_db:
5937 _close_db(read_only);
5938 return r;
5939 }
5940
5941 void BlueStore::_close_db_and_around(bool read_only)
5942 {
5943 if (bluefs) {
5944 if (!read_only && out_of_sync_fm.fetch_and(0)) {
5945 _sync_bluefs_and_fm();
5946 }
5947 _close_db(read_only);
5948 while(!read_only && out_of_sync_fm.fetch_and(0)) {
5949 // if seen some allocations during close - repeat open_db, sync fm, close
5950 dout(0) << __func__ << " syncing FreelistManager" << dendl;
5951 int r = _open_db(false, false, false);
5952 if (r < 0) {
5953 derr << __func__
5954 << " unable to open db, FreelistManager is probably out of sync"
5955 << dendl;
5956 break;
5957 }
5958 _sync_bluefs_and_fm();
5959 _close_db(false);
5960 }
5961 if (!_kv_only) {
5962 _close_alloc();
5963 _close_fm();
5964 }
5965 } else {
5966 _close_alloc();
5967 _close_fm();
5968 _close_db(read_only);
5969 }
5970 }
5971
5972 // updates legacy bluefs-related records in the DB to a state valid for
5973 // downgrades from Nautilus.
5974 void BlueStore::_sync_bluefs_and_fm()
5975 {
5976 if (cct->_conf->bluestore_bluefs_db_compatibility) {
5977 bufferlist bl;
5978 encode(bluefs_extents, bl);
5979 dout(20) << __func__ << " bluefs_extents at KV is now 0x"
5980 << std::hex << bluefs_extents << std::dec
5981 << dendl;
5982 KeyValueDB::Transaction synct = db->get_transaction();
5983 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
5984 synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
5985
5986 // The nice thing is that we don't need to update the FreelistManager here.
5987 // It always has the corresponding bits set to 'Free' for both Nautilus+ and
5988 // pre-Nautilus releases.
5989 // So once an extent makes it into bluefs_extents it is already free in the
5990 // allocator and hence free in the FM too.
5991
5992 db->submit_transaction_sync(synct);
5993 }
5994 }
5995
5996 int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
5997 {
5998 int r;
5999 ceph_assert(!db);
6000 ceph_assert(!(create && read_only));
6001 string fn = path + "/db";
6002 string options;
6003 string options_annex;
6004 stringstream err;
6005 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
6006
6007 string kv_backend;
6008 std::vector<KeyValueDB::ColumnFamily> cfs;
6009
6010 if (create) {
6011 kv_backend = cct->_conf->bluestore_kvbackend;
6012 } else {
6013 r = read_meta("kv_backend", &kv_backend);
6014 if (r < 0) {
6015 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
6016 return -EIO;
6017 }
6018 }
6019 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
6020
6021 bool do_bluefs;
6022 r = _is_bluefs(create, &do_bluefs);
6023 if (r < 0) {
6024 return r;
6025 }
6026 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
6027
6028 map<string,string> kv_options;
6029 // force separate wal dir for all new deployments.
6030 kv_options["separate_wal_dir"] = "1";
6031 rocksdb::Env *env = NULL;
6032 if (do_bluefs) {
6033 dout(10) << __func__ << " initializing bluefs" << dendl;
6034 if (kv_backend != "rocksdb") {
6035 derr << " backend must be rocksdb to use bluefs" << dendl;
6036 return -EINVAL;
6037 }
6038
6039 r = _open_bluefs(create);
6040 if (r < 0) {
6041 return r;
6042 }
6043
6044 if (cct->_conf->bluestore_bluefs_env_mirror) {
6045 rocksdb::Env* a = new BlueRocksEnv(bluefs);
6046 rocksdb::Env* b = rocksdb::Env::Default();
6047 if (create) {
6048 string cmd = "rm -rf " + path + "/db " +
6049 path + "/db.slow " +
6050 path + "/db.wal";
6051 int r = system(cmd.c_str());
6052 (void)r;
6053 }
6054 env = new rocksdb::EnvMirror(b, a, false, true);
6055 } else {
6056 env = new BlueRocksEnv(bluefs);
6057
6058 // simplify the dir names, too, as "seen" by rocksdb
6059 fn = "db";
6060 }
6061 bluefs->set_slow_device_expander(this);
6062 BlueFSVolumeSelector::paths paths;
6063 bluefs->get_vselector_paths(fn, paths);
6064
6065 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
6066 // we have both block.db and block; tell rocksdb!
6067 // note: the second (last) size value doesn't really matter
6068 ostringstream db_paths;
6069 bool first = true;
6070 for (auto& p : paths) {
6071 if (!first) {
6072 db_paths << " ";
6073 }
6074 first = false;
6075 db_paths << p.first << "," << p.second;
6076
6077 }
6078 kv_options["db_paths"] = db_paths.str();
6079 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
6080 }
6081
6082 if (create) {
6083 for (auto& p : paths) {
6084 env->CreateDir(p.first);
6085 }
6086 // Volume selectors don't provide the wal path yet, so create it explicitly
6087 env->CreateDir(fn + ".wal");
6088 } else {
6089 std::vector<std::string> res;
6090 // check for dir presence
6091 auto r = env->GetChildren(fn+".wal", &res);
6092 if (r.IsNotFound()) {
6093 kv_options.erase("separate_wal_dir");
6094 }
6095 }
6096 } else {
6097 string walfn = path + "/db.wal";
6098
6099 if (create) {
6100 int r = ::mkdir(fn.c_str(), 0755);
6101 if (r < 0)
6102 r = -errno;
6103 if (r < 0 && r != -EEXIST) {
6104 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
6105 << dendl;
6106 return r;
6107 }
6108
6109 // wal_dir, too!
6110 r = ::mkdir(walfn.c_str(), 0755);
6111 if (r < 0)
6112 r = -errno;
6113 if (r < 0 && r != -EEXIST) {
6114 derr << __func__ << " failed to create " << walfn
6115 << ": " << cpp_strerror(r)
6116 << dendl;
6117 return r;
6118 }
6119 } else {
6120 struct stat st;
6121 r = ::stat(walfn.c_str(), &st);
6122 if (r < 0 && errno == ENOENT) {
6123 kv_options.erase("separate_wal_dir");
6124 }
6125 }
6126 }
6127
6128
6129 db = KeyValueDB::create(cct,
6130 kv_backend,
6131 fn,
6132 kv_options,
6133 static_cast<void*>(env));
6134 if (!db) {
6135 derr << __func__ << " error creating db" << dendl;
6136 if (bluefs) {
6137 _close_bluefs(read_only);
6138 }
6139 // delete env manually here since we can't depend on db to do this
6140 // in this case
6141 delete env;
6142 env = NULL;
6143 return -EIO;
6144 }
6145
6146 FreelistManager::setup_merge_operators(db);
6147 db->set_merge_operator(PREFIX_STAT, merge_op);
6148 db->set_cache_size(cache_kv_ratio * cache_size);
6149
6150 if (kv_backend == "rocksdb") {
6151 options = cct->_conf->bluestore_rocksdb_options;
6152 options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6153 if (!options_annex.empty()) {
6154 if (!options.empty() &&
6155 *options.rbegin() != ',') {
6156 options += ',';
6157 }
6158 options += options_annex;
6159 }
6160
6161 map<string,string> cf_map;
6162 cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
6163 get_str_map,
6164 &cf_map,
6165 " \t");
6166 for (auto& i : cf_map) {
6167 dout(10) << "column family " << i.first << ": " << i.second << dendl;
6168 cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
6169 }
6170 }
6171
6172 db->init(options);
6173 if (to_repair_db)
6174 return 0;
6175 if (create) {
6176 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
6177 r = db->create_and_open(err, cfs);
6178 } else {
6179 r = db->create_and_open(err);
6180 }
6181 } else {
6182 // we pass in cf list here, but it is only used if the db already has
6183 // column families created.
6184 r = read_only ?
6185 db->open_read_only(err, cfs) :
6186 db->open(err, cfs);
6187 }
6188 if (r) {
6189 derr << __func__ << " error opening db: " << err.str() << dendl;
6190 _close_db(read_only);
6191 return -EIO;
6192 }
6193 dout(1) << __func__ << " opened " << kv_backend
6194 << " path " << fn << " options " << options << dendl;
6195 return 0;
6196 }
6197
6198 void BlueStore::_close_db(bool cold_close)
6199 {
6200 ceph_assert(db);
6201 delete db;
6202 db = NULL;
6203 if (bluefs) {
6204 _close_bluefs(cold_close);
6205 }
6206 }
6207
6208 void BlueStore::_dump_alloc_on_failure()
6209 {
6210 auto dump_interval =
6211 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
6212 if (dump_interval > 0 &&
6213 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
6214 alloc->dump();
6215 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
6216 next_dump_on_bluefs_alloc_failure += dump_interval;
6217 }
6218 }
6219
6220
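// allocate_bluefs_freespace: carve up to `size` bytes (but at least
// `min_size`) out of the main allocator and gift them to BlueFS, with both
// values rounded up to BlueFS's allocation unit. If extents_out is null the
// extents are applied to BlueFS directly; otherwise they are only returned to
// the caller. Returns -ENOSPC when even min_size cannot be satisfied.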
6221 int BlueStore::allocate_bluefs_freespace(
6222 uint64_t min_size,
6223 uint64_t size,
6224 PExtentVector* extents_out)
6225 {
6226 ceph_assert(min_size <= size);
6227 if (size) {
6228 // round up to alloc size
6229 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
6230 min_size = p2roundup(min_size, alloc_size);
6231 size = p2roundup(size, alloc_size);
6232
6233 PExtentVector extents_local;
6234 PExtentVector* extents = extents_out ? extents_out : &extents_local;
6235
6236
6237 uint64_t gift;
6238 uint64_t allocated = 0;
6239 int64_t alloc_len;
6240 auto need = size;
6241 auto extent_count0 = extents->size();
6242 do {
6243 // hard cap to fit into 32 bits
6244 gift = std::min<uint64_t>(size, 1ull << 30);
6245 dout(10) << __func__ << " gifting " << gift
6246 << " (" << byte_u_t(gift) << ")" << dendl;
6247
6248 alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
6249 if (alloc_len > 0) {
6250 allocated += alloc_len;
6251 size -= alloc_len;
6252 }
6253
6254 if (alloc_len < 0 ||
6255 (alloc_len < (int64_t)gift && (min_size > allocated))) {
6256 derr << __func__
6257 << " failed to allocate on 0x" << std::hex << gift
6258 << " min_size 0x" << min_size
6259 << " > allocated total 0x" << allocated
6260 << " bluefs_shared_alloc_size 0x" << alloc_size
6261 << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
6262 << " available 0x" << alloc->get_free()
6263 << std::dec << dendl;
6264
6265 _dump_alloc_on_failure();
6266 alloc->release(*extents);
6267 extents->clear();
6268 return -ENOSPC;
6269 }
6270 } while (size && alloc_len > 0);
6271 _collect_allocation_stats(need, alloc_size, extents->size() - extent_count0);
6272
6273 for (auto& e : *extents) {
6274 dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
6275 bluefs_extents.insert(e.offset, e.length);
6276 ++out_of_sync_fm;
6277 // apply to bluefs if not requested from outside
6278 if (!extents_out) {
6279 bluefs->add_block_extent(bluefs_layout.shared_bdev, e.offset, e.length);
6280 }
6281 }
6282 }
6283 return 0;
6284 }
6285
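// available_freespace: sum up free space usable at the given allocation
// granularity. Worked example with illustrative values only: for
// alloc_size = 0x10000, a free chunk at off = 0x18000 with len = 0x28000
// loses 0x8000 bytes to reach the next aligned boundary (0x20000) and then
// contributes p2align(0x20000, 0x10000) = 0x20000, i.e. two aligned units; a
// chunk shorter than its distance to the next boundary contributes nothing.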
6286 uint64_t BlueStore::available_freespace(uint64_t alloc_size) {
6287 uint64_t total = 0;
6288 auto iterated_allocation = [&](uint64_t off, uint64_t len) {
6289 // only count space that is alloc_size-aligned
6290 uint64_t dist_to_alignment;
6291 uint64_t offset_in_block = off & (alloc_size - 1);
6292 if (offset_in_block == 0)
6293 dist_to_alignment = 0;
6294 else
6295 dist_to_alignment = alloc_size - offset_in_block;
6296 if (dist_to_alignment >= len)
6297 return;
6298 len -= dist_to_alignment;
6299 total += p2align(len, alloc_size);
6300 };
6301 alloc->dump(iterated_allocation);
6302 return total;
6303 }
6304
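// _get_bluefs_size_delta: compare BlueFS's share of the total free space
// against the configured min/max ratios (and the absolute bluestore_bluefs_min,
// min_free and max_free thresholds) and return the number of bytes to gift to
// BlueFS (positive), to reclaim from it (negative), or 0.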
6305 int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
6306 {
6307 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
6308
6309 uint64_t my_free = alloc->get_free();
6310 uint64_t total = bdev->get_size();
6311 float my_free_ratio = (float)my_free / (float)total;
6312
6313 uint64_t total_free = bluefs_free + my_free;
6314
6315 float bluefs_ratio = (float)bluefs_free / (float)total_free;
6316
6317 dout(10) << __func__
6318 << " bluefs " << byte_u_t(bluefs_free)
6319 << " free (" << bluefs_free_ratio
6320 << ") bluestore " << byte_u_t(my_free)
6321 << " free (" << my_free_ratio
6322 << "), bluefs_ratio " << bluefs_ratio
6323 << dendl;
6324
6325 uint64_t gift = 0;
6326 uint64_t reclaim = 0;
6327 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
6328 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
6329 if (gift >= my_free)
6330 gift = my_free / 2;
6331 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
6332 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
6333 << ", should gift " << byte_u_t(gift) << dendl;
6334 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
6335 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
6336 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
6337 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
6338 if (reclaim >= bluefs_free)
6339 reclaim = bluefs_free / 2;
6340 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
6341 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
6342 << ", should reclaim " << byte_u_t(reclaim) << dendl;
6343 }
6344
6345 // don't take over too much of the freespace
6346 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
6347 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
6348 cct->_conf->bluestore_bluefs_min < free_cap) {
6349 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
6350 dout(10) << __func__ << " bluefs_total " << bluefs_total
6351 << " < min " << cct->_conf->bluestore_bluefs_min
6352 << ", should gift " << byte_u_t(g) << dendl;
6353 if (g > gift)
6354 gift = g;
6355 reclaim = 0;
6356 }
6357 uint64_t min_free =
6358 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6359 if (bluefs_free < min_free &&
6360 min_free < free_cap) {
6361 uint64_t g = min_free - bluefs_free;
6362 dout(10) << __func__ << " bluefs_free " << bluefs_free
6363 << " < min " << min_free
6364 << ", should gift " << byte_u_t(g) << dendl;
6365 if (g > gift)
6366 gift = g;
6367 reclaim = 0;
6368 }
6369 uint64_t max_free =
6370 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_max_free");
6371 if (bluefs_free > max_free) {
6372 dout(10) << __func__ << " bluefs_free " << bluefs_free
6373 << " > max " << max_free
6374 << ", stop gifting for now" << dendl;
6375 gift = 0;
6376 }
6377 ceph_assert((int64_t)gift >= 0);
6378 ceph_assert((int64_t)reclaim >= 0);
6379 return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
6380 }
6381
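// _balance_bluefs_freespace: raise or clear the spillover alert for the
// shared device, then ask _get_bluefs_size_delta() what to do. Only the
// reclaim case (negative delta) is acted on here, by moving extents from
// bluefs_extents into bluefs_extents_reclaiming; a positive (gift) delta is
// not handled in this function. Returns 1 if a reclaim was attempted.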
6382 int BlueStore::_balance_bluefs_freespace()
6383 {
6384 int ret = 0;
6385 ceph_assert(bluefs);
6386
6387 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
6388 bluefs->get_usage(&bluefs_usage);
6389 ceph_assert(bluefs_usage.size() > bluefs_layout.shared_bdev);
6390
6391 bool clear_alert = true;
6392 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
6393 auto& p = bluefs_usage[bluefs_layout.shared_bdev];
6394 if (p.first != p.second) {
6395 auto& db = bluefs_usage[BlueFS::BDEV_DB];
6396 ostringstream ss;
6397 ss << "spilled over " << byte_u_t(p.second - p.first)
6398 << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
6399 << " used of " << byte_u_t(db.second) << ") to slow device";
6400 _set_spillover_alert(ss.str());
6401 clear_alert = false;
6402 }
6403 }
6404 if (clear_alert) {
6405 _clear_spillover_alert();
6406 }
6407
6408 // fixme: look at primary bdev only for now
6409 int64_t delta = _get_bluefs_size_delta(
6410 bluefs_usage[bluefs_layout.shared_bdev].first,
6411 bluefs_usage[bluefs_layout.shared_bdev].second);
6412
6413 // reclaim from bluefs?
6414 if (delta < 0) {
6415 // round up to alloc size
6416 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
6417 auto reclaim = p2roundup(uint64_t(-delta), alloc_size);
6418
6419 // hard cap to fit into 32 bits
6420 reclaim = std::min<uint64_t>(reclaim, 1ull << 30);
6421 dout(10) << __func__ << " reclaiming " << reclaim
6422 << " (" << byte_u_t(reclaim) << ")" << dendl;
6423
6424 while (reclaim > 0) {
6425 // NOTE: this will block and do IO.
6426 PExtentVector extents;
6427 int r = bluefs->reclaim_blocks(bluefs_layout.shared_bdev, reclaim,
6428 &extents);
6429 if (r < 0) {
6430 derr << __func__ << " failed to reclaim space from bluefs"
6431 << dendl;
6432 break;
6433 }
6434 for (auto e : extents) {
6435 ++out_of_sync_fm;
6436 bluefs_extents.erase(e.offset, e.length);
6437 bluefs_extents_reclaiming.insert(e.offset, e.length);
6438 reclaim -= e.length;
6439 }
6440 }
6441
6442 ret = 1;
6443 }
6444
6445 return ret;
6446 }
6447
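// _open_collections: iterate the collection namespace, decode each cnode and
// instantiate an in-memory Collection bound to its onode/buffer cache shards.
// Keys that do not parse as a coll_t only set collections_had_errors so that
// _fsck_collections() can count them later; a cnode that fails to decode is a
// hard error (-EIO).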
6448 int BlueStore::_open_collections()
6449 {
6450 dout(10) << __func__ << dendl;
6451 collections_had_errors = false;
6452 ceph_assert(coll_map.empty());
6453 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6454 for (it->upper_bound(string());
6455 it->valid();
6456 it->next()) {
6457 coll_t cid;
6458 if (cid.parse(it->key())) {
6459 auto c = ceph::make_ref<Collection>(
6460 this,
6461 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6462 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6463 cid);
6464 bufferlist bl = it->value();
6465 auto p = bl.cbegin();
6466 try {
6467 decode(c->cnode, p);
6468 } catch (buffer::error& e) {
6469 derr << __func__ << " failed to decode cnode, key:"
6470 << pretty_binary_string(it->key()) << dendl;
6471 return -EIO;
6472 }
6473 dout(20) << __func__ << " opened " << cid << " " << c
6474 << " " << c->cnode << dendl;
6475 _osr_attach(c.get());
6476 coll_map[cid] = c;
6477
6478 } else {
6479 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6480 collections_had_errors = true;
6481 }
6482 }
6483 return 0;
6484 }
6485
6486 void BlueStore::_fsck_collections(int64_t* errors)
6487 {
6488 if (collections_had_errors) {
6489 dout(10) << __func__ << dendl;
6490 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6491 for (it->upper_bound(string());
6492 it->valid();
6493 it->next()) {
6494 coll_t cid;
6495 if (!cid.parse(it->key())) {
6496 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6497 if (errors) {
6498 (*errors)++;
6499 }
6500 }
6501 }
6502 }
6503 }
6504
6505 void BlueStore::_set_per_pool_omap()
6506 {
6507 per_pool_omap = false;
6508 bufferlist bl;
6509 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6510 if (bl.length()) {
6511 per_pool_omap = true;
6512 dout(10) << __func__ << " per_pool_omap=1" << dendl;
6513 } else {
6514 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6515 }
6516 _check_no_per_pool_omap_alert();
6517 }
6518
6519 void BlueStore::_open_statfs()
6520 {
6521 osd_pools.clear();
6522 vstatfs.reset();
6523
6524 bufferlist bl;
6525 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
6526 if (r >= 0) {
6527 per_pool_stat_collection = false;
6528 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
6529 auto it = bl.cbegin();
6530 vstatfs.decode(it);
6531 dout(10) << __func__ << " store_statfs is found" << dendl;
6532 } else {
6533 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6534 }
6535 _check_legacy_statfs_alert();
6536 } else {
6537 per_pool_stat_collection = true;
6538 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
6539 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
6540 for (it->upper_bound(string());
6541 it->valid();
6542 it->next()) {
6543
6544 uint64_t pool_id;
6545 int r = get_key_pool_stat(it->key(), &pool_id);
6546 ceph_assert(r == 0);
6547
6548 bufferlist bl;
6549 bl = it->value();
6550 auto p = bl.cbegin();
6551 auto& st = osd_pools[pool_id];
6552 try {
6553 st.decode(p);
6554 vstatfs += st;
6555
6556 dout(30) << __func__ << " pool " << pool_id
6557 << " statfs " << st << dendl;
6558 } catch (buffer::error& e) {
6559 derr << __func__ << " failed to decode pool stats, key:"
6560 << pretty_binary_string(it->key()) << dendl;
6561 }
6562 }
6563 }
6564 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6565
6566 }
6567
6568 int BlueStore::_setup_block_symlink_or_file(
6569 string name,
6570 string epath,
6571 uint64_t size,
6572 bool create)
6573 {
6574 dout(20) << __func__ << " name " << name << " path " << epath
6575 << " size " << size << " create=" << (int)create << dendl;
6576 int r = 0;
6577 int flags = O_RDWR|O_CLOEXEC;
6578 if (create)
6579 flags |= O_CREAT;
6580 if (epath.length()) {
6581 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6582 if (r < 0) {
6583 r = -errno;
6584 derr << __func__ << " failed to create " << name << " symlink to "
6585 << epath << ": " << cpp_strerror(r) << dendl;
6586 return r;
6587 }
6588
6589 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6590 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6591 if (fd < 0) {
6592 r = -errno;
6593 derr << __func__ << " failed to open " << epath << " file: "
6594 << cpp_strerror(r) << dendl;
6595 return r;
6596 }
6597 // write the Transport ID of the NVMe device
6598 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6599 // where "0000:02:00.0" is the selector of a PCI device, see
6600 // the first column of "lspci -mm -n -D"
6601 string trid{"trtype:PCIe "};
6602 trid += "traddr:";
6603 trid += epath.substr(strlen(SPDK_PREFIX));
6604 r = ::write(fd, trid.c_str(), trid.size());
6605 ceph_assert(r == static_cast<int>(trid.size()));
6606 dout(1) << __func__ << " created " << name << " symlink to "
6607 << epath << dendl;
6608 VOID_TEMP_FAILURE_RETRY(::close(fd));
6609 }
6610 }
6611 if (size) {
6612 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6613 if (fd >= 0) {
6614 // block file is present
6615 struct stat st;
6616 int r = ::fstat(fd, &st);
6617 if (r == 0 &&
6618 S_ISREG(st.st_mode) && // if it is a regular file
6619 st.st_size == 0) { // and is 0 bytes
6620 r = ::ftruncate(fd, size);
6621 if (r < 0) {
6622 r = -errno;
6623 derr << __func__ << " failed to resize " << name << " file to "
6624 << size << ": " << cpp_strerror(r) << dendl;
6625 VOID_TEMP_FAILURE_RETRY(::close(fd));
6626 return r;
6627 }
6628
6629 if (cct->_conf->bluestore_block_preallocate_file) {
6630 r = ::ceph_posix_fallocate(fd, 0, size);
6631 if (r > 0) {
6632 derr << __func__ << " failed to preallocate " << name << " file to "
6633 << size << ": " << cpp_strerror(r) << dendl;
6634 VOID_TEMP_FAILURE_RETRY(::close(fd));
6635 return -r;
6636 }
6637 }
6638 dout(1) << __func__ << " resized " << name << " file to "
6639 << byte_u_t(size) << dendl;
6640 }
6641 VOID_TEMP_FAILURE_RETRY(::close(fd));
6642 } else {
6643 int r = -errno;
6644 if (r != -ENOENT) {
6645 derr << __func__ << " failed to open " << name << " file: "
6646 << cpp_strerror(r) << dendl;
6647 return r;
6648 }
6649 }
6650 }
6651 return 0;
6652 }
6653
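// mkfs: format a new BlueStore instance under `path`. The happy path records
// the type and fsid, sets up the block (and optional block.wal/block.db)
// symlinks, picks min_alloc_size from the media type, creates the KV store
// and freelist, and finally writes the "mkfs_done" marker. Re-running against
// an already formatted store short-circuits (idempotent), optionally after an
// fsck.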
6654 int BlueStore::mkfs()
6655 {
6656 dout(1) << __func__ << " path " << path << dendl;
6657 int r;
6658 uuid_d old_fsid;
6659
6660 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6661 derr << __func__ << " osd_max_object_size "
6662 << cct->_conf->osd_max_object_size << " > bluestore max "
6663 << OBJECT_MAX_SIZE << dendl;
6664 return -EINVAL;
6665 }
6666
6667 {
6668 string done;
6669 r = read_meta("mkfs_done", &done);
6670 if (r == 0) {
6671 dout(1) << __func__ << " already created" << dendl;
6672 if (cct->_conf->bluestore_fsck_on_mkfs) {
6673 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6674 if (r < 0) {
6675 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6676 << dendl;
6677 return r;
6678 }
6679 if (r > 0) {
6680 derr << __func__ << " fsck found " << r << " errors" << dendl;
6681 r = -EIO;
6682 }
6683 }
6684 return r; // idempotent
6685 }
6686 }
6687
6688 {
6689 string type;
6690 r = read_meta("type", &type);
6691 if (r == 0) {
6692 if (type != "bluestore") {
6693 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6694 return -EIO;
6695 }
6696 } else {
6697 r = write_meta("type", "bluestore");
6698 if (r < 0)
6699 return r;
6700 }
6701 }
6702
6703 freelist_type = "bitmap";
6704
6705 r = _open_path();
6706 if (r < 0)
6707 return r;
6708
6709 r = _open_fsid(true);
6710 if (r < 0)
6711 goto out_path_fd;
6712
6713 r = _lock_fsid();
6714 if (r < 0)
6715 goto out_close_fsid;
6716
6717 r = _read_fsid(&old_fsid);
6718 if (r < 0 || old_fsid.is_zero()) {
6719 if (fsid.is_zero()) {
6720 fsid.generate_random();
6721 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6722 } else {
6723 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6724 }
6725 // we'll write it later.
6726 } else {
6727 if (!fsid.is_zero() && fsid != old_fsid) {
6728 derr << __func__ << " on-disk fsid " << old_fsid
6729 << " != provided " << fsid << dendl;
6730 r = -EINVAL;
6731 goto out_close_fsid;
6732 }
6733 fsid = old_fsid;
6734 }
6735
6736 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6737 cct->_conf->bluestore_block_size,
6738 cct->_conf->bluestore_block_create);
6739 if (r < 0)
6740 goto out_close_fsid;
6741 if (cct->_conf->bluestore_bluefs) {
6742 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6743 cct->_conf->bluestore_block_wal_size,
6744 cct->_conf->bluestore_block_wal_create);
6745 if (r < 0)
6746 goto out_close_fsid;
6747 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6748 cct->_conf->bluestore_block_db_size,
6749 cct->_conf->bluestore_block_db_create);
6750 if (r < 0)
6751 goto out_close_fsid;
6752 }
6753
6754 r = _open_bdev(true);
6755 if (r < 0)
6756 goto out_close_fsid;
6757
6758 // choose min_alloc_size
6759 if (cct->_conf->bluestore_min_alloc_size) {
6760 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6761 } else {
6762 ceph_assert(bdev);
6763 if (bdev->is_rotational()) {
6764 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6765 } else {
6766 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6767 }
6768 }
6769 _validate_bdev();
6770
6771 // make sure min_alloc_size is a power of 2.
6772 if (!isp2(min_alloc_size)) {
6773 derr << __func__ << " min_alloc_size 0x"
6774 << std::hex << min_alloc_size << std::dec
6775 << " is not a power of 2!"
6776 << dendl;
6777 r = -EINVAL;
6778 goto out_close_bdev;
6779 }
6780
6781 r = _open_db(true);
6782 if (r < 0)
6783 goto out_close_bdev;
6784
6785 {
6786 KeyValueDB::Transaction t = db->get_transaction();
6787 r = _open_fm(t, true);
6788 if (r < 0)
6789 goto out_close_db;
6790 {
6791 bufferlist bl;
6792 encode((uint64_t)0, bl);
6793 t->set(PREFIX_SUPER, "nid_max", bl);
6794 t->set(PREFIX_SUPER, "blobid_max", bl);
6795 }
6796
6797 {
6798 bufferlist bl;
6799 encode((uint64_t)min_alloc_size, bl);
6800 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6801 }
6802 {
6803 bufferlist bl;
6804 bl.append("1");
6805 t->set(PREFIX_SUPER, "per_pool_omap", bl);
6806 }
6807 ondisk_format = latest_ondisk_format;
6808 _prepare_ondisk_format_super(t);
6809 db->submit_transaction_sync(t);
6810 }
6811
6812 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6813 if (r < 0)
6814 goto out_close_fm;
6815
6816 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
6817 if (r < 0)
6818 goto out_close_fm;
6819
6820 if (fsid != old_fsid) {
6821 r = _write_fsid();
6822 if (r < 0) {
6823 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
6824 goto out_close_fm;
6825 }
6826 }
6827
6828 if (out_of_sync_fm.fetch_and(0)) {
6829 _sync_bluefs_and_fm();
6830 }
6831
6832 out_close_fm:
6833 _close_fm();
6834 out_close_db:
6835 _close_db(false);
6836 out_close_bdev:
6837 _close_bdev();
6838 out_close_fsid:
6839 _close_fsid();
6840 out_path_fd:
6841 _close_path();
6842
6843 if (r == 0 &&
6844 cct->_conf->bluestore_fsck_on_mkfs) {
6845 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6846 if (rc < 0)
6847 return rc;
6848 if (rc > 0) {
6849 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6850 r = -EIO;
6851 }
6852 }
6853
6854 if (r == 0) {
6855 // indicate success by writing the 'mkfs_done' file
6856 r = write_meta("mkfs_done", "yes");
6857 }
6858
6859 if (r < 0) {
6860 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6861 } else {
6862 dout(0) << __func__ << " success" << dendl;
6863 }
6864 return r;
6865 }
6866
6867 int BlueStore::_mount_for_bluefs()
6868 {
6869 int r = _open_path();
6870 ceph_assert(r == 0);
6871 r = _open_fsid(false);
6872 ceph_assert(r == 0);
6873 r = _read_fsid(&fsid);
6874 ceph_assert(r == 0);
6875 r = _lock_fsid();
6876 ceph_assert(r == 0);
6877 r = _open_bluefs(false);
6878 ceph_assert(r == 0);
6879 return r;
6880 }
6881
6882 void BlueStore::_umount_for_bluefs()
6883 {
6884 _close_bluefs(false);
6885 _close_fsid();
6886 _close_path();
6887 }
6888
6889 int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6890 {
6891 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6892 int r;
6893 ceph_assert(path_fd < 0);
6894
6895 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6896
6897 if (!cct->_conf->bluestore_bluefs) {
6898 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6899 return -EIO;
6900 }
6901
6902 r = _mount_for_bluefs();
6903
6904 int reserved = 0;
6905 if (id == BlueFS::BDEV_NEWWAL) {
6906 string p = path + "/block.wal";
6907 r = _setup_block_symlink_or_file("block.wal", dev_path,
6908 cct->_conf->bluestore_block_wal_size,
6909 true);
6910 ceph_assert(r == 0);
6911
6912 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
6913 cct->_conf->bdev_enable_discard);
6914 ceph_assert(r == 0);
6915
6916 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6917 r = _check_or_set_bdev_label(
6918 p,
6919 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6920 "bluefs wal",
6921 true);
6922 ceph_assert(r == 0);
6923 }
6924
6925 reserved = BDEV_LABEL_BLOCK_SIZE;
6926 bluefs_layout.dedicated_wal = true;
6927 } else if (id == BlueFS::BDEV_NEWDB) {
6928 string p = path + "/block.db";
6929 r = _setup_block_symlink_or_file("block.db", dev_path,
6930 cct->_conf->bluestore_block_db_size,
6931 true);
6932 ceph_assert(r == 0);
6933
6934 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
6935 cct->_conf->bdev_enable_discard);
6936 ceph_assert(r == 0);
6937
6938 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6939 r = _check_or_set_bdev_label(
6940 p,
6941 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6942 "bluefs db",
6943 true);
6944 ceph_assert(r == 0);
6945 }
6946 reserved = SUPER_RESERVED;
6947 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6948 bluefs_layout.dedicated_db = true;
6949 }
6950
6951 bluefs->umount();
6952 bluefs->mount();
6953
6954 bluefs->add_block_extent(
6955 id,
6956 reserved,
6957 bluefs->get_block_device_size(id) - reserved, true);
6958
6959 r = bluefs->prepare_new_device(id, bluefs_layout);
6960 ceph_assert(r == 0);
6961
6962 if (r < 0) {
6963 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6964 } else {
6965 dout(0) << __func__ << " success" << dendl;
6966 }
6967
6968 _umount_for_bluefs();
6969 return r;
6970 }
6971
6972 int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6973 int id)
6974 {
6975 dout(10) << __func__ << " id:" << id << dendl;
6976 ceph_assert(path_fd < 0);
6977
6978 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6979
6980 if (!cct->_conf->bluestore_bluefs) {
6981 derr << __func__ << " bluefs isn't configured, can't migrate " << dendl;
6982 return -EIO;
6983 }
6984
6985 int r = _mount_for_bluefs();
6986
6987 // require bluestore_bluefs_min_free to be free at target device!
6988 uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6989 for(auto src_id : devs_source) {
6990 used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
6991 }
6992 uint64_t target_free = bluefs->get_free(id);
6993 if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
6994 // will need to remount full BlueStore instance to allocate more space
6995 _umount_for_bluefs();
6996
6997 r = mount();
6998 ceph_assert(r == 0);
6999 dout(1) << __func__
7000 << " Allocating more space at slow device for BlueFS: +"
7001 << used_space - target_free << " bytes" << dendl;
7002 r = allocate_bluefs_freespace(
7003 used_space - target_free,
7004 used_space - target_free,
7005 nullptr);
7006
7007 umount();
7008 if (r != 0) {
7009 derr << __func__
7010 << " can't migrate, unable to allocate extra space: "
7011 << used_space - target_free << " at target:" << id
7012 << dendl;
7013 return -ENOSPC;
7014 }
7015
7016 r = _mount_for_bluefs();
7017 ceph_assert(r == 0);
7018 } else if (target_free < used_space) {
7019 derr << __func__
7020 << " can't migrate, free space at target: " << target_free
7021 << " is less than required space: " << used_space
7022 << dendl;
7023 return -ENOSPC;
7024 }
7025 if (devs_source.count(BlueFS::BDEV_DB)) {
7026 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7027 bluefs_layout.dedicated_db = false;
7028 }
7029 if (devs_source.count(BlueFS::BDEV_WAL)) {
7030 bluefs_layout.dedicated_wal = false;
7031 }
7032 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
7033 if (r < 0) {
7034 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
7035 goto shutdown;
7036 }
7037
7038 if (devs_source.count(BlueFS::BDEV_DB)) {
7039 r = unlink(string(path + "/block.db").c_str());
7040 ceph_assert(r == 0);
7041 }
7042 if (devs_source.count(BlueFS::BDEV_WAL)) {
7043 r = unlink(string(path + "/block.wal").c_str());
7044 ceph_assert(r == 0);
7045 }
7046
7047 shutdown:
7048 _umount_for_bluefs();
7049 return r;
7050 }
7051
7052 int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
7053 int id,
7054 const string& dev_path)
7055 {
7056 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
7057 int r;
7058 ceph_assert(path_fd < 0);
7059
7060 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7061
7062 if (!cct->_conf->bluestore_bluefs) {
7063 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7064 return -EIO;
7065 }
7066
7067 r = _mount_for_bluefs();
7068
7069 int reserved = 0;
7070 string link_db;
7071 string link_wal;
7072 if (devs_source.count(BlueFS::BDEV_DB) &&
7073 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
7074 link_db = path + "/block.db";
7075 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7076 bluefs_layout.dedicated_db = false;
7077 }
7078 if (devs_source.count(BlueFS::BDEV_WAL)) {
7079 link_wal = path + "/block.wal";
7080 bluefs_layout.dedicated_wal = false;
7081 }
7082
7083 size_t target_size;
7084 string target_name;
7085 if (id == BlueFS::BDEV_NEWWAL) {
7086 target_name = "block.wal";
7087 target_size = cct->_conf->bluestore_block_wal_size;
7088 bluefs_layout.dedicated_wal = true;
7089
7090 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
7091 cct->_conf->bdev_enable_discard);
7092 ceph_assert(r == 0);
7093
7094 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7095 r = _check_or_set_bdev_label(
7096 dev_path,
7097 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7098 "bluefs wal",
7099 true);
7100 ceph_assert(r == 0);
7101 }
7102 reserved = BDEV_LABEL_BLOCK_SIZE;
7103 } else if (id == BlueFS::BDEV_NEWDB) {
7104 target_name = "block.db";
7105 target_size = cct->_conf->bluestore_block_db_size;
7106 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7107 bluefs_layout.dedicated_db = true;
7108
7109 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
7110 cct->_conf->bdev_enable_discard);
7111 ceph_assert(r == 0);
7112
7113 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7114 r = _check_or_set_bdev_label(
7115 dev_path,
7116 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7117 "bluefs db",
7118 true);
7119 ceph_assert(r == 0);
7120 }
7121 reserved = SUPER_RESERVED;
7122 }
7123
7124 bluefs->umount();
7125 bluefs->mount();
7126
7127 bluefs->add_block_extent(
7128 id, reserved, bluefs->get_block_device_size(id) - reserved);
7129
7130 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
7131
7132 if (r < 0) {
7133 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
7134 goto shutdown;
7135 }
7136
7137 if (!link_db.empty()) {
7138 r = unlink(link_db.c_str());
7139 ceph_assert(r == 0);
7140 }
7141 if (!link_wal.empty()) {
7142 r = unlink(link_wal.c_str());
7143 ceph_assert(r == 0);
7144 }
7145 r = _setup_block_symlink_or_file(
7146 target_name,
7147 dev_path,
7148 target_size,
7149 true);
7150 ceph_assert(r == 0);
7151 dout(0) << __func__ << " success" << dendl;
7152
7153 shutdown:
7154 _umount_for_bluefs();
7155 return r;
7156 }
7157
7158 string BlueStore::get_device_path(unsigned id)
7159 {
7160 string res;
7161 if (id < BlueFS::MAX_BDEV) {
7162 switch (id) {
7163 case BlueFS::BDEV_WAL:
7164 res = path + "/block.wal";
7165 break;
7166 case BlueFS::BDEV_DB:
7167 if (id == bluefs_layout.shared_bdev) {
7168 res = path + "/block";
7169 } else {
7170 res = path + "/block.db";
7171 }
7172 break;
7173 case BlueFS::BDEV_SLOW:
7174 res = path + "/block";
7175 break;
7176 }
7177 }
7178 return res;
7179 }
7180
7181 int BlueStore::expand_devices(ostream& out)
7182 {
7183 int r = cold_open();
7184 ceph_assert(r == 0);
7185 bluefs->dump_block_extents(out);
7186 out << "Expanding DB/WAL..." << std::endl;
7187 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
7188 if (devid == bluefs_layout.shared_bdev ) {
7189 continue;
7190 }
7191 uint64_t size = bluefs->get_block_device_size(devid);
7192 if (size == 0) {
7193 // no bdev
7194 continue;
7195 }
7196
7197 interval_set<uint64_t> before;
7198 bluefs->get_block_extents(devid, &before);
7199 ceph_assert(!before.empty());
7200 uint64_t end = before.range_end();
7201 if (end < size) {
7202 out << devid
7203 << " : expanding from 0x" << std::hex
7204 << end << " to 0x" << size << std::dec << std::endl;
7205 bluefs->add_block_extent(devid, end, size-end);
7206 string p = get_device_path(devid);
7207 const char* path = p.c_str();
7208 if (p.empty()) {
7209 derr << devid
7210 << ": can't find device path" << dendl;
7211 continue;
7212 }
7213 bluestore_bdev_label_t label;
7214 int r = _read_bdev_label(cct, path, &label);
7215 if (r < 0) {
7216 derr << "unable to read label for " << path << ": "
7217 << cpp_strerror(r) << dendl;
7218 continue;
7219 }
7220 label.size = size;
7221 r = _write_bdev_label(cct, path, label);
7222 if (r < 0) {
7223 derr << "unable to write label for " << path << ": "
7224 << cpp_strerror(r) << dendl;
7225 continue;
7226 }
7227 out << devid
7228 << " : size label updated to " << size
7229 << std::endl;
7230 }
7231 }
7232 uint64_t size0 = fm->get_size();
7233 uint64_t size = bdev->get_size();
7234 if (size0 < size) {
7235 out << bluefs_layout.shared_bdev
7236 << " : expanding from 0x" << std::hex
7237 << size0 << " to 0x" << size << std::dec << std::endl;
7238 _write_out_fm_meta(size, true);
7239 cold_close();
7240
7241 // mount in read/write to sync expansion changes
7242 r = _mount(false);
7243 ceph_assert(r == 0);
7244 umount();
7245 } else {
7246 cold_close();
7247 }
7248 return r;
7249 }
7250
7251 int BlueStore::dump_bluefs_sizes(ostream& out)
7252 {
7253 int r = cold_open();
7254 ceph_assert(r == 0);
7255 bluefs->dump_block_extents(out);
7256 cold_close();
7257 return r;
7258 }
7259
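// set_cache_shards: grow (never shrink) the onode and buffer cache shard
// vectors to `num`, creating any new shards with the configured cache type.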
7260 void BlueStore::set_cache_shards(unsigned num)
7261 {
7262 dout(10) << __func__ << " " << num << dendl;
7263 size_t oold = onode_cache_shards.size();
7264 size_t bold = buffer_cache_shards.size();
7265 ceph_assert(num >= oold && num >= bold);
7266 onode_cache_shards.resize(num);
7267 buffer_cache_shards.resize(num);
7268 for (unsigned i = oold; i < num; ++i) {
7269 onode_cache_shards[i] =
7270 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7271 logger);
7272 }
7273 for (unsigned i = bold; i < num; ++i) {
7274 buffer_cache_shards[i] =
7275 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7276 logger);
7277 }
7278 }
7279
7280 int BlueStore::_mount(bool kv_only, bool open_db)
7281 {
7282 dout(1) << __func__ << " path " << path << dendl;
7283
7284 _kv_only = kv_only;
7285
7286 {
7287 string type;
7288 int r = read_meta("type", &type);
7289 if (r < 0) {
7290 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
7291 << dendl;
7292 return r;
7293 }
7294
7295 if (type != "bluestore") {
7296 derr << __func__ << " expected bluestore, but type is " << type << dendl;
7297 return -EIO;
7298 }
7299 }
7300
7301 if (cct->_conf->bluestore_fsck_on_mount) {
7302 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
7303 if (rc < 0)
7304 return rc;
7305 if (rc > 0) {
7306 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7307 return -EIO;
7308 }
7309 }
7310
7311 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7312 derr << __func__ << " osd_max_object_size "
7313 << cct->_conf->osd_max_object_size << " > bluestore max "
7314 << OBJECT_MAX_SIZE << dendl;
7315 return -EINVAL;
7316 }
7317
7318 int r = _open_path();
7319 if (r < 0)
7320 return r;
7321 r = _open_fsid(false);
7322 if (r < 0)
7323 goto out_path;
7324
7325 r = _read_fsid(&fsid);
7326 if (r < 0)
7327 goto out_fsid;
7328
7329 r = _lock_fsid();
7330 if (r < 0)
7331 goto out_fsid;
7332
7333 r = _open_bdev(false);
7334 if (r < 0)
7335 goto out_fsid;
7336
7337 if (open_db) {
7338 r = _open_db_and_around(false);
7339 } else {
7340 // we can bypass the full db open only in kv_only mode
7341 ceph_assert(kv_only);
7342 r = _open_db(false, true);
7343 }
7344 if (r < 0) {
7345 goto out_bdev;
7346 }
7347
7348 if (kv_only)
7349 return 0;
7350
7351 r = _upgrade_super();
7352 if (r < 0) {
7353 goto out_db;
7354 }
7355
7356 r = _open_collections();
7357 if (r < 0)
7358 goto out_db;
7359
7360 r = _reload_logger();
7361 if (r < 0)
7362 goto out_coll;
7363
7364 _kv_start();
7365
7366 r = _deferred_replay();
7367 if (r < 0)
7368 goto out_stop;
7369
7370 mempool_thread.init();
7371
7372 if ((!per_pool_stat_collection || !per_pool_omap) &&
7373 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
7374
7375 bool was_per_pool_omap = per_pool_omap;
7376
7377 dout(1) << __func__ << " quick-fix on mount" << dendl;
7378 _fsck_on_open(FSCK_SHALLOW, true);
7379
7380 //reread statfs
7381 //FIXME minor: replace with actual open/close?
7382 _open_statfs();
7383 _check_legacy_statfs_alert();
7384
7385 //set again as hopefully it has been fixed
7386 if (!was_per_pool_omap) {
7387 _set_per_pool_omap();
7388 }
7389 }
7390
7391 mounted = true;
7392 return 0;
7393
7394 out_stop:
7395 _kv_stop();
7396 out_coll:
7397 _shutdown_cache();
7398 out_db:
7399 _close_db_and_around(false);
7400 out_bdev:
7401 _close_bdev();
7402 out_fsid:
7403 _close_fsid();
7404 out_path:
7405 _close_path();
7406 return r;
7407 }
7408
7409 int BlueStore::umount()
7410 {
7411 ceph_assert(_kv_only || mounted);
7412 dout(1) << __func__ << dendl;
7413
7414 _osr_drain_all();
7415
7416 mounted = false;
7417 if (!_kv_only) {
7418 mempool_thread.shutdown();
7419 dout(20) << __func__ << " stopping kv thread" << dendl;
7420 _kv_stop();
7421 _shutdown_cache();
7422 dout(20) << __func__ << " closing" << dendl;
7423
7424 }
7425 _close_db_and_around(false);
7426 _close_bdev();
7427 _close_fsid();
7428 _close_path();
7429
7430 if (cct->_conf->bluestore_fsck_on_umount) {
7431 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7432 if (rc < 0)
7433 return rc;
7434 if (rc > 0) {
7435 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7436 return -EIO;
7437 }
7438 }
7439 return 0;
7440 }
7441
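// cold_open/cold_close: bring the store up just far enough (path, fsid lock,
// bdev and a read-only db/freelist/allocator) for maintenance helpers such as
// expand_devices() and dump_bluefs_sizes(), without starting the kv or
// mempool threads.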
7442 int BlueStore::cold_open()
7443 {
7444 int r = _open_path();
7445 if (r < 0)
7446 return r;
7447 r = _open_fsid(false);
7448 if (r < 0)
7449 goto out_path;
7450
7451 r = _read_fsid(&fsid);
7452 if (r < 0)
7453 goto out_fsid;
7454
7455 r = _lock_fsid();
7456 if (r < 0)
7457 goto out_fsid;
7458
7459 r = _open_bdev(false);
7460 if (r < 0)
7461 goto out_fsid;
7462 r = _open_db_and_around(true);
7463 if (r < 0) {
7464 goto out_bdev;
7465 }
7466 return 0;
7467 out_bdev:
7468 _close_bdev();
7469 out_fsid:
7470 _close_fsid();
7471 out_path:
7472 _close_path();
7473 return r;
7474 }
7475 int BlueStore::cold_close()
7476 {
7477 _close_db_and_around(true);
7478 _close_bdev();
7479 _close_fsid();
7480 _close_path();
7481 return 0;
7482 }
7483
7484 // derr wrapper to limit enormous output and avoid log flooding.
7485 // For now, intended only for places where such verbose output is expected.
7486 #define fsck_derr(err_cnt, threshold) \
7487 if (err_cnt <= threshold) { \
7488 bool need_skip_print = err_cnt == threshold; \
7489 derr
7490
7491 #define fsck_dendl \
7492 dendl; \
7493 if (need_skip_print) \
7494 derr << "more error lines skipped..." << dendl; \
7495 }
7496
7497 int _fsck_sum_extents(
7498 const PExtentVector& extents,
7499 bool compressed,
7500 store_statfs_t& expected_statfs)
7501 {
7502 for (auto e : extents) {
7503 if (!e.is_valid())
7504 continue;
7505 expected_statfs.allocated += e.length;
7506 if (compressed) {
7507 expected_statfs.data_compressed_allocated += e.length;
7508 }
7509 }
7510 return 0;
7511 }
7512
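// _fsck_check_extents: account every valid extent into expected_statfs; for
// non-shallow fsck depths also mark the extent's range in used_blocks,
// flagging ranges that were already marked (misreferences) and extents that
// extend past the end of the block device.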
7513 int BlueStore::_fsck_check_extents(
7514 const coll_t& cid,
7515 const ghobject_t& oid,
7516 const PExtentVector& extents,
7517 bool compressed,
7518 mempool_dynamic_bitset &used_blocks,
7519 uint64_t granularity,
7520 BlueStoreRepairer* repairer,
7521 store_statfs_t& expected_statfs,
7522 FSCKDepth depth)
7523 {
7524 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
7525 int errors = 0;
7526 for (auto e : extents) {
7527 if (!e.is_valid())
7528 continue;
7529 expected_statfs.allocated += e.length;
7530 if (compressed) {
7531 expected_statfs.data_compressed_allocated += e.length;
7532 }
7533 if (depth != FSCK_SHALLOW) {
7534 bool already = false;
7535 apply_for_bitset_range(
7536 e.offset, e.length, granularity, used_blocks,
7537 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
7538 if (bs.test(pos)) {
7539 if (repairer) {
7540 repairer->note_misreference(
7541 pos * min_alloc_size, min_alloc_size, !already);
7542 }
7543 if (!already) {
7544 derr << "fsck error: " << oid << " extent " << e
7545 << " or a subset is already allocated (misreferenced)" << dendl;
7546 ++errors;
7547 already = true;
7548 }
7549 }
7550 else
7551 bs.set(pos);
7552 });
7553 if (repairer) {
7554 repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid);
7555 }
7556
7557 if (e.end() > bdev->get_size()) {
7558 derr << "fsck error: " << oid << " extent " << e
7559 << " past end of block device" << dendl;
7560 ++errors;
7561 }
7562 }
7563 }
7564 return errors;
7565 }
7566
7567 void BlueStore::_fsck_check_pool_statfs(
7568 BlueStore::per_pool_statfs& expected_pool_statfs,
7569 int64_t& errors,
7570 int64_t& warnings,
7571 BlueStoreRepairer* repairer)
7572 {
7573 auto it = db->get_iterator(PREFIX_STAT);
7574 if (it) {
7575 for (it->lower_bound(string()); it->valid(); it->next()) {
7576 string key = it->key();
7577 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7578 if (repairer) {
7579 ++errors;
7580 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7581 derr << "fsck error: " << "legacy statfs record found, removing"
7582 << dendl;
7583 }
7584 continue;
7585 }
7586 uint64_t pool_id;
7587 if (get_key_pool_stat(key, &pool_id) < 0) {
7588 derr << "fsck error: bad key " << key
7589 << "in statfs namespece" << dendl;
7590 if (repairer) {
7591 repairer->remove_key(db, PREFIX_STAT, key);
7592 }
7593 ++errors;
7594 continue;
7595 }
7596
7597 volatile_statfs vstatfs;
7598 bufferlist bl = it->value();
7599 auto blp = bl.cbegin();
7600 try {
7601 vstatfs.decode(blp);
7602 } catch (buffer::error& e) {
7603 derr << "fsck error: failed to decode Pool StatFS record"
7604 << pretty_binary_string(key) << dendl;
7605 if (repairer) {
7606 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7607 << pretty_binary_string(key)
7608 << "', removing" << dendl;
7609 repairer->remove_key(db, PREFIX_STAT, key);
7610 }
7611 ++errors;
7612 vstatfs.reset();
7613 }
7614 auto stat_it = expected_pool_statfs.find(pool_id);
7615 if (stat_it == expected_pool_statfs.end()) {
7616 if (vstatfs.is_empty()) {
7617 // we don't consider that as an error since empty pool statfs
7618 // are left in DB for now
7619 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
7620 << std::hex << pool_id << std::dec << dendl;
7621 if (repairer) {
7622 // but we need to increment error count in case of repair
7623 // to have proper counters at the end
7624 // (as repairer increments recovery counter anyway).
7625 ++errors;
7626 }
7627 } else {
7628 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7629 << std::hex << pool_id << std::dec << dendl;
7630 ++errors;
7631 }
7632 if (repairer) {
7633 repairer->remove_key(db, PREFIX_SHARED_BLOB, key);
7634 }
7635 continue;
7636 }
7637 store_statfs_t statfs;
7638 vstatfs.publish(&statfs);
7639 if (!(stat_it->second == statfs)) {
7640 derr << "fsck error: actual " << statfs
7641 << " != expected " << stat_it->second
7642 << " for pool "
7643 << std::hex << pool_id << std::dec << dendl;
7644 if (repairer) {
7645 repairer->fix_statfs(db, key, stat_it->second);
7646 }
7647 ++errors;
7648 }
7649 expected_pool_statfs.erase(stat_it);
7650 }
7651 } // if (it)
7652 for (auto& s : expected_pool_statfs) {
7653 if (s.second.is_zero()) {
7654 // we might lack empty statfs recs in DB
7655 continue;
7656 }
7657 derr << "fsck error: missing Pool StatFS record for pool "
7658 << std::hex << s.first << std::dec << dendl;
7659 if (repairer) {
7660 string key;
7661 get_pool_stat_key(s.first, &key);
7662 repairer->fix_statfs(db, key, s.second);
7663 }
7664 ++errors;
7665 }
7666 if (!per_pool_stat_collection &&
7667 repairer) {
7668 // by virtue of running this method, we correct the top-level
7669 // error of having global stats
7670 repairer->inc_repaired();
7671 }
7672 }
7673
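// Per-object check shared by all fsck depths: decodes the onode from its
// key/value, verifies extent shards, logical extents (overlaps and shard
// spanning), per-blob reference tracking, compressed/shared blob accounting
// (including the sb_info bookkeeping consumed by the later shared-blob pass),
// detects zombie spanning blobs, and finally validates omap. Returns the
// decoded onode so non-shallow callers can run the deeper checks on it.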
7674 BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
7675 BlueStore::FSCKDepth depth,
7676 int64_t pool_id,
7677 BlueStore::CollectionRef c,
7678 const ghobject_t& oid,
7679 const string& key,
7680 const bufferlist& value,
7681 mempool::bluestore_fsck::list<string>* expecting_shards,
7682 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
7683 const BlueStore::FSCK_ObjectCtx& ctx)
7684 {
7685 auto& errors = ctx.errors;
7686 auto& num_objects = ctx.num_objects;
7687 auto& num_extents = ctx.num_extents;
7688 auto& num_blobs = ctx.num_blobs;
7689 auto& num_sharded_objects = ctx.num_sharded_objects;
7690 auto& num_spanning_blobs = ctx.num_spanning_blobs;
7691 auto used_blocks = ctx.used_blocks;
7692 auto sb_info_lock = ctx.sb_info_lock;
7693 auto& sb_info = ctx.sb_info;
7694 auto repairer = ctx.repairer;
7695
7696 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
7697 &ctx.expected_pool_statfs[pool_id] :
7698 &ctx.expected_store_statfs;
7699
7700 dout(10) << __func__ << " " << oid << dendl;
7701 OnodeRef o;
7702 o.reset(Onode::decode(c, oid, key, value));
7703 ++num_objects;
7704
7705 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7706
7707 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7708 _dump_onode<30>(cct, *o);
7709 // shards
7710 if (!o->extent_map.shards.empty()) {
7711 ++num_sharded_objects;
7712 if (depth != FSCK_SHALLOW) {
7713 ceph_assert(expecting_shards);
7714 for (auto& s : o->extent_map.shards) {
7715 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
7716 expecting_shards->push_back(string());
7717 get_extent_shard_key(o->key, s.shard_info->offset,
7718 &expecting_shards->back());
7719 if (s.shard_info->offset >= o->onode.size) {
7720 derr << "fsck error: " << oid << " shard 0x" << std::hex
7721 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7722 << std::dec << dendl;
7723 ++errors;
7724 }
7725 }
7726 }
7727 }
7728
7729 // lextents
7730 uint64_t pos = 0;
7731 mempool::bluestore_fsck::map<BlobRef,
7732 bluestore_blob_use_tracker_t> ref_map;
7733 for (auto& l : o->extent_map.extent_map) {
7734 dout(20) << __func__ << " " << l << dendl;
7735 if (l.logical_offset < pos) {
7736 derr << "fsck error: " << oid << " lextent at 0x"
7737 << std::hex << l.logical_offset
7738 << " overlaps with the previous, which ends at 0x" << pos
7739 << std::dec << dendl;
7740 ++errors;
7741 }
7742 if (depth != FSCK_SHALLOW &&
7743 o->extent_map.spans_shard(l.logical_offset, l.length)) {
7744 derr << "fsck error: " << oid << " lextent at 0x"
7745 << std::hex << l.logical_offset << "~" << l.length
7746 << " spans a shard boundary"
7747 << std::dec << dendl;
7748 ++errors;
7749 }
7750 pos = l.logical_offset + l.length;
7751 res_statfs->data_stored += l.length;
7752 ceph_assert(l.blob);
7753 const bluestore_blob_t& blob = l.blob->get_blob();
7754
7755 auto& ref = ref_map[l.blob];
7756 if (ref.is_empty()) {
7757 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7758 uint32_t l = blob.get_logical_length();
7759 ref.init(l, min_release_size);
7760 }
7761 ref.get(
7762 l.blob_offset,
7763 l.length);
7764 ++num_extents;
7765 if (depth != FSCK_SHALLOW &&
7766 blob.has_unused()) {
7767 ceph_assert(referenced);
7768 auto p = referenced->find(l.blob);
7769 bluestore_blob_t::unused_t* pu;
7770 if (p == referenced->end()) {
7771 pu = &(*referenced)[l.blob];
7772 }
7773 else {
7774 pu = &p->second;
7775 }
7776 uint64_t blob_len = blob.get_logical_length();
7777 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
7778 ceph_assert(l.blob_offset + l.length <= blob_len);
7779 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
7780 uint64_t start = l.blob_offset / chunk_size;
7781 uint64_t end =
7782 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7783 for (auto i = start; i < end; ++i) {
7784 (*pu) |= (1u << i);
7785 }
7786 }
7787 } //for (auto& l : o->extent_map.extent_map)
7788
7789 for (auto& i : ref_map) {
7790 ++num_blobs;
7791 const bluestore_blob_t& blob = i.first->get_blob();
7792 bool equal =
7793 depth == FSCK_SHALLOW ? true :
7794 i.first->get_blob_use_tracker().equal(i.second);
7795 if (!equal) {
7796 derr << "fsck error: " << oid << " blob " << *i.first
7797 << " doesn't match expected ref_map " << i.second << dendl;
7798 ++errors;
7799 }
7800 if (blob.is_compressed()) {
7801 res_statfs->data_compressed += blob.get_compressed_payload_length();
7802 res_statfs->data_compressed_original +=
7803 i.first->get_referenced_bytes();
7804 }
7805 if (blob.is_shared()) {
7806 if (i.first->shared_blob->get_sbid() > blobid_max) {
7807 derr << "fsck error: " << oid << " blob " << blob
7808 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7809 << blobid_max << dendl;
7810 ++errors;
7811 }
7812 else if (i.first->shared_blob->get_sbid() == 0) {
7813 derr << "fsck error: " << oid << " blob " << blob
7814 << " marked as shared but has uninitialized sbid"
7815 << dendl;
7816 ++errors;
7817 }
7818 // the below lock is optional and provided in multithreading mode only
7819 if (sb_info_lock) {
7820 sb_info_lock->lock();
7821 }
7822 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
7823 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7824 ceph_assert(sbi.pool_id == INT64_MIN ||
7825 sbi.pool_id == oid.hobj.get_logical_pool());
7826 sbi.cid = c->cid;
7827 sbi.pool_id = oid.hobj.get_logical_pool();
7828 sbi.sb = i.first->shared_blob;
7829 sbi.oids.push_back(oid);
7830 sbi.compressed = blob.is_compressed();
7831 for (auto e : blob.get_extents()) {
7832 if (e.is_valid()) {
7833 sbi.ref_map.get(e.offset, e.length);
7834 }
7835 }
7836 if (sb_info_lock) {
7837 sb_info_lock->unlock();
7838 }
7839 } else if (depth != FSCK_SHALLOW) {
7840 ceph_assert(used_blocks);
7841 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7842 blob.is_compressed(),
7843 *used_blocks,
7844 fm->get_alloc_size(),
7845 repairer,
7846 *res_statfs,
7847 depth);
7848 } else {
7849 errors += _fsck_sum_extents(
7850 blob.get_extents(),
7851 blob.is_compressed(),
7852 *res_statfs);
7853 }
7854 } // for (auto& i : ref_map)
7855
7856 {
7857 auto &sbm = o->extent_map.spanning_blob_map;
7858 size_t broken = 0;
7859 BlobRef first_broken;
7860 for (auto it = sbm.begin(); it != sbm.end();) {
7861 auto it1 = it++;
7862 if (ref_map.count(it1->second) == 0) {
7863 if (!broken) {
7864 first_broken = it1->second;
7865 ++errors;
7866 }
7867 broken++;
7868 if (repairer) {
7869 sbm.erase(it1);
7870 }
7871 }
7872 }
7873 if (broken) {
7874 derr << "fsck error: " << oid << " - " << broken
7875 << " zombie spanning blob(s) found, the first one: "
7876 << *first_broken << dendl;
7877 if(repairer) {
7878 auto txn = repairer->fix_spanning_blobs(db);
7879 _record_onode(o, txn);
7880 }
7881 }
7882 }
7883
7884 if (o->onode.has_omap()) {
7885 _fsck_check_object_omap(depth, o, ctx);
7886 }
7887
7888 return o;
7889 }
7890
7891 #include "common/WorkQueue.h"
7892
7893 class ShallowFSCKThreadPool : public ThreadPool
7894 {
7895 public:
7896 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
7897 ThreadPool(cct_, nm, tn, n) {
7898 }
7899 void worker(ThreadPool::WorkThread* wt) override {
7900 int next_wq = 0;
7901 while (!_stop) {
7902 next_wq %= work_queues.size();
7903 WorkQueue_ *wq = work_queues[next_wq++];
7904
7905 void* item = wq->_void_dequeue();
7906 if (item) {
7907 processing++;
7908 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
7909 wq->_void_process(item, tp_handle);
7910 processing--;
7911 }
7912 }
7913 }
7914 template <size_t BatchLen>
7915 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
7916 {
7917 struct Entry {
7918 int64_t pool_id;
7919 BlueStore::CollectionRef c;
7920 ghobject_t oid;
7921 string key;
7922 bufferlist value;
7923 };
7924 struct Batch {
7925 std::atomic<size_t> running = { 0 };
7926 size_t entry_count = 0;
7927 std::array<Entry, BatchLen> entries;
7928
7929 int64_t errors = 0;
7930 int64_t warnings = 0;
7931 uint64_t num_objects = 0;
7932 uint64_t num_extents = 0;
7933 uint64_t num_blobs = 0;
7934 uint64_t num_sharded_objects = 0;
7935 uint64_t num_spanning_blobs = 0;
7936 store_statfs_t expected_store_statfs;
7937 BlueStore::per_pool_statfs expected_pool_statfs;
7938 };
7939
7940 size_t batchCount;
7941 BlueStore* store = nullptr;
7942
7943 ceph::mutex* sb_info_lock = nullptr;
7944 BlueStore::sb_info_map_t* sb_info = nullptr;
7945 BlueStoreRepairer* repairer = nullptr;
7946
7947 Batch* batches = nullptr;
7948 size_t last_batch_pos = 0;
7949 bool batch_acquired = false;
7950
7951 FSCKWorkQueue(std::string n,
7952 size_t _batchCount,
7953 BlueStore* _store,
7954 ceph::mutex* _sb_info_lock,
7955 BlueStore::sb_info_map_t& _sb_info,
7956 BlueStoreRepairer* _repairer) :
7957 WorkQueue_(n, time_t(), time_t()),
7958 batchCount(_batchCount),
7959 store(_store),
7960 sb_info_lock(_sb_info_lock),
7961 sb_info(&_sb_info),
7962 repairer(_repairer)
7963 {
7964 batches = new Batch[batchCount];
7965 }
7966 ~FSCKWorkQueue() {
7967 delete[] batches;
7968 }
7969
7970 /// Remove all work items from the queue.
7971 void _clear() override {
7972 //do nothing
7973 }
7974 /// Check whether there is anything to do.
7975 bool _empty() override {
7976 ceph_assert(false);
7977 }
7978
7979 /// Get the next work item to process.
7980 void* _void_dequeue() override {
7981 size_t pos = rand() % batchCount;
7982 size_t pos0 = pos;
7983 do {
7984 auto& batch = batches[pos];
7985 if (batch.running.fetch_add(1) == 0) {
7986 if (batch.entry_count) {
7987 return &batch;
7988 }
7989 }
7990 batch.running--;
7991 pos++;
7992 pos %= batchCount;
7993 } while (pos != pos0);
7994 return nullptr;
7995 }
7996 /** @brief Process the work item.
7997 * This function will be called several times in parallel
7998 * and must therefore be thread-safe. */
7999 void _void_process(void* item, TPHandle& handle) override {
8000 Batch* batch = (Batch*)item;
8001
8002 BlueStore::FSCK_ObjectCtx ctx(
8003 batch->errors,
8004 batch->warnings,
8005 batch->num_objects,
8006 batch->num_extents,
8007 batch->num_blobs,
8008 batch->num_sharded_objects,
8009 batch->num_spanning_blobs,
8010 nullptr, // used_blocks
8011 nullptr, //used_omap_head
8012 sb_info_lock,
8013 *sb_info,
8014 batch->expected_store_statfs,
8015 batch->expected_pool_statfs,
8016 repairer);
8017
8018 for (size_t i = 0; i < batch->entry_count; i++) {
8019 auto& entry = batch->entries[i];
8020
8021 store->fsck_check_objects_shallow(
8022 BlueStore::FSCK_SHALLOW,
8023 entry.pool_id,
8024 entry.c,
8025 entry.oid,
8026 entry.key,
8027 entry.value,
8028 nullptr, // expecting_shards - this will need a protection if passed
8029 nullptr, // referenced
8030 ctx);
8031 }
8032 //std::cout << "processed " << batch << std::endl;
8033 batch->entry_count = 0;
8034 batch->running--;
8035 }
8036 /** @brief Synchronously finish processing a work item.
8037 * This function is called after _void_process with the global thread pool lock held,
8038 * so at most one copy will execute simultaneously for a given thread pool.
8039 * It can be used for non-thread-safe finalization. */
8040 void _void_process_finish(void*) override {
8041 ceph_assert(false);
8042 }
8043
8044 bool queue(
8045 int64_t pool_id,
8046 BlueStore::CollectionRef c,
8047 const ghobject_t& oid,
8048 const string& key,
8049 const bufferlist& value) {
8050 bool res = false;
8051 size_t pos0 = last_batch_pos;
8052 if (!batch_acquired) {
8053 do {
8054 auto& batch = batches[last_batch_pos];
8055 if (batch.running.fetch_add(1) == 0) {
8056 if (batch.entry_count < BatchLen) {
8057 batch_acquired = true;
8058 break;
8059 }
8060 }
8061 batch.running.fetch_sub(1);
8062 last_batch_pos++;
8063 last_batch_pos %= batchCount;
8064 } while (last_batch_pos != pos0);
8065 }
8066 if (batch_acquired) {
8067 auto& batch = batches[last_batch_pos];
8068 ceph_assert(batch.running);
8069 ceph_assert(batch.entry_count < BatchLen);
8070
8071 auto& entry = batch.entries[batch.entry_count];
8072 entry.pool_id = pool_id;
8073 entry.c = c;
8074 entry.oid = oid;
8075 entry.key = key;
8076 entry.value = value;
8077
8078 ++batch.entry_count;
8079 if (batch.entry_count == BatchLen) {
8080 batch_acquired = false;
8081 batch.running.fetch_sub(1);
8082 last_batch_pos++;
8083 last_batch_pos %= batchCount;
8084 }
8085 res = true;
8086 }
8087 return res;
8088 }
8089
8090 void finalize(ThreadPool& tp,
8091 BlueStore::FSCK_ObjectCtx& ctx) {
8092 if (batch_acquired) {
8093 auto& batch = batches[last_batch_pos];
8094 ceph_assert(batch.running);
8095 batch.running.fetch_sub(1);
8096 }
8097 tp.stop();
8098
8099 for (size_t i = 0; i < batchCount; i++) {
8100 auto& batch = batches[i];
8101
8102 //process leftovers if any
8103 if (batch.entry_count) {
8104 TPHandle tp_handle(store->cct,
8105 nullptr,
8106 timeout_interval,
8107 suicide_interval);
8108 ceph_assert(batch.running == 0);
8109
8110 batch.running++; // just to be on-par with the regular call
8111 _void_process(&batch, tp_handle);
8112 }
8113 ceph_assert(batch.entry_count == 0);
8114
8115 ctx.errors += batch.errors;
8116 ctx.warnings += batch.warnings;
8117 ctx.num_objects += batch.num_objects;
8118 ctx.num_extents += batch.num_extents;
8119 ctx.num_blobs += batch.num_blobs;
8120 ctx.num_sharded_objects += batch.num_sharded_objects;
8121 ctx.num_spanning_blobs += batch.num_spanning_blobs;
8122
8123 ctx.expected_store_statfs.add(batch.expected_store_statfs);
8124
8125 for (auto it = batch.expected_pool_statfs.begin();
8126 it != batch.expected_pool_statfs.end();
8127 it++) {
8128 ctx.expected_pool_statfs[it->first].add(it->second);
8129 }
8130 }
8131 }
8132 };
8133 };
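// Illustrative wiring sketch (mirrors the actual use in _fsck_check_objects()
// below; shown here only to summarize the intended call sequence):
//
//   typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
//   std::unique_ptr<WQ> wq(new WQ("FSCKWorkQueue", (thread_count ? : 1) * 32,
//                                 this, sb_info_lock, sb_info, repairer));
//   ShallowFSCKThreadPool tp(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
//   tp.add_work_queue(wq.get());
//   tp.start();
//   // ... wq->queue(pool_id, c, oid, key, value) per onode; the caller
//   // processes an onode itself whenever every batch is busy ...
//   wq->finalize(tp, ctx);  // drain leftovers and fold per-batch stats into ctx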
8134
8135 void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
8136 OnodeRef& o,
8137 const BlueStore::FSCK_ObjectCtx& ctx)
8138 {
8139 auto& errors = ctx.errors;
8140 auto& warnings = ctx.warnings;
8141 auto repairer = ctx.repairer;
8142
8143 ceph_assert(o->onode.has_omap());
8144 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
8145 if (per_pool_omap) {
8146 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8147 << "fsck error: " << o->oid
8148 << " has omap that is not per-pool or pgmeta"
8149 << fsck_dendl;
8150 ++errors;
8151 } else {
8152 const char* w;
8153 int64_t num;
8154 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8155 ++errors;
8156 num = errors;
8157 w = "error";
8158 } else {
8159 ++warnings;
8160 num = warnings;
8161 w = "warning";
8162 }
8163 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8164 << "fsck " << w << ": " << o->oid
8165 << " has omap that is not per-pool or pgmeta"
8166 << fsck_dendl;
8167 }
8168 }
8169 if (repairer &&
8170 !o->onode.is_perpool_omap() &&
8171 !o->onode.is_pgmeta_omap()) {
8172 dout(10) << "fsck converting " << o->oid << " omap to per-pool" << dendl;
8173 bufferlist h;
8174 map<string, bufferlist> kv;
8175 int r = _onode_omap_get(o, &h, &kv);
8176 if (r < 0) {
8177 derr << " got " << r << " " << cpp_strerror(r) << dendl;
8178 } else {
8179 KeyValueDB::Transaction txn = db->get_transaction();
8180 // remove old keys
8181 const string& old_omap_prefix = o->get_omap_prefix();
8182 string old_head, old_tail;
8183 o->get_omap_header(&old_head);
8184 o->get_omap_tail(&old_tail);
8185 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
8186 txn->rmkey(old_omap_prefix, old_tail);
8187 // set flag
8188 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
8189 _record_onode(o, txn);
8190 const string& new_omap_prefix = o->get_omap_prefix();
8191 // head
8192 if (h.length()) {
8193 string new_head;
8194 o->get_omap_header(&new_head);
8195 txn->set(new_omap_prefix, new_head, h);
8196 }
8197 // tail
8198 string new_tail;
8199 o->get_omap_tail(&new_tail);
8200 bufferlist empty;
8201 txn->set(new_omap_prefix, new_tail, empty);
8202 // values
8203 string final_key;
8204 o->get_omap_key(string(), &final_key);
8205 size_t base_key_len = final_key.size();
8206 for (auto& i : kv) {
8207 final_key.resize(base_key_len);
8208 final_key += i.first;
8209 txn->set(new_omap_prefix, final_key, i.second);
8210 }
8211 db->submit_transaction_sync(txn);
8212 repairer->inc_repaired();
8213 }
8214 }
8215 }
8216
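// Walks the object namespace (PREFIX_OBJ): matches extent-shard keys against
// the shards each onode declared, resolves every object to its collection,
// and runs fsck_check_objects_shallow() on it. In shallow mode with worker
// threads configured the objects are batched onto the FSCKWorkQueue above;
// otherwise (and for whatever could not be queued) they are processed inline,
// where the non-shallow depths additionally validate nids, unused/csum
// consistency and omap heads and, for FSCK_DEEP, read back all object data.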
8217 void BlueStore::_fsck_check_objects(FSCKDepth depth,
8218 BlueStore::FSCK_ObjectCtx& ctx)
8219 {
8220 auto& errors = ctx.errors;
8221 auto sb_info_lock = ctx.sb_info_lock;
8222 auto& sb_info = ctx.sb_info;
8223 auto repairer = ctx.repairer;
8224
8225 uint64_t_btree_t used_nids;
8226
8227 size_t processed_myself = 0;
8228
8229 auto it = db->get_iterator(PREFIX_OBJ);
8230 mempool::bluestore_fsck::list<string> expecting_shards;
8231 if (it) {
8232 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
8233 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
8234 std::unique_ptr<WQ> wq(
8235 new WQ(
8236 "FSCKWorkQueue",
8237 (thread_count ? : 1) * 32,
8238 this,
8239 sb_info_lock,
8240 sb_info,
8241 repairer));
8242
8243 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
8244
8245 thread_pool.add_work_queue(wq.get());
8246 if (depth == FSCK_SHALLOW && thread_count > 0) {
8247 //not the best place but let's check anyway
8248 ceph_assert(sb_info_lock);
8249 thread_pool.start();
8250 }
8251
8252 // fill global if not overridden below
8253 CollectionRef c;
8254 int64_t pool_id = -1;
8255 spg_t pgid;
8256 for (it->lower_bound(string()); it->valid(); it->next()) {
8257 dout(30) << __func__ << " key "
8258 << pretty_binary_string(it->key()) << dendl;
8259 if (is_extent_shard_key(it->key())) {
8260 if (depth == FSCK_SHALLOW) {
8261 continue;
8262 }
8263 while (!expecting_shards.empty() &&
8264 expecting_shards.front() < it->key()) {
8265 derr << "fsck error: missing shard key "
8266 << pretty_binary_string(expecting_shards.front())
8267 << dendl;
8268 ++errors;
8269 expecting_shards.pop_front();
8270 }
8271 if (!expecting_shards.empty() &&
8272 expecting_shards.front() == it->key()) {
8273 // all good
8274 expecting_shards.pop_front();
8275 continue;
8276 }
8277
8278 uint32_t offset;
8279 string okey;
8280 get_key_extent_shard(it->key(), &okey, &offset);
8281 derr << "fsck error: stray shard 0x" << std::hex << offset
8282 << std::dec << dendl;
8283 if (expecting_shards.empty()) {
8284 derr << "fsck error: " << pretty_binary_string(it->key())
8285 << " is unexpected" << dendl;
8286 ++errors;
8287 continue;
8288 }
8289 while (expecting_shards.front() > it->key()) {
8290 derr << "fsck error: saw " << pretty_binary_string(it->key())
8291 << dendl;
8292 derr << "fsck error: exp "
8293 << pretty_binary_string(expecting_shards.front()) << dendl;
8294 ++errors;
8295 expecting_shards.pop_front();
8296 if (expecting_shards.empty()) {
8297 break;
8298 }
8299 }
8300 continue;
8301 }
8302
8303 ghobject_t oid;
8304 int r = get_key_object(it->key(), &oid);
8305 if (r < 0) {
8306 derr << "fsck error: bad object key "
8307 << pretty_binary_string(it->key()) << dendl;
8308 ++errors;
8309 continue;
8310 }
8311 if (!c ||
8312 oid.shard_id != pgid.shard ||
8313 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8314 !c->contains(oid)) {
8315 c = nullptr;
8316 for (auto& p : coll_map) {
8317 if (p.second->contains(oid)) {
8318 c = p.second;
8319 break;
8320 }
8321 }
8322 if (!c) {
8323 derr << "fsck error: stray object " << oid
8324 << " not owned by any collection" << dendl;
8325 ++errors;
8326 continue;
8327 }
8328 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8329 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8330 << dendl;
8331 }
8332
8333 if (depth != FSCK_SHALLOW &&
8334 !expecting_shards.empty()) {
8335 for (auto& k : expecting_shards) {
8336 derr << "fsck error: missing shard key "
8337 << pretty_binary_string(k) << dendl;
8338 }
8339 ++errors;
8340 expecting_shards.clear();
8341 }
8342
8343 bool queued = false;
8344 if (depth == FSCK_SHALLOW && thread_count > 0) {
8345 queued = wq->queue(
8346 pool_id,
8347 c,
8348 oid,
8349 it->key(),
8350 it->value());
8351 }
8352 OnodeRef o;
8353 map<BlobRef, bluestore_blob_t::unused_t> referenced;
8354
8355 if (!queued) {
8356 ++processed_myself;
8357
8358 o = fsck_check_objects_shallow(
8359 depth,
8360 pool_id,
8361 c,
8362 oid,
8363 it->key(),
8364 it->value(),
8365 &expecting_shards,
8366 &referenced,
8367 ctx);
8368 }
8369
8370 if (depth != FSCK_SHALLOW) {
8371 ceph_assert(o != nullptr);
8372 if (o->onode.nid) {
8373 if (o->onode.nid > nid_max) {
8374 derr << "fsck error: " << oid << " nid " << o->onode.nid
8375 << " > nid_max " << nid_max << dendl;
8376 ++errors;
8377 }
8378 if (used_nids.count(o->onode.nid)) {
8379 derr << "fsck error: " << oid << " nid " << o->onode.nid
8380 << " already in use" << dendl;
8381 ++errors;
8382 continue; // go for next object
8383 }
8384 used_nids.insert(o->onode.nid);
8385 }
8386 for (auto& i : referenced) {
8387 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8388 << std::dec << " for " << *i.first << dendl;
8389 const bluestore_blob_t& blob = i.first->get_blob();
8390 if (i.second & blob.unused) {
8391 derr << "fsck error: " << oid << " blob claims unused 0x"
8392 << std::hex << blob.unused
8393 << " but extents reference 0x" << i.second << std::dec
8394 << " on blob " << *i.first << dendl;
8395 ++errors;
8396 }
8397 if (blob.has_csum()) {
8398 uint64_t blob_len = blob.get_logical_length();
8399 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8400 unsigned csum_count = blob.get_csum_count();
8401 unsigned csum_chunk_size = blob.get_csum_chunk_size();
8402 for (unsigned p = 0; p < csum_count; ++p) {
8403 unsigned pos = p * csum_chunk_size;
8404 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8405 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8406 unsigned mask = 1u << firstbit;
8407 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8408 mask |= 1u << b;
8409 }
8410 if ((blob.unused & mask) == mask) {
8411 // this csum chunk region is marked unused
8412 if (blob.get_csum_item(p) != 0) {
8413 derr << "fsck error: " << oid
8414 << " blob claims csum chunk 0x" << std::hex << pos
8415 << "~" << csum_chunk_size
8416 << " is unused (mask 0x" << mask << " of unused 0x"
8417 << blob.unused << ") but csum is non-zero 0x"
8418 << blob.get_csum_item(p) << std::dec << " on blob "
8419 << *i.first << dendl;
8420 ++errors;
8421 }
8422 }
8423 }
8424 }
8425 }
8426 // omap
8427 if (o->onode.has_omap()) {
8428 ceph_assert(ctx.used_omap_head);
8429 if (ctx.used_omap_head->count(o->onode.nid)) {
8430 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8431 << " already in use" << dendl;
8432 ++errors;
8433 } else {
8434 ctx.used_omap_head->insert(o->onode.nid);
8435 }
8436 } // if (o->onode.has_omap())
8437 if (depth == FSCK_DEEP) {
8438 bufferlist bl;
8439 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8440 uint64_t offset = 0;
8441 do {
8442 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8443 int r = _do_read(c.get(), o, offset, l, bl,
8444 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8445 if (r < 0) {
8446 ++errors;
8447 derr << "fsck error: " << oid << std::hex
8448 << " error during read: "
8449 << " " << offset << "~" << l
8450 << " " << cpp_strerror(r) << std::dec
8451 << dendl;
8452 break;
8453 }
8454 offset += l;
8455 } while (offset < o->onode.size);
8456 } // deep
8457 } //if (depth != FSCK_SHALLOW)
8458 } // for (it->lower_bound(string()); it->valid(); it->next())
8459 if (depth == FSCK_SHALLOW && thread_count > 0) {
8460 wq->finalize(thread_pool, ctx);
8461 if (processed_myself) {
8462 // maybe it needs more threads?
8463 dout(0) << __func__ << " partial offload"
8464 << ", done myself " << processed_myself
8465 << " of " << ctx.num_objects
8466 << "objects, threads " << thread_count
8467 << dendl;
8468 }
8469 }
8470 } // if (it)
8471 }
8472 /**
8473 An overview for currently implemented repair logics
8474 performed in fsck in two stages: detection(+preparation) and commit.
8475 Detection stage (in processing order):
8476 (Issue -> Repair action to schedule)
8477 - Detect undecodable keys for Shared Blobs -> Remove
8478 - Detect undecodable records for Shared Blobs -> Remove
8479 (might trigger missed Shared Blob detection below)
8480 - Detect stray records for Shared Blobs -> Remove
8481 - Detect misreferenced pextents -> Fix
8482 Prepare Bloom-like filter to track cid/oid -> pextent
8483 Prepare list of extents that are improperly referenced
8484 Enumerate Onode records that might use 'misreferenced' pextents
8485 (Bloom-like filter applied to reduce computation)
8486 For each questionable Onode enumerate all blobs and identify broken ones
8487 (i.e. blobs having 'misreferences')
8488 Rewrite each broken blob data by allocating another extents and
8489 copying data there
8490 If blob is shared - unshare it and mark corresponding Shared Blob
8491 for removal
8492 Release previously allocated space
8493 Update Extent Map
8494 - Detect missed Shared Blobs -> Recreate
8495 - Detect undecodable deferred transaction -> Remove
8496 - Detect Freelist Manager's 'false free' entries -> Mark as used
8497 - Detect Freelist Manager's leaked entries -> Mark as free
8498 - Detect statfs inconsistency -> Update
8499 Commit stage (separate DB commit per each step):
8500 - Apply leaked FM entries fix
8501 - Apply 'false free' FM entries fix
8502 - Apply 'Remove' actions
8503 - Apply fix for misreference pextents
8504 - Apply Shared Blob recreate
8505 (can be merged with the step above if misreferences were detected)
8506 - Apply StatFS update
8507 */
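/*
  Illustrative sketch (assumption: these one-line wrappers live in BlueStore.h
  and are not part of this file) of how the public entry points are expected
  to reach the code below:

    int fsck(bool deep)   { return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false); }
    int repair(bool deep) { return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true); }
    int quick_fix()       { return _fsck(FSCK_SHALLOW, true); }

  _fsck() itself only brings up the minimal infrastructure (path, fsid, bdev,
  db, collections, and the kv threads when deferred replay is needed) and then
  delegates the real work to _fsck_on_open().
*/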
8508 int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
8509 {
8510 dout(1) << __func__
8511 << (repair ? " repair" : " check")
8512 << (depth == FSCK_DEEP ? " (deep)" :
8513 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8514 << dendl;
8515
8516 // in deep mode we need R/W write access to be able to replay deferred ops
8517 bool read_only = !(repair || depth == FSCK_DEEP);
8518
8519 int r = _open_path();
8520 if (r < 0)
8521 return r;
8522 r = _open_fsid(false);
8523 if (r < 0)
8524 goto out_path;
8525
8526 r = _read_fsid(&fsid);
8527 if (r < 0)
8528 goto out_fsid;
8529
8530 r = _lock_fsid();
8531 if (r < 0)
8532 goto out_fsid;
8533
8534 r = _open_bdev(false);
8535 if (r < 0)
8536 goto out_fsid;
8537
8538 r = _open_db_and_around(read_only);
8539 if (r < 0)
8540 goto out_bdev;
8541
8542 if (!read_only) {
8543 r = _upgrade_super();
8544 if (r < 0) {
8545 goto out_db;
8546 }
8547 }
8548
8549 r = _open_collections();
8550 if (r < 0)
8551 goto out_db;
8552
8553 mempool_thread.init();
8554
8555 // we need finisher and kv_{sync,finalize}_thread *just* for replay
8556 // enable in repair or deep modes only
8557 if (!read_only) {
8558 _kv_start();
8559 r = _deferred_replay();
8560 _kv_stop();
8561 }
8562 if (r < 0)
8563 goto out_scan;
8564
8565 r = _fsck_on_open(depth, repair);
8566
8567 out_scan:
8568 mempool_thread.shutdown();
8569 _shutdown_cache();
8570 out_db:
8571 _close_db_and_around(false);
8572 out_bdev:
8573 _close_bdev();
8574 out_fsid:
8575 _close_fsid();
8576 out_path:
8577 _close_path();
8578
8579 return r;
8580 }
8581
8582 int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
8583 {
8584 dout(1) << __func__
8585 << " <<<START>>>"
8586 << (repair ? " repair" : " check")
8587 << (depth == FSCK_DEEP ? " (deep)" :
8588 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8589 << " start" << dendl;
8590 int64_t errors = 0;
8591 int64_t warnings = 0;
8592 unsigned repaired = 0;
8593
8594 uint64_t_btree_t used_omap_head;
8595 uint64_t_btree_t used_sbids;
8596
8597 mempool_dynamic_bitset used_blocks;
8598 KeyValueDB::Iterator it;
8599 store_statfs_t expected_store_statfs, actual_statfs;
8600 per_pool_statfs expected_pool_statfs;
8601
8602 sb_info_map_t sb_info;
8603
8604 uint64_t num_objects = 0;
8605 uint64_t num_extents = 0;
8606 uint64_t num_blobs = 0;
8607 uint64_t num_spanning_blobs = 0;
8608 uint64_t num_shared_blobs = 0;
8609 uint64_t num_sharded_objects = 0;
8610 BlueStoreRepairer repairer;
8611
8612 utime_t start = ceph_clock_now();
8613
8614 _fsck_collections(&errors);
8615 used_blocks.resize(fm->get_alloc_units());
8616 apply_for_bitset_range(
8617 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
8618 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8619 bs.set(pos);
8620 }
8621 );
8622 if (repair) {
8623 repairer.get_space_usage_tracker().init(
8624 bdev->get_size(),
8625 min_alloc_size);
8626 }
8627
8628 if (bluefs) {
8629 if( cct->_conf->bluestore_bluefs_db_compatibility) {
8630 interval_set<uint64_t> bluefs_extents_db;
8631 bufferlist bl;
8632 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
8633 auto p = bl.cbegin();
8634 auto prev_errors = errors;
8635 try {
8636 decode(bluefs_extents_db, p);
8637 bluefs_extents_db.union_of(bluefs_extents);
8638 bluefs_extents_db.subtract(bluefs_extents);
8639 if (!bluefs_extents_db.empty()) {
8640 derr << "fsck error: bluefs_extents inconsistency, "
8641 << "downgrade to previous releases might be broken."
8642 << dendl;
8643 ++errors;
8644 }
8645 }
8646 catch (buffer::error& e) {
8647 derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
8648 ++errors;
8649 }
8650 if (errors != prev_errors && repair) {
8651 repairer.fix_bluefs_extents(out_of_sync_fm);
8652 }
8653 }
8654
8655 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
8656 apply_for_bitset_range(
8657 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
8658 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8659 bs.set(pos);
8660 });
8661 }
8662 int r = bluefs->fsck();
8663 if (r < 0) {
8664 return r;
8665 }
8666 if (r > 0)
8667 errors += r;
8668 }
8669
8670 if (!per_pool_stat_collection) {
8671 const char *w;
8672 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
8673 w = "error";
8674 ++errors;
8675 } else {
8676 w = "warning";
8677 ++warnings;
8678 }
8679 derr << "fsck " << w << ": store not yet converted to per-pool stats"
8680 << dendl;
8681 }
8682 if (!per_pool_omap) {
8683 const char *w;
8684 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8685 w = "error";
8686 ++errors;
8687 } else {
8688 w = "warning";
8689 ++warnings;
8690 }
8691 derr << "fsck " << w << ": store not yet converted to per-pool omap"
8692 << dendl;
8693 }
8694
8695 // get expected statfs; reset unaffected fields to be able to compare
8696 // structs
8697 statfs(&actual_statfs);
8698 actual_statfs.total = 0;
8699 actual_statfs.internally_reserved = 0;
8700 actual_statfs.available = 0;
8701 actual_statfs.internal_metadata = 0;
8702 actual_statfs.omap_allocated = 0;
8703
8704 if (g_conf()->bluestore_debug_fsck_abort) {
8705 dout(1) << __func__ << " debug abort" << dendl;
8706 goto out_scan;
8707 }
8708 // walk PREFIX_OBJ
8709 {
8710 dout(1) << __func__ << " walking object keyspace" << dendl;
8711 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8712 BlueStore::FSCK_ObjectCtx ctx(
8713 errors,
8714 warnings,
8715 num_objects,
8716 num_extents,
8717 num_blobs,
8718 num_sharded_objects,
8719 num_spanning_blobs,
8720 &used_blocks,
8721 &used_omap_head,
8722 //no need for the below lock when in non-shallow mode as
8723 // there is no multithreading in this case
8724 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
8725 sb_info,
8726 expected_store_statfs,
8727 expected_pool_statfs,
8728 repair ? &repairer : nullptr);
8729
8730 _fsck_check_objects(depth, ctx);
8731 }
8732
8733 dout(1) << __func__ << " checking shared_blobs" << dendl;
8734 it = db->get_iterator(PREFIX_SHARED_BLOB);
8735 if (it) {
8736 // FIXME minor: perhaps simplify for shallow mode?
8737 // fill global if not overridden below
8738 auto expected_statfs = &expected_store_statfs;
8739
8740 for (it->lower_bound(string()); it->valid(); it->next()) {
8741 string key = it->key();
8742 uint64_t sbid;
8743 if (get_key_shared_blob(key, &sbid)) {
8744 derr << "fsck error: bad key '" << key
8745 << "' in shared blob namespace" << dendl;
8746 if (repair) {
8747 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8748 }
8749 ++errors;
8750 continue;
8751 }
8752 auto p = sb_info.find(sbid);
8753 if (p == sb_info.end()) {
8754 derr << "fsck error: found stray shared blob data for sbid 0x"
8755 << std::hex << sbid << std::dec << dendl;
8756 if (repair) {
8757 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8758 }
8759 ++errors;
8760 } else {
8761 ++num_shared_blobs;
8762 sb_info_t& sbi = p->second;
8763 bluestore_shared_blob_t shared_blob(sbid);
8764 bufferlist bl = it->value();
8765 auto blp = bl.cbegin();
8766 try {
8767 decode(shared_blob, blp);
8768 } catch (buffer::error& e) {
8769 ++errors;
8770 // Force update and don't report as missing
8771 sbi.updated = sbi.passed = true;
8772
8773 derr << "fsck error: failed to decode Shared Blob"
8774 << pretty_binary_string(it->key()) << dendl;
8775 if (repair) {
8776 dout(20) << __func__ << " undecodable Shared Blob, key:'"
8777 << pretty_binary_string(it->key())
8778 << "', removing" << dendl;
8779 repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
8780 }
8781 continue;
8782 }
8783 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
8784 if (shared_blob.ref_map != sbi.ref_map) {
8785 derr << "fsck error: shared blob 0x" << std::hex << sbid
8786 << std::dec << " ref_map " << shared_blob.ref_map
8787 << " != expected " << sbi.ref_map << dendl;
8788 sbi.updated = true; // will update later in repair mode only!
8789 ++errors;
8790 }
8791 PExtentVector extents;
8792 for (auto &r : shared_blob.ref_map.ref_map) {
8793 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
8794 }
8795 if (per_pool_stat_collection || repair) {
8796 expected_statfs = &expected_pool_statfs[sbi.pool_id];
8797 }
8798 errors += _fsck_check_extents(sbi.cid,
8799 p->second.oids.front(),
8800 extents,
8801 p->second.compressed,
8802 used_blocks,
8803 fm->get_alloc_size(),
8804 repair ? &repairer : nullptr,
8805 *expected_statfs,
8806 depth);
8807 sbi.passed = true;
8808 }
8809 }
8810 } // if (it)
8811
8812 if (repair && repairer.preprocess_misreference(db)) {
8813
8814 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
8815 auto& space_tracker = repairer.get_space_usage_tracker();
8816 auto& misref_extents = repairer.get_misreferences();
8817 interval_set<uint64_t> to_release;
8818 it = db->get_iterator(PREFIX_OBJ);
8819 if (it) {
8820 // fill global if not overridden below
8821 auto expected_statfs = &expected_store_statfs;
8822
8823 CollectionRef c;
8824 spg_t pgid;
8825 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
8826 bool bypass_rest = false;
8827 for (it->lower_bound(string()); it->valid() && !bypass_rest;
8828 it->next()) {
8829 dout(30) << __func__ << " key "
8830 << pretty_binary_string(it->key()) << dendl;
8831 if (is_extent_shard_key(it->key())) {
8832 continue;
8833 }
8834
8835 ghobject_t oid;
8836 int r = get_key_object(it->key(), &oid);
8837 if (r < 0 || !space_tracker.is_used(oid)) {
8838 continue;
8839 }
8840
8841 if (!c ||
8842 oid.shard_id != pgid.shard ||
8843 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8844 !c->contains(oid)) {
8845 c = nullptr;
8846 for (auto& p : coll_map) {
8847 if (p.second->contains(oid)) {
8848 c = p.second;
8849 break;
8850 }
8851 }
8852 if (!c) {
8853 continue;
8854 }
8855 if (per_pool_stat_collection || repair) {
8856 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8857 expected_statfs = &expected_pool_statfs[pool_id];
8858 }
8859 }
8860 if (!space_tracker.is_used(c->cid)) {
8861 continue;
8862 }
8863
8864 dout(20) << __func__ << " check misreference for col:" << c->cid
8865 << " obj:" << oid << dendl;
8866
8867 OnodeRef o;
8868 o.reset(Onode::decode(c, oid, it->key(), it->value()));
8869 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8870 mempool::bluestore_fsck::set<BlobRef> blobs;
8871
8872 for (auto& e : o->extent_map.extent_map) {
8873 blobs.insert(e.blob);
8874 }
8875 bool need_onode_update = false;
8876 bool first_dump = true;
8877 for(auto b : blobs) {
8878 bool broken_blob = false;
8879 auto& pextents = b->dirty_blob().dirty_extents();
8880 for (auto& e : pextents) {
8881 if (!e.is_valid()) {
8882 continue;
8883 }
8884 // for the sake of simplicity and proper shared blob handling
8885 // always rewrite the whole blob even when it's partially
8886 // misreferenced.
8887 if (misref_extents.intersects(e.offset, e.length)) {
8888 if (first_dump) {
8889 first_dump = false;
8890 _dump_onode<10>(cct, *o);
8891 }
8892 broken_blob = true;
8893 break;
8894 }
8895 }
8896 if (!broken_blob)
8897 continue;
8898 bool compressed = b->get_blob().is_compressed();
8899 need_onode_update = true;
8900 dout(10) << __func__
8901 << " fix misreferences in oid:" << oid
8902 << " " << *b << dendl;
8903 uint64_t b_off = 0;
8904 PExtentVector pext_to_release;
8905 pext_to_release.reserve(pextents.size());
8906 // rewriting all valid pextents
8907 for (auto e = pextents.begin(); e != pextents.end();
8908 b_off += e->length, e++) {
8909 if (!e->is_valid()) {
8910 continue;
8911 }
8912 PExtentVector exts;
8913 int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
8914 0, 0, &exts);
8915 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
8916 derr << __func__
8917 << " failed to allocate 0x" << std::hex << e->length
8918 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
8919 << " min_alloc_size 0x" << min_alloc_size
8920 << " available 0x " << alloc->get_free()
8921 << std::dec << dendl;
8922 if (alloc_len > 0) {
8923 alloc->release(exts);
8924 }
8925 bypass_rest = true;
8926 break;
8927 }
8928 expected_statfs->allocated += e->length;
8929 if (compressed) {
8930 expected_statfs->data_compressed_allocated += e->length;
8931 }
8932
8933 bufferlist bl;
8934 IOContext ioc(cct, NULL, true); // allow EIO
8935 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
8936 if (r < 0) {
8937 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
8938 <<"~" << e->length << std::dec << dendl;
8939 ceph_abort_msg("read failed, wtf");
8940 }
8941 pext_to_release.push_back(*e);
8942 e = pextents.erase(e);
8943 e = pextents.insert(e, exts.begin(), exts.end());
8944 b->get_blob().map_bl(
8945 b_off, bl,
8946 [&](uint64_t offset, bufferlist& t) {
8947 int r = bdev->write(offset, t, false);
8948 ceph_assert(r == 0);
8949 });
8950 e += exts.size() - 1;
8951 for (auto& p : exts) {
8952 fm->allocate(p.offset, p.length, txn);
8953 }
8954 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8955
8956 if (b->get_blob().is_shared()) {
8957 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
8958
8959 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
8960 ceph_assert(sb_it != sb_info.end());
8961 sb_info_t& sbi = sb_it->second;
8962
8963 for (auto& r : sbi.ref_map.ref_map) {
8964 expected_statfs->allocated -= r.second.length;
8965 if (sbi.compressed) {
8966 // NB: it's crucial to use compressed flag from sb_info_t
8967 // as we originally used that value while accumulating
8968 // expected_statfs
8969 expected_statfs->data_compressed_allocated -= r.second.length;
8970 }
8971 }
8972 sbi.updated = sbi.passed = true;
8973 sbi.ref_map.clear();
8974
8975 // relying on blob's pextents to decide what to release.
8976 for (auto& p : pext_to_release) {
8977 to_release.union_insert(p.offset, p.length);
8978 }
8979 } else {
8980 for (auto& p : pext_to_release) {
8981 expected_statfs->allocated -= p.length;
8982 if (compressed) {
8983 expected_statfs->data_compressed_allocated -= p.length;
8984 }
8985 to_release.union_insert(p.offset, p.length);
8986 }
8987 }
8988 if (bypass_rest) {
8989 break;
8990 }
8991 } // for(auto b : blobs)
8992 if (need_onode_update) {
8993 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
8994 _record_onode(o, txn);
8995 }
8996 } // for (it->lower_bound(string()); it->valid(); it->next())
8997
8998 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
8999 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
9000 << "~" << it.get_len() << std::dec << dendl;
9001 fm->release(it.get_start(), it.get_len(), txn);
9002 }
9003 alloc->release(to_release);
9004 to_release.clear();
9005 } // if (it) {
9006 } //if (repair && repairer.preprocess_misreference()) {
9007
9008 if (depth != FSCK_SHALLOW) {
9009 for (auto &p : sb_info) {
9010 sb_info_t& sbi = p.second;
9011 if (!sbi.passed) {
9012 derr << "fsck error: missing " << *sbi.sb << dendl;
9013 ++errors;
9014 }
9015 if (repair && (!sbi.passed || sbi.updated)) {
9016 auto sbid = p.first;
9017 if (sbi.ref_map.empty()) {
9018 ceph_assert(sbi.passed);
9019 dout(20) << __func__ << " " << *sbi.sb
9020 << " is empty, removing" << dendl;
9021 repairer.fix_shared_blob(db, sbid, nullptr);
9022 } else {
9023 bufferlist bl;
9024 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
9025 encode(persistent, bl);
9026 dout(20) << __func__ << " " << *sbi.sb
9027 << " is " << bl.length() << " bytes, updating" << dendl;
9028
9029 repairer.fix_shared_blob(db, sbid, &bl);
9030 }
9031 }
9032 }
9033 }
9034 sb_info.clear();
9035
9036 // check global stats only if fscking (not repairing) w/o per-pool stats
9037 if (!per_pool_stat_collection &&
9038 !repair &&
9039 !(actual_statfs == expected_store_statfs)) {
9040 derr << "fsck error: actual " << actual_statfs
9041 << " != expected " << expected_store_statfs << dendl;
9042 if (repair) {
9043 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
9044 expected_store_statfs);
9045 }
9046 ++errors;
9047 }
9048
9049 dout(1) << __func__ << " checking pool_statfs" << dendl;
9050 _fsck_check_pool_statfs(expected_pool_statfs,
9051 errors, warnings, repair ? &repairer : nullptr);
9052
9053 if (depth != FSCK_SHALLOW) {
9054 dout(1) << __func__ << " checking for stray omap data " << dendl;
9055 it = db->get_iterator(PREFIX_OMAP);
9056 if (it) {
9057 uint64_t last_omap_head = 0;
9058 for (it->lower_bound(string()); it->valid(); it->next()) {
9059 uint64_t omap_head;
9060 _key_decode_u64(it->key().c_str(), &omap_head);
9061 if (used_omap_head.count(omap_head) == 0 &&
9062 omap_head != last_omap_head) {
9063 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9064 << "fsck error: found stray omap data on omap_head "
9065 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head)<< fsck_dendl;
9066 ++errors;
9067 last_omap_head = omap_head;
9068 }
9069 }
9070 }
9071 it = db->get_iterator(PREFIX_PGMETA_OMAP);
9072 if (it) {
9073 uint64_t last_omap_head = 0;
9074 for (it->lower_bound(string()); it->valid(); it->next()) {
9075 uint64_t omap_head;
9076 _key_decode_u64(it->key().c_str(), &omap_head);
9077 if (used_omap_head.count(omap_head) == 0 &&
9078 omap_head != last_omap_head) {
9079 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9080 << "fsck error: found stray (pgmeta) omap data on omap_head "
9081 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
9082 last_omap_head = omap_head;
9083 ++errors;
9084 }
9085 }
9086 }
9087 it = db->get_iterator(PREFIX_PERPOOL_OMAP);
9088 if (it) {
9089 uint64_t last_omap_head = 0;
9090 for (it->lower_bound(string()); it->valid(); it->next()) {
9091 uint64_t pool;
9092 uint64_t omap_head;
9093 string k = it->key();
9094 const char *c = k.c_str();
9095 c = _key_decode_u64(c, &pool);
9096 c = _key_decode_u64(c, &omap_head);
9097 if (used_omap_head.count(omap_head) == 0 &&
9098 omap_head != last_omap_head) {
9099 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9100 << "fsck error: found stray (per-pool) omap data on omap_head "
9101 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
9102 ++errors;
9103 last_omap_head = omap_head;
9104 }
9105 }
9106 }
9107 dout(1) << __func__ << " checking deferred events" << dendl;
9108 it = db->get_iterator(PREFIX_DEFERRED);
9109 if (it) {
9110 for (it->lower_bound(string()); it->valid(); it->next()) {
9111 bufferlist bl = it->value();
9112 auto p = bl.cbegin();
9113 bluestore_deferred_transaction_t wt;
9114 try {
9115 decode(wt, p);
9116 } catch (buffer::error& e) {
9117 derr << "fsck error: failed to decode deferred txn "
9118 << pretty_binary_string(it->key()) << dendl;
9119 if (repair) {
9120 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
9121 << pretty_binary_string(it->key())
9122 << "', removing" << dendl;
9123 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
9124 }
9125 continue;
9126 }
9127 dout(20) << __func__ << " deferred " << wt.seq
9128 << " ops " << wt.ops.size()
9129 << " released 0x" << std::hex << wt.released << std::dec << dendl;
9130 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9131 apply_for_bitset_range(
9132 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
9133 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9134 bs.set(pos);
9135 }
9136 );
9137 }
9138 }
9139 }
9140
9141 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
9142 {
9143 // remove bluefs_extents from used set since the freelist doesn't
9144 // know they are allocated.
9145 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
9146 apply_for_bitset_range(
9147 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
9148 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9149 bs.reset(pos);
9150 }
9151 );
9152 }
9153 fm->enumerate_reset();
9154 uint64_t offset, length;
9155 while (fm->enumerate_next(db, &offset, &length)) {
9156 bool intersects = false;
9157 apply_for_bitset_range(
9158 offset, length, fm->get_alloc_size(), used_blocks,
9159 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9160 if (bs.test(pos)) {
9161 if (offset == SUPER_RESERVED &&
9162 length == min_alloc_size - SUPER_RESERVED) {
9163 // this is due to the change just after luminous to min_alloc_size
9164 // granularity allocations, and our baked in assumption at the top
9165 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
9166 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
9167 // since we will never allocate this region below min_alloc_size.
9168 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
9169 << " and min_alloc_size, 0x" << std::hex << offset << "~"
9170 << length << std::dec << dendl;
9171 } else {
9172 intersects = true;
9173 if (repair) {
9174 repairer.fix_false_free(db, fm,
9175 pos * min_alloc_size,
9176 min_alloc_size);
9177 }
9178 }
9179 } else {
9180 bs.set(pos);
9181 }
9182 }
9183 );
9184 if (intersects) {
9185 derr << "fsck error: free extent 0x" << std::hex << offset
9186 << "~" << length << std::dec
9187 << " intersects allocated blocks" << dendl;
9188 ++errors;
9189 }
9190 }
9191 fm->enumerate_reset();
9192 size_t count = used_blocks.count();
9193 if (used_blocks.size() != count) {
9194 ceph_assert(used_blocks.size() > count);
9195 used_blocks.flip();
9196 size_t start = used_blocks.find_first();
9197 while (start != decltype(used_blocks)::npos) {
9198 size_t cur = start;
9199 while (true) {
9200 size_t next = used_blocks.find_next(cur);
9201 if (next != cur + 1) {
9202 ++errors;
9203 derr << "fsck error: leaked extent 0x" << std::hex
9204 << ((uint64_t)start * fm->get_alloc_size()) << "~"
9205 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
9206 << dendl;
9207 if (repair) {
9208 repairer.fix_leaked(db,
9209 fm,
9210 start * min_alloc_size,
9211 (cur + 1 - start) * min_alloc_size);
9212 }
9213 start = next;
9214 break;
9215 }
9216 cur = next;
9217 }
9218 }
9219 used_blocks.flip();
9220 }
9221 }
9222 }
9223 if (repair) {
9224 if (!per_pool_omap) {
9225 dout(5) << __func__ << " marking per_pool_omap=1" << dendl;
9226 repairer.fix_per_pool_omap(db);
9227 }
9228
9229 dout(5) << __func__ << " applying repair results" << dendl;
9230 repaired = repairer.apply(db);
9231 dout(5) << __func__ << " repair applied" << dendl;
9232 }
9233
9234 out_scan:
9235 dout(2) << __func__ << " " << num_objects << " objects, "
9236 << num_sharded_objects << " of them sharded. "
9237 << dendl;
9238 dout(2) << __func__ << " " << num_extents << " extents to "
9239 << num_blobs << " blobs, "
9240 << num_spanning_blobs << " spanning, "
9241 << num_shared_blobs << " shared."
9242 << dendl;
9243
9244 utime_t duration = ceph_clock_now() - start;
9245 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
9246 << warnings << " warnings, "
9247 << repaired << " repaired, "
9248 << (errors + warnings - (int)repaired) << " remaining in "
9249 << duration << " seconds" << dendl;
9250
9251 // In non-repair mode we should return error count only as
9252 // it indicates if store status is OK.
9253 // In repair mode both errors and warnings are taken into account
9254 // since repaired counter relates to them both.
9255 return repair ? errors + warnings - (int)repaired : errors;
9256 }
9257
9258 /// methods to inject various errors fsck can repair
9259 void BlueStore::inject_broken_shared_blob_key(const string& key,
9260 const bufferlist& bl)
9261 {
9262 KeyValueDB::Transaction txn;
9263 txn = db->get_transaction();
9264 txn->set(PREFIX_SHARED_BLOB, key, bl);
9265 db->submit_transaction_sync(txn);
9266 };
9267
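// Test hook: allocates 'len' bytes and marks them used in the freelist
// without attaching them to any object, so a subsequent fsck reports the
// space as a leaked extent.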
9268 void BlueStore::inject_leaked(uint64_t len)
9269 {
9270 KeyValueDB::Transaction txn;
9271 txn = db->get_transaction();
9272
9273 PExtentVector exts;
9274 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
9275 min_alloc_size * 256, 0, &exts);
9276 ceph_assert(alloc_len >= (int64_t)len);
9277 for (auto& p : exts) {
9278 fm->allocate(p.offset, p.length, txn);
9279 }
9280 db->submit_transaction_sync(txn);
9281 }
9282
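// Test hook: releases (in the freelist only) one valid pextent from the first
// (and, if present, the last) blob of the given object while the object keeps
// referencing it, producing a 'false free' inconsistency for fsck to detect.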
9283 void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
9284 {
9285 KeyValueDB::Transaction txn;
9286 OnodeRef o;
9287 CollectionRef c = _get_collection(cid);
9288 ceph_assert(c);
9289 {
9290 std::unique_lock l{c->lock}; // just to avoid internal asserts
9291 o = c->get_onode(oid, false);
9292 ceph_assert(o);
9293 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9294 }
9295
9296 bool injected = false;
9297 txn = db->get_transaction();
9298 auto& em = o->extent_map.extent_map;
9299 std::vector<const PExtentVector*> v;
9300 if (em.size()) {
9301 v.push_back(&em.begin()->blob->get_blob().get_extents());
9302 }
9303 if (em.size() > 1) {
9304 auto it = em.end();
9305 --it;
9306 v.push_back(&(it->blob->get_blob().get_extents()));
9307 }
9308 for (auto pext : v) {
9309 if (pext->size()) {
9310 auto p = pext->begin();
9311 while (p != pext->end()) {
9312 if (p->is_valid()) {
9313 dout(20) << __func__ << " release 0x" << std::hex << p->offset
9314 << "~" << p->length << std::dec << dendl;
9315 fm->release(p->offset, p->length, txn);
9316 injected = true;
9317 break;
9318 }
9319 ++p;
9320 }
9321 }
9322 }
9323 ceph_assert(injected);
9324 db->submit_transaction_sync(txn);
9325 }
9326
9327 void BlueStore::inject_legacy_omap()
9328 {
9329 dout(1) << __func__ << dendl;
9330 per_pool_omap = false;
9331 KeyValueDB::Transaction txn;
9332 txn = db->get_transaction();
9333 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
9334 db->submit_transaction_sync(txn);
9335 }
9336
9337 void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
9338 {
9339 dout(1) << __func__ << " "
9340 << cid << " " << oid
9341 <<dendl;
9342 KeyValueDB::Transaction txn;
9343 OnodeRef o;
9344 CollectionRef c = _get_collection(cid);
9345 ceph_assert(c);
9346 {
9347 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9348 o = c->get_onode(oid, false);
9349 ceph_assert(o);
9350 }
9351 o->onode.clear_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PGMETA_OMAP);
9352 txn = db->get_transaction();
9353 _record_onode(o, txn);
9354 db->submit_transaction_sync(txn);
9355 }
9356
9357
9358 void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
9359 {
9360 BlueStoreRepairer repairer;
9361 repairer.fix_statfs(db, key, new_statfs);
9362 repairer.apply(db);
9363 }
9364
9365 void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
9366 {
9367 KeyValueDB::Transaction t = db->get_transaction();
9368 volatile_statfs v;
9369 v = new_statfs;
9370 bufferlist bl;
9371 v.encode(bl);
9372 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
9373 db->submit_transaction_sync(t);
9374 }
9375
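// Test hook: copies the blob of (cid1, oid1) at 'offset' into the matching
// extent of (cid2, oid2), so both objects end up referencing the same
// physical extents; fsck then sees misreferenced pextents (plus the space
// leaked by the overwritten blob).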
9376 void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
9377 coll_t cid2, ghobject_t oid2,
9378 uint64_t offset)
9379 {
9380 OnodeRef o1;
9381 CollectionRef c1 = _get_collection(cid1);
9382 ceph_assert(c1);
9383 {
9384 std::unique_lock l{c1->lock}; // just to avoid internal asserts
9385 o1 = c1->get_onode(oid1, false);
9386 ceph_assert(o1);
9387 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9388 }
9389 OnodeRef o2;
9390 CollectionRef c2 = _get_collection(cid2);
9391 ceph_assert(c2);
9392 {
9393 std::unique_lock l{c2->lock}; // just to avoid internal asserts
9394 o2 = c2->get_onode(oid2, false);
9395 ceph_assert(o2);
9396 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9397 }
9398 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
9399 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
9400
9401 // require onode/extent layout to be the same (and simple)
9402 // to make things easier
9403 ceph_assert(o1->onode.extent_map_shards.empty());
9404 ceph_assert(o2->onode.extent_map_shards.empty());
9405 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
9406 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
9407 ceph_assert(e1.logical_offset == e2.logical_offset);
9408 ceph_assert(e1.length == e2.length);
9409 ceph_assert(e1.blob_offset == e2.blob_offset);
9410
9411 KeyValueDB::Transaction txn;
9412 txn = db->get_transaction();
9413
9414 // along with the misreference error this will create space leak errors
9415 e2.blob->dirty_blob() = e1.blob->get_blob();
9416 o2->extent_map.dirty_range(offset, e2.length);
9417 o2->extent_map.update(txn, false);
9418
9419 _record_onode(o2, txn);
9420 db->submit_transaction_sync(txn);
9421 }
9422
9423 void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
9424 int16_t blob_id)
9425 {
9426 OnodeRef o;
9427 CollectionRef c = _get_collection(cid);
9428 ceph_assert(c);
9429 {
9430 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9431 o = c->get_onode(oid, false);
9432 ceph_assert(o);
9433 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9434 }
9435
9436 BlobRef b = c->new_blob();
9437 b->id = blob_id;
9438 o->extent_map.spanning_blob_map[blob_id] = b;
9439
9440 KeyValueDB::Transaction txn;
9441 txn = db->get_transaction();
9442
9443 _record_onode(o, txn);
9444 db->submit_transaction_sync(txn);
9445 }
9446
9447 void BlueStore::collect_metadata(map<string,string> *pm)
9448 {
9449 dout(10) << __func__ << dendl;
9450 bdev->collect_metadata("bluestore_bdev_", pm);
9451 if (bluefs) {
9452 (*pm)["bluefs"] = "1";
9453 // this value is for backward compatibility only
9454 (*pm)["bluefs_single_shared_device"] = \
9455 stringify((int)bluefs_layout.single_shared_device());
9456 (*pm)["bluefs_dedicated_db"] = \
9457 stringify((int)bluefs_layout.dedicated_db);
9458 (*pm)["bluefs_dedicated_wal"] = \
9459 stringify((int)bluefs_layout.dedicated_wal);
9460 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
9461 } else {
9462 (*pm)["bluefs"] = "0";
9463 }
9464
9465 // report numa mapping for underlying devices
9466 int node = -1;
9467 set<int> nodes;
9468 set<string> failed;
9469 int r = get_numa_node(&node, &nodes, &failed);
9470 if (r >= 0) {
9471 if (!failed.empty()) {
9472 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
9473 }
9474 if (!nodes.empty()) {
9475 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
9476 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
9477 }
9478 if (node >= 0) {
9479 (*pm)["objectstore_numa_node"] = stringify(node);
9480 }
9481 }
9482 }
9483
9484 int BlueStore::get_numa_node(
9485 int *final_node,
9486 set<int> *out_nodes,
9487 set<string> *out_failed)
9488 {
9489 int node = -1;
9490 set<string> devices;
9491 get_devices(&devices);
9492 set<int> nodes;
9493 set<string> failed;
9494 for (auto& devname : devices) {
9495 int n;
9496 BlkDev bdev(devname);
9497 int r = bdev.get_numa_node(&n);
9498 if (r < 0) {
9499 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
9500 << dendl;
9501 failed.insert(devname);
9502 continue;
9503 }
9504 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
9505 << dendl;
9506 nodes.insert(n);
9507 if (node < 0) {
9508 node = n;
9509 }
9510 }
9511 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
9512 *final_node = node;
9513 }
9514 if (out_nodes) {
9515 *out_nodes = nodes;
9516 }
9517 if (out_failed) {
9518 *out_failed = failed;
9519 }
9520 return 0;
9521 }
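// Behaviour example (derived from the logic above, not an additional API):
// devices {sda: node 0, nvme0n1: node 0} -> *final_node = 0;
// devices {sda: node 0, sdb: node 1}     -> *out_nodes = {0, 1} and
// *final_node is left untouched because the devices disagree.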
9522
9523 int BlueStore::get_devices(set<string> *ls)
9524 {
9525 if (bdev) {
9526 bdev->get_devices(ls);
9527 if (bluefs) {
9528 bluefs->get_devices(ls);
9529 }
9530 return 0;
9531 }
9532
9533 // grumble, we haven't started up yet.
9534 int r = _open_path();
9535 if (r < 0)
9536 goto out;
9537 r = _open_fsid(false);
9538 if (r < 0)
9539 goto out_path;
9540 r = _read_fsid(&fsid);
9541 if (r < 0)
9542 goto out_fsid;
9543 r = _lock_fsid();
9544 if (r < 0)
9545 goto out_fsid;
9546 r = _open_bdev(false);
9547 if (r < 0)
9548 goto out_fsid;
9549 r = _minimal_open_bluefs(false);
9550 if (r < 0)
9551 goto out_bdev;
9552 bdev->get_devices(ls);
9553 if (bluefs) {
9554 bluefs->get_devices(ls);
9555 }
9556 r = 0;
9557 _minimal_close_bluefs();
9558 out_bdev:
9559 _close_bdev();
9560 out_fsid:
9561 _close_fsid();
9562 out_path:
9563 _close_path();
9564 out:
9565 return r;
9566 }
9567
9568 void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
9569 {
9570 buf->reset();
9571
9572 buf->omap_allocated =
9573 db->estimate_prefix_size(PREFIX_OMAP, string()) +
9574 db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string());
9575
9576 uint64_t bfree = alloc->get_free();
9577
9578 if (bluefs) {
9579 int64_t bluefs_total = bluefs->get_total(bluefs_layout.shared_bdev);
9580 int64_t bluefs_free = bluefs->get_free(bluefs_layout.shared_bdev);
9581 // part of our shared device is "free" according to BlueFS, but we
9582 // can't touch bluestore_bluefs_min of it.
9583 int64_t shared_available = std::min(
9584 bluefs_free,
9585 int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
9586 buf->internally_reserved = bluefs_total - shared_available;
9587 if (shared_available > 0) {
9588 bfree += shared_available;
9589 }
9590 // include dedicated db, too, if that isn't the shared device.
9591 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
9592 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
9593 }
9594 // call any non-omap bluefs space "internal metadata"
9595 buf->internal_metadata =
9596 std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
9597 - buf->omap_allocated;
9598 }
9599
9600 uint64_t thin_total, thin_avail;
9601 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
9602 buf->total += thin_total;
9603
9604 // we are limited by both the size of the virtual device and the
9605 // underlying physical device.
9606 bfree = std::min(bfree, thin_avail);
9607
9608 buf->allocated = thin_total - thin_avail;
9609 } else {
9610 buf->total += bdev->get_size();
9611 }
9612 buf->available = bfree;
9613 }
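// Worked example with illustrative numbers: on a shared device with
// bluefs_total = 10 GiB, bluefs_free = 4 GiB and bluestore_bluefs_min = 1 GiB,
// shared_available = min(4 GiB, 10 GiB - 1 GiB) = 4 GiB, so 4 GiB is added
// back to bfree and internally_reserved = 10 GiB - 4 GiB = 6 GiB.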
9614
9615 int BlueStore::statfs(struct store_statfs_t *buf,
9616 osd_alert_list_t* alerts)
9617 {
9618 if (alerts) {
9619 alerts->clear();
9620 _log_alerts(*alerts);
9621 }
9622 _get_statfs_overall(buf);
9623 {
9624 std::lock_guard l(vstatfs_lock);
9625 buf->allocated = vstatfs.allocated();
9626 buf->data_stored = vstatfs.stored();
9627 buf->data_compressed = vstatfs.compressed();
9628 buf->data_compressed_original = vstatfs.compressed_original();
9629 buf->data_compressed_allocated = vstatfs.compressed_allocated();
9630 }
9631
9632 dout(20) << __func__ << " " << *buf << dendl;
9633 return 0;
9634 }
9635
9636 int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
9637 bool *out_per_pool_omap)
9638 {
9639 dout(20) << __func__ << " pool " << pool_id << dendl;
9640
9641 if (!per_pool_stat_collection) {
9642 dout(20) << __func__ << " not supported in legacy mode " << dendl;
9643 return -ENOTSUP;
9644 }
9645 buf->reset();
9646
9647 {
9648 std::lock_guard l(vstatfs_lock);
9649 osd_pools[pool_id].publish(buf);
9650 }
9651
9652 string key_prefix;
9653 _key_encode_u64(pool_id, &key_prefix);
9654 buf->omap_allocated = db->estimate_prefix_size(PREFIX_PERPOOL_OMAP,
9655 key_prefix);
9656 *out_per_pool_omap = per_pool_omap;
9657
9658 dout(10) << __func__ << " " << *buf << dendl;
9659 return 0;
9660 }
9661
9662 void BlueStore::_check_legacy_statfs_alert()
9663 {
9664 string s;
9665 if (!per_pool_stat_collection &&
9666 cct->_conf->bluestore_warn_on_legacy_statfs) {
9667 s = "legacy statfs reporting detected, "
9668 "suggest to run store repair to get consistent statistic reports";
9669 }
9670 std::lock_guard l(qlock);
9671 legacy_statfs_alert = s;
9672 }
9673
9674 void BlueStore::_check_no_per_pool_omap_alert()
9675 {
9676 string s;
9677 if (!per_pool_omap &&
9678 cct->_conf->bluestore_warn_on_no_per_pool_omap) {
9679 s = "legacy (not per-pool) omap detected, "
9680 "suggest to run store repair to measure per-pool omap usage";
9681 }
9682 std::lock_guard l(qlock);
9683 no_per_pool_omap_alert = s;
9684 }
9685
9686 // ---------------
9687 // cache
9688
9689 BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
9690 {
9691 std::shared_lock l(coll_lock);
9692 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
9693 if (cp == coll_map.end())
9694 return CollectionRef();
9695 return cp->second;
9696 }
9697
9698 void BlueStore::_queue_reap_collection(CollectionRef& c)
9699 {
9700 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9701 // _reap_collections and this run in the same thread,
9702 // so no lock is needed.
9703 removed_collections.push_back(c);
9704 }
9705
9706 void BlueStore::_reap_collections()
9707 {
9708
9709 list<CollectionRef> removed_colls;
9710 {
9711 // _queue_reap_collection and this run in the same thread,
9712 // so no lock is needed.
9713 if (!removed_collections.empty())
9714 removed_colls.swap(removed_collections);
9715 else
9716 return;
9717 }
9718
9719 list<CollectionRef>::iterator p = removed_colls.begin();
9720 while (p != removed_colls.end()) {
9721 CollectionRef c = *p;
9722 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9723 if (c->onode_map.map_any([&](Onode* o) {
9724 ceph_assert(!o->exists);
9725 if (o->flushing_count.load()) {
9726 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
9727 << " flush_txns " << o->flushing_count << dendl;
9728 return true;
9729 }
9730 return false;
9731 })) {
9732 ++p;
9733 continue;
9734 }
9735 c->onode_map.clear();
9736 p = removed_colls.erase(p);
9737 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
9738 }
9739 if (removed_colls.empty()) {
9740 dout(10) << __func__ << " all reaped" << dendl;
9741 } else {
9742 removed_collections.splice(removed_collections.begin(), removed_colls);
9743 }
9744 }
9745
9746 void BlueStore::_update_cache_logger()
9747 {
9748 uint64_t num_onodes = 0;
9749 uint64_t num_pinned_onodes = 0;
9750 uint64_t num_extents = 0;
9751 uint64_t num_blobs = 0;
9752 uint64_t num_buffers = 0;
9753 uint64_t num_buffer_bytes = 0;
9754 for (auto c : onode_cache_shards) {
9755 c->add_stats(&num_onodes, &num_pinned_onodes);
9756 }
9757 for (auto c : buffer_cache_shards) {
9758 c->add_stats(&num_extents, &num_blobs,
9759 &num_buffers, &num_buffer_bytes);
9760 }
9761 logger->set(l_bluestore_onodes, num_onodes);
9762 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
9763 logger->set(l_bluestore_extents, num_extents);
9764 logger->set(l_bluestore_blobs, num_blobs);
9765 logger->set(l_bluestore_buffers, num_buffers);
9766 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
9767 }
9768
9769 // ---------------
9770 // read operations
9771
9772 ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
9773 {
9774 return _get_collection(cid);
9775 }
9776
9777 ObjectStore::CollectionHandle BlueStore::create_new_collection(
9778 const coll_t& cid)
9779 {
9780 std::unique_lock l{coll_lock};
9781 auto c = ceph::make_ref<Collection>(
9782 this,
9783 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
9784 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
9785 cid);
9786 new_coll_map[cid] = c;
9787 _osr_attach(c.get());
9788 return c;
9789 }
9790
9791 void BlueStore::set_collection_commit_queue(
9792 const coll_t& cid,
9793 ContextQueue *commit_queue)
9794 {
9795 if (commit_queue) {
9796 std::shared_lock l(coll_lock);
9797 if (coll_map.count(cid)) {
9798 coll_map[cid]->commit_queue = commit_queue;
9799 } else if (new_coll_map.count(cid)) {
9800 new_coll_map[cid]->commit_queue = commit_queue;
9801 }
9802 }
9803 }
9804
9805
9806 bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
9807 {
9808 Collection *c = static_cast<Collection *>(c_.get());
9809 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
9810 if (!c->exists)
9811 return false;
9812
9813 bool r = true;
9814
9815 {
9816 std::shared_lock l(c->lock);
9817 OnodeRef o = c->get_onode(oid, false);
9818 if (!o || !o->exists)
9819 r = false;
9820 }
9821
9822 return r;
9823 }
9824
9825 int BlueStore::stat(
9826 CollectionHandle &c_,
9827 const ghobject_t& oid,
9828 struct stat *st,
9829 bool allow_eio)
9830 {
9831 Collection *c = static_cast<Collection *>(c_.get());
9832 if (!c->exists)
9833 return -ENOENT;
9834 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9835
9836 {
9837 std::shared_lock l(c->lock);
9838 OnodeRef o = c->get_onode(oid, false);
9839 if (!o || !o->exists)
9840 return -ENOENT;
9841 st->st_size = o->onode.size;
9842 st->st_blksize = 4096;
9843 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
9844 st->st_nlink = 1;
9845 }
9846
9847 int r = 0;
9848 if (_debug_mdata_eio(oid)) {
9849 r = -EIO;
9850 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9851 }
9852 return r;
9853 }
9854 int BlueStore::set_collection_opts(
9855 CollectionHandle& ch,
9856 const pool_opts_t& opts)
9857 {
9858 Collection *c = static_cast<Collection *>(ch.get());
9859 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
9860 if (!c->exists)
9861 return -ENOENT;
9862 std::unique_lock l{c->lock};
9863 c->pool_opts = opts;
9864 return 0;
9865 }
9866
9867 int BlueStore::read(
9868 CollectionHandle &c_,
9869 const ghobject_t& oid,
9870 uint64_t offset,
9871 size_t length,
9872 bufferlist& bl,
9873 uint32_t op_flags)
9874 {
9875 auto start = mono_clock::now();
9876 Collection *c = static_cast<Collection *>(c_.get());
9877 const coll_t &cid = c->get_cid();
9878 dout(15) << __func__ << " " << cid << " " << oid
9879 << " 0x" << std::hex << offset << "~" << length << std::dec
9880 << dendl;
9881 if (!c->exists)
9882 return -ENOENT;
9883
9884 bl.clear();
9885 int r;
9886 {
9887 std::shared_lock l(c->lock);
9888 auto start1 = mono_clock::now();
9889 OnodeRef o = c->get_onode(oid, false);
9890 log_latency("get_onode@read",
9891 l_bluestore_read_onode_meta_lat,
9892 mono_clock::now() - start1,
9893 cct->_conf->bluestore_log_op_age);
9894 if (!o || !o->exists) {
9895 r = -ENOENT;
9896 goto out;
9897 }
9898
9899 if (offset == length && offset == 0)
9900 length = o->onode.size;
9901
9902 r = _do_read(c, o, offset, length, bl, op_flags);
9903 if (r == -EIO) {
9904 logger->inc(l_bluestore_read_eio);
9905 }
9906 }
9907
9908 out:
9909 if (r >= 0 && _debug_data_eio(oid)) {
9910 r = -EIO;
9911 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9912 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
9913 cct->_conf->bluestore_debug_random_read_err &&
9914 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
9915 100.0)) == 0) {
9916 dout(0) << __func__ << ": inject random EIO" << dendl;
9917 r = -EIO;
9918 }
9919 dout(10) << __func__ << " " << cid << " " << oid
9920 << " 0x" << std::hex << offset << "~" << length << std::dec
9921 << " = " << r << dendl;
9922 log_latency(__func__,
9923 l_bluestore_read_lat,
9924 mono_clock::now() - start,
9925 cct->_conf->bluestore_log_op_age);
9926 return r;
9927 }
9928
9929 void BlueStore::_read_cache(
9930 OnodeRef o,
9931 uint64_t offset,
9932 size_t length,
9933 int read_cache_policy,
9934 ready_regions_t& ready_regions,
9935 blobs2read_t& blobs2read)
9936 {
9937 // build blob-wise list of stuff to read (that isn't cached)
9938 unsigned left = length;
9939 uint64_t pos = offset;
9940 auto lp = o->extent_map.seek_lextent(offset);
9941 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9942 if (pos < lp->logical_offset) {
9943 unsigned hole = lp->logical_offset - pos;
9944 if (hole >= left) {
9945 break;
9946 }
9947 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9948 << std::dec << dendl;
9949 pos += hole;
9950 left -= hole;
9951 }
9952 BlobRef& bptr = lp->blob;
9953 unsigned l_off = pos - lp->logical_offset;
9954 unsigned b_off = l_off + lp->blob_offset;
9955 unsigned b_len = std::min(left, lp->length - l_off);
9956
9957 ready_regions_t cache_res;
9958 interval_set<uint32_t> cache_interval;
9959 bptr->shared_blob->bc.read(
9960 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9961 read_cache_policy);
9962 dout(20) << __func__ << " blob " << *bptr << std::hex
9963 << " need 0x" << b_off << "~" << b_len
9964 << " cache has 0x" << cache_interval
9965 << std::dec << dendl;
9966
9967 auto pc = cache_res.begin();
9968 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
9969 while (b_len > 0) {
9970 unsigned l;
9971 if (pc != cache_res.end() &&
9972 pc->first == b_off) {
9973 l = pc->second.length();
9974 ready_regions[pos].claim(pc->second);
9975 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9976 << b_off << "~" << l << std::dec << dendl;
9977 ++pc;
9978 } else {
9979 l = b_len;
9980 if (pc != cache_res.end()) {
9981 ceph_assert(pc->first > b_off);
9982 l = pc->first - b_off;
9983 }
9984 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9985 << b_off << "~" << l << std::dec << dendl;
9986 // merge regions
9987 {
9988 uint64_t r_off = b_off;
9989 uint64_t r_len = l;
9990 uint64_t front = r_off % chunk_size;
9991 if (front) {
9992 r_off -= front;
9993 r_len += front;
9994 }
9995 unsigned tail = r_len % chunk_size;
9996 if (tail) {
9997 r_len += chunk_size - tail;
9998 }
9999 bool merged = false;
10000 regions2read_t& r2r = blobs2read[bptr];
10001 if (r2r.size()) {
10002 read_req_t& pre = r2r.back();
10003 if (r_off <= (pre.r_off + pre.r_len)) {
10004 front += (r_off - pre.r_off);
10005 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
10006 pre.regs.emplace_back(region_t(pos, b_off, l, front));
10007 merged = true;
10008 }
10009 }
10010 if (!merged) {
10011 read_req_t req(r_off, r_len);
10012 req.regs.emplace_back(region_t(pos, b_off, l, front));
10013 r2r.emplace_back(std::move(req));
10014 }
10015 }
10016 }
10017 pos += l;
10018 b_off += l;
10019 left -= l;
10020 b_len -= l;
10021 }
10022 ++lp;
10023 }
10024 }
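// Alignment example for the merge logic above: with chunk_size = 0x1000, a
// cache miss at b_off = 0x1800 of length 0x600 becomes a device read at
// r_off = 0x1000 with r_len = 0x1000 (front = 0x800 is added, then the tail
// is padded up to a whole chunk); region_t::front records where the caller's
// bytes start inside that padded read.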
10025
10026 int BlueStore::_prepare_read_ioc(
10027 blobs2read_t& blobs2read,
10028 vector<bufferlist>* compressed_blob_bls,
10029 IOContext* ioc)
10030 {
10031 for (auto& p : blobs2read) {
10032 const BlobRef& bptr = p.first;
10033 regions2read_t& r2r = p.second;
10034 dout(20) << __func__ << " blob " << *bptr << std::hex
10035 << " need " << r2r << std::dec << dendl;
10036 if (bptr->get_blob().is_compressed()) {
10037 // read the whole thing
10038 if (compressed_blob_bls->empty()) {
10039 // ensure we avoid any reallocation on subsequent blobs
10040 compressed_blob_bls->reserve(blobs2read.size());
10041 }
10042 compressed_blob_bls->push_back(bufferlist());
10043 bufferlist& bl = compressed_blob_bls->back();
10044 auto r = bptr->get_blob().map(
10045 0, bptr->get_blob().get_ondisk_length(),
10046 [&](uint64_t offset, uint64_t length) {
10047 int r = bdev->aio_read(offset, length, &bl, ioc);
10048 if (r < 0)
10049 return r;
10050 return 0;
10051 });
10052 if (r < 0) {
10053 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
10054 if (r == -EIO) {
10055 // propagate EIO to caller
10056 return r;
10057 }
10058 ceph_assert(r == 0);
10059 }
10060 } else {
10061 // read the pieces
10062 for (auto& req : r2r) {
10063 dout(20) << __func__ << " region 0x" << std::hex
10064 << req.regs.front().logical_offset
10065 << ": 0x" << req.regs.front().blob_xoffset
10066 << " reading 0x" << req.r_off
10067 << "~" << req.r_len << std::dec
10068 << dendl;
10069
10070 // read it
10071 auto r = bptr->get_blob().map(
10072 req.r_off, req.r_len,
10073 [&](uint64_t offset, uint64_t length) {
10074 int r = bdev->aio_read(offset, length, &req.bl, ioc);
10075 if (r < 0)
10076 return r;
10077 return 0;
10078 });
10079 if (r < 0) {
10080 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
10081 << dendl;
10082 if (r == -EIO) {
10083 // propagate EIO to caller
10084 return r;
10085 }
10086 ceph_assert(r == 0);
10087 }
10088 ceph_assert(req.bl.length() == req.r_len);
10089 }
10090 }
10091 }
10092 return 0;
10093 }
10094
10095 int BlueStore::_generate_read_result_bl(
10096 OnodeRef o,
10097 uint64_t offset,
10098 size_t length,
10099 ready_regions_t& ready_regions,
10100 vector<bufferlist>& compressed_blob_bls,
10101 blobs2read_t& blobs2read,
10102 bool buffered,
10103 bool* csum_error,
10104 bufferlist& bl)
10105 {
10106 // enumerate and decompress desired blobs
10107 auto p = compressed_blob_bls.begin();
10108 blobs2read_t::iterator b2r_it = blobs2read.begin();
10109 while (b2r_it != blobs2read.end()) {
10110 const BlobRef& bptr = b2r_it->first;
10111 regions2read_t& r2r = b2r_it->second;
10112 dout(20) << __func__ << " blob " << *bptr << std::hex
10113 << " need 0x" << r2r << std::dec << dendl;
10114 if (bptr->get_blob().is_compressed()) {
10115 ceph_assert(p != compressed_blob_bls.end());
10116 bufferlist& compressed_bl = *p++;
10117 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
10118 r2r.front().regs.front().logical_offset) < 0) {
10119 *csum_error = true;
10120 return -EIO;
10121 }
10122 bufferlist raw_bl;
10123 auto r = _decompress(compressed_bl, &raw_bl);
10124 if (r < 0)
10125 return r;
10126 if (buffered) {
10127 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
10128 raw_bl);
10129 }
10130 for (auto& req : r2r) {
10131 for (auto& r : req.regs) {
10132 ready_regions[r.logical_offset].substr_of(
10133 raw_bl, r.blob_xoffset, r.length);
10134 }
10135 }
10136 } else {
10137 for (auto& req : r2r) {
10138 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
10139 req.regs.front().logical_offset) < 0) {
10140 *csum_error = true;
10141 return -EIO;
10142 }
10143 if (buffered) {
10144 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
10145 req.r_off, req.bl);
10146 }
10147
10148 // prune and keep result
10149 for (const auto& r : req.regs) {
10150 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
10151 }
10152 }
10153 }
10154 ++b2r_it;
10155 }
10156
10157 // generate a resulting buffer
10158 auto pr = ready_regions.begin();
10159 auto pr_end = ready_regions.end();
10160 uint64_t pos = 0;
10161 while (pos < length) {
10162 if (pr != pr_end && pr->first == pos + offset) {
10163 dout(30) << __func__ << " assemble 0x" << std::hex << pos
10164 << ": data from 0x" << pr->first << "~" << pr->second.length()
10165 << std::dec << dendl;
10166 pos += pr->second.length();
10167 bl.claim_append(pr->second);
10168 ++pr;
10169 } else {
10170 uint64_t l = length - pos;
10171 if (pr != pr_end) {
10172 ceph_assert(pr->first > pos + offset);
10173 l = pr->first - (pos + offset);
10174 }
10175 dout(30) << __func__ << " assemble 0x" << std::hex << pos
10176 << ": zeros for 0x" << (pos + offset) << "~" << l
10177 << std::dec << dendl;
10178 bl.append_zero(l);
10179 pos += l;
10180 }
10181 }
10182 ceph_assert(bl.length() == length);
10183 ceph_assert(pos == length);
10184 ceph_assert(pr == pr_end);
10185 return 0;
10186 }
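// Assembly example: for a 0x4000-byte read at offset 0 where ready_regions
// holds data only for [0x0, 0x1000) and [0x3000, 0x4000), the result is
// data(0x1000) + zeros(0x2000) + data(0x1000); bl.length() always ends up
// equal to the requested length.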
10187
10188 int BlueStore::_do_read(
10189 Collection *c,
10190 OnodeRef o,
10191 uint64_t offset,
10192 size_t length,
10193 bufferlist& bl,
10194 uint32_t op_flags,
10195 uint64_t retry_count)
10196 {
10197 FUNCTRACE(cct);
10198 int r = 0;
10199 int read_cache_policy = 0; // do not bypass clean or dirty cache
10200
10201 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10202 << " size 0x" << o->onode.size << " (" << std::dec
10203 << o->onode.size << ")" << dendl;
10204 bl.clear();
10205
10206 if (offset >= o->onode.size) {
10207 return r;
10208 }
10209
10210 // generally, don't buffer anything, unless the client explicitly requests
10211 // it.
10212 bool buffered = false;
10213 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10214 dout(20) << __func__ << " will do buffered read" << dendl;
10215 buffered = true;
10216 } else if (cct->_conf->bluestore_default_buffered_read &&
10217 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10218 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10219 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10220 buffered = true;
10221 }
10222
10223 if (offset + length > o->onode.size) {
10224 length = o->onode.size - offset;
10225 }
10226
10227 auto start = mono_clock::now();
10228 o->extent_map.fault_range(db, offset, length);
10229 log_latency(__func__,
10230 l_bluestore_read_onode_meta_lat,
10231 mono_clock::now() - start,
10232 cct->_conf->bluestore_log_op_age);
10233 _dump_onode<30>(cct, *o);
10234
10235 // for deep-scrub, we only read dirty cache and bypass clean cache in
10236 // order to read underlying block device in case there are silent disk errors.
10237 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
10238 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
10239 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
10240 }
10241
10242 // build blob-wise list of stuff to read (that isn't cached)
10243 ready_regions_t ready_regions;
10244 blobs2read_t blobs2read;
10245 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
10246
10247
10248 // read raw blob data.
10249 start = mono_clock::now(); // for simplicity, measure the whole
10250 // block below; the resulting error
10251 // is negligible.
10252 vector<bufferlist> compressed_blob_bls;
10253 IOContext ioc(cct, NULL, true); // allow EIO
10254 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
10255 // we always issue aio for reading, so errors other than EIO are not allowed
10256 if (r < 0)
10257 return r;
10258
10259 int64_t num_ios = length;
10260 if (ioc.has_pending_aios()) {
10261 num_ios = -ioc.get_num_ios();
10262 bdev->aio_submit(&ioc);
10263 dout(20) << __func__ << " waiting for aio" << dendl;
10264 ioc.aio_wait();
10265 r = ioc.get_return_value();
10266 if (r < 0) {
10267 ceph_assert(r == -EIO); // no other errors allowed
10268 return -EIO;
10269 }
10270 }
10271 log_latency_fn(__func__,
10272 l_bluestore_read_wait_aio_lat,
10273 mono_clock::now() - start,
10274 cct->_conf->bluestore_log_op_age,
10275 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10276 );
10277
10278 bool csum_error = false;
10279 r = _generate_read_result_bl(o, offset, length, ready_regions,
10280 compressed_blob_bls, blobs2read,
10281 buffered, &csum_error, bl);
10282 if (csum_error) {
10283 // Handles spurious read errors caused by a kernel bug.
10284 // We sometimes get all-zero pages as a result of the read under
10285 // high memory pressure. Retrying the failing read succeeds in most
10286 // cases.
10287 // See also: http://tracker.ceph.com/issues/22464
10288 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10289 return -EIO;
10290 }
10291 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
10292 }
10293 r = bl.length();
10294 if (retry_count) {
10295 logger->inc(l_bluestore_reads_with_retries);
10296 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
10297 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
10298 }
10299 return r;
10300 }
10301
10302 int BlueStore::_verify_csum(OnodeRef& o,
10303 const bluestore_blob_t* blob, uint64_t blob_xoffset,
10304 const bufferlist& bl,
10305 uint64_t logical_offset) const
10306 {
10307 int bad;
10308 uint64_t bad_csum;
10309 auto start = mono_clock::now();
10310 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
10311 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
10312 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
10313 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
10314 bad = blob_xoffset;
10315 r = -1;
10316 bad_csum = 0xDEADBEEF;
10317 }
10318 if (r < 0) {
10319 if (r == -1) {
10320 PExtentVector pex;
10321 blob->map(
10322 bad,
10323 blob->get_csum_chunk_size(),
10324 [&](uint64_t offset, uint64_t length) {
10325 pex.emplace_back(bluestore_pextent_t(offset, length));
10326 return 0;
10327 });
10328 derr << __func__ << " bad "
10329 << Checksummer::get_csum_type_string(blob->csum_type)
10330 << "/0x" << std::hex << blob->get_csum_chunk_size()
10331 << " checksum at blob offset 0x" << bad
10332 << ", got 0x" << bad_csum << ", expected 0x"
10333 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
10334 << ", device location " << pex
10335 << ", logical extent 0x" << std::hex
10336 << (logical_offset + bad - blob_xoffset) << "~"
10337 << blob->get_csum_chunk_size() << std::dec
10338 << ", object " << o->oid
10339 << dendl;
10340 } else {
10341 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
10342 }
10343 }
10344 log_latency(__func__,
10345 l_bluestore_csum_lat,
10346 mono_clock::now() - start,
10347 cct->_conf->bluestore_log_op_age);
10348 if (cct->_conf->bluestore_ignore_data_csum) {
10349 return 0;
10350 }
10351 return r;
10352 }
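// Return-value note: -1 means a checksum mismatch (the offending device
// extents and expected/actual values are logged above); other negative values
// are internal verification errors. With bluestore_ignore_data_csum set, any
// failure is logged but reported as success.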
10353
10354 int BlueStore::_decompress(bufferlist& source, bufferlist* result)
10355 {
10356 int r = 0;
10357 auto start = mono_clock::now();
10358 auto i = source.cbegin();
10359 bluestore_compression_header_t chdr;
10360 decode(chdr, i);
10361 int alg = int(chdr.type);
10362 CompressorRef cp = compressor;
10363 if (!cp || (int)cp->get_type() != alg) {
10364 cp = Compressor::create(cct, alg);
10365 }
10366
10367 if (!cp.get()) {
10368 // if the compressor isn't available, fail with an error: we cannot
10369 // return the decompressed data
10370
10371 const char* alg_name = Compressor::get_comp_alg_name(alg);
10372 derr << __func__ << " can't load decompressor " << alg_name << dendl;
10373 _set_compression_alert(false, alg_name);
10374 r = -EIO;
10375 } else {
10376 r = cp->decompress(i, chdr.length, *result);
10377 if (r < 0) {
10378 derr << __func__ << " decompression failed with exit code " << r << dendl;
10379 r = -EIO;
10380 }
10381 }
10382 log_latency(__func__,
10383 l_bluestore_decompress_lat,
10384 mono_clock::now() - start,
10385 cct->_conf->bluestore_log_op_age);
10386 return r;
10387 }
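// On-disk layout consumed here (a sketch, matching what the write path is
// assumed to encode): the blob payload starts with an encoded
// bluestore_compression_header_t {type, length} followed by the compressed
// bytes, so decode(chdr, i) leaves the iterator positioned at the payload
// that is handed to cp->decompress().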
10388
10389 // this stores fiemap into interval_set, other variations
10390 // use it internally
10391 int BlueStore::_fiemap(
10392 CollectionHandle &c_,
10393 const ghobject_t& oid,
10394 uint64_t offset,
10395 size_t length,
10396 interval_set<uint64_t>& destset)
10397 {
10398 Collection *c = static_cast<Collection *>(c_.get());
10399 if (!c->exists)
10400 return -ENOENT;
10401 {
10402 std::shared_lock l(c->lock);
10403
10404 OnodeRef o = c->get_onode(oid, false);
10405 if (!o || !o->exists) {
10406 return -ENOENT;
10407 }
10408 _dump_onode<30>(cct, *o);
10409
10410 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10411 << " size 0x" << o->onode.size << std::dec << dendl;
10412
10413 boost::intrusive::set<Extent>::iterator ep, eend;
10414 if (offset >= o->onode.size)
10415 goto out;
10416
10417 if (offset + length > o->onode.size) {
10418 length = o->onode.size - offset;
10419 }
10420
10421 o->extent_map.fault_range(db, offset, length);
10422 eend = o->extent_map.extent_map.end();
10423 ep = o->extent_map.seek_lextent(offset);
10424 while (length > 0) {
10425 dout(20) << __func__ << " offset " << offset << dendl;
10426 if (ep != eend && ep->logical_offset + ep->length <= offset) {
10427 ++ep;
10428 continue;
10429 }
10430
10431 uint64_t x_len = length;
10432 if (ep != eend && ep->logical_offset <= offset) {
10433 uint64_t x_off = offset - ep->logical_offset;
10434 x_len = std::min(x_len, ep->length - x_off);
10435 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
10436 << x_len << std::dec << " blob " << ep->blob << dendl;
10437 destset.insert(offset, x_len);
10438 length -= x_len;
10439 offset += x_len;
10440 if (x_off + x_len == ep->length)
10441 ++ep;
10442 continue;
10443 }
10444 if (ep != eend &&
10445 ep->logical_offset > offset &&
10446 ep->logical_offset - offset < x_len) {
10447 x_len = ep->logical_offset - offset;
10448 }
10449 offset += x_len;
10450 length -= x_len;
10451 }
10452 }
10453
10454 out:
10455 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10456 << " size = 0x(" << destset << ")" << std::dec << dendl;
10457 return 0;
10458 }
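// Example: an object of size 0x3000 with lextents at [0x0, 0x1000) and
// [0x2000, 0x3000) and a hole in between yields destset = {0x0~0x1000,
// 0x2000~0x1000}; holes are never reported in the interval set.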
10459
10460 int BlueStore::fiemap(
10461 CollectionHandle &c_,
10462 const ghobject_t& oid,
10463 uint64_t offset,
10464 size_t length,
10465 bufferlist& bl)
10466 {
10467 interval_set<uint64_t> m;
10468 int r = _fiemap(c_, oid, offset, length, m);
10469 if (r >= 0) {
10470 encode(m, bl);
10471 }
10472 return r;
10473 }
10474
10475 int BlueStore::fiemap(
10476 CollectionHandle &c_,
10477 const ghobject_t& oid,
10478 uint64_t offset,
10479 size_t length,
10480 map<uint64_t, uint64_t>& destmap)
10481 {
10482 interval_set<uint64_t> m;
10483 int r = _fiemap(c_, oid, offset, length, m);
10484 if (r >= 0) {
10485 destmap = std::move(m).detach();
10486 }
10487 return r;
10488 }
10489
10490 int BlueStore::readv(
10491 CollectionHandle &c_,
10492 const ghobject_t& oid,
10493 interval_set<uint64_t>& m,
10494 bufferlist& bl,
10495 uint32_t op_flags)
10496 {
10497 auto start = mono_clock::now();
10498 Collection *c = static_cast<Collection *>(c_.get());
10499 const coll_t &cid = c->get_cid();
10500 dout(15) << __func__ << " " << cid << " " << oid
10501 << " fiemap " << m
10502 << dendl;
10503 if (!c->exists)
10504 return -ENOENT;
10505
10506 bl.clear();
10507 int r;
10508 {
10509 std::shared_lock l(c->lock);
10510 auto start1 = mono_clock::now();
10511 OnodeRef o = c->get_onode(oid, false);
10512 log_latency("get_onode@read",
10513 l_bluestore_read_onode_meta_lat,
10514 mono_clock::now() - start1,
10515 cct->_conf->bluestore_log_op_age);
10516 if (!o || !o->exists) {
10517 r = -ENOENT;
10518 goto out;
10519 }
10520
10521 if (m.empty()) {
10522 r = 0;
10523 goto out;
10524 }
10525
10526 r = _do_readv(c, o, m, bl, op_flags);
10527 if (r == -EIO) {
10528 logger->inc(l_bluestore_read_eio);
10529 }
10530 }
10531
10532 out:
10533 if (r >= 0 && _debug_data_eio(oid)) {
10534 r = -EIO;
10535 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10536 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10537 cct->_conf->bluestore_debug_random_read_err &&
10538 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10539 100.0)) == 0) {
10540 dout(0) << __func__ << ": inject random EIO" << dendl;
10541 r = -EIO;
10542 }
10543 dout(10) << __func__ << " " << cid << " " << oid
10544 << " fiemap " << m << std::dec
10545 << " = " << r << dendl;
10546 log_latency(__func__,
10547 l_bluestore_read_lat,
10548 mono_clock::now() - start,
10549 cct->_conf->bluestore_log_op_age);
10550 return r;
10551 }
10552
10553 int BlueStore::_do_readv(
10554 Collection *c,
10555 OnodeRef o,
10556 const interval_set<uint64_t>& m,
10557 bufferlist& bl,
10558 uint32_t op_flags,
10559 uint64_t retry_count)
10560 {
10561 FUNCTRACE(cct);
10562 int r = 0;
10563 int read_cache_policy = 0; // do not bypass clean or dirty cache
10564
10565 dout(20) << __func__ << " fiemap " << m << std::hex
10566 << " size 0x" << o->onode.size << " (" << std::dec
10567 << o->onode.size << ")" << dendl;
10568
10569 // generally, don't buffer anything, unless the client explicitly requests
10570 // it.
10571 bool buffered = false;
10572 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10573 dout(20) << __func__ << " will do buffered read" << dendl;
10574 buffered = true;
10575 } else if (cct->_conf->bluestore_default_buffered_read &&
10576 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10577 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10578 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10579 buffered = true;
10580 }
10581 // this method must be idempotent since we may call it several times
10582 // before we finally read the expected result.
10583 bl.clear();
10584
10585 // call fiemap first!
10586 ceph_assert(m.range_start() <= o->onode.size);
10587 ceph_assert(m.range_end() <= o->onode.size);
10588 auto start = mono_clock::now();
10589 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
10590 log_latency(__func__,
10591 l_bluestore_read_onode_meta_lat,
10592 mono_clock::now() - start,
10593 cct->_conf->bluestore_log_op_age);
10594 _dump_onode<30>(cct, *o);
10595
10596 IOContext ioc(cct, NULL, true); // allow EIO
10597 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
10598 raw_results.reserve(m.num_intervals());
10599 int i = 0;
10600 for (auto p = m.begin(); p != m.end(); p++, i++) {
10601 raw_results.push_back({});
10602 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
10603 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
10604 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
10605 // we always issue aio for reading, so errors other than EIO are not allowed
10606 if (r < 0)
10607 return r;
10608 }
10609
10610 auto num_ios = m.size();
10611 if (ioc.has_pending_aios()) {
10612 num_ios = ioc.get_num_ios();
10613 bdev->aio_submit(&ioc);
10614 dout(20) << __func__ << " waiting for aio" << dendl;
10615 ioc.aio_wait();
10616 r = ioc.get_return_value();
10617 if (r < 0) {
10618 ceph_assert(r == -EIO); // no other errors allowed
10619 return -EIO;
10620 }
10621 }
10622 log_latency_fn(__func__,
10623 l_bluestore_read_wait_aio_lat,
10624 mono_clock::now() - start,
10625 cct->_conf->bluestore_log_op_age,
10626 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10627 );
10628
10629 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
10630 i = 0;
10631 for (auto p = m.begin(); p != m.end(); p++, i++) {
10632 bool csum_error = false;
10633 bufferlist t;
10634 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
10635 std::get<0>(raw_results[i]),
10636 std::get<1>(raw_results[i]),
10637 std::get<2>(raw_results[i]),
10638 buffered, &csum_error, t);
10639 if (csum_error) {
10640 // Handles spurious read errors caused by a kernel bug.
10641 // We sometimes get all-zero pages as a result of the read under
10642 // high memory pressure. Retrying the failing read succeeds in most
10643 // cases.
10644 // See also: http://tracker.ceph.com/issues/22464
10645 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10646 return -EIO;
10647 }
10648 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
10649 }
10650 bl.claim_append(t);
10651 }
10652 if (retry_count) {
10653 logger->inc(l_bluestore_reads_with_retries);
10654 dout(5) << __func__ << " read fiemap " << m
10655 << " failed " << retry_count << " times before succeeding"
10656 << dendl;
10657 }
10658 return bl.length();
10659 }
10660
10661 int BlueStore::dump_onode(CollectionHandle &c_,
10662 const ghobject_t& oid,
10663 const string& section_name,
10664 Formatter *f)
10665 {
10666 Collection *c = static_cast<Collection *>(c_.get());
10667 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10668 if (!c->exists)
10669 return -ENOENT;
10670
10671 int r;
10672 {
10673 std::shared_lock l(c->lock);
10674
10675 OnodeRef o = c->get_onode(oid, false);
10676 if (!o || !o->exists) {
10677 r = -ENOENT;
10678 goto out;
10679 }
10680 // FIXME minor: actually the next line isn't enough to
10681 // load shared blobs. Leaving as-is for now.
10682 //
10683 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10684
10685 _dump_onode<0>(cct, *o);
10686 f->open_object_section(section_name.c_str());
10687 o->dump(f);
10688 f->close_section();
10689 r = 0;
10690 }
10691 out:
10692 dout(10) << __func__ << " " << c->cid << " " << oid
10693 << " = " << r << dendl;
10694 return r;
10695 }
10696
10697 int BlueStore::getattr(
10698 CollectionHandle &c_,
10699 const ghobject_t& oid,
10700 const char *name,
10701 bufferptr& value)
10702 {
10703 Collection *c = static_cast<Collection *>(c_.get());
10704 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
10705 if (!c->exists)
10706 return -ENOENT;
10707
10708 int r;
10709 {
10710 std::shared_lock l(c->lock);
10711 mempool::bluestore_cache_meta::string k(name);
10712
10713 OnodeRef o = c->get_onode(oid, false);
10714 if (!o || !o->exists) {
10715 r = -ENOENT;
10716 goto out;
10717 }
10718
10719 if (!o->onode.attrs.count(k)) {
10720 r = -ENODATA;
10721 goto out;
10722 }
10723 value = o->onode.attrs[k];
10724 r = 0;
10725 }
10726 out:
10727 if (r == 0 && _debug_mdata_eio(oid)) {
10728 r = -EIO;
10729 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10730 }
10731 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
10732 << " = " << r << dendl;
10733 return r;
10734 }
10735
10736 int BlueStore::getattrs(
10737 CollectionHandle &c_,
10738 const ghobject_t& oid,
10739 map<string,bufferptr>& aset)
10740 {
10741 Collection *c = static_cast<Collection *>(c_.get());
10742 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10743 if (!c->exists)
10744 return -ENOENT;
10745
10746 int r;
10747 {
10748 std::shared_lock l(c->lock);
10749
10750 OnodeRef o = c->get_onode(oid, false);
10751 if (!o || !o->exists) {
10752 r = -ENOENT;
10753 goto out;
10754 }
10755 for (auto& i : o->onode.attrs) {
10756 aset.emplace(i.first.c_str(), i.second);
10757 }
10758 r = 0;
10759 }
10760
10761 out:
10762 if (r == 0 && _debug_mdata_eio(oid)) {
10763 r = -EIO;
10764 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10765 }
10766 dout(10) << __func__ << " " << c->cid << " " << oid
10767 << " = " << r << dendl;
10768 return r;
10769 }
10770
10771 int BlueStore::list_collections(vector<coll_t>& ls)
10772 {
10773 std::shared_lock l(coll_lock);
10774 ls.reserve(coll_map.size());
10775 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
10776 p != coll_map.end();
10777 ++p)
10778 ls.push_back(p->first);
10779 return 0;
10780 }
10781
10782 bool BlueStore::collection_exists(const coll_t& c)
10783 {
10784 std::shared_lock l(coll_lock);
10785 return coll_map.count(c);
10786 }
10787
10788 int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
10789 {
10790 dout(15) << __func__ << " " << ch->cid << dendl;
10791 vector<ghobject_t> ls;
10792 ghobject_t next;
10793 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
10794 &ls, &next);
10795 if (r < 0) {
10796 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
10797 << dendl;
10798 return r;
10799 }
10800 *empty = ls.empty();
10801 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
10802 return 0;
10803 }
10804
10805 int BlueStore::collection_bits(CollectionHandle& ch)
10806 {
10807 dout(15) << __func__ << " " << ch->cid << dendl;
10808 Collection *c = static_cast<Collection*>(ch.get());
10809 std::shared_lock l(c->lock);
10810 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
10811 return c->cnode.bits;
10812 }
10813
10814 int BlueStore::collection_list(
10815 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10816 vector<ghobject_t> *ls, ghobject_t *pnext)
10817 {
10818 Collection *c = static_cast<Collection *>(c_.get());
10819 c->flush();
10820 dout(15) << __func__ << " " << c->cid
10821 << " start " << start << " end " << end << " max " << max << dendl;
10822 int r;
10823 {
10824 std::shared_lock l(c->lock);
10825 r = _collection_list(c, start, end, max, false, ls, pnext);
10826 }
10827
10828 dout(10) << __func__ << " " << c->cid
10829 << " start " << start << " end " << end << " max " << max
10830 << " = " << r << ", ls.size() = " << ls->size()
10831 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10832 return r;
10833 }
10834
10835 int BlueStore::collection_list_legacy(
10836 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10837 vector<ghobject_t> *ls, ghobject_t *pnext)
10838 {
10839 Collection *c = static_cast<Collection *>(c_.get());
10840 c->flush();
10841 dout(15) << __func__ << " " << c->cid
10842 << " start " << start << " end " << end << " max " << max << dendl;
10843 int r;
10844 {
10845 std::shared_lock l(c->lock);
10846 r = _collection_list(c, start, end, max, true, ls, pnext);
10847 }
10848
10849 dout(10) << __func__ << " " << c->cid
10850 << " start " << start << " end " << end << " max " << max
10851 << " = " << r << ", ls.size() = " << ls->size()
10852 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10853 return r;
10854 }
10855
10856 int BlueStore::_collection_list(
10857 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
10858 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
10859 {
10860
10861 if (!c->exists)
10862 return -ENOENT;
10863
10864 auto start_time = mono_clock::now();
10865 int r = 0;
10866 ghobject_t static_next;
10867 std::unique_ptr<CollectionListIterator> it;
10868 ghobject_t coll_range_temp_start, coll_range_temp_end;
10869 ghobject_t coll_range_start, coll_range_end;
10870 bool set_next = false;
10871 ghobject_t pend;
10872 bool temp;
10873
10874 if (!pnext)
10875 pnext = &static_next;
10876
10877 if (start.is_max() || start.hobj.is_max()) {
10878 goto out;
10879 }
10880 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
10881 &coll_range_temp_end, &coll_range_start, &coll_range_end);
10882 dout(20) << __func__
10883 << " range " << coll_range_temp_start
10884 << " to " << coll_range_temp_end
10885 << " and " << coll_range_start
10886 << " to " << coll_range_end
10887 << " start " << start << dendl;
10888 if (legacy) {
10889 it = std::make_unique<SimpleCollectionListIterator>(
10890 cct, db->get_iterator(PREFIX_OBJ));
10891 } else {
10892 it = std::make_unique<SortedCollectionListIterator>(
10893 db->get_iterator(PREFIX_OBJ));
10894 }
10895 if (start == ghobject_t() ||
10896 start.hobj == hobject_t() ||
10897 start == c->cid.get_min_hobj()) {
10898 it->upper_bound(coll_range_temp_start);
10899 temp = true;
10900 } else {
10901 if (start.hobj.is_temp()) {
10902 temp = true;
10903 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
10904 } else {
10905 temp = false;
10906 ceph_assert(start >= coll_range_start && start < coll_range_end);
10907 }
10908 dout(20) << __func__ << " temp=" << (int)temp << dendl;
10909 it->lower_bound(start);
10910 }
10911 if (end.hobj.is_max()) {
10912 pend = temp ? coll_range_temp_end : coll_range_end;
10913 } else {
10914 if (end.hobj.is_temp()) {
10915 if (temp)
10916 pend = end;
10917 else
10918 goto out;
10919 } else {
10920 pend = temp ? coll_range_temp_end : end;
10921 }
10922 }
10923 dout(20) << __func__ << " pend " << pend << dendl;
10924 while (true) {
10925 if (!it->valid() || it->is_ge(pend)) {
10926 if (!it->valid())
10927 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
10928 else
10929 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
10930 if (temp) {
10931 if (end.hobj.is_temp()) {
10932 if (it->valid() && it->is_lt(coll_range_temp_end)) {
10933 *pnext = it->oid();
10934 set_next = true;
10935 }
10936 break;
10937 }
10938 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
10939 temp = false;
10940 it->upper_bound(coll_range_start);
10941 if (end.hobj.is_max())
10942 pend = coll_range_end;
10943 else
10944 pend = end;
10945 dout(30) << __func__ << " pend " << pend << dendl;
10946 continue;
10947 }
10948 if (it->valid() && it->is_lt(coll_range_end)) {
10949 *pnext = it->oid();
10950 set_next = true;
10951 }
10952 break;
10953 }
10954 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
10955 if (ls->size() >= (unsigned)max) {
10956 dout(20) << __func__ << " reached max " << max << dendl;
10957 *pnext = it->oid();
10958 set_next = true;
10959 break;
10960 }
10961 ls->push_back(it->oid());
10962 it->next();
10963 }
10964 out:
10965 if (!set_next) {
10966 *pnext = ghobject_t::get_max();
10967 }
10968 log_latency_fn(
10969 __func__,
10970 l_bluestore_clist_lat,
10971 mono_clock::now() - start_time,
10972 cct->_conf->bluestore_log_collection_list_age,
10973 [&] (const ceph::timespan& lat) {
10974 ostringstream ostr;
10975 ostr << ", lat = " << timespan_str(lat)
10976 << " cid =" << c->cid
10977 << " start " << start << " end " << end
10978 << " max " << max;
10979 return ostr.str();
10980 }
10981 );
10982 return r;
10983 }
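// Listing runs in at most two passes over PREFIX_OBJ: first the collection's
// temp-namespace key range, then the regular range. When max entries have
// been returned, *pnext is set to the first object that was not returned so
// a subsequent call can resume exactly where this one stopped.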
10984
10985 int BlueStore::omap_get(
10986 CollectionHandle &c_, ///< [in] Collection containing oid
10987 const ghobject_t &oid, ///< [in] Object containing omap
10988 bufferlist *header, ///< [out] omap header
10989 map<string, bufferlist> *out ///< [out] Key to value map
10990 )
10991 {
10992 Collection *c = static_cast<Collection *>(c_.get());
10993 return _omap_get(c, oid, header, out);
10994 }
10995
10996 int BlueStore::_omap_get(
10997 Collection *c, ///< [in] Collection containing oid
10998 const ghobject_t &oid, ///< [in] Object containing omap
10999 bufferlist *header, ///< [out] omap header
11000 map<string, bufferlist> *out ///< [out] Key to value map
11001 )
11002 {
11003 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11004 if (!c->exists)
11005 return -ENOENT;
11006 std::shared_lock l(c->lock);
11007 int r = 0;
11008 OnodeRef o = c->get_onode(oid, false);
11009 if (!o || !o->exists) {
11010 r = -ENOENT;
11011 goto out;
11012 }
11013 r = _onode_omap_get(o, header, out);
11014 out:
11015 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11016 << dendl;
11017 return r;
11018 }
11019
11020 int BlueStore::_onode_omap_get(
11021 const OnodeRef &o, ///< [in] Object containing omap
11022 bufferlist *header, ///< [out] omap header
11023 map<string, bufferlist> *out ///< [out] Key to value map
11024 )
11025 {
11026 int r = 0;
11027 if (!o || !o->exists) {
11028 r = -ENOENT;
11029 goto out;
11030 }
11031 if (!o->onode.has_omap())
11032 goto out;
11033 o->flush();
11034 {
11035 const string& prefix = o->get_omap_prefix();
11036 KeyValueDB::Iterator it = db->get_iterator(prefix);
11037 string head, tail;
11038 o->get_omap_header(&head);
11039 o->get_omap_tail(&tail);
11040 it->lower_bound(head);
11041 while (it->valid()) {
11042 if (it->key() == head) {
11043 dout(30) << __func__ << " got header" << dendl;
11044 *header = it->value();
11045 } else if (it->key() >= tail) {
11046 dout(30) << __func__ << " reached tail" << dendl;
11047 break;
11048 } else {
11049 string user_key;
11050 o->decode_omap_key(it->key(), &user_key);
11051 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
11052 << " -> " << user_key << dendl;
11053 (*out)[user_key] = it->value();
11054 }
11055 it->next();
11056 }
11057 }
11058 out:
11059 return r;
11060 }
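// Key layout sketch: all omap rows for an onode live under
// o->get_omap_prefix(), bounded by the header key (get_omap_header) and the
// tail key (get_omap_tail); every key in between decodes back to the user key
// via decode_omap_key(), so the loop above is a single ordered scan of
// [head, tail).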
11061
11062 int BlueStore::omap_get_header(
11063 CollectionHandle &c_, ///< [in] Collection containing oid
11064 const ghobject_t &oid, ///< [in] Object containing omap
11065 bufferlist *header, ///< [out] omap header
11066 bool allow_eio ///< [in] don't assert on eio
11067 )
11068 {
11069 Collection *c = static_cast<Collection *>(c_.get());
11070 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11071 if (!c->exists)
11072 return -ENOENT;
11073 std::shared_lock l(c->lock);
11074 int r = 0;
11075 OnodeRef o = c->get_onode(oid, false);
11076 if (!o || !o->exists) {
11077 r = -ENOENT;
11078 goto out;
11079 }
11080 if (!o->onode.has_omap())
11081 goto out;
11082 o->flush();
11083 {
11084 string head;
11085 o->get_omap_header(&head);
11086 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
11087 dout(30) << __func__ << " got header" << dendl;
11088 } else {
11089 dout(30) << __func__ << " no header" << dendl;
11090 }
11091 }
11092 out:
11093 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11094 << dendl;
11095 return r;
11096 }
11097
11098 int BlueStore::omap_get_keys(
11099 CollectionHandle &c_, ///< [in] Collection containing oid
11100 const ghobject_t &oid, ///< [in] Object containing omap
11101 set<string> *keys ///< [out] Keys defined on oid
11102 )
11103 {
11104 Collection *c = static_cast<Collection *>(c_.get());
11105 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11106 if (!c->exists)
11107 return -ENOENT;
11108 auto start1 = mono_clock::now();
11109 std::shared_lock l(c->lock);
11110 int r = 0;
11111 OnodeRef o = c->get_onode(oid, false);
11112 if (!o || !o->exists) {
11113 r = -ENOENT;
11114 goto out;
11115 }
11116 if (!o->onode.has_omap())
11117 goto out;
11118 o->flush();
11119 {
11120 const string& prefix = o->get_omap_prefix();
11121 KeyValueDB::Iterator it = db->get_iterator(prefix);
11122 string head, tail;
11123 o->get_omap_key(string(), &head);
11124 o->get_omap_tail(&tail);
11125 it->lower_bound(head);
11126 while (it->valid()) {
11127 if (it->key() >= tail) {
11128 dout(30) << __func__ << " reached tail" << dendl;
11129 break;
11130 }
11131 string user_key;
11132 o->decode_omap_key(it->key(), &user_key);
11133 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
11134 << " -> " << user_key << dendl;
11135 keys->insert(user_key);
11136 it->next();
11137 }
11138 }
11139 out:
11140 c->store->log_latency(
11141 __func__,
11142 l_bluestore_omap_get_keys_lat,
11143 mono_clock::now() - start1,
11144 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11145
11146 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11147 << dendl;
11148 return r;
11149 }
11150
11151 int BlueStore::omap_get_values(
11152 CollectionHandle &c_, ///< [in] Collection containing oid
11153 const ghobject_t &oid, ///< [in] Object containing omap
11154 const set<string> &keys, ///< [in] Keys to get
11155 map<string, bufferlist> *out ///< [out] Returned keys and values
11156 )
11157 {
11158 Collection *c = static_cast<Collection *>(c_.get());
11159 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11160 if (!c->exists)
11161 return -ENOENT;
11162 std::shared_lock l(c->lock);
11163 auto start1 = mono_clock::now();
11164 int r = 0;
11165 string final_key;
11166 OnodeRef o = c->get_onode(oid, false);
11167 if (!o || !o->exists) {
11168 r = -ENOENT;
11169 goto out;
11170 }
11171 if (!o->onode.has_omap()) {
11172 goto out;
11173 }
11174 o->flush();
11175 {
11176 const string& prefix = o->get_omap_prefix();
11177 o->get_omap_key(string(), &final_key);
11178 size_t base_key_len = final_key.size();
11179 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
11180 final_key.resize(base_key_len); // keep prefix
11181 final_key += *p;
11182 bufferlist val;
11183 if (db->get(prefix, final_key, &val) >= 0) {
11184 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
11185 << " -> " << *p << dendl;
11186 out->insert(make_pair(*p, val));
11187 }
11188 }
11189 }
11190 out:
11191 c->store->log_latency(
11192 __func__,
11193 l_bluestore_omap_get_values_lat,
11194 mono_clock::now() - start1,
11195 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11196
11197 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11198 << dendl;
11199 return r;
11200 }
11201
11202 #ifdef WITH_SEASTAR
11203 int BlueStore::omap_get_values(
11204 CollectionHandle &c_, ///< [in] Collection containing oid
11205 const ghobject_t &oid, ///< [in] Object containing omap
11206 const std::optional<string> &start_after, ///< [in] list only keys after this one
11207 map<string, bufferlist> *output ///< [out] Returned keys and values
11208 )
11209 {
11210 Collection *c = static_cast<Collection *>(c_.get());
11211 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11212 if (!c->exists)
11213 return -ENOENT;
11214 std::shared_lock l(c->lock);
11215 int r = 0;
11216 OnodeRef o = c->get_onode(oid, false);
11217 if (!o || !o->exists) {
11218 r = -ENOENT;
11219 goto out;
11220 }
11221 if (!o->onode.has_omap()) {
11222 goto out;
11223 }
11224 o->flush();
11225 {
11226 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
11227 if (!iter) {
11228 r = -ENOENT;
11229 goto out;
11230 }
11231 iter->upper_bound(*start_after);
11232 for (; iter->valid(); iter->next()) {
11233 output->insert(make_pair(iter->key(), iter->value()));
11234 }
11235 }
11236
11237 out:
11238 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11239 << dendl;
11240 return r;
11241 }
11242 #endif
11243
11244 int BlueStore::omap_check_keys(
11245 CollectionHandle &c_, ///< [in] Collection containing oid
11246 const ghobject_t &oid, ///< [in] Object containing omap
11247 const set<string> &keys, ///< [in] Keys to check
11248 set<string> *out ///< [out] Subset of keys defined on oid
11249 )
11250 {
11251 Collection *c = static_cast<Collection *>(c_.get());
11252 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11253 if (!c->exists)
11254 return -ENOENT;
11255 std::shared_lock l(c->lock);
11256 int r = 0;
11257 string final_key;
11258 OnodeRef o = c->get_onode(oid, false);
11259 if (!o || !o->exists) {
11260 r = -ENOENT;
11261 goto out;
11262 }
11263 if (!o->onode.has_omap()) {
11264 goto out;
11265 }
11266 o->flush();
11267 {
11268 const string& prefix = o->get_omap_prefix();
11269 o->get_omap_key(string(), &final_key);
11270 size_t base_key_len = final_key.size();
11271 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
11272 final_key.resize(base_key_len); // keep prefix
11273 final_key += *p;
11274 bufferlist val;
11275 if (db->get(prefix, final_key, &val) >= 0) {
11276 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
11277 << " -> " << *p << dendl;
11278 out->insert(*p);
11279 } else {
11280 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
11281 << " -> " << *p << dendl;
11282 }
11283 }
11284 }
11285 out:
11286 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11287 << dendl;
11288 return r;
11289 }
11290
11291 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
11292 CollectionHandle &c_, ///< [in] collection
11293 const ghobject_t &oid ///< [in] object
11294 )
11295 {
11296 Collection *c = static_cast<Collection *>(c_.get());
11297 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
11298 if (!c->exists) {
11299 return ObjectMap::ObjectMapIterator();
11300 }
11301 std::shared_lock l(c->lock);
11302 OnodeRef o = c->get_onode(oid, false);
11303 if (!o || !o->exists) {
11304 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
11305 return ObjectMap::ObjectMapIterator();
11306 }
11307 o->flush();
11308 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
11309 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
11310 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
11311 }
11312
11313 // -----------------
11314 // write helpers
11315
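// _get_ondisk_reserved(): size of the region at the start of the block device
// that is never handed to the allocator (bdev label + superblock area), rounded
// up to min_alloc_size. For example (illustrative values only), with
// SUPER_RESERVED = 8192 and a 64 KiB min_alloc_size the reserved region becomes
// 64 KiB; with a 4 KiB min_alloc_size it stays 8192.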
11316 uint64_t BlueStore::_get_ondisk_reserved() const {
11317 return round_up_to(
11318 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
11319 }
11320
11321 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
11322 {
11323 dout(10) << __func__ << " ondisk_format " << ondisk_format
11324 << " min_compat_ondisk_format " << min_compat_ondisk_format
11325 << dendl;
11326 ceph_assert(ondisk_format == latest_ondisk_format);
11327 {
11328 bufferlist bl;
11329 encode(ondisk_format, bl);
11330 t->set(PREFIX_SUPER, "ondisk_format", bl);
11331 }
11332 {
11333 bufferlist bl;
11334 encode(min_compat_ondisk_format, bl);
11335 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
11336 }
11337 }
11338
11339 int BlueStore::_open_super_meta()
11340 {
11341 // nid
11342 {
11343 nid_max = 0;
11344 bufferlist bl;
11345 db->get(PREFIX_SUPER, "nid_max", &bl);
11346 auto p = bl.cbegin();
11347 try {
11348 uint64_t v;
11349 decode(v, p);
11350 nid_max = v;
11351 } catch (buffer::error& e) {
11352 derr << __func__ << " unable to read nid_max" << dendl;
11353 return -EIO;
11354 }
11355 dout(1) << __func__ << " old nid_max " << nid_max << dendl;
11356 nid_last = nid_max.load();
11357 }
11358
11359 // blobid
11360 {
11361 blobid_max = 0;
11362 bufferlist bl;
11363 db->get(PREFIX_SUPER, "blobid_max", &bl);
11364 auto p = bl.cbegin();
11365 try {
11366 uint64_t v;
11367 decode(v, p);
11368 blobid_max = v;
11369 } catch (buffer::error& e) {
11370 derr << __func__ << " unable to read blobid_max" << dendl;
11371 return -EIO;
11372 }
11373 dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
11374 blobid_last = blobid_max.load();
11375 }
11376
11377 // freelist
11378 {
11379 bufferlist bl;
11380 db->get(PREFIX_SUPER, "freelist_type", &bl);
11381 if (bl.length()) {
11382 freelist_type = std::string(bl.c_str(), bl.length());
11383 dout(1) << __func__ << " freelist_type " << freelist_type << dendl;
11384 } else {
11385 ceph_abort_msg("extent freelist manager is not supported");
11386 }
11387 }
11388
11389 // ondisk format
11390 int32_t compat_ondisk_format = 0;
11391 {
11392 bufferlist bl;
11393 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
11394 if (r < 0) {
11395 // base case: kraken bluestore is v1 and readable by v1
11396 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
11397 << dendl;
11398 ondisk_format = 1;
11399 compat_ondisk_format = 1;
11400 } else {
11401 auto p = bl.cbegin();
11402 try {
11403 decode(ondisk_format, p);
11404 } catch (buffer::error& e) {
11405 derr << __func__ << " unable to read ondisk_format" << dendl;
11406 return -EIO;
11407 }
11408 bl.clear();
11409 {
11410 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11411 ceph_assert(!r);
11412 auto p = bl.cbegin();
11413 try {
11414 decode(compat_ondisk_format, p);
11415 } catch (buffer::error& e) {
11416 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
11417 return -EIO;
11418 }
11419 }
11420 }
11421 dout(1) << __func__ << " ondisk_format " << ondisk_format
11422 << " compat_ondisk_format " << compat_ondisk_format
11423 << dendl;
11424 }
11425
11426 if (latest_ondisk_format < compat_ondisk_format) {
11427 derr << __func__ << " compat_ondisk_format is "
11428 << compat_ondisk_format << " but we only understand version "
11429 << latest_ondisk_format << dendl;
11430 return -EPERM;
11431 }
11432
11433 {
11434 bufferlist bl;
11435 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11436 auto p = bl.cbegin();
11437 try {
11438 uint64_t val;
11439 decode(val, p);
11440 min_alloc_size = val;
11441 min_alloc_size_order = ctz(val);
11442 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
11443 } catch (buffer::error& e) {
11444 derr << __func__ << " unable to read min_alloc_size" << dendl;
11445 return -EIO;
11446 }
11447 dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11448 << std::dec << dendl;
11449 }
11450
11451 _set_per_pool_omap();
11452
11453 _open_statfs();
11454 _set_alloc_sizes();
11455 _set_throttle_params();
11456
11457 _set_csum();
11458 _set_compression();
11459 _set_blob_size();
11460
11461 _validate_bdev();
11462 return 0;
11463 }
11464
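// _upgrade_super() walks ondisk_format forward one version at a time (1 -> 2 ->
// 3 -> 4 in this release), staging the KV changes in a single transaction that
// is committed synchronously once the latest format has been reached.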
11465 int BlueStore::_upgrade_super()
11466 {
11467 dout(1) << __func__ << " from " << ondisk_format << ", latest "
11468 << latest_ondisk_format << dendl;
11469 if (ondisk_format < latest_ondisk_format) {
11470 ceph_assert(ondisk_format > 0);
11471 ceph_assert(ondisk_format < latest_ondisk_format);
11472
11473 KeyValueDB::Transaction t = db->get_transaction();
11474 if (ondisk_format == 1) {
11475 // changes:
11476 // - super: added ondisk_format
11477 // - super: added min_readable_ondisk_format
11478 // - super: added min_compat_ondisk_format
11479 // - super: added min_alloc_size
11480 // - super: removed min_min_alloc_size
11481 {
11482 bufferlist bl;
11483 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
11484 auto p = bl.cbegin();
11485 try {
11486 uint64_t val;
11487 decode(val, p);
11488 min_alloc_size = val;
11489 } catch (buffer::error& e) {
11490 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
11491 return -EIO;
11492 }
11493 t->set(PREFIX_SUPER, "min_alloc_size", bl);
11494 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
11495 }
11496 ondisk_format = 2;
11497 }
11498 if (ondisk_format == 2) {
11499 // changes:
11500 // - onode has FLAG_PER_POOL_OMAP. Note that we do not know that *all*
11501 // onodes are using the per-pool prefix until a repair is run; at that
11502 // point the per_pool_omap=1 key will be set.
11503 // - super: added per_pool_omap key, which indicates that *all* objects
11504 // are using the new prefix and key format
11505 ondisk_format = 3;
11506 }
11507 if (ondisk_format == 3) {
11508 // changes:
11509 // - FreelistManager keeps meta within bdev label
11510 int r = _write_out_fm_meta(0);
11511 ceph_assert(r == 0);
11512 ondisk_format = 4;
11513 }
11514 // This has to be the last operation
11515 _prepare_ondisk_format_super(t);
11516 int r = db->submit_transaction_sync(t);
11517 ceph_assert(r == 0);
11518 }
11519 // done
11520 dout(1) << __func__ << " done" << dendl;
11521 return 0;
11522 }
11523
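// nid/blobid assignment is lock-free: nid_last/blobid_last are atomics bumped
// here, while _kv_sync_thread() persists a larger nid_max/blobid_max before the
// preallocated window (bluestore_nid_prealloc / bluestore_blobid_prealloc) runs out.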
11524 void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
11525 {
11526 if (o->onode.nid) {
11527 ceph_assert(o->exists);
11528 return;
11529 }
11530 uint64_t nid = ++nid_last;
11531 dout(20) << __func__ << " " << nid << dendl;
11532 o->onode.nid = nid;
11533 txc->last_nid = nid;
11534 o->exists = true;
11535 }
11536
11537 uint64_t BlueStore::_assign_blobid(TransContext *txc)
11538 {
11539 uint64_t bid = ++blobid_last;
11540 dout(20) << __func__ << " " << bid << dendl;
11541 txc->last_blobid = bid;
11542 return bid;
11543 }
11544
11545 void BlueStore::get_db_statistics(Formatter *f)
11546 {
11547 db->get_statistics(f);
11548 }
11549
11550 BlueStore::TransContext *BlueStore::_txc_create(
11551 Collection *c, OpSequencer *osr,
11552 list<Context*> *on_commits)
11553 {
11554 TransContext *txc = new TransContext(cct, c, osr, on_commits);
11555 txc->t = db->get_transaction();
11556 osr->queue_new(txc);
11557 dout(20) << __func__ << " osr " << osr << " = " << txc
11558 << " seq " << txc->seq << dendl;
11559 return txc;
11560 }
11561
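// Cost model: every txc pays one "io" for the kv commit plus one per queued aio,
// each weighted by throttle_cost_per_io, plus its raw byte count. Illustrative
// example: 3 aios and 8192 bytes with a per-io cost of 670000 gives
// cost = (1 + 3) * 670000 + 8192.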
11562 void BlueStore::_txc_calc_cost(TransContext *txc)
11563 {
11564 // one "io" for the kv commit
11565 auto ios = 1 + txc->ioc.get_num_ios();
11566 auto cost = throttle_cost_per_io.load();
11567 txc->cost = ios * cost + txc->bytes;
11568 txc->ios = ios;
11569 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
11570 << ios << " ios * " << cost << " + " << txc->bytes
11571 << " bytes)" << dendl;
11572 }
11573
11574 void BlueStore::_txc_update_store_statfs(TransContext *txc)
11575 {
11576 if (txc->statfs_delta.is_empty())
11577 return;
11578
11579 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
11580 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
11581 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
11582 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
11583 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
11584
11585 bufferlist bl;
11586 txc->statfs_delta.encode(bl);
11587 if (per_pool_stat_collection) {
11588 string key;
11589 get_pool_stat_key(txc->osd_pool_id, &key);
11590 txc->t->merge(PREFIX_STAT, key, bl);
11591
11592 std::lock_guard l(vstatfs_lock);
11593 auto& stats = osd_pools[txc->osd_pool_id];
11594 stats += txc->statfs_delta;
11595
11596 vstatfs += txc->statfs_delta; //non-persistent in this mode
11597
11598 } else {
11599 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
11600
11601 std::lock_guard l(vstatfs_lock);
11602 vstatfs += txc->statfs_delta;
11603 }
11604 txc->statfs_delta.reset();
11605 }
11606
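// Txc state machine driver. The usual path is PREPARE -> AIO_WAIT -> IO_DONE ->
// KV_QUEUED -> KV_SUBMITTED -> KV_DONE -> FINISHING -> DONE, with a detour
// through DEFERRED_QUEUED / DEFERRED_CLEANUP when the txc carries deferred
// writes. Each call advances the txc as far as it can without blocking.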
11607 void BlueStore::_txc_state_proc(TransContext *txc)
11608 {
11609 while (true) {
11610 dout(10) << __func__ << " txc " << txc
11611 << " " << txc->get_state_name() << dendl;
11612 switch (txc->state) {
11613 case TransContext::STATE_PREPARE:
11614 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
11615 if (txc->ioc.has_pending_aios()) {
11616 txc->state = TransContext::STATE_AIO_WAIT;
11617 txc->had_ios = true;
11618 _txc_aio_submit(txc);
11619 return;
11620 }
11621 // ** fall-thru **
11622
11623 case TransContext::STATE_AIO_WAIT:
11624 {
11625 mono_clock::duration lat = throttle.log_state_latency(
11626 *txc, logger, l_bluestore_state_aio_wait_lat);
11627 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11628 dout(0) << __func__ << " slow aio_wait, txc = " << txc
11629 << ", latency = " << lat
11630 << dendl;
11631 }
11632 }
11633
11634 _txc_finish_io(txc); // may trigger blocked txc's too
11635 return;
11636
11637 case TransContext::STATE_IO_DONE:
11638 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
11639 if (txc->had_ios) {
11640 ++txc->osr->txc_with_unstable_io;
11641 }
11642 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
11643 txc->state = TransContext::STATE_KV_QUEUED;
11644 if (cct->_conf->bluestore_sync_submit_transaction) {
11645 if (txc->last_nid >= nid_max ||
11646 txc->last_blobid >= blobid_max) {
11647 dout(20) << __func__
11648 << " last_{nid,blobid} exceeds max, submit via kv thread"
11649 << dendl;
11650 } else if (txc->osr->kv_committing_serially) {
11651 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
11652 << dendl;
11653 // note: this is starvation-prone. once we have a txc in a busy
11654 // sequencer that is committing serially it is possible to keep
11655 // submitting new transactions fast enough that we get stuck doing
11656 // so. the alternative is to block here... fixme?
11657 } else if (txc->osr->txc_with_unstable_io) {
11658 dout(20) << __func__ << " prior txc(s) with unstable ios "
11659 << txc->osr->txc_with_unstable_io.load() << dendl;
11660 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
11661 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
11662 == 0) {
11663 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
11664 << dendl;
11665 } else {
11666 _txc_apply_kv(txc, true);
11667 }
11668 }
11669 {
11670 std::lock_guard l(kv_lock);
11671 kv_queue.push_back(txc);
11672 if (!kv_sync_in_progress) {
11673 kv_sync_in_progress = true;
11674 kv_cond.notify_one();
11675 }
11676 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
11677 kv_queue_unsubmitted.push_back(txc);
11678 ++txc->osr->kv_committing_serially;
11679 }
11680 if (txc->had_ios)
11681 kv_ios++;
11682 kv_throttle_costs += txc->cost;
11683 }
11684 return;
11685 case TransContext::STATE_KV_SUBMITTED:
11686 _txc_committed_kv(txc);
11687 // ** fall-thru **
11688
11689 case TransContext::STATE_KV_DONE:
11690 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
11691 if (txc->deferred_txn) {
11692 txc->state = TransContext::STATE_DEFERRED_QUEUED;
11693 _deferred_queue(txc);
11694 return;
11695 }
11696 txc->state = TransContext::STATE_FINISHING;
11697 break;
11698
11699 case TransContext::STATE_DEFERRED_CLEANUP:
11700 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
11701 txc->state = TransContext::STATE_FINISHING;
11702 // ** fall-thru **
11703
11704 case TransContext::STATE_FINISHING:
11705 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
11706 _txc_finish(txc);
11707 return;
11708
11709 default:
11710 derr << __func__ << " unexpected txc " << txc
11711 << " state " << txc->get_state_name() << dendl;
11712 ceph_abort_msg("unexpected txc state");
11713 return;
11714 }
11715 }
11716 }
11717
11718 void BlueStore::_txc_finish_io(TransContext *txc)
11719 {
11720 dout(20) << __func__ << " " << txc << dendl;
11721
11722 /*
11723 * we need to preserve the order of kv transactions,
11724 * even though aio will complete in any order.
11725 */
11726
11727 OpSequencer *osr = txc->osr.get();
11728 std::lock_guard l(osr->qlock);
11729 txc->state = TransContext::STATE_IO_DONE;
11730 txc->ioc.release_running_aios();
11731 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
11732 while (p != osr->q.begin()) {
11733 --p;
11734 if (p->state < TransContext::STATE_IO_DONE) {
11735 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
11736 << p->get_state_name() << dendl;
11737 return;
11738 }
11739 if (p->state > TransContext::STATE_IO_DONE) {
11740 ++p;
11741 break;
11742 }
11743 }
11744 do {
11745 _txc_state_proc(&*p++);
11746 } while (p != osr->q.end() &&
11747 p->state == TransContext::STATE_IO_DONE);
11748
11749 if (osr->kv_submitted_waiters) {
11750 osr->qcond.notify_all();
11751 }
11752 }
11753
11754 void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
11755 {
11756 dout(20) << __func__ << " txc " << txc
11757 << " onodes " << txc->onodes
11758 << " shared_blobs " << txc->shared_blobs
11759 << dendl;
11760
11761 // finalize onodes
11762 for (auto o : txc->onodes) {
11763 _record_onode(o, t);
11764 o->flushing_count++;
11765 }
11766
11767 // objects we modified but didn't affect the onode
11768 auto p = txc->modified_objects.begin();
11769 while (p != txc->modified_objects.end()) {
11770 if (txc->onodes.count(*p) == 0) {
11771 (*p)->flushing_count++;
11772 ++p;
11773 } else {
11774 // remove dups with onodes list to avoid problems in _txc_finish
11775 p = txc->modified_objects.erase(p);
11776 }
11777 }
11778
11779 // finalize shared_blobs
11780 for (auto sb : txc->shared_blobs) {
11781 string key;
11782 auto sbid = sb->get_sbid();
11783 get_shared_blob_key(sbid, &key);
11784 if (sb->persistent->empty()) {
11785 dout(20) << __func__ << " shared_blob 0x"
11786 << std::hex << sbid << std::dec
11787 << " is empty" << dendl;
11788 t->rmkey(PREFIX_SHARED_BLOB, key);
11789 } else {
11790 bufferlist bl;
11791 encode(*(sb->persistent), bl);
11792 dout(20) << __func__ << " shared_blob 0x"
11793 << std::hex << sbid << std::dec
11794 << " is " << bl.length() << " " << *sb << dendl;
11795 t->set(PREFIX_SHARED_BLOB, key, bl);
11796 }
11797 }
11798 }
11799
11800 void BlueStore::BSPerfTracker::update_from_perfcounters(
11801 PerfCounters &logger)
11802 {
11803 os_commit_latency_ns.consume_next(
11804 logger.get_tavg_ns(
11805 l_bluestore_commit_lat));
11806 os_apply_latency_ns.consume_next(
11807 logger.get_tavg_ns(
11808 l_bluestore_commit_lat));
11809 }
11810
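// Worked example for the overlap handling below (illustrative offsets): if this
// txc both allocated and released 0x18000~0x8000, that common range is
// subtracted from both interval sets before they are applied, so the
// FreelistManager never sees an allocate and a release of the same blocks in
// one transaction.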
11811 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
11812 {
11813 dout(20) << __func__ << " txc " << txc << std::hex
11814 << " allocated 0x" << txc->allocated
11815 << " released 0x" << txc->released
11816 << std::dec << dendl;
11817
11818 // We have to handle the case where we allocate *and* deallocate the
11819 // same region in this transaction. The freelist doesn't like that.
11820 // (Actually, the only thing that cares is the BitmapFreelistManager
11821 // debug check. But that's important.)
11822 interval_set<uint64_t> tmp_allocated, tmp_released;
11823 interval_set<uint64_t> *pallocated = &txc->allocated;
11824 interval_set<uint64_t> *preleased = &txc->released;
11825 if (!txc->allocated.empty() && !txc->released.empty()) {
11826 interval_set<uint64_t> overlap;
11827 overlap.intersection_of(txc->allocated, txc->released);
11828 if (!overlap.empty()) {
11829 tmp_allocated = txc->allocated;
11830 tmp_allocated.subtract(overlap);
11831 tmp_released = txc->released;
11832 tmp_released.subtract(overlap);
11833 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
11834 << ", new allocated 0x" << tmp_allocated
11835 << " released 0x" << tmp_released << std::dec
11836 << dendl;
11837 pallocated = &tmp_allocated;
11838 preleased = &tmp_released;
11839 }
11840 }
11841
11842 // update freelist with non-overlap sets
11843 for (interval_set<uint64_t>::iterator p = pallocated->begin();
11844 p != pallocated->end();
11845 ++p) {
11846 fm->allocate(p.get_start(), p.get_len(), t);
11847 }
11848 for (interval_set<uint64_t>::iterator p = preleased->begin();
11849 p != preleased->end();
11850 ++p) {
11851 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
11852 << "~" << p.get_len() << std::dec << dendl;
11853 fm->release(p.get_start(), p.get_len(), t);
11854 }
11855
11856 _txc_update_store_statfs(txc);
11857 }
11858
11859 void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
11860 {
11861 ceph_assert(txc->state == TransContext::STATE_KV_QUEUED);
11862 {
11863 #if defined(WITH_LTTNG)
11864 auto start = mono_clock::now();
11865 #endif
11866
11867 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11868 ceph_assert(r == 0);
11869 txc->state = TransContext::STATE_KV_SUBMITTED;
11870 if (txc->osr->kv_submitted_waiters) {
11871 std::lock_guard l(txc->osr->qlock);
11872 txc->osr->qcond.notify_all();
11873 }
11874
11875 #if defined(WITH_LTTNG)
11876 if (txc->tracing) {
11877 tracepoint(
11878 bluestore,
11879 transaction_kv_submit_latency,
11880 txc->osr->get_sequencer_id(),
11881 txc->seq,
11882 sync_submit_transaction,
11883 ceph::to_seconds<double>(mono_clock::now() - start));
11884 }
11885 #endif
11886 }
11887
11888 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
11889 for (auto& o : *ls) {
11890 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
11891 << dendl;
11892 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11893 std::lock_guard l(o->flush_lock);
11894 o->flush_cond.notify_all();
11895 }
11896 }
11897 }
11898 }
11899
11900 void BlueStore::_txc_committed_kv(TransContext *txc)
11901 {
11902 dout(20) << __func__ << " txc " << txc << dendl;
11903 throttle.complete_kv(*txc);
11904 {
11905 std::lock_guard l(txc->osr->qlock);
11906 txc->state = TransContext::STATE_KV_DONE;
11907 if (txc->ch->commit_queue) {
11908 txc->ch->commit_queue->queue(txc->oncommits);
11909 } else {
11910 finisher.queue(txc->oncommits);
11911 }
11912 }
11913 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
11914 log_latency_fn(
11915 __func__,
11916 l_bluestore_commit_lat,
11917 mono_clock::now() - txc->start,
11918 cct->_conf->bluestore_log_op_age,
11919 [&](auto lat) {
11920 return ", txc = " + stringify(txc);
11921 }
11922 );
11923 }
11924
11925 void BlueStore::_txc_finish(TransContext *txc)
11926 {
11927 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
11928 ceph_assert(txc->state == TransContext::STATE_FINISHING);
11929
11930 for (auto& sb : txc->shared_blobs_written) {
11931 sb->finish_write(txc->seq);
11932 }
11933 txc->shared_blobs_written.clear();
11934
11935 while (!txc->removed_collections.empty()) {
11936 _queue_reap_collection(txc->removed_collections.front());
11937 txc->removed_collections.pop_front();
11938 }
11939
11940 OpSequencerRef osr = txc->osr;
11941 bool empty = false;
11942 bool submit_deferred = false;
11943 OpSequencer::q_list_t releasing_txc;
11944 {
11945 std::lock_guard l(osr->qlock);
11946 txc->state = TransContext::STATE_DONE;
11947 bool notify = false;
11948 while (!osr->q.empty()) {
11949 TransContext *txc = &osr->q.front();
11950 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
11951 << dendl;
11952 if (txc->state != TransContext::STATE_DONE) {
11953 if (txc->state == TransContext::STATE_PREPARE &&
11954 deferred_aggressive) {
11955 // for _osr_drain_preceding()
11956 notify = true;
11957 }
11958 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
11959 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
11960 submit_deferred = true;
11961 }
11962 break;
11963 }
11964
11965 osr->q.pop_front();
11966 releasing_txc.push_back(*txc);
11967 }
11968
11969 if (osr->q.empty()) {
11970 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
11971 empty = true;
11972 }
11973
11974 // only drain()/drain_preceding() need wakeup,
11975 // other cases use kv_submitted_waiters
11976 if (notify || empty) {
11977 osr->qcond.notify_all();
11978 }
11979 }
11980
11981 while (!releasing_txc.empty()) {
11982 // release to allocator only after all preceding txc's have also
11983 // finished any deferred writes that potentially land in these
11984 // blocks
11985 auto txc = &releasing_txc.front();
11986 _txc_release_alloc(txc);
11987 releasing_txc.pop_front();
11988 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
11989 throttle.complete(*txc);
11990 delete txc;
11991 }
11992
11993 if (submit_deferred) {
11994 // we're pinning memory; flush! we could be more fine-grained here but
11995 // i'm not sure it's worth the bother.
11996 deferred_try_submit();
11997 }
11998
11999 if (empty && osr->zombie) {
12000 std::lock_guard l(zombie_osr_lock);
12001 if (zombie_osr_set.erase(osr->cid)) {
12002 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
12003 } else {
12004 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
12005 << dendl;
12006 }
12007 }
12008 }
12009
12010 void BlueStore::_txc_release_alloc(TransContext *txc)
12011 {
12012 // it's expected we're called with lazy_release_lock already taken!
12013 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
12014 int r = 0;
12015 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
12016 r = bdev->queue_discard(txc->released);
12017 if (r == 0) {
12018 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
12019 << txc->released << std::dec << dendl;
12020 goto out;
12021 }
12022 } else if (cct->_conf->bdev_enable_discard) {
12023 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
12024 bdev->discard(p.get_start(), p.get_len());
12025 }
12026 }
12027 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
12028 << txc->released << std::dec << dendl;
12029 alloc->release(txc->released);
12030 }
12031
12032 out:
12033 txc->allocated.clear();
12034 txc->released.clear();
12035 }
12036
12037 void BlueStore::_osr_attach(Collection *c)
12038 {
12039 // note: caller has RWLock on coll_map
12040 auto q = coll_map.find(c->cid);
12041 if (q != coll_map.end()) {
12042 c->osr = q->second->osr;
12043 ldout(cct, 10) << __func__ << " " << c->cid
12044 << " reusing osr " << c->osr << " from existing coll "
12045 << q->second << dendl;
12046 } else {
12047 std::lock_guard l(zombie_osr_lock);
12048 auto p = zombie_osr_set.find(c->cid);
12049 if (p == zombie_osr_set.end()) {
12050 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
12051 ldout(cct, 10) << __func__ << " " << c->cid
12052 << " fresh osr " << c->osr << dendl;
12053 } else {
12054 c->osr = p->second;
12055 zombie_osr_set.erase(p);
12056 ldout(cct, 10) << __func__ << " " << c->cid
12057 << " resurrecting zombie osr " << c->osr << dendl;
12058 c->osr->zombie = false;
12059 }
12060 }
12061 }
12062
12063 void BlueStore::_osr_register_zombie(OpSequencer *osr)
12064 {
12065 std::lock_guard l(zombie_osr_lock);
12066 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
12067 osr->zombie = true;
12068 auto i = zombie_osr_set.emplace(osr->cid, osr);
12069 // this is either a new insertion or the same osr is already there
12070 ceph_assert(i.second || i.first->second == osr);
12071 }
12072
12073 void BlueStore::_osr_drain_preceding(TransContext *txc)
12074 {
12075 OpSequencer *osr = txc->osr.get();
12076 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
12077 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
12078 {
12079 // submit anything pending
12080 deferred_lock.lock();
12081 if (osr->deferred_pending && !osr->deferred_running) {
12082 _deferred_submit_unlock(osr);
12083 } else {
12084 deferred_lock.unlock();
12085 }
12086 }
12087 {
12088 // wake up any previously finished deferred events
12089 std::lock_guard l(kv_lock);
12090 if (!kv_sync_in_progress) {
12091 kv_sync_in_progress = true;
12092 kv_cond.notify_one();
12093 }
12094 }
12095 osr->drain_preceding(txc);
12096 --deferred_aggressive;
12097 dout(10) << __func__ << " " << osr << " done" << dendl;
12098 }
12099
12100 void BlueStore::_osr_drain(OpSequencer *osr)
12101 {
12102 dout(10) << __func__ << " " << osr << dendl;
12103 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
12104 {
12105 // submit anything pending
12106 deferred_lock.lock();
12107 if (osr->deferred_pending && !osr->deferred_running) {
12108 _deferred_submit_unlock(osr);
12109 } else {
12110 deferred_lock.unlock();
12111 }
12112 }
12113 {
12114 // wake up any previously finished deferred events
12115 std::lock_guard l(kv_lock);
12116 if (!kv_sync_in_progress) {
12117 kv_sync_in_progress = true;
12118 kv_cond.notify_one();
12119 }
12120 }
12121 osr->drain();
12122 --deferred_aggressive;
12123 dout(10) << __func__ << " " << osr << " done" << dendl;
12124 }
12125
12126 void BlueStore::_osr_drain_all()
12127 {
12128 dout(10) << __func__ << dendl;
12129
12130 set<OpSequencerRef> s;
12131 vector<OpSequencerRef> zombies;
12132 {
12133 std::shared_lock l(coll_lock);
12134 for (auto& i : coll_map) {
12135 s.insert(i.second->osr);
12136 }
12137 }
12138 {
12139 std::lock_guard l(zombie_osr_lock);
12140 for (auto& i : zombie_osr_set) {
12141 s.insert(i.second);
12142 zombies.push_back(i.second);
12143 }
12144 }
12145 dout(20) << __func__ << " osr_set " << s << dendl;
12146
12147 ++deferred_aggressive;
12148 {
12149 // submit anything pending
12150 deferred_try_submit();
12151 }
12152 {
12153 // wake up any previously finished deferred events
12154 std::lock_guard l(kv_lock);
12155 kv_cond.notify_one();
12156 }
12157 {
12158 std::lock_guard l(kv_finalize_lock);
12159 kv_finalize_cond.notify_one();
12160 }
12161 for (auto osr : s) {
12162 dout(20) << __func__ << " drain " << osr << dendl;
12163 osr->drain();
12164 }
12165 --deferred_aggressive;
12166
12167 {
12168 std::lock_guard l(zombie_osr_lock);
12169 for (auto& osr : zombies) {
12170 if (zombie_osr_set.erase(osr->cid)) {
12171 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
12172 ceph_assert(osr->q.empty());
12173 } else if (osr->zombie) {
12174 dout(10) << __func__ << " empty zombie osr " << osr
12175 << " already reaped" << dendl;
12176 ceph_assert(osr->q.empty());
12177 } else {
12178 dout(10) << __func__ << " empty zombie osr " << osr
12179 << " resurrected" << dendl;
12180 }
12181 }
12182 }
12183
12184 dout(10) << __func__ << " done" << dendl;
12185 }
12186
12187
12188 void BlueStore::_kv_start()
12189 {
12190 dout(10) << __func__ << dendl;
12191
12192 finisher.start();
12193 kv_sync_thread.create("bstore_kv_sync");
12194 kv_finalize_thread.create("bstore_kv_final");
12195 }
12196
12197 void BlueStore::_kv_stop()
12198 {
12199 dout(10) << __func__ << dendl;
12200 {
12201 std::unique_lock l{kv_lock};
12202 while (!kv_sync_started) {
12203 kv_cond.wait(l);
12204 }
12205 kv_stop = true;
12206 kv_cond.notify_all();
12207 }
12208 {
12209 std::unique_lock l{kv_finalize_lock};
12210 while (!kv_finalize_started) {
12211 kv_finalize_cond.wait(l);
12212 }
12213 kv_finalize_stop = true;
12214 kv_finalize_cond.notify_all();
12215 }
12216 kv_sync_thread.join();
12217 kv_finalize_thread.join();
12218 ceph_assert(removed_collections.empty());
12219 {
12220 std::lock_guard l(kv_lock);
12221 kv_stop = false;
12222 }
12223 {
12224 std::lock_guard l(kv_finalize_lock);
12225 kv_finalize_stop = false;
12226 }
12227 dout(10) << __func__ << " stopping finishers" << dendl;
12228 finisher.wait_for_empty();
12229 finisher.stop();
12230 dout(10) << __func__ << " stopped" << dendl;
12231 }
12232
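// _kv_sync_thread(): drains kv_queue in batches; optionally flushes the block
// device so finished deferred aios become stable; commits everything plus a
// final sync transaction (synct); then hands the batch to _kv_finalize_thread()
// via kv_committing_to_finalize / deferred_stable_to_finalize.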
12233 void BlueStore::_kv_sync_thread()
12234 {
12235 dout(10) << __func__ << " start" << dendl;
12236 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
12237 std::unique_lock l{kv_lock};
12238 ceph_assert(!kv_sync_started);
12239 kv_sync_started = true;
12240 kv_cond.notify_all();
12241
12242 auto t0 = mono_clock::now();
12243 timespan twait = ceph::make_timespan(0);
12244 size_t kv_submitted = 0;
12245
12246 while (true) {
12247 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
12248 auto observation_period =
12249 ceph::make_timespan(period);
12250 auto elapsed = mono_clock::now() - t0;
12251 if (period && elapsed >= observation_period) {
12252 dout(5) << __func__ << " utilization: idle "
12253 << twait << " of " << elapsed
12254 << ", submitted: " << kv_submitted
12255 <<dendl;
12256 t0 = mono_clock::now();
12257 twait = ceph::make_timespan(0);
12258 kv_submitted = 0;
12259 }
12260 ceph_assert(kv_committing.empty());
12261 if (kv_queue.empty() &&
12262 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
12263 !deferred_aggressive)) {
12264 if (kv_stop)
12265 break;
12266 dout(20) << __func__ << " sleep" << dendl;
12267 auto t = mono_clock::now();
12268 kv_sync_in_progress = false;
12269 kv_cond.wait(l);
12270 twait += mono_clock::now() - t;
12271
12272 dout(20) << __func__ << " wake" << dendl;
12273 } else {
12274 deque<TransContext*> kv_submitting;
12275 deque<DeferredBatch*> deferred_done, deferred_stable;
12276 uint64_t aios = 0, costs = 0;
12277
12278 dout(20) << __func__ << " committing " << kv_queue.size()
12279 << " submitting " << kv_queue_unsubmitted.size()
12280 << " deferred done " << deferred_done_queue.size()
12281 << " stable " << deferred_stable_queue.size()
12282 << dendl;
12283 kv_committing.swap(kv_queue);
12284 kv_submitting.swap(kv_queue_unsubmitted);
12285 deferred_done.swap(deferred_done_queue);
12286 deferred_stable.swap(deferred_stable_queue);
12287 aios = kv_ios;
12288 costs = kv_throttle_costs;
12289 kv_ios = 0;
12290 kv_throttle_costs = 0;
12291 l.unlock();
12292
12293 dout(30) << __func__ << " committing " << kv_committing << dendl;
12294 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
12295 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
12296 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
12297
12298 auto start = mono_clock::now();
12299
12300 bool force_flush = false;
12301 // if bluefs is sharing the same device as the data (only), then we can
12302 // rely on the bluefs commit to flush the device and make deferred aios
12303 // stable. that means that if we have completed ("done") deferred txcs
12304 // AND we are not on a single shared device, we need to force a flush.
12305 if (bluefs && bluefs_layout.single_shared_device()) {
12306 if (aios) {
12307 force_flush = true;
12308 } else if (kv_committing.empty() && deferred_stable.empty()) {
12309 force_flush = true; // there's nothing else to commit!
12310 } else if (deferred_aggressive) {
12311 force_flush = true;
12312 }
12313 } else {
12314 if (aios || !deferred_done.empty()) {
12315 force_flush = true;
12316 } else {
12317 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
12318 }
12319 }
12320
12321 if (force_flush) {
12322 dout(20) << __func__ << " num_aios=" << aios
12323 << " force_flush=" << (int)force_flush
12324 << ", flushing, deferred done->stable" << dendl;
12325 // flush/barrier on block device
12326 bdev->flush();
12327
12328 // if we flush then deferred done are now deferred stable
12329 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
12330 deferred_done.end());
12331 deferred_done.clear();
12332 }
12333 auto after_flush = mono_clock::now();
12334
12335 // we will use one final transaction to force a sync
12336 KeyValueDB::Transaction synct = db->get_transaction();
12337
12338 // increase {nid,blobid}_max? note that this covers both the
12339 // case where we are approaching the max and the case we passed
12340 // it. in either case, we increase the max in the earlier txn
12341 // we submit.
12342 uint64_t new_nid_max = 0, new_blobid_max = 0;
12343 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
12344 KeyValueDB::Transaction t =
12345 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12346 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
12347 bufferlist bl;
12348 encode(new_nid_max, bl);
12349 t->set(PREFIX_SUPER, "nid_max", bl);
12350 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
12351 }
12352 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
12353 KeyValueDB::Transaction t =
12354 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12355 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
12356 bufferlist bl;
12357 encode(new_blobid_max, bl);
12358 t->set(PREFIX_SUPER, "blobid_max", bl);
12359 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
12360 }
12361
12362 for (auto txc : kv_committing) {
12363 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
12364 if (txc->state == TransContext::STATE_KV_QUEUED) {
12365 ++kv_submitted;
12366 _txc_apply_kv(txc, false);
12367 --txc->osr->kv_committing_serially;
12368 } else {
12369 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
12370 }
12371 if (txc->had_ios) {
12372 --txc->osr->txc_with_unstable_io;
12373 }
12374 }
12375
12376 // release throttle *before* we commit. this allows new ops
12377 // to be prepared and enter pipeline while we are waiting on
12378 // the kv commit sync/flush. then hopefully on the next
12379 // iteration there will already be ops awake. otherwise, we
12380 // end up going to sleep, and then wake up when the very first
12381 // transaction is ready for commit.
12382 throttle.release_kv_throttle(costs);
12383
12384 if (bluefs &&
12385 after_flush - bluefs_last_balance >
12386 ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
12387 bluefs_last_balance = after_flush;
12388 int r = _balance_bluefs_freespace();
12389 ceph_assert(r >= 0);
12390 }
12391
12392 // cleanup sync deferred keys
12393 for (auto b : deferred_stable) {
12394 for (auto& txc : b->txcs) {
12395 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
12396 ceph_assert(wt.released.empty()); // only kraken did this
12397 string key;
12398 get_deferred_key(wt.seq, &key);
12399 synct->rm_single_key(PREFIX_DEFERRED, key);
12400 }
12401 }
12402
12403 #if defined(WITH_LTTNG)
12404 auto sync_start = mono_clock::now();
12405 #endif
12406 // submit synct synchronously (block and wait for it to commit)
12407 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
12408 ceph_assert(r == 0);
12409
12410 int committing_size = kv_committing.size();
12411 int deferred_size = deferred_stable.size();
12412
12413 #if defined(WITH_LTTNG)
12414 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
12415 for (auto txc: kv_committing) {
12416 if (txc->tracing) {
12417 tracepoint(
12418 bluestore,
12419 transaction_kv_sync_latency,
12420 txc->osr->get_sequencer_id(),
12421 txc->seq,
12422 kv_committing.size(),
12423 deferred_done.size(),
12424 deferred_stable.size(),
12425 sync_latency);
12426 }
12427 }
12428 #endif
12429
12430 {
12431 std::unique_lock m{kv_finalize_lock};
12432 if (kv_committing_to_finalize.empty()) {
12433 kv_committing_to_finalize.swap(kv_committing);
12434 } else {
12435 kv_committing_to_finalize.insert(
12436 kv_committing_to_finalize.end(),
12437 kv_committing.begin(),
12438 kv_committing.end());
12439 kv_committing.clear();
12440 }
12441 if (deferred_stable_to_finalize.empty()) {
12442 deferred_stable_to_finalize.swap(deferred_stable);
12443 } else {
12444 deferred_stable_to_finalize.insert(
12445 deferred_stable_to_finalize.end(),
12446 deferred_stable.begin(),
12447 deferred_stable.end());
12448 deferred_stable.clear();
12449 }
12450 if (!kv_finalize_in_progress) {
12451 kv_finalize_in_progress = true;
12452 kv_finalize_cond.notify_one();
12453 }
12454 }
12455
12456 if (new_nid_max) {
12457 nid_max = new_nid_max;
12458 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
12459 }
12460 if (new_blobid_max) {
12461 blobid_max = new_blobid_max;
12462 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
12463 }
12464
12465 {
12466 auto finish = mono_clock::now();
12467 ceph::timespan dur_flush = after_flush - start;
12468 ceph::timespan dur_kv = finish - after_flush;
12469 ceph::timespan dur = finish - start;
12470 dout(20) << __func__ << " committed " << committing_size
12471 << " cleaned " << deferred_size
12472 << " in " << dur
12473 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
12474 << dendl;
12475 log_latency("kv_flush",
12476 l_bluestore_kv_flush_lat,
12477 dur_flush,
12478 cct->_conf->bluestore_log_op_age);
12479 log_latency("kv_commit",
12480 l_bluestore_kv_commit_lat,
12481 dur_kv,
12482 cct->_conf->bluestore_log_op_age);
12483 log_latency("kv_sync",
12484 l_bluestore_kv_sync_lat,
12485 dur,
12486 cct->_conf->bluestore_log_op_age);
12487 }
12488
12489 if (bluefs) {
12490 if (!bluefs_extents_reclaiming.empty()) {
12491 dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
12492 << bluefs_extents_reclaiming << std::dec << dendl;
12493 int r = 0;
12494 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
12495 r = bdev->queue_discard(bluefs_extents_reclaiming);
12496 if (r == 0) {
12497 goto clear;
12498 }
12499 } else if (cct->_conf->bdev_enable_discard) {
12500 for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
12501 bdev->discard(p.get_start(), p.get_len());
12502 }
12503 }
12504
12505 alloc->release(bluefs_extents_reclaiming);
12506 clear:
12507 bluefs_extents_reclaiming.clear();
12508 }
12509 }
12510
12511 l.lock();
12512 // previously deferred "done" are now "stable" by virtue of this
12513 // commit cycle.
12514 deferred_stable_queue.swap(deferred_done);
12515 }
12516 }
12517 dout(10) << __func__ << " finish" << dendl;
12518 kv_sync_started = false;
12519 }
12520
12521 void BlueStore::_kv_finalize_thread()
12522 {
12523 deque<TransContext*> kv_committed;
12524 deque<DeferredBatch*> deferred_stable;
12525 dout(10) << __func__ << " start" << dendl;
12526 std::unique_lock l(kv_finalize_lock);
12527 ceph_assert(!kv_finalize_started);
12528 kv_finalize_started = true;
12529 kv_finalize_cond.notify_all();
12530 while (true) {
12531 ceph_assert(kv_committed.empty());
12532 ceph_assert(deferred_stable.empty());
12533 if (kv_committing_to_finalize.empty() &&
12534 deferred_stable_to_finalize.empty()) {
12535 if (kv_finalize_stop)
12536 break;
12537 dout(20) << __func__ << " sleep" << dendl;
12538 kv_finalize_in_progress = false;
12539 kv_finalize_cond.wait(l);
12540 dout(20) << __func__ << " wake" << dendl;
12541 } else {
12542 kv_committed.swap(kv_committing_to_finalize);
12543 deferred_stable.swap(deferred_stable_to_finalize);
12544 l.unlock();
12545 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
12546 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
12547
12548 auto start = mono_clock::now();
12549
12550 while (!kv_committed.empty()) {
12551 TransContext *txc = kv_committed.front();
12552 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
12553 _txc_state_proc(txc);
12554 kv_committed.pop_front();
12555 }
12556
12557 for (auto b : deferred_stable) {
12558 auto p = b->txcs.begin();
12559 while (p != b->txcs.end()) {
12560 TransContext *txc = &*p;
12561 p = b->txcs.erase(p); // unlink here because
12562 _txc_state_proc(txc); // this may destroy txc
12563 }
12564 delete b;
12565 }
12566 deferred_stable.clear();
12567
12568 if (!deferred_aggressive) {
12569 if (deferred_queue_size >= deferred_batch_ops.load() ||
12570 throttle.should_submit_deferred()) {
12571 deferred_try_submit();
12572 }
12573 }
12574
12575 // this is as good a place as any ...
12576 _reap_collections();
12577
12578 logger->set(l_bluestore_fragmentation,
12579 (uint64_t)(alloc->get_fragmentation() * 1000));
12580
12581 log_latency("kv_final",
12582 l_bluestore_kv_final_lat,
12583 mono_clock::now() - start,
12584 cct->_conf->bluestore_log_op_age);
12585
12586 l.lock();
12587 }
12588 }
12589 dout(10) << __func__ << " finish" << dendl;
12590 kv_finalize_started = false;
12591 }
12592
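// Deferred write path: ops recorded via _get_deferred_op() are journaled under
// PREFIX_DEFERRED as part of the txc's KV commit, later written to their final
// location by _deferred_submit_unlock(), and once a subsequent flush + KV sync
// makes those writes stable the journal keys are removed in _kv_sync_thread().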
12593 bluestore_deferred_op_t *BlueStore::_get_deferred_op(
12594 TransContext *txc)
12595 {
12596 if (!txc->deferred_txn) {
12597 txc->deferred_txn = new bluestore_deferred_transaction_t;
12598 }
12599 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
12600 return &txc->deferred_txn->ops.back();
12601 }
12602
12603 void BlueStore::_deferred_queue(TransContext *txc)
12604 {
12605 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
12606 deferred_lock.lock();
12607 if (!txc->osr->deferred_pending &&
12608 !txc->osr->deferred_running) {
12609 deferred_queue.push_back(*txc->osr);
12610 }
12611 if (!txc->osr->deferred_pending) {
12612 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
12613 }
12614 ++deferred_queue_size;
12615 txc->osr->deferred_pending->txcs.push_back(*txc);
12616 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
12617 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
12618 const auto& op = *opi;
12619 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
12620 bufferlist::const_iterator p = op.data.begin();
12621 for (auto e : op.extents) {
12622 txc->osr->deferred_pending->prepare_write(
12623 cct, wt.seq, e.offset, e.length, p);
12624 }
12625 }
12626 if (deferred_aggressive &&
12627 !txc->osr->deferred_running) {
12628 _deferred_submit_unlock(txc->osr.get());
12629 } else {
12630 deferred_lock.unlock();
12631 }
12632 }
12633
12634 void BlueStore::deferred_try_submit()
12635 {
12636 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
12637 << deferred_queue_size << " txcs" << dendl;
12638 std::lock_guard l(deferred_lock);
12639 vector<OpSequencerRef> osrs;
12640 osrs.reserve(deferred_queue.size());
12641 for (auto& osr : deferred_queue) {
12642 osrs.push_back(&osr);
12643 }
12644 for (auto& osr : osrs) {
12645 if (osr->deferred_pending) {
12646 if (!osr->deferred_running) {
12647 _deferred_submit_unlock(osr.get());
12648 deferred_lock.lock();
12649 } else {
12650 dout(20) << __func__ << " osr " << osr << " already has running"
12651 << dendl;
12652 }
12653 } else {
12654 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
12655 }
12656 }
12657
12658 deferred_last_submitted = ceph_clock_now();
12659 }
12660
12661 void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
12662 {
12663 dout(10) << __func__ << " osr " << osr
12664 << " " << osr->deferred_pending->iomap.size() << " ios pending "
12665 << dendl;
12666 ceph_assert(osr->deferred_pending);
12667 ceph_assert(!osr->deferred_running);
12668
12669 auto b = osr->deferred_pending;
12670 deferred_queue_size -= b->seq_bytes.size();
12671 ceph_assert(deferred_queue_size >= 0);
12672
12673 osr->deferred_running = osr->deferred_pending;
12674 osr->deferred_pending = nullptr;
12675
12676 deferred_lock.unlock();
12677
12678 for (auto& txc : b->txcs) {
12679 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
12680 }
12681 uint64_t start = 0, pos = 0;
12682 bufferlist bl;
12683 auto i = b->iomap.begin();
12684 while (true) {
12685 if (i == b->iomap.end() || i->first != pos) {
12686 if (bl.length()) {
12687 dout(20) << __func__ << " write 0x" << std::hex
12688 << start << "~" << bl.length()
12689 << " crc " << bl.crc32c(-1) << std::dec << dendl;
12690 if (!g_conf()->bluestore_debug_omit_block_device_write) {
12691 logger->inc(l_bluestore_deferred_write_ops);
12692 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
12693 int r = bdev->aio_write(start, bl, &b->ioc, false);
12694 ceph_assert(r == 0);
12695 }
12696 }
12697 if (i == b->iomap.end()) {
12698 break;
12699 }
12700 start = 0;
12701 pos = i->first;
12702 bl.clear();
12703 }
12704 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
12705 << std::hex << pos << "~" << i->second.bl.length() << std::dec
12706 << dendl;
12707 if (!bl.length()) {
12708 start = pos;
12709 }
12710 pos += i->second.bl.length();
12711 bl.claim_append(i->second.bl);
12712 ++i;
12713 }
12714
12715 bdev->aio_submit(&b->ioc);
12716 }
12717
12718 struct C_DeferredTrySubmit : public Context {
12719 BlueStore *store;
12720 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
12721 void finish(int r) {
12722 store->deferred_try_submit();
12723 }
12724 };
12725
12726 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
12727 {
12728 dout(10) << __func__ << " osr " << osr << dendl;
12729 ceph_assert(osr->deferred_running);
12730 DeferredBatch *b = osr->deferred_running;
12731
12732 {
12733 deferred_lock.lock();
12734 ceph_assert(osr->deferred_running == b);
12735 osr->deferred_running = nullptr;
12736 if (!osr->deferred_pending) {
12737 dout(20) << __func__ << " dequeueing" << dendl;
12738 auto q = deferred_queue.iterator_to(*osr);
12739 deferred_queue.erase(q);
12740 deferred_lock.unlock();
12741 } else {
12742 deferred_lock.unlock();
12743 if (deferred_aggressive) {
12744 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
12745 finisher.queue(new C_DeferredTrySubmit(this));
12746 } else {
12747 dout(20) << __func__ << " leaving queued, more pending" << dendl;
12748 }
12749 }
12750 }
12751
12752 {
12753 uint64_t costs = 0;
12754 {
12755 for (auto& i : b->txcs) {
12756 TransContext *txc = &i;
12757 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
12758 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
12759 costs += txc->cost;
12760 }
12761 }
12762 throttle.release_deferred_throttle(costs);
12763 }
12764
12765 {
12766 std::lock_guard l(kv_lock);
12767 deferred_done_queue.emplace_back(b);
12768
12769 // in the normal case, do not bother waking up the kv thread; it will
12770 // catch us on the next commit anyway.
12771 if (deferred_aggressive && !kv_sync_in_progress) {
12772 kv_sync_in_progress = true;
12773 kv_cond.notify_one();
12774 }
12775 }
12776 }
12777
12778 int BlueStore::_deferred_replay()
12779 {
12780 dout(10) << __func__ << " start" << dendl;
12781 int count = 0;
12782 int r = 0;
12783 CollectionRef ch = _get_collection(coll_t::meta());
12784 bool fake_ch = false;
12785 if (!ch) {
12786 // hmm, replaying initial mkfs?
12787 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
12788 fake_ch = true;
12789 }
12790 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
12791 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
12792 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
12793 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
12794 << dendl;
12795 bluestore_deferred_transaction_t *deferred_txn =
12796 new bluestore_deferred_transaction_t;
12797 bufferlist bl = it->value();
12798 auto p = bl.cbegin();
12799 try {
12800 decode(*deferred_txn, p);
12801 } catch (buffer::error& e) {
12802 derr << __func__ << " failed to decode deferred txn "
12803 << pretty_binary_string(it->key()) << dendl;
12804 delete deferred_txn;
12805 r = -EIO;
12806 goto out;
12807 }
12808 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
12809 txc->deferred_txn = deferred_txn;
12810 txc->state = TransContext::STATE_KV_DONE;
12811 _txc_state_proc(txc);
12812 }
12813 out:
12814 dout(20) << __func__ << " draining osr" << dendl;
12815 _osr_register_zombie(osr);
12816 _osr_drain_all();
12817 if (fake_ch) {
12818 new_coll_map.clear();
12819 }
12820 dout(10) << __func__ << " completed " << count << " events" << dendl;
12821 return r;
12822 }
12823
12824 // ---------------------------
12825 // transactions
12826
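// Caller-side sketch (illustrative, not code from this file): a typical client
// builds an ObjectStore::Transaction and queues it against a collection handle:
//
//   ObjectStore::Transaction t;
//   t.write(cid, oid, offset, bl.length(), bl);
//   std::vector<ObjectStore::Transaction> tls;
//   tls.push_back(std::move(t));
//   store->queue_transactions(ch, tls);
//
// queue_transactions() below converts each Transaction into a TransContext,
// journals any deferred writes, applies throttling, and kicks the state machine.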
12827 int BlueStore::queue_transactions(
12828 CollectionHandle& ch,
12829 vector<Transaction>& tls,
12830 TrackedOpRef op,
12831 ThreadPool::TPHandle *handle)
12832 {
12833 FUNCTRACE(cct);
12834 list<Context *> on_applied, on_commit, on_applied_sync;
12835 ObjectStore::Transaction::collect_contexts(
12836 tls, &on_applied, &on_commit, &on_applied_sync);
12837
12838 auto start = mono_clock::now();
12839
12840 Collection *c = static_cast<Collection*>(ch.get());
12841 OpSequencer *osr = c->osr.get();
12842 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
12843
12844 // prepare
12845 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
12846 &on_commit);
12847
12848 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
12849 txc->bytes += (*p).get_num_bytes();
12850 _txc_add_transaction(txc, &(*p));
12851 }
12852 _txc_calc_cost(txc);
12853
12854 _txc_write_nodes(txc, txc->t);
12855
12856 // journal deferred items
12857 if (txc->deferred_txn) {
12858 txc->deferred_txn->seq = ++deferred_seq;
12859 bufferlist bl;
12860 encode(*txc->deferred_txn, bl);
12861 string key;
12862 get_deferred_key(txc->deferred_txn->seq, &key);
12863 txc->t->set(PREFIX_DEFERRED, key, bl);
12864 }
12865
12866 _txc_finalize_kv(txc, txc->t);
12867 if (handle)
12868 handle->suspend_tp_timeout();
12869
12870 auto tstart = mono_clock::now();
12871
12872 if (!throttle.try_start_transaction(
12873 *db,
12874 *txc,
12875 tstart)) {
12876 // ensure we do not block here because of deferred writes
12877 dout(10) << __func__ << " failed to get throttle_deferred_bytes, aggressive"
12878 << dendl;
12879 ++deferred_aggressive;
12880 deferred_try_submit();
12881 {
12882 // wake up any previously finished deferred events
12883 std::lock_guard l(kv_lock);
12884 if (!kv_sync_in_progress) {
12885 kv_sync_in_progress = true;
12886 kv_cond.notify_one();
12887 }
12888 }
12889 throttle.finish_start_transaction(*db, *txc, tstart);
12890 --deferred_aggressive;
12891 }
12892 auto tend = mono_clock::now();
12893
12894 if (handle)
12895 handle->reset_tp_timeout();
12896
12897 logger->inc(l_bluestore_txc);
12898
12899 // execute (start)
12900 _txc_state_proc(txc);
12901
12902 // we're immediately readable (unlike FileStore)
12903 for (auto c : on_applied_sync) {
12904 c->complete(0);
12905 }
12906 if (!on_applied.empty()) {
12907 if (c->commit_queue) {
12908 c->commit_queue->queue(on_applied);
12909 } else {
12910 finisher.queue(on_applied);
12911 }
12912 }
12913
12914 log_latency("submit_transact",
12915 l_bluestore_submit_lat,
12916 mono_clock::now() - start,
12917 cct->_conf->bluestore_log_op_age);
12918 log_latency("throttle_transact",
12919 l_bluestore_throttle_lat,
12920 tend - tstart,
12921 cct->_conf->bluestore_log_op_age);
12922 return 0;
12923 }
12924
12925 void BlueStore::_txc_aio_submit(TransContext *txc)
12926 {
12927 dout(10) << __func__ << " txc " << txc << dendl;
12928 bdev->aio_submit(&txc->ioc);
12929 }
12930
12931 void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
12932 {
12933 Transaction::iterator i = t->begin();
12934
12935 _dump_transaction<30>(cct, t);
12936
12937 vector<CollectionRef> cvec(i.colls.size());
12938 unsigned j = 0;
12939 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
12940 ++p, ++j) {
12941 cvec[j] = _get_collection(*p);
12942 }
12943
12944 vector<OnodeRef> ovec(i.objects.size());
12945
12946 for (int pos = 0; i.have_op(); ++pos) {
12947 Transaction::Op *op = i.decode_op();
12948 int r = 0;
12949
12950 // no coll or obj
12951 if (op->op == Transaction::OP_NOP)
12952 continue;
12953
12954
12955 // collection operations
12956 CollectionRef &c = cvec[op->cid];
12957
12958 // initialize osd_pool_id and do a smoke test that all collections belong
12959 // to the same pool
12960 spg_t pgid;
12961 if (c && c->cid.is_pg(&pgid)) {
12962 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
12963 txc->osd_pool_id == pgid.pool());
12964 txc->osd_pool_id = pgid.pool();
12965 }
12966
12967 switch (op->op) {
12968 case Transaction::OP_RMCOLL:
12969 {
12970 const coll_t &cid = i.get_cid(op->cid);
12971 r = _remove_collection(txc, cid, &c);
12972 if (!r)
12973 continue;
12974 }
12975 break;
12976
12977 case Transaction::OP_MKCOLL:
12978 {
12979 ceph_assert(!c);
12980 const coll_t &cid = i.get_cid(op->cid);
12981 r = _create_collection(txc, cid, op->split_bits, &c);
12982 if (!r)
12983 continue;
12984 }
12985 break;
12986
12987 case Transaction::OP_SPLIT_COLLECTION:
12988 ceph_abort_msg("deprecated");
12989 break;
12990
12991 case Transaction::OP_SPLIT_COLLECTION2:
12992 {
12993 uint32_t bits = op->split_bits;
12994 uint32_t rem = op->split_rem;
12995 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
12996 if (!r)
12997 continue;
12998 }
12999 break;
13000
13001 case Transaction::OP_MERGE_COLLECTION:
13002 {
13003 uint32_t bits = op->split_bits;
13004 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
13005 if (!r)
13006 continue;
13007 }
13008 break;
13009
13010 case Transaction::OP_COLL_HINT:
13011 {
13012 uint32_t type = op->hint_type;
13013 bufferlist hint;
13014 i.decode_bl(hint);
13015 auto hiter = hint.cbegin();
13016 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
13017 uint32_t pg_num;
13018 uint64_t num_objs;
13019 decode(pg_num, hiter);
13020 decode(num_objs, hiter);
13021 dout(10) << __func__ << " collection hint objects is a no-op,"
13022 << " pg_num " << pg_num << " num_objects " << num_objs
13023 << dendl;
13024 } else {
13025 // Ignore the hint
13026 dout(10) << __func__ << " unknown collection hint " << type << dendl;
13027 }
13028 continue;
13029 }
13030 break;
13031
13032 case Transaction::OP_COLL_SETATTR:
13033 r = -EOPNOTSUPP;
13034 break;
13035
13036 case Transaction::OP_COLL_RMATTR:
13037 r = -EOPNOTSUPP;
13038 break;
13039
13040 case Transaction::OP_COLL_RENAME:
13041 ceph_abort_msg("not implemented");
13042 break;
13043 }
13044 if (r < 0) {
13045 derr << __func__ << " error " << cpp_strerror(r)
13046 << " not handled on operation " << op->op
13047 << " (op " << pos << ", counting from 0)" << dendl;
13048 _dump_transaction<0>(cct, t);
13049 ceph_abort_msg("unexpected error");
13050 }
13051
13052 // these operations implicitly create the object
13053 bool create = false;
13054 if (op->op == Transaction::OP_TOUCH ||
13055 op->op == Transaction::OP_CREATE ||
13056 op->op == Transaction::OP_WRITE ||
13057 op->op == Transaction::OP_ZERO) {
13058 create = true;
13059 }
13060
13061 // object operations
13062 std::unique_lock l(c->lock);
13063 OnodeRef &o = ovec[op->oid];
13064 if (!o) {
13065 ghobject_t oid = i.get_oid(op->oid);
13066 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
13067 }
13068 if (!create && (!o || !o->exists)) {
13069 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
13070 << i.get_oid(op->oid) << dendl;
13071 r = -ENOENT;
13072 goto endop;
13073 }
13074
13075 switch (op->op) {
13076 case Transaction::OP_CREATE:
13077 case Transaction::OP_TOUCH:
13078 r = _touch(txc, c, o);
13079 break;
13080
13081 case Transaction::OP_WRITE:
13082 {
13083 uint64_t off = op->off;
13084 uint64_t len = op->len;
13085 uint32_t fadvise_flags = i.get_fadvise_flags();
13086 bufferlist bl;
13087 i.decode_bl(bl);
13088 r = _write(txc, c, o, off, len, bl, fadvise_flags);
13089 }
13090 break;
13091
13092 case Transaction::OP_ZERO:
13093 {
13094 uint64_t off = op->off;
13095 uint64_t len = op->len;
13096 r = _zero(txc, c, o, off, len);
13097 }
13098 break;
13099
13100 case Transaction::OP_TRIMCACHE:
13101 {
13102 // deprecated, no-op
13103 }
13104 break;
13105
13106 case Transaction::OP_TRUNCATE:
13107 {
13108 uint64_t off = op->off;
13109 r = _truncate(txc, c, o, off);
13110 }
13111 break;
13112
13113 case Transaction::OP_REMOVE:
13114 {
13115 r = _remove(txc, c, o);
13116 }
13117 break;
13118
13119 case Transaction::OP_SETATTR:
13120 {
13121 string name = i.decode_string();
13122 bufferptr bp;
13123 i.decode_bp(bp);
13124 r = _setattr(txc, c, o, name, bp);
13125 }
13126 break;
13127
13128 case Transaction::OP_SETATTRS:
13129 {
13130 map<string, bufferptr> aset;
13131 i.decode_attrset(aset);
13132 r = _setattrs(txc, c, o, aset);
13133 }
13134 break;
13135
13136 case Transaction::OP_RMATTR:
13137 {
13138 string name = i.decode_string();
13139 r = _rmattr(txc, c, o, name);
13140 }
13141 break;
13142
13143 case Transaction::OP_RMATTRS:
13144 {
13145 r = _rmattrs(txc, c, o);
13146 }
13147 break;
13148
13149 case Transaction::OP_CLONE:
13150 {
13151 OnodeRef& no = ovec[op->dest_oid];
13152 if (!no) {
13153 const ghobject_t& noid = i.get_oid(op->dest_oid);
13154 no = c->get_onode(noid, true);
13155 }
13156 r = _clone(txc, c, o, no);
13157 }
13158 break;
13159
13160 case Transaction::OP_CLONERANGE:
13161 ceph_abort_msg("deprecated");
13162 break;
13163
13164 case Transaction::OP_CLONERANGE2:
13165 {
13166 OnodeRef& no = ovec[op->dest_oid];
13167 if (!no) {
13168 const ghobject_t& noid = i.get_oid(op->dest_oid);
13169 no = c->get_onode(noid, true);
13170 }
13171 uint64_t srcoff = op->off;
13172 uint64_t len = op->len;
13173 uint64_t dstoff = op->dest_off;
13174 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
13175 }
13176 break;
13177
13178 case Transaction::OP_COLL_ADD:
13179 ceph_abort_msg("not implemented");
13180 break;
13181
13182 case Transaction::OP_COLL_REMOVE:
13183 ceph_abort_msg("not implemented");
13184 break;
13185
13186 case Transaction::OP_COLL_MOVE:
13187 ceph_abort_msg("deprecated");
13188 break;
13189
13190 case Transaction::OP_COLL_MOVE_RENAME:
13191 case Transaction::OP_TRY_RENAME:
13192 {
13193 ceph_assert(op->cid == op->dest_cid);
13194 const ghobject_t& noid = i.get_oid(op->dest_oid);
13195 OnodeRef& no = ovec[op->dest_oid];
13196 if (!no) {
13197 no = c->get_onode(noid, false);
13198 }
13199 r = _rename(txc, c, o, no, noid);
13200 }
13201 break;
13202
13203 case Transaction::OP_OMAP_CLEAR:
13204 {
13205 r = _omap_clear(txc, c, o);
13206 }
13207 break;
13208 case Transaction::OP_OMAP_SETKEYS:
13209 {
13210 bufferlist aset_bl;
13211 i.decode_attrset_bl(&aset_bl);
13212 r = _omap_setkeys(txc, c, o, aset_bl);
13213 }
13214 break;
13215 case Transaction::OP_OMAP_RMKEYS:
13216 {
13217 bufferlist keys_bl;
13218 i.decode_keyset_bl(&keys_bl);
13219 r = _omap_rmkeys(txc, c, o, keys_bl);
13220 }
13221 break;
13222 case Transaction::OP_OMAP_RMKEYRANGE:
13223 {
13224 string first, last;
13225 first = i.decode_string();
13226 last = i.decode_string();
13227 r = _omap_rmkey_range(txc, c, o, first, last);
13228 }
13229 break;
13230 case Transaction::OP_OMAP_SETHEADER:
13231 {
13232 bufferlist bl;
13233 i.decode_bl(bl);
13234 r = _omap_setheader(txc, c, o, bl);
13235 }
13236 break;
13237
13238 case Transaction::OP_SETALLOCHINT:
13239 {
13240 r = _set_alloc_hint(txc, c, o,
13241 op->expected_object_size,
13242 op->expected_write_size,
13243 op->alloc_hint_flags);
13244 }
13245 break;
13246
13247 default:
13248 derr << __func__ << " bad op " << op->op << dendl;
13249 ceph_abort();
13250 }
13251
13252 endop:
13253 if (r < 0) {
13254 bool ok = false;
13255
13256 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
13257 op->op == Transaction::OP_CLONE ||
13258 op->op == Transaction::OP_CLONERANGE2 ||
13259 op->op == Transaction::OP_COLL_ADD ||
13260 op->op == Transaction::OP_SETATTR ||
13261 op->op == Transaction::OP_SETATTRS ||
13262 op->op == Transaction::OP_RMATTR ||
13263 op->op == Transaction::OP_OMAP_SETKEYS ||
13264 op->op == Transaction::OP_OMAP_RMKEYS ||
13265 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
13266 op->op == Transaction::OP_OMAP_SETHEADER))
13267 // -ENOENT is usually okay
13268 ok = true;
13269 if (r == -ENODATA)
13270 ok = true;
13271
13272 if (!ok) {
13273 const char *msg = "unexpected error code";
13274
13275 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
13276 op->op == Transaction::OP_CLONE ||
13277 op->op == Transaction::OP_CLONERANGE2))
13278 msg = "ENOENT on clone suggests osd bug";
13279
13280 if (r == -ENOSPC)
13281 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
13282 // by partially applying transactions.
13283 msg = "ENOSPC from bluestore, misconfigured cluster";
13284
13285 if (r == -ENOTEMPTY) {
13286 msg = "ENOTEMPTY suggests garbage data in osd data dir";
13287 }
13288
13289 derr << __func__ << " error " << cpp_strerror(r)
13290 << " not handled on operation " << op->op
13291 << " (op " << pos << ", counting from 0)"
13292 << dendl;
13293 derr << msg << dendl;
13294 _dump_transaction<0>(cct, t);
13295 ceph_abort_msg("unexpected error");
13296 }
13297 }
13298 }
13299 }
13300
13301
13302
13303 // -----------------
13304 // write operations
13305
13306 int BlueStore::_touch(TransContext *txc,
13307 CollectionRef& c,
13308 OnodeRef &o)
13309 {
13310 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13311 int r = 0;
13312 _assign_nid(txc, o);
13313 txc->write_onode(o);
13314 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13315 return r;
13316 }
13317
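// Pad *bl with zeros at the front and/or back so that it covers whole
// chunk_size-aligned chunks, lowering *offset to the preceding chunk
// boundary when the front is unaligned; any added zeros are counted in
// l_bluestore_write_pad_bytes. (e.g., with chunk_size 0x1000, *offset
// 0x1234 and a 0x800-byte bl, illustrative values only, the result is
// *offset 0x1000 and a 0x1000-byte bl)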
13318 void BlueStore::_pad_zeros(
13319 bufferlist *bl, uint64_t *offset,
13320 uint64_t chunk_size)
13321 {
13322 auto length = bl->length();
13323 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
13324 << " chunk_size 0x" << chunk_size << std::dec << dendl;
13325 dout(40) << "before:\n";
13326 bl->hexdump(*_dout);
13327 *_dout << dendl;
13328 // front
13329 size_t front_pad = *offset % chunk_size;
13330 size_t back_pad = 0;
13331 size_t pad_count = 0;
13332 if (front_pad) {
13333 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
13334 bufferptr z = buffer::create_small_page_aligned(chunk_size);
13335 z.zero(0, front_pad, false);
13336 pad_count += front_pad;
13337 bl->begin().copy(front_copy, z.c_str() + front_pad);
13338 if (front_copy + front_pad < chunk_size) {
13339 back_pad = chunk_size - (length + front_pad);
13340 z.zero(front_pad + length, back_pad, false);
13341 pad_count += back_pad;
13342 }
13343 bufferlist old, t;
13344 old.swap(*bl);
13345 t.substr_of(old, front_copy, length - front_copy);
13346 bl->append(z);
13347 bl->claim_append(t);
13348 *offset -= front_pad;
13349 length += pad_count;
13350 }
13351
13352 // back
13353 uint64_t end = *offset + length;
13354 unsigned back_copy = end % chunk_size;
13355 if (back_copy) {
13356 ceph_assert(back_pad == 0);
13357 back_pad = chunk_size - back_copy;
13358 ceph_assert(back_copy <= length);
13359 bufferptr tail(chunk_size);
13360 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
13361 tail.zero(back_copy, back_pad, false);
13362 bufferlist old;
13363 old.swap(*bl);
13364 bl->substr_of(old, 0, length - back_copy);
13365 bl->append(tail);
13366 length += back_pad;
13367 pad_count += back_pad;
13368 }
13369 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
13370 << back_pad << " on front/back, now 0x" << *offset << "~"
13371 << length << std::dec << dendl;
13372 dout(40) << "after:\n";
13373 bl->hexdump(*_dout);
13374 *_dout << dendl;
13375 if (pad_count)
13376 logger->inc(l_bluestore_write_pad_bytes, pad_count);
13377 ceph_assert(bl->length() == length);
13378 }
13379
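// Handle a write shorter than min_alloc_size: try to service it from an
// existing mutable blob (direct write into unused space, deferred
// read-modify-write of whole chunks, or blob reuse via can_reuse_blob()),
// and fall back to a new blob otherwise. Ranges may also be queued for
// garbage collection when too many small blobs accumulate nearby.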
13380 void BlueStore::_do_write_small(
13381 TransContext *txc,
13382 CollectionRef &c,
13383 OnodeRef o,
13384 uint64_t offset, uint64_t length,
13385 bufferlist::iterator& blp,
13386 WriteContext *wctx)
13387 {
13388 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13389 << std::dec << dendl;
13390 ceph_assert(length < min_alloc_size);
13391 uint64_t end_offs = offset + length;
13392
13393 logger->inc(l_bluestore_write_small);
13394 logger->inc(l_bluestore_write_small_bytes, length);
13395
13396 bufferlist bl;
13397 blp.copy(length, bl);
13398
13399 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13400 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13401 uint32_t alloc_len = min_alloc_size;
13402 auto offset0 = p2align<uint64_t>(offset, alloc_len);
13403
13404 bool any_change;
13405
13406 // search for a suitable extent in both the forward and reverse directions
13407 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
13408 // range, then check whether the blob can be reused via can_reuse_blob() or
13409 // whether a direct/deferred write can be applied (the latter only for
13410 // extents that include 'offset' or lie above it).
13411 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
13412
13413 // Look for an existing mutable blob we can use.
13414 auto begin = o->extent_map.extent_map.begin();
13415 auto end = o->extent_map.extent_map.end();
13416 auto ep = o->extent_map.seek_lextent(offset);
13417 if (ep != begin) {
13418 --ep;
13419 if (ep->blob_end() <= offset) {
13420 ++ep;
13421 }
13422 }
13423 auto prev_ep = ep;
13424 if (prev_ep != begin) {
13425 --prev_ep;
13426 } else {
13427 prev_ep = end; // to avoid this extent check as it's a duplicate
13428 }
13429
13430 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
13431 // We don't want to inspect more blobs than the number of min alloc
13432 // units that fit into 2 max-sized blobs
13433 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
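// e.g., with max_blob_size 0x80000 and min_alloc_size 0x10000 (illustrative
// values only), blob_threshold = 8 * 2 + 1 = 17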
13434 bool above_blob_threshold = false;
13435
13436 inspected_blobs.reserve(blob_threshold);
13437
13438 uint64_t max_off = 0;
13439 auto start_ep = ep;
13440 auto end_ep = ep; // exclusively
13441 do {
13442 any_change = false;
13443
13444 if (ep != end && ep->logical_offset < offset + max_bsize) {
13445 BlobRef b = ep->blob;
13446 if (!above_blob_threshold) {
13447 inspected_blobs.insert(&b->get_blob());
13448 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13449 }
13450 max_off = ep->logical_end();
13451 auto bstart = ep->blob_start();
13452
13453 dout(20) << __func__ << " considering " << *b
13454 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13455 if (bstart >= end_offs) {
13456 dout(20) << __func__ << " ignoring distant " << *b << dendl;
13457 } else if (!b->get_blob().is_mutable()) {
13458 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
13459 } else if (ep->logical_offset % min_alloc_size !=
13460 ep->blob_offset % min_alloc_size) {
13461 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
13462 } else {
13463 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13464 // can we pad our head/tail out with zeros?
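// (e.g., with chunk_size 0x1000, offset 0x1234 and end_offs 0x1a00,
// illustrative values only, head_pad would be 0x234 and tail_pad 0x600)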
13465 uint64_t head_pad, tail_pad;
13466 head_pad = p2phase(offset, chunk_size);
13467 tail_pad = p2nphase(end_offs, chunk_size);
13468 if (head_pad || tail_pad) {
13469 o->extent_map.fault_range(db, offset - head_pad,
13470 end_offs - offset + head_pad + tail_pad);
13471 }
13472 if (head_pad &&
13473 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
13474 head_pad = 0;
13475 }
13476 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
13477 tail_pad = 0;
13478 }
13479
13480 uint64_t b_off = offset - head_pad - bstart;
13481 uint64_t b_len = length + head_pad + tail_pad;
13482
13483 // direct write into unused blocks of an existing mutable blob?
13484 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
13485 b->get_blob().get_ondisk_length() >= b_off + b_len &&
13486 b->get_blob().is_unused(b_off, b_len) &&
13487 b->get_blob().is_allocated(b_off, b_len)) {
13488 _apply_padding(head_pad, tail_pad, bl);
13489
13490 dout(20) << __func__ << " write to unused 0x" << std::hex
13491 << b_off << "~" << b_len
13492 << " pad 0x" << head_pad << " + 0x" << tail_pad
13493 << std::dec << " of mutable " << *b << dendl;
13494 _buffer_cache_write(txc, b, b_off, bl,
13495 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13496
13497 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13498 if (b_len <= prefer_deferred_size) {
13499 dout(20) << __func__ << " deferring small 0x" << std::hex
13500 << b_len << std::dec << " unused write via deferred" << dendl;
13501 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13502 op->op = bluestore_deferred_op_t::OP_WRITE;
13503 b->get_blob().map(
13504 b_off, b_len,
13505 [&](uint64_t offset, uint64_t length) {
13506 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13507 return 0;
13508 });
13509 op->data = bl;
13510 } else {
13511 b->get_blob().map_bl(
13512 b_off, bl,
13513 [&](uint64_t offset, bufferlist& t) {
13514 bdev->aio_write(offset, t,
13515 &txc->ioc, wctx->buffered);
13516 });
13517 }
13518 }
13519 b->dirty_blob().calc_csum(b_off, bl);
13520 dout(20) << __func__ << " lex old " << *ep << dendl;
13521 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
13522 b,
13523 &wctx->old_extents);
13524 b->dirty_blob().mark_used(le->blob_offset, le->length);
13525 txc->statfs_delta.stored() += le->length;
13526 dout(20) << __func__ << " lex " << *le << dendl;
13527 logger->inc(l_bluestore_write_small_unused);
13528 return;
13529 }
13530 // read some data to fill out the chunk?
13531 uint64_t head_read = p2phase(b_off, chunk_size);
13532 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
13533 if ((head_read || tail_read) &&
13534 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
13535 head_read + tail_read < min_alloc_size) {
13536 b_off -= head_read;
13537 b_len += head_read + tail_read;
13538
13539 } else {
13540 head_read = tail_read = 0;
13541 }
13542
13543 // chunk-aligned deferred overwrite?
13544 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
13545 b_off % chunk_size == 0 &&
13546 b_len % chunk_size == 0 &&
13547 b->get_blob().is_allocated(b_off, b_len)) {
13548
13549 _apply_padding(head_pad, tail_pad, bl);
13550
13551 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
13552 << " and tail 0x" << tail_read << std::dec << dendl;
13553 if (head_read) {
13554 bufferlist head_bl;
13555 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
13556 head_bl, 0);
13557 ceph_assert(r >= 0 && r <= (int)head_read);
13558 size_t zlen = head_read - r;
13559 if (zlen) {
13560 head_bl.append_zero(zlen);
13561 logger->inc(l_bluestore_write_pad_bytes, zlen);
13562 }
13563 head_bl.claim_append(bl);
13564 bl.swap(head_bl);
13565 logger->inc(l_bluestore_write_penalty_read_ops);
13566 }
13567 if (tail_read) {
13568 bufferlist tail_bl;
13569 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
13570 tail_bl, 0);
13571 ceph_assert(r >= 0 && r <= (int)tail_read);
13572 size_t zlen = tail_read - r;
13573 if (zlen) {
13574 tail_bl.append_zero(zlen);
13575 logger->inc(l_bluestore_write_pad_bytes, zlen);
13576 }
13577 bl.claim_append(tail_bl);
13578 logger->inc(l_bluestore_write_penalty_read_ops);
13579 }
13580 logger->inc(l_bluestore_write_small_pre_read);
13581
13582 _buffer_cache_write(txc, b, b_off, bl,
13583 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13584
13585 if (b->get_blob().csum_type) {
13586 b->dirty_blob().calc_csum(b_off, bl);
13587 }
13588
13589 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13590 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13591 op->op = bluestore_deferred_op_t::OP_WRITE;
13592 int r = b->get_blob().map(
13593 b_off, b_len,
13594 [&](uint64_t offset, uint64_t length) {
13595 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13596 return 0;
13597 });
13598 ceph_assert(r == 0);
13599 op->data.claim(bl);
13600 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
13601 << b_len << std::dec << " of mutable " << *b
13602 << " at " << op->extents << dendl;
13603 }
13604
13605 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
13606 b, &wctx->old_extents);
13607 b->dirty_blob().mark_used(le->blob_offset, le->length);
13608 txc->statfs_delta.stored() += le->length;
13609 dout(20) << __func__ << " lex " << *le << dendl;
13610 logger->inc(l_bluestore_write_small_deferred);
13611 return;
13612 }
13613 // try to reuse blob if we can
13614 if (b->can_reuse_blob(min_alloc_size,
13615 max_bsize,
13616 offset0 - bstart,
13617 &alloc_len)) {
13618 ceph_assert(alloc_len == min_alloc_size); // expecting data to always
13619 // fit into the reused blob
13620 // Need to check for pending writes that want to reuse the same
13621 // pextent. The rationale is that during GC two chunks from garbage
13622 // blobs (compressed?) can share logical space within the same AU.
13623 // That in turn might be caused by an unaligned len in clone_range2.
13624 // Hence the second write would fail when attempting to reuse the
13625 // blob in _do_alloc_write().
13626 if (!wctx->has_conflict(b,
13627 offset0,
13628 offset0 + alloc_len,
13629 min_alloc_size)) {
13630
13631 // we can't reuse pad_head/pad_tail since they might be truncated
13632 // due to existing extents
13633 uint64_t b_off = offset - bstart;
13634 uint64_t b_off0 = b_off;
13635 _pad_zeros(&bl, &b_off0, chunk_size);
13636
13637 dout(20) << __func__ << " reuse blob " << *b << std::hex
13638 << " (0x" << b_off0 << "~" << bl.length() << ")"
13639 << " (0x" << b_off << "~" << length << ")"
13640 << std::dec << dendl;
13641
13642 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13643 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13644 false, false);
13645 logger->inc(l_bluestore_write_small_unused);
13646 return;
13647 }
13648 }
13649 }
13650 ++ep;
13651 end_ep = ep;
13652 any_change = true;
13653 } // if (ep != end && ep->logical_offset < offset + max_bsize)
13654
13655 // check extent for reuse in reverse order
13656 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13657 BlobRef b = prev_ep->blob;
13658 if (!above_blob_threshold) {
13659 inspected_blobs.insert(&b->get_blob());
13660 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13661 }
13662 start_ep = prev_ep;
13663 auto bstart = prev_ep->blob_start();
13664 dout(20) << __func__ << " considering " << *b
13665 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13666 if (b->can_reuse_blob(min_alloc_size,
13667 max_bsize,
13668 offset0 - bstart,
13669 &alloc_len)) {
13670 ceph_assert(alloc_len == min_alloc_size); // expecting data to always
13671 // fit into the reused blob
13672 // Need to check for pending writes that want to reuse the same
13673 // pextent. The rationale is that during GC two chunks from garbage
13674 // blobs (compressed?) can share logical space within the same AU.
13675 // That in turn might be caused by an unaligned len in clone_range2.
13676 // Hence the second write would fail when attempting to reuse the
13677 // blob in _do_alloc_write().
13678 if (!wctx->has_conflict(b,
13679 offset0,
13680 offset0 + alloc_len,
13681 min_alloc_size)) {
13682
13683 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13684 uint64_t b_off = offset - bstart;
13685 uint64_t b_off0 = b_off;
13686 _pad_zeros(&bl, &b_off0, chunk_size);
13687
13688 dout(20) << __func__ << " reuse blob " << *b << std::hex
13689 << " (0x" << b_off0 << "~" << bl.length() << ")"
13690 << " (0x" << b_off << "~" << length << ")"
13691 << std::dec << dendl;
13692
13693 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13694 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13695 false, false);
13696 logger->inc(l_bluestore_write_small_unused);
13697 return;
13698 }
13699 }
13700 if (prev_ep != begin) {
13701 --prev_ep;
13702 any_change = true;
13703 } else {
13704 prev_ep = end; // to avoid useless first extent re-check
13705 }
13706 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
13707 } while (any_change);
13708
13709 if (above_blob_threshold) {
13710 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
13711 << " " << std::hex << min_off << "~" << max_off << std::dec
13712 << dendl;
13713 ceph_assert(start_ep != end_ep);
13714 for (auto ep = start_ep; ep != end_ep; ++ep) {
13715 dout(20) << __func__ << " inserting for GC "
13716 << std::hex << ep->logical_offset << "~" << ep->length
13717 << std::dec << dendl;
13718
13719 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
13720 }
13721 // insert newly written extent to GC
13722 wctx->extents_to_gc.union_insert(offset, length);
13723 dout(20) << __func__ << " inserting (last) for GC "
13724 << std::hex << offset << "~" << length
13725 << std::dec << dendl;
13726 }
13727 // new blob.
13728 BlobRef b = c->new_blob();
13729 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
13730 uint64_t b_off0 = b_off;
13731 _pad_zeros(&bl, &b_off0, block_size);
13732 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13733 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13734 min_alloc_size != block_size, // only use the 'unused' bitmap when the
13735 // alloc granularity doesn't match the disk block size
13736 true);
13737
13738 return;
13739 }
13740
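// Handle the min_alloc_size-aligned portion of a write: carve it into
// chunks of at most target_blob_size, reusing existing mutable blobs
// where possible (only when not compressing) and allocating new blobs
// otherwise.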
13741 void BlueStore::_do_write_big(
13742 TransContext *txc,
13743 CollectionRef &c,
13744 OnodeRef o,
13745 uint64_t offset, uint64_t length,
13746 bufferlist::iterator& blp,
13747 WriteContext *wctx)
13748 {
13749 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13750 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
13751 << " compress " << (int)wctx->compress
13752 << dendl;
13753 logger->inc(l_bluestore_write_big);
13754 logger->inc(l_bluestore_write_big_bytes, length);
13755 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13756 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13757 while (length > 0) {
13758 bool new_blob = false;
13759 uint32_t l = std::min(max_bsize, length);
13760 BlobRef b;
13761 uint32_t b_off = 0;
13762
13763 // attempting to reuse an existing blob
13764 if (!wctx->compress) {
13765 // look for an existing mutable blob we can reuse
13766 auto begin = o->extent_map.extent_map.begin();
13767 auto end = o->extent_map.extent_map.end();
13768 auto ep = o->extent_map.seek_lextent(offset);
13769 auto prev_ep = ep;
13770 if (prev_ep != begin) {
13771 --prev_ep;
13772 } else {
13773 prev_ep = end; // to avoid this extent check as it's a duplicate
13774 }
13775 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13776 // search for a suitable extent in both the forward and reverse directions
13777 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
13778 // range, then check whether the blob can be reused via can_reuse_blob().
13779 bool any_change;
13780 do {
13781 any_change = false;
13782 if (ep != end && ep->logical_offset < offset + max_bsize) {
13783 if (offset >= ep->blob_start() &&
13784 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13785 offset - ep->blob_start(),
13786 &l)) {
13787 b = ep->blob;
13788 b_off = offset - ep->blob_start();
13789 prev_ep = end; // to avoid check below
13790 dout(20) << __func__ << " reuse blob " << *b << std::hex
13791 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13792 } else {
13793 ++ep;
13794 any_change = true;
13795 }
13796 }
13797
13798 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13799 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13800 offset - prev_ep->blob_start(),
13801 &l)) {
13802 b = prev_ep->blob;
13803 b_off = offset - prev_ep->blob_start();
13804 dout(20) << __func__ << " reuse blob " << *b << std::hex
13805 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13806 } else if (prev_ep != begin) {
13807 --prev_ep;
13808 any_change = true;
13809 } else {
13810 prev_ep = end; // to avoid useless first extent re-check
13811 }
13812 }
13813 } while (b == nullptr && any_change);
13814 }
13815 if (b == nullptr) {
13816 b = c->new_blob();
13817 b_off = 0;
13818 new_blob = true;
13819 }
13820
13821 bufferlist t;
13822 blp.copy(l, t);
13823 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
13824 offset += l;
13825 length -= l;
13826 logger->inc(l_bluestore_write_big_blobs);
13827 }
13828 }
13829
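// For each blob queued in wctx: compress the data if requested and
// worthwhile, allocate space for the final payload, set up checksums,
// record the logical extent, and queue the I/O either as a deferred op
// (small writes) or as a direct aio_write. Returns -ENOSPC if the
// allocator cannot satisfy the request.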
13830 int BlueStore::_do_alloc_write(
13831 TransContext *txc,
13832 CollectionRef coll,
13833 OnodeRef o,
13834 WriteContext *wctx)
13835 {
13836 dout(20) << __func__ << " txc " << txc
13837 << " " << wctx->writes.size() << " blobs"
13838 << dendl;
13839 if (wctx->writes.empty()) {
13840 return 0;
13841 }
13842
13843 CompressorRef c;
13844 double crr = 0;
13845 if (wctx->compress) {
13846 c = select_option(
13847 "compression_algorithm",
13848 compressor,
13849 [&]() {
13850 string val;
13851 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
13852 CompressorRef cp = compressor;
13853 if (!cp || cp->get_type_name() != val) {
13854 cp = Compressor::create(cct, val);
13855 if (!cp) {
13856 if (_set_compression_alert(false, val.c_str())) {
13857 derr << __func__ << " unable to initialize " << val.c_str()
13858 << " compressor" << dendl;
13859 }
13860 }
13861 }
13862 return boost::optional<CompressorRef>(cp);
13863 }
13864 return boost::optional<CompressorRef>();
13865 }
13866 );
13867
13868 crr = select_option(
13869 "compression_required_ratio",
13870 cct->_conf->bluestore_compression_required_ratio,
13871 [&]() {
13872 double val;
13873 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
13874 return boost::optional<double>(val);
13875 }
13876 return boost::optional<double>();
13877 }
13878 );
13879 }
13880
13881 // checksum
13882 int64_t csum = csum_type.load();
13883 csum = select_option(
13884 "csum_type",
13885 csum,
13886 [&]() {
13887 int64_t val;
13888 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
13889 return boost::optional<int64_t>(val);
13890 }
13891 return boost::optional<int64_t>();
13892 }
13893 );
13894
13895 // compress (as needed) and calc needed space
13896 uint64_t need = 0;
13897 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13898 for (auto& wi : wctx->writes) {
13899 if (c && wi.blob_length > min_alloc_size) {
13900 auto start = mono_clock::now();
13901
13902 // compress
13903 ceph_assert(wi.b_off == 0);
13904 ceph_assert(wi.blob_length == wi.bl.length());
13905
13906 // FIXME: memory alignment here is bad
13907 bufferlist t;
13908 int r = c->compress(wi.bl, t);
13909 uint64_t want_len_raw = wi.blob_length * crr;
13910 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
13911 bool rejected = false;
13912 uint64_t compressed_len = t.length();
13913 // do an approximate (fast) estimation for resulting blob size
13914 // that doesn't take header overhead into account
13915 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
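// e.g., assuming blob_length 0x10000, crr 0.875 and min_alloc_size 0x1000
// (illustrative values only): want_len = 0xe000; a compressed_len of 0x9c00
// rounds up to a result_len of 0xa000 and is accepted below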
13916 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
13917 bluestore_compression_header_t chdr;
13918 chdr.type = c->get_type();
13919 chdr.length = t.length();
13920 encode(chdr, wi.compressed_bl);
13921 wi.compressed_bl.claim_append(t);
13922
13923 compressed_len = wi.compressed_bl.length();
13924 result_len = p2roundup(compressed_len, min_alloc_size);
13925 if (result_len <= want_len && result_len < wi.blob_length) {
13926 // Cool. We compressed at least as much as we were hoping to.
13927 // pad out to min_alloc_size
13928 wi.compressed_bl.append_zero(result_len - compressed_len);
13929 wi.compressed_len = compressed_len;
13930 wi.compressed = true;
13931 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
13932 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
13933 << " -> 0x" << compressed_len << " => 0x" << result_len
13934 << " with " << c->get_type()
13935 << std::dec << dendl;
13936 txc->statfs_delta.compressed() += compressed_len;
13937 txc->statfs_delta.compressed_original() += wi.blob_length;
13938 txc->statfs_delta.compressed_allocated() += result_len;
13939 logger->inc(l_bluestore_compress_success_count);
13940 need += result_len;
13941 } else {
13942 rejected = true;
13943 }
13944 } else if (r != 0) {
13945 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
13946 << " bytes compressed using " << c->get_type_name()
13947 << std::dec
13948 << " failed with errcode = " << r
13949 << ", leaving uncompressed"
13950 << dendl;
13951 logger->inc(l_bluestore_compress_rejected_count);
13952 need += wi.blob_length;
13953 } else {
13954 rejected = true;
13955 }
13956
13957 if (rejected) {
13958 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
13959 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
13960 << " with " << c->get_type()
13961 << ", which is more than required 0x" << want_len_raw
13962 << " -> 0x" << want_len
13963 << ", leaving uncompressed"
13964 << std::dec << dendl;
13965 logger->inc(l_bluestore_compress_rejected_count);
13966 need += wi.blob_length;
13967 }
13968 log_latency("compress@_do_alloc_write",
13969 l_bluestore_compress_lat,
13970 mono_clock::now() - start,
13971 cct->_conf->bluestore_log_op_age );
13972 } else {
13973 need += wi.blob_length;
13974 }
13975 }
13976 PExtentVector prealloc;
13977 prealloc.reserve(2 * wctx->writes.size());
13978 int64_t prealloc_left = 0;
13979 prealloc_left = alloc->allocate(
13980 need, min_alloc_size, need,
13981 0, &prealloc);
13982 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
13983 derr << __func__ << " failed to allocate 0x" << std::hex << need
13984 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
13985 << " min_alloc_size 0x" << min_alloc_size
13986 << " available 0x " << alloc->get_free()
13987 << std::dec << dendl;
13988 if (prealloc.size()) {
13989 alloc->release(prealloc);
13990 }
13991 return -ENOSPC;
13992 }
13993 _collect_allocation_stats(need, min_alloc_size, prealloc.size());
13994
13995 dout(20) << __func__ << " prealloc " << prealloc << dendl;
13996 auto prealloc_pos = prealloc.begin();
13997
13998 for (auto& wi : wctx->writes) {
13999 BlobRef b = wi.b;
14000 bluestore_blob_t& dblob = b->dirty_blob();
14001 uint64_t b_off = wi.b_off;
14002 bufferlist *l = &wi.bl;
14003 uint64_t final_length = wi.blob_length;
14004 uint64_t csum_length = wi.blob_length;
14005 if (wi.compressed) {
14006 final_length = wi.compressed_bl.length();
14007 csum_length = final_length;
14008 unsigned csum_order = ctz(csum_length);
14009 l = &wi.compressed_bl;
14010 dblob.set_compressed(wi.blob_length, wi.compressed_len);
14011 if (csum != Checksummer::CSUM_NONE) {
14012 dout(20) << __func__ << " initialize csum setting for compressed blob " << *b
14013 << " csum_type " << Checksummer::get_csum_type_string(csum)
14014 << " csum_order " << csum_order
14015 << " csum_length 0x" << std::hex << csum_length
14016 << " blob_length 0x" << wi.blob_length
14017 << " compressed_length 0x" << wi.compressed_len << std::dec
14018 << dendl;
14019 dblob.init_csum(csum, csum_order, csum_length);
14020 }
14021 } else if (wi.new_blob) {
14022 unsigned csum_order;
14023 // initialize newly created blob only
14024 ceph_assert(dblob.is_mutable());
14025 if (l->length() != wi.blob_length) {
14026 // hrm, maybe we could do better here, but let's not bother.
14027 dout(20) << __func__ << " forcing csum_order to block_size_order "
14028 << block_size_order << dendl;
14029 csum_order = block_size_order;
14030 } else {
14031 csum_order = std::min(wctx->csum_order, ctz(l->length()));
14032 }
14033 // try to align blob with max_blob_size to improve
14034 // its reuse ratio, e.g. in case of reverse write
14035 uint32_t suggested_boff =
14036 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
14037 if ((suggested_boff % (1 << csum_order)) == 0 &&
14038 suggested_boff + final_length <= max_bsize &&
14039 suggested_boff > b_off) {
14040 dout(20) << __func__ << " forcing blob_offset to 0x"
14041 << std::hex << suggested_boff << std::dec << dendl;
14042 ceph_assert(suggested_boff >= b_off);
14043 csum_length += suggested_boff - b_off;
14044 b_off = suggested_boff;
14045 }
14046 if (csum != Checksummer::CSUM_NONE) {
14047 dout(20) << __func__ << " initialize csum setting for new blob " << *b
14048 << " csum_type " << Checksummer::get_csum_type_string(csum)
14049 << " csum_order " << csum_order
14050 << " csum_length 0x" << std::hex << csum_length << std::dec
14051 << dendl;
14052 dblob.init_csum(csum, csum_order, csum_length);
14053 }
14054 }
14055
14056 PExtentVector extents;
14057 int64_t left = final_length;
14058 while (left > 0) {
14059 ceph_assert(prealloc_left > 0);
14060 if (prealloc_pos->length <= left) {
14061 prealloc_left -= prealloc_pos->length;
14062 left -= prealloc_pos->length;
14063 txc->statfs_delta.allocated() += prealloc_pos->length;
14064 extents.push_back(*prealloc_pos);
14065 ++prealloc_pos;
14066 } else {
14067 extents.emplace_back(prealloc_pos->offset, left);
14068 prealloc_pos->offset += left;
14069 prealloc_pos->length -= left;
14070 prealloc_left -= left;
14071 txc->statfs_delta.allocated() += left;
14072 left = 0;
14073 break;
14074 }
14075 }
14076 for (auto& p : extents) {
14077 txc->allocated.insert(p.offset, p.length);
14078 }
14079 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
14080
14081 dout(20) << __func__ << " blob " << *b << dendl;
14082 if (dblob.has_csum()) {
14083 dblob.calc_csum(b_off, *l);
14084 }
14085
14086 if (wi.mark_unused) {
14087 ceph_assert(!dblob.is_compressed());
14088 auto b_end = b_off + wi.bl.length();
14089 if (b_off) {
14090 dblob.add_unused(0, b_off);
14091 }
14092 uint64_t llen = dblob.get_logical_length();
14093 if (b_end < llen) {
14094 dblob.add_unused(b_end, llen - b_end);
14095 }
14096 }
14097
14098 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
14099 b_off + (wi.b_off0 - wi.b_off),
14100 wi.length0,
14101 wi.b,
14102 nullptr);
14103 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
14104 txc->statfs_delta.stored() += le->length;
14105 dout(20) << __func__ << " lex " << *le << dendl;
14106 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
14107 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14108
14109 // queue io
14110 if (!g_conf()->bluestore_debug_omit_block_device_write) {
14111 if (l->length() <= prefer_deferred_size.load()) {
14112 dout(20) << __func__ << " deferring small 0x" << std::hex
14113 << l->length() << std::dec << " write via deferred" << dendl;
14114 bluestore_deferred_op_t *op = _get_deferred_op(txc);
14115 op->op = bluestore_deferred_op_t::OP_WRITE;
14116 int r = b->get_blob().map(
14117 b_off, l->length(),
14118 [&](uint64_t offset, uint64_t length) {
14119 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14120 return 0;
14121 });
14122 ceph_assert(r == 0);
14123 op->data = *l;
14124 logger->inc(l_bluestore_write_small_deferred);
14125 } else {
14126 b->get_blob().map_bl(
14127 b_off, *l,
14128 [&](uint64_t offset, bufferlist& t) {
14129 bdev->aio_write(offset, t, &txc->ioc, false);
14130 });
14131 logger->inc(l_bluestore_write_small_new);
14132 }
14133 }
14134 }
14135 ceph_assert(prealloc_pos == prealloc.end());
14136 ceph_assert(prealloc_left == 0);
14137 return 0;
14138 }
14139
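// Release the old extents displaced by a write: update statfs counters,
// drop shared-blob references (recording blobs that may become unshared
// in maybe_unshared_blobs), return freed space via txc->released, and
// drop empty spanning blobs from the extent map.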
14140 void BlueStore::_wctx_finish(
14141 TransContext *txc,
14142 CollectionRef& c,
14143 OnodeRef o,
14144 WriteContext *wctx,
14145 set<SharedBlob*> *maybe_unshared_blobs)
14146 {
14147 auto oep = wctx->old_extents.begin();
14148 while (oep != wctx->old_extents.end()) {
14149 auto &lo = *oep;
14150 oep = wctx->old_extents.erase(oep);
14151 dout(20) << __func__ << " lex_old " << lo.e << dendl;
14152 BlobRef b = lo.e.blob;
14153 const bluestore_blob_t& blob = b->get_blob();
14154 if (blob.is_compressed()) {
14155 if (lo.blob_empty) {
14156 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
14157 }
14158 txc->statfs_delta.compressed_original() -= lo.e.length;
14159 }
14160 auto& r = lo.r;
14161 txc->statfs_delta.stored() -= lo.e.length;
14162 if (!r.empty()) {
14163 dout(20) << __func__ << " blob release " << r << dendl;
14164 if (blob.is_shared()) {
14165 PExtentVector final;
14166 c->load_shared_blob(b->shared_blob);
14167 bool unshare = false;
14168 bool* unshare_ptr =
14169 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
14170 for (auto e : r) {
14171 b->shared_blob->put_ref(
14172 e.offset, e.length, &final,
14173 unshare_ptr);
14174 }
14175 if (unshare) {
14176 ceph_assert(maybe_unshared_blobs);
14177 maybe_unshared_blobs->insert(b->shared_blob.get());
14178 }
14179 dout(20) << __func__ << " shared_blob release " << final
14180 << " from " << *b->shared_blob << dendl;
14181 txc->write_shared_blob(b->shared_blob);
14182 r.clear();
14183 r.swap(final);
14184 }
14185 }
14186 // we can't invalidate our logical extents as we drop them because
14187 // other lextents (either in our onode or others) may still
14188 // reference them. but we can throw out anything that is no
14189 // longer allocated. Note that this will leave behind edge bits
14190 // that are no longer referenced but not deallocated (until they
14191 // age out of the cache naturally).
14192 b->discard_unallocated(c.get());
14193 for (auto e : r) {
14194 dout(20) << __func__ << " release " << e << dendl;
14195 txc->released.insert(e.offset, e.length);
14196 txc->statfs_delta.allocated() -= e.length;
14197 if (blob.is_compressed()) {
14198 txc->statfs_delta.compressed_allocated() -= e.length;
14199 }
14200 }
14201
14202 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
14203 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
14204 << dendl;
14205 o->extent_map.spanning_blob_map.erase(b->id);
14206 }
14207 delete &lo;
14208 }
14209 }
14210
14211 void BlueStore::_do_write_data(
14212 TransContext *txc,
14213 CollectionRef& c,
14214 OnodeRef o,
14215 uint64_t offset,
14216 uint64_t length,
14217 bufferlist& bl,
14218 WriteContext *wctx)
14219 {
14220 uint64_t end = offset + length;
14221 bufferlist::iterator p = bl.begin();
14222
14223 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
14224 (length != min_alloc_size)) {
14225 // we fall within the same block
14226 _do_write_small(txc, c, o, offset, length, p, wctx);
14227 } else {
14228 uint64_t head_offset, head_length;
14229 uint64_t middle_offset, middle_length;
14230 uint64_t tail_offset, tail_length;
14231
14232 head_offset = offset;
14233 head_length = p2nphase(offset, min_alloc_size);
14234
14235 tail_offset = p2align(end, min_alloc_size);
14236 tail_length = p2phase(end, min_alloc_size);
14237
14238 middle_offset = head_offset + head_length;
14239 middle_length = length - head_length - tail_length;
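// e.g., with min_alloc_size 0x10000, offset 0x12000 and length 0x30000
// (illustrative values only): head 0x12000~0xe000, middle 0x20000~0x20000,
// tail 0x40000~0x2000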
14240
14241 if (head_length) {
14242 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
14243 }
14244
14245 if (middle_length) {
14246 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
14247 }
14248
14249 if (tail_length) {
14250 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
14251 }
14252 }
14253 }
14254
14255 void BlueStore::_choose_write_options(
14256 CollectionRef& c,
14257 OnodeRef o,
14258 uint32_t fadvise_flags,
14259 WriteContext *wctx)
14260 {
14261 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
14262 dout(20) << __func__ << " will do buffered write" << dendl;
14263 wctx->buffered = true;
14264 } else if (cct->_conf->bluestore_default_buffered_write &&
14265 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
14266 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
14267 dout(20) << __func__ << " defaulting to buffered write" << dendl;
14268 wctx->buffered = true;
14269 }
14270
14271 // apply basic csum block size
14272 wctx->csum_order = block_size_order;
14273
14274 // compression parameters
14275 unsigned alloc_hints = o->onode.alloc_hint_flags;
14276 auto cm = select_option(
14277 "compression_mode",
14278 comp_mode.load(),
14279 [&]() {
14280 string val;
14281 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
14282 return boost::optional<Compressor::CompressionMode>(
14283 Compressor::get_comp_mode_type(val));
14284 }
14285 return boost::optional<Compressor::CompressionMode>();
14286 }
14287 );
14288
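// i.e. compress when the mode is 'force'; when it is 'aggressive' unless
// the object is hinted incompressible; or when it is 'passive' and the
// object is hinted compressible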
14289 wctx->compress = (cm != Compressor::COMP_NONE) &&
14290 ((cm == Compressor::COMP_FORCE) ||
14291 (cm == Compressor::COMP_AGGRESSIVE &&
14292 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
14293 (cm == Compressor::COMP_PASSIVE &&
14294 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
14295
14296 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
14297 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
14298 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
14299 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
14300 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
14301
14302 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
14303
14304 if (o->onode.expected_write_size) {
14305 wctx->csum_order = std::max(min_alloc_size_order,
14306 (uint8_t)ctz(o->onode.expected_write_size));
14307 } else {
14308 wctx->csum_order = min_alloc_size_order;
14309 }
14310
14311 if (wctx->compress) {
14312 wctx->target_blob_size = select_option(
14313 "compression_max_blob_size",
14314 comp_max_blob_size.load(),
14315 [&]() {
14316 int64_t val;
14317 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
14318 return boost::optional<uint64_t>((uint64_t)val);
14319 }
14320 return boost::optional<uint64_t>();
14321 }
14322 );
14323 }
14324 } else {
14325 if (wctx->compress) {
14326 wctx->target_blob_size = select_option(
14327 "compression_min_blob_size",
14328 comp_min_blob_size.load(),
14329 [&]() {
14330 int64_t val;
14331 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
14332 return boost::optional<uint64_t>((uint64_t)val);
14333 }
14334 return boost::optional<uint64_t>();
14335 }
14336 );
14337 }
14338 }
14339
14340 uint64_t max_bsize = max_blob_size.load();
14341 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
14342 wctx->target_blob_size = max_bsize;
14343 }
14344
14345 // set the min blob size floor at 2x the min_alloc_size, or else we
14346 // won't be able to allocate a smaller extent for the compressed
14347 // data.
14348 if (wctx->compress &&
14349 wctx->target_blob_size < min_alloc_size * 2) {
14350 wctx->target_blob_size = min_alloc_size * 2;
14351 }
14352
14353 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
14354 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
14355 << " compress=" << (int)wctx->compress
14356 << " buffered=" << (int)wctx->buffered
14357 << std::dec << dendl;
14358 }
14359
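// Rewrite the ranges accumulated in wctx.extents_to_gc: read each range
// back, write it out again through a cloned WriteContext, and widen the
// caller's dirty range to cover whatever was rewritten.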
14360 int BlueStore::_do_gc(
14361 TransContext *txc,
14362 CollectionRef& c,
14363 OnodeRef o,
14364 const WriteContext& wctx,
14365 uint64_t *dirty_start,
14366 uint64_t *dirty_end)
14367 {
14368
14369 bool dirty_range_updated = false;
14370 WriteContext wctx_gc;
14371 wctx_gc.fork(wctx); // make a clone for garbage collection
14372
14373 auto & extents_to_collect = wctx.extents_to_gc;
14374 for (auto it = extents_to_collect.begin();
14375 it != extents_to_collect.end();
14376 ++it) {
14377 bufferlist bl;
14378 auto offset = (*it).first;
14379 auto length = (*it).second;
14380 dout(20) << __func__ << " processing " << std::hex
14381 << offset << "~" << length << std::dec
14382 << dendl;
14383 int r = _do_read(c.get(), o, offset, length, bl, 0);
14384 ceph_assert(r == (int)length);
14385
14386 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
14387 logger->inc(l_bluestore_gc_merged, length);
14388
14389 if (*dirty_start > offset) {
14390 *dirty_start = offset;
14391 dirty_range_updated = true;
14392 }
14393
14394 if (*dirty_end < offset + length) {
14395 *dirty_end = offset + length;
14396 dirty_range_updated = true;
14397 }
14398 }
14399 if (dirty_range_updated) {
14400 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
14401 }
14402
14403 dout(30) << __func__ << " alloc write" << dendl;
14404 int r = _do_alloc_write(txc, c, o, &wctx_gc);
14405 if (r < 0) {
14406 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14407 << dendl;
14408 return r;
14409 }
14410
14411 _wctx_finish(txc, c, o, &wctx_gc);
14412 return 0;
14413 }
14414
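// Top-level write path: choose write options, split the write into
// small/big parts, allocate and queue the I/O, then optionally garbage
// collect neighbouring (e.g. compressed) extents when the estimated
// benefit reaches bluestore_gc_enable_total_threshold.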
14415 int BlueStore::_do_write(
14416 TransContext *txc,
14417 CollectionRef& c,
14418 OnodeRef o,
14419 uint64_t offset,
14420 uint64_t length,
14421 bufferlist& bl,
14422 uint32_t fadvise_flags)
14423 {
14424 int r = 0;
14425
14426 dout(20) << __func__
14427 << " " << o->oid
14428 << " 0x" << std::hex << offset << "~" << length
14429 << " - have 0x" << o->onode.size
14430 << " (" << std::dec << o->onode.size << ")"
14431 << " bytes"
14432 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
14433 << dendl;
14434 _dump_onode<30>(cct, *o);
14435
14436 if (length == 0) {
14437 return 0;
14438 }
14439
14440 uint64_t end = offset + length;
14441
14442 GarbageCollector gc(c->store->cct);
14443 int64_t benefit = 0;
14444 auto dirty_start = offset;
14445 auto dirty_end = end;
14446
14447 WriteContext wctx;
14448 _choose_write_options(c, o, fadvise_flags, &wctx);
14449 o->extent_map.fault_range(db, offset, length);
14450 _do_write_data(txc, c, o, offset, length, bl, &wctx);
14451 r = _do_alloc_write(txc, c, o, &wctx);
14452 if (r < 0) {
14453 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14454 << dendl;
14455 goto out;
14456 }
14457
14458 if (wctx.extents_to_gc.empty() ||
14459 wctx.extents_to_gc.range_start() > offset ||
14460 wctx.extents_to_gc.range_end() < offset + length) {
14461 benefit = gc.estimate(offset,
14462 length,
14463 o->extent_map,
14464 wctx.old_extents,
14465 min_alloc_size);
14466 }
14467
14468 // NB: _wctx_finish() will empty old_extents
14469 // so we must do gc estimation before that
14470 _wctx_finish(txc, c, o, &wctx);
14471 if (end > o->onode.size) {
14472 dout(20) << __func__ << " extending size to 0x" << std::hex << end
14473 << std::dec << dendl;
14474 o->onode.size = end;
14475 }
14476
14477 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
14478 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
14479 dout(20) << __func__
14480 << " perform garbage collection for compressed extents, "
14481 << "expected benefit = " << benefit << " AUs" << dendl;
14482 }
14483 if (!wctx.extents_to_gc.empty()) {
14484 dout(20) << __func__ << " perform garbage collection" << dendl;
14485
14486 r = _do_gc(txc, c, o,
14487 wctx,
14488 &dirty_start, &dirty_end);
14489 if (r < 0) {
14490 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
14491 << dendl;
14492 goto out;
14493 }
14494 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
14495 << "~" << dirty_end - dirty_start << std::dec << dendl;
14496 }
14497 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
14498 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
14499
14500 r = 0;
14501
14502 out:
14503 return r;
14504 }
14505
14506 int BlueStore::_write(TransContext *txc,
14507 CollectionRef& c,
14508 OnodeRef& o,
14509 uint64_t offset, size_t length,
14510 bufferlist& bl,
14511 uint32_t fadvise_flags)
14512 {
14513 dout(15) << __func__ << " " << c->cid << " " << o->oid
14514 << " 0x" << std::hex << offset << "~" << length << std::dec
14515 << dendl;
14516 int r = 0;
14517 if (offset + length >= OBJECT_MAX_SIZE) {
14518 r = -E2BIG;
14519 } else {
14520 _assign_nid(txc, o);
14521 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
14522 txc->write_onode(o);
14523 }
14524 dout(10) << __func__ << " " << c->cid << " " << o->oid
14525 << " 0x" << std::hex << offset << "~" << length << std::dec
14526 << " = " << r << dendl;
14527 return r;
14528 }
14529
14530 int BlueStore::_zero(TransContext *txc,
14531 CollectionRef& c,
14532 OnodeRef& o,
14533 uint64_t offset, size_t length)
14534 {
14535 dout(15) << __func__ << " " << c->cid << " " << o->oid
14536 << " 0x" << std::hex << offset << "~" << length << std::dec
14537 << dendl;
14538 int r = 0;
14539 if (offset + length >= OBJECT_MAX_SIZE) {
14540 r = -E2BIG;
14541 } else {
14542 _assign_nid(txc, o);
14543 r = _do_zero(txc, c, o, offset, length);
14544 }
14545 dout(10) << __func__ << " " << c->cid << " " << o->oid
14546 << " 0x" << std::hex << offset << "~" << length << std::dec
14547 << " = " << r << dendl;
14548 return r;
14549 }
14550
14551 int BlueStore::_do_zero(TransContext *txc,
14552 CollectionRef& c,
14553 OnodeRef& o,
14554 uint64_t offset, size_t length)
14555 {
14556 dout(15) << __func__ << " " << c->cid << " " << o->oid
14557 << " 0x" << std::hex << offset << "~" << length << std::dec
14558 << dendl;
14559 int r = 0;
14560
14561 _dump_onode<30>(cct, *o);
14562
14563 WriteContext wctx;
14564 o->extent_map.fault_range(db, offset, length);
14565 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
14566 o->extent_map.dirty_range(offset, length);
14567 _wctx_finish(txc, c, o, &wctx);
14568
14569 if (length > 0 && offset + length > o->onode.size) {
14570 o->onode.size = offset + length;
14571 dout(20) << __func__ << " extending size to " << offset + length
14572 << dendl;
14573 }
14574 txc->write_onode(o);
14575
14576 dout(10) << __func__ << " " << c->cid << " " << o->oid
14577 << " 0x" << std::hex << offset << "~" << length << std::dec
14578 << " = " << r << dendl;
14579 return r;
14580 }
14581
14582 void BlueStore::_do_truncate(
14583 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
14584 set<SharedBlob*> *maybe_unshared_blobs)
14585 {
14586 dout(15) << __func__ << " " << c->cid << " " << o->oid
14587 << " 0x" << std::hex << offset << std::dec << dendl;
14588
14589 _dump_onode<30>(cct, *o);
14590
14591 if (offset == o->onode.size)
14592 return;
14593
14594 if (offset < o->onode.size) {
14595 WriteContext wctx;
14596 uint64_t length = o->onode.size - offset;
14597 o->extent_map.fault_range(db, offset, length);
14598 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
14599 o->extent_map.dirty_range(offset, length);
14600 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
14601
14602 // if we have shards past EOF, ask for a reshard
14603 if (!o->onode.extent_map_shards.empty() &&
14604 o->onode.extent_map_shards.back().offset >= offset) {
14605 dout(10) << __func__ << " request reshard past EOF" << dendl;
14606 if (offset) {
14607 o->extent_map.request_reshard(offset - 1, offset + length);
14608 } else {
14609 o->extent_map.request_reshard(0, length);
14610 }
14611 }
14612 }
14613
14614 o->onode.size = offset;
14615
14616 txc->write_onode(o);
14617 }
14618
14619 int BlueStore::_truncate(TransContext *txc,
14620 CollectionRef& c,
14621 OnodeRef& o,
14622 uint64_t offset)
14623 {
14624 dout(15) << __func__ << " " << c->cid << " " << o->oid
14625 << " 0x" << std::hex << offset << std::dec
14626 << dendl;
14627 int r = 0;
14628 if (offset >= OBJECT_MAX_SIZE) {
14629 r = -E2BIG;
14630 } else {
14631 _do_truncate(txc, c, o, offset);
14632 }
14633 dout(10) << __func__ << " " << c->cid << " " << o->oid
14634 << " 0x" << std::hex << offset << std::dec
14635 << " = " << r << dendl;
14636 return r;
14637 }
14638
14639 int BlueStore::_do_remove(
14640 TransContext *txc,
14641 CollectionRef& c,
14642 OnodeRef o)
14643 {
14644 set<SharedBlob*> maybe_unshared_blobs;
14645 bool is_gen = !o->oid.is_no_gen();
14646 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
14647 if (o->onode.has_omap()) {
14648 o->flush();
14649 _do_omap_clear(txc, o);
14650 }
14651 o->exists = false;
14652 string key;
14653 for (auto &s : o->extent_map.shards) {
14654 dout(20) << __func__ << " removing shard 0x" << std::hex
14655 << s.shard_info->offset << std::dec << dendl;
14656 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
14657 [&](const string& final_key) {
14658 txc->t->rmkey(PREFIX_OBJ, final_key);
14659 }
14660 );
14661 }
14662 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
14663 txc->note_removed_object(o);
14664 o->extent_map.clear();
14665 o->onode = bluestore_onode_t();
14666 _debug_obj_on_delete(o->oid);
14667
14668 if (!is_gen || maybe_unshared_blobs.empty()) {
14669 return 0;
14670 }
14671
14672 // see if we can unshare blobs still referenced by the head
14673 dout(10) << __func__ << " gen and maybe_unshared_blobs "
14674 << maybe_unshared_blobs << dendl;
14675 ghobject_t nogen = o->oid;
14676 nogen.generation = ghobject_t::NO_GEN;
14677 OnodeRef h = c->onode_map.lookup(nogen);
14678
14679 if (!h || !h->exists) {
14680 return 0;
14681 }
14682
14683 dout(20) << __func__ << " checking for unshareable blobs on " << h
14684 << " " << h->oid << dendl;
14685 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
14686 for (auto& e : h->extent_map.extent_map) {
14687 const bluestore_blob_t& b = e.blob->get_blob();
14688 SharedBlob *sb = e.blob->shared_blob.get();
14689 if (b.is_shared() &&
14690 sb->loaded &&
14691 maybe_unshared_blobs.count(sb)) {
14692 if (b.is_compressed()) {
14693 expect[sb].get(0, b.get_ondisk_length());
14694 } else {
14695 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
14696 expect[sb].get(off, len);
14697 return 0;
14698 });
14699 }
14700 }
14701 }
14702
14703 vector<SharedBlob*> unshared_blobs;
14704 unshared_blobs.reserve(maybe_unshared_blobs.size());
14705 for (auto& p : expect) {
14706 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
14707 if (p.first->persistent->ref_map == p.second) {
14708 SharedBlob *sb = p.first;
14709 dout(20) << __func__ << " unsharing " << *sb << dendl;
14710 unshared_blobs.push_back(sb);
14711 txc->unshare_blob(sb);
14712 uint64_t sbid = c->make_blob_unshared(sb);
14713 string key;
14714 get_shared_blob_key(sbid, &key);
14715 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
14716 }
14717 }
14718
14719 if (unshared_blobs.empty()) {
14720 return 0;
14721 }
14722
14723 for (auto& e : h->extent_map.extent_map) {
14724 const bluestore_blob_t& b = e.blob->get_blob();
14725 SharedBlob *sb = e.blob->shared_blob.get();
14726 if (b.is_shared() &&
14727 std::find(unshared_blobs.begin(), unshared_blobs.end(),
14728 sb) != unshared_blobs.end()) {
14729 dout(20) << __func__ << " unsharing " << e << dendl;
14730 bluestore_blob_t& blob = e.blob->dirty_blob();
14731 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
14732 h->extent_map.dirty_range(e.logical_offset, 1);
14733 }
14734 }
14735 txc->write_onode(h);
14736
14737 return 0;
14738 }
14739
14740 int BlueStore::_remove(TransContext *txc,
14741 CollectionRef& c,
14742 OnodeRef &o)
14743 {
14744 dout(15) << __func__ << " " << c->cid << " " << o->oid
14745 << " onode " << o.get()
14746 << " txc "<< txc << dendl;
14747
14748 auto start_time = mono_clock::now();
14749 int r = _do_remove(txc, c, o);
14750 log_latency_fn(
14751 __func__,
14752 l_bluestore_remove_lat,
14753 mono_clock::now() - start_time,
14754 cct->_conf->bluestore_log_op_age,
14755 [&](const ceph::timespan& lat) {
14756 ostringstream ostr;
14757 ostr << ", lat = " << timespan_str(lat)
14758 << " cid =" << c->cid
14759 << " oid =" << o->oid;
14760 return ostr.str();
14761 }
14762 );
14763
14764 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14765 return r;
14766 }
14767
14768 int BlueStore::_setattr(TransContext *txc,
14769 CollectionRef& c,
14770 OnodeRef& o,
14771 const string& name,
14772 bufferptr& val)
14773 {
14774 dout(15) << __func__ << " " << c->cid << " " << o->oid
14775 << " " << name << " (" << val.length() << " bytes)"
14776 << dendl;
14777 int r = 0;
14778 if (val.is_partial()) {
14779 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
14780 val.length());
14781 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14782 } else {
14783 auto& b = o->onode.attrs[name.c_str()] = val;
14784 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14785 }
14786 txc->write_onode(o);
14787 dout(10) << __func__ << " " << c->cid << " " << o->oid
14788 << " " << name << " (" << val.length() << " bytes)"
14789 << " = " << r << dendl;
14790 return r;
14791 }
14792
14793 int BlueStore::_setattrs(TransContext *txc,
14794 CollectionRef& c,
14795 OnodeRef& o,
14796 const map<string,bufferptr>& aset)
14797 {
14798 dout(15) << __func__ << " " << c->cid << " " << o->oid
14799 << " " << aset.size() << " keys"
14800 << dendl;
14801 int r = 0;
14802 for (map<string,bufferptr>::const_iterator p = aset.begin();
14803 p != aset.end(); ++p) {
14804 if (p->second.is_partial()) {
14805 auto& b = o->onode.attrs[p->first.c_str()] =
14806 bufferptr(p->second.c_str(), p->second.length());
14807 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14808 } else {
14809 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
14810 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14811 }
14812 }
14813 txc->write_onode(o);
14814 dout(10) << __func__ << " " << c->cid << " " << o->oid
14815 << " " << aset.size() << " keys"
14816 << " = " << r << dendl;
14817 return r;
14818 }
14819
14820
14821 int BlueStore::_rmattr(TransContext *txc,
14822 CollectionRef& c,
14823 OnodeRef& o,
14824 const string& name)
14825 {
14826 dout(15) << __func__ << " " << c->cid << " " << o->oid
14827 << " " << name << dendl;
14828 int r = 0;
14829 auto it = o->onode.attrs.find(name.c_str());
14830 if (it == o->onode.attrs.end())
14831 goto out;
14832
14833 o->onode.attrs.erase(it);
14834 txc->write_onode(o);
14835
14836 out:
14837 dout(10) << __func__ << " " << c->cid << " " << o->oid
14838 << " " << name << " = " << r << dendl;
14839 return r;
14840 }
14841
14842 int BlueStore::_rmattrs(TransContext *txc,
14843 CollectionRef& c,
14844 OnodeRef& o)
14845 {
14846 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14847 int r = 0;
14848
14849 if (o->onode.attrs.empty())
14850 goto out;
14851
14852 o->onode.attrs.clear();
14853 txc->write_onode(o);
14854
14855 out:
14856 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14857 return r;
14858 }
14859
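// Remove all omap data for an onode: drop every key in the object's omap
// range (from its header key up to its tail sentinel) and then the tail
// sentinel itself.  Callers are responsible for clearing the onode's omap
// flag and persisting the onode.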
14860 void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
14861 {
14862 const string& omap_prefix = o->get_omap_prefix();
14863 string prefix, tail;
14864 o->get_omap_header(&prefix);
14865 o->get_omap_tail(&tail);
14866 txc->t->rm_range_keys(omap_prefix, prefix, tail);
14867 txc->t->rmkey(omap_prefix, tail);
14868 dout(20) << __func__ << " remove range start: "
14869 << pretty_binary_string(prefix) << " end: "
14870 << pretty_binary_string(tail) << dendl;
14871 }
14872
14873 int BlueStore::_omap_clear(TransContext *txc,
14874 CollectionRef& c,
14875 OnodeRef& o)
14876 {
14877 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14878 int r = 0;
14879 if (o->onode.has_omap()) {
14880 o->flush();
14881 _do_omap_clear(txc, o);
14882 o->onode.clear_omap_flag();
14883 txc->write_onode(o);
14884 }
14885 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14886 return r;
14887 }
14888
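// Set omap keys from an encoded bufferlist: a __u32 count followed by that
// many (key, value) pairs.  Each key is rewritten with the onode's omap
// prefix before being stored.  If the object has no omap yet, the omap
// flags and the tail sentinel key are initialized first.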
14889 int BlueStore::_omap_setkeys(TransContext *txc,
14890 CollectionRef& c,
14891 OnodeRef& o,
14892 bufferlist &bl)
14893 {
14894 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14895 int r;
14896 auto p = bl.cbegin();
14897 __u32 num;
14898 if (!o->onode.has_omap()) {
14899 if (o->oid.is_pgmeta()) {
14900 o->onode.set_omap_flags_pgmeta();
14901 } else {
14902 o->onode.set_omap_flags();
14903 }
14904 txc->write_onode(o);
14905
14906 const string& prefix = o->get_omap_prefix();
14907 string key_tail;
14908 bufferlist tail;
14909 o->get_omap_tail(&key_tail);
14910 txc->t->set(prefix, key_tail, tail);
14911 } else {
14912 txc->note_modified_object(o);
14913 }
14914 const string& prefix = o->get_omap_prefix();
14915 string final_key;
14916 o->get_omap_key(string(), &final_key);
14917 size_t base_key_len = final_key.size();
14918 decode(num, p);
14919 while (num--) {
14920 string key;
14921 bufferlist value;
14922 decode(key, p);
14923 decode(value, p);
14924 final_key.resize(base_key_len); // keep prefix
14925 final_key += key;
14926 dout(20) << __func__ << " " << pretty_binary_string(final_key)
14927 << " <- " << key << dendl;
14928 txc->t->set(prefix, final_key, value);
14929 }
14930 r = 0;
14931 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14932 return r;
14933 }
14934
14935 int BlueStore::_omap_setheader(TransContext *txc,
14936 CollectionRef& c,
14937 OnodeRef &o,
14938 bufferlist& bl)
14939 {
14940 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14941 int r;
14942 string key;
14943 if (!o->onode.has_omap()) {
14944 if (o->oid.is_pgmeta()) {
14945 o->onode.set_omap_flags_pgmeta();
14946 } else {
14947 o->onode.set_omap_flags();
14948 }
14949 txc->write_onode(o);
14950
14951 const string& prefix = o->get_omap_prefix();
14952 string key_tail;
14953 bufferlist tail;
14954 o->get_omap_tail(&key_tail);
14955 txc->t->set(prefix, key_tail, tail);
14956 } else {
14957 txc->note_modified_object(o);
14958 }
14959 const string& prefix = o->get_omap_prefix();
14960 o->get_omap_header(&key);
14961 txc->t->set(prefix, key, bl);
14962 r = 0;
14963 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14964 return r;
14965 }
14966
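// Remove omap keys from an encoded bufferlist: a __u32 count followed by
// that many key names.  Each name is prefixed with the onode's omap key
// prefix and removed; this is a no-op if the object has no omap.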
14967 int BlueStore::_omap_rmkeys(TransContext *txc,
14968 CollectionRef& c,
14969 OnodeRef& o,
14970 bufferlist& bl)
14971 {
14972 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14973 int r = 0;
14974 auto p = bl.cbegin();
14975 __u32 num;
14976 string final_key;
14977
14978 if (!o->onode.has_omap()) {
14979 goto out;
14980 }
14981 {
14982 const string& prefix = o->get_omap_prefix();
14983 o->get_omap_key(string(), &final_key);
14984 size_t base_key_len = final_key.size();
14985 decode(num, p);
14986 while (num--) {
14987 string key;
14988 decode(key, p);
14989 final_key.resize(base_key_len); // keep prefix
14990 final_key += key;
14991 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
14992 << " <- " << key << dendl;
14993 txc->t->rmkey(prefix, final_key);
14994 }
14995 }
14996 txc->note_modified_object(o);
14997
14998 out:
14999 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15000 return r;
15001 }
15002
15003 int BlueStore::_omap_rmkey_range(TransContext *txc,
15004 CollectionRef& c,
15005 OnodeRef& o,
15006 const string& first, const string& last)
15007 {
15008 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15009 string key_first, key_last;
15010 int r = 0;
15011 if (!o->onode.has_omap()) {
15012 goto out;
15013 }
15014 {
15015 const string& prefix = o->get_omap_prefix();
15016 o->flush();
15017 o->get_omap_key(first, &key_first);
15018 o->get_omap_key(last, &key_last);
15019 txc->t->rm_range_keys(prefix, key_first, key_last);
15020 dout(20) << __func__ << " remove range start: "
15021 << pretty_binary_string(key_first) << " end: "
15022 << pretty_binary_string(key_last) << dendl;
15023 }
15024 txc->note_modified_object(o);
15025
15026 out:
15027 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15028 return r;
15029 }
15030
15031 int BlueStore::_set_alloc_hint(
15032 TransContext *txc,
15033 CollectionRef& c,
15034 OnodeRef& o,
15035 uint64_t expected_object_size,
15036 uint64_t expected_write_size,
15037 uint32_t flags)
15038 {
15039 dout(15) << __func__ << " " << c->cid << " " << o->oid
15040 << " object_size " << expected_object_size
15041 << " write_size " << expected_write_size
15042 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15043 << dendl;
15044 int r = 0;
15045 o->onode.expected_object_size = expected_object_size;
15046 o->onode.expected_write_size = expected_write_size;
15047 o->onode.alloc_hint_flags = flags;
15048 txc->write_onode(o);
15049 dout(10) << __func__ << " " << c->cid << " " << o->oid
15050 << " object_size " << expected_object_size
15051 << " write_size " << expected_write_size
15052 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15053 << " = " << r << dendl;
15054 return r;
15055 }
15056
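// Clone one object's contents to another in the same collection.  The two
// oids must hash to the same value (same PG).  Data is cloned either via
// extent-map dup (copy-on-write, when bluestore_clone_cow is set) or by a
// plain read/write of the whole object; attrs are copied, and any omap data
// is rewritten under the destination onode's omap prefix.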
15057 int BlueStore::_clone(TransContext *txc,
15058 CollectionRef& c,
15059 OnodeRef& oldo,
15060 OnodeRef& newo)
15061 {
15062 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15063 << newo->oid << dendl;
15064 int r = 0;
15065 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
15066 derr << __func__ << " mismatched hash on " << oldo->oid
15067 << " and " << newo->oid << dendl;
15068 return -EINVAL;
15069 }
15070
15071 _assign_nid(txc, newo);
15072
15073 // clone data
15074 oldo->flush();
15075 _do_truncate(txc, c, newo, 0);
15076 if (cct->_conf->bluestore_clone_cow) {
15077 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
15078 } else {
15079 bufferlist bl;
15080 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
15081 if (r < 0)
15082 goto out;
15083 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
15084 if (r < 0)
15085 goto out;
15086 }
15087
15088 // clone attrs
15089 newo->onode.attrs = oldo->onode.attrs;
15090
15091 // clone omap
15092 if (newo->onode.has_omap()) {
15093 dout(20) << __func__ << " clearing old omap data" << dendl;
15094 newo->flush();
15095 _do_omap_clear(txc, newo);
15096 newo->onode.clear_omap_flag();
15097 }
15098 if (oldo->onode.has_omap()) {
15099 dout(20) << __func__ << " copying omap data" << dendl;
15100 if (newo->oid.is_pgmeta()) {
15101 newo->onode.set_omap_flags_pgmeta();
15102 } else {
15103 newo->onode.set_omap_flags();
15104 }
15105 const string& prefix = newo->get_omap_prefix();
15106 KeyValueDB::Iterator it = db->get_iterator(prefix);
15107 string head, tail;
15108 oldo->get_omap_header(&head);
15109 oldo->get_omap_tail(&tail);
15110 it->lower_bound(head);
15111 while (it->valid()) {
15112 if (it->key() >= tail) {
15113 dout(30) << __func__ << " reached tail" << dendl;
15114 break;
15115 } else {
15116 dout(30) << __func__ << " got header/data "
15117 << pretty_binary_string(it->key()) << dendl;
15118 string key;
15119 newo->rewrite_omap_key(it->key(), &key);
15120 txc->t->set(prefix, key, it->value());
15121 }
15122 it->next();
15123 }
15124 string new_tail;
15125 bufferlist new_tail_value;
15126 newo->get_omap_tail(&new_tail);
15127 txc->t->set(prefix, new_tail, new_tail_value);
15128 }
15129
15130 txc->write_onode(newo);
15131 r = 0;
15132
15133 out:
15134 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15135 << newo->oid << " = " << r << dendl;
15136 return r;
15137 }
15138
15139 int BlueStore::_do_clone_range(
15140 TransContext *txc,
15141 CollectionRef& c,
15142 OnodeRef& oldo,
15143 OnodeRef& newo,
15144 uint64_t srcoff,
15145 uint64_t length,
15146 uint64_t dstoff)
15147 {
15148 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15149 << newo->oid
15150 << " 0x" << std::hex << srcoff << "~" << length << " -> "
15151 << " 0x" << dstoff << "~" << length << std::dec << dendl;
15152 oldo->extent_map.fault_range(db, srcoff, length);
15153 newo->extent_map.fault_range(db, dstoff, length);
15154 _dump_onode<30>(cct, *oldo);
15155 _dump_onode<30>(cct, *newo);
15156
15157 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
15158 _dump_onode<30>(cct, *oldo);
15159 _dump_onode<30>(cct, *newo);
15160 return 0;
15161 }
15162
15163 int BlueStore::_clone_range(TransContext *txc,
15164 CollectionRef& c,
15165 OnodeRef& oldo,
15166 OnodeRef& newo,
15167 uint64_t srcoff, uint64_t length, uint64_t dstoff)
15168 {
15169 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15170 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15171 << " to offset 0x" << dstoff << std::dec << dendl;
15172 int r = 0;
15173
15174 if (srcoff + length >= OBJECT_MAX_SIZE ||
15175 dstoff + length >= OBJECT_MAX_SIZE) {
15176 r = -E2BIG;
15177 goto out;
15178 }
15179 if (srcoff + length > oldo->onode.size) {
15180 r = -EINVAL;
15181 goto out;
15182 }
15183
15184 _assign_nid(txc, newo);
15185
15186 if (length > 0) {
15187 if (cct->_conf->bluestore_clone_cow) {
15188 _do_zero(txc, c, newo, dstoff, length);
15189 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
15190 } else {
15191 bufferlist bl;
15192 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
15193 if (r < 0)
15194 goto out;
15195 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
15196 if (r < 0)
15197 goto out;
15198 }
15199 }
15200
15201 txc->write_onode(newo);
15202 r = 0;
15203
15204 out:
15205 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15206 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15207 << " to offset 0x" << dstoff << std::dec
15208 << " = " << r << dendl;
15209 return r;
15210 }
15211
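// Rename an object within a collection: remove the old onode key and its
// extent shard keys, mark the shards dirty so they are rewritten under the
// new key, and move the in-memory Onode to the new name via
// onode_map.rename().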
15212 int BlueStore::_rename(TransContext *txc,
15213 CollectionRef& c,
15214 OnodeRef& oldo,
15215 OnodeRef& newo,
15216 const ghobject_t& new_oid)
15217 {
15218 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15219 << new_oid << dendl;
15220 int r;
15221 ghobject_t old_oid = oldo->oid;
15222 mempool::bluestore_cache_meta::string new_okey;
15223
15224 if (newo) {
15225 if (newo->exists) {
15226 r = -EEXIST;
15227 goto out;
15228 }
15229 ceph_assert(txc->onodes.count(newo) == 0);
15230 }
15231
15232 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
15233
15234 // rewrite shards
15235 {
15236 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
15237 get_object_key(cct, new_oid, &new_okey);
15238 string key;
15239 for (auto &s : oldo->extent_map.shards) {
15240 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
15241 [&](const string& final_key) {
15242 txc->t->rmkey(PREFIX_OBJ, final_key);
15243 }
15244 );
15245 s.dirty = true;
15246 }
15247 }
15248
15249 newo = oldo;
15250 txc->write_onode(newo);
15251
15252 // this adjusts oldo->{oid,key}, and resets oldo to a fresh, empty
15253 // Onode in the old slot
15254 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
15255 r = 0;
15256
15257 // hold a ref to new Onode in old name position, to ensure we don't drop
15258 // it from the cache before this txc commits (or else someone may come along
15259 // and read newo's metadata via the old name).
15260 txc->note_modified_object(oldo);
15261
15262 out:
15263 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
15264 << new_oid << " = " << r << dendl;
15265 return r;
15266 }
15267
15268 // collections
15269
15270 int BlueStore::_create_collection(
15271 TransContext *txc,
15272 const coll_t &cid,
15273 unsigned bits,
15274 CollectionRef *c)
15275 {
15276 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
15277 int r;
15278 bufferlist bl;
15279
15280 {
15281 std::unique_lock l(coll_lock);
15282 if (*c) {
15283 r = -EEXIST;
15284 goto out;
15285 }
15286 auto p = new_coll_map.find(cid);
15287 ceph_assert(p != new_coll_map.end());
15288 *c = p->second;
15289 (*c)->cnode.bits = bits;
15290 coll_map[cid] = *c;
15291 new_coll_map.erase(p);
15292 }
15293 encode((*c)->cnode, bl);
15294 txc->t->set(PREFIX_COLL, stringify(cid), bl);
15295 r = 0;
15296
15297 out:
15298 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
15299 return r;
15300 }
15301
15302 int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
15303 CollectionRef *c)
15304 {
15305 dout(15) << __func__ << " " << cid << dendl;
15306 int r;
15307
15308 (*c)->flush_all_but_last();
15309 {
15310 std::unique_lock l(coll_lock);
15311 if (!*c) {
15312 r = -ENOENT;
15313 goto out;
15314 }
15315 size_t nonexistent_count = 0;
15316 ceph_assert((*c)->exists);
15317 if ((*c)->onode_map.map_any([&](Onode* o) {
15318 if (o->exists) {
15319 dout(1) << __func__ << " " << o->oid << " " << o
15320 << " exists in onode_map" << dendl;
15321 return true;
15322 }
15323 ++nonexistent_count;
15324 return false;
15325 })) {
15326 r = -ENOTEMPTY;
15327 goto out;
15328 }
15329
15330 vector<ghobject_t> ls;
15331 ghobject_t next;
15332 // Enumerate onodes in the db, up to nonexistent_count + 1,
15333 // then check whether all of them are marked as non-existent.
15334 // Bypass the check if (next != ghobject_t::get_max()).
15335 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
15336 nonexistent_count + 1, false, &ls, &next);
15337 if (r >= 0) {
15338 // If true, the collection has more objects than nonexistent_count,
15339 // so bypass the check.
15340 bool exists = (!next.is_max());
15341 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
15342 dout(10) << __func__ << " oid " << *it << dendl;
15343 auto onode = (*c)->onode_map.lookup(*it);
15344 exists = !onode || onode->exists;
15345 if (exists) {
15346 dout(1) << __func__ << " " << *it
15347 << " exists in db, "
15348 << (!onode ? "not present in ram" : "present in ram")
15349 << dendl;
15350 }
15351 }
15352 if (!exists) {
15353 _do_remove_collection(txc, c);
15354 r = 0;
15355 } else {
15356 dout(10) << __func__ << " " << cid
15357 << " is non-empty" << dendl;
15358 r = -ENOTEMPTY;
15359 }
15360 }
15361 }
15362
15363 out:
15364 dout(10) << __func__ << " " << cid << " = " << r << dendl;
15365 return r;
15366 }
15367
15368 void BlueStore::_do_remove_collection(TransContext *txc,
15369 CollectionRef *c)
15370 {
15371 coll_map.erase((*c)->cid);
15372 txc->removed_collections.push_back(*c);
15373 (*c)->exists = false;
15374 _osr_register_zombie((*c)->osr.get());
15375 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
15376 c->reset();
15377 }
15378
15379 int BlueStore::_split_collection(TransContext *txc,
15380 CollectionRef& c,
15381 CollectionRef& d,
15382 unsigned bits, int rem)
15383 {
15384 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
15385 << " bits " << bits << dendl;
15386 std::unique_lock l(c->lock);
15387 std::unique_lock l2(d->lock);
15388 int r;
15389
15390 // flush all previous deferred writes on this sequencer. this is a bit
15391 // heavyweight, but we need to make sure all deferred writes complete
15392 // before we split as the new collection's sequencer may need to order
15393 // this after those writes, and we don't bother with the complexity of
15394 // moving those TransContexts over to the new osr.
15395 _osr_drain_preceding(txc);
15396
15397 // move any cached items (onodes and referenced shared blobs) that will
15398 // belong to the child collection post-split. leave everything else behind.
15399 // this may include things that don't strictly belong to the now-smaller
15400 // parent split, but the OSD will always send us a split for every new
15401 // child.
15402
15403 spg_t pgid, dest_pgid;
15404 bool is_pg = c->cid.is_pg(&pgid);
15405 ceph_assert(is_pg);
15406 is_pg = d->cid.is_pg(&dest_pgid);
15407 ceph_assert(is_pg);
15408
15409 // the destination should initially be empty.
15410 ceph_assert(d->onode_map.empty());
15411 ceph_assert(d->shared_blob_set.empty());
15412 ceph_assert(d->cnode.bits == bits);
15413
15414 c->split_cache(d.get());
15415
15416 // adjust bits. note that this will be redundant for all but the first
15417 // split call for this parent (first child).
15418 c->cnode.bits = bits;
15419 ceph_assert(d->cnode.bits == bits);
15420 r = 0;
15421
15422 bufferlist bl;
15423 encode(c->cnode, bl);
15424 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
15425
15426 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
15427 << " bits " << bits << " = " << r << dendl;
15428 return r;
15429 }
15430
15431 int BlueStore::_merge_collection(
15432 TransContext *txc,
15433 CollectionRef *c,
15434 CollectionRef& d,
15435 unsigned bits)
15436 {
15437 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
15438 << " bits " << bits << dendl;
15439 std::unique_lock l((*c)->lock);
15440 std::unique_lock l2(d->lock);
15441 int r;
15442
15443 coll_t cid = (*c)->cid;
15444
15445 // flush all previous deferred writes on the source collection to ensure
15446 // that all deferred writes complete before we merge as the target collection's
15447 // sequencer may need to order new ops after those writes.
15448
15449 _osr_drain((*c)->osr.get());
15450
15451 // move any cached items (onodes and referenced shared blobs) that will
15452 // belong to the target collection post-merge over to it, and leave
15453 // everything else behind. the source collection itself is removed
15454 // further below, so after this step it should hold nothing that the
15455 // target still needs.
15456
15457 spg_t pgid, dest_pgid;
15458 bool is_pg = cid.is_pg(&pgid);
15459 ceph_assert(is_pg);
15460 is_pg = d->cid.is_pg(&dest_pgid);
15461 ceph_assert(is_pg);
15462
15463 // adjust bits. note that this will be redundant for all but the first
15464 // merge call for the parent/target.
15465 d->cnode.bits = bits;
15466
15467 // behavior depends on the target's (d) bits, so do this after they are updated.
15468 (*c)->split_cache(d.get());
15469
15470 // remove source collection
15471 {
15472 std::unique_lock l3(coll_lock);
15473 _do_remove_collection(txc, c);
15474 }
15475
15476 r = 0;
15477
15478 bufferlist bl;
15479 encode(d->cnode, bl);
15480 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
15481
15482 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
15483 << " bits " << bits << " = " << r << dendl;
15484 return r;
15485 }
15486
15487 void BlueStore::log_latency(
15488 const char* name,
15489 int idx,
15490 const ceph::timespan& l,
15491 double lat_threshold,
15492 const char* info) const
15493 {
15494 logger->tinc(idx, l);
15495 if (lat_threshold > 0.0 &&
15496 l >= make_timespan(lat_threshold)) {
15497 dout(0) << __func__ << " slow operation observed for " << name
15498 << ", latency = " << l
15499 << info
15500 << dendl;
15501 }
15502 }
15503
15504 void BlueStore::log_latency_fn(
15505 const char* name,
15506 int idx,
15507 const ceph::timespan& l,
15508 double lat_threshold,
15509 std::function<string (const ceph::timespan& lat)> fn) const
15510 {
15511 logger->tinc(idx, l);
15512 if (lat_threshold > 0.0 &&
15513 l >= make_timespan(lat_threshold)) {
15514 dout(0) << __func__ << " slow operation observed for " << name
15515 << ", latency = " << l
15516 << fn(l)
15517 << dendl;
15518 }
15519 }
15520
15521 #if defined(WITH_LTTNG)
15522 void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
15523 KeyValueDB &db,
15524 TransContext &txc,
15525 mono_clock::time_point start_throttle_acquire)
15526 {
15527 pending_kv_ios += txc.ios;
15528 if (txc.deferred_txn) {
15529 pending_deferred_ios += txc.ios;
15530 }
15531
15532 uint64_t started = 0;
15533 uint64_t completed = 0;
15534 if (should_trace(&started, &completed)) {
15535 txc.tracing = true;
15536 uint64_t rocksdb_base_level,
15537 rocksdb_estimate_pending_compaction_bytes,
15538 rocksdb_cur_size_all_mem_tables,
15539 rocksdb_compaction_pending,
15540 rocksdb_mem_table_flush_pending,
15541 rocksdb_num_running_compactions,
15542 rocksdb_num_running_flushes,
15543 rocksdb_actual_delayed_write_rate;
15544 db.get_property(
15545 "rocksdb.base-level",
15546 &rocksdb_base_level);
15547 db.get_property(
15548 "rocksdb.estimate-pending-compaction-bytes",
15549 &rocksdb_estimate_pending_compaction_bytes);
15550 db.get_property(
15551 "rocksdb.cur-size-all-mem-tables",
15552 &rocksdb_cur_size_all_mem_tables);
15553 db.get_property(
15554 "rocksdb.compaction-pending",
15555 &rocksdb_compaction_pending);
15556 db.get_property(
15557 "rocksdb.mem-table-flush-pending",
15558 &rocksdb_mem_table_flush_pending);
15559 db.get_property(
15560 "rocksdb.num-running-compactions",
15561 &rocksdb_num_running_compactions);
15562 db.get_property(
15563 "rocksdb.num-running-flushes",
15564 &rocksdb_num_running_flushes);
15565 db.get_property(
15566 "rocksdb.actual-delayed-write-rate",
15567 &rocksdb_actual_delayed_write_rate);
15568
15569
15570 tracepoint(
15571 bluestore,
15572 transaction_initial_state,
15573 txc.osr->get_sequencer_id(),
15574 txc.seq,
15575 throttle_bytes.get_current(),
15576 throttle_deferred_bytes.get_current(),
15577 pending_kv_ios,
15578 pending_deferred_ios,
15579 started,
15580 completed,
15581 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
15582
15583 tracepoint(
15584 bluestore,
15585 transaction_initial_state_rocksdb,
15586 txc.osr->get_sequencer_id(),
15587 txc.seq,
15588 rocksdb_base_level,
15589 rocksdb_estimate_pending_compaction_bytes,
15590 rocksdb_cur_size_all_mem_tables,
15591 rocksdb_compaction_pending,
15592 rocksdb_mem_table_flush_pending,
15593 rocksdb_num_running_compactions,
15594 rocksdb_num_running_flushes,
15595 rocksdb_actual_delayed_write_rate);
15596 }
15597 }
15598 #endif
15599
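// Record the time spent in the transaction's previous state into the given
// perf counter (and, when tracing, emit a per-state tracepoint), then
// advance last_stamp and return the measured latency.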
15600 mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
15601 TransContext &txc, PerfCounters *logger, int state)
15602 {
15603 mono_clock::time_point now = mono_clock::now();
15604 mono_clock::duration lat = now - txc.last_stamp;
15605 logger->tinc(state, lat);
15606 #if defined(WITH_LTTNG)
15607 if (txc.tracing &&
15608 state >= l_bluestore_state_prepare_lat &&
15609 state <= l_bluestore_state_done_lat) {
15610 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
15611 tracepoint(
15612 bluestore,
15613 transaction_state_duration,
15614 txc.osr->get_sequencer_id(),
15615 txc.seq,
15616 state,
15617 ceph::to_seconds<double>(lat));
15618 }
15619 #endif
15620 txc.last_stamp = now;
15621 return lat;
15622 }
15623
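// Charge the main byte throttle for this transaction (possibly blocking).
// For deferred transactions, additionally try to take the deferred throttle
// without blocking; return false if that fails, in which case the caller is
// expected to complete the charge later via finish_start_transaction().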
15624 bool BlueStore::BlueStoreThrottle::try_start_transaction(
15625 KeyValueDB &db,
15626 TransContext &txc,
15627 mono_clock::time_point start_throttle_acquire)
15628 {
15629 throttle_bytes.get(txc.cost);
15630
15631 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
15632 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15633 return true;
15634 } else {
15635 return false;
15636 }
15637 }
15638
15639 void BlueStore::BlueStoreThrottle::finish_start_transaction(
15640 KeyValueDB &db,
15641 TransContext &txc,
15642 mono_clock::time_point start_throttle_acquire)
15643 {
15644 ceph_assert(txc.deferred_txn);
15645 throttle_deferred_bytes.get(txc.cost);
15646 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15647 }
15648
15649 #if defined(WITH_LTTNG)
15650 void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
15651 {
15652 pending_kv_ios -= 1;
15653 ios_completed_since_last_traced++;
15654 if (txc.tracing) {
15655 tracepoint(
15656 bluestore,
15657 transaction_commit_latency,
15658 txc.osr->get_sequencer_id(),
15659 txc.seq,
15660 ceph::to_seconds<double>(mono_clock::now() - txc.start));
15661 }
15662 }
15663 #endif
15664
15665 #if defined(WITH_LTTNG)
15666 void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
15667 {
15668 if (txc.deferred_txn) {
15669 pending_deferred_ios -= 1;
15670 }
15671 if (txc.tracing) {
15672 mono_clock::time_point now = mono_clock::now();
15673 mono_clock::duration lat = now - txc.start;
15674 tracepoint(
15675 bluestore,
15676 transaction_total_duration,
15677 txc.osr->get_sequencer_id(),
15678 txc.seq,
15679 ceph::to_seconds<double>(lat));
15680 }
15681 }
15682 #endif
15683
15684 // DB key value Histogram
15685 #define KEY_SLAB 32
15686 #define VALUE_SLAB 64
15687
15688 const string prefix_onode = "o";
15689 const string prefix_onode_shard = "x";
15690 const string prefix_other = "Z";
15691
15692 int BlueStore::DBHistogram::get_key_slab(size_t sz)
15693 {
15694 return (sz/KEY_SLAB);
15695 }
15696
15697 string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
15698 {
15699 int lower_bound = slab * KEY_SLAB;
15700 int upper_bound = (slab + 1) * KEY_SLAB;
15701 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15702 return ret;
15703 }
15704
15705 int BlueStore::DBHistogram::get_value_slab(size_t sz)
15706 {
15707 return (sz/VALUE_SLAB);
15708 }
15709
15710 string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
15711 {
15712 int lower_bound = slab * VALUE_SLAB;
15713 int upper_bound = (slab + 1) * VALUE_SLAB;
15714 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15715 return ret;
15716 }
15717
15718 void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
15719 const string &prefix, size_t key_size, size_t value_size)
15720 {
15721 uint32_t key_slab = get_key_slab(key_size);
15722 uint32_t value_slab = get_value_slab(value_size);
15723 key_hist[prefix][key_slab].count++;
15724 key_hist[prefix][key_slab].max_len =
15725 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
15726 key_hist[prefix][key_slab].val_map[value_slab].count++;
15727 key_hist[prefix][key_slab].val_map[value_slab].max_len =
15728 std::max<size_t>(value_size,
15729 key_hist[prefix][key_slab].val_map[value_slab].max_len);
15730 }
15731
15732 void BlueStore::DBHistogram::dump(Formatter *f)
15733 {
15734 f->open_object_section("rocksdb_value_distribution");
15735 for (auto i : value_hist) {
15736 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
15737 }
15738 f->close_section();
15739
15740 f->open_object_section("rocksdb_key_value_histogram");
15741 for (auto i : key_hist) {
15742 f->dump_string("prefix", i.first);
15743 f->open_object_section("key_hist");
15744 for ( auto k : i.second) {
15745 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
15746 f->dump_unsigned("max_len", k.second.max_len);
15747 f->open_object_section("value_hist");
15748 for ( auto j : k.second.val_map) {
15749 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
15750 f->dump_unsigned("max_len", j.second.max_len);
15751 }
15752 f->close_section();
15753 }
15754 f->close_section();
15755 }
15756 f->close_section();
15757 }
15758
15759 // Iterates through the db and collects the stats
15760 void BlueStore::generate_db_histogram(Formatter *f)
15761 {
15762 //globals
15763 uint64_t num_onodes = 0;
15764 uint64_t num_shards = 0;
15765 uint64_t num_super = 0;
15766 uint64_t num_coll = 0;
15767 uint64_t num_omap = 0;
15768 uint64_t num_pgmeta_omap = 0;
15769 uint64_t num_deferred = 0;
15770 uint64_t num_alloc = 0;
15771 uint64_t num_stat = 0;
15772 uint64_t num_others = 0;
15773 uint64_t num_shared_shards = 0;
15774 size_t max_key_size =0, max_value_size = 0;
15775 uint64_t total_key_size = 0, total_value_size = 0;
15776 size_t key_size = 0, value_size = 0;
15777 DBHistogram hist;
15778
15779 auto start = coarse_mono_clock::now();
15780
15781 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
15782 iter->seek_to_first();
15783 while (iter->valid()) {
15784 dout(30) << __func__ << " Key: " << iter->key() << dendl;
15785 key_size = iter->key_size();
15786 value_size = iter->value_size();
15787 hist.value_hist[hist.get_value_slab(value_size)]++;
15788 max_key_size = std::max(max_key_size, key_size);
15789 max_value_size = std::max(max_value_size, value_size);
15790 total_key_size += key_size;
15791 total_value_size += value_size;
15792
15793 pair<string,string> key(iter->raw_key());
15794
15795 if (key.first == PREFIX_SUPER) {
15796 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
15797 num_super++;
15798 } else if (key.first == PREFIX_STAT) {
15799 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
15800 num_stat++;
15801 } else if (key.first == PREFIX_COLL) {
15802 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
15803 num_coll++;
15804 } else if (key.first == PREFIX_OBJ) {
15805 if (key.second.back() == ONODE_KEY_SUFFIX) {
15806 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
15807 num_onodes++;
15808 } else {
15809 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
15810 num_shards++;
15811 }
15812 } else if (key.first == PREFIX_OMAP) {
15813 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
15814 num_omap++;
15815 } else if (key.first == PREFIX_PGMETA_OMAP) {
15816 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
15817 num_pgmeta_omap++;
15818 } else if (key.first == PREFIX_DEFERRED) {
15819 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
15820 num_deferred++;
15821 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
15822 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
15823 num_alloc++;
15824 } else if (key.first == PREFIX_SHARED_BLOB) {
15825 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
15826 num_shared_shards++;
15827 } else {
15828 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
15829 num_others++;
15830 }
15831 iter->next();
15832 }
15833
15834 ceph::timespan duration = coarse_mono_clock::now() - start;
15835 f->open_object_section("rocksdb_key_value_stats");
15836 f->dump_unsigned("num_onodes", num_onodes);
15837 f->dump_unsigned("num_shards", num_shards);
15838 f->dump_unsigned("num_super", num_super);
15839 f->dump_unsigned("num_coll", num_coll);
15840 f->dump_unsigned("num_omap", num_omap);
15841 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
15842 f->dump_unsigned("num_deferred", num_deferred);
15843 f->dump_unsigned("num_alloc", num_alloc);
15844 f->dump_unsigned("num_stat", num_stat);
15845 f->dump_unsigned("num_shared_shards", num_shared_shards);
15846 f->dump_unsigned("num_others", num_others);
15847 f->dump_unsigned("max_key_size", max_key_size);
15848 f->dump_unsigned("max_value_size", max_value_size);
15849 f->dump_unsigned("total_key_size", total_key_size);
15850 f->dump_unsigned("total_value_size", total_value_size);
15851 f->close_section();
15852
15853 hist.dump(f);
15854
15855 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
15856
15857 }
15858
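// Flush the buffer cache shards (which must end up empty), drop every
// collection's cached onodes, warn about any stray shared blobs, and verify
// the onode cache shards are empty before shutdown.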
15859 void BlueStore::_shutdown_cache()
15860 {
15861 dout(10) << __func__ << dendl;
15862 for (auto i : buffer_cache_shards) {
15863 i->flush();
15864 ceph_assert(i->empty());
15865 }
15866 for (auto& p : coll_map) {
15867 p.second->onode_map.clear();
15868 if (!p.second->shared_blob_set.empty()) {
15869 derr << __func__ << " stray shared blobs on " << p.first << dendl;
15870 p.second->shared_blob_set.dump<0>(cct);
15871 }
15872 ceph_assert(p.second->onode_map.empty());
15873 ceph_assert(p.second->shared_blob_set.empty());
15874 }
15875 coll_map.clear();
15876 for (auto i : onode_cache_shards) {
15877 ceph_assert(i->empty());
15878 }
15879 }
15880
15881 // For external callers.
15882 // We use a best-effort policy here: we don't care if some pinned
15883 // onodes/data are still left in the cache after this command
15884 // completes.
15885 int BlueStore::flush_cache(ostream *os)
15886 {
15887 dout(10) << __func__ << dendl;
15888 for (auto i : onode_cache_shards) {
15889 i->flush();
15890 }
15891 for (auto i : buffer_cache_shards) {
15892 i->flush();
15893 }
15894
15895 return 0;
15896 }
15897
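// Zero-fill the requested head/tail padding around a write buffer and
// account the padded bytes in the write-pad perf counter.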
15898 void BlueStore::_apply_padding(uint64_t head_pad,
15899 uint64_t tail_pad,
15900 bufferlist& padded)
15901 {
15902 if (head_pad) {
15903 padded.prepend_zero(head_pad);
15904 }
15905 if (tail_pad) {
15906 padded.append_zero(tail_pad);
15907 }
15908 if (head_pad || tail_pad) {
15909 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
15910 << " tail 0x" << tail_pad << std::dec << dendl;
15911 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
15912 }
15913 }
15914
15915 void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
15916 {
15917 // finalize extent_map shards
15918 o->extent_map.update(txn, false);
15919 if (o->extent_map.needs_reshard()) {
15920 o->extent_map.reshard(db, txn);
15921 o->extent_map.update(txn, true);
15922 if (o->extent_map.needs_reshard()) {
15923 dout(20) << __func__ << " warning: still wants reshard, check options?"
15924 << dendl;
15925 o->extent_map.clear_needs_reshard();
15926 }
15927 logger->inc(l_bluestore_onode_reshard);
15928 }
15929
15930 // bound encode
15931 size_t bound = 0;
15932 denc(o->onode, bound);
15933 o->extent_map.bound_encode_spanning_blobs(bound);
15934 if (o->onode.extent_map_shards.empty()) {
15935 denc(o->extent_map.inline_bl, bound);
15936 }
15937
15938 // encode
15939 bufferlist bl;
15940 unsigned onode_part, blob_part, extent_part;
15941 {
15942 auto p = bl.get_contiguous_appender(bound, true);
15943 denc(o->onode, p);
15944 onode_part = p.get_logical_offset();
15945 o->extent_map.encode_spanning_blobs(p);
15946 blob_part = p.get_logical_offset() - onode_part;
15947 if (o->onode.extent_map_shards.empty()) {
15948 denc(o->extent_map.inline_bl, p);
15949 }
15950 extent_part = p.get_logical_offset() - onode_part - blob_part;
15951 }
15952
15953 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
15954 << " (" << onode_part << " bytes onode + "
15955 << blob_part << " bytes spanning blobs + "
15956 << extent_part << " bytes inline extents)"
15957 << dendl;
15958
15959
15960 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
15961 }
15962
15963 void BlueStore::_log_alerts(osd_alert_list_t& alerts)
15964 {
15965 std::lock_guard l(qlock);
15966
15967 if (!disk_size_mismatch_alert.empty()) {
15968 alerts.emplace(
15969 "BLUESTORE_DISK_SIZE_MISMATCH",
15970 disk_size_mismatch_alert);
15971 }
15972 if (!legacy_statfs_alert.empty()) {
15973 alerts.emplace(
15974 "BLUESTORE_LEGACY_STATFS",
15975 legacy_statfs_alert);
15976 }
15977 if (!spillover_alert.empty() &&
15978 cct->_conf->bluestore_warn_on_bluefs_spillover) {
15979 alerts.emplace(
15980 "BLUEFS_SPILLOVER",
15981 spillover_alert);
15982 }
15983 if (!no_per_pool_omap_alert.empty()) {
15984 alerts.emplace(
15985 "BLUESTORE_NO_PER_POOL_OMAP",
15986 no_per_pool_omap_alert);
15987 }
15988 string s0(failed_cmode);
15989
15990 if (!failed_compressors.empty()) {
15991 if (!s0.empty()) {
15992 s0 += ", ";
15993 }
15994 s0 += "unable to load:";
15995 bool first = true;
15996 for (auto& s : failed_compressors) {
15997 if (first) {
15998 first = false;
15999 } else {
16000 s0 += ", ";
16001 }
16002 s0 += s;
16003 }
16004 alerts.emplace(
16005 "BLUESTORE_NO_COMPRESSION",
16006 s0);
16007 }
16008 }
16009
16010 void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
16011 size_t extents)
16012 {
16013 alloc_stats_count++;
16014 alloc_stats_fragments += extents;
16015 alloc_stats_size += need;
16016 }
16017
16018 void BlueStore::_record_allocation_stats()
16019 {
16020 // we don't care about strict consistency here; fields may be
16021 // partially modified while the tuple is being assembled
16022 auto t0 = std::make_tuple(
16023 alloc_stats_count.exchange(0),
16024 alloc_stats_fragments.exchange(0),
16025 alloc_stats_size.exchange(0));
16026
16027 dout(0) << " allocation stats probe "
16028 << probe_count << ":"
16029 << " cnt: " << std::get<0>(t0)
16030 << " frags: " << std::get<1>(t0)
16031 << " size: " << std::get<2>(t0)
16032 << dendl;
16033
16034
16035 //
16036 // Keep the history for probes from the power-of-two sequence:
16037 // -1, -2, -4, -8, -16
16038 //
16039 size_t base = 1;
16040 for (auto& t : alloc_stats_history) {
16041 dout(0) << " probe -"
16042 << base + (probe_count % base) << ": "
16043 << std::get<0>(t)
16044 << ", " << std::get<1>(t)
16045 << ", " << std::get<2>(t)
16046 << dendl;
16047 base <<= 1;
16048 }
16049 dout(0) << "------------" << dendl;
16050
16051 auto prev = probe_count++;
16052 auto mask = (1 << alloc_stats_history.size()) - 1;
16053 probe_count &= mask;
16054
16055 for (size_t i = cbits(prev ^ probe_count) - 1; i > 0 ; --i) {
16056 alloc_stats_history[i] = alloc_stats_history[i - 1];
16057 }
16058 alloc_stats_history[0].swap(t0);
16059 }
16060
16061 // ===========================================
16062 // BlueStoreRepairer
16063
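// Keep only the per-chunk bloom filters (granularity-sized chunks) that are
// non-empty and overlap the given extents; everything else is discarded.
// Returns the number of chunks retained.  May only be called once.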
16064 size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
16065 const interval_set<uint64_t>& extents)
16066 {
16067 ceph_assert(granularity); // initialized
16068 // can't call for the second time
16069 ceph_assert(!was_filtered_out);
16070 ceph_assert(collections_bfs.size() == objects_bfs.size());
16071
16072 uint64_t prev_pos = 0;
16073 uint64_t npos = collections_bfs.size();
16074
16075 bloom_vector collections_reduced;
16076 bloom_vector objects_reduced;
16077
16078 for (auto e : extents) {
16079 if (e.second == 0) {
16080 continue;
16081 }
16082 uint64_t pos = max(e.first / granularity, prev_pos);
16083 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
16084 while (pos != npos && pos < end_pos) {
16085 ceph_assert( collections_bfs[pos].element_count() ==
16086 objects_bfs[pos].element_count());
16087 if (collections_bfs[pos].element_count()) {
16088 collections_reduced.push_back(std::move(collections_bfs[pos]));
16089 objects_reduced.push_back(std::move(objects_bfs[pos]));
16090 }
16091 ++pos;
16092 }
16093 prev_pos = end_pos;
16094 }
16095 collections_reduced.swap(collections_bfs);
16096 objects_reduced.swap(objects_bfs);
16097 was_filtered_out = true;
16098 return collections_bfs.size();
16099 }
16100
16101 bool BlueStoreRepairer::remove_key(KeyValueDB *db,
16102 const string& prefix,
16103 const string& key)
16104 {
16105 if (!remove_key_txn) {
16106 remove_key_txn = db->get_transaction();
16107 }
16108 ++to_repair_cnt;
16109 remove_key_txn->rmkey(prefix, key);
16110
16111 return true;
16112 }
16113
16114 void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db)
16115 {
16116 fix_per_pool_omap_txn = db->get_transaction();
16117 ++to_repair_cnt;
16118 bufferlist bl;
16119 bl.append("1");
16120 fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
16121 }
16122
16123 bool BlueStoreRepairer::fix_shared_blob(
16124 KeyValueDB *db,
16125 uint64_t sbid,
16126 const bufferlist* bl)
16127 {
16128 KeyValueDB::Transaction txn;
16129 if (fix_misreferences_txn) { // reuse this txn
16130 txn = fix_misreferences_txn;
16131 } else {
16132 if (!fix_shared_blob_txn) {
16133 fix_shared_blob_txn = db->get_transaction();
16134 }
16135 txn = fix_shared_blob_txn;
16136 }
16137 string key;
16138 get_shared_blob_key(sbid, &key);
16139
16140 ++to_repair_cnt;
16141 if (bl) {
16142 txn->set(PREFIX_SHARED_BLOB, key, *bl);
16143 } else {
16144 txn->rmkey(PREFIX_SHARED_BLOB, key);
16145 }
16146 return true;
16147 }
16148
16149 bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
16150 const string& key,
16151 const store_statfs_t& new_statfs)
16152 {
16153 if (!fix_statfs_txn) {
16154 fix_statfs_txn = db->get_transaction();
16155 }
16156 BlueStore::volatile_statfs vstatfs;
16157 vstatfs = new_statfs;
16158 bufferlist bl;
16159 vstatfs.encode(bl);
16160 ++to_repair_cnt;
16161 fix_statfs_txn->set(PREFIX_STAT, key, bl);
16162 return true;
16163 }
16164
16165 bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
16166 FreelistManager* fm,
16167 uint64_t offset, uint64_t len)
16168 {
16169 if (!fix_fm_leaked_txn) {
16170 fix_fm_leaked_txn = db->get_transaction();
16171 }
16172 ++to_repair_cnt;
16173 fm->release(offset, len, fix_fm_leaked_txn);
16174 return true;
16175 }
16176 bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
16177 FreelistManager* fm,
16178 uint64_t offset, uint64_t len)
16179 {
16180 if (!fix_fm_false_free_txn) {
16181 fix_fm_false_free_txn = db->get_transaction();
16182 }
16183 ++to_repair_cnt;
16184 fm->allocate(offset, len, fix_fm_false_free_txn);
16185 return true;
16186 }
16187
16188 bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
16189 {
16190 // this is just a stub to count the number of repairs properly;
16191 // the actual repair happens in BlueStore::_close_db_and_around()
16192 // while doing _sync_bluefs_and_fm
16193 ++out_of_sync_flag;
16194 ++to_repair_cnt;
16195 return true;
16196 }
16197
16198 KeyValueDB::Transaction BlueStoreRepairer::fix_spanning_blobs(KeyValueDB* db)
16199 {
16200 if (!fix_onode_txn) {
16201 fix_onode_txn = db->get_transaction();
16202 }
16203 ++to_repair_cnt;
16204 return fix_onode_txn;
16205 }
16206
16207 bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
16208 {
16209 if (misreferenced_extents.size()) {
16210 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
16211 ceph_assert(n > 0);
16212 if (!fix_misreferences_txn) {
16213 fix_misreferences_txn = db->get_transaction();
16214 }
16215 return true;
16216 }
16217 return false;
16218 }
16219
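// Submit every pending repair transaction synchronously, in a fixed order,
// clear them, and return the number of repairs that had been queued
// (resetting the counter).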
16220 unsigned BlueStoreRepairer::apply(KeyValueDB* db)
16221 {
16222 if (fix_per_pool_omap_txn) {
16223 db->submit_transaction_sync(fix_per_pool_omap_txn);
16224 fix_per_pool_omap_txn = nullptr;
16225 }
16226 if (fix_fm_leaked_txn) {
16227 db->submit_transaction_sync(fix_fm_leaked_txn);
16228 fix_fm_leaked_txn = nullptr;
16229 }
16230 if (fix_fm_false_free_txn) {
16231 db->submit_transaction_sync(fix_fm_false_free_txn);
16232 fix_fm_false_free_txn = nullptr;
16233 }
16234 if (remove_key_txn) {
16235 db->submit_transaction_sync(remove_key_txn);
16236 remove_key_txn = nullptr;
16237 }
16238 if (fix_misreferences_txn) {
16239 db->submit_transaction_sync(fix_misreferences_txn);
16240 fix_misreferences_txn = nullptr;
16241 }
16242 if (fix_onode_txn) {
16243 db->submit_transaction_sync(fix_onode_txn);
16244 fix_onode_txn = nullptr;
16245 }
16246 if (fix_shared_blob_txn) {
16247 db->submit_transaction_sync(fix_shared_blob_txn);
16248 fix_shared_blob_txn = nullptr;
16249 }
16250
16251 if (fix_statfs_txn) {
16252 db->submit_transaction_sync(fix_statfs_txn);
16253 fix_statfs_txn = nullptr;
16254 }
16255 unsigned repaired = to_repair_cnt;
16256 to_repair_cnt = 0;
16257 return repaired;
16258 }
16259
16260 // =======================================================
16261 // RocksDBBlueFSVolumeSelector
16262
16263 uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
16264 ceph_assert(h != nullptr);
16265 uint64_t hint = reinterpret_cast<uint64_t>(h);
16266 uint8_t res;
16267 switch (hint) {
16268 case LEVEL_SLOW:
16269 res = BlueFS::BDEV_SLOW;
16270 if (db_avail4slow > 0) {
16271 // considering statically available db space vs.
16272 // - observed maximums on DB dev for DB/WAL/UNSORTED data
16273 // - observed maximum spillovers
16274 uint64_t max_db_use = 0; // max db usage we potentially observed
16275 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
16276 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
16277 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
16278 // this could go to db hence using it in the estimation
16279 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
16280
16281 auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
16282 uint64_t avail = min(
16283 db_avail4slow,
16284 max_db_use < db_total ? db_total - max_db_use : 0);
16285
16286 // considering current DB dev usage for SLOW data
16287 if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
16288 res = BlueFS::BDEV_DB;
16289 }
16290 }
16291 break;
16292 case LEVEL_LOG:
16293 case LEVEL_WAL:
16294 res = BlueFS::BDEV_WAL;
16295 break;
16296 case LEVEL_DB:
16297 default:
16298 res = BlueFS::BDEV_DB;
16299 break;
16300 }
16301 return res;
16302 }
16303
16304 void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
16305 {
16306 res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
16307 res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
16308 }
16309
16310 void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
16311 uint8_t res = LEVEL_DB;
16312 if (dirname.length() > 5) {
16313 // the "db.slow" and "db.wal" directory names are hard-coded to
16314 // match up with bluestore. the slow device is always the second
16315 // one (when a dedicated block.db device is present and used at
16316 // bdev 0). the wal device is always last.
16317 if (boost::algorithm::ends_with(dirname, ".slow")) {
16318 res = LEVEL_SLOW;
16319 }
16320 else if (boost::algorithm::ends_with(dirname, ".wal")) {
16321 res = LEVEL_WAL;
16322 }
16323 }
16324 return reinterpret_cast<void*>(res);
16325 }
16326
16327 void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
16328 auto max_x = per_level_per_dev_usage.get_max_x();
16329 auto max_y = per_level_per_dev_usage.get_max_y();
16330 sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
16331 << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
16332 << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
16333 << ", db_avail:" << db_avail4slow << std::endl
16334 << "Usage matrix:" << std::endl;
16335 constexpr std::array<const char*, 8> names{ {
16336 "DEV/LEV",
16337 "WAL",
16338 "DB",
16339 "SLOW",
16340 "*",
16341 "*",
16342 "REAL",
16343 "FILES",
16344 } };
16345 const size_t width = 12;
16346 for (size_t i = 0; i < names.size(); ++i) {
16347 sout.setf(std::ios::left, std::ios::adjustfield);
16348 sout.width(width);
16349 sout << names[i];
16350 }
16351 sout << std::endl;
16352 for (size_t l = 0; l < max_y; l++) {
16353 sout.setf(std::ios::left, std::ios::adjustfield);
16354 sout.width(width);
16355 switch (l + LEVEL_FIRST) {
16356 case LEVEL_LOG:
16357 sout << "LOG"; break;
16358 case LEVEL_WAL:
16359 sout << "WAL"; break;
16360 case LEVEL_DB:
16361 sout << "DB"; break;
16362 case LEVEL_SLOW:
16363 sout << "SLOW"; break;
16364 case LEVEL_MAX:
16365 sout << "TOTALS"; break;
16366 }
16367 for (size_t d = 0; d < max_x; d++) {
16368 sout.setf(std::ios::left, std::ios::adjustfield);
16369 sout.width(width);
16370 sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
16371 }
16372 sout.setf(std::ios::left, std::ios::adjustfield);
16373 sout.width(width);
16374 sout << stringify(per_level_files[l]) << std::endl;
16375 }
16376 ceph_assert(max_x == per_level_per_dev_max.get_max_x());
16377 ceph_assert(max_y == per_level_per_dev_max.get_max_y());
16378 sout << "MAXIMUMS:" << std::endl;
16379 for (size_t l = 0; l < max_y; l++) {
16380 sout.setf(std::ios::left, std::ios::adjustfield);
16381 sout.width(width);
16382 switch (l + LEVEL_FIRST) {
16383 case LEVEL_LOG:
16384 sout << "LOG"; break;
16385 case LEVEL_WAL:
16386 sout << "WAL"; break;
16387 case LEVEL_DB:
16388 sout << "DB"; break;
16389 case LEVEL_SLOW:
16390 sout << "SLOW"; break;
16391 case LEVEL_MAX:
16392 sout << "TOTALS"; break;
16393 }
16394 for (size_t d = 0; d < max_x - 1; d++) {
16395 sout.setf(std::ios::left, std::ios::adjustfield);
16396 sout.width(width);
16397 sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
16398 }
16399 sout.setf(std::ios::left, std::ios::adjustfield);
16400 sout.width(width);
16401 sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
16402 if (l < max_y - 1) {
16403 sout << std::endl;
16404 }
16405 }
16406 }
16407
16408 // =======================================================