1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <unistd.h>
16 #include <stdlib.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <fcntl.h>
20
21 #include <boost/container/flat_set.hpp>
22 #include "boost/algorithm/string.hpp"
23
24 #include "include/cpp-btree/btree_set.h"
25
26 #include "bluestore_common.h"
27 #include "BlueStore.h"
28 #include "os/kv.h"
29 #include "include/compat.h"
30 #include "include/intarith.h"
31 #include "include/stringify.h"
32 #include "include/str_map.h"
33 #include "include/util.h"
34 #include "common/errno.h"
35 #include "common/safe_io.h"
36 #include "common/PriorityCache.h"
37 #include "common/RWLock.h"
38 #include "Allocator.h"
39 #include "FreelistManager.h"
40 #include "BlueFS.h"
41 #include "BlueRocksEnv.h"
42 #include "auth/Crypto.h"
43 #include "common/EventTrace.h"
44 #include "perfglue/heap_profiler.h"
45 #include "common/blkdev.h"
46 #include "common/numa.h"
47
48 #if defined(WITH_LTTNG)
49 #define TRACEPOINT_DEFINE
50 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
51 #include "tracing/bluestore.h"
52 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
53 #undef TRACEPOINT_DEFINE
54 #else
55 #define tracepoint(...)
56 #endif
57
58 #define dout_context cct
59 #define dout_subsys ceph_subsys_bluestore
60
61 using bid_t = decltype(BlueStore::Blob::id);
62
63 // bluestore_cache_onode
64 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
65 bluestore_cache_onode);
66
67 // bluestore_cache_other
68 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
69 bluestore_Buffer);
70 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
71 bluestore_Extent);
72 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
73 bluestore_Blob);
74 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
75 bluestore_SharedBlob);
76
77 // bluestore_txc
78 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
79 bluestore_txc);
80
81
82 // kv store prefixes
83 const string PREFIX_SUPER = "S"; // field -> value
84 const string PREFIX_STAT = "T"; // field -> value(int64 array)
85 const string PREFIX_COLL = "C"; // collection name -> cnode_t
86 const string PREFIX_OBJ = "O"; // object name -> onode_t
87 const string PREFIX_OMAP = "M"; // u64 + keyname -> value
88 const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
89 const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
90 const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
91 const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
92 const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
93 const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
94
95 const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
96
97 // write a label in the first block. always use this size. note that
98 // bluefs makes a matching assumption about the location of its
99 // superblock (always the second block of the device).
100 #define BDEV_LABEL_BLOCK_SIZE 4096
101
102 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
103 #define SUPER_RESERVED 8192
104
105 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
106
107
108 /*
109 * extent map blob encoding
110 *
111 * we use the low bits of the blobid field to indicate some common scenarios
112 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
113 */
114 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
115 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
116 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
117 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
118 #define BLOBID_SHIFT_BITS 4
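// Editor's illustrative sketch (not the authoritative definition; see
// ExtentMap::{encode,decode}_some() for the real encoding): the blob id is
// shifted left by BLOBID_SHIFT_BITS and the flag bits occupy the low nibble,
// roughly:
//
//   uint64_t v = (blobid << BLOBID_SHIFT_BITS) | flags;
//   // e.g. local blob id 7, contiguous + zero-offset:
//   //   (7 << 4) | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET == 0x73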
119
120 /*
121 * object name key structure
122 *
123 * encoded u8: shard + 2^7 (so that it sorts properly)
124 * encoded u64: poolid + 2^63 (so that it sorts properly)
125 * encoded u32: hash (bit reversed)
126 *
127 * escaped string: namespace
128 *
129 * escaped string: key or object name
130 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
131 * we are done. otherwise, it is followed by the object name.
132 * escaped string: object name (unless '=' above)
133 *
134 * encoded u64: snap
135 * encoded u64: generation
136 * 'o'
137 */
138 #define ONODE_KEY_SUFFIX 'o'
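// Worked example (editor's note, values illustrative only): an object "foo"
// with empty namespace and key, pool 1, hash 0x0000000a, snap CEPH_NOSNAP and
// no generation encodes roughly as
//
//   <shard+0x80> <pool+2^63> <bit-reversed hash>   (fixed-width prefix)
//   "!"                                            (escaped empty namespace)
//   "foo!" "="                                     ('=' means key == name)
//   <snap> <generation> 'o'
//
// get_object_key()/get_key_object() below are the authoritative encoder and
// decoder for this layout.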
139
140 /*
141 * extent shard key
142 *
143 * object prefix key
144 * u32
145 * 'x'
146 */
147 #define EXTENT_SHARD_KEY_SUFFIX 'x'
148
149 /*
150 * string encoding in the key
151 *
152 * The key string needs to lexicographically sort the same way that
153 * ghobject_t does. We do this by escaping anything <= to '#' with #
154 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
155 * hex digits.
156 *
157 * We use ! as a terminator for strings; this works because it is < #
158 * and will get escaped if it is present in the string.
159 *
160 * NOTE: There is a bug in this implementation: due to implicit
161 * character type conversion in comparison it may produce unexpected
162 * ordering. Unfortunately fixing the bug would mean invalidating the
163 * keys in existing deployments. Instead we do additional sorting
164 * where it is needed.
165 */
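// Worked example (editor's note): under this scheme
//   "foo"  -> "foo!"
//   "a!b"  -> "a#21b!"    ('!' <= '#', so it is escaped as #21)
//   "x~y"  -> "x~7ey!"    ('~' itself is escaped as ~7e)
// so a literal '!' can never appear unescaped before the terminator.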
166 template<typename S>
167 static void append_escaped(const string &in, S *out)
168 {
169 char hexbyte[in.length() * 3 + 1];
170 char* ptr = &hexbyte[0];
171 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
172 if (*i <= '#') { // bug: unexpected result for *i > 0x7f
173 *ptr++ = '#';
174 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
175 *ptr++ = "0123456789abcdef"[*i & 0x0f];
176 } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
177 *ptr++ = '~';
178 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
179 *ptr++ = "0123456789abcdef"[*i & 0x0f];
180 } else {
181 *ptr++ = *i;
182 }
183 }
184 *ptr++ = '!';
185 out->append(hexbyte, ptr - &hexbyte[0]);
186 }
187
188 inline unsigned h2i(char c)
189 {
190 if ((c >= '0') && (c <= '9')) {
191 return c - 0x30;
192 } else if ((c >= 'a') && (c <= 'f')) {
193 return c - 'a' + 10;
194 } else if ((c >= 'A') && (c <= 'F')) {
195 return c - 'A' + 10;
196 } else {
197 return 256; // make it always larger than 255
198 }
199 }
200
201 static int decode_escaped(const char *p, string *out)
202 {
203 char buff[256];
204 char* ptr = &buff[0];
205 char* max = &buff[252];
206 const char *orig_p = p;
207 while (*p && *p != '!') {
208 if (*p == '#' || *p == '~') {
209 unsigned hex = 0;
210 p++;
211 hex = h2i(*p++) << 4;
212 if (hex > 255) {
213 return -EINVAL;
214 }
215 hex |= h2i(*p++);
216 if (hex > 255) {
217 return -EINVAL;
218 }
219 *ptr++ = hex;
220 } else {
221 *ptr++ = *p++;
222 }
223 if (ptr > max) {
224 out->append(buff, ptr-buff);
225 ptr = &buff[0];
226 }
227 }
228 if (ptr != buff) {
229 out->append(buff, ptr-buff);
230 }
231 return p - orig_p;
232 }
233
234 // some things we encode in binary (as le32 or le64); print the
235 // resulting key strings nicely
236 template<typename S>
237 static string pretty_binary_string(const S& in)
238 {
239 char buf[10];
240 string out;
241 out.reserve(in.length() * 3);
242 enum { NONE, HEX, STRING } mode = NONE;
243 unsigned from = 0, i;
244 for (i=0; i < in.length(); ++i) {
245 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
246 (mode == HEX && in.length() - i >= 4 &&
247 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
248 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
249 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
250 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
251 if (mode == STRING) {
252 out.append(in.c_str() + from, i - from);
253 out.push_back('\'');
254 }
255 if (mode != HEX) {
256 out.append("0x");
257 mode = HEX;
258 }
259 if (in.length() - i >= 4) {
260 // print a whole u32 at once
261 snprintf(buf, sizeof(buf), "%08x",
262 (uint32_t)(((unsigned char)in[i] << 24) |
263 ((unsigned char)in[i+1] << 16) |
264 ((unsigned char)in[i+2] << 8) |
265 ((unsigned char)in[i+3] << 0)));
266 i += 3;
267 } else {
268 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
269 }
270 out.append(buf);
271 } else {
272 if (mode != STRING) {
273 out.push_back('\'');
274 mode = STRING;
275 from = i;
276 }
277 }
278 }
279 if (mode == STRING) {
280 out.append(in.c_str() + from, i - from);
281 out.push_back('\'');
282 }
283 return out;
284 }
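// Example output (editor's note): a key starting with the printable byte 'O'
// followed by the binary bytes 0x80 0x00 0x00 0x01 renders as
//   'O'0x80000001
// i.e. printable runs are quoted and binary runs are grouped into hex u32s
// while at least four bytes remain.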
285
286 template<typename T>
287 static void _key_encode_shard(shard_id_t shard, T *key)
288 {
289 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
290 }
291
292 static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
293 {
294 pshard->id = (uint8_t)*key - (uint8_t)0x80;
295 return key + 1;
296 }
297
298 static void get_coll_range(const coll_t& cid, int bits,
299 ghobject_t *temp_start, ghobject_t *temp_end,
300 ghobject_t *start, ghobject_t *end)
301 {
302 spg_t pgid;
303 if (cid.is_pg(&pgid)) {
304 start->shard_id = pgid.shard;
305 *temp_start = *start;
306
307 start->hobj.pool = pgid.pool();
308 temp_start->hobj.pool = -2ll - pgid.pool();
309
310 *end = *start;
311 *temp_end = *temp_start;
312
313 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
314 start->hobj.set_bitwise_key_u32(reverse_hash);
315 temp_start->hobj.set_bitwise_key_u32(reverse_hash);
316
317 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
318 if (end_hash > 0xffffffffull)
319 end_hash = 0xffffffffull;
320
321 end->hobj.set_bitwise_key_u32(end_hash);
322 temp_end->hobj.set_bitwise_key_u32(end_hash);
323 } else {
324 start->shard_id = shard_id_t::NO_SHARD;
325 start->hobj.pool = -1ull;
326
327 *end = *start;
328 start->hobj.set_bitwise_key_u32(0);
329 end->hobj.set_bitwise_key_u32(0xffffffff);
330
331 // no separate temp section
332 *temp_start = *end;
333 *temp_end = *end;
334 }
335
336 start->generation = 0;
337 end->generation = 0;
338 temp_start->generation = 0;
339 temp_end->generation = 0;
340 }
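// Worked example (editor's note): for a PG collection with bits = 6 the range
// covers a 1/2^6 slice of the reversed-hash space; if the reversed PG hash is
// 0x0c000000 the listing bounds become [0x0c000000, 0x0c000000 + (1ull << 26))
// = [0x0c000000, 0x10000000), clamped to 0xffffffff at the top.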
341
342 static void get_shared_blob_key(uint64_t sbid, string *key)
343 {
344 key->clear();
345 _key_encode_u64(sbid, key);
346 }
347
348 static int get_key_shared_blob(const string& key, uint64_t *sbid)
349 {
350 const char *p = key.c_str();
351 if (key.length() < sizeof(uint64_t))
352 return -1;
353 _key_decode_u64(p, sbid);
354 return 0;
355 }
356
357 template<typename S>
358 static void _key_encode_prefix(const ghobject_t& oid, S *key)
359 {
360 _key_encode_shard(oid.shard_id, key);
361 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
362 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
363 }
364
365 static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
366 {
367 p = _key_decode_shard(p, &oid->shard_id);
368
369 uint64_t pool;
370 p = _key_decode_u64(p, &pool);
371 oid->hobj.pool = pool - 0x8000000000000000ull;
372
373 unsigned hash;
374 p = _key_decode_u32(p, &hash);
375
376 oid->hobj.set_bitwise_key_u32(hash);
377
378 return p;
379 }
380
381 #define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
382
383 template<typename S>
384 static int get_key_object(const S& key, ghobject_t *oid)
385 {
386 int r;
387 const char *p = key.c_str();
388
389 if (key.length() < ENCODED_KEY_PREFIX_LEN)
390 return -1;
391
392 p = _key_decode_prefix(p, oid);
393
394 if (key.length() == ENCODED_KEY_PREFIX_LEN)
395 return -2;
396
397 r = decode_escaped(p, &oid->hobj.nspace);
398 if (r < 0)
399 return -2;
400 p += r + 1;
401
402 string k;
403 r = decode_escaped(p, &k);
404 if (r < 0)
405 return -3;
406 p += r + 1;
407 if (*p == '=') {
408 // no key
409 ++p;
410 oid->hobj.oid.name = k;
411 } else if (*p == '<' || *p == '>') {
412 // key + name
413 ++p;
414 r = decode_escaped(p, &oid->hobj.oid.name);
415 if (r < 0)
416 return -5;
417 p += r + 1;
418 oid->hobj.set_key(k);
419 } else {
420 // malformed
421 return -6;
422 }
423
424 p = _key_decode_u64(p, &oid->hobj.snap.val);
425 p = _key_decode_u64(p, &oid->generation);
426
427 if (*p != ONODE_KEY_SUFFIX) {
428 return -7;
429 }
430 p++;
431 if (*p) {
432 // if we get something other than a null terminator here,
433 // something is wrong.
434 return -8;
435 }
436
437 return 0;
438 }
439
440 template<typename S>
441 static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
442 {
443 key->clear();
444
445 size_t max_len = ENCODED_KEY_PREFIX_LEN +
446 (oid.hobj.nspace.length() * 3 + 1) +
447 (oid.hobj.get_key().length() * 3 + 1) +
448 1 + // for '<', '=', or '>'
449 (oid.hobj.oid.name.length() * 3 + 1) +
450 8 + 8 + 1;
451 key->reserve(max_len);
452
453 _key_encode_prefix(oid, key);
454
455 append_escaped(oid.hobj.nspace, key);
456
457 if (oid.hobj.get_key().length()) {
458 // is a key... could be < = or >.
459 append_escaped(oid.hobj.get_key(), key);
460 // (ASCII chars < = and > sort in that order, yay)
461 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
462 if (r) {
463 key->append(r > 0 ? ">" : "<");
464 append_escaped(oid.hobj.oid.name, key);
465 } else {
466 // same as no key
467 key->append("=");
468 }
469 } else {
470 // no key
471 append_escaped(oid.hobj.oid.name, key);
472 key->append("=");
473 }
474
475 _key_encode_u64(oid.hobj.snap, key);
476 _key_encode_u64(oid.generation, key);
477
478 key->push_back(ONODE_KEY_SUFFIX);
479
480 // sanity check
481 if (true) {
482 ghobject_t t;
483 int r = get_key_object(*key, &t);
484 if (r || t != oid) {
485 derr << " r " << r << dendl;
486 derr << "key " << pretty_binary_string(*key) << dendl;
487 derr << "oid " << oid << dendl;
488 derr << " t " << t << dendl;
489 ceph_assert(r == 0 && t == oid);
490 }
491 }
492 }
493
494
495 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
496 // char lets us quickly test whether it is a shard key without decoding any
497 // of the prefix bytes.
498 template<typename S>
499 static void get_extent_shard_key(const S& onode_key, uint32_t offset,
500 string *key)
501 {
502 key->clear();
503 key->reserve(onode_key.length() + 4 + 1);
504 key->append(onode_key.c_str(), onode_key.size());
505 _key_encode_u32(offset, key);
506 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
507 }
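// Example (editor's note): for an onode key K and extent shard offset 0x1000
// the shard key is simply K + <encoded u32 0x1000> + 'x'; the fixed suffix is
// what lets is_extent_shard_key() below answer with a single byte comparison.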
508
509 static void rewrite_extent_shard_key(uint32_t offset, string *key)
510 {
511 ceph_assert(key->size() > sizeof(uint32_t) + 1);
512 ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
513 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
514 }
515
516 template<typename S>
517 static void generate_extent_shard_key_and_apply(
518 const S& onode_key,
519 uint32_t offset,
520 string *key,
521 std::function<void(const string& final_key)> apply)
522 {
523 if (key->empty()) { // make full key
524 ceph_assert(!onode_key.empty());
525 get_extent_shard_key(onode_key, offset, key);
526 } else {
527 rewrite_extent_shard_key(offset, key);
528 }
529 apply(*key);
530 }
531
532 int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
533 {
534 ceph_assert(key.size() > sizeof(uint32_t) + 1);
535 ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
536 int okey_len = key.size() - sizeof(uint32_t) - 1;
537 *onode_key = key.substr(0, okey_len);
538 const char *p = key.data() + okey_len;
539 _key_decode_u32(p, offset);
540 return 0;
541 }
542
543 static bool is_extent_shard_key(const string& key)
544 {
545 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
546 }
547
548 static void get_deferred_key(uint64_t seq, string *out)
549 {
550 _key_encode_u64(seq, out);
551 }
552
553 static void get_pool_stat_key(int64_t pool_id, string *key)
554 {
555 key->clear();
556 _key_encode_u64(pool_id, key);
557 }
558
559 static int get_key_pool_stat(const string& key, uint64_t* pool_id)
560 {
561 const char *p = key.c_str();
562 if (key.length() < sizeof(uint64_t))
563 return -1;
564 _key_decode_u64(p, pool_id);
565 return 0;
566 }
567
568 template <int LogLevelV>
569 void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
570 {
571 uint64_t pos = 0;
572 for (auto& s : em.shards) {
573 dout(LogLevelV) << __func__ << " shard " << *s.shard_info
574 << (s.loaded ? " (loaded)" : "")
575 << (s.dirty ? " (dirty)" : "")
576 << dendl;
577 }
578 for (auto& e : em.extent_map) {
579 dout(LogLevelV) << __func__ << " " << e << dendl;
580 ceph_assert(e.logical_offset >= pos);
581 pos = e.logical_offset + e.length;
582 const bluestore_blob_t& blob = e.blob->get_blob();
583 if (blob.has_csum()) {
584 vector<uint64_t> v;
585 unsigned n = blob.get_csum_count();
586 for (unsigned i = 0; i < n; ++i)
587 v.push_back(blob.get_csum_item(i));
588 dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
589 << dendl;
590 }
591 std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
592 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
593 dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
594 << "~" << i.second->length << std::dec
595 << " " << *i.second << dendl;
596 }
597 }
598 }
599
600 template <int LogLevelV>
601 void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
602 {
603 if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
604 return;
605 dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
606 << " nid " << o.onode.nid
607 << " size 0x" << std::hex << o.onode.size
608 << " (" << std::dec << o.onode.size << ")"
609 << " expected_object_size " << o.onode.expected_object_size
610 << " expected_write_size " << o.onode.expected_write_size
611 << " in " << o.onode.extent_map_shards.size() << " shards"
612 << ", " << o.extent_map.spanning_blob_map.size()
613 << " spanning blobs"
614 << dendl;
615 for (auto p = o.onode.attrs.begin();
616 p != o.onode.attrs.end();
617 ++p) {
618 dout(LogLevelV) << __func__ << " attr " << p->first
619 << " len " << p->second.length() << dendl;
620 }
621 _dump_extent_map<LogLevelV>(cct, o.extent_map);
622 }
623
624 template <int LogLevelV>
625 void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
626 {
627 dout(LogLevelV) << __func__ << " transaction dump:\n";
628 JSONFormatter f(true);
629 f.open_object_section("transaction");
630 t->dump(&f);
631 f.close_section();
632 f.flush(*_dout);
633 *_dout << dendl;
634 }
635
636 // merge operators
637
638 struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
639 void merge_nonexistent(
640 const char *rdata, size_t rlen, std::string *new_value) override {
641 *new_value = std::string(rdata, rlen);
642 }
643 void merge(
644 const char *ldata, size_t llen,
645 const char *rdata, size_t rlen,
646 std::string *new_value) override {
647 ceph_assert(llen == rlen);
648 ceph_assert((rlen % 8) == 0);
649 new_value->resize(rlen);
650 const ceph_le64* lv = (const ceph_le64*)ldata;
651 const ceph_le64* rv = (const ceph_le64*)rdata;
652 ceph_le64* nv = &(ceph_le64&)new_value->at(0);
653 for (size_t i = 0; i < rlen >> 3; ++i) {
654 nv[i] = lv[i] + rv[i];
655 }
656 }
657 // We use each operator name and each prefix to construct the
658 // overall RocksDB operator name, for a consistency check at open time.
659 const char *name() const override {
660 return "int64_array";
661 }
662 };
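// Example (editor's note): merging two values that each hold two little-endian
// int64s, say an existing {10, 4} with an operand {1, -1}, yields {11, 3}.
// This lets counters kept under a single key (e.g. the statfs arrays) be
// updated with a RocksDB merge instead of a read-modify-write.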
663
664
665 // Buffer
666
667 ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
668 {
669 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
670 << b.offset << "~" << b.length << std::dec
671 << " " << BlueStore::Buffer::get_state_name(b.state);
672 if (b.flags)
673 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
674 return out << ")";
675 }
676
677 namespace {
678
679 /*
680 * Due to a bug in key string encoding (see a comment for append_escaped)
681 * the KeyValueDB iterator does not lexicographically sort the same
682 * way that ghobject_t does: objects with the same hash may have wrong order.
683 *
684 * This is the iterator wrapper that fixes the keys order.
685 */
686
687 class CollectionListIterator {
688 public:
689 CollectionListIterator(const KeyValueDB::Iterator &it)
690 : m_it(it) {
691 }
692 virtual ~CollectionListIterator() {
693 }
694
695 virtual bool valid() const = 0;
696 virtual const ghobject_t &oid() const = 0;
697 virtual void lower_bound(const ghobject_t &oid) = 0;
698 virtual void upper_bound(const ghobject_t &oid) = 0;
699 virtual void next() = 0;
700
701 virtual int cmp(const ghobject_t &oid) const = 0;
702
703 bool is_ge(const ghobject_t &oid) const {
704 return cmp(oid) >= 0;
705 }
706
707 bool is_lt(const ghobject_t &oid) const {
708 return cmp(oid) < 0;
709 }
710
711 protected:
712 KeyValueDB::Iterator m_it;
713 };
714
715 class SimpleCollectionListIterator : public CollectionListIterator {
716 public:
717 SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
718 : CollectionListIterator(it), m_cct(cct) {
719 }
720
721 bool valid() const override {
722 return m_it->valid();
723 }
724
725 const ghobject_t &oid() const override {
726 ceph_assert(valid());
727
728 return m_oid;
729 }
730
731 void lower_bound(const ghobject_t &oid) override {
732 string key;
733 get_object_key(m_cct, oid, &key);
734
735 m_it->lower_bound(key);
736 get_oid();
737 }
738
739 void upper_bound(const ghobject_t &oid) override {
740 string key;
741 get_object_key(m_cct, oid, &key);
742
743 m_it->upper_bound(key);
744 get_oid();
745 }
746
747 void next() override {
748 ceph_assert(valid());
749
750 m_it->next();
751 get_oid();
752 }
753
754 int cmp(const ghobject_t &oid) const override {
755 ceph_assert(valid());
756
757 string key;
758 get_object_key(m_cct, oid, &key);
759
760 return m_it->key().compare(key);
761 }
762
763 private:
764 CephContext *m_cct;
765 ghobject_t m_oid;
766
767 void get_oid() {
768 if (!valid()) {
769 return;
770 }
771
772 if (is_extent_shard_key(m_it->key())) {
773 next();
774 return;
775 }
776
777 m_oid = ghobject_t();
778 int r = get_key_object(m_it->key(), &m_oid);
779 ceph_assert(r == 0);
780 }
781 };
782
783 class SortedCollectionListIterator : public CollectionListIterator {
784 public:
785 SortedCollectionListIterator(const KeyValueDB::Iterator &it)
786 : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
787 }
788
789 bool valid() const override {
790 return m_chunk_iter != m_chunk.end();
791 }
792
793 const ghobject_t &oid() const override {
794 ceph_assert(valid());
795
796 return m_chunk_iter->first;
797 }
798
799 void lower_bound(const ghobject_t &oid) override {
800 std::string key;
801 _key_encode_prefix(oid, &key);
802
803 m_it->lower_bound(key);
804 m_chunk_iter = m_chunk.end();
805 if (!get_next_chunk()) {
806 return;
807 }
808
809 if (this->oid().shard_id != oid.shard_id ||
810 this->oid().hobj.pool != oid.hobj.pool ||
811 this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
812 return;
813 }
814
815 m_chunk_iter = m_chunk.lower_bound(oid);
816 if (m_chunk_iter == m_chunk.end()) {
817 get_next_chunk();
818 }
819 }
820
821 void upper_bound(const ghobject_t &oid) override {
822 lower_bound(oid);
823
824 if (valid() && this->oid() == oid) {
825 next();
826 }
827 }
828
829 void next() override {
830 ceph_assert(valid());
831
832 m_chunk_iter++;
833 if (m_chunk_iter == m_chunk.end()) {
834 get_next_chunk();
835 }
836 }
837
838 int cmp(const ghobject_t &oid) const override {
839 ceph_assert(valid());
840
841 if (this->oid() < oid) {
842 return -1;
843 }
844 if (this->oid() > oid) {
845 return 1;
846 }
847 return 0;
848 }
849
850 private:
851 std::map<ghobject_t, std::string> m_chunk;
852 std::map<ghobject_t, std::string>::iterator m_chunk_iter;
853
854 bool get_next_chunk() {
855 while (m_it->valid() && is_extent_shard_key(m_it->key())) {
856 m_it->next();
857 }
858
859 if (!m_it->valid()) {
860 return false;
861 }
862
863 ghobject_t oid;
864 int r = get_key_object(m_it->key(), &oid);
865 ceph_assert(r == 0);
866
867 m_chunk.clear();
868 while (true) {
869 m_chunk.insert({oid, m_it->key()});
870
871 do {
872 m_it->next();
873 } while (m_it->valid() && is_extent_shard_key(m_it->key()));
874
875 if (!m_it->valid()) {
876 break;
877 }
878
879 ghobject_t next;
880 r = get_key_object(m_it->key(), &next);
881 ceph_assert(r == 0);
882 if (next.shard_id != oid.shard_id ||
883 next.hobj.pool != oid.hobj.pool ||
884 next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
885 break;
886 }
887 oid = next;
888 }
889
890 m_chunk_iter = m_chunk.begin();
891 return true;
892 }
893 };
894
895 } // anonymous namespace
896
897 // Garbage Collector
898
899 void BlueStore::GarbageCollector::process_protrusive_extents(
900 const BlueStore::ExtentMap& extent_map,
901 uint64_t start_offset,
902 uint64_t end_offset,
903 uint64_t start_touch_offset,
904 uint64_t end_touch_offset,
905 uint64_t min_alloc_size)
906 {
907 ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
908
909 uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
910 uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
911
912 dout(30) << __func__ << " (hex): [" << std::hex
913 << lookup_start_offset << ", " << lookup_end_offset
914 << ")" << std::dec << dendl;
915
916 for (auto it = extent_map.seek_lextent(lookup_start_offset);
917 it != extent_map.extent_map.end() &&
918 it->logical_offset < lookup_end_offset;
919 ++it) {
920 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
921 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
922
923 dout(30) << __func__ << " " << *it
924 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
925 << dendl;
926
927 Blob* b = it->blob.get();
928
929 if (it->logical_offset >= start_touch_offset &&
930 it->logical_end() <= end_touch_offset) {
931 // Process extents within the range affected by
932 // the current write request.
933 // We need to take into account whether existing extents
934 // can be merged with them (the uncompressed case).
935 if (!b->get_blob().is_compressed()) {
936 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
937 --blob_info_counted->expected_allocations; // don't need to allocate
938 // new AU for compressed
939 // data since another
940 // collocated uncompressed
941 // blob already exists
942 dout(30) << __func__ << " --expected:"
943 << alloc_unit_start << dendl;
944 }
945 used_alloc_unit = alloc_unit_end;
946 blob_info_counted = nullptr;
947 }
948 } else if (b->get_blob().is_compressed()) {
949
950 // additionally we take compressed blobs that were not impacted
951 // by the write into account too
952 BlobInfo& bi =
953 affected_blobs.emplace(
954 b, BlobInfo(b->get_referenced_bytes())).first->second;
955
956 int adjust =
957 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
958 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
959 dout(30) << __func__ << " expected_allocations="
960 << bi.expected_allocations << " end_au:"
961 << alloc_unit_end << dendl;
962
963 blob_info_counted = &bi;
964 used_alloc_unit = alloc_unit_end;
965
966 ceph_assert(it->length <= bi.referenced_bytes);
967 bi.referenced_bytes -= it->length;
968 dout(30) << __func__ << " affected_blob:" << *b
969 << " unref 0x" << std::hex << it->length
970 << " referenced = 0x" << bi.referenced_bytes
971 << std::dec << dendl;
972 // NOTE: we can't move a specific blob to the resulting GC list here
973 // when its reference counter reaches 0, since subsequent extents might
974 // decrement its expected_allocation.
975 // Hence we need to enumerate all the extents first.
976 if (!bi.collect_candidate) {
977 bi.first_lextent = it;
978 bi.collect_candidate = true;
979 }
980 bi.last_lextent = it;
981 } else {
982 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
983 // don't need to allocate new AU for compressed data since another
984 // collocated uncompressed blob already exists
985 --blob_info_counted->expected_allocations;
986 dout(30) << __func__ << " --expected_allocations:"
987 << alloc_unit_start << dendl;
988 }
989 used_alloc_unit = alloc_unit_end;
990 blob_info_counted = nullptr;
991 }
992 }
993
994 for (auto b_it = affected_blobs.begin();
995 b_it != affected_blobs.end();
996 ++b_it) {
997 Blob* b = b_it->first;
998 BlobInfo& bi = b_it->second;
999 if (bi.referenced_bytes == 0) {
1000 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
1001 int64_t blob_expected_for_release =
1002 round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
1003
1004 dout(30) << __func__ << " " << *(b_it->first)
1005 << " expected4release=" << blob_expected_for_release
1006 << " expected_allocations=" << bi.expected_allocations
1007 << dendl;
1008 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
1009 if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
1010 if (bi.collect_candidate) {
1011 auto it = bi.first_lextent;
1012 bool bExit = false;
1013 do {
1014 if (it->blob.get() == b) {
1015 extents_to_collect.insert(it->logical_offset, it->length);
1016 }
1017 bExit = it == bi.last_lextent;
1018 ++it;
1019 } while (!bExit);
1020 }
1021 expected_for_release += blob_expected_for_release;
1022 expected_allocations += bi.expected_allocations;
1023 }
1024 }
1025 }
1026 }
1027
1028 int64_t BlueStore::GarbageCollector::estimate(
1029 uint64_t start_offset,
1030 uint64_t length,
1031 const BlueStore::ExtentMap& extent_map,
1032 const BlueStore::old_extent_map_t& old_extents,
1033 uint64_t min_alloc_size)
1034 {
1035
1036 affected_blobs.clear();
1037 extents_to_collect.clear();
1038 used_alloc_unit = boost::optional<uint64_t>();
1039 blob_info_counted = nullptr;
1040
1041 uint64_t gc_start_offset = start_offset;
1042 uint64_t gc_end_offset = start_offset + length;
1043
1044 uint64_t end_offset = start_offset + length;
1045
1046 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
1047 Blob* b = it->e.blob.get();
1048 if (b->get_blob().is_compressed()) {
1049
1050 // update gc_start_offset/gc_end_offset if needed
1051 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
1052 gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());
1053
1054 auto o = it->e.logical_offset;
1055 auto l = it->e.length;
1056
1057 uint64_t ref_bytes = b->get_referenced_bytes();
1058 // micro optimization to bypass blobs that have no more references
1059 if (ref_bytes != 0) {
1060 dout(30) << __func__ << " affected_blob:" << *b
1061 << " unref 0x" << std::hex << o << "~" << l
1062 << std::dec << dendl;
1063 affected_blobs.emplace(b, BlobInfo(ref_bytes));
1064 }
1065 }
1066 }
1067 dout(30) << __func__ << " gc range(hex): [" << std::hex
1068 << gc_start_offset << ", " << gc_end_offset
1069 << ")" << std::dec << dendl;
1070
1071 // enumerate preceding extents to check if they reference affected blobs
1072 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
1073 process_protrusive_extents(extent_map,
1074 gc_start_offset,
1075 gc_end_offset,
1076 start_offset,
1077 end_offset,
1078 min_alloc_size);
1079 }
1080 return expected_for_release - expected_allocations;
1081 }
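// Worked example (editor's note): if a compressed blob occupies 4 allocation
// units on disk (blob_expected_for_release = 4) but rewriting its still
// referenced data is expected to need only 1 new unit
// (bi.expected_allocations = 1), the per-blob benefit is 4 - 1 = 3; once that
// meets bluestore_gc_enable_blob_threshold its extents are queued in
// extents_to_collect, and estimate() returns the aggregate net gain.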
1082
1083 // LruOnodeCacheShard
1084 struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
1085 typedef boost::intrusive::list<
1086 BlueStore::Onode,
1087 boost::intrusive::member_hook<
1088 BlueStore::Onode,
1089 boost::intrusive::list_member_hook<>,
1090 &BlueStore::Onode::lru_item> > list_t;
1091
1092 list_t lru;
1093
1094 explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
1095
1096 void _add(BlueStore::Onode* o, int level) override
1097 {
1098 if (o->put_cache()) {
1099 (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
1100 } else {
1101 ++num_pinned;
1102 }
1103 ++num; // we count both pinned and unpinned entries
1104 dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
1105 }
1106 void _rm(BlueStore::Onode* o) override
1107 {
1108 if (o->pop_cache()) {
1109 lru.erase(lru.iterator_to(*o));
1110 } else {
1111 ceph_assert(num_pinned);
1112 --num_pinned;
1113 }
1114 ceph_assert(num);
1115 --num;
1116 dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
1117 }
1118 void _pin(BlueStore::Onode* o) override
1119 {
1120 lru.erase(lru.iterator_to(*o));
1121 ++num_pinned;
1122 dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
1123 }
1124 void _unpin(BlueStore::Onode* o) override
1125 {
1126 lru.push_front(*o);
1127 ceph_assert(num_pinned);
1128 --num_pinned;
1129 dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
1130 }
1131 void _unpin_and_rm(BlueStore::Onode* o) override
1132 {
1133 o->pop_cache();
1134 ceph_assert(num_pinned);
1135 --num_pinned;
1136 ceph_assert(num);
1137 --num;
1138 }
1139 void _trim_to(uint64_t new_size) override
1140 {
1141 if (new_size >= lru.size()) {
1142 return; // don't even try
1143 }
1144 uint64_t n = lru.size() - new_size;
1145 auto p = lru.end();
1146 ceph_assert(p != lru.begin());
1147 --p;
1148 ceph_assert(num >= n);
1149 num -= n;
1150 while (n-- > 0) {
1151 BlueStore::Onode *o = &*p;
1152 dout(20) << __func__ << " rm " << o->oid << " "
1153 << o->nref << " " << o->cached << " " << o->pinned << dendl;
1154 if (p != lru.begin()) {
1155 lru.erase(p--);
1156 } else {
1157 ceph_assert(n == 0);
1158 lru.erase(p);
1159 }
1160 auto pinned = !o->pop_cache();
1161 ceph_assert(!pinned);
1162 o->c->onode_map._remove(o->oid);
1163 }
1164 }
1165 void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
1166 {
1167 if (to == this) {
1168 return;
1169 }
1170 ceph_assert(o->cached);
1171 ceph_assert(o->pinned);
1172 ceph_assert(num);
1173 ceph_assert(num_pinned);
1174 --num_pinned;
1175 --num;
1176 ++to->num_pinned;
1177 ++to->num;
1178 }
1179 void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
1180 {
1181 *onodes += num;
1182 *pinned_onodes += num_pinned;
1183 }
1184 };
1185
1186 // OnodeCacheShard
1187 BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
1188 CephContext* cct,
1189 string type,
1190 PerfCounters *logger)
1191 {
1192 BlueStore::OnodeCacheShard *c = nullptr;
1193 // Currently we only implement an LRU cache for onodes
1194 c = new LruOnodeCacheShard(cct);
1195 c->logger = logger;
1196 return c;
1197 }
1198
1199 // LruBufferCacheShard
1200 struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
1201 typedef boost::intrusive::list<
1202 BlueStore::Buffer,
1203 boost::intrusive::member_hook<
1204 BlueStore::Buffer,
1205 boost::intrusive::list_member_hook<>,
1206 &BlueStore::Buffer::lru_item> > list_t;
1207 list_t lru;
1208
1209 explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
1210
1211 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
1212 if (near) {
1213 auto q = lru.iterator_to(*near);
1214 lru.insert(q, *b);
1215 } else if (level > 0) {
1216 lru.push_front(*b);
1217 } else {
1218 lru.push_back(*b);
1219 }
1220 buffer_bytes += b->length;
1221 num = lru.size();
1222 }
1223 void _rm(BlueStore::Buffer *b) override {
1224 ceph_assert(buffer_bytes >= b->length);
1225 buffer_bytes -= b->length;
1226 auto q = lru.iterator_to(*b);
1227 lru.erase(q);
1228 num = lru.size();
1229 }
1230 void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
1231 src->_rm(b);
1232 _add(b, 0, nullptr);
1233 }
1234 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
1235 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1236 buffer_bytes += delta;
1237 }
1238 void _touch(BlueStore::Buffer *b) override {
1239 auto p = lru.iterator_to(*b);
1240 lru.erase(p);
1241 lru.push_front(*b);
1242 num = lru.size();
1243 _audit("_touch_buffer end");
1244 }
1245
1246 void _trim_to(uint64_t max) override
1247 {
1248 while (buffer_bytes > max) {
1249 auto i = lru.rbegin();
1250 if (i == lru.rend()) {
1251 // stop if lru is now empty
1252 break;
1253 }
1254
1255 BlueStore::Buffer *b = &*i;
1256 ceph_assert(b->is_clean());
1257 dout(20) << __func__ << " rm " << *b << dendl;
1258 b->space->_rm_buffer(this, b);
1259 }
1260 num = lru.size();
1261 }
1262
1263 void add_stats(uint64_t *extents,
1264 uint64_t *blobs,
1265 uint64_t *buffers,
1266 uint64_t *bytes) override {
1267 *extents += num_extents;
1268 *blobs += num_blobs;
1269 *buffers += num;
1270 *bytes += buffer_bytes;
1271 }
1272 #ifdef DEBUG_CACHE
1273 void _audit(const char *s) override
1274 {
1275 dout(10) << __func__ << " " << s << " start" << dendl;
1276 uint64_t bytes = 0;
1277 for (auto i = lru.begin(); i != lru.end(); ++i) {
1278 bytes += i->length;
1279 }
1280 if (bytes != buffer_bytes) {
1281 derr << __func__ << " buffer_size " << buffer_bytes << " actual " << bytes
1282 << dendl;
1283 for (auto i = lru.begin(); i != lru.end(); ++i) {
1284 derr << __func__ << " " << *i << dendl;
1285 }
1286 ceph_assert(bytes == buffer_bytes);
1287 }
1288 dout(20) << __func__ << " " << s << " buffer_bytes " << buffer_bytes
1289 << " ok" << dendl;
1290 }
1291 #endif
1292 };
1293
1294 // TwoQBufferCacheShard
1295
1296 struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
1297 typedef boost::intrusive::list<
1298 BlueStore::Buffer,
1299 boost::intrusive::member_hook<
1300 BlueStore::Buffer,
1301 boost::intrusive::list_member_hook<>,
1302 &BlueStore::Buffer::lru_item> > list_t;
1303 list_t hot; ///< "Am" hot buffers
1304 list_t warm_in; ///< "A1in" newly warm buffers
1305 list_t warm_out; ///< "A1out" empty buffers we've evicted
1306 uint64_t buffer_bytes = 0; ///< bytes
1307
1308 enum {
1309 BUFFER_NEW = 0,
1310 BUFFER_WARM_IN, ///< in warm_in
1311 BUFFER_WARM_OUT, ///< in warm_out
1312 BUFFER_HOT, ///< in hot
1313 BUFFER_TYPE_MAX
1314 };
1315
1316 uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
1317
1318 public:
1319 explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
1320
1321 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
1322 {
1323 dout(20) << __func__ << " level " << level << " near " << near
1324 << " on " << *b
1325 << " which has cache_private " << b->cache_private << dendl;
1326 if (near) {
1327 b->cache_private = near->cache_private;
1328 switch (b->cache_private) {
1329 case BUFFER_WARM_IN:
1330 warm_in.insert(warm_in.iterator_to(*near), *b);
1331 break;
1332 case BUFFER_WARM_OUT:
1333 ceph_assert(b->is_empty());
1334 warm_out.insert(warm_out.iterator_to(*near), *b);
1335 break;
1336 case BUFFER_HOT:
1337 hot.insert(hot.iterator_to(*near), *b);
1338 break;
1339 default:
1340 ceph_abort_msg("bad cache_private");
1341 }
1342 } else if (b->cache_private == BUFFER_NEW) {
1343 b->cache_private = BUFFER_WARM_IN;
1344 if (level > 0) {
1345 warm_in.push_front(*b);
1346 } else {
1347 // take caller hint to start at the back of the warm queue
1348 warm_in.push_back(*b);
1349 }
1350 } else {
1351 // we got a hint from discard
1352 switch (b->cache_private) {
1353 case BUFFER_WARM_IN:
1354 // stay in warm_in. move to front, even though 2Q doesn't actually
1355 // do this.
1356 dout(20) << __func__ << " move to front of warm " << *b << dendl;
1357 warm_in.push_front(*b);
1358 break;
1359 case BUFFER_WARM_OUT:
1360 b->cache_private = BUFFER_HOT;
1361 // move to hot. fall-thru
1362 case BUFFER_HOT:
1363 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1364 hot.push_front(*b);
1365 break;
1366 default:
1367 ceph_abort_msg("bad cache_private");
1368 }
1369 }
1370 if (!b->is_empty()) {
1371 buffer_bytes += b->length;
1372 list_bytes[b->cache_private] += b->length;
1373 }
1374 num = hot.size() + warm_in.size();
1375 }
1376
1377 void _rm(BlueStore::Buffer *b) override
1378 {
1379 dout(20) << __func__ << " " << *b << dendl;
1380 if (!b->is_empty()) {
1381 ceph_assert(buffer_bytes >= b->length);
1382 buffer_bytes -= b->length;
1383 ceph_assert(list_bytes[b->cache_private] >= b->length);
1384 list_bytes[b->cache_private] -= b->length;
1385 }
1386 switch (b->cache_private) {
1387 case BUFFER_WARM_IN:
1388 warm_in.erase(warm_in.iterator_to(*b));
1389 break;
1390 case BUFFER_WARM_OUT:
1391 warm_out.erase(warm_out.iterator_to(*b));
1392 break;
1393 case BUFFER_HOT:
1394 hot.erase(hot.iterator_to(*b));
1395 break;
1396 default:
1397 ceph_abort_msg("bad cache_private");
1398 }
1399 num = hot.size() + warm_in.size();
1400 }
1401
1402 void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
1403 {
1404 TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
1405 src->_rm(b);
1406
1407 // preserve which list we're on (even if we can't preserve the order!)
1408 switch (b->cache_private) {
1409 case BUFFER_WARM_IN:
1410 ceph_assert(!b->is_empty());
1411 warm_in.push_back(*b);
1412 break;
1413 case BUFFER_WARM_OUT:
1414 ceph_assert(b->is_empty());
1415 warm_out.push_back(*b);
1416 break;
1417 case BUFFER_HOT:
1418 ceph_assert(!b->is_empty());
1419 hot.push_back(*b);
1420 break;
1421 default:
1422 ceph_abort_msg("bad cache_private");
1423 }
1424 if (!b->is_empty()) {
1425 buffer_bytes += b->length;
1426 list_bytes[b->cache_private] += b->length;
1427 }
1428 num = hot.size() + warm_in.size();
1429 }
1430
1431 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
1432 {
1433 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1434 if (!b->is_empty()) {
1435 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1436 buffer_bytes += delta;
1437 ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
1438 list_bytes[b->cache_private] += delta;
1439 }
1440 }
1441
1442 void _touch(BlueStore::Buffer *b) override {
1443 switch (b->cache_private) {
1444 case BUFFER_WARM_IN:
1445 // do nothing (somewhat counter-intuitively!)
1446 break;
1447 case BUFFER_WARM_OUT:
1448 // move from warm_out to hot LRU
1449 ceph_abort_msg("this happens via discard hint");
1450 break;
1451 case BUFFER_HOT:
1452 // move to front of hot LRU
1453 hot.erase(hot.iterator_to(*b));
1454 hot.push_front(*b);
1455 break;
1456 }
1457 num = hot.size() + warm_in.size();
1458 _audit("_touch_buffer end");
1459 }
1460
1461 void _trim_to(uint64_t max) override
1462 {
1463 if (buffer_bytes > max) {
1464 uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
1465 uint64_t khot = max - kin;
1466
1467 // pre-calculate kout based on the average buffer size, which is
1468 // typically representative (the warm_in and hot lists may change later)
1469 uint64_t kout = 0;
1470 uint64_t buffer_num = hot.size() + warm_in.size();
1471 if (buffer_num) {
1472 uint64_t avg_size = buffer_bytes / buffer_num;
1473 ceph_assert(avg_size);
1474 uint64_t calculated_num = max / avg_size;
1475 kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1476 }
1477
1478 if (list_bytes[BUFFER_HOT] < khot) {
1479 // hot is small, give slack to warm_in
1480 kin += khot - list_bytes[BUFFER_HOT];
1481 } else if (list_bytes[BUFFER_WARM_IN] < kin) {
1482 // warm_in is small, give slack to hot
1483 khot += kin - list_bytes[BUFFER_WARM_IN];
1484 }
1485
1486 // adjust warm_in list
1487 int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
1488 uint64_t evicted = 0;
1489
1490 while (to_evict_bytes > 0) {
1491 auto p = warm_in.rbegin();
1492 if (p == warm_in.rend()) {
1493 // stop if warm_in list is now empty
1494 break;
1495 }
1496
1497 BlueStore::Buffer *b = &*p;
1498 ceph_assert(b->is_clean());
1499 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1500 ceph_assert(buffer_bytes >= b->length);
1501 buffer_bytes -= b->length;
1502 ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
1503 list_bytes[BUFFER_WARM_IN] -= b->length;
1504 to_evict_bytes -= b->length;
1505 evicted += b->length;
1506 b->state = BlueStore::Buffer::STATE_EMPTY;
1507 b->data.clear();
1508 warm_in.erase(warm_in.iterator_to(*b));
1509 warm_out.push_front(*b);
1510 b->cache_private = BUFFER_WARM_OUT;
1511 }
1512
1513 if (evicted > 0) {
1514 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1515 << " from warm_in list, done evicting warm_in buffers"
1516 << dendl;
1517 }
1518
1519 // adjust hot list
1520 to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
1521 evicted = 0;
1522
1523 while (to_evict_bytes > 0) {
1524 auto p = hot.rbegin();
1525 if (p == hot.rend()) {
1526 // stop if hot list is now empty
1527 break;
1528 }
1529
1530 BlueStore::Buffer *b = &*p;
1531 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1532 ceph_assert(b->is_clean());
1533 // adjust evict size before buffer goes invalid
1534 to_evict_bytes -= b->length;
1535 evicted += b->length;
1536 b->space->_rm_buffer(this, b);
1537 }
1538
1539 if (evicted > 0) {
1540 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1541 << " from hot list, done evicting hot buffers"
1542 << dendl;
1543 }
1544
1545 // adjust warm out list too, if necessary
1546 int64_t n = warm_out.size() - kout;
1547 while (n-- > 0) {
1548 BlueStore::Buffer *b = &*warm_out.rbegin();
1549 ceph_assert(b->is_empty());
1550 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1551 b->space->_rm_buffer(this, b);
1552 }
1553 }
1554 num = hot.size() + warm_in.size();
1555 }
1556
1557 void add_stats(uint64_t *extents,
1558 uint64_t *blobs,
1559 uint64_t *buffers,
1560 uint64_t *bytes) override {
1561 *extents += num_extents;
1562 *blobs += num_blobs;
1563 *buffers += num;
1564 *bytes += buffer_bytes;
1565 }
1566
1567 #ifdef DEBUG_CACHE
1568 void _audit(const char *s) override
1569 {
1570 dout(10) << __func__ << " " << s << " start" << dendl;
1571 uint64_t bytes = 0;
1572 for (auto i = hot.begin(); i != hot.end(); ++i) {
1573 bytes += i->length;
1574 }
1575
1576 uint64_t hot_bytes = bytes;
1577 if (hot_bytes != list_bytes[BUFFER_HOT]) {
1578 derr << __func__ << " hot_list_bytes "
1579 << list_bytes[BUFFER_HOT]
1580 << " != actual " << hot_bytes
1581 << dendl;
1582 ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
1583 }
1584
1585 for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
1586 bytes += i->length;
1587 }
1588
1589 uint64_t warm_in_bytes = bytes - hot_bytes;
1590 if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
1591 derr << __func__ << " warm_in_list_bytes "
1592 << list_bytes[BUFFER_WARM_IN]
1593 << " != actual " << warm_in_bytes
1594 << dendl;
1595 ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
1596 }
1597
1598 if (bytes != buffer_bytes) {
1599 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << bytes
1600 << dendl;
1601 ceph_assert(bytes == buffer_bytes);
1602 }
1603
1604 dout(20) << __func__ << " " << s << " buffer_bytes " << buffer_bytes
1605 << " ok" << dendl;
1606 }
1607 #endif
1608 };
1609
1610 // BufferCacheShard
1611
1612 BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
1613 CephContext* cct,
1614 string type,
1615 PerfCounters *logger)
1616 {
1617 BufferCacheShard *c = nullptr;
1618 if (type == "lru")
1619 c = new LruBufferCacheShard(cct);
1620 else if (type == "2q")
1621 c = new TwoQBufferCacheShard(cct);
1622 else
1623 ceph_abort_msg("unrecognized cache type");
1624 c->logger = logger;
1625 return c;
1626 }
1627
1628 // BufferSpace
1629
1630 #undef dout_prefix
1631 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1632
1633 void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
1634 {
1635 // note: we already hold cache->lock
1636 ldout(cache->cct, 20) << __func__ << dendl;
1637 while (!buffer_map.empty()) {
1638 _rm_buffer(cache, buffer_map.begin());
1639 }
1640 }
1641
1642 int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
1643 {
1644 // note: we already hold cache->lock
1645 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1646 << std::dec << dendl;
1647 int cache_private = 0;
1648 cache->_audit("discard start");
1649 auto i = _data_lower_bound(offset);
1650 uint32_t end = offset + length;
1651 while (i != buffer_map.end()) {
1652 Buffer *b = i->second.get();
1653 if (b->offset >= end) {
1654 break;
1655 }
1656 if (b->cache_private > cache_private) {
1657 cache_private = b->cache_private;
1658 }
1659 if (b->offset < offset) {
1660 int64_t front = offset - b->offset;
1661 if (b->end() > end) {
1662 // drop middle (split)
1663 uint32_t tail = b->end() - end;
1664 if (b->data.length()) {
1665 bufferlist bl;
1666 bl.substr_of(b->data, b->length - tail, tail);
1667 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1668 nb->maybe_rebuild();
1669 _add_buffer(cache, nb, 0, b);
1670 } else {
1671 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1672 0, b);
1673 }
1674 if (!b->is_writing()) {
1675 cache->_adjust_size(b, front - (int64_t)b->length);
1676 }
1677 b->truncate(front);
1678 b->maybe_rebuild();
1679 cache->_audit("discard end 1");
1680 break;
1681 } else {
1682 // drop tail
1683 if (!b->is_writing()) {
1684 cache->_adjust_size(b, front - (int64_t)b->length);
1685 }
1686 b->truncate(front);
1687 b->maybe_rebuild();
1688 ++i;
1689 continue;
1690 }
1691 }
1692 if (b->end() <= end) {
1693 // drop entire buffer
1694 _rm_buffer(cache, i++);
1695 continue;
1696 }
1697 // drop front
1698 uint32_t keep = b->end() - end;
1699 if (b->data.length()) {
1700 bufferlist bl;
1701 bl.substr_of(b->data, b->length - keep, keep);
1702 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1703 nb->maybe_rebuild();
1704 _add_buffer(cache, nb, 0, b);
1705 } else {
1706 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1707 }
1708 _rm_buffer(cache, i);
1709 cache->_audit("discard end 2");
1710 break;
1711 }
1712 return cache_private;
1713 }
1714
1715 void BlueStore::BufferSpace::read(
1716 BufferCacheShard* cache,
1717 uint32_t offset,
1718 uint32_t length,
1719 BlueStore::ready_regions_t& res,
1720 interval_set<uint32_t>& res_intervals,
1721 int flags)
1722 {
1723 res.clear();
1724 res_intervals.clear();
1725 uint32_t want_bytes = length;
1726 uint32_t end = offset + length;
1727
1728 {
1729 std::lock_guard l(cache->lock);
1730 for (auto i = _data_lower_bound(offset);
1731 i != buffer_map.end() && offset < end && i->first < end;
1732 ++i) {
1733 Buffer *b = i->second.get();
1734 ceph_assert(b->end() > offset);
1735
1736 bool val = false;
1737 if (flags & BYPASS_CLEAN_CACHE)
1738 val = b->is_writing();
1739 else
1740 val = b->is_writing() || b->is_clean();
1741 if (val) {
1742 if (b->offset < offset) {
1743 uint32_t skip = offset - b->offset;
1744 uint32_t l = min(length, b->length - skip);
1745 res[offset].substr_of(b->data, skip, l);
1746 res_intervals.insert(offset, l);
1747 offset += l;
1748 length -= l;
1749 if (!b->is_writing()) {
1750 cache->_touch(b);
1751 }
1752 continue;
1753 }
1754 if (b->offset > offset) {
1755 uint32_t gap = b->offset - offset;
1756 if (length <= gap) {
1757 break;
1758 }
1759 offset += gap;
1760 length -= gap;
1761 }
1762 if (!b->is_writing()) {
1763 cache->_touch(b);
1764 }
1765 if (b->length > length) {
1766 res[offset].substr_of(b->data, 0, length);
1767 res_intervals.insert(offset, length);
1768 break;
1769 } else {
1770 res[offset].append(b->data);
1771 res_intervals.insert(offset, b->length);
1772 if (b->length == length)
1773 break;
1774 offset += b->length;
1775 length -= b->length;
1776 }
1777 }
1778 }
1779 }
1780
1781 uint64_t hit_bytes = res_intervals.size();
1782 ceph_assert(hit_bytes <= want_bytes);
1783 uint64_t miss_bytes = want_bytes - hit_bytes;
1784 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1785 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1786 }
1787
1788 void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
1789 {
1790 auto i = writing.begin();
1791 while (i != writing.end()) {
1792 if (i->seq > seq) {
1793 break;
1794 }
1795 if (i->seq < seq) {
1796 ++i;
1797 continue;
1798 }
1799
1800 Buffer *b = &*i;
1801 ceph_assert(b->is_writing());
1802
1803 if (b->flags & Buffer::FLAG_NOCACHE) {
1804 writing.erase(i++);
1805 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1806 buffer_map.erase(b->offset);
1807 } else {
1808 b->state = Buffer::STATE_CLEAN;
1809 writing.erase(i++);
1810 b->maybe_rebuild();
1811 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1812 cache->_add(b, 1, nullptr);
1813 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1814 }
1815 }
1816 cache->_trim();
1817 cache->_audit("finish_write end");
1818 }
1819
1820 void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
1821 {
1822 std::lock_guard lk(cache->lock);
1823 if (buffer_map.empty())
1824 return;
1825
1826 auto p = --buffer_map.end();
1827 while (true) {
1828 if (p->second->end() <= pos)
1829 break;
1830
1831 if (p->second->offset < pos) {
1832 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1833 size_t left = pos - p->second->offset;
1834 size_t right = p->second->length - left;
1835 if (p->second->data.length()) {
1836 bufferlist bl;
1837 bl.substr_of(p->second->data, left, right);
1838 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1839 0, p->second.get());
1840 } else {
1841 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1842 0, p->second.get());
1843 }
1844 cache->_adjust_size(p->second.get(), -right);
1845 p->second->truncate(left);
1846 break;
1847 }
1848
1849 ceph_assert(p->second->end() > pos);
1850 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1851 if (p->second->data.length()) {
1852 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1853 p->second->offset - pos, p->second->data),
1854 0, p->second.get());
1855 } else {
1856 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1857 p->second->offset - pos, p->second->length),
1858 0, p->second.get());
1859 }
1860 if (p == buffer_map.begin()) {
1861 _rm_buffer(cache, p);
1862 break;
1863 } else {
1864 _rm_buffer(cache, p--);
1865 }
1866 }
1867 ceph_assert(writing.empty());
1868 cache->_trim();
1869 }
1870
1871 // OnodeSpace
1872
1873 #undef dout_prefix
1874 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1875
1876 BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
1877 OnodeRef& o)
1878 {
1879 std::lock_guard l(cache->lock);
1880 auto p = onode_map.find(oid);
1881 if (p != onode_map.end()) {
1882 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1883 << " raced, returning existing " << p->second
1884 << dendl;
1885 return p->second;
1886 }
1887 ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
1888 onode_map[oid] = o;
1889 cache->_add(o.get(), 1);
1890 cache->_trim();
1891 return o;
1892 }
1893
1894 void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
1895 {
1896 ldout(cache->cct, 20) << __func__ << " " << oid << dendl;
1897 onode_map.erase(oid);
1898 }
1899
1900 BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1901 {
1902 ldout(cache->cct, 30) << __func__ << dendl;
1903 OnodeRef o;
1904 bool hit = false;
1905
1906 {
1907 std::lock_guard l(cache->lock);
1908 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1909 if (p == onode_map.end()) {
1910 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1911 } else {
1912 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1913 << " " << p->second->nref
1914 << " " << p->second->cached
1915 << " " << p->second->pinned
1916 << dendl;
1917 // This will pin the onode and implicitly touch the cache when the
1918 // Onode eventually becomes unpinned
1919 o = p->second;
1920 ceph_assert(!o->cached || o->pinned);
1921
1922 hit = true;
1923 }
1924 }
1925
1926 if (hit) {
1927 cache->logger->inc(l_bluestore_onode_hits);
1928 } else {
1929 cache->logger->inc(l_bluestore_onode_misses);
1930 }
1931 return o;
1932 }
1933
1934 void BlueStore::OnodeSpace::clear()
1935 {
1936 std::lock_guard l(cache->lock);
1937 ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
1938 for (auto &p : onode_map) {
1939 cache->_rm(p.second.get());
1940 }
1941 onode_map.clear();
1942 }
1943
1944 bool BlueStore::OnodeSpace::empty()
1945 {
1946 std::lock_guard l(cache->lock);
1947 return onode_map.empty();
1948 }
1949
1950 void BlueStore::OnodeSpace::rename(
1951 OnodeRef& oldo,
1952 const ghobject_t& old_oid,
1953 const ghobject_t& new_oid,
1954 const mempool::bluestore_cache_meta::string& new_okey)
1955 {
1956 std::lock_guard l(cache->lock);
1957 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1958 << dendl;
1959 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1960 po = onode_map.find(old_oid);
1961 pn = onode_map.find(new_oid);
1962 ceph_assert(po != pn);
1963
1964 ceph_assert(po != onode_map.end());
1965 if (pn != onode_map.end()) {
1966 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1967 << dendl;
1968 cache->_rm(pn->second.get());
1969 onode_map.erase(pn);
1970 }
1971 OnodeRef o = po->second;
1972
1973 // install a non-existent onode at old location
1974 oldo.reset(new Onode(o->c, old_oid, o->key));
1975 po->second = oldo;
1976 cache->_add(oldo.get(), 1);
1977 // add at new position and fix oid, key.
1978 // This will pin 'o' and implicitly touch the cache
1979 // when it eventually becomes unpinned
1980 onode_map.insert(make_pair(new_oid, o));
1981 ceph_assert(o->pinned);
1982
1983 o->oid = new_oid;
1984 o->key = new_okey;
1985 cache->_trim();
1986 }
1987
1988 bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
1989 {
1990 std::lock_guard l(cache->lock);
1991 ldout(cache->cct, 20) << __func__ << dendl;
1992 for (auto& i : onode_map) {
1993 if (f(i.second.get())) {
1994 return true;
1995 }
1996 }
1997 return false;
1998 }
1999
2000 template <int LogLevelV = 30>
2001 void BlueStore::OnodeSpace::dump(CephContext *cct)
2002 {
2003 for (auto& i : onode_map) {
2004 ldout(cct, LogLevelV) << i.first << " : " << i.second
2005 << " " << i.second->nref
2006 << " " << i.second->cached
2007 << " " << i.second->pinned
2008 << dendl;
2009 }
2010 }
2011
2012 // SharedBlob
2013
2014 #undef dout_prefix
2015 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
2016 #undef dout_context
2017 #define dout_context coll->store->cct
2018
2019 void BlueStore::SharedBlob::dump(Formatter* f) const
2020 {
2021 f->dump_bool("loaded", loaded);
2022 if (loaded) {
2023 persistent->dump(f);
2024 } else {
2025 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
2026 }
2027 }
2028
2029 ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
2030 {
2031 out << "SharedBlob(" << &sb;
2032
2033 if (sb.loaded) {
2034 out << " loaded " << *sb.persistent;
2035 } else {
2036 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
2037 }
2038 return out << ")";
2039 }
2040
2041 BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
2042 : coll(_coll), sbid_unloaded(i)
2043 {
2044 ceph_assert(sbid_unloaded > 0);
2045 if (get_cache()) {
2046 get_cache()->add_blob();
2047 }
2048 }
2049
2050 BlueStore::SharedBlob::~SharedBlob()
2051 {
2052 if (loaded && persistent) {
2053 delete persistent;
2054 }
2055 }
2056
2057 void BlueStore::SharedBlob::put()
2058 {
2059 if (--nref == 0) {
2060 dout(20) << __func__ << " " << this
2061 << " removing self from set " << get_parent()
2062 << dendl;
2063 again:
2064 auto coll_snap = coll;
2065 if (coll_snap) {
2066 std::lock_guard l(coll_snap->cache->lock);
2067 if (coll_snap != coll) {
2068 goto again;
2069 }
2070 if (!coll_snap->shared_blob_set.remove(this, true)) {
2071 // race with lookup
2072 return;
2073 }
2074 bc._clear(coll_snap->cache);
2075 coll_snap->cache->rm_blob();
2076 }
2077 delete this;
2078 }
2079 }
2080
2081 void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
2082 {
2083 ceph_assert(persistent);
2084 persistent->ref_map.get(offset, length);
2085 }
2086
2087 void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
2088 PExtentVector *r,
2089 bool *unshare)
2090 {
2091 ceph_assert(persistent);
2092 persistent->ref_map.put(offset, length, r,
2093 unshare && !*unshare ? unshare : nullptr);
2094 }
2095
2096 void BlueStore::SharedBlob::finish_write(uint64_t seq)
2097 {
2098 while (true) {
2099 BufferCacheShard *cache = coll->cache;
2100 std::lock_guard l(cache->lock);
2101 if (coll->cache != cache) {
2102 dout(20) << __func__
2103 << " raced with sb cache update, was " << cache
2104 << ", now " << coll->cache << ", retrying"
2105 << dendl;
2106 continue;
2107 }
2108 bc._finish_write(cache, seq);
2109 break;
2110 }
2111 }
2112
2113 // SharedBlobSet
2114
2115 #undef dout_prefix
2116 #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2117
2118 template <int LogLevelV = 30>
2119 void BlueStore::SharedBlobSet::dump(CephContext *cct)
2120 {
2121 std::lock_guard l(lock);
2122 for (auto& i : sb_map) {
2123 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
2124 }
2125 }
2126
2127 // Blob
2128
2129 #undef dout_prefix
2130 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2131
2132 void BlueStore::Blob::dump(Formatter* f) const
2133 {
2134 if (is_spanning()) {
2135 f->dump_unsigned("spanning_id", id);
2136 }
2137 blob.dump(f);
2138 if (shared_blob) {
2139 f->dump_object("shared", *shared_blob);
2140 }
2141 }
2142
2143 ostream& operator<<(ostream& out, const BlueStore::Blob& b)
2144 {
2145 out << "Blob(" << &b;
2146 if (b.is_spanning()) {
2147 out << " spanning " << b.id;
2148 }
2149 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
2150 if (b.shared_blob) {
2151 out << " " << *b.shared_blob;
2152 } else {
2153 out << " (shared_blob=NULL)";
2154 }
2155 out << ")";
2156 return out;
2157 }
2158
2159 void BlueStore::Blob::discard_unallocated(Collection *coll)
2160 {
2161 if (get_blob().is_shared()) {
2162 return;
2163 }
2164 if (get_blob().is_compressed()) {
2165 bool discard = false;
2166 bool all_invalid = true;
2167 for (auto e : get_blob().get_extents()) {
2168 if (!e.is_valid()) {
2169 discard = true;
2170 } else {
2171 all_invalid = false;
2172 }
2173 }
2174 ceph_assert(discard == all_invalid); // for a compressed blob either all
2175 // or none of the pextents are invalid.
2176 if (discard) {
2177 shared_blob->bc.discard(shared_blob->get_cache(), 0,
2178 get_blob().get_logical_length());
2179 }
2180 } else {
2181 size_t pos = 0;
2182 for (auto e : get_blob().get_extents()) {
2183 if (!e.is_valid()) {
2184 dout(20) << __func__ << " 0x" << std::hex << pos
2185 << "~" << e.length
2186 << std::dec << dendl;
2187 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
2188 }
2189 pos += e.length;
2190 }
2191 if (get_blob().can_prune_tail()) {
2192 dirty_blob().prune_tail();
2193 used_in_blob.prune_tail(get_blob().get_ondisk_length());
2194 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
2195 }
2196 }
2197 }
2198
2199 void BlueStore::Blob::get_ref(
2200 Collection *coll,
2201 uint32_t offset,
2202 uint32_t length)
2203 {
2204 // The caller has to initialize the Blob's logical length prior to
2205 // incrementing references. Otherwise we can determine neither the required
2206 // number of counters for per-au tracking nor min_release_size
2207 // for single-counter mode.
2208 ceph_assert(get_blob().get_logical_length() != 0);
2209 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2210 << std::dec << " " << *this << dendl;
2211
2212 if (used_in_blob.is_empty()) {
2213 uint32_t min_release_size =
2214 get_blob().get_release_size(coll->store->min_alloc_size);
2215 uint64_t l = get_blob().get_logical_length();
2216 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
2217 << min_release_size << std::dec << dendl;
2218 used_in_blob.init(l, min_release_size);
2219 }
2220 used_in_blob.get(
2221 offset,
2222 length);
2223 }
2224
2225 bool BlueStore::Blob::put_ref(
2226 Collection *coll,
2227 uint32_t offset,
2228 uint32_t length,
2229 PExtentVector *r)
2230 {
2231 PExtentVector logical;
2232
2233 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2234 << std::dec << " " << *this << dendl;
2235
2236 bool empty = used_in_blob.put(
2237 offset,
2238 length,
2239 &logical);
2240 r->clear();
2241 // nothing to release
2242 if (!empty && logical.empty()) {
2243 return false;
2244 }
2245
2246 bluestore_blob_t& b = dirty_blob();
2247 return b.release_extents(empty, logical, r);
2248 }
2249
2250 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
2251 uint32_t target_blob_size,
2252 uint32_t b_offset,
2253 uint32_t *length0) {
2254 ceph_assert(min_alloc_size);
2255 ceph_assert(target_blob_size);
2256 if (!get_blob().is_mutable()) {
2257 return false;
2258 }
2259
2260 uint32_t length = *length0;
2261 uint32_t end = b_offset + length;
2262
2263 // Currently for the sake of simplicity we omit blob reuse if data is
2264 // unaligned with csum chunk. Later we can perform padding if needed.
2265 if (get_blob().has_csum() &&
2266 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2267 (end % get_blob().get_csum_chunk_size()) != 0)) {
2268 return false;
2269 }
2270
2271 auto blen = get_blob().get_logical_length();
2272 uint32_t new_blen = blen;
2273
2274 // make sure target_blob_size isn't less than current blob len
2275 target_blob_size = std::max(blen, target_blob_size);
2276
2277 if (b_offset >= blen) {
2278 // new data totally stands out of the existing blob
2279 new_blen = end;
2280 } else {
2281 // new data overlaps with the existing blob
2282 new_blen = std::max(blen, end);
2283
2284 uint32_t overlap = 0;
2285 if (new_blen > blen) {
2286 overlap = blen - b_offset;
2287 } else {
2288 overlap = length;
2289 }
2290
2291 if (!get_blob().is_unallocated(b_offset, overlap)) {
2292 // abort if any piece of the overlap has already been allocated
2293 return false;
2294 }
2295 }
2296
2297 if (new_blen > blen) {
2298 int64_t overflow = int64_t(new_blen) - target_blob_size;
2299 // Unable to decrease the provided length to fit into max_blob_size
2300 if (overflow >= length) {
2301 return false;
2302 }
2303
2304 // FIXME: in some cases we could reduce unused resolution
2305 if (get_blob().has_unused()) {
2306 return false;
2307 }
2308
2309 if (overflow > 0) {
2310 new_blen -= overflow;
2311 length -= overflow;
2312 *length0 = length;
2313 }
2314
2315 if (new_blen > blen) {
2316 dirty_blob().add_tail(new_blen);
2317 used_in_blob.add_tail(new_blen,
2318 get_blob().get_release_size(min_alloc_size));
2319 }
2320 }
2321 return true;
2322 }
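// Worked example for can_reuse_blob (illustrative only, not part of the
// original code): assume min_alloc_size = 0x1000, blen = 0x8000,
// target_blob_size = 0x10000, b_offset = 0xc000, *length0 = 0x6000.
//   end      = 0x12000
//   new_blen = 0x12000 (the write lands past the current blob end)
//   overflow = 0x12000 - 0x10000 = 0x2000, which is < length, so the request
//   is trimmed: new_blen = 0x10000, *length0 = 0x4000, and add_tail() grows
//   the blob from 0x8000 to 0x10000. The caller writes only the trimmed
//   length into this blob and must place the remaining 0x2000 elsewhere.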
2323
2324 void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2325 {
2326 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2327 << " start " << *this << dendl;
2328 ceph_assert(blob.can_split());
2329 ceph_assert(used_in_blob.can_split());
2330 bluestore_blob_t &lb = dirty_blob();
2331 bluestore_blob_t &rb = r->dirty_blob();
2332
2333 used_in_blob.split(
2334 blob_offset,
2335 &(r->used_in_blob));
2336
2337 lb.split(blob_offset, rb);
2338 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2339
2340 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2341 << " finish " << *this << dendl;
2342 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2343 << " and " << *r << dendl;
2344 }
2345
2346 #ifndef CACHE_BLOB_BL
2347 void BlueStore::Blob::decode(
2348 Collection *coll,
2349 bufferptr::const_iterator& p,
2350 uint64_t struct_v,
2351 uint64_t* sbid,
2352 bool include_ref_map)
2353 {
2354 denc(blob, p, struct_v);
2355 if (blob.is_shared()) {
2356 denc(*sbid, p);
2357 }
2358 if (include_ref_map) {
2359 if (struct_v > 1) {
2360 used_in_blob.decode(p);
2361 } else {
2362 used_in_blob.clear();
2363 bluestore_extent_ref_map_t legacy_ref_map;
2364 legacy_ref_map.decode(p);
2365 for (auto r : legacy_ref_map.ref_map) {
2366 get_ref(
2367 coll,
2368 r.first,
2369 r.second.refs * r.second.length);
2370 }
2371 }
2372 }
2373 }
2374 #endif
2375
2376 // Extent
2377
2378 void BlueStore::Extent::dump(Formatter* f) const
2379 {
2380 f->dump_unsigned("logical_offset", logical_offset);
2381 f->dump_unsigned("length", length);
2382 f->dump_unsigned("blob_offset", blob_offset);
2383 f->dump_object("blob", *blob);
2384 }
2385
2386 ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2387 {
2388 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2389 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2390 << " " << *e.blob;
2391 }
2392
2393 // OldExtent
2394 BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2395 uint32_t lo,
2396 uint32_t o,
2397 uint32_t l,
2398 BlobRef& b) {
2399 OldExtent* oe = new OldExtent(lo, o, l, b);
2400 b->put_ref(c.get(), o, l, &(oe->r));
2401 oe->blob_empty = !b->is_referenced();
2402 return oe;
2403 }
2404
2405 // ExtentMap
2406
2407 #undef dout_prefix
2408 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2409 #undef dout_context
2410 #define dout_context onode->c->store->cct
2411
2412 BlueStore::ExtentMap::ExtentMap(Onode *o)
2413 : onode(o),
2414 inline_bl(
2415 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2416 }
2417
2418 void BlueStore::ExtentMap::dump(Formatter* f) const
2419 {
2420 f->open_array_section("extents");
2421
2422 for (auto& e : extent_map) {
2423 f->dump_object("extent", e);
2424 }
2425 f->close_section();
2426 }
2427
2428 void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2429 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2430 uint64_t& length, uint64_t& dstoff) {
2431
2432 auto cct = onode->c->store->cct;
2433 bool inject_21040 =
2434 cct->_conf->bluestore_debug_inject_bug21040;
2435 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2436 for (auto& e : oldo->extent_map.extent_map) {
2437 e.blob->last_encoded_id = -1;
2438 }
2439
2440 int n = 0;
2441 uint64_t end = srcoff + length;
2442 uint32_t dirty_range_begin = 0;
2443 uint32_t dirty_range_end = 0;
2444 bool src_dirty = false;
2445 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2446 ep != oldo->extent_map.extent_map.end();
2447 ++ep) {
2448 auto& e = *ep;
2449 if (e.logical_offset >= end) {
2450 break;
2451 }
2452 dout(20) << __func__ << " src " << e << dendl;
2453 BlobRef cb;
2454 bool blob_duped = true;
2455 if (e.blob->last_encoded_id >= 0) {
2456 cb = id_to_blob[e.blob->last_encoded_id];
2457 blob_duped = false;
2458 } else {
2459 // dup the blob
2460 const bluestore_blob_t& blob = e.blob->get_blob();
2461 // make sure it is shared
2462 if (!blob.is_shared()) {
2463 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2464 if (!inject_21040 && !src_dirty) {
2465 src_dirty = true;
2466 dirty_range_begin = e.logical_offset;
2467 } else if (inject_21040 &&
2468 dirty_range_begin == 0 && dirty_range_end == 0) {
2469 dirty_range_begin = e.logical_offset;
2470 }
2471 ceph_assert(e.logical_end() > 0);
2472 // -1 to exclude next potential shard
2473 dirty_range_end = e.logical_end() - 1;
2474 } else {
2475 c->load_shared_blob(e.blob->shared_blob);
2476 }
2477 cb = new Blob();
2478 e.blob->last_encoded_id = n;
2479 id_to_blob[n] = cb;
2480 e.blob->dup(*cb);
2481 // bump the extent refs on the copied blob's extents
2482 for (auto p : blob.get_extents()) {
2483 if (p.is_valid()) {
2484 e.blob->shared_blob->get_ref(p.offset, p.length);
2485 }
2486 }
2487 txc->write_shared_blob(e.blob->shared_blob);
2488 dout(20) << __func__ << " new " << *cb << dendl;
2489 }
2490
2491 int skip_front, skip_back;
2492 if (e.logical_offset < srcoff) {
2493 skip_front = srcoff - e.logical_offset;
2494 } else {
2495 skip_front = 0;
2496 }
2497 if (e.logical_end() > end) {
2498 skip_back = e.logical_end() - end;
2499 } else {
2500 skip_back = 0;
2501 }
2502
2503 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2504 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2505 newo->extent_map.extent_map.insert(*ne);
2506 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2507 // fixme: we may leave parts of new blob unreferenced that could
2508 // be freed (relative to the shared_blob).
2509 txc->statfs_delta.stored() += ne->length;
2510 if (e.blob->get_blob().is_compressed()) {
2511 txc->statfs_delta.compressed_original() += ne->length;
2512 if (blob_duped) {
2513 txc->statfs_delta.compressed() +=
2514 cb->get_blob().get_compressed_payload_length();
2515 }
2516 }
2517 dout(20) << __func__ << " dst " << *ne << dendl;
2518 ++n;
2519 }
2520 if ((!inject_21040 && src_dirty) ||
2521 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2522 oldo->extent_map.dirty_range(dirty_range_begin,
2523 dirty_range_end - dirty_range_begin);
2524 txc->write_onode(oldo);
2525 }
2526 txc->write_onode(newo);
2527
2528 if (dstoff + length > newo->onode.size) {
2529 newo->onode.size = dstoff + length;
2530 }
2531 newo->extent_map.dirty_range(dstoff, length);
2532 }
2533 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2534 bool force)
2535 {
2536 auto cct = onode->c->store->cct; //used by dout
2537 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2538 if (onode->onode.extent_map_shards.empty()) {
2539 if (inline_bl.length() == 0) {
2540 unsigned n;
2541 // we need to encode inline_bl to measure encoded length
2542 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
2543 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
2544 ceph_assert(!never_happen);
2545 size_t len = inline_bl.length();
2546 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2547 << " extents" << dendl;
2548 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2549 request_reshard(0, OBJECT_MAX_SIZE);
2550 return;
2551 }
2552 }
2553 // will persist in the onode key.
2554 } else {
2555 // pending shard update
2556 struct dirty_shard_t {
2557 Shard *shard;
2558 bufferlist bl;
2559 dirty_shard_t(Shard *s) : shard(s) {}
2560 };
2561 vector<dirty_shard_t> encoded_shards;
2562 // allocate slots for all shards in a single call instead of
2563 // doing multiple allocations - one per dirty shard
2564 encoded_shards.reserve(shards.size());
2565
2566 auto p = shards.begin();
2567 auto prev_p = p;
2568 while (p != shards.end()) {
2569 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
2570 auto n = p;
2571 ++n;
2572 if (p->dirty) {
2573 uint32_t endoff;
2574 if (n == shards.end()) {
2575 endoff = OBJECT_MAX_SIZE;
2576 } else {
2577 endoff = n->shard_info->offset;
2578 }
2579 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2580 bufferlist& bl = encoded_shards.back().bl;
2581 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2582 bl, &p->extents)) {
2583 if (force) {
2584 derr << __func__ << " encode_some needs reshard" << dendl;
2585 ceph_assert(!force);
2586 }
2587 }
2588 size_t len = bl.length();
2589
2590 dout(20) << __func__ << " shard 0x" << std::hex
2591 << p->shard_info->offset << std::dec << " is " << len
2592 << " bytes (was " << p->shard_info->bytes << ") from "
2593 << p->extents << " extents" << dendl;
2594
2595 if (!force) {
2596 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2597 // we are big; reshard ourselves
2598 request_reshard(p->shard_info->offset, endoff);
2599 }
2600 // avoid resharding the trailing shard, even if it is small
2601 else if (n != shards.end() &&
2602 len < g_conf()->bluestore_extent_map_shard_min_size) {
2603 ceph_assert(endoff != OBJECT_MAX_SIZE);
2604 if (p == shards.begin()) {
2605 // we are the first shard, combine with next shard
2606 request_reshard(p->shard_info->offset, endoff + 1);
2607 } else {
2608 // combine either with the previous shard or the next,
2609 // whichever is smaller
2610 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2611 request_reshard(p->shard_info->offset, endoff + 1);
2612 } else {
2613 request_reshard(prev_p->shard_info->offset, endoff);
2614 }
2615 }
2616 }
2617 }
2618 }
2619 prev_p = p;
2620 p = n;
2621 }
2622 if (needs_reshard()) {
2623 return;
2624 }
2625
2626 // schedule DB update for dirty shards
2627 string key;
2628 for (auto& it : encoded_shards) {
2629 it.shard->dirty = false;
2630 it.shard->shard_info->bytes = it.bl.length();
2631 generate_extent_shard_key_and_apply(
2632 onode->key,
2633 it.shard->shard_info->offset,
2634 &key,
2635 [&](const string& final_key) {
2636 t->set(PREFIX_OBJ, final_key, it.bl);
2637 }
2638 );
2639 }
2640 }
2641 }
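// Sizing illustration for the resharding decisions above (not part of the
// original code; the config values here are examples only): with
// bluestore_extent_map_shard_max_size = 1200 and
// bluestore_extent_map_shard_min_size = 150, a dirty shard whose encoding
// grows to ~2000 bytes requests a reshard of its own range, while a
// non-trailing shard that shrinks below 150 bytes asks to be merged with
// whichever neighbour is smaller (or with the next shard if it is the
// first one).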
2642
2643 bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2644 {
2645 if (spanning_blob_map.empty())
2646 return 0;
2647 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2648 // bid is valid and available.
2649 if (bid >= 0)
2650 return bid;
2651 // Find next unused bid;
2652 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2653 const auto begin_bid = bid;
2654 do {
2655 if (!spanning_blob_map.count(bid))
2656 return bid;
2657 else {
2658 bid++;
2659 if (bid < 0) bid = 0;
2660 }
2661 } while (bid != begin_bid);
2662 auto cct = onode->c->store->cct; // used by dout
2663 _dump_onode<0>(cct, *onode);
2664 ceph_abort_msg("no available blob id");
2665 }
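// Example of the id selection above (illustrative): with spanning ids
// {0, 1, 7} the next id is rbegin()->first + 1 == 8. Only if that increment
// overflows bid_t into a negative value do we fall back to probing from a
// random starting point until an unused, non-negative id is found, aborting
// after a full wrap-around with no free id.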
2666
2667 void BlueStore::ExtentMap::reshard(
2668 KeyValueDB *db,
2669 KeyValueDB::Transaction t)
2670 {
2671 auto cct = onode->c->store->cct; // used by dout
2672
2673 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2674 << needs_reshard_end << ")" << std::dec
2675 << " of " << onode->onode.extent_map_shards.size()
2676 << " shards on " << onode->oid << dendl;
2677 for (auto& p : spanning_blob_map) {
2678 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2679 << dendl;
2680 }
2681 // determine shard index range
2682 unsigned si_begin = 0, si_end = 0;
2683 if (!shards.empty()) {
2684 while (si_begin + 1 < shards.size() &&
2685 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2686 ++si_begin;
2687 }
2688 needs_reshard_begin = shards[si_begin].shard_info->offset;
2689 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2690 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2691 needs_reshard_end = shards[si_end].shard_info->offset;
2692 break;
2693 }
2694 }
2695 if (si_end == shards.size()) {
2696 needs_reshard_end = OBJECT_MAX_SIZE;
2697 }
2698 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2699 << " over 0x[" << std::hex << needs_reshard_begin << ","
2700 << needs_reshard_end << ")" << std::dec << dendl;
2701 }
2702
2703 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2704
2705 // we may need to fault in a larger interval later; we must have all
2706 // referring extents for spanning blobs loaded in order to have
2707 // accurate use_tracker values.
2708 uint32_t spanning_scan_begin = needs_reshard_begin;
2709 uint32_t spanning_scan_end = needs_reshard_end;
2710
2711 // remove old keys
2712 string key;
2713 for (unsigned i = si_begin; i < si_end; ++i) {
2714 generate_extent_shard_key_and_apply(
2715 onode->key, shards[i].shard_info->offset, &key,
2716 [&](const string& final_key) {
2717 t->rmkey(PREFIX_OBJ, final_key);
2718 }
2719 );
2720 }
2721
2722 // calculate average extent size
2723 unsigned bytes = 0;
2724 unsigned extents = 0;
2725 if (onode->onode.extent_map_shards.empty()) {
2726 bytes = inline_bl.length();
2727 extents = extent_map.size();
2728 } else {
2729 for (unsigned i = si_begin; i < si_end; ++i) {
2730 bytes += shards[i].shard_info->bytes;
2731 extents += shards[i].extents;
2732 }
2733 }
2734 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2735 unsigned slop = target *
2736 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2737 unsigned extent_avg = bytes / std::max(1u, extents);
2738 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2739 << ", slop " << slop << dendl;
2740
2741 // reshard
2742 unsigned estimate = 0;
2743 unsigned offset = needs_reshard_begin;
2744 vector<bluestore_onode_t::shard_info> new_shard_info;
2745 unsigned max_blob_end = 0;
2746 Extent dummy(needs_reshard_begin);
2747 for (auto e = extent_map.lower_bound(dummy);
2748 e != extent_map.end();
2749 ++e) {
2750 if (e->logical_offset >= needs_reshard_end) {
2751 break;
2752 }
2753 dout(30) << " extent " << *e << dendl;
2754
2755 // disfavor shard boundaries that span a blob
2756 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2757 if (estimate &&
2758 estimate + extent_avg > target + (would_span ? slop : 0)) {
2759 // new shard
2760 if (offset == needs_reshard_begin) {
2761 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2762 new_shard_info.back().offset = offset;
2763 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2764 << std::dec << dendl;
2765 }
2766 offset = e->logical_offset;
2767 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2768 new_shard_info.back().offset = offset;
2769 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2770 << std::dec << dendl;
2771 estimate = 0;
2772 }
2773 estimate += extent_avg;
2774 unsigned bs = e->blob_start();
2775 if (bs < spanning_scan_begin) {
2776 spanning_scan_begin = bs;
2777 }
2778 uint32_t be = e->blob_end();
2779 if (be > max_blob_end) {
2780 max_blob_end = be;
2781 }
2782 if (be > spanning_scan_end) {
2783 spanning_scan_end = be;
2784 }
2785 }
2786 if (new_shard_info.empty() && (si_begin > 0 ||
2787 si_end < shards.size())) {
2788 // we resharded a partial range; we must produce at least one output
2789 // shard
2790 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2791 new_shard_info.back().offset = needs_reshard_begin;
2792 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2793 << std::dec << " (singleton degenerate case)" << dendl;
2794 }
2795
2796 auto& sv = onode->onode.extent_map_shards;
2797 dout(20) << __func__ << " new " << new_shard_info << dendl;
2798 dout(20) << __func__ << " old " << sv << dendl;
2799 if (sv.empty()) {
2800 // no old shards to keep
2801 sv.swap(new_shard_info);
2802 init_shards(true, true);
2803 } else {
2804 // splice in new shards
2805 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2806 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2807 sv.insert(
2808 sv.begin() + si_begin,
2809 new_shard_info.begin(),
2810 new_shard_info.end());
2811 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2812 si_end = si_begin + new_shard_info.size();
2813
2814 ceph_assert(sv.size() == shards.size());
2815
2816 // note that we need to update every shard_info of shards here,
2817 // as sv might have been totally re-allocated above
2818 for (unsigned i = 0; i < shards.size(); i++) {
2819 shards[i].shard_info = &sv[i];
2820 }
2821
2822 // mark newly added shards as dirty
2823 for (unsigned i = si_begin; i < si_end; ++i) {
2824 shards[i].loaded = true;
2825 shards[i].dirty = true;
2826 }
2827 }
2828 dout(20) << __func__ << " fin " << sv << dendl;
2829 inline_bl.clear();
2830
2831 if (sv.empty()) {
2832 // no more shards; unspan all previously spanning blobs
2833 auto p = spanning_blob_map.begin();
2834 while (p != spanning_blob_map.end()) {
2835 p->second->id = -1;
2836 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2837 p = spanning_blob_map.erase(p);
2838 }
2839 } else {
2840 // identify new spanning blobs
2841 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2842 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2843 if (spanning_scan_begin < needs_reshard_begin) {
2844 fault_range(db, spanning_scan_begin,
2845 needs_reshard_begin - spanning_scan_begin);
2846 }
2847 if (spanning_scan_end > needs_reshard_end) {
2848 fault_range(db, needs_reshard_end,
2849 spanning_scan_end - needs_reshard_end);
2850 }
2851 auto sp = sv.begin() + si_begin;
2852 auto esp = sv.end();
2853 unsigned shard_start = sp->offset;
2854 unsigned shard_end;
2855 ++sp;
2856 if (sp == esp) {
2857 shard_end = OBJECT_MAX_SIZE;
2858 } else {
2859 shard_end = sp->offset;
2860 }
2861 Extent dummy(needs_reshard_begin);
2862
2863 bool was_too_many_blobs_check = false;
2864 auto too_many_blobs_threshold =
2865 g_conf()->bluestore_debug_too_many_blobs_threshold;
2866 auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
2867 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2868 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
2869
2870 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2871 if (e->logical_offset >= needs_reshard_end) {
2872 break;
2873 }
2874 dout(30) << " extent " << *e << dendl;
2875 while (e->logical_offset >= shard_end) {
2876 shard_start = shard_end;
2877 ceph_assert(sp != esp);
2878 ++sp;
2879 if (sp == esp) {
2880 shard_end = OBJECT_MAX_SIZE;
2881 } else {
2882 shard_end = sp->offset;
2883 }
2884 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2885 << " to 0x" << shard_end << std::dec << dendl;
2886 }
2887
2888 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2889 if (!e->blob->is_spanning()) {
2890 // We have two options: (1) split the blob into pieces at the
2891 // shard boundaries (and adjust extents accordingly), or (2)
2892 // mark it spanning. We prefer to cut the blob if we can. Note that
2893 // we may have to split it multiple times--potentially at every
2894 // shard boundary.
2895 bool must_span = false;
2896 BlobRef b = e->blob;
2897 if (b->can_split()) {
2898 uint32_t bstart = e->blob_start();
2899 uint32_t bend = e->blob_end();
2900 for (const auto& sh : shards) {
2901 if (bstart < sh.shard_info->offset &&
2902 bend > sh.shard_info->offset) {
2903 uint32_t blob_offset = sh.shard_info->offset - bstart;
2904 if (b->can_split_at(blob_offset)) {
2905 dout(20) << __func__ << " splitting blob, bstart 0x"
2906 << std::hex << bstart << " blob_offset 0x"
2907 << blob_offset << std::dec << " " << *b << dendl;
2908 b = split_blob(b, blob_offset, sh.shard_info->offset);
2909 // switch b to the new right-hand side, in case it
2910 // *also* has to get split.
2911 bstart += blob_offset;
2912 onode->c->store->logger->inc(l_bluestore_blob_split);
2913 } else {
2914 must_span = true;
2915 break;
2916 }
2917 }
2918 }
2919 } else {
2920 must_span = true;
2921 }
2922 if (must_span) {
2923 auto bid = allocate_spanning_blob_id();
2924 b->id = bid;
2925 spanning_blob_map[b->id] = b;
2926 dout(20) << __func__ << " adding spanning " << *b << dendl;
2927 if (!was_too_many_blobs_check &&
2928 too_many_blobs_threshold &&
2929 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2930
2931 was_too_many_blobs_check = true;
2932 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2933 if (dumped_onodes[i].first == onode->oid) {
2934 oid_slot = &dumped_onodes[i];
2935 break;
2936 }
2937 if (!oldest_slot || (oldest_slot &&
2938 dumped_onodes[i].second < oldest_slot->second)) {
2939 oldest_slot = &dumped_onodes[i];
2940 }
2941 }
2942 }
2943 }
2944 }
2945 } else {
2946 if (e->blob->is_spanning()) {
2947 spanning_blob_map.erase(e->blob->id);
2948 e->blob->id = -1;
2949 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2950 }
2951 }
2952 }
2953 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
2954 (oid_slot &&
2955 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
2956 if (do_dump) {
2957 dout(0) << __func__
2958 << " spanning blob count exceeds threshold, "
2959 << spanning_blob_map.size() << " spanning blobs"
2960 << dendl;
2961 _dump_onode<0>(cct, *onode);
2962 if (oid_slot) {
2963 oid_slot->second = mono_clock::now();
2964 } else {
2965 ceph_assert(oldest_slot);
2966 oldest_slot->first = onode->oid;
2967 oldest_slot->second = mono_clock::now();
2968 }
2969 }
2970 }
2971
2972 clear_needs_reshard();
2973 }
2974
2975 bool BlueStore::ExtentMap::encode_some(
2976 uint32_t offset,
2977 uint32_t length,
2978 bufferlist& bl,
2979 unsigned *pn)
2980 {
2981 Extent dummy(offset);
2982 auto start = extent_map.lower_bound(dummy);
2983 uint32_t end = offset + length;
2984
2985 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2986 // serialization only. Hence there is no specific
2987 // handling at ExtentMap level.
2988
2989 unsigned n = 0;
2990 size_t bound = 0;
2991 bool must_reshard = false;
2992 for (auto p = start;
2993 p != extent_map.end() && p->logical_offset < end;
2994 ++p, ++n) {
2995 ceph_assert(p->logical_offset >= offset);
2996 p->blob->last_encoded_id = -1;
2997 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2998 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2999 << std::dec << " hit new spanning blob " << *p << dendl;
3000 request_reshard(p->blob_start(), p->blob_end());
3001 must_reshard = true;
3002 }
3003 if (!must_reshard) {
3004 denc_varint(0, bound); // blobid
3005 denc_varint(0, bound); // logical_offset
3006 denc_varint(0, bound); // len
3007 denc_varint(0, bound); // blob_offset
3008
3009 p->blob->bound_encode(
3010 bound,
3011 struct_v,
3012 p->blob->shared_blob->get_sbid(),
3013 false);
3014 }
3015 }
3016 if (must_reshard) {
3017 return true;
3018 }
3019
3020 denc(struct_v, bound);
3021 denc_varint(0, bound); // number of extents
3022
3023 {
3024 auto app = bl.get_contiguous_appender(bound);
3025 denc(struct_v, app);
3026 denc_varint(n, app);
3027 if (pn) {
3028 *pn = n;
3029 }
3030
3031 n = 0;
3032 uint64_t pos = 0;
3033 uint64_t prev_len = 0;
3034 for (auto p = start;
3035 p != extent_map.end() && p->logical_offset < end;
3036 ++p, ++n) {
3037 unsigned blobid;
3038 bool include_blob = false;
3039 if (p->blob->is_spanning()) {
3040 blobid = p->blob->id << BLOBID_SHIFT_BITS;
3041 blobid |= BLOBID_FLAG_SPANNING;
3042 } else if (p->blob->last_encoded_id < 0) {
3043 p->blob->last_encoded_id = n + 1; // so it is always non-zero
3044 include_blob = true;
3045 blobid = 0; // the decoder will infer the id from n
3046 } else {
3047 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
3048 }
3049 if (p->logical_offset == pos) {
3050 blobid |= BLOBID_FLAG_CONTIGUOUS;
3051 }
3052 if (p->blob_offset == 0) {
3053 blobid |= BLOBID_FLAG_ZEROOFFSET;
3054 }
3055 if (p->length == prev_len) {
3056 blobid |= BLOBID_FLAG_SAMELENGTH;
3057 } else {
3058 prev_len = p->length;
3059 }
3060 denc_varint(blobid, app);
3061 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3062 denc_varint_lowz(p->logical_offset - pos, app);
3063 }
3064 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3065 denc_varint_lowz(p->blob_offset, app);
3066 }
3067 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3068 denc_varint_lowz(p->length, app);
3069 }
3070 pos = p->logical_end();
3071 if (include_blob) {
3072 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
3073 }
3074 }
3075 }
3076 /*derr << __func__ << bl << dendl;
3077 derr << __func__ << ":";
3078 bl.hexdump(*_dout);
3079 *_dout << dendl;
3080 */
3081 return false;
3082 }
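// Encoding sketch (illustrative, not authoritative): each extent is
// introduced by a varint 'blobid' whose low bits are flags and whose high
// bits identify the blob:
//
//   blobid = (id << BLOBID_SHIFT_BITS)
//          | BLOBID_FLAG_SPANNING    // id refers to spanning_blob_map
//          | BLOBID_FLAG_CONTIGUOUS  // logical_offset == previous logical_end
//          | BLOBID_FLAG_ZEROOFFSET  // blob_offset == 0, not encoded
//          | BLOBID_FLAG_SAMELENGTH; // length == previous length, not encoded
//
// For a non-spanning blob seen for the first time the id field is 0 and the
// blob body itself is appended; later references to the same blob reuse
// last_encoded_id instead of re-encoding the body.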
3083
3084 unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
3085 {
3086 /*
3087 derr << __func__ << ":";
3088 bl.hexdump(*_dout);
3089 *_dout << dendl;
3090 */
3091
3092 ceph_assert(bl.get_num_buffers() <= 1);
3093 auto p = bl.front().begin_deep();
3094 __u8 struct_v;
3095 denc(struct_v, p);
3096 // Version 2 differs from v1 in blob's ref_map
3097 // serialization only. Hence there is no specific
3098 // handling at ExtentMap level below.
3099 ceph_assert(struct_v == 1 || struct_v == 2);
3100
3101 uint32_t num;
3102 denc_varint(num, p);
3103 vector<BlobRef> blobs(num);
3104 uint64_t pos = 0;
3105 uint64_t prev_len = 0;
3106 unsigned n = 0;
3107
3108 while (!p.end()) {
3109 Extent *le = new Extent();
3110 uint64_t blobid;
3111 denc_varint(blobid, p);
3112 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3113 uint64_t gap;
3114 denc_varint_lowz(gap, p);
3115 pos += gap;
3116 }
3117 le->logical_offset = pos;
3118 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3119 denc_varint_lowz(le->blob_offset, p);
3120 } else {
3121 le->blob_offset = 0;
3122 }
3123 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3124 denc_varint_lowz(prev_len, p);
3125 }
3126 le->length = prev_len;
3127
3128 if (blobid & BLOBID_FLAG_SPANNING) {
3129 dout(30) << __func__ << " getting spanning blob "
3130 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
3131 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
3132 } else {
3133 blobid >>= BLOBID_SHIFT_BITS;
3134 if (blobid) {
3135 le->assign_blob(blobs[blobid - 1]);
3136 ceph_assert(le->blob);
3137 } else {
3138 Blob *b = new Blob();
3139 uint64_t sbid = 0;
3140 b->decode(onode->c, p, struct_v, &sbid, false);
3141 blobs[n] = b;
3142 onode->c->open_shared_blob(sbid, b);
3143 le->assign_blob(b);
3144 }
3145 // we build ref_map dynamically for non-spanning blobs
3146 le->blob->get_ref(
3147 onode->c,
3148 le->blob_offset,
3149 le->length);
3150 }
3151 pos += prev_len;
3152 ++n;
3153 extent_map.insert(*le);
3154 }
3155
3156 ceph_assert(n == num);
3157 return num;
3158 }
3159
3160 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
3161 {
3162 // Version 2 differs from v1 in blob's ref_map
3163 // serialization only. Hence there is no specific
3164 // handling at ExtentMap level.
3165 __u8 struct_v = 2;
3166
3167 denc(struct_v, p);
3168 denc_varint((uint32_t)0, p);
3169 size_t key_size = 0;
3170 denc_varint((uint32_t)0, key_size);
3171 p += spanning_blob_map.size() * key_size;
3172 for (const auto& i : spanning_blob_map) {
3173 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3174 }
3175 }
3176
3177 void BlueStore::ExtentMap::encode_spanning_blobs(
3178 bufferlist::contiguous_appender& p)
3179 {
3180 // Version 2 differs from v1 in blob's ref_map
3181 // serialization only. Hence there is no specific
3182 // handling at ExtentMap level.
3183 __u8 struct_v = 2;
3184
3185 denc(struct_v, p);
3186 denc_varint(spanning_blob_map.size(), p);
3187 for (auto& i : spanning_blob_map) {
3188 denc_varint(i.second->id, p);
3189 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3190 }
3191 }
3192
3193 void BlueStore::ExtentMap::decode_spanning_blobs(
3194 bufferptr::const_iterator& p)
3195 {
3196 __u8 struct_v;
3197 denc(struct_v, p);
3198 // Version 2 differs from v1 in blob's ref_map
3199 // serialization only. Hence there is no specific
3200 // handling at ExtentMap level.
3201 ceph_assert(struct_v == 1 || struct_v == 2);
3202
3203 unsigned n;
3204 denc_varint(n, p);
3205 while (n--) {
3206 BlobRef b(new Blob());
3207 denc_varint(b->id, p);
3208 spanning_blob_map[b->id] = b;
3209 uint64_t sbid = 0;
3210 b->decode(onode->c, p, struct_v, &sbid, true);
3211 onode->c->open_shared_blob(sbid, b);
3212 }
3213 }
3214
3215 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
3216 {
3217 shards.resize(onode->onode.extent_map_shards.size());
3218 unsigned i = 0;
3219 for (auto &s : onode->onode.extent_map_shards) {
3220 shards[i].shard_info = &s;
3221 shards[i].loaded = loaded;
3222 shards[i].dirty = dirty;
3223 ++i;
3224 }
3225 }
3226
3227 void BlueStore::ExtentMap::fault_range(
3228 KeyValueDB *db,
3229 uint32_t offset,
3230 uint32_t length)
3231 {
3232 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3233 << std::dec << dendl;
3234 auto start = seek_shard(offset);
3235 auto last = seek_shard(offset + length);
3236
3237 if (start < 0)
3238 return;
3239
3240 ceph_assert(last >= start);
3241 string key;
3242 while (start <= last) {
3243 ceph_assert((size_t)start < shards.size());
3244 auto p = &shards[start];
3245 if (!p->loaded) {
3246 dout(30) << __func__ << " opening shard 0x" << std::hex
3247 << p->shard_info->offset << std::dec << dendl;
3248 bufferlist v;
3249 generate_extent_shard_key_and_apply(
3250 onode->key, p->shard_info->offset, &key,
3251 [&](const string& final_key) {
3252 int r = db->get(PREFIX_OBJ, final_key, &v);
3253 if (r < 0) {
3254 derr << __func__ << " missing shard 0x" << std::hex
3255 << p->shard_info->offset << std::dec << " for " << onode->oid
3256 << dendl;
3257 ceph_assert(r >= 0);
3258 }
3259 }
3260 );
3261 p->extents = decode_some(v);
3262 p->loaded = true;
3263 dout(20) << __func__ << " open shard 0x" << std::hex
3264 << p->shard_info->offset
3265 << " for range 0x" << offset << "~" << length << std::dec
3266 << " (" << v.length() << " bytes)" << dendl;
3267 ceph_assert(p->dirty == false);
3268 ceph_assert(v.length() == p->shard_info->bytes);
3269 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3270 } else {
3271 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3272 }
3273 ++start;
3274 }
3275 }
3276
3277 void BlueStore::ExtentMap::dirty_range(
3278 uint32_t offset,
3279 uint32_t length)
3280 {
3281 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3282 << std::dec << dendl;
3283 if (shards.empty()) {
3284 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3285 inline_bl.clear();
3286 return;
3287 }
3288 auto start = seek_shard(offset);
3289 if (length == 0) {
3290 length = 1;
3291 }
3292 auto last = seek_shard(offset + length - 1);
3293 if (start < 0)
3294 return;
3295
3296 ceph_assert(last >= start);
3297 while (start <= last) {
3298 ceph_assert((size_t)start < shards.size());
3299 auto p = &shards[start];
3300 if (!p->loaded) {
3301 derr << __func__ << " on write 0x" << std::hex << offset
3302 << "~" << length << " shard 0x" << p->shard_info->offset
3303 << std::dec << " is not loaded, can't mark dirty" << dendl;
3304 ceph_abort_msg("can't mark unloaded shard dirty");
3305 }
3306 if (!p->dirty) {
3307 dout(20) << __func__ << " mark shard 0x" << std::hex
3308 << p->shard_info->offset << std::dec << " dirty" << dendl;
3309 p->dirty = true;
3310 }
3311 ++start;
3312 }
3313 }
3314
3315 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3316 uint64_t offset)
3317 {
3318 Extent dummy(offset);
3319 return extent_map.find(dummy);
3320 }
3321
3322 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3323 uint64_t offset)
3324 {
3325 Extent dummy(offset);
3326 auto fp = extent_map.lower_bound(dummy);
3327 if (fp != extent_map.begin()) {
3328 --fp;
3329 if (fp->logical_end() <= offset) {
3330 ++fp;
3331 }
3332 }
3333 return fp;
3334 }
3335
3336 BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3337 uint64_t offset) const
3338 {
3339 Extent dummy(offset);
3340 auto fp = extent_map.lower_bound(dummy);
3341 if (fp != extent_map.begin()) {
3342 --fp;
3343 if (fp->logical_end() <= offset) {
3344 ++fp;
3345 }
3346 }
3347 return fp;
3348 }
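// Behaviour sketch for seek_lextent (illustrative): it returns the first
// extent whose logical_end() is greater than 'offset', i.e. the extent
// containing 'offset' if one exists, otherwise the first extent that starts
// after it, otherwise end(). E.g. with extents [0x0~0x1000) and
// [0x3000~0x1000), seek_lextent(0x800) returns the first extent and
// seek_lextent(0x2000) returns the second.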
3349
3350 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3351 {
3352 auto fp = seek_lextent(offset);
3353 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3354 return false;
3355 }
3356 return true;
3357 }
3358
3359 int BlueStore::ExtentMap::compress_extent_map(
3360 uint64_t offset,
3361 uint64_t length)
3362 {
3363 if (extent_map.empty())
3364 return 0;
3365 int removed = 0;
3366 auto p = seek_lextent(offset);
3367 if (p != extent_map.begin()) {
3368 --p; // start to the left of offset
3369 }
3370 // the caller should have just written to this region
3371 ceph_assert(p != extent_map.end());
3372
3373 // identify the *next* shard
3374 auto pshard = shards.begin();
3375 while (pshard != shards.end() &&
3376 p->logical_offset >= pshard->shard_info->offset) {
3377 ++pshard;
3378 }
3379 uint64_t shard_end;
3380 if (pshard != shards.end()) {
3381 shard_end = pshard->shard_info->offset;
3382 } else {
3383 shard_end = OBJECT_MAX_SIZE;
3384 }
3385
3386 auto n = p;
3387 for (++n; n != extent_map.end(); p = n++) {
3388 if (n->logical_offset > offset + length) {
3389 break; // stop after end
3390 }
3391 while (n != extent_map.end() &&
3392 p->logical_end() == n->logical_offset &&
3393 p->blob == n->blob &&
3394 p->blob_offset + p->length == n->blob_offset &&
3395 n->logical_offset < shard_end) {
3396 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3397 << " next shard 0x" << shard_end << std::dec
3398 << " merging " << *p << " and " << *n << dendl;
3399 p->length += n->length;
3400 rm(n++);
3401 ++removed;
3402 }
3403 if (n == extent_map.end()) {
3404 break;
3405 }
3406 if (n->logical_offset >= shard_end) {
3407 ceph_assert(pshard != shards.end());
3408 ++pshard;
3409 if (pshard != shards.end()) {
3410 shard_end = pshard->shard_info->offset;
3411 } else {
3412 shard_end = OBJECT_MAX_SIZE;
3413 }
3414 }
3415 }
3416 if (removed) {
3417 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3418 }
3419 return removed;
3420 }
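// Merge example (illustrative): two adjacent extents that map contiguous
// logical space to contiguous space in the same blob, e.g.
//   0x0~0x1000    -> blob A @ 0x0
//   0x1000~0x800  -> blob A @ 0x1000
// are collapsed into a single 0x0~0x1800 extent, provided the pair does not
// straddle a shard boundary (n->logical_offset < shard_end).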
3421
3422 void BlueStore::ExtentMap::punch_hole(
3423 CollectionRef &c,
3424 uint64_t offset,
3425 uint64_t length,
3426 old_extent_map_t *old_extents)
3427 {
3428 auto p = seek_lextent(offset);
3429 uint64_t end = offset + length;
3430 while (p != extent_map.end()) {
3431 if (p->logical_offset >= end) {
3432 break;
3433 }
3434 if (p->logical_offset < offset) {
3435 if (p->logical_end() > end) {
3436 // split and deref middle
3437 uint64_t front = offset - p->logical_offset;
3438 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3439 length, p->blob);
3440 old_extents->push_back(*oe);
3441 add(end,
3442 p->blob_offset + front + length,
3443 p->length - front - length,
3444 p->blob);
3445 p->length = front;
3446 break;
3447 } else {
3448 // deref tail
3449 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
3450 uint64_t keep = offset - p->logical_offset;
3451 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3452 p->length - keep, p->blob);
3453 old_extents->push_back(*oe);
3454 p->length = keep;
3455 ++p;
3456 continue;
3457 }
3458 }
3459 if (p->logical_offset + p->length <= end) {
3460 // deref whole lextent
3461 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3462 p->length, p->blob);
3463 old_extents->push_back(*oe);
3464 rm(p++);
3465 continue;
3466 }
3467 // deref head
3468 uint64_t keep = p->logical_end() - end;
3469 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3470 p->length - keep, p->blob);
3471 old_extents->push_back(*oe);
3472
3473 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3474 rm(p);
3475 break;
3476 }
3477 }
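// Case overview for punch_hole (illustrative): for each lextent overlapping
// [offset, offset+length) one of four things happens:
//  - hole entirely inside the extent: the extent is split, the middle part
//    is dereferenced and queued on old_extents, and a new tail extent is
//    re-added after the hole;
//  - hole covers only the tail: the tail is dereferenced and the extent is
//    shortened in place;
//  - hole covers the whole extent: the extent is dereferenced and removed;
//  - hole covers only the head: the head is dereferenced, the extent is
//    removed and its surviving tail is re-added starting at 'end'.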
3478
3479 BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3480 CollectionRef &c,
3481 uint64_t logical_offset,
3482 uint64_t blob_offset, uint64_t length, BlobRef b,
3483 old_extent_map_t *old_extents)
3484 {
3485 // We need a completely initialized Blob to increment its ref counters.
3486 ceph_assert(b->get_blob().get_logical_length() != 0);
3487
3488 // Do get_ref prior to punch_hole to prevent putting a reused blob into the
3489 // old_extents list if we overwrite the blob totally.
3490 // This might happen during WAL overwrite.
3491 b->get_ref(onode->c, blob_offset, length);
3492
3493 if (old_extents) {
3494 punch_hole(c, logical_offset, length, old_extents);
3495 }
3496
3497 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3498 extent_map.insert(*le);
3499 if (spans_shard(logical_offset, length)) {
3500 request_reshard(logical_offset, logical_offset + length);
3501 }
3502 return le;
3503 }
3504
3505 BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3506 BlobRef lb,
3507 uint32_t blob_offset,
3508 uint32_t pos)
3509 {
3510 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3511 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3512 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3513 << dendl;
3514 BlobRef rb = onode->c->new_blob();
3515 lb->split(onode->c, blob_offset, rb.get());
3516
3517 for (auto ep = seek_lextent(pos);
3518 ep != extent_map.end() && ep->logical_offset < end_pos;
3519 ++ep) {
3520 if (ep->blob != lb) {
3521 continue;
3522 }
3523 if (ep->logical_offset < pos) {
3524 // split extent
3525 size_t left = pos - ep->logical_offset;
3526 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3527 extent_map.insert(*ne);
3528 ep->length = left;
3529 dout(30) << __func__ << " split " << *ep << dendl;
3530 dout(30) << __func__ << " to " << *ne << dendl;
3531 } else {
3532 // switch blob
3533 ceph_assert(ep->blob_offset >= blob_offset);
3534
3535 ep->blob = rb;
3536 ep->blob_offset -= blob_offset;
3537 dout(30) << __func__ << " adjusted " << *ep << dendl;
3538 }
3539 }
3540 return rb;
3541 }
3542
3543 // Onode
3544
3545 #undef dout_prefix
3546 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3547
3548 //
3549 // A tricky thing about the Onode's ref counter is that we do an additional
3550 // increment when a newly pinned instance is detected, and -1 on unpin.
3551 // This prevents a conflict with a delete call (when nref == 0).
3552 // The latter might happen while a thread is still in the unpin() function
3553 // (e.g. waiting for lock acquisition) after nref has already been
3554 // decremented, and another 'putting' thread then releases the instance.
3555 //
3556 void BlueStore::Onode::get() {
3557 if (++nref >= 2 && !pinned) {
3558 OnodeCacheShard* ocs = c->get_onode_cache();
3559 std::lock_guard l(ocs->lock);
3560 bool was_pinned = pinned;
3561 pinned = nref >= 2;
3562 // additional increment for newly pinned instance
3563 bool r = !was_pinned && pinned;
3564 if (r) {
3565 ++nref;
3566 }
3567 if (cached && r) {
3568 ocs->_pin(this);
3569 }
3570 }
3571 }
3572 void BlueStore::Onode::put() {
3573 int n = --nref;
3574 if (n == 2) {
3575 OnodeCacheShard* ocs = c->get_onode_cache();
3576 std::lock_guard l(ocs->lock);
3577 bool need_unpin = pinned;
3578 pinned = pinned && nref > 2; // intentionally use > not >= as we have
3579 // +1 due to pinned state
3580 need_unpin = need_unpin && !pinned;
3581 if (cached && need_unpin) {
3582 if (exists) {
3583 ocs->_unpin(this);
3584 } else {
3585 ocs->_unpin_and_rm(this);
3586 // remove will also decrement nref and delete Onode
3587 c->onode_map._remove(oid);
3588 }
3589 }
3590 // additional decrement for newly unpinned instance
3591 // should be the last action since Onode can be released
3592 // at any point after this decrement
3593 if (need_unpin) {
3594 n = --nref;
3595 }
3596 }
3597 if (n == 0) {
3598 delete this;
3599 }
3600 }
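// Reference counting sketch (illustrative): a lookup that pins a cached
// onode moves nref 1 -> 2 and then to 3 because of the extra "pinned"
// reference, so a cached-and-pinned onode held by one user has nref == 3.
// When that user calls put(), nref drops to 2, the onode is unpinned, the
// extra reference is dropped (nref == 1, the cache's own OnodeRef), and only
// when the cache finally releases it does nref reach 0 and the Onode get
// deleted.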
3601
3602 BlueStore::Onode* BlueStore::Onode::decode(
3603 CollectionRef c,
3604 const ghobject_t& oid,
3605 const string& key,
3606 const bufferlist& v)
3607 {
3608 Onode* on = new Onode(c.get(), oid, key);
3609 on->exists = true;
3610 auto p = v.front().begin_deep();
3611 on->onode.decode(p);
3612 for (auto& i : on->onode.attrs) {
3613 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3614 }
3615
3616 // initialize extent_map
3617 on->extent_map.decode_spanning_blobs(p);
3618 if (on->onode.extent_map_shards.empty()) {
3619 denc(on->extent_map.inline_bl, p);
3620 on->extent_map.decode_some(on->extent_map.inline_bl);
3621 on->extent_map.inline_bl.reassign_to_mempool(
3622 mempool::mempool_bluestore_cache_data);
3623 }
3624 else {
3625 on->extent_map.init_shards(false, false);
3626 }
3627 return on;
3628 }
3629
3630 void BlueStore::Onode::flush()
3631 {
3632 if (flushing_count.load()) {
3633 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
3634 waiting_count++;
3635 std::unique_lock l(flush_lock);
3636 while (flushing_count.load()) {
3637 flush_cond.wait(l);
3638 }
3639 waiting_count--;
3640 }
3641 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3642 }
3643
3644 void BlueStore::Onode::dump(Formatter* f) const
3645 {
3646 onode.dump(f);
3647 extent_map.dump(f);
3648 }
3649
3650
3651 const string& BlueStore::Onode::get_omap_prefix()
3652 {
3653 if (onode.is_pgmeta_omap()) {
3654 return PREFIX_PGMETA_OMAP;
3655 }
3656 if (onode.is_perpool_omap()) {
3657 return PREFIX_PERPOOL_OMAP;
3658 }
3659 return PREFIX_OMAP;
3660 }
3661
3662 // '-' < '.' < '~'
3663
3664 void BlueStore::Onode::get_omap_header(string *out)
3665 {
3666 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3667 _key_encode_u64(c->pool(), out);
3668 }
3669 _key_encode_u64(onode.nid, out);
3670 out->push_back('-');
3671 }
3672
3673 void BlueStore::Onode::get_omap_key(const string& key, string *out)
3674 {
3675 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3676 _key_encode_u64(c->pool(), out);
3677 }
3678 _key_encode_u64(onode.nid, out);
3679 out->push_back('.');
3680 out->append(key);
3681 }
3682
3683 void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3684 {
3685 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3686 _key_encode_u64(c->pool(), out);
3687 }
3688 _key_encode_u64(onode.nid, out);
3689 out->append(old.c_str() + out->length(), old.size() - out->length());
3690 }
3691
3692 void BlueStore::Onode::get_omap_tail(string *out)
3693 {
3694 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3695 _key_encode_u64(c->pool(), out);
3696 }
3697 _key_encode_u64(onode.nid, out);
3698 out->push_back('~');
3699 }
3700
3701 void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3702 {
3703 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3704 *user_key = key.substr(sizeof(uint64_t)*2 + 1);
3705 } else {
3706 *user_key = key.substr(sizeof(uint64_t) + 1);
3707 }
3708 }
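// Key layout sketch (illustrative): for a per-pool (non-pgmeta) onode the
// omap keys produced above look like
//   header:  <pool:u64><nid:u64> '-'
//   key:     <pool:u64><nid:u64> '.' <user key>
//   tail:    <pool:u64><nid:u64> '~'
// so that, with '-' < '.' < '~', a range scan from header to tail covers
// exactly this object's omap entries; legacy and pgmeta objects omit the
// pool prefix.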
3709
3710
3711 // =======================================================
3712 // WriteContext
3713
3714 /// Checks for writes to the same pextent within a blob
3715 bool BlueStore::WriteContext::has_conflict(
3716 BlobRef b,
3717 uint64_t loffs,
3718 uint64_t loffs_end,
3719 uint64_t min_alloc_size)
3720 {
3721 ceph_assert((loffs % min_alloc_size) == 0);
3722 ceph_assert((loffs_end % min_alloc_size) == 0);
3723 for (auto w : writes) {
3724 if (b == w.b) {
3725 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3726 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
3727 if ((loffs <= loffs2 && loffs_end > loffs2) ||
3728 (loffs >= loffs2 && loffs < loffs2_end)) {
3729 return true;
3730 }
3731 }
3732 }
3733 return false;
3734 }
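// Conflict example (illustrative): with min_alloc_size = 0x10000 and a
// pending write to blob B at logical_offset 0x12000, length 0x4000, the
// pending write occupies the aligned range [0x10000, 0x20000). A second
// write to B covering [0x10000, 0x20000) conflicts; one covering
// [0x20000, 0x30000) does not, because the aligned ranges do not overlap.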
3735
3736 // =======================================================
3737
3738 // DeferredBatch
3739 #undef dout_prefix
3740 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3741 #undef dout_context
3742 #define dout_context cct
3743
3744 void BlueStore::DeferredBatch::prepare_write(
3745 CephContext *cct,
3746 uint64_t seq, uint64_t offset, uint64_t length,
3747 bufferlist::const_iterator& blp)
3748 {
3749 _discard(cct, offset, length);
3750 auto i = iomap.insert(make_pair(offset, deferred_io()));
3751 ceph_assert(i.second); // this should be a new insertion
3752 i.first->second.seq = seq;
3753 blp.copy(length, i.first->second.bl);
3754 i.first->second.bl.reassign_to_mempool(
3755 mempool::mempool_bluestore_writing_deferred);
3756 dout(20) << __func__ << " seq " << seq
3757 << " 0x" << std::hex << offset << "~" << length
3758 << " crc " << i.first->second.bl.crc32c(-1)
3759 << std::dec << dendl;
3760 seq_bytes[seq] += length;
3761 #ifdef DEBUG_DEFERRED
3762 _audit(cct);
3763 #endif
3764 }
3765
3766 void BlueStore::DeferredBatch::_discard(
3767 CephContext *cct, uint64_t offset, uint64_t length)
3768 {
3769 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3770 << std::dec << dendl;
3771 auto p = iomap.lower_bound(offset);
3772 if (p != iomap.begin()) {
3773 --p;
3774 auto end = p->first + p->second.bl.length();
3775 if (end > offset) {
3776 bufferlist head;
3777 head.substr_of(p->second.bl, 0, offset - p->first);
3778 dout(20) << __func__ << " keep head " << p->second.seq
3779 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3780 << " -> 0x" << head.length() << std::dec << dendl;
3781 auto i = seq_bytes.find(p->second.seq);
3782 ceph_assert(i != seq_bytes.end());
3783 if (end > offset + length) {
3784 bufferlist tail;
3785 tail.substr_of(p->second.bl, offset + length - p->first,
3786 end - (offset + length));
3787 dout(20) << __func__ << " keep tail " << p->second.seq
3788 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3789 << " -> 0x" << tail.length() << std::dec << dendl;
3790 auto &n = iomap[offset + length];
3791 n.bl.swap(tail);
3792 n.seq = p->second.seq;
3793 i->second -= length;
3794 } else {
3795 i->second -= end - offset;
3796 }
3797 ceph_assert(i->second >= 0);
3798 p->second.bl.swap(head);
3799 }
3800 ++p;
3801 }
3802 while (p != iomap.end()) {
3803 if (p->first >= offset + length) {
3804 break;
3805 }
3806 auto i = seq_bytes.find(p->second.seq);
3807 ceph_assert(i != seq_bytes.end());
3808 auto end = p->first + p->second.bl.length();
3809 if (end > offset + length) {
3810 unsigned drop_front = offset + length - p->first;
3811 unsigned keep_tail = end - (offset + length);
3812 dout(20) << __func__ << " truncate front " << p->second.seq
3813 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3814 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3815 << " to 0x" << (offset + length) << "~" << keep_tail
3816 << std::dec << dendl;
3817 auto &s = iomap[offset + length];
3818 s.seq = p->second.seq;
3819 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3820 i->second -= drop_front;
3821 } else {
3822 dout(20) << __func__ << " drop " << p->second.seq
3823 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3824 << std::dec << dendl;
3825 i->second -= p->second.bl.length();
3826 }
3827 ceph_assert(i->second >= 0);
3828 p = iomap.erase(p);
3829 }
3830 }
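// Overlap example (illustrative): with a single queued io at offset 0x0 of
// length 0x3000, _discard(0x1000, 0x1000) keeps the head 0x0~0x1000 in
// place, re-inserts the surviving tail as a new entry at 0x2000~0x1000 with
// the same seq, and debits 0x1000 from that seq in seq_bytes.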
3831
3832 void BlueStore::DeferredBatch::_audit(CephContext *cct)
3833 {
3834 map<uint64_t,int> sb;
3835 for (auto p : seq_bytes) {
3836 sb[p.first] = 0; // make sure we have the same set of keys
3837 }
3838 uint64_t pos = 0;
3839 for (auto& p : iomap) {
3840 ceph_assert(p.first >= pos);
3841 sb[p.second.seq] += p.second.bl.length();
3842 pos = p.first + p.second.bl.length();
3843 }
3844 ceph_assert(sb == seq_bytes);
3845 }
3846
3847
3848 // Collection
3849
3850 #undef dout_prefix
3851 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3852
3853 BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
3854 : CollectionImpl(store_->cct, cid),
3855 store(store_),
3856 cache(bc),
3857 exists(true),
3858 onode_map(oc),
3859 commit_queue(nullptr)
3860 {
3861 }
3862
3863 bool BlueStore::Collection::flush_commit(Context *c)
3864 {
3865 return osr->flush_commit(c);
3866 }
3867
3868 void BlueStore::Collection::flush()
3869 {
3870 osr->flush();
3871 }
3872
3873 void BlueStore::Collection::flush_all_but_last()
3874 {
3875 osr->flush_all_but_last();
3876 }
3877
3878 void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3879 {
3880 ceph_assert(!b->shared_blob);
3881 const bluestore_blob_t& blob = b->get_blob();
3882 if (!blob.is_shared()) {
3883 b->shared_blob = new SharedBlob(this);
3884 return;
3885 }
3886
3887 b->shared_blob = shared_blob_set.lookup(sbid);
3888 if (b->shared_blob) {
3889 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3890 << std::dec << " had " << *b->shared_blob << dendl;
3891 } else {
3892 b->shared_blob = new SharedBlob(sbid, this);
3893 shared_blob_set.add(this, b->shared_blob.get());
3894 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3895 << std::dec << " opened " << *b->shared_blob
3896 << dendl;
3897 }
3898 }
3899
3900 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3901 {
3902 if (!sb->is_loaded()) {
3903
3904 bufferlist v;
3905 string key;
3906 auto sbid = sb->get_sbid();
3907 get_shared_blob_key(sbid, &key);
3908 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3909 if (r < 0) {
3910 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3911 << std::dec << " not found at key "
3912 << pretty_binary_string(key) << dendl;
3913 ceph_abort_msg("uh oh, missing shared_blob");
3914 }
3915
3916 sb->loaded = true;
3917 sb->persistent = new bluestore_shared_blob_t(sbid);
3918 auto p = v.cbegin();
3919 decode(*(sb->persistent), p);
3920 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3921 << std::dec << " loaded shared_blob " << *sb << dendl;
3922 }
3923 }
3924
3925 void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3926 {
3927 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
3928 ceph_assert(!b->shared_blob->is_loaded());
3929
3930 // update blob
3931 bluestore_blob_t& blob = b->dirty_blob();
3932 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
3933
3934 // update shared blob
3935 b->shared_blob->loaded = true;
3936 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3937 shared_blob_set.add(this, b->shared_blob.get());
3938 for (auto p : blob.get_extents()) {
3939 if (p.is_valid()) {
3940 b->shared_blob->get_ref(
3941 p.offset,
3942 p.length);
3943 }
3944 }
3945 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3946 }
3947
3948 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3949 {
3950 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3951 ceph_assert(sb->is_loaded());
3952
3953 uint64_t sbid = sb->get_sbid();
3954 shared_blob_set.remove(sb);
3955 sb->loaded = false;
3956 delete sb->persistent;
3957 sb->sbid_unloaded = 0;
3958 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3959 return sbid;
3960 }
3961
3962 BlueStore::OnodeRef BlueStore::Collection::get_onode(
3963 const ghobject_t& oid,
3964 bool create,
3965 bool is_createop)
3966 {
3967 ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
3968
3969 spg_t pgid;
3970 if (cid.is_pg(&pgid)) {
3971 if (!oid.match(cnode.bits, pgid.ps())) {
3972 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3973 << pgid << " bits " << cnode.bits << dendl;
3974 ceph_abort();
3975 }
3976 }
3977
3978 OnodeRef o = onode_map.lookup(oid);
3979 if (o)
3980 return o;
3981
3982 string key;
3983 get_object_key(store->cct, oid, &key);
3984
3985 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3986 << pretty_binary_string(key) << dendl;
3987
3988 bufferlist v;
3989 int r = -ENOENT;
3990 Onode *on;
3991 if (!is_createop) {
3992 r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3993 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3994 }
3995 if (v.length() == 0) {
3996 ceph_assert(r == -ENOENT);
3997 if (!store->cct->_conf->bluestore_debug_misc &&
3998 !create)
3999 return OnodeRef();
4000
4001 // new object, new onode
4002 on = new Onode(this, oid, key);
4003 } else {
4004 // loaded
4005 ceph_assert(r >= 0);
4006 on = Onode::decode(this, oid, key, v);
4007 }
4008 o.reset(on);
4009 return onode_map.add(oid, o);
4010 }
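// Typical caller pattern (a sketch; actual call sites vary):
//
//   std::shared_lock l(c->lock);            // read path: shared lock is enough
//   OnodeRef o = c->get_onode(oid, false);  // create=false: may return null
//   if (!o || !o->exists)
//     return -ENOENT;
//
// Write paths hold the collection lock exclusively (see the ceph_assert at the
// top of get_onode) and pass create=true, so they always get an in-memory
// Onode back even if nothing exists on disk yet.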
4011
4012 void BlueStore::Collection::split_cache(
4013 Collection *dest)
4014 {
4015 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
4016
4017 // lock (one or both) cache shards
4018 std::lock(cache->lock, dest->cache->lock);
4019 std::lock_guard l(cache->lock, std::adopt_lock);
4020 std::lock_guard l2(dest->cache->lock, std::adopt_lock);
4021
4022 int destbits = dest->cnode.bits;
4023 spg_t destpg;
4024 bool is_pg = dest->cid.is_pg(&destpg);
4025 ceph_assert(is_pg);
4026
4027 auto p = onode_map.onode_map.begin();
4028 while (p != onode_map.onode_map.end()) {
4029 OnodeRef o = p->second;
4030 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
4031 // onode does not belong to this child
4032 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
4033 << dendl;
4034 ++p;
4035 } else {
4036 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
4037 << dendl;
4038
4039 // ensuring that nref is always >= 2 and hence onode is pinned and
4040 // physically out of cache during the transition
4041 OnodeRef o_pin = o;
4042 ceph_assert(o->pinned);
4043
4044 p = onode_map.onode_map.erase(p);
4045 dest->onode_map.onode_map[o->oid] = o;
4046 if (o->cached) {
4047 get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
4048 }
4049 o->c = dest;
4050
4051 // move over shared blobs and buffers. cover shared blobs from
4052 // both extent map and spanning blob map (the full extent map
4053 // may not be faulted in)
4054 vector<SharedBlob*> sbvec;
4055 for (auto& e : o->extent_map.extent_map) {
4056 sbvec.push_back(e.blob->shared_blob.get());
4057 }
4058 for (auto& b : o->extent_map.spanning_blob_map) {
4059 sbvec.push_back(b.second->shared_blob.get());
4060 }
4061 for (auto sb : sbvec) {
4062 if (sb->coll == dest) {
4063 ldout(store->cct, 20) << __func__ << " already moved " << *sb
4064 << dendl;
4065 continue;
4066 }
4067 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
4068 if (sb->get_sbid()) {
4069 ldout(store->cct, 20) << __func__
4070 << " moving registration " << *sb << dendl;
4071 shared_blob_set.remove(sb);
4072 dest->shared_blob_set.add(dest, sb);
4073 }
4074 sb->coll = dest;
4075 if (dest->cache != cache) {
4076 for (auto& i : sb->bc.buffer_map) {
4077 if (!i.second->is_writing()) {
4078 ldout(store->cct, 20) << __func__ << " moving " << *i.second
4079 << dendl;
4080 dest->cache->_move(cache, i.second.get());
4081 }
4082 }
4083 }
4084 }
4085 }
4086 }
4087 dest->cache->_trim();
4088 }
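// Sketch of the routing decision above, with invented hash values: an onode
// moves to the child collection iff the low `destbits` bits of its hash match
// the child PG's seed (ghobject_t::match compares exactly those bits).
//
//   destbits = 4, destpg.pgid.ps() = 0xa    // child owns hashes ending in 1010
//   oid hash 0x3745a -> low 4 bits 0xa -> moved to dest
//   oid hash 0x3745c -> low 4 bits 0xc -> stays in the parent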
4089
4090 // =======================================================
4091
4092 // MempoolThread
4093
4094 #undef dout_prefix
4095 #define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
4096 #undef dout_context
4097 #define dout_context store->cct
4098
4099 void *BlueStore::MempoolThread::entry()
4100 {
4101 std::unique_lock l{lock};
4102
4103 uint32_t prev_config_change = store->config_changed.load();
4104 uint64_t base = store->osd_memory_base;
4105 double fragmentation = store->osd_memory_expected_fragmentation;
4106 uint64_t target = store->osd_memory_target;
4107 uint64_t min = store->osd_memory_cache_min;
4108 uint64_t max = min;
4109
4110 // When setting the maximum amount of memory to use for cache, first
4111 // assume some base amount of memory for the OSD and then fudge in
4112 // some overhead for fragmentation that scales with cache usage.
4113 uint64_t ltarget = (1.0 - fragmentation) * target;
4114 if (ltarget > base + min) {
4115 max = ltarget - base;
4116 }
4117
4118 binned_kv_cache = store->db->get_priority_cache();
4119 if (store->cache_autotune && binned_kv_cache != nullptr) {
4120 pcm = std::make_shared<PriorityCache::Manager>(
4121 store->cct, min, max, target, true);
4122 pcm->insert("kv", binned_kv_cache, true);
4123 pcm->insert("meta", meta_cache, true);
4124 pcm->insert("data", data_cache, true);
4125 }
4126
4127 utime_t next_balance = ceph_clock_now();
4128 utime_t next_resize = ceph_clock_now();
4129 utime_t next_deferred_force_submit = ceph_clock_now();
4130 utime_t alloc_stats_dump_clock = ceph_clock_now();
4131
4132 bool interval_stats_trim = false;
4133 while (!stop) {
4134 // Update pcm cache settings if related configuration was changed
4135 uint32_t cur_config_change = store->config_changed.load();
4136 if (cur_config_change != prev_config_change) {
4137 _update_cache_settings();
4138 prev_config_change = cur_config_change;
4139 }
4140
4141 // Before we trim, check and see if it's time to rebalance/resize.
4142 double autotune_interval = store->cache_autotune_interval;
4143 double resize_interval = store->osd_memory_cache_resize_interval;
4144 double max_defer_interval = store->max_defer_interval;
4145
4146 double alloc_stats_dump_interval =
4147 store->cct->_conf->bluestore_alloc_stats_dump_interval;
4148
4149 if (alloc_stats_dump_interval > 0 &&
4150 alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
4151 store->_record_allocation_stats();
4152 alloc_stats_dump_clock = ceph_clock_now();
4153 }
4154 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
4155 _adjust_cache_settings();
4156
4157 // Log events at 5 instead of 20 when balance happens.
4158 interval_stats_trim = true;
4159
4160 if (pcm != nullptr) {
4161 pcm->balance();
4162 }
4163
4164 next_balance = ceph_clock_now();
4165 next_balance += autotune_interval;
4166 }
4167 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
4168 if (ceph_using_tcmalloc() && pcm != nullptr) {
4169 pcm->tune_memory();
4170 }
4171 next_resize = ceph_clock_now();
4172 next_resize += resize_interval;
4173 }
4174
4175 if (max_defer_interval > 0 &&
4176 next_deferred_force_submit < ceph_clock_now()) {
4177 if (store->get_deferred_last_submitted() + max_defer_interval <
4178 ceph_clock_now()) {
4179 store->deferred_try_submit();
4180 }
4181 next_deferred_force_submit = ceph_clock_now();
4182 next_deferred_force_submit += max_defer_interval/3;
4183 }
4184
4185 // Now resize the shards
4186 _resize_shards(interval_stats_trim);
4187 interval_stats_trim = false;
4188
4189 store->_update_cache_logger();
4190 auto wait = ceph::make_timespan(
4191 store->cct->_conf->bluestore_cache_trim_interval);
4192 cond.wait_for(l, wait);
4193 }
4194 // do final dump
4195 store->_record_allocation_stats();
4196 stop = false;
4197 return NULL;
4198 }
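// Worked example of the cache ceiling computed at the top of entry()
// (numbers are invented for the sketch):
//
//   osd_memory_target                 = 4 GiB
//   osd_memory_base                   = 768 MiB
//   osd_memory_expected_fragmentation = 0.15
//   osd_memory_cache_min              = 128 MiB
//
//   ltarget = (1 - 0.15) * 4 GiB  ~= 3.4 GiB
//   ltarget > base + min (896 MiB), so max = ltarget - base ~= 2.65 GiB
//
// The PriorityCache::Manager is then asked to keep combined kv/meta/data cache
// usage between `min` and `max` while steering the whole process toward
// `target`.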
4199
4200 void BlueStore::MempoolThread::_adjust_cache_settings()
4201 {
4202 if (binned_kv_cache != nullptr) {
4203 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
4204 }
4205 meta_cache->set_cache_ratio(store->cache_meta_ratio);
4206 data_cache->set_cache_ratio(store->cache_data_ratio);
4207 }
4208
4209 void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
4210 {
4211 size_t onode_shards = store->onode_cache_shards.size();
4212 size_t buffer_shards = store->buffer_cache_shards.size();
4213 int64_t kv_used = store->db->get_cache_usage();
4214 int64_t meta_used = meta_cache->_get_used_bytes();
4215 int64_t data_used = data_cache->_get_used_bytes();
4216
4217 uint64_t cache_size = store->cache_size;
4218 int64_t kv_alloc =
4219 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
4220 int64_t meta_alloc =
4221 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
4222 int64_t data_alloc =
4223 static_cast<int64_t>(store->cache_data_ratio * cache_size);
4224
4225 if (pcm != nullptr && binned_kv_cache != nullptr) {
4226 cache_size = pcm->get_tuned_mem();
4227 kv_alloc = binned_kv_cache->get_committed_size();
4228 meta_alloc = meta_cache->get_committed_size();
4229 data_alloc = data_cache->get_committed_size();
4230 }
4231
4232 if (interval_stats) {
4233 dout(5) << __func__ << " cache_size: " << cache_size
4234 << " kv_alloc: " << kv_alloc
4235 << " kv_used: " << kv_used
4236 << " meta_alloc: " << meta_alloc
4237 << " meta_used: " << meta_used
4238 << " data_alloc: " << data_alloc
4239 << " data_used: " << data_used << dendl;
4240 } else {
4241 dout(20) << __func__ << " cache_size: " << cache_size
4242 << " kv_alloc: " << kv_alloc
4243 << " kv_used: " << kv_used
4244 << " meta_alloc: " << meta_alloc
4245 << " meta_used: " << meta_used
4246 << " data_alloc: " << data_alloc
4247 << " data_used: " << data_used << dendl;
4248 }
4249
4250 uint64_t max_shard_onodes = static_cast<uint64_t>(
4251 (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
4252 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
4253
4254 dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
4255 << " max_shard_buffer: " << max_shard_buffer << dendl;
4256
4257 for (auto i : store->onode_cache_shards) {
4258 i->set_max(max_shard_onodes);
4259 }
4260 for (auto i : store->buffer_cache_shards) {
4261 i->set_max(max_shard_buffer);
4262 }
4263 }
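// Example of the per-shard limits derived above (figures invented for the
// sketch):
//
//   meta_alloc = 1 GiB, onode_shards = 8, bytes_per_onode ~= 5 KiB
//     -> max_shard_onodes ~= (1 GiB / 8) / 5 KiB ~= 26k onodes per shard
//
//   data_alloc = 2 GiB, buffer_shards = 8
//     -> max_shard_buffer = 256 MiB of cached data per shard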
4264
4265 void BlueStore::MempoolThread::_update_cache_settings()
4266 {
4267 // Nothing to do if pcm is not used.
4268 if (pcm == nullptr) {
4269 return;
4270 }
4271
4272 uint64_t target = store->osd_memory_target;
4273 uint64_t base = store->osd_memory_base;
4274 uint64_t min = store->osd_memory_cache_min;
4275 uint64_t max = min;
4276 double fragmentation = store->osd_memory_expected_fragmentation;
4277
4278 uint64_t ltarget = (1.0 - fragmentation) * target;
4279 if (ltarget > base + min) {
4280 max = ltarget - base;
4281 }
4282
4283 // set pcm cache levels
4284 pcm->set_target_memory(target);
4285 pcm->set_min_memory(min);
4286 pcm->set_max_memory(max);
4287
4288 dout(5) << __func__ << " updated pcm target: " << target
4289 << " pcm min: " << min
4290 << " pcm max: " << max
4291 << dendl;
4292 }
4293
4294 // =======================================================
4295
4296 // OmapIteratorImpl
4297
4298 #undef dout_prefix
4299 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
4300
4301 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
4302 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
4303 : c(c), o(o), it(it)
4304 {
4305 std::shared_lock l(c->lock);
4306 if (o->onode.has_omap()) {
4307 o->get_omap_key(string(), &head);
4308 o->get_omap_tail(&tail);
4309 it->lower_bound(head);
4310 }
4311 }
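// All omap keys for this onode sit in one contiguous KV range, so the
// iterator can be positioned and bounds-checked with plain string
// comparisons, roughly:
//
//   head = <omap prefix for this onode> + ""     // lowest possible key
//   tail = <upper bound of this onode's omap prefix>
//   head <= raw_key < tail   <=>  raw_key belongs to this object's omap
//
// valid() below relies on exactly the right-hand comparison
// (it->raw_key().second < tail).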
4312
4313 string BlueStore::OmapIteratorImpl::_stringify() const
4314 {
4315 stringstream s;
4316 s << " omap_iterator(cid = " << c->cid
4317 <<", oid = " << o->oid << ")";
4318 return s.str();
4319 }
4320
4321 int BlueStore::OmapIteratorImpl::seek_to_first()
4322 {
4323 std::shared_lock l(c->lock);
4324 auto start1 = mono_clock::now();
4325 if (o->onode.has_omap()) {
4326 it->lower_bound(head);
4327 } else {
4328 it = KeyValueDB::Iterator();
4329 }
4330 c->store->log_latency(
4331 __func__,
4332 l_bluestore_omap_seek_to_first_lat,
4333 mono_clock::now() - start1,
4334 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4335
4336 return 0;
4337 }
4338
4339 int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
4340 {
4341 std::shared_lock l(c->lock);
4342 auto start1 = mono_clock::now();
4343 if (o->onode.has_omap()) {
4344 string key;
4345 o->get_omap_key(after, &key);
4346 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
4347 << pretty_binary_string(key) << dendl;
4348 it->upper_bound(key);
4349 } else {
4350 it = KeyValueDB::Iterator();
4351 }
4352 c->store->log_latency_fn(
4353 __func__,
4354 l_bluestore_omap_upper_bound_lat,
4355 mono_clock::now() - start1,
4356 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4357 [&] (const ceph::timespan& lat) {
4358 return ", after = " + after +
4359 _stringify();
4360 }
4361 );
4362 return 0;
4363 }
4364
4365 int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
4366 {
4367 std::shared_lock l(c->lock);
4368 auto start1 = mono_clock::now();
4369 if (o->onode.has_omap()) {
4370 string key;
4371 o->get_omap_key(to, &key);
4372 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
4373 << pretty_binary_string(key) << dendl;
4374 it->lower_bound(key);
4375 } else {
4376 it = KeyValueDB::Iterator();
4377 }
4378 c->store->log_latency_fn(
4379 __func__,
4380 l_bluestore_omap_lower_bound_lat,
4381 mono_clock::now() - start1,
4382 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4383 [&] (const ceph::timespan& lat) {
4384 return ", to = " + to +
4385 _stringify();
4386 }
4387 );
4388 return 0;
4389 }
4390
4391 bool BlueStore::OmapIteratorImpl::valid()
4392 {
4393 std::shared_lock l(c->lock);
4394 bool r = o->onode.has_omap() && it && it->valid() &&
4395 it->raw_key().second < tail;
4396 if (it && it->valid()) {
4397 ldout(c->store->cct,20) << __func__ << " is at "
4398 << pretty_binary_string(it->raw_key().second)
4399 << dendl;
4400 }
4401 return r;
4402 }
4403
4404 int BlueStore::OmapIteratorImpl::next()
4405 {
4406 int r = -1;
4407 std::shared_lock l(c->lock);
4408 auto start1 = mono_clock::now();
4409 if (o->onode.has_omap()) {
4410 it->next();
4411 r = 0;
4412 }
4413 c->store->log_latency(
4414 __func__,
4415 l_bluestore_omap_next_lat,
4416 mono_clock::now() - start1,
4417 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4418
4419 return r;
4420 }
4421
4422 string BlueStore::OmapIteratorImpl::key()
4423 {
4424 std::shared_lock l(c->lock);
4425 ceph_assert(it->valid());
4426 string db_key = it->raw_key().second;
4427 string user_key;
4428 o->decode_omap_key(db_key, &user_key);
4429
4430 return user_key;
4431 }
4432
4433 bufferlist BlueStore::OmapIteratorImpl::value()
4434 {
4435 std::shared_lock l(c->lock);
4436 ceph_assert(it->valid());
4437 return it->value();
4438 }
4439
4440
4441 // =====================================
4442
4443 #undef dout_prefix
4444 #define dout_prefix *_dout << "bluestore(" << path << ") "
4445 #undef dout_context
4446 #define dout_context cct
4447
4448
4449 static void aio_cb(void *priv, void *priv2)
4450 {
4451 BlueStore *store = static_cast<BlueStore*>(priv);
4452 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4453 c->aio_finish(store);
4454 }
4455
4456 static void discard_cb(void *priv, void *priv2)
4457 {
4458 BlueStore *store = static_cast<BlueStore*>(priv);
4459 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4460 store->handle_discard(*tmp);
4461 }
4462
4463 void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4464 {
4465 dout(10) << __func__ << dendl;
4466 ceph_assert(alloc);
4467 alloc->release(to_release);
4468 }
4469
4470 BlueStore::BlueStore(CephContext *cct, const string& path)
4471 : BlueStore(cct, path, 0) {}
4472
4473 BlueStore::BlueStore(CephContext *cct,
4474 const string& path,
4475 uint64_t _min_alloc_size)
4476 : ObjectStore(cct, path),
4477 throttle(cct),
4478 finisher(cct, "commit_finisher", "cfin"),
4479 kv_sync_thread(this),
4480 kv_finalize_thread(this),
4481 min_alloc_size(_min_alloc_size),
4482 min_alloc_size_order(ctz(_min_alloc_size)),
4483 mempool_thread(this)
4484 {
4485 _init_logger();
4486 cct->_conf.add_observer(this);
4487 set_cache_shards(1);
4488 }
4489
4490 BlueStore::~BlueStore()
4491 {
4492 cct->_conf.remove_observer(this);
4493 _shutdown_logger();
4494 ceph_assert(!mounted);
4495 ceph_assert(db == NULL);
4496 ceph_assert(bluefs == NULL);
4497 ceph_assert(fsid_fd < 0);
4498 ceph_assert(path_fd < 0);
4499 for (auto i : onode_cache_shards) {
4500 delete i;
4501 }
4502 for (auto i : buffer_cache_shards) {
4503 delete i;
4504 }
4505 onode_cache_shards.clear();
4506 buffer_cache_shards.clear();
4507 }
4508
4509 const char **BlueStore::get_tracked_conf_keys() const
4510 {
4511 static const char* KEYS[] = {
4512 "bluestore_csum_type",
4513 "bluestore_compression_mode",
4514 "bluestore_compression_algorithm",
4515 "bluestore_compression_min_blob_size",
4516 "bluestore_compression_min_blob_size_ssd",
4517 "bluestore_compression_min_blob_size_hdd",
4518 "bluestore_compression_max_blob_size",
4519 "bluestore_compression_max_blob_size_ssd",
4520 "bluestore_compression_max_blob_size_hdd",
4521 "bluestore_compression_required_ratio",
4522 "bluestore_max_alloc_size",
4523 "bluestore_prefer_deferred_size",
4524 "bluestore_prefer_deferred_size_hdd",
4525 "bluestore_prefer_deferred_size_ssd",
4526 "bluestore_deferred_batch_ops",
4527 "bluestore_deferred_batch_ops_hdd",
4528 "bluestore_deferred_batch_ops_ssd",
4529 "bluestore_throttle_bytes",
4530 "bluestore_throttle_deferred_bytes",
4531 "bluestore_throttle_cost_per_io_hdd",
4532 "bluestore_throttle_cost_per_io_ssd",
4533 "bluestore_throttle_cost_per_io",
4534 "bluestore_max_blob_size",
4535 "bluestore_max_blob_size_ssd",
4536 "bluestore_max_blob_size_hdd",
4537 "osd_memory_target",
4538 "osd_memory_target_cgroup_limit_ratio",
4539 "osd_memory_base",
4540 "osd_memory_cache_min",
4541 "osd_memory_expected_fragmentation",
4542 "bluestore_cache_autotune",
4543 "bluestore_cache_autotune_interval",
4544 "bluestore_warn_on_legacy_statfs",
4545 "bluestore_warn_on_no_per_pool_omap",
4546 "bluestore_max_defer_interval",
4547 NULL
4548 };
4549 return KEYS;
4550 }
4551
4552 void BlueStore::handle_conf_change(const ConfigProxy& conf,
4553 const std::set<std::string> &changed)
4554 {
4555 if (changed.count("bluestore_warn_on_legacy_statfs")) {
4556 _check_legacy_statfs_alert();
4557 }
4558 if (changed.count("bluestore_warn_on_no_per_pool_omap")) {
4559 _check_no_per_pool_omap_alert();
4560 }
4561
4562 if (changed.count("bluestore_csum_type")) {
4563 _set_csum();
4564 }
4565 if (changed.count("bluestore_compression_mode") ||
4566 changed.count("bluestore_compression_algorithm") ||
4567 changed.count("bluestore_compression_min_blob_size") ||
4568 changed.count("bluestore_compression_max_blob_size")) {
4569 if (bdev) {
4570 _set_compression();
4571 }
4572 }
4573 if (changed.count("bluestore_max_blob_size") ||
4574 changed.count("bluestore_max_blob_size_ssd") ||
4575 changed.count("bluestore_max_blob_size_hdd")) {
4576 if (bdev) {
4577 // only after startup
4578 _set_blob_size();
4579 }
4580 }
4581 if (changed.count("bluestore_prefer_deferred_size") ||
4582 changed.count("bluestore_prefer_deferred_size_hdd") ||
4583 changed.count("bluestore_prefer_deferred_size_ssd") ||
4584 changed.count("bluestore_max_alloc_size") ||
4585 changed.count("bluestore_deferred_batch_ops") ||
4586 changed.count("bluestore_deferred_batch_ops_hdd") ||
4587 changed.count("bluestore_deferred_batch_ops_ssd")) {
4588 if (bdev) {
4589 // only after startup
4590 _set_alloc_sizes();
4591 }
4592 }
4593 if (changed.count("bluestore_throttle_cost_per_io") ||
4594 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4595 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4596 if (bdev) {
4597 _set_throttle_params();
4598 }
4599 }
4600 if (changed.count("bluestore_throttle_bytes") ||
4601 changed.count("bluestore_throttle_deferred_bytes") ||
4602 changed.count("bluestore_throttle_trace_rate")) {
4603 throttle.reset_throttle(conf);
4604 }
4605 if (changed.count("bluestore_max_defer_interval")) {
4606 if (bdev) {
4607 _set_max_defer_interval();
4608 }
4609 }
4610 if (changed.count("osd_memory_target") ||
4611 changed.count("osd_memory_base") ||
4612 changed.count("osd_memory_cache_min") ||
4613 changed.count("osd_memory_expected_fragmentation")) {
4614 _update_osd_memory_options();
4615 }
4616 }
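// Rough flow of a runtime change to one of the memory options above:
//
//   set osd_memory_target (via the usual config machinery)
//     -> handle_conf_change()
//     -> _update_osd_memory_options()    (re-reads the four osd_memory_* values)
//     -> config_changed++
//     -> MempoolThread::entry() sees the counter move on its next wakeup
//     -> _update_cache_settings()        pushes new target/min/max into pcm
//
// Options guarded by `if (bdev)` only take effect on a mounted store, since
// resolving them requires knowing whether the main device is rotational.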
4617
4618 void BlueStore::_set_compression()
4619 {
4620 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4621 if (m) {
4622 _clear_compression_alert();
4623 comp_mode = *m;
4624 } else {
4625 derr << __func__ << " unrecognized value '"
4626 << cct->_conf->bluestore_compression_mode
4627 << "' for bluestore_compression_mode, reverting to 'none'"
4628 << dendl;
4629 comp_mode = Compressor::COMP_NONE;
4630 string s("unknown mode: ");
4631 s += cct->_conf->bluestore_compression_mode;
4632 _set_compression_alert(true, s.c_str());
4633 }
4634
4635 compressor = nullptr;
4636
4637 if (cct->_conf->bluestore_compression_min_blob_size) {
4638 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
4639 } else {
4640 ceph_assert(bdev);
4641 if (_use_rotational_settings()) {
4642 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4643 } else {
4644 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4645 }
4646 }
4647
4648 if (cct->_conf->bluestore_compression_max_blob_size) {
4649 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4650 } else {
4651 ceph_assert(bdev);
4652 if (_use_rotational_settings()) {
4653 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4654 } else {
4655 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4656 }
4657 }
4658
4659 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4660 if (!alg_name.empty()) {
4661 compressor = Compressor::create(cct, alg_name);
4662 if (!compressor) {
4663 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4664 << dendl;
4665 _set_compression_alert(false, alg_name.c_str());
4666 }
4667 }
4668
4669 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4670 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
4671 << " min_blob " << comp_min_blob_size
4672 << " max_blob " << comp_max_blob_size
4673 << dendl;
4674 }
4675
4676 void BlueStore::_set_csum()
4677 {
4678 csum_type = Checksummer::CSUM_NONE;
4679 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4680 if (t > Checksummer::CSUM_NONE)
4681 csum_type = t;
4682
4683 dout(10) << __func__ << " csum_type "
4684 << Checksummer::get_csum_type_string(csum_type)
4685 << dendl;
4686 }
4687
4688 void BlueStore::_set_throttle_params()
4689 {
4690 if (cct->_conf->bluestore_throttle_cost_per_io) {
4691 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4692 } else {
4693 ceph_assert(bdev);
4694 if (_use_rotational_settings()) {
4695 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4696 } else {
4697 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4698 }
4699 }
4700
4701 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4702 << dendl;
4703 }
4704 void BlueStore::_set_blob_size()
4705 {
4706 if (cct->_conf->bluestore_max_blob_size) {
4707 max_blob_size = cct->_conf->bluestore_max_blob_size;
4708 } else {
4709 ceph_assert(bdev);
4710 if (_use_rotational_settings()) {
4711 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4712 } else {
4713 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4714 }
4715 }
4716 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4717 << std::dec << dendl;
4718 }
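// The selection pattern above repeats for several tunables: an explicitly set
// (non-zero) value wins, otherwise the hdd/ssd variant is chosen via
// _use_rotational_settings().  For example, assuming the shipped defaults of
// roughly 512 KiB (hdd) and 64 KiB (ssd):
//
//   bluestore_max_blob_size    = 0      // "pick one for me"
//   rotational main device     -> max_blob_size = bluestore_max_blob_size_hdd
//   non-rotational main device -> max_blob_size = bluestore_max_blob_size_ssd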
4719
4720 void BlueStore::_update_osd_memory_options()
4721 {
4722 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4723 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4724 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4725 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4726 config_changed++;
4727 dout(10) << __func__
4728 << " osd_memory_target " << osd_memory_target
4729 << " osd_memory_base " << osd_memory_base
4730 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4731 << " osd_memory_cache_min " << osd_memory_cache_min
4732 << dendl;
4733 }
4734
4735 int BlueStore::_set_cache_sizes()
4736 {
4737 ceph_assert(bdev);
4738 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
4739 cache_autotune_interval =
4740 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
4741 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4742 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4743 osd_memory_expected_fragmentation =
4744 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4745 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4746 osd_memory_cache_resize_interval =
4747 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
4748
4749 if (cct->_conf->bluestore_cache_size) {
4750 cache_size = cct->_conf->bluestore_cache_size;
4751 } else {
4752 // choose global cache size based on backend type
4753 if (_use_rotational_settings()) {
4754 cache_size = cct->_conf->bluestore_cache_size_hdd;
4755 } else {
4756 cache_size = cct->_conf->bluestore_cache_size_ssd;
4757 }
4758 }
4759
4760 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
4761 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
4762 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4763 << ") must be in range [0,1.0]" << dendl;
4764 return -EINVAL;
4765 }
4766
4767 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
4768 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
4769 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
4770 << ") must be in range [0,1.0]" << dendl;
4771 return -EINVAL;
4772 }
4773
4774 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
4775 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4776 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4777 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4778 << dendl;
4779 return -EINVAL;
4780 }
4781
4782 cache_data_ratio =
4783 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
4784 if (cache_data_ratio < 0) {
4785 // deal with floating point imprecision
4786 cache_data_ratio = 0;
4787 }
4788
4789 dout(1) << __func__ << " cache_size " << cache_size
4790 << " meta " << cache_meta_ratio
4791 << " kv " << cache_kv_ratio
4792 << " data " << cache_data_ratio
4793 << dendl;
4794 return 0;
4795 }
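// Worked example of the ratio split above (values invented for the sketch):
//
//   cache_size       = 3 GiB
//   cache_meta_ratio = 0.45  -> ~1.35 GiB for onode/extent metadata
//   cache_kv_ratio   = 0.45  -> ~1.35 GiB for the kv (rocksdb) block cache
//   cache_data_ratio = 1 - 0.45 - 0.45 = 0.10 -> ~0.3 GiB for object data
//
// With cache autotuning enabled these are only starting points; the
// PriorityCache manager rebalances the three consumers at runtime.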
4796
4797 int BlueStore::write_meta(const std::string& key, const std::string& value)
4798 {
4799 bluestore_bdev_label_t label;
4800 string p = path + "/block";
4801 int r = _read_bdev_label(cct, p, &label);
4802 if (r < 0) {
4803 return ObjectStore::write_meta(key, value);
4804 }
4805 label.meta[key] = value;
4806 r = _write_bdev_label(cct, p, label);
4807 ceph_assert(r == 0);
4808 return ObjectStore::write_meta(key, value);
4809 }
4810
4811 int BlueStore::read_meta(const std::string& key, std::string *value)
4812 {
4813 bluestore_bdev_label_t label;
4814 string p = path + "/block";
4815 int r = _read_bdev_label(cct, p, &label);
4816 if (r < 0) {
4817 return ObjectStore::read_meta(key, value);
4818 }
4819 auto i = label.meta.find(key);
4820 if (i == label.meta.end()) {
4821 return ObjectStore::read_meta(key, value);
4822 }
4823 *value = i->second;
4824 return 0;
4825 }
4826
4827 void BlueStore::_init_logger()
4828 {
4829 PerfCountersBuilder b(cct, "bluestore",
4830 l_bluestore_first, l_bluestore_last);
4831 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
4832 "Average kv_thread flush latency",
4833 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
4834 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
4835 "Average kv_thread commit latency");
4836 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
4837 "Average kv_sync thread latency",
4838 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
4839 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
4840 "Average kv_finalize thread latency",
4841 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
4842 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
4843 "Average prepare state latency");
4844 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
4845 "Average aio_wait state latency",
4846 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
4847 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
4848 "Average io_done state latency");
4849 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
4850 "Average kv_queued state latency");
4851 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
4852 "Average kv_commiting state latency");
4853 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
4854 "Average kv_done state latency");
4855 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
4856 "Average deferred_queued state latency");
4857 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
4858 "Average aio_wait state latency");
4859 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
4860 "Average cleanup state latency");
4861 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
4862 "Average finishing state latency");
4863 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
4864 "Average done state latency");
4865 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
4866 "Average submit throttle latency",
4867 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
4868 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
4869 "Average submit latency",
4870 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
4871 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
4872 "Average commit latency",
4873 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
4874 b.add_time_avg(l_bluestore_read_lat, "read_lat",
4875 "Average read latency",
4876 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
4877 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
4878 "Average read onode metadata latency");
4879 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
4880 "Average read latency");
4881 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
4882 "Average compress latency");
4883 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
4884 "Average decompress latency");
4885 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
4886 "Average checksum latency");
4887 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
4888 "Sum for beneficial compress ops");
4889 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
4890 "Sum for compress ops rejected due to low net gain of space");
4891 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
4892 "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
4893 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
4894 "Sum for deferred write op");
4895 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
4896 "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
4897 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
4898 "Sum for write penalty read ops");
4899 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
4900 "Sum for allocated bytes");
4901 b.add_u64(l_bluestore_stored, "bluestore_stored",
4902 "Sum for stored bytes");
4903 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
4904 "Sum for stored compressed bytes",
4905 "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
4906 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
4907 "Sum for bytes allocated for compressed data",
4908 "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
4909 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
4910 "Sum for original bytes that were compressed",
4911 "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
4912 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
4913 "Number of onodes in cache");
4914 b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
4915 "Number of pinned onodes in cache");
4916 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
4917 "Sum for onode-lookups hit in the cache");
4918 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
4919 "Sum for onode-lookups missed in the cache");
4920 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
4921 "Sum for onode-shard lookups hit in the cache");
4922 b.add_u64_counter(l_bluestore_onode_shard_misses,
4923 "bluestore_onode_shard_misses",
4924 "Sum for onode-shard lookups missed in the cache");
4925 b.add_u64(l_bluestore_extents, "bluestore_extents",
4926 "Number of extents in cache");
4927 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
4928 "Number of blobs in cache");
4929 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
4930 "Number of buffers in cache");
4931 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
4932 "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
4933 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
4934 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
4935 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
4936 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));
4937
4938 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
4939 "Large aligned writes into fresh blobs");
4940 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
4941 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
4942 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
4943 "Large aligned writes into fresh blobs (blobs)");
4944 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
4945 "Small writes into existing or sparse small blobs");
4946 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
4947 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
4948 b.add_u64_counter(l_bluestore_write_small_unused,
4949 "bluestore_write_small_unused",
4950 "Small writes into unused portion of existing blob");
4951 b.add_u64_counter(l_bluestore_write_small_deferred,
4952 "bluestore_write_small_deferred",
4953 "Small overwrites using deferred");
4954 b.add_u64_counter(l_bluestore_write_small_pre_read,
4955 "bluestore_write_small_pre_read",
4956 "Small writes that required we read some data (possibly "
4957 "cached) to fill out the block");
4958 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
4959 "Small write into new (sparse) blob");
4960
4961 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
4962 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
4963 "Onode extent map reshard events");
4964 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
4965 "Sum for blob splitting due to resharding");
4966 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
4967 "Sum for extents that have been removed due to compression");
4968 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
4969 "Sum for extents that have been merged due to garbage "
4970 "collection");
4971 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
4972 "Read EIO errors propagated to high level callers");
4973 b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
4974 "Read operations that required at least one retry due to failed checksum validation");
4975 b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
4976 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
4977 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
4978 "Average omap iterator seek_to_first call latency");
4979 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
4980 "Average omap iterator upper_bound call latency");
4981 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
4982 "Average omap iterator lower_bound call latency");
4983 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
4984 "Average omap iterator next call latency");
4985 b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
4986 "Average omap get_keys call latency");
4987 b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
4988 "Average omap get_values call latency");
4989 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
4990 "Average collection listing latency");
4991 b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
4992 "Average removal latency");
4993
4994 logger = b.create_perf_counters();
4995 cct->get_perfcounters_collection()->add(logger);
4996 }
4997
4998 int BlueStore::_reload_logger()
4999 {
5000 struct store_statfs_t store_statfs;
5001 int r = statfs(&store_statfs);
5002 if (r >= 0) {
5003 logger->set(l_bluestore_allocated, store_statfs.allocated);
5004 logger->set(l_bluestore_stored, store_statfs.data_stored);
5005 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
5006 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
5007 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
5008 }
5009 return r;
5010 }
5011
5012 void BlueStore::_shutdown_logger()
5013 {
5014 cct->get_perfcounters_collection()->remove(logger);
5015 delete logger;
5016 }
5017
5018 int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
5019 uuid_d *fsid)
5020 {
5021 bluestore_bdev_label_t label;
5022 int r = _read_bdev_label(cct, path, &label);
5023 if (r < 0)
5024 return r;
5025 *fsid = label.osd_uuid;
5026 return 0;
5027 }
5028
5029 int BlueStore::_open_path()
5030 {
5031 // sanity check(s)
5032 ceph_assert(path_fd < 0);
5033 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
5034 if (path_fd < 0) {
5035 int r = -errno;
5036 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
5037 << dendl;
5038 return r;
5039 }
5040 return 0;
5041 }
5042
5043 void BlueStore::_close_path()
5044 {
5045 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
5046 path_fd = -1;
5047 }
5048
5049 int BlueStore::_write_bdev_label(CephContext *cct,
5050 string path, bluestore_bdev_label_t label)
5051 {
5052 dout(10) << __func__ << " path " << path << " label " << label << dendl;
5053 bufferlist bl;
5054 encode(label, bl);
5055 uint32_t crc = bl.crc32c(-1);
5056 encode(crc, bl);
5057 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
5058 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
5059 z.zero();
5060 bl.append(std::move(z));
5061
5062 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
5063 if (fd < 0) {
5064 fd = -errno;
5065 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5066 << dendl;
5067 return fd;
5068 }
5069 int r = bl.write_fd(fd);
5070 if (r < 0) {
5071 derr << __func__ << " failed to write to " << path
5072 << ": " << cpp_strerror(r) << dendl;
5073 goto out;
5074 }
5075 r = ::fsync(fd);
5076 if (r < 0) {
5077 derr << __func__ << " failed to fsync " << path
5078 << ": " << cpp_strerror(r) << dendl;
5079 }
5080 out:
5081 VOID_TEMP_FAILURE_RETRY(::close(fd));
5082 return r;
5083 }
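// On-disk layout produced above (the encoded length varies with the label's
// metadata map, but the block is always padded to BDEV_LABEL_BLOCK_SIZE):
//
//   [ encode(bluestore_bdev_label_t) | crc32c of the encoded label | zero pad ]
//    \----------------- BDEV_LABEL_BLOCK_SIZE bytes in total -----------------/
//
// _read_bdev_label() below recomputes the crc over exactly the decoded span
// (t.substr_of(bl, 0, p.get_off())) before comparing it with the stored value,
// so a truncated or corrupted label fails with -ENOENT / -EIO instead of being
// half-trusted.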
5084
5085 int BlueStore::_read_bdev_label(CephContext* cct, string path,
5086 bluestore_bdev_label_t *label)
5087 {
5088 dout(10) << __func__ << dendl;
5089 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
5090 if (fd < 0) {
5091 fd = -errno;
5092 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5093 << dendl;
5094 return fd;
5095 }
5096 bufferlist bl;
5097 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
5098 VOID_TEMP_FAILURE_RETRY(::close(fd));
5099 if (r < 0) {
5100 derr << __func__ << " failed to read from " << path
5101 << ": " << cpp_strerror(r) << dendl;
5102 return r;
5103 }
5104
5105 uint32_t crc, expected_crc;
5106 auto p = bl.cbegin();
5107 try {
5108 decode(*label, p);
5109 bufferlist t;
5110 t.substr_of(bl, 0, p.get_off());
5111 crc = t.crc32c(-1);
5112 decode(expected_crc, p);
5113 }
5114 catch (buffer::error& e) {
5115 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
5116 << ": " << e.what()
5117 << dendl;
5118 return -ENOENT;
5119 }
5120 if (crc != expected_crc) {
5121 derr << __func__ << " bad crc on label, expected " << expected_crc
5122 << " != actual " << crc << dendl;
5123 return -EIO;
5124 }
5125 dout(10) << __func__ << " got " << *label << dendl;
5126 return 0;
5127 }
5128
5129 int BlueStore::_check_or_set_bdev_label(
5130 string path, uint64_t size, string desc, bool create)
5131 {
5132 bluestore_bdev_label_t label;
5133 if (create) {
5134 label.osd_uuid = fsid;
5135 label.size = size;
5136 label.btime = ceph_clock_now();
5137 label.description = desc;
5138 int r = _write_bdev_label(cct, path, label);
5139 if (r < 0)
5140 return r;
5141 } else {
5142 int r = _read_bdev_label(cct, path, &label);
5143 if (r < 0)
5144 return r;
5145 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
5146 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5147 << " and fsid " << fsid << " check bypassed" << dendl;
5148 } else if (label.osd_uuid != fsid) {
5149 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5150 << " does not match our fsid " << fsid << dendl;
5151 return -EIO;
5152 }
5153 }
5154 return 0;
5155 }
5156
5157 void BlueStore::_set_alloc_sizes(void)
5158 {
5159 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
5160
5161 if (cct->_conf->bluestore_prefer_deferred_size) {
5162 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
5163 } else {
5164 ceph_assert(bdev);
5165 if (_use_rotational_settings()) {
5166 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
5167 } else {
5168 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
5169 }
5170 }
5171
5172 if (cct->_conf->bluestore_deferred_batch_ops) {
5173 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
5174 } else {
5175 ceph_assert(bdev);
5176 if (_use_rotational_settings()) {
5177 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
5178 } else {
5179 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
5180 }
5181 }
5182
5183 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
5184 << std::dec << " order " << (int)min_alloc_size_order
5185 << " max_alloc_size 0x" << std::hex << max_alloc_size
5186 << " prefer_deferred_size 0x" << prefer_deferred_size
5187 << std::dec
5188 << " deferred_batch_ops " << deferred_batch_ops
5189 << dendl;
5190 }
5191
5192 int BlueStore::_open_bdev(bool create)
5193 {
5194 ceph_assert(bdev == NULL);
5195 string p = path + "/block";
5196 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
5197 int r = bdev->open(p);
5198 if (r < 0)
5199 goto fail;
5200
5201 if (create && cct->_conf->bdev_enable_discard) {
5202 bdev->discard(0, bdev->get_size());
5203 }
5204
5205 if (bdev->supported_bdev_label()) {
5206 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
5207 if (r < 0)
5208 goto fail_close;
5209 }
5210
5211 // initialize global block parameters
5212 block_size = bdev->get_block_size();
5213 block_mask = ~(block_size - 1);
5214 block_size_order = ctz(block_size);
5215 ceph_assert(block_size == 1u << block_size_order);
5216 _set_max_defer_interval();
5217 // and set cache_size based on device type
5218 r = _set_cache_sizes();
5219 if (r < 0) {
5220 goto fail_close;
5221 }
5222 return 0;
5223
5224 fail_close:
5225 bdev->close();
5226 fail:
5227 delete bdev;
5228 bdev = NULL;
5229 return r;
5230 }
5231
5232 void BlueStore::_validate_bdev()
5233 {
5234 ceph_assert(bdev);
5235 ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
5236 uint64_t dev_size = bdev->get_size();
5237 if (dev_size <
5238 _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
5239 dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
5240 << " is too small, disable bluestore_bluefs_min for now"
5241 << dendl;
5242 ceph_assert(dev_size >= _get_ondisk_reserved());
5243
5244 int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
5245 ceph_assert(r == 0);
5246 }
5247 }
5248
5249 void BlueStore::_close_bdev()
5250 {
5251 ceph_assert(bdev);
5252 bdev->close();
5253 delete bdev;
5254 bdev = NULL;
5255 }
5256
5257 int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
5258 {
5259 int r;
5260 bluestore_bdev_label_t label;
5261
5262 ceph_assert(fm == NULL);
5263 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
5264 ceph_assert(fm);
5265 if (t) {
5266 // create mode. initialize freespace
5267 dout(20) << __func__ << " initializing freespace" << dendl;
5268 {
5269 bufferlist bl;
5270 bl.append(freelist_type);
5271 t->set(PREFIX_SUPER, "freelist_type", bl);
5272 }
5273 // being able to allocate in units less than bdev block size
5274 // seems to be a bad idea.
5275 ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
5276 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
5277
5278 // allocate superblock reserved space. note that we do not mark
5279 // bluefs space as allocated in the freelist; we instead rely on
5280 // bluefs_extents.
5281 auto reserved = _get_ondisk_reserved();
5282 fm->allocate(0, reserved, t);
5283
5284 if (cct->_conf->bluestore_bluefs) {
5285 ceph_assert(bluefs_extents.num_intervals() == 1);
5286 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
5287 reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
5288 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
5289 << " for bluefs" << dendl;
5290 }
5291
5292 if (cct->_conf->bluestore_debug_prefill > 0) {
5293 uint64_t end = bdev->get_size() - reserved;
5294 dout(1) << __func__ << " pre-fragmenting freespace, using "
5295 << cct->_conf->bluestore_debug_prefill << " with max free extent "
5296 << cct->_conf->bluestore_debug_prefragment_max << dendl;
5297 uint64_t start = p2roundup(reserved, min_alloc_size);
5298 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
5299 float r = cct->_conf->bluestore_debug_prefill;
5300 r /= 1.0 - r;
5301 bool stop = false;
5302
5303 while (!stop && start < end) {
5304 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
5305 if (start + l > end) {
5306 l = end - start;
5307 l = p2align(l, min_alloc_size);
5308 }
5309 ceph_assert(start + l <= end);
5310
5311 uint64_t u = 1 + (uint64_t)(r * (double)l);
5312 u = p2roundup(u, min_alloc_size);
5313 if (start + l + u > end) {
5314 u = end - (start + l);
5315 // trim to align so we don't overflow again
5316 u = p2align(u, min_alloc_size);
5317 stop = true;
5318 }
5319 ceph_assert(start + l + u <= end);
5320
5321 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
5322 << " use 0x" << u << std::dec << dendl;
5323
5324 if (u == 0) {
5325 // break if u has been trimmed to nothing
5326 break;
5327 }
5328
5329 fm->allocate(start + l, u, t);
5330 start += l + u;
5331 }
5332 }
5333 r = _write_out_fm_meta(0, false, &label);
5334 ceph_assert(r == 0);
5335 } else {
5336 string p = path + "/block";
5337 r = _read_bdev_label(cct, p, &label);
5338 if (r < 0) {
5339 derr << __func__ << " freelist init failed, error reading bdev label: " << cpp_strerror(r) << dendl;
5340 delete fm;
5341 fm = NULL;
5342 return r;
5343 }
5344 }
5345 r = fm->init(label, db, read_only);
5346 if (r < 0) {
5347 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
5348 delete fm;
5349 fm = NULL;
5350 return r;
5351 }
5352 // If the space size tracked by the freelist manager is larger than the
5353 // actual device size, allocations can land past the end of the device,
5354 // which results in data loss and/or assertions.
5355 // Most likely the user altered the device size somehow.
5356 // The only fix for now is to redeploy the OSD.
5357 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5358 ostringstream ss;
5359 ss << "slow device size mismatch detected, "
5360 << " fm size(" << fm->get_size()
5361 << ") > slow device size(" << bdev->get_size()
5362 << "), Please stop using this OSD as it might cause data loss.";
5363 _set_disk_size_mismatch_alert(ss.str());
5364 }
5365 return 0;
5366 }
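// Sketch of the debug pre-fragmentation math above (only reachable when
// bluestore_debug_prefill > 0, i.e. in testing):
//
//   prefill = 0.2  ->  r = 0.2 / (1 - 0.2) = 0.25
//   each iteration leaves a free extent of length l and marks roughly
//   u ~= 0.25 * l as allocated, so about 20% of the span ends up "used" and
//   the remaining free space is chopped into many bounded extents.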
5367
5368 void BlueStore::_close_fm()
5369 {
5370 dout(10) << __func__ << dendl;
5371 ceph_assert(fm);
5372 fm->shutdown();
5373 delete fm;
5374 fm = NULL;
5375 }
5376
5377 int BlueStore::_write_out_fm_meta(uint64_t target_size,
5378 bool update_root_size,
5379 bluestore_bdev_label_t* res_label)
5380 {
5381 string p = path + "/block";
5382
5383 std::vector<std::pair<string, string>> fm_meta;
5384 fm->get_meta(target_size, &fm_meta);
5385
5386 bluestore_bdev_label_t label;
5387 int r = _read_bdev_label(cct, p, &label);
5388 if (r < 0)
5389 return r;
5390
5391 for (auto& m : fm_meta) {
5392 label.meta[m.first] = m.second;
5393 }
5394 if (update_root_size) {
5395 label.size = target_size;
5396 }
5397 r = _write_bdev_label(cct, p, label);
5398 if (res_label) {
5399 *res_label = label;
5400 }
5401
5402 return r;
5403 }
5404
5405 int BlueStore::_open_alloc()
5406 {
5407 ceph_assert(alloc == NULL);
5408 ceph_assert(bdev->get_size());
5409
5410 if (bluefs) {
5411 bluefs_extents.clear();
5412 auto r = bluefs->get_block_extents(bluefs_layout.shared_bdev,
5413 &bluefs_extents);
5414 if (r < 0) {
5415 lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
5416 << cpp_strerror(r) << dendl;
5417
5418 return r;
5419 }
5420 dout(10) << __func__ << " bluefs extents 0x"
5421 << std::hex << bluefs_extents << std::dec
5422 << dendl;
5423 }
5424
5425 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
5426 bdev->get_size(),
5427 min_alloc_size, "block");
5428 if (!alloc) {
5429 lderr(cct) << __func__ << " Allocator::unknown alloc type "
5430 << cct->_conf->bluestore_allocator
5431 << dendl;
5432 return -EINVAL;
5433 }
5434
5435 uint64_t num = 0, bytes = 0;
5436
5437 dout(1) << __func__ << " opening allocation metadata" << dendl;
5438 // initialize from freelist
5439 fm->enumerate_reset();
5440 uint64_t offset, length;
5441 while (fm->enumerate_next(db, &offset, &length)) {
5442 alloc->init_add_free(offset, length);
5443 ++num;
5444 bytes += length;
5445 }
5446 fm->enumerate_reset();
5447
5448 // also mark bluefs space as allocated
5449 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5450 alloc->init_rm_free(e.get_start(), e.get_len());
5451 }
5452
5453 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
5454 << " in " << num << " extents"
5455 << " available " << byte_u_t(alloc->get_free())
5456 << dendl;
5457
5458 return 0;
5459 }
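// Net effect of _open_alloc(): the in-memory allocator starts as the set of
// free extents recorded by the FreelistManager, minus whatever the shared
// bluefs partition currently occupies (bluefs_extents), since bluefs space is
// deliberately not tracked in the freelist (see the comment in _open_fm()).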
5460
5461 void BlueStore::_close_alloc()
5462 {
5463 ceph_assert(bdev);
5464 bdev->discard_drain();
5465
5466 ceph_assert(alloc);
5467 alloc->shutdown();
5468 delete alloc;
5469 alloc = NULL;
5470 bluefs_extents.clear();
5471 }
5472
5473 int BlueStore::_open_fsid(bool create)
5474 {
5475 ceph_assert(fsid_fd < 0);
5476 int flags = O_RDWR|O_CLOEXEC;
5477 if (create)
5478 flags |= O_CREAT;
5479 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
5480 if (fsid_fd < 0) {
5481 int err = -errno;
5482 derr << __func__ << " " << cpp_strerror(err) << dendl;
5483 return err;
5484 }
5485 return 0;
5486 }
5487
5488 int BlueStore::_read_fsid(uuid_d *uuid)
5489 {
5490 char fsid_str[40];
5491 memset(fsid_str, 0, sizeof(fsid_str));
5492 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5493 if (ret < 0) {
5494 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5495 return ret;
5496 }
5497 if (ret > 36)
5498 fsid_str[36] = 0;
5499 else
5500 fsid_str[ret] = 0;
5501 if (!uuid->parse(fsid_str)) {
5502 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5503 return -EINVAL;
5504 }
5505 return 0;
5506 }
5507
5508 int BlueStore::_write_fsid()
5509 {
5510 int r = ::ftruncate(fsid_fd, 0);
5511 if (r < 0) {
5512 r = -errno;
5513 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5514 return r;
5515 }
5516 string str = stringify(fsid) + "\n";
5517 r = safe_write(fsid_fd, str.c_str(), str.length());
5518 if (r < 0) {
5519 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5520 return r;
5521 }
5522 r = ::fsync(fsid_fd);
5523 if (r < 0) {
5524 r = -errno;
5525 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5526 return r;
5527 }
5528 return 0;
5529 }
5530
5531 void BlueStore::_close_fsid()
5532 {
5533 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5534 fsid_fd = -1;
5535 }
5536
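// Takes a POSIX advisory write lock (fcntl F_SETLK) over the whole "fsid"
// file.  The lock is per-process and is released automatically when the fd
// is closed or the process exits, which is what makes it a reliable
// "another ceph-osd is still running" check in test_mount_in_use().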
5537 int BlueStore::_lock_fsid()
5538 {
5539 struct flock l;
5540 memset(&l, 0, sizeof(l));
5541 l.l_type = F_WRLCK;
5542 l.l_whence = SEEK_SET;
5543 int r = ::fcntl(fsid_fd, F_SETLK, &l);
5544 if (r < 0) {
5545 int err = errno;
5546 derr << __func__ << " failed to lock " << path << "/fsid"
5547 << " (is another ceph-osd still running?)"
5548 << cpp_strerror(err) << dendl;
5549 return -err;
5550 }
5551 return 0;
5552 }
5553
5554 bool BlueStore::is_rotational()
5555 {
5556 if (bdev) {
5557 return bdev->is_rotational();
5558 }
5559
5560 bool rotational = true;
5561 int r = _open_path();
5562 if (r < 0)
5563 goto out;
5564 r = _open_fsid(false);
5565 if (r < 0)
5566 goto out_path;
5567 r = _read_fsid(&fsid);
5568 if (r < 0)
5569 goto out_fsid;
5570 r = _lock_fsid();
5571 if (r < 0)
5572 goto out_fsid;
5573 r = _open_bdev(false);
5574 if (r < 0)
5575 goto out_fsid;
5576 rotational = bdev->is_rotational();
5577 _close_bdev();
5578 out_fsid:
5579 _close_fsid();
5580 out_path:
5581 _close_path();
5582 out:
5583 return rotational;
5584 }
5585
5586 bool BlueStore::is_journal_rotational()
5587 {
5588 if (!bluefs) {
5589 dout(5) << __func__ << " bluefs disabled, default to store media type"
5590 << dendl;
5591 return is_rotational();
5592 }
5593 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
5594 return bluefs->wal_is_rotational();
5595 }
5596
5597 bool BlueStore::_use_rotational_settings()
5598 {
5599 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
5600 return true;
5601 }
5602 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
5603 return false;
5604 }
5605 return bdev->is_rotational();
5606 }
5607
5608 bool BlueStore::test_mount_in_use()
5609 {
5610 // most error conditions mean the mount is not in use (e.g., because
5611 // it doesn't exist). only if we fail to lock do we conclude it is
5612 // in use.
5613 bool ret = false;
5614 int r = _open_path();
5615 if (r < 0)
5616 return false;
5617 r = _open_fsid(false);
5618 if (r < 0)
5619 goto out_path;
5620 r = _lock_fsid();
5621 if (r < 0)
5622 ret = true; // if we can't lock, it is in use
5623 _close_fsid();
5624 out_path:
5625 _close_path();
5626 return ret;
5627 }
5628
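// Informal overview of the layout probing below:
//   - "block.db" present  -> dedicated DB device, shared_bdev = BDEV_SLOW
//   - "block.db" absent   -> DB lives on the main device, shared_bdev = BDEV_DB
//   - "block.wal" present -> dedicated WAL device (dedicated_wal = true)
// On create, dedicated devices hand their whole space (minus the label /
// SUPER_RESERVED area) to BlueFS, while the shared device only gets an
// initial gift sized from bluestore_bluefs_min_ratio + gift_ratio (at least
// bluestore_bluefs_min), placed roughly in the middle of the device.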
5629 int BlueStore::_minimal_open_bluefs(bool create)
5630 {
5631 int r;
5632 bluefs = new BlueFS(cct);
5633
5634 string bfn;
5635 struct stat st;
5636
5637 bfn = path + "/block.db";
5638 if (::stat(bfn.c_str(), &st) == 0) {
5639 r = bluefs->add_block_device(
5640 BlueFS::BDEV_DB, bfn,
5641 create && cct->_conf->bdev_enable_discard);
5642 if (r < 0) {
5643 derr << __func__ << " add block device(" << bfn << ") returned: "
5644 << cpp_strerror(r) << dendl;
5645 goto free_bluefs;
5646 }
5647
5648 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
5649 r = _check_or_set_bdev_label(
5650 bfn,
5651 bluefs->get_block_device_size(BlueFS::BDEV_DB),
5652 "bluefs db", create);
5653 if (r < 0) {
5654 derr << __func__
5655 << " check block device(" << bfn << ") label returned: "
5656 << cpp_strerror(r) << dendl;
5657 goto free_bluefs;
5658 }
5659 }
5660 if (create) {
5661 bluefs->add_block_extent(
5662 BlueFS::BDEV_DB,
5663 SUPER_RESERVED,
5664 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
5665 }
5666 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
5667 bluefs_layout.dedicated_db = true;
5668 } else {
5669 r = -errno;
5670 if (::lstat(bfn.c_str(), &st) == -1) {
5671 r = 0;
5672 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
5673 } else {
5674 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5675 << cpp_strerror(r) << dendl;
5676 goto free_bluefs;
5677 }
5678 }
5679
5680 // shared device
5681 bfn = path + "/block";
5682 // never trim here
5683 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
5684 true /* shared with bluestore */);
5685 if (r < 0) {
5686 derr << __func__ << " add block device(" << bfn << ") returned: "
5687 << cpp_strerror(r) << dendl;
5688 goto free_bluefs;
5689 }
5690 if (create) {
5691 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
5692 uint64_t initial =
5693 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
5694 cct->_conf->bluestore_bluefs_gift_ratio);
5695 initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
5696 uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
5697 if (alloc_size % min_alloc_size) {
5698 derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
5699 << alloc_size << " is not a multiple of "
5700 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
5701 r = -EINVAL;
5702 goto free_bluefs;
5703 }
5704 // align to bluefs's alloc_size
5705 initial = p2roundup(initial, alloc_size);
5706 // put bluefs in the middle of the device in case it is an HDD
5707 uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
5708 // avoid overwriting the superblock
5709 start = std::max(alloc_size, start);
5710 ceph_assert(start >= _get_ondisk_reserved());
5711
5712 bluefs->add_block_extent(bluefs_layout.shared_bdev, start, initial);
5713 bluefs_extents.insert(start, initial);
5714 ++out_of_sync_fm;
5715 }
5716
5717 bfn = path + "/block.wal";
5718 if (::stat(bfn.c_str(), &st) == 0) {
5719 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
5720 create && cct->_conf->bdev_enable_discard);
5721 if (r < 0) {
5722 derr << __func__ << " add block device(" << bfn << ") returned: "
5723 << cpp_strerror(r) << dendl;
5724 goto free_bluefs;
5725 }
5726
5727 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
5728 r = _check_or_set_bdev_label(
5729 bfn,
5730 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
5731 "bluefs wal", create);
5732 if (r < 0) {
5733 derr << __func__ << " check block device(" << bfn
5734 << ") label returned: " << cpp_strerror(r) << dendl;
5735 goto free_bluefs;
5736 }
5737 }
5738
5739 if (create) {
5740 bluefs->add_block_extent(
5741 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
5742 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
5743 BDEV_LABEL_BLOCK_SIZE);
5744 }
5745 bluefs_layout.dedicated_wal = true;
5746 } else {
5747 r = 0;
5748 if (::lstat(bfn.c_str(), &st) != -1) {
5749 r = -errno;
5750 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5751 << cpp_strerror(r) << dendl;
5752 goto free_bluefs;
5753 }
5754 }
5755 return 0;
5756
5757 free_bluefs:
5758 ceph_assert(bluefs);
5759 delete bluefs;
5760 bluefs = NULL;
5761 return r;
5762 }
5763
5764 int BlueStore::_open_bluefs(bool create)
5765 {
5766 int r = _minimal_open_bluefs(create);
5767 if (r < 0) {
5768 return r;
5769 }
5770 RocksDBBlueFSVolumeSelector* vselector = nullptr;
5771 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
5772
5773 string options = cct->_conf->bluestore_rocksdb_options;
5774
5775 rocksdb::Options rocks_opts;
5776 int r = RocksDBStore::ParseOptionsFromStringStatic(
5777 cct,
5778 options,
5779 rocks_opts,
5780 nullptr);
5781 if (r < 0) {
5782 return r;
5783 }
5784
5785 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
5786 vselector =
5787 new RocksDBBlueFSVolumeSelector(
5788 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
5789 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
5790 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
5791 1024 * 1024 * 1024, //FIXME: set expected l0 size here
5792 rocks_opts.max_bytes_for_level_base,
5793 rocks_opts.max_bytes_for_level_multiplier,
5794 reserved_factor,
5795 cct->_conf->bluestore_volume_selection_reserved,
5796 cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
5797 }
5798 if (create) {
5799 bluefs->mkfs(fsid, bluefs_layout);
5800 }
5801 bluefs->set_volume_selector(vselector);
5802 r = bluefs->mount();
5803 if (r < 0) {
5804 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
5805 }
5806 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
5807 return r;
5808 }
5809
5810 void BlueStore::_close_bluefs(bool cold_close)
5811 {
5812 bluefs->umount(cold_close);
5813 _minimal_close_bluefs();
5814 }
5815
5816 void BlueStore::_minimal_close_bluefs()
5817 {
5818 delete bluefs;
5819 bluefs = NULL;
5820 }
5821
5822 int BlueStore::_is_bluefs(bool create, bool* ret)
5823 {
5824 if (create) {
5825 *ret = cct->_conf->bluestore_bluefs;
5826 } else {
5827 string s;
5828 int r = read_meta("bluefs", &s);
5829 if (r < 0) {
5830 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
5831 return -EIO;
5832 }
5833 if (s == "1") {
5834 *ret = true;
5835 } else if (s == "0") {
5836 *ret = false;
5837 } else {
5838 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
5839 << dendl;
5840 return -EIO;
5841 }
5842 }
5843 return 0;
5844 }
5845
5846 /*
5847 * opens both DB and dependent super_meta, FreelistManager and allocator
5848 * in the proper order
5849 */
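//
// With BlueFS enabled the call sequence below is roughly:
//
//   _open_db(false, false, true);   // read-only first: FM list / allocator
//   _open_super_meta();             // state may be needed by BlueFS
//   _open_fm(nullptr, true);
//   _open_alloc();
//   _close_db(true);                // then, unless read_only was requested,
//   _open_db(false, false, false);  // reopen r/w and fm->sync(db)
//
// Without BlueFS the db is opened r/w right away and the same
// super_meta -> fm -> alloc steps follow.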
5850 int BlueStore::_open_db_and_around(bool read_only)
5851 {
5852 int r;
5853 bool do_bluefs = false;
5854 _is_bluefs(false, &do_bluefs); // ignore err code
5855 if (do_bluefs) {
5856 // open in read-only first to read FM list and init allocator
5857 // as they might be needed for some BlueFS procedures
5858 r = _open_db(false, false, true);
5859 if (r < 0)
5860 return r;
5861
5862 r = _open_super_meta();
5863 if (r < 0) {
5864 goto out_db;
5865 }
5866
5867 r = _open_fm(nullptr, true);
5868 if (r < 0)
5869 goto out_db;
5870
5871 r = _open_alloc();
5872 if (r < 0)
5873 goto out_fm;
5874
5875 // now open in R/W mode
5876 if (!read_only) {
5877 _close_db(true);
5878
5879 r = _open_db(false, false, false);
5880 if (r < 0) {
5881 _close_alloc();
5882 _close_fm();
5883 return r;
5884 }
5885 fm->sync(db);
5886 }
5887 } else {
5888 r = _open_db(false, false);
5889 if (r < 0) {
5890 return r;
5891 }
5892 r = _open_super_meta();
5893 if (r < 0) {
5894 goto out_db;
5895 }
5896
5897 r = _open_fm(nullptr, false);
5898 if (r < 0)
5899 goto out_db;
5900
5901 r = _open_alloc();
5902 if (r < 0)
5903 goto out_fm;
5904 }
5905 return 0;
5906
5907 out_fm:
5908 _close_fm();
5909 out_db:
5910 _close_db(read_only);
5911 return r;
5912 }
5913
5914 void BlueStore::_close_db_and_around(bool read_only)
5915 {
5916 if (bluefs) {
5917 if (!read_only && out_of_sync_fm.fetch_and(0)) {
5918 _sync_bluefs_and_fm();
5919 }
5920 _close_db(read_only);
5921 while(!read_only && out_of_sync_fm.fetch_and(0)) {
5922 // if some allocations were seen during close - repeat open_db, sync fm, close
5923 dout(0) << __func__ << " syncing FreelistManager" << dendl;
5924 int r = _open_db(false, false, false);
5925 if (r < 0) {
5926 derr << __func__
5927 << " unable to open db, FreelistManager is probably out of sync"
5928 << dendl;
5929 break;
5930 }
5931 _sync_bluefs_and_fm();
5932 _close_db(false);
5933 }
5934 if (!_kv_only) {
5935 _close_alloc();
5936 _close_fm();
5937 }
5938 } else {
5939 _close_alloc();
5940 _close_fm();
5941 _close_db(read_only);
5942 }
5943 }
5944
5945 // updates legacy bluefs related recs in DB to a state valid for
5946 // downgrades from nautilus.
5947 void BlueStore::_sync_bluefs_and_fm()
5948 {
5949 if (cct->_conf->bluestore_bluefs_db_compatibility) {
5950 bufferlist bl;
5951 encode(bluefs_extents, bl);
5952 dout(20) << __func__ << " bluefs_extents at KV is now 0x"
5953 << std::hex << bluefs_extents << std::dec
5954 << dendl;
5955 KeyValueDB::Transaction synct = db->get_transaction();
5956 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
5957 synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
5958
5959 // The nice thing is that we don't need to update the FreelistManager here.
5960 // It always has the corresponding bits set to 'Free' for both Nautilus+ and
5961 // pre-Nautilus releases.
5962 // So once an extent makes it into bluefs_extents it has already been
5963 // freed in the allocator and hence is free in the FM too.
5964
5965 db->submit_transaction_sync(synct);
5966 }
5967 }
5968
5969 int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
5970 {
5971 int r;
5972 ceph_assert(!db);
5973 ceph_assert(!(create && read_only));
5974 string fn = path + "/db";
5975 string options;
5976 stringstream err;
5977 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
5978
5979 string kv_backend;
5980 std::vector<KeyValueDB::ColumnFamily> cfs;
5981
5982 if (create) {
5983 kv_backend = cct->_conf->bluestore_kvbackend;
5984 } else {
5985 r = read_meta("kv_backend", &kv_backend);
5986 if (r < 0) {
5987 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
5988 return -EIO;
5989 }
5990 }
5991 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
5992
5993 bool do_bluefs;
5994 r = _is_bluefs(create, &do_bluefs);
5995 if (r < 0) {
5996 return r;
5997 }
5998 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
5999
6000 map<string,string> kv_options;
6001 // force separate wal dir for all new deployments.
6002 kv_options["separate_wal_dir"] = "1";
6003 rocksdb::Env *env = NULL;
6004 if (do_bluefs) {
6005 dout(10) << __func__ << " initializing bluefs" << dendl;
6006 if (kv_backend != "rocksdb") {
6007 derr << " backend must be rocksdb to use bluefs" << dendl;
6008 return -EINVAL;
6009 }
6010
6011 r = _open_bluefs(create);
6012 if (r < 0) {
6013 return r;
6014 }
6015
6016 if (cct->_conf->bluestore_bluefs_env_mirror) {
6017 rocksdb::Env* a = new BlueRocksEnv(bluefs);
6018 rocksdb::Env* b = rocksdb::Env::Default();
6019 if (create) {
6020 string cmd = "rm -rf " + path + "/db " +
6021 path + "/db.slow " +
6022 path + "/db.wal";
6023 int r = system(cmd.c_str());
6024 (void)r;
6025 }
6026 env = new rocksdb::EnvMirror(b, a, false, true);
6027 } else {
6028 env = new BlueRocksEnv(bluefs);
6029
6030 // simplify the dir names, too, as "seen" by rocksdb
6031 fn = "db";
6032 }
6033 bluefs->set_slow_device_expander(this);
6034 BlueFSVolumeSelector::paths paths;
6035 bluefs->get_vselector_paths(fn, paths);
6036
6037 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
6038 // we have both block.db and block; tell rocksdb!
6039 // note: the second (last) size value doesn't really matter
6040 ostringstream db_paths;
6041 bool first = true;
6042 for (auto& p : paths) {
6043 if (!first) {
6044 db_paths << " ";
6045 }
6046 first = false;
6047 db_paths << p.first << "," << p.second;
6048
6049 }
6050 kv_options["db_paths"] = db_paths.str();
6051 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
6052 }
6053
6054 if (create) {
6055 for (auto& p : paths) {
6056 env->CreateDir(p.first);
6057 }
6058 // Selectors don't provide the wal path so far, hence create it explicitly
6059 env->CreateDir(fn + ".wal");
6060 } else {
6061 std::vector<std::string> res;
6062 // check for dir presence
6063 auto r = env->GetChildren(fn+".wal", &res);
6064 if (r.IsNotFound()) {
6065 kv_options.erase("separate_wal_dir");
6066 }
6067 }
6068 } else {
6069 string walfn = path + "/db.wal";
6070
6071 if (create) {
6072 int r = ::mkdir(fn.c_str(), 0755);
6073 if (r < 0)
6074 r = -errno;
6075 if (r < 0 && r != -EEXIST) {
6076 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
6077 << dendl;
6078 return r;
6079 }
6080
6081 // wal_dir, too!
6082 r = ::mkdir(walfn.c_str(), 0755);
6083 if (r < 0)
6084 r = -errno;
6085 if (r < 0 && r != -EEXIST) {
6086 derr << __func__ << " failed to create " << walfn
6087 << ": " << cpp_strerror(r)
6088 << dendl;
6089 return r;
6090 }
6091 } else {
6092 struct stat st;
6093 r = ::stat(walfn.c_str(), &st);
6094 if (r < 0 && errno == ENOENT) {
6095 kv_options.erase("separate_wal_dir");
6096 }
6097 }
6098 }
6099
6100
6101 db = KeyValueDB::create(cct,
6102 kv_backend,
6103 fn,
6104 kv_options,
6105 static_cast<void*>(env));
6106 if (!db) {
6107 derr << __func__ << " error creating db" << dendl;
6108 if (bluefs) {
6109 _close_bluefs(read_only);
6110 }
6111 // delete env manually here since we can't depend on db to do this
6112 // in this case
6113 delete env;
6114 env = NULL;
6115 return -EIO;
6116 }
6117
6118 FreelistManager::setup_merge_operators(db);
6119 db->set_merge_operator(PREFIX_STAT, merge_op);
6120 db->set_cache_size(cache_kv_ratio * cache_size);
6121
6122 if (kv_backend == "rocksdb") {
6123 options = cct->_conf->bluestore_rocksdb_options;
6124
6125 map<string,string> cf_map;
6126 cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
6127 get_str_map,
6128 &cf_map,
6129 " \t");
6130 for (auto& i : cf_map) {
6131 dout(10) << "column family " << i.first << ": " << i.second << dendl;
6132 cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
6133 }
6134 }
6135
6136 db->init(options);
6137 if (to_repair_db)
6138 return 0;
6139 if (create) {
6140 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
6141 r = db->create_and_open(err, cfs);
6142 } else {
6143 r = db->create_and_open(err);
6144 }
6145 } else {
6146 // we pass in cf list here, but it is only used if the db already has
6147 // column families created.
6148 r = read_only ?
6149 db->open_read_only(err, cfs) :
6150 db->open(err, cfs);
6151 }
6152 if (r) {
6153 derr << __func__ << " error opening db: " << err.str() << dendl;
6154 _close_db(read_only);
6155 return -EIO;
6156 }
6157 dout(1) << __func__ << " opened " << kv_backend
6158 << " path " << fn << " options " << options << dendl;
6159 return 0;
6160 }
6161
6162 void BlueStore::_close_db(bool cold_close)
6163 {
6164 ceph_assert(db);
6165 delete db;
6166 db = NULL;
6167 if (bluefs) {
6168 _close_bluefs(cold_close);
6169 }
6170 }
6171
6172 void BlueStore::_dump_alloc_on_failure()
6173 {
6174 auto dump_interval =
6175 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
6176 if (dump_interval > 0 &&
6177 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
6178 alloc->dump();
6179 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
6180 next_dump_on_bluefs_alloc_failure += dump_interval;
6181 }
6182 }
6183
6184
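// Gifts raw space from the main-device allocator to BlueFS.  Both min_size
// and size are rounded up to the BlueFS shared alloc size, allocation is
// done in chunks capped at 1 GiB, and if it cannot keep up with min_size
// the partial allocation is released again and -ENOSPC returned.  When
// extents_out is null the extents are applied to BlueFS directly via
// add_block_extent(); otherwise the caller is expected to do so.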
6185 int BlueStore::allocate_bluefs_freespace(
6186 uint64_t min_size,
6187 uint64_t size,
6188 PExtentVector* extents_out)
6189 {
6190 ceph_assert(min_size <= size);
6191 if (size) {
6192 // round up to alloc size
6193 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
6194 min_size = p2roundup(min_size, alloc_size);
6195 size = p2roundup(size, alloc_size);
6196
6197 PExtentVector extents_local;
6198 PExtentVector* extents = extents_out ? extents_out : &extents_local;
6199
6200
6201 uint64_t gift;
6202 uint64_t allocated = 0;
6203 int64_t alloc_len;
6204 auto need = size;
6205 auto extent_count0 = extents->size();
6206 do {
6207 // hard cap to fit into 32 bits
6208 gift = std::min<uint64_t>(size, 1ull << 30);
6209 dout(10) << __func__ << " gifting " << gift
6210 << " (" << byte_u_t(gift) << ")" << dendl;
6211
6212 alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
6213 if (alloc_len > 0) {
6214 allocated += alloc_len;
6215 size -= alloc_len;
6216 }
6217
6218 if (alloc_len < 0 ||
6219 (alloc_len < (int64_t)gift && (min_size > allocated))) {
6220 derr << __func__
6221 << " failed to allocate on 0x" << std::hex << gift
6222 << " min_size 0x" << min_size
6223 << " > allocated total 0x" << allocated
6224 << " bluefs_shared_alloc_size 0x" << alloc_size
6225 << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
6226 << " available 0x " << alloc->get_free()
6227 << std::dec << dendl;
6228
6229 _dump_alloc_on_failure();
6230 alloc->release(*extents);
6231 extents->clear();
6232 return -ENOSPC;
6233 }
6234 } while (size && alloc_len > 0);
6235 _collect_allocation_stats(need, alloc_size, extents->size() - extent_count0);
6236
6237 for (auto& e : *extents) {
6238 dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
6239 bluefs_extents.insert(e.offset, e.length);
6240 ++out_of_sync_fm;
6241 // apply to bluefs if not requested from outside
6242 if (!extents_out) {
6243 bluefs->add_block_extent(bluefs_layout.shared_bdev, e.offset, e.length);
6244 }
6245 }
6246 }
6247 return 0;
6248 }
6249
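// Counts only space that could actually be handed out in alloc_size units.
// Worked example (illustrative): with alloc_size = 0x10000 (64 KiB), a free
// chunk at off = 0x1800 / len = 0x28000 first skips
// dist_to_alignment = 0x10000 - 0x1800 = 0xe800 to reach the next aligned
// boundary, leaving 0x19800, of which p2align(0x19800, 0x10000) = 0x10000
// is counted; the unaligned head and tail are ignored.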
6250 uint64_t BlueStore::available_freespace(uint64_t alloc_size) {
6251 uint64_t total = 0;
6252 auto iterated_allocation = [&](uint64_t off, uint64_t len) {
6253 //only count in size that is alloc_size aligned
6254 uint64_t dist_to_alignment;
6255 uint64_t offset_in_block = off & (alloc_size - 1);
6256 if (offset_in_block == 0)
6257 dist_to_alignment = 0;
6258 else
6259 dist_to_alignment = alloc_size - offset_in_block;
6260 if (dist_to_alignment >= len)
6261 return;
6262 len -= dist_to_alignment;
6263 total += p2align(len, alloc_size);
6264 };
6265 alloc->dump(iterated_allocation);
6266 return total;
6267 }
6268
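// Sign convention: a positive return value is the number of bytes BlueStore
// should gift to BlueFS, a negative value the number of bytes to reclaim
// from it, 0 means leave things alone.  Gifting kicks in when the BlueFS
// share of total free space falls below bluestore_bluefs_min_ratio (or the
// absolute bluestore_bluefs_min / bluestore_bluefs_min_free floors),
// reclaiming when it exceeds bluestore_bluefs_max_ratio;
// bluestore_bluefs_max_free suppresses further gifting.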
6269 int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
6270 {
6271 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
6272
6273 uint64_t my_free = alloc->get_free();
6274 uint64_t total = bdev->get_size();
6275 float my_free_ratio = (float)my_free / (float)total;
6276
6277 uint64_t total_free = bluefs_free + my_free;
6278
6279 float bluefs_ratio = (float)bluefs_free / (float)total_free;
6280
6281 dout(10) << __func__
6282 << " bluefs " << byte_u_t(bluefs_free)
6283 << " free (" << bluefs_free_ratio
6284 << ") bluestore " << byte_u_t(my_free)
6285 << " free (" << my_free_ratio
6286 << "), bluefs_ratio " << bluefs_ratio
6287 << dendl;
6288
6289 uint64_t gift = 0;
6290 uint64_t reclaim = 0;
6291 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
6292 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
6293 if (gift >= my_free)
6294 gift = my_free / 2;
6295 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
6296 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
6297 << ", should gift " << byte_u_t(gift) << dendl;
6298 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
6299 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
6300 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
6301 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
6302 if (reclaim >= bluefs_free)
6303 reclaim = bluefs_free / 2;
6304 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
6305 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
6306 << ", should reclaim " << byte_u_t(reclaim) << dendl;
6307 }
6308
6309 // don't take over too much of the freespace
6310 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
6311 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
6312 cct->_conf->bluestore_bluefs_min < free_cap) {
6313 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
6314 dout(10) << __func__ << " bluefs_total " << bluefs_total
6315 << " < min " << cct->_conf->bluestore_bluefs_min
6316 << ", should gift " << byte_u_t(g) << dendl;
6317 if (g > gift)
6318 gift = g;
6319 reclaim = 0;
6320 }
6321 uint64_t min_free =
6322 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6323 if (bluefs_free < min_free &&
6324 min_free < free_cap) {
6325 uint64_t g = min_free - bluefs_free;
6326 dout(10) << __func__ << " bluefs_free " << bluefs_free
6327 << " < min " << min_free
6328 << ", should gift " << byte_u_t(g) << dendl;
6329 if (g > gift)
6330 gift = g;
6331 reclaim = 0;
6332 }
6333 uint64_t max_free =
6334 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_max_free");
6335 if (bluefs_free > max_free) {
6336 dout(10) << __func__ << " bluefs_free " << bluefs_free
6337 << " > max " << max_free
6338 << ", stop gifting for now" << dendl;
6339 gift = 0;
6340 }
6341 ceph_assert((int64_t)gift >= 0);
6342 ceph_assert((int64_t)reclaim >= 0);
6343 return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
6344 }
6345
6346 int BlueStore::_balance_bluefs_freespace()
6347 {
6348 int ret = 0;
6349 ceph_assert(bluefs);
6350
6351 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
6352 bluefs->get_usage(&bluefs_usage);
6353 ceph_assert(bluefs_usage.size() > bluefs_layout.shared_bdev);
6354
6355 bool clear_alert = true;
6356 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
6357 auto& p = bluefs_usage[bluefs_layout.shared_bdev];
6358 if (p.first != p.second) {
6359 auto& db = bluefs_usage[BlueFS::BDEV_DB];
6360 ostringstream ss;
6361 ss << "spilled over " << byte_u_t(p.second - p.first)
6362 << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
6363 << " used of " << byte_u_t(db.second) << ") to slow device";
6364 _set_spillover_alert(ss.str());
6365 clear_alert = false;
6366 }
6367 }
6368 if (clear_alert) {
6369 _clear_spillover_alert();
6370 }
6371
6372 // fixme: look at primary bdev only for now
6373 int64_t delta = _get_bluefs_size_delta(
6374 bluefs_usage[bluefs_layout.shared_bdev].first,
6375 bluefs_usage[bluefs_layout.shared_bdev].second);
6376
6377 // reclaim from bluefs?
6378 if (delta < 0) {
6379 // round up to alloc size
6380 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
6381 auto reclaim = p2roundup(uint64_t(-delta), alloc_size);
6382
6383 // hard cap to fit into 32 bits
6384 reclaim = std::min<uint64_t>(reclaim, 1ull << 30);
6385 dout(10) << __func__ << " reclaiming " << reclaim
6386 << " (" << byte_u_t(reclaim) << ")" << dendl;
6387
6388 while (reclaim > 0) {
6389 // NOTE: this will block and do IO.
6390 PExtentVector extents;
6391 int r = bluefs->reclaim_blocks(bluefs_layout.shared_bdev, reclaim,
6392 &extents);
6393 if (r < 0) {
6394 derr << __func__ << " failed to reclaim space from bluefs"
6395 << dendl;
6396 break;
6397 }
6398 for (auto e : extents) {
6399 ++out_of_sync_fm;
6400 bluefs_extents.erase(e.offset, e.length);
6401 bluefs_extents_reclaiming.insert(e.offset, e.length);
6402 reclaim -= e.length;
6403 }
6404 }
6405
6406 ret = 1;
6407 }
6408
6409 return ret;
6410 }
6411
6412 int BlueStore::_open_collections()
6413 {
6414 dout(10) << __func__ << dendl;
6415 collections_had_errors = false;
6416 ceph_assert(coll_map.empty());
6417 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6418 for (it->upper_bound(string());
6419 it->valid();
6420 it->next()) {
6421 coll_t cid;
6422 if (cid.parse(it->key())) {
6423 auto c = ceph::make_ref<Collection>(
6424 this,
6425 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6426 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6427 cid);
6428 bufferlist bl = it->value();
6429 auto p = bl.cbegin();
6430 try {
6431 decode(c->cnode, p);
6432 } catch (buffer::error& e) {
6433 derr << __func__ << " failed to decode cnode, key:"
6434 << pretty_binary_string(it->key()) << dendl;
6435 return -EIO;
6436 }
6437 dout(20) << __func__ << " opened " << cid << " " << c
6438 << " " << c->cnode << dendl;
6439 _osr_attach(c.get());
6440 coll_map[cid] = c;
6441
6442 } else {
6443 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6444 collections_had_errors = true;
6445 }
6446 }
6447 return 0;
6448 }
6449
6450 void BlueStore::_fsck_collections(int64_t* errors)
6451 {
6452 if (collections_had_errors) {
6453 dout(10) << __func__ << dendl;
6454 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6455 for (it->upper_bound(string());
6456 it->valid();
6457 it->next()) {
6458 coll_t cid;
6459 if (!cid.parse(it->key())) {
6460 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6461 if (errors) {
6462 (*errors)++;
6463 }
6464 }
6465 }
6466 }
6467 }
6468
6469 void BlueStore::_set_per_pool_omap()
6470 {
6471 per_pool_omap = false;
6472 bufferlist bl;
6473 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6474 if (bl.length()) {
6475 per_pool_omap = true;
6476 dout(10) << __func__ << " per_pool_omap=1" << dendl;
6477 } else {
6478 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6479 }
6480 _check_no_per_pool_omap_alert();
6481 }
6482
6483 void BlueStore::_open_statfs()
6484 {
6485 osd_pools.clear();
6486 vstatfs.reset();
6487
6488 bufferlist bl;
6489 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
6490 if (r >= 0) {
6491 per_pool_stat_collection = false;
6492 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
6493 auto it = bl.cbegin();
6494 vstatfs.decode(it);
6495 dout(10) << __func__ << " store_statfs is found" << dendl;
6496 } else {
6497 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6498 }
6499 _check_legacy_statfs_alert();
6500 } else {
6501 per_pool_stat_collection = true;
6502 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
6503 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
6504 for (it->upper_bound(string());
6505 it->valid();
6506 it->next()) {
6507
6508 uint64_t pool_id;
6509 int r = get_key_pool_stat(it->key(), &pool_id);
6510 ceph_assert(r == 0);
6511
6512 bufferlist bl;
6513 bl = it->value();
6514 auto p = bl.cbegin();
6515 auto& st = osd_pools[pool_id];
6516 try {
6517 st.decode(p);
6518 vstatfs += st;
6519
6520 dout(30) << __func__ << " pool " << pool_id
6521 << " statfs " << st << dendl;
6522 } catch (buffer::error& e) {
6523 derr << __func__ << " failed to decode pool stats, key:"
6524 << pretty_binary_string(it->key()) << dendl;
6525 }
6526 }
6527 }
6528 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6529
6530 }
6531
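// Helper used by mkfs() and the bluefs device add/migrate paths: creates the
// block / block.db / block.wal entry under the store directory.  If epath is
// given, a symlink to it is created (plus, for spdk:... paths, a small
// regular file holding the NVMe transport id); if size is non-zero and the
// target turns out to be an empty regular file, it is truncated (and
// optionally preallocated) to that size.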
6532 int BlueStore::_setup_block_symlink_or_file(
6533 string name,
6534 string epath,
6535 uint64_t size,
6536 bool create)
6537 {
6538 dout(20) << __func__ << " name " << name << " path " << epath
6539 << " size " << size << " create=" << (int)create << dendl;
6540 int r = 0;
6541 int flags = O_RDWR|O_CLOEXEC;
6542 if (create)
6543 flags |= O_CREAT;
6544 if (epath.length()) {
6545 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6546 if (r < 0) {
6547 r = -errno;
6548 derr << __func__ << " failed to create " << name << " symlink to "
6549 << epath << ": " << cpp_strerror(r) << dendl;
6550 return r;
6551 }
6552
6553 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6554 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6555 if (fd < 0) {
6556 r = -errno;
6557 derr << __func__ << " failed to open " << epath << " file: "
6558 << cpp_strerror(r) << dendl;
6559 return r;
6560 }
6561 // write the Transport ID of the NVMe device
6562 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6563 // where "0000:02:00.0" is the selector of a PCI device, see
6564 // the first column of "lspci -mm -n -D"
6565 string trid{"trtype:PCIe "};
6566 trid += "traddr:";
6567 trid += epath.substr(strlen(SPDK_PREFIX));
6568 r = ::write(fd, trid.c_str(), trid.size());
6569 ceph_assert(r == static_cast<int>(trid.size()));
6570 dout(1) << __func__ << " created " << name << " symlink to "
6571 << epath << dendl;
6572 VOID_TEMP_FAILURE_RETRY(::close(fd));
6573 }
6574 }
6575 if (size) {
6576 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6577 if (fd >= 0) {
6578 // block file is present
6579 struct stat st;
6580 int r = ::fstat(fd, &st);
6581 if (r == 0 &&
6582 S_ISREG(st.st_mode) && // if it is a regular file
6583 st.st_size == 0) { // and is 0 bytes
6584 r = ::ftruncate(fd, size);
6585 if (r < 0) {
6586 r = -errno;
6587 derr << __func__ << " failed to resize " << name << " file to "
6588 << size << ": " << cpp_strerror(r) << dendl;
6589 VOID_TEMP_FAILURE_RETRY(::close(fd));
6590 return r;
6591 }
6592
6593 if (cct->_conf->bluestore_block_preallocate_file) {
6594 r = ::ceph_posix_fallocate(fd, 0, size);
6595 if (r > 0) {
6596 derr << __func__ << " failed to preallocate " << name << " file to "
6597 << size << ": " << cpp_strerror(r) << dendl;
6598 VOID_TEMP_FAILURE_RETRY(::close(fd));
6599 return -r;
6600 }
6601 }
6602 dout(1) << __func__ << " resized " << name << " file to "
6603 << byte_u_t(size) << dendl;
6604 }
6605 VOID_TEMP_FAILURE_RETRY(::close(fd));
6606 } else {
6607 int r = -errno;
6608 if (r != -ENOENT) {
6609 derr << __func__ << " failed to open " << name << " file: "
6610 << cpp_strerror(r) << dendl;
6611 return r;
6612 }
6613 }
6614 }
6615 return 0;
6616 }
6617
6618 int BlueStore::mkfs()
6619 {
6620 dout(1) << __func__ << " path " << path << dendl;
6621 int r;
6622 uuid_d old_fsid;
6623
6624 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6625 derr << __func__ << " osd_max_object_size "
6626 << cct->_conf->osd_max_object_size << " > bluestore max "
6627 << OBJECT_MAX_SIZE << dendl;
6628 return -EINVAL;
6629 }
6630
6631 {
6632 string done;
6633 r = read_meta("mkfs_done", &done);
6634 if (r == 0) {
6635 dout(1) << __func__ << " already created" << dendl;
6636 if (cct->_conf->bluestore_fsck_on_mkfs) {
6637 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6638 if (r < 0) {
6639 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6640 << dendl;
6641 return r;
6642 }
6643 if (r > 0) {
6644 derr << __func__ << " fsck found " << r << " errors" << dendl;
6645 r = -EIO;
6646 }
6647 }
6648 return r; // idempotent
6649 }
6650 }
6651
6652 {
6653 string type;
6654 r = read_meta("type", &type);
6655 if (r == 0) {
6656 if (type != "bluestore") {
6657 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6658 return -EIO;
6659 }
6660 } else {
6661 r = write_meta("type", "bluestore");
6662 if (r < 0)
6663 return r;
6664 }
6665 }
6666
6667 freelist_type = "bitmap";
6668
6669 r = _open_path();
6670 if (r < 0)
6671 return r;
6672
6673 r = _open_fsid(true);
6674 if (r < 0)
6675 goto out_path_fd;
6676
6677 r = _lock_fsid();
6678 if (r < 0)
6679 goto out_close_fsid;
6680
6681 r = _read_fsid(&old_fsid);
6682 if (r < 0 || old_fsid.is_zero()) {
6683 if (fsid.is_zero()) {
6684 fsid.generate_random();
6685 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6686 } else {
6687 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6688 }
6689 // we'll write it later.
6690 } else {
6691 if (!fsid.is_zero() && fsid != old_fsid) {
6692 derr << __func__ << " on-disk fsid " << old_fsid
6693 << " != provided " << fsid << dendl;
6694 r = -EINVAL;
6695 goto out_close_fsid;
6696 }
6697 fsid = old_fsid;
6698 }
6699
6700 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6701 cct->_conf->bluestore_block_size,
6702 cct->_conf->bluestore_block_create);
6703 if (r < 0)
6704 goto out_close_fsid;
6705 if (cct->_conf->bluestore_bluefs) {
6706 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6707 cct->_conf->bluestore_block_wal_size,
6708 cct->_conf->bluestore_block_wal_create);
6709 if (r < 0)
6710 goto out_close_fsid;
6711 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6712 cct->_conf->bluestore_block_db_size,
6713 cct->_conf->bluestore_block_db_create);
6714 if (r < 0)
6715 goto out_close_fsid;
6716 }
6717
6718 r = _open_bdev(true);
6719 if (r < 0)
6720 goto out_close_fsid;
6721
6722 // choose min_alloc_size
6723 if (cct->_conf->bluestore_min_alloc_size) {
6724 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6725 } else {
6726 ceph_assert(bdev);
6727 if (bdev->is_rotational()) {
6728 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6729 } else {
6730 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6731 }
6732 }
6733 _validate_bdev();
6734
6735 // make sure min_alloc_size is power of 2 aligned.
6736 if (!isp2(min_alloc_size)) {
6737 derr << __func__ << " min_alloc_size 0x"
6738 << std::hex << min_alloc_size << std::dec
6739 << " is not power of 2 aligned!"
6740 << dendl;
6741 r = -EINVAL;
6742 goto out_close_bdev;
6743 }
6744
6745 r = _open_db(true);
6746 if (r < 0)
6747 goto out_close_bdev;
6748
6749 {
6750 KeyValueDB::Transaction t = db->get_transaction();
6751 r = _open_fm(t, true);
6752 if (r < 0)
6753 goto out_close_db;
6754 {
6755 bufferlist bl;
6756 encode((uint64_t)0, bl);
6757 t->set(PREFIX_SUPER, "nid_max", bl);
6758 t->set(PREFIX_SUPER, "blobid_max", bl);
6759 }
6760
6761 {
6762 bufferlist bl;
6763 encode((uint64_t)min_alloc_size, bl);
6764 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6765 }
6766 {
6767 bufferlist bl;
6768 bl.append("1");
6769 t->set(PREFIX_SUPER, "per_pool_omap", bl);
6770 }
6771 ondisk_format = latest_ondisk_format;
6772 _prepare_ondisk_format_super(t);
6773 db->submit_transaction_sync(t);
6774 }
6775
6776 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6777 if (r < 0)
6778 goto out_close_fm;
6779
6780 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
6781 if (r < 0)
6782 goto out_close_fm;
6783
6784 if (fsid != old_fsid) {
6785 r = _write_fsid();
6786 if (r < 0) {
6787 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
6788 goto out_close_fm;
6789 }
6790 }
6791
6792 if (out_of_sync_fm.fetch_and(0)) {
6793 _sync_bluefs_and_fm();
6794 }
6795
6796 out_close_fm:
6797 _close_fm();
6798 out_close_db:
6799 _close_db(false);
6800 out_close_bdev:
6801 _close_bdev();
6802 out_close_fsid:
6803 _close_fsid();
6804 out_path_fd:
6805 _close_path();
6806
6807 if (r == 0 &&
6808 cct->_conf->bluestore_fsck_on_mkfs) {
6809 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6810 if (rc < 0)
6811 return rc;
6812 if (rc > 0) {
6813 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6814 r = -EIO;
6815 }
6816 }
6817
6818 if (r == 0) {
6819 // indicate success by writing the 'mkfs_done' file
6820 r = write_meta("mkfs_done", "yes");
6821 }
6822
6823 if (r < 0) {
6824 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6825 } else {
6826 dout(0) << __func__ << " success" << dendl;
6827 }
6828 return r;
6829 }
6830
6831 int BlueStore::_mount_for_bluefs()
6832 {
6833 int r = _open_path();
6834 ceph_assert(r == 0);
6835 r = _open_fsid(false);
6836 ceph_assert(r == 0);
6837 r = _read_fsid(&fsid);
6838 ceph_assert(r == 0);
6839 r = _lock_fsid();
6840 ceph_assert(r == 0);
6841 r = _open_bluefs(false);
6842 ceph_assert(r == 0);
6843 return r;
6844 }
6845
6846 void BlueStore::_umount_for_bluefs()
6847 {
6848 _close_bluefs(false);
6849 _close_fsid();
6850 _close_path();
6851 }
6852
6853 int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6854 {
6855 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6856 int r;
6857 ceph_assert(path_fd < 0);
6858
6859 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6860
6861 if (!cct->_conf->bluestore_bluefs) {
6862 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6863 return -EIO;
6864 }
6865
6866 r = _mount_for_bluefs();
6867
6868 int reserved = 0;
6869 if (id == BlueFS::BDEV_NEWWAL) {
6870 string p = path + "/block.wal";
6871 r = _setup_block_symlink_or_file("block.wal", dev_path,
6872 cct->_conf->bluestore_block_wal_size,
6873 true);
6874 ceph_assert(r == 0);
6875
6876 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
6877 cct->_conf->bdev_enable_discard);
6878 ceph_assert(r == 0);
6879
6880 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6881 r = _check_or_set_bdev_label(
6882 p,
6883 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6884 "bluefs wal",
6885 true);
6886 ceph_assert(r == 0);
6887 }
6888
6889 reserved = BDEV_LABEL_BLOCK_SIZE;
6890 bluefs_layout.dedicated_wal = true;
6891 } else if (id == BlueFS::BDEV_NEWDB) {
6892 string p = path + "/block.db";
6893 r = _setup_block_symlink_or_file("block.db", dev_path,
6894 cct->_conf->bluestore_block_db_size,
6895 true);
6896 ceph_assert(r == 0);
6897
6898 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
6899 cct->_conf->bdev_enable_discard);
6900 ceph_assert(r == 0);
6901
6902 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6903 r = _check_or_set_bdev_label(
6904 p,
6905 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6906 "bluefs db",
6907 true);
6908 ceph_assert(r == 0);
6909 }
6910 reserved = SUPER_RESERVED;
6911 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6912 bluefs_layout.dedicated_db = true;
6913 }
6914
6915 bluefs->umount();
6916 bluefs->mount();
6917
6918 bluefs->add_block_extent(
6919 id,
6920 reserved,
6921 bluefs->get_block_device_size(id) - reserved, true);
6922
6923 r = bluefs->prepare_new_device(id, bluefs_layout);
6924 ceph_assert(r == 0);
6925
6926 if (r < 0) {
6927 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6928 } else {
6929 dout(0) << __func__ << " success" << dendl;
6930 }
6931
6932 _umount_for_bluefs();
6933 return r;
6934 }
6935
6936 int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6937 int id)
6938 {
6939 dout(10) << __func__ << " id:" << id << dendl;
6940 ceph_assert(path_fd < 0);
6941
6942 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6943
6944 if (!cct->_conf->bluestore_bluefs) {
6945 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6946 return -EIO;
6947 }
6948
6949 int r = _mount_for_bluefs();
6950
6951 // require bluestore_bluefs_min_free to be free at target device!
6952 uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6953 for(auto src_id : devs_source) {
6954 used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
6955 }
6956 uint64_t target_free = bluefs->get_free(id);
6957 if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
6958 // will need to remount full BlueStore instance to allocate more space
6959 _umount_for_bluefs();
6960
6961 r = mount();
6962 ceph_assert(r == 0);
6963 dout(1) << __func__
6964 << " Allocating more space at slow device for BlueFS: +"
6965 << used_space - target_free << " bytes" << dendl;
6966 r = allocate_bluefs_freespace(
6967 used_space - target_free,
6968 used_space - target_free,
6969 nullptr);
6970
6971 umount();
6972 if (r != 0) {
6973 derr << __func__
6974 << " can't migrate, unable to allocate extra space: "
6975 << used_space - target_free << " at target:" << id
6976 << dendl;
6977 return -ENOSPC;
6978 }
6979
6980 r = _mount_for_bluefs();
6981 ceph_assert(r == 0);
6982 } else if (target_free < used_space) {
6983 derr << __func__
6984 << " can't migrate, free space at target: " << target_free
6985 << " is less than required space: " << used_space
6986 << dendl;
6987 return -ENOSPC;
6988 }
6989 if (devs_source.count(BlueFS::BDEV_DB)) {
6990 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6991 bluefs_layout.dedicated_db = false;
6992 }
6993 if (devs_source.count(BlueFS::BDEV_WAL)) {
6994 bluefs_layout.dedicated_wal = false;
6995 }
6996 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
6997 if (r < 0) {
6998 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6999 goto shutdown;
7000 }
7001
7002 if (devs_source.count(BlueFS::BDEV_DB)) {
7003 r = unlink(string(path + "/block.db").c_str());
7004 ceph_assert(r == 0);
7005 }
7006 if (devs_source.count(BlueFS::BDEV_WAL)) {
7007 r = unlink(string(path + "/block.wal").c_str());
7008 ceph_assert(r == 0);
7009 }
7010
7011 shutdown:
7012 _umount_for_bluefs();
7013 return r;
7014 }
7015
7016 int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
7017 int id,
7018 const string& dev_path)
7019 {
7020 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
7021 int r;
7022 ceph_assert(path_fd < 0);
7023
7024 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7025
7026 if (!cct->_conf->bluestore_bluefs) {
7027 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7028 return -EIO;
7029 }
7030
7031 r = _mount_for_bluefs();
7032
7033 int reserved = 0;
7034 string link_db;
7035 string link_wal;
7036 if (devs_source.count(BlueFS::BDEV_DB) &&
7037 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
7038 link_db = path + "/block.db";
7039 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7040 bluefs_layout.dedicated_db = false;
7041 }
7042 if (devs_source.count(BlueFS::BDEV_WAL)) {
7043 link_wal = path + "/block.wal";
7044 bluefs_layout.dedicated_wal = false;
7045 }
7046
7047 size_t target_size;
7048 string target_name;
7049 if (id == BlueFS::BDEV_NEWWAL) {
7050 target_name = "block.wal";
7051 target_size = cct->_conf->bluestore_block_wal_size;
7052 bluefs_layout.dedicated_wal = true;
7053
7054 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
7055 cct->_conf->bdev_enable_discard);
7056 ceph_assert(r == 0);
7057
7058 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7059 r = _check_or_set_bdev_label(
7060 dev_path,
7061 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7062 "bluefs wal",
7063 true);
7064 ceph_assert(r == 0);
7065 }
7066 reserved = BDEV_LABEL_BLOCK_SIZE;
7067 } else if (id == BlueFS::BDEV_NEWDB) {
7068 target_name = "block.db";
7069 target_size = cct->_conf->bluestore_block_db_size;
7070 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7071 bluefs_layout.dedicated_db = true;
7072
7073 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
7074 cct->_conf->bdev_enable_discard);
7075 ceph_assert(r == 0);
7076
7077 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7078 r = _check_or_set_bdev_label(
7079 dev_path,
7080 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7081 "bluefs db",
7082 true);
7083 ceph_assert(r == 0);
7084 }
7085 reserved = SUPER_RESERVED;
7086 }
7087
7088 bluefs->umount();
7089 bluefs->mount();
7090
7091 bluefs->add_block_extent(
7092 id, reserved, bluefs->get_block_device_size(id) - reserved);
7093
7094 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
7095
7096 if (r < 0) {
7097 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
7098 goto shutdown;
7099 }
7100
7101 if (!link_db.empty()) {
7102 r = unlink(link_db.c_str());
7103 ceph_assert(r == 0);
7104 }
7105 if (!link_wal.empty()) {
7106 r = unlink(link_wal.c_str());
7107 ceph_assert(r == 0);
7108 }
7109 r = _setup_block_symlink_or_file(
7110 target_name,
7111 dev_path,
7112 target_size,
7113 true);
7114 ceph_assert(r == 0);
7115 dout(0) << __func__ << " success" << dendl;
7116
7117 shutdown:
7118 _umount_for_bluefs();
7119 return r;
7120 }
7121
7122 string BlueStore::get_device_path(unsigned id)
7123 {
7124 string res;
7125 if (id < BlueFS::MAX_BDEV) {
7126 switch (id) {
7127 case BlueFS::BDEV_WAL:
7128 res = path + "/block.wal";
7129 break;
7130 case BlueFS::BDEV_DB:
7131 if (id == bluefs_layout.shared_bdev) {
7132 res = path + "/block";
7133 } else {
7134 res = path + "/block.db";
7135 }
7136 break;
7137 case BlueFS::BDEV_SLOW:
7138 res = path + "/block";
7139 break;
7140 }
7141 }
7142 return res;
7143 }
7144
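// Informal outline: for each dedicated WAL/DB device whose registered BlueFS
// extents end before the current device size, hand the tail to BlueFS via
// add_block_extent() and rewrite the size field in the bdev label; for the
// shared main device, if the FreelistManager covers less than
// bdev->get_size(), rewrite the fm meta via _write_out_fm_meta() and then do
// a full r/w mount/umount cycle so the expansion is persisted.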
7145 int BlueStore::expand_devices(ostream& out)
7146 {
7147 int r = cold_open();
7148 ceph_assert(r == 0);
7149 bluefs->dump_block_extents(out);
7150 out << "Expanding DB/WAL..." << std::endl;
7151 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
7152 if (devid == bluefs_layout.shared_bdev ) {
7153 continue;
7154 }
7155 uint64_t size = bluefs->get_block_device_size(devid);
7156 if (size == 0) {
7157 // no bdev
7158 continue;
7159 }
7160
7161 interval_set<uint64_t> before;
7162 bluefs->get_block_extents(devid, &before);
7163 ceph_assert(!before.empty());
7164 uint64_t end = before.range_end();
7165 if (end < size) {
7166 out << devid
7167 <<" : expanding " << " from 0x" << std::hex
7168 << end << " to 0x" << size << std::dec << std::endl;
7169 bluefs->add_block_extent(devid, end, size-end);
7170 string p = get_device_path(devid);
7171 const char* path = p.c_str();
7172 if (path == nullptr) {
7173 derr << devid
7174 <<": can't find device path " << dendl;
7175 continue;
7176 }
7177 bluestore_bdev_label_t label;
7178 int r = _read_bdev_label(cct, path, &label);
7179 if (r < 0) {
7180 derr << "unable to read label for " << path << ": "
7181 << cpp_strerror(r) << dendl;
7182 continue;
7183 }
7184 label.size = size;
7185 r = _write_bdev_label(cct, path, label);
7186 if (r < 0) {
7187 derr << "unable to write label for " << path << ": "
7188 << cpp_strerror(r) << dendl;
7189 continue;
7190 }
7191 out << devid
7192 <<" : size label updated to " << size
7193 << std::endl;
7194 }
7195 }
7196 uint64_t size0 = fm->get_size();
7197 uint64_t size = bdev->get_size();
7198 if (size0 < size) {
7199 out << bluefs_layout.shared_bdev
7200 << " : expanding " << " from 0x" << std::hex
7201 << size0 << " to 0x" << size << std::dec << std::endl;
7202 _write_out_fm_meta(size, true);
7203 cold_close();
7204
7205 // mount in read/write to sync expansion changes
7206 r = _mount(false);
7207 ceph_assert(r == 0);
7208 umount();
7209 } else {
7210 cold_close();
7211 }
7212 return r;
7213 }
7214
7215 int BlueStore::dump_bluefs_sizes(ostream& out)
7216 {
7217 int r = cold_open();
7218 ceph_assert(r == 0);
7219 bluefs->dump_block_extents(out);
7220 cold_close();
7221 return r;
7222 }
7223
7224 void BlueStore::set_cache_shards(unsigned num)
7225 {
7226 dout(10) << __func__ << " " << num << dendl;
7227 size_t oold = onode_cache_shards.size();
7228 size_t bold = buffer_cache_shards.size();
7229 ceph_assert(num >= oold && num >= bold);
7230 onode_cache_shards.resize(num);
7231 buffer_cache_shards.resize(num);
7232 for (unsigned i = oold; i < num; ++i) {
7233 onode_cache_shards[i] =
7234 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7235 logger);
7236 }
7237 for (unsigned i = bold; i < num; ++i) {
7238 buffer_cache_shards[i] =
7239 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7240 logger);
7241 }
7242 }
7243
7244 int BlueStore::_mount(bool kv_only, bool open_db)
7245 {
7246 dout(1) << __func__ << " path " << path << dendl;
7247
7248 _kv_only = kv_only;
7249
7250 {
7251 string type;
7252 int r = read_meta("type", &type);
7253 if (r < 0) {
7254 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
7255 << dendl;
7256 return r;
7257 }
7258
7259 if (type != "bluestore") {
7260 derr << __func__ << " expected bluestore, but type is " << type << dendl;
7261 return -EIO;
7262 }
7263 }
7264
7265 if (cct->_conf->bluestore_fsck_on_mount) {
7266 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
7267 if (rc < 0)
7268 return rc;
7269 if (rc > 0) {
7270 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7271 return -EIO;
7272 }
7273 }
7274
7275 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7276 derr << __func__ << " osd_max_object_size "
7277 << cct->_conf->osd_max_object_size << " > bluestore max "
7278 << OBJECT_MAX_SIZE << dendl;
7279 return -EINVAL;
7280 }
7281
7282 int r = _open_path();
7283 if (r < 0)
7284 return r;
7285 r = _open_fsid(false);
7286 if (r < 0)
7287 goto out_path;
7288
7289 r = _read_fsid(&fsid);
7290 if (r < 0)
7291 goto out_fsid;
7292
7293 r = _lock_fsid();
7294 if (r < 0)
7295 goto out_fsid;
7296
7297 r = _open_bdev(false);
7298 if (r < 0)
7299 goto out_fsid;
7300
7301 if (open_db) {
7302 r = _open_db_and_around(false);
7303 } else {
7304 // we can bypass the full db open only in kv_only mode
7305 ceph_assert(kv_only);
7306 r = _open_db(false, true);
7307 }
7308 if (r < 0) {
7309 goto out_bdev;
7310 }
7311
7312 if (kv_only)
7313 return 0;
7314
7315 r = _upgrade_super();
7316 if (r < 0) {
7317 goto out_db;
7318 }
7319
7320 r = _open_collections();
7321 if (r < 0)
7322 goto out_db;
7323
7324 r = _reload_logger();
7325 if (r < 0)
7326 goto out_coll;
7327
7328 _kv_start();
7329
7330 r = _deferred_replay();
7331 if (r < 0)
7332 goto out_stop;
7333
7334 mempool_thread.init();
7335
7336 if ((!per_pool_stat_collection || !per_pool_omap) &&
7337 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
7338
7339 bool was_per_pool_omap = per_pool_omap;
7340
7341 dout(1) << __func__ << " quick-fix on mount" << dendl;
7342 _fsck_on_open(FSCK_SHALLOW, true);
7343
7344 //reread statfs
7345 //FIXME minor: replace with actual open/close?
7346 _open_statfs();
7347 _check_legacy_statfs_alert();
7348
7349 //set again as hopefully it has been fixed
7350 if (!was_per_pool_omap) {
7351 _set_per_pool_omap();
7352 }
7353 }
7354
7355 mounted = true;
7356 return 0;
7357
7358 out_stop:
7359 _kv_stop();
7360 out_coll:
7361 _shutdown_cache();
7362 out_db:
7363 _close_db_and_around(false);
7364 out_bdev:
7365 _close_bdev();
7366 out_fsid:
7367 _close_fsid();
7368 out_path:
7369 _close_path();
7370 return r;
7371 }
7372
7373 int BlueStore::umount()
7374 {
7375 ceph_assert(_kv_only || mounted);
7376 dout(1) << __func__ << dendl;
7377
7378 _osr_drain_all();
7379
7380 mounted = false;
7381 if (!_kv_only) {
7382 mempool_thread.shutdown();
7383 dout(20) << __func__ << " stopping kv thread" << dendl;
7384 _kv_stop();
7385 _shutdown_cache();
7386 dout(20) << __func__ << " closing" << dendl;
7387
7388 }
7389 _close_db_and_around(false);
7390 _close_bdev();
7391 _close_fsid();
7392 _close_path();
7393
7394 if (cct->_conf->bluestore_fsck_on_umount) {
7395 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7396 if (rc < 0)
7397 return rc;
7398 if (rc > 0) {
7399 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7400 return -EIO;
7401 }
7402 }
7403 return 0;
7404 }
7405
7406 int BlueStore::cold_open()
7407 {
7408 int r = _open_path();
7409 if (r < 0)
7410 return r;
7411 r = _open_fsid(false);
7412 if (r < 0)
7413 goto out_path;
7414
7415 r = _read_fsid(&fsid);
7416 if (r < 0)
7417 goto out_fsid;
7418
7419 r = _lock_fsid();
7420 if (r < 0)
7421 goto out_fsid;
7422
7423 r = _open_bdev(false);
7424 if (r < 0)
7425 goto out_fsid;
7426 r = _open_db_and_around(true);
7427 if (r < 0) {
7428 goto out_bdev;
7429 }
7430 return 0;
7431 out_bdev:
7432 _close_bdev();
7433 out_fsid:
7434 _close_fsid();
7435 out_path:
7436 _close_path();
7437 return r;
7438 }
7439 int BlueStore::cold_close()
7440 {
7441 _close_db_and_around(true);
7442 _close_bdev();
7443 _close_fsid();
7444 _close_path();
7445 return 0;
7446 }
7447
7448 // derr wrapper to limit enormous output and avoid log flooding.
7449 // For now of limited use, applied only where such output is expected.
7450 #define fsck_derr(err_cnt, threshold) \
7451 if (err_cnt <= threshold) { \
7452 bool need_skip_print = err_cnt == threshold; \
7453 derr
7454
7455 #define fsck_dendl \
7456 dendl; \
7457 if (need_skip_print) \
7458 derr << "more error lines skipped..." << dendl; \
7459 }
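// Intended pairing (illustrative; err_cnt and threshold are whatever the
// caller tracks):
//
//   fsck_derr(err_cnt, threshold)
//     << "fsck error: ..." << details
//     << fsck_dendl;
//
// fsck_derr opens an if-block and starts a derr statement; fsck_dendl
// terminates it and, exactly when err_cnt hits the threshold, emits one
// final "more error lines skipped..." line before closing the block.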
7460
7461 int _fsck_sum_extents(
7462 const PExtentVector& extents,
7463 bool compressed,
7464 store_statfs_t& expected_statfs)
7465 {
7466 for (auto e : extents) {
7467 if (!e.is_valid())
7468 continue;
7469 expected_statfs.allocated += e.length;
7470 if (compressed) {
7471 expected_statfs.data_compressed_allocated += e.length;
7472 }
7473 }
7474 return 0;
7475 }
7476
7477 int BlueStore::_fsck_check_extents(
7478 const coll_t& cid,
7479 const ghobject_t& oid,
7480 const PExtentVector& extents,
7481 bool compressed,
7482 mempool_dynamic_bitset &used_blocks,
7483 uint64_t granularity,
7484 BlueStoreRepairer* repairer,
7485 store_statfs_t& expected_statfs,
7486 FSCKDepth depth)
7487 {
7488 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
7489 int errors = 0;
7490 for (auto e : extents) {
7491 if (!e.is_valid())
7492 continue;
7493 expected_statfs.allocated += e.length;
7494 if (compressed) {
7495 expected_statfs.data_compressed_allocated += e.length;
7496 }
7497 if (depth != FSCK_SHALLOW) {
7498 bool already = false;
7499 apply_for_bitset_range(
7500 e.offset, e.length, granularity, used_blocks,
7501 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
7502 if (bs.test(pos)) {
7503 if (repairer) {
7504 repairer->note_misreference(
7505 pos * min_alloc_size, min_alloc_size, !already);
7506 }
7507 if (!already) {
7508 derr << "fsck error: " << oid << " extent " << e
7509 << " or a subset is already allocated (misreferenced)" << dendl;
7510 ++errors;
7511 already = true;
7512 }
7513 }
7514 else
7515 bs.set(pos);
7516 });
7517 if (repairer) {
7518 repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid);
7519 }
7520
7521 if (e.end() > bdev->get_size()) {
7522 derr << "fsck error: " << oid << " extent " << e
7523 << " past end of block device" << dendl;
7524 ++errors;
7525 }
7526 }
7527 }
7528 return errors;
7529 }
7530
7531 void BlueStore::_fsck_check_pool_statfs(
7532 BlueStore::per_pool_statfs& expected_pool_statfs,
7533 int64_t& errors,
7534 int64_t& warnings,
7535 BlueStoreRepairer* repairer)
7536 {
7537 auto it = db->get_iterator(PREFIX_STAT);
7538 if (it) {
7539 for (it->lower_bound(string()); it->valid(); it->next()) {
7540 string key = it->key();
7541 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7542 if (repairer) {
7543 ++errors;
7544 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7545 derr << "fsck error: " << "legacy statfs record found, removing"
7546 << dendl;
7547 }
7548 continue;
7549 }
7550 uint64_t pool_id;
7551 if (get_key_pool_stat(key, &pool_id) < 0) {
7552 derr << "fsck error: bad key " << key
7553 << "in statfs namespece" << dendl;
7554 if (repairer) {
7555 repairer->remove_key(db, PREFIX_STAT, key);
7556 }
7557 ++errors;
7558 continue;
7559 }
7560
7561 volatile_statfs vstatfs;
7562 bufferlist bl = it->value();
7563 auto blp = bl.cbegin();
7564 try {
7565 vstatfs.decode(blp);
7566 } catch (buffer::error& e) {
7567 derr << "fsck error: failed to decode Pool StatFS record"
7568 << pretty_binary_string(key) << dendl;
7569 if (repairer) {
7570 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7571 << pretty_binary_string(key)
7572 << "', removing" << dendl;
7573 repairer->remove_key(db, PREFIX_STAT, key);
7574 }
7575 ++errors;
7576 vstatfs.reset();
7577 }
7578 auto stat_it = expected_pool_statfs.find(pool_id);
7579 if (stat_it == expected_pool_statfs.end()) {
7580 if (vstatfs.is_empty()) {
7581 // we don't consider that as an error since empty pool statfs
7582 // are left in DB for now
7583 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
7584 << std::hex << pool_id << std::dec << dendl;
7585 if (repairer) {
7586 // but we need to increment error count in case of repair
7587 // to have proper counters at the end
7588 // (as repairer increments recovery counter anyway).
7589 ++errors;
7590 }
7591 } else {
7592 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7593 << std::hex << pool_id << std::dec << dendl;
7594 ++errors;
7595 }
7596 if (repairer) {
7597 repairer->remove_key(db, PREFIX_STAT, key);
7598 }
7599 continue;
7600 }
7601 store_statfs_t statfs;
7602 vstatfs.publish(&statfs);
7603 if (!(stat_it->second == statfs)) {
7604 derr << "fsck error: actual " << statfs
7605 << " != expected " << stat_it->second
7606 << " for pool "
7607 << std::hex << pool_id << std::dec << dendl;
7608 if (repairer) {
7609 repairer->fix_statfs(db, key, stat_it->second);
7610 }
7611 ++errors;
7612 }
7613 expected_pool_statfs.erase(stat_it);
7614 }
7615 } // if (it)
7616 for (auto& s : expected_pool_statfs) {
7617 if (s.second.is_zero()) {
7618 // we might lack empty statfs recs in DB
7619 continue;
7620 }
7621 derr << "fsck error: missing Pool StatFS record for pool "
7622 << std::hex << s.first << std::dec << dendl;
7623 if (repairer) {
7624 string key;
7625 get_pool_stat_key(s.first, &key);
7626 repairer->fix_statfs(db, key, s.second);
7627 }
7628 ++errors;
7629 }
7630 if (!per_pool_stat_collection &&
7631 repairer) {
7632 // by virtue of running this method, we correct the top-level
7633 // error of having global stats
7634 repairer->inc_repaired();
7635 }
7636 }
7637
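// Decodes and checks a single onode (shards, lextents, blobs and, unless
// shallow, per-extent allocations and omap) while accumulating statistics
// into the supplied FSCK_ObjectCtx; shared by both the single-threaded and
// the shallow multi-threaded fsck paths.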
7638 BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
7639 BlueStore::FSCKDepth depth,
7640 int64_t pool_id,
7641 BlueStore::CollectionRef c,
7642 const ghobject_t& oid,
7643 const string& key,
7644 const bufferlist& value,
7645 mempool::bluestore_fsck::list<string>* expecting_shards,
7646 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
7647 const BlueStore::FSCK_ObjectCtx& ctx)
7648 {
7649 auto& errors = ctx.errors;
7650 auto& num_objects = ctx.num_objects;
7651 auto& num_extents = ctx.num_extents;
7652 auto& num_blobs = ctx.num_blobs;
7653 auto& num_sharded_objects = ctx.num_sharded_objects;
7654 auto& num_spanning_blobs = ctx.num_spanning_blobs;
7655 auto used_blocks = ctx.used_blocks;
7656 auto sb_info_lock = ctx.sb_info_lock;
7657 auto& sb_info = ctx.sb_info;
7658 auto repairer = ctx.repairer;
7659
7660 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
7661 &ctx.expected_pool_statfs[pool_id] :
7662 &ctx.expected_store_statfs;
7663
7664 dout(10) << __func__ << " " << oid << dendl;
7665 OnodeRef o;
7666 o.reset(Onode::decode(c, oid, key, value));
7667 ++num_objects;
7668
7669 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7670
7671 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7672 _dump_onode<30>(cct, *o);
7673 // shards
7674 if (!o->extent_map.shards.empty()) {
7675 ++num_sharded_objects;
7676 if (depth != FSCK_SHALLOW) {
7677 ceph_assert(expecting_shards);
7678 for (auto& s : o->extent_map.shards) {
7679 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
7680 expecting_shards->push_back(string());
7681 get_extent_shard_key(o->key, s.shard_info->offset,
7682 &expecting_shards->back());
7683 if (s.shard_info->offset >= o->onode.size) {
7684 derr << "fsck error: " << oid << " shard 0x" << std::hex
7685 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7686 << std::dec << dendl;
7687 ++errors;
7688 }
7689 }
7690 }
7691 }
7692
7693 // lextents
7694 uint64_t pos = 0;
7695 mempool::bluestore_fsck::map<BlobRef,
7696 bluestore_blob_use_tracker_t> ref_map;
7697 for (auto& l : o->extent_map.extent_map) {
7698 dout(20) << __func__ << " " << l << dendl;
7699 if (l.logical_offset < pos) {
7700 derr << "fsck error: " << oid << " lextent at 0x"
7701 << std::hex << l.logical_offset
7702 << " overlaps with the previous, which ends at 0x" << pos
7703 << std::dec << dendl;
7704 ++errors;
7705 }
7706 if (depth != FSCK_SHALLOW &&
7707 o->extent_map.spans_shard(l.logical_offset, l.length)) {
7708 derr << "fsck error: " << oid << " lextent at 0x"
7709 << std::hex << l.logical_offset << "~" << l.length
7710 << " spans a shard boundary"
7711 << std::dec << dendl;
7712 ++errors;
7713 }
7714 pos = l.logical_offset + l.length;
7715 res_statfs->data_stored += l.length;
7716 ceph_assert(l.blob);
7717 const bluestore_blob_t& blob = l.blob->get_blob();
7718
7719 auto& ref = ref_map[l.blob];
7720 if (ref.is_empty()) {
7721 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7722 uint32_t l = blob.get_logical_length();
7723 ref.init(l, min_release_size);
7724 }
7725 ref.get(
7726 l.blob_offset,
7727 l.length);
7728 ++num_extents;
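// Record which 'unused' bitmap bits are actually covered by this lextent;
// the result is compared against blob.unused later (see _fsck_check_objects)
// to catch regions marked unused that are in fact referenced. Each bit of
// unused_t covers blob_len / (8 * sizeof(unused_t)) bytes of the blob.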
7729 if (depth != FSCK_SHALLOW &&
7730 blob.has_unused()) {
7731 ceph_assert(referenced);
7732 auto p = referenced->find(l.blob);
7733 bluestore_blob_t::unused_t* pu;
7734 if (p == referenced->end()) {
7735 pu = &(*referenced)[l.blob];
7736 }
7737 else {
7738 pu = &p->second;
7739 }
7740 uint64_t blob_len = blob.get_logical_length();
7741 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
7742 ceph_assert(l.blob_offset + l.length <= blob_len);
7743 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
7744 uint64_t start = l.blob_offset / chunk_size;
7745 uint64_t end =
7746 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7747 for (auto i = start; i < end; ++i) {
7748 (*pu) |= (1u << i);
7749 }
7750 }
7751 } //for (auto& l : o->extent_map.extent_map)
7752
7753 for (auto& i : ref_map) {
7754 ++num_blobs;
7755 const bluestore_blob_t& blob = i.first->get_blob();
7756 bool equal =
7757 depth == FSCK_SHALLOW ? true :
7758 i.first->get_blob_use_tracker().equal(i.second);
7759 if (!equal) {
7760 derr << "fsck error: " << oid << " blob " << *i.first
7761 << " doesn't match expected ref_map " << i.second << dendl;
7762 ++errors;
7763 }
7764 if (blob.is_compressed()) {
7765 res_statfs->data_compressed += blob.get_compressed_payload_length();
7766 res_statfs->data_compressed_original +=
7767 i.first->get_referenced_bytes();
7768 }
7769 if (blob.is_shared()) {
7770 if (i.first->shared_blob->get_sbid() > blobid_max) {
7771 derr << "fsck error: " << oid << " blob " << blob
7772 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7773 << blobid_max << dendl;
7774 ++errors;
7775 }
7776 else if (i.first->shared_blob->get_sbid() == 0) {
7777 derr << "fsck error: " << oid << " blob " << blob
7778 << " marked as shared but has uninitialized sbid"
7779 << dendl;
7780 ++errors;
7781 }
7782 // the below lock is optional and provided in multithreading mode only
7783 if (sb_info_lock) {
7784 sb_info_lock->lock();
7785 }
7786 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
7787 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7788 ceph_assert(sbi.pool_id == INT64_MIN ||
7789 sbi.pool_id == oid.hobj.get_logical_pool());
7790 sbi.cid = c->cid;
7791 sbi.pool_id = oid.hobj.get_logical_pool();
7792 sbi.sb = i.first->shared_blob;
7793 sbi.oids.push_back(oid);
7794 sbi.compressed = blob.is_compressed();
7795 for (auto e : blob.get_extents()) {
7796 if (e.is_valid()) {
7797 sbi.ref_map.get(e.offset, e.length);
7798 }
7799 }
7800 if (sb_info_lock) {
7801 sb_info_lock->unlock();
7802 }
7803 } else if (depth != FSCK_SHALLOW) {
7804 ceph_assert(used_blocks);
7805 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7806 blob.is_compressed(),
7807 *used_blocks,
7808 fm->get_alloc_size(),
7809 repairer,
7810 *res_statfs,
7811 depth);
7812 } else {
7813 errors += _fsck_sum_extents(
7814 blob.get_extents(),
7815 blob.is_compressed(),
7816 *res_statfs);
7817 }
7818 } // for (auto& i : ref_map)
7819
7820 {
7821 auto &sbm = o->extent_map.spanning_blob_map;
7822 size_t broken = 0;
7823 BlobRef first_broken;
7824 for (auto it = sbm.begin(); it != sbm.end();) {
7825 auto it1 = it++;
7826 if (ref_map.count(it1->second) == 0) {
7827 if (!broken) {
7828 first_broken = it1->second;
7829 ++errors;
7830 }
7831 broken++;
7832 if (repairer) {
7833 sbm.erase(it1);
7834 }
7835 }
7836 }
7837 if (broken) {
7838 derr << "fsck error: " << oid << " - " << broken
7839 << " zombie spanning blob(s) found, the first one: "
7840 << *first_broken << dendl;
7841 if(repairer) {
7842 auto txn = repairer->fix_spanning_blobs(db);
7843 _record_onode(o, txn);
7844 }
7845 }
7846 }
7847
7848 if (o->onode.has_omap()) {
7849 _fsck_check_object_omap(depth, o, ctx);
7850 }
7851
7852 return o;
7853 }
7854
7855 #include "common/WorkQueue.h"
7856
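// Thread pool used to parallelize the shallow fsck pass. The main fsck
// thread queues onode records into per-batch slots of FSCKWorkQueue;
// worker threads dequeue whole batches and run fsck_check_objects_shallow
// on every entry, and finalize() folds the per-batch counters back into
// the caller's FSCK_ObjectCtx.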
7857 class ShallowFSCKThreadPool : public ThreadPool
7858 {
7859 public:
7860 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
7861 ThreadPool(cct_, nm, tn, n) {
7862 }
7863 void worker(ThreadPool::WorkThread* wt) override {
7864 int next_wq = 0;
7865 while (!_stop) {
7866 next_wq %= work_queues.size();
7867 WorkQueue_ *wq = work_queues[next_wq++];
7868
7869 void* item = wq->_void_dequeue();
7870 if (item) {
7871 processing++;
7872 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
7873 wq->_void_process(item, tp_handle);
7874 processing--;
7875 }
7876 }
7877 }
7878 template <size_t BatchLen>
7879 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
7880 {
7881 struct Entry {
7882 int64_t pool_id;
7883 BlueStore::CollectionRef c;
7884 ghobject_t oid;
7885 string key;
7886 bufferlist value;
7887 };
7888 struct Batch {
7889 std::atomic<size_t> running = { 0 };
7890 size_t entry_count = 0;
7891 std::array<Entry, BatchLen> entries;
7892
7893 int64_t errors = 0;
7894 int64_t warnings = 0;
7895 uint64_t num_objects = 0;
7896 uint64_t num_extents = 0;
7897 uint64_t num_blobs = 0;
7898 uint64_t num_sharded_objects = 0;
7899 uint64_t num_spanning_blobs = 0;
7900 store_statfs_t expected_store_statfs;
7901 BlueStore::per_pool_statfs expected_pool_statfs;
7902 };
7903
7904 size_t batchCount;
7905 BlueStore* store = nullptr;
7906
7907 ceph::mutex* sb_info_lock = nullptr;
7908 BlueStore::sb_info_map_t* sb_info = nullptr;
7909 BlueStoreRepairer* repairer = nullptr;
7910
7911 Batch* batches = nullptr;
7912 size_t last_batch_pos = 0;
7913 bool batch_acquired = false;
7914
7915 FSCKWorkQueue(std::string n,
7916 size_t _batchCount,
7917 BlueStore* _store,
7918 ceph::mutex* _sb_info_lock,
7919 BlueStore::sb_info_map_t& _sb_info,
7920 BlueStoreRepairer* _repairer) :
7921 WorkQueue_(n, time_t(), time_t()),
7922 batchCount(_batchCount),
7923 store(_store),
7924 sb_info_lock(_sb_info_lock),
7925 sb_info(&_sb_info),
7926 repairer(_repairer)
7927 {
7928 batches = new Batch[batchCount];
7929 }
7930 ~FSCKWorkQueue() {
7931 delete[] batches;
7932 }
7933
7934 /// Remove all work items from the queue.
7935 void _clear() override {
7936 //do nothing
7937 }
7938 /// Check whether there is anything to do.
7939 bool _empty() override {
7940 ceph_assert(false);
7941 }
7942
7943 /// Get the next work item to process.
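/// Starting from a random slot, tries to claim an idle batch (running
/// transitions 0 -> 1) that already has queued entries; otherwise the
/// increment is rolled back and the next slot is probed. Returns nullptr
/// if no claimable batch is found after a full sweep.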
7944 void* _void_dequeue() override {
7945 size_t pos = rand() % batchCount;
7946 size_t pos0 = pos;
7947 do {
7948 auto& batch = batches[pos];
7949 if (batch.running.fetch_add(1) == 0) {
7950 if (batch.entry_count) {
7951 return &batch;
7952 }
7953 }
7954 batch.running--;
7955 pos++;
7956 pos %= batchCount;
7957 } while (pos != pos0);
7958 return nullptr;
7959 }
7960 /** @brief Process the work item.
7961 * This function will be called several times in parallel
7962 * and must therefore be thread-safe. */
7963 void _void_process(void* item, TPHandle& handle) override {
7964 Batch* batch = (Batch*)item;
7965
7966 BlueStore::FSCK_ObjectCtx ctx(
7967 batch->errors,
7968 batch->warnings,
7969 batch->num_objects,
7970 batch->num_extents,
7971 batch->num_blobs,
7972 batch->num_sharded_objects,
7973 batch->num_spanning_blobs,
7974 nullptr, // used_blocks
7975 nullptr, //used_omap_head
7976 sb_info_lock,
7977 *sb_info,
7978 batch->expected_store_statfs,
7979 batch->expected_pool_statfs,
7980 repairer);
7981
7982 for (size_t i = 0; i < batch->entry_count; i++) {
7983 auto& entry = batch->entries[i];
7984
7985 store->fsck_check_objects_shallow(
7986 BlueStore::FSCK_SHALLOW,
7987 entry.pool_id,
7988 entry.c,
7989 entry.oid,
7990 entry.key,
7991 entry.value,
7992 nullptr, // expecting_shards - this will need a protection if passed
7993 nullptr, // referenced
7994 ctx);
7995 }
7996 //std::cout << "processed " << batch << std::endl;
7997 batch->entry_count = 0;
7998 batch->running--;
7999 }
8000 /** @brief Synchronously finish processing a work item.
8001 * This function is called after _void_process with the global thread pool lock held,
8002 * so at most one copy will execute simultaneously for a given thread pool.
8003 * It can be used for non-thread-safe finalization. */
8004 void _void_process_finish(void*) override {
8005 ceph_assert(false);
8006 }
8007
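/// Producer-side call (single fsck iteration thread): stages an entry into
/// the currently acquired batch, claiming a free one first if needed, and
/// releases the batch to the workers once it holds BatchLen entries.
/// Returns false if no batch could be acquired.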
8008 bool queue(
8009 int64_t pool_id,
8010 BlueStore::CollectionRef c,
8011 const ghobject_t& oid,
8012 const string& key,
8013 const bufferlist& value) {
8014 bool res = false;
8015 size_t pos0 = last_batch_pos;
8016 if (!batch_acquired) {
8017 do {
8018 auto& batch = batches[last_batch_pos];
8019 if (batch.running.fetch_add(1) == 0) {
8020 if (batch.entry_count < BatchLen) {
8021 batch_acquired = true;
8022 break;
8023 }
8024 }
8025 batch.running.fetch_sub(1);
8026 last_batch_pos++;
8027 last_batch_pos %= batchCount;
8028 } while (last_batch_pos != pos0);
8029 }
8030 if (batch_acquired) {
8031 auto& batch = batches[last_batch_pos];
8032 ceph_assert(batch.running);
8033 ceph_assert(batch.entry_count < BatchLen);
8034
8035 auto& entry = batch.entries[batch.entry_count];
8036 entry.pool_id = pool_id;
8037 entry.c = c;
8038 entry.oid = oid;
8039 entry.key = key;
8040 entry.value = value;
8041
8042 ++batch.entry_count;
8043 if (batch.entry_count == BatchLen) {
8044 batch_acquired = false;
8045 batch.running.fetch_sub(1);
8046 last_batch_pos++;
8047 last_batch_pos %= batchCount;
8048 }
8049 res = true;
8050 }
8051 return res;
8052 }
8053
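/// Releases any partially filled batch, stops the thread pool, processes
/// leftover entries inline and merges all per-batch error/statistics
/// counters (including per-pool statfs) into the caller's ctx.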
8054 void finalize(ThreadPool& tp,
8055 BlueStore::FSCK_ObjectCtx& ctx) {
8056 if (batch_acquired) {
8057 auto& batch = batches[last_batch_pos];
8058 ceph_assert(batch.running);
8059 batch.running.fetch_sub(1);
8060 }
8061 tp.stop();
8062
8063 for (size_t i = 0; i < batchCount; i++) {
8064 auto& batch = batches[i];
8065
8066 //process leftovers if any
8067 if (batch.entry_count) {
8068 TPHandle tp_handle(store->cct,
8069 nullptr,
8070 timeout_interval,
8071 suicide_interval);
8072 ceph_assert(batch.running == 0);
8073
8074 batch.running++; // just to be on-par with the regular call
8075 _void_process(&batch, tp_handle);
8076 }
8077 ceph_assert(batch.entry_count == 0);
8078
8079 ctx.errors += batch.errors;
8080 ctx.warnings += batch.warnings;
8081 ctx.num_objects += batch.num_objects;
8082 ctx.num_extents += batch.num_extents;
8083 ctx.num_blobs += batch.num_blobs;
8084 ctx.num_sharded_objects += batch.num_sharded_objects;
8085 ctx.num_spanning_blobs += batch.num_spanning_blobs;
8086
8087 ctx.expected_store_statfs.add(batch.expected_store_statfs);
8088
8089 for (auto it = batch.expected_pool_statfs.begin();
8090 it != batch.expected_pool_statfs.end();
8091 it++) {
8092 ctx.expected_pool_statfs[it->first].add(it->second);
8093 }
8094 }
8095 }
8096 };
8097 };
8098
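// Flags objects whose omap is neither per-pool nor pgmeta (error or warning
// depending on configuration and whether per_pool_omap is already enabled);
// in repair mode it also migrates such omaps to the per-pool prefix within a
// single synchronous transaction.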
8099 void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
8100 OnodeRef& o,
8101 const BlueStore::FSCK_ObjectCtx& ctx)
8102 {
8103 auto& errors = ctx.errors;
8104 auto& warnings = ctx.warnings;
8105 auto repairer = ctx.repairer;
8106
8107 ceph_assert(o->onode.has_omap());
8108 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
8109 if (per_pool_omap) {
8110 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8111 << "fsck error: " << o->oid
8112 << " has omap that is not per-pool or pgmeta"
8113 << fsck_dendl;
8114 ++errors;
8115 } else {
8116 const char* w;
8117 int64_t num;
8118 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8119 ++errors;
8120 num = errors;
8121 w = "error";
8122 } else {
8123 ++warnings;
8124 num = warnings;
8125 w = "warning";
8126 }
8127 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8128 << "fsck " << w << ": " << o->oid
8129 << " has omap that is not per-pool or pgmeta"
8130 << fsck_dendl;
8131 }
8132 }
8133 if (repairer &&
8134 !o->onode.is_perpool_omap() &&
8135 !o->onode.is_pgmeta_omap()) {
8136 dout(10) << "fsck converting " << o->oid << " omap to per-pool" << dendl;
8137 bufferlist h;
8138 map<string, bufferlist> kv;
8139 int r = _onode_omap_get(o, &h, &kv);
8140 if (r < 0) {
8141 derr << " got " << r << " " << cpp_strerror(r) << dendl;
8142 } else {
8143 KeyValueDB::Transaction txn = db->get_transaction();
8144 // remove old keys
8145 const string& old_omap_prefix = o->get_omap_prefix();
8146 string old_head, old_tail;
8147 o->get_omap_header(&old_head);
8148 o->get_omap_tail(&old_tail);
8149 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
8150 txn->rmkey(old_omap_prefix, old_tail);
8151 // set flag
8152 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
8153 _record_onode(o, txn);
8154 const string& new_omap_prefix = o->get_omap_prefix();
8155 // head
8156 if (h.length()) {
8157 string new_head;
8158 o->get_omap_header(&new_head);
8159 txn->set(new_omap_prefix, new_head, h);
8160 }
8161 // tail
8162 string new_tail;
8163 o->get_omap_tail(&new_tail);
8164 bufferlist empty;
8165 txn->set(new_omap_prefix, new_tail, empty);
8166 // values
8167 string final_key;
8168 o->get_omap_key(string(), &final_key);
8169 size_t base_key_len = final_key.size();
8170 for (auto& i : kv) {
8171 final_key.resize(base_key_len);
8172 final_key += i.first;
8173 txn->set(new_omap_prefix, final_key, i.second);
8174 }
8175 db->submit_transaction_sync(txn);
8176 repairer->inc_repaired();
8177 }
8178 }
8179 }
8180
8181 void BlueStore::_fsck_check_objects(FSCKDepth depth,
8182 BlueStore::FSCK_ObjectCtx& ctx)
8183 {
8184 auto& errors = ctx.errors;
8185 auto sb_info_lock = ctx.sb_info_lock;
8186 auto& sb_info = ctx.sb_info;
8187 auto repairer = ctx.repairer;
8188
8189 uint64_t_btree_t used_nids;
8190
8191 size_t processed_myself = 0;
8192
8193 auto it = db->get_iterator(PREFIX_OBJ);
8194 mempool::bluestore_fsck::list<string> expecting_shards;
8195 if (it) {
8196 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
8197 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
8198 std::unique_ptr<WQ> wq(
8199 new WQ(
8200 "FSCKWorkQueue",
8201 (thread_count ? : 1) * 32,
8202 this,
8203 sb_info_lock,
8204 sb_info,
8205 repairer));
8206
8207 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
8208
8209 thread_pool.add_work_queue(wq.get());
8210 if (depth == FSCK_SHALLOW && thread_count > 0) {
8211 //not the best place but let's check anyway
8212 ceph_assert(sb_info_lock);
8213 thread_pool.start();
8214 }
8215
8216 // fill global if not overridden below
8217 CollectionRef c;
8218 int64_t pool_id = -1;
8219 spg_t pgid;
8220 for (it->lower_bound(string()); it->valid(); it->next()) {
8221 dout(30) << __func__ << " key "
8222 << pretty_binary_string(it->key()) << dendl;
8223 if (is_extent_shard_key(it->key())) {
8224 if (depth == FSCK_SHALLOW) {
8225 continue;
8226 }
8227 while (!expecting_shards.empty() &&
8228 expecting_shards.front() < it->key()) {
8229 derr << "fsck error: missing shard key "
8230 << pretty_binary_string(expecting_shards.front())
8231 << dendl;
8232 ++errors;
8233 expecting_shards.pop_front();
8234 }
8235 if (!expecting_shards.empty() &&
8236 expecting_shards.front() == it->key()) {
8237 // all good
8238 expecting_shards.pop_front();
8239 continue;
8240 }
8241
8242 uint32_t offset;
8243 string okey;
8244 get_key_extent_shard(it->key(), &okey, &offset);
8245 derr << "fsck error: stray shard 0x" << std::hex << offset
8246 << std::dec << dendl;
8247 if (expecting_shards.empty()) {
8248 derr << "fsck error: " << pretty_binary_string(it->key())
8249 << " is unexpected" << dendl;
8250 ++errors;
8251 continue;
8252 }
8253 while (expecting_shards.front() > it->key()) {
8254 derr << "fsck error: saw " << pretty_binary_string(it->key())
8255 << dendl;
8256 derr << "fsck error: exp "
8257 << pretty_binary_string(expecting_shards.front()) << dendl;
8258 ++errors;
8259 expecting_shards.pop_front();
8260 if (expecting_shards.empty()) {
8261 break;
8262 }
8263 }
8264 continue;
8265 }
8266
8267 ghobject_t oid;
8268 int r = get_key_object(it->key(), &oid);
8269 if (r < 0) {
8270 derr << "fsck error: bad object key "
8271 << pretty_binary_string(it->key()) << dendl;
8272 ++errors;
8273 continue;
8274 }
8275 if (!c ||
8276 oid.shard_id != pgid.shard ||
8277 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8278 !c->contains(oid)) {
8279 c = nullptr;
8280 for (auto& p : coll_map) {
8281 if (p.second->contains(oid)) {
8282 c = p.second;
8283 break;
8284 }
8285 }
8286 if (!c) {
8287 derr << "fsck error: stray object " << oid
8288 << " not owned by any collection" << dendl;
8289 ++errors;
8290 continue;
8291 }
8292 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8293 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8294 << dendl;
8295 }
8296
8297 if (depth != FSCK_SHALLOW &&
8298 !expecting_shards.empty()) {
8299 for (auto& k : expecting_shards) {
8300 derr << "fsck error: missing shard key "
8301 << pretty_binary_string(k) << dendl;
8302 }
8303 ++errors;
8304 expecting_shards.clear();
8305 }
8306
8307 bool queued = false;
8308 if (depth == FSCK_SHALLOW && thread_count > 0) {
8309 queued = wq->queue(
8310 pool_id,
8311 c,
8312 oid,
8313 it->key(),
8314 it->value());
8315 }
8316 OnodeRef o;
8317 map<BlobRef, bluestore_blob_t::unused_t> referenced;
8318
8319 if (!queued) {
8320 ++processed_myself;
8321
8322 o = fsck_check_objects_shallow(
8323 depth,
8324 pool_id,
8325 c,
8326 oid,
8327 it->key(),
8328 it->value(),
8329 &expecting_shards,
8330 &referenced,
8331 ctx);
8332 }
8333
8334 if (depth != FSCK_SHALLOW) {
8335 ceph_assert(o != nullptr);
8336 if (o->onode.nid) {
8337 if (o->onode.nid > nid_max) {
8338 derr << "fsck error: " << oid << " nid " << o->onode.nid
8339 << " > nid_max " << nid_max << dendl;
8340 ++errors;
8341 }
8342 if (used_nids.count(o->onode.nid)) {
8343 derr << "fsck error: " << oid << " nid " << o->onode.nid
8344 << " already in use" << dendl;
8345 ++errors;
8346 continue; // go for next object
8347 }
8348 used_nids.insert(o->onode.nid);
8349 }
8350 for (auto& i : referenced) {
8351 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8352 << std::dec << " for " << *i.first << dendl;
8353 const bluestore_blob_t& blob = i.first->get_blob();
8354 if (i.second & blob.unused) {
8355 derr << "fsck error: " << oid << " blob claims unused 0x"
8356 << std::hex << blob.unused
8357 << " but extents reference 0x" << i.second << std::dec
8358 << " on blob " << *i.first << dendl;
8359 ++errors;
8360 }
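// Cross-check checksums against the 'unused' bitmap: a csum chunk whose
// range is fully covered by unused bits should carry a zero checksum;
// unused_chunk_size is the number of bytes covered by a single unused bit.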
8361 if (blob.has_csum()) {
8362 uint64_t blob_len = blob.get_logical_length();
8363 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8364 unsigned csum_count = blob.get_csum_count();
8365 unsigned csum_chunk_size = blob.get_csum_chunk_size();
8366 for (unsigned p = 0; p < csum_count; ++p) {
8367 unsigned pos = p * csum_chunk_size;
8368 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8369 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8370 unsigned mask = 1u << firstbit;
8371 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8372 mask |= 1u << b;
8373 }
8374 if ((blob.unused & mask) == mask) {
8375 // this csum chunk region is marked unused
8376 if (blob.get_csum_item(p) != 0) {
8377 derr << "fsck error: " << oid
8378 << " blob claims csum chunk 0x" << std::hex << pos
8379 << "~" << csum_chunk_size
8380 << " is unused (mask 0x" << mask << " of unused 0x"
8381 << blob.unused << ") but csum is non-zero 0x"
8382 << blob.get_csum_item(p) << std::dec << " on blob "
8383 << *i.first << dendl;
8384 ++errors;
8385 }
8386 }
8387 }
8388 }
8389 }
8390 // omap
8391 if (o->onode.has_omap()) {
8392 ceph_assert(ctx.used_omap_head);
8393 if (ctx.used_omap_head->count(o->onode.nid)) {
8394 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8395 << " already in use" << dendl;
8396 ++errors;
8397 } else {
8398 ctx.used_omap_head->insert(o->onode.nid);
8399 }
8400 } // if (o->onode.has_omap())
8401 if (depth == FSCK_DEEP) {
8402 bufferlist bl;
8403 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8404 uint64_t offset = 0;
8405 do {
8406 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8407 int r = _do_read(c.get(), o, offset, l, bl,
8408 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8409 if (r < 0) {
8410 ++errors;
8411 derr << "fsck error: " << oid << std::hex
8412 << " error during read: "
8413 << " " << offset << "~" << l
8414 << " " << cpp_strerror(r) << std::dec
8415 << dendl;
8416 break;
8417 }
8418 offset += l;
8419 } while (offset < o->onode.size);
8420 } // deep
8421 } //if (depth != FSCK_SHALLOW)
8422 } // for (it->lower_bound(string()); it->valid(); it->next())
8423 if (depth == FSCK_SHALLOW && thread_count > 0) {
8424 wq->finalize(thread_pool, ctx);
8425 if (processed_myself) {
8426 // maybe we need more threads?
8427 dout(0) << __func__ << " partial offload"
8428 << ", done myself " << processed_myself
8429 << " of " << ctx.num_objects
8430 << " objects, threads " << thread_count
8431 << dendl;
8432 }
8433 }
8434 } // if (it)
8435 }
8436 /**
8437 An overview of the currently implemented repair logic,
8438 performed in fsck in two stages: detection (+preparation) and commit.
8439 Detection stage (in processing order):
8440 (Issue -> Repair action to schedule)
8441 - Detect undecodable keys for Shared Blobs -> Remove
8442 - Detect undecodable records for Shared Blobs -> Remove
8443 (might trigger missed Shared Blob detection below)
8444 - Detect stray records for Shared Blobs -> Remove
8445 - Detect misreferenced pextents -> Fix
8446 Prepare Bloom-like filter to track cid/oid -> pextent
8447 Prepare list of extents that are improperly referenced
8448 Enumerate Onode records that might use 'misreferenced' pextents
8449 (Bloom-like filter applied to reduce computation)
8450 For each questionable Onode enumerate all blobs and identify broken ones
8451 (i.e. blobs having 'misreferences')
8452 Rewrite each broken blob data by allocating another extents and
8453 copying data there
8454 If blob is shared - unshare it and mark corresponding Shared Blob
8455 for removal
8456 Release previously allocated space
8457 Update Extent Map
8458 - Detect missed Shared Blobs -> Recreate
8459 - Detect undecodable deferred transaction -> Remove
8460 - Detect Freelist Manager's 'false free' entries -> Mark as used
8461 - Detect Freelist Manager's leaked entries -> Mark as free
8462 - Detect statfs inconsistency -> Update
8463 Commit stage (separate DB commit per each step):
8464 - Apply leaked FM entries fix
8465 - Apply 'false free' FM entries fix
8466 - Apply 'Remove' actions
8467 - Apply fix for misreferenced pextents
8468 - Apply Shared Blob recreate
8469 (can be merged with the step above if misreferences were detected)
8470 - Apply StatFS update
8471 */
8472 int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
8473 {
8474 dout(1) << __func__
8475 << (repair ? " repair" : " check")
8476 << (depth == FSCK_DEEP ? " (deep)" :
8477 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8478 << dendl;
8479
8480 // in deep mode we need R/W write access to be able to replay deferred ops
8481 bool read_only = !(repair || depth == FSCK_DEEP);
8482
8483 int r = _open_path();
8484 if (r < 0)
8485 return r;
8486 r = _open_fsid(false);
8487 if (r < 0)
8488 goto out_path;
8489
8490 r = _read_fsid(&fsid);
8491 if (r < 0)
8492 goto out_fsid;
8493
8494 r = _lock_fsid();
8495 if (r < 0)
8496 goto out_fsid;
8497
8498 r = _open_bdev(false);
8499 if (r < 0)
8500 goto out_fsid;
8501
8502 r = _open_db_and_around(read_only);
8503 if (r < 0)
8504 goto out_bdev;
8505
8506 if (!read_only) {
8507 r = _upgrade_super();
8508 if (r < 0) {
8509 goto out_db;
8510 }
8511 }
8512
8513 r = _open_collections();
8514 if (r < 0)
8515 goto out_db;
8516
8517 mempool_thread.init();
8518
8519 // we need finisher and kv_{sync,finalize}_thread *just* for replay
8520 // enable in repair or deep fsck modes only
8521 if (!read_only) {
8522 _kv_start();
8523 r = _deferred_replay();
8524 _kv_stop();
8525 }
8526 if (r < 0)
8527 goto out_scan;
8528
8529 r = _fsck_on_open(depth, repair);
8530
8531 out_scan:
8532 mempool_thread.shutdown();
8533 _shutdown_cache();
8534 out_db:
8535 _close_db_and_around(false);
8536 out_bdev:
8537 _close_bdev();
8538 out_fsid:
8539 _close_fsid();
8540 out_path:
8541 _close_path();
8542
8543 return r;
8544 }
8545
8546 int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
8547 {
8548 dout(1) << __func__
8549 << " <<<START>>>"
8550 << (repair ? " repair" : " check")
8551 << (depth == FSCK_DEEP ? " (deep)" :
8552 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8553 << " start" << dendl;
8554 int64_t errors = 0;
8555 int64_t warnings = 0;
8556 unsigned repaired = 0;
8557
8558 uint64_t_btree_t used_omap_head;
8559 uint64_t_btree_t used_sbids;
8560
8561 mempool_dynamic_bitset used_blocks;
8562 KeyValueDB::Iterator it;
8563 store_statfs_t expected_store_statfs, actual_statfs;
8564 per_pool_statfs expected_pool_statfs;
8565
8566 sb_info_map_t sb_info;
8567
8568 uint64_t num_objects = 0;
8569 uint64_t num_extents = 0;
8570 uint64_t num_blobs = 0;
8571 uint64_t num_spanning_blobs = 0;
8572 uint64_t num_shared_blobs = 0;
8573 uint64_t num_sharded_objects = 0;
8574 BlueStoreRepairer repairer;
8575
8576 utime_t start = ceph_clock_now();
8577
8578 _fsck_collections(&errors);
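// The label/superblock area (rounded up to at least one allocation unit)
// is never handed out by the allocator, so pre-mark it as used before
// scanning object and bluefs references.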
8579 used_blocks.resize(fm->get_alloc_units());
8580 apply_for_bitset_range(
8581 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
8582 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8583 bs.set(pos);
8584 }
8585 );
8586 if (repair) {
8587 repairer.get_space_usage_tracker().init(
8588 bdev->get_size(),
8589 min_alloc_size);
8590 }
8591
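// Legacy compatibility check: the bluefs_extents set persisted in the
// SUPER namespace should not contain extents that BlueFS no longer owns,
// otherwise a downgrade to an older release could mis-account that space.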
8592 if (bluefs) {
8593 if (cct->_conf->bluestore_bluefs_db_compatibility) {
8594 interval_set<uint64_t> bluefs_extents_db;
8595 bufferlist bl;
8596 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
8597 auto p = bl.cbegin();
8598 auto prev_errors = errors;
8599 try {
8600 decode(bluefs_extents_db, p);
8601 bluefs_extents_db.union_of(bluefs_extents);
8602 bluefs_extents_db.subtract(bluefs_extents);
8603 if (!bluefs_extents_db.empty()) {
8604 derr << "fsck error: bluefs_extents inconsistency, "
8605 << "downgrade to previous releases might be broken."
8606 << dendl;
8607 ++errors;
8608 }
8609 }
8610 catch (buffer::error& e) {
8611 derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
8612 ++errors;
8613 }
8614 if (errors != prev_errors && repair) {
8615 repairer.fix_bluefs_extents(out_of_sync_fm);
8616 }
8617 }
8618
8619 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
8620 apply_for_bitset_range(
8621 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
8622 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8623 bs.set(pos);
8624 });
8625 }
8626 int r = bluefs->fsck();
8627 if (r < 0) {
8628 return r;
8629 }
8630 if (r > 0)
8631 errors += r;
8632 }
8633
8634 if (!per_pool_stat_collection) {
8635 const char *w;
8636 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
8637 w = "error";
8638 ++errors;
8639 } else {
8640 w = "warning";
8641 ++warnings;
8642 }
8643 derr << "fsck " << w << ": store not yet converted to per-pool stats"
8644 << dendl;
8645 }
8646 if (!per_pool_omap) {
8647 const char *w;
8648 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8649 w = "error";
8650 ++errors;
8651 } else {
8652 w = "warning";
8653 ++warnings;
8654 }
8655 derr << "fsck " << w << ": store not yet converted to per-pool omap"
8656 << dendl;
8657 }
8658
8659 // get expected statfs; reset unaffected fields to be able to compare
8660 // structs
8661 statfs(&actual_statfs);
8662 actual_statfs.total = 0;
8663 actual_statfs.internally_reserved = 0;
8664 actual_statfs.available = 0;
8665 actual_statfs.internal_metadata = 0;
8666 actual_statfs.omap_allocated = 0;
8667
8668 if (g_conf()->bluestore_debug_fsck_abort) {
8669 dout(1) << __func__ << " debug abort" << dendl;
8670 goto out_scan;
8671 }
8672 // walk PREFIX_OBJ
8673 {
8674 dout(1) << __func__ << " walking object keyspace" << dendl;
8675 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8676 BlueStore::FSCK_ObjectCtx ctx(
8677 errors,
8678 warnings,
8679 num_objects,
8680 num_extents,
8681 num_blobs,
8682 num_sharded_objects,
8683 num_spanning_blobs,
8684 &used_blocks,
8685 &used_omap_head,
8686 //no need for the below lock when in non-shallow mode as
8687 // there is no multithreading in this case
8688 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
8689 sb_info,
8690 expected_store_statfs,
8691 expected_pool_statfs,
8692 repair ? &repairer : nullptr);
8693
8694 _fsck_check_objects(depth, ctx);
8695 }
8696
8697 dout(1) << __func__ << " checking shared_blobs" << dendl;
8698 it = db->get_iterator(PREFIX_SHARED_BLOB);
8699 if (it) {
8700 // FIXME minor: perhaps simplify for shallow mode?
8701 // fill global if not overridden below
8702 auto expected_statfs = &expected_store_statfs;
8703
8704 for (it->lower_bound(string()); it->valid(); it->next()) {
8705 string key = it->key();
8706 uint64_t sbid;
8707 if (get_key_shared_blob(key, &sbid)) {
8708 derr << "fsck error: bad key '" << key
8709 << "' in shared blob namespace" << dendl;
8710 if (repair) {
8711 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8712 }
8713 ++errors;
8714 continue;
8715 }
8716 auto p = sb_info.find(sbid);
8717 if (p == sb_info.end()) {
8718 derr << "fsck error: found stray shared blob data for sbid 0x"
8719 << std::hex << sbid << std::dec << dendl;
8720 if (repair) {
8721 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8722 }
8723 ++errors;
8724 } else {
8725 ++num_shared_blobs;
8726 sb_info_t& sbi = p->second;
8727 bluestore_shared_blob_t shared_blob(sbid);
8728 bufferlist bl = it->value();
8729 auto blp = bl.cbegin();
8730 try {
8731 decode(shared_blob, blp);
8732 } catch (buffer::error& e) {
8733 ++errors;
8734 // Force update and don't report as missing
8735 sbi.updated = sbi.passed = true;
8736
8737 derr << "fsck error: failed to decode Shared Blob"
8738 << pretty_binary_string(it->key()) << dendl;
8739 if (repair) {
8740 dout(20) << __func__ << " undecodable Shared Blob, key:'"
8741 << pretty_binary_string(it->key())
8742 << "', removing" << dendl;
8743 repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
8744 }
8745 continue;
8746 }
8747 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
8748 if (shared_blob.ref_map != sbi.ref_map) {
8749 derr << "fsck error: shared blob 0x" << std::hex << sbid
8750 << std::dec << " ref_map " << shared_blob.ref_map
8751 << " != expected " << sbi.ref_map << dendl;
8752 sbi.updated = true; // will update later in repair mode only!
8753 ++errors;
8754 }
8755 PExtentVector extents;
8756 for (auto &r : shared_blob.ref_map.ref_map) {
8757 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
8758 }
8759 if (per_pool_stat_collection || repair) {
8760 expected_statfs = &expected_pool_statfs[sbi.pool_id];
8761 }
8762 errors += _fsck_check_extents(sbi.cid,
8763 p->second.oids.front(),
8764 extents,
8765 p->second.compressed,
8766 used_blocks,
8767 fm->get_alloc_size(),
8768 repair ? &repairer : nullptr,
8769 *expected_statfs,
8770 depth);
8771 sbi.passed = true;
8772 }
8773 }
8774 } // if (it)
8775
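// Repair-mode pass over misreferenced extents detected above: re-walk the
// object keyspace, rewrite every blob that intersects a misreferenced
// pextent into newly allocated space (unsharing shared blobs as needed)
// and schedule the blob's old extents for release.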
8776 if (repair && repairer.preprocess_misreference(db)) {
8777
8778 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
8779 auto& space_tracker = repairer.get_space_usage_tracker();
8780 auto& misref_extents = repairer.get_misreferences();
8781 interval_set<uint64_t> to_release;
8782 it = db->get_iterator(PREFIX_OBJ);
8783 if (it) {
8784 // fill global if not overridden below
8785 auto expected_statfs = &expected_store_statfs;
8786
8787 CollectionRef c;
8788 spg_t pgid;
8789 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
8790 bool bypass_rest = false;
8791 for (it->lower_bound(string()); it->valid() && !bypass_rest;
8792 it->next()) {
8793 dout(30) << __func__ << " key "
8794 << pretty_binary_string(it->key()) << dendl;
8795 if (is_extent_shard_key(it->key())) {
8796 continue;
8797 }
8798
8799 ghobject_t oid;
8800 int r = get_key_object(it->key(), &oid);
8801 if (r < 0 || !space_tracker.is_used(oid)) {
8802 continue;
8803 }
8804
8805 if (!c ||
8806 oid.shard_id != pgid.shard ||
8807 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8808 !c->contains(oid)) {
8809 c = nullptr;
8810 for (auto& p : coll_map) {
8811 if (p.second->contains(oid)) {
8812 c = p.second;
8813 break;
8814 }
8815 }
8816 if (!c) {
8817 continue;
8818 }
8819 if (per_pool_stat_collection || repair) {
8820 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8821 expected_statfs = &expected_pool_statfs[pool_id];
8822 }
8823 }
8824 if (!space_tracker.is_used(c->cid)) {
8825 continue;
8826 }
8827
8828 dout(20) << __func__ << " check misreference for col:" << c->cid
8829 << " obj:" << oid << dendl;
8830
8831 OnodeRef o;
8832 o.reset(Onode::decode(c, oid, it->key(), it->value()));
8833 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8834 mempool::bluestore_fsck::set<BlobRef> blobs;
8835
8836 for (auto& e : o->extent_map.extent_map) {
8837 blobs.insert(e.blob);
8838 }
8839 bool need_onode_update = false;
8840 bool first_dump = true;
8841 for(auto b : blobs) {
8842 bool broken_blob = false;
8843 auto& pextents = b->dirty_blob().dirty_extents();
8844 for (auto& e : pextents) {
8845 if (!e.is_valid()) {
8846 continue;
8847 }
8848 // for the sake of simplicity and proper shared blob handling
8849 // always rewrite the whole blob even when it's partially
8850 // misreferenced.
8851 if (misref_extents.intersects(e.offset, e.length)) {
8852 if (first_dump) {
8853 first_dump = false;
8854 _dump_onode<10>(cct, *o);
8855 }
8856 broken_blob = true;
8857 break;
8858 }
8859 }
8860 if (!broken_blob)
8861 continue;
8862 bool compressed = b->get_blob().is_compressed();
8863 need_onode_update = true;
8864 dout(10) << __func__
8865 << " fix misreferences in oid:" << oid
8866 << " " << *b << dendl;
8867 uint64_t b_off = 0;
8868 PExtentVector pext_to_release;
8869 pext_to_release.reserve(pextents.size());
8870 // rewriting all valid pextents
8871 for (auto e = pextents.begin(); e != pextents.end();
8872 b_off += e->length, e++) {
8873 if (!e->is_valid()) {
8874 continue;
8875 }
8876 PExtentVector exts;
8877 int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
8878 0, 0, &exts);
8879 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
8880 derr << __func__
8881 << " failed to allocate 0x" << std::hex << e->length
8882 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
8883 << " min_alloc_size 0x" << min_alloc_size
8884 << " available 0x " << alloc->get_free()
8885 << std::dec << dendl;
8886 if (alloc_len > 0) {
8887 alloc->release(exts);
8888 }
8889 bypass_rest = true;
8890 break;
8891 }
8892 expected_statfs->allocated += e->length;
8893 if (compressed) {
8894 expected_statfs->data_compressed_allocated += e->length;
8895 }
8896
8897 bufferlist bl;
8898 IOContext ioc(cct, NULL, true); // allow EIO
8899 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
8900 if (r < 0) {
8901 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
8902 <<"~" << e->length << std::dec << dendl;
8903 ceph_abort_msg("read failed, wtf");
8904 }
8905 pext_to_release.push_back(*e);
8906 e = pextents.erase(e);
8907 e = pextents.insert(e, exts.begin(), exts.end());
8908 b->get_blob().map_bl(
8909 b_off, bl,
8910 [&](uint64_t offset, bufferlist& t) {
8911 int r = bdev->write(offset, t, false);
8912 ceph_assert(r == 0);
8913 });
8914 e += exts.size() - 1;
8915 for (auto& p : exts) {
8916 fm->allocate(p.offset, p.length, txn);
8917 }
8918 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8919
8920 if (b->get_blob().is_shared()) {
8921 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
8922
8923 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
8924 ceph_assert(sb_it != sb_info.end());
8925 sb_info_t& sbi = sb_it->second;
8926
8927 for (auto& r : sbi.ref_map.ref_map) {
8928 expected_statfs->allocated -= r.second.length;
8929 if (sbi.compressed) {
8930 // NB: it's crucial to use compressed flag from sb_info_t
8931 // as we originally used that value while accumulating
8932 // expected_statfs
8933 expected_statfs->data_compressed_allocated -= r.second.length;
8934 }
8935 }
8936 sbi.updated = sbi.passed = true;
8937 sbi.ref_map.clear();
8938
8939 // relying on blob's pextents to decide what to release.
8940 for (auto& p : pext_to_release) {
8941 to_release.union_insert(p.offset, p.length);
8942 }
8943 } else {
8944 for (auto& p : pext_to_release) {
8945 expected_statfs->allocated -= p.length;
8946 if (compressed) {
8947 expected_statfs->data_compressed_allocated -= p.length;
8948 }
8949 to_release.union_insert(p.offset, p.length);
8950 }
8951 }
8952 if (bypass_rest) {
8953 break;
8954 }
8955 } // for(auto b : blobs)
8956 if (need_onode_update) {
8957 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
8958 _record_onode(o, txn);
8959 }
8960 } // for (it->lower_bound(string()); it->valid(); it->next())
8961
8962 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
8963 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
8964 << "~" << it.get_len() << std::dec << dendl;
8965 fm->release(it.get_start(), it.get_len(), txn);
8966 }
8967 alloc->release(to_release);
8968 to_release.clear();
8969 } // if (it) {
8970 } //if (repair && repairer.preprocess_misreference()) {
8971
8972 if (depth != FSCK_SHALLOW) {
8973 for (auto &p : sb_info) {
8974 sb_info_t& sbi = p.second;
8975 if (!sbi.passed) {
8976 derr << "fsck error: missing " << *sbi.sb << dendl;
8977 ++errors;
8978 }
8979 if (repair && (!sbi.passed || sbi.updated)) {
8980 auto sbid = p.first;
8981 if (sbi.ref_map.empty()) {
8982 ceph_assert(sbi.passed);
8983 dout(20) << __func__ << " " << *sbi.sb
8984 << " is empty, removing" << dendl;
8985 repairer.fix_shared_blob(db, sbid, nullptr);
8986 } else {
8987 bufferlist bl;
8988 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
8989 encode(persistent, bl);
8990 dout(20) << __func__ << " " << *sbi.sb
8991 << " is " << bl.length() << " bytes, updating" << dendl;
8992
8993 repairer.fix_shared_blob(db, sbid, &bl);
8994 }
8995 }
8996 }
8997 }
8998 sb_info.clear();
8999
9000 // check global stats only if fscking (not repairing) w/o per-pool stats
9001 if (!per_pool_stat_collection &&
9002 !repair &&
9003 !(actual_statfs == expected_store_statfs)) {
9004 derr << "fsck error: actual " << actual_statfs
9005 << " != expected " << expected_store_statfs << dendl;
9006 if (repair) {
9007 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
9008 expected_store_statfs);
9009 }
9010 ++errors;
9011 }
9012
9013 dout(1) << __func__ << " checking pool_statfs" << dendl;
9014 _fsck_check_pool_statfs(expected_pool_statfs,
9015 errors, warnings, repair ? &repairer : nullptr);
9016
9017 if (depth != FSCK_SHALLOW) {
9018 dout(1) << __func__ << " checking for stray omap data " << dendl;
9019 it = db->get_iterator(PREFIX_OMAP);
9020 if (it) {
9021 uint64_t last_omap_head = 0;
9022 for (it->lower_bound(string()); it->valid(); it->next()) {
9023 uint64_t omap_head;
9024 _key_decode_u64(it->key().c_str(), &omap_head);
9025 if (used_omap_head.count(omap_head) == 0 &&
9026 omap_head != last_omap_head) {
9027 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9028 << "fsck error: found stray omap data on omap_head "
9029 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
9030 ++errors;
9031 last_omap_head = omap_head;
9032 }
9033 }
9034 }
9035 it = db->get_iterator(PREFIX_PGMETA_OMAP);
9036 if (it) {
9037 uint64_t last_omap_head = 0;
9038 for (it->lower_bound(string()); it->valid(); it->next()) {
9039 uint64_t omap_head;
9040 _key_decode_u64(it->key().c_str(), &omap_head);
9041 if (used_omap_head.count(omap_head) == 0 &&
9042 omap_head != last_omap_head) {
9043 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9044 << "fsck error: found stray (pgmeta) omap data on omap_head "
9045 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
9046 last_omap_head = omap_head;
9047 ++errors;
9048 }
9049 }
9050 }
9051 it = db->get_iterator(PREFIX_PERPOOL_OMAP);
9052 if (it) {
9053 uint64_t last_omap_head = 0;
9054 for (it->lower_bound(string()); it->valid(); it->next()) {
9055 uint64_t pool;
9056 uint64_t omap_head;
9057 string k = it->key();
9058 const char *c = k.c_str();
9059 c = _key_decode_u64(c, &pool);
9060 c = _key_decode_u64(c, &omap_head);
9061 if (used_omap_head.count(omap_head) == 0 &&
9062 omap_head != last_omap_head) {
9063 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9064 << "fsck error: found stray (per-pool) omap data on omap_head "
9065 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
9066 ++errors;
9067 last_omap_head = omap_head;
9068 }
9069 }
9070 }
9071 dout(1) << __func__ << " checking deferred events" << dendl;
9072 it = db->get_iterator(PREFIX_DEFERRED);
9073 if (it) {
9074 for (it->lower_bound(string()); it->valid(); it->next()) {
9075 bufferlist bl = it->value();
9076 auto p = bl.cbegin();
9077 bluestore_deferred_transaction_t wt;
9078 try {
9079 decode(wt, p);
9080 } catch (buffer::error& e) {
9081 derr << "fsck error: failed to decode deferred txn "
9082 << pretty_binary_string(it->key()) << dendl;
9083 if (repair) {
9084 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
9085 << pretty_binary_string(it->key())
9086 << "', removing" << dendl;
9087 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
9088 }
9089 continue;
9090 }
9091 dout(20) << __func__ << " deferred " << wt.seq
9092 << " ops " << wt.ops.size()
9093 << " released 0x" << std::hex << wt.released << std::dec << dendl;
9094 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9095 apply_for_bitset_range(
9096 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
9097 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9098 bs.set(pos);
9099 }
9100 );
9101 }
9102 }
9103 }
9104
9105 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
9106 {
9107 // remove bluefs_extents from used set since the freelist doesn't
9108 // know they are allocated.
9109 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
9110 apply_for_bitset_range(
9111 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
9112 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9113 bs.reset(pos);
9114 }
9115 );
9116 }
9117 fm->enumerate_reset();
9118 uint64_t offset, length;
9119 while (fm->enumerate_next(db, &offset, &length)) {
9120 bool intersects = false;
9121 apply_for_bitset_range(
9122 offset, length, fm->get_alloc_size(), used_blocks,
9123 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9124 if (bs.test(pos)) {
9125 if (offset == SUPER_RESERVED &&
9126 length == min_alloc_size - SUPER_RESERVED) {
9127 // this is due to the change just after luminous to min_alloc_size
9128 // granularity allocations, and our baked in assumption at the top
9129 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
9130 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
9131 // since we will never allocate this region below min_alloc_size.
9132 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
9133 << " and min_alloc_size, 0x" << std::hex << offset << "~"
9134 << length << std::dec << dendl;
9135 } else {
9136 intersects = true;
9137 if (repair) {
9138 repairer.fix_false_free(db, fm,
9139 pos * min_alloc_size,
9140 min_alloc_size);
9141 }
9142 }
9143 } else {
9144 bs.set(pos);
9145 }
9146 }
9147 );
9148 if (intersects) {
9149 derr << "fsck error: free extent 0x" << std::hex << offset
9150 << "~" << length << std::dec
9151 << " intersects allocated blocks" << dendl;
9152 ++errors;
9153 }
9154 }
9155 fm->enumerate_reset();
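// At this point every referenced or free allocation unit has its bit set;
// any bit still clear is space that is neither used by objects/bluefs nor
// tracked as free by the freelist, i.e. leaked. Flip the bitset and walk
// runs of set bits to report (and optionally repair) those extents.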
9156 size_t count = used_blocks.count();
9157 if (used_blocks.size() != count) {
9158 ceph_assert(used_blocks.size() > count);
9159 used_blocks.flip();
9160 size_t start = used_blocks.find_first();
9161 while (start != decltype(used_blocks)::npos) {
9162 size_t cur = start;
9163 while (true) {
9164 size_t next = used_blocks.find_next(cur);
9165 if (next != cur + 1) {
9166 ++errors;
9167 derr << "fsck error: leaked extent 0x" << std::hex
9168 << ((uint64_t)start * fm->get_alloc_size()) << "~"
9169 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
9170 << dendl;
9171 if (repair) {
9172 repairer.fix_leaked(db,
9173 fm,
9174 start * min_alloc_size,
9175 (cur + 1 - start) * min_alloc_size);
9176 }
9177 start = next;
9178 break;
9179 }
9180 cur = next;
9181 }
9182 }
9183 used_blocks.flip();
9184 }
9185 }
9186 }
9187 if (repair) {
9188 if (!per_pool_omap) {
9189 dout(5) << __func__ << " marking per_pool_omap=1" << dendl;
9190 repairer.fix_per_pool_omap(db);
9191 }
9192
9193 dout(5) << __func__ << " applying repair results" << dendl;
9194 repaired = repairer.apply(db);
9195 dout(5) << __func__ << " repair applied" << dendl;
9196 }
9197
9198 out_scan:
9199 dout(2) << __func__ << " " << num_objects << " objects, "
9200 << num_sharded_objects << " of them sharded. "
9201 << dendl;
9202 dout(2) << __func__ << " " << num_extents << " extents to "
9203 << num_blobs << " blobs, "
9204 << num_spanning_blobs << " spanning, "
9205 << num_shared_blobs << " shared."
9206 << dendl;
9207
9208 utime_t duration = ceph_clock_now() - start;
9209 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
9210 << warnings << " warnings, "
9211 << repaired << " repaired, "
9212 << (errors + warnings - (int)repaired) << " remaining in "
9213 << duration << " seconds" << dendl;
9214
9215 // In non-repair mode we should return error count only as
9216 // it indicates if store status is OK.
9217 // In repair mode both errors and warnings are taken into account
9218 // since repaired counter relates to them both.
9219 return repair ? errors + warnings - (int)repaired : errors;
9220 }
9221
9222 /// methods to inject various errors fsck can repair
9223 void BlueStore::inject_broken_shared_blob_key(const string& key,
9224 const bufferlist& bl)
9225 {
9226 KeyValueDB::Transaction txn;
9227 txn = db->get_transaction();
9228 txn->set(PREFIX_SHARED_BLOB, key, bl);
9229 db->submit_transaction_sync(txn);
9230 };
9231
9232 void BlueStore::inject_leaked(uint64_t len)
9233 {
9234 KeyValueDB::Transaction txn;
9235 txn = db->get_transaction();
9236
9237 PExtentVector exts;
9238 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
9239 min_alloc_size * 256, 0, &exts);
9240 ceph_assert(alloc_len >= (int64_t)len);
9241 for (auto& p : exts) {
9242 fm->allocate(p.offset, p.length, txn);
9243 }
9244 db->submit_transaction_sync(txn);
9245 }
9246
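// Test helper: releases (marks free in the freelist) the first valid pextent
// of the blobs behind the object's first and last lextents while they are
// still referenced, producing 'false free' inconsistencies for fsck to
// detect and repair.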
9247 void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
9248 {
9249 KeyValueDB::Transaction txn;
9250 OnodeRef o;
9251 CollectionRef c = _get_collection(cid);
9252 ceph_assert(c);
9253 {
9254 std::unique_lock l{c->lock}; // just to avoid internal asserts
9255 o = c->get_onode(oid, false);
9256 ceph_assert(o);
9257 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9258 }
9259
9260 bool injected = false;
9261 txn = db->get_transaction();
9262 auto& em = o->extent_map.extent_map;
9263 std::vector<const PExtentVector*> v;
9264 if (em.size()) {
9265 v.push_back(&em.begin()->blob->get_blob().get_extents());
9266 }
9267 if (em.size() > 1) {
9268 auto it = em.end();
9269 --it;
9270 v.push_back(&(it->blob->get_blob().get_extents()));
9271 }
9272 for (auto pext : v) {
9273 if (pext->size()) {
9274 auto p = pext->begin();
9275 while (p != pext->end()) {
9276 if (p->is_valid()) {
9277 dout(20) << __func__ << " release 0x" << std::hex << p->offset
9278 << "~" << p->length << std::dec << dendl;
9279 fm->release(p->offset, p->length, txn);
9280 injected = true;
9281 break;
9282 }
9283 ++p;
9284 }
9285 }
9286 }
9287 ceph_assert(injected);
9288 db->submit_transaction_sync(txn);
9289 }
9290
9291 void BlueStore::inject_legacy_omap()
9292 {
9293 dout(1) << __func__ << dendl;
9294 per_pool_omap = false;
9295 KeyValueDB::Transaction txn;
9296 txn = db->get_transaction();
9297 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
9298 db->submit_transaction_sync(txn);
9299 }
9300
9301 void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
9302 {
9303 dout(1) << __func__ << " "
9304 << cid << " " << oid
9305 <<dendl;
9306 KeyValueDB::Transaction txn;
9307 OnodeRef o;
9308 CollectionRef c = _get_collection(cid);
9309 ceph_assert(c);
9310 {
9311 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9312 o = c->get_onode(oid, false);
9313 ceph_assert(o);
9314 }
9315 o->onode.clear_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PGMETA_OMAP);
9316 txn = db->get_transaction();
9317 _record_onode(o, txn);
9318 db->submit_transaction_sync(txn);
9319 }
9320
9321
9322 void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
9323 {
9324 BlueStoreRepairer repairer;
9325 repairer.fix_statfs(db, key, new_statfs);
9326 repairer.apply(db);
9327 }
9328
9329 void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
9330 {
9331 KeyValueDB::Transaction t = db->get_transaction();
9332 volatile_statfs v;
9333 v = new_statfs;
9334 bufferlist bl;
9335 v.encode(bl);
9336 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
9337 db->submit_transaction_sync(t);
9338 }
9339
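// Test helper: points oid2's extent at 'offset' to the very same physical
// extents as oid1's, creating a misreference (and consequential space leak
// errors) for fsck to detect and repair.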
9340 void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
9341 coll_t cid2, ghobject_t oid2,
9342 uint64_t offset)
9343 {
9344 OnodeRef o1;
9345 CollectionRef c1 = _get_collection(cid1);
9346 ceph_assert(c1);
9347 {
9348 std::unique_lock l{c1->lock}; // just to avoid internal asserts
9349 o1 = c1->get_onode(oid1, false);
9350 ceph_assert(o1);
9351 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9352 }
9353 OnodeRef o2;
9354 CollectionRef c2 = _get_collection(cid2);
9355 ceph_assert(c2);
9356 {
9357 std::unique_lock l{c2->lock}; // just to avoid internal asserts
9358 o2 = c2->get_onode(oid2, false);
9359 ceph_assert(o2);
9360 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9361 }
9362 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
9363 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
9364
9365 // require onode/extent layout to be the same (and simple)
9366 // to make things easier
9367 ceph_assert(o1->onode.extent_map_shards.empty());
9368 ceph_assert(o2->onode.extent_map_shards.empty());
9369 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
9370 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
9371 ceph_assert(e1.logical_offset == e2.logical_offset);
9372 ceph_assert(e1.length == e2.length);
9373 ceph_assert(e1.blob_offset == e2.blob_offset);
9374
9375 KeyValueDB::Transaction txn;
9376 txn = db->get_transaction();
9377
9378 // along with misreference error this will create space leaks errors
9379 e2.blob->dirty_blob() = e1.blob->get_blob();
9380 o2->extent_map.dirty_range(offset, e2.length);
9381 o2->extent_map.update(txn, false);
9382
9383 _record_onode(o2, txn);
9384 db->submit_transaction_sync(txn);
9385 }
9386
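// Test helper: attaches a fresh blob with the given id to the onode's
// spanning blob map without referencing it from any extent, producing a
// 'zombie' spanning blob for fsck to detect and repair.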
9387 void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
9388 int16_t blob_id)
9389 {
9390 OnodeRef o;
9391 CollectionRef c = _get_collection(cid);
9392 ceph_assert(c);
9393 {
9394 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9395 o = c->get_onode(oid, false);
9396 ceph_assert(o);
9397 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9398 }
9399
9400 BlobRef b = c->new_blob();
9401 b->id = blob_id;
9402 o->extent_map.spanning_blob_map[blob_id] = b;
9403
9404 KeyValueDB::Transaction txn;
9405 txn = db->get_transaction();
9406
9407 _record_onode(o, txn);
9408 db->submit_transaction_sync(txn);
9409 }
9410
9411 void BlueStore::collect_metadata(map<string,string> *pm)
9412 {
9413 dout(10) << __func__ << dendl;
9414 bdev->collect_metadata("bluestore_bdev_", pm);
9415 if (bluefs) {
9416 (*pm)["bluefs"] = "1";
9417 // this value is for backward compatibility only
9418 (*pm)["bluefs_single_shared_device"] = \
9419 stringify((int)bluefs_layout.single_shared_device());
9420 (*pm)["bluefs_dedicated_db"] = \
9421 stringify((int)bluefs_layout.dedicated_db);
9422 (*pm)["bluefs_dedicated_wal"] = \
9423 stringify((int)bluefs_layout.dedicated_wal);
9424 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
9425 } else {
9426 (*pm)["bluefs"] = "0";
9427 }
9428
9429 // report numa mapping for underlying devices
9430 int node = -1;
9431 set<int> nodes;
9432 set<string> failed;
9433 int r = get_numa_node(&node, &nodes, &failed);
9434 if (r >= 0) {
9435 if (!failed.empty()) {
9436 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
9437 }
9438 if (!nodes.empty()) {
9439 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
9440 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
9441 }
9442 if (node >= 0) {
9443 (*pm)["objectstore_numa_node"] = stringify(node);
9444 }
9445 }
9446 }
9447
9448 int BlueStore::get_numa_node(
9449 int *final_node,
9450 set<int> *out_nodes,
9451 set<string> *out_failed)
9452 {
9453 int node = -1;
9454 set<string> devices;
9455 get_devices(&devices);
9456 set<int> nodes;
9457 set<string> failed;
9458 for (auto& devname : devices) {
9459 int n;
9460 BlkDev bdev(devname);
9461 int r = bdev.get_numa_node(&n);
9462 if (r < 0) {
9463 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
9464 << dendl;
9465 failed.insert(devname);
9466 continue;
9467 }
9468 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
9469 << dendl;
9470 nodes.insert(n);
9471 if (node < 0) {
9472 node = n;
9473 }
9474 }
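// note: the check below only publishes a single numa node when every probed
// device reported the same node and none failed to report one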
9475 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
9476 *final_node = node;
9477 }
9478 if (out_nodes) {
9479 *out_nodes = nodes;
9480 }
9481 if (out_failed) {
9482 *out_failed = failed;
9483 }
9484 return 0;
9485 }
9486
9487 int BlueStore::get_devices(set<string> *ls)
9488 {
9489 if (bdev) {
9490 bdev->get_devices(ls);
9491 if (bluefs) {
9492 bluefs->get_devices(ls);
9493 }
9494 return 0;
9495 }
9496
9497 // grumble, we haven't started up yet.
9498 int r = _open_path();
9499 if (r < 0)
9500 goto out;
9501 r = _open_fsid(false);
9502 if (r < 0)
9503 goto out_path;
9504 r = _read_fsid(&fsid);
9505 if (r < 0)
9506 goto out_fsid;
9507 r = _lock_fsid();
9508 if (r < 0)
9509 goto out_fsid;
9510 r = _open_bdev(false);
9511 if (r < 0)
9512 goto out_fsid;
9513 r = _minimal_open_bluefs(false);
9514 if (r < 0)
9515 goto out_bdev;
9516 bdev->get_devices(ls);
9517 if (bluefs) {
9518 bluefs->get_devices(ls);
9519 }
9520 r = 0;
9521 _minimal_close_bluefs();
9522 out_bdev:
9523 _close_bdev();
9524 out_fsid:
9525 _close_fsid();
9526 out_path:
9527 _close_path();
9528 out:
9529 return r;
9530 }
9531
9532 void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
9533 {
9534 buf->reset();
9535
9536 buf->omap_allocated =
9537 db->estimate_prefix_size(PREFIX_OMAP, string()) +
9538 db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string());
9539
9540 uint64_t bfree = alloc->get_free();
9541
9542 if (bluefs) {
9543 int64_t bluefs_total = bluefs->get_total(bluefs_layout.shared_bdev);
9544 int64_t bluefs_free = bluefs->get_free(bluefs_layout.shared_bdev);
9545 // part of our shared device is "free" according to BlueFS, but we
9546 // can't touch bluestore_bluefs_min of it.
9547 int64_t shared_available = std::min(
9548 bluefs_free,
9549 int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
9550 buf->internally_reserved = bluefs_total - shared_available;
9551 if (shared_available > 0) {
9552 bfree += shared_available;
9553 }
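// Illustrative sketch (hypothetical numbers, not from the source): with
// bluefs_total = 10 GiB, bluefs_free = 4 GiB and bluestore_bluefs_min = 1 GiB,
// shared_available = min(4 GiB, 10 GiB - 1 GiB) = 4 GiB, so
// internally_reserved = 10 GiB - 4 GiB = 6 GiB and bfree grows by 4 GiB.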
9554 // include dedicated db, too, if that isn't the shared device.
9555 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
9556 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
9557 }
9558 // call any non-omap bluefs space "internal metadata"
9559 buf->internal_metadata =
9560 std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
9561 - buf->omap_allocated;
9562 }
9563
9564 uint64_t thin_total, thin_avail;
9565 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
9566 buf->total += thin_total;
9567
9568 // we are limited by both the size of the virtual device and the
9569 // underlying physical device.
9570 bfree = std::min(bfree, thin_avail);
9571
9572 buf->allocated = thin_total - thin_avail;
9573 } else {
9574 buf->total += bdev->get_size();
9575 }
9576 buf->available = bfree;
9577 }
9578
9579 int BlueStore::statfs(struct store_statfs_t *buf,
9580 osd_alert_list_t* alerts)
9581 {
9582 if (alerts) {
9583 alerts->clear();
9584 _log_alerts(*alerts);
9585 }
9586 _get_statfs_overall(buf);
9587 {
9588 std::lock_guard l(vstatfs_lock);
9589 buf->allocated = vstatfs.allocated();
9590 buf->data_stored = vstatfs.stored();
9591 buf->data_compressed = vstatfs.compressed();
9592 buf->data_compressed_original = vstatfs.compressed_original();
9593 buf->data_compressed_allocated = vstatfs.compressed_allocated();
9594 }
9595
9596 dout(20) << __func__ << " " << *buf << dendl;
9597 return 0;
9598 }
9599
9600 int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
9601 bool *out_per_pool_omap)
9602 {
9603 dout(20) << __func__ << " pool " << pool_id << dendl;
9604
9605 if (!per_pool_stat_collection) {
9606 dout(20) << __func__ << " not supported in legacy mode " << dendl;
9607 return -ENOTSUP;
9608 }
9609 buf->reset();
9610
9611 {
9612 std::lock_guard l(vstatfs_lock);
9613 osd_pools[pool_id].publish(buf);
9614 }
9615
9616 string key_prefix;
9617 _key_encode_u64(pool_id, &key_prefix);
9618 buf->omap_allocated = db->estimate_prefix_size(PREFIX_PERPOOL_OMAP,
9619 key_prefix);
9620 *out_per_pool_omap = per_pool_omap;
9621
9622 dout(10) << __func__ << " " << *buf << dendl;
9623 return 0;
9624 }
9625
9626 void BlueStore::_check_legacy_statfs_alert()
9627 {
9628 string s;
9629 if (!per_pool_stat_collection &&
9630 cct->_conf->bluestore_warn_on_legacy_statfs) {
9631 s = "legacy statfs reporting detected, "
9632 "suggest to run store repair to get consistent statistic reports";
9633 }
9634 std::lock_guard l(qlock);
9635 legacy_statfs_alert = s;
9636 }
9637
9638 void BlueStore::_check_no_per_pool_omap_alert()
9639 {
9640 string s;
9641 if (!per_pool_omap &&
9642 cct->_conf->bluestore_warn_on_no_per_pool_omap) {
9643 s = "legacy (not per-pool) omap detected, "
9644 "suggest to run store repair to measure per-pool omap usage";
9645 }
9646 std::lock_guard l(qlock);
9647 no_per_pool_omap_alert = s;
9648 }
9649
9650 // ---------------
9651 // cache
9652
9653 BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
9654 {
9655 std::shared_lock l(coll_lock);
9656 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
9657 if (cp == coll_map.end())
9658 return CollectionRef();
9659 return cp->second;
9660 }
9661
9662 void BlueStore::_queue_reap_collection(CollectionRef& c)
9663 {
9664 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9665 // _reap_collections and this run in the same thread,
9666 // so no lock is needed.
9667 removed_collections.push_back(c);
9668 }
9669
9670 void BlueStore::_reap_collections()
9671 {
9672
9673 list<CollectionRef> removed_colls;
9674 {
9675 // _queue_reap_collection and this run in the same thread,
9676 // so no lock is needed.
9677 if (!removed_collections.empty())
9678 removed_colls.swap(removed_collections);
9679 else
9680 return;
9681 }
9682
9683 list<CollectionRef>::iterator p = removed_colls.begin();
9684 while (p != removed_colls.end()) {
9685 CollectionRef c = *p;
9686 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9687 if (c->onode_map.map_any([&](Onode* o) {
9688 ceph_assert(!o->exists);
9689 if (o->flushing_count.load()) {
9690 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
9691 << " flush_txns " << o->flushing_count << dendl;
9692 return true;
9693 }
9694 return false;
9695 })) {
9696 ++p;
9697 continue;
9698 }
9699 c->onode_map.clear();
9700 p = removed_colls.erase(p);
9701 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
9702 }
9703 if (removed_colls.empty()) {
9704 dout(10) << __func__ << " all reaped" << dendl;
9705 } else {
9706 removed_collections.splice(removed_collections.begin(), removed_colls);
9707 }
9708 }
9709
9710 void BlueStore::_update_cache_logger()
9711 {
9712 uint64_t num_onodes = 0;
9713 uint64_t num_pinned_onodes = 0;
9714 uint64_t num_extents = 0;
9715 uint64_t num_blobs = 0;
9716 uint64_t num_buffers = 0;
9717 uint64_t num_buffer_bytes = 0;
9718 for (auto c : onode_cache_shards) {
9719 c->add_stats(&num_onodes, &num_pinned_onodes);
9720 }
9721 for (auto c : buffer_cache_shards) {
9722 c->add_stats(&num_extents, &num_blobs,
9723 &num_buffers, &num_buffer_bytes);
9724 }
9725 logger->set(l_bluestore_onodes, num_onodes);
9726 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
9727 logger->set(l_bluestore_extents, num_extents);
9728 logger->set(l_bluestore_blobs, num_blobs);
9729 logger->set(l_bluestore_buffers, num_buffers);
9730 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
9731 }
9732
9733 // ---------------
9734 // read operations
9735
9736 ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
9737 {
9738 return _get_collection(cid);
9739 }
9740
9741 ObjectStore::CollectionHandle BlueStore::create_new_collection(
9742 const coll_t& cid)
9743 {
9744 std::unique_lock l{coll_lock};
9745 auto c = ceph::make_ref<Collection>(
9746 this,
9747 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
9748 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
9749 cid);
9750 new_coll_map[cid] = c;
9751 _osr_attach(c.get());
9752 return c;
9753 }
9754
9755 void BlueStore::set_collection_commit_queue(
9756 const coll_t& cid,
9757 ContextQueue *commit_queue)
9758 {
9759 if (commit_queue) {
9760 std::shared_lock l(coll_lock);
9761 if (coll_map.count(cid)) {
9762 coll_map[cid]->commit_queue = commit_queue;
9763 } else if (new_coll_map.count(cid)) {
9764 new_coll_map[cid]->commit_queue = commit_queue;
9765 }
9766 }
9767 }
9768
9769
9770 bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
9771 {
9772 Collection *c = static_cast<Collection *>(c_.get());
9773 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
9774 if (!c->exists)
9775 return false;
9776
9777 bool r = true;
9778
9779 {
9780 std::shared_lock l(c->lock);
9781 OnodeRef o = c->get_onode(oid, false);
9782 if (!o || !o->exists)
9783 r = false;
9784 }
9785
9786 return r;
9787 }
9788
9789 int BlueStore::stat(
9790 CollectionHandle &c_,
9791 const ghobject_t& oid,
9792 struct stat *st,
9793 bool allow_eio)
9794 {
9795 Collection *c = static_cast<Collection *>(c_.get());
9796 if (!c->exists)
9797 return -ENOENT;
9798 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9799
9800 {
9801 std::shared_lock l(c->lock);
9802 OnodeRef o = c->get_onode(oid, false);
9803 if (!o || !o->exists)
9804 return -ENOENT;
9805 st->st_size = o->onode.size;
9806 st->st_blksize = 4096;
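// the next line is a ceiling division, i.e. st_blocks = ceil(st_size / st_blksize);
// e.g. (hypothetical) a 10000-byte object reports (10000 + 4095) / 4096 = 3 blocks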
9807 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
9808 st->st_nlink = 1;
9809 }
9810
9811 int r = 0;
9812 if (_debug_mdata_eio(oid)) {
9813 r = -EIO;
9814 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9815 }
9816 return r;
9817 }
9818 int BlueStore::set_collection_opts(
9819 CollectionHandle& ch,
9820 const pool_opts_t& opts)
9821 {
9822 Collection *c = static_cast<Collection *>(ch.get());
9823 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
9824 if (!c->exists)
9825 return -ENOENT;
9826 std::unique_lock l{c->lock};
9827 c->pool_opts = opts;
9828 return 0;
9829 }
9830
9831 int BlueStore::read(
9832 CollectionHandle &c_,
9833 const ghobject_t& oid,
9834 uint64_t offset,
9835 size_t length,
9836 bufferlist& bl,
9837 uint32_t op_flags)
9838 {
9839 auto start = mono_clock::now();
9840 Collection *c = static_cast<Collection *>(c_.get());
9841 const coll_t &cid = c->get_cid();
9842 dout(15) << __func__ << " " << cid << " " << oid
9843 << " 0x" << std::hex << offset << "~" << length << std::dec
9844 << dendl;
9845 if (!c->exists)
9846 return -ENOENT;
9847
9848 bl.clear();
9849 int r;
9850 {
9851 std::shared_lock l(c->lock);
9852 auto start1 = mono_clock::now();
9853 OnodeRef o = c->get_onode(oid, false);
9854 log_latency("get_onode@read",
9855 l_bluestore_read_onode_meta_lat,
9856 mono_clock::now() - start1,
9857 cct->_conf->bluestore_log_op_age);
9858 if (!o || !o->exists) {
9859 r = -ENOENT;
9860 goto out;
9861 }
9862
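// a read of 0x0~0 is interpreted as "read the whole object"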
9863 if (offset == length && offset == 0)
9864 length = o->onode.size;
9865
9866 r = _do_read(c, o, offset, length, bl, op_flags);
9867 if (r == -EIO) {
9868 logger->inc(l_bluestore_read_eio);
9869 }
9870 }
9871
9872 out:
9873 if (r >= 0 && _debug_data_eio(oid)) {
9874 r = -EIO;
9875 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9876 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
9877 cct->_conf->bluestore_debug_random_read_err &&
9878 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
9879 100.0)) == 0) {
9880 dout(0) << __func__ << ": inject random EIO" << dendl;
9881 r = -EIO;
9882 }
9883 dout(10) << __func__ << " " << cid << " " << oid
9884 << " 0x" << std::hex << offset << "~" << length << std::dec
9885 << " = " << r << dendl;
9886 log_latency(__func__,
9887 l_bluestore_read_lat,
9888 mono_clock::now() - start,
9889 cct->_conf->bluestore_log_op_age);
9890 return r;
9891 }
9892
9893 void BlueStore::_read_cache(
9894 OnodeRef o,
9895 uint64_t offset,
9896 size_t length,
9897 int read_cache_policy,
9898 ready_regions_t& ready_regions,
9899 blobs2read_t& blobs2read)
9900 {
9901 // build a blob-wise list of stuff to read (that isn't cached)
9902 unsigned left = length;
9903 uint64_t pos = offset;
9904 auto lp = o->extent_map.seek_lextent(offset);
9905 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9906 if (pos < lp->logical_offset) {
9907 unsigned hole = lp->logical_offset - pos;
9908 if (hole >= left) {
9909 break;
9910 }
9911 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9912 << std::dec << dendl;
9913 pos += hole;
9914 left -= hole;
9915 }
9916 BlobRef& bptr = lp->blob;
9917 unsigned l_off = pos - lp->logical_offset;
9918 unsigned b_off = l_off + lp->blob_offset;
9919 unsigned b_len = std::min(left, lp->length - l_off);
9920
9921 ready_regions_t cache_res;
9922 interval_set<uint32_t> cache_interval;
9923 bptr->shared_blob->bc.read(
9924 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9925 read_cache_policy);
9926 dout(20) << __func__ << " blob " << *bptr << std::hex
9927 << " need 0x" << b_off << "~" << b_len
9928 << " cache has 0x" << cache_interval
9929 << std::dec << dendl;
9930
9931 auto pc = cache_res.begin();
9932 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
9933 while (b_len > 0) {
9934 unsigned l;
9935 if (pc != cache_res.end() &&
9936 pc->first == b_off) {
9937 l = pc->second.length();
9938 ready_regions[pos].claim(pc->second);
9939 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9940 << b_off << "~" << l << std::dec << dendl;
9941 ++pc;
9942 } else {
9943 l = b_len;
9944 if (pc != cache_res.end()) {
9945 ceph_assert(pc->first > b_off);
9946 l = pc->first - b_off;
9947 }
9948 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9949 << b_off << "~" << l << std::dec << dendl;
9950 // merge regions
9951 {
9952 uint64_t r_off = b_off;
9953 uint64_t r_len = l;
9954 uint64_t front = r_off % chunk_size;
9955 if (front) {
9956 r_off -= front;
9957 r_len += front;
9958 }
9959 unsigned tail = r_len % chunk_size;
9960 if (tail) {
9961 r_len += chunk_size - tail;
9962 }
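// Worked example (hypothetical numbers, for illustration only): with
// chunk_size = 4096, b_off = 5000 and l = 1000: front = 5000 % 4096 = 904,
// so r_off = 4096 and r_len = 1904; then tail = 1904 % 4096 = 1904, so
// r_len += 4096 - 1904, leaving one full 4096-byte chunk to read from disk.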
9963 bool merged = false;
9964 regions2read_t& r2r = blobs2read[bptr];
9965 if (r2r.size()) {
9966 read_req_t& pre = r2r.back();
9967 if (r_off <= (pre.r_off + pre.r_len)) {
9968 front += (r_off - pre.r_off);
9969 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
9970 pre.regs.emplace_back(region_t(pos, b_off, l, front));
9971 merged = true;
9972 }
9973 }
9974 if (!merged) {
9975 read_req_t req(r_off, r_len);
9976 req.regs.emplace_back(region_t(pos, b_off, l, front));
9977 r2r.emplace_back(std::move(req));
9978 }
9979 }
9980 }
9981 pos += l;
9982 b_off += l;
9983 left -= l;
9984 b_len -= l;
9985 }
9986 ++lp;
9987 }
9988 }
9989
9990 int BlueStore::_prepare_read_ioc(
9991 blobs2read_t& blobs2read,
9992 vector<bufferlist>* compressed_blob_bls,
9993 IOContext* ioc)
9994 {
9995 for (auto& p : blobs2read) {
9996 const BlobRef& bptr = p.first;
9997 regions2read_t& r2r = p.second;
9998 dout(20) << __func__ << " blob " << *bptr << std::hex
9999 << " need " << r2r << std::dec << dendl;
10000 if (bptr->get_blob().is_compressed()) {
10001 // read the whole thing
10002 if (compressed_blob_bls->empty()) {
10003 // ensure we avoid any reallocation on subsequent blobs
10004 compressed_blob_bls->reserve(blobs2read.size());
10005 }
10006 compressed_blob_bls->push_back(bufferlist());
10007 bufferlist& bl = compressed_blob_bls->back();
10008 auto r = bptr->get_blob().map(
10009 0, bptr->get_blob().get_ondisk_length(),
10010 [&](uint64_t offset, uint64_t length) {
10011 int r = bdev->aio_read(offset, length, &bl, ioc);
10012 if (r < 0)
10013 return r;
10014 return 0;
10015 });
10016 if (r < 0) {
10017 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
10018 if (r == -EIO) {
10019 // propagate EIO to caller
10020 return r;
10021 }
10022 ceph_assert(r == 0);
10023 }
10024 } else {
10025 // read the pieces
10026 for (auto& req : r2r) {
10027 dout(20) << __func__ << " region 0x" << std::hex
10028 << req.regs.front().logical_offset
10029 << ": 0x" << req.regs.front().blob_xoffset
10030 << " reading 0x" << req.r_off
10031 << "~" << req.r_len << std::dec
10032 << dendl;
10033
10034 // read it
10035 auto r = bptr->get_blob().map(
10036 req.r_off, req.r_len,
10037 [&](uint64_t offset, uint64_t length) {
10038 int r = bdev->aio_read(offset, length, &req.bl, ioc);
10039 if (r < 0)
10040 return r;
10041 return 0;
10042 });
10043 if (r < 0) {
10044 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
10045 << dendl;
10046 if (r == -EIO) {
10047 // propagate EIO to caller
10048 return r;
10049 }
10050 ceph_assert(r == 0);
10051 }
10052 ceph_assert(req.bl.length() == req.r_len);
10053 }
10054 }
10055 }
10056 return 0;
10057 }
10058
10059 int BlueStore::_generate_read_result_bl(
10060 OnodeRef o,
10061 uint64_t offset,
10062 size_t length,
10063 ready_regions_t& ready_regions,
10064 vector<bufferlist>& compressed_blob_bls,
10065 blobs2read_t& blobs2read,
10066 bool buffered,
10067 bool* csum_error,
10068 bufferlist& bl)
10069 {
10070 // enumerate and decompress desired blobs
10071 auto p = compressed_blob_bls.begin();
10072 blobs2read_t::iterator b2r_it = blobs2read.begin();
10073 while (b2r_it != blobs2read.end()) {
10074 const BlobRef& bptr = b2r_it->first;
10075 regions2read_t& r2r = b2r_it->second;
10076 dout(20) << __func__ << " blob " << *bptr << std::hex
10077 << " need 0x" << r2r << std::dec << dendl;
10078 if (bptr->get_blob().is_compressed()) {
10079 ceph_assert(p != compressed_blob_bls.end());
10080 bufferlist& compressed_bl = *p++;
10081 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
10082 r2r.front().regs.front().logical_offset) < 0) {
10083 *csum_error = true;
10084 return -EIO;
10085 }
10086 bufferlist raw_bl;
10087 auto r = _decompress(compressed_bl, &raw_bl);
10088 if (r < 0)
10089 return r;
10090 if (buffered) {
10091 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
10092 raw_bl);
10093 }
10094 for (auto& req : r2r) {
10095 for (auto& r : req.regs) {
10096 ready_regions[r.logical_offset].substr_of(
10097 raw_bl, r.blob_xoffset, r.length);
10098 }
10099 }
10100 } else {
10101 for (auto& req : r2r) {
10102 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
10103 req.regs.front().logical_offset) < 0) {
10104 *csum_error = true;
10105 return -EIO;
10106 }
10107 if (buffered) {
10108 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
10109 req.r_off, req.bl);
10110 }
10111
10112 // prune and keep result
10113 for (const auto& r : req.regs) {
10114 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
10115 }
10116 }
10117 }
10118 ++b2r_it;
10119 }
10120
10121 // generate a resulting buffer
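// walk ready_regions in offset order, appending cached or just-read data and
// zero-filling any logical holes, so bl ends up exactly `length` bytes long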
10122 auto pr = ready_regions.begin();
10123 auto pr_end = ready_regions.end();
10124 uint64_t pos = 0;
10125 while (pos < length) {
10126 if (pr != pr_end && pr->first == pos + offset) {
10127 dout(30) << __func__ << " assemble 0x" << std::hex << pos
10128 << ": data from 0x" << pr->first << "~" << pr->second.length()
10129 << std::dec << dendl;
10130 pos += pr->second.length();
10131 bl.claim_append(pr->second);
10132 ++pr;
10133 } else {
10134 uint64_t l = length - pos;
10135 if (pr != pr_end) {
10136 ceph_assert(pr->first > pos + offset);
10137 l = pr->first - (pos + offset);
10138 }
10139 dout(30) << __func__ << " assemble 0x" << std::hex << pos
10140 << ": zeros for 0x" << (pos + offset) << "~" << l
10141 << std::dec << dendl;
10142 bl.append_zero(l);
10143 pos += l;
10144 }
10145 }
10146 ceph_assert(bl.length() == length);
10147 ceph_assert(pos == length);
10148 ceph_assert(pr == pr_end);
10149 return 0;
10150 }
10151
10152 int BlueStore::_do_read(
10153 Collection *c,
10154 OnodeRef o,
10155 uint64_t offset,
10156 size_t length,
10157 bufferlist& bl,
10158 uint32_t op_flags,
10159 uint64_t retry_count)
10160 {
10161 FUNCTRACE(cct);
10162 int r = 0;
10163 int read_cache_policy = 0; // do not bypass clean or dirty cache
10164
10165 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10166 << " size 0x" << o->onode.size << " (" << std::dec
10167 << o->onode.size << ")" << dendl;
10168 bl.clear();
10169
10170 if (offset >= o->onode.size) {
10171 return r;
10172 }
10173
10174 // generally, don't buffer anything, unless the client explicitly requests
10175 // it.
10176 bool buffered = false;
10177 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10178 dout(20) << __func__ << " will do buffered read" << dendl;
10179 buffered = true;
10180 } else if (cct->_conf->bluestore_default_buffered_read &&
10181 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10182 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10183 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10184 buffered = true;
10185 }
10186
10187 if (offset + length > o->onode.size) {
10188 length = o->onode.size - offset;
10189 }
10190
10191 auto start = mono_clock::now();
10192 o->extent_map.fault_range(db, offset, length);
10193 log_latency(__func__,
10194 l_bluestore_read_onode_meta_lat,
10195 mono_clock::now() - start,
10196 cct->_conf->bluestore_log_op_age);
10197 _dump_onode<30>(cct, *o);
10198
10199 // for deep-scrub, we only read the dirty cache and bypass the clean cache in
10200 // order to read the underlying block device in case there are silent disk errors.
10201 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
10202 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
10203 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
10204 }
10205
10206 // build a blob-wise list of stuff to read (that isn't cached)
10207 ready_regions_t ready_regions;
10208 blobs2read_t blobs2read;
10209 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
10210
10211
10212 // read raw blob data.
10213 start = mono_clock::now(); // for the sake of simplicity,
10214 // measure the whole block below;
10215 // the measurement error is negligible.
10216 vector<bufferlist> compressed_blob_bls;
10217 IOContext ioc(cct, NULL, true); // allow EIO
10218 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
10219 // we always issue aio for reading, so errors other than EIO are not allowed
10220 if (r < 0)
10221 return r;
10222
10223 int64_t num_ios = length;
10224 if (ioc.has_pending_aios()) {
10225 num_ios = -ioc.get_num_ios();
10226 bdev->aio_submit(&ioc);
10227 dout(20) << __func__ << " waiting for aio" << dendl;
10228 ioc.aio_wait();
10229 r = ioc.get_return_value();
10230 if (r < 0) {
10231 ceph_assert(r == -EIO); // no other errors allowed
10232 return -EIO;
10233 }
10234 }
10235 log_latency_fn(__func__,
10236 l_bluestore_read_wait_aio_lat,
10237 mono_clock::now() - start,
10238 cct->_conf->bluestore_log_op_age,
10239 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10240 );
10241
10242 bool csum_error = false;
10243 r = _generate_read_result_bl(o, offset, length, ready_regions,
10244 compressed_blob_bls, blobs2read,
10245 buffered, &csum_error, bl);
10246 if (csum_error) {
10247 // Handles spurious read errors caused by a kernel bug.
10248 // We sometimes get all-zero pages as a result of the read under
10249 // high memory pressure. Retrying the failing read succeeds in most
10250 // cases.
10251 // See also: http://tracker.ceph.com/issues/22464
10252 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10253 return -EIO;
10254 }
10255 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
10256 }
10257 r = bl.length();
10258 if (retry_count) {
10259 logger->inc(l_bluestore_reads_with_retries);
10260 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
10261 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
10262 }
10263 return r;
10264 }
10265
10266 int BlueStore::_verify_csum(OnodeRef& o,
10267 const bluestore_blob_t* blob, uint64_t blob_xoffset,
10268 const bufferlist& bl,
10269 uint64_t logical_offset) const
10270 {
10271 int bad;
10272 uint64_t bad_csum;
10273 auto start = mono_clock::now();
10274 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
10275 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
10276 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
10277 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
10278 bad = blob_xoffset;
10279 r = -1;
10280 bad_csum = 0xDEADBEEF;
10281 }
10282 if (r < 0) {
10283 if (r == -1) {
10284 PExtentVector pex;
10285 blob->map(
10286 bad,
10287 blob->get_csum_chunk_size(),
10288 [&](uint64_t offset, uint64_t length) {
10289 pex.emplace_back(bluestore_pextent_t(offset, length));
10290 return 0;
10291 });
10292 derr << __func__ << " bad "
10293 << Checksummer::get_csum_type_string(blob->csum_type)
10294 << "/0x" << std::hex << blob->get_csum_chunk_size()
10295 << " checksum at blob offset 0x" << bad
10296 << ", got 0x" << bad_csum << ", expected 0x"
10297 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
10298 << ", device location " << pex
10299 << ", logical extent 0x" << std::hex
10300 << (logical_offset + bad - blob_xoffset) << "~"
10301 << blob->get_csum_chunk_size() << std::dec
10302 << ", object " << o->oid
10303 << dendl;
10304 } else {
10305 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
10306 }
10307 }
10308 log_latency(__func__,
10309 l_bluestore_csum_lat,
10310 mono_clock::now() - start,
10311 cct->_conf->bluestore_log_op_age);
10312 if (cct->_conf->bluestore_ignore_data_csum) {
10313 return 0;
10314 }
10315 return r;
10316 }
10317
10318 int BlueStore::_decompress(bufferlist& source, bufferlist* result)
10319 {
10320 int r = 0;
10321 auto start = mono_clock::now();
10322 auto i = source.cbegin();
10323 bluestore_compression_header_t chdr;
10324 decode(chdr, i);
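// the compressed blob payload starts with a bluestore_compression_header_t,
// which records the algorithm (chdr.type) and the payload length that is
// handed to the decompressor below (chdr.length)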
10325 int alg = int(chdr.type);
10326 CompressorRef cp = compressor;
10327 if (!cp || (int)cp->get_type() != alg) {
10328 cp = Compressor::create(cct, alg);
10329 }
10330
10331 if (!cp.get()) {
10332 // if the compressor isn't available, return an error: we cannot
10333 // produce the decompressed data.
10334
10335 const char* alg_name = Compressor::get_comp_alg_name(alg);
10336 derr << __func__ << " can't load decompressor " << alg_name << dendl;
10337 _set_compression_alert(false, alg_name);
10338 r = -EIO;
10339 } else {
10340 r = cp->decompress(i, chdr.length, *result);
10341 if (r < 0) {
10342 derr << __func__ << " decompression failed with exit code " << r << dendl;
10343 r = -EIO;
10344 }
10345 }
10346 log_latency(__func__,
10347 l_bluestore_decompress_lat,
10348 mono_clock::now() - start,
10349 cct->_conf->bluestore_log_op_age);
10350 return r;
10351 }
10352
10353 // this stores fiemap into interval_set, other variations
10354 // use it internally
10355 int BlueStore::_fiemap(
10356 CollectionHandle &c_,
10357 const ghobject_t& oid,
10358 uint64_t offset,
10359 size_t length,
10360 interval_set<uint64_t>& destset)
10361 {
10362 Collection *c = static_cast<Collection *>(c_.get());
10363 if (!c->exists)
10364 return -ENOENT;
10365 {
10366 std::shared_lock l(c->lock);
10367
10368 OnodeRef o = c->get_onode(oid, false);
10369 if (!o || !o->exists) {
10370 return -ENOENT;
10371 }
10372 _dump_onode<30>(cct, *o);
10373
10374 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10375 << " size 0x" << o->onode.size << std::dec << dendl;
10376
10377 boost::intrusive::set<Extent>::iterator ep, eend;
10378 if (offset >= o->onode.size)
10379 goto out;
10380
10381 if (offset + length > o->onode.size) {
10382 length = o->onode.size - offset;
10383 }
10384
10385 o->extent_map.fault_range(db, offset, length);
10386 eend = o->extent_map.extent_map.end();
10387 ep = o->extent_map.seek_lextent(offset);
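// walk lextents in logical order: ranges backed by an extent are added to
// destset, while holes (ranges with no backing extent) are simply skipped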
10388 while (length > 0) {
10389 dout(20) << __func__ << " offset " << offset << dendl;
10390 if (ep != eend && ep->logical_offset + ep->length <= offset) {
10391 ++ep;
10392 continue;
10393 }
10394
10395 uint64_t x_len = length;
10396 if (ep != eend && ep->logical_offset <= offset) {
10397 uint64_t x_off = offset - ep->logical_offset;
10398 x_len = std::min(x_len, ep->length - x_off);
10399 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
10400 << x_len << std::dec << " blob " << ep->blob << dendl;
10401 destset.insert(offset, x_len);
10402 length -= x_len;
10403 offset += x_len;
10404 if (x_off + x_len == ep->length)
10405 ++ep;
10406 continue;
10407 }
10408 if (ep != eend &&
10409 ep->logical_offset > offset &&
10410 ep->logical_offset - offset < x_len) {
10411 x_len = ep->logical_offset - offset;
10412 }
10413 offset += x_len;
10414 length -= x_len;
10415 }
10416 }
10417
10418 out:
10419 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10420 << " size = 0x(" << destset << ")" << std::dec << dendl;
10421 return 0;
10422 }
10423
10424 int BlueStore::fiemap(
10425 CollectionHandle &c_,
10426 const ghobject_t& oid,
10427 uint64_t offset,
10428 size_t length,
10429 bufferlist& bl)
10430 {
10431 interval_set<uint64_t> m;
10432 int r = _fiemap(c_, oid, offset, length, m);
10433 if (r >= 0) {
10434 encode(m, bl);
10435 }
10436 return r;
10437 }
10438
10439 int BlueStore::fiemap(
10440 CollectionHandle &c_,
10441 const ghobject_t& oid,
10442 uint64_t offset,
10443 size_t length,
10444 map<uint64_t, uint64_t>& destmap)
10445 {
10446 interval_set<uint64_t> m;
10447 int r = _fiemap(c_, oid, offset, length, m);
10448 if (r >= 0) {
10449 destmap = std::move(m).detach();
10450 }
10451 return r;
10452 }
10453
10454 int BlueStore::readv(
10455 CollectionHandle &c_,
10456 const ghobject_t& oid,
10457 interval_set<uint64_t>& m,
10458 bufferlist& bl,
10459 uint32_t op_flags)
10460 {
10461 auto start = mono_clock::now();
10462 Collection *c = static_cast<Collection *>(c_.get());
10463 const coll_t &cid = c->get_cid();
10464 dout(15) << __func__ << " " << cid << " " << oid
10465 << " fiemap " << m
10466 << dendl;
10467 if (!c->exists)
10468 return -ENOENT;
10469
10470 bl.clear();
10471 int r;
10472 {
10473 std::shared_lock l(c->lock);
10474 auto start1 = mono_clock::now();
10475 OnodeRef o = c->get_onode(oid, false);
10476 log_latency("get_onode@read",
10477 l_bluestore_read_onode_meta_lat,
10478 mono_clock::now() - start1,
10479 cct->_conf->bluestore_log_op_age);
10480 if (!o || !o->exists) {
10481 r = -ENOENT;
10482 goto out;
10483 }
10484
10485 if (m.empty()) {
10486 r = 0;
10487 goto out;
10488 }
10489
10490 r = _do_readv(c, o, m, bl, op_flags);
10491 if (r == -EIO) {
10492 logger->inc(l_bluestore_read_eio);
10493 }
10494 }
10495
10496 out:
10497 if (r >= 0 && _debug_data_eio(oid)) {
10498 r = -EIO;
10499 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10500 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10501 cct->_conf->bluestore_debug_random_read_err &&
10502 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10503 100.0)) == 0) {
10504 dout(0) << __func__ << ": inject random EIO" << dendl;
10505 r = -EIO;
10506 }
10507 dout(10) << __func__ << " " << cid << " " << oid
10508 << " fiemap " << m << std::dec
10509 << " = " << r << dendl;
10510 log_latency(__func__,
10511 l_bluestore_read_lat,
10512 mono_clock::now() - start,
10513 cct->_conf->bluestore_log_op_age);
10514 return r;
10515 }
10516
10517 int BlueStore::_do_readv(
10518 Collection *c,
10519 OnodeRef o,
10520 const interval_set<uint64_t>& m,
10521 bufferlist& bl,
10522 uint32_t op_flags,
10523 uint64_t retry_count)
10524 {
10525 FUNCTRACE(cct);
10526 int r = 0;
10527 int read_cache_policy = 0; // do not bypass clean or dirty cache
10528
10529 dout(20) << __func__ << " fiemap " << m << std::hex
10530 << " size 0x" << o->onode.size << " (" << std::dec
10531 << o->onode.size << ")" << dendl;
10532
10533 // generally, don't buffer anything, unless the client explicitly requests
10534 // it.
10535 bool buffered = false;
10536 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10537 dout(20) << __func__ << " will do buffered read" << dendl;
10538 buffered = true;
10539 } else if (cct->_conf->bluestore_default_buffered_read &&
10540 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10541 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10542 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10543 buffered = true;
10544 }
10545 // this method must be idempotent since we may call it several times
10546 // before we finally read the expected result.
10547 bl.clear();
10548
10549 // call fiemap first!
10550 ceph_assert(m.range_start() <= o->onode.size);
10551 ceph_assert(m.range_end() <= o->onode.size);
10552 auto start = mono_clock::now();
10553 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
10554 log_latency(__func__,
10555 l_bluestore_read_onode_meta_lat,
10556 mono_clock::now() - start,
10557 cct->_conf->bluestore_log_op_age);
10558 _dump_onode<30>(cct, *o);
10559
10560 IOContext ioc(cct, NULL, true); // allow EIO
10561 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
10562 raw_results.reserve(m.num_intervals());
10563 int i = 0;
10564 for (auto p = m.begin(); p != m.end(); p++, i++) {
10565 raw_results.push_back({});
10566 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
10567 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
10568 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
10569 // we always issue aio for reading, so errors other than EIO are not allowed
10570 if (r < 0)
10571 return r;
10572 }
10573
10574 auto num_ios = m.size();
10575 if (ioc.has_pending_aios()) {
10576 num_ios = ioc.get_num_ios();
10577 bdev->aio_submit(&ioc);
10578 dout(20) << __func__ << " waiting for aio" << dendl;
10579 ioc.aio_wait();
10580 r = ioc.get_return_value();
10581 if (r < 0) {
10582 ceph_assert(r == -EIO); // no other errors allowed
10583 return -EIO;
10584 }
10585 }
10586 log_latency_fn(__func__,
10587 l_bluestore_read_wait_aio_lat,
10588 mono_clock::now() - start,
10589 cct->_conf->bluestore_log_op_age,
10590 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10591 );
10592
10593 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
10594 i = 0;
10595 for (auto p = m.begin(); p != m.end(); p++, i++) {
10596 bool csum_error = false;
10597 bufferlist t;
10598 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
10599 std::get<0>(raw_results[i]),
10600 std::get<1>(raw_results[i]),
10601 std::get<2>(raw_results[i]),
10602 buffered, &csum_error, t);
10603 if (csum_error) {
10604 // Handles spurious read errors caused by a kernel bug.
10605 // We sometimes get all-zero pages as a result of the read under
10606 // high memory pressure. Retrying the failing read succeeds in most
10607 // cases.
10608 // See also: http://tracker.ceph.com/issues/22464
10609 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10610 return -EIO;
10611 }
10612 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
10613 }
10614 bl.claim_append(t);
10615 }
10616 if (retry_count) {
10617 logger->inc(l_bluestore_reads_with_retries);
10618 dout(5) << __func__ << " read fiemap " << m
10619 << " failed " << retry_count << " times before succeeding"
10620 << dendl;
10621 }
10622 return bl.length();
10623 }
10624
10625 int BlueStore::dump_onode(CollectionHandle &c_,
10626 const ghobject_t& oid,
10627 const string& section_name,
10628 Formatter *f)
10629 {
10630 Collection *c = static_cast<Collection *>(c_.get());
10631 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10632 if (!c->exists)
10633 return -ENOENT;
10634
10635 int r;
10636 {
10637 std::shared_lock l(c->lock);
10638
10639 OnodeRef o = c->get_onode(oid, false);
10640 if (!o || !o->exists) {
10641 r = -ENOENT;
10642 goto out;
10643 }
10644 // FIXME minor: actually the next line isn't enough to
10645 // load shared blobs. Leaving as-is for now.
10646 //
10647 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10648
10649 _dump_onode<0>(cct, *o);
10650 f->open_object_section(section_name.c_str());
10651 o->dump(f);
10652 f->close_section();
10653 r = 0;
10654 }
10655 out:
10656 dout(10) << __func__ << " " << c->cid << " " << oid
10657 << " = " << r << dendl;
10658 return r;
10659 }
10660
10661 int BlueStore::getattr(
10662 CollectionHandle &c_,
10663 const ghobject_t& oid,
10664 const char *name,
10665 bufferptr& value)
10666 {
10667 Collection *c = static_cast<Collection *>(c_.get());
10668 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
10669 if (!c->exists)
10670 return -ENOENT;
10671
10672 int r;
10673 {
10674 std::shared_lock l(c->lock);
10675 mempool::bluestore_cache_meta::string k(name);
10676
10677 OnodeRef o = c->get_onode(oid, false);
10678 if (!o || !o->exists) {
10679 r = -ENOENT;
10680 goto out;
10681 }
10682
10683 if (!o->onode.attrs.count(k)) {
10684 r = -ENODATA;
10685 goto out;
10686 }
10687 value = o->onode.attrs[k];
10688 r = 0;
10689 }
10690 out:
10691 if (r == 0 && _debug_mdata_eio(oid)) {
10692 r = -EIO;
10693 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10694 }
10695 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
10696 << " = " << r << dendl;
10697 return r;
10698 }
10699
10700 int BlueStore::getattrs(
10701 CollectionHandle &c_,
10702 const ghobject_t& oid,
10703 map<string,bufferptr>& aset)
10704 {
10705 Collection *c = static_cast<Collection *>(c_.get());
10706 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10707 if (!c->exists)
10708 return -ENOENT;
10709
10710 int r;
10711 {
10712 std::shared_lock l(c->lock);
10713
10714 OnodeRef o = c->get_onode(oid, false);
10715 if (!o || !o->exists) {
10716 r = -ENOENT;
10717 goto out;
10718 }
10719 for (auto& i : o->onode.attrs) {
10720 aset.emplace(i.first.c_str(), i.second);
10721 }
10722 r = 0;
10723 }
10724
10725 out:
10726 if (r == 0 && _debug_mdata_eio(oid)) {
10727 r = -EIO;
10728 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10729 }
10730 dout(10) << __func__ << " " << c->cid << " " << oid
10731 << " = " << r << dendl;
10732 return r;
10733 }
10734
10735 int BlueStore::list_collections(vector<coll_t>& ls)
10736 {
10737 std::shared_lock l(coll_lock);
10738 ls.reserve(coll_map.size());
10739 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
10740 p != coll_map.end();
10741 ++p)
10742 ls.push_back(p->first);
10743 return 0;
10744 }
10745
10746 bool BlueStore::collection_exists(const coll_t& c)
10747 {
10748 std::shared_lock l(coll_lock);
10749 return coll_map.count(c);
10750 }
10751
10752 int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
10753 {
10754 dout(15) << __func__ << " " << ch->cid << dendl;
10755 vector<ghobject_t> ls;
10756 ghobject_t next;
10757 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
10758 &ls, &next);
10759 if (r < 0) {
10760 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
10761 << dendl;
10762 return r;
10763 }
10764 *empty = ls.empty();
10765 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
10766 return 0;
10767 }
10768
10769 int BlueStore::collection_bits(CollectionHandle& ch)
10770 {
10771 dout(15) << __func__ << " " << ch->cid << dendl;
10772 Collection *c = static_cast<Collection*>(ch.get());
10773 std::shared_lock l(c->lock);
10774 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
10775 return c->cnode.bits;
10776 }
10777
10778 int BlueStore::collection_list(
10779 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10780 vector<ghobject_t> *ls, ghobject_t *pnext)
10781 {
10782 Collection *c = static_cast<Collection *>(c_.get());
10783 c->flush();
10784 dout(15) << __func__ << " " << c->cid
10785 << " start " << start << " end " << end << " max " << max << dendl;
10786 int r;
10787 {
10788 std::shared_lock l(c->lock);
10789 r = _collection_list(c, start, end, max, false, ls, pnext);
10790 }
10791
10792 dout(10) << __func__ << " " << c->cid
10793 << " start " << start << " end " << end << " max " << max
10794 << " = " << r << ", ls.size() = " << ls->size()
10795 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10796 return r;
10797 }
10798
10799 int BlueStore::collection_list_legacy(
10800 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10801 vector<ghobject_t> *ls, ghobject_t *pnext)
10802 {
10803 Collection *c = static_cast<Collection *>(c_.get());
10804 c->flush();
10805 dout(15) << __func__ << " " << c->cid
10806 << " start " << start << " end " << end << " max " << max << dendl;
10807 int r;
10808 {
10809 std::shared_lock l(c->lock);
10810 r = _collection_list(c, start, end, max, true, ls, pnext);
10811 }
10812
10813 dout(10) << __func__ << " " << c->cid
10814 << " start " << start << " end " << end << " max " << max
10815 << " = " << r << ", ls.size() = " << ls->size()
10816 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10817 return r;
10818 }
10819
10820 int BlueStore::_collection_list(
10821 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
10822 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
10823 {
10824
10825 if (!c->exists)
10826 return -ENOENT;
10827
10828 auto start_time = mono_clock::now();
10829 int r = 0;
10830 ghobject_t static_next;
10831 std::unique_ptr<CollectionListIterator> it;
10832 ghobject_t coll_range_temp_start, coll_range_temp_end;
10833 ghobject_t coll_range_start, coll_range_end;
10834 bool set_next = false;
10835 ghobject_t pend;
10836 bool temp;
10837
10838 if (!pnext)
10839 pnext = &static_next;
10840
10841 if (start.is_max() || start.hobj.is_max()) {
10842 goto out;
10843 }
10844 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
10845 &coll_range_temp_end, &coll_range_start, &coll_range_end);
10846 dout(20) << __func__
10847 << " range " << coll_range_temp_start
10848 << " to " << coll_range_temp_end
10849 << " and " << coll_range_start
10850 << " to " << coll_range_end
10851 << " start " << start << dendl;
10852 if (legacy) {
10853 it = std::make_unique<SimpleCollectionListIterator>(
10854 cct, db->get_iterator(PREFIX_OBJ));
10855 } else {
10856 it = std::make_unique<SortedCollectionListIterator>(
10857 db->get_iterator(PREFIX_OBJ));
10858 }
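// listing walks the temp namespace of the collection first and then the
// non-temp namespace; pend is adjusted below when crossing between them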
10859 if (start == ghobject_t() ||
10860 start.hobj == hobject_t() ||
10861 start == c->cid.get_min_hobj()) {
10862 it->upper_bound(coll_range_temp_start);
10863 temp = true;
10864 } else {
10865 if (start.hobj.is_temp()) {
10866 temp = true;
10867 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
10868 } else {
10869 temp = false;
10870 ceph_assert(start >= coll_range_start && start < coll_range_end);
10871 }
10872 dout(20) << __func__ << " temp=" << (int)temp << dendl;
10873 it->lower_bound(start);
10874 }
10875 if (end.hobj.is_max()) {
10876 pend = temp ? coll_range_temp_end : coll_range_end;
10877 } else {
10878 if (end.hobj.is_temp()) {
10879 if (temp)
10880 pend = end;
10881 else
10882 goto out;
10883 } else {
10884 pend = temp ? coll_range_temp_end : end;
10885 }
10886 }
10887 dout(20) << __func__ << " pend " << pend << dendl;
10888 while (true) {
10889 if (!it->valid() || it->is_ge(pend)) {
10890 if (!it->valid())
10891 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
10892 else
10893 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
10894 if (temp) {
10895 if (end.hobj.is_temp()) {
10896 if (it->valid() && it->is_lt(coll_range_temp_end)) {
10897 *pnext = it->oid();
10898 set_next = true;
10899 }
10900 break;
10901 }
10902 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
10903 temp = false;
10904 it->upper_bound(coll_range_start);
10905 if (end.hobj.is_max())
10906 pend = coll_range_end;
10907 else
10908 pend = end;
10909 dout(30) << __func__ << " pend " << pend << dendl;
10910 continue;
10911 }
10912 if (it->valid() && it->is_lt(coll_range_end)) {
10913 *pnext = it->oid();
10914 set_next = true;
10915 }
10916 break;
10917 }
10918 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
10919 if (ls->size() >= (unsigned)max) {
10920 dout(20) << __func__ << " reached max " << max << dendl;
10921 *pnext = it->oid();
10922 set_next = true;
10923 break;
10924 }
10925 ls->push_back(it->oid());
10926 it->next();
10927 }
10928 out:
10929 if (!set_next) {
10930 *pnext = ghobject_t::get_max();
10931 }
10932 log_latency_fn(
10933 __func__,
10934 l_bluestore_clist_lat,
10935 mono_clock::now() - start_time,
10936 cct->_conf->bluestore_log_collection_list_age,
10937 [&] (const ceph::timespan& lat) {
10938 ostringstream ostr;
10939 ostr << ", lat = " << timespan_str(lat)
10940 << " cid =" << c->cid
10941 << " start " << start << " end " << end
10942 << " max " << max;
10943 return ostr.str();
10944 }
10945 );
10946 return r;
10947 }
10948
10949 int BlueStore::omap_get(
10950 CollectionHandle &c_, ///< [in] Collection containing oid
10951 const ghobject_t &oid, ///< [in] Object containing omap
10952 bufferlist *header, ///< [out] omap header
10953 map<string, bufferlist> *out ///< [out] Key to value map
10954 )
10955 {
10956 Collection *c = static_cast<Collection *>(c_.get());
10957 return _omap_get(c, oid, header, out);
10958 }
10959
10960 int BlueStore::_omap_get(
10961 Collection *c, ///< [in] Collection containing oid
10962 const ghobject_t &oid, ///< [in] Object containing omap
10963 bufferlist *header, ///< [out] omap header
10964 map<string, bufferlist> *out ///< [out] Key to value map
10965 )
10966 {
10967 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10968 if (!c->exists)
10969 return -ENOENT;
10970 std::shared_lock l(c->lock);
10971 int r = 0;
10972 OnodeRef o = c->get_onode(oid, false);
10973 if (!o || !o->exists) {
10974 r = -ENOENT;
10975 goto out;
10976 }
10977 r = _onode_omap_get(o, header, out);
10978 out:
10979 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10980 << dendl;
10981 return r;
10982 }
10983
10984 int BlueStore::_onode_omap_get(
10985 const OnodeRef &o, ///< [in] Object containing omap
10986 bufferlist *header, ///< [out] omap header
10987 map<string, bufferlist> *out ///< [out] Key to value map
10988 )
10989 {
10990 int r = 0;
10991 if (!o || !o->exists) {
10992 r = -ENOENT;
10993 goto out;
10994 }
10995 if (!o->onode.has_omap())
10996 goto out;
10997 o->flush();
10998 {
10999 const string& prefix = o->get_omap_prefix();
11000 KeyValueDB::Iterator it = db->get_iterator(prefix);
11001 string head, tail;
11002 o->get_omap_header(&head);
11003 o->get_omap_tail(&tail);
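// omap rows for this onode are laid out as: a header key, then one row per
// user omap key, then a tail sentinel key; the iteration below starts at the
// header and stops when it reaches the tail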
11004 it->lower_bound(head);
11005 while (it->valid()) {
11006 if (it->key() == head) {
11007 dout(30) << __func__ << " got header" << dendl;
11008 *header = it->value();
11009 } else if (it->key() >= tail) {
11010 dout(30) << __func__ << " reached tail" << dendl;
11011 break;
11012 } else {
11013 string user_key;
11014 o->decode_omap_key(it->key(), &user_key);
11015 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
11016 << " -> " << user_key << dendl;
11017 (*out)[user_key] = it->value();
11018 }
11019 it->next();
11020 }
11021 }
11022 out:
11023 return r;
11024 }
11025
11026 int BlueStore::omap_get_header(
11027 CollectionHandle &c_, ///< [in] Collection containing oid
11028 const ghobject_t &oid, ///< [in] Object containing omap
11029 bufferlist *header, ///< [out] omap header
11030 bool allow_eio ///< [in] don't assert on eio
11031 )
11032 {
11033 Collection *c = static_cast<Collection *>(c_.get());
11034 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11035 if (!c->exists)
11036 return -ENOENT;
11037 std::shared_lock l(c->lock);
11038 int r = 0;
11039 OnodeRef o = c->get_onode(oid, false);
11040 if (!o || !o->exists) {
11041 r = -ENOENT;
11042 goto out;
11043 }
11044 if (!o->onode.has_omap())
11045 goto out;
11046 o->flush();
11047 {
11048 string head;
11049 o->get_omap_header(&head);
11050 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
11051 dout(30) << __func__ << " got header" << dendl;
11052 } else {
11053 dout(30) << __func__ << " no header" << dendl;
11054 }
11055 }
11056 out:
11057 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11058 << dendl;
11059 return r;
11060 }
11061
11062 int BlueStore::omap_get_keys(
11063 CollectionHandle &c_, ///< [in] Collection containing oid
11064 const ghobject_t &oid, ///< [in] Object containing omap
11065 set<string> *keys ///< [out] Keys defined on oid
11066 )
11067 {
11068 Collection *c = static_cast<Collection *>(c_.get());
11069 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11070 if (!c->exists)
11071 return -ENOENT;
11072 auto start1 = mono_clock::now();
11073 std::shared_lock l(c->lock);
11074 int r = 0;
11075 OnodeRef o = c->get_onode(oid, false);
11076 if (!o || !o->exists) {
11077 r = -ENOENT;
11078 goto out;
11079 }
11080 if (!o->onode.has_omap())
11081 goto out;
11082 o->flush();
11083 {
11084 const string& prefix = o->get_omap_prefix();
11085 KeyValueDB::Iterator it = db->get_iterator(prefix);
11086 string head, tail;
11087 o->get_omap_key(string(), &head);
11088 o->get_omap_tail(&tail);
11089 it->lower_bound(head);
11090 while (it->valid()) {
11091 if (it->key() >= tail) {
11092 dout(30) << __func__ << " reached tail" << dendl;
11093 break;
11094 }
11095 string user_key;
11096 o->decode_omap_key(it->key(), &user_key);
11097 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
11098 << " -> " << user_key << dendl;
11099 keys->insert(user_key);
11100 it->next();
11101 }
11102 }
11103 out:
11104 c->store->log_latency(
11105 __func__,
11106 l_bluestore_omap_get_keys_lat,
11107 mono_clock::now() - start1,
11108 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11109
11110 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11111 << dendl;
11112 return r;
11113 }
11114
11115 int BlueStore::omap_get_values(
11116 CollectionHandle &c_, ///< [in] Collection containing oid
11117 const ghobject_t &oid, ///< [in] Object containing omap
11118 const set<string> &keys, ///< [in] Keys to get
11119 map<string, bufferlist> *out ///< [out] Returned keys and values
11120 )
11121 {
11122 Collection *c = static_cast<Collection *>(c_.get());
11123 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11124 if (!c->exists)
11125 return -ENOENT;
11126 std::shared_lock l(c->lock);
11127 auto start1 = mono_clock::now();
11128 int r = 0;
11129 string final_key;
11130 OnodeRef o = c->get_onode(oid, false);
11131 if (!o || !o->exists) {
11132 r = -ENOENT;
11133 goto out;
11134 }
11135 if (!o->onode.has_omap()) {
11136 goto out;
11137 }
11138 o->flush();
11139 {
11140 const string& prefix = o->get_omap_prefix();
11141 o->get_omap_key(string(), &final_key);
11142 size_t base_key_len = final_key.size();
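// final_key currently holds just this onode's omap key prefix; each loop
// iteration truncates back to that prefix and appends the user key, e.g.
// (illustrative, hypothetical key) prefix + "snapset" -> one point lookup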
11143 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
11144 final_key.resize(base_key_len); // keep prefix
11145 final_key += *p;
11146 bufferlist val;
11147 if (db->get(prefix, final_key, &val) >= 0) {
11148 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
11149 << " -> " << *p << dendl;
11150 out->insert(make_pair(*p, val));
11151 }
11152 }
11153 }
11154 out:
11155 c->store->log_latency(
11156 __func__,
11157 l_bluestore_omap_get_values_lat,
11158 mono_clock::now() - start1,
11159 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11160
11161 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11162 << dendl;
11163 return r;
11164 }
11165
11166 #ifdef WITH_SEASTAR
11167 int BlueStore::omap_get_values(
11168 CollectionHandle &c_, ///< [in] Collection containing oid
11169 const ghobject_t &oid, ///< [in] Object containing omap
11170 const std::optional<string> &start_after, ///< [in] Keys to get
11171 map<string, bufferlist> *output ///< [out] Returned keys and values
11172 )
11173 {
11174 Collection *c = static_cast<Collection *>(c_.get());
11175 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11176 if (!c->exists)
11177 return -ENOENT;
11178 std::shared_lock l(c->lock);
11179 int r = 0;
11180 OnodeRef o = c->get_onode(oid, false);
11181 if (!o || !o->exists) {
11182 r = -ENOENT;
11183 goto out;
11184 }
11185 if (!o->onode.has_omap()) {
11186 goto out;
11187 }
11188 o->flush();
11189 {
11190 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
11191 if (!iter) {
11192 r = -ENOENT;
11193 goto out;
11194 }
11195 iter->upper_bound(*start_after);
11196 for (; iter->valid(); iter->next()) {
11197 output->insert(make_pair(iter->key(), iter->value()));
11198 }
11199 }
11200
11201 out:
11202 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11203 << dendl;
11204 return r;
11205 }
11206 #endif
11207
11208 int BlueStore::omap_check_keys(
11209 CollectionHandle &c_, ///< [in] Collection containing oid
11210 const ghobject_t &oid, ///< [in] Object containing omap
11211 const set<string> &keys, ///< [in] Keys to check
11212 set<string> *out ///< [out] Subset of keys defined on oid
11213 )
11214 {
11215 Collection *c = static_cast<Collection *>(c_.get());
11216 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11217 if (!c->exists)
11218 return -ENOENT;
11219 std::shared_lock l(c->lock);
11220 int r = 0;
11221 string final_key;
11222 OnodeRef o = c->get_onode(oid, false);
11223 if (!o || !o->exists) {
11224 r = -ENOENT;
11225 goto out;
11226 }
11227 if (!o->onode.has_omap()) {
11228 goto out;
11229 }
11230 o->flush();
11231 {
11232 const string& prefix = o->get_omap_prefix();
11233 o->get_omap_key(string(), &final_key);
11234 size_t base_key_len = final_key.size();
11235 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
11236 final_key.resize(base_key_len); // keep prefix
11237 final_key += *p;
11238 bufferlist val;
11239 if (db->get(prefix, final_key, &val) >= 0) {
11240 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
11241 << " -> " << *p << dendl;
11242 out->insert(*p);
11243 } else {
11244 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
11245 << " -> " << *p << dendl;
11246 }
11247 }
11248 }
11249 out:
11250 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11251 << dendl;
11252 return r;
11253 }
11254
11255 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
11256 CollectionHandle &c_, ///< [in] collection
11257 const ghobject_t &oid ///< [in] object
11258 )
11259 {
11260 Collection *c = static_cast<Collection *>(c_.get());
11261 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
11262 if (!c->exists) {
11263 return ObjectMap::ObjectMapIterator();
11264 }
11265 std::shared_lock l(c->lock);
11266 OnodeRef o = c->get_onode(oid, false);
11267 if (!o || !o->exists) {
11268 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
11269 return ObjectMap::ObjectMapIterator();
11270 }
11271 o->flush();
11272 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
11273 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
11274 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
11275 }
11276
11277 // -----------------
11278 // write helpers
11279
11280 uint64_t BlueStore::_get_ondisk_reserved() const {
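  // illustrative: with SUPER_RESERVED = 8192 and min_alloc_size = 4096 this
  // reserves 8192 bytes; with min_alloc_size = 0x10000 it grows to 0x10000,
  // since the result is always rounded up to a whole allocation unit.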
11281 return round_up_to(
11282 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
11283 }
11284
11285 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
11286 {
11287 dout(10) << __func__ << " ondisk_format " << ondisk_format
11288 << " min_compat_ondisk_format " << min_compat_ondisk_format
11289 << dendl;
11290 ceph_assert(ondisk_format == latest_ondisk_format);
11291 {
11292 bufferlist bl;
11293 encode(ondisk_format, bl);
11294 t->set(PREFIX_SUPER, "ondisk_format", bl);
11295 }
11296 {
11297 bufferlist bl;
11298 encode(min_compat_ondisk_format, bl);
11299 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
11300 }
11301 }
11302
11303 int BlueStore::_open_super_meta()
11304 {
11305 // nid
11306 {
11307 nid_max = 0;
11308 bufferlist bl;
11309 db->get(PREFIX_SUPER, "nid_max", &bl);
11310 auto p = bl.cbegin();
11311 try {
11312 uint64_t v;
11313 decode(v, p);
11314 nid_max = v;
11315 } catch (buffer::error& e) {
11316 derr << __func__ << " unable to read nid_max" << dendl;
11317 return -EIO;
11318 }
11319 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
11320 nid_last = nid_max.load();
11321 }
11322
11323 // blobid
11324 {
11325 blobid_max = 0;
11326 bufferlist bl;
11327 db->get(PREFIX_SUPER, "blobid_max", &bl);
11328 auto p = bl.cbegin();
11329 try {
11330 uint64_t v;
11331 decode(v, p);
11332 blobid_max = v;
11333 } catch (buffer::error& e) {
11334 derr << __func__ << " unable to read blobid_max" << dendl;
11335 return -EIO;
11336 }
11337 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
11338 blobid_last = blobid_max.load();
11339 }
11340
11341 // freelist
11342 {
11343 bufferlist bl;
11344 db->get(PREFIX_SUPER, "freelist_type", &bl);
11345 if (bl.length()) {
11346 freelist_type = std::string(bl.c_str(), bl.length());
11347 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
11348 } else {
11349 ceph_abort_msg("extent freelist manager is not supported");
11350 }
11351 }
11352
11353 // ondisk format
11354 int32_t compat_ondisk_format = 0;
11355 {
11356 bufferlist bl;
11357 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
11358 if (r < 0) {
11359 // base case: kraken bluestore is v1 and readable by v1
11360 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
11361 << dendl;
11362 ondisk_format = 1;
11363 compat_ondisk_format = 1;
11364 } else {
11365 auto p = bl.cbegin();
11366 try {
11367 decode(ondisk_format, p);
11368 } catch (buffer::error& e) {
11369 derr << __func__ << " unable to read ondisk_format" << dendl;
11370 return -EIO;
11371 }
11372 bl.clear();
11373 {
11374 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11375 ceph_assert(!r);
11376 auto p = bl.cbegin();
11377 try {
11378 decode(compat_ondisk_format, p);
11379 } catch (buffer::error& e) {
11380 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
11381 return -EIO;
11382 }
11383 }
11384 }
11385 dout(10) << __func__ << " ondisk_format " << ondisk_format
11386 << " compat_ondisk_format " << compat_ondisk_format
11387 << dendl;
11388 }
11389
11390 if (latest_ondisk_format < compat_ondisk_format) {
11391 derr << __func__ << " compat_ondisk_format is "
11392 << compat_ondisk_format << " but we only understand version "
11393 << latest_ondisk_format << dendl;
11394 return -EPERM;
11395 }
11396
11397 {
11398 bufferlist bl;
11399 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11400 auto p = bl.cbegin();
11401 try {
11402 uint64_t val;
11403 decode(val, p);
11404 min_alloc_size = val;
11405 min_alloc_size_order = ctz(val);
11406 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
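      // min_alloc_size must be a power of two for the ctz/shift math above to
      // hold; e.g. 0x1000 gives order 12 and 1u << 12 == 0x1000, whereas a
      // value like 0x1800 would trip this assert.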
11407 } catch (buffer::error& e) {
11408 derr << __func__ << " unable to read min_alloc_size" << dendl;
11409 return -EIO;
11410 }
11411 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11412 << std::dec << dendl;
11413 }
11414
11415 _set_per_pool_omap();
11416
11417 _open_statfs();
11418 _set_alloc_sizes();
11419 _set_throttle_params();
11420
11421 _set_csum();
11422 _set_compression();
11423 _set_blob_size();
11424
11425 _validate_bdev();
11426 return 0;
11427 }
11428
11429 int BlueStore::_upgrade_super()
11430 {
11431 dout(1) << __func__ << " from " << ondisk_format << ", latest "
11432 << latest_ondisk_format << dendl;
11433 if (ondisk_format < latest_ondisk_format) {
11434 ceph_assert(ondisk_format > 0);
11435 ceph_assert(ondisk_format < latest_ondisk_format);
11436
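    // upgrades are applied one version at a time: each block below migrates
    // ondisk_format N to N+1 and then falls through to the next check, so a
    // very old store walks the whole chain (e.g. 1 -> 2 -> 3 -> 4) before the
    // final format is persisted by _prepare_ondisk_format_super below.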
11437 KeyValueDB::Transaction t = db->get_transaction();
11438 if (ondisk_format == 1) {
11439 // changes:
11440 // - super: added ondisk_format
11441 // - super: added min_readable_ondisk_format
11442 // - super: added min_compat_ondisk_format
11443 // - super: added min_alloc_size
11444 // - super: removed min_min_alloc_size
11445 {
11446 bufferlist bl;
11447 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
11448 auto p = bl.cbegin();
11449 try {
11450 uint64_t val;
11451 decode(val, p);
11452 min_alloc_size = val;
11453 } catch (buffer::error& e) {
11454 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
11455 return -EIO;
11456 }
11457 t->set(PREFIX_SUPER, "min_alloc_size", bl);
11458 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
11459 }
11460 ondisk_format = 2;
11461 }
11462 if (ondisk_format == 2) {
11463 // changes:
11464 // - onode has FLAG_PER_POOL_OMAP. Note that we do not know that *all*
11465 // onodes are using the per-pool prefix until a repair is run; at that
11466 // point the per_pool_omap=1 key will be set.
11467 // - super: added per_pool_omap key, which indicates that *all* objects
11468 // are using the new prefix and key format
11469 ondisk_format = 3;
11470 }
11471 if (ondisk_format == 3) {
11472 // changes:
11473 // - FreelistManager keeps meta within bdev label
11474 int r = _write_out_fm_meta(0);
11475 ceph_assert(r == 0);
11476 ondisk_format = 4;
11477 }
11478 // This must be the last operation
11479 _prepare_ondisk_format_super(t);
11480 int r = db->submit_transaction_sync(t);
11481 ceph_assert(r == 0);
11482 }
11483 // done
11484 dout(1) << __func__ << " done" << dendl;
11485 return 0;
11486 }
11487
11488 void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
11489 {
11490 if (o->onode.nid) {
11491 ceph_assert(o->exists);
11492 return;
11493 }
11494 uint64_t nid = ++nid_last;
11495 dout(20) << __func__ << " " << nid << dendl;
11496 o->onode.nid = nid;
11497 txc->last_nid = nid;
11498 o->exists = true;
11499 }
11500
11501 uint64_t BlueStore::_assign_blobid(TransContext *txc)
11502 {
11503 uint64_t bid = ++blobid_last;
11504 dout(20) << __func__ << " " << bid << dendl;
11505 txc->last_blobid = bid;
11506 return bid;
11507 }
11508
11509 void BlueStore::get_db_statistics(Formatter *f)
11510 {
11511 db->get_statistics(f);
11512 }
11513
11514 BlueStore::TransContext *BlueStore::_txc_create(
11515 Collection *c, OpSequencer *osr,
11516 list<Context*> *on_commits)
11517 {
11518 TransContext *txc = new TransContext(cct, c, osr, on_commits);
11519 txc->t = db->get_transaction();
11520 osr->queue_new(txc);
11521 dout(20) << __func__ << " osr " << osr << " = " << txc
11522 << " seq " << txc->seq << dendl;
11523 return txc;
11524 }
11525
11526 void BlueStore::_txc_calc_cost(TransContext *txc)
11527 {
11528 // one "io" for the kv commit
11529 auto ios = 1 + txc->ioc.get_num_ios();
11530 auto cost = throttle_cost_per_io.load();
11531 txc->cost = ios * cost + txc->bytes;
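  // rough illustration (numbers are made up): 2 data aios, 4096 dirty bytes
  // and a per-io cost of 4000 give (1 + 2) * 4000 + 4096 = 16096 throttle
  // units for this txc.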
11532 txc->ios = ios;
11533 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
11534 << ios << " ios * " << cost << " + " << txc->bytes
11535 << " bytes)" << dendl;
11536 }
11537
11538 void BlueStore::_txc_update_store_statfs(TransContext *txc)
11539 {
11540 if (txc->statfs_delta.is_empty())
11541 return;
11542
11543 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
11544 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
11545 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
11546 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
11547 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
11548
11549 bufferlist bl;
11550 txc->statfs_delta.encode(bl);
11551 if (per_pool_stat_collection) {
11552 string key;
11553 get_pool_stat_key(txc->osd_pool_id, &key);
11554 txc->t->merge(PREFIX_STAT, key, bl);
11555
11556 std::lock_guard l(vstatfs_lock);
11557 auto& stats = osd_pools[txc->osd_pool_id];
11558 stats += txc->statfs_delta;
11559
11560 vstatfs += txc->statfs_delta; //non-persistent in this mode
11561
11562 } else {
11563 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
11564
11565 std::lock_guard l(vstatfs_lock);
11566 vstatfs += txc->statfs_delta;
11567 }
11568 txc->statfs_delta.reset();
11569 }
11570
11571 void BlueStore::_txc_state_proc(TransContext *txc)
11572 {
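  // happy-path state progression, roughly (deferred writes take the longer
  // branch):
  //   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED ->
  //   KV_DONE -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP ->] FINISHING -> DONE
  // each case below either advances the state and loops, or returns while it
  // waits on aio completion, the kv sync thread, or deferred submission.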
11573 while (true) {
11574 dout(10) << __func__ << " txc " << txc
11575 << " " << txc->get_state_name() << dendl;
11576 switch (txc->state) {
11577 case TransContext::STATE_PREPARE:
11578 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
11579 if (txc->ioc.has_pending_aios()) {
11580 txc->state = TransContext::STATE_AIO_WAIT;
11581 txc->had_ios = true;
11582 _txc_aio_submit(txc);
11583 return;
11584 }
11585 // ** fall-thru **
11586
11587 case TransContext::STATE_AIO_WAIT:
11588 {
11589 mono_clock::duration lat = throttle.log_state_latency(
11590 *txc, logger, l_bluestore_state_aio_wait_lat);
11591 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11592 dout(0) << __func__ << " slow aio_wait, txc = " << txc
11593 << ", latency = " << lat
11594 << dendl;
11595 }
11596 }
11597
11598 _txc_finish_io(txc); // may trigger blocked txc's too
11599 return;
11600
11601 case TransContext::STATE_IO_DONE:
11602 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
11603 if (txc->had_ios) {
11604 ++txc->osr->txc_with_unstable_io;
11605 }
11606 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
11607 txc->state = TransContext::STATE_KV_QUEUED;
11608 if (cct->_conf->bluestore_sync_submit_transaction) {
11609 if (txc->last_nid >= nid_max ||
11610 txc->last_blobid >= blobid_max) {
11611 dout(20) << __func__
11612 << " last_{nid,blobid} exceeds max, submit via kv thread"
11613 << dendl;
11614 } else if (txc->osr->kv_committing_serially) {
11615 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
11616 << dendl;
11617 // note: this is starvation-prone. once we have a txc in a busy
11618 // sequencer that is committing serially it is possible to keep
11619 // submitting new transactions fast enough that we get stuck doing
11620 // so. the alternative is to block here... fixme?
11621 } else if (txc->osr->txc_with_unstable_io) {
11622 dout(20) << __func__ << " prior txc(s) with unstable ios "
11623 << txc->osr->txc_with_unstable_io.load() << dendl;
11624 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
11625 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
11626 == 0) {
11627 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
11628 << dendl;
11629 } else {
11630 _txc_apply_kv(txc, true);
11631 }
11632 }
11633 {
11634 std::lock_guard l(kv_lock);
11635 kv_queue.push_back(txc);
11636 if (!kv_sync_in_progress) {
11637 kv_sync_in_progress = true;
11638 kv_cond.notify_one();
11639 }
11640 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
11641 kv_queue_unsubmitted.push_back(txc);
11642 ++txc->osr->kv_committing_serially;
11643 }
11644 if (txc->had_ios)
11645 kv_ios++;
11646 kv_throttle_costs += txc->cost;
11647 }
11648 return;
11649 case TransContext::STATE_KV_SUBMITTED:
11650 _txc_committed_kv(txc);
11651 // ** fall-thru **
11652
11653 case TransContext::STATE_KV_DONE:
11654 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
11655 if (txc->deferred_txn) {
11656 txc->state = TransContext::STATE_DEFERRED_QUEUED;
11657 _deferred_queue(txc);
11658 return;
11659 }
11660 txc->state = TransContext::STATE_FINISHING;
11661 break;
11662
11663 case TransContext::STATE_DEFERRED_CLEANUP:
11664 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
11665 txc->state = TransContext::STATE_FINISHING;
11666 // ** fall-thru **
11667
11668 case TransContext::STATE_FINISHING:
11669 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
11670 _txc_finish(txc);
11671 return;
11672
11673 default:
11674 derr << __func__ << " unexpected txc " << txc
11675 << " state " << txc->get_state_name() << dendl;
11676 ceph_abort_msg("unexpected txc state");
11677 return;
11678 }
11679 }
11680 }
11681
11682 void BlueStore::_txc_finish_io(TransContext *txc)
11683 {
11684 dout(20) << __func__ << " " << txc << dendl;
11685
11686 /*
11687 * we need to preserve the order of kv transactions,
11688 * even though aio will complete in any order.
11689 */
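  // mechanism: mark this txc IO_DONE, then walk backwards through the
  // sequencer queue. if an earlier txc is still below IO_DONE we must wait
  // (it will process us when it finishes); otherwise start from the first txc
  // of the contiguous IO_DONE run and push each one through _txc_state_proc
  // in queue order.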
11690
11691 OpSequencer *osr = txc->osr.get();
11692 std::lock_guard l(osr->qlock);
11693 txc->state = TransContext::STATE_IO_DONE;
11694 txc->ioc.release_running_aios();
11695 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
11696 while (p != osr->q.begin()) {
11697 --p;
11698 if (p->state < TransContext::STATE_IO_DONE) {
11699 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
11700 << p->get_state_name() << dendl;
11701 return;
11702 }
11703 if (p->state > TransContext::STATE_IO_DONE) {
11704 ++p;
11705 break;
11706 }
11707 }
11708 do {
11709 _txc_state_proc(&*p++);
11710 } while (p != osr->q.end() &&
11711 p->state == TransContext::STATE_IO_DONE);
11712
11713 if (osr->kv_submitted_waiters) {
11714 osr->qcond.notify_all();
11715 }
11716 }
11717
11718 void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
11719 {
11720 dout(20) << __func__ << " txc " << txc
11721 << " onodes " << txc->onodes
11722 << " shared_blobs " << txc->shared_blobs
11723 << dendl;
11724
11725 // finalize onodes
11726 for (auto o : txc->onodes) {
11727 _record_onode(o, t);
11728 o->flushing_count++;
11729 }
11730
11731 // objects we modified but didn't affect the onode
11732 auto p = txc->modified_objects.begin();
11733 while (p != txc->modified_objects.end()) {
11734 if (txc->onodes.count(*p) == 0) {
11735 (*p)->flushing_count++;
11736 ++p;
11737 } else {
11738 // remove dups with onodes list to avoid problems in _txc_finish
11739 p = txc->modified_objects.erase(p);
11740 }
11741 }
11742
11743 // finalize shared_blobs
11744 for (auto sb : txc->shared_blobs) {
11745 string key;
11746 auto sbid = sb->get_sbid();
11747 get_shared_blob_key(sbid, &key);
11748 if (sb->persistent->empty()) {
11749 dout(20) << __func__ << " shared_blob 0x"
11750 << std::hex << sbid << std::dec
11751 << " is empty" << dendl;
11752 t->rmkey(PREFIX_SHARED_BLOB, key);
11753 } else {
11754 bufferlist bl;
11755 encode(*(sb->persistent), bl);
11756 dout(20) << __func__ << " shared_blob 0x"
11757 << std::hex << sbid << std::dec
11758 << " is " << bl.length() << " " << *sb << dendl;
11759 t->set(PREFIX_SHARED_BLOB, key, bl);
11760 }
11761 }
11762 }
11763
11764 void BlueStore::BSPerfTracker::update_from_perfcounters(
11765 PerfCounters &logger)
11766 {
11767 os_commit_latency_ns.consume_next(
11768 logger.get_tavg_ns(
11769 l_bluestore_commit_lat));
11770 os_apply_latency_ns.consume_next(
11771 logger.get_tavg_ns(
11772 l_bluestore_commit_lat));
11773 }
11774
11775 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
11776 {
11777 dout(20) << __func__ << " txc " << txc << std::hex
11778 << " allocated 0x" << txc->allocated
11779 << " released 0x" << txc->released
11780 << std::dec << dendl;
11781
11782 // We have to handle the case where we allocate *and* deallocate the
11783 // same region in this transaction. The freelist doesn't like that.
11784 // (Actually, the only thing that cares is the BitmapFreelistManager
11785 // debug check. But that's important.)
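  // illustrative example (offset~length): allocating 0x0~0x2000 and releasing
  // 0x1000~0x2000 in the same txc gives an overlap of 0x1000~0x1000, so the
  // freelist only sees an allocate of 0x0~0x1000 and a release of
  // 0x2000~0x1000.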
11786 interval_set<uint64_t> tmp_allocated, tmp_released;
11787 interval_set<uint64_t> *pallocated = &txc->allocated;
11788 interval_set<uint64_t> *preleased = &txc->released;
11789 if (!txc->allocated.empty() && !txc->released.empty()) {
11790 interval_set<uint64_t> overlap;
11791 overlap.intersection_of(txc->allocated, txc->released);
11792 if (!overlap.empty()) {
11793 tmp_allocated = txc->allocated;
11794 tmp_allocated.subtract(overlap);
11795 tmp_released = txc->released;
11796 tmp_released.subtract(overlap);
11797 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
11798 << ", new allocated 0x" << tmp_allocated
11799 << " released 0x" << tmp_released << std::dec
11800 << dendl;
11801 pallocated = &tmp_allocated;
11802 preleased = &tmp_released;
11803 }
11804 }
11805
11806 // update freelist with non-overlap sets
11807 for (interval_set<uint64_t>::iterator p = pallocated->begin();
11808 p != pallocated->end();
11809 ++p) {
11810 fm->allocate(p.get_start(), p.get_len(), t);
11811 }
11812 for (interval_set<uint64_t>::iterator p = preleased->begin();
11813 p != preleased->end();
11814 ++p) {
11815 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
11816 << "~" << p.get_len() << std::dec << dendl;
11817 fm->release(p.get_start(), p.get_len(), t);
11818 }
11819
11820 _txc_update_store_statfs(txc);
11821 }
11822
11823 void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
11824 {
11825 ceph_assert(txc->state == TransContext::STATE_KV_QUEUED);
11826 {
11827 #if defined(WITH_LTTNG)
11828 auto start = mono_clock::now();
11829 #endif
11830
11831 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11832 ceph_assert(r == 0);
11833 txc->state = TransContext::STATE_KV_SUBMITTED;
11834 if (txc->osr->kv_submitted_waiters) {
11835 std::lock_guard l(txc->osr->qlock);
11836 txc->osr->qcond.notify_all();
11837 }
11838
11839 #if defined(WITH_LTTNG)
11840 if (txc->tracing) {
11841 tracepoint(
11842 bluestore,
11843 transaction_kv_submit_latency,
11844 txc->osr->get_sequencer_id(),
11845 txc->seq,
11846 sync_submit_transaction,
11847 ceph::to_seconds<double>(mono_clock::now() - start));
11848 }
11849 #endif
11850 }
11851
11852 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
11853 for (auto& o : *ls) {
11854 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
11855 << dendl;
11856 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11857 std::lock_guard l(o->flush_lock);
11858 o->flush_cond.notify_all();
11859 }
11860 }
11861 }
11862 }
11863
11864 void BlueStore::_txc_committed_kv(TransContext *txc)
11865 {
11866 dout(20) << __func__ << " txc " << txc << dendl;
11867 throttle.complete_kv(*txc);
11868 {
11869 std::lock_guard l(txc->osr->qlock);
11870 txc->state = TransContext::STATE_KV_DONE;
11871 if (txc->ch->commit_queue) {
11872 txc->ch->commit_queue->queue(txc->oncommits);
11873 } else {
11874 finisher.queue(txc->oncommits);
11875 }
11876 }
11877 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
11878 log_latency_fn(
11879 __func__,
11880 l_bluestore_commit_lat,
11881 mono_clock::now() - txc->start,
11882 cct->_conf->bluestore_log_op_age,
11883 [&](auto lat) {
11884 return ", txc = " + stringify(txc);
11885 }
11886 );
11887 }
11888
11889 void BlueStore::_txc_finish(TransContext *txc)
11890 {
11891 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
11892 ceph_assert(txc->state == TransContext::STATE_FINISHING);
11893
11894 for (auto& sb : txc->shared_blobs_written) {
11895 sb->finish_write(txc->seq);
11896 }
11897 txc->shared_blobs_written.clear();
11898
11899 while (!txc->removed_collections.empty()) {
11900 _queue_reap_collection(txc->removed_collections.front());
11901 txc->removed_collections.pop_front();
11902 }
11903
11904 OpSequencerRef osr = txc->osr;
11905 bool empty = false;
11906 bool submit_deferred = false;
11907 OpSequencer::q_list_t releasing_txc;
11908 {
11909 std::lock_guard l(osr->qlock);
11910 txc->state = TransContext::STATE_DONE;
11911 bool notify = false;
11912 while (!osr->q.empty()) {
11913 TransContext *txc = &osr->q.front();
11914 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
11915 << dendl;
11916 if (txc->state != TransContext::STATE_DONE) {
11917 if (txc->state == TransContext::STATE_PREPARE &&
11918 deferred_aggressive) {
11919 // for _osr_drain_preceding()
11920 notify = true;
11921 }
11922 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
11923 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
11924 submit_deferred = true;
11925 }
11926 break;
11927 }
11928
11929 osr->q.pop_front();
11930 releasing_txc.push_back(*txc);
11931 }
11932
11933 if (osr->q.empty()) {
11934 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
11935 empty = true;
11936 }
11937
11938 // only drain()/drain_preceding() need wakeup,
11939 // other cases use kv_submitted_waiters
11940 if (notify || empty) {
11941 osr->qcond.notify_all();
11942 }
11943 }
11944
11945 while (!releasing_txc.empty()) {
11946 // release to allocator only after all preceding txc's have also
11947 // finished any deferred writes that potentially land in these
11948 // blocks
11949 auto txc = &releasing_txc.front();
11950 _txc_release_alloc(txc);
11951 releasing_txc.pop_front();
11952 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
11953 throttle.complete(*txc);
11954 delete txc;
11955 }
11956
11957 if (submit_deferred) {
11958 // we're pinning memory; flush! we could be more fine-grained here but
11959 // i'm not sure it's worth the bother.
11960 deferred_try_submit();
11961 }
11962
11963 if (empty && osr->zombie) {
11964 std::lock_guard l(zombie_osr_lock);
11965 if (zombie_osr_set.erase(osr->cid)) {
11966 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11967 } else {
11968 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
11969 << dendl;
11970 }
11971 }
11972 }
11973
11974 void BlueStore::_txc_release_alloc(TransContext *txc)
11975 {
11976 // it's expected we're called with lazy_release_lock already taken!
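  // three paths for the freed extents (sketch): async discard queues them on
  // the bdev and skips the immediate allocator release (the discard
  // completion path is expected to release them later); sync discard issues
  // the trims inline and then releases; otherwise we simply release to the
  // allocator.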
11977 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
11978 int r = 0;
11979 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
11980 r = bdev->queue_discard(txc->released);
11981 if (r == 0) {
11982 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
11983 << txc->released << std::dec << dendl;
11984 goto out;
11985 }
11986 } else if (cct->_conf->bdev_enable_discard) {
11987 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
11988 bdev->discard(p.get_start(), p.get_len());
11989 }
11990 }
11991 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
11992 << txc->released << std::dec << dendl;
11993 alloc->release(txc->released);
11994 }
11995
11996 out:
11997 txc->allocated.clear();
11998 txc->released.clear();
11999 }
12000
12001 void BlueStore::_osr_attach(Collection *c)
12002 {
12003 // note: caller has RWLock on coll_map
12004 auto q = coll_map.find(c->cid);
12005 if (q != coll_map.end()) {
12006 c->osr = q->second->osr;
12007 ldout(cct, 10) << __func__ << " " << c->cid
12008 << " reusing osr " << c->osr << " from existing coll "
12009 << q->second << dendl;
12010 } else {
12011 std::lock_guard l(zombie_osr_lock);
12012 auto p = zombie_osr_set.find(c->cid);
12013 if (p == zombie_osr_set.end()) {
12014 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
12015 ldout(cct, 10) << __func__ << " " << c->cid
12016 << " fresh osr " << c->osr << dendl;
12017 } else {
12018 c->osr = p->second;
12019 zombie_osr_set.erase(p);
12020 ldout(cct, 10) << __func__ << " " << c->cid
12021 << " resurrecting zombie osr " << c->osr << dendl;
12022 c->osr->zombie = false;
12023 }
12024 }
12025 }
12026
12027 void BlueStore::_osr_register_zombie(OpSequencer *osr)
12028 {
12029 std::lock_guard l(zombie_osr_lock);
12030 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
12031 osr->zombie = true;
12032 auto i = zombie_osr_set.emplace(osr->cid, osr);
12033 // this is either a new insertion or the same osr is already there
12034 ceph_assert(i.second || i.first->second == osr);
12035 }
12036
12037 void BlueStore::_osr_drain_preceding(TransContext *txc)
12038 {
12039 OpSequencer *osr = txc->osr.get();
12040 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
12041 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
12042 {
12043 // submit anything pending
12044 deferred_lock.lock();
12045 if (osr->deferred_pending && !osr->deferred_running) {
12046 _deferred_submit_unlock(osr);
12047 } else {
12048 deferred_lock.unlock();
12049 }
12050 }
12051 {
12052 // wake up any previously finished deferred events
12053 std::lock_guard l(kv_lock);
12054 if (!kv_sync_in_progress) {
12055 kv_sync_in_progress = true;
12056 kv_cond.notify_one();
12057 }
12058 }
12059 osr->drain_preceding(txc);
12060 --deferred_aggressive;
12061 dout(10) << __func__ << " " << osr << " done" << dendl;
12062 }
12063
12064 void BlueStore::_osr_drain(OpSequencer *osr)
12065 {
12066 dout(10) << __func__ << " " << osr << dendl;
12067 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
12068 {
12069 // submit anything pending
12070 deferred_lock.lock();
12071 if (osr->deferred_pending && !osr->deferred_running) {
12072 _deferred_submit_unlock(osr);
12073 } else {
12074 deferred_lock.unlock();
12075 }
12076 }
12077 {
12078 // wake up any previously finished deferred events
12079 std::lock_guard l(kv_lock);
12080 if (!kv_sync_in_progress) {
12081 kv_sync_in_progress = true;
12082 kv_cond.notify_one();
12083 }
12084 }
12085 osr->drain();
12086 --deferred_aggressive;
12087 dout(10) << __func__ << " " << osr << " done" << dendl;
12088 }
12089
12090 void BlueStore::_osr_drain_all()
12091 {
12092 dout(10) << __func__ << dendl;
12093
12094 set<OpSequencerRef> s;
12095 vector<OpSequencerRef> zombies;
12096 {
12097 std::shared_lock l(coll_lock);
12098 for (auto& i : coll_map) {
12099 s.insert(i.second->osr);
12100 }
12101 }
12102 {
12103 std::lock_guard l(zombie_osr_lock);
12104 for (auto& i : zombie_osr_set) {
12105 s.insert(i.second);
12106 zombies.push_back(i.second);
12107 }
12108 }
12109 dout(20) << __func__ << " osr_set " << s << dendl;
12110
12111 ++deferred_aggressive;
12112 {
12113 // submit anything pending
12114 deferred_try_submit();
12115 }
12116 {
12117 // wake up any previously finished deferred events
12118 std::lock_guard l(kv_lock);
12119 kv_cond.notify_one();
12120 }
12121 {
12122 std::lock_guard l(kv_finalize_lock);
12123 kv_finalize_cond.notify_one();
12124 }
12125 for (auto osr : s) {
12126 dout(20) << __func__ << " drain " << osr << dendl;
12127 osr->drain();
12128 }
12129 --deferred_aggressive;
12130
12131 {
12132 std::lock_guard l(zombie_osr_lock);
12133 for (auto& osr : zombies) {
12134 if (zombie_osr_set.erase(osr->cid)) {
12135 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
12136 ceph_assert(osr->q.empty());
12137 } else if (osr->zombie) {
12138 dout(10) << __func__ << " empty zombie osr " << osr
12139 << " already reaped" << dendl;
12140 ceph_assert(osr->q.empty());
12141 } else {
12142 dout(10) << __func__ << " empty zombie osr " << osr
12143 << " resurrected" << dendl;
12144 }
12145 }
12146 }
12147
12148 dout(10) << __func__ << " done" << dendl;
12149 }
12150
12151
12152 void BlueStore::_kv_start()
12153 {
12154 dout(10) << __func__ << dendl;
12155
12156 finisher.start();
12157 kv_sync_thread.create("bstore_kv_sync");
12158 kv_finalize_thread.create("bstore_kv_final");
12159 }
12160
12161 void BlueStore::_kv_stop()
12162 {
12163 dout(10) << __func__ << dendl;
12164 {
12165 std::unique_lock l{kv_lock};
12166 while (!kv_sync_started) {
12167 kv_cond.wait(l);
12168 }
12169 kv_stop = true;
12170 kv_cond.notify_all();
12171 }
12172 {
12173 std::unique_lock l{kv_finalize_lock};
12174 while (!kv_finalize_started) {
12175 kv_finalize_cond.wait(l);
12176 }
12177 kv_finalize_stop = true;
12178 kv_finalize_cond.notify_all();
12179 }
12180 kv_sync_thread.join();
12181 kv_finalize_thread.join();
12182 ceph_assert(removed_collections.empty());
12183 {
12184 std::lock_guard l(kv_lock);
12185 kv_stop = false;
12186 }
12187 {
12188 std::lock_guard l(kv_finalize_lock);
12189 kv_finalize_stop = false;
12190 }
12191 dout(10) << __func__ << " stopping finishers" << dendl;
12192 finisher.wait_for_empty();
12193 finisher.stop();
12194 dout(10) << __func__ << " stopped" << dendl;
12195 }
12196
12197 void BlueStore::_kv_sync_thread()
12198 {
12199 dout(10) << __func__ << " start" << dendl;
12200 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
12201 std::unique_lock l{kv_lock};
12202 ceph_assert(!kv_sync_started);
12203 kv_sync_started = true;
12204 kv_cond.notify_all();
12205
12206 auto t0 = mono_clock::now();
12207 timespan twait = ceph::make_timespan(0);
12208 size_t kv_submitted = 0;
12209
12210 while (true) {
12211 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
12212 auto observation_period =
12213 ceph::make_timespan(period);
12214 auto elapsed = mono_clock::now() - t0;
12215 if (period && elapsed >= observation_period) {
12216 dout(5) << __func__ << " utilization: idle "
12217 << twait << " of " << elapsed
12218 << ", submitted: " << kv_submitted
12219 << dendl;
12220 t0 = mono_clock::now();
12221 twait = ceph::make_timespan(0);
12222 kv_submitted = 0;
12223 }
12224 ceph_assert(kv_committing.empty());
12225 if (kv_queue.empty() &&
12226 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
12227 !deferred_aggressive)) {
12228 if (kv_stop)
12229 break;
12230 dout(20) << __func__ << " sleep" << dendl;
12231 auto t = mono_clock::now();
12232 kv_sync_in_progress = false;
12233 kv_cond.wait(l);
12234 twait += mono_clock::now() - t;
12235
12236 dout(20) << __func__ << " wake" << dendl;
12237 } else {
12238 deque<TransContext*> kv_submitting;
12239 deque<DeferredBatch*> deferred_done, deferred_stable;
12240 uint64_t aios = 0, costs = 0;
12241
12242 dout(20) << __func__ << " committing " << kv_queue.size()
12243 << " submitting " << kv_queue_unsubmitted.size()
12244 << " deferred done " << deferred_done_queue.size()
12245 << " stable " << deferred_stable_queue.size()
12246 << dendl;
12247 kv_committing.swap(kv_queue);
12248 kv_submitting.swap(kv_queue_unsubmitted);
12249 deferred_done.swap(deferred_done_queue);
12250 deferred_stable.swap(deferred_stable_queue);
12251 aios = kv_ios;
12252 costs = kv_throttle_costs;
12253 kv_ios = 0;
12254 kv_throttle_costs = 0;
12255 l.unlock();
12256
12257 dout(30) << __func__ << " committing " << kv_committing << dendl;
12258 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
12259 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
12260 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
12261
12262 auto start = mono_clock::now();
12263
12264 bool force_flush = false;
12265 // if bluefs is sharing the same device as data (only), then we
12266 // can rely on the bluefs commit to flush the device and make
12267 // deferred aios stable. that means that if we do have done deferred
12268 // txcs AND we are not on a single device, we need to force a flush.
12269 if (bluefs && bluefs_layout.single_shared_device()) {
12270 if (aios) {
12271 force_flush = true;
12272 } else if (kv_committing.empty() && deferred_stable.empty()) {
12273 force_flush = true; // there's nothing else to commit!
12274 } else if (deferred_aggressive) {
12275 force_flush = true;
12276 }
12277 } else {
12278 if (aios || !deferred_done.empty()) {
12279 force_flush = true;
12280 } else {
12281 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
12282 }
12283 }
12284
12285 if (force_flush) {
12286 dout(20) << __func__ << " num_aios=" << aios
12287 << " force_flush=" << (int)force_flush
12288 << ", flushing, deferred done->stable" << dendl;
12289 // flush/barrier on block device
12290 bdev->flush();
12291
12292 // if we flush then deferred done are now deferred stable
12293 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
12294 deferred_done.end());
12295 deferred_done.clear();
12296 }
12297 auto after_flush = mono_clock::now();
12298
12299 // we will use one final transaction to force a sync
12300 KeyValueDB::Transaction synct = db->get_transaction();
12301
12302 // increase {nid,blobid}_max? note that this covers both the
12303 // case where we are approaching the max and the case we passed
12304 // it. in either case, we increase the max in the earlier txn
12305 // we submit.
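      // illustrative: with bluestore_nid_prealloc at, say, 1024, once nid_last
      // crosses nid_max - 512 we persist nid_max = nid_last + 1024 in the
      // earliest transaction of this batch, so subsequent txcs can assign nids
      // without waiting for another kv sync cycle.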
12306 uint64_t new_nid_max = 0, new_blobid_max = 0;
12307 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
12308 KeyValueDB::Transaction t =
12309 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12310 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
12311 bufferlist bl;
12312 encode(new_nid_max, bl);
12313 t->set(PREFIX_SUPER, "nid_max", bl);
12314 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
12315 }
12316 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
12317 KeyValueDB::Transaction t =
12318 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12319 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
12320 bufferlist bl;
12321 encode(new_blobid_max, bl);
12322 t->set(PREFIX_SUPER, "blobid_max", bl);
12323 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
12324 }
12325
12326 for (auto txc : kv_committing) {
12327 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
12328 if (txc->state == TransContext::STATE_KV_QUEUED) {
12329 ++kv_submitted;
12330 _txc_apply_kv(txc, false);
12331 --txc->osr->kv_committing_serially;
12332 } else {
12333 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
12334 }
12335 if (txc->had_ios) {
12336 --txc->osr->txc_with_unstable_io;
12337 }
12338 }
12339
12340 // release throttle *before* we commit. this allows new ops
12341 // to be prepared and enter pipeline while we are waiting on
12342 // the kv commit sync/flush. then hopefully on the next
12343 // iteration there will already be ops awake. otherwise, we
12344 // end up going to sleep, and then wake up when the very first
12345 // transaction is ready for commit.
12346 throttle.release_kv_throttle(costs);
12347
12348 if (bluefs &&
12349 after_flush - bluefs_last_balance >
12350 ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
12351 bluefs_last_balance = after_flush;
12352 int r = _balance_bluefs_freespace();
12353 ceph_assert(r >= 0);
12354 }
12355
12356 // cleanup sync deferred keys
12357 for (auto b : deferred_stable) {
12358 for (auto& txc : b->txcs) {
12359 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
12360 ceph_assert(wt.released.empty()); // only kraken did this
12361 string key;
12362 get_deferred_key(wt.seq, &key);
12363 synct->rm_single_key(PREFIX_DEFERRED, key);
12364 }
12365 }
12366
12367 #if defined(WITH_LTTNG)
12368 auto sync_start = mono_clock::now();
12369 #endif
12370 // submit synct synchronously (block and wait for it to commit)
12371 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
12372 ceph_assert(r == 0);
12373
12374 int committing_size = kv_committing.size();
12375 int deferred_size = deferred_stable.size();
12376
12377 #if defined(WITH_LTTNG)
12378 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
12379 for (auto txc: kv_committing) {
12380 if (txc->tracing) {
12381 tracepoint(
12382 bluestore,
12383 transaction_kv_sync_latency,
12384 txc->osr->get_sequencer_id(),
12385 txc->seq,
12386 kv_committing.size(),
12387 deferred_done.size(),
12388 deferred_stable.size(),
12389 sync_latency);
12390 }
12391 }
12392 #endif
12393
12394 {
12395 std::unique_lock m{kv_finalize_lock};
12396 if (kv_committing_to_finalize.empty()) {
12397 kv_committing_to_finalize.swap(kv_committing);
12398 } else {
12399 kv_committing_to_finalize.insert(
12400 kv_committing_to_finalize.end(),
12401 kv_committing.begin(),
12402 kv_committing.end());
12403 kv_committing.clear();
12404 }
12405 if (deferred_stable_to_finalize.empty()) {
12406 deferred_stable_to_finalize.swap(deferred_stable);
12407 } else {
12408 deferred_stable_to_finalize.insert(
12409 deferred_stable_to_finalize.end(),
12410 deferred_stable.begin(),
12411 deferred_stable.end());
12412 deferred_stable.clear();
12413 }
12414 if (!kv_finalize_in_progress) {
12415 kv_finalize_in_progress = true;
12416 kv_finalize_cond.notify_one();
12417 }
12418 }
12419
12420 if (new_nid_max) {
12421 nid_max = new_nid_max;
12422 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
12423 }
12424 if (new_blobid_max) {
12425 blobid_max = new_blobid_max;
12426 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
12427 }
12428
12429 {
12430 auto finish = mono_clock::now();
12431 ceph::timespan dur_flush = after_flush - start;
12432 ceph::timespan dur_kv = finish - after_flush;
12433 ceph::timespan dur = finish - start;
12434 dout(20) << __func__ << " committed " << committing_size
12435 << " cleaned " << deferred_size
12436 << " in " << dur
12437 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
12438 << dendl;
12439 log_latency("kv_flush",
12440 l_bluestore_kv_flush_lat,
12441 dur_flush,
12442 cct->_conf->bluestore_log_op_age);
12443 log_latency("kv_commit",
12444 l_bluestore_kv_commit_lat,
12445 dur_kv,
12446 cct->_conf->bluestore_log_op_age);
12447 log_latency("kv_sync",
12448 l_bluestore_kv_sync_lat,
12449 dur,
12450 cct->_conf->bluestore_log_op_age);
12451 }
12452
12453 if (bluefs) {
12454 if (!bluefs_extents_reclaiming.empty()) {
12455 dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
12456 << bluefs_extents_reclaiming << std::dec << dendl;
12457 int r = 0;
12458 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
12459 r = bdev->queue_discard(bluefs_extents_reclaiming);
12460 if (r == 0) {
12461 goto clear;
12462 }
12463 } else if (cct->_conf->bdev_enable_discard) {
12464 for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
12465 bdev->discard(p.get_start(), p.get_len());
12466 }
12467 }
12468
12469 alloc->release(bluefs_extents_reclaiming);
12470 clear:
12471 bluefs_extents_reclaiming.clear();
12472 }
12473 }
12474
12475 l.lock();
12476 // previously deferred "done" are now "stable" by virtue of this
12477 // commit cycle.
12478 deferred_stable_queue.swap(deferred_done);
12479 }
12480 }
12481 dout(10) << __func__ << " finish" << dendl;
12482 kv_sync_started = false;
12483 }
12484
12485 void BlueStore::_kv_finalize_thread()
12486 {
12487 deque<TransContext*> kv_committed;
12488 deque<DeferredBatch*> deferred_stable;
12489 dout(10) << __func__ << " start" << dendl;
12490 std::unique_lock l(kv_finalize_lock);
12491 ceph_assert(!kv_finalize_started);
12492 kv_finalize_started = true;
12493 kv_finalize_cond.notify_all();
12494 while (true) {
12495 ceph_assert(kv_committed.empty());
12496 ceph_assert(deferred_stable.empty());
12497 if (kv_committing_to_finalize.empty() &&
12498 deferred_stable_to_finalize.empty()) {
12499 if (kv_finalize_stop)
12500 break;
12501 dout(20) << __func__ << " sleep" << dendl;
12502 kv_finalize_in_progress = false;
12503 kv_finalize_cond.wait(l);
12504 dout(20) << __func__ << " wake" << dendl;
12505 } else {
12506 kv_committed.swap(kv_committing_to_finalize);
12507 deferred_stable.swap(deferred_stable_to_finalize);
12508 l.unlock();
12509 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
12510 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
12511
12512 auto start = mono_clock::now();
12513
12514 while (!kv_committed.empty()) {
12515 TransContext *txc = kv_committed.front();
12516 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
12517 _txc_state_proc(txc);
12518 kv_committed.pop_front();
12519 }
12520
12521 for (auto b : deferred_stable) {
12522 auto p = b->txcs.begin();
12523 while (p != b->txcs.end()) {
12524 TransContext *txc = &*p;
12525 p = b->txcs.erase(p); // unlink here because
12526 _txc_state_proc(txc); // this may destroy txc
12527 }
12528 delete b;
12529 }
12530 deferred_stable.clear();
12531
12532 if (!deferred_aggressive) {
12533 if (deferred_queue_size >= deferred_batch_ops.load() ||
12534 throttle.should_submit_deferred()) {
12535 deferred_try_submit();
12536 }
12537 }
12538
12539 // this is as good a place as any ...
12540 _reap_collections();
12541
12542 logger->set(l_bluestore_fragmentation,
12543 (uint64_t)(alloc->get_fragmentation() * 1000));
12544
12545 log_latency("kv_final",
12546 l_bluestore_kv_final_lat,
12547 mono_clock::now() - start,
12548 cct->_conf->bluestore_log_op_age);
12549
12550 l.lock();
12551 }
12552 }
12553 dout(10) << __func__ << " finish" << dendl;
12554 kv_finalize_started = false;
12555 }
12556
12557 bluestore_deferred_op_t *BlueStore::_get_deferred_op(
12558 TransContext *txc)
12559 {
12560 if (!txc->deferred_txn) {
12561 txc->deferred_txn = new bluestore_deferred_transaction_t;
12562 }
12563 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
12564 return &txc->deferred_txn->ops.back();
12565 }
12566
12567 void BlueStore::_deferred_queue(TransContext *txc)
12568 {
12569 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
12570 deferred_lock.lock();
12571 if (!txc->osr->deferred_pending &&
12572 !txc->osr->deferred_running) {
12573 deferred_queue.push_back(*txc->osr);
12574 }
12575 if (!txc->osr->deferred_pending) {
12576 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
12577 }
12578 ++deferred_queue_size;
12579 txc->osr->deferred_pending->txcs.push_back(*txc);
12580 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
12581 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
12582 const auto& op = *opi;
12583 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
12584 bufferlist::const_iterator p = op.data.begin();
12585 for (auto e : op.extents) {
12586 txc->osr->deferred_pending->prepare_write(
12587 cct, wt.seq, e.offset, e.length, p);
12588 }
12589 }
12590 if (deferred_aggressive &&
12591 !txc->osr->deferred_running) {
12592 _deferred_submit_unlock(txc->osr.get());
12593 } else {
12594 deferred_lock.unlock();
12595 }
12596 }
12597
12598 void BlueStore::deferred_try_submit()
12599 {
12600 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
12601 << deferred_queue_size << " txcs" << dendl;
12602 std::lock_guard l(deferred_lock);
12603 vector<OpSequencerRef> osrs;
12604 osrs.reserve(deferred_queue.size());
12605 for (auto& osr : deferred_queue) {
12606 osrs.push_back(&osr);
12607 }
12608 for (auto& osr : osrs) {
12609 if (osr->deferred_pending) {
12610 if (!osr->deferred_running) {
12611 _deferred_submit_unlock(osr.get());
12612 deferred_lock.lock();
12613 } else {
12614 dout(20) << __func__ << " osr " << osr << " already has running"
12615 << dendl;
12616 }
12617 } else {
12618 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
12619 }
12620 }
12621
12622 deferred_last_submitted = ceph_clock_now();
12623 }
12624
12625 void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
12626 {
12627 dout(10) << __func__ << " osr " << osr
12628 << " " << osr->deferred_pending->iomap.size() << " ios pending "
12629 << dendl;
12630 ceph_assert(osr->deferred_pending);
12631 ceph_assert(!osr->deferred_running);
12632
12633 auto b = osr->deferred_pending;
12634 deferred_queue_size -= b->seq_bytes.size();
12635 ceph_assert(deferred_queue_size >= 0);
12636
12637 osr->deferred_running = osr->deferred_pending;
12638 osr->deferred_pending = nullptr;
12639
12640 deferred_lock.unlock();
12641
12642 for (auto& txc : b->txcs) {
12643 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
12644 }
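  // the iomap is keyed by disk offset; the loop below coalesces physically
  // contiguous entries into one buffer and issues a single aio_write per run.
  // e.g. entries at 0x1000 (len 0x1000) and 0x2000 (len 0x2000) become one
  // 0x3000-byte write at 0x1000, while a gap starts a new write.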
12645 uint64_t start = 0, pos = 0;
12646 bufferlist bl;
12647 auto i = b->iomap.begin();
12648 while (true) {
12649 if (i == b->iomap.end() || i->first != pos) {
12650 if (bl.length()) {
12651 dout(20) << __func__ << " write 0x" << std::hex
12652 << start << "~" << bl.length()
12653 << " crc " << bl.crc32c(-1) << std::dec << dendl;
12654 if (!g_conf()->bluestore_debug_omit_block_device_write) {
12655 logger->inc(l_bluestore_deferred_write_ops);
12656 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
12657 int r = bdev->aio_write(start, bl, &b->ioc, false);
12658 ceph_assert(r == 0);
12659 }
12660 }
12661 if (i == b->iomap.end()) {
12662 break;
12663 }
12664 start = 0;
12665 pos = i->first;
12666 bl.clear();
12667 }
12668 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
12669 << std::hex << pos << "~" << i->second.bl.length() << std::dec
12670 << dendl;
12671 if (!bl.length()) {
12672 start = pos;
12673 }
12674 pos += i->second.bl.length();
12675 bl.claim_append(i->second.bl);
12676 ++i;
12677 }
12678
12679 bdev->aio_submit(&b->ioc);
12680 }
12681
12682 struct C_DeferredTrySubmit : public Context {
12683 BlueStore *store;
12684 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
12685 void finish(int r) {
12686 store->deferred_try_submit();
12687 }
12688 };
12689
12690 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
12691 {
12692 dout(10) << __func__ << " osr " << osr << dendl;
12693 ceph_assert(osr->deferred_running);
12694 DeferredBatch *b = osr->deferred_running;
12695
12696 {
12697 deferred_lock.lock();
12698 ceph_assert(osr->deferred_running == b);
12699 osr->deferred_running = nullptr;
12700 if (!osr->deferred_pending) {
12701 dout(20) << __func__ << " dequeueing" << dendl;
12702 auto q = deferred_queue.iterator_to(*osr);
12703 deferred_queue.erase(q);
12704 deferred_lock.unlock();
12705 } else {
12706 deferred_lock.unlock();
12707 if (deferred_aggressive) {
12708 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
12709 finisher.queue(new C_DeferredTrySubmit(this));
12710 } else {
12711 dout(20) << __func__ << " leaving queued, more pending" << dendl;
12712 }
12713 }
12714 }
12715
12716 {
12717 uint64_t costs = 0;
12718 {
12719 for (auto& i : b->txcs) {
12720 TransContext *txc = &i;
12721 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
12722 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
12723 costs += txc->cost;
12724 }
12725 }
12726 throttle.release_deferred_throttle(costs);
12727 }
12728
12729 {
12730 std::lock_guard l(kv_lock);
12731 deferred_done_queue.emplace_back(b);
12732
12733 // in the normal case, do not bother waking up the kv thread; it will
12734 // catch us on the next commit anyway.
12735 if (deferred_aggressive && !kv_sync_in_progress) {
12736 kv_sync_in_progress = true;
12737 kv_cond.notify_one();
12738 }
12739 }
12740 }
12741
12742 int BlueStore::_deferred_replay()
12743 {
12744 dout(10) << __func__ << " start" << dendl;
12745 int count = 0;
12746 int r = 0;
12747 CollectionRef ch = _get_collection(coll_t::meta());
12748 bool fake_ch = false;
12749 if (!ch) {
12750 // hmm, replaying initial mkfs?
12751 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
12752 fake_ch = true;
12753 }
12754 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
12755 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
12756 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
12757 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
12758 << dendl;
12759 bluestore_deferred_transaction_t *deferred_txn =
12760 new bluestore_deferred_transaction_t;
12761 bufferlist bl = it->value();
12762 auto p = bl.cbegin();
12763 try {
12764 decode(*deferred_txn, p);
12765 } catch (buffer::error& e) {
12766 derr << __func__ << " failed to decode deferred txn "
12767 << pretty_binary_string(it->key()) << dendl;
12768 delete deferred_txn;
12769 r = -EIO;
12770 goto out;
12771 }
12772 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
12773 txc->deferred_txn = deferred_txn;
12774 txc->state = TransContext::STATE_KV_DONE;
12775 _txc_state_proc(txc);
12776 }
12777 out:
12778 dout(20) << __func__ << " draining osr" << dendl;
12779 _osr_register_zombie(osr);
12780 _osr_drain_all();
12781 if (fake_ch) {
12782 new_coll_map.clear();
12783 }
12784 dout(10) << __func__ << " completed " << count << " events" << dendl;
12785 return r;
12786 }
12787
12788 // ---------------------------
12789 // transactions
12790
12791 int BlueStore::queue_transactions(
12792 CollectionHandle& ch,
12793 vector<Transaction>& tls,
12794 TrackedOpRef op,
12795 ThreadPool::TPHandle *handle)
12796 {
12797 FUNCTRACE(cct);
12798 list<Context *> on_applied, on_commit, on_applied_sync;
12799 ObjectStore::Transaction::collect_contexts(
12800 tls, &on_applied, &on_commit, &on_applied_sync);
12801
12802 auto start = mono_clock::now();
12803
12804 Collection *c = static_cast<Collection*>(ch.get());
12805 OpSequencer *osr = c->osr.get();
12806 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
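  // submission pipeline, roughly: create a TransContext on this collection's
  // sequencer, translate each Transaction into onode/allocator mutations
  // (_txc_add_transaction), persist dirty onodes and shared blobs, journal
  // any deferred writes, fold allocations/releases into the freelist and
  // statfs, apply throttling, and finally kick the txc state machine
  // (_txc_state_proc).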
12807
12808 // prepare
12809 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
12810 &on_commit);
12811
12812 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
12813 txc->bytes += (*p).get_num_bytes();
12814 _txc_add_transaction(txc, &(*p));
12815 }
12816 _txc_calc_cost(txc);
12817
12818 _txc_write_nodes(txc, txc->t);
12819
12820 // journal deferred items
12821 if (txc->deferred_txn) {
12822 txc->deferred_txn->seq = ++deferred_seq;
12823 bufferlist bl;
12824 encode(*txc->deferred_txn, bl);
12825 string key;
12826 get_deferred_key(txc->deferred_txn->seq, &key);
12827 txc->t->set(PREFIX_DEFERRED, key, bl);
12828 }
12829
12830 _txc_finalize_kv(txc, txc->t);
12831 if (handle)
12832 handle->suspend_tp_timeout();
12833
12834 auto tstart = mono_clock::now();
12835
12836 if (!throttle.try_start_transaction(
12837 *db,
12838 *txc,
12839 tstart)) {
12840 // ensure we do not block here because of deferred writes
12841 dout(10) << __func__ << " failed to get throttle_deferred_bytes, going aggressive"
12842 << dendl;
12843 ++deferred_aggressive;
12844 deferred_try_submit();
12845 {
12846 // wake up any previously finished deferred events
12847 std::lock_guard l(kv_lock);
12848 if (!kv_sync_in_progress) {
12849 kv_sync_in_progress = true;
12850 kv_cond.notify_one();
12851 }
12852 }
12853 throttle.finish_start_transaction(*db, *txc, tstart);
12854 --deferred_aggressive;
12855 }
12856 auto tend = mono_clock::now();
12857
12858 if (handle)
12859 handle->reset_tp_timeout();
12860
12861 logger->inc(l_bluestore_txc);
12862
12863 // execute (start)
12864 _txc_state_proc(txc);
12865
12866 // we're immediately readable (unlike FileStore)
12867 for (auto c : on_applied_sync) {
12868 c->complete(0);
12869 }
12870 if (!on_applied.empty()) {
12871 if (c->commit_queue) {
12872 c->commit_queue->queue(on_applied);
12873 } else {
12874 finisher.queue(on_applied);
12875 }
12876 }
12877
12878 log_latency("submit_transact",
12879 l_bluestore_submit_lat,
12880 mono_clock::now() - start,
12881 cct->_conf->bluestore_log_op_age);
12882 log_latency("throttle_transact",
12883 l_bluestore_throttle_lat,
12884 tend - tstart,
12885 cct->_conf->bluestore_log_op_age);
12886 return 0;
12887 }
12888
12889 void BlueStore::_txc_aio_submit(TransContext *txc)
12890 {
12891 dout(10) << __func__ << " txc " << txc << dendl;
12892 bdev->aio_submit(&txc->ioc);
12893 }
12894
12895 void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
12896 {
12897 Transaction::iterator i = t->begin();
12898
12899 _dump_transaction<30>(cct, t);
12900
12901 vector<CollectionRef> cvec(i.colls.size());
12902 unsigned j = 0;
12903 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
12904 ++p, ++j) {
12905 cvec[j] = _get_collection(*p);
12906 }
12907
12908 vector<OnodeRef> ovec(i.objects.size());
12909
12910 for (int pos = 0; i.have_op(); ++pos) {
12911 Transaction::Op *op = i.decode_op();
12912 int r = 0;
12913
12914 // no coll or obj
12915 if (op->op == Transaction::OP_NOP)
12916 continue;
12917
12918
12919 // collection operations
12920 CollectionRef &c = cvec[op->cid];
12921
12922 // initialize osd_pool_id and do a smoke test that all collections belong
12923 // to the same pool
12924 spg_t pgid;
12925 if (c && c->cid.is_pg(&pgid)) {
12926 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
12927 txc->osd_pool_id == pgid.pool());
12928 txc->osd_pool_id = pgid.pool();
12929 }
12930
12931 switch (op->op) {
12932 case Transaction::OP_RMCOLL:
12933 {
12934 const coll_t &cid = i.get_cid(op->cid);
12935 r = _remove_collection(txc, cid, &c);
12936 if (!r)
12937 continue;
12938 }
12939 break;
12940
12941 case Transaction::OP_MKCOLL:
12942 {
12943 ceph_assert(!c);
12944 const coll_t &cid = i.get_cid(op->cid);
12945 r = _create_collection(txc, cid, op->split_bits, &c);
12946 if (!r)
12947 continue;
12948 }
12949 break;
12950
12951 case Transaction::OP_SPLIT_COLLECTION:
12952 ceph_abort_msg("deprecated");
12953 break;
12954
12955 case Transaction::OP_SPLIT_COLLECTION2:
12956 {
12957 uint32_t bits = op->split_bits;
12958 uint32_t rem = op->split_rem;
12959 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
12960 if (!r)
12961 continue;
12962 }
12963 break;
12964
12965 case Transaction::OP_MERGE_COLLECTION:
12966 {
12967 uint32_t bits = op->split_bits;
12968 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
12969 if (!r)
12970 continue;
12971 }
12972 break;
12973
12974 case Transaction::OP_COLL_HINT:
12975 {
12976 uint32_t type = op->hint_type;
12977 bufferlist hint;
12978 i.decode_bl(hint);
12979 auto hiter = hint.cbegin();
12980 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
12981 uint32_t pg_num;
12982 uint64_t num_objs;
12983 decode(pg_num, hiter);
12984 decode(num_objs, hiter);
12985 dout(10) << __func__ << " collection hint objects is a no-op,"
12986 << " pg_num " << pg_num << " num_objects " << num_objs
12987 << dendl;
12988 } else {
12989 // Ignore the hint
12990 dout(10) << __func__ << " unknown collection hint " << type << dendl;
12991 }
12992 continue;
12993 }
12994 break;
12995
12996 case Transaction::OP_COLL_SETATTR:
12997 r = -EOPNOTSUPP;
12998 break;
12999
13000 case Transaction::OP_COLL_RMATTR:
13001 r = -EOPNOTSUPP;
13002 break;
13003
13004 case Transaction::OP_COLL_RENAME:
13005 ceph_abort_msg("not implemented");
13006 break;
13007 }
13008 if (r < 0) {
13009 derr << __func__ << " error " << cpp_strerror(r)
13010 << " not handled on operation " << op->op
13011 << " (op " << pos << ", counting from 0)" << dendl;
13012 _dump_transaction<0>(cct, t);
13013 ceph_abort_msg("unexpected error");
13014 }
13015
13016 // these operations implicitly create the object
13017 bool create = false;
13018 if (op->op == Transaction::OP_TOUCH ||
13019 op->op == Transaction::OP_CREATE ||
13020 op->op == Transaction::OP_WRITE ||
13021 op->op == Transaction::OP_ZERO) {
13022 create = true;
13023 }
13024
13025 // object operations
13026 std::unique_lock l(c->lock);
13027 OnodeRef &o = ovec[op->oid];
13028 if (!o) {
13029 ghobject_t oid = i.get_oid(op->oid);
13030 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
13031 }
13032 if (!create && (!o || !o->exists)) {
13033 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
13034 << i.get_oid(op->oid) << dendl;
13035 r = -ENOENT;
13036 goto endop;
13037 }
13038
13039 switch (op->op) {
13040 case Transaction::OP_CREATE:
13041 case Transaction::OP_TOUCH:
13042 r = _touch(txc, c, o);
13043 break;
13044
13045 case Transaction::OP_WRITE:
13046 {
13047 uint64_t off = op->off;
13048 uint64_t len = op->len;
13049 uint32_t fadvise_flags = i.get_fadvise_flags();
13050 bufferlist bl;
13051 i.decode_bl(bl);
13052 r = _write(txc, c, o, off, len, bl, fadvise_flags);
13053 }
13054 break;
13055
13056 case Transaction::OP_ZERO:
13057 {
13058 uint64_t off = op->off;
13059 uint64_t len = op->len;
13060 r = _zero(txc, c, o, off, len);
13061 }
13062 break;
13063
13064 case Transaction::OP_TRIMCACHE:
13065 {
13066 // deprecated, no-op
13067 }
13068 break;
13069
13070 case Transaction::OP_TRUNCATE:
13071 {
13072 uint64_t off = op->off;
13073 r = _truncate(txc, c, o, off);
13074 }
13075 break;
13076
13077 case Transaction::OP_REMOVE:
13078 {
13079 r = _remove(txc, c, o);
13080 }
13081 break;
13082
13083 case Transaction::OP_SETATTR:
13084 {
13085 string name = i.decode_string();
13086 bufferptr bp;
13087 i.decode_bp(bp);
13088 r = _setattr(txc, c, o, name, bp);
13089 }
13090 break;
13091
13092 case Transaction::OP_SETATTRS:
13093 {
13094 map<string, bufferptr> aset;
13095 i.decode_attrset(aset);
13096 r = _setattrs(txc, c, o, aset);
13097 }
13098 break;
13099
13100 case Transaction::OP_RMATTR:
13101 {
13102 string name = i.decode_string();
13103 r = _rmattr(txc, c, o, name);
13104 }
13105 break;
13106
13107 case Transaction::OP_RMATTRS:
13108 {
13109 r = _rmattrs(txc, c, o);
13110 }
13111 break;
13112
13113 case Transaction::OP_CLONE:
13114 {
13115 OnodeRef& no = ovec[op->dest_oid];
13116 if (!no) {
13117 const ghobject_t& noid = i.get_oid(op->dest_oid);
13118 no = c->get_onode(noid, true);
13119 }
13120 r = _clone(txc, c, o, no);
13121 }
13122 break;
13123
13124 case Transaction::OP_CLONERANGE:
13125 ceph_abort_msg("deprecated");
13126 break;
13127
13128 case Transaction::OP_CLONERANGE2:
13129 {
13130 OnodeRef& no = ovec[op->dest_oid];
13131 if (!no) {
13132 const ghobject_t& noid = i.get_oid(op->dest_oid);
13133 no = c->get_onode(noid, true);
13134 }
13135 uint64_t srcoff = op->off;
13136 uint64_t len = op->len;
13137 uint64_t dstoff = op->dest_off;
13138 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
13139 }
13140 break;
13141
13142 case Transaction::OP_COLL_ADD:
13143 ceph_abort_msg("not implemented");
13144 break;
13145
13146 case Transaction::OP_COLL_REMOVE:
13147 ceph_abort_msg("not implemented");
13148 break;
13149
13150 case Transaction::OP_COLL_MOVE:
13151 ceph_abort_msg("deprecated");
13152 break;
13153
13154 case Transaction::OP_COLL_MOVE_RENAME:
13155 case Transaction::OP_TRY_RENAME:
13156 {
13157 ceph_assert(op->cid == op->dest_cid);
13158 const ghobject_t& noid = i.get_oid(op->dest_oid);
13159 OnodeRef& no = ovec[op->dest_oid];
13160 if (!no) {
13161 no = c->get_onode(noid, false);
13162 }
13163 r = _rename(txc, c, o, no, noid);
13164 }
13165 break;
13166
13167 case Transaction::OP_OMAP_CLEAR:
13168 {
13169 r = _omap_clear(txc, c, o);
13170 }
13171 break;
13172 case Transaction::OP_OMAP_SETKEYS:
13173 {
13174 bufferlist aset_bl;
13175 i.decode_attrset_bl(&aset_bl);
13176 r = _omap_setkeys(txc, c, o, aset_bl);
13177 }
13178 break;
13179 case Transaction::OP_OMAP_RMKEYS:
13180 {
13181 bufferlist keys_bl;
13182 i.decode_keyset_bl(&keys_bl);
13183 r = _omap_rmkeys(txc, c, o, keys_bl);
13184 }
13185 break;
13186 case Transaction::OP_OMAP_RMKEYRANGE:
13187 {
13188 string first, last;
13189 first = i.decode_string();
13190 last = i.decode_string();
13191 r = _omap_rmkey_range(txc, c, o, first, last);
13192 }
13193 break;
13194 case Transaction::OP_OMAP_SETHEADER:
13195 {
13196 bufferlist bl;
13197 i.decode_bl(bl);
13198 r = _omap_setheader(txc, c, o, bl);
13199 }
13200 break;
13201
13202 case Transaction::OP_SETALLOCHINT:
13203 {
13204 r = _set_alloc_hint(txc, c, o,
13205 op->expected_object_size,
13206 op->expected_write_size,
13207 op->alloc_hint_flags);
13208 }
13209 break;
13210
13211 default:
13212 derr << __func__ << " bad op " << op->op << dendl;
13213 ceph_abort();
13214 }
13215
13216 endop:
13217 if (r < 0) {
13218 bool ok = false;
13219
13220 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
13221 op->op == Transaction::OP_CLONE ||
13222 op->op == Transaction::OP_CLONERANGE2 ||
13223 op->op == Transaction::OP_COLL_ADD ||
13224 op->op == Transaction::OP_SETATTR ||
13225 op->op == Transaction::OP_SETATTRS ||
13226 op->op == Transaction::OP_RMATTR ||
13227 op->op == Transaction::OP_OMAP_SETKEYS ||
13228 op->op == Transaction::OP_OMAP_RMKEYS ||
13229 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
13230 op->op == Transaction::OP_OMAP_SETHEADER))
13231 // -ENOENT is usually okay
13232 ok = true;
13233 if (r == -ENODATA)
13234 ok = true;
13235
13236 if (!ok) {
13237 const char *msg = "unexpected error code";
13238
13239 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
13240 op->op == Transaction::OP_CLONE ||
13241 op->op == Transaction::OP_CLONERANGE2))
13242 msg = "ENOENT on clone suggests osd bug";
13243
13244 if (r == -ENOSPC)
13245 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
13246 // by partially applying transactions.
13247 msg = "ENOSPC from bluestore, misconfigured cluster";
13248
13249 if (r == -ENOTEMPTY) {
13250 msg = "ENOTEMPTY suggests garbage data in osd data dir";
13251 }
13252
13253 derr << __func__ << " error " << cpp_strerror(r)
13254 << " not handled on operation " << op->op
13255 << " (op " << pos << ", counting from 0)"
13256 << dendl;
13257 derr << msg << dendl;
13258 _dump_transaction<0>(cct, t);
13259 ceph_abort_msg("unexpected error");
13260 }
13261 }
13262 }
13263 }
13264
13265
13266
13267 // -----------------
13268 // write operations
13269
13270 int BlueStore::_touch(TransContext *txc,
13271 CollectionRef& c,
13272 OnodeRef &o)
13273 {
13274 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13275 int r = 0;
13276 _assign_nid(txc, o);
13277 txc->write_onode(o);
13278 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13279 return r;
13280 }
13281
13282 void BlueStore::_pad_zeros(
13283 bufferlist *bl, uint64_t *offset,
13284 uint64_t chunk_size)
13285 {
13286 auto length = bl->length();
13287 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
13288 << " chunk_size 0x" << chunk_size << std::dec << dendl;
13289 dout(40) << "before:\n";
13290 bl->hexdump(*_dout);
13291 *_dout << dendl;
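// e.g. (illustrative values): with chunk_size 0x1000, a 0x200-byte bl at
// *offset 0x1f00 gets 0xf00 zero bytes prepended and 0xf00 appended, leaving
// a chunk-aligned 0x2000-byte bl at *offset 0x1000.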
13292 // front
13293 size_t front_pad = *offset % chunk_size;
13294 size_t back_pad = 0;
13295 size_t pad_count = 0;
13296 if (front_pad) {
13297 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
13298 bufferptr z = buffer::create_small_page_aligned(chunk_size);
13299 z.zero(0, front_pad, false);
13300 pad_count += front_pad;
13301 bl->begin().copy(front_copy, z.c_str() + front_pad);
13302 if (front_copy + front_pad < chunk_size) {
13303 back_pad = chunk_size - (length + front_pad);
13304 z.zero(front_pad + length, back_pad, false);
13305 pad_count += back_pad;
13306 }
13307 bufferlist old, t;
13308 old.swap(*bl);
13309 t.substr_of(old, front_copy, length - front_copy);
13310 bl->append(z);
13311 bl->claim_append(t);
13312 *offset -= front_pad;
13313 length += pad_count;
13314 }
13315
13316 // back
13317 uint64_t end = *offset + length;
13318 unsigned back_copy = end % chunk_size;
13319 if (back_copy) {
13320 ceph_assert(back_pad == 0);
13321 back_pad = chunk_size - back_copy;
13322 ceph_assert(back_copy <= length);
13323 bufferptr tail(chunk_size);
13324 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
13325 tail.zero(back_copy, back_pad, false);
13326 bufferlist old;
13327 old.swap(*bl);
13328 bl->substr_of(old, 0, length - back_copy);
13329 bl->append(tail);
13330 length += back_pad;
13331 pad_count += back_pad;
13332 }
13333 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
13334 << back_pad << " on front/back, now 0x" << *offset << "~"
13335 << length << std::dec << dendl;
13336 dout(40) << "after:\n";
13337 bl->hexdump(*_dout);
13338 *_dout << dendl;
13339 if (pad_count)
13340 logger->inc(l_bluestore_write_pad_bytes, pad_count);
13341 ceph_assert(bl->length() == length);
13342 }
13343
13344 void BlueStore::_do_write_small(
13345 TransContext *txc,
13346 CollectionRef &c,
13347 OnodeRef o,
13348 uint64_t offset, uint64_t length,
13349 bufferlist::iterator& blp,
13350 WriteContext *wctx)
13351 {
13352 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13353 << std::dec << dendl;
13354 ceph_assert(length < min_alloc_size);
13355 uint64_t end_offs = offset + length;
13356
13357 logger->inc(l_bluestore_write_small);
13358 logger->inc(l_bluestore_write_small_bytes, length);
13359
13360 bufferlist bl;
13361 blp.copy(length, bl);
13362
13363 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13364 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13365 uint32_t alloc_len = min_alloc_size;
13366 auto offset0 = p2align<uint64_t>(offset, alloc_len);
13367
13368 bool any_change;
13369
13370 // search suitable extent in both forward and reverse direction in
13371 // [offset - target_max_blob_size, offset + target_max_blob_size] range
13372 // then check if blob can be reused via can_reuse_blob func or apply
13373 // direct/deferred write (the latter for extents including or higher
13374 // than 'offset' only).
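// e.g. (illustrative): with max_bsize 0x80000 and a small write at offset
// 0x100000, extents in [0x80000, 0x180000) are faulted in and scanned, 'ep'
// walking forward from the write position and 'prev_ep' walking backward.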
13375 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
13376
13377 // Look for an existing mutable blob we can use.
13378 auto begin = o->extent_map.extent_map.begin();
13379 auto end = o->extent_map.extent_map.end();
13380 auto ep = o->extent_map.seek_lextent(offset);
13381 if (ep != begin) {
13382 --ep;
13383 if (ep->blob_end() <= offset) {
13384 ++ep;
13385 }
13386 }
13387 auto prev_ep = ep;
13388 if (prev_ep != begin) {
13389 --prev_ep;
13390 } else {
13391 prev_ep = end; // to avoid this extent check as it's a duplicate
13392 }
13393
13394 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
13395 // We don't want to inspect more blobs than the number of min alloc
13396 // units that fit into 2 max-size blobs
13397 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
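// e.g. (illustrative): max_blob_size 0x80000 with min_alloc_size 0x10000
// gives 0x80000 / 0x10000 * 2 + 1 = 17 blobs before GC is requested.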
13398 bool above_blob_threshold = false;
13399
13400 inspected_blobs.reserve(blob_threshold);
13401
13402 uint64_t max_off = 0;
13403 auto start_ep = ep;
13404 auto end_ep = ep; // exclusively
13405 do {
13406 any_change = false;
13407
13408 if (ep != end && ep->logical_offset < offset + max_bsize) {
13409 BlobRef b = ep->blob;
13410 if (!above_blob_threshold) {
13411 inspected_blobs.insert(&b->get_blob());
13412 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13413 }
13414 max_off = ep->logical_end();
13415 auto bstart = ep->blob_start();
13416
13417 dout(20) << __func__ << " considering " << *b
13418 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13419 if (bstart >= end_offs) {
13420 dout(20) << __func__ << " ignoring distant " << *b << dendl;
13421 } else if (!b->get_blob().is_mutable()) {
13422 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
13423 } else if (ep->logical_offset % min_alloc_size !=
13424 ep->blob_offset % min_alloc_size) {
13425 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
13426 } else {
13427 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13428 // can we pad our head/tail out with zeros?
13429 uint64_t head_pad, tail_pad;
13430 head_pad = p2phase(offset, chunk_size);
13431 tail_pad = p2nphase(end_offs, chunk_size);
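// e.g. (illustrative): with chunk_size 0x1000, a write at 0x2300~0x500 ends
// at 0x2800, so head_pad = 0x300 and tail_pad = 0x800; padding would extend
// the I/O to the chunk-aligned range 0x2000~0x1000.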
13432 if (head_pad || tail_pad) {
13433 o->extent_map.fault_range(db, offset - head_pad,
13434 end_offs - offset + head_pad + tail_pad);
13435 }
13436 if (head_pad &&
13437 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
13438 head_pad = 0;
13439 }
13440 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
13441 tail_pad = 0;
13442 }
13443
13444 uint64_t b_off = offset - head_pad - bstart;
13445 uint64_t b_len = length + head_pad + tail_pad;
13446
13447 // direct write into unused blocks of an existing mutable blob?
13448 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
13449 b->get_blob().get_ondisk_length() >= b_off + b_len &&
13450 b->get_blob().is_unused(b_off, b_len) &&
13451 b->get_blob().is_allocated(b_off, b_len)) {
13452 _apply_padding(head_pad, tail_pad, bl);
13453
13454 dout(20) << __func__ << " write to unused 0x" << std::hex
13455 << b_off << "~" << b_len
13456 << " pad 0x" << head_pad << " + 0x" << tail_pad
13457 << std::dec << " of mutable " << *b << dendl;
13458 _buffer_cache_write(txc, b, b_off, bl,
13459 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13460
13461 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13462 if (b_len <= prefer_deferred_size) {
13463 dout(20) << __func__ << " deferring small 0x" << std::hex
13464 << b_len << std::dec << " unused write via deferred" << dendl;
13465 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13466 op->op = bluestore_deferred_op_t::OP_WRITE;
13467 b->get_blob().map(
13468 b_off, b_len,
13469 [&](uint64_t offset, uint64_t length) {
13470 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13471 return 0;
13472 });
13473 op->data = bl;
13474 } else {
13475 b->get_blob().map_bl(
13476 b_off, bl,
13477 [&](uint64_t offset, bufferlist& t) {
13478 bdev->aio_write(offset, t,
13479 &txc->ioc, wctx->buffered);
13480 });
13481 }
13482 }
13483 b->dirty_blob().calc_csum(b_off, bl);
13484 dout(20) << __func__ << " lex old " << *ep << dendl;
13485 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
13486 b,
13487 &wctx->old_extents);
13488 b->dirty_blob().mark_used(le->blob_offset, le->length);
13489 txc->statfs_delta.stored() += le->length;
13490 dout(20) << __func__ << " lex " << *le << dendl;
13491 logger->inc(l_bluestore_write_small_unused);
13492 return;
13493 }
13494 // read some data to fill out the chunk?
13495 uint64_t head_read = p2phase(b_off, chunk_size);
13496 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
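// e.g. (illustrative): if zero padding was not possible and b_off~b_len is
// 0x2300~0x500 with chunk_size 0x1000, head_read = 0x300 and tail_read =
// 0x800; the existing bytes are read back so the rewrite stays chunk aligned.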
13497 if ((head_read || tail_read) &&
13498 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
13499 head_read + tail_read < min_alloc_size) {
13500 b_off -= head_read;
13501 b_len += head_read + tail_read;
13502
13503 } else {
13504 head_read = tail_read = 0;
13505 }
13506
13507 // chunk-aligned deferred overwrite?
13508 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
13509 b_off % chunk_size == 0 &&
13510 b_len % chunk_size == 0 &&
13511 b->get_blob().is_allocated(b_off, b_len)) {
13512
13513 _apply_padding(head_pad, tail_pad, bl);
13514
13515 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
13516 << " and tail 0x" << tail_read << std::dec << dendl;
13517 if (head_read) {
13518 bufferlist head_bl;
13519 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
13520 head_bl, 0);
13521 ceph_assert(r >= 0 && r <= (int)head_read);
13522 size_t zlen = head_read - r;
13523 if (zlen) {
13524 head_bl.append_zero(zlen);
13525 logger->inc(l_bluestore_write_pad_bytes, zlen);
13526 }
13527 head_bl.claim_append(bl);
13528 bl.swap(head_bl);
13529 logger->inc(l_bluestore_write_penalty_read_ops);
13530 }
13531 if (tail_read) {
13532 bufferlist tail_bl;
13533 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
13534 tail_bl, 0);
13535 ceph_assert(r >= 0 && r <= (int)tail_read);
13536 size_t zlen = tail_read - r;
13537 if (zlen) {
13538 tail_bl.append_zero(zlen);
13539 logger->inc(l_bluestore_write_pad_bytes, zlen);
13540 }
13541 bl.claim_append(tail_bl);
13542 logger->inc(l_bluestore_write_penalty_read_ops);
13543 }
13544 logger->inc(l_bluestore_write_small_pre_read);
13545
13546 _buffer_cache_write(txc, b, b_off, bl,
13547 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13548
13549 if (b->get_blob().csum_type) {
13550 b->dirty_blob().calc_csum(b_off, bl);
13551 }
13552
13553 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13554 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13555 op->op = bluestore_deferred_op_t::OP_WRITE;
13556 int r = b->get_blob().map(
13557 b_off, b_len,
13558 [&](uint64_t offset, uint64_t length) {
13559 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13560 return 0;
13561 });
13562 ceph_assert(r == 0);
13563 op->data.claim(bl);
13564 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
13565 << b_len << std::dec << " of mutable " << *b
13566 << " at " << op->extents << dendl;
13567 }
13568
13569 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
13570 b, &wctx->old_extents);
13571 b->dirty_blob().mark_used(le->blob_offset, le->length);
13572 txc->statfs_delta.stored() += le->length;
13573 dout(20) << __func__ << " lex " << *le << dendl;
13574 logger->inc(l_bluestore_write_small_deferred);
13575 return;
13576 }
13577 // try to reuse blob if we can
13578 if (b->can_reuse_blob(min_alloc_size,
13579 max_bsize,
13580 offset0 - bstart,
13581 &alloc_len)) {
13582 ceph_assert(alloc_len == min_alloc_size); // expecting data to always
13583 // fit into the reused blob
13584 // Need to check for pending writes desiring to
13585 // reuse the same pextent. The rationale is that during GC two chunks
13586 // from garbage blobs (compressed?) can share logical space within the same
13587 // AU. That, in turn, might be caused by an unaligned len in clone_range2.
13588 // Hence the second write would fail when attempting to reuse the blob in
13589 // _do_alloc_write().
13590 if (!wctx->has_conflict(b,
13591 offset0,
13592 offset0 + alloc_len,
13593 min_alloc_size)) {
13594
13595 // we can't reuse pad_head/pad_tail since they might be truncated
13596 // due to existing extents
13597 uint64_t b_off = offset - bstart;
13598 uint64_t b_off0 = b_off;
13599 _pad_zeros(&bl, &b_off0, chunk_size);
13600
13601 dout(20) << __func__ << " reuse blob " << *b << std::hex
13602 << " (0x" << b_off0 << "~" << bl.length() << ")"
13603 << " (0x" << b_off << "~" << length << ")"
13604 << std::dec << dendl;
13605
13606 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13607 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13608 false, false);
13609 logger->inc(l_bluestore_write_small_unused);
13610 return;
13611 }
13612 }
13613 }
13614 ++ep;
13615 end_ep = ep;
13616 any_change = true;
13617 } // if (ep != end && ep->logical_offset < offset + max_bsize)
13618
13619 // check extent for reuse in reverse order
13620 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13621 BlobRef b = prev_ep->blob;
13622 if (!above_blob_threshold) {
13623 inspected_blobs.insert(&b->get_blob());
13624 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13625 }
13626 start_ep = prev_ep;
13627 auto bstart = prev_ep->blob_start();
13628 dout(20) << __func__ << " considering " << *b
13629 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13630 if (b->can_reuse_blob(min_alloc_size,
13631 max_bsize,
13632 offset0 - bstart,
13633 &alloc_len)) {
13634 ceph_assert(alloc_len == min_alloc_size); // expecting data to always
13635 // fit into the reused blob
13636 // Need to check for pending writes desiring to
13637 // reuse the same pextent. The rationale is that during GC two chunks
13638 // from garbage blobs (compressed?) can share logical space within the same
13639 // AU. That, in turn, might be caused by an unaligned len in clone_range2.
13640 // Hence the second write would fail when attempting to reuse the blob in
13641 // _do_alloc_write().
13642 if (!wctx->has_conflict(b,
13643 offset0,
13644 offset0 + alloc_len,
13645 min_alloc_size)) {
13646
13647 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13648 uint64_t b_off = offset - bstart;
13649 uint64_t b_off0 = b_off;
13650 _pad_zeros(&bl, &b_off0, chunk_size);
13651
13652 dout(20) << __func__ << " reuse blob " << *b << std::hex
13653 << " (0x" << b_off0 << "~" << bl.length() << ")"
13654 << " (0x" << b_off << "~" << length << ")"
13655 << std::dec << dendl;
13656
13657 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13658 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13659 false, false);
13660 logger->inc(l_bluestore_write_small_unused);
13661 return;
13662 }
13663 }
13664 if (prev_ep != begin) {
13665 --prev_ep;
13666 any_change = true;
13667 } else {
13668 prev_ep = end; // to avoid useless first extent re-check
13669 }
13670 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
13671 } while (any_change);
13672
13673 if (above_blob_threshold) {
13674 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
13675 << " " << std::hex << min_off << "~" << max_off << std::dec
13676 << dendl;
13677 ceph_assert(start_ep != end_ep);
13678 for (auto ep = start_ep; ep != end_ep; ++ep) {
13679 dout(20) << __func__ << " inserting for GC "
13680 << std::hex << ep->logical_offset << "~" << ep->length
13681 << std::dec << dendl;
13682
13683 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
13684 }
13685 // insert newly written extent to GC
13686 wctx->extents_to_gc.union_insert(offset, length);
13687 dout(20) << __func__ << " inserting (last) for GC "
13688 << std::hex << offset << "~" << length
13689 << std::dec << dendl;
13690 }
13691 // new blob.
13692 BlobRef b = c->new_blob();
13693 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
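// e.g. (illustrative): with min_alloc_size 0x10000, a small write at offset
// 0x23000 lands at b_off 0x3000 within the new blob's first allocation unit.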
13694 uint64_t b_off0 = b_off;
13695 _pad_zeros(&bl, &b_off0, block_size);
13696 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13697 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13698 min_alloc_size != block_size, // only use the 'unused' bitmap when the
13699 // alloc granularity differs from the disk block size
13700 true);
13701
13702 return;
13703 }
13704
13705 void BlueStore::_do_write_big(
13706 TransContext *txc,
13707 CollectionRef &c,
13708 OnodeRef o,
13709 uint64_t offset, uint64_t length,
13710 bufferlist::iterator& blp,
13711 WriteContext *wctx)
13712 {
13713 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13714 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
13715 << " compress " << (int)wctx->compress
13716 << dendl;
13717 logger->inc(l_bluestore_write_big);
13718 logger->inc(l_bluestore_write_big_bytes, length);
13719 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13720 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
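// e.g. (illustrative): a 0x140000-byte write with target_blob_size 0x80000
// is carved into 0x80000 + 0x80000 + 0x40000 pieces, each reusing an
// existing mutable blob when possible or getting a fresh one otherwise.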
13721 while (length > 0) {
13722 bool new_blob = false;
13723 uint32_t l = std::min(max_bsize, length);
13724 BlobRef b;
13725 uint32_t b_off = 0;
13726
13727 // attempting to reuse an existing blob
13728 if (!wctx->compress) {
13729 // look for an existing mutable blob we can reuse
13730 auto begin = o->extent_map.extent_map.begin();
13731 auto end = o->extent_map.extent_map.end();
13732 auto ep = o->extent_map.seek_lextent(offset);
13733 auto prev_ep = ep;
13734 if (prev_ep != begin) {
13735 --prev_ep;
13736 } else {
13737 prev_ep = end; // to avoid this extent check as it's a duplicate
13738 }
13739 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13740 // search suitable extent in both forward and reverse direction in
13741 // [offset - target_max_blob_size, offset + target_max_blob_size] range
13742 // then check if blob can be reused via can_reuse_blob func.
13743 bool any_change;
13744 do {
13745 any_change = false;
13746 if (ep != end && ep->logical_offset < offset + max_bsize) {
13747 if (offset >= ep->blob_start() &&
13748 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13749 offset - ep->blob_start(),
13750 &l)) {
13751 b = ep->blob;
13752 b_off = offset - ep->blob_start();
13753 prev_ep = end; // to avoid check below
13754 dout(20) << __func__ << " reuse blob " << *b << std::hex
13755 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13756 } else {
13757 ++ep;
13758 any_change = true;
13759 }
13760 }
13761
13762 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13763 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13764 offset - prev_ep->blob_start(),
13765 &l)) {
13766 b = prev_ep->blob;
13767 b_off = offset - prev_ep->blob_start();
13768 dout(20) << __func__ << " reuse blob " << *b << std::hex
13769 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13770 } else if (prev_ep != begin) {
13771 --prev_ep;
13772 any_change = true;
13773 } else {
13774 prev_ep = end; // to avoid useless first extent re-check
13775 }
13776 }
13777 } while (b == nullptr && any_change);
13778 }
13779 if (b == nullptr) {
13780 b = c->new_blob();
13781 b_off = 0;
13782 new_blob = true;
13783 }
13784
13785 bufferlist t;
13786 blp.copy(l, t);
13787 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
13788 offset += l;
13789 length -= l;
13790 logger->inc(l_bluestore_write_big_blobs);
13791 }
13792 }
13793
13794 int BlueStore::_do_alloc_write(
13795 TransContext *txc,
13796 CollectionRef coll,
13797 OnodeRef o,
13798 WriteContext *wctx)
13799 {
13800 dout(20) << __func__ << " txc " << txc
13801 << " " << wctx->writes.size() << " blobs"
13802 << dendl;
13803 if (wctx->writes.empty()) {
13804 return 0;
13805 }
13806
13807 CompressorRef c;
13808 double crr = 0;
13809 if (wctx->compress) {
13810 c = select_option(
13811 "compression_algorithm",
13812 compressor,
13813 [&]() {
13814 string val;
13815 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
13816 CompressorRef cp = compressor;
13817 if (!cp || cp->get_type_name() != val) {
13818 cp = Compressor::create(cct, val);
13819 if (!cp) {
13820 if (_set_compression_alert(false, val.c_str())) {
13821 derr << __func__ << " unable to initialize " << val.c_str()
13822 << " compressor" << dendl;
13823 }
13824 }
13825 }
13826 return boost::optional<CompressorRef>(cp);
13827 }
13828 return boost::optional<CompressorRef>();
13829 }
13830 );
13831
13832 crr = select_option(
13833 "compression_required_ratio",
13834 cct->_conf->bluestore_compression_required_ratio,
13835 [&]() {
13836 double val;
13837 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
13838 return boost::optional<double>(val);
13839 }
13840 return boost::optional<double>();
13841 }
13842 );
13843 }
13844
13845 // checksum
13846 int64_t csum = csum_type.load();
13847 csum = select_option(
13848 "csum_type",
13849 csum,
13850 [&]() {
13851 int64_t val;
13852 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
13853 return boost::optional<int64_t>(val);
13854 }
13855 return boost::optional<int64_t>();
13856 }
13857 );
13858
13859 // compress (as needed) and calc needed space
13860 uint64_t need = 0;
13861 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13862 for (auto& wi : wctx->writes) {
13863 if (c && wi.blob_length > min_alloc_size) {
13864 auto start = mono_clock::now();
13865
13866 // compress
13867 ceph_assert(wi.b_off == 0);
13868 ceph_assert(wi.blob_length == wi.bl.length());
13869
13870 // FIXME: memory alignment here is bad
13871 bufferlist t;
13872 int r = c->compress(wi.bl, t);
13873 uint64_t want_len_raw = wi.blob_length * crr;
13874 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
13875 bool rejected = false;
13876 uint64_t compressed_len = t.length();
13877 // do an approximate (fast) estimation for resulting blob size
13878 // that doesn't take header overhead into account
13879 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
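// e.g. (illustrative): with min_alloc_size 0x1000, blob_length 0x10000 and
// crr 0.875, want_len = 0xe000; a compressed payload of 0x9c00 rounds up to
// result_len 0xa000 <= 0xe000 and is kept, whereas 0xe800 would round up to
// 0xf000 > 0xe000 and be rejected.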
13880 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
13881 bluestore_compression_header_t chdr;
13882 chdr.type = c->get_type();
13883 chdr.length = t.length();
13884 encode(chdr, wi.compressed_bl);
13885 wi.compressed_bl.claim_append(t);
13886
13887 compressed_len = wi.compressed_bl.length();
13888 result_len = p2roundup(compressed_len, min_alloc_size);
13889 if (result_len <= want_len && result_len < wi.blob_length) {
13890 // Cool. We compressed at least as much as we were hoping to.
13891 // pad out to min_alloc_size
13892 wi.compressed_bl.append_zero(result_len - compressed_len);
13893 wi.compressed_len = compressed_len;
13894 wi.compressed = true;
13895 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
13896 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
13897 << " -> 0x" << compressed_len << " => 0x" << result_len
13898 << " with " << c->get_type()
13899 << std::dec << dendl;
13900 txc->statfs_delta.compressed() += compressed_len;
13901 txc->statfs_delta.compressed_original() += wi.blob_length;
13902 txc->statfs_delta.compressed_allocated() += result_len;
13903 logger->inc(l_bluestore_compress_success_count);
13904 need += result_len;
13905 } else {
13906 rejected = true;
13907 }
13908 } else if (r != 0) {
13909 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
13910 << " bytes compressed using " << c->get_type_name()
13911 << std::dec
13912 << " failed with errcode = " << r
13913 << ", leaving uncompressed"
13914 << dendl;
13915 logger->inc(l_bluestore_compress_rejected_count);
13916 need += wi.blob_length;
13917 } else {
13918 rejected = true;
13919 }
13920
13921 if (rejected) {
13922 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
13923 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
13924 << " with " << c->get_type()
13925 << ", which is more than required 0x" << want_len_raw
13926 << " -> 0x" << want_len
13927 << ", leaving uncompressed"
13928 << std::dec << dendl;
13929 logger->inc(l_bluestore_compress_rejected_count);
13930 need += wi.blob_length;
13931 }
13932 log_latency("compress@_do_alloc_write",
13933 l_bluestore_compress_lat,
13934 mono_clock::now() - start,
13935 cct->_conf->bluestore_log_op_age );
13936 } else {
13937 need += wi.blob_length;
13938 }
13939 }
13940 PExtentVector prealloc;
13941 prealloc.reserve(2 * wctx->writes.size());
13942 int64_t prealloc_left = 0;
13943 prealloc_left = alloc->allocate(
13944 need, min_alloc_size, need,
13945 0, &prealloc);
13946 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
13947 derr << __func__ << " failed to allocate 0x" << std::hex << need
13948 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
13949 << " min_alloc_size 0x" << min_alloc_size
13950 << " available 0x " << alloc->get_free()
13951 << std::dec << dendl;
13952 if (prealloc.size()) {
13953 alloc->release(prealloc);
13954 }
13955 return -ENOSPC;
13956 }
13957 _collect_allocation_stats(need, min_alloc_size, prealloc.size());
13958
13959 dout(20) << __func__ << " prealloc " << prealloc << dendl;
13960 auto prealloc_pos = prealloc.begin();
13961
13962 for (auto& wi : wctx->writes) {
13963 BlobRef b = wi.b;
13964 bluestore_blob_t& dblob = b->dirty_blob();
13965 uint64_t b_off = wi.b_off;
13966 bufferlist *l = &wi.bl;
13967 uint64_t final_length = wi.blob_length;
13968 uint64_t csum_length = wi.blob_length;
13969 if (wi.compressed) {
13970 final_length = wi.compressed_bl.length();
13971 csum_length = final_length;
13972 unsigned csum_order = ctz(csum_length);
13973 l = &wi.compressed_bl;
13974 dblob.set_compressed(wi.blob_length, wi.compressed_len);
13975 if (csum != Checksummer::CSUM_NONE) {
13976 dout(20) << __func__ << " initialize csum setting for compressed blob " << *b
13977 << " csum_type " << Checksummer::get_csum_type_string(csum)
13978 << " csum_order " << csum_order
13979 << " csum_length 0x" << std::hex << csum_length
13980 << " blob_length 0x" << wi.blob_length
13981 << " compressed_length 0x" << wi.compressed_len << std::dec
13982 << dendl;
13983 dblob.init_csum(csum, csum_order, csum_length);
13984 }
13985 } else if (wi.new_blob) {
13986 unsigned csum_order;
13987 // initialize newly created blob only
13988 ceph_assert(dblob.is_mutable());
13989 if (l->length() != wi.blob_length) {
13990 // hrm, maybe we could do better here, but let's not bother.
13991 dout(20) << __func__ << " forcing csum_order to block_size_order "
13992 << block_size_order << dendl;
13993 csum_order = block_size_order;
13994 } else {
13995 csum_order = std::min(wctx->csum_order, ctz(l->length()));
13996 }
13997 // try to align blob with max_blob_size to improve
13998 // its reuse ratio, e.g. in case of reverse write
13999 uint32_t suggested_boff =
14000 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
14001 if ((suggested_boff % (1 << csum_order)) == 0 &&
14002 suggested_boff + final_length <= max_bsize &&
14003 suggested_boff > b_off) {
14004 dout(20) << __func__ << " forcing blob_offset to 0x"
14005 << std::hex << suggested_boff << std::dec << dendl;
14006 ceph_assert(suggested_boff >= b_off);
14007 csum_length += suggested_boff - b_off;
14008 b_off = suggested_boff;
14009 }
14010 if (csum != Checksummer::CSUM_NONE) {
14011 dout(20) << __func__ << " initialize csum setting for new blob " << *b
14012 << " csum_type " << Checksummer::get_csum_type_string(csum)
14013 << " csum_order " << csum_order
14014 << " csum_length 0x" << std::hex << csum_length << std::dec
14015 << dendl;
14016 dblob.init_csum(csum, csum_order, csum_length);
14017 }
14018 }
14019
14020 PExtentVector extents;
14021 int64_t left = final_length;
14022 while (left > 0) {
14023 ceph_assert(prealloc_left > 0);
14024 if (prealloc_pos->length <= left) {
14025 prealloc_left -= prealloc_pos->length;
14026 left -= prealloc_pos->length;
14027 txc->statfs_delta.allocated() += prealloc_pos->length;
14028 extents.push_back(*prealloc_pos);
14029 ++prealloc_pos;
14030 } else {
14031 extents.emplace_back(prealloc_pos->offset, left);
14032 prealloc_pos->offset += left;
14033 prealloc_pos->length -= left;
14034 prealloc_left -= left;
14035 txc->statfs_delta.allocated() += left;
14036 left = 0;
14037 break;
14038 }
14039 }
14040 for (auto& p : extents) {
14041 txc->allocated.insert(p.offset, p.length);
14042 }
14043 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
14044
14045 dout(20) << __func__ << " blob " << *b << dendl;
14046 if (dblob.has_csum()) {
14047 dblob.calc_csum(b_off, *l);
14048 }
14049
14050 if (wi.mark_unused) {
14051 ceph_assert(!dblob.is_compressed());
14052 auto b_end = b_off + wi.bl.length();
14053 if (b_off) {
14054 dblob.add_unused(0, b_off);
14055 }
14056 uint64_t llen = dblob.get_logical_length();
14057 if (b_end < llen) {
14058 dblob.add_unused(b_end, llen - b_end);
14059 }
14060 }
14061
14062 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
14063 b_off + (wi.b_off0 - wi.b_off),
14064 wi.length0,
14065 wi.b,
14066 nullptr);
14067 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
14068 txc->statfs_delta.stored() += le->length;
14069 dout(20) << __func__ << " lex " << *le << dendl;
14070 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
14071 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14072
14073 // queue io
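// payloads no larger than prefer_deferred_size (configurable; typically a
// few tens of KiB on HDDs) go through the deferred/journaled path, larger
// ones are submitted directly via aio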
14074 if (!g_conf()->bluestore_debug_omit_block_device_write) {
14075 if (l->length() <= prefer_deferred_size.load()) {
14076 dout(20) << __func__ << " deferring small 0x" << std::hex
14077 << l->length() << std::dec << " write via deferred" << dendl;
14078 bluestore_deferred_op_t *op = _get_deferred_op(txc);
14079 op->op = bluestore_deferred_op_t::OP_WRITE;
14080 int r = b->get_blob().map(
14081 b_off, l->length(),
14082 [&](uint64_t offset, uint64_t length) {
14083 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14084 return 0;
14085 });
14086 ceph_assert(r == 0);
14087 op->data = *l;
14088 logger->inc(l_bluestore_write_small_deferred);
14089 } else {
14090 b->get_blob().map_bl(
14091 b_off, *l,
14092 [&](uint64_t offset, bufferlist& t) {
14093 bdev->aio_write(offset, t, &txc->ioc, false);
14094 });
14095 logger->inc(l_bluestore_write_small_new);
14096 }
14097 }
14098 }
14099 ceph_assert(prealloc_pos == prealloc.end());
14100 ceph_assert(prealloc_left == 0);
14101 return 0;
14102 }
14103
14104 void BlueStore::_wctx_finish(
14105 TransContext *txc,
14106 CollectionRef& c,
14107 OnodeRef o,
14108 WriteContext *wctx,
14109 set<SharedBlob*> *maybe_unshared_blobs)
14110 {
14111 auto oep = wctx->old_extents.begin();
14112 while (oep != wctx->old_extents.end()) {
14113 auto &lo = *oep;
14114 oep = wctx->old_extents.erase(oep);
14115 dout(20) << __func__ << " lex_old " << lo.e << dendl;
14116 BlobRef b = lo.e.blob;
14117 const bluestore_blob_t& blob = b->get_blob();
14118 if (blob.is_compressed()) {
14119 if (lo.blob_empty) {
14120 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
14121 }
14122 txc->statfs_delta.compressed_original() -= lo.e.length;
14123 }
14124 auto& r = lo.r;
14125 txc->statfs_delta.stored() -= lo.e.length;
14126 if (!r.empty()) {
14127 dout(20) << __func__ << " blob release " << r << dendl;
14128 if (blob.is_shared()) {
14129 PExtentVector final;
14130 c->load_shared_blob(b->shared_blob);
14131 bool unshare = false;
14132 bool* unshare_ptr =
14133 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
14134 for (auto e : r) {
14135 b->shared_blob->put_ref(
14136 e.offset, e.length, &final,
14137 unshare_ptr);
14138 }
14139 if (unshare) {
14140 ceph_assert(maybe_unshared_blobs);
14141 maybe_unshared_blobs->insert(b->shared_blob.get());
14142 }
14143 dout(20) << __func__ << " shared_blob release " << final
14144 << " from " << *b->shared_blob << dendl;
14145 txc->write_shared_blob(b->shared_blob);
14146 r.clear();
14147 r.swap(final);
14148 }
14149 }
14150 // we can't invalidate our logical extents as we drop them because
14151 // other lextents (either in our onode or others) may still
14152 // reference them. But we can throw out anything that is no
14153 // longer allocated. Note that this will leave behind edge bits
14154 // that are no longer referenced but not deallocated (until they
14155 // age out of the cache naturally).
14156 b->discard_unallocated(c.get());
14157 for (auto e : r) {
14158 dout(20) << __func__ << " release " << e << dendl;
14159 txc->released.insert(e.offset, e.length);
14160 txc->statfs_delta.allocated() -= e.length;
14161 if (blob.is_compressed()) {
14162 txc->statfs_delta.compressed_allocated() -= e.length;
14163 }
14164 }
14165
14166 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
14167 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
14168 << dendl;
14169 o->extent_map.spanning_blob_map.erase(b->id);
14170 }
14171 delete &lo;
14172 }
14173 }
14174
14175 void BlueStore::_do_write_data(
14176 TransContext *txc,
14177 CollectionRef& c,
14178 OnodeRef o,
14179 uint64_t offset,
14180 uint64_t length,
14181 bufferlist& bl,
14182 WriteContext *wctx)
14183 {
14184 uint64_t end = offset + length;
14185 bufferlist::iterator p = bl.begin();
14186
14187 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
14188 (length != min_alloc_size)) {
14189 // we fall within the same block
14190 _do_write_small(txc, c, o, offset, length, p, wctx);
14191 } else {
14192 uint64_t head_offset, head_length;
14193 uint64_t middle_offset, middle_length;
14194 uint64_t tail_offset, tail_length;
14195
14196 head_offset = offset;
14197 head_length = p2nphase(offset, min_alloc_size);
14198
14199 tail_offset = p2align(end, min_alloc_size);
14200 tail_length = p2phase(end, min_alloc_size);
14201
14202 middle_offset = head_offset + head_length;
14203 middle_length = length - head_length - tail_length;
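// e.g. (illustrative): with min_alloc_size 0x10000, a write 0x18000~0x30000
// splits into head 0x18000~0x8000, middle 0x20000~0x20000 and tail
// 0x40000~0x8000.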
14204
14205 if (head_length) {
14206 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
14207 }
14208
14209 if (middle_length) {
14210 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
14211 }
14212
14213 if (tail_length) {
14214 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
14215 }
14216 }
14217 }
14218
14219 void BlueStore::_choose_write_options(
14220 CollectionRef& c,
14221 OnodeRef o,
14222 uint32_t fadvise_flags,
14223 WriteContext *wctx)
14224 {
14225 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
14226 dout(20) << __func__ << " will do buffered write" << dendl;
14227 wctx->buffered = true;
14228 } else if (cct->_conf->bluestore_default_buffered_write &&
14229 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
14230 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
14231 dout(20) << __func__ << " defaulting to buffered write" << dendl;
14232 wctx->buffered = true;
14233 }
14234
14235 // apply basic csum block size
14236 wctx->csum_order = block_size_order;
14237
14238 // compression parameters
14239 unsigned alloc_hints = o->onode.alloc_hint_flags;
14240 auto cm = select_option(
14241 "compression_mode",
14242 comp_mode.load(),
14243 [&]() {
14244 string val;
14245 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
14246 return boost::optional<Compressor::CompressionMode>(
14247 Compressor::get_comp_mode_type(val));
14248 }
14249 return boost::optional<Compressor::CompressionMode>();
14250 }
14251 );
14252
14253 wctx->compress = (cm != Compressor::COMP_NONE) &&
14254 ((cm == Compressor::COMP_FORCE) ||
14255 (cm == Compressor::COMP_AGGRESSIVE &&
14256 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
14257 (cm == Compressor::COMP_PASSIVE &&
14258 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
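// i.e. compress when the mode is 'force'; with 'aggressive' unless the
// object is hinted incompressible; with 'passive' only when it is hinted
// compressible; never with 'none'.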
14259
14260 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
14261 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
14262 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
14263 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
14264 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
14265
14266 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
14267
14268 if (o->onode.expected_write_size) {
14269 wctx->csum_order = std::max(min_alloc_size_order,
14270 (uint8_t)ctz(o->onode.expected_write_size));
14271 } else {
14272 wctx->csum_order = min_alloc_size_order;
14273 }
14274
14275 if (wctx->compress) {
14276 wctx->target_blob_size = select_option(
14277 "compression_max_blob_size",
14278 comp_max_blob_size.load(),
14279 [&]() {
14280 int64_t val;
14281 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
14282 return boost::optional<uint64_t>((uint64_t)val);
14283 }
14284 return boost::optional<uint64_t>();
14285 }
14286 );
14287 }
14288 } else {
14289 if (wctx->compress) {
14290 wctx->target_blob_size = select_option(
14291 "compression_min_blob_size",
14292 comp_min_blob_size.load(),
14293 [&]() {
14294 int64_t val;
14295 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
14296 return boost::optional<uint64_t>((uint64_t)val);
14297 }
14298 return boost::optional<uint64_t>();
14299 }
14300 );
14301 }
14302 }
14303
14304 uint64_t max_bsize = max_blob_size.load();
14305 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
14306 wctx->target_blob_size = max_bsize;
14307 }
14308
14309 // set the min blob size floor at 2x the min_alloc_size, or else we
14310 // won't be able to allocate a smaller extent for the compressed
14311 // data.
14312 if (wctx->compress &&
14313 wctx->target_blob_size < min_alloc_size * 2) {
14314 wctx->target_blob_size = min_alloc_size * 2;
14315 }
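// e.g. (illustrative): with min_alloc_size 0x10000 the target blob is at
// least 0x20000, so a 2:1 compression result can still occupy a single,
// smaller 0x10000 allocation unit.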
14316
14317 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
14318 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
14319 << " compress=" << (int)wctx->compress
14320 << " buffered=" << (int)wctx->buffered
14321 << std::dec << dendl;
14322 }
14323
14324 int BlueStore::_do_gc(
14325 TransContext *txc,
14326 CollectionRef& c,
14327 OnodeRef o,
14328 const WriteContext& wctx,
14329 uint64_t *dirty_start,
14330 uint64_t *dirty_end)
14331 {
14332
14333 bool dirty_range_updated = false;
14334 WriteContext wctx_gc;
14335 wctx_gc.fork(wctx); // make a clone for garbage collection
14336
14337 auto & extents_to_collect = wctx.extents_to_gc;
14338 for (auto it = extents_to_collect.begin();
14339 it != extents_to_collect.end();
14340 ++it) {
14341 bufferlist bl;
14342 auto offset = (*it).first;
14343 auto length = (*it).second;
14344 dout(20) << __func__ << " processing " << std::hex
14345 << offset << "~" << length << std::dec
14346 << dendl;
14347 int r = _do_read(c.get(), o, offset, length, bl, 0);
14348 ceph_assert(r == (int)length);
14349
14350 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
14351 logger->inc(l_bluestore_gc_merged, length);
14352
14353 if (*dirty_start > offset) {
14354 *dirty_start = offset;
14355 dirty_range_updated = true;
14356 }
14357
14358 if (*dirty_end < offset + length) {
14359 *dirty_end = offset + length;
14360 dirty_range_updated = true;
14361 }
14362 }
14363 if (dirty_range_updated) {
14364 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
14365 }
14366
14367 dout(30) << __func__ << " alloc write" << dendl;
14368 int r = _do_alloc_write(txc, c, o, &wctx_gc);
14369 if (r < 0) {
14370 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14371 << dendl;
14372 return r;
14373 }
14374
14375 _wctx_finish(txc, c, o, &wctx_gc);
14376 return 0;
14377 }
14378
14379 int BlueStore::_do_write(
14380 TransContext *txc,
14381 CollectionRef& c,
14382 OnodeRef o,
14383 uint64_t offset,
14384 uint64_t length,
14385 bufferlist& bl,
14386 uint32_t fadvise_flags)
14387 {
14388 int r = 0;
14389
14390 dout(20) << __func__
14391 << " " << o->oid
14392 << " 0x" << std::hex << offset << "~" << length
14393 << " - have 0x" << o->onode.size
14394 << " (" << std::dec << o->onode.size << ")"
14395 << " bytes"
14396 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
14397 << dendl;
14398 _dump_onode<30>(cct, *o);
14399
14400 if (length == 0) {
14401 return 0;
14402 }
14403
14404 uint64_t end = offset + length;
14405
14406 GarbageCollector gc(c->store->cct);
14407 int64_t benefit = 0;
14408 auto dirty_start = offset;
14409 auto dirty_end = end;
14410
14411 WriteContext wctx;
14412 _choose_write_options(c, o, fadvise_flags, &wctx);
14413 o->extent_map.fault_range(db, offset, length);
14414 _do_write_data(txc, c, o, offset, length, bl, &wctx);
14415 r = _do_alloc_write(txc, c, o, &wctx);
14416 if (r < 0) {
14417 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14418 << dendl;
14419 goto out;
14420 }
14421
14422 if (wctx.extents_to_gc.empty() ||
14423 wctx.extents_to_gc.range_start() > offset ||
14424 wctx.extents_to_gc.range_end() < offset + length) {
14425 benefit = gc.estimate(offset,
14426 length,
14427 o->extent_map,
14428 wctx.old_extents,
14429 min_alloc_size);
14430 }
14431
14432 // NB: _wctx_finish() will empty old_extents
14433 // so we must do gc estimation before that
14434 _wctx_finish(txc, c, o, &wctx);
14435 if (end > o->onode.size) {
14436 dout(20) << __func__ << " extending size to 0x" << std::hex << end
14437 << std::dec << dendl;
14438 o->onode.size = end;
14439 }
14440
14441 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
14442 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
14443 dout(20) << __func__
14444 << " perform garbage collection for compressed extents, "
14445 << "expected benefit = " << benefit << " AUs" << dendl;
14446 }
14447 if (!wctx.extents_to_gc.empty()) {
14448 dout(20) << __func__ << " perform garbage collection" << dendl;
14449
14450 r = _do_gc(txc, c, o,
14451 wctx,
14452 &dirty_start, &dirty_end);
14453 if (r < 0) {
14454 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
14455 << dendl;
14456 goto out;
14457 }
14458 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
14459 << "~" << dirty_end - dirty_start << std::dec << dendl;
14460 }
14461 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
14462 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
14463
14464 r = 0;
14465
14466 out:
14467 return r;
14468 }
14469
14470 int BlueStore::_write(TransContext *txc,
14471 CollectionRef& c,
14472 OnodeRef& o,
14473 uint64_t offset, size_t length,
14474 bufferlist& bl,
14475 uint32_t fadvise_flags)
14476 {
14477 dout(15) << __func__ << " " << c->cid << " " << o->oid
14478 << " 0x" << std::hex << offset << "~" << length << std::dec
14479 << dendl;
14480 int r = 0;
14481 if (offset + length >= OBJECT_MAX_SIZE) {
14482 r = -E2BIG;
14483 } else {
14484 _assign_nid(txc, o);
14485 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
14486 txc->write_onode(o);
14487 }
14488 dout(10) << __func__ << " " << c->cid << " " << o->oid
14489 << " 0x" << std::hex << offset << "~" << length << std::dec
14490 << " = " << r << dendl;
14491 return r;
14492 }
14493
14494 int BlueStore::_zero(TransContext *txc,
14495 CollectionRef& c,
14496 OnodeRef& o,
14497 uint64_t offset, size_t length)
14498 {
14499 dout(15) << __func__ << " " << c->cid << " " << o->oid
14500 << " 0x" << std::hex << offset << "~" << length << std::dec
14501 << dendl;
14502 int r = 0;
14503 if (offset + length >= OBJECT_MAX_SIZE) {
14504 r = -E2BIG;
14505 } else {
14506 _assign_nid(txc, o);
14507 r = _do_zero(txc, c, o, offset, length);
14508 }
14509 dout(10) << __func__ << " " << c->cid << " " << o->oid
14510 << " 0x" << std::hex << offset << "~" << length << std::dec
14511 << " = " << r << dendl;
14512 return r;
14513 }
14514
14515 int BlueStore::_do_zero(TransContext *txc,
14516 CollectionRef& c,
14517 OnodeRef& o,
14518 uint64_t offset, size_t length)
14519 {
14520 dout(15) << __func__ << " " << c->cid << " " << o->oid
14521 << " 0x" << std::hex << offset << "~" << length << std::dec
14522 << dendl;
14523 int r = 0;
14524
14525 _dump_onode<30>(cct, *o);
14526
14527 WriteContext wctx;
14528 o->extent_map.fault_range(db, offset, length);
14529 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
14530 o->extent_map.dirty_range(offset, length);
14531 _wctx_finish(txc, c, o, &wctx);
14532
14533 if (length > 0 && offset + length > o->onode.size) {
14534 o->onode.size = offset + length;
14535 dout(20) << __func__ << " extending size to " << offset + length
14536 << dendl;
14537 }
14538 txc->write_onode(o);
14539
14540 dout(10) << __func__ << " " << c->cid << " " << o->oid
14541 << " 0x" << std::hex << offset << "~" << length << std::dec
14542 << " = " << r << dendl;
14543 return r;
14544 }
14545
14546 void BlueStore::_do_truncate(
14547 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
14548 set<SharedBlob*> *maybe_unshared_blobs)
14549 {
14550 dout(15) << __func__ << " " << c->cid << " " << o->oid
14551 << " 0x" << std::hex << offset << std::dec << dendl;
14552
14553 _dump_onode<30>(cct, *o);
14554
14555 if (offset == o->onode.size)
14556 return;
14557
14558 if (offset < o->onode.size) {
14559 WriteContext wctx;
14560 uint64_t length = o->onode.size - offset;
14561 o->extent_map.fault_range(db, offset, length);
14562 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
14563 o->extent_map.dirty_range(offset, length);
14564 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
14565
14566 // if we have shards past EOF, ask for a reshard
14567 if (!o->onode.extent_map_shards.empty() &&
14568 o->onode.extent_map_shards.back().offset >= offset) {
14569 dout(10) << __func__ << " request reshard past EOF" << dendl;
14570 if (offset) {
14571 o->extent_map.request_reshard(offset - 1, offset + length);
14572 } else {
14573 o->extent_map.request_reshard(0, length);
14574 }
14575 }
14576 }
14577
14578 o->onode.size = offset;
14579
14580 txc->write_onode(o);
14581 }
14582
14583 int BlueStore::_truncate(TransContext *txc,
14584 CollectionRef& c,
14585 OnodeRef& o,
14586 uint64_t offset)
14587 {
14588 dout(15) << __func__ << " " << c->cid << " " << o->oid
14589 << " 0x" << std::hex << offset << std::dec
14590 << dendl;
14591 int r = 0;
14592 if (offset >= OBJECT_MAX_SIZE) {
14593 r = -E2BIG;
14594 } else {
14595 _do_truncate(txc, c, o, offset);
14596 }
14597 dout(10) << __func__ << " " << c->cid << " " << o->oid
14598 << " 0x" << std::hex << offset << std::dec
14599 << " = " << r << dendl;
14600 return r;
14601 }
14602
14603 int BlueStore::_do_remove(
14604 TransContext *txc,
14605 CollectionRef& c,
14606 OnodeRef o)
14607 {
14608 set<SharedBlob*> maybe_unshared_blobs;
14609 bool is_gen = !o->oid.is_no_gen();
14610 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
14611 if (o->onode.has_omap()) {
14612 o->flush();
14613 _do_omap_clear(txc, o);
14614 }
14615 o->exists = false;
14616 string key;
14617 for (auto &s : o->extent_map.shards) {
14618 dout(20) << __func__ << " removing shard 0x" << std::hex
14619 << s.shard_info->offset << std::dec << dendl;
14620 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
14621 [&](const string& final_key) {
14622 txc->t->rmkey(PREFIX_OBJ, final_key);
14623 }
14624 );
14625 }
14626 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
14627 txc->note_removed_object(o);
14628 o->extent_map.clear();
14629 o->onode = bluestore_onode_t();
14630 _debug_obj_on_delete(o->oid);
14631
14632 if (!is_gen || maybe_unshared_blobs.empty()) {
14633 return 0;
14634 }
14635
14636 // see if we can unshare blobs still referenced by the head
14637 dout(10) << __func__ << " gen and maybe_unshared_blobs "
14638 << maybe_unshared_blobs << dendl;
14639 ghobject_t nogen = o->oid;
14640 nogen.generation = ghobject_t::NO_GEN;
14641 OnodeRef h = c->onode_map.lookup(nogen);
14642
14643 if (!h || !h->exists) {
14644 return 0;
14645 }
14646
14647 dout(20) << __func__ << " checking for unshareable blobs on " << h
14648 << " " << h->oid << dendl;
14649 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
14650 for (auto& e : h->extent_map.extent_map) {
14651 const bluestore_blob_t& b = e.blob->get_blob();
14652 SharedBlob *sb = e.blob->shared_blob.get();
14653 if (b.is_shared() &&
14654 sb->loaded &&
14655 maybe_unshared_blobs.count(sb)) {
14656 if (b.is_compressed()) {
14657 expect[sb].get(0, b.get_ondisk_length());
14658 } else {
14659 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
14660 expect[sb].get(off, len);
14661 return 0;
14662 });
14663 }
14664 }
14665 }
14666
14667 vector<SharedBlob*> unshared_blobs;
14668 unshared_blobs.reserve(maybe_unshared_blobs.size());
14669 for (auto& p : expect) {
14670 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
14671 if (p.first->persistent->ref_map == p.second) {
14672 SharedBlob *sb = p.first;
14673 dout(20) << __func__ << " unsharing " << *sb << dendl;
14674 unshared_blobs.push_back(sb);
14675 txc->unshare_blob(sb);
14676 uint64_t sbid = c->make_blob_unshared(sb);
14677 string key;
14678 get_shared_blob_key(sbid, &key);
14679 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
14680 }
14681 }
14682
14683 if (unshared_blobs.empty()) {
14684 return 0;
14685 }
14686
14687 for (auto& e : h->extent_map.extent_map) {
14688 const bluestore_blob_t& b = e.blob->get_blob();
14689 SharedBlob *sb = e.blob->shared_blob.get();
14690 if (b.is_shared() &&
14691 std::find(unshared_blobs.begin(), unshared_blobs.end(),
14692 sb) != unshared_blobs.end()) {
14693 dout(20) << __func__ << " unsharing " << e << dendl;
14694 bluestore_blob_t& blob = e.blob->dirty_blob();
14695 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
14696 h->extent_map.dirty_range(e.logical_offset, 1);
14697 }
14698 }
14699 txc->write_onode(h);
14700
14701 return 0;
14702 }
14703
14704 int BlueStore::_remove(TransContext *txc,
14705 CollectionRef& c,
14706 OnodeRef &o)
14707 {
14708 dout(15) << __func__ << " " << c->cid << " " << o->oid
14709 << " onode " << o.get()
14710 << " txc "<< txc << dendl;
14711
14712 auto start_time = mono_clock::now();
14713 int r = _do_remove(txc, c, o);
14714 log_latency_fn(
14715 __func__,
14716 l_bluestore_remove_lat,
14717 mono_clock::now() - start_time,
14718 cct->_conf->bluestore_log_op_age,
14719 [&](const ceph::timespan& lat) {
14720 ostringstream ostr;
14721 ostr << ", lat = " << timespan_str(lat)
14722 << " cid =" << c->cid
14723 << " oid =" << o->oid;
14724 return ostr.str();
14725 }
14726 );
14727
14728 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14729 return r;
14730 }
14731
14732 int BlueStore::_setattr(TransContext *txc,
14733 CollectionRef& c,
14734 OnodeRef& o,
14735 const string& name,
14736 bufferptr& val)
14737 {
14738 dout(15) << __func__ << " " << c->cid << " " << o->oid
14739 << " " << name << " (" << val.length() << " bytes)"
14740 << dendl;
14741 int r = 0;
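// a partial bufferptr references a slice of a larger raw buffer; copying it
// here (presumably) avoids pinning that whole buffer in the onode's attrs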
14742 if (val.is_partial()) {
14743 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
14744 val.length());
14745 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14746 } else {
14747 auto& b = o->onode.attrs[name.c_str()] = val;
14748 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14749 }
14750 txc->write_onode(o);
14751 dout(10) << __func__ << " " << c->cid << " " << o->oid
14752 << " " << name << " (" << val.length() << " bytes)"
14753 << " = " << r << dendl;
14754 return r;
14755 }
14756
14757 int BlueStore::_setattrs(TransContext *txc,
14758 CollectionRef& c,
14759 OnodeRef& o,
14760 const map<string,bufferptr>& aset)
14761 {
14762 dout(15) << __func__ << " " << c->cid << " " << o->oid
14763 << " " << aset.size() << " keys"
14764 << dendl;
14765 int r = 0;
14766 for (map<string,bufferptr>::const_iterator p = aset.begin();
14767 p != aset.end(); ++p) {
14768 if (p->second.is_partial()) {
14769 auto& b = o->onode.attrs[p->first.c_str()] =
14770 bufferptr(p->second.c_str(), p->second.length());
14771 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14772 } else {
14773 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
14774 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14775 }
14776 }
14777 txc->write_onode(o);
14778 dout(10) << __func__ << " " << c->cid << " " << o->oid
14779 << " " << aset.size() << " keys"
14780 << " = " << r << dendl;
14781 return r;
14782 }
14783
14784
14785 int BlueStore::_rmattr(TransContext *txc,
14786 CollectionRef& c,
14787 OnodeRef& o,
14788 const string& name)
14789 {
14790 dout(15) << __func__ << " " << c->cid << " " << o->oid
14791 << " " << name << dendl;
14792 int r = 0;
14793 auto it = o->onode.attrs.find(name.c_str());
14794 if (it == o->onode.attrs.end())
14795 goto out;
14796
14797 o->onode.attrs.erase(it);
14798 txc->write_onode(o);
14799
14800 out:
14801 dout(10) << __func__ << " " << c->cid << " " << o->oid
14802 << " " << name << " = " << r << dendl;
14803 return r;
14804 }
14805
14806 int BlueStore::_rmattrs(TransContext *txc,
14807 CollectionRef& c,
14808 OnodeRef& o)
14809 {
14810 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14811 int r = 0;
14812
14813 if (o->onode.attrs.empty())
14814 goto out;
14815
14816 o->onode.attrs.clear();
14817 txc->write_onode(o);
14818
14819 out:
14820 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14821 return r;
14822 }
14823
14824 void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
14825 {
14826 const string& omap_prefix = o->get_omap_prefix();
14827 string prefix, tail;
14828 o->get_omap_header(&prefix);
14829 o->get_omap_tail(&tail);
14830 txc->t->rm_range_keys(omap_prefix, prefix, tail);
14831 txc->t->rmkey(omap_prefix, tail);
14832 dout(20) << __func__ << " remove range start: "
14833 << pretty_binary_string(prefix) << " end: "
14834 << pretty_binary_string(tail) << dendl;
14835 }
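
// A minimal sketch (illustrative only; the helper name and the std::map
// stand-in for the KV store are hypothetical) of the range-delete semantics
// relied on above: every omap entry for an object sorts between the object's
// header key and its tail sentinel, so one half-open range erase plus
// removing the tail key clears the whole object.
static void example_rm_range_keys(std::map<std::string, std::string>& kv,
                                  const std::string& first,
                                  const std::string& last)
{
  // erase every key k with first <= k < last, mirroring rm_range_keys()
  kv.erase(kv.lower_bound(first), kv.lower_bound(last));
}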
14836
14837 int BlueStore::_omap_clear(TransContext *txc,
14838 CollectionRef& c,
14839 OnodeRef& o)
14840 {
14841 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14842 int r = 0;
14843 if (o->onode.has_omap()) {
14844 o->flush();
14845 _do_omap_clear(txc, o);
14846 o->onode.clear_omap_flag();
14847 txc->write_onode(o);
14848 }
14849 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14850 return r;
14851 }
14852
14853 int BlueStore::_omap_setkeys(TransContext *txc,
14854 CollectionRef& c,
14855 OnodeRef& o,
14856 bufferlist &bl)
14857 {
14858 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14859 int r;
14860 auto p = bl.cbegin();
14861 __u32 num;
14862 if (!o->onode.has_omap()) {
14863 if (o->oid.is_pgmeta()) {
14864 o->onode.set_omap_flags_pgmeta();
14865 } else {
14866 o->onode.set_omap_flags();
14867 }
14868 txc->write_onode(o);
14869
14870 const string& prefix = o->get_omap_prefix();
14871 string key_tail;
14872 bufferlist tail;
14873 o->get_omap_tail(&key_tail);
14874 txc->t->set(prefix, key_tail, tail);
14875 } else {
14876 txc->note_modified_object(o);
14877 }
14878 const string& prefix = o->get_omap_prefix();
14879 string final_key;
14880 o->get_omap_key(string(), &final_key);
14881 size_t base_key_len = final_key.size();
14882 decode(num, p);
14883 while (num--) {
14884 string key;
14885 bufferlist value;
14886 decode(key, p);
14887 decode(value, p);
14888 final_key.resize(base_key_len); // keep prefix
14889 final_key += key;
14890 dout(20) << __func__ << " " << pretty_binary_string(final_key)
14891 << " <- " << key << dendl;
14892 txc->t->set(prefix, final_key, value);
14893 }
14894 r = 0;
14895 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14896 return r;
14897 }
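
// A hedged sketch of how a caller could build the bufferlist consumed by
// _omap_setkeys() above: a __u32 count followed by alternating (key, value)
// pairs, using the same encode() helpers that the decode loop expects.
// The function name and the std::map input are illustrative, not an actual
// BlueStore interface.
static bufferlist example_encode_omap_setkeys(
  const std::map<std::string, bufferlist>& kv)
{
  bufferlist bl;
  __u32 num = kv.size();
  encode(num, bl);                // matches decode(num, p)
  for (auto& [key, value] : kv) {
    encode(key, bl);              // matches decode(key, p)
    encode(value, bl);            // matches decode(value, p)
  }
  return bl;
}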
14898
14899 int BlueStore::_omap_setheader(TransContext *txc,
14900 CollectionRef& c,
14901 OnodeRef &o,
14902 bufferlist& bl)
14903 {
14904 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14905 int r;
14906 string key;
14907 if (!o->onode.has_omap()) {
14908 if (o->oid.is_pgmeta()) {
14909 o->onode.set_omap_flags_pgmeta();
14910 } else {
14911 o->onode.set_omap_flags();
14912 }
14913 txc->write_onode(o);
14914
14915 const string& prefix = o->get_omap_prefix();
14916 string key_tail;
14917 bufferlist tail;
14918 o->get_omap_tail(&key_tail);
14919 txc->t->set(prefix, key_tail, tail);
14920 } else {
14921 txc->note_modified_object(o);
14922 }
14923 const string& prefix = o->get_omap_prefix();
14924 o->get_omap_header(&key);
14925 txc->t->set(prefix, key, bl);
14926 r = 0;
14927 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14928 return r;
14929 }
14930
14931 int BlueStore::_omap_rmkeys(TransContext *txc,
14932 CollectionRef& c,
14933 OnodeRef& o,
14934 bufferlist& bl)
14935 {
14936 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14937 int r = 0;
14938 auto p = bl.cbegin();
14939 __u32 num;
14940 string final_key;
14941
14942 if (!o->onode.has_omap()) {
14943 goto out;
14944 }
14945 {
14946 const string& prefix = o->get_omap_prefix();
14947 o->get_omap_key(string(), &final_key);
14948 size_t base_key_len = final_key.size();
14949 decode(num, p);
14950 while (num--) {
14951 string key;
14952 decode(key, p);
14953 final_key.resize(base_key_len); // keep prefix
14954 final_key += key;
14955 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
14956 << " <- " << key << dendl;
14957 txc->t->rmkey(prefix, final_key);
14958 }
14959 }
14960 txc->note_modified_object(o);
14961
14962 out:
14963 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14964 return r;
14965 }
14966
14967 int BlueStore::_omap_rmkey_range(TransContext *txc,
14968 CollectionRef& c,
14969 OnodeRef& o,
14970 const string& first, const string& last)
14971 {
14972 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14973 string key_first, key_last;
14974 int r = 0;
14975 if (!o->onode.has_omap()) {
14976 goto out;
14977 }
14978 {
14979 const string& prefix = o->get_omap_prefix();
14980 o->flush();
14981 o->get_omap_key(first, &key_first);
14982 o->get_omap_key(last, &key_last);
14983 txc->t->rm_range_keys(prefix, key_first, key_last);
14984 dout(20) << __func__ << " remove range start: "
14985 << pretty_binary_string(key_first) << " end: "
14986 << pretty_binary_string(key_last) << dendl;
14987 }
14988 txc->note_modified_object(o);
14989
14990 out:
14991 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14992 return r;
14993 }
14994
14995 int BlueStore::_set_alloc_hint(
14996 TransContext *txc,
14997 CollectionRef& c,
14998 OnodeRef& o,
14999 uint64_t expected_object_size,
15000 uint64_t expected_write_size,
15001 uint32_t flags)
15002 {
15003 dout(15) << __func__ << " " << c->cid << " " << o->oid
15004 << " object_size " << expected_object_size
15005 << " write_size " << expected_write_size
15006 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15007 << dendl;
15008 int r = 0;
15009 o->onode.expected_object_size = expected_object_size;
15010 o->onode.expected_write_size = expected_write_size;
15011 o->onode.alloc_hint_flags = flags;
15012 txc->write_onode(o);
15013 dout(10) << __func__ << " " << c->cid << " " << o->oid
15014 << " object_size " << expected_object_size
15015 << " write_size " << expected_write_size
15016 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15017 << " = " << r << dendl;
15018 return r;
15019 }
15020
15021 int BlueStore::_clone(TransContext *txc,
15022 CollectionRef& c,
15023 OnodeRef& oldo,
15024 OnodeRef& newo)
15025 {
15026 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15027 << newo->oid << dendl;
15028 int r = 0;
15029 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
15030 derr << __func__ << " mismatched hash on " << oldo->oid
15031 << " and " << newo->oid << dendl;
15032 return -EINVAL;
15033 }
15034
15035 _assign_nid(txc, newo);
15036
15037 // clone data
15038 oldo->flush();
15039 _do_truncate(txc, c, newo, 0);
15040 if (cct->_conf->bluestore_clone_cow) {
15041 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
15042 } else {
15043 bufferlist bl;
15044 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
15045 if (r < 0)
15046 goto out;
15047 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
15048 if (r < 0)
15049 goto out;
15050 }
15051
15052 // clone attrs
15053 newo->onode.attrs = oldo->onode.attrs;
15054
15055 // clone omap
15056 if (newo->onode.has_omap()) {
15057 dout(20) << __func__ << " clearing old omap data" << dendl;
15058 newo->flush();
15059 _do_omap_clear(txc, newo);
15060 newo->onode.clear_omap_flag();
15061 }
15062 if (oldo->onode.has_omap()) {
15063 dout(20) << __func__ << " copying omap data" << dendl;
15064 if (newo->oid.is_pgmeta()) {
15065 newo->onode.set_omap_flags_pgmeta();
15066 } else {
15067 newo->onode.set_omap_flags();
15068 }
15069 const string& prefix = newo->get_omap_prefix();
15070 KeyValueDB::Iterator it = db->get_iterator(prefix);
15071 string head, tail;
15072 oldo->get_omap_header(&head);
15073 oldo->get_omap_tail(&tail);
15074 it->lower_bound(head);
15075 while (it->valid()) {
15076 if (it->key() >= tail) {
15077 dout(30) << __func__ << " reached tail" << dendl;
15078 break;
15079 } else {
15080 dout(30) << __func__ << " got header/data "
15081 << pretty_binary_string(it->key()) << dendl;
15082 string key;
15083 newo->rewrite_omap_key(it->key(), &key);
15084 txc->t->set(prefix, key, it->value());
15085 }
15086 it->next();
15087 }
15088 string new_tail;
15089 bufferlist new_tail_value;
15090 newo->get_omap_tail(&new_tail);
15091 txc->t->set(prefix, new_tail, new_tail_value);
15092 }
15093
15094 txc->write_onode(newo);
15095 r = 0;
15096
15097 out:
15098 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15099 << newo->oid << " = " << r << dendl;
15100 return r;
15101 }
15102
15103 int BlueStore::_do_clone_range(
15104 TransContext *txc,
15105 CollectionRef& c,
15106 OnodeRef& oldo,
15107 OnodeRef& newo,
15108 uint64_t srcoff,
15109 uint64_t length,
15110 uint64_t dstoff)
15111 {
15112 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15113 << newo->oid
15114 << " 0x" << std::hex << srcoff << "~" << length << " -> "
15115 << " 0x" << dstoff << "~" << length << std::dec << dendl;
15116 oldo->extent_map.fault_range(db, srcoff, length);
15117 newo->extent_map.fault_range(db, dstoff, length);
15118 _dump_onode<30>(cct, *oldo);
15119 _dump_onode<30>(cct, *newo);
15120
15121 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
15122 _dump_onode<30>(cct, *oldo);
15123 _dump_onode<30>(cct, *newo);
15124 return 0;
15125 }
15126
15127 int BlueStore::_clone_range(TransContext *txc,
15128 CollectionRef& c,
15129 OnodeRef& oldo,
15130 OnodeRef& newo,
15131 uint64_t srcoff, uint64_t length, uint64_t dstoff)
15132 {
15133 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15134 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15135 << " to offset 0x" << dstoff << std::dec << dendl;
15136 int r = 0;
15137
15138 if (srcoff + length >= OBJECT_MAX_SIZE ||
15139 dstoff + length >= OBJECT_MAX_SIZE) {
15140 r = -E2BIG;
15141 goto out;
15142 }
15143 if (srcoff + length > oldo->onode.size) {
15144 r = -EINVAL;
15145 goto out;
15146 }
15147
15148 _assign_nid(txc, newo);
15149
15150 if (length > 0) {
15151 if (cct->_conf->bluestore_clone_cow) {
15152 _do_zero(txc, c, newo, dstoff, length);
15153 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
15154 } else {
15155 bufferlist bl;
15156 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
15157 if (r < 0)
15158 goto out;
15159 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
15160 if (r < 0)
15161 goto out;
15162 }
15163 }
15164
15165 txc->write_onode(newo);
15166 r = 0;
15167
15168 out:
15169 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15170 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15171 << " to offset 0x" << dstoff << std::dec
15172 << " = " << r << dendl;
15173 return r;
15174 }
15175
15176 int BlueStore::_rename(TransContext *txc,
15177 CollectionRef& c,
15178 OnodeRef& oldo,
15179 OnodeRef& newo,
15180 const ghobject_t& new_oid)
15181 {
15182 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15183 << new_oid << dendl;
15184 int r;
15185 ghobject_t old_oid = oldo->oid;
15186 mempool::bluestore_cache_meta::string new_okey;
15187
15188 if (newo) {
15189 if (newo->exists) {
15190 r = -EEXIST;
15191 goto out;
15192 }
15193 ceph_assert(txc->onodes.count(newo) == 0);
15194 }
15195
15196 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
15197
15198 // rewrite shards
15199 {
15200 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
15201 get_object_key(cct, new_oid, &new_okey);
15202 string key;
15203 for (auto &s : oldo->extent_map.shards) {
15204 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
15205 [&](const string& final_key) {
15206 txc->t->rmkey(PREFIX_OBJ, final_key);
15207 }
15208 );
15209 s.dirty = true;
15210 }
15211 }
15212
15213 newo = oldo;
15214 txc->write_onode(newo);
15215
15216 // this adjusts oldo->{oid,key}, and resets oldo to a fresh, empty
15217 // Onode in the old slot
15218 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
15219 r = 0;
15220
15221 // hold a ref to new Onode in old name position, to ensure we don't drop
15222 // it from the cache before this txc commits (or else someone may come along
15223 // and read newo's metadata via the old name).
15224 txc->note_modified_object(oldo);
15225
15226 out:
15227 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
15228 << new_oid << " = " << r << dendl;
15229 return r;
15230 }
15231
15232 // collections
15233
15234 int BlueStore::_create_collection(
15235 TransContext *txc,
15236 const coll_t &cid,
15237 unsigned bits,
15238 CollectionRef *c)
15239 {
15240 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
15241 int r;
15242 bufferlist bl;
15243
15244 {
15245 std::unique_lock l(coll_lock);
15246 if (*c) {
15247 r = -EEXIST;
15248 goto out;
15249 }
15250 auto p = new_coll_map.find(cid);
15251 ceph_assert(p != new_coll_map.end());
15252 *c = p->second;
15253 (*c)->cnode.bits = bits;
15254 coll_map[cid] = *c;
15255 new_coll_map.erase(p);
15256 }
15257 encode((*c)->cnode, bl);
15258 txc->t->set(PREFIX_COLL, stringify(cid), bl);
15259 r = 0;
15260
15261 out:
15262 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
15263 return r;
15264 }
15265
15266 int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
15267 CollectionRef *c)
15268 {
15269 dout(15) << __func__ << " " << cid << dendl;
15270 int r;
15271
15272 (*c)->flush_all_but_last();
15273 {
15274 std::unique_lock l(coll_lock);
15275 if (!*c) {
15276 r = -ENOENT;
15277 goto out;
15278 }
15279 size_t nonexistent_count = 0;
15280 ceph_assert((*c)->exists);
15281 if ((*c)->onode_map.map_any([&](Onode* o) {
15282 if (o->exists) {
15283 dout(1) << __func__ << " " << o->oid << " " << o
15284 << " exists in onode_map" << dendl;
15285 return true;
15286 }
15287 ++nonexistent_count;
15288 return false;
15289 })) {
15290 r = -ENOTEMPTY;
15291 goto out;
15292 }
15293
15294 vector<ghobject_t> ls;
15295 ghobject_t next;
15296 // Enumerate up to nonexistent_count + 1 onodes in the db,
15297 // then check that all of them are marked as non-existent.
15298 // Bypass the check if (next != ghobject_t::get_max()).
15299 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
15300 nonexistent_count + 1, false, &ls, &next);
15301 if (r >= 0) {
15302 // If true, the collection has more objects than nonexistent_count,
15303 // so bypass the check.
15304 bool exists = (!next.is_max());
15305 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
15306 dout(10) << __func__ << " oid " << *it << dendl;
15307 auto onode = (*c)->onode_map.lookup(*it);
15308 exists = !onode || onode->exists;
15309 if (exists) {
15310 dout(1) << __func__ << " " << *it
15311 << " exists in db, "
15312 << (!onode ? "not present in ram" : "present in ram")
15313 << dendl;
15314 }
15315 }
15316 if (!exists) {
15317 _do_remove_collection(txc, c);
15318 r = 0;
15319 } else {
15320 dout(10) << __func__ << " " << cid
15321 << " is non-empty" << dendl;
15322 r = -ENOTEMPTY;
15323 }
15324 }
15325 }
15326
15327 out:
15328 dout(10) << __func__ << " " << cid << " = " << r << dendl;
15329 return r;
15330 }
15331
15332 void BlueStore::_do_remove_collection(TransContext *txc,
15333 CollectionRef *c)
15334 {
15335 coll_map.erase((*c)->cid);
15336 txc->removed_collections.push_back(*c);
15337 (*c)->exists = false;
15338 _osr_register_zombie((*c)->osr.get());
15339 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
15340 c->reset();
15341 }
15342
15343 int BlueStore::_split_collection(TransContext *txc,
15344 CollectionRef& c,
15345 CollectionRef& d,
15346 unsigned bits, int rem)
15347 {
15348 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
15349 << " bits " << bits << dendl;
15350 std::unique_lock l(c->lock);
15351 std::unique_lock l2(d->lock);
15352 int r;
15353
15354 // flush all previous deferred writes on this sequencer. this is a bit
15355 // heavyweight, but we need to make sure all deferred writes complete
15356 // before we split, as the new collection's sequencer may need to order
15357 // this after those writes, and we don't bother with the complexity of
15358 // moving those TransContexts over to the new osr.
15359 _osr_drain_preceding(txc);
15360
15361 // move any cached items (onodes and referenced shared blobs) that will
15362 // belong to the child collection post-split. leave everything else behind.
15363 // this may include things that don't strictly belong to the now-smaller
15364 // parent split, but the OSD will always send us a split for every new
15365 // child.
15366
15367 spg_t pgid, dest_pgid;
15368 bool is_pg = c->cid.is_pg(&pgid);
15369 ceph_assert(is_pg);
15370 is_pg = d->cid.is_pg(&dest_pgid);
15371 ceph_assert(is_pg);
15372
15373 // the destination should initially be empty.
15374 ceph_assert(d->onode_map.empty());
15375 ceph_assert(d->shared_blob_set.empty());
15376 ceph_assert(d->cnode.bits == bits);
15377
15378 c->split_cache(d.get());
15379
15380 // adjust bits. note that this will be redundant for all but the first
15381 // split call for this parent (first child).
15382 c->cnode.bits = bits;
15383 ceph_assert(d->cnode.bits == bits);
15384 r = 0;
15385
15386 bufferlist bl;
15387 encode(c->cnode, bl);
15388 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
15389
15390 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
15391 << " bits " << bits << " = " << r << dendl;
15392 return r;
15393 }
15394
15395 int BlueStore::_merge_collection(
15396 TransContext *txc,
15397 CollectionRef *c,
15398 CollectionRef& d,
15399 unsigned bits)
15400 {
15401 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
15402 << " bits " << bits << dendl;
15403 std::unique_lock l((*c)->lock);
15404 std::unique_lock l2(d->lock);
15405 int r;
15406
15407 coll_t cid = (*c)->cid;
15408
15409 // flush all previous deferred writes on the source collection to ensure
15410 // that all deferred writes complete before we merge, as the target
15411 // collection's sequencer may need to order new ops after those writes.
15412
15413 _osr_drain((*c)->osr.get());
15414
15415 // move the cached items (onodes and referenced shared blobs) from the
15416 // source collection into the target collection post-merge. the source
15417 // collection is emptied and removed below, so everything it cached now
15418 // belongs to the target; split_cache() handles the actual migration
15419 // once the target's bits are set.
15420
15421 spg_t pgid, dest_pgid;
15422 bool is_pg = cid.is_pg(&pgid);
15423 ceph_assert(is_pg);
15424 is_pg = d->cid.is_pg(&dest_pgid);
15425 ceph_assert(is_pg);
15426
15427 // adjust bits. note that this will be redundant for all but the first
15428 // merge call for the parent/target.
15429 d->cnode.bits = bits;
15430
15431 // behavior depends on the target's (d) bits, so do this after they are updated.
15432 (*c)->split_cache(d.get());
15433
15434 // remove source collection
15435 {
15436 std::unique_lock l3(coll_lock);
15437 _do_remove_collection(txc, c);
15438 }
15439
15440 r = 0;
15441
15442 bufferlist bl;
15443 encode(d->cnode, bl);
15444 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
15445
15446 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
15447 << " bits " << bits << " = " << r << dendl;
15448 return r;
15449 }
15450
15451 void BlueStore::log_latency(
15452 const char* name,
15453 int idx,
15454 const ceph::timespan& l,
15455 double lat_threshold,
15456 const char* info) const
15457 {
15458 logger->tinc(idx, l);
15459 if (lat_threshold > 0.0 &&
15460 l >= make_timespan(lat_threshold)) {
15461 dout(0) << __func__ << " slow operation observed for " << name
15462 << ", latency = " << l
15463 << info
15464 << dendl;
15465 }
15466 }
15467
15468 void BlueStore::log_latency_fn(
15469 const char* name,
15470 int idx,
15471 const ceph::timespan& l,
15472 double lat_threshold,
15473 std::function<string (const ceph::timespan& lat)> fn) const
15474 {
15475 logger->tinc(idx, l);
15476 if (lat_threshold > 0.0 &&
15477 l >= make_timespan(lat_threshold)) {
15478 dout(0) << __func__ << " slow operation observed for " << name
15479 << ", latency = " << l
15480 << fn(l)
15481 << dendl;
15482 }
15483 }
15484
15485 #if defined(WITH_LTTNG)
15486 void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
15487 KeyValueDB &db,
15488 TransContext &txc,
15489 mono_clock::time_point start_throttle_acquire)
15490 {
15491 pending_kv_ios += txc.ios;
15492 if (txc.deferred_txn) {
15493 pending_deferred_ios += txc.ios;
15494 }
15495
15496 uint64_t started = 0;
15497 uint64_t completed = 0;
15498 if (should_trace(&started, &completed)) {
15499 txc.tracing = true;
15500 uint64_t rocksdb_base_level,
15501 rocksdb_estimate_pending_compaction_bytes,
15502 rocksdb_cur_size_all_mem_tables,
15503 rocksdb_compaction_pending,
15504 rocksdb_mem_table_flush_pending,
15505 rocksdb_num_running_compactions,
15506 rocksdb_num_running_flushes,
15507 rocksdb_actual_delayed_write_rate;
15508 db.get_property(
15509 "rocksdb.base-level",
15510 &rocksdb_base_level);
15511 db.get_property(
15512 "rocksdb.estimate-pending-compaction-bytes",
15513 &rocksdb_estimate_pending_compaction_bytes);
15514 db.get_property(
15515 "rocksdb.cur-size-all-mem-tables",
15516 &rocksdb_cur_size_all_mem_tables);
15517 db.get_property(
15518 "rocksdb.compaction-pending",
15519 &rocksdb_compaction_pending);
15520 db.get_property(
15521 "rocksdb.mem-table-flush-pending",
15522 &rocksdb_mem_table_flush_pending);
15523 db.get_property(
15524 "rocksdb.num-running-compactions",
15525 &rocksdb_num_running_compactions);
15526 db.get_property(
15527 "rocksdb.num-running-flushes",
15528 &rocksdb_num_running_flushes);
15529 db.get_property(
15530 "rocksdb.actual-delayed-write-rate",
15531 &rocksdb_actual_delayed_write_rate);
15532
15533
15534 tracepoint(
15535 bluestore,
15536 transaction_initial_state,
15537 txc.osr->get_sequencer_id(),
15538 txc.seq,
15539 throttle_bytes.get_current(),
15540 throttle_deferred_bytes.get_current(),
15541 pending_kv_ios,
15542 pending_deferred_ios,
15543 started,
15544 completed,
15545 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
15546
15547 tracepoint(
15548 bluestore,
15549 transaction_initial_state_rocksdb,
15550 txc.osr->get_sequencer_id(),
15551 txc.seq,
15552 rocksdb_base_level,
15553 rocksdb_estimate_pending_compaction_bytes,
15554 rocksdb_cur_size_all_mem_tables,
15555 rocksdb_compaction_pending,
15556 rocksdb_mem_table_flush_pending,
15557 rocksdb_num_running_compactions,
15558 rocksdb_num_running_flushes,
15559 rocksdb_actual_delayed_write_rate);
15560 }
15561 }
15562 #endif
15563
15564 mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
15565 TransContext &txc, PerfCounters *logger, int state)
15566 {
15567 mono_clock::time_point now = mono_clock::now();
15568 mono_clock::duration lat = now - txc.last_stamp;
15569 logger->tinc(state, lat);
15570 #if defined(WITH_LTTNG)
15571 if (txc.tracing &&
15572 state >= l_bluestore_state_prepare_lat &&
15573 state <= l_bluestore_state_done_lat) {
15574 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
15575 tracepoint(
15576 bluestore,
15577 transaction_state_duration,
15578 txc.osr->get_sequencer_id(),
15579 txc.seq,
15580 state,
15581 ceph::to_seconds<double>(lat));
15582 }
15583 #endif
15584 txc.last_stamp = now;
15585 return lat;
15586 }
15587
15588 bool BlueStore::BlueStoreThrottle::try_start_transaction(
15589 KeyValueDB &db,
15590 TransContext &txc,
15591 mono_clock::time_point start_throttle_acquire)
15592 {
15593 throttle_bytes.get(txc.cost);
15594
15595 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
15596 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15597 return true;
15598 } else {
15599 return false;
15600 }
15601 }
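
// A toy model (hypothetical type, not the real ceph Throttle) of the two
// acquisition styles used above: get() charges the cost unconditionally here
// (the real throttle may block until the cost fits), while get_or_fail()
// charges it only when the budget is not exceeded, which is why a deferred
// transaction may have to wait and call finish_start_transaction() later.
struct ExampleBudget {
  uint64_t used = 0;
  uint64_t max = 0;                    // 0 means unlimited
  void get(uint64_t cost) {
    used += cost;                      // unconditional charge
  }
  bool get_or_fail(uint64_t cost) {
    if (max && used + cost > max) {
      return false;                    // would exceed the budget; charge nothing
    }
    used += cost;
    return true;
  }
};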
15602
15603 void BlueStore::BlueStoreThrottle::finish_start_transaction(
15604 KeyValueDB &db,
15605 TransContext &txc,
15606 mono_clock::time_point start_throttle_acquire)
15607 {
15608 ceph_assert(txc.deferred_txn);
15609 throttle_deferred_bytes.get(txc.cost);
15610 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15611 }
15612
15613 #if defined(WITH_LTTNG)
15614 void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
15615 {
15616 pending_kv_ios -= 1;
15617 ios_completed_since_last_traced++;
15618 if (txc.tracing) {
15619 tracepoint(
15620 bluestore,
15621 transaction_commit_latency,
15622 txc.osr->get_sequencer_id(),
15623 txc.seq,
15624 ceph::to_seconds<double>(mono_clock::now() - txc.start));
15625 }
15626 }
15627 #endif
15628
15629 #if defined(WITH_LTTNG)
15630 void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
15631 {
15632 if (txc.deferred_txn) {
15633 pending_deferred_ios -= 1;
15634 }
15635 if (txc.tracing) {
15636 mono_clock::time_point now = mono_clock::now();
15637 mono_clock::duration lat = now - txc.start;
15638 tracepoint(
15639 bluestore,
15640 transaction_total_duration,
15641 txc.osr->get_sequencer_id(),
15642 txc.seq,
15643 ceph::to_seconds<double>(lat));
15644 }
15645 }
15646 #endif
15647
15648 // DB key value Histogram
15649 #define KEY_SLAB 32
15650 #define VALUE_SLAB 64
15651
15652 const string prefix_onode = "o";
15653 const string prefix_onode_shard = "x";
15654 const string prefix_other = "Z";
15655
15656 int BlueStore::DBHistogram::get_key_slab(size_t sz)
15657 {
15658 return (sz/KEY_SLAB);
15659 }
15660
15661 string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
15662 {
15663 int lower_bound = slab * KEY_SLAB;
15664 int upper_bound = (slab + 1) * KEY_SLAB;
15665 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15666 return ret;
15667 }
15668
15669 int BlueStore::DBHistogram::get_value_slab(size_t sz)
15670 {
15671 return (sz/VALUE_SLAB);
15672 }
15673
15674 string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
15675 {
15676 int lower_bound = slab * VALUE_SLAB;
15677 int upper_bound = (slab + 1) * VALUE_SLAB;
15678 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15679 return ret;
15680 }
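
// A small worked example of the slab bucketing above: with KEY_SLAB = 32,
// a 70-byte key falls into slab 70 / 32 = 2, which is reported as the
// half-open range "[64,96)". Values are illustrative only.
static void example_slab_bucketing()
{
  const size_t key_size = 70;
  const int slab = key_size / KEY_SLAB;      // 70 / 32 == 2
  const int lower = slab * KEY_SLAB;         // 64
  const int upper = (slab + 1) * KEY_SLAB;   // 96
  ceph_assert(slab == 2 && lower == 64 && upper == 96);
}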
15681
15682 void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
15683 const string &prefix, size_t key_size, size_t value_size)
15684 {
15685 uint32_t key_slab = get_key_slab(key_size);
15686 uint32_t value_slab = get_value_slab(value_size);
15687 key_hist[prefix][key_slab].count++;
15688 key_hist[prefix][key_slab].max_len =
15689 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
15690 key_hist[prefix][key_slab].val_map[value_slab].count++;
15691 key_hist[prefix][key_slab].val_map[value_slab].max_len =
15692 std::max<size_t>(value_size,
15693 key_hist[prefix][key_slab].val_map[value_slab].max_len);
15694 }
15695
15696 void BlueStore::DBHistogram::dump(Formatter *f)
15697 {
15698 f->open_object_section("rocksdb_value_distribution");
15699 for (auto i : value_hist) {
15700 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
15701 }
15702 f->close_section();
15703
15704 f->open_object_section("rocksdb_key_value_histogram");
15705 for (auto i : key_hist) {
15706 f->dump_string("prefix", i.first);
15707 f->open_object_section("key_hist");
15708 for ( auto k : i.second) {
15709 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
15710 f->dump_unsigned("max_len", k.second.max_len);
15711 f->open_object_section("value_hist");
15712 for ( auto j : k.second.val_map) {
15713 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
15714 f->dump_unsigned("max_len", j.second.max_len);
15715 }
15716 f->close_section();
15717 }
15718 f->close_section();
15719 }
15720 f->close_section();
15721 }
15722
15723 // Iterates through the db and collects the stats
15724 void BlueStore::generate_db_histogram(Formatter *f)
15725 {
15726 //globals
15727 uint64_t num_onodes = 0;
15728 uint64_t num_shards = 0;
15729 uint64_t num_super = 0;
15730 uint64_t num_coll = 0;
15731 uint64_t num_omap = 0;
15732 uint64_t num_pgmeta_omap = 0;
15733 uint64_t num_deferred = 0;
15734 uint64_t num_alloc = 0;
15735 uint64_t num_stat = 0;
15736 uint64_t num_others = 0;
15737 uint64_t num_shared_shards = 0;
15738 size_t max_key_size =0, max_value_size = 0;
15739 uint64_t total_key_size = 0, total_value_size = 0;
15740 size_t key_size = 0, value_size = 0;
15741 DBHistogram hist;
15742
15743 auto start = coarse_mono_clock::now();
15744
15745 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
15746 iter->seek_to_first();
15747 while (iter->valid()) {
15748 dout(30) << __func__ << " Key: " << iter->key() << dendl;
15749 key_size = iter->key_size();
15750 value_size = iter->value_size();
15751 hist.value_hist[hist.get_value_slab(value_size)]++;
15752 max_key_size = std::max(max_key_size, key_size);
15753 max_value_size = std::max(max_value_size, value_size);
15754 total_key_size += key_size;
15755 total_value_size += value_size;
15756
15757 pair<string,string> key(iter->raw_key());
15758
15759 if (key.first == PREFIX_SUPER) {
15760 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
15761 num_super++;
15762 } else if (key.first == PREFIX_STAT) {
15763 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
15764 num_stat++;
15765 } else if (key.first == PREFIX_COLL) {
15766 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
15767 num_coll++;
15768 } else if (key.first == PREFIX_OBJ) {
15769 if (key.second.back() == ONODE_KEY_SUFFIX) {
15770 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
15771 num_onodes++;
15772 } else {
15773 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
15774 num_shards++;
15775 }
15776 } else if (key.first == PREFIX_OMAP) {
15777 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
15778 num_omap++;
15779 } else if (key.first == PREFIX_PGMETA_OMAP) {
15780 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
15781 num_pgmeta_omap++;
15782 } else if (key.first == PREFIX_DEFERRED) {
15783 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
15784 num_deferred++;
15785 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
15786 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
15787 num_alloc++;
15788 } else if (key.first == PREFIX_SHARED_BLOB) {
15789 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
15790 num_shared_shards++;
15791 } else {
15792 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
15793 num_others++;
15794 }
15795 iter->next();
15796 }
15797
15798 ceph::timespan duration = coarse_mono_clock::now() - start;
15799 f->open_object_section("rocksdb_key_value_stats");
15800 f->dump_unsigned("num_onodes", num_onodes);
15801 f->dump_unsigned("num_shards", num_shards);
15802 f->dump_unsigned("num_super", num_super);
15803 f->dump_unsigned("num_coll", num_coll);
15804 f->dump_unsigned("num_omap", num_omap);
15805 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
15806 f->dump_unsigned("num_deferred", num_deferred);
15807 f->dump_unsigned("num_alloc", num_alloc);
15808 f->dump_unsigned("num_stat", num_stat);
15809 f->dump_unsigned("num_shared_shards", num_shared_shards);
15810 f->dump_unsigned("num_others", num_others);
15811 f->dump_unsigned("max_key_size", max_key_size);
15812 f->dump_unsigned("max_value_size", max_value_size);
15813 f->dump_unsigned("total_key_size", total_key_size);
15814 f->dump_unsigned("total_value_size", total_value_size);
15815 f->close_section();
15816
15817 hist.dump(f);
15818
15819 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
15820
15821 }
15822
15823 void BlueStore::_shutdown_cache()
15824 {
15825 dout(10) << __func__ << dendl;
15826 for (auto i : buffer_cache_shards) {
15827 i->flush();
15828 ceph_assert(i->empty());
15829 }
15830 for (auto& p : coll_map) {
15831 p.second->onode_map.clear();
15832 if (!p.second->shared_blob_set.empty()) {
15833 derr << __func__ << " stray shared blobs on " << p.first << dendl;
15834 p.second->shared_blob_set.dump<0>(cct);
15835 }
15836 ceph_assert(p.second->onode_map.empty());
15837 ceph_assert(p.second->shared_blob_set.empty());
15838 }
15839 coll_map.clear();
15840 for (auto i : onode_cache_shards) {
15841 ceph_assert(i->empty());
15842 }
15843 }
15844
15845 // For external caller.
15846 // We use a best-effort policy here, i.e.,
15847 // we don't care if there are still some pinned onodes/data in the cache
15848 // after this command has completed.
15849 int BlueStore::flush_cache(ostream *os)
15850 {
15851 dout(10) << __func__ << dendl;
15852 for (auto i : onode_cache_shards) {
15853 i->flush();
15854 }
15855 for (auto i : buffer_cache_shards) {
15856 i->flush();
15857 }
15858
15859 return 0;
15860 }
15861
15862 void BlueStore::_apply_padding(uint64_t head_pad,
15863 uint64_t tail_pad,
15864 bufferlist& padded)
15865 {
15866 if (head_pad) {
15867 padded.prepend_zero(head_pad);
15868 }
15869 if (tail_pad) {
15870 padded.append_zero(tail_pad);
15871 }
15872 if (head_pad || tail_pad) {
15873 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
15874 << " tail 0x" << tail_pad << std::dec << dendl;
15875 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
15876 }
15877 }
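
// A hedged sketch (hypothetical helper, plain arithmetic) of how head/tail
// pads like the ones applied above can be derived when a write has to be
// aligned to a block size: pad the front down to the previous boundary and
// the back up to the next one.
static void example_compute_padding(uint64_t offset, uint64_t length,
                                    uint64_t block_size,
                                    uint64_t* head_pad, uint64_t* tail_pad)
{
  *head_pad = offset % block_size;                              // down to boundary
  uint64_t end = offset + length;
  *tail_pad = (block_size - (end % block_size)) % block_size;   // up to boundary
}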
15878
15879 void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
15880 {
15881 // finalize extent_map shards
15882 o->extent_map.update(txn, false);
15883 if (o->extent_map.needs_reshard()) {
15884 o->extent_map.reshard(db, txn);
15885 o->extent_map.update(txn, true);
15886 if (o->extent_map.needs_reshard()) {
15887 dout(20) << __func__ << " warning: still wants reshard, check options?"
15888 << dendl;
15889 o->extent_map.clear_needs_reshard();
15890 }
15891 logger->inc(l_bluestore_onode_reshard);
15892 }
15893
15894 // bound encode
15895 size_t bound = 0;
15896 denc(o->onode, bound);
15897 o->extent_map.bound_encode_spanning_blobs(bound);
15898 if (o->onode.extent_map_shards.empty()) {
15899 denc(o->extent_map.inline_bl, bound);
15900 }
15901
15902 // encode
15903 bufferlist bl;
15904 unsigned onode_part, blob_part, extent_part;
15905 {
15906 auto p = bl.get_contiguous_appender(bound, true);
15907 denc(o->onode, p);
15908 onode_part = p.get_logical_offset();
15909 o->extent_map.encode_spanning_blobs(p);
15910 blob_part = p.get_logical_offset() - onode_part;
15911 if (o->onode.extent_map_shards.empty()) {
15912 denc(o->extent_map.inline_bl, p);
15913 }
15914 extent_part = p.get_logical_offset() - onode_part - blob_part;
15915 }
15916
15917 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
15918 << " (" << onode_part << " bytes onode + "
15919 << blob_part << " bytes spanning blobs + "
15920 << extent_part << " bytes inline extents)"
15921 << dendl;
15922
15923
15924 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
15925 }
15926
15927 void BlueStore::_log_alerts(osd_alert_list_t& alerts)
15928 {
15929 std::lock_guard l(qlock);
15930
15931 if (!disk_size_mismatch_alert.empty()) {
15932 alerts.emplace(
15933 "BLUESTORE_DISK_SIZE_MISMATCH",
15934 disk_size_mismatch_alert);
15935 }
15936 if (!legacy_statfs_alert.empty()) {
15937 alerts.emplace(
15938 "BLUESTORE_LEGACY_STATFS",
15939 legacy_statfs_alert);
15940 }
15941 if (!spillover_alert.empty() &&
15942 cct->_conf->bluestore_warn_on_bluefs_spillover) {
15943 alerts.emplace(
15944 "BLUEFS_SPILLOVER",
15945 spillover_alert);
15946 }
15947 if (!no_per_pool_omap_alert.empty()) {
15948 alerts.emplace(
15949 "BLUESTORE_NO_PER_POOL_OMAP",
15950 no_per_pool_omap_alert);
15951 }
15952 string s0(failed_cmode);
15953
15954 if (!failed_compressors.empty()) {
15955 if (!s0.empty()) {
15956 s0 += ", ";
15957 }
15958 s0 += "unable to load:";
15959 bool first = true;
15960 for (auto& s : failed_compressors) {
15961 if (first) {
15962 first = false;
15963 } else {
15964 s0 += ", ";
15965 }
15966 s0 += s;
15967 }
15968 alerts.emplace(
15969 "BLUESTORE_NO_COMPRESSION",
15970 s0);
15971 }
15972 }
15973
15974 void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
15975 size_t extents)
15976 {
15977 alloc_stats_count++;
15978 alloc_stats_fragments += extents;
15979 alloc_stats_size += need;
15980 }
15981
15982 void BlueStore::_record_allocation_stats()
15983 {
15984 // we don't care about strict data consistency here;
15985 // fields may be partially modified while the tuple is being built
15986 auto t0 = std::make_tuple(
15987 alloc_stats_count.exchange(0),
15988 alloc_stats_fragments.exchange(0),
15989 alloc_stats_size.exchange(0));
15990
15991 dout(0) << " allocation stats probe "
15992 << probe_count << ":"
15993 << " cnt: " << std::get<0>(t0)
15994 << " frags: " << std::get<1>(t0)
15995 << " size: " << std::get<2>(t0)
15996 << dendl;
15997
15998
15999 //
16000 // Keep the history for probes from the power-of-two sequence:
16001 // -1, -2, -4, -8, -16
16002 //
16003 size_t base = 1;
16004 for (auto& t : alloc_stats_history) {
16005 dout(0) << " probe -"
16006 << base + (probe_count % base) << ": "
16007 << std::get<0>(t)
16008 << ", " << std::get<1>(t)
16009 << ", " << std::get<2>(t)
16010 << dendl;
16011 base <<= 1;
16012 }
16013 dout(0) << "------------" << dendl;
16014
16015 auto prev = probe_count++;
16016 auto mask = (1 << alloc_stats_history.size()) - 1;
16017 probe_count &= mask;
16018
16019 for (size_t i = cbits(prev ^ probe_count) - 1; i > 0 ; --i) {
16020 alloc_stats_history[i] = alloc_stats_history[i - 1];
16021 }
16022 alloc_stats_history[0].swap(t0);
16023 }
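
// A worked example (illustrative values) of the history shift above: with a
// 5-slot history the mask is 0b11111. Going from prev = 7 (0b00111) to
// probe_count = 8 (0b01000) flips the four low bits, so cbits(prev ^ cur)
// is 4 and slots 3..1 inherit their older neighbours before slot 0 takes the
// freshest probe; on most ticks only slot 0 changes.
static void example_history_shift()
{
  unsigned prev = 7, cur = 8;          // 0b00111 -> 0b01000
  unsigned flipped = prev ^ cur;       // 0b01111
  ceph_assert(cbits(flipped) == 4);
}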
16024
16025 // ===========================================
16026 // BlueStoreRepairer
16027
16028 size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
16029 const interval_set<uint64_t>& extents)
16030 {
16031 ceph_assert(granularity); // initialized
16032 // can't call for the second time
16033 ceph_assert(!was_filtered_out);
16034 ceph_assert(collections_bfs.size() == objects_bfs.size());
16035
16036 uint64_t prev_pos = 0;
16037 uint64_t npos = collections_bfs.size();
16038
16039 bloom_vector collections_reduced;
16040 bloom_vector objects_reduced;
16041
16042 for (auto e : extents) {
16043 if (e.second == 0) {
16044 continue;
16045 }
16046 uint64_t pos = max(e.first / granularity, prev_pos);
16047 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
16048 while (pos != npos && pos < end_pos) {
16049 ceph_assert( collections_bfs[pos].element_count() ==
16050 objects_bfs[pos].element_count());
16051 if (collections_bfs[pos].element_count()) {
16052 collections_reduced.push_back(std::move(collections_bfs[pos]));
16053 objects_reduced.push_back(std::move(objects_bfs[pos]));
16054 }
16055 ++pos;
16056 }
16057 prev_pos = end_pos;
16058 }
16059 collections_reduced.swap(collections_bfs);
16060 objects_reduced.swap(objects_bfs);
16061 was_filtered_out = true;
16062 return collections_bfs.size();
16063 }
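
// A small worked example of the bucket arithmetic above: with a granularity
// of 0x10000 (64 KiB), an extent at offset 0x18000 of length 0x10000 spans
// buckets [1, 3): pos = 0x18000 / 0x10000 = 1 and
// end_pos = 1 + (0x18000 + 0x10000 - 1) / 0x10000 = 3. Values illustrative.
static void example_extent_buckets()
{
  const uint64_t granularity = 0x10000;
  const uint64_t off = 0x18000, len = 0x10000;
  uint64_t pos = off / granularity;                      // 1
  uint64_t end_pos = 1 + (off + len - 1) / granularity;  // 3
  ceph_assert(pos == 1 && end_pos == 3);
}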
16064
16065 bool BlueStoreRepairer::remove_key(KeyValueDB *db,
16066 const string& prefix,
16067 const string& key)
16068 {
16069 if (!remove_key_txn) {
16070 remove_key_txn = db->get_transaction();
16071 }
16072 ++to_repair_cnt;
16073 remove_key_txn->rmkey(prefix, key);
16074
16075 return true;
16076 }
16077
16078 void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db)
16079 {
16080 fix_per_pool_omap_txn = db->get_transaction();
16081 ++to_repair_cnt;
16082 bufferlist bl;
16083 bl.append("1");
16084 fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
16085 }
16086
16087 bool BlueStoreRepairer::fix_shared_blob(
16088 KeyValueDB *db,
16089 uint64_t sbid,
16090 const bufferlist* bl)
16091 {
16092 KeyValueDB::Transaction txn;
16093 if (fix_misreferences_txn) { // reuse this txn
16094 txn = fix_misreferences_txn;
16095 } else {
16096 if (!fix_shared_blob_txn) {
16097 fix_shared_blob_txn = db->get_transaction();
16098 }
16099 txn = fix_shared_blob_txn;
16100 }
16101 string key;
16102 get_shared_blob_key(sbid, &key);
16103
16104 ++to_repair_cnt;
16105 if (bl) {
16106 txn->set(PREFIX_SHARED_BLOB, key, *bl);
16107 } else {
16108 txn->rmkey(PREFIX_SHARED_BLOB, key);
16109 }
16110 return true;
16111 }
16112
16113 bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
16114 const string& key,
16115 const store_statfs_t& new_statfs)
16116 {
16117 if (!fix_statfs_txn) {
16118 fix_statfs_txn = db->get_transaction();
16119 }
16120 BlueStore::volatile_statfs vstatfs;
16121 vstatfs = new_statfs;
16122 bufferlist bl;
16123 vstatfs.encode(bl);
16124 ++to_repair_cnt;
16125 fix_statfs_txn->set(PREFIX_STAT, key, bl);
16126 return true;
16127 }
16128
16129 bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
16130 FreelistManager* fm,
16131 uint64_t offset, uint64_t len)
16132 {
16133 if (!fix_fm_leaked_txn) {
16134 fix_fm_leaked_txn = db->get_transaction();
16135 }
16136 ++to_repair_cnt;
16137 fm->release(offset, len, fix_fm_leaked_txn);
16138 return true;
16139 }
16140 bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
16141 FreelistManager* fm,
16142 uint64_t offset, uint64_t len)
16143 {
16144 if (!fix_fm_false_free_txn) {
16145 fix_fm_false_free_txn = db->get_transaction();
16146 }
16147 ++to_repair_cnt;
16148 fm->allocate(offset, len, fix_fm_false_free_txn);
16149 return true;
16150 }
16151
16152 bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
16153 {
16154 // this is just a stub to count num of repairs properly,
16155 // actual repair happens in BlueStore::_close_db_and_around()
16156 // while doing _sync_bluefs_and_fm
16157 ++out_of_sync_flag;
16158 ++to_repair_cnt;
16159 return true;
16160 }
16161
16162 KeyValueDB::Transaction BlueStoreRepairer::fix_spanning_blobs(KeyValueDB* db)
16163 {
16164 if (!fix_onode_txn) {
16165 fix_onode_txn = db->get_transaction();
16166 }
16167 ++to_repair_cnt;
16168 return fix_onode_txn;
16169 }
16170
16171 bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
16172 {
16173 if (misreferenced_extents.size()) {
16174 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
16175 ceph_assert(n > 0);
16176 if (!fix_misreferences_txn) {
16177 fix_misreferences_txn = db->get_transaction();
16178 }
16179 return true;
16180 }
16181 return false;
16182 }
16183
16184 unsigned BlueStoreRepairer::apply(KeyValueDB* db)
16185 {
16186 if (fix_per_pool_omap_txn) {
16187 db->submit_transaction_sync(fix_per_pool_omap_txn);
16188 fix_per_pool_omap_txn = nullptr;
16189 }
16190 if (fix_fm_leaked_txn) {
16191 db->submit_transaction_sync(fix_fm_leaked_txn);
16192 fix_fm_leaked_txn = nullptr;
16193 }
16194 if (fix_fm_false_free_txn) {
16195 db->submit_transaction_sync(fix_fm_false_free_txn);
16196 fix_fm_false_free_txn = nullptr;
16197 }
16198 if (remove_key_txn) {
16199 db->submit_transaction_sync(remove_key_txn);
16200 remove_key_txn = nullptr;
16201 }
16202 if (fix_misreferences_txn) {
16203 db->submit_transaction_sync(fix_misreferences_txn);
16204 fix_misreferences_txn = nullptr;
16205 }
16206 if (fix_onode_txn) {
16207 db->submit_transaction_sync(fix_onode_txn);
16208 fix_onode_txn = nullptr;
16209 }
16210 if (fix_shared_blob_txn) {
16211 db->submit_transaction_sync(fix_shared_blob_txn);
16212 fix_shared_blob_txn = nullptr;
16213 }
16214
16215 if (fix_statfs_txn) {
16216 db->submit_transaction_sync(fix_statfs_txn);
16217 fix_statfs_txn = nullptr;
16218 }
16219 unsigned repaired = to_repair_cnt;
16220 to_repair_cnt = 0;
16221 return repaired;
16222 }
16223
16224 // =======================================================
16225 // RocksDBBlueFSVolumeSelector
16226
16227 uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
16228 ceph_assert(h != nullptr);
16229 uint64_t hint = reinterpret_cast<uint64_t>(h);
16230 uint8_t res;
16231 switch (hint) {
16232 case LEVEL_SLOW:
16233 res = BlueFS::BDEV_SLOW;
16234 if (db_avail4slow > 0) {
16235 // considering statically available db space vs.
16236 // - observed maximums on DB dev for DB/WAL/UNSORTED data
16237 // - observed maximum spillovers
16238 uint64_t max_db_use = 0; // max db usage we potentially observed
16239 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
16240 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
16241 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
16242 // this could go to db hence using it in the estimation
16243 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
16244
16245 auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
16246 uint64_t avail = min(
16247 db_avail4slow,
16248 max_db_use < db_total ? db_total - max_db_use : 0);
16249
16250 // considering current DB dev usage for SLOW data
16251 if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
16252 res = BlueFS::BDEV_DB;
16253 }
16254 }
16255 break;
16256 case LEVEL_LOG:
16257 case LEVEL_WAL:
16258 res = BlueFS::BDEV_WAL;
16259 break;
16260 case LEVEL_DB:
16261 default:
16262 res = BlueFS::BDEV_DB;
16263 break;
16264 }
16265 return res;
16266 }
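
// A worked example (numbers are illustrative only) of the headroom estimate
// above: if the DB volume is 60 GiB and the observed maxima are 1 GiB (LOG),
// 4 GiB (WAL) and 30 GiB (DB), plus 5 GiB of DB-level data that spilled to
// the slow device, then max_db_use = 40 GiB and at most
// min(db_avail4slow, 60 GiB - 40 GiB) of SLOW data may go to the DB device.
static void example_slow_on_db_headroom()
{
  const uint64_t gib = uint64_t(1) << 30;
  uint64_t db_total = 60 * gib;
  uint64_t max_db_use = (1 + 4 + 30 + 5) * gib;   // 40 GiB observed
  uint64_t db_avail4slow = 30 * gib;              // statically reserved share
  uint64_t avail = std::min(db_avail4slow,
                            max_db_use < db_total ? db_total - max_db_use : 0);
  ceph_assert(avail == 20 * gib);
}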
16267
16268 void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
16269 {
16270 res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
16271 res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
16272 }
16273
16274 void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
16275 uint8_t res = LEVEL_DB;
16276 if (dirname.length() > 5) {
16277 // the "db.slow" and "db.wal" directory names are hard-coded at
16278 // match up with bluestore. the slow device is always the second
16279 // one (when a dedicated block.db device is present and used at
16280 // bdev 0). the wal device is always last.
16281 if (boost::algorithm::ends_with(dirname, ".slow")) {
16282 res = LEVEL_SLOW;
16283 }
16284 else if (boost::algorithm::ends_with(dirname, ".wal")) {
16285 res = LEVEL_WAL;
16286 }
16287 }
16288 return reinterpret_cast<void*>(res);
16289 }
16290
16291 void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
16292 auto max_x = per_level_per_dev_usage.get_max_x();
16293 auto max_y = per_level_per_dev_usage.get_max_y();
16294 sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
16295 << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
16296 << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
16297 << ", db_avail:" << db_avail4slow << std::endl
16298 << "Usage matrix:" << std::endl;
16299 constexpr std::array<const char*, 8> names{ {
16300 "DEV/LEV",
16301 "WAL",
16302 "DB",
16303 "SLOW",
16304 "*",
16305 "*",
16306 "REAL",
16307 "FILES",
16308 } };
16309 const size_t width = 12;
16310 for (size_t i = 0; i < names.size(); ++i) {
16311 sout.setf(std::ios::left, std::ios::adjustfield);
16312 sout.width(width);
16313 sout << names[i];
16314 }
16315 sout << std::endl;
16316 for (size_t l = 0; l < max_y; l++) {
16317 sout.setf(std::ios::left, std::ios::adjustfield);
16318 sout.width(width);
16319 switch (l + LEVEL_FIRST) {
16320 case LEVEL_LOG:
16321 sout << "LOG"; break;
16322 case LEVEL_WAL:
16323 sout << "WAL"; break;
16324 case LEVEL_DB:
16325 sout << "DB"; break;
16326 case LEVEL_SLOW:
16327 sout << "SLOW"; break;
16328 case LEVEL_MAX:
16329 sout << "TOTALS"; break;
16330 }
16331 for (size_t d = 0; d < max_x; d++) {
16332 sout.setf(std::ios::left, std::ios::adjustfield);
16333 sout.width(width);
16334 sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
16335 }
16336 sout.setf(std::ios::left, std::ios::adjustfield);
16337 sout.width(width);
16338 sout << stringify(per_level_files[l]) << std::endl;
16339 }
16340 ceph_assert(max_x == per_level_per_dev_max.get_max_x());
16341 ceph_assert(max_y == per_level_per_dev_max.get_max_y());
16342 sout << "MAXIMUMS:" << std::endl;
16343 for (size_t l = 0; l < max_y; l++) {
16344 sout.setf(std::ios::left, std::ios::adjustfield);
16345 sout.width(width);
16346 switch (l + LEVEL_FIRST) {
16347 case LEVEL_LOG:
16348 sout << "LOG"; break;
16349 case LEVEL_WAL:
16350 sout << "WAL"; break;
16351 case LEVEL_DB:
16352 sout << "DB"; break;
16353 case LEVEL_SLOW:
16354 sout << "SLOW"; break;
16355 case LEVEL_MAX:
16356 sout << "TOTALS"; break;
16357 }
16358 for (size_t d = 0; d < max_x - 1; d++) {
16359 sout.setf(std::ios::left, std::ios::adjustfield);
16360 sout.width(width);
16361 sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
16362 }
16363 sout.setf(std::ios::left, std::ios::adjustfield);
16364 sout.width(width);
16365 sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
16366 if (l < max_y - 1) {
16367 sout << std::endl;
16368 }
16369 }
16370 }
16371
16372 // =======================================================