// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_cache_other);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);


// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value(for meta coll)
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING 0x8    // has spanning blob id
#define BLOBID_SHIFT_BITS 4

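// Worked example (illustrative only; the value is hypothetical): an encoded
// blobid of 0x35 has low bits 0x5 = CONTIGUOUS|SAMELENGTH and carries
// blob id 0x35 >> BLOBID_SHIFT_BITS == 3.  See ExtentMap::{encode,decode}_some()
// for the authoritative encoding.
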
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

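// Illustrative sketch (assumed values, not a format guarantee): an object
// "foo" in pool 3 with an empty namespace and no locator key encodes roughly
// as
//   <shard byte> <u64: 3 + 2^63> <u32: reversed hash> "!" "foo!" "=" <snap> <gen> 'o'
// where '!' is the string terminator appended by append_escaped() below.
// get_object_key()/get_key_object() are the authoritative encoders.
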
/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

// FIXME minor: make a BlueStore method once we have an efficient way to
// map idx to counter nickname
#define LOG_LATENCY_I(logger, cct, idx, v, info) { \
  ceph::timespan lat = v; \
  logger->tinc(idx, lat); \
  if (cct->_conf->bluestore_log_op_age > 0.0 && \
      lat >= make_timespan(cct->_conf->bluestore_log_op_age)) { \
    dout(0) << __func__ << " slow operation observed for " #idx \
            << ", latency = " << lat \
            << info \
            << dendl; \
  } \
}

#define LOG_LATENCY_FN(logger, cct, idx, v, fn) { \
  ceph::timespan lat = v; \
  logger->tinc(idx, lat); \
  if (cct->_conf->bluestore_log_op_age > 0.0 && \
      lat >= make_timespan(cct->_conf->bluestore_log_op_age)) { \
    dout(0) << __func__ << " slow operation observed for " #idx \
            << ", latency = " << lat \
            << fn(lat) \
            << dendl; \
  } \
}

#define LOG_LATENCY(logger, cct, idx, v) LOG_LATENCY_I(logger, cct, idx, v, "")

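// Hypothetical call site (the counter index name and the lat0 variable are
// illustrative, not taken from this file):
//   LOG_LATENCY(logger, cct, l_bluestore_commit_lat, lat0);
// records lat0 in the perf counter and, if it exceeds bluestore_log_op_age,
// emits a "slow operation observed" log line.
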
/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 */
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') {
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

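// Escaping example (illustrative): append_escaped("a b~c", &s) yields
// "a#20b~7ec!" -- the space (0x20 <= '#') and '~' (0x7e) are hex-escaped,
// and the trailing '!' terminates the string so shorter keys sort before
// longer ones that share a prefix.
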
inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}

// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i=0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
        (mode == HEX && in.length() - i >= 4 &&
         ((in[i] < 32 || (unsigned char)in[i] > 126) ||
          (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
          (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
          (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
        out.append(in.c_str() + from, i - from);
        out.push_back('\'');
      }
      if (mode != HEX) {
        out.append("0x");
        mode = HEX;
      }
      if (in.length() - i >= 4) {
        // print a whole u32 at once
        snprintf(buf, sizeof(buf), "%08x",
                 (uint32_t)(((unsigned char)in[i] << 24) |
                            ((unsigned char)in[i+1] << 16) |
                            ((unsigned char)in[i+2] << 8) |
                            ((unsigned char)in[i+3] << 0)));
        i += 3;
      } else {
        snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
        out.push_back('\'');
        mode = STRING;
        from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}

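// Example (illustrative): pretty_binary_string(string("\x01\x02\x03\x04abc", 7))
// returns "0x01020304'abc'" -- non-printable runs are hex-dumped (a whole u32
// at a time when possible) and printable runs are quoted.
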
template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_key_range(const coll_t& cid, int bits,
                               string *temp_start, string *temp_end,
                               string *start, string *end)
{
  temp_start->clear();
  temp_end->clear();
  start->clear();
  end->clear();

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    _key_encode_shard(pgid.shard, start);
    *temp_start = *start;

    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    _key_encode_u32(reverse_hash, start);
    _key_encode_u32(reverse_hash, temp_start);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    _key_encode_u32(end_hash, end);
    _key_encode_u32(end_hash, temp_end);
  } else {
    _key_encode_shard(shard_id_t::NO_SHARD, start);
    _key_encode_u64(-1ull + 0x8000000000000000ull, start);
    *end = *start;
    _key_encode_u32(0, start);
    _key_encode_u32(0xffffffff, end);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }
}

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < 1 + 8 + 4)
    return -1;
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something is wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = 1 + 8 + 4 +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}


// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

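// Layout note (illustrative): for an onode key K and a shard starting at
// logical offset 0x10000, the extent shard key is
//   K + encode_u32(0x10000) + 'x'
// so K is a strict prefix of every shard key and all shards of an onode sort
// immediately after the onode key itself.
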
static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

// '-' < '.' < '~'
static void get_omap_header(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('-');
}

// hmm, I don't think there's any need to escape the user key since we
// have a clean prefix.
static void get_omap_key(uint64_t id, const string& key, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('.');
  out->append(key);
}

static void rewrite_omap_key(uint64_t id, string old, string *out)
{
  _key_encode_u64(id, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

static void decode_omap_key(const string& key, string *user_key)
{
  *user_key = key.substr(sizeof(uint64_t) + 1);
}

static void get_omap_tail(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('~');
}

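// Ordering sketch (illustrative): because '-' < '.' < '~' in ASCII, for a
// given omap id the header key (id + '-') sorts before every user key
// (id + '.' + name), and the tail key (id + '~') sorts after all of them,
// so a single iterator range [header, tail] covers one object's omap.
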
static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << " shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << " " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << " attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// merge operators

struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    ceph_assert(llen == rlen);
    ceph_assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const __le64* lv = (const __le64*)ldata;
    const __le64* rv = (const __le64*)rdata;
    __le64* nv = &(__le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  const char *name() const override {
    return "int64_array";
  }
};

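// Behavioural sketch (illustrative values): merging the encoded arrays
// {1, 2} and {10, 20} under this operator yields {11, 22} -- each le64 slot
// is summed element-wise, which lets counters stored as int64 arrays (e.g.
// the statfs values under PREFIX_STAT above) accumulate via merge without a
// read-modify-write cycle.
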

// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.emplace_back(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{

  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t >();
  blob_info_counted = nullptr;

  gc_start_offset = start_offset;
  gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}


// Cache

BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
                                           PerfCounters *logger)
{
  Cache *c = nullptr;

  if (type == "lru")
    c = new LRUCache(cct);
  else if (type == "2q")
    c = new TwoQCache(cct);
  else
    ceph_abort_msg("unrecognized cache type");

  c->logger = logger;
  return c;
}

void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max)
{
  std::lock_guard l(lock);
  _trim(onode_max, buffer_max);
}

void BlueStore::Cache::trim_all()
{
  std::lock_guard l(lock);
  _trim(0, 0);
}

// LRUCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "

void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}

void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_size << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  while (buffer_size > buffer_max) {
    auto i = buffer_lru.rbegin();
    if (i == buffer_lru.rend()) {
      // stop if buffer_lru is now empty
      break;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_clean());
    dout(20) << __func__ << " rm " << *b << dendl;
    b->space->_rm_buffer(this, b);
  }

  // onodes
  if (onode_max >= onode_lru.size()) {
    return; // don't even try
  }
  uint64_t num = onode_lru.size() - onode_max;

  auto p = onode_lru.end();
  ceph_assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << " " << o->oid << " has " << refs
               << " refs, skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " rm " << o->oid << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      ceph_assert(num == 1);
    }
    o->get();  // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}

#ifdef DEBUG_CACHE
void BlueStore::LRUCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
    s += i->length;
  }
  if (s != buffer_size) {
    derr << __func__ << " buffer_size " << buffer_size << " actual " << s
         << dendl;
    for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
      derr << __func__ << " " << *i << dendl;
    }
    ceph_assert(s == buffer_size);
  }
  dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
           << " ok" << dendl;
}
#endif

// TwoQCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "


void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}

void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
{
  dout(20) << __func__ << " level " << level << " near " << near
           << " on " << *b
           << " which has cache_private " << b->cache_private << dendl;
  if (near) {
    b->cache_private = near->cache_private;
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
      break;
    case BUFFER_HOT:
      buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
  } else if (b->cache_private == BUFFER_NEW) {
    b->cache_private = BUFFER_WARM_IN;
    if (level > 0) {
      buffer_warm_in.push_front(*b);
    } else {
      // take caller hint to start at the back of the warm queue
      buffer_warm_in.push_back(*b);
    }
  } else {
    // we got a hint from discard
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // stay in warm_in.  move to front, even though 2Q doesn't actually
      // do this.
      dout(20) << __func__ << " move to front of warm " << *b << dendl;
      buffer_warm_in.push_front(*b);
      break;
    case BUFFER_WARM_OUT:
      b->cache_private = BUFFER_HOT;
      // move to hot.  fall-thru
    case BUFFER_HOT:
      dout(20) << __func__ << " move to front of hot " << *b << dendl;
      buffer_hot.push_front(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
  }
  if (!b->is_empty()) {
    buffer_bytes += b->length;
    buffer_list_bytes[b->cache_private] += b->length;
  }
}

void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
{
  dout(20) << __func__ << " " << *b << dendl;
  if (!b->is_empty()) {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    ceph_assert(buffer_list_bytes[b->cache_private] >= b->length);
    buffer_list_bytes[b->cache_private] -= b->length;
  }
  switch (b->cache_private) {
  case BUFFER_WARM_IN:
    buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
    break;
  case BUFFER_WARM_OUT:
    buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
    break;
  case BUFFER_HOT:
    buffer_hot.erase(buffer_hot.iterator_to(*b));
    break;
  default:
    ceph_abort_msg("bad cache_private");
  }
}

void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
{
  TwoQCache *src = static_cast<TwoQCache*>(srcc);
  src->_rm_buffer(b);

  // preserve which list we're on (even if we can't preserve the order!)
  switch (b->cache_private) {
  case BUFFER_WARM_IN:
    ceph_assert(!b->is_empty());
    buffer_warm_in.push_back(*b);
    break;
  case BUFFER_WARM_OUT:
    ceph_assert(b->is_empty());
    buffer_warm_out.push_back(*b);
    break;
  case BUFFER_HOT:
    ceph_assert(!b->is_empty());
    buffer_hot.push_back(*b);
    break;
  default:
    ceph_abort_msg("bad cache_private");
  }
  if (!b->is_empty()) {
    buffer_bytes += b->length;
    buffer_list_bytes[b->cache_private] += b->length;
  }
}

void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
{
  dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
  if (!b->is_empty()) {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
    ceph_assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
    buffer_list_bytes[b->cache_private] += delta;
  }
}

void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_bytes << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  if (buffer_bytes > buffer_max) {
    uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
    uint64_t khot = buffer_max - kin;

    // pre-calculate kout based on average buffer size too,
    // which is typical (the warm_in and hot lists may change later)
    uint64_t kout = 0;
    uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
    if (buffer_num) {
      uint64_t buffer_avg_size = buffer_bytes / buffer_num;
      ceph_assert(buffer_avg_size);
      uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
      kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
    }

    if (buffer_list_bytes[BUFFER_HOT] < khot) {
      // hot is small, give slack to warm_in
      kin += khot - buffer_list_bytes[BUFFER_HOT];
    } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
      // warm_in is small, give slack to hot
      khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
    }

    // adjust warm_in list
    int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
    uint64_t evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_warm_in.rbegin();
      if (p == buffer_warm_in.rend()) {
        // stop if warm_in list is now empty
        break;
      }

      Buffer *b = &*p;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
      buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->state = Buffer::STATE_EMPTY;
      b->data.clear();
      buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
      buffer_warm_out.push_front(*b);
      b->cache_private = BUFFER_WARM_OUT;
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << byte_u_t(evicted)
               << " from warm_in list, done evicting warm_in buffers"
               << dendl;
    }

    // adjust hot list
    to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
    evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_hot.rbegin();
      if (p == buffer_hot.rend()) {
        // stop if hot list is now empty
        break;
      }

      Buffer *b = &*p;
      dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
      ceph_assert(b->is_clean());
      // adjust evict size before buffer goes invalid
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->space->_rm_buffer(this, b);
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << byte_u_t(evicted)
               << " from hot list, done evicting hot buffers"
               << dendl;
    }

    // adjust warm out list too, if necessary
    int64_t num = buffer_warm_out.size() - kout;
    while (num-- > 0) {
      Buffer *b = &*buffer_warm_out.rbegin();
      ceph_assert(b->is_empty());
      dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
  }

  // onodes
  if (onode_max >= onode_lru.size()) {
    return; // don't even try
  }
  uint64_t num = onode_lru.size() - onode_max;

  auto p = onode_lru.end();
  ceph_assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    dout(20) << __func__ << " considering " << o << dendl;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << " " << o->oid << " has " << refs
               << " refs; skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " " << o->oid << " num=" << num
             << " lru size=" << onode_lru.size() << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      ceph_assert(num == 1);
    }
    o->get();  // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}

#ifdef DEBUG_CACHE
void BlueStore::TwoQCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
    s += i->length;
  }

  uint64_t hot_bytes = s;
  if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
    derr << __func__ << " hot_list_bytes "
         << buffer_list_bytes[BUFFER_HOT]
         << " != actual " << hot_bytes
         << dendl;
    ceph_assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
  }

  for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
    s += i->length;
  }

  uint64_t warm_in_bytes = s - hot_bytes;
  if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
    derr << __func__ << " warm_in_list_bytes "
         << buffer_list_bytes[BUFFER_WARM_IN]
         << " != actual " << warm_in_bytes
         << dendl;
    ceph_assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
  }

  if (s != buffer_bytes) {
    derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
         << dendl;
    ceph_assert(s == buffer_bytes);
  }

  dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
           << " ok" << dendl;
}
#endif


// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(Cache* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}

void BlueStore::BufferSpace::read(
  Cache* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch_buffer(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch_buffer(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}

void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add_buffer(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }

  cache->_audit("finish_write end");
}

void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
                      0, p->second.get());
      }
      cache->_adjust_buffer_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
}

// OnodeSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add_onode(o, 1);
  return o;
}

BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;
  bool hit = false;

  {
    std::lock_guard l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                            << dendl;
      cache->_touch_onode(p->second);
      hit = true;
      o = p->second;
    }
  }

  if (hit) {
    cache->logger->inc(l_bluestore_onode_hits);
  } else {
    cache->logger->inc(l_bluestore_onode_misses);
  }
  return o;
}

void BlueStore::OnodeSpace::clear()
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 10) << __func__ << dendl;
  for (auto &p : onode_map) {
    cache->_rm_onode(p.second);
  }
  onode_map.clear();
}

bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard l(cache->lock);
  return onode_map.empty();
}

void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_other::string& new_okey)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  ceph_assert(po != pn);

  ceph_assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
                          << dendl;
    cache->_rm_onode(pn->second);
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add_onode(po->second, 1);

  // add at new position and fix oid, key
  onode_map.insert(make_pair(new_oid, o));
  cache->_touch_onode(o);
  o->oid = new_oid;
  o->key = new_okey;
}

bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second)) {
      return true;
    }
  }
  return false;
}

template <int LogLevelV = 30>
void BlueStore::OnodeSpace::dump(CephContext *cct)
{
  for (auto& i : onode_map) {
    ldout(cct, LogLevelV) << i.first << " : " << i.second << dendl;
  }
}
1693
1694// SharedBlob
1695
1696#undef dout_prefix
1697#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1698
1699ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1700{
1701 out << "SharedBlob(" << &sb;
1702
1703 if (sb.loaded) {
1704 out << " loaded " << *sb.persistent;
1705 } else {
1706 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1707 }
1708 return out << ")";
1709}
1710
1711BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1712 : coll(_coll), sbid_unloaded(i)
1713{
11fdf7f2 1714 ceph_assert(sbid_unloaded > 0);
7c673cae
FG
1715 if (get_cache()) {
1716 get_cache()->add_blob();
1717 }
1718}
1719
1720BlueStore::SharedBlob::~SharedBlob()
1721{
7c673cae
FG
1722 if (loaded && persistent) {
1723 delete persistent;
1724 }
1725}
1726
1727void BlueStore::SharedBlob::put()
1728{
1729 if (--nref == 0) {
1730 ldout(coll->store->cct, 20) << __func__ << " " << this
1731 << " removing self from set " << get_parent()
1732 << dendl;
1adf2230
AA
1733 again:
1734 auto coll_snap = coll;
1735 if (coll_snap) {
11fdf7f2 1736 std::lock_guard l(coll_snap->cache->lock);
1adf2230
AA
1737 if (coll_snap != coll) {
1738 goto again;
1739 }
91327a77
AA
1740 if (!coll_snap->shared_blob_set.remove(this, true)) {
1741 // race with lookup
1742 return;
1743 }
1adf2230
AA
1744 bc._clear(coll_snap->cache);
1745 coll_snap->cache->rm_blob();
7c673cae 1746 }
28e407b8 1747 delete this;
7c673cae
FG
1748 }
1749}
1750
1751void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1752{
11fdf7f2 1753 ceph_assert(persistent);
7c673cae
FG
1754 persistent->ref_map.get(offset, length);
1755}
1756
1757void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
31f18b77 1758 PExtentVector *r,
11fdf7f2 1759 bool *unshare)
7c673cae 1760{
11fdf7f2
TL
1761 ceph_assert(persistent);
1762 persistent->ref_map.put(offset, length, r,
1763 unshare && !*unshare ? unshare : nullptr);
7c673cae
FG
1764}
1765
f64942e4
AA
1766void BlueStore::SharedBlob::finish_write(uint64_t seq)
1767{
1768 while (true) {
1769 Cache *cache = coll->cache;
11fdf7f2 1770 std::lock_guard l(cache->lock);
f64942e4
AA
1771 if (coll->cache != cache) {
1772 ldout(coll->store->cct, 20) << __func__
1773 << " raced with sb cache update, was " << cache
1774 << ", now " << coll->cache << ", retrying"
1775 << dendl;
1776 continue;
1777 }
1778 bc._finish_write(cache, seq);
1779 break;
1780 }
1781}
1782
3efd9988
FG
1783// SharedBlobSet
1784
1785#undef dout_prefix
1786#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
1787
11fdf7f2
TL
1788template <int LogLevelV = 30>
1789void BlueStore::SharedBlobSet::dump(CephContext *cct)
3efd9988 1790{
11fdf7f2 1791 std::lock_guard l(lock);
3efd9988 1792 for (auto& i : sb_map) {
11fdf7f2 1793 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
3efd9988
FG
1794 }
1795}
1796
7c673cae
FG
1797// Blob
1798
1799#undef dout_prefix
1800#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1801
1802ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1803{
1804 out << "Blob(" << &b;
1805 if (b.is_spanning()) {
1806 out << " spanning " << b.id;
1807 }
35e4c445
FG
1808 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1809 if (b.shared_blob) {
1810 out << " " << *b.shared_blob;
1811 } else {
1812 out << " (shared_blob=NULL)";
1813 }
1814 out << ")";
7c673cae
FG
1815 return out;
1816}
1817
1818void BlueStore::Blob::discard_unallocated(Collection *coll)
1819{
224ce89b 1820 if (get_blob().is_shared()) {
7c673cae
FG
1821 return;
1822 }
224ce89b 1823 if (get_blob().is_compressed()) {
7c673cae
FG
1824 bool discard = false;
1825 bool all_invalid = true;
224ce89b 1826 for (auto e : get_blob().get_extents()) {
7c673cae
FG
1827 if (!e.is_valid()) {
1828 discard = true;
1829 } else {
1830 all_invalid = false;
1831 }
1832 }
11fdf7f2 1833 ceph_assert(discard == all_invalid); // in case of compressed blob all
7c673cae
FG
1834 // or none pextents are invalid.
1835 if (discard) {
224ce89b
WB
1836 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1837 get_blob().get_logical_length());
7c673cae
FG
1838 }
1839 } else {
1840 size_t pos = 0;
224ce89b 1841 for (auto e : get_blob().get_extents()) {
7c673cae
FG
1842 if (!e.is_valid()) {
1843 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1844 << "~" << e.length
1845 << std::dec << dendl;
1846 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1847 }
1848 pos += e.length;
1849 }
224ce89b
WB
1850 if (get_blob().can_prune_tail()) {
1851 dirty_blob().prune_tail();
1852 used_in_blob.prune_tail(get_blob().get_ondisk_length());
7c673cae 1853 auto cct = coll->store->cct; //used by dout
224ce89b 1854 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
7c673cae
FG
1855 }
1856 }
1857}
1858
1859void BlueStore::Blob::get_ref(
1860 Collection *coll,
1861 uint32_t offset,
1862 uint32_t length)
1863{
 1864 // Caller has to initialize the Blob's logical length prior to incrementing
 1865 // references. Otherwise it is impossible to determine the required number
 1866 // of counters for per-au tracking, or to obtain min_release_size for
 1867 // single-counter mode.
11fdf7f2 1868 ceph_assert(get_blob().get_logical_length() != 0);
7c673cae
FG
1869 auto cct = coll->store->cct;
1870 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1871 << std::dec << " " << *this << dendl;
1872
1873 if (used_in_blob.is_empty()) {
1874 uint32_t min_release_size =
224ce89b
WB
1875 get_blob().get_release_size(coll->store->min_alloc_size);
1876 uint64_t l = get_blob().get_logical_length();
1877 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1878 << min_release_size << std::dec << dendl;
7c673cae
FG
1879 used_in_blob.init(l, min_release_size);
1880 }
1881 used_in_blob.get(
1882 offset,
1883 length);
1884}
1885
1886bool BlueStore::Blob::put_ref(
1887 Collection *coll,
1888 uint32_t offset,
1889 uint32_t length,
1890 PExtentVector *r)
1891{
1892 PExtentVector logical;
1893
1894 auto cct = coll->store->cct;
1895 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1896 << std::dec << " " << *this << dendl;
1897
1898 bool empty = used_in_blob.put(
1899 offset,
1900 length,
1901 &logical);
1902 r->clear();
1903 // nothing to release
1904 if (!empty && logical.empty()) {
1905 return false;
1906 }
1907
1908 bluestore_blob_t& b = dirty_blob();
1909 return b.release_extents(empty, logical, r);
1910}
1911
224ce89b 1912bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
1913 uint32_t target_blob_size,
1914 uint32_t b_offset,
1915 uint32_t *length0) {
11fdf7f2
TL
1916 ceph_assert(min_alloc_size);
1917 ceph_assert(target_blob_size);
7c673cae
FG
1918 if (!get_blob().is_mutable()) {
1919 return false;
1920 }
1921
1922 uint32_t length = *length0;
1923 uint32_t end = b_offset + length;
1924
1925 // Currently for the sake of simplicity we omit blob reuse if data is
1926 // unaligned with csum chunk. Later we can perform padding if needed.
1927 if (get_blob().has_csum() &&
1928 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1929 (end % get_blob().get_csum_chunk_size()) != 0)) {
1930 return false;
1931 }
1932
1933 auto blen = get_blob().get_logical_length();
1934 uint32_t new_blen = blen;
1935
1936 // make sure target_blob_size isn't less than current blob len
11fdf7f2 1937 target_blob_size = std::max(blen, target_blob_size);
7c673cae
FG
1938
1939 if (b_offset >= blen) {
224ce89b
WB
1940 // new data totally stands out of the existing blob
1941 new_blen = end;
7c673cae 1942 } else {
224ce89b 1943 // new data overlaps with the existing blob
11fdf7f2 1944 new_blen = std::max(blen, end);
224ce89b
WB
1945
1946 uint32_t overlap = 0;
1947 if (new_blen > blen) {
1948 overlap = blen - b_offset;
1949 } else {
1950 overlap = length;
1951 }
1952
1953 if (!get_blob().is_unallocated(b_offset, overlap)) {
1954 // abort if any piece of the overlap has already been allocated
1955 return false;
7c673cae
FG
1956 }
1957 }
224ce89b 1958
7c673cae
FG
1959 if (new_blen > blen) {
1960 int64_t overflow = int64_t(new_blen) - target_blob_size;
1961 // Unable to decrease the provided length to fit into max_blob_size
1962 if (overflow >= length) {
1963 return false;
1964 }
1965
1966 // FIXME: in some cases we could reduce unused resolution
1967 if (get_blob().has_unused()) {
1968 return false;
1969 }
1970
1971 if (overflow > 0) {
1972 new_blen -= overflow;
1973 length -= overflow;
1974 *length0 = length;
1975 }
224ce89b 1976
7c673cae
FG
1977 if (new_blen > blen) {
1978 dirty_blob().add_tail(new_blen);
1979 used_in_blob.add_tail(new_blen,
224ce89b 1980 get_blob().get_release_size(min_alloc_size));
7c673cae
FG
1981 }
1982 }
1983 return true;
1984}
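
// Illustrative standalone sketch (not part of BlueStore) of the length
// trimming performed above when a write would grow the blob past
// target_blob_size. The checksum-alignment and "unused" checks are omitted
// and all names are local to the sketch; it returns the trimmed write
// length, or 0 when reuse would be rejected.
#include <algorithm>
#include <cstdint>

inline uint32_t reuse_trim_sketch(uint32_t blen,             // current blob length
                                  uint32_t target_blob_size,
                                  uint32_t b_offset,         // write offset within blob
                                  uint32_t length)           // requested write length
{
  target_blob_size = std::max(blen, target_blob_size);
  uint32_t end = b_offset + length;
  uint32_t new_blen = (b_offset >= blen) ? end : std::max(blen, end);
  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    if (overflow >= length) {
      return 0;                 // reuse rejected: the write cannot be shrunk enough
    }
    if (overflow > 0) {
      length -= overflow;       // e.g. blen=0x10000, target=0x10000,
    }                           // b_offset=0xf000, length=0x2000 -> 0x1000
  }
  return length;
}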
1985
1986void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1987{
1988 auto cct = coll->store->cct; //used by dout
1989 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1990 << " start " << *this << dendl;
11fdf7f2
TL
1991 ceph_assert(blob.can_split());
1992 ceph_assert(used_in_blob.can_split());
7c673cae
FG
1993 bluestore_blob_t &lb = dirty_blob();
1994 bluestore_blob_t &rb = r->dirty_blob();
1995
1996 used_in_blob.split(
1997 blob_offset,
1998 &(r->used_in_blob));
1999
2000 lb.split(blob_offset, rb);
2001 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2002
2003 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2004 << " finish " << *this << dendl;
2005 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2006 << " and " << *r << dendl;
2007}
2008
2009#ifndef CACHE_BLOB_BL
2010void BlueStore::Blob::decode(
2011 Collection *coll,
11fdf7f2 2012 bufferptr::const_iterator& p,
7c673cae
FG
2013 uint64_t struct_v,
2014 uint64_t* sbid,
2015 bool include_ref_map)
2016{
2017 denc(blob, p, struct_v);
2018 if (blob.is_shared()) {
2019 denc(*sbid, p);
2020 }
2021 if (include_ref_map) {
2022 if (struct_v > 1) {
2023 used_in_blob.decode(p);
2024 } else {
2025 used_in_blob.clear();
2026 bluestore_extent_ref_map_t legacy_ref_map;
2027 legacy_ref_map.decode(p);
2028 for (auto r : legacy_ref_map.ref_map) {
2029 get_ref(
2030 coll,
2031 r.first,
2032 r.second.refs * r.second.length);
2033 }
2034 }
2035 }
2036}
2037#endif
2038
2039// Extent
2040
2041ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2042{
2043 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2044 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2045 << " " << *e.blob;
2046}
2047
2048// OldExtent
2049BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2050 uint32_t lo,
2051 uint32_t o,
2052 uint32_t l,
2053 BlobRef& b) {
2054 OldExtent* oe = new OldExtent(lo, o, l, b);
2055 b->put_ref(c.get(), o, l, &(oe->r));
2056 oe->blob_empty = b->get_referenced_bytes() == 0;
2057 return oe;
2058}
2059
2060// ExtentMap
2061
2062#undef dout_prefix
2063#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2064
2065BlueStore::ExtentMap::ExtentMap(Onode *o)
2066 : onode(o),
2067 inline_bl(
2068 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2069}
2070
11fdf7f2
TL
2071void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2072 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2073 uint64_t& length, uint64_t& dstoff) {
2074
2075 auto cct = onode->c->store->cct;
2076 bool inject_21040 =
2077 cct->_conf->bluestore_debug_inject_bug21040;
2078 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2079 for (auto& e : oldo->extent_map.extent_map) {
2080 e.blob->last_encoded_id = -1;
2081 }
2082
2083 int n = 0;
2084 uint64_t end = srcoff + length;
2085 uint32_t dirty_range_begin = 0;
2086 uint32_t dirty_range_end = 0;
2087 bool src_dirty = false;
2088 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2089 ep != oldo->extent_map.extent_map.end();
2090 ++ep) {
2091 auto& e = *ep;
2092 if (e.logical_offset >= end) {
2093 break;
2094 }
2095 dout(20) << __func__ << " src " << e << dendl;
2096 BlobRef cb;
2097 bool blob_duped = true;
2098 if (e.blob->last_encoded_id >= 0) {
2099 cb = id_to_blob[e.blob->last_encoded_id];
2100 blob_duped = false;
2101 } else {
2102 // dup the blob
2103 const bluestore_blob_t& blob = e.blob->get_blob();
2104 // make sure it is shared
2105 if (!blob.is_shared()) {
2106 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2107 if (!inject_21040 && !src_dirty) {
2108 src_dirty = true;
2109 dirty_range_begin = e.logical_offset;
2110 } else if (inject_21040 &&
2111 dirty_range_begin == 0 && dirty_range_end == 0) {
2112 dirty_range_begin = e.logical_offset;
2113 }
2114 ceph_assert(e.logical_end() > 0);
2115 // -1 to exclude next potential shard
2116 dirty_range_end = e.logical_end() - 1;
2117 } else {
2118 c->load_shared_blob(e.blob->shared_blob);
2119 }
2120 cb = new Blob();
2121 e.blob->last_encoded_id = n;
2122 id_to_blob[n] = cb;
2123 e.blob->dup(*cb);
2124 // bump the extent refs on the copied blob's extents
2125 for (auto p : blob.get_extents()) {
2126 if (p.is_valid()) {
2127 e.blob->shared_blob->get_ref(p.offset, p.length);
2128 }
2129 }
2130 txc->write_shared_blob(e.blob->shared_blob);
2131 dout(20) << __func__ << " new " << *cb << dendl;
2132 }
2133
2134 int skip_front, skip_back;
2135 if (e.logical_offset < srcoff) {
2136 skip_front = srcoff - e.logical_offset;
2137 } else {
2138 skip_front = 0;
2139 }
2140 if (e.logical_end() > end) {
2141 skip_back = e.logical_end() - end;
2142 } else {
2143 skip_back = 0;
2144 }
2145
2146 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2147 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2148 newo->extent_map.extent_map.insert(*ne);
2149 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2150 // fixme: we may leave parts of new blob unreferenced that could
2151 // be freed (relative to the shared_blob).
2152 txc->statfs_delta.stored() += ne->length;
2153 if (e.blob->get_blob().is_compressed()) {
2154 txc->statfs_delta.compressed_original() += ne->length;
2155 if (blob_duped) {
2156 txc->statfs_delta.compressed() +=
2157 cb->get_blob().get_compressed_payload_length();
2158 }
2159 }
2160 dout(20) << __func__ << " dst " << *ne << dendl;
2161 ++n;
2162 }
2163 if ((!inject_21040 && src_dirty) ||
2164 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2165 oldo->extent_map.dirty_range(dirty_range_begin,
2166 dirty_range_end - dirty_range_begin);
2167 txc->write_onode(oldo);
2168 }
2169 txc->write_onode(newo);
2170
2171 if (dstoff + length > newo->onode.size) {
2172 newo->onode.size = dstoff + length;
2173 }
2174 newo->extent_map.dirty_range(dstoff, length);
2175}
7c673cae
FG
2176void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2177 bool force)
2178{
2179 auto cct = onode->c->store->cct; //used by dout
2180 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2181 if (onode->onode.extent_map_shards.empty()) {
2182 if (inline_bl.length() == 0) {
2183 unsigned n;
2184 // we need to encode inline_bl to measure encoded length
2185 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
3efd9988 2186 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11fdf7f2 2187 ceph_assert(!never_happen);
7c673cae
FG
2188 size_t len = inline_bl.length();
2189 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2190 << " extents" << dendl;
2191 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2192 request_reshard(0, OBJECT_MAX_SIZE);
2193 return;
2194 }
2195 }
2196 // will persist in the onode key.
2197 } else {
2198 // pending shard update
2199 struct dirty_shard_t {
2200 Shard *shard;
2201 bufferlist bl;
2202 dirty_shard_t(Shard *s) : shard(s) {}
2203 };
2204 vector<dirty_shard_t> encoded_shards;
2205 // allocate slots for all shards in a single call instead of
 2206 // doing multiple allocations - one for each dirty shard
2207 encoded_shards.reserve(shards.size());
2208
2209 auto p = shards.begin();
2210 auto prev_p = p;
2211 while (p != shards.end()) {
11fdf7f2 2212 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2213 auto n = p;
2214 ++n;
2215 if (p->dirty) {
2216 uint32_t endoff;
2217 if (n == shards.end()) {
2218 endoff = OBJECT_MAX_SIZE;
2219 } else {
2220 endoff = n->shard_info->offset;
2221 }
2222 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2223 bufferlist& bl = encoded_shards.back().bl;
2224 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2225 bl, &p->extents)) {
2226 if (force) {
2227 derr << __func__ << " encode_some needs reshard" << dendl;
11fdf7f2 2228 ceph_assert(!force);
7c673cae
FG
2229 }
2230 }
2231 size_t len = bl.length();
2232
2233 dout(20) << __func__ << " shard 0x" << std::hex
2234 << p->shard_info->offset << std::dec << " is " << len
2235 << " bytes (was " << p->shard_info->bytes << ") from "
2236 << p->extents << " extents" << dendl;
2237
2238 if (!force) {
2239 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2240 // we are big; reshard ourselves
2241 request_reshard(p->shard_info->offset, endoff);
2242 }
2243 // avoid resharding the trailing shard, even if it is small
2244 else if (n != shards.end() &&
11fdf7f2
TL
2245 len < g_conf()->bluestore_extent_map_shard_min_size) {
2246 ceph_assert(endoff != OBJECT_MAX_SIZE);
31f18b77
FG
2247 if (p == shards.begin()) {
2248 // we are the first shard, combine with next shard
7c673cae 2249 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2250 } else {
31f18b77
FG
2251 // combine either with the previous shard or the next,
2252 // whichever is smaller
7c673cae
FG
2253 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2254 request_reshard(p->shard_info->offset, endoff + 1);
2255 } else {
2256 request_reshard(prev_p->shard_info->offset, endoff);
2257 }
2258 }
2259 }
2260 }
2261 }
2262 prev_p = p;
2263 p = n;
2264 }
2265 if (needs_reshard()) {
2266 return;
2267 }
2268
2269 // schedule DB update for dirty shards
2270 string key;
2271 for (auto& it : encoded_shards) {
2272 it.shard->dirty = false;
2273 it.shard->shard_info->bytes = it.bl.length();
2274 generate_extent_shard_key_and_apply(
2275 onode->key,
2276 it.shard->shard_info->offset,
2277 &key,
2278 [&](const string& final_key) {
2279 t->set(PREFIX_OBJ, final_key, it.bl);
2280 }
2281 );
2282 }
2283 }
2284}
2285
31f18b77
FG
2286bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2287{
2288 if (spanning_blob_map.empty())
2289 return 0;
2290 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2291 // bid is valid and available.
2292 if (bid >= 0)
2293 return bid;
2294 // Find next unused bid;
2295 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2296 const auto begin_bid = bid;
2297 do {
2298 if (!spanning_blob_map.count(bid))
2299 return bid;
2300 else {
2301 bid++;
2302 if (bid < 0) bid = 0;
2303 }
2304 } while (bid != begin_bid);
81eedcae
TL
2305 auto cct = onode->c->store->cct; // used by dout
2306 _dump_onode<0>(cct, *onode);
11fdf7f2 2307 ceph_abort_msg("no available blob id");
31f18b77
FG
2308}
2309
7c673cae
FG
2310void BlueStore::ExtentMap::reshard(
2311 KeyValueDB *db,
2312 KeyValueDB::Transaction t)
2313{
2314 auto cct = onode->c->store->cct; // used by dout
2315
2316 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2317 << needs_reshard_end << ")" << std::dec
2318 << " of " << onode->onode.extent_map_shards.size()
2319 << " shards on " << onode->oid << dendl;
2320 for (auto& p : spanning_blob_map) {
2321 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2322 << dendl;
2323 }
2324 // determine shard index range
2325 unsigned si_begin = 0, si_end = 0;
2326 if (!shards.empty()) {
2327 while (si_begin + 1 < shards.size() &&
2328 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2329 ++si_begin;
2330 }
2331 needs_reshard_begin = shards[si_begin].shard_info->offset;
2332 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2333 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2334 needs_reshard_end = shards[si_end].shard_info->offset;
2335 break;
2336 }
2337 }
2338 if (si_end == shards.size()) {
2339 needs_reshard_end = OBJECT_MAX_SIZE;
2340 }
2341 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2342 << " over 0x[" << std::hex << needs_reshard_begin << ","
2343 << needs_reshard_end << ")" << std::dec << dendl;
2344 }
2345
181888fb 2346 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
7c673cae
FG
2347
 2348 // we may need to fault in a larger interval later; we must have all
2349 // referring extents for spanning blobs loaded in order to have
2350 // accurate use_tracker values.
2351 uint32_t spanning_scan_begin = needs_reshard_begin;
2352 uint32_t spanning_scan_end = needs_reshard_end;
2353
2354 // remove old keys
2355 string key;
2356 for (unsigned i = si_begin; i < si_end; ++i) {
2357 generate_extent_shard_key_and_apply(
2358 onode->key, shards[i].shard_info->offset, &key,
2359 [&](const string& final_key) {
2360 t->rmkey(PREFIX_OBJ, final_key);
2361 }
2362 );
2363 }
2364
2365 // calculate average extent size
2366 unsigned bytes = 0;
2367 unsigned extents = 0;
2368 if (onode->onode.extent_map_shards.empty()) {
2369 bytes = inline_bl.length();
2370 extents = extent_map.size();
2371 } else {
2372 for (unsigned i = si_begin; i < si_end; ++i) {
2373 bytes += shards[i].shard_info->bytes;
2374 extents += shards[i].extents;
2375 }
2376 }
2377 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2378 unsigned slop = target *
2379 cct->_conf->bluestore_extent_map_shard_target_size_slop;
11fdf7f2 2380 unsigned extent_avg = bytes / std::max(1u, extents);
7c673cae
FG
2381 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2382 << ", slop " << slop << dendl;
2383
2384 // reshard
2385 unsigned estimate = 0;
31f18b77 2386 unsigned offset = needs_reshard_begin;
7c673cae
FG
2387 vector<bluestore_onode_t::shard_info> new_shard_info;
2388 unsigned max_blob_end = 0;
2389 Extent dummy(needs_reshard_begin);
2390 for (auto e = extent_map.lower_bound(dummy);
2391 e != extent_map.end();
2392 ++e) {
2393 if (e->logical_offset >= needs_reshard_end) {
2394 break;
2395 }
2396 dout(30) << " extent " << *e << dendl;
2397
2398 // disfavor shard boundaries that span a blob
2399 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2400 if (estimate &&
2401 estimate + extent_avg > target + (would_span ? slop : 0)) {
2402 // new shard
31f18b77 2403 if (offset == needs_reshard_begin) {
7c673cae
FG
2404 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2405 new_shard_info.back().offset = offset;
2406 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2407 << std::dec << dendl;
7c673cae
FG
2408 }
2409 offset = e->logical_offset;
2410 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2411 new_shard_info.back().offset = offset;
2412 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2413 << std::dec << dendl;
2414 estimate = 0;
2415 }
2416 estimate += extent_avg;
31f18b77
FG
2417 unsigned bs = e->blob_start();
2418 if (bs < spanning_scan_begin) {
2419 spanning_scan_begin = bs;
7c673cae
FG
2420 }
2421 uint32_t be = e->blob_end();
2422 if (be > max_blob_end) {
2423 max_blob_end = be;
2424 }
2425 if (be > spanning_scan_end) {
2426 spanning_scan_end = be;
2427 }
2428 }
2429 if (new_shard_info.empty() && (si_begin > 0 ||
2430 si_end < shards.size())) {
2431 // we resharded a partial range; we must produce at least one output
2432 // shard
2433 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2434 new_shard_info.back().offset = needs_reshard_begin;
2435 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2436 << std::dec << " (singleton degenerate case)" << dendl;
2437 }
2438
2439 auto& sv = onode->onode.extent_map_shards;
2440 dout(20) << __func__ << " new " << new_shard_info << dendl;
2441 dout(20) << __func__ << " old " << sv << dendl;
2442 if (sv.empty()) {
2443 // no old shards to keep
2444 sv.swap(new_shard_info);
2445 init_shards(true, true);
2446 } else {
2447 // splice in new shards
2448 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2449 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2450 sv.insert(
2451 sv.begin() + si_begin,
2452 new_shard_info.begin(),
2453 new_shard_info.end());
2454 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2455 si_end = si_begin + new_shard_info.size();
31f18b77 2456
11fdf7f2 2457 ceph_assert(sv.size() == shards.size());
31f18b77
FG
2458
2459 // note that we need to update every shard_info of shards here,
2460 // as sv might have been totally re-allocated above
2461 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2462 shards[i].shard_info = &sv[i];
31f18b77
FG
2463 }
2464
2465 // mark newly added shards as dirty
2466 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2467 shards[i].loaded = true;
2468 shards[i].dirty = true;
2469 }
7c673cae
FG
2470 }
2471 dout(20) << __func__ << " fin " << sv << dendl;
2472 inline_bl.clear();
2473
2474 if (sv.empty()) {
2475 // no more shards; unspan all previously spanning blobs
2476 auto p = spanning_blob_map.begin();
2477 while (p != spanning_blob_map.end()) {
2478 p->second->id = -1;
2479 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2480 p = spanning_blob_map.erase(p);
2481 }
2482 } else {
2483 // identify new spanning blobs
2484 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2485 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2486 if (spanning_scan_begin < needs_reshard_begin) {
2487 fault_range(db, spanning_scan_begin,
2488 needs_reshard_begin - spanning_scan_begin);
2489 }
2490 if (spanning_scan_end > needs_reshard_end) {
2491 fault_range(db, needs_reshard_end,
31f18b77 2492 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2493 }
2494 auto sp = sv.begin() + si_begin;
2495 auto esp = sv.end();
2496 unsigned shard_start = sp->offset;
2497 unsigned shard_end;
2498 ++sp;
2499 if (sp == esp) {
2500 shard_end = OBJECT_MAX_SIZE;
2501 } else {
2502 shard_end = sp->offset;
2503 }
7c673cae
FG
2504 Extent dummy(needs_reshard_begin);
2505 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2506 if (e->logical_offset >= needs_reshard_end) {
2507 break;
2508 }
2509 dout(30) << " extent " << *e << dendl;
2510 while (e->logical_offset >= shard_end) {
2511 shard_start = shard_end;
11fdf7f2 2512 ceph_assert(sp != esp);
7c673cae
FG
2513 ++sp;
2514 if (sp == esp) {
2515 shard_end = OBJECT_MAX_SIZE;
2516 } else {
2517 shard_end = sp->offset;
2518 }
2519 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2520 << " to 0x" << shard_end << std::dec << dendl;
2521 }
2522 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2523 if (!e->blob->is_spanning()) {
2524 // We have two options: (1) split the blob into pieces at the
2525 // shard boundaries (and adjust extents accordingly), or (2)
2526 // mark it spanning. We prefer to cut the blob if we can. Note that
2527 // we may have to split it multiple times--potentially at every
2528 // shard boundary.
2529 bool must_span = false;
2530 BlobRef b = e->blob;
2531 if (b->can_split()) {
2532 uint32_t bstart = e->blob_start();
2533 uint32_t bend = e->blob_end();
2534 for (const auto& sh : shards) {
2535 if (bstart < sh.shard_info->offset &&
2536 bend > sh.shard_info->offset) {
2537 uint32_t blob_offset = sh.shard_info->offset - bstart;
2538 if (b->can_split_at(blob_offset)) {
2539 dout(20) << __func__ << " splitting blob, bstart 0x"
2540 << std::hex << bstart << " blob_offset 0x"
2541 << blob_offset << std::dec << " " << *b << dendl;
2542 b = split_blob(b, blob_offset, sh.shard_info->offset);
2543 // switch b to the new right-hand side, in case it
2544 // *also* has to get split.
2545 bstart += blob_offset;
2546 onode->c->store->logger->inc(l_bluestore_blob_split);
2547 } else {
2548 must_span = true;
2549 break;
2550 }
2551 }
2552 }
2553 } else {
2554 must_span = true;
2555 }
2556 if (must_span) {
31f18b77
FG
2557 auto bid = allocate_spanning_blob_id();
2558 b->id = bid;
7c673cae
FG
2559 spanning_blob_map[b->id] = b;
2560 dout(20) << __func__ << " adding spanning " << *b << dendl;
2561 }
2562 }
2563 } else {
2564 if (e->blob->is_spanning()) {
2565 spanning_blob_map.erase(e->blob->id);
2566 e->blob->id = -1;
2567 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2568 }
2569 }
2570 }
2571 }
2572
2573 clear_needs_reshard();
2574}
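
// Minimal standalone sketch (not part of BlueStore) of the shard-boundary
// estimation used in reshard() above: extents are charged an average
// per-extent cost, and a new shard starts once the accumulated estimate
// would exceed the target, with an extra `slop` tolerance when cutting here
// would split a blob. The first-shard special case and spanning-blob
// bookkeeping are omitted; all names are hypothetical.
#include <cstdint>
#include <vector>

struct extent_info_sketch { uint32_t offset; bool would_span; };

std::vector<uint32_t> plan_shards_sketch(const std::vector<extent_info_sketch>& extents,
                                         unsigned extent_avg,
                                         unsigned target,
                                         unsigned slop)
{
  std::vector<uint32_t> shard_offsets;   // logical offset where each new shard begins
  unsigned estimate = 0;
  for (const auto& e : extents) {
    // tolerate `slop` extra bytes before cutting if a cut here would split a blob
    if (estimate && estimate + extent_avg > target + (e.would_span ? slop : 0)) {
      shard_offsets.push_back(e.offset);
      estimate = 0;
    }
    estimate += extent_avg;
  }
  return shard_offsets;
}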
2575
2576bool BlueStore::ExtentMap::encode_some(
2577 uint32_t offset,
2578 uint32_t length,
2579 bufferlist& bl,
2580 unsigned *pn)
2581{
2582 auto cct = onode->c->store->cct; //used by dout
2583 Extent dummy(offset);
2584 auto start = extent_map.lower_bound(dummy);
2585 uint32_t end = offset + length;
2586
2587 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2588 // serialization only. Hence there is no specific
2589 // handling at ExtentMap level.
2590
2591 unsigned n = 0;
2592 size_t bound = 0;
7c673cae
FG
2593 bool must_reshard = false;
2594 for (auto p = start;
2595 p != extent_map.end() && p->logical_offset < end;
2596 ++p, ++n) {
11fdf7f2 2597 ceph_assert(p->logical_offset >= offset);
7c673cae
FG
2598 p->blob->last_encoded_id = -1;
2599 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2600 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2601 << std::dec << " hit new spanning blob " << *p << dendl;
2602 request_reshard(p->blob_start(), p->blob_end());
2603 must_reshard = true;
2604 }
31f18b77
FG
2605 if (!must_reshard) {
2606 denc_varint(0, bound); // blobid
2607 denc_varint(0, bound); // logical_offset
2608 denc_varint(0, bound); // len
2609 denc_varint(0, bound); // blob_offset
7c673cae 2610
31f18b77
FG
2611 p->blob->bound_encode(
2612 bound,
2613 struct_v,
2614 p->blob->shared_blob->get_sbid(),
2615 false);
2616 }
7c673cae
FG
2617 }
2618 if (must_reshard) {
2619 return true;
2620 }
2621
31f18b77
FG
2622 denc(struct_v, bound);
2623 denc_varint(0, bound); // number of extents
2624
7c673cae
FG
2625 {
2626 auto app = bl.get_contiguous_appender(bound);
2627 denc(struct_v, app);
2628 denc_varint(n, app);
2629 if (pn) {
2630 *pn = n;
2631 }
2632
2633 n = 0;
2634 uint64_t pos = 0;
2635 uint64_t prev_len = 0;
2636 for (auto p = start;
2637 p != extent_map.end() && p->logical_offset < end;
2638 ++p, ++n) {
2639 unsigned blobid;
2640 bool include_blob = false;
2641 if (p->blob->is_spanning()) {
2642 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2643 blobid |= BLOBID_FLAG_SPANNING;
2644 } else if (p->blob->last_encoded_id < 0) {
2645 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2646 include_blob = true;
2647 blobid = 0; // the decoder will infer the id from n
2648 } else {
2649 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2650 }
2651 if (p->logical_offset == pos) {
2652 blobid |= BLOBID_FLAG_CONTIGUOUS;
2653 }
2654 if (p->blob_offset == 0) {
2655 blobid |= BLOBID_FLAG_ZEROOFFSET;
2656 }
2657 if (p->length == prev_len) {
2658 blobid |= BLOBID_FLAG_SAMELENGTH;
2659 } else {
2660 prev_len = p->length;
2661 }
2662 denc_varint(blobid, app);
2663 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2664 denc_varint_lowz(p->logical_offset - pos, app);
2665 }
2666 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2667 denc_varint_lowz(p->blob_offset, app);
2668 }
2669 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2670 denc_varint_lowz(p->length, app);
2671 }
2672 pos = p->logical_end();
2673 if (include_blob) {
2674 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2675 }
2676 }
2677 }
2678 /*derr << __func__ << bl << dendl;
2679 derr << __func__ << ":";
2680 bl.hexdump(*_dout);
2681 *_dout << dendl;
2682 */
2683 return false;
2684}
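
// Standalone sketch (not part of BlueStore) of the per-extent "blobid" field
// packed by encode_some() above: the low bits carry flags that let the
// encoder omit the logical gap, blob offset and length when they can be
// inferred, and the remaining bits carry a blob id (spanning id or the
// position-derived last_encoded_id). The shift width and flag values below
// are illustrative assumptions; the real constants live in BlueStore.h.
#include <cstdint>

namespace blobid_sketch {
constexpr unsigned SHIFT_BITS      = 4;
constexpr uint64_t FLAG_CONTIGUOUS = 0x1;  // extent starts where the previous ended
constexpr uint64_t FLAG_ZEROOFFSET = 0x2;  // blob_offset == 0
constexpr uint64_t FLAG_SAMELENGTH = 0x4;  // same length as the previous extent
constexpr uint64_t FLAG_SPANNING   = 0x8;  // id refers to a spanning blob

inline uint64_t pack(uint64_t id, bool spanning,
                     bool contiguous, bool zero_offset, bool same_length)
{
  uint64_t v = id << SHIFT_BITS;
  if (spanning)    v |= FLAG_SPANNING;
  if (contiguous)  v |= FLAG_CONTIGUOUS;
  if (zero_offset) v |= FLAG_ZEROOFFSET;
  if (same_length) v |= FLAG_SAMELENGTH;
  return v;
}
} // namespace blobid_sketch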
2685
2686unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2687{
2688 auto cct = onode->c->store->cct; //used by dout
2689 /*
2690 derr << __func__ << ":";
2691 bl.hexdump(*_dout);
2692 *_dout << dendl;
2693 */
2694
11fdf7f2 2695 ceph_assert(bl.get_num_buffers() <= 1);
7c673cae
FG
2696 auto p = bl.front().begin_deep();
2697 __u8 struct_v;
2698 denc(struct_v, p);
2699 // Version 2 differs from v1 in blob's ref_map
2700 // serialization only. Hence there is no specific
2701 // handling at ExtentMap level below.
11fdf7f2 2702 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2703
2704 uint32_t num;
2705 denc_varint(num, p);
2706 vector<BlobRef> blobs(num);
2707 uint64_t pos = 0;
2708 uint64_t prev_len = 0;
2709 unsigned n = 0;
2710
2711 while (!p.end()) {
2712 Extent *le = new Extent();
2713 uint64_t blobid;
2714 denc_varint(blobid, p);
2715 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2716 uint64_t gap;
2717 denc_varint_lowz(gap, p);
2718 pos += gap;
2719 }
2720 le->logical_offset = pos;
2721 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2722 denc_varint_lowz(le->blob_offset, p);
2723 } else {
2724 le->blob_offset = 0;
2725 }
2726 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2727 denc_varint_lowz(prev_len, p);
2728 }
2729 le->length = prev_len;
2730
2731 if (blobid & BLOBID_FLAG_SPANNING) {
2732 dout(30) << __func__ << " getting spanning blob "
2733 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2734 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2735 } else {
2736 blobid >>= BLOBID_SHIFT_BITS;
2737 if (blobid) {
2738 le->assign_blob(blobs[blobid - 1]);
11fdf7f2 2739 ceph_assert(le->blob);
7c673cae
FG
2740 } else {
2741 Blob *b = new Blob();
2742 uint64_t sbid = 0;
2743 b->decode(onode->c, p, struct_v, &sbid, false);
2744 blobs[n] = b;
2745 onode->c->open_shared_blob(sbid, b);
2746 le->assign_blob(b);
2747 }
2748 // we build ref_map dynamically for non-spanning blobs
2749 le->blob->get_ref(
2750 onode->c,
2751 le->blob_offset,
2752 le->length);
2753 }
2754 pos += prev_len;
2755 ++n;
2756 extent_map.insert(*le);
2757 }
2758
11fdf7f2 2759 ceph_assert(n == num);
7c673cae
FG
2760 return num;
2761}
2762
2763void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2764{
2765 // Version 2 differs from v1 in blob's ref_map
2766 // serialization only. Hence there is no specific
2767 // handling at ExtentMap level.
2768 __u8 struct_v = 2;
2769
2770 denc(struct_v, p);
2771 denc_varint((uint32_t)0, p);
2772 size_t key_size = 0;
2773 denc_varint((uint32_t)0, key_size);
2774 p += spanning_blob_map.size() * key_size;
2775 for (const auto& i : spanning_blob_map) {
2776 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2777 }
2778}
2779
2780void BlueStore::ExtentMap::encode_spanning_blobs(
2781 bufferlist::contiguous_appender& p)
2782{
2783 // Version 2 differs from v1 in blob's ref_map
2784 // serialization only. Hence there is no specific
2785 // handling at ExtentMap level.
2786 __u8 struct_v = 2;
2787
2788 denc(struct_v, p);
2789 denc_varint(spanning_blob_map.size(), p);
2790 for (auto& i : spanning_blob_map) {
2791 denc_varint(i.second->id, p);
2792 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2793 }
2794}
2795
2796void BlueStore::ExtentMap::decode_spanning_blobs(
11fdf7f2 2797 bufferptr::const_iterator& p)
7c673cae
FG
2798{
2799 __u8 struct_v;
2800 denc(struct_v, p);
2801 // Version 2 differs from v1 in blob's ref_map
2802 // serialization only. Hence there is no specific
2803 // handling at ExtentMap level.
11fdf7f2 2804 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2805
2806 unsigned n;
2807 denc_varint(n, p);
2808 while (n--) {
2809 BlobRef b(new Blob());
2810 denc_varint(b->id, p);
2811 spanning_blob_map[b->id] = b;
2812 uint64_t sbid = 0;
2813 b->decode(onode->c, p, struct_v, &sbid, true);
2814 onode->c->open_shared_blob(sbid, b);
2815 }
2816}
2817
2818void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2819{
2820 shards.resize(onode->onode.extent_map_shards.size());
2821 unsigned i = 0;
2822 for (auto &s : onode->onode.extent_map_shards) {
2823 shards[i].shard_info = &s;
2824 shards[i].loaded = loaded;
2825 shards[i].dirty = dirty;
2826 ++i;
2827 }
2828}
2829
2830void BlueStore::ExtentMap::fault_range(
2831 KeyValueDB *db,
2832 uint32_t offset,
2833 uint32_t length)
2834{
2835 auto cct = onode->c->store->cct; //used by dout
2836 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2837 << std::dec << dendl;
2838 auto start = seek_shard(offset);
2839 auto last = seek_shard(offset + length);
2840
2841 if (start < 0)
2842 return;
2843
11fdf7f2 2844 ceph_assert(last >= start);
7c673cae
FG
2845 string key;
2846 while (start <= last) {
11fdf7f2 2847 ceph_assert((size_t)start < shards.size());
7c673cae
FG
2848 auto p = &shards[start];
2849 if (!p->loaded) {
2850 dout(30) << __func__ << " opening shard 0x" << std::hex
2851 << p->shard_info->offset << std::dec << dendl;
2852 bufferlist v;
2853 generate_extent_shard_key_and_apply(
2854 onode->key, p->shard_info->offset, &key,
2855 [&](const string& final_key) {
2856 int r = db->get(PREFIX_OBJ, final_key, &v);
2857 if (r < 0) {
2858 derr << __func__ << " missing shard 0x" << std::hex
2859 << p->shard_info->offset << std::dec << " for " << onode->oid
2860 << dendl;
11fdf7f2 2861 ceph_assert(r >= 0);
7c673cae
FG
2862 }
2863 }
2864 );
2865 p->extents = decode_some(v);
2866 p->loaded = true;
2867 dout(20) << __func__ << " open shard 0x" << std::hex
81eedcae
TL
2868 << p->shard_info->offset
2869 << " for range 0x" << offset << "~" << length << std::dec
7c673cae 2870 << " (" << v.length() << " bytes)" << dendl;
11fdf7f2
TL
2871 ceph_assert(p->dirty == false);
2872 ceph_assert(v.length() == p->shard_info->bytes);
7c673cae
FG
2873 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2874 } else {
2875 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2876 }
2877 ++start;
2878 }
2879}
2880
2881void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
2882 uint32_t offset,
2883 uint32_t length)
2884{
2885 auto cct = onode->c->store->cct; //used by dout
2886 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2887 << std::dec << dendl;
2888 if (shards.empty()) {
2889 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2890 inline_bl.clear();
2891 return;
2892 }
2893 auto start = seek_shard(offset);
11fdf7f2
TL
2894 if (length == 0) {
2895 length = 1;
2896 }
2897 auto last = seek_shard(offset + length - 1);
7c673cae
FG
2898 if (start < 0)
2899 return;
2900
11fdf7f2 2901 ceph_assert(last >= start);
7c673cae 2902 while (start <= last) {
11fdf7f2 2903 ceph_assert((size_t)start < shards.size());
7c673cae
FG
2904 auto p = &shards[start];
2905 if (!p->loaded) {
11fdf7f2
TL
 2906 derr << __func__ << " on write 0x" << std::hex << offset
2907 << "~" << length << " shard 0x" << p->shard_info->offset
2908 << std::dec << " is not loaded, can't mark dirty" << dendl;
2909 ceph_abort_msg("can't mark unloaded shard dirty");
7c673cae
FG
2910 }
2911 if (!p->dirty) {
2912 dout(20) << __func__ << " mark shard 0x" << std::hex
2913 << p->shard_info->offset << std::dec << " dirty" << dendl;
2914 p->dirty = true;
2915 }
2916 ++start;
2917 }
2918}
2919
2920BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2921 uint64_t offset)
2922{
2923 Extent dummy(offset);
2924 return extent_map.find(dummy);
2925}
2926
7c673cae
FG
2927BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2928 uint64_t offset)
2929{
2930 Extent dummy(offset);
2931 auto fp = extent_map.lower_bound(dummy);
2932 if (fp != extent_map.begin()) {
2933 --fp;
2934 if (fp->logical_end() <= offset) {
2935 ++fp;
2936 }
2937 }
2938 return fp;
2939}
2940
2941BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2942 uint64_t offset) const
2943{
2944 Extent dummy(offset);
2945 auto fp = extent_map.lower_bound(dummy);
2946 if (fp != extent_map.begin()) {
2947 --fp;
2948 if (fp->logical_end() <= offset) {
2949 ++fp;
2950 }
2951 }
2952 return fp;
2953}
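
// Standalone sketch (not part of BlueStore) of the lower_bound-then-step-back
// idiom used by seek_lextent() above: find the first extent at or after
// `offset`, then include the preceding extent if it still covers `offset`.
// A std::map keyed by logical offset (value = length) stands in here.
#include <cstdint>
#include <map>

using extent_map_sketch_t = std::map<uint64_t, uint64_t>;

extent_map_sketch_t::iterator seek_sketch(extent_map_sketch_t& m, uint64_t offset)
{
  auto fp = m.lower_bound(offset);
  if (fp != m.begin()) {
    --fp;
    if (fp->first + fp->second <= offset) {
      ++fp;   // the previous extent ends before offset; step forward again
    }
  }
  return fp;
}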
2954
2955bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2956{
2957 auto fp = seek_lextent(offset);
2958 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2959 return false;
2960 }
2961 return true;
2962}
2963
2964int BlueStore::ExtentMap::compress_extent_map(
2965 uint64_t offset,
2966 uint64_t length)
2967{
2968 auto cct = onode->c->store->cct; //used by dout
2969 if (extent_map.empty())
2970 return 0;
2971 int removed = 0;
2972 auto p = seek_lextent(offset);
2973 if (p != extent_map.begin()) {
2974 --p; // start to the left of offset
2975 }
2976 // the caller should have just written to this region
11fdf7f2 2977 ceph_assert(p != extent_map.end());
7c673cae
FG
2978
2979 // identify the *next* shard
2980 auto pshard = shards.begin();
2981 while (pshard != shards.end() &&
2982 p->logical_offset >= pshard->shard_info->offset) {
2983 ++pshard;
2984 }
2985 uint64_t shard_end;
2986 if (pshard != shards.end()) {
2987 shard_end = pshard->shard_info->offset;
2988 } else {
2989 shard_end = OBJECT_MAX_SIZE;
2990 }
2991
2992 auto n = p;
2993 for (++n; n != extent_map.end(); p = n++) {
2994 if (n->logical_offset > offset + length) {
2995 break; // stop after end
2996 }
2997 while (n != extent_map.end() &&
2998 p->logical_end() == n->logical_offset &&
2999 p->blob == n->blob &&
3000 p->blob_offset + p->length == n->blob_offset &&
3001 n->logical_offset < shard_end) {
3002 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3003 << " next shard 0x" << shard_end << std::dec
3004 << " merging " << *p << " and " << *n << dendl;
3005 p->length += n->length;
3006 rm(n++);
3007 ++removed;
3008 }
3009 if (n == extent_map.end()) {
3010 break;
3011 }
3012 if (n->logical_offset >= shard_end) {
11fdf7f2 3013 ceph_assert(pshard != shards.end());
7c673cae
FG
3014 ++pshard;
3015 if (pshard != shards.end()) {
3016 shard_end = pshard->shard_info->offset;
3017 } else {
3018 shard_end = OBJECT_MAX_SIZE;
3019 }
3020 }
3021 }
11fdf7f2 3022 if (removed) {
7c673cae
FG
3023 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3024 }
3025 return removed;
3026}
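
// Standalone sketch (not part of BlueStore) of the merge rule applied by
// compress_extent_map() above: two neighbouring extents can be coalesced when
// they are logically contiguous, point at the same blob, and are contiguous
// within that blob. The real code additionally refuses to merge across shard
// boundaries; that check is omitted here.
#include <cstdint>

struct extent_sketch_t {
  uint64_t logical_offset;
  uint64_t blob_offset;
  uint64_t length;
  const void* blob;   // stand-in for the BlobRef identity
};

inline bool can_merge_sketch(const extent_sketch_t& p, const extent_sketch_t& n)
{
  return p.logical_offset + p.length == n.logical_offset &&
         p.blob == n.blob &&
         p.blob_offset + p.length == n.blob_offset;
}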
3027
3028void BlueStore::ExtentMap::punch_hole(
3029 CollectionRef &c,
3030 uint64_t offset,
3031 uint64_t length,
3032 old_extent_map_t *old_extents)
3033{
3034 auto p = seek_lextent(offset);
3035 uint64_t end = offset + length;
3036 while (p != extent_map.end()) {
3037 if (p->logical_offset >= end) {
3038 break;
3039 }
3040 if (p->logical_offset < offset) {
3041 if (p->logical_end() > end) {
3042 // split and deref middle
3043 uint64_t front = offset - p->logical_offset;
3044 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3045 length, p->blob);
3046 old_extents->push_back(*oe);
3047 add(end,
3048 p->blob_offset + front + length,
3049 p->length - front - length,
3050 p->blob);
3051 p->length = front;
3052 break;
3053 } else {
3054 // deref tail
11fdf7f2 3055 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
7c673cae
FG
3056 uint64_t keep = offset - p->logical_offset;
3057 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3058 p->length - keep, p->blob);
3059 old_extents->push_back(*oe);
3060 p->length = keep;
3061 ++p;
3062 continue;
3063 }
3064 }
3065 if (p->logical_offset + p->length <= end) {
3066 // deref whole lextent
3067 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3068 p->length, p->blob);
3069 old_extents->push_back(*oe);
3070 rm(p++);
3071 continue;
3072 }
3073 // deref head
3074 uint64_t keep = p->logical_end() - end;
3075 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3076 p->length - keep, p->blob);
3077 old_extents->push_back(*oe);
3078
3079 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3080 rm(p);
3081 break;
3082 }
3083}
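
// Standalone sketch (not part of BlueStore) of the cases punch_hole()
// distinguishes for an existing extent [lo, lo+len) versus a hole
// [off, off+hlen), assuming the two actually overlap (as the enclosing loop
// guarantees): keep only a head, keep only a tail, keep both (split), or
// dereference the extent entirely.
#include <cstdint>

enum class hole_case_sketch { drop_whole, keep_head, keep_tail, split_middle };

inline hole_case_sketch classify_sketch(uint64_t lo, uint64_t len,
                                        uint64_t off, uint64_t hlen)
{
  uint64_t end = off + hlen, lend = lo + len;
  if (lo < off) {
    return lend > end ? hole_case_sketch::split_middle   // hole lies inside the extent
                      : hole_case_sketch::keep_head;     // the tail is dereferenced
  }
  return lend <= end ? hole_case_sketch::drop_whole      // whole extent dereferenced
                     : hole_case_sketch::keep_tail;      // the head is dereferenced
}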
3084
3085BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3086 CollectionRef &c,
3087 uint64_t logical_offset,
3088 uint64_t blob_offset, uint64_t length, BlobRef b,
3089 old_extent_map_t *old_extents)
3090{
 3091 // We need a completely initialized Blob to increment its ref counters.
11fdf7f2 3092 ceph_assert(b->get_blob().get_logical_length() != 0);
7c673cae
FG
3093
 3094 // Do get_ref prior to punch_hole to prevent putting a reused blob into the
 3095 // old_extents list if we overwrite the blob completely.
3096 // This might happen during WAL overwrite.
3097 b->get_ref(onode->c, blob_offset, length);
3098
3099 if (old_extents) {
3100 punch_hole(c, logical_offset, length, old_extents);
3101 }
3102
3103 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3104 extent_map.insert(*le);
3105 if (spans_shard(logical_offset, length)) {
3106 request_reshard(logical_offset, logical_offset + length);
3107 }
3108 return le;
3109}
3110
3111BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3112 BlobRef lb,
3113 uint32_t blob_offset,
3114 uint32_t pos)
3115{
3116 auto cct = onode->c->store->cct; //used by dout
3117
3118 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3119 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3120 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3121 << dendl;
3122 BlobRef rb = onode->c->new_blob();
3123 lb->split(onode->c, blob_offset, rb.get());
3124
3125 for (auto ep = seek_lextent(pos);
3126 ep != extent_map.end() && ep->logical_offset < end_pos;
3127 ++ep) {
3128 if (ep->blob != lb) {
3129 continue;
3130 }
3131 if (ep->logical_offset < pos) {
3132 // split extent
3133 size_t left = pos - ep->logical_offset;
3134 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3135 extent_map.insert(*ne);
3136 ep->length = left;
3137 dout(30) << __func__ << " split " << *ep << dendl;
3138 dout(30) << __func__ << " to " << *ne << dendl;
3139 } else {
3140 // switch blob
11fdf7f2 3141 ceph_assert(ep->blob_offset >= blob_offset);
7c673cae
FG
3142
3143 ep->blob = rb;
3144 ep->blob_offset -= blob_offset;
3145 dout(30) << __func__ << " adjusted " << *ep << dendl;
3146 }
3147 }
3148 return rb;
3149}
3150
3151// Onode
3152
3153#undef dout_prefix
3154#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3155
3156void BlueStore::Onode::flush()
3157{
3158 if (flushing_count.load()) {
3159 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
11fdf7f2 3160 std::unique_lock l(flush_lock);
7c673cae
FG
3161 while (flushing_count.load()) {
3162 flush_cond.wait(l);
3163 }
3164 }
3165 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3166}
3167
3168// =======================================================
3169// WriteContext
3170
3171/// Checks for writes to the same pextent within a blob
3172bool BlueStore::WriteContext::has_conflict(
3173 BlobRef b,
3174 uint64_t loffs,
3175 uint64_t loffs_end,
3176 uint64_t min_alloc_size)
3177{
11fdf7f2
TL
3178 ceph_assert((loffs % min_alloc_size) == 0);
3179 ceph_assert((loffs_end % min_alloc_size) == 0);
7c673cae
FG
3180 for (auto w : writes) {
3181 if (b == w.b) {
11fdf7f2
TL
3182 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3183 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
7c673cae 3184 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 3185 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
3186 return true;
3187 }
3188 }
3189 }
3190 return false;
3191}
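
// Standalone sketch (not part of BlueStore) of the conflict test above: both
// writes are widened to min_alloc_size boundaries and then checked for
// overlap. p2align/p2roundup are reimplemented locally and, like the
// originals, assume a power-of-two alignment.
#include <cstdint>

inline uint64_t p2align_sketch(uint64_t x, uint64_t a)   { return x & ~(a - 1); }
inline uint64_t p2roundup_sketch(uint64_t x, uint64_t a) { return (x + a - 1) & ~(a - 1); }

inline bool writes_conflict_sketch(uint64_t a_begin, uint64_t a_end,   // already aligned
                                   uint64_t b_off, uint64_t b_len,
                                   uint64_t min_alloc_size)
{
  uint64_t b_begin = p2align_sketch(b_off, min_alloc_size);
  uint64_t b_end   = p2roundup_sketch(b_off + b_len, min_alloc_size);
  return (a_begin <= b_begin && a_end > b_begin) ||
         (a_begin >= b_begin && a_begin < b_end);
}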
3192
3193// =======================================================
3194
3195// DeferredBatch
3196#undef dout_prefix
3197#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3198
3199void BlueStore::DeferredBatch::prepare_write(
3200 CephContext *cct,
3201 uint64_t seq, uint64_t offset, uint64_t length,
3202 bufferlist::const_iterator& blp)
3203{
3204 _discard(cct, offset, length);
3205 auto i = iomap.insert(make_pair(offset, deferred_io()));
11fdf7f2 3206 ceph_assert(i.second); // this should be a new insertion
7c673cae
FG
3207 i.first->second.seq = seq;
3208 blp.copy(length, i.first->second.bl);
31f18b77
FG
3209 i.first->second.bl.reassign_to_mempool(
3210 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3211 dout(20) << __func__ << " seq " << seq
3212 << " 0x" << std::hex << offset << "~" << length
3213 << " crc " << i.first->second.bl.crc32c(-1)
3214 << std::dec << dendl;
3215 seq_bytes[seq] += length;
3216#ifdef DEBUG_DEFERRED
3217 _audit(cct);
3218#endif
3219}
3220
3221void BlueStore::DeferredBatch::_discard(
3222 CephContext *cct, uint64_t offset, uint64_t length)
3223{
3224 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3225 << std::dec << dendl;
3226 auto p = iomap.lower_bound(offset);
3227 if (p != iomap.begin()) {
3228 --p;
3229 auto end = p->first + p->second.bl.length();
3230 if (end > offset) {
3231 bufferlist head;
3232 head.substr_of(p->second.bl, 0, offset - p->first);
3233 dout(20) << __func__ << " keep head " << p->second.seq
3234 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3235 << " -> 0x" << head.length() << std::dec << dendl;
3236 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3237 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3238 if (end > offset + length) {
3239 bufferlist tail;
3240 tail.substr_of(p->second.bl, offset + length - p->first,
3241 end - (offset + length));
3242 dout(20) << __func__ << " keep tail " << p->second.seq
3243 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3244 << " -> 0x" << tail.length() << std::dec << dendl;
3245 auto &n = iomap[offset + length];
3246 n.bl.swap(tail);
3247 n.seq = p->second.seq;
3248 i->second -= length;
3249 } else {
3250 i->second -= end - offset;
3251 }
11fdf7f2 3252 ceph_assert(i->second >= 0);
7c673cae
FG
3253 p->second.bl.swap(head);
3254 }
3255 ++p;
3256 }
3257 while (p != iomap.end()) {
3258 if (p->first >= offset + length) {
3259 break;
3260 }
3261 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3262 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3263 auto end = p->first + p->second.bl.length();
3264 if (end > offset + length) {
3265 unsigned drop_front = offset + length - p->first;
3266 unsigned keep_tail = end - (offset + length);
3267 dout(20) << __func__ << " truncate front " << p->second.seq
3268 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3269 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3270 << " to 0x" << (offset + length) << "~" << keep_tail
3271 << std::dec << dendl;
3272 auto &s = iomap[offset + length];
3273 s.seq = p->second.seq;
3274 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3275 i->second -= drop_front;
3276 } else {
3277 dout(20) << __func__ << " drop " << p->second.seq
3278 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3279 << std::dec << dendl;
3280 i->second -= p->second.bl.length();
3281 }
11fdf7f2 3282 ceph_assert(i->second >= 0);
7c673cae
FG
3283 p = iomap.erase(p);
3284 }
3285}
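
// Standalone sketch (not part of BlueStore) of the head/tail preservation in
// _discard() above: when a buffered write [p, p+size) overlaps the discarded
// range [off, off+len), the bytes before `off` survive in place as a head and
// the bytes past `off+len` survive as a new entry keyed at off+len. A
// std::string stands in for the bufferlist payload, and the per-sequence byte
// accounting kept by the real code is omitted.
#include <cstdint>
#include <map>
#include <string>
#include <utility>

void discard_sketch(std::map<uint64_t, std::string>& iomap, uint64_t off, uint64_t len)
{
  auto p = iomap.lower_bound(off);
  if (p != iomap.begin()) {
    --p;
    uint64_t end = p->first + p->second.size();
    if (end > off) {
      std::string head = p->second.substr(0, off - p->first);
      if (end > off + len) {                       // range lies strictly inside: keep tail too
        iomap[off + len] = p->second.substr(off + len - p->first);
      }
      p->second = std::move(head);
    }
    ++p;
  }
  while (p != iomap.end() && p->first < off + len) {
    uint64_t end = p->first + p->second.size();
    if (end > off + len) {                         // keep the part past the discarded range
      iomap[off + len] = p->second.substr(off + len - p->first);
    }
    p = iomap.erase(p);
  }
}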
3286
3287void BlueStore::DeferredBatch::_audit(CephContext *cct)
3288{
3289 map<uint64_t,int> sb;
3290 for (auto p : seq_bytes) {
3291 sb[p.first] = 0; // make sure we have the same set of keys
3292 }
3293 uint64_t pos = 0;
3294 for (auto& p : iomap) {
11fdf7f2 3295 ceph_assert(p.first >= pos);
7c673cae
FG
3296 sb[p.second.seq] += p.second.bl.length();
3297 pos = p.first + p.second.bl.length();
3298 }
11fdf7f2 3299 ceph_assert(sb == seq_bytes);
7c673cae
FG
3300}
3301
3302
3303// Collection
3304
3305#undef dout_prefix
3306#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3307
11fdf7f2
TL
3308BlueStore::Collection::Collection(BlueStore *store_, Cache *c, coll_t cid)
3309 : CollectionImpl(cid),
3310 store(store_),
7c673cae 3311 cache(c),
7c673cae
FG
3312 lock("BlueStore::Collection::lock", true, false),
3313 exists(true),
11fdf7f2
TL
3314 onode_map(c),
3315 commit_queue(nullptr)
3316{
3317}
3318
3319bool BlueStore::Collection::flush_commit(Context *c)
3320{
3321 return osr->flush_commit(c);
3322}
3323
3324void BlueStore::Collection::flush()
3325{
3326 osr->flush();
3327}
3328
3329void BlueStore::Collection::flush_all_but_last()
7c673cae 3330{
11fdf7f2 3331 osr->flush_all_but_last();
7c673cae
FG
3332}
3333
3334void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3335{
11fdf7f2 3336 ceph_assert(!b->shared_blob);
7c673cae
FG
3337 const bluestore_blob_t& blob = b->get_blob();
3338 if (!blob.is_shared()) {
3339 b->shared_blob = new SharedBlob(this);
3340 return;
3341 }
3342
3343 b->shared_blob = shared_blob_set.lookup(sbid);
3344 if (b->shared_blob) {
3345 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3346 << std::dec << " had " << *b->shared_blob << dendl;
3347 } else {
3348 b->shared_blob = new SharedBlob(sbid, this);
3349 shared_blob_set.add(this, b->shared_blob.get());
3350 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3351 << std::dec << " opened " << *b->shared_blob
3352 << dendl;
3353 }
3354}
3355
3356void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3357{
3358 if (!sb->is_loaded()) {
3359
3360 bufferlist v;
3361 string key;
3362 auto sbid = sb->get_sbid();
3363 get_shared_blob_key(sbid, &key);
3364 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3365 if (r < 0) {
3366 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3367 << std::dec << " not found at key "
3368 << pretty_binary_string(key) << dendl;
11fdf7f2 3369 ceph_abort_msg("uh oh, missing shared_blob");
7c673cae
FG
3370 }
3371
3372 sb->loaded = true;
3373 sb->persistent = new bluestore_shared_blob_t(sbid);
11fdf7f2
TL
3374 auto p = v.cbegin();
3375 decode(*(sb->persistent), p);
7c673cae
FG
3376 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3377 << std::dec << " loaded shared_blob " << *sb << dendl;
3378 }
3379}
3380
3381void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3382{
7c673cae 3383 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
11fdf7f2 3384 ceph_assert(!b->shared_blob->is_loaded());
7c673cae
FG
3385
3386 // update blob
31f18b77 3387 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 3388 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
3389
3390 // update shared blob
3391 b->shared_blob->loaded = true;
3392 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3393 shared_blob_set.add(this, b->shared_blob.get());
3394 for (auto p : blob.get_extents()) {
3395 if (p.is_valid()) {
3396 b->shared_blob->get_ref(
3397 p.offset,
3398 p.length);
3399 }
3400 }
3401 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3402}
3403
31f18b77
FG
3404uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3405{
3406 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
11fdf7f2 3407 ceph_assert(sb->is_loaded());
31f18b77
FG
3408
3409 uint64_t sbid = sb->get_sbid();
3410 shared_blob_set.remove(sb);
3411 sb->loaded = false;
3412 delete sb->persistent;
3413 sb->sbid_unloaded = 0;
3414 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3415 return sbid;
3416}
3417
7c673cae
FG
3418BlueStore::OnodeRef BlueStore::Collection::get_onode(
3419 const ghobject_t& oid,
3420 bool create)
3421{
11fdf7f2 3422 ceph_assert(create ? lock.is_wlocked() : lock.is_locked());
7c673cae
FG
3423
3424 spg_t pgid;
3425 if (cid.is_pg(&pgid)) {
3426 if (!oid.match(cnode.bits, pgid.ps())) {
3427 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3428 << pgid << " bits " << cnode.bits << dendl;
3429 ceph_abort();
3430 }
3431 }
3432
3433 OnodeRef o = onode_map.lookup(oid);
3434 if (o)
3435 return o;
3436
31f18b77 3437 mempool::bluestore_cache_other::string key;
7c673cae
FG
3438 get_object_key(store->cct, oid, &key);
3439
3440 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3441 << pretty_binary_string(key) << dendl;
3442
3443 bufferlist v;
3444 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3445 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3446 Onode *on;
3447 if (v.length() == 0) {
11fdf7f2 3448 ceph_assert(r == -ENOENT);
7c673cae
FG
3449 if (!store->cct->_conf->bluestore_debug_misc &&
3450 !create)
3451 return OnodeRef();
3452
3453 // new object, new onode
3454 on = new Onode(this, oid, key);
3455 } else {
3456 // loaded
11fdf7f2 3457 ceph_assert(r >= 0);
7c673cae
FG
3458 on = new Onode(this, oid, key);
3459 on->exists = true;
11fdf7f2 3460 auto p = v.front().begin_deep();
7c673cae 3461 on->onode.decode(p);
3efd9988
FG
3462 for (auto& i : on->onode.attrs) {
3463 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3464 }
7c673cae
FG
3465
3466 // initialize extent_map
3467 on->extent_map.decode_spanning_blobs(p);
3468 if (on->onode.extent_map_shards.empty()) {
3469 denc(on->extent_map.inline_bl, p);
3470 on->extent_map.decode_some(on->extent_map.inline_bl);
3efd9988
FG
3471 on->extent_map.inline_bl.reassign_to_mempool(
3472 mempool::mempool_bluestore_cache_other);
7c673cae
FG
3473 } else {
3474 on->extent_map.init_shards(false, false);
3475 }
3476 }
3477 o.reset(on);
3478 return onode_map.add(oid, o);
3479}
3480
3481void BlueStore::Collection::split_cache(
3482 Collection *dest)
3483{
3484 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3485
3486 // lock (one or both) cache shards
3487 std::lock(cache->lock, dest->cache->lock);
11fdf7f2
TL
3488 std::lock_guard l(cache->lock, std::adopt_lock);
3489 std::lock_guard l2(dest->cache->lock, std::adopt_lock);
7c673cae
FG
3490
3491 int destbits = dest->cnode.bits;
3492 spg_t destpg;
3493 bool is_pg = dest->cid.is_pg(&destpg);
11fdf7f2 3494 ceph_assert(is_pg);
7c673cae
FG
3495
3496 auto p = onode_map.onode_map.begin();
3497 while (p != onode_map.onode_map.end()) {
11fdf7f2 3498 OnodeRef o = p->second;
7c673cae
FG
3499 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3500 // onode does not belong to this child
11fdf7f2
TL
3501 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
3502 << dendl;
7c673cae
FG
3503 ++p;
3504 } else {
7c673cae
FG
3505 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3506 << dendl;
3507
3508 cache->_rm_onode(p->second);
3509 p = onode_map.onode_map.erase(p);
3510
3511 o->c = dest;
3512 dest->cache->_add_onode(o, 1);
3513 dest->onode_map.onode_map[o->oid] = o;
3514 dest->onode_map.cache = dest->cache;
3515
3516 // move over shared blobs and buffers. cover shared blobs from
3517 // both extent map and spanning blob map (the full extent map
3518 // may not be faulted in)
3519 vector<SharedBlob*> sbvec;
3520 for (auto& e : o->extent_map.extent_map) {
3521 sbvec.push_back(e.blob->shared_blob.get());
3522 }
3523 for (auto& b : o->extent_map.spanning_blob_map) {
3524 sbvec.push_back(b.second->shared_blob.get());
3525 }
3526 for (auto sb : sbvec) {
3527 if (sb->coll == dest) {
3528 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3529 << dendl;
3530 continue;
3531 }
3532 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
31f18b77
FG
3533 if (sb->get_sbid()) {
3534 ldout(store->cct, 20) << __func__
3535 << " moving registration " << *sb << dendl;
3536 shared_blob_set.remove(sb);
3537 dest->shared_blob_set.add(dest, sb);
3538 }
3efd9988 3539 sb->coll = dest;
7c673cae 3540 if (dest->cache != cache) {
7c673cae
FG
3541 for (auto& i : sb->bc.buffer_map) {
3542 if (!i.second->is_writing()) {
3543 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3544 << dendl;
3545 dest->cache->_move_buffer(cache, i.second.get());
3546 }
3547 }
3548 }
3549 }
7c673cae
FG
3550 }
3551 }
3552}
3553
7c673cae
FG
3554// =======================================================
3555
91327a77
AA
3556// MempoolThread
3557
3558#undef dout_prefix
3559#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
3560
7c673cae
FG
3561void *BlueStore::MempoolThread::entry()
3562{
11fdf7f2
TL
3563 std::unique_lock l(lock);
3564
3565 std::list<std::shared_ptr<PriorityCache::PriCache>> caches;
3566 binned_kv_cache = store->db->get_priority_cache();
3567 if (binned_kv_cache != nullptr) {
3568 caches.push_back(binned_kv_cache);
3569 }
3570 caches.push_back(meta_cache);
3571 caches.push_back(data_cache);
31f18b77 3572
91327a77
AA
3573 autotune_cache_size = store->osd_memory_cache_min;
3574
3575 utime_t next_balance = ceph_clock_now();
3576 utime_t next_resize = ceph_clock_now();
31f18b77 3577
91327a77
AA
3578 bool interval_stats_trim = false;
3579 bool interval_stats_resize = false;
3580 while (!stop) {
91327a77
AA
3581 // Before we trim, check and see if it's time to rebalance/resize.
3582 double autotune_interval = store->cache_autotune_interval;
3583 double resize_interval = store->osd_memory_cache_resize_interval;
3584
3585 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
11fdf7f2
TL
3586 _adjust_cache_settings();
3587
91327a77
AA
3588 // Log events at 5 instead of 20 when balance happens.
3589 interval_stats_resize = true;
3590 interval_stats_trim = true;
3591 if (store->cache_autotune) {
3592 _balance_cache(caches);
3593 }
31f18b77 3594
91327a77
AA
3595 next_balance = ceph_clock_now();
3596 next_balance += autotune_interval;
3597 }
3598 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
3599 if (ceph_using_tcmalloc() && store->cache_autotune) {
3600 _tune_cache_size(interval_stats_resize);
3601 interval_stats_resize = false;
3602 }
3603 next_resize = ceph_clock_now();
3604 next_resize += resize_interval;
31f18b77
FG
3605 }
3606
91327a77
AA
3607 // Now Trim
3608 _trim_shards(interval_stats_trim);
3609 interval_stats_trim = false;
31f18b77 3610
91327a77 3611 store->_update_cache_logger();
11fdf7f2
TL
3612 auto wait = ceph::make_timespan(
3613 store->cct->_conf->bluestore_cache_trim_interval);
3614 cond.wait_for(l, wait);
7c673cae
FG
3615 }
3616 stop = false;
3617 return NULL;
3618}
3619
91327a77
AA
3620void BlueStore::MempoolThread::_adjust_cache_settings()
3621{
11fdf7f2
TL
3622 if (binned_kv_cache != nullptr) {
3623 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
3624 }
3625 meta_cache->set_cache_ratio(store->cache_meta_ratio);
3626 data_cache->set_cache_ratio(store->cache_data_ratio);
91327a77
AA
3627}
3628
3629void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
3630{
3631 auto cct = store->cct;
3632 size_t num_shards = store->cache_shards.size();
3633
3634 int64_t kv_used = store->db->get_cache_usage();
11fdf7f2
TL
3635 int64_t meta_used = meta_cache->_get_used_bytes();
3636 int64_t data_used = data_cache->_get_used_bytes();
91327a77
AA
3637
3638 uint64_t cache_size = store->cache_size;
3639 int64_t kv_alloc =
11fdf7f2 3640 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
91327a77 3641 int64_t meta_alloc =
11fdf7f2 3642 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
91327a77 3643 int64_t data_alloc =
11fdf7f2 3644 static_cast<int64_t>(store->cache_data_ratio * cache_size);
91327a77 3645
11fdf7f2 3646 if (binned_kv_cache != nullptr && store->cache_autotune) {
91327a77
AA
3647 cache_size = autotune_cache_size;
3648
11fdf7f2
TL
3649 kv_alloc = binned_kv_cache->get_committed_size();
3650 meta_alloc = meta_cache->get_committed_size();
3651 data_alloc = data_cache->get_committed_size();
91327a77
AA
3652 }
3653
3654 if (interval_stats) {
3655 ldout(cct, 5) << __func__ << " cache_size: " << cache_size
3656 << " kv_alloc: " << kv_alloc
3657 << " kv_used: " << kv_used
3658 << " meta_alloc: " << meta_alloc
3659 << " meta_used: " << meta_used
3660 << " data_alloc: " << data_alloc
3661 << " data_used: " << data_used << dendl;
3662 } else {
3663 ldout(cct, 20) << __func__ << " cache_size: " << cache_size
3664 << " kv_alloc: " << kv_alloc
3665 << " kv_used: " << kv_used
3666 << " meta_alloc: " << meta_alloc
3667 << " meta_used: " << meta_used
3668 << " data_alloc: " << data_alloc
3669 << " data_used: " << data_used << dendl;
3670 }
3671
3672 uint64_t max_shard_onodes = static_cast<uint64_t>(
11fdf7f2 3673 (meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode());
91327a77
AA
3674 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);
3675
3676 ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
3677 << " max_shard_buffer: " << max_shard_buffer << dendl;
3678
3679 for (auto i : store->cache_shards) {
3680 i->trim(max_shard_onodes, max_shard_buffer);
3681 }
 3682}
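// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the upstream source): the
// per-shard trim limits computed above are plain division.  The helper below
// assumes a hypothetical bytes-per-onode estimate purely for illustration.
#if 0
#include <cstdint>
#include <cstddef>

static uint64_t example_max_shard_onodes(int64_t meta_alloc,    // bytes budgeted for metadata
                                         size_t num_shards,     // number of cache shards
                                         double bytes_per_onode)
{
  // e.g. meta_alloc = 1 GiB, num_shards = 1, bytes_per_onode ~= 4096
  //      -> roughly 262144 onodes may be kept per shard before trimming
  return static_cast<uint64_t>(
    (meta_alloc / static_cast<double>(num_shards)) / bytes_per_onode);
}
#endif
// ---------------------------------------------------------------------------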
3683
3684void BlueStore::MempoolThread::_tune_cache_size(bool interval_stats)
3685{
3686 auto cct = store->cct;
3687 uint64_t target = store->osd_memory_target;
3688 uint64_t base = store->osd_memory_base;
3689 double fragmentation = store->osd_memory_expected_fragmentation;
91327a77 3690 uint64_t cache_min = store->osd_memory_cache_min;
f64942e4
AA
3691 uint64_t cache_max = cache_min;
3692 uint64_t limited_target = (1.0 - fragmentation) * target;
3693 if (limited_target > base + cache_min) {
3694 cache_max = limited_target - base;
3695 }
91327a77
AA
3696
3697 size_t heap_size = 0;
3698 size_t unmapped = 0;
3699 uint64_t mapped = 0;
3700
3701 ceph_heap_release_free_memory();
3702 ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
3703 ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
3704 mapped = heap_size - unmapped;
3705
3706 uint64_t new_size = autotune_cache_size;
3707 new_size = (new_size < cache_max) ? new_size : cache_max;
3708 new_size = (new_size > cache_min) ? new_size : cache_min;
3709
3710 // Approach the min/max slowly, but bounce away quickly.
3711 if ((uint64_t) mapped < target) {
3712 double ratio = 1 - ((double) mapped / target);
3713 new_size += ratio * (cache_max - new_size);
3714 } else {
3715 double ratio = 1 - ((double) target / mapped);
3716 new_size -= ratio * (new_size - cache_min);
3717 }
3718
3719 if (interval_stats) {
3720 ldout(cct, 5) << __func__
3721 << " target: " << target
3722 << " heap: " << heap_size
3723 << " unmapped: " << unmapped
3724 << " mapped: " << mapped
3725 << " old cache_size: " << autotune_cache_size
3726 << " new cache size: " << new_size << dendl;
3727 } else {
3728 ldout(cct, 20) << __func__
3729 << " target: " << target
3730 << " heap: " << heap_size
3731 << " unmapped: " << unmapped
3732 << " mapped: " << mapped
3733 << " old cache_size: " << autotune_cache_size
3734 << " new cache size: " << new_size << dendl;
3735 }
3736 autotune_cache_size = new_size;
3737}
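// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the upstream source): the
// "approach the min/max slowly, bounce away quickly" adjustment above, as
// standalone arithmetic with made-up example values.
#if 0
#include <cstdint>

static uint64_t example_tune(uint64_t new_size, uint64_t mapped, uint64_t target,
                             uint64_t cache_min, uint64_t cache_max)
{
  if (mapped < target) {
    // e.g. mapped = 3 GiB, target = 4 GiB -> ratio = 0.25, so the cache grows
    // by 25% of the remaining headroom up to cache_max
    double ratio = 1.0 - (double)mapped / target;
    new_size += ratio * (cache_max - new_size);
  } else {
    // e.g. mapped = 5 GiB, target = 4 GiB -> ratio = 0.2, so the cache shrinks
    // by 20% of the distance down to cache_min
    double ratio = 1.0 - (double)target / mapped;
    new_size -= ratio * (new_size - cache_min);
  }
  return new_size;
}
#endif
// ---------------------------------------------------------------------------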
3738
3739void BlueStore::MempoolThread::_balance_cache(
11fdf7f2 3740 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches)
91327a77
AA
3741{
3742 int64_t mem_avail = autotune_cache_size;
11fdf7f2
TL
 3743 /* Each cache is going to get at least 1 chunk's worth of memory from get_chunk,
 3744 * so shrink the available memory here to compensate. Don't shrink the amount of
 3745 * memory below 0, however.
 3746 */
3747 mem_avail -= PriorityCache::get_chunk(1, autotune_cache_size) * caches.size();
3748 if (mem_avail < 0) {
3749 mem_avail = 0;
3750 }
91327a77
AA
3751
3752 // Assign memory for each priority level
3753 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
3754 ldout(store->cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;
3755 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
3756 _balance_cache_pri(&mem_avail, caches, pri);
3757 }
3758 // Assign any leftover memory based on the default ratios.
3759 if (mem_avail > 0) {
3760 for (auto it = caches.begin(); it != caches.end(); it++) {
3761 int64_t fair_share =
3762 static_cast<int64_t>((*it)->get_cache_ratio() * mem_avail);
3763 if (fair_share > 0) {
3764 (*it)->add_cache_bytes(PriorityCache::Priority::LAST, fair_share);
3765 }
3766 }
3767 }
3768 // assert if we assigned more memory than is available.
11fdf7f2 3769 ceph_assert(mem_avail >= 0);
91327a77
AA
3770
3771 // Finally commit the new cache sizes
3772 for (auto it = caches.begin(); it != caches.end(); it++) {
11fdf7f2 3773 (*it)->commit_cache_size(autotune_cache_size);
91327a77
AA
3774 }
3775}
3776
3777void BlueStore::MempoolThread::_balance_cache_pri(int64_t *mem_avail,
11fdf7f2
TL
3778 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches,
3779 PriorityCache::Priority pri)
91327a77 3780{
11fdf7f2 3781 std::list<std::shared_ptr<PriorityCache::PriCache>> tmp_caches = caches;
91327a77
AA
3782 double cur_ratios = 0;
3783 double new_ratios = 0;
3784
3785 // Zero this priority's bytes, sum the initial ratios.
3786 for (auto it = tmp_caches.begin(); it != tmp_caches.end(); it++) {
3787 (*it)->set_cache_bytes(pri, 0);
3788 cur_ratios += (*it)->get_cache_ratio();
3789 }
3790
 3791 // For this priority, loop until caches are satisfied or we run out of memory.
3792 // Since we can't allocate fractional bytes, stop if we have fewer bytes left
3793 // than the number of participating caches.
3794 while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
3795 uint64_t total_assigned = 0;
3796
3797 for (auto it = tmp_caches.begin(); it != tmp_caches.end(); ) {
11fdf7f2 3798 int64_t cache_wants = (*it)->request_cache_bytes(pri, autotune_cache_size);
91327a77
AA
3799
3800 // Usually the ratio should be set to the fraction of the current caches'
3801 // assigned ratio compared to the total ratio of all caches that still
3802 // want memory. There is a special case where the only caches left are
3803 // all assigned 0% ratios but still want memory. In that case, give
3804 // them an equal shot at the remaining memory for this priority.
3805 double ratio = 1.0 / tmp_caches.size();
3806 if (cur_ratios > 0) {
3807 ratio = (*it)->get_cache_ratio() / cur_ratios;
3808 }
3809 int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
3810
3811 if (cache_wants > fair_share) {
3812 // If we want too much, take what we can get but stick around for more
3813 (*it)->add_cache_bytes(pri, fair_share);
3814 total_assigned += fair_share;
3815
3816 new_ratios += (*it)->get_cache_ratio();
3817 ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
3818 << " wanted: " << cache_wants << " fair_share: " << fair_share
3819 << " mem_avail: " << *mem_avail
3820 << " staying in list. Size: " << tmp_caches.size()
3821 << dendl;
3822 ++it;
3823 } else {
3824 // Otherwise assign only what we want
3825 if (cache_wants > 0) {
3826 (*it)->add_cache_bytes(pri, cache_wants);
3827 total_assigned += cache_wants;
3828
3829 ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
3830 << " wanted: " << cache_wants << " fair_share: " << fair_share
3831 << " mem_avail: " << *mem_avail
3832 << " removing from list. New size: " << tmp_caches.size() - 1
3833 << dendl;
3834
3835 }
3836 // Either the cache didn't want anything or got what it wanted, so remove it from the tmp list.
3837 it = tmp_caches.erase(it);
3838 }
3839 }
3840 // Reset the ratios
3841 *mem_avail -= total_assigned;
3842 cur_ratios = new_ratios;
3843 new_ratios = 0;
3844 }
3845}
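// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the upstream source): the
// fair-share split used inside one priority round above, with made-up numbers.
#if 0
#include <cstdint>

static void example_fair_share()
{
  int64_t mem_avail = 600 << 20;        // 600 MiB still unassigned
  double ratio_a = 0.4, ratio_b = 0.2;  // two caches still asking for memory
  double cur_ratios = ratio_a + ratio_b;

  // cache A gets 0.4/0.6 of the pool, cache B gets 0.2/0.6 of it
  int64_t fair_a = static_cast<int64_t>(mem_avail * (ratio_a / cur_ratios)); // ~400 MiB
  int64_t fair_b = static_cast<int64_t>(mem_avail * (ratio_b / cur_ratios)); // ~200 MiB

  // A cache wanting less than its fair share takes what it wants and drops
  // out of the list; one wanting more takes the fair share and stays in for
  // the next pass over whatever mem_avail remains.
  (void)fair_a; (void)fair_b;
}
#endif
// ---------------------------------------------------------------------------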
3846
7c673cae
FG
3847// =======================================================
3848
31f18b77
FG
3849// OmapIteratorImpl
3850
3851#undef dout_prefix
3852#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3853
3854BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3855 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3856 : c(c), o(o), it(it)
3857{
3858 RWLock::RLocker l(c->lock);
3859 if (o->onode.has_omap()) {
3860 get_omap_key(o->onode.nid, string(), &head);
3861 get_omap_tail(o->onode.nid, &tail);
3862 it->lower_bound(head);
3863 }
3864}
3865
11fdf7f2
TL
3866string BlueStore::OmapIteratorImpl::_stringify() const
3867{
3868 stringstream s;
3869 s << " omap_iterator(cid = " << c->cid
3870 <<", oid = " << o->oid << ")";
3871 return s.str();
3872}
3873
31f18b77
FG
3874int BlueStore::OmapIteratorImpl::seek_to_first()
3875{
3876 RWLock::RLocker l(c->lock);
11fdf7f2 3877 auto start1 = mono_clock::now();
31f18b77
FG
3878 if (o->onode.has_omap()) {
3879 it->lower_bound(head);
3880 } else {
3881 it = KeyValueDB::Iterator();
3882 }
11fdf7f2
TL
3883 c->store->log_latency_fn(
3884 l_bluestore_omap_seek_to_first_lat,
3885 mono_clock::now() - start1,
3886 [&] (const ceph::timespan& lat) {
3887 return ", lat = " + timespan_str(lat) + _stringify();
3888 }
3889 );
3890
31f18b77
FG
3891 return 0;
3892}
3893
3894int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3895{
3896 RWLock::RLocker l(c->lock);
11fdf7f2 3897 auto start1 = mono_clock::now();
31f18b77
FG
3898 if (o->onode.has_omap()) {
3899 string key;
3900 get_omap_key(o->onode.nid, after, &key);
3901 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3902 << pretty_binary_string(key) << dendl;
3903 it->upper_bound(key);
3904 } else {
3905 it = KeyValueDB::Iterator();
3906 }
11fdf7f2
TL
3907 c->store->log_latency_fn(
3908 l_bluestore_omap_upper_bound_lat,
3909 mono_clock::now() - start1,
3910 [&] (const ceph::timespan& lat) {
3911 return ", after = " + after + ", lat = " + timespan_str(lat) +
3912 _stringify();
3913 }
3914 );
31f18b77
FG
3915 return 0;
3916}
3917
3918int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3919{
3920 RWLock::RLocker l(c->lock);
11fdf7f2 3921 auto start1 = mono_clock::now();
31f18b77
FG
3922 if (o->onode.has_omap()) {
3923 string key;
3924 get_omap_key(o->onode.nid, to, &key);
3925 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3926 << pretty_binary_string(key) << dendl;
3927 it->lower_bound(key);
3928 } else {
3929 it = KeyValueDB::Iterator();
3930 }
11fdf7f2
TL
3931 c->store->log_latency_fn(
3932 l_bluestore_omap_lower_bound_lat,
3933 mono_clock::now() - start1,
3934 [&] (const ceph::timespan& lat) {
3935 return ", to = " + to + ", lat = " + timespan_str(lat) +
3936 _stringify();
3937 }
3938 );
31f18b77
FG
3939 return 0;
3940}
3941
3942bool BlueStore::OmapIteratorImpl::valid()
3943{
3944 RWLock::RLocker l(c->lock);
3945 bool r = o->onode.has_omap() && it && it->valid() &&
3946 it->raw_key().second <= tail;
3947 if (it && it->valid()) {
3948 ldout(c->store->cct,20) << __func__ << " is at "
3949 << pretty_binary_string(it->raw_key().second)
3950 << dendl;
3951 }
3952 return r;
3953}
3954
11fdf7f2 3955int BlueStore::OmapIteratorImpl::next()
31f18b77 3956{
11fdf7f2 3957 int r = -1;
31f18b77 3958 RWLock::RLocker l(c->lock);
11fdf7f2 3959 auto start1 = mono_clock::now();
31f18b77
FG
3960 if (o->onode.has_omap()) {
3961 it->next();
11fdf7f2 3962 r = 0;
31f18b77 3963 }
11fdf7f2
TL
3964 c->store->log_latency_fn(
3965 l_bluestore_omap_next_lat,
3966 mono_clock::now() - start1,
3967 [&] (const ceph::timespan& lat) {
3968 return ", lat = " + timespan_str(lat) + _stringify();
3969 }
3970 );
3971
3972 return r;
31f18b77
FG
3973}
3974
3975string BlueStore::OmapIteratorImpl::key()
3976{
3977 RWLock::RLocker l(c->lock);
11fdf7f2 3978 ceph_assert(it->valid());
31f18b77
FG
3979 string db_key = it->raw_key().second;
3980 string user_key;
3981 decode_omap_key(db_key, &user_key);
3982 return user_key;
3983}
3984
3985bufferlist BlueStore::OmapIteratorImpl::value()
3986{
3987 RWLock::RLocker l(c->lock);
11fdf7f2 3988 ceph_assert(it->valid());
31f18b77
FG
3989 return it->value();
3990}
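// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the upstream source): how a
// caller would typically walk an object's omap with the iterator implemented
// above, using only the methods it defines.  `iter` is assumed to be an omap
// iterator handle already obtained from the store (e.g. via get_omap_iterator).
#if 0
  for (iter->seek_to_first(); iter->valid(); iter->next()) {
    std::string k = iter->key();     // user key, decoded from the raw db key
    bufferlist v  = iter->value();   // raw value bytes
    // ... consume k and v ...
  }
#endif
// ---------------------------------------------------------------------------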
3991
3992
3993// =====================================
3994
7c673cae
FG
3995#undef dout_prefix
3996#define dout_prefix *_dout << "bluestore(" << path << ") "
3997
3998
3999static void aio_cb(void *priv, void *priv2)
4000{
4001 BlueStore *store = static_cast<BlueStore*>(priv);
4002 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4003 c->aio_finish(store);
4004}
4005
11fdf7f2
TL
4006static void discard_cb(void *priv, void *priv2)
4007{
4008 BlueStore *store = static_cast<BlueStore*>(priv);
4009 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4010 store->handle_discard(*tmp);
4011}
4012
4013void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4014{
4015 dout(10) << __func__ << dendl;
4016 ceph_assert(alloc);
4017 alloc->release(to_release);
4018}
4019
7c673cae
FG
4020BlueStore::BlueStore(CephContext *cct, const string& path)
4021 : ObjectStore(cct, path),
4022 throttle_bytes(cct, "bluestore_throttle_bytes",
4023 cct->_conf->bluestore_throttle_bytes),
4024 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
4025 cct->_conf->bluestore_throttle_bytes +
4026 cct->_conf->bluestore_throttle_deferred_bytes),
181888fb 4027 deferred_finisher(cct, "defered_finisher", "dfin"),
11fdf7f2 4028 finisher(cct, "commit_finisher", "cfin"),
7c673cae 4029 kv_sync_thread(this),
31f18b77 4030 kv_finalize_thread(this),
7c673cae
FG
4031 mempool_thread(this)
4032{
4033 _init_logger();
11fdf7f2 4034 cct->_conf.add_observer(this);
7c673cae 4035 set_cache_shards(1);
7c673cae
FG
4036}
4037
4038BlueStore::BlueStore(CephContext *cct,
4039 const string& path,
4040 uint64_t _min_alloc_size)
4041 : ObjectStore(cct, path),
4042 throttle_bytes(cct, "bluestore_throttle_bytes",
4043 cct->_conf->bluestore_throttle_bytes),
4044 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
4045 cct->_conf->bluestore_throttle_bytes +
4046 cct->_conf->bluestore_throttle_deferred_bytes),
181888fb 4047 deferred_finisher(cct, "defered_finisher", "dfin"),
11fdf7f2 4048 finisher(cct, "commit_finisher", "cfin"),
7c673cae 4049 kv_sync_thread(this),
31f18b77 4050 kv_finalize_thread(this),
7c673cae
FG
4051 min_alloc_size(_min_alloc_size),
4052 min_alloc_size_order(ctz(_min_alloc_size)),
4053 mempool_thread(this)
4054{
4055 _init_logger();
11fdf7f2 4056 cct->_conf.add_observer(this);
7c673cae 4057 set_cache_shards(1);
7c673cae
FG
4058}
4059
4060BlueStore::~BlueStore()
4061{
11fdf7f2 4062 cct->_conf.remove_observer(this);
7c673cae 4063 _shutdown_logger();
11fdf7f2
TL
4064 ceph_assert(!mounted);
4065 ceph_assert(db == NULL);
4066 ceph_assert(bluefs == NULL);
4067 ceph_assert(fsid_fd < 0);
4068 ceph_assert(path_fd < 0);
7c673cae
FG
4069 for (auto i : cache_shards) {
4070 delete i;
4071 }
4072 cache_shards.clear();
4073}
4074
4075const char **BlueStore::get_tracked_conf_keys() const
4076{
4077 static const char* KEYS[] = {
4078 "bluestore_csum_type",
4079 "bluestore_compression_mode",
4080 "bluestore_compression_algorithm",
4081 "bluestore_compression_min_blob_size",
4082 "bluestore_compression_min_blob_size_ssd",
4083 "bluestore_compression_min_blob_size_hdd",
4084 "bluestore_compression_max_blob_size",
4085 "bluestore_compression_max_blob_size_ssd",
4086 "bluestore_compression_max_blob_size_hdd",
c07f9fc5 4087 "bluestore_compression_required_ratio",
7c673cae
FG
4088 "bluestore_max_alloc_size",
4089 "bluestore_prefer_deferred_size",
181888fb
FG
4090 "bluestore_prefer_deferred_size_hdd",
4091 "bluestore_prefer_deferred_size_ssd",
31f18b77
FG
4092 "bluestore_deferred_batch_ops",
4093 "bluestore_deferred_batch_ops_hdd",
4094 "bluestore_deferred_batch_ops_ssd",
7c673cae
FG
4095 "bluestore_throttle_bytes",
4096 "bluestore_throttle_deferred_bytes",
4097 "bluestore_throttle_cost_per_io_hdd",
4098 "bluestore_throttle_cost_per_io_ssd",
4099 "bluestore_throttle_cost_per_io",
4100 "bluestore_max_blob_size",
4101 "bluestore_max_blob_size_ssd",
4102 "bluestore_max_blob_size_hdd",
11fdf7f2
TL
4103 "osd_memory_target",
4104 "osd_memory_target_cgroup_limit_ratio",
4105 "osd_memory_base",
4106 "osd_memory_cache_min",
4107 "bluestore_cache_autotune",
4108 "bluestore_cache_autotune_interval",
81eedcae
TL
4109 "bluestore_no_per_pool_stats_tolerance",
4110 "bluestore_warn_on_legacy_statfs",
7c673cae
FG
4111 NULL
4112 };
4113 return KEYS;
4114}
4115
11fdf7f2 4116void BlueStore::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
4117 const std::set<std::string> &changed)
4118{
81eedcae
TL
4119 if (changed.count("bluestore_no_per_pool_stats_tolerance") ||
4120 changed.count("bluestore_warn_on_legacy_statfs")) {
4121 _check_legacy_statfs_alert();
4122 }
4123
7c673cae
FG
4124 if (changed.count("bluestore_csum_type")) {
4125 _set_csum();
4126 }
4127 if (changed.count("bluestore_compression_mode") ||
4128 changed.count("bluestore_compression_algorithm") ||
4129 changed.count("bluestore_compression_min_blob_size") ||
4130 changed.count("bluestore_compression_max_blob_size")) {
4131 if (bdev) {
4132 _set_compression();
4133 }
4134 }
4135 if (changed.count("bluestore_max_blob_size") ||
4136 changed.count("bluestore_max_blob_size_ssd") ||
4137 changed.count("bluestore_max_blob_size_hdd")) {
4138 if (bdev) {
4139 // only after startup
4140 _set_blob_size();
4141 }
4142 }
4143 if (changed.count("bluestore_prefer_deferred_size") ||
181888fb
FG
4144 changed.count("bluestore_prefer_deferred_size_hdd") ||
4145 changed.count("bluestore_prefer_deferred_size_ssd") ||
7c673cae
FG
4146 changed.count("bluestore_max_alloc_size") ||
4147 changed.count("bluestore_deferred_batch_ops") ||
4148 changed.count("bluestore_deferred_batch_ops_hdd") ||
4149 changed.count("bluestore_deferred_batch_ops_ssd")) {
4150 if (bdev) {
4151 // only after startup
4152 _set_alloc_sizes();
4153 }
4154 }
4155 if (changed.count("bluestore_throttle_cost_per_io") ||
4156 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4157 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4158 if (bdev) {
4159 _set_throttle_params();
4160 }
4161 }
4162 if (changed.count("bluestore_throttle_bytes")) {
4163 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
4164 throttle_deferred_bytes.reset_max(
4165 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
4166 }
4167 if (changed.count("bluestore_throttle_deferred_bytes")) {
4168 throttle_deferred_bytes.reset_max(
4169 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
4170 }
4171}
4172
4173void BlueStore::_set_compression()
4174{
224ce89b
WB
4175 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4176 if (m) {
11fdf7f2 4177 _clear_compression_alert();
224ce89b
WB
4178 comp_mode = *m;
4179 } else {
4180 derr << __func__ << " unrecognized value '"
4181 << cct->_conf->bluestore_compression_mode
4182 << "' for bluestore_compression_mode, reverting to 'none'"
4183 << dendl;
4184 comp_mode = Compressor::COMP_NONE;
11fdf7f2
TL
4185 string s("unknown mode: ");
4186 s += cct->_conf->bluestore_compression_mode;
4187 _set_compression_alert(true, s.c_str());
224ce89b
WB
4188 }
4189
4190 compressor = nullptr;
4191
4192 if (comp_mode == Compressor::COMP_NONE) {
4193 dout(10) << __func__ << " compression mode set to 'none', "
11fdf7f2 4194 << "ignore other compression settings" << dendl;
224ce89b
WB
4195 return;
4196 }
4197
3efd9988
FG
4198 if (cct->_conf->bluestore_compression_min_blob_size) {
4199 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
7c673cae 4200 } else {
11fdf7f2 4201 ceph_assert(bdev);
7c673cae
FG
4202 if (bdev->is_rotational()) {
4203 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4204 } else {
4205 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4206 }
4207 }
4208
4209 if (cct->_conf->bluestore_compression_max_blob_size) {
4210 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4211 } else {
11fdf7f2 4212 ceph_assert(bdev);
7c673cae
FG
4213 if (bdev->is_rotational()) {
4214 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4215 } else {
4216 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4217 }
4218 }
4219
7c673cae
FG
4220 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4221 if (!alg_name.empty()) {
4222 compressor = Compressor::create(cct, alg_name);
4223 if (!compressor) {
4224 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4225 << dendl;
11fdf7f2 4226 _set_compression_alert(false, alg_name.c_str());
7c673cae
FG
4227 }
4228 }
4229
4230 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4231 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
11fdf7f2
TL
4232 << " min_blob " << comp_min_blob_size
4233 << " max_blob " << comp_max_blob_size
7c673cae
FG
4234 << dendl;
4235}
4236
4237void BlueStore::_set_csum()
4238{
4239 csum_type = Checksummer::CSUM_NONE;
4240 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4241 if (t > Checksummer::CSUM_NONE)
4242 csum_type = t;
4243
4244 dout(10) << __func__ << " csum_type "
4245 << Checksummer::get_csum_type_string(csum_type)
4246 << dendl;
4247}
4248
4249void BlueStore::_set_throttle_params()
4250{
4251 if (cct->_conf->bluestore_throttle_cost_per_io) {
4252 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4253 } else {
11fdf7f2 4254 ceph_assert(bdev);
7c673cae
FG
4255 if (bdev->is_rotational()) {
4256 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4257 } else {
4258 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4259 }
4260 }
4261
4262 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4263 << dendl;
4264}
4265void BlueStore::_set_blob_size()
4266{
4267 if (cct->_conf->bluestore_max_blob_size) {
4268 max_blob_size = cct->_conf->bluestore_max_blob_size;
4269 } else {
11fdf7f2 4270 ceph_assert(bdev);
7c673cae
FG
4271 if (bdev->is_rotational()) {
4272 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4273 } else {
4274 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4275 }
4276 }
4277 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4278 << std::dec << dendl;
4279}
4280
11fdf7f2 4281int BlueStore::_set_cache_sizes()
1adf2230 4282{
11fdf7f2
TL
4283 // set osd_memory_target *default* based on cgroup limit?
4284 // (do this before we fetch the osd_memory_target value!)
4285 double cgroup_ratio = cct->_conf.get_val<double>(
4286 "osd_memory_target_cgroup_limit_ratio");
4287 if (cgroup_ratio > 0.0) {
4288 uint64_t cgroup_limit = 0;
4289 if (get_cgroup_memory_limit(&cgroup_limit) == 0 &&
4290 cgroup_limit) {
4291 uint64_t def = cgroup_limit * cgroup_ratio;
4292 dout(10) << __func__ << " osd_memory_target_cgroup_limit_ratio "
4293 << cgroup_ratio << ", cgroup_limit " << cgroup_limit
4294 << ", defaulting osd_memory_target to " << def
4295 << dendl;
4296 cct->_conf.set_val_default("osd_memory_target", stringify(def));
1adf2230
AA
4297 }
4298 }
1adf2230 4299
11fdf7f2
TL
4300 ceph_assert(bdev);
4301 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
91327a77 4302 cache_autotune_interval =
11fdf7f2
TL
4303 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
4304 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4305 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
91327a77 4306 osd_memory_expected_fragmentation =
11fdf7f2
TL
4307 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4308 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
91327a77 4309 osd_memory_cache_resize_interval =
11fdf7f2 4310 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
91327a77 4311
224ce89b
WB
4312 if (cct->_conf->bluestore_cache_size) {
4313 cache_size = cct->_conf->bluestore_cache_size;
4314 } else {
4315 // choose global cache size based on backend type
4316 if (bdev->is_rotational()) {
4317 cache_size = cct->_conf->bluestore_cache_size_hdd;
4318 } else {
4319 cache_size = cct->_conf->bluestore_cache_size_ssd;
4320 }
4321 }
31f18b77 4322
91327a77 4323 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
224ce89b 4324 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
d2e6a577 4325 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77 4326 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4327 return -EINVAL;
4328 }
91327a77
AA
4329
4330 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
224ce89b 4331 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
d2e6a577 4332 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
91327a77 4333 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4334 return -EINVAL;
4335 }
91327a77 4336
31f18b77 4337 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
d2e6a577 4338 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77
AA
4339 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4340 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4341 << dendl;
31f18b77
FG
4342 return -EINVAL;
4343 }
91327a77
AA
4344
4345 cache_data_ratio =
4346 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
31f18b77
FG
4347 if (cache_data_ratio < 0) {
4348 // deal with floating point imprecision
4349 cache_data_ratio = 0;
4350 }
91327a77 4351
224ce89b
WB
4352 dout(1) << __func__ << " cache_size " << cache_size
4353 << " meta " << cache_meta_ratio
31f18b77
FG
4354 << " kv " << cache_kv_ratio
4355 << " data " << cache_data_ratio
4356 << dendl;
4357 return 0;
4358}
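// ---------------------------------------------------------------------------
// Editor's worked sketch (not part of the upstream source): the ratio
// bookkeeping above, with made-up example values.
#if 0
#include <cstdint>

static void example_cache_ratios()
{
  // hypothetical settings
  double meta_ratio = 0.4, kv_ratio = 0.4;
  // data gets whatever is left: 1.0 - 0.4 - 0.4 = 0.2
  double data_ratio = 1.0 - meta_ratio - kv_ratio;

  // cgroup-derived default: with an 8 GiB cgroup limit and a limit ratio of
  // 0.8, osd_memory_target would default to ~6.4 GiB unless set explicitly
  uint64_t cgroup_limit   = 8ull << 30;
  double   limit_ratio    = 0.8;
  uint64_t default_target = cgroup_limit * limit_ratio;

  (void)data_ratio; (void)default_target;
}
#endif
// ---------------------------------------------------------------------------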
4359
3efd9988
FG
4360int BlueStore::write_meta(const std::string& key, const std::string& value)
4361{
4362 bluestore_bdev_label_t label;
4363 string p = path + "/block";
4364 int r = _read_bdev_label(cct, p, &label);
4365 if (r < 0) {
4366 return ObjectStore::write_meta(key, value);
4367 }
4368 label.meta[key] = value;
4369 r = _write_bdev_label(cct, p, label);
11fdf7f2 4370 ceph_assert(r == 0);
3efd9988
FG
4371 return ObjectStore::write_meta(key, value);
4372}
4373
4374int BlueStore::read_meta(const std::string& key, std::string *value)
4375{
4376 bluestore_bdev_label_t label;
4377 string p = path + "/block";
4378 int r = _read_bdev_label(cct, p, &label);
4379 if (r < 0) {
4380 return ObjectStore::read_meta(key, value);
4381 }
4382 auto i = label.meta.find(key);
4383 if (i == label.meta.end()) {
4384 return ObjectStore::read_meta(key, value);
4385 }
4386 *value = i->second;
4387 return 0;
4388}
4389
7c673cae
FG
4390void BlueStore::_init_logger()
4391{
4392 PerfCountersBuilder b(cct, "bluestore",
4393 l_bluestore_first, l_bluestore_last);
4394 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
4395 "Average kv_thread flush latency",
4396 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
4397 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
4398 "Average kv_thread commit latency");
11fdf7f2
TL
4399 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
4400 "Average kv_sync thread latency",
4401 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
4402 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
4403 "Average kv_finalize thread latency",
4404 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae
FG
4405 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
4406 "Average prepare state latency");
4407 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
4408 "Average aio_wait state latency",
4409 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
4410 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
4411 "Average io_done state latency");
4412 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
4413 "Average kv_queued state latency");
4414 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
4415 "Average kv_commiting state latency");
4416 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
4417 "Average kv_done state latency");
4418 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
4419 "Average deferred_queued state latency");
4420 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
4421 "Average aio_wait state latency");
4422 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
4423 "Average cleanup state latency");
4424 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
4425 "Average finishing state latency");
4426 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
4427 "Average done state latency");
4428 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
4429 "Average submit throttle latency",
4430 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
4431 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
4432 "Average submit latency",
4433 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
4434 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
4435 "Average commit latency",
4436 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
4437 b.add_time_avg(l_bluestore_read_lat, "read_lat",
4438 "Average read latency",
4439 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
4440 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
4441 "Average read onode metadata latency");
4442 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
4443 "Average read latency");
4444 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
4445 "Average compress latency");
4446 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
4447 "Average decompress latency");
4448 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
4449 "Average checksum latency");
4450 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
4451 "Sum for beneficial compress ops");
4452 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
4453 "Sum for compress ops rejected due to low net gain of space");
4454 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
11fdf7f2 4455 "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4456 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
4457 "Sum for deferred write op");
4458 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
11fdf7f2 4459 "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
7c673cae
FG
4460 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
4461 "Sum for write penalty read ops");
4462 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
4463 "Sum for allocated bytes");
4464 b.add_u64(l_bluestore_stored, "bluestore_stored",
4465 "Sum for stored bytes");
4466 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
4467 "Sum for stored compressed bytes");
4468 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
4469 "Sum for bytes allocated for compressed data");
4470 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
4471 "Sum for original bytes that were compressed");
4472
4473 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
4474 "Number of onodes in cache");
4475 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
4476 "Sum for onode-lookups hit in the cache");
4477 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
4478 "Sum for onode-lookups missed in the cache");
4479 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
4480 "Sum for onode-shard lookups hit in the cache");
4481 b.add_u64_counter(l_bluestore_onode_shard_misses,
4482 "bluestore_onode_shard_misses",
4483 "Sum for onode-shard lookups missed in the cache");
4484 b.add_u64(l_bluestore_extents, "bluestore_extents",
4485 "Number of extents in cache");
4486 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
4487 "Number of blobs in cache");
4488 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
4489 "Number of buffers in cache");
4490 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
11fdf7f2 4491 "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
91327a77 4492 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
11fdf7f2 4493 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
91327a77 4494 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
11fdf7f2 4495 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4496
4497 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
4498 "Large aligned writes into fresh blobs");
4499 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
11fdf7f2 4500 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4501 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
4502 "Large aligned writes into fresh blobs (blobs)");
4503 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
4504 "Small writes into existing or sparse small blobs");
4505 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
11fdf7f2 4506 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4507 b.add_u64_counter(l_bluestore_write_small_unused,
4508 "bluestore_write_small_unused",
4509 "Small writes into unused portion of existing blob");
4510 b.add_u64_counter(l_bluestore_write_small_deferred,
4511 "bluestore_write_small_deferred",
4512 "Small overwrites using deferred");
4513 b.add_u64_counter(l_bluestore_write_small_pre_read,
4514 "bluestore_write_small_pre_read",
4515 "Small writes that required we read some data (possibly "
4516 "cached) to fill out the block");
4517 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
4518 "Small write into new (sparse) blob");
4519
4520 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
4521 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
4522 "Onode extent map reshard events");
4523 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
4524 "Sum for blob splitting due to resharding");
4525 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
4526 "Sum for extents that have been removed due to compression");
4527 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
4528 "Sum for extents that have been merged due to garbage "
4529 "collection");
b32b8144
FG
4530 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
4531 "Read EIO errors propagated to high level callers");
f64942e4
AA
4532 b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
4533 "Read operations that required at least one retry due to failed checksum validation");
a8e16298
TL
4534 b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
4535 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
11fdf7f2
TL
4536 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
4537 "Average omap iterator seek_to_first call latency");
4538 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
4539 "Average omap iterator upper_bound call latency");
4540 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
4541 "Average omap iterator lower_bound call latency");
4542 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
4543 "Average omap iterator next call latency");
7c673cae
FG
4544 logger = b.create_perf_counters();
4545 cct->get_perfcounters_collection()->add(logger);
4546}
4547
4548int BlueStore::_reload_logger()
4549{
4550 struct store_statfs_t store_statfs;
7c673cae 4551 int r = statfs(&store_statfs);
11fdf7f2 4552 if (r >= 0) {
7c673cae 4553 logger->set(l_bluestore_allocated, store_statfs.allocated);
11fdf7f2
TL
4554 logger->set(l_bluestore_stored, store_statfs.data_stored);
4555 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
4556 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
4557 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
7c673cae
FG
4558 }
4559 return r;
4560}
4561
4562void BlueStore::_shutdown_logger()
4563{
4564 cct->get_perfcounters_collection()->remove(logger);
4565 delete logger;
4566}
4567
4568int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
4569 uuid_d *fsid)
4570{
4571 bluestore_bdev_label_t label;
4572 int r = _read_bdev_label(cct, path, &label);
4573 if (r < 0)
4574 return r;
4575 *fsid = label.osd_uuid;
4576 return 0;
4577}
4578
4579int BlueStore::_open_path()
4580{
b32b8144 4581 // sanity check(s)
11fdf7f2
TL
4582 auto osd_max_object_size =
4583 cct->_conf.get_val<Option::size_t>("osd_max_object_size");
4584 if (osd_max_object_size >= (size_t)OBJECT_MAX_SIZE) {
4585 derr << __func__ << " osd_max_object_size >= 0x" << std::hex << OBJECT_MAX_SIZE
4586 << "; BlueStore has hard limit of 0x" << OBJECT_MAX_SIZE << "." << std::dec << dendl;
b32b8144
FG
4587 return -EINVAL;
4588 }
11fdf7f2 4589 ceph_assert(path_fd < 0);
91327a77 4590 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
7c673cae
FG
4591 if (path_fd < 0) {
4592 int r = -errno;
4593 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
4594 << dendl;
4595 return r;
4596 }
4597 return 0;
4598}
4599
4600void BlueStore::_close_path()
4601{
4602 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
4603 path_fd = -1;
4604}
4605
3efd9988
FG
4606int BlueStore::_write_bdev_label(CephContext *cct,
4607 string path, bluestore_bdev_label_t label)
7c673cae
FG
4608{
4609 dout(10) << __func__ << " path " << path << " label " << label << dendl;
4610 bufferlist bl;
11fdf7f2 4611 encode(label, bl);
7c673cae 4612 uint32_t crc = bl.crc32c(-1);
11fdf7f2
TL
4613 encode(crc, bl);
4614 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
7c673cae
FG
4615 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
4616 z.zero();
4617 bl.append(std::move(z));
4618
91327a77 4619 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
7c673cae
FG
4620 if (fd < 0) {
4621 fd = -errno;
4622 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4623 << dendl;
4624 return fd;
4625 }
4626 int r = bl.write_fd(fd);
4627 if (r < 0) {
4628 derr << __func__ << " failed to write to " << path
4629 << ": " << cpp_strerror(r) << dendl;
11fdf7f2 4630 goto out;
7c673cae 4631 }
3efd9988
FG
4632 r = ::fsync(fd);
4633 if (r < 0) {
4634 derr << __func__ << " failed to fsync " << path
4635 << ": " << cpp_strerror(r) << dendl;
4636 }
11fdf7f2 4637out:
7c673cae
FG
4638 VOID_TEMP_FAILURE_RETRY(::close(fd));
4639 return r;
4640}
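// ---------------------------------------------------------------------------
// Editor's note (not part of the upstream source): the label written above
// always occupies exactly one BDEV_LABEL_BLOCK_SIZE (4 KiB) block, laid out as
//   [ encoded bluestore_bdev_label_t | crc32c over those bytes | zero pad ],
// and _read_bdev_label below recomputes the crc and compares it.  A sketch of
// the padding arithmetic, assuming a hypothetical encoded label length:
#if 0
  size_t encoded_len = 137;                        // hypothetical label size
  size_t crc_len     = sizeof(uint32_t);           // crc32c appended after it
  size_t pad         = BDEV_LABEL_BLOCK_SIZE - (encoded_len + crc_len);  // 3955
#endif
// ---------------------------------------------------------------------------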
4641
4642int BlueStore::_read_bdev_label(CephContext* cct, string path,
4643 bluestore_bdev_label_t *label)
4644{
4645 dout(10) << __func__ << dendl;
91327a77 4646 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
7c673cae
FG
4647 if (fd < 0) {
4648 fd = -errno;
4649 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4650 << dendl;
4651 return fd;
4652 }
4653 bufferlist bl;
4654 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4655 VOID_TEMP_FAILURE_RETRY(::close(fd));
4656 if (r < 0) {
4657 derr << __func__ << " failed to read from " << path
4658 << ": " << cpp_strerror(r) << dendl;
4659 return r;
4660 }
4661
4662 uint32_t crc, expected_crc;
11fdf7f2 4663 auto p = bl.cbegin();
7c673cae 4664 try {
11fdf7f2 4665 decode(*label, p);
7c673cae
FG
4666 bufferlist t;
4667 t.substr_of(bl, 0, p.get_off());
4668 crc = t.crc32c(-1);
11fdf7f2 4669 decode(expected_crc, p);
7c673cae
FG
4670 }
4671 catch (buffer::error& e) {
b32b8144 4672 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
7c673cae
FG
4673 << ": " << e.what()
4674 << dendl;
b32b8144 4675 return -ENOENT;
7c673cae
FG
4676 }
4677 if (crc != expected_crc) {
4678 derr << __func__ << " bad crc on label, expected " << expected_crc
4679 << " != actual " << crc << dendl;
4680 return -EIO;
4681 }
4682 dout(10) << __func__ << " got " << *label << dendl;
4683 return 0;
4684}
4685
4686int BlueStore::_check_or_set_bdev_label(
4687 string path, uint64_t size, string desc, bool create)
4688{
4689 bluestore_bdev_label_t label;
4690 if (create) {
4691 label.osd_uuid = fsid;
4692 label.size = size;
4693 label.btime = ceph_clock_now();
4694 label.description = desc;
3efd9988 4695 int r = _write_bdev_label(cct, path, label);
7c673cae
FG
4696 if (r < 0)
4697 return r;
4698 } else {
4699 int r = _read_bdev_label(cct, path, &label);
4700 if (r < 0)
4701 return r;
31f18b77
FG
4702 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4703 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4704 << " and fsid " << fsid << " check bypassed" << dendl;
4705 }
4706 else if (label.osd_uuid != fsid) {
7c673cae
FG
4707 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4708 << " does not match our fsid " << fsid << dendl;
4709 return -EIO;
4710 }
4711 }
4712 return 0;
4713}
4714
4715void BlueStore::_set_alloc_sizes(void)
4716{
7c673cae
FG
4717 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4718
4719 if (cct->_conf->bluestore_prefer_deferred_size) {
4720 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4721 } else {
11fdf7f2 4722 ceph_assert(bdev);
7c673cae
FG
4723 if (bdev->is_rotational()) {
4724 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4725 } else {
4726 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4727 }
4728 }
4729
4730 if (cct->_conf->bluestore_deferred_batch_ops) {
4731 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4732 } else {
11fdf7f2 4733 ceph_assert(bdev);
7c673cae
FG
4734 if (bdev->is_rotational()) {
4735 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4736 } else {
4737 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4738 }
4739 }
4740
4741 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11fdf7f2 4742 << std::dec << " order " << (int)min_alloc_size_order
7c673cae
FG
4743 << " max_alloc_size 0x" << std::hex << max_alloc_size
4744 << " prefer_deferred_size 0x" << prefer_deferred_size
4745 << std::dec
4746 << " deferred_batch_ops " << deferred_batch_ops
4747 << dendl;
4748}
4749
4750int BlueStore::_open_bdev(bool create)
4751{
11fdf7f2 4752 ceph_assert(bdev == NULL);
7c673cae 4753 string p = path + "/block";
11fdf7f2 4754 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
7c673cae
FG
4755 int r = bdev->open(p);
4756 if (r < 0)
4757 goto fail;
4758
11fdf7f2
TL
4759 if (create && cct->_conf->bdev_enable_discard) {
4760 bdev->discard(0, bdev->get_size());
4761 }
4762
7c673cae
FG
4763 if (bdev->supported_bdev_label()) {
4764 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4765 if (r < 0)
4766 goto fail_close;
4767 }
4768
4769 // initialize global block parameters
4770 block_size = bdev->get_block_size();
4771 block_mask = ~(block_size - 1);
4772 block_size_order = ctz(block_size);
11fdf7f2 4773 ceph_assert(block_size == 1u << block_size_order);
224ce89b
WB
4774 // and set cache_size based on device type
4775 r = _set_cache_sizes();
4776 if (r < 0) {
4777 goto fail_close;
4778 }
7c673cae
FG
4779 return 0;
4780
4781 fail_close:
4782 bdev->close();
4783 fail:
4784 delete bdev;
4785 bdev = NULL;
4786 return r;
4787}
4788
11fdf7f2
TL
4789void BlueStore::_validate_bdev()
4790{
4791 ceph_assert(bdev);
 4792 ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
4793 uint64_t dev_size = bdev->get_size();
4794 if (dev_size <
4795 _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
4796 dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
4797 << " is too small, disable bluestore_bluefs_min for now"
4798 << dendl;
4799 ceph_assert(dev_size >= _get_ondisk_reserved());
4800
4801 int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
4802 ceph_assert(r == 0);
4803 }
4804}
4805
7c673cae
FG
4806void BlueStore::_close_bdev()
4807{
11fdf7f2 4808 ceph_assert(bdev);
7c673cae
FG
4809 bdev->close();
4810 delete bdev;
4811 bdev = NULL;
4812}
4813
11fdf7f2 4814int BlueStore::_open_fm(KeyValueDB::Transaction t)
7c673cae 4815{
11fdf7f2
TL
4816 ceph_assert(fm == NULL);
4817 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
4818 ceph_assert(fm);
4819 if (t) {
4820 // create mode. initialize freespace
7c673cae 4821 dout(20) << __func__ << " initializing freespace" << dendl;
7c673cae
FG
4822 {
4823 bufferlist bl;
4824 bl.append(freelist_type);
4825 t->set(PREFIX_SUPER, "freelist_type", bl);
4826 }
b32b8144
FG
4827 // being able to allocate in units less than bdev block size
4828 // seems to be a bad idea.
11fdf7f2 4829 ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
b32b8144 4830 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
7c673cae
FG
4831
4832 // allocate superblock reserved space. note that we do not mark
4833 // bluefs space as allocated in the freelist; we instead rely on
4834 // bluefs_extents.
11fdf7f2 4835 auto reserved = _get_ondisk_reserved();
3efd9988 4836 fm->allocate(0, reserved, t);
7c673cae 4837
7c673cae 4838 if (cct->_conf->bluestore_bluefs) {
11fdf7f2 4839 ceph_assert(bluefs_extents.num_intervals() == 1);
7c673cae 4840 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
11fdf7f2 4841 reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
7c673cae
FG
4842 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4843 << " for bluefs" << dendl;
7c673cae
FG
4844 }
4845
4846 if (cct->_conf->bluestore_debug_prefill > 0) {
4847 uint64_t end = bdev->get_size() - reserved;
4848 dout(1) << __func__ << " pre-fragmenting freespace, using "
4849 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4850 << cct->_conf->bluestore_debug_prefragment_max << dendl;
11fdf7f2 4851 uint64_t start = p2roundup(reserved, min_alloc_size);
7c673cae
FG
4852 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4853 float r = cct->_conf->bluestore_debug_prefill;
4854 r /= 1.0 - r;
4855 bool stop = false;
4856
4857 while (!stop && start < end) {
4858 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4859 if (start + l > end) {
4860 l = end - start;
11fdf7f2 4861 l = p2align(l, min_alloc_size);
7c673cae 4862 }
11fdf7f2 4863 ceph_assert(start + l <= end);
7c673cae
FG
4864
4865 uint64_t u = 1 + (uint64_t)(r * (double)l);
11fdf7f2 4866 u = p2roundup(u, min_alloc_size);
7c673cae
FG
4867 if (start + l + u > end) {
4868 u = end - (start + l);
4869 // trim to align so we don't overflow again
11fdf7f2 4870 u = p2align(u, min_alloc_size);
7c673cae
FG
4871 stop = true;
4872 }
11fdf7f2 4873 ceph_assert(start + l + u <= end);
7c673cae 4874
11fdf7f2 4875 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
7c673cae
FG
4876 << " use 0x" << u << std::dec << dendl;
4877
4878 if (u == 0) {
4879 // break if u has been trimmed to nothing
4880 break;
4881 }
4882
4883 fm->allocate(start + l, u, t);
4884 start += l + u;
4885 }
4886 }
7c673cae
FG
4887 }
4888
11fdf7f2 4889 int r = fm->init(db);
7c673cae
FG
4890 if (r < 0) {
4891 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4892 delete fm;
4893 fm = NULL;
4894 return r;
4895 }
81eedcae
TL
 4896 // If the space size tracked by the freelist manager is higher than the
 4897 // actual device size, one can hit an out-of-space allocation, which will
 4898 // result in data loss and/or assertions.
 4899 // Probably the user altered the device size somehow.
 4900 // The only fix for now is to redeploy the OSD.
4901 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
4902 ostringstream ss;
4903 ss << "slow device size mismatch detected, "
4904 << " fm size(" << fm->get_size()
4905 << ") > slow device size(" << bdev->get_size()
4906 << "), Please stop using this OSD as it might cause data loss.";
4907 _set_disk_size_mismatch_alert(ss.str());
4908 }
7c673cae
FG
4909 return 0;
4910}
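// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the upstream source): the
// debug-prefill loop above alternates a random free run `l` with an allocated
// run of roughly l * prefill / (1 - prefill), so the overall used fraction
// converges on bluestore_debug_prefill.  Example numbers:
#if 0
  double prefill = 0.2;
  double r = prefill / (1.0 - prefill);        // 0.25
  uint64_t l = 1 << 20;                        // a 1 MiB free run
  uint64_t u = static_cast<uint64_t>(r * l);   // ~256 KiB allocated after it
  // used fraction = u / (l + u) = 256 KiB / 1280 KiB = 0.2, i.e. the target
#endif
// ---------------------------------------------------------------------------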
4911
4912void BlueStore::_close_fm()
4913{
4914 dout(10) << __func__ << dendl;
11fdf7f2 4915 ceph_assert(fm);
7c673cae
FG
4916 fm->shutdown();
4917 delete fm;
4918 fm = NULL;
4919}
4920
4921int BlueStore::_open_alloc()
4922{
11fdf7f2
TL
4923 ceph_assert(alloc == NULL);
4924 ceph_assert(bdev->get_size());
4925
4926 if (bluefs) {
4927 bluefs_extents.clear();
4928 auto r = bluefs->get_block_extents(bluefs_shared_bdev, &bluefs_extents);
4929 if (r < 0) {
4930 lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
4931 << cpp_strerror(r) << dendl;
4932
4933 return r;
4934 }
4935 dout(10) << __func__ << " bluefs extents 0x"
4936 << std::hex << bluefs_extents << std::dec
4937 << dendl;
4938 }
4939
7c673cae
FG
4940 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4941 bdev->get_size(),
4942 min_alloc_size);
4943 if (!alloc) {
4944 lderr(cct) << __func__ << " Allocator::unknown alloc type "
4945 << cct->_conf->bluestore_allocator
4946 << dendl;
4947 return -EINVAL;
4948 }
4949
4950 uint64_t num = 0, bytes = 0;
4951
4952 dout(1) << __func__ << " opening allocation metadata" << dendl;
4953 // initialize from freelist
4954 fm->enumerate_reset();
4955 uint64_t offset, length;
11fdf7f2 4956 while (fm->enumerate_next(db, &offset, &length)) {
7c673cae
FG
4957 alloc->init_add_free(offset, length);
4958 ++num;
4959 bytes += length;
4960 }
224ce89b 4961 fm->enumerate_reset();
1adf2230 4962 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
7c673cae
FG
4963 << " in " << num << " extents"
4964 << dendl;
4965
4966 // also mark bluefs space as allocated
4967 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4968 alloc->init_rm_free(e.get_start(), e.get_len());
4969 }
7c673cae
FG
4970
4971 return 0;
4972}
4973
4974void BlueStore::_close_alloc()
4975{
11fdf7f2
TL
4976 ceph_assert(bdev);
4977 bdev->discard_drain();
4978
4979 ceph_assert(alloc);
7c673cae
FG
4980 alloc->shutdown();
4981 delete alloc;
4982 alloc = NULL;
11fdf7f2 4983 bluefs_extents.clear();
7c673cae
FG
4984}
4985
4986int BlueStore::_open_fsid(bool create)
4987{
11fdf7f2 4988 ceph_assert(fsid_fd < 0);
91327a77 4989 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
4990 if (create)
4991 flags |= O_CREAT;
4992 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4993 if (fsid_fd < 0) {
4994 int err = -errno;
4995 derr << __func__ << " " << cpp_strerror(err) << dendl;
4996 return err;
4997 }
4998 return 0;
4999}
5000
5001int BlueStore::_read_fsid(uuid_d *uuid)
5002{
5003 char fsid_str[40];
5004 memset(fsid_str, 0, sizeof(fsid_str));
5005 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5006 if (ret < 0) {
5007 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5008 return ret;
5009 }
5010 if (ret > 36)
5011 fsid_str[36] = 0;
5012 else
5013 fsid_str[ret] = 0;
5014 if (!uuid->parse(fsid_str)) {
5015 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5016 return -EINVAL;
5017 }
5018 return 0;
5019}
5020
5021int BlueStore::_write_fsid()
5022{
5023 int r = ::ftruncate(fsid_fd, 0);
5024 if (r < 0) {
5025 r = -errno;
5026 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5027 return r;
5028 }
5029 string str = stringify(fsid) + "\n";
5030 r = safe_write(fsid_fd, str.c_str(), str.length());
5031 if (r < 0) {
5032 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5033 return r;
5034 }
5035 r = ::fsync(fsid_fd);
5036 if (r < 0) {
5037 r = -errno;
5038 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5039 return r;
5040 }
5041 return 0;
5042}
5043
5044void BlueStore::_close_fsid()
5045{
5046 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5047 fsid_fd = -1;
5048}
5049
5050int BlueStore::_lock_fsid()
5051{
5052 struct flock l;
5053 memset(&l, 0, sizeof(l));
5054 l.l_type = F_WRLCK;
5055 l.l_whence = SEEK_SET;
5056 int r = ::fcntl(fsid_fd, F_SETLK, &l);
5057 if (r < 0) {
5058 int err = errno;
5059 derr << __func__ << " failed to lock " << path << "/fsid"
5060 << " (is another ceph-osd still running?)"
5061 << cpp_strerror(err) << dendl;
5062 return -err;
5063 }
5064 return 0;
5065}
5066
31f18b77
FG
5067bool BlueStore::is_rotational()
5068{
5069 if (bdev) {
5070 return bdev->is_rotational();
5071 }
5072
5073 bool rotational = true;
5074 int r = _open_path();
5075 if (r < 0)
5076 goto out;
5077 r = _open_fsid(false);
5078 if (r < 0)
5079 goto out_path;
5080 r = _read_fsid(&fsid);
5081 if (r < 0)
5082 goto out_fsid;
5083 r = _lock_fsid();
5084 if (r < 0)
5085 goto out_fsid;
5086 r = _open_bdev(false);
5087 if (r < 0)
5088 goto out_fsid;
5089 rotational = bdev->is_rotational();
5090 _close_bdev();
5091 out_fsid:
5092 _close_fsid();
5093 out_path:
5094 _close_path();
5095 out:
5096 return rotational;
5097}
5098
d2e6a577
FG
5099bool BlueStore::is_journal_rotational()
5100{
5101 if (!bluefs) {
5102 dout(5) << __func__ << " bluefs disabled, default to store media type"
5103 << dendl;
5104 return is_rotational();
5105 }
5106 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
5107 return bluefs->wal_is_rotational();
5108}
5109
7c673cae
FG
5110bool BlueStore::test_mount_in_use()
5111{
5112 // most error conditions mean the mount is not in use (e.g., because
5113 // it doesn't exist). only if we fail to lock do we conclude it is
5114 // in use.
5115 bool ret = false;
5116 int r = _open_path();
5117 if (r < 0)
5118 return false;
5119 r = _open_fsid(false);
5120 if (r < 0)
5121 goto out_path;
5122 r = _lock_fsid();
5123 if (r < 0)
5124 ret = true; // if we can't lock, it is in use
5125 _close_fsid();
5126 out_path:
5127 _close_path();
5128 return ret;
5129}
5130
11fdf7f2 5131int BlueStore::_minimal_open_bluefs(bool create)
7c673cae
FG
5132{
5133 int r;
11fdf7f2 5134 bluefs = new BlueFS(cct);
7c673cae 5135
11fdf7f2
TL
5136 string bfn;
5137 struct stat st;
5138
5139 bfn = path + "/block.db";
5140 if (::stat(bfn.c_str(), &st) == 0) {
5141 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn,
5142 create && cct->_conf->bdev_enable_discard);
7c673cae 5143 if (r < 0) {
11fdf7f2
TL
5144 derr << __func__ << " add block device(" << bfn << ") returned: "
5145 << cpp_strerror(r) << dendl;
5146 goto free_bluefs;
7c673cae 5147 }
7c673cae 5148
11fdf7f2
TL
5149 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
5150 r = _check_or_set_bdev_label(
5151 bfn,
5152 bluefs->get_block_device_size(BlueFS::BDEV_DB),
5153 "bluefs db", create);
5154 if (r < 0) {
5155 derr << __func__
5156 << " check block device(" << bfn << ") label returned: "
5157 << cpp_strerror(r) << dendl;
5158 goto free_bluefs;
5159 }
7c673cae 5160 }
11fdf7f2
TL
5161 if (create) {
5162 bluefs->add_block_extent(
5163 BlueFS::BDEV_DB,
5164 SUPER_RESERVED,
5165 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
5166 }
5167 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
5168 bluefs_single_shared_device = false;
5169 } else {
5170 r = -errno;
5171 if (::lstat(bfn.c_str(), &st) == -1) {
5172 r = 0;
5173 bluefs_shared_bdev = BlueFS::BDEV_DB;
7c673cae 5174 } else {
11fdf7f2
TL
5175 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5176 << cpp_strerror(r) << dendl;
5177 goto free_bluefs;
7c673cae
FG
5178 }
5179 }
7c673cae 5180
11fdf7f2
TL
5181 // shared device
5182 bfn = path + "/block";
5183 // never trim here
5184 r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false,
5185 true /* shared with bluestore */);
5186 if (r < 0) {
5187 derr << __func__ << " add block device(" << bfn << ") returned: "
5188 << cpp_strerror(r) << dendl;
5189 goto free_bluefs;
5190 }
5191 if (create) {
5192 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
5193 uint64_t initial =
5194 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
5195 cct->_conf->bluestore_bluefs_gift_ratio);
5196 initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
5197 if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
5198 derr << __func__ << " bluefs_alloc_size 0x" << std::hex
5199 << cct->_conf->bluefs_alloc_size << " is not a multiple of "
5200 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
5201 r = -EINVAL;
5202 goto free_bluefs;
7c673cae 5203 }
11fdf7f2
TL
5204 // align to bluefs's alloc_size
5205 initial = p2roundup(initial, cct->_conf->bluefs_alloc_size);
5206 // put bluefs in the middle of the device in case it is an HDD
5207 uint64_t start = p2align((bdev->get_size() - initial) / 2,
5208 cct->_conf->bluefs_alloc_size);
 5209 // avoid overwriting the superblock
5210 ceph_assert(cct->_conf->bluefs_alloc_size > _get_ondisk_reserved());
5211 start = std::max(cct->_conf->bluefs_alloc_size, start);
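 // Illustrative walk-through of the sizing above (the numbers are
 // hypothetical, not defaults read from this code): with a 100 GiB main
 // device, min_ratio + gift_ratio summing to 0.04 and a 1 MiB
 // bluefs_alloc_size, 'initial' becomes 4 GiB after p2roundup() to the
 // alloc size and 'start' is p2align((100 GiB - 4 GiB) / 2, 1 MiB) = 48 GiB,
 // i.e. the gifted extent lands near the middle of the device; the
 // std::max() then only matters on tiny devices, keeping the reserved
 // label/superblock area out of BlueFS hands.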
7c673cae 5212
11fdf7f2
TL
5213 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
5214 bluefs_extents.insert(start, initial);
5215 ++out_of_sync_fm;
5216 }
5217
5218 bfn = path + "/block.wal";
5219 if (::stat(bfn.c_str(), &st) == 0) {
5220 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
5221 create && cct->_conf->bdev_enable_discard);
5222 if (r < 0) {
5223 derr << __func__ << " add block device(" << bfn << ") returned: "
5224 << cpp_strerror(r) << dendl;
5225 goto free_bluefs;
5226 }
7c673cae 5227
11fdf7f2
TL
5228 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
5229 r = _check_or_set_bdev_label(
5230 bfn,
5231 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
5232 "bluefs wal", create);
7c673cae 5233 if (r < 0) {
11fdf7f2
TL
5234 derr << __func__ << " check block device(" << bfn
5235 << ") label returned: " << cpp_strerror(r) << dendl;
7c673cae
FG
5236 goto free_bluefs;
5237 }
7c673cae
FG
5238 }
5239
11fdf7f2
TL
5240 if (create) {
5241 bluefs->add_block_extent(
5242 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
5243 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
5244 BDEV_LABEL_BLOCK_SIZE);
5245 }
5246 bluefs_single_shared_device = false;
5247 } else {
5248 r = 0;
5249 if (::lstat(bfn.c_str(), &st) != -1) {
5250 r = -errno;
5251 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5252 << cpp_strerror(r) << dendl;
7c673cae
FG
5253 goto free_bluefs;
5254 }
11fdf7f2
TL
5255 }
5256 return 0;
7c673cae 5257
11fdf7f2
TL
5258free_bluefs:
5259 ceph_assert(bluefs);
5260 delete bluefs;
5261 bluefs = NULL;
5262 return r;
5263}
7c673cae 5264
11fdf7f2
TL
5265int BlueStore::_open_bluefs(bool create)
5266{
5267 int r = _minimal_open_bluefs(create);
5268 if (r < 0) {
5269 return r;
5270 }
5271 if (create) {
5272 bluefs->mkfs(fsid);
5273 }
5274 r = bluefs->mount();
5275 if (r < 0) {
5276 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
5277 }
5278 return r;
5279}
5280
5281void BlueStore::_close_bluefs()
5282{
5283 bluefs->umount();
5284 _minimal_close_bluefs();
5285}
5286
5287void BlueStore::_minimal_close_bluefs()
5288{
5289 delete bluefs;
5290 bluefs = NULL;
5291}
5292
5293int BlueStore::_is_bluefs(bool create, bool* ret)
5294{
5295 if (create) {
5296 *ret = cct->_conf->bluestore_bluefs;
5297 } else {
5298 string s;
5299 int r = read_meta("bluefs", &s);
5300 if (r < 0) {
5301 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
5302 return -EIO;
5303 }
5304 if (s == "1") {
5305 *ret = true;
5306 } else if (s == "0") {
5307 *ret = false;
31f18b77 5308 } else {
11fdf7f2
TL
5309 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
5310 << dendl;
5311 return -EIO;
5312 }
5313 }
5314 return 0;
5315}
5316
5317/*
5318* opens both DB and dependent super_meta, FreelistManager and allocator
5319* in the proper order
5320*/
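/*
* With bluefs enabled the sequence below is: open the DB read-only first,
* load super_meta, then the FreelistManager and the allocator (BlueFS space
* management may need them), and finally reopen the DB read-write unless a
* read-only open was requested.
*/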
5321int BlueStore::_open_db_and_around(bool read_only)
5322{
5323 int r;
5324 bool do_bluefs = false;
5325 _is_bluefs(false, &do_bluefs); // ignore err code
5326 if (do_bluefs) {
5327 // open in read-only first to read FM list and init allocator
5328 // as they might be needed for some BlueFS procedures
5329 r = _open_db(false, false, true);
5330 if (r < 0)
5331 return r;
5332
5333 r = _open_super_meta();
5334 if (r < 0) {
5335 goto out_db;
5336 }
5337
5338 r = _open_fm(nullptr);
5339 if (r < 0)
5340 goto out_db;
5341
5342 r = _open_alloc();
5343 if (r < 0)
5344 goto out_fm;
5345
5346 // now open in R/W mode
5347 if (!read_only) {
5348 _close_db();
5349
5350 r = _open_db(false, false, false);
5351 if (r < 0) {
5352 _close_alloc();
5353 _close_fm();
5354 return r;
28e407b8 5355 }
7c673cae 5356 }
11fdf7f2
TL
5357 } else {
5358 r = _open_db(false, false);
5359 if (r < 0) {
5360 return r;
5361 }
5362 r = _open_super_meta();
5363 if (r < 0) {
5364 goto out_db;
5365 }
7c673cae 5366
11fdf7f2
TL
5367 r = _open_fm(nullptr);
5368 if (r < 0)
5369 goto out_db;
5370
5371 r = _open_alloc();
5372 if (r < 0)
5373 goto out_fm;
5374 }
5375 return 0;
5376
5377 out_fm:
5378 _close_fm();
5379 out_db:
5380 _close_db();
5381 return r;
5382}
5383
5384void BlueStore::_close_db_and_around()
5385{
5386 if (bluefs) {
5387 if (out_of_sync_fm.fetch_and(0)) {
5388 _sync_bluefs_and_fm();
5389 }
5390 _close_db();
5391 while(out_of_sync_fm.fetch_and(0)) {
5392 // if seen some allocations during close - repeat open_db, sync fm, close
5393 dout(0) << __func__ << " syncing FreelistManager" << dendl;
5394 int r = _open_db(false, false, false);
5395 if (r < 0) {
5396 derr << __func__
5397 << " unable to open db, FreelistManager is probably out of sync"
5398 << dendl;
5399 break;
5400 }
5401 _sync_bluefs_and_fm();
5402 _close_db();
7c673cae 5403 }
11fdf7f2
TL
5404 if (!_kv_only) {
5405 _close_alloc();
5406 _close_fm();
5407 }
5408 } else {
5409 _close_alloc();
5410 _close_fm();
5411 _close_db();
5412 }
5413}
5414
5415// updates legacy bluefs related recs in DB to a state valid for
5416// downgrades from nautilus.
5417void BlueStore::_sync_bluefs_and_fm()
5418{
5419 if (cct->_conf->bluestore_bluefs_db_compatibility) {
5420 bufferlist bl;
5421 encode(bluefs_extents, bl);
5422 dout(20) << __func__ << " bluefs_extents at KV is now 0x"
5423 << std::hex << bluefs_extents << std::dec
5424 << dendl;
5425 KeyValueDB::Transaction synct = db->get_transaction();
5426 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
5427 synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
5428
 5429 // Nice thing is that we don't need to update FreelistManager here.
 5430 // It always has corresponding bits set to 'Free' for both Nautilus+ and
 5431 // pre-Nautilus releases.
 5432 // So once an extent makes it into bluefs_extents, it has already been
 5433 // freed in the allocator and hence it's free in the FM too.
5434
5435 db->submit_transaction_sync(synct);
5436 }
5437}
5438
5439int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
5440{
5441 int r;
5442 ceph_assert(!db);
5443 ceph_assert(!(create && read_only));
5444 string fn = path + "/db";
5445 string options;
5446 stringstream err;
5447 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
5448
5449 string kv_backend;
5450 std::vector<KeyValueDB::ColumnFamily> cfs;
5451
5452 if (create) {
5453 kv_backend = cct->_conf->bluestore_kvbackend;
5454 } else {
5455 r = read_meta("kv_backend", &kv_backend);
7c673cae 5456 if (r < 0) {
11fdf7f2
TL
5457 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
5458 return -EIO;
5459 }
5460 }
5461 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
5462
5463 bool do_bluefs;
5464 r = _is_bluefs(create, &do_bluefs);
5465 if (r < 0) {
5466 return r;
5467 }
5468 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
5469
5470 map<string,string> kv_options;
5471 // force separate wal dir for all new deployments.
5472 kv_options["separate_wal_dir"] = 1;
5473 rocksdb::Env *env = NULL;
5474 if (do_bluefs) {
5475 dout(10) << __func__ << " initializing bluefs" << dendl;
5476 if (kv_backend != "rocksdb") {
5477 derr << " backend must be rocksdb to use bluefs" << dendl;
5478 return -EINVAL;
7c673cae 5479 }
11fdf7f2
TL
5480
5481 r = _open_bluefs(create);
5482 if (r < 0) {
5483 return r;
5484 }
5485 bluefs->set_slow_device_expander(this);
5486
7c673cae
FG
5487 if (cct->_conf->bluestore_bluefs_env_mirror) {
5488 rocksdb::Env *a = new BlueRocksEnv(bluefs);
5489 rocksdb::Env *b = rocksdb::Env::Default();
5490 if (create) {
5491 string cmd = "rm -rf " + path + "/db " +
5492 path + "/db.slow " +
5493 path + "/db.wal";
5494 int r = system(cmd.c_str());
5495 (void)r;
5496 }
5497 env = new rocksdb::EnvMirror(b, a, false, true);
5498 } else {
5499 env = new BlueRocksEnv(bluefs);
5500
5501 // simplify the dir names, too, as "seen" by rocksdb
5502 fn = "db";
5503 }
5504
5505 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
5506 // we have both block.db and block; tell rocksdb!
5507 // note: the second (last) size value doesn't really matter
5508 ostringstream db_paths;
5509 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
5510 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
5511 db_paths << fn << ","
5512 << (uint64_t)(db_size * 95 / 100) << " "
5513 << fn + ".slow" << ","
5514 << (uint64_t)(slow_size * 95 / 100);
11fdf7f2
TL
5515 kv_options["db_paths"] = db_paths.str();
5516 dout(10) << __func__ << " set db_paths to " << db_paths.str() << dendl;
7c673cae
FG
5517 }
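 // For illustration only (hypothetical sizes, not defaults): with a
 // 10 GiB block.db and a 100 GiB shared slow device the option becomes
 // roughly
 //   db_paths = "db,10200547328 db.slow,102005473280"
 // i.e. each path is advertised to rocksdb at 95% of its device size;
 // as noted above, the second (last) value doesn't really matter.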
5518
5519 if (create) {
5520 env->CreateDir(fn);
11fdf7f2
TL
5521 env->CreateDir(fn + ".wal");
5522 env->CreateDir(fn + ".slow");
5523 } else {
5524 std::vector<std::string> res;
5525 // check for dir presence
5526 auto r = env->GetChildren(fn+".wal", &res);
5527 if (r.IsNotFound()) {
5528 kv_options.erase("separate_wal_dir");
5529 }
7c673cae 5530 }
11fdf7f2
TL
5531 } else {
5532 string walfn = path + "/db.wal";
7c673cae 5533
11fdf7f2
TL
5534 if (create) {
5535 int r = ::mkdir(fn.c_str(), 0755);
5536 if (r < 0)
5537 r = -errno;
5538 if (r < 0 && r != -EEXIST) {
5539 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
5540 << dendl;
5541 return r;
5542 }
5543
5544 // wal_dir, too!
7c673cae
FG
5545 r = ::mkdir(walfn.c_str(), 0755);
5546 if (r < 0)
5547 r = -errno;
5548 if (r < 0 && r != -EEXIST) {
5549 derr << __func__ << " failed to create " << walfn
5550 << ": " << cpp_strerror(r)
5551 << dendl;
5552 return r;
5553 }
11fdf7f2
TL
5554 } else {
5555 struct stat st;
5556 r = ::stat(walfn.c_str(), &st);
5557 if (r < 0 && errno == ENOENT) {
5558 kv_options.erase("separate_wal_dir");
5559 }
7c673cae
FG
5560 }
5561 }
5562
91327a77 5563
7c673cae
FG
5564 db = KeyValueDB::create(cct,
5565 kv_backend,
5566 fn,
11fdf7f2 5567 kv_options,
7c673cae
FG
5568 static_cast<void*>(env));
5569 if (!db) {
5570 derr << __func__ << " error creating db" << dendl;
5571 if (bluefs) {
11fdf7f2 5572 _close_bluefs();
7c673cae
FG
5573 }
 5574 // delete env manually here since we can't depend on db to do this
 5575 // in this case
5576 delete env;
5577 env = NULL;
5578 return -EIO;
5579 }
5580
5581 FreelistManager::setup_merge_operators(db);
5582 db->set_merge_operator(PREFIX_STAT, merge_op);
91327a77 5583 db->set_cache_size(cache_kv_ratio * cache_size);
31f18b77 5584
11fdf7f2 5585 if (kv_backend == "rocksdb") {
7c673cae 5586 options = cct->_conf->bluestore_rocksdb_options;
11fdf7f2
TL
5587
5588 map<string,string> cf_map;
5589 cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
5590 get_str_map,
5591 &cf_map,
5592 " \t");
5593 for (auto& i : cf_map) {
5594 dout(10) << "column family " << i.first << ": " << i.second << dendl;
5595 cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
5596 }
5597 }
5598
7c673cae 5599 db->init(options);
11fdf7f2
TL
5600 if (to_repair_db)
5601 return 0;
5602 if (create) {
5603 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
5604 r = db->create_and_open(err, cfs);
5605 } else {
5606 r = db->create_and_open(err);
5607 }
5608 } else {
5609 // we pass in cf list here, but it is only used if the db already has
5610 // column families created.
5611 r = read_only ?
5612 db->open_read_only(err, cfs) :
5613 db->open(err, cfs);
5614 }
7c673cae
FG
5615 if (r) {
 5616 derr << __func__ << " error opening db: " << err.str() << dendl;
11fdf7f2 5617 _close_db();
7c673cae
FG
5618 return -EIO;
5619 }
5620 dout(1) << __func__ << " opened " << kv_backend
5621 << " path " << fn << " options " << options << dendl;
5622 return 0;
7c673cae
FG
5623}
5624
5625void BlueStore::_close_db()
5626{
11fdf7f2 5627 ceph_assert(db);
7c673cae
FG
5628 delete db;
5629 db = NULL;
5630 if (bluefs) {
11fdf7f2 5631 _close_bluefs();
7c673cae
FG
5632 }
5633}
5634
11fdf7f2 5635void BlueStore::_dump_alloc_on_failure()
7c673cae 5636{
11fdf7f2
TL
5637 auto dump_interval =
5638 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
5639 if (dump_interval > 0 &&
5640 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
5641 alloc->dump();
5642 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
5643 next_dump_on_bluefs_alloc_failure += dump_interval;
7c673cae 5644 }
11fdf7f2 5645}
7c673cae 5646
7c673cae 5647
11fdf7f2
TL
5648int BlueStore::allocate_bluefs_freespace(
5649 uint64_t min_size,
5650 uint64_t size,
5651 PExtentVector* extents_out)
5652{
5653 ceph_assert(min_size <= size);
5654 if (size) {
5655 // round up to alloc size
5656 min_size = p2roundup(min_size, cct->_conf->bluefs_alloc_size);
5657 size = p2roundup(size, cct->_conf->bluefs_alloc_size);
5658
5659 PExtentVector extents_local;
5660 PExtentVector* extents = extents_out ? extents_out : &extents_local;
5661
5662
5663 uint64_t gift;
5664 uint64_t allocated = 0;
5665 int64_t alloc_len;
5666 do {
5667 // hard cap to fit into 32 bits
5668 gift = std::min<uint64_t>(size, 1ull << 31);
5669 dout(10) << __func__ << " gifting " << gift
5670 << " (" << byte_u_t(gift) << ")" << dendl;
5671
5672 alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
5673 0, 0, extents);
5674 if (alloc_len) {
5675 allocated += alloc_len;
5676 size -= alloc_len;
5677 }
5678
5679 if (alloc_len < (int64_t)gift && (min_size > allocated)) {
5680 derr << __func__
5681 << " failed to allocate on 0x" << std::hex << gift
5682 << " min_size 0x" << min_size
5683 << " > allocated total 0x" << allocated
5684 << " bluefs_alloc_size 0x" << cct->_conf->bluefs_alloc_size
5685 << " allocated 0x" << alloc_len
5686 << " available 0x " << alloc->get_free()
5687 << std::dec << dendl;
7c673cae 5688
11fdf7f2
TL
5689 alloc->dump();
5690 alloc->release(*extents);
5691 extents->clear();
5692 return -ENOSPC;
5693 }
5694 } while (size && alloc_len > 0);
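 // Illustrative note (sizes hypothetical): the loop above hands space to
 // BlueFS in chunks of at most 2 GiB (1ull << 31), so a 5 GiB request would
 // normally be satisfied by three allocate() calls (2 + 2 + 1 GiB); it only
 // bails out with -ENOSPC if a call comes up short while the running total
 // is still below min_size.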
5695 for (auto& e : *extents) {
5696 dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
5697 bluefs_extents.insert(e.offset, e.length);
5698 ++out_of_sync_fm;
5699 // apply to bluefs if not requested from outside
5700 if (!extents_out) {
5701 bluefs->add_block_extent(bluefs_shared_bdev, e.offset, e.length);
5702 }
7c673cae
FG
5703 }
5704 }
7c673cae
FG
5705 return 0;
5706}
5707
11fdf7f2 5708int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
f64942e4 5709{
7c673cae
FG
5710 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
5711
5712 uint64_t my_free = alloc->get_free();
5713 uint64_t total = bdev->get_size();
5714 float my_free_ratio = (float)my_free / (float)total;
5715
5716 uint64_t total_free = bluefs_free + my_free;
5717
5718 float bluefs_ratio = (float)bluefs_free / (float)total_free;
5719
5720 dout(10) << __func__
1adf2230 5721 << " bluefs " << byte_u_t(bluefs_free)
7c673cae 5722 << " free (" << bluefs_free_ratio
1adf2230 5723 << ") bluestore " << byte_u_t(my_free)
7c673cae
FG
5724 << " free (" << my_free_ratio
5725 << "), bluefs_ratio " << bluefs_ratio
5726 << dendl;
5727
5728 uint64_t gift = 0;
5729 uint64_t reclaim = 0;
5730 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
5731 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
5732 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5733 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
1adf2230 5734 << ", should gift " << byte_u_t(gift) << dendl;
7c673cae
FG
5735 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
5736 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
5737 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
5738 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
5739 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5740 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
1adf2230 5741 << ", should reclaim " << byte_u_t(reclaim) << dendl;
7c673cae 5742 }
3efd9988
FG
5743
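 // Illustrative example (config values hypothetical, not defaults): with
 // bluefs_free = 1 GiB out of total_free = 20 GiB, bluefs_ratio is 0.05;
 // if bluestore_bluefs_min_ratio were 0.02 and bluestore_bluefs_max_ratio
 // 0.90, neither branch above fires and only the absolute minima checked
 // below (bluestore_bluefs_min, bluestore_bluefs_min_free) can still turn
 // this into a gift.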
5744 // don't take over too much of the freespace
5745 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
7c673cae 5746 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
3efd9988 5747 cct->_conf->bluestore_bluefs_min < free_cap) {
7c673cae
FG
5748 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
5749 dout(10) << __func__ << " bluefs_total " << bluefs_total
5750 << " < min " << cct->_conf->bluestore_bluefs_min
1adf2230 5751 << ", should gift " << byte_u_t(g) << dendl;
7c673cae
FG
5752 if (g > gift)
5753 gift = g;
5754 reclaim = 0;
5755 }
11fdf7f2 5756 uint64_t min_free = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
3efd9988
FG
5757 if (bluefs_free < min_free &&
5758 min_free < free_cap) {
5759 uint64_t g = min_free - bluefs_free;
11fdf7f2 5760 dout(10) << __func__ << " bluefs_free " << bluefs_free
3efd9988 5761 << " < min " << min_free
1adf2230 5762 << ", should gift " << byte_u_t(g) << dendl;
3efd9988
FG
5763 if (g > gift)
5764 gift = g;
5765 reclaim = 0;
5766 }
11fdf7f2
TL
5767 ceph_assert((int64_t)gift >= 0);
5768 ceph_assert((int64_t)reclaim >= 0);
5769 return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
5770}
7c673cae 5771
11fdf7f2
TL
5772int BlueStore::_balance_bluefs_freespace()
5773{
5774 int ret = 0;
5775 ceph_assert(bluefs);
7c673cae 5776
11fdf7f2
TL
5777 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
5778 bluefs->get_usage(&bluefs_usage);
5779 ceph_assert(bluefs_usage.size() > bluefs_shared_bdev);
7c673cae 5780
11fdf7f2
TL
5781 bool clear_alert = true;
5782 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
5783 auto& p = bluefs_usage[bluefs_shared_bdev];
5784 if (p.first != p.second) {
5785 auto& db = bluefs_usage[BlueFS::BDEV_DB];
5786 ostringstream ss;
5787 ss << "spilled over " << byte_u_t(p.second - p.first)
5788 << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
5789 << " used of " << byte_u_t(db.second) << ") to slow device";
5790 _set_spillover_alert(ss.str());
5791 clear_alert = false;
5792 }
5793 }
5794 if (clear_alert) {
5795 _clear_spillover_alert();
7c673cae
FG
5796 }
5797
11fdf7f2
TL
5798 // fixme: look at primary bdev only for now
5799 int64_t delta = _get_bluefs_size_delta(
5800 bluefs_usage[bluefs_shared_bdev].first,
5801 bluefs_usage[bluefs_shared_bdev].second);
5802
7c673cae 5803 // reclaim from bluefs?
11fdf7f2 5804 if (delta < 0) {
7c673cae 5805 // round up to alloc size
11fdf7f2 5806 auto reclaim = p2roundup(uint64_t(-delta), cct->_conf->bluefs_alloc_size);
7c673cae
FG
5807
5808 // hard cap to fit into 32 bits
11fdf7f2 5809 reclaim = std::min<uint64_t>(reclaim, 1ull << 31);
7c673cae 5810 dout(10) << __func__ << " reclaiming " << reclaim
1adf2230 5811 << " (" << byte_u_t(reclaim) << ")" << dendl;
7c673cae
FG
5812
5813 while (reclaim > 0) {
5814 // NOTE: this will block and do IO.
a8e16298 5815 PExtentVector extents;
7c673cae
FG
5816 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
5817 &extents);
5818 if (r < 0) {
5819 derr << __func__ << " failed to reclaim space from bluefs"
5820 << dendl;
5821 break;
5822 }
5823 for (auto e : extents) {
11fdf7f2 5824 ++out_of_sync_fm;
7c673cae
FG
5825 bluefs_extents.erase(e.offset, e.length);
5826 bluefs_extents_reclaiming.insert(e.offset, e.length);
5827 reclaim -= e.length;
5828 }
5829 }
5830
5831 ret = 1;
5832 }
5833
5834 return ret;
5835}
5836
7c673cae
FG
5837int BlueStore::_open_collections(int *errors)
5838{
28e407b8 5839 dout(10) << __func__ << dendl;
11fdf7f2 5840 ceph_assert(coll_map.empty());
7c673cae
FG
5841 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
5842 for (it->upper_bound(string());
5843 it->valid();
5844 it->next()) {
5845 coll_t cid;
5846 if (cid.parse(it->key())) {
5847 CollectionRef c(
5848 new Collection(
5849 this,
5850 cache_shards[cid.hash_to_shard(cache_shards.size())],
5851 cid));
5852 bufferlist bl = it->value();
11fdf7f2 5853 auto p = bl.cbegin();
7c673cae 5854 try {
11fdf7f2 5855 decode(c->cnode, p);
7c673cae
FG
5856 } catch (buffer::error& e) {
5857 derr << __func__ << " failed to decode cnode, key:"
5858 << pretty_binary_string(it->key()) << dendl;
5859 return -EIO;
5860 }
28e407b8
AA
5861 dout(20) << __func__ << " opened " << cid << " " << c
5862 << " " << c->cnode << dendl;
11fdf7f2 5863 _osr_attach(c.get());
7c673cae 5864 coll_map[cid] = c;
11fdf7f2 5865
7c673cae
FG
5866 } else {
5867 derr << __func__ << " unrecognized collection " << it->key() << dendl;
5868 if (errors)
5869 (*errors)++;
5870 }
5871 }
5872 return 0;
5873}
5874
224ce89b 5875void BlueStore::_open_statfs()
31f18b77 5876{
11fdf7f2
TL
5877 osd_pools.clear();
5878 vstatfs.reset();
5879
31f18b77 5880 bufferlist bl;
11fdf7f2 5881 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
31f18b77 5882 if (r >= 0) {
11fdf7f2 5883 per_pool_stat_collection = false;
31f18b77 5884 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
11fdf7f2 5885 auto it = bl.cbegin();
31f18b77 5886 vstatfs.decode(it);
11fdf7f2 5887 dout(10) << __func__ << " store_statfs is found" << dendl;
224ce89b 5888 } else {
31f18b77
FG
5889 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
5890 }
81eedcae 5891 _check_legacy_statfs_alert();
11fdf7f2
TL
5892 } else if (cct->_conf->bluestore_no_per_pool_stats_tolerance == "enforce") {
5893 per_pool_stat_collection = false;
5894 dout(10) << __func__ << " store_statfs is requested but missing, using empty" << dendl;
5895 } else {
5896 per_pool_stat_collection = true;
5897 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
5898 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
5899 for (it->upper_bound(string());
5900 it->valid();
5901 it->next()) {
5902
5903 uint64_t pool_id;
5904 int r = get_key_pool_stat(it->key(), &pool_id);
5905 ceph_assert(r == 0);
5906
5907 bufferlist bl;
5908 bl = it->value();
5909 auto p = bl.cbegin();
5910 auto& st = osd_pools[pool_id];
5911 try {
5912 st.decode(p);
5913 vstatfs += st;
5914
5915 dout(30) << __func__ << " pool " << pool_id
5916 << " statfs " << st << dendl;
5917 } catch (buffer::error& e) {
5918 derr << __func__ << " failed to decode pool stats, key:"
5919 << pretty_binary_string(it->key()) << dendl;
5920 }
5921 }
31f18b77 5922 }
11fdf7f2
TL
5923 dout(30) << __func__ << " statfs " << vstatfs << dendl;
5924
31f18b77
FG
5925}
5926
7c673cae
FG
5927int BlueStore::_setup_block_symlink_or_file(
5928 string name,
5929 string epath,
5930 uint64_t size,
5931 bool create)
5932{
5933 dout(20) << __func__ << " name " << name << " path " << epath
5934 << " size " << size << " create=" << (int)create << dendl;
5935 int r = 0;
91327a77 5936 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
5937 if (create)
5938 flags |= O_CREAT;
5939 if (epath.length()) {
5940 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
5941 if (r < 0) {
5942 r = -errno;
5943 derr << __func__ << " failed to create " << name << " symlink to "
5944 << epath << ": " << cpp_strerror(r) << dendl;
5945 return r;
5946 }
5947
5948 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
5949 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
5950 if (fd < 0) {
5951 r = -errno;
5952 derr << __func__ << " failed to open " << epath << " file: "
5953 << cpp_strerror(r) << dendl;
5954 return r;
5955 }
11fdf7f2
TL
5956 // write the Transport ID of the NVMe device
5957 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
5958 // where "0000:02:00.0" is the selector of a PCI device, see
5959 // the first column of "lspci -mm -n -D"
5960 string trid{"trtype:PCIe "};
5961 trid += "traddr:";
5962 trid += epath.substr(strlen(SPDK_PREFIX));
5963 r = ::write(fd, trid.c_str(), trid.size());
5964 ceph_assert(r == static_cast<int>(trid.size()));
7c673cae
FG
5965 dout(1) << __func__ << " created " << name << " symlink to "
5966 << epath << dendl;
5967 VOID_TEMP_FAILURE_RETRY(::close(fd));
5968 }
5969 }
5970 if (size) {
5971 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
5972 if (fd >= 0) {
5973 // block file is present
5974 struct stat st;
5975 int r = ::fstat(fd, &st);
5976 if (r == 0 &&
5977 S_ISREG(st.st_mode) && // if it is a regular file
5978 st.st_size == 0) { // and is 0 bytes
5979 r = ::ftruncate(fd, size);
5980 if (r < 0) {
5981 r = -errno;
5982 derr << __func__ << " failed to resize " << name << " file to "
5983 << size << ": " << cpp_strerror(r) << dendl;
5984 VOID_TEMP_FAILURE_RETRY(::close(fd));
5985 return r;
5986 }
5987
5988 if (cct->_conf->bluestore_block_preallocate_file) {
28e407b8
AA
5989 r = ::ceph_posix_fallocate(fd, 0, size);
5990 if (r > 0) {
7c673cae
FG
5991 derr << __func__ << " failed to prefallocate " << name << " file to "
5992 << size << ": " << cpp_strerror(r) << dendl;
5993 VOID_TEMP_FAILURE_RETRY(::close(fd));
5994 return -r;
5995 }
7c673cae
FG
5996 }
5997 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 5998 << byte_u_t(size) << dendl;
7c673cae
FG
5999 }
6000 VOID_TEMP_FAILURE_RETRY(::close(fd));
6001 } else {
6002 int r = -errno;
6003 if (r != -ENOENT) {
6004 derr << __func__ << " failed to open " << name << " file: "
6005 << cpp_strerror(r) << dendl;
6006 return r;
6007 }
6008 }
6009 }
6010 return 0;
6011}
6012
6013int BlueStore::mkfs()
6014{
6015 dout(1) << __func__ << " path " << path << dendl;
6016 int r;
6017 uuid_d old_fsid;
6018
6019 {
6020 string done;
6021 r = read_meta("mkfs_done", &done);
6022 if (r == 0) {
6023 dout(1) << __func__ << " already created" << dendl;
6024 if (cct->_conf->bluestore_fsck_on_mkfs) {
6025 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6026 if (r < 0) {
6027 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6028 << dendl;
6029 return r;
6030 }
6031 if (r > 0) {
6032 derr << __func__ << " fsck found " << r << " errors" << dendl;
6033 r = -EIO;
6034 }
6035 }
6036 return r; // idempotent
6037 }
6038 }
6039
6040 {
6041 string type;
6042 r = read_meta("type", &type);
6043 if (r == 0) {
6044 if (type != "bluestore") {
6045 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6046 return -EIO;
6047 }
6048 } else {
6049 r = write_meta("type", "bluestore");
6050 if (r < 0)
6051 return r;
6052 }
6053 }
6054
6055 freelist_type = "bitmap";
6056
6057 r = _open_path();
6058 if (r < 0)
6059 return r;
6060
6061 r = _open_fsid(true);
6062 if (r < 0)
6063 goto out_path_fd;
6064
6065 r = _lock_fsid();
6066 if (r < 0)
6067 goto out_close_fsid;
6068
6069 r = _read_fsid(&old_fsid);
6070 if (r < 0 || old_fsid.is_zero()) {
6071 if (fsid.is_zero()) {
6072 fsid.generate_random();
6073 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6074 } else {
6075 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6076 }
6077 // we'll write it later.
6078 } else {
6079 if (!fsid.is_zero() && fsid != old_fsid) {
6080 derr << __func__ << " on-disk fsid " << old_fsid
6081 << " != provided " << fsid << dendl;
6082 r = -EINVAL;
6083 goto out_close_fsid;
6084 }
6085 fsid = old_fsid;
6086 }
6087
6088 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6089 cct->_conf->bluestore_block_size,
6090 cct->_conf->bluestore_block_create);
6091 if (r < 0)
6092 goto out_close_fsid;
6093 if (cct->_conf->bluestore_bluefs) {
6094 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6095 cct->_conf->bluestore_block_wal_size,
6096 cct->_conf->bluestore_block_wal_create);
6097 if (r < 0)
6098 goto out_close_fsid;
6099 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6100 cct->_conf->bluestore_block_db_size,
6101 cct->_conf->bluestore_block_db_create);
6102 if (r < 0)
6103 goto out_close_fsid;
6104 }
6105
6106 r = _open_bdev(true);
6107 if (r < 0)
6108 goto out_close_fsid;
6109
3efd9988
FG
6110 // choose min_alloc_size
6111 if (cct->_conf->bluestore_min_alloc_size) {
6112 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6113 } else {
11fdf7f2 6114 ceph_assert(bdev);
3efd9988
FG
6115 if (bdev->is_rotational()) {
6116 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6117 } else {
6118 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6119 }
6120 }
11fdf7f2 6121 _validate_bdev();
3efd9988
FG
6122
6123 // make sure min_alloc_size is power of 2 aligned.
11fdf7f2 6124 if (!isp2(min_alloc_size)) {
3efd9988
FG
6125 derr << __func__ << " min_alloc_size 0x"
6126 << std::hex << min_alloc_size << std::dec
6127 << " is not power of 2 aligned!"
6128 << dendl;
6129 r = -EINVAL;
6130 goto out_close_bdev;
6131 }
6132
7c673cae
FG
6133 r = _open_db(true);
6134 if (r < 0)
6135 goto out_close_bdev;
6136
7c673cae
FG
6137 {
6138 KeyValueDB::Transaction t = db->get_transaction();
11fdf7f2
TL
6139 r = _open_fm(t);
6140 if (r < 0)
6141 goto out_close_db;
7c673cae
FG
6142 {
6143 bufferlist bl;
11fdf7f2 6144 encode((uint64_t)0, bl);
7c673cae
FG
6145 t->set(PREFIX_SUPER, "nid_max", bl);
6146 t->set(PREFIX_SUPER, "blobid_max", bl);
6147 }
6148
7c673cae
FG
6149 {
6150 bufferlist bl;
11fdf7f2 6151 encode((uint64_t)min_alloc_size, bl);
7c673cae
FG
6152 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6153 }
6154
6155 ondisk_format = latest_ondisk_format;
6156 _prepare_ondisk_format_super(t);
6157 db->submit_transaction_sync(t);
6158 }
6159
7c673cae
FG
6160 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6161 if (r < 0)
224ce89b
WB
6162 goto out_close_fm;
6163
3efd9988 6164 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 6165 if (r < 0)
224ce89b 6166 goto out_close_fm;
7c673cae
FG
6167
6168 if (fsid != old_fsid) {
6169 r = _write_fsid();
6170 if (r < 0) {
6171 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 6172 goto out_close_fm;
7c673cae
FG
6173 }
6174 }
6175
11fdf7f2
TL
6176 if (out_of_sync_fm.fetch_and(0)) {
6177 _sync_bluefs_and_fm();
6178 }
6179
7c673cae
FG
6180 out_close_fm:
6181 _close_fm();
6182 out_close_db:
6183 _close_db();
6184 out_close_bdev:
6185 _close_bdev();
6186 out_close_fsid:
6187 _close_fsid();
6188 out_path_fd:
6189 _close_path();
6190
6191 if (r == 0 &&
6192 cct->_conf->bluestore_fsck_on_mkfs) {
6193 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6194 if (rc < 0)
6195 return rc;
6196 if (rc > 0) {
6197 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6198 r = -EIO;
6199 }
11fdf7f2
TL
6200 }
6201
6202 if (r == 0) {
6203 // indicate success by writing the 'mkfs_done' file
6204 r = write_meta("mkfs_done", "yes");
6205 }
6206
6207 if (r < 0) {
6208 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6209 } else {
6210 dout(0) << __func__ << " success" << dendl;
6211 }
6212 return r;
6213}
6214
6215int BlueStore::_mount_for_bluefs()
6216{
6217 int r = _open_path();
6218 ceph_assert(r == 0);
6219 r = _open_fsid(false);
6220 ceph_assert(r == 0);
6221 r = _read_fsid(&fsid);
6222 ceph_assert(r == 0);
6223 r = _lock_fsid();
6224 ceph_assert(r == 0);
6225 r = _open_bluefs(false);
6226 ceph_assert(r == 0);
6227 return r;
6228}
6229
6230void BlueStore::_umount_for_bluefs()
6231{
6232 _close_bluefs();
6233 _close_fsid();
6234 _close_path();
6235}
6236
6237int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6238{
6239 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6240 int r;
6241 ceph_assert(path_fd < 0);
6242
6243 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6244
6245 if (!cct->_conf->bluestore_bluefs) {
6246 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6247 return -EIO;
6248 }
6249
6250 r = _mount_for_bluefs();
6251
6252 int reserved = 0;
6253 if (id == BlueFS::BDEV_NEWWAL) {
6254 string p = path + "/block.wal";
6255 r = _setup_block_symlink_or_file("block.wal", dev_path,
6256 cct->_conf->bluestore_block_wal_size,
6257 true);
6258 ceph_assert(r == 0);
6259
6260 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
6261 cct->_conf->bdev_enable_discard);
6262 ceph_assert(r == 0);
6263
6264 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6265 r = _check_or_set_bdev_label(
6266 p,
6267 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6268 "bluefs wal",
6269 true);
6270 ceph_assert(r == 0);
6271 }
6272
6273 reserved = BDEV_LABEL_BLOCK_SIZE;
6274 } else if (id == BlueFS::BDEV_NEWDB) {
6275 string p = path + "/block.db";
6276 r = _setup_block_symlink_or_file("block.db", dev_path,
6277 cct->_conf->bluestore_block_db_size,
6278 true);
6279 ceph_assert(r == 0);
6280
6281 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
6282 cct->_conf->bdev_enable_discard);
6283 ceph_assert(r == 0);
6284
6285 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6286 r = _check_or_set_bdev_label(
6287 p,
6288 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6289 "bluefs db",
6290 true);
6291 ceph_assert(r == 0);
6292 }
6293 reserved = SUPER_RESERVED;
6294 }
6295
6296 bluefs->umount();
6297 bluefs->mount();
6298
6299 bluefs->add_block_extent(
6300 id,
6301 reserved,
6302 bluefs->get_block_device_size(id) - reserved);
6303
6304 r = bluefs->prepare_new_device(id);
6305 ceph_assert(r == 0);
6306
6307 if (r < 0) {
6308 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6309 } else {
6310 dout(0) << __func__ << " success" << dendl;
6311 }
6312
6313 _umount_for_bluefs();
6314 return r;
6315}
6316
6317int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6318 int id)
6319{
6320 dout(10) << __func__ << " id:" << id << dendl;
6321 ceph_assert(path_fd < 0);
6322
6323 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6324
6325 if (!cct->_conf->bluestore_bluefs) {
6326 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6327 return -EIO;
6328 }
6329
6330 int r = _mount_for_bluefs();
6331
6332 // require bluestore_bluefs_min_free to be free at target device!
6333 uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6334 for(auto src_id : devs_source) {
6335 used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
6336 }
6337 uint64_t target_free = bluefs->get_free(id);
6338 if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
6339 // will need to remount full BlueStore instance to allocate more space
6340 _umount_for_bluefs();
6341
6342 r = mount();
6343 ceph_assert(r == 0);
6344 dout(1) << __func__
6345 << " Allocating more space at slow device for BlueFS: +"
6346 << used_space - target_free << " bytes" << dendl;
6347 r = allocate_bluefs_freespace(
6348 used_space - target_free,
6349 used_space - target_free,
6350 nullptr);
6351
6352 umount();
6353 if (r != 0) {
6354 derr << __func__
6355 << " can't migrate, unable to allocate extra space: "
6356 << used_space - target_free << " at target:" << id
6357 << dendl;
6358 return -ENOSPC;
6359 }
6360
6361 r = _mount_for_bluefs();
6362 ceph_assert(r == 0);
6363 } else if (target_free < used_space) {
6364 derr << __func__
6365 << " can't migrate, free space at target: " << target_free
6366 << " is less than required space: " << used_space
6367 << dendl;
6368 return -ENOSPC;
6369 }
6370 r = bluefs->device_migrate_to_existing(cct, devs_source, id);
6371 if (r < 0) {
6372 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6373 goto shutdown;
6374 }
6375
6376 if (devs_source.count(BlueFS::BDEV_DB)) {
6377 r = unlink(string(path + "/block.db").c_str());
6378 ceph_assert(r == 0);
6379 }
6380 if (devs_source.count(BlueFS::BDEV_WAL)) {
6381 r = unlink(string(path + "/block.wal").c_str());
6382 ceph_assert(r == 0);
6383 }
6384
6385shutdown:
6386 _umount_for_bluefs();
6387 return r;
6388}
6389
6390int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
6391 int id,
6392 const string& dev_path)
6393{
6394 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6395 int r;
6396 ceph_assert(path_fd < 0);
6397
6398 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6399
6400 if (!cct->_conf->bluestore_bluefs) {
6401 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6402 return -EIO;
6403 }
6404
6405 r = _mount_for_bluefs();
6406
6407 int reserved = 0;
6408 string link_db;
6409 string link_wal;
6410 if (devs_source.count(BlueFS::BDEV_DB) &&
6411 bluefs_shared_bdev != BlueFS::BDEV_DB) {
6412 link_db = path + "/block.db";
6413 }
6414 if (devs_source.count(BlueFS::BDEV_WAL)) {
6415 link_wal = path + "/block.wal";
6416 }
6417
6418 size_t target_size;
6419 string target_name;
6420 if (id == BlueFS::BDEV_NEWWAL) {
6421 target_name = "block.wal";
6422 target_size = cct->_conf->bluestore_block_wal_size;
6423
6424 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
6425 cct->_conf->bdev_enable_discard);
6426 ceph_assert(r == 0);
6427
6428 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6429 r = _check_or_set_bdev_label(
6430 dev_path,
6431 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6432 "bluefs wal",
6433 true);
6434 ceph_assert(r == 0);
6435 }
6436 reserved = BDEV_LABEL_BLOCK_SIZE;
6437 } else if (id == BlueFS::BDEV_NEWDB) {
6438 target_name = "block.db";
6439 target_size = cct->_conf->bluestore_block_db_size;
31f18b77 6440
11fdf7f2
TL
6441 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
6442 cct->_conf->bdev_enable_discard);
6443 ceph_assert(r == 0);
6444
6445 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6446 r = _check_or_set_bdev_label(
6447 dev_path,
6448 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6449 "bluefs db",
6450 true);
6451 ceph_assert(r == 0);
6452 }
6453 reserved = SUPER_RESERVED;
31f18b77
FG
6454 }
6455
11fdf7f2
TL
6456 bluefs->umount();
6457 bluefs->mount();
6458
6459 bluefs->add_block_extent(
6460 id, reserved, bluefs->get_block_device_size(id) - reserved);
6461
6462 r = bluefs->device_migrate_to_new(cct, devs_source, id);
6463
7c673cae 6464 if (r < 0) {
11fdf7f2
TL
6465 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6466 goto shutdown;
6467 }
6468
6469 if (!link_db.empty()) {
6470 r = unlink(link_db.c_str());
6471 ceph_assert(r == 0);
6472 }
6473 if (!link_wal.empty()) {
6474 r = unlink(link_wal.c_str());
6475 ceph_assert(r == 0);
6476 }
6477 r = _setup_block_symlink_or_file(
6478 target_name,
6479 dev_path,
6480 target_size,
6481 true);
6482 ceph_assert(r == 0);
6483 dout(0) << __func__ << " success" << dendl;
6484
6485shutdown:
6486 _umount_for_bluefs();
6487 return r;
6488}
6489
6490string BlueStore::get_device_path(unsigned id)
6491{
6492 string res;
6493 if (id < BlueFS::MAX_BDEV) {
6494 switch (id) {
6495 case BlueFS::BDEV_WAL:
6496 res = path + "/block.wal";
6497 break;
6498 case BlueFS::BDEV_DB:
6499 if (id == bluefs_shared_bdev) {
6500 res = path + "/block";
6501 } else {
6502 res = path + "/block.db";
6503 }
6504 break;
6505 case BlueFS::BDEV_SLOW:
6506 res = path + "/block";
6507 break;
6508 }
6509 }
6510 return res;
6511}
6512
6513int BlueStore::expand_devices(ostream& out)
6514{
6515 int r = _mount(false);
6516 ceph_assert(r == 0);
6517 bluefs->dump_block_extents(out);
6518 out << "Expanding..." << std::endl;
6519 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
6520 if (devid == bluefs_shared_bdev ) {
6521 continue;
6522 }
6523 uint64_t size = bluefs->get_block_device_size(devid);
6524 if (size == 0) {
6525 // no bdev
6526 continue;
6527 }
6528
6529 interval_set<uint64_t> before;
6530 bluefs->get_block_extents(devid, &before);
6531 ceph_assert(!before.empty());
6532 uint64_t end = before.range_end();
6533 if (end < size) {
6534 out << devid
6535 <<" : expanding " << " from 0x" << std::hex
6536 << end << " to 0x" << size << std::dec << std::endl;
6537 bluefs->add_block_extent(devid, end, size-end);
6538 string p = get_device_path(devid);
6539 const char* path = p.c_str();
6540 if (path == nullptr) {
6541 derr << devid
6542 <<": can't find device path " << dendl;
6543 continue;
6544 }
6545 bluestore_bdev_label_t label;
6546 int r = _read_bdev_label(cct, path, &label);
6547 if (r < 0) {
6548 derr << "unable to read label for " << path << ": "
6549 << cpp_strerror(r) << dendl;
6550 continue;
6551 }
6552 label.size = size;
6553 r = _write_bdev_label(cct, path, label);
6554 if (r < 0) {
6555 derr << "unable to write label for " << path << ": "
6556 << cpp_strerror(r) << dendl;
6557 continue;
6558 }
6559 out << devid
6560 <<" : size label updated to " << size
6561 << std::endl;
6562 }
6563 }
6564 uint64_t size0 = fm->get_size();
6565 uint64_t size = bdev->get_size();
6566 if (size0 < size) {
6567 out << bluefs_shared_bdev
6568 <<" : expanding " << " from 0x" << std::hex
6569 << size0 << " to 0x" << size << std::dec << std::endl;
6570 KeyValueDB::Transaction txn;
6571 txn = db->get_transaction();
6572 int r = fm->expand(size, txn);
6573 ceph_assert(r == 0);
6574 db->submit_transaction_sync(txn);
6575
6576 // always reference to slow device here
6577 string p = get_device_path(BlueFS::BDEV_SLOW);
6578 ceph_assert(!p.empty());
6579 const char* path = p.c_str();
6580 bluestore_bdev_label_t label;
6581 r = _read_bdev_label(cct, path, &label);
6582 if (r < 0) {
6583 derr << "unable to read label for " << path << ": "
6584 << cpp_strerror(r) << dendl;
6585 } else {
6586 label.size = size;
6587 r = _write_bdev_label(cct, path, label);
6588 if (r < 0) {
6589 derr << "unable to write label for " << path << ": "
6590 << cpp_strerror(r) << dendl;
6591 } else {
6592 out << bluefs_shared_bdev
6593 <<" : size label updated to " << size
6594 << std::endl;
6595 }
6596 }
7c673cae 6597 }
11fdf7f2 6598 umount();
7c673cae
FG
6599 return r;
6600}
6601
6602void BlueStore::set_cache_shards(unsigned num)
6603{
6604 dout(10) << __func__ << " " << num << dendl;
6605 size_t old = cache_shards.size();
11fdf7f2 6606 ceph_assert(num >= old);
7c673cae
FG
6607 cache_shards.resize(num);
6608 for (unsigned i = old; i < num; ++i) {
6609 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
6610 logger);
6611 }
6612}
6613
11fdf7f2 6614int BlueStore::_mount(bool kv_only, bool open_db)
7c673cae
FG
6615{
6616 dout(1) << __func__ << " path " << path << dendl;
6617
3efd9988
FG
6618 _kv_only = kv_only;
6619
7c673cae
FG
6620 {
6621 string type;
6622 int r = read_meta("type", &type);
6623 if (r < 0) {
6624 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
6625 << dendl;
6626 return r;
6627 }
6628
6629 if (type != "bluestore") {
6630 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6631 return -EIO;
6632 }
6633 }
6634
6635 if (cct->_conf->bluestore_fsck_on_mount) {
6636 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
6637 if (rc < 0)
6638 return rc;
6639 if (rc > 0) {
6640 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6641 return -EIO;
6642 }
6643 }
6644
6645 int r = _open_path();
6646 if (r < 0)
6647 return r;
6648 r = _open_fsid(false);
6649 if (r < 0)
6650 goto out_path;
6651
6652 r = _read_fsid(&fsid);
6653 if (r < 0)
6654 goto out_fsid;
6655
6656 r = _lock_fsid();
6657 if (r < 0)
6658 goto out_fsid;
6659
6660 r = _open_bdev(false);
6661 if (r < 0)
6662 goto out_fsid;
6663
11fdf7f2
TL
6664 if (open_db) {
6665 r = _open_db_and_around(false);
6666 } else {
6667 // we can bypass db open exclusively in case of kv_only mode
6668 ceph_assert(kv_only);
6669 r = _open_db(false, true);
6670 if (r < 0)
6671 goto out_bdev;
6672 }
7c673cae
FG
6673
6674 if (kv_only)
6675 return 0;
6676
11fdf7f2
TL
6677 r = _upgrade_super();
6678 if (r < 0) {
7c673cae 6679 goto out_db;
11fdf7f2 6680 }
7c673cae
FG
6681
6682 r = _open_collections();
6683 if (r < 0)
11fdf7f2 6684 goto out_db;
7c673cae
FG
6685
6686 r = _reload_logger();
6687 if (r < 0)
6688 goto out_coll;
6689
31f18b77 6690 _kv_start();
7c673cae
FG
6691
6692 r = _deferred_replay();
6693 if (r < 0)
6694 goto out_stop;
6695
6696 mempool_thread.init();
6697
7c673cae
FG
6698 mounted = true;
6699 return 0;
6700
6701 out_stop:
6702 _kv_stop();
7c673cae 6703 out_coll:
31f18b77 6704 _flush_cache();
7c673cae 6705 out_db:
11fdf7f2 6706 _close_db_and_around();
7c673cae
FG
6707 out_bdev:
6708 _close_bdev();
6709 out_fsid:
6710 _close_fsid();
6711 out_path:
6712 _close_path();
6713 return r;
6714}
6715
6716int BlueStore::umount()
6717{
11fdf7f2 6718 ceph_assert(_kv_only || mounted);
7c673cae
FG
6719 dout(1) << __func__ << dendl;
6720
6721 _osr_drain_all();
7c673cae 6722
7c673cae 6723 mounted = false;
3efd9988
FG
6724 if (!_kv_only) {
6725 mempool_thread.shutdown();
6726 dout(20) << __func__ << " stopping kv thread" << dendl;
6727 _kv_stop();
3efd9988
FG
6728 _flush_cache();
6729 dout(20) << __func__ << " closing" << dendl;
6730
3efd9988 6731 }
11fdf7f2 6732 _close_db_and_around();
7c673cae
FG
6733 _close_bdev();
6734 _close_fsid();
6735 _close_path();
6736
6737 if (cct->_conf->bluestore_fsck_on_umount) {
6738 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
6739 if (rc < 0)
6740 return rc;
6741 if (rc > 0) {
6742 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6743 return -EIO;
6744 }
6745 }
6746 return 0;
6747}
6748
6749static void apply(uint64_t off,
6750 uint64_t len,
6751 uint64_t granularity,
6752 BlueStore::mempool_dynamic_bitset &bitset,
7c673cae
FG
6753 std::function<void(uint64_t,
6754 BlueStore::mempool_dynamic_bitset &)> f) {
11fdf7f2 6755 auto end = round_up_to(off + len, granularity);
7c673cae
FG
6756 while (off < end) {
6757 uint64_t pos = off / granularity;
6758 f(pos, bitset);
6759 off += granularity;
6760 }
6761}
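// Usage sketch (mirrors the fsck code below; 'granularity' is the allocation
// unit size, so a byte range is mapped onto bitmap positions):
//
//   apply(e.offset, e.length, fm->get_alloc_size(), used_blocks,
//         [&](uint64_t pos, mempool_dynamic_bitset &bs) {
//           bs.set(pos);   // mark every allocation unit the extent touches
//         });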
6762
6763int BlueStore::_fsck_check_extents(
11fdf7f2 6764 const coll_t& cid,
7c673cae
FG
6765 const ghobject_t& oid,
6766 const PExtentVector& extents,
6767 bool compressed,
6768 mempool_dynamic_bitset &used_blocks,
b32b8144 6769 uint64_t granularity,
11fdf7f2 6770 BlueStoreRepairer* repairer,
7c673cae
FG
6771 store_statfs_t& expected_statfs)
6772{
6773 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
6774 int errors = 0;
6775 for (auto e : extents) {
6776 if (!e.is_valid())
6777 continue;
6778 expected_statfs.allocated += e.length;
6779 if (compressed) {
11fdf7f2 6780 expected_statfs.data_compressed_allocated += e.length;
7c673cae
FG
6781 }
6782 bool already = false;
6783 apply(
b32b8144 6784 e.offset, e.length, granularity, used_blocks,
7c673cae 6785 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2
TL
6786 ceph_assert(pos < bs.size());
6787 if (bs.test(pos)) {
6788 if (repairer) {
6789 repairer->note_misreference(
6790 pos * min_alloc_size, min_alloc_size, !already);
6791 }
6792 if (!already) {
6793 derr << "fsck error: " << oid << " extent " << e
6794 << " or a subset is already allocated (misreferenced)" << dendl;
6795 ++errors;
6796 already = true;
6797 }
6798 }
7c673cae
FG
6799 else
6800 bs.set(pos);
6801 });
11fdf7f2
TL
6802 if (repairer) {
6803 repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid);
6804 }
6805
7c673cae 6806 if (e.end() > bdev->get_size()) {
11fdf7f2 6807 derr << "fsck error: " << oid << " extent " << e
7c673cae
FG
6808 << " past end of block device" << dendl;
6809 ++errors;
6810 }
6811 }
6812 return errors;
6813}
6814
11fdf7f2
TL
6815void BlueStore::_fsck_check_pool_statfs(
6816 BlueStore::per_pool_statfs& expected_pool_statfs,
6817 bool need_per_pool_stats,
6818 int& errors,
6819 BlueStoreRepairer* repairer)
6820{
6821 auto it = db->get_iterator(PREFIX_STAT);
6822 if (it) {
6823 for (it->lower_bound(string()); it->valid(); it->next()) {
6824 string key = it->key();
6825 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
6826 if (repairer) {
6827 if (need_per_pool_stats) {
6828 ++errors;
6829 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
6830 derr << "fsck error: " << "legacy statfs record found, removing" << dendl;
6831 } else {
6832 derr << "fsck warning: " << "legacy statfs record found, bypassing" << dendl;
6833 }
6834 } else {
6835 const char* s = "fsck warning: ";
6836 if (need_per_pool_stats) {
6837 ++errors;
6838 s = "fsck error: ";
6839 }
 6840 derr << s << "legacy statfs record found, suggest running "
 6841 "store repair to get consistent statistics reports"
6842 << dendl;
6843 }
6844 continue;
6845 }
6846 if (!need_per_pool_stats) {
6847 continue;
6848 }
6849 uint64_t pool_id;
6850 if (get_key_pool_stat(key, &pool_id) < 0) {
6851 derr << "fsck error: bad key " << key
6852 << "in statfs namespece" << dendl;
6853 if (repairer) {
6854 repairer->remove_key(db, PREFIX_STAT, key);
6855 }
6856 ++errors;
6857 continue;
6858 }
6859
6860 volatile_statfs vstatfs;
6861 bufferlist bl = it->value();
6862 auto blp = bl.cbegin();
6863 try {
6864 vstatfs.decode(blp);
6865 } catch (buffer::error& e) {
6866 derr << "fsck error: failed to decode Pool StatFS record"
6867 << pretty_binary_string(key) << dendl;
6868 if (repairer) {
6869 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
6870 << pretty_binary_string(key)
6871 << "', removing" << dendl;
6872 repairer->remove_key(db, PREFIX_STAT, key);
6873 }
6874 ++errors;
6875 vstatfs.reset();
6876 }
6877 auto stat_it = expected_pool_statfs.find(pool_id);
6878 if (stat_it == expected_pool_statfs.end()) {
6879 if (vstatfs.is_empty()) {
6880 // we don't consider that as an error since empty pool statfs
6881 // are left in DB for now
 6882 dout(20) << "fsck info: found empty stray Pool StatFS record for pool id 0x"
6883 << std::hex << pool_id << std::dec << dendl;
6884 if (repairer) {
6885 // but we need to increment error count in case of repair
6886 // to have proper counters at the end
6887 // (as repairer increments recovery counter anyway).
6888 ++errors;
6889 }
6890 } else {
6891 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
6892 << std::hex << pool_id << std::dec << dendl;
6893 ++errors;
6894 }
6895 if (repairer) {
6896 repairer->remove_key(db, PREFIX_SHARED_BLOB, key);
6897 }
6898 continue;
6899 }
6900 store_statfs_t statfs;
6901 vstatfs.publish(&statfs);
6902 if (!(stat_it->second == statfs)) {
6903 derr << "fsck error: actual " << statfs
6904 << " != expected " << stat_it->second
6905 << " for pool "
6906 << std::hex << pool_id << std::dec << dendl;
6907 if (repairer) {
6908 repairer->fix_statfs(db, key, stat_it->second);
6909 }
6910 ++errors;
6911 }
6912 expected_pool_statfs.erase(stat_it);
6913 }
6914 } // if (it)
6915 for( auto s = expected_pool_statfs.begin(); s != expected_pool_statfs.end();
6916 ++s) {
6917 if (s->second.is_zero()) {
6918 // we might lack empty statfs recs in DB
6919 continue;
6920 }
6921 derr << "fsck error: missing Pool StatFS record for pool "
6922 << std::hex << s->first << std::dec << dendl;
6923 if (repairer) {
6924 string key;
6925 get_pool_stat_key(s->first, &key);
6926 repairer->fix_statfs(db, key, s->second);
6927 }
6928 ++errors;
6929 }
6930}
6931
6932/**
6933An overview for currently implemented repair logics
6934performed in fsck in two stages: detection(+preparation) and commit.
6935Detection stage (in processing order):
6936 (Issue -> Repair action to schedule)
6937 - Detect undecodable keys for Shared Blobs -> Remove
6938 - Detect undecodable records for Shared Blobs -> Remove
6939 (might trigger missed Shared Blob detection below)
6940 - Detect stray records for Shared Blobs -> Remove
6941 - Detect misreferenced pextents -> Fix
6942 Prepare Bloom-like filter to track cid/oid -> pextent
6943 Prepare list of extents that are improperly referenced
6944 Enumerate Onode records that might use 'misreferenced' pextents
6945 (Bloom-like filter applied to reduce computation)
 6946 For each questionable Onode enumerate all blobs and identify broken ones
6947 (i.e. blobs having 'misreferences')
6948 Rewrite each broken blob data by allocating another extents and
6949 copying data there
6950 If blob is shared - unshare it and mark corresponding Shared Blob
6951 for removal
6952 Release previously allocated space
6953 Update Extent Map
6954 - Detect missed Shared Blobs -> Recreate
6955 - Detect undecodable deferred transaction -> Remove
6956 - Detect Freelist Manager's 'false free' entries -> Mark as used
6957 - Detect Freelist Manager's leaked entries -> Mark as free
6958 - Detect statfs inconsistency - Update
6959 Commit stage (separate DB commit per each step):
6960 - Apply leaked FM entries fix
6961 - Apply 'false free' FM entries fix
6962 - Apply 'Remove' actions
6963 - Apply fix for misreference pextents
6964 - Apply Shared Blob recreate
 6965 (can be merged with the step above if misreferences were detected)
6966 - Apply StatFS update
6967*/
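/*
 A minimal sketch of the detect-then-commit split described above, using only
 BlueStoreRepairer calls that actually appear later in this file; the control
 flow and the names bad_statfs_key, statfs_mismatch and expected are
 illustrative placeholders, not code taken from this function:

   BlueStoreRepairer repairer;
   // detection stage: while scanning, queue fixes instead of applying them
   if (bad_statfs_key)
     repairer.remove_key(db, PREFIX_STAT, key);     // schedule key removal
   if (statfs_mismatch)
     repairer.fix_statfs(db, key, expected);        // schedule statfs update
   // commit stage: a separate step (not shown in this excerpt) submits the
   // queued fixes to the DB, one transaction per repair class as listed above
*/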
3efd9988 6968int BlueStore::_fsck(bool deep, bool repair)
7c673cae 6969{
3efd9988 6970 dout(1) << __func__
11fdf7f2
TL
6971 << " <<<START>>>"
6972 << (repair ? " repair" : " check")
3efd9988 6973 << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
7c673cae 6974 int errors = 0;
11fdf7f2 6975 unsigned repaired = 0;
31f18b77
FG
6976
6977 typedef btree::btree_set<
6978 uint64_t,std::less<uint64_t>,
6979 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
6980 uint64_t_btree_t used_nids;
6981 uint64_t_btree_t used_omap_head;
11fdf7f2 6982 uint64_t_btree_t used_pgmeta_omap_head;
31f18b77
FG
6983 uint64_t_btree_t used_sbids;
6984
7c673cae 6985 mempool_dynamic_bitset used_blocks;
7c673cae 6986 KeyValueDB::Iterator it;
11fdf7f2
TL
6987 store_statfs_t expected_store_statfs, actual_statfs;
6988 per_pool_statfs expected_pool_statfs;
6989
7c673cae 6990 struct sb_info_t {
11fdf7f2
TL
6991 coll_t cid;
6992 int64_t pool_id = INT64_MIN;
7c673cae
FG
6993 list<ghobject_t> oids;
6994 SharedBlobRef sb;
6995 bluestore_extent_ref_map_t ref_map;
11fdf7f2
TL
6996 bool compressed = false;
6997 bool passed = false;
6998 bool updated = false;
7c673cae
FG
6999 };
7000 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
7001
7002 uint64_t num_objects = 0;
7003 uint64_t num_extents = 0;
7004 uint64_t num_blobs = 0;
7005 uint64_t num_spanning_blobs = 0;
7006 uint64_t num_shared_blobs = 0;
7007 uint64_t num_sharded_objects = 0;
7008 uint64_t num_object_shards = 0;
11fdf7f2
TL
7009 BlueStoreRepairer repairer;
7010 store_statfs_t* expected_statfs = nullptr;
7011 // in deep mode we need R/W write access to be able to replay deferred ops
7012 bool read_only = !(repair || deep);
7c673cae
FG
7013
7014 utime_t start = ceph_clock_now();
11fdf7f2
TL
7015 const auto& no_pps_mode = cct->_conf->bluestore_no_per_pool_stats_tolerance;
7016 bool need_per_pool_stats = no_pps_mode == "until_fsck" ||
7017 (no_pps_mode == "until_repair" && repair);
7018 bool enforce_no_per_pool_stats = no_pps_mode == "enforce";
7c673cae
FG
7019
7020 int r = _open_path();
7021 if (r < 0)
7022 return r;
7023 r = _open_fsid(false);
7024 if (r < 0)
7025 goto out_path;
7026
7027 r = _read_fsid(&fsid);
7028 if (r < 0)
7029 goto out_fsid;
7030
7031 r = _lock_fsid();
7032 if (r < 0)
7033 goto out_fsid;
7034
7035 r = _open_bdev(false);
7036 if (r < 0)
7037 goto out_fsid;
7038
11fdf7f2 7039 r = _open_db_and_around(read_only);
7c673cae
FG
7040 if (r < 0)
7041 goto out_bdev;
7042
11fdf7f2
TL
7043 if (!read_only) {
7044 r = _upgrade_super();
7045 if (r < 0) {
7046 goto out_db;
7047 }
7048 }
7c673cae
FG
7049
7050 r = _open_collections(&errors);
7051 if (r < 0)
11fdf7f2 7052 goto out_db;
7c673cae
FG
7053
7054 mempool_thread.init();
7055
11fdf7f2
TL
7056 // we need finisher and kv_{sync,finalize}_thread *just* for replay
7057 // enable in repair or deep mode modes only
7058 if (!read_only) {
7059 _kv_start();
7060 r = _deferred_replay();
7061 _kv_stop();
7062 }
7c673cae
FG
7063 if (r < 0)
7064 goto out_scan;
7065
b32b8144 7066 used_blocks.resize(fm->get_alloc_units());
7c673cae 7067 apply(
11fdf7f2 7068 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
7c673cae 7069 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7070 ceph_assert(pos < bs.size());
7c673cae
FG
7071 bs.set(pos);
7072 }
7073 );
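  // apply() is a small helper defined earlier in this file; a sketch of its
  // assumed semantics: translate the byte range into allocation-unit
  // positions and invoke the callback once per unit (round_up_to is the
  // intarith helper):
  //
  //   for (uint64_t pos = off / granularity;
  //        pos < round_up_to(off + len, granularity) / granularity;
  //        ++pos) {
  //     f(pos, bitset);   // here: mark the unit as used
  //   }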
11fdf7f2
TL
7074 if (repair) {
7075 repairer.get_space_usage_tracker().init(
7076 bdev->get_size(),
7077 min_alloc_size);
7078 }
7c673cae
FG
7079
7080 if (bluefs) {
11fdf7f2
TL
7081 if( cct->_conf->bluestore_bluefs_db_compatibility) {
7082 interval_set<uint64_t> bluefs_extents_db;
7083 bufferlist bl;
7084 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7085 auto p = bl.cbegin();
7086 auto prev_errors = errors;
7087 try {
7088 decode(bluefs_extents_db, p);
7089 bluefs_extents_db.union_of(bluefs_extents);
7090 bluefs_extents_db.subtract(bluefs_extents);
7091 if (!bluefs_extents_db.empty()) {
7092 derr << "fsck error: bluefs_extents inconsistency, "
7093 << "downgrade to previous releases might be broken."
7094 << dendl;
7095 ++errors;
7096 }
7097 }
7098 catch (buffer::error& e) {
7099 derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
7100 ++errors;
7101 }
7102 if (errors != prev_errors && repair) {
7103 repairer.fix_bluefs_extents(out_of_sync_fm);
7104 }
7105 }
7106
7c673cae
FG
7107 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
7108 apply(
b32b8144 7109 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 7110 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7111 ceph_assert(pos < bs.size());
7c673cae
FG
7112 bs.set(pos);
7113 }
7114 );
7115 }
7116 r = bluefs->fsck();
7117 if (r < 0) {
7118 goto out_scan;
7119 }
7120 if (r > 0)
7121 errors += r;
7122 }
7123
11fdf7f2 7124 // get expected statfs; reset unaffected fields to be able to compare
7c673cae
FG
7125 // structs
7126 statfs(&actual_statfs);
11fdf7f2
TL
7127 actual_statfs.total = 0;
7128 actual_statfs.internally_reserved = 0;
7129 actual_statfs.available = 0;
7130 actual_statfs.internal_metadata = 0;
7131 actual_statfs.omap_allocated = 0;
7132
7133 need_per_pool_stats = per_pool_stat_collection || need_per_pool_stats;
7c673cae
FG
7134
7135 // walk PREFIX_OBJ
7136 dout(1) << __func__ << " walking object keyspace" << dendl;
7137 it = db->get_iterator(PREFIX_OBJ);
7138 if (it) {
11fdf7f2
TL
7139 //fill global if not overridden below
7140 expected_statfs = &expected_store_statfs;
7141
7c673cae
FG
7142 CollectionRef c;
7143 spg_t pgid;
7144 mempool::bluestore_fsck::list<string> expecting_shards;
7145 for (it->lower_bound(string()); it->valid(); it->next()) {
11fdf7f2 7146 if (g_conf()->bluestore_debug_fsck_abort) {
31f18b77
FG
7147 goto out_scan;
7148 }
11fdf7f2
TL
7149 dout(30) << __func__ << " key "
7150 << pretty_binary_string(it->key()) << dendl;
7c673cae
FG
7151 if (is_extent_shard_key(it->key())) {
7152 while (!expecting_shards.empty() &&
7153 expecting_shards.front() < it->key()) {
3efd9988 7154 derr << "fsck error: missing shard key "
7c673cae
FG
7155 << pretty_binary_string(expecting_shards.front())
7156 << dendl;
7157 ++errors;
7158 expecting_shards.pop_front();
7159 }
7160 if (!expecting_shards.empty() &&
7161 expecting_shards.front() == it->key()) {
7162 // all good
7163 expecting_shards.pop_front();
7164 continue;
7165 }
7166
7167 uint32_t offset;
7168 string okey;
7169 get_key_extent_shard(it->key(), &okey, &offset);
3efd9988 7170 derr << "fsck error: stray shard 0x" << std::hex << offset
7c673cae
FG
7171 << std::dec << dendl;
7172 if (expecting_shards.empty()) {
3efd9988 7173 derr << "fsck error: " << pretty_binary_string(it->key())
7c673cae
FG
7174 << " is unexpected" << dendl;
7175 ++errors;
7176 continue;
7177 }
7178 while (expecting_shards.front() > it->key()) {
3efd9988 7179 derr << "fsck error: saw " << pretty_binary_string(it->key())
7c673cae 7180 << dendl;
3efd9988 7181 derr << "fsck error: exp "
7c673cae
FG
7182 << pretty_binary_string(expecting_shards.front()) << dendl;
7183 ++errors;
7184 expecting_shards.pop_front();
7185 if (expecting_shards.empty()) {
7186 break;
7187 }
7188 }
7189 continue;
7190 }
7191
7192 ghobject_t oid;
7193 int r = get_key_object(it->key(), &oid);
7194 if (r < 0) {
3efd9988 7195 derr << "fsck error: bad object key "
7c673cae
FG
7196 << pretty_binary_string(it->key()) << dendl;
7197 ++errors;
7198 continue;
7199 }
7200 if (!c ||
7201 oid.shard_id != pgid.shard ||
11fdf7f2 7202 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7c673cae
FG
7203 !c->contains(oid)) {
7204 c = nullptr;
11fdf7f2
TL
7205 for (auto& p : coll_map) {
7206 if (p.second->contains(oid)) {
7207 c = p.second;
7c673cae
FG
7208 break;
7209 }
7210 }
7211 if (!c) {
3efd9988 7212 derr << "fsck error: stray object " << oid
7c673cae
FG
7213 << " not owned by any collection" << dendl;
7214 ++errors;
7215 continue;
7216 }
11fdf7f2
TL
7217 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
7218 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
7219 << dendl;
7220 if (need_per_pool_stats) {
7221 expected_statfs = &expected_pool_statfs[pool_id];
7222 }
7223
28e407b8
AA
7224 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
7225 << dendl;
7c673cae
FG
7226 }
7227
7228 if (!expecting_shards.empty()) {
7229 for (auto &k : expecting_shards) {
3efd9988 7230 derr << "fsck error: missing shard key "
7c673cae
FG
7231 << pretty_binary_string(k) << dendl;
7232 }
7233 ++errors;
7234 expecting_shards.clear();
7235 }
7236
7237 dout(10) << __func__ << " " << oid << dendl;
11fdf7f2 7238 store_statfs_t onode_statfs;
7c673cae
FG
7239 RWLock::RLocker l(c->lock);
7240 OnodeRef o = c->get_onode(oid, false);
7241 if (o->onode.nid) {
7242 if (o->onode.nid > nid_max) {
3efd9988 7243 derr << "fsck error: " << oid << " nid " << o->onode.nid
7c673cae
FG
7244 << " > nid_max " << nid_max << dendl;
7245 ++errors;
7246 }
7247 if (used_nids.count(o->onode.nid)) {
3efd9988 7248 derr << "fsck error: " << oid << " nid " << o->onode.nid
7c673cae
FG
7249 << " already in use" << dendl;
7250 ++errors;
7251 continue; // go for next object
7252 }
7253 used_nids.insert(o->onode.nid);
7254 }
7255 ++num_objects;
7256 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7257 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
81eedcae 7258 _dump_onode<30>(cct, *o);
7c673cae
FG
7259 // shards
7260 if (!o->extent_map.shards.empty()) {
7261 ++num_sharded_objects;
7262 num_object_shards += o->extent_map.shards.size();
7263 }
7264 for (auto& s : o->extent_map.shards) {
7265 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
7266 expecting_shards.push_back(string());
7267 get_extent_shard_key(o->key, s.shard_info->offset,
7268 &expecting_shards.back());
7269 if (s.shard_info->offset >= o->onode.size) {
3efd9988 7270 derr << "fsck error: " << oid << " shard 0x" << std::hex
7c673cae
FG
7271 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7272 << std::dec << dendl;
7273 ++errors;
7274 }
7275 }
7276 // lextents
7277 map<BlobRef,bluestore_blob_t::unused_t> referenced;
7278 uint64_t pos = 0;
7279 mempool::bluestore_fsck::map<BlobRef,
7280 bluestore_blob_use_tracker_t> ref_map;
7281 for (auto& l : o->extent_map.extent_map) {
7282 dout(20) << __func__ << " " << l << dendl;
7283 if (l.logical_offset < pos) {
3efd9988 7284 derr << "fsck error: " << oid << " lextent at 0x"
7c673cae
FG
7285 << std::hex << l.logical_offset
7286 << " overlaps with the previous, which ends at 0x" << pos
7287 << std::dec << dendl;
7288 ++errors;
7289 }
7290 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
3efd9988 7291 derr << "fsck error: " << oid << " lextent at 0x"
7c673cae
FG
7292 << std::hex << l.logical_offset << "~" << l.length
7293 << " spans a shard boundary"
7294 << std::dec << dendl;
7295 ++errors;
7296 }
7297 pos = l.logical_offset + l.length;
11fdf7f2
TL
7298 onode_statfs.data_stored += l.length;
7299 ceph_assert(l.blob);
7c673cae
FG
7300 const bluestore_blob_t& blob = l.blob->get_blob();
7301
7302 auto& ref = ref_map[l.blob];
7303 if (ref.is_empty()) {
7304 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7305 uint32_t l = blob.get_logical_length();
7306 ref.init(l, min_release_size);
7307 }
7308 ref.get(
7309 l.blob_offset,
7310 l.length);
7311 ++num_extents;
7312 if (blob.has_unused()) {
7313 auto p = referenced.find(l.blob);
7314 bluestore_blob_t::unused_t *pu;
7315 if (p == referenced.end()) {
7316 pu = &referenced[l.blob];
7317 } else {
7318 pu = &p->second;
7319 }
7320 uint64_t blob_len = blob.get_logical_length();
11fdf7f2
TL
7321 ceph_assert((blob_len % (sizeof(*pu)*8)) == 0);
7322 ceph_assert(l.blob_offset + l.length <= blob_len);
7c673cae
FG
7323 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
7324 uint64_t start = l.blob_offset / chunk_size;
7325 uint64_t end =
11fdf7f2 7326 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7c673cae
FG
7327 for (auto i = start; i < end; ++i) {
7328 (*pu) |= (1u << i);
7329 }
7330 }
7331 }
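  // A sketch of the unused-bitmap bookkeeping above: blob.unused has
  // sizeof(unused)*8 bits, each covering an equal slice of the blob's logical
  // length (e.g. with a 16-bit map and a 64 KiB blob, one bit per 4 KiB).
  // The bits a logical extent touches are, roughly:
  //
  //   uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
  //   for (auto i = l.blob_offset / chunk_size;
  //        i < round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
  //        ++i)
  //     referenced_bits |= (1u << i);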
7332 for (auto &i : referenced) {
7333 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
7334 << std::dec << " for " << *i.first << dendl;
7335 const bluestore_blob_t& blob = i.first->get_blob();
7336 if (i.second & blob.unused) {
3efd9988 7337 derr << "fsck error: " << oid << " blob claims unused 0x"
7c673cae 7338 << std::hex << blob.unused
11fdf7f2 7339 << " but extents reference 0x" << i.second << std::dec
7c673cae
FG
7340 << " on blob " << *i.first << dendl;
7341 ++errors;
7342 }
7343 if (blob.has_csum()) {
7344 uint64_t blob_len = blob.get_logical_length();
7345 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
7346 unsigned csum_count = blob.get_csum_count();
7347 unsigned csum_chunk_size = blob.get_csum_chunk_size();
7348 for (unsigned p = 0; p < csum_count; ++p) {
7349 unsigned pos = p * csum_chunk_size;
7350 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
7351 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
7352 unsigned mask = 1u << firstbit;
7353 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
7354 mask |= 1u << b;
7355 }
7356 if ((blob.unused & mask) == mask) {
7357 // this csum chunk region is marked unused
7358 if (blob.get_csum_item(p) != 0) {
3efd9988 7359 derr << "fsck error: " << oid
7c673cae
FG
7360 << " blob claims csum chunk 0x" << std::hex << pos
7361 << "~" << csum_chunk_size
7362 << " is unused (mask 0x" << mask << " of unused 0x"
7363 << blob.unused << ") but csum is non-zero 0x"
7364 << blob.get_csum_item(p) << std::dec << " on blob "
7365 << *i.first << dendl;
7366 ++errors;
7367 }
7368 }
7369 }
7370 }
7371 }
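  // Worked example for the csum-vs-unused cross-check above, assuming a
  // 64 KiB blob, a 16-bit unused map (so 4 KiB per bit) and 8 KiB csum
  // chunks: csum item p starts at pos = p * 0x2000 and covers unused bits
  //
  //   firstbit = pos / 0x1000                 = 2p
  //   lastbit  = (pos + 0x2000 - 1) / 0x1000  = 2p + 1
  //
  // and its checksum must be zero whenever both bits are set in blob.unused.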
7372 for (auto &i : ref_map) {
7373 ++num_blobs;
7374 const bluestore_blob_t& blob = i.first->get_blob();
7375 bool equal = i.first->get_blob_use_tracker().equal(i.second);
7376 if (!equal) {
3efd9988 7377 derr << "fsck error: " << oid << " blob " << *i.first
7c673cae
FG
7378 << " doesn't match expected ref_map " << i.second << dendl;
7379 ++errors;
7380 }
7381 if (blob.is_compressed()) {
11fdf7f2
TL
7382 onode_statfs.data_compressed += blob.get_compressed_payload_length();
7383 onode_statfs.data_compressed_original +=
7c673cae
FG
7384 i.first->get_referenced_bytes();
7385 }
7386 if (blob.is_shared()) {
7387 if (i.first->shared_blob->get_sbid() > blobid_max) {
3efd9988 7388 derr << "fsck error: " << oid << " blob " << blob
7c673cae
FG
7389 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7390 << blobid_max << dendl;
7391 ++errors;
7392 } else if (i.first->shared_blob->get_sbid() == 0) {
3efd9988 7393 derr << "fsck error: " << oid << " blob " << blob
7c673cae
FG
7394 << " marked as shared but has uninitialized sbid"
7395 << dendl;
7396 ++errors;
7397 }
7398 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
11fdf7f2
TL
7399 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7400 ceph_assert(sbi.pool_id == INT64_MIN ||
7401 sbi.pool_id == oid.hobj.get_logical_pool());
7402 sbi.cid = c->cid;
7403 sbi.pool_id = oid.hobj.get_logical_pool();
7c673cae
FG
7404 sbi.sb = i.first->shared_blob;
7405 sbi.oids.push_back(oid);
7406 sbi.compressed = blob.is_compressed();
7407 for (auto e : blob.get_extents()) {
7408 if (e.is_valid()) {
7409 sbi.ref_map.get(e.offset, e.length);
7410 }
7411 }
7412 } else {
11fdf7f2 7413 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7c673cae
FG
7414 blob.is_compressed(),
7415 used_blocks,
b32b8144 7416 fm->get_alloc_size(),
11fdf7f2
TL
7417 repair ? &repairer : nullptr,
7418 onode_statfs);
7c673cae
FG
7419 }
7420 }
7421 if (deep) {
7422 bufferlist bl;
a8e16298
TL
7423 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
7424 uint64_t offset = 0;
7425 do {
7426 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
7427 int r = _do_read(c.get(), o, offset, l, bl,
7428 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
7429 if (r < 0) {
7430 ++errors;
7431 derr << "fsck error: " << oid << std::hex
7432 << " error during read: "
7433 << " " << offset << "~" << l
7434 << " " << cpp_strerror(r) << std::dec
7435 << dendl;
7436 break;
7437 }
7438 offset += l;
7439 } while (offset < o->onode.size);
7c673cae
FG
7440 }
7441 // omap
7442 if (o->onode.has_omap()) {
11fdf7f2
TL
7443 auto& m =
7444 o->onode.is_pgmeta_omap() ? used_pgmeta_omap_head : used_omap_head;
7445 if (m.count(o->onode.nid)) {
3efd9988 7446 derr << "fsck error: " << oid << " omap_head " << o->onode.nid
7c673cae
FG
7447 << " already in use" << dendl;
7448 ++errors;
7449 } else {
11fdf7f2 7450 m.insert(o->onode.nid);
7c673cae
FG
7451 }
7452 }
11fdf7f2
TL
7453 expected_statfs->add(onode_statfs);
7454 } // for (it->lower_bound(string()); it->valid(); it->next())
7455 } // if (it)
7456
7c673cae
FG
7457 dout(1) << __func__ << " checking shared_blobs" << dendl;
7458 it = db->get_iterator(PREFIX_SHARED_BLOB);
7459 if (it) {
11fdf7f2
TL
7460 //fill global if not overridden below
7461 expected_statfs = &expected_store_statfs;
7462
7c673cae
FG
7463 for (it->lower_bound(string()); it->valid(); it->next()) {
7464 string key = it->key();
7465 uint64_t sbid;
7466 if (get_key_shared_blob(key, &sbid)) {
3efd9988 7467 derr << "fsck error: bad key '" << key
7c673cae 7468 << "' in shared blob namespace" << dendl;
11fdf7f2
TL
7469 if (repair) {
7470 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
7471 }
7c673cae
FG
7472 ++errors;
7473 continue;
7474 }
7475 auto p = sb_info.find(sbid);
7476 if (p == sb_info.end()) {
3efd9988 7477 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae 7478 << std::hex << sbid << std::dec << dendl;
11fdf7f2
TL
7479 if (repair) {
7480 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
7481 }
7c673cae
FG
7482 ++errors;
7483 } else {
7484 ++num_shared_blobs;
7485 sb_info_t& sbi = p->second;
7486 bluestore_shared_blob_t shared_blob(sbid);
7487 bufferlist bl = it->value();
11fdf7f2
TL
7488 auto blp = bl.cbegin();
7489 try {
7490 decode(shared_blob, blp);
7491 } catch (buffer::error& e) {
7492 ++errors;
7493 // Force update and don't report as missing
7494 sbi.updated = sbi.passed = true;
7495
7496 derr << "fsck error: failed to decode Shared Blob "
7497 << pretty_binary_string(it->key()) << dendl;
7498 if (repair) {
7499 dout(20) << __func__ << " undecodable Shared Blob, key:'"
7500 << pretty_binary_string(it->key())
7501 << "', removing" << dendl;
7502 repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
7503 }
7504 continue;
7505 }
7c673cae
FG
7506 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
7507 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 7508 derr << "fsck error: shared blob 0x" << std::hex << sbid
11fdf7f2
TL
7509 << std::dec << " ref_map " << shared_blob.ref_map
7510 << " != expected " << sbi.ref_map << dendl;
7511 sbi.updated = true; // will update later in repair mode only!
7c673cae
FG
7512 ++errors;
7513 }
7514 PExtentVector extents;
7515 for (auto &r : shared_blob.ref_map.ref_map) {
7516 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
7517 }
11fdf7f2
TL
7518 if (need_per_pool_stats) {
7519 expected_statfs = &expected_pool_statfs[sbi.pool_id];
7520 }
7521 errors += _fsck_check_extents(sbi.cid,
7522 p->second.oids.front(),
7c673cae
FG
7523 extents,
7524 p->second.compressed,
b32b8144
FG
7525 used_blocks,
7526 fm->get_alloc_size(),
11fdf7f2
TL
7527 repair ? &repairer : nullptr,
7528 *expected_statfs);
7529 sbi.passed = true;
7530 }
7531 }
7532 } // if (it)
7533
7534 if (repair && repairer.preprocess_misreference(db)) {
7535
7536 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
7537 auto& space_tracker = repairer.get_space_usage_tracker();
7538 auto& misref_extents = repairer.get_misreferences();
7539 interval_set<uint64_t> to_release;
7540 it = db->get_iterator(PREFIX_OBJ);
7541 if (it) {
7542 //fill global if not overridden below
7543 expected_statfs = &expected_store_statfs;
7544
7545 CollectionRef c;
7546 spg_t pgid;
7547 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
7548 bool bypass_rest = false;
7549 for (it->lower_bound(string()); it->valid() && !bypass_rest;
7550 it->next()) {
7551 dout(30) << __func__ << " key "
7552 << pretty_binary_string(it->key()) << dendl;
7553 if (is_extent_shard_key(it->key())) {
7554 continue;
7555 }
7556
7557 ghobject_t oid;
7558 int r = get_key_object(it->key(), &oid);
7559 if (r < 0 || !space_tracker.is_used(oid)) {
7560 continue;
7561 }
7562
7563 if (!c ||
7564 oid.shard_id != pgid.shard ||
7565 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7566 !c->contains(oid)) {
7567 c = nullptr;
7568 for (auto& p : coll_map) {
7569 if (p.second->contains(oid)) {
7570 c = p.second;
7571 break;
7572 }
7573 }
7574 if (!c) {
7575 continue;
7576 }
7577 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
7578 if (need_per_pool_stats) {
7579 expected_statfs = &expected_pool_statfs[pool_id];
7580 }
7581 }
7582 if (!space_tracker.is_used(c->cid)) {
7583 continue;
7584 }
7585
7586 dout(20) << __func__ << " check misreference for col:" << c->cid
7587 << " obj:" << oid << dendl;
7588
7589 RWLock::RLocker l(c->lock);
7590 OnodeRef o = c->get_onode(oid, false);
7591 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7592 mempool::bluestore_fsck::set<BlobRef> blobs;
7593
7594 for (auto& e : o->extent_map.extent_map) {
7595 blobs.insert(e.blob);
7596 }
7597 bool need_onode_update = false;
7598 bool first_dump = true;
7599 for(auto b : blobs) {
7600 bool broken_blob = false;
7601 auto& pextents = b->dirty_blob().dirty_extents();
7602 for (auto& e : pextents) {
7603 if (!e.is_valid()) {
7604 continue;
7605 }
7606 // for the sake of simplicity and proper shared blob handling
7607 // always rewrite the whole blob even when it's partially
7608 // misreferenced.
7609 if (misref_extents.intersects(e.offset, e.length)) {
7610 if (first_dump) {
7611 first_dump = false;
81eedcae 7612 _dump_onode<10>(cct, *o);
11fdf7f2
TL
7613 }
7614 broken_blob = true;
7615 break;
7616 }
7617 }
7618 if (!broken_blob)
7619 continue;
7620 bool compressed = b->get_blob().is_compressed();
7621 need_onode_update = true;
7622 dout(10) << __func__
7623 << " fix misreferences in oid:" << oid
7624 << " " << *b << dendl;
7625 uint64_t b_off = 0;
7626 PExtentVector pext_to_release;
7627 pext_to_release.reserve(pextents.size());
7628 // rewriting all valid pextents
7629 for (auto e = pextents.begin(); e != pextents.end();
7630 b_off += e->length, e++) {
7631 if (!e->is_valid()) {
7632 continue;
7633 }
7634 PExtentVector exts;
7635 int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
7636 0, 0, &exts);
7637 if (alloc_len < (int64_t)e->length) {
7638 derr << __func__
7639 << " failed to allocate 0x" << std::hex << e->length
7640 << " allocated 0x " << alloc_len
7641 << " min_alloc_size 0x" << min_alloc_size
7642 << " available 0x " << alloc->get_free()
7643 << std::dec << dendl;
7644 if (alloc_len > 0) {
7645 alloc->release(exts);
7646 }
7647 bypass_rest = true;
7648 break;
7649 }
7650 expected_statfs->allocated += e->length;
7651 if (compressed) {
7652 expected_statfs->data_compressed_allocated += e->length;
7653 }
7654
7655 bufferlist bl;
7656 IOContext ioc(cct, NULL, true); // allow EIO
7657 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
7658 if (r < 0) {
7659 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
7660 <<"~" << e->length << std::dec << dendl;
7661 ceph_abort_msg("read failed, wtf");
7662 }
7663 pext_to_release.push_back(*e);
7664 e = pextents.erase(e);
7665 e = pextents.insert(e, exts.begin(), exts.end());
7666 b->get_blob().map_bl(
7667 b_off, bl,
7668 [&](uint64_t offset, bufferlist& t) {
7669 int r = bdev->write(offset, t, false);
7670 ceph_assert(r == 0);
7671 });
7672 e += exts.size() - 1;
7673 for (auto& p : exts) {
7674 fm->allocate(p.offset, p.length, txn);
7675 }
7676 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
7677
7678 if (b->get_blob().is_shared()) {
7679 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
7680
7681 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
7682 ceph_assert(sb_it != sb_info.end());
7683 sb_info_t& sbi = sb_it->second;
7684
7685 for (auto& r : sbi.ref_map.ref_map) {
7686 expected_statfs->allocated -= r.second.length;
7687 if (sbi.compressed) {
7688 // NB: it's crucial to use compressed flag from sb_info_t
7689 // as we originally used that value while accumulating
7690 // expected_statfs
7691 expected_statfs->data_compressed_allocated -= r.second.length;
7692 }
7693 }
7694 sbi.updated = sbi.passed = true;
7695 sbi.ref_map.clear();
7696
7697 // relying on blob's pextents to decide what to release.
7698 for (auto& p : pext_to_release) {
7699 to_release.union_insert(p.offset, p.length);
7700 }
7701 } else {
7702 for (auto& p : pext_to_release) {
7703 expected_statfs->allocated -= p.length;
7704 if (compressed) {
7705 expected_statfs->data_compressed_allocated -= p.length;
7706 }
7707 to_release.union_insert(p.offset, p.length);
7708 }
7709 }
7710 if (bypass_rest) {
7711 break;
7712 }
7713 } // for(auto b : blobs)
7714 if (need_onode_update) {
7715 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
7716 _record_onode(o, txn);
7717 }
7718 } // for (it->lower_bound(string()); it->valid(); it->next())
7719
7720 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
7721 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
7722 << "~" << it.get_len() << std::dec << dendl;
7723 fm->release(it.get_start(), it.get_len(), txn);
7724 }
7725 alloc->release(to_release);
7726 to_release.clear();
7727 } // if (it) {
7728 } //if (repair && repairer.preprocess_misreference()) {
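  // The misreference repair above boils down to these steps per broken blob
  // (a sketch; error handling and statfs adjustments omitted):
  //
  //   1. allocate fresh space:   alloc->allocate(e->length, min_alloc_size, 0, 0, &exts);
  //   2. copy the data over:     bdev->read(e->offset, ...); b->get_blob().map_bl(..., bdev->write(...));
  //   3. swap in new pextents:   pextents.erase(e); pextents.insert(e, exts.begin(), exts.end());
  //   4. persist the allocation: fm->allocate(p.offset, p.length, txn);
  //   5. release the old space:  fm->release(...); alloc->release(to_release);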
7729
7730 for (auto &p : sb_info) {
7731 sb_info_t& sbi = p.second;
7732 if (!sbi.passed) {
7733 derr << "fsck error: missing " << *sbi.sb << dendl;
7734 ++errors;
7735 }
7736 if (repair && (!sbi.passed || sbi.updated)) {
7737 auto sbid = p.first;
7738 if (sbi.ref_map.empty()) {
7739 ceph_assert(sbi.passed);
7740 dout(20) << __func__ << " " << *sbi.sb
7741 << " is empty, removing" << dendl;
7742 repairer.fix_shared_blob(db, sbid, nullptr);
7743 } else {
7744 bufferlist bl;
7745 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
7746 encode(persistent, bl);
7747 dout(20) << __func__ << " " << *sbi.sb
7748 << " is " << bl.length() << " bytes, updating" << dendl;
7749
7750 repairer.fix_shared_blob(db, sbid, &bl);
7c673cae
FG
7751 }
7752 }
7753 }
11fdf7f2
TL
7754 sb_info.clear();
7755
7756 // only check global stats when per-pool stats aren't in use
7757 if (!need_per_pool_stats) {
7758 if (!(actual_statfs == expected_store_statfs)) {
7759 derr << "fsck error: actual " << actual_statfs
7760 << " != expected " << expected_store_statfs << dendl;
7761 if (repair) {
7762 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
7763 expected_store_statfs);
7764 }
7765 ++errors;
7766 }
7c673cae 7767 }
11fdf7f2
TL
7768 if (!enforce_no_per_pool_stats) {
7769 dout(1) << __func__ << " checking pool_statfs" << dendl;
7770 _fsck_check_pool_statfs(expected_pool_statfs, need_per_pool_stats,
7771 errors, repair ? &repairer : nullptr);
7c673cae
FG
7772 }
7773
7774 dout(1) << __func__ << " checking for stray omap data" << dendl;
7775 it = db->get_iterator(PREFIX_OMAP);
7776 if (it) {
7777 for (it->lower_bound(string()); it->valid(); it->next()) {
7778 uint64_t omap_head;
7779 _key_decode_u64(it->key().c_str(), &omap_head);
7780 if (used_omap_head.count(omap_head) == 0) {
3efd9988 7781 derr << "fsck error: found stray omap data on omap_head "
7c673cae
FG
7782 << omap_head << dendl;
7783 ++errors;
7784 }
7785 }
7786 }
11fdf7f2
TL
7787 it = db->get_iterator(PREFIX_PGMETA_OMAP);
7788 if (it) {
7789 for (it->lower_bound(string()); it->valid(); it->next()) {
7790 uint64_t omap_head;
7791 _key_decode_u64(it->key().c_str(), &omap_head);
7792 if (used_pgmeta_omap_head.count(omap_head) == 0) {
7793 derr << "fsck error: found stray omap data on omap_head "
7794 << omap_head << dendl;
7795 ++errors;
7796 }
7797 }
7798 }
7c673cae
FG
7799
7800 dout(1) << __func__ << " checking deferred events" << dendl;
7801 it = db->get_iterator(PREFIX_DEFERRED);
7802 if (it) {
7803 for (it->lower_bound(string()); it->valid(); it->next()) {
7804 bufferlist bl = it->value();
11fdf7f2 7805 auto p = bl.cbegin();
7c673cae
FG
7806 bluestore_deferred_transaction_t wt;
7807 try {
11fdf7f2 7808 decode(wt, p);
7c673cae 7809 } catch (buffer::error& e) {
3efd9988 7810 derr << "fsck error: failed to decode deferred txn "
7c673cae 7811 << pretty_binary_string(it->key()) << dendl;
11fdf7f2
TL
7812 if (repair) {
7813 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
7814 << pretty_binary_string(it->key())
7815 << "', removing" << dendl;
7816 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
7817 }
7818 continue;
7c673cae
FG
7819 }
7820 dout(20) << __func__ << " deferred " << wt.seq
7821 << " ops " << wt.ops.size()
7822 << " released 0x" << std::hex << wt.released << std::dec << dendl;
7823 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
7824 apply(
b32b8144 7825 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 7826 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7827 ceph_assert(pos < bs.size());
7c673cae
FG
7828 bs.set(pos);
7829 }
7830 );
7831 }
7832 }
7833 }
7834
7835 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
7836 {
7837 // remove bluefs_extents from used set since the freelist doesn't
7838 // know they are allocated.
7839 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
7840 apply(
b32b8144 7841 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 7842 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7843 ceph_assert(pos < bs.size());
7c673cae
FG
7844 bs.reset(pos);
7845 }
7846 );
7847 }
7848 fm->enumerate_reset();
7849 uint64_t offset, length;
11fdf7f2 7850 while (fm->enumerate_next(db, &offset, &length)) {
7c673cae
FG
7851 bool intersects = false;
7852 apply(
b32b8144 7853 offset, length, fm->get_alloc_size(), used_blocks,
7c673cae 7854 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7855 ceph_assert(pos < bs.size());
7c673cae 7856 if (bs.test(pos)) {
11fdf7f2
TL
7857 if (offset == SUPER_RESERVED &&
7858 length == min_alloc_size - SUPER_RESERVED) {
7859 // this is due to the change just after luminous to min_alloc_size
7860 // granularity allocations, and our baked in assumption at the top
7861 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
7862 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
7863 // since we will never allocate this region below min_alloc_size.
7864 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
7865 << " and min_alloc_size, 0x" << std::hex << offset << "~"
7866 << length << std::dec << dendl;
7867 } else {
7868 intersects = true;
7869 if (repair) {
7870 repairer.fix_false_free(db, fm,
7871 pos * min_alloc_size,
7872 min_alloc_size);
7873 }
7874 }
7c673cae
FG
7875 } else {
7876 bs.set(pos);
7877 }
7878 }
7879 );
7880 if (intersects) {
11fdf7f2
TL
7881 derr << "fsck error: free extent 0x" << std::hex << offset
7882 << "~" << length << std::dec
7883 << " intersects allocated blocks" << dendl;
7884 ++errors;
b5b8bbf5
FG
7885 }
7886 }
3efd9988
FG
7887 fm->enumerate_reset();
7888 size_t count = used_blocks.count();
7c673cae 7889 if (used_blocks.size() != count) {
11fdf7f2 7890 ceph_assert(used_blocks.size() > count);
b5b8bbf5
FG
7891 used_blocks.flip();
7892 size_t start = used_blocks.find_first();
7893 while (start != decltype(used_blocks)::npos) {
7894 size_t cur = start;
7895 while (true) {
7896 size_t next = used_blocks.find_next(cur);
7897 if (next != cur + 1) {
11fdf7f2 7898 ++errors;
3efd9988 7899 derr << "fsck error: leaked extent 0x" << std::hex
b32b8144
FG
7900 << ((uint64_t)start * fm->get_alloc_size()) << "~"
7901 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
b5b8bbf5 7902 << dendl;
11fdf7f2
TL
7903 if (repair) {
7904 repairer.fix_leaked(db,
7905 fm,
7906 start * min_alloc_size,
7907 (cur + 1 - start) * min_alloc_size);
7908 }
b5b8bbf5
FG
7909 start = next;
7910 break;
7911 }
7912 cur = next;
7913 }
7914 }
7915 used_blocks.flip();
7c673cae
FG
7916 }
7917 }
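  // Leak detection above relies on a flip-and-scan trick: after walking the
  // freelist every allocation unit should be marked in used_blocks, so
  // flipping the bitset turns leaked (allocated-but-unreferenced) units into
  // set bits and find_first()/find_next() yield runs of them.  A sketch,
  // with handle_leak standing in for the error/repair reporting:
  //
  //   bs.flip();
  //   for (size_t start = bs.find_first(); start != npos; ) {
  //     size_t cur = start;
  //     while (bs.find_next(cur) == cur + 1) ++cur;          // extend the run
  //     handle_leak(start * alloc_unit, (cur + 1 - start) * alloc_unit);
  //     start = bs.find_next(cur);
  //   }
  //   bs.flip();                                             // restore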
11fdf7f2
TL
7918 if (repair) {
7919 dout(5) << __func__ << " applying repair results" << dendl;
7920 repaired = repairer.apply(db);
7921 dout(5) << __func__ << " repair applied" << dendl;
7922 }
7c673cae
FG
7923 out_scan:
7924 mempool_thread.shutdown();
31f18b77 7925 _flush_cache();
7c673cae
FG
7926 out_db:
7927 it.reset(); // before db is closed
11fdf7f2 7928 _close_db_and_around();
7c673cae
FG
7929 out_bdev:
7930 _close_bdev();
7931 out_fsid:
7932 _close_fsid();
7933 out_path:
7934 _close_path();
7935
7936 // fatal errors take precedence
7937 if (r < 0)
7938 return r;
7939
7940 dout(2) << __func__ << " " << num_objects << " objects, "
7941 << num_sharded_objects << " of them sharded. "
7942 << dendl;
7943 dout(2) << __func__ << " " << num_extents << " extents to "
7944 << num_blobs << " blobs, "
7945 << num_spanning_blobs << " spanning, "
7946 << num_shared_blobs << " shared."
7947 << dendl;
7948
7949 utime_t duration = ceph_clock_now() - start;
11fdf7f2
TL
7950 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, " << repaired
7951 << " repaired, " << (errors - (int)repaired) << " remaining in "
7c673cae 7952 << duration << " seconds" << dendl;
11fdf7f2
TL
7953 return errors - (int)repaired;
7954}
7955
7956/// methods to inject various errors fsck can repair
7957void BlueStore::inject_broken_shared_blob_key(const string& key,
7958 const bufferlist& bl)
7959{
7960 KeyValueDB::Transaction txn;
7961 txn = db->get_transaction();
7962 txn->set(PREFIX_SHARED_BLOB, key, bl);
7963 db->submit_transaction_sync(txn);
7964};
7965
7966void BlueStore::inject_leaked(uint64_t len)
7967{
7968 KeyValueDB::Transaction txn;
7969 txn = db->get_transaction();
7970
7971 PExtentVector exts;
7972 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
7973 min_alloc_size * 256, 0, &exts);
7974 ceph_assert(alloc_len >= (int64_t)len);
7975 for (auto& p : exts) {
7976 fm->allocate(p.offset, p.length, txn);
7977 }
7978 db->submit_transaction_sync(txn);
7979}
7980
7981void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
7982{
7983 KeyValueDB::Transaction txn;
7984 OnodeRef o;
7985 CollectionRef c = _get_collection(cid);
7986 ceph_assert(c);
7987 {
7988 RWLock::WLocker l(c->lock); // just to avoid internal asserts
7989 o = c->get_onode(oid, false);
7990 ceph_assert(o);
7991 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7992 }
7993
7994 bool injected = false;
7995 txn = db->get_transaction();
7996 auto& em = o->extent_map.extent_map;
7997 std::vector<const PExtentVector*> v;
7998 if (em.size()) {
7999 v.push_back(&em.begin()->blob->get_blob().get_extents());
8000 }
8001 if (em.size() > 1) {
8002 auto it = em.end();
8003 --it;
8004 v.push_back(&(it->blob->get_blob().get_extents()));
8005 }
8006 for (auto pext : v) {
8007 if (pext->size()) {
8008 auto p = pext->begin();
8009 while (p != pext->end()) {
8010 if (p->is_valid()) {
8011 dout(20) << __func__ << " release 0x" << std::hex << p->offset
8012 << "~" << p->length << std::dec << dendl;
8013 fm->release(p->offset, p->length, txn);
8014 injected = true;
8015 break;
8016 }
8017 ++p;
8018 }
8019 }
8020 }
8021 ceph_assert(injected);
8022 db->submit_transaction_sync(txn);
8023}
8024
8025void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
8026{
8027 BlueStoreRepairer repairer;
8028 repairer.fix_statfs(db, key, new_statfs);
8029 repairer.apply(db);
8030}
8031
8032void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
8033 coll_t cid2, ghobject_t oid2,
8034 uint64_t offset)
8035{
8036 OnodeRef o1;
8037 CollectionRef c1 = _get_collection(cid1);
8038 ceph_assert(c1);
8039 {
8040 RWLock::WLocker l(c1->lock); // just to avoid internal asserts
8041 o1 = c1->get_onode(oid1, false);
8042 ceph_assert(o1);
8043 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
8044 }
8045 OnodeRef o2;
8046 CollectionRef c2 = _get_collection(cid2);
8047 ceph_assert(c2);
8048 {
8049 RWLock::WLocker l(c2->lock); // just to avoid internal asserts
8050 o2 = c2->get_onode(oid2, false);
8051 ceph_assert(o2);
8052 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
8053 }
8054 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
8055 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
8056
8057 // require onode/extent layout to be the same (and simple)
8058 // to make things easier
8059 ceph_assert(o1->onode.extent_map_shards.empty());
8060 ceph_assert(o2->onode.extent_map_shards.empty());
8061 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
8062 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
8063 ceph_assert(e1.logical_offset == e2.logical_offset);
8064 ceph_assert(e1.length == e2.length);
8065 ceph_assert(e1.blob_offset == e2.blob_offset);
8066
8067 KeyValueDB::Transaction txn;
8068 txn = db->get_transaction();
8069
8070 // along with the misreference error this will create space leak errors
8071 e2.blob->dirty_blob() = e1.blob->get_blob();
8072 o2->extent_map.dirty_range(offset, e2.length);
8073 o2->extent_map.update(txn, false);
8074
8075 _record_onode(o2, txn);
8076 db->submit_transaction_sync(txn);
7c673cae
FG
8077}
8078
8079void BlueStore::collect_metadata(map<string,string> *pm)
8080{
8081 dout(10) << __func__ << dendl;
8082 bdev->collect_metadata("bluestore_bdev_", pm);
8083 if (bluefs) {
8084 (*pm)["bluefs"] = "1";
8085 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
11fdf7f2 8086 bluefs->collect_metadata(pm, bluefs_shared_bdev);
7c673cae
FG
8087 } else {
8088 (*pm)["bluefs"] = "0";
8089 }
11fdf7f2
TL
8090
8091 // report numa mapping for underlying devices
8092 int node = -1;
8093 set<int> nodes;
8094 set<string> failed;
8095 int r = get_numa_node(&node, &nodes, &failed);
8096 if (r >= 0) {
8097 if (!failed.empty()) {
8098 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
8099 }
8100 if (!nodes.empty()) {
8101 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
8102 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
8103 }
8104 if (node >= 0) {
8105 (*pm)["objectstore_numa_node"] = stringify(node);
8106 }
8107 }
8108}
8109
8110int BlueStore::get_numa_node(
8111 int *final_node,
8112 set<int> *out_nodes,
8113 set<string> *out_failed)
8114{
8115 int node = -1;
8116 set<string> devices;
8117 get_devices(&devices);
8118 set<int> nodes;
8119 set<string> failed;
8120 for (auto& devname : devices) {
8121 int n;
8122 BlkDev bdev(devname);
8123 int r = bdev.get_numa_node(&n);
8124 if (r < 0) {
8125 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
8126 << dendl;
8127 failed.insert(devname);
8128 continue;
8129 }
8130 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
8131 << dendl;
8132 nodes.insert(n);
8133 if (node < 0) {
8134 node = n;
8135 }
8136 }
8137 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
8138 *final_node = node;
8139 }
8140 if (out_nodes) {
8141 *out_nodes = nodes;
8142 }
8143 if (out_failed) {
8144 *out_failed = failed;
8145 }
8146 return 0;
8147}
8148
8149int BlueStore::get_devices(set<string> *ls)
8150{
8151 if (bdev) {
8152 bdev->get_devices(ls);
8153 if (bluefs) {
8154 bluefs->get_devices(ls);
8155 }
8156 return 0;
8157 }
8158
8159 // grumble, we haven't started up yet.
8160 int r = _open_path();
8161 if (r < 0)
8162 goto out;
8163 r = _open_fsid(false);
8164 if (r < 0)
8165 goto out_path;
8166 r = _read_fsid(&fsid);
8167 if (r < 0)
8168 goto out_fsid;
8169 r = _lock_fsid();
8170 if (r < 0)
8171 goto out_fsid;
8172 r = _open_bdev(false);
8173 if (r < 0)
8174 goto out_fsid;
8175 r = _minimal_open_bluefs(false);
8176 if (r < 0)
8177 goto out_bdev;
8178 bdev->get_devices(ls);
8179 if (bluefs) {
8180 bluefs->get_devices(ls);
8181 }
8182 r = 0;
8183 _minimal_close_bluefs();
8184 out_bdev:
8185 _close_bdev();
8186 out_fsid:
8187 _close_fsid();
8188 out_path:
8189 _close_path();
8190 out:
8191 return r;
7c673cae
FG
8192}
8193
11fdf7f2 8194void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
7c673cae
FG
8195{
8196 buf->reset();
11fdf7f2
TL
8197
8198 buf->omap_allocated = db->estimate_prefix_size(PREFIX_OMAP);
8199
8200 uint64_t bfree = alloc->get_free();
7c673cae
FG
8201
8202 if (bluefs) {
11fdf7f2
TL
8203 int64_t bluefs_total = bluefs->get_total(bluefs_shared_bdev);
8204 int64_t bluefs_free = bluefs->get_free(bluefs_shared_bdev);
94b18763
FG
8205 // part of our shared device is "free" according to BlueFS, but we
8206 // can't touch bluestore_bluefs_min of it.
8207 int64_t shared_available = std::min(
11fdf7f2
TL
8208 bluefs_free,
8209 int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
8210 buf->internally_reserved = bluefs_total - shared_available;
94b18763 8211 if (shared_available > 0) {
11fdf7f2
TL
8212 bfree += shared_available;
8213 }
8214 // include dedicated db, too, if that isn't the shared device.
8215 if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
8216 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 8217 }
11fdf7f2
TL
8218 // call any non-omap bluefs space "internal metadata"
8219 buf->internal_metadata =
8220 std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
8221 - buf->omap_allocated;
7c673cae
FG
8222 }
8223
11fdf7f2
TL
8224 uint64_t thin_total, thin_avail;
8225 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
8226 buf->total += thin_total;
8227
8228 // we are limited by both the size of the virtual device and the
8229 // underlying physical device.
8230 bfree = std::min(bfree, thin_avail);
8231
8232 buf->allocated = thin_total - thin_avail;
8233 } else {
8234 buf->total += bdev->get_size();
8235 }
8236 buf->available = bfree;
8237}
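// Worked example for the shared-device math above, assuming (illustrative
// numbers only) a 10 GiB BlueFS share with 7 GiB free and
// bluestore_bluefs_min = 1 GiB: shared_available = min(7, 10 - 1) = 7 GiB is
// handed back to bfree, and internally_reserved = 10 - 7 = 3 GiB.  The helper
// name is made up; the calculation mirrors the one in _get_statfs_overall().
[[maybe_unused]] static int64_t example_bluefs_shared_available(
  int64_t bluefs_total, int64_t bluefs_free, int64_t bluefs_min)
{
  // we may hand BlueFS free space back to the store, but never let BlueFS
  // shrink below its reserved minimum
  return std::min(bluefs_free, bluefs_total - bluefs_min);
}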
8238
8239int BlueStore::statfs(struct store_statfs_t *buf,
8240 osd_alert_list_t* alerts)
8241{
8242 if (alerts) {
8243 alerts->clear();
8244 _log_alerts(*alerts);
8245 }
8246 _get_statfs_overall(buf);
31f18b77 8247 {
11fdf7f2 8248 std::lock_guard l(vstatfs_lock);
31f18b77 8249 buf->allocated = vstatfs.allocated();
11fdf7f2
TL
8250 buf->data_stored = vstatfs.stored();
8251 buf->data_compressed = vstatfs.compressed();
8252 buf->data_compressed_original = vstatfs.compressed_original();
8253 buf->data_compressed_allocated = vstatfs.compressed_allocated();
8254 }
8255
8256 dout(20) << __func__ << " " << *buf << dendl;
8257 return 0;
8258}
8259
8260int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf)
8261{
8262 dout(20) << __func__ << " pool " << pool_id<< dendl;
81eedcae 8263
11fdf7f2
TL
8264 if (!per_pool_stat_collection) {
8265 dout(20) << __func__ << " not supported in legacy mode " << dendl;
8266 return -ENOTSUP;
7c673cae 8267 }
11fdf7f2 8268 buf->reset();
7c673cae 8269
11fdf7f2
TL
8270 {
8271 std::lock_guard l(vstatfs_lock);
8272 osd_pools[pool_id].publish(buf);
8273 }
8274 dout(10) << __func__ << *buf << dendl;
7c673cae
FG
8275 return 0;
8276}
8277
81eedcae
TL
8278void BlueStore::_check_legacy_statfs_alert()
8279{
8280 string s;
8281 if (!per_pool_stat_collection &&
8282 cct->_conf->bluestore_no_per_pool_stats_tolerance != "enforce" &&
8283 cct->_conf->bluestore_warn_on_legacy_statfs) {
8284 s = "legacy statfs reporting detected, "
8285 "suggest to run store repair to get consistent statistic reports";
8286 }
8287 std::lock_guard l(qlock);
8288 legacy_statfs_alert = s;
8289}
8290
7c673cae
FG
8291// ---------------
8292// cache
8293
8294BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
8295{
8296 RWLock::RLocker l(coll_lock);
8297 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
8298 if (cp == coll_map.end())
8299 return CollectionRef();
8300 return cp->second;
8301}
8302
8303void BlueStore::_queue_reap_collection(CollectionRef& c)
8304{
8305 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
8306 // _reap_collections and this run in the same thread,
8307 // so no lock is needed.
7c673cae
FG
8308 removed_collections.push_back(c);
8309}
8310
8311void BlueStore::_reap_collections()
8312{
94b18763 8313
7c673cae
FG
8314 list<CollectionRef> removed_colls;
8315 {
94b18763
FG
8316 // _queue_reap_collection and this run in the same thread,
8317 // so no lock is needed.
8318 if (!removed_collections.empty())
8319 removed_colls.swap(removed_collections);
8320 else
8321 return;
7c673cae
FG
8322 }
8323
94b18763
FG
8324 list<CollectionRef>::iterator p = removed_colls.begin();
8325 while (p != removed_colls.end()) {
7c673cae
FG
8326 CollectionRef c = *p;
8327 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
8328 if (c->onode_map.map_any([&](OnodeRef o) {
11fdf7f2 8329 ceph_assert(!o->exists);
7c673cae
FG
8330 if (o->flushing_count.load()) {
8331 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
8332 << " flush_txns " << o->flushing_count << dendl;
94b18763 8333 return true;
7c673cae 8334 }
94b18763 8335 return false;
7c673cae 8336 })) {
94b18763 8337 ++p;
7c673cae
FG
8338 continue;
8339 }
8340 c->onode_map.clear();
94b18763 8341 p = removed_colls.erase(p);
7c673cae
FG
8342 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
8343 }
94b18763 8344 if (removed_colls.empty()) {
7c673cae 8345 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
8346 } else {
8347 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
8348 }
8349}
8350
8351void BlueStore::_update_cache_logger()
8352{
8353 uint64_t num_onodes = 0;
8354 uint64_t num_extents = 0;
8355 uint64_t num_blobs = 0;
8356 uint64_t num_buffers = 0;
8357 uint64_t num_buffer_bytes = 0;
8358 for (auto c : cache_shards) {
8359 c->add_stats(&num_onodes, &num_extents, &num_blobs,
8360 &num_buffers, &num_buffer_bytes);
8361 }
8362 logger->set(l_bluestore_onodes, num_onodes);
8363 logger->set(l_bluestore_extents, num_extents);
8364 logger->set(l_bluestore_blobs, num_blobs);
8365 logger->set(l_bluestore_buffers, num_buffers);
8366 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
8367}
8368
8369// ---------------
8370// read operations
8371
8372ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
8373{
8374 return _get_collection(cid);
8375}
8376
11fdf7f2
TL
8377ObjectStore::CollectionHandle BlueStore::create_new_collection(
8378 const coll_t& cid)
7c673cae 8379{
11fdf7f2
TL
8380 RWLock::WLocker l(coll_lock);
8381 Collection *c = new Collection(
8382 this,
8383 cache_shards[cid.hash_to_shard(cache_shards.size())],
8384 cid);
8385 new_coll_map[cid] = c;
8386 _osr_attach(c);
8387 return c;
8388}
8389
8390void BlueStore::set_collection_commit_queue(
8391 const coll_t& cid,
8392 ContextQueue *commit_queue)
8393{
8394 if (commit_queue) {
8395 RWLock::RLocker l(coll_lock);
8396 if (coll_map.count(cid)) {
8397 coll_map[cid]->commit_queue = commit_queue;
8398 } else if (new_coll_map.count(cid)) {
8399 new_coll_map[cid]->commit_queue = commit_queue;
8400 }
8401 }
7c673cae
FG
8402}
8403
11fdf7f2 8404
7c673cae
FG
8405bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
8406{
8407 Collection *c = static_cast<Collection *>(c_.get());
8408 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
8409 if (!c->exists)
8410 return false;
8411
8412 bool r = true;
8413
8414 {
8415 RWLock::RLocker l(c->lock);
8416 OnodeRef o = c->get_onode(oid, false);
8417 if (!o || !o->exists)
8418 r = false;
8419 }
8420
7c673cae
FG
8421 return r;
8422}
8423
7c673cae
FG
8424int BlueStore::stat(
8425 CollectionHandle &c_,
8426 const ghobject_t& oid,
8427 struct stat *st,
8428 bool allow_eio)
8429{
8430 Collection *c = static_cast<Collection *>(c_.get());
8431 if (!c->exists)
8432 return -ENOENT;
8433 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
8434
8435 {
8436 RWLock::RLocker l(c->lock);
8437 OnodeRef o = c->get_onode(oid, false);
8438 if (!o || !o->exists)
8439 return -ENOENT;
8440 st->st_size = o->onode.size;
8441 st->st_blksize = 4096;
8442 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
8443 st->st_nlink = 1;
8444 }
8445
7c673cae
FG
8446 int r = 0;
8447 if (_debug_mdata_eio(oid)) {
8448 r = -EIO;
8449 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
8450 }
8451 return r;
8452}
8453int BlueStore::set_collection_opts(
11fdf7f2 8454 CollectionHandle& ch,
7c673cae
FG
8455 const pool_opts_t& opts)
8456{
7c673cae 8457 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 8458 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
7c673cae
FG
8459 if (!c->exists)
8460 return -ENOENT;
8461 RWLock::WLocker l(c->lock);
8462 c->pool_opts = opts;
8463 return 0;
8464}
8465
7c673cae
FG
8466int BlueStore::read(
8467 CollectionHandle &c_,
8468 const ghobject_t& oid,
8469 uint64_t offset,
8470 size_t length,
8471 bufferlist& bl,
224ce89b 8472 uint32_t op_flags)
7c673cae 8473{
11fdf7f2 8474 auto start = mono_clock::now();
7c673cae
FG
8475 Collection *c = static_cast<Collection *>(c_.get());
8476 const coll_t &cid = c->get_cid();
8477 dout(15) << __func__ << " " << cid << " " << oid
8478 << " 0x" << std::hex << offset << "~" << length << std::dec
8479 << dendl;
8480 if (!c->exists)
8481 return -ENOENT;
8482
8483 bl.clear();
8484 int r;
8485 {
8486 RWLock::RLocker l(c->lock);
11fdf7f2 8487 auto start1 = mono_clock::now();
7c673cae 8488 OnodeRef o = c->get_onode(oid, false);
11fdf7f2 8489 LOG_LATENCY(logger, cct, l_bluestore_read_onode_meta_lat, mono_clock::now() - start1);
7c673cae
FG
8490 if (!o || !o->exists) {
8491 r = -ENOENT;
8492 goto out;
8493 }
8494
8495 if (offset == length && offset == 0)
8496 length = o->onode.size;
8497
8498 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
8499 if (r == -EIO) {
8500 logger->inc(l_bluestore_read_eio);
8501 }
7c673cae
FG
8502 }
8503
8504 out:
28e407b8 8505 if (r >= 0 && _debug_data_eio(oid)) {
7c673cae
FG
8506 r = -EIO;
8507 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11fdf7f2
TL
8508 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
8509 cct->_conf->bluestore_debug_random_read_err &&
8510 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
8511 100.0)) == 0) {
224ce89b
WB
8512 dout(0) << __func__ << ": inject random EIO" << dendl;
8513 r = -EIO;
7c673cae
FG
8514 }
8515 dout(10) << __func__ << " " << cid << " " << oid
8516 << " 0x" << std::hex << offset << "~" << length << std::dec
8517 << " = " << r << dendl;
11fdf7f2 8518 LOG_LATENCY(logger, cct, l_bluestore_read_lat, mono_clock::now() - start);
7c673cae
FG
8519 return r;
8520}
8521
8522// --------------------------------------------------------
8523// intermediate data structures used while reading
8524struct region_t {
8525 uint64_t logical_offset;
8526 uint64_t blob_xoffset; //region offset within the blob
8527 uint64_t length;
7c673cae
FG
8528
8529 // used later in read process
8530 uint64_t front = 0;
7c673cae 8531
11fdf7f2 8532 region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
7c673cae
FG
8533 : logical_offset(offset),
8534 blob_xoffset(b_offs),
11fdf7f2
TL
8535 length(len),
8536 front(front){}
7c673cae
FG
8537 region_t(const region_t& from)
8538 : logical_offset(from.logical_offset),
8539 blob_xoffset(from.blob_xoffset),
11fdf7f2
TL
8540 length(from.length),
8541 front(from.front){}
7c673cae
FG
8542
8543 friend ostream& operator<<(ostream& out, const region_t& r) {
8544 return out << "0x" << std::hex << r.logical_offset << ":"
8545 << r.blob_xoffset << "~" << r.length << std::dec;
8546 }
8547};
8548
11fdf7f2
TL
8549// merged blob read request
8550struct read_req_t {
8551 uint64_t r_off = 0;
8552 uint64_t r_len = 0;
8553 bufferlist bl;
8554 std::list<region_t> regs; // original read regions
8555
8556 read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
8557
8558 friend ostream& operator<<(ostream& out, const read_req_t& r) {
8559 out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
8560 for (const auto& reg : r.regs)
8561 out << reg;
8562 return out << "]}" << std::dec;
8563 }
8564};
8565
8566typedef list<read_req_t> regions2read_t;
7c673cae
FG
8567typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
8568
8569int BlueStore::_do_read(
8570 Collection *c,
8571 OnodeRef o,
8572 uint64_t offset,
8573 size_t length,
8574 bufferlist& bl,
f64942e4
AA
8575 uint32_t op_flags,
8576 uint64_t retry_count)
7c673cae 8577{
11fdf7f2 8578 FUNCTRACE(cct);
7c673cae 8579 int r = 0;
91327a77 8580 int read_cache_policy = 0; // do not bypass clean or dirty cache
7c673cae
FG
8581
8582 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
8583 << " size 0x" << o->onode.size << " (" << std::dec
8584 << o->onode.size << ")" << dendl;
8585 bl.clear();
8586
8587 if (offset >= o->onode.size) {
8588 return r;
8589 }
8590
8591 // generally, don't buffer anything, unless the client explicitly requests
8592 // it.
8593 bool buffered = false;
8594 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
8595 dout(20) << __func__ << " will do buffered read" << dendl;
8596 buffered = true;
8597 } else if (cct->_conf->bluestore_default_buffered_read &&
8598 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
8599 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
8600 dout(20) << __func__ << " defaulting to buffered read" << dendl;
8601 buffered = true;
8602 }
8603
8604 if (offset + length > o->onode.size) {
8605 length = o->onode.size - offset;
8606 }
8607
11fdf7f2 8608 auto start = mono_clock::now();
7c673cae 8609 o->extent_map.fault_range(db, offset, length);
11fdf7f2 8610 LOG_LATENCY(logger, cct, l_bluestore_read_onode_meta_lat, mono_clock::now() - start);
81eedcae 8611 _dump_onode<30>(cct, *o);
7c673cae
FG
8612
8613 ready_regions_t ready_regions;
8614
91327a77
AA
8615 // for deep-scrub, we only read dirty cache and bypass clean cache in
8616 // order to read underlying block device in case there are silent disk errors.
8617 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
8618 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
8619 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
8620 }
8621
7c673cae
FG
8622 // build a blob-wise list of stuff to read (that isn't cached)
8623 blobs2read_t blobs2read;
8624 unsigned left = length;
8625 uint64_t pos = offset;
8626 unsigned num_regions = 0;
8627 auto lp = o->extent_map.seek_lextent(offset);
8628 while (left > 0 && lp != o->extent_map.extent_map.end()) {
8629 if (pos < lp->logical_offset) {
8630 unsigned hole = lp->logical_offset - pos;
8631 if (hole >= left) {
8632 break;
8633 }
8634 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
8635 << std::dec << dendl;
8636 pos += hole;
8637 left -= hole;
8638 }
94b18763 8639 BlobRef& bptr = lp->blob;
7c673cae
FG
8640 unsigned l_off = pos - lp->logical_offset;
8641 unsigned b_off = l_off + lp->blob_offset;
8642 unsigned b_len = std::min(left, lp->length - l_off);
8643
8644 ready_regions_t cache_res;
8645 interval_set<uint32_t> cache_interval;
8646 bptr->shared_blob->bc.read(
91327a77
AA
8647 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
8648 read_cache_policy);
7c673cae
FG
8649 dout(20) << __func__ << " blob " << *bptr << std::hex
8650 << " need 0x" << b_off << "~" << b_len
8651 << " cache has 0x" << cache_interval
8652 << std::dec << dendl;
8653
8654 auto pc = cache_res.begin();
11fdf7f2 8655 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
7c673cae
FG
8656 while (b_len > 0) {
8657 unsigned l;
8658 if (pc != cache_res.end() &&
8659 pc->first == b_off) {
8660 l = pc->second.length();
8661 ready_regions[pos].claim(pc->second);
8662 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
8663 << b_off << "~" << l << std::dec << dendl;
8664 ++pc;
8665 } else {
8666 l = b_len;
8667 if (pc != cache_res.end()) {
11fdf7f2 8668 ceph_assert(pc->first > b_off);
7c673cae
FG
8669 l = pc->first - b_off;
8670 }
8671 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
8672 << b_off << "~" << l << std::dec << dendl;
11fdf7f2
TL
8673 // merge regions
8674 {
8675 uint64_t r_off = b_off;
8676 uint64_t r_len = l;
8677 uint64_t front = r_off % chunk_size;
8678 if (front) {
8679 r_off -= front;
8680 r_len += front;
8681 }
8682 unsigned tail = r_len % chunk_size;
8683 if (tail) {
8684 r_len += chunk_size - tail;
8685 }
8686 bool merged = false;
8687 regions2read_t& r2r = blobs2read[bptr];
8688 if (r2r.size()) {
8689 read_req_t& pre = r2r.back();
8690 if (r_off <= (pre.r_off + pre.r_len)) {
8691 front += (r_off - pre.r_off);
8692 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
8693 pre.regs.emplace_back(region_t(pos, b_off, l, front));
8694 merged = true;
8695 }
8696 }
8697 if (!merged) {
8698 read_req_t req(r_off, r_len);
8699 req.regs.emplace_back(region_t(pos, b_off, l, front));
8700 r2r.emplace_back(std::move(req));
8701 }
8702 }
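      // Sketch of the alignment above: the raw read is expanded to the blob's
      // chunk boundaries, remembering how much padding was added in front so
      // the requested bytes can later be carved back out.  E.g. with
      // chunk_size = 0x1000, b_off = 0x1800, l = 0x400:
      //
      //   front = 0x1800 % 0x1000 = 0x800      -> r_off = 0x1000
      //   r_len = 0x400 + 0x800 = 0xc00; tail = 0xc00 % 0x1000 = 0xc00
      //   r_len += 0x1000 - 0xc00              -> r_len = 0x1000
      //
      // so a single 4 KiB chunk at 0x1000 is read and the region's data
      // starts at offset 0x800 within it.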
7c673cae
FG
8703 ++num_regions;
8704 }
8705 pos += l;
8706 b_off += l;
8707 left -= l;
8708 b_len -= l;
8709 }
8710 ++lp;
8711 }
8712
8713 // read raw blob data. use aio if we have >1 blobs to read.
11fdf7f2
TL
8714 start = mono_clock::now(); // for simplicity, measure the whole
8715 // block below; the resulting timing
8716 // error is negligible
7c673cae 8717 vector<bufferlist> compressed_blob_bls;
b32b8144 8718 IOContext ioc(cct, NULL, true); // allow EIO
7c673cae 8719 for (auto& p : blobs2read) {
94b18763 8720 const BlobRef& bptr = p.first;
11fdf7f2 8721 regions2read_t& r2r = p.second;
7c673cae 8722 dout(20) << __func__ << " blob " << *bptr << std::hex
11fdf7f2 8723 << " need " << r2r << std::dec << dendl;
7c673cae
FG
8724 if (bptr->get_blob().is_compressed()) {
8725 // read the whole thing
8726 if (compressed_blob_bls.empty()) {
8727 // ensure we avoid any reallocation on subsequent blobs
8728 compressed_blob_bls.reserve(blobs2read.size());
8729 }
8730 compressed_blob_bls.push_back(bufferlist());
8731 bufferlist& bl = compressed_blob_bls.back();
8732 r = bptr->get_blob().map(
8733 0, bptr->get_blob().get_ondisk_length(),
8734 [&](uint64_t offset, uint64_t length) {
8735 int r;
8736 // use aio if there are more regions to read than those in this blob
11fdf7f2 8737 if (num_regions > r2r.size()) {
7c673cae
FG
8738 r = bdev->aio_read(offset, length, &bl, &ioc);
8739 } else {
8740 r = bdev->read(offset, length, &bl, &ioc, false);
8741 }
8742 if (r < 0)
8743 return r;
8744 return 0;
8745 });
b32b8144
FG
8746 if (r < 0) {
8747 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
8748 if (r == -EIO) {
8749 // propagate EIO to caller
8750 return r;
8751 }
11fdf7f2 8752 ceph_assert(r == 0);
b32b8144 8753 }
7c673cae
FG
8754 } else {
8755 // read the pieces
11fdf7f2 8756 for (auto& req : r2r) {
7c673cae 8757 dout(20) << __func__ << " region 0x" << std::hex
11fdf7f2
TL
8758 << req.regs.front().logical_offset
8759 << ": 0x" << req.regs.front().blob_xoffset
8760 << " reading 0x" << req.r_off
8761 << "~" << req.r_len << std::dec
7c673cae
FG
8762 << dendl;
8763
8764 // read it
8765 r = bptr->get_blob().map(
11fdf7f2 8766 req.r_off, req.r_len,
7c673cae
FG
8767 [&](uint64_t offset, uint64_t length) {
8768 int r;
8769 // use aio if there is more than one region to read
8770 if (num_regions > 1) {
11fdf7f2 8771 r = bdev->aio_read(offset, length, &req.bl, &ioc);
7c673cae 8772 } else {
11fdf7f2 8773 r = bdev->read(offset, length, &req.bl, &ioc, false);
7c673cae
FG
8774 }
8775 if (r < 0)
8776 return r;
8777 return 0;
8778 });
b32b8144
FG
8779 if (r < 0) {
8780 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
8781 << dendl;
8782 if (r == -EIO) {
8783 // propagate EIO to caller
8784 return r;
8785 }
11fdf7f2 8786 ceph_assert(r == 0);
b32b8144 8787 }
11fdf7f2 8788 ceph_assert(req.bl.length() == req.r_len);
7c673cae
FG
8789 }
8790 }
8791 }
11fdf7f2
TL
8792
8793 int64_t num_ios = length;
7c673cae 8794 if (ioc.has_pending_aios()) {
11fdf7f2 8795 num_ios = -ioc.get_num_ios();
7c673cae
FG
8796 bdev->aio_submit(&ioc);
8797 dout(20) << __func__ << " waiting for aio" << dendl;
8798 ioc.aio_wait();
b32b8144
FG
8799 r = ioc.get_return_value();
8800 if (r < 0) {
11fdf7f2 8801 ceph_assert(r == -EIO); // no other errors allowed
b32b8144
FG
8802 return -EIO;
8803 }
7c673cae 8804 }
11fdf7f2
TL
8805 LOG_LATENCY_FN(logger, cct, l_bluestore_read_wait_aio_lat,
8806 mono_clock::now() - start,
8807 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
8808 );
7c673cae
FG
8809
8810 // enumerate and decompress desired blobs
8811 auto p = compressed_blob_bls.begin();
8812 blobs2read_t::iterator b2r_it = blobs2read.begin();
8813 while (b2r_it != blobs2read.end()) {
94b18763 8814 const BlobRef& bptr = b2r_it->first;
11fdf7f2 8815 regions2read_t& r2r = b2r_it->second;
7c673cae 8816 dout(20) << __func__ << " blob " << *bptr << std::hex
11fdf7f2 8817 << " need 0x" << r2r << std::dec << dendl;
7c673cae 8818 if (bptr->get_blob().is_compressed()) {
11fdf7f2 8819 ceph_assert(p != compressed_blob_bls.end());
7c673cae
FG
8820 bufferlist& compressed_bl = *p++;
8821 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
11fdf7f2 8822 r2r.front().regs.front().logical_offset) < 0) {
f64942e4
AA
8823 // Handles spurious read errors caused by a kernel bug.
8824 // We sometimes get all-zero pages as a result of the read under
11fdf7f2
TL
8825 // high memory pressure. Retrying the failing read succeeds in most
8826 // cases.
f64942e4
AA
8827 // See also: http://tracker.ceph.com/issues/22464
8828 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
8829 return -EIO;
8830 }
8831 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
7c673cae
FG
8832 }
8833 bufferlist raw_bl;
8834 r = _decompress(compressed_bl, &raw_bl);
8835 if (r < 0)
8836 return r;
8837 if (buffered) {
8838 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
8839 raw_bl);
8840 }
11fdf7f2
TL
8841 for (auto& req : r2r) {
8842 for (auto& r : req.regs) {
8843 ready_regions[r.logical_offset].substr_of(
8844 raw_bl, r.blob_xoffset, r.length);
8845 }
7c673cae
FG
8846 }
8847 } else {
11fdf7f2
TL
8848 for (auto& req : r2r) {
8849 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
8850 req.regs.front().logical_offset) < 0) {
f64942e4
AA
8851 // Handles spurious read errors caused by a kernel bug.
8852 // We sometimes get all-zero pages as a result of the read under
11fdf7f2
TL
8853 // high memory pressure. Retrying the failing read succeeds in most
8854 // cases.
f64942e4
AA
8855 // See also: http://tracker.ceph.com/issues/22464
8856 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
8857 return -EIO;
8858 }
8859 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
7c673cae
FG
8860 }
8861 if (buffered) {
8862 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
11fdf7f2 8863 req.r_off, req.bl);
7c673cae
FG
8864 }
8865
8866 // prune and keep result
11fdf7f2
TL
8867 for (const auto& r : req.regs) {
8868 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
8869 }
7c673cae
FG
8870 }
8871 }
8872 ++b2r_it;
8873 }
8874
8875 // generate a resulting buffer
8876 auto pr = ready_regions.begin();
8877 auto pr_end = ready_regions.end();
8878 pos = 0;
8879 while (pos < length) {
8880 if (pr != pr_end && pr->first == pos + offset) {
8881 dout(30) << __func__ << " assemble 0x" << std::hex << pos
8882 << ": data from 0x" << pr->first << "~" << pr->second.length()
8883 << std::dec << dendl;
8884 pos += pr->second.length();
8885 bl.claim_append(pr->second);
8886 ++pr;
8887 } else {
8888 uint64_t l = length - pos;
8889 if (pr != pr_end) {
11fdf7f2 8890 ceph_assert(pr->first > pos + offset);
7c673cae
FG
8891 l = pr->first - (pos + offset);
8892 }
8893 dout(30) << __func__ << " assemble 0x" << std::hex << pos
8894 << ": zeros for 0x" << (pos + offset) << "~" << l
8895 << std::dec << dendl;
8896 bl.append_zero(l);
8897 pos += l;
8898 }
8899 }
11fdf7f2
TL
8900 ceph_assert(bl.length() == length);
8901 ceph_assert(pos == length);
8902 ceph_assert(pr == pr_end);
7c673cae 8903 r = bl.length();
f64942e4
AA
8904 if (retry_count) {
8905 logger->inc(l_bluestore_reads_with_retries);
8906 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
8907 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
8908 }
7c673cae
FG
8909 return r;
8910}
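
// A minimal, self-contained sketch (not BlueStore code) of the result-assembly
// loop above: regions that were actually read are keyed by logical offset, and
// any hole between them is zero-filled so the returned buffer covers exactly
// [offset, offset+length). std::string stands in for bufferlist here, and the
// regions are assumed to be non-overlapping and within the requested range.
#include <cstdint>
#include <map>
#include <string>

static std::string assemble_example(uint64_t offset, uint64_t length,
                                    const std::map<uint64_t, std::string>& ready)
{
  std::string out;
  uint64_t pos = 0;
  auto pr = ready.lower_bound(offset);
  while (pos < length) {
    if (pr != ready.end() && pr->first == pos + offset) {
      out += pr->second;               // data we actually read
      pos += pr->second.size();
      ++pr;
    } else {
      uint64_t l = length - pos;       // zero-fill up to the next region (or the end)
      if (pr != ready.end())
        l = pr->first - (pos + offset);
      out.append(l, '\0');
      pos += l;
    }
  }
  return out;
}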
8911
8912int BlueStore::_verify_csum(OnodeRef& o,
8913 const bluestore_blob_t* blob, uint64_t blob_xoffset,
8914 const bufferlist& bl,
8915 uint64_t logical_offset) const
8916{
8917 int bad;
8918 uint64_t bad_csum;
11fdf7f2 8919 auto start = mono_clock::now();
7c673cae 8920 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
f64942e4
AA
8921 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
8922 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
 8923 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
8924 bad = blob_xoffset;
8925 r = -1;
8926 bad_csum = 0xDEADBEEF;
8927 }
7c673cae
FG
8928 if (r < 0) {
8929 if (r == -1) {
8930 PExtentVector pex;
8931 blob->map(
8932 bad,
8933 blob->get_csum_chunk_size(),
8934 [&](uint64_t offset, uint64_t length) {
8935 pex.emplace_back(bluestore_pextent_t(offset, length));
8936 return 0;
8937 });
8938 derr << __func__ << " bad "
8939 << Checksummer::get_csum_type_string(blob->csum_type)
8940 << "/0x" << std::hex << blob->get_csum_chunk_size()
8941 << " checksum at blob offset 0x" << bad
8942 << ", got 0x" << bad_csum << ", expected 0x"
8943 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
8944 << ", device location " << pex
8945 << ", logical extent 0x" << std::hex
8946 << (logical_offset + bad - blob_xoffset) << "~"
8947 << blob->get_csum_chunk_size() << std::dec
8948 << ", object " << o->oid
8949 << dendl;
8950 } else {
 8951 derr << __func__ << " failed with error: " << cpp_strerror(r) << dendl;
8952 }
8953 }
11fdf7f2
TL
8954 LOG_LATENCY(logger, cct, l_bluestore_csum_lat, mono_clock::now() - start);
8955 if (cct->_conf->bluestore_ignore_data_csum) {
8956 return 0;
8957 }
7c673cae
FG
8958 return r;
8959}
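
// A worked example of the offset arithmetic in _verify_csum above, with purely
// illustrative values: for a 4 KiB (0x1000) csum chunk, blob_xoffset = 0x2000,
// logical_offset = 0x10000 and a mismatch reported at blob offset bad = 0x3000,
// the failing csum item is bad / chunk = 0x3000 / 0x1000 = 3, and the damaged
// logical extent starts at logical_offset + bad - blob_xoffset
// = 0x10000 + 0x3000 - 0x2000 = 0x11000, with length one csum chunk (0x1000).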
8960
8961int BlueStore::_decompress(bufferlist& source, bufferlist* result)
8962{
8963 int r = 0;
11fdf7f2
TL
8964 auto start = mono_clock::now();
8965 auto i = source.cbegin();
7c673cae 8966 bluestore_compression_header_t chdr;
11fdf7f2 8967 decode(chdr, i);
7c673cae
FG
8968 int alg = int(chdr.type);
8969 CompressorRef cp = compressor;
8970 if (!cp || (int)cp->get_type() != alg) {
8971 cp = Compressor::create(cct, alg);
8972 }
8973
8974 if (!cp.get()) {
 8975 // the decompressor isn't available: return an error, since we
 8976 // cannot hand back the decompressed data
11fdf7f2
TL
8977
8978 const char* alg_name = Compressor::get_comp_alg_name(alg);
8979 derr << __func__ << " can't load decompressor " << alg_name << dendl;
8980 _set_compression_alert(false, alg_name);
7c673cae
FG
8981 r = -EIO;
8982 } else {
8983 r = cp->decompress(i, chdr.length, *result);
8984 if (r < 0) {
 8985 derr << __func__ << " decompression failed with error code " << r << dendl;
8986 r = -EIO;
8987 }
8988 }
11fdf7f2 8989 LOG_LATENCY(logger, cct, l_bluestore_decompress_lat, mono_clock::now() - start);
7c673cae
FG
8990 return r;
8991}
8992
8993// this stores fiemap into interval_set, other variations
8994// use it internally
8995int BlueStore::_fiemap(
8996 CollectionHandle &c_,
8997 const ghobject_t& oid,
8998 uint64_t offset,
8999 size_t length,
9000 interval_set<uint64_t>& destset)
9001{
9002 Collection *c = static_cast<Collection *>(c_.get());
9003 if (!c->exists)
9004 return -ENOENT;
9005 {
9006 RWLock::RLocker l(c->lock);
9007
9008 OnodeRef o = c->get_onode(oid, false);
9009 if (!o || !o->exists) {
9010 return -ENOENT;
9011 }
81eedcae 9012 _dump_onode<30>(cct, *o);
7c673cae
FG
9013
9014 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9015 << " size 0x" << o->onode.size << std::dec << dendl;
9016
9017 boost::intrusive::set<Extent>::iterator ep, eend;
9018 if (offset >= o->onode.size)
9019 goto out;
9020
9021 if (offset + length > o->onode.size) {
9022 length = o->onode.size - offset;
9023 }
9024
9025 o->extent_map.fault_range(db, offset, length);
9026 eend = o->extent_map.extent_map.end();
9027 ep = o->extent_map.seek_lextent(offset);
9028 while (length > 0) {
9029 dout(20) << __func__ << " offset " << offset << dendl;
9030 if (ep != eend && ep->logical_offset + ep->length <= offset) {
9031 ++ep;
9032 continue;
9033 }
9034
9035 uint64_t x_len = length;
9036 if (ep != eend && ep->logical_offset <= offset) {
9037 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 9038 x_len = std::min(x_len, ep->length - x_off);
7c673cae
FG
9039 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
9040 << x_len << std::dec << " blob " << ep->blob << dendl;
9041 destset.insert(offset, x_len);
9042 length -= x_len;
9043 offset += x_len;
9044 if (x_off + x_len == ep->length)
9045 ++ep;
9046 continue;
9047 }
9048 if (ep != eend &&
9049 ep->logical_offset > offset &&
9050 ep->logical_offset - offset < x_len) {
9051 x_len = ep->logical_offset - offset;
9052 }
9053 offset += x_len;
9054 length -= x_len;
9055 }
9056 }
9057
9058 out:
7c673cae
FG
9059 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9060 << " size = 0x(" << destset << ")" << std::dec << dendl;
9061 return 0;
9062}
9063
7c673cae
FG
9064int BlueStore::fiemap(
9065 CollectionHandle &c_,
9066 const ghobject_t& oid,
9067 uint64_t offset,
9068 size_t length,
9069 bufferlist& bl)
9070{
9071 interval_set<uint64_t> m;
9072 int r = _fiemap(c_, oid, offset, length, m);
9073 if (r >= 0) {
11fdf7f2 9074 encode(m, bl);
7c673cae
FG
9075 }
9076 return r;
9077}
9078
7c673cae
FG
9079int BlueStore::fiemap(
9080 CollectionHandle &c_,
9081 const ghobject_t& oid,
9082 uint64_t offset,
9083 size_t length,
9084 map<uint64_t, uint64_t>& destmap)
9085{
9086 interval_set<uint64_t> m;
9087 int r = _fiemap(c_, oid, offset, length, m);
9088 if (r >= 0) {
9089 m.move_into(destmap);
9090 }
9091 return r;
9092}
9093
7c673cae
FG
9094int BlueStore::getattr(
9095 CollectionHandle &c_,
9096 const ghobject_t& oid,
9097 const char *name,
9098 bufferptr& value)
9099{
9100 Collection *c = static_cast<Collection *>(c_.get());
9101 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
9102 if (!c->exists)
9103 return -ENOENT;
9104
9105 int r;
9106 {
9107 RWLock::RLocker l(c->lock);
31f18b77 9108 mempool::bluestore_cache_other::string k(name);
7c673cae
FG
9109
9110 OnodeRef o = c->get_onode(oid, false);
9111 if (!o || !o->exists) {
9112 r = -ENOENT;
9113 goto out;
9114 }
9115
9116 if (!o->onode.attrs.count(k)) {
9117 r = -ENODATA;
9118 goto out;
9119 }
9120 value = o->onode.attrs[k];
9121 r = 0;
9122 }
9123 out:
7c673cae
FG
9124 if (r == 0 && _debug_mdata_eio(oid)) {
9125 r = -EIO;
9126 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9127 }
9128 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
9129 << " = " << r << dendl;
9130 return r;
9131}
9132
7c673cae
FG
9133int BlueStore::getattrs(
9134 CollectionHandle &c_,
9135 const ghobject_t& oid,
9136 map<string,bufferptr>& aset)
9137{
9138 Collection *c = static_cast<Collection *>(c_.get());
9139 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
9140 if (!c->exists)
9141 return -ENOENT;
9142
9143 int r;
9144 {
9145 RWLock::RLocker l(c->lock);
9146
9147 OnodeRef o = c->get_onode(oid, false);
9148 if (!o || !o->exists) {
9149 r = -ENOENT;
9150 goto out;
9151 }
9152 for (auto& i : o->onode.attrs) {
9153 aset.emplace(i.first.c_str(), i.second);
9154 }
9155 r = 0;
9156 }
9157
9158 out:
7c673cae
FG
9159 if (r == 0 && _debug_mdata_eio(oid)) {
9160 r = -EIO;
9161 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9162 }
9163 dout(10) << __func__ << " " << c->cid << " " << oid
9164 << " = " << r << dendl;
9165 return r;
9166}
9167
9168int BlueStore::list_collections(vector<coll_t>& ls)
9169{
9170 RWLock::RLocker l(coll_lock);
11fdf7f2 9171 ls.reserve(coll_map.size());
7c673cae
FG
9172 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
9173 p != coll_map.end();
9174 ++p)
9175 ls.push_back(p->first);
9176 return 0;
9177}
9178
9179bool BlueStore::collection_exists(const coll_t& c)
9180{
9181 RWLock::RLocker l(coll_lock);
9182 return coll_map.count(c);
9183}
9184
11fdf7f2 9185int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 9186{
11fdf7f2 9187 dout(15) << __func__ << " " << ch->cid << dendl;
7c673cae
FG
9188 vector<ghobject_t> ls;
9189 ghobject_t next;
11fdf7f2 9190 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
7c673cae
FG
9191 &ls, &next);
9192 if (r < 0) {
9193 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
9194 << dendl;
9195 return r;
9196 }
9197 *empty = ls.empty();
11fdf7f2 9198 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
7c673cae
FG
9199 return 0;
9200}
9201
11fdf7f2 9202int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 9203{
11fdf7f2
TL
9204 dout(15) << __func__ << " " << ch->cid << dendl;
9205 Collection *c = static_cast<Collection*>(ch.get());
7c673cae 9206 RWLock::RLocker l(c->lock);
11fdf7f2 9207 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
7c673cae
FG
9208 return c->cnode.bits;
9209}
9210
7c673cae
FG
9211int BlueStore::collection_list(
9212 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
9213 vector<ghobject_t> *ls, ghobject_t *pnext)
9214{
9215 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 9216 c->flush();
7c673cae
FG
9217 dout(15) << __func__ << " " << c->cid
9218 << " start " << start << " end " << end << " max " << max << dendl;
9219 int r;
9220 {
9221 RWLock::RLocker l(c->lock);
9222 r = _collection_list(c, start, end, max, ls, pnext);
9223 }
9224
7c673cae
FG
9225 dout(10) << __func__ << " " << c->cid
9226 << " start " << start << " end " << end << " max " << max
9227 << " = " << r << ", ls.size() = " << ls->size()
9228 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
9229 return r;
9230}
9231
9232int BlueStore::_collection_list(
9233 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
9234 vector<ghobject_t> *ls, ghobject_t *pnext)
9235{
9236
9237 if (!c->exists)
9238 return -ENOENT;
9239
9240 int r = 0;
9241 ghobject_t static_next;
9242 KeyValueDB::Iterator it;
9243 string temp_start_key, temp_end_key;
9244 string start_key, end_key;
9245 bool set_next = false;
9246 string pend;
9247 bool temp;
9248
9249 if (!pnext)
9250 pnext = &static_next;
9251
11fdf7f2 9252 if (start.is_max() || start.hobj.is_max()) {
7c673cae
FG
9253 goto out;
9254 }
9255 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
9256 &start_key, &end_key);
9257 dout(20) << __func__
9258 << " range " << pretty_binary_string(temp_start_key)
9259 << " to " << pretty_binary_string(temp_end_key)
9260 << " and " << pretty_binary_string(start_key)
9261 << " to " << pretty_binary_string(end_key)
9262 << " start " << start << dendl;
9263 it = db->get_iterator(PREFIX_OBJ);
9264 if (start == ghobject_t() ||
9265 start.hobj == hobject_t() ||
9266 start == c->cid.get_min_hobj()) {
9267 it->upper_bound(temp_start_key);
9268 temp = true;
9269 } else {
9270 string k;
9271 get_object_key(cct, start, &k);
9272 if (start.hobj.is_temp()) {
9273 temp = true;
11fdf7f2 9274 ceph_assert(k >= temp_start_key && k < temp_end_key);
7c673cae
FG
9275 } else {
9276 temp = false;
11fdf7f2 9277 ceph_assert(k >= start_key && k < end_key);
7c673cae 9278 }
11fdf7f2 9279 dout(20) << __func__ << " start from " << pretty_binary_string(k)
7c673cae
FG
9280 << " temp=" << (int)temp << dendl;
9281 it->lower_bound(k);
9282 }
9283 if (end.hobj.is_max()) {
9284 pend = temp ? temp_end_key : end_key;
9285 } else {
9286 get_object_key(cct, end, &end_key);
9287 if (end.hobj.is_temp()) {
9288 if (temp)
9289 pend = end_key;
9290 else
9291 goto out;
9292 } else {
9293 pend = temp ? temp_end_key : end_key;
9294 }
9295 }
9296 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
9297 while (true) {
9298 if (!it->valid() || it->key() >= pend) {
9299 if (!it->valid())
9300 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
9301 else
9302 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
9303 << " >= " << end << dendl;
9304 if (temp) {
9305 if (end.hobj.is_temp()) {
9306 break;
9307 }
9308 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
9309 temp = false;
9310 it->upper_bound(start_key);
9311 pend = end_key;
9312 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
9313 continue;
9314 }
9315 break;
9316 }
9317 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
9318 if (is_extent_shard_key(it->key())) {
9319 it->next();
9320 continue;
9321 }
9322 ghobject_t oid;
9323 int r = get_key_object(it->key(), &oid);
11fdf7f2 9324 ceph_assert(r == 0);
7c673cae
FG
9325 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
9326 if (ls->size() >= (unsigned)max) {
9327 dout(20) << __func__ << " reached max " << max << dendl;
9328 *pnext = oid;
9329 set_next = true;
9330 break;
9331 }
9332 ls->push_back(oid);
9333 it->next();
9334 }
9335out:
9336 if (!set_next) {
9337 *pnext = ghobject_t::get_max();
9338 }
9339
9340 return r;
9341}
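
// A minimal, self-contained sketch (not BlueStore code) of the two-range scan
// used above: objects live in a "temp" key range and a normal key range, so
// the listing walks the temp range first and then restarts in the normal
// range. std::map stands in for the KV iterator; the key-range strings are
// assumed to be precomputed as in get_coll_key_range().
#include <map>
#include <string>
#include <vector>

static void two_range_scan_example(
  const std::map<std::string, int>& kv,
  const std::string& temp_start, const std::string& temp_end,
  const std::string& start, const std::string& end,
  std::vector<std::string>* out)
{
  auto it = kv.upper_bound(temp_start);
  std::string pend = temp_end;
  bool temp = true;
  while (true) {
    if (it == kv.end() || it->first >= pend) {
      if (!temp)
        break;                         // finished the normal range too
      temp = false;                    // switch from temp to normal namespace
      it = kv.upper_bound(start);
      pend = end;
      continue;
    }
    out->push_back(it->first);
    ++it;
  }
}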
9342
7c673cae
FG
9343int BlueStore::omap_get(
9344 CollectionHandle &c_, ///< [in] Collection containing oid
9345 const ghobject_t &oid, ///< [in] Object containing omap
9346 bufferlist *header, ///< [out] omap header
 9347 map<string, bufferlist> *out ///< [out] Key to value map
9348 )
9349{
9350 Collection *c = static_cast<Collection *>(c_.get());
9351 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9352 if (!c->exists)
9353 return -ENOENT;
9354 RWLock::RLocker l(c->lock);
9355 int r = 0;
9356 OnodeRef o = c->get_onode(oid, false);
9357 if (!o || !o->exists) {
9358 r = -ENOENT;
9359 goto out;
9360 }
9361 if (!o->onode.has_omap())
9362 goto out;
9363 o->flush();
9364 {
11fdf7f2
TL
9365 const string& prefix =
9366 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9367 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae
FG
9368 string head, tail;
9369 get_omap_header(o->onode.nid, &head);
9370 get_omap_tail(o->onode.nid, &tail);
9371 it->lower_bound(head);
9372 while (it->valid()) {
9373 if (it->key() == head) {
9374 dout(30) << __func__ << " got header" << dendl;
9375 *header = it->value();
9376 } else if (it->key() >= tail) {
9377 dout(30) << __func__ << " reached tail" << dendl;
9378 break;
9379 } else {
9380 string user_key;
9381 decode_omap_key(it->key(), &user_key);
11fdf7f2 9382 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
7c673cae
FG
9383 << " -> " << user_key << dendl;
9384 (*out)[user_key] = it->value();
9385 }
9386 it->next();
9387 }
9388 }
9389 out:
9390 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9391 << dendl;
9392 return r;
9393}
9394
7c673cae
FG
9395int BlueStore::omap_get_header(
9396 CollectionHandle &c_, ///< [in] Collection containing oid
9397 const ghobject_t &oid, ///< [in] Object containing omap
9398 bufferlist *header, ///< [out] omap header
9399 bool allow_eio ///< [in] don't assert on eio
9400 )
9401{
9402 Collection *c = static_cast<Collection *>(c_.get());
9403 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9404 if (!c->exists)
9405 return -ENOENT;
9406 RWLock::RLocker l(c->lock);
9407 int r = 0;
9408 OnodeRef o = c->get_onode(oid, false);
9409 if (!o || !o->exists) {
9410 r = -ENOENT;
9411 goto out;
9412 }
9413 if (!o->onode.has_omap())
9414 goto out;
9415 o->flush();
9416 {
9417 string head;
9418 get_omap_header(o->onode.nid, &head);
11fdf7f2
TL
9419 if (db->get(o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
9420 head, header) >= 0) {
7c673cae
FG
9421 dout(30) << __func__ << " got header" << dendl;
9422 } else {
9423 dout(30) << __func__ << " no header" << dendl;
9424 }
9425 }
9426 out:
9427 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9428 << dendl;
9429 return r;
9430}
9431
7c673cae
FG
9432int BlueStore::omap_get_keys(
9433 CollectionHandle &c_, ///< [in] Collection containing oid
9434 const ghobject_t &oid, ///< [in] Object containing omap
9435 set<string> *keys ///< [out] Keys defined on oid
9436 )
9437{
9438 Collection *c = static_cast<Collection *>(c_.get());
9439 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9440 if (!c->exists)
9441 return -ENOENT;
9442 RWLock::RLocker l(c->lock);
9443 int r = 0;
9444 OnodeRef o = c->get_onode(oid, false);
9445 if (!o || !o->exists) {
9446 r = -ENOENT;
9447 goto out;
9448 }
9449 if (!o->onode.has_omap())
9450 goto out;
9451 o->flush();
9452 {
11fdf7f2
TL
9453 const string& prefix =
9454 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9455 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae
FG
9456 string head, tail;
9457 get_omap_key(o->onode.nid, string(), &head);
9458 get_omap_tail(o->onode.nid, &tail);
9459 it->lower_bound(head);
9460 while (it->valid()) {
9461 if (it->key() >= tail) {
9462 dout(30) << __func__ << " reached tail" << dendl;
9463 break;
9464 }
9465 string user_key;
9466 decode_omap_key(it->key(), &user_key);
11fdf7f2 9467 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
7c673cae
FG
9468 << " -> " << user_key << dendl;
9469 keys->insert(user_key);
9470 it->next();
11fdf7f2
TL
9471 }
9472 }
9473 out:
9474 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9475 << dendl;
9476 return r;
7c673cae
FG
9477}
9478
9479int BlueStore::omap_get_values(
9480 CollectionHandle &c_, ///< [in] Collection containing oid
9481 const ghobject_t &oid, ///< [in] Object containing omap
9482 const set<string> &keys, ///< [in] Keys to get
9483 map<string, bufferlist> *out ///< [out] Returned keys and values
9484 )
9485{
9486 Collection *c = static_cast<Collection *>(c_.get());
9487 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9488 if (!c->exists)
9489 return -ENOENT;
9490 RWLock::RLocker l(c->lock);
9491 int r = 0;
9492 string final_key;
9493 OnodeRef o = c->get_onode(oid, false);
9494 if (!o || !o->exists) {
9495 r = -ENOENT;
9496 goto out;
9497 }
9498 if (!o->onode.has_omap())
9499 goto out;
11fdf7f2
TL
9500 {
9501 const string& prefix =
9502 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9503 o->flush();
9504 _key_encode_u64(o->onode.nid, &final_key);
9505 final_key.push_back('.');
9506 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9507 final_key.resize(9); // keep prefix
9508 final_key += *p;
9509 bufferlist val;
9510 if (db->get(prefix, final_key, &val) >= 0) {
9511 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
9512 << " -> " << *p << dendl;
9513 out->insert(make_pair(*p, val));
9514 }
7c673cae
FG
9515 }
9516 }
9517 out:
9518 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9519 << dendl;
9520 return r;
9521}
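
// A hedged sketch (not the actual helpers) of the omap key layout relied on
// above: the per-object prefix is the 8-byte encoded nid followed by '.', i.e.
// 9 bytes, which is why final_key.resize(9) preserves the prefix between
// lookups. encode_u64_example() is only a stand-in for _key_encode_u64 and
// assumes a big-endian binary encoding so keys sort numerically.
#include <cstdint>
#include <string>

static void encode_u64_example(uint64_t v, std::string* out) {
  for (int i = 7; i >= 0; --i)
    out->push_back(char((v >> (i * 8)) & 0xff));   // 8 bytes, big-endian
}

static std::string omap_key_example(uint64_t nid, const std::string& user_key) {
  std::string k;
  encode_u64_example(nid, &k);   // 8-byte nid
  k.push_back('.');              // separator -> 9-byte per-object prefix
  k += user_key;
  return k;
}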
9522
7c673cae
FG
9523int BlueStore::omap_check_keys(
9524 CollectionHandle &c_, ///< [in] Collection containing oid
9525 const ghobject_t &oid, ///< [in] Object containing omap
9526 const set<string> &keys, ///< [in] Keys to check
9527 set<string> *out ///< [out] Subset of keys defined on oid
9528 )
9529{
9530 Collection *c = static_cast<Collection *>(c_.get());
9531 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9532 if (!c->exists)
9533 return -ENOENT;
9534 RWLock::RLocker l(c->lock);
9535 int r = 0;
9536 string final_key;
9537 OnodeRef o = c->get_onode(oid, false);
9538 if (!o || !o->exists) {
9539 r = -ENOENT;
9540 goto out;
9541 }
9542 if (!o->onode.has_omap())
9543 goto out;
11fdf7f2
TL
9544 {
9545 const string& prefix =
9546 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9547 o->flush();
9548 _key_encode_u64(o->onode.nid, &final_key);
9549 final_key.push_back('.');
9550 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9551 final_key.resize(9); // keep prefix
9552 final_key += *p;
9553 bufferlist val;
9554 if (db->get(prefix, final_key, &val) >= 0) {
9555 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
9556 << " -> " << *p << dendl;
9557 out->insert(*p);
9558 } else {
9559 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
9560 << " -> " << *p << dendl;
9561 }
7c673cae
FG
9562 }
9563 }
9564 out:
9565 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9566 << dendl;
9567 return r;
9568}
9569
7c673cae
FG
9570ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
9571 CollectionHandle &c_, ///< [in] collection
9572 const ghobject_t &oid ///< [in] object
9573 )
9574{
9575 Collection *c = static_cast<Collection *>(c_.get());
9576 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9577 if (!c->exists) {
9578 return ObjectMap::ObjectMapIterator();
9579 }
9580 RWLock::RLocker l(c->lock);
9581 OnodeRef o = c->get_onode(oid, false);
9582 if (!o || !o->exists) {
 9583 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
9584 return ObjectMap::ObjectMapIterator();
9585 }
9586 o->flush();
 9587 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
11fdf7f2
TL
9588 KeyValueDB::Iterator it = db->get_iterator(
9589 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP);
7c673cae
FG
9590 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
9591}
9592
9593// -----------------
9594// write helpers
9595
11fdf7f2
TL
9596uint64_t BlueStore::_get_ondisk_reserved() const {
9597 return round_up_to(
9598 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
9599}
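
// A worked example of the reservation arithmetic above, as a standalone
// sketch: SUPER_RESERVED is 8192 (label + bluefs superblock) and the result is
// the larger of that and min_alloc_size, rounded up to a min_alloc_size
// boundary. round_up_to() is modeled here with plain integer math under the
// assumption that min_alloc_size is non-zero.
#include <algorithm>
#include <cstdint>

static uint64_t ondisk_reserved_example(uint64_t min_alloc_size) {
  const uint64_t super_reserved = 8192;
  uint64_t v = std::max(super_reserved, min_alloc_size);
  return (v + min_alloc_size - 1) / min_alloc_size * min_alloc_size;
}
// e.g. ondisk_reserved_example(4096) == 8192, ondisk_reserved_example(65536) == 65536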
9600
7c673cae
FG
9601void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
9602{
9603 dout(10) << __func__ << " ondisk_format " << ondisk_format
9604 << " min_compat_ondisk_format " << min_compat_ondisk_format
9605 << dendl;
11fdf7f2 9606 ceph_assert(ondisk_format == latest_ondisk_format);
7c673cae
FG
9607 {
9608 bufferlist bl;
11fdf7f2 9609 encode(ondisk_format, bl);
7c673cae
FG
9610 t->set(PREFIX_SUPER, "ondisk_format", bl);
9611 }
9612 {
9613 bufferlist bl;
11fdf7f2 9614 encode(min_compat_ondisk_format, bl);
7c673cae
FG
9615 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
9616 }
9617}
9618
9619int BlueStore::_open_super_meta()
9620{
9621 // nid
9622 {
9623 nid_max = 0;
9624 bufferlist bl;
9625 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 9626 auto p = bl.cbegin();
7c673cae
FG
9627 try {
9628 uint64_t v;
11fdf7f2 9629 decode(v, p);
7c673cae
FG
9630 nid_max = v;
9631 } catch (buffer::error& e) {
9632 derr << __func__ << " unable to read nid_max" << dendl;
9633 return -EIO;
9634 }
9635 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
9636 nid_last = nid_max.load();
9637 }
9638
9639 // blobid
9640 {
9641 blobid_max = 0;
9642 bufferlist bl;
9643 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 9644 auto p = bl.cbegin();
7c673cae
FG
9645 try {
9646 uint64_t v;
11fdf7f2 9647 decode(v, p);
7c673cae
FG
9648 blobid_max = v;
9649 } catch (buffer::error& e) {
9650 derr << __func__ << " unable to read blobid_max" << dendl;
9651 return -EIO;
9652 }
9653 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
9654 blobid_last = blobid_max.load();
9655 }
9656
9657 // freelist
9658 {
9659 bufferlist bl;
9660 db->get(PREFIX_SUPER, "freelist_type", &bl);
9661 if (bl.length()) {
9662 freelist_type = std::string(bl.c_str(), bl.length());
9663 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
9664 } else {
11fdf7f2 9665 ceph_abort_msg("Not Support extent freelist manager");
7c673cae 9666 }
7c673cae
FG
9667 }
9668
9669 // ondisk format
9670 int32_t compat_ondisk_format = 0;
9671 {
9672 bufferlist bl;
9673 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
9674 if (r < 0) {
9675 // base case: kraken bluestore is v1 and readable by v1
9676 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
9677 << dendl;
9678 ondisk_format = 1;
9679 compat_ondisk_format = 1;
9680 } else {
11fdf7f2 9681 auto p = bl.cbegin();
7c673cae 9682 try {
11fdf7f2 9683 decode(ondisk_format, p);
7c673cae
FG
9684 } catch (buffer::error& e) {
9685 derr << __func__ << " unable to read ondisk_format" << dendl;
9686 return -EIO;
9687 }
9688 bl.clear();
9689 {
9690 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11fdf7f2
TL
9691 ceph_assert(!r);
9692 auto p = bl.cbegin();
7c673cae 9693 try {
11fdf7f2 9694 decode(compat_ondisk_format, p);
7c673cae
FG
9695 } catch (buffer::error& e) {
9696 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
9697 return -EIO;
9698 }
9699 }
9700 }
9701 dout(10) << __func__ << " ondisk_format " << ondisk_format
9702 << " compat_ondisk_format " << compat_ondisk_format
9703 << dendl;
9704 }
9705
9706 if (latest_ondisk_format < compat_ondisk_format) {
9707 derr << __func__ << " compat_ondisk_format is "
9708 << compat_ondisk_format << " but we only understand version "
9709 << latest_ondisk_format << dendl;
9710 return -EPERM;
9711 }
7c673cae
FG
9712
9713 {
9714 bufferlist bl;
9715 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 9716 auto p = bl.cbegin();
7c673cae
FG
9717 try {
9718 uint64_t val;
11fdf7f2 9719 decode(val, p);
7c673cae 9720 min_alloc_size = val;
224ce89b 9721 min_alloc_size_order = ctz(val);
11fdf7f2 9722 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
7c673cae
FG
9723 } catch (buffer::error& e) {
9724 derr << __func__ << " unable to read min_alloc_size" << dendl;
9725 return -EIO;
9726 }
9727 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
9728 << std::dec << dendl;
9729 }
224ce89b 9730 _open_statfs();
7c673cae
FG
9731 _set_alloc_sizes();
9732 _set_throttle_params();
9733
9734 _set_csum();
9735 _set_compression();
9736 _set_blob_size();
9737
11fdf7f2 9738 _validate_bdev();
7c673cae
FG
9739 return 0;
9740}
9741
9742int BlueStore::_upgrade_super()
9743{
9744 dout(1) << __func__ << " from " << ondisk_format << ", latest "
9745 << latest_ondisk_format << dendl;
11fdf7f2
TL
9746 if (ondisk_format < latest_ondisk_format) {
9747 ceph_assert(ondisk_format > 0);
9748 ceph_assert(ondisk_format < latest_ondisk_format);
9749
9750 if (ondisk_format == 1) {
9751 // changes:
9752 // - super: added ondisk_format
9753 // - super: added min_readable_ondisk_format
9754 // - super: added min_compat_ondisk_format
9755 // - super: added min_alloc_size
9756 // - super: removed min_min_alloc_size
9757 KeyValueDB::Transaction t = db->get_transaction();
9758 {
9759 bufferlist bl;
9760 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
9761 auto p = bl.cbegin();
9762 try {
9763 uint64_t val;
9764 decode(val, p);
9765 min_alloc_size = val;
9766 } catch (buffer::error& e) {
9767 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
9768 return -EIO;
9769 }
9770 t->set(PREFIX_SUPER, "min_alloc_size", bl);
9771 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 9772 }
11fdf7f2
TL
9773 ondisk_format = 2;
9774 _prepare_ondisk_format_super(t);
9775 int r = db->submit_transaction_sync(t);
9776 ceph_assert(r == 0);
7c673cae 9777 }
7c673cae 9778 }
7c673cae
FG
9779 // done
9780 dout(1) << __func__ << " done" << dendl;
9781 return 0;
9782}
9783
9784void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
9785{
224ce89b 9786 if (o->onode.nid) {
11fdf7f2 9787 ceph_assert(o->exists);
7c673cae 9788 return;
224ce89b 9789 }
7c673cae
FG
9790 uint64_t nid = ++nid_last;
9791 dout(20) << __func__ << " " << nid << dendl;
9792 o->onode.nid = nid;
9793 txc->last_nid = nid;
224ce89b 9794 o->exists = true;
7c673cae
FG
9795}
9796
9797uint64_t BlueStore::_assign_blobid(TransContext *txc)
9798{
9799 uint64_t bid = ++blobid_last;
9800 dout(20) << __func__ << " " << bid << dendl;
9801 txc->last_blobid = bid;
9802 return bid;
9803}
9804
9805void BlueStore::get_db_statistics(Formatter *f)
9806{
9807 db->get_statistics(f);
9808}
9809
11fdf7f2
TL
9810BlueStore::TransContext *BlueStore::_txc_create(
9811 Collection *c, OpSequencer *osr,
9812 list<Context*> *on_commits)
7c673cae 9813{
11fdf7f2 9814 TransContext *txc = new TransContext(cct, c, osr, on_commits);
7c673cae
FG
9815 txc->t = db->get_transaction();
9816 osr->queue_new(txc);
9817 dout(20) << __func__ << " osr " << osr << " = " << txc
9818 << " seq " << txc->seq << dendl;
9819 return txc;
9820}
9821
9822void BlueStore::_txc_calc_cost(TransContext *txc)
9823{
11fdf7f2
TL
9824 // one "io" for the kv commit
9825 auto ios = 1 + txc->ioc.get_num_ios();
7c673cae
FG
9826 auto cost = throttle_cost_per_io.load();
9827 txc->cost = ios * cost + txc->bytes;
9828 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
9829 << ios << " ios * " << cost << " + " << txc->bytes
9830 << " bytes)" << dendl;
9831}
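
// A worked example of the cost formula above, with purely illustrative
// numbers: a transaction that issued 3 data aios (plus the implicit kv-commit
// "io", so ios = 4) and carries 64 KiB of payload, with throttle_cost_per_io
// assumed to be 4000, gets cost = 4 * 4000 + 65536 = 81536.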
9832
9833void BlueStore::_txc_update_store_statfs(TransContext *txc)
9834{
9835 if (txc->statfs_delta.is_empty())
9836 return;
9837
9838 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
9839 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
9840 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
9841 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
9842 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
9843
9844 bufferlist bl;
9845 txc->statfs_delta.encode(bl);
11fdf7f2
TL
9846 if (per_pool_stat_collection) {
9847 string key;
9848 get_pool_stat_key(txc->osd_pool_id, &key);
9849 txc->t->merge(PREFIX_STAT, key, bl);
9850
9851 std::lock_guard l(vstatfs_lock);
9852 auto& stats = osd_pools[txc->osd_pool_id];
9853 stats += txc->statfs_delta;
9854
9855 vstatfs += txc->statfs_delta; //non-persistent in this mode
9856
9857 } else {
9858 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
7c673cae 9859
11fdf7f2
TL
9860 std::lock_guard l(vstatfs_lock);
9861 vstatfs += txc->statfs_delta;
9862 }
7c673cae
FG
9863 txc->statfs_delta.reset();
9864}
9865
9866void BlueStore::_txc_state_proc(TransContext *txc)
9867{
9868 while (true) {
9869 dout(10) << __func__ << " txc " << txc
9870 << " " << txc->get_state_name() << dendl;
9871 switch (txc->state) {
9872 case TransContext::STATE_PREPARE:
9873 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
9874 if (txc->ioc.has_pending_aios()) {
9875 txc->state = TransContext::STATE_AIO_WAIT;
9876 txc->had_ios = true;
9877 _txc_aio_submit(txc);
9878 return;
9879 }
9880 // ** fall-thru **
9881
9882 case TransContext::STATE_AIO_WAIT:
11fdf7f2
TL
9883 {
9884 utime_t lat = txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
9885 if (lat >= cct->_conf->bluestore_log_op_age) {
9886 dout(0) << __func__ << " slow aio_wait, txc = " << txc
9887 << ", latency = " << lat
9888 << dendl;
9889 }
9890 }
9891
7c673cae
FG
9892 _txc_finish_io(txc); // may trigger blocked txc's too
9893 return;
9894
9895 case TransContext::STATE_IO_DONE:
11fdf7f2 9896 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
7c673cae
FG
9897 if (txc->had_ios) {
9898 ++txc->osr->txc_with_unstable_io;
9899 }
9900 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
9901 txc->state = TransContext::STATE_KV_QUEUED;
9902 if (cct->_conf->bluestore_sync_submit_transaction) {
9903 if (txc->last_nid >= nid_max ||
9904 txc->last_blobid >= blobid_max) {
9905 dout(20) << __func__
9906 << " last_{nid,blobid} exceeds max, submit via kv thread"
9907 << dendl;
9908 } else if (txc->osr->kv_committing_serially) {
9909 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
9910 << dendl;
9911 // note: this is starvation-prone. once we have a txc in a busy
9912 // sequencer that is committing serially it is possible to keep
9913 // submitting new transactions fast enough that we get stuck doing
9914 // so. the alternative is to block here... fixme?
9915 } else if (txc->osr->txc_with_unstable_io) {
9916 dout(20) << __func__ << " prior txc(s) with unstable ios "
9917 << txc->osr->txc_with_unstable_io.load() << dendl;
9918 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
9919 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
9920 == 0) {
9921 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
9922 << dendl;
9923 } else {
9924 txc->state = TransContext::STATE_KV_SUBMITTED;
31f18b77 9925 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11fdf7f2 9926 ceph_assert(r == 0);
7c673cae
FG
9927 _txc_applied_kv(txc);
9928 }
9929 }
9930 {
11fdf7f2 9931 std::lock_guard l(kv_lock);
7c673cae
FG
9932 kv_queue.push_back(txc);
9933 kv_cond.notify_one();
9934 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
9935 kv_queue_unsubmitted.push_back(txc);
9936 ++txc->osr->kv_committing_serially;
9937 }
31f18b77
FG
9938 if (txc->had_ios)
9939 kv_ios++;
9940 kv_throttle_costs += txc->cost;
7c673cae
FG
9941 }
9942 return;
9943 case TransContext::STATE_KV_SUBMITTED:
7c673cae
FG
9944 _txc_committed_kv(txc);
9945 // ** fall-thru **
9946
9947 case TransContext::STATE_KV_DONE:
9948 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
9949 if (txc->deferred_txn) {
9950 txc->state = TransContext::STATE_DEFERRED_QUEUED;
9951 _deferred_queue(txc);
9952 return;
9953 }
9954 txc->state = TransContext::STATE_FINISHING;
9955 break;
9956
9957 case TransContext::STATE_DEFERRED_CLEANUP:
9958 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
9959 txc->state = TransContext::STATE_FINISHING;
9960 // ** fall-thru **
9961
9962 case TransContext::STATE_FINISHING:
9963 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
9964 _txc_finish(txc);
9965 return;
9966
9967 default:
9968 derr << __func__ << " unexpected txc " << txc
9969 << " state " << txc->get_state_name() << dendl;
11fdf7f2 9970 ceph_abort_msg("unexpected txc state");
7c673cae
FG
9971 return;
9972 }
9973 }
9974}
9975
9976void BlueStore::_txc_finish_io(TransContext *txc)
9977{
9978 dout(20) << __func__ << " " << txc << dendl;
9979
9980 /*
9981 * we need to preserve the order of kv transactions,
9982 * even though aio will complete in any order.
9983 */
9984
9985 OpSequencer *osr = txc->osr.get();
11fdf7f2 9986 std::lock_guard l(osr->qlock);
7c673cae 9987 txc->state = TransContext::STATE_IO_DONE;
11fdf7f2 9988 txc->ioc.release_running_aios();
7c673cae
FG
9989 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
9990 while (p != osr->q.begin()) {
9991 --p;
9992 if (p->state < TransContext::STATE_IO_DONE) {
9993 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
9994 << p->get_state_name() << dendl;
9995 return;
9996 }
9997 if (p->state > TransContext::STATE_IO_DONE) {
9998 ++p;
9999 break;
10000 }
10001 }
10002 do {
10003 _txc_state_proc(&*p++);
10004 } while (p != osr->q.end() &&
10005 p->state == TransContext::STATE_IO_DONE);
10006
11fdf7f2 10007 if (osr->kv_submitted_waiters) {
7c673cae
FG
10008 osr->qcond.notify_all();
10009 }
10010}
10011
10012void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
10013{
10014 dout(20) << __func__ << " txc " << txc
10015 << " onodes " << txc->onodes
10016 << " shared_blobs " << txc->shared_blobs
10017 << dendl;
10018
10019 // finalize onodes
10020 for (auto o : txc->onodes) {
11fdf7f2 10021 _record_onode(o, t);
7c673cae
FG
10022 o->flushing_count++;
10023 }
10024
10025 // objects we modified but didn't affect the onode
10026 auto p = txc->modified_objects.begin();
10027 while (p != txc->modified_objects.end()) {
10028 if (txc->onodes.count(*p) == 0) {
10029 (*p)->flushing_count++;
10030 ++p;
10031 } else {
10032 // remove dups with onodes list to avoid problems in _txc_finish
10033 p = txc->modified_objects.erase(p);
10034 }
10035 }
10036
10037 // finalize shared_blobs
10038 for (auto sb : txc->shared_blobs) {
10039 string key;
10040 auto sbid = sb->get_sbid();
10041 get_shared_blob_key(sbid, &key);
10042 if (sb->persistent->empty()) {
11fdf7f2
TL
10043 dout(20) << __func__ << " shared_blob 0x"
10044 << std::hex << sbid << std::dec
7c673cae
FG
10045 << " is empty" << dendl;
10046 t->rmkey(PREFIX_SHARED_BLOB, key);
10047 } else {
10048 bufferlist bl;
11fdf7f2
TL
10049 encode(*(sb->persistent), bl);
10050 dout(20) << __func__ << " shared_blob 0x"
10051 << std::hex << sbid << std::dec
31f18b77 10052 << " is " << bl.length() << " " << *sb << dendl;
7c673cae
FG
10053 t->set(PREFIX_SHARED_BLOB, key, bl);
10054 }
10055 }
10056}
10057
10058void BlueStore::BSPerfTracker::update_from_perfcounters(
10059 PerfCounters &logger)
10060{
11fdf7f2
TL
10061 os_commit_latency_ns.consume_next(
10062 logger.get_tavg_ns(
7c673cae 10063 l_bluestore_commit_lat));
11fdf7f2
TL
10064 os_apply_latency_ns.consume_next(
10065 logger.get_tavg_ns(
7c673cae
FG
10066 l_bluestore_commit_lat));
10067}
10068
10069void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
10070{
10071 dout(20) << __func__ << " txc " << txc << std::hex
10072 << " allocated 0x" << txc->allocated
10073 << " released 0x" << txc->released
10074 << std::dec << dendl;
10075
10076 // We have to handle the case where we allocate *and* deallocate the
10077 // same region in this transaction. The freelist doesn't like that.
10078 // (Actually, the only thing that cares is the BitmapFreelistManager
10079 // debug check. But that's important.)
10080 interval_set<uint64_t> tmp_allocated, tmp_released;
10081 interval_set<uint64_t> *pallocated = &txc->allocated;
10082 interval_set<uint64_t> *preleased = &txc->released;
10083 if (!txc->allocated.empty() && !txc->released.empty()) {
10084 interval_set<uint64_t> overlap;
10085 overlap.intersection_of(txc->allocated, txc->released);
10086 if (!overlap.empty()) {
10087 tmp_allocated = txc->allocated;
10088 tmp_allocated.subtract(overlap);
10089 tmp_released = txc->released;
10090 tmp_released.subtract(overlap);
10091 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
10092 << ", new allocated 0x" << tmp_allocated
10093 << " released 0x" << tmp_released << std::dec
10094 << dendl;
10095 pallocated = &tmp_allocated;
10096 preleased = &tmp_released;
10097 }
10098 }
10099
10100 // update freelist with non-overlap sets
10101 for (interval_set<uint64_t>::iterator p = pallocated->begin();
10102 p != pallocated->end();
10103 ++p) {
10104 fm->allocate(p.get_start(), p.get_len(), t);
10105 }
10106 for (interval_set<uint64_t>::iterator p = preleased->begin();
10107 p != preleased->end();
10108 ++p) {
10109 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
10110 << "~" << p.get_len() << std::dec << dendl;
10111 fm->release(p.get_start(), p.get_len(), t);
10112 }
10113
10114 _txc_update_store_statfs(txc);
10115}
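
// A small worked example of the overlap handling above, assuming the in-tree
// include/interval_set.h: if a transaction allocates [0x1000, 0x2000) and also
// releases [0x1800, 0x3000), the overlapping 0x800 bytes are dropped from both
// sets before the freelist is updated.
#include "include/interval_set.h"

static void overlap_example() {
  interval_set<uint64_t> allocated, released, overlap;
  allocated.insert(0x1000, 0x1000);              // [0x1000, 0x2000)
  released.insert(0x1800, 0x1800);               // [0x1800, 0x3000)
  overlap.intersection_of(allocated, released);  // [0x1800, 0x2000)
  allocated.subtract(overlap);                   // now [0x1000, 0x1800)
  released.subtract(overlap);                    // now [0x2000, 0x3000)
}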
10116
10117void BlueStore::_txc_applied_kv(TransContext *txc)
10118{
10119 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
10120 for (auto& o : *ls) {
10121 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
10122 << dendl;
10123 if (--o->flushing_count == 0) {
11fdf7f2 10124 std::lock_guard l(o->flush_lock);
7c673cae
FG
10125 o->flush_cond.notify_all();
10126 }
10127 }
10128 }
10129}
10130
10131void BlueStore::_txc_committed_kv(TransContext *txc)
10132{
10133 dout(20) << __func__ << " txc " << txc << dendl;
1adf2230 10134 {
11fdf7f2 10135 std::lock_guard l(txc->osr->qlock);
1adf2230 10136 txc->state = TransContext::STATE_KV_DONE;
11fdf7f2
TL
10137 if (txc->ch->commit_queue) {
10138 txc->ch->commit_queue->queue(txc->oncommits);
10139 } else {
10140 finisher.queue(txc->oncommits);
1adf2230 10141 }
7c673cae 10142 }
1adf2230 10143 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
11fdf7f2
TL
10144 LOG_LATENCY_FN(logger, cct,
10145 l_bluestore_commit_lat,
10146 ceph::make_timespan(ceph_clock_now() - txc->start),
10147 [&](auto lat) {
10148 return ", txc = " + stringify(txc);
10149 }
10150 );
7c673cae
FG
10151}
10152
10153void BlueStore::_txc_finish(TransContext *txc)
10154{
10155 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
11fdf7f2 10156 ceph_assert(txc->state == TransContext::STATE_FINISHING);
7c673cae
FG
10157
10158 for (auto& sb : txc->shared_blobs_written) {
f64942e4 10159 sb->finish_write(txc->seq);
7c673cae
FG
10160 }
10161 txc->shared_blobs_written.clear();
10162
10163 while (!txc->removed_collections.empty()) {
10164 _queue_reap_collection(txc->removed_collections.front());
10165 txc->removed_collections.pop_front();
10166 }
10167
10168 OpSequencerRef osr = txc->osr;
7c673cae 10169 bool empty = false;
31f18b77 10170 bool submit_deferred = false;
7c673cae
FG
10171 OpSequencer::q_list_t releasing_txc;
10172 {
11fdf7f2 10173 std::lock_guard l(osr->qlock);
7c673cae
FG
10174 txc->state = TransContext::STATE_DONE;
10175 bool notify = false;
10176 while (!osr->q.empty()) {
10177 TransContext *txc = &osr->q.front();
10178 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
10179 << dendl;
10180 if (txc->state != TransContext::STATE_DONE) {
10181 if (txc->state == TransContext::STATE_PREPARE &&
10182 deferred_aggressive) {
10183 // for _osr_drain_preceding()
10184 notify = true;
10185 }
31f18b77 10186 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 10187 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
31f18b77
FG
10188 submit_deferred = true;
10189 }
7c673cae
FG
10190 break;
10191 }
10192
7c673cae
FG
10193 osr->q.pop_front();
10194 releasing_txc.push_back(*txc);
10195 notify = true;
10196 }
10197 if (notify) {
10198 osr->qcond.notify_all();
10199 }
10200 if (osr->q.empty()) {
10201 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
10202 empty = true;
10203 }
10204 }
10205 while (!releasing_txc.empty()) {
10206 // release to allocator only after all preceding txc's have also
10207 // finished any deferred writes that potentially land in these
10208 // blocks
10209 auto txc = &releasing_txc.front();
10210 _txc_release_alloc(txc);
10211 releasing_txc.pop_front();
10212 txc->log_state_latency(logger, l_bluestore_state_done_lat);
10213 delete txc;
10214 }
10215
31f18b77
FG
10216 if (submit_deferred) {
10217 // we're pinning memory; flush! we could be more fine-grained here but
10218 // i'm not sure it's worth the bother.
10219 deferred_try_submit();
7c673cae
FG
10220 }
10221
7c673cae 10222 if (empty && osr->zombie) {
11fdf7f2
TL
10223 std::lock_guard l(zombie_osr_lock);
10224 if (zombie_osr_set.erase(osr->cid)) {
10225 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
10226 } else {
10227 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
10228 << dendl;
10229 }
7c673cae 10230 }
11fdf7f2 10231 }
7c673cae
FG
10232
10233void BlueStore::_txc_release_alloc(TransContext *txc)
10234{
a8e16298 10235 // it's expected we're called with lazy_release_lock already taken!
11fdf7f2
TL
10236 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
10237 int r = 0;
10238 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
10239 r = bdev->queue_discard(txc->released);
10240 if (r == 0) {
10241 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
10242 << txc->released << std::dec << dendl;
10243 goto out;
10244 }
10245 } else if (cct->_conf->bdev_enable_discard) {
10246 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
10247 bdev->discard(p.get_start(), p.get_len());
10248 }
10249 }
10250 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
94b18763 10251 << txc->released << std::dec << dendl;
11fdf7f2 10252 alloc->release(txc->released);
7c673cae
FG
10253 }
10254
11fdf7f2 10255out:
7c673cae
FG
10256 txc->allocated.clear();
10257 txc->released.clear();
10258}
10259
11fdf7f2
TL
10260void BlueStore::_osr_attach(Collection *c)
10261{
10262 // note: caller has RWLock on coll_map
10263 auto q = coll_map.find(c->cid);
10264 if (q != coll_map.end()) {
10265 c->osr = q->second->osr;
10266 ldout(cct, 10) << __func__ << " " << c->cid
10267 << " reusing osr " << c->osr << " from existing coll "
10268 << q->second << dendl;
10269 } else {
10270 std::lock_guard l(zombie_osr_lock);
10271 auto p = zombie_osr_set.find(c->cid);
10272 if (p == zombie_osr_set.end()) {
10273 c->osr = new OpSequencer(this, c->cid);
10274 ldout(cct, 10) << __func__ << " " << c->cid
10275 << " fresh osr " << c->osr << dendl;
10276 } else {
10277 c->osr = p->second;
10278 zombie_osr_set.erase(p);
10279 ldout(cct, 10) << __func__ << " " << c->cid
10280 << " resurrecting zombie osr " << c->osr << dendl;
10281 c->osr->zombie = false;
10282 }
10283 }
10284}
10285
10286void BlueStore::_osr_register_zombie(OpSequencer *osr)
10287{
10288 std::lock_guard l(zombie_osr_lock);
10289 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
10290 osr->zombie = true;
10291 auto i = zombie_osr_set.emplace(osr->cid, osr);
10292 // this is either a new insertion or the same osr is already there
10293 ceph_assert(i.second || i.first->second == osr);
10294}
10295
7c673cae
FG
10296void BlueStore::_osr_drain_preceding(TransContext *txc)
10297{
10298 OpSequencer *osr = txc->osr.get();
10299 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
10300 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
10301 {
10302 // submit anything pending
224ce89b 10303 deferred_lock.lock();
11fdf7f2 10304 if (osr->deferred_pending && !osr->deferred_running) {
224ce89b
WB
10305 _deferred_submit_unlock(osr);
10306 } else {
10307 deferred_lock.unlock();
7c673cae
FG
10308 }
10309 }
10310 {
10311 // wake up any previously finished deferred events
11fdf7f2 10312 std::lock_guard l(kv_lock);
7c673cae
FG
10313 kv_cond.notify_one();
10314 }
10315 osr->drain_preceding(txc);
10316 --deferred_aggressive;
10317 dout(10) << __func__ << " " << osr << " done" << dendl;
10318}
10319
11fdf7f2
TL
10320void BlueStore::_osr_drain(OpSequencer *osr)
10321{
10322 dout(10) << __func__ << " " << osr << dendl;
10323 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
10324 {
10325 // submit anything pending
10326 deferred_lock.lock();
10327 if (osr->deferred_pending && !osr->deferred_running) {
10328 _deferred_submit_unlock(osr);
10329 } else {
10330 deferred_lock.unlock();
10331 }
10332 }
10333 {
10334 // wake up any previously finished deferred events
10335 std::lock_guard l(kv_lock);
10336 kv_cond.notify_one();
10337 }
10338 osr->drain();
10339 --deferred_aggressive;
10340 dout(10) << __func__ << " " << osr << " done" << dendl;
10341}
10342
7c673cae
FG
10343void BlueStore::_osr_drain_all()
10344{
10345 dout(10) << __func__ << dendl;
10346
10347 set<OpSequencerRef> s;
11fdf7f2
TL
10348 vector<OpSequencerRef> zombies;
10349 {
10350 RWLock::RLocker l(coll_lock);
10351 for (auto& i : coll_map) {
10352 s.insert(i.second->osr);
10353 }
10354 }
7c673cae 10355 {
11fdf7f2
TL
10356 std::lock_guard l(zombie_osr_lock);
10357 for (auto& i : zombie_osr_set) {
10358 s.insert(i.second);
10359 zombies.push_back(i.second);
10360 }
7c673cae
FG
10361 }
10362 dout(20) << __func__ << " osr_set " << s << dendl;
10363
10364 ++deferred_aggressive;
10365 {
10366 // submit anything pending
224ce89b 10367 deferred_try_submit();
7c673cae
FG
10368 }
10369 {
10370 // wake up any previously finished deferred events
11fdf7f2 10371 std::lock_guard l(kv_lock);
7c673cae
FG
10372 kv_cond.notify_one();
10373 }
31f18b77 10374 {
11fdf7f2 10375 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
10376 kv_finalize_cond.notify_one();
10377 }
7c673cae
FG
10378 for (auto osr : s) {
10379 dout(20) << __func__ << " drain " << osr << dendl;
10380 osr->drain();
10381 }
10382 --deferred_aggressive;
10383
7c673cae 10384 {
11fdf7f2
TL
10385 std::lock_guard l(zombie_osr_lock);
10386 for (auto& osr : zombies) {
10387 if (zombie_osr_set.erase(osr->cid)) {
10388 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
10389 ceph_assert(osr->q.empty());
10390 } else if (osr->zombie) {
10391 dout(10) << __func__ << " empty zombie osr " << osr
10392 << " already reaped" << dendl;
10393 ceph_assert(osr->q.empty());
10394 } else {
10395 dout(10) << __func__ << " empty zombie osr " << osr
10396 << " resurrected" << dendl;
10397 }
7c673cae
FG
10398 }
10399 }
11fdf7f2
TL
10400
10401 dout(10) << __func__ << " done" << dendl;
7c673cae
FG
10402}
10403
11fdf7f2 10404
31f18b77
FG
10405void BlueStore::_kv_start()
10406{
10407 dout(10) << __func__ << dendl;
10408
181888fb 10409 deferred_finisher.start();
11fdf7f2 10410 finisher.start();
31f18b77
FG
10411 kv_sync_thread.create("bstore_kv_sync");
10412 kv_finalize_thread.create("bstore_kv_final");
10413}
10414
10415void BlueStore::_kv_stop()
10416{
10417 dout(10) << __func__ << dendl;
10418 {
11fdf7f2 10419 std::unique_lock l(kv_lock);
31f18b77
FG
10420 while (!kv_sync_started) {
10421 kv_cond.wait(l);
10422 }
10423 kv_stop = true;
10424 kv_cond.notify_all();
10425 }
10426 {
11fdf7f2 10427 std::unique_lock l(kv_finalize_lock);
31f18b77
FG
10428 while (!kv_finalize_started) {
10429 kv_finalize_cond.wait(l);
10430 }
10431 kv_finalize_stop = true;
10432 kv_finalize_cond.notify_all();
10433 }
10434 kv_sync_thread.join();
10435 kv_finalize_thread.join();
11fdf7f2 10436 ceph_assert(removed_collections.empty());
31f18b77 10437 {
11fdf7f2 10438 std::lock_guard l(kv_lock);
31f18b77
FG
10439 kv_stop = false;
10440 }
10441 {
11fdf7f2 10442 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
10443 kv_finalize_stop = false;
10444 }
10445 dout(10) << __func__ << " stopping finishers" << dendl;
181888fb
FG
10446 deferred_finisher.wait_for_empty();
10447 deferred_finisher.stop();
11fdf7f2
TL
10448 finisher.wait_for_empty();
10449 finisher.stop();
31f18b77
FG
10450 dout(10) << __func__ << " stopped" << dendl;
10451}
10452
7c673cae
FG
10453void BlueStore::_kv_sync_thread()
10454{
10455 dout(10) << __func__ << " start" << dendl;
11fdf7f2
TL
10456 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
10457 std::unique_lock l(kv_lock);
10458 ceph_assert(!kv_sync_started);
31f18b77
FG
10459 kv_sync_started = true;
10460 kv_cond.notify_all();
7c673cae 10461 while (true) {
11fdf7f2 10462 ceph_assert(kv_committing.empty());
7c673cae
FG
10463 if (kv_queue.empty() &&
10464 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 10465 !deferred_aggressive)) {
7c673cae
FG
10466 if (kv_stop)
10467 break;
10468 dout(20) << __func__ << " sleep" << dendl;
11fdf7f2 10469 kv_cond.wait(l);
7c673cae
FG
10470 dout(20) << __func__ << " wake" << dendl;
10471 } else {
10472 deque<TransContext*> kv_submitting;
10473 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
10474 uint64_t aios = 0, costs = 0;
10475
7c673cae
FG
10476 dout(20) << __func__ << " committing " << kv_queue.size()
10477 << " submitting " << kv_queue_unsubmitted.size()
10478 << " deferred done " << deferred_done_queue.size()
10479 << " stable " << deferred_stable_queue.size()
10480 << dendl;
10481 kv_committing.swap(kv_queue);
10482 kv_submitting.swap(kv_queue_unsubmitted);
10483 deferred_done.swap(deferred_done_queue);
10484 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
10485 aios = kv_ios;
10486 costs = kv_throttle_costs;
10487 kv_ios = 0;
10488 kv_throttle_costs = 0;
7c673cae
FG
10489 l.unlock();
10490
10491 dout(30) << __func__ << " committing " << kv_committing << dendl;
10492 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
10493 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
10494 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
10495
11fdf7f2
TL
10496 auto start = mono_clock::now();
10497
7c673cae
FG
10498 bool force_flush = false;
10499 // if bluefs is sharing the same device as data (only), then we
10500 // can rely on the bluefs commit to flush the device and make
10501 // deferred aios stable. that means that if we do have done deferred
10502 // txcs AND we are not on a single device, we need to force a flush.
10503 if (bluefs_single_shared_device && bluefs) {
31f18b77 10504 if (aios) {
7c673cae 10505 force_flush = true;
11fdf7f2 10506 } else if (kv_committing.empty() && deferred_stable.empty()) {
7c673cae
FG
10507 force_flush = true; // there's nothing else to commit!
10508 } else if (deferred_aggressive) {
10509 force_flush = true;
10510 }
11fdf7f2
TL
10511 } else {
10512 if (aios || !deferred_done.empty()) {
10513 force_flush = true;
10514 } else {
10515 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
10516 }
10517 }
7c673cae
FG
10518
10519 if (force_flush) {
31f18b77 10520 dout(20) << __func__ << " num_aios=" << aios
7c673cae
FG
10521 << " force_flush=" << (int)force_flush
10522 << ", flushing, deferred done->stable" << dendl;
10523 // flush/barrier on block device
10524 bdev->flush();
10525
10526 // if we flush then deferred done are now deferred stable
10527 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
10528 deferred_done.end());
10529 deferred_done.clear();
10530 }
11fdf7f2 10531 auto after_flush = mono_clock::now();
7c673cae
FG
10532
10533 // we will use one final transaction to force a sync
10534 KeyValueDB::Transaction synct = db->get_transaction();
10535
10536 // increase {nid,blobid}_max? note that this covers both the
10537 // case where we are approaching the max and the case we passed
10538 // it. in either case, we increase the max in the earlier txn
10539 // we submit.
10540 uint64_t new_nid_max = 0, new_blobid_max = 0;
10541 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
10542 KeyValueDB::Transaction t =
10543 kv_submitting.empty() ? synct : kv_submitting.front()->t;
10544 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
10545 bufferlist bl;
11fdf7f2 10546 encode(new_nid_max, bl);
7c673cae
FG
10547 t->set(PREFIX_SUPER, "nid_max", bl);
10548 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
10549 }
10550 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
10551 KeyValueDB::Transaction t =
10552 kv_submitting.empty() ? synct : kv_submitting.front()->t;
10553 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
10554 bufferlist bl;
11fdf7f2 10555 encode(new_blobid_max, bl);
7c673cae
FG
10556 t->set(PREFIX_SUPER, "blobid_max", bl);
10557 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
10558 }
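// [Editor's sketch, not part of the original source] Worked example of the
// preallocation arithmetic above, assuming a hypothetical
// bluestore_nid_prealloc of 1024:
//   nid_last = 3900, nid_max = 4096
//   3900 + 1024/2 = 4412 > 4096, so we persist new_nid_max = 3900 + 1024 = 4924
// i.e. the bump is taken once we are within half a prealloc window of the
// current max (or have already passed it), and it is written into the first
// transaction submitted this cycle (or the final sync transaction if nothing
// else is being submitted).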
c07f9fc5
FG
10559
10560 for (auto txc : kv_committing) {
10561 if (txc->state == TransContext::STATE_KV_QUEUED) {
10562 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
10563 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11fdf7f2 10564 ceph_assert(r == 0);
c07f9fc5
FG
10565 _txc_applied_kv(txc);
10566 --txc->osr->kv_committing_serially;
10567 txc->state = TransContext::STATE_KV_SUBMITTED;
10568 if (txc->osr->kv_submitted_waiters) {
11fdf7f2
TL
10569 std::lock_guard l(txc->osr->qlock);
10570 txc->osr->qcond.notify_all();
7c673cae 10571 }
c07f9fc5
FG
10572
10573 } else {
11fdf7f2 10574 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
c07f9fc5 10575 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
7c673cae 10576 }
7c673cae
FG
10577 if (txc->had_ios) {
10578 --txc->osr->txc_with_unstable_io;
10579 }
7c673cae
FG
10580 }
10581
31f18b77
FG
10582 // release throttle *before* we commit. this allows new ops
10583 // to be prepared and enter the pipeline while we are waiting on
10584 // the kv commit sync/flush. then hopefully on the next
10585 // iteration there will already be ops awake. otherwise, we
10586 // end up going to sleep, and then wake up when the very first
10587 // transaction is ready for commit.
10588 throttle_bytes.put(costs);
10589
7c673cae
FG
10590 if (bluefs &&
10591 after_flush - bluefs_last_balance >
11fdf7f2 10592 ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
7c673cae 10593 bluefs_last_balance = after_flush;
11fdf7f2
TL
10594 int r = _balance_bluefs_freespace();
10595 ceph_assert(r >= 0);
7c673cae
FG
10596 }
10597
10598 // clean up deferred keys that are now stable
10599 for (auto b : deferred_stable) {
10600 for (auto& txc : b->txcs) {
10601 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 10602 ceph_assert(wt.released.empty()); // only kraken did this
7c673cae
FG
10603 string key;
10604 get_deferred_key(wt.seq, &key);
10605 synct->rm_single_key(PREFIX_DEFERRED, key);
10606 }
10607 }
10608
10609 // submit synct synchronously (block and wait for it to commit)
31f18b77 10610 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
11fdf7f2
TL
10611 ceph_assert(r == 0);
10612
10613 {
10614 std::unique_lock m(kv_finalize_lock);
10615 if (kv_committing_to_finalize.empty()) {
10616 kv_committing_to_finalize.swap(kv_committing);
10617 } else {
10618 kv_committing_to_finalize.insert(
10619 kv_committing_to_finalize.end(),
10620 kv_committing.begin(),
10621 kv_committing.end());
10622 kv_committing.clear();
10623 }
10624 if (deferred_stable_to_finalize.empty()) {
10625 deferred_stable_to_finalize.swap(deferred_stable);
10626 } else {
10627 deferred_stable_to_finalize.insert(
10628 deferred_stable_to_finalize.end(),
10629 deferred_stable.begin(),
10630 deferred_stable.end());
10631 deferred_stable.clear();
10632 }
10633 kv_finalize_cond.notify_one();
10634 }
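// [Editor's note, not part of the original source] At this point the
// synchronous kv commit is durable; the committed txcs and the now-stable
// deferred batches have been handed off to _kv_finalize_thread, which
// advances their state machines (and deletes the batches) off the kv_sync
// critical path.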
7c673cae
FG
10635
10636 if (new_nid_max) {
10637 nid_max = new_nid_max;
10638 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
10639 }
10640 if (new_blobid_max) {
10641 blobid_max = new_blobid_max;
10642 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
10643 }
10644
224ce89b 10645 {
11fdf7f2
TL
10646 auto finish = mono_clock::now();
10647 ceph::timespan dur_flush = after_flush - start;
10648 ceph::timespan dur_kv = finish - after_flush;
10649 ceph::timespan dur = finish - start;
224ce89b
WB
10650 dout(20) << __func__ << " committed " << kv_committing.size()
10651 << " cleaned " << deferred_stable.size()
10652 << " in " << dur
10653 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
10654 << dendl;
11fdf7f2
TL
10655 LOG_LATENCY(logger, cct, l_bluestore_kv_flush_lat, dur_flush);
10656 LOG_LATENCY(logger, cct, l_bluestore_kv_commit_lat, dur_kv);
10657 LOG_LATENCY(logger, cct, l_bluestore_kv_sync_lat, dur);
7c673cae 10658 }
31f18b77
FG
10659
10660 if (bluefs) {
11fdf7f2
TL
10661 if (!bluefs_extents_reclaiming.empty()) {
10662 dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
10663 << bluefs_extents_reclaiming << std::dec << dendl;
81eedcae
TL
10664 int r = 0;
10665 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
10666 r = bdev->queue_discard(bluefs_extents_reclaiming);
10667 if (r == 0) {
10668 goto clear;
10669 }
10670 } else if (cct->_conf->bdev_enable_discard) {
10671 for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
10672 bdev->discard(p.get_start(), p.get_len());
10673 }
10674 }
10675
11fdf7f2 10676 alloc->release(bluefs_extents_reclaiming);
81eedcae 10677clear:
11fdf7f2 10678 bluefs_extents_reclaiming.clear();
31f18b77 10679 }
31f18b77
FG
10680 }
10681
10682 l.lock();
10683 // previously deferred "done" are now "stable" by virtue of this
10684 // commit cycle.
10685 deferred_stable_queue.swap(deferred_done);
10686 }
10687 }
10688 dout(10) << __func__ << " finish" << dendl;
10689 kv_sync_started = false;
10690}
10691
10692void BlueStore::_kv_finalize_thread()
10693{
10694 deque<TransContext*> kv_committed;
10695 deque<DeferredBatch*> deferred_stable;
10696 dout(10) << __func__ << " start" << dendl;
11fdf7f2
TL
10697 std::unique_lock l(kv_finalize_lock);
10698 ceph_assert(!kv_finalize_started);
31f18b77
FG
10699 kv_finalize_started = true;
10700 kv_finalize_cond.notify_all();
10701 while (true) {
11fdf7f2
TL
10702 ceph_assert(kv_committed.empty());
10703 ceph_assert(deferred_stable.empty());
31f18b77
FG
10704 if (kv_committing_to_finalize.empty() &&
10705 deferred_stable_to_finalize.empty()) {
10706 if (kv_finalize_stop)
10707 break;
10708 dout(20) << __func__ << " sleep" << dendl;
10709 kv_finalize_cond.wait(l);
10710 dout(20) << __func__ << " wake" << dendl;
10711 } else {
10712 kv_committed.swap(kv_committing_to_finalize);
10713 deferred_stable.swap(deferred_stable_to_finalize);
10714 l.unlock();
10715 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
10716 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
10717
11fdf7f2
TL
10718 auto start = mono_clock::now();
10719
31f18b77
FG
10720 while (!kv_committed.empty()) {
10721 TransContext *txc = kv_committed.front();
11fdf7f2 10722 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
7c673cae 10723 _txc_state_proc(txc);
31f18b77 10724 kv_committed.pop_front();
7c673cae 10725 }
31f18b77 10726
7c673cae
FG
10727 for (auto b : deferred_stable) {
10728 auto p = b->txcs.begin();
10729 while (p != b->txcs.end()) {
10730 TransContext *txc = &*p;
10731 p = b->txcs.erase(p); // unlink here because
10732 _txc_state_proc(txc); // this may destroy txc
10733 }
10734 delete b;
10735 }
31f18b77 10736 deferred_stable.clear();
7c673cae
FG
10737
10738 if (!deferred_aggressive) {
31f18b77 10739 if (deferred_queue_size >= deferred_batch_ops.load() ||
7c673cae 10740 throttle_deferred_bytes.past_midpoint()) {
224ce89b 10741 deferred_try_submit();
7c673cae
FG
10742 }
10743 }
10744
10745 // this is as good a place as any ...
10746 _reap_collections();
10747
11fdf7f2
TL
10748 logger->set(l_bluestore_fragmentation,
10749 (uint64_t)(alloc->get_fragmentation(min_alloc_size) * 1000));
10750
10751 LOG_LATENCY(logger, cct, l_bluestore_kv_final_lat, mono_clock::now() - start);
10752
7c673cae 10753 l.lock();
7c673cae
FG
10754 }
10755 }
10756 dout(10) << __func__ << " finish" << dendl;
31f18b77 10757 kv_finalize_started = false;
7c673cae
FG
10758}
10759
10760bluestore_deferred_op_t *BlueStore::_get_deferred_op(
10761 TransContext *txc, OnodeRef o)
10762{
10763 if (!txc->deferred_txn) {
10764 txc->deferred_txn = new bluestore_deferred_transaction_t;
10765 }
10766 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
10767 return &txc->deferred_txn->ops.back();
10768}
10769
10770void BlueStore::_deferred_queue(TransContext *txc)
10771{
10772 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
224ce89b 10773 deferred_lock.lock();
7c673cae
FG
10774 if (!txc->osr->deferred_pending &&
10775 !txc->osr->deferred_running) {
10776 deferred_queue.push_back(*txc->osr);
10777 }
10778 if (!txc->osr->deferred_pending) {
10779 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
10780 }
10781 ++deferred_queue_size;
10782 txc->osr->deferred_pending->txcs.push_back(*txc);
10783 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
10784 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
10785 const auto& op = *opi;
11fdf7f2 10786 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
7c673cae
FG
10787 bufferlist::const_iterator p = op.data.begin();
10788 for (auto e : op.extents) {
10789 txc->osr->deferred_pending->prepare_write(
10790 cct, wt.seq, e.offset, e.length, p);
10791 }
10792 }
10793 if (deferred_aggressive &&
10794 !txc->osr->deferred_running) {
224ce89b
WB
10795 _deferred_submit_unlock(txc->osr.get());
10796 } else {
10797 deferred_lock.unlock();
7c673cae
FG
10798 }
10799}
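// [Editor's note, not part of the original source] prepare_write() above
// registers each extent's payload in the batch's iomap, keyed by device
// offset; _deferred_submit_unlock() later walks that map and coalesces
// contiguous ranges into single aio_writes (see the sketch after that
// function).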
10800
224ce89b 10801void BlueStore::deferred_try_submit()
7c673cae
FG
10802{
10803 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
10804 << deferred_queue_size << " txcs" << dendl;
11fdf7f2 10805 std::lock_guard l(deferred_lock);
224ce89b
WB
10806 vector<OpSequencerRef> osrs;
10807 osrs.reserve(deferred_queue.size());
7c673cae 10808 for (auto& osr : deferred_queue) {
224ce89b
WB
10809 osrs.push_back(&osr);
10810 }
10811 for (auto& osr : osrs) {
181888fb
FG
10812 if (osr->deferred_pending) {
10813 if (!osr->deferred_running) {
10814 _deferred_submit_unlock(osr.get());
10815 deferred_lock.lock();
10816 } else {
10817 dout(20) << __func__ << " osr " << osr << " already has running"
10818 << dendl;
10819 }
10820 } else {
10821 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
7c673cae
FG
10822 }
10823 }
10824}
10825
224ce89b 10826void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
7c673cae
FG
10827{
10828 dout(10) << __func__ << " osr " << osr
10829 << " " << osr->deferred_pending->iomap.size() << " ios pending "
10830 << dendl;
11fdf7f2
TL
10831 ceph_assert(osr->deferred_pending);
10832 ceph_assert(!osr->deferred_running);
7c673cae
FG
10833
10834 auto b = osr->deferred_pending;
10835 deferred_queue_size -= b->seq_bytes.size();
11fdf7f2 10836 ceph_assert(deferred_queue_size >= 0);
7c673cae
FG
10837
10838 osr->deferred_running = osr->deferred_pending;
10839 osr->deferred_pending = nullptr;
10840
11fdf7f2
TL
10841 deferred_lock.unlock();
10842
10843 for (auto& txc : b->txcs) {
10844 txc.log_state_latency(logger, l_bluestore_state_deferred_queued_lat);
10845 }
7c673cae
FG
10846 uint64_t start = 0, pos = 0;
10847 bufferlist bl;
10848 auto i = b->iomap.begin();
10849 while (true) {
10850 if (i == b->iomap.end() || i->first != pos) {
10851 if (bl.length()) {
10852 dout(20) << __func__ << " write 0x" << std::hex
10853 << start << "~" << bl.length()
10854 << " crc " << bl.crc32c(-1) << std::dec << dendl;
11fdf7f2 10855 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
10856 logger->inc(l_bluestore_deferred_write_ops);
10857 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
10858 int r = bdev->aio_write(start, bl, &b->ioc, false);
11fdf7f2 10859 ceph_assert(r == 0);
7c673cae
FG
10860 }
10861 }
10862 if (i == b->iomap.end()) {
10863 break;
10864 }
10865 start = 0;
10866 pos = i->first;
10867 bl.clear();
10868 }
10869 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
10870 << std::hex << pos << "~" << i->second.bl.length() << std::dec
10871 << dendl;
10872 if (!bl.length()) {
10873 start = pos;
10874 }
10875 pos += i->second.bl.length();
10876 bl.claim_append(i->second.bl);
10877 ++i;
10878 }
224ce89b 10879
7c673cae
FG
10880 bdev->aio_submit(&b->ioc);
10881}
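// [Editor's sketch, not part of the original source] The loop above coalesces
// physically contiguous iomap entries into single aio_writes. For a
// hypothetical batch
//   iomap = { 0x1000: 4K, 0x2000: 4K, 0x8000: 4K }
// it issues two writes: 0x1000~0x2000 (the first two entries merge because the
// second starts exactly where the first ends) and 0x8000~0x1000 (the gap at
// 0x3000 forces the accumulated bufferlist to be flushed before restarting).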
10882
3efd9988
FG
10883struct C_DeferredTrySubmit : public Context {
10884 BlueStore *store;
10885 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
10886 void finish(int r) {
10887 store->deferred_try_submit();
10888 }
10889};
10890
7c673cae
FG
10891void BlueStore::_deferred_aio_finish(OpSequencer *osr)
10892{
10893 dout(10) << __func__ << " osr " << osr << dendl;
11fdf7f2 10894 ceph_assert(osr->deferred_running);
7c673cae
FG
10895 DeferredBatch *b = osr->deferred_running;
10896
10897 {
11fdf7f2
TL
10898 std::lock_guard l(deferred_lock);
10899 ceph_assert(osr->deferred_running == b);
7c673cae
FG
10900 osr->deferred_running = nullptr;
10901 if (!osr->deferred_pending) {
181888fb 10902 dout(20) << __func__ << " dequeueing" << dendl;
7c673cae
FG
10903 auto q = deferred_queue.iterator_to(*osr);
10904 deferred_queue.erase(q);
10905 } else if (deferred_aggressive) {
224ce89b 10906 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
3efd9988 10907 deferred_finisher.queue(new C_DeferredTrySubmit(this));
181888fb
FG
10908 } else {
10909 dout(20) << __func__ << " leaving queued, more pending" << dendl;
7c673cae
FG
10910 }
10911 }
10912
10913 {
31f18b77 10914 uint64_t costs = 0;
11fdf7f2
TL
10915 {
10916 std::lock_guard l2(osr->qlock);
10917 for (auto& i : b->txcs) {
10918 TransContext *txc = &i;
10919 txc->log_state_latency(logger, l_bluestore_state_deferred_aio_wait_lat);
10920 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
10921 costs += txc->cost;
10922 }
7c673cae 10923 }
31f18b77 10924 throttle_deferred_bytes.put(costs);
11fdf7f2 10925 std::lock_guard l(kv_lock);
7c673cae
FG
10926 deferred_done_queue.emplace_back(b);
10927 }
10928
10929 // in the normal case, do not bother waking up the kv thread; it will
10930 // catch us on the next commit anyway.
10931 if (deferred_aggressive) {
11fdf7f2 10932 std::lock_guard l(kv_lock);
7c673cae
FG
10933 kv_cond.notify_one();
10934 }
10935}
10936
10937int BlueStore::_deferred_replay()
10938{
10939 dout(10) << __func__ << " start" << dendl;
7c673cae
FG
10940 int count = 0;
10941 int r = 0;
11fdf7f2
TL
10942 CollectionRef ch = _get_collection(coll_t::meta());
10943 bool fake_ch = false;
10944 if (!ch) {
10945 // hmm, replaying initial mkfs?
10946 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
10947 fake_ch = true;
10948 }
10949 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
7c673cae
FG
10950 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
10951 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
10952 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
10953 << dendl;
10954 bluestore_deferred_transaction_t *deferred_txn =
10955 new bluestore_deferred_transaction_t;
10956 bufferlist bl = it->value();
11fdf7f2 10957 auto p = bl.cbegin();
7c673cae 10958 try {
11fdf7f2 10959 decode(*deferred_txn, p);
7c673cae
FG
10960 } catch (buffer::error& e) {
10961 derr << __func__ << " failed to decode deferred txn "
10962 << pretty_binary_string(it->key()) << dendl;
10963 delete deferred_txn;
10964 r = -EIO;
10965 goto out;
10966 }
11fdf7f2 10967 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
7c673cae
FG
10968 txc->deferred_txn = deferred_txn;
10969 txc->state = TransContext::STATE_KV_DONE;
10970 _txc_state_proc(txc);
10971 }
10972 out:
10973 dout(20) << __func__ << " draining osr" << dendl;
11fdf7f2 10974 _osr_register_zombie(osr);
7c673cae 10975 _osr_drain_all();
11fdf7f2
TL
10976 if (fake_ch) {
10977 new_coll_map.clear();
10978 }
7c673cae
FG
10979 dout(10) << __func__ << " completed " << count << " events" << dendl;
10980 return r;
10981}
10982
10983// ---------------------------
10984// transactions
10985
10986int BlueStore::queue_transactions(
11fdf7f2
TL
10987 CollectionHandle& ch,
10988 vector<Transaction>& tls,
10989 TrackedOpRef op,
10990 ThreadPool::TPHandle *handle)
10991{
10992 FUNCTRACE(cct);
10993 list<Context *> on_applied, on_commit, on_applied_sync;
7c673cae 10994 ObjectStore::Transaction::collect_contexts(
11fdf7f2 10995 tls, &on_applied, &on_commit, &on_applied_sync);
7c673cae
FG
10996
10997 if (cct->_conf->objectstore_blackhole) {
10998 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
10999 << dendl;
11fdf7f2
TL
11000 for (auto& l : { on_applied, on_commit, on_applied_sync }) {
11001 for (auto c : l) {
11002 delete c;
11003 }
11004 }
7c673cae
FG
11005 return 0;
11006 }
11fdf7f2
TL
11007 auto start = mono_clock::now();
11008
11009 Collection *c = static_cast<Collection*>(ch.get());
11010 OpSequencer *osr = c->osr.get();
11011 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
7c673cae
FG
11012
11013 // prepare
11fdf7f2
TL
11014 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
11015 &on_commit);
7c673cae
FG
11016
11017 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
7c673cae
FG
11018 txc->bytes += (*p).get_num_bytes();
11019 _txc_add_transaction(txc, &(*p));
11020 }
11021 _txc_calc_cost(txc);
11022
11023 _txc_write_nodes(txc, txc->t);
11024
11025 // journal deferred items
11026 if (txc->deferred_txn) {
11027 txc->deferred_txn->seq = ++deferred_seq;
11028 bufferlist bl;
11fdf7f2 11029 encode(*txc->deferred_txn, bl);
7c673cae
FG
11030 string key;
11031 get_deferred_key(txc->deferred_txn->seq, &key);
11032 txc->t->set(PREFIX_DEFERRED, key, bl);
11033 }
11034
11035 _txc_finalize_kv(txc, txc->t);
11036 if (handle)
11037 handle->suspend_tp_timeout();
11038
11fdf7f2 11039 auto tstart = mono_clock::now();
7c673cae
FG
11040 throttle_bytes.get(txc->cost);
11041 if (txc->deferred_txn) {
11042 // ensure we do not block here because of deferred writes
11043 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
d2e6a577
FG
11044 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
11045 << dendl;
11046 ++deferred_aggressive;
7c673cae 11047 deferred_try_submit();
3efd9988
FG
11048 {
11049 // wake up any previously finished deferred events
11fdf7f2 11050 std::lock_guard l(kv_lock);
3efd9988
FG
11051 kv_cond.notify_one();
11052 }
7c673cae 11053 throttle_deferred_bytes.get(txc->cost);
d2e6a577
FG
11054 --deferred_aggressive;
11055 }
7c673cae 11056 }
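// [Editor's note, not part of the original source] The block above is the
// deferred-write backpressure path: if the deferred throttle cannot be taken
// without blocking, we temporarily raise deferred_aggressive so pending
// batches get flushed (and the kv thread is woken to stabilize finished
// ones), then block on the throttle until enough deferred bytes drain.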
11fdf7f2 11057 auto tend = mono_clock::now();
7c673cae
FG
11058
11059 if (handle)
11060 handle->reset_tp_timeout();
11061
11062 logger->inc(l_bluestore_txc);
11063
11064 // execute (start)
11065 _txc_state_proc(txc);
11066
11fdf7f2
TL
11067 // we're immediately readable (unlike FileStore)
11068 for (auto c : on_applied_sync) {
11069 c->complete(0);
11070 }
11071 if (!on_applied.empty()) {
11072 if (c->commit_queue) {
11073 c->commit_queue->queue(on_applied);
11074 } else {
11075 finisher.queue(on_applied);
11076 }
11077 }
11078
11079 LOG_LATENCY(logger, cct, l_bluestore_submit_lat, mono_clock::now() - start);
11080 LOG_LATENCY(logger, cct, l_bluestore_throttle_lat, tend - tstart);
7c673cae
FG
11081 return 0;
11082}
11083
11084void BlueStore::_txc_aio_submit(TransContext *txc)
11085{
11086 dout(10) << __func__ << " txc " << txc << dendl;
11087 bdev->aio_submit(&txc->ioc);
11088}
11089
11090void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
11091{
11092 Transaction::iterator i = t->begin();
11093
81eedcae 11094 _dump_transaction<30>(cct, t);
7c673cae
FG
11095
11096 vector<CollectionRef> cvec(i.colls.size());
11097 unsigned j = 0;
11098 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
11099 ++p, ++j) {
11100 cvec[j] = _get_collection(*p);
7c673cae 11101 }
11fdf7f2 11102
7c673cae
FG
11103 vector<OnodeRef> ovec(i.objects.size());
11104
11105 for (int pos = 0; i.have_op(); ++pos) {
11106 Transaction::Op *op = i.decode_op();
11107 int r = 0;
11108
11109 // no coll or obj
11110 if (op->op == Transaction::OP_NOP)
11111 continue;
11112
11fdf7f2 11113
7c673cae
FG
11114 // collection operations
11115 CollectionRef &c = cvec[op->cid];
11fdf7f2
TL
11116
11117 // initialize osd_pool_id and do a smoke test that all collections belong
11118 // to the same pool
11119 spg_t pgid;
11120 if (!!c ? c->cid.is_pg(&pgid) : false) {
11121 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
11122 txc->osd_pool_id == pgid.pool());
11123 txc->osd_pool_id = pgid.pool();
11124 }
11125
7c673cae
FG
11126 switch (op->op) {
11127 case Transaction::OP_RMCOLL:
11128 {
11129 const coll_t &cid = i.get_cid(op->cid);
11130 r = _remove_collection(txc, cid, &c);
11131 if (!r)
11132 continue;
11133 }
11134 break;
11135
11136 case Transaction::OP_MKCOLL:
11137 {
11fdf7f2 11138 ceph_assert(!c);
7c673cae
FG
11139 const coll_t &cid = i.get_cid(op->cid);
11140 r = _create_collection(txc, cid, op->split_bits, &c);
11141 if (!r)
11142 continue;
11143 }
11144 break;
11145
11146 case Transaction::OP_SPLIT_COLLECTION:
11fdf7f2 11147 ceph_abort_msg("deprecated");
7c673cae
FG
11148 break;
11149
11150 case Transaction::OP_SPLIT_COLLECTION2:
11151 {
11152 uint32_t bits = op->split_bits;
11153 uint32_t rem = op->split_rem;
11154 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
11155 if (!r)
11156 continue;
11157 }
11158 break;
11159
11fdf7f2
TL
11160 case Transaction::OP_MERGE_COLLECTION:
11161 {
11162 uint32_t bits = op->split_bits;
11163 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
11164 if (!r)
11165 continue;
11166 }
11167 break;
11168
7c673cae
FG
11169 case Transaction::OP_COLL_HINT:
11170 {
11171 uint32_t type = op->hint_type;
11172 bufferlist hint;
11173 i.decode_bl(hint);
11fdf7f2 11174 auto hiter = hint.cbegin();
7c673cae
FG
11175 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
11176 uint32_t pg_num;
11177 uint64_t num_objs;
11fdf7f2
TL
11178 decode(pg_num, hiter);
11179 decode(num_objs, hiter);
7c673cae
FG
11180 dout(10) << __func__ << " collection hint objects is a no-op, "
11181 << " pg_num " << pg_num << " num_objects " << num_objs
11182 << dendl;
11183 } else {
11184 // Ignore the hint
11185 dout(10) << __func__ << " unknown collection hint " << type << dendl;
11186 }
11187 continue;
11188 }
11189 break;
11190
11191 case Transaction::OP_COLL_SETATTR:
11192 r = -EOPNOTSUPP;
11193 break;
11194
11195 case Transaction::OP_COLL_RMATTR:
11196 r = -EOPNOTSUPP;
11197 break;
11198
11199 case Transaction::OP_COLL_RENAME:
11fdf7f2 11200 ceph_abort_msg("not implemented");
7c673cae
FG
11201 break;
11202 }
11203 if (r < 0) {
11204 derr << __func__ << " error " << cpp_strerror(r)
11205 << " not handled on operation " << op->op
11206 << " (op " << pos << ", counting from 0)" << dendl;
81eedcae 11207 _dump_transaction<0>(cct, t);
11fdf7f2 11208 ceph_abort_msg("unexpected error");
7c673cae
FG
11209 }
11210
11211 // these operations implicitly create the object
11212 bool create = false;
11213 if (op->op == Transaction::OP_TOUCH ||
11214 op->op == Transaction::OP_WRITE ||
11215 op->op == Transaction::OP_ZERO) {
11216 create = true;
11217 }
11218
11219 // object operations
11220 RWLock::WLocker l(c->lock);
11221 OnodeRef &o = ovec[op->oid];
11222 if (!o) {
11223 ghobject_t oid = i.get_oid(op->oid);
11224 o = c->get_onode(oid, create);
11225 }
11226 if (!create && (!o || !o->exists)) {
11227 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
11228 << i.get_oid(op->oid) << dendl;
11229 r = -ENOENT;
11230 goto endop;
11231 }
11232
11233 switch (op->op) {
11234 case Transaction::OP_TOUCH:
11235 r = _touch(txc, c, o);
11236 break;
11237
11238 case Transaction::OP_WRITE:
11239 {
11240 uint64_t off = op->off;
11241 uint64_t len = op->len;
11242 uint32_t fadvise_flags = i.get_fadvise_flags();
11243 bufferlist bl;
11244 i.decode_bl(bl);
11245 r = _write(txc, c, o, off, len, bl, fadvise_flags);
11246 }
11247 break;
11248
11249 case Transaction::OP_ZERO:
11250 {
11251 uint64_t off = op->off;
11252 uint64_t len = op->len;
11253 r = _zero(txc, c, o, off, len);
11254 }
11255 break;
11256
11257 case Transaction::OP_TRIMCACHE:
11258 {
11259 // deprecated, no-op
11260 }
11261 break;
11262
11263 case Transaction::OP_TRUNCATE:
11264 {
11265 uint64_t off = op->off;
35e4c445 11266 r = _truncate(txc, c, o, off);
7c673cae
FG
11267 }
11268 break;
11269
11270 case Transaction::OP_REMOVE:
11271 {
11272 r = _remove(txc, c, o);
11273 }
11274 break;
11275
11276 case Transaction::OP_SETATTR:
11277 {
11278 string name = i.decode_string();
11279 bufferptr bp;
11280 i.decode_bp(bp);
11281 r = _setattr(txc, c, o, name, bp);
11282 }
11283 break;
11284
11285 case Transaction::OP_SETATTRS:
11286 {
11287 map<string, bufferptr> aset;
11288 i.decode_attrset(aset);
11289 r = _setattrs(txc, c, o, aset);
11290 }
11291 break;
11292
11293 case Transaction::OP_RMATTR:
11294 {
11295 string name = i.decode_string();
11296 r = _rmattr(txc, c, o, name);
11297 }
11298 break;
11299
11300 case Transaction::OP_RMATTRS:
11301 {
11302 r = _rmattrs(txc, c, o);
11303 }
11304 break;
11305
11306 case Transaction::OP_CLONE:
11307 {
11308 OnodeRef& no = ovec[op->dest_oid];
11309 if (!no) {
11310 const ghobject_t& noid = i.get_oid(op->dest_oid);
11311 no = c->get_onode(noid, true);
11312 }
11313 r = _clone(txc, c, o, no);
11314 }
11315 break;
11316
11317 case Transaction::OP_CLONERANGE:
11fdf7f2 11318 ceph_abort_msg("deprecated");
7c673cae
FG
11319 break;
11320
11321 case Transaction::OP_CLONERANGE2:
11322 {
11323 OnodeRef& no = ovec[op->dest_oid];
11324 if (!no) {
11325 const ghobject_t& noid = i.get_oid(op->dest_oid);
11326 no = c->get_onode(noid, true);
11327 }
11328 uint64_t srcoff = op->off;
11329 uint64_t len = op->len;
11330 uint64_t dstoff = op->dest_off;
11331 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
11332 }
11333 break;
11334
11335 case Transaction::OP_COLL_ADD:
11fdf7f2 11336 ceph_abort_msg("not implemented");
7c673cae
FG
11337 break;
11338
11339 case Transaction::OP_COLL_REMOVE:
11fdf7f2 11340 ceph_abort_msg("not implemented");
7c673cae
FG
11341 break;
11342
11343 case Transaction::OP_COLL_MOVE:
11fdf7f2 11344 ceph_abort_msg("deprecated");
7c673cae
FG
11345 break;
11346
11347 case Transaction::OP_COLL_MOVE_RENAME:
11348 case Transaction::OP_TRY_RENAME:
11349 {
11fdf7f2 11350 ceph_assert(op->cid == op->dest_cid);
7c673cae
FG
11351 const ghobject_t& noid = i.get_oid(op->dest_oid);
11352 OnodeRef& no = ovec[op->dest_oid];
11353 if (!no) {
11354 no = c->get_onode(noid, false);
11355 }
11356 r = _rename(txc, c, o, no, noid);
11357 }
11358 break;
11359
11360 case Transaction::OP_OMAP_CLEAR:
11361 {
11362 r = _omap_clear(txc, c, o);
11363 }
11364 break;
11365 case Transaction::OP_OMAP_SETKEYS:
11366 {
11367 bufferlist aset_bl;
11368 i.decode_attrset_bl(&aset_bl);
11369 r = _omap_setkeys(txc, c, o, aset_bl);
11370 }
11371 break;
11372 case Transaction::OP_OMAP_RMKEYS:
11373 {
11374 bufferlist keys_bl;
11375 i.decode_keyset_bl(&keys_bl);
11376 r = _omap_rmkeys(txc, c, o, keys_bl);
11377 }
11378 break;
11379 case Transaction::OP_OMAP_RMKEYRANGE:
11380 {
11381 string first, last;
11382 first = i.decode_string();
11383 last = i.decode_string();
11384 r = _omap_rmkey_range(txc, c, o, first, last);
11385 }
11386 break;
11387 case Transaction::OP_OMAP_SETHEADER:
11388 {
11389 bufferlist bl;
11390 i.decode_bl(bl);
11391 r = _omap_setheader(txc, c, o, bl);
11392 }
11393 break;
11394
11395 case Transaction::OP_SETALLOCHINT:
11396 {
11397 r = _set_alloc_hint(txc, c, o,
11398 op->expected_object_size,
11399 op->expected_write_size,
11400 op->alloc_hint_flags);
11401 }
11402 break;
11403
11404 default:
11fdf7f2 11405 derr << __func__ << " bad op " << op->op << dendl;
7c673cae
FG
11406 ceph_abort();
11407 }
11408
11409 endop:
11410 if (r < 0) {
11411 bool ok = false;
11412
11413 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
11414 op->op == Transaction::OP_CLONE ||
11415 op->op == Transaction::OP_CLONERANGE2 ||
11416 op->op == Transaction::OP_COLL_ADD ||
11417 op->op == Transaction::OP_SETATTR ||
11418 op->op == Transaction::OP_SETATTRS ||
11419 op->op == Transaction::OP_RMATTR ||
11420 op->op == Transaction::OP_OMAP_SETKEYS ||
11421 op->op == Transaction::OP_OMAP_RMKEYS ||
11422 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
11423 op->op == Transaction::OP_OMAP_SETHEADER))
11424 // -ENOENT is usually okay
11425 ok = true;
11426 if (r == -ENODATA)
11427 ok = true;
11428
11429 if (!ok) {
11430 const char *msg = "unexpected error code";
11431
11432 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
11433 op->op == Transaction::OP_CLONE ||
11434 op->op == Transaction::OP_CLONERANGE2))
11435 msg = "ENOENT on clone suggests osd bug";
11436
11437 if (r == -ENOSPC)
11438 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
11439 // by partially applying transactions.
11440 msg = "ENOSPC from bluestore, misconfigured cluster";
11441
11442 if (r == -ENOTEMPTY) {
11443 msg = "ENOTEMPTY suggests garbage data in osd data dir";
11444 }
11445
11446 derr << __func__ << " error " << cpp_strerror(r)
11447 << " not handled on operation " << op->op
11448 << " (op " << pos << ", counting from 0)"
11449 << dendl;
11450 derr << msg << dendl;
81eedcae 11451 _dump_transaction<0>(cct, t);
11fdf7f2 11452 ceph_abort_msg("unexpected error");
7c673cae
FG
11453 }
11454 }
11455 }
11456}
11457
11458
11459
11460// -----------------
11461// write operations
11462
11463int BlueStore::_touch(TransContext *txc,
11464 CollectionRef& c,
11465 OnodeRef &o)
11466{
11467 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11468 int r = 0;
7c673cae
FG
11469 _assign_nid(txc, o);
11470 txc->write_onode(o);
11471 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11472 return r;
11473}
11474
7c673cae
FG
11475void BlueStore::_pad_zeros(
11476 bufferlist *bl, uint64_t *offset,
11477 uint64_t chunk_size)
11478{
11479 auto length = bl->length();
11480 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
11481 << " chunk_size 0x" << chunk_size << std::dec << dendl;
11482 dout(40) << "before:\n";
11483 bl->hexdump(*_dout);
11484 *_dout << dendl;
11485 // front
11486 size_t front_pad = *offset % chunk_size;
11487 size_t back_pad = 0;
11488 size_t pad_count = 0;
11489 if (front_pad) {
11fdf7f2
TL
11490 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
11491 bufferptr z = buffer::create_small_page_aligned(chunk_size);
224ce89b 11492 z.zero(0, front_pad, false);
7c673cae 11493 pad_count += front_pad;
224ce89b 11494 bl->copy(0, front_copy, z.c_str() + front_pad);
7c673cae
FG
11495 if (front_copy + front_pad < chunk_size) {
11496 back_pad = chunk_size - (length + front_pad);
224ce89b 11497 z.zero(front_pad + length, back_pad, false);
7c673cae
FG
11498 pad_count += back_pad;
11499 }
11500 bufferlist old, t;
11501 old.swap(*bl);
11502 t.substr_of(old, front_copy, length - front_copy);
11503 bl->append(z);
11504 bl->claim_append(t);
11505 *offset -= front_pad;
224ce89b 11506 length += pad_count;
7c673cae
FG
11507 }
11508
11509 // back
11510 uint64_t end = *offset + length;
11511 unsigned back_copy = end % chunk_size;
11512 if (back_copy) {
11fdf7f2 11513 ceph_assert(back_pad == 0);
7c673cae 11514 back_pad = chunk_size - back_copy;
11fdf7f2 11515 ceph_assert(back_copy <= length);
7c673cae 11516 bufferptr tail(chunk_size);
224ce89b
WB
11517 bl->copy(length - back_copy, back_copy, tail.c_str());
11518 tail.zero(back_copy, back_pad, false);
7c673cae
FG
11519 bufferlist old;
11520 old.swap(*bl);
11521 bl->substr_of(old, 0, length - back_copy);
11522 bl->append(tail);
11523 length += back_pad;
11524 pad_count += back_pad;
11525 }
11526 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
11527 << back_pad << " on front/back, now 0x" << *offset << "~"
11528 << length << std::dec << dendl;
11529 dout(40) << "after:\n";
11530 bl->hexdump(*_dout);
11531 *_dout << dendl;
11532 if (pad_count)
11533 logger->inc(l_bluestore_write_pad_bytes, pad_count);
11fdf7f2 11534 ceph_assert(bl->length() == length);
7c673cae
FG
11535}
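// [Editor's sketch, not part of the original source] A worked example of the
// padding above, assuming chunk_size = 0x1000:
//   input : *offset = 0x1234, bl->length() = 0x0200
//   front_pad = 0x1234 % 0x1000 = 0x234 and, since the data plus the front
//   pad still fit in one chunk, back_pad = 0x1000 - 0x434 = 0xBCC
//   output: *offset = 0x1000, bl->length() = 0x1000 -- the write now covers
//           exactly one zero-padded chunk.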
11536
11537void BlueStore::_do_write_small(
11538 TransContext *txc,
11539 CollectionRef &c,
11540 OnodeRef o,
11541 uint64_t offset, uint64_t length,
11542 bufferlist::iterator& blp,
11543 WriteContext *wctx)
11544{
11545 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
11546 << std::dec << dendl;
11fdf7f2 11547 ceph_assert(length < min_alloc_size);
7c673cae
FG
11548 uint64_t end_offs = offset + length;
11549
11550 logger->inc(l_bluestore_write_small);
11551 logger->inc(l_bluestore_write_small_bytes, length);
11552
11553 bufferlist bl;
11554 blp.copy(length, bl);
11555
81eedcae
TL
11556 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
11557 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
11558 uint32_t alloc_len = min_alloc_size;
11559 auto offset0 = p2align<uint64_t>(offset, alloc_len);
11560
11561 bool any_change;
11562
11563 // search for a suitable extent in both the forward and reverse directions in
11564 // the [offset - target_max_blob_size, offset + target_max_blob_size] range,
11565 // then check whether the blob can be reused via can_reuse_blob() or apply a
11566 // direct/deferred write (the latter only for extents covering 'offset' or
11567 // lying above it).
11568 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
11569
7c673cae
FG
11570 // Look for an existing mutable blob we can use.
11571 auto begin = o->extent_map.extent_map.begin();
11572 auto end = o->extent_map.extent_map.end();
11573 auto ep = o->extent_map.seek_lextent(offset);
11574 if (ep != begin) {
11575 --ep;
11576 if (ep->blob_end() <= offset) {
11577 ++ep;
11578 }
11579 }
11580 auto prev_ep = ep;
11581 if (prev_ep != begin) {
11582 --prev_ep;
11583 } else {
11584 prev_ep = end; // to avoid this extent check as it's a duplicate
11585 }
11586
7c673cae
FG
11587 do {
11588 any_change = false;
11589
11590 if (ep != end && ep->logical_offset < offset + max_bsize) {
11591 BlobRef b = ep->blob;
11592 auto bstart = ep->blob_start();
11593 dout(20) << __func__ << " considering " << *b
11594 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
11595 if (bstart >= end_offs) {
11596 dout(20) << __func__ << " ignoring distant " << *b << dendl;
11597 } else if (!b->get_blob().is_mutable()) {
11598 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
11599 } else if (ep->logical_offset % min_alloc_size !=
11600 ep->blob_offset % min_alloc_size) {
11601 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
11602 } else {
11603 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
11604 // can we pad our head/tail out with zeros?
11605 uint64_t head_pad, tail_pad;
11fdf7f2
TL
11606 head_pad = p2phase(offset, chunk_size);
11607 tail_pad = p2nphase(end_offs, chunk_size);
7c673cae
FG
11608 if (head_pad || tail_pad) {
11609 o->extent_map.fault_range(db, offset - head_pad,
11610 end_offs - offset + head_pad + tail_pad);
11611 }
11612 if (head_pad &&
11613 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
11614 head_pad = 0;
11615 }
11616 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
11617 tail_pad = 0;
11618 }
11619
11620 uint64_t b_off = offset - head_pad - bstart;
11621 uint64_t b_len = length + head_pad + tail_pad;
11622
11623 // direct write into unused blocks of an existing mutable blob?
11624 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
11625 b->get_blob().get_ondisk_length() >= b_off + b_len &&
11626 b->get_blob().is_unused(b_off, b_len) &&
11627 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 11628 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
11629
11630 dout(20) << __func__ << " write to unused 0x" << std::hex
11631 << b_off << "~" << b_len
11632 << " pad 0x" << head_pad << " + 0x" << tail_pad
11633 << std::dec << " of mutable " << *b << dendl;
224ce89b 11634 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
11635 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
11636
11fdf7f2 11637 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
11638 if (b_len <= prefer_deferred_size) {
11639 dout(20) << __func__ << " deferring small 0x" << std::hex
11640 << b_len << std::dec << " unused write via deferred" << dendl;
11641 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
11642 op->op = bluestore_deferred_op_t::OP_WRITE;
11643 b->get_blob().map(
11644 b_off, b_len,
11645 [&](uint64_t offset, uint64_t length) {
11646 op->extents.emplace_back(bluestore_pextent_t(offset, length));
11647 return 0;
11648 });
224ce89b 11649 op->data = bl;
7c673cae
FG
11650 } else {
11651 b->get_blob().map_bl(
224ce89b 11652 b_off, bl,
7c673cae
FG
11653 [&](uint64_t offset, bufferlist& t) {
11654 bdev->aio_write(offset, t,
11655 &txc->ioc, wctx->buffered);
11656 });
11657 }
11658 }
224ce89b 11659 b->dirty_blob().calc_csum(b_off, bl);
7c673cae
FG
11660 dout(20) << __func__ << " lex old " << *ep << dendl;
11661 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
11662 b,
11663 &wctx->old_extents);
11664 b->dirty_blob().mark_used(le->blob_offset, le->length);
11665 txc->statfs_delta.stored() += le->length;
11666 dout(20) << __func__ << " lex " << *le << dendl;
11667 logger->inc(l_bluestore_write_small_unused);
11668 return;
11669 }
11670 // read some data to fill out the chunk?
11fdf7f2
TL
11671 uint64_t head_read = p2phase(b_off, chunk_size);
11672 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
7c673cae
FG
11673 if ((head_read || tail_read) &&
11674 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
11675 head_read + tail_read < min_alloc_size) {
11676 b_off -= head_read;
11677 b_len += head_read + tail_read;
11678
11679 } else {
11680 head_read = tail_read = 0;
11681 }
11682
11683 // chunk-aligned deferred overwrite?
11684 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
11685 b_off % chunk_size == 0 &&
11686 b_len % chunk_size == 0 &&
11687 b->get_blob().is_allocated(b_off, b_len)) {
11688
224ce89b 11689 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
11690
11691 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
11692 << " and tail 0x" << tail_read << std::dec << dendl;
11693 if (head_read) {
11694 bufferlist head_bl;
11695 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
11696 head_bl, 0);
11fdf7f2 11697 ceph_assert(r >= 0 && r <= (int)head_read);
7c673cae
FG
11698 size_t zlen = head_read - r;
11699 if (zlen) {
11700 head_bl.append_zero(zlen);
11701 logger->inc(l_bluestore_write_pad_bytes, zlen);
11702 }
11fdf7f2
TL
11703 head_bl.claim_append(bl);
11704 bl.swap(head_bl);
7c673cae
FG
11705 logger->inc(l_bluestore_write_penalty_read_ops);
11706 }
11707 if (tail_read) {
11708 bufferlist tail_bl;
11709 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
11710 tail_bl, 0);
11fdf7f2 11711 ceph_assert(r >= 0 && r <= (int)tail_read);
7c673cae
FG
11712 size_t zlen = tail_read - r;
11713 if (zlen) {
11714 tail_bl.append_zero(zlen);
11715 logger->inc(l_bluestore_write_pad_bytes, zlen);
11716 }
224ce89b 11717 bl.claim_append(tail_bl);
7c673cae
FG
11718 logger->inc(l_bluestore_write_penalty_read_ops);
11719 }
11720 logger->inc(l_bluestore_write_small_pre_read);
11721
224ce89b 11722 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
11723 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
11724
7c673cae 11725 if (b->get_blob().csum_type) {
224ce89b 11726 b->dirty_blob().calc_csum(b_off, bl);
7c673cae 11727 }
11fdf7f2
TL
11728
11729 if (!g_conf()->bluestore_debug_omit_block_device_write) {
11730 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
11731 op->op = bluestore_deferred_op_t::OP_WRITE;
11732 int r = b->get_blob().map(
11733 b_off, b_len,
11734 [&](uint64_t offset, uint64_t length) {
11735 op->extents.emplace_back(bluestore_pextent_t(offset, length));
11736 return 0;
11737 });
11738 ceph_assert(r == 0);
11739 op->data.claim(bl);
11740 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
11741 << b_len << std::dec << " of mutable " << *b
11742 << " at " << op->extents << dendl;
11743 }
11744
7c673cae
FG
11745 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
11746 b, &wctx->old_extents);
11747 b->dirty_blob().mark_used(le->blob_offset, le->length);
11748 txc->statfs_delta.stored() += le->length;
11749 dout(20) << __func__ << " lex " << *le << dendl;
11750 logger->inc(l_bluestore_write_small_deferred);
11751 return;
11752 }
224ce89b
WB
11753 // try to reuse blob if we can
11754 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
11755 max_bsize,
11756 offset0 - bstart,
11757 &alloc_len)) {
11fdf7f2 11758 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
11759 // fit into reused blob
11760 // Need to check for pending writes desiring to
11761 // reuse the same pextent. The rationale is that during GC two chunks
11762 // from garbage blobs (compressed?) can share logical space within the same
11763 // AU. That, in turn, might be caused by an unaligned len in clone_range2.
11764 // Hence the second write will fail in an attempt to reuse blob at
11765 // do_alloc_write().
11766 if (!wctx->has_conflict(b,
11767 offset0,
11768 offset0 + alloc_len,
11769 min_alloc_size)) {
11770
11771 // we can't reuse pad_head/pad_tail since they might be truncated
11772 // due to existing extents
11773 uint64_t b_off = offset - bstart;
11774 uint64_t b_off0 = b_off;
11775 _pad_zeros(&bl, &b_off0, chunk_size);
11776
11777 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
11778 << " (0x" << b_off0 << "~" << bl.length() << ")"
11779 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
11780 << std::dec << dendl;
11781
11782 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11783 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
11784 false, false);
11785 logger->inc(l_bluestore_write_small_unused);
11786 return;
11787 }
11788 }
11789 }
11790 ++ep;
11791 any_change = true;
11792 } // if (ep != end && ep->logical_offset < offset + max_bsize)
11793
11794 // check extent for reuse in reverse order
11795 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
11796 BlobRef b = prev_ep->blob;
11797 auto bstart = prev_ep->blob_start();
11798 dout(20) << __func__ << " considering " << *b
11799 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 11800 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
11801 max_bsize,
11802 offset0 - bstart,
11803 &alloc_len)) {
11fdf7f2 11804 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
11805 // fit into reused blob
11806 // Need to check for pending writes desiring to
11807 // reuse the same pextent. The rationale is that during GC two chunks
11808 // from garbage blobs(compressed?) can share logical space within the same
11809 // AU. That's in turn might be caused by unaligned len in clone_range2.
11810 // Hence the second write will fail in an attempt to reuse blob at
11811 // do_alloc_write().
11812 if (!wctx->has_conflict(b,
11813 offset0,
11814 offset0 + alloc_len,
11815 min_alloc_size)) {
11816
11817 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
11818 uint64_t b_off = offset - bstart;
11819 uint64_t b_off0 = b_off;
11820 _pad_zeros(&bl, &b_off0, chunk_size);
11821
11822 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
11823 << " (0x" << b_off0 << "~" << bl.length() << ")"
11824 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
11825 << std::dec << dendl;
11826
11827 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11828 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
11829 false, false);
11830 logger->inc(l_bluestore_write_small_unused);
11831 return;
11832 }
11833 }
11834 if (prev_ep != begin) {
11835 --prev_ep;
11836 any_change = true;
11837 } else {
11838 prev_ep = end; // to avoid useless first extent re-check
11839 }
11840 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
11841 } while (any_change);
11842
11843 // new blob.
7c673cae 11844 BlobRef b = c->new_blob();
11fdf7f2 11845 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
7c673cae
FG
11846 uint64_t b_off0 = b_off;
11847 _pad_zeros(&bl, &b_off0, block_size);
11848 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11849 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
7c673cae
FG
11850
11851 return;
11852}
11853
11854void BlueStore::_do_write_big(
11855 TransContext *txc,
11856 CollectionRef &c,
11857 OnodeRef o,
11858 uint64_t offset, uint64_t length,
11859 bufferlist::iterator& blp,
11860 WriteContext *wctx)
11861{
11862 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
11863 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
11864 << " compress " << (int)wctx->compress
11865 << dendl;
11866 logger->inc(l_bluestore_write_big);
11867 logger->inc(l_bluestore_write_big_bytes, length);
11868 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11fdf7f2 11869 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae
FG
11870 while (length > 0) {
11871 bool new_blob = false;
11fdf7f2 11872 uint32_t l = std::min(max_bsize, length);
7c673cae
FG
11873 BlobRef b;
11874 uint32_t b_off = 0;
11875
11876 // attempting to reuse an existing blob
11877 if (!wctx->compress) {
11878 // look for an existing mutable blob we can reuse
11879 auto begin = o->extent_map.extent_map.begin();
11880 auto end = o->extent_map.extent_map.end();
11881 auto ep = o->extent_map.seek_lextent(offset);
11882 auto prev_ep = ep;
11883 if (prev_ep != begin) {
11884 --prev_ep;
11885 } else {
11886 prev_ep = end; // to avoid this extent check as it's a duplicate
11887 }
11888 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
11889 // search for a suitable extent in both the forward and reverse directions in
11890 // the [offset - target_max_blob_size, offset + target_max_blob_size] range,
224ce89b 11891 // then check whether the blob can be reused via can_reuse_blob().
7c673cae
FG
11892 bool any_change;
11893 do {
11894 any_change = false;
11895 if (ep != end && ep->logical_offset < offset + max_bsize) {
11896 if (offset >= ep->blob_start() &&
224ce89b 11897 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
11898 offset - ep->blob_start(),
11899 &l)) {
11900 b = ep->blob;
11901 b_off = offset - ep->blob_start();
11902 prev_ep = end; // to avoid check below
11903 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 11904 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
11905 } else {
11906 ++ep;
11907 any_change = true;
11908 }
11909 }
11910
11911 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
224ce89b 11912 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
11913 offset - prev_ep->blob_start(),
11914 &l)) {
11915 b = prev_ep->blob;
11916 b_off = offset - prev_ep->blob_start();
11917 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 11918 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
11919 } else if (prev_ep != begin) {
11920 --prev_ep;
11921 any_change = true;
11922 } else {
11923 prev_ep = end; // to avoid useless first extent re-check
11924 }
11925 }
11926 } while (b == nullptr && any_change);
11927 }
11928 if (b == nullptr) {
11929 b = c->new_blob();
11930 b_off = 0;
11931 new_blob = true;
11932 }
11933
11934 bufferlist t;
11935 blp.copy(l, t);
11936 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
11937 offset += l;
11938 length -= l;
11939 logger->inc(l_bluestore_write_big_blobs);
11940 }
11941}
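// [Editor's note, not part of the original source] In the big-write path each
// max_bsize-sized chunk either lands in a nearby reusable mutable blob found
// by the bidirectional search above or gets a fresh blob; no allocation or
// device io happens here -- both are deferred to _do_alloc_write() via
// wctx->write().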
11942
11943int BlueStore::_do_alloc_write(
11944 TransContext *txc,
11945 CollectionRef coll,
11946 OnodeRef o,
11947 WriteContext *wctx)
11948{
11949 dout(20) << __func__ << " txc " << txc
11950 << " " << wctx->writes.size() << " blobs"
11951 << dendl;
3efd9988
FG
11952 if (wctx->writes.empty()) {
11953 return 0;
7c673cae
FG
11954 }
11955
7c673cae
FG
11956 CompressorRef c;
11957 double crr = 0;
11958 if (wctx->compress) {
11959 c = select_option(
11960 "compression_algorithm",
11961 compressor,
11962 [&]() {
11963 string val;
11964 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
11965 CompressorRef cp = compressor;
11966 if (!cp || cp->get_type_name() != val) {
11967 cp = Compressor::create(cct, val);
11fdf7f2
TL
11968 if (!cp) {
11969 if (_set_compression_alert(false, val.c_str())) {
11970 derr << __func__ << " unable to initialize " << val.c_str()
11971 << " compressor" << dendl;
11972 }
11973 }
7c673cae
FG
11974 }
11975 return boost::optional<CompressorRef>(cp);
11976 }
11977 return boost::optional<CompressorRef>();
11978 }
11979 );
11980
11981 crr = select_option(
11982 "compression_required_ratio",
11983 cct->_conf->bluestore_compression_required_ratio,
11984 [&]() {
11985 double val;
3efd9988 11986 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
7c673cae
FG
11987 return boost::optional<double>(val);
11988 }
11989 return boost::optional<double>();
11990 }
11991 );
11992 }
11993
11994 // checksum
11fdf7f2 11995 int64_t csum = csum_type.load();
7c673cae
FG
11996 csum = select_option(
11997 "csum_type",
11998 csum,
11999 [&]() {
11fdf7f2 12000 int64_t val;
3efd9988 12001 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 12002 return boost::optional<int64_t>(val);
7c673cae 12003 }
11fdf7f2 12004 return boost::optional<int64_t>();
7c673cae
FG
12005 }
12006 );
12007
3efd9988
FG
12008 // compress (as needed) and calc needed space
12009 uint64_t need = 0;
11fdf7f2 12010 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 12011 for (auto& wi : wctx->writes) {
3efd9988 12012 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 12013 auto start = mono_clock::now();
7c673cae
FG
12014
12015 // compress
11fdf7f2
TL
12016 ceph_assert(wi.b_off == 0);
12017 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 12018
7c673cae
FG
12019 // FIXME: memory alignment here is bad
12020 bufferlist t;
3efd9988 12021 int r = c->compress(wi.bl, t);
3efd9988 12022 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 12023 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
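// [Editor's sketch, not part of the original source] Worked example of the
// acceptance threshold, assuming min_alloc_size = 0x10000 (64K),
// blob_length = 0x40000 (256K) and a required ratio crr = 0.875:
//   want_len_raw = 0x40000 * 0.875 = 0x38000
//   want_len     = p2roundup(0x38000, 0x10000) = 0x38000
// so a compressed result is kept only if its allocation rounds up to at most
// 0x38000 bytes (and is strictly smaller than the original blob); otherwise
// the write stays uncompressed.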
a8e16298
TL
12024 bool rejected = false;
12025 uint64_t compressed_len = t.length();
12026 // make a fast, approximate estimate of the resulting blob size
12027 // that doesn't take header overhead into account
11fdf7f2 12028 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
12029 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
12030 bluestore_compression_header_t chdr;
12031 chdr.type = c->get_type();
12032 chdr.length = t.length();
12033 encode(chdr, wi.compressed_bl);
12034 wi.compressed_bl.claim_append(t);
12035
12036 compressed_len = wi.compressed_bl.length();
11fdf7f2 12037 result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
12038 if (result_len <= want_len && result_len < wi.blob_length) {
12039 // Cool. We compressed at least as much as we were hoping to.
12040 // pad out to min_alloc_size
12041 wi.compressed_bl.append_zero(result_len - compressed_len);
12042 wi.compressed_len = compressed_len;
12043 wi.compressed = true;
12044 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
12045 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
12046 << " -> 0x" << compressed_len << " => 0x" << result_len
12047 << " with " << c->get_type()
12048 << std::dec << dendl;
12049 txc->statfs_delta.compressed() += compressed_len;
12050 txc->statfs_delta.compressed_original() += wi.blob_length;
12051 txc->statfs_delta.compressed_allocated() += result_len;
12052 logger->inc(l_bluestore_compress_success_count);
12053 need += result_len;
12054 } else {
12055 rejected = true;
12056 }
12057 } else if (r != 0) {
12058 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
12059 << " bytes compressed using " << c->get_type_name()
12060 << std::dec
12061 << " failed with errcode = " << r
12062 << ", leaving uncompressed"
12063 << dendl;
12064 logger->inc(l_bluestore_compress_rejected_count);
12065 need += wi.blob_length;
7c673cae 12066 } else {
a8e16298
TL
12067 rejected = true;
12068 }
12069
12070 if (rejected) {
3efd9988 12071 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 12072 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
3efd9988
FG
12073 << " with " << c->get_type()
12074 << ", which is more than required 0x" << want_len_raw
7c673cae 12075 << " -> 0x" << want_len
3efd9988
FG
12076 << ", leaving uncompressed"
12077 << std::dec << dendl;
12078 logger->inc(l_bluestore_compress_rejected_count);
12079 need += wi.blob_length;
7c673cae 12080 }
11fdf7f2
TL
12081 LOG_LATENCY(logger, cct, l_bluestore_compress_lat,
12082 mono_clock::now() - start);
3efd9988
FG
12083 } else {
12084 need += wi.blob_length;
7c673cae 12085 }
3efd9988 12086 }
a8e16298 12087 PExtentVector prealloc;
3efd9988 12088 prealloc.reserve(2 * wctx->writes.size());
11fdf7f2 12089 int64_t prealloc_left = 0;
3efd9988
FG
12090 prealloc_left = alloc->allocate(
12091 need, min_alloc_size, need,
12092 0, &prealloc);
11fdf7f2
TL
12093 if (prealloc_left < (int64_t)need) {
12094 derr << __func__ << " failed to allocate 0x" << std::hex << need
12095 << " allocated 0x " << prealloc_left
12096 << " min_alloc_size 0x" << min_alloc_size
12097 << " available 0x " << alloc->get_free()
12098 << std::dec << dendl;
12099 if (prealloc.size()) {
12100 alloc->release(prealloc);
12101 }
a8e16298
TL
12102 return -ENOSPC;
12103 }
a8e16298 12104
3efd9988
FG
12105 dout(20) << __func__ << " prealloc " << prealloc << dendl;
12106 auto prealloc_pos = prealloc.begin();
12107
12108 for (auto& wi : wctx->writes) {
12109 BlobRef b = wi.b;
12110 bluestore_blob_t& dblob = b->dirty_blob();
12111 uint64_t b_off = wi.b_off;
12112 bufferlist *l = &wi.bl;
12113 uint64_t final_length = wi.blob_length;
12114 uint64_t csum_length = wi.blob_length;
3efd9988
FG
12115 if (wi.compressed) {
12116 final_length = wi.compressed_bl.length();
12117 csum_length = final_length;
3efd9988
FG
12118 l = &wi.compressed_bl;
12119 dblob.set_compressed(wi.blob_length, wi.compressed_len);
12120 } else if (wi.new_blob) {
7c673cae 12121 // initialize newly created blob only
11fdf7f2
TL
12122 ceph_assert(dblob.is_mutable());
12123 unsigned csum_order;
7c673cae
FG
12124 if (l->length() != wi.blob_length) {
12125 // hrm, maybe we could do better here, but let's not bother.
12126 dout(20) << __func__ << " forcing csum_order to block_size_order "
12127 << block_size_order << dendl;
31f18b77 12128 csum_order = block_size_order;
7c673cae
FG
12129 } else {
12130 csum_order = std::min(wctx->csum_order, ctz(l->length()));
12131 }
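// [Editor's note, not part of the original source] ctz() of the data length
// gives the largest power-of-two chunk that evenly divides it, e.g. a
// 0x3000-byte buffer yields ctz = 12, capping the checksum granularity at
// 4 KiB blocks for this blob.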
12132 // try to align blob with max_blob_size to improve
12133 // its reuse ratio, e.g. in case of reverse write
12134 uint32_t suggested_boff =
12135 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
12136 if ((suggested_boff % (1 << csum_order)) == 0 &&
12137 suggested_boff + final_length <= max_bsize &&
12138 suggested_boff > b_off) {
181888fb 12139 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 12140 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 12141 ceph_assert(suggested_boff >= b_off);
7c673cae
FG
12142 csum_length += suggested_boff - b_off;
12143 b_off = suggested_boff;
12144 }
181888fb
FG
12145 if (csum != Checksummer::CSUM_NONE) {
12146 dout(20) << __func__ << " initialize csum setting for new blob " << *b
12147 << " csum_type " << Checksummer::get_csum_type_string(csum)
12148 << " csum_order " << csum_order
12149 << " csum_length 0x" << std::hex << csum_length << std::dec
12150 << dendl;
12151 dblob.init_csum(csum, csum_order, csum_length);
12152 }
7c673cae
FG
12153 }
12154
a8e16298 12155 PExtentVector extents;
3efd9988
FG
12156 int64_t left = final_length;
12157 while (left > 0) {
11fdf7f2 12158 ceph_assert(prealloc_left > 0);
3efd9988
FG
12159 if (prealloc_pos->length <= left) {
12160 prealloc_left -= prealloc_pos->length;
12161 left -= prealloc_pos->length;
12162 txc->statfs_delta.allocated() += prealloc_pos->length;
12163 extents.push_back(*prealloc_pos);
12164 ++prealloc_pos;
12165 } else {
12166 extents.emplace_back(prealloc_pos->offset, left);
12167 prealloc_pos->offset += left;
12168 prealloc_pos->length -= left;
12169 prealloc_left -= left;
12170 txc->statfs_delta.allocated() += left;
12171 left = 0;
12172 break;
12173 }
12174 }
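// Note (added, illustrative): the while loop above carves exactly
// final_length bytes off the front of the shared prealloc list, splitting
// the current prealloc extent in place when only part of it is needed, so
// the next write item keeps consuming from prealloc_pos.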
7c673cae 12175 for (auto& p : extents) {
3efd9988 12176 txc->allocated.insert(p.offset, p.length);
7c673cae 12177 }
11fdf7f2 12178 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 12179
181888fb
FG
12180 dout(20) << __func__ << " blob " << *b << dendl;
12181 if (dblob.has_csum()) {
7c673cae
FG
12182 dblob.calc_csum(b_off, *l);
12183 }
181888fb 12184
7c673cae
FG
12185 if (wi.mark_unused) {
12186 auto b_end = b_off + wi.bl.length();
12187 if (b_off) {
12188 dblob.add_unused(0, b_off);
12189 }
12190 if (b_end < wi.blob_length) {
12191 dblob.add_unused(b_end, wi.blob_length - b_end);
12192 }
12193 }
12194
12195 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
12196 b_off + (wi.b_off0 - wi.b_off),
12197 wi.length0,
12198 wi.b,
12199 nullptr);
12200 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
12201 txc->statfs_delta.stored() += le->length;
12202 dout(20) << __func__ << " lex " << *le << dendl;
12203 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
12204 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
12205
12206 // queue io
11fdf7f2 12207 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
12208 if (l->length() <= prefer_deferred_size.load()) {
12209 dout(20) << __func__ << " deferring small 0x" << std::hex
12210 << l->length() << std::dec << " write via deferred" << dendl;
12211 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
12212 op->op = bluestore_deferred_op_t::OP_WRITE;
12213 int r = b->get_blob().map(
12214 b_off, l->length(),
12215 [&](uint64_t offset, uint64_t length) {
12216 op->extents.emplace_back(bluestore_pextent_t(offset, length));
12217 return 0;
12218 });
11fdf7f2 12219 ceph_assert(r == 0);
7c673cae 12220 op->data = *l;
81eedcae 12221 logger->inc(l_bluestore_write_small_deferred);
7c673cae
FG
12222 } else {
12223 b->get_blob().map_bl(
12224 b_off, *l,
12225 [&](uint64_t offset, bufferlist& t) {
12226 bdev->aio_write(offset, t, &txc->ioc, false);
12227 });
81eedcae 12228 logger->inc(l_bluestore_write_small_new);
7c673cae
FG
12229 }
12230 }
12231 }
11fdf7f2
TL
12232 ceph_assert(prealloc_pos == prealloc.end());
12233 ceph_assert(prealloc_left == 0);
7c673cae
FG
12234 return 0;
12235}
12236
12237void BlueStore::_wctx_finish(
12238 TransContext *txc,
12239 CollectionRef& c,
12240 OnodeRef o,
31f18b77
FG
12241 WriteContext *wctx,
12242 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
12243{
12244 auto oep = wctx->old_extents.begin();
12245 while (oep != wctx->old_extents.end()) {
12246 auto &lo = *oep;
12247 oep = wctx->old_extents.erase(oep);
12248 dout(20) << __func__ << " lex_old " << lo.e << dendl;
12249 BlobRef b = lo.e.blob;
12250 const bluestore_blob_t& blob = b->get_blob();
12251 if (blob.is_compressed()) {
12252 if (lo.blob_empty) {
12253 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
12254 }
12255 txc->statfs_delta.compressed_original() -= lo.e.length;
12256 }
12257 auto& r = lo.r;
12258 txc->statfs_delta.stored() -= lo.e.length;
12259 if (!r.empty()) {
12260 dout(20) << __func__ << " blob release " << r << dendl;
12261 if (blob.is_shared()) {
12262 PExtentVector final;
12263 c->load_shared_blob(b->shared_blob);
11fdf7f2
TL
12264 bool unshare = false;
12265 bool* unshare_ptr =
12266 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 12267 for (auto e : r) {
31f18b77
FG
12268 b->shared_blob->put_ref(
12269 e.offset, e.length, &final,
11fdf7f2
TL
12270 unshare_ptr);
12271 }
12272 if (unshare) {
12273 ceph_assert(maybe_unshared_blobs);
12274 maybe_unshared_blobs->insert(b->shared_blob.get());
7c673cae
FG
12275 }
12276 dout(20) << __func__ << " shared_blob release " << final
12277 << " from " << *b->shared_blob << dendl;
12278 txc->write_shared_blob(b->shared_blob);
12279 r.clear();
12280 r.swap(final);
12281 }
12282 }
12283 // we can't invalidate our logical extents as we drop them because
12284 // other lextents (either in our onode or others) may still
12285 // reference them. but we can throw out anything that is no
12286 // longer allocated. Note that this will leave behind edge bits
12287 // that are no longer referenced but not deallocated (until they
12288 // age out of the cache naturally).
12289 b->discard_unallocated(c.get());
12290 for (auto e : r) {
12291 dout(20) << __func__ << " release " << e << dendl;
12292 txc->released.insert(e.offset, e.length);
12293 txc->statfs_delta.allocated() -= e.length;
12294 if (blob.is_compressed()) {
12295 txc->statfs_delta.compressed_allocated() -= e.length;
12296 }
12297 }
12298 delete &lo;
12299 if (b->is_spanning() && !b->is_referenced()) {
12300 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
12301 << dendl;
12302 o->extent_map.spanning_blob_map.erase(b->id);
12303 }
12304 }
12305}
12306
12307void BlueStore::_do_write_data(
12308 TransContext *txc,
12309 CollectionRef& c,
12310 OnodeRef o,
12311 uint64_t offset,
12312 uint64_t length,
12313 bufferlist& bl,
12314 WriteContext *wctx)
12315{
12316 uint64_t end = offset + length;
12317 bufferlist::iterator p = bl.begin();
12318
12319 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
12320 (length != min_alloc_size)) {
12321 // we fall within the same block
12322 _do_write_small(txc, c, o, offset, length, p, wctx);
12323 } else {
12324 uint64_t head_offset, head_length;
12325 uint64_t middle_offset, middle_length;
12326 uint64_t tail_offset, tail_length;
12327
12328 head_offset = offset;
11fdf7f2 12329 head_length = p2nphase(offset, min_alloc_size);
7c673cae 12330
11fdf7f2
TL
12331 tail_offset = p2align(end, min_alloc_size);
12332 tail_length = p2phase(end, min_alloc_size);
7c673cae
FG
12333
12334 middle_offset = head_offset + head_length;
12335 middle_length = length - head_length - tail_length;
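// Worked example (added, illustrative): with min_alloc_size = 0x10000, a
// write at offset 0x1f000 of length 0x23000 (end = 0x42000) splits into
// head = 0x1f000~0x1000, middle = 0x20000~0x20000 and tail = 0x40000~0x2000;
// head and tail take the _do_write_small() path, the aligned middle takes
// _do_write_big().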
12336
12337 if (head_length) {
12338 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
12339 }
12340
12341 if (middle_length) {
12342 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
12343 }
12344
12345 if (tail_length) {
12346 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
12347 }
12348 }
12349}
12350
31f18b77
FG
12351void BlueStore::_choose_write_options(
12352 CollectionRef& c,
12353 OnodeRef o,
12354 uint32_t fadvise_flags,
12355 WriteContext *wctx)
7c673cae 12356{
7c673cae
FG
12357 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
12358 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 12359 wctx->buffered = true;
7c673cae
FG
12360 } else if (cct->_conf->bluestore_default_buffered_write &&
12361 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
12362 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
12363 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 12364 wctx->buffered = true;
7c673cae
FG
12365 }
12366
31f18b77
FG
12367 // apply basic csum block size
12368 wctx->csum_order = block_size_order;
7c673cae
FG
12369
12370 // compression parameters
12371 unsigned alloc_hints = o->onode.alloc_hint_flags;
12372 auto cm = select_option(
12373 "compression_mode",
31f18b77 12374 comp_mode.load(),
7c673cae
FG
12375 [&]() {
12376 string val;
11fdf7f2 12377 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
12378 return boost::optional<Compressor::CompressionMode>(
12379 Compressor::get_comp_mode_type(val));
7c673cae
FG
12380 }
12381 return boost::optional<Compressor::CompressionMode>();
12382 }
12383 );
31f18b77
FG
12384
12385 wctx->compress = (cm != Compressor::COMP_NONE) &&
7c673cae
FG
12386 ((cm == Compressor::COMP_FORCE) ||
12387 (cm == Compressor::COMP_AGGRESSIVE &&
12388 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
12389 (cm == Compressor::COMP_PASSIVE &&
12390 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
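// Summary (added): compression is attempted when the mode is FORCE; when it
// is AGGRESSIVE and the object is not hinted INCOMPRESSIBLE; or when it is
// PASSIVE and the object is explicitly hinted COMPRESSIBLE.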
12391
12392 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
12393 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
31f18b77
FG
12394 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
12395 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 12396 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 12397
7c673cae 12398 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 12399
7c673cae 12400 if (o->onode.expected_write_size) {
224ce89b 12401 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 12402 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 12403 } else {
224ce89b 12404 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
12405 }
12406
31f18b77
FG
12407 if (wctx->compress) {
12408 wctx->target_blob_size = select_option(
7c673cae 12409 "compression_max_blob_size",
31f18b77 12410 comp_max_blob_size.load(),
7c673cae 12411 [&]() {
11fdf7f2
TL
12412 int64_t val;
12413 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
7c673cae
FG
12414 return boost::optional<uint64_t>((uint64_t)val);
12415 }
12416 return boost::optional<uint64_t>();
12417 }
12418 );
12419 }
12420 } else {
31f18b77
FG
12421 if (wctx->compress) {
12422 wctx->target_blob_size = select_option(
7c673cae 12423 "compression_min_blob_size",
31f18b77 12424 comp_min_blob_size.load(),
7c673cae 12425 [&]() {
11fdf7f2
TL
12426 int64_t val;
12427 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
7c673cae
FG
12428 return boost::optional<uint64_t>((uint64_t)val);
12429 }
12430 return boost::optional<uint64_t>();
12431 }
12432 );
12433 }
12434 }
31f18b77 12435
7c673cae 12436 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
12437 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
12438 wctx->target_blob_size = max_bsize;
7c673cae 12439 }
31f18b77 12440
7c673cae
FG
12441 // set the min blob size floor at 2x the min_alloc_size, or else we
12442 // won't be able to allocate a smaller extent for the compressed
12443 // data.
31f18b77
FG
12444 if (wctx->compress &&
12445 wctx->target_blob_size < min_alloc_size * 2) {
12446 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 12447 }
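// Illustrative note (added): with min_alloc_size = 0x10000 this floors
// target_blob_size at 0x20000, so a blob that compresses to roughly half its
// size can still be written to fewer allocation units than the uncompressed
// data would require.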
31f18b77
FG
12448
12449 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
12450 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
f64942e4
AA
12451 << " compress=" << (int)wctx->compress
12452 << " buffered=" << (int)wctx->buffered
31f18b77
FG
12453 << std::dec << dendl;
12454}
12455
12456int BlueStore::_do_gc(
12457 TransContext *txc,
12458 CollectionRef& c,
12459 OnodeRef o,
12460 const GarbageCollector& gc,
12461 const WriteContext& wctx,
12462 uint64_t *dirty_start,
12463 uint64_t *dirty_end)
12464{
12465 auto& extents_to_collect = gc.get_extents_to_collect();
12466
1adf2230 12467 bool dirty_range_updated = false;
31f18b77 12468 WriteContext wctx_gc;
7c673cae 12469 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 12470
31f18b77
FG
12471 for (auto it = extents_to_collect.begin();
12472 it != extents_to_collect.end();
12473 ++it) {
12474 bufferlist bl;
12475 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
11fdf7f2 12476 ceph_assert(r == (int)it->length);
31f18b77 12477
31f18b77
FG
12478 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
12479 logger->inc(l_bluestore_gc_merged, it->length);
12480
12481 if (*dirty_start > it->offset) {
12482 *dirty_start = it->offset;
1adf2230 12483 dirty_range_updated = true;
31f18b77
FG
12484 }
12485
12486 if (*dirty_end < it->offset + it->length) {
12487 *dirty_end = it->offset + it->length;
1adf2230 12488 dirty_range_updated = true;
31f18b77
FG
12489 }
12490 }
1adf2230
AA
12491 if (dirty_range_updated) {
12492 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
12493 }
31f18b77
FG
12494
12495 dout(30) << __func__ << " alloc write" << dendl;
12496 int r = _do_alloc_write(txc, c, o, &wctx_gc);
12497 if (r < 0) {
12498 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
12499 << dendl;
12500 return r;
12501 }
12502
12503 _wctx_finish(txc, c, o, &wctx_gc);
12504 return 0;
12505}
12506
12507int BlueStore::_do_write(
12508 TransContext *txc,
12509 CollectionRef& c,
12510 OnodeRef o,
12511 uint64_t offset,
12512 uint64_t length,
12513 bufferlist& bl,
12514 uint32_t fadvise_flags)
12515{
12516 int r = 0;
12517
12518 dout(20) << __func__
12519 << " " << o->oid
12520 << " 0x" << std::hex << offset << "~" << length
12521 << " - have 0x" << o->onode.size
12522 << " (" << std::dec << o->onode.size << ")"
12523 << " bytes"
12524 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
12525 << dendl;
81eedcae 12526 _dump_onode<30>(cct, *o);
31f18b77
FG
12527
12528 if (length == 0) {
12529 return 0;
12530 }
12531
12532 uint64_t end = offset + length;
12533
12534 GarbageCollector gc(c->store->cct);
12535 int64_t benefit;
12536 auto dirty_start = offset;
12537 auto dirty_end = end;
12538
12539 WriteContext wctx;
12540 _choose_write_options(c, o, fadvise_flags, &wctx);
7c673cae
FG
12541 o->extent_map.fault_range(db, offset, length);
12542 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
12543 r = _do_alloc_write(txc, c, o, &wctx);
12544 if (r < 0) {
12545 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
12546 << dendl;
12547 goto out;
12548 }
12549
31f18b77
FG
12550 // NB: _wctx_finish() will empty old_extents
12551 // so we must do gc estimation before that
7c673cae 12552 benefit = gc.estimate(offset,
31f18b77
FG
12553 length,
12554 o->extent_map,
12555 wctx.old_extents,
12556 min_alloc_size);
7c673cae
FG
12557
12558 _wctx_finish(txc, c, o, &wctx);
12559 if (end > o->onode.size) {
12560 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 12561 << std::dec << dendl;
7c673cae
FG
12562 o->onode.size = end;
12563 }
12564
11fdf7f2 12565 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
31f18b77
FG
12566 if (!gc.get_extents_to_collect().empty()) {
12567 dout(20) << __func__ << " perform garbage collection, "
12568 << "expected benefit = " << benefit << " AUs" << dendl;
12569 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
12570 if (r < 0) {
12571 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
12572 << dendl;
12573 goto out;
7c673cae 12574 }
1adf2230
AA
12575 dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
12576 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae
FG
12577 }
12578 }
7c673cae 12579 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
12580 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
12581
7c673cae
FG
12582 r = 0;
12583
12584 out:
12585 return r;
12586}
12587
12588int BlueStore::_write(TransContext *txc,
12589 CollectionRef& c,
12590 OnodeRef& o,
31f18b77
FG
12591 uint64_t offset, size_t length,
12592 bufferlist& bl,
12593 uint32_t fadvise_flags)
7c673cae
FG
12594{
12595 dout(15) << __func__ << " " << c->cid << " " << o->oid
12596 << " 0x" << std::hex << offset << "~" << length << std::dec
12597 << dendl;
35e4c445
FG
12598 int r = 0;
12599 if (offset + length >= OBJECT_MAX_SIZE) {
12600 r = -E2BIG;
12601 } else {
12602 _assign_nid(txc, o);
12603 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
12604 txc->write_onode(o);
12605 }
7c673cae
FG
12606 dout(10) << __func__ << " " << c->cid << " " << o->oid
12607 << " 0x" << std::hex << offset << "~" << length << std::dec
12608 << " = " << r << dendl;
12609 return r;
12610}
12611
12612int BlueStore::_zero(TransContext *txc,
12613 CollectionRef& c,
12614 OnodeRef& o,
12615 uint64_t offset, size_t length)
12616{
12617 dout(15) << __func__ << " " << c->cid << " " << o->oid
12618 << " 0x" << std::hex << offset << "~" << length << std::dec
12619 << dendl;
35e4c445
FG
12620 int r = 0;
12621 if (offset + length >= OBJECT_MAX_SIZE) {
12622 r = -E2BIG;
12623 } else {
12624 _assign_nid(txc, o);
12625 r = _do_zero(txc, c, o, offset, length);
12626 }
7c673cae
FG
12627 dout(10) << __func__ << " " << c->cid << " " << o->oid
12628 << " 0x" << std::hex << offset << "~" << length << std::dec
12629 << " = " << r << dendl;
12630 return r;
12631}
12632
12633int BlueStore::_do_zero(TransContext *txc,
12634 CollectionRef& c,
12635 OnodeRef& o,
12636 uint64_t offset, size_t length)
12637{
12638 dout(15) << __func__ << " " << c->cid << " " << o->oid
12639 << " 0x" << std::hex << offset << "~" << length << std::dec
12640 << dendl;
12641 int r = 0;
12642
81eedcae 12643 _dump_onode<30>(cct, *o);
7c673cae
FG
12644
12645 WriteContext wctx;
12646 o->extent_map.fault_range(db, offset, length);
12647 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 12648 o->extent_map.dirty_range(offset, length);
7c673cae
FG
12649 _wctx_finish(txc, c, o, &wctx);
12650
b32b8144 12651 if (length > 0 && offset + length > o->onode.size) {
7c673cae
FG
12652 o->onode.size = offset + length;
12653 dout(20) << __func__ << " extending size to " << offset + length
12654 << dendl;
12655 }
12656 txc->write_onode(o);
12657
12658 dout(10) << __func__ << " " << c->cid << " " << o->oid
12659 << " 0x" << std::hex << offset << "~" << length << std::dec
12660 << " = " << r << dendl;
12661 return r;
12662}
12663
12664void BlueStore::_do_truncate(
31f18b77
FG
12665 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
12666 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
12667{
12668 dout(15) << __func__ << " " << c->cid << " " << o->oid
12669 << " 0x" << std::hex << offset << std::dec << dendl;
12670
81eedcae 12671 _dump_onode<30>(cct, *o);
7c673cae
FG
12672
12673 if (offset == o->onode.size)
31f18b77 12674 return;
7c673cae
FG
12675
12676 if (offset < o->onode.size) {
12677 WriteContext wctx;
12678 uint64_t length = o->onode.size - offset;
12679 o->extent_map.fault_range(db, offset, length);
12680 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77
FG
12681 o->extent_map.dirty_range(offset, length);
12682 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
12683
12684 // if we have shards past EOF, ask for a reshard
12685 if (!o->onode.extent_map_shards.empty() &&
12686 o->onode.extent_map_shards.back().offset >= offset) {
12687 dout(10) << __func__ << " request reshard past EOF" << dendl;
12688 if (offset) {
12689 o->extent_map.request_reshard(offset - 1, offset + length);
12690 } else {
12691 o->extent_map.request_reshard(0, length);
12692 }
12693 }
12694 }
12695
12696 o->onode.size = offset;
12697
12698 txc->write_onode(o);
12699}
12700
35e4c445 12701int BlueStore::_truncate(TransContext *txc,
7c673cae
FG
12702 CollectionRef& c,
12703 OnodeRef& o,
12704 uint64_t offset)
12705{
12706 dout(15) << __func__ << " " << c->cid << " " << o->oid
12707 << " 0x" << std::hex << offset << std::dec
12708 << dendl;
35e4c445
FG
12709 int r = 0;
12710 if (offset >= OBJECT_MAX_SIZE) {
12711 r = -E2BIG;
12712 } else {
12713 _do_truncate(txc, c, o, offset);
12714 }
12715 dout(10) << __func__ << " " << c->cid << " " << o->oid
12716 << " 0x" << std::hex << offset << std::dec
12717 << " = " << r << dendl;
12718 return r;
7c673cae
FG
12719}
12720
12721int BlueStore::_do_remove(
12722 TransContext *txc,
12723 CollectionRef& c,
12724 OnodeRef o)
12725{
31f18b77 12726 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
12727 bool is_gen = !o->oid.is_no_gen();
12728 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
12729 if (o->onode.has_omap()) {
12730 o->flush();
11fdf7f2
TL
12731 _do_omap_clear(txc,
12732 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
12733 o->onode.nid);
7c673cae
FG
12734 }
12735 o->exists = false;
12736 string key;
12737 for (auto &s : o->extent_map.shards) {
12738 dout(20) << __func__ << " removing shard 0x" << std::hex
12739 << s.shard_info->offset << std::dec << dendl;
12740 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
12741 [&](const string& final_key) {
12742 txc->t->rmkey(PREFIX_OBJ, final_key);
12743 }
12744 );
12745 }
12746 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 12747 txc->note_removed_object(o);
7c673cae
FG
12748 o->extent_map.clear();
12749 o->onode = bluestore_onode_t();
12750 _debug_obj_on_delete(o->oid);
31f18b77 12751
224ce89b
WB
12752 if (!is_gen || maybe_unshared_blobs.empty()) {
12753 return 0;
12754 }
31f18b77 12755
224ce89b
WB
12756 // see if we can unshare blobs still referenced by the head
12757 dout(10) << __func__ << " gen and maybe_unshared_blobs "
12758 << maybe_unshared_blobs << dendl;
12759 ghobject_t nogen = o->oid;
12760 nogen.generation = ghobject_t::NO_GEN;
12761 OnodeRef h = c->onode_map.lookup(nogen);
12762
12763 if (!h || !h->exists) {
12764 return 0;
12765 }
12766
12767 dout(20) << __func__ << " checking for unshareable blobs on " << h
12768 << " " << h->oid << dendl;
12769 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
12770 for (auto& e : h->extent_map.extent_map) {
12771 const bluestore_blob_t& b = e.blob->get_blob();
12772 SharedBlob *sb = e.blob->shared_blob.get();
12773 if (b.is_shared() &&
12774 sb->loaded &&
12775 maybe_unshared_blobs.count(sb)) {
3efd9988
FG
12776 if (b.is_compressed()) {
12777 expect[sb].get(0, b.get_ondisk_length());
12778 } else {
12779 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
12780 expect[sb].get(off, len);
12781 return 0;
12782 });
12783 }
224ce89b
WB
12784 }
12785 }
31f18b77 12786
224ce89b
WB
12787 vector<SharedBlob*> unshared_blobs;
12788 unshared_blobs.reserve(maybe_unshared_blobs.size());
12789 for (auto& p : expect) {
12790 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
12791 if (p.first->persistent->ref_map == p.second) {
12792 SharedBlob *sb = p.first;
12793 dout(20) << __func__ << " unsharing " << *sb << dendl;
12794 unshared_blobs.push_back(sb);
12795 txc->unshare_blob(sb);
12796 uint64_t sbid = c->make_blob_unshared(sb);
12797 string key;
12798 get_shared_blob_key(sbid, &key);
12799 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
12800 }
12801 }
12802
12803 if (unshared_blobs.empty()) {
12804 return 0;
12805 }
12806
224ce89b
WB
12807 for (auto& e : h->extent_map.extent_map) {
12808 const bluestore_blob_t& b = e.blob->get_blob();
12809 SharedBlob *sb = e.blob->shared_blob.get();
12810 if (b.is_shared() &&
12811 std::find(unshared_blobs.begin(), unshared_blobs.end(),
12812 sb) != unshared_blobs.end()) {
12813 dout(20) << __func__ << " unsharing " << e << dendl;
12814 bluestore_blob_t& blob = e.blob->dirty_blob();
12815 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 12816 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
12817 }
12818 }
224ce89b
WB
12819 txc->write_onode(h);
12820
7c673cae
FG
12821 return 0;
12822}
12823
12824int BlueStore::_remove(TransContext *txc,
12825 CollectionRef& c,
12826 OnodeRef &o)
12827{
11fdf7f2
TL
12828 dout(15) << __func__ << " " << c->cid << " " << o->oid
12829 << " onode " << o.get()
12830 << " txc "<< txc << dendl;
7c673cae
FG
12831 int r = _do_remove(txc, c, o);
12832 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
12833 return r;
12834}
12835
12836int BlueStore::_setattr(TransContext *txc,
12837 CollectionRef& c,
12838 OnodeRef& o,
12839 const string& name,
12840 bufferptr& val)
12841{
12842 dout(15) << __func__ << " " << c->cid << " " << o->oid
12843 << " " << name << " (" << val.length() << " bytes)"
12844 << dendl;
12845 int r = 0;
3efd9988
FG
12846 if (val.is_partial()) {
12847 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
12848 val.length());
12849 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
12850 } else {
12851 auto& b = o->onode.attrs[name.c_str()] = val;
12852 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
12853 }
7c673cae
FG
12854 txc->write_onode(o);
12855 dout(10) << __func__ << " " << c->cid << " " << o->oid
12856 << " " << name << " (" << val.length() << " bytes)"
12857 << " = " << r << dendl;
12858 return r;
12859}
12860
12861int BlueStore::_setattrs(TransContext *txc,
12862 CollectionRef& c,
12863 OnodeRef& o,
12864 const map<string,bufferptr>& aset)
12865{
12866 dout(15) << __func__ << " " << c->cid << " " << o->oid
12867 << " " << aset.size() << " keys"
12868 << dendl;
12869 int r = 0;
12870 for (map<string,bufferptr>::const_iterator p = aset.begin();
12871 p != aset.end(); ++p) {
3efd9988
FG
12872 if (p->second.is_partial()) {
12873 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 12874 bufferptr(p->second.c_str(), p->second.length());
3efd9988
FG
12875 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
12876 } else {
12877 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
12878 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
12879 }
7c673cae
FG
12880 }
12881 txc->write_onode(o);
12882 dout(10) << __func__ << " " << c->cid << " " << o->oid
12883 << " " << aset.size() << " keys"
12884 << " = " << r << dendl;
12885 return r;
12886}
12887
12888
12889int BlueStore::_rmattr(TransContext *txc,
12890 CollectionRef& c,
12891 OnodeRef& o,
12892 const string& name)
12893{
12894 dout(15) << __func__ << " " << c->cid << " " << o->oid
12895 << " " << name << dendl;
12896 int r = 0;
12897 auto it = o->onode.attrs.find(name.c_str());
12898 if (it == o->onode.attrs.end())
12899 goto out;
12900
12901 o->onode.attrs.erase(it);
12902 txc->write_onode(o);
12903
12904 out:
12905 dout(10) << __func__ << " " << c->cid << " " << o->oid
12906 << " " << name << " = " << r << dendl;
12907 return r;
12908}
12909
12910int BlueStore::_rmattrs(TransContext *txc,
12911 CollectionRef& c,
12912 OnodeRef& o)
12913{
12914 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
12915 int r = 0;
12916
12917 if (o->onode.attrs.empty())
12918 goto out;
12919
12920 o->onode.attrs.clear();
12921 txc->write_onode(o);
12922
12923 out:
12924 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
12925 return r;
12926}
12927
11fdf7f2
TL
12928void BlueStore::_do_omap_clear(TransContext *txc, const string& omap_prefix,
12929 uint64_t id)
7c673cae 12930{
7c673cae
FG
12931 string prefix, tail;
12932 get_omap_header(id, &prefix);
12933 get_omap_tail(id, &tail);
11fdf7f2
TL
12934 txc->t->rm_range_keys(omap_prefix, prefix, tail);
12935 dout(20) << __func__ << " remove range start: "
12936 << pretty_binary_string(prefix) << " end: "
12937 << pretty_binary_string(tail) << dendl;
7c673cae
FG
12938}
12939
12940int BlueStore::_omap_clear(TransContext *txc,
12941 CollectionRef& c,
12942 OnodeRef& o)
12943{
12944 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
12945 int r = 0;
12946 if (o->onode.has_omap()) {
12947 o->flush();
11fdf7f2
TL
12948 _do_omap_clear(txc,
12949 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
12950 o->onode.nid);
7c673cae
FG
12951 o->onode.clear_omap_flag();
12952 txc->write_onode(o);
12953 }
12954 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
12955 return r;
12956}
12957
12958int BlueStore::_omap_setkeys(TransContext *txc,
12959 CollectionRef& c,
12960 OnodeRef& o,
12961 bufferlist &bl)
12962{
12963 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
12964 int r;
11fdf7f2 12965 auto p = bl.cbegin();
7c673cae
FG
12966 __u32 num;
12967 if (!o->onode.has_omap()) {
12968 o->onode.set_omap_flag();
11fdf7f2
TL
12969 if (o->oid.is_pgmeta()) {
12970 o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
12971 }
7c673cae
FG
12972 txc->write_onode(o);
12973 } else {
12974 txc->note_modified_object(o);
12975 }
11fdf7f2
TL
12976 const string& prefix =
12977 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
7c673cae
FG
12978 string final_key;
12979 _key_encode_u64(o->onode.nid, &final_key);
12980 final_key.push_back('.');
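// Note (added): the per-object omap key prefix is the 8-byte encoded nid plus
// a '.' separator, 9 bytes in total; final_key.resize(9) below trims back to
// this prefix before appending each user-supplied key.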
11fdf7f2 12981 decode(num, p);
7c673cae
FG
12982 while (num--) {
12983 string key;
12984 bufferlist value;
11fdf7f2
TL
12985 decode(key, p);
12986 decode(value, p);
7c673cae
FG
12987 final_key.resize(9); // keep prefix
12988 final_key += key;
11fdf7f2 12989 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 12990 << " <- " << key << dendl;
11fdf7f2 12991 txc->t->set(prefix, final_key, value);
7c673cae
FG
12992 }
12993 r = 0;
12994 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
12995 return r;
12996}
12997
12998int BlueStore::_omap_setheader(TransContext *txc,
12999 CollectionRef& c,
13000 OnodeRef &o,
13001 bufferlist& bl)
13002{
13003 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13004 int r;
13005 string key;
13006 if (!o->onode.has_omap()) {
13007 o->onode.set_omap_flag();
11fdf7f2
TL
13008 if (o->oid.is_pgmeta()) {
13009 o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
13010 }
7c673cae
FG
13011 txc->write_onode(o);
13012 } else {
13013 txc->note_modified_object(o);
13014 }
11fdf7f2
TL
13015 const string& prefix =
13016 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
7c673cae 13017 get_omap_header(o->onode.nid, &key);
11fdf7f2 13018 txc->t->set(prefix, key, bl);
7c673cae
FG
13019 r = 0;
13020 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13021 return r;
13022}
13023
13024int BlueStore::_omap_rmkeys(TransContext *txc,
13025 CollectionRef& c,
13026 OnodeRef& o,
13027 bufferlist& bl)
13028{
13029 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13030 int r = 0;
11fdf7f2 13031 auto p = bl.cbegin();
7c673cae
FG
13032 __u32 num;
13033 string final_key;
13034
13035 if (!o->onode.has_omap()) {
13036 goto out;
13037 }
11fdf7f2
TL
13038 {
13039 const string& prefix =
13040 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13041 _key_encode_u64(o->onode.nid, &final_key);
13042 final_key.push_back('.');
13043 decode(num, p);
13044 while (num--) {
13045 string key;
13046 decode(key, p);
13047 final_key.resize(9); // keep prefix
13048 final_key += key;
13049 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
13050 << " <- " << key << dendl;
13051 txc->t->rmkey(prefix, final_key);
13052 }
7c673cae
FG
13053 }
13054 txc->note_modified_object(o);
13055
13056 out:
13057 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13058 return r;
13059}
13060
13061int BlueStore::_omap_rmkey_range(TransContext *txc,
13062 CollectionRef& c,
13063 OnodeRef& o,
13064 const string& first, const string& last)
13065{
13066 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
7c673cae
FG
13067 string key_first, key_last;
13068 int r = 0;
13069 if (!o->onode.has_omap()) {
13070 goto out;
13071 }
11fdf7f2
TL
13072 {
13073 const string& prefix =
13074 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13075 o->flush();
13076 get_omap_key(o->onode.nid, first, &key_first);
13077 get_omap_key(o->onode.nid, last, &key_last);
13078 txc->t->rm_range_keys(prefix, key_first, key_last);
13079 dout(20) << __func__ << " remove range start: "
13080 << pretty_binary_string(key_first) << " end: "
13081 << pretty_binary_string(key_last) << dendl;
7c673cae
FG
13082 }
13083 txc->note_modified_object(o);
13084
13085 out:
13086 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13087 return r;
13088}
13089
13090int BlueStore::_set_alloc_hint(
13091 TransContext *txc,
13092 CollectionRef& c,
13093 OnodeRef& o,
13094 uint64_t expected_object_size,
13095 uint64_t expected_write_size,
13096 uint32_t flags)
13097{
13098 dout(15) << __func__ << " " << c->cid << " " << o->oid
13099 << " object_size " << expected_object_size
13100 << " write_size " << expected_write_size
13101 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
13102 << dendl;
13103 int r = 0;
13104 o->onode.expected_object_size = expected_object_size;
13105 o->onode.expected_write_size = expected_write_size;
13106 o->onode.alloc_hint_flags = flags;
13107 txc->write_onode(o);
13108 dout(10) << __func__ << " " << c->cid << " " << o->oid
13109 << " object_size " << expected_object_size
13110 << " write_size " << expected_write_size
13111 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
13112 << " = " << r << dendl;
13113 return r;
13114}
13115
13116int BlueStore::_clone(TransContext *txc,
13117 CollectionRef& c,
13118 OnodeRef& oldo,
13119 OnodeRef& newo)
13120{
13121 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13122 << newo->oid << dendl;
13123 int r = 0;
13124 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
13125 derr << __func__ << " mismatched hash on " << oldo->oid
13126 << " and " << newo->oid << dendl;
13127 return -EINVAL;
13128 }
13129
7c673cae
FG
13130 _assign_nid(txc, newo);
13131
13132 // clone data
13133 oldo->flush();
13134 _do_truncate(txc, c, newo, 0);
13135 if (cct->_conf->bluestore_clone_cow) {
13136 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
13137 } else {
13138 bufferlist bl;
13139 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
13140 if (r < 0)
13141 goto out;
13142 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
13143 if (r < 0)
13144 goto out;
13145 }
13146
13147 // clone attrs
13148 newo->onode.attrs = oldo->onode.attrs;
13149
13150 // clone omap
13151 if (newo->onode.has_omap()) {
13152 dout(20) << __func__ << " clearing old omap data" << dendl;
13153 newo->flush();
11fdf7f2
TL
13154 _do_omap_clear(txc,
13155 newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP
13156 : PREFIX_OMAP,
13157 newo->onode.nid);
7c673cae
FG
13158 }
13159 if (oldo->onode.has_omap()) {
13160 dout(20) << __func__ << " copying omap data" << dendl;
13161 if (!newo->onode.has_omap()) {
13162 newo->onode.set_omap_flag();
11fdf7f2
TL
13163 if (newo->oid.is_pgmeta()) {
13164 newo->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
13165 }
7c673cae 13166 }
11fdf7f2
TL
13167 const string& prefix =
13168 newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13169 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae
FG
13170 string head, tail;
13171 get_omap_header(oldo->onode.nid, &head);
13172 get_omap_tail(oldo->onode.nid, &tail);
13173 it->lower_bound(head);
13174 while (it->valid()) {
13175 if (it->key() >= tail) {
13176 dout(30) << __func__ << " reached tail" << dendl;
13177 break;
13178 } else {
13179 dout(30) << __func__ << " got header/data "
13180 << pretty_binary_string(it->key()) << dendl;
13181 string key;
13182 rewrite_omap_key(newo->onode.nid, it->key(), &key);
11fdf7f2 13183 txc->t->set(prefix, key, it->value());
7c673cae
FG
13184 }
13185 it->next();
13186 }
13187 } else {
13188 newo->onode.clear_omap_flag();
13189 }
13190
13191 txc->write_onode(newo);
13192 r = 0;
13193
13194 out:
13195 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13196 << newo->oid << " = " << r << dendl;
13197 return r;
13198}
13199
13200int BlueStore::_do_clone_range(
13201 TransContext *txc,
13202 CollectionRef& c,
13203 OnodeRef& oldo,
13204 OnodeRef& newo,
224ce89b
WB
13205 uint64_t srcoff,
13206 uint64_t length,
13207 uint64_t dstoff)
7c673cae
FG
13208{
13209 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13210 << newo->oid
13211 << " 0x" << std::hex << srcoff << "~" << length << " -> "
13212 << " 0x" << dstoff << "~" << length << std::dec << dendl;
13213 oldo->extent_map.fault_range(db, srcoff, length);
13214 newo->extent_map.fault_range(db, dstoff, length);
81eedcae
TL
13215 _dump_onode<30>(cct, *oldo);
13216 _dump_onode<30>(cct, *newo);
7c673cae 13217
11fdf7f2 13218 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
81eedcae
TL
13219 _dump_onode<30>(cct, *oldo);
13220 _dump_onode<30>(cct, *newo);
7c673cae
FG
13221 return 0;
13222}
13223
13224int BlueStore::_clone_range(TransContext *txc,
13225 CollectionRef& c,
13226 OnodeRef& oldo,
13227 OnodeRef& newo,
13228 uint64_t srcoff, uint64_t length, uint64_t dstoff)
13229{
13230 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13231 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
13232 << " to offset 0x" << dstoff << std::dec << dendl;
13233 int r = 0;
13234
35e4c445
FG
13235 if (srcoff + length >= OBJECT_MAX_SIZE ||
13236 dstoff + length >= OBJECT_MAX_SIZE) {
13237 r = -E2BIG;
13238 goto out;
13239 }
7c673cae
FG
13240 if (srcoff + length > oldo->onode.size) {
13241 r = -EINVAL;
13242 goto out;
13243 }
13244
7c673cae
FG
13245 _assign_nid(txc, newo);
13246
13247 if (length > 0) {
13248 if (cct->_conf->bluestore_clone_cow) {
13249 _do_zero(txc, c, newo, dstoff, length);
13250 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
13251 } else {
13252 bufferlist bl;
13253 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
13254 if (r < 0)
13255 goto out;
13256 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
13257 if (r < 0)
13258 goto out;
13259 }
13260 }
13261
13262 txc->write_onode(newo);
13263 r = 0;
13264
13265 out:
13266 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13267 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
13268 << " to offset 0x" << dstoff << std::dec
13269 << " = " << r << dendl;
13270 return r;
13271}
13272
13273int BlueStore::_rename(TransContext *txc,
13274 CollectionRef& c,
13275 OnodeRef& oldo,
13276 OnodeRef& newo,
13277 const ghobject_t& new_oid)
13278{
13279 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13280 << new_oid << dendl;
13281 int r;
13282 ghobject_t old_oid = oldo->oid;
31f18b77 13283 mempool::bluestore_cache_other::string new_okey;
7c673cae
FG
13284
13285 if (newo) {
13286 if (newo->exists) {
13287 r = -EEXIST;
13288 goto out;
13289 }
11fdf7f2 13290 ceph_assert(txc->onodes.count(newo) == 0);
7c673cae
FG
13291 }
13292
13293 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
13294
13295 // rewrite shards
13296 {
13297 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
13298 get_object_key(cct, new_oid, &new_okey);
13299 string key;
13300 for (auto &s : oldo->extent_map.shards) {
13301 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
13302 [&](const string& final_key) {
13303 txc->t->rmkey(PREFIX_OBJ, final_key);
13304 }
13305 );
13306 s.dirty = true;
13307 }
13308 }
13309
13310 newo = oldo;
13311 txc->write_onode(newo);
13312
13313 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
13314 // Onode in the old slot
13315 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
13316 r = 0;
13317
f64942e4
AA
13318 // hold a ref to new Onode in old name position, to ensure we don't drop
13319 // it from the cache before this txc commits (or else someone may come along
13320 // and read newo's metadata via the old name).
13321 txc->note_modified_object(oldo);
13322
7c673cae
FG
13323 out:
13324 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
13325 << new_oid << " = " << r << dendl;
13326 return r;
13327}
13328
13329// collections
13330
13331int BlueStore::_create_collection(
13332 TransContext *txc,
13333 const coll_t &cid,
13334 unsigned bits,
13335 CollectionRef *c)
13336{
13337 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
13338 int r;
13339 bufferlist bl;
13340
13341 {
13342 RWLock::WLocker l(coll_lock);
13343 if (*c) {
13344 r = -EEXIST;
13345 goto out;
13346 }
11fdf7f2
TL
13347 auto p = new_coll_map.find(cid);
13348 ceph_assert(p != new_coll_map.end());
13349 *c = p->second;
7c673cae
FG
13350 (*c)->cnode.bits = bits;
13351 coll_map[cid] = *c;
11fdf7f2 13352 new_coll_map.erase(p);
7c673cae 13353 }
11fdf7f2 13354 encode((*c)->cnode, bl);
7c673cae
FG
13355 txc->t->set(PREFIX_COLL, stringify(cid), bl);
13356 r = 0;
13357
13358 out:
13359 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
13360 return r;
13361}
13362
13363int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
13364 CollectionRef *c)
13365{
13366 dout(15) << __func__ << " " << cid << dendl;
13367 int r;
13368
11fdf7f2 13369 (*c)->flush_all_but_last();
7c673cae
FG
13370 {
13371 RWLock::WLocker l(coll_lock);
13372 if (!*c) {
13373 r = -ENOENT;
13374 goto out;
13375 }
13376 size_t nonexistent_count = 0;
11fdf7f2 13377 ceph_assert((*c)->exists);
7c673cae
FG
13378 if ((*c)->onode_map.map_any([&](OnodeRef o) {
13379 if (o->exists) {
13380 dout(10) << __func__ << " " << o->oid << " " << o
13381 << " exists in onode_map" << dendl;
13382 return true;
13383 }
13384 ++nonexistent_count;
13385 return false;
13386 })) {
13387 r = -ENOTEMPTY;
13388 goto out;
13389 }
13390
13391 vector<ghobject_t> ls;
13392 ghobject_t next;
13393 // Enumerate onodes in db, up to nonexistent_count + 1
13394 // then check if all of them are marked as non-existent.
11fdf7f2 13395 // Bypass the check if (next != ghobject_t::get_max())
7c673cae
FG
13396 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
13397 nonexistent_count + 1, &ls, &next);
13398 if (r >= 0) {
11fdf7f2
TL
13399 // If true, the collection has more objects than nonexistent_count,
13400 // so bypass the check.
13401 bool exists = (!next.is_max());
7c673cae
FG
13402 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
13403 dout(10) << __func__ << " oid " << *it << dendl;
13404 auto onode = (*c)->onode_map.lookup(*it);
13405 exists = !onode || onode->exists;
13406 if (exists) {
13407 dout(10) << __func__ << " " << *it
11fdf7f2
TL
13408 << " exists in db, "
13409 << (!onode ? "not present in ram" : "present in ram")
13410 << dendl;
7c673cae
FG
13411 }
13412 }
13413 if (!exists) {
11fdf7f2 13414 _do_remove_collection(txc, c);
7c673cae
FG
13415 r = 0;
13416 } else {
13417 dout(10) << __func__ << " " << cid
13418 << " is non-empty" << dendl;
13419 r = -ENOTEMPTY;
13420 }
13421 }
13422 }
13423
13424 out:
13425 dout(10) << __func__ << " " << cid << " = " << r << dendl;
13426 return r;
13427}
13428
11fdf7f2
TL
13429void BlueStore::_do_remove_collection(TransContext *txc,
13430 CollectionRef *c)
13431{
13432 coll_map.erase((*c)->cid);
13433 txc->removed_collections.push_back(*c);
13434 (*c)->exists = false;
13435 _osr_register_zombie((*c)->osr.get());
13436 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
13437 c->reset();
13438}
13439
7c673cae
FG
13440int BlueStore::_split_collection(TransContext *txc,
13441 CollectionRef& c,
13442 CollectionRef& d,
13443 unsigned bits, int rem)
13444{
13445 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
13446 << " bits " << bits << dendl;
13447 RWLock::WLocker l(c->lock);
13448 RWLock::WLocker l2(d->lock);
13449 int r;
13450
13451 // flush all previous deferred writes on this sequencer. this is a bit
13452 // heavyweight, but we need to make sure all deferred writes complete
13453 // before we split as the new collection's sequencer may need to order
13454 // this after those writes, and we don't bother with the complexity of
13455 // moving those TransContexts over to the new osr.
13456 _osr_drain_preceding(txc);
13457
13458 // move any cached items (onodes and referenced shared blobs) that will
13459 // belong to the child collection post-split. leave everything else behind.
13460 // this may include things that don't strictly belong to the now-smaller
13461 // parent split, but the OSD will always send us a split for every new
13462 // child.
13463
13464 spg_t pgid, dest_pgid;
13465 bool is_pg = c->cid.is_pg(&pgid);
11fdf7f2 13466 ceph_assert(is_pg);
7c673cae 13467 is_pg = d->cid.is_pg(&dest_pgid);
11fdf7f2 13468 ceph_assert(is_pg);
7c673cae
FG
13469
13470 // the destination should initially be empty.
11fdf7f2
TL
13471 ceph_assert(d->onode_map.empty());
13472 ceph_assert(d->shared_blob_set.empty());
13473 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
13474
13475 c->split_cache(d.get());
13476
13477 // adjust bits. note that this will be redundant for all but the first
13478 // split call for this parent (first child).
13479 c->cnode.bits = bits;
11fdf7f2 13480 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
13481 r = 0;
13482
13483 bufferlist bl;
11fdf7f2 13484 encode(c->cnode, bl);
7c673cae
FG
13485 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
13486
13487 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
13488 << " bits " << bits << " = " << r << dendl;
13489 return r;
13490}
13491
11fdf7f2
TL
13492int BlueStore::_merge_collection(
13493 TransContext *txc,
13494 CollectionRef *c,
13495 CollectionRef& d,
13496 unsigned bits)
13497{
13498 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
13499 << " bits " << bits << dendl;
13500 RWLock::WLocker l((*c)->lock);
13501 RWLock::WLocker l2(d->lock);
13502 int r;
13503
13504 coll_t cid = (*c)->cid;
13505
13506 // flush all previous deferred writes on the source collection to ensure
13507 // that all deferred writes complete before we merge as the target collection's
13508 // sequencer may need to order new ops after those writes.
13509
13510 _osr_drain((*c)->osr.get());
13511
13512 // move cached items (onodes and referenced shared blobs) from the source
13513 // collection into the merge target. this reuses split_cache(), with the
13514 // target's bits deciding which cached items map into it; the source
13515 // collection itself is removed below once its cached items have been
13516 // moved over.
13517
13518 spg_t pgid, dest_pgid;
13519 bool is_pg = cid.is_pg(&pgid);
13520 ceph_assert(is_pg);
13521 is_pg = d->cid.is_pg(&dest_pgid);
13522 ceph_assert(is_pg);
13523
13524 // adjust bits. note that this will be redundant for all but the first
13525 // merge call for the parent/target.
13526 d->cnode.bits = bits;
13527
13528 // behavior depends on target (d) bits, so do this after they are updated.
13529 (*c)->split_cache(d.get());
13530
13531 // remove source collection
13532 {
13533 RWLock::WLocker l3(coll_lock);
13534 _do_remove_collection(txc, c);
13535 }
13536
13537 r = 0;
13538
13539 bufferlist bl;
13540 encode(d->cnode, bl);
13541 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
13542
13543 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
13544 << " bits " << bits << " = " << r << dendl;
13545 return r;
13546}
13547
13548void BlueStore::log_latency_fn(
13549 int idx,
13550 const ceph::timespan& l,
13551 std::function<string (const ceph::timespan& lat)> fn)
13552{
13553 LOG_LATENCY_FN(logger, cct, idx, l, fn);
13554}
13555
13556
7c673cae
FG
13557// DB key value Histogram
13558#define KEY_SLAB 32
13559#define VALUE_SLAB 64
13560
13561const string prefix_onode = "o";
13562const string prefix_onode_shard = "x";
13563const string prefix_other = "Z";
13564
13565int BlueStore::DBHistogram::get_key_slab(size_t sz)
13566{
13567 return (sz/KEY_SLAB);
13568}
13569
13570string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
13571{
13572 int lower_bound = slab * KEY_SLAB;
13573 int upper_bound = (slab + 1) * KEY_SLAB;
13574 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
13575 return ret;
13576}
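// Example (added, illustrative): with KEY_SLAB = 32, a 70-byte key falls into
// slab 2, which get_key_slab_to_range() reports as "[64,96)".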
13577
13578int BlueStore::DBHistogram::get_value_slab(size_t sz)
13579{
13580 return (sz/VALUE_SLAB);
13581}
13582
13583string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
13584{
13585 int lower_bound = slab * VALUE_SLAB;
13586 int upper_bound = (slab + 1) * VALUE_SLAB;
13587 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
13588 return ret;
13589}
13590
13591void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
13592 const string &prefix, size_t key_size, size_t value_size)
13593{
13594 uint32_t key_slab = get_key_slab(key_size);
13595 uint32_t value_slab = get_value_slab(value_size);
13596 key_hist[prefix][key_slab].count++;
11fdf7f2
TL
13597 key_hist[prefix][key_slab].max_len =
13598 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
7c673cae
FG
13599 key_hist[prefix][key_slab].val_map[value_slab].count++;
13600 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11fdf7f2
TL
13601 std::max<size_t>(value_size,
13602 key_hist[prefix][key_slab].val_map[value_slab].max_len);
7c673cae
FG
13603}
13604
13605void BlueStore::DBHistogram::dump(Formatter *f)
13606{
13607 f->open_object_section("rocksdb_value_distribution");
13608 for (auto i : value_hist) {
13609 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
13610 }
13611 f->close_section();
13612
13613 f->open_object_section("rocksdb_key_value_histogram");
13614 for (auto i : key_hist) {
13615 f->dump_string("prefix", i.first);
13616 f->open_object_section("key_hist");
13617 for ( auto k : i.second) {
13618 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
13619 f->dump_unsigned("max_len", k.second.max_len);
13620 f->open_object_section("value_hist");
13621 for ( auto j : k.second.val_map) {
13622 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
13623 f->dump_unsigned("max_len", j.second.max_len);
13624 }
13625 f->close_section();
13626 }
13627 f->close_section();
13628 }
13629 f->close_section();
13630}
13631
13632 // Iterates through the db and collects the stats
13633void BlueStore::generate_db_histogram(Formatter *f)
13634{
13635 //globals
13636 uint64_t num_onodes = 0;
13637 uint64_t num_shards = 0;
13638 uint64_t num_super = 0;
13639 uint64_t num_coll = 0;
13640 uint64_t num_omap = 0;
11fdf7f2 13641 uint64_t num_pgmeta_omap = 0;
7c673cae
FG
13642 uint64_t num_deferred = 0;
13643 uint64_t num_alloc = 0;
13644 uint64_t num_stat = 0;
13645 uint64_t num_others = 0;
13646 uint64_t num_shared_shards = 0;
13647 size_t max_key_size = 0, max_value_size = 0;
13648 uint64_t total_key_size = 0, total_value_size = 0;
13649 size_t key_size = 0, value_size = 0;
13650 DBHistogram hist;
13651
11fdf7f2 13652 auto start = coarse_mono_clock::now();
7c673cae 13653
11fdf7f2 13654 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
7c673cae
FG
13655 iter->seek_to_first();
13656 while (iter->valid()) {
13657 dout(30) << __func__ << " Key: " << iter->key() << dendl;
13658 key_size = iter->key_size();
13659 value_size = iter->value_size();
13660 hist.value_hist[hist.get_value_slab(value_size)]++;
11fdf7f2
TL
13661 max_key_size = std::max(max_key_size, key_size);
13662 max_value_size = std::max(max_value_size, value_size);
7c673cae
FG
13663 total_key_size += key_size;
13664 total_value_size += value_size;
13665
13666 pair<string,string> key(iter->raw_key());
13667
13668 if (key.first == PREFIX_SUPER) {
13669 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
13670 num_super++;
13671 } else if (key.first == PREFIX_STAT) {
13672 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
13673 num_stat++;
13674 } else if (key.first == PREFIX_COLL) {
13675 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
13676 num_coll++;
13677 } else if (key.first == PREFIX_OBJ) {
13678 if (key.second.back() == ONODE_KEY_SUFFIX) {
13679 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
13680 num_onodes++;
13681 } else {
13682 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
13683 num_shards++;
13684 }
13685 } else if (key.first == PREFIX_OMAP) {
13686 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
13687 num_omap++;
11fdf7f2
TL
13688 } else if (key.first == PREFIX_PGMETA_OMAP) {
13689 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
13690 num_pgmeta_omap++;
7c673cae
FG
13691 } else if (key.first == PREFIX_DEFERRED) {
13692 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
13693 num_deferred++;
11fdf7f2 13694 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
7c673cae
FG
13695 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
13696 num_alloc++;
13697 } else if (key.first == PREFIX_SHARED_BLOB) {
13698 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
13699 num_shared_shards++;
13700 } else {
13701 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
13702 num_others++;
13703 }
13704 iter->next();
13705 }
13706
11fdf7f2 13707 ceph::timespan duration = coarse_mono_clock::now() - start;
7c673cae
FG
13708 f->open_object_section("rocksdb_key_value_stats");
13709 f->dump_unsigned("num_onodes", num_onodes);
13710 f->dump_unsigned("num_shards", num_shards);
13711 f->dump_unsigned("num_super", num_super);
13712 f->dump_unsigned("num_coll", num_coll);
13713 f->dump_unsigned("num_omap", num_omap);
11fdf7f2 13714 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
7c673cae
FG
13715 f->dump_unsigned("num_deferred", num_deferred);
13716 f->dump_unsigned("num_alloc", num_alloc);
13717 f->dump_unsigned("num_stat", num_stat);
13718 f->dump_unsigned("num_shared_shards", num_shared_shards);
13719 f->dump_unsigned("num_others", num_others);
13720 f->dump_unsigned("max_key_size", max_key_size);
13721 f->dump_unsigned("max_value_size", max_value_size);
13722 f->dump_unsigned("total_key_size", total_key_size);
13723 f->dump_unsigned("total_value_size", total_value_size);
13724 f->close_section();
13725
13726 hist.dump(f);
13727
13728 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
13729
13730}
13731
31f18b77 13732void BlueStore::_flush_cache()
7c673cae
FG
13733{
13734 dout(10) << __func__ << dendl;
13735 for (auto i : cache_shards) {
13736 i->trim_all();
11fdf7f2 13737 ceph_assert(i->empty());
7c673cae
FG
13738 }
13739 for (auto& p : coll_map) {
3efd9988 13740 if (!p.second->onode_map.empty()) {
11fdf7f2
TL
13741 derr << __func__ << " stray onodes on " << p.first << dendl;
13742 p.second->onode_map.dump<0>(cct);
3efd9988
FG
13743 }
13744 if (!p.second->shared_blob_set.empty()) {
13745 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11fdf7f2 13746 p.second->shared_blob_set.dump<0>(cct);
3efd9988 13747 }
11fdf7f2
TL
13748 ceph_assert(p.second->onode_map.empty());
13749 ceph_assert(p.second->shared_blob_set.empty());
7c673cae
FG
13750 }
13751 coll_map.clear();
13752}
13753
31f18b77
FG
13754// For external caller.
13755// We use a best-effort policy instead, e.g.,
13756// we don't care if there are still some pinned onodes/data in the cache
13757// after this command is completed.
11fdf7f2 13758int BlueStore::flush_cache(ostream *os)
31f18b77
FG
13759{
13760 dout(10) << __func__ << dendl;
13761 for (auto i : cache_shards) {
13762 i->trim_all();
13763 }
11fdf7f2
TL
13764
13765 return 0;
31f18b77
FG
13766}
13767
7c673cae
FG
13768void BlueStore::_apply_padding(uint64_t head_pad,
13769 uint64_t tail_pad,
7c673cae
FG
13770 bufferlist& padded)
13771{
7c673cae 13772 if (head_pad) {
224ce89b 13773 padded.prepend_zero(head_pad);
7c673cae
FG
13774 }
13775 if (tail_pad) {
13776 padded.append_zero(tail_pad);
13777 }
13778 if (head_pad || tail_pad) {
13779 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
13780 << " tail 0x" << tail_pad << std::dec << dendl;
13781 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
13782 }
13783}
13784
11fdf7f2
TL
13785void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
13786{
13787 // finalize extent_map shards
13788 o->extent_map.update(txn, false);
13789 if (o->extent_map.needs_reshard()) {
13790 o->extent_map.reshard(db, txn);
13791 o->extent_map.update(txn, true);
13792 if (o->extent_map.needs_reshard()) {
13793 dout(20) << __func__ << " warning: still wants reshard, check options?"
13794 << dendl;
13795 o->extent_map.clear_needs_reshard();
13796 }
13797 logger->inc(l_bluestore_onode_reshard);
13798 }
13799
13800 // bound encode
13801 size_t bound = 0;
13802 denc(o->onode, bound);
13803 o->extent_map.bound_encode_spanning_blobs(bound);
13804 if (o->onode.extent_map_shards.empty()) {
13805 denc(o->extent_map.inline_bl, bound);
13806 }
13807
13808 // encode
13809 bufferlist bl;
13810 unsigned onode_part, blob_part, extent_part;
13811 {
13812 auto p = bl.get_contiguous_appender(bound, true);
13813 denc(o->onode, p);
13814 onode_part = p.get_logical_offset();
13815 o->extent_map.encode_spanning_blobs(p);
13816 blob_part = p.get_logical_offset() - onode_part;
13817 if (o->onode.extent_map_shards.empty()) {
13818 denc(o->extent_map.inline_bl, p);
13819 }
13820 extent_part = p.get_logical_offset() - onode_part - blob_part;
13821 }
13822
13823 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
13824 << " (" << onode_part << " bytes onode + "
13825 << blob_part << " bytes spanning blobs + "
13826 << extent_part << " bytes inline extents)"
13827 << dendl;
13828
13829
13830 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
13831}

void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (first) {
        first = false;
      } else {
        s0 += ", ";
      }
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}
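
// Sketch of a hypothetical consumer of the alert list built above, assuming
// the list behaves like a string-to-string map (which the emplace() calls
// imply); purely illustrative, not code from this file:
#if 0
void example_report(osd_alert_list_t& alerts)
{
  for (auto& a : alerts) {
    std::cerr << a.first << ": " << a.second << "\n";  // one line per alert
  }
}
#endif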

// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't call for the second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
                  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}
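
// filter_out() keeps only the bloom-filter slots that overlap the supplied
// extents, where each slot covers 'granularity' bytes of the device. A
// standalone sketch of the offset-to-slot arithmetic it relies on (example
// values only):
#if 0
uint64_t granularity = 1ull << 20;                              // 1 MiB per slot
uint64_t offset = 3 * (1ull << 20) + 123, length = 2 * (1ull << 20);
uint64_t first_slot = offset / granularity;                     // -> 3
uint64_t end_slot   = 1 + (offset + length - 1) / granularity;  // -> 6 (exclusive)
// slots 3, 4 and 5 intersect the extent and would therefore be retained
#endif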

bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}

bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB *db,
  uint64_t sbid,
  const bufferlist* bl)
{
  KeyValueDB::Transaction txn;
  if (fix_misreferences_txn) { // reuse this txn
    txn = fix_misreferences_txn;
  } else {
    if (!fix_shared_blob_txn) {
      fix_shared_blob_txn = db->get_transaction();
    }
    txn = fix_shared_blob_txn;
  }
  string key;
  get_shared_blob_key(sbid, &key);

  ++to_repair_cnt;
  if (bl) {
    txn->set(PREFIX_SHARED_BLOB, key, *bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  return true;
}
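
// fix_shared_blob() either rewrites or drops a shared-blob record depending
// on whether a bufferlist is supplied. Hypothetical usage sketch (sbid values
// and the encoded content are illustrative, not taken from this file):
#if 0
bufferlist fixed;
// ... encode a corrected shared-blob record into 'fixed' ...
repairer.fix_shared_blob(db, sbid, &fixed);          // rewrite the record
repairer.fix_shared_blob(db, stale_sbid, nullptr);   // remove a stale record
#endif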

bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}

bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}

bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}
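
// fix_leaked() and fix_false_free() are mirror repairs: space that is marked
// allocated but referenced by nothing gets released back to the freelist,
// while space that is referenced but recorded as free gets re-allocated.
// Hypothetical fsck-style driver sketch (extents and names are illustrative):
#if 0
repairer.fix_leaked(db, fm, 0x100000, 0x10000);      // allocated, unreferenced
repairer.fix_false_free(db, fm, 0x200000, 0x10000);  // referenced, marked free
#endif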

bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
{
  // this is just a stub to count num of repairs properly,
  // actual repair happens in BlueStore::_close_db_and_around()
  // while doing _sync_bluefs_and_fm
  ++out_of_sync_flag;
  ++to_repair_cnt;
  return true;
}

bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
    return true;
  }
  return false;
}

unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }

  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}
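
// apply() submits whatever per-category repair transactions were queued and
// returns how many individual fixes they contained, resetting the counter.
// Hypothetical caller sketch (illustrative only):
#if 0
unsigned repaired = repairer.apply(db);
if (repaired) {
  std::cerr << "repair applied " << repaired << " fix(es)" << std::endl;
}
#endif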

// =======================================================
