// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_cache_other);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);


// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value(for meta coll)
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

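// Illustrative sketch (informal): an object's onode record is stored under
// PREFIX_OBJ, i.e. "O" + <encoded object key>, while its omap rows live
// under PREFIX_OMAP as "M" + <u64 id> + "." + <user key> (the id is
// typically the onode's nid).  The exact encodings are produced by
// get_object_key() and get_omap_key() below.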
// write a label in the first block. always use this size. note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE 4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED 8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS 4

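// Rough sketch of the packing (see ExtentMap::{encode,decode}_some() for the
// authoritative logic): the blob id occupies the bits above the four flag
// bits, e.g. a blob id of 7 for an extent that continues the previous one and
// starts at blob_offset 0 would be encoded approximately as
//   (7 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET
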
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
 *         we are done. otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

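// Illustrative layout (not a byte-exact dump): an object "foo" in pool 1,
// namespace "", with no separate key, encodes roughly as
//   <shard+0x80> <pool 1 + 2^63> <bit-reversed hash>
//   "!"           (empty namespace, escaped + terminator)
//   "foo!" "="    ('=' means the object key equals the name)
//   <snap u64> <generation u64> 'o'
// so keys sort by shard, then pool, then reversed hash, namespace and name.
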
/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does. We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 */
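// For example, append_escaped("ab#c", out) appends "ab#23c!": '#' (0x23) is
// <= '#', so it is rewritten as "#23", and the string is closed with the '!'
// terminator; a '~' (0x7e) would likewise become "~7e".  decode_escaped()
// below reverses this, stopping at the '!'.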
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') {
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}

// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i=0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
        (mode == HEX && in.length() - i >= 4 &&
         ((in[i] < 32 || (unsigned char)in[i] > 126) ||
          (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
          (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
          (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
        out.append(in.c_str() + from, i - from);
        out.push_back('\'');
      }
      if (mode != HEX) {
        out.append("0x");
        mode = HEX;
      }
      if (in.length() - i >= 4) {
        // print a whole u32 at once
        snprintf(buf, sizeof(buf), "%08x",
                 (uint32_t)(((unsigned char)in[i] << 24) |
                            ((unsigned char)in[i+1] << 16) |
                            ((unsigned char)in[i+2] << 8) |
                            ((unsigned char)in[i+3] << 0)));
        i += 3;
      } else {
        snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
        out.push_back('\'');
        mode = STRING;
        from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}
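// For example, the 5-byte input "\x01\x02" "ABC" renders as "0x01024142'C'":
// the leading run is shown as a whole u32 in hex and the trailing printable
// byte is quoted.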

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_key_range(const coll_t& cid, int bits,
                               string *temp_start, string *temp_end,
                               string *start, string *end)
{
  temp_start->clear();
  temp_end->clear();
  start->clear();
  end->clear();

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    _key_encode_shard(pgid.shard, start);
    *temp_start = *start;

    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    _key_encode_u32(reverse_hash, start);
    _key_encode_u32(reverse_hash, temp_start);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    _key_encode_u32(end_hash, end);
    _key_encode_u32(end_hash, temp_end);
  } else {
    _key_encode_shard(shard_id_t::NO_SHARD, start);
    _key_encode_u64(-1ull + 0x8000000000000000ull, start);
    *end = *start;
    _key_encode_u32(0, start);
    _key_encode_u32(0xffffffff, end);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }
}
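// Worked example: for a PG with bits = 12 and reverse_hash = 0x12340000 the
// non-temp range covers [0x12340000, 0x12340000 + (1 << 20)) =
// [0x12340000, 0x12440000), i.e. exactly the reversed-hash values that fall
// into this PG.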

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < 1 + 8 + 4)
    return -1;
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something is wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = 1 + 8 + 4 +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}


// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

// '-' < '.' < '~'
static void get_omap_header(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('-');
}

// hmm, I don't think there's any need to escape the user key since we
// have a clean prefix.
static void get_omap_key(uint64_t id, const string& key, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('.');
  out->append(key);
}

static void rewrite_omap_key(uint64_t id, string old, string *out)
{
  _key_encode_u64(id, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

static void decode_omap_key(const string& key, string *user_key)
{
  *user_key = key.substr(sizeof(uint64_t) + 1);
}

static void get_omap_tail(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('~');
}
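// Layout sketch: for an object with id 0x12 the omap rows are bracketed by
//   <0x12>'-'            header           (get_omap_header)
//   <0x12>'.'<user key>  one row per key  (get_omap_key)
//   <0x12>'~'            tail sentinel    (get_omap_tail)
// and since '-' < '.' < '~', iterating from the header key to the tail key
// visits exactly this object's omap entries in order.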

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << " shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << " " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << " attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// merge operators

struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    ceph_assert(llen == rlen);
    ceph_assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const __le64* lv = (const __le64*)ldata;
    const __le64* rv = (const __le64*)rdata;
    __le64* nv = &(__le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  const char *name() const override {
    return "int64_array";
  }
};

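// Example: merging an existing value {1, 2, 3} with an operand {10, 0, 7}
// (both little-endian int64 arrays) yields {11, 2, 10}.  This is what lets
// the statfs counters kept under PREFIX_STAT be accumulated with merge
// operations instead of read-modify-write cycles.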

// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.emplace_back(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{

  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t >();
  blob_info_counted = nullptr;

  gc_start_offset = start_offset;
  gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
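// Worked example (illustrative): a fully dereferenced compressed blob that
// occupies two allocation units on disk (expected_for_release = 2) and whose
// protruding extents would need one new AU to be rewritten
// (expected_allocations = 1) contributes a benefit of 1; it is only queued
// for collection if that meets bluestore_gc_enable_blob_threshold, and
// estimate() returns the summed difference for the whole range.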

// Cache

BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
                                           PerfCounters *logger)
{
  Cache *c = nullptr;

  if (type == "lru")
    c = new LRUCache(cct);
  else if (type == "2q")
    c = new TwoQCache(cct);
  else
    ceph_abort_msg("unrecognized cache type");

  c->logger = logger;
  return c;
}

void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max)
{
  std::lock_guard l(lock);
  _trim(onode_max, buffer_max);
}

void BlueStore::Cache::trim_all()
{
  std::lock_guard l(lock);
  _trim(0, 0);
}

// LRUCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "

void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}

void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_size << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  while (buffer_size > buffer_max) {
    auto i = buffer_lru.rbegin();
    if (i == buffer_lru.rend()) {
      // stop if buffer_lru is now empty
      break;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_clean());
    dout(20) << __func__ << " rm " << *b << dendl;
    b->space->_rm_buffer(this, b);
  }

  // onodes
  if (onode_max >= onode_lru.size()) {
    return; // don't even try
  }
  uint64_t num = onode_lru.size() - onode_max;

  auto p = onode_lru.end();
  ceph_assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << " " << o->oid << " has " << refs
               << " refs, skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " rm " << o->oid << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      ceph_assert(num == 1);
    }
    o->get(); // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}

#ifdef DEBUG_CACHE
void BlueStore::LRUCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
    s += i->length;
  }
  if (s != buffer_size) {
    derr << __func__ << " buffer_size " << buffer_size << " actual " << s
         << dendl;
    for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
      derr << __func__ << " " << *i << dendl;
    }
    ceph_assert(s == buffer_size);
  }
  dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
           << " ok" << dendl;
}
#endif

// TwoQCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "


void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}

void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
{
  dout(20) << __func__ << " level " << level << " near " << near
           << " on " << *b
           << " which has cache_private " << b->cache_private << dendl;
  if (near) {
    b->cache_private = near->cache_private;
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
      break;
    case BUFFER_HOT:
      buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
  } else if (b->cache_private == BUFFER_NEW) {
    b->cache_private = BUFFER_WARM_IN;
    if (level > 0) {
      buffer_warm_in.push_front(*b);
    } else {
      // take caller hint to start at the back of the warm queue
      buffer_warm_in.push_back(*b);
    }
  } else {
    // we got a hint from discard
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // stay in warm_in. move to front, even though 2Q doesn't actually
      // do this.
      dout(20) << __func__ << " move to front of warm " << *b << dendl;
      buffer_warm_in.push_front(*b);
      break;
    case BUFFER_WARM_OUT:
      b->cache_private = BUFFER_HOT;
      // move to hot. fall-thru
    case BUFFER_HOT:
      dout(20) << __func__ << " move to front of hot " << *b << dendl;
      buffer_hot.push_front(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
  }
  if (!b->is_empty()) {
    buffer_bytes += b->length;
    buffer_list_bytes[b->cache_private] += b->length;
  }
}

void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
{
  dout(20) << __func__ << " " << *b << dendl;
  if (!b->is_empty()) {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    ceph_assert(buffer_list_bytes[b->cache_private] >= b->length);
    buffer_list_bytes[b->cache_private] -= b->length;
  }
  switch (b->cache_private) {
  case BUFFER_WARM_IN:
    buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
    break;
  case BUFFER_WARM_OUT:
    buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
    break;
  case BUFFER_HOT:
    buffer_hot.erase(buffer_hot.iterator_to(*b));
    break;
  default:
    ceph_abort_msg("bad cache_private");
  }
}

void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
{
  TwoQCache *src = static_cast<TwoQCache*>(srcc);
  src->_rm_buffer(b);

  // preserve which list we're on (even if we can't preserve the order!)
  switch (b->cache_private) {
  case BUFFER_WARM_IN:
    ceph_assert(!b->is_empty());
    buffer_warm_in.push_back(*b);
    break;
  case BUFFER_WARM_OUT:
    ceph_assert(b->is_empty());
    buffer_warm_out.push_back(*b);
    break;
  case BUFFER_HOT:
    ceph_assert(!b->is_empty());
    buffer_hot.push_back(*b);
    break;
  default:
    ceph_abort_msg("bad cache_private");
  }
  if (!b->is_empty()) {
    buffer_bytes += b->length;
    buffer_list_bytes[b->cache_private] += b->length;
  }
}

void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
{
  dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
  if (!b->is_empty()) {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
    ceph_assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
    buffer_list_bytes[b->cache_private] += delta;
  }
}

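// Rough sizing sketch for the buffer part of _trim() below: with, say,
// buffer_max = 100 MB and bluestore_2q_cache_kin_ratio = 0.5, the warm_in
// list is targeted at ~50 MB (kin) and the hot list at the remainder (khot);
// kout, the number of empty "ghost" buffers kept on warm_out, is derived
// from the average buffer size times bluestore_2q_cache_kout_ratio.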
void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_bytes << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  if (buffer_bytes > buffer_max) {
    uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
    uint64_t khot = buffer_max - kin;

    // pre-calculate kout based on average buffer size too,
    // which is typical (the warm_in and hot lists may change later)
    uint64_t kout = 0;
    uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
    if (buffer_num) {
      uint64_t buffer_avg_size = buffer_bytes / buffer_num;
      ceph_assert(buffer_avg_size);
      uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
      kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
    }

    if (buffer_list_bytes[BUFFER_HOT] < khot) {
      // hot is small, give slack to warm_in
      kin += khot - buffer_list_bytes[BUFFER_HOT];
    } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
      // warm_in is small, give slack to hot
      khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
    }

    // adjust warm_in list
    int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
    uint64_t evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_warm_in.rbegin();
      if (p == buffer_warm_in.rend()) {
        // stop if warm_in list is now empty
        break;
      }

      Buffer *b = &*p;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
      buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->state = Buffer::STATE_EMPTY;
      b->data.clear();
      buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
      buffer_warm_out.push_front(*b);
      b->cache_private = BUFFER_WARM_OUT;
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << byte_u_t(evicted)
               << " from warm_in list, done evicting warm_in buffers"
               << dendl;
    }

    // adjust hot list
    to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
    evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_hot.rbegin();
      if (p == buffer_hot.rend()) {
        // stop if hot list is now empty
        break;
      }

      Buffer *b = &*p;
      dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
      ceph_assert(b->is_clean());
      // adjust evict size before buffer goes invalid
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->space->_rm_buffer(this, b);
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << byte_u_t(evicted)
               << " from hot list, done evicting hot buffers"
               << dendl;
    }

    // adjust warm out list too, if necessary
    int64_t num = buffer_warm_out.size() - kout;
    while (num-- > 0) {
      Buffer *b = &*buffer_warm_out.rbegin();
      ceph_assert(b->is_empty());
      dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
  }

  // onodes
  if (onode_max >= onode_lru.size()) {
    return; // don't even try
  }
  uint64_t num = onode_lru.size() - onode_max;

  auto p = onode_lru.end();
  ceph_assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    dout(20) << __func__ << " considering " << o << dendl;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << " " << o->oid << " has " << refs
               << " refs; skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " " << o->oid << " num=" << num
             << " lru size=" << onode_lru.size() << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      ceph_assert(num == 1);
    }
    o->get(); // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}

#ifdef DEBUG_CACHE
void BlueStore::TwoQCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
    s += i->length;
  }

  uint64_t hot_bytes = s;
  if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
    derr << __func__ << " hot_list_bytes "
         << buffer_list_bytes[BUFFER_HOT]
         << " != actual " << hot_bytes
         << dendl;
    ceph_assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
  }

  for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
    s += i->length;
  }

  uint64_t warm_in_bytes = s - hot_bytes;
  if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
    derr << __func__ << " warm_in_list_bytes "
         << buffer_list_bytes[BUFFER_WARM_IN]
         << " != actual " << warm_in_bytes
         << dendl;
    ceph_assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
  }

  if (s != buffer_bytes) {
    derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
         << dendl;
    ceph_assert(s == buffer_bytes);
  }

  dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
           << " ok" << dendl;
}
#endif


// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(Cache* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}

void BlueStore::BufferSpace::read(
  Cache* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch_buffer(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch_buffer(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}

void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add_buffer(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }

  cache->_audit("finish_write end");
}

void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
                      0, p->second.get());
      }
      cache->_adjust_buffer_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
}

// OnodeSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add_onode(o, 1);
  return o;
}

BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;
  bool hit = false;

  {
    std::lock_guard l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                            << dendl;
      cache->_touch_onode(p->second);
      hit = true;
      o = p->second;
    }
  }

  if (hit) {
    cache->logger->inc(l_bluestore_onode_hits);
  } else {
    cache->logger->inc(l_bluestore_onode_misses);
  }
  return o;
}

void BlueStore::OnodeSpace::clear()
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 10) << __func__ << dendl;
  for (auto &p : onode_map) {
    cache->_rm_onode(p.second);
  }
  onode_map.clear();
}

bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard l(cache->lock);
  return onode_map.empty();
}

void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_other::string& new_okey)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  ceph_assert(po != pn);

  ceph_assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
                          << dendl;
    cache->_rm_onode(pn->second);
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add_onode(po->second, 1);

  // add at new position and fix oid, key
  onode_map.insert(make_pair(new_oid, o));
  cache->_touch_onode(o);
  o->oid = new_oid;
  o->key = new_okey;
}

bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second)) {
      return true;
    }
  }
  return false;
}

template <int LogLevelV = 30>
void BlueStore::OnodeSpace::dump(CephContext *cct)
{
  for (auto& i : onode_map) {
    ldout(cct, LogLevelV) << i.first << " : " << i.second << dendl;
  }
}

// SharedBlob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;

  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}

BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  ceph_assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}

BlueStore::SharedBlob::~SharedBlob()
{
  if (loaded && persistent) {
    delete persistent;
  }
}

void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    ldout(coll->store->cct, 20) << __func__ << " " << this
                                << " removing self from set " << get_parent()
                                << dendl;
  again:
    auto coll_snap = coll;
    if (coll_snap) {
      std::lock_guard l(coll_snap->cache->lock);
      if (coll_snap != coll) {
        goto again;
      }
      if (!coll_snap->shared_blob_set.remove(this, true)) {
        // race with lookup
        return;
      }
      bc._clear(coll_snap->cache);
      coll_snap->cache->rm_blob();
    }
    delete this;
  }
}

void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
{
  ceph_assert(persistent);
  persistent->ref_map.get(offset, length);
}

void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
                                    PExtentVector *r,
                                    bool *unshare)
{
  ceph_assert(persistent);
  persistent->ref_map.put(offset, length, r,
                          unshare && !*unshare ? unshare : nullptr);
}

void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
  while (true) {
    Cache *cache = coll->cache;
    std::lock_guard l(cache->lock);
    if (coll->cache != cache) {
      ldout(coll->store->cct, 20) << __func__
                                  << " raced with sb cache update, was " << cache
                                  << ", now " << coll->cache << ", retrying"
                                  << dendl;
      continue;
    }
    bc._finish_write(cache, seq);
    break;
  }
}

// SharedBlobSet

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "

template <int LogLevelV = 30>
void BlueStore::SharedBlobSet::dump(CephContext *cct)
{
  std::lock_guard l(lock);
  for (auto& i : sb_map) {
    ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
  }
}
1768
7c673cae
FG
1769// Blob
1770
1771#undef dout_prefix
1772#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1773
1774ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1775{
1776 out << "Blob(" << &b;
1777 if (b.is_spanning()) {
1778 out << " spanning " << b.id;
1779 }
35e4c445
FG
1780 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1781 if (b.shared_blob) {
1782 out << " " << *b.shared_blob;
1783 } else {
1784 out << " (shared_blob=NULL)";
1785 }
1786 out << ")";
7c673cae
FG
1787 return out;
1788}
1789
1790void BlueStore::Blob::discard_unallocated(Collection *coll)
1791{
224ce89b 1792 if (get_blob().is_shared()) {
7c673cae
FG
1793 return;
1794 }
224ce89b 1795 if (get_blob().is_compressed()) {
7c673cae
FG
1796 bool discard = false;
1797 bool all_invalid = true;
224ce89b 1798 for (auto e : get_blob().get_extents()) {
7c673cae
FG
1799 if (!e.is_valid()) {
1800 discard = true;
1801 } else {
1802 all_invalid = false;
1803 }
1804 }
11fdf7f2 1805 ceph_assert(discard == all_invalid); // in case of compressed blob all
7c673cae
FG
1806 // or none pextents are invalid.
1807 if (discard) {
224ce89b
WB
1808 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1809 get_blob().get_logical_length());
7c673cae
FG
1810 }
1811 } else {
1812 size_t pos = 0;
224ce89b 1813 for (auto e : get_blob().get_extents()) {
7c673cae
FG
1814 if (!e.is_valid()) {
1815 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1816 << "~" << e.length
1817 << std::dec << dendl;
1818 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1819 }
1820 pos += e.length;
1821 }
224ce89b
WB
1822 if (get_blob().can_prune_tail()) {
1823 dirty_blob().prune_tail();
1824 used_in_blob.prune_tail(get_blob().get_ondisk_length());
7c673cae 1825 auto cct = coll->store->cct; //used by dout
224ce89b 1826 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
7c673cae
FG
1827 }
1828 }
1829}
1830
1831void BlueStore::Blob::get_ref(
1832 Collection *coll,
1833 uint32_t offset,
1834 uint32_t length)
1835{
1836 // Caller has to initialize Blob's logical length prior to increment
1837 // references. Otherwise one is neither unable to determine required
1838 // amount of counters in case of per-au tracking nor obtain min_release_size
1839 // for single counter mode.
11fdf7f2 1840 ceph_assert(get_blob().get_logical_length() != 0);
7c673cae
FG
1841 auto cct = coll->store->cct;
1842 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1843 << std::dec << " " << *this << dendl;
1844
1845 if (used_in_blob.is_empty()) {
1846 uint32_t min_release_size =
224ce89b
WB
1847 get_blob().get_release_size(coll->store->min_alloc_size);
1848 uint64_t l = get_blob().get_logical_length();
1849 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1850 << min_release_size << std::dec << dendl;
7c673cae
FG
1851 used_in_blob.init(l, min_release_size);
1852 }
1853 used_in_blob.get(
1854 offset,
1855 length);
1856}
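// Rough example of the per-au accounting above (illustrative values): for a
// blob with logical length 0x10000 and min_release_size 0x4000,
// used_in_blob.init() sets up four 0x4000-byte buckets; a subsequent
// get_ref(0x0, 0x6000) bumps the first two buckets, so only those
// allocation units hold references until the matching put_ref() drops them
// and the underlying space can be released.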
1857
1858bool BlueStore::Blob::put_ref(
1859 Collection *coll,
1860 uint32_t offset,
1861 uint32_t length,
1862 PExtentVector *r)
1863{
1864 PExtentVector logical;
1865
1866 auto cct = coll->store->cct;
1867 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1868 << std::dec << " " << *this << dendl;
1869
1870 bool empty = used_in_blob.put(
1871 offset,
1872 length,
1873 &logical);
1874 r->clear();
1875 // nothing to release
1876 if (!empty && logical.empty()) {
1877 return false;
1878 }
1879
1880 bluestore_blob_t& b = dirty_blob();
1881 return b.release_extents(empty, logical, r);
1882}
1883
224ce89b 1884bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
1885 uint32_t target_blob_size,
1886 uint32_t b_offset,
1887 uint32_t *length0) {
11fdf7f2
TL
1888 ceph_assert(min_alloc_size);
1889 ceph_assert(target_blob_size);
7c673cae
FG
1890 if (!get_blob().is_mutable()) {
1891 return false;
1892 }
1893
1894 uint32_t length = *length0;
1895 uint32_t end = b_offset + length;
1896
 1897 // Currently, for the sake of simplicity, we omit blob reuse if the data is
 1898 // not aligned to the csum chunk size. Padding could be added later if needed.
1899 if (get_blob().has_csum() &&
1900 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1901 (end % get_blob().get_csum_chunk_size()) != 0)) {
1902 return false;
1903 }
1904
1905 auto blen = get_blob().get_logical_length();
1906 uint32_t new_blen = blen;
1907
1908 // make sure target_blob_size isn't less than current blob len
11fdf7f2 1909 target_blob_size = std::max(blen, target_blob_size);
7c673cae
FG
1910
1911 if (b_offset >= blen) {
224ce89b
WB
 1912 // new data lies entirely beyond the existing blob
1913 new_blen = end;
7c673cae 1914 } else {
224ce89b 1915 // new data overlaps with the existing blob
11fdf7f2 1916 new_blen = std::max(blen, end);
224ce89b
WB
1917
1918 uint32_t overlap = 0;
1919 if (new_blen > blen) {
1920 overlap = blen - b_offset;
1921 } else {
1922 overlap = length;
1923 }
1924
1925 if (!get_blob().is_unallocated(b_offset, overlap)) {
1926 // abort if any piece of the overlap has already been allocated
1927 return false;
7c673cae
FG
1928 }
1929 }
224ce89b 1930
7c673cae
FG
1931 if (new_blen > blen) {
1932 int64_t overflow = int64_t(new_blen) - target_blob_size;
1933 // Unable to decrease the provided length to fit into max_blob_size
1934 if (overflow >= length) {
1935 return false;
1936 }
1937
1938 // FIXME: in some cases we could reduce unused resolution
1939 if (get_blob().has_unused()) {
1940 return false;
1941 }
1942
1943 if (overflow > 0) {
1944 new_blen -= overflow;
1945 length -= overflow;
1946 *length0 = length;
1947 }
224ce89b 1948
7c673cae
FG
1949 if (new_blen > blen) {
1950 dirty_blob().add_tail(new_blen);
1951 used_in_blob.add_tail(new_blen,
224ce89b 1952 get_blob().get_release_size(min_alloc_size));
7c673cae
FG
1953 }
1954 }
1955 return true;
1956}
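// Illustrative walk-through of the tail-extension path above (assuming no
// csum or 'unused' complications): with min_alloc_size 0x1000, a mutable
// blob of logical length blen = 0xc000 and target_blob_size 0x10000, a
// request for b_offset 0xc000, length 0x8000 gives new_blen = 0x14000 and
// overflow = 0x4000; since overflow < length, the request is trimmed to
// length 0x4000, the blob grows its tail to 0x10000, and the function
// returns true so the caller places the remaining bytes elsewhere.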
1957
1958void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1959{
1960 auto cct = coll->store->cct; //used by dout
1961 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1962 << " start " << *this << dendl;
11fdf7f2
TL
1963 ceph_assert(blob.can_split());
1964 ceph_assert(used_in_blob.can_split());
7c673cae
FG
1965 bluestore_blob_t &lb = dirty_blob();
1966 bluestore_blob_t &rb = r->dirty_blob();
1967
1968 used_in_blob.split(
1969 blob_offset,
1970 &(r->used_in_blob));
1971
1972 lb.split(blob_offset, rb);
1973 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1974
1975 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1976 << " finish " << *this << dendl;
1977 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1978 << " and " << *r << dendl;
1979}
1980
1981#ifndef CACHE_BLOB_BL
1982void BlueStore::Blob::decode(
1983 Collection *coll,
11fdf7f2 1984 bufferptr::const_iterator& p,
7c673cae
FG
1985 uint64_t struct_v,
1986 uint64_t* sbid,
1987 bool include_ref_map)
1988{
1989 denc(blob, p, struct_v);
1990 if (blob.is_shared()) {
1991 denc(*sbid, p);
1992 }
1993 if (include_ref_map) {
1994 if (struct_v > 1) {
1995 used_in_blob.decode(p);
1996 } else {
1997 used_in_blob.clear();
1998 bluestore_extent_ref_map_t legacy_ref_map;
1999 legacy_ref_map.decode(p);
2000 for (auto r : legacy_ref_map.ref_map) {
2001 get_ref(
2002 coll,
2003 r.first,
2004 r.second.refs * r.second.length);
2005 }
2006 }
2007 }
2008}
2009#endif
2010
2011// Extent
2012
2013ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2014{
2015 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2016 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2017 << " " << *e.blob;
2018}
2019
2020// OldExtent
2021BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2022 uint32_t lo,
2023 uint32_t o,
2024 uint32_t l,
2025 BlobRef& b) {
2026 OldExtent* oe = new OldExtent(lo, o, l, b);
2027 b->put_ref(c.get(), o, l, &(oe->r));
2028 oe->blob_empty = b->get_referenced_bytes() == 0;
2029 return oe;
2030}
2031
2032// ExtentMap
2033
2034#undef dout_prefix
2035#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2036
2037BlueStore::ExtentMap::ExtentMap(Onode *o)
2038 : onode(o),
2039 inline_bl(
2040 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2041}
2042
11fdf7f2
TL
2043void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2044 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2045 uint64_t& length, uint64_t& dstoff) {
2046
2047 auto cct = onode->c->store->cct;
2048 bool inject_21040 =
2049 cct->_conf->bluestore_debug_inject_bug21040;
2050 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2051 for (auto& e : oldo->extent_map.extent_map) {
2052 e.blob->last_encoded_id = -1;
2053 }
2054
2055 int n = 0;
2056 uint64_t end = srcoff + length;
2057 uint32_t dirty_range_begin = 0;
2058 uint32_t dirty_range_end = 0;
2059 bool src_dirty = false;
2060 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2061 ep != oldo->extent_map.extent_map.end();
2062 ++ep) {
2063 auto& e = *ep;
2064 if (e.logical_offset >= end) {
2065 break;
2066 }
2067 dout(20) << __func__ << " src " << e << dendl;
2068 BlobRef cb;
2069 bool blob_duped = true;
2070 if (e.blob->last_encoded_id >= 0) {
2071 cb = id_to_blob[e.blob->last_encoded_id];
2072 blob_duped = false;
2073 } else {
2074 // dup the blob
2075 const bluestore_blob_t& blob = e.blob->get_blob();
2076 // make sure it is shared
2077 if (!blob.is_shared()) {
2078 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2079 if (!inject_21040 && !src_dirty) {
2080 src_dirty = true;
2081 dirty_range_begin = e.logical_offset;
2082 } else if (inject_21040 &&
2083 dirty_range_begin == 0 && dirty_range_end == 0) {
2084 dirty_range_begin = e.logical_offset;
2085 }
2086 ceph_assert(e.logical_end() > 0);
2087 // -1 to exclude next potential shard
2088 dirty_range_end = e.logical_end() - 1;
2089 } else {
2090 c->load_shared_blob(e.blob->shared_blob);
2091 }
2092 cb = new Blob();
2093 e.blob->last_encoded_id = n;
2094 id_to_blob[n] = cb;
2095 e.blob->dup(*cb);
2096 // bump the extent refs on the copied blob's extents
2097 for (auto p : blob.get_extents()) {
2098 if (p.is_valid()) {
2099 e.blob->shared_blob->get_ref(p.offset, p.length);
2100 }
2101 }
2102 txc->write_shared_blob(e.blob->shared_blob);
2103 dout(20) << __func__ << " new " << *cb << dendl;
2104 }
2105
2106 int skip_front, skip_back;
2107 if (e.logical_offset < srcoff) {
2108 skip_front = srcoff - e.logical_offset;
2109 } else {
2110 skip_front = 0;
2111 }
2112 if (e.logical_end() > end) {
2113 skip_back = e.logical_end() - end;
2114 } else {
2115 skip_back = 0;
2116 }
2117
2118 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2119 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2120 newo->extent_map.extent_map.insert(*ne);
2121 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2122 // fixme: we may leave parts of new blob unreferenced that could
2123 // be freed (relative to the shared_blob).
2124 txc->statfs_delta.stored() += ne->length;
2125 if (e.blob->get_blob().is_compressed()) {
2126 txc->statfs_delta.compressed_original() += ne->length;
2127 if (blob_duped) {
2128 txc->statfs_delta.compressed() +=
2129 cb->get_blob().get_compressed_payload_length();
2130 }
2131 }
2132 dout(20) << __func__ << " dst " << *ne << dendl;
2133 ++n;
2134 }
2135 if ((!inject_21040 && src_dirty) ||
2136 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2137 oldo->extent_map.dirty_range(dirty_range_begin,
2138 dirty_range_end - dirty_range_begin);
2139 txc->write_onode(oldo);
2140 }
2141 txc->write_onode(newo);
2142
2143 if (dstoff + length > newo->onode.size) {
2144 newo->onode.size = dstoff + length;
2145 }
2146 newo->extent_map.dirty_range(dstoff, length);
2147}
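// Summary of dup() above, which supports the clone path: every source blob
// touched by [srcoff, srcoff+length) is converted to a shared blob (if it is
// not one already), its per-extent references are bumped, a Blob copy is
// attached to the new extents at dstoff, and the dirtied ranges of both
// onodes are queued for persistence via write_onode()/write_shared_blob().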
7c673cae
FG
2148void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2149 bool force)
2150{
2151 auto cct = onode->c->store->cct; //used by dout
2152 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2153 if (onode->onode.extent_map_shards.empty()) {
2154 if (inline_bl.length() == 0) {
2155 unsigned n;
2156 // we need to encode inline_bl to measure encoded length
2157 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
3efd9988 2158 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11fdf7f2 2159 ceph_assert(!never_happen);
7c673cae
FG
2160 size_t len = inline_bl.length();
2161 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2162 << " extents" << dendl;
2163 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2164 request_reshard(0, OBJECT_MAX_SIZE);
2165 return;
2166 }
2167 }
2168 // will persist in the onode key.
2169 } else {
2170 // pending shard update
2171 struct dirty_shard_t {
2172 Shard *shard;
2173 bufferlist bl;
2174 dirty_shard_t(Shard *s) : shard(s) {}
2175 };
2176 vector<dirty_shard_t> encoded_shards;
2177 // allocate slots for all shards in a single call instead of
 2178 // doing multiple allocations - one for each dirty shard
2179 encoded_shards.reserve(shards.size());
2180
2181 auto p = shards.begin();
2182 auto prev_p = p;
2183 while (p != shards.end()) {
11fdf7f2 2184 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2185 auto n = p;
2186 ++n;
2187 if (p->dirty) {
2188 uint32_t endoff;
2189 if (n == shards.end()) {
2190 endoff = OBJECT_MAX_SIZE;
2191 } else {
2192 endoff = n->shard_info->offset;
2193 }
2194 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2195 bufferlist& bl = encoded_shards.back().bl;
2196 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2197 bl, &p->extents)) {
2198 if (force) {
2199 derr << __func__ << " encode_some needs reshard" << dendl;
11fdf7f2 2200 ceph_assert(!force);
7c673cae
FG
2201 }
2202 }
2203 size_t len = bl.length();
2204
2205 dout(20) << __func__ << " shard 0x" << std::hex
2206 << p->shard_info->offset << std::dec << " is " << len
2207 << " bytes (was " << p->shard_info->bytes << ") from "
2208 << p->extents << " extents" << dendl;
2209
2210 if (!force) {
2211 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2212 // we are big; reshard ourselves
2213 request_reshard(p->shard_info->offset, endoff);
2214 }
2215 // avoid resharding the trailing shard, even if it is small
2216 else if (n != shards.end() &&
11fdf7f2
TL
2217 len < g_conf()->bluestore_extent_map_shard_min_size) {
2218 ceph_assert(endoff != OBJECT_MAX_SIZE);
31f18b77
FG
2219 if (p == shards.begin()) {
2220 // we are the first shard, combine with next shard
7c673cae 2221 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2222 } else {
31f18b77
FG
2223 // combine either with the previous shard or the next,
2224 // whichever is smaller
7c673cae
FG
2225 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2226 request_reshard(p->shard_info->offset, endoff + 1);
2227 } else {
2228 request_reshard(prev_p->shard_info->offset, endoff);
2229 }
2230 }
2231 }
2232 }
2233 }
2234 prev_p = p;
2235 p = n;
2236 }
2237 if (needs_reshard()) {
2238 return;
2239 }
2240
2241 // schedule DB update for dirty shards
2242 string key;
2243 for (auto& it : encoded_shards) {
2244 it.shard->dirty = false;
2245 it.shard->shard_info->bytes = it.bl.length();
2246 generate_extent_shard_key_and_apply(
2247 onode->key,
2248 it.shard->shard_info->offset,
2249 &key,
2250 [&](const string& final_key) {
2251 t->set(PREFIX_OBJ, final_key, it.bl);
2252 }
2253 );
2254 }
2255 }
2256}
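// Sizing rules applied above: a dirty shard whose encoded length exceeds
// bluestore_extent_map_shard_max_size triggers a reshard of that range,
// while a non-trailing shard smaller than
// bluestore_extent_map_shard_min_size is scheduled to be merged with its
// smaller neighbour (or with the next shard when it is the first one).
// The inline (unsharded) encoding follows the same max-size rule.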
2257
31f18b77
FG
2258bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2259{
2260 if (spanning_blob_map.empty())
2261 return 0;
2262 bid_t bid = spanning_blob_map.rbegin()->first + 1;
 2263 // if bid is valid and available, use it.
2264 if (bid >= 0)
2265 return bid;
 2266 // Otherwise, find the next unused bid.
2267 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2268 const auto begin_bid = bid;
2269 do {
2270 if (!spanning_blob_map.count(bid))
2271 return bid;
2272 else {
2273 bid++;
2274 if (bid < 0) bid = 0;
2275 }
2276 } while (bid != begin_bid);
81eedcae
TL
2277 auto cct = onode->c->store->cct; // used by dout
2278 _dump_onode<0>(cct, *onode);
11fdf7f2 2279 ceph_abort_msg("no available blob id");
31f18b77
FG
2280}
2281
7c673cae
FG
2282void BlueStore::ExtentMap::reshard(
2283 KeyValueDB *db,
2284 KeyValueDB::Transaction t)
2285{
2286 auto cct = onode->c->store->cct; // used by dout
2287
2288 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2289 << needs_reshard_end << ")" << std::dec
2290 << " of " << onode->onode.extent_map_shards.size()
2291 << " shards on " << onode->oid << dendl;
2292 for (auto& p : spanning_blob_map) {
2293 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2294 << dendl;
2295 }
2296 // determine shard index range
2297 unsigned si_begin = 0, si_end = 0;
2298 if (!shards.empty()) {
2299 while (si_begin + 1 < shards.size() &&
2300 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2301 ++si_begin;
2302 }
2303 needs_reshard_begin = shards[si_begin].shard_info->offset;
2304 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2305 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2306 needs_reshard_end = shards[si_end].shard_info->offset;
2307 break;
2308 }
2309 }
2310 if (si_end == shards.size()) {
2311 needs_reshard_end = OBJECT_MAX_SIZE;
2312 }
2313 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2314 << " over 0x[" << std::hex << needs_reshard_begin << ","
2315 << needs_reshard_end << ")" << std::dec << dendl;
2316 }
2317
181888fb 2318 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
7c673cae
FG
2319
 2320 // we may need to fault in a larger interval later: we must have all
 2321 // referring extents for spanning blobs loaded in order to have
 2322 // accurate use_tracker values.
2323 uint32_t spanning_scan_begin = needs_reshard_begin;
2324 uint32_t spanning_scan_end = needs_reshard_end;
2325
2326 // remove old keys
2327 string key;
2328 for (unsigned i = si_begin; i < si_end; ++i) {
2329 generate_extent_shard_key_and_apply(
2330 onode->key, shards[i].shard_info->offset, &key,
2331 [&](const string& final_key) {
2332 t->rmkey(PREFIX_OBJ, final_key);
2333 }
2334 );
2335 }
2336
2337 // calculate average extent size
2338 unsigned bytes = 0;
2339 unsigned extents = 0;
2340 if (onode->onode.extent_map_shards.empty()) {
2341 bytes = inline_bl.length();
2342 extents = extent_map.size();
2343 } else {
2344 for (unsigned i = si_begin; i < si_end; ++i) {
2345 bytes += shards[i].shard_info->bytes;
2346 extents += shards[i].extents;
2347 }
2348 }
2349 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2350 unsigned slop = target *
2351 cct->_conf->bluestore_extent_map_shard_target_size_slop;
11fdf7f2 2352 unsigned extent_avg = bytes / std::max(1u, extents);
7c673cae
FG
2353 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2354 << ", slop " << slop << dendl;
2355
2356 // reshard
2357 unsigned estimate = 0;
31f18b77 2358 unsigned offset = needs_reshard_begin;
7c673cae
FG
2359 vector<bluestore_onode_t::shard_info> new_shard_info;
2360 unsigned max_blob_end = 0;
2361 Extent dummy(needs_reshard_begin);
2362 for (auto e = extent_map.lower_bound(dummy);
2363 e != extent_map.end();
2364 ++e) {
2365 if (e->logical_offset >= needs_reshard_end) {
2366 break;
2367 }
2368 dout(30) << " extent " << *e << dendl;
2369
2370 // disfavor shard boundaries that span a blob
2371 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2372 if (estimate &&
2373 estimate + extent_avg > target + (would_span ? slop : 0)) {
2374 // new shard
31f18b77 2375 if (offset == needs_reshard_begin) {
7c673cae
FG
2376 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2377 new_shard_info.back().offset = offset;
2378 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2379 << std::dec << dendl;
7c673cae
FG
2380 }
2381 offset = e->logical_offset;
2382 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2383 new_shard_info.back().offset = offset;
2384 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2385 << std::dec << dendl;
2386 estimate = 0;
2387 }
2388 estimate += extent_avg;
31f18b77
FG
2389 unsigned bs = e->blob_start();
2390 if (bs < spanning_scan_begin) {
2391 spanning_scan_begin = bs;
7c673cae
FG
2392 }
2393 uint32_t be = e->blob_end();
2394 if (be > max_blob_end) {
2395 max_blob_end = be;
2396 }
2397 if (be > spanning_scan_end) {
2398 spanning_scan_end = be;
2399 }
2400 }
2401 if (new_shard_info.empty() && (si_begin > 0 ||
2402 si_end < shards.size())) {
2403 // we resharded a partial range; we must produce at least one output
2404 // shard
2405 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2406 new_shard_info.back().offset = needs_reshard_begin;
2407 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2408 << std::dec << " (singleton degenerate case)" << dendl;
2409 }
2410
2411 auto& sv = onode->onode.extent_map_shards;
2412 dout(20) << __func__ << " new " << new_shard_info << dendl;
2413 dout(20) << __func__ << " old " << sv << dendl;
2414 if (sv.empty()) {
2415 // no old shards to keep
2416 sv.swap(new_shard_info);
2417 init_shards(true, true);
2418 } else {
2419 // splice in new shards
2420 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2421 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2422 sv.insert(
2423 sv.begin() + si_begin,
2424 new_shard_info.begin(),
2425 new_shard_info.end());
2426 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2427 si_end = si_begin + new_shard_info.size();
31f18b77 2428
11fdf7f2 2429 ceph_assert(sv.size() == shards.size());
31f18b77
FG
2430
2431 // note that we need to update every shard_info of shards here,
2432 // as sv might have been totally re-allocated above
2433 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2434 shards[i].shard_info = &sv[i];
31f18b77
FG
2435 }
2436
2437 // mark newly added shards as dirty
2438 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2439 shards[i].loaded = true;
2440 shards[i].dirty = true;
2441 }
7c673cae
FG
2442 }
2443 dout(20) << __func__ << " fin " << sv << dendl;
2444 inline_bl.clear();
2445
2446 if (sv.empty()) {
2447 // no more shards; unspan all previously spanning blobs
2448 auto p = spanning_blob_map.begin();
2449 while (p != spanning_blob_map.end()) {
2450 p->second->id = -1;
2451 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2452 p = spanning_blob_map.erase(p);
2453 }
2454 } else {
2455 // identify new spanning blobs
2456 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2457 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2458 if (spanning_scan_begin < needs_reshard_begin) {
2459 fault_range(db, spanning_scan_begin,
2460 needs_reshard_begin - spanning_scan_begin);
2461 }
2462 if (spanning_scan_end > needs_reshard_end) {
2463 fault_range(db, needs_reshard_end,
31f18b77 2464 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2465 }
2466 auto sp = sv.begin() + si_begin;
2467 auto esp = sv.end();
2468 unsigned shard_start = sp->offset;
2469 unsigned shard_end;
2470 ++sp;
2471 if (sp == esp) {
2472 shard_end = OBJECT_MAX_SIZE;
2473 } else {
2474 shard_end = sp->offset;
2475 }
7c673cae
FG
2476 Extent dummy(needs_reshard_begin);
2477 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2478 if (e->logical_offset >= needs_reshard_end) {
2479 break;
2480 }
2481 dout(30) << " extent " << *e << dendl;
2482 while (e->logical_offset >= shard_end) {
2483 shard_start = shard_end;
11fdf7f2 2484 ceph_assert(sp != esp);
7c673cae
FG
2485 ++sp;
2486 if (sp == esp) {
2487 shard_end = OBJECT_MAX_SIZE;
2488 } else {
2489 shard_end = sp->offset;
2490 }
2491 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2492 << " to 0x" << shard_end << std::dec << dendl;
2493 }
2494 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2495 if (!e->blob->is_spanning()) {
2496 // We have two options: (1) split the blob into pieces at the
2497 // shard boundaries (and adjust extents accordingly), or (2)
2498 // mark it spanning. We prefer to cut the blob if we can. Note that
2499 // we may have to split it multiple times--potentially at every
2500 // shard boundary.
2501 bool must_span = false;
2502 BlobRef b = e->blob;
2503 if (b->can_split()) {
2504 uint32_t bstart = e->blob_start();
2505 uint32_t bend = e->blob_end();
2506 for (const auto& sh : shards) {
2507 if (bstart < sh.shard_info->offset &&
2508 bend > sh.shard_info->offset) {
2509 uint32_t blob_offset = sh.shard_info->offset - bstart;
2510 if (b->can_split_at(blob_offset)) {
2511 dout(20) << __func__ << " splitting blob, bstart 0x"
2512 << std::hex << bstart << " blob_offset 0x"
2513 << blob_offset << std::dec << " " << *b << dendl;
2514 b = split_blob(b, blob_offset, sh.shard_info->offset);
2515 // switch b to the new right-hand side, in case it
2516 // *also* has to get split.
2517 bstart += blob_offset;
2518 onode->c->store->logger->inc(l_bluestore_blob_split);
2519 } else {
2520 must_span = true;
2521 break;
2522 }
2523 }
2524 }
2525 } else {
2526 must_span = true;
2527 }
2528 if (must_span) {
31f18b77
FG
2529 auto bid = allocate_spanning_blob_id();
2530 b->id = bid;
7c673cae
FG
2531 spanning_blob_map[b->id] = b;
2532 dout(20) << __func__ << " adding spanning " << *b << dendl;
2533 }
2534 }
2535 } else {
2536 if (e->blob->is_spanning()) {
2537 spanning_blob_map.erase(e->blob->id);
2538 e->blob->id = -1;
2539 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2540 }
2541 }
2542 }
2543 }
2544
2545 clear_needs_reshard();
2546}
2547
2548bool BlueStore::ExtentMap::encode_some(
2549 uint32_t offset,
2550 uint32_t length,
2551 bufferlist& bl,
2552 unsigned *pn)
2553{
2554 auto cct = onode->c->store->cct; //used by dout
2555 Extent dummy(offset);
2556 auto start = extent_map.lower_bound(dummy);
2557 uint32_t end = offset + length;
2558
2559 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2560 // serialization only. Hence there is no specific
2561 // handling at ExtentMap level.
2562
2563 unsigned n = 0;
2564 size_t bound = 0;
7c673cae
FG
2565 bool must_reshard = false;
2566 for (auto p = start;
2567 p != extent_map.end() && p->logical_offset < end;
2568 ++p, ++n) {
11fdf7f2 2569 ceph_assert(p->logical_offset >= offset);
7c673cae
FG
2570 p->blob->last_encoded_id = -1;
2571 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2572 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2573 << std::dec << " hit new spanning blob " << *p << dendl;
2574 request_reshard(p->blob_start(), p->blob_end());
2575 must_reshard = true;
2576 }
31f18b77
FG
2577 if (!must_reshard) {
2578 denc_varint(0, bound); // blobid
2579 denc_varint(0, bound); // logical_offset
2580 denc_varint(0, bound); // len
2581 denc_varint(0, bound); // blob_offset
7c673cae 2582
31f18b77
FG
2583 p->blob->bound_encode(
2584 bound,
2585 struct_v,
2586 p->blob->shared_blob->get_sbid(),
2587 false);
2588 }
7c673cae
FG
2589 }
2590 if (must_reshard) {
2591 return true;
2592 }
2593
31f18b77
FG
2594 denc(struct_v, bound);
2595 denc_varint(0, bound); // number of extents
2596
7c673cae
FG
2597 {
2598 auto app = bl.get_contiguous_appender(bound);
2599 denc(struct_v, app);
2600 denc_varint(n, app);
2601 if (pn) {
2602 *pn = n;
2603 }
2604
2605 n = 0;
2606 uint64_t pos = 0;
2607 uint64_t prev_len = 0;
2608 for (auto p = start;
2609 p != extent_map.end() && p->logical_offset < end;
2610 ++p, ++n) {
2611 unsigned blobid;
2612 bool include_blob = false;
2613 if (p->blob->is_spanning()) {
2614 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2615 blobid |= BLOBID_FLAG_SPANNING;
2616 } else if (p->blob->last_encoded_id < 0) {
2617 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2618 include_blob = true;
2619 blobid = 0; // the decoder will infer the id from n
2620 } else {
2621 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2622 }
2623 if (p->logical_offset == pos) {
2624 blobid |= BLOBID_FLAG_CONTIGUOUS;
2625 }
2626 if (p->blob_offset == 0) {
2627 blobid |= BLOBID_FLAG_ZEROOFFSET;
2628 }
2629 if (p->length == prev_len) {
2630 blobid |= BLOBID_FLAG_SAMELENGTH;
2631 } else {
2632 prev_len = p->length;
2633 }
2634 denc_varint(blobid, app);
2635 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2636 denc_varint_lowz(p->logical_offset - pos, app);
2637 }
2638 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2639 denc_varint_lowz(p->blob_offset, app);
2640 }
2641 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2642 denc_varint_lowz(p->length, app);
2643 }
2644 pos = p->logical_end();
2645 if (include_blob) {
2646 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2647 }
2648 }
2649 }
2650 /*derr << __func__ << bl << dendl;
2651 derr << __func__ << ":";
2652 bl.hexdump(*_dout);
2653 *_dout << dendl;
2654 */
2655 return false;
2656}
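// Shape of the per-extent varint emitted above: the low bits of 'blobid'
// carry the CONTIGUOUS / ZEROOFFSET / SAMELENGTH / SPANNING flags, and the
// remaining bits (after BLOBID_SHIFT_BITS) carry either a spanning blob id
// or a 1-based back-reference to a blob already encoded in this shard
// (0 means the blob body follows inline). Fields whose flag is set are
// simply omitted from the stream and reconstructed by decode_some().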
2657
2658unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2659{
2660 auto cct = onode->c->store->cct; //used by dout
2661 /*
2662 derr << __func__ << ":";
2663 bl.hexdump(*_dout);
2664 *_dout << dendl;
2665 */
2666
11fdf7f2 2667 ceph_assert(bl.get_num_buffers() <= 1);
7c673cae
FG
2668 auto p = bl.front().begin_deep();
2669 __u8 struct_v;
2670 denc(struct_v, p);
2671 // Version 2 differs from v1 in blob's ref_map
2672 // serialization only. Hence there is no specific
2673 // handling at ExtentMap level below.
11fdf7f2 2674 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2675
2676 uint32_t num;
2677 denc_varint(num, p);
2678 vector<BlobRef> blobs(num);
2679 uint64_t pos = 0;
2680 uint64_t prev_len = 0;
2681 unsigned n = 0;
2682
2683 while (!p.end()) {
2684 Extent *le = new Extent();
2685 uint64_t blobid;
2686 denc_varint(blobid, p);
2687 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2688 uint64_t gap;
2689 denc_varint_lowz(gap, p);
2690 pos += gap;
2691 }
2692 le->logical_offset = pos;
2693 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2694 denc_varint_lowz(le->blob_offset, p);
2695 } else {
2696 le->blob_offset = 0;
2697 }
2698 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2699 denc_varint_lowz(prev_len, p);
2700 }
2701 le->length = prev_len;
2702
2703 if (blobid & BLOBID_FLAG_SPANNING) {
2704 dout(30) << __func__ << " getting spanning blob "
2705 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2706 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2707 } else {
2708 blobid >>= BLOBID_SHIFT_BITS;
2709 if (blobid) {
2710 le->assign_blob(blobs[blobid - 1]);
11fdf7f2 2711 ceph_assert(le->blob);
7c673cae
FG
2712 } else {
2713 Blob *b = new Blob();
2714 uint64_t sbid = 0;
2715 b->decode(onode->c, p, struct_v, &sbid, false);
2716 blobs[n] = b;
2717 onode->c->open_shared_blob(sbid, b);
2718 le->assign_blob(b);
2719 }
2720 // we build ref_map dynamically for non-spanning blobs
2721 le->blob->get_ref(
2722 onode->c,
2723 le->blob_offset,
2724 le->length);
2725 }
2726 pos += prev_len;
2727 ++n;
2728 extent_map.insert(*le);
2729 }
2730
11fdf7f2 2731 ceph_assert(n == num);
7c673cae
FG
2732 return num;
2733}
2734
2735void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2736{
2737 // Version 2 differs from v1 in blob's ref_map
2738 // serialization only. Hence there is no specific
2739 // handling at ExtentMap level.
2740 __u8 struct_v = 2;
2741
2742 denc(struct_v, p);
2743 denc_varint((uint32_t)0, p);
2744 size_t key_size = 0;
2745 denc_varint((uint32_t)0, key_size);
2746 p += spanning_blob_map.size() * key_size;
2747 for (const auto& i : spanning_blob_map) {
2748 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2749 }
2750}
2751
2752void BlueStore::ExtentMap::encode_spanning_blobs(
2753 bufferlist::contiguous_appender& p)
2754{
2755 // Version 2 differs from v1 in blob's ref_map
2756 // serialization only. Hence there is no specific
2757 // handling at ExtentMap level.
2758 __u8 struct_v = 2;
2759
2760 denc(struct_v, p);
2761 denc_varint(spanning_blob_map.size(), p);
2762 for (auto& i : spanning_blob_map) {
2763 denc_varint(i.second->id, p);
2764 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2765 }
2766}
2767
2768void BlueStore::ExtentMap::decode_spanning_blobs(
11fdf7f2 2769 bufferptr::const_iterator& p)
7c673cae
FG
2770{
2771 __u8 struct_v;
2772 denc(struct_v, p);
2773 // Version 2 differs from v1 in blob's ref_map
2774 // serialization only. Hence there is no specific
2775 // handling at ExtentMap level.
11fdf7f2 2776 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2777
2778 unsigned n;
2779 denc_varint(n, p);
2780 while (n--) {
2781 BlobRef b(new Blob());
2782 denc_varint(b->id, p);
2783 spanning_blob_map[b->id] = b;
2784 uint64_t sbid = 0;
2785 b->decode(onode->c, p, struct_v, &sbid, true);
2786 onode->c->open_shared_blob(sbid, b);
2787 }
2788}
2789
2790void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2791{
2792 shards.resize(onode->onode.extent_map_shards.size());
2793 unsigned i = 0;
2794 for (auto &s : onode->onode.extent_map_shards) {
2795 shards[i].shard_info = &s;
2796 shards[i].loaded = loaded;
2797 shards[i].dirty = dirty;
2798 ++i;
2799 }
2800}
2801
2802void BlueStore::ExtentMap::fault_range(
2803 KeyValueDB *db,
2804 uint32_t offset,
2805 uint32_t length)
2806{
2807 auto cct = onode->c->store->cct; //used by dout
2808 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2809 << std::dec << dendl;
2810 auto start = seek_shard(offset);
2811 auto last = seek_shard(offset + length);
2812
2813 if (start < 0)
2814 return;
2815
11fdf7f2 2816 ceph_assert(last >= start);
7c673cae
FG
2817 string key;
2818 while (start <= last) {
11fdf7f2 2819 ceph_assert((size_t)start < shards.size());
7c673cae
FG
2820 auto p = &shards[start];
2821 if (!p->loaded) {
2822 dout(30) << __func__ << " opening shard 0x" << std::hex
2823 << p->shard_info->offset << std::dec << dendl;
2824 bufferlist v;
2825 generate_extent_shard_key_and_apply(
2826 onode->key, p->shard_info->offset, &key,
2827 [&](const string& final_key) {
2828 int r = db->get(PREFIX_OBJ, final_key, &v);
2829 if (r < 0) {
2830 derr << __func__ << " missing shard 0x" << std::hex
2831 << p->shard_info->offset << std::dec << " for " << onode->oid
2832 << dendl;
11fdf7f2 2833 ceph_assert(r >= 0);
7c673cae
FG
2834 }
2835 }
2836 );
2837 p->extents = decode_some(v);
2838 p->loaded = true;
2839 dout(20) << __func__ << " open shard 0x" << std::hex
81eedcae
TL
2840 << p->shard_info->offset
2841 << " for range 0x" << offset << "~" << length << std::dec
7c673cae 2842 << " (" << v.length() << " bytes)" << dendl;
11fdf7f2
TL
2843 ceph_assert(p->dirty == false);
2844 ceph_assert(v.length() == p->shard_info->bytes);
7c673cae
FG
2845 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2846 } else {
2847 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2848 }
2849 ++start;
2850 }
2851}
2852
2853void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
2854 uint32_t offset,
2855 uint32_t length)
2856{
2857 auto cct = onode->c->store->cct; //used by dout
2858 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2859 << std::dec << dendl;
2860 if (shards.empty()) {
2861 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2862 inline_bl.clear();
2863 return;
2864 }
2865 auto start = seek_shard(offset);
11fdf7f2
TL
2866 if (length == 0) {
2867 length = 1;
2868 }
2869 auto last = seek_shard(offset + length - 1);
7c673cae
FG
2870 if (start < 0)
2871 return;
2872
11fdf7f2 2873 ceph_assert(last >= start);
7c673cae 2874 while (start <= last) {
11fdf7f2 2875 ceph_assert((size_t)start < shards.size());
7c673cae
FG
2876 auto p = &shards[start];
2877 if (!p->loaded) {
11fdf7f2
TL
 2878 derr << __func__ << " on write 0x" << std::hex << offset
2879 << "~" << length << " shard 0x" << p->shard_info->offset
2880 << std::dec << " is not loaded, can't mark dirty" << dendl;
2881 ceph_abort_msg("can't mark unloaded shard dirty");
7c673cae
FG
2882 }
2883 if (!p->dirty) {
2884 dout(20) << __func__ << " mark shard 0x" << std::hex
2885 << p->shard_info->offset << std::dec << " dirty" << dendl;
2886 p->dirty = true;
2887 }
2888 ++start;
2889 }
2890}
2891
2892BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2893 uint64_t offset)
2894{
2895 Extent dummy(offset);
2896 return extent_map.find(dummy);
2897}
2898
7c673cae
FG
2899BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2900 uint64_t offset)
2901{
2902 Extent dummy(offset);
2903 auto fp = extent_map.lower_bound(dummy);
2904 if (fp != extent_map.begin()) {
2905 --fp;
2906 if (fp->logical_end() <= offset) {
2907 ++fp;
2908 }
2909 }
2910 return fp;
2911}
2912
2913BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2914 uint64_t offset) const
2915{
2916 Extent dummy(offset);
2917 auto fp = extent_map.lower_bound(dummy);
2918 if (fp != extent_map.begin()) {
2919 --fp;
2920 if (fp->logical_end() <= offset) {
2921 ++fp;
2922 }
2923 }
2924 return fp;
2925}
2926
2927bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2928{
2929 auto fp = seek_lextent(offset);
2930 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2931 return false;
2932 }
2933 return true;
2934}
2935
2936int BlueStore::ExtentMap::compress_extent_map(
2937 uint64_t offset,
2938 uint64_t length)
2939{
2940 auto cct = onode->c->store->cct; //used by dout
2941 if (extent_map.empty())
2942 return 0;
2943 int removed = 0;
2944 auto p = seek_lextent(offset);
2945 if (p != extent_map.begin()) {
2946 --p; // start to the left of offset
2947 }
2948 // the caller should have just written to this region
11fdf7f2 2949 ceph_assert(p != extent_map.end());
7c673cae
FG
2950
2951 // identify the *next* shard
2952 auto pshard = shards.begin();
2953 while (pshard != shards.end() &&
2954 p->logical_offset >= pshard->shard_info->offset) {
2955 ++pshard;
2956 }
2957 uint64_t shard_end;
2958 if (pshard != shards.end()) {
2959 shard_end = pshard->shard_info->offset;
2960 } else {
2961 shard_end = OBJECT_MAX_SIZE;
2962 }
2963
2964 auto n = p;
2965 for (++n; n != extent_map.end(); p = n++) {
2966 if (n->logical_offset > offset + length) {
2967 break; // stop after end
2968 }
2969 while (n != extent_map.end() &&
2970 p->logical_end() == n->logical_offset &&
2971 p->blob == n->blob &&
2972 p->blob_offset + p->length == n->blob_offset &&
2973 n->logical_offset < shard_end) {
2974 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2975 << " next shard 0x" << shard_end << std::dec
2976 << " merging " << *p << " and " << *n << dendl;
2977 p->length += n->length;
2978 rm(n++);
2979 ++removed;
2980 }
2981 if (n == extent_map.end()) {
2982 break;
2983 }
2984 if (n->logical_offset >= shard_end) {
11fdf7f2 2985 ceph_assert(pshard != shards.end());
7c673cae
FG
2986 ++pshard;
2987 if (pshard != shards.end()) {
2988 shard_end = pshard->shard_info->offset;
2989 } else {
2990 shard_end = OBJECT_MAX_SIZE;
2991 }
2992 }
2993 }
11fdf7f2 2994 if (removed) {
7c673cae
FG
2995 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2996 }
2997 return removed;
2998}
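// Example of the merge above: two lextents "0x0~0x1000 -> blob A @ 0x0" and
// "0x1000~0x1000 -> blob A @ 0x1000" are logically and physically
// contiguous, so they collapse into a single "0x0~0x2000" extent; if the
// second one starts at or beyond the next shard boundary the pair is left
// alone, so that shards remain independently decodable.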
2999
3000void BlueStore::ExtentMap::punch_hole(
3001 CollectionRef &c,
3002 uint64_t offset,
3003 uint64_t length,
3004 old_extent_map_t *old_extents)
3005{
3006 auto p = seek_lextent(offset);
3007 uint64_t end = offset + length;
3008 while (p != extent_map.end()) {
3009 if (p->logical_offset >= end) {
3010 break;
3011 }
3012 if (p->logical_offset < offset) {
3013 if (p->logical_end() > end) {
3014 // split and deref middle
3015 uint64_t front = offset - p->logical_offset;
3016 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3017 length, p->blob);
3018 old_extents->push_back(*oe);
3019 add(end,
3020 p->blob_offset + front + length,
3021 p->length - front - length,
3022 p->blob);
3023 p->length = front;
3024 break;
3025 } else {
3026 // deref tail
11fdf7f2 3027 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
7c673cae
FG
3028 uint64_t keep = offset - p->logical_offset;
3029 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3030 p->length - keep, p->blob);
3031 old_extents->push_back(*oe);
3032 p->length = keep;
3033 ++p;
3034 continue;
3035 }
3036 }
3037 if (p->logical_offset + p->length <= end) {
3038 // deref whole lextent
3039 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3040 p->length, p->blob);
3041 old_extents->push_back(*oe);
3042 rm(p++);
3043 continue;
3044 }
3045 // deref head
3046 uint64_t keep = p->logical_end() - end;
3047 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3048 p->length - keep, p->blob);
3049 old_extents->push_back(*oe);
3050
3051 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3052 rm(p);
3053 break;
3054 }
3055}
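// Worked example of the cases above (illustrative): with a single lextent
// 0x0~0x4000 on blob b, punch_hole(0x1000, 0x2000) takes the split-middle
// branch: an OldExtent for 0x1000~0x2000 is queued for deref, the original
// extent is trimmed to 0x0~0x1000, and a new extent 0x3000~0x1000
// (blob_offset 0x3000) is re-added for the surviving tail.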
3056
3057BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3058 CollectionRef &c,
3059 uint64_t logical_offset,
3060 uint64_t blob_offset, uint64_t length, BlobRef b,
3061 old_extent_map_t *old_extents)
3062{
 3063 // We need a completely initialized Blob to increment its ref counters.
11fdf7f2 3064 ceph_assert(b->get_blob().get_logical_length() != 0);
7c673cae
FG
3065
 3066 // Do get_ref prior to punch_hole to prevent putting a reused blob into
 3067 // the old_extents list if we overwrite the blob completely.
 3068 // This might happen during a WAL overwrite.
3069 b->get_ref(onode->c, blob_offset, length);
3070
3071 if (old_extents) {
3072 punch_hole(c, logical_offset, length, old_extents);
3073 }
3074
3075 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3076 extent_map.insert(*le);
3077 if (spans_shard(logical_offset, length)) {
3078 request_reshard(logical_offset, logical_offset + length);
3079 }
3080 return le;
3081}
3082
3083BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3084 BlobRef lb,
3085 uint32_t blob_offset,
3086 uint32_t pos)
3087{
3088 auto cct = onode->c->store->cct; //used by dout
3089
3090 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3091 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3092 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3093 << dendl;
3094 BlobRef rb = onode->c->new_blob();
3095 lb->split(onode->c, blob_offset, rb.get());
3096
3097 for (auto ep = seek_lextent(pos);
3098 ep != extent_map.end() && ep->logical_offset < end_pos;
3099 ++ep) {
3100 if (ep->blob != lb) {
3101 continue;
3102 }
3103 if (ep->logical_offset < pos) {
3104 // split extent
3105 size_t left = pos - ep->logical_offset;
3106 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3107 extent_map.insert(*ne);
3108 ep->length = left;
3109 dout(30) << __func__ << " split " << *ep << dendl;
3110 dout(30) << __func__ << " to " << *ne << dendl;
3111 } else {
3112 // switch blob
11fdf7f2 3113 ceph_assert(ep->blob_offset >= blob_offset);
7c673cae
FG
3114
3115 ep->blob = rb;
3116 ep->blob_offset -= blob_offset;
3117 dout(30) << __func__ << " adjusted " << *ep << dendl;
3118 }
3119 }
3120 return rb;
3121}
3122
3123// Onode
3124
3125#undef dout_prefix
3126#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3127
3128void BlueStore::Onode::flush()
3129{
3130 if (flushing_count.load()) {
3131 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
11fdf7f2 3132 std::unique_lock l(flush_lock);
7c673cae
FG
3133 while (flushing_count.load()) {
3134 flush_cond.wait(l);
3135 }
3136 }
3137 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3138}
3139
3140// =======================================================
3141// WriteContext
3142
3143/// Checks for writes to the same pextent within a blob
3144bool BlueStore::WriteContext::has_conflict(
3145 BlobRef b,
3146 uint64_t loffs,
3147 uint64_t loffs_end,
3148 uint64_t min_alloc_size)
3149{
11fdf7f2
TL
3150 ceph_assert((loffs % min_alloc_size) == 0);
3151 ceph_assert((loffs_end % min_alloc_size) == 0);
7c673cae
FG
3152 for (auto w : writes) {
3153 if (b == w.b) {
11fdf7f2
TL
3154 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3155 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
7c673cae 3156 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 3157 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
3158 return true;
3159 }
3160 }
3161 }
3162 return false;
3163}
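// Example of the check above: with min_alloc_size 0x10000, a queued write
// to the same blob at logical 0x12000~0x2000 occupies the aligned window
// [0x10000, 0x20000); a new write covering the aligned range
// [0x10000, 0x20000) on the same blob therefore reports a conflict,
// because both land in the same allocation unit.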
3164
3165// =======================================================
3166
3167// DeferredBatch
3168#undef dout_prefix
3169#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3170
3171void BlueStore::DeferredBatch::prepare_write(
3172 CephContext *cct,
3173 uint64_t seq, uint64_t offset, uint64_t length,
3174 bufferlist::const_iterator& blp)
3175{
3176 _discard(cct, offset, length);
3177 auto i = iomap.insert(make_pair(offset, deferred_io()));
11fdf7f2 3178 ceph_assert(i.second); // this should be a new insertion
7c673cae
FG
3179 i.first->second.seq = seq;
3180 blp.copy(length, i.first->second.bl);
31f18b77
FG
3181 i.first->second.bl.reassign_to_mempool(
3182 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3183 dout(20) << __func__ << " seq " << seq
3184 << " 0x" << std::hex << offset << "~" << length
3185 << " crc " << i.first->second.bl.crc32c(-1)
3186 << std::dec << dendl;
3187 seq_bytes[seq] += length;
3188#ifdef DEBUG_DEFERRED
3189 _audit(cct);
3190#endif
3191}
3192
3193void BlueStore::DeferredBatch::_discard(
3194 CephContext *cct, uint64_t offset, uint64_t length)
3195{
3196 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3197 << std::dec << dendl;
3198 auto p = iomap.lower_bound(offset);
3199 if (p != iomap.begin()) {
3200 --p;
3201 auto end = p->first + p->second.bl.length();
3202 if (end > offset) {
3203 bufferlist head;
3204 head.substr_of(p->second.bl, 0, offset - p->first);
3205 dout(20) << __func__ << " keep head " << p->second.seq
3206 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3207 << " -> 0x" << head.length() << std::dec << dendl;
3208 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3209 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3210 if (end > offset + length) {
3211 bufferlist tail;
3212 tail.substr_of(p->second.bl, offset + length - p->first,
3213 end - (offset + length));
3214 dout(20) << __func__ << " keep tail " << p->second.seq
3215 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3216 << " -> 0x" << tail.length() << std::dec << dendl;
3217 auto &n = iomap[offset + length];
3218 n.bl.swap(tail);
3219 n.seq = p->second.seq;
3220 i->second -= length;
3221 } else {
3222 i->second -= end - offset;
3223 }
11fdf7f2 3224 ceph_assert(i->second >= 0);
7c673cae
FG
3225 p->second.bl.swap(head);
3226 }
3227 ++p;
3228 }
3229 while (p != iomap.end()) {
3230 if (p->first >= offset + length) {
3231 break;
3232 }
3233 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3234 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3235 auto end = p->first + p->second.bl.length();
3236 if (end > offset + length) {
3237 unsigned drop_front = offset + length - p->first;
3238 unsigned keep_tail = end - (offset + length);
3239 dout(20) << __func__ << " truncate front " << p->second.seq
3240 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3241 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3242 << " to 0x" << (offset + length) << "~" << keep_tail
3243 << std::dec << dendl;
3244 auto &s = iomap[offset + length];
3245 s.seq = p->second.seq;
3246 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3247 i->second -= drop_front;
3248 } else {
3249 dout(20) << __func__ << " drop " << p->second.seq
3250 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3251 << std::dec << dendl;
3252 i->second -= p->second.bl.length();
3253 }
11fdf7f2 3254 ceph_assert(i->second >= 0);
7c673cae
FG
3255 p = iomap.erase(p);
3256 }
3257}
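// Example of the trimming above: if the batch already holds an io at
// 0x0~0x4000 and a new write is prepared at 0x1000~0x2000, _discard keeps
// the head 0x0~0x1000 under the old sequence number, re-inserts the tail
// as a new entry at 0x3000~0x1000, and decrements seq_bytes for the old
// seq by the 0x2000 bytes that were superseded.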
3258
3259void BlueStore::DeferredBatch::_audit(CephContext *cct)
3260{
3261 map<uint64_t,int> sb;
3262 for (auto p : seq_bytes) {
3263 sb[p.first] = 0; // make sure we have the same set of keys
3264 }
3265 uint64_t pos = 0;
3266 for (auto& p : iomap) {
11fdf7f2 3267 ceph_assert(p.first >= pos);
7c673cae
FG
3268 sb[p.second.seq] += p.second.bl.length();
3269 pos = p.first + p.second.bl.length();
3270 }
11fdf7f2 3271 ceph_assert(sb == seq_bytes);
7c673cae
FG
3272}
3273
3274
3275// Collection
3276
3277#undef dout_prefix
3278#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3279
11fdf7f2
TL
3280BlueStore::Collection::Collection(BlueStore *store_, Cache *c, coll_t cid)
3281 : CollectionImpl(cid),
3282 store(store_),
7c673cae 3283 cache(c),
7c673cae
FG
3284 lock("BlueStore::Collection::lock", true, false),
3285 exists(true),
11fdf7f2
TL
3286 onode_map(c),
3287 commit_queue(nullptr)
3288{
3289}
3290
3291bool BlueStore::Collection::flush_commit(Context *c)
3292{
3293 return osr->flush_commit(c);
3294}
3295
3296void BlueStore::Collection::flush()
3297{
3298 osr->flush();
3299}
3300
3301void BlueStore::Collection::flush_all_but_last()
7c673cae 3302{
11fdf7f2 3303 osr->flush_all_but_last();
7c673cae
FG
3304}
3305
3306void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3307{
11fdf7f2 3308 ceph_assert(!b->shared_blob);
7c673cae
FG
3309 const bluestore_blob_t& blob = b->get_blob();
3310 if (!blob.is_shared()) {
3311 b->shared_blob = new SharedBlob(this);
3312 return;
3313 }
3314
3315 b->shared_blob = shared_blob_set.lookup(sbid);
3316 if (b->shared_blob) {
3317 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3318 << std::dec << " had " << *b->shared_blob << dendl;
3319 } else {
3320 b->shared_blob = new SharedBlob(sbid, this);
3321 shared_blob_set.add(this, b->shared_blob.get());
3322 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3323 << std::dec << " opened " << *b->shared_blob
3324 << dendl;
3325 }
3326}
3327
3328void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3329{
3330 if (!sb->is_loaded()) {
3331
3332 bufferlist v;
3333 string key;
3334 auto sbid = sb->get_sbid();
3335 get_shared_blob_key(sbid, &key);
3336 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3337 if (r < 0) {
3338 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3339 << std::dec << " not found at key "
3340 << pretty_binary_string(key) << dendl;
11fdf7f2 3341 ceph_abort_msg("uh oh, missing shared_blob");
7c673cae
FG
3342 }
3343
3344 sb->loaded = true;
3345 sb->persistent = new bluestore_shared_blob_t(sbid);
11fdf7f2
TL
3346 auto p = v.cbegin();
3347 decode(*(sb->persistent), p);
7c673cae
FG
3348 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3349 << std::dec << " loaded shared_blob " << *sb << dendl;
3350 }
3351}
3352
3353void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3354{
7c673cae 3355 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
11fdf7f2 3356 ceph_assert(!b->shared_blob->is_loaded());
7c673cae
FG
3357
3358 // update blob
31f18b77 3359 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 3360 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
3361
3362 // update shared blob
3363 b->shared_blob->loaded = true;
3364 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3365 shared_blob_set.add(this, b->shared_blob.get());
3366 for (auto p : blob.get_extents()) {
3367 if (p.is_valid()) {
3368 b->shared_blob->get_ref(
3369 p.offset,
3370 p.length);
3371 }
3372 }
3373 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3374}
3375
31f18b77
FG
3376uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3377{
3378 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
11fdf7f2 3379 ceph_assert(sb->is_loaded());
31f18b77
FG
3380
3381 uint64_t sbid = sb->get_sbid();
3382 shared_blob_set.remove(sb);
3383 sb->loaded = false;
3384 delete sb->persistent;
3385 sb->sbid_unloaded = 0;
3386 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3387 return sbid;
3388}
3389
7c673cae
FG
3390BlueStore::OnodeRef BlueStore::Collection::get_onode(
3391 const ghobject_t& oid,
3392 bool create)
3393{
11fdf7f2 3394 ceph_assert(create ? lock.is_wlocked() : lock.is_locked());
7c673cae
FG
3395
3396 spg_t pgid;
3397 if (cid.is_pg(&pgid)) {
3398 if (!oid.match(cnode.bits, pgid.ps())) {
3399 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3400 << pgid << " bits " << cnode.bits << dendl;
3401 ceph_abort();
3402 }
3403 }
3404
3405 OnodeRef o = onode_map.lookup(oid);
3406 if (o)
3407 return o;
3408
31f18b77 3409 mempool::bluestore_cache_other::string key;
7c673cae
FG
3410 get_object_key(store->cct, oid, &key);
3411
3412 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3413 << pretty_binary_string(key) << dendl;
3414
3415 bufferlist v;
3416 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3417 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3418 Onode *on;
3419 if (v.length() == 0) {
11fdf7f2 3420 ceph_assert(r == -ENOENT);
7c673cae
FG
3421 if (!store->cct->_conf->bluestore_debug_misc &&
3422 !create)
3423 return OnodeRef();
3424
3425 // new object, new onode
3426 on = new Onode(this, oid, key);
3427 } else {
3428 // loaded
11fdf7f2 3429 ceph_assert(r >= 0);
7c673cae
FG
3430 on = new Onode(this, oid, key);
3431 on->exists = true;
11fdf7f2 3432 auto p = v.front().begin_deep();
7c673cae 3433 on->onode.decode(p);
3efd9988
FG
3434 for (auto& i : on->onode.attrs) {
3435 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3436 }
7c673cae
FG
3437
3438 // initialize extent_map
3439 on->extent_map.decode_spanning_blobs(p);
3440 if (on->onode.extent_map_shards.empty()) {
3441 denc(on->extent_map.inline_bl, p);
3442 on->extent_map.decode_some(on->extent_map.inline_bl);
3efd9988
FG
3443 on->extent_map.inline_bl.reassign_to_mempool(
3444 mempool::mempool_bluestore_cache_other);
7c673cae
FG
3445 } else {
3446 on->extent_map.init_shards(false, false);
3447 }
3448 }
3449 o.reset(on);
3450 return onode_map.add(oid, o);
3451}
3452
3453void BlueStore::Collection::split_cache(
3454 Collection *dest)
3455{
3456 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3457
3458 // lock (one or both) cache shards
3459 std::lock(cache->lock, dest->cache->lock);
11fdf7f2
TL
3460 std::lock_guard l(cache->lock, std::adopt_lock);
3461 std::lock_guard l2(dest->cache->lock, std::adopt_lock);
7c673cae
FG
3462
3463 int destbits = dest->cnode.bits;
3464 spg_t destpg;
3465 bool is_pg = dest->cid.is_pg(&destpg);
11fdf7f2 3466 ceph_assert(is_pg);
7c673cae
FG
3467
3468 auto p = onode_map.onode_map.begin();
3469 while (p != onode_map.onode_map.end()) {
11fdf7f2 3470 OnodeRef o = p->second;
7c673cae
FG
3471 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3472 // onode does not belong to this child
11fdf7f2
TL
3473 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
3474 << dendl;
7c673cae
FG
3475 ++p;
3476 } else {
7c673cae
FG
3477 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3478 << dendl;
3479
3480 cache->_rm_onode(p->second);
3481 p = onode_map.onode_map.erase(p);
3482
3483 o->c = dest;
3484 dest->cache->_add_onode(o, 1);
3485 dest->onode_map.onode_map[o->oid] = o;
3486 dest->onode_map.cache = dest->cache;
3487
3488 // move over shared blobs and buffers. cover shared blobs from
3489 // both extent map and spanning blob map (the full extent map
3490 // may not be faulted in)
3491 vector<SharedBlob*> sbvec;
3492 for (auto& e : o->extent_map.extent_map) {
3493 sbvec.push_back(e.blob->shared_blob.get());
3494 }
3495 for (auto& b : o->extent_map.spanning_blob_map) {
3496 sbvec.push_back(b.second->shared_blob.get());
3497 }
3498 for (auto sb : sbvec) {
3499 if (sb->coll == dest) {
3500 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3501 << dendl;
3502 continue;
3503 }
3504 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
31f18b77
FG
3505 if (sb->get_sbid()) {
3506 ldout(store->cct, 20) << __func__
3507 << " moving registration " << *sb << dendl;
3508 shared_blob_set.remove(sb);
3509 dest->shared_blob_set.add(dest, sb);
3510 }
3efd9988 3511 sb->coll = dest;
7c673cae 3512 if (dest->cache != cache) {
7c673cae
FG
3513 for (auto& i : sb->bc.buffer_map) {
3514 if (!i.second->is_writing()) {
3515 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3516 << dendl;
3517 dest->cache->_move_buffer(cache, i.second.get());
3518 }
3519 }
3520 }
3521 }
7c673cae
FG
3522 }
3523 }
3524}
3525
7c673cae
FG
3526// =======================================================
3527
91327a77
AA
3528// MempoolThread
3529
3530#undef dout_prefix
3531#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
3532
7c673cae
FG
3533void *BlueStore::MempoolThread::entry()
3534{
11fdf7f2
TL
3535 std::unique_lock l(lock);
3536
3537 std::list<std::shared_ptr<PriorityCache::PriCache>> caches;
3538 binned_kv_cache = store->db->get_priority_cache();
3539 if (binned_kv_cache != nullptr) {
3540 caches.push_back(binned_kv_cache);
3541 }
3542 caches.push_back(meta_cache);
3543 caches.push_back(data_cache);
31f18b77 3544
91327a77
AA
3545 autotune_cache_size = store->osd_memory_cache_min;
3546
3547 utime_t next_balance = ceph_clock_now();
3548 utime_t next_resize = ceph_clock_now();
31f18b77 3549
91327a77
AA
3550 bool interval_stats_trim = false;
3551 bool interval_stats_resize = false;
3552 while (!stop) {
91327a77
AA
3553 // Before we trim, check and see if it's time to rebalance/resize.
3554 double autotune_interval = store->cache_autotune_interval;
3555 double resize_interval = store->osd_memory_cache_resize_interval;
3556
3557 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
11fdf7f2
TL
3558 _adjust_cache_settings();
3559
91327a77
AA
3560 // Log events at 5 instead of 20 when balance happens.
3561 interval_stats_resize = true;
3562 interval_stats_trim = true;
3563 if (store->cache_autotune) {
3564 _balance_cache(caches);
3565 }
31f18b77 3566
91327a77
AA
3567 next_balance = ceph_clock_now();
3568 next_balance += autotune_interval;
3569 }
3570 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
3571 if (ceph_using_tcmalloc() && store->cache_autotune) {
3572 _tune_cache_size(interval_stats_resize);
3573 interval_stats_resize = false;
3574 }
3575 next_resize = ceph_clock_now();
3576 next_resize += resize_interval;
31f18b77
FG
3577 }
3578
91327a77
AA
3579 // Now Trim
3580 _trim_shards(interval_stats_trim);
3581 interval_stats_trim = false;
31f18b77 3582
91327a77 3583 store->_update_cache_logger();
11fdf7f2
TL
3584 auto wait = ceph::make_timespan(
3585 store->cct->_conf->bluestore_cache_trim_interval);
3586 cond.wait_for(l, wait);
7c673cae
FG
3587 }
3588 stop = false;
3589 return NULL;
3590}
3591
91327a77
AA
3592void BlueStore::MempoolThread::_adjust_cache_settings()
3593{
11fdf7f2
TL
3594 if (binned_kv_cache != nullptr) {
3595 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
3596 }
3597 meta_cache->set_cache_ratio(store->cache_meta_ratio);
3598 data_cache->set_cache_ratio(store->cache_data_ratio);
91327a77
AA
3599}
3600
3601void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
3602{
3603 auto cct = store->cct;
3604 size_t num_shards = store->cache_shards.size();
3605
3606 int64_t kv_used = store->db->get_cache_usage();
11fdf7f2
TL
3607 int64_t meta_used = meta_cache->_get_used_bytes();
3608 int64_t data_used = data_cache->_get_used_bytes();
91327a77
AA
3609
3610 uint64_t cache_size = store->cache_size;
3611 int64_t kv_alloc =
11fdf7f2 3612 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
91327a77 3613 int64_t meta_alloc =
11fdf7f2 3614 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
91327a77 3615 int64_t data_alloc =
11fdf7f2 3616 static_cast<int64_t>(store->cache_data_ratio * cache_size);
91327a77 3617
11fdf7f2 3618 if (binned_kv_cache != nullptr && store->cache_autotune) {
91327a77
AA
3619 cache_size = autotune_cache_size;
3620
11fdf7f2
TL
3621 kv_alloc = binned_kv_cache->get_committed_size();
3622 meta_alloc = meta_cache->get_committed_size();
3623 data_alloc = data_cache->get_committed_size();
91327a77
AA
3624 }
3625
3626 if (interval_stats) {
3627 ldout(cct, 5) << __func__ << " cache_size: " << cache_size
3628 << " kv_alloc: " << kv_alloc
3629 << " kv_used: " << kv_used
3630 << " meta_alloc: " << meta_alloc
3631 << " meta_used: " << meta_used
3632 << " data_alloc: " << data_alloc
3633 << " data_used: " << data_used << dendl;
3634 } else {
3635 ldout(cct, 20) << __func__ << " cache_size: " << cache_size
3636 << " kv_alloc: " << kv_alloc
3637 << " kv_used: " << kv_used
3638 << " meta_alloc: " << meta_alloc
3639 << " meta_used: " << meta_used
3640 << " data_alloc: " << data_alloc
3641 << " data_used: " << data_used << dendl;
3642 }
3643
3644 uint64_t max_shard_onodes = static_cast<uint64_t>(
11fdf7f2 3645 (meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode());
91327a77
AA
3646 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);
3647
3648 ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
3649 << " max_shard_buffer: " << max_shard_buffer << dendl;
3650
3651 for (auto i : store->cache_shards) {
3652 i->trim(max_shard_onodes, max_shard_buffer);
3653 }
3654}
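// Worked example of the shard caps computed above (illustrative numbers
// only, assuming ~4 KiB per onode as reported by get_bytes_per_onode()):
// with meta_alloc = 1 GiB and 8 cache shards, each shard is trimmed to
// roughly (1 GiB / 8) / 4 KiB = 32768 onodes, and with data_alloc = 512 MiB
// each shard keeps at most 64 MiB of buffer data.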
3655
3656void BlueStore::MempoolThread::_tune_cache_size(bool interval_stats)
3657{
3658 auto cct = store->cct;
3659 uint64_t target = store->osd_memory_target;
3660 uint64_t base = store->osd_memory_base;
3661 double fragmentation = store->osd_memory_expected_fragmentation;
91327a77 3662 uint64_t cache_min = store->osd_memory_cache_min;
f64942e4
AA
3663 uint64_t cache_max = cache_min;
3664 uint64_t limited_target = (1.0 - fragmentation) * target;
3665 if (limited_target > base + cache_min) {
3666 cache_max = limited_target - base;
3667 }
91327a77
AA
3668
3669 size_t heap_size = 0;
3670 size_t unmapped = 0;
3671 uint64_t mapped = 0;
3672
3673 ceph_heap_release_free_memory();
3674 ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
3675 ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
3676 mapped = heap_size - unmapped;
3677
3678 uint64_t new_size = autotune_cache_size;
3679 new_size = (new_size < cache_max) ? new_size : cache_max;
3680 new_size = (new_size > cache_min) ? new_size : cache_min;
3681
3682 // Approach the min/max slowly, but bounce away quickly.
3683 if ((uint64_t) mapped < target) {
3684 double ratio = 1 - ((double) mapped / target);
3685 new_size += ratio * (cache_max - new_size);
3686 } else {
3687 double ratio = 1 - ((double) target / mapped);
3688 new_size -= ratio * (new_size - cache_min);
3689 }
3690
3691 if (interval_stats) {
3692 ldout(cct, 5) << __func__
3693 << " target: " << target
3694 << " heap: " << heap_size
3695 << " unmapped: " << unmapped
3696 << " mapped: " << mapped
3697 << " old cache_size: " << autotune_cache_size
3698 << " new cache size: " << new_size << dendl;
3699 } else {
3700 ldout(cct, 20) << __func__
3701 << " target: " << target
3702 << " heap: " << heap_size
3703 << " unmapped: " << unmapped
3704 << " mapped: " << mapped
3705 << " old cache_size: " << autotune_cache_size
3706 << " new cache size: " << new_size << dendl;
3707 }
3708 autotune_cache_size = new_size;
3709}
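// Example of the asymmetric adjustment above (illustrative numbers only):
// with osd_memory_target = 4 GiB and mapped = 2 GiB, ratio = 1 - 2/4 = 0.5,
// so the autotuned size grows half of the remaining distance toward
// cache_max; with mapped = 5 GiB (over target), ratio = 1 - 4/5 = 0.2, so it
// shrinks 20% of the distance toward cache_min. Correction is therefore
// proportional to how far mapped memory has drifted from the target.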
3710
3711void BlueStore::MempoolThread::_balance_cache(
11fdf7f2 3712 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches)
91327a77
AA
3713{
3714 int64_t mem_avail = autotune_cache_size;
11fdf7f2
TL
 3715	  /* Each cache is going to get at least 1 chunk's worth of memory from get_chunk,
 3716	   * so shrink the available memory here to compensate. Don't shrink the amount of
 3717	   * memory below 0, however.
 3718	   */
3719 mem_avail -= PriorityCache::get_chunk(1, autotune_cache_size) * caches.size();
3720 if (mem_avail < 0) {
3721 mem_avail = 0;
3722 }
91327a77
AA
3723
3724 // Assign memory for each priority level
3725 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
3726 ldout(store->cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;
3727 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
3728 _balance_cache_pri(&mem_avail, caches, pri);
3729 }
3730 // Assign any leftover memory based on the default ratios.
3731 if (mem_avail > 0) {
3732 for (auto it = caches.begin(); it != caches.end(); it++) {
3733 int64_t fair_share =
3734 static_cast<int64_t>((*it)->get_cache_ratio() * mem_avail);
3735 if (fair_share > 0) {
3736 (*it)->add_cache_bytes(PriorityCache::Priority::LAST, fair_share);
3737 }
3738 }
3739 }
3740 // assert if we assigned more memory than is available.
11fdf7f2 3741 ceph_assert(mem_avail >= 0);
91327a77
AA
3742
3743 // Finally commit the new cache sizes
3744 for (auto it = caches.begin(); it != caches.end(); it++) {
11fdf7f2 3745 (*it)->commit_cache_size(autotune_cache_size);
91327a77
AA
3746 }
3747}
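// Example of the leftover pass above (illustrative numbers only): if
// 100 MiB remains after all priorities have been satisfied and the
// configured ratios are kv = 0.5, meta = 0.3 and data = 0.2, the remainder
// is handed out as 50/30/20 MiB at Priority::LAST before the new sizes are
// committed.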
3748
3749void BlueStore::MempoolThread::_balance_cache_pri(int64_t *mem_avail,
11fdf7f2
TL
3750 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches,
3751 PriorityCache::Priority pri)
91327a77 3752{
11fdf7f2 3753 std::list<std::shared_ptr<PriorityCache::PriCache>> tmp_caches = caches;
91327a77
AA
3754 double cur_ratios = 0;
3755 double new_ratios = 0;
3756
3757 // Zero this priority's bytes, sum the initial ratios.
3758 for (auto it = tmp_caches.begin(); it != tmp_caches.end(); it++) {
3759 (*it)->set_cache_bytes(pri, 0);
3760 cur_ratios += (*it)->get_cache_ratio();
3761 }
3762
 3763	  // For this priority, loop until caches are satisfied or we run out of memory.
3764 // Since we can't allocate fractional bytes, stop if we have fewer bytes left
3765 // than the number of participating caches.
3766 while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
3767 uint64_t total_assigned = 0;
3768
3769 for (auto it = tmp_caches.begin(); it != tmp_caches.end(); ) {
11fdf7f2 3770 int64_t cache_wants = (*it)->request_cache_bytes(pri, autotune_cache_size);
91327a77
AA
3771
3772 // Usually the ratio should be set to the fraction of the current caches'
3773 // assigned ratio compared to the total ratio of all caches that still
3774 // want memory. There is a special case where the only caches left are
3775 // all assigned 0% ratios but still want memory. In that case, give
3776 // them an equal shot at the remaining memory for this priority.
3777 double ratio = 1.0 / tmp_caches.size();
3778 if (cur_ratios > 0) {
3779 ratio = (*it)->get_cache_ratio() / cur_ratios;
3780 }
3781 int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
3782
3783 if (cache_wants > fair_share) {
3784 // If we want too much, take what we can get but stick around for more
3785 (*it)->add_cache_bytes(pri, fair_share);
3786 total_assigned += fair_share;
3787
3788 new_ratios += (*it)->get_cache_ratio();
3789 ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
3790 << " wanted: " << cache_wants << " fair_share: " << fair_share
3791 << " mem_avail: " << *mem_avail
3792 << " staying in list. Size: " << tmp_caches.size()
3793 << dendl;
3794 ++it;
3795 } else {
3796 // Otherwise assign only what we want
3797 if (cache_wants > 0) {
3798 (*it)->add_cache_bytes(pri, cache_wants);
3799 total_assigned += cache_wants;
3800
3801 ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
3802 << " wanted: " << cache_wants << " fair_share: " << fair_share
3803 << " mem_avail: " << *mem_avail
3804 << " removing from list. New size: " << tmp_caches.size() - 1
3805 << dendl;
3806
3807 }
3808 // Either the cache didn't want anything or got what it wanted, so remove it from the tmp list.
3809 it = tmp_caches.erase(it);
3810 }
3811 }
3812 // Reset the ratios
3813 *mem_avail -= total_assigned;
3814 cur_ratios = new_ratios;
3815 new_ratios = 0;
3816 }
3817}
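// Example of a single-pass balance at one priority (illustrative numbers
// only): with *mem_avail = 100 MiB, two caches with equal ratios, and wants
// of 40 MiB (kv) and 30 MiB (meta), each cache's fair_share is 50 MiB, both
// wants fit, both caches are removed from tmp_caches, and the remaining
// 30 MiB is carried forward to the next priority (and eventually to the
// leftover pass in _balance_cache() above).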
3818
7c673cae
FG
3819// =======================================================
3820
31f18b77
FG
3821// OmapIteratorImpl
3822
3823#undef dout_prefix
3824#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3825
3826BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3827 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3828 : c(c), o(o), it(it)
3829{
3830 RWLock::RLocker l(c->lock);
3831 if (o->onode.has_omap()) {
3832 get_omap_key(o->onode.nid, string(), &head);
3833 get_omap_tail(o->onode.nid, &tail);
3834 it->lower_bound(head);
3835 }
3836}
3837
11fdf7f2
TL
3838string BlueStore::OmapIteratorImpl::_stringify() const
3839{
3840 stringstream s;
3841 s << " omap_iterator(cid = " << c->cid
3842 <<", oid = " << o->oid << ")";
3843 return s.str();
3844}
3845
31f18b77
FG
3846int BlueStore::OmapIteratorImpl::seek_to_first()
3847{
3848 RWLock::RLocker l(c->lock);
11fdf7f2 3849 auto start1 = mono_clock::now();
31f18b77
FG
3850 if (o->onode.has_omap()) {
3851 it->lower_bound(head);
3852 } else {
3853 it = KeyValueDB::Iterator();
3854 }
494da23a
TL
3855 c->store->log_latency(
3856 __func__,
11fdf7f2
TL
3857 l_bluestore_omap_seek_to_first_lat,
3858 mono_clock::now() - start1,
494da23a 3859 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2 3860
31f18b77
FG
3861 return 0;
3862}
3863
3864int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3865{
3866 RWLock::RLocker l(c->lock);
11fdf7f2 3867 auto start1 = mono_clock::now();
31f18b77
FG
3868 if (o->onode.has_omap()) {
3869 string key;
3870 get_omap_key(o->onode.nid, after, &key);
3871 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3872 << pretty_binary_string(key) << dendl;
3873 it->upper_bound(key);
3874 } else {
3875 it = KeyValueDB::Iterator();
3876 }
11fdf7f2 3877 c->store->log_latency_fn(
494da23a 3878 __func__,
11fdf7f2
TL
3879 l_bluestore_omap_upper_bound_lat,
3880 mono_clock::now() - start1,
494da23a 3881 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 3882 [&] (const ceph::timespan& lat) {
494da23a 3883 return ", after = " + after +
11fdf7f2
TL
3884 _stringify();
3885 }
3886 );
31f18b77
FG
3887 return 0;
3888}
3889
3890int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3891{
3892 RWLock::RLocker l(c->lock);
11fdf7f2 3893 auto start1 = mono_clock::now();
31f18b77
FG
3894 if (o->onode.has_omap()) {
3895 string key;
3896 get_omap_key(o->onode.nid, to, &key);
3897 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3898 << pretty_binary_string(key) << dendl;
3899 it->lower_bound(key);
3900 } else {
3901 it = KeyValueDB::Iterator();
3902 }
11fdf7f2 3903 c->store->log_latency_fn(
494da23a 3904 __func__,
11fdf7f2
TL
3905 l_bluestore_omap_lower_bound_lat,
3906 mono_clock::now() - start1,
494da23a 3907 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 3908 [&] (const ceph::timespan& lat) {
494da23a 3909 return ", to = " + to +
11fdf7f2
TL
3910 _stringify();
3911 }
3912 );
31f18b77
FG
3913 return 0;
3914}
3915
3916bool BlueStore::OmapIteratorImpl::valid()
3917{
3918 RWLock::RLocker l(c->lock);
3919 bool r = o->onode.has_omap() && it && it->valid() &&
494da23a 3920 it->raw_key().second < tail;
31f18b77
FG
3921 if (it && it->valid()) {
3922 ldout(c->store->cct,20) << __func__ << " is at "
3923 << pretty_binary_string(it->raw_key().second)
3924 << dendl;
3925 }
3926 return r;
3927}
3928
11fdf7f2 3929int BlueStore::OmapIteratorImpl::next()
31f18b77 3930{
11fdf7f2 3931 int r = -1;
31f18b77 3932 RWLock::RLocker l(c->lock);
11fdf7f2 3933 auto start1 = mono_clock::now();
31f18b77
FG
3934 if (o->onode.has_omap()) {
3935 it->next();
11fdf7f2 3936 r = 0;
31f18b77 3937 }
494da23a
TL
3938 c->store->log_latency(
3939 __func__,
11fdf7f2
TL
3940 l_bluestore_omap_next_lat,
3941 mono_clock::now() - start1,
494da23a 3942 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2
TL
3943
3944 return r;
31f18b77
FG
3945}
3946
3947string BlueStore::OmapIteratorImpl::key()
3948{
3949 RWLock::RLocker l(c->lock);
11fdf7f2 3950 ceph_assert(it->valid());
31f18b77
FG
3951 string db_key = it->raw_key().second;
3952 string user_key;
3953 decode_omap_key(db_key, &user_key);
494da23a 3954
31f18b77
FG
3955 return user_key;
3956}
3957
3958bufferlist BlueStore::OmapIteratorImpl::value()
3959{
3960 RWLock::RLocker l(c->lock);
11fdf7f2 3961 ceph_assert(it->valid());
31f18b77
FG
3962 return it->value();
3963}
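// Typical caller-side usage of this iterator (a sketch only; it assumes the
// generic ObjectStore::get_omap_iterator() entry point, with ch an open
// CollectionHandle and oid the target ghobject_t; error handling omitted):
//
//   ObjectMap::ObjectMapIterator it = store->get_omap_iterator(ch, oid);
//   for (it->seek_to_first(); it->valid(); it->next()) {
//     string k = it->key();        // decoded user key
//     bufferlist v = it->value();  // raw value
//     // ... consume k and v ...
//   }
//
// Each call maps onto the locked, latency-logged methods implemented above.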
3964
3965
3966// =====================================
3967
7c673cae
FG
3968#undef dout_prefix
3969#define dout_prefix *_dout << "bluestore(" << path << ") "
3970
3971
3972static void aio_cb(void *priv, void *priv2)
3973{
3974 BlueStore *store = static_cast<BlueStore*>(priv);
3975 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3976 c->aio_finish(store);
3977}
3978
11fdf7f2
TL
3979static void discard_cb(void *priv, void *priv2)
3980{
3981 BlueStore *store = static_cast<BlueStore*>(priv);
3982 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
3983 store->handle_discard(*tmp);
3984}
3985
3986void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
3987{
3988 dout(10) << __func__ << dendl;
3989 ceph_assert(alloc);
3990 alloc->release(to_release);
3991}
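// The two callbacks above are handed to BlockDevice::create() in
// _open_bdev(); the device layer invokes aio_cb when an aio completes and
// discard_cb when a discard finishes, and both simply re-enter BlueStore
// through the priv pointers. handle_discard() then returns the trimmed
// range to the allocator via alloc->release().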
3992
7c673cae
FG
3993BlueStore::BlueStore(CephContext *cct, const string& path)
3994 : ObjectStore(cct, path),
3995 throttle_bytes(cct, "bluestore_throttle_bytes",
3996 cct->_conf->bluestore_throttle_bytes),
3997 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3998 cct->_conf->bluestore_throttle_bytes +
3999 cct->_conf->bluestore_throttle_deferred_bytes),
181888fb 4000	    deferred_finisher(cct, "deferred_finisher", "dfin"),
11fdf7f2 4001 finisher(cct, "commit_finisher", "cfin"),
7c673cae 4002 kv_sync_thread(this),
31f18b77 4003 kv_finalize_thread(this),
7c673cae
FG
4004 mempool_thread(this)
4005{
4006 _init_logger();
11fdf7f2 4007 cct->_conf.add_observer(this);
7c673cae 4008 set_cache_shards(1);
7c673cae
FG
4009}
4010
4011BlueStore::BlueStore(CephContext *cct,
4012 const string& path,
4013 uint64_t _min_alloc_size)
4014 : ObjectStore(cct, path),
4015 throttle_bytes(cct, "bluestore_throttle_bytes",
4016 cct->_conf->bluestore_throttle_bytes),
4017 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
4018 cct->_conf->bluestore_throttle_bytes +
4019 cct->_conf->bluestore_throttle_deferred_bytes),
181888fb 4020	    deferred_finisher(cct, "deferred_finisher", "dfin"),
11fdf7f2 4021 finisher(cct, "commit_finisher", "cfin"),
7c673cae 4022 kv_sync_thread(this),
31f18b77 4023 kv_finalize_thread(this),
7c673cae
FG
4024 min_alloc_size(_min_alloc_size),
4025 min_alloc_size_order(ctz(_min_alloc_size)),
4026 mempool_thread(this)
4027{
4028 _init_logger();
11fdf7f2 4029 cct->_conf.add_observer(this);
7c673cae 4030 set_cache_shards(1);
7c673cae
FG
4031}
4032
4033BlueStore::~BlueStore()
4034{
11fdf7f2 4035 cct->_conf.remove_observer(this);
7c673cae 4036 _shutdown_logger();
11fdf7f2
TL
4037 ceph_assert(!mounted);
4038 ceph_assert(db == NULL);
4039 ceph_assert(bluefs == NULL);
4040 ceph_assert(fsid_fd < 0);
4041 ceph_assert(path_fd < 0);
7c673cae
FG
4042 for (auto i : cache_shards) {
4043 delete i;
4044 }
4045 cache_shards.clear();
4046}
4047
4048const char **BlueStore::get_tracked_conf_keys() const
4049{
4050 static const char* KEYS[] = {
4051 "bluestore_csum_type",
4052 "bluestore_compression_mode",
4053 "bluestore_compression_algorithm",
4054 "bluestore_compression_min_blob_size",
4055 "bluestore_compression_min_blob_size_ssd",
4056 "bluestore_compression_min_blob_size_hdd",
4057 "bluestore_compression_max_blob_size",
4058 "bluestore_compression_max_blob_size_ssd",
4059 "bluestore_compression_max_blob_size_hdd",
c07f9fc5 4060 "bluestore_compression_required_ratio",
7c673cae
FG
4061 "bluestore_max_alloc_size",
4062 "bluestore_prefer_deferred_size",
181888fb
FG
4063 "bluestore_prefer_deferred_size_hdd",
4064 "bluestore_prefer_deferred_size_ssd",
31f18b77
FG
4065 "bluestore_deferred_batch_ops",
4066 "bluestore_deferred_batch_ops_hdd",
4067 "bluestore_deferred_batch_ops_ssd",
7c673cae
FG
4068 "bluestore_throttle_bytes",
4069 "bluestore_throttle_deferred_bytes",
4070 "bluestore_throttle_cost_per_io_hdd",
4071 "bluestore_throttle_cost_per_io_ssd",
4072 "bluestore_throttle_cost_per_io",
4073 "bluestore_max_blob_size",
4074 "bluestore_max_blob_size_ssd",
4075 "bluestore_max_blob_size_hdd",
11fdf7f2
TL
4076 "osd_memory_target",
4077 "osd_memory_target_cgroup_limit_ratio",
4078 "osd_memory_base",
4079 "osd_memory_cache_min",
4080 "bluestore_cache_autotune",
4081 "bluestore_cache_autotune_interval",
81eedcae
TL
4082 "bluestore_no_per_pool_stats_tolerance",
4083 "bluestore_warn_on_legacy_statfs",
7c673cae
FG
4084 NULL
4085 };
4086 return KEYS;
4087}
4088
11fdf7f2 4089void BlueStore::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
4090 const std::set<std::string> &changed)
4091{
81eedcae
TL
4092 if (changed.count("bluestore_no_per_pool_stats_tolerance") ||
4093 changed.count("bluestore_warn_on_legacy_statfs")) {
4094 _check_legacy_statfs_alert();
4095 }
4096
7c673cae
FG
4097 if (changed.count("bluestore_csum_type")) {
4098 _set_csum();
4099 }
4100 if (changed.count("bluestore_compression_mode") ||
4101 changed.count("bluestore_compression_algorithm") ||
4102 changed.count("bluestore_compression_min_blob_size") ||
4103 changed.count("bluestore_compression_max_blob_size")) {
4104 if (bdev) {
4105 _set_compression();
4106 }
4107 }
4108 if (changed.count("bluestore_max_blob_size") ||
4109 changed.count("bluestore_max_blob_size_ssd") ||
4110 changed.count("bluestore_max_blob_size_hdd")) {
4111 if (bdev) {
4112 // only after startup
4113 _set_blob_size();
4114 }
4115 }
4116 if (changed.count("bluestore_prefer_deferred_size") ||
181888fb
FG
4117 changed.count("bluestore_prefer_deferred_size_hdd") ||
4118 changed.count("bluestore_prefer_deferred_size_ssd") ||
7c673cae
FG
4119 changed.count("bluestore_max_alloc_size") ||
4120 changed.count("bluestore_deferred_batch_ops") ||
4121 changed.count("bluestore_deferred_batch_ops_hdd") ||
4122 changed.count("bluestore_deferred_batch_ops_ssd")) {
4123 if (bdev) {
4124 // only after startup
4125 _set_alloc_sizes();
4126 }
4127 }
4128 if (changed.count("bluestore_throttle_cost_per_io") ||
4129 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4130 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4131 if (bdev) {
4132 _set_throttle_params();
4133 }
4134 }
4135 if (changed.count("bluestore_throttle_bytes")) {
4136 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
4137 throttle_deferred_bytes.reset_max(
4138 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
4139 }
4140 if (changed.count("bluestore_throttle_deferred_bytes")) {
4141 throttle_deferred_bytes.reset_max(
4142 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
4143 }
4144}
4145
4146void BlueStore::_set_compression()
4147{
224ce89b
WB
4148 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4149 if (m) {
11fdf7f2 4150 _clear_compression_alert();
224ce89b
WB
4151 comp_mode = *m;
4152 } else {
4153 derr << __func__ << " unrecognized value '"
4154 << cct->_conf->bluestore_compression_mode
4155 << "' for bluestore_compression_mode, reverting to 'none'"
4156 << dendl;
4157 comp_mode = Compressor::COMP_NONE;
11fdf7f2
TL
4158 string s("unknown mode: ");
4159 s += cct->_conf->bluestore_compression_mode;
4160 _set_compression_alert(true, s.c_str());
224ce89b
WB
4161 }
4162
4163 compressor = nullptr;
4164
3efd9988
FG
4165 if (cct->_conf->bluestore_compression_min_blob_size) {
4166 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
7c673cae 4167 } else {
11fdf7f2 4168 ceph_assert(bdev);
7c673cae
FG
4169 if (bdev->is_rotational()) {
4170 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4171 } else {
4172 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4173 }
4174 }
4175
4176 if (cct->_conf->bluestore_compression_max_blob_size) {
4177 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4178 } else {
11fdf7f2 4179 ceph_assert(bdev);
7c673cae
FG
4180 if (bdev->is_rotational()) {
4181 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4182 } else {
4183 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4184 }
4185 }
4186
7c673cae
FG
4187 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4188 if (!alg_name.empty()) {
4189 compressor = Compressor::create(cct, alg_name);
4190 if (!compressor) {
4191 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4192 << dendl;
11fdf7f2 4193 _set_compression_alert(false, alg_name.c_str());
7c673cae
FG
4194 }
4195 }
4196
4197 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4198 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
11fdf7f2
TL
4199 << " min_blob " << comp_min_blob_size
4200 << " max_blob " << comp_max_blob_size
7c673cae
FG
4201 << dendl;
4202}
4203
4204void BlueStore::_set_csum()
4205{
4206 csum_type = Checksummer::CSUM_NONE;
4207 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4208 if (t > Checksummer::CSUM_NONE)
4209 csum_type = t;
4210
4211 dout(10) << __func__ << " csum_type "
4212 << Checksummer::get_csum_type_string(csum_type)
4213 << dendl;
4214}
4215
4216void BlueStore::_set_throttle_params()
4217{
4218 if (cct->_conf->bluestore_throttle_cost_per_io) {
4219 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4220 } else {
11fdf7f2 4221 ceph_assert(bdev);
7c673cae
FG
4222 if (bdev->is_rotational()) {
4223 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4224 } else {
4225 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4226 }
4227 }
4228
4229 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4230 << dendl;
4231}
4232void BlueStore::_set_blob_size()
4233{
4234 if (cct->_conf->bluestore_max_blob_size) {
4235 max_blob_size = cct->_conf->bluestore_max_blob_size;
4236 } else {
11fdf7f2 4237 ceph_assert(bdev);
7c673cae
FG
4238 if (bdev->is_rotational()) {
4239 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4240 } else {
4241 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4242 }
4243 }
4244 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4245 << std::dec << dendl;
4246}
4247
11fdf7f2 4248int BlueStore::_set_cache_sizes()
1adf2230 4249{
11fdf7f2
TL
4250 ceph_assert(bdev);
4251 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
91327a77 4252 cache_autotune_interval =
11fdf7f2
TL
4253 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
4254 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4255 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
91327a77 4256 osd_memory_expected_fragmentation =
11fdf7f2
TL
4257 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4258 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
91327a77 4259 osd_memory_cache_resize_interval =
11fdf7f2 4260 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
91327a77 4261
224ce89b
WB
4262 if (cct->_conf->bluestore_cache_size) {
4263 cache_size = cct->_conf->bluestore_cache_size;
4264 } else {
4265 // choose global cache size based on backend type
4266 if (bdev->is_rotational()) {
4267 cache_size = cct->_conf->bluestore_cache_size_hdd;
4268 } else {
4269 cache_size = cct->_conf->bluestore_cache_size_ssd;
4270 }
4271 }
31f18b77 4272
91327a77 4273 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
224ce89b 4274 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
d2e6a577 4275 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77 4276 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4277 return -EINVAL;
4278 }
91327a77
AA
4279
4280 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
224ce89b 4281 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
d2e6a577 4282 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
91327a77 4283 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4284 return -EINVAL;
4285 }
91327a77 4286
31f18b77 4287 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
d2e6a577 4288 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77
AA
4289 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4290 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4291 << dendl;
31f18b77
FG
4292 return -EINVAL;
4293 }
91327a77
AA
4294
4295 cache_data_ratio =
4296 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
31f18b77
FG
4297 if (cache_data_ratio < 0) {
4298 // deal with floating point imprecision
4299 cache_data_ratio = 0;
4300 }
91327a77 4301
224ce89b
WB
4302 dout(1) << __func__ << " cache_size " << cache_size
4303 << " meta " << cache_meta_ratio
31f18b77
FG
4304 << " kv " << cache_kv_ratio
4305 << " data " << cache_data_ratio
4306 << dendl;
4307 return 0;
4308}
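// Example of the resulting split (illustrative numbers only, assuming a
// 3 GiB cache_size with bluestore_cache_meta_ratio = 0.4 and
// bluestore_cache_kv_ratio = 0.4): cache_data_ratio becomes 0.2, so with
// autotuning disabled the shards are trimmed against roughly 1.2 GiB of
// onode metadata, 1.2 GiB of kv cache and 0.6 GiB of data buffers.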
4309
3efd9988
FG
4310int BlueStore::write_meta(const std::string& key, const std::string& value)
4311{
4312 bluestore_bdev_label_t label;
4313 string p = path + "/block";
4314 int r = _read_bdev_label(cct, p, &label);
4315 if (r < 0) {
4316 return ObjectStore::write_meta(key, value);
4317 }
4318 label.meta[key] = value;
4319 r = _write_bdev_label(cct, p, label);
11fdf7f2 4320 ceph_assert(r == 0);
3efd9988
FG
4321 return ObjectStore::write_meta(key, value);
4322}
4323
4324int BlueStore::read_meta(const std::string& key, std::string *value)
4325{
4326 bluestore_bdev_label_t label;
4327 string p = path + "/block";
4328 int r = _read_bdev_label(cct, p, &label);
4329 if (r < 0) {
4330 return ObjectStore::read_meta(key, value);
4331 }
4332 auto i = label.meta.find(key);
4333 if (i == label.meta.end()) {
4334 return ObjectStore::read_meta(key, value);
4335 }
4336 *value = i->second;
4337 return 0;
4338}
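// write_meta()/read_meta() above mirror ObjectStore metadata into the main
// block device label when one is present: writes update label.meta and then
// fall through to the file-based ObjectStore copy, while reads prefer the
// label copy and only fall back to ObjectStore::read_meta() when the label
// is unreadable or lacks the key.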
4339
7c673cae
FG
4340void BlueStore::_init_logger()
4341{
4342 PerfCountersBuilder b(cct, "bluestore",
4343 l_bluestore_first, l_bluestore_last);
4344 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
4345 "Average kv_thread flush latency",
4346 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
4347 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
4348 "Average kv_thread commit latency");
11fdf7f2
TL
4349 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
4350 "Average kv_sync thread latency",
4351 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
4352 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
4353 "Average kv_finalize thread latency",
4354 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae
FG
4355 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
4356 "Average prepare state latency");
4357 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
4358 "Average aio_wait state latency",
4359 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
4360 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
4361 "Average io_done state latency");
4362 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
4363 "Average kv_queued state latency");
4364 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
4365 "Average kv_commiting state latency");
4366 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
4367 "Average kv_done state latency");
4368 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
4369 "Average deferred_queued state latency");
4370 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
4371 "Average aio_wait state latency");
4372 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
4373 "Average cleanup state latency");
4374 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
4375 "Average finishing state latency");
4376 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
4377 "Average done state latency");
4378 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
4379 "Average submit throttle latency",
4380 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
4381 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
4382 "Average submit latency",
4383 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
4384 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
4385 "Average commit latency",
4386 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
4387 b.add_time_avg(l_bluestore_read_lat, "read_lat",
4388 "Average read latency",
4389 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
4390 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
4391 "Average read onode metadata latency");
4392 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
4393 "Average read latency");
4394 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
4395 "Average compress latency");
4396 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
4397 "Average decompress latency");
4398 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
4399 "Average checksum latency");
4400 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
4401 "Sum for beneficial compress ops");
4402 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
4403 "Sum for compress ops rejected due to low net gain of space");
4404 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
11fdf7f2 4405 "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4406 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
4407 "Sum for deferred write op");
4408 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
11fdf7f2 4409 "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
7c673cae
FG
4410 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
4411 "Sum for write penalty read ops");
4412 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
4413 "Sum for allocated bytes");
4414 b.add_u64(l_bluestore_stored, "bluestore_stored",
4415 "Sum for stored bytes");
4416 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
4417 "Sum for stored compressed bytes");
4418 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
4419 "Sum for bytes allocated for compressed data");
4420 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
4421 "Sum for original bytes that were compressed");
4422
4423 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
4424 "Number of onodes in cache");
4425 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
4426 "Sum for onode-lookups hit in the cache");
4427 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
4428 "Sum for onode-lookups missed in the cache");
4429 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
4430 "Sum for onode-shard lookups hit in the cache");
4431 b.add_u64_counter(l_bluestore_onode_shard_misses,
4432 "bluestore_onode_shard_misses",
4433 "Sum for onode-shard lookups missed in the cache");
4434 b.add_u64(l_bluestore_extents, "bluestore_extents",
4435 "Number of extents in cache");
4436 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
4437 "Number of blobs in cache");
4438 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
4439 "Number of buffers in cache");
4440 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
11fdf7f2 4441 "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
91327a77 4442 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
11fdf7f2 4443 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
91327a77 4444 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
11fdf7f2 4445 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4446
4447 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
4448 "Large aligned writes into fresh blobs");
4449 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
11fdf7f2 4450 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4451 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
4452 "Large aligned writes into fresh blobs (blobs)");
4453 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
4454 "Small writes into existing or sparse small blobs");
4455 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
11fdf7f2 4456 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4457 b.add_u64_counter(l_bluestore_write_small_unused,
4458 "bluestore_write_small_unused",
4459 "Small writes into unused portion of existing blob");
4460 b.add_u64_counter(l_bluestore_write_small_deferred,
4461 "bluestore_write_small_deferred",
4462 "Small overwrites using deferred");
4463 b.add_u64_counter(l_bluestore_write_small_pre_read,
4464 "bluestore_write_small_pre_read",
4465 "Small writes that required we read some data (possibly "
4466 "cached) to fill out the block");
4467 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
4468 "Small write into new (sparse) blob");
4469
4470 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
4471 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
4472 "Onode extent map reshard events");
4473 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
4474 "Sum for blob splitting due to resharding");
4475 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
4476 "Sum for extents that have been removed due to compression");
4477 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
4478 "Sum for extents that have been merged due to garbage "
4479 "collection");
b32b8144
FG
4480 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
4481 "Read EIO errors propagated to high level callers");
f64942e4
AA
4482 b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
4483 "Read operations that required at least one retry due to failed checksum validation");
a8e16298
TL
4484 b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
4485 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
11fdf7f2
TL
4486 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
4487 "Average omap iterator seek_to_first call latency");
4488 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
4489 "Average omap iterator upper_bound call latency");
4490 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
4491 "Average omap iterator lower_bound call latency");
4492 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
4493 "Average omap iterator next call latency");
494da23a
TL
4494 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
4495 "Average collection listing latency");
7c673cae
FG
4496 logger = b.create_perf_counters();
4497 cct->get_perfcounters_collection()->add(logger);
4498}
4499
4500int BlueStore::_reload_logger()
4501{
4502 struct store_statfs_t store_statfs;
7c673cae 4503 int r = statfs(&store_statfs);
11fdf7f2 4504 if (r >= 0) {
7c673cae 4505 logger->set(l_bluestore_allocated, store_statfs.allocated);
11fdf7f2
TL
4506 logger->set(l_bluestore_stored, store_statfs.data_stored);
4507 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
4508 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
4509 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
7c673cae
FG
4510 }
4511 return r;
4512}
4513
4514void BlueStore::_shutdown_logger()
4515{
4516 cct->get_perfcounters_collection()->remove(logger);
4517 delete logger;
4518}
4519
4520int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
4521 uuid_d *fsid)
4522{
4523 bluestore_bdev_label_t label;
4524 int r = _read_bdev_label(cct, path, &label);
4525 if (r < 0)
4526 return r;
4527 *fsid = label.osd_uuid;
4528 return 0;
4529}
4530
4531int BlueStore::_open_path()
4532{
b32b8144 4533 // sanity check(s)
11fdf7f2
TL
4534 auto osd_max_object_size =
4535 cct->_conf.get_val<Option::size_t>("osd_max_object_size");
4536 if (osd_max_object_size >= (size_t)OBJECT_MAX_SIZE) {
4537 derr << __func__ << " osd_max_object_size >= 0x" << std::hex << OBJECT_MAX_SIZE
4538 << "; BlueStore has hard limit of 0x" << OBJECT_MAX_SIZE << "." << std::dec << dendl;
b32b8144
FG
4539 return -EINVAL;
4540 }
11fdf7f2 4541 ceph_assert(path_fd < 0);
91327a77 4542 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
7c673cae
FG
4543 if (path_fd < 0) {
4544 int r = -errno;
4545 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
4546 << dendl;
4547 return r;
4548 }
4549 return 0;
4550}
4551
4552void BlueStore::_close_path()
4553{
4554 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
4555 path_fd = -1;
4556}
4557
3efd9988
FG
4558int BlueStore::_write_bdev_label(CephContext *cct,
4559 string path, bluestore_bdev_label_t label)
7c673cae
FG
4560{
4561 dout(10) << __func__ << " path " << path << " label " << label << dendl;
4562 bufferlist bl;
11fdf7f2 4563 encode(label, bl);
7c673cae 4564 uint32_t crc = bl.crc32c(-1);
11fdf7f2
TL
4565 encode(crc, bl);
4566 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
7c673cae
FG
4567 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
4568 z.zero();
4569 bl.append(std::move(z));
4570
91327a77 4571 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
7c673cae
FG
4572 if (fd < 0) {
4573 fd = -errno;
4574 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4575 << dendl;
4576 return fd;
4577 }
4578 int r = bl.write_fd(fd);
4579 if (r < 0) {
4580 derr << __func__ << " failed to write to " << path
4581 << ": " << cpp_strerror(r) << dendl;
11fdf7f2 4582 goto out;
7c673cae 4583 }
3efd9988
FG
4584 r = ::fsync(fd);
4585 if (r < 0) {
4586 derr << __func__ << " failed to fsync " << path
4587 << ": " << cpp_strerror(r) << dendl;
4588 }
11fdf7f2 4589out:
7c673cae
FG
4590 VOID_TEMP_FAILURE_RETRY(::close(fd));
4591 return r;
4592}
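// Resulting on-disk layout of the first BDEV_LABEL_BLOCK_SIZE (4 KiB) bytes,
// as written above: the encoded bluestore_bdev_label_t, followed by a crc32c
// (seeded with -1) covering the encoded label, followed by zero padding up
// to the block boundary. _read_bdev_label() below re-reads that block and
// verifies the same crc before trusting the label.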
4593
4594int BlueStore::_read_bdev_label(CephContext* cct, string path,
4595 bluestore_bdev_label_t *label)
4596{
4597 dout(10) << __func__ << dendl;
91327a77 4598 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
7c673cae
FG
4599 if (fd < 0) {
4600 fd = -errno;
4601 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4602 << dendl;
4603 return fd;
4604 }
4605 bufferlist bl;
4606 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4607 VOID_TEMP_FAILURE_RETRY(::close(fd));
4608 if (r < 0) {
4609 derr << __func__ << " failed to read from " << path
4610 << ": " << cpp_strerror(r) << dendl;
4611 return r;
4612 }
4613
4614 uint32_t crc, expected_crc;
11fdf7f2 4615 auto p = bl.cbegin();
7c673cae 4616 try {
11fdf7f2 4617 decode(*label, p);
7c673cae
FG
4618 bufferlist t;
4619 t.substr_of(bl, 0, p.get_off());
4620 crc = t.crc32c(-1);
11fdf7f2 4621 decode(expected_crc, p);
7c673cae
FG
4622 }
4623 catch (buffer::error& e) {
b32b8144 4624 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
7c673cae
FG
4625 << ": " << e.what()
4626 << dendl;
b32b8144 4627 return -ENOENT;
7c673cae
FG
4628 }
4629 if (crc != expected_crc) {
4630 derr << __func__ << " bad crc on label, expected " << expected_crc
4631 << " != actual " << crc << dendl;
4632 return -EIO;
4633 }
4634 dout(10) << __func__ << " got " << *label << dendl;
4635 return 0;
4636}
4637
4638int BlueStore::_check_or_set_bdev_label(
4639 string path, uint64_t size, string desc, bool create)
4640{
4641 bluestore_bdev_label_t label;
4642 if (create) {
4643 label.osd_uuid = fsid;
4644 label.size = size;
4645 label.btime = ceph_clock_now();
4646 label.description = desc;
3efd9988 4647 int r = _write_bdev_label(cct, path, label);
7c673cae
FG
4648 if (r < 0)
4649 return r;
4650 } else {
4651 int r = _read_bdev_label(cct, path, &label);
4652 if (r < 0)
4653 return r;
31f18b77
FG
4654 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4655 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4656 << " and fsid " << fsid << " check bypassed" << dendl;
4657 }
4658 else if (label.osd_uuid != fsid) {
7c673cae
FG
4659 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4660 << " does not match our fsid " << fsid << dendl;
4661 return -EIO;
4662 }
4663 }
4664 return 0;
4665}
4666
4667void BlueStore::_set_alloc_sizes(void)
4668{
7c673cae
FG
4669 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4670
4671 if (cct->_conf->bluestore_prefer_deferred_size) {
4672 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4673 } else {
11fdf7f2 4674 ceph_assert(bdev);
7c673cae
FG
4675 if (bdev->is_rotational()) {
4676 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4677 } else {
4678 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4679 }
4680 }
4681
4682 if (cct->_conf->bluestore_deferred_batch_ops) {
4683 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4684 } else {
11fdf7f2 4685 ceph_assert(bdev);
7c673cae
FG
4686 if (bdev->is_rotational()) {
4687 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4688 } else {
4689 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4690 }
4691 }
4692
4693 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11fdf7f2 4694 << std::dec << " order " << (int)min_alloc_size_order
7c673cae
FG
4695 << " max_alloc_size 0x" << std::hex << max_alloc_size
4696 << " prefer_deferred_size 0x" << prefer_deferred_size
4697 << std::dec
4698 << " deferred_batch_ops " << deferred_batch_ops
4699 << dendl;
4700}
4701
4702int BlueStore::_open_bdev(bool create)
4703{
11fdf7f2 4704 ceph_assert(bdev == NULL);
7c673cae 4705 string p = path + "/block";
11fdf7f2 4706 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
7c673cae
FG
4707 int r = bdev->open(p);
4708 if (r < 0)
4709 goto fail;
4710
11fdf7f2
TL
4711 if (create && cct->_conf->bdev_enable_discard) {
4712 bdev->discard(0, bdev->get_size());
4713 }
4714
7c673cae
FG
4715 if (bdev->supported_bdev_label()) {
4716 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4717 if (r < 0)
4718 goto fail_close;
4719 }
4720
4721 // initialize global block parameters
4722 block_size = bdev->get_block_size();
4723 block_mask = ~(block_size - 1);
4724 block_size_order = ctz(block_size);
11fdf7f2 4725 ceph_assert(block_size == 1u << block_size_order);
224ce89b
WB
4726 // and set cache_size based on device type
4727 r = _set_cache_sizes();
4728 if (r < 0) {
4729 goto fail_close;
4730 }
7c673cae
FG
4731 return 0;
4732
4733 fail_close:
4734 bdev->close();
4735 fail:
4736 delete bdev;
4737 bdev = NULL;
4738 return r;
4739}
4740
11fdf7f2
TL
4741void BlueStore::_validate_bdev()
4742{
4743 ceph_assert(bdev);
 4744	  ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
4745 uint64_t dev_size = bdev->get_size();
4746 if (dev_size <
4747 _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
4748 dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
4749 << " is too small, disable bluestore_bluefs_min for now"
4750 << dendl;
4751 ceph_assert(dev_size >= _get_ondisk_reserved());
4752
4753 int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
4754 ceph_assert(r == 0);
4755 }
4756}
4757
7c673cae
FG
4758void BlueStore::_close_bdev()
4759{
11fdf7f2 4760 ceph_assert(bdev);
7c673cae
FG
4761 bdev->close();
4762 delete bdev;
4763 bdev = NULL;
4764}
4765
11fdf7f2 4766int BlueStore::_open_fm(KeyValueDB::Transaction t)
7c673cae 4767{
11fdf7f2
TL
4768 ceph_assert(fm == NULL);
4769 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
4770 ceph_assert(fm);
4771 if (t) {
4772 // create mode. initialize freespace
7c673cae 4773 dout(20) << __func__ << " initializing freespace" << dendl;
7c673cae
FG
4774 {
4775 bufferlist bl;
4776 bl.append(freelist_type);
4777 t->set(PREFIX_SUPER, "freelist_type", bl);
4778 }
b32b8144
FG
4779 // being able to allocate in units less than bdev block size
4780 // seems to be a bad idea.
11fdf7f2 4781 ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
b32b8144 4782 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
7c673cae
FG
4783
4784 // allocate superblock reserved space. note that we do not mark
4785 // bluefs space as allocated in the freelist; we instead rely on
4786 // bluefs_extents.
11fdf7f2 4787 auto reserved = _get_ondisk_reserved();
3efd9988 4788 fm->allocate(0, reserved, t);
7c673cae 4789
7c673cae 4790 if (cct->_conf->bluestore_bluefs) {
11fdf7f2 4791 ceph_assert(bluefs_extents.num_intervals() == 1);
7c673cae 4792 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
11fdf7f2 4793 reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
7c673cae
FG
4794 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4795 << " for bluefs" << dendl;
7c673cae
FG
4796 }
4797
4798 if (cct->_conf->bluestore_debug_prefill > 0) {
4799 uint64_t end = bdev->get_size() - reserved;
4800 dout(1) << __func__ << " pre-fragmenting freespace, using "
4801 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4802 << cct->_conf->bluestore_debug_prefragment_max << dendl;
11fdf7f2 4803 uint64_t start = p2roundup(reserved, min_alloc_size);
7c673cae
FG
4804 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4805 float r = cct->_conf->bluestore_debug_prefill;
4806 r /= 1.0 - r;
4807 bool stop = false;
4808
4809 while (!stop && start < end) {
4810 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4811 if (start + l > end) {
4812 l = end - start;
11fdf7f2 4813 l = p2align(l, min_alloc_size);
7c673cae 4814 }
11fdf7f2 4815 ceph_assert(start + l <= end);
7c673cae
FG
4816
4817 uint64_t u = 1 + (uint64_t)(r * (double)l);
11fdf7f2 4818 u = p2roundup(u, min_alloc_size);
7c673cae
FG
4819 if (start + l + u > end) {
4820 u = end - (start + l);
4821 // trim to align so we don't overflow again
11fdf7f2 4822 u = p2align(u, min_alloc_size);
7c673cae
FG
4823 stop = true;
4824 }
11fdf7f2 4825 ceph_assert(start + l + u <= end);
7c673cae 4826
11fdf7f2 4827 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
7c673cae
FG
4828 << " use 0x" << u << std::dec << dendl;
4829
4830 if (u == 0) {
4831 // break if u has been trimmed to nothing
4832 break;
4833 }
4834
4835 fm->allocate(start + l, u, t);
4836 start += l + u;
4837 }
4838 }
7c673cae
FG
4839 }
4840
11fdf7f2 4841 int r = fm->init(db);
7c673cae
FG
4842 if (r < 0) {
4843 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4844 delete fm;
4845 fm = NULL;
4846 return r;
4847 }
81eedcae
TL
 4848	  // If the space size tracked by the freelist manager is higher than the
 4849	  // actual device size, we can hit an out-of-space allocation, which will
 4850	  // result in data loss and/or assertions.
 4851	  // Probably the user altered the device size somehow.
 4852	  // The only fix for now is to redeploy the OSD.
4853 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
4854 ostringstream ss;
4855 ss << "slow device size mismatch detected, "
4856 << " fm size(" << fm->get_size()
4857 << ") > slow device size(" << bdev->get_size()
4858 << "), Please stop using this OSD as it might cause data loss.";
4859 _set_disk_size_mismatch_alert(ss.str());
4860 }
7c673cae
FG
4861 return 0;
4862}
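// Worked example of the debug prefill loop above (illustrative numbers
// only): with bluestore_debug_prefill = 0.2 the ratio r becomes
// 0.2 / (1 - 0.2) = 0.25, so each randomly sized free extent of length l is
// followed by an allocated extent u of roughly 0.25 * l (rounded up to
// min_alloc_size), leaving about 20% of the usable range marked as used and
// the free space pre-fragmented for testing.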
4863
4864void BlueStore::_close_fm()
4865{
4866 dout(10) << __func__ << dendl;
11fdf7f2 4867 ceph_assert(fm);
7c673cae
FG
4868 fm->shutdown();
4869 delete fm;
4870 fm = NULL;
4871}
4872
4873int BlueStore::_open_alloc()
4874{
11fdf7f2
TL
4875 ceph_assert(alloc == NULL);
4876 ceph_assert(bdev->get_size());
4877
4878 if (bluefs) {
4879 bluefs_extents.clear();
4880 auto r = bluefs->get_block_extents(bluefs_shared_bdev, &bluefs_extents);
4881 if (r < 0) {
4882 lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
4883 << cpp_strerror(r) << dendl;
4884
4885 return r;
4886 }
4887 dout(10) << __func__ << " bluefs extents 0x"
4888 << std::hex << bluefs_extents << std::dec
4889 << dendl;
4890 }
4891
7c673cae
FG
4892 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4893 bdev->get_size(),
4894 min_alloc_size);
4895 if (!alloc) {
4896 lderr(cct) << __func__ << " Allocator::unknown alloc type "
4897 << cct->_conf->bluestore_allocator
4898 << dendl;
4899 return -EINVAL;
4900 }
4901
4902 uint64_t num = 0, bytes = 0;
4903
4904 dout(1) << __func__ << " opening allocation metadata" << dendl;
4905 // initialize from freelist
4906 fm->enumerate_reset();
4907 uint64_t offset, length;
11fdf7f2 4908 while (fm->enumerate_next(db, &offset, &length)) {
7c673cae
FG
4909 alloc->init_add_free(offset, length);
4910 ++num;
4911 bytes += length;
4912 }
224ce89b 4913 fm->enumerate_reset();
1adf2230 4914 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
7c673cae
FG
4915 << " in " << num << " extents"
4916 << dendl;
4917
4918 // also mark bluefs space as allocated
4919 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4920 alloc->init_rm_free(e.get_start(), e.get_len());
4921 }
7c673cae
FG
4922
4923 return 0;
4924}
4925
4926void BlueStore::_close_alloc()
4927{
11fdf7f2
TL
4928 ceph_assert(bdev);
4929 bdev->discard_drain();
4930
4931 ceph_assert(alloc);
7c673cae
FG
4932 alloc->shutdown();
4933 delete alloc;
4934 alloc = NULL;
11fdf7f2 4935 bluefs_extents.clear();
7c673cae
FG
4936}
4937
4938int BlueStore::_open_fsid(bool create)
4939{
11fdf7f2 4940 ceph_assert(fsid_fd < 0);
91327a77 4941 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
4942 if (create)
4943 flags |= O_CREAT;
4944 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4945 if (fsid_fd < 0) {
4946 int err = -errno;
4947 derr << __func__ << " " << cpp_strerror(err) << dendl;
4948 return err;
4949 }
4950 return 0;
4951}
4952
4953int BlueStore::_read_fsid(uuid_d *uuid)
4954{
4955 char fsid_str[40];
4956 memset(fsid_str, 0, sizeof(fsid_str));
4957 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4958 if (ret < 0) {
4959 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4960 return ret;
4961 }
4962 if (ret > 36)
4963 fsid_str[36] = 0;
4964 else
4965 fsid_str[ret] = 0;
4966 if (!uuid->parse(fsid_str)) {
4967 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4968 return -EINVAL;
4969 }
4970 return 0;
4971}
4972
4973int BlueStore::_write_fsid()
4974{
4975 int r = ::ftruncate(fsid_fd, 0);
4976 if (r < 0) {
4977 r = -errno;
4978 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4979 return r;
4980 }
4981 string str = stringify(fsid) + "\n";
4982 r = safe_write(fsid_fd, str.c_str(), str.length());
4983 if (r < 0) {
4984 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4985 return r;
4986 }
4987 r = ::fsync(fsid_fd);
4988 if (r < 0) {
4989 r = -errno;
4990 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4991 return r;
4992 }
4993 return 0;
4994}
4995
4996void BlueStore::_close_fsid()
4997{
4998 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4999 fsid_fd = -1;
5000}
5001
5002int BlueStore::_lock_fsid()
5003{
5004 struct flock l;
5005 memset(&l, 0, sizeof(l));
5006 l.l_type = F_WRLCK;
5007 l.l_whence = SEEK_SET;
5008 int r = ::fcntl(fsid_fd, F_SETLK, &l);
5009 if (r < 0) {
5010 int err = errno;
5011 derr << __func__ << " failed to lock " << path << "/fsid"
5012 << " (is another ceph-osd still running?)"
5013 << cpp_strerror(err) << dendl;
5014 return -err;
5015 }
5016 return 0;
5017}
5018
31f18b77
FG
5019bool BlueStore::is_rotational()
5020{
5021 if (bdev) {
5022 return bdev->is_rotational();
5023 }
5024
5025 bool rotational = true;
5026 int r = _open_path();
5027 if (r < 0)
5028 goto out;
5029 r = _open_fsid(false);
5030 if (r < 0)
5031 goto out_path;
5032 r = _read_fsid(&fsid);
5033 if (r < 0)
5034 goto out_fsid;
5035 r = _lock_fsid();
5036 if (r < 0)
5037 goto out_fsid;
5038 r = _open_bdev(false);
5039 if (r < 0)
5040 goto out_fsid;
5041 rotational = bdev->is_rotational();
5042 _close_bdev();
5043 out_fsid:
5044 _close_fsid();
5045 out_path:
5046 _close_path();
5047 out:
5048 return rotational;
5049}
5050
d2e6a577
FG
5051bool BlueStore::is_journal_rotational()
5052{
5053 if (!bluefs) {
5054 dout(5) << __func__ << " bluefs disabled, default to store media type"
5055 << dendl;
5056 return is_rotational();
5057 }
5058 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
5059 return bluefs->wal_is_rotational();
5060}
5061
7c673cae
FG
5062bool BlueStore::test_mount_in_use()
5063{
5064 // most error conditions mean the mount is not in use (e.g., because
5065 // it doesn't exist). only if we fail to lock do we conclude it is
5066 // in use.
5067 bool ret = false;
5068 int r = _open_path();
5069 if (r < 0)
5070 return false;
5071 r = _open_fsid(false);
5072 if (r < 0)
5073 goto out_path;
5074 r = _lock_fsid();
5075 if (r < 0)
5076 ret = true; // if we can't lock, it is in use
5077 _close_fsid();
5078 out_path:
5079 _close_path();
5080 return ret;
5081}
5082
11fdf7f2 5083int BlueStore::_minimal_open_bluefs(bool create)
7c673cae
FG
5084{
5085 int r;
11fdf7f2 5086 bluefs = new BlueFS(cct);
7c673cae 5087
11fdf7f2
TL
5088 string bfn;
5089 struct stat st;
5090
5091 bfn = path + "/block.db";
5092 if (::stat(bfn.c_str(), &st) == 0) {
5093 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn,
5094 create && cct->_conf->bdev_enable_discard);
7c673cae 5095 if (r < 0) {
11fdf7f2
TL
5096 derr << __func__ << " add block device(" << bfn << ") returned: "
5097 << cpp_strerror(r) << dendl;
5098 goto free_bluefs;
7c673cae 5099 }
7c673cae 5100
11fdf7f2
TL
5101 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
5102 r = _check_or_set_bdev_label(
5103 bfn,
5104 bluefs->get_block_device_size(BlueFS::BDEV_DB),
5105 "bluefs db", create);
5106 if (r < 0) {
5107 derr << __func__
5108 << " check block device(" << bfn << ") label returned: "
5109 << cpp_strerror(r) << dendl;
5110 goto free_bluefs;
5111 }
7c673cae 5112 }
11fdf7f2
TL
5113 if (create) {
5114 bluefs->add_block_extent(
5115 BlueFS::BDEV_DB,
5116 SUPER_RESERVED,
5117 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
5118 }
5119 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
5120 bluefs_single_shared_device = false;
5121 } else {
5122 r = -errno;
5123 if (::lstat(bfn.c_str(), &st) == -1) {
5124 r = 0;
5125 bluefs_shared_bdev = BlueFS::BDEV_DB;
7c673cae 5126 } else {
11fdf7f2
TL
5127 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5128 << cpp_strerror(r) << dendl;
5129 goto free_bluefs;
7c673cae
FG
5130 }
5131 }
7c673cae 5132
11fdf7f2
TL
5133 // shared device
5134 bfn = path + "/block";
5135 // never trim here
5136 r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false,
5137 true /* shared with bluestore */);
5138 if (r < 0) {
5139 derr << __func__ << " add block device(" << bfn << ") returned: "
5140 << cpp_strerror(r) << dendl;
5141 goto free_bluefs;
5142 }
5143 if (create) {
5144 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
5145 uint64_t initial =
5146 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
5147 cct->_conf->bluestore_bluefs_gift_ratio);
5148 initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
5149 if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
5150 derr << __func__ << " bluefs_alloc_size 0x" << std::hex
5151 << cct->_conf->bluefs_alloc_size << " is not a multiple of "
5152 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
5153 r = -EINVAL;
5154 goto free_bluefs;
7c673cae 5155 }
11fdf7f2
TL
5156 // align to bluefs's alloc_size
5157 initial = p2roundup(initial, cct->_conf->bluefs_alloc_size);
5158 // put bluefs in the middle of the device in case it is an HDD
5159 uint64_t start = p2align((bdev->get_size() - initial) / 2,
5160 cct->_conf->bluefs_alloc_size);
 5161	    // avoid overwriting the superblock
5162 ceph_assert(cct->_conf->bluefs_alloc_size > _get_ondisk_reserved());
5163 start = std::max(cct->_conf->bluefs_alloc_size, start);
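      // Worked example (editorial sketch, illustrative numbers only): with a
      // 4 TiB shared device and, say, bluestore_bluefs_min_ratio = 0.02 and
      // bluestore_bluefs_gift_ratio = 0.02, the initial BlueFS grant is
      //   initial = 4 TiB * 0.04 ~= 164 GiB, rounded up to bluefs_alloc_size,
      // and it is placed at
      //   start = p2align((4 TiB - initial) / 2, bluefs_alloc_size),
      // i.e. roughly the middle of the device, which keeps HDD head travel
      // between BlueFS metadata and object data short.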
7c673cae 5164
11fdf7f2
TL
5165 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
5166 bluefs_extents.insert(start, initial);
5167 ++out_of_sync_fm;
5168 }
5169
5170 bfn = path + "/block.wal";
5171 if (::stat(bfn.c_str(), &st) == 0) {
5172 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
5173 create && cct->_conf->bdev_enable_discard);
5174 if (r < 0) {
5175 derr << __func__ << " add block device(" << bfn << ") returned: "
5176 << cpp_strerror(r) << dendl;
5177 goto free_bluefs;
5178 }
7c673cae 5179
11fdf7f2
TL
5180 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
5181 r = _check_or_set_bdev_label(
5182 bfn,
5183 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
5184 "bluefs wal", create);
7c673cae 5185 if (r < 0) {
11fdf7f2
TL
5186 derr << __func__ << " check block device(" << bfn
5187 << ") label returned: " << cpp_strerror(r) << dendl;
7c673cae
FG
5188 goto free_bluefs;
5189 }
7c673cae
FG
5190 }
5191
11fdf7f2
TL
5192 if (create) {
5193 bluefs->add_block_extent(
5194 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
5195 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
5196 BDEV_LABEL_BLOCK_SIZE);
5197 }
5198 bluefs_single_shared_device = false;
5199 } else {
5200 r = 0;
5201 if (::lstat(bfn.c_str(), &st) != -1) {
5202 r = -errno;
5203 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5204 << cpp_strerror(r) << dendl;
7c673cae
FG
5205 goto free_bluefs;
5206 }
11fdf7f2
TL
5207 }
5208 return 0;
7c673cae 5209
11fdf7f2
TL
5210free_bluefs:
5211 ceph_assert(bluefs);
5212 delete bluefs;
5213 bluefs = NULL;
5214 return r;
5215}
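// Editorial note (not part of the original source): the on-disk layout that
// _minimal_open_bluefs() probes, and the role each optional symlink takes:
//
//   <osd path>/block      shared device; always added (BDEV_SLOW or BDEV_DB)
//   <osd path>/block.db   if present, dedicated RocksDB device (BDEV_DB)
//   <osd path>/block.wal  if present, dedicated RocksDB WAL device (BDEV_WAL)
//
// When block.db is absent the shared device itself serves as BDEV_DB, and if
// block.wal is absent as well, bluefs_single_shared_device remains true.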
7c673cae 5216
11fdf7f2
TL
5217int BlueStore::_open_bluefs(bool create)
5218{
5219 int r = _minimal_open_bluefs(create);
5220 if (r < 0) {
5221 return r;
5222 }
5223 if (create) {
5224 bluefs->mkfs(fsid);
5225 }
5226 r = bluefs->mount();
5227 if (r < 0) {
5228 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
5229 }
5230 return r;
5231}
5232
5233void BlueStore::_close_bluefs()
5234{
5235 bluefs->umount();
5236 _minimal_close_bluefs();
5237}
5238
5239void BlueStore::_minimal_close_bluefs()
5240{
5241 delete bluefs;
5242 bluefs = NULL;
5243}
5244
5245int BlueStore::_is_bluefs(bool create, bool* ret)
5246{
5247 if (create) {
5248 *ret = cct->_conf->bluestore_bluefs;
5249 } else {
5250 string s;
5251 int r = read_meta("bluefs", &s);
5252 if (r < 0) {
5253 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
5254 return -EIO;
5255 }
5256 if (s == "1") {
5257 *ret = true;
5258 } else if (s == "0") {
5259 *ret = false;
31f18b77 5260 } else {
11fdf7f2
TL
5261 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
5262 << dendl;
5263 return -EIO;
5264 }
5265 }
5266 return 0;
5267}
5268
5269/*
5270* opens both DB and dependent super_meta, FreelistManager and allocator
5271* in the proper order
5272*/
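/* (editorial addition) Concretely, for the bluefs case the order below is
 *   _open_db(read_only) -> _open_super_meta() -> _open_fm() -> _open_alloc(),
 * followed, when a writable DB is requested, by closing and re-opening the DB
 * in read/write mode once the allocator is ready.
 */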
5273int BlueStore::_open_db_and_around(bool read_only)
5274{
5275 int r;
5276 bool do_bluefs = false;
5277 _is_bluefs(false, &do_bluefs); // ignore err code
5278 if (do_bluefs) {
5279 // open in read-only first to read FM list and init allocator
5280 // as they might be needed for some BlueFS procedures
5281 r = _open_db(false, false, true);
5282 if (r < 0)
5283 return r;
5284
5285 r = _open_super_meta();
5286 if (r < 0) {
5287 goto out_db;
5288 }
5289
5290 r = _open_fm(nullptr);
5291 if (r < 0)
5292 goto out_db;
5293
5294 r = _open_alloc();
5295 if (r < 0)
5296 goto out_fm;
5297
5298 // now open in R/W mode
5299 if (!read_only) {
5300 _close_db();
5301
5302 r = _open_db(false, false, false);
5303 if (r < 0) {
5304 _close_alloc();
5305 _close_fm();
5306 return r;
28e407b8 5307 }
7c673cae 5308 }
11fdf7f2
TL
5309 } else {
5310 r = _open_db(false, false);
5311 if (r < 0) {
5312 return r;
5313 }
5314 r = _open_super_meta();
5315 if (r < 0) {
5316 goto out_db;
5317 }
7c673cae 5318
11fdf7f2
TL
5319 r = _open_fm(nullptr);
5320 if (r < 0)
5321 goto out_db;
5322
5323 r = _open_alloc();
5324 if (r < 0)
5325 goto out_fm;
5326 }
5327 return 0;
5328
5329 out_fm:
5330 _close_fm();
5331 out_db:
5332 _close_db();
5333 return r;
5334}
5335
5336void BlueStore::_close_db_and_around()
5337{
5338 if (bluefs) {
5339 if (out_of_sync_fm.fetch_and(0)) {
5340 _sync_bluefs_and_fm();
5341 }
5342 _close_db();
5343 while(out_of_sync_fm.fetch_and(0)) {
5344 // if some allocations were seen during close - repeat open_db, sync fm, close
5345 dout(0) << __func__ << " syncing FreelistManager" << dendl;
5346 int r = _open_db(false, false, false);
5347 if (r < 0) {
5348 derr << __func__
5349 << " unable to open db, FreelistManager is probably out of sync"
5350 << dendl;
5351 break;
5352 }
5353 _sync_bluefs_and_fm();
5354 _close_db();
7c673cae 5355 }
11fdf7f2
TL
5356 if (!_kv_only) {
5357 _close_alloc();
5358 _close_fm();
5359 }
5360 } else {
5361 _close_alloc();
5362 _close_fm();
5363 _close_db();
5364 }
5365}
5366
5367// updates legacy bluefs-related records in the DB to a state valid for
5368// downgrades from Nautilus.
5369void BlueStore::_sync_bluefs_and_fm()
5370{
5371 if (cct->_conf->bluestore_bluefs_db_compatibility) {
5372 bufferlist bl;
5373 encode(bluefs_extents, bl);
5374 dout(20) << __func__ << " bluefs_extents at KV is now 0x"
5375 << std::hex << bluefs_extents << std::dec
5376 << dendl;
5377 KeyValueDB::Transaction synct = db->get_transaction();
5378 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
5379 synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
5380
5381 // The nice thing is that we don't need to update FreelistManager here.
5382 // It always has the corresponding bits set to 'Free' for both Nautilus+ and
5383 // pre-Nautilus releases.
5384 // So once an extent lands in bluefs_extents it means it has already
5385 // been freed in the allocator and hence is free in the FM too.
5386
5387 db->submit_transaction_sync(synct);
5388 }
5389}
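// Editorial sketch (not in the original source): a pre-Nautilus OSD consumes
// the record written above roughly like this, which is what keeps downgrades
// working:
//
//   bufferlist bl;
//   db->get(PREFIX_SUPER, "bluefs_extents", &bl);
//   auto p = bl.cbegin();
//   interval_set<uint64_t> extents;
//   decode(extents, p);   // same encoding as produced by encode() above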
5390
5391int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
5392{
5393 int r;
5394 ceph_assert(!db);
5395 ceph_assert(!(create && read_only));
5396 string fn = path + "/db";
5397 string options;
5398 stringstream err;
5399 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
5400
5401 string kv_backend;
5402 std::vector<KeyValueDB::ColumnFamily> cfs;
5403
5404 if (create) {
5405 kv_backend = cct->_conf->bluestore_kvbackend;
5406 } else {
5407 r = read_meta("kv_backend", &kv_backend);
7c673cae 5408 if (r < 0) {
11fdf7f2
TL
5409 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
5410 return -EIO;
5411 }
5412 }
5413 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
5414
5415 bool do_bluefs;
5416 r = _is_bluefs(create, &do_bluefs);
5417 if (r < 0) {
5418 return r;
5419 }
5420 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
5421
5422 map<string,string> kv_options;
5423 // force separate wal dir for all new deployments.
5424 kv_options["separate_wal_dir"] = 1;
5425 rocksdb::Env *env = NULL;
5426 if (do_bluefs) {
5427 dout(10) << __func__ << " initializing bluefs" << dendl;
5428 if (kv_backend != "rocksdb") {
5429 derr << " backend must be rocksdb to use bluefs" << dendl;
5430 return -EINVAL;
7c673cae 5431 }
11fdf7f2
TL
5432
5433 r = _open_bluefs(create);
5434 if (r < 0) {
5435 return r;
5436 }
5437 bluefs->set_slow_device_expander(this);
5438
7c673cae
FG
5439 if (cct->_conf->bluestore_bluefs_env_mirror) {
5440 rocksdb::Env *a = new BlueRocksEnv(bluefs);
5441 rocksdb::Env *b = rocksdb::Env::Default();
5442 if (create) {
5443 string cmd = "rm -rf " + path + "/db " +
5444 path + "/db.slow " +
5445 path + "/db.wal";
5446 int r = system(cmd.c_str());
5447 (void)r;
5448 }
5449 env = new rocksdb::EnvMirror(b, a, false, true);
5450 } else {
5451 env = new BlueRocksEnv(bluefs);
5452
5453 // simplify the dir names, too, as "seen" by rocksdb
5454 fn = "db";
5455 }
5456
5457 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
5458 // we have both block.db and block; tell rocksdb!
5459 // note: the second (last) size value doesn't really matter
5460 ostringstream db_paths;
5461 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
5462 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
5463 db_paths << fn << ","
5464 << (uint64_t)(db_size * 95 / 100) << " "
5465 << fn + ".slow" << ","
5466 << (uint64_t)(slow_size * 95 / 100);
11fdf7f2
TL
5467 kv_options["db_paths"] = db_paths.str();
5468 dout(10) << __func__ << " set db_paths to " << db_paths.str() << dendl;
7c673cae
FG
5469 }
5470
5471 if (create) {
5472 env->CreateDir(fn);
11fdf7f2
TL
5473 env->CreateDir(fn + ".wal");
5474 env->CreateDir(fn + ".slow");
5475 } else {
5476 std::vector<std::string> res;
5477 // check for dir presence
5478 auto r = env->GetChildren(fn+".wal", &res);
5479 if (r.IsNotFound()) {
5480 kv_options.erase("separate_wal_dir");
5481 }
7c673cae 5482 }
11fdf7f2
TL
5483 } else {
5484 string walfn = path + "/db.wal";
7c673cae 5485
11fdf7f2
TL
5486 if (create) {
5487 int r = ::mkdir(fn.c_str(), 0755);
5488 if (r < 0)
5489 r = -errno;
5490 if (r < 0 && r != -EEXIST) {
5491 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
5492 << dendl;
5493 return r;
5494 }
5495
5496 // wal_dir, too!
7c673cae
FG
5497 r = ::mkdir(walfn.c_str(), 0755);
5498 if (r < 0)
5499 r = -errno;
5500 if (r < 0 && r != -EEXIST) {
5501 derr << __func__ << " failed to create " << walfn
5502 << ": " << cpp_strerror(r)
5503 << dendl;
5504 return r;
5505 }
11fdf7f2
TL
5506 } else {
5507 struct stat st;
5508 r = ::stat(walfn.c_str(), &st);
5509 if (r < 0 && errno == ENOENT) {
5510 kv_options.erase("separate_wal_dir");
5511 }
7c673cae
FG
5512 }
5513 }
5514
91327a77 5515
7c673cae
FG
5516 db = KeyValueDB::create(cct,
5517 kv_backend,
5518 fn,
11fdf7f2 5519 kv_options,
7c673cae
FG
5520 static_cast<void*>(env));
5521 if (!db) {
5522 derr << __func__ << " error creating db" << dendl;
5523 if (bluefs) {
11fdf7f2 5524 _close_bluefs();
7c673cae
FG
5525 }
5526 // delete env manually here since we can't depend on db to do this
5527 // in this case
5528 delete env;
5529 env = NULL;
5530 return -EIO;
5531 }
5532
5533 FreelistManager::setup_merge_operators(db);
5534 db->set_merge_operator(PREFIX_STAT, merge_op);
91327a77 5535 db->set_cache_size(cache_kv_ratio * cache_size);
31f18b77 5536
11fdf7f2 5537 if (kv_backend == "rocksdb") {
7c673cae 5538 options = cct->_conf->bluestore_rocksdb_options;
11fdf7f2
TL
5539
5540 map<string,string> cf_map;
5541 cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
5542 get_str_map,
5543 &cf_map,
5544 " \t");
5545 for (auto& i : cf_map) {
5546 dout(10) << "column family " << i.first << ": " << i.second << dendl;
5547 cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
5548 }
5549 }
5550
7c673cae 5551 db->init(options);
11fdf7f2
TL
5552 if (to_repair_db)
5553 return 0;
5554 if (create) {
5555 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
5556 r = db->create_and_open(err, cfs);
5557 } else {
5558 r = db->create_and_open(err);
5559 }
5560 } else {
5561 // we pass in cf list here, but it is only used if the db already has
5562 // column families created.
5563 r = read_only ?
5564 db->open_read_only(err, cfs) :
5565 db->open(err, cfs);
5566 }
7c673cae
FG
5567 if (r) {
5568 derr << __func__ << " erroring opening db: " << err.str() << dendl;
11fdf7f2 5569 _close_db();
7c673cae
FG
5570 return -EIO;
5571 }
5572 dout(1) << __func__ << " opened " << kv_backend
5573 << " path " << fn << " options " << options << dendl;
5574 return 0;
7c673cae
FG
5575}
5576
5577void BlueStore::_close_db()
5578{
11fdf7f2 5579 ceph_assert(db);
7c673cae
FG
5580 delete db;
5581 db = NULL;
5582 if (bluefs) {
11fdf7f2 5583 _close_bluefs();
7c673cae
FG
5584 }
5585}
5586
11fdf7f2 5587void BlueStore::_dump_alloc_on_failure()
7c673cae 5588{
11fdf7f2
TL
5589 auto dump_interval =
5590 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
5591 if (dump_interval > 0 &&
5592 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
5593 alloc->dump();
5594 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
5595 next_dump_on_bluefs_alloc_failure += dump_interval;
7c673cae 5596 }
11fdf7f2 5597}
7c673cae 5598
7c673cae 5599
11fdf7f2
TL
5600int BlueStore::allocate_bluefs_freespace(
5601 uint64_t min_size,
5602 uint64_t size,
5603 PExtentVector* extents_out)
5604{
5605 ceph_assert(min_size <= size);
5606 if (size) {
5607 // round up to alloc size
5608 min_size = p2roundup(min_size, cct->_conf->bluefs_alloc_size);
5609 size = p2roundup(size, cct->_conf->bluefs_alloc_size);
5610
5611 PExtentVector extents_local;
5612 PExtentVector* extents = extents_out ? extents_out : &extents_local;
5613
5614
5615 uint64_t gift;
5616 uint64_t allocated = 0;
5617 int64_t alloc_len;
5618 do {
5619 // hard cap to fit into 32 bits
5620 gift = std::min<uint64_t>(size, 1ull << 31);
5621 dout(10) << __func__ << " gifting " << gift
5622 << " (" << byte_u_t(gift) << ")" << dendl;
5623
5624 alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
5625 0, 0, extents);
5626 if (alloc_len) {
5627 allocated += alloc_len;
5628 size -= alloc_len;
5629 }
5630
5631 if (alloc_len < (int64_t)gift && (min_size > allocated)) {
5632 derr << __func__
5633 << " failed to allocate on 0x" << std::hex << gift
5634 << " min_size 0x" << min_size
5635 << " > allocated total 0x" << allocated
5636 << " bluefs_alloc_size 0x" << cct->_conf->bluefs_alloc_size
5637 << " allocated 0x" << alloc_len
5638 << " available 0x " << alloc->get_free()
5639 << std::dec << dendl;
7c673cae 5640
494da23a 5641 _dump_alloc_on_failure();
11fdf7f2
TL
5642 alloc->release(*extents);
5643 extents->clear();
5644 return -ENOSPC;
5645 }
5646 } while (size && alloc_len > 0);
5647 for (auto& e : *extents) {
5648 dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
5649 bluefs_extents.insert(e.offset, e.length);
5650 ++out_of_sync_fm;
5651 // apply to bluefs if not requested from outside
5652 if (!extents_out) {
5653 bluefs->add_block_extent(bluefs_shared_bdev, e.offset, e.length);
5654 }
7c673cae
FG
5655 }
5656 }
7c673cae
FG
5657 return 0;
5658}
5659
11fdf7f2 5660int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
f64942e4 5661{
7c673cae
FG
5662 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
5663
5664 uint64_t my_free = alloc->get_free();
5665 uint64_t total = bdev->get_size();
5666 float my_free_ratio = (float)my_free / (float)total;
5667
5668 uint64_t total_free = bluefs_free + my_free;
5669
5670 float bluefs_ratio = (float)bluefs_free / (float)total_free;
5671
5672 dout(10) << __func__
1adf2230 5673 << " bluefs " << byte_u_t(bluefs_free)
7c673cae 5674 << " free (" << bluefs_free_ratio
1adf2230 5675 << ") bluestore " << byte_u_t(my_free)
7c673cae
FG
5676 << " free (" << my_free_ratio
5677 << "), bluefs_ratio " << bluefs_ratio
5678 << dendl;
5679
5680 uint64_t gift = 0;
5681 uint64_t reclaim = 0;
5682 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
5683 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
5684 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5685 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
1adf2230 5686 << ", should gift " << byte_u_t(gift) << dendl;
7c673cae
FG
5687 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
5688 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
5689 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
5690 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
5691 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5692 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
1adf2230 5693 << ", should reclaim " << byte_u_t(reclaim) << dendl;
7c673cae 5694 }
3efd9988
FG
5695
5696 // don't take over too much of the freespace
5697 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
7c673cae 5698 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
3efd9988 5699 cct->_conf->bluestore_bluefs_min < free_cap) {
7c673cae
FG
5700 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
5701 dout(10) << __func__ << " bluefs_total " << bluefs_total
5702 << " < min " << cct->_conf->bluestore_bluefs_min
1adf2230 5703 << ", should gift " << byte_u_t(g) << dendl;
7c673cae
FG
5704 if (g > gift)
5705 gift = g;
5706 reclaim = 0;
5707 }
11fdf7f2 5708 uint64_t min_free = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
3efd9988
FG
5709 if (bluefs_free < min_free &&
5710 min_free < free_cap) {
5711 uint64_t g = min_free - bluefs_free;
11fdf7f2 5712 dout(10) << __func__ << " bluefs_free " << bluefs_free
3efd9988 5713 << " < min " << min_free
1adf2230 5714 << ", should gift " << byte_u_t(g) << dendl;
3efd9988
FG
5715 if (g > gift)
5716 gift = g;
5717 reclaim = 0;
5718 }
11fdf7f2
TL
5719 ceph_assert((int64_t)gift >= 0);
5720 ceph_assert((int64_t)reclaim >= 0);
5721 return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
5722}
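// Worked example (editorial, assuming bluestore_bluefs_min_ratio = 0.02 and
// bluestore_bluefs_gift_ratio = 0.02, with the later absolute-minimum checks
// not triggering): bluefs_free = 1 GiB and bluestore free = 99 GiB give
// total_free = 100 GiB and bluefs_ratio = 0.01 < 0.02, so the function
// returns +2 GiB (gift = 0.02 * 100 GiB).  A ratio above
// bluestore_bluefs_max_ratio instead yields a negative value (a reclaim).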
7c673cae 5723
11fdf7f2
TL
5724int BlueStore::_balance_bluefs_freespace()
5725{
5726 int ret = 0;
5727 ceph_assert(bluefs);
7c673cae 5728
11fdf7f2
TL
5729 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
5730 bluefs->get_usage(&bluefs_usage);
5731 ceph_assert(bluefs_usage.size() > bluefs_shared_bdev);
7c673cae 5732
11fdf7f2
TL
5733 bool clear_alert = true;
5734 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
5735 auto& p = bluefs_usage[bluefs_shared_bdev];
5736 if (p.first != p.second) {
5737 auto& db = bluefs_usage[BlueFS::BDEV_DB];
5738 ostringstream ss;
5739 ss << "spilled over " << byte_u_t(p.second - p.first)
5740 << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
5741 << " used of " << byte_u_t(db.second) << ") to slow device";
5742 _set_spillover_alert(ss.str());
5743 clear_alert = false;
5744 }
5745 }
5746 if (clear_alert) {
5747 _clear_spillover_alert();
7c673cae
FG
5748 }
5749
11fdf7f2
TL
5750 // fixme: look at primary bdev only for now
5751 int64_t delta = _get_bluefs_size_delta(
5752 bluefs_usage[bluefs_shared_bdev].first,
5753 bluefs_usage[bluefs_shared_bdev].second);
5754
7c673cae 5755 // reclaim from bluefs?
11fdf7f2 5756 if (delta < 0) {
7c673cae 5757 // round up to alloc size
11fdf7f2 5758 auto reclaim = p2roundup(uint64_t(-delta), cct->_conf->bluefs_alloc_size);
7c673cae
FG
5759
5760 // hard cap to fit into 32 bits
11fdf7f2 5761 reclaim = std::min<uint64_t>(reclaim, 1ull << 31);
7c673cae 5762 dout(10) << __func__ << " reclaiming " << reclaim
1adf2230 5763 << " (" << byte_u_t(reclaim) << ")" << dendl;
7c673cae
FG
5764
5765 while (reclaim > 0) {
5766 // NOTE: this will block and do IO.
a8e16298 5767 PExtentVector extents;
7c673cae
FG
5768 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
5769 &extents);
5770 if (r < 0) {
5771 derr << __func__ << " failed to reclaim space from bluefs"
5772 << dendl;
5773 break;
5774 }
5775 for (auto e : extents) {
11fdf7f2 5776 ++out_of_sync_fm;
7c673cae
FG
5777 bluefs_extents.erase(e.offset, e.length);
5778 bluefs_extents_reclaiming.insert(e.offset, e.length);
5779 reclaim -= e.length;
5780 }
5781 }
5782
5783 ret = 1;
5784 }
5785
5786 return ret;
5787}
5788
7c673cae
FG
5789int BlueStore::_open_collections(int *errors)
5790{
28e407b8 5791 dout(10) << __func__ << dendl;
11fdf7f2 5792 ceph_assert(coll_map.empty());
7c673cae
FG
5793 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
5794 for (it->upper_bound(string());
5795 it->valid();
5796 it->next()) {
5797 coll_t cid;
5798 if (cid.parse(it->key())) {
5799 CollectionRef c(
5800 new Collection(
5801 this,
5802 cache_shards[cid.hash_to_shard(cache_shards.size())],
5803 cid));
5804 bufferlist bl = it->value();
11fdf7f2 5805 auto p = bl.cbegin();
7c673cae 5806 try {
11fdf7f2 5807 decode(c->cnode, p);
7c673cae
FG
5808 } catch (buffer::error& e) {
5809 derr << __func__ << " failed to decode cnode, key:"
5810 << pretty_binary_string(it->key()) << dendl;
5811 return -EIO;
5812 }
28e407b8
AA
5813 dout(20) << __func__ << " opened " << cid << " " << c
5814 << " " << c->cnode << dendl;
11fdf7f2 5815 _osr_attach(c.get());
7c673cae 5816 coll_map[cid] = c;
11fdf7f2 5817
7c673cae
FG
5818 } else {
5819 derr << __func__ << " unrecognized collection " << it->key() << dendl;
5820 if (errors)
5821 (*errors)++;
5822 }
5823 }
5824 return 0;
5825}
5826
224ce89b 5827void BlueStore::_open_statfs()
31f18b77 5828{
11fdf7f2
TL
5829 osd_pools.clear();
5830 vstatfs.reset();
5831
31f18b77 5832 bufferlist bl;
11fdf7f2 5833 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
31f18b77 5834 if (r >= 0) {
11fdf7f2 5835 per_pool_stat_collection = false;
31f18b77 5836 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
11fdf7f2 5837 auto it = bl.cbegin();
31f18b77 5838 vstatfs.decode(it);
11fdf7f2 5839 dout(10) << __func__ << " store_statfs is found" << dendl;
224ce89b 5840 } else {
31f18b77
FG
5841 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
5842 }
81eedcae 5843 _check_legacy_statfs_alert();
11fdf7f2
TL
5844 } else if (cct->_conf->bluestore_no_per_pool_stats_tolerance == "enforce") {
5845 per_pool_stat_collection = false;
5846 dout(10) << __func__ << " store_statfs is requested but missing, using empty" << dendl;
5847 } else {
5848 per_pool_stat_collection = true;
5849 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
5850 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
5851 for (it->upper_bound(string());
5852 it->valid();
5853 it->next()) {
5854
5855 uint64_t pool_id;
5856 int r = get_key_pool_stat(it->key(), &pool_id);
5857 ceph_assert(r == 0);
5858
5859 bufferlist bl;
5860 bl = it->value();
5861 auto p = bl.cbegin();
5862 auto& st = osd_pools[pool_id];
5863 try {
5864 st.decode(p);
5865 vstatfs += st;
5866
5867 dout(30) << __func__ << " pool " << pool_id
5868 << " statfs " << st << dendl;
5869 } catch (buffer::error& e) {
5870 derr << __func__ << " failed to decode pool stats, key:"
5871 << pretty_binary_string(it->key()) << dendl;
5872 }
5873 }
31f18b77 5874 }
11fdf7f2
TL
5875 dout(30) << __func__ << " statfs " << vstatfs << dendl;
5876
31f18b77
FG
5877}
5878
7c673cae
FG
5879int BlueStore::_setup_block_symlink_or_file(
5880 string name,
5881 string epath,
5882 uint64_t size,
5883 bool create)
5884{
5885 dout(20) << __func__ << " name " << name << " path " << epath
5886 << " size " << size << " create=" << (int)create << dendl;
5887 int r = 0;
91327a77 5888 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
5889 if (create)
5890 flags |= O_CREAT;
5891 if (epath.length()) {
5892 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
5893 if (r < 0) {
5894 r = -errno;
5895 derr << __func__ << " failed to create " << name << " symlink to "
5896 << epath << ": " << cpp_strerror(r) << dendl;
5897 return r;
5898 }
5899
5900 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
5901 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
5902 if (fd < 0) {
5903 r = -errno;
5904 derr << __func__ << " failed to open " << epath << " file: "
5905 << cpp_strerror(r) << dendl;
5906 return r;
5907 }
11fdf7f2
TL
5908 // write the Transport ID of the NVMe device
5909 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
5910 // where "0000:02:00.0" is the selector of a PCI device, see
5911 // the first column of "lspci -mm -n -D"
5912 string trid{"trtype:PCIe "};
5913 trid += "traddr:";
5914 trid += epath.substr(strlen(SPDK_PREFIX));
5915 r = ::write(fd, trid.c_str(), trid.size());
5916 ceph_assert(r == static_cast<int>(trid.size()));
7c673cae
FG
5917 dout(1) << __func__ << " created " << name << " symlink to "
5918 << epath << dendl;
5919 VOID_TEMP_FAILURE_RETRY(::close(fd));
5920 }
5921 }
5922 if (size) {
5923 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
5924 if (fd >= 0) {
5925 // block file is present
5926 struct stat st;
5927 int r = ::fstat(fd, &st);
5928 if (r == 0 &&
5929 S_ISREG(st.st_mode) && // if it is a regular file
5930 st.st_size == 0) { // and is 0 bytes
5931 r = ::ftruncate(fd, size);
5932 if (r < 0) {
5933 r = -errno;
5934 derr << __func__ << " failed to resize " << name << " file to "
5935 << size << ": " << cpp_strerror(r) << dendl;
5936 VOID_TEMP_FAILURE_RETRY(::close(fd));
5937 return r;
5938 }
5939
5940 if (cct->_conf->bluestore_block_preallocate_file) {
28e407b8
AA
5941 r = ::ceph_posix_fallocate(fd, 0, size);
5942 if (r > 0) {
7c673cae
FG
5943 derr << __func__ << " failed to prefallocate " << name << " file to "
5944 << size << ": " << cpp_strerror(r) << dendl;
5945 VOID_TEMP_FAILURE_RETRY(::close(fd));
5946 return -r;
5947 }
7c673cae
FG
5948 }
5949 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 5950 << byte_u_t(size) << dendl;
7c673cae
FG
5951 }
5952 VOID_TEMP_FAILURE_RETRY(::close(fd));
5953 } else {
5954 int r = -errno;
5955 if (r != -ENOENT) {
5956 derr << __func__ << " failed to open " << name << " file: "
5957 << cpp_strerror(r) << dendl;
5958 return r;
5959 }
5960 }
5961 }
5962 return 0;
5963}
5964
5965int BlueStore::mkfs()
5966{
5967 dout(1) << __func__ << " path " << path << dendl;
5968 int r;
5969 uuid_d old_fsid;
5970
5971 {
5972 string done;
5973 r = read_meta("mkfs_done", &done);
5974 if (r == 0) {
5975 dout(1) << __func__ << " already created" << dendl;
5976 if (cct->_conf->bluestore_fsck_on_mkfs) {
5977 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5978 if (r < 0) {
5979 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
5980 << dendl;
5981 return r;
5982 }
5983 if (r > 0) {
5984 derr << __func__ << " fsck found " << r << " errors" << dendl;
5985 r = -EIO;
5986 }
5987 }
5988 return r; // idempotent
5989 }
5990 }
5991
5992 {
5993 string type;
5994 r = read_meta("type", &type);
5995 if (r == 0) {
5996 if (type != "bluestore") {
5997 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5998 return -EIO;
5999 }
6000 } else {
6001 r = write_meta("type", "bluestore");
6002 if (r < 0)
6003 return r;
6004 }
6005 }
6006
6007 freelist_type = "bitmap";
6008
6009 r = _open_path();
6010 if (r < 0)
6011 return r;
6012
6013 r = _open_fsid(true);
6014 if (r < 0)
6015 goto out_path_fd;
6016
6017 r = _lock_fsid();
6018 if (r < 0)
6019 goto out_close_fsid;
6020
6021 r = _read_fsid(&old_fsid);
6022 if (r < 0 || old_fsid.is_zero()) {
6023 if (fsid.is_zero()) {
6024 fsid.generate_random();
6025 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6026 } else {
6027 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6028 }
6029 // we'll write it later.
6030 } else {
6031 if (!fsid.is_zero() && fsid != old_fsid) {
6032 derr << __func__ << " on-disk fsid " << old_fsid
6033 << " != provided " << fsid << dendl;
6034 r = -EINVAL;
6035 goto out_close_fsid;
6036 }
6037 fsid = old_fsid;
6038 }
6039
6040 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6041 cct->_conf->bluestore_block_size,
6042 cct->_conf->bluestore_block_create);
6043 if (r < 0)
6044 goto out_close_fsid;
6045 if (cct->_conf->bluestore_bluefs) {
6046 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6047 cct->_conf->bluestore_block_wal_size,
6048 cct->_conf->bluestore_block_wal_create);
6049 if (r < 0)
6050 goto out_close_fsid;
6051 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6052 cct->_conf->bluestore_block_db_size,
6053 cct->_conf->bluestore_block_db_create);
6054 if (r < 0)
6055 goto out_close_fsid;
6056 }
6057
6058 r = _open_bdev(true);
6059 if (r < 0)
6060 goto out_close_fsid;
6061
3efd9988
FG
6062 // choose min_alloc_size
6063 if (cct->_conf->bluestore_min_alloc_size) {
6064 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6065 } else {
11fdf7f2 6066 ceph_assert(bdev);
3efd9988
FG
6067 if (bdev->is_rotational()) {
6068 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6069 } else {
6070 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6071 }
6072 }
11fdf7f2 6073 _validate_bdev();
3efd9988
FG
6074
6075 // make sure min_alloc_size is power of 2 aligned.
11fdf7f2 6076 if (!isp2(min_alloc_size)) {
3efd9988
FG
6077 derr << __func__ << " min_alloc_size 0x"
6078 << std::hex << min_alloc_size << std::dec
6079 << " is not power of 2 aligned!"
6080 << dendl;
6081 r = -EINVAL;
6082 goto out_close_bdev;
6083 }
6084
7c673cae
FG
6085 r = _open_db(true);
6086 if (r < 0)
6087 goto out_close_bdev;
6088
7c673cae
FG
6089 {
6090 KeyValueDB::Transaction t = db->get_transaction();
11fdf7f2
TL
6091 r = _open_fm(t);
6092 if (r < 0)
6093 goto out_close_db;
7c673cae
FG
6094 {
6095 bufferlist bl;
11fdf7f2 6096 encode((uint64_t)0, bl);
7c673cae
FG
6097 t->set(PREFIX_SUPER, "nid_max", bl);
6098 t->set(PREFIX_SUPER, "blobid_max", bl);
6099 }
6100
7c673cae
FG
6101 {
6102 bufferlist bl;
11fdf7f2 6103 encode((uint64_t)min_alloc_size, bl);
7c673cae
FG
6104 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6105 }
6106
6107 ondisk_format = latest_ondisk_format;
6108 _prepare_ondisk_format_super(t);
6109 db->submit_transaction_sync(t);
6110 }
6111
7c673cae
FG
6112 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6113 if (r < 0)
224ce89b
WB
6114 goto out_close_fm;
6115
3efd9988 6116 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 6117 if (r < 0)
224ce89b 6118 goto out_close_fm;
7c673cae
FG
6119
6120 if (fsid != old_fsid) {
6121 r = _write_fsid();
6122 if (r < 0) {
6123 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 6124 goto out_close_fm;
7c673cae
FG
6125 }
6126 }
6127
11fdf7f2
TL
6128 if (out_of_sync_fm.fetch_and(0)) {
6129 _sync_bluefs_and_fm();
6130 }
6131
7c673cae
FG
6132 out_close_fm:
6133 _close_fm();
6134 out_close_db:
6135 _close_db();
6136 out_close_bdev:
6137 _close_bdev();
6138 out_close_fsid:
6139 _close_fsid();
6140 out_path_fd:
6141 _close_path();
6142
6143 if (r == 0 &&
6144 cct->_conf->bluestore_fsck_on_mkfs) {
6145 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6146 if (rc < 0)
6147 return rc;
6148 if (rc > 0) {
6149 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6150 r = -EIO;
6151 }
11fdf7f2
TL
6152 }
6153
6154 if (r == 0) {
6155 // indicate success by writing the 'mkfs_done' file
6156 r = write_meta("mkfs_done", "yes");
6157 }
6158
6159 if (r < 0) {
6160 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6161 } else {
6162 dout(0) << __func__ << " success" << dendl;
6163 }
6164 return r;
6165}
6166
6167int BlueStore::_mount_for_bluefs()
6168{
6169 int r = _open_path();
6170 ceph_assert(r == 0);
6171 r = _open_fsid(false);
6172 ceph_assert(r == 0);
6173 r = _read_fsid(&fsid);
6174 ceph_assert(r == 0);
6175 r = _lock_fsid();
6176 ceph_assert(r == 0);
6177 r = _open_bluefs(false);
6178 ceph_assert(r == 0);
6179 return r;
6180}
6181
6182void BlueStore::_umount_for_bluefs()
6183{
6184 _close_bluefs();
6185 _close_fsid();
6186 _close_path();
6187}
6188
6189int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6190{
6191 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6192 int r;
6193 ceph_assert(path_fd < 0);
6194
6195 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6196
6197 if (!cct->_conf->bluestore_bluefs) {
6198 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6199 return -EIO;
6200 }
6201
6202 r = _mount_for_bluefs();
6203
6204 int reserved = 0;
6205 if (id == BlueFS::BDEV_NEWWAL) {
6206 string p = path + "/block.wal";
6207 r = _setup_block_symlink_or_file("block.wal", dev_path,
6208 cct->_conf->bluestore_block_wal_size,
6209 true);
6210 ceph_assert(r == 0);
6211
6212 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
6213 cct->_conf->bdev_enable_discard);
6214 ceph_assert(r == 0);
6215
6216 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6217 r = _check_or_set_bdev_label(
6218 p,
6219 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6220 "bluefs wal",
6221 true);
6222 ceph_assert(r == 0);
6223 }
6224
6225 reserved = BDEV_LABEL_BLOCK_SIZE;
6226 } else if (id == BlueFS::BDEV_NEWDB) {
6227 string p = path + "/block.db";
6228 r = _setup_block_symlink_or_file("block.db", dev_path,
6229 cct->_conf->bluestore_block_db_size,
6230 true);
6231 ceph_assert(r == 0);
6232
6233 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
6234 cct->_conf->bdev_enable_discard);
6235 ceph_assert(r == 0);
6236
6237 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6238 r = _check_or_set_bdev_label(
6239 p,
6240 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6241 "bluefs db",
6242 true);
6243 ceph_assert(r == 0);
6244 }
6245 reserved = SUPER_RESERVED;
6246 }
6247
6248 bluefs->umount();
6249 bluefs->mount();
6250
6251 bluefs->add_block_extent(
6252 id,
6253 reserved,
6254 bluefs->get_block_device_size(id) - reserved);
6255
6256 r = bluefs->prepare_new_device(id);
6257 ceph_assert(r == 0);
6258
6259 if (r < 0) {
6260 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6261 } else {
6262 dout(0) << __func__ << " success" << dendl;
6263 }
6264
6265 _umount_for_bluefs();
6266 return r;
6267}
6268
6269int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6270 int id)
6271{
6272 dout(10) << __func__ << " id:" << id << dendl;
6273 ceph_assert(path_fd < 0);
6274
6275 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6276
6277 if (!cct->_conf->bluestore_bluefs) {
6278 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6279 return -EIO;
6280 }
6281
6282 int r = _mount_for_bluefs();
6283
6284 // require bluestore_bluefs_min_free to be free at target device!
6285 uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6286 for(auto src_id : devs_source) {
6287 used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
6288 }
6289 uint64_t target_free = bluefs->get_free(id);
6290 if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
6291 // will need to remount full BlueStore instance to allocate more space
6292 _umount_for_bluefs();
6293
6294 r = mount();
6295 ceph_assert(r == 0);
6296 dout(1) << __func__
6297 << " Allocating more space at slow device for BlueFS: +"
6298 << used_space - target_free << " bytes" << dendl;
6299 r = allocate_bluefs_freespace(
6300 used_space - target_free,
6301 used_space - target_free,
6302 nullptr);
6303
6304 umount();
6305 if (r != 0) {
6306 derr << __func__
6307 << " can't migrate, unable to allocate extra space: "
6308 << used_space - target_free << " at target:" << id
6309 << dendl;
6310 return -ENOSPC;
6311 }
6312
6313 r = _mount_for_bluefs();
6314 ceph_assert(r == 0);
6315 } else if (target_free < used_space) {
6316 derr << __func__
6317 << " can't migrate, free space at target: " << target_free
6318 << " is less than required space: " << used_space
6319 << dendl;
6320 return -ENOSPC;
6321 }
6322 r = bluefs->device_migrate_to_existing(cct, devs_source, id);
6323 if (r < 0) {
6324 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6325 goto shutdown;
6326 }
6327
6328 if (devs_source.count(BlueFS::BDEV_DB)) {
6329 r = unlink(string(path + "/block.db").c_str());
6330 ceph_assert(r == 0);
6331 }
6332 if (devs_source.count(BlueFS::BDEV_WAL)) {
6333 r = unlink(string(path + "/block.wal").c_str());
6334 ceph_assert(r == 0);
6335 }
6336
6337shutdown:
6338 _umount_for_bluefs();
6339 return r;
6340}
6341
6342int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
6343 int id,
6344 const string& dev_path)
6345{
6346 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6347 int r;
6348 ceph_assert(path_fd < 0);
6349
6350 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6351
6352 if (!cct->_conf->bluestore_bluefs) {
6353 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6354 return -EIO;
6355 }
6356
6357 r = _mount_for_bluefs();
6358
6359 int reserved = 0;
6360 string link_db;
6361 string link_wal;
6362 if (devs_source.count(BlueFS::BDEV_DB) &&
6363 bluefs_shared_bdev != BlueFS::BDEV_DB) {
6364 link_db = path + "/block.db";
6365 }
6366 if (devs_source.count(BlueFS::BDEV_WAL)) {
6367 link_wal = path + "/block.wal";
6368 }
6369
6370 size_t target_size;
6371 string target_name;
6372 if (id == BlueFS::BDEV_NEWWAL) {
6373 target_name = "block.wal";
6374 target_size = cct->_conf->bluestore_block_wal_size;
6375
6376 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
6377 cct->_conf->bdev_enable_discard);
6378 ceph_assert(r == 0);
6379
6380 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6381 r = _check_or_set_bdev_label(
6382 dev_path,
6383 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6384 "bluefs wal",
6385 true);
6386 ceph_assert(r == 0);
6387 }
6388 reserved = BDEV_LABEL_BLOCK_SIZE;
6389 } else if (id == BlueFS::BDEV_NEWDB) {
6390 target_name = "block.db";
6391 target_size = cct->_conf->bluestore_block_db_size;
31f18b77 6392
11fdf7f2
TL
6393 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
6394 cct->_conf->bdev_enable_discard);
6395 ceph_assert(r == 0);
6396
6397 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6398 r = _check_or_set_bdev_label(
6399 dev_path,
6400 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6401 "bluefs db",
6402 true);
6403 ceph_assert(r == 0);
6404 }
6405 reserved = SUPER_RESERVED;
31f18b77
FG
6406 }
6407
11fdf7f2
TL
6408 bluefs->umount();
6409 bluefs->mount();
6410
6411 bluefs->add_block_extent(
6412 id, reserved, bluefs->get_block_device_size(id) - reserved);
6413
6414 r = bluefs->device_migrate_to_new(cct, devs_source, id);
6415
7c673cae 6416 if (r < 0) {
11fdf7f2
TL
6417 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6418 goto shutdown;
6419 }
6420
6421 if (!link_db.empty()) {
6422 r = unlink(link_db.c_str());
6423 ceph_assert(r == 0);
6424 }
6425 if (!link_wal.empty()) {
6426 r = unlink(link_wal.c_str());
6427 ceph_assert(r == 0);
6428 }
6429 r = _setup_block_symlink_or_file(
6430 target_name,
6431 dev_path,
6432 target_size,
6433 true);
6434 ceph_assert(r == 0);
6435 dout(0) << __func__ << " success" << dendl;
6436
6437shutdown:
6438 _umount_for_bluefs();
6439 return r;
6440}
6441
6442string BlueStore::get_device_path(unsigned id)
6443{
6444 string res;
6445 if (id < BlueFS::MAX_BDEV) {
6446 switch (id) {
6447 case BlueFS::BDEV_WAL:
6448 res = path + "/block.wal";
6449 break;
6450 case BlueFS::BDEV_DB:
6451 if (id == bluefs_shared_bdev) {
6452 res = path + "/block";
6453 } else {
6454 res = path + "/block.db";
6455 }
6456 break;
6457 case BlueFS::BDEV_SLOW:
6458 res = path + "/block";
6459 break;
6460 }
6461 }
6462 return res;
6463}
6464
6465int BlueStore::expand_devices(ostream& out)
6466{
6467 int r = _mount(false);
6468 ceph_assert(r == 0);
6469 bluefs->dump_block_extents(out);
6470 out << "Expanding..." << std::endl;
6471 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
6472 if (devid == bluefs_shared_bdev ) {
6473 continue;
6474 }
6475 uint64_t size = bluefs->get_block_device_size(devid);
6476 if (size == 0) {
6477 // no bdev
6478 continue;
6479 }
6480
6481 interval_set<uint64_t> before;
6482 bluefs->get_block_extents(devid, &before);
6483 ceph_assert(!before.empty());
6484 uint64_t end = before.range_end();
6485 if (end < size) {
6486 out << devid
6487 <<" : expanding " << " from 0x" << std::hex
6488 << end << " to 0x" << size << std::dec << std::endl;
6489 bluefs->add_block_extent(devid, end, size-end);
6490 string p = get_device_path(devid);
6491 const char* path = p.c_str();
6492 if (path == nullptr) {
6493 derr << devid
6494 <<": can't find device path " << dendl;
6495 continue;
6496 }
6497 bluestore_bdev_label_t label;
6498 int r = _read_bdev_label(cct, path, &label);
6499 if (r < 0) {
6500 derr << "unable to read label for " << path << ": "
6501 << cpp_strerror(r) << dendl;
6502 continue;
6503 }
6504 label.size = size;
6505 r = _write_bdev_label(cct, path, label);
6506 if (r < 0) {
6507 derr << "unable to write label for " << path << ": "
6508 << cpp_strerror(r) << dendl;
6509 continue;
6510 }
6511 out << devid
6512 <<" : size label updated to " << size
6513 << std::endl;
6514 }
6515 }
6516 uint64_t size0 = fm->get_size();
6517 uint64_t size = bdev->get_size();
6518 if (size0 < size) {
6519 out << bluefs_shared_bdev
6520 <<" : expanding " << " from 0x" << std::hex
6521 << size0 << " to 0x" << size << std::dec << std::endl;
6522 KeyValueDB::Transaction txn;
6523 txn = db->get_transaction();
6524 int r = fm->expand(size, txn);
6525 ceph_assert(r == 0);
6526 db->submit_transaction_sync(txn);
6527
6528 // always reference to slow device here
6529 string p = get_device_path(BlueFS::BDEV_SLOW);
6530 ceph_assert(!p.empty());
6531 const char* path = p.c_str();
6532 bluestore_bdev_label_t label;
6533 r = _read_bdev_label(cct, path, &label);
6534 if (r < 0) {
6535 derr << "unable to read label for " << path << ": "
6536 << cpp_strerror(r) << dendl;
6537 } else {
6538 label.size = size;
6539 r = _write_bdev_label(cct, path, label);
6540 if (r < 0) {
6541 derr << "unable to write label for " << path << ": "
6542 << cpp_strerror(r) << dendl;
6543 } else {
6544 out << bluefs_shared_bdev
6545 <<" : size label updated to " << size
6546 << std::endl;
6547 }
6548 }
7c673cae 6549 }
11fdf7f2 6550 umount();
7c673cae
FG
6551 return r;
6552}
6553
6554void BlueStore::set_cache_shards(unsigned num)
6555{
6556 dout(10) << __func__ << " " << num << dendl;
6557 size_t old = cache_shards.size();
11fdf7f2 6558 ceph_assert(num >= old);
7c673cae
FG
6559 cache_shards.resize(num);
6560 for (unsigned i = old; i < num; ++i) {
6561 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
6562 logger);
6563 }
6564}
6565
11fdf7f2 6566int BlueStore::_mount(bool kv_only, bool open_db)
7c673cae
FG
6567{
6568 dout(1) << __func__ << " path " << path << dendl;
6569
3efd9988
FG
6570 _kv_only = kv_only;
6571
7c673cae
FG
6572 {
6573 string type;
6574 int r = read_meta("type", &type);
6575 if (r < 0) {
6576 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
6577 << dendl;
6578 return r;
6579 }
6580
6581 if (type != "bluestore") {
6582 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6583 return -EIO;
6584 }
6585 }
6586
6587 if (cct->_conf->bluestore_fsck_on_mount) {
6588 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
6589 if (rc < 0)
6590 return rc;
6591 if (rc > 0) {
6592 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6593 return -EIO;
6594 }
6595 }
6596
6597 int r = _open_path();
6598 if (r < 0)
6599 return r;
6600 r = _open_fsid(false);
6601 if (r < 0)
6602 goto out_path;
6603
6604 r = _read_fsid(&fsid);
6605 if (r < 0)
6606 goto out_fsid;
6607
6608 r = _lock_fsid();
6609 if (r < 0)
6610 goto out_fsid;
6611
6612 r = _open_bdev(false);
6613 if (r < 0)
6614 goto out_fsid;
6615
11fdf7f2
TL
6616 if (open_db) {
6617 r = _open_db_and_around(false);
6618 } else {
6619 // bypassing the full db open is only allowed in kv_only mode
6620 ceph_assert(kv_only);
6621 r = _open_db(false, true);
6622 if (r < 0)
6623 goto out_bdev;
6624 }
7c673cae
FG
6625
6626 if (kv_only)
6627 return 0;
6628
11fdf7f2
TL
6629 r = _upgrade_super();
6630 if (r < 0) {
7c673cae 6631 goto out_db;
11fdf7f2 6632 }
7c673cae
FG
6633
6634 r = _open_collections();
6635 if (r < 0)
11fdf7f2 6636 goto out_db;
7c673cae
FG
6637
6638 r = _reload_logger();
6639 if (r < 0)
6640 goto out_coll;
6641
31f18b77 6642 _kv_start();
7c673cae
FG
6643
6644 r = _deferred_replay();
6645 if (r < 0)
6646 goto out_stop;
6647
6648 mempool_thread.init();
6649
7c673cae
FG
6650 mounted = true;
6651 return 0;
6652
6653 out_stop:
6654 _kv_stop();
7c673cae 6655 out_coll:
31f18b77 6656 _flush_cache();
7c673cae 6657 out_db:
11fdf7f2 6658 _close_db_and_around();
7c673cae
FG
6659 out_bdev:
6660 _close_bdev();
6661 out_fsid:
6662 _close_fsid();
6663 out_path:
6664 _close_path();
6665 return r;
6666}
6667
6668int BlueStore::umount()
6669{
11fdf7f2 6670 ceph_assert(_kv_only || mounted);
7c673cae
FG
6671 dout(1) << __func__ << dendl;
6672
6673 _osr_drain_all();
7c673cae 6674
7c673cae 6675 mounted = false;
3efd9988
FG
6676 if (!_kv_only) {
6677 mempool_thread.shutdown();
6678 dout(20) << __func__ << " stopping kv thread" << dendl;
6679 _kv_stop();
3efd9988
FG
6680 _flush_cache();
6681 dout(20) << __func__ << " closing" << dendl;
6682
3efd9988 6683 }
11fdf7f2 6684 _close_db_and_around();
7c673cae
FG
6685 _close_bdev();
6686 _close_fsid();
6687 _close_path();
6688
6689 if (cct->_conf->bluestore_fsck_on_umount) {
6690 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
6691 if (rc < 0)
6692 return rc;
6693 if (rc > 0) {
6694 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6695 return -EIO;
6696 }
6697 }
6698 return 0;
6699}
6700
6701static void apply(uint64_t off,
6702 uint64_t len,
6703 uint64_t granularity,
6704 BlueStore::mempool_dynamic_bitset &bitset,
7c673cae
FG
6705 std::function<void(uint64_t,
6706 BlueStore::mempool_dynamic_bitset &)> f) {
11fdf7f2 6707 auto end = round_up_to(off + len, granularity);
7c673cae
FG
6708 while (off < end) {
6709 uint64_t pos = off / granularity;
6710 f(pos, bitset);
6711 off += granularity;
6712 }
6713}
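// Example (editorial): apply(0x3000, 0x2000, 0x1000, bs, f) rounds the end up
// to 0x5000 and invokes f for bitmap positions 3 and 4 -- one call per
// granularity-sized unit touched by the [off, off + len) range.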
6714
6715int BlueStore::_fsck_check_extents(
11fdf7f2 6716 const coll_t& cid,
7c673cae
FG
6717 const ghobject_t& oid,
6718 const PExtentVector& extents,
6719 bool compressed,
6720 mempool_dynamic_bitset &used_blocks,
b32b8144 6721 uint64_t granularity,
11fdf7f2 6722 BlueStoreRepairer* repairer,
7c673cae
FG
6723 store_statfs_t& expected_statfs)
6724{
6725 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
6726 int errors = 0;
6727 for (auto e : extents) {
6728 if (!e.is_valid())
6729 continue;
6730 expected_statfs.allocated += e.length;
6731 if (compressed) {
11fdf7f2 6732 expected_statfs.data_compressed_allocated += e.length;
7c673cae
FG
6733 }
6734 bool already = false;
6735 apply(
b32b8144 6736 e.offset, e.length, granularity, used_blocks,
7c673cae 6737 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2
TL
6738 ceph_assert(pos < bs.size());
6739 if (bs.test(pos)) {
6740 if (repairer) {
6741 repairer->note_misreference(
6742 pos * min_alloc_size, min_alloc_size, !already);
6743 }
6744 if (!already) {
6745 derr << "fsck error: " << oid << " extent " << e
6746 << " or a subset is already allocated (misreferenced)" << dendl;
6747 ++errors;
6748 already = true;
6749 }
6750 }
7c673cae
FG
6751 else
6752 bs.set(pos);
6753 });
11fdf7f2
TL
6754 if (repairer) {
6755 repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid);
6756 }
6757
7c673cae 6758 if (e.end() > bdev->get_size()) {
11fdf7f2 6759 derr << "fsck error: " << oid << " extent " << e
7c673cae
FG
6760 << " past end of block device" << dendl;
6761 ++errors;
6762 }
6763 }
6764 return errors;
6765}
6766
11fdf7f2
TL
6767void BlueStore::_fsck_check_pool_statfs(
6768 BlueStore::per_pool_statfs& expected_pool_statfs,
6769 bool need_per_pool_stats,
6770 int& errors,
6771 BlueStoreRepairer* repairer)
6772{
6773 auto it = db->get_iterator(PREFIX_STAT);
6774 if (it) {
6775 for (it->lower_bound(string()); it->valid(); it->next()) {
6776 string key = it->key();
6777 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
6778 if (repairer) {
6779 if (need_per_pool_stats) {
6780 ++errors;
6781 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
6782 derr << "fsck error: " << "legacy statfs record found, removing" << dendl;
6783 } else {
6784 derr << "fsck warning: " << "legacy statfs record found, bypassing" << dendl;
6785 }
6786 } else {
6787 const char* s = "fsck warning: ";
6788 if (need_per_pool_stats) {
6789 ++errors;
6790 s = "fsck error: ";
6791 }
6792 derr << s << "legacy statfs record found, suggest to "
6793 "run store repair to get consistent statistic reports"
6794 << dendl;
6795 }
6796 continue;
6797 }
6798 if (!need_per_pool_stats) {
6799 continue;
6800 }
6801 uint64_t pool_id;
6802 if (get_key_pool_stat(key, &pool_id) < 0) {
6803 derr << "fsck error: bad key " << key
6804 << "in statfs namespece" << dendl;
6805 if (repairer) {
6806 repairer->remove_key(db, PREFIX_STAT, key);
6807 }
6808 ++errors;
6809 continue;
6810 }
6811
6812 volatile_statfs vstatfs;
6813 bufferlist bl = it->value();
6814 auto blp = bl.cbegin();
6815 try {
6816 vstatfs.decode(blp);
6817 } catch (buffer::error& e) {
6818 derr << "fsck error: failed to decode Pool StatFS record"
6819 << pretty_binary_string(key) << dendl;
6820 if (repairer) {
6821 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
6822 << pretty_binary_string(key)
6823 << "', removing" << dendl;
6824 repairer->remove_key(db, PREFIX_STAT, key);
6825 }
6826 ++errors;
6827 vstatfs.reset();
6828 }
6829 auto stat_it = expected_pool_statfs.find(pool_id);
6830 if (stat_it == expected_pool_statfs.end()) {
6831 if (vstatfs.is_empty()) {
6832 // we don't consider that an error since empty pool statfs
6833 // records are left in the DB for now
6834 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
6835 << std::hex << pool_id << std::dec << dendl;
6836 if (repairer) {
6837 // but we need to increment error count in case of repair
6838 // to have proper counters at the end
6839 // (as repairer increments recovery counter anyway).
6840 ++errors;
6841 }
6842 } else {
6843 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
6844 << std::hex << pool_id << std::dec << dendl;
6845 ++errors;
6846 }
6847 if (repairer) {
6848 repairer->remove_key(db, PREFIX_SHARED_BLOB, key);
6849 }
6850 continue;
6851 }
6852 store_statfs_t statfs;
6853 vstatfs.publish(&statfs);
6854 if (!(stat_it->second == statfs)) {
6855 derr << "fsck error: actual " << statfs
6856 << " != expected " << stat_it->second
6857 << " for pool "
6858 << std::hex << pool_id << std::dec << dendl;
6859 if (repairer) {
6860 repairer->fix_statfs(db, key, stat_it->second);
6861 }
6862 ++errors;
6863 }
6864 expected_pool_statfs.erase(stat_it);
6865 }
6866 } // if (it)
6867 for( auto s = expected_pool_statfs.begin(); s != expected_pool_statfs.end();
6868 ++s) {
6869 if (s->second.is_zero()) {
6870 // we might lack empty statfs recs in DB
6871 continue;
6872 }
6873 derr << "fsck error: missing Pool StatFS record for pool "
6874 << std::hex << s->first << std::dec << dendl;
6875 if (repairer) {
6876 string key;
6877 get_pool_stat_key(s->first, &key);
6878 repairer->fix_statfs(db, key, s->second);
6879 }
6880 ++errors;
6881 }
6882}
6883
6884/**
6885An overview of the currently implemented repair logic,
6886performed in fsck in two stages: detection (+ preparation) and commit.
6887Detection stage (in processing order):
6888 (Issue -> Repair action to schedule)
6889 - Detect undecodable keys for Shared Blobs -> Remove
6890 - Detect undecodable records for Shared Blobs -> Remove
6891 (might trigger missed Shared Blob detection below)
6892 - Detect stray records for Shared Blobs -> Remove
6893 - Detect misreferenced pextents -> Fix
6894 Prepare Bloom-like filter to track cid/oid -> pextent
6895 Prepare list of extents that are improperly referenced
6896 Enumerate Onode records that might use 'misreferenced' pextents
6897 (Bloom-like filter applied to reduce computation)
6898 For each questionable Onode enumerate all blobs and identify broken ones
6899 (i.e. blobs having 'misreferences')
6900 Rewrite each broken blob data by allocating another extents and
6901 copying data there
6902 If blob is shared - unshare it and mark corresponding Shared Blob
6903 for removal
6904 Release previously allocated space
6905 Update Extent Map
6906 - Detect missed Shared Blobs -> Recreate
6907 - Detect undecodable deferred transaction -> Remove
6908 - Detect Freelist Manager's 'false free' entries -> Mark as used
6909 - Detect Freelist Manager's leaked entries -> Mark as free
6910 - Detect statfs inconsistency - Update
6911 Commit stage (separate DB commit per each step):
6912 - Apply leaked FM entries fix
6913 - Apply 'false free' FM entries fix
6914 - Apply 'Remove' actions
6915 - Apply fix for misreference pextents
6916 - Apply Shared Blob recreate
6917 (can be merged with the step above if misreferences were detected)
6918 - Apply StatFS update
6919*/
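/*
 * Editorial sketch (not part of the original source): the detect/commit split
 * as it shows up below -- during the scan fsck only *schedules* repairs, e.g.
 *
 *   if (repairer)
 *     repairer->remove_key(db, PREFIX_STAT, key);   // queued, not applied yet
 *   ++errors;
 *
 * and the queued fixes are then committed in separate DB transactions once
 * scanning finishes, following the commit-stage order listed above.
 */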
3efd9988 6920int BlueStore::_fsck(bool deep, bool repair)
7c673cae 6921{
3efd9988 6922 dout(1) << __func__
11fdf7f2
TL
6923 << " <<<START>>>"
6924 << (repair ? " repair" : " check")
3efd9988 6925 << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
7c673cae 6926 int errors = 0;
11fdf7f2 6927 unsigned repaired = 0;
31f18b77
FG
6928
6929 typedef btree::btree_set<
6930 uint64_t,std::less<uint64_t>,
6931 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
6932 uint64_t_btree_t used_nids;
6933 uint64_t_btree_t used_omap_head;
11fdf7f2 6934 uint64_t_btree_t used_pgmeta_omap_head;
31f18b77
FG
6935 uint64_t_btree_t used_sbids;
6936
7c673cae 6937 mempool_dynamic_bitset used_blocks;
7c673cae 6938 KeyValueDB::Iterator it;
11fdf7f2
TL
6939 store_statfs_t expected_store_statfs, actual_statfs;
6940 per_pool_statfs expected_pool_statfs;
6941
7c673cae 6942 struct sb_info_t {
11fdf7f2
TL
6943 coll_t cid;
6944 int64_t pool_id = INT64_MIN;
7c673cae
FG
6945 list<ghobject_t> oids;
6946 SharedBlobRef sb;
6947 bluestore_extent_ref_map_t ref_map;
11fdf7f2
TL
6948 bool compressed = false;
6949 bool passed = false;
6950 bool updated = false;
7c673cae
FG
6951 };
6952 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
6953
6954 uint64_t num_objects = 0;
6955 uint64_t num_extents = 0;
6956 uint64_t num_blobs = 0;
6957 uint64_t num_spanning_blobs = 0;
6958 uint64_t num_shared_blobs = 0;
6959 uint64_t num_sharded_objects = 0;
6960 uint64_t num_object_shards = 0;
11fdf7f2
TL
6961 BlueStoreRepairer repairer;
6962 store_statfs_t* expected_statfs = nullptr;
6963 // in deep mode we need R/W access to be able to replay deferred ops
6964 bool read_only = !(repair || deep);
7c673cae
FG
6965
6966 utime_t start = ceph_clock_now();
11fdf7f2
TL
6967 const auto& no_pps_mode = cct->_conf->bluestore_no_per_pool_stats_tolerance;
6968 bool need_per_pool_stats = no_pps_mode == "until_fsck" ||
6969 (no_pps_mode == "until_repair" && repair);
6970 bool enforce_no_per_pool_stats = no_pps_mode == "enforce";
7c673cae
FG
6971
6972 int r = _open_path();
6973 if (r < 0)
6974 return r;
6975 r = _open_fsid(false);
6976 if (r < 0)
6977 goto out_path;
6978
6979 r = _read_fsid(&fsid);
6980 if (r < 0)
6981 goto out_fsid;
6982
6983 r = _lock_fsid();
6984 if (r < 0)
6985 goto out_fsid;
6986
6987 r = _open_bdev(false);
6988 if (r < 0)
6989 goto out_fsid;
6990
11fdf7f2 6991 r = _open_db_and_around(read_only);
7c673cae
FG
6992 if (r < 0)
6993 goto out_bdev;
6994
11fdf7f2
TL
6995 if (!read_only) {
6996 r = _upgrade_super();
6997 if (r < 0) {
6998 goto out_db;
6999 }
7000 }
7c673cae
FG
7001
7002 r = _open_collections(&errors);
7003 if (r < 0)
11fdf7f2 7004 goto out_db;
7c673cae
FG
7005
7006 mempool_thread.init();
7007
11fdf7f2
TL
7008 // we need finisher and kv_{sync,finalize}_thread *just* for replay
7009 // enable them in repair or deep modes only
7010 if (!read_only) {
7011 _kv_start();
7012 r = _deferred_replay();
7013 _kv_stop();
7014 }
7c673cae
FG
7015 if (r < 0)
7016 goto out_scan;
7017
b32b8144 7018 used_blocks.resize(fm->get_alloc_units());
7c673cae 7019 apply(
11fdf7f2 7020 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
7c673cae 7021 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7022 ceph_assert(pos < bs.size());
7c673cae
FG
7023 bs.set(pos);
7024 }
7025 );
11fdf7f2
TL
7026 if (repair) {
7027 repairer.get_space_usage_tracker().init(
7028 bdev->get_size(),
7029 min_alloc_size);
7030 }
7c673cae
FG
7031
7032 if (bluefs) {
11fdf7f2
TL
7033 if( cct->_conf->bluestore_bluefs_db_compatibility) {
7034 interval_set<uint64_t> bluefs_extents_db;
7035 bufferlist bl;
7036 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7037 auto p = bl.cbegin();
7038 auto prev_errors = errors;
7039 try {
7040 decode(bluefs_extents_db, p);
7041 bluefs_extents_db.union_of(bluefs_extents);
7042 bluefs_extents_db.subtract(bluefs_extents);
7043 if (!bluefs_extents_db.empty()) {
7044 derr << "fsck error: bluefs_extents inconsistency, "
7045 << "downgrade to previous releases might be broken."
7046 << dendl;
7047 ++errors;
7048 }
7049 }
7050 catch (buffer::error& e) {
7051 derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
7052 ++errors;
7053 }
7054 if (errors != prev_errors && repair) {
7055 repairer.fix_bluefs_extents(out_of_sync_fm);
7056 }
7057 }
7058
7c673cae
FG
7059 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
7060 apply(
b32b8144 7061 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 7062 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7063 ceph_assert(pos < bs.size());
7c673cae
FG
7064 bs.set(pos);
7065 }
7066 );
7067 }
7068 r = bluefs->fsck();
7069 if (r < 0) {
7070 goto out_scan;
7071 }
7072 if (r > 0)
7073 errors += r;
7074 }
7075
11fdf7f2 7076 // get expected statfs; reset unaffected fields to be able to compare
7c673cae
FG
7077 // structs
7078 statfs(&actual_statfs);
11fdf7f2
TL
7079 actual_statfs.total = 0;
7080 actual_statfs.internally_reserved = 0;
7081 actual_statfs.available = 0;
7082 actual_statfs.internal_metadata = 0;
7083 actual_statfs.omap_allocated = 0;
7084
7085 need_per_pool_stats = per_pool_stat_collection || need_per_pool_stats;
7c673cae
FG
7086
7087 // walk PREFIX_OBJ
7088 dout(1) << __func__ << " walking object keyspace" << dendl;
7089 it = db->get_iterator(PREFIX_OBJ);
7090 if (it) {
11fdf7f2
TL
7091 // fill global if not overridden below
7092 expected_statfs = &expected_store_statfs;
7093
7c673cae
FG
7094 CollectionRef c;
7095 spg_t pgid;
7096 mempool::bluestore_fsck::list<string> expecting_shards;
7097 for (it->lower_bound(string()); it->valid(); it->next()) {
11fdf7f2 7098 if (g_conf()->bluestore_debug_fsck_abort) {
31f18b77
FG
7099 goto out_scan;
7100 }
11fdf7f2
TL
7101 dout(30) << __func__ << " key "
7102 << pretty_binary_string(it->key()) << dendl;
7c673cae
FG
7103 if (is_extent_shard_key(it->key())) {
7104 while (!expecting_shards.empty() &&
7105 expecting_shards.front() < it->key()) {
3efd9988 7106 derr << "fsck error: missing shard key "
7c673cae
FG
7107 << pretty_binary_string(expecting_shards.front())
7108 << dendl;
7109 ++errors;
7110 expecting_shards.pop_front();
7111 }
7112 if (!expecting_shards.empty() &&
7113 expecting_shards.front() == it->key()) {
7114 // all good
7115 expecting_shards.pop_front();
7116 continue;
7117 }
7118
7119 uint32_t offset;
7120 string okey;
7121 get_key_extent_shard(it->key(), &okey, &offset);
3efd9988 7122 derr << "fsck error: stray shard 0x" << std::hex << offset
7c673cae
FG
7123 << std::dec << dendl;
7124 if (expecting_shards.empty()) {
3efd9988 7125 derr << "fsck error: " << pretty_binary_string(it->key())
7c673cae
FG
7126 << " is unexpected" << dendl;
7127 ++errors;
7128 continue;
7129 }
7130 while (expecting_shards.front() > it->key()) {
3efd9988 7131 derr << "fsck error: saw " << pretty_binary_string(it->key())
7c673cae 7132 << dendl;
3efd9988 7133 derr << "fsck error: exp "
7c673cae
FG
7134 << pretty_binary_string(expecting_shards.front()) << dendl;
7135 ++errors;
7136 expecting_shards.pop_front();
7137 if (expecting_shards.empty()) {
7138 break;
7139 }
7140 }
7141 continue;
7142 }
7143
7144 ghobject_t oid;
7145 int r = get_key_object(it->key(), &oid);
7146 if (r < 0) {
3efd9988 7147 derr << "fsck error: bad object key "
7c673cae
FG
7148 << pretty_binary_string(it->key()) << dendl;
7149 ++errors;
7150 continue;
7151 }
7152 if (!c ||
7153 oid.shard_id != pgid.shard ||
11fdf7f2 7154 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7c673cae
FG
7155 !c->contains(oid)) {
7156 c = nullptr;
11fdf7f2
TL
7157 for (auto& p : coll_map) {
7158 if (p.second->contains(oid)) {
7159 c = p.second;
7c673cae
FG
7160 break;
7161 }
7162 }
7163 if (!c) {
3efd9988 7164 derr << "fsck error: stray object " << oid
7c673cae
FG
7165 << " not owned by any collection" << dendl;
7166 ++errors;
7167 continue;
7168 }
11fdf7f2
TL
7169 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
7170 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
7171 << dendl;
7172 if (need_per_pool_stats) {
7173 expected_statfs = &expected_pool_statfs[pool_id];
7174 }
7175
7c673cae
FG
7178 }
7179
7180 if (!expecting_shards.empty()) {
7181 for (auto &k : expecting_shards) {
3efd9988 7182 derr << "fsck error: missing shard key "
7c673cae
FG
7183 << pretty_binary_string(k) << dendl;
7184 }
7185 ++errors;
7186 expecting_shards.clear();
7187 }
7188
7189 dout(10) << __func__ << " " << oid << dendl;
11fdf7f2 7190 store_statfs_t onode_statfs;
7c673cae
FG
7191 RWLock::RLocker l(c->lock);
7192 OnodeRef o = c->get_onode(oid, false);
7193 if (o->onode.nid) {
7194 if (o->onode.nid > nid_max) {
3efd9988 7195 derr << "fsck error: " << oid << " nid " << o->onode.nid
7c673cae
FG
7196 << " > nid_max " << nid_max << dendl;
7197 ++errors;
7198 }
7199 if (used_nids.count(o->onode.nid)) {
3efd9988 7200 derr << "fsck error: " << oid << " nid " << o->onode.nid
7c673cae
FG
7201 << " already in use" << dendl;
7202 ++errors;
7203 continue; // go for next object
7204 }
7205 used_nids.insert(o->onode.nid);
7206 }
7207 ++num_objects;
7208 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7209 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
81eedcae 7210 _dump_onode<30>(cct, *o);
7c673cae
FG
7211 // shards
7212 if (!o->extent_map.shards.empty()) {
7213 ++num_sharded_objects;
7214 num_object_shards += o->extent_map.shards.size();
7215 }
7216 for (auto& s : o->extent_map.shards) {
7217 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
7218 expecting_shards.push_back(string());
7219 get_extent_shard_key(o->key, s.shard_info->offset,
7220 &expecting_shards.back());
7221 if (s.shard_info->offset >= o->onode.size) {
3efd9988 7222 derr << "fsck error: " << oid << " shard 0x" << std::hex
7c673cae
FG
7223 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7224 << std::dec << dendl;
7225 ++errors;
7226 }
7227 }
7228 // lextents
7229 map<BlobRef,bluestore_blob_t::unused_t> referenced;
7230 uint64_t pos = 0;
7231 mempool::bluestore_fsck::map<BlobRef,
7232 bluestore_blob_use_tracker_t> ref_map;
7233 for (auto& l : o->extent_map.extent_map) {
7234 dout(20) << __func__ << " " << l << dendl;
7235 if (l.logical_offset < pos) {
3efd9988 7236 derr << "fsck error: " << oid << " lextent at 0x"
7c673cae
FG
7237 << std::hex << l.logical_offset
7238 << " overlaps with the previous, which ends at 0x" << pos
7239 << std::dec << dendl;
7240 ++errors;
7241 }
7242 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
3efd9988 7243 derr << "fsck error: " << oid << " lextent at 0x"
7c673cae
FG
7244 << std::hex << l.logical_offset << "~" << l.length
7245 << " spans a shard boundary"
7246 << std::dec << dendl;
7247 ++errors;
7248 }
7249 pos = l.logical_offset + l.length;
11fdf7f2
TL
7250 onode_statfs.data_stored += l.length;
7251 ceph_assert(l.blob);
7c673cae
FG
7252 const bluestore_blob_t& blob = l.blob->get_blob();
7253
7254 auto& ref = ref_map[l.blob];
7255 if (ref.is_empty()) {
7256 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7257 uint32_t l = blob.get_logical_length();
7258 ref.init(l, min_release_size);
7259 }
7260 ref.get(
7261 l.blob_offset,
7262 l.length);
7263 ++num_extents;
7264 if (blob.has_unused()) {
7265 auto p = referenced.find(l.blob);
7266 bluestore_blob_t::unused_t *pu;
7267 if (p == referenced.end()) {
7268 pu = &referenced[l.blob];
7269 } else {
7270 pu = &p->second;
7271 }
7272 uint64_t blob_len = blob.get_logical_length();
11fdf7f2
TL
7273 ceph_assert((blob_len % (sizeof(*pu)*8)) == 0);
7274 ceph_assert(l.blob_offset + l.length <= blob_len);
7c673cae
FG
7275 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
7276 uint64_t start = l.blob_offset / chunk_size;
7277 uint64_t end =
11fdf7f2 7278 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7c673cae
FG
7279 for (auto i = start; i < end; ++i) {
7280 (*pu) |= (1u << i);
7281 }
7282 }
7283 }
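// Bit-math note for the has_unused() bookkeeping in the loop above
// (illustrative numbers, assuming unused_t is 16 bits wide): a 64 KiB blob
// gives chunk_size = 64 KiB / 16 = 4 KiB, so an lextent at blob_offset
// 0x3000 with length 0x2000 sets bits [3,5), i.e. *pu |= 0x18.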
7284 for (auto &i : referenced) {
7285 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
7286 << std::dec << " for " << *i.first << dendl;
7287 const bluestore_blob_t& blob = i.first->get_blob();
7288 if (i.second & blob.unused) {
3efd9988 7289 derr << "fsck error: " << oid << " blob claims unused 0x"
7c673cae 7290 << std::hex << blob.unused
11fdf7f2 7291 << " but extents reference 0x" << i.second << std::dec
7c673cae
FG
7292 << " on blob " << *i.first << dendl;
7293 ++errors;
7294 }
7295 if (blob.has_csum()) {
7296 uint64_t blob_len = blob.get_logical_length();
7297 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
7298 unsigned csum_count = blob.get_csum_count();
7299 unsigned csum_chunk_size = blob.get_csum_chunk_size();
7300 for (unsigned p = 0; p < csum_count; ++p) {
7301 unsigned pos = p * csum_chunk_size;
7302 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
7303 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
7304 unsigned mask = 1u << firstbit;
7305 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
7306 mask |= 1u << b;
7307 }
7308 if ((blob.unused & mask) == mask) {
7309 // this csum chunk region is marked unused
7310 if (blob.get_csum_item(p) != 0) {
3efd9988 7311 derr << "fsck error: " << oid
7c673cae
FG
7312 << " blob claims csum chunk 0x" << std::hex << pos
7313 << "~" << csum_chunk_size
7314 << " is unused (mask 0x" << mask << " of unused 0x"
7315 << blob.unused << ") but csum is non-zero 0x"
7316 << blob.get_csum_item(p) << std::dec << " on blob "
7317 << *i.first << dendl;
7318 ++errors;
7319 }
7320 }
7321 }
7322 }
7323 }
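// The csum check above relies on a simple invariant: a csum chunk whose
// covering "unused" bits are all set was never written through this blob, so
// its stored checksum should still be zero; a non-zero value there means the
// unused bitmap and the on-disk data disagree.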
7324 for (auto &i : ref_map) {
7325 ++num_blobs;
7326 const bluestore_blob_t& blob = i.first->get_blob();
7327 bool equal = i.first->get_blob_use_tracker().equal(i.second);
7328 if (!equal) {
3efd9988 7329 derr << "fsck error: " << oid << " blob " << *i.first
7c673cae
FG
7330 << " doesn't match expected ref_map " << i.second << dendl;
7331 ++errors;
7332 }
7333 if (blob.is_compressed()) {
11fdf7f2
TL
7334 onode_statfs.data_compressed += blob.get_compressed_payload_length();
7335 onode_statfs.data_compressed_original +=
7c673cae
FG
7336 i.first->get_referenced_bytes();
7337 }
7338 if (blob.is_shared()) {
7339 if (i.first->shared_blob->get_sbid() > blobid_max) {
3efd9988 7340 derr << "fsck error: " << oid << " blob " << blob
7c673cae
FG
7341 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7342 << blobid_max << dendl;
7343 ++errors;
7344 } else if (i.first->shared_blob->get_sbid() == 0) {
3efd9988 7345 derr << "fsck error: " << oid << " blob " << blob
7c673cae
FG
7346 << " marked as shared but has uninitialized sbid"
7347 << dendl;
7348 ++errors;
7349 }
7350 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
11fdf7f2
TL
7351 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7352 ceph_assert(sbi.pool_id == INT64_MIN ||
7353 sbi.pool_id == oid.hobj.get_logical_pool());
7354 sbi.cid = c->cid;
7355 sbi.pool_id = oid.hobj.get_logical_pool();
7c673cae
FG
7356 sbi.sb = i.first->shared_blob;
7357 sbi.oids.push_back(oid);
7358 sbi.compressed = blob.is_compressed();
7359 for (auto e : blob.get_extents()) {
7360 if (e.is_valid()) {
7361 sbi.ref_map.get(e.offset, e.length);
7362 }
7363 }
7364 } else {
11fdf7f2 7365 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7c673cae
FG
7366 blob.is_compressed(),
7367 used_blocks,
b32b8144 7368 fm->get_alloc_size(),
11fdf7f2
TL
7369 repair ? &repairer : nullptr,
7370 onode_statfs);
7c673cae
FG
7371 }
7372 }
7373 if (deep) {
7374 bufferlist bl;
a8e16298
TL
7375 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
7376 uint64_t offset = 0;
7377 do {
7378 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
7379 int r = _do_read(c.get(), o, offset, l, bl,
7380 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
7381 if (r < 0) {
7382 ++errors;
7383 derr << "fsck error: " << oid << std::hex
7384 << " error during read:"
7385 << " 0x" << offset << "~" << l
7386 << std::dec << " " << cpp_strerror(r)
7387 << dendl;
7388 break;
7389 }
7390 offset += l;
7391 } while (offset < o->onode.size);
7c673cae
FG
7392 }
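// The deep-mode read above walks the object in chunks of at most
// bluestore_fsck_read_bytes_cap bytes, using FADVISE_NOCACHE so the data it
// pulls in is not kept in the buffer cache; any read or checksum failure
// surfaced by _do_read() is counted as an fsck error.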
7393 // omap
7394 if (o->onode.has_omap()) {
11fdf7f2
TL
7395 auto& m =
7396 o->onode.is_pgmeta_omap() ? used_pgmeta_omap_head : used_omap_head;
7397 if (m.count(o->onode.nid)) {
3efd9988 7398 derr << "fsck error: " << oid << " omap_head " << o->onode.nid
7c673cae
FG
7399 << " already in use" << dendl;
7400 ++errors;
7401 } else {
11fdf7f2 7402 m.insert(o->onode.nid);
7c673cae
FG
7403 }
7404 }
11fdf7f2
TL
7405 expected_statfs->add(onode_statfs);
7406 } // for (it->lower_bound(string()); it->valid(); it->next())
7407 } // if (it)
7408
7c673cae
FG
7409 dout(1) << __func__ << " checking shared_blobs" << dendl;
7410 it = db->get_iterator(PREFIX_SHARED_BLOB);
7411 if (it) {
11fdf7f2
TL
7412 // fill global if not overridden below
7413 expected_statfs = &expected_store_statfs;
7414
7c673cae
FG
7415 for (it->lower_bound(string()); it->valid(); it->next()) {
7416 string key = it->key();
7417 uint64_t sbid;
7418 if (get_key_shared_blob(key, &sbid)) {
3efd9988 7419 derr << "fsck error: bad key '" << key
7c673cae 7420 << "' in shared blob namespace" << dendl;
11fdf7f2
TL
7421 if (repair) {
7422 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
7423 }
7c673cae
FG
7424 ++errors;
7425 continue;
7426 }
7427 auto p = sb_info.find(sbid);
7428 if (p == sb_info.end()) {
3efd9988 7429 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae 7430 << std::hex << sbid << std::dec << dendl;
11fdf7f2
TL
7431 if (repair) {
7432 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
7433 }
7c673cae
FG
7434 ++errors;
7435 } else {
7436 ++num_shared_blobs;
7437 sb_info_t& sbi = p->second;
7438 bluestore_shared_blob_t shared_blob(sbid);
7439 bufferlist bl = it->value();
11fdf7f2
TL
7440 auto blp = bl.cbegin();
7441 try {
7442 decode(shared_blob, blp);
7443 } catch (buffer::error& e) {
7444 ++errors;
7445 // Force update and don't report as missing
7446 sbi.updated = sbi.passed = true;
7447
7448 derr << "fsck error: failed to decode Shared Blob "
7449 << pretty_binary_string(it->key()) << dendl;
7450 if (repair) {
7451 dout(20) << __func__ << " undecodable Shared Blob, key:'"
7452 << pretty_binary_string(it->key())
7453 << "', removing" << dendl;
7454 repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
7455 }
7456 continue;
7457 }
7c673cae
FG
7458 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
7459 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 7460 derr << "fsck error: shared blob 0x" << std::hex << sbid
11fdf7f2
TL
7461 << std::dec << " ref_map " << shared_blob.ref_map
7462 << " != expected " << sbi.ref_map << dendl;
7463 sbi.updated = true; // will update later in repair mode only!
7c673cae
FG
7464 ++errors;
7465 }
7466 PExtentVector extents;
7467 for (auto &r : shared_blob.ref_map.ref_map) {
7468 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
7469 }
11fdf7f2
TL
7470 if (need_per_pool_stats) {
7471 expected_statfs = &expected_pool_statfs[sbi.pool_id];
7472 }
7473 errors += _fsck_check_extents(sbi.cid,
7474 p->second.oids.front(),
7c673cae
FG
7475 extents,
7476 p->second.compressed,
b32b8144
FG
7477 used_blocks,
7478 fm->get_alloc_size(),
11fdf7f2
TL
7479 repair ? &repairer : nullptr,
7480 *expected_statfs);
7481 sbi.passed = true;
7482 }
7483 }
7484 } // if (it)
7485
7486 if (repair && repairer.preprocess_misreference(db)) {
7487
7488 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
7489 auto& space_tracker = repairer.get_space_usage_tracker();
7490 auto& misref_extents = repairer.get_misreferences();
7491 interval_set<uint64_t> to_release;
7492 it = db->get_iterator(PREFIX_OBJ);
7493 if (it) {
7494 // fill global if not overridden below
7495 expected_statfs = &expected_store_statfs;
7496
7497 CollectionRef c;
7498 spg_t pgid;
7499 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
7500 bool bypass_rest = false;
7501 for (it->lower_bound(string()); it->valid() && !bypass_rest;
7502 it->next()) {
7503 dout(30) << __func__ << " key "
7504 << pretty_binary_string(it->key()) << dendl;
7505 if (is_extent_shard_key(it->key())) {
7506 continue;
7507 }
7508
7509 ghobject_t oid;
7510 int r = get_key_object(it->key(), &oid);
7511 if (r < 0 || !space_tracker.is_used(oid)) {
7512 continue;
7513 }
7514
7515 if (!c ||
7516 oid.shard_id != pgid.shard ||
7517 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7518 !c->contains(oid)) {
7519 c = nullptr;
7520 for (auto& p : coll_map) {
7521 if (p.second->contains(oid)) {
7522 c = p.second;
7523 break;
7524 }
7525 }
7526 if (!c) {
7527 continue;
7528 }
7529 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
7530 if (need_per_pool_stats) {
7531 expected_statfs = &expected_pool_statfs[pool_id];
7532 }
7533 }
7534 if (!space_tracker.is_used(c->cid)) {
7535 continue;
7536 }
7537
7538 dout(20) << __func__ << " check misreference for col:" << c->cid
7539 << " obj:" << oid << dendl;
7540
7541 RWLock::RLocker l(c->lock);
7542 OnodeRef o = c->get_onode(oid, false);
7543 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7544 mempool::bluestore_fsck::set<BlobRef> blobs;
7545
7546 for (auto& e : o->extent_map.extent_map) {
7547 blobs.insert(e.blob);
7548 }
7549 bool need_onode_update = false;
7550 bool first_dump = true;
7551 for(auto b : blobs) {
7552 bool broken_blob = false;
7553 auto& pextents = b->dirty_blob().dirty_extents();
7554 for (auto& e : pextents) {
7555 if (!e.is_valid()) {
7556 continue;
7557 }
7558 // for the sake of simplicity and proper shared blob handling
7559 // always rewrite the whole blob even when it's partially
7560 // misreferenced.
7561 if (misref_extents.intersects(e.offset, e.length)) {
7562 if (first_dump) {
7563 first_dump = false;
81eedcae 7564 _dump_onode<10>(cct, *o);
11fdf7f2
TL
7565 }
7566 broken_blob = true;
7567 break;
7568 }
7569 }
7570 if (!broken_blob)
7571 continue;
7572 bool compressed = b->get_blob().is_compressed();
7573 need_onode_update = true;
7574 dout(10) << __func__
7575 << " fix misreferences in oid:" << oid
7576 << " " << *b << dendl;
7577 uint64_t b_off = 0;
7578 PExtentVector pext_to_release;
7579 pext_to_release.reserve(pextents.size());
7580 // rewriting all valid pextents
7581 for (auto e = pextents.begin(); e != pextents.end();
7582 b_off += e->length, e++) {
7583 if (!e->is_valid()) {
7584 continue;
7585 }
7586 PExtentVector exts;
7587 int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
7588 0, 0, &exts);
7589 if (alloc_len < (int64_t)e->length) {
7590 derr << __func__
7591 << " failed to allocate 0x" << std::hex << e->length
7592 << " allocated 0x " << alloc_len
7593 << " min_alloc_size 0x" << min_alloc_size
7594 << " available 0x " << alloc->get_free()
7595 << std::dec << dendl;
7596 if (alloc_len > 0) {
7597 alloc->release(exts);
7598 }
7599 bypass_rest = true;
7600 break;
7601 }
7602 expected_statfs->allocated += e->length;
7603 if (compressed) {
7604 expected_statfs->data_compressed_allocated += e->length;
7605 }
7606
7607 bufferlist bl;
7608 IOContext ioc(cct, NULL, true); // allow EIO
7609 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
7610 if (r < 0) {
7611 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
7612 <<"~" << e->length << std::dec << dendl;
7613 ceph_abort_msg("read failed, wtf");
7614 }
7615 pext_to_release.push_back(*e);
7616 e = pextents.erase(e);
7617 e = pextents.insert(e, exts.begin(), exts.end());
7618 b->get_blob().map_bl(
7619 b_off, bl,
7620 [&](uint64_t offset, bufferlist& t) {
7621 int r = bdev->write(offset, t, false);
7622 ceph_assert(r == 0);
7623 });
7624 e += exts.size() - 1;
7625 for (auto& p : exts) {
7626 fm->allocate(p.offset, p.length, txn);
7627 }
7628 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
7629
7630 if (b->get_blob().is_shared()) {
7631 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
7632
7633 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
7634 ceph_assert(sb_it != sb_info.end());
7635 sb_info_t& sbi = sb_it->second;
7636
7637 for (auto& r : sbi.ref_map.ref_map) {
7638 expected_statfs->allocated -= r.second.length;
7639 if (sbi.compressed) {
7640 // NB: it's crucial to use compressed flag from sb_info_t
7641 // as we originally used that value while accumulating
7642 // expected_statfs
7643 expected_statfs->data_compressed_allocated -= r.second.length;
7644 }
7645 }
7646 sbi.updated = sbi.passed = true;
7647 sbi.ref_map.clear();
7648
7649 // relying on blob's pextents to decide what to release.
7650 for (auto& p : pext_to_release) {
7651 to_release.union_insert(p.offset, p.length);
7652 }
7653 } else {
7654 for (auto& p : pext_to_release) {
7655 expected_statfs->allocated -= p.length;
7656 if (compressed) {
7657 expected_statfs->data_compressed_allocated -= p.length;
7658 }
7659 to_release.union_insert(p.offset, p.length);
7660 }
7661 }
7662 if (bypass_rest) {
7663 break;
7664 }
7665 } // for(auto b : blobs)
7666 if (need_onode_update) {
7667 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
7668 _record_onode(o, txn);
7669 }
7670 } // for (it->lower_bound(string()); it->valid(); it->next())
7671
7672 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
7673 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
7674 << "~" << it.get_len() << std::dec << dendl;
7675 fm->release(it.get_start(), it.get_len(), txn);
7676 }
7677 alloc->release(to_release);
7678 to_release.clear();
7679 } // if (it) {
7680 } //if (repair && repairer.preprocess_misreference()) {
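// To recap the repair pass above: a blob overlapping a misreferenced extent
// is rewritten wholesale. Fresh space is allocated, the old bytes are copied
// over, the blob's pextents are repointed at the new space, and only then are
// the old extents queued in to_release; a shared blob additionally drops
// FLAG_SHARED and its ref_map so the expected statfs stays consistent.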
7681
7682 for (auto &p : sb_info) {
7683 sb_info_t& sbi = p.second;
7684 if (!sbi.passed) {
7685 derr << "fsck error: missing " << *sbi.sb << dendl;
7686 ++errors;
7687 }
7688 if (repair && (!sbi.passed || sbi.updated)) {
7689 auto sbid = p.first;
7690 if (sbi.ref_map.empty()) {
7691 ceph_assert(sbi.passed);
7692 dout(20) << __func__ << " " << *sbi.sb
7693 << " is empty, removing" << dendl;
7694 repairer.fix_shared_blob(db, sbid, nullptr);
7695 } else {
7696 bufferlist bl;
7697 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
7698 encode(persistent, bl);
7699 dout(20) << __func__ << " " << *sbi.sb
7700 << " is " << bl.length() << " bytes, updating" << dendl;
7701
7702 repairer.fix_shared_blob(db, sbid, &bl);
7c673cae
FG
7703 }
7704 }
7705 }
11fdf7f2
TL
7706 sb_info.clear();
7707
7708 // check global stats only if per-pool stats are not in use
7709 if (!need_per_pool_stats) {
7710 if (!(actual_statfs == expected_store_statfs)) {
7711 derr << "fsck error: actual " << actual_statfs
7712 << " != expected " << expected_store_statfs << dendl;
7713 if (repair) {
7714 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
7715 expected_store_statfs);
7716 }
7717 ++errors;
7718 }
7c673cae 7719 }
11fdf7f2
TL
7720 if (!enforce_no_per_pool_stats) {
7721 dout(1) << __func__ << " checking pool_statfs" << dendl;
7722 _fsck_check_pool_statfs(expected_pool_statfs, need_per_pool_stats,
7723 errors, repair ? &repairer : nullptr);
7c673cae
FG
7724 }
7725
7726 dout(1) << __func__ << " checking for stray omap data" << dendl;
7727 it = db->get_iterator(PREFIX_OMAP);
7728 if (it) {
7729 for (it->lower_bound(string()); it->valid(); it->next()) {
7730 uint64_t omap_head;
7731 _key_decode_u64(it->key().c_str(), &omap_head);
7732 if (used_omap_head.count(omap_head) == 0) {
3efd9988 7733 derr << "fsck error: found stray omap data on omap_head "
7c673cae
FG
7734 << omap_head << dendl;
7735 ++errors;
7736 }
7737 }
7738 }
11fdf7f2
TL
7739 it = db->get_iterator(PREFIX_PGMETA_OMAP);
7740 if (it) {
7741 for (it->lower_bound(string()); it->valid(); it->next()) {
7742 uint64_t omap_head;
7743 _key_decode_u64(it->key().c_str(), &omap_head);
7744 if (used_pgmeta_omap_head.count(omap_head) == 0) {
7745 derr << "fsck error: found stray omap data on omap_head "
7746 << omap_head << dendl;
7747 ++errors;
7748 }
7749 }
7750 }
7c673cae
FG
7751
7752 dout(1) << __func__ << " checking deferred events" << dendl;
7753 it = db->get_iterator(PREFIX_DEFERRED);
7754 if (it) {
7755 for (it->lower_bound(string()); it->valid(); it->next()) {
7756 bufferlist bl = it->value();
11fdf7f2 7757 auto p = bl.cbegin();
7c673cae
FG
7758 bluestore_deferred_transaction_t wt;
7759 try {
11fdf7f2 7760 decode(wt, p);
7c673cae 7761 } catch (buffer::error& e) {
3efd9988 7762 derr << "fsck error: failed to decode deferred txn "
7c673cae 7763 << pretty_binary_string(it->key()) << dendl;
11fdf7f2
TL
7764 if (repair) {
7765 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
7766 << pretty_binary_string(it->key())
7767 << "', removing" << dendl;
7768 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
7769 }
7770 continue;
7c673cae
FG
7771 }
7772 dout(20) << __func__ << " deferred " << wt.seq
7773 << " ops " << wt.ops.size()
7774 << " released 0x" << std::hex << wt.released << std::dec << dendl;
7775 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
7776 apply(
b32b8144 7777 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 7778 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7779 ceph_assert(pos < bs.size());
7c673cae
FG
7780 bs.set(pos);
7781 }
7782 );
7783 }
7784 }
7785 }
7786
7787 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
7788 {
7789 // remove bluefs_extents from used set since the freelist doesn't
7790 // know they are allocated.
7791 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
7792 apply(
b32b8144 7793 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 7794 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7795 ceph_assert(pos < bs.size());
7c673cae
FG
7796 bs.reset(pos);
7797 }
7798 );
7799 }
7800 fm->enumerate_reset();
7801 uint64_t offset, length;
11fdf7f2 7802 while (fm->enumerate_next(db, &offset, &length)) {
7c673cae
FG
7803 bool intersects = false;
7804 apply(
b32b8144 7805 offset, length, fm->get_alloc_size(), used_blocks,
7c673cae 7806 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7807 ceph_assert(pos < bs.size());
7c673cae 7808 if (bs.test(pos)) {
11fdf7f2
TL
7809 if (offset == SUPER_RESERVED &&
7810 length == min_alloc_size - SUPER_RESERVED) {
7811 // this is due to the change just after luminous to min_alloc_size
7812 // granularity allocations, and our baked in assumption at the top
7813 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
7814 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
7815 // since we will never allocate this region below min_alloc_size.
7816 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
7817 << " and min_alloc_size, 0x" << std::hex << offset << "~"
7818 << length << std::dec << dendl;
7819 } else {
7820 intersects = true;
7821 if (repair) {
7822 repairer.fix_false_free(db, fm,
7823 pos * min_alloc_size,
7824 min_alloc_size);
7825 }
7826 }
7c673cae
FG
7827 } else {
7828 bs.set(pos);
7829 }
7830 }
7831 );
7832 if (intersects) {
11fdf7f2
TL
7833 derr << "fsck error: free extent 0x" << std::hex << offset
7834 << "~" << length << std::dec
7835 << " intersects allocated blocks" << dendl;
7836 ++errors;
b5b8bbf5
FG
7837 }
7838 }
3efd9988
FG
7839 fm->enumerate_reset();
7840 size_t count = used_blocks.count();
7c673cae 7841 if (used_blocks.size() != count) {
11fdf7f2 7842 ceph_assert(used_blocks.size() > count);
b5b8bbf5
FG
7843 used_blocks.flip();
7844 size_t start = used_blocks.find_first();
7845 while (start != decltype(used_blocks)::npos) {
7846 size_t cur = start;
7847 while (true) {
7848 size_t next = used_blocks.find_next(cur);
7849 if (next != cur + 1) {
11fdf7f2 7850 ++errors;
3efd9988 7851 derr << "fsck error: leaked extent 0x" << std::hex
b32b8144
FG
7852 << ((uint64_t)start * fm->get_alloc_size()) << "~"
7853 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
b5b8bbf5 7854 << dendl;
11fdf7f2
TL
7855 if (repair) {
7856 repairer.fix_leaked(db,
7857 fm,
7858 start * min_alloc_size,
7859 (cur + 1 - start) * min_alloc_size);
7860 }
b5b8bbf5
FG
7861 start = next;
7862 break;
7863 }
7864 cur = next;
7865 }
7866 }
7867 used_blocks.flip();
7c673cae
FG
7868 }
7869 }
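// The leak scan above works on the complement: by this point every block that
// is either referenced by something or listed as free has had its bit set, so
// after flip() each run of set bits marks space the freelist considers
// allocated yet nothing references, i.e. a leaked extent, which is reported
// and, in repair mode, handed to fix_leaked().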
11fdf7f2
TL
7870 if (repair) {
7871 dout(5) << __func__ << " applying repair results" << dendl;
7872 repaired = repairer.apply(db);
7873 dout(5) << __func__ << " repair applied" << dendl;
7874 }
7c673cae
FG
7875 out_scan:
7876 mempool_thread.shutdown();
31f18b77 7877 _flush_cache();
7c673cae
FG
7878 out_db:
7879 it.reset(); // before db is closed
11fdf7f2 7880 _close_db_and_around();
7c673cae
FG
7881 out_bdev:
7882 _close_bdev();
7883 out_fsid:
7884 _close_fsid();
7885 out_path:
7886 _close_path();
7887
7888 // fatal errors take precedence
7889 if (r < 0)
7890 return r;
7891
7892 dout(2) << __func__ << " " << num_objects << " objects, "
7893 << num_sharded_objects << " of them sharded. "
7894 << dendl;
7895 dout(2) << __func__ << " " << num_extents << " extents to "
7896 << num_blobs << " blobs, "
7897 << num_spanning_blobs << " spanning, "
7898 << num_shared_blobs << " shared."
7899 << dendl;
7900
7901 utime_t duration = ceph_clock_now() - start;
11fdf7f2
TL
7902 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, " << repaired
7903 << " repaired, " << (errors - (int)repaired) << " remaining in "
7c673cae 7904 << duration << " seconds" << dendl;
11fdf7f2
TL
7905 return errors - (int)repaired;
7906}
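// In practice this fsck/repair path is usually exercised offline via
// ceph-bluestore-tool, e.g. "ceph-bluestore-tool fsck --path <osd-data>" or
// "ceph-bluestore-tool repair --path <osd-data>"; treat the exact command
// lines as an illustration, since flags can vary between releases.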
7907
7908/// methods to inject various errors fsck can repair
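/// (these helpers deliberately corrupt an otherwise healthy store; they are
/// intended for test code that wants to assert fsck detects and repairs the
/// damage, and must never be run against a store holding real data)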
7909void BlueStore::inject_broken_shared_blob_key(const string& key,
7910 const bufferlist& bl)
7911{
7912 KeyValueDB::Transaction txn;
7913 txn = db->get_transaction();
7914 txn->set(PREFIX_SHARED_BLOB, key, bl);
7915 db->submit_transaction_sync(txn);
7916};
7917
7918void BlueStore::inject_leaked(uint64_t len)
7919{
7920 KeyValueDB::Transaction txn;
7921 txn = db->get_transaction();
7922
7923 PExtentVector exts;
7924 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
7925 min_alloc_size * 256, 0, &exts);
7926 ceph_assert(alloc_len >= (int64_t)len);
7927 for (auto& p : exts) {
7928 fm->allocate(p.offset, p.length, txn);
7929 }
7930 db->submit_transaction_sync(txn);
7931}
7932
7933void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
7934{
7935 KeyValueDB::Transaction txn;
7936 OnodeRef o;
7937 CollectionRef c = _get_collection(cid);
7938 ceph_assert(c);
7939 {
7940 RWLock::WLocker l(c->lock); // just to avoid internal asserts
7941 o = c->get_onode(oid, false);
7942 ceph_assert(o);
7943 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7944 }
7945
7946 bool injected = false;
7947 txn = db->get_transaction();
7948 auto& em = o->extent_map.extent_map;
7949 std::vector<const PExtentVector*> v;
7950 if (em.size()) {
7951 v.push_back(&em.begin()->blob->get_blob().get_extents());
7952 }
7953 if (em.size() > 1) {
7954 auto it = em.end();
7955 --it;
7956 v.push_back(&(it->blob->get_blob().get_extents()));
7957 }
7958 for (auto pext : v) {
7959 if (pext->size()) {
7960 auto p = pext->begin();
7961 while (p != pext->end()) {
7962 if (p->is_valid()) {
7963 dout(20) << __func__ << " release 0x" << std::hex << p->offset
7964 << "~" << p->length << std::dec << dendl;
7965 fm->release(p->offset, p->length, txn);
7966 injected = true;
7967 break;
7968 }
7969 ++p;
7970 }
7971 }
7972 }
7973 ceph_assert(injected);
7974 db->submit_transaction_sync(txn);
7975}
7976
7977void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
7978{
7979 BlueStoreRepairer repairer;
7980 repairer.fix_statfs(db, key, new_statfs);
7981 repairer.apply(db);
7982}
7983
7984void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
7985 coll_t cid2, ghobject_t oid2,
7986 uint64_t offset)
7987{
7988 OnodeRef o1;
7989 CollectionRef c1 = _get_collection(cid1);
7990 ceph_assert(c1);
7991 {
7992 RWLock::WLocker l(c1->lock); // just to avoid internal asserts
7993 o1 = c1->get_onode(oid1, false);
7994 ceph_assert(o1);
7995 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
7996 }
7997 OnodeRef o2;
7998 CollectionRef c2 = _get_collection(cid2);
7999 ceph_assert(c2);
8000 {
8001 RWLock::WLocker l(c2->lock); // just to avoid internal asserts
8002 o2 = c2->get_onode(oid2, false);
8003 ceph_assert(o2);
8004 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
8005 }
8006 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
8007 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
8008
8009 // require onode/extent layout to be the same (and simple)
8010 // to make things easier
8011 ceph_assert(o1->onode.extent_map_shards.empty());
8012 ceph_assert(o2->onode.extent_map_shards.empty());
8013 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
8014 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
8015 ceph_assert(e1.logical_offset == e2.logical_offset);
8016 ceph_assert(e1.length == e2.length);
8017 ceph_assert(e1.blob_offset == e2.blob_offset);
8018
8019 KeyValueDB::Transaction txn;
8020 txn = db->get_transaction();
8021
8022 // along with misreference error this will create space leaks errors
8023 e2.blob->dirty_blob() = e1.blob->get_blob();
8024 o2->extent_map.dirty_range(offset, e2.length);
8025 o2->extent_map.update(txn, false);
8026
8027 _record_onode(o2, txn);
8028 db->submit_transaction_sync(txn);
7c673cae
FG
8029}
8030
8031void BlueStore::collect_metadata(map<string,string> *pm)
8032{
8033 dout(10) << __func__ << dendl;
8034 bdev->collect_metadata("bluestore_bdev_", pm);
8035 if (bluefs) {
8036 (*pm)["bluefs"] = "1";
8037 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
11fdf7f2 8038 bluefs->collect_metadata(pm, bluefs_shared_bdev);
7c673cae
FG
8039 } else {
8040 (*pm)["bluefs"] = "0";
8041 }
11fdf7f2
TL
8042
8043 // report numa mapping for underlying devices
8044 int node = -1;
8045 set<int> nodes;
8046 set<string> failed;
8047 int r = get_numa_node(&node, &nodes, &failed);
8048 if (r >= 0) {
8049 if (!failed.empty()) {
8050 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
8051 }
8052 if (!nodes.empty()) {
8053 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
8054 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
8055 }
8056 if (node >= 0) {
8057 (*pm)["objectstore_numa_node"] = stringify(node);
8058 }
8059 }
8060}
8061
8062int BlueStore::get_numa_node(
8063 int *final_node,
8064 set<int> *out_nodes,
8065 set<string> *out_failed)
8066{
8067 int node = -1;
8068 set<string> devices;
8069 get_devices(&devices);
8070 set<int> nodes;
8071 set<string> failed;
8072 for (auto& devname : devices) {
8073 int n;
8074 BlkDev bdev(devname);
8075 int r = bdev.get_numa_node(&n);
8076 if (r < 0) {
8077 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
8078 << dendl;
8079 failed.insert(devname);
8080 continue;
8081 }
8082 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
8083 << dendl;
8084 nodes.insert(n);
8085 if (node < 0) {
8086 node = n;
8087 }
8088 }
8089 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
8090 *final_node = node;
8091 }
8092 if (out_nodes) {
8093 *out_nodes = nodes;
8094 }
8095 if (out_failed) {
8096 *out_failed = failed;
8097 }
8098 return 0;
8099}
8100
8101int BlueStore::get_devices(set<string> *ls)
8102{
8103 if (bdev) {
8104 bdev->get_devices(ls);
8105 if (bluefs) {
8106 bluefs->get_devices(ls);
8107 }
8108 return 0;
8109 }
8110
8111 // grumble, we haven't started up yet.
8112 int r = _open_path();
8113 if (r < 0)
8114 goto out;
8115 r = _open_fsid(false);
8116 if (r < 0)
8117 goto out_path;
8118 r = _read_fsid(&fsid);
8119 if (r < 0)
8120 goto out_fsid;
8121 r = _lock_fsid();
8122 if (r < 0)
8123 goto out_fsid;
8124 r = _open_bdev(false);
8125 if (r < 0)
8126 goto out_fsid;
8127 r = _minimal_open_bluefs(false);
8128 if (r < 0)
8129 goto out_bdev;
8130 bdev->get_devices(ls);
8131 if (bluefs) {
8132 bluefs->get_devices(ls);
8133 }
8134 r = 0;
8135 _minimal_close_bluefs();
8136 out_bdev:
8137 _close_bdev();
8138 out_fsid:
8139 _close_fsid();
8140 out_path:
8141 _close_path();
8142 out:
8143 return r;
7c673cae
FG
8144}
8145
11fdf7f2 8146void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
7c673cae
FG
8147{
8148 buf->reset();
11fdf7f2
TL
8149
8150 buf->omap_allocated = db->estimate_prefix_size(PREFIX_OMAP);
8151
8152 uint64_t bfree = alloc->get_free();
7c673cae
FG
8153
8154 if (bluefs) {
11fdf7f2
TL
8155 int64_t bluefs_total = bluefs->get_total(bluefs_shared_bdev);
8156 int64_t bluefs_free = bluefs->get_free(bluefs_shared_bdev);
94b18763
FG
8157 // part of our shared device is "free" according to BlueFS, but we
8158 // can't touch bluestore_bluefs_min of it.
8159 int64_t shared_available = std::min(
11fdf7f2
TL
8160 bluefs_free,
8161 int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
8162 buf->internally_reserved = bluefs_total - shared_available;
94b18763 8163 if (shared_available > 0) {
11fdf7f2
TL
8164 bfree += shared_available;
8165 }
8166 // include dedicated db, too, if that isn't the shared device.
8167 if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
8168 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 8169 }
11fdf7f2
TL
8170 // call any non-omap bluefs space "internal metadata"
8171 buf->internal_metadata =
8172 std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
8173 - buf->omap_allocated;
7c673cae
FG
8174 }
8175
11fdf7f2
TL
8176 uint64_t thin_total, thin_avail;
8177 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
8178 buf->total += thin_total;
8179
8180 // we are limited by both the size of the virtual device and the
8181 // underlying physical device.
8182 bfree = std::min(bfree, thin_avail);
8183
8184 buf->allocated = thin_total - thin_avail;
8185 } else {
8186 buf->total += bdev->get_size();
8187 }
8188 buf->available = bfree;
8189}
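// A hypothetical walk-through of the accounting above (numbers invented for
// illustration): on a 1 TiB shared device with alloc->get_free() == 200 GiB,
// BlueFS total == 100 GiB, BlueFS free == 30 GiB and bluestore_bluefs_min ==
// 1 GiB, shared_available = min(30 GiB, 99 GiB) = 30 GiB, so
// internally_reserved = 70 GiB and buf->available = 230 GiB, while buf->total
// stays at the 1 TiB reported by the block device.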
8190
8191int BlueStore::statfs(struct store_statfs_t *buf,
8192 osd_alert_list_t* alerts)
8193{
8194 if (alerts) {
8195 alerts->clear();
8196 _log_alerts(*alerts);
8197 }
8198 _get_statfs_overall(buf);
31f18b77 8199 {
11fdf7f2 8200 std::lock_guard l(vstatfs_lock);
31f18b77 8201 buf->allocated = vstatfs.allocated();
11fdf7f2
TL
8202 buf->data_stored = vstatfs.stored();
8203 buf->data_compressed = vstatfs.compressed();
8204 buf->data_compressed_original = vstatfs.compressed_original();
8205 buf->data_compressed_allocated = vstatfs.compressed_allocated();
8206 }
8207
8208 dout(20) << __func__ << " " << *buf << dendl;
8209 return 0;
8210}
8211
8212int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf)
8213{
8214 dout(20) << __func__ << " pool " << pool_id<< dendl;
81eedcae 8215
11fdf7f2
TL
8216 if (!per_pool_stat_collection) {
8217 dout(20) << __func__ << " not supported in legacy mode " << dendl;
8218 return -ENOTSUP;
7c673cae 8219 }
11fdf7f2 8220 buf->reset();
7c673cae 8221
11fdf7f2
TL
8222 {
8223 std::lock_guard l(vstatfs_lock);
8224 osd_pools[pool_id].publish(buf);
8225 }
8226 dout(10) << __func__ << *buf << dendl;
7c673cae
FG
8227 return 0;
8228}
8229
81eedcae
TL
8230void BlueStore::_check_legacy_statfs_alert()
8231{
8232 string s;
8233 if (!per_pool_stat_collection &&
8234 cct->_conf->bluestore_no_per_pool_stats_tolerance != "enforce" &&
8235 cct->_conf->bluestore_warn_on_legacy_statfs) {
8236 s = "legacy statfs reporting detected, "
8237 "suggest to run store repair to get consistent statistic reports";
8238 }
8239 std::lock_guard l(qlock);
8240 legacy_statfs_alert = s;
8241}
8242
7c673cae
FG
8243// ---------------
8244// cache
8245
8246BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
8247{
8248 RWLock::RLocker l(coll_lock);
8249 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
8250 if (cp == coll_map.end())
8251 return CollectionRef();
8252 return cp->second;
8253}
8254
8255void BlueStore::_queue_reap_collection(CollectionRef& c)
8256{
8257 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
8258 // _reap_collections and this run in the same thread,
8259 // so no lock is needed.
7c673cae
FG
8260 removed_collections.push_back(c);
8261}
8262
8263void BlueStore::_reap_collections()
8264{
94b18763 8265
7c673cae
FG
8266 list<CollectionRef> removed_colls;
8267 {
94b18763
FG
8268 // _queue_reap_collection and this run in the same thread,
8269 // so no lock is needed.
8270 if (!removed_collections.empty())
8271 removed_colls.swap(removed_collections);
8272 else
8273 return;
7c673cae
FG
8274 }
8275
94b18763
FG
8276 list<CollectionRef>::iterator p = removed_colls.begin();
8277 while (p != removed_colls.end()) {
7c673cae
FG
8278 CollectionRef c = *p;
8279 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
8280 if (c->onode_map.map_any([&](OnodeRef o) {
11fdf7f2 8281 ceph_assert(!o->exists);
7c673cae
FG
8282 if (o->flushing_count.load()) {
8283 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
8284 << " flush_txns " << o->flushing_count << dendl;
94b18763 8285 return true;
7c673cae 8286 }
94b18763 8287 return false;
7c673cae 8288 })) {
94b18763 8289 ++p;
7c673cae
FG
8290 continue;
8291 }
8292 c->onode_map.clear();
94b18763 8293 p = removed_colls.erase(p);
7c673cae
FG
8294 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
8295 }
94b18763 8296 if (removed_colls.empty()) {
7c673cae 8297 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
8298 } else {
8299 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
8300 }
8301}
8302
8303void BlueStore::_update_cache_logger()
8304{
8305 uint64_t num_onodes = 0;
8306 uint64_t num_extents = 0;
8307 uint64_t num_blobs = 0;
8308 uint64_t num_buffers = 0;
8309 uint64_t num_buffer_bytes = 0;
8310 for (auto c : cache_shards) {
8311 c->add_stats(&num_onodes, &num_extents, &num_blobs,
8312 &num_buffers, &num_buffer_bytes);
8313 }
8314 logger->set(l_bluestore_onodes, num_onodes);
8315 logger->set(l_bluestore_extents, num_extents);
8316 logger->set(l_bluestore_blobs, num_blobs);
8317 logger->set(l_bluestore_buffers, num_buffers);
8318 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
8319}
8320
8321// ---------------
8322// read operations
8323
8324ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
8325{
8326 return _get_collection(cid);
8327}
8328
11fdf7f2
TL
8329ObjectStore::CollectionHandle BlueStore::create_new_collection(
8330 const coll_t& cid)
7c673cae 8331{
11fdf7f2
TL
8332 RWLock::WLocker l(coll_lock);
8333 Collection *c = new Collection(
8334 this,
8335 cache_shards[cid.hash_to_shard(cache_shards.size())],
8336 cid);
8337 new_coll_map[cid] = c;
8338 _osr_attach(c);
8339 return c;
8340}
8341
8342void BlueStore::set_collection_commit_queue(
8343 const coll_t& cid,
8344 ContextQueue *commit_queue)
8345{
8346 if (commit_queue) {
8347 RWLock::RLocker l(coll_lock);
8348 if (coll_map.count(cid)) {
8349 coll_map[cid]->commit_queue = commit_queue;
8350 } else if (new_coll_map.count(cid)) {
8351 new_coll_map[cid]->commit_queue = commit_queue;
8352 }
8353 }
7c673cae
FG
8354}
8355
11fdf7f2 8356
7c673cae
FG
8357bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
8358{
8359 Collection *c = static_cast<Collection *>(c_.get());
8360 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
8361 if (!c->exists)
8362 return false;
8363
8364 bool r = true;
8365
8366 {
8367 RWLock::RLocker l(c->lock);
8368 OnodeRef o = c->get_onode(oid, false);
8369 if (!o || !o->exists)
8370 r = false;
8371 }
8372
7c673cae
FG
8373 return r;
8374}
8375
7c673cae
FG
8376int BlueStore::stat(
8377 CollectionHandle &c_,
8378 const ghobject_t& oid,
8379 struct stat *st,
8380 bool allow_eio)
8381{
8382 Collection *c = static_cast<Collection *>(c_.get());
8383 if (!c->exists)
8384 return -ENOENT;
8385 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
8386
8387 {
8388 RWLock::RLocker l(c->lock);
8389 OnodeRef o = c->get_onode(oid, false);
8390 if (!o || !o->exists)
8391 return -ENOENT;
8392 st->st_size = o->onode.size;
8393 st->st_blksize = 4096;
8394 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
8395 st->st_nlink = 1;
8396 }
8397
7c673cae
FG
8398 int r = 0;
8399 if (_debug_mdata_eio(oid)) {
8400 r = -EIO;
8401 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
8402 }
8403 return r;
8404}
8405int BlueStore::set_collection_opts(
11fdf7f2 8406 CollectionHandle& ch,
7c673cae
FG
8407 const pool_opts_t& opts)
8408{
7c673cae 8409 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 8410 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
7c673cae
FG
8411 if (!c->exists)
8412 return -ENOENT;
8413 RWLock::WLocker l(c->lock);
8414 c->pool_opts = opts;
8415 return 0;
8416}
8417
7c673cae
FG
8418int BlueStore::read(
8419 CollectionHandle &c_,
8420 const ghobject_t& oid,
8421 uint64_t offset,
8422 size_t length,
8423 bufferlist& bl,
224ce89b 8424 uint32_t op_flags)
7c673cae 8425{
11fdf7f2 8426 auto start = mono_clock::now();
7c673cae
FG
8427 Collection *c = static_cast<Collection *>(c_.get());
8428 const coll_t &cid = c->get_cid();
8429 dout(15) << __func__ << " " << cid << " " << oid
8430 << " 0x" << std::hex << offset << "~" << length << std::dec
8431 << dendl;
8432 if (!c->exists)
8433 return -ENOENT;
8434
8435 bl.clear();
8436 int r;
8437 {
8438 RWLock::RLocker l(c->lock);
11fdf7f2 8439 auto start1 = mono_clock::now();
7c673cae 8440 OnodeRef o = c->get_onode(oid, false);
494da23a
TL
8441 log_latency("get_onode@read",
8442 l_bluestore_read_onode_meta_lat,
8443 mono_clock::now() - start1,
8444 cct->_conf->bluestore_log_op_age);
7c673cae
FG
8445 if (!o || !o->exists) {
8446 r = -ENOENT;
8447 goto out;
8448 }
8449
8450 if (offset == length && offset == 0)
8451 length = o->onode.size;
8452
8453 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
8454 if (r == -EIO) {
8455 logger->inc(l_bluestore_read_eio);
8456 }
7c673cae
FG
8457 }
8458
8459 out:
28e407b8 8460 if (r >= 0 && _debug_data_eio(oid)) {
7c673cae
FG
8461 r = -EIO;
8462 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11fdf7f2
TL
8463 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
8464 cct->_conf->bluestore_debug_random_read_err &&
8465 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
8466 100.0)) == 0) {
224ce89b
WB
8467 dout(0) << __func__ << ": inject random EIO" << dendl;
8468 r = -EIO;
7c673cae
FG
8469 }
8470 dout(10) << __func__ << " " << cid << " " << oid
8471 << " 0x" << std::hex << offset << "~" << length << std::dec
8472 << " = " << r << dendl;
494da23a
TL
8473 log_latency(__func__,
8474 l_bluestore_read_lat,
8475 mono_clock::now() - start,
8476 cct->_conf->bluestore_log_op_age);
7c673cae
FG
8477 return r;
8478}
8479
8480// --------------------------------------------------------
8481// intermediate data structures used while reading
8482struct region_t {
8483 uint64_t logical_offset;
8484 uint64_t blob_xoffset; //region offset within the blob
8485 uint64_t length;
7c673cae
FG
8486
8487 // used later in read process
8488 uint64_t front = 0;
7c673cae 8489
11fdf7f2 8490 region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
7c673cae
FG
8491 : logical_offset(offset),
8492 blob_xoffset(b_offs),
11fdf7f2
TL
8493 length(len),
8494 front(front){}
7c673cae
FG
8495 region_t(const region_t& from)
8496 : logical_offset(from.logical_offset),
8497 blob_xoffset(from.blob_xoffset),
11fdf7f2
TL
8498 length(from.length),
8499 front(from.front){}
7c673cae
FG
8500
8501 friend ostream& operator<<(ostream& out, const region_t& r) {
8502 return out << "0x" << std::hex << r.logical_offset << ":"
8503 << r.blob_xoffset << "~" << r.length << std::dec;
8504 }
8505};
8506
11fdf7f2
TL
8507// merged blob read request
8508struct read_req_t {
8509 uint64_t r_off = 0;
8510 uint64_t r_len = 0;
8511 bufferlist bl;
8512 std::list<region_t> regs; // original read regions
8513
8514 read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
8515
8516 friend ostream& operator<<(ostream& out, const read_req_t& r) {
8517 out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
8518 for (const auto& reg : r.regs)
8519 out << reg;
8520 return out << "]}" << std::dec;
8521 }
8522};
8523
8524typedef list<read_req_t> regions2read_t;
7c673cae
FG
8525typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
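// Relationship between the two structures above, as a toy example: two
// logical reads that land in the same blob and touch adjacent disk chunks are
// merged into a single read_req_t whose regs list keeps both region_t
// entries, so one bdev read can later be sliced back into the caller-visible
// ranges.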
8526
8527int BlueStore::_do_read(
8528 Collection *c,
8529 OnodeRef o,
8530 uint64_t offset,
8531 size_t length,
8532 bufferlist& bl,
f64942e4
AA
8533 uint32_t op_flags,
8534 uint64_t retry_count)
7c673cae 8535{
11fdf7f2 8536 FUNCTRACE(cct);
7c673cae 8537 int r = 0;
91327a77 8538 int read_cache_policy = 0; // do not bypass clean or dirty cache
7c673cae
FG
8539
8540 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
8541 << " size 0x" << o->onode.size << " (" << std::dec
8542 << o->onode.size << ")" << dendl;
8543 bl.clear();
8544
8545 if (offset >= o->onode.size) {
8546 return r;
8547 }
8548
8549 // generally, don't buffer anything, unless the client explicitly requests
8550 // it.
8551 bool buffered = false;
8552 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
8553 dout(20) << __func__ << " will do buffered read" << dendl;
8554 buffered = true;
8555 } else if (cct->_conf->bluestore_default_buffered_read &&
8556 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
8557 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
8558 dout(20) << __func__ << " defaulting to buffered read" << dendl;
8559 buffered = true;
8560 }
8561
8562 if (offset + length > o->onode.size) {
8563 length = o->onode.size - offset;
8564 }
8565
11fdf7f2 8566 auto start = mono_clock::now();
7c673cae 8567 o->extent_map.fault_range(db, offset, length);
494da23a
TL
8568 log_latency(__func__,
8569 l_bluestore_read_onode_meta_lat,
8570 mono_clock::now() - start,
8571 cct->_conf->bluestore_log_op_age);
81eedcae 8572 _dump_onode<30>(cct, *o);
7c673cae
FG
8573
8574 ready_regions_t ready_regions;
8575
91327a77
AA
8576 // for deep-scrub, we only read dirty cache and bypass clean cache in
8577 // order to read underlying block device in case there are silent disk errors.
8578 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
8579 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
8580 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
8581 }
8582
7c673cae
FG
8583 // build a blob-wise list of the stuff to read (that isn't cached)
8584 blobs2read_t blobs2read;
8585 unsigned left = length;
8586 uint64_t pos = offset;
8587 unsigned num_regions = 0;
8588 auto lp = o->extent_map.seek_lextent(offset);
8589 while (left > 0 && lp != o->extent_map.extent_map.end()) {
8590 if (pos < lp->logical_offset) {
8591 unsigned hole = lp->logical_offset - pos;
8592 if (hole >= left) {
8593 break;
8594 }
8595 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
8596 << std::dec << dendl;
8597 pos += hole;
8598 left -= hole;
8599 }
94b18763 8600 BlobRef& bptr = lp->blob;
7c673cae
FG
8601 unsigned l_off = pos - lp->logical_offset;
8602 unsigned b_off = l_off + lp->blob_offset;
8603 unsigned b_len = std::min(left, lp->length - l_off);
8604
8605 ready_regions_t cache_res;
8606 interval_set<uint32_t> cache_interval;
8607 bptr->shared_blob->bc.read(
91327a77
AA
8608 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
8609 read_cache_policy);
7c673cae
FG
8610 dout(20) << __func__ << " blob " << *bptr << std::hex
8611 << " need 0x" << b_off << "~" << b_len
8612 << " cache has 0x" << cache_interval
8613 << std::dec << dendl;
8614
8615 auto pc = cache_res.begin();
11fdf7f2 8616 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
7c673cae
FG
8617 while (b_len > 0) {
8618 unsigned l;
8619 if (pc != cache_res.end() &&
8620 pc->first == b_off) {
8621 l = pc->second.length();
8622 ready_regions[pos].claim(pc->second);
8623 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
8624 << b_off << "~" << l << std::dec << dendl;
8625 ++pc;
8626 } else {
8627 l = b_len;
8628 if (pc != cache_res.end()) {
11fdf7f2 8629 ceph_assert(pc->first > b_off);
7c673cae
FG
8630 l = pc->first - b_off;
8631 }
8632 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
8633 << b_off << "~" << l << std::dec << dendl;
11fdf7f2
TL
8634 // merge regions
8635 {
8636 uint64_t r_off = b_off;
8637 uint64_t r_len = l;
8638 uint64_t front = r_off % chunk_size;
8639 if (front) {
8640 r_off -= front;
8641 r_len += front;
8642 }
8643 unsigned tail = r_len % chunk_size;
8644 if (tail) {
8645 r_len += chunk_size - tail;
8646 }
8647 bool merged = false;
8648 regions2read_t& r2r = blobs2read[bptr];
8649 if (r2r.size()) {
8650 read_req_t& pre = r2r.back();
8651 if (r_off <= (pre.r_off + pre.r_len)) {
8652 front += (r_off - pre.r_off);
8653 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
8654 pre.regs.emplace_back(region_t(pos, b_off, l, front));
8655 merged = true;
8656 }
8657 }
8658 if (!merged) {
8659 read_req_t req(r_off, r_len);
8660 req.regs.emplace_back(region_t(pos, b_off, l, front));
8661 r2r.emplace_back(std::move(req));
8662 }
8663 }
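// Worked example for the alignment above (illustrative numbers only): with
// chunk_size = 0x1000, b_off = 0x1800 and l = 0x1000 give front = 0x800, so
// r_off = 0x1000 and r_len = 0x1800; the 0x800 tail then rounds r_len up to
// 0x2000, and the region is later copied out of the request buffer starting
// at offset front for l bytes.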
7c673cae
FG
8664 ++num_regions;
8665 }
8666 pos += l;
8667 b_off += l;
8668 left -= l;
8669 b_len -= l;
8670 }
8671 ++lp;
8672 }
8673
8674 // read raw blob data. use aio if we have >1 blobs to read.
11fdf7f2
TL
8675 start = mono_clock::now(); // for the sake of simplicity
8676 // measure the whole block below;
8677 // the resulting inaccuracy is negligible.
7c673cae 8678 vector<bufferlist> compressed_blob_bls;
b32b8144 8679 IOContext ioc(cct, NULL, true); // allow EIO
7c673cae 8680 for (auto& p : blobs2read) {
94b18763 8681 const BlobRef& bptr = p.first;
11fdf7f2 8682 regions2read_t& r2r = p.second;
7c673cae 8683 dout(20) << __func__ << " blob " << *bptr << std::hex
11fdf7f2 8684 << " need " << r2r << std::dec << dendl;
7c673cae
FG
8685 if (bptr->get_blob().is_compressed()) {
8686 // read the whole thing
8687 if (compressed_blob_bls.empty()) {
8688 // ensure we avoid any reallocation on subsequent blobs
8689 compressed_blob_bls.reserve(blobs2read.size());
8690 }
8691 compressed_blob_bls.push_back(bufferlist());
8692 bufferlist& bl = compressed_blob_bls.back();
8693 r = bptr->get_blob().map(
8694 0, bptr->get_blob().get_ondisk_length(),
8695 [&](uint64_t offset, uint64_t length) {
8696 int r;
8697 // use aio if there are more regions to read than those in this blob
11fdf7f2 8698 if (num_regions > r2r.size()) {
7c673cae
FG
8699 r = bdev->aio_read(offset, length, &bl, &ioc);
8700 } else {
8701 r = bdev->read(offset, length, &bl, &ioc, false);
8702 }
8703 if (r < 0)
8704 return r;
8705 return 0;
8706 });
b32b8144
FG
8707 if (r < 0) {
8708 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
8709 if (r == -EIO) {
8710 // propagate EIO to caller
8711 return r;
8712 }
11fdf7f2 8713 ceph_assert(r == 0);
b32b8144 8714 }
7c673cae
FG
8715 } else {
8716 // read the pieces
11fdf7f2 8717 for (auto& req : r2r) {
7c673cae 8718 dout(20) << __func__ << " region 0x" << std::hex
11fdf7f2
TL
8719 << req.regs.front().logical_offset
8720 << ": 0x" << req.regs.front().blob_xoffset
8721 << " reading 0x" << req.r_off
8722 << "~" << req.r_len << std::dec
7c673cae
FG
8723 << dendl;
8724
8725 // read it
8726 r = bptr->get_blob().map(
11fdf7f2 8727 req.r_off, req.r_len,
7c673cae
FG
8728 [&](uint64_t offset, uint64_t length) {
8729 int r;
8730 // use aio if there is more than one region to read
8731 if (num_regions > 1) {
11fdf7f2 8732 r = bdev->aio_read(offset, length, &req.bl, &ioc);
7c673cae 8733 } else {
11fdf7f2 8734 r = bdev->read(offset, length, &req.bl, &ioc, false);
7c673cae
FG
8735 }
8736 if (r < 0)
8737 return r;
8738 return 0;
8739 });
b32b8144
FG
8740 if (r < 0) {
8741 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
8742 << dendl;
8743 if (r == -EIO) {
8744 // propagate EIO to caller
8745 return r;
8746 }
11fdf7f2 8747 ceph_assert(r == 0);
b32b8144 8748 }
11fdf7f2 8749 ceph_assert(req.bl.length() == req.r_len);
7c673cae
FG
8750 }
8751 }
8752 }
11fdf7f2
TL
8753
8754 int64_t num_ios = length;
7c673cae 8755 if (ioc.has_pending_aios()) {
11fdf7f2 8756 num_ios = -ioc.get_num_ios();
7c673cae
FG
8757 bdev->aio_submit(&ioc);
8758 dout(20) << __func__ << " waiting for aio" << dendl;
8759 ioc.aio_wait();
b32b8144
FG
8760 r = ioc.get_return_value();
8761 if (r < 0) {
11fdf7f2 8762 ceph_assert(r == -EIO); // no other errors allowed
b32b8144
FG
8763 return -EIO;
8764 }
7c673cae 8765 }
494da23a
TL
8766 log_latency_fn(__func__,
8767 l_bluestore_read_wait_aio_lat,
11fdf7f2 8768 mono_clock::now() - start,
494da23a 8769 cct->_conf->bluestore_log_op_age,
11fdf7f2
TL
8770 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
8771 );
7c673cae
FG
8772
8773 // enumerate and decompress desired blobs
8774 auto p = compressed_blob_bls.begin();
8775 blobs2read_t::iterator b2r_it = blobs2read.begin();
8776 while (b2r_it != blobs2read.end()) {
94b18763 8777 const BlobRef& bptr = b2r_it->first;
11fdf7f2 8778 regions2read_t& r2r = b2r_it->second;
7c673cae 8779 dout(20) << __func__ << " blob " << *bptr << std::hex
11fdf7f2 8780 << " need 0x" << r2r << std::dec << dendl;
7c673cae 8781 if (bptr->get_blob().is_compressed()) {
11fdf7f2 8782 ceph_assert(p != compressed_blob_bls.end());
7c673cae
FG
8783 bufferlist& compressed_bl = *p++;
8784 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
11fdf7f2 8785 r2r.front().regs.front().logical_offset) < 0) {
f64942e4
AA
8786 // Handles spurious read errors caused by a kernel bug.
8787 // We sometimes get all-zero pages as a result of the read under
11fdf7f2
TL
8788 // high memory pressure. Retrying the failing read succeeds in most
8789 // cases.
f64942e4
AA
8790 // See also: http://tracker.ceph.com/issues/22464
8791 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
8792 return -EIO;
8793 }
8794 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
7c673cae
FG
8795 }
8796 bufferlist raw_bl;
8797 r = _decompress(compressed_bl, &raw_bl);
8798 if (r < 0)
8799 return r;
8800 if (buffered) {
8801 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
8802 raw_bl);
8803 }
11fdf7f2
TL
8804 for (auto& req : r2r) {
8805 for (auto& r : req.regs) {
8806 ready_regions[r.logical_offset].substr_of(
8807 raw_bl, r.blob_xoffset, r.length);
8808 }
7c673cae
FG
8809 }
8810 } else {
11fdf7f2
TL
8811 for (auto& req : r2r) {
8812 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
8813 req.regs.front().logical_offset) < 0) {
f64942e4
AA
8814 // Handles spurious read errors caused by a kernel bug.
8815 // We sometimes get all-zero pages as a result of the read under
8816 // high memory pressure. Retrying the failing read succeeds in most
8817 // cases.
f64942e4
AA
8818 // See also: http://tracker.ceph.com/issues/22464
8819 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
8820 return -EIO;
8821 }
8822 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
7c673cae
FG
8823 }
8824 if (buffered) {
8825 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
11fdf7f2 8826 req.r_off, req.bl);
7c673cae
FG
8827 }
8828
8829 // prune and keep result
11fdf7f2
TL
8830 for (const auto& r : req.regs) {
8831 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
8832 }
7c673cae
FG
8833 }
8834 }
8835 ++b2r_it;
8836 }
8837
8838 // generate a resulting buffer
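 // (stitch the ready_regions together in logical-offset order; any gap
 //  between regions corresponds to a hole in the object and is zero-filled)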
8839 auto pr = ready_regions.begin();
8840 auto pr_end = ready_regions.end();
8841 pos = 0;
8842 while (pos < length) {
8843 if (pr != pr_end && pr->first == pos + offset) {
8844 dout(30) << __func__ << " assemble 0x" << std::hex << pos
8845 << ": data from 0x" << pr->first << "~" << pr->second.length()
8846 << std::dec << dendl;
8847 pos += pr->second.length();
8848 bl.claim_append(pr->second);
8849 ++pr;
8850 } else {
8851 uint64_t l = length - pos;
8852 if (pr != pr_end) {
11fdf7f2 8853 ceph_assert(pr->first > pos + offset);
7c673cae
FG
8854 l = pr->first - (pos + offset);
8855 }
8856 dout(30) << __func__ << " assemble 0x" << std::hex << pos
8857 << ": zeros for 0x" << (pos + offset) << "~" << l
8858 << std::dec << dendl;
8859 bl.append_zero(l);
8860 pos += l;
8861 }
8862 }
11fdf7f2
TL
8863 ceph_assert(bl.length() == length);
8864 ceph_assert(pos == length);
8865 ceph_assert(pr == pr_end);
7c673cae 8866 r = bl.length();
f64942e4
AA
8867 if (retry_count) {
8868 logger->inc(l_bluestore_reads_with_retries);
8869 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
8870 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
8871 }
7c673cae
FG
8872 return r;
8873}
8874
8875int BlueStore::_verify_csum(OnodeRef& o,
8876 const bluestore_blob_t* blob, uint64_t blob_xoffset,
8877 const bufferlist& bl,
8878 uint64_t logical_offset) const
8879{
8880 int bad;
8881 uint64_t bad_csum;
11fdf7f2 8882 auto start = mono_clock::now();
7c673cae 8883 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
f64942e4
AA
8884 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
8885 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
8886 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
8887 bad = blob_xoffset;
8888 r = -1;
8889 bad_csum = 0xDEADBEEF;
8890 }
7c673cae
FG
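 // r == -1 (from verify_csum or the injected error above) indicates a
 // checksum mismatch at blob offset 'bad'; other negative returns are
 // internal failures and are only reported generically below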
8891 if (r < 0) {
8892 if (r == -1) {
8893 PExtentVector pex;
8894 blob->map(
8895 bad,
8896 blob->get_csum_chunk_size(),
8897 [&](uint64_t offset, uint64_t length) {
8898 pex.emplace_back(bluestore_pextent_t(offset, length));
8899 return 0;
8900 });
8901 derr << __func__ << " bad "
8902 << Checksummer::get_csum_type_string(blob->csum_type)
8903 << "/0x" << std::hex << blob->get_csum_chunk_size()
8904 << " checksum at blob offset 0x" << bad
8905 << ", got 0x" << bad_csum << ", expected 0x"
8906 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
8907 << ", device location " << pex
8908 << ", logical extent 0x" << std::hex
8909 << (logical_offset + bad - blob_xoffset) << "~"
8910 << blob->get_csum_chunk_size() << std::dec
8911 << ", object " << o->oid
8912 << dendl;
8913 } else {
8914 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
8915 }
8916 }
494da23a
TL
8917 log_latency(__func__,
8918 l_bluestore_csum_lat,
8919 mono_clock::now() - start,
8920 cct->_conf->bluestore_log_op_age);
11fdf7f2
TL
8921 if (cct->_conf->bluestore_ignore_data_csum) {
8922 return 0;
8923 }
7c673cae
FG
8924 return r;
8925}
8926
8927int BlueStore::_decompress(bufferlist& source, bufferlist* result)
8928{
8929 int r = 0;
11fdf7f2
TL
8930 auto start = mono_clock::now();
8931 auto i = source.cbegin();
7c673cae 8932 bluestore_compression_header_t chdr;
11fdf7f2 8933 decode(chdr, i);
7c673cae
FG
8934 int alg = int(chdr.type);
8935 CompressorRef cp = compressor;
8936 if (!cp || (int)cp->get_type() != alg) {
8937 cp = Compressor::create(cct, alg);
8938 }
8939
8940 if (!cp.get()) {
8941 // the compressor isn't available, so fail: we cannot return the
8942 // decompressed data
11fdf7f2
TL
8943
8944 const char* alg_name = Compressor::get_comp_alg_name(alg);
8945 derr << __func__ << " can't load decompressor " << alg_name << dendl;
8946 _set_compression_alert(false, alg_name);
7c673cae
FG
8947 r = -EIO;
8948 } else {
8949 r = cp->decompress(i, chdr.length, *result);
8950 if (r < 0) {
8951 derr << __func__ << " decompression failed with exit code " << r << dendl;
8952 r = -EIO;
8953 }
8954 }
494da23a
TL
8955 log_latency(__func__,
8956 l_bluestore_decompress_lat,
8957 mono_clock::now() - start,
8958 cct->_conf->bluestore_log_op_age);
7c673cae
FG
8959 return r;
8960}
8961
8962// this stores fiemap into interval_set, other variations
8963// use it internally
8964int BlueStore::_fiemap(
8965 CollectionHandle &c_,
8966 const ghobject_t& oid,
8967 uint64_t offset,
8968 size_t length,
8969 interval_set<uint64_t>& destset)
8970{
8971 Collection *c = static_cast<Collection *>(c_.get());
8972 if (!c->exists)
8973 return -ENOENT;
8974 {
8975 RWLock::RLocker l(c->lock);
8976
8977 OnodeRef o = c->get_onode(oid, false);
8978 if (!o || !o->exists) {
8979 return -ENOENT;
8980 }
81eedcae 8981 _dump_onode<30>(cct, *o);
7c673cae
FG
8982
8983 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
8984 << " size 0x" << o->onode.size << std::dec << dendl;
8985
8986 boost::intrusive::set<Extent>::iterator ep, eend;
8987 if (offset >= o->onode.size)
8988 goto out;
8989
8990 if (offset + length > o->onode.size) {
8991 length = o->onode.size - offset;
8992 }
8993
8994 o->extent_map.fault_range(db, offset, length);
8995 eend = o->extent_map.extent_map.end();
8996 ep = o->extent_map.seek_lextent(offset);
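 // walk the logical extent map: ranges covered by an lextent are added to
 // destset; gaps between lextents (holes) are skipped and not reported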
8997 while (length > 0) {
8998 dout(20) << __func__ << " offset " << offset << dendl;
8999 if (ep != eend && ep->logical_offset + ep->length <= offset) {
9000 ++ep;
9001 continue;
9002 }
9003
9004 uint64_t x_len = length;
9005 if (ep != eend && ep->logical_offset <= offset) {
9006 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 9007 x_len = std::min(x_len, ep->length - x_off);
7c673cae
FG
9008 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
9009 << x_len << std::dec << " blob " << ep->blob << dendl;
9010 destset.insert(offset, x_len);
9011 length -= x_len;
9012 offset += x_len;
9013 if (x_off + x_len == ep->length)
9014 ++ep;
9015 continue;
9016 }
9017 if (ep != eend &&
9018 ep->logical_offset > offset &&
9019 ep->logical_offset - offset < x_len) {
9020 x_len = ep->logical_offset - offset;
9021 }
9022 offset += x_len;
9023 length -= x_len;
9024 }
9025 }
9026
9027 out:
7c673cae
FG
9028 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9029 << " size = 0x(" << destset << ")" << std::dec << dendl;
9030 return 0;
9031}
9032
7c673cae
FG
9033int BlueStore::fiemap(
9034 CollectionHandle &c_,
9035 const ghobject_t& oid,
9036 uint64_t offset,
9037 size_t length,
9038 bufferlist& bl)
9039{
9040 interval_set<uint64_t> m;
9041 int r = _fiemap(c_, oid, offset, length, m);
9042 if (r >= 0) {
11fdf7f2 9043 encode(m, bl);
7c673cae
FG
9044 }
9045 return r;
9046}
9047
7c673cae
FG
9048int BlueStore::fiemap(
9049 CollectionHandle &c_,
9050 const ghobject_t& oid,
9051 uint64_t offset,
9052 size_t length,
9053 map<uint64_t, uint64_t>& destmap)
9054{
9055 interval_set<uint64_t> m;
9056 int r = _fiemap(c_, oid, offset, length, m);
9057 if (r >= 0) {
9058 m.move_into(destmap);
9059 }
9060 return r;
9061}
9062
7c673cae
FG
9063int BlueStore::getattr(
9064 CollectionHandle &c_,
9065 const ghobject_t& oid,
9066 const char *name,
9067 bufferptr& value)
9068{
9069 Collection *c = static_cast<Collection *>(c_.get());
9070 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
9071 if (!c->exists)
9072 return -ENOENT;
9073
9074 int r;
9075 {
9076 RWLock::RLocker l(c->lock);
31f18b77 9077 mempool::bluestore_cache_other::string k(name);
7c673cae
FG
9078
9079 OnodeRef o = c->get_onode(oid, false);
9080 if (!o || !o->exists) {
9081 r = -ENOENT;
9082 goto out;
9083 }
9084
9085 if (!o->onode.attrs.count(k)) {
9086 r = -ENODATA;
9087 goto out;
9088 }
9089 value = o->onode.attrs[k];
9090 r = 0;
9091 }
9092 out:
7c673cae
FG
9093 if (r == 0 && _debug_mdata_eio(oid)) {
9094 r = -EIO;
9095 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9096 }
9097 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
9098 << " = " << r << dendl;
9099 return r;
9100}
9101
7c673cae
FG
9102int BlueStore::getattrs(
9103 CollectionHandle &c_,
9104 const ghobject_t& oid,
9105 map<string,bufferptr>& aset)
9106{
9107 Collection *c = static_cast<Collection *>(c_.get());
9108 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
9109 if (!c->exists)
9110 return -ENOENT;
9111
9112 int r;
9113 {
9114 RWLock::RLocker l(c->lock);
9115
9116 OnodeRef o = c->get_onode(oid, false);
9117 if (!o || !o->exists) {
9118 r = -ENOENT;
9119 goto out;
9120 }
9121 for (auto& i : o->onode.attrs) {
9122 aset.emplace(i.first.c_str(), i.second);
9123 }
9124 r = 0;
9125 }
9126
9127 out:
7c673cae
FG
9128 if (r == 0 && _debug_mdata_eio(oid)) {
9129 r = -EIO;
9130 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9131 }
9132 dout(10) << __func__ << " " << c->cid << " " << oid
9133 << " = " << r << dendl;
9134 return r;
9135}
9136
9137int BlueStore::list_collections(vector<coll_t>& ls)
9138{
9139 RWLock::RLocker l(coll_lock);
11fdf7f2 9140 ls.reserve(coll_map.size());
7c673cae
FG
9141 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
9142 p != coll_map.end();
9143 ++p)
9144 ls.push_back(p->first);
9145 return 0;
9146}
9147
9148bool BlueStore::collection_exists(const coll_t& c)
9149{
9150 RWLock::RLocker l(coll_lock);
9151 return coll_map.count(c);
9152}
9153
11fdf7f2 9154int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 9155{
11fdf7f2 9156 dout(15) << __func__ << " " << ch->cid << dendl;
7c673cae
FG
9157 vector<ghobject_t> ls;
9158 ghobject_t next;
11fdf7f2 9159 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
9160 &ls, &next);
9161 if (r < 0) {
9162 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
9163 << dendl;
9164 return r;
9165 }
9166 *empty = ls.empty();
11fdf7f2 9167 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
7c673cae
FG
9168 return 0;
9169}
9170
11fdf7f2 9171int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 9172{
11fdf7f2
TL
9173 dout(15) << __func__ << " " << ch->cid << dendl;
9174 Collection *c = static_cast<Collection*>(ch.get());
7c673cae 9175 RWLock::RLocker l(c->lock);
11fdf7f2 9176 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
7c673cae
FG
9177 return c->cnode.bits;
9178}
9179
7c673cae
FG
9180int BlueStore::collection_list(
9181 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
9182 vector<ghobject_t> *ls, ghobject_t *pnext)
9183{
9184 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 9185 c->flush();
7c673cae
FG
9186 dout(15) << __func__ << " " << c->cid
9187 << " start " << start << " end " << end << " max " << max << dendl;
9188 int r;
9189 {
9190 RWLock::RLocker l(c->lock);
9191 r = _collection_list(c, start, end, max, ls, pnext);
9192 }
9193
7c673cae
FG
9194 dout(10) << __func__ << " " << c->cid
9195 << " start " << start << " end " << end << " max " << max
9196 << " = " << r << ", ls.size() = " << ls->size()
9197 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
9198 return r;
9199}
9200
9201int BlueStore::_collection_list(
9202 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
9203 vector<ghobject_t> *ls, ghobject_t *pnext)
9204{
9205
9206 if (!c->exists)
9207 return -ENOENT;
9208
494da23a 9209 auto start_time = mono_clock::now();
7c673cae
FG
9210 int r = 0;
9211 ghobject_t static_next;
9212 KeyValueDB::Iterator it;
9213 string temp_start_key, temp_end_key;
9214 string start_key, end_key;
9215 bool set_next = false;
9216 string pend;
9217 bool temp;
9218
9219 if (!pnext)
9220 pnext = &static_next;
9221
11fdf7f2 9222 if (start.is_max() || start.hobj.is_max()) {
9223 goto out;
9224 }
9225 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
9226 &start_key, &end_key);
9227 dout(20) << __func__
9228 << " range " << pretty_binary_string(temp_start_key)
9229 << " to " << pretty_binary_string(temp_end_key)
9230 << " and " << pretty_binary_string(start_key)
9231 << " to " << pretty_binary_string(end_key)
9232 << " start " << start << dendl;
9233 it = db->get_iterator(PREFIX_OBJ);
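 // objects live in two key ranges (temp objects first, then normal ones);
 // listing starts in the temp range and falls through to the normal range
 // once the temp range is exhausted (see the namespace switch below)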
9234 if (start == ghobject_t() ||
9235 start.hobj == hobject_t() ||
9236 start == c->cid.get_min_hobj()) {
9237 it->upper_bound(temp_start_key);
9238 temp = true;
9239 } else {
9240 string k;
9241 get_object_key(cct, start, &k);
9242 if (start.hobj.is_temp()) {
9243 temp = true;
11fdf7f2 9244 ceph_assert(k >= temp_start_key && k < temp_end_key);
7c673cae
FG
9245 } else {
9246 temp = false;
11fdf7f2 9247 ceph_assert(k >= start_key && k < end_key);
7c673cae 9248 }
11fdf7f2 9249 dout(20) << __func__ << " start from " << pretty_binary_string(k)
9250 << " temp=" << (int)temp << dendl;
9251 it->lower_bound(k);
9252 }
9253 if (end.hobj.is_max()) {
9254 pend = temp ? temp_end_key : end_key;
9255 } else {
9256 get_object_key(cct, end, &end_key);
9257 if (end.hobj.is_temp()) {
9258 if (temp)
9259 pend = end_key;
9260 else
9261 goto out;
9262 } else {
9263 pend = temp ? temp_end_key : end_key;
9264 }
9265 }
9266 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
9267 while (true) {
9268 if (!it->valid() || it->key() >= pend) {
9269 if (!it->valid())
9270 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
9271 else
9272 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
9273 << " >= " << end << dendl;
9274 if (temp) {
9275 if (end.hobj.is_temp()) {
9276 break;
9277 }
9278 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
9279 temp = false;
9280 it->upper_bound(start_key);
9281 pend = end_key;
9282 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
9283 continue;
9284 }
9285 break;
9286 }
9287 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
9288 if (is_extent_shard_key(it->key())) {
9289 it->next();
9290 continue;
9291 }
9292 ghobject_t oid;
9293 int r = get_key_object(it->key(), &oid);
11fdf7f2 9294 ceph_assert(r == 0);
7c673cae
FG
9295 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
9296 if (ls->size() >= (unsigned)max) {
9297 dout(20) << __func__ << " reached max " << max << dendl;
9298 *pnext = oid;
9299 set_next = true;
9300 break;
9301 }
9302 ls->push_back(oid);
9303 it->next();
9304 }
9305out:
9306 if (!set_next) {
9307 *pnext = ghobject_t::get_max();
9308 }
494da23a
TL
9309 log_latency_fn(
9310 __func__,
9311 l_bluestore_clist_lat,
9312 mono_clock::now() - start_time,
9313 cct->_conf->bluestore_log_collection_list_age,
9314 [&] (const ceph::timespan& lat) {
9315 ostringstream ostr;
9316 ostr << ", lat = " << timespan_str(lat)
9317 << " cid =" << c->cid
9318 << " start " << start << " end " << end
9319 << " max " << max;
9320 return ostr.str();
9321 }
9322 );
7c673cae
FG
9323 return r;
9324}
9325
7c673cae
FG
9326int BlueStore::omap_get(
9327 CollectionHandle &c_, ///< [in] Collection containing oid
9328 const ghobject_t &oid, ///< [in] Object containing omap
9329 bufferlist *header, ///< [out] omap header
9330 map<string, bufferlist> *out ///< [out] Key to value map
9331 )
9332{
9333 Collection *c = static_cast<Collection *>(c_.get());
9334 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9335 if (!c->exists)
9336 return -ENOENT;
9337 RWLock::RLocker l(c->lock);
9338 int r = 0;
9339 OnodeRef o = c->get_onode(oid, false);
9340 if (!o || !o->exists) {
9341 r = -ENOENT;
9342 goto out;
9343 }
9344 if (!o->onode.has_omap())
9345 goto out;
9346 o->flush();
9347 {
11fdf7f2
TL
9348 const string& prefix =
9349 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9350 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae
FG
9351 string head, tail;
9352 get_omap_header(o->onode.nid, &head);
9353 get_omap_tail(o->onode.nid, &tail);
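 // all omap entries for this onode sort between its header key and tail key;
 // the header (if present) is returned separately, everything else goes into *out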
9354 it->lower_bound(head);
9355 while (it->valid()) {
9356 if (it->key() == head) {
9357 dout(30) << __func__ << " got header" << dendl;
9358 *header = it->value();
9359 } else if (it->key() >= tail) {
9360 dout(30) << __func__ << " reached tail" << dendl;
9361 break;
9362 } else {
9363 string user_key;
9364 decode_omap_key(it->key(), &user_key);
11fdf7f2 9365 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
9366 << " -> " << user_key << dendl;
9367 (*out)[user_key] = it->value();
9368 }
9369 it->next();
9370 }
9371 }
9372 out:
9373 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9374 << dendl;
9375 return r;
9376}
9377
7c673cae
FG
9378int BlueStore::omap_get_header(
9379 CollectionHandle &c_, ///< [in] Collection containing oid
9380 const ghobject_t &oid, ///< [in] Object containing omap
9381 bufferlist *header, ///< [out] omap header
9382 bool allow_eio ///< [in] don't assert on eio
9383 )
9384{
9385 Collection *c = static_cast<Collection *>(c_.get());
9386 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9387 if (!c->exists)
9388 return -ENOENT;
9389 RWLock::RLocker l(c->lock);
9390 int r = 0;
9391 OnodeRef o = c->get_onode(oid, false);
9392 if (!o || !o->exists) {
9393 r = -ENOENT;
9394 goto out;
9395 }
9396 if (!o->onode.has_omap())
9397 goto out;
9398 o->flush();
9399 {
9400 string head;
9401 get_omap_header(o->onode.nid, &head);
11fdf7f2
TL
9402 if (db->get(o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
9403 head, header) >= 0) {
9404 dout(30) << __func__ << " got header" << dendl;
9405 } else {
9406 dout(30) << __func__ << " no header" << dendl;
9407 }
9408 }
9409 out:
9410 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9411 << dendl;
9412 return r;
9413}
9414
7c673cae
FG
9415int BlueStore::omap_get_keys(
9416 CollectionHandle &c_, ///< [in] Collection containing oid
9417 const ghobject_t &oid, ///< [in] Object containing omap
9418 set<string> *keys ///< [out] Keys defined on oid
9419 )
9420{
9421 Collection *c = static_cast<Collection *>(c_.get());
9422 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9423 if (!c->exists)
9424 return -ENOENT;
9425 RWLock::RLocker l(c->lock);
9426 int r = 0;
9427 OnodeRef o = c->get_onode(oid, false);
9428 if (!o || !o->exists) {
9429 r = -ENOENT;
9430 goto out;
9431 }
9432 if (!o->onode.has_omap())
9433 goto out;
9434 o->flush();
9435 {
11fdf7f2
TL
9436 const string& prefix =
9437 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9438 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae
FG
9439 string head, tail;
9440 get_omap_key(o->onode.nid, string(), &head);
9441 get_omap_tail(o->onode.nid, &tail);
9442 it->lower_bound(head);
9443 while (it->valid()) {
9444 if (it->key() >= tail) {
9445 dout(30) << __func__ << " reached tail" << dendl;
9446 break;
9447 }
9448 string user_key;
9449 decode_omap_key(it->key(), &user_key);
11fdf7f2 9450 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
9451 << " -> " << user_key << dendl;
9452 keys->insert(user_key);
9453 it->next();
11fdf7f2
TL
9454 }
9455 }
9456 out:
9457 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9458 << dendl;
9459 return r;
7c673cae
FG
9460}
9461
9462int BlueStore::omap_get_values(
9463 CollectionHandle &c_, ///< [in] Collection containing oid
9464 const ghobject_t &oid, ///< [in] Object containing omap
9465 const set<string> &keys, ///< [in] Keys to get
9466 map<string, bufferlist> *out ///< [out] Returned keys and values
9467 )
9468{
9469 Collection *c = static_cast<Collection *>(c_.get());
9470 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9471 if (!c->exists)
9472 return -ENOENT;
9473 RWLock::RLocker l(c->lock);
9474 int r = 0;
9475 string final_key;
9476 OnodeRef o = c->get_onode(oid, false);
9477 if (!o || !o->exists) {
9478 r = -ENOENT;
9479 goto out;
9480 }
9481 if (!o->onode.has_omap())
9482 goto out;
11fdf7f2
TL
9483 {
9484 const string& prefix =
9485 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9486 o->flush();
9487 _key_encode_u64(o->onode.nid, &final_key);
9488 final_key.push_back('.');
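 // omap value keys are <8-byte encoded nid> + '.' + <user key>; the
 // resize(9) below keeps just that fixed-width prefix between iterations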
9489 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9490 final_key.resize(9); // keep prefix
9491 final_key += *p;
9492 bufferlist val;
9493 if (db->get(prefix, final_key, &val) >= 0) {
9494 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
9495 << " -> " << *p << dendl;
9496 out->insert(make_pair(*p, val));
9497 }
7c673cae
FG
9498 }
9499 }
9500 out:
9501 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9502 << dendl;
9503 return r;
9504}
9505
7c673cae
FG
9506int BlueStore::omap_check_keys(
9507 CollectionHandle &c_, ///< [in] Collection containing oid
9508 const ghobject_t &oid, ///< [in] Object containing omap
9509 const set<string> &keys, ///< [in] Keys to check
9510 set<string> *out ///< [out] Subset of keys defined on oid
9511 )
9512{
9513 Collection *c = static_cast<Collection *>(c_.get());
9514 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9515 if (!c->exists)
9516 return -ENOENT;
9517 RWLock::RLocker l(c->lock);
9518 int r = 0;
9519 string final_key;
9520 OnodeRef o = c->get_onode(oid, false);
9521 if (!o || !o->exists) {
9522 r = -ENOENT;
9523 goto out;
9524 }
9525 if (!o->onode.has_omap())
9526 goto out;
11fdf7f2
TL
9527 {
9528 const string& prefix =
9529 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9530 o->flush();
9531 _key_encode_u64(o->onode.nid, &final_key);
9532 final_key.push_back('.');
9533 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9534 final_key.resize(9); // keep prefix
9535 final_key += *p;
9536 bufferlist val;
9537 if (db->get(prefix, final_key, &val) >= 0) {
9538 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
9539 << " -> " << *p << dendl;
9540 out->insert(*p);
9541 } else {
9542 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
9543 << " -> " << *p << dendl;
9544 }
7c673cae
FG
9545 }
9546 }
9547 out:
9548 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9549 << dendl;
9550 return r;
9551}
9552
7c673cae
FG
9553ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
9554 CollectionHandle &c_, ///< [in] collection
9555 const ghobject_t &oid ///< [in] object
9556 )
9557{
9558 Collection *c = static_cast<Collection *>(c_.get());
9559 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9560 if (!c->exists) {
9561 return ObjectMap::ObjectMapIterator();
9562 }
9563 RWLock::RLocker l(c->lock);
9564 OnodeRef o = c->get_onode(oid, false);
9565 if (!o || !o->exists) {
9566 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
9567 return ObjectMap::ObjectMapIterator();
9568 }
9569 o->flush();
9570 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
11fdf7f2
TL
9571 KeyValueDB::Iterator it = db->get_iterator(
9572 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP);
7c673cae
FG
9573 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
9574}
9575
9576// -----------------
9577// write helpers
9578
11fdf7f2
TL
9579uint64_t BlueStore::_get_ondisk_reserved() const {
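 // reserve at least SUPER_RESERVED (8 KiB) at the start of the device,
 // rounded up to min_alloc_size; e.g. with a 64 KiB min_alloc_size this
 // reserves 64 KiB, with a 4 KiB min_alloc_size it stays at 8 KiB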
9580 return round_up_to(
9581 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
9582}
9583
7c673cae
FG
9584void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
9585{
9586 dout(10) << __func__ << " ondisk_format " << ondisk_format
9587 << " min_compat_ondisk_format " << min_compat_ondisk_format
9588 << dendl;
11fdf7f2 9589 ceph_assert(ondisk_format == latest_ondisk_format);
7c673cae
FG
9590 {
9591 bufferlist bl;
11fdf7f2 9592 encode(ondisk_format, bl);
7c673cae
FG
9593 t->set(PREFIX_SUPER, "ondisk_format", bl);
9594 }
9595 {
9596 bufferlist bl;
11fdf7f2 9597 encode(min_compat_ondisk_format, bl);
7c673cae
FG
9598 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
9599 }
9600}
9601
9602int BlueStore::_open_super_meta()
9603{
9604 // nid
9605 {
9606 nid_max = 0;
9607 bufferlist bl;
9608 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 9609 auto p = bl.cbegin();
7c673cae
FG
9610 try {
9611 uint64_t v;
11fdf7f2 9612 decode(v, p);
7c673cae
FG
9613 nid_max = v;
9614 } catch (buffer::error& e) {
9615 derr << __func__ << " unable to read nid_max" << dendl;
9616 return -EIO;
9617 }
9618 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
9619 nid_last = nid_max.load();
9620 }
9621
9622 // blobid
9623 {
9624 blobid_max = 0;
9625 bufferlist bl;
9626 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 9627 auto p = bl.cbegin();
7c673cae
FG
9628 try {
9629 uint64_t v;
11fdf7f2 9630 decode(v, p);
7c673cae
FG
9631 blobid_max = v;
9632 } catch (buffer::error& e) {
9633 derr << __func__ << " unable to read blobid_max" << dendl;
9634 return -EIO;
9635 }
9636 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
9637 blobid_last = blobid_max.load();
9638 }
9639
9640 // freelist
9641 {
9642 bufferlist bl;
9643 db->get(PREFIX_SUPER, "freelist_type", &bl);
9644 if (bl.length()) {
9645 freelist_type = std::string(bl.c_str(), bl.length());
9646 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
9647 } else {
11fdf7f2 9648 ceph_abort_msg("extent freelist manager is not supported");
7c673cae 9649 }
7c673cae
FG
9650 }
9651
9652 // ondisk format
9653 int32_t compat_ondisk_format = 0;
9654 {
9655 bufferlist bl;
9656 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
9657 if (r < 0) {
9658 // base case: kraken bluestore is v1 and readable by v1
9659 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
9660 << dendl;
9661 ondisk_format = 1;
9662 compat_ondisk_format = 1;
9663 } else {
11fdf7f2 9664 auto p = bl.cbegin();
7c673cae 9665 try {
11fdf7f2 9666 decode(ondisk_format, p);
7c673cae
FG
9667 } catch (buffer::error& e) {
9668 derr << __func__ << " unable to read ondisk_format" << dendl;
9669 return -EIO;
9670 }
9671 bl.clear();
9672 {
9673 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11fdf7f2
TL
9674 ceph_assert(!r);
9675 auto p = bl.cbegin();
7c673cae 9676 try {
11fdf7f2 9677 decode(compat_ondisk_format, p);
7c673cae
FG
9678 } catch (buffer::error& e) {
9679 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
9680 return -EIO;
9681 }
9682 }
9683 }
9684 dout(10) << __func__ << " ondisk_format " << ondisk_format
9685 << " compat_ondisk_format " << compat_ondisk_format
9686 << dendl;
9687 }
9688
9689 if (latest_ondisk_format < compat_ondisk_format) {
9690 derr << __func__ << " compat_ondisk_format is "
9691 << compat_ondisk_format << " but we only understand version "
9692 << latest_ondisk_format << dendl;
9693 return -EPERM;
9694 }
7c673cae
FG
9695
9696 {
9697 bufferlist bl;
9698 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 9699 auto p = bl.cbegin();
7c673cae
FG
9700 try {
9701 uint64_t val;
11fdf7f2 9702 decode(val, p);
7c673cae 9703 min_alloc_size = val;
224ce89b 9704 min_alloc_size_order = ctz(val);
11fdf7f2 9705 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
7c673cae
FG
9706 } catch (buffer::error& e) {
9707 derr << __func__ << " unable to read min_alloc_size" << dendl;
9708 return -EIO;
9709 }
9710 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
9711 << std::dec << dendl;
9712 }
224ce89b 9713 _open_statfs();
7c673cae
FG
9714 _set_alloc_sizes();
9715 _set_throttle_params();
9716
9717 _set_csum();
9718 _set_compression();
9719 _set_blob_size();
9720
11fdf7f2 9721 _validate_bdev();
7c673cae
FG
9722 return 0;
9723}
9724
9725int BlueStore::_upgrade_super()
9726{
9727 dout(1) << __func__ << " from " << ondisk_format << ", latest "
9728 << latest_ondisk_format << dendl;
11fdf7f2
TL
9729 if (ondisk_format < latest_ondisk_format) {
9730 ceph_assert(ondisk_format > 0);
9731 ceph_assert(ondisk_format < latest_ondisk_format);
9732
9733 if (ondisk_format == 1) {
9734 // changes:
9735 // - super: added ondisk_format
9736 // - super: added min_readable_ondisk_format
9737 // - super: added min_compat_ondisk_format
9738 // - super: added min_alloc_size
9739 // - super: removed min_min_alloc_size
9740 KeyValueDB::Transaction t = db->get_transaction();
9741 {
9742 bufferlist bl;
9743 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
9744 auto p = bl.cbegin();
9745 try {
9746 uint64_t val;
9747 decode(val, p);
9748 min_alloc_size = val;
9749 } catch (buffer::error& e) {
9750 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
9751 return -EIO;
9752 }
9753 t->set(PREFIX_SUPER, "min_alloc_size", bl);
9754 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 9755 }
11fdf7f2
TL
9756 ondisk_format = 2;
9757 _prepare_ondisk_format_super(t);
9758 int r = db->submit_transaction_sync(t);
9759 ceph_assert(r == 0);
7c673cae 9760 }
7c673cae 9761 }
7c673cae
FG
9762 // done
9763 dout(1) << __func__ << " done" << dendl;
9764 return 0;
9765}
9766
9767void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
9768{
224ce89b 9769 if (o->onode.nid) {
11fdf7f2 9770 ceph_assert(o->exists);
7c673cae 9771 return;
224ce89b 9772 }
7c673cae
FG
9773 uint64_t nid = ++nid_last;
9774 dout(20) << __func__ << " " << nid << dendl;
9775 o->onode.nid = nid;
9776 txc->last_nid = nid;
224ce89b 9777 o->exists = true;
7c673cae
FG
9778}
9779
9780uint64_t BlueStore::_assign_blobid(TransContext *txc)
9781{
9782 uint64_t bid = ++blobid_last;
9783 dout(20) << __func__ << " " << bid << dendl;
9784 txc->last_blobid = bid;
9785 return bid;
9786}
9787
9788void BlueStore::get_db_statistics(Formatter *f)
9789{
9790 db->get_statistics(f);
9791}
9792
11fdf7f2
TL
9793BlueStore::TransContext *BlueStore::_txc_create(
9794 Collection *c, OpSequencer *osr,
9795 list<Context*> *on_commits)
7c673cae 9796{
11fdf7f2 9797 TransContext *txc = new TransContext(cct, c, osr, on_commits);
7c673cae
FG
9798 txc->t = db->get_transaction();
9799 osr->queue_new(txc);
9800 dout(20) << __func__ << " osr " << osr << " = " << txc
9801 << " seq " << txc->seq << dendl;
9802 return txc;
9803}
9804
9805void BlueStore::_txc_calc_cost(TransContext *txc)
9806{
11fdf7f2
TL
9807 // one "io" for the kv commit
9808 auto ios = 1 + txc->ioc.get_num_ios();
7c673cae
FG
9809 auto cost = throttle_cost_per_io.load();
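 // total cost = (data aios + 1 kv commit) * per-io cost + raw bytes written;
 // e.g. a txc with 2 data aios and 4096 bytes costs 3 * cost_per_io + 4096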
9810 txc->cost = ios * cost + txc->bytes;
9811 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
9812 << ios << " ios * " << cost << " + " << txc->bytes
9813 << " bytes)" << dendl;
9814}
9815
9816void BlueStore::_txc_update_store_statfs(TransContext *txc)
9817{
9818 if (txc->statfs_delta.is_empty())
9819 return;
9820
9821 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
9822 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
9823 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
9824 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
9825 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
9826
9827 bufferlist bl;
9828 txc->statfs_delta.encode(bl);
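 // persist the delta either under a per-pool stat key or under the single
 // global statfs key, and fold it into the in-memory vstatfs totals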
11fdf7f2
TL
9829 if (per_pool_stat_collection) {
9830 string key;
9831 get_pool_stat_key(txc->osd_pool_id, &key);
9832 txc->t->merge(PREFIX_STAT, key, bl);
9833
9834 std::lock_guard l(vstatfs_lock);
9835 auto& stats = osd_pools[txc->osd_pool_id];
9836 stats += txc->statfs_delta;
9837
9838 vstatfs += txc->statfs_delta; //non-persistent in this mode
9839
9840 } else {
9841 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
7c673cae 9842
11fdf7f2
TL
9843 std::lock_guard l(vstatfs_lock);
9844 vstatfs += txc->statfs_delta;
9845 }
7c673cae
FG
9846 txc->statfs_delta.reset();
9847}
9848
9849void BlueStore::_txc_state_proc(TransContext *txc)
9850{
9851 while (true) {
9852 dout(10) << __func__ << " txc " << txc
9853 << " " << txc->get_state_name() << dendl;
9854 switch (txc->state) {
9855 case TransContext::STATE_PREPARE:
9856 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
9857 if (txc->ioc.has_pending_aios()) {
9858 txc->state = TransContext::STATE_AIO_WAIT;
9859 txc->had_ios = true;
9860 _txc_aio_submit(txc);
9861 return;
9862 }
9863 // ** fall-thru **
9864
9865 case TransContext::STATE_AIO_WAIT:
11fdf7f2
TL
9866 {
9867 utime_t lat = txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
9868 if (lat >= cct->_conf->bluestore_log_op_age) {
9869 dout(0) << __func__ << " slow aio_wait, txc = " << txc
9870 << ", latency = " << lat
9871 << dendl;
9872 }
9873 }
9874
7c673cae
FG
9875 _txc_finish_io(txc); // may trigger blocked txc's too
9876 return;
9877
9878 case TransContext::STATE_IO_DONE:
11fdf7f2 9879 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
7c673cae
FG
9880 if (txc->had_ios) {
9881 ++txc->osr->txc_with_unstable_io;
9882 }
9883 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
9884 txc->state = TransContext::STATE_KV_QUEUED;
9885 if (cct->_conf->bluestore_sync_submit_transaction) {
9886 if (txc->last_nid >= nid_max ||
9887 txc->last_blobid >= blobid_max) {
9888 dout(20) << __func__
9889 << " last_{nid,blobid} exceeds max, submit via kv thread"
9890 << dendl;
9891 } else if (txc->osr->kv_committing_serially) {
9892 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
9893 << dendl;
9894 // note: this is starvation-prone. once we have a txc in a busy
9895 // sequencer that is committing serially it is possible to keep
9896 // submitting new transactions fast enough that we get stuck doing
9897 // so. the alternative is to block here... fixme?
9898 } else if (txc->osr->txc_with_unstable_io) {
9899 dout(20) << __func__ << " prior txc(s) with unstable ios "
9900 << txc->osr->txc_with_unstable_io.load() << dendl;
9901 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
9902 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
9903 == 0) {
9904 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
9905 << dendl;
9906 } else {
9907 txc->state = TransContext::STATE_KV_SUBMITTED;
31f18b77 9908 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11fdf7f2 9909 ceph_assert(r == 0);
7c673cae
FG
9910 _txc_applied_kv(txc);
9911 }
9912 }
9913 {
11fdf7f2 9914 std::lock_guard l(kv_lock);
7c673cae
FG
9915 kv_queue.push_back(txc);
9916 kv_cond.notify_one();
9917 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
9918 kv_queue_unsubmitted.push_back(txc);
9919 ++txc->osr->kv_committing_serially;
9920 }
31f18b77
FG
9921 if (txc->had_ios)
9922 kv_ios++;
9923 kv_throttle_costs += txc->cost;
7c673cae
FG
9924 }
9925 return;
9926 case TransContext::STATE_KV_SUBMITTED:
7c673cae
FG
9927 _txc_committed_kv(txc);
9928 // ** fall-thru **
9929
9930 case TransContext::STATE_KV_DONE:
9931 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
9932 if (txc->deferred_txn) {
9933 txc->state = TransContext::STATE_DEFERRED_QUEUED;
9934 _deferred_queue(txc);
9935 return;
9936 }
9937 txc->state = TransContext::STATE_FINISHING;
9938 break;
9939
9940 case TransContext::STATE_DEFERRED_CLEANUP:
9941 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
9942 txc->state = TransContext::STATE_FINISHING;
9943 // ** fall-thru **
9944
9945 case TransContext::STATE_FINISHING:
9946 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
9947 _txc_finish(txc);
9948 return;
9949
9950 default:
9951 derr << __func__ << " unexpected txc " << txc
9952 << " state " << txc->get_state_name() << dendl;
11fdf7f2 9953 ceph_abort_msg("unexpected txc state");
7c673cae
FG
9954 return;
9955 }
9956 }
9957}
9958
9959void BlueStore::_txc_finish_io(TransContext *txc)
9960{
9961 dout(20) << __func__ << " " << txc << dendl;
9962
9963 /*
9964 * we need to preserve the order of kv transactions,
9965 * even though aio will complete in any order.
9966 */
9967
9968 OpSequencer *osr = txc->osr.get();
11fdf7f2 9969 std::lock_guard l(osr->qlock);
7c673cae 9970 txc->state = TransContext::STATE_IO_DONE;
11fdf7f2 9971 txc->ioc.release_running_aios();
7c673cae
FG
9972 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
9973 while (p != osr->q.begin()) {
9974 --p;
9975 if (p->state < TransContext::STATE_IO_DONE) {
9976 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
9977 << p->get_state_name() << dendl;
9978 return;
9979 }
9980 if (p->state > TransContext::STATE_IO_DONE) {
9981 ++p;
9982 break;
9983 }
9984 }
9985 do {
9986 _txc_state_proc(&*p++);
9987 } while (p != osr->q.end() &&
9988 p->state == TransContext::STATE_IO_DONE);
9989
11fdf7f2 9990 if (osr->kv_submitted_waiters) {
9991 osr->qcond.notify_all();
9992 }
9993}
9994
9995void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
9996{
9997 dout(20) << __func__ << " txc " << txc
9998 << " onodes " << txc->onodes
9999 << " shared_blobs " << txc->shared_blobs
10000 << dendl;
10001
10002 // finalize onodes
10003 for (auto o : txc->onodes) {
11fdf7f2 10004 _record_onode(o, t);
7c673cae
FG
10005 o->flushing_count++;
10006 }
10007
10008 // objects we modified but didn't affect the onode
10009 auto p = txc->modified_objects.begin();
10010 while (p != txc->modified_objects.end()) {
10011 if (txc->onodes.count(*p) == 0) {
10012 (*p)->flushing_count++;
10013 ++p;
10014 } else {
10015 // drop entries that are duplicated in the onodes list to avoid problems in _txc_finish
10016 p = txc->modified_objects.erase(p);
10017 }
10018 }
10019
10020 // finalize shared_blobs
10021 for (auto sb : txc->shared_blobs) {
10022 string key;
10023 auto sbid = sb->get_sbid();
10024 get_shared_blob_key(sbid, &key);
10025 if (sb->persistent->empty()) {
11fdf7f2
TL
10026 dout(20) << __func__ << " shared_blob 0x"
10027 << std::hex << sbid << std::dec
10028 << " is empty" << dendl;
10029 t->rmkey(PREFIX_SHARED_BLOB, key);
10030 } else {
10031 bufferlist bl;
11fdf7f2
TL
10032 encode(*(sb->persistent), bl);
10033 dout(20) << __func__ << " shared_blob 0x"
10034 << std::hex << sbid << std::dec
31f18b77 10035 << " is " << bl.length() << " " << *sb << dendl;
7c673cae
FG
10036 t->set(PREFIX_SHARED_BLOB, key, bl);
10037 }
10038 }
10039}
10040
10041void BlueStore::BSPerfTracker::update_from_perfcounters(
10042 PerfCounters &logger)
10043{
11fdf7f2
TL
10044 os_commit_latency_ns.consume_next(
10045 logger.get_tavg_ns(
7c673cae 10046 l_bluestore_commit_lat));
11fdf7f2
TL
10047 os_apply_latency_ns.consume_next(
10048 logger.get_tavg_ns(
10049 l_bluestore_commit_lat));
10050}
10051
10052void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
10053{
10054 dout(20) << __func__ << " txc " << txc << std::hex
10055 << " allocated 0x" << txc->allocated
10056 << " released 0x" << txc->released
10057 << std::dec << dendl;
10058
10059 // We have to handle the case where we allocate *and* deallocate the
10060 // same region in this transaction. The freelist doesn't like that.
10061 // (Actually, the only thing that cares is the BitmapFreelistManager
10062 // debug check. But that's important.)
10063 interval_set<uint64_t> tmp_allocated, tmp_released;
10064 interval_set<uint64_t> *pallocated = &txc->allocated;
10065 interval_set<uint64_t> *preleased = &txc->released;
10066 if (!txc->allocated.empty() && !txc->released.empty()) {
10067 interval_set<uint64_t> overlap;
10068 overlap.intersection_of(txc->allocated, txc->released);
10069 if (!overlap.empty()) {
10070 tmp_allocated = txc->allocated;
10071 tmp_allocated.subtract(overlap);
10072 tmp_released = txc->released;
10073 tmp_released.subtract(overlap);
10074 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
10075 << ", new allocated 0x" << tmp_allocated
10076 << " released 0x" << tmp_released << std::dec
10077 << dendl;
10078 pallocated = &tmp_allocated;
10079 preleased = &tmp_released;
10080 }
10081 }
10082
10083 // update freelist with non-overlap sets
10084 for (interval_set<uint64_t>::iterator p = pallocated->begin();
10085 p != pallocated->end();
10086 ++p) {
10087 fm->allocate(p.get_start(), p.get_len(), t);
10088 }
10089 for (interval_set<uint64_t>::iterator p = preleased->begin();
10090 p != preleased->end();
10091 ++p) {
10092 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
10093 << "~" << p.get_len() << std::dec << dendl;
10094 fm->release(p.get_start(), p.get_len(), t);
10095 }
10096
10097 _txc_update_store_statfs(txc);
10098}
10099
10100void BlueStore::_txc_applied_kv(TransContext *txc)
10101{
10102 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
10103 for (auto& o : *ls) {
10104 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
10105 << dendl;
10106 if (--o->flushing_count == 0) {
11fdf7f2 10107 std::lock_guard l(o->flush_lock);
7c673cae
FG
10108 o->flush_cond.notify_all();
10109 }
10110 }
10111 }
10112}
10113
10114void BlueStore::_txc_committed_kv(TransContext *txc)
10115{
10116 dout(20) << __func__ << " txc " << txc << dendl;
1adf2230 10117 {
11fdf7f2 10118 std::lock_guard l(txc->osr->qlock);
1adf2230 10119 txc->state = TransContext::STATE_KV_DONE;
11fdf7f2
TL
10120 if (txc->ch->commit_queue) {
10121 txc->ch->commit_queue->queue(txc->oncommits);
10122 } else {
10123 finisher.queue(txc->oncommits);
1adf2230 10124 }
7c673cae 10125 }
1adf2230 10126 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
494da23a
TL
10127 log_latency_fn(
10128 __func__,
10129 l_bluestore_commit_lat,
10130 ceph::make_timespan(ceph_clock_now() - txc->start),
10131 cct->_conf->bluestore_log_op_age,
10132 [&](auto lat) {
10133 return ", txc = " + stringify(txc);
10134 }
11fdf7f2 10135 );
7c673cae
FG
10136}
10137
10138void BlueStore::_txc_finish(TransContext *txc)
10139{
10140 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
11fdf7f2 10141 ceph_assert(txc->state == TransContext::STATE_FINISHING);
7c673cae
FG
10142
10143 for (auto& sb : txc->shared_blobs_written) {
f64942e4 10144 sb->finish_write(txc->seq);
7c673cae
FG
10145 }
10146 txc->shared_blobs_written.clear();
10147
10148 while (!txc->removed_collections.empty()) {
10149 _queue_reap_collection(txc->removed_collections.front());
10150 txc->removed_collections.pop_front();
10151 }
10152
10153 OpSequencerRef osr = txc->osr;
7c673cae 10154 bool empty = false;
31f18b77 10155 bool submit_deferred = false;
7c673cae
FG
10156 OpSequencer::q_list_t releasing_txc;
10157 {
11fdf7f2 10158 std::lock_guard l(osr->qlock);
7c673cae
FG
10159 txc->state = TransContext::STATE_DONE;
10160 bool notify = false;
10161 while (!osr->q.empty()) {
10162 TransContext *txc = &osr->q.front();
10163 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
10164 << dendl;
10165 if (txc->state != TransContext::STATE_DONE) {
10166 if (txc->state == TransContext::STATE_PREPARE &&
10167 deferred_aggressive) {
10168 // for _osr_drain_preceding()
10169 notify = true;
10170 }
31f18b77 10171 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 10172 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
31f18b77
FG
10173 submit_deferred = true;
10174 }
7c673cae
FG
10175 break;
10176 }
10177
7c673cae
FG
10178 osr->q.pop_front();
10179 releasing_txc.push_back(*txc);
10180 notify = true;
10181 }
10182 if (notify) {
10183 osr->qcond.notify_all();
10184 }
10185 if (osr->q.empty()) {
10186 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
10187 empty = true;
10188 }
10189 }
10190 while (!releasing_txc.empty()) {
10191 // release to allocator only after all preceding txc's have also
10192 // finished any deferred writes that potentially land in these
10193 // blocks
10194 auto txc = &releasing_txc.front();
10195 _txc_release_alloc(txc);
10196 releasing_txc.pop_front();
10197 txc->log_state_latency(logger, l_bluestore_state_done_lat);
10198 delete txc;
10199 }
10200
31f18b77
FG
10201 if (submit_deferred) {
10202 // we're pinning memory; flush! we could be more fine-grained here but
10203 // i'm not sure it's worth the bother.
10204 deferred_try_submit();
7c673cae
FG
10205 }
10206
7c673cae 10207 if (empty && osr->zombie) {
11fdf7f2
TL
10208 std::lock_guard l(zombie_osr_lock);
10209 if (zombie_osr_set.erase(osr->cid)) {
10210 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
10211 } else {
10212 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
10213 << dendl;
10214 }
7c673cae 10215 }
11fdf7f2 10216 }
7c673cae
FG
10217
10218void BlueStore::_txc_release_alloc(TransContext *txc)
10219{
a8e16298 10220 // it's expected we're called with lazy_release_lock already taken!
11fdf7f2
TL
10221 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
10222 int r = 0;
10223 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
10224 r = bdev->queue_discard(txc->released);
10225 if (r == 0) {
10226 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
10227 << txc->released << std::dec << dendl;
10228 goto out;
10229 }
10230 } else if (cct->_conf->bdev_enable_discard) {
10231 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
10232 bdev->discard(p.get_start(), p.get_len());
10233 }
10234 }
10235 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
94b18763 10236 << txc->released << std::dec << dendl;
11fdf7f2 10237 alloc->release(txc->released);
7c673cae
FG
10238 }
10239
11fdf7f2 10240out:
7c673cae
FG
10241 txc->allocated.clear();
10242 txc->released.clear();
10243}
10244
11fdf7f2
TL
10245void BlueStore::_osr_attach(Collection *c)
10246{
10247 // note: caller has RWLock on coll_map
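 // three cases: reuse the osr of an existing collection with the same cid,
 // resurrect a zombie osr left behind by a removed collection, or create a
 // fresh one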
10248 auto q = coll_map.find(c->cid);
10249 if (q != coll_map.end()) {
10250 c->osr = q->second->osr;
10251 ldout(cct, 10) << __func__ << " " << c->cid
10252 << " reusing osr " << c->osr << " from existing coll "
10253 << q->second << dendl;
10254 } else {
10255 std::lock_guard l(zombie_osr_lock);
10256 auto p = zombie_osr_set.find(c->cid);
10257 if (p == zombie_osr_set.end()) {
10258 c->osr = new OpSequencer(this, c->cid);
10259 ldout(cct, 10) << __func__ << " " << c->cid
10260 << " fresh osr " << c->osr << dendl;
10261 } else {
10262 c->osr = p->second;
10263 zombie_osr_set.erase(p);
10264 ldout(cct, 10) << __func__ << " " << c->cid
10265 << " resurrecting zombie osr " << c->osr << dendl;
10266 c->osr->zombie = false;
10267 }
10268 }
10269}
10270
10271void BlueStore::_osr_register_zombie(OpSequencer *osr)
10272{
10273 std::lock_guard l(zombie_osr_lock);
10274 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
10275 osr->zombie = true;
10276 auto i = zombie_osr_set.emplace(osr->cid, osr);
10277 // this is either a new insertion or the same osr is already there
10278 ceph_assert(i.second || i.first->second == osr);
10279}
10280
7c673cae
FG
10281void BlueStore::_osr_drain_preceding(TransContext *txc)
10282{
10283 OpSequencer *osr = txc->osr.get();
10284 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
10285 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
10286 {
10287 // submit anything pending
224ce89b 10288 deferred_lock.lock();
11fdf7f2 10289 if (osr->deferred_pending && !osr->deferred_running) {
224ce89b
WB
10290 _deferred_submit_unlock(osr);
10291 } else {
10292 deferred_lock.unlock();
7c673cae
FG
10293 }
10294 }
10295 {
10296 // wake up any previously finished deferred events
11fdf7f2 10297 std::lock_guard l(kv_lock);
7c673cae
FG
10298 kv_cond.notify_one();
10299 }
10300 osr->drain_preceding(txc);
10301 --deferred_aggressive;
10302 dout(10) << __func__ << " " << osr << " done" << dendl;
10303}
10304
11fdf7f2
TL
10305void BlueStore::_osr_drain(OpSequencer *osr)
10306{
10307 dout(10) << __func__ << " " << osr << dendl;
10308 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
10309 {
10310 // submit anything pending
10311 deferred_lock.lock();
10312 if (osr->deferred_pending && !osr->deferred_running) {
10313 _deferred_submit_unlock(osr);
10314 } else {
10315 deferred_lock.unlock();
10316 }
10317 }
10318 {
10319 // wake up any previously finished deferred events
10320 std::lock_guard l(kv_lock);
10321 kv_cond.notify_one();
10322 }
10323 osr->drain();
10324 --deferred_aggressive;
10325 dout(10) << __func__ << " " << osr << " done" << dendl;
10326}
10327
7c673cae
FG
10328void BlueStore::_osr_drain_all()
10329{
10330 dout(10) << __func__ << dendl;
10331
10332 set<OpSequencerRef> s;
11fdf7f2
TL
10333 vector<OpSequencerRef> zombies;
10334 {
10335 RWLock::RLocker l(coll_lock);
10336 for (auto& i : coll_map) {
10337 s.insert(i.second->osr);
10338 }
10339 }
7c673cae 10340 {
11fdf7f2
TL
10341 std::lock_guard l(zombie_osr_lock);
10342 for (auto& i : zombie_osr_set) {
10343 s.insert(i.second);
10344 zombies.push_back(i.second);
10345 }
7c673cae
FG
10346 }
10347 dout(20) << __func__ << " osr_set " << s << dendl;
10348
10349 ++deferred_aggressive;
10350 {
10351 // submit anything pending
224ce89b 10352 deferred_try_submit();
7c673cae
FG
10353 }
10354 {
10355 // wake up any previously finished deferred events
11fdf7f2 10356 std::lock_guard l(kv_lock);
7c673cae
FG
10357 kv_cond.notify_one();
10358 }
31f18b77 10359 {
11fdf7f2 10360 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
10361 kv_finalize_cond.notify_one();
10362 }
7c673cae
FG
10363 for (auto osr : s) {
10364 dout(20) << __func__ << " drain " << osr << dendl;
10365 osr->drain();
10366 }
10367 --deferred_aggressive;
10368
7c673cae 10369 {
11fdf7f2
TL
10370 std::lock_guard l(zombie_osr_lock);
10371 for (auto& osr : zombies) {
10372 if (zombie_osr_set.erase(osr->cid)) {
10373 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
10374 ceph_assert(osr->q.empty());
10375 } else if (osr->zombie) {
10376 dout(10) << __func__ << " empty zombie osr " << osr
10377 << " already reaped" << dendl;
10378 ceph_assert(osr->q.empty());
10379 } else {
10380 dout(10) << __func__ << " empty zombie osr " << osr
10381 << " resurrected" << dendl;
10382 }
7c673cae
FG
10383 }
10384 }
11fdf7f2
TL
10385
10386 dout(10) << __func__ << " done" << dendl;
7c673cae
FG
10387}
10388
11fdf7f2 10389
31f18b77
FG
10390void BlueStore::_kv_start()
10391{
10392 dout(10) << __func__ << dendl;
10393
181888fb 10394 deferred_finisher.start();
11fdf7f2 10395 finisher.start();
31f18b77
FG
10396 kv_sync_thread.create("bstore_kv_sync");
10397 kv_finalize_thread.create("bstore_kv_final");
10398}
10399
10400void BlueStore::_kv_stop()
10401{
10402 dout(10) << __func__ << dendl;
10403 {
11fdf7f2 10404 std::unique_lock l(kv_lock);
31f18b77
FG
10405 while (!kv_sync_started) {
10406 kv_cond.wait(l);
10407 }
10408 kv_stop = true;
10409 kv_cond.notify_all();
10410 }
10411 {
11fdf7f2 10412 std::unique_lock l(kv_finalize_lock);
31f18b77
FG
10413 while (!kv_finalize_started) {
10414 kv_finalize_cond.wait(l);
10415 }
10416 kv_finalize_stop = true;
10417 kv_finalize_cond.notify_all();
10418 }
10419 kv_sync_thread.join();
10420 kv_finalize_thread.join();
11fdf7f2 10421 ceph_assert(removed_collections.empty());
31f18b77 10422 {
11fdf7f2 10423 std::lock_guard l(kv_lock);
31f18b77
FG
10424 kv_stop = false;
10425 }
10426 {
11fdf7f2 10427 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
10428 kv_finalize_stop = false;
10429 }
10430 dout(10) << __func__ << " stopping finishers" << dendl;
181888fb
FG
10431 deferred_finisher.wait_for_empty();
10432 deferred_finisher.stop();
11fdf7f2
TL
10433 finisher.wait_for_empty();
10434 finisher.stop();
31f18b77
FG
10435 dout(10) << __func__ << " stopped" << dendl;
10436}
10437
7c673cae
FG
10438void BlueStore::_kv_sync_thread()
10439{
10440 dout(10) << __func__ << " start" << dendl;
11fdf7f2
TL
10441 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
10442 std::unique_lock l(kv_lock);
10443 ceph_assert(!kv_sync_started);
31f18b77
FG
10444 kv_sync_started = true;
10445 kv_cond.notify_all();
7c673cae 10446 while (true) {
11fdf7f2 10447 ceph_assert(kv_committing.empty());
7c673cae
FG
10448 if (kv_queue.empty() &&
10449 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 10450 !deferred_aggressive)) {
7c673cae
FG
10451 if (kv_stop)
10452 break;
10453 dout(20) << __func__ << " sleep" << dendl;
11fdf7f2 10454 kv_cond.wait(l);
7c673cae
FG
10455 dout(20) << __func__ << " wake" << dendl;
10456 } else {
10457 deque<TransContext*> kv_submitting;
10458 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
10459 uint64_t aios = 0, costs = 0;
10460
7c673cae
FG
10461 dout(20) << __func__ << " committing " << kv_queue.size()
10462 << " submitting " << kv_queue_unsubmitted.size()
10463 << " deferred done " << deferred_done_queue.size()
10464 << " stable " << deferred_stable_queue.size()
10465 << dendl;
10466 kv_committing.swap(kv_queue);
10467 kv_submitting.swap(kv_queue_unsubmitted);
10468 deferred_done.swap(deferred_done_queue);
10469 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
10470 aios = kv_ios;
10471 costs = kv_throttle_costs;
10472 kv_ios = 0;
10473 kv_throttle_costs = 0;
7c673cae
FG
10474 l.unlock();
10475
10476 dout(30) << __func__ << " committing " << kv_committing << dendl;
10477 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
10478 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
10479 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
10480
11fdf7f2
TL
10481 auto start = mono_clock::now();
10482
7c673cae
FG
10483 bool force_flush = false;
10484 // if bluefs is sharing the same device as data (only), then we
10485 // can rely on the bluefs commit to flush the device and make
10486 // deferred aios stable. that means that if we do have done deferred
10487 // txcs AND we are not on a single device, we need to force a flush.
10488 if (bluefs_single_shared_device && bluefs) {
31f18b77 10489 if (aios) {
7c673cae 10490 force_flush = true;
11fdf7f2 10491 } else if (kv_committing.empty() && deferred_stable.empty()) {
10492 force_flush = true; // there's nothing else to commit!
10493 } else if (deferred_aggressive) {
10494 force_flush = true;
10495 }
11fdf7f2
TL
10496 } else {
10497 if (aios || !deferred_done.empty()) {
10498 force_flush = true;
10499 } else {
10500 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
10501 }
10502 }
7c673cae
FG
10503
10504 if (force_flush) {
31f18b77 10505 dout(20) << __func__ << " num_aios=" << aios
10506 << " force_flush=" << (int)force_flush
10507 << ", flushing, deferred done->stable" << dendl;
10508 // flush/barrier on block device
10509 bdev->flush();
10510
10511 // if we flush then deferred done are now deferred stable
10512 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
10513 deferred_done.end());
10514 deferred_done.clear();
10515 }
11fdf7f2 10516 auto after_flush = mono_clock::now();
7c673cae
FG
10517
10518 // we will use one final transaction to force a sync
10519 KeyValueDB::Transaction synct = db->get_transaction();
10520
10521 // increase {nid,blobid}_max? note that this covers both the
10522 // case where we are approaching the max and the case we passed
10523 // it. in either case, we increase the max in the earlier txn
10524 // we submit.
10525 uint64_t new_nid_max = 0, new_blobid_max = 0;
10526 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
10527 KeyValueDB::Transaction t =
10528 kv_submitting.empty() ? synct : kv_submitting.front()->t;
10529 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
10530 bufferlist bl;
11fdf7f2 10531 encode(new_nid_max, bl);
7c673cae
FG
10532 t->set(PREFIX_SUPER, "nid_max", bl);
10533 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
10534 }
10535 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
10536 KeyValueDB::Transaction t =
10537 kv_submitting.empty() ? synct : kv_submitting.front()->t;
10538 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
10539 bufferlist bl;
11fdf7f2 10540 encode(new_blobid_max, bl);
7c673cae
FG
10541 t->set(PREFIX_SUPER, "blobid_max", bl);
10542 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
10543 }
c07f9fc5
FG
10544
10545 for (auto txc : kv_committing) {
10546 if (txc->state == TransContext::STATE_KV_QUEUED) {
10547 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
10548 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11fdf7f2 10549 ceph_assert(r == 0);
c07f9fc5
FG
10550 _txc_applied_kv(txc);
10551 --txc->osr->kv_committing_serially;
10552 txc->state = TransContext::STATE_KV_SUBMITTED;
10553 if (txc->osr->kv_submitted_waiters) {
11fdf7f2
TL
10554 std::lock_guard l(txc->osr->qlock);
10555 txc->osr->qcond.notify_all();
7c673cae 10556 }
c07f9fc5
FG
10557
10558 } else {
11fdf7f2 10559 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
c07f9fc5 10560 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
7c673cae 10561 }
7c673cae
FG
10562 if (txc->had_ios) {
10563 --txc->osr->txc_with_unstable_io;
10564 }
7c673cae
FG
10565 }
10566
31f18b77
FG
10567 // release throttle *before* we commit. this allows new ops
10568 // to be prepared and enter pipeline while we are waiting on
10569 // the kv commit sync/flush. then hopefully on the next
10570 // iteration there will already be ops awake. otherwise, we
10571 // end up going to sleep, and then wake up when the very first
10572 // transaction is ready for commit.
10573 throttle_bytes.put(costs);
10574
7c673cae
FG
10575 if (bluefs &&
10576 after_flush - bluefs_last_balance >
11fdf7f2 10577 ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
7c673cae 10578 bluefs_last_balance = after_flush;
11fdf7f2
TL
10579 int r = _balance_bluefs_freespace();
10580 ceph_assert(r >= 0);
7c673cae
FG
10581 }
10582
10583 // cleanup sync deferred keys
10584 for (auto b : deferred_stable) {
10585 for (auto& txc : b->txcs) {
10586 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 10587 ceph_assert(wt.released.empty()); // only kraken did this
7c673cae
FG
10588 string key;
10589 get_deferred_key(wt.seq, &key);
10590 synct->rm_single_key(PREFIX_DEFERRED, key);
10591 }
10592 }
10593
10594 // submit synct synchronously (block and wait for it to commit)
31f18b77 10595 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
11fdf7f2
TL
10596 ceph_assert(r == 0);
10597
10598 {
10599 std::unique_lock m(kv_finalize_lock);
10600 if (kv_committing_to_finalize.empty()) {
10601 kv_committing_to_finalize.swap(kv_committing);
10602 } else {
10603 kv_committing_to_finalize.insert(
10604 kv_committing_to_finalize.end(),
10605 kv_committing.begin(),
10606 kv_committing.end());
10607 kv_committing.clear();
10608 }
10609 if (deferred_stable_to_finalize.empty()) {
10610 deferred_stable_to_finalize.swap(deferred_stable);
10611 } else {
10612 deferred_stable_to_finalize.insert(
10613 deferred_stable_to_finalize.end(),
10614 deferred_stable.begin(),
10615 deferred_stable.end());
10616 deferred_stable.clear();
10617 }
10618 kv_finalize_cond.notify_one();
10619 }
7c673cae
FG
10620
10621 if (new_nid_max) {
10622 nid_max = new_nid_max;
10623 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
10624 }
10625 if (new_blobid_max) {
10626 blobid_max = new_blobid_max;
10627 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
10628 }
10629
224ce89b 10630 {
11fdf7f2
TL
10631 auto finish = mono_clock::now();
10632 ceph::timespan dur_flush = after_flush - start;
10633 ceph::timespan dur_kv = finish - after_flush;
10634 ceph::timespan dur = finish - start;
224ce89b
WB
10635 dout(20) << __func__ << " committed " << kv_committing.size()
10636 << " cleaned " << deferred_stable.size()
10637 << " in " << dur
10638 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
10639 << dendl;
494da23a
TL
10640 log_latency("kv_flush",
10641 l_bluestore_kv_flush_lat,
10642 dur_flush,
10643 cct->_conf->bluestore_log_op_age);
10644 log_latency("kv_commit",
10645 l_bluestore_kv_commit_lat,
10646 dur_kv,
10647 cct->_conf->bluestore_log_op_age);
10648 log_latency("kv_sync",
10649 l_bluestore_kv_sync_lat,
10650 dur,
10651 cct->_conf->bluestore_log_op_age);
7c673cae 10652 }
31f18b77
FG
10653
10654 if (bluefs) {
11fdf7f2
TL
10655 if (!bluefs_extents_reclaiming.empty()) {
10656 dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
10657 << bluefs_extents_reclaiming << std::dec << dendl;
81eedcae
TL
10658 int r = 0;
10659 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
10660 r = bdev->queue_discard(bluefs_extents_reclaiming);
10661 if (r == 0) {
10662 goto clear;
10663 }
10664 } else if (cct->_conf->bdev_enable_discard) {
10665 for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
10666 bdev->discard(p.get_start(), p.get_len());
10667 }
10668 }
10669
11fdf7f2 10670 alloc->release(bluefs_extents_reclaiming);
81eedcae 10671clear:
11fdf7f2 10672 bluefs_extents_reclaiming.clear();
31f18b77 10673 }
31f18b77
FG
10674 }
10675
10676 l.lock();
10677 // previously deferred "done" are now "stable" by virtue of this
10678 // commit cycle.
10679 deferred_stable_queue.swap(deferred_done);
10680 }
10681 }
10682 dout(10) << __func__ << " finish" << dendl;
10683 kv_sync_started = false;
10684}
10685
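// _kv_finalize_thread: second stage of the kv commit pipeline. It picks up
// the transactions that _kv_sync_thread has durably committed (plus the
// deferred batches that became stable in that cycle), advances each txc via
// _txc_state_proc, frees the stable DeferredBatches, opportunistically
// submits queued deferred writes, and reaps empty collections.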
10686void BlueStore::_kv_finalize_thread()
10687{
10688 deque<TransContext*> kv_committed;
10689 deque<DeferredBatch*> deferred_stable;
10690 dout(10) << __func__ << " start" << dendl;
11fdf7f2
TL
10691 std::unique_lock l(kv_finalize_lock);
10692 ceph_assert(!kv_finalize_started);
31f18b77
FG
10693 kv_finalize_started = true;
10694 kv_finalize_cond.notify_all();
10695 while (true) {
11fdf7f2
TL
10696 ceph_assert(kv_committed.empty());
10697 ceph_assert(deferred_stable.empty());
31f18b77
FG
10698 if (kv_committing_to_finalize.empty() &&
10699 deferred_stable_to_finalize.empty()) {
10700 if (kv_finalize_stop)
10701 break;
10702 dout(20) << __func__ << " sleep" << dendl;
10703 kv_finalize_cond.wait(l);
10704 dout(20) << __func__ << " wake" << dendl;
10705 } else {
10706 kv_committed.swap(kv_committing_to_finalize);
10707 deferred_stable.swap(deferred_stable_to_finalize);
10708 l.unlock();
10709 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
10710 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
10711
11fdf7f2
TL
10712 auto start = mono_clock::now();
10713
31f18b77
FG
10714 while (!kv_committed.empty()) {
10715 TransContext *txc = kv_committed.front();
11fdf7f2 10716 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
7c673cae 10717 _txc_state_proc(txc);
31f18b77 10718 kv_committed.pop_front();
7c673cae 10719 }
31f18b77 10720
7c673cae
FG
10721 for (auto b : deferred_stable) {
10722 auto p = b->txcs.begin();
10723 while (p != b->txcs.end()) {
10724 TransContext *txc = &*p;
10725 p = b->txcs.erase(p); // unlink here because
10726 _txc_state_proc(txc); // this may destroy txc
10727 }
10728 delete b;
10729 }
31f18b77 10730 deferred_stable.clear();
7c673cae
FG
10731
10732 if (!deferred_aggressive) {
31f18b77 10733 if (deferred_queue_size >= deferred_batch_ops.load() ||
7c673cae 10734 throttle_deferred_bytes.past_midpoint()) {
224ce89b 10735 deferred_try_submit();
7c673cae
FG
10736 }
10737 }
10738
10739 // this is as good a place as any ...
10740 _reap_collections();
10741
11fdf7f2
TL
10742 logger->set(l_bluestore_fragmentation,
10743 (uint64_t)(alloc->get_fragmentation(min_alloc_size) * 1000));
10744
494da23a
TL
10745 log_latency("kv_final",
10746 l_bluestore_kv_final_lat,
10747 mono_clock::now() - start,
10748 cct->_conf->bluestore_log_op_age);
11fdf7f2 10749
7c673cae 10750 l.lock();
7c673cae
FG
10751 }
10752 }
10753 dout(10) << __func__ << " finish" << dendl;
31f18b77 10754 kv_finalize_started = false;
7c673cae
FG
10755}
10756
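// _get_deferred_op: lazily allocate the txc's deferred transaction (if this
// is its first deferred op) and append a fresh op record for the caller to
// fill in.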
10757bluestore_deferred_op_t *BlueStore::_get_deferred_op(
10758 TransContext *txc, OnodeRef o)
10759{
10760 if (!txc->deferred_txn) {
10761 txc->deferred_txn = new bluestore_deferred_transaction_t;
10762 }
10763 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
10764 return &txc->deferred_txn->ops.back();
10765}
10766
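// _deferred_queue: attach a txc's deferred writes to its OpSequencer's
// pending DeferredBatch (creating the batch and enqueuing the osr if needed),
// record each write extent via prepare_write, and submit immediately when
// deferred_aggressive is set and no batch is currently running for this osr.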
10767void BlueStore::_deferred_queue(TransContext *txc)
10768{
10769 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
224ce89b 10770 deferred_lock.lock();
7c673cae
FG
10771 if (!txc->osr->deferred_pending &&
10772 !txc->osr->deferred_running) {
10773 deferred_queue.push_back(*txc->osr);
10774 }
10775 if (!txc->osr->deferred_pending) {
10776 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
10777 }
10778 ++deferred_queue_size;
10779 txc->osr->deferred_pending->txcs.push_back(*txc);
10780 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
10781 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
10782 const auto& op = *opi;
11fdf7f2 10783 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
7c673cae
FG
10784 bufferlist::const_iterator p = op.data.begin();
10785 for (auto e : op.extents) {
10786 txc->osr->deferred_pending->prepare_write(
10787 cct, wt.seq, e.offset, e.length, p);
10788 }
10789 }
10790 if (deferred_aggressive &&
10791 !txc->osr->deferred_running) {
224ce89b
WB
10792 _deferred_submit_unlock(txc->osr.get());
10793 } else {
10794 deferred_lock.unlock();
7c673cae
FG
10795 }
10796}
10797
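// deferred_try_submit: walk every OpSequencer on the deferred queue and kick
// off submission of its pending batch, skipping osrs that already have a
// batch of deferred aios in flight.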
224ce89b 10798void BlueStore::deferred_try_submit()
7c673cae
FG
10799{
10800 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
10801 << deferred_queue_size << " txcs" << dendl;
11fdf7f2 10802 std::lock_guard l(deferred_lock);
224ce89b
WB
10803 vector<OpSequencerRef> osrs;
10804 osrs.reserve(deferred_queue.size());
7c673cae 10805 for (auto& osr : deferred_queue) {
224ce89b
WB
10806 osrs.push_back(&osr);
10807 }
10808 for (auto& osr : osrs) {
181888fb
FG
10809 if (osr->deferred_pending) {
10810 if (!osr->deferred_running) {
10811 _deferred_submit_unlock(osr.get());
10812 deferred_lock.lock();
10813 } else {
10814 dout(20) << __func__ << " osr " << osr << " already has running"
10815 << dendl;
10816 }
10817 } else {
10818 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
7c673cae
FG
10819 }
10820 }
10821}
10822
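// _deferred_submit_unlock: called with deferred_lock held. Promote the osr's
// pending batch to running, drop the lock, merge contiguous iomap entries
// into as few aio_write calls as possible, then submit the batch's io context.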
224ce89b 10823void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
7c673cae
FG
10824{
10825 dout(10) << __func__ << " osr " << osr
10826 << " " << osr->deferred_pending->iomap.size() << " ios pending "
10827 << dendl;
11fdf7f2
TL
10828 ceph_assert(osr->deferred_pending);
10829 ceph_assert(!osr->deferred_running);
7c673cae
FG
10830
10831 auto b = osr->deferred_pending;
10832 deferred_queue_size -= b->seq_bytes.size();
11fdf7f2 10833 ceph_assert(deferred_queue_size >= 0);
7c673cae
FG
10834
10835 osr->deferred_running = osr->deferred_pending;
10836 osr->deferred_pending = nullptr;
10837
11fdf7f2
TL
10838 deferred_lock.unlock();
10839
10840 for (auto& txc : b->txcs) {
10841 txc.log_state_latency(logger, l_bluestore_state_deferred_queued_lat);
10842 }
7c673cae
FG
10843 uint64_t start = 0, pos = 0;
10844 bufferlist bl;
10845 auto i = b->iomap.begin();
10846 while (true) {
10847 if (i == b->iomap.end() || i->first != pos) {
10848 if (bl.length()) {
10849 dout(20) << __func__ << " write 0x" << std::hex
10850 << start << "~" << bl.length()
10851 << " crc " << bl.crc32c(-1) << std::dec << dendl;
11fdf7f2 10852 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
10853 logger->inc(l_bluestore_deferred_write_ops);
10854 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
10855 int r = bdev->aio_write(start, bl, &b->ioc, false);
11fdf7f2 10856 ceph_assert(r == 0);
7c673cae
FG
10857 }
10858 }
10859 if (i == b->iomap.end()) {
10860 break;
10861 }
10862 start = 0;
10863 pos = i->first;
10864 bl.clear();
10865 }
10866 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
10867 << std::hex << pos << "~" << i->second.bl.length() << std::dec
10868 << dendl;
10869 if (!bl.length()) {
10870 start = pos;
10871 }
10872 pos += i->second.bl.length();
10873 bl.claim_append(i->second.bl);
10874 ++i;
10875 }
224ce89b 10876
7c673cae
FG
10877 bdev->aio_submit(&b->ioc);
10878}
10879
3efd9988
FG
10880struct C_DeferredTrySubmit : public Context {
10881 BlueStore *store;
10882 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
10883 void finish(int r) {
10884 store->deferred_try_submit();
10885 }
10886};
10887
7c673cae
FG
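// _deferred_aio_finish: completion callback for a running DeferredBatch.
// Clear deferred_running, dequeue the osr (or requeue submission
// asynchronously under deferred_aggressive), move each txc to
// STATE_DEFERRED_CLEANUP, return the deferred throttle, and hand the batch to
// the kv sync thread via deferred_done_queue.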
10888void BlueStore::_deferred_aio_finish(OpSequencer *osr)
10889{
10890 dout(10) << __func__ << " osr " << osr << dendl;
11fdf7f2 10891 ceph_assert(osr->deferred_running);
7c673cae
FG
10892 DeferredBatch *b = osr->deferred_running;
10893
10894 {
11fdf7f2
TL
10895 std::lock_guard l(deferred_lock);
10896 ceph_assert(osr->deferred_running == b);
7c673cae
FG
10897 osr->deferred_running = nullptr;
10898 if (!osr->deferred_pending) {
181888fb 10899 dout(20) << __func__ << " dequeueing" << dendl;
7c673cae
FG
10900 auto q = deferred_queue.iterator_to(*osr);
10901 deferred_queue.erase(q);
10902 } else if (deferred_aggressive) {
224ce89b 10903 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
3efd9988 10904 deferred_finisher.queue(new C_DeferredTrySubmit(this));
181888fb
FG
10905 } else {
10906 dout(20) << __func__ << " leaving queued, more pending" << dendl;
7c673cae
FG
10907 }
10908 }
10909
10910 {
31f18b77 10911 uint64_t costs = 0;
11fdf7f2
TL
10912 {
10913 std::lock_guard l2(osr->qlock);
10914 for (auto& i : b->txcs) {
10915 TransContext *txc = &i;
10916 txc->log_state_latency(logger, l_bluestore_state_deferred_aio_wait_lat);
10917 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
10918 costs += txc->cost;
10919 }
7c673cae 10920 }
31f18b77 10921 throttle_deferred_bytes.put(costs);
11fdf7f2 10922 std::lock_guard l(kv_lock);
7c673cae
FG
10923 deferred_done_queue.emplace_back(b);
10924 }
10925
10926 // in the normal case, do not bother waking up the kv thread; it will
10927 // catch us on the next commit anyway.
10928 if (deferred_aggressive) {
11fdf7f2 10929 std::lock_guard l(kv_lock);
7c673cae
FG
10930 kv_cond.notify_one();
10931 }
10932}
10933
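// _deferred_replay: mount-time recovery. Iterate the PREFIX_DEFERRED
// keyspace, decode each logged deferred transaction, attach it to a fresh
// TransContext in STATE_KV_DONE, and rerun the state machine so the deferred
// writes are reapplied; finally drain and retire the temporary osr.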
10934int BlueStore::_deferred_replay()
10935{
10936 dout(10) << __func__ << " start" << dendl;
7c673cae
FG
10937 int count = 0;
10938 int r = 0;
11fdf7f2
TL
10939 CollectionRef ch = _get_collection(coll_t::meta());
10940 bool fake_ch = false;
10941 if (!ch) {
10942 // hmm, replaying initial mkfs?
10943 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
10944 fake_ch = true;
10945 }
10946 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
7c673cae
FG
10947 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
10948 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
10949 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
10950 << dendl;
10951 bluestore_deferred_transaction_t *deferred_txn =
10952 new bluestore_deferred_transaction_t;
10953 bufferlist bl = it->value();
11fdf7f2 10954 auto p = bl.cbegin();
7c673cae 10955 try {
11fdf7f2 10956 decode(*deferred_txn, p);
7c673cae
FG
10957 } catch (buffer::error& e) {
10958 derr << __func__ << " failed to decode deferred txn "
10959 << pretty_binary_string(it->key()) << dendl;
10960 delete deferred_txn;
10961 r = -EIO;
10962 goto out;
10963 }
11fdf7f2 10964 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
7c673cae
FG
10965 txc->deferred_txn = deferred_txn;
10966 txc->state = TransContext::STATE_KV_DONE;
10967 _txc_state_proc(txc);
10968 }
10969 out:
10970 dout(20) << __func__ << " draining osr" << dendl;
11fdf7f2 10971 _osr_register_zombie(osr);
7c673cae 10972 _osr_drain_all();
11fdf7f2
TL
10973 if (fake_ch) {
10974 new_coll_map.clear();
10975 }
7c673cae
FG
10976 dout(10) << __func__ << " completed " << count << " events" << dendl;
10977 return r;
10978}
10979
10980// ---------------------------
10981// transactions
10982
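// queue_transactions: entry point for an OSD transaction batch. Build a
// TransContext, apply every Transaction into it, encode onode/kv changes and
// any deferred-write journal entries into txc->t, take the throttles (falling
// back to an aggressive deferred flush if the deferred throttle would block),
// then start the txc state machine. on_applied callbacks are fired right away
// since BlueStore data is readable as soon as the txc is queued.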
10983int BlueStore::queue_transactions(
11fdf7f2
TL
10984 CollectionHandle& ch,
10985 vector<Transaction>& tls,
10986 TrackedOpRef op,
10987 ThreadPool::TPHandle *handle)
10988{
10989 FUNCTRACE(cct);
10990 list<Context *> on_applied, on_commit, on_applied_sync;
7c673cae 10991 ObjectStore::Transaction::collect_contexts(
11fdf7f2 10992 tls, &on_applied, &on_commit, &on_applied_sync);
7c673cae
FG
10993
10994 if (cct->_conf->objectstore_blackhole) {
10995 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
10996 << dendl;
11fdf7f2
TL
10997 for (auto& l : { on_applied, on_commit, on_applied_sync }) {
10998 for (auto c : l) {
10999 delete c;
11000 }
11001 }
7c673cae
FG
11002 return 0;
11003 }
11fdf7f2
TL
11004 auto start = mono_clock::now();
11005
11006 Collection *c = static_cast<Collection*>(ch.get());
11007 OpSequencer *osr = c->osr.get();
11008 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
7c673cae
FG
11009
11010 // prepare
11fdf7f2
TL
11011 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
11012 &on_commit);
7c673cae
FG
11013
11014 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
7c673cae
FG
11015 txc->bytes += (*p).get_num_bytes();
11016 _txc_add_transaction(txc, &(*p));
11017 }
11018 _txc_calc_cost(txc);
11019
11020 _txc_write_nodes(txc, txc->t);
11021
11022 // journal deferred items
11023 if (txc->deferred_txn) {
11024 txc->deferred_txn->seq = ++deferred_seq;
11025 bufferlist bl;
11fdf7f2 11026 encode(*txc->deferred_txn, bl);
7c673cae
FG
11027 string key;
11028 get_deferred_key(txc->deferred_txn->seq, &key);
11029 txc->t->set(PREFIX_DEFERRED, key, bl);
11030 }
11031
11032 _txc_finalize_kv(txc, txc->t);
11033 if (handle)
11034 handle->suspend_tp_timeout();
11035
11fdf7f2 11036 auto tstart = mono_clock::now();
7c673cae
FG
11037 throttle_bytes.get(txc->cost);
11038 if (txc->deferred_txn) {
11039 // ensure we do not block here because of deferred writes
11040 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
d2e6a577
FG
11041 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
11042 << dendl;
11043 ++deferred_aggressive;
7c673cae 11044 deferred_try_submit();
3efd9988
FG
11045 {
11046 // wake up any previously finished deferred events
11fdf7f2 11047 std::lock_guard l(kv_lock);
3efd9988
FG
11048 kv_cond.notify_one();
11049 }
7c673cae 11050 throttle_deferred_bytes.get(txc->cost);
d2e6a577
FG
11051 --deferred_aggressive;
11052 }
7c673cae 11053 }
11fdf7f2 11054 auto tend = mono_clock::now();
7c673cae
FG
11055
11056 if (handle)
11057 handle->reset_tp_timeout();
11058
11059 logger->inc(l_bluestore_txc);
11060
11061 // execute (start)
11062 _txc_state_proc(txc);
11063
11fdf7f2
TL
11064 // we're immediately readable (unlike FileStore)
11065 for (auto c : on_applied_sync) {
11066 c->complete(0);
11067 }
11068 if (!on_applied.empty()) {
11069 if (c->commit_queue) {
11070 c->commit_queue->queue(on_applied);
11071 } else {
11072 finisher.queue(on_applied);
11073 }
11074 }
11075
494da23a
TL
11076 log_latency("submit_transact",
11077 l_bluestore_submit_lat,
11078 mono_clock::now() - start,
11079 cct->_conf->bluestore_log_op_age);
11080 log_latency("throttle_transact",
11081 l_bluestore_throttle_lat,
11082 tend - tstart,
11083 cct->_conf->bluestore_log_op_age);
7c673cae
FG
11084 return 0;
11085}
11086
11087void BlueStore::_txc_aio_submit(TransContext *txc)
11088{
11089 dout(10) << __func__ << " txc " << txc << dendl;
11090 bdev->aio_submit(&txc->ioc);
11091}
11092
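// _txc_add_transaction: decode the ops of a single Transaction, resolve the
// referenced collections and onodes, and dispatch each op to its handler
// (_write, _zero, _truncate, omap/attr ops, clone, rename, ...). Unexpected
// errors abort; only specific ENOENT/ENODATA cases are tolerated.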
11093void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
11094{
11095 Transaction::iterator i = t->begin();
11096
81eedcae 11097 _dump_transaction<30>(cct, t);
7c673cae
FG
11098
11099 vector<CollectionRef> cvec(i.colls.size());
11100 unsigned j = 0;
11101 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
11102 ++p, ++j) {
11103 cvec[j] = _get_collection(*p);
7c673cae 11104 }
11fdf7f2 11105
7c673cae
FG
11106 vector<OnodeRef> ovec(i.objects.size());
11107
11108 for (int pos = 0; i.have_op(); ++pos) {
11109 Transaction::Op *op = i.decode_op();
11110 int r = 0;
11111
11112 // no coll or obj
11113 if (op->op == Transaction::OP_NOP)
11114 continue;
11115
11fdf7f2 11116
7c673cae
FG
11117 // collection operations
11118 CollectionRef &c = cvec[op->cid];
11fdf7f2
TL
11119
11120 // initialize osd_pool_id and do a smoke test that all collections belong
11121 // to the same pool
11122 spg_t pgid;
11123 if (!!c ? c->cid.is_pg(&pgid) : false) {
11124 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
11125 txc->osd_pool_id == pgid.pool());
11126 txc->osd_pool_id = pgid.pool();
11127 }
11128
7c673cae
FG
11129 switch (op->op) {
11130 case Transaction::OP_RMCOLL:
11131 {
11132 const coll_t &cid = i.get_cid(op->cid);
11133 r = _remove_collection(txc, cid, &c);
11134 if (!r)
11135 continue;
11136 }
11137 break;
11138
11139 case Transaction::OP_MKCOLL:
11140 {
11fdf7f2 11141 ceph_assert(!c);
7c673cae
FG
11142 const coll_t &cid = i.get_cid(op->cid);
11143 r = _create_collection(txc, cid, op->split_bits, &c);
11144 if (!r)
11145 continue;
11146 }
11147 break;
11148
11149 case Transaction::OP_SPLIT_COLLECTION:
11fdf7f2 11150 ceph_abort_msg("deprecated");
7c673cae
FG
11151 break;
11152
11153 case Transaction::OP_SPLIT_COLLECTION2:
11154 {
11155 uint32_t bits = op->split_bits;
11156 uint32_t rem = op->split_rem;
11157 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
11158 if (!r)
11159 continue;
11160 }
11161 break;
11162
11fdf7f2
TL
11163 case Transaction::OP_MERGE_COLLECTION:
11164 {
11165 uint32_t bits = op->split_bits;
11166 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
11167 if (!r)
11168 continue;
11169 }
11170 break;
11171
7c673cae
FG
11172 case Transaction::OP_COLL_HINT:
11173 {
11174 uint32_t type = op->hint_type;
11175 bufferlist hint;
11176 i.decode_bl(hint);
11fdf7f2 11177 auto hiter = hint.cbegin();
7c673cae
FG
11178 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
11179 uint32_t pg_num;
11180 uint64_t num_objs;
11fdf7f2
TL
11181 decode(pg_num, hiter);
11182 decode(num_objs, hiter);
7c673cae
FG
11183 dout(10) << __func__ << " collection hint objects is a no-op, "
11184 << " pg_num " << pg_num << " num_objects " << num_objs
11185 << dendl;
11186 } else {
11187 // Ignore the hint
11188 dout(10) << __func__ << " unknown collection hint " << type << dendl;
11189 }
11190 continue;
11191 }
11192 break;
11193
11194 case Transaction::OP_COLL_SETATTR:
11195 r = -EOPNOTSUPP;
11196 break;
11197
11198 case Transaction::OP_COLL_RMATTR:
11199 r = -EOPNOTSUPP;
11200 break;
11201
11202 case Transaction::OP_COLL_RENAME:
11fdf7f2 11203 ceph_abort_msg("not implemented");
7c673cae
FG
11204 break;
11205 }
11206 if (r < 0) {
11207 derr << __func__ << " error " << cpp_strerror(r)
11208 << " not handled on operation " << op->op
11209 << " (op " << pos << ", counting from 0)" << dendl;
81eedcae 11210 _dump_transaction<0>(cct, t);
11fdf7f2 11211 ceph_abort_msg("unexpected error");
7c673cae
FG
11212 }
11213
 11214 // these operations implicitly create the object
11215 bool create = false;
11216 if (op->op == Transaction::OP_TOUCH ||
11217 op->op == Transaction::OP_WRITE ||
11218 op->op == Transaction::OP_ZERO) {
11219 create = true;
11220 }
11221
11222 // object operations
11223 RWLock::WLocker l(c->lock);
11224 OnodeRef &o = ovec[op->oid];
11225 if (!o) {
11226 ghobject_t oid = i.get_oid(op->oid);
11227 o = c->get_onode(oid, create);
11228 }
11229 if (!create && (!o || !o->exists)) {
11230 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
11231 << i.get_oid(op->oid) << dendl;
11232 r = -ENOENT;
11233 goto endop;
11234 }
11235
11236 switch (op->op) {
11237 case Transaction::OP_TOUCH:
11238 r = _touch(txc, c, o);
11239 break;
11240
11241 case Transaction::OP_WRITE:
11242 {
11243 uint64_t off = op->off;
11244 uint64_t len = op->len;
11245 uint32_t fadvise_flags = i.get_fadvise_flags();
11246 bufferlist bl;
11247 i.decode_bl(bl);
11248 r = _write(txc, c, o, off, len, bl, fadvise_flags);
11249 }
11250 break;
11251
11252 case Transaction::OP_ZERO:
11253 {
11254 uint64_t off = op->off;
11255 uint64_t len = op->len;
11256 r = _zero(txc, c, o, off, len);
11257 }
11258 break;
11259
11260 case Transaction::OP_TRIMCACHE:
11261 {
11262 // deprecated, no-op
11263 }
11264 break;
11265
11266 case Transaction::OP_TRUNCATE:
11267 {
11268 uint64_t off = op->off;
35e4c445 11269 r = _truncate(txc, c, o, off);
7c673cae
FG
11270 }
11271 break;
11272
11273 case Transaction::OP_REMOVE:
11274 {
11275 r = _remove(txc, c, o);
11276 }
11277 break;
11278
11279 case Transaction::OP_SETATTR:
11280 {
11281 string name = i.decode_string();
11282 bufferptr bp;
11283 i.decode_bp(bp);
11284 r = _setattr(txc, c, o, name, bp);
11285 }
11286 break;
11287
11288 case Transaction::OP_SETATTRS:
11289 {
11290 map<string, bufferptr> aset;
11291 i.decode_attrset(aset);
11292 r = _setattrs(txc, c, o, aset);
11293 }
11294 break;
11295
11296 case Transaction::OP_RMATTR:
11297 {
11298 string name = i.decode_string();
11299 r = _rmattr(txc, c, o, name);
11300 }
11301 break;
11302
11303 case Transaction::OP_RMATTRS:
11304 {
11305 r = _rmattrs(txc, c, o);
11306 }
11307 break;
11308
11309 case Transaction::OP_CLONE:
11310 {
11311 OnodeRef& no = ovec[op->dest_oid];
11312 if (!no) {
11313 const ghobject_t& noid = i.get_oid(op->dest_oid);
11314 no = c->get_onode(noid, true);
11315 }
11316 r = _clone(txc, c, o, no);
11317 }
11318 break;
11319
11320 case Transaction::OP_CLONERANGE:
11fdf7f2 11321 ceph_abort_msg("deprecated");
7c673cae
FG
11322 break;
11323
11324 case Transaction::OP_CLONERANGE2:
11325 {
11326 OnodeRef& no = ovec[op->dest_oid];
11327 if (!no) {
11328 const ghobject_t& noid = i.get_oid(op->dest_oid);
11329 no = c->get_onode(noid, true);
11330 }
11331 uint64_t srcoff = op->off;
11332 uint64_t len = op->len;
11333 uint64_t dstoff = op->dest_off;
11334 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
11335 }
11336 break;
11337
11338 case Transaction::OP_COLL_ADD:
11fdf7f2 11339 ceph_abort_msg("not implemented");
7c673cae
FG
11340 break;
11341
11342 case Transaction::OP_COLL_REMOVE:
11fdf7f2 11343 ceph_abort_msg("not implemented");
7c673cae
FG
11344 break;
11345
11346 case Transaction::OP_COLL_MOVE:
11fdf7f2 11347 ceph_abort_msg("deprecated");
7c673cae
FG
11348 break;
11349
11350 case Transaction::OP_COLL_MOVE_RENAME:
11351 case Transaction::OP_TRY_RENAME:
11352 {
11fdf7f2 11353 ceph_assert(op->cid == op->dest_cid);
7c673cae
FG
11354 const ghobject_t& noid = i.get_oid(op->dest_oid);
11355 OnodeRef& no = ovec[op->dest_oid];
11356 if (!no) {
11357 no = c->get_onode(noid, false);
11358 }
11359 r = _rename(txc, c, o, no, noid);
11360 }
11361 break;
11362
11363 case Transaction::OP_OMAP_CLEAR:
11364 {
11365 r = _omap_clear(txc, c, o);
11366 }
11367 break;
11368 case Transaction::OP_OMAP_SETKEYS:
11369 {
11370 bufferlist aset_bl;
11371 i.decode_attrset_bl(&aset_bl);
11372 r = _omap_setkeys(txc, c, o, aset_bl);
11373 }
11374 break;
11375 case Transaction::OP_OMAP_RMKEYS:
11376 {
11377 bufferlist keys_bl;
11378 i.decode_keyset_bl(&keys_bl);
11379 r = _omap_rmkeys(txc, c, o, keys_bl);
11380 }
11381 break;
11382 case Transaction::OP_OMAP_RMKEYRANGE:
11383 {
11384 string first, last;
11385 first = i.decode_string();
11386 last = i.decode_string();
11387 r = _omap_rmkey_range(txc, c, o, first, last);
11388 }
11389 break;
11390 case Transaction::OP_OMAP_SETHEADER:
11391 {
11392 bufferlist bl;
11393 i.decode_bl(bl);
11394 r = _omap_setheader(txc, c, o, bl);
11395 }
11396 break;
11397
11398 case Transaction::OP_SETALLOCHINT:
11399 {
11400 r = _set_alloc_hint(txc, c, o,
11401 op->expected_object_size,
11402 op->expected_write_size,
11403 op->alloc_hint_flags);
11404 }
11405 break;
11406
11407 default:
11fdf7f2 11408 derr << __func__ << " bad op " << op->op << dendl;
7c673cae
FG
11409 ceph_abort();
11410 }
11411
11412 endop:
11413 if (r < 0) {
11414 bool ok = false;
11415
11416 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
11417 op->op == Transaction::OP_CLONE ||
11418 op->op == Transaction::OP_CLONERANGE2 ||
11419 op->op == Transaction::OP_COLL_ADD ||
11420 op->op == Transaction::OP_SETATTR ||
11421 op->op == Transaction::OP_SETATTRS ||
11422 op->op == Transaction::OP_RMATTR ||
11423 op->op == Transaction::OP_OMAP_SETKEYS ||
11424 op->op == Transaction::OP_OMAP_RMKEYS ||
11425 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
11426 op->op == Transaction::OP_OMAP_SETHEADER))
11427 // -ENOENT is usually okay
11428 ok = true;
11429 if (r == -ENODATA)
11430 ok = true;
11431
11432 if (!ok) {
11433 const char *msg = "unexpected error code";
11434
11435 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
11436 op->op == Transaction::OP_CLONE ||
11437 op->op == Transaction::OP_CLONERANGE2))
11438 msg = "ENOENT on clone suggests osd bug";
11439
11440 if (r == -ENOSPC)
11441 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
11442 // by partially applying transactions.
11443 msg = "ENOSPC from bluestore, misconfigured cluster";
11444
11445 if (r == -ENOTEMPTY) {
11446 msg = "ENOTEMPTY suggests garbage data in osd data dir";
11447 }
11448
11449 derr << __func__ << " error " << cpp_strerror(r)
11450 << " not handled on operation " << op->op
11451 << " (op " << pos << ", counting from 0)"
11452 << dendl;
11453 derr << msg << dendl;
81eedcae 11454 _dump_transaction<0>(cct, t);
11fdf7f2 11455 ceph_abort_msg("unexpected error");
7c673cae
FG
11456 }
11457 }
11458 }
11459}
11460
11461
11462
11463// -----------------
11464// write operations
11465
11466int BlueStore::_touch(TransContext *txc,
11467 CollectionRef& c,
11468 OnodeRef &o)
11469{
11470 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11471 int r = 0;
7c673cae
FG
11472 _assign_nid(txc, o);
11473 txc->write_onode(o);
11474 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11475 return r;
11476}
11477
7c673cae
FG
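// _pad_zeros: zero-fill the front and/or back of *bl so that it starts and
// ends on chunk_size boundaries, lowering *offset to the aligned start and
// accounting the padding in l_bluestore_write_pad_bytes.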
11478void BlueStore::_pad_zeros(
11479 bufferlist *bl, uint64_t *offset,
11480 uint64_t chunk_size)
11481{
11482 auto length = bl->length();
11483 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
11484 << " chunk_size 0x" << chunk_size << std::dec << dendl;
11485 dout(40) << "before:\n";
11486 bl->hexdump(*_dout);
11487 *_dout << dendl;
11488 // front
11489 size_t front_pad = *offset % chunk_size;
11490 size_t back_pad = 0;
11491 size_t pad_count = 0;
11492 if (front_pad) {
11fdf7f2
TL
11493 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
11494 bufferptr z = buffer::create_small_page_aligned(chunk_size);
224ce89b 11495 z.zero(0, front_pad, false);
7c673cae 11496 pad_count += front_pad;
224ce89b 11497 bl->copy(0, front_copy, z.c_str() + front_pad);
7c673cae
FG
11498 if (front_copy + front_pad < chunk_size) {
11499 back_pad = chunk_size - (length + front_pad);
224ce89b 11500 z.zero(front_pad + length, back_pad, false);
7c673cae
FG
11501 pad_count += back_pad;
11502 }
11503 bufferlist old, t;
11504 old.swap(*bl);
11505 t.substr_of(old, front_copy, length - front_copy);
11506 bl->append(z);
11507 bl->claim_append(t);
11508 *offset -= front_pad;
224ce89b 11509 length += pad_count;
7c673cae
FG
11510 }
11511
11512 // back
11513 uint64_t end = *offset + length;
11514 unsigned back_copy = end % chunk_size;
11515 if (back_copy) {
11fdf7f2 11516 ceph_assert(back_pad == 0);
7c673cae 11517 back_pad = chunk_size - back_copy;
11fdf7f2 11518 ceph_assert(back_copy <= length);
7c673cae 11519 bufferptr tail(chunk_size);
224ce89b
WB
11520 bl->copy(length - back_copy, back_copy, tail.c_str());
11521 tail.zero(back_copy, back_pad, false);
7c673cae
FG
11522 bufferlist old;
11523 old.swap(*bl);
11524 bl->substr_of(old, 0, length - back_copy);
11525 bl->append(tail);
11526 length += back_pad;
11527 pad_count += back_pad;
11528 }
11529 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
11530 << back_pad << " on front/back, now 0x" << *offset << "~"
11531 << length << std::dec << dendl;
11532 dout(40) << "after:\n";
11533 bl->hexdump(*_dout);
11534 *_dout << dendl;
11535 if (pad_count)
11536 logger->inc(l_bluestore_write_pad_bytes, pad_count);
11fdf7f2 11537 ceph_assert(bl->length() == length);
7c673cae
FG
11538}
11539
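// _do_write_small: handle a write shorter than min_alloc_size. Scan nearby
// extents (one max blob size in either direction) for a mutable blob and try,
// in order: a direct write into unused-but-allocated blob space, a
// chunk-aligned deferred overwrite (reading head/tail to fill out the chunk),
// or reusing the blob for fresh allocation; otherwise fall back to a new blob.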
11540void BlueStore::_do_write_small(
11541 TransContext *txc,
11542 CollectionRef &c,
11543 OnodeRef o,
11544 uint64_t offset, uint64_t length,
11545 bufferlist::iterator& blp,
11546 WriteContext *wctx)
11547{
11548 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
11549 << std::dec << dendl;
11fdf7f2 11550 ceph_assert(length < min_alloc_size);
7c673cae
FG
11551 uint64_t end_offs = offset + length;
11552
11553 logger->inc(l_bluestore_write_small);
11554 logger->inc(l_bluestore_write_small_bytes, length);
11555
11556 bufferlist bl;
11557 blp.copy(length, bl);
11558
81eedcae
TL
11559 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
11560 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
11561 uint32_t alloc_len = min_alloc_size;
11562 auto offset0 = p2align<uint64_t>(offset, alloc_len);
11563
11564 bool any_change;
11565
 11566 // search for a suitable extent in both the forward and reverse directions
 11567 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
 11568 // range, then check whether the blob can be reused via can_reuse_blob() or a
 11569 // direct/deferred write can be applied (the latter only for extents that
 11570 // include or lie above 'offset').
11571 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
11572
7c673cae
FG
11573 // Look for an existing mutable blob we can use.
11574 auto begin = o->extent_map.extent_map.begin();
11575 auto end = o->extent_map.extent_map.end();
11576 auto ep = o->extent_map.seek_lextent(offset);
11577 if (ep != begin) {
11578 --ep;
11579 if (ep->blob_end() <= offset) {
11580 ++ep;
11581 }
11582 }
11583 auto prev_ep = ep;
11584 if (prev_ep != begin) {
11585 --prev_ep;
11586 } else {
11587 prev_ep = end; // to avoid this extent check as it's a duplicate
11588 }
11589
7c673cae
FG
11590 do {
11591 any_change = false;
11592
11593 if (ep != end && ep->logical_offset < offset + max_bsize) {
11594 BlobRef b = ep->blob;
11595 auto bstart = ep->blob_start();
11596 dout(20) << __func__ << " considering " << *b
11597 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
11598 if (bstart >= end_offs) {
11599 dout(20) << __func__ << " ignoring distant " << *b << dendl;
11600 } else if (!b->get_blob().is_mutable()) {
11601 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
11602 } else if (ep->logical_offset % min_alloc_size !=
11603 ep->blob_offset % min_alloc_size) {
11604 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
11605 } else {
11606 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
11607 // can we pad our head/tail out with zeros?
11608 uint64_t head_pad, tail_pad;
11fdf7f2
TL
11609 head_pad = p2phase(offset, chunk_size);
11610 tail_pad = p2nphase(end_offs, chunk_size);
7c673cae
FG
11611 if (head_pad || tail_pad) {
11612 o->extent_map.fault_range(db, offset - head_pad,
11613 end_offs - offset + head_pad + tail_pad);
11614 }
11615 if (head_pad &&
11616 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
11617 head_pad = 0;
11618 }
11619 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
11620 tail_pad = 0;
11621 }
11622
11623 uint64_t b_off = offset - head_pad - bstart;
11624 uint64_t b_len = length + head_pad + tail_pad;
11625
11626 // direct write into unused blocks of an existing mutable blob?
11627 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
11628 b->get_blob().get_ondisk_length() >= b_off + b_len &&
11629 b->get_blob().is_unused(b_off, b_len) &&
11630 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 11631 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
11632
11633 dout(20) << __func__ << " write to unused 0x" << std::hex
11634 << b_off << "~" << b_len
11635 << " pad 0x" << head_pad << " + 0x" << tail_pad
11636 << std::dec << " of mutable " << *b << dendl;
224ce89b 11637 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
11638 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
11639
11fdf7f2 11640 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
11641 if (b_len <= prefer_deferred_size) {
11642 dout(20) << __func__ << " deferring small 0x" << std::hex
11643 << b_len << std::dec << " unused write via deferred" << dendl;
11644 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
11645 op->op = bluestore_deferred_op_t::OP_WRITE;
11646 b->get_blob().map(
11647 b_off, b_len,
11648 [&](uint64_t offset, uint64_t length) {
11649 op->extents.emplace_back(bluestore_pextent_t(offset, length));
11650 return 0;
11651 });
224ce89b 11652 op->data = bl;
7c673cae
FG
11653 } else {
11654 b->get_blob().map_bl(
224ce89b 11655 b_off, bl,
7c673cae
FG
11656 [&](uint64_t offset, bufferlist& t) {
11657 bdev->aio_write(offset, t,
11658 &txc->ioc, wctx->buffered);
11659 });
11660 }
11661 }
224ce89b 11662 b->dirty_blob().calc_csum(b_off, bl);
7c673cae
FG
11663 dout(20) << __func__ << " lex old " << *ep << dendl;
11664 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
11665 b,
11666 &wctx->old_extents);
11667 b->dirty_blob().mark_used(le->blob_offset, le->length);
11668 txc->statfs_delta.stored() += le->length;
11669 dout(20) << __func__ << " lex " << *le << dendl;
11670 logger->inc(l_bluestore_write_small_unused);
11671 return;
11672 }
11673 // read some data to fill out the chunk?
11fdf7f2
TL
11674 uint64_t head_read = p2phase(b_off, chunk_size);
11675 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
7c673cae
FG
11676 if ((head_read || tail_read) &&
11677 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
11678 head_read + tail_read < min_alloc_size) {
11679 b_off -= head_read;
11680 b_len += head_read + tail_read;
11681
11682 } else {
11683 head_read = tail_read = 0;
11684 }
11685
11686 // chunk-aligned deferred overwrite?
11687 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
11688 b_off % chunk_size == 0 &&
11689 b_len % chunk_size == 0 &&
11690 b->get_blob().is_allocated(b_off, b_len)) {
11691
224ce89b 11692 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
11693
11694 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
11695 << " and tail 0x" << tail_read << std::dec << dendl;
11696 if (head_read) {
11697 bufferlist head_bl;
11698 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
11699 head_bl, 0);
11fdf7f2 11700 ceph_assert(r >= 0 && r <= (int)head_read);
7c673cae
FG
11701 size_t zlen = head_read - r;
11702 if (zlen) {
11703 head_bl.append_zero(zlen);
11704 logger->inc(l_bluestore_write_pad_bytes, zlen);
11705 }
11fdf7f2
TL
11706 head_bl.claim_append(bl);
11707 bl.swap(head_bl);
7c673cae
FG
11708 logger->inc(l_bluestore_write_penalty_read_ops);
11709 }
11710 if (tail_read) {
11711 bufferlist tail_bl;
11712 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
11713 tail_bl, 0);
11fdf7f2 11714 ceph_assert(r >= 0 && r <= (int)tail_read);
7c673cae
FG
11715 size_t zlen = tail_read - r;
11716 if (zlen) {
11717 tail_bl.append_zero(zlen);
11718 logger->inc(l_bluestore_write_pad_bytes, zlen);
11719 }
224ce89b 11720 bl.claim_append(tail_bl);
7c673cae
FG
11721 logger->inc(l_bluestore_write_penalty_read_ops);
11722 }
11723 logger->inc(l_bluestore_write_small_pre_read);
11724
224ce89b 11725 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
11726 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
11727
7c673cae 11728 if (b->get_blob().csum_type) {
224ce89b 11729 b->dirty_blob().calc_csum(b_off, bl);
7c673cae 11730 }
11fdf7f2
TL
11731
11732 if (!g_conf()->bluestore_debug_omit_block_device_write) {
11733 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
11734 op->op = bluestore_deferred_op_t::OP_WRITE;
11735 int r = b->get_blob().map(
11736 b_off, b_len,
11737 [&](uint64_t offset, uint64_t length) {
11738 op->extents.emplace_back(bluestore_pextent_t(offset, length));
11739 return 0;
11740 });
11741 ceph_assert(r == 0);
11742 op->data.claim(bl);
11743 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
11744 << b_len << std::dec << " of mutable " << *b
11745 << " at " << op->extents << dendl;
11746 }
11747
7c673cae
FG
11748 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
11749 b, &wctx->old_extents);
11750 b->dirty_blob().mark_used(le->blob_offset, le->length);
11751 txc->statfs_delta.stored() += le->length;
11752 dout(20) << __func__ << " lex " << *le << dendl;
11753 logger->inc(l_bluestore_write_small_deferred);
11754 return;
11755 }
224ce89b
WB
11756 // try to reuse blob if we can
11757 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
11758 max_bsize,
11759 offset0 - bstart,
11760 &alloc_len)) {
11fdf7f2 11761 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
11762 // fit into reused blob
11763 // Need to check for pending writes desiring to
11764 // reuse the same pextent. The rationale is that during GC two chunks
 11765 // from garbage blobs (compressed?) can share logical space within the same
 11766 // AU. That in turn might be caused by an unaligned len in clone_range2.
 11767 // Hence the second write would fail when attempting to reuse the blob in
 11768 // do_alloc_write().
11769 if (!wctx->has_conflict(b,
11770 offset0,
11771 offset0 + alloc_len,
11772 min_alloc_size)) {
11773
11774 // we can't reuse pad_head/pad_tail since they might be truncated
 11775 // due to existing extents
11776 uint64_t b_off = offset - bstart;
11777 uint64_t b_off0 = b_off;
11778 _pad_zeros(&bl, &b_off0, chunk_size);
11779
11780 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
11781 << " (0x" << b_off0 << "~" << bl.length() << ")"
11782 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
11783 << std::dec << dendl;
11784
11785 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11786 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
11787 false, false);
11788 logger->inc(l_bluestore_write_small_unused);
11789 return;
11790 }
11791 }
11792 }
11793 ++ep;
11794 any_change = true;
11795 } // if (ep != end && ep->logical_offset < offset + max_bsize)
11796
11797 // check extent for reuse in reverse order
11798 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
11799 BlobRef b = prev_ep->blob;
11800 auto bstart = prev_ep->blob_start();
11801 dout(20) << __func__ << " considering " << *b
11802 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 11803 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
11804 max_bsize,
11805 offset0 - bstart,
11806 &alloc_len)) {
11fdf7f2 11807 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
11808 // fit into reused blob
11809 // Need to check for pending writes desiring to
11810 // reuse the same pextent. The rationale is that during GC two chunks
11811 // from garbage blobs(compressed?) can share logical space within the same
11812 // AU. That's in turn might be caused by unaligned len in clone_range2.
11813 // Hence the second write will fail in an attempt to reuse blob at
11814 // do_alloc_write().
11815 if (!wctx->has_conflict(b,
11816 offset0,
11817 offset0 + alloc_len,
11818 min_alloc_size)) {
11819
11820 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
11821 uint64_t b_off = offset - bstart;
11822 uint64_t b_off0 = b_off;
11823 _pad_zeros(&bl, &b_off0, chunk_size);
11824
11825 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
11826 << " (0x" << b_off0 << "~" << bl.length() << ")"
11827 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
11828 << std::dec << dendl;
11829
11830 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11831 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
11832 false, false);
11833 logger->inc(l_bluestore_write_small_unused);
11834 return;
11835 }
11836 }
11837 if (prev_ep != begin) {
11838 --prev_ep;
11839 any_change = true;
11840 } else {
11841 prev_ep = end; // to avoid useless first extent re-check
11842 }
11843 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
11844 } while (any_change);
11845
11846 // new blob.
7c673cae 11847 BlobRef b = c->new_blob();
11fdf7f2 11848 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
7c673cae
FG
11849 uint64_t b_off0 = b_off;
11850 _pad_zeros(&bl, &b_off0, block_size);
11851 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11852 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
7c673cae
FG
11853
11854 return;
11855}
11856
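// _do_write_big: handle a write of at least min_alloc_size. Punch out the old
// extents, then carve the data into chunks of at most the target blob size;
// when not compressing, try to reuse adjacent mutable blobs via
// can_reuse_blob, otherwise allocate a new blob per chunk.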
11857void BlueStore::_do_write_big(
11858 TransContext *txc,
11859 CollectionRef &c,
11860 OnodeRef o,
11861 uint64_t offset, uint64_t length,
11862 bufferlist::iterator& blp,
11863 WriteContext *wctx)
11864{
11865 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
11866 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
11867 << " compress " << (int)wctx->compress
11868 << dendl;
11869 logger->inc(l_bluestore_write_big);
11870 logger->inc(l_bluestore_write_big_bytes, length);
11871 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11fdf7f2 11872 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae
FG
11873 while (length > 0) {
11874 bool new_blob = false;
11fdf7f2 11875 uint32_t l = std::min(max_bsize, length);
7c673cae
FG
11876 BlobRef b;
11877 uint32_t b_off = 0;
11878
 11879 // attempt to reuse an existing blob
11880 if (!wctx->compress) {
11881 // look for an existing mutable blob we can reuse
11882 auto begin = o->extent_map.extent_map.begin();
11883 auto end = o->extent_map.extent_map.end();
11884 auto ep = o->extent_map.seek_lextent(offset);
11885 auto prev_ep = ep;
11886 if (prev_ep != begin) {
11887 --prev_ep;
11888 } else {
11889 prev_ep = end; // to avoid this extent check as it's a duplicate
11890 }
11891 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
 11892 // search for a suitable extent in both the forward and reverse directions
 11893 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
 11894 // range, then check whether the blob can be reused via can_reuse_blob().
7c673cae
FG
11895 bool any_change;
11896 do {
11897 any_change = false;
11898 if (ep != end && ep->logical_offset < offset + max_bsize) {
11899 if (offset >= ep->blob_start() &&
224ce89b 11900 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
11901 offset - ep->blob_start(),
11902 &l)) {
11903 b = ep->blob;
11904 b_off = offset - ep->blob_start();
11905 prev_ep = end; // to avoid check below
11906 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 11907 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
11908 } else {
11909 ++ep;
11910 any_change = true;
11911 }
11912 }
11913
11914 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
224ce89b 11915 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
11916 offset - prev_ep->blob_start(),
11917 &l)) {
11918 b = prev_ep->blob;
11919 b_off = offset - prev_ep->blob_start();
11920 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 11921 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
11922 } else if (prev_ep != begin) {
11923 --prev_ep;
11924 any_change = true;
11925 } else {
11926 prev_ep = end; // to avoid useless first extent re-check
11927 }
11928 }
11929 } while (b == nullptr && any_change);
11930 }
11931 if (b == nullptr) {
11932 b = c->new_blob();
11933 b_off = 0;
11934 new_blob = true;
11935 }
11936
11937 bufferlist t;
11938 blp.copy(l, t);
11939 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
11940 offset += l;
11941 length -= l;
11942 logger->inc(l_bluestore_write_big_blobs);
11943 }
11944}
11945
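// _do_alloc_write: finish the queued writes in wctx. Compress each blob-sized
// chunk when enabled (keeping the result only if it beats the required ratio
// after rounding up to min_alloc_size), allocate all needed space in a single
// allocator call, initialize checksums for new blobs, map the preallocation
// onto each blob, update the extent map, and issue the I/O either as deferred
// ops (small writes) or direct aio writes.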
11946int BlueStore::_do_alloc_write(
11947 TransContext *txc,
11948 CollectionRef coll,
11949 OnodeRef o,
11950 WriteContext *wctx)
11951{
11952 dout(20) << __func__ << " txc " << txc
11953 << " " << wctx->writes.size() << " blobs"
11954 << dendl;
3efd9988
FG
11955 if (wctx->writes.empty()) {
11956 return 0;
7c673cae
FG
11957 }
11958
7c673cae
FG
11959 CompressorRef c;
11960 double crr = 0;
11961 if (wctx->compress) {
11962 c = select_option(
11963 "compression_algorithm",
11964 compressor,
11965 [&]() {
11966 string val;
11967 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
11968 CompressorRef cp = compressor;
11969 if (!cp || cp->get_type_name() != val) {
11970 cp = Compressor::create(cct, val);
11fdf7f2
TL
11971 if (!cp) {
11972 if (_set_compression_alert(false, val.c_str())) {
11973 derr << __func__ << " unable to initialize " << val.c_str()
11974 << " compressor" << dendl;
11975 }
11976 }
7c673cae
FG
11977 }
11978 return boost::optional<CompressorRef>(cp);
11979 }
11980 return boost::optional<CompressorRef>();
11981 }
11982 );
11983
11984 crr = select_option(
11985 "compression_required_ratio",
11986 cct->_conf->bluestore_compression_required_ratio,
11987 [&]() {
11988 double val;
3efd9988 11989 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
7c673cae
FG
11990 return boost::optional<double>(val);
11991 }
11992 return boost::optional<double>();
11993 }
11994 );
11995 }
11996
11997 // checksum
11fdf7f2 11998 int64_t csum = csum_type.load();
7c673cae
FG
11999 csum = select_option(
12000 "csum_type",
12001 csum,
12002 [&]() {
11fdf7f2 12003 int64_t val;
3efd9988 12004 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 12005 return boost::optional<int64_t>(val);
7c673cae 12006 }
11fdf7f2 12007 return boost::optional<int64_t>();
7c673cae
FG
12008 }
12009 );
12010
3efd9988
FG
12011 // compress (as needed) and calc needed space
12012 uint64_t need = 0;
11fdf7f2 12013 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 12014 for (auto& wi : wctx->writes) {
3efd9988 12015 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 12016 auto start = mono_clock::now();
7c673cae
FG
12017
12018 // compress
11fdf7f2
TL
12019 ceph_assert(wi.b_off == 0);
12020 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 12021
7c673cae
FG
12022 // FIXME: memory alignment here is bad
12023 bufferlist t;
3efd9988 12024 int r = c->compress(wi.bl, t);
3efd9988 12025 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 12026 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
a8e16298
TL
12027 bool rejected = false;
12028 uint64_t compressed_len = t.length();
12029 // do an approximate (fast) estimation for resulting blob size
12030 // that doesn't take header overhead into account
11fdf7f2 12031 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
12032 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
12033 bluestore_compression_header_t chdr;
12034 chdr.type = c->get_type();
12035 chdr.length = t.length();
12036 encode(chdr, wi.compressed_bl);
12037 wi.compressed_bl.claim_append(t);
12038
12039 compressed_len = wi.compressed_bl.length();
11fdf7f2 12040 result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
12041 if (result_len <= want_len && result_len < wi.blob_length) {
12042 // Cool. We compressed at least as much as we were hoping to.
12043 // pad out to min_alloc_size
12044 wi.compressed_bl.append_zero(result_len - compressed_len);
12045 wi.compressed_len = compressed_len;
12046 wi.compressed = true;
12047 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
12048 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
12049 << " -> 0x" << compressed_len << " => 0x" << result_len
12050 << " with " << c->get_type()
12051 << std::dec << dendl;
12052 txc->statfs_delta.compressed() += compressed_len;
12053 txc->statfs_delta.compressed_original() += wi.blob_length;
12054 txc->statfs_delta.compressed_allocated() += result_len;
12055 logger->inc(l_bluestore_compress_success_count);
12056 need += result_len;
12057 } else {
12058 rejected = true;
12059 }
12060 } else if (r != 0) {
12061 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
12062 << " bytes compressed using " << c->get_type_name()
12063 << std::dec
12064 << " failed with errcode = " << r
12065 << ", leaving uncompressed"
12066 << dendl;
12067 logger->inc(l_bluestore_compress_rejected_count);
12068 need += wi.blob_length;
7c673cae 12069 } else {
a8e16298
TL
12070 rejected = true;
12071 }
12072
12073 if (rejected) {
3efd9988 12074 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 12075 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
3efd9988
FG
12076 << " with " << c->get_type()
12077 << ", which is more than required 0x" << want_len_raw
7c673cae 12078 << " -> 0x" << want_len
3efd9988
FG
12079 << ", leaving uncompressed"
12080 << std::dec << dendl;
12081 logger->inc(l_bluestore_compress_rejected_count);
12082 need += wi.blob_length;
7c673cae 12083 }
494da23a
TL
12084 log_latency("compress@_do_alloc_write",
12085 l_bluestore_compress_lat,
12086 mono_clock::now() - start,
12087 cct->_conf->bluestore_log_op_age );
3efd9988
FG
12088 } else {
12089 need += wi.blob_length;
7c673cae 12090 }
3efd9988 12091 }
a8e16298 12092 PExtentVector prealloc;
3efd9988 12093 prealloc.reserve(2 * wctx->writes.size());;
11fdf7f2 12094 int64_t prealloc_left = 0;
3efd9988
FG
12095 prealloc_left = alloc->allocate(
12096 need, min_alloc_size, need,
12097 0, &prealloc);
11fdf7f2
TL
12098 if (prealloc_left < (int64_t)need) {
12099 derr << __func__ << " failed to allocate 0x" << std::hex << need
12100 << " allocated 0x " << prealloc_left
12101 << " min_alloc_size 0x" << min_alloc_size
12102 << " available 0x " << alloc->get_free()
12103 << std::dec << dendl;
12104 if (prealloc.size()) {
12105 alloc->release(prealloc);
12106 }
a8e16298
TL
12107 return -ENOSPC;
12108 }
a8e16298 12109
3efd9988
FG
12110 dout(20) << __func__ << " prealloc " << prealloc << dendl;
12111 auto prealloc_pos = prealloc.begin();
12112
12113 for (auto& wi : wctx->writes) {
12114 BlobRef b = wi.b;
12115 bluestore_blob_t& dblob = b->dirty_blob();
12116 uint64_t b_off = wi.b_off;
12117 bufferlist *l = &wi.bl;
12118 uint64_t final_length = wi.blob_length;
12119 uint64_t csum_length = wi.blob_length;
3efd9988
FG
12120 if (wi.compressed) {
12121 final_length = wi.compressed_bl.length();
12122 csum_length = final_length;
3efd9988
FG
12123 l = &wi.compressed_bl;
12124 dblob.set_compressed(wi.blob_length, wi.compressed_len);
12125 } else if (wi.new_blob) {
7c673cae 12126 // initialize newly created blob only
11fdf7f2
TL
12127 ceph_assert(dblob.is_mutable());
12128 unsigned csum_order;
7c673cae
FG
12129 if (l->length() != wi.blob_length) {
12130 // hrm, maybe we could do better here, but let's not bother.
12131 dout(20) << __func__ << " forcing csum_order to block_size_order "
12132 << block_size_order << dendl;
31f18b77 12133 csum_order = block_size_order;
7c673cae
FG
12134 } else {
12135 csum_order = std::min(wctx->csum_order, ctz(l->length()));
12136 }
12137 // try to align blob with max_blob_size to improve
12138 // its reuse ratio, e.g. in case of reverse write
12139 uint32_t suggested_boff =
12140 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
12141 if ((suggested_boff % (1 << csum_order)) == 0 &&
12142 suggested_boff + final_length <= max_bsize &&
12143 suggested_boff > b_off) {
181888fb 12144 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 12145 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 12146 ceph_assert(suggested_boff >= b_off);
7c673cae
FG
12147 csum_length += suggested_boff - b_off;
12148 b_off = suggested_boff;
12149 }
181888fb
FG
12150 if (csum != Checksummer::CSUM_NONE) {
12151 dout(20) << __func__ << " initialize csum setting for new blob " << *b
12152 << " csum_type " << Checksummer::get_csum_type_string(csum)
12153 << " csum_order " << csum_order
12154 << " csum_length 0x" << std::hex << csum_length << std::dec
12155 << dendl;
12156 dblob.init_csum(csum, csum_order, csum_length);
12157 }
7c673cae
FG
12158 }
12159
a8e16298 12160 PExtentVector extents;
3efd9988
FG
12161 int64_t left = final_length;
12162 while (left > 0) {
11fdf7f2 12163 ceph_assert(prealloc_left > 0);
3efd9988
FG
12164 if (prealloc_pos->length <= left) {
12165 prealloc_left -= prealloc_pos->length;
12166 left -= prealloc_pos->length;
12167 txc->statfs_delta.allocated() += prealloc_pos->length;
12168 extents.push_back(*prealloc_pos);
12169 ++prealloc_pos;
12170 } else {
12171 extents.emplace_back(prealloc_pos->offset, left);
12172 prealloc_pos->offset += left;
12173 prealloc_pos->length -= left;
12174 prealloc_left -= left;
12175 txc->statfs_delta.allocated() += left;
12176 left = 0;
12177 break;
12178 }
12179 }
7c673cae 12180 for (auto& p : extents) {
3efd9988 12181 txc->allocated.insert(p.offset, p.length);
7c673cae 12182 }
11fdf7f2 12183 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 12184
181888fb
FG
12185 dout(20) << __func__ << " blob " << *b << dendl;
12186 if (dblob.has_csum()) {
7c673cae
FG
12187 dblob.calc_csum(b_off, *l);
12188 }
181888fb 12189
7c673cae
FG
12190 if (wi.mark_unused) {
12191 auto b_end = b_off + wi.bl.length();
12192 if (b_off) {
12193 dblob.add_unused(0, b_off);
12194 }
12195 if (b_end < wi.blob_length) {
12196 dblob.add_unused(b_end, wi.blob_length - b_end);
12197 }
12198 }
12199
12200 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
12201 b_off + (wi.b_off0 - wi.b_off),
12202 wi.length0,
12203 wi.b,
12204 nullptr);
12205 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
12206 txc->statfs_delta.stored() += le->length;
12207 dout(20) << __func__ << " lex " << *le << dendl;
12208 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
12209 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
12210
12211 // queue io
11fdf7f2 12212 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
12213 if (l->length() <= prefer_deferred_size.load()) {
12214 dout(20) << __func__ << " deferring small 0x" << std::hex
12215 << l->length() << std::dec << " write via deferred" << dendl;
12216 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
12217 op->op = bluestore_deferred_op_t::OP_WRITE;
12218 int r = b->get_blob().map(
12219 b_off, l->length(),
12220 [&](uint64_t offset, uint64_t length) {
12221 op->extents.emplace_back(bluestore_pextent_t(offset, length));
12222 return 0;
12223 });
11fdf7f2 12224 ceph_assert(r == 0);
7c673cae 12225 op->data = *l;
81eedcae 12226 logger->inc(l_bluestore_write_small_deferred);
7c673cae
FG
12227 } else {
12228 b->get_blob().map_bl(
12229 b_off, *l,
12230 [&](uint64_t offset, bufferlist& t) {
12231 bdev->aio_write(offset, t, &txc->ioc, false);
12232 });
81eedcae 12233 logger->inc(l_bluestore_write_small_new);
7c673cae
FG
12234 }
12235 }
12236 }
11fdf7f2
TL
12237 ceph_assert(prealloc_pos == prealloc.end());
12238 ceph_assert(prealloc_left == 0);
7c673cae
FG
12239 return 0;
12240}
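// For illustration of the carve-out loop above (hypothetical numbers): if
// two write items need 0x10000 and 0x8000 bytes and the allocator returns
// prealloc = [{0x100000,0x14000}, {0x200000,0x4000}], the first blob takes
// {0x100000,0x10000}, and the second takes the 0x4000 left over in the
// first extent plus all of {0x200000,0x4000}.  prealloc_left hits zero
// exactly when the last blob is carved, which is what the two asserts just
// above verify.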
12241
12242void BlueStore::_wctx_finish(
12243 TransContext *txc,
12244 CollectionRef& c,
12245 OnodeRef o,
31f18b77
FG
12246 WriteContext *wctx,
12247 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
12248{
12249 auto oep = wctx->old_extents.begin();
12250 while (oep != wctx->old_extents.end()) {
12251 auto &lo = *oep;
12252 oep = wctx->old_extents.erase(oep);
12253 dout(20) << __func__ << " lex_old " << lo.e << dendl;
12254 BlobRef b = lo.e.blob;
12255 const bluestore_blob_t& blob = b->get_blob();
12256 if (blob.is_compressed()) {
12257 if (lo.blob_empty) {
12258 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
12259 }
12260 txc->statfs_delta.compressed_original() -= lo.e.length;
12261 }
12262 auto& r = lo.r;
12263 txc->statfs_delta.stored() -= lo.e.length;
12264 if (!r.empty()) {
12265 dout(20) << __func__ << " blob release " << r << dendl;
12266 if (blob.is_shared()) {
12267 PExtentVector final;
12268 c->load_shared_blob(b->shared_blob);
11fdf7f2
TL
12269 bool unshare = false;
12270 bool* unshare_ptr =
12271 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 12272 for (auto e : r) {
31f18b77
FG
12273 b->shared_blob->put_ref(
12274 e.offset, e.length, &final,
11fdf7f2
TL
12275 unshare_ptr);
12276 }
12277 if (unshare) {
12278 ceph_assert(maybe_unshared_blobs);
12279 maybe_unshared_blobs->insert(b->shared_blob.get());
7c673cae
FG
12280 }
12281 dout(20) << __func__ << " shared_blob release " << final
12282 << " from " << *b->shared_blob << dendl;
12283 txc->write_shared_blob(b->shared_blob);
12284 r.clear();
12285 r.swap(final);
12286 }
12287 }
12288 // we can't invalidate our logical extents as we drop them because
12289 // other lextents (either in our onode or others) may still
12290 // reference them. but we can throw out anything that is no
12291 // longer allocated. Note that this will leave behind edge bits
12292 // that are no longer referenced but not deallocated (until they
12293 // age out of the cache naturally).
12294 b->discard_unallocated(c.get());
12295 for (auto e : r) {
12296 dout(20) << __func__ << " release " << e << dendl;
12297 txc->released.insert(e.offset, e.length);
12298 txc->statfs_delta.allocated() -= e.length;
12299 if (blob.is_compressed()) {
12300 txc->statfs_delta.compressed_allocated() -= e.length;
12301 }
12302 }
12303 delete &lo;
12304 if (b->is_spanning() && !b->is_referenced()) {
12305 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
12306 << dendl;
12307 o->extent_map.spanning_blob_map.erase(b->id);
12308 }
12309 }
12310}
12311
12312void BlueStore::_do_write_data(
12313 TransContext *txc,
12314 CollectionRef& c,
12315 OnodeRef o,
12316 uint64_t offset,
12317 uint64_t length,
12318 bufferlist& bl,
12319 WriteContext *wctx)
12320{
12321 uint64_t end = offset + length;
12322 bufferlist::iterator p = bl.begin();
12323
12324 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
12325 (length != min_alloc_size)) {
12326 // we fall within the same block
12327 _do_write_small(txc, c, o, offset, length, p, wctx);
12328 } else {
12329 uint64_t head_offset, head_length;
12330 uint64_t middle_offset, middle_length;
12331 uint64_t tail_offset, tail_length;
12332
12333 head_offset = offset;
11fdf7f2 12334 head_length = p2nphase(offset, min_alloc_size);
7c673cae 12335
11fdf7f2
TL
12336 tail_offset = p2align(end, min_alloc_size);
12337 tail_length = p2phase(end, min_alloc_size);
7c673cae
FG
12338
12339 middle_offset = head_offset + head_length;
12340 middle_length = length - head_length - tail_length;
12341
12342 if (head_length) {
12343 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
12344 }
12345
12346 if (middle_length) {
12347 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
12348 }
12349
12350 if (tail_length) {
12351 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
12352 }
12353 }
12354}
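// A worked example of the head/middle/tail split above (hypothetical
// numbers, assuming min_alloc_size = 0x10000): a write of 0x2a000 bytes at
// offset 0x3000 ends at 0x2d000, so head = 0x3000~0xd000 (up to the next
// min_alloc_size boundary, via _do_write_small), middle = 0x10000~0x10000
// (the fully aligned blocks, via _do_write_big) and tail = 0x20000~0xd000
// (via _do_write_small again).  p2nphase/p2align/p2phase compute exactly
// these boundaries.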
12355
31f18b77
FG
12356void BlueStore::_choose_write_options(
12357 CollectionRef& c,
12358 OnodeRef o,
12359 uint32_t fadvise_flags,
12360 WriteContext *wctx)
7c673cae 12361{
7c673cae
FG
12362 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
12363 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 12364 wctx->buffered = true;
7c673cae
FG
12365 } else if (cct->_conf->bluestore_default_buffered_write &&
12366 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
12367 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
12368 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 12369 wctx->buffered = true;
7c673cae
FG
12370 }
12371
31f18b77
FG
12372 // apply basic csum block size
12373 wctx->csum_order = block_size_order;
7c673cae
FG
12374
12375 // compression parameters
12376 unsigned alloc_hints = o->onode.alloc_hint_flags;
12377 auto cm = select_option(
12378 "compression_mode",
31f18b77 12379 comp_mode.load(),
7c673cae
FG
12380 [&]() {
12381 string val;
11fdf7f2 12382 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
12383 return boost::optional<Compressor::CompressionMode>(
12384 Compressor::get_comp_mode_type(val));
7c673cae
FG
12385 }
12386 return boost::optional<Compressor::CompressionMode>();
12387 }
12388 );
31f18b77
FG
12389
12390 wctx->compress = (cm != Compressor::COMP_NONE) &&
7c673cae
FG
12391 ((cm == Compressor::COMP_FORCE) ||
12392 (cm == Compressor::COMP_AGGRESSIVE &&
12393 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
12394 (cm == Compressor::COMP_PASSIVE &&
12395 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
12396
12397 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
12398 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
31f18b77
FG
12399 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
12400 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 12401 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 12402
7c673cae 12403 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 12404
7c673cae 12405 if (o->onode.expected_write_size) {
224ce89b 12406 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 12407 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 12408 } else {
224ce89b 12409 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
12410 }
12411
31f18b77
FG
12412 if (wctx->compress) {
12413 wctx->target_blob_size = select_option(
7c673cae 12414 "compression_max_blob_size",
31f18b77 12415 comp_max_blob_size.load(),
7c673cae 12416 [&]() {
11fdf7f2
TL
12417 int64_t val;
12418 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
7c673cae
FG
12419 return boost::optional<uint64_t>((uint64_t)val);
12420 }
12421 return boost::optional<uint64_t>();
12422 }
12423 );
12424 }
12425 } else {
31f18b77
FG
12426 if (wctx->compress) {
12427 wctx->target_blob_size = select_option(
7c673cae 12428 "compression_min_blob_size",
31f18b77 12429 comp_min_blob_size.load(),
7c673cae 12430 [&]() {
11fdf7f2
TL
12431 int64_t val;
12432 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
7c673cae
FG
12433 return boost::optional<uint64_t>((uint64_t)val);
12434 }
12435 return boost::optional<uint64_t>();
12436 }
12437 );
12438 }
12439 }
31f18b77 12440
7c673cae 12441 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
12442 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
12443 wctx->target_blob_size = max_bsize;
7c673cae 12444 }
31f18b77 12445
7c673cae
FG
12446 // set the min blob size floor at 2x the min_alloc_size, or else we
12447 // won't be able to allocate a smaller extent for the compressed
12448 // data.
31f18b77
FG
12449 if (wctx->compress &&
12450 wctx->target_blob_size < min_alloc_size * 2) {
12451 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 12452 }
31f18b77
FG
12453
12454 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
12455 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
f64942e4
AA
12456 << " compress=" << (int)wctx->compress
12457 << " buffered=" << (int)wctx->buffered
31f18b77
FG
12458 << std::dec << dendl;
12459}
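// Example of how the options above compose (hypothetical settings): with a
// global bluestore_compression_mode of "none" but a pool option
// compression_mode of "aggressive", select_option() yields COMP_AGGRESSIVE,
// so wctx->compress ends up true unless the client hinted
// CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; target_blob_size is then
// resolved the same way from compression_{min,max}_blob_size and clamped
// against max_blob_size and the 2 * min_alloc_size floor above.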
12460
12461int BlueStore::_do_gc(
12462 TransContext *txc,
12463 CollectionRef& c,
12464 OnodeRef o,
12465 const GarbageCollector& gc,
12466 const WriteContext& wctx,
12467 uint64_t *dirty_start,
12468 uint64_t *dirty_end)
12469{
12470 auto& extents_to_collect = gc.get_extents_to_collect();
12471
1adf2230 12472 bool dirty_range_updated = false;
31f18b77 12473 WriteContext wctx_gc;
7c673cae 12474 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 12475
31f18b77
FG
12476 for (auto it = extents_to_collect.begin();
12477 it != extents_to_collect.end();
12478 ++it) {
12479 bufferlist bl;
12480 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
11fdf7f2 12481 ceph_assert(r == (int)it->length);
31f18b77 12482
31f18b77
FG
12483 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
12484 logger->inc(l_bluestore_gc_merged, it->length);
12485
12486 if (*dirty_start > it->offset) {
12487 *dirty_start = it->offset;
1adf2230 12488 dirty_range_updated = true;
31f18b77
FG
12489 }
12490
12491 if (*dirty_end < it->offset + it->length) {
12492 *dirty_end = it->offset + it->length;
1adf2230 12493 dirty_range_updated = true;
31f18b77
FG
12494 }
12495 }
1adf2230
AA
12496 if (dirty_range_updated) {
12497 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
12498 }
31f18b77
FG
12499
12500 dout(30) << __func__ << " alloc write" << dendl;
12501 int r = _do_alloc_write(txc, c, o, &wctx_gc);
12502 if (r < 0) {
12503 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
12504 << dendl;
12505 return r;
12506 }
12507
12508 _wctx_finish(txc, c, o, &wctx_gc);
12509 return 0;
12510}
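// The loop above re-reads each extent flagged by the GarbageCollector and
// rewrites it through the forked WriteContext, so the data lands in fresh
// blobs and the partially dead originals can be released by _wctx_finish();
// widening dirty_start/dirty_end lets the caller (_do_write) compress and
// dirty the extent map over the full rewritten range.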
12511
12512int BlueStore::_do_write(
12513 TransContext *txc,
12514 CollectionRef& c,
12515 OnodeRef o,
12516 uint64_t offset,
12517 uint64_t length,
12518 bufferlist& bl,
12519 uint32_t fadvise_flags)
12520{
12521 int r = 0;
12522
12523 dout(20) << __func__
12524 << " " << o->oid
12525 << " 0x" << std::hex << offset << "~" << length
12526 << " - have 0x" << o->onode.size
12527 << " (" << std::dec << o->onode.size << ")"
12528 << " bytes"
12529 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
12530 << dendl;
81eedcae 12531 _dump_onode<30>(cct, *o);
31f18b77
FG
12532
12533 if (length == 0) {
12534 return 0;
12535 }
12536
12537 uint64_t end = offset + length;
12538
12539 GarbageCollector gc(c->store->cct);
12540 int64_t benefit;
12541 auto dirty_start = offset;
12542 auto dirty_end = end;
12543
12544 WriteContext wctx;
12545 _choose_write_options(c, o, fadvise_flags, &wctx);
7c673cae
FG
12546 o->extent_map.fault_range(db, offset, length);
12547 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
12548 r = _do_alloc_write(txc, c, o, &wctx);
12549 if (r < 0) {
12550 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
12551 << dendl;
12552 goto out;
12553 }
12554
31f18b77
FG
12555 // NB: _wctx_finish() will empty old_extents
12556 // so we must do gc estimation before that
7c673cae 12557 benefit = gc.estimate(offset,
31f18b77
FG
12558 length,
12559 o->extent_map,
12560 wctx.old_extents,
12561 min_alloc_size);
7c673cae
FG
12562
12563 _wctx_finish(txc, c, o, &wctx);
12564 if (end > o->onode.size) {
12565 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 12566 << std::dec << dendl;
7c673cae
FG
12567 o->onode.size = end;
12568 }
12569
11fdf7f2 12570 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
31f18b77
FG
12571 if (!gc.get_extents_to_collect().empty()) {
12572 dout(20) << __func__ << " perform garbage collection, "
12573 << "expected benefit = " << benefit << " AUs" << dendl;
12574 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
12575 if (r < 0) {
12576 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
12577 << dendl;
12578 goto out;
7c673cae 12579 }
1adf2230
AA
12580 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
12581 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae
FG
12582 }
12583 }
7c673cae 12584 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
12585 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
12586
7c673cae
FG
12587 r = 0;
12588
12589 out:
12590 return r;
12591}
12592
12593int BlueStore::_write(TransContext *txc,
12594 CollectionRef& c,
12595 OnodeRef& o,
31f18b77
FG
12596 uint64_t offset, size_t length,
12597 bufferlist& bl,
12598 uint32_t fadvise_flags)
7c673cae
FG
12599{
12600 dout(15) << __func__ << " " << c->cid << " " << o->oid
12601 << " 0x" << std::hex << offset << "~" << length << std::dec
12602 << dendl;
35e4c445
FG
12603 int r = 0;
12604 if (offset + length >= OBJECT_MAX_SIZE) {
12605 r = -E2BIG;
12606 } else {
12607 _assign_nid(txc, o);
12608 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
12609 txc->write_onode(o);
12610 }
7c673cae
FG
12611 dout(10) << __func__ << " " << c->cid << " " << o->oid
12612 << " 0x" << std::hex << offset << "~" << length << std::dec
12613 << " = " << r << dendl;
12614 return r;
12615}
12616
12617int BlueStore::_zero(TransContext *txc,
12618 CollectionRef& c,
12619 OnodeRef& o,
12620 uint64_t offset, size_t length)
12621{
12622 dout(15) << __func__ << " " << c->cid << " " << o->oid
12623 << " 0x" << std::hex << offset << "~" << length << std::dec
12624 << dendl;
35e4c445
FG
12625 int r = 0;
12626 if (offset + length >= OBJECT_MAX_SIZE) {
12627 r = -E2BIG;
12628 } else {
12629 _assign_nid(txc, o);
12630 r = _do_zero(txc, c, o, offset, length);
12631 }
7c673cae
FG
12632 dout(10) << __func__ << " " << c->cid << " " << o->oid
12633 << " 0x" << std::hex << offset << "~" << length << std::dec
12634 << " = " << r << dendl;
12635 return r;
12636}
12637
12638int BlueStore::_do_zero(TransContext *txc,
12639 CollectionRef& c,
12640 OnodeRef& o,
12641 uint64_t offset, size_t length)
12642{
12643 dout(15) << __func__ << " " << c->cid << " " << o->oid
12644 << " 0x" << std::hex << offset << "~" << length << std::dec
12645 << dendl;
12646 int r = 0;
12647
81eedcae 12648 _dump_onode<30>(cct, *o);
7c673cae
FG
12649
12650 WriteContext wctx;
12651 o->extent_map.fault_range(db, offset, length);
12652 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 12653 o->extent_map.dirty_range(offset, length);
7c673cae
FG
12654 _wctx_finish(txc, c, o, &wctx);
12655
b32b8144 12656 if (length > 0 && offset + length > o->onode.size) {
7c673cae
FG
12657 o->onode.size = offset + length;
12658 dout(20) << __func__ << " extending size to " << offset + length
12659 << dendl;
12660 }
12661 txc->write_onode(o);
12662
12663 dout(10) << __func__ << " " << c->cid << " " << o->oid
12664 << " 0x" << std::hex << offset << "~" << length << std::dec
12665 << " = " << r << dendl;
12666 return r;
12667}
12668
12669void BlueStore::_do_truncate(
31f18b77
FG
12670 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
12671 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
12672{
12673 dout(15) << __func__ << " " << c->cid << " " << o->oid
12674 << " 0x" << std::hex << offset << std::dec << dendl;
12675
81eedcae 12676 _dump_onode<30>(cct, *o);
7c673cae
FG
12677
12678 if (offset == o->onode.size)
31f18b77 12679 return;
7c673cae
FG
12680
12681 if (offset < o->onode.size) {
12682 WriteContext wctx;
12683 uint64_t length = o->onode.size - offset;
12684 o->extent_map.fault_range(db, offset, length);
12685 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77
FG
12686 o->extent_map.dirty_range(offset, length);
12687 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
12688
12689 // if we have shards past EOF, ask for a reshard
12690 if (!o->onode.extent_map_shards.empty() &&
12691 o->onode.extent_map_shards.back().offset >= offset) {
12692 dout(10) << __func__ << " request reshard past EOF" << dendl;
12693 if (offset) {
12694 o->extent_map.request_reshard(offset - 1, offset + length);
12695 } else {
12696 o->extent_map.request_reshard(0, length);
12697 }
12698 }
12699 }
12700
12701 o->onode.size = offset;
12702
12703 txc->write_onode(o);
12704}
12705
35e4c445 12706int BlueStore::_truncate(TransContext *txc,
7c673cae
FG
12707 CollectionRef& c,
12708 OnodeRef& o,
12709 uint64_t offset)
12710{
12711 dout(15) << __func__ << " " << c->cid << " " << o->oid
12712 << " 0x" << std::hex << offset << std::dec
12713 << dendl;
35e4c445
FG
12714 int r = 0;
12715 if (offset >= OBJECT_MAX_SIZE) {
12716 r = -E2BIG;
12717 } else {
12718 _do_truncate(txc, c, o, offset);
12719 }
12720 dout(10) << __func__ << " " << c->cid << " " << o->oid
12721 << " 0x" << std::hex << offset << std::dec
12722 << " = " << r << dendl;
12723 return r;
7c673cae
FG
12724}
12725
12726int BlueStore::_do_remove(
12727 TransContext *txc,
12728 CollectionRef& c,
12729 OnodeRef o)
12730{
31f18b77 12731 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
12732 bool is_gen = !o->oid.is_no_gen();
12733 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
12734 if (o->onode.has_omap()) {
12735 o->flush();
11fdf7f2
TL
12736 _do_omap_clear(txc,
12737 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
12738 o->onode.nid);
7c673cae
FG
12739 }
12740 o->exists = false;
12741 string key;
12742 for (auto &s : o->extent_map.shards) {
12743 dout(20) << __func__ << " removing shard 0x" << std::hex
12744 << s.shard_info->offset << std::dec << dendl;
12745 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
12746 [&](const string& final_key) {
12747 txc->t->rmkey(PREFIX_OBJ, final_key);
12748 }
12749 );
12750 }
12751 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 12752 txc->note_removed_object(o);
7c673cae
FG
12753 o->extent_map.clear();
12754 o->onode = bluestore_onode_t();
12755 _debug_obj_on_delete(o->oid);
31f18b77 12756
224ce89b
WB
12757 if (!is_gen || maybe_unshared_blobs.empty()) {
12758 return 0;
12759 }
31f18b77 12760
224ce89b
WB
12761 // see if we can unshare blobs still referenced by the head
12762 dout(10) << __func__ << " gen and maybe_unshared_blobs "
12763 << maybe_unshared_blobs << dendl;
12764 ghobject_t nogen = o->oid;
12765 nogen.generation = ghobject_t::NO_GEN;
12766 OnodeRef h = c->onode_map.lookup(nogen);
12767
12768 if (!h || !h->exists) {
12769 return 0;
12770 }
12771
12772 dout(20) << __func__ << " checking for unshareable blobs on " << h
12773 << " " << h->oid << dendl;
12774 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
12775 for (auto& e : h->extent_map.extent_map) {
12776 const bluestore_blob_t& b = e.blob->get_blob();
12777 SharedBlob *sb = e.blob->shared_blob.get();
12778 if (b.is_shared() &&
12779 sb->loaded &&
12780 maybe_unshared_blobs.count(sb)) {
3efd9988
FG
12781 if (b.is_compressed()) {
12782 expect[sb].get(0, b.get_ondisk_length());
12783 } else {
12784 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
12785 expect[sb].get(off, len);
12786 return 0;
12787 });
12788 }
224ce89b
WB
12789 }
12790 }
31f18b77 12791
224ce89b
WB
12792 vector<SharedBlob*> unshared_blobs;
12793 unshared_blobs.reserve(maybe_unshared_blobs.size());
12794 for (auto& p : expect) {
12795 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
12796 if (p.first->persistent->ref_map == p.second) {
12797 SharedBlob *sb = p.first;
12798 dout(20) << __func__ << " unsharing " << *sb << dendl;
12799 unshared_blobs.push_back(sb);
12800 txc->unshare_blob(sb);
12801 uint64_t sbid = c->make_blob_unshared(sb);
12802 string key;
12803 get_shared_blob_key(sbid, &key);
12804 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
12805 }
12806 }
12807
12808 if (unshared_blobs.empty()) {
12809 return 0;
12810 }
12811
224ce89b
WB
12812 for (auto& e : h->extent_map.extent_map) {
12813 const bluestore_blob_t& b = e.blob->get_blob();
12814 SharedBlob *sb = e.blob->shared_blob.get();
12815 if (b.is_shared() &&
12816 std::find(unshared_blobs.begin(), unshared_blobs.end(),
12817 sb) != unshared_blobs.end()) {
12818 dout(20) << __func__ << " unsharing " << e << dendl;
12819 bluestore_blob_t& blob = e.blob->dirty_blob();
12820 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 12821 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
12822 }
12823 }
224ce89b
WB
12824 txc->write_onode(h);
12825
7c673cae
FG
12826 return 0;
12827}
12828
12829int BlueStore::_remove(TransContext *txc,
12830 CollectionRef& c,
12831 OnodeRef &o)
12832{
11fdf7f2
TL
12833 dout(15) << __func__ << " " << c->cid << " " << o->oid
12834 << " onode " << o.get()
12835 << " txc "<< txc << dendl;
7c673cae
FG
12836 int r = _do_remove(txc, c, o);
12837 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
12838 return r;
12839}
12840
12841int BlueStore::_setattr(TransContext *txc,
12842 CollectionRef& c,
12843 OnodeRef& o,
12844 const string& name,
12845 bufferptr& val)
12846{
12847 dout(15) << __func__ << " " << c->cid << " " << o->oid
12848 << " " << name << " (" << val.length() << " bytes)"
12849 << dendl;
12850 int r = 0;
3efd9988
FG
12851 if (val.is_partial()) {
12852 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
12853 val.length());
12854 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
12855 } else {
12856 auto& b = o->onode.attrs[name.c_str()] = val;
12857 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
12858 }
7c673cae
FG
12859 txc->write_onode(o);
12860 dout(10) << __func__ << " " << c->cid << " " << o->oid
12861 << " " << name << " (" << val.length() << " bytes)"
12862 << " = " << r << dendl;
12863 return r;
12864}
12865
12866int BlueStore::_setattrs(TransContext *txc,
12867 CollectionRef& c,
12868 OnodeRef& o,
12869 const map<string,bufferptr>& aset)
12870{
12871 dout(15) << __func__ << " " << c->cid << " " << o->oid
12872 << " " << aset.size() << " keys"
12873 << dendl;
12874 int r = 0;
12875 for (map<string,bufferptr>::const_iterator p = aset.begin();
12876 p != aset.end(); ++p) {
3efd9988
FG
12877 if (p->second.is_partial()) {
12878 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 12879 bufferptr(p->second.c_str(), p->second.length());
3efd9988
FG
12880 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
12881 } else {
12882 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
12883 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
12884 }
7c673cae
FG
12885 }
12886 txc->write_onode(o);
12887 dout(10) << __func__ << " " << c->cid << " " << o->oid
12888 << " " << aset.size() << " keys"
12889 << " = " << r << dendl;
12890 return r;
12891}
12892
12893
12894int BlueStore::_rmattr(TransContext *txc,
12895 CollectionRef& c,
12896 OnodeRef& o,
12897 const string& name)
12898{
12899 dout(15) << __func__ << " " << c->cid << " " << o->oid
12900 << " " << name << dendl;
12901 int r = 0;
12902 auto it = o->onode.attrs.find(name.c_str());
12903 if (it == o->onode.attrs.end())
12904 goto out;
12905
12906 o->onode.attrs.erase(it);
12907 txc->write_onode(o);
12908
12909 out:
12910 dout(10) << __func__ << " " << c->cid << " " << o->oid
12911 << " " << name << " = " << r << dendl;
12912 return r;
12913}
12914
12915int BlueStore::_rmattrs(TransContext *txc,
12916 CollectionRef& c,
12917 OnodeRef& o)
12918{
12919 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
12920 int r = 0;
12921
12922 if (o->onode.attrs.empty())
12923 goto out;
12924
12925 o->onode.attrs.clear();
12926 txc->write_onode(o);
12927
12928 out:
12929 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
12930 return r;
12931}
12932
11fdf7f2
TL
12933void BlueStore::_do_omap_clear(TransContext *txc, const string& omap_prefix,
12934 uint64_t id)
7c673cae 12935{
7c673cae
FG
12936 string prefix, tail;
12937 get_omap_header(id, &prefix);
12938 get_omap_tail(id, &tail);
11fdf7f2 12939 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 12940 txc->t->rmkey(omap_prefix, tail);
11fdf7f2
TL
12941 dout(20) << __func__ << " remove range start: "
12942 << pretty_binary_string(prefix) << " end: "
12943 << pretty_binary_string(tail) << dendl;
7c673cae
FG
12944}
12945
12946int BlueStore::_omap_clear(TransContext *txc,
12947 CollectionRef& c,
12948 OnodeRef& o)
12949{
12950 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
12951 int r = 0;
12952 if (o->onode.has_omap()) {
12953 o->flush();
11fdf7f2
TL
12954 _do_omap_clear(txc,
12955 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
12956 o->onode.nid);
7c673cae
FG
12957 o->onode.clear_omap_flag();
12958 txc->write_onode(o);
12959 }
12960 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
12961 return r;
12962}
12963
12964int BlueStore::_omap_setkeys(TransContext *txc,
12965 CollectionRef& c,
12966 OnodeRef& o,
12967 bufferlist &bl)
12968{
12969 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
12970 int r;
11fdf7f2 12971 auto p = bl.cbegin();
7c673cae
FG
12972 __u32 num;
12973 if (!o->onode.has_omap()) {
12974 o->onode.set_omap_flag();
11fdf7f2
TL
12975 if (o->oid.is_pgmeta()) {
12976 o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
12977 }
7c673cae 12978 txc->write_onode(o);
494da23a
TL
12979
12980 const string& prefix =
12981 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
12982 string key_tail;
12983 bufferlist tail;
12984 get_omap_tail(o->onode.nid, &key_tail);
12985 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
12986 } else {
12987 txc->note_modified_object(o);
12988 }
11fdf7f2
TL
12989 const string& prefix =
12990 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
7c673cae
FG
12991 string final_key;
12992 _key_encode_u64(o->onode.nid, &final_key);
12993 final_key.push_back('.');
11fdf7f2 12994 decode(num, p);
7c673cae
FG
12995 while (num--) {
12996 string key;
12997 bufferlist value;
11fdf7f2
TL
12998 decode(key, p);
12999 decode(value, p);
7c673cae
FG
13000 final_key.resize(9); // keep prefix
13001 final_key += key;
11fdf7f2 13002 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 13003 << " <- " << key << dendl;
11fdf7f2 13004 txc->t->set(prefix, final_key, value);
7c673cae
FG
13005 }
13006 r = 0;
13007 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13008 return r;
13009}
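// Layout of the omap keys written above: an 8-byte big-endian encoding of
// the onode nid (so keys sort by nid), a '.' separator, then the
// client-supplied key, stored under the PREFIX_OMAP (or PREFIX_PGMETA_OMAP)
// column prefix; final_key.resize(9) keeps just that nid + '.' prefix
// between iterations.  For example (hypothetical nid), nid 5 and key "foo"
// become "\x00\x00\x00\x00\x00\x00\x00\x05.foo".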
13010
13011int BlueStore::_omap_setheader(TransContext *txc,
13012 CollectionRef& c,
13013 OnodeRef &o,
13014 bufferlist& bl)
13015{
13016 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13017 int r;
13018 string key;
13019 if (!o->onode.has_omap()) {
13020 o->onode.set_omap_flag();
11fdf7f2
TL
13021 if (o->oid.is_pgmeta()) {
13022 o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
13023 }
7c673cae 13024 txc->write_onode(o);
494da23a
TL
13025
13026 const string& prefix =
13027 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13028 string key_tail;
13029 bufferlist tail;
13030 get_omap_tail(o->onode.nid, &key_tail);
13031 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
13032 } else {
13033 txc->note_modified_object(o);
13034 }
11fdf7f2
TL
13035 const string& prefix =
13036 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
7c673cae 13037 get_omap_header(o->onode.nid, &key);
11fdf7f2 13038 txc->t->set(prefix, key, bl);
7c673cae
FG
13039 r = 0;
13040 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13041 return r;
13042}
13043
13044int BlueStore::_omap_rmkeys(TransContext *txc,
13045 CollectionRef& c,
13046 OnodeRef& o,
13047 bufferlist& bl)
13048{
13049 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13050 int r = 0;
11fdf7f2 13051 auto p = bl.cbegin();
7c673cae
FG
13052 __u32 num;
13053 string final_key;
13054
13055 if (!o->onode.has_omap()) {
13056 goto out;
13057 }
11fdf7f2
TL
13058 {
13059 const string& prefix =
13060 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13061 _key_encode_u64(o->onode.nid, &final_key);
13062 final_key.push_back('.');
13063 decode(num, p);
13064 while (num--) {
13065 string key;
13066 decode(key, p);
13067 final_key.resize(9); // keep prefix
13068 final_key += key;
13069 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
13070 << " <- " << key << dendl;
13071 txc->t->rmkey(prefix, final_key);
13072 }
7c673cae
FG
13073 }
13074 txc->note_modified_object(o);
13075
13076 out:
13077 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13078 return r;
13079}
13080
13081int BlueStore::_omap_rmkey_range(TransContext *txc,
13082 CollectionRef& c,
13083 OnodeRef& o,
13084 const string& first, const string& last)
13085{
13086 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
7c673cae
FG
13087 string key_first, key_last;
13088 int r = 0;
13089 if (!o->onode.has_omap()) {
13090 goto out;
13091 }
11fdf7f2
TL
13092 {
13093 const string& prefix =
13094 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13095 o->flush();
13096 get_omap_key(o->onode.nid, first, &key_first);
13097 get_omap_key(o->onode.nid, last, &key_last);
13098 txc->t->rm_range_keys(prefix, key_first, key_last);
13099 dout(20) << __func__ << " remove range start: "
13100 << pretty_binary_string(key_first) << " end: "
13101 << pretty_binary_string(key_last) << dendl;
7c673cae
FG
13102 }
13103 txc->note_modified_object(o);
13104
13105 out:
13106 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13107 return r;
13108}
13109
13110int BlueStore::_set_alloc_hint(
13111 TransContext *txc,
13112 CollectionRef& c,
13113 OnodeRef& o,
13114 uint64_t expected_object_size,
13115 uint64_t expected_write_size,
13116 uint32_t flags)
13117{
13118 dout(15) << __func__ << " " << c->cid << " " << o->oid
13119 << " object_size " << expected_object_size
13120 << " write_size " << expected_write_size
13121 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
13122 << dendl;
13123 int r = 0;
13124 o->onode.expected_object_size = expected_object_size;
13125 o->onode.expected_write_size = expected_write_size;
13126 o->onode.alloc_hint_flags = flags;
13127 txc->write_onode(o);
13128 dout(10) << __func__ << " " << c->cid << " " << o->oid
13129 << " object_size " << expected_object_size
13130 << " write_size " << expected_write_size
13131 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
13132 << " = " << r << dendl;
13133 return r;
13134}
13135
13136int BlueStore::_clone(TransContext *txc,
13137 CollectionRef& c,
13138 OnodeRef& oldo,
13139 OnodeRef& newo)
13140{
13141 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13142 << newo->oid << dendl;
13143 int r = 0;
13144 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
13145 derr << __func__ << " mismatched hash on " << oldo->oid
13146 << " and " << newo->oid << dendl;
13147 return -EINVAL;
13148 }
13149
7c673cae
FG
13150 _assign_nid(txc, newo);
13151
13152 // clone data
13153 oldo->flush();
13154 _do_truncate(txc, c, newo, 0);
13155 if (cct->_conf->bluestore_clone_cow) {
13156 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
13157 } else {
13158 bufferlist bl;
13159 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
13160 if (r < 0)
13161 goto out;
13162 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
13163 if (r < 0)
13164 goto out;
13165 }
13166
13167 // clone attrs
13168 newo->onode.attrs = oldo->onode.attrs;
13169
13170 // clone omap
13171 if (newo->onode.has_omap()) {
13172 dout(20) << __func__ << " clearing old omap data" << dendl;
13173 newo->flush();
11fdf7f2
TL
13174 _do_omap_clear(txc,
13175 newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP
13176 : PREFIX_OMAP,
13177 newo->onode.nid);
494da23a 13178 newo->onode.clear_omap_flag();
7c673cae
FG
13179 }
13180 if (oldo->onode.has_omap()) {
13181 dout(20) << __func__ << " copying omap data" << dendl;
494da23a
TL
13182 newo->onode.set_omap_flag();
13183 if (newo->oid.is_pgmeta()) {
13184 newo->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
7c673cae 13185 }
11fdf7f2
TL
13186 const string& prefix =
13187 newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13188 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae
FG
13189 string head, tail;
13190 get_omap_header(oldo->onode.nid, &head);
13191 get_omap_tail(oldo->onode.nid, &tail);
13192 it->lower_bound(head);
13193 while (it->valid()) {
13194 if (it->key() >= tail) {
13195 dout(30) << __func__ << " reached tail" << dendl;
13196 break;
13197 } else {
13198 dout(30) << __func__ << " got header/data "
13199 << pretty_binary_string(it->key()) << dendl;
13200 string key;
13201 rewrite_omap_key(newo->onode.nid, it->key(), &key);
11fdf7f2 13202 txc->t->set(prefix, key, it->value());
7c673cae
FG
13203 }
13204 it->next();
13205 }
494da23a
TL
13206 string new_tail;
13207 bufferlist new_tail_value;
13208 get_omap_tail(newo->onode.nid, &new_tail);
13209 txc->t->set(prefix, new_tail, new_tail_value);
7c673cae
FG
13210 }
13211
13212 txc->write_onode(newo);
13213 r = 0;
13214
13215 out:
13216 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13217 << newo->oid << " = " << r << dendl;
13218 return r;
13219}
13220
13221int BlueStore::_do_clone_range(
13222 TransContext *txc,
13223 CollectionRef& c,
13224 OnodeRef& oldo,
13225 OnodeRef& newo,
224ce89b
WB
13226 uint64_t srcoff,
13227 uint64_t length,
13228 uint64_t dstoff)
7c673cae
FG
13229{
13230 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13231 << newo->oid
13232 << " 0x" << std::hex << srcoff << "~" << length << " -> "
13233 << " 0x" << dstoff << "~" << length << std::dec << dendl;
13234 oldo->extent_map.fault_range(db, srcoff, length);
13235 newo->extent_map.fault_range(db, dstoff, length);
81eedcae
TL
13236 _dump_onode<30>(cct, *oldo);
13237 _dump_onode<30>(cct, *newo);
7c673cae 13238
11fdf7f2 13239 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
81eedcae
TL
13240 _dump_onode<30>(cct, *oldo);
13241 _dump_onode<30>(cct, *newo);
7c673cae
FG
13242 return 0;
13243}
13244
13245int BlueStore::_clone_range(TransContext *txc,
13246 CollectionRef& c,
13247 OnodeRef& oldo,
13248 OnodeRef& newo,
13249 uint64_t srcoff, uint64_t length, uint64_t dstoff)
13250{
13251 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13252 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
13253 << " to offset 0x" << dstoff << std::dec << dendl;
13254 int r = 0;
13255
35e4c445
FG
13256 if (srcoff + length >= OBJECT_MAX_SIZE ||
13257 dstoff + length >= OBJECT_MAX_SIZE) {
13258 r = -E2BIG;
13259 goto out;
13260 }
7c673cae
FG
13261 if (srcoff + length > oldo->onode.size) {
13262 r = -EINVAL;
13263 goto out;
13264 }
13265
7c673cae
FG
13266 _assign_nid(txc, newo);
13267
13268 if (length > 0) {
13269 if (cct->_conf->bluestore_clone_cow) {
13270 _do_zero(txc, c, newo, dstoff, length);
13271 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
13272 } else {
13273 bufferlist bl;
13274 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
13275 if (r < 0)
13276 goto out;
13277 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
13278 if (r < 0)
13279 goto out;
13280 }
13281 }
13282
13283 txc->write_onode(newo);
13284 r = 0;
13285
13286 out:
13287 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13288 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
13289 << " to offset 0x" << dstoff << std::dec
13290 << " = " << r << dendl;
13291 return r;
13292}
13293
13294int BlueStore::_rename(TransContext *txc,
13295 CollectionRef& c,
13296 OnodeRef& oldo,
13297 OnodeRef& newo,
13298 const ghobject_t& new_oid)
13299{
13300 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13301 << new_oid << dendl;
13302 int r;
13303 ghobject_t old_oid = oldo->oid;
31f18b77 13304 mempool::bluestore_cache_other::string new_okey;
7c673cae
FG
13305
13306 if (newo) {
13307 if (newo->exists) {
13308 r = -EEXIST;
13309 goto out;
13310 }
11fdf7f2 13311 ceph_assert(txc->onodes.count(newo) == 0);
7c673cae
FG
13312 }
13313
13314 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
13315
13316 // rewrite shards
13317 {
13318 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
13319 get_object_key(cct, new_oid, &new_okey);
13320 string key;
13321 for (auto &s : oldo->extent_map.shards) {
13322 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
13323 [&](const string& final_key) {
13324 txc->t->rmkey(PREFIX_OBJ, final_key);
13325 }
13326 );
13327 s.dirty = true;
13328 }
13329 }
13330
13331 newo = oldo;
13332 txc->write_onode(newo);
13333
13334 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
13335 // Onode in the old slot
13336 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
13337 r = 0;
13338
f64942e4
AA
13339 // hold a ref to the new Onode in the old name's position, to ensure we
13340 // don't drop it from the cache before this txc commits (or else someone may
13341 // come along and read newo's metadata via the old name).
13342 txc->note_modified_object(oldo);
13343
7c673cae
FG
13344 out:
13345 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
13346 << new_oid << " = " << r << dendl;
13347 return r;
13348}
13349
13350// collections
13351
13352int BlueStore::_create_collection(
13353 TransContext *txc,
13354 const coll_t &cid,
13355 unsigned bits,
13356 CollectionRef *c)
13357{
13358 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
13359 int r;
13360 bufferlist bl;
13361
13362 {
13363 RWLock::WLocker l(coll_lock);
13364 if (*c) {
13365 r = -EEXIST;
13366 goto out;
13367 }
11fdf7f2
TL
13368 auto p = new_coll_map.find(cid);
13369 ceph_assert(p != new_coll_map.end());
13370 *c = p->second;
7c673cae
FG
13371 (*c)->cnode.bits = bits;
13372 coll_map[cid] = *c;
11fdf7f2 13373 new_coll_map.erase(p);
7c673cae 13374 }
11fdf7f2 13375 encode((*c)->cnode, bl);
7c673cae
FG
13376 txc->t->set(PREFIX_COLL, stringify(cid), bl);
13377 r = 0;
13378
13379 out:
13380 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
13381 return r;
13382}
13383
13384int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
13385 CollectionRef *c)
13386{
13387 dout(15) << __func__ << " " << cid << dendl;
13388 int r;
13389
11fdf7f2 13390 (*c)->flush_all_but_last();
7c673cae
FG
13391 {
13392 RWLock::WLocker l(coll_lock);
13393 if (!*c) {
13394 r = -ENOENT;
13395 goto out;
13396 }
13397 size_t nonexistent_count = 0;
11fdf7f2 13398 ceph_assert((*c)->exists);
7c673cae
FG
13399 if ((*c)->onode_map.map_any([&](OnodeRef o) {
13400 if (o->exists) {
494da23a
TL
13401 dout(1) << __func__ << " " << o->oid << " " << o
13402 << " exists in onode_map" << dendl;
7c673cae
FG
13403 return true;
13404 }
13405 ++nonexistent_count;
13406 return false;
13407 })) {
13408 r = -ENOTEMPTY;
13409 goto out;
13410 }
13411
13412 vector<ghobject_t> ls;
13413 ghobject_t next;
13414 // Enumerate onodes in db, up to nonexistent_count + 1
13415 // then check if all of them are marked as non-existent.
11fdf7f2 13416 // Bypass the check if (next != ghobject_t::get_max())
7c673cae
FG
13417 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
13418 nonexistent_count + 1, &ls, &next);
13419 if (r >= 0) {
11fdf7f2
TL
13420 // If true, the collection has more objects than nonexistent_count,
13421 // so bypass the check.
13422 bool exists = (!next.is_max());
7c673cae
FG
13423 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
13424 dout(10) << __func__ << " oid " << *it << dendl;
13425 auto onode = (*c)->onode_map.lookup(*it);
13426 exists = !onode || onode->exists;
13427 if (exists) {
494da23a
TL
13428 dout(1) << __func__ << " " << *it
13429 << " exists in db, "
13430 << (!onode ? "not present in ram" : "present in ram")
13431 << dendl;
7c673cae
FG
13432 }
13433 }
13434 if (!exists) {
11fdf7f2 13435 _do_remove_collection(txc, c);
7c673cae
FG
13436 r = 0;
13437 } else {
13438 dout(10) << __func__ << " " << cid
13439 << " is non-empty" << dendl;
13440 r = -ENOTEMPTY;
13441 }
13442 }
13443 }
13444
13445 out:
13446 dout(10) << __func__ << " " << cid << " = " << r << dendl;
13447 return r;
13448}
13449
11fdf7f2
TL
13450void BlueStore::_do_remove_collection(TransContext *txc,
13451 CollectionRef *c)
13452{
13453 coll_map.erase((*c)->cid);
13454 txc->removed_collections.push_back(*c);
13455 (*c)->exists = false;
13456 _osr_register_zombie((*c)->osr.get());
13457 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
13458 c->reset();
13459}
13460
7c673cae
FG
13461int BlueStore::_split_collection(TransContext *txc,
13462 CollectionRef& c,
13463 CollectionRef& d,
13464 unsigned bits, int rem)
13465{
13466 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
13467 << " bits " << bits << dendl;
13468 RWLock::WLocker l(c->lock);
13469 RWLock::WLocker l2(d->lock);
13470 int r;
13471
13472 // flush all previous deferred writes on this sequencer. this is a bit
13473 // heavyweight, but we need to make sure all deferred writes complete
13474 // before we split as the new collection's sequencer may need to order
13475 // this after those writes, and we don't bother with the complexity of
13476 // moving those TransContexts over to the new osr.
13477 _osr_drain_preceding(txc);
13478
13479 // move any cached items (onodes and referenced shared blobs) that will
13480 // belong to the child collection post-split. leave everything else behind.
13481 // this may include things that don't strictly belong to the now-smaller
13482 // parent split, but the OSD will always send us a split for every new
13483 // child.
13484
13485 spg_t pgid, dest_pgid;
13486 bool is_pg = c->cid.is_pg(&pgid);
11fdf7f2 13487 ceph_assert(is_pg);
7c673cae 13488 is_pg = d->cid.is_pg(&dest_pgid);
11fdf7f2 13489 ceph_assert(is_pg);
7c673cae
FG
13490
13491 // the destination should initially be empty.
11fdf7f2
TL
13492 ceph_assert(d->onode_map.empty());
13493 ceph_assert(d->shared_blob_set.empty());
13494 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
13495
13496 c->split_cache(d.get());
13497
13498 // adjust bits. note that this will be redundant for all but the first
13499 // split call for this parent (first child).
13500 c->cnode.bits = bits;
11fdf7f2 13501 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
13502 r = 0;
13503
13504 bufferlist bl;
11fdf7f2 13505 encode(c->cnode, bl);
7c673cae
FG
13506 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
13507
13508 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
13509 << " bits " << bits << " = " << r << dendl;
13510 return r;
13511}
13512
11fdf7f2
TL
13513int BlueStore::_merge_collection(
13514 TransContext *txc,
13515 CollectionRef *c,
13516 CollectionRef& d,
13517 unsigned bits)
13518{
13519 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
13520 << " bits " << bits << dendl;
13521 RWLock::WLocker l((*c)->lock);
13522 RWLock::WLocker l2(d->lock);
13523 int r;
13524
13525 coll_t cid = (*c)->cid;
13526
13527 // flush all previous deferred writes on the source collection to ensure
13528 // that all deferred writes complete before we merge as the target collection's
13529 // sequencer may need to order new ops after those writes.
13530
13531 _osr_drain((*c)->osr.get());
13532
13533 // move any cached items (onodes and referenced shared blobs) from the
13534 // source collection into the destination collection; after the merge they
13535 // all belong to the target, and the now-empty source collection is
13536 // removed below. split_cache() decides what to move based on the
13537 // destination's bits, so those are adjusted first.
13538
13539 spg_t pgid, dest_pgid;
13540 bool is_pg = cid.is_pg(&pgid);
13541 ceph_assert(is_pg);
13542 is_pg = d->cid.is_pg(&dest_pgid);
13543 ceph_assert(is_pg);
13544
13545 // adjust bits. note that this will be redundant for all but the first
13546 // merge call for the parent/target.
13547 d->cnode.bits = bits;
13548
13549 // behavior depends on the target's (d) bits, so do this after those are updated.
13550 (*c)->split_cache(d.get());
13551
13552 // remove source collection
13553 {
13554 RWLock::WLocker l3(coll_lock);
13555 _do_remove_collection(txc, c);
13556 }
13557
13558 r = 0;
13559
13560 bufferlist bl;
13561 encode(d->cnode, bl);
13562 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
13563
13564 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
13565 << " bits " << bits << " = " << r << dendl;
13566 return r;
13567}
13568
494da23a
TL
13569void BlueStore::log_latency(
13570 const char* name,
13571 int idx,
13572 const ceph::timespan& l,
13573 double lat_threshold,
13574 const char* info) const
13575{
13576 logger->tinc(idx, l);
13577 if (lat_threshold > 0.0 &&
13578 l >= make_timespan(lat_threshold)) {
13579 dout(0) << __func__ << " slow operation observed for " << name
13580 << ", latency = " << l
13581 << info
13582 << dendl;
13583 }
13584}
13585
11fdf7f2 13586void BlueStore::log_latency_fn(
494da23a 13587 const char* name,
11fdf7f2
TL
13588 int idx,
13589 const ceph::timespan& l,
494da23a
TL
13590 double lat_threshold,
13591 std::function<string (const ceph::timespan& lat)> fn) const
11fdf7f2 13592{
494da23a
TL
13593 logger->tinc(idx, l);
13594 if (lat_threshold > 0.0 &&
13595 l >= make_timespan(lat_threshold)) {
13596 dout(0) << __func__ << " slow operation observed for " << name
13597 << ", latency = " << l
13598 << fn(l)
13599 << dendl;
13600 }
11fdf7f2
TL
13601}
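// A hypothetical call site, to show the intent of the fn parameter: the
// lambda is invoked to build the extra detail only when the latency
// threshold is crossed, e.g.
//
//   ceph::timespan lat = ...;  // measured by the caller
//   log_latency_fn(
//     "_txc_committed_kv",          // hypothetical name for this sketch
//     l_bluestore_commit_lat,
//     lat,
//     cct->_conf->bluestore_log_op_age,
//     [&](const ceph::timespan& lat) {
//       return ", txc = " + stringify(txc);
//     });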
13602
13603
7c673cae
FG
13604// DB key value Histogram
13605#define KEY_SLAB 32
13606#define VALUE_SLAB 64
13607
13608const string prefix_onode = "o";
13609const string prefix_onode_shard = "x";
13610const string prefix_other = "Z";
13611
13612int BlueStore::DBHistogram::get_key_slab(size_t sz)
13613{
13614 return (sz/KEY_SLAB);
13615}
13616
13617string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
13618{
13619 int lower_bound = slab * KEY_SLAB;
13620 int upper_bound = (slab + 1) * KEY_SLAB;
13621 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
13622 return ret;
13623}
13624
13625int BlueStore::DBHistogram::get_value_slab(size_t sz)
13626{
13627 return (sz/VALUE_SLAB);
13628}
13629
13630string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
13631{
13632 int lower_bound = slab * VALUE_SLAB;
13633 int upper_bound = (slab + 1) * VALUE_SLAB;
13634 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
13635 return ret;
13636}
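// Worked example of the slab arithmetic above: a 45-byte key falls in key
// slab 45/32 = 1, reported as "[32,64)"; a 130-byte value falls in value
// slab 130/64 = 2, reported as "[128,192)".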
13637
13638void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
13639 const string &prefix, size_t key_size, size_t value_size)
13640{
13641 uint32_t key_slab = get_key_slab(key_size);
13642 uint32_t value_slab = get_value_slab(value_size);
13643 key_hist[prefix][key_slab].count++;
11fdf7f2
TL
13644 key_hist[prefix][key_slab].max_len =
13645 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
7c673cae
FG
13646 key_hist[prefix][key_slab].val_map[value_slab].count++;
13647 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11fdf7f2
TL
13648 std::max<size_t>(value_size,
13649 key_hist[prefix][key_slab].val_map[value_slab].max_len);
7c673cae
FG
13650}
13651
13652void BlueStore::DBHistogram::dump(Formatter *f)
13653{
13654 f->open_object_section("rocksdb_value_distribution");
13655 for (auto i : value_hist) {
13656 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
13657 }
13658 f->close_section();
13659
13660 f->open_object_section("rocksdb_key_value_histogram");
13661 for (auto i : key_hist) {
13662 f->dump_string("prefix", i.first);
13663 f->open_object_section("key_hist");
13664 for ( auto k : i.second) {
13665 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
13666 f->dump_unsigned("max_len", k.second.max_len);
13667 f->open_object_section("value_hist");
13668 for ( auto j : k.second.val_map) {
13669 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
13670 f->dump_unsigned("max_len", j.second.max_len);
13671 }
13672 f->close_section();
13673 }
13674 f->close_section();
13675 }
13676 f->close_section();
13677}
13678
13679 // Iterates through the db and collects the stats
13680void BlueStore::generate_db_histogram(Formatter *f)
13681{
13682 //globals
13683 uint64_t num_onodes = 0;
13684 uint64_t num_shards = 0;
13685 uint64_t num_super = 0;
13686 uint64_t num_coll = 0;
13687 uint64_t num_omap = 0;
11fdf7f2 13688 uint64_t num_pgmeta_omap = 0;
7c673cae
FG
13689 uint64_t num_deferred = 0;
13690 uint64_t num_alloc = 0;
13691 uint64_t num_stat = 0;
13692 uint64_t num_others = 0;
13693 uint64_t num_shared_shards = 0;
13694 size_t max_key_size =0, max_value_size = 0;
13695 uint64_t total_key_size = 0, total_value_size = 0;
13696 size_t key_size = 0, value_size = 0;
13697 DBHistogram hist;
13698
11fdf7f2 13699 auto start = coarse_mono_clock::now();
7c673cae 13700
11fdf7f2 13701 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
7c673cae
FG
13702 iter->seek_to_first();
13703 while (iter->valid()) {
13704 dout(30) << __func__ << " Key: " << iter->key() << dendl;
13705 key_size = iter->key_size();
13706 value_size = iter->value_size();
13707 hist.value_hist[hist.get_value_slab(value_size)]++;
11fdf7f2
TL
13708 max_key_size = std::max(max_key_size, key_size);
13709 max_value_size = std::max(max_value_size, value_size);
7c673cae
FG
13710 total_key_size += key_size;
13711 total_value_size += value_size;
13712
13713 pair<string,string> key(iter->raw_key());
13714
13715 if (key.first == PREFIX_SUPER) {
13716 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
13717 num_super++;
13718 } else if (key.first == PREFIX_STAT) {
13719 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
13720 num_stat++;
13721 } else if (key.first == PREFIX_COLL) {
13722 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
13723 num_coll++;
13724 } else if (key.first == PREFIX_OBJ) {
13725 if (key.second.back() == ONODE_KEY_SUFFIX) {
13726 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
13727 num_onodes++;
13728 } else {
13729 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
13730 num_shards++;
13731 }
13732 } else if (key.first == PREFIX_OMAP) {
13733 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
13734 num_omap++;
11fdf7f2
TL
13735 } else if (key.first == PREFIX_PGMETA_OMAP) {
13736 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
13737 num_pgmeta_omap++;
7c673cae
FG
13738 } else if (key.first == PREFIX_DEFERRED) {
13739 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
13740 num_deferred++;
11fdf7f2 13741 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
7c673cae
FG
13742 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
13743 num_alloc++;
13744 } else if (key.first == PREFIX_SHARED_BLOB) {
13745 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
13746 num_shared_shards++;
13747 } else {
13748 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
13749 num_others++;
13750 }
13751 iter->next();
13752 }
13753
11fdf7f2 13754 ceph::timespan duration = coarse_mono_clock::now() - start;
7c673cae
FG
13755 f->open_object_section("rocksdb_key_value_stats");
13756 f->dump_unsigned("num_onodes", num_onodes);
13757 f->dump_unsigned("num_shards", num_shards);
13758 f->dump_unsigned("num_super", num_super);
13759 f->dump_unsigned("num_coll", num_coll);
13760 f->dump_unsigned("num_omap", num_omap);
11fdf7f2 13761 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
7c673cae
FG
13762 f->dump_unsigned("num_deferred", num_deferred);
13763 f->dump_unsigned("num_alloc", num_alloc);
13764 f->dump_unsigned("num_stat", num_stat);
13765 f->dump_unsigned("num_shared_shards", num_shared_shards);
13766 f->dump_unsigned("num_others", num_others);
13767 f->dump_unsigned("max_key_size", max_key_size);
13768 f->dump_unsigned("max_value_size", max_value_size);
13769 f->dump_unsigned("total_key_size", total_key_size);
13770 f->dump_unsigned("total_value_size", total_value_size);
13771 f->close_section();
13772
13773 hist.dump(f);
13774
13775 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
13776
13777}
13778
31f18b77 13779void BlueStore::_flush_cache()
7c673cae
FG
13780{
13781 dout(10) << __func__ << dendl;
13782 for (auto i : cache_shards) {
13783 i->trim_all();
11fdf7f2 13784 ceph_assert(i->empty());
7c673cae
FG
13785 }
13786 for (auto& p : coll_map) {
3efd9988 13787 if (!p.second->onode_map.empty()) {
11fdf7f2
TL
13788 derr << __func__ << " stray onodes on " << p.first << dendl;
13789 p.second->onode_map.dump<0>(cct);
3efd9988
FG
13790 }
13791 if (!p.second->shared_blob_set.empty()) {
13792 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11fdf7f2 13793 p.second->shared_blob_set.dump<0>(cct);
3efd9988 13794 }
11fdf7f2
TL
13795 ceph_assert(p.second->onode_map.empty());
13796 ceph_assert(p.second->shared_blob_set.empty());
7c673cae
FG
13797 }
13798 coll_map.clear();
13799}
13800
31f18b77
FG
13801 // For an external caller.
13802 // Unlike _flush_cache() this is best-effort, e.g.,
13803 // we don't care if there are still some pinned onodes/data in the cache
13804 // after this command has completed.
11fdf7f2 13805int BlueStore::flush_cache(ostream *os)
31f18b77
FG
13806{
13807 dout(10) << __func__ << dendl;
13808 for (auto i : cache_shards) {
13809 i->trim_all();
13810 }
11fdf7f2
TL
13811
13812 return 0;
31f18b77
FG
13813}
13814
7c673cae
FG
13815void BlueStore::_apply_padding(uint64_t head_pad,
13816 uint64_t tail_pad,
7c673cae
FG
13817 bufferlist& padded)
13818{
7c673cae 13819 if (head_pad) {
224ce89b 13820 padded.prepend_zero(head_pad);
7c673cae
FG
13821 }
13822 if (tail_pad) {
13823 padded.append_zero(tail_pad);
13824 }
13825 if (head_pad || tail_pad) {
13826 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
13827 << " tail 0x" << tail_pad << std::dec << dendl;
13828 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
13829 }
13830}
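// For example (hypothetical sizes): head_pad = 0x800 and tail_pad = 0x400
// around a 0x1400-byte bufferlist yield a 0x2000-byte padded buffer and
// bump l_bluestore_write_pad_bytes by 0xc00.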
13831
11fdf7f2
TL
13832void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
13833{
13834 // finalize extent_map shards
13835 o->extent_map.update(txn, false);
13836 if (o->extent_map.needs_reshard()) {
13837 o->extent_map.reshard(db, txn);
13838 o->extent_map.update(txn, true);
13839 if (o->extent_map.needs_reshard()) {
13840 dout(20) << __func__ << " warning: still wants reshard, check options?"
13841 << dendl;
13842 o->extent_map.clear_needs_reshard();
13843 }
13844 logger->inc(l_bluestore_onode_reshard);
13845 }
13846
13847 // bound encode
13848 size_t bound = 0;
13849 denc(o->onode, bound);
13850 o->extent_map.bound_encode_spanning_blobs(bound);
13851 if (o->onode.extent_map_shards.empty()) {
13852 denc(o->extent_map.inline_bl, bound);
13853 }
13854
13855 // encode
13856 bufferlist bl;
13857 unsigned onode_part, blob_part, extent_part;
13858 {
13859 auto p = bl.get_contiguous_appender(bound, true);
13860 denc(o->onode, p);
13861 onode_part = p.get_logical_offset();
13862 o->extent_map.encode_spanning_blobs(p);
13863 blob_part = p.get_logical_offset() - onode_part;
13864 if (o->onode.extent_map_shards.empty()) {
13865 denc(o->extent_map.inline_bl, p);
13866 }
13867 extent_part = p.get_logical_offset() - onode_part - blob_part;
13868 }
13869
13870 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
13871 << " (" << onode_part << " bytes onode + "
13872 << blob_part << " bytes spanning blobs + "
13873 << extent_part << " bytes inline extents)"
13874 << dendl;
13875
13876
13877 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
13878}
13879
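// Collects the currently active BlueStore health alerts (disk size
// mismatch, legacy statfs, BlueFS spillover, unloadable compressors)
// into the supplied alert list for upper layers to report.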
13880void BlueStore::_log_alerts(osd_alert_list_t& alerts)
13881{
13882 std::lock_guard l(qlock);
13883
13884 if (!disk_size_mismatch_alert.empty()) {
13885 alerts.emplace(
13886 "BLUESTORE_DISK_SIZE_MISMATCH",
13887 disk_size_mismatch_alert);
13888 }
13889 if (!legacy_statfs_alert.empty()) {
13890 alerts.emplace(
13891 "BLUESTORE_LEGACY_STATFS",
13892 legacy_statfs_alert);
13893 }
13894 if (!spillover_alert.empty() &&
13895 cct->_conf->bluestore_warn_on_bluefs_spillover) {
13896 alerts.emplace(
13897 "BLUEFS_SPILLOVER",
13898 spillover_alert);
13899 }
13900 string s0(failed_cmode);
13901
13902 if (!failed_compressors.empty()) {
13903 if (!s0.empty()) {
13904 s0 += ", ";
13905 }
13906 s0 += "unable to load:";
13907 bool first = true;
13908 for (auto& s : failed_compressors) {
13909 if (first) {
13910 first = false;
13911 } else {
13912 s0 += ", ";
13913 }
13914 s0 += s;
13915 }
13916 alerts.emplace(
13917 "BLUESTORE_NO_COMPRESSION",
13918 s0);
13919 }
13920}
13921
7c673cae 13922// ===========================================
13923// BlueStoreRepairer
13924
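// Shrinks the per-granularity bloom filter vectors down to just the
// buckets that overlap the allocated extents passed in, presumably so
// that later checks only touch space that is actually in use.  May be
// called at most once per tracker instance.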
13925size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
13926 const interval_set<uint64_t>& extents)
13927{
13928 ceph_assert(granularity); // initialized
13929  // must not be called a second time
13930 ceph_assert(!was_filtered_out);
13931 ceph_assert(collections_bfs.size() == objects_bfs.size());
13932
13933 uint64_t prev_pos = 0;
13934 uint64_t npos = collections_bfs.size();
13935
13936 bloom_vector collections_reduced;
13937 bloom_vector objects_reduced;
13938
13939 for (auto e : extents) {
13940 if (e.second == 0) {
13941 continue;
13942 }
13943 uint64_t pos = max(e.first / granularity, prev_pos);
13944 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
13945 while (pos != npos && pos < end_pos) {
13946 ceph_assert( collections_bfs[pos].element_count() ==
13947 objects_bfs[pos].element_count());
13948 if (collections_bfs[pos].element_count()) {
13949 collections_reduced.push_back(std::move(collections_bfs[pos]));
13950 objects_reduced.push_back(std::move(objects_bfs[pos]));
13951 }
13952 ++pos;
13953 }
13954 prev_pos = end_pos;
13955 }
13956 collections_reduced.swap(collections_bfs);
13957 objects_reduced.swap(objects_bfs);
13958 was_filtered_out = true;
13959 return collections_bfs.size();
13960}
13961
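// Queues removal of a single key; the transaction is created lazily and
// submitted later by apply().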
13962bool BlueStoreRepairer::remove_key(KeyValueDB *db,
13963 const string& prefix,
13964 const string& key)
13965{
13966 if (!remove_key_txn) {
13967 remove_key_txn = db->get_transaction();
13968 }
13969 ++to_repair_cnt;
13970 remove_key_txn->rmkey(prefix, key);
13971
13972 return true;
13973}
13974
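// Queues an update (or removal, when bl is null) of a shared blob record.
// If a misreference-fix transaction is already open it is reused,
// presumably so the shared blob update commits together with the related
// extent fixes.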
13975bool BlueStoreRepairer::fix_shared_blob(
13976 KeyValueDB *db,
13977 uint64_t sbid,
13978 const bufferlist* bl)
13979{
13980 KeyValueDB::Transaction txn;
13981 if (fix_misreferences_txn) { // reuse this txn
13982 txn = fix_misreferences_txn;
13983 } else {
13984 if (!fix_shared_blob_txn) {
13985 fix_shared_blob_txn = db->get_transaction();
13986 }
13987 txn = fix_shared_blob_txn;
13988 }
13989 string key;
13990 get_shared_blob_key(sbid, &key);
13991
13992 ++to_repair_cnt;
13993 if (bl) {
13994 txn->set(PREFIX_SHARED_BLOB, key, *bl);
13995 } else {
13996 txn->rmkey(PREFIX_SHARED_BLOB, key);
13997 }
13998 return true;
13999}
14000
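// Re-encodes the corrected statfs values via BlueStore::volatile_statfs
// and queues them under PREFIX_STAT.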
14001bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
14002 const string& key,
14003 const store_statfs_t& new_statfs)
14004{
14005 if (!fix_statfs_txn) {
14006 fix_statfs_txn = db->get_transaction();
14007 }
14008 BlueStore::volatile_statfs vstatfs;
14009 vstatfs = new_statfs;
14010 bufferlist bl;
14011 vstatfs.encode(bl);
14012 ++to_repair_cnt;
14013 fix_statfs_txn->set(PREFIX_STAT, key, bl);
14014 return true;
14015}
14016
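// fix_leaked() returns space to the freelist that is marked allocated but
// is not referenced by anything; fix_false_free() does the opposite and
// re-allocates space that is referenced but recorded as free.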
14017bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
14018 FreelistManager* fm,
14019 uint64_t offset, uint64_t len)
14020{
14021 if (!fix_fm_leaked_txn) {
14022 fix_fm_leaked_txn = db->get_transaction();
14023 }
14024 ++to_repair_cnt;
14025 fm->release(offset, len, fix_fm_leaked_txn);
14026 return true;
14027}
14028bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
14029 FreelistManager* fm,
14030 uint64_t offset, uint64_t len)
14031{
14032 if (!fix_fm_false_free_txn) {
14033 fix_fm_false_free_txn = db->get_transaction();
14034 }
14035 ++to_repair_cnt;
14036 fm->allocate(offset, len, fix_fm_false_free_txn);
14037 return true;
14038}
14039
14040bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
14041{
14042  // This is just a stub that counts the repair properly;
14043  // the actual repair happens in BlueStore::_close_db_and_around()
14044  // while doing _sync_bluefs_and_fm().
14045 ++out_of_sync_flag;
14046 ++to_repair_cnt;
14047 return true;
14048}
14049
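// If any misreferenced extents were recorded earlier (during fsck/repair),
// narrows the space-usage tracker to the affected buckets and opens the
// transaction that the repair pass will populate; returns true when there
// is work to do.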
14050bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
14051{
14052 if (misreferenced_extents.size()) {
14053 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
14054 ceph_assert(n > 0);
14055 if (!fix_misreferences_txn) {
14056 fix_misreferences_txn = db->get_transaction();
14057 }
14058 return true;
14059 }
14060 return false;
14061}
14062
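// Submits all accumulated repair transactions synchronously, in a fixed
// order (freelist fixes first, then key removals, misreference and
// shared-blob fixes, statfs last), and returns the number of repairs
// while resetting the counter.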
14063unsigned BlueStoreRepairer::apply(KeyValueDB* db)
14064{
14065 if (fix_fm_leaked_txn) {
14066 db->submit_transaction_sync(fix_fm_leaked_txn);
14067 fix_fm_leaked_txn = nullptr;
14068 }
14069 if (fix_fm_false_free_txn) {
14070 db->submit_transaction_sync(fix_fm_false_free_txn);
14071 fix_fm_false_free_txn = nullptr;
14072 }
14073 if (remove_key_txn) {
14074 db->submit_transaction_sync(remove_key_txn);
14075 remove_key_txn = nullptr;
14076 }
14077 if (fix_misreferences_txn) {
14078 db->submit_transaction_sync(fix_misreferences_txn);
14079 fix_misreferences_txn = nullptr;
14080 }
14081 if (fix_shared_blob_txn) {
14082 db->submit_transaction_sync(fix_shared_blob_txn);
14083 fix_shared_blob_txn = nullptr;
14084 }
14085
14086 if (fix_statfs_txn) {
14087 db->submit_transaction_sync(fix_statfs_txn);
14088 fix_statfs_txn = nullptr;
14089 }
14090 unsigned repaired = to_repair_cnt;
14091 to_repair_cnt = 0;
14092 return repaired;
14093}
14094
14095// =======================================================
14096