// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <boost/container/flat_set.hpp>

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_cache_other);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);


// kv store prefixes
const string PREFIX_SUPER = "S";       // field -> value
const string PREFIX_STAT = "T";        // field -> value(int64 array)
const string PREFIX_COLL = "C";        // collection name -> cnode_t
const string PREFIX_OBJ = "O";         // object name -> onode_t
const string PREFIX_OMAP = "M";        // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
const string PREFIX_DEFERRED = "L";    // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";       // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE 4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED 8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS 4
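
// Illustrative sketch of the packing this implies (the authoritative
// logic lives in ExtentMap::{encode,decode}_some()): the flag bits
// occupy the low BLOBID_SHIFT_BITS of the encoded value, and the blob
// id, when present, is shifted above them, e.g.
//   encoded = (id << BLOBID_SHIFT_BITS) | flags;
//   flags   = encoded & ((1 << BLOBID_SHIFT_BITS) - 1);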

/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'
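
// An illustrative (not byte-exact) instance of the structure above, for
// an object "foo" with no namespace and no key: shard byte, u64
// (pool + 2^63), u32 reversed hash, "!" (escaped empty namespace),
// "foo!" followed by "=", u64 snap, u64 generation, then the trailing 'o'.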

/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 */
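
// Worked example (traced by hand through append_escaped() below):
// escaping "a#b" yields "a#23b!" -- 'a' and 'b' pass through, '#' (0x23)
// escapes to "#23", and '!' terminates the string.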
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') {
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}

// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
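// Worked examples (derived from the logic below): a two-byte key
// "\x01\x02" renders as "0x0102", while "ab\x00\x00\x00\x01" renders as
// "'ab'0x00000001" -- printable runs are quoted, binary runs are hexed,
// and four-byte binary runs print as whole u32s.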
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i=0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
        (mode == HEX && in.length() - i >= 4 &&
         ((in[i] < 32 || (unsigned char)in[i] > 126) ||
          (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
          (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
          (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
        out.append(in.c_str() + from, i - from);
        out.push_back('\'');
      }
      if (mode != HEX) {
        out.append("0x");
        mode = HEX;
      }
      if (in.length() - i >= 4) {
        // print a whole u32 at once
        snprintf(buf, sizeof(buf), "%08x",
                 (uint32_t)(((unsigned char)in[i] << 24) |
                            ((unsigned char)in[i+1] << 16) |
                            ((unsigned char)in[i+2] << 8) |
                            ((unsigned char)in[i+3] << 0)));
        i += 3;
      } else {
        snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
        out.push_back('\'');
        mode = STRING;
        from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_key_range(const coll_t& cid, int bits,
                               string *temp_start, string *temp_end,
                               string *start, string *end)
{
  temp_start->clear();
  temp_end->clear();
  start->clear();
  end->clear();

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    _key_encode_shard(pgid.shard, start);
    *temp_start = *start;

    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    _key_encode_u32(reverse_hash, start);
    _key_encode_u32(reverse_hash, temp_start);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    _key_encode_u32(end_hash, end);
    _key_encode_u32(end_hash, temp_end);
  } else {
    _key_encode_shard(shard_id_t::NO_SHARD, start);
    _key_encode_u64(-1ull + 0x8000000000000000ull, start);
    *end = *start;
    _key_encode_u32(0, start);
    _key_encode_u32(0xffffffff, end);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }
}

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < 1 + 8 + 4)
    return -1;
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something went wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = 1 + 8 + 4 +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << " r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << " t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}


// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

// '-' < '.' < '~'
static void get_omap_header(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('-');
}

// hmm, I don't think there's any need to escape the user key since we
// have a clean prefix.
static void get_omap_key(uint64_t id, const string& key, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('.');
  out->append(key);
}

static void rewrite_omap_key(uint64_t id, string old, string *out)
{
  _key_encode_u64(id, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

static void decode_omap_key(const string& key, string *user_key)
{
  *user_key = key.substr(sizeof(uint64_t) + 1);
}

static void get_omap_tail(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('~');
}
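
// Illustrative layout produced by the omap key helpers above (not a
// literal dump): for an onode with id N, the rows sort as
//   N '-'             header
//   N '.' <user key>  one row per omap entry
//   N '~'             tail
// which holds because '-' < '.' < '~' in ASCII.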

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << " shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << " " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << " attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}


// merge operators

struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    ceph_assert(llen == rlen);
    ceph_assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const ceph_le64* lv = (const ceph_le64*)ldata;
    const ceph_le64* rv = (const ceph_le64*)rdata;
    ceph_le64* nv = &(ceph_le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  const char *name() const override {
    return "int64_array";
  }
};
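
// Worked example for the operator above (values invented): merging an
// existing value encoding the le64 array {1, 2} with an operand encoding
// {3, 4} yields {4, 6} -- each 8-byte slot is summed independently.
// This lets bluestore accumulate int64-array stats (e.g. under
// PREFIX_STAT) without a read-modify-write cycle.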


// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{

  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
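
// Illustrative estimate (numbers invented): with min_alloc_size = 0x4000,
// a fully dereferenced compressed blob occupying 0x10000 bytes on disk is
// expected to release 4 allocation units; if rewriting the protrusive
// data around it is expected to consume 1 new unit, the benefit is 3,
// and the blob is collected once that meets
// bluestore_gc_enable_blob_threshold.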

// Cache

BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
                                           PerfCounters *logger)
{
  Cache *c = nullptr;

  if (type == "lru")
    c = new LRUCache(cct);
  else if (type == "2q")
    c = new TwoQCache(cct);
  else
    ceph_abort_msg("unrecognized cache type");

  c->logger = logger;
  return c;
}

void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max)
{
  std::lock_guard l(lock);
  if (cct->_conf->objectstore_blackhole) {
    // do not trim if we are throwing away IOs a layer down
    return;
  }
  _trim(onode_max, buffer_max);
}

void BlueStore::Cache::trim_all()
{
  std::lock_guard l(lock);
  // we should not be shutting down after the blackhole is enabled
  assert(!cct->_conf->objectstore_blackhole);
  _trim(0, 0);
}

// LRUCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "

void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}

void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_size << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  while (buffer_size > buffer_max) {
    auto i = buffer_lru.rbegin();
    if (i == buffer_lru.rend()) {
      // stop if buffer_lru is now empty
      break;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_clean());
    dout(20) << __func__ << " rm " << *b << dendl;
    b->space->_rm_buffer(this, b);
  }

  // onodes
  if (onode_max >= onode_lru.size()) {
    return; // don't even try
  }
  uint64_t num = onode_lru.size() - onode_max;

  auto p = onode_lru.end();
  ceph_assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << " " << o->oid << " has " << refs
               << " refs, skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " rm " << o->oid << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      ceph_assert(num == 1);
    }
    o->get(); // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}

#ifdef DEBUG_CACHE
void BlueStore::LRUCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
    s += i->length;
  }
  if (s != buffer_size) {
    derr << __func__ << " buffer_size " << buffer_size << " actual " << s
         << dendl;
    for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
      derr << __func__ << " " << *i << dendl;
    }
    ceph_assert(s == buffer_size);
  }
  dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
           << " ok" << dendl;
}
#endif

// TwoQCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "

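// A rough sketch of the 2Q policy as implemented below: buffers live on
// three lists -- warm_in (recently admitted), hot (re-referenced), and
// warm_out (empty "ghost" buffers that only remember recent evictions).
// New buffers enter warm_in; a buffer re-added while its ghost sits in
// warm_out is promoted to hot.  _trim() splits the byte budget between
// warm_in and hot via bluestore_2q_cache_kin_ratio and caps warm_out via
// bluestore_2q_cache_kout_ratio.
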
void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}

void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
{
  dout(20) << __func__ << " level " << level << " near " << near
           << " on " << *b
           << " which has cache_private " << b->cache_private << dendl;
  if (near) {
    b->cache_private = near->cache_private;
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
      break;
    case BUFFER_HOT:
      buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
  } else if (b->cache_private == BUFFER_NEW) {
    b->cache_private = BUFFER_WARM_IN;
    if (level > 0) {
      buffer_warm_in.push_front(*b);
    } else {
      // take caller hint to start at the back of the warm queue
      buffer_warm_in.push_back(*b);
    }
  } else {
    // we got a hint from discard
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // stay in warm_in.  move to front, even though 2Q doesn't actually
      // do this.
      dout(20) << __func__ << " move to front of warm " << *b << dendl;
      buffer_warm_in.push_front(*b);
      break;
    case BUFFER_WARM_OUT:
      b->cache_private = BUFFER_HOT;
      // move to hot.  fall-thru
    case BUFFER_HOT:
      dout(20) << __func__ << " move to front of hot " << *b << dendl;
      buffer_hot.push_front(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
  }
  if (!b->is_empty()) {
    buffer_bytes += b->length;
    buffer_list_bytes[b->cache_private] += b->length;
  }
}

void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
{
  dout(20) << __func__ << " " << *b << dendl;
  if (!b->is_empty()) {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    ceph_assert(buffer_list_bytes[b->cache_private] >= b->length);
    buffer_list_bytes[b->cache_private] -= b->length;
  }
  switch (b->cache_private) {
  case BUFFER_WARM_IN:
    buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
    break;
  case BUFFER_WARM_OUT:
    buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
    break;
  case BUFFER_HOT:
    buffer_hot.erase(buffer_hot.iterator_to(*b));
    break;
  default:
    ceph_abort_msg("bad cache_private");
  }
}

void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
{
  TwoQCache *src = static_cast<TwoQCache*>(srcc);
  src->_rm_buffer(b);

  // preserve which list we're on (even if we can't preserve the order!)
  switch (b->cache_private) {
  case BUFFER_WARM_IN:
    ceph_assert(!b->is_empty());
    buffer_warm_in.push_back(*b);
    break;
  case BUFFER_WARM_OUT:
    ceph_assert(b->is_empty());
    buffer_warm_out.push_back(*b);
    break;
  case BUFFER_HOT:
    ceph_assert(!b->is_empty());
    buffer_hot.push_back(*b);
    break;
  default:
    ceph_abort_msg("bad cache_private");
  }
  if (!b->is_empty()) {
    buffer_bytes += b->length;
    buffer_list_bytes[b->cache_private] += b->length;
  }
}

void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
{
  dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
  if (!b->is_empty()) {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
    ceph_assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
    buffer_list_bytes[b->cache_private] += delta;
  }
}

void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_bytes << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  if (buffer_bytes > buffer_max) {
    uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
    uint64_t khot = buffer_max - kin;

    // pre-calculate kout based on average buffer size too,
    // which is typical (the warm_in and hot lists may change later)
    uint64_t kout = 0;
    uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
    if (buffer_num) {
      uint64_t buffer_avg_size = buffer_bytes / buffer_num;
      ceph_assert(buffer_avg_size);
      uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
      kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
    }

    if (buffer_list_bytes[BUFFER_HOT] < khot) {
      // hot is small, give slack to warm_in
      kin += khot - buffer_list_bytes[BUFFER_HOT];
    } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
      // warm_in is small, give slack to hot
      khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
    }

    // adjust warm_in list
    int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
    uint64_t evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_warm_in.rbegin();
      if (p == buffer_warm_in.rend()) {
        // stop if warm_in list is now empty
        break;
      }

      Buffer *b = &*p;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
      buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->state = Buffer::STATE_EMPTY;
      b->data.clear();
      buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
      buffer_warm_out.push_front(*b);
      b->cache_private = BUFFER_WARM_OUT;
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << byte_u_t(evicted)
               << " from warm_in list, done evicting warm_in buffers"
               << dendl;
    }

    // adjust hot list
    to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
    evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_hot.rbegin();
      if (p == buffer_hot.rend()) {
        // stop if hot list is now empty
        break;
      }

      Buffer *b = &*p;
      dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
      ceph_assert(b->is_clean());
      // adjust evict size before buffer goes invalid
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->space->_rm_buffer(this, b);
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << byte_u_t(evicted)
               << " from hot list, done evicting hot buffers"
               << dendl;
    }

    // adjust warm out list too, if necessary
    int64_t num = buffer_warm_out.size() - kout;
    while (num-- > 0) {
      Buffer *b = &*buffer_warm_out.rbegin();
      ceph_assert(b->is_empty());
      dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
  }

  // onodes
  if (onode_max >= onode_lru.size()) {
    return; // don't even try
  }
  uint64_t num = onode_lru.size() - onode_max;

  auto p = onode_lru.end();
  ceph_assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    dout(20) << __func__ << " considering " << o << dendl;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << " " << o->oid << " has " << refs
               << " refs; skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " " << o->oid << " num=" << num
             << " lru size=" << onode_lru.size() << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      ceph_assert(num == 1);
    }
    o->get(); // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}

#ifdef DEBUG_CACHE
void BlueStore::TwoQCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
    s += i->length;
  }

  uint64_t hot_bytes = s;
  if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
    derr << __func__ << " hot_list_bytes "
         << buffer_list_bytes[BUFFER_HOT]
         << " != actual " << hot_bytes
         << dendl;
    ceph_assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
  }

  for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
    s += i->length;
  }

  uint64_t warm_in_bytes = s - hot_bytes;
  if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
    derr << __func__ << " warm_in_list_bytes "
         << buffer_list_bytes[BUFFER_WARM_IN]
         << " != actual " << warm_in_bytes
         << dendl;
    ceph_assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
  }

  if (s != buffer_bytes) {
    derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
         << dendl;
    ceph_assert(s == buffer_bytes);
  }

  dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
           << " ok" << dendl;
}
#endif


// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(Cache* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
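
// Illustrative cases for _discard() above (offsets invented): discarding
// 0x1000~0x1000 from a buffer at 0x0~0x3000 splits it -- the head
// (0x0~0x1000) is kept by truncating in place and a new tail buffer is
// re-added at 0x2000; a buffer entirely inside the discarded range is
// removed outright.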
1398
1399void BlueStore::BufferSpace::read(
1400 Cache* cache,
224ce89b
WB
1401 uint32_t offset,
1402 uint32_t length,
7c673cae 1403 BlueStore::ready_regions_t& res,
91327a77
AA
1404 interval_set<uint32_t>& res_intervals,
1405 int flags)
7c673cae 1406{
7c673cae
FG
1407 res.clear();
1408 res_intervals.clear();
1409 uint32_t want_bytes = length;
1410 uint32_t end = offset + length;
224ce89b
WB
1411
1412 {
11fdf7f2 1413 std::lock_guard l(cache->lock);
224ce89b
WB
1414 for (auto i = _data_lower_bound(offset);
1415 i != buffer_map.end() && offset < end && i->first < end;
1416 ++i) {
1417 Buffer *b = i->second.get();
11fdf7f2 1418 ceph_assert(b->end() > offset);
91327a77
AA
1419
1420 bool val = false;
1421 if (flags & BYPASS_CLEAN_CACHE)
1422 val = b->is_writing();
1423 else
1424 val = b->is_writing() || b->is_clean();
1425 if (val) {
224ce89b
WB
1426 if (b->offset < offset) {
1427 uint32_t skip = offset - b->offset;
11fdf7f2 1428 uint32_t l = min(length, b->length - skip);
224ce89b
WB
1429 res[offset].substr_of(b->data, skip, l);
1430 res_intervals.insert(offset, l);
1431 offset += l;
1432 length -= l;
1433 if (!b->is_writing()) {
1434 cache->_touch_buffer(b);
1435 }
1436 continue;
1437 }
1438 if (b->offset > offset) {
1439 uint32_t gap = b->offset - offset;
1440 if (length <= gap) {
1441 break;
1442 }
1443 offset += gap;
1444 length -= gap;
1445 }
1446 if (!b->is_writing()) {
7c673cae 1447 cache->_touch_buffer(b);
224ce89b
WB
1448 }
1449 if (b->length > length) {
1450 res[offset].substr_of(b->data, 0, length);
1451 res_intervals.insert(offset, length);
7c673cae 1452 break;
224ce89b
WB
1453 } else {
1454 res[offset].append(b->data);
1455 res_intervals.insert(offset, b->length);
1456 if (b->length == length)
1457 break;
1458 offset += b->length;
1459 length -= b->length;
1460 }
7c673cae
FG
1461 }
1462 }
1463 }
1464
1465 uint64_t hit_bytes = res_intervals.size();
11fdf7f2 1466 ceph_assert(hit_bytes <= want_bytes);
7c673cae
FG
1467 uint64_t miss_bytes = want_bytes - hit_bytes;
1468 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1469 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1470}
1471
f64942e4 1472void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq)
7c673cae 1473{
7c673cae
FG
1474 auto i = writing.begin();
1475 while (i != writing.end()) {
1476 if (i->seq > seq) {
1477 break;
1478 }
1479 if (i->seq < seq) {
1480 ++i;
1481 continue;
1482 }
1483
1484 Buffer *b = &*i;
11fdf7f2 1485 ceph_assert(b->is_writing());
7c673cae
FG
1486
1487 if (b->flags & Buffer::FLAG_NOCACHE) {
1488 writing.erase(i++);
1489 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1490 buffer_map.erase(b->offset);
1491 } else {
1492 b->state = Buffer::STATE_CLEAN;
1493 writing.erase(i++);
31f18b77
FG
1494 b->maybe_rebuild();
1495 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
7c673cae
FG
1496 cache->_add_buffer(b, 1, nullptr);
1497 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1498 }
1499 }
1500
1501 cache->_audit("finish_write end");
1502}
1503
1504void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1505{
11fdf7f2 1506 std::lock_guard lk(cache->lock);
7c673cae
FG
1507 if (buffer_map.empty())
1508 return;
1509
1510 auto p = --buffer_map.end();
1511 while (true) {
1512 if (p->second->end() <= pos)
1513 break;
1514
1515 if (p->second->offset < pos) {
1516 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1517 size_t left = pos - p->second->offset;
1518 size_t right = p->second->length - left;
1519 if (p->second->data.length()) {
1520 bufferlist bl;
1521 bl.substr_of(p->second->data, left, right);
1522 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1523 0, p->second.get());
1524 } else {
1525 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1526 0, p->second.get());
1527 }
1528 cache->_adjust_buffer_size(p->second.get(), -right);
1529 p->second->truncate(left);
1530 break;
1531 }
1532
11fdf7f2 1533 ceph_assert(p->second->end() > pos);
7c673cae
FG
1534 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1535 if (p->second->data.length()) {
1536 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1537 p->second->offset - pos, p->second->data),
1538 0, p->second.get());
1539 } else {
1540 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1541 p->second->offset - pos, p->second->length),
1542 0, p->second.get());
1543 }
1544 if (p == buffer_map.begin()) {
1545 _rm_buffer(cache, p);
1546 break;
1547 } else {
1548 _rm_buffer(cache, p--);
1549 }
1550 }
11fdf7f2 1551 ceph_assert(writing.empty());
7c673cae
FG
1552}
1553
1554// OnodeSpace
1555
1556#undef dout_prefix
1557#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1558
1559BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1560{
11fdf7f2 1561 std::lock_guard l(cache->lock);
7c673cae
FG
1562 auto p = onode_map.find(oid);
1563 if (p != onode_map.end()) {
1564 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1565 << " raced, returning existing " << p->second
1566 << dendl;
1567 return p->second;
1568 }
1569 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1570 onode_map[oid] = o;
1571 cache->_add_onode(o, 1);
1572 return o;
1573}
1574
1575BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1576{
7c673cae 1577 ldout(cache->cct, 30) << __func__ << dendl;
224ce89b
WB
1578 OnodeRef o;
1579 bool hit = false;
1580
1581 {
11fdf7f2 1582 std::lock_guard l(cache->lock);
224ce89b
WB
1583 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1584 if (p == onode_map.end()) {
1585 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1586 } else {
1587 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1588 << dendl;
1589 cache->_touch_onode(p->second);
1590 hit = true;
1591 o = p->second;
1592 }
1593 }
1594
1595 if (hit) {
1596 cache->logger->inc(l_bluestore_onode_hits);
1597 } else {
7c673cae 1598 cache->logger->inc(l_bluestore_onode_misses);
7c673cae 1599 }
224ce89b 1600 return o;
7c673cae
FG
1601}
1602
1603void BlueStore::OnodeSpace::clear()
1604{
11fdf7f2 1605 std::lock_guard l(cache->lock);
7c673cae
FG
1606 ldout(cache->cct, 10) << __func__ << dendl;
1607 for (auto &p : onode_map) {
1608 cache->_rm_onode(p.second);
1609 }
1610 onode_map.clear();
1611}
1612
1613bool BlueStore::OnodeSpace::empty()
1614{
11fdf7f2 1615 std::lock_guard l(cache->lock);
7c673cae
FG
1616 return onode_map.empty();
1617}
1618
1619void BlueStore::OnodeSpace::rename(
1620 OnodeRef& oldo,
1621 const ghobject_t& old_oid,
1622 const ghobject_t& new_oid,
31f18b77 1623 const mempool::bluestore_cache_other::string& new_okey)
7c673cae 1624{
11fdf7f2 1625 std::lock_guard l(cache->lock);
7c673cae
FG
1626 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1627 << dendl;
1628 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1629 po = onode_map.find(old_oid);
1630 pn = onode_map.find(new_oid);
11fdf7f2 1631 ceph_assert(po != pn);
7c673cae 1632
11fdf7f2 1633 ceph_assert(po != onode_map.end());
7c673cae
FG
1634 if (pn != onode_map.end()) {
1635 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1636 << dendl;
1637 cache->_rm_onode(pn->second);
1638 onode_map.erase(pn);
1639 }
1640 OnodeRef o = po->second;
1641
1642 // install a non-existent onode at old location
1643 oldo.reset(new Onode(o->c, old_oid, o->key));
1644 po->second = oldo;
1645 cache->_add_onode(po->second, 1);
1646
1647 // add at new position and fix oid, key
1648 onode_map.insert(make_pair(new_oid, o));
1649 cache->_touch_onode(o);
1650 o->oid = new_oid;
1651 o->key = new_okey;
1652}
1653
1654bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1655{
11fdf7f2 1656 std::lock_guard l(cache->lock);
7c673cae
FG
1657 ldout(cache->cct, 20) << __func__ << dendl;
1658 for (auto& i : onode_map) {
1659 if (f(i.second)) {
1660 return true;
1661 }
1662 }
1663 return false;
1664}
1665
11fdf7f2
TL
1666template <int LogLevelV = 30>
1667void BlueStore::OnodeSpace::dump(CephContext *cct)
3efd9988
FG
1668{
1669 for (auto& i : onode_map) {
11fdf7f2 1670 ldout(cct, LogLevelV) << i.first << " : " << i.second << dendl;
3efd9988
FG
1671 }
1672}
7c673cae
FG
1673
1674// SharedBlob
1675
1676#undef dout_prefix
1677#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1678
1679ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1680{
1681 out << "SharedBlob(" << &sb;
1682
1683 if (sb.loaded) {
1684 out << " loaded " << *sb.persistent;
1685 } else {
1686 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1687 }
1688 return out << ")";
1689}
1690
1691BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1692 : coll(_coll), sbid_unloaded(i)
1693{
11fdf7f2 1694 ceph_assert(sbid_unloaded > 0);
7c673cae
FG
1695 if (get_cache()) {
1696 get_cache()->add_blob();
1697 }
1698}
1699
1700BlueStore::SharedBlob::~SharedBlob()
1701{
7c673cae
FG
1702 if (loaded && persistent) {
1703 delete persistent;
1704 }
1705}
1706
1707void BlueStore::SharedBlob::put()
1708{
1709 if (--nref == 0) {
1710 ldout(coll->store->cct, 20) << __func__ << " " << this
1711 << " removing self from set " << get_parent()
1712 << dendl;
1adf2230
AA
1713 again:
1714 auto coll_snap = coll;
1715 if (coll_snap) {
11fdf7f2 1716 std::lock_guard l(coll_snap->cache->lock);
1adf2230
AA
1717 if (coll_snap != coll) {
1718 goto again;
1719 }
91327a77
AA
1720 if (!coll_snap->shared_blob_set.remove(this, true)) {
1721 // race with lookup
1722 return;
1723 }
1adf2230
AA
1724 bc._clear(coll_snap->cache);
1725 coll_snap->cache->rm_blob();
7c673cae 1726 }
28e407b8 1727 delete this;
7c673cae
FG
1728 }
1729}
1730
1731void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1732{
11fdf7f2 1733 ceph_assert(persistent);
7c673cae
FG
1734 persistent->ref_map.get(offset, length);
1735}
1736
1737void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
31f18b77 1738 PExtentVector *r,
11fdf7f2 1739 bool *unshare)
7c673cae 1740{
11fdf7f2
TL
1741 ceph_assert(persistent);
1742 persistent->ref_map.put(offset, length, r,
1743 unshare && !*unshare ? unshare : nullptr);
7c673cae
FG
1744}
1745
f64942e4
AA
1746void BlueStore::SharedBlob::finish_write(uint64_t seq)
1747{
1748 while (true) {
1749 Cache *cache = coll->cache;
11fdf7f2 1750 std::lock_guard l(cache->lock);
f64942e4
AA
1751 if (coll->cache != cache) {
1752 ldout(coll->store->cct, 20) << __func__
1753 << " raced with sb cache update, was " << cache
1754 << ", now " << coll->cache << ", retrying"
1755 << dendl;
1756 continue;
1757 }
1758 bc._finish_write(cache, seq);
1759 break;
1760 }
1761}
1762
3efd9988
FG
1763// SharedBlobSet
1764
1765#undef dout_prefix
1766#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
1767
11fdf7f2
TL
1768template <int LogLevelV = 30>
1769void BlueStore::SharedBlobSet::dump(CephContext *cct)
3efd9988 1770{
11fdf7f2 1771 std::lock_guard l(lock);
3efd9988 1772 for (auto& i : sb_map) {
11fdf7f2 1773 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
3efd9988
FG
1774 }
1775}
1776
7c673cae
FG
1777// Blob
1778
1779#undef dout_prefix
1780#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1781
1782ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1783{
1784 out << "Blob(" << &b;
1785 if (b.is_spanning()) {
1786 out << " spanning " << b.id;
1787 }
35e4c445
FG
1788 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1789 if (b.shared_blob) {
1790 out << " " << *b.shared_blob;
1791 } else {
1792 out << " (shared_blob=NULL)";
1793 }
1794 out << ")";
7c673cae
FG
1795 return out;
1796}
1797
1798void BlueStore::Blob::discard_unallocated(Collection *coll)
1799{
224ce89b 1800 if (get_blob().is_shared()) {
7c673cae
FG
1801 return;
1802 }
224ce89b 1803 if (get_blob().is_compressed()) {
7c673cae
FG
1804 bool discard = false;
1805 bool all_invalid = true;
224ce89b 1806 for (auto e : get_blob().get_extents()) {
7c673cae
FG
1807 if (!e.is_valid()) {
1808 discard = true;
1809 } else {
1810 all_invalid = false;
1811 }
1812 }
11fdf7f2 1813 ceph_assert(discard == all_invalid); // in case of compressed blob all
7c673cae
FG
1814 // or none pextents are invalid.
1815 if (discard) {
224ce89b
WB
1816 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1817 get_blob().get_logical_length());
7c673cae
FG
1818 }
1819 } else {
1820 size_t pos = 0;
224ce89b 1821 for (auto e : get_blob().get_extents()) {
7c673cae
FG
1822 if (!e.is_valid()) {
1823 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1824 << "~" << e.length
1825 << std::dec << dendl;
1826 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1827 }
1828 pos += e.length;
1829 }
224ce89b
WB
1830 if (get_blob().can_prune_tail()) {
1831 dirty_blob().prune_tail();
1832 used_in_blob.prune_tail(get_blob().get_ondisk_length());
7c673cae 1833 auto cct = coll->store->cct; //used by dout
224ce89b 1834 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
7c673cae
FG
1835 }
1836 }
1837}
1838
1839void BlueStore::Blob::get_ref(
1840 Collection *coll,
1841 uint32_t offset,
1842 uint32_t length)
1843{
 1844 // The caller has to initialize the Blob's logical length before incrementing
 1845 // references. Otherwise we can neither determine the required number of
 1846 // counters for per-au tracking nor obtain min_release_size for
 1847 // single-counter mode.
11fdf7f2 1848 ceph_assert(get_blob().get_logical_length() != 0);
7c673cae
FG
1849 auto cct = coll->store->cct;
1850 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1851 << std::dec << " " << *this << dendl;
1852
1853 if (used_in_blob.is_empty()) {
1854 uint32_t min_release_size =
224ce89b
WB
1855 get_blob().get_release_size(coll->store->min_alloc_size);
1856 uint64_t l = get_blob().get_logical_length();
1857 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1858 << min_release_size << std::dec << dendl;
7c673cae
FG
1859 used_in_blob.init(l, min_release_size);
1860 }
1861 used_in_blob.get(
1862 offset,
1863 length);
1864}
1865
1866bool BlueStore::Blob::put_ref(
1867 Collection *coll,
1868 uint32_t offset,
1869 uint32_t length,
1870 PExtentVector *r)
1871{
1872 PExtentVector logical;
1873
1874 auto cct = coll->store->cct;
1875 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1876 << std::dec << " " << *this << dendl;
1877
1878 bool empty = used_in_blob.put(
1879 offset,
1880 length,
1881 &logical);
1882 r->clear();
1883 // nothing to release
1884 if (!empty && logical.empty()) {
1885 return false;
1886 }
1887
1888 bluestore_blob_t& b = dirty_blob();
1889 return b.release_extents(empty, logical, r);
1890}
1891
224ce89b 1892bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
1893 uint32_t target_blob_size,
1894 uint32_t b_offset,
1895 uint32_t *length0) {
11fdf7f2
TL
1896 ceph_assert(min_alloc_size);
1897 ceph_assert(target_blob_size);
7c673cae
FG
1898 if (!get_blob().is_mutable()) {
1899 return false;
1900 }
1901
1902 uint32_t length = *length0;
1903 uint32_t end = b_offset + length;
1904
 1905 // For the sake of simplicity we currently skip blob reuse if the data is
 1906 // not aligned to the csum chunk size. Padding could be added later if needed.
1907 if (get_blob().has_csum() &&
1908 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1909 (end % get_blob().get_csum_chunk_size()) != 0)) {
1910 return false;
1911 }
1912
1913 auto blen = get_blob().get_logical_length();
1914 uint32_t new_blen = blen;
1915
1916 // make sure target_blob_size isn't less than current blob len
11fdf7f2 1917 target_blob_size = std::max(blen, target_blob_size);
7c673cae
FG
1918
1919 if (b_offset >= blen) {
224ce89b
WB
1920 // new data totally stands out of the existing blob
1921 new_blen = end;
7c673cae 1922 } else {
224ce89b 1923 // new data overlaps with the existing blob
11fdf7f2 1924 new_blen = std::max(blen, end);
224ce89b
WB
1925
1926 uint32_t overlap = 0;
1927 if (new_blen > blen) {
1928 overlap = blen - b_offset;
1929 } else {
1930 overlap = length;
1931 }
1932
1933 if (!get_blob().is_unallocated(b_offset, overlap)) {
1934 // abort if any piece of the overlap has already been allocated
1935 return false;
7c673cae
FG
1936 }
1937 }
224ce89b 1938
7c673cae
FG
1939 if (new_blen > blen) {
1940 int64_t overflow = int64_t(new_blen) - target_blob_size;
 1941 // Unable to decrease the provided length enough to fit into target_blob_size
1942 if (overflow >= length) {
1943 return false;
1944 }
1945
1946 // FIXME: in some cases we could reduce unused resolution
1947 if (get_blob().has_unused()) {
1948 return false;
1949 }
1950
1951 if (overflow > 0) {
1952 new_blen -= overflow;
1953 length -= overflow;
1954 *length0 = length;
1955 }
224ce89b 1956
7c673cae
FG
1957 if (new_blen > blen) {
1958 dirty_blob().add_tail(new_blen);
1959 used_in_blob.add_tail(new_blen,
224ce89b 1960 get_blob().get_release_size(min_alloc_size));
7c673cae
FG
1961 }
1962 }
1963 return true;
1964}
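// Illustrative walk-through of the overflow trimming above (hypothetical
// numbers): with blen=0x8000, target_blob_size=0x10000, b_offset=0xe000 and
// *length0=0x4000, the write ends at 0x12000, so new_blen=0x12000 and
// overflow=0x2000. Since overflow < length the blob is reusable: new_blen and
// *length0 are trimmed by 0x2000 to 0x10000 and 0x2000 respectively, and the
// caller is expected to place the remainder elsewhere.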
1965
1966void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1967{
1968 auto cct = coll->store->cct; //used by dout
1969 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1970 << " start " << *this << dendl;
11fdf7f2
TL
1971 ceph_assert(blob.can_split());
1972 ceph_assert(used_in_blob.can_split());
7c673cae
FG
1973 bluestore_blob_t &lb = dirty_blob();
1974 bluestore_blob_t &rb = r->dirty_blob();
1975
1976 used_in_blob.split(
1977 blob_offset,
1978 &(r->used_in_blob));
1979
1980 lb.split(blob_offset, rb);
1981 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1982
1983 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1984 << " finish " << *this << dendl;
1985 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1986 << " and " << *r << dendl;
1987}
1988
1989#ifndef CACHE_BLOB_BL
1990void BlueStore::Blob::decode(
1991 Collection *coll,
11fdf7f2 1992 bufferptr::const_iterator& p,
7c673cae
FG
1993 uint64_t struct_v,
1994 uint64_t* sbid,
1995 bool include_ref_map)
1996{
1997 denc(blob, p, struct_v);
1998 if (blob.is_shared()) {
1999 denc(*sbid, p);
2000 }
2001 if (include_ref_map) {
2002 if (struct_v > 1) {
2003 used_in_blob.decode(p);
2004 } else {
2005 used_in_blob.clear();
2006 bluestore_extent_ref_map_t legacy_ref_map;
2007 legacy_ref_map.decode(p);
2008 for (auto r : legacy_ref_map.ref_map) {
2009 get_ref(
2010 coll,
2011 r.first,
2012 r.second.refs * r.second.length);
2013 }
2014 }
2015 }
2016}
2017#endif
2018
2019// Extent
2020
2021ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2022{
2023 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2024 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2025 << " " << *e.blob;
2026}
2027
2028// OldExtent
2029BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2030 uint32_t lo,
2031 uint32_t o,
2032 uint32_t l,
2033 BlobRef& b) {
2034 OldExtent* oe = new OldExtent(lo, o, l, b);
2035 b->put_ref(c.get(), o, l, &(oe->r));
2036 oe->blob_empty = b->get_referenced_bytes() == 0;
2037 return oe;
2038}
2039
2040// ExtentMap
2041
2042#undef dout_prefix
2043#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2044
2045BlueStore::ExtentMap::ExtentMap(Onode *o)
2046 : onode(o),
2047 inline_bl(
2048 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2049}
2050
11fdf7f2
TL
2051void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2052 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2053 uint64_t& length, uint64_t& dstoff) {
2054
2055 auto cct = onode->c->store->cct;
2056 bool inject_21040 =
2057 cct->_conf->bluestore_debug_inject_bug21040;
2058 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2059 for (auto& e : oldo->extent_map.extent_map) {
2060 e.blob->last_encoded_id = -1;
2061 }
2062
2063 int n = 0;
2064 uint64_t end = srcoff + length;
2065 uint32_t dirty_range_begin = 0;
2066 uint32_t dirty_range_end = 0;
2067 bool src_dirty = false;
2068 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2069 ep != oldo->extent_map.extent_map.end();
2070 ++ep) {
2071 auto& e = *ep;
2072 if (e.logical_offset >= end) {
2073 break;
2074 }
2075 dout(20) << __func__ << " src " << e << dendl;
2076 BlobRef cb;
2077 bool blob_duped = true;
2078 if (e.blob->last_encoded_id >= 0) {
2079 cb = id_to_blob[e.blob->last_encoded_id];
2080 blob_duped = false;
2081 } else {
2082 // dup the blob
2083 const bluestore_blob_t& blob = e.blob->get_blob();
2084 // make sure it is shared
2085 if (!blob.is_shared()) {
2086 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2087 if (!inject_21040 && !src_dirty) {
2088 src_dirty = true;
2089 dirty_range_begin = e.logical_offset;
2090 } else if (inject_21040 &&
2091 dirty_range_begin == 0 && dirty_range_end == 0) {
2092 dirty_range_begin = e.logical_offset;
2093 }
2094 ceph_assert(e.logical_end() > 0);
2095 // -1 to exclude next potential shard
2096 dirty_range_end = e.logical_end() - 1;
2097 } else {
2098 c->load_shared_blob(e.blob->shared_blob);
2099 }
2100 cb = new Blob();
2101 e.blob->last_encoded_id = n;
2102 id_to_blob[n] = cb;
2103 e.blob->dup(*cb);
2104 // bump the extent refs on the copied blob's extents
2105 for (auto p : blob.get_extents()) {
2106 if (p.is_valid()) {
2107 e.blob->shared_blob->get_ref(p.offset, p.length);
2108 }
2109 }
2110 txc->write_shared_blob(e.blob->shared_blob);
2111 dout(20) << __func__ << " new " << *cb << dendl;
2112 }
2113
2114 int skip_front, skip_back;
2115 if (e.logical_offset < srcoff) {
2116 skip_front = srcoff - e.logical_offset;
2117 } else {
2118 skip_front = 0;
2119 }
2120 if (e.logical_end() > end) {
2121 skip_back = e.logical_end() - end;
2122 } else {
2123 skip_back = 0;
2124 }
2125
2126 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2127 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2128 newo->extent_map.extent_map.insert(*ne);
2129 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2130 // fixme: we may leave parts of new blob unreferenced that could
2131 // be freed (relative to the shared_blob).
2132 txc->statfs_delta.stored() += ne->length;
2133 if (e.blob->get_blob().is_compressed()) {
2134 txc->statfs_delta.compressed_original() += ne->length;
2135 if (blob_duped) {
2136 txc->statfs_delta.compressed() +=
2137 cb->get_blob().get_compressed_payload_length();
2138 }
2139 }
2140 dout(20) << __func__ << " dst " << *ne << dendl;
2141 ++n;
2142 }
2143 if ((!inject_21040 && src_dirty) ||
2144 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2145 oldo->extent_map.dirty_range(dirty_range_begin,
2146 dirty_range_end - dirty_range_begin);
2147 txc->write_onode(oldo);
2148 }
2149 txc->write_onode(newo);
2150
2151 if (dstoff + length > newo->onode.size) {
2152 newo->onode.size = dstoff + length;
2153 }
2154 newo->extent_map.dirty_range(dstoff, length);
2155}
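// In short: dup() implements clone by converting every blob overlapping
// [srcoff, srcoff+length) into a shared blob (bumping per-pextent refs in the
// SharedBlob's ref_map) and inserting new extents into newo that point at
// copies of those blobs, shifted by dstoff - srcoff. Only metadata is
// duplicated; no object data is rewritten.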
7c673cae
FG
2156void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2157 bool force)
2158{
2159 auto cct = onode->c->store->cct; //used by dout
2160 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2161 if (onode->onode.extent_map_shards.empty()) {
2162 if (inline_bl.length() == 0) {
2163 unsigned n;
2164 // we need to encode inline_bl to measure encoded length
2165 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
3efd9988 2166 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11fdf7f2 2167 ceph_assert(!never_happen);
7c673cae
FG
2168 size_t len = inline_bl.length();
2169 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2170 << " extents" << dendl;
2171 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2172 request_reshard(0, OBJECT_MAX_SIZE);
2173 return;
2174 }
2175 }
2176 // will persist in the onode key.
2177 } else {
2178 // pending shard update
2179 struct dirty_shard_t {
2180 Shard *shard;
2181 bufferlist bl;
2182 dirty_shard_t(Shard *s) : shard(s) {}
2183 };
2184 vector<dirty_shard_t> encoded_shards;
2185 // allocate slots for all shards in a single call instead of
 2186 // doing multiple allocations - one per dirty shard
2187 encoded_shards.reserve(shards.size());
2188
2189 auto p = shards.begin();
2190 auto prev_p = p;
2191 while (p != shards.end()) {
11fdf7f2 2192 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2193 auto n = p;
2194 ++n;
2195 if (p->dirty) {
2196 uint32_t endoff;
2197 if (n == shards.end()) {
2198 endoff = OBJECT_MAX_SIZE;
2199 } else {
2200 endoff = n->shard_info->offset;
2201 }
2202 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2203 bufferlist& bl = encoded_shards.back().bl;
2204 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2205 bl, &p->extents)) {
2206 if (force) {
2207 derr << __func__ << " encode_some needs reshard" << dendl;
11fdf7f2 2208 ceph_assert(!force);
7c673cae
FG
2209 }
2210 }
2211 size_t len = bl.length();
2212
2213 dout(20) << __func__ << " shard 0x" << std::hex
2214 << p->shard_info->offset << std::dec << " is " << len
2215 << " bytes (was " << p->shard_info->bytes << ") from "
2216 << p->extents << " extents" << dendl;
2217
2218 if (!force) {
2219 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2220 // we are big; reshard ourselves
2221 request_reshard(p->shard_info->offset, endoff);
2222 }
2223 // avoid resharding the trailing shard, even if it is small
2224 else if (n != shards.end() &&
11fdf7f2
TL
2225 len < g_conf()->bluestore_extent_map_shard_min_size) {
2226 ceph_assert(endoff != OBJECT_MAX_SIZE);
31f18b77
FG
2227 if (p == shards.begin()) {
2228 // we are the first shard, combine with next shard
7c673cae 2229 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2230 } else {
31f18b77
FG
2231 // combine either with the previous shard or the next,
2232 // whichever is smaller
7c673cae
FG
2233 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2234 request_reshard(p->shard_info->offset, endoff + 1);
2235 } else {
2236 request_reshard(prev_p->shard_info->offset, endoff);
2237 }
2238 }
2239 }
2240 }
2241 }
2242 prev_p = p;
2243 p = n;
2244 }
2245 if (needs_reshard()) {
2246 return;
2247 }
2248
2249 // schedule DB update for dirty shards
2250 string key;
2251 for (auto& it : encoded_shards) {
2252 it.shard->dirty = false;
2253 it.shard->shard_info->bytes = it.bl.length();
2254 generate_extent_shard_key_and_apply(
2255 onode->key,
2256 it.shard->shard_info->offset,
2257 &key,
2258 [&](const string& final_key) {
2259 t->set(PREFIX_OBJ, final_key, it.bl);
2260 }
2261 );
2262 }
2263 }
2264}
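// Resharding policy applied above, for reference: a shard whose encoded size
// exceeds bluestore_extent_map_shard_max_size is split, while a non-trailing
// shard smaller than bluestore_extent_map_shard_min_size is merged with the
// smaller of its neighbours (or with the next shard when it is the first).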
2265
31f18b77
FG
2266bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2267{
2268 if (spanning_blob_map.empty())
2269 return 0;
2270 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2271 // bid is valid and available.
2272 if (bid >= 0)
2273 return bid;
 2274 // Find the next unused bid.
2275 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2276 const auto begin_bid = bid;
2277 do {
2278 if (!spanning_blob_map.count(bid))
2279 return bid;
2280 else {
2281 bid++;
2282 if (bid < 0) bid = 0;
2283 }
2284 } while (bid != begin_bid);
81eedcae
TL
2285 auto cct = onode->c->store->cct; // used by dout
2286 _dump_onode<0>(cct, *onode);
11fdf7f2 2287 ceph_abort_msg("no available blob id");
31f18b77
FG
2288}
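// Illustrative example (hypothetical ids): if spanning_blob_map holds
// {0, 1, 2} the next id is simply 3. But if the largest id is already at the
// positive limit of bid_t, the +1 wraps negative, so the code falls back to
// probing from a random starting id, scanning forward (wrapping to 0) until
// it finds an unused id, and aborts only if every id is taken.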
2289
7c673cae
FG
2290void BlueStore::ExtentMap::reshard(
2291 KeyValueDB *db,
2292 KeyValueDB::Transaction t)
2293{
2294 auto cct = onode->c->store->cct; // used by dout
2295
2296 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2297 << needs_reshard_end << ")" << std::dec
2298 << " of " << onode->onode.extent_map_shards.size()
2299 << " shards on " << onode->oid << dendl;
2300 for (auto& p : spanning_blob_map) {
2301 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2302 << dendl;
2303 }
2304 // determine shard index range
2305 unsigned si_begin = 0, si_end = 0;
2306 if (!shards.empty()) {
2307 while (si_begin + 1 < shards.size() &&
2308 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2309 ++si_begin;
2310 }
2311 needs_reshard_begin = shards[si_begin].shard_info->offset;
2312 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2313 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2314 needs_reshard_end = shards[si_end].shard_info->offset;
2315 break;
2316 }
2317 }
2318 if (si_end == shards.size()) {
2319 needs_reshard_end = OBJECT_MAX_SIZE;
2320 }
2321 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2322 << " over 0x[" << std::hex << needs_reshard_begin << ","
2323 << needs_reshard_end << ")" << std::dec << dendl;
2324 }
2325
181888fb 2326 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
7c673cae
FG
2327
 2328 // we may need to fault in a larger interval later: we must have all
 2329 // referring extents for spanning blobs loaded in order to have
 2330 // accurate use_tracker values.
2331 uint32_t spanning_scan_begin = needs_reshard_begin;
2332 uint32_t spanning_scan_end = needs_reshard_end;
2333
2334 // remove old keys
2335 string key;
2336 for (unsigned i = si_begin; i < si_end; ++i) {
2337 generate_extent_shard_key_and_apply(
2338 onode->key, shards[i].shard_info->offset, &key,
2339 [&](const string& final_key) {
2340 t->rmkey(PREFIX_OBJ, final_key);
2341 }
2342 );
2343 }
2344
2345 // calculate average extent size
2346 unsigned bytes = 0;
2347 unsigned extents = 0;
2348 if (onode->onode.extent_map_shards.empty()) {
2349 bytes = inline_bl.length();
2350 extents = extent_map.size();
2351 } else {
2352 for (unsigned i = si_begin; i < si_end; ++i) {
2353 bytes += shards[i].shard_info->bytes;
2354 extents += shards[i].extents;
2355 }
2356 }
2357 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2358 unsigned slop = target *
2359 cct->_conf->bluestore_extent_map_shard_target_size_slop;
11fdf7f2 2360 unsigned extent_avg = bytes / std::max(1u, extents);
7c673cae
FG
2361 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2362 << ", slop " << slop << dendl;
2363
2364 // reshard
2365 unsigned estimate = 0;
31f18b77 2366 unsigned offset = needs_reshard_begin;
7c673cae
FG
2367 vector<bluestore_onode_t::shard_info> new_shard_info;
2368 unsigned max_blob_end = 0;
2369 Extent dummy(needs_reshard_begin);
2370 for (auto e = extent_map.lower_bound(dummy);
2371 e != extent_map.end();
2372 ++e) {
2373 if (e->logical_offset >= needs_reshard_end) {
2374 break;
2375 }
2376 dout(30) << " extent " << *e << dendl;
2377
2378 // disfavor shard boundaries that span a blob
2379 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2380 if (estimate &&
2381 estimate + extent_avg > target + (would_span ? slop : 0)) {
2382 // new shard
31f18b77 2383 if (offset == needs_reshard_begin) {
7c673cae
FG
2384 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2385 new_shard_info.back().offset = offset;
2386 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2387 << std::dec << dendl;
7c673cae
FG
2388 }
2389 offset = e->logical_offset;
2390 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2391 new_shard_info.back().offset = offset;
2392 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2393 << std::dec << dendl;
2394 estimate = 0;
2395 }
2396 estimate += extent_avg;
31f18b77
FG
2397 unsigned bs = e->blob_start();
2398 if (bs < spanning_scan_begin) {
2399 spanning_scan_begin = bs;
7c673cae
FG
2400 }
2401 uint32_t be = e->blob_end();
2402 if (be > max_blob_end) {
2403 max_blob_end = be;
2404 }
2405 if (be > spanning_scan_end) {
2406 spanning_scan_end = be;
2407 }
2408 }
2409 if (new_shard_info.empty() && (si_begin > 0 ||
2410 si_end < shards.size())) {
2411 // we resharded a partial range; we must produce at least one output
2412 // shard
2413 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2414 new_shard_info.back().offset = needs_reshard_begin;
2415 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2416 << std::dec << " (singleton degenerate case)" << dendl;
2417 }
2418
2419 auto& sv = onode->onode.extent_map_shards;
2420 dout(20) << __func__ << " new " << new_shard_info << dendl;
2421 dout(20) << __func__ << " old " << sv << dendl;
2422 if (sv.empty()) {
2423 // no old shards to keep
2424 sv.swap(new_shard_info);
2425 init_shards(true, true);
2426 } else {
2427 // splice in new shards
2428 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2429 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2430 sv.insert(
2431 sv.begin() + si_begin,
2432 new_shard_info.begin(),
2433 new_shard_info.end());
2434 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2435 si_end = si_begin + new_shard_info.size();
31f18b77 2436
11fdf7f2 2437 ceph_assert(sv.size() == shards.size());
31f18b77
FG
2438
2439 // note that we need to update every shard_info of shards here,
2440 // as sv might have been totally re-allocated above
2441 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2442 shards[i].shard_info = &sv[i];
31f18b77
FG
2443 }
2444
2445 // mark newly added shards as dirty
2446 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2447 shards[i].loaded = true;
2448 shards[i].dirty = true;
2449 }
7c673cae
FG
2450 }
2451 dout(20) << __func__ << " fin " << sv << dendl;
2452 inline_bl.clear();
2453
2454 if (sv.empty()) {
2455 // no more shards; unspan all previously spanning blobs
2456 auto p = spanning_blob_map.begin();
2457 while (p != spanning_blob_map.end()) {
2458 p->second->id = -1;
2459 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2460 p = spanning_blob_map.erase(p);
2461 }
2462 } else {
2463 // identify new spanning blobs
2464 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2465 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2466 if (spanning_scan_begin < needs_reshard_begin) {
2467 fault_range(db, spanning_scan_begin,
2468 needs_reshard_begin - spanning_scan_begin);
2469 }
2470 if (spanning_scan_end > needs_reshard_end) {
2471 fault_range(db, needs_reshard_end,
31f18b77 2472 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2473 }
2474 auto sp = sv.begin() + si_begin;
2475 auto esp = sv.end();
2476 unsigned shard_start = sp->offset;
2477 unsigned shard_end;
2478 ++sp;
2479 if (sp == esp) {
2480 shard_end = OBJECT_MAX_SIZE;
2481 } else {
2482 shard_end = sp->offset;
2483 }
7c673cae
FG
2484 Extent dummy(needs_reshard_begin);
2485 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2486 if (e->logical_offset >= needs_reshard_end) {
2487 break;
2488 }
2489 dout(30) << " extent " << *e << dendl;
2490 while (e->logical_offset >= shard_end) {
2491 shard_start = shard_end;
11fdf7f2 2492 ceph_assert(sp != esp);
7c673cae
FG
2493 ++sp;
2494 if (sp == esp) {
2495 shard_end = OBJECT_MAX_SIZE;
2496 } else {
2497 shard_end = sp->offset;
2498 }
2499 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2500 << " to 0x" << shard_end << std::dec << dendl;
2501 }
2502 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2503 if (!e->blob->is_spanning()) {
2504 // We have two options: (1) split the blob into pieces at the
2505 // shard boundaries (and adjust extents accordingly), or (2)
2506 // mark it spanning. We prefer to cut the blob if we can. Note that
2507 // we may have to split it multiple times--potentially at every
2508 // shard boundary.
2509 bool must_span = false;
2510 BlobRef b = e->blob;
2511 if (b->can_split()) {
2512 uint32_t bstart = e->blob_start();
2513 uint32_t bend = e->blob_end();
2514 for (const auto& sh : shards) {
2515 if (bstart < sh.shard_info->offset &&
2516 bend > sh.shard_info->offset) {
2517 uint32_t blob_offset = sh.shard_info->offset - bstart;
2518 if (b->can_split_at(blob_offset)) {
2519 dout(20) << __func__ << " splitting blob, bstart 0x"
2520 << std::hex << bstart << " blob_offset 0x"
2521 << blob_offset << std::dec << " " << *b << dendl;
2522 b = split_blob(b, blob_offset, sh.shard_info->offset);
2523 // switch b to the new right-hand side, in case it
2524 // *also* has to get split.
2525 bstart += blob_offset;
2526 onode->c->store->logger->inc(l_bluestore_blob_split);
2527 } else {
2528 must_span = true;
2529 break;
2530 }
2531 }
2532 }
2533 } else {
2534 must_span = true;
2535 }
2536 if (must_span) {
31f18b77
FG
2537 auto bid = allocate_spanning_blob_id();
2538 b->id = bid;
7c673cae
FG
2539 spanning_blob_map[b->id] = b;
2540 dout(20) << __func__ << " adding spanning " << *b << dendl;
2541 }
2542 }
2543 } else {
2544 if (e->blob->is_spanning()) {
2545 spanning_blob_map.erase(e->blob->id);
2546 e->blob->id = -1;
2547 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2548 }
2549 }
2550 }
2551 }
2552
2553 clear_needs_reshard();
2554}
2555
2556bool BlueStore::ExtentMap::encode_some(
2557 uint32_t offset,
2558 uint32_t length,
2559 bufferlist& bl,
2560 unsigned *pn)
2561{
2562 auto cct = onode->c->store->cct; //used by dout
2563 Extent dummy(offset);
2564 auto start = extent_map.lower_bound(dummy);
2565 uint32_t end = offset + length;
2566
2567 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2568 // serialization only. Hence there is no specific
2569 // handling at ExtentMap level.
2570
2571 unsigned n = 0;
2572 size_t bound = 0;
7c673cae
FG
2573 bool must_reshard = false;
2574 for (auto p = start;
2575 p != extent_map.end() && p->logical_offset < end;
2576 ++p, ++n) {
11fdf7f2 2577 ceph_assert(p->logical_offset >= offset);
7c673cae
FG
2578 p->blob->last_encoded_id = -1;
2579 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2580 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2581 << std::dec << " hit new spanning blob " << *p << dendl;
2582 request_reshard(p->blob_start(), p->blob_end());
2583 must_reshard = true;
2584 }
31f18b77
FG
2585 if (!must_reshard) {
2586 denc_varint(0, bound); // blobid
2587 denc_varint(0, bound); // logical_offset
2588 denc_varint(0, bound); // len
2589 denc_varint(0, bound); // blob_offset
7c673cae 2590
31f18b77
FG
2591 p->blob->bound_encode(
2592 bound,
2593 struct_v,
2594 p->blob->shared_blob->get_sbid(),
2595 false);
2596 }
7c673cae
FG
2597 }
2598 if (must_reshard) {
2599 return true;
2600 }
2601
31f18b77
FG
2602 denc(struct_v, bound);
2603 denc_varint(0, bound); // number of extents
2604
7c673cae
FG
2605 {
2606 auto app = bl.get_contiguous_appender(bound);
2607 denc(struct_v, app);
2608 denc_varint(n, app);
2609 if (pn) {
2610 *pn = n;
2611 }
2612
2613 n = 0;
2614 uint64_t pos = 0;
2615 uint64_t prev_len = 0;
2616 for (auto p = start;
2617 p != extent_map.end() && p->logical_offset < end;
2618 ++p, ++n) {
2619 unsigned blobid;
2620 bool include_blob = false;
2621 if (p->blob->is_spanning()) {
2622 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2623 blobid |= BLOBID_FLAG_SPANNING;
2624 } else if (p->blob->last_encoded_id < 0) {
2625 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2626 include_blob = true;
2627 blobid = 0; // the decoder will infer the id from n
2628 } else {
2629 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2630 }
2631 if (p->logical_offset == pos) {
2632 blobid |= BLOBID_FLAG_CONTIGUOUS;
2633 }
2634 if (p->blob_offset == 0) {
2635 blobid |= BLOBID_FLAG_ZEROOFFSET;
2636 }
2637 if (p->length == prev_len) {
2638 blobid |= BLOBID_FLAG_SAMELENGTH;
2639 } else {
2640 prev_len = p->length;
2641 }
2642 denc_varint(blobid, app);
2643 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2644 denc_varint_lowz(p->logical_offset - pos, app);
2645 }
2646 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2647 denc_varint_lowz(p->blob_offset, app);
2648 }
2649 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2650 denc_varint_lowz(p->length, app);
2651 }
2652 pos = p->logical_end();
2653 if (include_blob) {
2654 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2655 }
2656 }
2657 }
2658 /*derr << __func__ << bl << dendl;
2659 derr << __func__ << ":";
2660 bl.hexdump(*_dout);
2661 *_dout << dendl;
2662 */
2663 return false;
2664}
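// Encoding layout, for reference: each extent starts with a varint blobid
// whose low BLOBID_SHIFT_BITS are flags -- SPANNING (id refers to the
// spanning-blob table), CONTIGUOUS (logical_offset gap omitted), ZEROOFFSET
// (blob_offset omitted) and SAMELENGTH (length omitted). For example, an
// extent that starts where the previous one ended, at blob_offset 0 and with
// an unchanged length, costs just one varint.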
2665
2666unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2667{
2668 auto cct = onode->c->store->cct; //used by dout
2669 /*
2670 derr << __func__ << ":";
2671 bl.hexdump(*_dout);
2672 *_dout << dendl;
2673 */
2674
11fdf7f2 2675 ceph_assert(bl.get_num_buffers() <= 1);
7c673cae
FG
2676 auto p = bl.front().begin_deep();
2677 __u8 struct_v;
2678 denc(struct_v, p);
2679 // Version 2 differs from v1 in blob's ref_map
2680 // serialization only. Hence there is no specific
2681 // handling at ExtentMap level below.
11fdf7f2 2682 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2683
2684 uint32_t num;
2685 denc_varint(num, p);
2686 vector<BlobRef> blobs(num);
2687 uint64_t pos = 0;
2688 uint64_t prev_len = 0;
2689 unsigned n = 0;
2690
2691 while (!p.end()) {
2692 Extent *le = new Extent();
2693 uint64_t blobid;
2694 denc_varint(blobid, p);
2695 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2696 uint64_t gap;
2697 denc_varint_lowz(gap, p);
2698 pos += gap;
2699 }
2700 le->logical_offset = pos;
2701 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2702 denc_varint_lowz(le->blob_offset, p);
2703 } else {
2704 le->blob_offset = 0;
2705 }
2706 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2707 denc_varint_lowz(prev_len, p);
2708 }
2709 le->length = prev_len;
2710
2711 if (blobid & BLOBID_FLAG_SPANNING) {
2712 dout(30) << __func__ << " getting spanning blob "
2713 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2714 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2715 } else {
2716 blobid >>= BLOBID_SHIFT_BITS;
2717 if (blobid) {
2718 le->assign_blob(blobs[blobid - 1]);
11fdf7f2 2719 ceph_assert(le->blob);
7c673cae
FG
2720 } else {
2721 Blob *b = new Blob();
2722 uint64_t sbid = 0;
2723 b->decode(onode->c, p, struct_v, &sbid, false);
2724 blobs[n] = b;
2725 onode->c->open_shared_blob(sbid, b);
2726 le->assign_blob(b);
2727 }
2728 // we build ref_map dynamically for non-spanning blobs
2729 le->blob->get_ref(
2730 onode->c,
2731 le->blob_offset,
2732 le->length);
2733 }
2734 pos += prev_len;
2735 ++n;
2736 extent_map.insert(*le);
2737 }
2738
11fdf7f2 2739 ceph_assert(n == num);
7c673cae
FG
2740 return num;
2741}
2742
2743void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2744{
2745 // Version 2 differs from v1 in blob's ref_map
2746 // serialization only. Hence there is no specific
2747 // handling at ExtentMap level.
2748 __u8 struct_v = 2;
2749
2750 denc(struct_v, p);
2751 denc_varint((uint32_t)0, p);
2752 size_t key_size = 0;
2753 denc_varint((uint32_t)0, key_size);
2754 p += spanning_blob_map.size() * key_size;
2755 for (const auto& i : spanning_blob_map) {
2756 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2757 }
2758}
2759
2760void BlueStore::ExtentMap::encode_spanning_blobs(
2761 bufferlist::contiguous_appender& p)
2762{
2763 // Version 2 differs from v1 in blob's ref_map
2764 // serialization only. Hence there is no specific
2765 // handling at ExtentMap level.
2766 __u8 struct_v = 2;
2767
2768 denc(struct_v, p);
2769 denc_varint(spanning_blob_map.size(), p);
2770 for (auto& i : spanning_blob_map) {
2771 denc_varint(i.second->id, p);
2772 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2773 }
2774}
2775
2776void BlueStore::ExtentMap::decode_spanning_blobs(
11fdf7f2 2777 bufferptr::const_iterator& p)
7c673cae
FG
2778{
2779 __u8 struct_v;
2780 denc(struct_v, p);
2781 // Version 2 differs from v1 in blob's ref_map
2782 // serialization only. Hence there is no specific
2783 // handling at ExtentMap level.
11fdf7f2 2784 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2785
2786 unsigned n;
2787 denc_varint(n, p);
2788 while (n--) {
2789 BlobRef b(new Blob());
2790 denc_varint(b->id, p);
2791 spanning_blob_map[b->id] = b;
2792 uint64_t sbid = 0;
2793 b->decode(onode->c, p, struct_v, &sbid, true);
2794 onode->c->open_shared_blob(sbid, b);
2795 }
2796}
2797
2798void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2799{
2800 shards.resize(onode->onode.extent_map_shards.size());
2801 unsigned i = 0;
2802 for (auto &s : onode->onode.extent_map_shards) {
2803 shards[i].shard_info = &s;
2804 shards[i].loaded = loaded;
2805 shards[i].dirty = dirty;
2806 ++i;
2807 }
2808}
2809
2810void BlueStore::ExtentMap::fault_range(
2811 KeyValueDB *db,
2812 uint32_t offset,
2813 uint32_t length)
2814{
2815 auto cct = onode->c->store->cct; //used by dout
2816 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2817 << std::dec << dendl;
2818 auto start = seek_shard(offset);
2819 auto last = seek_shard(offset + length);
2820
2821 if (start < 0)
2822 return;
2823
11fdf7f2 2824 ceph_assert(last >= start);
7c673cae
FG
2825 string key;
2826 while (start <= last) {
11fdf7f2 2827 ceph_assert((size_t)start < shards.size());
7c673cae
FG
2828 auto p = &shards[start];
2829 if (!p->loaded) {
2830 dout(30) << __func__ << " opening shard 0x" << std::hex
2831 << p->shard_info->offset << std::dec << dendl;
2832 bufferlist v;
2833 generate_extent_shard_key_and_apply(
2834 onode->key, p->shard_info->offset, &key,
2835 [&](const string& final_key) {
2836 int r = db->get(PREFIX_OBJ, final_key, &v);
2837 if (r < 0) {
2838 derr << __func__ << " missing shard 0x" << std::hex
2839 << p->shard_info->offset << std::dec << " for " << onode->oid
2840 << dendl;
11fdf7f2 2841 ceph_assert(r >= 0);
7c673cae
FG
2842 }
2843 }
2844 );
2845 p->extents = decode_some(v);
2846 p->loaded = true;
2847 dout(20) << __func__ << " open shard 0x" << std::hex
81eedcae
TL
2848 << p->shard_info->offset
2849 << " for range 0x" << offset << "~" << length << std::dec
7c673cae 2850 << " (" << v.length() << " bytes)" << dendl;
11fdf7f2
TL
2851 ceph_assert(p->dirty == false);
2852 ceph_assert(v.length() == p->shard_info->bytes);
7c673cae
FG
2853 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2854 } else {
2855 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2856 }
2857 ++start;
2858 }
2859}
2860
2861void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
2862 uint32_t offset,
2863 uint32_t length)
2864{
2865 auto cct = onode->c->store->cct; //used by dout
2866 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2867 << std::dec << dendl;
2868 if (shards.empty()) {
2869 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2870 inline_bl.clear();
2871 return;
2872 }
2873 auto start = seek_shard(offset);
11fdf7f2
TL
2874 if (length == 0) {
2875 length = 1;
2876 }
2877 auto last = seek_shard(offset + length - 1);
7c673cae
FG
2878 if (start < 0)
2879 return;
2880
11fdf7f2 2881 ceph_assert(last >= start);
7c673cae 2882 while (start <= last) {
11fdf7f2 2883 ceph_assert((size_t)start < shards.size());
7c673cae
FG
2884 auto p = &shards[start];
2885 if (!p->loaded) {
11fdf7f2
TL
 2886 derr << __func__ << " on write 0x" << std::hex << offset
2887 << "~" << length << " shard 0x" << p->shard_info->offset
2888 << std::dec << " is not loaded, can't mark dirty" << dendl;
2889 ceph_abort_msg("can't mark unloaded shard dirty");
7c673cae
FG
2890 }
2891 if (!p->dirty) {
2892 dout(20) << __func__ << " mark shard 0x" << std::hex
2893 << p->shard_info->offset << std::dec << " dirty" << dendl;
2894 p->dirty = true;
2895 }
2896 ++start;
2897 }
2898}
2899
2900BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2901 uint64_t offset)
2902{
2903 Extent dummy(offset);
2904 return extent_map.find(dummy);
2905}
2906
7c673cae
FG
2907BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2908 uint64_t offset)
2909{
2910 Extent dummy(offset);
2911 auto fp = extent_map.lower_bound(dummy);
2912 if (fp != extent_map.begin()) {
2913 --fp;
2914 if (fp->logical_end() <= offset) {
2915 ++fp;
2916 }
2917 }
2918 return fp;
2919}
2920
2921BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2922 uint64_t offset) const
2923{
2924 Extent dummy(offset);
2925 auto fp = extent_map.lower_bound(dummy);
2926 if (fp != extent_map.begin()) {
2927 --fp;
2928 if (fp->logical_end() <= offset) {
2929 ++fp;
2930 }
2931 }
2932 return fp;
2933}
2934
2935bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2936{
2937 auto fp = seek_lextent(offset);
2938 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2939 return false;
2940 }
2941 return true;
2942}
2943
2944int BlueStore::ExtentMap::compress_extent_map(
2945 uint64_t offset,
2946 uint64_t length)
2947{
2948 auto cct = onode->c->store->cct; //used by dout
2949 if (extent_map.empty())
2950 return 0;
2951 int removed = 0;
2952 auto p = seek_lextent(offset);
2953 if (p != extent_map.begin()) {
2954 --p; // start to the left of offset
2955 }
2956 // the caller should have just written to this region
11fdf7f2 2957 ceph_assert(p != extent_map.end());
7c673cae
FG
2958
2959 // identify the *next* shard
2960 auto pshard = shards.begin();
2961 while (pshard != shards.end() &&
2962 p->logical_offset >= pshard->shard_info->offset) {
2963 ++pshard;
2964 }
2965 uint64_t shard_end;
2966 if (pshard != shards.end()) {
2967 shard_end = pshard->shard_info->offset;
2968 } else {
2969 shard_end = OBJECT_MAX_SIZE;
2970 }
2971
2972 auto n = p;
2973 for (++n; n != extent_map.end(); p = n++) {
2974 if (n->logical_offset > offset + length) {
2975 break; // stop after end
2976 }
2977 while (n != extent_map.end() &&
2978 p->logical_end() == n->logical_offset &&
2979 p->blob == n->blob &&
2980 p->blob_offset + p->length == n->blob_offset &&
2981 n->logical_offset < shard_end) {
2982 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2983 << " next shard 0x" << shard_end << std::dec
2984 << " merging " << *p << " and " << *n << dendl;
2985 p->length += n->length;
2986 rm(n++);
2987 ++removed;
2988 }
2989 if (n == extent_map.end()) {
2990 break;
2991 }
2992 if (n->logical_offset >= shard_end) {
11fdf7f2 2993 ceph_assert(pshard != shards.end());
7c673cae
FG
2994 ++pshard;
2995 if (pshard != shards.end()) {
2996 shard_end = pshard->shard_info->offset;
2997 } else {
2998 shard_end = OBJECT_MAX_SIZE;
2999 }
3000 }
3001 }
11fdf7f2 3002 if (removed) {
7c673cae
FG
3003 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3004 }
3005 return removed;
3006}
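// Illustrative example (hypothetical extents): 0x0~0x1000 and 0x1000~0x1000
// pointing at the same blob with blob_offset 0x0 and 0x1000 merge into a
// single 0x0~0x2000 extent, provided no shard boundary falls at 0x1000;
// merges never cross shard_end, so shard bookkeeping stays valid.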
3007
3008void BlueStore::ExtentMap::punch_hole(
3009 CollectionRef &c,
3010 uint64_t offset,
3011 uint64_t length,
3012 old_extent_map_t *old_extents)
3013{
3014 auto p = seek_lextent(offset);
3015 uint64_t end = offset + length;
3016 while (p != extent_map.end()) {
3017 if (p->logical_offset >= end) {
3018 break;
3019 }
3020 if (p->logical_offset < offset) {
3021 if (p->logical_end() > end) {
3022 // split and deref middle
3023 uint64_t front = offset - p->logical_offset;
3024 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3025 length, p->blob);
3026 old_extents->push_back(*oe);
3027 add(end,
3028 p->blob_offset + front + length,
3029 p->length - front - length,
3030 p->blob);
3031 p->length = front;
3032 break;
3033 } else {
3034 // deref tail
11fdf7f2 3035 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
7c673cae
FG
3036 uint64_t keep = offset - p->logical_offset;
3037 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3038 p->length - keep, p->blob);
3039 old_extents->push_back(*oe);
3040 p->length = keep;
3041 ++p;
3042 continue;
3043 }
3044 }
3045 if (p->logical_offset + p->length <= end) {
3046 // deref whole lextent
3047 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3048 p->length, p->blob);
3049 old_extents->push_back(*oe);
3050 rm(p++);
3051 continue;
3052 }
3053 // deref head
3054 uint64_t keep = p->logical_end() - end;
3055 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3056 p->length - keep, p->blob);
3057 old_extents->push_back(*oe);
3058
3059 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3060 rm(p);
3061 break;
3062 }
3063}
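// punch_hole() handles four cases above: an extent straddling the whole hole
// is split in two with the middle deref'd; an extent overhanging only the
// start keeps its head; an extent fully inside the hole is removed outright;
// and an extent overhanging the end keeps its tail, re-added at 'end'. All
// deref'd pieces land in old_extents for later cleanup/space release.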
3064
3065BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3066 CollectionRef &c,
3067 uint64_t logical_offset,
3068 uint64_t blob_offset, uint64_t length, BlobRef b,
3069 old_extent_map_t *old_extents)
3070{
 3071 // The Blob must be completely initialized before we increment its ref counters.
11fdf7f2 3072 ceph_assert(b->get_blob().get_logical_length() != 0);
7c673cae
FG
3073
 3074 // Do get_ref prior to punch_hole to avoid putting a reused blob into the
 3075 // old_extents list if we overwrite the blob completely.
 3076 // This might happen during a WAL overwrite.
3077 b->get_ref(onode->c, blob_offset, length);
3078
3079 if (old_extents) {
3080 punch_hole(c, logical_offset, length, old_extents);
3081 }
3082
3083 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3084 extent_map.insert(*le);
3085 if (spans_shard(logical_offset, length)) {
3086 request_reshard(logical_offset, logical_offset + length);
3087 }
3088 return le;
3089}
3090
3091BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3092 BlobRef lb,
3093 uint32_t blob_offset,
3094 uint32_t pos)
3095{
3096 auto cct = onode->c->store->cct; //used by dout
3097
3098 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3099 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3100 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3101 << dendl;
3102 BlobRef rb = onode->c->new_blob();
3103 lb->split(onode->c, blob_offset, rb.get());
3104
3105 for (auto ep = seek_lextent(pos);
3106 ep != extent_map.end() && ep->logical_offset < end_pos;
3107 ++ep) {
3108 if (ep->blob != lb) {
3109 continue;
3110 }
3111 if (ep->logical_offset < pos) {
3112 // split extent
3113 size_t left = pos - ep->logical_offset;
3114 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3115 extent_map.insert(*ne);
3116 ep->length = left;
3117 dout(30) << __func__ << " split " << *ep << dendl;
3118 dout(30) << __func__ << " to " << *ne << dendl;
3119 } else {
3120 // switch blob
11fdf7f2 3121 ceph_assert(ep->blob_offset >= blob_offset);
7c673cae
FG
3122
3123 ep->blob = rb;
3124 ep->blob_offset -= blob_offset;
3125 dout(30) << __func__ << " adjusted " << *ep << dendl;
3126 }
3127 }
3128 return rb;
3129}
3130
3131// Onode
3132
3133#undef dout_prefix
3134#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3135
eafe8130
TL
3136BlueStore::Onode* BlueStore::Onode::decode(
3137 CollectionRef c,
3138 const ghobject_t& oid,
3139 const string& key,
3140 const bufferlist& v)
3141{
3142 Onode* on = new Onode(c.get(), oid, key);
3143 on->exists = true;
3144 auto p = v.front().begin_deep();
3145 on->onode.decode(p);
3146 for (auto& i : on->onode.attrs) {
3147 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3148 }
3149
3150 // initialize extent_map
3151 on->extent_map.decode_spanning_blobs(p);
3152 if (on->onode.extent_map_shards.empty()) {
3153 denc(on->extent_map.inline_bl, p);
3154 on->extent_map.decode_some(on->extent_map.inline_bl);
3155 on->extent_map.inline_bl.reassign_to_mempool(
3156 mempool::mempool_bluestore_cache_other);
3157 }
3158 else {
3159 on->extent_map.init_shards(false, false);
3160 }
3161 return on;
3162}
3163
7c673cae
FG
3164void BlueStore::Onode::flush()
3165{
3166 if (flushing_count.load()) {
3167 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
11fdf7f2 3168 std::unique_lock l(flush_lock);
7c673cae
FG
3169 while (flushing_count.load()) {
3170 flush_cond.wait(l);
3171 }
3172 }
3173 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3174}
3175
3176// =======================================================
3177// WriteContext
3178
3179/// Checks for writes to the same pextent within a blob
3180bool BlueStore::WriteContext::has_conflict(
3181 BlobRef b,
3182 uint64_t loffs,
3183 uint64_t loffs_end,
3184 uint64_t min_alloc_size)
3185{
11fdf7f2
TL
3186 ceph_assert((loffs % min_alloc_size) == 0);
3187 ceph_assert((loffs_end % min_alloc_size) == 0);
7c673cae
FG
3188 for (auto w : writes) {
3189 if (b == w.b) {
11fdf7f2
TL
3190 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3191 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
7c673cae 3192 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 3193 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
3194 return true;
3195 }
3196 }
3197 }
3198 return false;
3199}
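// Illustrative example (hypothetical offsets, min_alloc_size=0x1000): a
// queued write at logical 0x1800~0x400 on blob b occupies the aligned range
// [0x1000, 0x2000), so a new write covering [0x1000, 0x2000) on the same blob
// conflicts, while one covering [0x2000, 0x3000) does not.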
3200
3201// =======================================================
3202
3203// DeferredBatch
3204#undef dout_prefix
3205#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3206
3207void BlueStore::DeferredBatch::prepare_write(
3208 CephContext *cct,
3209 uint64_t seq, uint64_t offset, uint64_t length,
3210 bufferlist::const_iterator& blp)
3211{
3212 _discard(cct, offset, length);
3213 auto i = iomap.insert(make_pair(offset, deferred_io()));
11fdf7f2 3214 ceph_assert(i.second); // this should be a new insertion
7c673cae
FG
3215 i.first->second.seq = seq;
3216 blp.copy(length, i.first->second.bl);
31f18b77
FG
3217 i.first->second.bl.reassign_to_mempool(
3218 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3219 dout(20) << __func__ << " seq " << seq
3220 << " 0x" << std::hex << offset << "~" << length
3221 << " crc " << i.first->second.bl.crc32c(-1)
3222 << std::dec << dendl;
3223 seq_bytes[seq] += length;
3224#ifdef DEBUG_DEFERRED
3225 _audit(cct);
3226#endif
3227}
3228
3229void BlueStore::DeferredBatch::_discard(
3230 CephContext *cct, uint64_t offset, uint64_t length)
3231{
3232 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3233 << std::dec << dendl;
3234 auto p = iomap.lower_bound(offset);
3235 if (p != iomap.begin()) {
3236 --p;
3237 auto end = p->first + p->second.bl.length();
3238 if (end > offset) {
3239 bufferlist head;
3240 head.substr_of(p->second.bl, 0, offset - p->first);
3241 dout(20) << __func__ << " keep head " << p->second.seq
3242 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3243 << " -> 0x" << head.length() << std::dec << dendl;
3244 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3245 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3246 if (end > offset + length) {
3247 bufferlist tail;
3248 tail.substr_of(p->second.bl, offset + length - p->first,
3249 end - (offset + length));
3250 dout(20) << __func__ << " keep tail " << p->second.seq
3251 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3252 << " -> 0x" << tail.length() << std::dec << dendl;
3253 auto &n = iomap[offset + length];
3254 n.bl.swap(tail);
3255 n.seq = p->second.seq;
3256 i->second -= length;
3257 } else {
3258 i->second -= end - offset;
3259 }
11fdf7f2 3260 ceph_assert(i->second >= 0);
7c673cae
FG
3261 p->second.bl.swap(head);
3262 }
3263 ++p;
3264 }
3265 while (p != iomap.end()) {
3266 if (p->first >= offset + length) {
3267 break;
3268 }
3269 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3270 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3271 auto end = p->first + p->second.bl.length();
3272 if (end > offset + length) {
3273 unsigned drop_front = offset + length - p->first;
3274 unsigned keep_tail = end - (offset + length);
3275 dout(20) << __func__ << " truncate front " << p->second.seq
3276 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3277 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3278 << " to 0x" << (offset + length) << "~" << keep_tail
3279 << std::dec << dendl;
3280 auto &s = iomap[offset + length];
3281 s.seq = p->second.seq;
3282 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3283 i->second -= drop_front;
3284 } else {
3285 dout(20) << __func__ << " drop " << p->second.seq
3286 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3287 << std::dec << dendl;
3288 i->second -= p->second.bl.length();
3289 }
11fdf7f2 3290 ceph_assert(i->second >= 0);
7c673cae
FG
3291 p = iomap.erase(p);
3292 }
3293}
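// Illustrative example (hypothetical ranges): discarding 0x2000~0x1000 from a
// queued io at 0x1000~0x4000 keeps a 0x1000-byte head at 0x1000, re-inserts
// the 0x2000-byte tail at 0x3000 under the same seq, and debits the 0x1000
// discarded bytes from that seq's entry in seq_bytes.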
3294
3295void BlueStore::DeferredBatch::_audit(CephContext *cct)
3296{
3297 map<uint64_t,int> sb;
3298 for (auto p : seq_bytes) {
3299 sb[p.first] = 0; // make sure we have the same set of keys
3300 }
3301 uint64_t pos = 0;
3302 for (auto& p : iomap) {
11fdf7f2 3303 ceph_assert(p.first >= pos);
7c673cae
FG
3304 sb[p.second.seq] += p.second.bl.length();
3305 pos = p.first + p.second.bl.length();
3306 }
11fdf7f2 3307 ceph_assert(sb == seq_bytes);
7c673cae
FG
3308}
3309
3310
3311// Collection
3312
3313#undef dout_prefix
3314#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3315
11fdf7f2
TL
3316BlueStore::Collection::Collection(BlueStore *store_, Cache *c, coll_t cid)
3317 : CollectionImpl(cid),
3318 store(store_),
7c673cae 3319 cache(c),
7c673cae
FG
3320 lock("BlueStore::Collection::lock", true, false),
3321 exists(true),
11fdf7f2
TL
3322 onode_map(c),
3323 commit_queue(nullptr)
3324{
3325}
3326
3327bool BlueStore::Collection::flush_commit(Context *c)
3328{
3329 return osr->flush_commit(c);
3330}
3331
3332void BlueStore::Collection::flush()
3333{
3334 osr->flush();
3335}
3336
3337void BlueStore::Collection::flush_all_but_last()
7c673cae 3338{
11fdf7f2 3339 osr->flush_all_but_last();
7c673cae
FG
3340}
3341
3342void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3343{
11fdf7f2 3344 ceph_assert(!b->shared_blob);
7c673cae
FG
3345 const bluestore_blob_t& blob = b->get_blob();
3346 if (!blob.is_shared()) {
3347 b->shared_blob = new SharedBlob(this);
3348 return;
3349 }
3350
3351 b->shared_blob = shared_blob_set.lookup(sbid);
3352 if (b->shared_blob) {
3353 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3354 << std::dec << " had " << *b->shared_blob << dendl;
3355 } else {
3356 b->shared_blob = new SharedBlob(sbid, this);
3357 shared_blob_set.add(this, b->shared_blob.get());
3358 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3359 << std::dec << " opened " << *b->shared_blob
3360 << dendl;
3361 }
3362}
3363
3364void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3365{
3366 if (!sb->is_loaded()) {
3367
3368 bufferlist v;
3369 string key;
3370 auto sbid = sb->get_sbid();
3371 get_shared_blob_key(sbid, &key);
3372 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3373 if (r < 0) {
3374 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3375 << std::dec << " not found at key "
3376 << pretty_binary_string(key) << dendl;
11fdf7f2 3377 ceph_abort_msg("uh oh, missing shared_blob");
7c673cae
FG
3378 }
3379
3380 sb->loaded = true;
3381 sb->persistent = new bluestore_shared_blob_t(sbid);
11fdf7f2
TL
3382 auto p = v.cbegin();
3383 decode(*(sb->persistent), p);
7c673cae
FG
3384 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3385 << std::dec << " loaded shared_blob " << *sb << dendl;
3386 }
3387}
3388
3389void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3390{
7c673cae 3391 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
11fdf7f2 3392 ceph_assert(!b->shared_blob->is_loaded());
7c673cae
FG
3393
3394 // update blob
31f18b77 3395 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 3396 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
3397
3398 // update shared blob
3399 b->shared_blob->loaded = true;
3400 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3401 shared_blob_set.add(this, b->shared_blob.get());
3402 for (auto p : blob.get_extents()) {
3403 if (p.is_valid()) {
3404 b->shared_blob->get_ref(
3405 p.offset,
3406 p.length);
3407 }
3408 }
3409 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3410}
3411
31f18b77
FG
3412uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3413{
3414 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
11fdf7f2 3415 ceph_assert(sb->is_loaded());
31f18b77
FG
3416
3417 uint64_t sbid = sb->get_sbid();
3418 shared_blob_set.remove(sb);
3419 sb->loaded = false;
3420 delete sb->persistent;
3421 sb->sbid_unloaded = 0;
3422 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3423 return sbid;
3424}
3425
7c673cae
FG
3426BlueStore::OnodeRef BlueStore::Collection::get_onode(
3427 const ghobject_t& oid,
3428 bool create)
3429{
11fdf7f2 3430 ceph_assert(create ? lock.is_wlocked() : lock.is_locked());
7c673cae
FG
3431
3432 spg_t pgid;
3433 if (cid.is_pg(&pgid)) {
3434 if (!oid.match(cnode.bits, pgid.ps())) {
3435 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3436 << pgid << " bits " << cnode.bits << dendl;
3437 ceph_abort();
3438 }
3439 }
3440
3441 OnodeRef o = onode_map.lookup(oid);
3442 if (o)
3443 return o;
3444
eafe8130 3445 string key;
7c673cae
FG
3446 get_object_key(store->cct, oid, &key);
3447
3448 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3449 << pretty_binary_string(key) << dendl;
3450
3451 bufferlist v;
3452 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3453 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3454 Onode *on;
3455 if (v.length() == 0) {
11fdf7f2 3456 ceph_assert(r == -ENOENT);
7c673cae
FG
3457 if (!store->cct->_conf->bluestore_debug_misc &&
3458 !create)
3459 return OnodeRef();
3460
3461 // new object, new onode
3462 on = new Onode(this, oid, key);
3463 } else {
3464 // loaded
11fdf7f2 3465 ceph_assert(r >= 0);
eafe8130 3466 on = Onode::decode(this, oid, key, v);
7c673cae
FG
3467 }
3468 o.reset(on);
3469 return onode_map.add(oid, o);
3470}
3471
3472void BlueStore::Collection::split_cache(
3473 Collection *dest)
3474{
3475 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3476
3477 // lock (one or both) cache shards
3478 std::lock(cache->lock, dest->cache->lock);
11fdf7f2
TL
3479 std::lock_guard l(cache->lock, std::adopt_lock);
3480 std::lock_guard l2(dest->cache->lock, std::adopt_lock);
7c673cae
FG
3481
3482 int destbits = dest->cnode.bits;
3483 spg_t destpg;
3484 bool is_pg = dest->cid.is_pg(&destpg);
11fdf7f2 3485 ceph_assert(is_pg);
7c673cae
FG
3486
3487 auto p = onode_map.onode_map.begin();
3488 while (p != onode_map.onode_map.end()) {
11fdf7f2 3489 OnodeRef o = p->second;
7c673cae
FG
3490 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3491 // onode does not belong to this child
11fdf7f2
TL
3492 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
3493 << dendl;
7c673cae
FG
3494 ++p;
3495 } else {
7c673cae
FG
3496 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3497 << dendl;
3498
3499 cache->_rm_onode(p->second);
3500 p = onode_map.onode_map.erase(p);
3501
3502 o->c = dest;
3503 dest->cache->_add_onode(o, 1);
3504 dest->onode_map.onode_map[o->oid] = o;
3505 dest->onode_map.cache = dest->cache;
3506
3507 // move over shared blobs and buffers. cover shared blobs from
3508 // both extent map and spanning blob map (the full extent map
3509 // may not be faulted in)
3510 vector<SharedBlob*> sbvec;
3511 for (auto& e : o->extent_map.extent_map) {
3512 sbvec.push_back(e.blob->shared_blob.get());
3513 }
3514 for (auto& b : o->extent_map.spanning_blob_map) {
3515 sbvec.push_back(b.second->shared_blob.get());
3516 }
3517 for (auto sb : sbvec) {
3518 if (sb->coll == dest) {
3519 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3520 << dendl;
3521 continue;
3522 }
3523 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
31f18b77
FG
3524 if (sb->get_sbid()) {
3525 ldout(store->cct, 20) << __func__
3526 << " moving registration " << *sb << dendl;
3527 shared_blob_set.remove(sb);
3528 dest->shared_blob_set.add(dest, sb);
3529 }
3efd9988 3530 sb->coll = dest;
7c673cae 3531 if (dest->cache != cache) {
7c673cae
FG
3532 for (auto& i : sb->bc.buffer_map) {
3533 if (!i.second->is_writing()) {
3534 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3535 << dendl;
3536 dest->cache->_move_buffer(cache, i.second.get());
3537 }
3538 }
3539 }
3540 }
7c673cae
FG
3541 }
3542 }
3543}
3544
7c673cae
FG
3545// =======================================================
3546
91327a77
AA
3547// MempoolThread
3548
3549#undef dout_prefix
3550#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
3551
7c673cae
FG
3552void *BlueStore::MempoolThread::entry()
3553{
11fdf7f2
TL
3554 std::unique_lock l(lock);
3555
92f5a8d4 3556 uint32_t prev_config_change = store->config_changed.load();
eafe8130
TL
3557 uint64_t base = store->osd_memory_base;
3558 double fragmentation = store->osd_memory_expected_fragmentation;
3559 uint64_t target = store->osd_memory_target;
3560 uint64_t min = store->osd_memory_cache_min;
3561 uint64_t max = min;
3562
3563 // When setting the maximum amount of memory to use for cache, first
3564 // assume some base amount of memory for the OSD and then fudge in
3565 // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  binned_kv_cache = store->db->get_priority_cache();
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true);
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;

    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      _adjust_cache_settings();

      // Log events at 5 instead of 20 when balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    // Now trim
    _trim_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  stop = false;
  return NULL;
}

void BlueStore::MempoolThread::_adjust_cache_settings()
{
  if (binned_kv_cache != nullptr) {
    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
  }
  meta_cache->set_cache_ratio(store->cache_meta_ratio);
  data_cache->set_cache_ratio(store->cache_data_ratio);
}

void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
{
  auto cct = store->cct;
  size_t num_shards = store->cache_shards.size();

  int64_t kv_used = store->db->get_cache_usage();
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
  }

  if (interval_stats) {
    ldout(cct, 5) << __func__ << " cache_size: " << cache_size
                  << " kv_alloc: " << kv_alloc
                  << " kv_used: " << kv_used
                  << " meta_alloc: " << meta_alloc
                  << " meta_used: " << meta_used
                  << " data_alloc: " << data_alloc
                  << " data_used: " << data_used << dendl;
  } else {
    ldout(cct, 20) << __func__ << " cache_size: " << cache_size
                   << " kv_alloc: " << kv_alloc
                   << " kv_used: " << kv_used
                   << " meta_alloc: " << meta_alloc
                   << " meta_used: " << meta_used
                   << " data_alloc: " << data_alloc
                   << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode());
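  // Illustrative example (assumed numbers): with meta_alloc = 2 GiB,
  // num_shards = 4 and roughly 5 KiB per onode, each shard would be capped
  // at about (2 GiB / 4) / 5 KiB ~= 100k onodes.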
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);

  ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
                 << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->cache_shards) {
    i->trim(max_shard_onodes, max_shard_buffer);
  }
}

void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  auto cct = store->cct;
  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  ldout(cct, 5) << __func__ << " updated pcm target: " << target
                << " pcm min: " << min
                << " pcm max: " << max
                << dendl;
}

// =======================================================

// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
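    // All omap keys for this onode share a prefix derived from its nid.
    // head (nid + empty user key) and tail bracket that keyspace, so
    // iteration can be bounded without touching other onodes' omap data
    // (see valid(), which stops strictly before tail).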
    get_omap_key(o->onode.nid, string(), &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
  }
}

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  RWLock::RLocker l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  RWLock::RLocker l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    get_omap_key(o->onode.nid, after, &key);
    ldout(c->store->cct, 20) << __func__ << " after " << after << " key "
                             << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after + _stringify();
    }
  );
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  RWLock::RLocker l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    get_omap_key(o->onode.nid, to, &key);
    ldout(c->store->cct, 20) << __func__ << " to " << to << " key "
                             << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to + _stringify();
    }
  );
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  RWLock::RLocker l(c->lock);
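  // the iterator only counts as valid while the raw key still falls inside
  // this onode's omap keyspace, i.e. strictly before the tail bound set up
  // in the constructor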
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct, 20) << __func__ << " is at "
                             << pretty_binary_string(it->raw_key().second)
                             << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  RWLock::RLocker l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  RWLock::RLocker l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  decode_omap_key(db_key, &user_key);

  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  RWLock::RLocker l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}


// =====================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "


static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

static void discard_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  store->handle_discard(*tmp);
}

void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << dendl;
  ceph_assert(alloc);
  alloc->release(to_release);
}

BlueStore::BlueStore(CephContext *cct, const string& path)
  : ObjectStore(cct, path),
    throttle_bytes(cct, "bluestore_throttle_bytes",
                   cct->_conf->bluestore_throttle_bytes),
    throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                            cct->_conf->bluestore_throttle_bytes +
                            cct->_conf->bluestore_throttle_deferred_bytes),
    deferred_finisher(cct, "defered_finisher", "dfin"),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::BlueStore(CephContext *cct,
  const string& path,
  uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle_bytes(cct, "bluestore_throttle_bytes",
                   cct->_conf->bluestore_throttle_bytes),
    throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                            cct->_conf->bluestore_throttle_bytes +
                            cct->_conf->bluestore_throttle_deferred_bytes),
    deferred_finisher(cct, "defered_finisher", "dfin"),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);
  _shutdown_logger();
  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : cache_shards) {
    delete i;
  }
  cache_shards.clear();
}

const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_warn_on_legacy_statfs",
    NULL
  };
  return KEYS;
}

void BlueStore::handle_conf_change(const ConfigProxy& conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes")) {
    throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
    throttle_deferred_bytes.reset_max(
      conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
  }
  if (changed.count("bluestore_throttle_deferred_bytes")) {
    throttle_deferred_bytes.reset_max(
      conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}

void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    ceph_assert(bdev);
    if (bdev->is_rotational()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (bdev->is_rotational()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str()
           << " compressor" << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << " min_blob " << comp_min_blob_size
           << " max_blob " << comp_max_blob_size
           << dendl;
}

void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}

void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    ceph_assert(bdev);
    if (bdev->is_rotational()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
           << dendl;
}

void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (bdev->is_rotational()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}

void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  config_changed++;
  dout(10) << __func__
           << " osd_memory_target " << osd_memory_target
           << " osd_memory_base " << osd_memory_base
           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
           << " osd_memory_cache_min " << osd_memory_cache_min
           << dendl;
}

int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (bdev->is_rotational()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }

  cache_data_ratio =
    (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }
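  // e.g. (illustrative ratios, not necessarily the defaults): meta 0.45
  // plus kv 0.45 leaves cache_data_ratio = 0.10 of cache_size for data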

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}

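// Store meta is mirrored into the main block device label when the label
// can be read; both paths also keep the file-based ObjectStore meta as the
// fallback, so the two copies stay consistent.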
int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}

void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
                        l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
                 "Average kv_thread flush latency",
                 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
                 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
                 "Average kv_sync thread latency",
                 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
                 "Average kv_finalize thread latency",
                 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
                 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
                 "Average aio_wait state latency",
                 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
                 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
                 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
                 "Average kv_commiting state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
                 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
                 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
                 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
                 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
                 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
                 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
                 "Average submit throttle latency",
                 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
                 "Average submit latency",
                 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
                 "Average commit latency",
                 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
                 "Average read latency",
                 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
                 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
                 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
                 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
                 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
                 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
                    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
                    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
                    "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
                    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
                    "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
                    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
            "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
            "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
            "Sum for stored compressed bytes",
            "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
            "Sum for bytes allocated for compressed data",
            "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
            "Sum for original bytes that were compressed",
            "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
            "Number of onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
                    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
                    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
                    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
                    "bluestore_onode_shard_misses",
                    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
            "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
            "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
            "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
            "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
                    "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
                    "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
                    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
                    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
                    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
                    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
                    "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
                    "bluestore_write_small_unused",
                    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_small_deferred,
                    "bluestore_write_small_deferred",
                    "Small overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
                    "bluestore_write_small_pre_read",
                    "Small writes that required we read some data (possibly "
                    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
                    "Small write into new (sparse) blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
                    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
                    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
                    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
                    "Sum for extents that have been merged due to garbage "
                    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
                    "Read EIO errors propagated to high level callers");
  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
                    "Read operations that required at least one retry due to failed checksum validation");
  b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
            "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
                 "Average omap iterator seek_to_first call latency");
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
                 "Average omap iterator upper_bound call latency");
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
                 "Average omap iterator lower_bound call latency");
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
                 "Average omap iterator next call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
                 "Average collection listing latency");
  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}

int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}

int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}

int BlueStore::_open_path()
{
  // sanity check(s)
  auto osd_max_object_size =
    cct->_conf.get_val<Option::size_t>("osd_max_object_size");
  if (osd_max_object_size >= (size_t)OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size >= 0x" << std::hex << OBJECT_MAX_SIZE
         << "; BlueStore has hard limit of 0x" << OBJECT_MAX_SIZE << "." << std::dec << dendl;
    return -EINVAL;
  }
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}

int BlueStore::_write_bdev_label(CephContext *cct,
                                 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));
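  // resulting label block layout: the encoded bluestore_bdev_label_t,
  // followed by a crc32c over those encoded bytes, zero-padded out to
  // BDEV_LABEL_BLOCK_SIZE (4 KiB); _read_bdev_label decodes in the same order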

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
         << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}

int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
            << ": " << e.what()
            << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}

int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
               << " and fsid " << fsid << " check bypassed" << dendl;
    } else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
           << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}

void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    ceph_assert(bdev);
    if (bdev->is_rotational()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    ceph_assert(bdev);
    if (bdev->is_rotational()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
           << std::dec << " order " << (int)min_alloc_size_order
           << " max_alloc_size 0x" << std::hex << max_alloc_size
           << " prefer_deferred_size 0x" << prefer_deferred_size
           << std::dec
           << " deferred_batch_ops " << deferred_batch_ops
           << dendl;
}

int BlueStore::_open_bdev(bool create)
{
  ceph_assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (create && cct->_conf->bdev_enable_discard) {
    bdev->discard(0, bdev->get_size());
  }

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  ceph_assert(block_size == 1u << block_size_order);
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0) {
    goto fail_close;
  }
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}

void BlueStore::_validate_bdev()
{
  ceph_assert(bdev);
  ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
  uint64_t dev_size = bdev->get_size();
  if (dev_size <
      _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
    dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
            << " is too small, disable bluestore_bluefs_min for now"
            << dendl;
    ceph_assert(dev_size >= _get_ondisk_reserved());

    int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
    ceph_assert(r == 0);
  }
}

void BlueStore::_close_bdev()
{
  ceph_assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}

int BlueStore::_open_fm(KeyValueDB::Transaction t)
{
  ceph_assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
  ceph_assert(fm);
  if (t) {
    // create mode. initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    ceph_assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
    fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);

    // allocate superblock reserved space.  note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs_extents.
    auto reserved = _get_ondisk_reserved();
    fm->allocate(0, reserved, t);

    if (cct->_conf->bluestore_bluefs) {
      ceph_assert(bluefs_extents.num_intervals() == 1);
      interval_set<uint64_t>::iterator p = bluefs_extents.begin();
      reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
      dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
               << " for bluefs" << dendl;
    }

    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
              << cct->_conf->bluestore_debug_prefill << " with max free extent "
              << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = p2roundup(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
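      // e.g. (illustrative) bluestore_debug_prefill = 0.2 gives
      // r = 0.2 / 0.8 = 0.25, so for each free extent of length l the loop
      // below marks roughly l / 4 additional bytes as used right after it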
      bool stop = false;

      while (!stop && start < end) {
        uint64_t l = (rand() % max_b + 1) * min_alloc_size;
        if (start + l > end) {
          l = end - start;
          l = p2align(l, min_alloc_size);
        }
        ceph_assert(start + l <= end);

        uint64_t u = 1 + (uint64_t)(r * (double)l);
        u = p2roundup(u, min_alloc_size);
        if (start + l + u > end) {
          u = end - (start + l);
          // trim to align so we don't overflow again
          u = p2align(u, min_alloc_size);
          stop = true;
        }
        ceph_assert(start + l + u <= end);

        dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
                 << " use 0x" << u << std::dec << dendl;

        if (u == 0) {
          // break if u has been trimmed to nothing
          break;
        }

        fm->allocate(start + l, u, t);
        start += l + u;
      }
    }
  }

  int r = fm->init(db);
  if (r < 0) {
    derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
    delete fm;
    fm = NULL;
    return r;
  }
  // If the space size tracked by the freelist manager is larger than the
  // actual device size, one can hit out-of-space allocations which will
  // result in data loss and/or assertions. Probably the user altered the
  // device size somehow. The only fix for now is to redeploy the OSD.
  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
    ostringstream ss;
    ss << "slow device size mismatch detected, "
       << " fm size(" << fm->get_size()
       << ") > slow device size(" << bdev->get_size()
       << "), Please stop using this OSD as it might cause data loss.";
    _set_disk_size_mismatch_alert(ss.str());
  }
  return 0;
}

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  ceph_assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}

int BlueStore::_open_alloc()
{
  ceph_assert(alloc == NULL);
  ceph_assert(bdev->get_size());

  if (bluefs) {
    bluefs_extents.clear();
    auto r = bluefs->get_block_extents(bluefs_shared_bdev, &bluefs_extents);
    if (r < 0) {
      lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
                 << cpp_strerror(r) << dendl;

      return r;
    }
    dout(10) << __func__ << " bluefs extents 0x"
             << std::hex << bluefs_extents << std::dec
             << dendl;
  }

  alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
                            bdev->get_size(),
                            min_alloc_size, "block");
  if (!alloc) {
    lderr(cct) << __func__ << " Allocator::unknown alloc type "
               << cct->_conf->bluestore_allocator
               << dendl;
    return -EINVAL;
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(db, &offset, &length)) {
    alloc->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  fm->enumerate_reset();
  dout(1) << __func__ << " loaded " << byte_u_t(bytes)
          << " in " << num << " extents"
          << dendl;

  // also mark bluefs space as allocated
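  // (bluefs space is deliberately not tracked as allocated by the freelist
  // manager -- see the comment in _open_fm -- so it has to be removed from
  // the allocator's free set here)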
  for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
    alloc->init_rm_free(e.get_start(), e.get_len());
  }

  return 0;
}

void BlueStore::_close_alloc()
{
  ceph_assert(bdev);
  bdev->discard_drain();

  ceph_assert(alloc);
  alloc->shutdown();
  delete alloc;
  alloc = NULL;
  bluefs_extents.clear();
}

int BlueStore::_open_fsid(bool create)
{
  ceph_assert(fsid_fd < 0);
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}

int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
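  // a formatted uuid is 36 characters; anything past that (such as the
  // trailing newline written by _write_fsid) is trimmed before parsing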
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}

int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}

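// Take a POSIX advisory write lock on the fsid file.  The lock is released
// automatically when fsid_fd is closed or the process exits, which is what
// test_mount_in_use() relies on to detect another running ceph-osd.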
int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
         << " (is another ceph-osd still running?)"
         << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}

bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}

bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
            << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}

bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist).  only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}

int BlueStore::_minimal_open_bluefs(bool create)
{
  int r;
  bluefs = new BlueFS(cct);

  string bfn;
  struct stat st;

  bfn = path + "/block.db";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(
      BlueFS::BDEV_DB, bfn,
      create && cct->_conf->bdev_enable_discard);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
      r = _check_or_set_bdev_label(
        bfn,
        bluefs->get_block_device_size(BlueFS::BDEV_DB),
        "bluefs db", create);
      if (r < 0) {
        derr << __func__
             << " check block device(" << bfn << ") label returned: "
             << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }
    }
    if (create) {
      bluefs->add_block_extent(
        BlueFS::BDEV_DB,
        SUPER_RESERVED,
        bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
    }
    bluefs_shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_single_shared_device = false;
  } else {
    r = -errno;
    if (::lstat(bfn.c_str(), &st) == -1) {
      r = 0;
      bluefs_shared_bdev = BlueFS::BDEV_DB;
    } else {
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }

  // shared device
  bfn = path + "/block";
  // never trim here
  r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false,
                               true /* shared with bluestore */);
  if (r < 0) {
    derr << __func__ << " add block device(" << bfn << ") returned: "
         << cpp_strerror(r) << dendl;
    goto free_bluefs;
  }
  if (create) {
    // note: we always leave the first SUPER_RESERVED (8k) of the device unused
    uint64_t initial =
      bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
                          cct->_conf->bluestore_bluefs_gift_ratio);
    initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
    uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
    if (alloc_size % min_alloc_size) {
      derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
           << alloc_size << " is not a multiple of "
           << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
      r = -EINVAL;
      goto free_bluefs;
    }
    // align to bluefs's alloc_size
    initial = p2roundup(initial, alloc_size);
    // put bluefs in the middle of the device in case it is an HDD
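    // (illustrative: on a 1 TiB device with initial = 10 GiB this places
    // start near the 507 GiB mark, aligned down to alloc_size)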
    uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
    // avoiding superblock overwrite
    start = std::max(alloc_size, start);
    ceph_assert(start >= _get_ondisk_reserved());

    bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
    bluefs_extents.insert(start, initial);
    ++out_of_sync_fm;
  }

  bfn = path + "/block.wal";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
                                 create && cct->_conf->bdev_enable_discard);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
      r = _check_or_set_bdev_label(
        bfn,
        bluefs->get_block_device_size(BlueFS::BDEV_WAL),
        "bluefs wal", create);
      if (r < 0) {
        derr << __func__ << " check block device(" << bfn
             << ") label returned: " << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }
    }

    if (create) {
      bluefs->add_block_extent(
        BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
        bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
        BDEV_LABEL_BLOCK_SIZE);
    }
    bluefs_single_shared_device = false;
  } else {
    r = 0;
    if (::lstat(bfn.c_str(), &st) != -1) {
      r = -errno;
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }
  return 0;

free_bluefs:
  ceph_assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}

int BlueStore::_open_bluefs(bool create)
{
  int r = _minimal_open_bluefs(create);
  if (r < 0) {
    return r;
  }
  if (create) {
    bluefs->mkfs(fsid);
  }
  r = bluefs->mount();
  if (r < 0) {
    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
  }
  return r;
}

void BlueStore::_close_bluefs()
{
  bluefs->umount();
  _minimal_close_bluefs();
}

void BlueStore::_minimal_close_bluefs()
{
  delete bluefs;
  bluefs = NULL;
}

int BlueStore::_is_bluefs(bool create, bool* ret)
{
  if (create) {
    *ret = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    int r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      *ret = true;
    } else if (s == "0") {
      *ret = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
           << dendl;
      return -EIO;
    }
  }
  return 0;
}
5195
5196/*
5197* opens both DB and dependant super_meta, FreelistManager and allocator
5198* in the proper order
5199*/
5200int BlueStore::_open_db_and_around(bool read_only)
5201{
5202 int r;
5203 bool do_bluefs = false;
5204 _is_bluefs(false, &do_bluefs); // ignore err code
5205 if (do_bluefs) {
5206 // open in read-only first to read FM list and init allocator
5207 // as they might be needed for some BlueFS procedures
5208 r = _open_db(false, false, true);
5209 if (r < 0)
5210 return r;
5211
5212 r = _open_super_meta();
5213 if (r < 0) {
5214 goto out_db;
5215 }
5216
5217 r = _open_fm(nullptr);
5218 if (r < 0)
5219 goto out_db;
5220
5221 r = _open_alloc();
5222 if (r < 0)
5223 goto out_fm;
5224
5225 // now open in R/W mode
5226 if (!read_only) {
5227 _close_db();
5228
5229 r = _open_db(false, false, false);
5230 if (r < 0) {
5231 _close_alloc();
5232 _close_fm();
5233 return r;
28e407b8 5234 }
7c673cae 5235 }
5236 } else {
5237 r = _open_db(false, false);
5238 if (r < 0) {
5239 return r;
5240 }
5241 r = _open_super_meta();
5242 if (r < 0) {
5243 goto out_db;
5244 }
7c673cae 5245
5246 r = _open_fm(nullptr);
5247 if (r < 0)
5248 goto out_db;
5249
5250 r = _open_alloc();
5251 if (r < 0)
5252 goto out_fm;
5253 }
5254 return 0;
5255
5256 out_fm:
5257 _close_fm();
5258 out_db:
5259 _close_db();
5260 return r;
5261}
5262
5263void BlueStore::_close_db_and_around()
5264{
5265 if (bluefs) {
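    // fetch_and(0) atomically reads the counter and clears it, so each
    // check below observes allocations made since the previous check
    // exactly once.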
5266 if (out_of_sync_fm.fetch_and(0)) {
5267 _sync_bluefs_and_fm();
5268 }
5269 _close_db();
5270 while(out_of_sync_fm.fetch_and(0)) {
 5271 // if we saw allocations during close, repeat: open db, sync FM, close db
5272 dout(0) << __func__ << " syncing FreelistManager" << dendl;
5273 int r = _open_db(false, false, false);
5274 if (r < 0) {
5275 derr << __func__
5276 << " unable to open db, FreelistManager is probably out of sync"
5277 << dendl;
5278 break;
5279 }
5280 _sync_bluefs_and_fm();
5281 _close_db();
7c673cae 5282 }
5283 if (!_kv_only) {
5284 _close_alloc();
5285 _close_fm();
5286 }
5287 } else {
5288 _close_alloc();
5289 _close_fm();
5290 _close_db();
5291 }
5292}
5293
5294// updates legacy bluefs-related records in the DB to a state valid for
5295// downgrades from Nautilus.
5296void BlueStore::_sync_bluefs_and_fm()
5297{
5298 if (cct->_conf->bluestore_bluefs_db_compatibility) {
5299 bufferlist bl;
5300 encode(bluefs_extents, bl);
5301 dout(20) << __func__ << " bluefs_extents at KV is now 0x"
5302 << std::hex << bluefs_extents << std::dec
5303 << dendl;
5304 KeyValueDB::Transaction synct = db->get_transaction();
5305 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
5306 synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
5307
 5308 // The nice thing is that we don't need to update the FreelistManager
 5309 // here: it always has the corresponding bits set to 'Free' for both
 5310 // Nautilus+ and pre-Nautilus releases.
 5311 // So once an extent lands in bluefs_extents, it has already been freed
 5312 // in the allocator and hence is free in the FM too.
5313
5314 db->submit_transaction_sync(synct);
5315 }
5316}
5317
5318int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
5319{
5320 int r;
5321 ceph_assert(!db);
5322 ceph_assert(!(create && read_only));
5323 string fn = path + "/db";
5324 string options;
5325 stringstream err;
5326 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
5327
5328 string kv_backend;
5329 std::vector<KeyValueDB::ColumnFamily> cfs;
5330
5331 if (create) {
5332 kv_backend = cct->_conf->bluestore_kvbackend;
5333 } else {
5334 r = read_meta("kv_backend", &kv_backend);
7c673cae 5335 if (r < 0) {
5336 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
5337 return -EIO;
5338 }
5339 }
5340 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
5341
5342 bool do_bluefs;
5343 r = _is_bluefs(create, &do_bluefs);
5344 if (r < 0) {
5345 return r;
5346 }
5347 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
5348
5349 map<string,string> kv_options;
5350 // force separate wal dir for all new deployments.
5351 kv_options["separate_wal_dir"] = 1;
5352 rocksdb::Env *env = NULL;
5353 if (do_bluefs) {
5354 dout(10) << __func__ << " initializing bluefs" << dendl;
5355 if (kv_backend != "rocksdb") {
5356 derr << " backend must be rocksdb to use bluefs" << dendl;
5357 return -EINVAL;
7c673cae 5358 }
5359
5360 r = _open_bluefs(create);
5361 if (r < 0) {
5362 return r;
5363 }
5364 bluefs->set_slow_device_expander(this);
5365
5366 if (cct->_conf->bluestore_bluefs_env_mirror) {
5367 rocksdb::Env *a = new BlueRocksEnv(bluefs);
5368 rocksdb::Env *b = rocksdb::Env::Default();
5369 if (create) {
5370 string cmd = "rm -rf " + path + "/db " +
5371 path + "/db.slow " +
5372 path + "/db.wal";
5373 int r = system(cmd.c_str());
5374 (void)r;
5375 }
5376 env = new rocksdb::EnvMirror(b, a, false, true);
5377 } else {
5378 env = new BlueRocksEnv(bluefs);
5379
5380 // simplify the dir names, too, as "seen" by rocksdb
5381 fn = "db";
5382 }
5383
5384 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
5385 // we have both block.db and block; tell rocksdb!
5386 // note: the second (last) size value doesn't really matter
5387 ostringstream db_paths;
5388 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
5389 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
5390 db_paths << fn << ","
5391 << (uint64_t)(db_size * 95 / 100) << " "
5392 << fn + ".slow" << ","
5393 << (uint64_t)(slow_size * 95 / 100);
5394 kv_options["db_paths"] = db_paths.str();
5395 dout(10) << __func__ << " set db_paths to " << db_paths.str() << dendl;
5396 }
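    // For illustration (sizes assumed): a 10 GiB db device and a 100 GiB
    // slow device yield
    //   db_paths = "db,10200547328 db.slow,102005473280"
    // i.e. 95% of each device's capacity is advertised to rocksdb.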
5397
5398 if (create) {
5399 env->CreateDir(fn);
5400 env->CreateDir(fn + ".wal");
5401 env->CreateDir(fn + ".slow");
5402 } else {
5403 std::vector<std::string> res;
5404 // check for dir presence
5405 auto r = env->GetChildren(fn+".wal", &res);
5406 if (r.IsNotFound()) {
5407 kv_options.erase("separate_wal_dir");
5408 }
7c673cae 5409 }
5410 } else {
5411 string walfn = path + "/db.wal";
7c673cae 5412
5413 if (create) {
5414 int r = ::mkdir(fn.c_str(), 0755);
5415 if (r < 0)
5416 r = -errno;
5417 if (r < 0 && r != -EEXIST) {
5418 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
5419 << dendl;
5420 return r;
5421 }
5422
5423 // wal_dir, too!
5424 r = ::mkdir(walfn.c_str(), 0755);
5425 if (r < 0)
5426 r = -errno;
5427 if (r < 0 && r != -EEXIST) {
5428 derr << __func__ << " failed to create " << walfn
5429 << ": " << cpp_strerror(r)
5430 << dendl;
5431 return r;
5432 }
5433 } else {
5434 struct stat st;
5435 r = ::stat(walfn.c_str(), &st);
5436 if (r < 0 && errno == ENOENT) {
5437 kv_options.erase("separate_wal_dir");
5438 }
5439 }
5440 }
5441
91327a77 5442
5443 db = KeyValueDB::create(cct,
5444 kv_backend,
5445 fn,
11fdf7f2 5446 kv_options,
5447 static_cast<void*>(env));
5448 if (!db) {
5449 derr << __func__ << " error creating db" << dendl;
5450 if (bluefs) {
11fdf7f2 5451 _close_bluefs();
5452 }
5453 // delete env manually here since we can't depend on db to do this
5454 // under this case
5455 delete env;
5456 env = NULL;
5457 return -EIO;
5458 }
5459
5460 FreelistManager::setup_merge_operators(db);
5461 db->set_merge_operator(PREFIX_STAT, merge_op);
91327a77 5462 db->set_cache_size(cache_kv_ratio * cache_size);
31f18b77 5463
11fdf7f2 5464 if (kv_backend == "rocksdb") {
7c673cae 5465 options = cct->_conf->bluestore_rocksdb_options;
5466
5467 map<string,string> cf_map;
5468 cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
5469 get_str_map,
5470 &cf_map,
5471 " \t");
5472 for (auto& i : cf_map) {
5473 dout(10) << "column family " << i.first << ": " << i.second << dendl;
5474 cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
5475 }
5476 }
5477
7c673cae 5478 db->init(options);
5479 if (to_repair_db)
5480 return 0;
5481 if (create) {
5482 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
5483 r = db->create_and_open(err, cfs);
5484 } else {
5485 r = db->create_and_open(err);
5486 }
5487 } else {
5488 // we pass in cf list here, but it is only used if the db already has
5489 // column families created.
5490 r = read_only ?
5491 db->open_read_only(err, cfs) :
5492 db->open(err, cfs);
5493 }
5494 if (r) {
5495 derr << __func__ << " erroring opening db: " << err.str() << dendl;
11fdf7f2 5496 _close_db();
5497 return -EIO;
5498 }
5499 dout(1) << __func__ << " opened " << kv_backend
5500 << " path " << fn << " options " << options << dendl;
5501 return 0;
5502}
5503
5504void BlueStore::_close_db()
5505{
11fdf7f2 5506 ceph_assert(db);
5507 delete db;
5508 db = NULL;
5509 if (bluefs) {
11fdf7f2 5510 _close_bluefs();
5511 }
5512}
5513
11fdf7f2 5514void BlueStore::_dump_alloc_on_failure()
7c673cae 5515{
5516 auto dump_interval =
5517 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
5518 if (dump_interval > 0 &&
5519 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
5520 alloc->dump();
5521 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
5522 next_dump_on_bluefs_alloc_failure += dump_interval;
7c673cae 5523 }
11fdf7f2 5524}
7c673cae 5525
7c673cae 5526
5527int BlueStore::allocate_bluefs_freespace(
5528 uint64_t min_size,
5529 uint64_t size,
5530 PExtentVector* extents_out)
5531{
5532 ceph_assert(min_size <= size);
5533 if (size) {
5534 // round up to alloc size
5535 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_shared_bdev);
5536 min_size = p2roundup(min_size, alloc_size);
5537 size = p2roundup(size, alloc_size);
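    // e.g. (illustrative numbers) with alloc_size = 64 KiB, a request of
    // min_size = 1 MiB + 1 rounds up to 1 MiB + 64 KiB; p2roundup assumes
    // a power-of-two alignment, which bluefs alloc sizes are.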
5538
5539 PExtentVector extents_local;
5540 PExtentVector* extents = extents_out ? extents_out : &extents_local;
5541
5542
5543 uint64_t gift;
5544 uint64_t allocated = 0;
5545 int64_t alloc_len;
5546 do {
5547 // hard cap to fit into 32 bits
5548 gift = std::min<uint64_t>(size, 1ull << 31);
5549 dout(10) << __func__ << " gifting " << gift
5550 << " (" << byte_u_t(gift) << ")" << dendl;
5551
5552 alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
5553 if (alloc_len > 0) {
5554 allocated += alloc_len;
5555 size -= alloc_len;
5556 }
5557
5558 if (alloc_len < 0 ||
5559 (alloc_len < (int64_t)gift && (min_size > allocated))) {
5560 derr << __func__
5561 << " failed to allocate on 0x" << std::hex << gift
5562 << " min_size 0x" << min_size
5563 << " > allocated total 0x" << allocated
5564 << " bluefs_shared_alloc_size 0x" << alloc_size
5565 << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
5566 << " available 0x " << alloc->get_free()
5567 << std::dec << dendl;
7c673cae 5568
494da23a 5569 _dump_alloc_on_failure();
5570 alloc->release(*extents);
5571 extents->clear();
5572 return -ENOSPC;
5573 }
5574 } while (size && alloc_len > 0);
5575 for (auto& e : *extents) {
5576 dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
5577 bluefs_extents.insert(e.offset, e.length);
5578 ++out_of_sync_fm;
5579 // apply to bluefs if not requested from outside
5580 if (!extents_out) {
5581 bluefs->add_block_extent(bluefs_shared_bdev, e.offset, e.length);
5582 }
5583 }
5584 }
5585 return 0;
5586}
5587
5588size_t BlueStore::available_freespace(uint64_t alloc_size) {
5589 size_t total = 0;
5590 auto iterated_allocation = [&](size_t off, size_t len) {
 5591 // only count space that is alloc_size-aligned
5592 size_t dist_to_alignment;
5593 size_t offset_in_block = off & (alloc_size - 1);
5594 if (offset_in_block == 0)
5595 dist_to_alignment = 0;
5596 else
5597 dist_to_alignment = alloc_size - offset_in_block;
5598 if (dist_to_alignment >= len)
5599 return;
5600 len -= dist_to_alignment;
5601 total += p2align(len, alloc_size);
5602 };
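  // Worked example (numbers assumed): off = 0x3000, len = 0x9800 and
  // alloc_size = 0x4000 give dist_to_alignment = 0x1000; the remaining
  // 0x8800 is rounded down to p2align(0x8800, 0x4000) = 0x8000, so the
  // unaligned head and tail of the extent are not counted.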
5603 alloc->dump(iterated_allocation);
5604 return total;
5605}
5606
11fdf7f2 5607int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
f64942e4 5608{
5609 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
5610
5611 uint64_t my_free = alloc->get_free();
5612 uint64_t total = bdev->get_size();
5613 float my_free_ratio = (float)my_free / (float)total;
5614
5615 uint64_t total_free = bluefs_free + my_free;
5616
5617 float bluefs_ratio = (float)bluefs_free / (float)total_free;
5618
5619 dout(10) << __func__
1adf2230 5620 << " bluefs " << byte_u_t(bluefs_free)
7c673cae 5621 << " free (" << bluefs_free_ratio
1adf2230 5622 << ") bluestore " << byte_u_t(my_free)
5623 << " free (" << my_free_ratio
5624 << "), bluefs_ratio " << bluefs_ratio
5625 << dendl;
5626
5627 uint64_t gift = 0;
5628 uint64_t reclaim = 0;
5629 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
5630 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
5631 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5632 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
1adf2230 5633 << ", should gift " << byte_u_t(gift) << dendl;
5634 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
5635 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
5636 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
5637 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
5638 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5639 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
1adf2230 5640 << ", should reclaim " << byte_u_t(reclaim) << dendl;
7c673cae 5641 }
5642
5643 // don't take over too much of the freespace
5644 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
7c673cae 5645 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
3efd9988 5646 cct->_conf->bluestore_bluefs_min < free_cap) {
5647 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
5648 dout(10) << __func__ << " bluefs_total " << bluefs_total
5649 << " < min " << cct->_conf->bluestore_bluefs_min
1adf2230 5650 << ", should gift " << byte_u_t(g) << dendl;
5651 if (g > gift)
5652 gift = g;
5653 reclaim = 0;
5654 }
11fdf7f2 5655 uint64_t min_free = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
5656 if (bluefs_free < min_free &&
5657 min_free < free_cap) {
5658 uint64_t g = min_free - bluefs_free;
11fdf7f2 5659 dout(10) << __func__ << " bluefs_free " << bluefs_free
3efd9988 5660 << " < min " << min_free
1adf2230 5661 << ", should gift " << byte_u_t(g) << dendl;
5662 if (g > gift)
5663 gift = g;
5664 reclaim = 0;
5665 }
5666 ceph_assert((int64_t)gift >= 0);
5667 ceph_assert((int64_t)reclaim >= 0);
5668 return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
5669}
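// Worked example (assuming what we believe are the default ratios,
// bluestore_bluefs_min_ratio = 0.02 and bluestore_bluefs_gift_ratio = 0.02):
// with bluefs_free = 1 GiB and my_free = 99 GiB, total_free = 100 GiB and
// bluefs_ratio = 0.01 < 0.02, so gift = 0.02 * 100 GiB = 2 GiB and the
// function returns +2 GiB; a negative return value is a reclaim amount.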
7c673cae 5670
5671int BlueStore::_balance_bluefs_freespace()
5672{
5673 int ret = 0;
5674 ceph_assert(bluefs);
7c673cae 5675
5676 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
5677 bluefs->get_usage(&bluefs_usage);
5678 ceph_assert(bluefs_usage.size() > bluefs_shared_bdev);
7c673cae 5679
5680 bool clear_alert = true;
5681 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
5682 auto& p = bluefs_usage[bluefs_shared_bdev];
5683 if (p.first != p.second) {
5684 auto& db = bluefs_usage[BlueFS::BDEV_DB];
5685 ostringstream ss;
5686 ss << "spilled over " << byte_u_t(p.second - p.first)
5687 << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
5688 << " used of " << byte_u_t(db.second) << ") to slow device";
5689 _set_spillover_alert(ss.str());
5690 clear_alert = false;
5691 }
5692 }
5693 if (clear_alert) {
5694 _clear_spillover_alert();
5695 }
5696
5697 // fixme: look at primary bdev only for now
5698 int64_t delta = _get_bluefs_size_delta(
5699 bluefs_usage[bluefs_shared_bdev].first,
5700 bluefs_usage[bluefs_shared_bdev].second);
5701
7c673cae 5702 // reclaim from bluefs?
11fdf7f2 5703 if (delta < 0) {
7c673cae 5704 // round up to alloc size
5705 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_shared_bdev);
5706 auto reclaim = p2roundup(uint64_t(-delta), alloc_size);
5707
5708 // hard cap to fit into 32 bits
11fdf7f2 5709 reclaim = std::min<uint64_t>(reclaim, 1ull << 31);
7c673cae 5710 dout(10) << __func__ << " reclaiming " << reclaim
1adf2230 5711 << " (" << byte_u_t(reclaim) << ")" << dendl;
5712
5713 while (reclaim > 0) {
5714 // NOTE: this will block and do IO.
a8e16298 5715 PExtentVector extents;
5716 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
5717 &extents);
5718 if (r < 0) {
5719 derr << __func__ << " failed to reclaim space from bluefs"
5720 << dendl;
5721 break;
5722 }
5723 for (auto e : extents) {
11fdf7f2 5724 ++out_of_sync_fm;
5725 bluefs_extents.erase(e.offset, e.length);
5726 bluefs_extents_reclaiming.insert(e.offset, e.length);
5727 reclaim -= e.length;
5728 }
5729 }
5730
5731 ret = 1;
5732 }
5733
5734 return ret;
5735}
5736
eafe8130 5737int BlueStore::_open_collections()
7c673cae 5738{
28e407b8 5739 dout(10) << __func__ << dendl;
eafe8130 5740 collections_had_errors = false;
11fdf7f2 5741 ceph_assert(coll_map.empty());
5742 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
5743 for (it->upper_bound(string());
5744 it->valid();
5745 it->next()) {
5746 coll_t cid;
5747 if (cid.parse(it->key())) {
5748 CollectionRef c(
5749 new Collection(
5750 this,
5751 cache_shards[cid.hash_to_shard(cache_shards.size())],
5752 cid));
5753 bufferlist bl = it->value();
11fdf7f2 5754 auto p = bl.cbegin();
7c673cae 5755 try {
11fdf7f2 5756 decode(c->cnode, p);
5757 } catch (buffer::error& e) {
5758 derr << __func__ << " failed to decode cnode, key:"
5759 << pretty_binary_string(it->key()) << dendl;
5760 return -EIO;
5761 }
5762 dout(20) << __func__ << " opened " << cid << " " << c
5763 << " " << c->cnode << dendl;
11fdf7f2 5764 _osr_attach(c.get());
7c673cae 5765 coll_map[cid] = c;
11fdf7f2 5766
5767 } else {
5768 derr << __func__ << " unrecognized collection " << it->key() << dendl;
eafe8130 5769 collections_had_errors = true;
5770 }
5771 }
5772 return 0;
5773}
5774
5775void BlueStore::_fsck_collections(int64_t* errors)
5776{
5777 if (collections_had_errors) {
5778 dout(10) << __func__ << dendl;
5779 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
5780 for (it->upper_bound(string());
5781 it->valid();
5782 it->next()) {
5783 coll_t cid;
5784 if (!cid.parse(it->key())) {
5785 derr << __func__ << " unrecognized collection " << it->key() << dendl;
5786 if (errors) {
5787 (*errors)++;
5788 }
5789 }
5790 }
5791 }
5792}
5793
224ce89b 5794void BlueStore::_open_statfs()
31f18b77 5795{
5796 osd_pools.clear();
5797 vstatfs.reset();
5798
31f18b77 5799 bufferlist bl;
11fdf7f2 5800 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
31f18b77 5801 if (r >= 0) {
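    // presence of the legacy global key means per-pool stats collection
    // was never enabled on this store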
11fdf7f2 5802 per_pool_stat_collection = false;
31f18b77 5803 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
11fdf7f2 5804 auto it = bl.cbegin();
31f18b77 5805 vstatfs.decode(it);
11fdf7f2 5806 dout(10) << __func__ << " store_statfs is found" << dendl;
224ce89b 5807 } else {
5808 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
5809 }
81eedcae 5810 _check_legacy_statfs_alert();
5811 } else {
5812 per_pool_stat_collection = true;
5813 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
5814 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
5815 for (it->upper_bound(string());
5816 it->valid();
5817 it->next()) {
5818
5819 uint64_t pool_id;
5820 int r = get_key_pool_stat(it->key(), &pool_id);
5821 ceph_assert(r == 0);
5822
5823 bufferlist bl;
5824 bl = it->value();
5825 auto p = bl.cbegin();
5826 auto& st = osd_pools[pool_id];
5827 try {
5828 st.decode(p);
5829 vstatfs += st;
5830
5831 dout(30) << __func__ << " pool " << pool_id
5832 << " statfs " << st << dendl;
5833 } catch (buffer::error& e) {
5834 derr << __func__ << " failed to decode pool stats, key:"
5835 << pretty_binary_string(it->key()) << dendl;
5836 }
5837 }
31f18b77 5838 }
5839 dout(30) << __func__ << " statfs " << vstatfs << dendl;
5840
5841}
5842
5843int BlueStore::_setup_block_symlink_or_file(
5844 string name,
5845 string epath,
5846 uint64_t size,
5847 bool create)
5848{
5849 dout(20) << __func__ << " name " << name << " path " << epath
5850 << " size " << size << " create=" << (int)create << dendl;
5851 int r = 0;
91327a77 5852 int flags = O_RDWR|O_CLOEXEC;
5853 if (create)
5854 flags |= O_CREAT;
5855 if (epath.length()) {
5856 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
5857 if (r < 0) {
5858 r = -errno;
5859 derr << __func__ << " failed to create " << name << " symlink to "
5860 << epath << ": " << cpp_strerror(r) << dendl;
5861 return r;
5862 }
5863
5864 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
5865 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
5866 if (fd < 0) {
5867 r = -errno;
5868 derr << __func__ << " failed to open " << epath << " file: "
5869 << cpp_strerror(r) << dendl;
5870 return r;
5871 }
5872 // write the Transport ID of the NVMe device
5873 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
5874 // where "0000:02:00.0" is the selector of a PCI device, see
5875 // the first column of "lspci -mm -n -D"
5876 string trid{"trtype:PCIe "};
5877 trid += "traddr:";
5878 trid += epath.substr(strlen(SPDK_PREFIX));
5879 r = ::write(fd, trid.c_str(), trid.size());
5880 ceph_assert(r == static_cast<int>(trid.size()));
5881 dout(1) << __func__ << " created " << name << " symlink to "
5882 << epath << dendl;
5883 VOID_TEMP_FAILURE_RETRY(::close(fd));
5884 }
5885 }
5886 if (size) {
5887 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
5888 if (fd >= 0) {
5889 // block file is present
5890 struct stat st;
5891 int r = ::fstat(fd, &st);
5892 if (r == 0 &&
5893 S_ISREG(st.st_mode) && // if it is a regular file
5894 st.st_size == 0) { // and is 0 bytes
5895 r = ::ftruncate(fd, size);
5896 if (r < 0) {
5897 r = -errno;
5898 derr << __func__ << " failed to resize " << name << " file to "
5899 << size << ": " << cpp_strerror(r) << dendl;
5900 VOID_TEMP_FAILURE_RETRY(::close(fd));
5901 return r;
5902 }
5903
5904 if (cct->_conf->bluestore_block_preallocate_file) {
5905 r = ::ceph_posix_fallocate(fd, 0, size);
5906 if (r > 0) {
5907 derr << __func__ << " failed to prefallocate " << name << " file to "
5908 << size << ": " << cpp_strerror(r) << dendl;
5909 VOID_TEMP_FAILURE_RETRY(::close(fd));
5910 return -r;
5911 }
5912 }
5913 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 5914 << byte_u_t(size) << dendl;
5915 }
5916 VOID_TEMP_FAILURE_RETRY(::close(fd));
5917 } else {
5918 int r = -errno;
5919 if (r != -ENOENT) {
5920 derr << __func__ << " failed to open " << name << " file: "
5921 << cpp_strerror(r) << dendl;
5922 return r;
5923 }
5924 }
5925 }
5926 return 0;
5927}
5928
5929int BlueStore::mkfs()
5930{
5931 dout(1) << __func__ << " path " << path << dendl;
5932 int r;
5933 uuid_d old_fsid;
5934
5935 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
5936 derr << __func__ << " osd_max_object_size "
5937 << cct->_conf->osd_max_object_size << " > bluestore max "
5938 << OBJECT_MAX_SIZE << dendl;
5939 return -EINVAL;
5940 }
5941
5942 {
5943 string done;
5944 r = read_meta("mkfs_done", &done);
5945 if (r == 0) {
5946 dout(1) << __func__ << " already created" << dendl;
5947 if (cct->_conf->bluestore_fsck_on_mkfs) {
5948 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5949 if (r < 0) {
5950 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
5951 << dendl;
5952 return r;
5953 }
5954 if (r > 0) {
5955 derr << __func__ << " fsck found " << r << " errors" << dendl;
5956 r = -EIO;
5957 }
5958 }
5959 return r; // idempotent
5960 }
5961 }
5962
5963 {
5964 string type;
5965 r = read_meta("type", &type);
5966 if (r == 0) {
5967 if (type != "bluestore") {
5968 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5969 return -EIO;
5970 }
5971 } else {
5972 r = write_meta("type", "bluestore");
5973 if (r < 0)
5974 return r;
5975 }
5976 }
5977
5978 freelist_type = "bitmap";
5979
5980 r = _open_path();
5981 if (r < 0)
5982 return r;
5983
5984 r = _open_fsid(true);
5985 if (r < 0)
5986 goto out_path_fd;
5987
5988 r = _lock_fsid();
5989 if (r < 0)
5990 goto out_close_fsid;
5991
5992 r = _read_fsid(&old_fsid);
5993 if (r < 0 || old_fsid.is_zero()) {
5994 if (fsid.is_zero()) {
5995 fsid.generate_random();
5996 dout(1) << __func__ << " generated fsid " << fsid << dendl;
5997 } else {
5998 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
5999 }
6000 // we'll write it later.
6001 } else {
6002 if (!fsid.is_zero() && fsid != old_fsid) {
6003 derr << __func__ << " on-disk fsid " << old_fsid
6004 << " != provided " << fsid << dendl;
6005 r = -EINVAL;
6006 goto out_close_fsid;
6007 }
6008 fsid = old_fsid;
6009 }
6010
6011 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6012 cct->_conf->bluestore_block_size,
6013 cct->_conf->bluestore_block_create);
6014 if (r < 0)
6015 goto out_close_fsid;
6016 if (cct->_conf->bluestore_bluefs) {
6017 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6018 cct->_conf->bluestore_block_wal_size,
6019 cct->_conf->bluestore_block_wal_create);
6020 if (r < 0)
6021 goto out_close_fsid;
6022 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6023 cct->_conf->bluestore_block_db_size,
6024 cct->_conf->bluestore_block_db_create);
6025 if (r < 0)
6026 goto out_close_fsid;
6027 }
6028
6029 r = _open_bdev(true);
6030 if (r < 0)
6031 goto out_close_fsid;
6032
6033 // choose min_alloc_size
6034 if (cct->_conf->bluestore_min_alloc_size) {
6035 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6036 } else {
11fdf7f2 6037 ceph_assert(bdev);
6038 if (bdev->is_rotational()) {
6039 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6040 } else {
6041 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6042 }
6043 }
11fdf7f2 6044 _validate_bdev();
6045
6046 // make sure min_alloc_size is power of 2 aligned.
11fdf7f2 6047 if (!isp2(min_alloc_size)) {
6048 derr << __func__ << " min_alloc_size 0x"
6049 << std::hex << min_alloc_size << std::dec
6050 << " is not power of 2 aligned!"
6051 << dendl;
6052 r = -EINVAL;
6053 goto out_close_bdev;
6054 }
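  // e.g. 0x1000 (4 KiB) and 0x10000 (64 KiB) pass the isp2() check above,
  // while 0x18000 (96 KiB) is rejected even though it is 4 KiB-aligned.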
6055
6056 r = _open_db(true);
6057 if (r < 0)
6058 goto out_close_bdev;
6059
6060 {
6061 KeyValueDB::Transaction t = db->get_transaction();
6062 r = _open_fm(t);
6063 if (r < 0)
6064 goto out_close_db;
6065 {
6066 bufferlist bl;
11fdf7f2 6067 encode((uint64_t)0, bl);
6068 t->set(PREFIX_SUPER, "nid_max", bl);
6069 t->set(PREFIX_SUPER, "blobid_max", bl);
6070 }
6071
6072 {
6073 bufferlist bl;
11fdf7f2 6074 encode((uint64_t)min_alloc_size, bl);
6075 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6076 }
6077
6078 ondisk_format = latest_ondisk_format;
6079 _prepare_ondisk_format_super(t);
6080 db->submit_transaction_sync(t);
6081 }
6082
6083 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6084 if (r < 0)
6085 goto out_close_fm;
6086
3efd9988 6087 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 6088 if (r < 0)
224ce89b 6089 goto out_close_fm;
6090
6091 if (fsid != old_fsid) {
6092 r = _write_fsid();
6093 if (r < 0) {
6094 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 6095 goto out_close_fm;
6096 }
6097 }
6098
6099 if (out_of_sync_fm.fetch_and(0)) {
6100 _sync_bluefs_and_fm();
6101 }
6102
6103 out_close_fm:
6104 _close_fm();
6105 out_close_db:
6106 _close_db();
6107 out_close_bdev:
6108 _close_bdev();
6109 out_close_fsid:
6110 _close_fsid();
6111 out_path_fd:
6112 _close_path();
6113
6114 if (r == 0 &&
6115 cct->_conf->bluestore_fsck_on_mkfs) {
6116 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6117 if (rc < 0)
6118 return rc;
6119 if (rc > 0) {
6120 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6121 r = -EIO;
6122 }
6123 }
6124
6125 if (r == 0) {
6126 // indicate success by writing the 'mkfs_done' file
6127 r = write_meta("mkfs_done", "yes");
6128 }
6129
6130 if (r < 0) {
6131 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6132 } else {
6133 dout(0) << __func__ << " success" << dendl;
6134 }
6135 return r;
6136}
6137
6138int BlueStore::_mount_for_bluefs()
6139{
6140 int r = _open_path();
6141 ceph_assert(r == 0);
6142 r = _open_fsid(false);
6143 ceph_assert(r == 0);
6144 r = _read_fsid(&fsid);
6145 ceph_assert(r == 0);
6146 r = _lock_fsid();
6147 ceph_assert(r == 0);
6148 r = _open_bluefs(false);
6149 ceph_assert(r == 0);
6150 return r;
6151}
6152
6153void BlueStore::_umount_for_bluefs()
6154{
6155 _close_bluefs();
6156 _close_fsid();
6157 _close_path();
6158}
6159
6160int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6161{
6162 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6163 int r;
6164 ceph_assert(path_fd < 0);
6165
6166 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6167
6168 if (!cct->_conf->bluestore_bluefs) {
6169 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6170 return -EIO;
6171 }
6172
6173 r = _mount_for_bluefs();
6174
6175 int reserved = 0;
6176 if (id == BlueFS::BDEV_NEWWAL) {
6177 string p = path + "/block.wal";
6178 r = _setup_block_symlink_or_file("block.wal", dev_path,
6179 cct->_conf->bluestore_block_wal_size,
6180 true);
6181 ceph_assert(r == 0);
6182
6183 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
eafe8130 6184 cct->_conf->bdev_enable_discard);
6185 ceph_assert(r == 0);
6186
6187 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6188 r = _check_or_set_bdev_label(
6189 p,
6190 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6191 "bluefs wal",
6192 true);
6193 ceph_assert(r == 0);
6194 }
6195
6196 reserved = BDEV_LABEL_BLOCK_SIZE;
6197 } else if (id == BlueFS::BDEV_NEWDB) {
6198 string p = path + "/block.db";
6199 r = _setup_block_symlink_or_file("block.db", dev_path,
6200 cct->_conf->bluestore_block_db_size,
6201 true);
6202 ceph_assert(r == 0);
6203
6204 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
eafe8130 6205 cct->_conf->bdev_enable_discard);
6206 ceph_assert(r == 0);
6207
6208 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6209 r = _check_or_set_bdev_label(
6210 p,
6211 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6212 "bluefs db",
6213 true);
6214 ceph_assert(r == 0);
6215 }
6216 reserved = SUPER_RESERVED;
6217 }
6218
6219 bluefs->umount();
6220 bluefs->mount();
6221
6222 bluefs->add_block_extent(
6223 id,
6224 reserved,
6225 bluefs->get_block_device_size(id) - reserved);
6226
6227 r = bluefs->prepare_new_device(id);
6228 ceph_assert(r == 0);
6229
6230 if (r < 0) {
6231 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6232 } else {
6233 dout(0) << __func__ << " success" << dendl;
6234 }
6235
6236 _umount_for_bluefs();
6237 return r;
6238}
6239
6240int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6241 int id)
6242{
6243 dout(10) << __func__ << " id:" << id << dendl;
6244 ceph_assert(path_fd < 0);
6245
6246 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6247
6248 if (!cct->_conf->bluestore_bluefs) {
6249 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6250 return -EIO;
6251 }
6252
6253 int r = _mount_for_bluefs();
6254
6255 // require bluestore_bluefs_min_free to be free at target device!
6256 uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6257 for(auto src_id : devs_source) {
6258 used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
6259 }
6260 uint64_t target_free = bluefs->get_free(id);
6261 if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
6262 // will need to remount full BlueStore instance to allocate more space
6263 _umount_for_bluefs();
6264
6265 r = mount();
6266 ceph_assert(r == 0);
6267 dout(1) << __func__
6268 << " Allocating more space at slow device for BlueFS: +"
6269 << used_space - target_free << " bytes" << dendl;
6270 r = allocate_bluefs_freespace(
6271 used_space - target_free,
6272 used_space - target_free,
6273 nullptr);
6274
6275 umount();
6276 if (r != 0) {
6277 derr << __func__
6278 << " can't migrate, unable to allocate extra space: "
6279 << used_space - target_free << " at target:" << id
6280 << dendl;
6281 return -ENOSPC;
6282 }
6283
6284 r = _mount_for_bluefs();
6285 ceph_assert(r == 0);
6286 } else if (target_free < used_space) {
6287 derr << __func__
6288 << " can't migrate, free space at target: " << target_free
6289 << " is less than required space: " << used_space
6290 << dendl;
6291 return -ENOSPC;
6292 }
6293 r = bluefs->device_migrate_to_existing(cct, devs_source, id);
6294 if (r < 0) {
6295 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6296 goto shutdown;
6297 }
6298
6299 if (devs_source.count(BlueFS::BDEV_DB)) {
6300 r = unlink(string(path + "/block.db").c_str());
6301 ceph_assert(r == 0);
6302 }
6303 if (devs_source.count(BlueFS::BDEV_WAL)) {
6304 r = unlink(string(path + "/block.wal").c_str());
6305 ceph_assert(r == 0);
6306 }
6307
6308shutdown:
6309 _umount_for_bluefs();
6310 return r;
6311}
6312
6313int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
6314 int id,
6315 const string& dev_path)
6316{
6317 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6318 int r;
6319 ceph_assert(path_fd < 0);
6320
6321 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6322
6323 if (!cct->_conf->bluestore_bluefs) {
6324 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6325 return -EIO;
6326 }
6327
6328 r = _mount_for_bluefs();
6329
6330 int reserved = 0;
6331 string link_db;
6332 string link_wal;
6333 if (devs_source.count(BlueFS::BDEV_DB) &&
6334 bluefs_shared_bdev != BlueFS::BDEV_DB) {
6335 link_db = path + "/block.db";
6336 }
6337 if (devs_source.count(BlueFS::BDEV_WAL)) {
6338 link_wal = path + "/block.wal";
6339 }
6340
6341 size_t target_size;
6342 string target_name;
6343 if (id == BlueFS::BDEV_NEWWAL) {
6344 target_name = "block.wal";
6345 target_size = cct->_conf->bluestore_block_wal_size;
6346
6347 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
eafe8130 6348 cct->_conf->bdev_enable_discard);
6349 ceph_assert(r == 0);
6350
6351 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6352 r = _check_or_set_bdev_label(
6353 dev_path,
6354 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6355 "bluefs wal",
6356 true);
6357 ceph_assert(r == 0);
6358 }
6359 reserved = BDEV_LABEL_BLOCK_SIZE;
6360 } else if (id == BlueFS::BDEV_NEWDB) {
6361 target_name = "block.db";
6362 target_size = cct->_conf->bluestore_block_db_size;
31f18b77 6363
11fdf7f2 6364 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
eafe8130 6365 cct->_conf->bdev_enable_discard);
6366 ceph_assert(r == 0);
6367
6368 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6369 r = _check_or_set_bdev_label(
6370 dev_path,
6371 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6372 "bluefs db",
6373 true);
6374 ceph_assert(r == 0);
6375 }
6376 reserved = SUPER_RESERVED;
6377 }
6378
6379 bluefs->umount();
6380 bluefs->mount();
6381
6382 bluefs->add_block_extent(
6383 id, reserved, bluefs->get_block_device_size(id) - reserved);
6384
6385 r = bluefs->device_migrate_to_new(cct, devs_source, id);
6386
7c673cae 6387 if (r < 0) {
6388 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6389 goto shutdown;
6390 }
6391
6392 if (!link_db.empty()) {
6393 r = unlink(link_db.c_str());
6394 ceph_assert(r == 0);
6395 }
6396 if (!link_wal.empty()) {
6397 r = unlink(link_wal.c_str());
6398 ceph_assert(r == 0);
6399 }
6400 r = _setup_block_symlink_or_file(
6401 target_name,
6402 dev_path,
6403 target_size,
6404 true);
6405 ceph_assert(r == 0);
6406 dout(0) << __func__ << " success" << dendl;
6407
6408shutdown:
6409 _umount_for_bluefs();
6410 return r;
6411}
6412
6413string BlueStore::get_device_path(unsigned id)
6414{
6415 string res;
6416 if (id < BlueFS::MAX_BDEV) {
6417 switch (id) {
6418 case BlueFS::BDEV_WAL:
6419 res = path + "/block.wal";
6420 break;
6421 case BlueFS::BDEV_DB:
6422 if (id == bluefs_shared_bdev) {
6423 res = path + "/block";
6424 } else {
6425 res = path + "/block.db";
6426 }
6427 break;
6428 case BlueFS::BDEV_SLOW:
6429 res = path + "/block";
6430 break;
6431 }
6432 }
6433 return res;
6434}
6435
6436int BlueStore::expand_devices(ostream& out)
6437{
6438 int r = _mount(false);
6439 ceph_assert(r == 0);
6440 bluefs->dump_block_extents(out);
6441 out << "Expanding..." << std::endl;
6442 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
6443 if (devid == bluefs_shared_bdev ) {
6444 continue;
6445 }
6446 uint64_t size = bluefs->get_block_device_size(devid);
6447 if (size == 0) {
6448 // no bdev
6449 continue;
6450 }
6451
6452 interval_set<uint64_t> before;
6453 bluefs->get_block_extents(devid, &before);
6454 ceph_assert(!before.empty());
6455 uint64_t end = before.range_end();
6456 if (end < size) {
6457 out << devid
6458 <<" : expanding " << " from 0x" << std::hex
6459 << end << " to 0x" << size << std::dec << std::endl;
6460 bluefs->add_block_extent(devid, end, size-end);
6461 string p = get_device_path(devid);
6462 const char* path = p.c_str();
6463 if (path == nullptr) {
6464 derr << devid
6465 <<": can't find device path " << dendl;
6466 continue;
6467 }
6468 bluestore_bdev_label_t label;
6469 int r = _read_bdev_label(cct, path, &label);
6470 if (r < 0) {
6471 derr << "unable to read label for " << path << ": "
6472 << cpp_strerror(r) << dendl;
6473 continue;
6474 }
6475 label.size = size;
6476 r = _write_bdev_label(cct, path, label);
6477 if (r < 0) {
6478 derr << "unable to write label for " << path << ": "
6479 << cpp_strerror(r) << dendl;
6480 continue;
6481 }
6482 out << devid
6483 <<" : size label updated to " << size
6484 << std::endl;
6485 }
6486 }
6487 uint64_t size0 = fm->get_size();
6488 uint64_t size = bdev->get_size();
6489 if (size0 < size) {
6490 out << bluefs_shared_bdev
6491 <<" : expanding " << " from 0x" << std::hex
6492 << size0 << " to 0x" << size << std::dec << std::endl;
6493 KeyValueDB::Transaction txn;
6494 txn = db->get_transaction();
6495 int r = fm->expand(size, txn);
6496 ceph_assert(r == 0);
6497 db->submit_transaction_sync(txn);
6498
6499 // always reference to slow device here
6500 string p = get_device_path(BlueFS::BDEV_SLOW);
6501 ceph_assert(!p.empty());
6502 const char* path = p.c_str();
6503 bluestore_bdev_label_t label;
6504 r = _read_bdev_label(cct, path, &label);
6505 if (r < 0) {
6506 derr << "unable to read label for " << path << ": "
6507 << cpp_strerror(r) << dendl;
6508 } else {
6509 label.size = size;
6510 r = _write_bdev_label(cct, path, label);
6511 if (r < 0) {
6512 derr << "unable to write label for " << path << ": "
6513 << cpp_strerror(r) << dendl;
6514 } else {
6515 out << bluefs_shared_bdev
6516 <<" : size label updated to " << size
6517 << std::endl;
6518 }
6519 }
7c673cae 6520 }
11fdf7f2 6521 umount();
6522 return r;
6523}
6524
6525void BlueStore::set_cache_shards(unsigned num)
6526{
6527 dout(10) << __func__ << " " << num << dendl;
6528 size_t old = cache_shards.size();
11fdf7f2 6529 ceph_assert(num >= old);
6530 cache_shards.resize(num);
6531 for (unsigned i = old; i < num; ++i) {
6532 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
6533 logger);
6534 }
6535}
6536
11fdf7f2 6537int BlueStore::_mount(bool kv_only, bool open_db)
6538{
6539 dout(1) << __func__ << " path " << path << dendl;
6540
6541 _kv_only = kv_only;
6542
6543 {
6544 string type;
6545 int r = read_meta("type", &type);
6546 if (r < 0) {
6547 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
6548 << dendl;
6549 return r;
6550 }
6551
6552 if (type != "bluestore") {
6553 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6554 return -EIO;
6555 }
6556 }
6557
6558 if (cct->_conf->bluestore_fsck_on_mount) {
6559 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
6560 if (rc < 0)
6561 return rc;
6562 if (rc > 0) {
6563 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6564 return -EIO;
6565 }
6566 }
6567
6568 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6569 derr << __func__ << " osd_max_object_size "
6570 << cct->_conf->osd_max_object_size << " > bluestore max "
6571 << OBJECT_MAX_SIZE << dendl;
6572 return -EINVAL;
6573 }
6574
6575 int r = _open_path();
6576 if (r < 0)
6577 return r;
6578 r = _open_fsid(false);
6579 if (r < 0)
6580 goto out_path;
6581
6582 r = _read_fsid(&fsid);
6583 if (r < 0)
6584 goto out_fsid;
6585
6586 r = _lock_fsid();
6587 if (r < 0)
6588 goto out_fsid;
6589
6590 r = _open_bdev(false);
6591 if (r < 0)
6592 goto out_fsid;
6593
6594 if (open_db) {
6595 r = _open_db_and_around(false);
6596 } else {
6597 // we can bypass db open exclusively in case of kv_only mode
6598 ceph_assert(kv_only);
6599 r = _open_db(false, true);
6600 if (r < 0)
6601 goto out_bdev;
6602 }
6603
6604 if (kv_only)
6605 return 0;
6606
6607 r = _upgrade_super();
6608 if (r < 0) {
7c673cae 6609 goto out_db;
11fdf7f2 6610 }
6611
6612 r = _open_collections();
6613 if (r < 0)
11fdf7f2 6614 goto out_db;
6615
6616 r = _reload_logger();
6617 if (r < 0)
6618 goto out_coll;
6619
31f18b77 6620 _kv_start();
6621
6622 r = _deferred_replay();
6623 if (r < 0)
6624 goto out_stop;
6625
6626 mempool_thread.init();
6627
6628 if (!per_pool_stat_collection &&
6629 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
6630 dout(1) << __func__ << " quick-fix on mount" << dendl;
6631 _fsck_on_open(FSCK_SHALLOW, true);
6632
6633 //reread statfs
6634 //FIXME minor: replace with actual open/close?
6635 _open_statfs();
6636
6637 _check_legacy_statfs_alert();
6638 }
6639
6640 mounted = true;
6641 return 0;
6642
6643 out_stop:
6644 _kv_stop();
7c673cae 6645 out_coll:
31f18b77 6646 _flush_cache();
7c673cae 6647 out_db:
11fdf7f2 6648 _close_db_and_around();
6649 out_bdev:
6650 _close_bdev();
6651 out_fsid:
6652 _close_fsid();
6653 out_path:
6654 _close_path();
6655 return r;
6656}
6657
6658int BlueStore::umount()
6659{
11fdf7f2 6660 ceph_assert(_kv_only || mounted);
6661 dout(1) << __func__ << dendl;
6662
6663 _osr_drain_all();
7c673cae 6664
7c673cae 6665 mounted = false;
6666 if (!_kv_only) {
6667 mempool_thread.shutdown();
6668 dout(20) << __func__ << " stopping kv thread" << dendl;
6669 _kv_stop();
6670 _flush_cache();
6671 dout(20) << __func__ << " closing" << dendl;
6672
3efd9988 6673 }
11fdf7f2 6674 _close_db_and_around();
6675 _close_bdev();
6676 _close_fsid();
6677 _close_path();
6678
6679 if (cct->_conf->bluestore_fsck_on_umount) {
6680 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
6681 if (rc < 0)
6682 return rc;
6683 if (rc > 0) {
6684 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6685 return -EIO;
6686 }
6687 }
6688 return 0;
6689}
6690
6691int BlueStore::cold_open()
6692{
6693 int r = _open_path();
6694 if (r < 0)
6695 return r;
6696 r = _open_fsid(false);
6697 if (r < 0)
6698 goto out_path;
6699
6700 r = _read_fsid(&fsid);
6701 if (r < 0)
6702 goto out_fsid;
6703
6704 r = _lock_fsid();
6705 if (r < 0)
6706 goto out_fsid;
6707
6708 r = _open_bdev(false);
6709 if (r < 0)
6710 goto out_fsid;
6711 r = _open_db_and_around(true);
6712 if (r < 0) {
6713 goto out_bdev;
6714 }
6715 return 0;
6716 out_bdev:
6717 _close_bdev();
6718 out_fsid:
6719 _close_fsid();
6720 out_path:
6721 _close_path();
6722 return r;
6723}
6724int BlueStore::cold_close()
6725{
6726 _close_db_and_around();
6727 _close_bdev();
6728 _close_fsid();
6729 _close_path();
6730 return 0;
6731}
6732
6733static void apply(uint64_t off,
6734 uint64_t len,
6735 uint64_t granularity,
6736 BlueStore::mempool_dynamic_bitset &bitset,
6737 std::function<void(uint64_t,
6738 BlueStore::mempool_dynamic_bitset &)> f) {
11fdf7f2 6739 auto end = round_up_to(off + len, granularity);
6740 while (off < end) {
6741 uint64_t pos = off / granularity;
6742 f(pos, bitset);
6743 off += granularity;
6744 }
6745}
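// Minimal usage sketch (hypothetical values): mark a 12 KiB extent at
// offset 0x2000 as used in a bitmap with 4 KiB granularity; since the end
// is rounded up to the granularity, positions 2, 3 and 4 get set:
//
//   apply(0x2000, 0x3000, 0x1000, used_blocks,
//         [&](uint64_t pos, BlueStore::mempool_dynamic_bitset &bs) {
//           bs.set(pos);
//         });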
6746
6747int _fsck_sum_extents(
6748 const PExtentVector& extents,
6749 bool compressed,
6750 store_statfs_t& expected_statfs)
6751{
6752 for (auto e : extents) {
6753 if (!e.is_valid())
6754 continue;
6755 expected_statfs.allocated += e.length;
6756 if (compressed) {
6757 expected_statfs.data_compressed_allocated += e.length;
6758 }
6759 }
6760 return 0;
6761}
6762
7c673cae 6763int BlueStore::_fsck_check_extents(
11fdf7f2 6764 const coll_t& cid,
6765 const ghobject_t& oid,
6766 const PExtentVector& extents,
6767 bool compressed,
6768 mempool_dynamic_bitset &used_blocks,
b32b8144 6769 uint64_t granularity,
11fdf7f2 6770 BlueStoreRepairer* repairer,
6771 store_statfs_t& expected_statfs,
6772 FSCKDepth depth)
6773{
6774 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
6775 int errors = 0;
6776 for (auto e : extents) {
6777 if (!e.is_valid())
6778 continue;
6779 expected_statfs.allocated += e.length;
6780 if (compressed) {
11fdf7f2 6781 expected_statfs.data_compressed_allocated += e.length;
7c673cae 6782 }
6783 if (depth != FSCK_SHALLOW) {
6784 bool already = false;
6785 apply(
6786 e.offset, e.length, granularity, used_blocks,
6787 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6788 ceph_assert(pos < bs.size());
6789 if (bs.test(pos)) {
6790 if (repairer) {
6791 repairer->note_misreference(
6792 pos * min_alloc_size, min_alloc_size, !already);
6793 }
6794 if (!already) {
6795 derr << "fsck error: " << oid << " extent " << e
6796 << " or a subset is already allocated (misreferenced)" << dendl;
6797 ++errors;
6798 already = true;
6799 }
11fdf7f2 6800 }
6801 else
6802 bs.set(pos);
6803 });
6804 if (repairer) {
6805 repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid);
6806 }
11fdf7f2 6807
6808 if (e.end() > bdev->get_size()) {
6809 derr << "fsck error: " << oid << " extent " << e
6810 << " past end of block device" << dendl;
6811 ++errors;
6812 }
6813 }
6814 }
6815 return errors;
6816}
6817
6818void BlueStore::_fsck_check_pool_statfs(
6819 BlueStore::per_pool_statfs& expected_pool_statfs,
6820 int64_t& errors,
6821 int64_t& warnings,
6822 BlueStoreRepairer* repairer)
6823{
6824 auto it = db->get_iterator(PREFIX_STAT);
6825 if (it) {
6826 for (it->lower_bound(string()); it->valid(); it->next()) {
6827 string key = it->key();
6828 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
6829 if (repairer) {
6830 ++errors;
6831 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
6832 derr << "fsck error: " << "legacy statfs record found, removing"
6833 << dendl;
6834 }
6835 continue;
6836 }
6837 uint64_t pool_id;
6838 if (get_key_pool_stat(key, &pool_id) < 0) {
6839 derr << "fsck error: bad key " << key
6840 << "in statfs namespece" << dendl;
6841 if (repairer) {
6842 repairer->remove_key(db, PREFIX_STAT, key);
6843 }
6844 ++errors;
6845 continue;
6846 }
6847
6848 volatile_statfs vstatfs;
6849 bufferlist bl = it->value();
6850 auto blp = bl.cbegin();
6851 try {
6852 vstatfs.decode(blp);
6853 } catch (buffer::error& e) {
6854 derr << "fsck error: failed to decode Pool StatFS record"
6855 << pretty_binary_string(key) << dendl;
6856 if (repairer) {
6857 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
6858 << pretty_binary_string(key)
6859 << "', removing" << dendl;
6860 repairer->remove_key(db, PREFIX_STAT, key);
6861 }
6862 ++errors;
6863 vstatfs.reset();
6864 }
6865 auto stat_it = expected_pool_statfs.find(pool_id);
6866 if (stat_it == expected_pool_statfs.end()) {
6867 if (vstatfs.is_empty()) {
6868 // we don't consider that as an error since empty pool statfs
6869 // are left in DB for now
6870 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
6871 << std::hex << pool_id << std::dec << dendl;
6872 if (repairer) {
6873 // but we need to increment error count in case of repair
6874 // to have proper counters at the end
6875 // (as repairer increments recovery counter anyway).
6876 ++errors;
6877 }
6878 } else {
6879 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
6880 << std::hex << pool_id << std::dec << dendl;
6881 ++errors;
6882 }
6883 if (repairer) {
6884 repairer->remove_key(db, PREFIX_SHARED_BLOB, key);
6885 }
6886 continue;
6887 }
6888 store_statfs_t statfs;
6889 vstatfs.publish(&statfs);
6890 if (!(stat_it->second == statfs)) {
6891 derr << "fsck error: actual " << statfs
6892 << " != expected " << stat_it->second
6893 << " for pool "
6894 << std::hex << pool_id << std::dec << dendl;
6895 if (repairer) {
6896 repairer->fix_statfs(db, key, stat_it->second);
6897 }
6898 ++errors;
6899 }
6900 expected_pool_statfs.erase(stat_it);
6901 }
6902 } // if (it)
6903 for (auto& s : expected_pool_statfs) {
6904 if (s.second.is_zero()) {
6905 // we might lack empty statfs recs in DB
6906 continue;
6907 }
6908 derr << "fsck error: missing Pool StatFS record for pool "
eafe8130 6909 << std::hex << s.first << std::dec << dendl;
6910 if (repairer) {
6911 string key;
6912 get_pool_stat_key(s.first, &key);
6913 repairer->fix_statfs(db, key, s.second);
6914 }
6915 ++errors;
6916 }
6917 if (!per_pool_stat_collection &&
6918 cct->_conf->bluestore_fsck_error_on_no_per_pool_stats &&
6919 repairer) {
6920 // by virtue of running this method, we correct the top-level
6921 // error of having global stats
6922 repairer->inc_repaired();
6923 }
6924}
6925
6926BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
6927 BlueStore::FSCKDepth depth,
6928 int64_t pool_id,
6929 BlueStore::CollectionRef c,
6930 const ghobject_t& oid,
6931 const string& key,
6932 const bufferlist& value,
6933 mempool::bluestore_fsck::list<string>& expecting_shards,
6934 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
6935 const BlueStore::FSCK_ObjectCtx& ctx)
6936{
6937 auto& errors = ctx.errors;
6938 auto& num_objects = ctx.num_objects;
6939 auto& num_extents = ctx.num_extents;
6940 auto& num_blobs = ctx.num_blobs;
6941 auto& num_sharded_objects = ctx.num_sharded_objects;
6942 auto& num_spanning_blobs = ctx.num_spanning_blobs;
6943 auto used_blocks = ctx.used_blocks;
6944 auto sb_info_lock = ctx.sb_info_lock;
6945 auto& sb_info = ctx.sb_info;
6946 auto repairer = ctx.repairer;
6947
6948 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
6949 &ctx.expected_pool_statfs[pool_id] :
6950 &ctx.expected_store_statfs;
6951
6952 dout(10) << __func__ << " " << oid << dendl;
6953 OnodeRef o;
6954 o.reset(Onode::decode(c, oid, key, value));
6955 ++num_objects;
7c673cae 6956
eafe8130 6957 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7c673cae 6958
6959 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
6960 _dump_onode<30>(cct, *o);
6961 // shards
6962 if (!o->extent_map.shards.empty()) {
6963 ++num_sharded_objects;
6964 if (depth != FSCK_SHALLOW) {
6965 for (auto& s : o->extent_map.shards) {
6966 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
6967 expecting_shards.push_back(string());
6968 get_extent_shard_key(o->key, s.shard_info->offset,
6969 &expecting_shards.back());
6970 if (s.shard_info->offset >= o->onode.size) {
6971 derr << "fsck error: " << oid << " shard 0x" << std::hex
6972 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
6973 << std::dec << dendl;
6974 ++errors;
6975 }
6976 }
6977 }
6978 }
7c673cae 6979
6980 // lextents
6981 uint64_t pos = 0;
6982 mempool::bluestore_fsck::map<BlobRef,
6983 bluestore_blob_use_tracker_t> ref_map;
6984 for (auto& l : o->extent_map.extent_map) {
6985 dout(20) << __func__ << " " << l << dendl;
6986 if (l.logical_offset < pos) {
6987 derr << "fsck error: " << oid << " lextent at 0x"
6988 << std::hex << l.logical_offset
6989 << " overlaps with the previous, which ends at 0x" << pos
6990 << std::dec << dendl;
6991 ++errors;
6992 }
6993 if (depth != FSCK_SHALLOW &&
6994 o->extent_map.spans_shard(l.logical_offset, l.length)) {
6995 derr << "fsck error: " << oid << " lextent at 0x"
6996 << std::hex << l.logical_offset << "~" << l.length
6997 << " spans a shard boundary"
6998 << std::dec << dendl;
6999 ++errors;
7000 }
7001 pos = l.logical_offset + l.length;
7002 res_statfs->data_stored += l.length;
7003 ceph_assert(l.blob);
7004 const bluestore_blob_t& blob = l.blob->get_blob();
7005
7006 auto& ref = ref_map[l.blob];
7007 if (ref.is_empty()) {
7008 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7009 uint32_t l = blob.get_logical_length();
7010 ref.init(l, min_release_size);
7011 }
7012 ref.get(
7013 l.blob_offset,
7014 l.length);
7015 ++num_extents;
7016 if (depth != FSCK_SHALLOW &&
7017 blob.has_unused()) {
7018 ceph_assert(referenced);
7019 auto p = referenced->find(l.blob);
7020 bluestore_blob_t::unused_t* pu;
7021 if (p == referenced->end()) {
7022 pu = &(*referenced)[l.blob];
7023 }
7024 else {
7025 pu = &p->second;
7026 }
7027 uint64_t blob_len = blob.get_logical_length();
7028 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
7029 ceph_assert(l.blob_offset + l.length <= blob_len);
7030 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
7031 uint64_t start = l.blob_offset / chunk_size;
7032 uint64_t end =
7033 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7034 for (auto i = start; i < end; ++i) {
7035 (*pu) |= (1u << i);
7036 }
7037 }
7038 } //for (auto& l : o->extent_map.extent_map)
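  // Worked example for the unused-bitmap arithmetic above (assuming
  // unused_t is a 16-bit bitmap, so sizeof(*pu) * 8 == 16): blob_len =
  // 64 KiB gives chunk_size = 4 KiB; an lextent at blob_offset = 8 KiB
  // with length = 12 KiB sets bits 2..4 (start = 2, end =
  // round_up_to(20 KiB, 4 KiB) / 4 KiB = 5).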
7039
7040 for (auto& i : ref_map) {
7041 ++num_blobs;
7042 const bluestore_blob_t& blob = i.first->get_blob();
7043 bool equal =
7044 depth == FSCK_SHALLOW ? true :
7045 i.first->get_blob_use_tracker().equal(i.second);
7046 if (!equal) {
7047 derr << "fsck error: " << oid << " blob " << *i.first
7048 << " doesn't match expected ref_map " << i.second << dendl;
7049 ++errors;
7050 }
7051 if (blob.is_compressed()) {
7052 res_statfs->data_compressed += blob.get_compressed_payload_length();
7053 res_statfs->data_compressed_original +=
7054 i.first->get_referenced_bytes();
7055 }
7056 if (blob.is_shared()) {
7057 if (i.first->shared_blob->get_sbid() > blobid_max) {
7058 derr << "fsck error: " << oid << " blob " << blob
7059 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7060 << blobid_max << dendl;
7061 ++errors;
7062 }
7063 else if (i.first->shared_blob->get_sbid() == 0) {
7064 derr << "fsck error: " << oid << " blob " << blob
7065 << " marked as shared but has uninitialized sbid"
7066 << dendl;
7067 ++errors;
7068 }
7069 // the below lock is optional and provided in multithreading mode only
7070 if (sb_info_lock) {
7071 sb_info_lock->lock();
7072 }
7073 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
7074 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7075 ceph_assert(sbi.pool_id == INT64_MIN ||
7076 sbi.pool_id == oid.hobj.get_logical_pool());
7077 sbi.cid = c->cid;
7078 sbi.pool_id = oid.hobj.get_logical_pool();
7079 sbi.sb = i.first->shared_blob;
7080 sbi.oids.push_back(oid);
7081 sbi.compressed = blob.is_compressed();
7082 for (auto e : blob.get_extents()) {
7083 if (e.is_valid()) {
7084 sbi.ref_map.get(e.offset, e.length);
7085 }
7086 }
7087 if (sb_info_lock) {
7088 sb_info_lock->unlock();
7089 }
7090 } else if (depth != FSCK_SHALLOW) {
7091 ceph_assert(used_blocks);
7092 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7093 blob.is_compressed(),
7094 *used_blocks,
7095 fm->get_alloc_size(),
7096 repairer,
7097 *res_statfs,
7098 depth);
7099 } else {
7100 errors += _fsck_sum_extents(
7101 blob.get_extents(),
7102 blob.is_compressed(),
7103 *res_statfs);
7104 }
7105 } // for (auto& i : ref_map)
7106 return o;
7107}
7108
7109#include "common/WorkQueue.h"
7110
7111class ShallowFSCKThreadPool : public ThreadPool
7112{
7113public:
7114 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
7115 ThreadPool(cct_, nm, tn, n) {
7116 }
7117 void worker(ThreadPool::WorkThread* wt) override {
7118 int next_wq = 0;
7119 while (!_stop) {
7120 next_wq %= work_queues.size();
7121 WorkQueue_ *wq = work_queues[next_wq++];
7122
7123 void* item = wq->_void_dequeue();
7124 if (item) {
7125 processing++;
7126 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
7127 wq->_void_process(item, tp_handle);
7128 processing--;
7129 }
7130 }
7131 }
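  // Added commentary: the loop above polls the registered work queues
  // round-robin without blocking; when every batch is empty it simply
  // spins until _stop is set. That is tolerable here because the pool
  // only lives for the duration of a shallow fsck pass.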
7132 template <size_t BatchLen>
7133 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
7134 {
7135 struct Entry {
7136 int64_t pool_id;
7137 BlueStore::CollectionRef c;
7138 ghobject_t oid;
7139 string key;
7140 bufferlist value;
7141 };
7142 struct Batch {
7143 std::atomic<size_t> running = { 0 };
7144 size_t entry_count = 0;
7145 std::array<Entry, BatchLen> entries;
7146
7147 int64_t errors = 0;
7148 int64_t warnings = 0;
7149 uint64_t num_objects = 0;
7150 uint64_t num_extents = 0;
7151 uint64_t num_blobs = 0;
7152 uint64_t num_sharded_objects = 0;
7153 uint64_t num_spanning_blobs = 0;
7154 store_statfs_t expected_store_statfs;
7155 BlueStore::per_pool_statfs expected_pool_statfs;
7156 };
7157
7158 size_t batchCount;
7159 BlueStore* store = nullptr;
7160
7161 mempool::bluestore_fsck::list<string>* expecting_shards = nullptr;
7162 ceph::mutex* sb_info_lock = nullptr;
7163 BlueStore::sb_info_map_t* sb_info = nullptr;
7164 BlueStoreRepairer* repairer = nullptr;
7165
7166 Batch* batches = nullptr;
7167 size_t last_batch_pos = 0;
7168 bool batch_acquired = false;
7169
7170 FSCKWorkQueue(std::string n,
7171 size_t _batchCount,
7172 BlueStore* _store,
7173 mempool::bluestore_fsck::list<string>& _expecting_shards,
7174 ceph::mutex* _sb_info_lock,
7175 BlueStore::sb_info_map_t& _sb_info,
7176 BlueStoreRepairer* _repairer) :
7177 WorkQueue_(n, time_t(), time_t()),
7178 batchCount(_batchCount),
7179 store(_store),
7180 expecting_shards(&_expecting_shards),
7181 sb_info_lock(_sb_info_lock),
7182 sb_info(&_sb_info),
7183 repairer(_repairer)
7184 {
7185 batches = new Batch[batchCount];
7186 }
7187 ~FSCKWorkQueue() {
7188 delete[] batches;
7189 }
7190
7191 /// Remove all work items from the queue.
7192 void _clear() override {
7193 //do nothing
7194 }
7195 /// Check whether there is anything to do.
7196 bool _empty() override {
7197 ceph_assert(false);
7198 }
7199
7200 /// Get the next work item to process.
7201 void* _void_dequeue() override {
7202 size_t pos = rand() % batchCount;
7203 size_t pos0 = pos;
7204 do {
7205 auto& batch = batches[pos];
7206 if (batch.running.fetch_add(1) == 0) {
7207 if (batch.entry_count) {
7208 return &batch;
7209 }
7210 }
7211 batch.running--;
7212 pos++;
7213 pos %= batchCount;
7214 } while (pos != pos0);
7215 return nullptr;
7216 }
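    // Added commentary: batch.running doubles as a claim counter. A worker
    // that observes fetch_add(1) == 0 is the sole owner of the batch and
    // may drain it; any other value means the producer or another worker
    // holds it, so we back off (decrement) and probe the next slot. The
    // random starting position spreads workers across the batch array.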
7217 /** @brief Process the work item.
7218 * This function will be called several times in parallel
7219 * and must therefore be thread-safe. */
7220 void _void_process(void* item, TPHandle& handle) override {
7221 Batch* batch = (Batch*)item;
7222
7223 BlueStore::FSCK_ObjectCtx ctx(
7224 batch->errors,
7225 batch->warnings,
7226 batch->num_objects,
7227 batch->num_extents,
7228 batch->num_blobs,
7229 batch->num_sharded_objects,
7230 batch->num_spanning_blobs,
7231 nullptr, // used_blocks
7232 nullptr, // used_omap_head;
7233 nullptr, // used_per_pool_omap_head;
7234 nullptr, // used_pgmeta_omap_head;
7235 sb_info_lock,
7236 *sb_info,
7237 batch->expected_store_statfs,
7238 batch->expected_pool_statfs,
7239 repairer);
7240
7241 for (size_t i = 0; i < batch->entry_count; i++) {
7242 auto& entry = batch->entries[i];
7243
7244 store->fsck_check_objects_shallow(
7245 BlueStore::FSCK_SHALLOW,
7246 entry.pool_id,
7247 entry.c,
7248 entry.oid,
7249 entry.key,
7250 entry.value,
7251 *expecting_shards,
7252 nullptr, // referenced
7253 ctx);
7254 }
7255 //std::cout << "processed " << batch << std::endl;
7256 batch->entry_count = 0;
7257 batch->running--;
7258 }
7259 /** @brief Synchronously finish processing a work item.
7260 * This function is called after _void_process with the global thread pool lock held,
7261 * so at most one copy will execute simultaneously for a given thread pool.
7262 * It can be used for non-thread-safe finalization. */
7263 void _void_process_finish(void*) override {
7264 ceph_assert(false);
7265 }
7266
7267 bool queue(
7268 int64_t pool_id,
7269 BlueStore::CollectionRef c,
7270 const ghobject_t& oid,
7271 const string& key,
7272 const bufferlist& value) {
7273 bool res = false;
7274 size_t pos0 = last_batch_pos;
7275 if (!batch_acquired) {
7276 do {
7277 auto& batch = batches[last_batch_pos];
7278 if (batch.running.fetch_add(1) == 0) {
7279 if (batch.entry_count < BatchLen) {
7280 batch_acquired = true;
7281 break;
7282 }
7283 }
7284 batch.running.fetch_sub(1);
7285 last_batch_pos++;
7286 last_batch_pos %= batchCount;
7287 } while (last_batch_pos != pos0);
7288 }
7289 if (batch_acquired) {
7290 auto& batch = batches[last_batch_pos];
7291 ceph_assert(batch.running);
7292 ceph_assert(batch.entry_count < BatchLen);
7293
7294 auto& entry = batch.entries[batch.entry_count];
7295 entry.pool_id = pool_id;
7296 entry.c = c;
7297 entry.oid = oid;
7298 entry.key = key;
7299 entry.value = value;
7300
7301 ++batch.entry_count;
7302 if (batch.entry_count == BatchLen) {
7303 batch_acquired = false;
7304 batch.running.fetch_sub(1);
7305 last_batch_pos++;
7306 last_batch_pos %= batchCount;
7307 }
7308 res = true;
7309 }
7310 return res;
7311 }
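    // Added commentary: queue() is meant for a single producer thread (the
    // PREFIX_OBJ iterator below); last_batch_pos and batch_acquired are
    // deliberately unsynchronized. Once a batch is full the producer just
    // drops its running claim, which makes the batch claimable in
    // _void_dequeue(); partially filled batches are drained by finalize().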
7312
7313 void finalize(ThreadPool& tp,
7314 BlueStore::FSCK_ObjectCtx& ctx) {
7315 if (batch_acquired) {
7316 auto& batch = batches[last_batch_pos];
7317 ceph_assert(batch.running);
7318 batch.running.fetch_sub(1);
7319 }
7320 tp.stop();
7321
7322 for (size_t i = 0; i < batchCount; i++) {
7323 auto& batch = batches[i];
7324
7325 //process leftovers if any
7326 if (batch.entry_count) {
7327 TPHandle tp_handle(store->cct,
7328 nullptr,
7329 timeout_interval,
7330 suicide_interval);
7331 ceph_assert(batch.running == 0);
7332
 7333	  batch.running++; // just to stay on par with the regular call path
7334 _void_process(&batch, tp_handle);
7335 }
7336 ceph_assert(batch.entry_count == 0);
7337
7338 ctx.errors += batch.errors;
7339 ctx.warnings += batch.warnings;
7340 ctx.num_objects += batch.num_objects;
7341 ctx.num_extents += batch.num_extents;
7342 ctx.num_blobs += batch.num_blobs;
7343 ctx.num_sharded_objects += batch.num_sharded_objects;
7344 ctx.num_spanning_blobs += batch.num_spanning_blobs;
7345 ctx.expected_store_statfs.add(batch.expected_store_statfs);
7346
7347 for (auto it = batch.expected_pool_statfs.begin();
7348 it != batch.expected_pool_statfs.end();
7349 it++) {
7350 ctx.expected_pool_statfs[it->first].add(it->second);
7351 }
7352 }
7353 }
7354 };
7355};
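// Illustrative wiring sketch (a minimal outline; the authoritative use is
// in _fsck_check_objects() below, whose names this mirrors):
//
//   ShallowFSCKThreadPool tp(cct, "ShallowFSCKThreadPool", "ShallowFSCK", n);
//   ShallowFSCKThreadPool::FSCKWorkQueue<256> wq(
//     "FSCKWorkQueue", n * 32, store,
//     expecting_shards, &sb_info_lock, sb_info, repairer);
//   tp.add_work_queue(&wq);
//   tp.start();
//   // single producer:
//   while (more_onode_records)
//     wq.queue(pool_id, c, oid, key, value); // false -> process inline
//   wq.finalize(tp, ctx); // drains leftovers, stops tp, folds counters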
7356
7357void BlueStore::_fsck_check_objects(FSCKDepth depth,
7358 BlueStore::FSCK_ObjectCtx& ctx)
7359{
7360 //no need for the below lock when in non-shallow mode as
7361 // there is no multithreading in this case
7362 if (depth != FSCK_SHALLOW) {
7363 ctx.sb_info_lock = nullptr;
7364 }
7365
7366 auto& errors = ctx.errors;
7367 auto used_omap_head = ctx.used_omap_head;
7368 auto used_pgmeta_omap_head = ctx.used_pgmeta_omap_head;
7369 auto sb_info_lock = ctx.sb_info_lock;
7370 auto& sb_info = ctx.sb_info;
7371 auto repairer = ctx.repairer;
7372
7373 uint64_t_btree_t used_nids;
7374
7375 size_t processed_myself = 0;
7376
7377 auto it = db->get_iterator(PREFIX_OBJ);
7378 mempool::bluestore_fsck::list<string> expecting_shards;
7379 if (it) {
7380 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
7381 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
7382 std::unique_ptr<WQ> wq(
7383 new WQ(
7384 "FSCKWorkQueue",
 7385	 (thread_count ? : 1) * 32, // GNU "?:" extension: at least one batch group
7386 this,
7387 expecting_shards,
7388 sb_info_lock,
7389 sb_info,
7390 repairer));
7391
7392 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
7393
7394 thread_pool.add_work_queue(wq.get());
7395 if (depth == FSCK_SHALLOW && thread_count > 0) {
7396 //not the best place but let's check anyway
7397 ceph_assert(sb_info_lock);
7398 thread_pool.start();
7399 }
7400
 7402	 //fill global if not overridden below
7402 CollectionRef c;
7403 int64_t pool_id = -1;
7404 spg_t pgid;
7405 for (it->lower_bound(string()); it->valid(); it->next()) {
7406 dout(30) << __func__ << " key "
7407 << pretty_binary_string(it->key()) << dendl;
7408 if (is_extent_shard_key(it->key())) {
7409 if (depth == FSCK_SHALLOW) {
7410 continue;
7411 }
7412 while (!expecting_shards.empty() &&
7413 expecting_shards.front() < it->key()) {
7414 derr << "fsck error: missing shard key "
7415 << pretty_binary_string(expecting_shards.front())
7416 << dendl;
7417 ++errors;
7418 expecting_shards.pop_front();
7419 }
7420 if (!expecting_shards.empty() &&
7421 expecting_shards.front() == it->key()) {
7422 // all good
7423 expecting_shards.pop_front();
7424 continue;
7425 }
7426
7427 uint32_t offset;
7428 string okey;
7429 get_key_extent_shard(it->key(), &okey, &offset);
7430 derr << "fsck error: stray shard 0x" << std::hex << offset
7431 << std::dec << dendl;
7432 if (expecting_shards.empty()) {
7433 derr << "fsck error: " << pretty_binary_string(it->key())
7434 << " is unexpected" << dendl;
7435 ++errors;
7436 continue;
7437 }
7438 while (expecting_shards.front() > it->key()) {
7439 derr << "fsck error: saw " << pretty_binary_string(it->key())
7440 << dendl;
7441 derr << "fsck error: exp "
7442 << pretty_binary_string(expecting_shards.front()) << dendl;
7443 ++errors;
7444 expecting_shards.pop_front();
7445 if (expecting_shards.empty()) {
7446 break;
7447 }
7448 }
7449 continue;
7450 }
7451
7452 ghobject_t oid;
7453 int r = get_key_object(it->key(), &oid);
7454 if (r < 0) {
7455 derr << "fsck error: bad object key "
7456 << pretty_binary_string(it->key()) << dendl;
7457 ++errors;
7458 continue;
7459 }
7460 if (!c ||
7461 oid.shard_id != pgid.shard ||
7462 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7463 !c->contains(oid)) {
7464 c = nullptr;
7465 for (auto& p : coll_map) {
7466 if (p.second->contains(oid)) {
7467 c = p.second;
7468 break;
7469 }
7470 }
7471 if (!c) {
7472 derr << "fsck error: stray object " << oid
7473 << " not owned by any collection" << dendl;
7474 ++errors;
7475 continue;
7476 }
7477 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
7478 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
7479 << dendl;
7480 }
7481
7482 if (depth != FSCK_SHALLOW &&
7483 !expecting_shards.empty()) {
7484 for (auto& k : expecting_shards) {
7485 derr << "fsck error: missing shard key "
7486 << pretty_binary_string(k) << dendl;
7487 }
7488 ++errors;
7489 expecting_shards.clear();
7490 }
7491
7492 bool queued = false;
7493 if (depth == FSCK_SHALLOW && thread_count > 0) {
7494 queued = wq->queue(
7495 pool_id,
7496 c,
7497 oid,
7498 it->key(),
7499 it->value());
7500 }
7501 OnodeRef o;
7502 map<BlobRef, bluestore_blob_t::unused_t> referenced;
7503
7504 if (!queued) {
7505 ++processed_myself;
7506
7507 o = fsck_check_objects_shallow(
7508 depth,
7509 pool_id,
7510 c,
7511 oid,
7512 it->key(),
7513 it->value(),
7514 expecting_shards,
7515 &referenced,
7516 ctx);
7517 }
7518
7519 if (depth != FSCK_SHALLOW) {
7520 ceph_assert(o != nullptr);
7521 if (o->onode.nid) {
7522 if (o->onode.nid > nid_max) {
7523 derr << "fsck error: " << oid << " nid " << o->onode.nid
7524 << " > nid_max " << nid_max << dendl;
7525 ++errors;
7526 }
7527 if (used_nids.count(o->onode.nid)) {
7528 derr << "fsck error: " << oid << " nid " << o->onode.nid
7529 << " already in use" << dendl;
7530 ++errors;
7531 continue; // go for next object
7532 }
7533 used_nids.insert(o->onode.nid);
7534 }
7535 for (auto& i : referenced) {
7536 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
7537 << std::dec << " for " << *i.first << dendl;
7538 const bluestore_blob_t& blob = i.first->get_blob();
7539 if (i.second & blob.unused) {
7540 derr << "fsck error: " << oid << " blob claims unused 0x"
7541 << std::hex << blob.unused
7542 << " but extents reference 0x" << i.second << std::dec
7543 << " on blob " << *i.first << dendl;
7544 ++errors;
7545 }
7546 if (blob.has_csum()) {
7547 uint64_t blob_len = blob.get_logical_length();
7548 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
7549 unsigned csum_count = blob.get_csum_count();
7550 unsigned csum_chunk_size = blob.get_csum_chunk_size();
7551 for (unsigned p = 0; p < csum_count; ++p) {
7552 unsigned pos = p * csum_chunk_size;
7553 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
7554 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
7555 unsigned mask = 1u << firstbit;
7556 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
7557 mask |= 1u << b;
7558 }
7559 if ((blob.unused & mask) == mask) {
7560 // this csum chunk region is marked unused
7561 if (blob.get_csum_item(p) != 0) {
7562 derr << "fsck error: " << oid
7563 << " blob claims csum chunk 0x" << std::hex << pos
7564 << "~" << csum_chunk_size
7565 << " is unused (mask 0x" << mask << " of unused 0x"
7566 << blob.unused << ") but csum is non-zero 0x"
7567 << blob.get_csum_item(p) << std::dec << " on blob "
7568 << *i.first << dendl;
7569 ++errors;
7570 }
7571 }
7572 }
7573 }
7574 }
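	// Added commentary (worked example, illustrative values): for a blob
	// of logical length 0x10000 with a 64-bit unused mask,
	// unused_chunk_size is 0x10000 / 64 = 0x400. With csum_chunk_size
	// 0x1000 each csum item p spans unused bits [4p, 4p+3]; if all four
	// bits are set (region marked unused) yet the csum item is non-zero,
	// the check above flags an error.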
7575 // omap
7576 if (o->onode.has_omap()) {
7577 ceph_assert(used_omap_head);
7578 ceph_assert(used_pgmeta_omap_head);
7579 auto m =
7580 o->onode.is_pgmeta_omap() ? used_pgmeta_omap_head : used_omap_head;
7581 if (m->count(o->onode.nid)) {
7582 derr << "fsck error: " << oid << " omap_head " << o->onode.nid
7583 << " already in use" << dendl;
7584 ++errors;
7585 } else {
7586 m->insert(o->onode.nid);
7587 }
7588 }
7589 if (depth == FSCK_DEEP) {
7590 bufferlist bl;
7591 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
7592 uint64_t offset = 0;
7593 do {
7594 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
7595 int r = _do_read(c.get(), o, offset, l, bl,
7596 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
7597 if (r < 0) {
7598 ++errors;
7599 derr << "fsck error: " << oid << std::hex
7600 << " error during read: "
7601 << " " << offset << "~" << l
7602 << " " << cpp_strerror(r) << std::dec
7603 << dendl;
7604 break;
7605 }
7606 offset += l;
7607 } while (offset < o->onode.size);
7608 } // deep
7609 } //if (depth != FSCK_SHALLOW)
7610 } // for (it->lower_bound(string()); it->valid(); it->next())
7611 if (depth == FSCK_SHALLOW && thread_count > 0) {
7612 wq->finalize(thread_pool, ctx);
7613 if (processed_myself) {
 7614	    // maybe we need more threads?
 7615	    dout(0) << __func__ << " partial offload"
 7616	      << ", done myself " << processed_myself
 7617	      << " of " << ctx.num_objects
 7618	      << " objects, threads " << thread_count
7619 << dendl;
7620 }
7621 }
7622 } // if (it)
7623}
7624/**
7625An overview of the currently implemented repair logic,
7626performed in fsck in two stages: detection(+preparation) and commit.
7627Detection stage (in processing order):
7628 (Issue -> Repair action to schedule)
7629 - Detect undecodable keys for Shared Blobs -> Remove
7630 - Detect undecodable records for Shared Blobs -> Remove
7631 (might trigger missed Shared Blob detection below)
7632 - Detect stray records for Shared Blobs -> Remove
7633 - Detect misreferenced pextents -> Fix
7634 Prepare Bloom-like filter to track cid/oid -> pextent
7635 Prepare list of extents that are improperly referenced
7636 Enumerate Onode records that might use 'misreferenced' pextents
7637 (Bloom-like filter applied to reduce computation)
 7638 For each questionable Onode enumerate all blobs and identify broken ones
7639 (i.e. blobs having 'misreferences')
7640 Rewrite each broken blob data by allocating another extents and
7641 copying data there
7642 If blob is shared - unshare it and mark corresponding Shared Blob
7643 for removal
7644 Release previously allocated space
7645 Update Extent Map
7646 - Detect missed Shared Blobs -> Recreate
7647 - Detect undecodable deferred transaction -> Remove
7648 - Detect Freelist Manager's 'false free' entries -> Mark as used
7649 - Detect Freelist Manager's leaked entries -> Mark as free
 7650 - Detect statfs inconsistency -> Update
7651 Commit stage (separate DB commit per each step):
7652 - Apply leaked FM entries fix
7653 - Apply 'false free' FM entries fix
7654 - Apply 'Remove' actions
 7655 - Apply fix for misreferenced pextents
 7656 - Apply Shared Blob recreate
 7657 (can be merged with the step above if misreferences were detected)
7658 - Apply StatFS update
7659*/
7660int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
7661{
7662 dout(1) << __func__
7663 << (repair ? " repair" : " check")
7664 << (depth == FSCK_DEEP ? " (deep)" :
7665 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
7666 << dendl;
7667
 7668 // in deep mode we need read/write access to be able to replay deferred ops
7669 bool read_only = !(repair || depth == FSCK_DEEP);
7670
7671 int r = _open_path();
7672 if (r < 0)
7673 return r;
7674 r = _open_fsid(false);
7675 if (r < 0)
7676 goto out_path;
7677
7678 r = _read_fsid(&fsid);
7679 if (r < 0)
7680 goto out_fsid;
7681
7682 r = _lock_fsid();
7683 if (r < 0)
7684 goto out_fsid;
7685
7686 r = _open_bdev(false);
7687 if (r < 0)
7688 goto out_fsid;
7689
11fdf7f2 7690 r = _open_db_and_around(read_only);
7691 if (r < 0)
7692 goto out_bdev;
7693
7694 if (!read_only) {
7695 r = _upgrade_super();
7696 if (r < 0) {
7697 goto out_db;
7698 }
7699 }
7c673cae 7700
eafe8130 7701 r = _open_collections();
7c673cae 7702 if (r < 0)
11fdf7f2 7703 goto out_db;
7704
7705 mempool_thread.init();
7706
7707 // we need finisher and kv_{sync,finalize}_thread *just* for replay
 7708 // enable them in repair or deep mode only
7709 if (!read_only) {
7710 _kv_start();
7711 r = _deferred_replay();
7712 _kv_stop();
7713 }
7714 if (r < 0)
7715 goto out_scan;
7716
7717 r = _fsck_on_open(depth, repair);
7718
7719out_scan:
7720 mempool_thread.shutdown();
7721 _flush_cache();
7722out_db:
7723 _close_db_and_around();
7724out_bdev:
7725 _close_bdev();
7726out_fsid:
7727 _close_fsid();
7728out_path:
7729 _close_path();
7730
7731 return r;
7732}
7733
7734int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
7735{
7736 dout(1) << __func__
7737 << " <<<START>>>"
7738 << (repair ? " repair" : " check")
7739 << (depth == FSCK_DEEP ? " (deep)" :
7740 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
7741 << " start" << dendl;
7742 int64_t errors = 0;
7743 int64_t warnings = 0;
7744 unsigned repaired = 0;
7745
7746 uint64_t_btree_t used_omap_head;
7747 uint64_t_btree_t used_per_pool_omap_head;
7748 uint64_t_btree_t used_pgmeta_omap_head;
7749 uint64_t_btree_t used_sbids;
7750
7751 mempool_dynamic_bitset used_blocks;
7752 KeyValueDB::Iterator it;
7753 store_statfs_t expected_store_statfs, actual_statfs;
7754 per_pool_statfs expected_pool_statfs;
7755
7756 sb_info_map_t sb_info;
7757
7758 uint64_t num_objects = 0;
7759 uint64_t num_extents = 0;
7760 uint64_t num_blobs = 0;
7761 uint64_t num_spanning_blobs = 0;
7762 uint64_t num_shared_blobs = 0;
7763 uint64_t num_sharded_objects = 0;
7764 BlueStoreRepairer repairer;
7765
7766 utime_t start = ceph_clock_now();
7767
7768 _fsck_collections(&errors);
b32b8144 7769 used_blocks.resize(fm->get_alloc_units());
7c673cae 7770 apply(
11fdf7f2 7771 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
7c673cae 7772 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7773 ceph_assert(pos < bs.size());
7774 bs.set(pos);
7775 }
7776 );
7777 if (repair) {
7778 repairer.get_space_usage_tracker().init(
7779 bdev->get_size(),
7780 min_alloc_size);
7781 }
7782
7783 if (bluefs) {
 7784 if (cct->_conf->bluestore_bluefs_db_compatibility) {
7785 interval_set<uint64_t> bluefs_extents_db;
7786 bufferlist bl;
7787 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7788 auto p = bl.cbegin();
7789 auto prev_errors = errors;
7790 try {
7791 decode(bluefs_extents_db, p);
7792 bluefs_extents_db.union_of(bluefs_extents);
7793 bluefs_extents_db.subtract(bluefs_extents);
7794 if (!bluefs_extents_db.empty()) {
7795 derr << "fsck error: bluefs_extents inconsistency, "
7796 << "downgrade to previous releases might be broken."
7797 << dendl;
7798 ++errors;
7799 }
7800 }
7801 catch (buffer::error& e) {
7802 derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
7803 ++errors;
7804 }
7805 if (errors != prev_errors && repair) {
7806 repairer.fix_bluefs_extents(out_of_sync_fm);
7807 }
7808 }
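    // Added commentary: union_of() followed by subtract() above leaves
    // exactly the extents recorded in the superblock's "bluefs_extents"
    // key but missing from the in-memory set ((db U mem) - mem == db - mem);
    // only such leftovers would leave a downgraded release with a stale
    // allocation map.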
7809
7810 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
7811 apply(
b32b8144 7812 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 7813 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
11fdf7f2 7814 ceph_assert(pos < bs.size());
7815 bs.set(pos);
7816 }
7817 );
7818 }
eafe8130 7819 int r = bluefs->fsck();
7c673cae 7820 if (r < 0) {
eafe8130 7821 return r;
7822 }
7823 if (r > 0)
7824 errors += r;
7825 }
7826
7827 if (!per_pool_stat_collection) {
7828 const char *w;
7829 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
7830 w = "error";
7831 ++errors;
7832 } else {
7833 w = "warning";
7834 ++warnings;
7835 }
7836 derr << "fsck " << w << ": store not yet converted to per-pool stats"
7837 << dendl;
7838 }
11fdf7f2 7839 // get expected statfs; reset unaffected fields to be able to compare
7840 // structs
7841 statfs(&actual_statfs);
7842 actual_statfs.total = 0;
7843 actual_statfs.internally_reserved = 0;
7844 actual_statfs.available = 0;
7845 actual_statfs.internal_metadata = 0;
7846 actual_statfs.omap_allocated = 0;
7847
7848 if (g_conf()->bluestore_debug_fsck_abort) {
7849 dout(1) << __func__ << " debug abort" << dendl;
7850 goto out_scan;
7851 }
7c673cae 7852 // walk PREFIX_OBJ
7853 {
7854 dout(1) << __func__ << " walking object keyspace" << dendl;
7855 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
7856 BlueStore::FSCK_ObjectCtx ctx(
7857 errors,
7858 warnings,
7859 num_objects,
7860 num_extents,
7861 num_blobs,
7862 num_sharded_objects,
7863 num_spanning_blobs,
7864 &used_blocks,
7865 &used_omap_head,
7866 nullptr,
7867 &used_pgmeta_omap_head,
7868 &sb_info_lock,
7869 sb_info,
7870 expected_store_statfs,
7871 expected_pool_statfs,
7872 repair ? &repairer : nullptr);
7873 _fsck_check_objects(depth,
7874 ctx);
7875 }
11fdf7f2 7876
7877 dout(1) << __func__ << " checking shared_blobs" << dendl;
7878 it = db->get_iterator(PREFIX_SHARED_BLOB);
7879 if (it) {
7880 // FIXME minor: perhaps simplify for shallow mode?
 7881 // fill global if not overridden below
7882 auto expected_statfs = &expected_store_statfs;
11fdf7f2 7883
7884 for (it->lower_bound(string()); it->valid(); it->next()) {
7885 string key = it->key();
7886 uint64_t sbid;
7887 if (get_key_shared_blob(key, &sbid)) {
3efd9988 7888 derr << "fsck error: bad key '" << key
7c673cae 7889 << "' in shared blob namespace" << dendl;
7890 if (repair) {
7891 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
7892 }
7893 ++errors;
7894 continue;
7895 }
7896 auto p = sb_info.find(sbid);
7897 if (p == sb_info.end()) {
3efd9988 7898 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae 7899 << std::hex << sbid << std::dec << dendl;
7900 if (repair) {
7901 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
7902 }
7903 ++errors;
7904 } else {
7905 ++num_shared_blobs;
7906 sb_info_t& sbi = p->second;
7907 bluestore_shared_blob_t shared_blob(sbid);
7908 bufferlist bl = it->value();
7909 auto blp = bl.cbegin();
7910 try {
7911 decode(shared_blob, blp);
7912 } catch (buffer::error& e) {
7913 ++errors;
7914 // Force update and don't report as missing
7915 sbi.updated = sbi.passed = true;
7916
 7917	  derr << "fsck error: failed to decode Shared Blob "
7918 << pretty_binary_string(it->key()) << dendl;
7919 if (repair) {
7920 dout(20) << __func__ << " undecodable Shared Blob, key:'"
7921 << pretty_binary_string(it->key())
7922 << "', removing" << dendl;
 7923	    repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key()); // was PREFIX_DEFERRED, a copy/paste slip: we are iterating the shared blob namespace
7924 }
7925 continue;
7926 }
7927 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
7928 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 7929 derr << "fsck error: shared blob 0x" << std::hex << sbid
7930 << std::dec << " ref_map " << shared_blob.ref_map
7931 << " != expected " << sbi.ref_map << dendl;
7932 sbi.updated = true; // will update later in repair mode only!
7933 ++errors;
7934 }
7935 PExtentVector extents;
7936 for (auto &r : shared_blob.ref_map.ref_map) {
7937 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
7938 }
eafe8130 7939 if (per_pool_stat_collection || repair) {
7940 expected_statfs = &expected_pool_statfs[sbi.pool_id];
7941 }
7942 errors += _fsck_check_extents(sbi.cid,
7943 p->second.oids.front(),
7944 extents,
7945 p->second.compressed,
7946 used_blocks,
7947 fm->get_alloc_size(),
11fdf7f2 7948 repair ? &repairer : nullptr,
7949 *expected_statfs,
7950 depth);
7951 sbi.passed = true;
7952 }
7953 }
7954 } // if (it)
7955
7956 if (repair && repairer.preprocess_misreference(db)) {
7957
7958 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
7959 auto& space_tracker = repairer.get_space_usage_tracker();
7960 auto& misref_extents = repairer.get_misreferences();
7961 interval_set<uint64_t> to_release;
7962 it = db->get_iterator(PREFIX_OBJ);
7963 if (it) {
 7964 // fill global if not overridden below
7965 auto expected_statfs = &expected_store_statfs;
7966
7967 CollectionRef c;
7968 spg_t pgid;
7969 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
7970 bool bypass_rest = false;
7971 for (it->lower_bound(string()); it->valid() && !bypass_rest;
7972 it->next()) {
7973 dout(30) << __func__ << " key "
7974 << pretty_binary_string(it->key()) << dendl;
7975 if (is_extent_shard_key(it->key())) {
7976 continue;
7977 }
7978
7979 ghobject_t oid;
7980 int r = get_key_object(it->key(), &oid);
7981 if (r < 0 || !space_tracker.is_used(oid)) {
7982 continue;
7983 }
7984
7985 if (!c ||
7986 oid.shard_id != pgid.shard ||
7987 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7988 !c->contains(oid)) {
7989 c = nullptr;
7990 for (auto& p : coll_map) {
7991 if (p.second->contains(oid)) {
7992 c = p.second;
7993 break;
7994 }
7995 }
7996 if (!c) {
7997 continue;
7998 }
7999 if (per_pool_stat_collection || repair) {
8000 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8001 expected_statfs = &expected_pool_statfs[pool_id];
8002 }
8003 }
8004 if (!space_tracker.is_used(c->cid)) {
8005 continue;
8006 }
8007
8008 dout(20) << __func__ << " check misreference for col:" << c->cid
8009 << " obj:" << oid << dendl;
8010
8011 OnodeRef o;
8012 o.reset(Onode::decode(c, oid, it->key(), it->value()));
8013 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8014 mempool::bluestore_fsck::set<BlobRef> blobs;
8015
8016 for (auto& e : o->extent_map.extent_map) {
8017 blobs.insert(e.blob);
8018 }
8019 bool need_onode_update = false;
8020 bool first_dump = true;
8021 for(auto b : blobs) {
8022 bool broken_blob = false;
8023 auto& pextents = b->dirty_blob().dirty_extents();
8024 for (auto& e : pextents) {
8025 if (!e.is_valid()) {
8026 continue;
8027 }
8028 // for the sake of simplicity and proper shared blob handling
8029 // always rewrite the whole blob even when it's partially
8030 // misreferenced.
8031 if (misref_extents.intersects(e.offset, e.length)) {
8032 if (first_dump) {
8033 first_dump = false;
81eedcae 8034 _dump_onode<10>(cct, *o);
8035 }
8036 broken_blob = true;
8037 break;
8038 }
8039 }
8040 if (!broken_blob)
8041 continue;
8042 bool compressed = b->get_blob().is_compressed();
8043 need_onode_update = true;
8044 dout(10) << __func__
8045 << " fix misreferences in oid:" << oid
8046 << " " << *b << dendl;
8047 uint64_t b_off = 0;
8048 PExtentVector pext_to_release;
8049 pext_to_release.reserve(pextents.size());
8050 // rewriting all valid pextents
8051 for (auto e = pextents.begin(); e != pextents.end();
8052 b_off += e->length, e++) {
8053 if (!e->is_valid()) {
8054 continue;
8055 }
8056 PExtentVector exts;
8057 int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
8058 0, 0, &exts);
eafe8130 8059 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
8060 derr << __func__
8061 << " failed to allocate 0x" << std::hex << e->length
eafe8130 8062	       << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
 8063	       << " min_alloc_size 0x" << min_alloc_size
 8064	       << " available 0x" << alloc->get_free()
8065 << std::dec << dendl;
8066 if (alloc_len > 0) {
8067 alloc->release(exts);
8068 }
8069 bypass_rest = true;
8070 break;
8071 }
8072 expected_statfs->allocated += e->length;
8073 if (compressed) {
8074 expected_statfs->data_compressed_allocated += e->length;
8075 }
8076
8077 bufferlist bl;
8078 IOContext ioc(cct, NULL, true); // allow EIO
8079 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
8080 if (r < 0) {
8081 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
8082 <<"~" << e->length << std::dec << dendl;
8083 ceph_abort_msg("read failed, wtf");
8084 }
8085 pext_to_release.push_back(*e);
8086 e = pextents.erase(e);
8087 e = pextents.insert(e, exts.begin(), exts.end());
8088 b->get_blob().map_bl(
8089 b_off, bl,
8090 [&](uint64_t offset, bufferlist& t) {
8091 int r = bdev->write(offset, t, false);
8092 ceph_assert(r == 0);
8093 });
8094 e += exts.size() - 1;
8095 for (auto& p : exts) {
8096 fm->allocate(p.offset, p.length, txn);
8097 }
8098 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
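	  // Added commentary: the erase/insert/advance dance above swaps a
	  // single (possibly misreferenced) pextent for the freshly
	  // allocated extents in place, so b_off keeps tracking the logical
	  // offset within the blob while the old data (read via bdev before
	  // the erase) is copied to the new location through map_bl writes.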
8099
8100 if (b->get_blob().is_shared()) {
8101 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
8102
8103 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
8104 ceph_assert(sb_it != sb_info.end());
8105 sb_info_t& sbi = sb_it->second;
8106
8107 for (auto& r : sbi.ref_map.ref_map) {
8108 expected_statfs->allocated -= r.second.length;
8109 if (sbi.compressed) {
8110 // NB: it's crucial to use compressed flag from sb_info_t
8111 // as we originally used that value while accumulating
8112 // expected_statfs
8113 expected_statfs->data_compressed_allocated -= r.second.length;
8114 }
8115 }
8116 sbi.updated = sbi.passed = true;
8117 sbi.ref_map.clear();
8118
8119 // relying on blob's pextents to decide what to release.
8120 for (auto& p : pext_to_release) {
8121 to_release.union_insert(p.offset, p.length);
8122 }
8123 } else {
8124 for (auto& p : pext_to_release) {
8125 expected_statfs->allocated -= p.length;
8126 if (compressed) {
8127 expected_statfs->data_compressed_allocated -= p.length;
8128 }
8129 to_release.union_insert(p.offset, p.length);
8130 }
8131 }
8132 if (bypass_rest) {
8133 break;
8134 }
8135 } // for(auto b : blobs)
8136 if (need_onode_update) {
8137 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
8138 _record_onode(o, txn);
8139 }
8140 } // for (it->lower_bound(string()); it->valid(); it->next())
8141
8142 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
8143 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
8144 << "~" << it.get_len() << std::dec << dendl;
8145 fm->release(it.get_start(), it.get_len(), txn);
8146 }
8147 alloc->release(to_release);
8148 to_release.clear();
8149 } // if (it) {
8150 } //if (repair && repairer.preprocess_misreference()) {
8151
8152 if (depth != FSCK_SHALLOW) {
8153 for (auto &p : sb_info) {
8154 sb_info_t& sbi = p.second;
8155 if (!sbi.passed) {
8156 derr << "fsck error: missing " << *sbi.sb << dendl;
8157 ++errors;
8158 }
8159 if (repair && (!sbi.passed || sbi.updated)) {
8160 auto sbid = p.first;
8161 if (sbi.ref_map.empty()) {
8162 ceph_assert(sbi.passed);
8163 dout(20) << __func__ << " " << *sbi.sb
8164 << " is empty, removing" << dendl;
8165 repairer.fix_shared_blob(db, sbid, nullptr);
8166 } else {
8167 bufferlist bl;
8168 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
8169 encode(persistent, bl);
8170 dout(20) << __func__ << " " << *sbi.sb
8171 << " is " << bl.length() << " bytes, updating" << dendl;
11fdf7f2 8172
8173 repairer.fix_shared_blob(db, sbid, &bl);
8174 }
8175 }
8176 }
8177 }
8178 sb_info.clear();
8179
8180 // check global stats only if fscking (not repairing) w/o per-pool stats
8181 if (!per_pool_stat_collection &&
8182 !repair &&
8183 !(actual_statfs == expected_store_statfs)) {
8184 derr << "fsck error: actual " << actual_statfs
8185 << " != expected " << expected_store_statfs << dendl;
8186 if (repair) {
8187 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
8188 expected_store_statfs);
11fdf7f2 8189 }
eafe8130 8190 ++errors;
8191 }
8192
8193 dout(1) << __func__ << " checking pool_statfs" << dendl;
8194 _fsck_check_pool_statfs(expected_pool_statfs,
8195 errors, warnings, repair ? &repairer : nullptr);
8196
8197 if (depth != FSCK_SHALLOW) {
8198 dout(1) << __func__ << " checking for stray omap data" << dendl;
8199 it = db->get_iterator(PREFIX_OMAP);
8200 if (it) {
8201 for (it->lower_bound(string()); it->valid(); it->next()) {
8202 uint64_t omap_head;
8203 _key_decode_u64(it->key().c_str(), &omap_head);
8204 if (used_omap_head.count(omap_head) == 0) {
8205 derr << "fsck error: found stray omap data on omap_head "
8206 << omap_head << dendl;
8207 ++errors;
8208 }
8209 }
8210 }
8211 it = db->get_iterator(PREFIX_PGMETA_OMAP);
8212 if (it) {
8213 for (it->lower_bound(string()); it->valid(); it->next()) {
8214 uint64_t omap_head;
8215 _key_decode_u64(it->key().c_str(), &omap_head);
8216 if (used_pgmeta_omap_head.count(omap_head) == 0) {
8217 derr << "fsck error: found stray (pgmeta) omap data on omap_head "
8218 << omap_head << dendl;
8219 ++errors;
8220 }
8221 }
8222 }
8223 dout(1) << __func__ << " checking deferred events" << dendl;
8224 it = db->get_iterator(PREFIX_DEFERRED);
8225 if (it) {
8226 for (it->lower_bound(string()); it->valid(); it->next()) {
8227 bufferlist bl = it->value();
8228 auto p = bl.cbegin();
8229 bluestore_deferred_transaction_t wt;
8230 try {
8231 decode(wt, p);
8232 } catch (buffer::error& e) {
8233 derr << "fsck error: failed to decode deferred txn "
8234 << pretty_binary_string(it->key()) << dendl;
8235 if (repair) {
8236 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
8237 << pretty_binary_string(it->key())
8238 << "', removing" << dendl;
8239 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
8240 }
8241 continue;
8242 }
8243 dout(20) << __func__ << " deferred " << wt.seq
8244 << " ops " << wt.ops.size()
8245 << " released 0x" << std::hex << wt.released << std::dec << dendl;
8246 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
8247 apply(
8248 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
8249 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8250 ceph_assert(pos < bs.size());
8251 bs.set(pos);
8252 }
8253 );
8254 }
7c673cae 8255 }
8256 }
8257
8258 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
8259 {
8260 // remove bluefs_extents from used set since the freelist doesn't
8261 // know they are allocated.
8262 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
7c673cae 8263 apply(
b32b8144 8264 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 8265 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8266 ceph_assert(pos < bs.size());
8267 bs.reset(pos);
8268 }
8269 );
8270 }
8271 fm->enumerate_reset();
8272 uint64_t offset, length;
8273 while (fm->enumerate_next(db, &offset, &length)) {
8274 bool intersects = false;
8275 apply(
8276 offset, length, fm->get_alloc_size(), used_blocks,
8277 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8278 ceph_assert(pos < bs.size());
8279 if (bs.test(pos)) {
8280 if (offset == SUPER_RESERVED &&
8281 length == min_alloc_size - SUPER_RESERVED) {
8282 // this is due to the change just after luminous to min_alloc_size
8283 // granularity allocations, and our baked in assumption at the top
8284 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
8285 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
8286 // since we will never allocate this region below min_alloc_size.
8287 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
8288 << " and min_alloc_size, 0x" << std::hex << offset << "~"
8289 << length << std::dec << dendl;
8290 } else {
8291 intersects = true;
8292 if (repair) {
8293 repairer.fix_false_free(db, fm,
8294 pos * min_alloc_size,
8295 min_alloc_size);
8296 }
11fdf7f2 8297 }
8298 } else {
8299 bs.set(pos);
8300 }
7c673cae 8301 }
8302 );
8303 if (intersects) {
8304 derr << "fsck error: free extent 0x" << std::hex << offset
8305 << "~" << length << std::dec
8306 << " intersects allocated blocks" << dendl;
8307 ++errors;
7c673cae 8308 }
b5b8bbf5 8309 }
8310 fm->enumerate_reset();
8311 size_t count = used_blocks.count();
8312 if (used_blocks.size() != count) {
8313 ceph_assert(used_blocks.size() > count);
8314 used_blocks.flip();
8315 size_t start = used_blocks.find_first();
8316 while (start != decltype(used_blocks)::npos) {
8317 size_t cur = start;
8318 while (true) {
8319 size_t next = used_blocks.find_next(cur);
8320 if (next != cur + 1) {
8321 ++errors;
8322 derr << "fsck error: leaked extent 0x" << std::hex
8323 << ((uint64_t)start * fm->get_alloc_size()) << "~"
8324 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
8325 << dendl;
8326 if (repair) {
8327 repairer.fix_leaked(db,
8328 fm,
8329 start * min_alloc_size,
8330 (cur + 1 - start) * min_alloc_size);
8331 }
8332 start = next;
8333 break;
11fdf7f2 8334 }
eafe8130 8335 cur = next;
b5b8bbf5 8336 }
8337 }
8338 used_blocks.flip();
b5b8bbf5 8339 }
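      // Added commentary: at this point every block that is either
      // allocated (per metadata) or free (per the freelist) has its bit
      // set; flip() turns the remaining holes into set bits so
      // find_first()/find_next() can enumerate runs of leaked space, and
      // the second flip() restores the bitmap.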
8340 }
8341 }
8342 if (repair) {
8343 dout(5) << __func__ << " applying repair results" << dendl;
8344 repaired = repairer.apply(db);
8345 dout(5) << __func__ << " repair applied" << dendl;
8346 }
7c673cae 8347
eafe8130 8348out_scan:
8349 dout(2) << __func__ << " " << num_objects << " objects, "
8350 << num_sharded_objects << " of them sharded. "
8351 << dendl;
8352 dout(2) << __func__ << " " << num_extents << " extents to "
8353 << num_blobs << " blobs, "
8354 << num_spanning_blobs << " spanning, "
8355 << num_shared_blobs << " shared."
8356 << dendl;
8357
8358 utime_t duration = ceph_clock_now() - start;
8359 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, " << repaired
8360 << " repaired, " << (errors - (int)repaired) << " remaining in "
7c673cae 8361 << duration << " seconds" << dendl;
8362 return errors - (int)repaired;
8363}
8364
8365/// methods to inject various errors fsck can repair
8366void BlueStore::inject_broken_shared_blob_key(const string& key,
8367 const bufferlist& bl)
8368{
8369 KeyValueDB::Transaction txn;
8370 txn = db->get_transaction();
8371 txn->set(PREFIX_SHARED_BLOB, key, bl);
8372 db->submit_transaction_sync(txn);
8373}
8374
8375void BlueStore::inject_leaked(uint64_t len)
8376{
8377 KeyValueDB::Transaction txn;
8378 txn = db->get_transaction();
8379
8380 PExtentVector exts;
8381 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
8382 min_alloc_size * 256, 0, &exts);
8383 ceph_assert(alloc_len >= (int64_t)len);
8384 for (auto& p : exts) {
8385 fm->allocate(p.offset, p.length, txn);
8386 }
8387 db->submit_transaction_sync(txn);
8388}
8389
8390void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
8391{
8392 KeyValueDB::Transaction txn;
8393 OnodeRef o;
8394 CollectionRef c = _get_collection(cid);
8395 ceph_assert(c);
8396 {
8397 RWLock::WLocker l(c->lock); // just to avoid internal asserts
8398 o = c->get_onode(oid, false);
8399 ceph_assert(o);
8400 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8401 }
8402
8403 bool injected = false;
8404 txn = db->get_transaction();
8405 auto& em = o->extent_map.extent_map;
8406 std::vector<const PExtentVector*> v;
8407 if (em.size()) {
8408 v.push_back(&em.begin()->blob->get_blob().get_extents());
8409 }
8410 if (em.size() > 1) {
8411 auto it = em.end();
8412 --it;
8413 v.push_back(&(it->blob->get_blob().get_extents()));
8414 }
8415 for (auto pext : v) {
8416 if (pext->size()) {
8417 auto p = pext->begin();
8418 while (p != pext->end()) {
8419 if (p->is_valid()) {
8420 dout(20) << __func__ << " release 0x" << std::hex << p->offset
8421 << "~" << p->length << std::dec << dendl;
8422 fm->release(p->offset, p->length, txn);
8423 injected = true;
8424 break;
8425 }
8426 ++p;
8427 }
8428 }
8429 }
8430 ceph_assert(injected);
8431 db->submit_transaction_sync(txn);
8432}
8433
8434void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
8435{
8436 BlueStoreRepairer repairer;
8437 repairer.fix_statfs(db, key, new_statfs);
8438 repairer.apply(db);
8439}
8440
8441void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
8442{
8443 KeyValueDB::Transaction t = db->get_transaction();
8444 volatile_statfs v;
8445 v = new_statfs;
8446 bufferlist bl;
8447 v.encode(bl);
8448 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
8449 db->submit_transaction_sync(t);
8450}
8451
8452void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
8453 coll_t cid2, ghobject_t oid2,
8454 uint64_t offset)
8455{
8456 OnodeRef o1;
8457 CollectionRef c1 = _get_collection(cid1);
8458 ceph_assert(c1);
8459 {
8460 RWLock::WLocker l(c1->lock); // just to avoid internal asserts
8461 o1 = c1->get_onode(oid1, false);
8462 ceph_assert(o1);
8463 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
8464 }
8465 OnodeRef o2;
8466 CollectionRef c2 = _get_collection(cid2);
8467 ceph_assert(c2);
8468 {
8469 RWLock::WLocker l(c2->lock); // just to avoid internal asserts
8470 o2 = c2->get_onode(oid2, false);
8471 ceph_assert(o2);
8472 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
8473 }
8474 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
8475 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
8476
8477 // require onode/extent layout to be the same (and simple)
8478 // to make things easier
8479 ceph_assert(o1->onode.extent_map_shards.empty());
8480 ceph_assert(o2->onode.extent_map_shards.empty());
8481 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
8482 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
8483 ceph_assert(e1.logical_offset == e2.logical_offset);
8484 ceph_assert(e1.length == e2.length);
8485 ceph_assert(e1.blob_offset == e2.blob_offset);
8486
8487 KeyValueDB::Transaction txn;
8488 txn = db->get_transaction();
8489
8490 // along with misreference error this will create space leaks errors
8491 e2.blob->dirty_blob() = e1.blob->get_blob();
8492 o2->extent_map.dirty_range(offset, e2.length);
8493 o2->extent_map.update(txn, false);
8494
8495 _record_onode(o2, txn);
8496 db->submit_transaction_sync(txn);
8497}
8498
8499void BlueStore::collect_metadata(map<string,string> *pm)
8500{
8501 dout(10) << __func__ << dendl;
8502 bdev->collect_metadata("bluestore_bdev_", pm);
8503 if (bluefs) {
8504 (*pm)["bluefs"] = "1";
8505 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
11fdf7f2 8506 bluefs->collect_metadata(pm, bluefs_shared_bdev);
8507 } else {
8508 (*pm)["bluefs"] = "0";
8509 }
8510
8511 // report numa mapping for underlying devices
8512 int node = -1;
8513 set<int> nodes;
8514 set<string> failed;
8515 int r = get_numa_node(&node, &nodes, &failed);
8516 if (r >= 0) {
8517 if (!failed.empty()) {
8518 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
8519 }
8520 if (!nodes.empty()) {
8521 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
8522 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
8523 }
8524 if (node >= 0) {
8525 (*pm)["objectstore_numa_node"] = stringify(node);
8526 }
8527 }
8528}
8529
8530int BlueStore::get_numa_node(
8531 int *final_node,
8532 set<int> *out_nodes,
8533 set<string> *out_failed)
8534{
8535 int node = -1;
8536 set<string> devices;
8537 get_devices(&devices);
8538 set<int> nodes;
8539 set<string> failed;
8540 for (auto& devname : devices) {
8541 int n;
8542 BlkDev bdev(devname);
8543 int r = bdev.get_numa_node(&n);
8544 if (r < 0) {
8545 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
8546 << dendl;
8547 failed.insert(devname);
8548 continue;
8549 }
8550 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
8551 << dendl;
8552 nodes.insert(n);
8553 if (node < 0) {
8554 node = n;
8555 }
8556 }
8557 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
8558 *final_node = node;
8559 }
8560 if (out_nodes) {
8561 *out_nodes = nodes;
8562 }
8563 if (out_failed) {
8564 *out_failed = failed;
8565 }
8566 return 0;
8567}
8568
8569int BlueStore::get_devices(set<string> *ls)
8570{
8571 if (bdev) {
8572 bdev->get_devices(ls);
8573 if (bluefs) {
8574 bluefs->get_devices(ls);
8575 }
8576 return 0;
8577 }
8578
8579 // grumble, we haven't started up yet.
8580 int r = _open_path();
8581 if (r < 0)
8582 goto out;
8583 r = _open_fsid(false);
8584 if (r < 0)
8585 goto out_path;
8586 r = _read_fsid(&fsid);
8587 if (r < 0)
8588 goto out_fsid;
8589 r = _lock_fsid();
8590 if (r < 0)
8591 goto out_fsid;
8592 r = _open_bdev(false);
8593 if (r < 0)
8594 goto out_fsid;
8595 r = _minimal_open_bluefs(false);
8596 if (r < 0)
8597 goto out_bdev;
8598 bdev->get_devices(ls);
8599 if (bluefs) {
8600 bluefs->get_devices(ls);
8601 }
8602 r = 0;
8603 _minimal_close_bluefs();
8604 out_bdev:
8605 _close_bdev();
8606 out_fsid:
8607 _close_fsid();
8608 out_path:
8609 _close_path();
8610 out:
8611 return r;
8612}
8613
11fdf7f2 8614void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
8615{
8616 buf->reset();
8617
8618 buf->omap_allocated = db->estimate_prefix_size(PREFIX_OMAP);
8619
8620 uint64_t bfree = alloc->get_free();
8621
8622 if (bluefs) {
8623 int64_t bluefs_total = bluefs->get_total(bluefs_shared_bdev);
8624 int64_t bluefs_free = bluefs->get_free(bluefs_shared_bdev);
8625 // part of our shared device is "free" according to BlueFS, but we
8626 // can't touch bluestore_bluefs_min of it.
8627 int64_t shared_available = std::min(
8628 bluefs_free,
8629 int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
8630 buf->internally_reserved = bluefs_total - shared_available;
94b18763 8631 if (shared_available > 0) {
8632 bfree += shared_available;
8633 }
8634 // include dedicated db, too, if that isn't the shared device.
8635 if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
8636 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 8637 }
8638 // call any non-omap bluefs space "internal metadata"
8639 buf->internal_metadata =
8640 std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
8641 - buf->omap_allocated;
8642 }
8643
8644 uint64_t thin_total, thin_avail;
8645 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
8646 buf->total += thin_total;
8647
8648 // we are limited by both the size of the virtual device and the
8649 // underlying physical device.
8650 bfree = std::min(bfree, thin_avail);
8651
8652 buf->allocated = thin_total - thin_avail;
8653 } else {
8654 buf->total += bdev->get_size();
8655 }
8656 buf->available = bfree;
8657}
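// Added commentary (worked example, illustrative numbers): on a shared
// device with bluefs_total = 100 GiB, bluefs_free = 40 GiB and
// bluestore_bluefs_min = 1 GiB, shared_available = min(40, 100 - 1) = 40 GiB,
// so internally_reserved = 100 - 40 = 60 GiB and the 40 GiB are credited
// to bfree (and hence to buf->available).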
8658
8659int BlueStore::statfs(struct store_statfs_t *buf,
8660 osd_alert_list_t* alerts)
8661{
8662 if (alerts) {
8663 alerts->clear();
8664 _log_alerts(*alerts);
8665 }
8666 _get_statfs_overall(buf);
31f18b77 8667 {
11fdf7f2 8668 std::lock_guard l(vstatfs_lock);
31f18b77 8669 buf->allocated = vstatfs.allocated();
8670 buf->data_stored = vstatfs.stored();
8671 buf->data_compressed = vstatfs.compressed();
8672 buf->data_compressed_original = vstatfs.compressed_original();
8673 buf->data_compressed_allocated = vstatfs.compressed_allocated();
8674 }
8675
8676 dout(20) << __func__ << " " << *buf << dendl;
8677 return 0;
8678}
8679
8680int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf)
8681{
 8682 dout(20) << __func__ << " pool " << pool_id << dendl;
81eedcae 8683
8684 if (!per_pool_stat_collection) {
8685 dout(20) << __func__ << " not supported in legacy mode " << dendl;
8686 return -ENOTSUP;
7c673cae 8687 }
11fdf7f2 8688 buf->reset();
7c673cae 8689
8690 {
8691 std::lock_guard l(vstatfs_lock);
8692 osd_pools[pool_id].publish(buf);
8693 }
 8694 dout(10) << __func__ << " " << *buf << dendl;
8695 return 0;
8696}
8697
8698void BlueStore::_check_legacy_statfs_alert()
8699{
8700 string s;
8701 if (!per_pool_stat_collection &&
eafe8130 8702 cct->_conf->bluestore_warn_on_legacy_statfs) {
8703 s = "legacy statfs reporting detected, "
8704 "suggest to run store repair to get consistent statistic reports";
8705 }
8706 std::lock_guard l(qlock);
8707 legacy_statfs_alert = s;
8708}
8709
8710// ---------------
8711// cache
8712
8713BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
8714{
8715 RWLock::RLocker l(coll_lock);
8716 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
8717 if (cp == coll_map.end())
8718 return CollectionRef();
8719 return cp->second;
8720}
8721
8722void BlueStore::_queue_reap_collection(CollectionRef& c)
8723{
8724 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
 8725 // _reap_collections and this run in the same thread,
 8726 // so no lock is needed.
8727 removed_collections.push_back(c);
8728}
8729
8730void BlueStore::_reap_collections()
8731{
94b18763 8732
8733 list<CollectionRef> removed_colls;
8734 {
 8735 // _queue_reap_collection and this run in the same thread,
 8736 // so no lock is needed.
8737 if (!removed_collections.empty())
8738 removed_colls.swap(removed_collections);
8739 else
8740 return;
8741 }
8742
8743 list<CollectionRef>::iterator p = removed_colls.begin();
8744 while (p != removed_colls.end()) {
8745 CollectionRef c = *p;
8746 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
8747 if (c->onode_map.map_any([&](OnodeRef o) {
11fdf7f2 8748 ceph_assert(!o->exists);
8749 if (o->flushing_count.load()) {
8750 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
8751 << " flush_txns " << o->flushing_count << dendl;
94b18763 8752 return true;
7c673cae 8753 }
94b18763 8754 return false;
7c673cae 8755 })) {
94b18763 8756 ++p;
8757 continue;
8758 }
8759 c->onode_map.clear();
94b18763 8760 p = removed_colls.erase(p);
8761 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
8762 }
94b18763 8763 if (removed_colls.empty()) {
7c673cae 8764 dout(10) << __func__ << " all reaped" << dendl;
8765 } else {
8766 removed_collections.splice(removed_collections.begin(), removed_colls);
8767 }
8768}
8769
8770void BlueStore::_update_cache_logger()
8771{
8772 uint64_t num_onodes = 0;
8773 uint64_t num_extents = 0;
8774 uint64_t num_blobs = 0;
8775 uint64_t num_buffers = 0;
8776 uint64_t num_buffer_bytes = 0;
8777 for (auto c : cache_shards) {
8778 c->add_stats(&num_onodes, &num_extents, &num_blobs,
8779 &num_buffers, &num_buffer_bytes);
8780 }
8781 logger->set(l_bluestore_onodes, num_onodes);
8782 logger->set(l_bluestore_extents, num_extents);
8783 logger->set(l_bluestore_blobs, num_blobs);
8784 logger->set(l_bluestore_buffers, num_buffers);
8785 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
8786}
8787
8788// ---------------
8789// read operations
8790
8791ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
8792{
8793 return _get_collection(cid);
8794}
8795
8796ObjectStore::CollectionHandle BlueStore::create_new_collection(
8797 const coll_t& cid)
7c673cae 8798{
8799 RWLock::WLocker l(coll_lock);
8800 Collection *c = new Collection(
8801 this,
8802 cache_shards[cid.hash_to_shard(cache_shards.size())],
8803 cid);
8804 new_coll_map[cid] = c;
8805 _osr_attach(c);
8806 return c;
8807}
8808
8809void BlueStore::set_collection_commit_queue(
8810 const coll_t& cid,
8811 ContextQueue *commit_queue)
8812{
8813 if (commit_queue) {
8814 RWLock::RLocker l(coll_lock);
8815 if (coll_map.count(cid)) {
8816 coll_map[cid]->commit_queue = commit_queue;
8817 } else if (new_coll_map.count(cid)) {
8818 new_coll_map[cid]->commit_queue = commit_queue;
8819 }
8820 }
8821}
8822
11fdf7f2 8823
7c673cae
FG
8824bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
8825{
8826 Collection *c = static_cast<Collection *>(c_.get());
8827 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
8828 if (!c->exists)
8829 return false;
8830
8831 bool r = true;
8832
8833 {
8834 RWLock::RLocker l(c->lock);
8835 OnodeRef o = c->get_onode(oid, false);
8836 if (!o || !o->exists)
8837 r = false;
8838 }
8839
8840 return r;
8841}
8842
8843int BlueStore::stat(
8844 CollectionHandle &c_,
8845 const ghobject_t& oid,
8846 struct stat *st,
8847 bool allow_eio)
8848{
8849 Collection *c = static_cast<Collection *>(c_.get());
8850 if (!c->exists)
8851 return -ENOENT;
8852 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
8853
8854 {
8855 RWLock::RLocker l(c->lock);
8856 OnodeRef o = c->get_onode(oid, false);
8857 if (!o || !o->exists)
8858 return -ENOENT;
8859 st->st_size = o->onode.size;
8860 st->st_blksize = 4096;
8861 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
8862 st->st_nlink = 1;
8863 }
8864
8865 int r = 0;
8866 if (_debug_mdata_eio(oid)) {
8867 r = -EIO;
8868 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
8869 }
8870 return r;
8871}
8872int BlueStore::set_collection_opts(
11fdf7f2 8873 CollectionHandle& ch,
8874 const pool_opts_t& opts)
8875{
7c673cae 8876 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 8877 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
7c673cae
FG
8878 if (!c->exists)
8879 return -ENOENT;
8880 RWLock::WLocker l(c->lock);
8881 c->pool_opts = opts;
8882 return 0;
8883}
8884
8885int BlueStore::read(
8886 CollectionHandle &c_,
8887 const ghobject_t& oid,
8888 uint64_t offset,
8889 size_t length,
8890 bufferlist& bl,
224ce89b 8891 uint32_t op_flags)
7c673cae 8892{
11fdf7f2 8893 auto start = mono_clock::now();
8894 Collection *c = static_cast<Collection *>(c_.get());
8895 const coll_t &cid = c->get_cid();
8896 dout(15) << __func__ << " " << cid << " " << oid
8897 << " 0x" << std::hex << offset << "~" << length << std::dec
8898 << dendl;
8899 if (!c->exists)
8900 return -ENOENT;
8901
8902 bl.clear();
8903 int r;
8904 {
8905 RWLock::RLocker l(c->lock);
11fdf7f2 8906 auto start1 = mono_clock::now();
7c673cae 8907 OnodeRef o = c->get_onode(oid, false);
8908 log_latency("get_onode@read",
8909 l_bluestore_read_onode_meta_lat,
8910 mono_clock::now() - start1,
8911 cct->_conf->bluestore_log_op_age);
8912 if (!o || !o->exists) {
8913 r = -ENOENT;
8914 goto out;
8915 }
8916
8917 if (offset == length && offset == 0)
8918 length = o->onode.size;
8919
8920 r = _do_read(c, o, offset, length, bl, op_flags);
8921 if (r == -EIO) {
8922 logger->inc(l_bluestore_read_eio);
8923 }
8924 }
8925
8926 out:
28e407b8 8927 if (r >= 0 && _debug_data_eio(oid)) {
8928 r = -EIO;
8929 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
8930 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
8931 cct->_conf->bluestore_debug_random_read_err &&
8932 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
8933 100.0)) == 0) {
8934 dout(0) << __func__ << ": inject random EIO" << dendl;
8935 r = -EIO;
8936 }
8937 dout(10) << __func__ << " " << cid << " " << oid
8938 << " 0x" << std::hex << offset << "~" << length << std::dec
8939 << " = " << r << dendl;
8940 log_latency(__func__,
8941 l_bluestore_read_lat,
8942 mono_clock::now() - start,
8943 cct->_conf->bluestore_log_op_age);
8944 return r;
8945}
8946
8947// --------------------------------------------------------
8948// intermediate data structures used while reading
8949struct region_t {
8950 uint64_t logical_offset;
8951 uint64_t blob_xoffset; //region offset within the blob
8952 uint64_t length;
8953
8954 // used later in read process
8955 uint64_t front = 0;
7c673cae 8956
11fdf7f2 8957 region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
8958 : logical_offset(offset),
8959 blob_xoffset(b_offs),
8960 length(len),
8961 front(front){}
8962 region_t(const region_t& from)
8963 : logical_offset(from.logical_offset),
8964 blob_xoffset(from.blob_xoffset),
8965 length(from.length),
8966 front(from.front){}
8967
8968 friend ostream& operator<<(ostream& out, const region_t& r) {
8969 return out << "0x" << std::hex << r.logical_offset << ":"
8970 << r.blob_xoffset << "~" << r.length << std::dec;
8971 }
8972};
8973
8974// merged blob read request
8975struct read_req_t {
8976 uint64_t r_off = 0;
8977 uint64_t r_len = 0;
8978 bufferlist bl;
8979 std::list<region_t> regs; // original read regions
8980
8981 read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
8982
8983 friend ostream& operator<<(ostream& out, const read_req_t& r) {
8984 out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
8985 for (const auto& reg : r.regs)
8986 out << reg;
8987 return out << "]}" << std::dec;
8988 }
8989};
8990
8991typedef list<read_req_t> regions2read_t;
8992typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
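In other words, the read path batches its work as a map from blob to a list of merged on-disk requests, each remembering which logical regions it will satisfy. A schematic of that shape, with invented offsets (bptr stands for some existing BlobRef):

    blobs2read_t b2r;
    read_req_t req(0x0000, 0x2000);  // one merged <r_off, r_len> disk read...
    req.regs.emplace_back(region_t(0x10000, 0x0000, 0x1000));
    req.regs.emplace_back(region_t(0x11000, 0x1000, 0x1000));
    b2r[bptr].emplace_back(std::move(req)); // ...serving two logical regions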
8993
8994int BlueStore::_do_read(
8995 Collection *c,
8996 OnodeRef o,
8997 uint64_t offset,
8998 size_t length,
8999 bufferlist& bl,
9000 uint32_t op_flags,
9001 uint64_t retry_count)
7c673cae 9002{
11fdf7f2 9003 FUNCTRACE(cct);
7c673cae 9004 int r = 0;
91327a77 9005 int read_cache_policy = 0; // do not bypass clean or dirty cache
9006
9007 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9008 << " size 0x" << o->onode.size << " (" << std::dec
9009 << o->onode.size << ")" << dendl;
9010 bl.clear();
9011
9012 if (offset >= o->onode.size) {
9013 return r;
9014 }
9015
9016 // generally, don't buffer anything, unless the client explicitly requests
9017 // it.
9018 bool buffered = false;
9019 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
9020 dout(20) << __func__ << " will do buffered read" << dendl;
9021 buffered = true;
9022 } else if (cct->_conf->bluestore_default_buffered_read &&
9023 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
9024 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
9025 dout(20) << __func__ << " defaulting to buffered read" << dendl;
9026 buffered = true;
9027 }
9028
9029 if (offset + length > o->onode.size) {
9030 length = o->onode.size - offset;
9031 }
9032
11fdf7f2 9033 auto start = mono_clock::now();
7c673cae 9034 o->extent_map.fault_range(db, offset, length);
9035 log_latency(__func__,
9036 l_bluestore_read_onode_meta_lat,
9037 mono_clock::now() - start,
9038 cct->_conf->bluestore_log_op_age);
81eedcae 9039 _dump_onode<30>(cct, *o);
9040
9041 ready_regions_t ready_regions;
9042
9043 // for deep-scrub, we only read dirty cache and bypass clean cache in
9044 // order to read underlying block device in case there are silent disk errors.
9045 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
9046 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
9047 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
9048 }
9049
 9050 // build a blob-wise list of the stuff to read (that isn't cached)
9051 blobs2read_t blobs2read;
9052 unsigned left = length;
9053 uint64_t pos = offset;
9054 unsigned num_regions = 0;
9055 auto lp = o->extent_map.seek_lextent(offset);
9056 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9057 if (pos < lp->logical_offset) {
9058 unsigned hole = lp->logical_offset - pos;
9059 if (hole >= left) {
9060 break;
9061 }
9062 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9063 << std::dec << dendl;
9064 pos += hole;
9065 left -= hole;
9066 }
94b18763 9067 BlobRef& bptr = lp->blob;
9068 unsigned l_off = pos - lp->logical_offset;
9069 unsigned b_off = l_off + lp->blob_offset;
9070 unsigned b_len = std::min(left, lp->length - l_off);
9071
9072 ready_regions_t cache_res;
9073 interval_set<uint32_t> cache_interval;
9074 bptr->shared_blob->bc.read(
9075 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9076 read_cache_policy);
9077 dout(20) << __func__ << " blob " << *bptr << std::hex
9078 << " need 0x" << b_off << "~" << b_len
9079 << " cache has 0x" << cache_interval
9080 << std::dec << dendl;
9081
9082 auto pc = cache_res.begin();
11fdf7f2 9083 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
9084 while (b_len > 0) {
9085 unsigned l;
9086 if (pc != cache_res.end() &&
9087 pc->first == b_off) {
9088 l = pc->second.length();
9089 ready_regions[pos].claim(pc->second);
9090 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9091 << b_off << "~" << l << std::dec << dendl;
9092 ++pc;
9093 } else {
9094 l = b_len;
9095 if (pc != cache_res.end()) {
11fdf7f2 9096 ceph_assert(pc->first > b_off);
9097 l = pc->first - b_off;
9098 }
9099 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9100 << b_off << "~" << l << std::dec << dendl;
9101 // merge regions
9102 {
9103 uint64_t r_off = b_off;
9104 uint64_t r_len = l;
9105 uint64_t front = r_off % chunk_size;
9106 if (front) {
9107 r_off -= front;
9108 r_len += front;
9109 }
9110 unsigned tail = r_len % chunk_size;
9111 if (tail) {
9112 r_len += chunk_size - tail;
9113 }
9114 bool merged = false;
9115 regions2read_t& r2r = blobs2read[bptr];
9116 if (r2r.size()) {
9117 read_req_t& pre = r2r.back();
9118 if (r_off <= (pre.r_off + pre.r_len)) {
9119 front += (r_off - pre.r_off);
9120 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
9121 pre.regs.emplace_back(region_t(pos, b_off, l, front));
9122 merged = true;
9123 }
9124 }
9125 if (!merged) {
9126 read_req_t req(r_off, r_len);
9127 req.regs.emplace_back(region_t(pos, b_off, l, front));
9128 r2r.emplace_back(std::move(req));
9129 }
9130 }
9131 ++num_regions;
9132 }
9133 pos += l;
9134 b_off += l;
9135 left -= l;
9136 b_len -= l;
9137 }
9138 ++lp;
9139 }
9140
9141 // read raw blob data. use aio if we have >1 blobs to read.
 9142 start = mono_clock::now(); // for simplicity, measure the whole
 9143 // block below; the measurement
 9144 // error this adds is negligible
7c673cae 9145 vector<bufferlist> compressed_blob_bls;
b32b8144 9146 IOContext ioc(cct, NULL, true); // allow EIO
7c673cae 9147 for (auto& p : blobs2read) {
94b18763 9148 const BlobRef& bptr = p.first;
11fdf7f2 9149 regions2read_t& r2r = p.second;
7c673cae 9150 dout(20) << __func__ << " blob " << *bptr << std::hex
11fdf7f2 9151 << " need " << r2r << std::dec << dendl;
9152 if (bptr->get_blob().is_compressed()) {
9153 // read the whole thing
9154 if (compressed_blob_bls.empty()) {
9155 // ensure we avoid any reallocation on subsequent blobs
9156 compressed_blob_bls.reserve(blobs2read.size());
9157 }
9158 compressed_blob_bls.push_back(bufferlist());
9159 bufferlist& bl = compressed_blob_bls.back();
9160 r = bptr->get_blob().map(
9161 0, bptr->get_blob().get_ondisk_length(),
9162 [&](uint64_t offset, uint64_t length) {
9163 int r;
9164 // use aio if there are more regions to read than those in this blob
11fdf7f2 9165 if (num_regions > r2r.size()) {
9166 r = bdev->aio_read(offset, length, &bl, &ioc);
9167 } else {
9168 r = bdev->read(offset, length, &bl, &ioc, false);
9169 }
9170 if (r < 0)
9171 return r;
9172 return 0;
9173 });
9174 if (r < 0) {
9175 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
9176 if (r == -EIO) {
9177 // propagate EIO to caller
9178 return r;
9179 }
11fdf7f2 9180 ceph_assert(r == 0);
b32b8144 9181 }
9182 } else {
9183 // read the pieces
11fdf7f2 9184 for (auto& req : r2r) {
7c673cae 9185 dout(20) << __func__ << " region 0x" << std::hex
9186 << req.regs.front().logical_offset
9187 << ": 0x" << req.regs.front().blob_xoffset
9188 << " reading 0x" << req.r_off
9189 << "~" << req.r_len << std::dec
9190 << dendl;
9191
9192 // read it
9193 r = bptr->get_blob().map(
11fdf7f2 9194 req.r_off, req.r_len,
9195 [&](uint64_t offset, uint64_t length) {
9196 int r;
9197 // use aio if there is more than one region to read
9198 if (num_regions > 1) {
11fdf7f2 9199 r = bdev->aio_read(offset, length, &req.bl, &ioc);
7c673cae 9200 } else {
11fdf7f2 9201 r = bdev->read(offset, length, &req.bl, &ioc, false);
9202 }
9203 if (r < 0)
9204 return r;
9205 return 0;
9206 });
9207 if (r < 0) {
9208 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
9209 << dendl;
9210 if (r == -EIO) {
9211 // propagate EIO to caller
9212 return r;
9213 }
11fdf7f2 9214 ceph_assert(r == 0);
b32b8144 9215 }
11fdf7f2 9216 ceph_assert(req.bl.length() == req.r_len);
9217 }
9218 }
9219 }
9220
9221 int64_t num_ios = length;
7c673cae 9222 if (ioc.has_pending_aios()) {
11fdf7f2 9223 num_ios = -ioc.get_num_ios();
9224 bdev->aio_submit(&ioc);
9225 dout(20) << __func__ << " waiting for aio" << dendl;
9226 ioc.aio_wait();
9227 r = ioc.get_return_value();
9228 if (r < 0) {
11fdf7f2 9229 ceph_assert(r == -EIO); // no other errors allowed
9230 return -EIO;
9231 }
7c673cae 9232 }
9233 log_latency_fn(__func__,
9234 l_bluestore_read_wait_aio_lat,
11fdf7f2 9235 mono_clock::now() - start,
494da23a 9236 cct->_conf->bluestore_log_op_age,
9237 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
9238 );
9239
9240 // enumerate and decompress desired blobs
9241 auto p = compressed_blob_bls.begin();
9242 blobs2read_t::iterator b2r_it = blobs2read.begin();
9243 while (b2r_it != blobs2read.end()) {
94b18763 9244 const BlobRef& bptr = b2r_it->first;
11fdf7f2 9245 regions2read_t& r2r = b2r_it->second;
7c673cae 9246 dout(20) << __func__ << " blob " << *bptr << std::hex
11fdf7f2 9247 << " need 0x" << r2r << std::dec << dendl;
7c673cae 9248 if (bptr->get_blob().is_compressed()) {
11fdf7f2 9249 ceph_assert(p != compressed_blob_bls.end());
9250 bufferlist& compressed_bl = *p++;
9251 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
11fdf7f2 9252 r2r.front().regs.front().logical_offset) < 0) {
9253 // Handles spurious read errors caused by a kernel bug.
9254 // We sometimes get all-zero pages as a result of the read under
9255 // high memory pressure. Retrying the failing read succeeds in most
9256 // cases.
9257 // See also: http://tracker.ceph.com/issues/22464
9258 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
9259 return -EIO;
9260 }
9261 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
9262 }
9263 bufferlist raw_bl;
9264 r = _decompress(compressed_bl, &raw_bl);
9265 if (r < 0)
9266 return r;
9267 if (buffered) {
9268 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
9269 raw_bl);
9270 }
9271 for (auto& req : r2r) {
9272 for (auto& r : req.regs) {
9273 ready_regions[r.logical_offset].substr_of(
9274 raw_bl, r.blob_xoffset, r.length);
9275 }
9276 }
9277 } else {
9278 for (auto& req : r2r) {
9279 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
9280 req.regs.front().logical_offset) < 0) {
9281 // Handles spurious read errors caused by a kernel bug.
9282 // We sometimes get all-zero pages as a result of the read under
9283 // high memory pressure. Retrying the failing read succeeds in most
9284 // cases.
9285 // See also: http://tracker.ceph.com/issues/22464
9286 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
9287 return -EIO;
9288 }
9289 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
9290 }
9291 if (buffered) {
9292 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
11fdf7f2 9293 req.r_off, req.bl);
9294 }
9295
9296 // prune and keep result
9297 for (const auto& r : req.regs) {
9298 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
9299 }
9300 }
9301 }
9302 ++b2r_it;
9303 }
9304
9305 // generate a resulting buffer
9306 auto pr = ready_regions.begin();
9307 auto pr_end = ready_regions.end();
9308 pos = 0;
9309 while (pos < length) {
9310 if (pr != pr_end && pr->first == pos + offset) {
9311 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9312 << ": data from 0x" << pr->first << "~" << pr->second.length()
9313 << std::dec << dendl;
9314 pos += pr->second.length();
9315 bl.claim_append(pr->second);
9316 ++pr;
9317 } else {
9318 uint64_t l = length - pos;
9319 if (pr != pr_end) {
11fdf7f2 9320 ceph_assert(pr->first > pos + offset);
9321 l = pr->first - (pos + offset);
9322 }
9323 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9324 << ": zeros for 0x" << (pos + offset) << "~" << l
9325 << std::dec << dendl;
9326 bl.append_zero(l);
9327 pos += l;
9328 }
9329 }
9330 ceph_assert(bl.length() == length);
9331 ceph_assert(pos == length);
9332 ceph_assert(pr == pr_end);
7c673cae 9333 r = bl.length();
9334 if (retry_count) {
9335 logger->inc(l_bluestore_reads_with_retries);
9336 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
9337 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
9338 }
9339 return r;
9340}
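The merge step in the middle of _do_read() aligns every cache miss to the blob's chunk size before it is issued, and the per-region front offset is what later recovers the caller's bytes from the padded buffer. A standalone sketch of the same rounding, assuming chunk_size = 0x1000:

    uint64_t chunk_size = 0x1000;
    uint64_t b_off = 0x1234, l = 0x100;   // an unaligned cache miss
    uint64_t r_off = b_off, r_len = l;
    uint64_t front = r_off % chunk_size;  // 0x234
    r_off -= front;                       // 0x1000
    r_len += front;                       // 0x334
    uint64_t tail = r_len % chunk_size;
    if (tail)
      r_len += chunk_size - tail;         // rounds up to 0x1000
    // the device read is <0x1000~0x1000>; substr_of(bl, front, l)
    // later extracts the original 0x100 bytes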
9341
9342int BlueStore::_verify_csum(OnodeRef& o,
9343 const bluestore_blob_t* blob, uint64_t blob_xoffset,
9344 const bufferlist& bl,
9345 uint64_t logical_offset) const
9346{
9347 int bad;
9348 uint64_t bad_csum;
11fdf7f2 9349 auto start = mono_clock::now();
7c673cae 9350 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
9351 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
9352 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
 9353 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
9354 bad = blob_xoffset;
9355 r = -1;
9356 bad_csum = 0xDEADBEEF;
9357 }
9358 if (r < 0) {
9359 if (r == -1) {
9360 PExtentVector pex;
9361 blob->map(
9362 bad,
9363 blob->get_csum_chunk_size(),
9364 [&](uint64_t offset, uint64_t length) {
9365 pex.emplace_back(bluestore_pextent_t(offset, length));
9366 return 0;
9367 });
9368 derr << __func__ << " bad "
9369 << Checksummer::get_csum_type_string(blob->csum_type)
9370 << "/0x" << std::hex << blob->get_csum_chunk_size()
9371 << " checksum at blob offset 0x" << bad
9372 << ", got 0x" << bad_csum << ", expected 0x"
9373 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
9374 << ", device location " << pex
9375 << ", logical extent 0x" << std::hex
9376 << (logical_offset + bad - blob_xoffset) << "~"
9377 << blob->get_csum_chunk_size() << std::dec
9378 << ", object " << o->oid
9379 << dendl;
9380 } else {
9381 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
9382 }
9383 }
9384 log_latency(__func__,
9385 l_bluestore_csum_lat,
9386 mono_clock::now() - start,
9387 cct->_conf->bluestore_log_op_age);
9388 if (cct->_conf->bluestore_ignore_data_csum) {
9389 return 0;
9390 }
9391 return r;
9392}
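For context on the error report above: checksums are stored per chunk, so the failing blob offset divided by the checksum chunk size selects the stored item that was compared. A toy illustration of that indexing, with invented values:

    uint64_t csum_chunk_size = 0x1000;    // blob->get_csum_chunk_size()
    uint64_t bad = 0x3a00;                // failing offset within the blob
    uint64_t idx = bad / csum_chunk_size; // chunk 3
    // blob->get_csum_item(idx) yields the expected checksum printed above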
9393
9394int BlueStore::_decompress(bufferlist& source, bufferlist* result)
9395{
9396 int r = 0;
9397 auto start = mono_clock::now();
9398 auto i = source.cbegin();
7c673cae 9399 bluestore_compression_header_t chdr;
11fdf7f2 9400 decode(chdr, i);
9401 int alg = int(chdr.type);
9402 CompressorRef cp = compressor;
9403 if (!cp || (int)cp->get_type() != alg) {
9404 cp = Compressor::create(cct, alg);
9405 }
9406
9407 if (!cp.get()) {
9408 // if compressor isn't available - error, because cannot return
9409 // decompressed data?
9410
9411 const char* alg_name = Compressor::get_comp_alg_name(alg);
9412 derr << __func__ << " can't load decompressor " << alg_name << dendl;
9413 _set_compression_alert(false, alg_name);
9414 r = -EIO;
9415 } else {
9416 r = cp->decompress(i, chdr.length, *result);
9417 if (r < 0) {
9418 derr << __func__ << " decompression failed with exit code " << r << dendl;
9419 r = -EIO;
9420 }
9421 }
9422 log_latency(__func__,
9423 l_bluestore_decompress_lat,
9424 mono_clock::now() - start,
9425 cct->_conf->bluestore_log_op_age);
9426 return r;
9427}
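The on-disk framing decoded here is a bluestore_compression_header_t followed by the compressed payload; the header supplies the algorithm and the length that is handed to decompress(). A condensed sketch of the same flow, error handling omitted:

    auto it = source.cbegin();
    bluestore_compression_header_t hdr;
    decode(hdr, it);                         // algorithm + payload length
    CompressorRef comp = Compressor::create(cct, int(hdr.type));
    bufferlist out;
    if (comp)
      comp->decompress(it, hdr.length, out); // inflate into out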
9428
 9429// this stores the fiemap result into an interval_set; the other
 9430// fiemap variants use it internally
9431int BlueStore::_fiemap(
9432 CollectionHandle &c_,
9433 const ghobject_t& oid,
9434 uint64_t offset,
9435 size_t length,
9436 interval_set<uint64_t>& destset)
9437{
9438 Collection *c = static_cast<Collection *>(c_.get());
9439 if (!c->exists)
9440 return -ENOENT;
9441 {
9442 RWLock::RLocker l(c->lock);
9443
9444 OnodeRef o = c->get_onode(oid, false);
9445 if (!o || !o->exists) {
9446 return -ENOENT;
9447 }
81eedcae 9448 _dump_onode<30>(cct, *o);
9449
9450 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9451 << " size 0x" << o->onode.size << std::dec << dendl;
9452
9453 boost::intrusive::set<Extent>::iterator ep, eend;
9454 if (offset >= o->onode.size)
9455 goto out;
9456
9457 if (offset + length > o->onode.size) {
9458 length = o->onode.size - offset;
9459 }
9460
9461 o->extent_map.fault_range(db, offset, length);
9462 eend = o->extent_map.extent_map.end();
9463 ep = o->extent_map.seek_lextent(offset);
9464 while (length > 0) {
9465 dout(20) << __func__ << " offset " << offset << dendl;
9466 if (ep != eend && ep->logical_offset + ep->length <= offset) {
9467 ++ep;
9468 continue;
9469 }
9470
9471 uint64_t x_len = length;
9472 if (ep != eend && ep->logical_offset <= offset) {
9473 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 9474 x_len = std::min(x_len, ep->length - x_off);
9475 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
9476 << x_len << std::dec << " blob " << ep->blob << dendl;
9477 destset.insert(offset, x_len);
9478 length -= x_len;
9479 offset += x_len;
9480 if (x_off + x_len == ep->length)
9481 ++ep;
9482 continue;
9483 }
9484 if (ep != eend &&
9485 ep->logical_offset > offset &&
9486 ep->logical_offset - offset < x_len) {
9487 x_len = ep->logical_offset - offset;
9488 }
9489 offset += x_len;
9490 length -= x_len;
9491 }
9492 }
9493
9494 out:
9495 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9496 << " size = 0x(" << destset << ")" << std::dec << dendl;
9497 return 0;
9498}
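Because the loop above only records ranges backed by lextents, holes never appear in the result. For example (offsets invented), an object with data at 0x0~0x1000 and 0x3000~0x1000 queried over 0x0~0x4000 via the map-based overload below would yield:

    map<uint64_t, uint64_t> m;
    store->fiemap(ch, oid, 0x0, 0x4000, m); // store/ch/oid assumed to exist
    // m == {{0x0, 0x1000}, {0x3000, 0x1000}}; the hole at
    // 0x1000~0x2000 is simply absent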
9499
9500int BlueStore::fiemap(
9501 CollectionHandle &c_,
9502 const ghobject_t& oid,
9503 uint64_t offset,
9504 size_t length,
9505 bufferlist& bl)
9506{
9507 interval_set<uint64_t> m;
9508 int r = _fiemap(c_, oid, offset, length, m);
9509 if (r >= 0) {
11fdf7f2 9510 encode(m, bl);
9511 }
9512 return r;
9513}
9514
9515int BlueStore::fiemap(
9516 CollectionHandle &c_,
9517 const ghobject_t& oid,
9518 uint64_t offset,
9519 size_t length,
9520 map<uint64_t, uint64_t>& destmap)
9521{
9522 interval_set<uint64_t> m;
9523 int r = _fiemap(c_, oid, offset, length, m);
9524 if (r >= 0) {
9525 m.move_into(destmap);
9526 }
9527 return r;
9528}
9529
9530int BlueStore::getattr(
9531 CollectionHandle &c_,
9532 const ghobject_t& oid,
9533 const char *name,
9534 bufferptr& value)
9535{
9536 Collection *c = static_cast<Collection *>(c_.get());
9537 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
9538 if (!c->exists)
9539 return -ENOENT;
9540
9541 int r;
9542 {
9543 RWLock::RLocker l(c->lock);
31f18b77 9544 mempool::bluestore_cache_other::string k(name);
9545
9546 OnodeRef o = c->get_onode(oid, false);
9547 if (!o || !o->exists) {
9548 r = -ENOENT;
9549 goto out;
9550 }
9551
9552 if (!o->onode.attrs.count(k)) {
9553 r = -ENODATA;
9554 goto out;
9555 }
9556 value = o->onode.attrs[k];
9557 r = 0;
9558 }
9559 out:
9560 if (r == 0 && _debug_mdata_eio(oid)) {
9561 r = -EIO;
9562 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9563 }
9564 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
9565 << " = " << r << dendl;
9566 return r;
9567}
9568
9569int BlueStore::getattrs(
9570 CollectionHandle &c_,
9571 const ghobject_t& oid,
9572 map<string,bufferptr>& aset)
9573{
9574 Collection *c = static_cast<Collection *>(c_.get());
9575 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
9576 if (!c->exists)
9577 return -ENOENT;
9578
9579 int r;
9580 {
9581 RWLock::RLocker l(c->lock);
9582
9583 OnodeRef o = c->get_onode(oid, false);
9584 if (!o || !o->exists) {
9585 r = -ENOENT;
9586 goto out;
9587 }
9588 for (auto& i : o->onode.attrs) {
9589 aset.emplace(i.first.c_str(), i.second);
9590 }
9591 r = 0;
9592 }
9593
9594 out:
9595 if (r == 0 && _debug_mdata_eio(oid)) {
9596 r = -EIO;
9597 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9598 }
9599 dout(10) << __func__ << " " << c->cid << " " << oid
9600 << " = " << r << dendl;
9601 return r;
9602}
9603
9604int BlueStore::list_collections(vector<coll_t>& ls)
9605{
9606 RWLock::RLocker l(coll_lock);
11fdf7f2 9607 ls.reserve(coll_map.size());
9608 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
9609 p != coll_map.end();
9610 ++p)
9611 ls.push_back(p->first);
9612 return 0;
9613}
9614
9615bool BlueStore::collection_exists(const coll_t& c)
9616{
9617 RWLock::RLocker l(coll_lock);
9618 return coll_map.count(c);
9619}
9620
11fdf7f2 9621int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 9622{
11fdf7f2 9623 dout(15) << __func__ << " " << ch->cid << dendl;
9624 vector<ghobject_t> ls;
9625 ghobject_t next;
11fdf7f2 9626 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
9627 &ls, &next);
9628 if (r < 0) {
9629 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
9630 << dendl;
9631 return r;
9632 }
9633 *empty = ls.empty();
11fdf7f2 9634 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
9635 return 0;
9636}
9637
11fdf7f2 9638int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 9639{
9640 dout(15) << __func__ << " " << ch->cid << dendl;
9641 Collection *c = static_cast<Collection*>(ch.get());
7c673cae 9642 RWLock::RLocker l(c->lock);
11fdf7f2 9643 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
9644 return c->cnode.bits;
9645}
9646
9647int BlueStore::collection_list(
9648 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
9649 vector<ghobject_t> *ls, ghobject_t *pnext)
9650{
9651 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 9652 c->flush();
9653 dout(15) << __func__ << " " << c->cid
9654 << " start " << start << " end " << end << " max " << max << dendl;
9655 int r;
9656 {
9657 RWLock::RLocker l(c->lock);
9658 r = _collection_list(c, start, end, max, ls, pnext);
9659 }
9660
9661 dout(10) << __func__ << " " << c->cid
9662 << " start " << start << " end " << end << " max " << max
9663 << " = " << r << ", ls.size() = " << ls->size()
9664 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
9665 return r;
9666}
9667
9668int BlueStore::_collection_list(
9669 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
9670 vector<ghobject_t> *ls, ghobject_t *pnext)
9671{
9672
9673 if (!c->exists)
9674 return -ENOENT;
9675
494da23a 9676 auto start_time = mono_clock::now();
9677 int r = 0;
9678 ghobject_t static_next;
9679 KeyValueDB::Iterator it;
9680 string temp_start_key, temp_end_key;
9681 string start_key, end_key;
9682 bool set_next = false;
9683 string pend;
9684 bool temp;
9685
9686 if (!pnext)
9687 pnext = &static_next;
9688
11fdf7f2 9689 if (start.is_max() || start.hobj.is_max()) {
9690 goto out;
9691 }
9692 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
9693 &start_key, &end_key);
9694 dout(20) << __func__
9695 << " range " << pretty_binary_string(temp_start_key)
9696 << " to " << pretty_binary_string(temp_end_key)
9697 << " and " << pretty_binary_string(start_key)
9698 << " to " << pretty_binary_string(end_key)
9699 << " start " << start << dendl;
9700 it = db->get_iterator(PREFIX_OBJ);
9701 if (start == ghobject_t() ||
9702 start.hobj == hobject_t() ||
9703 start == c->cid.get_min_hobj()) {
9704 it->upper_bound(temp_start_key);
9705 temp = true;
9706 } else {
9707 string k;
9708 get_object_key(cct, start, &k);
9709 if (start.hobj.is_temp()) {
9710 temp = true;
11fdf7f2 9711 ceph_assert(k >= temp_start_key && k < temp_end_key);
9712 } else {
9713 temp = false;
11fdf7f2 9714 ceph_assert(k >= start_key && k < end_key);
7c673cae 9715 }
11fdf7f2 9716 dout(20) << __func__ << " start from " << pretty_binary_string(k)
9717 << " temp=" << (int)temp << dendl;
9718 it->lower_bound(k);
9719 }
9720 if (end.hobj.is_max()) {
9721 pend = temp ? temp_end_key : end_key;
9722 } else {
9723 get_object_key(cct, end, &end_key);
9724 if (end.hobj.is_temp()) {
9725 if (temp)
9726 pend = end_key;
9727 else
9728 goto out;
9729 } else {
9730 pend = temp ? temp_end_key : end_key;
9731 }
9732 }
9733 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
9734 while (true) {
9735 if (!it->valid() || it->key() >= pend) {
9736 if (!it->valid())
9737 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
9738 else
9739 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
9740 << " >= " << end << dendl;
9741 if (temp) {
9742 if (end.hobj.is_temp()) {
9743 break;
9744 }
9745 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
9746 temp = false;
9747 it->upper_bound(start_key);
9748 pend = end_key;
9749 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
9750 continue;
9751 }
9752 break;
9753 }
9754 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
9755 if (is_extent_shard_key(it->key())) {
9756 it->next();
9757 continue;
9758 }
9759 ghobject_t oid;
9760 int r = get_key_object(it->key(), &oid);
11fdf7f2 9761 ceph_assert(r == 0);
9762 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
9763 if (ls->size() >= (unsigned)max) {
9764 dout(20) << __func__ << " reached max " << max << dendl;
9765 *pnext = oid;
9766 set_next = true;
9767 break;
9768 }
9769 ls->push_back(oid);
9770 it->next();
9771 }
9772out:
9773 if (!set_next) {
9774 *pnext = ghobject_t::get_max();
9775 }
9776 log_latency_fn(
9777 __func__,
9778 l_bluestore_clist_lat,
9779 mono_clock::now() - start_time,
9780 cct->_conf->bluestore_log_collection_list_age,
9781 [&] (const ceph::timespan& lat) {
9782 ostringstream ostr;
9783 ostr << ", lat = " << timespan_str(lat)
9784 << " cid =" << c->cid
9785 << " start " << start << " end " << end
9786 << " max " << max;
9787 return ostr.str();
9788 }
9789 );
9790 return r;
9791}
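The iteration above makes at most two passes over the object keyspace: it starts in the temp-object range and, if that is exhausted before reaching end, reseeds the iterator at the start of the regular range. Schematically, in terms of the two ranges produced by get_coll_key_range():

    // pass 1 (temp == true):  keys in [temp_start_key, temp_end_key)
    // pass 2 (temp == false): keys in [start_key, end_key)
    // the switch happens when the iterator runs past pend while end
    // itself is not a temp object; extent-shard keys are skipped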
9792
9793int BlueStore::omap_get(
9794 CollectionHandle &c_, ///< [in] Collection containing oid
9795 const ghobject_t &oid, ///< [in] Object containing omap
9796 bufferlist *header, ///< [out] omap header
 9797 map<string, bufferlist> *out ///< [out] Key to value map
9798 )
9799{
9800 Collection *c = static_cast<Collection *>(c_.get());
9801 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9802 if (!c->exists)
9803 return -ENOENT;
9804 RWLock::RLocker l(c->lock);
9805 int r = 0;
9806 OnodeRef o = c->get_onode(oid, false);
9807 if (!o || !o->exists) {
9808 r = -ENOENT;
9809 goto out;
9810 }
9811 if (!o->onode.has_omap())
9812 goto out;
9813 o->flush();
9814 {
9815 const string& prefix =
9816 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9817 KeyValueDB::Iterator it = db->get_iterator(prefix);
9818 string head, tail;
9819 get_omap_header(o->onode.nid, &head);
9820 get_omap_tail(o->onode.nid, &tail);
9821 it->lower_bound(head);
9822 while (it->valid()) {
9823 if (it->key() == head) {
9824 dout(30) << __func__ << " got header" << dendl;
9825 *header = it->value();
9826 } else if (it->key() >= tail) {
9827 dout(30) << __func__ << " reached tail" << dendl;
9828 break;
9829 } else {
9830 string user_key;
9831 decode_omap_key(it->key(), &user_key);
11fdf7f2 9832 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
9833 << " -> " << user_key << dendl;
9834 (*out)[user_key] = it->value();
9835 }
9836 it->next();
9837 }
9838 }
9839 out:
9840 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9841 << dendl;
9842 return r;
9843}
9844
9845int BlueStore::omap_get_header(
9846 CollectionHandle &c_, ///< [in] Collection containing oid
9847 const ghobject_t &oid, ///< [in] Object containing omap
9848 bufferlist *header, ///< [out] omap header
9849 bool allow_eio ///< [in] don't assert on eio
9850 )
9851{
9852 Collection *c = static_cast<Collection *>(c_.get());
9853 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9854 if (!c->exists)
9855 return -ENOENT;
9856 RWLock::RLocker l(c->lock);
9857 int r = 0;
9858 OnodeRef o = c->get_onode(oid, false);
9859 if (!o || !o->exists) {
9860 r = -ENOENT;
9861 goto out;
9862 }
9863 if (!o->onode.has_omap())
9864 goto out;
9865 o->flush();
9866 {
9867 string head;
9868 get_omap_header(o->onode.nid, &head);
9869 if (db->get(o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
9870 head, header) >= 0) {
9871 dout(30) << __func__ << " got header" << dendl;
9872 } else {
9873 dout(30) << __func__ << " no header" << dendl;
9874 }
9875 }
9876 out:
9877 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9878 << dendl;
9879 return r;
9880}
9881
9882int BlueStore::omap_get_keys(
9883 CollectionHandle &c_, ///< [in] Collection containing oid
9884 const ghobject_t &oid, ///< [in] Object containing omap
9885 set<string> *keys ///< [out] Keys defined on oid
9886 )
9887{
9888 Collection *c = static_cast<Collection *>(c_.get());
9889 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9890 if (!c->exists)
9891 return -ENOENT;
9892 RWLock::RLocker l(c->lock);
9893 int r = 0;
9894 OnodeRef o = c->get_onode(oid, false);
9895 if (!o || !o->exists) {
9896 r = -ENOENT;
9897 goto out;
9898 }
9899 if (!o->onode.has_omap())
9900 goto out;
9901 o->flush();
9902 {
9903 const string& prefix =
9904 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9905 KeyValueDB::Iterator it = db->get_iterator(prefix);
9906 string head, tail;
9907 get_omap_key(o->onode.nid, string(), &head);
9908 get_omap_tail(o->onode.nid, &tail);
9909 it->lower_bound(head);
9910 while (it->valid()) {
9911 if (it->key() >= tail) {
9912 dout(30) << __func__ << " reached tail" << dendl;
9913 break;
9914 }
9915 string user_key;
9916 decode_omap_key(it->key(), &user_key);
11fdf7f2 9917 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
9918 << " -> " << user_key << dendl;
9919 keys->insert(user_key);
9920 it->next();
9921 }
9922 }
9923 out:
9924 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9925 << dendl;
9926 return r;
9927}
9928
9929int BlueStore::omap_get_values(
9930 CollectionHandle &c_, ///< [in] Collection containing oid
9931 const ghobject_t &oid, ///< [in] Object containing omap
9932 const set<string> &keys, ///< [in] Keys to get
9933 map<string, bufferlist> *out ///< [out] Returned keys and values
9934 )
9935{
9936 Collection *c = static_cast<Collection *>(c_.get());
9937 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9938 if (!c->exists)
9939 return -ENOENT;
9940 RWLock::RLocker l(c->lock);
9941 int r = 0;
9942 string final_key;
9943 OnodeRef o = c->get_onode(oid, false);
9944 if (!o || !o->exists) {
9945 r = -ENOENT;
9946 goto out;
9947 }
9948 if (!o->onode.has_omap())
9949 goto out;
9950 {
9951 const string& prefix =
9952 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9953 o->flush();
9954 _key_encode_u64(o->onode.nid, &final_key);
9955 final_key.push_back('.');
9956 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9957 final_key.resize(9); // keep prefix
9958 final_key += *p;
9959 bufferlist val;
9960 if (db->get(prefix, final_key, &val) >= 0) {
9961 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
9962 << " -> " << *p << dendl;
9963 out->insert(make_pair(*p, val));
9964 }
9965 }
9966 }
9967 out:
9968 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
9969 << dendl;
9970 return r;
9971}
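The final_key.resize(9) trick above depends on the omap key layout: an 8-byte encoded nid plus a '.' separator, followed by the user key, so resizing back to 9 bytes restores the per-object prefix before the next key is appended. A sketch of building one such key, mirroring the code above (the user key is hypothetical):

    string key;
    _key_encode_u64(o->onode.nid, &key); // 8-byte object prefix
    key.push_back('.');                  // separator; length is now 9
    key += "some_user_key";              // hypothetical user key
    // db->get(prefix, key, &val) then fetches that one omap value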
9972
9973int BlueStore::omap_check_keys(
9974 CollectionHandle &c_, ///< [in] Collection containing oid
9975 const ghobject_t &oid, ///< [in] Object containing omap
9976 const set<string> &keys, ///< [in] Keys to check
9977 set<string> *out ///< [out] Subset of keys defined on oid
9978 )
9979{
9980 Collection *c = static_cast<Collection *>(c_.get());
9981 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
9982 if (!c->exists)
9983 return -ENOENT;
9984 RWLock::RLocker l(c->lock);
9985 int r = 0;
9986 string final_key;
9987 OnodeRef o = c->get_onode(oid, false);
9988 if (!o || !o->exists) {
9989 r = -ENOENT;
9990 goto out;
9991 }
9992 if (!o->onode.has_omap())
9993 goto out;
9994 {
9995 const string& prefix =
9996 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
9997 o->flush();
9998 _key_encode_u64(o->onode.nid, &final_key);
9999 final_key.push_back('.');
10000 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
10001 final_key.resize(9); // keep prefix
10002 final_key += *p;
10003 bufferlist val;
10004 if (db->get(prefix, final_key, &val) >= 0) {
10005 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
10006 << " -> " << *p << dendl;
10007 out->insert(*p);
10008 } else {
10009 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
10010 << " -> " << *p << dendl;
10011 }
10012 }
10013 }
10014 out:
10015 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10016 << dendl;
10017 return r;
10018}
10019
10020ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
10021 CollectionHandle &c_, ///< [in] collection
10022 const ghobject_t &oid ///< [in] object
10023 )
10024{
10025 Collection *c = static_cast<Collection *>(c_.get());
10026 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10027 if (!c->exists) {
10028 return ObjectMap::ObjectMapIterator();
10029 }
10030 RWLock::RLocker l(c->lock);
10031 OnodeRef o = c->get_onode(oid, false);
10032 if (!o || !o->exists) {
 10033 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
10034 return ObjectMap::ObjectMapIterator();
10035 }
10036 o->flush();
 10037 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
10038 KeyValueDB::Iterator it = db->get_iterator(
10039 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP);
10040 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
10041}
10042
10043// -----------------
10044// write helpers
10045
10046uint64_t BlueStore::_get_ondisk_reserved() const {
10047 return round_up_to(
10048 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
10049}
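Two worked cases for the expression above, assuming SUPER_RESERVED is 8 KiB:

    // min_alloc_size = 0x1000:  round_up_to(max(0x2000, 0x1000), 0x1000)  = 0x2000
    // min_alloc_size = 0x10000: round_up_to(max(0x2000, 0x10000), 0x10000) = 0x10000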
10050
10051void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
10052{
10053 dout(10) << __func__ << " ondisk_format " << ondisk_format
10054 << " min_compat_ondisk_format " << min_compat_ondisk_format
10055 << dendl;
11fdf7f2 10056 ceph_assert(ondisk_format == latest_ondisk_format);
10057 {
10058 bufferlist bl;
11fdf7f2 10059 encode(ondisk_format, bl);
10060 t->set(PREFIX_SUPER, "ondisk_format", bl);
10061 }
10062 {
10063 bufferlist bl;
11fdf7f2 10064 encode(min_compat_ondisk_format, bl);
10065 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
10066 }
10067}
10068
10069int BlueStore::_open_super_meta()
10070{
10071 // nid
10072 {
10073 nid_max = 0;
10074 bufferlist bl;
10075 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 10076 auto p = bl.cbegin();
10077 try {
10078 uint64_t v;
11fdf7f2 10079 decode(v, p);
10080 nid_max = v;
10081 } catch (buffer::error& e) {
10082 derr << __func__ << " unable to read nid_max" << dendl;
10083 return -EIO;
10084 }
10085 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
10086 nid_last = nid_max.load();
10087 }
10088
10089 // blobid
10090 {
10091 blobid_max = 0;
10092 bufferlist bl;
10093 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 10094 auto p = bl.cbegin();
10095 try {
10096 uint64_t v;
11fdf7f2 10097 decode(v, p);
10098 blobid_max = v;
10099 } catch (buffer::error& e) {
10100 derr << __func__ << " unable to read blobid_max" << dendl;
10101 return -EIO;
10102 }
10103 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
10104 blobid_last = blobid_max.load();
10105 }
10106
10107 // freelist
10108 {
10109 bufferlist bl;
10110 db->get(PREFIX_SUPER, "freelist_type", &bl);
10111 if (bl.length()) {
10112 freelist_type = std::string(bl.c_str(), bl.length());
10113 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
10114 } else {
11fdf7f2 10115 ceph_abort_msg("unsupported extent freelist manager");
7c673cae 10116 }
10117 }
10118
10119 // ondisk format
10120 int32_t compat_ondisk_format = 0;
10121 {
10122 bufferlist bl;
10123 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
10124 if (r < 0) {
10125 // base case: kraken bluestore is v1 and readable by v1
10126 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
10127 << dendl;
10128 ondisk_format = 1;
10129 compat_ondisk_format = 1;
10130 } else {
11fdf7f2 10131 auto p = bl.cbegin();
7c673cae 10132 try {
11fdf7f2 10133 decode(ondisk_format, p);
10134 } catch (buffer::error& e) {
10135 derr << __func__ << " unable to read ondisk_format" << dendl;
10136 return -EIO;
10137 }
10138 bl.clear();
10139 {
10140 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
10141 ceph_assert(!r);
10142 auto p = bl.cbegin();
7c673cae 10143 try {
11fdf7f2 10144 decode(compat_ondisk_format, p);
10145 } catch (buffer::error& e) {
10146 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
10147 return -EIO;
10148 }
10149 }
10150 }
10151 dout(10) << __func__ << " ondisk_format " << ondisk_format
10152 << " compat_ondisk_format " << compat_ondisk_format
10153 << dendl;
10154 }
10155
10156 if (latest_ondisk_format < compat_ondisk_format) {
10157 derr << __func__ << " compat_ondisk_format is "
10158 << compat_ondisk_format << " but we only understand version "
10159 << latest_ondisk_format << dendl;
10160 return -EPERM;
10161 }
10162
10163 {
10164 bufferlist bl;
10165 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 10166 auto p = bl.cbegin();
10167 try {
10168 uint64_t val;
11fdf7f2 10169 decode(val, p);
7c673cae 10170 min_alloc_size = val;
224ce89b 10171 min_alloc_size_order = ctz(val);
11fdf7f2 10172 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
10173 } catch (buffer::error& e) {
10174 derr << __func__ << " unable to read min_alloc_size" << dendl;
10175 return -EIO;
10176 }
10177 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
10178 << std::dec << dendl;
10179 }
224ce89b 10180 _open_statfs();
10181 _set_alloc_sizes();
10182 _set_throttle_params();
10183
10184 _set_csum();
10185 _set_compression();
10186 _set_blob_size();
10187
11fdf7f2 10188 _validate_bdev();
10189 return 0;
10190}
10191
10192int BlueStore::_upgrade_super()
10193{
10194 dout(1) << __func__ << " from " << ondisk_format << ", latest "
10195 << latest_ondisk_format << dendl;
10196 if (ondisk_format < latest_ondisk_format) {
10197 ceph_assert(ondisk_format > 0);
10198 ceph_assert(ondisk_format < latest_ondisk_format);
10199
10200 if (ondisk_format == 1) {
10201 // changes:
10202 // - super: added ondisk_format
10203 // - super: added min_readable_ondisk_format
10204 // - super: added min_compat_ondisk_format
10205 // - super: added min_alloc_size
10206 // - super: removed min_min_alloc_size
10207 KeyValueDB::Transaction t = db->get_transaction();
10208 {
10209 bufferlist bl;
10210 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
10211 auto p = bl.cbegin();
10212 try {
10213 uint64_t val;
10214 decode(val, p);
10215 min_alloc_size = val;
10216 } catch (buffer::error& e) {
10217 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
10218 return -EIO;
10219 }
10220 t->set(PREFIX_SUPER, "min_alloc_size", bl);
10221 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 10222 }
10223 ondisk_format = 2;
10224 _prepare_ondisk_format_super(t);
10225 int r = db->submit_transaction_sync(t);
10226 ceph_assert(r == 0);
7c673cae 10227 }
7c673cae 10228 }
10229 // done
10230 dout(1) << __func__ << " done" << dendl;
10231 return 0;
10232}
10233
10234void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
10235{
224ce89b 10236 if (o->onode.nid) {
11fdf7f2 10237 ceph_assert(o->exists);
7c673cae 10238 return;
224ce89b 10239 }
10240 uint64_t nid = ++nid_last;
10241 dout(20) << __func__ << " " << nid << dendl;
10242 o->onode.nid = nid;
10243 txc->last_nid = nid;
224ce89b 10244 o->exists = true;
10245}
10246
10247uint64_t BlueStore::_assign_blobid(TransContext *txc)
10248{
10249 uint64_t bid = ++blobid_last;
10250 dout(20) << __func__ << " " << bid << dendl;
10251 txc->last_blobid = bid;
10252 return bid;
10253}
10254
10255void BlueStore::get_db_statistics(Formatter *f)
10256{
10257 db->get_statistics(f);
10258}
10259
10260BlueStore::TransContext *BlueStore::_txc_create(
10261 Collection *c, OpSequencer *osr,
10262 list<Context*> *on_commits)
7c673cae 10263{
11fdf7f2 10264 TransContext *txc = new TransContext(cct, c, osr, on_commits);
10265 txc->t = db->get_transaction();
10266 osr->queue_new(txc);
10267 dout(20) << __func__ << " osr " << osr << " = " << txc
10268 << " seq " << txc->seq << dendl;
10269 return txc;
10270}
10271
10272void BlueStore::_txc_calc_cost(TransContext *txc)
10273{
10274 // one "io" for the kv commit
10275 auto ios = 1 + txc->ioc.get_num_ios();
10276 auto cost = throttle_cost_per_io.load();
10277 txc->cost = ios * cost + txc->bytes;
10278 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
10279 << ios << " ios * " << cost << " + " << txc->bytes
10280 << " bytes)" << dendl;
10281}
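A quick worked example of the cost formula above, with invented numbers:

    // ios  = 1 (kv commit) + 2 (data aios)      = 3
    // cost = ios * throttle_cost_per_io + bytes
    //      = 3 * 670000 + 8192                  = 2018192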
10282
10283void BlueStore::_txc_update_store_statfs(TransContext *txc)
10284{
10285 if (txc->statfs_delta.is_empty())
10286 return;
10287
10288 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
10289 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
10290 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
10291 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
10292 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
10293
10294 bufferlist bl;
10295 txc->statfs_delta.encode(bl);
10296 if (per_pool_stat_collection) {
10297 string key;
10298 get_pool_stat_key(txc->osd_pool_id, &key);
10299 txc->t->merge(PREFIX_STAT, key, bl);
10300
10301 std::lock_guard l(vstatfs_lock);
10302 auto& stats = osd_pools[txc->osd_pool_id];
10303 stats += txc->statfs_delta;
10304
10305 vstatfs += txc->statfs_delta; //non-persistent in this mode
10306
10307 } else {
10308 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
7c673cae 10309
10310 std::lock_guard l(vstatfs_lock);
10311 vstatfs += txc->statfs_delta;
10312 }
10313 txc->statfs_delta.reset();
10314}
10315
10316void BlueStore::_txc_state_proc(TransContext *txc)
10317{
10318 while (true) {
10319 dout(10) << __func__ << " txc " << txc
10320 << " " << txc->get_state_name() << dendl;
10321 switch (txc->state) {
10322 case TransContext::STATE_PREPARE:
10323 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
10324 if (txc->ioc.has_pending_aios()) {
10325 txc->state = TransContext::STATE_AIO_WAIT;
10326 txc->had_ios = true;
10327 _txc_aio_submit(txc);
10328 return;
10329 }
10330 // ** fall-thru **
10331
10332 case TransContext::STATE_AIO_WAIT:
10333 {
10334 utime_t lat = txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
10335 if (lat >= cct->_conf->bluestore_log_op_age) {
10336 dout(0) << __func__ << " slow aio_wait, txc = " << txc
10337 << ", latency = " << lat
10338 << dendl;
10339 }
10340 }
10341
10342 _txc_finish_io(txc); // may trigger blocked txc's too
10343 return;
10344
10345 case TransContext::STATE_IO_DONE:
11fdf7f2 10346 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
10347 if (txc->had_ios) {
10348 ++txc->osr->txc_with_unstable_io;
10349 }
10350 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
10351 txc->state = TransContext::STATE_KV_QUEUED;
10352 if (cct->_conf->bluestore_sync_submit_transaction) {
10353 if (txc->last_nid >= nid_max ||
10354 txc->last_blobid >= blobid_max) {
10355 dout(20) << __func__
10356 << " last_{nid,blobid} exceeds max, submit via kv thread"
10357 << dendl;
10358 } else if (txc->osr->kv_committing_serially) {
10359 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
10360 << dendl;
10361 // note: this is starvation-prone. once we have a txc in a busy
10362 // sequencer that is committing serially it is possible to keep
10363 // submitting new transactions fast enough that we get stuck doing
10364 // so. the alternative is to block here... fixme?
10365 } else if (txc->osr->txc_with_unstable_io) {
10366 dout(20) << __func__ << " prior txc(s) with unstable ios "
10367 << txc->osr->txc_with_unstable_io.load() << dendl;
10368 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
10369 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
10370 == 0) {
10371 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
10372 << dendl;
10373 } else {
31f18b77 10374 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11fdf7f2 10375 ceph_assert(r == 0);
eafe8130 10376 txc->state = TransContext::STATE_KV_SUBMITTED;
10377 _txc_applied_kv(txc);
10378 }
10379 }
10380 {
11fdf7f2 10381 std::lock_guard l(kv_lock);
10382 kv_queue.push_back(txc);
10383 kv_cond.notify_one();
10384 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
10385 kv_queue_unsubmitted.push_back(txc);
10386 ++txc->osr->kv_committing_serially;
10387 }
10388 if (txc->had_ios)
10389 kv_ios++;
10390 kv_throttle_costs += txc->cost;
10391 }
10392 return;
10393 case TransContext::STATE_KV_SUBMITTED:
10394 _txc_committed_kv(txc);
10395 // ** fall-thru **
10396
10397 case TransContext::STATE_KV_DONE:
10398 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
10399 if (txc->deferred_txn) {
10400 txc->state = TransContext::STATE_DEFERRED_QUEUED;
10401 _deferred_queue(txc);
10402 return;
10403 }
10404 txc->state = TransContext::STATE_FINISHING;
10405 break;
10406
10407 case TransContext::STATE_DEFERRED_CLEANUP:
10408 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
10409 txc->state = TransContext::STATE_FINISHING;
10410 // ** fall-thru **
10411
10412 case TransContext::STATE_FINISHING:
10413 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
10414 _txc_finish(txc);
10415 return;
10416
10417 default:
10418 derr << __func__ << " unexpected txc " << txc
10419 << " state " << txc->get_state_name() << dendl;
11fdf7f2 10420 ceph_abort_msg("unexpected txc state");
10421 return;
10422 }
10423 }
10424}
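Taken together, the switch above walks each transaction through this state sequence (the deferred branch only applies when txc->deferred_txn is set):

    PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED
            -> KV_DONE [-> DEFERRED_QUEUED -> DEFERRED_CLEANUP]
            -> FINISHING -> DONE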
10425
10426void BlueStore::_txc_finish_io(TransContext *txc)
10427{
10428 dout(20) << __func__ << " " << txc << dendl;
10429
10430 /*
10431 * we need to preserve the order of kv transactions,
10432 * even though aio will complete in any order.
10433 */
10434
10435 OpSequencer *osr = txc->osr.get();
11fdf7f2 10436 std::lock_guard l(osr->qlock);
7c673cae 10437 txc->state = TransContext::STATE_IO_DONE;
11fdf7f2 10438 txc->ioc.release_running_aios();
10439 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
10440 while (p != osr->q.begin()) {
10441 --p;
10442 if (p->state < TransContext::STATE_IO_DONE) {
10443 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
10444 << p->get_state_name() << dendl;
10445 return;
10446 }
10447 if (p->state > TransContext::STATE_IO_DONE) {
10448 ++p;
10449 break;
10450 }
10451 }
10452 do {
10453 _txc_state_proc(&*p++);
10454 } while (p != osr->q.end() &&
10455 p->state == TransContext::STATE_IO_DONE);
10456
11fdf7f2 10457 if (osr->kv_submitted_waiters) {
10458 osr->qcond.notify_all();
10459 }
10460}
10461
10462void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
10463{
10464 dout(20) << __func__ << " txc " << txc
10465 << " onodes " << txc->onodes
10466 << " shared_blobs " << txc->shared_blobs
10467 << dendl;
10468
10469 // finalize onodes
10470 for (auto o : txc->onodes) {
11fdf7f2 10471 _record_onode(o, t);
10472 o->flushing_count++;
10473 }
10474
10475 // objects we modified but didn't affect the onode
10476 auto p = txc->modified_objects.begin();
10477 while (p != txc->modified_objects.end()) {
10478 if (txc->onodes.count(*p) == 0) {
10479 (*p)->flushing_count++;
10480 ++p;
10481 } else {
10482 // remove dups with onodes list to avoid problems in _txc_finish
10483 p = txc->modified_objects.erase(p);
10484 }
10485 }
10486
10487 // finalize shared_blobs
10488 for (auto sb : txc->shared_blobs) {
10489 string key;
10490 auto sbid = sb->get_sbid();
10491 get_shared_blob_key(sbid, &key);
10492 if (sb->persistent->empty()) {
10493 dout(20) << __func__ << " shared_blob 0x"
10494 << std::hex << sbid << std::dec
10495 << " is empty" << dendl;
10496 t->rmkey(PREFIX_SHARED_BLOB, key);
10497 } else {
10498 bufferlist bl;
10499 encode(*(sb->persistent), bl);
10500 dout(20) << __func__ << " shared_blob 0x"
10501 << std::hex << sbid << std::dec
31f18b77 10502 << " is " << bl.length() << " " << *sb << dendl;
10503 t->set(PREFIX_SHARED_BLOB, key, bl);
10504 }
10505 }
10506}
10507
10508void BlueStore::BSPerfTracker::update_from_perfcounters(
10509 PerfCounters &logger)
10510{
10511 os_commit_latency_ns.consume_next(
10512 logger.get_tavg_ns(
7c673cae 10513 l_bluestore_commit_lat));
10514 os_apply_latency_ns.consume_next(
10515 logger.get_tavg_ns(
10516 l_bluestore_commit_lat));
10517}
10518
10519void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
10520{
10521 dout(20) << __func__ << " txc " << txc << std::hex
10522 << " allocated 0x" << txc->allocated
10523 << " released 0x" << txc->released
10524 << std::dec << dendl;
10525
10526 // We have to handle the case where we allocate *and* deallocate the
10527 // same region in this transaction. The freelist doesn't like that.
10528 // (Actually, the only thing that cares is the BitmapFreelistManager
10529 // debug check. But that's important.)
10530 interval_set<uint64_t> tmp_allocated, tmp_released;
10531 interval_set<uint64_t> *pallocated = &txc->allocated;
10532 interval_set<uint64_t> *preleased = &txc->released;
10533 if (!txc->allocated.empty() && !txc->released.empty()) {
10534 interval_set<uint64_t> overlap;
10535 overlap.intersection_of(txc->allocated, txc->released);
10536 if (!overlap.empty()) {
10537 tmp_allocated = txc->allocated;
10538 tmp_allocated.subtract(overlap);
10539 tmp_released = txc->released;
10540 tmp_released.subtract(overlap);
10541 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
10542 << ", new allocated 0x" << tmp_allocated
10543 << " released 0x" << tmp_released << std::dec
10544 << dendl;
10545 pallocated = &tmp_allocated;
10546 preleased = &tmp_released;
10547 }
10548 }
10549
10550 // update freelist with non-overlap sets
10551 for (interval_set<uint64_t>::iterator p = pallocated->begin();
10552 p != pallocated->end();
10553 ++p) {
10554 fm->allocate(p.get_start(), p.get_len(), t);
10555 }
10556 for (interval_set<uint64_t>::iterator p = preleased->begin();
10557 p != preleased->end();
10558 ++p) {
10559 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
10560 << "~" << p.get_len() << std::dec << dendl;
10561 fm->release(p.get_start(), p.get_len(), t);
10562 }
10563
10564 _txc_update_store_statfs(txc);
10565}
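The overlap subtraction above matters when a transaction both allocates and frees the same region. A small interval_set illustration with invented ranges:

    interval_set<uint64_t> allocated, released, overlap;
    allocated.insert(0x0, 0x3000);
    released.insert(0x2000, 0x2000);
    overlap.intersection_of(allocated, released); // {0x2000~0x1000}
    allocated.subtract(overlap);                  // {0x0~0x2000}
    released.subtract(overlap);                   // {0x3000~0x1000}
    // only the disjoint remainders reach fm->allocate()/fm->release()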
10566
10567void BlueStore::_txc_applied_kv(TransContext *txc)
10568{
10569 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
10570 for (auto& o : *ls) {
10571 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
10572 << dendl;
10573 if (--o->flushing_count == 0) {
11fdf7f2 10574 std::lock_guard l(o->flush_lock);
10575 o->flush_cond.notify_all();
10576 }
10577 }
10578 }
10579}
10580
10581void BlueStore::_txc_committed_kv(TransContext *txc)
10582{
10583 dout(20) << __func__ << " txc " << txc << dendl;
1adf2230 10584 {
11fdf7f2 10585 std::lock_guard l(txc->osr->qlock);
1adf2230 10586 txc->state = TransContext::STATE_KV_DONE;
10587 if (txc->ch->commit_queue) {
10588 txc->ch->commit_queue->queue(txc->oncommits);
10589 } else {
10590 finisher.queue(txc->oncommits);
1adf2230 10591 }
7c673cae 10592 }
1adf2230 10593 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
10594 log_latency_fn(
10595 __func__,
10596 l_bluestore_commit_lat,
10597 ceph::make_timespan(ceph_clock_now() - txc->start),
10598 cct->_conf->bluestore_log_op_age,
10599 [&](auto lat) {
10600 return ", txc = " + stringify(txc);
10601 }
11fdf7f2 10602 );
10603}
10604
10605void BlueStore::_txc_finish(TransContext *txc)
10606{
10607 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
11fdf7f2 10608 ceph_assert(txc->state == TransContext::STATE_FINISHING);
10609
10610 for (auto& sb : txc->shared_blobs_written) {
f64942e4 10611 sb->finish_write(txc->seq);
10612 }
10613 txc->shared_blobs_written.clear();
10614
10615 while (!txc->removed_collections.empty()) {
10616 _queue_reap_collection(txc->removed_collections.front());
10617 txc->removed_collections.pop_front();
10618 }
10619
10620 OpSequencerRef osr = txc->osr;
7c673cae 10621 bool empty = false;
31f18b77 10622 bool submit_deferred = false;
10623 OpSequencer::q_list_t releasing_txc;
10624 {
11fdf7f2 10625 std::lock_guard l(osr->qlock);
10626 txc->state = TransContext::STATE_DONE;
10627 bool notify = false;
10628 while (!osr->q.empty()) {
10629 TransContext *txc = &osr->q.front();
10630 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
10631 << dendl;
10632 if (txc->state != TransContext::STATE_DONE) {
10633 if (txc->state == TransContext::STATE_PREPARE &&
10634 deferred_aggressive) {
10635 // for _osr_drain_preceding()
10636 notify = true;
10637 }
31f18b77 10638 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 10639 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
10640 submit_deferred = true;
10641 }
10642 break;
10643 }
10644
10645 osr->q.pop_front();
10646 releasing_txc.push_back(*txc);
10647 notify = true;
10648 }
10649 if (notify) {
10650 osr->qcond.notify_all();
10651 }
10652 if (osr->q.empty()) {
10653 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
10654 empty = true;
10655 }
10656 }
10657 while (!releasing_txc.empty()) {
10658 // release to allocator only after all preceding txc's have also
10659 // finished any deferred writes that potentially land in these
10660 // blocks
10661 auto txc = &releasing_txc.front();
10662 _txc_release_alloc(txc);
10663 releasing_txc.pop_front();
10664 txc->log_state_latency(logger, l_bluestore_state_done_lat);
10665 delete txc;
10666 }
10667
10668 if (submit_deferred) {
10669 // we're pinning memory; flush! we could be more fine-grained here but
10670 // i'm not sure it's worth the bother.
10671 deferred_try_submit();
7c673cae
FG
10672 }
10673
7c673cae 10674 if (empty && osr->zombie) {
11fdf7f2
TL
10675 std::lock_guard l(zombie_osr_lock);
10676 if (zombie_osr_set.erase(osr->cid)) {
10677 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
10678 } else {
10679 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
10680 << dendl;
10681 }
7c673cae 10682 }
11fdf7f2 10683 }
7c673cae
FG
10684
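// Freed extents are returned to the allocator here.  If async discard is
// enabled we try to queue the trim and fall back to a synchronous discard
// loop (or no discard at all) depending on the bdev_* settings.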
void BlueStore::_txc_release_alloc(TransContext *txc)
{
  // it's expected we're called with lazy_release_lock already taken!
  if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
    int r = 0;
    if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
      r = bdev->queue_discard(txc->released);
      if (r == 0) {
        dout(10) << __func__ << "(queued) " << txc << " " << std::hex
                 << txc->released << std::dec << dendl;
        goto out;
      }
    } else if (cct->_conf->bdev_enable_discard) {
      for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
        bdev->discard(p.get_start(), p.get_len());
      }
    }
    dout(10) << __func__ << "(sync) " << txc << " " << std::hex
             << txc->released << std::dec << dendl;
    alloc->release(txc->released);
  }

out:
  txc->allocated.clear();
  txc->released.clear();
}

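// OpSequencer lifecycle: when a collection is removed its osr is not
// destroyed immediately; it is parked in zombie_osr_set (see
// _osr_register_zombie) so that in-flight txcs can still drain.  If the
// collection is recreated before the zombie is reaped, _osr_attach()
// resurrects the same osr, preserving ordering across the remove/create.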
void BlueStore::_osr_attach(Collection *c)
{
  // note: caller has RWLock on coll_map
  auto q = coll_map.find(c->cid);
  if (q != coll_map.end()) {
    c->osr = q->second->osr;
    ldout(cct, 10) << __func__ << " " << c->cid
                   << " reusing osr " << c->osr << " from existing coll "
                   << q->second << dendl;
  } else {
    std::lock_guard l(zombie_osr_lock);
    auto p = zombie_osr_set.find(c->cid);
    if (p == zombie_osr_set.end()) {
      c->osr = new OpSequencer(this, c->cid);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " fresh osr " << c->osr << dendl;
    } else {
      c->osr = p->second;
      zombie_osr_set.erase(p);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " resurrecting zombie osr " << c->osr << dendl;
      c->osr->zombie = false;
    }
  }
}

void BlueStore::_osr_register_zombie(OpSequencer *osr)
{
  std::lock_guard l(zombie_osr_lock);
  dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
  osr->zombie = true;
  auto i = zombie_osr_set.emplace(osr->cid, osr);
  // this is either a new insertion or the same osr is already there
  ceph_assert(i.second || i.first->second == osr);
}

void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}

void BlueStore::_osr_drain(OpSequencer *osr)
{
  dout(10) << __func__ << " " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
  osr->drain();
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}

void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  vector<OpSequencerRef> zombies;
  {
    RWLock::RLocker l(coll_lock);
    for (auto& i : coll_map) {
      s.insert(i.second->osr);
    }
  }
  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& i : zombie_osr_set) {
      s.insert(i.second);
      zombies.push_back(i.second);
    }
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& osr : zombies) {
      if (zombie_osr_set.erase(osr->cid)) {
        dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
        ceph_assert(osr->q.empty());
      } else if (osr->zombie) {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " already reaped" << dendl;
        ceph_assert(osr->q.empty());
      } else {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " resurrected" << dendl;
      }
    }
  }

  dout(10) << __func__ << " done" << dendl;
}


void BlueStore::_kv_start()
{
  dout(10) << __func__ << dendl;

  deferred_finisher.start();
  finisher.start();
  kv_sync_thread.create("bstore_kv_sync");
  kv_finalize_thread.create("bstore_kv_final");
}

void BlueStore::_kv_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l(kv_lock);
    while (!kv_sync_started) {
      kv_cond.wait(l);
    }
    kv_stop = true;
    kv_cond.notify_all();
  }
  {
    std::unique_lock l(kv_finalize_lock);
    while (!kv_finalize_started) {
      kv_finalize_cond.wait(l);
    }
    kv_finalize_stop = true;
    kv_finalize_cond.notify_all();
  }
  kv_sync_thread.join();
  kv_finalize_thread.join();
  ceph_assert(removed_collections.empty());
  {
    std::lock_guard l(kv_lock);
    kv_stop = false;
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_stop = false;
  }
  dout(10) << __func__ << " stopping finishers" << dendl;
  deferred_finisher.wait_for_empty();
  deferred_finisher.stop();
  finisher.wait_for_empty();
  finisher.stop();
  dout(10) << __func__ << " stopped" << dendl;
}

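// The kv pipeline is split across two threads: _kv_sync_thread() batches
// up queued txcs, submits their kv transactions, flushes the block device
// when needed, and does the one synchronous RocksDB commit per cycle;
// completed work is then handed to _kv_finalize_thread() via
// kv_committing_to_finalize / deferred_stable_to_finalize so that per-txc
// completion processing stays off the sync path.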
void BlueStore::_kv_sync_thread()
{
  dout(10) << __func__ << " start" << dendl;
  deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
  std::unique_lock l(kv_lock);
  ceph_assert(!kv_sync_started);
  kv_sync_started = true;
  kv_cond.notify_all();
  while (true) {
    ceph_assert(kv_committing.empty());
    if (kv_queue.empty() &&
        ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
         !deferred_aggressive)) {
      if (kv_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      deque<TransContext*> kv_submitting;
      deque<DeferredBatch*> deferred_done, deferred_stable;
      uint64_t aios = 0, costs = 0;

      dout(20) << __func__ << " committing " << kv_queue.size()
               << " submitting " << kv_queue_unsubmitted.size()
               << " deferred done " << deferred_done_queue.size()
               << " stable " << deferred_stable_queue.size()
               << dendl;
      kv_committing.swap(kv_queue);
      kv_submitting.swap(kv_queue_unsubmitted);
      deferred_done.swap(deferred_done_queue);
      deferred_stable.swap(deferred_stable_queue);
      aios = kv_ios;
      costs = kv_throttle_costs;
      kv_ios = 0;
      kv_throttle_costs = 0;
      l.unlock();

      dout(30) << __func__ << " committing " << kv_committing << dendl;
      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      bool force_flush = false;
      // if bluefs is sharing the same device as data (only), then we
      // can rely on the bluefs commit to flush the device and make
      // deferred aios stable. that means that if we do have done deferred
      // txcs AND we are not on a single device, we need to force a flush.
      if (bluefs_single_shared_device && bluefs) {
        if (aios) {
          force_flush = true;
        } else if (kv_committing.empty() && deferred_stable.empty()) {
          force_flush = true;  // there's nothing else to commit!
        } else if (deferred_aggressive) {
          force_flush = true;
        }
      } else {
        if (aios || !deferred_done.empty()) {
          force_flush = true;
        } else {
          dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
        }
      }

      if (force_flush) {
        dout(20) << __func__ << " num_aios=" << aios
                 << " force_flush=" << (int)force_flush
                 << ", flushing, deferred done->stable" << dendl;
        // flush/barrier on block device
        bdev->flush();

        // if we flush then deferred done are now deferred stable
        deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
                               deferred_done.end());
        deferred_done.clear();
      }
      auto after_flush = mono_clock::now();

      // we will use one final transaction to force a sync
      KeyValueDB::Transaction synct = db->get_transaction();

      // increase {nid,blobid}_max?  note that this covers both the
      // case where we are approaching the max and the case we passed
      // it.  in either case, we increase the max in the earlier txn
      // we submit.
      uint64_t new_nid_max = 0, new_blobid_max = 0;
      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
        bufferlist bl;
        encode(new_nid_max, bl);
        t->set(PREFIX_SUPER, "nid_max", bl);
        dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
      }
      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
        bufferlist bl;
        encode(new_blobid_max, bl);
        t->set(PREFIX_SUPER, "blobid_max", bl);
        dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
      }

      for (auto txc : kv_committing) {
        if (txc->state == TransContext::STATE_KV_QUEUED) {
          txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
          int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
          ceph_assert(r == 0);
          txc->state = TransContext::STATE_KV_SUBMITTED;
          _txc_applied_kv(txc);
          --txc->osr->kv_committing_serially;
          if (txc->osr->kv_submitted_waiters) {
            std::lock_guard l(txc->osr->qlock);
            txc->osr->qcond.notify_all();
          }

        } else {
          ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
          txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
        }
        if (txc->had_ios) {
          --txc->osr->txc_with_unstable_io;
        }
      }

      // release throttle *before* we commit.  this allows new ops
      // to be prepared and enter pipeline while we are waiting on
      // the kv commit sync/flush.  then hopefully on the next
      // iteration there will already be ops awake.  otherwise, we
      // end up going to sleep, and then wake up when the very first
      // transaction is ready for commit.
      throttle_bytes.put(costs);

      if (bluefs &&
          after_flush - bluefs_last_balance >
          ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
        bluefs_last_balance = after_flush;
        int r = _balance_bluefs_freespace();
        ceph_assert(r >= 0);
      }

      // cleanup sync deferred keys
      for (auto b : deferred_stable) {
        for (auto& txc : b->txcs) {
          bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
          ceph_assert(wt.released.empty()); // only kraken did this
          string key;
          get_deferred_key(wt.seq, &key);
          synct->rm_single_key(PREFIX_DEFERRED, key);
        }
      }

      // submit synct synchronously (block and wait for it to commit)
      int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
      ceph_assert(r == 0);

      {
        std::unique_lock m(kv_finalize_lock);
        if (kv_committing_to_finalize.empty()) {
          kv_committing_to_finalize.swap(kv_committing);
        } else {
          kv_committing_to_finalize.insert(
              kv_committing_to_finalize.end(),
              kv_committing.begin(),
              kv_committing.end());
          kv_committing.clear();
        }
        if (deferred_stable_to_finalize.empty()) {
          deferred_stable_to_finalize.swap(deferred_stable);
        } else {
          deferred_stable_to_finalize.insert(
              deferred_stable_to_finalize.end(),
              deferred_stable.begin(),
              deferred_stable.end());
          deferred_stable.clear();
        }
        kv_finalize_cond.notify_one();
      }

      if (new_nid_max) {
        nid_max = new_nid_max;
        dout(10) << __func__ << " nid_max now " << nid_max << dendl;
      }
      if (new_blobid_max) {
        blobid_max = new_blobid_max;
        dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
      }

      {
        auto finish = mono_clock::now();
        ceph::timespan dur_flush = after_flush - start;
        ceph::timespan dur_kv = finish - after_flush;
        ceph::timespan dur = finish - start;
        dout(20) << __func__ << " committed " << kv_committing.size()
                 << " cleaned " << deferred_stable.size()
                 << " in " << dur
                 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
                 << dendl;
        log_latency("kv_flush",
          l_bluestore_kv_flush_lat,
          dur_flush,
          cct->_conf->bluestore_log_op_age);
        log_latency("kv_commit",
          l_bluestore_kv_commit_lat,
          dur_kv,
          cct->_conf->bluestore_log_op_age);
        log_latency("kv_sync",
          l_bluestore_kv_sync_lat,
          dur,
          cct->_conf->bluestore_log_op_age);
      }

      if (bluefs) {
        if (!bluefs_extents_reclaiming.empty()) {
          dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
                  << bluefs_extents_reclaiming << std::dec << dendl;
          int r = 0;
          if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
            r = bdev->queue_discard(bluefs_extents_reclaiming);
            if (r == 0) {
              goto clear;
            }
          } else if (cct->_conf->bdev_enable_discard) {
            for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
              bdev->discard(p.get_start(), p.get_len());
            }
          }

          alloc->release(bluefs_extents_reclaiming);
clear:
          bluefs_extents_reclaiming.clear();
        }
      }

      l.lock();
      // previously deferred "done" are now "stable" by virtue of this
      // commit cycle.
      deferred_stable_queue.swap(deferred_done);
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_sync_started = false;
}

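// _kv_finalize_thread() drives each committed txc through the remainder
// of its state machine (_txc_state_proc), retires deferred batches that
// are now stable on disk, opportunistically submits more deferred io, and
// reaps removed collections.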
void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l(kv_finalize_lock);
  ceph_assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    ceph_assert(kv_committed.empty());
    ceph_assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
        deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      while (!kv_committed.empty()) {
        TransContext *txc = kv_committed.front();
        ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
        _txc_state_proc(txc);
        kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
        auto p = b->txcs.begin();
        while (p != b->txcs.end()) {
          TransContext *txc = &*p;
          p = b->txcs.erase(p); // unlink here because
          _txc_state_proc(txc); // this may destroy txc
        }
        delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle_deferred_bytes.past_midpoint()) {
          deferred_try_submit();
        }
      }

      // this is as good a place as any ...
      _reap_collections();

      logger->set(l_bluestore_fragmentation,
          (uint64_t)(alloc->get_fragmentation(min_alloc_size) * 1000));

      log_latency("kv_final",
        l_bluestore_kv_final_lat,
        mono_clock::now() - start,
        cct->_conf->bluestore_log_op_age);

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}

bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc, OnodeRef o)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  return &txc->deferred_txn->ops.back();
}

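// Deferred writes are batched per OpSequencer: _deferred_queue() appends
// the txc to the osr's pending DeferredBatch and registers each write
// extent with prepare_write(), which indexes the data by physical offset
// so that _deferred_submit_unlock() can later merge adjacent extents into
// larger ios.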
void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
  deferred_lock.lock();
  if (!txc->osr->deferred_pending &&
      !txc->osr->deferred_running) {
    deferred_queue.push_back(*txc->osr);
  }
  if (!txc->osr->deferred_pending) {
    txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
  }
  ++deferred_queue_size;
  txc->osr->deferred_pending->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      txc->osr->deferred_pending->prepare_write(
        cct, wt.seq, e.offset, e.length, p);
    }
  }
  if (deferred_aggressive &&
      !txc->osr->deferred_running) {
    _deferred_submit_unlock(txc->osr.get());
  } else {
    deferred_lock.unlock();
  }
}

void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
  std::lock_guard l(deferred_lock);
  vector<OpSequencerRef> osrs;
  osrs.reserve(deferred_queue.size());
  for (auto& osr : deferred_queue) {
    osrs.push_back(&osr);
  }
  for (auto& osr : osrs) {
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
        _deferred_submit_unlock(osr.get());
        deferred_lock.lock();
      } else {
        dout(20) << __func__ << " osr " << osr << " already has running"
                 << dendl;
      }
    } else {
      dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
    }
  }
}

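// The submit path walks the batch's offset-sorted iomap and coalesces
// physically contiguous entries into a single aio.  For example (made-up
// offsets): entries at 0x1000~0x1000 and 0x2000~0x1000 become one
// aio_write(0x1000, 8KiB), while a gap before an entry at 0x8000 starts a
// new io.  Fewer, larger ios is the point of batching deferred writes.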
void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
  ceph_assert(osr->deferred_pending);
  ceph_assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  ceph_assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  deferred_lock.unlock();

  for (auto& txc : b->txcs) {
    txc.log_state_latency(logger, l_bluestore_state_deferred_queued_lat);
  }
  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
        dout(20) << __func__ << " write 0x" << std::hex
                 << start << "~" << bl.length()
                 << " crc " << bl.crc32c(-1) << std::dec << dendl;
        if (!g_conf()->bluestore_debug_omit_block_device_write) {
          logger->inc(l_bluestore_deferred_write_ops);
          logger->inc(l_bluestore_deferred_write_bytes, bl.length());
          int r = bdev->aio_write(start, bl, &b->ioc, false);
          ceph_assert(r == 0);
        }
      }
      if (i == b->iomap.end()) {
        break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << " seq " << i->second.seq << " 0x"
             << std::hex << pos << "~" << i->second.bl.length() << std::dec
             << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  bdev->aio_submit(&b->ioc);
}

struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};

void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  ceph_assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    std::lock_guard l(deferred_lock);
    ceph_assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      auto q = deferred_queue.iterator_to(*osr);
      deferred_queue.erase(q);
    } else if (deferred_aggressive) {
      dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
      deferred_finisher.queue(new C_DeferredTrySubmit(this));
    } else {
      dout(20) << __func__ << " leaving queued, more pending" << dendl;
    }
  }

  {
    uint64_t costs = 0;
    {
      std::lock_guard l2(osr->qlock);
      for (auto& i : b->txcs) {
        TransContext *txc = &i;
        txc->log_state_latency(logger, l_bluestore_state_deferred_aio_wait_lat);
        txc->state = TransContext::STATE_DEFERRED_CLEANUP;
        costs += txc->cost;
      }
    }
    throttle_deferred_bytes.put(costs);
    std::lock_guard l(kv_lock);
    deferred_done_queue.emplace_back(b);
  }

  // in the normal case, do not bother waking up the kv thread; it will
  // catch us on the next commit anyway.
  if (deferred_aggressive) {
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
}

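// On mount, any deferred writes that were journaled under PREFIX_DEFERRED
// but never retired are replayed.  Each decoded record becomes a txc that
// starts directly in STATE_KV_DONE, so the state machine proceeds straight
// to (re)issuing the deferred io rather than re-submitting kv changes
// (which are already durable).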
int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  int count = 0;
  int r = 0;
  CollectionRef ch = _get_collection(coll_t::meta());
  bool fake_ch = false;
  if (!ch) {
    // hmm, replaying initial mkfs?
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
           << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(ch.get(), osr, nullptr);
    txc->deferred_txn = deferred_txn;
    txc->state = TransContext::STATE_KV_DONE;
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}

// ---------------------------
// transactions

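// queue_transactions() is the entry point for all writes: it builds a
// TransContext for the batch, applies each Transaction into it
// (_txc_add_transaction), persists new/dirty onodes, journals any deferred
// ops, takes the byte throttles, and then kicks the txc state machine.
// Because BlueStore is immediately readable, on_applied callbacks fire
// right away rather than after commit.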
int BlueStore::queue_transactions(
  CollectionHandle& ch,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  FUNCTRACE(cct);
  list<Context *> on_applied, on_commit, on_applied_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &on_applied, &on_commit, &on_applied_sync);

  auto start = mono_clock::now();

  Collection *c = static_cast<Collection*>(ch.get());
  OpSequencer *osr = c->osr.get();
  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;

  // prepare
  TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
                                  &on_commit);

  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);
  if (handle)
    handle->suspend_tp_timeout();

  auto tstart = mono_clock::now();
  throttle_bytes.get(txc->cost);
  if (txc->deferred_txn) {
    // ensure we do not block here because of deferred writes
    if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
      dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
               << dendl;
      ++deferred_aggressive;
      deferred_try_submit();
      {
        // wake up any previously finished deferred events
        std::lock_guard l(kv_lock);
        kv_cond.notify_one();
      }
      throttle_deferred_bytes.get(txc->cost);
      --deferred_aggressive;
    }
  }
  auto tend = mono_clock::now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  // we're immediately readable (unlike FileStore)
  for (auto c : on_applied_sync) {
    c->complete(0);
  }
  if (!on_applied.empty()) {
    if (c->commit_queue) {
      c->commit_queue->queue(on_applied);
    } else {
      finisher.queue(on_applied);
    }
  }

  log_latency("submit_transact",
    l_bluestore_submit_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  log_latency("throttle_transact",
    l_bluestore_throttle_lat,
    tend - tstart,
    cct->_conf->bluestore_log_op_age);
  return 0;
}

void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}

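// _txc_add_transaction() decodes the encoded ops one at a time and
// dispatches them: collection ops first (create/remove/split/merge), then
// object ops under the collection lock.  Errors fall through to a single
// policy block at the bottom: some ENOENT/ENODATA cases are tolerated,
// anything else (notably ENOSPC) aborts rather than risk a partially
// applied transaction.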
void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction<30>(cct, t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }

  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;

    // collection operations
    CollectionRef &c = cvec[op->cid];

    // initialize osd_pool_id and do a smoke test that all collections belong
    // to the same pool
    spg_t pgid;
    if (!!c ? c->cid.is_pg(&pgid) : false) {
      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
                  txc->osd_pool_id == pgid.pool());
      txc->osd_pool_id = pgid.pool();
    }

    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
        const coll_t &cid = i.get_cid(op->cid);
        r = _remove_collection(txc, cid, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
        ceph_assert(!c);
        const coll_t &cid = i.get_cid(op->cid);
        r = _create_collection(txc, cid, op->split_bits, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
        uint32_t bits = op->split_bits;
        uint32_t rem = op->split_rem;
        r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MERGE_COLLECTION:
      {
        uint32_t bits = op->split_bits;
        r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
        uint32_t type = op->hint_type;
        bufferlist hint;
        i.decode_bl(hint);
        auto hiter = hint.cbegin();
        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
          uint32_t pg_num;
          uint64_t num_objs;
          decode(pg_num, hiter);
          decode(num_objs, hiter);
          dout(10) << __func__ << " collection hint objects is a no-op, "
                   << " pg_num " << pg_num << " num_objects " << num_objs
                   << dendl;
        } else {
          // Ignore the hint
          dout(10) << __func__ << " unknown collection hint " << type << dendl;
        }
        continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      ceph_abort_msg("not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
           << " not handled on operation " << op->op
           << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction<0>(cct, t);
      ceph_abort_msg("unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
        op->op == Transaction::OP_WRITE ||
        op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    RWLock::WLocker l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
               << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        uint32_t fadvise_flags = i.get_fadvise_flags();
        bufferlist bl;
        i.decode_bl(bl);
        r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
        // deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
        uint64_t off = op->off;
        r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
        r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
        string name = i.decode_string();
        bufferptr bp;
        i.decode_bp(bp);
        r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
        map<string, bufferptr> aset;
        i.decode_attrset(aset);
        r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
        string name = i.decode_string();
        r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
        r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        uint64_t srcoff = op->off;
        uint64_t len = op->len;
        uint64_t dstoff = op->dest_off;
        r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
        ceph_assert(op->cid == op->dest_cid);
        const ghobject_t& noid = i.get_oid(op->dest_oid);
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          no = c->get_onode(noid, false);
        }
        r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
        r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
        bufferlist aset_bl;
        i.decode_attrset_bl(&aset_bl);
        r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
        bufferlist keys_bl;
        i.decode_keyset_bl(&keys_bl);
        r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
        string first, last;
        first = i.decode_string();
        last = i.decode_string();
        r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
        bufferlist bl;
        i.decode_bl(bl);
        r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
        r = _set_alloc_hint(txc, c, o,
                            op->expected_object_size,
                            op->expected_write_size,
                            op->alloc_hint_flags);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
                            op->op == Transaction::OP_CLONE ||
                            op->op == Transaction::OP_CLONERANGE2 ||
                            op->op == Transaction::OP_COLL_ADD ||
                            op->op == Transaction::OP_SETATTR ||
                            op->op == Transaction::OP_SETATTRS ||
                            op->op == Transaction::OP_RMATTR ||
                            op->op == Transaction::OP_OMAP_SETKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYRANGE ||
                            op->op == Transaction::OP_OMAP_SETHEADER))
        // -ENOENT is usually okay
        ok = true;
      if (r == -ENODATA)
        ok = true;

      if (!ok) {
        const char *msg = "unexpected error code";

        if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
                             op->op == Transaction::OP_CLONE ||
                             op->op == Transaction::OP_CLONERANGE2))
          msg = "ENOENT on clone suggests osd bug";

        if (r == -ENOSPC)
          // For now, if we hit _any_ ENOSPC, crash, before we do any damage
          // by partially applying transactions.
          msg = "ENOSPC from bluestore, misconfigured cluster";

        if (r == -ENOTEMPTY) {
          msg = "ENOTEMPTY suggests garbage data in osd data dir";
        }

        derr << __func__ << " error " << cpp_strerror(r)
             << " not handled on operation " << op->op
             << " (op " << pos << ", counting from 0)"
             << dendl;
        derr << msg << dendl;
        _dump_transaction<0>(cct, t);
        ceph_abort_msg("unexpected error");
      }
    }
  }
}


// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

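// _pad_zeros() expands a write to chunk_size alignment by zero-filling.
// Worked example (hypothetical numbers): chunk_size=0x1000, *offset=0x1232,
// length=0x20.  front_pad = 0x1232 % 0x1000 = 0x232, and since the data
// plus front pad (0x252 bytes) is still short of a chunk, back_pad =
// 0x1000 - 0x252 = 0xdae; the result is a single zero-padded chunk at
// *offset=0x1000 of length 0x1000.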
void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
           << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
    bufferptr z = buffer::create_small_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->copy(0, front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    ceph_assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    ceph_assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->copy(length - back_copy, back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
           << back_pad << " on front/back, now 0x" << *offset << "~"
           << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  ceph_assert(bl->length() == length);
}

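// _do_write_small() handles writes smaller than min_alloc_size.  In order
// of preference it will: (1) write directly (or via the deferred queue)
// into unused, already-allocated space of a nearby mutable blob; (2) do a
// deferred read-modify-write of whole chunks of such a blob; (3) reuse a
// nearby blob for a freshly allocated extent; or (4) fall through to a
// brand-new blob.  If too many blobs are touched along the way, the
// affected range is queued for garbage collection.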
11997void BlueStore::_do_write_small(
11998 TransContext *txc,
11999 CollectionRef &c,
12000 OnodeRef o,
12001 uint64_t offset, uint64_t length,
12002 bufferlist::iterator& blp,
12003 WriteContext *wctx)
12004{
12005 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
12006 << std::dec << dendl;
11fdf7f2 12007 ceph_assert(length < min_alloc_size);
7c673cae
FG
12008 uint64_t end_offs = offset + length;
12009
12010 logger->inc(l_bluestore_write_small);
12011 logger->inc(l_bluestore_write_small_bytes, length);
12012
12013 bufferlist bl;
12014 blp.copy(length, bl);
12015
81eedcae
TL
12016 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
12017 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
12018 uint32_t alloc_len = min_alloc_size;
12019 auto offset0 = p2align<uint64_t>(offset, alloc_len);
12020
12021 bool any_change;
12022
12023 // search suitable extent in both forward and reverse direction in
12024 // [offset - target_max_blob_size, offset + target_max_blob_size] range
12025 // then check if blob can be reused via can_reuse_blob func or apply
12026 // direct/deferred write (the latter for extents including or higher
12027 // than 'offset' only).
12028 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
12029
7c673cae
FG
12030 // Look for an existing mutable blob we can use.
12031 auto begin = o->extent_map.extent_map.begin();
12032 auto end = o->extent_map.extent_map.end();
12033 auto ep = o->extent_map.seek_lextent(offset);
12034 if (ep != begin) {
12035 --ep;
12036 if (ep->blob_end() <= offset) {
12037 ++ep;
12038 }
12039 }
12040 auto prev_ep = ep;
12041 if (prev_ep != begin) {
12042 --prev_ep;
12043 } else {
12044 prev_ep = end; // to avoid this extent check as it's a duplicate
12045 }
12046
eafe8130
TL
12047 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
12048 // We don't want to have more blobs than min alloc units fit
12049 // into 2 max blobs
12050 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
12051 bool above_blob_threshold = false;
12052
12053 inspected_blobs.reserve(blob_threshold);
12054
12055 uint64_t max_off = 0;
12056 auto start_ep = ep;
12057 auto end_ep = ep; // exclusively
7c673cae
FG
12058 do {
12059 any_change = false;
12060
12061 if (ep != end && ep->logical_offset < offset + max_bsize) {
12062 BlobRef b = ep->blob;
eafe8130
TL
12063 if (!above_blob_threshold) {
12064 inspected_blobs.insert(&b->get_blob());
12065 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
12066 }
12067 max_off = ep->logical_end();
7c673cae 12068 auto bstart = ep->blob_start();
eafe8130 12069
7c673cae
FG
12070 dout(20) << __func__ << " considering " << *b
12071 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
12072 if (bstart >= end_offs) {
12073 dout(20) << __func__ << " ignoring distant " << *b << dendl;
12074 } else if (!b->get_blob().is_mutable()) {
12075 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
12076 } else if (ep->logical_offset % min_alloc_size !=
12077 ep->blob_offset % min_alloc_size) {
12078 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
12079 } else {
12080 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
12081 // can we pad our head/tail out with zeros?
12082 uint64_t head_pad, tail_pad;
11fdf7f2
TL
12083 head_pad = p2phase(offset, chunk_size);
12084 tail_pad = p2nphase(end_offs, chunk_size);
7c673cae
FG
12085 if (head_pad || tail_pad) {
12086 o->extent_map.fault_range(db, offset - head_pad,
12087 end_offs - offset + head_pad + tail_pad);
12088 }
12089 if (head_pad &&
12090 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
12091 head_pad = 0;
12092 }
12093 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
12094 tail_pad = 0;
12095 }
12096
12097 uint64_t b_off = offset - head_pad - bstart;
12098 uint64_t b_len = length + head_pad + tail_pad;
12099
12100 // direct write into unused blocks of an existing mutable blob?
12101 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
12102 b->get_blob().get_ondisk_length() >= b_off + b_len &&
12103 b->get_blob().is_unused(b_off, b_len) &&
12104 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 12105 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
12106
12107 dout(20) << __func__ << " write to unused 0x" << std::hex
12108 << b_off << "~" << b_len
12109 << " pad 0x" << head_pad << " + 0x" << tail_pad
12110 << std::dec << " of mutable " << *b << dendl;
224ce89b 12111 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
12112 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
12113
11fdf7f2 12114 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
12115 if (b_len <= prefer_deferred_size) {
12116 dout(20) << __func__ << " deferring small 0x" << std::hex
12117 << b_len << std::dec << " unused write via deferred" << dendl;
12118 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
12119 op->op = bluestore_deferred_op_t::OP_WRITE;
12120 b->get_blob().map(
12121 b_off, b_len,
12122 [&](uint64_t offset, uint64_t length) {
12123 op->extents.emplace_back(bluestore_pextent_t(offset, length));
12124 return 0;
12125 });
224ce89b 12126 op->data = bl;
7c673cae
FG
12127 } else {
12128 b->get_blob().map_bl(
224ce89b 12129 b_off, bl,
7c673cae
FG
12130 [&](uint64_t offset, bufferlist& t) {
12131 bdev->aio_write(offset, t,
12132 &txc->ioc, wctx->buffered);
12133 });
12134 }
12135 }
224ce89b 12136 b->dirty_blob().calc_csum(b_off, bl);
7c673cae
FG
12137 dout(20) << __func__ << " lex old " << *ep << dendl;
12138 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
12139 b,
12140 &wctx->old_extents);
12141 b->dirty_blob().mark_used(le->blob_offset, le->length);
12142 txc->statfs_delta.stored() += le->length;
12143 dout(20) << __func__ << " lex " << *le << dendl;
12144 logger->inc(l_bluestore_write_small_unused);
12145 return;
12146 }
12147 // read some data to fill out the chunk?
11fdf7f2
TL
12148 uint64_t head_read = p2phase(b_off, chunk_size);
12149 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
7c673cae
FG
12150 if ((head_read || tail_read) &&
12151 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
12152 head_read + tail_read < min_alloc_size) {
12153 b_off -= head_read;
12154 b_len += head_read + tail_read;
12155
12156 } else {
12157 head_read = tail_read = 0;
12158 }
12159
12160 // chunk-aligned deferred overwrite?
12161 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
12162 b_off % chunk_size == 0 &&
12163 b_len % chunk_size == 0 &&
12164 b->get_blob().is_allocated(b_off, b_len)) {
12165
224ce89b 12166 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
12167
12168 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
12169 << " and tail 0x" << tail_read << std::dec << dendl;
12170 if (head_read) {
12171 bufferlist head_bl;
12172 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
12173 head_bl, 0);
11fdf7f2 12174 ceph_assert(r >= 0 && r <= (int)head_read);
7c673cae
FG
12175 size_t zlen = head_read - r;
12176 if (zlen) {
12177 head_bl.append_zero(zlen);
12178 logger->inc(l_bluestore_write_pad_bytes, zlen);
12179 }
11fdf7f2
TL
12180 head_bl.claim_append(bl);
12181 bl.swap(head_bl);
7c673cae
FG
12182 logger->inc(l_bluestore_write_penalty_read_ops);
12183 }
12184 if (tail_read) {
12185 bufferlist tail_bl;
12186 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
12187 tail_bl, 0);
11fdf7f2 12188 ceph_assert(r >= 0 && r <= (int)tail_read);
7c673cae
FG
12189 size_t zlen = tail_read - r;
12190 if (zlen) {
12191 tail_bl.append_zero(zlen);
12192 logger->inc(l_bluestore_write_pad_bytes, zlen);
12193 }
224ce89b 12194 bl.claim_append(tail_bl);
7c673cae
FG
12195 logger->inc(l_bluestore_write_penalty_read_ops);
12196 }
12197 logger->inc(l_bluestore_write_small_pre_read);
12198
224ce89b 12199 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
12200 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
12201
7c673cae 12202 if (b->get_blob().csum_type) {
224ce89b 12203 b->dirty_blob().calc_csum(b_off, bl);
7c673cae 12204 }
11fdf7f2
TL
12205
12206 if (!g_conf()->bluestore_debug_omit_block_device_write) {
12207 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
12208 op->op = bluestore_deferred_op_t::OP_WRITE;
12209 int r = b->get_blob().map(
12210 b_off, b_len,
12211 [&](uint64_t offset, uint64_t length) {
12212 op->extents.emplace_back(bluestore_pextent_t(offset, length));
12213 return 0;
12214 });
12215 ceph_assert(r == 0);
12216 op->data.claim(bl);
12217 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
12218 << b_len << std::dec << " of mutable " << *b
12219 << " at " << op->extents << dendl;
12220 }
12221
7c673cae
FG
12222 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
12223 b, &wctx->old_extents);
12224 b->dirty_blob().mark_used(le->blob_offset, le->length);
12225 txc->statfs_delta.stored() += le->length;
12226 dout(20) << __func__ << " lex " << *le << dendl;
12227 logger->inc(l_bluestore_write_small_deferred);
12228 return;
12229 }
224ce89b
WB
12230 // try to reuse blob if we can
12231 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
12232 max_bsize,
12233 offset0 - bstart,
12234 &alloc_len)) {
11fdf7f2 12235 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
12236 // fit into reused blob
12237 // Need to check for pending writes desiring to
12238 // reuse the same pextent. The rationale is that during GC two chunks
12239 // from garbage blobs(compressed?) can share logical space within the same
12240 // AU. That's in turn might be caused by unaligned len in clone_range2.
12241 // Hence the second write will fail in an attempt to reuse blob at
12242 // do_alloc_write().
12243 if (!wctx->has_conflict(b,
12244 offset0,
12245 offset0 + alloc_len,
12246 min_alloc_size)) {
12247
12248 // we can't reuse pad_head/pad_tail since they might be truncated
12249 // due to existent extents
12250 uint64_t b_off = offset - bstart;
12251 uint64_t b_off0 = b_off;
12252 _pad_zeros(&bl, &b_off0, chunk_size);
12253
12254 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
12255 << " (0x" << b_off0 << "~" << bl.length() << ")"
12256 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
12257 << std::dec << dendl;
12258
12259 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
12260 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
12261 false, false);
12262 logger->inc(l_bluestore_write_small_unused);
12263 return;
12264 }
12265 }
12266 }
12267 ++ep;
eafe8130 12268 end_ep = ep;
7c673cae
FG
12269 any_change = true;
12270 } // if (ep != end && ep->logical_offset < offset + max_bsize)
12271
12272 // check extent for reuse in reverse order
12273 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
12274 BlobRef b = prev_ep->blob;
eafe8130
TL
12275 if (!above_blob_threshold) {
12276 inspected_blobs.insert(&b->get_blob());
12277 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
12278 }
12279 start_ep = prev_ep;
7c673cae
FG
12280 auto bstart = prev_ep->blob_start();
12281 dout(20) << __func__ << " considering " << *b
12282 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 12283 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
12284 max_bsize,
12285 offset0 - bstart,
12286 &alloc_len)) {
11fdf7f2 12287 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
12288 // fit into reused blob
12289 // Need to check for pending writes desiring to
12290 // reuse the same pextent. The rationale is that during GC two chunks
12291 // from garbage blobs(compressed?) can share logical space within the same
12292 // AU. That's in turn might be caused by unaligned len in clone_range2.
12293 // Hence the second write will fail in an attempt to reuse blob at
12294 // do_alloc_write().
12295 if (!wctx->has_conflict(b,
12296 offset0,
12297 offset0 + alloc_len,
12298 min_alloc_size)) {
12299
12300 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
12301 uint64_t b_off = offset - bstart;
12302 uint64_t b_off0 = b_off;
12303 _pad_zeros(&bl, &b_off0, chunk_size);
12304
12305 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
12306 << " (0x" << b_off0 << "~" << bl.length() << ")"
12307 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
12308 << std::dec << dendl;
12309
12310 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
12311 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
12312 false, false);
12313 logger->inc(l_bluestore_write_small_unused);
12314 return;
12315 }
12316 }
12317 if (prev_ep != begin) {
12318 --prev_ep;
12319 any_change = true;
12320 } else {
12321 prev_ep = end; // to avoid useless first extent re-check
12322 }
12323 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
12324 } while (any_change);
12325
eafe8130
TL
12326 if (above_blob_threshold) {
12327 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
12328 << " " << std::hex << min_off << "~" << max_off << std::dec
12329 << dendl;
12330 ceph_assert(start_ep != end_ep);
12331 for (auto ep = start_ep; ep != end_ep; ++ep) {
12332 dout(20) << __func__ << " inserting for GC "
12333 << std::hex << ep->logical_offset << "~" << ep->length
12334 << std::dec << dendl;
12335
12336 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
12337 }
12338 // insert the newly written extent into the GC set
12339 wctx->extents_to_gc.union_insert(offset, length);
12340 dout(20) << __func__ << " inserting (last) for GC "
12341 << std::hex << offset << "~" << length
12342 << std::dec << dendl;
12343 }
7c673cae 12344 // new blob.
7c673cae 12345 BlobRef b = c->new_blob();
11fdf7f2 12346 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
12347 uint64_t b_off0 = b_off;
12348 _pad_zeros(&bl, &b_off0, block_size);
12349 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
12350 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
12351
12352 return;
12353}
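// Worked example for the new-blob path above (illustrative values only,
// assuming alloc_len == min_alloc_size == 0x10000 and block_size == 0x1000):
//
//   uint64_t offset = 0x12345;
//   uint64_t b_off  = p2phase<uint64_t>(offset, 0x10000);  // == 0x2345
//   // _pad_zeros() then extends the buffer so that both the start offset
//   // and the length land on block_size boundaries, keeping the resulting
//   // device I/O block-aligned within the fresh blob.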
12354
12355void BlueStore::_do_write_big(
12356 TransContext *txc,
12357 CollectionRef &c,
12358 OnodeRef o,
12359 uint64_t offset, uint64_t length,
12360 bufferlist::iterator& blp,
12361 WriteContext *wctx)
12362{
12363 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
12364 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
12365 << " compress " << (int)wctx->compress
12366 << dendl;
12367 logger->inc(l_bluestore_write_big);
12368 logger->inc(l_bluestore_write_big_bytes, length);
12369 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11fdf7f2 12370 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
12371 while (length > 0) {
12372 bool new_blob = false;
11fdf7f2 12373 uint32_t l = std::min(max_bsize, length);
12374 BlobRef b;
12375 uint32_t b_off = 0;
12376
12377 // attempt to reuse an existing blob
12378 if (!wctx->compress) {
12379 // look for an existing mutable blob we can reuse
12380 auto begin = o->extent_map.extent_map.begin();
12381 auto end = o->extent_map.extent_map.end();
12382 auto ep = o->extent_map.seek_lextent(offset);
12383 auto prev_ep = ep;
12384 if (prev_ep != begin) {
12385 --prev_ep;
12386 } else {
12387 prev_ep = end; // to avoid this extent check as it's a duplicate
12388 }
12389 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
12390 // search for a suitable extent in both the forward and reverse directions
12391 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
224ce89b 12392 // range, then check whether the blob can be reused via can_reuse_blob().
12393 bool any_change;
12394 do {
12395 any_change = false;
12396 if (ep != end && ep->logical_offset < offset + max_bsize) {
12397 if (offset >= ep->blob_start() &&
224ce89b 12398 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
12399 offset - ep->blob_start(),
12400 &l)) {
12401 b = ep->blob;
12402 b_off = offset - ep->blob_start();
12403 prev_ep = end; // to avoid check below
12404 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 12405 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
12406 } else {
12407 ++ep;
12408 any_change = true;
12409 }
12410 }
12411
12412 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
224ce89b 12413 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
12414 offset - prev_ep->blob_start(),
12415 &l)) {
12416 b = prev_ep->blob;
12417 b_off = offset - prev_ep->blob_start();
12418 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 12419 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
12420 } else if (prev_ep != begin) {
12421 --prev_ep;
12422 any_change = true;
12423 } else {
12424 prev_ep = end; // to avoid useless first extent re-check
12425 }
12426 }
12427 } while (b == nullptr && any_change);
12428 }
12429 if (b == nullptr) {
12430 b = c->new_blob();
12431 b_off = 0;
12432 new_blob = true;
12433 }
12434
12435 bufferlist t;
12436 blp.copy(l, t);
12437 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
12438 offset += l;
12439 length -= l;
12440 logger->inc(l_bluestore_write_big_blobs);
12441 }
12442}
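// Illustrative walk-through of the loop above, assuming max_bsize == 0x10000:
// a 0x28000-byte big write is carved into chunks of l = min(max_bsize, length),
// i.e. 0x10000 + 0x10000 + 0x8000, each copied out of blp via blp.copy(l, t)
// and queued with wctx->write(); reuse of a neighbouring mutable blob is
// attempted first unless compression is enabled.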
12443
12444int BlueStore::_do_alloc_write(
12445 TransContext *txc,
12446 CollectionRef coll,
12447 OnodeRef o,
12448 WriteContext *wctx)
12449{
12450 dout(20) << __func__ << " txc " << txc
12451 << " " << wctx->writes.size() << " blobs"
12452 << dendl;
12453 if (wctx->writes.empty()) {
12454 return 0;
12455 }
12456
12457 CompressorRef c;
12458 double crr = 0;
12459 if (wctx->compress) {
12460 c = select_option(
12461 "compression_algorithm",
12462 compressor,
12463 [&]() {
12464 string val;
12465 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
12466 CompressorRef cp = compressor;
12467 if (!cp || cp->get_type_name() != val) {
12468 cp = Compressor::create(cct, val);
12469 if (!cp) {
12470 if (_set_compression_alert(false, val.c_str())) {
12471 derr << __func__ << " unable to initialize " << val.c_str()
12472 << " compressor" << dendl;
12473 }
12474 }
12475 }
12476 return boost::optional<CompressorRef>(cp);
12477 }
12478 return boost::optional<CompressorRef>();
12479 }
12480 );
12481
12482 crr = select_option(
12483 "compression_required_ratio",
12484 cct->_conf->bluestore_compression_required_ratio,
12485 [&]() {
12486 double val;
3efd9988 12487 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
12488 return boost::optional<double>(val);
12489 }
12490 return boost::optional<double>();
12491 }
12492 );
12493 }
12494
12495 // checksum
11fdf7f2 12496 int64_t csum = csum_type.load();
12497 csum = select_option(
12498 "csum_type",
12499 csum,
12500 [&]() {
11fdf7f2 12501 int64_t val;
3efd9988 12502 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 12503 return boost::optional<int64_t>(val);
7c673cae 12504 }
11fdf7f2 12505 return boost::optional<int64_t>();
12506 }
12507 );
12508
12509 // compress (as needed) and calc needed space
12510 uint64_t need = 0;
11fdf7f2 12511 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 12512 for (auto& wi : wctx->writes) {
3efd9988 12513 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 12514 auto start = mono_clock::now();
12515
12516 // compress
12517 ceph_assert(wi.b_off == 0);
12518 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 12519
12520 // FIXME: memory alignment here is bad
12521 bufferlist t;
3efd9988 12522 int r = c->compress(wi.bl, t);
3efd9988 12523 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 12524 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
12525 bool rejected = false;
12526 uint64_t compressed_len = t.length();
12527 // do an approximate (fast) estimation for resulting blob size
12528 // that doesn't take header overhead into account
11fdf7f2 12529 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
12530 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
12531 bluestore_compression_header_t chdr;
12532 chdr.type = c->get_type();
12533 chdr.length = t.length();
12534 encode(chdr, wi.compressed_bl);
12535 wi.compressed_bl.claim_append(t);
12536
12537 compressed_len = wi.compressed_bl.length();
11fdf7f2 12538 result_len = p2roundup(compressed_len, min_alloc_size);
12539 if (result_len <= want_len && result_len < wi.blob_length) {
12540 // Cool. We compressed at least as much as we were hoping to.
12541 // pad out to min_alloc_size
12542 wi.compressed_bl.append_zero(result_len - compressed_len);
12543 wi.compressed_len = compressed_len;
12544 wi.compressed = true;
12545 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
12546 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
12547 << " -> 0x" << compressed_len << " => 0x" << result_len
12548 << " with " << c->get_type()
12549 << std::dec << dendl;
12550 txc->statfs_delta.compressed() += compressed_len;
12551 txc->statfs_delta.compressed_original() += wi.blob_length;
12552 txc->statfs_delta.compressed_allocated() += result_len;
12553 logger->inc(l_bluestore_compress_success_count);
12554 need += result_len;
12555 } else {
12556 rejected = true;
12557 }
12558 } else if (r != 0) {
12559 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
12560 << " bytes compressed using " << c->get_type_name()
12561 << std::dec
12562 << " failed with errcode = " << r
12563 << ", leaving uncompressed"
12564 << dendl;
12565 logger->inc(l_bluestore_compress_rejected_count);
12566 need += wi.blob_length;
7c673cae 12567 } else {
12568 rejected = true;
12569 }
12570
12571 if (rejected) {
3efd9988 12572 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 12573 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
12574 << " with " << c->get_type()
12575 << ", which is more than required 0x" << want_len_raw
7c673cae 12576 << " -> 0x" << want_len
12577 << ", leaving uncompressed"
12578 << std::dec << dendl;
12579 logger->inc(l_bluestore_compress_rejected_count);
12580 need += wi.blob_length;
7c673cae 12581 }
12582 log_latency("compress@_do_alloc_write",
12583 l_bluestore_compress_lat,
12584 mono_clock::now() - start,
12585 cct->_conf->bluestore_log_op_age );
12586 } else {
12587 need += wi.blob_length;
7c673cae 12588 }
3efd9988 12589 }
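// Illustrative numbers for the acceptance test above (assuming
// min_alloc_size == 0x1000, crr == 0.875, wi.blob_length == 0x10000):
//   want_len_raw = 0x10000 * 0.875            == 0xE000
//   want_len     = p2roundup(0xE000, 0x1000)  == 0xE000
// a compressor output of 0x9C00 bytes then gives
//   result_len   = p2roundup(0x9C00, 0x1000)  == 0xA000 <= want_len
// so the compressed buffer is kept and padded with 0x400 zero bytes.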
a8e16298 12590 PExtentVector prealloc;
3efd9988 12591 prealloc.reserve(2 * wctx->writes.size());
11fdf7f2 12592 int64_t prealloc_left = 0;
12593 prealloc_left = alloc->allocate(
12594 need, min_alloc_size, need,
12595 0, &prealloc);
eafe8130 12596 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
11fdf7f2 12597 derr << __func__ << " failed to allocate 0x" << std::hex << need
eafe8130 12598 << " allocated 0x" << (prealloc_left < 0 ? 0 : prealloc_left)
12599 << " min_alloc_size 0x" << min_alloc_size
12600 << " available 0x" << alloc->get_free()
12601 << std::dec << dendl;
12602 if (prealloc.size()) {
12603 alloc->release(prealloc);
12604 }
12605 return -ENOSPC;
12606 }
a8e16298 12607
12608 dout(20) << __func__ << " prealloc " << prealloc << dendl;
12609 auto prealloc_pos = prealloc.begin();
12610
12611 for (auto& wi : wctx->writes) {
12612 BlobRef b = wi.b;
12613 bluestore_blob_t& dblob = b->dirty_blob();
12614 uint64_t b_off = wi.b_off;
12615 bufferlist *l = &wi.bl;
12616 uint64_t final_length = wi.blob_length;
12617 uint64_t csum_length = wi.blob_length;
12618 if (wi.compressed) {
12619 final_length = wi.compressed_bl.length();
12620 csum_length = final_length;
12621 l = &wi.compressed_bl;
12622 dblob.set_compressed(wi.blob_length, wi.compressed_len);
12623 } else if (wi.new_blob) {
7c673cae 12624 // initialize newly created blob only
12625 ceph_assert(dblob.is_mutable());
12626 unsigned csum_order;
12627 if (l->length() != wi.blob_length) {
12628 // hrm, maybe we could do better here, but let's not bother.
12629 dout(20) << __func__ << " forcing csum_order to block_size_order "
12630 << block_size_order << dendl;
31f18b77 12631 csum_order = block_size_order;
12632 } else {
12633 csum_order = std::min(wctx->csum_order, ctz(l->length()));
12634 }
12635 // try to align blob with max_blob_size to improve
12636 // its reuse ratio, e.g. in case of reverse write
12637 uint32_t suggested_boff =
12638 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
12639 if ((suggested_boff % (1 << csum_order)) == 0 &&
12640 suggested_boff + final_length <= max_bsize &&
12641 suggested_boff > b_off) {
181888fb 12642 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 12643 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 12644 ceph_assert(suggested_boff >= b_off);
12645 csum_length += suggested_boff - b_off;
12646 b_off = suggested_boff;
12647 }
12648 if (csum != Checksummer::CSUM_NONE) {
12649 dout(20) << __func__ << " initialize csum setting for new blob " << *b
12650 << " csum_type " << Checksummer::get_csum_type_string(csum)
12651 << " csum_order " << csum_order
12652 << " csum_length 0x" << std::hex << csum_length << std::dec
12653 << dendl;
12654 dblob.init_csum(csum, csum_order, csum_length);
12655 }
12656 }
12657
a8e16298 12658 PExtentVector extents;
12659 int64_t left = final_length;
12660 while (left > 0) {
11fdf7f2 12661 ceph_assert(prealloc_left > 0);
12662 if (prealloc_pos->length <= left) {
12663 prealloc_left -= prealloc_pos->length;
12664 left -= prealloc_pos->length;
12665 txc->statfs_delta.allocated() += prealloc_pos->length;
12666 extents.push_back(*prealloc_pos);
12667 ++prealloc_pos;
12668 } else {
12669 extents.emplace_back(prealloc_pos->offset, left);
12670 prealloc_pos->offset += left;
12671 prealloc_pos->length -= left;
12672 prealloc_left -= left;
12673 txc->statfs_delta.allocated() += left;
12674 left = 0;
12675 break;
12676 }
12677 }
7c673cae 12678 for (auto& p : extents) {
3efd9988 12679 txc->allocated.insert(p.offset, p.length);
7c673cae 12680 }
11fdf7f2 12681 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 12682
12683 dout(20) << __func__ << " blob " << *b << dendl;
12684 if (dblob.has_csum()) {
12685 dblob.calc_csum(b_off, *l);
12686 }
181888fb 12687
12688 if (wi.mark_unused) {
12689 auto b_end = b_off + wi.bl.length();
12690 if (b_off) {
12691 dblob.add_unused(0, b_off);
12692 }
12693 if (b_end < wi.blob_length) {
12694 dblob.add_unused(b_end, wi.blob_length - b_end);
12695 }
12696 }
12697
12698 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
12699 b_off + (wi.b_off0 - wi.b_off),
12700 wi.length0,
12701 wi.b,
12702 nullptr);
12703 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
12704 txc->statfs_delta.stored() += le->length;
12705 dout(20) << __func__ << " lex " << *le << dendl;
12706 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
12707 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
12708
12709 // queue io
11fdf7f2 12710 if (!g_conf()->bluestore_debug_omit_block_device_write) {
12711 if (l->length() <= prefer_deferred_size.load()) {
12712 dout(20) << __func__ << " deferring small 0x" << std::hex
12713 << l->length() << std::dec << " write via deferred" << dendl;
12714 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
12715 op->op = bluestore_deferred_op_t::OP_WRITE;
12716 int r = b->get_blob().map(
12717 b_off, l->length(),
12718 [&](uint64_t offset, uint64_t length) {
12719 op->extents.emplace_back(bluestore_pextent_t(offset, length));
12720 return 0;
12721 });
11fdf7f2 12722 ceph_assert(r == 0);
7c673cae 12723 op->data = *l;
81eedcae 12724 logger->inc(l_bluestore_write_small_deferred);
12725 } else {
12726 b->get_blob().map_bl(
12727 b_off, *l,
12728 [&](uint64_t offset, bufferlist& t) {
12729 bdev->aio_write(offset, t, &txc->ioc, false);
12730 });
81eedcae 12731 logger->inc(l_bluestore_write_small_new);
12732 }
12733 }
12734 }
12735 ceph_assert(prealloc_pos == prealloc.end());
12736 ceph_assert(prealloc_left == 0);
12737 return 0;
12738}
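// Illustrative sketch of the I/O dispatch above (prefer_deferred_size is
// configurable; 32768 is just an assumed value):
//
//   if (l->length() <= 32768) {
//     // small payload: journaled as a bluestore_deferred_op_t and
//     // written back to the device later
//   } else {
//     // large payload: submitted directly via bdev->aio_write()
//   }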
12739
12740void BlueStore::_wctx_finish(
12741 TransContext *txc,
12742 CollectionRef& c,
12743 OnodeRef o,
12744 WriteContext *wctx,
12745 set<SharedBlob*> *maybe_unshared_blobs)
12746{
12747 auto oep = wctx->old_extents.begin();
12748 while (oep != wctx->old_extents.end()) {
12749 auto &lo = *oep;
12750 oep = wctx->old_extents.erase(oep);
12751 dout(20) << __func__ << " lex_old " << lo.e << dendl;
12752 BlobRef b = lo.e.blob;
12753 const bluestore_blob_t& blob = b->get_blob();
12754 if (blob.is_compressed()) {
12755 if (lo.blob_empty) {
12756 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
12757 }
12758 txc->statfs_delta.compressed_original() -= lo.e.length;
12759 }
12760 auto& r = lo.r;
12761 txc->statfs_delta.stored() -= lo.e.length;
12762 if (!r.empty()) {
12763 dout(20) << __func__ << " blob release " << r << dendl;
12764 if (blob.is_shared()) {
12765 PExtentVector final;
12766 c->load_shared_blob(b->shared_blob);
12767 bool unshare = false;
12768 bool* unshare_ptr =
12769 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 12770 for (auto e : r) {
12771 b->shared_blob->put_ref(
12772 e.offset, e.length, &final,
12773 unshare_ptr);
12774 }
12775 if (unshare) {
12776 ceph_assert(maybe_unshared_blobs);
12777 maybe_unshared_blobs->insert(b->shared_blob.get());
12778 }
12779 dout(20) << __func__ << " shared_blob release " << final
12780 << " from " << *b->shared_blob << dendl;
12781 txc->write_shared_blob(b->shared_blob);
12782 r.clear();
12783 r.swap(final);
12784 }
12785 }
12786 // we can't invalidate our logical extents as we drop them because
12787 // other lextents (either in our onode or others) may still
12788 // reference them. but we can throw out anything that is no
12789 // longer allocated. Note that this will leave behind edge bits
12790 // that are no longer referenced but not deallocated (until they
12791 // age out of the cache naturally).
12792 b->discard_unallocated(c.get());
12793 for (auto e : r) {
12794 dout(20) << __func__ << " release " << e << dendl;
12795 txc->released.insert(e.offset, e.length);
12796 txc->statfs_delta.allocated() -= e.length;
12797 if (blob.is_compressed()) {
12798 txc->statfs_delta.compressed_allocated() -= e.length;
12799 }
12800 }
12801 delete &lo;
12802 if (b->is_spanning() && !b->is_referenced()) {
12803 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
12804 << dendl;
12805 o->extent_map.spanning_blob_map.erase(b->id);
12806 }
12807 }
12808}
12809
12810void BlueStore::_do_write_data(
12811 TransContext *txc,
12812 CollectionRef& c,
12813 OnodeRef o,
12814 uint64_t offset,
12815 uint64_t length,
12816 bufferlist& bl,
12817 WriteContext *wctx)
12818{
12819 uint64_t end = offset + length;
12820 bufferlist::iterator p = bl.begin();
12821
12822 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
12823 (length != min_alloc_size)) {
12824 // the write falls within a single min_alloc_size unit
12825 _do_write_small(txc, c, o, offset, length, p, wctx);
12826 } else {
12827 uint64_t head_offset, head_length;
12828 uint64_t middle_offset, middle_length;
12829 uint64_t tail_offset, tail_length;
12830
12831 head_offset = offset;
11fdf7f2 12832 head_length = p2nphase(offset, min_alloc_size);
7c673cae 12833
11fdf7f2
TL
12834 tail_offset = p2align(end, min_alloc_size);
12835 tail_length = p2phase(end, min_alloc_size);
7c673cae
FG
12836
12837 middle_offset = head_offset + head_length;
12838 middle_length = length - head_length - tail_length;
12839
12840 if (head_length) {
12841 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
12842 }
12843
12844 if (middle_length) {
12845 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
12846 }
12847
12848 if (tail_length) {
12849 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
12850 }
12851 }
12852}
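// Worked example of the head/middle/tail split above (assuming
// min_alloc_size == 0x1000): offset == 0x1800, length == 0xD000, end == 0xE800
//   head:   0x1800~0x800   (p2nphase(0x1800, 0x1000) == 0x800)  -> small write
//   middle: 0x2000~0xC000                                       -> big write
//   tail:   0xE000~0x800   (p2phase(0xE800, 0x1000) == 0x800)   -> small write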
12853
12854void BlueStore::_choose_write_options(
12855 CollectionRef& c,
12856 OnodeRef o,
12857 uint32_t fadvise_flags,
12858 WriteContext *wctx)
7c673cae 12859{
12860 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
12861 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 12862 wctx->buffered = true;
12863 } else if (cct->_conf->bluestore_default_buffered_write &&
12864 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
12865 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
12866 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 12867 wctx->buffered = true;
12868 }
12869
12870 // apply basic csum block size
12871 wctx->csum_order = block_size_order;
12872
12873 // compression parameters
12874 unsigned alloc_hints = o->onode.alloc_hint_flags;
12875 auto cm = select_option(
12876 "compression_mode",
31f18b77 12877 comp_mode.load(),
12878 [&]() {
12879 string val;
11fdf7f2 12880 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
12881 return boost::optional<Compressor::CompressionMode>(
12882 Compressor::get_comp_mode_type(val));
12883 }
12884 return boost::optional<Compressor::CompressionMode>();
12885 }
12886 );
12887
12888 wctx->compress = (cm != Compressor::COMP_NONE) &&
12889 ((cm == Compressor::COMP_FORCE) ||
12890 (cm == Compressor::COMP_AGGRESSIVE &&
12891 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
12892 (cm == Compressor::COMP_PASSIVE &&
12893 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
12894
12895 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
12896 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
12897 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
12898 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 12899 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 12900
7c673cae 12901 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 12902
7c673cae 12903 if (o->onode.expected_write_size) {
224ce89b 12904 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 12905 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 12906 } else {
224ce89b 12907 wctx->csum_order = min_alloc_size_order;
12908 }
12909
12910 if (wctx->compress) {
12911 wctx->target_blob_size = select_option(
7c673cae 12912 "compression_max_blob_size",
31f18b77 12913 comp_max_blob_size.load(),
7c673cae 12914 [&]() {
12915 int64_t val;
12916 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
12917 return boost::optional<uint64_t>((uint64_t)val);
12918 }
12919 return boost::optional<uint64_t>();
12920 }
12921 );
12922 }
12923 } else {
12924 if (wctx->compress) {
12925 wctx->target_blob_size = select_option(
7c673cae 12926 "compression_min_blob_size",
31f18b77 12927 comp_min_blob_size.load(),
7c673cae 12928 [&]() {
12929 int64_t val;
12930 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
12931 return boost::optional<uint64_t>((uint64_t)val);
12932 }
12933 return boost::optional<uint64_t>();
12934 }
12935 );
12936 }
12937 }
31f18b77 12938
7c673cae 12939 uint64_t max_bsize = max_blob_size.load();
12940 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
12941 wctx->target_blob_size = max_bsize;
7c673cae 12942 }
31f18b77 12943
12944 // set the min blob size floor at 2x the min_alloc_size, or else we
12945 // won't be able to allocate a smaller extent for the compressed
12946 // data.
12947 if (wctx->compress &&
12948 wctx->target_blob_size < min_alloc_size * 2) {
12949 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 12950 }
12951
12952 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
12953 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
12954 << " compress=" << (int)wctx->compress
12955 << " buffered=" << (int)wctx->buffered
12956 << std::dec << dendl;
12957}
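// Illustrative numbers for the floor above: with min_alloc_size == 0x10000
// and a pool requesting compression_min_blob_size == 0x8000, the clamp bumps
// wctx->target_blob_size to 2 * 0x10000 == 0x20000, so a compressed blob can
// still land in fewer allocation units than the uncompressed data would.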
12958
12959int BlueStore::_do_gc(
12960 TransContext *txc,
12961 CollectionRef& c,
12962 OnodeRef o,
12963 const WriteContext& wctx,
12964 uint64_t *dirty_start,
12965 uint64_t *dirty_end)
12966{
31f18b77 12967
1adf2230 12968 bool dirty_range_updated = false;
31f18b77 12969 WriteContext wctx_gc;
7c673cae 12970 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 12971
eafe8130 12972 auto & extents_to_collect = wctx.extents_to_gc;
12973 for (auto it = extents_to_collect.begin();
12974 it != extents_to_collect.end();
12975 ++it) {
12976 bufferlist bl;
12977 auto offset = (*it).first;
12978 auto length = (*it).second;
12979 dout(20) << __func__ << " processing " << std::hex
12980 << offset << "~" << length << std::dec
12981 << dendl;
12982 int r = _do_read(c.get(), o, offset, length, bl, 0);
12983 ceph_assert(r == (int)length);
31f18b77 12984
12985 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
12986 logger->inc(l_bluestore_gc_merged, length);
31f18b77 12987
12988 if (*dirty_start > offset) {
12989 *dirty_start = offset;
1adf2230 12990 dirty_range_updated = true;
12991 }
12992
12993 if (*dirty_end < offset + length) {
12994 *dirty_end = offset + length;
1adf2230 12995 dirty_range_updated = true;
12996 }
12997 }
12998 if (dirty_range_updated) {
12999 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
13000 }
13001
13002 dout(30) << __func__ << " alloc write" << dendl;
13003 int r = _do_alloc_write(txc, c, o, &wctx_gc);
13004 if (r < 0) {
13005 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
13006 << dendl;
13007 return r;
13008 }
13009
13010 _wctx_finish(txc, c, o, &wctx_gc);
13011 return 0;
13012}
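// Illustrative sketch of the interval_set feeding the loop above: adjacent
// or overlapping inserts coalesce, so each (*it).first/(*it).second pair is
// a maximal range. (Values below are made up.)
//
//   interval_set<uint64_t> to_gc;
//   to_gc.union_insert(0x1000, 0x1000);
//   to_gc.union_insert(0x2000, 0x1000);  // coalesces into one 0x1000~0x2000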
13013
13014int BlueStore::_do_write(
13015 TransContext *txc,
13016 CollectionRef& c,
13017 OnodeRef o,
13018 uint64_t offset,
13019 uint64_t length,
13020 bufferlist& bl,
13021 uint32_t fadvise_flags)
13022{
13023 int r = 0;
13024
13025 dout(20) << __func__
13026 << " " << o->oid
13027 << " 0x" << std::hex << offset << "~" << length
13028 << " - have 0x" << o->onode.size
13029 << " (" << std::dec << o->onode.size << ")"
13030 << " bytes"
13031 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
13032 << dendl;
81eedcae 13033 _dump_onode<30>(cct, *o);
13034
13035 if (length == 0) {
13036 return 0;
13037 }
13038
13039 uint64_t end = offset + length;
13040
13041 GarbageCollector gc(c->store->cct);
eafe8130 13042 int64_t benefit = 0;
13043 auto dirty_start = offset;
13044 auto dirty_end = end;
13045
13046 WriteContext wctx;
13047 _choose_write_options(c, o, fadvise_flags, &wctx);
13048 o->extent_map.fault_range(db, offset, length);
13049 _do_write_data(txc, c, o, offset, length, bl, &wctx);
13050 r = _do_alloc_write(txc, c, o, &wctx);
13051 if (r < 0) {
13052 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
13053 << dendl;
13054 goto out;
13055 }
13056
13057 if (wctx.extents_to_gc.empty() ||
13058 wctx.extents_to_gc.range_start() > offset ||
13059 wctx.extents_to_gc.range_end() < offset + length) {
13060 benefit = gc.estimate(offset,
13061 length,
13062 o->extent_map,
13063 wctx.old_extents,
13064 min_alloc_size);
13065 }
13066
13067 // NB: _wctx_finish() will empty old_extents
13068 // so we must do gc estimation before that
13069 _wctx_finish(txc, c, o, &wctx);
13070 if (end > o->onode.size) {
13071 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 13072 << std::dec << dendl;
13073 o->onode.size = end;
13074 }
13075
11fdf7f2 13076 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
13077 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
13078 dout(20) << __func__
13079 << " perform garbage collection for compressed extents, "
13080 << "expected benefit = " << benefit << " AUs" << dendl;
13081 }
13082 if (!wctx.extents_to_gc.empty()) {
13083 dout(20) << __func__ << " perform garbage collection" << dendl;
13084
13085 r = _do_gc(txc, c, o,
13086 wctx,
13087 &dirty_start, &dirty_end);
13088 if (r < 0) {
13089 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
13090 << dendl;
13091 goto out;
7c673cae 13092 }
13093 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
13094 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae 13095 }
7c673cae 13096 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
13097 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
13098
13099 r = 0;
13100
13101 out:
13102 return r;
13103}
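// Illustrative numbers for the GC trigger above: gc.estimate() reports the
// expected benefit in allocation units; if it returned, say, 20 AUs against
// a bluestore_gc_enable_total_threshold of 16, the compressed extents from
// get_extents_to_collect() would be merged into wctx.extents_to_gc and
// rewritten by _do_gc() within this same transaction.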
13104
13105int BlueStore::_write(TransContext *txc,
13106 CollectionRef& c,
13107 OnodeRef& o,
13108 uint64_t offset, size_t length,
13109 bufferlist& bl,
13110 uint32_t fadvise_flags)
13111{
13112 dout(15) << __func__ << " " << c->cid << " " << o->oid
13113 << " 0x" << std::hex << offset << "~" << length << std::dec
13114 << dendl;
13115 int r = 0;
13116 if (offset + length >= OBJECT_MAX_SIZE) {
13117 r = -E2BIG;
13118 } else {
13119 _assign_nid(txc, o);
13120 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
13121 txc->write_onode(o);
13122 }
13123 dout(10) << __func__ << " " << c->cid << " " << o->oid
13124 << " 0x" << std::hex << offset << "~" << length << std::dec
13125 << " = " << r << dendl;
13126 return r;
13127}
13128
13129int BlueStore::_zero(TransContext *txc,
13130 CollectionRef& c,
13131 OnodeRef& o,
13132 uint64_t offset, size_t length)
13133{
13134 dout(15) << __func__ << " " << c->cid << " " << o->oid
13135 << " 0x" << std::hex << offset << "~" << length << std::dec
13136 << dendl;
13137 int r = 0;
13138 if (offset + length >= OBJECT_MAX_SIZE) {
13139 r = -E2BIG;
13140 } else {
13141 _assign_nid(txc, o);
13142 r = _do_zero(txc, c, o, offset, length);
13143 }
13144 dout(10) << __func__ << " " << c->cid << " " << o->oid
13145 << " 0x" << std::hex << offset << "~" << length << std::dec
13146 << " = " << r << dendl;
13147 return r;
13148}
13149
13150int BlueStore::_do_zero(TransContext *txc,
13151 CollectionRef& c,
13152 OnodeRef& o,
13153 uint64_t offset, size_t length)
13154{
13155 dout(15) << __func__ << " " << c->cid << " " << o->oid
13156 << " 0x" << std::hex << offset << "~" << length << std::dec
13157 << dendl;
13158 int r = 0;
13159
81eedcae 13160 _dump_onode<30>(cct, *o);
13161
13162 WriteContext wctx;
13163 o->extent_map.fault_range(db, offset, length);
13164 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 13165 o->extent_map.dirty_range(offset, length);
13166 _wctx_finish(txc, c, o, &wctx);
13167
b32b8144 13168 if (length > 0 && offset + length > o->onode.size) {
13169 o->onode.size = offset + length;
13170 dout(20) << __func__ << " extending size to " << offset + length
13171 << dendl;
13172 }
13173 txc->write_onode(o);
13174
13175 dout(10) << __func__ << " " << c->cid << " " << o->oid
13176 << " 0x" << std::hex << offset << "~" << length << std::dec
13177 << " = " << r << dendl;
13178 return r;
13179}
13180
13181void BlueStore::_do_truncate(
13182 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
13183 set<SharedBlob*> *maybe_unshared_blobs)
13184{
13185 dout(15) << __func__ << " " << c->cid << " " << o->oid
13186 << " 0x" << std::hex << offset << std::dec << dendl;
13187
81eedcae 13188 _dump_onode<30>(cct, *o);
13189
13190 if (offset == o->onode.size)
31f18b77 13191 return;
13192
13193 if (offset < o->onode.size) {
13194 WriteContext wctx;
13195 uint64_t length = o->onode.size - offset;
13196 o->extent_map.fault_range(db, offset, length);
13197 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
13198 o->extent_map.dirty_range(offset, length);
13199 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
13200
13201 // if we have shards past EOF, ask for a reshard
13202 if (!o->onode.extent_map_shards.empty() &&
13203 o->onode.extent_map_shards.back().offset >= offset) {
13204 dout(10) << __func__ << " request reshard past EOF" << dendl;
13205 if (offset) {
13206 o->extent_map.request_reshard(offset - 1, offset + length);
13207 } else {
13208 o->extent_map.request_reshard(0, length);
13209 }
13210 }
13211 }
13212
13213 o->onode.size = offset;
13214
13215 txc->write_onode(o);
13216}
13217
35e4c445 13218int BlueStore::_truncate(TransContext *txc,
13219 CollectionRef& c,
13220 OnodeRef& o,
13221 uint64_t offset)
13222{
13223 dout(15) << __func__ << " " << c->cid << " " << o->oid
13224 << " 0x" << std::hex << offset << std::dec
13225 << dendl;
13226 int r = 0;
13227 if (offset >= OBJECT_MAX_SIZE) {
13228 r = -E2BIG;
13229 } else {
13230 _do_truncate(txc, c, o, offset);
13231 }
13232 dout(10) << __func__ << " " << c->cid << " " << o->oid
13233 << " 0x" << std::hex << offset << std::dec
13234 << " = " << r << dendl;
13235 return r;
13236}
13237
13238int BlueStore::_do_remove(
13239 TransContext *txc,
13240 CollectionRef& c,
13241 OnodeRef o)
13242{
31f18b77 13243 set<SharedBlob*> maybe_unshared_blobs;
13244 bool is_gen = !o->oid.is_no_gen();
13245 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
13246 if (o->onode.has_omap()) {
13247 o->flush();
13248 _do_omap_clear(txc,
13249 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
13250 o->onode.nid);
13251 }
13252 o->exists = false;
13253 string key;
13254 for (auto &s : o->extent_map.shards) {
13255 dout(20) << __func__ << " removing shard 0x" << std::hex
13256 << s.shard_info->offset << std::dec << dendl;
13257 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
13258 [&](const string& final_key) {
13259 txc->t->rmkey(PREFIX_OBJ, final_key);
13260 }
13261 );
13262 }
13263 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 13264 txc->note_removed_object(o);
13265 o->extent_map.clear();
13266 o->onode = bluestore_onode_t();
13267 _debug_obj_on_delete(o->oid);
31f18b77 13268
13269 if (!is_gen || maybe_unshared_blobs.empty()) {
13270 return 0;
13271 }
31f18b77 13272
224ce89b
WB
13273 // see if we can unshare blobs still referenced by the head
13274 dout(10) << __func__ << " gen and maybe_unshared_blobs "
13275 << maybe_unshared_blobs << dendl;
13276 ghobject_t nogen = o->oid;
13277 nogen.generation = ghobject_t::NO_GEN;
13278 OnodeRef h = c->onode_map.lookup(nogen);
13279
13280 if (!h || !h->exists) {
13281 return 0;
13282 }
13283
13284 dout(20) << __func__ << " checking for unshareable blobs on " << h
13285 << " " << h->oid << dendl;
13286 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
13287 for (auto& e : h->extent_map.extent_map) {
13288 const bluestore_blob_t& b = e.blob->get_blob();
13289 SharedBlob *sb = e.blob->shared_blob.get();
13290 if (b.is_shared() &&
13291 sb->loaded &&
13292 maybe_unshared_blobs.count(sb)) {
13293 if (b.is_compressed()) {
13294 expect[sb].get(0, b.get_ondisk_length());
13295 } else {
13296 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
13297 expect[sb].get(off, len);
13298 return 0;
13299 });
13300 }
13301 }
13302 }
31f18b77 13303
13304 vector<SharedBlob*> unshared_blobs;
13305 unshared_blobs.reserve(maybe_unshared_blobs.size());
13306 for (auto& p : expect) {
13307 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
13308 if (p.first->persistent->ref_map == p.second) {
13309 SharedBlob *sb = p.first;
13310 dout(20) << __func__ << " unsharing " << *sb << dendl;
13311 unshared_blobs.push_back(sb);
13312 txc->unshare_blob(sb);
13313 uint64_t sbid = c->make_blob_unshared(sb);
13314 string key;
13315 get_shared_blob_key(sbid, &key);
13316 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
13317 }
13318 }
13319
13320 if (unshared_blobs.empty()) {
13321 return 0;
13322 }
13323
13324 for (auto& e : h->extent_map.extent_map) {
13325 const bluestore_blob_t& b = e.blob->get_blob();
13326 SharedBlob *sb = e.blob->shared_blob.get();
13327 if (b.is_shared() &&
13328 std::find(unshared_blobs.begin(), unshared_blobs.end(),
13329 sb) != unshared_blobs.end()) {
13330 dout(20) << __func__ << " unsharing " << e << dendl;
13331 bluestore_blob_t& blob = e.blob->dirty_blob();
13332 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 13333 h->extent_map.dirty_range(e.logical_offset, 1);
13334 }
13335 }
13336 txc->write_onode(h);
13337
13338 return 0;
13339}
13340
13341int BlueStore::_remove(TransContext *txc,
13342 CollectionRef& c,
13343 OnodeRef &o)
13344{
13345 dout(15) << __func__ << " " << c->cid << " " << o->oid
13346 << " onode " << o.get()
13347 << " txc "<< txc << dendl;
13348 int r = _do_remove(txc, c, o);
13349 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13350 return r;
13351}
13352
13353int BlueStore::_setattr(TransContext *txc,
13354 CollectionRef& c,
13355 OnodeRef& o,
13356 const string& name,
13357 bufferptr& val)
13358{
13359 dout(15) << __func__ << " " << c->cid << " " << o->oid
13360 << " " << name << " (" << val.length() << " bytes)"
13361 << dendl;
13362 int r = 0;
13363 if (val.is_partial()) {
13364 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
13365 val.length());
13366 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
13367 } else {
13368 auto& b = o->onode.attrs[name.c_str()] = val;
13369 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
13370 }
13371 txc->write_onode(o);
13372 dout(10) << __func__ << " " << c->cid << " " << o->oid
13373 << " " << name << " (" << val.length() << " bytes)"
13374 << " = " << r << dendl;
13375 return r;
13376}
13377
13378int BlueStore::_setattrs(TransContext *txc,
13379 CollectionRef& c,
13380 OnodeRef& o,
13381 const map<string,bufferptr>& aset)
13382{
13383 dout(15) << __func__ << " " << c->cid << " " << o->oid
13384 << " " << aset.size() << " keys"
13385 << dendl;
13386 int r = 0;
13387 for (map<string,bufferptr>::const_iterator p = aset.begin();
13388 p != aset.end(); ++p) {
13389 if (p->second.is_partial()) {
13390 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 13391 bufferptr(p->second.c_str(), p->second.length());
13392 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
13393 } else {
13394 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
13395 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
13396 }
13397 }
13398 txc->write_onode(o);
13399 dout(10) << __func__ << " " << c->cid << " " << o->oid
13400 << " " << aset.size() << " keys"
13401 << " = " << r << dendl;
13402 return r;
13403}
13404
13405
13406int BlueStore::_rmattr(TransContext *txc,
13407 CollectionRef& c,
13408 OnodeRef& o,
13409 const string& name)
13410{
13411 dout(15) << __func__ << " " << c->cid << " " << o->oid
13412 << " " << name << dendl;
13413 int r = 0;
13414 auto it = o->onode.attrs.find(name.c_str());
13415 if (it == o->onode.attrs.end())
13416 goto out;
13417
13418 o->onode.attrs.erase(it);
13419 txc->write_onode(o);
13420
13421 out:
13422 dout(10) << __func__ << " " << c->cid << " " << o->oid
13423 << " " << name << " = " << r << dendl;
13424 return r;
13425}
13426
13427int BlueStore::_rmattrs(TransContext *txc,
13428 CollectionRef& c,
13429 OnodeRef& o)
13430{
13431 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13432 int r = 0;
13433
13434 if (o->onode.attrs.empty())
13435 goto out;
13436
13437 o->onode.attrs.clear();
13438 txc->write_onode(o);
13439
13440 out:
13441 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13442 return r;
13443}
13444
13445void BlueStore::_do_omap_clear(TransContext *txc, const string& omap_prefix,
13446 uint64_t id)
7c673cae 13447{
13448 string prefix, tail;
13449 get_omap_header(id, &prefix);
13450 get_omap_tail(id, &tail);
11fdf7f2 13451 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 13452 txc->t->rmkey(omap_prefix, tail);
13453 dout(20) << __func__ << " remove range start: "
13454 << pretty_binary_string(prefix) << " end: "
13455 << pretty_binary_string(tail) << dendl;
13456}
13457
13458int BlueStore::_omap_clear(TransContext *txc,
13459 CollectionRef& c,
13460 OnodeRef& o)
13461{
13462 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13463 int r = 0;
13464 if (o->onode.has_omap()) {
13465 o->flush();
13466 _do_omap_clear(txc,
13467 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
13468 o->onode.nid);
13469 o->onode.clear_omap_flag();
13470 txc->write_onode(o);
13471 }
13472 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13473 return r;
13474}
13475
13476int BlueStore::_omap_setkeys(TransContext *txc,
13477 CollectionRef& c,
13478 OnodeRef& o,
13479 bufferlist &bl)
13480{
13481 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13482 int r;
11fdf7f2 13483 auto p = bl.cbegin();
13484 __u32 num;
13485 if (!o->onode.has_omap()) {
13486 o->onode.set_omap_flag();
13487 if (o->oid.is_pgmeta()) {
13488 o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
13489 }
7c673cae 13490 txc->write_onode(o);
13491
13492 const string& prefix =
13493 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13494 string key_tail;
13495 bufferlist tail;
13496 get_omap_tail(o->onode.nid, &key_tail);
13497 txc->t->set(prefix, key_tail, tail);
13498 } else {
13499 txc->note_modified_object(o);
13500 }
13501 const string& prefix =
13502 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13503 string final_key;
13504 _key_encode_u64(o->onode.nid, &final_key);
13505 final_key.push_back('.');
11fdf7f2 13506 decode(num, p);
13507 while (num--) {
13508 string key;
13509 bufferlist value;
13510 decode(key, p);
13511 decode(value, p);
13512 final_key.resize(9); // keep prefix
13513 final_key += key;
11fdf7f2 13514 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 13515 << " <- " << key << dendl;
11fdf7f2 13516 txc->t->set(prefix, final_key, value);
13517 }
13518 r = 0;
13519 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13520 return r;
13521}
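// Sketch of the omap key layout used above (all omap rows of one onode
// share a 9-byte prefix, which is why final_key.resize(9) keeps it):
//
//   string final_key;
//   _key_encode_u64(nid, &final_key);  // 8-byte big-endian nid
//   final_key.push_back('.');          // 9th byte: separator
//   final_key += user_key;             // per-entry suffix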
13522
13523int BlueStore::_omap_setheader(TransContext *txc,
13524 CollectionRef& c,
13525 OnodeRef &o,
13526 bufferlist& bl)
13527{
13528 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13529 int r;
13530 string key;
13531 if (!o->onode.has_omap()) {
13532 o->onode.set_omap_flag();
13533 if (o->oid.is_pgmeta()) {
13534 o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
13535 }
7c673cae 13536 txc->write_onode(o);
13537
13538 const string& prefix =
13539 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13540 string key_tail;
13541 bufferlist tail;
13542 get_omap_tail(o->onode.nid, &key_tail);
13543 txc->t->set(prefix, key_tail, tail);
13544 } else {
13545 txc->note_modified_object(o);
13546 }
13547 const string& prefix =
13548 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
7c673cae 13549 get_omap_header(o->onode.nid, &key);
11fdf7f2 13550 txc->t->set(prefix, key, bl);
13551 r = 0;
13552 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13553 return r;
13554}
13555
13556int BlueStore::_omap_rmkeys(TransContext *txc,
13557 CollectionRef& c,
13558 OnodeRef& o,
13559 bufferlist& bl)
13560{
13561 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13562 int r = 0;
11fdf7f2 13563 auto p = bl.cbegin();
13564 __u32 num;
13565 string final_key;
13566
13567 if (!o->onode.has_omap()) {
13568 goto out;
13569 }
13570 {
13571 const string& prefix =
13572 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13573 _key_encode_u64(o->onode.nid, &final_key);
13574 final_key.push_back('.');
13575 decode(num, p);
13576 while (num--) {
13577 string key;
13578 decode(key, p);
13579 final_key.resize(9); // keep prefix
13580 final_key += key;
13581 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
13582 << " <- " << key << dendl;
13583 txc->t->rmkey(prefix, final_key);
13584 }
13585 }
13586 txc->note_modified_object(o);
13587
13588 out:
13589 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13590 return r;
13591}
13592
13593int BlueStore::_omap_rmkey_range(TransContext *txc,
13594 CollectionRef& c,
13595 OnodeRef& o,
13596 const string& first, const string& last)
13597{
13598 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13599 string key_first, key_last;
13600 int r = 0;
13601 if (!o->onode.has_omap()) {
13602 goto out;
13603 }
13604 {
13605 const string& prefix =
13606 o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13607 o->flush();
13608 get_omap_key(o->onode.nid, first, &key_first);
13609 get_omap_key(o->onode.nid, last, &key_last);
13610 txc->t->rm_range_keys(prefix, key_first, key_last);
13611 dout(20) << __func__ << " remove range start: "
13612 << pretty_binary_string(key_first) << " end: "
13613 << pretty_binary_string(key_last) << dendl;
13614 }
13615 txc->note_modified_object(o);
13616
13617 out:
13618 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13619 return r;
13620}
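// Illustrative sketch of the range removal above, assuming rm_range_keys()
// deletes the half-open interval [key_first, key_last): with first == "a"
// and last == "c", every key of this onode in ["<nid>.a", "<nid>.c") is
// dropped and "c" itself survives.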
13621
13622int BlueStore::_set_alloc_hint(
13623 TransContext *txc,
13624 CollectionRef& c,
13625 OnodeRef& o,
13626 uint64_t expected_object_size,
13627 uint64_t expected_write_size,
13628 uint32_t flags)
13629{
13630 dout(15) << __func__ << " " << c->cid << " " << o->oid
13631 << " object_size " << expected_object_size
13632 << " write_size " << expected_write_size
13633 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
13634 << dendl;
13635 int r = 0;
13636 o->onode.expected_object_size = expected_object_size;
13637 o->onode.expected_write_size = expected_write_size;
13638 o->onode.alloc_hint_flags = flags;
13639 txc->write_onode(o);
13640 dout(10) << __func__ << " " << c->cid << " " << o->oid
13641 << " object_size " << expected_object_size
13642 << " write_size " << expected_write_size
13643 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
13644 << " = " << r << dendl;
13645 return r;
13646}
13647
13648int BlueStore::_clone(TransContext *txc,
13649 CollectionRef& c,
13650 OnodeRef& oldo,
13651 OnodeRef& newo)
13652{
13653 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13654 << newo->oid << dendl;
13655 int r = 0;
13656 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
13657 derr << __func__ << " mismatched hash on " << oldo->oid
13658 << " and " << newo->oid << dendl;
13659 return -EINVAL;
13660 }
13661
13662 _assign_nid(txc, newo);
13663
13664 // clone data
13665 oldo->flush();
13666 _do_truncate(txc, c, newo, 0);
13667 if (cct->_conf->bluestore_clone_cow) {
13668 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
13669 } else {
13670 bufferlist bl;
13671 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
13672 if (r < 0)
13673 goto out;
13674 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
13675 if (r < 0)
13676 goto out;
13677 }
13678
13679 // clone attrs
13680 newo->onode.attrs = oldo->onode.attrs;
13681
13682 // clone omap
13683 if (newo->onode.has_omap()) {
13684 dout(20) << __func__ << " clearing old omap data" << dendl;
13685 newo->flush();
13686 _do_omap_clear(txc,
13687 newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP
13688 : PREFIX_OMAP,
13689 newo->onode.nid);
494da23a 13690 newo->onode.clear_omap_flag();
13691 }
13692 if (oldo->onode.has_omap()) {
13693 dout(20) << __func__ << " copying omap data" << dendl;
13694 newo->onode.set_omap_flag();
13695 if (newo->oid.is_pgmeta()) {
13696 newo->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
7c673cae 13697 }
13698 const string& prefix =
13699 newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
13700 KeyValueDB::Iterator it = db->get_iterator(prefix);
13701 string head, tail;
13702 get_omap_header(oldo->onode.nid, &head);
13703 get_omap_tail(oldo->onode.nid, &tail);
13704 it->lower_bound(head);
13705 while (it->valid()) {
13706 if (it->key() >= tail) {
13707 dout(30) << __func__ << " reached tail" << dendl;
13708 break;
13709 } else {
13710 dout(30) << __func__ << " got header/data "
13711 << pretty_binary_string(it->key()) << dendl;
13712 string key;
13713 rewrite_omap_key(newo->onode.nid, it->key(), &key);
11fdf7f2 13714 txc->t->set(prefix, key, it->value());
13715 }
13716 it->next();
13717 }
13718 string new_tail;
13719 bufferlist new_tail_value;
13720 get_omap_tail(newo->onode.nid, &new_tail);
13721 txc->t->set(prefix, new_tail, new_tail_value);
13722 }
13723
13724 txc->write_onode(newo);
13725 r = 0;
13726
13727 out:
13728 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13729 << newo->oid << " = " << r << dendl;
13730 return r;
13731}
13732
13733int BlueStore::_do_clone_range(
13734 TransContext *txc,
13735 CollectionRef& c,
13736 OnodeRef& oldo,
13737 OnodeRef& newo,
13738 uint64_t srcoff,
13739 uint64_t length,
13740 uint64_t dstoff)
13741{
13742 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13743 << newo->oid
13744 << " 0x" << std::hex << srcoff << "~" << length << " -> "
13745 << " 0x" << dstoff << "~" << length << std::dec << dendl;
13746 oldo->extent_map.fault_range(db, srcoff, length);
13747 newo->extent_map.fault_range(db, dstoff, length);
13748 _dump_onode<30>(cct, *oldo);
13749 _dump_onode<30>(cct, *newo);
7c673cae 13750
11fdf7f2 13751 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
13752 _dump_onode<30>(cct, *oldo);
13753 _dump_onode<30>(cct, *newo);
13754 return 0;
13755}
13756
13757int BlueStore::_clone_range(TransContext *txc,
13758 CollectionRef& c,
13759 OnodeRef& oldo,
13760 OnodeRef& newo,
13761 uint64_t srcoff, uint64_t length, uint64_t dstoff)
13762{
13763 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13764 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
13765 << " to offset 0x" << dstoff << std::dec << dendl;
13766 int r = 0;
13767
13768 if (srcoff + length >= OBJECT_MAX_SIZE ||
13769 dstoff + length >= OBJECT_MAX_SIZE) {
13770 r = -E2BIG;
13771 goto out;
13772 }
13773 if (srcoff + length > oldo->onode.size) {
13774 r = -EINVAL;
13775 goto out;
13776 }
13777
13778 _assign_nid(txc, newo);
13779
13780 if (length > 0) {
13781 if (cct->_conf->bluestore_clone_cow) {
13782 _do_zero(txc, c, newo, dstoff, length);
13783 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
13784 } else {
13785 bufferlist bl;
13786 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
13787 if (r < 0)
13788 goto out;
13789 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
13790 if (r < 0)
13791 goto out;
13792 }
13793 }
13794
13795 txc->write_onode(newo);
13796 r = 0;
13797
13798 out:
13799 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13800 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
13801 << " to offset 0x" << dstoff << std::dec
13802 << " = " << r << dendl;
13803 return r;
13804}
13805
13806int BlueStore::_rename(TransContext *txc,
13807 CollectionRef& c,
13808 OnodeRef& oldo,
13809 OnodeRef& newo,
13810 const ghobject_t& new_oid)
13811{
13812 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
13813 << new_oid << dendl;
13814 int r;
13815 ghobject_t old_oid = oldo->oid;
31f18b77 13816 mempool::bluestore_cache_other::string new_okey;
13817
13818 if (newo) {
13819 if (newo->exists) {
13820 r = -EEXIST;
13821 goto out;
13822 }
11fdf7f2 13823 ceph_assert(txc->onodes.count(newo) == 0);
13824 }
13825
13826 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
13827
13828 // rewrite shards
13829 {
13830 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
13831 get_object_key(cct, new_oid, &new_okey);
13832 string key;
13833 for (auto &s : oldo->extent_map.shards) {
13834 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
13835 [&](const string& final_key) {
13836 txc->t->rmkey(PREFIX_OBJ, final_key);
13837 }
13838 );
13839 s.dirty = true;
13840 }
13841 }
13842
13843 newo = oldo;
13844 txc->write_onode(newo);
13845
13846 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
13847 // Onode in the old slot
13848 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
13849 r = 0;
13850
13851 // hold a ref to new Onode in old name position, to ensure we don't drop
13852 // it from the cache before this txc commits (or else someone may come along
13853 // and read newo's metadata via the old name).
13854 txc->note_modified_object(oldo);
13855
13856 out:
13857 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
13858 << new_oid << " = " << r << dendl;
13859 return r;
13860}
13861
13862// collections
13863
13864int BlueStore::_create_collection(
13865 TransContext *txc,
13866 const coll_t &cid,
13867 unsigned bits,
13868 CollectionRef *c)
13869{
13870 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
13871 int r;
13872 bufferlist bl;
13873
13874 {
13875 RWLock::WLocker l(coll_lock);
13876 if (*c) {
13877 r = -EEXIST;
13878 goto out;
13879 }
13880 auto p = new_coll_map.find(cid);
13881 ceph_assert(p != new_coll_map.end());
13882 *c = p->second;
13883 (*c)->cnode.bits = bits;
13884 coll_map[cid] = *c;
11fdf7f2 13885 new_coll_map.erase(p);
7c673cae 13886 }
11fdf7f2 13887 encode((*c)->cnode, bl);
13888 txc->t->set(PREFIX_COLL, stringify(cid), bl);
13889 r = 0;
13890
13891 out:
13892 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
13893 return r;
13894}
13895
13896int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
13897 CollectionRef *c)
13898{
13899 dout(15) << __func__ << " " << cid << dendl;
13900 int r;
13901
11fdf7f2 13902 (*c)->flush_all_but_last();
13903 {
13904 RWLock::WLocker l(coll_lock);
13905 if (!*c) {
13906 r = -ENOENT;
13907 goto out;
13908 }
13909 size_t nonexistent_count = 0;
11fdf7f2 13910 ceph_assert((*c)->exists);
7c673cae
FG
13911 if ((*c)->onode_map.map_any([&](OnodeRef o) {
13912 if (o->exists) {
13913 dout(1) << __func__ << " " << o->oid << " " << o
13914 << " exists in onode_map" << dendl;
13915 return true;
13916 }
13917 ++nonexistent_count;
13918 return false;
13919 })) {
13920 r = -ENOTEMPTY;
13921 goto out;
13922 }
13923
13924 vector<ghobject_t> ls;
13925 ghobject_t next;
13926 // Enumerate onodes in db, up to nonexistent_count + 1,
13927 // then check whether all of them are marked as non-existent.
11fdf7f2 13928 // Bypass the check if (next != ghobject_t::get_max()).
13929 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
13930 nonexistent_count + 1, &ls, &next);
13931 if (r >= 0) {
13932 // If true, the collection has more objects than nonexistent_count,
13933 // so bypass the check.
13934 bool exists = (!next.is_max());
13935 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
13936 dout(10) << __func__ << " oid " << *it << dendl;
13937 auto onode = (*c)->onode_map.lookup(*it);
13938 exists = !onode || onode->exists;
13939 if (exists) {
13940 dout(1) << __func__ << " " << *it
13941 << " exists in db, "
13942 << (!onode ? "not present in ram" : "present in ram")
13943 << dendl;
13944 }
13945 }
13946 if (!exists) {
11fdf7f2 13947 _do_remove_collection(txc, c);
7c673cae
FG
13948 r = 0;
13949 } else {
13950 dout(10) << __func__ << " " << cid
13951 << " is non-empty" << dendl;
13952 r = -ENOTEMPTY;
13953 }
13954 }
13955 }
13956
13957 out:
13958 dout(10) << __func__ << " " << cid << " = " << r << dendl;
13959 return r;
13960}
13961
13962void BlueStore::_do_remove_collection(TransContext *txc,
13963 CollectionRef *c)
13964{
13965 coll_map.erase((*c)->cid);
13966 txc->removed_collections.push_back(*c);
13967 (*c)->exists = false;
13968 _osr_register_zombie((*c)->osr.get());
13969 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
13970 c->reset();
13971}
13972
13973int BlueStore::_split_collection(TransContext *txc,
13974 CollectionRef& c,
13975 CollectionRef& d,
13976 unsigned bits, int rem)
13977{
13978 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
13979 << " bits " << bits << dendl;
13980 RWLock::WLocker l(c->lock);
13981 RWLock::WLocker l2(d->lock);
13982 int r;
13983
13984 // flush all previous deferred writes on this sequencer. this is a bit
13985 // heavyweight, but we need to make sure all deferred writes complete
13986 // before we split as the new collection's sequencer may need to order
13987 // this after those writes, and we don't bother with the complexity of
13988 // moving those TransContexts over to the new osr.
13989 _osr_drain_preceding(txc);
13990
13991 // move any cached items (onodes and referenced shared blobs) that will
13992 // belong to the child collection post-split. leave everything else behind.
13993 // this may include things that don't strictly belong to the now-smaller
13994 // parent split, but the OSD will always send us a split for every new
13995 // child.
13996
13997 spg_t pgid, dest_pgid;
13998 bool is_pg = c->cid.is_pg(&pgid);
11fdf7f2 13999 ceph_assert(is_pg);
7c673cae 14000 is_pg = d->cid.is_pg(&dest_pgid);
11fdf7f2 14001 ceph_assert(is_pg);
14002
14003 // the destination should initially be empty.
14004 ceph_assert(d->onode_map.empty());
14005 ceph_assert(d->shared_blob_set.empty());
14006 ceph_assert(d->cnode.bits == bits);
14007
14008 c->split_cache(d.get());
14009
14010 // adjust bits. note that this will be redundant for all but the first
14011 // split call for this parent (first child).
14012 c->cnode.bits = bits;
11fdf7f2 14013 ceph_assert(d->cnode.bits == bits);
14014 r = 0;
14015
14016 bufferlist bl;
11fdf7f2 14017 encode(c->cnode, bl);
14018 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
14019
14020 dout(10) << __func__ << " " << c->cid << " to " << d->cid
14021 << " bits " << bits << " = " << r << dendl;
14022 return r;
14023}
14024
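// Editor's note: worked example of the "bits" argument above. When pg 1.2
// splits at bits = 3, objects are partitioned on the low three hash bits:
// (hash & 0x7) == 0x2 stays in the parent, while the new child (e.g. 1.6)
// takes (hash & 0x7) == 0x6. A hedged illustration of that membership test
// (it mirrors the idea behind split_cache(), not its actual code):
#if 0
static bool belongs_to_collection(uint32_t hash, uint32_t seed, unsigned bits)
{
  const uint32_t mask = (1u << bits) - 1;  // keep the low `bits` hash bits
  return (hash & mask) == (seed & mask);
}
#endif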
14025int BlueStore::_merge_collection(
14026 TransContext *txc,
14027 CollectionRef *c,
14028 CollectionRef& d,
14029 unsigned bits)
14030{
14031 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
14032 << " bits " << bits << dendl;
14033 RWLock::WLocker l((*c)->lock);
14034 RWLock::WLocker l2(d->lock);
14035 int r;
14036
14037 coll_t cid = (*c)->cid;
14038
14039 // flush all previous deferred writes on the source collection to ensure
14040 // that they complete before we merge, as the target collection's
14041 // sequencer may need to order new ops after those writes.
14042
14043 _osr_drain((*c)->osr.get());
14044
14045 // move all cached items (onodes and referenced shared blobs) from the
14046 // source collection into the target collection, where they will live
14047 // post-merge. unlike a split, nothing is left behind: the source
14048 // collection is removed below, so all of its cached state must be
14049 // migrated to the target.
14050
14051 spg_t pgid, dest_pgid;
14052 bool is_pg = cid.is_pg(&pgid);
14053 ceph_assert(is_pg);
14054 is_pg = d->cid.is_pg(&dest_pgid);
14055 ceph_assert(is_pg);
14056
14057 // adjust bits. note that this will be redundant for all but the first
14058 // merge call for the parent/target.
14059 d->cnode.bits = bits;
14060
14061 // split_cache() behavior depends on the target's (d) bits, so do this after they are updated.
14062 (*c)->split_cache(d.get());
14063
14064 // remove source collection
14065 {
14066 RWLock::WLocker l3(coll_lock);
14067 _do_remove_collection(txc, c);
14068 }
14069
14070 r = 0;
14071
14072 bufferlist bl;
14073 encode(d->cnode, bl);
14074 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
14075
14076 dout(10) << __func__ << " " << cid << " to " << d->cid
14077 << " bits " << bits << " = " << r << dendl;
14078 return r;
14079}
14080
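// Editor's note: worked example of a merge. Merging child pg 1.6 back into
// pg 1.2 while dropping from 3 to 2 hash bits lowers the target's bits to 2,
// so objects formerly separated as (hash & 0x7) == 0x6 vs == 0x2 now all
// satisfy (hash & 0x3) == 0x2. That is why d->cnode.bits is updated before
// split_cache() runs: the migration decision depends on the new bit count.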
14081void BlueStore::log_latency(
14082 const char* name,
14083 int idx,
14084 const ceph::timespan& l,
14085 double lat_threshold,
14086 const char* info) const
14087{
14088 logger->tinc(idx, l);
14089 if (lat_threshold > 0.0 &&
14090 l >= make_timespan(lat_threshold)) {
14091 dout(0) << __func__ << " slow operation observed for " << name
14092 << ", latency = " << l
14093 << info
14094 << dendl;
14095 }
14096}
14097
11fdf7f2 14098void BlueStore::log_latency_fn(
494da23a 14099 const char* name,
14100 int idx,
14101 const ceph::timespan& l,
14102 double lat_threshold,
14103 std::function<string (const ceph::timespan& lat)> fn) const
11fdf7f2 14104{
14105 logger->tinc(idx, l);
14106 if (lat_threshold > 0.0 &&
14107 l >= make_timespan(lat_threshold)) {
14108 dout(0) << __func__ << " slow operation observed for " << name
14109 << ", latency = " << l
14110 << fn(l)
14111 << dendl;
14112 }
14113}
14114
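// Editor's note: a hypothetical call site for the two latency helpers above
// (the counter index, oid, and threshold option shown are illustrative, not
// necessarily what BlueStore itself uses):
#if 0
auto start = mono_clock::now();
do_work();                                                // hypothetical
ceph::timespan lat = mono_clock::now() - start;
log_latency("example_op", l_bluestore_example_lat,        // hypothetical idx
            lat, cct->_conf->bluestore_log_op_age);
// the _fn flavor defers building the detail string until the threshold is
// known to have been exceeded:
log_latency_fn("example_op", l_bluestore_example_lat, lat,
               cct->_conf->bluestore_log_op_age,
               [&](const ceph::timespan& l) {
                 return ", oid = " + stringify(oid);      // hypothetical oid
               });
#endif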
14115
14116 // DB key/value histogram
14117#define KEY_SLAB 32
14118#define VALUE_SLAB 64
14119
14120const string prefix_onode = "o";
14121const string prefix_onode_shard = "x";
14122const string prefix_other = "Z";
14123
14124int BlueStore::DBHistogram::get_key_slab(size_t sz)
14125{
14126 return (sz/KEY_SLAB);
14127}
14128
14129string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
14130{
14131 int lower_bound = slab * KEY_SLAB;
14132 int upper_bound = (slab + 1) * KEY_SLAB;
14133 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
14134 return ret;
14135}
14136
14137int BlueStore::DBHistogram::get_value_slab(size_t sz)
14138{
14139 return (sz/VALUE_SLAB);
14140}
14141
14142string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
14143{
14144 int lower_bound = slab * VALUE_SLAB;
14145 int upper_bound = (slab + 1) * VALUE_SLAB;
14146 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
14147 return ret;
14148}
14149
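// Editor's note: worked example of the slab math above. With KEY_SLAB = 32,
// a 45-byte key falls in slab 45/32 = 1, reported as the range "[32,64)";
// with VALUE_SLAB = 64, a 200-byte value falls in slab 200/64 = 3, i.e.
// "[192,256)".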
14150void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
14151 const string &prefix, size_t key_size, size_t value_size)
14152{
14153 uint32_t key_slab = get_key_slab(key_size);
14154 uint32_t value_slab = get_value_slab(value_size);
14155 key_hist[prefix][key_slab].count++;
14156 key_hist[prefix][key_slab].max_len =
14157 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
14158 key_hist[prefix][key_slab].val_map[value_slab].count++;
14159 key_hist[prefix][key_slab].val_map[value_slab].max_len =
14160 std::max<size_t>(value_size,
14161 key_hist[prefix][key_slab].val_map[value_slab].max_len);
14162}
14163
14164void BlueStore::DBHistogram::dump(Formatter *f)
14165{
14166 f->open_object_section("rocksdb_value_distribution");
14167 for (auto i : value_hist) {
14168 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
14169 }
14170 f->close_section();
14171
14172 f->open_object_section("rocksdb_key_value_histogram");
14173 for (auto i : key_hist) {
14174 f->dump_string("prefix", i.first);
14175 f->open_object_section("key_hist");
14176 for (auto k : i.second) {
14177 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
14178 f->dump_unsigned("max_len", k.second.max_len);
14179 f->open_object_section("value_hist");
14180 for (auto j : k.second.val_map) {
14181 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
14182 f->dump_unsigned("max_len", j.second.max_len);
14183 }
14184 f->close_section();
14185 }
14186 f->close_section();
14187 }
14188 f->close_section();
14189}
14190
14191 // Iterates through the db and collects the stats.
14192void BlueStore::generate_db_histogram(Formatter *f)
14193{
14194 // globals
14195 uint64_t num_onodes = 0;
14196 uint64_t num_shards = 0;
14197 uint64_t num_super = 0;
14198 uint64_t num_coll = 0;
14199 uint64_t num_omap = 0;
11fdf7f2 14200 uint64_t num_pgmeta_omap = 0;
14201 uint64_t num_deferred = 0;
14202 uint64_t num_alloc = 0;
14203 uint64_t num_stat = 0;
14204 uint64_t num_others = 0;
14205 uint64_t num_shared_shards = 0;
14206 size_t max_key_size = 0, max_value_size = 0;
14207 uint64_t total_key_size = 0, total_value_size = 0;
14208 size_t key_size = 0, value_size = 0;
14209 DBHistogram hist;
14210
11fdf7f2 14211 auto start = coarse_mono_clock::now();
7c673cae 14212
11fdf7f2 14213 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
14214 iter->seek_to_first();
14215 while (iter->valid()) {
14216 dout(30) << __func__ << " Key: " << iter->key() << dendl;
14217 key_size = iter->key_size();
14218 value_size = iter->value_size();
14219 hist.value_hist[hist.get_value_slab(value_size)]++;
14220 max_key_size = std::max(max_key_size, key_size);
14221 max_value_size = std::max(max_value_size, value_size);
14222 total_key_size += key_size;
14223 total_value_size += value_size;
14224
14225 pair<string,string> key(iter->raw_key());
14226
14227 if (key.first == PREFIX_SUPER) {
14228 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
14229 num_super++;
14230 } else if (key.first == PREFIX_STAT) {
14231 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
14232 num_stat++;
14233 } else if (key.first == PREFIX_COLL) {
14234 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
14235 num_coll++;
14236 } else if (key.first == PREFIX_OBJ) {
14237 if (key.second.back() == ONODE_KEY_SUFFIX) {
14238 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
14239 num_onodes++;
14240 } else {
14241 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
14242 num_shards++;
14243 }
14244 } else if (key.first == PREFIX_OMAP) {
14245 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
14246 num_omap++;
11fdf7f2
TL
14247 } else if (key.first == PREFIX_PGMETA_OMAP) {
14248 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
14249 num_pgmeta_omap++;
14250 } else if (key.first == PREFIX_DEFERRED) {
14251 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
14252 num_deferred++;
11fdf7f2 14253 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
14254 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
14255 num_alloc++;
14256 } else if (key.first == PREFIX_SHARED_BLOB) {
14257 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
14258 num_shared_shards++;
14259 } else {
14260 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
14261 num_others++;
14262 }
14263 iter->next();
14264 }
14265
11fdf7f2 14266 ceph::timespan duration = coarse_mono_clock::now() - start;
14267 f->open_object_section("rocksdb_key_value_stats");
14268 f->dump_unsigned("num_onodes", num_onodes);
14269 f->dump_unsigned("num_shards", num_shards);
14270 f->dump_unsigned("num_super", num_super);
14271 f->dump_unsigned("num_coll", num_coll);
14272 f->dump_unsigned("num_omap", num_omap);
11fdf7f2 14273 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
14274 f->dump_unsigned("num_deferred", num_deferred);
14275 f->dump_unsigned("num_alloc", num_alloc);
14276 f->dump_unsigned("num_stat", num_stat);
14277 f->dump_unsigned("num_shared_shards", num_shared_shards);
14278 f->dump_unsigned("num_others", num_others);
14279 f->dump_unsigned("max_key_size", max_key_size);
14280 f->dump_unsigned("max_value_size", max_value_size);
14281 f->dump_unsigned("total_key_size", total_key_size);
14282 f->dump_unsigned("total_value_size", total_value_size);
14283 f->close_section();
14284
14285 hist.dump(f);
14286
14287 dout(20) << __func__ << " finished in " << duration << dendl;
14288
14289}
14290
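// Editor's note: a sketch of the Formatter output produced above
// (abbreviated; field names follow the dump_unsigned() calls and
// DBHistogram::dump()):
//
//   "rocksdb_key_value_stats": { "num_onodes": N, "num_shards": N, ...,
//                                "total_value_size": N },
//   "rocksdb_value_distribution": { "[0,64)": N, "[64,128)": N, ... },
//   "rocksdb_key_value_histogram": { "prefix": "o", "key_hist": { ... }, ... }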
31f18b77 14291void BlueStore::_flush_cache()
14292{
14293 dout(10) << __func__ << dendl;
14294 for (auto i : cache_shards) {
14295 i->trim_all();
11fdf7f2 14296 ceph_assert(i->empty());
14297 }
14298 for (auto& p : coll_map) {
3efd9988 14299 if (!p.second->onode_map.empty()) {
14300 derr << __func__ << " stray onodes on " << p.first << dendl;
14301 p.second->onode_map.dump<0>(cct);
14302 }
14303 if (!p.second->shared_blob_set.empty()) {
14304 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11fdf7f2 14305 p.second->shared_blob_set.dump<0>(cct);
3efd9988 14306 }
14307 ceph_assert(p.second->onode_map.empty());
14308 ceph_assert(p.second->shared_blob_set.empty());
14309 }
14310 coll_map.clear();
14311}
14312
14313 // For external callers.
14314 // Unlike _flush_cache(), this uses a best-effort policy: we don't care
14315 // if some pinned onodes/data remain in the cache after this command
14316 // completes.
11fdf7f2 14317int BlueStore::flush_cache(ostream *os)
14318{
14319 dout(10) << __func__ << dendl;
14320 for (auto i : cache_shards) {
14321 i->trim_all();
14322 }
14323
14324 return 0;
14325}
14326
14327void BlueStore::_apply_padding(uint64_t head_pad,
14328 uint64_t tail_pad,
14329 bufferlist& padded)
14330{
7c673cae 14331 if (head_pad) {
224ce89b 14332 padded.prepend_zero(head_pad);
14333 }
14334 if (tail_pad) {
14335 padded.append_zero(tail_pad);
14336 }
14337 if (head_pad || tail_pad) {
14338 dout(20) << __func__ << " padded head 0x" << std::hex << head_pad
14339 << " tail 0x" << tail_pad << std::dec << dendl;
14340 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
14341 }
14342}
14343
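// Editor's note: worked example for _apply_padding(). A write of 0x700
// bytes starting at offset 0x100 within a 0x1000-byte block needs
// head_pad = 0x100 and tail_pad = 0x800; the bufferlist grows from 0x700 to
// 0x1000 bytes and l_bluestore_write_pad_bytes is bumped by 0x900.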
14344void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
14345{
14346 // finalize extent_map shards
14347 o->extent_map.update(txn, false);
14348 if (o->extent_map.needs_reshard()) {
14349 o->extent_map.reshard(db, txn);
14350 o->extent_map.update(txn, true);
14351 if (o->extent_map.needs_reshard()) {
14352 dout(20) << __func__ << " warning: still wants reshard, check options?"
14353 << dendl;
14354 o->extent_map.clear_needs_reshard();
14355 }
14356 logger->inc(l_bluestore_onode_reshard);
14357 }
14358
14359 // bound encode
14360 size_t bound = 0;
14361 denc(o->onode, bound);
14362 o->extent_map.bound_encode_spanning_blobs(bound);
14363 if (o->onode.extent_map_shards.empty()) {
14364 denc(o->extent_map.inline_bl, bound);
14365 }
14366
14367 // encode
14368 bufferlist bl;
14369 unsigned onode_part, blob_part, extent_part;
14370 {
14371 auto p = bl.get_contiguous_appender(bound, true);
14372 denc(o->onode, p);
14373 onode_part = p.get_logical_offset();
14374 o->extent_map.encode_spanning_blobs(p);
14375 blob_part = p.get_logical_offset() - onode_part;
14376 if (o->onode.extent_map_shards.empty()) {
14377 denc(o->extent_map.inline_bl, p);
14378 }
14379 extent_part = p.get_logical_offset() - onode_part - blob_part;
14380 }
14381
14382 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
14383 << " (" << onode_part << " bytes onode + "
14384 << blob_part << " bytes spanning blobs + "
14385 << extent_part << " bytes inline extents)"
14386 << dendl;
14387
14388
14389 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
14390}
14391
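// Editor's note: the value written under PREFIX_OBJ above is one contiguous
// encode of three regions, in order:
//
//   [ onode_t | spanning blobs | inline extent map (only when unsharded) ]
//
// onode_part/blob_part/extent_part are just the byte sizes of those regions,
// measured for the debug print; a sharded extent map is persisted under
// separate shard keys by extent_map.update()/reshard() instead.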
14392void BlueStore::_log_alerts(osd_alert_list_t& alerts)
14393{
14394 std::lock_guard l(qlock);
14395
14396 if (!disk_size_mismatch_alert.empty()) {
14397 alerts.emplace(
14398 "BLUESTORE_DISK_SIZE_MISMATCH",
14399 disk_size_mismatch_alert);
14400 }
14401 if (!legacy_statfs_alert.empty()) {
14402 alerts.emplace(
14403 "BLUESTORE_LEGACY_STATFS",
14404 legacy_statfs_alert);
14405 }
14406 if (!spillover_alert.empty() &&
14407 cct->_conf->bluestore_warn_on_bluefs_spillover) {
14408 alerts.emplace(
14409 "BLUEFS_SPILLOVER",
14410 spillover_alert);
14411 }
14412 string s0(failed_cmode);
14413
14414 if (!failed_compressors.empty()) {
14415 if (!s0.empty()) {
14416 s0 += ", ";
14417 }
14418 s0 += "unable to load:";
14419 bool first = true;
14420 for (auto& s : failed_compressors) {
14421 if (first) {
14422 first = false;
14423 } else {
14424 s0 += ", ";
14425 }
14426 s0 += s;
14427 }
14428 alerts.emplace(
14429 "BLUESTORE_NO_COMPRESSION",
14430 s0);
14431 }
14432}
14433
7c673cae 14434// ===========================================
14435// BlueStoreRepairer
14436
14437size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
14438 const interval_set<uint64_t>& extents)
14439{
14440 ceph_assert(granularity); // initialized
14441 // can't be called a second time
14442 ceph_assert(!was_filtered_out);
14443 ceph_assert(collections_bfs.size() == objects_bfs.size());
14444
14445 uint64_t prev_pos = 0;
14446 uint64_t npos = collections_bfs.size();
14447
14448 bloom_vector collections_reduced;
14449 bloom_vector objects_reduced;
14450
14451 for (auto e : extents) {
14452 if (e.second == 0) {
14453 continue;
14454 }
14455 uint64_t pos = max(e.first / granularity, prev_pos);
14456 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
14457 while (pos != npos && pos < end_pos) {
14458 ceph_assert( collections_bfs[pos].element_count() ==
14459 objects_bfs[pos].element_count());
14460 if (collections_bfs[pos].element_count()) {
14461 collections_reduced.push_back(std::move(collections_bfs[pos]));
14462 objects_reduced.push_back(std::move(objects_bfs[pos]));
14463 }
14464 ++pos;
14465 }
14466 prev_pos = end_pos;
14467 }
14468 collections_reduced.swap(collections_bfs);
14469 objects_reduced.swap(objects_bfs);
14470 was_filtered_out = true;
14471 return collections_bfs.size();
14472}
14473
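// Editor's note: worked example for filter_out(). With granularity = 1 MiB
// (0x100000), an extent {offset = 0x500000, length = 0x200000} yields
// pos = 0x500000 / 0x100000 = 5 and
// end_pos = 1 + (0x6fffff / 0x100000) = 7 (exclusive), i.e. buckets 5 and 6;
// only bloom filters in touched, non-empty buckets survive the filtering.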
14474bool BlueStoreRepairer::remove_key(KeyValueDB *db,
14475 const string& prefix,
14476 const string& key)
14477{
14478 if (!remove_key_txn) {
14479 remove_key_txn = db->get_transaction();
14480 }
14481 ++to_repair_cnt;
14482 remove_key_txn->rmkey(prefix, key);
14483
14484 return true;
14485}
14486
14487bool BlueStoreRepairer::fix_shared_blob(
14488 KeyValueDB *db,
14489 uint64_t sbid,
14490 const bufferlist* bl)
14491{
14492 KeyValueDB::Transaction txn;
14493 if (fix_misreferences_txn) { // reuse this txn
14494 txn = fix_misreferences_txn;
14495 } else {
14496 if (!fix_shared_blob_txn) {
14497 fix_shared_blob_txn = db->get_transaction();
14498 }
14499 txn = fix_shared_blob_txn;
14500 }
14501 string key;
14502 get_shared_blob_key(sbid, &key);
14503
14504 ++to_repair_cnt;
14505 if (bl) {
14506 txn->set(PREFIX_SHARED_BLOB, key, *bl);
14507 } else {
14508 txn->rmkey(PREFIX_SHARED_BLOB, key);
14509 }
14510 return true;
14511}
14512
14513bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
14514 const string& key,
14515 const store_statfs_t& new_statfs)
14516{
14517 if (!fix_statfs_txn) {
14518 fix_statfs_txn = db->get_transaction();
14519 }
14520 BlueStore::volatile_statfs vstatfs;
14521 vstatfs = new_statfs;
14522 bufferlist bl;
14523 vstatfs.encode(bl);
14524 ++to_repair_cnt;
14525 fix_statfs_txn->set(PREFIX_STAT, key, bl);
14526 return true;
14527}
14528
14529bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
14530 FreelistManager* fm,
14531 uint64_t offset, uint64_t len)
14532{
14533 if (!fix_fm_leaked_txn) {
14534 fix_fm_leaked_txn = db->get_transaction();
14535 }
14536 ++to_repair_cnt;
14537 fm->release(offset, len, fix_fm_leaked_txn);
14538 return true;
14539}
14540bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
14541 FreelistManager* fm,
14542 uint64_t offset, uint64_t len)
14543{
14544 if (!fix_fm_false_free_txn) {
14545 fix_fm_false_free_txn = db->get_transaction();
14546 }
14547 ++to_repair_cnt;
14548 fm->allocate(offset, len, fix_fm_false_free_txn);
14549 return true;
14550}
14551
14552bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
14553{
14554 // this is just a stub to count the number of repairs properly;
14555 // the actual repair happens in BlueStore::_close_db_and_around()
14556 // while doing _sync_bluefs_and_fm
14557 ++out_of_sync_flag;
14558 ++to_repair_cnt;
14559 return true;
14560}
14561
14562bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
14563{
14564 if (misreferenced_extents.size()) {
14565 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
14566 ceph_assert(n > 0);
14567 if (!fix_misreferences_txn) {
14568 fix_misreferences_txn = db->get_transaction();
14569 }
14570 return true;
14571 }
14572 return false;
14573}
14574
14575unsigned BlueStoreRepairer::apply(KeyValueDB* db)
14576{
14577 if (fix_fm_leaked_txn) {
14578 db->submit_transaction_sync(fix_fm_leaked_txn);
14579 fix_fm_leaked_txn = nullptr;
14580 }
14581 if (fix_fm_false_free_txn) {
14582 db->submit_transaction_sync(fix_fm_false_free_txn);
14583 fix_fm_false_free_txn = nullptr;
14584 }
14585 if (remove_key_txn) {
14586 db->submit_transaction_sync(remove_key_txn);
14587 remove_key_txn = nullptr;
14588 }
14589 if (fix_misreferences_txn) {
14590 db->submit_transaction_sync(fix_misreferences_txn);
14591 fix_misreferences_txn = nullptr;
14592 }
14593 if (fix_shared_blob_txn) {
14594 db->submit_transaction_sync(fix_shared_blob_txn);
14595 fix_shared_blob_txn = nullptr;
14596 }
14597
14598 if (fix_statfs_txn) {
14599 db->submit_transaction_sync(fix_statfs_txn);
14600 fix_statfs_txn = nullptr;
14601 }
14602 unsigned repaired = to_repair_cnt;
14603 to_repair_cnt = 0;
14604 return repaired;
14605}
14606
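// Editor's note: a hypothetical fsck-style driver for BlueStoreRepairer,
// showing the intended call pattern (fixes are batched into per-category
// transactions, then submitted in dependency order by apply()). The
// found_stray_key/statfs_mismatch/leaked_* names are illustrative only:
#if 0
BlueStoreRepairer repairer;
if (found_stray_key)
  repairer.remove_key(db, PREFIX_SHARED_BLOB, stray_key);
if (statfs_mismatch)
  repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY, actual_statfs);
if (leaked_extent)
  repairer.fix_leaked(db, fm, leaked_offset, leaked_len);
unsigned repaired = repairer.apply(db);  // submits all pending txns
derr << repaired << " error(s) repaired" << dendl;
#endif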
14607// =======================================================
14608