// ceph/src/os/bluestore/BlueStore.cc
1// vim: ts=8 sw=2 smarttab
2/*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2014 Red Hat
6 *
7 * This is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License version 2.1, as published by the Free Software
10 * Foundation. See file COPYING.
11 *
12 */
13
14#include <unistd.h>
15#include <stdlib.h>
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <fcntl.h>
19
20#include "include/cpp-btree/btree_set.h"
21
22#include "BlueStore.h"
23#include "os/kv.h"
24#include "include/compat.h"
25#include "include/intarith.h"
26#include "include/stringify.h"
27#include "common/errno.h"
28#include "common/safe_io.h"
29#include "Allocator.h"
30#include "FreelistManager.h"
31#include "BlueFS.h"
32#include "BlueRocksEnv.h"
33#include "auth/Crypto.h"
34#include "common/EventTrace.h"
35
36#define dout_context cct
37#define dout_subsys ceph_subsys_bluestore
38
39using bid_t = decltype(BlueStore::Blob::id);
40
41// bluestore_cache_onode
 42MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
 43 bluestore_cache_onode);
 44
 45// bluestore_cache_other
 46MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
 47 bluestore_cache_other);
 48MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
 49 bluestore_cache_other);
 50MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
 51 bluestore_cache_other);
 52MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
53 bluestore_cache_other);
54
55// bluestore_txc
56MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
57 bluestore_txc);
58
59
60// kv store prefixes
61const string PREFIX_SUPER = "S"; // field -> value
62const string PREFIX_STAT = "T"; // field -> value(int64 array)
63const string PREFIX_COLL = "C"; // collection name -> cnode_t
64const string PREFIX_OBJ = "O"; // object name -> onode_t
65const string PREFIX_OMAP = "M"; // u64 + keyname -> value
66const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
67const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
68const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
69
70// write a label in the first block. always use this size. note that
71// bluefs makes a matching assumption about the location of its
72// superblock (always the second block of the device).
73#define BDEV_LABEL_BLOCK_SIZE 4096
74
75// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
76#define SUPER_RESERVED 8192
77
78#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
79
80
81/*
82 * extent map blob encoding
83 *
84 * we use the low bits of the blobid field to indicate some common scenarios
85 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
86 */
87#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
88#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
89#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
90#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
91#define BLOBID_SHIFT_BITS 4
92
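// Illustrative sketch (not part of the upstream file): how a decoder might
// split an encoded blobid into its flag bits and the blob id proper, using
// the flags and shift defined above. The helper name is hypothetical; see
// ExtentMap::{encode,decode}_some() for the real users of these bits.
static inline uint64_t example_decode_blobid(uint64_t encoded,
                                             bool *contiguous,
                                             bool *zero_offset,
                                             bool *same_length,
                                             bool *spanning)
{
  *contiguous  = (encoded & BLOBID_FLAG_CONTIGUOUS) != 0;
  *zero_offset = (encoded & BLOBID_FLAG_ZEROOFFSET) != 0;
  *same_length = (encoded & BLOBID_FLAG_SAMELENGTH) != 0;
  *spanning    = (encoded & BLOBID_FLAG_SPANNING) != 0;
  return encoded >> BLOBID_SHIFT_BITS;  // spanning blob id (or local index)
}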
93/*
94 * object name key structure
95 *
96 * encoded u8: shard + 2^7 (so that it sorts properly)
97 * encoded u64: poolid + 2^63 (so that it sorts properly)
98 * encoded u32: hash (bit reversed)
99 *
100 * escaped string: namespace
101 *
102 * escaped string: key or object name
103 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
104 * we are done. otherwise, we are followed by the object name.
105 * escaped string: object name (unless '=' above)
106 *
107 * encoded u64: snap
108 * encoded u64: generation
109 * 'o'
110 */
111#define ONODE_KEY_SUFFIX 'o'
112
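// Worked example (illustrative, not part of the upstream file): an object
// named "foo" with no locator key, an empty namespace, pool 1 and shard
// NO_SHARD encodes roughly as
//   <shard+0x80 : u8> <pool+2^63 : u64> <bit-reversed hash : u32>
//   "!"            (empty namespace, '!' terminator)
//   "foo!" "="     ('=' because the object key equals the object name)
//   <snap : u64> <generation : u64> 'o'
// so keys sort by shard, then pool, then reversed hash, namespace and name.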
113/*
114 * extent shard key
115 *
116 * object prefix key
117 * u32
118 * 'x'
119 */
120#define EXTENT_SHARD_KEY_SUFFIX 'x'
121
122/*
123 * string encoding in the key
124 *
125 * The key string needs to lexicographically sort the same way that
 126 * ghobject_t does. We do this by escaping anything <= '#' with # plus
 127 * a 2-digit hex string, and anything >= '~' with ~ plus the two hex
 128 * digits.
129 *
130 * We use ! as a terminator for strings; this works because it is < #
131 * and will get escaped if it is present in the string.
132 *
133 */
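// Example (illustrative): append_escaped("a#b~c", out) appends "a#23b~7ec!".
// '#' (0x23) and '~' (0x7e) become a two-character hex escape, everything in
// between passes through unchanged, and the trailing '!' terminator sorts
// below every byte that can appear in an escaped body, preserving ordering.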
134template<typename S>
135static void append_escaped(const string &in, S *out)
136{
137 char hexbyte[in.length() * 3 + 1];
138 char* ptr = &hexbyte[0];
139 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
140 if (*i <= '#') {
141 *ptr++ = '#';
142 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
143 *ptr++ = "0123456789abcdef"[*i & 0x0f];
 144 } else if (*i >= '~') {
 145 *ptr++ = '~';
 146 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
 147 *ptr++ = "0123456789abcdef"[*i & 0x0f];
 148 } else {
 149 *ptr++ = *i;
150 }
151 }
152 *ptr++ = '!';
153 out->append(hexbyte, ptr - &hexbyte[0]);
154}
155
156inline unsigned h2i(char c)
157{
158 if ((c >= '0') && (c <= '9')) {
159 return c - 0x30;
160 } else if ((c >= 'a') && (c <= 'f')) {
161 return c - 'a' + 10;
162 } else if ((c >= 'A') && (c <= 'F')) {
163 return c - 'A' + 10;
164 } else {
165 return 256; // make it always larger than 255
166 }
167}
168
169static int decode_escaped(const char *p, string *out)
170{
171 char buff[256];
172 char* ptr = &buff[0];
173 char* max = &buff[252];
174 const char *orig_p = p;
175 while (*p && *p != '!') {
176 if (*p == '#' || *p == '~') {
177 unsigned hex = 0;
178 p++;
179 hex = h2i(*p++) << 4;
180 if (hex > 255) {
181 return -EINVAL;
182 }
183 hex |= h2i(*p++);
184 if (hex > 255) {
185 return -EINVAL;
186 }
187 *ptr++ = hex;
 188 } else {
189 *ptr++ = *p++;
190 }
191 if (ptr > max) {
192 out->append(buff, ptr-buff);
193 ptr = &buff[0];
194 }
195 }
196 if (ptr != buff) {
197 out->append(buff, ptr-buff);
198 }
199 return p - orig_p;
200}
201
202// some things we encode in binary (as le32 or le64); print the
203// resulting key strings nicely
204template<typename S>
205static string pretty_binary_string(const S& in)
206{
207 char buf[10];
208 string out;
209 out.reserve(in.length() * 3);
210 enum { NONE, HEX, STRING } mode = NONE;
211 unsigned from = 0, i;
212 for (i=0; i < in.length(); ++i) {
213 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
214 (mode == HEX && in.length() - i >= 4 &&
215 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
216 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
217 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
218 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
219 if (mode == STRING) {
220 out.append(in.c_str() + from, i - from);
221 out.push_back('\'');
222 }
223 if (mode != HEX) {
224 out.append("0x");
225 mode = HEX;
226 }
227 if (in.length() - i >= 4) {
228 // print a whole u32 at once
229 snprintf(buf, sizeof(buf), "%08x",
230 (uint32_t)(((unsigned char)in[i] << 24) |
231 ((unsigned char)in[i+1] << 16) |
232 ((unsigned char)in[i+2] << 8) |
233 ((unsigned char)in[i+3] << 0)));
234 i += 3;
235 } else {
236 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
237 }
238 out.append(buf);
239 } else {
240 if (mode != STRING) {
241 out.push_back('\'');
242 mode = STRING;
243 from = i;
244 }
245 }
246 }
247 if (mode == STRING) {
248 out.append(in.c_str() + from, i - from);
249 out.push_back('\'');
250 }
251 return out;
252}
253
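// Example (illustrative): pretty_binary_string() on the bytes
// {0x02, 0x03, 0x04, 0x05, 'f', 'o', 'o'} yields "0x02030405'foo'" --
// non-printable runs are grouped as hex (a whole u32 at a time when
// possible) and printable runs are quoted.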
254template<typename T>
255static void _key_encode_shard(shard_id_t shard, T *key)
256{
257 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
258}
259
260static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
261{
262 pshard->id = (uint8_t)*key - (uint8_t)0x80;
263 return key + 1;
264}
265
266static void get_coll_key_range(const coll_t& cid, int bits,
267 string *temp_start, string *temp_end,
268 string *start, string *end)
269{
270 temp_start->clear();
271 temp_end->clear();
272 start->clear();
273 end->clear();
274
275 spg_t pgid;
276 if (cid.is_pg(&pgid)) {
277 _key_encode_shard(pgid.shard, start);
278 *temp_start = *start;
279
280 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
281 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
282
283 *end = *start;
284 *temp_end = *temp_start;
285
286 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
287 _key_encode_u32(reverse_hash, start);
288 _key_encode_u32(reverse_hash, temp_start);
289
290 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
291 if (end_hash > 0xffffffffull)
292 end_hash = 0xffffffffull;
293
294 _key_encode_u32(end_hash, end);
295 _key_encode_u32(end_hash, temp_end);
296 } else {
297 _key_encode_shard(shard_id_t::NO_SHARD, start);
298 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
299 *end = *start;
300 _key_encode_u32(0, start);
301 _key_encode_u32(0xffffffff, end);
302
303 // no separate temp section
304 *temp_start = *end;
305 *temp_end = *end;
306 }
307}
308
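// Worked example (illustrative): for a pg collection with bits = 4 the range
// covers 1/2^4 of the 32-bit reversed-hash space, i.e. end_hash =
// reverse_hash + (1 << 28), clamped to 0xffffffff. The temp range is encoded
// with pool id (-2 - pool), so temp objects sort in a separate key region.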
309static void get_shared_blob_key(uint64_t sbid, string *key)
310{
311 key->clear();
312 _key_encode_u64(sbid, key);
313}
314
315static int get_key_shared_blob(const string& key, uint64_t *sbid)
316{
317 const char *p = key.c_str();
318 if (key.length() < sizeof(uint64_t))
319 return -1;
 320 _key_decode_u64(p, sbid);
321 return 0;
322}
323
324template<typename S>
325static int get_key_object(const S& key, ghobject_t *oid)
326{
327 int r;
328 const char *p = key.c_str();
329
330 if (key.length() < 1 + 8 + 4)
331 return -1;
332 p = _key_decode_shard(p, &oid->shard_id);
333
334 uint64_t pool;
335 p = _key_decode_u64(p, &pool);
336 oid->hobj.pool = pool - 0x8000000000000000ull;
337
338 unsigned hash;
339 p = _key_decode_u32(p, &hash);
340
341 oid->hobj.set_bitwise_key_u32(hash);
342
343 r = decode_escaped(p, &oid->hobj.nspace);
344 if (r < 0)
345 return -2;
346 p += r + 1;
347
348 string k;
349 r = decode_escaped(p, &k);
350 if (r < 0)
351 return -3;
352 p += r + 1;
353 if (*p == '=') {
354 // no key
355 ++p;
356 oid->hobj.oid.name = k;
357 } else if (*p == '<' || *p == '>') {
358 // key + name
359 ++p;
360 r = decode_escaped(p, &oid->hobj.oid.name);
361 if (r < 0)
362 return -5;
363 p += r + 1;
364 oid->hobj.set_key(k);
365 } else {
366 // malformed
367 return -6;
368 }
369
370 p = _key_decode_u64(p, &oid->hobj.snap.val);
371 p = _key_decode_u64(p, &oid->generation);
372
373 if (*p != ONODE_KEY_SUFFIX) {
374 return -7;
375 }
376 p++;
377 if (*p) {
378 // if we get something other than a null terminator here,
 379 // something is wrong.
380 return -8;
381 }
382
383 return 0;
384}
385
386template<typename S>
387static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
388{
389 key->clear();
390
391 size_t max_len = 1 + 8 + 4 +
392 (oid.hobj.nspace.length() * 3 + 1) +
393 (oid.hobj.get_key().length() * 3 + 1) +
394 1 + // for '<', '=', or '>'
395 (oid.hobj.oid.name.length() * 3 + 1) +
396 8 + 8 + 1;
397 key->reserve(max_len);
398
399 _key_encode_shard(oid.shard_id, key);
400 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
401 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
402
403 append_escaped(oid.hobj.nspace, key);
404
405 if (oid.hobj.get_key().length()) {
406 // is a key... could be < = or >.
407 append_escaped(oid.hobj.get_key(), key);
408 // (ASCII chars < = and > sort in that order, yay)
409 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
410 if (r) {
411 key->append(r > 0 ? ">" : "<");
412 append_escaped(oid.hobj.oid.name, key);
413 } else {
414 // same as no key
415 key->append("=");
416 }
417 } else {
418 // no key
419 append_escaped(oid.hobj.oid.name, key);
420 key->append("=");
421 }
422
423 _key_encode_u64(oid.hobj.snap, key);
424 _key_encode_u64(oid.generation, key);
425
426 key->push_back(ONODE_KEY_SUFFIX);
427
428 // sanity check
429 if (true) {
430 ghobject_t t;
431 int r = get_key_object(*key, &t);
432 if (r || t != oid) {
433 derr << " r " << r << dendl;
434 derr << "key " << pretty_binary_string(*key) << dendl;
435 derr << "oid " << oid << dendl;
436 derr << " t " << t << dendl;
437 assert(r == 0 && t == oid);
438 }
439 }
440}
441
442
443// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
444// char lets us quickly test whether it is a shard key without decoding any
445// of the prefix bytes.
446template<typename S>
447static void get_extent_shard_key(const S& onode_key, uint32_t offset,
448 string *key)
449{
450 key->clear();
451 key->reserve(onode_key.length() + 4 + 1);
452 key->append(onode_key.c_str(), onode_key.size());
453 _key_encode_u32(offset, key);
454 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
455}
456
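// Example (illustrative): for an onode key K and a shard starting at logical
// offset 0x30000, the shard key is K + <0x30000 as u32> + 'x'. Since the
// suffix is the last byte, is_extent_shard_key() only has to look at the
// final character, and rewrite_extent_shard_key() below can patch the u32 in
// place without re-encoding the onode prefix.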
457static void rewrite_extent_shard_key(uint32_t offset, string *key)
458{
459 assert(key->size() > sizeof(uint32_t) + 1);
460 assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
461 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
462}
463
464template<typename S>
465static void generate_extent_shard_key_and_apply(
466 const S& onode_key,
467 uint32_t offset,
468 string *key,
469 std::function<void(const string& final_key)> apply)
470{
471 if (key->empty()) { // make full key
472 assert(!onode_key.empty());
473 get_extent_shard_key(onode_key, offset, key);
474 } else {
475 rewrite_extent_shard_key(offset, key);
476 }
477 apply(*key);
478}
479
480int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
481{
482 assert(key.size() > sizeof(uint32_t) + 1);
483 assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
484 int okey_len = key.size() - sizeof(uint32_t) - 1;
485 *onode_key = key.substr(0, okey_len);
486 const char *p = key.data() + okey_len;
 487 _key_decode_u32(p, offset);
488 return 0;
489}
490
491static bool is_extent_shard_key(const string& key)
492{
493 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
494}
495
496// '-' < '.' < '~'
497static void get_omap_header(uint64_t id, string *out)
498{
499 _key_encode_u64(id, out);
500 out->push_back('-');
501}
502
503// hmm, I don't think there's any need to escape the user key since we
504// have a clean prefix.
505static void get_omap_key(uint64_t id, const string& key, string *out)
506{
507 _key_encode_u64(id, out);
508 out->push_back('.');
509 out->append(key);
510}
511
512static void rewrite_omap_key(uint64_t id, string old, string *out)
513{
514 _key_encode_u64(id, out);
515 out->append(old.c_str() + out->length(), old.size() - out->length());
516}
517
518static void decode_omap_key(const string& key, string *user_key)
519{
520 *user_key = key.substr(sizeof(uint64_t) + 1);
521}
522
523static void get_omap_tail(uint64_t id, string *out)
524{
525 _key_encode_u64(id, out);
526 out->push_back('~');
527}
528
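// Example (illustrative): for omap id 7 the keys sort as
//   <7 : u64> '-'               (header)
//   <7 : u64> '.' <user key>    (one entry per user key)
//   <7 : u64> '~'               (tail)
// so a single iterator sweep from header to tail visits exactly one object's
// omap, and the user key needs no escaping.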
529static void get_deferred_key(uint64_t seq, string *out)
530{
531 _key_encode_u64(seq, out);
532}
533
534
535// merge operators
536
537struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
538 void merge_nonexistent(
539 const char *rdata, size_t rlen, std::string *new_value) override {
540 *new_value = std::string(rdata, rlen);
541 }
542 void merge(
543 const char *ldata, size_t llen,
544 const char *rdata, size_t rlen,
545 std::string *new_value) override {
546 assert(llen == rlen);
547 assert((rlen % 8) == 0);
548 new_value->resize(rlen);
549 const __le64* lv = (const __le64*)ldata;
550 const __le64* rv = (const __le64*)rdata;
551 __le64* nv = &(__le64&)new_value->at(0);
552 for (size_t i = 0; i < rlen >> 3; ++i) {
553 nv[i] = lv[i] + rv[i];
554 }
555 }
556 // We use each operator name and each prefix to construct the
557 // overall RocksDB operator name for consistency check at open time.
558 string name() const override {
559 return "int64_array";
560 }
561};
562
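// Example (illustrative): merging the operand {1, 2, 3} onto an existing
// value {10, 0, -1} (both stored as arrays of le64) yields {11, 2, 2}, and
// merge_nonexistent() simply adopts the operand. This lets int64 counter
// arrays (such as the values kept under PREFIX_STAT) be accumulated with
// RocksDB merge operands instead of read-modify-write cycles.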
563
564// Buffer
565
566ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
567{
568 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
569 << b.offset << "~" << b.length << std::dec
570 << " " << BlueStore::Buffer::get_state_name(b.state);
571 if (b.flags)
572 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
573 return out << ")";
574}
575
576// Garbage Collector
577
578void BlueStore::GarbageCollector::process_protrusive_extents(
579 const BlueStore::ExtentMap& extent_map,
580 uint64_t start_offset,
581 uint64_t end_offset,
582 uint64_t start_touch_offset,
583 uint64_t end_touch_offset,
584 uint64_t min_alloc_size)
585{
 586 assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
587
588 uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
589 uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);
590
591 dout(30) << __func__ << " (hex): [" << std::hex
592 << lookup_start_offset << ", " << lookup_end_offset
593 << ")" << std::dec << dendl;
594
595 for (auto it = extent_map.seek_lextent(lookup_start_offset);
596 it != extent_map.extent_map.end() &&
597 it->logical_offset < lookup_end_offset;
598 ++it) {
599 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
600 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
601
602 dout(30) << __func__ << " " << *it
 603 << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
604 << dendl;
605
606 Blob* b = it->blob.get();
607
 608 if (it->logical_offset >= start_touch_offset &&
609 it->logical_end() <= end_touch_offset) {
610 // Process extents within the range affected by
611 // the current write request.
612 // Need to take into account if existing extents
613 // can be merged with them (uncompressed case)
614 if (!b->get_blob().is_compressed()) {
615 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
616 --blob_info_counted->expected_allocations; // don't need to allocate
617 // new AU for compressed
618 // data since another
619 // collocated uncompressed
620 // blob already exists
621 dout(30) << __func__ << " --expected:"
622 << alloc_unit_start << dendl;
623 }
624 used_alloc_unit = alloc_unit_end;
625 blob_info_counted = nullptr;
626 }
627 } else if (b->get_blob().is_compressed()) {
628
629 // additionally we take compressed blobs that were not impacted
630 // by the write into account too
631 BlobInfo& bi =
632 affected_blobs.emplace(
633 b, BlobInfo(b->get_referenced_bytes())).first->second;
634
635 int adjust =
636 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
637 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
638 dout(30) << __func__ << " expected_allocations="
639 << bi.expected_allocations << " end_au:"
640 << alloc_unit_end << dendl;
641
642 blob_info_counted = &bi;
643 used_alloc_unit = alloc_unit_end;
644
645 assert(it->length <= bi.referenced_bytes);
646 bi.referenced_bytes -= it->length;
647 dout(30) << __func__ << " affected_blob:" << *b
648 << " unref 0x" << std::hex << it->length
649 << " referenced = 0x" << bi.referenced_bytes
650 << std::dec << dendl;
 651 // NOTE: we can't move a specific blob to the resulting GC list here
 652 // when its reference counter reaches 0, since subsequent extents might
 653 // still decrement its expected_allocations.
 654 // Hence we need to enumerate all the extents first.
655 if (!bi.collect_candidate) {
656 bi.first_lextent = it;
657 bi.collect_candidate = true;
658 }
659 bi.last_lextent = it;
660 } else {
661 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
662 // don't need to allocate new AU for compressed data since another
663 // collocated uncompressed blob already exists
664 --blob_info_counted->expected_allocations;
665 dout(30) << __func__ << " --expected_allocations:"
666 << alloc_unit_start << dendl;
667 }
668 used_alloc_unit = alloc_unit_end;
669 blob_info_counted = nullptr;
670 }
671 }
672
673 for (auto b_it = affected_blobs.begin();
674 b_it != affected_blobs.end();
675 ++b_it) {
676 Blob* b = b_it->first;
677 BlobInfo& bi = b_it->second;
678 if (bi.referenced_bytes == 0) {
679 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
680 int64_t blob_expected_for_release =
681 ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;
682
683 dout(30) << __func__ << " " << *(b_it->first)
684 << " expected4release=" << blob_expected_for_release
685 << " expected_allocations=" << bi.expected_allocations
686 << dendl;
687 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
688 if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
689 if (bi.collect_candidate) {
690 auto it = bi.first_lextent;
691 bool bExit = false;
692 do {
693 if (it->blob.get() == b) {
694 extents_to_collect.emplace_back(it->logical_offset, it->length);
695 }
696 bExit = it == bi.last_lextent;
697 ++it;
 698 } while (!bExit);
699 }
700 expected_for_release += blob_expected_for_release;
701 expected_allocations += bi.expected_allocations;
702 }
703 }
704 }
705}
706
707int64_t BlueStore::GarbageCollector::estimate(
708 uint64_t start_offset,
709 uint64_t length,
710 const BlueStore::ExtentMap& extent_map,
711 const BlueStore::old_extent_map_t& old_extents,
712 uint64_t min_alloc_size)
713{
714
715 affected_blobs.clear();
716 extents_to_collect.clear();
717 used_alloc_unit = boost::optional<uint64_t >();
718 blob_info_counted = nullptr;
719
720 gc_start_offset = start_offset;
721 gc_end_offset = start_offset + length;
722
723 uint64_t end_offset = start_offset + length;
724
725 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
726 Blob* b = it->e.blob.get();
727 if (b->get_blob().is_compressed()) {
728
729 // update gc_start_offset/gc_end_offset if needed
730 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
731 gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());
732
733 auto o = it->e.logical_offset;
734 auto l = it->e.length;
735
736 uint64_t ref_bytes = b->get_referenced_bytes();
737 // micro optimization to bypass blobs that have no more references
738 if (ref_bytes != 0) {
739 dout(30) << __func__ << " affected_blob:" << *b
740 << " unref 0x" << std::hex << o << "~" << l
741 << std::dec << dendl;
742 affected_blobs.emplace(b, BlobInfo(ref_bytes));
743 }
744 }
745 }
746 dout(30) << __func__ << " gc range(hex): [" << std::hex
747 << gc_start_offset << ", " << gc_end_offset
748 << ")" << std::dec << dendl;
749
 750 // enumerate preceding extents to check if they reference affected blobs
751 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
752 process_protrusive_extents(extent_map,
753 gc_start_offset,
754 gc_end_offset,
755 start_offset,
756 end_offset,
757 min_alloc_size);
758 }
759 return expected_for_release - expected_allocations;
760}
761
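// Worked example (illustrative): a compressed blob occupying 64K on disk with
// min_alloc_size = 16K would release ROUND_UP_TO(64K, 16K) / 16K = 4
// allocation units if collected; if rewriting its still-referenced data is
// expected to cost 1 new allocation unit, its benefit is 4 - 1 = 3, and it is
// queued once that benefit reaches bluestore_gc_enable_blob_threshold.
// estimate() returns the net expected_for_release - expected_allocations
// accumulated over the qualifying blobs.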
762// Cache
763
764BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
765 PerfCounters *logger)
766{
767 Cache *c = nullptr;
768
769 if (type == "lru")
770 c = new LRUCache(cct);
771 else if (type == "2q")
772 c = new TwoQCache(cct);
773 else
774 assert(0 == "unrecognized cache type");
775
776 c->logger = logger;
777 return c;
778}
779
780void BlueStore::Cache::trim_all()
781{
782 std::lock_guard<std::recursive_mutex> l(lock);
783 _trim(0, 0);
784}
785
786void BlueStore::Cache::trim(
787 uint64_t target_bytes,
788 float target_meta_ratio,
 789 float target_data_ratio,
790 float bytes_per_onode)
791{
792 std::lock_guard<std::recursive_mutex> l(lock);
793 uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
794 uint64_t current_buffer = _get_buffer_bytes();
795 uint64_t current = current_meta + current_buffer;
796
797 uint64_t target_meta = target_bytes * target_meta_ratio;
798 uint64_t target_buffer = target_bytes * target_data_ratio;
 799
800 // correct for overflow or float imprecision
801 target_meta = min(target_bytes, target_meta);
802 target_buffer = min(target_bytes - target_meta, target_buffer);
803
804 if (current <= target_bytes) {
805 dout(10) << __func__
806 << " shard target " << pretty_si_t(target_bytes)
807 << " meta/data ratios " << target_meta_ratio
808 << " + " << target_data_ratio << " ("
809 << pretty_si_t(target_meta) << " + "
810 << pretty_si_t(target_buffer) << "), "
811 << " current " << pretty_si_t(current) << " ("
812 << pretty_si_t(current_meta) << " + "
813 << pretty_si_t(current_buffer) << ")"
814 << dendl;
815 return;
816 }
817
818 uint64_t need_to_free = current - target_bytes;
819 uint64_t free_buffer = 0;
820 uint64_t free_meta = 0;
821 if (current_buffer > target_buffer) {
822 free_buffer = current_buffer - target_buffer;
823 if (free_buffer > need_to_free) {
824 free_buffer = need_to_free;
825 }
826 }
827 free_meta = need_to_free - free_buffer;
828
829 // start bounds at what we have now
830 uint64_t max_buffer = current_buffer - free_buffer;
831 uint64_t max_meta = current_meta - free_meta;
832 uint64_t max_onodes = max_meta / bytes_per_onode;
833
834 dout(10) << __func__
835 << " shard target " << pretty_si_t(target_bytes)
836 << " ratio " << target_meta_ratio << " ("
837 << pretty_si_t(target_meta) << " + "
838 << pretty_si_t(target_buffer) << "), "
839 << " current " << pretty_si_t(current) << " ("
840 << pretty_si_t(current_meta) << " + "
841 << pretty_si_t(current_buffer) << "),"
842 << " need_to_free " << pretty_si_t(need_to_free) << " ("
843 << pretty_si_t(free_meta) << " + "
844 << pretty_si_t(free_buffer) << ")"
845 << " -> max " << max_onodes << " onodes + "
846 << max_buffer << " buffer"
847 << dendl;
848 _trim(max_onodes, max_buffer);
849}
850
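// Worked example (illustrative): with target_bytes = 1024 MiB, a 0.5/0.5
// meta/data split, current_meta = 700 MiB and current_buffer = 600 MiB:
// need_to_free = 1300 - 1024 = 276 MiB. Buffers exceed their 512 MiB target
// by 88 MiB, so free_buffer = 88 MiB and the remaining free_meta = 188 MiB
// comes out of onode metadata; _trim() then gets the surviving buffer byte
// budget plus an onode count derived via bytes_per_onode.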
851
852// LRUCache
853#undef dout_prefix
854#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
855
856void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
857{
858 auto p = onode_lru.iterator_to(*o);
859 onode_lru.erase(p);
860 onode_lru.push_front(*o);
861}
862
863void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
864{
865 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
866 << " buffers " << buffer_size << " / " << buffer_max
867 << dendl;
868
869 _audit("trim start");
870
871 // buffers
872 while (buffer_size > buffer_max) {
873 auto i = buffer_lru.rbegin();
874 if (i == buffer_lru.rend()) {
875 // stop if buffer_lru is now empty
876 break;
877 }
878
879 Buffer *b = &*i;
880 assert(b->is_clean());
881 dout(20) << __func__ << " rm " << *b << dendl;
882 b->space->_rm_buffer(this, b);
883 }
884
885 // onodes
886 int num = onode_lru.size() - onode_max;
887 if (num <= 0)
888 return; // don't even try
889
890 auto p = onode_lru.end();
891 assert(p != onode_lru.begin());
892 --p;
893 int skipped = 0;
894 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
895 while (num > 0) {
896 Onode *o = &*p;
897 int refs = o->nref.load();
898 if (refs > 1) {
899 dout(20) << __func__ << " " << o->oid << " has " << refs
900 << " refs, skipping" << dendl;
901 if (++skipped >= max_skipped) {
902 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
903 << num << " left to trim" << dendl;
904 break;
905 }
906
907 if (p == onode_lru.begin()) {
908 break;
909 } else {
910 p--;
911 num--;
912 continue;
913 }
914 }
915 dout(30) << __func__ << " rm " << o->oid << dendl;
916 if (p != onode_lru.begin()) {
917 onode_lru.erase(p--);
918 } else {
919 onode_lru.erase(p);
920 assert(num == 1);
921 }
922 o->get(); // paranoia
923 o->c->onode_map.remove(o->oid);
924 o->put();
925 --num;
926 }
927}
928
929#ifdef DEBUG_CACHE
930void BlueStore::LRUCache::_audit(const char *when)
931{
932 dout(10) << __func__ << " " << when << " start" << dendl;
933 uint64_t s = 0;
934 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
935 s += i->length;
936 }
937 if (s != buffer_size) {
938 derr << __func__ << " buffer_size " << buffer_size << " actual " << s
939 << dendl;
940 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
941 derr << __func__ << " " << *i << dendl;
942 }
943 assert(s == buffer_size);
944 }
945 dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
946 << " ok" << dendl;
947}
948#endif
949
950// TwoQCache
951#undef dout_prefix
952#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
953
954
955void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
956{
957 auto p = onode_lru.iterator_to(*o);
958 onode_lru.erase(p);
959 onode_lru.push_front(*o);
960}
961
962void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
963{
964 dout(20) << __func__ << " level " << level << " near " << near
965 << " on " << *b
966 << " which has cache_private " << b->cache_private << dendl;
967 if (near) {
968 b->cache_private = near->cache_private;
969 switch (b->cache_private) {
970 case BUFFER_WARM_IN:
971 buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
972 break;
973 case BUFFER_WARM_OUT:
974 assert(b->is_empty());
975 buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
976 break;
977 case BUFFER_HOT:
978 buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
979 break;
980 default:
981 assert(0 == "bad cache_private");
982 }
983 } else if (b->cache_private == BUFFER_NEW) {
984 b->cache_private = BUFFER_WARM_IN;
985 if (level > 0) {
986 buffer_warm_in.push_front(*b);
987 } else {
988 // take caller hint to start at the back of the warm queue
989 buffer_warm_in.push_back(*b);
990 }
991 } else {
992 // we got a hint from discard
993 switch (b->cache_private) {
994 case BUFFER_WARM_IN:
995 // stay in warm_in. move to front, even though 2Q doesn't actually
996 // do this.
997 dout(20) << __func__ << " move to front of warm " << *b << dendl;
998 buffer_warm_in.push_front(*b);
999 break;
1000 case BUFFER_WARM_OUT:
1001 b->cache_private = BUFFER_HOT;
1002 // move to hot. fall-thru
1003 case BUFFER_HOT:
1004 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1005 buffer_hot.push_front(*b);
1006 break;
1007 default:
1008 assert(0 == "bad cache_private");
1009 }
1010 }
1011 if (!b->is_empty()) {
1012 buffer_bytes += b->length;
1013 buffer_list_bytes[b->cache_private] += b->length;
1014 }
1015}
1016
1017void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
1018{
1019 dout(20) << __func__ << " " << *b << dendl;
1020 if (!b->is_empty()) {
1021 assert(buffer_bytes >= b->length);
1022 buffer_bytes -= b->length;
1023 assert(buffer_list_bytes[b->cache_private] >= b->length);
1024 buffer_list_bytes[b->cache_private] -= b->length;
1025 }
1026 switch (b->cache_private) {
1027 case BUFFER_WARM_IN:
1028 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1029 break;
1030 case BUFFER_WARM_OUT:
1031 buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
1032 break;
1033 case BUFFER_HOT:
1034 buffer_hot.erase(buffer_hot.iterator_to(*b));
1035 break;
1036 default:
1037 assert(0 == "bad cache_private");
1038 }
1039}
1040
1041void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
1042{
1043 TwoQCache *src = static_cast<TwoQCache*>(srcc);
1044 src->_rm_buffer(b);
1045
1046 // preserve which list we're on (even if we can't preserve the order!)
1047 switch (b->cache_private) {
1048 case BUFFER_WARM_IN:
1049 assert(!b->is_empty());
1050 buffer_warm_in.push_back(*b);
1051 break;
1052 case BUFFER_WARM_OUT:
1053 assert(b->is_empty());
1054 buffer_warm_out.push_back(*b);
1055 break;
1056 case BUFFER_HOT:
1057 assert(!b->is_empty());
1058 buffer_hot.push_back(*b);
1059 break;
1060 default:
1061 assert(0 == "bad cache_private");
1062 }
1063 if (!b->is_empty()) {
1064 buffer_bytes += b->length;
1065 buffer_list_bytes[b->cache_private] += b->length;
1066 }
1067}
1068
1069void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
1070{
1071 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1072 if (!b->is_empty()) {
1073 assert((int64_t)buffer_bytes + delta >= 0);
1074 buffer_bytes += delta;
1075 assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
1076 buffer_list_bytes[b->cache_private] += delta;
1077 }
1078}
1079
1080void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
1081{
1082 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
1083 << " buffers " << buffer_bytes << " / " << buffer_max
1084 << dendl;
1085
1086 _audit("trim start");
1087
1088 // buffers
1089 if (buffer_bytes > buffer_max) {
1090 uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
1091 uint64_t khot = buffer_max - kin;
1092
 1093 // pre-calculate kout based on the average buffer size too, which is
 1094 // typically representative (the warm_in and hot lists may change later)
1095 uint64_t kout = 0;
1096 uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
1097 if (buffer_num) {
1098 uint64_t buffer_avg_size = buffer_bytes / buffer_num;
1099 assert(buffer_avg_size);
1100 uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
1101 kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1102 }
1103
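    // Worked example (illustrative): with buffer_max = 100 MiB and
    // bluestore_2q_cache_kin_ratio = 0.5, kin = khot = 50 MiB. If the cache
    // currently holds 2 MiB across 1024 buffers (2 KiB average),
    // calculated_buffer_num = 100 MiB / 2 KiB = 51200, so a
    // bluestore_2q_cache_kout_ratio of 0.5 caps the warm_out ghost list at
    // kout = 25600 entries.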
1104 if (buffer_list_bytes[BUFFER_HOT] < khot) {
1105 // hot is small, give slack to warm_in
1106 kin += khot - buffer_list_bytes[BUFFER_HOT];
1107 } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
1108 // warm_in is small, give slack to hot
1109 khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
1110 }
1111
1112 // adjust warm_in list
1113 int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
1114 uint64_t evicted = 0;
1115
1116 while (to_evict_bytes > 0) {
1117 auto p = buffer_warm_in.rbegin();
1118 if (p == buffer_warm_in.rend()) {
1119 // stop if warm_in list is now empty
1120 break;
1121 }
1122
1123 Buffer *b = &*p;
1124 assert(b->is_clean());
1125 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1126 assert(buffer_bytes >= b->length);
1127 buffer_bytes -= b->length;
1128 assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
1129 buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
1130 to_evict_bytes -= b->length;
1131 evicted += b->length;
1132 b->state = Buffer::STATE_EMPTY;
1133 b->data.clear();
1134 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1135 buffer_warm_out.push_front(*b);
1136 b->cache_private = BUFFER_WARM_OUT;
1137 }
1138
1139 if (evicted > 0) {
1140 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1141 << " from warm_in list, done evicting warm_in buffers"
1142 << dendl;
1143 }
1144
1145 // adjust hot list
1146 to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
1147 evicted = 0;
1148
1149 while (to_evict_bytes > 0) {
1150 auto p = buffer_hot.rbegin();
1151 if (p == buffer_hot.rend()) {
1152 // stop if hot list is now empty
1153 break;
1154 }
1155
1156 Buffer *b = &*p;
1157 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1158 assert(b->is_clean());
1159 // adjust evict size before buffer goes invalid
1160 to_evict_bytes -= b->length;
1161 evicted += b->length;
1162 b->space->_rm_buffer(this, b);
1163 }
1164
1165 if (evicted > 0) {
1166 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1167 << " from hot list, done evicting hot buffers"
1168 << dendl;
1169 }
1170
1171 // adjust warm out list too, if necessary
1172 int64_t num = buffer_warm_out.size() - kout;
1173 while (num-- > 0) {
1174 Buffer *b = &*buffer_warm_out.rbegin();
1175 assert(b->is_empty());
1176 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1177 b->space->_rm_buffer(this, b);
1178 }
1179 }
1180
1181 // onodes
1182 int num = onode_lru.size() - onode_max;
1183 if (num <= 0)
1184 return; // don't even try
1185
1186 auto p = onode_lru.end();
1187 assert(p != onode_lru.begin());
1188 --p;
1189 int skipped = 0;
1190 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
1191 while (num > 0) {
1192 Onode *o = &*p;
1193 dout(20) << __func__ << " considering " << o << dendl;
1194 int refs = o->nref.load();
1195 if (refs > 1) {
1196 dout(20) << __func__ << " " << o->oid << " has " << refs
1197 << " refs; skipping" << dendl;
1198 if (++skipped >= max_skipped) {
1199 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
1200 << num << " left to trim" << dendl;
1201 break;
1202 }
1203
1204 if (p == onode_lru.begin()) {
1205 break;
1206 } else {
1207 p--;
1208 num--;
1209 continue;
1210 }
1211 }
 1212 dout(30) << __func__ << " " << o->oid << " num=" << num << " lru size=" << onode_lru.size() << dendl;
1213 if (p != onode_lru.begin()) {
1214 onode_lru.erase(p--);
1215 } else {
1216 onode_lru.erase(p);
1217 assert(num == 1);
1218 }
1219 o->get(); // paranoia
1220 o->c->onode_map.remove(o->oid);
1221 o->put();
1222 --num;
1223 }
1224}
1225
1226#ifdef DEBUG_CACHE
1227void BlueStore::TwoQCache::_audit(const char *when)
1228{
1229 dout(10) << __func__ << " " << when << " start" << dendl;
1230 uint64_t s = 0;
1231 for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
1232 s += i->length;
1233 }
1234
1235 uint64_t hot_bytes = s;
1236 if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
1237 derr << __func__ << " hot_list_bytes "
1238 << buffer_list_bytes[BUFFER_HOT]
1239 << " != actual " << hot_bytes
1240 << dendl;
1241 assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
1242 }
1243
1244 for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
1245 s += i->length;
1246 }
1247
1248 uint64_t warm_in_bytes = s - hot_bytes;
1249 if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
1250 derr << __func__ << " warm_in_list_bytes "
1251 << buffer_list_bytes[BUFFER_WARM_IN]
1252 << " != actual " << warm_in_bytes
1253 << dendl;
1254 assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
1255 }
1256
1257 if (s != buffer_bytes) {
1258 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1259 << dendl;
1260 assert(s == buffer_bytes);
1261 }
1262
1263 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1264 << " ok" << dendl;
1265}
1266#endif
1267
1268
1269// BufferSpace
1270
1271#undef dout_prefix
1272#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1273
1274void BlueStore::BufferSpace::_clear(Cache* cache)
1275{
1276 // note: we already hold cache->lock
1277 ldout(cache->cct, 20) << __func__ << dendl;
1278 while (!buffer_map.empty()) {
1279 _rm_buffer(cache, buffer_map.begin());
1280 }
1281}
1282
1283int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
1284{
1285 // note: we already hold cache->lock
1286 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1287 << std::dec << dendl;
1288 int cache_private = 0;
1289 cache->_audit("discard start");
1290 auto i = _data_lower_bound(offset);
1291 uint32_t end = offset + length;
1292 while (i != buffer_map.end()) {
1293 Buffer *b = i->second.get();
1294 if (b->offset >= end) {
1295 break;
1296 }
1297 if (b->cache_private > cache_private) {
1298 cache_private = b->cache_private;
1299 }
1300 if (b->offset < offset) {
1301 int64_t front = offset - b->offset;
1302 if (b->end() > end) {
1303 // drop middle (split)
1304 uint32_t tail = b->end() - end;
1305 if (b->data.length()) {
1306 bufferlist bl;
1307 bl.substr_of(b->data, b->length - tail, tail);
1308 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1309 nb->maybe_rebuild();
1310 _add_buffer(cache, nb, 0, b);
 1311 } else {
1312 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1313 0, b);
1314 }
1315 if (!b->is_writing()) {
1316 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1317 }
1318 b->truncate(front);
 1319 b->maybe_rebuild();
1320 cache->_audit("discard end 1");
1321 break;
1322 } else {
1323 // drop tail
1324 if (!b->is_writing()) {
1325 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1326 }
1327 b->truncate(front);
 1328 b->maybe_rebuild();
1329 ++i;
1330 continue;
1331 }
1332 }
1333 if (b->end() <= end) {
1334 // drop entire buffer
1335 _rm_buffer(cache, i++);
1336 continue;
1337 }
1338 // drop front
1339 uint32_t keep = b->end() - end;
1340 if (b->data.length()) {
1341 bufferlist bl;
1342 bl.substr_of(b->data, b->length - keep, keep);
1343 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1344 nb->maybe_rebuild();
1345 _add_buffer(cache, nb, 0, b);
1346 } else {
1347 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1348 }
1349 _rm_buffer(cache, i);
1350 cache->_audit("discard end 2");
1351 break;
1352 }
1353 return cache_private;
1354}
1355
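// Example (illustrative): discarding 0x1000~0x2000 from a buffer spanning
// 0x0~0x4000 keeps the front 0x1000 bytes in the original Buffer (truncated)
// and re-adds the surviving tail 0x1000 bytes as a new Buffer at offset
// 0x3000; a buffer that lies entirely inside the discarded range is simply
// removed.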
1356void BlueStore::BufferSpace::read(
1357 Cache* cache,
1358 uint32_t offset,
1359 uint32_t length,
1360 BlueStore::ready_regions_t& res,
1361 interval_set<uint32_t>& res_intervals)
1362{
1363 res.clear();
1364 res_intervals.clear();
1365 uint32_t want_bytes = length;
1366 uint32_t end = offset + length;
1367
1368 {
1369 std::lock_guard<std::recursive_mutex> l(cache->lock);
1370 for (auto i = _data_lower_bound(offset);
1371 i != buffer_map.end() && offset < end && i->first < end;
1372 ++i) {
1373 Buffer *b = i->second.get();
1374 assert(b->end() > offset);
1375 if (b->is_writing() || b->is_clean()) {
1376 if (b->offset < offset) {
1377 uint32_t skip = offset - b->offset;
1378 uint32_t l = MIN(length, b->length - skip);
1379 res[offset].substr_of(b->data, skip, l);
1380 res_intervals.insert(offset, l);
1381 offset += l;
1382 length -= l;
1383 if (!b->is_writing()) {
1384 cache->_touch_buffer(b);
1385 }
1386 continue;
1387 }
1388 if (b->offset > offset) {
1389 uint32_t gap = b->offset - offset;
1390 if (length <= gap) {
1391 break;
1392 }
1393 offset += gap;
1394 length -= gap;
1395 }
1396 if (!b->is_writing()) {
 1397 cache->_touch_buffer(b);
1398 }
1399 if (b->length > length) {
1400 res[offset].substr_of(b->data, 0, length);
1401 res_intervals.insert(offset, length);
 1402 break;
1403 } else {
1404 res[offset].append(b->data);
1405 res_intervals.insert(offset, b->length);
1406 if (b->length == length)
1407 break;
1408 offset += b->length;
1409 length -= b->length;
1410 }
1411 }
1412 }
1413 }
1414
1415 uint64_t hit_bytes = res_intervals.size();
1416 assert(hit_bytes <= want_bytes);
1417 uint64_t miss_bytes = want_bytes - hit_bytes;
1418 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1419 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1420}
1421
1422void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
1423{
1424 std::lock_guard<std::recursive_mutex> l(cache->lock);
1425
1426 auto i = writing.begin();
1427 while (i != writing.end()) {
1428 if (i->seq > seq) {
1429 break;
1430 }
1431 if (i->seq < seq) {
1432 ++i;
1433 continue;
1434 }
1435
1436 Buffer *b = &*i;
1437 assert(b->is_writing());
1438
1439 if (b->flags & Buffer::FLAG_NOCACHE) {
1440 writing.erase(i++);
1441 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1442 buffer_map.erase(b->offset);
1443 } else {
1444 b->state = Buffer::STATE_CLEAN;
1445 writing.erase(i++);
1446 b->maybe_rebuild();
1447 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1448 cache->_add_buffer(b, 1, nullptr);
1449 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1450 }
1451 }
1452
1453 cache->_audit("finish_write end");
1454}
1455
1456void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1457{
1458 std::lock_guard<std::recursive_mutex> lk(cache->lock);
1459 if (buffer_map.empty())
1460 return;
1461
1462 auto p = --buffer_map.end();
1463 while (true) {
1464 if (p->second->end() <= pos)
1465 break;
1466
1467 if (p->second->offset < pos) {
1468 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1469 size_t left = pos - p->second->offset;
1470 size_t right = p->second->length - left;
1471 if (p->second->data.length()) {
1472 bufferlist bl;
1473 bl.substr_of(p->second->data, left, right);
1474 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1475 0, p->second.get());
1476 } else {
1477 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1478 0, p->second.get());
1479 }
1480 cache->_adjust_buffer_size(p->second.get(), -right);
1481 p->second->truncate(left);
1482 break;
1483 }
1484
1485 assert(p->second->end() > pos);
1486 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1487 if (p->second->data.length()) {
1488 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1489 p->second->offset - pos, p->second->data),
1490 0, p->second.get());
1491 } else {
1492 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1493 p->second->offset - pos, p->second->length),
1494 0, p->second.get());
1495 }
1496 if (p == buffer_map.begin()) {
1497 _rm_buffer(cache, p);
1498 break;
1499 } else {
1500 _rm_buffer(cache, p--);
1501 }
1502 }
1503 assert(writing.empty());
1504}
1505
1506// OnodeSpace
1507
1508#undef dout_prefix
1509#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1510
1511BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1512{
1513 std::lock_guard<std::recursive_mutex> l(cache->lock);
1514 auto p = onode_map.find(oid);
1515 if (p != onode_map.end()) {
1516 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1517 << " raced, returning existing " << p->second
1518 << dendl;
1519 return p->second;
1520 }
1521 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1522 onode_map[oid] = o;
1523 cache->_add_onode(o, 1);
1524 return o;
1525}
1526
1527BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1528{
 1529 ldout(cache->cct, 30) << __func__ << dendl;
1530 OnodeRef o;
1531 bool hit = false;
1532
1533 {
1534 std::lock_guard<std::recursive_mutex> l(cache->lock);
1535 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1536 if (p == onode_map.end()) {
1537 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1538 } else {
1539 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1540 << dendl;
1541 cache->_touch_onode(p->second);
1542 hit = true;
1543 o = p->second;
1544 }
1545 }
1546
1547 if (hit) {
1548 cache->logger->inc(l_bluestore_onode_hits);
1549 } else {
 1550 cache->logger->inc(l_bluestore_onode_misses);
 1551 }
 1552 return o;
1553}
1554
1555void BlueStore::OnodeSpace::clear()
1556{
1557 std::lock_guard<std::recursive_mutex> l(cache->lock);
1558 ldout(cache->cct, 10) << __func__ << dendl;
1559 for (auto &p : onode_map) {
1560 cache->_rm_onode(p.second);
1561 }
1562 onode_map.clear();
1563}
1564
1565bool BlueStore::OnodeSpace::empty()
1566{
1567 std::lock_guard<std::recursive_mutex> l(cache->lock);
1568 return onode_map.empty();
1569}
1570
1571void BlueStore::OnodeSpace::rename(
1572 OnodeRef& oldo,
1573 const ghobject_t& old_oid,
1574 const ghobject_t& new_oid,
 1575 const mempool::bluestore_cache_other::string& new_okey)
1576{
1577 std::lock_guard<std::recursive_mutex> l(cache->lock);
1578 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1579 << dendl;
1580 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1581 po = onode_map.find(old_oid);
1582 pn = onode_map.find(new_oid);
1583 assert(po != pn);
1584
1585 assert(po != onode_map.end());
1586 if (pn != onode_map.end()) {
1587 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1588 << dendl;
1589 cache->_rm_onode(pn->second);
1590 onode_map.erase(pn);
1591 }
1592 OnodeRef o = po->second;
1593
1594 // install a non-existent onode at old location
1595 oldo.reset(new Onode(o->c, old_oid, o->key));
1596 po->second = oldo;
1597 cache->_add_onode(po->second, 1);
1598
1599 // add at new position and fix oid, key
1600 onode_map.insert(make_pair(new_oid, o));
1601 cache->_touch_onode(o);
1602 o->oid = new_oid;
1603 o->key = new_okey;
1604}
1605
1606bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1607{
1608 std::lock_guard<std::recursive_mutex> l(cache->lock);
1609 ldout(cache->cct, 20) << __func__ << dendl;
1610 for (auto& i : onode_map) {
1611 if (f(i.second)) {
1612 return true;
1613 }
1614 }
1615 return false;
1616}
1617
1618
1619// SharedBlob
1620
1621#undef dout_prefix
1622#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1623
1624ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1625{
1626 out << "SharedBlob(" << &sb;
1627
1628 if (sb.loaded) {
1629 out << " loaded " << *sb.persistent;
1630 } else {
1631 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1632 }
1633 return out << ")";
1634}
1635
1636BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1637 : coll(_coll), sbid_unloaded(i)
1638{
1639 assert(sbid_unloaded > 0);
1640 if (get_cache()) {
1641 get_cache()->add_blob();
1642 }
1643}
1644
1645BlueStore::SharedBlob::~SharedBlob()
1646{
1647 if (get_cache()) { // the dummy instances have a nullptr
1648 std::lock_guard<std::recursive_mutex> l(get_cache()->lock);
1649 bc._clear(get_cache());
1650 get_cache()->rm_blob();
1651 }
1652 if (loaded && persistent) {
1653 delete persistent;
1654 }
1655}
1656
1657void BlueStore::SharedBlob::put()
1658{
1659 if (--nref == 0) {
1660 ldout(coll->store->cct, 20) << __func__ << " " << this
1661 << " removing self from set " << get_parent()
1662 << dendl;
1663 if (get_parent()) {
1664 if (get_parent()->remove(this)) {
1665 delete this;
1666 } else {
1667 ldout(coll->store->cct, 20)
1668 << __func__ << " " << this << " lost race to remove myself from set"
1669 << dendl;
1670 }
1671 } else {
1672 delete this;
1673 }
1674 }
1675}
1676
1677void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1678{
1679 assert(persistent);
1680 persistent->ref_map.get(offset, length);
1681}
1682
1683void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
1684 PExtentVector *r,
1685 set<SharedBlob*> *maybe_unshared)
1686{
1687 assert(persistent);
1688 bool maybe = false;
1689 persistent->ref_map.put(offset, length, r, maybe_unshared ? &maybe : nullptr);
1690 if (maybe_unshared && maybe) {
1691 maybe_unshared->insert(this);
1692 }
1693}
1694
1695// Blob
1696
1697#undef dout_prefix
1698#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1699
1700ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1701{
1702 out << "Blob(" << &b;
1703 if (b.is_spanning()) {
1704 out << " spanning " << b.id;
1705 }
1706 out << " " << b.get_blob() << " " << b.get_blob_use_tracker()
1707 << " " << *b.shared_blob
1708 << ")";
1709 return out;
1710}
1711
1712void BlueStore::Blob::discard_unallocated(Collection *coll)
1713{
 1714 if (get_blob().is_shared()) {
1715 return;
1716 }
 1717 if (get_blob().is_compressed()) {
1718 bool discard = false;
1719 bool all_invalid = true;
 1720 for (auto e : get_blob().get_extents()) {
1721 if (!e.is_valid()) {
1722 discard = true;
1723 } else {
1724 all_invalid = false;
1725 }
1726 }
 1727 assert(discard == all_invalid); // in a compressed blob either all
 1728 // pextents are invalid or none are.
1729 if (discard) {
1730 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1731 get_blob().get_logical_length());
1732 }
1733 } else {
1734 size_t pos = 0;
 1735 for (auto e : get_blob().get_extents()) {
1736 if (!e.is_valid()) {
1737 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1738 << "~" << e.length
1739 << std::dec << dendl;
1740 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1741 }
1742 pos += e.length;
1743 }
1744 if (get_blob().can_prune_tail()) {
1745 dirty_blob().prune_tail();
1746 used_in_blob.prune_tail(get_blob().get_ondisk_length());
 1747 auto cct = coll->store->cct; //used by dout
 1748 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
1749 }
1750 }
1751}
1752
1753void BlueStore::Blob::get_ref(
1754 Collection *coll,
1755 uint32_t offset,
1756 uint32_t length)
1757{
 1758 // The caller has to initialize the Blob's logical length prior to
 1759 // incrementing references. Otherwise it is impossible to determine the
 1760 // required number of counters for per-au tracking, or to obtain
 1761 // min_release_size for single-counter mode.
1762 assert(get_blob().get_logical_length() != 0);
1763 auto cct = coll->store->cct;
1764 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1765 << std::dec << " " << *this << dendl;
1766
1767 if (used_in_blob.is_empty()) {
1768 uint32_t min_release_size =
1769 get_blob().get_release_size(coll->store->min_alloc_size);
1770 uint64_t l = get_blob().get_logical_length();
1771 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1772 << min_release_size << std::dec << dendl;
1773 used_in_blob.init(l, min_release_size);
1774 }
1775 used_in_blob.get(
1776 offset,
1777 length);
1778}
1779
1780bool BlueStore::Blob::put_ref(
1781 Collection *coll,
1782 uint32_t offset,
1783 uint32_t length,
1784 PExtentVector *r)
1785{
1786 PExtentVector logical;
1787
1788 auto cct = coll->store->cct;
1789 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1790 << std::dec << " " << *this << dendl;
1791
1792 bool empty = used_in_blob.put(
1793 offset,
1794 length,
1795 &logical);
1796 r->clear();
1797 // nothing to release
1798 if (!empty && logical.empty()) {
1799 return false;
1800 }
1801
1802 bluestore_blob_t& b = dirty_blob();
1803 return b.release_extents(empty, logical, r);
1804}
1805
 1806bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
1807 uint32_t target_blob_size,
1808 uint32_t b_offset,
1809 uint32_t *length0) {
1810 assert(min_alloc_size);
1811 assert(target_blob_size);
1812 if (!get_blob().is_mutable()) {
1813 return false;
1814 }
1815
1816 uint32_t length = *length0;
1817 uint32_t end = b_offset + length;
1818
1819 // Currently for the sake of simplicity we omit blob reuse if data is
1820 // unaligned with csum chunk. Later we can perform padding if needed.
1821 if (get_blob().has_csum() &&
1822 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1823 (end % get_blob().get_csum_chunk_size()) != 0)) {
1824 return false;
1825 }
1826
1827 auto blen = get_blob().get_logical_length();
1828 uint32_t new_blen = blen;
1829
1830 // make sure target_blob_size isn't less than current blob len
1831 target_blob_size = MAX(blen, target_blob_size);
1832
1833 if (b_offset >= blen) {
1834 // new data totally stands out of the existing blob
1835 new_blen = end;
 1836 } else {
1837 // new data overlaps with the existing blob
1838 new_blen = MAX(blen, end);
1839
1840 uint32_t overlap = 0;
1841 if (new_blen > blen) {
1842 overlap = blen - b_offset;
1843 } else {
1844 overlap = length;
1845 }
1846
1847 if (!get_blob().is_unallocated(b_offset, overlap)) {
1848 // abort if any piece of the overlap has already been allocated
1849 return false;
1850 }
1851 }
 1852
1853 if (new_blen > blen) {
1854 int64_t overflow = int64_t(new_blen) - target_blob_size;
1855 // Unable to decrease the provided length to fit into max_blob_size
1856 if (overflow >= length) {
1857 return false;
1858 }
1859
1860 // FIXME: in some cases we could reduce unused resolution
1861 if (get_blob().has_unused()) {
1862 return false;
1863 }
1864
1865 if (overflow > 0) {
1866 new_blen -= overflow;
1867 length -= overflow;
1868 *length0 = length;
1869 }
 1870
1871 if (new_blen > blen) {
1872 dirty_blob().add_tail(new_blen);
1873 used_in_blob.add_tail(new_blen,
 1874 get_blob().get_release_size(min_alloc_size));
1875 }
1876 }
1877 return true;
1878}
1879
1880void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1881{
1882 auto cct = coll->store->cct; //used by dout
1883 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1884 << " start " << *this << dendl;
1885 assert(blob.can_split());
1886 assert(used_in_blob.can_split());
1887 bluestore_blob_t &lb = dirty_blob();
1888 bluestore_blob_t &rb = r->dirty_blob();
1889
1890 used_in_blob.split(
1891 blob_offset,
1892 &(r->used_in_blob));
1893
1894 lb.split(blob_offset, rb);
1895 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1896
1897 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1898 << " finish " << *this << dendl;
1899 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1900 << " and " << *r << dendl;
1901}
1902
1903#ifndef CACHE_BLOB_BL
1904void BlueStore::Blob::decode(
1905 Collection *coll,
1906 bufferptr::iterator& p,
1907 uint64_t struct_v,
1908 uint64_t* sbid,
1909 bool include_ref_map)
1910{
1911 denc(blob, p, struct_v);
1912 if (blob.is_shared()) {
1913 denc(*sbid, p);
1914 }
1915 if (include_ref_map) {
1916 if (struct_v > 1) {
1917 used_in_blob.decode(p);
1918 } else {
1919 used_in_blob.clear();
1920 bluestore_extent_ref_map_t legacy_ref_map;
1921 legacy_ref_map.decode(p);
1922 for (auto r : legacy_ref_map.ref_map) {
1923 get_ref(
1924 coll,
1925 r.first,
1926 r.second.refs * r.second.length);
1927 }
1928 }
1929 }
1930}
1931#endif
1932
1933// Extent
1934
1935ostream& operator<<(ostream& out, const BlueStore::Extent& e)
1936{
1937 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
1938 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
1939 << " " << *e.blob;
1940}
1941
1942// OldExtent
1943BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
1944 uint32_t lo,
1945 uint32_t o,
1946 uint32_t l,
1947 BlobRef& b) {
1948 OldExtent* oe = new OldExtent(lo, o, l, b);
1949 b->put_ref(c.get(), o, l, &(oe->r));
1950 oe->blob_empty = b->get_referenced_bytes() == 0;
1951 return oe;
1952}
1953
1954// ExtentMap
1955
1956#undef dout_prefix
1957#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1958
1959BlueStore::ExtentMap::ExtentMap(Onode *o)
1960 : onode(o),
1961 inline_bl(
1962 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
1963}
1964
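// update() persists the extent map as part of the transaction: if the map is
// unsharded it is encoded into inline_bl (stored later with the onode key),
// otherwise each dirty shard is re-encoded and written under its own key.
// Shards that grow past bluestore_extent_map_shard_max_size, or shrink below
// bluestore_extent_map_shard_min_size, trigger a reshard request instead.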
1965void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
1966 bool force)
1967{
1968 auto cct = onode->c->store->cct; //used by dout
1969 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
1970 if (onode->onode.extent_map_shards.empty()) {
1971 if (inline_bl.length() == 0) {
1972 unsigned n;
1973 // we need to encode inline_bl to measure encoded length
1974 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
1975 assert(!never_happen);
1976 size_t len = inline_bl.length();
1977 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
1978 << " extents" << dendl;
1979 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
1980 request_reshard(0, OBJECT_MAX_SIZE);
1981 return;
1982 }
1983 }
1984 // will persist in the onode key.
1985 } else {
1986 // pending shard update
1987 struct dirty_shard_t {
1988 Shard *shard;
1989 bufferlist bl;
1990 dirty_shard_t(Shard *s) : shard(s) {}
1991 };
1992 vector<dirty_shard_t> encoded_shards;
1993 // allocate slots for all shards in a single call instead of
 1994	    // doing multiple allocations - one per dirty shard
1995 encoded_shards.reserve(shards.size());
1996
1997 auto p = shards.begin();
1998 auto prev_p = p;
1999 while (p != shards.end()) {
31f18b77 2000 assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2001 auto n = p;
2002 ++n;
2003 if (p->dirty) {
2004 uint32_t endoff;
2005 if (n == shards.end()) {
2006 endoff = OBJECT_MAX_SIZE;
2007 } else {
2008 endoff = n->shard_info->offset;
2009 }
2010 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2011 bufferlist& bl = encoded_shards.back().bl;
2012 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2013 bl, &p->extents)) {
2014 if (force) {
2015 derr << __func__ << " encode_some needs reshard" << dendl;
2016 assert(!force);
2017 }
2018 }
2019 size_t len = bl.length();
2020
2021 dout(20) << __func__ << " shard 0x" << std::hex
2022 << p->shard_info->offset << std::dec << " is " << len
2023 << " bytes (was " << p->shard_info->bytes << ") from "
2024 << p->extents << " extents" << dendl;
2025
2026 if (!force) {
2027 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2028 // we are big; reshard ourselves
2029 request_reshard(p->shard_info->offset, endoff);
2030 }
2031 // avoid resharding the trailing shard, even if it is small
2032 else if (n != shards.end() &&
2033 len < g_conf->bluestore_extent_map_shard_min_size) {
31f18b77
FG
2034 assert(endoff != OBJECT_MAX_SIZE);
2035 if (p == shards.begin()) {
2036 // we are the first shard, combine with next shard
7c673cae 2037 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2038 } else {
31f18b77
FG
2039 // combine either with the previous shard or the next,
2040 // whichever is smaller
7c673cae
FG
2041 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2042 request_reshard(p->shard_info->offset, endoff + 1);
2043 } else {
2044 request_reshard(prev_p->shard_info->offset, endoff);
2045 }
2046 }
2047 }
2048 }
2049 }
2050 prev_p = p;
2051 p = n;
2052 }
2053 if (needs_reshard()) {
2054 return;
2055 }
2056
2057 // schedule DB update for dirty shards
2058 string key;
2059 for (auto& it : encoded_shards) {
2060 it.shard->dirty = false;
2061 it.shard->shard_info->bytes = it.bl.length();
2062 generate_extent_shard_key_and_apply(
2063 onode->key,
2064 it.shard->shard_info->offset,
2065 &key,
2066 [&](const string& final_key) {
2067 t->set(PREFIX_OBJ, final_key, it.bl);
2068 }
2069 );
2070 }
2071 }
2072}
2073
31f18b77
FG
2074bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2075{
2076 if (spanning_blob_map.empty())
2077 return 0;
2078 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2079 // bid is valid and available.
2080 if (bid >= 0)
2081 return bid;
2082 // Find next unused bid;
2083 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2084 const auto begin_bid = bid;
2085 do {
2086 if (!spanning_blob_map.count(bid))
2087 return bid;
2088 else {
2089 bid++;
2090 if (bid < 0) bid = 0;
2091 }
2092 } while (bid != begin_bid);
2093 assert(0 == "no available blob id");
2094}
2095
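// reshard() rebuilds the shard layout over [needs_reshard_begin,
// needs_reshard_end): it faults in the affected range, drops the old shard
// keys, then walks the extents accumulating an estimated encoded size
// (extent_avg per extent) and cuts a new shard when the configured target
// size is reached, preferring boundaries that do not split a blob (the
// "slop" margin). Blobs that still cross the new shard boundaries are either
// split, when possible, or promoted to spanning blobs stored with the onode.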
7c673cae
FG
2096void BlueStore::ExtentMap::reshard(
2097 KeyValueDB *db,
2098 KeyValueDB::Transaction t)
2099{
2100 auto cct = onode->c->store->cct; // used by dout
2101
2102 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2103 << needs_reshard_end << ")" << std::dec
2104 << " of " << onode->onode.extent_map_shards.size()
2105 << " shards on " << onode->oid << dendl;
2106 for (auto& p : spanning_blob_map) {
2107 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2108 << dendl;
2109 }
2110 // determine shard index range
2111 unsigned si_begin = 0, si_end = 0;
2112 if (!shards.empty()) {
2113 while (si_begin + 1 < shards.size() &&
2114 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2115 ++si_begin;
2116 }
2117 needs_reshard_begin = shards[si_begin].shard_info->offset;
2118 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2119 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2120 needs_reshard_end = shards[si_end].shard_info->offset;
2121 break;
2122 }
2123 }
2124 if (si_end == shards.size()) {
2125 needs_reshard_end = OBJECT_MAX_SIZE;
2126 }
2127 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2128 << " over 0x[" << std::hex << needs_reshard_begin << ","
2129 << needs_reshard_end << ")" << std::dec << dendl;
2130 }
2131
2132 fault_range(db, needs_reshard_begin, needs_reshard_end);
2133
 2134	  // we may need to fault in a larger interval later; we must have all
2135 // referring extents for spanning blobs loaded in order to have
2136 // accurate use_tracker values.
2137 uint32_t spanning_scan_begin = needs_reshard_begin;
2138 uint32_t spanning_scan_end = needs_reshard_end;
2139
2140 // remove old keys
2141 string key;
2142 for (unsigned i = si_begin; i < si_end; ++i) {
2143 generate_extent_shard_key_and_apply(
2144 onode->key, shards[i].shard_info->offset, &key,
2145 [&](const string& final_key) {
2146 t->rmkey(PREFIX_OBJ, final_key);
2147 }
2148 );
2149 }
2150
2151 // calculate average extent size
2152 unsigned bytes = 0;
2153 unsigned extents = 0;
2154 if (onode->onode.extent_map_shards.empty()) {
2155 bytes = inline_bl.length();
2156 extents = extent_map.size();
2157 } else {
2158 for (unsigned i = si_begin; i < si_end; ++i) {
2159 bytes += shards[i].shard_info->bytes;
2160 extents += shards[i].extents;
2161 }
2162 }
2163 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2164 unsigned slop = target *
2165 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2166 unsigned extent_avg = bytes / MAX(1, extents);
2167 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2168 << ", slop " << slop << dendl;
2169
2170 // reshard
2171 unsigned estimate = 0;
31f18b77 2172 unsigned offset = needs_reshard_begin;
7c673cae
FG
2173 vector<bluestore_onode_t::shard_info> new_shard_info;
2174 unsigned max_blob_end = 0;
2175 Extent dummy(needs_reshard_begin);
2176 for (auto e = extent_map.lower_bound(dummy);
2177 e != extent_map.end();
2178 ++e) {
2179 if (e->logical_offset >= needs_reshard_end) {
2180 break;
2181 }
2182 dout(30) << " extent " << *e << dendl;
2183
2184 // disfavor shard boundaries that span a blob
2185 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2186 if (estimate &&
2187 estimate + extent_avg > target + (would_span ? slop : 0)) {
2188 // new shard
31f18b77 2189 if (offset == needs_reshard_begin) {
7c673cae
FG
2190 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2191 new_shard_info.back().offset = offset;
2192 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2193 << std::dec << dendl;
7c673cae
FG
2194 }
2195 offset = e->logical_offset;
2196 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2197 new_shard_info.back().offset = offset;
2198 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2199 << std::dec << dendl;
2200 estimate = 0;
2201 }
2202 estimate += extent_avg;
31f18b77
FG
2203 unsigned bs = e->blob_start();
2204 if (bs < spanning_scan_begin) {
2205 spanning_scan_begin = bs;
7c673cae
FG
2206 }
2207 uint32_t be = e->blob_end();
2208 if (be > max_blob_end) {
2209 max_blob_end = be;
2210 }
2211 if (be > spanning_scan_end) {
2212 spanning_scan_end = be;
2213 }
2214 }
2215 if (new_shard_info.empty() && (si_begin > 0 ||
2216 si_end < shards.size())) {
2217 // we resharded a partial range; we must produce at least one output
2218 // shard
2219 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2220 new_shard_info.back().offset = needs_reshard_begin;
2221 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2222 << std::dec << " (singleton degenerate case)" << dendl;
2223 }
2224
2225 auto& sv = onode->onode.extent_map_shards;
2226 dout(20) << __func__ << " new " << new_shard_info << dendl;
2227 dout(20) << __func__ << " old " << sv << dendl;
2228 if (sv.empty()) {
2229 // no old shards to keep
2230 sv.swap(new_shard_info);
2231 init_shards(true, true);
2232 } else {
2233 // splice in new shards
2234 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2235 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2236 sv.insert(
2237 sv.begin() + si_begin,
2238 new_shard_info.begin(),
2239 new_shard_info.end());
2240 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2241 si_end = si_begin + new_shard_info.size();
31f18b77
FG
2242
2243 assert(sv.size() == shards.size());
2244
2245 // note that we need to update every shard_info of shards here,
2246 // as sv might have been totally re-allocated above
2247 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2248 shards[i].shard_info = &sv[i];
31f18b77
FG
2249 }
2250
2251 // mark newly added shards as dirty
2252 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2253 shards[i].loaded = true;
2254 shards[i].dirty = true;
2255 }
7c673cae
FG
2256 }
2257 dout(20) << __func__ << " fin " << sv << dendl;
2258 inline_bl.clear();
2259
2260 if (sv.empty()) {
2261 // no more shards; unspan all previously spanning blobs
2262 auto p = spanning_blob_map.begin();
2263 while (p != spanning_blob_map.end()) {
2264 p->second->id = -1;
2265 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2266 p = spanning_blob_map.erase(p);
2267 }
2268 } else {
2269 // identify new spanning blobs
2270 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2271 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2272 if (spanning_scan_begin < needs_reshard_begin) {
2273 fault_range(db, spanning_scan_begin,
2274 needs_reshard_begin - spanning_scan_begin);
2275 }
2276 if (spanning_scan_end > needs_reshard_end) {
2277 fault_range(db, needs_reshard_end,
31f18b77 2278 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2279 }
2280 auto sp = sv.begin() + si_begin;
2281 auto esp = sv.end();
2282 unsigned shard_start = sp->offset;
2283 unsigned shard_end;
2284 ++sp;
2285 if (sp == esp) {
2286 shard_end = OBJECT_MAX_SIZE;
2287 } else {
2288 shard_end = sp->offset;
2289 }
7c673cae
FG
2290 Extent dummy(needs_reshard_begin);
2291 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2292 if (e->logical_offset >= needs_reshard_end) {
2293 break;
2294 }
2295 dout(30) << " extent " << *e << dendl;
2296 while (e->logical_offset >= shard_end) {
2297 shard_start = shard_end;
2298 assert(sp != esp);
2299 ++sp;
2300 if (sp == esp) {
2301 shard_end = OBJECT_MAX_SIZE;
2302 } else {
2303 shard_end = sp->offset;
2304 }
2305 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2306 << " to 0x" << shard_end << std::dec << dendl;
2307 }
2308 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2309 if (!e->blob->is_spanning()) {
2310 // We have two options: (1) split the blob into pieces at the
2311 // shard boundaries (and adjust extents accordingly), or (2)
2312 // mark it spanning. We prefer to cut the blob if we can. Note that
2313 // we may have to split it multiple times--potentially at every
2314 // shard boundary.
2315 bool must_span = false;
2316 BlobRef b = e->blob;
2317 if (b->can_split()) {
2318 uint32_t bstart = e->blob_start();
2319 uint32_t bend = e->blob_end();
2320 for (const auto& sh : shards) {
2321 if (bstart < sh.shard_info->offset &&
2322 bend > sh.shard_info->offset) {
2323 uint32_t blob_offset = sh.shard_info->offset - bstart;
2324 if (b->can_split_at(blob_offset)) {
2325 dout(20) << __func__ << " splitting blob, bstart 0x"
2326 << std::hex << bstart << " blob_offset 0x"
2327 << blob_offset << std::dec << " " << *b << dendl;
2328 b = split_blob(b, blob_offset, sh.shard_info->offset);
2329 // switch b to the new right-hand side, in case it
2330 // *also* has to get split.
2331 bstart += blob_offset;
2332 onode->c->store->logger->inc(l_bluestore_blob_split);
2333 } else {
2334 must_span = true;
2335 break;
2336 }
2337 }
2338 }
2339 } else {
2340 must_span = true;
2341 }
2342 if (must_span) {
31f18b77
FG
2343 auto bid = allocate_spanning_blob_id();
2344 b->id = bid;
7c673cae
FG
2345 spanning_blob_map[b->id] = b;
2346 dout(20) << __func__ << " adding spanning " << *b << dendl;
2347 }
2348 }
2349 } else {
2350 if (e->blob->is_spanning()) {
2351 spanning_blob_map.erase(e->blob->id);
2352 e->blob->id = -1;
2353 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2354 }
2355 }
2356 }
2357 }
2358
2359 clear_needs_reshard();
2360}
2361
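// encode_some() serializes the extents in [offset, offset + length) using a
// per-extent varint header whose low bits carry the BLOBID_FLAG_* values:
// CONTIGUOUS elides the gap from the previous extent, ZEROOFFSET elides a
// zero blob_offset, SAMELENGTH elides a repeated length, and SPANNING
// references a spanning blob by id instead of encoding the blob inline.
// It returns true (without encoding) if a blob escapes the range and a
// reshard is required first.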
2362bool BlueStore::ExtentMap::encode_some(
2363 uint32_t offset,
2364 uint32_t length,
2365 bufferlist& bl,
2366 unsigned *pn)
2367{
2368 auto cct = onode->c->store->cct; //used by dout
2369 Extent dummy(offset);
2370 auto start = extent_map.lower_bound(dummy);
2371 uint32_t end = offset + length;
2372
2373 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2374 // serialization only. Hence there is no specific
2375 // handling at ExtentMap level.
2376
2377 unsigned n = 0;
2378 size_t bound = 0;
7c673cae
FG
2379 bool must_reshard = false;
2380 for (auto p = start;
2381 p != extent_map.end() && p->logical_offset < end;
2382 ++p, ++n) {
2383 assert(p->logical_offset >= offset);
2384 p->blob->last_encoded_id = -1;
2385 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2386 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2387 << std::dec << " hit new spanning blob " << *p << dendl;
2388 request_reshard(p->blob_start(), p->blob_end());
2389 must_reshard = true;
2390 }
31f18b77
FG
2391 if (!must_reshard) {
2392 denc_varint(0, bound); // blobid
2393 denc_varint(0, bound); // logical_offset
2394 denc_varint(0, bound); // len
2395 denc_varint(0, bound); // blob_offset
7c673cae 2396
31f18b77
FG
2397 p->blob->bound_encode(
2398 bound,
2399 struct_v,
2400 p->blob->shared_blob->get_sbid(),
2401 false);
2402 }
7c673cae
FG
2403 }
2404 if (must_reshard) {
2405 return true;
2406 }
2407
31f18b77
FG
2408 denc(struct_v, bound);
2409 denc_varint(0, bound); // number of extents
2410
7c673cae
FG
2411 {
2412 auto app = bl.get_contiguous_appender(bound);
2413 denc(struct_v, app);
2414 denc_varint(n, app);
2415 if (pn) {
2416 *pn = n;
2417 }
2418
2419 n = 0;
2420 uint64_t pos = 0;
2421 uint64_t prev_len = 0;
2422 for (auto p = start;
2423 p != extent_map.end() && p->logical_offset < end;
2424 ++p, ++n) {
2425 unsigned blobid;
2426 bool include_blob = false;
2427 if (p->blob->is_spanning()) {
2428 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2429 blobid |= BLOBID_FLAG_SPANNING;
2430 } else if (p->blob->last_encoded_id < 0) {
2431 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2432 include_blob = true;
2433 blobid = 0; // the decoder will infer the id from n
2434 } else {
2435 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2436 }
2437 if (p->logical_offset == pos) {
2438 blobid |= BLOBID_FLAG_CONTIGUOUS;
2439 }
2440 if (p->blob_offset == 0) {
2441 blobid |= BLOBID_FLAG_ZEROOFFSET;
2442 }
2443 if (p->length == prev_len) {
2444 blobid |= BLOBID_FLAG_SAMELENGTH;
2445 } else {
2446 prev_len = p->length;
2447 }
2448 denc_varint(blobid, app);
2449 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2450 denc_varint_lowz(p->logical_offset - pos, app);
2451 }
2452 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2453 denc_varint_lowz(p->blob_offset, app);
2454 }
2455 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2456 denc_varint_lowz(p->length, app);
2457 }
2458 pos = p->logical_end();
2459 if (include_blob) {
2460 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2461 }
2462 }
2463 }
2464 /*derr << __func__ << bl << dendl;
2465 derr << __func__ << ":";
2466 bl.hexdump(*_dout);
2467 *_dout << dendl;
2468 */
2469 return false;
2470}
2471
2472unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2473{
2474 auto cct = onode->c->store->cct; //used by dout
2475 /*
2476 derr << __func__ << ":";
2477 bl.hexdump(*_dout);
2478 *_dout << dendl;
2479 */
2480
2481 assert(bl.get_num_buffers() <= 1);
2482 auto p = bl.front().begin_deep();
2483 __u8 struct_v;
2484 denc(struct_v, p);
2485 // Version 2 differs from v1 in blob's ref_map
2486 // serialization only. Hence there is no specific
2487 // handling at ExtentMap level below.
2488 assert(struct_v == 1 || struct_v == 2);
2489
2490 uint32_t num;
2491 denc_varint(num, p);
2492 vector<BlobRef> blobs(num);
2493 uint64_t pos = 0;
2494 uint64_t prev_len = 0;
2495 unsigned n = 0;
2496
2497 while (!p.end()) {
2498 Extent *le = new Extent();
2499 uint64_t blobid;
2500 denc_varint(blobid, p);
2501 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2502 uint64_t gap;
2503 denc_varint_lowz(gap, p);
2504 pos += gap;
2505 }
2506 le->logical_offset = pos;
2507 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2508 denc_varint_lowz(le->blob_offset, p);
2509 } else {
2510 le->blob_offset = 0;
2511 }
2512 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2513 denc_varint_lowz(prev_len, p);
2514 }
2515 le->length = prev_len;
2516
2517 if (blobid & BLOBID_FLAG_SPANNING) {
2518 dout(30) << __func__ << " getting spanning blob "
2519 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2520 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2521 } else {
2522 blobid >>= BLOBID_SHIFT_BITS;
2523 if (blobid) {
2524 le->assign_blob(blobs[blobid - 1]);
2525 assert(le->blob);
2526 } else {
2527 Blob *b = new Blob();
2528 uint64_t sbid = 0;
2529 b->decode(onode->c, p, struct_v, &sbid, false);
2530 blobs[n] = b;
2531 onode->c->open_shared_blob(sbid, b);
2532 le->assign_blob(b);
2533 }
2534 // we build ref_map dynamically for non-spanning blobs
2535 le->blob->get_ref(
2536 onode->c,
2537 le->blob_offset,
2538 le->length);
2539 }
2540 pos += prev_len;
2541 ++n;
2542 extent_map.insert(*le);
2543 }
2544
2545 assert(n == num);
2546 return num;
2547}
2548
2549void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2550{
2551 // Version 2 differs from v1 in blob's ref_map
2552 // serialization only. Hence there is no specific
2553 // handling at ExtentMap level.
2554 __u8 struct_v = 2;
2555
2556 denc(struct_v, p);
2557 denc_varint((uint32_t)0, p);
2558 size_t key_size = 0;
2559 denc_varint((uint32_t)0, key_size);
2560 p += spanning_blob_map.size() * key_size;
2561 for (const auto& i : spanning_blob_map) {
2562 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2563 }
2564}
2565
2566void BlueStore::ExtentMap::encode_spanning_blobs(
2567 bufferlist::contiguous_appender& p)
2568{
2569 // Version 2 differs from v1 in blob's ref_map
2570 // serialization only. Hence there is no specific
2571 // handling at ExtentMap level.
2572 __u8 struct_v = 2;
2573
2574 denc(struct_v, p);
2575 denc_varint(spanning_blob_map.size(), p);
2576 for (auto& i : spanning_blob_map) {
2577 denc_varint(i.second->id, p);
2578 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2579 }
2580}
2581
2582void BlueStore::ExtentMap::decode_spanning_blobs(
2583 bufferptr::iterator& p)
2584{
2585 __u8 struct_v;
2586 denc(struct_v, p);
2587 // Version 2 differs from v1 in blob's ref_map
2588 // serialization only. Hence there is no specific
2589 // handling at ExtentMap level.
2590 assert(struct_v == 1 || struct_v == 2);
2591
2592 unsigned n;
2593 denc_varint(n, p);
2594 while (n--) {
2595 BlobRef b(new Blob());
2596 denc_varint(b->id, p);
2597 spanning_blob_map[b->id] = b;
2598 uint64_t sbid = 0;
2599 b->decode(onode->c, p, struct_v, &sbid, true);
2600 onode->c->open_shared_blob(sbid, b);
2601 }
2602}
2603
2604void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2605{
2606 shards.resize(onode->onode.extent_map_shards.size());
2607 unsigned i = 0;
2608 for (auto &s : onode->onode.extent_map_shards) {
2609 shards[i].shard_info = &s;
2610 shards[i].loaded = loaded;
2611 shards[i].dirty = dirty;
2612 ++i;
2613 }
2614}
2615
2616void BlueStore::ExtentMap::fault_range(
2617 KeyValueDB *db,
2618 uint32_t offset,
2619 uint32_t length)
2620{
2621 auto cct = onode->c->store->cct; //used by dout
2622 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2623 << std::dec << dendl;
2624 auto start = seek_shard(offset);
2625 auto last = seek_shard(offset + length);
2626
2627 if (start < 0)
2628 return;
2629
2630 assert(last >= start);
2631 string key;
2632 while (start <= last) {
2633 assert((size_t)start < shards.size());
2634 auto p = &shards[start];
2635 if (!p->loaded) {
2636 dout(30) << __func__ << " opening shard 0x" << std::hex
2637 << p->shard_info->offset << std::dec << dendl;
2638 bufferlist v;
2639 generate_extent_shard_key_and_apply(
2640 onode->key, p->shard_info->offset, &key,
2641 [&](const string& final_key) {
2642 int r = db->get(PREFIX_OBJ, final_key, &v);
2643 if (r < 0) {
2644 derr << __func__ << " missing shard 0x" << std::hex
2645 << p->shard_info->offset << std::dec << " for " << onode->oid
2646 << dendl;
2647 assert(r >= 0);
2648 }
2649 }
2650 );
2651 p->extents = decode_some(v);
2652 p->loaded = true;
2653 dout(20) << __func__ << " open shard 0x" << std::hex
2654 << p->shard_info->offset << std::dec
2655 << " (" << v.length() << " bytes)" << dendl;
2656 assert(p->dirty == false);
2657 assert(v.length() == p->shard_info->bytes);
2658 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2659 } else {
2660 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2661 }
2662 ++start;
2663 }
2664}
2665
2666void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
2667 uint32_t offset,
2668 uint32_t length)
2669{
2670 auto cct = onode->c->store->cct; //used by dout
2671 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2672 << std::dec << dendl;
2673 if (shards.empty()) {
2674 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2675 inline_bl.clear();
2676 return;
2677 }
2678 auto start = seek_shard(offset);
2679 auto last = seek_shard(offset + length);
2680 if (start < 0)
2681 return;
2682
2683 assert(last >= start);
2684 while (start <= last) {
2685 assert((size_t)start < shards.size());
2686 auto p = &shards[start];
2687 if (!p->loaded) {
2688 dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
2689 << std::dec << " is not loaded, can't mark dirty" << dendl;
2690 assert(0 == "can't mark unloaded shard dirty");
2691 }
2692 if (!p->dirty) {
2693 dout(20) << __func__ << " mark shard 0x" << std::hex
2694 << p->shard_info->offset << std::dec << " dirty" << dendl;
2695 p->dirty = true;
2696 }
2697 ++start;
2698 }
2699}
2700
2701BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2702 uint64_t offset)
2703{
2704 Extent dummy(offset);
2705 return extent_map.find(dummy);
2706}
2707
7c673cae
FG
2708BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2709 uint64_t offset)
2710{
2711 Extent dummy(offset);
2712 auto fp = extent_map.lower_bound(dummy);
2713 if (fp != extent_map.begin()) {
2714 --fp;
2715 if (fp->logical_end() <= offset) {
2716 ++fp;
2717 }
2718 }
2719 return fp;
2720}
2721
2722BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2723 uint64_t offset) const
2724{
2725 Extent dummy(offset);
2726 auto fp = extent_map.lower_bound(dummy);
2727 if (fp != extent_map.begin()) {
2728 --fp;
2729 if (fp->logical_end() <= offset) {
2730 ++fp;
2731 }
2732 }
2733 return fp;
2734}
2735
2736bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2737{
2738 auto fp = seek_lextent(offset);
2739 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2740 return false;
2741 }
2742 return true;
2743}
2744
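// compress_extent_map() merges logically and physically adjacent extents that
// share a blob within [offset, offset + length), never merging across shard
// boundaries; it returns the number of extents removed.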
2745int BlueStore::ExtentMap::compress_extent_map(
2746 uint64_t offset,
2747 uint64_t length)
2748{
2749 auto cct = onode->c->store->cct; //used by dout
2750 if (extent_map.empty())
2751 return 0;
2752 int removed = 0;
2753 auto p = seek_lextent(offset);
2754 if (p != extent_map.begin()) {
2755 --p; // start to the left of offset
2756 }
2757 // the caller should have just written to this region
2758 assert(p != extent_map.end());
2759
2760 // identify the *next* shard
2761 auto pshard = shards.begin();
2762 while (pshard != shards.end() &&
2763 p->logical_offset >= pshard->shard_info->offset) {
2764 ++pshard;
2765 }
2766 uint64_t shard_end;
2767 if (pshard != shards.end()) {
2768 shard_end = pshard->shard_info->offset;
2769 } else {
2770 shard_end = OBJECT_MAX_SIZE;
2771 }
2772
2773 auto n = p;
2774 for (++n; n != extent_map.end(); p = n++) {
2775 if (n->logical_offset > offset + length) {
2776 break; // stop after end
2777 }
2778 while (n != extent_map.end() &&
2779 p->logical_end() == n->logical_offset &&
2780 p->blob == n->blob &&
2781 p->blob_offset + p->length == n->blob_offset &&
2782 n->logical_offset < shard_end) {
2783 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2784 << " next shard 0x" << shard_end << std::dec
2785 << " merging " << *p << " and " << *n << dendl;
2786 p->length += n->length;
2787 rm(n++);
2788 ++removed;
2789 }
2790 if (n == extent_map.end()) {
2791 break;
2792 }
2793 if (n->logical_offset >= shard_end) {
2794 assert(pshard != shards.end());
2795 ++pshard;
2796 if (pshard != shards.end()) {
2797 shard_end = pshard->shard_info->offset;
2798 } else {
2799 shard_end = OBJECT_MAX_SIZE;
2800 }
2801 }
2802 }
2803 if (removed && onode) {
2804 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2805 }
2806 return removed;
2807}
2808
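// punch_hole() dereferences [offset, offset + length) from the logical map
// and queues the released pieces on old_extents. Four cases are handled
// below: an extent spanning the whole hole is split and its middle
// dereferenced; an extent straddling the start keeps its head; an extent
// fully inside the hole is removed entirely; and an extent straddling the
// end keeps its tail.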
2809void BlueStore::ExtentMap::punch_hole(
2810 CollectionRef &c,
2811 uint64_t offset,
2812 uint64_t length,
2813 old_extent_map_t *old_extents)
2814{
2815 auto p = seek_lextent(offset);
2816 uint64_t end = offset + length;
2817 while (p != extent_map.end()) {
2818 if (p->logical_offset >= end) {
2819 break;
2820 }
2821 if (p->logical_offset < offset) {
2822 if (p->logical_end() > end) {
2823 // split and deref middle
2824 uint64_t front = offset - p->logical_offset;
2825 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
2826 length, p->blob);
2827 old_extents->push_back(*oe);
2828 add(end,
2829 p->blob_offset + front + length,
2830 p->length - front - length,
2831 p->blob);
2832 p->length = front;
2833 break;
2834 } else {
2835 // deref tail
2836 assert(p->logical_end() > offset); // else seek_lextent bug
2837 uint64_t keep = offset - p->logical_offset;
2838 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
2839 p->length - keep, p->blob);
2840 old_extents->push_back(*oe);
2841 p->length = keep;
2842 ++p;
2843 continue;
2844 }
2845 }
2846 if (p->logical_offset + p->length <= end) {
2847 // deref whole lextent
2848 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2849 p->length, p->blob);
2850 old_extents->push_back(*oe);
2851 rm(p++);
2852 continue;
2853 }
2854 // deref head
2855 uint64_t keep = p->logical_end() - end;
2856 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2857 p->length - keep, p->blob);
2858 old_extents->push_back(*oe);
2859
2860 add(end, p->blob_offset + p->length - keep, keep, p->blob);
2861 rm(p);
2862 break;
2863 }
2864}
2865
2866BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
2867 CollectionRef &c,
2868 uint64_t logical_offset,
2869 uint64_t blob_offset, uint64_t length, BlobRef b,
2870 old_extent_map_t *old_extents)
2871{
 2872	  // We need a completely initialized Blob to increment its ref counters.
2873 assert(b->get_blob().get_logical_length() != 0);
2874
 2875	  // Do get_ref prior to punch_hole to prevent putting a reused blob into the
 2876	  // old_extents list if we overwrite the blob completely.
2877 // This might happen during WAL overwrite.
2878 b->get_ref(onode->c, blob_offset, length);
2879
2880 if (old_extents) {
2881 punch_hole(c, logical_offset, length, old_extents);
2882 }
2883
2884 Extent *le = new Extent(logical_offset, blob_offset, length, b);
2885 extent_map.insert(*le);
2886 if (spans_shard(logical_offset, length)) {
2887 request_reshard(logical_offset, logical_offset + length);
2888 }
2889 return le;
2890}
2891
2892BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
2893 BlobRef lb,
2894 uint32_t blob_offset,
2895 uint32_t pos)
2896{
2897 auto cct = onode->c->store->cct; //used by dout
2898
2899 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
2900 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
2901 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
2902 << dendl;
2903 BlobRef rb = onode->c->new_blob();
2904 lb->split(onode->c, blob_offset, rb.get());
2905
2906 for (auto ep = seek_lextent(pos);
2907 ep != extent_map.end() && ep->logical_offset < end_pos;
2908 ++ep) {
2909 if (ep->blob != lb) {
2910 continue;
2911 }
2912 if (ep->logical_offset < pos) {
2913 // split extent
2914 size_t left = pos - ep->logical_offset;
2915 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
2916 extent_map.insert(*ne);
2917 ep->length = left;
2918 dout(30) << __func__ << " split " << *ep << dendl;
2919 dout(30) << __func__ << " to " << *ne << dendl;
2920 } else {
2921 // switch blob
2922 assert(ep->blob_offset >= blob_offset);
2923
2924 ep->blob = rb;
2925 ep->blob_offset -= blob_offset;
2926 dout(30) << __func__ << " adjusted " << *ep << dendl;
2927 }
2928 }
2929 return rb;
2930}
2931
2932// Onode
2933
2934#undef dout_prefix
2935#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
2936
2937void BlueStore::Onode::flush()
2938{
2939 if (flushing_count.load()) {
2940 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
2941 std::unique_lock<std::mutex> l(flush_lock);
2942 while (flushing_count.load()) {
2943 flush_cond.wait(l);
2944 }
2945 }
2946 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
2947}
2948
2949// =======================================================
2950// WriteContext
2951
2952/// Checks for writes to the same pextent within a blob
2953bool BlueStore::WriteContext::has_conflict(
2954 BlobRef b,
2955 uint64_t loffs,
2956 uint64_t loffs_end,
2957 uint64_t min_alloc_size)
2958{
2959 assert((loffs % min_alloc_size) == 0);
2960 assert((loffs_end % min_alloc_size) == 0);
2961 for (auto w : writes) {
2962 if (b == w.b) {
2963 auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
224ce89b 2964 auto loffs2_end = P2ROUNDUP(w.logical_offset + w.length0, min_alloc_size);
7c673cae 2965 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 2966 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
2967 return true;
2968 }
2969 }
2970 }
2971 return false;
2972}
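// Worked example (hypothetical values): with min_alloc_size = 0x1000, a
// queued write at logical_offset 0x1800 with length0 0x400 occupies the
// aligned range [0x1000, 0x2000); a new write covering [0x1000, 0x3000) on
// the same blob therefore reports a conflict, while one covering
// [0x2000, 0x3000) does not.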
2973
2974// =======================================================
2975
2976// DeferredBatch
2977#undef dout_prefix
2978#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
2979
2980void BlueStore::DeferredBatch::prepare_write(
2981 CephContext *cct,
2982 uint64_t seq, uint64_t offset, uint64_t length,
2983 bufferlist::const_iterator& blp)
2984{
2985 _discard(cct, offset, length);
2986 auto i = iomap.insert(make_pair(offset, deferred_io()));
2987 assert(i.second); // this should be a new insertion
2988 i.first->second.seq = seq;
2989 blp.copy(length, i.first->second.bl);
31f18b77
FG
2990 i.first->second.bl.reassign_to_mempool(
2991 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
2992 dout(20) << __func__ << " seq " << seq
2993 << " 0x" << std::hex << offset << "~" << length
2994 << " crc " << i.first->second.bl.crc32c(-1)
2995 << std::dec << dendl;
2996 seq_bytes[seq] += length;
2997#ifdef DEBUG_DEFERRED
2998 _audit(cct);
2999#endif
3000}
3001
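// _discard() removes any previously queued deferred I/O overlapping
// [offset, offset + length) so the new write can be inserted: a preceding
// entry that extends into the range keeps only its head (and, if it reaches
// past the range, its tail is re-queued at offset + length), entries wholly
// inside the range are dropped, and an entry extending beyond it is
// truncated at the front. seq_bytes is adjusted to keep the per-sequence
// byte accounting consistent.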
3002void BlueStore::DeferredBatch::_discard(
3003 CephContext *cct, uint64_t offset, uint64_t length)
3004{
3005 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3006 << std::dec << dendl;
3007 auto p = iomap.lower_bound(offset);
3008 if (p != iomap.begin()) {
3009 --p;
3010 auto end = p->first + p->second.bl.length();
3011 if (end > offset) {
3012 bufferlist head;
3013 head.substr_of(p->second.bl, 0, offset - p->first);
3014 dout(20) << __func__ << " keep head " << p->second.seq
3015 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3016 << " -> 0x" << head.length() << std::dec << dendl;
3017 auto i = seq_bytes.find(p->second.seq);
224ce89b 3018 assert(i != seq_bytes.end());
7c673cae
FG
3019 if (end > offset + length) {
3020 bufferlist tail;
3021 tail.substr_of(p->second.bl, offset + length - p->first,
3022 end - (offset + length));
3023 dout(20) << __func__ << " keep tail " << p->second.seq
3024 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3025 << " -> 0x" << tail.length() << std::dec << dendl;
3026 auto &n = iomap[offset + length];
3027 n.bl.swap(tail);
3028 n.seq = p->second.seq;
3029 i->second -= length;
3030 } else {
3031 i->second -= end - offset;
3032 }
224ce89b 3033 assert(i->second >= 0);
7c673cae
FG
3034 p->second.bl.swap(head);
3035 }
3036 ++p;
3037 }
3038 while (p != iomap.end()) {
3039 if (p->first >= offset + length) {
3040 break;
3041 }
3042 auto i = seq_bytes.find(p->second.seq);
224ce89b 3043 assert(i != seq_bytes.end());
7c673cae
FG
3044 auto end = p->first + p->second.bl.length();
3045 if (end > offset + length) {
3046 unsigned drop_front = offset + length - p->first;
3047 unsigned keep_tail = end - (offset + length);
3048 dout(20) << __func__ << " truncate front " << p->second.seq
3049 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3050 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3051 << " to 0x" << (offset + length) << "~" << keep_tail
3052 << std::dec << dendl;
3053 auto &s = iomap[offset + length];
3054 s.seq = p->second.seq;
3055 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3056 i->second -= drop_front;
3057 } else {
3058 dout(20) << __func__ << " drop " << p->second.seq
3059 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3060 << std::dec << dendl;
3061 i->second -= p->second.bl.length();
3062 }
224ce89b 3063 assert(i->second >= 0);
7c673cae
FG
3064 p = iomap.erase(p);
3065 }
3066}
3067
3068void BlueStore::DeferredBatch::_audit(CephContext *cct)
3069{
3070 map<uint64_t,int> sb;
3071 for (auto p : seq_bytes) {
3072 sb[p.first] = 0; // make sure we have the same set of keys
3073 }
3074 uint64_t pos = 0;
3075 for (auto& p : iomap) {
3076 assert(p.first >= pos);
3077 sb[p.second.seq] += p.second.bl.length();
3078 pos = p.first + p.second.bl.length();
3079 }
3080 assert(sb == seq_bytes);
3081}
3082
3083
3084// Collection
3085
3086#undef dout_prefix
3087#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3088
3089BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
3090 : store(ns),
3091 cache(c),
3092 cid(cid),
3093 lock("BlueStore::Collection::lock", true, false),
3094 exists(true),
3095 onode_map(c)
3096{
3097}
3098
3099void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3100{
3101 assert(!b->shared_blob);
3102 const bluestore_blob_t& blob = b->get_blob();
3103 if (!blob.is_shared()) {
3104 b->shared_blob = new SharedBlob(this);
3105 return;
3106 }
3107
3108 b->shared_blob = shared_blob_set.lookup(sbid);
3109 if (b->shared_blob) {
3110 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3111 << std::dec << " had " << *b->shared_blob << dendl;
3112 } else {
3113 b->shared_blob = new SharedBlob(sbid, this);
3114 shared_blob_set.add(this, b->shared_blob.get());
3115 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3116 << std::dec << " opened " << *b->shared_blob
3117 << dendl;
3118 }
3119}
3120
3121void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3122{
3123 if (!sb->is_loaded()) {
3124
3125 bufferlist v;
3126 string key;
3127 auto sbid = sb->get_sbid();
3128 get_shared_blob_key(sbid, &key);
3129 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3130 if (r < 0) {
3131 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3132 << std::dec << " not found at key "
3133 << pretty_binary_string(key) << dendl;
3134 assert(0 == "uh oh, missing shared_blob");
3135 }
3136
3137 sb->loaded = true;
3138 sb->persistent = new bluestore_shared_blob_t(sbid);
3139 bufferlist::iterator p = v.begin();
3140 ::decode(*(sb->persistent), p);
3141 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3142 << std::dec << " loaded shared_blob " << *sb << dendl;
3143 }
3144}
3145
3146void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3147{
7c673cae 3148 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
31f18b77 3149 assert(!b->shared_blob->is_loaded());
7c673cae
FG
3150
3151 // update blob
31f18b77 3152 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 3153 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
3154
3155 // update shared blob
3156 b->shared_blob->loaded = true;
3157 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3158 shared_blob_set.add(this, b->shared_blob.get());
3159 for (auto p : blob.get_extents()) {
3160 if (p.is_valid()) {
3161 b->shared_blob->get_ref(
3162 p.offset,
3163 p.length);
3164 }
3165 }
3166 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3167}
3168
31f18b77
FG
3169uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3170{
3171 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3172 assert(sb->is_loaded());
3173
3174 uint64_t sbid = sb->get_sbid();
3175 shared_blob_set.remove(sb);
3176 sb->loaded = false;
3177 delete sb->persistent;
3178 sb->sbid_unloaded = 0;
3179 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3180 return sbid;
3181}
3182
7c673cae
FG
3183BlueStore::OnodeRef BlueStore::Collection::get_onode(
3184 const ghobject_t& oid,
3185 bool create)
3186{
3187 assert(create ? lock.is_wlocked() : lock.is_locked());
3188
3189 spg_t pgid;
3190 if (cid.is_pg(&pgid)) {
3191 if (!oid.match(cnode.bits, pgid.ps())) {
3192 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3193 << pgid << " bits " << cnode.bits << dendl;
3194 ceph_abort();
3195 }
3196 }
3197
3198 OnodeRef o = onode_map.lookup(oid);
3199 if (o)
3200 return o;
3201
31f18b77 3202 mempool::bluestore_cache_other::string key;
7c673cae
FG
3203 get_object_key(store->cct, oid, &key);
3204
3205 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3206 << pretty_binary_string(key) << dendl;
3207
3208 bufferlist v;
3209 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3210 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3211 Onode *on;
3212 if (v.length() == 0) {
3213 assert(r == -ENOENT);
3214 if (!store->cct->_conf->bluestore_debug_misc &&
3215 !create)
3216 return OnodeRef();
3217
3218 // new object, new onode
3219 on = new Onode(this, oid, key);
3220 } else {
3221 // loaded
3222 assert(r >= 0);
3223 on = new Onode(this, oid, key);
3224 on->exists = true;
31f18b77 3225 bufferptr::iterator p = v.front().begin_deep();
7c673cae
FG
3226 on->onode.decode(p);
3227
3228 // initialize extent_map
3229 on->extent_map.decode_spanning_blobs(p);
3230 if (on->onode.extent_map_shards.empty()) {
3231 denc(on->extent_map.inline_bl, p);
3232 on->extent_map.decode_some(on->extent_map.inline_bl);
3233 } else {
3234 on->extent_map.init_shards(false, false);
3235 }
3236 }
3237 o.reset(on);
3238 return onode_map.add(oid, o);
3239}
3240
3241void BlueStore::Collection::split_cache(
3242 Collection *dest)
3243{
3244 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3245
3246 // lock (one or both) cache shards
3247 std::lock(cache->lock, dest->cache->lock);
3248 std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
3249 std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);
3250
3251 int destbits = dest->cnode.bits;
3252 spg_t destpg;
3253 bool is_pg = dest->cid.is_pg(&destpg);
3254 assert(is_pg);
3255
3256 auto p = onode_map.onode_map.begin();
3257 while (p != onode_map.onode_map.end()) {
3258 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3259 // onode does not belong to this child
3260 ++p;
3261 } else {
3262 OnodeRef o = p->second;
3263 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3264 << dendl;
3265
3266 cache->_rm_onode(p->second);
3267 p = onode_map.onode_map.erase(p);
3268
3269 o->c = dest;
3270 dest->cache->_add_onode(o, 1);
3271 dest->onode_map.onode_map[o->oid] = o;
3272 dest->onode_map.cache = dest->cache;
3273
3274 // move over shared blobs and buffers. cover shared blobs from
3275 // both extent map and spanning blob map (the full extent map
3276 // may not be faulted in)
3277 vector<SharedBlob*> sbvec;
3278 for (auto& e : o->extent_map.extent_map) {
3279 sbvec.push_back(e.blob->shared_blob.get());
3280 }
3281 for (auto& b : o->extent_map.spanning_blob_map) {
3282 sbvec.push_back(b.second->shared_blob.get());
3283 }
3284 for (auto sb : sbvec) {
3285 if (sb->coll == dest) {
3286 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3287 << dendl;
3288 continue;
3289 }
3290 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
3291 sb->coll = dest;
31f18b77
FG
3292 if (sb->get_sbid()) {
3293 ldout(store->cct, 20) << __func__
3294 << " moving registration " << *sb << dendl;
3295 shared_blob_set.remove(sb);
3296 dest->shared_blob_set.add(dest, sb);
3297 }
7c673cae 3298 if (dest->cache != cache) {
7c673cae
FG
3299 for (auto& i : sb->bc.buffer_map) {
3300 if (!i.second->is_writing()) {
3301 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3302 << dendl;
3303 dest->cache->_move_buffer(cache, i.second.get());
3304 }
3305 }
3306 }
3307 }
7c673cae
FG
3308 }
3309 }
3310}
3311
7c673cae
FG
3312// =======================================================
3313
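// The mempool thread periodically estimates the average metadata bytes per
// onode from the mempool accounting and asks every cache shard to trim to
// its share of the configured cache size, split between metadata and data by
// the cache_meta_ratio / cache_data_ratio values computed at mount time.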
3314void *BlueStore::MempoolThread::entry()
3315{
3316 Mutex::Locker l(lock);
3317 while (!stop) {
31f18b77
FG
3318 uint64_t meta_bytes =
3319 mempool::bluestore_cache_other::allocated_bytes() +
3320 mempool::bluestore_cache_onode::allocated_bytes();
3321 uint64_t onode_num =
3322 mempool::bluestore_cache_onode::allocated_items();
3323
3324 if (onode_num < 2) {
3325 onode_num = 2;
3326 }
3327
3328 float bytes_per_onode = (float)meta_bytes / (float)onode_num;
3329 size_t num_shards = store->cache_shards.size();
3330 float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
3331 // A little sloppy but should be close enough
224ce89b 3332 uint64_t shard_target = target_ratio * (store->cache_size / num_shards);
31f18b77
FG
3333
3334 for (auto i : store->cache_shards) {
3335 i->trim(shard_target,
3336 store->cache_meta_ratio,
3337 store->cache_data_ratio,
3338 bytes_per_onode);
3339 }
3340
3341 store->_update_cache_logger();
3342
7c673cae
FG
3343 utime_t wait;
3344 wait += store->cct->_conf->bluestore_cache_trim_interval;
3345 cond.WaitInterval(lock, wait);
3346 }
3347 stop = false;
3348 return NULL;
3349}
3350
3351// =======================================================
3352
31f18b77
FG
3353// OmapIteratorImpl
3354
3355#undef dout_prefix
3356#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3357
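// OmapIteratorImpl wraps a raw KeyValueDB iterator over an onode's omap
// keyspace. head and tail are the encoded key bounds for this onode's nid,
// so iteration stays within [head, tail] and user-visible keys are recovered
// via decode_omap_key().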
3358BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3359 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3360 : c(c), o(o), it(it)
3361{
3362 RWLock::RLocker l(c->lock);
3363 if (o->onode.has_omap()) {
3364 get_omap_key(o->onode.nid, string(), &head);
3365 get_omap_tail(o->onode.nid, &tail);
3366 it->lower_bound(head);
3367 }
3368}
3369
3370int BlueStore::OmapIteratorImpl::seek_to_first()
3371{
3372 RWLock::RLocker l(c->lock);
3373 if (o->onode.has_omap()) {
3374 it->lower_bound(head);
3375 } else {
3376 it = KeyValueDB::Iterator();
3377 }
3378 return 0;
3379}
3380
3381int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3382{
3383 RWLock::RLocker l(c->lock);
3384 if (o->onode.has_omap()) {
3385 string key;
3386 get_omap_key(o->onode.nid, after, &key);
3387 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3388 << pretty_binary_string(key) << dendl;
3389 it->upper_bound(key);
3390 } else {
3391 it = KeyValueDB::Iterator();
3392 }
3393 return 0;
3394}
3395
3396int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3397{
3398 RWLock::RLocker l(c->lock);
3399 if (o->onode.has_omap()) {
3400 string key;
3401 get_omap_key(o->onode.nid, to, &key);
3402 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3403 << pretty_binary_string(key) << dendl;
3404 it->lower_bound(key);
3405 } else {
3406 it = KeyValueDB::Iterator();
3407 }
3408 return 0;
3409}
3410
3411bool BlueStore::OmapIteratorImpl::valid()
3412{
3413 RWLock::RLocker l(c->lock);
3414 bool r = o->onode.has_omap() && it && it->valid() &&
3415 it->raw_key().second <= tail;
3416 if (it && it->valid()) {
3417 ldout(c->store->cct,20) << __func__ << " is at "
3418 << pretty_binary_string(it->raw_key().second)
3419 << dendl;
3420 }
3421 return r;
3422}
3423
3424int BlueStore::OmapIteratorImpl::next(bool validate)
3425{
3426 RWLock::RLocker l(c->lock);
3427 if (o->onode.has_omap()) {
3428 it->next();
3429 return 0;
3430 } else {
3431 return -1;
3432 }
3433}
3434
3435string BlueStore::OmapIteratorImpl::key()
3436{
3437 RWLock::RLocker l(c->lock);
3438 assert(it->valid());
3439 string db_key = it->raw_key().second;
3440 string user_key;
3441 decode_omap_key(db_key, &user_key);
3442 return user_key;
3443}
3444
3445bufferlist BlueStore::OmapIteratorImpl::value()
3446{
3447 RWLock::RLocker l(c->lock);
3448 assert(it->valid());
3449 return it->value();
3450}
3451
3452
3453// =====================================
3454
7c673cae
FG
3455#undef dout_prefix
3456#define dout_prefix *_dout << "bluestore(" << path << ") "
3457
3458
3459static void aio_cb(void *priv, void *priv2)
3460{
3461 BlueStore *store = static_cast<BlueStore*>(priv);
3462 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3463 c->aio_finish(store);
3464}
3465
3466BlueStore::BlueStore(CephContext *cct, const string& path)
3467 : ObjectStore(cct, path),
3468 throttle_bytes(cct, "bluestore_throttle_bytes",
3469 cct->_conf->bluestore_throttle_bytes),
3470 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3471 cct->_conf->bluestore_throttle_bytes +
3472 cct->_conf->bluestore_throttle_deferred_bytes),
3473 kv_sync_thread(this),
31f18b77 3474 kv_finalize_thread(this),
7c673cae
FG
3475 mempool_thread(this)
3476{
3477 _init_logger();
3478 cct->_conf->add_observer(this);
3479 set_cache_shards(1);
7c673cae
FG
3480}
3481
3482BlueStore::BlueStore(CephContext *cct,
3483 const string& path,
3484 uint64_t _min_alloc_size)
3485 : ObjectStore(cct, path),
3486 throttle_bytes(cct, "bluestore_throttle_bytes",
3487 cct->_conf->bluestore_throttle_bytes),
3488 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3489 cct->_conf->bluestore_throttle_bytes +
3490 cct->_conf->bluestore_throttle_deferred_bytes),
3491 kv_sync_thread(this),
31f18b77 3492 kv_finalize_thread(this),
7c673cae
FG
3493 min_alloc_size(_min_alloc_size),
3494 min_alloc_size_order(ctz(_min_alloc_size)),
3495 mempool_thread(this)
3496{
3497 _init_logger();
3498 cct->_conf->add_observer(this);
3499 set_cache_shards(1);
7c673cae
FG
3500}
3501
3502BlueStore::~BlueStore()
3503{
3504 for (auto f : finishers) {
3505 delete f;
3506 }
3507 finishers.clear();
3508
3509 cct->_conf->remove_observer(this);
3510 _shutdown_logger();
3511 assert(!mounted);
3512 assert(db == NULL);
3513 assert(bluefs == NULL);
3514 assert(fsid_fd < 0);
3515 assert(path_fd < 0);
3516 for (auto i : cache_shards) {
3517 delete i;
3518 }
3519 cache_shards.clear();
3520}
3521
3522const char **BlueStore::get_tracked_conf_keys() const
3523{
3524 static const char* KEYS[] = {
3525 "bluestore_csum_type",
3526 "bluestore_compression_mode",
3527 "bluestore_compression_algorithm",
3528 "bluestore_compression_min_blob_size",
3529 "bluestore_compression_min_blob_size_ssd",
3530 "bluestore_compression_min_blob_size_hdd",
3531 "bluestore_compression_max_blob_size",
3532 "bluestore_compression_max_blob_size_ssd",
3533 "bluestore_compression_max_blob_size_hdd",
c07f9fc5 3534 "bluestore_compression_required_ratio",
7c673cae
FG
3535 "bluestore_max_alloc_size",
3536 "bluestore_prefer_deferred_size",
31f18b77
FG
3537 "bluestore_deferred_batch_ops",
3538 "bluestore_deferred_batch_ops_hdd",
3539 "bluestore_deferred_batch_ops_ssd",
7c673cae
FG
3540 "bluestore_throttle_bytes",
3541 "bluestore_throttle_deferred_bytes",
3542 "bluestore_throttle_cost_per_io_hdd",
3543 "bluestore_throttle_cost_per_io_ssd",
3544 "bluestore_throttle_cost_per_io",
3545 "bluestore_max_blob_size",
3546 "bluestore_max_blob_size_ssd",
3547 "bluestore_max_blob_size_hdd",
3548 NULL
3549 };
3550 return KEYS;
3551}
3552
3553void BlueStore::handle_conf_change(const struct md_config_t *conf,
3554 const std::set<std::string> &changed)
3555{
3556 if (changed.count("bluestore_csum_type")) {
3557 _set_csum();
3558 }
3559 if (changed.count("bluestore_compression_mode") ||
3560 changed.count("bluestore_compression_algorithm") ||
3561 changed.count("bluestore_compression_min_blob_size") ||
3562 changed.count("bluestore_compression_max_blob_size")) {
3563 if (bdev) {
3564 _set_compression();
3565 }
3566 }
3567 if (changed.count("bluestore_max_blob_size") ||
3568 changed.count("bluestore_max_blob_size_ssd") ||
3569 changed.count("bluestore_max_blob_size_hdd")) {
3570 if (bdev) {
3571 // only after startup
3572 _set_blob_size();
3573 }
3574 }
3575 if (changed.count("bluestore_prefer_deferred_size") ||
3576 changed.count("bluestore_max_alloc_size") ||
3577 changed.count("bluestore_deferred_batch_ops") ||
3578 changed.count("bluestore_deferred_batch_ops_hdd") ||
3579 changed.count("bluestore_deferred_batch_ops_ssd")) {
3580 if (bdev) {
3581 // only after startup
3582 _set_alloc_sizes();
3583 }
3584 }
3585 if (changed.count("bluestore_throttle_cost_per_io") ||
3586 changed.count("bluestore_throttle_cost_per_io_hdd") ||
3587 changed.count("bluestore_throttle_cost_per_io_ssd")) {
3588 if (bdev) {
3589 _set_throttle_params();
3590 }
3591 }
3592 if (changed.count("bluestore_throttle_bytes")) {
3593 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
3594 throttle_deferred_bytes.reset_max(
3595 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3596 }
3597 if (changed.count("bluestore_throttle_deferred_bytes")) {
3598 throttle_deferred_bytes.reset_max(
3599 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3600 }
3601}
3602
3603void BlueStore::_set_compression()
3604{
224ce89b
WB
3605 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
3606 if (m) {
3607 comp_mode = *m;
3608 } else {
3609 derr << __func__ << " unrecognized value '"
3610 << cct->_conf->bluestore_compression_mode
3611 << "' for bluestore_compression_mode, reverting to 'none'"
3612 << dendl;
3613 comp_mode = Compressor::COMP_NONE;
3614 }
3615
3616 compressor = nullptr;
3617
3618 if (comp_mode == Compressor::COMP_NONE) {
3619 dout(10) << __func__ << " compression mode set to 'none', "
 3620	             << "ignoring other compression settings" << dendl;
3621 return;
3622 }
3623
7c673cae
FG
 3624	  if (cct->_conf->bluestore_compression_min_blob_size) {
 3625	    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
3626 } else {
3627 assert(bdev);
3628 if (bdev->is_rotational()) {
3629 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
3630 } else {
3631 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
3632 }
3633 }
3634
3635 if (cct->_conf->bluestore_compression_max_blob_size) {
3636 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
3637 } else {
3638 assert(bdev);
3639 if (bdev->is_rotational()) {
3640 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
3641 } else {
3642 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
3643 }
3644 }
3645
7c673cae
FG
3646 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
3647 if (!alg_name.empty()) {
3648 compressor = Compressor::create(cct, alg_name);
3649 if (!compressor) {
3650 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
3651 << dendl;
3652 }
3653 }
3654
3655 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
3656 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
3657 << dendl;
3658}
3659
3660void BlueStore::_set_csum()
3661{
3662 csum_type = Checksummer::CSUM_NONE;
3663 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
3664 if (t > Checksummer::CSUM_NONE)
3665 csum_type = t;
3666
3667 dout(10) << __func__ << " csum_type "
3668 << Checksummer::get_csum_type_string(csum_type)
3669 << dendl;
3670}
3671
3672void BlueStore::_set_throttle_params()
3673{
3674 if (cct->_conf->bluestore_throttle_cost_per_io) {
3675 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
3676 } else {
3677 assert(bdev);
3678 if (bdev->is_rotational()) {
3679 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
3680 } else {
3681 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
3682 }
3683 }
3684
3685 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
3686 << dendl;
3687}
3688void BlueStore::_set_blob_size()
3689{
3690 if (cct->_conf->bluestore_max_blob_size) {
3691 max_blob_size = cct->_conf->bluestore_max_blob_size;
3692 } else {
3693 assert(bdev);
3694 if (bdev->is_rotational()) {
3695 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
3696 } else {
3697 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
3698 }
3699 }
3700 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
3701 << std::dec << dendl;
3702}
3703
31f18b77
FG
3704int BlueStore::_set_cache_sizes()
3705{
224ce89b
WB
3706 assert(bdev);
3707 if (cct->_conf->bluestore_cache_size) {
3708 cache_size = cct->_conf->bluestore_cache_size;
3709 } else {
3710 // choose global cache size based on backend type
3711 if (bdev->is_rotational()) {
3712 cache_size = cct->_conf->bluestore_cache_size_hdd;
3713 } else {
3714 cache_size = cct->_conf->bluestore_cache_size_ssd;
3715 }
3716 }
31f18b77
FG
3717 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
3718 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
224ce89b
WB
3719
3720 double cache_kv_max = cct->_conf->bluestore_cache_kv_max;
3721 double cache_kv_max_ratio = 0;
3722
3723 // if cache_kv_max is negative, disable it
3724 if (cache_size > 0 && cache_kv_max >= 0) {
3725 cache_kv_max_ratio = (double) cache_kv_max / (double) cache_size;
3726 if (cache_kv_max_ratio < 1.0 && cache_kv_max_ratio < cache_kv_ratio) {
3727 dout(1) << __func__ << " max " << cache_kv_max_ratio
3728 << " < ratio " << cache_kv_ratio
3729 << dendl;
3730 cache_meta_ratio = cache_meta_ratio + cache_kv_ratio - cache_kv_max_ratio;
3731 cache_kv_ratio = cache_kv_max_ratio;
3732 }
3733 }
3734
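 // Worked example (illustrative values): cache_size = 3 GiB,
 // cache_kv_ratio = 0.99, bluestore_cache_kv_max = 512 MiB
 // -> cache_kv_max_ratio ~= 0.167, so cache_kv_ratio is clamped to ~0.167
 // and the remaining ~0.823 is shifted onto cache_meta_ratio above.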
3735 cache_data_ratio =
3736 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
3737
224ce89b 3738 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
d2e6a577 3739 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
224ce89b 3740 << ") must be in range [0,1.0]" << dendl;
3741 return -EINVAL;
3742 }
224ce89b 3743 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
d2e6a577 3744 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
224ce89b 3745 << ") must be in range [0,1.0]" << dendl;
3746 return -EINVAL;
3747 }
3748 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
d2e6a577 3749 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3750 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
3751 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
3752 << dendl;
3753 return -EINVAL;
3754 }
3755 if (cache_data_ratio < 0) {
3756 // deal with floating point imprecision
3757 cache_data_ratio = 0;
3758 }
3759 dout(1) << __func__ << " cache_size " << cache_size
3760 << " meta " << cache_meta_ratio
3761 << " kv " << cache_kv_ratio
3762 << " data " << cache_data_ratio
3763 << dendl;
3764 return 0;
3765}
3766
3767void BlueStore::_init_logger()
3768{
3769 PerfCountersBuilder b(cct, "bluestore",
3770 l_bluestore_first, l_bluestore_last);
3771 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
3772 "Average kv_thread flush latency",
3773 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
3774 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
3775 "Average kv_thread commit latency");
3776 b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
3777 "Average kv_thread sync latency",
3778 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
3779 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
3780 "Average prepare state latency");
3781 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
3782 "Average aio_wait state latency",
3783 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
3784 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
3785 "Average io_done state latency");
3786 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
3787 "Average kv_queued state latency");
3788 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
3789 "Average kv_committing state latency");
3790 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
3791 "Average kv_done state latency");
3792 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
3793 "Average deferred_queued state latency");
3794 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
3795 "Average deferred_aio_wait state latency");
3796 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
3797 "Average cleanup state latency");
3798 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
3799 "Average finishing state latency");
3800 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
3801 "Average done state latency");
3802 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
3803 "Average submit throttle latency",
3804 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
3805 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
3806 "Average submit latency",
3807 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
3808 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
3809 "Average commit latency",
3810 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
3811 b.add_time_avg(l_bluestore_read_lat, "read_lat",
3812 "Average read latency",
3813 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
3814 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
3815 "Average read onode metadata latency");
3816 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
3817 "Average time spent waiting for aio during reads");
3818 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
3819 "Average compress latency");
3820 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
3821 "Average decompress latency");
3822 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
3823 "Average checksum latency");
3824 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
3825 "Sum for beneficial compress ops");
3826 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
3827 "Sum for compress ops rejected due to low net gain of space");
3828 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
3829 "Sum for write-op padded bytes");
3830 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
3831 "Sum for deferred write ops");
3832 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
3833 "Sum for deferred write bytes", "def");
3834 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
3835 "Sum for write penalty read ops");
3836 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
3837 "Sum for allocated bytes");
3838 b.add_u64(l_bluestore_stored, "bluestore_stored",
3839 "Sum for stored bytes");
3840 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
3841 "Sum for stored compressed bytes");
3842 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
3843 "Sum for bytes allocated for compressed data");
3844 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
3845 "Sum for original bytes that were compressed");
3846
3847 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
3848 "Number of onodes in cache");
3849 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
3850 "Sum for onode-lookups hit in the cache");
3851 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
3852 "Sum for onode-lookups missed in the cache");
3853 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
3854 "Sum for onode-shard lookups hit in the cache");
3855 b.add_u64_counter(l_bluestore_onode_shard_misses,
3856 "bluestore_onode_shard_misses",
3857 "Sum for onode-shard lookups missed in the cache");
3858 b.add_u64(l_bluestore_extents, "bluestore_extents",
3859 "Number of extents in cache");
3860 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
3861 "Number of blobs in cache");
3862 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
3863 "Number of buffers in cache");
3864 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
3865 "Number of buffer bytes in cache");
3866 b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
3867 "Sum for bytes of read hit in the cache");
3868 b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
3869 "Sum for bytes of read missed in the cache");
3870
3871 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
3872 "Large aligned writes into fresh blobs");
3873 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
3874 "Large aligned writes into fresh blobs (bytes)");
3875 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
3876 "Large aligned writes into fresh blobs (blobs)");
3877 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
3878 "Small writes into existing or sparse small blobs");
3879 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
3880 "Small writes into existing or sparse small blobs (bytes)");
3881 b.add_u64_counter(l_bluestore_write_small_unused,
3882 "bluestore_write_small_unused",
3883 "Small writes into unused portion of existing blob");
3884 b.add_u64_counter(l_bluestore_write_small_deferred,
3885 "bluestore_write_small_deferred",
3886 "Small overwrites using deferred");
3887 b.add_u64_counter(l_bluestore_write_small_pre_read,
3888 "bluestore_write_small_pre_read",
3889 "Small writes that required we read some data (possibly "
3890 "cached) to fill out the block");
3891 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
3892 "Small write into new (sparse) blob");
3893
3894 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
3895 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
3896 "Onode extent map reshard events");
3897 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
3898 "Sum for blob splitting due to resharding");
3899 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
3900 "Sum for extents that have been removed due to compression");
3901 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
3902 "Sum for extents that have been merged due to garbage "
3903 "collection");
3904 logger = b.create_perf_counters();
3905 cct->get_perfcounters_collection()->add(logger);
3906}
3907
3908int BlueStore::_reload_logger()
3909{
3910 struct store_statfs_t store_statfs;
3911
3912 int r = statfs(&store_statfs);
3913 if (r >= 0) {
3914 logger->set(l_bluestore_allocated, store_statfs.allocated);
3915 logger->set(l_bluestore_stored, store_statfs.stored);
3916 logger->set(l_bluestore_compressed, store_statfs.compressed);
3917 logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
3918 logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
3919 }
3920 return r;
3921}
3922
3923void BlueStore::_shutdown_logger()
3924{
3925 cct->get_perfcounters_collection()->remove(logger);
3926 delete logger;
3927}
3928
3929int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
3930 uuid_d *fsid)
3931{
3932 bluestore_bdev_label_t label;
3933 int r = _read_bdev_label(cct, path, &label);
3934 if (r < 0)
3935 return r;
3936 *fsid = label.osd_uuid;
3937 return 0;
3938}
3939
3940int BlueStore::_open_path()
3941{
3942 assert(path_fd < 0);
224ce89b 3943 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY));
3944 if (path_fd < 0) {
3945 int r = -errno;
3946 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
3947 << dendl;
3948 return r;
3949 }
3950 return 0;
3951}
3952
3953void BlueStore::_close_path()
3954{
3955 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
3956 path_fd = -1;
3957}
3958
3959int BlueStore::_write_bdev_label(string path, bluestore_bdev_label_t label)
3960{
3961 dout(10) << __func__ << " path " << path << " label " << label << dendl;
3962 bufferlist bl;
3963 ::encode(label, bl);
3964 uint32_t crc = bl.crc32c(-1);
3965 ::encode(crc, bl);
3966 assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
3967 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
3968 z.zero();
3969 bl.append(std::move(z));
3970
224ce89b 3971 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY));
3972 if (fd < 0) {
3973 fd = -errno;
3974 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
3975 << dendl;
3976 return fd;
3977 }
3978 int r = bl.write_fd(fd);
3979 if (r < 0) {
3980 derr << __func__ << " failed to write to " << path
3981 << ": " << cpp_strerror(r) << dendl;
3982 }
3983 VOID_TEMP_FAILURE_RETRY(::close(fd));
3984 return r;
3985}
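// Label block layout as written above: the encoded bluestore_bdev_label_t,
// then a crc32c of the encoded bytes, zero-padded out to
// BDEV_LABEL_BLOCK_SIZE (4 KiB) so the label always occupies exactly the
// first block of the device.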
3986
3987int BlueStore::_read_bdev_label(CephContext* cct, string path,
3988 bluestore_bdev_label_t *label)
3989{
3990 dout(10) << __func__ << dendl;
224ce89b 3991 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY));
3992 if (fd < 0) {
3993 fd = -errno;
3994 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
3995 << dendl;
3996 return fd;
3997 }
3998 bufferlist bl;
3999 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4000 VOID_TEMP_FAILURE_RETRY(::close(fd));
4001 if (r < 0) {
4002 derr << __func__ << " failed to read from " << path
4003 << ": " << cpp_strerror(r) << dendl;
4004 return r;
4005 }
4006
4007 uint32_t crc, expected_crc;
4008 bufferlist::iterator p = bl.begin();
4009 try {
4010 ::decode(*label, p);
4011 bufferlist t;
4012 t.substr_of(bl, 0, p.get_off());
4013 crc = t.crc32c(-1);
4014 ::decode(expected_crc, p);
4015 }
4016 catch (buffer::error& e) {
4017 derr << __func__ << " unable to decode label at offset " << p.get_off()
4018 << ": " << e.what()
4019 << dendl;
4020 return -EINVAL;
4021 }
4022 if (crc != expected_crc) {
4023 derr << __func__ << " bad crc on label, expected " << expected_crc
4024 << " != actual " << crc << dendl;
4025 return -EIO;
4026 }
4027 dout(10) << __func__ << " got " << *label << dendl;
4028 return 0;
4029}
4030
4031int BlueStore::_check_or_set_bdev_label(
4032 string path, uint64_t size, string desc, bool create)
4033{
4034 bluestore_bdev_label_t label;
4035 if (create) {
4036 label.osd_uuid = fsid;
4037 label.size = size;
4038 label.btime = ceph_clock_now();
4039 label.description = desc;
4040 int r = _write_bdev_label(path, label);
4041 if (r < 0)
4042 return r;
4043 } else {
4044 int r = _read_bdev_label(cct, path, &label);
4045 if (r < 0)
4046 return r;
4047 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4048 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4049 << " and fsid " << fsid << " check bypassed" << dendl;
4050 }
4051 else if (label.osd_uuid != fsid) {
4052 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4053 << " does not match our fsid " << fsid << dendl;
4054 return -EIO;
4055 }
4056 }
4057 return 0;
4058}
4059
4060void BlueStore::_set_alloc_sizes(void)
4061{
4062 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4063
4064 if (cct->_conf->bluestore_prefer_deferred_size) {
4065 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4066 } else {
4067 assert(bdev);
4068 if (bdev->is_rotational()) {
4069 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4070 } else {
4071 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4072 }
4073 }
4074
4075 if (cct->_conf->bluestore_deferred_batch_ops) {
4076 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4077 } else {
4078 assert(bdev);
4079 if (bdev->is_rotational()) {
4080 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4081 } else {
4082 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4083 }
4084 }
4085
4086 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
4087 << std::dec << " order " << min_alloc_size_order
4088 << " max_alloc_size 0x" << std::hex << max_alloc_size
4089 << " prefer_deferred_size 0x" << prefer_deferred_size
4090 << std::dec
4091 << " deferred_batch_ops " << deferred_batch_ops
4092 << dendl;
4093}
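// Rough semantics of the values above: writes no larger than
// prefer_deferred_size are expected to take the deferred path (logged in the
// KV store first, applied to the block device later), and deferred_batch_ops
// bounds how many deferred ops accumulate before the queue is submitted.
// Both have separate HDD and SSD defaults.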
4094
4095int BlueStore::_open_bdev(bool create)
4096{
4097 assert(bdev == NULL);
4098 string p = path + "/block";
4099 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
4100 int r = bdev->open(p);
4101 if (r < 0)
4102 goto fail;
4103
4104 if (bdev->supported_bdev_label()) {
4105 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4106 if (r < 0)
4107 goto fail_close;
4108 }
4109
4110 // initialize global block parameters
4111 block_size = bdev->get_block_size();
4112 block_mask = ~(block_size - 1);
4113 block_size_order = ctz(block_size);
4114 assert(block_size == 1u << block_size_order);
4115 // and set cache_size based on device type
4116 r = _set_cache_sizes();
4117 if (r < 0) {
4118 goto fail_close;
4119 }
4120 return 0;
4121
4122 fail_close:
4123 bdev->close();
4124 fail:
4125 delete bdev;
4126 bdev = NULL;
4127 return r;
4128}
4129
4130void BlueStore::_close_bdev()
4131{
4132 assert(bdev);
4133 bdev->close();
4134 delete bdev;
4135 bdev = NULL;
4136}
4137
4138int BlueStore::_open_fm(bool create)
4139{
4140 assert(fm == NULL);
4141 fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
4142
4143 if (create) {
4144 // initialize freespace
4145 dout(20) << __func__ << " initializing freespace" << dendl;
4146 KeyValueDB::Transaction t = db->get_transaction();
4147 {
4148 bufferlist bl;
4149 bl.append(freelist_type);
4150 t->set(PREFIX_SUPER, "freelist_type", bl);
4151 }
4152 fm->create(bdev->get_size(), t);
4153
4154 // allocate superblock reserved space. note that we do not mark
4155 // bluefs space as allocated in the freelist; we instead rely on
4156 // bluefs_extents.
4157 fm->allocate(0, SUPER_RESERVED, t);
4158
4159 uint64_t reserved = 0;
4160 if (cct->_conf->bluestore_bluefs) {
4161 assert(bluefs_extents.num_intervals() == 1);
4162 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
4163 reserved = p.get_start() + p.get_len();
4164 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4165 << " for bluefs" << dendl;
4166 bufferlist bl;
4167 ::encode(bluefs_extents, bl);
4168 t->set(PREFIX_SUPER, "bluefs_extents", bl);
4169 dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
4170 << std::dec << dendl;
4171 } else {
4172 reserved = SUPER_RESERVED;
4173 }
4174
4175 if (cct->_conf->bluestore_debug_prefill > 0) {
4176 uint64_t end = bdev->get_size() - reserved;
4177 dout(1) << __func__ << " pre-fragmenting freespace, using "
4178 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4179 << cct->_conf->bluestore_debug_prefragment_max << dendl;
4180 uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
4181 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4182 float r = cct->_conf->bluestore_debug_prefill;
4183 r /= 1.0 - r;
4184 bool stop = false;
4185
4186 while (!stop && start < end) {
4187 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4188 if (start + l > end) {
4189 l = end - start;
4190 l = P2ALIGN(l, min_alloc_size);
4191 }
4192 assert(start + l <= end);
4193
4194 uint64_t u = 1 + (uint64_t)(r * (double)l);
4195 u = P2ROUNDUP(u, min_alloc_size);
4196 if (start + l + u > end) {
4197 u = end - (start + l);
4198 // trim to align so we don't overflow again
4199 u = P2ALIGN(u, min_alloc_size);
4200 stop = true;
4201 }
4202 assert(start + l + u <= end);
4203
4204 dout(20) << " free 0x" << std::hex << start << "~" << l
4205 << " use 0x" << u << std::dec << dendl;
4206
4207 if (u == 0) {
4208 // break if u has been trimmed to nothing
4209 break;
4210 }
4211
4212 fm->allocate(start + l, u, t);
4213 start += l + u;
4214 }
4215 }
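 // Worked example of the prefill math above: bluestore_debug_prefill = 0.2
 // gives r = 0.2 / (1 - 0.2) = 0.25, so each free extent of length l is
 // followed by roughly u ~= 0.25 * l of allocated space, i.e. about 20% of
 // the device ends up pre-allocated on average.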
4216 db->submit_transaction_sync(t);
4217 }
4218
4219 int r = fm->init();
4220 if (r < 0) {
4221 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4222 delete fm;
4223 fm = NULL;
4224 return r;
4225 }
4226 return 0;
4227}
4228
4229void BlueStore::_close_fm()
4230{
4231 dout(10) << __func__ << dendl;
4232 assert(fm);
4233 fm->shutdown();
4234 delete fm;
4235 fm = NULL;
4236}
4237
4238int BlueStore::_open_alloc()
4239{
4240 assert(alloc == NULL);
4241 assert(bdev->get_size());
4242 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4243 bdev->get_size(),
4244 min_alloc_size);
4245 if (!alloc) {
4246 lderr(cct) << __func__ << " unknown allocator type "
4247 << cct->_conf->bluestore_allocator
4248 << dendl;
4249 return -EINVAL;
4250 }
4251
4252 uint64_t num = 0, bytes = 0;
4253
4254 dout(1) << __func__ << " opening allocation metadata" << dendl;
4255 // initialize from freelist
4256 fm->enumerate_reset();
4257 uint64_t offset, length;
4258 while (fm->enumerate_next(&offset, &length)) {
4259 alloc->init_add_free(offset, length);
4260 ++num;
4261 bytes += length;
4262 }
224ce89b 4263 fm->enumerate_reset();
4264 dout(1) << __func__ << " loaded " << pretty_si_t(bytes)
4265 << " in " << num << " extents"
4266 << dendl;
4267
4268 // also mark bluefs space as allocated
4269 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4270 alloc->init_rm_free(e.get_start(), e.get_len());
4271 }
4272 dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
4273 << bluefs_extents << std::dec << " as allocated" << dendl;
4274
4275 return 0;
4276}
4277
4278void BlueStore::_close_alloc()
4279{
4280 assert(alloc);
4281 alloc->shutdown();
4282 delete alloc;
4283 alloc = NULL;
4284}
4285
4286int BlueStore::_open_fsid(bool create)
4287{
4288 assert(fsid_fd < 0);
4289 int flags = O_RDWR;
4290 if (create)
4291 flags |= O_CREAT;
4292 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4293 if (fsid_fd < 0) {
4294 int err = -errno;
4295 derr << __func__ << " " << cpp_strerror(err) << dendl;
4296 return err;
4297 }
4298 return 0;
4299}
4300
4301int BlueStore::_read_fsid(uuid_d *uuid)
4302{
4303 char fsid_str[40];
4304 memset(fsid_str, 0, sizeof(fsid_str));
4305 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4306 if (ret < 0) {
4307 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4308 return ret;
4309 }
4310 if (ret > 36)
4311 fsid_str[36] = 0;
4312 else
4313 fsid_str[ret] = 0;
4314 if (!uuid->parse(fsid_str)) {
4315 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4316 return -EINVAL;
4317 }
4318 return 0;
4319}
4320
4321int BlueStore::_write_fsid()
4322{
4323 int r = ::ftruncate(fsid_fd, 0);
4324 if (r < 0) {
4325 r = -errno;
4326 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4327 return r;
4328 }
4329 string str = stringify(fsid) + "\n";
4330 r = safe_write(fsid_fd, str.c_str(), str.length());
4331 if (r < 0) {
4332 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4333 return r;
4334 }
4335 r = ::fsync(fsid_fd);
4336 if (r < 0) {
4337 r = -errno;
4338 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4339 return r;
4340 }
4341 return 0;
4342}
4343
4344void BlueStore::_close_fsid()
4345{
4346 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4347 fsid_fd = -1;
4348}
4349
4350int BlueStore::_lock_fsid()
4351{
4352 struct flock l;
4353 memset(&l, 0, sizeof(l));
4354 l.l_type = F_WRLCK;
4355 l.l_whence = SEEK_SET;
4356 int r = ::fcntl(fsid_fd, F_SETLK, &l);
4357 if (r < 0) {
4358 int err = errno;
4359 derr << __func__ << " failed to lock " << path << "/fsid"
4360 << " (is another ceph-osd still running?)"
4361 << cpp_strerror(err) << dendl;
4362 return -err;
4363 }
4364 return 0;
4365}
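// Note: F_SETLK takes a non-blocking advisory write lock over the whole file
// (l_start and l_len are left at 0); the kernel drops it when the fd is
// closed or the process exits, which is what makes the "is another ceph-osd
// still running?" check above meaningful.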
4366
4367bool BlueStore::is_rotational()
4368{
4369 if (bdev) {
4370 return bdev->is_rotational();
4371 }
4372
4373 bool rotational = true;
4374 int r = _open_path();
4375 if (r < 0)
4376 goto out;
4377 r = _open_fsid(false);
4378 if (r < 0)
4379 goto out_path;
4380 r = _read_fsid(&fsid);
4381 if (r < 0)
4382 goto out_fsid;
4383 r = _lock_fsid();
4384 if (r < 0)
4385 goto out_fsid;
4386 r = _open_bdev(false);
4387 if (r < 0)
4388 goto out_fsid;
4389 rotational = bdev->is_rotational();
4390 _close_bdev();
4391 out_fsid:
4392 _close_fsid();
4393 out_path:
4394 _close_path();
4395 out:
4396 return rotational;
4397}
4398
4399bool BlueStore::is_journal_rotational()
4400{
4401 if (!bluefs) {
4402 dout(5) << __func__ << " bluefs disabled, default to store media type"
4403 << dendl;
4404 return is_rotational();
4405 }
4406 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
4407 return bluefs->wal_is_rotational();
4408}
4409
4410bool BlueStore::test_mount_in_use()
4411{
4412 // most error conditions mean the mount is not in use (e.g., because
4413 // it doesn't exist). only if we fail to lock do we conclude it is
4414 // in use.
4415 bool ret = false;
4416 int r = _open_path();
4417 if (r < 0)
4418 return false;
4419 r = _open_fsid(false);
4420 if (r < 0)
4421 goto out_path;
4422 r = _lock_fsid();
4423 if (r < 0)
4424 ret = true; // if we can't lock, it is in use
4425 _close_fsid();
4426 out_path:
4427 _close_path();
4428 return ret;
4429}
4430
4431int BlueStore::_open_db(bool create)
4432{
4433 int r;
4434 assert(!db);
4435 string fn = path + "/db";
4436 string options;
4437 stringstream err;
4438 ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
4439
4440 string kv_backend;
4441 if (create) {
4442 kv_backend = cct->_conf->bluestore_kvbackend;
4443 } else {
4444 r = read_meta("kv_backend", &kv_backend);
4445 if (r < 0) {
4446 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
4447 return -EIO;
4448 }
4449 }
4450 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
4451
4452 bool do_bluefs;
4453 if (create) {
4454 do_bluefs = cct->_conf->bluestore_bluefs;
4455 } else {
4456 string s;
4457 r = read_meta("bluefs", &s);
4458 if (r < 0) {
4459 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
4460 return -EIO;
4461 }
4462 if (s == "1") {
4463 do_bluefs = true;
4464 } else if (s == "0") {
4465 do_bluefs = false;
4466 } else {
4467 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
4468 << dendl;
4469 return -EIO;
4470 }
4471 }
4472 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
4473
4474 rocksdb::Env *env = NULL;
4475 if (do_bluefs) {
4476 dout(10) << __func__ << " initializing bluefs" << dendl;
4477 if (kv_backend != "rocksdb") {
4478 derr << " backend must be rocksdb to use bluefs" << dendl;
4479 return -EINVAL;
4480 }
4481 bluefs = new BlueFS(cct);
4482
4483 string bfn;
4484 struct stat st;
4485
4486 bfn = path + "/block.db";
4487 if (::stat(bfn.c_str(), &st) == 0) {
4488 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
4489 if (r < 0) {
4490 derr << __func__ << " add block device(" << bfn << ") returned: "
4491 << cpp_strerror(r) << dendl;
4492 goto free_bluefs;
4493 }
4494
4495 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
4496 r = _check_or_set_bdev_label(
4497 bfn,
4498 bluefs->get_block_device_size(BlueFS::BDEV_DB),
4499 "bluefs db", create);
4500 if (r < 0) {
4501 derr << __func__
4502 << " check block device(" << bfn << ") label returned: "
4503 << cpp_strerror(r) << dendl;
4504 goto free_bluefs;
4505 }
4506 }
4507 if (create) {
4508 bluefs->add_block_extent(
4509 BlueFS::BDEV_DB,
4510 SUPER_RESERVED,
4511 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
4512 }
4513 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
4514 bluefs_single_shared_device = false;
31f18b77 4515 } else if (::lstat(bfn.c_str(), &st) == -1) {
7c673cae 4516 bluefs_shared_bdev = BlueFS::BDEV_DB;
4517 } else {
4518 // the symlink exists but its target is missing; treat this as an error
4519 derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
4520 r = -errno;
4521 goto free_bluefs;
4522 }
4523
4524 // shared device
4525 bfn = path + "/block";
4526 r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
4527 if (r < 0) {
4528 derr << __func__ << " add block device(" << bfn << ") returned: "
4529 << cpp_strerror(r) << dendl;
4530 goto free_bluefs;
4531 }
4532 if (create) {
4533 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4534 uint64_t initial =
4535 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
4536 cct->_conf->bluestore_bluefs_gift_ratio);
4537 initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
4538 // align to bluefs's alloc_size
4539 initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
4540 // put bluefs in the middle of the device in case it is an HDD
4541 uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
4542 cct->_conf->bluefs_alloc_size);
4543 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
4544 bluefs_extents.insert(start, initial);
4545 }
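 // Example with illustrative ratios: a 1 TiB main device with
 // bluestore_bluefs_min_ratio = 0.02 and bluestore_bluefs_gift_ratio = 0.02
 // yields initial ~= 40 GiB (but at least bluestore_bluefs_min), rounded up
 // to bluefs_alloc_size and placed mid-device to keep HDD seeks short.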
4546
4547 bfn = path + "/block.wal";
4548 if (::stat(bfn.c_str(), &st) == 0) {
4549 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
4550 if (r < 0) {
4551 derr << __func__ << " add block device(" << bfn << ") returned: "
4552 << cpp_strerror(r) << dendl;
4553 goto free_bluefs;
4554 }
4555
4556 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
4557 r = _check_or_set_bdev_label(
4558 bfn,
4559 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
4560 "bluefs wal", create);
4561 if (r < 0) {
4562 derr << __func__ << " check block device(" << bfn
4563 << ") label returned: " << cpp_strerror(r) << dendl;
4564 goto free_bluefs;
4565 }
4566 }
4567
4568 if (create) {
4569 bluefs->add_block_extent(
4570 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
4571 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
4572 BDEV_LABEL_BLOCK_SIZE);
4573 }
4574 cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
4575 bluefs_single_shared_device = false;
31f18b77 4576 } else if (::lstat(bfn.c_str(), &st) == -1) {
7c673cae 4577 cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
4578 } else {
4579 // the symlink exists but its target is missing; treat this as an error
4580 derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
4581 r = -errno;
4582 goto free_bluefs;
4583 }
4584
4585 if (create) {
4586 bluefs->mkfs(fsid);
4587 }
4588 r = bluefs->mount();
4589 if (r < 0) {
4590 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
4591 goto free_bluefs;
4592 }
4593 if (cct->_conf->bluestore_bluefs_env_mirror) {
4594 rocksdb::Env *a = new BlueRocksEnv(bluefs);
4595 rocksdb::Env *b = rocksdb::Env::Default();
4596 if (create) {
4597 string cmd = "rm -rf " + path + "/db " +
4598 path + "/db.slow " +
4599 path + "/db.wal";
4600 int r = system(cmd.c_str());
4601 (void)r;
4602 }
4603 env = new rocksdb::EnvMirror(b, a, false, true);
4604 } else {
4605 env = new BlueRocksEnv(bluefs);
4606
4607 // simplify the dir names, too, as "seen" by rocksdb
4608 fn = "db";
4609 }
4610
4611 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
4612 // we have both block.db and block; tell rocksdb!
4613 // note: the second (last) size value doesn't really matter
4614 ostringstream db_paths;
4615 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
4616 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
4617 db_paths << fn << ","
4618 << (uint64_t)(db_size * 95 / 100) << " "
4619 << fn + ".slow" << ","
4620 << (uint64_t)(slow_size * 95 / 100);
4621 cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
4622 dout(10) << __func__ << " set rocksdb_db_paths to "
4623 << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
4624 }
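 // The string built above is a space-separated list of "<path>,<bytes>"
 // pairs (e.g. "db,<95% of db_size> db.slow,<95% of slow_size>"); rocksdb
 // roughly fills earlier paths up to their advertised size before spilling
 // SST files to later ones, and the 5% slack leaves some headroom.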
4625
4626 if (create) {
4627 env->CreateDir(fn);
4628 if (cct->_conf->rocksdb_separate_wal_dir)
4629 env->CreateDir(fn + ".wal");
4630 if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
4631 env->CreateDir(fn + ".slow");
4632 }
4633 } else if (create) {
4634 int r = ::mkdir(fn.c_str(), 0755);
4635 if (r < 0)
4636 r = -errno;
4637 if (r < 0 && r != -EEXIST) {
4638 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
4639 << dendl;
4640 return r;
4641 }
4642
4643 // wal_dir, too!
4644 if (cct->_conf->rocksdb_separate_wal_dir) {
4645 string walfn = path + "/db.wal";
4646 r = ::mkdir(walfn.c_str(), 0755);
4647 if (r < 0)
4648 r = -errno;
4649 if (r < 0 && r != -EEXIST) {
4650 derr << __func__ << " failed to create " << walfn
4651 << ": " << cpp_strerror(r)
4652 << dendl;
4653 return r;
4654 }
4655 }
4656 }
4657
4658 db = KeyValueDB::create(cct,
4659 kv_backend,
4660 fn,
4661 static_cast<void*>(env));
4662 if (!db) {
4663 derr << __func__ << " error creating db" << dendl;
4664 if (bluefs) {
4665 bluefs->umount();
4666 delete bluefs;
4667 bluefs = NULL;
4668 }
4669 // delete env manually here since we can't depend on db to do this
4670 // under this case
4671 delete env;
4672 env = NULL;
4673 return -EIO;
4674 }
4675
4676 FreelistManager::setup_merge_operators(db);
4677 db->set_merge_operator(PREFIX_STAT, merge_op);
4678
224ce89b 4679 db->set_cache_size(cache_size * cache_kv_ratio);
31f18b77 4680
4681 if (kv_backend == "rocksdb")
4682 options = cct->_conf->bluestore_rocksdb_options;
4683 db->init(options);
4684 if (create)
4685 r = db->create_and_open(err);
4686 else
4687 r = db->open(err);
4688 if (r) {
4689 derr << __func__ << " error opening db: " << err.str() << dendl;
4690 if (bluefs) {
4691 bluefs->umount();
4692 delete bluefs;
4693 bluefs = NULL;
4694 }
4695 delete db;
4696 db = NULL;
4697 return -EIO;
4698 }
4699 dout(1) << __func__ << " opened " << kv_backend
4700 << " path " << fn << " options " << options << dendl;
4701 return 0;
4702
4703free_bluefs:
4704 assert(bluefs);
4705 delete bluefs;
4706 bluefs = NULL;
4707 return r;
4708}
4709
4710void BlueStore::_close_db()
4711{
4712 assert(db);
4713 delete db;
4714 db = NULL;
4715 if (bluefs) {
4716 bluefs->umount();
4717 delete bluefs;
4718 bluefs = NULL;
4719 }
4720}
4721
4722int BlueStore::_reconcile_bluefs_freespace()
4723{
4724 dout(10) << __func__ << dendl;
4725 interval_set<uint64_t> bset;
4726 int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
4727 assert(r == 0);
4728 if (bset == bluefs_extents) {
4729 dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
4730 << std::dec << dendl;
4731 return 0;
4732 }
4733 dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
4734 << dendl;
4735 dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
4736 << std::dec << dendl;
4737
4738 interval_set<uint64_t> overlap;
4739 overlap.intersection_of(bset, bluefs_extents);
4740
4741 bset.subtract(overlap);
4742 if (!bset.empty()) {
4743 derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
4744 << dendl;
4745 return -EIO;
4746 }
4747
4748 interval_set<uint64_t> super_extra;
4749 super_extra = bluefs_extents;
4750 super_extra.subtract(overlap);
4751 if (!super_extra.empty()) {
4752 // This is normal: it can happen if we commit to give extents to
4753 // bluefs and we crash before bluefs commits that it owns them.
4754 dout(10) << __func__ << " super extra " << super_extra << dendl;
4755 for (interval_set<uint64_t>::iterator p = super_extra.begin();
4756 p != super_extra.end();
4757 ++p) {
4758 bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
4759 }
4760 }
4761
4762 return 0;
4763}
4764
4765int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
4766{
4767 int ret = 0;
4768 assert(bluefs);
4769
4770 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
4771 bluefs->get_usage(&bluefs_usage);
4772 assert(bluefs_usage.size() > bluefs_shared_bdev);
4773
4774 // fixme: look at primary bdev only for now
4775 uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
4776 uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
4777 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
4778
4779 uint64_t my_free = alloc->get_free();
4780 uint64_t total = bdev->get_size();
4781 float my_free_ratio = (float)my_free / (float)total;
4782
4783 uint64_t total_free = bluefs_free + my_free;
4784
4785 float bluefs_ratio = (float)bluefs_free / (float)total_free;
4786
4787 dout(10) << __func__
4788 << " bluefs " << pretty_si_t(bluefs_free)
4789 << " free (" << bluefs_free_ratio
4790 << ") bluestore " << pretty_si_t(my_free)
4791 << " free (" << my_free_ratio
4792 << "), bluefs_ratio " << bluefs_ratio
4793 << dendl;
4794
4795 uint64_t gift = 0;
4796 uint64_t reclaim = 0;
4797 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
4798 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
4799 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4800 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
4801 << ", should gift " << pretty_si_t(gift) << dendl;
4802 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
4803 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
4804 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
4805 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
4806 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4807 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
4808 << ", should reclaim " << pretty_si_t(reclaim) << dendl;
4809 }
4810 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
4811 cct->_conf->bluestore_bluefs_min <
4812 (uint64_t)(cct->_conf->bluestore_bluefs_max_ratio * total_free)) {
4813 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
4814 dout(10) << __func__ << " bluefs_total " << bluefs_total
4815 << " < min " << cct->_conf->bluestore_bluefs_min
4816 << ", should gift " << pretty_si_t(g) << dendl;
4817 if (g > gift)
4818 gift = g;
4819 reclaim = 0;
4820 }
4821
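 // Summary of the decision above, with illustrative numbers: if
 // bluefs_ratio < bluestore_bluefs_min_ratio we gift
 // bluestore_bluefs_gift_ratio * total_free to bluefs; if it exceeds
 // bluestore_bluefs_max_ratio we reclaim, never shrinking bluefs below
 // bluestore_bluefs_min. E.g. total_free = 100 GiB, bluefs_free = 1 GiB,
 // min_ratio = 0.02 -> ratio 0.01 < 0.02, so gift ~2 GiB at gift_ratio 0.02.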
4822 if (gift) {
4823 // round up to alloc size
4824 gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
4825
4826 // hard cap to fit into 32 bits
4827 gift = MIN(gift, 1ull<<31);
4828 dout(10) << __func__ << " gifting " << gift
4829 << " (" << pretty_si_t(gift) << ")" << dendl;
4830
4831 // fixme: just do one allocation to start...
4832 int r = alloc->reserve(gift);
4833 assert(r == 0);
4834
4835 AllocExtentVector exts;
4836 int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
4837 0, 0, &exts);
4838
4839 if (alloc_len < (int64_t)gift) {
4840 derr << __func__ << " allocate failed on 0x" << std::hex << gift
4841 << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4842 alloc->dump();
4843 assert(0 == "allocate failed, wtf");
4844 return -ENOSPC;
4845 }
4846 for (auto& p : exts) {
4847 bluestore_pextent_t e = bluestore_pextent_t(p);
4848 dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
4849 extents->push_back(e);
4850 }
4851 gift = 0;
4852
4853 ret = 1;
4854 }
4855
4856 // reclaim from bluefs?
4857 if (reclaim) {
4858 // round up to alloc size
4859 reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);
4860
4861 // hard cap to fit into 32 bits
4862 reclaim = MIN(reclaim, 1ull<<31);
4863 dout(10) << __func__ << " reclaiming " << reclaim
4864 << " (" << pretty_si_t(reclaim) << ")" << dendl;
4865
4866 while (reclaim > 0) {
4867 // NOTE: this will block and do IO.
4868 AllocExtentVector extents;
4869 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
4870 &extents);
4871 if (r < 0) {
4872 derr << __func__ << " failed to reclaim space from bluefs"
4873 << dendl;
4874 break;
4875 }
4876 for (auto e : extents) {
4877 bluefs_extents.erase(e.offset, e.length);
4878 bluefs_extents_reclaiming.insert(e.offset, e.length);
4879 reclaim -= e.length;
4880 }
4881 }
4882
4883 ret = 1;
4884 }
4885
4886 return ret;
4887}
4888
4889void BlueStore::_commit_bluefs_freespace(
4890 const PExtentVector& bluefs_gift_extents)
4891{
4892 dout(10) << __func__ << dendl;
4893 for (auto& p : bluefs_gift_extents) {
4894 bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
4895 }
4896}
4897
4898int BlueStore::_open_collections(int *errors)
4899{
4900 assert(coll_map.empty());
4901 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
4902 for (it->upper_bound(string());
4903 it->valid();
4904 it->next()) {
4905 coll_t cid;
4906 if (cid.parse(it->key())) {
4907 CollectionRef c(
4908 new Collection(
4909 this,
4910 cache_shards[cid.hash_to_shard(cache_shards.size())],
4911 cid));
4912 bufferlist bl = it->value();
4913 bufferlist::iterator p = bl.begin();
4914 try {
4915 ::decode(c->cnode, p);
4916 } catch (buffer::error& e) {
4917 derr << __func__ << " failed to decode cnode, key:"
4918 << pretty_binary_string(it->key()) << dendl;
4919 return -EIO;
4920 }
4921 dout(20) << __func__ << " opened " << cid << " " << c << dendl;
4922 coll_map[cid] = c;
4923 } else {
4924 derr << __func__ << " unrecognized collection " << it->key() << dendl;
4925 if (errors)
4926 (*errors)++;
4927 }
4928 }
4929 return 0;
4930}
4931
224ce89b 4932void BlueStore::_open_statfs()
4933{
4934 bufferlist bl;
4935 int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
4936 if (r >= 0) {
4937 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
4938 auto it = bl.begin();
4939 vstatfs.decode(it);
224ce89b 4940 } else {
4941 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
4942 }
4943 }
4944 else {
4945 dout(10) << __func__ << " store_statfs missing, using empty" << dendl;
4946 }
4947}
4948
4949int BlueStore::_setup_block_symlink_or_file(
4950 string name,
4951 string epath,
4952 uint64_t size,
4953 bool create)
4954{
4955 dout(20) << __func__ << " name " << name << " path " << epath
4956 << " size " << size << " create=" << (int)create << dendl;
4957 int r = 0;
4958 int flags = O_RDWR;
4959 if (create)
4960 flags |= O_CREAT;
4961 if (epath.length()) {
4962 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
4963 if (r < 0) {
4964 r = -errno;
4965 derr << __func__ << " failed to create " << name << " symlink to "
4966 << epath << ": " << cpp_strerror(r) << dendl;
4967 return r;
4968 }
4969
4970 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
4971 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
4972 if (fd < 0) {
4973 r = -errno;
4974 derr << __func__ << " failed to open " << epath << " file: "
4975 << cpp_strerror(r) << dendl;
4976 return r;
4977 }
4978 string serial_number = epath.substr(strlen(SPDK_PREFIX));
4979 r = ::write(fd, serial_number.c_str(), serial_number.size());
4980 assert(r == (int)serial_number.size());
4981 dout(1) << __func__ << " created " << name << " symlink to "
4982 << epath << dendl;
4983 VOID_TEMP_FAILURE_RETRY(::close(fd));
4984 }
4985 }
4986 if (size) {
4987 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
4988 if (fd >= 0) {
4989 // block file is present
4990 struct stat st;
4991 int r = ::fstat(fd, &st);
4992 if (r == 0 &&
4993 S_ISREG(st.st_mode) && // if it is a regular file
4994 st.st_size == 0) { // and is 0 bytes
4995 r = ::ftruncate(fd, size);
4996 if (r < 0) {
4997 r = -errno;
4998 derr << __func__ << " failed to resize " << name << " file to "
4999 << size << ": " << cpp_strerror(r) << dendl;
5000 VOID_TEMP_FAILURE_RETRY(::close(fd));
5001 return r;
5002 }
5003
5004 if (cct->_conf->bluestore_block_preallocate_file) {
5005#ifdef HAVE_POSIX_FALLOCATE
5006 r = ::posix_fallocate(fd, 0, size);
5007 if (r) {
5008 derr << __func__ << " failed to prefallocate " << name << " file to "
5009 << size << ": " << cpp_strerror(r) << dendl;
5010 VOID_TEMP_FAILURE_RETRY(::close(fd));
5011 return -r;
5012 }
5013#else
5014 char data[1024*128];
5015 for (uint64_t off = 0; off < size; off += sizeof(data)) {
5016 if (off + sizeof(data) > size)
5017 r = ::write(fd, data, size - off);
5018 else
5019 r = ::write(fd, data, sizeof(data));
5020 if (r < 0) {
5021 r = -errno;
5022 derr << __func__ << " failed to prefallocate w/ write " << name << " file to "
5023 << size << ": " << cpp_strerror(r) << dendl;
5024 VOID_TEMP_FAILURE_RETRY(::close(fd));
5025 return r;
5026 }
5027 }
5028#endif
5029 }
5030 dout(1) << __func__ << " resized " << name << " file to "
5031 << pretty_si_t(size) << "B" << dendl;
5032 }
5033 VOID_TEMP_FAILURE_RETRY(::close(fd));
5034 } else {
5035 int r = -errno;
5036 if (r != -ENOENT) {
5037 derr << __func__ << " failed to open " << name << " file: "
5038 << cpp_strerror(r) << dendl;
5039 return r;
5040 }
5041 }
5042 }
5043 return 0;
5044}
5045
5046int BlueStore::mkfs()
5047{
5048 dout(1) << __func__ << " path " << path << dendl;
5049 int r;
5050 uuid_d old_fsid;
5051
5052 {
5053 string done;
5054 r = read_meta("mkfs_done", &done);
5055 if (r == 0) {
5056 dout(1) << __func__ << " already created" << dendl;
5057 if (cct->_conf->bluestore_fsck_on_mkfs) {
5058 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5059 if (r < 0) {
5060 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
5061 << dendl;
5062 return r;
5063 }
5064 if (r > 0) {
5065 derr << __func__ << " fsck found " << r << " errors" << dendl;
5066 r = -EIO;
5067 }
5068 }
5069 return r; // idempotent
5070 }
5071 }
5072
5073 {
5074 string type;
5075 r = read_meta("type", &type);
5076 if (r == 0) {
5077 if (type != "bluestore") {
5078 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5079 return -EIO;
5080 }
5081 } else {
5082 r = write_meta("type", "bluestore");
5083 if (r < 0)
5084 return r;
5085 }
5086 }
5087
5088 freelist_type = "bitmap";
5089
5090 r = _open_path();
5091 if (r < 0)
5092 return r;
5093
5094 r = _open_fsid(true);
5095 if (r < 0)
5096 goto out_path_fd;
5097
5098 r = _lock_fsid();
5099 if (r < 0)
5100 goto out_close_fsid;
5101
5102 r = _read_fsid(&old_fsid);
5103 if (r < 0 || old_fsid.is_zero()) {
5104 if (fsid.is_zero()) {
5105 fsid.generate_random();
5106 dout(1) << __func__ << " generated fsid " << fsid << dendl;
5107 } else {
5108 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
5109 }
5110 // we'll write it later.
5111 } else {
5112 if (!fsid.is_zero() && fsid != old_fsid) {
5113 derr << __func__ << " on-disk fsid " << old_fsid
5114 << " != provided " << fsid << dendl;
5115 r = -EINVAL;
5116 goto out_close_fsid;
5117 }
5118 fsid = old_fsid;
5119 }
5120
5121 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
5122 cct->_conf->bluestore_block_size,
5123 cct->_conf->bluestore_block_create);
5124 if (r < 0)
5125 goto out_close_fsid;
5126 if (cct->_conf->bluestore_bluefs) {
5127 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
5128 cct->_conf->bluestore_block_wal_size,
5129 cct->_conf->bluestore_block_wal_create);
5130 if (r < 0)
5131 goto out_close_fsid;
5132 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
5133 cct->_conf->bluestore_block_db_size,
5134 cct->_conf->bluestore_block_db_create);
5135 if (r < 0)
5136 goto out_close_fsid;
5137 }
5138
5139 r = _open_bdev(true);
5140 if (r < 0)
5141 goto out_close_fsid;
5142
5143 r = _open_db(true);
5144 if (r < 0)
5145 goto out_close_bdev;
5146
5147 r = _open_fm(true);
5148 if (r < 0)
5149 goto out_close_db;
5150
5151 {
5152 KeyValueDB::Transaction t = db->get_transaction();
5153 {
5154 bufferlist bl;
5155 ::encode((uint64_t)0, bl);
5156 t->set(PREFIX_SUPER, "nid_max", bl);
5157 t->set(PREFIX_SUPER, "blobid_max", bl);
5158 }
5159
5160 // choose min_alloc_size
5161 if (cct->_conf->bluestore_min_alloc_size) {
5162 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
5163 } else {
5164 assert(bdev);
5165 if (bdev->is_rotational()) {
5166 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
5167 } else {
5168 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
5169 }
5170 }
5171
5172 // make sure min_alloc_size is power of 2 aligned.
5173 if (!ISP2(min_alloc_size)) {
5174 derr << __func__ << " min_alloc_size 0x"
5175 << std::hex << min_alloc_size << std::dec
5176 << " is not power of 2 aligned!"
5177 << dendl;
5178 r = -EINVAL;
5179 goto out_close_fm;
5180 }
5181
5182 {
5183 bufferlist bl;
5184 ::encode((uint64_t)min_alloc_size, bl);
5185 t->set(PREFIX_SUPER, "min_alloc_size", bl);
5186 }
5187
5188 ondisk_format = latest_ondisk_format;
5189 _prepare_ondisk_format_super(t);
5190 db->submit_transaction_sync(t);
5191 }
5192
5193
5194 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
5195 if (r < 0)
5196 goto out_close_fm;
5197
5198 r = write_meta("bluefs", stringify((int)cct->_conf->bluestore_bluefs));
5199 if (r < 0)
224ce89b 5200 goto out_close_fm;
5201
5202 if (fsid != old_fsid) {
5203 r = _write_fsid();
5204 if (r < 0) {
5205 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 5206 goto out_close_fm;
5207 }
5208 }
5209
5210 out_close_fm:
5211 _close_fm();
5212 out_close_db:
5213 _close_db();
5214 out_close_bdev:
5215 _close_bdev();
5216 out_close_fsid:
5217 _close_fsid();
5218 out_path_fd:
5219 _close_path();
5220
5221 if (r == 0 &&
5222 cct->_conf->bluestore_fsck_on_mkfs) {
5223 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5224 if (rc < 0)
5225 return rc;
5226 if (rc > 0) {
5227 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5228 r = -EIO;
5229 }
5230 }
5231
5232 if (r == 0) {
5233 // indicate success by writing the 'mkfs_done' file
5234 r = write_meta("mkfs_done", "yes");
5235 }
5236
5237 if (r < 0) {
5238 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
5239 } else {
5240 dout(0) << __func__ << " success" << dendl;
5241 }
5242 return r;
5243}
5244
5245void BlueStore::set_cache_shards(unsigned num)
5246{
5247 dout(10) << __func__ << " " << num << dendl;
5248 size_t old = cache_shards.size();
5249 assert(num >= old);
5250 cache_shards.resize(num);
5251 for (unsigned i = old; i < num; ++i) {
5252 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
5253 logger);
5254 }
5255}
5256
5257int BlueStore::_mount(bool kv_only)
5258{
5259 dout(1) << __func__ << " path " << path << dendl;
5260
5261 {
5262 string type;
5263 int r = read_meta("type", &type);
5264 if (r < 0) {
5265 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5266 << dendl;
5267 return r;
5268 }
5269
5270 if (type != "bluestore") {
5271 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5272 return -EIO;
5273 }
5274 }
5275
5276 if (cct->_conf->bluestore_fsck_on_mount) {
5277 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
5278 if (rc < 0)
5279 return rc;
5280 if (rc > 0) {
5281 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5282 return -EIO;
5283 }
5284 }
5285
5286 int r = _open_path();
5287 if (r < 0)
5288 return r;
5289 r = _open_fsid(false);
5290 if (r < 0)
5291 goto out_path;
5292
5293 r = _read_fsid(&fsid);
5294 if (r < 0)
5295 goto out_fsid;
5296
5297 r = _lock_fsid();
5298 if (r < 0)
5299 goto out_fsid;
5300
5301 r = _open_bdev(false);
5302 if (r < 0)
5303 goto out_fsid;
5304
5305 r = _open_db(false);
5306 if (r < 0)
5307 goto out_bdev;
5308
5309 if (kv_only)
5310 return 0;
5311
5312 r = _open_super_meta();
5313 if (r < 0)
5314 goto out_db;
5315
5316 r = _open_fm(false);
5317 if (r < 0)
5318 goto out_db;
5319
5320 r = _open_alloc();
5321 if (r < 0)
5322 goto out_fm;
5323
5324 r = _open_collections();
5325 if (r < 0)
5326 goto out_alloc;
5327
5328 r = _reload_logger();
5329 if (r < 0)
5330 goto out_coll;
5331
5332 if (bluefs) {
5333 r = _reconcile_bluefs_freespace();
5334 if (r < 0)
5335 goto out_coll;
5336 }
5337
31f18b77 5338 _kv_start();
5339
5340 r = _deferred_replay();
5341 if (r < 0)
5342 goto out_stop;
5343
5344 mempool_thread.init();
5345
5346
5347 mounted = true;
5348 return 0;
5349
5350 out_stop:
5351 _kv_stop();
7c673cae 5352 out_coll:
31f18b77 5353 _flush_cache();
5354 out_alloc:
5355 _close_alloc();
5356 out_fm:
5357 _close_fm();
5358 out_db:
5359 _close_db();
5360 out_bdev:
5361 _close_bdev();
5362 out_fsid:
5363 _close_fsid();
5364 out_path:
5365 _close_path();
5366 return r;
5367}
5368
5369int BlueStore::umount()
5370{
5371 assert(mounted);
5372 dout(1) << __func__ << dendl;
5373
5374 _osr_drain_all();
5375 _osr_unregister_all();
5376
5377 mempool_thread.shutdown();
5378
5379 dout(20) << __func__ << " stopping kv thread" << dendl;
5380 _kv_stop();
7c673cae 5381 _reap_collections();
31f18b77 5382 _flush_cache();
5383 dout(20) << __func__ << " closing" << dendl;
5384
5385 mounted = false;
5386 _close_alloc();
5387 _close_fm();
5388 _close_db();
5389 _close_bdev();
5390 _close_fsid();
5391 _close_path();
5392
5393 if (cct->_conf->bluestore_fsck_on_umount) {
5394 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
5395 if (rc < 0)
5396 return rc;
5397 if (rc > 0) {
5398 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5399 return -EIO;
5400 }
5401 }
5402 return 0;
5403}
5404
5405static void apply(uint64_t off,
5406 uint64_t len,
5407 uint64_t granularity,
5408 BlueStore::mempool_dynamic_bitset &bitset,
5409 const char *what,
5410 std::function<void(uint64_t,
5411 BlueStore::mempool_dynamic_bitset &)> f) {
5412 auto end = ROUND_UP_TO(off + len, granularity);
5413 while (off < end) {
5414 uint64_t pos = off / granularity;
5415 f(pos, bitset);
5416 off += granularity;
5417 }
5418}
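// Example: apply(off=0x1000, len=0x2000, granularity=0x1000, ...) rounds the
// end up to 0x3000 and calls f() for bit positions 1 and 2 -- one call per
// granularity-sized block touched by [off, off+len).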
5419
5420int BlueStore::_fsck_check_extents(
5421 const ghobject_t& oid,
5422 const PExtentVector& extents,
5423 bool compressed,
5424 mempool_dynamic_bitset &used_blocks,
5425 store_statfs_t& expected_statfs)
5426{
5427 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
5428 int errors = 0;
5429 for (auto e : extents) {
5430 if (!e.is_valid())
5431 continue;
5432 expected_statfs.allocated += e.length;
5433 if (compressed) {
5434 expected_statfs.compressed_allocated += e.length;
5435 }
5436 bool already = false;
5437 apply(
5438 e.offset, e.length, block_size, used_blocks, __func__,
5439 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5440 if (bs.test(pos))
5441 already = true;
5442 else
5443 bs.set(pos);
5444 });
5445 if (already) {
5446 derr << " " << oid << " extent " << e
5447 << " or a subset is already allocated" << dendl;
5448 ++errors;
5449 }
5450 if (e.end() > bdev->get_size()) {
5451 derr << " " << oid << " extent " << e
5452 << " past end of block device" << dendl;
5453 ++errors;
5454 }
5455 }
5456 return errors;
5457}
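// The used_blocks bitmap is the cross-check here: each valid physical extent
// should land on bits that are still clear; a bit that is already set means
// two owners (objects, bluefs, or the reserved superblock region) claim the
// same blocks, which is counted as an error above.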
5458
5459int BlueStore::fsck(bool deep)
5460{
5461 dout(1) << __func__ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
5462 int errors = 0;
5463
5464 typedef btree::btree_set<
5465 uint64_t,std::less<uint64_t>,
5466 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
5467 uint64_t_btree_t used_nids;
5468 uint64_t_btree_t used_omap_head;
5469 uint64_t_btree_t used_sbids;
5470
7c673cae 5471 mempool_dynamic_bitset used_blocks;
5472 KeyValueDB::Iterator it;
5473 store_statfs_t expected_statfs, actual_statfs;
5474 struct sb_info_t {
5475 list<ghobject_t> oids;
5476 SharedBlobRef sb;
5477 bluestore_extent_ref_map_t ref_map;
5478 bool compressed;
5479 };
5480 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
5481
5482 uint64_t num_objects = 0;
5483 uint64_t num_extents = 0;
5484 uint64_t num_blobs = 0;
5485 uint64_t num_spanning_blobs = 0;
5486 uint64_t num_shared_blobs = 0;
5487 uint64_t num_sharded_objects = 0;
5488 uint64_t num_object_shards = 0;
5489
5490 utime_t start = ceph_clock_now();
5491
5492 int r = _open_path();
5493 if (r < 0)
5494 return r;
5495 r = _open_fsid(false);
5496 if (r < 0)
5497 goto out_path;
5498
5499 r = _read_fsid(&fsid);
5500 if (r < 0)
5501 goto out_fsid;
5502
5503 r = _lock_fsid();
5504 if (r < 0)
5505 goto out_fsid;
5506
5507 r = _open_bdev(false);
5508 if (r < 0)
5509 goto out_fsid;
5510
5511 r = _open_db(false);
5512 if (r < 0)
5513 goto out_bdev;
5514
5515 r = _open_super_meta();
5516 if (r < 0)
5517 goto out_db;
5518
5519 r = _open_fm(false);
5520 if (r < 0)
5521 goto out_db;
5522
5523 r = _open_alloc();
5524 if (r < 0)
5525 goto out_fm;
5526
5527 r = _open_collections(&errors);
5528 if (r < 0)
5529 goto out_alloc;
5530
5531 mempool_thread.init();
5532
5533 // we need finishers and kv_{sync,finalize}_thread *just* for replay
5534 _kv_start();
7c673cae 5535 r = _deferred_replay();
31f18b77 5536 _kv_stop();
5537 if (r < 0)
5538 goto out_scan;
5539
5540 used_blocks.resize(bdev->get_size() / block_size);
5541 apply(
5542 0, SUPER_RESERVED, block_size, used_blocks, "0~SUPER_RESERVED",
5543 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5544 bs.set(pos);
5545 }
5546 );
5547
5548 if (bluefs) {
5549 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5550 apply(
5551 e.get_start(), e.get_len(), block_size, used_blocks, "bluefs",
5552 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5553 bs.set(pos);
5554 }
5555 );
5556 }
5557 r = bluefs->fsck();
5558 if (r < 0) {
5559 goto out_scan;
5560 }
5561 if (r > 0)
5562 errors += r;
5563 }
5564
5565 // get expected statfs; fill unaffected fields to be able to compare
5566 // structs
5567 statfs(&actual_statfs);
5568 expected_statfs.total = actual_statfs.total;
5569 expected_statfs.available = actual_statfs.available;
5570
5571 // walk PREFIX_OBJ
5572 dout(1) << __func__ << " walking object keyspace" << dendl;
5573 it = db->get_iterator(PREFIX_OBJ);
5574 if (it) {
5575 CollectionRef c;
5576 spg_t pgid;
5577 mempool::bluestore_fsck::list<string> expecting_shards;
5578 for (it->lower_bound(string()); it->valid(); it->next()) {
5579 if (g_conf->bluestore_debug_fsck_abort) {
5580 goto out_scan;
5581 }
5582 dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
5583 if (is_extent_shard_key(it->key())) {
5584 while (!expecting_shards.empty() &&
5585 expecting_shards.front() < it->key()) {
5586 derr << __func__ << " error: missing shard key "
5587 << pretty_binary_string(expecting_shards.front())
5588 << dendl;
5589 ++errors;
5590 expecting_shards.pop_front();
5591 }
5592 if (!expecting_shards.empty() &&
5593 expecting_shards.front() == it->key()) {
5594 // all good
5595 expecting_shards.pop_front();
5596 continue;
5597 }
5598
5599 uint32_t offset;
5600 string okey;
5601 get_key_extent_shard(it->key(), &okey, &offset);
5602 derr << __func__ << " error: stray shard 0x" << std::hex << offset
5603 << std::dec << dendl;
5604 if (expecting_shards.empty()) {
5605 derr << __func__ << " error: " << pretty_binary_string(it->key())
5606 << " is unexpected" << dendl;
5607 ++errors;
5608 continue;
5609 }
5610 while (expecting_shards.front() > it->key()) {
5611 derr << __func__ << " error: saw " << pretty_binary_string(it->key())
5612 << dendl;
5613 derr << __func__ << " error: exp "
5614 << pretty_binary_string(expecting_shards.front()) << dendl;
5615 ++errors;
5616 expecting_shards.pop_front();
5617 if (expecting_shards.empty()) {
5618 break;
5619 }
5620 }
5621 continue;
5622 }
5623
5624 ghobject_t oid;
5625 int r = get_key_object(it->key(), &oid);
5626 if (r < 0) {
5627 derr << __func__ << " error: bad object key "
5628 << pretty_binary_string(it->key()) << dendl;
5629 ++errors;
5630 continue;
5631 }
5632 if (!c ||
5633 oid.shard_id != pgid.shard ||
5634 oid.hobj.pool != (int64_t)pgid.pool() ||
5635 !c->contains(oid)) {
5636 c = nullptr;
5637 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
5638 coll_map.begin();
5639 p != coll_map.end();
5640 ++p) {
5641 if (p->second->contains(oid)) {
5642 c = p->second;
5643 break;
5644 }
5645 }
5646 if (!c) {
5647 derr << __func__ << " error: stray object " << oid
5648 << " not owned by any collection" << dendl;
5649 ++errors;
5650 continue;
5651 }
5652 c->cid.is_pg(&pgid);
5653 dout(20) << __func__ << " collection " << c->cid << dendl;
5654 }
5655
5656 if (!expecting_shards.empty()) {
5657 for (auto &k : expecting_shards) {
5658 derr << __func__ << " error: missing shard key "
5659 << pretty_binary_string(k) << dendl;
5660 }
5661 ++errors;
5662 expecting_shards.clear();
5663 }
5664
5665 dout(10) << __func__ << " " << oid << dendl;
5666 RWLock::RLocker l(c->lock);
5667 OnodeRef o = c->get_onode(oid, false);
5668 if (o->onode.nid) {
5669 if (o->onode.nid > nid_max) {
5670 derr << __func__ << " error: " << oid << " nid " << o->onode.nid
5671 << " > nid_max " << nid_max << dendl;
5672 ++errors;
5673 }
5674 if (used_nids.count(o->onode.nid)) {
5675 derr << __func__ << " error: " << oid << " nid " << o->onode.nid
5676 << " already in use" << dendl;
5677 ++errors;
5678 continue; // go for next object
5679 }
5680 used_nids.insert(o->onode.nid);
5681 }
5682 ++num_objects;
5683 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
5684 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
5685 _dump_onode(o, 30);
5686 // shards
5687 if (!o->extent_map.shards.empty()) {
5688 ++num_sharded_objects;
5689 num_object_shards += o->extent_map.shards.size();
5690 }
5691 for (auto& s : o->extent_map.shards) {
5692 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
5693 expecting_shards.push_back(string());
5694 get_extent_shard_key(o->key, s.shard_info->offset,
5695 &expecting_shards.back());
5696 if (s.shard_info->offset >= o->onode.size) {
5697 derr << __func__ << " error: " << oid << " shard 0x" << std::hex
5698 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
5699 << std::dec << dendl;
5700 ++errors;
5701 }
5702 }
5703 // lextents
5704 map<BlobRef,bluestore_blob_t::unused_t> referenced;
5705 uint64_t pos = 0;
5706 mempool::bluestore_fsck::map<BlobRef,
5707 bluestore_blob_use_tracker_t> ref_map;
5708 for (auto& l : o->extent_map.extent_map) {
5709 dout(20) << __func__ << " " << l << dendl;
5710 if (l.logical_offset < pos) {
5711 derr << __func__ << " error: " << oid << " lextent at 0x"
5712 << std::hex << l.logical_offset
5713 << " overlaps with the previous, which ends at 0x" << pos
5714 << std::dec << dendl;
5715 ++errors;
5716 }
5717 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
5718 derr << __func__ << " error: " << oid << " lextent at 0x"
5719 << std::hex << l.logical_offset << "~" << l.length
5720 << " spans a shard boundary"
5721 << std::dec << dendl;
5722 ++errors;
5723 }
5724 pos = l.logical_offset + l.length;
5725 expected_statfs.stored += l.length;
5726 assert(l.blob);
5727 const bluestore_blob_t& blob = l.blob->get_blob();
5728
5729 auto& ref = ref_map[l.blob];
5730 if (ref.is_empty()) {
5731 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
5732 uint32_t l = blob.get_logical_length();
5733 ref.init(l, min_release_size);
5734 }
5735 ref.get(
5736 l.blob_offset,
5737 l.length);
5738 ++num_extents;
5739 if (blob.has_unused()) {
5740 auto p = referenced.find(l.blob);
5741 bluestore_blob_t::unused_t *pu;
5742 if (p == referenced.end()) {
5743 pu = &referenced[l.blob];
5744 } else {
5745 pu = &p->second;
5746 }
5747 uint64_t blob_len = blob.get_logical_length();
5748 assert((blob_len % (sizeof(*pu)*8)) == 0);
5749 assert(l.blob_offset + l.length <= blob_len);
5750 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
5751 uint64_t start = l.blob_offset / chunk_size;
5752 uint64_t end =
5753 ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
5754 for (auto i = start; i < end; ++i) {
5755 (*pu) |= (1u << i);
5756 }
5757 }
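// Worked example of the unused-bitmap math above (field width assumed for
// illustration): with a 0x8000-byte blob and a 16-bit unused field, each bit
// covers chunk_size = 0x8000 / 16 = 0x800 bytes; an lextent at blob offset
// 0x1000~0x1800 then sets bits [0x1000/0x800, roundup(0x2800, 0x800)/0x800)
// = bits 2..4, recording that those chunks are actually referenced.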
5758 }
5759 for (auto &i : referenced) {
5760 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
5761 << std::dec << " for " << *i.first << dendl;
5762 const bluestore_blob_t& blob = i.first->get_blob();
5763 if (i.second & blob.unused) {
5764 derr << __func__ << " error: " << oid << " blob claims unused 0x"
5765 << std::hex << blob.unused
5766 << " but extents reference 0x" << i.second
5767 << " on blob " << *i.first << dendl;
5768 ++errors;
5769 }
5770 if (blob.has_csum()) {
5771 uint64_t blob_len = blob.get_logical_length();
5772 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
5773 unsigned csum_count = blob.get_csum_count();
5774 unsigned csum_chunk_size = blob.get_csum_chunk_size();
5775 for (unsigned p = 0; p < csum_count; ++p) {
5776 unsigned pos = p * csum_chunk_size;
5777 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
5778 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
5779 unsigned mask = 1u << firstbit;
5780 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
5781 mask |= 1u << b;
5782 }
5783 if ((blob.unused & mask) == mask) {
5784 // this csum chunk region is marked unused
5785 if (blob.get_csum_item(p) != 0) {
5786 derr << __func__ << " error: " << oid
5787 << " blob claims csum chunk 0x" << std::hex << pos
5788 << "~" << csum_chunk_size
5789 << " is unused (mask 0x" << mask << " of unused 0x"
5790 << blob.unused << ") but csum is non-zero 0x"
5791 << blob.get_csum_item(p) << std::dec << " on blob "
5792 << *i.first << dendl;
5793 ++errors;
5794 }
5795 }
5796 }
5797 }
5798 }
5799 for (auto &i : ref_map) {
5800 ++num_blobs;
5801 const bluestore_blob_t& blob = i.first->get_blob();
5802 bool equal = i.first->get_blob_use_tracker().equal(i.second);
5803 if (!equal) {
5804 derr << __func__ << " error: " << oid << " blob " << *i.first
5805 << " doesn't match expected ref_map " << i.second << dendl;
5806 ++errors;
5807 }
5808 if (blob.is_compressed()) {
5809 expected_statfs.compressed += blob.get_compressed_payload_length();
5810 expected_statfs.compressed_original +=
5811 i.first->get_referenced_bytes();
5812 }
5813 if (blob.is_shared()) {
5814 if (i.first->shared_blob->get_sbid() > blobid_max) {
5815 derr << __func__ << " error: " << oid << " blob " << blob
5816 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
5817 << blobid_max << dendl;
5818 ++errors;
5819 } else if (i.first->shared_blob->get_sbid() == 0) {
5820 derr << __func__ << " error: " << oid << " blob " << blob
5821 << " marked as shared but has uninitialized sbid"
5822 << dendl;
5823 ++errors;
5824 }
5825 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
5826 sbi.sb = i.first->shared_blob;
5827 sbi.oids.push_back(oid);
5828 sbi.compressed = blob.is_compressed();
5829 for (auto e : blob.get_extents()) {
5830 if (e.is_valid()) {
5831 sbi.ref_map.get(e.offset, e.length);
5832 }
5833 }
5834 } else {
5835 errors += _fsck_check_extents(oid, blob.get_extents(),
5836 blob.is_compressed(),
5837 used_blocks,
5838 expected_statfs);
5839 }
5840 }
5841 if (deep) {
5842 bufferlist bl;
5843 int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
5844 if (r < 0) {
5845 ++errors;
5846 derr << __func__ << " error: " << oid << " error during read: "
5847 << cpp_strerror(r) << dendl;
5848 }
5849 }
5850 // omap
5851 if (o->onode.has_omap()) {
5852 if (used_omap_head.count(o->onode.nid)) {
5853 derr << __func__ << " error: " << oid << " omap_head " << o->onode.nid
5854 << " already in use" << dendl;
5855 ++errors;
5856 } else {
5857 used_omap_head.insert(o->onode.nid);
5858 }
5859 }
5860 }
5861 }
5862 dout(1) << __func__ << " checking shared_blobs" << dendl;
5863 it = db->get_iterator(PREFIX_SHARED_BLOB);
5864 if (it) {
5865 for (it->lower_bound(string()); it->valid(); it->next()) {
5866 string key = it->key();
5867 uint64_t sbid;
5868 if (get_key_shared_blob(key, &sbid)) {
5869 derr << __func__ << " error: bad key '" << key
5870 << "' in shared blob namespace" << dendl;
5871 ++errors;
5872 continue;
5873 }
5874 auto p = sb_info.find(sbid);
5875 if (p == sb_info.end()) {
5876 derr << __func__ << " error: found stray shared blob data for sbid 0x"
5877 << std::hex << sbid << std::dec << dendl;
5878 ++errors;
5879 } else {
5880 ++num_shared_blobs;
5881 sb_info_t& sbi = p->second;
5882 bluestore_shared_blob_t shared_blob(sbid);
5883 bufferlist bl = it->value();
5884 bufferlist::iterator blp = bl.begin();
5885 ::decode(shared_blob, blp);
5886 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
5887 if (shared_blob.ref_map != sbi.ref_map) {
5888 derr << __func__ << " error: shared blob 0x" << std::hex << sbid
5889 << std::dec << " ref_map " << shared_blob.ref_map
5890 << " != expected " << sbi.ref_map << dendl;
5891 ++errors;
5892 }
5893 PExtentVector extents;
5894 for (auto &r : shared_blob.ref_map.ref_map) {
5895 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
5896 }
5897 errors += _fsck_check_extents(p->second.oids.front(),
5898 extents,
5899 p->second.compressed,
5900 used_blocks, expected_statfs);
5901 sb_info.erase(p);
5902 }
5903 }
5904 }
5905 for (auto &p : sb_info) {
5906 derr << __func__ << " error: shared_blob 0x" << p.first
5907 << " key is missing (" << *p.second.sb << ")" << dendl;
5908 ++errors;
5909 }
5910 if (!(actual_statfs == expected_statfs)) {
5911 derr << __func__ << " error: actual " << actual_statfs
5912 << " != expected " << expected_statfs << dendl;
5913 ++errors;
5914 }
5915
5916 dout(1) << __func__ << " checking for stray omap data" << dendl;
5917 it = db->get_iterator(PREFIX_OMAP);
5918 if (it) {
5919 for (it->lower_bound(string()); it->valid(); it->next()) {
5920 uint64_t omap_head;
5921 _key_decode_u64(it->key().c_str(), &omap_head);
5922 if (used_omap_head.count(omap_head) == 0) {
5923 derr << __func__ << " error: found stray omap data on omap_head "
5924 << omap_head << dendl;
5925 ++errors;
5926 }
5927 }
5928 }
5929
5930 dout(1) << __func__ << " checking deferred events" << dendl;
5931 it = db->get_iterator(PREFIX_DEFERRED);
5932 if (it) {
5933 for (it->lower_bound(string()); it->valid(); it->next()) {
5934 bufferlist bl = it->value();
5935 bufferlist::iterator p = bl.begin();
5936 bluestore_deferred_transaction_t wt;
5937 try {
5938 ::decode(wt, p);
5939 } catch (buffer::error& e) {
5940 derr << __func__ << " error: failed to decode deferred txn "
5941 << pretty_binary_string(it->key()) << dendl;
5942 r = -EIO;
5943 goto out_scan;
5944 }
5945 dout(20) << __func__ << " deferred " << wt.seq
5946 << " ops " << wt.ops.size()
5947 << " released 0x" << std::hex << wt.released << std::dec << dendl;
5948 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
5949 apply(
5950 e.get_start(), e.get_len(), block_size, used_blocks, "deferred",
5951 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5952 bs.set(pos);
5953 }
5954 );
5955 }
5956 }
5957 }
5958
5959 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
5960 {
5961 // remove bluefs_extents from used set since the freelist doesn't
5962 // know they are allocated.
5963 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5964 apply(
5965 e.get_start(), e.get_len(), block_size, used_blocks, "bluefs_extents",
5966 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5967 bs.reset(pos);
5968 }
5969 );
5970 }
5971 fm->enumerate_reset();
5972 uint64_t offset, length;
5973 while (fm->enumerate_next(&offset, &length)) {
5974 bool intersects = false;
5975 apply(
5976 offset, length, block_size, used_blocks, "free",
5977 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5978 if (bs.test(pos)) {
5979 intersects = true;
5980 } else {
5981 bs.set(pos);
5982 }
5983 }
5984 );
5985 if (intersects) {
5986 derr << __func__ << " error: free extent 0x" << std::hex << offset
5987 << "~" << length << std::dec
5988 << " intersects allocated blocks" << dendl;
5989 ++errors;
5990 }
5991 }
5992 fm->enumerate_reset();
5993 size_t count = used_blocks.count();
5994 if (used_blocks.size() != count) {
5995 assert(used_blocks.size() > count);
5996 derr << __func__ << " error: leaked some space; "
5997 << (used_blocks.size() - count) * block_size
5998 << " bytes leaked" << dendl;
5999 ++errors;
6000 }
6001 }
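// By this point every block should have been marked either by a reference
// found above (superblock reserve, bluefs, object extents, shared blobs,
// deferred releases) or by a freelist extent; any bit still clear therefore
// belongs to space that is neither free nor referenced, i.e. leaked.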
6002
6003 out_scan:
6004 mempool_thread.shutdown();
6005 _flush_cache();
6006 out_alloc:
6007 _close_alloc();
6008 out_fm:
6009 _close_fm();
6010 out_db:
6011 it.reset(); // before db is closed
6012 _close_db();
6013 out_bdev:
6014 _close_bdev();
6015 out_fsid:
6016 _close_fsid();
6017 out_path:
6018 _close_path();
6019
6020 // fatal errors take precedence
6021 if (r < 0)
6022 return r;
6023
6024 dout(2) << __func__ << " " << num_objects << " objects, "
6025 << num_sharded_objects << " of them sharded. "
6026 << dendl;
6027 dout(2) << __func__ << " " << num_extents << " extents to "
6028 << num_blobs << " blobs, "
6029 << num_spanning_blobs << " spanning, "
6030 << num_shared_blobs << " shared."
6031 << dendl;
6032
6033 utime_t duration = ceph_clock_now() - start;
6034 dout(1) << __func__ << " finish with " << errors << " errors in "
6035 << duration << " seconds" << dendl;
6036 return errors;
6037}
6038
6039void BlueStore::collect_metadata(map<string,string> *pm)
6040{
6041 dout(10) << __func__ << dendl;
6042 bdev->collect_metadata("bluestore_bdev_", pm);
6043 if (bluefs) {
6044 (*pm)["bluefs"] = "1";
6045 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
6046 bluefs->collect_metadata(pm);
6047 } else {
6048 (*pm)["bluefs"] = "0";
6049 }
6050}
6051
6052int BlueStore::statfs(struct store_statfs_t *buf)
6053{
6054 buf->reset();
6055 buf->total = bdev->get_size();
6056 buf->available = alloc->get_free();
6057
6058 if (bluefs) {
6059 // part of our shared device is "free" according to BlueFS
6060 // Don't include bluestore_bluefs_min because that space can't
6061 // be used for any other purpose.
6062 buf->available += bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;
6063
6064 // include dedicated db, too, if that isn't the shared device.
6065 if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
6066 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
6067 }
6068 }
6069
6070 {
6071 std::lock_guard<std::mutex> l(vstatfs_lock);
6072
6073 buf->allocated = vstatfs.allocated();
6074 buf->stored = vstatfs.stored();
6075 buf->compressed = vstatfs.compressed();
6076 buf->compressed_original = vstatfs.compressed_original();
6077 buf->compressed_allocated = vstatfs.compressed_allocated();
6078 }
6079
6080 dout(20) << __func__ << *buf << dendl;
6081 return 0;
6082}
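// Summary of the accounting above: total/available start from the raw device
// size and the allocator's free space, BlueFS free space on the shared device
// is added back (minus bluestore_bluefs_min, which stays reserved for BlueFS),
// a dedicated DB device contributes to total, and the remaining fields come
// from the in-memory vstatfs counters under vstatfs_lock.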
6083
6084// ---------------
6085// cache
6086
6087BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
6088{
6089 RWLock::RLocker l(coll_lock);
6090 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
6091 if (cp == coll_map.end())
6092 return CollectionRef();
6093 return cp->second;
6094}
6095
6096void BlueStore::_queue_reap_collection(CollectionRef& c)
6097{
6098 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6099 std::lock_guard<std::mutex> l(reap_lock);
6100 removed_collections.push_back(c);
6101}
6102
6103void BlueStore::_reap_collections()
6104{
6105 list<CollectionRef> removed_colls;
6106 {
6107 std::lock_guard<std::mutex> l(reap_lock);
6108 removed_colls.swap(removed_collections);
6109 }
6110
6111 bool all_reaped = true;
6112
6113 for (list<CollectionRef>::iterator p = removed_colls.begin();
6114 p != removed_colls.end();
6115 ++p) {
6116 CollectionRef c = *p;
6117 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6118 if (c->onode_map.map_any([&](OnodeRef o) {
6119 assert(!o->exists);
6120 if (o->flushing_count.load()) {
6121 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
6122 << " flush_txns " << o->flushing_count << dendl;
6123 return false;
6124 }
6125 return true;
6126 })) {
6127 all_reaped = false;
6128 continue;
6129 }
6130 c->onode_map.clear();
6131 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
6132 }
6133
6134 if (all_reaped) {
6135 dout(10) << __func__ << " all reaped" << dendl;
6136 }
6137}
6138
6139void BlueStore::_update_cache_logger()
6140{
6141 uint64_t num_onodes = 0;
6142 uint64_t num_extents = 0;
6143 uint64_t num_blobs = 0;
6144 uint64_t num_buffers = 0;
6145 uint64_t num_buffer_bytes = 0;
6146 for (auto c : cache_shards) {
6147 c->add_stats(&num_onodes, &num_extents, &num_blobs,
6148 &num_buffers, &num_buffer_bytes);
6149 }
6150 logger->set(l_bluestore_onodes, num_onodes);
6151 logger->set(l_bluestore_extents, num_extents);
6152 logger->set(l_bluestore_blobs, num_blobs);
6153 logger->set(l_bluestore_buffers, num_buffers);
6154 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
6155}
6156
6157// ---------------
6158// read operations
6159
6160ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
6161{
6162 return _get_collection(cid);
6163}
6164
6165bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
6166{
6167 CollectionHandle c = _get_collection(cid);
6168 if (!c)
6169 return false;
6170 return exists(c, oid);
6171}
6172
6173bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
6174{
6175 Collection *c = static_cast<Collection *>(c_.get());
6176 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
6177 if (!c->exists)
6178 return false;
6179
6180 bool r = true;
6181
6182 {
6183 RWLock::RLocker l(c->lock);
6184 OnodeRef o = c->get_onode(oid, false);
6185 if (!o || !o->exists)
6186 r = false;
6187 }
6188
6189 return r;
6190}
6191
6192int BlueStore::stat(
6193 const coll_t& cid,
6194 const ghobject_t& oid,
6195 struct stat *st,
6196 bool allow_eio)
6197{
6198 CollectionHandle c = _get_collection(cid);
6199 if (!c)
6200 return -ENOENT;
6201 return stat(c, oid, st, allow_eio);
6202}
6203
6204int BlueStore::stat(
6205 CollectionHandle &c_,
6206 const ghobject_t& oid,
6207 struct stat *st,
6208 bool allow_eio)
6209{
6210 Collection *c = static_cast<Collection *>(c_.get());
6211 if (!c->exists)
6212 return -ENOENT;
6213 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
6214
6215 {
6216 RWLock::RLocker l(c->lock);
6217 OnodeRef o = c->get_onode(oid, false);
6218 if (!o || !o->exists)
6219 return -ENOENT;
6220 st->st_size = o->onode.size;
6221 st->st_blksize = 4096;
6222 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
6223 st->st_nlink = 1;
6224 }
6225
6226 int r = 0;
6227 if (_debug_mdata_eio(oid)) {
6228 r = -EIO;
6229 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6230 }
6231 return r;
6232}
6233int BlueStore::set_collection_opts(
6234 const coll_t& cid,
6235 const pool_opts_t& opts)
6236{
6237 CollectionHandle ch = _get_collection(cid);
6238 if (!ch)
6239 return -ENOENT;
6240 Collection *c = static_cast<Collection *>(ch.get());
6241 dout(15) << __func__ << " " << cid << " options " << opts << dendl;
6242 if (!c->exists)
6243 return -ENOENT;
6244 RWLock::WLocker l(c->lock);
6245 c->pool_opts = opts;
6246 return 0;
6247}
6248
6249int BlueStore::read(
6250 const coll_t& cid,
6251 const ghobject_t& oid,
6252 uint64_t offset,
6253 size_t length,
6254 bufferlist& bl,
6255 uint32_t op_flags)
6256{
6257 CollectionHandle c = _get_collection(cid);
6258 if (!c)
6259 return -ENOENT;
6260 return read(c, oid, offset, length, bl, op_flags);
6261}
6262
6263int BlueStore::read(
6264 CollectionHandle &c_,
6265 const ghobject_t& oid,
6266 uint64_t offset,
6267 size_t length,
6268 bufferlist& bl,
6269 uint32_t op_flags)
6270{
6271 utime_t start = ceph_clock_now();
6272 Collection *c = static_cast<Collection *>(c_.get());
6273 const coll_t &cid = c->get_cid();
6274 dout(15) << __func__ << " " << cid << " " << oid
6275 << " 0x" << std::hex << offset << "~" << length << std::dec
6276 << dendl;
6277 if (!c->exists)
6278 return -ENOENT;
6279
6280 bl.clear();
6281 int r;
6282 {
6283 RWLock::RLocker l(c->lock);
6284 utime_t start1 = ceph_clock_now();
6285 OnodeRef o = c->get_onode(oid, false);
6286 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
6287 if (!o || !o->exists) {
6288 r = -ENOENT;
6289 goto out;
6290 }
6291
6292 if (offset == length && offset == 0)
6293 length = o->onode.size;
6294
6295 r = _do_read(c, o, offset, length, bl, op_flags);
6296 }
6297
6298 out:
6299 if (r == 0 && _debug_data_eio(oid)) {
6300 r = -EIO;
6301 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6302 } else if (cct->_conf->bluestore_debug_random_read_err &&
6303 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) {
6304 dout(0) << __func__ << ": inject random EIO" << dendl;
6305 r = -EIO;
6306 }
6307 dout(10) << __func__ << " " << cid << " " << oid
6308 << " 0x" << std::hex << offset << "~" << length << std::dec
6309 << " = " << r << dendl;
6310 logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
6311 return r;
6312}
6313
6314// --------------------------------------------------------
6315// intermediate data structures used while reading
6316struct region_t {
6317 uint64_t logical_offset;
6318 uint64_t blob_xoffset; //region offset within the blob
6319 uint64_t length;
6320 bufferlist bl;
6321
6322 // used later in read process
6323 uint64_t front = 0;
6324 uint64_t r_off = 0;
6325
6326 region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
6327 : logical_offset(offset),
6328 blob_xoffset(b_offs),
6329 length(len){}
6330 region_t(const region_t& from)
6331 : logical_offset(from.logical_offset),
6332 blob_xoffset(from.blob_xoffset),
6333 length(from.length){}
6334
6335 friend ostream& operator<<(ostream& out, const region_t& r) {
6336 return out << "0x" << std::hex << r.logical_offset << ":"
6337 << r.blob_xoffset << "~" << r.length << std::dec;
6338 }
6339};
6340
6341typedef list<region_t> regions2read_t;
6342typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
6343
6344int BlueStore::_do_read(
6345 Collection *c,
6346 OnodeRef o,
6347 uint64_t offset,
6348 size_t length,
6349 bufferlist& bl,
6350 uint32_t op_flags)
6351{
6352 FUNCTRACE();
6353 int r = 0;
6354
6355 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6356 << " size 0x" << o->onode.size << " (" << std::dec
6357 << o->onode.size << ")" << dendl;
6358 bl.clear();
6359
6360 if (offset >= o->onode.size) {
6361 return r;
6362 }
6363
6364 // generally, don't buffer anything, unless the client explicitly requests
6365 // it.
6366 bool buffered = false;
6367 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
6368 dout(20) << __func__ << " will do buffered read" << dendl;
6369 buffered = true;
6370 } else if (cct->_conf->bluestore_default_buffered_read &&
6371 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
6372 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
6373 dout(20) << __func__ << " defaulting to buffered read" << dendl;
6374 buffered = true;
6375 }
6376
6377 if (offset + length > o->onode.size) {
6378 length = o->onode.size - offset;
6379 }
6380
6381 utime_t start = ceph_clock_now();
6382 o->extent_map.fault_range(db, offset, length);
6383 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
6384 _dump_onode(o);
6385
6386 ready_regions_t ready_regions;
6387
6388 // build blob-wise list of stuff to read (that isn't cached)
6389 blobs2read_t blobs2read;
6390 unsigned left = length;
6391 uint64_t pos = offset;
6392 unsigned num_regions = 0;
6393 auto lp = o->extent_map.seek_lextent(offset);
6394 while (left > 0 && lp != o->extent_map.extent_map.end()) {
6395 if (pos < lp->logical_offset) {
6396 unsigned hole = lp->logical_offset - pos;
6397 if (hole >= left) {
6398 break;
6399 }
6400 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
6401 << std::dec << dendl;
6402 pos += hole;
6403 left -= hole;
6404 }
6405 BlobRef bptr = lp->blob;
6406 unsigned l_off = pos - lp->logical_offset;
6407 unsigned b_off = l_off + lp->blob_offset;
6408 unsigned b_len = std::min(left, lp->length - l_off);
6409
6410 ready_regions_t cache_res;
6411 interval_set<uint32_t> cache_interval;
6412 bptr->shared_blob->bc.read(
6413 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
6414 dout(20) << __func__ << " blob " << *bptr << std::hex
6415 << " need 0x" << b_off << "~" << b_len
6416 << " cache has 0x" << cache_interval
6417 << std::dec << dendl;
6418
6419 auto pc = cache_res.begin();
6420 while (b_len > 0) {
6421 unsigned l;
6422 if (pc != cache_res.end() &&
6423 pc->first == b_off) {
6424 l = pc->second.length();
6425 ready_regions[pos].claim(pc->second);
6426 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
6427 << b_off << "~" << l << std::dec << dendl;
6428 ++pc;
6429 } else {
6430 l = b_len;
6431 if (pc != cache_res.end()) {
6432 assert(pc->first > b_off);
6433 l = pc->first - b_off;
6434 }
6435 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
6436 << b_off << "~" << l << std::dec << dendl;
6437 blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
6438 ++num_regions;
6439 }
6440 pos += l;
6441 b_off += l;
6442 left -= l;
6443 b_len -= l;
6444 }
6445 ++lp;
6446 }
6447
6448 // read raw blob data. use aio if we have >1 blobs to read.
6449 start = ceph_clock_now(); // for the sake of simplicity
6450 // measure the whole block below;
6451 // the resulting inaccuracy is negligible
6452 vector<bufferlist> compressed_blob_bls;
6453 IOContext ioc(cct, NULL);
6454 for (auto& p : blobs2read) {
6455 BlobRef bptr = p.first;
6456 dout(20) << __func__ << " blob " << *bptr << std::hex
6457 << " need " << p.second << std::dec << dendl;
6458 if (bptr->get_blob().is_compressed()) {
6459 // read the whole thing
6460 if (compressed_blob_bls.empty()) {
6461 // ensure we avoid any reallocation on subsequent blobs
6462 compressed_blob_bls.reserve(blobs2read.size());
6463 }
6464 compressed_blob_bls.push_back(bufferlist());
6465 bufferlist& bl = compressed_blob_bls.back();
6466 r = bptr->get_blob().map(
6467 0, bptr->get_blob().get_ondisk_length(),
6468 [&](uint64_t offset, uint64_t length) {
6469 int r;
6470 // use aio if there are more regions to read than those in this blob
6471 if (num_regions > p.second.size()) {
6472 r = bdev->aio_read(offset, length, &bl, &ioc);
6473 } else {
6474 r = bdev->read(offset, length, &bl, &ioc, false);
6475 }
6476 if (r < 0)
6477 return r;
6478 return 0;
6479 });
6480 assert(r == 0);
6481 } else {
6482 // read the pieces
6483 for (auto& reg : p.second) {
6484 // determine how much of the blob to read
6485 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
6486 reg.r_off = reg.blob_xoffset;
6487 uint64_t r_len = reg.length;
6488 reg.front = reg.r_off % chunk_size;
6489 if (reg.front) {
6490 reg.r_off -= reg.front;
6491 r_len += reg.front;
6492 }
6493 unsigned tail = r_len % chunk_size;
6494 if (tail) {
6495 r_len += chunk_size - tail;
6496 }
6497 dout(20) << __func__ << " region 0x" << std::hex
6498 << reg.logical_offset
6499 << ": 0x" << reg.blob_xoffset << "~" << reg.length
6500 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
6501 << dendl;
6502
6503 // read it
6504 r = bptr->get_blob().map(
6505 reg.r_off, r_len,
6506 [&](uint64_t offset, uint64_t length) {
6507 int r;
6508 // use aio if there is more than one region to read
6509 if (num_regions > 1) {
6510 r = bdev->aio_read(offset, length, &reg.bl, &ioc);
6511 } else {
6512 r = bdev->read(offset, length, &reg.bl, &ioc, false);
6513 }
6514 if (r < 0)
6515 return r;
6516 return 0;
6517 });
6518 assert(r == 0);
6519 assert(reg.bl.length() == r_len);
6520 }
6521 }
6522 }
6523 if (ioc.has_pending_aios()) {
6524 bdev->aio_submit(&ioc);
6525 dout(20) << __func__ << " waiting for aio" << dendl;
6526 ioc.aio_wait();
6527 }
6528 logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
6529
6530 // enumerate and decompress desired blobs
6531 auto p = compressed_blob_bls.begin();
6532 blobs2read_t::iterator b2r_it = blobs2read.begin();
6533 while (b2r_it != blobs2read.end()) {
6534 BlobRef bptr = b2r_it->first;
6535 dout(20) << __func__ << " blob " << *bptr << std::hex
6536 << " need 0x" << b2r_it->second << std::dec << dendl;
6537 if (bptr->get_blob().is_compressed()) {
6538 assert(p != compressed_blob_bls.end());
6539 bufferlist& compressed_bl = *p++;
6540 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
6541 b2r_it->second.front().logical_offset) < 0) {
6542 return -EIO;
6543 }
6544 bufferlist raw_bl;
6545 r = _decompress(compressed_bl, &raw_bl);
6546 if (r < 0)
6547 return r;
6548 if (buffered) {
6549 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
6550 raw_bl);
6551 }
6552 for (auto& i : b2r_it->second) {
6553 ready_regions[i.logical_offset].substr_of(
6554 raw_bl, i.blob_xoffset, i.length);
6555 }
6556 } else {
6557 for (auto& reg : b2r_it->second) {
6558 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
6559 reg.logical_offset) < 0) {
6560 return -EIO;
6561 }
6562 if (buffered) {
6563 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
6564 reg.r_off, reg.bl);
6565 }
6566
6567 // prune and keep result
6568 ready_regions[reg.logical_offset].substr_of(
6569 reg.bl, reg.front, reg.length);
6570 }
6571 }
6572 ++b2r_it;
6573 }
6574
6575 // generate a resulting buffer
6576 auto pr = ready_regions.begin();
6577 auto pr_end = ready_regions.end();
6578 pos = 0;
6579 while (pos < length) {
6580 if (pr != pr_end && pr->first == pos + offset) {
6581 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6582 << ": data from 0x" << pr->first << "~" << pr->second.length()
6583 << std::dec << dendl;
6584 pos += pr->second.length();
6585 bl.claim_append(pr->second);
6586 ++pr;
6587 } else {
6588 uint64_t l = length - pos;
6589 if (pr != pr_end) {
6590 assert(pr->first > pos + offset);
6591 l = pr->first - (pos + offset);
6592 }
6593 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6594 << ": zeros for 0x" << (pos + offset) << "~" << l
6595 << std::dec << dendl;
6596 bl.append_zero(l);
6597 pos += l;
6598 }
6599 }
6600 assert(bl.length() == length);
6601 assert(pos == length);
6602 assert(pr == pr_end);
6603 r = bl.length();
6604 return r;
6605}
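// Caller sketch for the fadvise handling in _do_read() above (object handle
// and length are placeholders): WILLNEED forces a buffered read that
// populates the BlueStore cache, while DONTNEED/NOCACHE keep the read
// unbuffered even when bluestore_default_buffered_read is enabled.
//
//   bufferlist bl;
//   int r = store.read(ch, oid, 0, 0x1000, bl,
//                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);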
6606
6607int BlueStore::_verify_csum(OnodeRef& o,
6608 const bluestore_blob_t* blob, uint64_t blob_xoffset,
6609 const bufferlist& bl,
6610 uint64_t logical_offset) const
6611{
6612 int bad;
6613 uint64_t bad_csum;
6614 utime_t start = ceph_clock_now();
6615 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
6616 if (r < 0) {
6617 if (r == -1) {
6618 PExtentVector pex;
6619 blob->map(
6620 bad,
6621 blob->get_csum_chunk_size(),
6622 [&](uint64_t offset, uint64_t length) {
6623 pex.emplace_back(bluestore_pextent_t(offset, length));
6624 return 0;
6625 });
6626 derr << __func__ << " bad "
6627 << Checksummer::get_csum_type_string(blob->csum_type)
6628 << "/0x" << std::hex << blob->get_csum_chunk_size()
6629 << " checksum at blob offset 0x" << bad
6630 << ", got 0x" << bad_csum << ", expected 0x"
6631 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
6632 << ", device location " << pex
6633 << ", logical extent 0x" << std::hex
6634 << (logical_offset + bad - blob_xoffset) << "~"
6635 << blob->get_csum_chunk_size() << std::dec
6636 << ", object " << o->oid
6637 << dendl;
6638 } else {
6639 derr << __func__ << " failed with error: " << cpp_strerror(r) << dendl;
6640 }
6641 }
6642 logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
6643 return r;
6644}
6645
6646int BlueStore::_decompress(bufferlist& source, bufferlist* result)
6647{
6648 int r = 0;
6649 utime_t start = ceph_clock_now();
6650 bufferlist::iterator i = source.begin();
6651 bluestore_compression_header_t chdr;
6652 ::decode(chdr, i);
6653 int alg = int(chdr.type);
6654 CompressorRef cp = compressor;
6655 if (!cp || (int)cp->get_type() != alg) {
6656 cp = Compressor::create(cct, alg);
6657 }
6658
6659 if (!cp.get()) {
6660 // if compressor isn't available - error, because cannot return
6661 // decompressed data?
6662 derr << __func__ << " can't load decompressor " << alg << dendl;
6663 r = -EIO;
6664 } else {
6665 r = cp->decompress(i, chdr.length, *result);
6666 if (r < 0) {
6667 derr << __func__ << " decompression failed with error code " << r << dendl;
6668 r = -EIO;
6669 }
6670 }
6671 logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
6672 return r;
6673}
6674
6675// this stores the fiemap result into an interval_set; the other fiemap
6676// variants below use it internally
6677int BlueStore::_fiemap(
6678 CollectionHandle &c_,
6679 const ghobject_t& oid,
6680 uint64_t offset,
6681 size_t length,
6682 interval_set<uint64_t>& destset)
6683{
6684 Collection *c = static_cast<Collection *>(c_.get());
6685 if (!c->exists)
6686 return -ENOENT;
6687 {
6688 RWLock::RLocker l(c->lock);
6689
6690 OnodeRef o = c->get_onode(oid, false);
6691 if (!o || !o->exists) {
6692 return -ENOENT;
6693 }
6694 _dump_onode(o);
6695
6696 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6697 << " size 0x" << o->onode.size << std::dec << dendl;
6698
6699 boost::intrusive::set<Extent>::iterator ep, eend;
6700 if (offset >= o->onode.size)
6701 goto out;
6702
6703 if (offset + length > o->onode.size) {
6704 length = o->onode.size - offset;
6705 }
6706
6707 o->extent_map.fault_range(db, offset, length);
6708 eend = o->extent_map.extent_map.end();
6709 ep = o->extent_map.seek_lextent(offset);
6710 while (length > 0) {
6711 dout(20) << __func__ << " offset " << offset << dendl;
6712 if (ep != eend && ep->logical_offset + ep->length <= offset) {
6713 ++ep;
6714 continue;
6715 }
6716
6717 uint64_t x_len = length;
6718 if (ep != eend && ep->logical_offset <= offset) {
6719 uint64_t x_off = offset - ep->logical_offset;
6720 x_len = MIN(x_len, ep->length - x_off);
6721 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
6722 << x_len << std::dec << " blob " << ep->blob << dendl;
6723 destset.insert(offset, x_len);
6724 length -= x_len;
6725 offset += x_len;
6726 if (x_off + x_len == ep->length)
6727 ++ep;
6728 continue;
6729 }
6730 if (ep != eend &&
6731 ep->logical_offset > offset &&
6732 ep->logical_offset - offset < x_len) {
6733 x_len = ep->logical_offset - offset;
6734 }
6735 offset += x_len;
6736 length -= x_len;
6737 }
6738 }
6739
6740 out:
6741 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6742 << " size = 0x(" << destset << ")" << std::dec << dendl;
6743 return 0;
6744}
6745
6746int BlueStore::fiemap(
6747 const coll_t& cid,
6748 const ghobject_t& oid,
6749 uint64_t offset,
6750 size_t len,
6751 bufferlist& bl)
6752{
6753 CollectionHandle c = _get_collection(cid);
6754 if (!c)
6755 return -ENOENT;
6756 return fiemap(c, oid, offset, len, bl);
6757}
6758
6759int BlueStore::fiemap(
6760 CollectionHandle &c_,
6761 const ghobject_t& oid,
6762 uint64_t offset,
6763 size_t length,
6764 bufferlist& bl)
6765{
6766 interval_set<uint64_t> m;
6767 int r = _fiemap(c_, oid, offset, length, m);
6768 if (r >= 0) {
6769 ::encode(m, bl);
6770 }
6771 return r;
6772}
6773
6774int BlueStore::fiemap(
6775 const coll_t& cid,
6776 const ghobject_t& oid,
6777 uint64_t offset,
6778 size_t len,
6779 map<uint64_t, uint64_t>& destmap)
6780{
6781 CollectionHandle c = _get_collection(cid);
6782 if (!c)
6783 return -ENOENT;
6784 return fiemap(c, oid, offset, len, destmap);
6785}
6786
6787int BlueStore::fiemap(
6788 CollectionHandle &c_,
6789 const ghobject_t& oid,
6790 uint64_t offset,
6791 size_t length,
6792 map<uint64_t, uint64_t>& destmap)
6793{
6794 interval_set<uint64_t> m;
6795 int r = _fiemap(c_, oid, offset, length, m);
6796 if (r >= 0) {
6797 m.move_into(destmap);
6798 }
6799 return r;
6800}
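// Caller sketch (handle and size are placeholders): both public variants wrap
// _fiemap(); one encodes the interval_set into a bufferlist, this one copies
// it into a plain offset -> length map of the object's mapped (non-hole)
// logical ranges.
//
//   map<uint64_t, uint64_t> m;
//   int r = store.fiemap(ch, oid, 0, object_size, m);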
6801
6802int BlueStore::getattr(
6803 const coll_t& cid,
6804 const ghobject_t& oid,
6805 const char *name,
6806 bufferptr& value)
6807{
6808 CollectionHandle c = _get_collection(cid);
6809 if (!c)
6810 return -ENOENT;
6811 return getattr(c, oid, name, value);
6812}
6813
6814int BlueStore::getattr(
6815 CollectionHandle &c_,
6816 const ghobject_t& oid,
6817 const char *name,
6818 bufferptr& value)
6819{
6820 Collection *c = static_cast<Collection *>(c_.get());
6821 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
6822 if (!c->exists)
6823 return -ENOENT;
6824
6825 int r;
6826 {
6827 RWLock::RLocker l(c->lock);
6828 mempool::bluestore_cache_other::string k(name);
6829
6830 OnodeRef o = c->get_onode(oid, false);
6831 if (!o || !o->exists) {
6832 r = -ENOENT;
6833 goto out;
6834 }
6835
6836 if (!o->onode.attrs.count(k)) {
6837 r = -ENODATA;
6838 goto out;
6839 }
6840 value = o->onode.attrs[k];
6841 r = 0;
6842 }
6843 out:
6844 if (r == 0 && _debug_mdata_eio(oid)) {
6845 r = -EIO;
6846 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6847 }
6848 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
6849 << " = " << r << dendl;
6850 return r;
6851}
6852
6853
6854int BlueStore::getattrs(
6855 const coll_t& cid,
6856 const ghobject_t& oid,
6857 map<string,bufferptr>& aset)
6858{
6859 CollectionHandle c = _get_collection(cid);
6860 if (!c)
6861 return -ENOENT;
6862 return getattrs(c, oid, aset);
6863}
6864
6865int BlueStore::getattrs(
6866 CollectionHandle &c_,
6867 const ghobject_t& oid,
6868 map<string,bufferptr>& aset)
6869{
6870 Collection *c = static_cast<Collection *>(c_.get());
6871 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
6872 if (!c->exists)
6873 return -ENOENT;
6874
6875 int r;
6876 {
6877 RWLock::RLocker l(c->lock);
6878
6879 OnodeRef o = c->get_onode(oid, false);
6880 if (!o || !o->exists) {
6881 r = -ENOENT;
6882 goto out;
6883 }
6884 for (auto& i : o->onode.attrs) {
6885 aset.emplace(i.first.c_str(), i.second);
6886 }
6887 r = 0;
6888 }
6889
6890 out:
6891 if (r == 0 && _debug_mdata_eio(oid)) {
6892 r = -EIO;
6893 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6894 }
6895 dout(10) << __func__ << " " << c->cid << " " << oid
6896 << " = " << r << dendl;
6897 return r;
6898}
6899
6900int BlueStore::list_collections(vector<coll_t>& ls)
6901{
6902 RWLock::RLocker l(coll_lock);
6903 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
6904 p != coll_map.end();
6905 ++p)
6906 ls.push_back(p->first);
6907 return 0;
6908}
6909
6910bool BlueStore::collection_exists(const coll_t& c)
6911{
6912 RWLock::RLocker l(coll_lock);
6913 return coll_map.count(c);
6914}
6915
6916int BlueStore::collection_empty(const coll_t& cid, bool *empty)
6917{
6918 dout(15) << __func__ << " " << cid << dendl;
6919 vector<ghobject_t> ls;
6920 ghobject_t next;
6921 int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
6922 &ls, &next);
6923 if (r < 0) {
6924 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
6925 << dendl;
6926 return r;
6927 }
6928 *empty = ls.empty();
6929 dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
6930 return 0;
6931}
6932
6933int BlueStore::collection_bits(const coll_t& cid)
6934{
6935 dout(15) << __func__ << " " << cid << dendl;
6936 CollectionRef c = _get_collection(cid);
6937 if (!c)
6938 return -ENOENT;
6939 RWLock::RLocker l(c->lock);
6940 dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
6941 return c->cnode.bits;
6942}
6943
6944int BlueStore::collection_list(
6945 const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
6946 vector<ghobject_t> *ls, ghobject_t *pnext)
6947{
6948 CollectionHandle c = _get_collection(cid);
6949 if (!c)
6950 return -ENOENT;
6951 return collection_list(c, start, end, max, ls, pnext);
6952}
6953
6954int BlueStore::collection_list(
6955 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
6956 vector<ghobject_t> *ls, ghobject_t *pnext)
6957{
6958 Collection *c = static_cast<Collection *>(c_.get());
6959 dout(15) << __func__ << " " << c->cid
6960 << " start " << start << " end " << end << " max " << max << dendl;
6961 int r;
6962 {
6963 RWLock::RLocker l(c->lock);
6964 r = _collection_list(c, start, end, max, ls, pnext);
6965 }
6966
6967 dout(10) << __func__ << " " << c->cid
6968 << " start " << start << " end " << end << " max " << max
6969 << " = " << r << ", ls.size() = " << ls->size()
6970 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
6971 return r;
6972}
6973
6974int BlueStore::_collection_list(
6975 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
6976 vector<ghobject_t> *ls, ghobject_t *pnext)
6977{
6978
6979 if (!c->exists)
6980 return -ENOENT;
6981
6982 int r = 0;
6983 ghobject_t static_next;
6984 KeyValueDB::Iterator it;
6985 string temp_start_key, temp_end_key;
6986 string start_key, end_key;
6987 bool set_next = false;
6988 string pend;
6989 bool temp;
6990
6991 if (!pnext)
6992 pnext = &static_next;
6993
6994 if (start == ghobject_t::get_max() ||
6995 start.hobj.is_max()) {
6996 goto out;
6997 }
6998 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
6999 &start_key, &end_key);
7000 dout(20) << __func__
7001 << " range " << pretty_binary_string(temp_start_key)
7002 << " to " << pretty_binary_string(temp_end_key)
7003 << " and " << pretty_binary_string(start_key)
7004 << " to " << pretty_binary_string(end_key)
7005 << " start " << start << dendl;
7006 it = db->get_iterator(PREFIX_OBJ);
7007 if (start == ghobject_t() ||
7008 start.hobj == hobject_t() ||
7009 start == c->cid.get_min_hobj()) {
7010 it->upper_bound(temp_start_key);
7011 temp = true;
7012 } else {
7013 string k;
7014 get_object_key(cct, start, &k);
7015 if (start.hobj.is_temp()) {
7016 temp = true;
7017 assert(k >= temp_start_key && k < temp_end_key);
7018 } else {
7019 temp = false;
7020 assert(k >= start_key && k < end_key);
7021 }
7022 dout(20) << " start from " << pretty_binary_string(k)
7023 << " temp=" << (int)temp << dendl;
7024 it->lower_bound(k);
7025 }
7026 if (end.hobj.is_max()) {
7027 pend = temp ? temp_end_key : end_key;
7028 } else {
7029 get_object_key(cct, end, &end_key);
7030 if (end.hobj.is_temp()) {
7031 if (temp)
7032 pend = end_key;
7033 else
7034 goto out;
7035 } else {
7036 pend = temp ? temp_end_key : end_key;
7037 }
7038 }
7039 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7040 while (true) {
7041 if (!it->valid() || it->key() >= pend) {
7042 if (!it->valid())
7043 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
7044 else
7045 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
7046 << " >= " << end << dendl;
7047 if (temp) {
7048 if (end.hobj.is_temp()) {
7049 break;
7050 }
7051 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
7052 temp = false;
7053 it->upper_bound(start_key);
7054 pend = end_key;
7055 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7056 continue;
7057 }
7058 break;
7059 }
7060 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
7061 if (is_extent_shard_key(it->key())) {
7062 it->next();
7063 continue;
7064 }
7065 ghobject_t oid;
7066 int r = get_key_object(it->key(), &oid);
7067 assert(r == 0);
7068 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
7069 if (ls->size() >= (unsigned)max) {
7070 dout(20) << __func__ << " reached max " << max << dendl;
7071 *pnext = oid;
7072 set_next = true;
7073 break;
7074 }
7075 ls->push_back(oid);
7076 it->next();
7077 }
7078out:
7079 if (!set_next) {
7080 *pnext = ghobject_t::get_max();
7081 }
7082
7083 return r;
7084}
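// Key-range walk above: a collection's objects live in two key ranges, the
// temp namespace followed by the normal one. Iteration begins in the range
// that contains `start` (or in the temp range for a minimal start), and when
// the temp range is exhausted and `end` is not a temp object the cursor jumps
// to the normal range via upper_bound(start_key) and continues until `pend`.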
7085
7086int BlueStore::omap_get(
7087 const coll_t& cid, ///< [in] Collection containing oid
7088 const ghobject_t &oid, ///< [in] Object containing omap
7089 bufferlist *header, ///< [out] omap header
7090 map<string, bufferlist> *out ///< [out] Key to value map
7091 )
7092{
7093 CollectionHandle c = _get_collection(cid);
7094 if (!c)
7095 return -ENOENT;
7096 return omap_get(c, oid, header, out);
7097}
7098
7099int BlueStore::omap_get(
7100 CollectionHandle &c_, ///< [in] Collection containing oid
7101 const ghobject_t &oid, ///< [in] Object containing omap
7102 bufferlist *header, ///< [out] omap header
7103 map<string, bufferlist> *out ///< [out] Key to value map
7104 )
7105{
7106 Collection *c = static_cast<Collection *>(c_.get());
7107 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7108 if (!c->exists)
7109 return -ENOENT;
7110 RWLock::RLocker l(c->lock);
7111 int r = 0;
7112 OnodeRef o = c->get_onode(oid, false);
7113 if (!o || !o->exists) {
7114 r = -ENOENT;
7115 goto out;
7116 }
7117 if (!o->onode.has_omap())
7118 goto out;
7119 o->flush();
7120 {
7121 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7122 string head, tail;
7123 get_omap_header(o->onode.nid, &head);
7124 get_omap_tail(o->onode.nid, &tail);
7125 it->lower_bound(head);
7126 while (it->valid()) {
7127 if (it->key() == head) {
7128 dout(30) << __func__ << " got header" << dendl;
7129 *header = it->value();
7130 } else if (it->key() >= tail) {
7131 dout(30) << __func__ << " reached tail" << dendl;
7132 break;
7133 } else {
7134 string user_key;
7135 decode_omap_key(it->key(), &user_key);
7136 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7137 << " -> " << user_key << dendl;
7138 (*out)[user_key] = it->value();
7139 }
7140 it->next();
7141 }
7142 }
7143 out:
7144 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7145 << dendl;
7146 return r;
7147}
7148
7149int BlueStore::omap_get_header(
7150 const coll_t& cid, ///< [in] Collection containing oid
7151 const ghobject_t &oid, ///< [in] Object containing omap
7152 bufferlist *header, ///< [out] omap header
7153 bool allow_eio ///< [in] don't assert on eio
7154 )
7155{
7156 CollectionHandle c = _get_collection(cid);
7157 if (!c)
7158 return -ENOENT;
7159 return omap_get_header(c, oid, header, allow_eio);
7160}
7161
7162int BlueStore::omap_get_header(
7163 CollectionHandle &c_, ///< [in] Collection containing oid
7164 const ghobject_t &oid, ///< [in] Object containing omap
7165 bufferlist *header, ///< [out] omap header
7166 bool allow_eio ///< [in] don't assert on eio
7167 )
7168{
7169 Collection *c = static_cast<Collection *>(c_.get());
7170 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7171 if (!c->exists)
7172 return -ENOENT;
7173 RWLock::RLocker l(c->lock);
7174 int r = 0;
7175 OnodeRef o = c->get_onode(oid, false);
7176 if (!o || !o->exists) {
7177 r = -ENOENT;
7178 goto out;
7179 }
7180 if (!o->onode.has_omap())
7181 goto out;
7182 o->flush();
7183 {
7184 string head;
7185 get_omap_header(o->onode.nid, &head);
7186 if (db->get(PREFIX_OMAP, head, header) >= 0) {
7187 dout(30) << __func__ << " got header" << dendl;
7188 } else {
7189 dout(30) << __func__ << " no header" << dendl;
7190 }
7191 }
7192 out:
7193 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7194 << dendl;
7195 return r;
7196}
7197
7198int BlueStore::omap_get_keys(
7199 const coll_t& cid, ///< [in] Collection containing oid
7200 const ghobject_t &oid, ///< [in] Object containing omap
7201 set<string> *keys ///< [out] Keys defined on oid
7202 )
7203{
7204 CollectionHandle c = _get_collection(cid);
7205 if (!c)
7206 return -ENOENT;
7207 return omap_get_keys(c, oid, keys);
7208}
7209
7210int BlueStore::omap_get_keys(
7211 CollectionHandle &c_, ///< [in] Collection containing oid
7212 const ghobject_t &oid, ///< [in] Object containing omap
7213 set<string> *keys ///< [out] Keys defined on oid
7214 )
7215{
7216 Collection *c = static_cast<Collection *>(c_.get());
7217 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7218 if (!c->exists)
7219 return -ENOENT;
7220 RWLock::RLocker l(c->lock);
7221 int r = 0;
7222 OnodeRef o = c->get_onode(oid, false);
7223 if (!o || !o->exists) {
7224 r = -ENOENT;
7225 goto out;
7226 }
7227 if (!o->onode.has_omap())
7228 goto out;
7229 o->flush();
7230 {
7231 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7232 string head, tail;
7233 get_omap_key(o->onode.nid, string(), &head);
7234 get_omap_tail(o->onode.nid, &tail);
7235 it->lower_bound(head);
7236 while (it->valid()) {
7237 if (it->key() >= tail) {
7238 dout(30) << __func__ << " reached tail" << dendl;
7239 break;
7240 }
7241 string user_key;
7242 decode_omap_key(it->key(), &user_key);
7243 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7244 << " -> " << user_key << dendl;
7245 keys->insert(user_key);
7246 it->next();
7247 }
7248 }
7249 out:
7250 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7251 << dendl;
7252 return r;
7253}
7254
7255int BlueStore::omap_get_values(
7256 const coll_t& cid, ///< [in] Collection containing oid
7257 const ghobject_t &oid, ///< [in] Object containing omap
7258 const set<string> &keys, ///< [in] Keys to get
7259 map<string, bufferlist> *out ///< [out] Returned keys and values
7260 )
7261{
7262 CollectionHandle c = _get_collection(cid);
7263 if (!c)
7264 return -ENOENT;
7265 return omap_get_values(c, oid, keys, out);
7266}
7267
7268int BlueStore::omap_get_values(
7269 CollectionHandle &c_, ///< [in] Collection containing oid
7270 const ghobject_t &oid, ///< [in] Object containing omap
7271 const set<string> &keys, ///< [in] Keys to get
7272 map<string, bufferlist> *out ///< [out] Returned keys and values
7273 )
7274{
7275 Collection *c = static_cast<Collection *>(c_.get());
7276 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7277 if (!c->exists)
7278 return -ENOENT;
7279 RWLock::RLocker l(c->lock);
7280 int r = 0;
7281 string final_key;
7282 OnodeRef o = c->get_onode(oid, false);
7283 if (!o || !o->exists) {
7284 r = -ENOENT;
7285 goto out;
7286 }
7287 if (!o->onode.has_omap())
7288 goto out;
7289 o->flush();
7290 _key_encode_u64(o->onode.nid, &final_key);
7291 final_key.push_back('.');
7292 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7293 final_key.resize(9); // keep prefix
7294 final_key += *p;
7295 bufferlist val;
7296 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7297 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
7298 << " -> " << *p << dendl;
7299 out->insert(make_pair(*p, val));
7300 }
7301 }
7302 out:
7303 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7304 << dendl;
7305 return r;
7306}
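// Omap key layout assumed by the lookups above: an 8-byte, order-preserving
// encoding of the onode's nid plus a '.' separator (the 9-byte prefix that
// final_key.resize(9) keeps), followed by the user key, so all omap rows of
// one onode sort contiguously between its header and tail keys.
//
//   string k;
//   _key_encode_u64(o->onode.nid, &k);  // 8 bytes, sorts numerically
//   k.push_back('.');                   // 9-byte prefix
//   k += user_key;                      // user_key is a placeholder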
7307
7308int BlueStore::omap_check_keys(
7309 const coll_t& cid, ///< [in] Collection containing oid
7310 const ghobject_t &oid, ///< [in] Object containing omap
7311 const set<string> &keys, ///< [in] Keys to check
7312 set<string> *out ///< [out] Subset of keys defined on oid
7313 )
7314{
7315 CollectionHandle c = _get_collection(cid);
7316 if (!c)
7317 return -ENOENT;
7318 return omap_check_keys(c, oid, keys, out);
7319}
7320
7321int BlueStore::omap_check_keys(
7322 CollectionHandle &c_, ///< [in] Collection containing oid
7323 const ghobject_t &oid, ///< [in] Object containing omap
7324 const set<string> &keys, ///< [in] Keys to check
7325 set<string> *out ///< [out] Subset of keys defined on oid
7326 )
7327{
7328 Collection *c = static_cast<Collection *>(c_.get());
7329 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7330 if (!c->exists)
7331 return -ENOENT;
7332 RWLock::RLocker l(c->lock);
7333 int r = 0;
7334 string final_key;
7335 OnodeRef o = c->get_onode(oid, false);
7336 if (!o || !o->exists) {
7337 r = -ENOENT;
7338 goto out;
7339 }
7340 if (!o->onode.has_omap())
7341 goto out;
7342 o->flush();
7343 _key_encode_u64(o->onode.nid, &final_key);
7344 final_key.push_back('.');
7345 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7346 final_key.resize(9); // keep prefix
7347 final_key += *p;
7348 bufferlist val;
7349 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7350 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
7351 << " -> " << *p << dendl;
7352 out->insert(*p);
7353 } else {
7354 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
7355 << " -> " << *p << dendl;
7356 }
7357 }
7358 out:
7359 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7360 << dendl;
7361 return r;
7362}
7363
7364ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7365 const coll_t& cid, ///< [in] collection
7366 const ghobject_t &oid ///< [in] object
7367 )
7368{
7369 CollectionHandle c = _get_collection(cid);
7370 if (!c) {
7371 dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
7372 return ObjectMap::ObjectMapIterator();
7373 }
7374 return get_omap_iterator(c, oid);
7375}
7376
7377ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7378 CollectionHandle &c_, ///< [in] collection
7379 const ghobject_t &oid ///< [in] object
7380 )
7381{
7382 Collection *c = static_cast<Collection *>(c_.get());
7383 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
7384 if (!c->exists) {
7385 return ObjectMap::ObjectMapIterator();
7386 }
7387 RWLock::RLocker l(c->lock);
7388 OnodeRef o = c->get_onode(oid, false);
7389 if (!o || !o->exists) {
7390 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
7391 return ObjectMap::ObjectMapIterator();
7392 }
7393 o->flush();
7394 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
7395 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7396 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
7397}
7398
7399// -----------------
7400// write helpers
7401
7402void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
7403{
7404 dout(10) << __func__ << " ondisk_format " << ondisk_format
7405 << " min_compat_ondisk_format " << min_compat_ondisk_format
7406 << dendl;
7407 assert(ondisk_format == latest_ondisk_format);
7408 {
7409 bufferlist bl;
7410 ::encode(ondisk_format, bl);
7411 t->set(PREFIX_SUPER, "ondisk_format", bl);
7412 }
7413 {
7414 bufferlist bl;
7415 ::encode(min_compat_ondisk_format, bl);
7416 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
7417 }
7418}
7419
7420int BlueStore::_open_super_meta()
7421{
7422 // nid
7423 {
7424 nid_max = 0;
7425 bufferlist bl;
7426 db->get(PREFIX_SUPER, "nid_max", &bl);
7427 bufferlist::iterator p = bl.begin();
7428 try {
7429 uint64_t v;
7430 ::decode(v, p);
7431 nid_max = v;
7432 } catch (buffer::error& e) {
7433 derr << __func__ << " unable to read nid_max" << dendl;
7434 return -EIO;
7435 }
7436 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
7437 nid_last = nid_max.load();
7438 }
7439
7440 // blobid
7441 {
7442 blobid_max = 0;
7443 bufferlist bl;
7444 db->get(PREFIX_SUPER, "blobid_max", &bl);
7445 bufferlist::iterator p = bl.begin();
7446 try {
7447 uint64_t v;
7448 ::decode(v, p);
7449 blobid_max = v;
7450 } catch (buffer::error& e) {
7451 derr << __func__ << " unable to read blobid_max" << dendl;
7452 return -EIO;
7453 }
7454 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
7455 blobid_last = blobid_max.load();
7456 }
7457
7458 // freelist
7459 {
7460 bufferlist bl;
7461 db->get(PREFIX_SUPER, "freelist_type", &bl);
7462 if (bl.length()) {
7463 freelist_type = std::string(bl.c_str(), bl.length());
7464 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
7465 } else {
7466 assert("unsupported extent freelist manager" == 0);
7467 }
7468 }
7469
7470 // bluefs alloc
7471 if (cct->_conf->bluestore_bluefs) {
7472 bluefs_extents.clear();
7473 bufferlist bl;
7474 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7475 bufferlist::iterator p = bl.begin();
7476 try {
7477 ::decode(bluefs_extents, p);
7478 }
7479 catch (buffer::error& e) {
7480 derr << __func__ << " unable to read bluefs_extents" << dendl;
7481 return -EIO;
7482 }
7483 dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
7484 << std::dec << dendl;
7485 }
7486
7487 // ondisk format
7488 int32_t compat_ondisk_format = 0;
7489 {
7490 bufferlist bl;
7491 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
7492 if (r < 0) {
7493 // base case: kraken bluestore is v1 and readable by v1
7494 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
7495 << dendl;
7496 ondisk_format = 1;
7497 compat_ondisk_format = 1;
7498 } else {
7499 auto p = bl.begin();
7500 try {
7501 ::decode(ondisk_format, p);
7502 } catch (buffer::error& e) {
7503 derr << __func__ << " unable to read ondisk_format" << dendl;
7504 return -EIO;
7505 }
7506 bl.clear();
7507 {
7508 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
7509 assert(!r);
7510 auto p = bl.begin();
7511 try {
7512 ::decode(compat_ondisk_format, p);
7513 } catch (buffer::error& e) {
7514 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
7515 return -EIO;
7516 }
7517 }
7518 }
7519 dout(10) << __func__ << " ondisk_format " << ondisk_format
7520 << " compat_ondisk_format " << compat_ondisk_format
7521 << dendl;
7522 }
7523
7524 if (latest_ondisk_format < compat_ondisk_format) {
7525 derr << __func__ << " compat_ondisk_format is "
7526 << compat_ondisk_format << " but we only understand version "
7527 << latest_ondisk_format << dendl;
7528 return -EPERM;
7529 }
7530 if (ondisk_format < latest_ondisk_format) {
7531 int r = _upgrade_super();
7532 if (r < 0) {
7533 return r;
7534 }
7535 }
7536
7537 {
7538 bufferlist bl;
7539 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
7540 auto p = bl.begin();
7541 try {
7542 uint64_t val;
7543 ::decode(val, p);
7544 min_alloc_size = val;
224ce89b
WB
7545 min_alloc_size_order = ctz(val);
7546 assert(min_alloc_size == 1u << min_alloc_size_order);
7c673cae
FG
7547 } catch (buffer::error& e) {
7548 derr << __func__ << " unable to read min_alloc_size" << dendl;
7549 return -EIO;
7550 }
7551 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7552 << std::dec << dendl;
7553 }
224ce89b 7554 _open_statfs();
7c673cae
FG
7555 _set_alloc_sizes();
7556 _set_throttle_params();
7557
7558 _set_csum();
7559 _set_compression();
7560 _set_blob_size();
7561
7562 return 0;
7563}
7564
7565int BlueStore::_upgrade_super()
7566{
7567 dout(1) << __func__ << " from " << ondisk_format << ", latest "
7568 << latest_ondisk_format << dendl;
7569 assert(ondisk_format > 0);
7570 assert(ondisk_format < latest_ondisk_format);
7571
7572 if (ondisk_format == 1) {
7573 // changes:
7574 // - super: added ondisk_format
7575 // - super: added min_readable_ondisk_format
7576 // - super: added min_compat_ondisk_format
7577 // - super: added min_alloc_size
7578 // - super: removed min_min_alloc_size
7579 KeyValueDB::Transaction t = db->get_transaction();
7580 {
7581 bufferlist bl;
7582 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
7583 auto p = bl.begin();
7584 try {
7585 uint64_t val;
7586 ::decode(val, p);
7587 min_alloc_size = val;
7588 } catch (buffer::error& e) {
7589 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
7590 return -EIO;
7591 }
7592 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7593 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7594 }
7595 ondisk_format = 2;
7596 _prepare_ondisk_format_super(t);
7597 int r = db->submit_transaction_sync(t);
7598 assert(r == 0);
7599 }
7600
7601 // done
7602 dout(1) << __func__ << " done" << dendl;
7603 return 0;
7604}
7605
7606void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
7607{
224ce89b
WB
7608 if (o->onode.nid) {
7609 assert(o->exists);
7c673cae 7610 return;
224ce89b 7611 }
7c673cae
FG
7612 uint64_t nid = ++nid_last;
7613 dout(20) << __func__ << " " << nid << dendl;
7614 o->onode.nid = nid;
7615 txc->last_nid = nid;
224ce89b 7616 o->exists = true;
7c673cae
FG
7617}
7618
7619uint64_t BlueStore::_assign_blobid(TransContext *txc)
7620{
7621 uint64_t bid = ++blobid_last;
7622 dout(20) << __func__ << " " << bid << dendl;
7623 txc->last_blobid = bid;
7624 return bid;
7625}
7626
7627void BlueStore::get_db_statistics(Formatter *f)
7628{
7629 db->get_statistics(f);
7630}
7631
7632BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
7633{
7634 TransContext *txc = new TransContext(cct, osr);
7635 txc->t = db->get_transaction();
7636 osr->queue_new(txc);
7637 dout(20) << __func__ << " osr " << osr << " = " << txc
7638 << " seq " << txc->seq << dendl;
7639 return txc;
7640}
7641
7642void BlueStore::_txc_calc_cost(TransContext *txc)
7643{
7644 // this is about the simplest model for transaction cost you can
7645 // imagine: every transaction is charged a minimum of one "io" as fixed
7646 // overhead, each io has a configurable cost (with different hdd and
7647 // ssd defaults), and the total io cost is added to the transaction's
7648 // byte count.
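 // Illustrative example (hypothetical numbers, not actual config defaults):
 // a txc with two pending aios of 3 and 2 iovecs and 8192 dirty bytes, at a
 // cost-per-io of 4000, costs (1 + 3 + 2) * 4000 + 8192 = 32192.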
7649 int ios = 1; // one "io" for the kv commit
7650 for (auto& p : txc->ioc.pending_aios) {
7651 ios += p.iov.size();
7652 }
7653 auto cost = throttle_cost_per_io.load();
7654 txc->cost = ios * cost + txc->bytes;
7655 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
7656 << ios << " ios * " << cost << " + " << txc->bytes
7657 << " bytes)" << dendl;
7658}
7659
7660void BlueStore::_txc_update_store_statfs(TransContext *txc)
7661{
7662 if (txc->statfs_delta.is_empty())
7663 return;
7664
7665 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
7666 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
7667 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
7668 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
7669 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
7670
31f18b77
FG
7671 {
7672 std::lock_guard<std::mutex> l(vstatfs_lock);
7673 vstatfs += txc->statfs_delta;
7674 }
7675
7c673cae
FG
7676 bufferlist bl;
7677 txc->statfs_delta.encode(bl);
7678
7679 txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
7680 txc->statfs_delta.reset();
7681}
7682
7683void BlueStore::_txc_state_proc(TransContext *txc)
7684{
7685 while (true) {
7686 dout(10) << __func__ << " txc " << txc
7687 << " " << txc->get_state_name() << dendl;
7688 switch (txc->state) {
7689 case TransContext::STATE_PREPARE:
7690 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
7691 if (txc->ioc.has_pending_aios()) {
7692 txc->state = TransContext::STATE_AIO_WAIT;
7693 txc->had_ios = true;
7694 _txc_aio_submit(txc);
7695 return;
7696 }
7697 // ** fall-thru **
7698
7699 case TransContext::STATE_AIO_WAIT:
7700 txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
7701 _txc_finish_io(txc); // may trigger blocked txc's too
7702 return;
7703
7704 case TransContext::STATE_IO_DONE:
7705 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
7706 if (txc->had_ios) {
7707 ++txc->osr->txc_with_unstable_io;
7708 }
7709 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
7710 txc->state = TransContext::STATE_KV_QUEUED;
7711 if (cct->_conf->bluestore_sync_submit_transaction) {
7712 if (txc->last_nid >= nid_max ||
7713 txc->last_blobid >= blobid_max) {
7714 dout(20) << __func__
7715 << " last_{nid,blobid} exceeds max, submit via kv thread"
7716 << dendl;
7717 } else if (txc->osr->kv_committing_serially) {
7718 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
7719 << dendl;
7720 // note: this is starvation-prone. once we have a txc in a busy
7721 // sequencer that is committing serially it is possible to keep
7722 // submitting new transactions fast enough that we get stuck doing
7723 // so. the alternative is to block here... fixme?
7724 } else if (txc->osr->txc_with_unstable_io) {
7725 dout(20) << __func__ << " prior txc(s) with unstable ios "
7726 << txc->osr->txc_with_unstable_io.load() << dendl;
7727 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
7728 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
7729 == 0) {
7730 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
7731 << dendl;
7732 } else {
7733 txc->state = TransContext::STATE_KV_SUBMITTED;
31f18b77 7734 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
7c673cae
FG
7735 assert(r == 0);
7736 _txc_applied_kv(txc);
7737 }
7738 }
7739 {
7740 std::lock_guard<std::mutex> l(kv_lock);
7741 kv_queue.push_back(txc);
7742 kv_cond.notify_one();
7743 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
7744 kv_queue_unsubmitted.push_back(txc);
7745 ++txc->osr->kv_committing_serially;
7746 }
31f18b77
FG
7747 if (txc->had_ios)
7748 kv_ios++;
7749 kv_throttle_costs += txc->cost;
7c673cae
FG
7750 }
7751 return;
7752 case TransContext::STATE_KV_SUBMITTED:
7753 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
7754 txc->state = TransContext::STATE_KV_DONE;
7755 _txc_committed_kv(txc);
7756 // ** fall-thru **
7757
7758 case TransContext::STATE_KV_DONE:
7759 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
7760 if (txc->deferred_txn) {
7761 txc->state = TransContext::STATE_DEFERRED_QUEUED;
7762 _deferred_queue(txc);
7763 return;
7764 }
7765 txc->state = TransContext::STATE_FINISHING;
7766 break;
7767
7768 case TransContext::STATE_DEFERRED_CLEANUP:
7769 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
7770 txc->state = TransContext::STATE_FINISHING;
7771 // ** fall-thru **
7772
7773 case TransContext::STATE_FINISHING:
7774 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
7775 _txc_finish(txc);
7776 return;
7777
7778 default:
7779 derr << __func__ << " unexpected txc " << txc
7780 << " state " << txc->get_state_name() << dendl;
7781 assert(0 == "unexpected txc state");
7782 return;
7783 }
7784 }
7785}
7786
7787void BlueStore::_txc_finish_io(TransContext *txc)
7788{
7789 dout(20) << __func__ << " " << txc << dendl;
7790
7791 /*
7792 * we need to preserve the order of kv transactions,
7793 * even though aio will complete in any order.
7794 */
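 // (added note) the logic below: mark this txc IO_DONE, then scan backwards
 // through the sequencer queue; if any older txc has not yet reached
 // IO_DONE, stop and let it advance us later; otherwise advance every
 // consecutive IO_DONE txc starting from the oldest one found.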
7795
7796 OpSequencer *osr = txc->osr.get();
7797 std::lock_guard<std::mutex> l(osr->qlock);
7798 txc->state = TransContext::STATE_IO_DONE;
7799
31f18b77
FG
7800 // release aio contexts (including pinned buffers).
7801 txc->ioc.running_aios.clear();
7802
7c673cae
FG
7803 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
7804 while (p != osr->q.begin()) {
7805 --p;
7806 if (p->state < TransContext::STATE_IO_DONE) {
7807 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
7808 << p->get_state_name() << dendl;
7809 return;
7810 }
7811 if (p->state > TransContext::STATE_IO_DONE) {
7812 ++p;
7813 break;
7814 }
7815 }
7816 do {
7817 _txc_state_proc(&*p++);
7818 } while (p != osr->q.end() &&
7819 p->state == TransContext::STATE_IO_DONE);
7820
7821 if (osr->kv_submitted_waiters &&
7822 osr->_is_all_kv_submitted()) {
7823 osr->qcond.notify_all();
7824 }
7825}
7826
7827void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
7828{
7829 dout(20) << __func__ << " txc " << txc
7830 << " onodes " << txc->onodes
7831 << " shared_blobs " << txc->shared_blobs
7832 << dendl;
7833
7834 // finalize onodes
7835 for (auto o : txc->onodes) {
7836 // finalize extent_map shards
7837 o->extent_map.update(t, false);
7838 if (o->extent_map.needs_reshard()) {
7839 o->extent_map.reshard(db, t);
7840 o->extent_map.update(t, true);
7841 if (o->extent_map.needs_reshard()) {
7842 dout(20) << __func__ << " warning: still wants reshard, check options?"
7843 << dendl;
7844 o->extent_map.clear_needs_reshard();
7845 }
7846 logger->inc(l_bluestore_onode_reshard);
7847 }
7848
7849 // bound encode
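 // (added note) two-pass encode: first compute an upper bound on the
 // encoded size, then encode for real into a single contiguous appender so
 // the whole onode value is built with one buffer allocation.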
7850 size_t bound = 0;
7851 denc(o->onode, bound);
7852 o->extent_map.bound_encode_spanning_blobs(bound);
7853 if (o->onode.extent_map_shards.empty()) {
7854 denc(o->extent_map.inline_bl, bound);
7855 }
7856
7857 // encode
7858 bufferlist bl;
7859 unsigned onode_part, blob_part, extent_part;
7860 {
7861 auto p = bl.get_contiguous_appender(bound, true);
7862 denc(o->onode, p);
7863 onode_part = p.get_logical_offset();
7864 o->extent_map.encode_spanning_blobs(p);
7865 blob_part = p.get_logical_offset() - onode_part;
7866 if (o->onode.extent_map_shards.empty()) {
7867 denc(o->extent_map.inline_bl, p);
7868 }
7869 extent_part = p.get_logical_offset() - onode_part - blob_part;
7870 }
7871
7872 dout(20) << " onode " << o->oid << " is " << bl.length()
7873 << " (" << onode_part << " bytes onode + "
7874 << blob_part << " bytes spanning blobs + "
7875 << extent_part << " bytes inline extents)"
7876 << dendl;
7877 t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
7878 o->flushing_count++;
7879 }
7880
7881 // objects we modified but didn't affect the onode
7882 auto p = txc->modified_objects.begin();
7883 while (p != txc->modified_objects.end()) {
7884 if (txc->onodes.count(*p) == 0) {
7885 (*p)->flushing_count++;
7886 ++p;
7887 } else {
7888 // remove dups with onodes list to avoid problems in _txc_finish
7889 p = txc->modified_objects.erase(p);
7890 }
7891 }
7892
7893 // finalize shared_blobs
7894 for (auto sb : txc->shared_blobs) {
7895 string key;
7896 auto sbid = sb->get_sbid();
7897 get_shared_blob_key(sbid, &key);
7898 if (sb->persistent->empty()) {
7899 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
7900 << " is empty" << dendl;
7901 t->rmkey(PREFIX_SHARED_BLOB, key);
7902 } else {
7903 bufferlist bl;
7904 ::encode(*(sb->persistent), bl);
7905 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
31f18b77 7906 << " is " << bl.length() << " " << *sb << dendl;
7c673cae
FG
7907 t->set(PREFIX_SHARED_BLOB, key, bl);
7908 }
7909 }
7910}
7911
7912void BlueStore::BSPerfTracker::update_from_perfcounters(
7913 PerfCounters &logger)
7914{
7915 os_commit_latency.consume_next(
7916 logger.get_tavg_ms(
7917 l_bluestore_commit_lat));
7918 os_apply_latency.consume_next(
7919 logger.get_tavg_ms(
7920 l_bluestore_commit_lat));
7921}
7922
7923void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
7924{
7925 dout(20) << __func__ << " txc " << txc << std::hex
7926 << " allocated 0x" << txc->allocated
7927 << " released 0x" << txc->released
7928 << std::dec << dendl;
7929
7930 // We have to handle the case where we allocate *and* deallocate the
7931 // same region in this transaction. The freelist doesn't like that.
7932 // (Actually, the only thing that cares is the BitmapFreelistManager
7933 // debug check. But that's important.)
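 // Hypothetical example: if this txc allocated 0x10000~0x8000 and also
 // released 0x14000~0x8000, the overlapping 0x14000~0x4000 is subtracted
 // from both sets so the freelist only sees an allocate of 0x10000~0x4000
 // and a release of 0x18000~0x4000.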
7934 interval_set<uint64_t> tmp_allocated, tmp_released;
7935 interval_set<uint64_t> *pallocated = &txc->allocated;
7936 interval_set<uint64_t> *preleased = &txc->released;
7937 if (!txc->allocated.empty() && !txc->released.empty()) {
7938 interval_set<uint64_t> overlap;
7939 overlap.intersection_of(txc->allocated, txc->released);
7940 if (!overlap.empty()) {
7941 tmp_allocated = txc->allocated;
7942 tmp_allocated.subtract(overlap);
7943 tmp_released = txc->released;
7944 tmp_released.subtract(overlap);
7945 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
7946 << ", new allocated 0x" << tmp_allocated
7947 << " released 0x" << tmp_released << std::dec
7948 << dendl;
7949 pallocated = &tmp_allocated;
7950 preleased = &tmp_released;
7951 }
7952 }
7953
7954 // update freelist with non-overlap sets
7955 for (interval_set<uint64_t>::iterator p = pallocated->begin();
7956 p != pallocated->end();
7957 ++p) {
7958 fm->allocate(p.get_start(), p.get_len(), t);
7959 }
7960 for (interval_set<uint64_t>::iterator p = preleased->begin();
7961 p != preleased->end();
7962 ++p) {
7963 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
7964 << "~" << p.get_len() << std::dec << dendl;
7965 fm->release(p.get_start(), p.get_len(), t);
7966 }
7967
7968 _txc_update_store_statfs(txc);
7969}
7970
7971void BlueStore::_txc_applied_kv(TransContext *txc)
7972{
7973 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
7974 for (auto& o : *ls) {
7975 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
7976 << dendl;
7977 if (--o->flushing_count == 0) {
7978 std::lock_guard<std::mutex> l(o->flush_lock);
7979 o->flush_cond.notify_all();
7980 }
7981 }
7982 }
7983}
7984
7985void BlueStore::_txc_committed_kv(TransContext *txc)
7986{
7987 dout(20) << __func__ << " txc " << txc << dendl;
7988
7989 // warning: we're calling onreadable_sync inside the sequencer lock
7990 if (txc->onreadable_sync) {
7991 txc->onreadable_sync->complete(0);
7992 txc->onreadable_sync = NULL;
7993 }
7994 unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
7995 if (txc->oncommit) {
7996 logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
7997 finishers[n]->queue(txc->oncommit);
7998 txc->oncommit = NULL;
7999 }
8000 if (txc->onreadable) {
8001 finishers[n]->queue(txc->onreadable);
8002 txc->onreadable = NULL;
8003 }
8004
8005 if (!txc->oncommits.empty()) {
8006 finishers[n]->queue(txc->oncommits);
8007 }
8008}
8009
8010void BlueStore::_txc_finish(TransContext *txc)
8011{
8012 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
8013 assert(txc->state == TransContext::STATE_FINISHING);
8014
8015 for (auto& sb : txc->shared_blobs_written) {
8016 sb->bc.finish_write(sb->get_cache(), txc->seq);
8017 }
8018 txc->shared_blobs_written.clear();
8019
8020 while (!txc->removed_collections.empty()) {
8021 _queue_reap_collection(txc->removed_collections.front());
8022 txc->removed_collections.pop_front();
8023 }
8024
8025 OpSequencerRef osr = txc->osr;
7c673cae 8026 bool empty = false;
31f18b77 8027 bool submit_deferred = false;
7c673cae
FG
8028 OpSequencer::q_list_t releasing_txc;
8029 {
8030 std::lock_guard<std::mutex> l(osr->qlock);
8031 txc->state = TransContext::STATE_DONE;
8032 bool notify = false;
8033 while (!osr->q.empty()) {
8034 TransContext *txc = &osr->q.front();
8035 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
8036 << dendl;
8037 if (txc->state != TransContext::STATE_DONE) {
8038 if (txc->state == TransContext::STATE_PREPARE &&
8039 deferred_aggressive) {
8040 // for _osr_drain_preceding()
8041 notify = true;
8042 }
31f18b77
FG
8043 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
8044 osr->q.size() > g_conf->bluestore_max_deferred_txc) {
8045 submit_deferred = true;
8046 }
7c673cae
FG
8047 break;
8048 }
8049
7c673cae
FG
8050 osr->q.pop_front();
8051 releasing_txc.push_back(*txc);
8052 notify = true;
8053 }
8054 if (notify) {
8055 osr->qcond.notify_all();
8056 }
8057 if (osr->q.empty()) {
8058 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
8059 empty = true;
8060 }
8061 }
8062 while (!releasing_txc.empty()) {
8063 // release to allocator only after all preceding txc's have also
8064 // finished any deferred writes that potentially land in these
8065 // blocks
8066 auto txc = &releasing_txc.front();
8067 _txc_release_alloc(txc);
8068 releasing_txc.pop_front();
8069 txc->log_state_latency(logger, l_bluestore_state_done_lat);
8070 delete txc;
8071 }
8072
31f18b77
FG
8073 if (submit_deferred) {
8074 // we're pinning memory; flush! we could be more fine-grained here but
8075 // i'm not sure it's worth the bother.
8076 deferred_try_submit();
7c673cae
FG
8077 }
8078
7c673cae
FG
8079 if (empty && osr->zombie) {
8080 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
8081 osr->_unregister();
8082 }
8083}
8084
8085void BlueStore::_txc_release_alloc(TransContext *txc)
8086{
8087 // update allocator with full released set
8088 if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
8089 dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
8090 for (interval_set<uint64_t>::iterator p = txc->released.begin();
8091 p != txc->released.end();
8092 ++p) {
8093 alloc->release(p.get_start(), p.get_len());
8094 }
8095 }
8096
8097 txc->allocated.clear();
8098 txc->released.clear();
8099}
8100
8101void BlueStore::_osr_drain_preceding(TransContext *txc)
8102{
8103 OpSequencer *osr = txc->osr.get();
8104 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
8105 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
8106 {
8107 // submit anything pending
224ce89b 8108 deferred_lock.lock();
7c673cae 8109 if (osr->deferred_pending) {
224ce89b
WB
8110 _deferred_submit_unlock(osr);
8111 } else {
8112 deferred_lock.unlock();
7c673cae
FG
8113 }
8114 }
8115 {
8116 // wake up any previously finished deferred events
8117 std::lock_guard<std::mutex> l(kv_lock);
8118 kv_cond.notify_one();
8119 }
8120 osr->drain_preceding(txc);
8121 --deferred_aggressive;
8122 dout(10) << __func__ << " " << osr << " done" << dendl;
8123}
8124
8125void BlueStore::_osr_drain_all()
8126{
8127 dout(10) << __func__ << dendl;
8128
8129 set<OpSequencerRef> s;
8130 {
8131 std::lock_guard<std::mutex> l(osr_lock);
8132 s = osr_set;
8133 }
8134 dout(20) << __func__ << " osr_set " << s << dendl;
8135
8136 ++deferred_aggressive;
8137 {
8138 // submit anything pending
224ce89b 8139 deferred_try_submit();
7c673cae
FG
8140 }
8141 {
8142 // wake up any previously finished deferred events
8143 std::lock_guard<std::mutex> l(kv_lock);
8144 kv_cond.notify_one();
8145 }
31f18b77
FG
8146 {
8147 std::lock_guard<std::mutex> l(kv_finalize_lock);
8148 kv_finalize_cond.notify_one();
8149 }
7c673cae
FG
8150 for (auto osr : s) {
8151 dout(20) << __func__ << " drain " << osr << dendl;
8152 osr->drain();
8153 }
8154 --deferred_aggressive;
8155
8156 dout(10) << __func__ << " done" << dendl;
8157}
8158
8159void BlueStore::_osr_unregister_all()
8160{
8161 set<OpSequencerRef> s;
8162 {
8163 std::lock_guard<std::mutex> l(osr_lock);
8164 s = osr_set;
8165 }
8166 dout(10) << __func__ << " " << s << dendl;
8167 for (auto osr : s) {
8168 osr->_unregister();
8169
8170 if (!osr->zombie) {
8171 // break link from Sequencer to us so that this OpSequencer
8172 // instance can die with this mount/umount cycle. note that
8173 // we assume umount() will not race against ~Sequencer.
8174 assert(osr->parent);
8175 osr->parent->p.reset();
8176 }
8177 }
8178 // nobody should be creating sequencers during umount either.
8179 {
8180 std::lock_guard<std::mutex> l(osr_lock);
8181 assert(osr_set.empty());
8182 }
8183}
8184
31f18b77
FG
8185void BlueStore::_kv_start()
8186{
8187 dout(10) << __func__ << dendl;
8188
8189 if (cct->_conf->bluestore_shard_finishers) {
8190 if (cct->_conf->osd_op_num_shards) {
8191 m_finisher_num = cct->_conf->osd_op_num_shards;
8192 } else {
8193 assert(bdev);
8194 if (bdev->is_rotational()) {
8195 m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
8196 } else {
8197 m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
8198 }
8199 }
8200 }
8201
8202 assert(m_finisher_num != 0);
8203
8204 for (int i = 0; i < m_finisher_num; ++i) {
8205 ostringstream oss;
8206 oss << "finisher-" << i;
8207 Finisher *f = new Finisher(cct, oss.str(), "finisher");
8208 finishers.push_back(f);
8209 }
8210
8211 for (auto f : finishers) {
8212 f->start();
8213 }
8214 kv_sync_thread.create("bstore_kv_sync");
8215 kv_finalize_thread.create("bstore_kv_final");
8216}
8217
8218void BlueStore::_kv_stop()
8219{
8220 dout(10) << __func__ << dendl;
8221 {
8222 std::unique_lock<std::mutex> l(kv_lock);
8223 while (!kv_sync_started) {
8224 kv_cond.wait(l);
8225 }
8226 kv_stop = true;
8227 kv_cond.notify_all();
8228 }
8229 {
8230 std::unique_lock<std::mutex> l(kv_finalize_lock);
8231 while (!kv_finalize_started) {
8232 kv_finalize_cond.wait(l);
8233 }
8234 kv_finalize_stop = true;
8235 kv_finalize_cond.notify_all();
8236 }
8237 kv_sync_thread.join();
8238 kv_finalize_thread.join();
8239 {
8240 std::lock_guard<std::mutex> l(kv_lock);
8241 kv_stop = false;
8242 }
8243 {
8244 std::lock_guard<std::mutex> l(kv_finalize_lock);
8245 kv_finalize_stop = false;
8246 }
8247 dout(10) << __func__ << " stopping finishers" << dendl;
8248 for (auto f : finishers) {
8249 f->wait_for_empty();
8250 f->stop();
8251 }
8252 dout(10) << __func__ << " stopped" << dendl;
8253}
8254
7c673cae
FG
8255void BlueStore::_kv_sync_thread()
8256{
8257 dout(10) << __func__ << " start" << dendl;
8258 std::unique_lock<std::mutex> l(kv_lock);
31f18b77
FG
8259 assert(!kv_sync_started);
8260 kv_sync_started = true;
8261 kv_cond.notify_all();
7c673cae
FG
8262 while (true) {
8263 assert(kv_committing.empty());
8264 if (kv_queue.empty() &&
8265 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
8266 !deferred_aggressive)) {
8267 if (kv_stop)
8268 break;
8269 dout(20) << __func__ << " sleep" << dendl;
8270 kv_cond.wait(l);
8271 dout(20) << __func__ << " wake" << dendl;
8272 } else {
8273 deque<TransContext*> kv_submitting;
8274 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
8275 uint64_t aios = 0, costs = 0;
8276
7c673cae
FG
8277 dout(20) << __func__ << " committing " << kv_queue.size()
8278 << " submitting " << kv_queue_unsubmitted.size()
8279 << " deferred done " << deferred_done_queue.size()
8280 << " stable " << deferred_stable_queue.size()
8281 << dendl;
8282 kv_committing.swap(kv_queue);
8283 kv_submitting.swap(kv_queue_unsubmitted);
8284 deferred_done.swap(deferred_done_queue);
8285 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
8286 aios = kv_ios;
8287 costs = kv_throttle_costs;
8288 kv_ios = 0;
8289 kv_throttle_costs = 0;
7c673cae
FG
8290 utime_t start = ceph_clock_now();
8291 l.unlock();
8292
8293 dout(30) << __func__ << " committing " << kv_committing << dendl;
8294 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
8295 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
8296 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
8297
7c673cae
FG
8298 bool force_flush = false;
8299 // if bluefs is sharing the same device as data (only), then we
8300 // can rely on the bluefs commit to flush the device and make
8301 // deferred aios stable. that means that if we do have done deferred
8302 // txcs AND we are not on a single device, we need to force a flush.
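 // in short (added summary): always flush when there is no bluefs or bluefs
 // does not share the single data device; when it does, flush only if this
 // batch had aios, if there is nothing else to commit, or if
 // deferred_aggressive is set.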
8303 if (bluefs_single_shared_device && bluefs) {
31f18b77 8304 if (aios) {
7c673cae
FG
8305 force_flush = true;
8306 } else if (kv_committing.empty() && kv_submitting.empty() &&
8307 deferred_stable.empty()) {
8308 force_flush = true; // there's nothing else to commit!
8309 } else if (deferred_aggressive) {
8310 force_flush = true;
8311 }
8312 } else
8313 force_flush = true;
8314
8315 if (force_flush) {
31f18b77 8316 dout(20) << __func__ << " num_aios=" << aios
7c673cae
FG
8317 << " force_flush=" << (int)force_flush
8318 << ", flushing, deferred done->stable" << dendl;
8319 // flush/barrier on block device
8320 bdev->flush();
8321
8322 // if we flush then deferred done are now deferred stable
8323 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
8324 deferred_done.end());
8325 deferred_done.clear();
8326 }
8327 utime_t after_flush = ceph_clock_now();
8328
8329 // we will use one final transaction to force a sync
8330 KeyValueDB::Transaction synct = db->get_transaction();
8331
8332 // increase {nid,blobid}_max? note that this covers both the
8333 // case where we are approaching the max and the case we passed
8334 // it. in either case, we increase the max in the earlier txn
8335 // we submit.
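 // Hypothetical example: with bluestore_nid_prealloc = 1024, once nid_last
 // passes nid_max - 512 we persist nid_max = nid_last + 1024, writing the
 // key into the first transaction being submitted (or into synct when this
 // batch submits nothing new).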
8336 uint64_t new_nid_max = 0, new_blobid_max = 0;
8337 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
8338 KeyValueDB::Transaction t =
8339 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8340 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
8341 bufferlist bl;
8342 ::encode(new_nid_max, bl);
8343 t->set(PREFIX_SUPER, "nid_max", bl);
8344 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
8345 }
8346 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
8347 KeyValueDB::Transaction t =
8348 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8349 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
8350 bufferlist bl;
8351 ::encode(new_blobid_max, bl);
8352 t->set(PREFIX_SUPER, "blobid_max", bl);
8353 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
8354 }
c07f9fc5
FG
8355
8356 for (auto txc : kv_committing) {
8357 if (txc->state == TransContext::STATE_KV_QUEUED) {
8358 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8359 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8360 assert(r == 0);
8361 _txc_applied_kv(txc);
8362 --txc->osr->kv_committing_serially;
8363 txc->state = TransContext::STATE_KV_SUBMITTED;
8364 if (txc->osr->kv_submitted_waiters) {
8365 std::lock_guard<std::mutex> l(txc->osr->qlock);
8366 if (txc->osr->_is_all_kv_submitted()) {
8367 txc->osr->qcond.notify_all();
8368 }
7c673cae 8369 }
c07f9fc5
FG
8370
8371 } else {
8372 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8373 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
7c673cae 8374 }
7c673cae
FG
8375 if (txc->had_ios) {
8376 --txc->osr->txc_with_unstable_io;
8377 }
7c673cae
FG
8378 }
8379
31f18b77
FG
8380 // release throttle *before* we commit. this allows new ops
8381 // to be prepared and enter pipeline while we are waiting on
8382 // the kv commit sync/flush. then hopefully on the next
8383 // iteration there will already be ops awake. otherwise, we
8384 // end up going to sleep, and then wake up when the very first
8385 // transaction is ready for commit.
8386 throttle_bytes.put(costs);
8387
7c673cae
FG
8388 PExtentVector bluefs_gift_extents;
8389 if (bluefs &&
8390 after_flush - bluefs_last_balance >
8391 cct->_conf->bluestore_bluefs_balance_interval) {
8392 bluefs_last_balance = after_flush;
8393 int r = _balance_bluefs_freespace(&bluefs_gift_extents);
8394 assert(r >= 0);
8395 if (r > 0) {
8396 for (auto& p : bluefs_gift_extents) {
8397 bluefs_extents.insert(p.offset, p.length);
8398 }
8399 bufferlist bl;
8400 ::encode(bluefs_extents, bl);
8401 dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
8402 << bluefs_extents << std::dec << dendl;
8403 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
8404 }
8405 }
8406
8407 // cleanup sync deferred keys
8408 for (auto b : deferred_stable) {
8409 for (auto& txc : b->txcs) {
8410 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
8411 if (!wt.released.empty()) {
8412 // kraken replay compat only
8413 txc.released = wt.released;
8414 dout(10) << __func__ << " deferred txn has released "
8415 << txc.released
8416 << " (we just upgraded from kraken) on " << &txc << dendl;
8417 _txc_finalize_kv(&txc, synct);
8418 }
8419 // cleanup the deferred
8420 string key;
8421 get_deferred_key(wt.seq, &key);
8422 synct->rm_single_key(PREFIX_DEFERRED, key);
8423 }
8424 }
8425
8426 // submit synct synchronously (block and wait for it to commit)
31f18b77 8427 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
7c673cae
FG
8428 assert(r == 0);
8429
8430 if (new_nid_max) {
8431 nid_max = new_nid_max;
8432 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
8433 }
8434 if (new_blobid_max) {
8435 blobid_max = new_blobid_max;
8436 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
8437 }
8438
224ce89b
WB
8439 {
8440 utime_t finish = ceph_clock_now();
8441 utime_t dur_flush = after_flush - start;
8442 utime_t dur_kv = finish - after_flush;
8443 utime_t dur = finish - start;
8444 dout(20) << __func__ << " committed " << kv_committing.size()
8445 << " cleaned " << deferred_stable.size()
8446 << " in " << dur
8447 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
8448 << dendl;
7c673cae
FG
8449 logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
8450 logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
8451 logger->tinc(l_bluestore_kv_lat, dur);
8452 }
31f18b77
FG
8453
8454 if (bluefs) {
8455 if (!bluefs_gift_extents.empty()) {
8456 _commit_bluefs_freespace(bluefs_gift_extents);
8457 }
8458 for (auto p = bluefs_extents_reclaiming.begin();
8459 p != bluefs_extents_reclaiming.end();
8460 ++p) {
8461 dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
8462 << p.get_start() << "~" << p.get_len() << std::dec
8463 << dendl;
8464 alloc->release(p.get_start(), p.get_len());
8465 }
8466 bluefs_extents_reclaiming.clear();
8467 }
8468
8469 {
8470 std::unique_lock<std::mutex> m(kv_finalize_lock);
8471 if (kv_committing_to_finalize.empty()) {
8472 kv_committing_to_finalize.swap(kv_committing);
8473 } else {
8474 kv_committing_to_finalize.insert(
8475 kv_committing_to_finalize.end(),
8476 kv_committing.begin(),
8477 kv_committing.end());
8478 kv_committing.clear();
8479 }
8480 if (deferred_stable_to_finalize.empty()) {
8481 deferred_stable_to_finalize.swap(deferred_stable);
8482 } else {
8483 deferred_stable_to_finalize.insert(
8484 deferred_stable_to_finalize.end(),
8485 deferred_stable.begin(),
8486 deferred_stable.end());
8487 deferred_stable.clear();
8488 }
8489 kv_finalize_cond.notify_one();
8490 }
8491
8492 l.lock();
8493 // previously deferred "done" are now "stable" by virtue of this
8494 // commit cycle.
8495 deferred_stable_queue.swap(deferred_done);
8496 }
8497 }
8498 dout(10) << __func__ << " finish" << dendl;
8499 kv_sync_started = false;
8500}
8501
8502void BlueStore::_kv_finalize_thread()
8503{
8504 deque<TransContext*> kv_committed;
8505 deque<DeferredBatch*> deferred_stable;
8506 dout(10) << __func__ << " start" << dendl;
8507 std::unique_lock<std::mutex> l(kv_finalize_lock);
8508 assert(!kv_finalize_started);
8509 kv_finalize_started = true;
8510 kv_finalize_cond.notify_all();
8511 while (true) {
8512 assert(kv_committed.empty());
8513 assert(deferred_stable.empty());
8514 if (kv_committing_to_finalize.empty() &&
8515 deferred_stable_to_finalize.empty()) {
8516 if (kv_finalize_stop)
8517 break;
8518 dout(20) << __func__ << " sleep" << dendl;
8519 kv_finalize_cond.wait(l);
8520 dout(20) << __func__ << " wake" << dendl;
8521 } else {
8522 kv_committed.swap(kv_committing_to_finalize);
8523 deferred_stable.swap(deferred_stable_to_finalize);
8524 l.unlock();
8525 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
8526 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
8527
8528 while (!kv_committed.empty()) {
8529 TransContext *txc = kv_committed.front();
7c673cae
FG
8530 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8531 _txc_state_proc(txc);
31f18b77 8532 kv_committed.pop_front();
7c673cae 8533 }
31f18b77 8534
7c673cae
FG
8535 for (auto b : deferred_stable) {
8536 auto p = b->txcs.begin();
8537 while (p != b->txcs.end()) {
8538 TransContext *txc = &*p;
8539 p = b->txcs.erase(p); // unlink here because
8540 _txc_state_proc(txc); // this may destroy txc
8541 }
8542 delete b;
8543 }
31f18b77 8544 deferred_stable.clear();
7c673cae
FG
8545
8546 if (!deferred_aggressive) {
31f18b77 8547 if (deferred_queue_size >= deferred_batch_ops.load() ||
7c673cae 8548 throttle_deferred_bytes.past_midpoint()) {
224ce89b 8549 deferred_try_submit();
7c673cae
FG
8550 }
8551 }
8552
8553 // this is as good a place as any ...
8554 _reap_collections();
8555
7c673cae 8556 l.lock();
7c673cae
FG
8557 }
8558 }
8559 dout(10) << __func__ << " finish" << dendl;
31f18b77 8560 kv_finalize_started = false;
7c673cae
FG
8561}
8562
8563bluestore_deferred_op_t *BlueStore::_get_deferred_op(
8564 TransContext *txc, OnodeRef o)
8565{
8566 if (!txc->deferred_txn) {
8567 txc->deferred_txn = new bluestore_deferred_transaction_t;
8568 }
8569 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
8570 return &txc->deferred_txn->ops.back();
8571}
8572
8573void BlueStore::_deferred_queue(TransContext *txc)
8574{
8575 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
224ce89b 8576 deferred_lock.lock();
7c673cae
FG
8577 if (!txc->osr->deferred_pending &&
8578 !txc->osr->deferred_running) {
8579 deferred_queue.push_back(*txc->osr);
8580 }
8581 if (!txc->osr->deferred_pending) {
8582 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
8583 }
8584 ++deferred_queue_size;
8585 txc->osr->deferred_pending->txcs.push_back(*txc);
8586 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
8587 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
8588 const auto& op = *opi;
8589 assert(op.op == bluestore_deferred_op_t::OP_WRITE);
8590 bufferlist::const_iterator p = op.data.begin();
8591 for (auto e : op.extents) {
8592 txc->osr->deferred_pending->prepare_write(
8593 cct, wt.seq, e.offset, e.length, p);
8594 }
8595 }
8596 if (deferred_aggressive &&
8597 !txc->osr->deferred_running) {
224ce89b
WB
8598 _deferred_submit_unlock(txc->osr.get());
8599 } else {
8600 deferred_lock.unlock();
7c673cae
FG
8601 }
8602}
8603
224ce89b 8604void BlueStore::deferred_try_submit()
7c673cae
FG
8605{
8606 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
8607 << deferred_queue_size << " txcs" << dendl;
224ce89b
WB
8608 std::lock_guard<std::mutex> l(deferred_lock);
8609 vector<OpSequencerRef> osrs;
8610 osrs.reserve(deferred_queue.size());
7c673cae 8611 for (auto& osr : deferred_queue) {
224ce89b
WB
8612 osrs.push_back(&osr);
8613 }
8614 for (auto& osr : osrs) {
8615 if (osr->deferred_pending && !osr->deferred_running) {
8616 _deferred_submit_unlock(osr.get());
8617 deferred_lock.lock();
7c673cae
FG
8618 }
8619 }
8620}
8621
224ce89b 8622void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
7c673cae
FG
8623{
8624 dout(10) << __func__ << " osr " << osr
8625 << " " << osr->deferred_pending->iomap.size() << " ios pending "
8626 << dendl;
8627 assert(osr->deferred_pending);
8628 assert(!osr->deferred_running);
8629
8630 auto b = osr->deferred_pending;
8631 deferred_queue_size -= b->seq_bytes.size();
8632 assert(deferred_queue_size >= 0);
8633
8634 osr->deferred_running = osr->deferred_pending;
8635 osr->deferred_pending = nullptr;
8636
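 // (added note) the loop below walks the offset-sorted iomap and coalesces
 // physically contiguous deferred writes into a single bufferlist, issuing
 // one aio_write per contiguous run.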
8637 uint64_t start = 0, pos = 0;
8638 bufferlist bl;
8639 auto i = b->iomap.begin();
8640 while (true) {
8641 if (i == b->iomap.end() || i->first != pos) {
8642 if (bl.length()) {
8643 dout(20) << __func__ << " write 0x" << std::hex
8644 << start << "~" << bl.length()
8645 << " crc " << bl.crc32c(-1) << std::dec << dendl;
8646 if (!g_conf->bluestore_debug_omit_block_device_write) {
8647 logger->inc(l_bluestore_deferred_write_ops);
8648 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
8649 int r = bdev->aio_write(start, bl, &b->ioc, false);
8650 assert(r == 0);
8651 }
8652 }
8653 if (i == b->iomap.end()) {
8654 break;
8655 }
8656 start = 0;
8657 pos = i->first;
8658 bl.clear();
8659 }
8660 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
8661 << std::hex << pos << "~" << i->second.bl.length() << std::dec
8662 << dendl;
8663 if (!bl.length()) {
8664 start = pos;
8665 }
8666 pos += i->second.bl.length();
8667 bl.claim_append(i->second.bl);
8668 ++i;
8669 }
224ce89b
WB
8670
8671 // demote to deferred_submit_lock, then drop that too
8672 std::lock_guard<std::mutex> l(deferred_submit_lock);
8673 deferred_lock.unlock();
7c673cae
FG
8674 bdev->aio_submit(&b->ioc);
8675}
8676
8677void BlueStore::_deferred_aio_finish(OpSequencer *osr)
8678{
8679 dout(10) << __func__ << " osr " << osr << dendl;
8680 assert(osr->deferred_running);
8681 DeferredBatch *b = osr->deferred_running;
8682
8683 {
8684 std::lock_guard<std::mutex> l(deferred_lock);
8685 assert(osr->deferred_running == b);
8686 osr->deferred_running = nullptr;
8687 if (!osr->deferred_pending) {
8688 auto q = deferred_queue.iterator_to(*osr);
8689 deferred_queue.erase(q);
8690 } else if (deferred_aggressive) {
224ce89b
WB
8691 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
8692 finishers[0]->queue(new FunctionContext([&](int) {
8693 deferred_try_submit();
8694 }));
7c673cae
FG
8695 }
8696 }
8697
8698 {
31f18b77 8699 uint64_t costs = 0;
7c673cae
FG
8700 std::lock_guard<std::mutex> l2(osr->qlock);
8701 for (auto& i : b->txcs) {
8702 TransContext *txc = &i;
8703 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
31f18b77 8704 costs += txc->cost;
7c673cae 8705 }
31f18b77
FG
8706 osr->qcond.notify_all();
8707 throttle_deferred_bytes.put(costs);
7c673cae
FG
8708 std::lock_guard<std::mutex> l(kv_lock);
8709 deferred_done_queue.emplace_back(b);
8710 }
8711
8712 // in the normal case, do not bother waking up the kv thread; it will
8713 // catch us on the next commit anyway.
8714 if (deferred_aggressive) {
8715 std::lock_guard<std::mutex> l(kv_lock);
8716 kv_cond.notify_one();
8717 }
8718}
8719
8720int BlueStore::_deferred_replay()
8721{
8722 dout(10) << __func__ << " start" << dendl;
8723 OpSequencerRef osr = new OpSequencer(cct, this);
8724 int count = 0;
8725 int r = 0;
8726 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
8727 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
8728 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
8729 << dendl;
8730 bluestore_deferred_transaction_t *deferred_txn =
8731 new bluestore_deferred_transaction_t;
8732 bufferlist bl = it->value();
8733 bufferlist::iterator p = bl.begin();
8734 try {
8735 ::decode(*deferred_txn, p);
8736 } catch (buffer::error& e) {
8737 derr << __func__ << " failed to decode deferred txn "
8738 << pretty_binary_string(it->key()) << dendl;
8739 delete deferred_txn;
8740 r = -EIO;
8741 goto out;
8742 }
8743 TransContext *txc = _txc_create(osr.get());
8744 txc->deferred_txn = deferred_txn;
8745 txc->state = TransContext::STATE_KV_DONE;
8746 _txc_state_proc(txc);
8747 }
8748 out:
8749 dout(20) << __func__ << " draining osr" << dendl;
8750 _osr_drain_all();
8751 osr->discard();
8752 dout(10) << __func__ << " completed " << count << " events" << dendl;
8753 return r;
8754}
8755
8756// ---------------------------
8757// transactions
8758
8759int BlueStore::queue_transactions(
8760 Sequencer *posr,
8761 vector<Transaction>& tls,
8762 TrackedOpRef op,
8763 ThreadPool::TPHandle *handle)
8764{
8765 FUNCTRACE();
8766 Context *onreadable;
8767 Context *ondisk;
8768 Context *onreadable_sync;
8769 ObjectStore::Transaction::collect_contexts(
8770 tls, &onreadable, &ondisk, &onreadable_sync);
8771
8772 if (cct->_conf->objectstore_blackhole) {
8773 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
8774 << dendl;
8775 delete ondisk;
8776 delete onreadable;
8777 delete onreadable_sync;
8778 return 0;
8779 }
8780 utime_t start = ceph_clock_now();
8781 // set up the sequencer
8782 OpSequencer *osr;
8783 assert(posr);
8784 if (posr->p) {
8785 osr = static_cast<OpSequencer *>(posr->p.get());
8786 dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
8787 } else {
8788 osr = new OpSequencer(cct, this);
8789 osr->parent = posr;
8790 posr->p = osr;
8791 dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
8792 }
8793
8794 // prepare
8795 TransContext *txc = _txc_create(osr);
8796 txc->onreadable = onreadable;
8797 txc->onreadable_sync = onreadable_sync;
8798 txc->oncommit = ondisk;
8799
8800 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
8801 (*p).set_osr(osr);
8802 txc->bytes += (*p).get_num_bytes();
8803 _txc_add_transaction(txc, &(*p));
8804 }
8805 _txc_calc_cost(txc);
8806
8807 _txc_write_nodes(txc, txc->t);
8808
8809 // journal deferred items
8810 if (txc->deferred_txn) {
8811 txc->deferred_txn->seq = ++deferred_seq;
8812 bufferlist bl;
8813 ::encode(*txc->deferred_txn, bl);
8814 string key;
8815 get_deferred_key(txc->deferred_txn->seq, &key);
8816 txc->t->set(PREFIX_DEFERRED, key, bl);
8817 }
8818
8819 _txc_finalize_kv(txc, txc->t);
8820 if (handle)
8821 handle->suspend_tp_timeout();
8822
8823 utime_t tstart = ceph_clock_now();
8824 throttle_bytes.get(txc->cost);
8825 if (txc->deferred_txn) {
8826 // ensure we do not block here because of deferred writes
8827 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
d2e6a577
FG
8828 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
8829 << dendl;
8830 ++deferred_aggressive;
7c673cae
FG
8831 deferred_try_submit();
8832 throttle_deferred_bytes.get(txc->cost);
d2e6a577
FG
8833 --deferred_aggressive;
8834 }
7c673cae
FG
8835 }
8836 utime_t tend = ceph_clock_now();
8837
8838 if (handle)
8839 handle->reset_tp_timeout();
8840
8841 logger->inc(l_bluestore_txc);
8842
8843 // execute (start)
8844 _txc_state_proc(txc);
8845
8846 logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
8847 logger->tinc(l_bluestore_throttle_lat, tend - tstart);
8848 return 0;
8849}
8850
8851void BlueStore::_txc_aio_submit(TransContext *txc)
8852{
8853 dout(10) << __func__ << " txc " << txc << dendl;
8854 bdev->aio_submit(&txc->ioc);
8855}
8856
8857void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
8858{
8859 Transaction::iterator i = t->begin();
8860
8861 _dump_transaction(t);
8862
8863 vector<CollectionRef> cvec(i.colls.size());
8864 unsigned j = 0;
8865 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
8866 ++p, ++j) {
8867 cvec[j] = _get_collection(*p);
7c673cae
FG
8868 }
8869 vector<OnodeRef> ovec(i.objects.size());
8870
8871 for (int pos = 0; i.have_op(); ++pos) {
8872 Transaction::Op *op = i.decode_op();
8873 int r = 0;
8874
8875 // no coll or obj
8876 if (op->op == Transaction::OP_NOP)
8877 continue;
8878
8879 // collection operations
8880 CollectionRef &c = cvec[op->cid];
8881 switch (op->op) {
8882 case Transaction::OP_RMCOLL:
8883 {
8884 const coll_t &cid = i.get_cid(op->cid);
8885 r = _remove_collection(txc, cid, &c);
8886 if (!r)
8887 continue;
8888 }
8889 break;
8890
8891 case Transaction::OP_MKCOLL:
8892 {
8893 assert(!c);
8894 const coll_t &cid = i.get_cid(op->cid);
8895 r = _create_collection(txc, cid, op->split_bits, &c);
8896 if (!r)
8897 continue;
8898 }
8899 break;
8900
8901 case Transaction::OP_SPLIT_COLLECTION:
8902 assert(0 == "deprecated");
8903 break;
8904
8905 case Transaction::OP_SPLIT_COLLECTION2:
8906 {
8907 uint32_t bits = op->split_bits;
8908 uint32_t rem = op->split_rem;
8909 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
8910 if (!r)
8911 continue;
8912 }
8913 break;
8914
8915 case Transaction::OP_COLL_HINT:
8916 {
8917 uint32_t type = op->hint_type;
8918 bufferlist hint;
8919 i.decode_bl(hint);
8920 bufferlist::iterator hiter = hint.begin();
8921 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
8922 uint32_t pg_num;
8923 uint64_t num_objs;
8924 ::decode(pg_num, hiter);
8925 ::decode(num_objs, hiter);
8926 dout(10) << __func__ << " collection hint objects is a no-op, "
8927 << " pg_num " << pg_num << " num_objects " << num_objs
8928 << dendl;
8929 } else {
8930 // Ignore the hint
8931 dout(10) << __func__ << " unknown collection hint " << type << dendl;
8932 }
8933 continue;
8934 }
8935 break;
8936
8937 case Transaction::OP_COLL_SETATTR:
8938 r = -EOPNOTSUPP;
8939 break;
8940
8941 case Transaction::OP_COLL_RMATTR:
8942 r = -EOPNOTSUPP;
8943 break;
8944
8945 case Transaction::OP_COLL_RENAME:
8946 assert(0 == "not implemented");
8947 break;
8948 }
8949 if (r < 0) {
8950 derr << __func__ << " error " << cpp_strerror(r)
8951 << " not handled on operation " << op->op
8952 << " (op " << pos << ", counting from 0)" << dendl;
8953 _dump_transaction(t, 0);
8954 assert(0 == "unexpected error");
8955 }
8956
8957 // these operations implicitly create the object
8958 bool create = false;
8959 if (op->op == Transaction::OP_TOUCH ||
8960 op->op == Transaction::OP_WRITE ||
8961 op->op == Transaction::OP_ZERO) {
8962 create = true;
8963 }
8964
8965 // object operations
8966 RWLock::WLocker l(c->lock);
8967 OnodeRef &o = ovec[op->oid];
8968 if (!o) {
8969 ghobject_t oid = i.get_oid(op->oid);
8970 o = c->get_onode(oid, create);
8971 }
8972 if (!create && (!o || !o->exists)) {
8973 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
8974 << i.get_oid(op->oid) << dendl;
8975 r = -ENOENT;
8976 goto endop;
8977 }
8978
8979 switch (op->op) {
8980 case Transaction::OP_TOUCH:
8981 r = _touch(txc, c, o);
8982 break;
8983
8984 case Transaction::OP_WRITE:
8985 {
8986 uint64_t off = op->off;
8987 uint64_t len = op->len;
8988 uint32_t fadvise_flags = i.get_fadvise_flags();
8989 bufferlist bl;
8990 i.decode_bl(bl);
8991 r = _write(txc, c, o, off, len, bl, fadvise_flags);
8992 }
8993 break;
8994
8995 case Transaction::OP_ZERO:
8996 {
8997 uint64_t off = op->off;
8998 uint64_t len = op->len;
8999 r = _zero(txc, c, o, off, len);
9000 }
9001 break;
9002
9003 case Transaction::OP_TRIMCACHE:
9004 {
9005 // deprecated, no-op
9006 }
9007 break;
9008
9009 case Transaction::OP_TRUNCATE:
9010 {
9011 uint64_t off = op->off;
9012 _truncate(txc, c, o, off);
9013 }
9014 break;
9015
9016 case Transaction::OP_REMOVE:
9017 {
9018 r = _remove(txc, c, o);
9019 }
9020 break;
9021
9022 case Transaction::OP_SETATTR:
9023 {
9024 string name = i.decode_string();
9025 bufferptr bp;
9026 i.decode_bp(bp);
9027 r = _setattr(txc, c, o, name, bp);
9028 }
9029 break;
9030
9031 case Transaction::OP_SETATTRS:
9032 {
9033 map<string, bufferptr> aset;
9034 i.decode_attrset(aset);
9035 r = _setattrs(txc, c, o, aset);
9036 }
9037 break;
9038
9039 case Transaction::OP_RMATTR:
9040 {
9041 string name = i.decode_string();
9042 r = _rmattr(txc, c, o, name);
9043 }
9044 break;
9045
9046 case Transaction::OP_RMATTRS:
9047 {
9048 r = _rmattrs(txc, c, o);
9049 }
9050 break;
9051
9052 case Transaction::OP_CLONE:
9053 {
9054 OnodeRef& no = ovec[op->dest_oid];
9055 if (!no) {
9056 const ghobject_t& noid = i.get_oid(op->dest_oid);
9057 no = c->get_onode(noid, true);
9058 }
9059 r = _clone(txc, c, o, no);
9060 }
9061 break;
9062
9063 case Transaction::OP_CLONERANGE:
9064 assert(0 == "deprecated");
9065 break;
9066
9067 case Transaction::OP_CLONERANGE2:
9068 {
9069 OnodeRef& no = ovec[op->dest_oid];
9070 if (!no) {
9071 const ghobject_t& noid = i.get_oid(op->dest_oid);
9072 no = c->get_onode(noid, true);
9073 }
9074 uint64_t srcoff = op->off;
9075 uint64_t len = op->len;
9076 uint64_t dstoff = op->dest_off;
9077 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
9078 }
9079 break;
9080
9081 case Transaction::OP_COLL_ADD:
9082 assert(0 == "not implemented");
9083 break;
9084
9085 case Transaction::OP_COLL_REMOVE:
9086 assert(0 == "not implemented");
9087 break;
9088
9089 case Transaction::OP_COLL_MOVE:
9090 assert(0 == "deprecated");
9091 break;
9092
9093 case Transaction::OP_COLL_MOVE_RENAME:
9094 case Transaction::OP_TRY_RENAME:
9095 {
9096 assert(op->cid == op->dest_cid);
9097 const ghobject_t& noid = i.get_oid(op->dest_oid);
9098 OnodeRef& no = ovec[op->dest_oid];
9099 if (!no) {
9100 no = c->get_onode(noid, false);
9101 }
9102 r = _rename(txc, c, o, no, noid);
9103 }
9104 break;
9105
9106 case Transaction::OP_OMAP_CLEAR:
9107 {
9108 r = _omap_clear(txc, c, o);
9109 }
9110 break;
9111 case Transaction::OP_OMAP_SETKEYS:
9112 {
9113 bufferlist aset_bl;
9114 i.decode_attrset_bl(&aset_bl);
9115 r = _omap_setkeys(txc, c, o, aset_bl);
9116 }
9117 break;
9118 case Transaction::OP_OMAP_RMKEYS:
9119 {
9120 bufferlist keys_bl;
9121 i.decode_keyset_bl(&keys_bl);
9122 r = _omap_rmkeys(txc, c, o, keys_bl);
9123 }
9124 break;
9125 case Transaction::OP_OMAP_RMKEYRANGE:
9126 {
9127 string first, last;
9128 first = i.decode_string();
9129 last = i.decode_string();
9130 r = _omap_rmkey_range(txc, c, o, first, last);
9131 }
9132 break;
9133 case Transaction::OP_OMAP_SETHEADER:
9134 {
9135 bufferlist bl;
9136 i.decode_bl(bl);
9137 r = _omap_setheader(txc, c, o, bl);
9138 }
9139 break;
9140
9141 case Transaction::OP_SETALLOCHINT:
9142 {
9143 r = _set_alloc_hint(txc, c, o,
9144 op->expected_object_size,
9145 op->expected_write_size,
9146 op->alloc_hint_flags);
9147 }
9148 break;
9149
9150 default:
9151 derr << __func__ << " bad op " << op->op << dendl;
9152 ceph_abort();
9153 }
9154
9155 endop:
9156 if (r < 0) {
9157 bool ok = false;
9158
9159 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
9160 op->op == Transaction::OP_CLONE ||
9161 op->op == Transaction::OP_CLONERANGE2 ||
9162 op->op == Transaction::OP_COLL_ADD ||
9163 op->op == Transaction::OP_SETATTR ||
9164 op->op == Transaction::OP_SETATTRS ||
9165 op->op == Transaction::OP_RMATTR ||
9166 op->op == Transaction::OP_OMAP_SETKEYS ||
9167 op->op == Transaction::OP_OMAP_RMKEYS ||
9168 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
9169 op->op == Transaction::OP_OMAP_SETHEADER))
9170 // -ENOENT is usually okay
9171 ok = true;
9172 if (r == -ENODATA)
9173 ok = true;
9174
9175 if (!ok) {
9176 const char *msg = "unexpected error code";
9177
9178 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
9179 op->op == Transaction::OP_CLONE ||
9180 op->op == Transaction::OP_CLONERANGE2))
9181 msg = "ENOENT on clone suggests osd bug";
9182
9183 if (r == -ENOSPC)
9184 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
9185 // by partially applying transactions.
9186 msg = "ENOSPC from bluestore, misconfigured cluster";
9187
9188 if (r == -ENOTEMPTY) {
9189 msg = "ENOTEMPTY suggests garbage data in osd data dir";
9190 }
9191
9192 derr << __func__ << " error " << cpp_strerror(r)
9193 << " not handled on operation " << op->op
9194 << " (op " << pos << ", counting from 0)"
9195 << dendl;
9196 derr << msg << dendl;
9197 _dump_transaction(t, 0);
9198 assert(0 == "unexpected error");
9199 }
9200 }
9201 }
9202}
9203
9204
9205
9206// -----------------
9207// write operations
9208
9209int BlueStore::_touch(TransContext *txc,
9210 CollectionRef& c,
9211 OnodeRef &o)
9212{
9213 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
9214 int r = 0;
7c673cae
FG
9215 _assign_nid(txc, o);
9216 txc->write_onode(o);
9217 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
9218 return r;
9219}
9220
9221void BlueStore::_dump_onode(OnodeRef o, int log_level)
9222{
9223 if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
9224 return;
9225 dout(log_level) << __func__ << " " << o << " " << o->oid
9226 << " nid " << o->onode.nid
9227 << " size 0x" << std::hex << o->onode.size
9228 << " (" << std::dec << o->onode.size << ")"
9229 << " expected_object_size " << o->onode.expected_object_size
9230 << " expected_write_size " << o->onode.expected_write_size
9231 << " in " << o->onode.extent_map_shards.size() << " shards"
9232 << ", " << o->extent_map.spanning_blob_map.size()
9233 << " spanning blobs"
9234 << dendl;
9235 for (auto p = o->onode.attrs.begin();
9236 p != o->onode.attrs.end();
9237 ++p) {
9238 dout(log_level) << __func__ << " attr " << p->first
9239 << " len " << p->second.length() << dendl;
9240 }
9241 _dump_extent_map(o->extent_map, log_level);
9242}
9243
9244void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
9245{
9246 uint64_t pos = 0;
9247 for (auto& s : em.shards) {
9248 dout(log_level) << __func__ << " shard " << *s.shard_info
9249 << (s.loaded ? " (loaded)" : "")
9250 << (s.dirty ? " (dirty)" : "")
9251 << dendl;
9252 }
9253 for (auto& e : em.extent_map) {
9254 dout(log_level) << __func__ << " " << e << dendl;
9255 assert(e.logical_offset >= pos);
9256 pos = e.logical_offset + e.length;
9257 const bluestore_blob_t& blob = e.blob->get_blob();
9258 if (blob.has_csum()) {
9259 vector<uint64_t> v;
9260 unsigned n = blob.get_csum_count();
9261 for (unsigned i = 0; i < n; ++i)
9262 v.push_back(blob.get_csum_item(i));
9263 dout(log_level) << __func__ << " csum: " << std::hex << v << std::dec
9264 << dendl;
9265 }
9266 std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
9267 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
9268 dout(log_level) << __func__ << " 0x" << std::hex << i.first
9269 << "~" << i.second->length << std::dec
9270 << " " << *i.second << dendl;
9271 }
9272 }
9273}
9274
9275void BlueStore::_dump_transaction(Transaction *t, int log_level)
9276{
9277 dout(log_level) << " transaction dump:\n";
9278 JSONFormatter f(true);
9279 f.open_object_section("transaction");
9280 t->dump(&f);
9281 f.close_section();
9282 f.flush(*_dout);
9283 *_dout << dendl;
9284}
9285
9286void BlueStore::_pad_zeros(
9287 bufferlist *bl, uint64_t *offset,
9288 uint64_t chunk_size)
9289{
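 // Worked example (illustrative): writing 5 bytes at *offset 0x1003 with
 // chunk_size 0x1000 pads 3 zero bytes in front and 0xff8 behind, leaving a
 // single aligned chunk of 0x1000 bytes at offset 0x1000.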
9290 auto length = bl->length();
9291 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
9292 << " chunk_size 0x" << chunk_size << std::dec << dendl;
9293 dout(40) << "before:\n";
9294 bl->hexdump(*_dout);
9295 *_dout << dendl;
9296 // front
9297 size_t front_pad = *offset % chunk_size;
9298 size_t back_pad = 0;
9299 size_t pad_count = 0;
9300 if (front_pad) {
9301 size_t front_copy = MIN(chunk_size - front_pad, length);
9302 bufferptr z = buffer::create_page_aligned(chunk_size);
224ce89b 9303 z.zero(0, front_pad, false);
7c673cae 9304 pad_count += front_pad;
224ce89b 9305 bl->copy(0, front_copy, z.c_str() + front_pad);
7c673cae
FG
9306 if (front_copy + front_pad < chunk_size) {
9307 back_pad = chunk_size - (length + front_pad);
224ce89b 9308 z.zero(front_pad + length, back_pad, false);
7c673cae
FG
9309 pad_count += back_pad;
9310 }
9311 bufferlist old, t;
9312 old.swap(*bl);
9313 t.substr_of(old, front_copy, length - front_copy);
9314 bl->append(z);
9315 bl->claim_append(t);
9316 *offset -= front_pad;
224ce89b 9317 length += pad_count;
7c673cae
FG
9318 }
9319
9320 // back
9321 uint64_t end = *offset + length;
9322 unsigned back_copy = end % chunk_size;
9323 if (back_copy) {
9324 assert(back_pad == 0);
9325 back_pad = chunk_size - back_copy;
9326 assert(back_copy <= length);
9327 bufferptr tail(chunk_size);
224ce89b
WB
9328 bl->copy(length - back_copy, back_copy, tail.c_str());
9329 tail.zero(back_copy, back_pad, false);
7c673cae
FG
9330 bufferlist old;
9331 old.swap(*bl);
9332 bl->substr_of(old, 0, length - back_copy);
9333 bl->append(tail);
9334 length += back_pad;
9335 pad_count += back_pad;
9336 }
9337 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
9338 << back_pad << " on front/back, now 0x" << *offset << "~"
9339 << length << std::dec << dendl;
9340 dout(40) << "after:\n";
9341 bl->hexdump(*_dout);
9342 *_dout << dendl;
9343 if (pad_count)
9344 logger->inc(l_bluestore_write_pad_bytes, pad_count);
9345 assert(bl->length() == length);
9346}
9347
9348void BlueStore::_do_write_small(
9349 TransContext *txc,
9350 CollectionRef &c,
9351 OnodeRef o,
9352 uint64_t offset, uint64_t length,
9353 bufferlist::iterator& blp,
9354 WriteContext *wctx)
9355{
9356 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9357 << std::dec << dendl;
9358 assert(length < min_alloc_size);
9359 uint64_t end_offs = offset + length;
9360
9361 logger->inc(l_bluestore_write_small);
9362 logger->inc(l_bluestore_write_small_bytes, length);
9363
9364 bufferlist bl;
9365 blp.copy(length, bl);
9366
9367 // Look for an existing mutable blob we can use.
9368 auto begin = o->extent_map.extent_map.begin();
9369 auto end = o->extent_map.extent_map.end();
9370 auto ep = o->extent_map.seek_lextent(offset);
9371 if (ep != begin) {
9372 --ep;
9373 if (ep->blob_end() <= offset) {
9374 ++ep;
9375 }
9376 }
9377 auto prev_ep = ep;
9378 if (prev_ep != begin) {
9379 --prev_ep;
9380 } else {
9381 prev_ep = end; // to avoid this extent check as it's a duplicate
9382 }
9383
9384 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9385 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9386 uint32_t alloc_len = min_alloc_size;
9387 auto offset0 = P2ALIGN(offset, alloc_len);
9388
9389 bool any_change;
9390
9391 // search suitable extent in both forward and reverse direction in
9392 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 9393 // then check if blob can be reused via can_reuse_blob func or apply
7c673cae
FG
9394 // direct/deferred write (the latter for extents including or higher
9395 // than 'offset' only).
9396 do {
9397 any_change = false;
9398
9399 if (ep != end && ep->logical_offset < offset + max_bsize) {
9400 BlobRef b = ep->blob;
9401 auto bstart = ep->blob_start();
9402 dout(20) << __func__ << " considering " << *b
9403 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9404 if (bstart >= end_offs) {
9405 dout(20) << __func__ << " ignoring distant " << *b << dendl;
9406 } else if (!b->get_blob().is_mutable()) {
9407 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
9408 } else if (ep->logical_offset % min_alloc_size !=
9409 ep->blob_offset % min_alloc_size) {
9410 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
9411 } else {
9412 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9413 // can we pad our head/tail out with zeros?
9414 uint64_t head_pad, tail_pad;
9415 head_pad = P2PHASE(offset, chunk_size);
9416 tail_pad = P2NPHASE(end_offs, chunk_size);
9417 if (head_pad || tail_pad) {
9418 o->extent_map.fault_range(db, offset - head_pad,
9419 end_offs - offset + head_pad + tail_pad);
9420 }
9421 if (head_pad &&
9422 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
9423 head_pad = 0;
9424 }
9425 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
9426 tail_pad = 0;
9427 }
9428
9429 uint64_t b_off = offset - head_pad - bstart;
9430 uint64_t b_len = length + head_pad + tail_pad;
9431
9432 // direct write into unused blocks of an existing mutable blob?
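// This fast path applies only when the padded range is chunk-aligned,
// fits within the blob's on-disk extent, is still flagged unused, and is
// already allocated; the data is then written in place (via a deferred op
// for small lengths, otherwise via aio) with no read-modify-write.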
9433 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
9434 b->get_blob().get_ondisk_length() >= b_off + b_len &&
9435 b->get_blob().is_unused(b_off, b_len) &&
9436 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 9437 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
9438
9439 dout(20) << __func__ << " write to unused 0x" << std::hex
9440 << b_off << "~" << b_len
9441 << " pad 0x" << head_pad << " + 0x" << tail_pad
9442 << std::dec << " of mutable " << *b << dendl;
224ce89b 9443 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
9444 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9445
9446 if (!g_conf->bluestore_debug_omit_block_device_write) {
9447 if (b_len <= prefer_deferred_size) {
9448 dout(20) << __func__ << " deferring small 0x" << std::hex
9449 << b_len << std::dec << " unused write via deferred" << dendl;
9450 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9451 op->op = bluestore_deferred_op_t::OP_WRITE;
9452 b->get_blob().map(
9453 b_off, b_len,
9454 [&](uint64_t offset, uint64_t length) {
9455 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9456 return 0;
9457 });
224ce89b 9458 op->data = bl;
7c673cae
FG
9459 } else {
9460 b->get_blob().map_bl(
224ce89b 9461 b_off, bl,
7c673cae
FG
9462 [&](uint64_t offset, bufferlist& t) {
9463 bdev->aio_write(offset, t,
9464 &txc->ioc, wctx->buffered);
9465 });
9466 }
9467 }
224ce89b 9468 b->dirty_blob().calc_csum(b_off, bl);
7c673cae
FG
9469 dout(20) << __func__ << " lex old " << *ep << dendl;
9470 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
9471 b,
9472 &wctx->old_extents);
9473 b->dirty_blob().mark_used(le->blob_offset, le->length);
9474 txc->statfs_delta.stored() += le->length;
9475 dout(20) << __func__ << " lex " << *le << dendl;
9476 logger->inc(l_bluestore_write_small_unused);
9477 return;
9478 }
9479 // read some data to fill out the chunk?
9480 uint64_t head_read = P2PHASE(b_off, chunk_size);
9481 uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
9482 if ((head_read || tail_read) &&
9483 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
9484 head_read + tail_read < min_alloc_size) {
9485 b_off -= head_read;
9486 b_len += head_read + tail_read;
9487
9488 } else {
9489 head_read = tail_read = 0;
9490 }
9491
9492 // chunk-aligned deferred overwrite?
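// If the (possibly widened) range is chunk-aligned and already allocated,
// the missing head/tail bytes are read back below, merged with the new
// data, and the whole span is queued as a single deferred overwrite.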
9493 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
9494 b_off % chunk_size == 0 &&
9495 b_len % chunk_size == 0 &&
9496 b->get_blob().is_allocated(b_off, b_len)) {
9497
224ce89b 9498 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
9499
9500 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
9501 << " and tail 0x" << tail_read << std::dec << dendl;
9502 if (head_read) {
9503 bufferlist head_bl;
9504 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
9505 head_bl, 0);
9506 assert(r >= 0 && r <= (int)head_read);
9507 size_t zlen = head_read - r;
9508 if (zlen) {
9509 head_bl.append_zero(zlen);
9510 logger->inc(l_bluestore_write_pad_bytes, zlen);
9511 }
224ce89b 9512 bl.claim_prepend(head_bl);
7c673cae
FG
9513 logger->inc(l_bluestore_write_penalty_read_ops);
9514 }
9515 if (tail_read) {
9516 bufferlist tail_bl;
9517 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
9518 tail_bl, 0);
9519 assert(r >= 0 && r <= (int)tail_read);
9520 size_t zlen = tail_read - r;
9521 if (zlen) {
9522 tail_bl.append_zero(zlen);
9523 logger->inc(l_bluestore_write_pad_bytes, zlen);
9524 }
224ce89b 9525 bl.claim_append(tail_bl);
7c673cae
FG
9526 logger->inc(l_bluestore_write_penalty_read_ops);
9527 }
9528 logger->inc(l_bluestore_write_small_pre_read);
9529
9530 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9531 op->op = bluestore_deferred_op_t::OP_WRITE;
224ce89b 9532 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
9533 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9534
9535 int r = b->get_blob().map(
9536 b_off, b_len,
9537 [&](uint64_t offset, uint64_t length) {
9538 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9539 return 0;
9540 });
9541 assert(r == 0);
9542 if (b->get_blob().csum_type) {
224ce89b 9543 b->dirty_blob().calc_csum(b_off, bl);
7c673cae 9544 }
224ce89b 9545 op->data.claim(bl);
7c673cae
FG
9546 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
9547 << b_len << std::dec << " of mutable " << *b
9548 << " at " << op->extents << dendl;
9549 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
9550 b, &wctx->old_extents);
9551 b->dirty_blob().mark_used(le->blob_offset, le->length);
9552 txc->statfs_delta.stored() += le->length;
9553 dout(20) << __func__ << " lex " << *le << dendl;
9554 logger->inc(l_bluestore_write_small_deferred);
9555 return;
9556 }
224ce89b
WB
9557 // try to reuse blob if we can
9558 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
9559 max_bsize,
9560 offset0 - bstart,
9561 &alloc_len)) {
9562 assert(alloc_len == min_alloc_size); // expecting data to always
9563 // fit into the reused blob
9564 // Need to check for pending writes that want to
9565 // reuse the same pextent. The rationale is that during GC two chunks
9566 // from garbage blobs (compressed?) can share logical space within the same
9567 // AU. That in turn might be caused by an unaligned length in clone_range2.
9568 // Hence the second write would fail when attempting to reuse the blob in
9569 // do_alloc_write().
9570 if (!wctx->has_conflict(b,
9571 offset0,
9572 offset0 + alloc_len,
9573 min_alloc_size)) {
9574
9575 // we can't reuse pad_head/pad_tail since they might be truncated
9576 // due to existing extents
9577 uint64_t b_off = offset - bstart;
9578 uint64_t b_off0 = b_off;
9579 _pad_zeros(&bl, &b_off0, chunk_size);
9580
9581 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
9582 << " (0x" << b_off0 << "~" << bl.length() << ")"
9583 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
9584 << std::dec << dendl;
9585
9586 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9587 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9588 false, false);
9589 logger->inc(l_bluestore_write_small_unused);
9590 return;
9591 }
9592 }
9593 }
9594 ++ep;
9595 any_change = true;
9596 } // if (ep != end && ep->logical_offset < offset + max_bsize)
9597
9598 // check extent for reuse in reverse order
9599 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9600 BlobRef b = prev_ep->blob;
9601 auto bstart = prev_ep->blob_start();
9602 dout(20) << __func__ << " considering " << *b
9603 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 9604 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
9605 max_bsize,
9606 offset0 - bstart,
9607 &alloc_len)) {
9608 assert(alloc_len == min_alloc_size); // expecting data to always
9609 // fit into the reused blob
9610 // Need to check for pending writes that want to
9611 // reuse the same pextent. The rationale is that during GC two chunks
9612 // from garbage blobs (compressed?) can share logical space within the same
9613 // AU. That in turn might be caused by an unaligned length in clone_range2.
9614 // Hence the second write would fail when attempting to reuse the blob in
9615 // do_alloc_write().
9616 if (!wctx->has_conflict(b,
9617 offset0,
9618 offset0 + alloc_len,
9619 min_alloc_size)) {
9620
9621 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9622 uint64_t b_off = offset - bstart;
9623 uint64_t b_off0 = b_off;
9624 _pad_zeros(&bl, &b_off0, chunk_size);
9625
9626 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
9627 << " (0x" << b_off0 << "~" << bl.length() << ")"
9628 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
9629 << std::dec << dendl;
9630
9631 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9632 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9633 false, false);
9634 logger->inc(l_bluestore_write_small_unused);
9635 return;
9636 }
9637 }
9638 if (prev_ep != begin) {
9639 --prev_ep;
9640 any_change = true;
9641 } else {
9642 prev_ep = end; // to avoid useless first extent re-check
9643 }
9644 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
9645 } while (any_change);
9646
9647 // new blob.
9648
9649 BlobRef b = c->new_blob();
9650 uint64_t b_off = P2PHASE(offset, alloc_len);
9651 uint64_t b_off0 = b_off;
9652 _pad_zeros(&bl, &b_off0, block_size);
9653 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9654 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
9655 logger->inc(l_bluestore_write_small_new);
9656
9657 return;
9658}
9659
9660void BlueStore::_do_write_big(
9661 TransContext *txc,
9662 CollectionRef &c,
9663 OnodeRef o,
9664 uint64_t offset, uint64_t length,
9665 bufferlist::iterator& blp,
9666 WriteContext *wctx)
9667{
9668 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9669 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
9670 << " compress " << (int)wctx->compress
9671 << dendl;
9672 logger->inc(l_bluestore_write_big);
9673 logger->inc(l_bluestore_write_big_bytes, length);
9674 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9675 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9676 while (length > 0) {
9677 bool new_blob = false;
9678 uint32_t l = MIN(max_bsize, length);
9679 BlobRef b;
9680 uint32_t b_off = 0;
9681
9682 // attempt to reuse an existing blob
9683 if (!wctx->compress) {
9684 // look for an existing mutable blob we can reuse
9685 auto begin = o->extent_map.extent_map.begin();
9686 auto end = o->extent_map.extent_map.end();
9687 auto ep = o->extent_map.seek_lextent(offset);
9688 auto prev_ep = ep;
9689 if (prev_ep != begin) {
9690 --prev_ep;
9691 } else {
9692 prev_ep = end; // to avoid this extent check as it's a duplicate
9693 }
9694 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9695 // search suitable extent in both forward and reverse direction in
9696 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 9697 // then check if blob can be reused via can_reuse_blob func.
7c673cae
FG
9698 bool any_change;
9699 do {
9700 any_change = false;
9701 if (ep != end && ep->logical_offset < offset + max_bsize) {
9702 if (offset >= ep->blob_start() &&
224ce89b 9703 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
9704 offset - ep->blob_start(),
9705 &l)) {
9706 b = ep->blob;
9707 b_off = offset - ep->blob_start();
9708 prev_ep = end; // to avoid check below
9709 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 9710 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
9711 } else {
9712 ++ep;
9713 any_change = true;
9714 }
9715 }
9716
9717 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
224ce89b 9718 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
9719 offset - prev_ep->blob_start(),
9720 &l)) {
9721 b = prev_ep->blob;
9722 b_off = offset - prev_ep->blob_start();
9723 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 9724 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
9725 } else if (prev_ep != begin) {
9726 --prev_ep;
9727 any_change = true;
9728 } else {
9729 prev_ep = end; // to avoid useless first extent re-check
9730 }
9731 }
9732 } while (b == nullptr && any_change);
9733 }
9734 if (b == nullptr) {
9735 b = c->new_blob();
9736 b_off = 0;
9737 new_blob = true;
9738 }
9739
9740 bufferlist t;
9741 blp.copy(l, t);
9742 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
9743 offset += l;
9744 length -= l;
9745 logger->inc(l_bluestore_write_big_blobs);
9746 }
9747}
9748
9749int BlueStore::_do_alloc_write(
9750 TransContext *txc,
9751 CollectionRef coll,
9752 OnodeRef o,
9753 WriteContext *wctx)
9754{
9755 dout(20) << __func__ << " txc " << txc
9756 << " " << wctx->writes.size() << " blobs"
9757 << dendl;
9758
9759 uint64_t need = 0;
9760 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9761 for (auto &wi : wctx->writes) {
9762 need += wi.blob_length;
9763 }
9764 int r = alloc->reserve(need);
9765 if (r < 0) {
9766 derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
9767 << dendl;
9768 return r;
9769 }
9770
9771 uint64_t hint = 0;
9772 CompressorRef c;
9773 double crr = 0;
9774 if (wctx->compress) {
9775 c = select_option(
9776 "compression_algorithm",
9777 compressor,
9778 [&]() {
9779 string val;
9780 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
9781 CompressorRef cp = compressor;
9782 if (!cp || cp->get_type_name() != val) {
9783 cp = Compressor::create(cct, val);
9784 }
9785 return boost::optional<CompressorRef>(cp);
9786 }
9787 return boost::optional<CompressorRef>();
9788 }
9789 );
9790
9791 crr = select_option(
9792 "compression_required_ratio",
9793 cct->_conf->bluestore_compression_required_ratio,
9794 [&]() {
9795 double val;
9796 if(coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
9797 return boost::optional<double>(val);
9798 }
9799 return boost::optional<double>();
9800 }
9801 );
9802 }
9803
9804 // checksum
9805 int csum = csum_type.load();
9806 csum = select_option(
9807 "csum_type",
9808 csum,
9809 [&]() {
9810 int val;
9811 if(coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
9812 return boost::optional<int>(val);
9813 }
9814 return boost::optional<int>();
9815 }
9816 );
9817
9818 for (auto& wi : wctx->writes) {
9819 BlobRef b = wi.b;
9820 bluestore_blob_t& dblob = b->dirty_blob();
9821 uint64_t b_off = wi.b_off;
9822 bufferlist *l = &wi.bl;
9823 uint64_t final_length = wi.blob_length;
9824 uint64_t csum_length = wi.blob_length;
9825 unsigned csum_order = block_size_order;
9826 bufferlist compressed_bl;
9827 bool compressed = false;
9828 if(c && wi.blob_length > min_alloc_size) {
9829
9830 utime_t start = ceph_clock_now();
9831
9832 // compress
9833 assert(b_off == 0);
9834 assert(wi.blob_length == l->length());
9835 bluestore_compression_header_t chdr;
9836 chdr.type = c->get_type();
9837 // FIXME: memory alignment here is bad
9838 bufferlist t;
9839
9840 r = c->compress(*l, t);
9841 assert(r == 0);
9842
9843 chdr.length = t.length();
9844 ::encode(chdr, compressed_bl);
9845 compressed_bl.claim_append(t);
9846 uint64_t rawlen = compressed_bl.length();
9847 uint64_t newlen = P2ROUNDUP(rawlen, min_alloc_size);
9848 uint64_t want_len_raw = final_length * crr;
9849 uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
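// e.g. with min_alloc_size 0x10000 and crr 0.875, a 0x40000 blob that
// compresses to a raw 0x23000 gives newlen 0x30000 <= want_len 0x38000,
// so the compressed copy is kept; at a raw 0x39000 it would round up to
// 0x40000 and be rejected as not worth it.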
9850 if (newlen <= want_len && newlen < final_length) {
9851 // Cool. We compressed at least as much as we were hoping to.
9852 // pad out to min_alloc_size
9853 compressed_bl.append_zero(newlen - rawlen);
9854 logger->inc(l_bluestore_write_pad_bytes, newlen - rawlen);
9855 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
9856 << " -> 0x" << rawlen << " => 0x" << newlen
9857 << " with " << c->get_type()
9858 << std::dec << dendl;
9859 txc->statfs_delta.compressed() += rawlen;
9860 txc->statfs_delta.compressed_original() += l->length();
9861 txc->statfs_delta.compressed_allocated() += newlen;
9862 l = &compressed_bl;
9863 final_length = newlen;
9864 csum_length = newlen;
9865 csum_order = ctz(newlen);
9866 dblob.set_compressed(wi.blob_length, rawlen);
9867 compressed = true;
9868 logger->inc(l_bluestore_compress_success_count);
9869 } else {
9870 dout(20) << __func__ << std::hex << " 0x" << l->length()
9871 << " compressed to 0x" << rawlen << " -> 0x" << newlen
9872 << " with " << c->get_type()
9873 << ", which is more than required 0x" << want_len_raw
9874 << " -> 0x" << want_len
9875 << ", leaving uncompressed"
9876 << std::dec << dendl;
9877 logger->inc(l_bluestore_compress_rejected_count);
9878 }
9879 logger->tinc(l_bluestore_compress_lat,
9880 ceph_clock_now() - start);
9881 }
9882 if (!compressed && wi.new_blob) {
9883 // initialize newly created blob only
31f18b77 9884 assert(dblob.is_mutable());
7c673cae
FG
9885 if (l->length() != wi.blob_length) {
9886 // hrm, maybe we could do better here, but let's not bother.
9887 dout(20) << __func__ << " forcing csum_order to block_size_order "
9888 << block_size_order << dendl;
31f18b77 9889 csum_order = block_size_order;
7c673cae
FG
9890 } else {
9891 csum_order = std::min(wctx->csum_order, ctz(l->length()));
9892 }
9893 // try to align blob with max_blob_size to improve
9894 // its reuse ratio, e.g. in case of reverse write
9895 uint32_t suggested_boff =
9896 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
9897 if ((suggested_boff % (1 << csum_order)) == 0 &&
9898 suggested_boff + final_length <= max_bsize &&
9899 suggested_boff > b_off) {
9900 dout(20) << __func__ << " forcing blob_offset to "
9901 << std::hex << suggested_boff << std::dec << dendl;
9902 assert(suggested_boff >= b_off);
9903 csum_length += suggested_boff - b_off;
9904 b_off = suggested_boff;
9905 }
9906 }
9907
9908 AllocExtentVector extents;
9909 extents.reserve(4); // 4 should be (more than) enough for most allocations
9910 int64_t got = alloc->allocate(final_length, min_alloc_size,
9911 max_alloc_size.load(),
9912 hint, &extents);
9913 assert(got == (int64_t)final_length);
9914 need -= got;
9915 txc->statfs_delta.allocated() += got;
9916 for (auto& p : extents) {
9917 bluestore_pextent_t e = bluestore_pextent_t(p);
9918 txc->allocated.insert(e.offset, e.length);
9919 hint = p.end();
9920 }
9921 dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
9922
9923 dout(20) << __func__ << " blob " << *b
9924 << " csum_type " << Checksummer::get_csum_type_string(csum)
9925 << " csum_order " << csum_order
9926 << " csum_length 0x" << std::hex << csum_length << std::dec
9927 << dendl;
9928
9929 if (csum != Checksummer::CSUM_NONE) {
9930 if (!dblob.has_csum()) {
9931 dblob.init_csum(csum, csum_order, csum_length);
9932 }
9933 dblob.calc_csum(b_off, *l);
9934 }
9935 if (wi.mark_unused) {
9936 auto b_end = b_off + wi.bl.length();
9937 if (b_off) {
9938 dblob.add_unused(0, b_off);
9939 }
9940 if (b_end < wi.blob_length) {
9941 dblob.add_unused(b_end, wi.blob_length - b_end);
9942 }
9943 }
9944
9945 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
9946 b_off + (wi.b_off0 - wi.b_off),
9947 wi.length0,
9948 wi.b,
9949 nullptr);
9950 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
9951 txc->statfs_delta.stored() += le->length;
9952 dout(20) << __func__ << " lex " << *le << dendl;
9953 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
9954 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9955
9956 // queue io
9957 if (!g_conf->bluestore_debug_omit_block_device_write) {
9958 if (l->length() <= prefer_deferred_size.load()) {
9959 dout(20) << __func__ << " deferring small 0x" << std::hex
9960 << l->length() << std::dec << " write via deferred" << dendl;
9961 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9962 op->op = bluestore_deferred_op_t::OP_WRITE;
9963 int r = b->get_blob().map(
9964 b_off, l->length(),
9965 [&](uint64_t offset, uint64_t length) {
9966 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9967 return 0;
9968 });
9969 assert(r == 0);
9970 op->data = *l;
9971 } else {
9972 b->get_blob().map_bl(
9973 b_off, *l,
9974 [&](uint64_t offset, bufferlist& t) {
9975 bdev->aio_write(offset, t, &txc->ioc, false);
9976 });
9977 }
9978 }
9979 }
9980 if (need > 0) {
9981 alloc->unreserve(need);
9982 }
9983 return 0;
9984}
9985
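// Release the logical extents displaced by a write: de-reference each old
// extent's blob (resolving shared-blob refcounts when needed), hand the
// freed physical space back via txc->released, update the statfs deltas,
// and drop spanning blobs that are no longer referenced.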
9986void BlueStore::_wctx_finish(
9987 TransContext *txc,
9988 CollectionRef& c,
9989 OnodeRef o,
31f18b77
FG
9990 WriteContext *wctx,
9991 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
9992{
9993 auto oep = wctx->old_extents.begin();
9994 while (oep != wctx->old_extents.end()) {
9995 auto &lo = *oep;
9996 oep = wctx->old_extents.erase(oep);
9997 dout(20) << __func__ << " lex_old " << lo.e << dendl;
9998 BlobRef b = lo.e.blob;
9999 const bluestore_blob_t& blob = b->get_blob();
10000 if (blob.is_compressed()) {
10001 if (lo.blob_empty) {
10002 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
10003 }
10004 txc->statfs_delta.compressed_original() -= lo.e.length;
10005 }
10006 auto& r = lo.r;
10007 txc->statfs_delta.stored() -= lo.e.length;
10008 if (!r.empty()) {
10009 dout(20) << __func__ << " blob release " << r << dendl;
10010 if (blob.is_shared()) {
10011 PExtentVector final;
10012 c->load_shared_blob(b->shared_blob);
10013 for (auto e : r) {
31f18b77
FG
10014 b->shared_blob->put_ref(
10015 e.offset, e.length, &final,
10016 b->is_referenced() ? nullptr : maybe_unshared_blobs);
7c673cae
FG
10017 }
10018 dout(20) << __func__ << " shared_blob release " << final
10019 << " from " << *b->shared_blob << dendl;
10020 txc->write_shared_blob(b->shared_blob);
10021 r.clear();
10022 r.swap(final);
10023 }
10024 }
10025 // we can't invalidate our logical extents as we drop them because
10026 // other lextents (either in our onode or others) may still
10027 // reference them. but we can throw out anything that is no
10028 // longer allocated. Note that this will leave behind edge bits
10029 // that are no longer referenced but not deallocated (until they
10030 // age out of the cache naturally).
10031 b->discard_unallocated(c.get());
10032 for (auto e : r) {
10033 dout(20) << __func__ << " release " << e << dendl;
10034 txc->released.insert(e.offset, e.length);
10035 txc->statfs_delta.allocated() -= e.length;
10036 if (blob.is_compressed()) {
10037 txc->statfs_delta.compressed_allocated() -= e.length;
10038 }
10039 }
10040 delete &lo;
10041 if (b->is_spanning() && !b->is_referenced()) {
10042 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
10043 << dendl;
10044 o->extent_map.spanning_blob_map.erase(b->id);
10045 }
10046 }
10047}
10048
10049void BlueStore::_do_write_data(
10050 TransContext *txc,
10051 CollectionRef& c,
10052 OnodeRef o,
10053 uint64_t offset,
10054 uint64_t length,
10055 bufferlist& bl,
10056 WriteContext *wctx)
10057{
10058 uint64_t end = offset + length;
10059 bufferlist::iterator p = bl.begin();
10060
10061 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
10062 (length != min_alloc_size)) {
10063 // we fall within the same block
10064 _do_write_small(txc, c, o, offset, length, p, wctx);
10065 } else {
10066 uint64_t head_offset, head_length;
10067 uint64_t middle_offset, middle_length;
10068 uint64_t tail_offset, tail_length;
10069
10070 head_offset = offset;
10071 head_length = P2NPHASE(offset, min_alloc_size);
10072
10073 tail_offset = P2ALIGN(end, min_alloc_size);
10074 tail_length = P2PHASE(end, min_alloc_size);
10075
10076 middle_offset = head_offset + head_length;
10077 middle_length = length - head_length - tail_length;
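// e.g. with min_alloc_size 0x10000 a write of 0xe000~0x25000 splits into
// a small head 0xe000~0x2000, a big middle 0x10000~0x20000 and a small
// tail 0x30000~0x3000.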
10078
10079 if (head_length) {
10080 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
10081 }
10082
10083 if (middle_length) {
10084 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
10085 }
10086
10087 if (tail_length) {
10088 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
10089 }
10090 }
10091}
10092
31f18b77
FG
10093void BlueStore::_choose_write_options(
10094 CollectionRef& c,
10095 OnodeRef o,
10096 uint32_t fadvise_flags,
10097 WriteContext *wctx)
7c673cae 10098{
7c673cae
FG
10099 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10100 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 10101 wctx->buffered = true;
7c673cae
FG
10102 } else if (cct->_conf->bluestore_default_buffered_write &&
10103 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10104 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10105 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 10106 wctx->buffered = true;
7c673cae
FG
10107 }
10108
31f18b77
FG
10109 // apply basic csum block size
10110 wctx->csum_order = block_size_order;
7c673cae
FG
10111
10112 // compression parameters
10113 unsigned alloc_hints = o->onode.alloc_hint_flags;
10114 auto cm = select_option(
10115 "compression_mode",
31f18b77 10116 comp_mode.load(),
7c673cae
FG
10117 [&]() {
10118 string val;
10119 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
10120 return boost::optional<Compressor::CompressionMode>(
10121 Compressor::get_comp_mode_type(val));
7c673cae
FG
10122 }
10123 return boost::optional<Compressor::CompressionMode>();
10124 }
10125 );
31f18b77
FG
10126
10127 wctx->compress = (cm != Compressor::COMP_NONE) &&
7c673cae
FG
10128 ((cm == Compressor::COMP_FORCE) ||
10129 (cm == Compressor::COMP_AGGRESSIVE &&
10130 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
10131 (cm == Compressor::COMP_PASSIVE &&
10132 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
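// i.e. compress when the mode is 'force'; when it is 'aggressive' and the
// object is not hinted incompressible; or when it is 'passive' and the
// object is explicitly hinted compressible.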
10133
10134 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
10135 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
31f18b77
FG
10136 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
10137 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 10138 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 10139
7c673cae 10140 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 10141
7c673cae 10142 if (o->onode.expected_write_size) {
224ce89b 10143 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 10144 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 10145 } else {
224ce89b 10146 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
10147 }
10148
31f18b77
FG
10149 if (wctx->compress) {
10150 wctx->target_blob_size = select_option(
7c673cae 10151 "compression_max_blob_size",
31f18b77 10152 comp_max_blob_size.load(),
7c673cae
FG
10153 [&]() {
10154 int val;
10155 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
10156 return boost::optional<uint64_t>((uint64_t)val);
10157 }
10158 return boost::optional<uint64_t>();
10159 }
10160 );
10161 }
10162 } else {
31f18b77
FG
10163 if (wctx->compress) {
10164 wctx->target_blob_size = select_option(
7c673cae 10165 "compression_min_blob_size",
31f18b77 10166 comp_min_blob_size.load(),
7c673cae
FG
10167 [&]() {
10168 int val;
10169 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
10170 return boost::optional<uint64_t>((uint64_t)val);
10171 }
10172 return boost::optional<uint64_t>();
10173 }
10174 );
10175 }
10176 }
31f18b77 10177
7c673cae 10178 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
10179 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
10180 wctx->target_blob_size = max_bsize;
7c673cae 10181 }
31f18b77 10182
7c673cae
FG
10183 // set the min blob size floor at 2x the min_alloc_size, or else we
10184 // won't be able to allocate a smaller extent for the compressed
10185 // data.
31f18b77
FG
10186 if (wctx->compress &&
10187 wctx->target_blob_size < min_alloc_size * 2) {
10188 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 10189 }
31f18b77
FG
10190
10191 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
10192 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
10193 << std::dec << dendl;
10194}
10195
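// Re-write the extents the GarbageCollector selected: read each range
// back, push it through a forked WriteContext so it is re-allocated
// alongside the current write, and widen [dirty_start, dirty_end) to
// cover everything touched.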
10196int BlueStore::_do_gc(
10197 TransContext *txc,
10198 CollectionRef& c,
10199 OnodeRef o,
10200 const GarbageCollector& gc,
10201 const WriteContext& wctx,
10202 uint64_t *dirty_start,
10203 uint64_t *dirty_end)
10204{
10205 auto& extents_to_collect = gc.get_extents_to_collect();
10206
10207 WriteContext wctx_gc;
7c673cae 10208 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 10209
31f18b77
FG
10210 for (auto it = extents_to_collect.begin();
10211 it != extents_to_collect.end();
10212 ++it) {
10213 bufferlist bl;
10214 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
10215 assert(r == (int)it->length);
10216
10217 o->extent_map.fault_range(db, it->offset, it->length);
10218 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
10219 logger->inc(l_bluestore_gc_merged, it->length);
10220
10221 if (*dirty_start > it->offset) {
10222 *dirty_start = it->offset;
10223 }
10224
10225 if (*dirty_end < it->offset + it->length) {
10226 *dirty_end = it->offset + it->length;
10227 }
10228 }
10229
10230 dout(30) << __func__ << " alloc write" << dendl;
10231 int r = _do_alloc_write(txc, c, o, &wctx_gc);
10232 if (r < 0) {
10233 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10234 << dendl;
10235 return r;
10236 }
10237
10238 _wctx_finish(txc, c, o, &wctx_gc);
10239 return 0;
10240}
10241
10242int BlueStore::_do_write(
10243 TransContext *txc,
10244 CollectionRef& c,
10245 OnodeRef o,
10246 uint64_t offset,
10247 uint64_t length,
10248 bufferlist& bl,
10249 uint32_t fadvise_flags)
10250{
10251 int r = 0;
10252
10253 dout(20) << __func__
10254 << " " << o->oid
10255 << " 0x" << std::hex << offset << "~" << length
10256 << " - have 0x" << o->onode.size
10257 << " (" << std::dec << o->onode.size << ")"
10258 << " bytes"
10259 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
10260 << dendl;
10261 _dump_onode(o);
10262
10263 if (length == 0) {
10264 return 0;
10265 }
10266
10267 uint64_t end = offset + length;
10268
10269 GarbageCollector gc(c->store->cct);
10270 int64_t benefit;
10271 auto dirty_start = offset;
10272 auto dirty_end = end;
10273
10274 WriteContext wctx;
10275 _choose_write_options(c, o, fadvise_flags, &wctx);
7c673cae
FG
10276 o->extent_map.fault_range(db, offset, length);
10277 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
10278 r = _do_alloc_write(txc, c, o, &wctx);
10279 if (r < 0) {
10280 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10281 << dendl;
10282 goto out;
10283 }
10284
31f18b77
FG
10285 // NB: _wctx_finish() will empty old_extents
10286 // so we must do gc estimation before that
7c673cae 10287 benefit = gc.estimate(offset,
31f18b77
FG
10288 length,
10289 o->extent_map,
10290 wctx.old_extents,
10291 min_alloc_size);
7c673cae
FG
10292
10293 _wctx_finish(txc, c, o, &wctx);
10294 if (end > o->onode.size) {
10295 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 10296 << std::dec << dendl;
7c673cae
FG
10297 o->onode.size = end;
10298 }
10299
10300 if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
31f18b77
FG
10301 if (!gc.get_extents_to_collect().empty()) {
10302 dout(20) << __func__ << " perform garbage collection, "
10303 << "expected benefit = " << benefit << " AUs" << dendl;
10304 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
10305 if (r < 0) {
10306 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
10307 << dendl;
10308 goto out;
7c673cae
FG
10309 }
10310 }
10311 }
7c673cae
FG
10312
10313 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
10314 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
10315
7c673cae
FG
10316 r = 0;
10317
10318 out:
10319 return r;
10320}
10321
10322int BlueStore::_write(TransContext *txc,
10323 CollectionRef& c,
10324 OnodeRef& o,
31f18b77
FG
10325 uint64_t offset, size_t length,
10326 bufferlist& bl,
10327 uint32_t fadvise_flags)
7c673cae
FG
10328{
10329 dout(15) << __func__ << " " << c->cid << " " << o->oid
10330 << " 0x" << std::hex << offset << "~" << length << std::dec
10331 << dendl;
7c673cae
FG
10332 _assign_nid(txc, o);
10333 int r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
10334 txc->write_onode(o);
10335
10336 dout(10) << __func__ << " " << c->cid << " " << o->oid
10337 << " 0x" << std::hex << offset << "~" << length << std::dec
10338 << " = " << r << dendl;
10339 return r;
10340}
10341
10342int BlueStore::_zero(TransContext *txc,
10343 CollectionRef& c,
10344 OnodeRef& o,
10345 uint64_t offset, size_t length)
10346{
10347 dout(15) << __func__ << " " << c->cid << " " << o->oid
10348 << " 0x" << std::hex << offset << "~" << length << std::dec
10349 << dendl;
7c673cae
FG
10350 _assign_nid(txc, o);
10351 int r = _do_zero(txc, c, o, offset, length);
10352 dout(10) << __func__ << " " << c->cid << " " << o->oid
10353 << " 0x" << std::hex << offset << "~" << length << std::dec
10354 << " = " << r << dendl;
10355 return r;
10356}
10357
10358int BlueStore::_do_zero(TransContext *txc,
10359 CollectionRef& c,
10360 OnodeRef& o,
10361 uint64_t offset, size_t length)
10362{
10363 dout(15) << __func__ << " " << c->cid << " " << o->oid
10364 << " 0x" << std::hex << offset << "~" << length << std::dec
10365 << dendl;
10366 int r = 0;
10367
10368 _dump_onode(o);
10369
10370 WriteContext wctx;
10371 o->extent_map.fault_range(db, offset, length);
10372 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 10373 o->extent_map.dirty_range(offset, length);
7c673cae
FG
10374 _wctx_finish(txc, c, o, &wctx);
10375
10376 if (offset + length > o->onode.size) {
10377 o->onode.size = offset + length;
10378 dout(20) << __func__ << " extending size to " << offset + length
10379 << dendl;
10380 }
10381 txc->write_onode(o);
10382
10383 dout(10) << __func__ << " " << c->cid << " " << o->oid
10384 << " 0x" << std::hex << offset << "~" << length << std::dec
10385 << " = " << r << dendl;
10386 return r;
10387}
10388
10389void BlueStore::_do_truncate(
31f18b77
FG
10390 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
10391 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
10392{
10393 dout(15) << __func__ << " " << c->cid << " " << o->oid
10394 << " 0x" << std::hex << offset << std::dec << dendl;
10395
10396 _dump_onode(o, 30);
10397
10398 if (offset == o->onode.size)
31f18b77 10399 return;
7c673cae
FG
10400
10401 if (offset < o->onode.size) {
10402 WriteContext wctx;
10403 uint64_t length = o->onode.size - offset;
10404 o->extent_map.fault_range(db, offset, length);
10405 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77
FG
10406 o->extent_map.dirty_range(offset, length);
10407 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
10408
10409 // if we have shards past EOF, ask for a reshard
10410 if (!o->onode.extent_map_shards.empty() &&
10411 o->onode.extent_map_shards.back().offset >= offset) {
10412 dout(10) << __func__ << " request reshard past EOF" << dendl;
10413 if (offset) {
10414 o->extent_map.request_reshard(offset - 1, offset + length);
10415 } else {
10416 o->extent_map.request_reshard(0, length);
10417 }
10418 }
10419 }
10420
10421 o->onode.size = offset;
10422
10423 txc->write_onode(o);
10424}
10425
10426void BlueStore::_truncate(TransContext *txc,
10427 CollectionRef& c,
10428 OnodeRef& o,
10429 uint64_t offset)
10430{
10431 dout(15) << __func__ << " " << c->cid << " " << o->oid
10432 << " 0x" << std::hex << offset << std::dec
10433 << dendl;
10434 _do_truncate(txc, c, o, offset);
10435}
10436
10437int BlueStore::_do_remove(
10438 TransContext *txc,
10439 CollectionRef& c,
10440 OnodeRef o)
10441{
31f18b77 10442 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
10443 bool is_gen = !o->oid.is_no_gen();
10444 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
10445 if (o->onode.has_omap()) {
10446 o->flush();
10447 _do_omap_clear(txc, o->onode.nid);
10448 }
10449 o->exists = false;
10450 string key;
10451 for (auto &s : o->extent_map.shards) {
10452 dout(20) << __func__ << " removing shard 0x" << std::hex
10453 << s.shard_info->offset << std::dec << dendl;
10454 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
10455 [&](const string& final_key) {
10456 txc->t->rmkey(PREFIX_OBJ, final_key);
10457 }
10458 );
10459 }
10460 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
10461 txc->removed(o);
10462 o->extent_map.clear();
10463 o->onode = bluestore_onode_t();
10464 _debug_obj_on_delete(o->oid);
31f18b77 10465
224ce89b
WB
10466 if (!is_gen || maybe_unshared_blobs.empty()) {
10467 return 0;
10468 }
31f18b77 10469
224ce89b
WB
10470 // see if we can unshare blobs still referenced by the head
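// For each candidate shared blob, build the ref map the head object alone
// would hold; if that matches the blob's persistent ref_map exactly, the
// head is the last user, so the blob can be marked unshared and its
// PREFIX_SHARED_BLOB key removed.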
10471 dout(10) << __func__ << " gen and maybe_unshared_blobs "
10472 << maybe_unshared_blobs << dendl;
10473 ghobject_t nogen = o->oid;
10474 nogen.generation = ghobject_t::NO_GEN;
10475 OnodeRef h = c->onode_map.lookup(nogen);
10476
10477 if (!h || !h->exists) {
10478 return 0;
10479 }
10480
10481 dout(20) << __func__ << " checking for unshareable blobs on " << h
10482 << " " << h->oid << dendl;
10483 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
10484 for (auto& e : h->extent_map.extent_map) {
10485 const bluestore_blob_t& b = e.blob->get_blob();
10486 SharedBlob *sb = e.blob->shared_blob.get();
10487 if (b.is_shared() &&
10488 sb->loaded &&
10489 maybe_unshared_blobs.count(sb)) {
10490 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
10491 expect[sb].get(off, len);
10492 return 0;
10493 });
10494 }
10495 }
31f18b77 10496
224ce89b
WB
10497 vector<SharedBlob*> unshared_blobs;
10498 unshared_blobs.reserve(maybe_unshared_blobs.size());
10499 for (auto& p : expect) {
10500 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
10501 if (p.first->persistent->ref_map == p.second) {
10502 SharedBlob *sb = p.first;
10503 dout(20) << __func__ << " unsharing " << *sb << dendl;
10504 unshared_blobs.push_back(sb);
10505 txc->unshare_blob(sb);
10506 uint64_t sbid = c->make_blob_unshared(sb);
10507 string key;
10508 get_shared_blob_key(sbid, &key);
10509 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
10510 }
10511 }
10512
10513 if (unshared_blobs.empty()) {
10514 return 0;
10515 }
10516
224ce89b
WB
10517 for (auto& e : h->extent_map.extent_map) {
10518 const bluestore_blob_t& b = e.blob->get_blob();
10519 SharedBlob *sb = e.blob->shared_blob.get();
10520 if (b.is_shared() &&
10521 std::find(unshared_blobs.begin(), unshared_blobs.end(),
10522 sb) != unshared_blobs.end()) {
10523 dout(20) << __func__ << " unsharing " << e << dendl;
10524 bluestore_blob_t& blob = e.blob->dirty_blob();
10525 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 10526 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
10527 }
10528 }
224ce89b
WB
10529 txc->write_onode(h);
10530
7c673cae
FG
10531 return 0;
10532}
10533
10534int BlueStore::_remove(TransContext *txc,
10535 CollectionRef& c,
10536 OnodeRef &o)
10537{
10538 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10539 int r = _do_remove(txc, c, o);
10540 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10541 return r;
10542}
10543
10544int BlueStore::_setattr(TransContext *txc,
10545 CollectionRef& c,
10546 OnodeRef& o,
10547 const string& name,
10548 bufferptr& val)
10549{
10550 dout(15) << __func__ << " " << c->cid << " " << o->oid
10551 << " " << name << " (" << val.length() << " bytes)"
10552 << dendl;
10553 int r = 0;
10554 if (val.is_partial())
10555 o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), val.length());
10556 else
10557 o->onode.attrs[name.c_str()] = val;
10558 txc->write_onode(o);
10559 dout(10) << __func__ << " " << c->cid << " " << o->oid
10560 << " " << name << " (" << val.length() << " bytes)"
10561 << " = " << r << dendl;
10562 return r;
10563}
10564
10565int BlueStore::_setattrs(TransContext *txc,
10566 CollectionRef& c,
10567 OnodeRef& o,
10568 const map<string,bufferptr>& aset)
10569{
10570 dout(15) << __func__ << " " << c->cid << " " << o->oid
10571 << " " << aset.size() << " keys"
10572 << dendl;
10573 int r = 0;
10574 for (map<string,bufferptr>::const_iterator p = aset.begin();
10575 p != aset.end(); ++p) {
10576 if (p->second.is_partial())
10577 o->onode.attrs[p->first.c_str()] =
10578 bufferptr(p->second.c_str(), p->second.length());
10579 else
10580 o->onode.attrs[p->first.c_str()] = p->second;
10581 }
10582 txc->write_onode(o);
10583 dout(10) << __func__ << " " << c->cid << " " << o->oid
10584 << " " << aset.size() << " keys"
10585 << " = " << r << dendl;
10586 return r;
10587}
10588
10589
10590int BlueStore::_rmattr(TransContext *txc,
10591 CollectionRef& c,
10592 OnodeRef& o,
10593 const string& name)
10594{
10595 dout(15) << __func__ << " " << c->cid << " " << o->oid
10596 << " " << name << dendl;
10597 int r = 0;
10598 auto it = o->onode.attrs.find(name.c_str());
10599 if (it == o->onode.attrs.end())
10600 goto out;
10601
10602 o->onode.attrs.erase(it);
10603 txc->write_onode(o);
10604
10605 out:
10606 dout(10) << __func__ << " " << c->cid << " " << o->oid
10607 << " " << name << " = " << r << dendl;
10608 return r;
10609}
10610
10611int BlueStore::_rmattrs(TransContext *txc,
10612 CollectionRef& c,
10613 OnodeRef& o)
10614{
10615 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10616 int r = 0;
10617
10618 if (o->onode.attrs.empty())
10619 goto out;
10620
10621 o->onode.attrs.clear();
10622 txc->write_onode(o);
10623
10624 out:
10625 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10626 return r;
10627}
10628
10629void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
10630{
10631 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10632 string prefix, tail;
10633 get_omap_header(id, &prefix);
10634 get_omap_tail(id, &tail);
10635 it->lower_bound(prefix);
10636 while (it->valid()) {
10637 if (it->key() >= tail) {
10638 dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
10639 << dendl;
10640 break;
10641 }
10642 txc->t->rmkey(PREFIX_OMAP, it->key());
10643 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10644 it->next();
10645 }
10646}
10647
10648int BlueStore::_omap_clear(TransContext *txc,
10649 CollectionRef& c,
10650 OnodeRef& o)
10651{
10652 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10653 int r = 0;
10654 if (o->onode.has_omap()) {
10655 o->flush();
10656 _do_omap_clear(txc, o->onode.nid);
10657 o->onode.clear_omap_flag();
10658 txc->write_onode(o);
10659 }
10660 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10661 return r;
10662}
10663
10664int BlueStore::_omap_setkeys(TransContext *txc,
10665 CollectionRef& c,
10666 OnodeRef& o,
10667 bufferlist &bl)
10668{
10669 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10670 int r;
10671 bufferlist::iterator p = bl.begin();
10672 __u32 num;
10673 if (!o->onode.has_omap()) {
10674 o->onode.set_omap_flag();
10675 txc->write_onode(o);
10676 } else {
10677 txc->note_modified_object(o);
10678 }
10679 string final_key;
10680 _key_encode_u64(o->onode.nid, &final_key);
10681 final_key.push_back('.');
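// the encoded nid (8 bytes) plus the '.' separator form the 9-byte prefix
// that resize(9) below preserves between keys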
10682 ::decode(num, p);
10683 while (num--) {
10684 string key;
10685 bufferlist value;
10686 ::decode(key, p);
10687 ::decode(value, p);
10688 final_key.resize(9); // keep prefix
10689 final_key += key;
10690 dout(30) << __func__ << " " << pretty_binary_string(final_key)
10691 << " <- " << key << dendl;
10692 txc->t->set(PREFIX_OMAP, final_key, value);
10693 }
10694 r = 0;
10695 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10696 return r;
10697}
10698
10699int BlueStore::_omap_setheader(TransContext *txc,
10700 CollectionRef& c,
10701 OnodeRef &o,
10702 bufferlist& bl)
10703{
10704 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10705 int r;
10706 string key;
10707 if (!o->onode.has_omap()) {
10708 o->onode.set_omap_flag();
10709 txc->write_onode(o);
10710 } else {
10711 txc->note_modified_object(o);
10712 }
10713 get_omap_header(o->onode.nid, &key);
10714 txc->t->set(PREFIX_OMAP, key, bl);
10715 r = 0;
10716 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10717 return r;
10718}
10719
10720int BlueStore::_omap_rmkeys(TransContext *txc,
10721 CollectionRef& c,
10722 OnodeRef& o,
10723 bufferlist& bl)
10724{
10725 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10726 int r = 0;
10727 bufferlist::iterator p = bl.begin();
10728 __u32 num;
10729 string final_key;
10730
10731 if (!o->onode.has_omap()) {
10732 goto out;
10733 }
10734 _key_encode_u64(o->onode.nid, &final_key);
10735 final_key.push_back('.');
10736 ::decode(num, p);
10737 while (num--) {
10738 string key;
10739 ::decode(key, p);
10740 final_key.resize(9); // keep prefix
10741 final_key += key;
10742 dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
10743 << " <- " << key << dendl;
10744 txc->t->rmkey(PREFIX_OMAP, final_key);
10745 }
10746 txc->note_modified_object(o);
10747
10748 out:
10749 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10750 return r;
10751}
10752
10753int BlueStore::_omap_rmkey_range(TransContext *txc,
10754 CollectionRef& c,
10755 OnodeRef& o,
10756 const string& first, const string& last)
10757{
10758 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10759 KeyValueDB::Iterator it;
10760 string key_first, key_last;
10761 int r = 0;
10762 if (!o->onode.has_omap()) {
10763 goto out;
10764 }
10765 o->flush();
10766 it = db->get_iterator(PREFIX_OMAP);
10767 get_omap_key(o->onode.nid, first, &key_first);
10768 get_omap_key(o->onode.nid, last, &key_last);
10769 it->lower_bound(key_first);
10770 while (it->valid()) {
10771 if (it->key() >= key_last) {
10772 dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
10773 << dendl;
10774 break;
10775 }
10776 txc->t->rmkey(PREFIX_OMAP, it->key());
10777 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10778 it->next();
10779 }
10780 txc->note_modified_object(o);
10781
10782 out:
10783 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10784 return r;
10785}
10786
10787int BlueStore::_set_alloc_hint(
10788 TransContext *txc,
10789 CollectionRef& c,
10790 OnodeRef& o,
10791 uint64_t expected_object_size,
10792 uint64_t expected_write_size,
10793 uint32_t flags)
10794{
10795 dout(15) << __func__ << " " << c->cid << " " << o->oid
10796 << " object_size " << expected_object_size
10797 << " write_size " << expected_write_size
10798 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
10799 << dendl;
10800 int r = 0;
10801 o->onode.expected_object_size = expected_object_size;
10802 o->onode.expected_write_size = expected_write_size;
10803 o->onode.alloc_hint_flags = flags;
10804 txc->write_onode(o);
10805 dout(10) << __func__ << " " << c->cid << " " << o->oid
10806 << " object_size " << expected_object_size
10807 << " write_size " << expected_write_size
10808 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
10809 << " = " << r << dendl;
10810 return r;
10811}
10812
10813int BlueStore::_clone(TransContext *txc,
10814 CollectionRef& c,
10815 OnodeRef& oldo,
10816 OnodeRef& newo)
10817{
10818 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10819 << newo->oid << dendl;
10820 int r = 0;
10821 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
10822 derr << __func__ << " mismatched hash on " << oldo->oid
10823 << " and " << newo->oid << dendl;
10824 return -EINVAL;
10825 }
10826
7c673cae
FG
10827 _assign_nid(txc, newo);
10828
10829 // clone data
10830 oldo->flush();
10831 _do_truncate(txc, c, newo, 0);
10832 if (cct->_conf->bluestore_clone_cow) {
10833 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
10834 } else {
10835 bufferlist bl;
10836 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
10837 if (r < 0)
10838 goto out;
10839 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
10840 if (r < 0)
10841 goto out;
10842 }
10843
10844 // clone attrs
10845 newo->onode.attrs = oldo->onode.attrs;
10846
10847 // clone omap
10848 if (newo->onode.has_omap()) {
10849 dout(20) << __func__ << " clearing old omap data" << dendl;
10850 newo->flush();
10851 _do_omap_clear(txc, newo->onode.nid);
10852 }
10853 if (oldo->onode.has_omap()) {
10854 dout(20) << __func__ << " copying omap data" << dendl;
10855 if (!newo->onode.has_omap()) {
10856 newo->onode.set_omap_flag();
10857 }
10858 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10859 string head, tail;
10860 get_omap_header(oldo->onode.nid, &head);
10861 get_omap_tail(oldo->onode.nid, &tail);
10862 it->lower_bound(head);
10863 while (it->valid()) {
10864 if (it->key() >= tail) {
10865 dout(30) << __func__ << " reached tail" << dendl;
10866 break;
10867 } else {
10868 dout(30) << __func__ << " got header/data "
10869 << pretty_binary_string(it->key()) << dendl;
10870 string key;
10871 rewrite_omap_key(newo->onode.nid, it->key(), &key);
10872 txc->t->set(PREFIX_OMAP, key, it->value());
10873 }
10874 it->next();
10875 }
10876 } else {
10877 newo->onode.clear_omap_flag();
10878 }
10879
10880 txc->write_onode(newo);
10881 r = 0;
10882
10883 out:
10884 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10885 << newo->oid << " = " << r << dendl;
10886 return r;
10887}
10888
10889int BlueStore::_do_clone_range(
10890 TransContext *txc,
10891 CollectionRef& c,
10892 OnodeRef& oldo,
10893 OnodeRef& newo,
224ce89b
WB
10894 uint64_t srcoff,
10895 uint64_t length,
10896 uint64_t dstoff)
7c673cae
FG
10897{
10898 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10899 << newo->oid
10900 << " 0x" << std::hex << srcoff << "~" << length << " -> "
10901 << " 0x" << dstoff << "~" << length << std::dec << dendl;
10902 oldo->extent_map.fault_range(db, srcoff, length);
10903 newo->extent_map.fault_range(db, dstoff, length);
10904 _dump_onode(oldo);
10905 _dump_onode(newo);
10906
10907 // hmm, this could go into an ExtentMap::dup() method.
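// Each source blob touched by the range is marked shared (bumping refs on
// its physical extents) and duplicated once; the destination onode then
// gets new extents pointing at the duplicate, so both objects reference
// the same on-disk data copy-on-write style.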
10908 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
10909 for (auto &e : oldo->extent_map.extent_map) {
10910 e.blob->last_encoded_id = -1;
10911 }
10912 int n = 0;
7c673cae 10913 uint64_t end = srcoff + length;
224ce89b
WB
10914 uint32_t dirty_range_begin = 0;
10915 uint32_t dirty_range_end = 0;
7c673cae
FG
10916 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
10917 ep != oldo->extent_map.extent_map.end();
10918 ++ep) {
10919 auto& e = *ep;
10920 if (e.logical_offset >= end) {
10921 break;
10922 }
10923 dout(20) << __func__ << " src " << e << dendl;
10924 BlobRef cb;
10925 bool blob_duped = true;
10926 if (e.blob->last_encoded_id >= 0) {
10927 // blob is already duped
10928 cb = id_to_blob[e.blob->last_encoded_id];
10929 blob_duped = false;
10930 } else {
10931 // dup the blob
10932 const bluestore_blob_t& blob = e.blob->get_blob();
10933 // make sure it is shared
10934 if (!blob.is_shared()) {
10935 c->make_blob_shared(_assign_blobid(txc), e.blob);
d2e6a577 10936 if (dirty_range_begin == 0 && dirty_range_end == 0) {
224ce89b
WB
10937 dirty_range_begin = e.logical_offset;
10938 }
10939 assert(e.logical_end() > 0);
10940 // -1 to exclude next potential shard
10941 dirty_range_end = e.logical_end() - 1;
7c673cae
FG
10942 } else {
10943 c->load_shared_blob(e.blob->shared_blob);
10944 }
10945 cb = new Blob();
10946 e.blob->last_encoded_id = n;
10947 id_to_blob[n] = cb;
10948 e.blob->dup(*cb);
10949 // bump the extent refs on the copied blob's extents
10950 for (auto p : blob.get_extents()) {
10951 if (p.is_valid()) {
10952 e.blob->shared_blob->get_ref(p.offset, p.length);
10953 }
10954 }
10955 txc->write_shared_blob(e.blob->shared_blob);
10956 dout(20) << __func__ << " new " << *cb << dendl;
10957 }
10958 // dup extent
10959 int skip_front, skip_back;
10960 if (e.logical_offset < srcoff) {
10961 skip_front = srcoff - e.logical_offset;
10962 } else {
10963 skip_front = 0;
10964 }
10965 if (e.logical_end() > end) {
10966 skip_back = e.logical_end() - end;
10967 } else {
10968 skip_back = 0;
10969 }
10970 Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
10971 e.blob_offset + skip_front,
10972 e.length - skip_front - skip_back, cb);
10973 newo->extent_map.extent_map.insert(*ne);
10974 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
10975 // fixme: we may leave parts of new blob unreferenced that could
10976 // be freed (relative to the shared_blob).
10977 txc->statfs_delta.stored() += ne->length;
10978 if (e.blob->get_blob().is_compressed()) {
10979 txc->statfs_delta.compressed_original() += ne->length;
10980 if (blob_duped){
10981 txc->statfs_delta.compressed() +=
10982 cb->get_blob().get_compressed_payload_length();
10983 }
10984 }
10985 dout(20) << __func__ << " dst " << *ne << dendl;
10986 ++n;
10987 }
224ce89b
WB
10988 if (dirty_range_end > dirty_range_begin) {
10989 oldo->extent_map.dirty_range(dirty_range_begin,
10990 dirty_range_end - dirty_range_begin);
7c673cae
FG
10991 txc->write_onode(oldo);
10992 }
10993 txc->write_onode(newo);
10994
10995 if (dstoff + length > newo->onode.size) {
10996 newo->onode.size = dstoff + length;
10997 }
31f18b77 10998 newo->extent_map.dirty_range(dstoff, length);
7c673cae
FG
10999 _dump_onode(oldo);
11000 _dump_onode(newo);
11001 return 0;
11002}
11003
11004int BlueStore::_clone_range(TransContext *txc,
11005 CollectionRef& c,
11006 OnodeRef& oldo,
11007 OnodeRef& newo,
11008 uint64_t srcoff, uint64_t length, uint64_t dstoff)
11009{
11010 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11011 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11012 << " to offset 0x" << dstoff << std::dec << dendl;
11013 int r = 0;
11014
11015 if (srcoff + length > oldo->onode.size) {
11016 r = -EINVAL;
11017 goto out;
11018 }
11019
11020 _assign_nid(txc, newo);
11021
11022 if (length > 0) {
11023 if (cct->_conf->bluestore_clone_cow) {
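      // copy-on-write path: zero the destination range first, then share the
      // source blobs into newo so no object data is physically copied.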
11024 _do_zero(txc, c, newo, dstoff, length);
11025 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
11026 } else {
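      // non-COW fallback: read the source range and write it back out as new
      // data for the destination object.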
11027 bufferlist bl;
11028 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
11029 if (r < 0)
11030 goto out;
11031 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
11032 if (r < 0)
11033 goto out;
11034 }
11035 }
11036
11037 txc->write_onode(newo);
11038 r = 0;
11039
11040 out:
11041 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11042 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11043 << " to offset 0x" << dstoff << std::dec
11044 << " = " << r << dendl;
11045 return r;
11046}
11047
11048int BlueStore::_rename(TransContext *txc,
11049 CollectionRef& c,
11050 OnodeRef& oldo,
11051 OnodeRef& newo,
11052 const ghobject_t& new_oid)
11053{
11054 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11055 << new_oid << dendl;
11056 int r;
11057 ghobject_t old_oid = oldo->oid;
11058 mempool::bluestore_cache_other::string new_okey;
11059
11060 if (newo) {
11061 if (newo->exists) {
11062 r = -EEXIST;
11063 goto out;
11064 }
11065 assert(txc->onodes.count(newo) == 0);
11066 }
11067
11068 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
11069
11070 // rewrite shards
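  // extent shard keys embed the object key, so remove the old shard keys here
  // and mark every shard dirty so it is rewritten under new_okey.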
11071 {
11072 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
11073 get_object_key(cct, new_oid, &new_okey);
11074 string key;
11075 for (auto &s : oldo->extent_map.shards) {
11076 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
11077 [&](const string& final_key) {
11078 txc->t->rmkey(PREFIX_OBJ, final_key);
11079 }
11080 );
11081 s.dirty = true;
11082 }
11083 }
11084
11085 newo = oldo;
11086 txc->write_onode(newo);
11087
11088 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
11089 // Onode in the old slot
11090 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
11091 r = 0;
11092
11093 out:
11094 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
11095 << new_oid << " = " << r << dendl;
11096 return r;
11097}
11098
11099// collections
11100
11101int BlueStore::_create_collection(
11102 TransContext *txc,
11103 const coll_t &cid,
11104 unsigned bits,
11105 CollectionRef *c)
11106{
11107 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
11108 int r;
11109 bufferlist bl;
11110
11111 {
11112 RWLock::WLocker l(coll_lock);
11113 if (*c) {
11114 r = -EEXIST;
11115 goto out;
11116 }
11117 c->reset(
11118 new Collection(
11119 this,
11120 cache_shards[cid.hash_to_shard(cache_shards.size())],
11121 cid));
11122 (*c)->cnode.bits = bits;
11123 coll_map[cid] = *c;
11124 }
11125 ::encode((*c)->cnode, bl);
11126 txc->t->set(PREFIX_COLL, stringify(cid), bl);
11127 r = 0;
11128
11129 out:
11130 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
11131 return r;
11132}
11133
11134int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
11135 CollectionRef *c)
11136{
11137 dout(15) << __func__ << " " << cid << dendl;
11138 int r;
11139
11140 {
11141 RWLock::WLocker l(coll_lock);
11142 if (!*c) {
11143 r = -ENOENT;
11144 goto out;
11145 }
11146 size_t nonexistent_count = 0;
11147 assert((*c)->exists);
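    // first pass over the in-memory onode_map: any cached onode that still
    // exists makes the collection non-empty; count the cached-but-deleted
    // onodes to bound the db enumeration below.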
11148 if ((*c)->onode_map.map_any([&](OnodeRef o) {
11149 if (o->exists) {
11150 dout(10) << __func__ << " " << o->oid << " " << o
11151 << " exists in onode_map" << dendl;
11152 return true;
11153 }
11154 ++nonexistent_count;
11155 return false;
11156 })) {
11157 r = -ENOTEMPTY;
11158 goto out;
11159 }
11160
11161 vector<ghobject_t> ls;
11162 ghobject_t next;
11163 // Enumerate onodes in the db, up to nonexistent_count + 1,
11164 // then check whether all of them are marked as non-existent.
11165 // Bypass the check if the returned number is greater than nonexistent_count.
11166 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
11167 nonexistent_count + 1, &ls, &next);
11168 if (r >= 0) {
11169 bool exists = false; //ls.size() > nonexistent_count;
11170 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
11171 dout(10) << __func__ << " oid " << *it << dendl;
11172 auto onode = (*c)->onode_map.lookup(*it);
11173 exists = !onode || onode->exists;
11174 if (exists) {
11175 dout(10) << __func__ << " " << *it
11176 << " exists in db" << dendl;
11177 }
11178 }
11179 if (!exists) {
11180 coll_map.erase(cid);
11181 txc->removed_collections.push_back(*c);
11182 (*c)->exists = false;
11183 c->reset();
11184 txc->t->rmkey(PREFIX_COLL, stringify(cid));
11185 r = 0;
11186 } else {
11187 dout(10) << __func__ << " " << cid
11188 << " is non-empty" << dendl;
11189 r = -ENOTEMPTY;
11190 }
11191 }
11192 }
11193
11194 out:
11195 dout(10) << __func__ << " " << cid << " = " << r << dendl;
11196 return r;
11197}
11198
11199int BlueStore::_split_collection(TransContext *txc,
11200 CollectionRef& c,
11201 CollectionRef& d,
11202 unsigned bits, int rem)
11203{
11204 dout(15) << __func__ << " " << c->cid << " to " << d->cid
11205 << " bits " << bits << dendl;
11206 RWLock::WLocker l(c->lock);
11207 RWLock::WLocker l2(d->lock);
11208 int r;
11209
11210 // flush all previous deferred writes on this sequencer. this is a bit
11211 // heavyweight, but we need to make sure all deferred writes complete
11212 // before we split as the new collection's sequencer may need to order
11213 // this after those writes, and we don't bother with the complexity of
11214 // moving those TransContexts over to the new osr.
11215 _osr_drain_preceding(txc);
11216
11217 // move any cached items (onodes and referenced shared blobs) that will
11218 // belong to the child collection post-split. leave everything else behind.
11219 // this may include things that don't strictly belong to the now-smaller
11220 // parent split, but the OSD will always send us a split for every new
11221 // child.
11222
11223 spg_t pgid, dest_pgid;
11224 bool is_pg = c->cid.is_pg(&pgid);
11225 assert(is_pg);
11226 is_pg = d->cid.is_pg(&dest_pgid);
11227 assert(is_pg);
11228
11229 // the destination should initially be empty.
11230 assert(d->onode_map.empty());
11231 assert(d->shared_blob_set.empty());
11232 assert(d->cnode.bits == bits);
11233
11234 c->split_cache(d.get());
11235
11236 // adjust bits. note that this will be redundant for all but the first
11237 // split call for this parent (first child).
11238 c->cnode.bits = bits;
11239 assert(d->cnode.bits == bits);
11240 r = 0;
11241
11242 bufferlist bl;
11243 ::encode(c->cnode, bl);
11244 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
11245
11246 dout(10) << __func__ << " " << c->cid << " to " << d->cid
11247 << " bits " << bits << " = " << r << dendl;
11248 return r;
11249}
11250
11251// DB key value Histogram
11252#define KEY_SLAB 32
11253#define VALUE_SLAB 64
11254
11255const string prefix_onode = "o";
11256const string prefix_onode_shard = "x";
11257const string prefix_other = "Z";
11258
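// Key sizes are bucketed into KEY_SLAB-byte slabs and value sizes into
// VALUE_SLAB-byte slabs; e.g. a 70-byte key lands in key slab 70/32 = 2,
// reported as the range "[64,96)".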
11259int BlueStore::DBHistogram::get_key_slab(size_t sz)
11260{
11261 return (sz/KEY_SLAB);
11262}
11263
11264string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
11265{
11266 int lower_bound = slab * KEY_SLAB;
11267 int upper_bound = (slab + 1) * KEY_SLAB;
11268 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11269 return ret;
11270}
11271
11272int BlueStore::DBHistogram::get_value_slab(size_t sz)
11273{
11274 return (sz/VALUE_SLAB);
11275}
11276
11277string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
11278{
11279 int lower_bound = slab * VALUE_SLAB;
11280 int upper_bound = (slab + 1) * VALUE_SLAB;
11281 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11282 return ret;
11283}
11284
11285void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
11286 const string &prefix, size_t key_size, size_t value_size)
11287{
11288 uint32_t key_slab = get_key_slab(key_size);
11289 uint32_t value_slab = get_value_slab(value_size);
11290 key_hist[prefix][key_slab].count++;
11291 key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len);
11292 key_hist[prefix][key_slab].val_map[value_slab].count++;
11293 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11294 MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
11295}
11296
11297void BlueStore::DBHistogram::dump(Formatter *f)
11298{
11299 f->open_object_section("rocksdb_value_distribution");
11300 for (auto i : value_hist) {
11301 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
11302 }
11303 f->close_section();
11304
11305 f->open_object_section("rocksdb_key_value_histogram");
11306 for (auto i : key_hist) {
11307 f->dump_string("prefix", i.first);
11308 f->open_object_section("key_hist");
11309 for ( auto k : i.second) {
11310 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
11311 f->dump_unsigned("max_len", k.second.max_len);
11312 f->open_object_section("value_hist");
11313 for ( auto j : k.second.val_map) {
11314 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
11315 f->dump_unsigned("max_len", j.second.max_len);
11316 }
11317 f->close_section();
11318 }
11319 f->close_section();
11320 }
11321 f->close_section();
11322}
11323
11324 // Iterates through the db and collects the stats
11325void BlueStore::generate_db_histogram(Formatter *f)
11326{
11327 //globals
11328 uint64_t num_onodes = 0;
11329 uint64_t num_shards = 0;
11330 uint64_t num_super = 0;
11331 uint64_t num_coll = 0;
11332 uint64_t num_omap = 0;
11333 uint64_t num_deferred = 0;
11334 uint64_t num_alloc = 0;
11335 uint64_t num_stat = 0;
11336 uint64_t num_others = 0;
11337 uint64_t num_shared_shards = 0;
11338 size_t max_key_size = 0, max_value_size = 0;
11339 uint64_t total_key_size = 0, total_value_size = 0;
11340 size_t key_size = 0, value_size = 0;
11341 DBHistogram hist;
11342
11343 utime_t start = ceph_clock_now();
11344
11345 KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
11346 iter->seek_to_first();
11347 while (iter->valid()) {
11348 dout(30) << __func__ << " Key: " << iter->key() << dendl;
11349 key_size = iter->key_size();
11350 value_size = iter->value_size();
11351 hist.value_hist[hist.get_value_slab(value_size)]++;
11352 max_key_size = MAX(max_key_size, key_size);
11353 max_value_size = MAX(max_value_size, value_size);
11354 total_key_size += key_size;
11355 total_value_size += value_size;
11356
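    // classify the key by prefix; onode and extent-shard keys both live under
    // PREFIX_OBJ and are distinguished by the trailing ONODE_KEY_SUFFIX byte.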
11357 pair<string,string> key(iter->raw_key());
11358
11359 if (key.first == PREFIX_SUPER) {
11360 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
11361 num_super++;
11362 } else if (key.first == PREFIX_STAT) {
11363 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
11364 num_stat++;
11365 } else if (key.first == PREFIX_COLL) {
11366 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
11367 num_coll++;
11368 } else if (key.first == PREFIX_OBJ) {
11369 if (key.second.back() == ONODE_KEY_SUFFIX) {
11370 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
11371 num_onodes++;
11372 } else {
11373 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
11374 num_shards++;
11375 }
11376 } else if (key.first == PREFIX_OMAP) {
11377 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
11378 num_omap++;
11379 } else if (key.first == PREFIX_DEFERRED) {
11380 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
11381 num_deferred++;
11382 } else if (key.first == PREFIX_ALLOC || key.first == "b" ) {
11383 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
11384 num_alloc++;
11385 } else if (key.first == PREFIX_SHARED_BLOB) {
11386 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
11387 num_shared_shards++;
11388 } else {
11389 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
11390 num_others++;
11391 }
11392 iter->next();
11393 }
11394
11395 utime_t duration = ceph_clock_now() - start;
11396 f->open_object_section("rocksdb_key_value_stats");
11397 f->dump_unsigned("num_onodes", num_onodes);
11398 f->dump_unsigned("num_shards", num_shards);
11399 f->dump_unsigned("num_super", num_super);
11400 f->dump_unsigned("num_coll", num_coll);
11401 f->dump_unsigned("num_omap", num_omap);
11402 f->dump_unsigned("num_deferred", num_deferred);
11403 f->dump_unsigned("num_alloc", num_alloc);
11404 f->dump_unsigned("num_stat", num_stat);
11405 f->dump_unsigned("num_shared_shards", num_shared_shards);
11406 f->dump_unsigned("num_others", num_others);
11407 f->dump_unsigned("max_key_size", max_key_size);
11408 f->dump_unsigned("max_value_size", max_value_size);
11409 f->dump_unsigned("total_key_size", total_key_size);
11410 f->dump_unsigned("total_value_size", total_value_size);
11411 f->close_section();
11412
11413 hist.dump(f);
11414
11415 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
11416
11417}
11418
11419void BlueStore::_flush_cache()
11420{
11421 dout(10) << __func__ << dendl;
11422 for (auto i : cache_shards) {
11423 i->trim_all();
11424 assert(i->empty());
11425 }
11426 for (auto& p : coll_map) {
11427 assert(p.second->onode_map.empty());
11428 assert(p.second->shared_blob_set.empty());
11429 }
11430 coll_map.clear();
11431}
11432
11433 // For external callers.
11434 // Unlike _flush_cache(), this is best-effort:
11435 // we don't care if some pinned onodes/data are still in the cache
11436 // after this call completes.
11437void BlueStore::flush_cache()
11438{
11439 dout(10) << __func__ << dendl;
11440 for (auto i : cache_shards) {
11441 i->trim_all();
11442 }
11443}
11444
11445void BlueStore::_apply_padding(uint64_t head_pad,
11446 uint64_t tail_pad,
11447 bufferlist& padded)
11448{
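  // zero-pad the caller's buffer: head_pad zeros are prepended and tail_pad
  // zeros appended (callers use this to align a small write to block bounds).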
11449 if (head_pad) {
11450 padded.prepend_zero(head_pad);
11451 }
11452 if (tail_pad) {
11453 padded.append_zero(tail_pad);
11454 }
11455 if (head_pad || tail_pad) {
11456 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
11457 << " tail 0x" << tail_pad << std::dec << dendl;
11458 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
11459 }
11460}
11461
11462// ===========================================