1// vim: ts=8 sw=2 smarttab
2/*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2014 Red Hat
6 *
7 * This is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License version 2.1, as published by the Free Software
10 * Foundation. See file COPYING.
11 *
12 */
13
14#include <unistd.h>
15#include <stdlib.h>
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <fcntl.h>
19
20#include "include/cpp-btree/btree_set.h"
21
22#include "BlueStore.h"
23#include "os/kv.h"
24#include "include/compat.h"
25#include "include/intarith.h"
26#include "include/stringify.h"
27#include "common/errno.h"
28#include "common/safe_io.h"
29#include "Allocator.h"
30#include "FreelistManager.h"
31#include "BlueFS.h"
32#include "BlueRocksEnv.h"
33#include "auth/Crypto.h"
34#include "common/EventTrace.h"
35
36#define dout_context cct
37#define dout_subsys ceph_subsys_bluestore
38
39using bid_t = decltype(BlueStore::Blob::id);
40
41// bluestore_cache_onode
 42MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
 43 bluestore_cache_onode);
 44
 45// bluestore_cache_other
 46MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
 47 bluestore_cache_other);
 48MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
 49 bluestore_cache_other);
 50MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
 51 bluestore_cache_other);
 52MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
 53 bluestore_cache_other);
54
55// bluestore_txc
56MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
57 bluestore_txc);
58
59
60// kv store prefixes
61const string PREFIX_SUPER = "S"; // field -> value
62const string PREFIX_STAT = "T"; // field -> value(int64 array)
63const string PREFIX_COLL = "C"; // collection name -> cnode_t
64const string PREFIX_OBJ = "O"; // object name -> onode_t
65const string PREFIX_OMAP = "M"; // u64 + keyname -> value
66const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
67const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
68const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
69
70// write a label in the first block. always use this size. note that
71// bluefs makes a matching assumption about the location of its
72// superblock (always the second block of the device).
73#define BDEV_LABEL_BLOCK_SIZE 4096
74
75// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
76#define SUPER_RESERVED 8192
77
78#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
79
80
81/*
82 * extent map blob encoding
83 *
84 * we use the low bits of the blobid field to indicate some common scenarios
85 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
86 */
87#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
88#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
89#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
90#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
91#define BLOBID_SHIFT_BITS 4
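// Illustrative sketch (editor addition, not part of the upstream file):
// how the low bits described above are typically unpacked. The variable
// names are hypothetical; ExtentMap::{encode,decode}_some() hold the real
// logic.
//
//   uint64_t blobid = ...;   // value decoded from an extent shard
//   bool contiguous  = blobid & BLOBID_FLAG_CONTIGUOUS;
//   bool zero_offset = blobid & BLOBID_FLAG_ZEROOFFSET;
//   bool same_length = blobid & BLOBID_FLAG_SAMELENGTH;
//   bool spanning    = blobid & BLOBID_FLAG_SPANNING;
//   bid_t id = blobid >> BLOBID_SHIFT_BITS;  // spanning or local blob id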
92
93/*
94 * object name key structure
95 *
96 * encoded u8: shard + 2^7 (so that it sorts properly)
97 * encoded u64: poolid + 2^63 (so that it sorts properly)
98 * encoded u32: hash (bit reversed)
99 *
100 * escaped string: namespace
101 *
102 * escaped string: key or object name
103 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
104 * we are done. otherwise, it is followed by the object name.
105 * escaped string: object name (unless '=' above)
106 *
107 * encoded u64: snap
108 * encoded u64: generation
109 * 'o'
110 */
111#define ONODE_KEY_SUFFIX 'o'
112
113/*
114 * extent shard key
115 *
116 * object prefix key
117 * u32
118 * 'x'
119 */
120#define EXTENT_SHARD_KEY_SUFFIX 'x'
121
122/*
123 * string encoding in the key
124 *
125 * The key string needs to lexicographically sort the same way that
126 * ghobject_t does. We do this by escaping anything <= '#' with '#'
127 * plus a 2 digit hex string, and anything >= '~' with '~' plus the two
128 * hex digits.
129 *
130 * We use ! as a terminator for strings; this works because it is < #
131 * and will get escaped if it is present in the string.
132 *
133 */
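// Worked example (editor addition): under the scheme above "a#b~c" escapes
// to "a#23b~7ec!" -- '#' (0x23) and '~' (0x7e) are the only bytes in the
// escaped ranges, and '!' terminates the string.  A hypothetical round trip:
//
//   string k;
//   append_escaped("a#b~c", &k);        // k == "a#23b~7ec!"
//   string back;
//   decode_escaped(k.c_str(), &back);   // back == "a#b~c", returns 9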
134template<typename S>
135static void append_escaped(const string &in, S *out)
136{
137 char hexbyte[in.length() * 3 + 1];
138 char* ptr = &hexbyte[0];
139 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
140 if (*i <= '#') {
141 *ptr++ = '#';
142 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
143 *ptr++ = "0123456789abcdef"[*i & 0x0f];
144 } else if (*i >= '~') {
145 *ptr++ = '~';
146 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
147 *ptr++ = "0123456789abcdef"[*i & 0x0f];
148 } else {
149 *ptr++ = *i;
150 }
151 }
152 *ptr++ = '!';
153 out->append(hexbyte, ptr - &hexbyte[0]);
154}
155
156inline unsigned h2i(char c)
157{
158 if ((c >= '0') && (c <= '9')) {
159 return c - 0x30;
160 } else if ((c >= 'a') && (c <= 'f')) {
161 return c - 'a' + 10;
162 } else if ((c >= 'A') && (c <= 'F')) {
163 return c - 'A' + 10;
164 } else {
165 return 256; // make it always larger than 255
166 }
167}
168
169static int decode_escaped(const char *p, string *out)
170{
171 char buff[256];
172 char* ptr = &buff[0];
173 char* max = &buff[252];
174 const char *orig_p = p;
175 while (*p && *p != '!') {
176 if (*p == '#' || *p == '~') {
177 unsigned hex = 0;
178 p++;
179 hex = h2i(*p++) << 4;
180 if (hex > 255) {
181 return -EINVAL;
182 }
183 hex |= h2i(*p++);
184 if (hex > 255) {
185 return -EINVAL;
186 }
187 *ptr++ = hex;
188 } else {
189 *ptr++ = *p++;
190 }
191 if (ptr > max) {
192 out->append(buff, ptr-buff);
193 ptr = &buff[0];
194 }
195 }
196 if (ptr != buff) {
197 out->append(buff, ptr-buff);
198 }
199 return p - orig_p;
200}
201
202// some things we encode in binary (as le32 or le64); print the
203// resulting key strings nicely
204template<typename S>
205static string pretty_binary_string(const S& in)
206{
207 char buf[10];
208 string out;
209 out.reserve(in.length() * 3);
210 enum { NONE, HEX, STRING } mode = NONE;
211 unsigned from = 0, i;
212 for (i=0; i < in.length(); ++i) {
213 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
214 (mode == HEX && in.length() - i >= 4 &&
215 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
216 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
217 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
218 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
219 if (mode == STRING) {
220 out.append(in.c_str() + from, i - from);
221 out.push_back('\'');
222 }
223 if (mode != HEX) {
224 out.append("0x");
225 mode = HEX;
226 }
227 if (in.length() - i >= 4) {
228 // print a whole u32 at once
229 snprintf(buf, sizeof(buf), "%08x",
230 (uint32_t)(((unsigned char)in[i] << 24) |
231 ((unsigned char)in[i+1] << 16) |
232 ((unsigned char)in[i+2] << 8) |
233 ((unsigned char)in[i+3] << 0)));
234 i += 3;
235 } else {
236 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
237 }
238 out.append(buf);
239 } else {
240 if (mode != STRING) {
241 out.push_back('\'');
242 mode = STRING;
243 from = i;
244 }
245 }
246 }
247 if (mode == STRING) {
248 out.append(in.c_str() + from, i - from);
249 out.push_back('\'');
250 }
251 return out;
252}
253
254template<typename T>
255static void _key_encode_shard(shard_id_t shard, T *key)
256{
257 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
258}
259
260static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
261{
262 pshard->id = (uint8_t)*key - (uint8_t)0x80;
263 return key + 1;
264}
265
266static void get_coll_key_range(const coll_t& cid, int bits,
267 string *temp_start, string *temp_end,
268 string *start, string *end)
269{
270 temp_start->clear();
271 temp_end->clear();
272 start->clear();
273 end->clear();
274
275 spg_t pgid;
276 if (cid.is_pg(&pgid)) {
277 _key_encode_shard(pgid.shard, start);
278 *temp_start = *start;
279
280 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
281 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
282
283 *end = *start;
284 *temp_end = *temp_start;
285
286 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
287 _key_encode_u32(reverse_hash, start);
288 _key_encode_u32(reverse_hash, temp_start);
289
290 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
291 if (end_hash > 0xffffffffull)
292 end_hash = 0xffffffffull;
293
294 _key_encode_u32(end_hash, end);
295 _key_encode_u32(end_hash, temp_end);
296 } else {
297 _key_encode_shard(shard_id_t::NO_SHARD, start);
298 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
299 *end = *start;
300 _key_encode_u32(0, start);
301 _key_encode_u32(0xffffffff, end);
302
303 // no separate temp section
304 *temp_start = *end;
305 *temp_end = *end;
306 }
307}
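// Worked example (editor addition, hypothetical pool id): for a PG in pool 3
// the normal-object range starts with the encoded shard followed by
// 0x8000000000000003 (pool + 2^63), while the temp range uses
// 0x7ffffffffffffffb ((-2 - 3) + 2^63), so temp objects always sort into a
// disjoint, lower portion of the key space for the same shard.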
308
309static void get_shared_blob_key(uint64_t sbid, string *key)
310{
311 key->clear();
312 _key_encode_u64(sbid, key);
313}
314
315static int get_key_shared_blob(const string& key, uint64_t *sbid)
316{
317 const char *p = key.c_str();
318 if (key.length() < sizeof(uint64_t))
319 return -1;
320 _key_decode_u64(p, sbid);
321 return 0;
322}
323
324template<typename S>
325static int get_key_object(const S& key, ghobject_t *oid)
326{
327 int r;
328 const char *p = key.c_str();
329
330 if (key.length() < 1 + 8 + 4)
331 return -1;
332 p = _key_decode_shard(p, &oid->shard_id);
333
334 uint64_t pool;
335 p = _key_decode_u64(p, &pool);
336 oid->hobj.pool = pool - 0x8000000000000000ull;
337
338 unsigned hash;
339 p = _key_decode_u32(p, &hash);
340
341 oid->hobj.set_bitwise_key_u32(hash);
342
343 r = decode_escaped(p, &oid->hobj.nspace);
344 if (r < 0)
345 return -2;
346 p += r + 1;
347
348 string k;
349 r = decode_escaped(p, &k);
350 if (r < 0)
351 return -3;
352 p += r + 1;
353 if (*p == '=') {
354 // no key
355 ++p;
356 oid->hobj.oid.name = k;
357 } else if (*p == '<' || *p == '>') {
358 // key + name
359 ++p;
360 r = decode_escaped(p, &oid->hobj.oid.name);
361 if (r < 0)
362 return -5;
363 p += r + 1;
364 oid->hobj.set_key(k);
365 } else {
366 // malformed
367 return -6;
368 }
369
370 p = _key_decode_u64(p, &oid->hobj.snap.val);
371 p = _key_decode_u64(p, &oid->generation);
372
373 if (*p != ONODE_KEY_SUFFIX) {
374 return -7;
375 }
376 p++;
377 if (*p) {
378 // if we get something other than a null terminator here,
379 // something is wrong.
380 return -8;
381 }
382
383 return 0;
384}
385
386template<typename S>
387static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
388{
389 key->clear();
390
391 size_t max_len = 1 + 8 + 4 +
392 (oid.hobj.nspace.length() * 3 + 1) +
393 (oid.hobj.get_key().length() * 3 + 1) +
394 1 + // for '<', '=', or '>'
395 (oid.hobj.oid.name.length() * 3 + 1) +
396 8 + 8 + 1;
397 key->reserve(max_len);
398
399 _key_encode_shard(oid.shard_id, key);
400 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
401 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
402
403 append_escaped(oid.hobj.nspace, key);
404
405 if (oid.hobj.get_key().length()) {
406 // is a key... could be < = or >.
407 append_escaped(oid.hobj.get_key(), key);
408 // (ASCII chars < = and > sort in that order, yay)
409 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
410 if (r) {
411 key->append(r > 0 ? ">" : "<");
412 append_escaped(oid.hobj.oid.name, key);
413 } else {
414 // same as no key
415 key->append("=");
416 }
417 } else {
418 // no key
419 append_escaped(oid.hobj.oid.name, key);
420 key->append("=");
421 }
422
423 _key_encode_u64(oid.hobj.snap, key);
424 _key_encode_u64(oid.generation, key);
425
426 key->push_back(ONODE_KEY_SUFFIX);
427
428 // sanity check
429 if (true) {
430 ghobject_t t;
431 int r = get_key_object(*key, &t);
432 if (r || t != oid) {
433 derr << " r " << r << dendl;
434 derr << "key " << pretty_binary_string(*key) << dendl;
435 derr << "oid " << oid << dendl;
436 derr << " t " << t << dendl;
437 assert(r == 0 && t == oid);
438 }
439 }
440}
441
442
443// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
444// char lets us quickly test whether it is a shard key without decoding any
445// of the prefix bytes.
446template<typename S>
447static void get_extent_shard_key(const S& onode_key, uint32_t offset,
448 string *key)
449{
450 key->clear();
451 key->reserve(onode_key.length() + 4 + 1);
452 key->append(onode_key.c_str(), onode_key.size());
453 _key_encode_u32(offset, key);
454 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
455}
456
457static void rewrite_extent_shard_key(uint32_t offset, string *key)
458{
459 assert(key->size() > sizeof(uint32_t) + 1);
460 assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
461 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
462}
463
464template<typename S>
465static void generate_extent_shard_key_and_apply(
466 const S& onode_key,
467 uint32_t offset,
468 string *key,
469 std::function<void(const string& final_key)> apply)
470{
471 if (key->empty()) { // make full key
472 assert(!onode_key.empty());
473 get_extent_shard_key(onode_key, offset, key);
474 } else {
475 rewrite_extent_shard_key(offset, key);
476 }
477 apply(*key);
478}
479
480int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
481{
482 assert(key.size() > sizeof(uint32_t) + 1);
483 assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
484 int okey_len = key.size() - sizeof(uint32_t) - 1;
485 *onode_key = key.substr(0, okey_len);
486 const char *p = key.data() + okey_len;
487 _key_decode_u32(p, offset);
488 return 0;
489}
490
491static bool is_extent_shard_key(const string& key)
492{
493 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
494}
495
496// '-' < '.' < '~'
497static void get_omap_header(uint64_t id, string *out)
498{
499 _key_encode_u64(id, out);
500 out->push_back('-');
501}
502
503// hmm, I don't think there's any need to escape the user key since we
504// have a clean prefix.
505static void get_omap_key(uint64_t id, const string& key, string *out)
506{
507 _key_encode_u64(id, out);
508 out->push_back('.');
509 out->append(key);
510}
511
512static void rewrite_omap_key(uint64_t id, string old, string *out)
513{
514 _key_encode_u64(id, out);
515 out->append(old.c_str() + out->length(), old.size() - out->length());
516}
517
518static void decode_omap_key(const string& key, string *user_key)
519{
520 *user_key = key.substr(sizeof(uint64_t) + 1);
521}
522
523static void get_omap_tail(uint64_t id, string *out)
524{
525 _key_encode_u64(id, out);
526 out->push_back('~');
527}
528
529static void get_deferred_key(uint64_t seq, string *out)
530{
531 _key_encode_u64(seq, out);
532}
533
534
535// merge operators
536
537struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
538 void merge_nonexistent(
539 const char *rdata, size_t rlen, std::string *new_value) override {
540 *new_value = std::string(rdata, rlen);
541 }
542 void merge(
543 const char *ldata, size_t llen,
544 const char *rdata, size_t rlen,
545 std::string *new_value) override {
546 assert(llen == rlen);
547 assert((rlen % 8) == 0);
548 new_value->resize(rlen);
549 const __le64* lv = (const __le64*)ldata;
550 const __le64* rv = (const __le64*)rdata;
551 __le64* nv = &(__le64&)new_value->at(0);
552 for (size_t i = 0; i < rlen >> 3; ++i) {
553 nv[i] = lv[i] + rv[i];
554 }
555 }
556 // We use each operator name and each prefix to construct the
557 // overall RocksDB operator name for consistency check at open time.
558 string name() const override {
559 return "int64_array";
560 }
561};
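// Illustrative sketch (editor addition, hypothetical numbers): a merge
// operator like this lets int64-array values -- such as the ones the
// PREFIX_STAT comment near the top of this file describes -- be updated by
// summing deltas element-wise instead of read-modify-write:
//
//   existing value: { 10, 4096, 2048 }
//   merge operand:  {  1, 4096, 1024 }
//   result:         { 11, 8192, 3072 }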
562
563
564// Buffer
565
566ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
567{
568 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
569 << b.offset << "~" << b.length << std::dec
570 << " " << BlueStore::Buffer::get_state_name(b.state);
571 if (b.flags)
572 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
573 return out << ")";
574}
575
576// Garbage Collector
577
578void BlueStore::GarbageCollector::process_protrusive_extents(
579 const BlueStore::ExtentMap& extent_map,
580 uint64_t start_offset,
581 uint64_t end_offset,
582 uint64_t start_touch_offset,
583 uint64_t end_touch_offset,
584 uint64_t min_alloc_size)
585{
586 assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
587
588 uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
589 uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);
590
591 dout(30) << __func__ << " (hex): [" << std::hex
592 << lookup_start_offset << ", " << lookup_end_offset
593 << ")" << std::dec << dendl;
594
595 for (auto it = extent_map.seek_lextent(lookup_start_offset);
596 it != extent_map.extent_map.end() &&
597 it->logical_offset < lookup_end_offset;
598 ++it) {
599 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
600 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
601
602 dout(30) << __func__ << " " << *it
603 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
604 << dendl;
605
606 Blob* b = it->blob.get();
607
608 if (it->logical_offset >= start_touch_offset &&
609 it->logical_end() <= end_touch_offset) {
610 // Process extents within the range affected by
611 // the current write request.
612 // Need to take into account if existing extents
613 // can be merged with them (uncompressed case)
614 if (!b->get_blob().is_compressed()) {
615 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
616 --blob_info_counted->expected_allocations; // don't need to allocate
617 // new AU for compressed
618 // data since another
619 // collocated uncompressed
620 // blob already exists
621 dout(30) << __func__ << " --expected:"
622 << alloc_unit_start << dendl;
623 }
624 used_alloc_unit = alloc_unit_end;
625 blob_info_counted = nullptr;
626 }
627 } else if (b->get_blob().is_compressed()) {
628
629 // additionally we take compressed blobs that were not impacted
630 // by the write into account too
631 BlobInfo& bi =
632 affected_blobs.emplace(
633 b, BlobInfo(b->get_referenced_bytes())).first->second;
634
635 int adjust =
636 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
637 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
638 dout(30) << __func__ << " expected_allocations="
639 << bi.expected_allocations << " end_au:"
640 << alloc_unit_end << dendl;
641
642 blob_info_counted = &bi;
643 used_alloc_unit = alloc_unit_end;
644
645 assert(it->length <= bi.referenced_bytes);
646 bi.referenced_bytes -= it->length;
647 dout(30) << __func__ << " affected_blob:" << *b
648 << " unref 0x" << std::hex << it->length
649 << " referenced = 0x" << bi.referenced_bytes
650 << std::dec << dendl;
651 // NOTE: we can't move specific blob to resulting GC list here
652 // when reference counter == 0 since subsequent extents might
653 // decrement its expected_allocation.
654 // Hence need to enumerate all the extents first.
655 if (!bi.collect_candidate) {
656 bi.first_lextent = it;
657 bi.collect_candidate = true;
658 }
659 bi.last_lextent = it;
660 } else {
661 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
662 // don't need to allocate new AU for compressed data since another
663 // collocated uncompressed blob already exists
664 --blob_info_counted->expected_allocations;
665 dout(30) << __func__ << " --expected_allocations:"
666 << alloc_unit_start << dendl;
667 }
668 used_alloc_unit = alloc_unit_end;
669 blob_info_counted = nullptr;
670 }
671 }
672
673 for (auto b_it = affected_blobs.begin();
674 b_it != affected_blobs.end();
675 ++b_it) {
676 Blob* b = b_it->first;
677 BlobInfo& bi = b_it->second;
678 if (bi.referenced_bytes == 0) {
679 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
680 int64_t blob_expected_for_release =
681 ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;
682
683 dout(30) << __func__ << " " << *(b_it->first)
684 << " expected4release=" << blob_expected_for_release
685 << " expected_allocations=" << bi.expected_allocations
686 << dendl;
687 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
688 if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
689 if (bi.collect_candidate) {
690 auto it = bi.first_lextent;
691 bool bExit = false;
692 do {
693 if (it->blob.get() == b) {
694 extents_to_collect.emplace_back(it->logical_offset, it->length);
695 }
696 bExit = it == bi.last_lextent;
697 ++it;
698 } while (!bExit);
699 }
700 expected_for_release += blob_expected_for_release;
701 expected_allocations += bi.expected_allocations;
702 }
703 }
704 }
705}
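// Worked example (editor addition, hypothetical numbers): a compressed blob
// occupying 4 allocation units on disk (blob_expected_for_release = 4) whose
// still-referenced extents would need 1 new allocation unit if rewritten
// (expected_allocations = 1) yields benefit = 3; if
// bluestore_gc_enable_blob_threshold <= 3 its extents are queued in
// extents_to_collect for the caller to rewrite.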
706
707int64_t BlueStore::GarbageCollector::estimate(
708 uint64_t start_offset,
709 uint64_t length,
710 const BlueStore::ExtentMap& extent_map,
711 const BlueStore::old_extent_map_t& old_extents,
712 uint64_t min_alloc_size)
713{
714
715 affected_blobs.clear();
716 extents_to_collect.clear();
717 used_alloc_unit = boost::optional<uint64_t >();
718 blob_info_counted = nullptr;
719
720 gc_start_offset = start_offset;
721 gc_end_offset = start_offset + length;
722
723 uint64_t end_offset = start_offset + length;
724
725 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
726 Blob* b = it->e.blob.get();
727 if (b->get_blob().is_compressed()) {
728
729 // update gc_start_offset/gc_end_offset if needed
730 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
731 gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());
732
733 auto o = it->e.logical_offset;
734 auto l = it->e.length;
735
736 uint64_t ref_bytes = b->get_referenced_bytes();
737 // micro optimization to bypass blobs that have no more references
738 if (ref_bytes != 0) {
739 dout(30) << __func__ << " affected_blob:" << *b
740 << " unref 0x" << std::hex << o << "~" << l
741 << std::dec << dendl;
742 affected_blobs.emplace(b, BlobInfo(ref_bytes));
743 }
744 }
745 }
746 dout(30) << __func__ << " gc range(hex): [" << std::hex
747 << gc_start_offset << ", " << gc_end_offset
748 << ")" << std::dec << dendl;
749
750 // enumerate preceding extents to check if they reference affected blobs
751 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
752 process_protrusive_extents(extent_map,
753 gc_start_offset,
754 gc_end_offset,
755 start_offset,
756 end_offset,
757 min_alloc_size);
758 }
759 return expected_for_release - expected_allocations;
760}
761
762// Cache
763
764BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
765 PerfCounters *logger)
766{
767 Cache *c = nullptr;
768
769 if (type == "lru")
770 c = new LRUCache(cct);
771 else if (type == "2q")
772 c = new TwoQCache(cct);
773 else
774 assert(0 == "unrecognized cache type");
775
776 c->logger = logger;
777 return c;
778}
779
780void BlueStore::Cache::trim_all()
781{
782 std::lock_guard<std::recursive_mutex> l(lock);
783 _trim(0, 0);
784}
785
786void BlueStore::Cache::trim(
787 uint64_t target_bytes,
788 float target_meta_ratio,
789 float target_data_ratio,
790 float bytes_per_onode)
791{
792 std::lock_guard<std::recursive_mutex> l(lock);
793 uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
794 uint64_t current_buffer = _get_buffer_bytes();
795 uint64_t current = current_meta + current_buffer;
796
797 uint64_t target_meta = target_bytes * target_meta_ratio;
798 uint64_t target_buffer = target_bytes * target_data_ratio;
799
800 // correct for overflow or float imprecision
801 target_meta = min(target_bytes, target_meta);
802 target_buffer = min(target_bytes - target_meta, target_buffer);
803
804 if (current <= target_bytes) {
805 dout(30) << __func__
806 << " shard target " << byte_u_t(target_bytes)
807 << " meta/data ratios " << target_meta_ratio
808 << " + " << target_data_ratio << " ("
809 << byte_u_t(target_meta) << " + "
810 << byte_u_t(target_buffer) << "), "
811 << " current " << byte_u_t(current) << " ("
812 << byte_u_t(current_meta) << " + "
813 << byte_u_t(current_buffer) << ")"
814 << dendl;
815 return;
816 }
817
818 uint64_t need_to_free = current - target_bytes;
819 uint64_t free_buffer = 0;
820 uint64_t free_meta = 0;
821 if (current_buffer > target_buffer) {
822 free_buffer = current_buffer - target_buffer;
823 if (free_buffer > need_to_free) {
824 free_buffer = need_to_free;
825 }
826 }
827 free_meta = need_to_free - free_buffer;
828
829 // start bounds at what we have now
830 uint64_t max_buffer = current_buffer - free_buffer;
831 uint64_t max_meta = current_meta - free_meta;
832 uint64_t max_onodes = max_meta / bytes_per_onode;
833
834 dout(20) << __func__
835 << " shard target " << byte_u_t(target_bytes)
7c673cae 836 << " ratio " << target_meta_ratio << " ("
837 << byte_u_t(target_meta) << " + "
838 << byte_u_t(target_buffer) << "), "
839 << " current " << byte_u_t(current) << " ("
840 << byte_u_t(current_meta) << " + "
841 << byte_u_t(current_buffer) << "),"
842 << " need_to_free " << byte_u_t(need_to_free) << " ("
843 << byte_u_t(free_meta) << " + "
844 << byte_u_t(free_buffer) << ")"
845 << " -> max " << max_onodes << " onodes + "
846 << max_buffer << " buffer"
847 << dendl;
848 _trim(max_onodes, max_buffer);
849}
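// Worked example (editor addition, hypothetical numbers): with
// target_bytes = 100 MB, target_meta_ratio = 0.4 and target_data_ratio = 0.6
// the targets are 40 MB of metadata and 60 MB of buffers.  If the shard
// currently holds 70 MB of metadata and 80 MB of buffers (current = 150 MB),
// need_to_free = 50 MB; the buffer side gives up min(80 - 60, 50) = 20 MB,
// the remaining 30 MB comes from metadata, and _trim() is called with
// 40 MB worth of onodes (max_onodes) and max_buffer = 60 MB.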
850
851
852// LRUCache
853#undef dout_prefix
854#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
855
856void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
857{
858 auto p = onode_lru.iterator_to(*o);
859 onode_lru.erase(p);
860 onode_lru.push_front(*o);
861}
862
863void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
864{
865 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
866 << " buffers " << buffer_size << " / " << buffer_max
867 << dendl;
868
869 _audit("trim start");
870
871 // buffers
872 while (buffer_size > buffer_max) {
873 auto i = buffer_lru.rbegin();
874 if (i == buffer_lru.rend()) {
875 // stop if buffer_lru is now empty
876 break;
877 }
878
879 Buffer *b = &*i;
880 assert(b->is_clean());
881 dout(20) << __func__ << " rm " << *b << dendl;
882 b->space->_rm_buffer(this, b);
883 }
884
885 // onodes
886 int num = onode_lru.size() - onode_max;
887 if (num <= 0)
888 return; // don't even try
889
890 auto p = onode_lru.end();
891 assert(p != onode_lru.begin());
892 --p;
893 int skipped = 0;
894 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
895 while (num > 0) {
896 Onode *o = &*p;
897 int refs = o->nref.load();
898 if (refs > 1) {
899 dout(20) << __func__ << " " << o->oid << " has " << refs
900 << " refs, skipping" << dendl;
901 if (++skipped >= max_skipped) {
902 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
903 << num << " left to trim" << dendl;
904 break;
905 }
906
907 if (p == onode_lru.begin()) {
908 break;
909 } else {
910 p--;
911 num--;
912 continue;
913 }
914 }
915 dout(30) << __func__ << " rm " << o->oid << dendl;
916 if (p != onode_lru.begin()) {
917 onode_lru.erase(p--);
918 } else {
919 onode_lru.erase(p);
920 assert(num == 1);
921 }
922 o->get(); // paranoia
923 o->c->onode_map.remove(o->oid);
924 o->put();
925 --num;
926 }
927}
928
929#ifdef DEBUG_CACHE
930void BlueStore::LRUCache::_audit(const char *when)
931{
932 dout(10) << __func__ << " " << when << " start" << dendl;
933 uint64_t s = 0;
934 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
935 s += i->length;
936 }
937 if (s != buffer_size) {
938 derr << __func__ << " buffer_size " << buffer_size << " actual " << s
939 << dendl;
940 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
941 derr << __func__ << " " << *i << dendl;
942 }
943 assert(s == buffer_size);
944 }
945 dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
946 << " ok" << dendl;
947}
948#endif
949
950// TwoQCache
951#undef dout_prefix
952#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
953
954
955void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
956{
957 auto p = onode_lru.iterator_to(*o);
958 onode_lru.erase(p);
959 onode_lru.push_front(*o);
960}
961
962void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
963{
964 dout(20) << __func__ << " level " << level << " near " << near
965 << " on " << *b
966 << " which has cache_private " << b->cache_private << dendl;
967 if (near) {
968 b->cache_private = near->cache_private;
969 switch (b->cache_private) {
970 case BUFFER_WARM_IN:
971 buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
972 break;
973 case BUFFER_WARM_OUT:
974 assert(b->is_empty());
975 buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
976 break;
977 case BUFFER_HOT:
978 buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
979 break;
980 default:
981 assert(0 == "bad cache_private");
982 }
983 } else if (b->cache_private == BUFFER_NEW) {
984 b->cache_private = BUFFER_WARM_IN;
985 if (level > 0) {
986 buffer_warm_in.push_front(*b);
987 } else {
988 // take caller hint to start at the back of the warm queue
989 buffer_warm_in.push_back(*b);
990 }
991 } else {
992 // we got a hint from discard
993 switch (b->cache_private) {
994 case BUFFER_WARM_IN:
995 // stay in warm_in. move to front, even though 2Q doesn't actually
996 // do this.
997 dout(20) << __func__ << " move to front of warm " << *b << dendl;
998 buffer_warm_in.push_front(*b);
999 break;
1000 case BUFFER_WARM_OUT:
1001 b->cache_private = BUFFER_HOT;
1002 // move to hot. fall-thru
1003 case BUFFER_HOT:
1004 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1005 buffer_hot.push_front(*b);
1006 break;
1007 default:
1008 assert(0 == "bad cache_private");
1009 }
1010 }
1011 if (!b->is_empty()) {
1012 buffer_bytes += b->length;
1013 buffer_list_bytes[b->cache_private] += b->length;
1014 }
1015}
1016
1017void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
1018{
1019 dout(20) << __func__ << " " << *b << dendl;
1020 if (!b->is_empty()) {
1021 assert(buffer_bytes >= b->length);
1022 buffer_bytes -= b->length;
1023 assert(buffer_list_bytes[b->cache_private] >= b->length);
1024 buffer_list_bytes[b->cache_private] -= b->length;
1025 }
1026 switch (b->cache_private) {
1027 case BUFFER_WARM_IN:
1028 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1029 break;
1030 case BUFFER_WARM_OUT:
1031 buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
1032 break;
1033 case BUFFER_HOT:
1034 buffer_hot.erase(buffer_hot.iterator_to(*b));
1035 break;
1036 default:
1037 assert(0 == "bad cache_private");
1038 }
1039}
1040
1041void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
1042{
1043 TwoQCache *src = static_cast<TwoQCache*>(srcc);
1044 src->_rm_buffer(b);
1045
1046 // preserve which list we're on (even if we can't preserve the order!)
1047 switch (b->cache_private) {
1048 case BUFFER_WARM_IN:
1049 assert(!b->is_empty());
1050 buffer_warm_in.push_back(*b);
1051 break;
1052 case BUFFER_WARM_OUT:
1053 assert(b->is_empty());
1054 buffer_warm_out.push_back(*b);
1055 break;
1056 case BUFFER_HOT:
1057 assert(!b->is_empty());
1058 buffer_hot.push_back(*b);
1059 break;
1060 default:
1061 assert(0 == "bad cache_private");
1062 }
1063 if (!b->is_empty()) {
1064 buffer_bytes += b->length;
1065 buffer_list_bytes[b->cache_private] += b->length;
1066 }
1067}
1068
1069void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
1070{
1071 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1072 if (!b->is_empty()) {
1073 assert((int64_t)buffer_bytes + delta >= 0);
1074 buffer_bytes += delta;
1075 assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
1076 buffer_list_bytes[b->cache_private] += delta;
1077 }
1078}
1079
1080void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
1081{
1082 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
1083 << " buffers " << buffer_bytes << " / " << buffer_max
1084 << dendl;
1085
1086 _audit("trim start");
1087
1088 // buffers
1089 if (buffer_bytes > buffer_max) {
1090 uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
1091 uint64_t khot = buffer_max - kin;
1092
1093 // pre-calculate kout based on average buffer size too,
1094 // which is typical (the warm_in and hot lists may change later)
1095 uint64_t kout = 0;
1096 uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
1097 if (buffer_num) {
1098 uint64_t buffer_avg_size = buffer_bytes / buffer_num;
1099 assert(buffer_avg_size);
1100 uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
1101 kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1102 }
1103
1104 if (buffer_list_bytes[BUFFER_HOT] < khot) {
1105 // hot is small, give slack to warm_in
1106 kin += khot - buffer_list_bytes[BUFFER_HOT];
1107 } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
1108 // warm_in is small, give slack to hot
1109 khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
1110 }
1111
1112 // adjust warm_in list
1113 int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
1114 uint64_t evicted = 0;
1115
1116 while (to_evict_bytes > 0) {
1117 auto p = buffer_warm_in.rbegin();
1118 if (p == buffer_warm_in.rend()) {
1119 // stop if warm_in list is now empty
1120 break;
1121 }
1122
1123 Buffer *b = &*p;
1124 assert(b->is_clean());
1125 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1126 assert(buffer_bytes >= b->length);
1127 buffer_bytes -= b->length;
1128 assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
1129 buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
1130 to_evict_bytes -= b->length;
1131 evicted += b->length;
1132 b->state = Buffer::STATE_EMPTY;
1133 b->data.clear();
1134 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1135 buffer_warm_out.push_front(*b);
1136 b->cache_private = BUFFER_WARM_OUT;
1137 }
1138
1139 if (evicted > 0) {
1140 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1141 << " from warm_in list, done evicting warm_in buffers"
1142 << dendl;
1143 }
1144
1145 // adjust hot list
1146 to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
1147 evicted = 0;
1148
1149 while (to_evict_bytes > 0) {
1150 auto p = buffer_hot.rbegin();
1151 if (p == buffer_hot.rend()) {
1152 // stop if hot list is now empty
1153 break;
1154 }
1155
1156 Buffer *b = &*p;
1157 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1158 assert(b->is_clean());
1159 // adjust evict size before buffer goes invalid
1160 to_evict_bytes -= b->length;
1161 evicted += b->length;
1162 b->space->_rm_buffer(this, b);
1163 }
1164
1165 if (evicted > 0) {
1166 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1167 << " from hot list, done evicting hot buffers"
1168 << dendl;
1169 }
1170
1171 // adjust warm out list too, if necessary
1172 int64_t num = buffer_warm_out.size() - kout;
1173 while (num-- > 0) {
1174 Buffer *b = &*buffer_warm_out.rbegin();
1175 assert(b->is_empty());
1176 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1177 b->space->_rm_buffer(this, b);
1178 }
1179 }
1180
1181 // onodes
1182 int num = onode_lru.size() - onode_max;
1183 if (num <= 0)
1184 return; // don't even try
1185
1186 auto p = onode_lru.end();
1187 assert(p != onode_lru.begin());
1188 --p;
1189 int skipped = 0;
1190 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
1191 while (num > 0) {
1192 Onode *o = &*p;
1193 dout(20) << __func__ << " considering " << o << dendl;
1194 int refs = o->nref.load();
1195 if (refs > 1) {
1196 dout(20) << __func__ << " " << o->oid << " has " << refs
1197 << " refs; skipping" << dendl;
1198 if (++skipped >= max_skipped) {
1199 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
1200 << num << " left to trim" << dendl;
1201 break;
1202 }
1203
1204 if (p == onode_lru.begin()) {
1205 break;
1206 } else {
1207 p--;
1208 num--;
1209 continue;
1210 }
1211 }
1212 dout(30) << __func__ << " " << o->oid << " num=" << num << " lru size=" << onode_lru.size() << dendl;
1213 if (p != onode_lru.begin()) {
1214 onode_lru.erase(p--);
1215 } else {
1216 onode_lru.erase(p);
1217 assert(num == 1);
1218 }
1219 o->get(); // paranoia
1220 o->c->onode_map.remove(o->oid);
1221 o->put();
1222 --num;
1223 }
1224}
1225
1226#ifdef DEBUG_CACHE
1227void BlueStore::TwoQCache::_audit(const char *when)
1228{
1229 dout(10) << __func__ << " " << when << " start" << dendl;
1230 uint64_t s = 0;
1231 for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
1232 s += i->length;
1233 }
1234
1235 uint64_t hot_bytes = s;
1236 if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
1237 derr << __func__ << " hot_list_bytes "
1238 << buffer_list_bytes[BUFFER_HOT]
1239 << " != actual " << hot_bytes
1240 << dendl;
1241 assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
1242 }
1243
1244 for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
1245 s += i->length;
1246 }
1247
1248 uint64_t warm_in_bytes = s - hot_bytes;
1249 if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
1250 derr << __func__ << " warm_in_list_bytes "
1251 << buffer_list_bytes[BUFFER_WARM_IN]
1252 << " != actual " << warm_in_bytes
1253 << dendl;
1254 assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
1255 }
1256
1257 if (s != buffer_bytes) {
1258 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1259 << dendl;
1260 assert(s == buffer_bytes);
1261 }
1262
1263 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1264 << " ok" << dendl;
1265}
1266#endif
1267
1268
1269// BufferSpace
1270
1271#undef dout_prefix
1272#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1273
1274void BlueStore::BufferSpace::_clear(Cache* cache)
1275{
1276 // note: we already hold cache->lock
1277 ldout(cache->cct, 20) << __func__ << dendl;
1278 while (!buffer_map.empty()) {
1279 _rm_buffer(cache, buffer_map.begin());
1280 }
1281}
1282
1283int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
1284{
1285 // note: we already hold cache->lock
1286 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1287 << std::dec << dendl;
1288 int cache_private = 0;
1289 cache->_audit("discard start");
1290 auto i = _data_lower_bound(offset);
1291 uint32_t end = offset + length;
1292 while (i != buffer_map.end()) {
1293 Buffer *b = i->second.get();
1294 if (b->offset >= end) {
1295 break;
1296 }
1297 if (b->cache_private > cache_private) {
1298 cache_private = b->cache_private;
1299 }
1300 if (b->offset < offset) {
1301 int64_t front = offset - b->offset;
1302 if (b->end() > end) {
1303 // drop middle (split)
1304 uint32_t tail = b->end() - end;
1305 if (b->data.length()) {
1306 bufferlist bl;
1307 bl.substr_of(b->data, b->length - tail, tail);
1308 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1309 nb->maybe_rebuild();
1310 _add_buffer(cache, nb, 0, b);
1311 } else {
1312 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1313 0, b);
1314 }
1315 if (!b->is_writing()) {
1316 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1317 }
1318 b->truncate(front);
1319 b->maybe_rebuild();
1320 cache->_audit("discard end 1");
1321 break;
1322 } else {
1323 // drop tail
1324 if (!b->is_writing()) {
1325 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1326 }
1327 b->truncate(front);
1328 b->maybe_rebuild();
1329 ++i;
1330 continue;
1331 }
1332 }
1333 if (b->end() <= end) {
1334 // drop entire buffer
1335 _rm_buffer(cache, i++);
1336 continue;
1337 }
1338 // drop front
1339 uint32_t keep = b->end() - end;
1340 if (b->data.length()) {
1341 bufferlist bl;
1342 bl.substr_of(b->data, b->length - keep, keep);
1343 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1344 nb->maybe_rebuild();
1345 _add_buffer(cache, nb, 0, b);
1346 } else {
1347 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1348 }
1349 _rm_buffer(cache, i);
1350 cache->_audit("discard end 2");
1351 break;
1352 }
1353 return cache_private;
1354}
1355
1356void BlueStore::BufferSpace::read(
1357 Cache* cache,
1358 uint32_t offset,
1359 uint32_t length,
1360 BlueStore::ready_regions_t& res,
1361 interval_set<uint32_t>& res_intervals)
1362{
7c673cae
FG
1363 res.clear();
1364 res_intervals.clear();
1365 uint32_t want_bytes = length;
1366 uint32_t end = offset + length;
1367
1368 {
1369 std::lock_guard<std::recursive_mutex> l(cache->lock);
1370 for (auto i = _data_lower_bound(offset);
1371 i != buffer_map.end() && offset < end && i->first < end;
1372 ++i) {
1373 Buffer *b = i->second.get();
1374 assert(b->end() > offset);
1375 if (b->is_writing() || b->is_clean()) {
1376 if (b->offset < offset) {
1377 uint32_t skip = offset - b->offset;
1378 uint32_t l = MIN(length, b->length - skip);
1379 res[offset].substr_of(b->data, skip, l);
1380 res_intervals.insert(offset, l);
1381 offset += l;
1382 length -= l;
1383 if (!b->is_writing()) {
1384 cache->_touch_buffer(b);
1385 }
1386 continue;
1387 }
1388 if (b->offset > offset) {
1389 uint32_t gap = b->offset - offset;
1390 if (length <= gap) {
1391 break;
1392 }
1393 offset += gap;
1394 length -= gap;
1395 }
1396 if (!b->is_writing()) {
1397 cache->_touch_buffer(b);
1398 }
1399 if (b->length > length) {
1400 res[offset].substr_of(b->data, 0, length);
1401 res_intervals.insert(offset, length);
1402 break;
1403 } else {
1404 res[offset].append(b->data);
1405 res_intervals.insert(offset, b->length);
1406 if (b->length == length)
1407 break;
1408 offset += b->length;
1409 length -= b->length;
1410 }
1411 }
1412 }
1413 }
1414
1415 uint64_t hit_bytes = res_intervals.size();
1416 assert(hit_bytes <= want_bytes);
1417 uint64_t miss_bytes = want_bytes - hit_bytes;
1418 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1419 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1420}
1421
1422void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
1423{
1424 std::lock_guard<std::recursive_mutex> l(cache->lock);
1425
1426 auto i = writing.begin();
1427 while (i != writing.end()) {
1428 if (i->seq > seq) {
1429 break;
1430 }
1431 if (i->seq < seq) {
1432 ++i;
1433 continue;
1434 }
1435
1436 Buffer *b = &*i;
1437 assert(b->is_writing());
1438
1439 if (b->flags & Buffer::FLAG_NOCACHE) {
1440 writing.erase(i++);
1441 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1442 buffer_map.erase(b->offset);
1443 } else {
1444 b->state = Buffer::STATE_CLEAN;
1445 writing.erase(i++);
1446 b->maybe_rebuild();
1447 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1448 cache->_add_buffer(b, 1, nullptr);
1449 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1450 }
1451 }
1452
1453 cache->_audit("finish_write end");
1454}
1455
1456void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1457{
1458 std::lock_guard<std::recursive_mutex> lk(cache->lock);
1459 if (buffer_map.empty())
1460 return;
1461
1462 auto p = --buffer_map.end();
1463 while (true) {
1464 if (p->second->end() <= pos)
1465 break;
1466
1467 if (p->second->offset < pos) {
1468 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1469 size_t left = pos - p->second->offset;
1470 size_t right = p->second->length - left;
1471 if (p->second->data.length()) {
1472 bufferlist bl;
1473 bl.substr_of(p->second->data, left, right);
1474 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1475 0, p->second.get());
1476 } else {
1477 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1478 0, p->second.get());
1479 }
1480 cache->_adjust_buffer_size(p->second.get(), -right);
1481 p->second->truncate(left);
1482 break;
1483 }
1484
1485 assert(p->second->end() > pos);
1486 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1487 if (p->second->data.length()) {
1488 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1489 p->second->offset - pos, p->second->data),
1490 0, p->second.get());
1491 } else {
1492 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1493 p->second->offset - pos, p->second->length),
1494 0, p->second.get());
1495 }
1496 if (p == buffer_map.begin()) {
1497 _rm_buffer(cache, p);
1498 break;
1499 } else {
1500 _rm_buffer(cache, p--);
1501 }
1502 }
1503 assert(writing.empty());
1504}
1505
1506// OnodeSpace
1507
1508#undef dout_prefix
1509#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1510
1511BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1512{
1513 std::lock_guard<std::recursive_mutex> l(cache->lock);
1514 auto p = onode_map.find(oid);
1515 if (p != onode_map.end()) {
1516 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1517 << " raced, returning existing " << p->second
1518 << dendl;
1519 return p->second;
1520 }
1521 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1522 onode_map[oid] = o;
1523 cache->_add_onode(o, 1);
1524 return o;
1525}
1526
1527BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1528{
1529 ldout(cache->cct, 30) << __func__ << dendl;
1530 OnodeRef o;
1531 bool hit = false;
1532
1533 {
1534 std::lock_guard<std::recursive_mutex> l(cache->lock);
1535 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1536 if (p == onode_map.end()) {
1537 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1538 } else {
1539 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1540 << dendl;
1541 cache->_touch_onode(p->second);
1542 hit = true;
1543 o = p->second;
1544 }
1545 }
1546
1547 if (hit) {
1548 cache->logger->inc(l_bluestore_onode_hits);
1549 } else {
1550 cache->logger->inc(l_bluestore_onode_misses);
1551 }
1552 return o;
1553}
1554
1555void BlueStore::OnodeSpace::clear()
1556{
1557 std::lock_guard<std::recursive_mutex> l(cache->lock);
1558 ldout(cache->cct, 10) << __func__ << dendl;
1559 for (auto &p : onode_map) {
1560 cache->_rm_onode(p.second);
1561 }
1562 onode_map.clear();
1563}
1564
1565bool BlueStore::OnodeSpace::empty()
1566{
1567 std::lock_guard<std::recursive_mutex> l(cache->lock);
1568 return onode_map.empty();
1569}
1570
1571void BlueStore::OnodeSpace::rename(
1572 OnodeRef& oldo,
1573 const ghobject_t& old_oid,
1574 const ghobject_t& new_oid,
1575 const mempool::bluestore_cache_other::string& new_okey)
1576{
1577 std::lock_guard<std::recursive_mutex> l(cache->lock);
1578 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1579 << dendl;
1580 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1581 po = onode_map.find(old_oid);
1582 pn = onode_map.find(new_oid);
1583 assert(po != pn);
1584
1585 assert(po != onode_map.end());
1586 if (pn != onode_map.end()) {
1587 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1588 << dendl;
1589 cache->_rm_onode(pn->second);
1590 onode_map.erase(pn);
1591 }
1592 OnodeRef o = po->second;
1593
1594 // install a non-existent onode at old location
1595 oldo.reset(new Onode(o->c, old_oid, o->key));
1596 po->second = oldo;
1597 cache->_add_onode(po->second, 1);
1598
1599 // add at new position and fix oid, key
1600 onode_map.insert(make_pair(new_oid, o));
1601 cache->_touch_onode(o);
1602 o->oid = new_oid;
1603 o->key = new_okey;
1604}
1605
1606bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1607{
1608 std::lock_guard<std::recursive_mutex> l(cache->lock);
1609 ldout(cache->cct, 20) << __func__ << dendl;
1610 for (auto& i : onode_map) {
1611 if (f(i.second)) {
1612 return true;
1613 }
1614 }
1615 return false;
1616}
1617
1618void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
1619{
1620 for (auto& i : onode_map) {
1621 ldout(cct, lvl) << i.first << " : " << i.second << dendl;
1622 }
1623}
1624
1625// SharedBlob
1626
1627#undef dout_prefix
1628#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1629
1630ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1631{
1632 out << "SharedBlob(" << &sb;
1633
1634 if (sb.loaded) {
1635 out << " loaded " << *sb.persistent;
1636 } else {
1637 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1638 }
1639 return out << ")";
1640}
1641
1642BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1643 : coll(_coll), sbid_unloaded(i)
1644{
1645 assert(sbid_unloaded > 0);
1646 if (get_cache()) {
1647 get_cache()->add_blob();
1648 }
1649}
1650
1651BlueStore::SharedBlob::~SharedBlob()
1652{
1653 if (loaded && persistent) {
1654 delete persistent;
1655 }
1656}
1657
1658void BlueStore::SharedBlob::put()
1659{
1660 if (--nref == 0) {
1661 ldout(coll->store->cct, 20) << __func__ << " " << this
1662 << " removing self from set " << get_parent()
1663 << dendl;
1664 again:
1665 auto coll_snap = coll;
1666 if (coll_snap) {
1667 std::lock_guard<std::recursive_mutex> l(coll_snap->cache->lock);
1668 if (coll_snap != coll) {
1669 goto again;
1670 }
1671 coll_snap->shared_blob_set.remove(this);
1672
1673 bc._clear(coll_snap->cache);
1674 coll_snap->cache->rm_blob();
1675 }
1676 delete this;
1677 }
1678}
1679
1680void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1681{
1682 assert(persistent);
1683 persistent->ref_map.get(offset, length);
1684}
1685
1686void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
1687 PExtentVector *r,
1688 set<SharedBlob*> *maybe_unshared)
1689{
1690 assert(persistent);
1691 bool maybe = false;
1692 persistent->ref_map.put(offset, length, r, maybe_unshared ? &maybe : nullptr);
1693 if (maybe_unshared && maybe) {
1694 maybe_unshared->insert(this);
1695 }
1696}
1697
1698// SharedBlobSet
1699
1700#undef dout_prefix
1701#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
1702
1703void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
1704{
1705 std::lock_guard<std::mutex> l(lock);
1706 for (auto& i : sb_map) {
1707 ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
1708 }
1709}
1710
1711// Blob
1712
1713#undef dout_prefix
1714#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1715
1716ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1717{
1718 out << "Blob(" << &b;
1719 if (b.is_spanning()) {
1720 out << " spanning " << b.id;
1721 }
1722 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1723 if (b.shared_blob) {
1724 out << " " << *b.shared_blob;
1725 } else {
1726 out << " (shared_blob=NULL)";
1727 }
1728 out << ")";
1729 return out;
1730}
1731
1732void BlueStore::Blob::discard_unallocated(Collection *coll)
1733{
1734 if (get_blob().is_shared()) {
1735 return;
1736 }
1737 if (get_blob().is_compressed()) {
1738 bool discard = false;
1739 bool all_invalid = true;
1740 for (auto e : get_blob().get_extents()) {
1741 if (!e.is_valid()) {
1742 discard = true;
1743 } else {
1744 all_invalid = false;
1745 }
1746 }
1747 assert(discard == all_invalid); // in case of compressed blob all
1748 // or none pextents are invalid.
1749 if (discard) {
1750 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1751 get_blob().get_logical_length());
1752 }
1753 } else {
1754 size_t pos = 0;
1755 for (auto e : get_blob().get_extents()) {
1756 if (!e.is_valid()) {
1757 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1758 << "~" << e.length
1759 << std::dec << dendl;
1760 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1761 }
1762 pos += e.length;
1763 }
1764 if (get_blob().can_prune_tail()) {
1765 dirty_blob().prune_tail();
1766 used_in_blob.prune_tail(get_blob().get_ondisk_length());
1767 auto cct = coll->store->cct; //used by dout
1768 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
1769 }
1770 }
1771}
1772
1773void BlueStore::Blob::get_ref(
1774 Collection *coll,
1775 uint32_t offset,
1776 uint32_t length)
1777{
1778 // Caller has to initialize the Blob's logical length prior to incrementing
1779 // references. Otherwise one can neither determine the required
1780 // amount of counters in case of per-au tracking nor obtain min_release_size
1781 // for single counter mode.
1782 assert(get_blob().get_logical_length() != 0);
1783 auto cct = coll->store->cct;
1784 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1785 << std::dec << " " << *this << dendl;
1786
1787 if (used_in_blob.is_empty()) {
1788 uint32_t min_release_size =
1789 get_blob().get_release_size(coll->store->min_alloc_size);
1790 uint64_t l = get_blob().get_logical_length();
1791 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1792 << min_release_size << std::dec << dendl;
1793 used_in_blob.init(l, min_release_size);
1794 }
1795 used_in_blob.get(
1796 offset,
1797 length);
1798}
1799
1800bool BlueStore::Blob::put_ref(
1801 Collection *coll,
1802 uint32_t offset,
1803 uint32_t length,
1804 PExtentVector *r)
1805{
1806 PExtentVector logical;
1807
1808 auto cct = coll->store->cct;
1809 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1810 << std::dec << " " << *this << dendl;
1811
1812 bool empty = used_in_blob.put(
1813 offset,
1814 length,
1815 &logical);
1816 r->clear();
1817 // nothing to release
1818 if (!empty && logical.empty()) {
1819 return false;
1820 }
1821
1822 bluestore_blob_t& b = dirty_blob();
1823 return b.release_extents(empty, logical, r);
1824}
1825
224ce89b 1826bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
1827 uint32_t target_blob_size,
1828 uint32_t b_offset,
1829 uint32_t *length0) {
1830 assert(min_alloc_size);
1831 assert(target_blob_size);
1832 if (!get_blob().is_mutable()) {
1833 return false;
1834 }
1835
1836 uint32_t length = *length0;
1837 uint32_t end = b_offset + length;
1838
1839 // Currently for the sake of simplicity we omit blob reuse if data is
1840 // unaligned with csum chunk. Later we can perform padding if needed.
1841 if (get_blob().has_csum() &&
1842 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1843 (end % get_blob().get_csum_chunk_size()) != 0)) {
1844 return false;
1845 }
1846
1847 auto blen = get_blob().get_logical_length();
1848 uint32_t new_blen = blen;
1849
1850 // make sure target_blob_size isn't less than current blob len
1851 target_blob_size = MAX(blen, target_blob_size);
1852
1853 if (b_offset >= blen) {
1854 // new data totally stands out of the existing blob
1855 new_blen = end;
1856 } else {
1857 // new data overlaps with the existing blob
1858 new_blen = MAX(blen, end);
1859
1860 uint32_t overlap = 0;
1861 if (new_blen > blen) {
1862 overlap = blen - b_offset;
1863 } else {
1864 overlap = length;
1865 }
1866
1867 if (!get_blob().is_unallocated(b_offset, overlap)) {
1868 // abort if any piece of the overlap has already been allocated
1869 return false;
1870 }
1871 }
1872
1873 if (new_blen > blen) {
1874 int64_t overflow = int64_t(new_blen) - target_blob_size;
1875 // Unable to decrease the provided length to fit into max_blob_size
1876 if (overflow >= length) {
1877 return false;
1878 }
1879
1880 // FIXME: in some cases we could reduce unused resolution
1881 if (get_blob().has_unused()) {
1882 return false;
1883 }
1884
1885 if (overflow > 0) {
1886 new_blen -= overflow;
1887 length -= overflow;
1888 *length0 = length;
1889 }
1890
1891 if (new_blen > blen) {
1892 dirty_blob().add_tail(new_blen);
1893 used_in_blob.add_tail(new_blen,
224ce89b 1894 get_blob().get_release_size(min_alloc_size));
7c673cae
FG
1895 }
1896 }
1897 return true;
1898}
1899
1900void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1901{
1902 auto cct = coll->store->cct; //used by dout
1903 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1904 << " start " << *this << dendl;
1905 assert(blob.can_split());
1906 assert(used_in_blob.can_split());
1907 bluestore_blob_t &lb = dirty_blob();
1908 bluestore_blob_t &rb = r->dirty_blob();
1909
1910 used_in_blob.split(
1911 blob_offset,
1912 &(r->used_in_blob));
1913
1914 lb.split(blob_offset, rb);
1915 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1916
1917 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1918 << " finish " << *this << dendl;
1919 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1920 << " and " << *r << dendl;
1921}
1922
1923#ifndef CACHE_BLOB_BL
1924void BlueStore::Blob::decode(
1925 Collection *coll,
1926 bufferptr::iterator& p,
1927 uint64_t struct_v,
1928 uint64_t* sbid,
1929 bool include_ref_map)
1930{
1931 denc(blob, p, struct_v);
1932 if (blob.is_shared()) {
1933 denc(*sbid, p);
1934 }
1935 if (include_ref_map) {
1936 if (struct_v > 1) {
1937 used_in_blob.decode(p);
1938 } else {
1939 used_in_blob.clear();
1940 bluestore_extent_ref_map_t legacy_ref_map;
1941 legacy_ref_map.decode(p);
1942 for (auto r : legacy_ref_map.ref_map) {
1943 get_ref(
1944 coll,
1945 r.first,
1946 r.second.refs * r.second.length);
1947 }
1948 }
1949 }
1950}
1951#endif
1952
1953// Extent
1954
1955ostream& operator<<(ostream& out, const BlueStore::Extent& e)
1956{
1957 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
1958 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
1959 << " " << *e.blob;
1960}
1961
1962// OldExtent
1963BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
1964 uint32_t lo,
1965 uint32_t o,
1966 uint32_t l,
1967 BlobRef& b) {
1968 OldExtent* oe = new OldExtent(lo, o, l, b);
1969 b->put_ref(c.get(), o, l, &(oe->r));
1970 oe->blob_empty = b->get_referenced_bytes() == 0;
1971 return oe;
1972}
1973
1974// ExtentMap
1975
1976#undef dout_prefix
1977#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1978
1979BlueStore::ExtentMap::ExtentMap(Onode *o)
1980 : onode(o),
1981 inline_bl(
1982 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
1983}
1984
1985void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
1986 bool force)
1987{
1988 auto cct = onode->c->store->cct; //used by dout
1989 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
1990 if (onode->onode.extent_map_shards.empty()) {
1991 if (inline_bl.length() == 0) {
1992 unsigned n;
1993 // we need to encode inline_bl to measure encoded length
1994 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
3efd9988 1995 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
7c673cae
FG
1996 assert(!never_happen);
1997 size_t len = inline_bl.length();
1998 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
1999 << " extents" << dendl;
2000 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2001 request_reshard(0, OBJECT_MAX_SIZE);
2002 return;
2003 }
2004 }
2005 // will persist in the onode key.
2006 } else {
2007 // pending shard update
2008 struct dirty_shard_t {
2009 Shard *shard;
2010 bufferlist bl;
2011 dirty_shard_t(Shard *s) : shard(s) {}
2012 };
2013 vector<dirty_shard_t> encoded_shards;
 2014    // allocate slots for all shards in a single call instead of
 2015    // doing multiple allocations - one per dirty shard
2016 encoded_shards.reserve(shards.size());
2017
2018 auto p = shards.begin();
2019 auto prev_p = p;
2020 while (p != shards.end()) {
31f18b77 2021 assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2022 auto n = p;
2023 ++n;
2024 if (p->dirty) {
2025 uint32_t endoff;
2026 if (n == shards.end()) {
2027 endoff = OBJECT_MAX_SIZE;
2028 } else {
2029 endoff = n->shard_info->offset;
2030 }
2031 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2032 bufferlist& bl = encoded_shards.back().bl;
2033 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2034 bl, &p->extents)) {
2035 if (force) {
2036 derr << __func__ << " encode_some needs reshard" << dendl;
2037 assert(!force);
2038 }
2039 }
2040 size_t len = bl.length();
2041
2042 dout(20) << __func__ << " shard 0x" << std::hex
2043 << p->shard_info->offset << std::dec << " is " << len
2044 << " bytes (was " << p->shard_info->bytes << ") from "
2045 << p->extents << " extents" << dendl;
2046
2047 if (!force) {
2048 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2049 // we are big; reshard ourselves
2050 request_reshard(p->shard_info->offset, endoff);
2051 }
2052 // avoid resharding the trailing shard, even if it is small
2053 else if (n != shards.end() &&
2054 len < g_conf->bluestore_extent_map_shard_min_size) {
31f18b77
FG
2055 assert(endoff != OBJECT_MAX_SIZE);
2056 if (p == shards.begin()) {
2057 // we are the first shard, combine with next shard
7c673cae 2058 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2059 } else {
31f18b77
FG
2060 // combine either with the previous shard or the next,
2061 // whichever is smaller
7c673cae
FG
2062 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2063 request_reshard(p->shard_info->offset, endoff + 1);
2064 } else {
2065 request_reshard(prev_p->shard_info->offset, endoff);
2066 }
2067 }
2068 }
2069 }
2070 }
2071 prev_p = p;
2072 p = n;
2073 }
2074 if (needs_reshard()) {
2075 return;
2076 }
2077
2078 // schedule DB update for dirty shards
2079 string key;
2080 for (auto& it : encoded_shards) {
2081 it.shard->dirty = false;
2082 it.shard->shard_info->bytes = it.bl.length();
2083 generate_extent_shard_key_and_apply(
2084 onode->key,
2085 it.shard->shard_info->offset,
2086 &key,
2087 [&](const string& final_key) {
2088 t->set(PREFIX_OBJ, final_key, it.bl);
2089 }
2090 );
2091 }
2092 }
2093}
2094
31f18b77
FG
2095bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2096{
2097 if (spanning_blob_map.empty())
2098 return 0;
2099 bid_t bid = spanning_blob_map.rbegin()->first + 1;
 2100  // if bid did not overflow, it is valid and available.
2101 if (bid >= 0)
2102 return bid;
2103 // Find next unused bid;
2104 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2105 const auto begin_bid = bid;
2106 do {
2107 if (!spanning_blob_map.count(bid))
2108 return bid;
2109 else {
2110 bid++;
2111 if (bid < 0) bid = 0;
2112 }
2113 } while (bid != begin_bid);
2114 assert(0 == "no available blob id");
2115}
2116
7c673cae
FG
2117void BlueStore::ExtentMap::reshard(
2118 KeyValueDB *db,
2119 KeyValueDB::Transaction t)
2120{
2121 auto cct = onode->c->store->cct; // used by dout
2122
2123 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2124 << needs_reshard_end << ")" << std::dec
2125 << " of " << onode->onode.extent_map_shards.size()
2126 << " shards on " << onode->oid << dendl;
2127 for (auto& p : spanning_blob_map) {
2128 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2129 << dendl;
2130 }
2131 // determine shard index range
2132 unsigned si_begin = 0, si_end = 0;
2133 if (!shards.empty()) {
2134 while (si_begin + 1 < shards.size() &&
2135 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2136 ++si_begin;
2137 }
2138 needs_reshard_begin = shards[si_begin].shard_info->offset;
2139 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2140 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2141 needs_reshard_end = shards[si_end].shard_info->offset;
2142 break;
2143 }
2144 }
2145 if (si_end == shards.size()) {
2146 needs_reshard_end = OBJECT_MAX_SIZE;
2147 }
2148 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2149 << " over 0x[" << std::hex << needs_reshard_begin << ","
2150 << needs_reshard_end << ")" << std::dec << dendl;
2151 }
2152
181888fb 2153 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
7c673cae
FG
2154
 2155  // we may need to fault in a larger interval later; we must have all
 2156  // extents that refer to spanning blobs loaded in order to have
 2157  // accurate use_tracker values.
2158 uint32_t spanning_scan_begin = needs_reshard_begin;
2159 uint32_t spanning_scan_end = needs_reshard_end;
2160
2161 // remove old keys
2162 string key;
2163 for (unsigned i = si_begin; i < si_end; ++i) {
2164 generate_extent_shard_key_and_apply(
2165 onode->key, shards[i].shard_info->offset, &key,
2166 [&](const string& final_key) {
2167 t->rmkey(PREFIX_OBJ, final_key);
2168 }
2169 );
2170 }
2171
2172 // calculate average extent size
2173 unsigned bytes = 0;
2174 unsigned extents = 0;
2175 if (onode->onode.extent_map_shards.empty()) {
2176 bytes = inline_bl.length();
2177 extents = extent_map.size();
2178 } else {
2179 for (unsigned i = si_begin; i < si_end; ++i) {
2180 bytes += shards[i].shard_info->bytes;
2181 extents += shards[i].extents;
2182 }
2183 }
2184 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2185 unsigned slop = target *
2186 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2187 unsigned extent_avg = bytes / MAX(1, extents);
2188 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2189 << ", slop " << slop << dendl;
2190
2191 // reshard
2192 unsigned estimate = 0;
31f18b77 2193 unsigned offset = needs_reshard_begin;
7c673cae
FG
2194 vector<bluestore_onode_t::shard_info> new_shard_info;
2195 unsigned max_blob_end = 0;
2196 Extent dummy(needs_reshard_begin);
2197 for (auto e = extent_map.lower_bound(dummy);
2198 e != extent_map.end();
2199 ++e) {
2200 if (e->logical_offset >= needs_reshard_end) {
2201 break;
2202 }
2203 dout(30) << " extent " << *e << dendl;
2204
2205 // disfavor shard boundaries that span a blob
2206 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2207 if (estimate &&
2208 estimate + extent_avg > target + (would_span ? slop : 0)) {
2209 // new shard
31f18b77 2210 if (offset == needs_reshard_begin) {
7c673cae
FG
2211 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2212 new_shard_info.back().offset = offset;
2213 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2214 << std::dec << dendl;
7c673cae
FG
2215 }
2216 offset = e->logical_offset;
2217 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2218 new_shard_info.back().offset = offset;
2219 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2220 << std::dec << dendl;
2221 estimate = 0;
2222 }
2223 estimate += extent_avg;
31f18b77
FG
2224 unsigned bs = e->blob_start();
2225 if (bs < spanning_scan_begin) {
2226 spanning_scan_begin = bs;
7c673cae
FG
2227 }
2228 uint32_t be = e->blob_end();
2229 if (be > max_blob_end) {
2230 max_blob_end = be;
2231 }
2232 if (be > spanning_scan_end) {
2233 spanning_scan_end = be;
2234 }
2235 }
2236 if (new_shard_info.empty() && (si_begin > 0 ||
2237 si_end < shards.size())) {
2238 // we resharded a partial range; we must produce at least one output
2239 // shard
2240 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2241 new_shard_info.back().offset = needs_reshard_begin;
2242 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2243 << std::dec << " (singleton degenerate case)" << dendl;
2244 }
2245
2246 auto& sv = onode->onode.extent_map_shards;
2247 dout(20) << __func__ << " new " << new_shard_info << dendl;
2248 dout(20) << __func__ << " old " << sv << dendl;
2249 if (sv.empty()) {
2250 // no old shards to keep
2251 sv.swap(new_shard_info);
2252 init_shards(true, true);
2253 } else {
2254 // splice in new shards
2255 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2256 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2257 sv.insert(
2258 sv.begin() + si_begin,
2259 new_shard_info.begin(),
2260 new_shard_info.end());
2261 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2262 si_end = si_begin + new_shard_info.size();
31f18b77
FG
2263
2264 assert(sv.size() == shards.size());
2265
2266 // note that we need to update every shard_info of shards here,
2267 // as sv might have been totally re-allocated above
2268 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2269 shards[i].shard_info = &sv[i];
31f18b77
FG
2270 }
2271
2272 // mark newly added shards as dirty
2273 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2274 shards[i].loaded = true;
2275 shards[i].dirty = true;
2276 }
7c673cae
FG
2277 }
2278 dout(20) << __func__ << " fin " << sv << dendl;
2279 inline_bl.clear();
2280
2281 if (sv.empty()) {
2282 // no more shards; unspan all previously spanning blobs
2283 auto p = spanning_blob_map.begin();
2284 while (p != spanning_blob_map.end()) {
2285 p->second->id = -1;
2286 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2287 p = spanning_blob_map.erase(p);
2288 }
2289 } else {
2290 // identify new spanning blobs
2291 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2292 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2293 if (spanning_scan_begin < needs_reshard_begin) {
2294 fault_range(db, spanning_scan_begin,
2295 needs_reshard_begin - spanning_scan_begin);
2296 }
2297 if (spanning_scan_end > needs_reshard_end) {
2298 fault_range(db, needs_reshard_end,
31f18b77 2299 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2300 }
2301 auto sp = sv.begin() + si_begin;
2302 auto esp = sv.end();
2303 unsigned shard_start = sp->offset;
2304 unsigned shard_end;
2305 ++sp;
2306 if (sp == esp) {
2307 shard_end = OBJECT_MAX_SIZE;
2308 } else {
2309 shard_end = sp->offset;
2310 }
7c673cae
FG
2311 Extent dummy(needs_reshard_begin);
2312 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2313 if (e->logical_offset >= needs_reshard_end) {
2314 break;
2315 }
2316 dout(30) << " extent " << *e << dendl;
2317 while (e->logical_offset >= shard_end) {
2318 shard_start = shard_end;
2319 assert(sp != esp);
2320 ++sp;
2321 if (sp == esp) {
2322 shard_end = OBJECT_MAX_SIZE;
2323 } else {
2324 shard_end = sp->offset;
2325 }
2326 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2327 << " to 0x" << shard_end << std::dec << dendl;
2328 }
2329 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2330 if (!e->blob->is_spanning()) {
2331 // We have two options: (1) split the blob into pieces at the
2332 // shard boundaries (and adjust extents accordingly), or (2)
2333 // mark it spanning. We prefer to cut the blob if we can. Note that
2334 // we may have to split it multiple times--potentially at every
2335 // shard boundary.
2336 bool must_span = false;
2337 BlobRef b = e->blob;
2338 if (b->can_split()) {
2339 uint32_t bstart = e->blob_start();
2340 uint32_t bend = e->blob_end();
2341 for (const auto& sh : shards) {
2342 if (bstart < sh.shard_info->offset &&
2343 bend > sh.shard_info->offset) {
2344 uint32_t blob_offset = sh.shard_info->offset - bstart;
2345 if (b->can_split_at(blob_offset)) {
2346 dout(20) << __func__ << " splitting blob, bstart 0x"
2347 << std::hex << bstart << " blob_offset 0x"
2348 << blob_offset << std::dec << " " << *b << dendl;
2349 b = split_blob(b, blob_offset, sh.shard_info->offset);
2350 // switch b to the new right-hand side, in case it
2351 // *also* has to get split.
2352 bstart += blob_offset;
2353 onode->c->store->logger->inc(l_bluestore_blob_split);
2354 } else {
2355 must_span = true;
2356 break;
2357 }
2358 }
2359 }
2360 } else {
2361 must_span = true;
2362 }
2363 if (must_span) {
31f18b77
FG
2364 auto bid = allocate_spanning_blob_id();
2365 b->id = bid;
7c673cae
FG
2366 spanning_blob_map[b->id] = b;
2367 dout(20) << __func__ << " adding spanning " << *b << dendl;
2368 }
2369 }
2370 } else {
2371 if (e->blob->is_spanning()) {
2372 spanning_blob_map.erase(e->blob->id);
2373 e->blob->id = -1;
2374 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2375 }
2376 }
2377 }
2378 }
2379
2380 clear_needs_reshard();
2381}
2382
2383bool BlueStore::ExtentMap::encode_some(
2384 uint32_t offset,
2385 uint32_t length,
2386 bufferlist& bl,
2387 unsigned *pn)
2388{
2389 auto cct = onode->c->store->cct; //used by dout
2390 Extent dummy(offset);
2391 auto start = extent_map.lower_bound(dummy);
2392 uint32_t end = offset + length;
2393
2394 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2395 // serialization only. Hence there is no specific
2396 // handling at ExtentMap level.
2397
2398 unsigned n = 0;
2399 size_t bound = 0;
7c673cae
FG
2400 bool must_reshard = false;
2401 for (auto p = start;
2402 p != extent_map.end() && p->logical_offset < end;
2403 ++p, ++n) {
2404 assert(p->logical_offset >= offset);
2405 p->blob->last_encoded_id = -1;
2406 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2407 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2408 << std::dec << " hit new spanning blob " << *p << dendl;
2409 request_reshard(p->blob_start(), p->blob_end());
2410 must_reshard = true;
2411 }
31f18b77
FG
2412 if (!must_reshard) {
2413 denc_varint(0, bound); // blobid
2414 denc_varint(0, bound); // logical_offset
2415 denc_varint(0, bound); // len
2416 denc_varint(0, bound); // blob_offset
7c673cae 2417
31f18b77
FG
2418 p->blob->bound_encode(
2419 bound,
2420 struct_v,
2421 p->blob->shared_blob->get_sbid(),
2422 false);
2423 }
7c673cae
FG
2424 }
2425 if (must_reshard) {
2426 return true;
2427 }
2428
31f18b77
FG
2429 denc(struct_v, bound);
2430 denc_varint(0, bound); // number of extents
2431
7c673cae
FG
2432 {
2433 auto app = bl.get_contiguous_appender(bound);
2434 denc(struct_v, app);
2435 denc_varint(n, app);
2436 if (pn) {
2437 *pn = n;
2438 }
2439
2440 n = 0;
2441 uint64_t pos = 0;
2442 uint64_t prev_len = 0;
2443 for (auto p = start;
2444 p != extent_map.end() && p->logical_offset < end;
2445 ++p, ++n) {
2446 unsigned blobid;
2447 bool include_blob = false;
2448 if (p->blob->is_spanning()) {
2449 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2450 blobid |= BLOBID_FLAG_SPANNING;
2451 } else if (p->blob->last_encoded_id < 0) {
2452 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2453 include_blob = true;
2454 blobid = 0; // the decoder will infer the id from n
2455 } else {
2456 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2457 }
2458 if (p->logical_offset == pos) {
2459 blobid |= BLOBID_FLAG_CONTIGUOUS;
2460 }
2461 if (p->blob_offset == 0) {
2462 blobid |= BLOBID_FLAG_ZEROOFFSET;
2463 }
2464 if (p->length == prev_len) {
2465 blobid |= BLOBID_FLAG_SAMELENGTH;
2466 } else {
2467 prev_len = p->length;
2468 }
2469 denc_varint(blobid, app);
2470 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2471 denc_varint_lowz(p->logical_offset - pos, app);
2472 }
2473 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2474 denc_varint_lowz(p->blob_offset, app);
2475 }
2476 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2477 denc_varint_lowz(p->length, app);
2478 }
2479 pos = p->logical_end();
2480 if (include_blob) {
2481 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2482 }
2483 }
2484 }
2485 /*derr << __func__ << bl << dendl;
2486 derr << __func__ << ":";
2487 bl.hexdump(*_dout);
2488 *_dout << dendl;
2489 */
2490 return false;
2491}
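// [illustrative sketch, not part of BlueStore.cc] How encode_some() above
// packs each per-extent blobid varint: the low BLOBID_SHIFT_BITS carry flags,
// and the remaining bits carry either the spanning blob id or the 1-based
// last_encoded_id of a blob already emitted in this shard:
//
//   uint64_t blobid = uint64_t(id) << BLOBID_SHIFT_BITS;
//   if (spanning)                    blobid |= BLOBID_FLAG_SPANNING;
//   if (logical_offset == prev_end)  blobid |= BLOBID_FLAG_CONTIGUOUS;
//   if (blob_offset == 0)            blobid |= BLOBID_FLAG_ZEROOFFSET;
//   if (length == prev_len)          blobid |= BLOBID_FLAG_SAMELENGTH;
//
// A brand-new local blob encodes blobid 0 and its full blob follows inline;
// the decoder infers its id from its position. decode_some() below reverses
// this: it strips the flags, then either looks up the spanning blob, reuses
// blobs[blobid - 1], or decodes the inline blob.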
2492
2493unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2494{
2495 auto cct = onode->c->store->cct; //used by dout
2496 /*
2497 derr << __func__ << ":";
2498 bl.hexdump(*_dout);
2499 *_dout << dendl;
2500 */
2501
2502 assert(bl.get_num_buffers() <= 1);
2503 auto p = bl.front().begin_deep();
2504 __u8 struct_v;
2505 denc(struct_v, p);
2506 // Version 2 differs from v1 in blob's ref_map
2507 // serialization only. Hence there is no specific
2508 // handling at ExtentMap level below.
2509 assert(struct_v == 1 || struct_v == 2);
2510
2511 uint32_t num;
2512 denc_varint(num, p);
2513 vector<BlobRef> blobs(num);
2514 uint64_t pos = 0;
2515 uint64_t prev_len = 0;
2516 unsigned n = 0;
2517
2518 while (!p.end()) {
2519 Extent *le = new Extent();
2520 uint64_t blobid;
2521 denc_varint(blobid, p);
2522 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2523 uint64_t gap;
2524 denc_varint_lowz(gap, p);
2525 pos += gap;
2526 }
2527 le->logical_offset = pos;
2528 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2529 denc_varint_lowz(le->blob_offset, p);
2530 } else {
2531 le->blob_offset = 0;
2532 }
2533 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2534 denc_varint_lowz(prev_len, p);
2535 }
2536 le->length = prev_len;
2537
2538 if (blobid & BLOBID_FLAG_SPANNING) {
2539 dout(30) << __func__ << " getting spanning blob "
2540 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2541 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2542 } else {
2543 blobid >>= BLOBID_SHIFT_BITS;
2544 if (blobid) {
2545 le->assign_blob(blobs[blobid - 1]);
2546 assert(le->blob);
2547 } else {
2548 Blob *b = new Blob();
2549 uint64_t sbid = 0;
2550 b->decode(onode->c, p, struct_v, &sbid, false);
2551 blobs[n] = b;
2552 onode->c->open_shared_blob(sbid, b);
2553 le->assign_blob(b);
2554 }
2555 // we build ref_map dynamically for non-spanning blobs
2556 le->blob->get_ref(
2557 onode->c,
2558 le->blob_offset,
2559 le->length);
2560 }
2561 pos += prev_len;
2562 ++n;
2563 extent_map.insert(*le);
2564 }
2565
2566 assert(n == num);
2567 return num;
2568}
2569
2570void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2571{
2572 // Version 2 differs from v1 in blob's ref_map
2573 // serialization only. Hence there is no specific
2574 // handling at ExtentMap level.
2575 __u8 struct_v = 2;
2576
2577 denc(struct_v, p);
2578 denc_varint((uint32_t)0, p);
2579 size_t key_size = 0;
2580 denc_varint((uint32_t)0, key_size);
2581 p += spanning_blob_map.size() * key_size;
2582 for (const auto& i : spanning_blob_map) {
2583 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2584 }
2585}
2586
2587void BlueStore::ExtentMap::encode_spanning_blobs(
2588 bufferlist::contiguous_appender& p)
2589{
2590 // Version 2 differs from v1 in blob's ref_map
2591 // serialization only. Hence there is no specific
2592 // handling at ExtentMap level.
2593 __u8 struct_v = 2;
2594
2595 denc(struct_v, p);
2596 denc_varint(spanning_blob_map.size(), p);
2597 for (auto& i : spanning_blob_map) {
2598 denc_varint(i.second->id, p);
2599 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2600 }
2601}
2602
2603void BlueStore::ExtentMap::decode_spanning_blobs(
2604 bufferptr::iterator& p)
2605{
2606 __u8 struct_v;
2607 denc(struct_v, p);
2608 // Version 2 differs from v1 in blob's ref_map
2609 // serialization only. Hence there is no specific
2610 // handling at ExtentMap level.
2611 assert(struct_v == 1 || struct_v == 2);
2612
2613 unsigned n;
2614 denc_varint(n, p);
2615 while (n--) {
2616 BlobRef b(new Blob());
2617 denc_varint(b->id, p);
2618 spanning_blob_map[b->id] = b;
2619 uint64_t sbid = 0;
2620 b->decode(onode->c, p, struct_v, &sbid, true);
2621 onode->c->open_shared_blob(sbid, b);
2622 }
2623}
2624
2625void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2626{
2627 shards.resize(onode->onode.extent_map_shards.size());
2628 unsigned i = 0;
2629 for (auto &s : onode->onode.extent_map_shards) {
2630 shards[i].shard_info = &s;
2631 shards[i].loaded = loaded;
2632 shards[i].dirty = dirty;
2633 ++i;
2634 }
2635}
2636
2637void BlueStore::ExtentMap::fault_range(
2638 KeyValueDB *db,
2639 uint32_t offset,
2640 uint32_t length)
2641{
2642 auto cct = onode->c->store->cct; //used by dout
2643 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2644 << std::dec << dendl;
2645 auto start = seek_shard(offset);
2646 auto last = seek_shard(offset + length);
2647
2648 if (start < 0)
2649 return;
2650
2651 assert(last >= start);
2652 string key;
2653 while (start <= last) {
2654 assert((size_t)start < shards.size());
2655 auto p = &shards[start];
2656 if (!p->loaded) {
2657 dout(30) << __func__ << " opening shard 0x" << std::hex
2658 << p->shard_info->offset << std::dec << dendl;
2659 bufferlist v;
2660 generate_extent_shard_key_and_apply(
2661 onode->key, p->shard_info->offset, &key,
2662 [&](const string& final_key) {
2663 int r = db->get(PREFIX_OBJ, final_key, &v);
2664 if (r < 0) {
2665 derr << __func__ << " missing shard 0x" << std::hex
2666 << p->shard_info->offset << std::dec << " for " << onode->oid
2667 << dendl;
2668 assert(r >= 0);
2669 }
2670 }
2671 );
2672 p->extents = decode_some(v);
2673 p->loaded = true;
2674 dout(20) << __func__ << " open shard 0x" << std::hex
2675 << p->shard_info->offset << std::dec
2676 << " (" << v.length() << " bytes)" << dendl;
2677 assert(p->dirty == false);
2678 assert(v.length() == p->shard_info->bytes);
2679 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2680 } else {
2681 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2682 }
2683 ++start;
2684 }
2685}
2686
2687void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
2688 uint32_t offset,
2689 uint32_t length)
2690{
2691 auto cct = onode->c->store->cct; //used by dout
2692 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2693 << std::dec << dendl;
2694 if (shards.empty()) {
2695 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2696 inline_bl.clear();
2697 return;
2698 }
2699 auto start = seek_shard(offset);
2700 auto last = seek_shard(offset + length);
2701 if (start < 0)
2702 return;
2703
2704 assert(last >= start);
2705 while (start <= last) {
2706 assert((size_t)start < shards.size());
2707 auto p = &shards[start];
2708 if (!p->loaded) {
2709 dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
2710 << std::dec << " is not loaded, can't mark dirty" << dendl;
2711 assert(0 == "can't mark unloaded shard dirty");
2712 }
2713 if (!p->dirty) {
2714 dout(20) << __func__ << " mark shard 0x" << std::hex
2715 << p->shard_info->offset << std::dec << " dirty" << dendl;
2716 p->dirty = true;
2717 }
2718 ++start;
2719 }
2720}
2721
2722BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2723 uint64_t offset)
2724{
2725 Extent dummy(offset);
2726 return extent_map.find(dummy);
2727}
2728
7c673cae
FG
2729BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2730 uint64_t offset)
2731{
2732 Extent dummy(offset);
2733 auto fp = extent_map.lower_bound(dummy);
2734 if (fp != extent_map.begin()) {
2735 --fp;
2736 if (fp->logical_end() <= offset) {
2737 ++fp;
2738 }
2739 }
2740 return fp;
2741}
2742
2743BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2744 uint64_t offset) const
2745{
2746 Extent dummy(offset);
2747 auto fp = extent_map.lower_bound(dummy);
2748 if (fp != extent_map.begin()) {
2749 --fp;
2750 if (fp->logical_end() <= offset) {
2751 ++fp;
2752 }
2753 }
2754 return fp;
2755}
2756
2757bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2758{
2759 auto fp = seek_lextent(offset);
2760 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2761 return false;
2762 }
2763 return true;
2764}
2765
2766int BlueStore::ExtentMap::compress_extent_map(
2767 uint64_t offset,
2768 uint64_t length)
2769{
2770 auto cct = onode->c->store->cct; //used by dout
2771 if (extent_map.empty())
2772 return 0;
2773 int removed = 0;
2774 auto p = seek_lextent(offset);
2775 if (p != extent_map.begin()) {
2776 --p; // start to the left of offset
2777 }
2778 // the caller should have just written to this region
2779 assert(p != extent_map.end());
2780
2781 // identify the *next* shard
2782 auto pshard = shards.begin();
2783 while (pshard != shards.end() &&
2784 p->logical_offset >= pshard->shard_info->offset) {
2785 ++pshard;
2786 }
2787 uint64_t shard_end;
2788 if (pshard != shards.end()) {
2789 shard_end = pshard->shard_info->offset;
2790 } else {
2791 shard_end = OBJECT_MAX_SIZE;
2792 }
2793
2794 auto n = p;
2795 for (++n; n != extent_map.end(); p = n++) {
2796 if (n->logical_offset > offset + length) {
2797 break; // stop after end
2798 }
2799 while (n != extent_map.end() &&
2800 p->logical_end() == n->logical_offset &&
2801 p->blob == n->blob &&
2802 p->blob_offset + p->length == n->blob_offset &&
2803 n->logical_offset < shard_end) {
2804 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2805 << " next shard 0x" << shard_end << std::dec
2806 << " merging " << *p << " and " << *n << dendl;
2807 p->length += n->length;
2808 rm(n++);
2809 ++removed;
2810 }
2811 if (n == extent_map.end()) {
2812 break;
2813 }
2814 if (n->logical_offset >= shard_end) {
2815 assert(pshard != shards.end());
2816 ++pshard;
2817 if (pshard != shards.end()) {
2818 shard_end = pshard->shard_info->offset;
2819 } else {
2820 shard_end = OBJECT_MAX_SIZE;
2821 }
2822 }
2823 }
2824 if (removed && onode) {
2825 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2826 }
2827 return removed;
2828}
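// [illustrative sketch, not part of BlueStore.cc] compress_extent_map() above
// merges neighbours that are contiguous both logically and within the same
// blob. For example, assumed extents 0x0~0x1000 (blob A, blob_offset 0x0) and
// 0x1000~0x1000 (blob A, blob_offset 0x1000) collapse into a single
// 0x0~0x2000 extent, provided the pair does not straddle a shard boundary.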
2829
2830void BlueStore::ExtentMap::punch_hole(
2831 CollectionRef &c,
2832 uint64_t offset,
2833 uint64_t length,
2834 old_extent_map_t *old_extents)
2835{
2836 auto p = seek_lextent(offset);
2837 uint64_t end = offset + length;
2838 while (p != extent_map.end()) {
2839 if (p->logical_offset >= end) {
2840 break;
2841 }
2842 if (p->logical_offset < offset) {
2843 if (p->logical_end() > end) {
2844 // split and deref middle
2845 uint64_t front = offset - p->logical_offset;
2846 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
2847 length, p->blob);
2848 old_extents->push_back(*oe);
2849 add(end,
2850 p->blob_offset + front + length,
2851 p->length - front - length,
2852 p->blob);
2853 p->length = front;
2854 break;
2855 } else {
2856 // deref tail
2857 assert(p->logical_end() > offset); // else seek_lextent bug
2858 uint64_t keep = offset - p->logical_offset;
2859 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
2860 p->length - keep, p->blob);
2861 old_extents->push_back(*oe);
2862 p->length = keep;
2863 ++p;
2864 continue;
2865 }
2866 }
2867 if (p->logical_offset + p->length <= end) {
2868 // deref whole lextent
2869 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2870 p->length, p->blob);
2871 old_extents->push_back(*oe);
2872 rm(p++);
2873 continue;
2874 }
2875 // deref head
2876 uint64_t keep = p->logical_end() - end;
2877 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2878 p->length - keep, p->blob);
2879 old_extents->push_back(*oe);
2880
2881 add(end, p->blob_offset + p->length - keep, keep, p->blob);
2882 rm(p);
2883 break;
2884 }
2885}
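// [illustrative sketch, not part of BlueStore.cc] punch_hole() above, applied
// to an assumed single lextent 0x0~0x8000 with offset = 0x2000 and
// length = 0x1000, takes the "split and deref middle" branch: the extent is
// shortened to 0x0~0x2000, an OldExtent for logical 0x2000~0x1000 is queued
// for deref in old_extents, and the tail is re-added at logical 0x3000 with
// blob_offset 0x3000 and length 0x5000.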
2886
2887BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
2888 CollectionRef &c,
2889 uint64_t logical_offset,
2890 uint64_t blob_offset, uint64_t length, BlobRef b,
2891 old_extent_map_t *old_extents)
2892{
 2893  // We need a completely initialized Blob to increment its ref counters.
2894 assert(b->get_blob().get_logical_length() != 0);
2895
 2896  // Do get_ref prior to punch_hole to prevent putting a reused blob into the
 2897  // old_extents list if we overwrite the blob completely.
 2898  // This might happen during WAL overwrite.
2899 b->get_ref(onode->c, blob_offset, length);
2900
2901 if (old_extents) {
2902 punch_hole(c, logical_offset, length, old_extents);
2903 }
2904
2905 Extent *le = new Extent(logical_offset, blob_offset, length, b);
2906 extent_map.insert(*le);
2907 if (spans_shard(logical_offset, length)) {
2908 request_reshard(logical_offset, logical_offset + length);
2909 }
2910 return le;
2911}
2912
2913BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
2914 BlobRef lb,
2915 uint32_t blob_offset,
2916 uint32_t pos)
2917{
2918 auto cct = onode->c->store->cct; //used by dout
2919
2920 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
2921 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
2922 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
2923 << dendl;
2924 BlobRef rb = onode->c->new_blob();
2925 lb->split(onode->c, blob_offset, rb.get());
2926
2927 for (auto ep = seek_lextent(pos);
2928 ep != extent_map.end() && ep->logical_offset < end_pos;
2929 ++ep) {
2930 if (ep->blob != lb) {
2931 continue;
2932 }
2933 if (ep->logical_offset < pos) {
2934 // split extent
2935 size_t left = pos - ep->logical_offset;
2936 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
2937 extent_map.insert(*ne);
2938 ep->length = left;
2939 dout(30) << __func__ << " split " << *ep << dendl;
2940 dout(30) << __func__ << " to " << *ne << dendl;
2941 } else {
2942 // switch blob
2943 assert(ep->blob_offset >= blob_offset);
2944
2945 ep->blob = rb;
2946 ep->blob_offset -= blob_offset;
2947 dout(30) << __func__ << " adjusted " << *ep << dendl;
2948 }
2949 }
2950 return rb;
2951}
2952
2953// Onode
2954
2955#undef dout_prefix
2956#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
2957
2958void BlueStore::Onode::flush()
2959{
2960 if (flushing_count.load()) {
2961 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
2962 std::unique_lock<std::mutex> l(flush_lock);
2963 while (flushing_count.load()) {
2964 flush_cond.wait(l);
2965 }
2966 }
2967 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
2968}
2969
2970// =======================================================
2971// WriteContext
2972
2973/// Checks for writes to the same pextent within a blob
2974bool BlueStore::WriteContext::has_conflict(
2975 BlobRef b,
2976 uint64_t loffs,
2977 uint64_t loffs_end,
2978 uint64_t min_alloc_size)
2979{
2980 assert((loffs % min_alloc_size) == 0);
2981 assert((loffs_end % min_alloc_size) == 0);
2982 for (auto w : writes) {
2983 if (b == w.b) {
2984 auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
224ce89b 2985 auto loffs2_end = P2ROUNDUP(w.logical_offset + w.length0, min_alloc_size);
7c673cae 2986 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 2987 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
2988 return true;
2989 }
2990 }
2991 }
2992 return false;
2993}
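// [illustrative sketch, not part of BlueStore.cc] With an assumed
// min_alloc_size of 0x1000, a queued write at logical 0x1800~0x400 occupies
// the aligned range [0x1000, 0x2000); has_conflict() above therefore returns
// true for a later write to the same blob whose aligned range is, e.g.,
// loffs = 0x1000, loffs_end = 0x2000, and false for disjoint aligned ranges.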
2994
2995// =======================================================
2996
2997// DeferredBatch
2998#undef dout_prefix
2999#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3000
3001void BlueStore::DeferredBatch::prepare_write(
3002 CephContext *cct,
3003 uint64_t seq, uint64_t offset, uint64_t length,
3004 bufferlist::const_iterator& blp)
3005{
3006 _discard(cct, offset, length);
3007 auto i = iomap.insert(make_pair(offset, deferred_io()));
3008 assert(i.second); // this should be a new insertion
3009 i.first->second.seq = seq;
3010 blp.copy(length, i.first->second.bl);
31f18b77
FG
3011 i.first->second.bl.reassign_to_mempool(
3012 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3013 dout(20) << __func__ << " seq " << seq
3014 << " 0x" << std::hex << offset << "~" << length
3015 << " crc " << i.first->second.bl.crc32c(-1)
3016 << std::dec << dendl;
3017 seq_bytes[seq] += length;
3018#ifdef DEBUG_DEFERRED
3019 _audit(cct);
3020#endif
3021}
3022
3023void BlueStore::DeferredBatch::_discard(
3024 CephContext *cct, uint64_t offset, uint64_t length)
3025{
3026 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3027 << std::dec << dendl;
3028 auto p = iomap.lower_bound(offset);
3029 if (p != iomap.begin()) {
3030 --p;
3031 auto end = p->first + p->second.bl.length();
3032 if (end > offset) {
3033 bufferlist head;
3034 head.substr_of(p->second.bl, 0, offset - p->first);
3035 dout(20) << __func__ << " keep head " << p->second.seq
3036 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3037 << " -> 0x" << head.length() << std::dec << dendl;
3038 auto i = seq_bytes.find(p->second.seq);
224ce89b 3039 assert(i != seq_bytes.end());
7c673cae
FG
3040 if (end > offset + length) {
3041 bufferlist tail;
3042 tail.substr_of(p->second.bl, offset + length - p->first,
3043 end - (offset + length));
3044 dout(20) << __func__ << " keep tail " << p->second.seq
3045 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3046 << " -> 0x" << tail.length() << std::dec << dendl;
3047 auto &n = iomap[offset + length];
3048 n.bl.swap(tail);
3049 n.seq = p->second.seq;
3050 i->second -= length;
3051 } else {
3052 i->second -= end - offset;
3053 }
224ce89b 3054 assert(i->second >= 0);
7c673cae
FG
3055 p->second.bl.swap(head);
3056 }
3057 ++p;
3058 }
3059 while (p != iomap.end()) {
3060 if (p->first >= offset + length) {
3061 break;
3062 }
3063 auto i = seq_bytes.find(p->second.seq);
224ce89b 3064 assert(i != seq_bytes.end());
7c673cae
FG
3065 auto end = p->first + p->second.bl.length();
3066 if (end > offset + length) {
3067 unsigned drop_front = offset + length - p->first;
3068 unsigned keep_tail = end - (offset + length);
3069 dout(20) << __func__ << " truncate front " << p->second.seq
3070 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3071 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3072 << " to 0x" << (offset + length) << "~" << keep_tail
3073 << std::dec << dendl;
3074 auto &s = iomap[offset + length];
3075 s.seq = p->second.seq;
3076 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3077 i->second -= drop_front;
3078 } else {
3079 dout(20) << __func__ << " drop " << p->second.seq
3080 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3081 << std::dec << dendl;
3082 i->second -= p->second.bl.length();
3083 }
224ce89b 3084 assert(i->second >= 0);
7c673cae
FG
3085 p = iomap.erase(p);
3086 }
3087}
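// [illustrative sketch, not part of BlueStore.cc] _discard() above, given an
// assumed queued deferred IO at 0x1000 whose buffer spans [0x1000, 0x4000),
// called with offset = 0x2000 and length = 0x1000, keeps a 0x1000-byte head
// at 0x1000, re-inserts a 0x1000-byte tail at iomap[0x3000] with the same
// seq, and reduces that seq's byte count in seq_bytes by 0x1000.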
3088
3089void BlueStore::DeferredBatch::_audit(CephContext *cct)
3090{
3091 map<uint64_t,int> sb;
3092 for (auto p : seq_bytes) {
3093 sb[p.first] = 0; // make sure we have the same set of keys
3094 }
3095 uint64_t pos = 0;
3096 for (auto& p : iomap) {
3097 assert(p.first >= pos);
3098 sb[p.second.seq] += p.second.bl.length();
3099 pos = p.first + p.second.bl.length();
3100 }
3101 assert(sb == seq_bytes);
3102}
3103
3104
3105// Collection
3106
3107#undef dout_prefix
3108#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3109
3110BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
3111 : store(ns),
3112 cache(c),
3113 cid(cid),
3114 lock("BlueStore::Collection::lock", true, false),
3115 exists(true),
3116 onode_map(c)
3117{
3118}
3119
3120void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3121{
3122 assert(!b->shared_blob);
3123 const bluestore_blob_t& blob = b->get_blob();
3124 if (!blob.is_shared()) {
3125 b->shared_blob = new SharedBlob(this);
3126 return;
3127 }
3128
3129 b->shared_blob = shared_blob_set.lookup(sbid);
3130 if (b->shared_blob) {
3131 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3132 << std::dec << " had " << *b->shared_blob << dendl;
3133 } else {
3134 b->shared_blob = new SharedBlob(sbid, this);
3135 shared_blob_set.add(this, b->shared_blob.get());
3136 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3137 << std::dec << " opened " << *b->shared_blob
3138 << dendl;
3139 }
3140}
3141
3142void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3143{
3144 if (!sb->is_loaded()) {
3145
3146 bufferlist v;
3147 string key;
3148 auto sbid = sb->get_sbid();
3149 get_shared_blob_key(sbid, &key);
3150 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3151 if (r < 0) {
3152 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3153 << std::dec << " not found at key "
3154 << pretty_binary_string(key) << dendl;
3155 assert(0 == "uh oh, missing shared_blob");
3156 }
3157
3158 sb->loaded = true;
3159 sb->persistent = new bluestore_shared_blob_t(sbid);
3160 bufferlist::iterator p = v.begin();
3161 ::decode(*(sb->persistent), p);
3162 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3163 << std::dec << " loaded shared_blob " << *sb << dendl;
3164 }
3165}
3166
3167void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3168{
7c673cae 3169 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
31f18b77 3170 assert(!b->shared_blob->is_loaded());
7c673cae
FG
3171
3172 // update blob
31f18b77 3173 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 3174 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
3175
3176 // update shared blob
3177 b->shared_blob->loaded = true;
3178 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3179 shared_blob_set.add(this, b->shared_blob.get());
3180 for (auto p : blob.get_extents()) {
3181 if (p.is_valid()) {
3182 b->shared_blob->get_ref(
3183 p.offset,
3184 p.length);
3185 }
3186 }
3187 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3188}
3189
31f18b77
FG
3190uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3191{
3192 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3193 assert(sb->is_loaded());
3194
3195 uint64_t sbid = sb->get_sbid();
3196 shared_blob_set.remove(sb);
3197 sb->loaded = false;
3198 delete sb->persistent;
3199 sb->sbid_unloaded = 0;
3200 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3201 return sbid;
3202}
3203
7c673cae
FG
3204BlueStore::OnodeRef BlueStore::Collection::get_onode(
3205 const ghobject_t& oid,
3206 bool create)
3207{
3208 assert(create ? lock.is_wlocked() : lock.is_locked());
3209
3210 spg_t pgid;
3211 if (cid.is_pg(&pgid)) {
3212 if (!oid.match(cnode.bits, pgid.ps())) {
3213 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3214 << pgid << " bits " << cnode.bits << dendl;
3215 ceph_abort();
3216 }
3217 }
3218
3219 OnodeRef o = onode_map.lookup(oid);
3220 if (o)
3221 return o;
3222
31f18b77 3223 mempool::bluestore_cache_other::string key;
7c673cae
FG
3224 get_object_key(store->cct, oid, &key);
3225
3226 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3227 << pretty_binary_string(key) << dendl;
3228
3229 bufferlist v;
3230 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3231 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3232 Onode *on;
3233 if (v.length() == 0) {
3234 assert(r == -ENOENT);
3235 if (!store->cct->_conf->bluestore_debug_misc &&
3236 !create)
3237 return OnodeRef();
3238
3239 // new object, new onode
3240 on = new Onode(this, oid, key);
3241 } else {
3242 // loaded
3243 assert(r >= 0);
3244 on = new Onode(this, oid, key);
3245 on->exists = true;
31f18b77 3246 bufferptr::iterator p = v.front().begin_deep();
7c673cae 3247 on->onode.decode(p);
3efd9988
FG
3248 for (auto& i : on->onode.attrs) {
3249 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3250 }
7c673cae
FG
3251
3252 // initialize extent_map
3253 on->extent_map.decode_spanning_blobs(p);
3254 if (on->onode.extent_map_shards.empty()) {
3255 denc(on->extent_map.inline_bl, p);
3256 on->extent_map.decode_some(on->extent_map.inline_bl);
3efd9988
FG
3257 on->extent_map.inline_bl.reassign_to_mempool(
3258 mempool::mempool_bluestore_cache_other);
7c673cae
FG
3259 } else {
3260 on->extent_map.init_shards(false, false);
3261 }
3262 }
3263 o.reset(on);
3264 return onode_map.add(oid, o);
3265}
3266
3267void BlueStore::Collection::split_cache(
3268 Collection *dest)
3269{
3270 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3271
3272 // lock (one or both) cache shards
3273 std::lock(cache->lock, dest->cache->lock);
3274 std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
3275 std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);
3276
3277 int destbits = dest->cnode.bits;
3278 spg_t destpg;
3279 bool is_pg = dest->cid.is_pg(&destpg);
3280 assert(is_pg);
3281
3282 auto p = onode_map.onode_map.begin();
3283 while (p != onode_map.onode_map.end()) {
3284 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3285 // onode does not belong to this child
3286 ++p;
3287 } else {
3288 OnodeRef o = p->second;
3289 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3290 << dendl;
3291
3292 cache->_rm_onode(p->second);
3293 p = onode_map.onode_map.erase(p);
3294
3295 o->c = dest;
3296 dest->cache->_add_onode(o, 1);
3297 dest->onode_map.onode_map[o->oid] = o;
3298 dest->onode_map.cache = dest->cache;
3299
3300 // move over shared blobs and buffers. cover shared blobs from
3301 // both extent map and spanning blob map (the full extent map
3302 // may not be faulted in)
3303 vector<SharedBlob*> sbvec;
3304 for (auto& e : o->extent_map.extent_map) {
3305 sbvec.push_back(e.blob->shared_blob.get());
3306 }
3307 for (auto& b : o->extent_map.spanning_blob_map) {
3308 sbvec.push_back(b.second->shared_blob.get());
3309 }
3310 for (auto sb : sbvec) {
3311 if (sb->coll == dest) {
3312 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3313 << dendl;
3314 continue;
3315 }
3316 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
31f18b77
FG
3317 if (sb->get_sbid()) {
3318 ldout(store->cct, 20) << __func__
3319 << " moving registration " << *sb << dendl;
3320 shared_blob_set.remove(sb);
3321 dest->shared_blob_set.add(dest, sb);
3322 }
3efd9988 3323 sb->coll = dest;
7c673cae 3324 if (dest->cache != cache) {
7c673cae
FG
3325 for (auto& i : sb->bc.buffer_map) {
3326 if (!i.second->is_writing()) {
3327 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3328 << dendl;
3329 dest->cache->_move_buffer(cache, i.second.get());
3330 }
3331 }
3332 }
3333 }
7c673cae
FG
3334 }
3335 }
3336}
3337
7c673cae
FG
3338// =======================================================
3339
3340void *BlueStore::MempoolThread::entry()
3341{
3342 Mutex::Locker l(lock);
3343 while (!stop) {
31f18b77
FG
3344 uint64_t meta_bytes =
3345 mempool::bluestore_cache_other::allocated_bytes() +
3346 mempool::bluestore_cache_onode::allocated_bytes();
3347 uint64_t onode_num =
3348 mempool::bluestore_cache_onode::allocated_items();
3349
3350 if (onode_num < 2) {
3351 onode_num = 2;
3352 }
3353
3354 float bytes_per_onode = (float)meta_bytes / (float)onode_num;
3355 size_t num_shards = store->cache_shards.size();
3356 float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
3357 // A little sloppy but should be close enough
224ce89b 3358 uint64_t shard_target = target_ratio * (store->cache_size / num_shards);
31f18b77
FG
3359
3360 for (auto i : store->cache_shards) {
3361 i->trim(shard_target,
3362 store->cache_meta_ratio,
3363 store->cache_data_ratio,
3364 bytes_per_onode);
3365 }
3366
3367 store->_update_cache_logger();
3368
7c673cae
FG
3369 utime_t wait;
3370 wait += store->cct->_conf->bluestore_cache_trim_interval;
3371 cond.WaitInterval(lock, wait);
3372 }
3373 stop = false;
3374 return NULL;
3375}
3376
3377// =======================================================
3378
31f18b77
FG
3379// OmapIteratorImpl
3380
3381#undef dout_prefix
3382#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3383
3384BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3385 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3386 : c(c), o(o), it(it)
3387{
3388 RWLock::RLocker l(c->lock);
3389 if (o->onode.has_omap()) {
3390 get_omap_key(o->onode.nid, string(), &head);
3391 get_omap_tail(o->onode.nid, &tail);
3392 it->lower_bound(head);
3393 }
3394}
3395
3396int BlueStore::OmapIteratorImpl::seek_to_first()
3397{
3398 RWLock::RLocker l(c->lock);
3399 if (o->onode.has_omap()) {
3400 it->lower_bound(head);
3401 } else {
3402 it = KeyValueDB::Iterator();
3403 }
3404 return 0;
3405}
3406
3407int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3408{
3409 RWLock::RLocker l(c->lock);
3410 if (o->onode.has_omap()) {
3411 string key;
3412 get_omap_key(o->onode.nid, after, &key);
3413 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3414 << pretty_binary_string(key) << dendl;
3415 it->upper_bound(key);
3416 } else {
3417 it = KeyValueDB::Iterator();
3418 }
3419 return 0;
3420}
3421
3422int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3423{
3424 RWLock::RLocker l(c->lock);
3425 if (o->onode.has_omap()) {
3426 string key;
3427 get_omap_key(o->onode.nid, to, &key);
3428 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3429 << pretty_binary_string(key) << dendl;
3430 it->lower_bound(key);
3431 } else {
3432 it = KeyValueDB::Iterator();
3433 }
3434 return 0;
3435}
3436
3437bool BlueStore::OmapIteratorImpl::valid()
3438{
3439 RWLock::RLocker l(c->lock);
3440 bool r = o->onode.has_omap() && it && it->valid() &&
3441 it->raw_key().second <= tail;
3442 if (it && it->valid()) {
3443 ldout(c->store->cct,20) << __func__ << " is at "
3444 << pretty_binary_string(it->raw_key().second)
3445 << dendl;
3446 }
3447 return r;
3448}
3449
3450int BlueStore::OmapIteratorImpl::next(bool validate)
3451{
3452 RWLock::RLocker l(c->lock);
3453 if (o->onode.has_omap()) {
3454 it->next();
3455 return 0;
3456 } else {
3457 return -1;
3458 }
3459}
3460
3461string BlueStore::OmapIteratorImpl::key()
3462{
3463 RWLock::RLocker l(c->lock);
3464 assert(it->valid());
3465 string db_key = it->raw_key().second;
3466 string user_key;
3467 decode_omap_key(db_key, &user_key);
3468 return user_key;
3469}
3470
3471bufferlist BlueStore::OmapIteratorImpl::value()
3472{
3473 RWLock::RLocker l(c->lock);
3474 assert(it->valid());
3475 return it->value();
3476}
3477
3478
3479// =====================================
3480
7c673cae
FG
3481#undef dout_prefix
3482#define dout_prefix *_dout << "bluestore(" << path << ") "
3483
3484
3485static void aio_cb(void *priv, void *priv2)
3486{
3487 BlueStore *store = static_cast<BlueStore*>(priv);
3488 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3489 c->aio_finish(store);
3490}
3491
3492BlueStore::BlueStore(CephContext *cct, const string& path)
3493 : ObjectStore(cct, path),
3494 throttle_bytes(cct, "bluestore_throttle_bytes",
3495 cct->_conf->bluestore_throttle_bytes),
3496 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3497 cct->_conf->bluestore_throttle_bytes +
3498 cct->_conf->bluestore_throttle_deferred_bytes),
181888fb 3499 deferred_finisher(cct, "defered_finisher", "dfin"),
7c673cae 3500 kv_sync_thread(this),
31f18b77 3501 kv_finalize_thread(this),
7c673cae
FG
3502 mempool_thread(this)
3503{
3504 _init_logger();
3505 cct->_conf->add_observer(this);
3506 set_cache_shards(1);
7c673cae
FG
3507}
3508
3509BlueStore::BlueStore(CephContext *cct,
3510 const string& path,
3511 uint64_t _min_alloc_size)
3512 : ObjectStore(cct, path),
3513 throttle_bytes(cct, "bluestore_throttle_bytes",
3514 cct->_conf->bluestore_throttle_bytes),
3515 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3516 cct->_conf->bluestore_throttle_bytes +
3517 cct->_conf->bluestore_throttle_deferred_bytes),
181888fb 3518 deferred_finisher(cct, "defered_finisher", "dfin"),
7c673cae 3519 kv_sync_thread(this),
31f18b77 3520 kv_finalize_thread(this),
7c673cae
FG
3521 min_alloc_size(_min_alloc_size),
3522 min_alloc_size_order(ctz(_min_alloc_size)),
3523 mempool_thread(this)
3524{
3525 _init_logger();
3526 cct->_conf->add_observer(this);
3527 set_cache_shards(1);
7c673cae
FG
3528}
3529
3530BlueStore::~BlueStore()
3531{
3532 for (auto f : finishers) {
3533 delete f;
3534 }
3535 finishers.clear();
3536
3537 cct->_conf->remove_observer(this);
3538 _shutdown_logger();
3539 assert(!mounted);
3540 assert(db == NULL);
3541 assert(bluefs == NULL);
3542 assert(fsid_fd < 0);
3543 assert(path_fd < 0);
3544 for (auto i : cache_shards) {
3545 delete i;
3546 }
3547 cache_shards.clear();
3548}
3549
3550const char **BlueStore::get_tracked_conf_keys() const
3551{
3552 static const char* KEYS[] = {
3553 "bluestore_csum_type",
3554 "bluestore_compression_mode",
3555 "bluestore_compression_algorithm",
3556 "bluestore_compression_min_blob_size",
3557 "bluestore_compression_min_blob_size_ssd",
3558 "bluestore_compression_min_blob_size_hdd",
3559 "bluestore_compression_max_blob_size",
3560 "bluestore_compression_max_blob_size_ssd",
3561 "bluestore_compression_max_blob_size_hdd",
c07f9fc5 3562 "bluestore_compression_required_ratio",
7c673cae
FG
3563 "bluestore_max_alloc_size",
3564 "bluestore_prefer_deferred_size",
181888fb
FG
3565 "bluestore_prefer_deferred_size_hdd",
3566 "bluestore_prefer_deferred_size_ssd",
31f18b77
FG
3567 "bluestore_deferred_batch_ops",
3568 "bluestore_deferred_batch_ops_hdd",
3569 "bluestore_deferred_batch_ops_ssd",
7c673cae
FG
3570 "bluestore_throttle_bytes",
3571 "bluestore_throttle_deferred_bytes",
3572 "bluestore_throttle_cost_per_io_hdd",
3573 "bluestore_throttle_cost_per_io_ssd",
3574 "bluestore_throttle_cost_per_io",
3575 "bluestore_max_blob_size",
3576 "bluestore_max_blob_size_ssd",
3577 "bluestore_max_blob_size_hdd",
3578 NULL
3579 };
3580 return KEYS;
3581}
3582
3583void BlueStore::handle_conf_change(const struct md_config_t *conf,
3584 const std::set<std::string> &changed)
3585{
3586 if (changed.count("bluestore_csum_type")) {
3587 _set_csum();
3588 }
3589 if (changed.count("bluestore_compression_mode") ||
3590 changed.count("bluestore_compression_algorithm") ||
3591 changed.count("bluestore_compression_min_blob_size") ||
3592 changed.count("bluestore_compression_max_blob_size")) {
3593 if (bdev) {
3594 _set_compression();
3595 }
3596 }
3597 if (changed.count("bluestore_max_blob_size") ||
3598 changed.count("bluestore_max_blob_size_ssd") ||
3599 changed.count("bluestore_max_blob_size_hdd")) {
3600 if (bdev) {
3601 // only after startup
3602 _set_blob_size();
3603 }
3604 }
3605 if (changed.count("bluestore_prefer_deferred_size") ||
181888fb
FG
3606 changed.count("bluestore_prefer_deferred_size_hdd") ||
3607 changed.count("bluestore_prefer_deferred_size_ssd") ||
7c673cae
FG
3608 changed.count("bluestore_max_alloc_size") ||
3609 changed.count("bluestore_deferred_batch_ops") ||
3610 changed.count("bluestore_deferred_batch_ops_hdd") ||
3611 changed.count("bluestore_deferred_batch_ops_ssd")) {
3612 if (bdev) {
3613 // only after startup
3614 _set_alloc_sizes();
3615 }
3616 }
3617 if (changed.count("bluestore_throttle_cost_per_io") ||
3618 changed.count("bluestore_throttle_cost_per_io_hdd") ||
3619 changed.count("bluestore_throttle_cost_per_io_ssd")) {
3620 if (bdev) {
3621 _set_throttle_params();
3622 }
3623 }
3624 if (changed.count("bluestore_throttle_bytes")) {
3625 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
3626 throttle_deferred_bytes.reset_max(
3627 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3628 }
3629 if (changed.count("bluestore_throttle_deferred_bytes")) {
3630 throttle_deferred_bytes.reset_max(
3631 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3632 }
3633}
3634
3635void BlueStore::_set_compression()
3636{
224ce89b
WB
3637 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
3638 if (m) {
3639 comp_mode = *m;
3640 } else {
3641 derr << __func__ << " unrecognized value '"
3642 << cct->_conf->bluestore_compression_mode
3643 << "' for bluestore_compression_mode, reverting to 'none'"
3644 << dendl;
3645 comp_mode = Compressor::COMP_NONE;
3646 }
3647
3648 compressor = nullptr;
3649
3650 if (comp_mode == Compressor::COMP_NONE) {
3651 dout(10) << __func__ << " compression mode set to 'none', "
 3652 	     << "ignore other compression settings" << dendl;
3653 return;
3654 }
3655
3efd9988
FG
3656 if (cct->_conf->bluestore_compression_min_blob_size) {
3657 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
7c673cae
FG
3658 } else {
3659 assert(bdev);
3660 if (bdev->is_rotational()) {
3661 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
3662 } else {
3663 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
3664 }
3665 }
3666
3667 if (cct->_conf->bluestore_compression_max_blob_size) {
3668 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
3669 } else {
3670 assert(bdev);
3671 if (bdev->is_rotational()) {
3672 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
3673 } else {
3674 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
3675 }
3676 }
3677
7c673cae
FG
3678 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
3679 if (!alg_name.empty()) {
3680 compressor = Compressor::create(cct, alg_name);
3681 if (!compressor) {
3682 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
3683 << dendl;
3684 }
3685 }
3686
3687 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
3688 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
3689 << dendl;
3690}
3691
3692void BlueStore::_set_csum()
3693{
3694 csum_type = Checksummer::CSUM_NONE;
3695 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
3696 if (t > Checksummer::CSUM_NONE)
3697 csum_type = t;
3698
3699 dout(10) << __func__ << " csum_type "
3700 << Checksummer::get_csum_type_string(csum_type)
3701 << dendl;
3702}
3703
3704void BlueStore::_set_throttle_params()
3705{
3706 if (cct->_conf->bluestore_throttle_cost_per_io) {
3707 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
3708 } else {
3709 assert(bdev);
3710 if (bdev->is_rotational()) {
3711 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
3712 } else {
3713 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
3714 }
3715 }
3716
3717 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
3718 << dendl;
3719}
3720void BlueStore::_set_blob_size()
3721{
3722 if (cct->_conf->bluestore_max_blob_size) {
3723 max_blob_size = cct->_conf->bluestore_max_blob_size;
3724 } else {
3725 assert(bdev);
3726 if (bdev->is_rotational()) {
3727 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
3728 } else {
3729 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
3730 }
3731 }
3732 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
3733 << std::dec << dendl;
3734}
3735
1adf2230
AA
3736void BlueStore::_set_finisher_num()
3737{
3738 if (cct->_conf->bluestore_shard_finishers) {
3739 if (cct->_conf->osd_op_num_shards) {
3740 m_finisher_num = cct->_conf->osd_op_num_shards;
3741 } else {
3742 assert(bdev);
3743 if (bdev->is_rotational()) {
3744 m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
3745 } else {
3746 m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
3747 }
3748 }
3749 }
3750 assert(m_finisher_num != 0);
3751}
3752
31f18b77
FG
3753int BlueStore::_set_cache_sizes()
3754{
224ce89b
WB
3755 assert(bdev);
3756 if (cct->_conf->bluestore_cache_size) {
3757 cache_size = cct->_conf->bluestore_cache_size;
3758 } else {
3759 // choose global cache size based on backend type
3760 if (bdev->is_rotational()) {
3761 cache_size = cct->_conf->bluestore_cache_size_hdd;
3762 } else {
3763 cache_size = cct->_conf->bluestore_cache_size_ssd;
3764 }
3765 }
31f18b77
FG
3766 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
3767 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
224ce89b
WB
3768
3769 double cache_kv_max = cct->_conf->bluestore_cache_kv_max;
3770 double cache_kv_max_ratio = 0;
3771
3772 // if cache_kv_max is negative, disable it
3773 if (cache_size > 0 && cache_kv_max >= 0) {
3774 cache_kv_max_ratio = (double) cache_kv_max / (double) cache_size;
3775 if (cache_kv_max_ratio < 1.0 && cache_kv_max_ratio < cache_kv_ratio) {
3776 dout(1) << __func__ << " max " << cache_kv_max_ratio
3777 << " < ratio " << cache_kv_ratio
3778 << dendl;
3779 cache_meta_ratio = cache_meta_ratio + cache_kv_ratio - cache_kv_max_ratio;
3780 cache_kv_ratio = cache_kv_max_ratio;
3781 }
3782 }
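// Illustrative example (numbers assumed, not taken from this build's defaults):
// with cache_size = 1 GiB, cache_kv_max = 512 MiB and cache_kv_ratio = 0.99,
// cache_kv_max_ratio works out to 0.5 < 0.99, so cache_kv_ratio is clamped to
// 0.5 and the surplus 0.49 is folded into cache_meta_ratio; cache_data_ratio
// below then gets whatever remains.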
3783
31f18b77
FG
3784 cache_data_ratio =
3785 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
3786
224ce89b 3787 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
d2e6a577 3788 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
224ce89b 3789 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
3790 return -EINVAL;
3791 }
224ce89b 3792 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
d2e6a577 3793 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
224ce89b 3794 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
3795 return -EINVAL;
3796 }
3797 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
d2e6a577 3798 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
31f18b77
FG
3799 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
3800 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
3801 << dendl;
3802 return -EINVAL;
3803 }
3804 if (cache_data_ratio < 0) {
3805 // deal with floating point imprecision
3806 cache_data_ratio = 0;
3807 }
224ce89b
WB
3808 dout(1) << __func__ << " cache_size " << cache_size
3809 << " meta " << cache_meta_ratio
31f18b77
FG
3810 << " kv " << cache_kv_ratio
3811 << " data " << cache_data_ratio
3812 << dendl;
3813 return 0;
3814}
3815
3efd9988
FG
3816int BlueStore::write_meta(const std::string& key, const std::string& value)
3817{
3818 bluestore_bdev_label_t label;
3819 string p = path + "/block";
3820 int r = _read_bdev_label(cct, p, &label);
3821 if (r < 0) {
3822 return ObjectStore::write_meta(key, value);
3823 }
3824 label.meta[key] = value;
3825 r = _write_bdev_label(cct, p, label);
3826 assert(r == 0);
3827 return ObjectStore::write_meta(key, value);
3828}
3829
3830int BlueStore::read_meta(const std::string& key, std::string *value)
3831{
3832 bluestore_bdev_label_t label;
3833 string p = path + "/block";
3834 int r = _read_bdev_label(cct, p, &label);
3835 if (r < 0) {
3836 return ObjectStore::read_meta(key, value);
3837 }
3838 auto i = label.meta.find(key);
3839 if (i == label.meta.end()) {
3840 return ObjectStore::read_meta(key, value);
3841 }
3842 *value = i->second;
3843 return 0;
3844}
3845
7c673cae
FG
3846void BlueStore::_init_logger()
3847{
3848 PerfCountersBuilder b(cct, "bluestore",
3849 l_bluestore_first, l_bluestore_last);
3850 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
3851 "Average kv_thread flush latency",
3852 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
3853 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
3854 "Average kv_thread commit latency");
3855 b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
3856 "Average kv_thread sync latency",
3857 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
3858 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
3859 "Average prepare state latency");
3860 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
3861 "Average aio_wait state latency",
3862 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
3863 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
3864 "Average io_done state latency");
3865 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
3866 "Average kv_queued state latency");
3867 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
3868 "Average kv_commiting state latency");
3869 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
3870 "Average kv_done state latency");
3871 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
3872 "Average deferred_queued state latency");
3873 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
3874 "Average aio_wait state latency");
3875 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
3876 "Average cleanup state latency");
3877 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
3878 "Average finishing state latency");
3879 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
3880 "Average done state latency");
3881 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
3882 "Average submit throttle latency",
3883 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
3884 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
3885 "Average submit latency",
3886 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
3887 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
3888 "Average commit latency",
3889 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
3890 b.add_time_avg(l_bluestore_read_lat, "read_lat",
3891 "Average read latency",
3892 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
3893 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
3894 "Average read onode metadata latency");
3895 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
3896 "Average read latency");
3897 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
3898 "Average compress latency");
3899 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
3900 "Average decompress latency");
3901 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
3902 "Average checksum latency");
3903 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
3904 "Sum for beneficial compress ops");
3905 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
3906 "Sum for compress ops rejected due to low net gain of space");
3907 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
1adf2230 3908 "Sum for write-op padded bytes", NULL, 0, unit_t(BYTES));
7c673cae
FG
3909 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
3910 "Sum for deferred write op");
3911 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
1adf2230 3912 "Sum for deferred write bytes", "def", 0, unit_t(BYTES));
7c673cae
FG
3913 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
3914 "Sum for write penalty read ops");
3915 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
3916 "Sum for allocated bytes");
3917 b.add_u64(l_bluestore_stored, "bluestore_stored",
3918 "Sum for stored bytes");
3919 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
3920 "Sum for stored compressed bytes");
3921 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
3922 "Sum for bytes allocated for compressed data");
3923 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
3924 "Sum for original bytes that were compressed");
3925
3926 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
3927 "Number of onodes in cache");
3928 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
3929 "Sum for onode-lookups hit in the cache");
3930 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
3931 "Sum for onode-lookups missed in the cache");
3932 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
3933 "Sum for onode-shard lookups hit in the cache");
3934 b.add_u64_counter(l_bluestore_onode_shard_misses,
3935 "bluestore_onode_shard_misses",
3936 "Sum for onode-shard lookups missed in the cache");
3937 b.add_u64(l_bluestore_extents, "bluestore_extents",
3938 "Number of extents in cache");
3939 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
3940 "Number of blobs in cache");
3941 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
3942 "Number of buffers in cache");
3943 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
1adf2230 3944 "Number of buffer bytes in cache", NULL, 0, unit_t(BYTES));
7c673cae 3945 b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
1adf2230 3946 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(BYTES));
7c673cae 3947 b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
1adf2230 3948 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(BYTES));
7c673cae
FG
3949
3950 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
3951 "Large aligned writes into fresh blobs");
3952 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
1adf2230 3953 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(BYTES));
7c673cae
FG
3954 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
3955 "Large aligned writes into fresh blobs (blobs)");
3956 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
3957 "Small writes into existing or sparse small blobs");
3958 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
1adf2230 3959 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(BYTES));
7c673cae
FG
3960 b.add_u64_counter(l_bluestore_write_small_unused,
3961 "bluestore_write_small_unused",
3962 "Small writes into unused portion of existing blob");
3963 b.add_u64_counter(l_bluestore_write_small_deferred,
3964 "bluestore_write_small_deferred",
3965 "Small overwrites using deferred");
3966 b.add_u64_counter(l_bluestore_write_small_pre_read,
3967 "bluestore_write_small_pre_read",
3968 "Small writes that required we read some data (possibly "
3969 "cached) to fill out the block");
3970 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
3971 "Small write into new (sparse) blob");
3972
3973 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
3974 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
3975 "Onode extent map reshard events");
3976 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
3977 "Sum for blob splitting due to resharding");
3978 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
3979 "Sum for extents that have been removed due to compression");
3980 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
3981 "Sum for extents that have been merged due to garbage "
3982 "collection");
b32b8144
FG
3983 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
3984 "Read EIO errors propagated to high level callers");
7c673cae
FG
3985 logger = b.create_perf_counters();
3986 cct->get_perfcounters_collection()->add(logger);
3987}
3988
3989int BlueStore::_reload_logger()
3990{
3991 struct store_statfs_t store_statfs;
3992
3993 int r = statfs(&store_statfs);
3994 if(r >= 0) {
3995 logger->set(l_bluestore_allocated, store_statfs.allocated);
3996 logger->set(l_bluestore_stored, store_statfs.stored);
3997 logger->set(l_bluestore_compressed, store_statfs.compressed);
3998 logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
3999 logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
4000 }
4001 return r;
4002}
4003
4004void BlueStore::_shutdown_logger()
4005{
4006 cct->get_perfcounters_collection()->remove(logger);
4007 delete logger;
4008}
4009
4010int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
4011 uuid_d *fsid)
4012{
4013 bluestore_bdev_label_t label;
4014 int r = _read_bdev_label(cct, path, &label);
4015 if (r < 0)
4016 return r;
4017 *fsid = label.osd_uuid;
4018 return 0;
4019}
4020
4021int BlueStore::_open_path()
4022{
b32b8144
FG
4023 // sanity check(s)
4024 if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
4025 4*1024*1024*1024ull) {
4026 derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has a hard limit of 4GB." << dendl;
4027 return -EINVAL;
4028 }
7c673cae 4029 assert(path_fd < 0);
224ce89b 4030 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY));
7c673cae
FG
4031 if (path_fd < 0) {
4032 int r = -errno;
4033 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
4034 << dendl;
4035 return r;
4036 }
4037 return 0;
4038}
4039
4040void BlueStore::_close_path()
4041{
4042 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
4043 path_fd = -1;
4044}
4045
3efd9988
FG
4046int BlueStore::_write_bdev_label(CephContext *cct,
4047 string path, bluestore_bdev_label_t label)
7c673cae
FG
4048{
4049 dout(10) << __func__ << " path " << path << " label " << label << dendl;
4050 bufferlist bl;
4051 ::encode(label, bl);
4052 uint32_t crc = bl.crc32c(-1);
4053 ::encode(crc, bl);
4054 assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
4055 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
4056 z.zero();
4057 bl.append(std::move(z));
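// Sketch of the resulting label block: [encoded bluestore_bdev_label_t]
// [crc32c over the encoded label][zero padding] up to BDEV_LABEL_BLOCK_SIZE,
// which is exactly what _read_bdev_label() below decodes and verifies.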
4058
224ce89b 4059 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY));
7c673cae
FG
4060 if (fd < 0) {
4061 fd = -errno;
4062 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4063 << dendl;
4064 return fd;
4065 }
4066 int r = bl.write_fd(fd);
4067 if (r < 0) {
4068 derr << __func__ << " failed to write to " << path
4069 << ": " << cpp_strerror(r) << dendl;
4070 }
3efd9988
FG
4071 r = ::fsync(fd);
4072 if (r < 0) {
4073 derr << __func__ << " failed to fsync " << path
4074 << ": " << cpp_strerror(r) << dendl;
4075 }
7c673cae
FG
4076 VOID_TEMP_FAILURE_RETRY(::close(fd));
4077 return r;
4078}
4079
4080int BlueStore::_read_bdev_label(CephContext* cct, string path,
4081 bluestore_bdev_label_t *label)
4082{
4083 dout(10) << __func__ << dendl;
224ce89b 4084 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY));
7c673cae
FG
4085 if (fd < 0) {
4086 fd = -errno;
4087 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4088 << dendl;
4089 return fd;
4090 }
4091 bufferlist bl;
4092 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4093 VOID_TEMP_FAILURE_RETRY(::close(fd));
4094 if (r < 0) {
4095 derr << __func__ << " failed to read from " << path
4096 << ": " << cpp_strerror(r) << dendl;
4097 return r;
4098 }
4099
4100 uint32_t crc, expected_crc;
4101 bufferlist::iterator p = bl.begin();
4102 try {
4103 ::decode(*label, p);
4104 bufferlist t;
4105 t.substr_of(bl, 0, p.get_off());
4106 crc = t.crc32c(-1);
4107 ::decode(expected_crc, p);
4108 }
4109 catch (buffer::error& e) {
b32b8144 4110 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
7c673cae
FG
4111 << ": " << e.what()
4112 << dendl;
b32b8144 4113 return -ENOENT;
7c673cae
FG
4114 }
4115 if (crc != expected_crc) {
4116 derr << __func__ << " bad crc on label, expected " << expected_crc
4117 << " != actual " << crc << dendl;
4118 return -EIO;
4119 }
4120 dout(10) << __func__ << " got " << *label << dendl;
4121 return 0;
4122}
4123
4124int BlueStore::_check_or_set_bdev_label(
4125 string path, uint64_t size, string desc, bool create)
4126{
4127 bluestore_bdev_label_t label;
4128 if (create) {
4129 label.osd_uuid = fsid;
4130 label.size = size;
4131 label.btime = ceph_clock_now();
4132 label.description = desc;
3efd9988 4133 int r = _write_bdev_label(cct, path, label);
7c673cae
FG
4134 if (r < 0)
4135 return r;
4136 } else {
4137 int r = _read_bdev_label(cct, path, &label);
4138 if (r < 0)
4139 return r;
31f18b77
FG
4140 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4141 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4142 << " and fsid " << fsid << " check bypassed" << dendl;
4143 }
4144 else if (label.osd_uuid != fsid) {
7c673cae
FG
4145 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4146 << " does not match our fsid " << fsid << dendl;
4147 return -EIO;
4148 }
4149 }
4150 return 0;
4151}
4152
4153void BlueStore::_set_alloc_sizes(void)
4154{
7c673cae
FG
4155 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4156
4157 if (cct->_conf->bluestore_prefer_deferred_size) {
4158 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4159 } else {
4160 assert(bdev);
4161 if (bdev->is_rotational()) {
4162 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4163 } else {
4164 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4165 }
4166 }
4167
4168 if (cct->_conf->bluestore_deferred_batch_ops) {
4169 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4170 } else {
4171 assert(bdev);
4172 if (bdev->is_rotational()) {
4173 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4174 } else {
4175 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4176 }
4177 }
4178
4179 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
4180 << std::dec << " order " << min_alloc_size_order
4181 << " max_alloc_size 0x" << std::hex << max_alloc_size
4182 << " prefer_deferred_size 0x" << prefer_deferred_size
4183 << std::dec
4184 << " deferred_batch_ops " << deferred_batch_ops
4185 << dendl;
4186}
4187
4188int BlueStore::_open_bdev(bool create)
4189{
4190 assert(bdev == NULL);
4191 string p = path + "/block";
4192 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
4193 int r = bdev->open(p);
4194 if (r < 0)
4195 goto fail;
4196
4197 if (bdev->supported_bdev_label()) {
4198 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4199 if (r < 0)
4200 goto fail_close;
4201 }
4202
4203 // initialize global block parameters
4204 block_size = bdev->get_block_size();
4205 block_mask = ~(block_size - 1);
4206 block_size_order = ctz(block_size);
4207 assert(block_size == 1u << block_size_order);
224ce89b
WB
4208 // and set cache_size based on device type
4209 r = _set_cache_sizes();
4210 if (r < 0) {
4211 goto fail_close;
4212 }
7c673cae
FG
4213 return 0;
4214
4215 fail_close:
4216 bdev->close();
4217 fail:
4218 delete bdev;
4219 bdev = NULL;
4220 return r;
4221}
4222
4223void BlueStore::_close_bdev()
4224{
4225 assert(bdev);
4226 bdev->close();
4227 delete bdev;
4228 bdev = NULL;
4229}
4230
4231int BlueStore::_open_fm(bool create)
4232{
4233 assert(fm == NULL);
4234 fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
4235
4236 if (create) {
4237 // initialize freespace
4238 dout(20) << __func__ << " initializing freespace" << dendl;
4239 KeyValueDB::Transaction t = db->get_transaction();
4240 {
4241 bufferlist bl;
4242 bl.append(freelist_type);
4243 t->set(PREFIX_SUPER, "freelist_type", bl);
4244 }
b32b8144
FG
4245 // being able to allocate in units less than bdev block size
4246 // seems to be a bad idea.
4247 assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
4248 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
7c673cae
FG
4249
4250 // allocate superblock reserved space. note that we do not mark
4251 // bluefs space as allocated in the freelist; we instead rely on
4252 // bluefs_extents.
3efd9988
FG
4253 uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
4254 min_alloc_size);
4255 fm->allocate(0, reserved, t);
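// Illustrative: with the 8k SUPER_RESERVED and an assumed 64 KiB
// min_alloc_size, reserved rounds up to 64 KiB, i.e. the first allocation
// unit of the device is marked used in the freelist.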
7c673cae 4256
7c673cae
FG
4257 if (cct->_conf->bluestore_bluefs) {
4258 assert(bluefs_extents.num_intervals() == 1);
4259 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
3efd9988 4260 reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
7c673cae
FG
4261 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4262 << " for bluefs" << dendl;
4263 bufferlist bl;
4264 ::encode(bluefs_extents, bl);
4265 t->set(PREFIX_SUPER, "bluefs_extents", bl);
4266 dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
4267 << std::dec << dendl;
7c673cae
FG
4268 }
4269
4270 if (cct->_conf->bluestore_debug_prefill > 0) {
4271 uint64_t end = bdev->get_size() - reserved;
4272 dout(1) << __func__ << " pre-fragmenting freespace, using "
4273 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4274 << cct->_conf->bluestore_debug_prefragment_max << dendl;
4275 uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
4276 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4277 float r = cct->_conf->bluestore_debug_prefill;
4278 r /= 1.0 - r;
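// Why r /= (1 - r): if the target is that a fraction f of the space ends up
// allocated, then for each free run of length l we must mark roughly
// u = l * f / (1 - f) as used, so that u / (l + u) ~= f.  Illustrative:
// f = 0.2 gives r = 0.25, i.e. one byte used for every four left free.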
4279 bool stop = false;
4280
4281 while (!stop && start < end) {
4282 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4283 if (start + l > end) {
4284 l = end - start;
4285 l = P2ALIGN(l, min_alloc_size);
4286 }
4287 assert(start + l <= end);
4288
4289 uint64_t u = 1 + (uint64_t)(r * (double)l);
4290 u = P2ROUNDUP(u, min_alloc_size);
4291 if (start + l + u > end) {
4292 u = end - (start + l);
4293 // trim to align so we don't overflow again
4294 u = P2ALIGN(u, min_alloc_size);
4295 stop = true;
4296 }
4297 assert(start + l + u <= end);
4298
4299 dout(20) << " free 0x" << std::hex << start << "~" << l
4300 << " use 0x" << u << std::dec << dendl;
4301
4302 if (u == 0) {
4303 // break if u has been trimmed to nothing
4304 break;
4305 }
4306
4307 fm->allocate(start + l, u, t);
4308 start += l + u;
4309 }
4310 }
4311 db->submit_transaction_sync(t);
4312 }
4313
3efd9988 4314 int r = fm->init(bdev->get_size());
7c673cae
FG
4315 if (r < 0) {
4316 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4317 delete fm;
4318 fm = NULL;
4319 return r;
4320 }
4321 return 0;
4322}
4323
4324void BlueStore::_close_fm()
4325{
4326 dout(10) << __func__ << dendl;
4327 assert(fm);
4328 fm->shutdown();
4329 delete fm;
4330 fm = NULL;
4331}
4332
4333int BlueStore::_open_alloc()
4334{
4335 assert(alloc == NULL);
4336 assert(bdev->get_size());
4337 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4338 bdev->get_size(),
4339 min_alloc_size);
4340 if (!alloc) {
4341 lderr(cct) << __func__ << " unknown allocator type "
4342 << cct->_conf->bluestore_allocator
4343 << dendl;
4344 return -EINVAL;
4345 }
4346
4347 uint64_t num = 0, bytes = 0;
4348
4349 dout(1) << __func__ << " opening allocation metadata" << dendl;
4350 // initialize from freelist
4351 fm->enumerate_reset();
4352 uint64_t offset, length;
4353 while (fm->enumerate_next(&offset, &length)) {
4354 alloc->init_add_free(offset, length);
4355 ++num;
4356 bytes += length;
4357 }
224ce89b 4358 fm->enumerate_reset();
1adf2230 4359 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
7c673cae
FG
4360 << " in " << num << " extents"
4361 << dendl;
4362
4363 // also mark bluefs space as allocated
4364 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4365 alloc->init_rm_free(e.get_start(), e.get_len());
4366 }
4367 dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
4368 << bluefs_extents << std::dec << " as allocated" << dendl;
4369
4370 return 0;
4371}
4372
4373void BlueStore::_close_alloc()
4374{
4375 assert(alloc);
4376 alloc->shutdown();
4377 delete alloc;
4378 alloc = NULL;
4379}
4380
4381int BlueStore::_open_fsid(bool create)
4382{
4383 assert(fsid_fd < 0);
4384 int flags = O_RDWR;
4385 if (create)
4386 flags |= O_CREAT;
4387 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4388 if (fsid_fd < 0) {
4389 int err = -errno;
4390 derr << __func__ << " " << cpp_strerror(err) << dendl;
4391 return err;
4392 }
4393 return 0;
4394}
4395
4396int BlueStore::_read_fsid(uuid_d *uuid)
4397{
4398 char fsid_str[40];
4399 memset(fsid_str, 0, sizeof(fsid_str));
4400 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4401 if (ret < 0) {
4402 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4403 return ret;
4404 }
4405 if (ret > 36)
4406 fsid_str[36] = 0;
4407 else
4408 fsid_str[ret] = 0;
4409 if (!uuid->parse(fsid_str)) {
4410 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4411 return -EINVAL;
4412 }
4413 return 0;
4414}
4415
4416int BlueStore::_write_fsid()
4417{
4418 int r = ::ftruncate(fsid_fd, 0);
4419 if (r < 0) {
4420 r = -errno;
4421 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4422 return r;
4423 }
4424 string str = stringify(fsid) + "\n";
4425 r = safe_write(fsid_fd, str.c_str(), str.length());
4426 if (r < 0) {
4427 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4428 return r;
4429 }
4430 r = ::fsync(fsid_fd);
4431 if (r < 0) {
4432 r = -errno;
4433 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4434 return r;
4435 }
4436 return 0;
4437}
4438
4439void BlueStore::_close_fsid()
4440{
4441 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4442 fsid_fd = -1;
4443}
4444
4445int BlueStore::_lock_fsid()
4446{
4447 struct flock l;
4448 memset(&l, 0, sizeof(l));
4449 l.l_type = F_WRLCK;
4450 l.l_whence = SEEK_SET;
4451 int r = ::fcntl(fsid_fd, F_SETLK, &l);
4452 if (r < 0) {
4453 int err = errno;
4454 derr << __func__ << " failed to lock " << path << "/fsid"
4455 << " (is another ceph-osd still running?)"
4456 << cpp_strerror(err) << dendl;
4457 return -err;
4458 }
4459 return 0;
4460}
4461
31f18b77
FG
4462bool BlueStore::is_rotational()
4463{
4464 if (bdev) {
4465 return bdev->is_rotational();
4466 }
4467
4468 bool rotational = true;
4469 int r = _open_path();
4470 if (r < 0)
4471 goto out;
4472 r = _open_fsid(false);
4473 if (r < 0)
4474 goto out_path;
4475 r = _read_fsid(&fsid);
4476 if (r < 0)
4477 goto out_fsid;
4478 r = _lock_fsid();
4479 if (r < 0)
4480 goto out_fsid;
4481 r = _open_bdev(false);
4482 if (r < 0)
4483 goto out_fsid;
4484 rotational = bdev->is_rotational();
4485 _close_bdev();
4486 out_fsid:
4487 _close_fsid();
4488 out_path:
4489 _close_path();
4490 out:
4491 return rotational;
4492}
4493
d2e6a577
FG
4494bool BlueStore::is_journal_rotational()
4495{
4496 if (!bluefs) {
4497 dout(5) << __func__ << " bluefs disabled, defaulting to store media type"
4498 << dendl;
4499 return is_rotational();
4500 }
4501 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
4502 return bluefs->wal_is_rotational();
4503}
4504
7c673cae
FG
4505bool BlueStore::test_mount_in_use()
4506{
4507 // most error conditions mean the mount is not in use (e.g., because
4508 // it doesn't exist). only if we fail to lock do we conclude it is
4509 // in use.
4510 bool ret = false;
4511 int r = _open_path();
4512 if (r < 0)
4513 return false;
4514 r = _open_fsid(false);
4515 if (r < 0)
4516 goto out_path;
4517 r = _lock_fsid();
4518 if (r < 0)
4519 ret = true; // if we can't lock, it is in use
4520 _close_fsid();
4521 out_path:
4522 _close_path();
4523 return ret;
4524}
4525
4526int BlueStore::_open_db(bool create)
4527{
4528 int r;
4529 assert(!db);
4530 string fn = path + "/db";
4531 string options;
4532 stringstream err;
4533 ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
4534
4535 string kv_backend;
4536 if (create) {
4537 kv_backend = cct->_conf->bluestore_kvbackend;
4538 } else {
4539 r = read_meta("kv_backend", &kv_backend);
4540 if (r < 0) {
4541 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
4542 return -EIO;
4543 }
4544 }
4545 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
4546
4547 bool do_bluefs;
4548 if (create) {
4549 do_bluefs = cct->_conf->bluestore_bluefs;
4550 } else {
4551 string s;
4552 r = read_meta("bluefs", &s);
4553 if (r < 0) {
4554 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
4555 return -EIO;
4556 }
4557 if (s == "1") {
4558 do_bluefs = true;
4559 } else if (s == "0") {
4560 do_bluefs = false;
4561 } else {
4562 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
4563 << dendl;
4564 return -EIO;
4565 }
4566 }
4567 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
4568
4569 rocksdb::Env *env = NULL;
4570 if (do_bluefs) {
4571 dout(10) << __func__ << " initializing bluefs" << dendl;
4572 if (kv_backend != "rocksdb") {
4573 derr << " backend must be rocksdb to use bluefs" << dendl;
4574 return -EINVAL;
4575 }
4576 bluefs = new BlueFS(cct);
4577
4578 string bfn;
4579 struct stat st;
4580
28e407b8 4581 bfn = path + "/block.db";
7c673cae
FG
4582 if (::stat(bfn.c_str(), &st) == 0) {
4583 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
4584 if (r < 0) {
4585 derr << __func__ << " add block device(" << bfn << ") returned: "
4586 << cpp_strerror(r) << dendl;
4587 goto free_bluefs;
4588 }
4589
4590 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
4591 r = _check_or_set_bdev_label(
4592 bfn,
4593 bluefs->get_block_device_size(BlueFS::BDEV_DB),
4594 "bluefs db", create);
4595 if (r < 0) {
4596 derr << __func__
4597 << " check block device(" << bfn << ") label returned: "
4598 << cpp_strerror(r) << dendl;
4599 goto free_bluefs;
4600 }
4601 }
4602 if (create) {
4603 bluefs->add_block_extent(
4604 BlueFS::BDEV_DB,
4605 SUPER_RESERVED,
4606 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
4607 }
4608 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
4609 bluefs_single_shared_device = false;
31f18b77 4610 } else {
31f18b77 4611 r = -errno;
28e407b8
AA
4612 if (::lstat(bfn.c_str(), &st) == -1) {
4613 r = 0;
4614 bluefs_shared_bdev = BlueFS::BDEV_DB;
4615 } else {
4616 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
4617 << cpp_strerror(r) << dendl;
4618 goto free_bluefs;
4619 }
7c673cae
FG
4620 }
4621
4622 // shared device
28e407b8 4623 bfn = path + "/block";
7c673cae
FG
4624 r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
4625 if (r < 0) {
4626 derr << __func__ << " add block device(" << bfn << ") returned: "
4627 << cpp_strerror(r) << dendl;
4628 goto free_bluefs;
4629 }
4630 if (create) {
4631 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4632 uint64_t initial =
4633 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
4634 cct->_conf->bluestore_bluefs_gift_ratio);
4635 initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
3efd9988
FG
4636 if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
4637 derr << __func__ << " bluefs_alloc_size 0x" << std::hex
4638 << cct->_conf->bluefs_alloc_size << " is not a multiple of "
4639 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4640 r = -EINVAL;
4641 goto free_bluefs;
4642 }
7c673cae
FG
4643 // align to bluefs's alloc_size
4644 initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
31f18b77
FG
4645 // put bluefs in the middle of the device in case it is an HDD
4646 uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
4647 cct->_conf->bluefs_alloc_size);
4648 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
4649 bluefs_extents.insert(start, initial);
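// Illustrative sizing (assumed values): for a 1 TiB main device with
// min_ratio + gift_ratio summing to 0.02, initial is about 20 GiB; after
// rounding to bluefs_alloc_size, start lands near (1 TiB - 20 GiB) / 2, so
// the shared BlueFS extent sits mid-device to keep seeks short on HDDs.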
7c673cae
FG
4650 }
4651
28e407b8 4652 bfn = path + "/block.wal";
7c673cae
FG
4653 if (::stat(bfn.c_str(), &st) == 0) {
4654 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
4655 if (r < 0) {
4656 derr << __func__ << " add block device(" << bfn << ") returned: "
4657 << cpp_strerror(r) << dendl;
4658 goto free_bluefs;
4659 }
4660
4661 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
4662 r = _check_or_set_bdev_label(
4663 bfn,
4664 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
4665 "bluefs wal", create);
4666 if (r < 0) {
4667 derr << __func__ << " check block device(" << bfn
4668 << ") label returned: " << cpp_strerror(r) << dendl;
4669 goto free_bluefs;
4670 }
4671 }
4672
4673 if (create) {
4674 bluefs->add_block_extent(
4675 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
4676 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
4677 BDEV_LABEL_BLOCK_SIZE);
4678 }
4679 cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
4680 bluefs_single_shared_device = false;
31f18b77 4681 } else {
31f18b77 4682 r = -errno;
28e407b8
AA
4683 if (::lstat(bfn.c_str(), &st) == -1) {
4684 r = 0;
4685 cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
4686 } else {
4687 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
4688 << cpp_strerror(r) << dendl;
4689 goto free_bluefs;
4690 }
7c673cae
FG
4691 }
4692
4693 if (create) {
4694 bluefs->mkfs(fsid);
4695 }
4696 r = bluefs->mount();
4697 if (r < 0) {
4698 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
4699 goto free_bluefs;
4700 }
4701 if (cct->_conf->bluestore_bluefs_env_mirror) {
4702 rocksdb::Env *a = new BlueRocksEnv(bluefs);
4703 rocksdb::Env *b = rocksdb::Env::Default();
4704 if (create) {
4705 string cmd = "rm -rf " + path + "/db " +
4706 path + "/db.slow " +
4707 path + "/db.wal";
4708 int r = system(cmd.c_str());
4709 (void)r;
4710 }
4711 env = new rocksdb::EnvMirror(b, a, false, true);
4712 } else {
4713 env = new BlueRocksEnv(bluefs);
4714
4715 // simplify the dir names, too, as "seen" by rocksdb
4716 fn = "db";
4717 }
4718
4719 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
4720 // we have both block.db and block; tell rocksdb!
4721 // note: the second (last) size value doesn't really matter
4722 ostringstream db_paths;
4723 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
4724 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
4725 db_paths << fn << ","
4726 << (uint64_t)(db_size * 95 / 100) << " "
4727 << fn + ".slow" << ","
4728 << (uint64_t)(slow_size * 95 / 100);
4729 cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
4730 dout(10) << __func__ << " set rocksdb_db_paths to "
4731 << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
4732 }
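// Illustrative outcome (sizes assumed): with a 10 GiB block.db the string
// becomes "db,<95% of 10 GiB> db.slow,<95% of slow size>", telling rocksdb
// to fill the fast tier first and spill the remainder to the slow device.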
4733
4734 if (create) {
4735 env->CreateDir(fn);
4736 if (cct->_conf->rocksdb_separate_wal_dir)
4737 env->CreateDir(fn + ".wal");
4738 if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
4739 env->CreateDir(fn + ".slow");
4740 }
4741 } else if (create) {
4742 int r = ::mkdir(fn.c_str(), 0755);
4743 if (r < 0)
4744 r = -errno;
4745 if (r < 0 && r != -EEXIST) {
4746 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
4747 << dendl;
4748 return r;
4749 }
4750
4751 // wal_dir, too!
4752 if (cct->_conf->rocksdb_separate_wal_dir) {
4753 string walfn = path + "/db.wal";
4754 r = ::mkdir(walfn.c_str(), 0755);
4755 if (r < 0)
4756 r = -errno;
4757 if (r < 0 && r != -EEXIST) {
4758 derr << __func__ << " failed to create " << walfn
4759 << ": " << cpp_strerror(r)
4760 << dendl;
4761 return r;
4762 }
4763 }
4764 }
4765
4766 db = KeyValueDB::create(cct,
4767 kv_backend,
4768 fn,
4769 static_cast<void*>(env));
4770 if (!db) {
4771 derr << __func__ << " error creating db" << dendl;
4772 if (bluefs) {
4773 bluefs->umount();
4774 delete bluefs;
4775 bluefs = NULL;
4776 }
4777 // delete env manually here since we can't depend on db to do this
4778 // under this case
4779 delete env;
4780 env = NULL;
4781 return -EIO;
4782 }
4783
4784 FreelistManager::setup_merge_operators(db);
4785 db->set_merge_operator(PREFIX_STAT, merge_op);
4786
224ce89b 4787 db->set_cache_size(cache_size * cache_kv_ratio);
31f18b77 4788
7c673cae
FG
4789 if (kv_backend == "rocksdb")
4790 options = cct->_conf->bluestore_rocksdb_options;
4791 db->init(options);
4792 if (create)
4793 r = db->create_and_open(err);
4794 else
4795 r = db->open(err);
4796 if (r) {
4797 derr << __func__ << " erroring opening db: " << err.str() << dendl;
4798 if (bluefs) {
4799 bluefs->umount();
4800 delete bluefs;
4801 bluefs = NULL;
4802 }
4803 delete db;
4804 db = NULL;
4805 return -EIO;
4806 }
4807 dout(1) << __func__ << " opened " << kv_backend
4808 << " path " << fn << " options " << options << dendl;
4809 return 0;
4810
4811free_bluefs:
4812 assert(bluefs);
4813 delete bluefs;
4814 bluefs = NULL;
4815 return r;
4816}
4817
4818void BlueStore::_close_db()
4819{
4820 assert(db);
4821 delete db;
4822 db = NULL;
4823 if (bluefs) {
4824 bluefs->umount();
4825 delete bluefs;
4826 bluefs = NULL;
4827 }
4828}
4829
4830int BlueStore::_reconcile_bluefs_freespace()
4831{
4832 dout(10) << __func__ << dendl;
4833 interval_set<uint64_t> bset;
4834 int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
4835 assert(r == 0);
4836 if (bset == bluefs_extents) {
4837 dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
4838 << std::dec << dendl;
4839 return 0;
4840 }
4841 dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
4842 << dendl;
4843 dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
4844 << std::dec << dendl;
4845
4846 interval_set<uint64_t> overlap;
4847 overlap.intersection_of(bset, bluefs_extents);
4848
4849 bset.subtract(overlap);
4850 if (!bset.empty()) {
4851 derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
4852 << dendl;
4853 return -EIO;
4854 }
4855
4856 interval_set<uint64_t> super_extra;
4857 super_extra = bluefs_extents;
4858 super_extra.subtract(overlap);
4859 if (!super_extra.empty()) {
4860 // This is normal: it can happen if we commit to give extents to
4861 // bluefs and we crash before bluefs commits that it owns them.
4862 dout(10) << __func__ << " super extra " << super_extra << dendl;
4863 for (interval_set<uint64_t>::iterator p = super_extra.begin();
4864 p != super_extra.end();
4865 ++p) {
4866 bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
4867 }
4868 }
4869
4870 return 0;
4871}
4872
4873int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
4874{
4875 int ret = 0;
4876 assert(bluefs);
4877
4878 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
4879 bluefs->get_usage(&bluefs_usage);
4880 assert(bluefs_usage.size() > bluefs_shared_bdev);
4881
4882 // fixme: look at primary bdev only for now
4883 uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
4884 uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
4885 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
4886
4887 uint64_t my_free = alloc->get_free();
4888 uint64_t total = bdev->get_size();
4889 float my_free_ratio = (float)my_free / (float)total;
4890
4891 uint64_t total_free = bluefs_free + my_free;
4892
4893 float bluefs_ratio = (float)bluefs_free / (float)total_free;
4894
4895 dout(10) << __func__
1adf2230 4896 << " bluefs " << byte_u_t(bluefs_free)
7c673cae 4897 << " free (" << bluefs_free_ratio
1adf2230 4898 << ") bluestore " << byte_u_t(my_free)
7c673cae
FG
4899 << " free (" << my_free_ratio
4900 << "), bluefs_ratio " << bluefs_ratio
4901 << dendl;
4902
4903 uint64_t gift = 0;
4904 uint64_t reclaim = 0;
4905 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
4906 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
4907 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4908 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
1adf2230 4909 << ", should gift " << byte_u_t(gift) << dendl;
7c673cae
FG
4910 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
4911 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
4912 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
4913 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
4914 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4915 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
1adf2230 4916 << ", should reclaim " << byte_u_t(reclaim) << dendl;
7c673cae 4917 }
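// Worked example (ratios assumed): with total_free = 100 GiB and
// bluefs_free = 1 GiB, bluefs_ratio = 0.01; if that is below min_ratio we
// gift gift_ratio * total_free to BlueFS, and if it were above max_ratio we
// would instead reclaim reclaim_ratio * total_free, floored so BlueFS keeps
// at least bluestore_bluefs_min.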
3efd9988
FG
4918
4919 // don't take over too much of the freespace
4920 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
7c673cae 4921 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
3efd9988 4922 cct->_conf->bluestore_bluefs_min < free_cap) {
7c673cae
FG
4923 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
4924 dout(10) << __func__ << " bluefs_total " << bluefs_total
4925 << " < min " << cct->_conf->bluestore_bluefs_min
1adf2230 4926 << ", should gift " << byte_u_t(g) << dendl;
7c673cae
FG
4927 if (g > gift)
4928 gift = g;
4929 reclaim = 0;
4930 }
3efd9988
FG
4931 uint64_t min_free = cct->_conf->get_val<uint64_t>("bluestore_bluefs_min_free");
4932 if (bluefs_free < min_free &&
4933 min_free < free_cap) {
4934 uint64_t g = min_free - bluefs_free;
4935 dout(10) << __func__ << " bluefs_free " << bluefs_free
4936 << " < min " << min_free
1adf2230 4937 << ", should gift " << byte_u_t(g) << dendl;
3efd9988
FG
4938 if (g > gift)
4939 gift = g;
4940 reclaim = 0;
4941 }
7c673cae
FG
4942
4943 if (gift) {
4944 // round up to alloc size
4945 gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
4946
4947 // hard cap to fit into 32 bits
4948 gift = MIN(gift, 1ull<<31);
4949 dout(10) << __func__ << " gifting " << gift
1adf2230 4950 << " (" << byte_u_t(gift) << ")" << dendl;
7c673cae
FG
4951
4952 // fixme: just do one allocation to start...
4953 int r = alloc->reserve(gift);
4954 assert(r == 0);
4955
4956 AllocExtentVector exts;
4957 int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
4958 0, 0, &exts);
4959
94b18763
FG
4960 if (alloc_len <= 0) {
4961 dout(1) << __func__ << " failed to allocate 0x" << std::hex << gift
4962 << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4963 alloc->unreserve(gift);
4964 alloc->dump();
4965 return 0;
4966 } else if (alloc_len < (int64_t)gift) {
4967 dout(1) << __func__ << " insufficient allocation for 0x" << std::hex << gift
4968 << " min_alloc_size 0x" << min_alloc_size
4969 << " allocated 0x" << alloc_len
4970 << std::dec << dendl;
4971 alloc->unreserve(gift - alloc_len);
7c673cae 4972 alloc->dump();
7c673cae
FG
4973 }
4974 for (auto& p : exts) {
4975 bluestore_pextent_t e = bluestore_pextent_t(p);
4976 dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
4977 extents->push_back(e);
4978 }
4979 gift = 0;
4980
4981 ret = 1;
4982 }
4983
4984 // reclaim from bluefs?
4985 if (reclaim) {
4986 // round up to alloc size
4987 reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);
4988
4989 // hard cap to fit into 32 bits
4990 reclaim = MIN(reclaim, 1ull<<31);
4991 dout(10) << __func__ << " reclaiming " << reclaim
1adf2230 4992 << " (" << byte_u_t(reclaim) << ")" << dendl;
7c673cae
FG
4993
4994 while (reclaim > 0) {
4995 // NOTE: this will block and do IO.
4996 AllocExtentVector extents;
4997 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
4998 &extents);
4999 if (r < 0) {
5000 derr << __func__ << " failed to reclaim space from bluefs"
5001 << dendl;
5002 break;
5003 }
5004 for (auto e : extents) {
5005 bluefs_extents.erase(e.offset, e.length);
5006 bluefs_extents_reclaiming.insert(e.offset, e.length);
5007 reclaim -= e.length;
5008 }
5009 }
5010
5011 ret = 1;
5012 }
5013
5014 return ret;
5015}
5016
5017void BlueStore::_commit_bluefs_freespace(
5018 const PExtentVector& bluefs_gift_extents)
5019{
5020 dout(10) << __func__ << dendl;
5021 for (auto& p : bluefs_gift_extents) {
5022 bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
5023 }
5024}
5025
5026int BlueStore::_open_collections(int *errors)
5027{
28e407b8 5028 dout(10) << __func__ << dendl;
7c673cae
FG
5029 assert(coll_map.empty());
5030 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
5031 for (it->upper_bound(string());
5032 it->valid();
5033 it->next()) {
5034 coll_t cid;
5035 if (cid.parse(it->key())) {
5036 CollectionRef c(
5037 new Collection(
5038 this,
5039 cache_shards[cid.hash_to_shard(cache_shards.size())],
5040 cid));
5041 bufferlist bl = it->value();
5042 bufferlist::iterator p = bl.begin();
5043 try {
5044 ::decode(c->cnode, p);
5045 } catch (buffer::error& e) {
5046 derr << __func__ << " failed to decode cnode, key:"
5047 << pretty_binary_string(it->key()) << dendl;
5048 return -EIO;
5049 }
28e407b8
AA
5050 dout(20) << __func__ << " opened " << cid << " " << c
5051 << " " << c->cnode << dendl;
7c673cae
FG
5052 coll_map[cid] = c;
5053 } else {
5054 derr << __func__ << " unrecognized collection " << it->key() << dendl;
5055 if (errors)
5056 (*errors)++;
5057 }
5058 }
5059 return 0;
5060}
5061
224ce89b 5062void BlueStore::_open_statfs()
31f18b77
FG
5063{
5064 bufferlist bl;
5065 int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
5066 if (r >= 0) {
5067 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
5068 auto it = bl.begin();
5069 vstatfs.decode(it);
224ce89b 5070 } else {
31f18b77
FG
5071 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
5072 }
5073 }
5074 else {
5075 dout(10) << __func__ << " store_statfs not found, using empty" << dendl;
5076 }
5077}
5078
7c673cae
FG
5079int BlueStore::_setup_block_symlink_or_file(
5080 string name,
5081 string epath,
5082 uint64_t size,
5083 bool create)
5084{
5085 dout(20) << __func__ << " name " << name << " path " << epath
5086 << " size " << size << " create=" << (int)create << dendl;
5087 int r = 0;
5088 int flags = O_RDWR;
5089 if (create)
5090 flags |= O_CREAT;
5091 if (epath.length()) {
5092 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
5093 if (r < 0) {
5094 r = -errno;
5095 derr << __func__ << " failed to create " << name << " symlink to "
5096 << epath << ": " << cpp_strerror(r) << dendl;
5097 return r;
5098 }
5099
5100 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
5101 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
5102 if (fd < 0) {
5103 r = -errno;
5104 derr << __func__ << " failed to open " << epath << " file: "
5105 << cpp_strerror(r) << dendl;
5106 return r;
5107 }
5108 string serial_number = epath.substr(strlen(SPDK_PREFIX));
5109 r = ::write(fd, serial_number.c_str(), serial_number.size());
5110 assert(r == (int)serial_number.size());
5111 dout(1) << __func__ << " created " << name << " symlink to "
5112 << epath << dendl;
5113 VOID_TEMP_FAILURE_RETRY(::close(fd));
5114 }
5115 }
5116 if (size) {
5117 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
5118 if (fd >= 0) {
5119 // block file is present
5120 struct stat st;
5121 int r = ::fstat(fd, &st);
5122 if (r == 0 &&
5123 S_ISREG(st.st_mode) && // if it is a regular file
5124 st.st_size == 0) { // and is 0 bytes
5125 r = ::ftruncate(fd, size);
5126 if (r < 0) {
5127 r = -errno;
5128 derr << __func__ << " failed to resize " << name << " file to "
5129 << size << ": " << cpp_strerror(r) << dendl;
5130 VOID_TEMP_FAILURE_RETRY(::close(fd));
5131 return r;
5132 }
5133
5134 if (cct->_conf->bluestore_block_preallocate_file) {
28e407b8
AA
5135 r = ::ceph_posix_fallocate(fd, 0, size);
5136 if (r > 0) {
7c673cae
FG
5137 derr << __func__ << " failed to preallocate " << name << " file to "
5138 << size << ": " << cpp_strerror(r) << dendl;
5139 VOID_TEMP_FAILURE_RETRY(::close(fd));
5140 return -r;
5141 }
7c673cae
FG
5142 }
5143 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 5144 << byte_u_t(size) << dendl;
7c673cae
FG
5145 }
5146 VOID_TEMP_FAILURE_RETRY(::close(fd));
5147 } else {
5148 int r = -errno;
5149 if (r != -ENOENT) {
5150 derr << __func__ << " failed to open " << name << " file: "
5151 << cpp_strerror(r) << dendl;
5152 return r;
5153 }
5154 }
5155 }
5156 return 0;
5157}
5158
5159int BlueStore::mkfs()
5160{
5161 dout(1) << __func__ << " path " << path << dendl;
5162 int r;
5163 uuid_d old_fsid;
5164
5165 {
5166 string done;
5167 r = read_meta("mkfs_done", &done);
5168 if (r == 0) {
5169 dout(1) << __func__ << " already created" << dendl;
5170 if (cct->_conf->bluestore_fsck_on_mkfs) {
5171 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5172 if (r < 0) {
5173 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
5174 << dendl;
5175 return r;
5176 }
5177 if (r > 0) {
5178 derr << __func__ << " fsck found " << r << " errors" << dendl;
5179 r = -EIO;
5180 }
5181 }
5182 return r; // idempotent
5183 }
5184 }
5185
5186 {
5187 string type;
5188 r = read_meta("type", &type);
5189 if (r == 0) {
5190 if (type != "bluestore") {
5191 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5192 return -EIO;
5193 }
5194 } else {
5195 r = write_meta("type", "bluestore");
5196 if (r < 0)
5197 return r;
5198 }
5199 }
5200
5201 freelist_type = "bitmap";
5202
5203 r = _open_path();
5204 if (r < 0)
5205 return r;
5206
5207 r = _open_fsid(true);
5208 if (r < 0)
5209 goto out_path_fd;
5210
5211 r = _lock_fsid();
5212 if (r < 0)
5213 goto out_close_fsid;
5214
5215 r = _read_fsid(&old_fsid);
5216 if (r < 0 || old_fsid.is_zero()) {
5217 if (fsid.is_zero()) {
5218 fsid.generate_random();
5219 dout(1) << __func__ << " generated fsid " << fsid << dendl;
5220 } else {
5221 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
5222 }
5223 // we'll write it later.
5224 } else {
5225 if (!fsid.is_zero() && fsid != old_fsid) {
5226 derr << __func__ << " on-disk fsid " << old_fsid
5227 << " != provided " << fsid << dendl;
5228 r = -EINVAL;
5229 goto out_close_fsid;
5230 }
5231 fsid = old_fsid;
5232 }
5233
5234 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
5235 cct->_conf->bluestore_block_size,
5236 cct->_conf->bluestore_block_create);
5237 if (r < 0)
5238 goto out_close_fsid;
5239 if (cct->_conf->bluestore_bluefs) {
5240 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
5241 cct->_conf->bluestore_block_wal_size,
5242 cct->_conf->bluestore_block_wal_create);
5243 if (r < 0)
5244 goto out_close_fsid;
5245 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
5246 cct->_conf->bluestore_block_db_size,
5247 cct->_conf->bluestore_block_db_create);
5248 if (r < 0)
5249 goto out_close_fsid;
5250 }
5251
5252 r = _open_bdev(true);
5253 if (r < 0)
5254 goto out_close_fsid;
5255
3efd9988
FG
5256 // choose min_alloc_size
5257 if (cct->_conf->bluestore_min_alloc_size) {
5258 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
5259 } else {
5260 assert(bdev);
5261 if (bdev->is_rotational()) {
5262 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
5263 } else {
5264 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
5265 }
5266 }
5267
5268 // make sure min_alloc_size is power of 2 aligned.
5269 if (!ISP2(min_alloc_size)) {
5270 derr << __func__ << " min_alloc_size 0x"
5271 << std::hex << min_alloc_size << std::dec
5272 << " is not power of 2 aligned!"
5273 << dendl;
5274 r = -EINVAL;
5275 goto out_close_bdev;
5276 }
5277
7c673cae
FG
5278 r = _open_db(true);
5279 if (r < 0)
5280 goto out_close_bdev;
5281
5282 r = _open_fm(true);
5283 if (r < 0)
5284 goto out_close_db;
5285
5286 {
5287 KeyValueDB::Transaction t = db->get_transaction();
5288 {
5289 bufferlist bl;
5290 ::encode((uint64_t)0, bl);
5291 t->set(PREFIX_SUPER, "nid_max", bl);
5292 t->set(PREFIX_SUPER, "blobid_max", bl);
5293 }
5294
7c673cae
FG
5295 {
5296 bufferlist bl;
5297 ::encode((uint64_t)min_alloc_size, bl);
5298 t->set(PREFIX_SUPER, "min_alloc_size", bl);
5299 }
5300
5301 ondisk_format = latest_ondisk_format;
5302 _prepare_ondisk_format_super(t);
5303 db->submit_transaction_sync(t);
5304 }
5305
7c673cae
FG
5306
5307 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
5308 if (r < 0)
224ce89b
WB
5309 goto out_close_fm;
5310
3efd9988 5311 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 5312 if (r < 0)
224ce89b 5313 goto out_close_fm;
7c673cae
FG
5314
5315 if (fsid != old_fsid) {
5316 r = _write_fsid();
5317 if (r < 0) {
5318 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 5319 goto out_close_fm;
7c673cae
FG
5320 }
5321 }
5322
7c673cae
FG
5323 out_close_fm:
5324 _close_fm();
5325 out_close_db:
5326 _close_db();
5327 out_close_bdev:
5328 _close_bdev();
5329 out_close_fsid:
5330 _close_fsid();
5331 out_path_fd:
5332 _close_path();
5333
5334 if (r == 0 &&
5335 cct->_conf->bluestore_fsck_on_mkfs) {
5336 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5337 if (rc < 0)
5338 return rc;
5339 if (rc > 0) {
5340 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5341 r = -EIO;
5342 }
5343 }
31f18b77
FG
5344
5345 if (r == 0) {
5346 // indicate success by writing the 'mkfs_done' file
5347 r = write_meta("mkfs_done", "yes");
5348 }
5349
7c673cae
FG
5350 if (r < 0) {
5351 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
31f18b77
FG
5352 } else {
5353 dout(0) << __func__ << " success" << dendl;
7c673cae
FG
5354 }
5355 return r;
5356}
5357
5358void BlueStore::set_cache_shards(unsigned num)
5359{
5360 dout(10) << __func__ << " " << num << dendl;
5361 size_t old = cache_shards.size();
5362 assert(num >= old);
5363 cache_shards.resize(num);
5364 for (unsigned i = old; i < num; ++i) {
5365 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
5366 logger);
5367 }
5368}
5369
5370int BlueStore::_mount(bool kv_only)
5371{
5372 dout(1) << __func__ << " path " << path << dendl;
5373
3efd9988
FG
5374 _kv_only = kv_only;
5375
7c673cae
FG
5376 {
5377 string type;
5378 int r = read_meta("type", &type);
5379 if (r < 0) {
5380 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5381 << dendl;
5382 return r;
5383 }
5384
5385 if (type != "bluestore") {
5386 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5387 return -EIO;
5388 }
5389 }
5390
5391 if (cct->_conf->bluestore_fsck_on_mount) {
5392 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
5393 if (rc < 0)
5394 return rc;
5395 if (rc > 0) {
5396 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5397 return -EIO;
5398 }
5399 }
5400
5401 int r = _open_path();
5402 if (r < 0)
5403 return r;
5404 r = _open_fsid(false);
5405 if (r < 0)
5406 goto out_path;
5407
5408 r = _read_fsid(&fsid);
5409 if (r < 0)
5410 goto out_fsid;
5411
5412 r = _lock_fsid();
5413 if (r < 0)
5414 goto out_fsid;
5415
5416 r = _open_bdev(false);
5417 if (r < 0)
5418 goto out_fsid;
5419
5420 r = _open_db(false);
5421 if (r < 0)
5422 goto out_bdev;
5423
5424 if (kv_only)
5425 return 0;
5426
5427 r = _open_super_meta();
5428 if (r < 0)
5429 goto out_db;
5430
5431 r = _open_fm(false);
5432 if (r < 0)
5433 goto out_db;
5434
5435 r = _open_alloc();
5436 if (r < 0)
5437 goto out_fm;
5438
5439 r = _open_collections();
5440 if (r < 0)
5441 goto out_alloc;
5442
5443 r = _reload_logger();
5444 if (r < 0)
5445 goto out_coll;
5446
5447 if (bluefs) {
5448 r = _reconcile_bluefs_freespace();
5449 if (r < 0)
5450 goto out_coll;
5451 }
5452
31f18b77 5453 _kv_start();
7c673cae
FG
5454
5455 r = _deferred_replay();
5456 if (r < 0)
5457 goto out_stop;
5458
5459 mempool_thread.init();
5460
7c673cae
FG
5461 mounted = true;
5462 return 0;
5463
5464 out_stop:
5465 _kv_stop();
7c673cae 5466 out_coll:
31f18b77 5467 _flush_cache();
7c673cae
FG
5468 out_alloc:
5469 _close_alloc();
5470 out_fm:
5471 _close_fm();
5472 out_db:
5473 _close_db();
5474 out_bdev:
5475 _close_bdev();
5476 out_fsid:
5477 _close_fsid();
5478 out_path:
5479 _close_path();
5480 return r;
5481}
5482
5483int BlueStore::umount()
5484{
3efd9988 5485 assert(_kv_only || mounted);
7c673cae
FG
5486 dout(1) << __func__ << dendl;
5487
5488 _osr_drain_all();
5489 _osr_unregister_all();
5490
7c673cae 5491 mounted = false;
3efd9988
FG
5492 if (!_kv_only) {
5493 mempool_thread.shutdown();
5494 dout(20) << __func__ << " stopping kv thread" << dendl;
5495 _kv_stop();
3efd9988
FG
5496 _flush_cache();
5497 dout(20) << __func__ << " closing" << dendl;
5498
5499 _close_alloc();
5500 _close_fm();
5501 }
7c673cae
FG
5502 _close_db();
5503 _close_bdev();
5504 _close_fsid();
5505 _close_path();
5506
5507 if (cct->_conf->bluestore_fsck_on_umount) {
5508 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
5509 if (rc < 0)
5510 return rc;
5511 if (rc > 0) {
5512 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5513 return -EIO;
5514 }
5515 }
5516 return 0;
5517}
5518
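// fsck helper (sketch of what the function below does): walk [off, off+len)
// in steps of the allocation-unit granularity and invoke f() with the
// corresponding bit position in the bitset; the end is rounded up, so
// partially covered units are included.  E.g. (illustrative values only)
// off=0x3000, len=0x2000, granularity=0x1000 visits bit positions 3 and 4.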
5519static void apply(uint64_t off,
5520 uint64_t len,
5521 uint64_t granularity,
5522 BlueStore::mempool_dynamic_bitset &bitset,
7c673cae
FG
5523 std::function<void(uint64_t,
5524 BlueStore::mempool_dynamic_bitset &)> f) {
5525 auto end = ROUND_UP_TO(off + len, granularity);
5526 while (off < end) {
5527 uint64_t pos = off / granularity;
5528 f(pos, bitset);
5529 off += granularity;
5530 }
5531}
5532
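// fsck helper: for each valid physical extent of a blob, account the bytes
// in expected_statfs and mark the corresponding allocation units in
// used_blocks.  A bit that is already set means some other blob already
// claimed that disk space (double allocation), which is counted as an
// error, as is any extent that ends past the block device.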
5533int BlueStore::_fsck_check_extents(
5534 const ghobject_t& oid,
5535 const PExtentVector& extents,
5536 bool compressed,
5537 mempool_dynamic_bitset &used_blocks,
b32b8144 5538 uint64_t granularity,
7c673cae
FG
5539 store_statfs_t& expected_statfs)
5540{
5541 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
5542 int errors = 0;
5543 for (auto e : extents) {
5544 if (!e.is_valid())
5545 continue;
5546 expected_statfs.allocated += e.length;
5547 if (compressed) {
5548 expected_statfs.compressed_allocated += e.length;
5549 }
5550 bool already = false;
5551 apply(
b32b8144 5552 e.offset, e.length, granularity, used_blocks,
7c673cae 5553 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 5554 assert(pos < bs.size());
7c673cae
FG
5555 if (bs.test(pos))
5556 already = true;
5557 else
5558 bs.set(pos);
5559 });
5560 if (already) {
5561 derr << " " << oid << " extent " << e
5562 << " or a subset is already allocated" << dendl;
5563 ++errors;
5564 }
5565 if (e.end() > bdev->get_size()) {
5566 derr << " " << oid << " extent " << e
5567 << " past end of block device" << dendl;
5568 ++errors;
5569 }
5570 }
5571 return errors;
5572}
5573
3efd9988 5574int BlueStore::_fsck(bool deep, bool repair)
7c673cae 5575{
3efd9988
FG
5576 dout(1) << __func__
 5577	  << (repair ? " repair" : " fsck")
5578 << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
7c673cae 5579 int errors = 0;
3efd9988 5580 int repaired = 0;
31f18b77
FG
5581
5582 typedef btree::btree_set<
5583 uint64_t,std::less<uint64_t>,
5584 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
5585 uint64_t_btree_t used_nids;
5586 uint64_t_btree_t used_omap_head;
5587 uint64_t_btree_t used_sbids;
5588
7c673cae 5589 mempool_dynamic_bitset used_blocks;
7c673cae
FG
5590 KeyValueDB::Iterator it;
5591 store_statfs_t expected_statfs, actual_statfs;
5592 struct sb_info_t {
5593 list<ghobject_t> oids;
5594 SharedBlobRef sb;
5595 bluestore_extent_ref_map_t ref_map;
5596 bool compressed;
5597 };
5598 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
5599
5600 uint64_t num_objects = 0;
5601 uint64_t num_extents = 0;
5602 uint64_t num_blobs = 0;
5603 uint64_t num_spanning_blobs = 0;
5604 uint64_t num_shared_blobs = 0;
5605 uint64_t num_sharded_objects = 0;
5606 uint64_t num_object_shards = 0;
5607
5608 utime_t start = ceph_clock_now();
5609
5610 int r = _open_path();
5611 if (r < 0)
5612 return r;
5613 r = _open_fsid(false);
5614 if (r < 0)
5615 goto out_path;
5616
5617 r = _read_fsid(&fsid);
5618 if (r < 0)
5619 goto out_fsid;
5620
5621 r = _lock_fsid();
5622 if (r < 0)
5623 goto out_fsid;
5624
5625 r = _open_bdev(false);
5626 if (r < 0)
5627 goto out_fsid;
5628
5629 r = _open_db(false);
5630 if (r < 0)
5631 goto out_bdev;
5632
5633 r = _open_super_meta();
5634 if (r < 0)
5635 goto out_db;
5636
5637 r = _open_fm(false);
5638 if (r < 0)
5639 goto out_db;
5640
5641 r = _open_alloc();
5642 if (r < 0)
5643 goto out_fm;
5644
5645 r = _open_collections(&errors);
5646 if (r < 0)
5647 goto out_alloc;
5648
5649 mempool_thread.init();
5650
31f18b77
FG
5651 // we need finishers and kv_{sync,finalize}_thread *just* for replay
5652 _kv_start();
7c673cae 5653 r = _deferred_replay();
31f18b77 5654 _kv_stop();
7c673cae
FG
5655 if (r < 0)
5656 goto out_scan;
5657
b32b8144 5658 used_blocks.resize(fm->get_alloc_units());
7c673cae 5659 apply(
b32b8144 5660 0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
7c673cae 5661 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 5662 assert(pos < bs.size());
7c673cae
FG
5663 bs.set(pos);
5664 }
5665 );
5666
5667 if (bluefs) {
5668 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5669 apply(
b32b8144 5670 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 5671 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 5672 assert(pos < bs.size());
7c673cae
FG
5673 bs.set(pos);
5674 }
5675 );
5676 }
5677 r = bluefs->fsck();
5678 if (r < 0) {
5679 goto out_scan;
5680 }
5681 if (r > 0)
5682 errors += r;
5683 }
5684
5685 // get expected statfs; fill unaffected fields to be able to compare
5686 // structs
5687 statfs(&actual_statfs);
5688 expected_statfs.total = actual_statfs.total;
5689 expected_statfs.available = actual_statfs.available;
5690
5691 // walk PREFIX_OBJ
5692 dout(1) << __func__ << " walking object keyspace" << dendl;
5693 it = db->get_iterator(PREFIX_OBJ);
5694 if (it) {
5695 CollectionRef c;
5696 spg_t pgid;
5697 mempool::bluestore_fsck::list<string> expecting_shards;
5698 for (it->lower_bound(string()); it->valid(); it->next()) {
31f18b77
FG
5699 if (g_conf->bluestore_debug_fsck_abort) {
5700 goto out_scan;
5701 }
7c673cae
FG
5702 dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
5703 if (is_extent_shard_key(it->key())) {
5704 while (!expecting_shards.empty() &&
5705 expecting_shards.front() < it->key()) {
3efd9988 5706 derr << "fsck error: missing shard key "
7c673cae
FG
5707 << pretty_binary_string(expecting_shards.front())
5708 << dendl;
5709 ++errors;
5710 expecting_shards.pop_front();
5711 }
5712 if (!expecting_shards.empty() &&
5713 expecting_shards.front() == it->key()) {
5714 // all good
5715 expecting_shards.pop_front();
5716 continue;
5717 }
5718
5719 uint32_t offset;
5720 string okey;
5721 get_key_extent_shard(it->key(), &okey, &offset);
3efd9988 5722 derr << "fsck error: stray shard 0x" << std::hex << offset
7c673cae
FG
5723 << std::dec << dendl;
5724 if (expecting_shards.empty()) {
3efd9988 5725 derr << "fsck error: " << pretty_binary_string(it->key())
7c673cae
FG
5726 << " is unexpected" << dendl;
5727 ++errors;
5728 continue;
5729 }
5730 while (expecting_shards.front() > it->key()) {
3efd9988 5731 derr << "fsck error: saw " << pretty_binary_string(it->key())
7c673cae 5732 << dendl;
3efd9988 5733 derr << "fsck error: exp "
7c673cae
FG
5734 << pretty_binary_string(expecting_shards.front()) << dendl;
5735 ++errors;
5736 expecting_shards.pop_front();
5737 if (expecting_shards.empty()) {
5738 break;
5739 }
5740 }
5741 continue;
5742 }
5743
5744 ghobject_t oid;
5745 int r = get_key_object(it->key(), &oid);
5746 if (r < 0) {
3efd9988 5747 derr << "fsck error: bad object key "
7c673cae
FG
5748 << pretty_binary_string(it->key()) << dendl;
5749 ++errors;
5750 continue;
5751 }
5752 if (!c ||
5753 oid.shard_id != pgid.shard ||
5754 oid.hobj.pool != (int64_t)pgid.pool() ||
5755 !c->contains(oid)) {
5756 c = nullptr;
5757 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
5758 coll_map.begin();
5759 p != coll_map.end();
5760 ++p) {
5761 if (p->second->contains(oid)) {
5762 c = p->second;
5763 break;
5764 }
5765 }
5766 if (!c) {
3efd9988 5767 derr << "fsck error: stray object " << oid
7c673cae
FG
5768 << " not owned by any collection" << dendl;
5769 ++errors;
5770 continue;
5771 }
5772 c->cid.is_pg(&pgid);
28e407b8
AA
5773 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
5774 << dendl;
7c673cae
FG
5775 }
5776
5777 if (!expecting_shards.empty()) {
5778 for (auto &k : expecting_shards) {
3efd9988 5779 derr << "fsck error: missing shard key "
7c673cae
FG
5780 << pretty_binary_string(k) << dendl;
5781 }
5782 ++errors;
5783 expecting_shards.clear();
5784 }
5785
5786 dout(10) << __func__ << " " << oid << dendl;
5787 RWLock::RLocker l(c->lock);
5788 OnodeRef o = c->get_onode(oid, false);
5789 if (o->onode.nid) {
5790 if (o->onode.nid > nid_max) {
3efd9988 5791 derr << "fsck error: " << oid << " nid " << o->onode.nid
7c673cae
FG
5792 << " > nid_max " << nid_max << dendl;
5793 ++errors;
5794 }
5795 if (used_nids.count(o->onode.nid)) {
3efd9988 5796 derr << "fsck error: " << oid << " nid " << o->onode.nid
7c673cae
FG
5797 << " already in use" << dendl;
5798 ++errors;
5799 continue; // go for next object
5800 }
5801 used_nids.insert(o->onode.nid);
5802 }
5803 ++num_objects;
5804 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
5805 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
5806 _dump_onode(o, 30);
5807 // shards
5808 if (!o->extent_map.shards.empty()) {
5809 ++num_sharded_objects;
5810 num_object_shards += o->extent_map.shards.size();
5811 }
5812 for (auto& s : o->extent_map.shards) {
5813 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
5814 expecting_shards.push_back(string());
5815 get_extent_shard_key(o->key, s.shard_info->offset,
5816 &expecting_shards.back());
5817 if (s.shard_info->offset >= o->onode.size) {
3efd9988 5818 derr << "fsck error: " << oid << " shard 0x" << std::hex
7c673cae
FG
5819 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
5820 << std::dec << dendl;
5821 ++errors;
5822 }
5823 }
5824 // lextents
5825 map<BlobRef,bluestore_blob_t::unused_t> referenced;
5826 uint64_t pos = 0;
5827 mempool::bluestore_fsck::map<BlobRef,
5828 bluestore_blob_use_tracker_t> ref_map;
5829 for (auto& l : o->extent_map.extent_map) {
5830 dout(20) << __func__ << " " << l << dendl;
5831 if (l.logical_offset < pos) {
3efd9988 5832 derr << "fsck error: " << oid << " lextent at 0x"
7c673cae
FG
5833 << std::hex << l.logical_offset
5834 << " overlaps with the previous, which ends at 0x" << pos
5835 << std::dec << dendl;
5836 ++errors;
5837 }
5838 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
3efd9988 5839 derr << "fsck error: " << oid << " lextent at 0x"
7c673cae
FG
5840 << std::hex << l.logical_offset << "~" << l.length
5841 << " spans a shard boundary"
5842 << std::dec << dendl;
5843 ++errors;
5844 }
5845 pos = l.logical_offset + l.length;
5846 expected_statfs.stored += l.length;
5847 assert(l.blob);
5848 const bluestore_blob_t& blob = l.blob->get_blob();
5849
5850 auto& ref = ref_map[l.blob];
5851 if (ref.is_empty()) {
5852 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
5853 uint32_t l = blob.get_logical_length();
5854 ref.init(l, min_release_size);
5855 }
5856 ref.get(
5857 l.blob_offset,
5858 l.length);
5859 ++num_extents;
5860 if (blob.has_unused()) {
5861 auto p = referenced.find(l.blob);
5862 bluestore_blob_t::unused_t *pu;
5863 if (p == referenced.end()) {
5864 pu = &referenced[l.blob];
5865 } else {
5866 pu = &p->second;
5867 }
5868 uint64_t blob_len = blob.get_logical_length();
5869 assert((blob_len % (sizeof(*pu)*8)) == 0);
5870 assert(l.blob_offset + l.length <= blob_len);
5871 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
5872 uint64_t start = l.blob_offset / chunk_size;
5873 uint64_t end =
5874 ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
5875 for (auto i = start; i < end; ++i) {
5876 (*pu) |= (1u << i);
5877 }
5878 }
5879 }
5880 for (auto &i : referenced) {
5881 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
5882 << std::dec << " for " << *i.first << dendl;
5883 const bluestore_blob_t& blob = i.first->get_blob();
5884 if (i.second & blob.unused) {
3efd9988 5885 derr << "fsck error: " << oid << " blob claims unused 0x"
7c673cae
FG
5886 << std::hex << blob.unused
5887 << " but extents reference 0x" << i.second
5888 << " on blob " << *i.first << dendl;
5889 ++errors;
5890 }
5891 if (blob.has_csum()) {
5892 uint64_t blob_len = blob.get_logical_length();
5893 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
5894 unsigned csum_count = blob.get_csum_count();
5895 unsigned csum_chunk_size = blob.get_csum_chunk_size();
5896 for (unsigned p = 0; p < csum_count; ++p) {
5897 unsigned pos = p * csum_chunk_size;
5898 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
5899 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
5900 unsigned mask = 1u << firstbit;
5901 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
5902 mask |= 1u << b;
5903 }
5904 if ((blob.unused & mask) == mask) {
5905 // this csum chunk region is marked unused
5906 if (blob.get_csum_item(p) != 0) {
3efd9988 5907 derr << "fsck error: " << oid
7c673cae
FG
5908 << " blob claims csum chunk 0x" << std::hex << pos
5909 << "~" << csum_chunk_size
5910 << " is unused (mask 0x" << mask << " of unused 0x"
5911 << blob.unused << ") but csum is non-zero 0x"
5912 << blob.get_csum_item(p) << std::dec << " on blob "
5913 << *i.first << dendl;
5914 ++errors;
5915 }
5916 }
5917 }
5918 }
5919 }
5920 for (auto &i : ref_map) {
5921 ++num_blobs;
5922 const bluestore_blob_t& blob = i.first->get_blob();
5923 bool equal = i.first->get_blob_use_tracker().equal(i.second);
5924 if (!equal) {
3efd9988 5925 derr << "fsck error: " << oid << " blob " << *i.first
7c673cae
FG
5926 << " doesn't match expected ref_map " << i.second << dendl;
5927 ++errors;
5928 }
5929 if (blob.is_compressed()) {
5930 expected_statfs.compressed += blob.get_compressed_payload_length();
5931 expected_statfs.compressed_original +=
5932 i.first->get_referenced_bytes();
5933 }
5934 if (blob.is_shared()) {
5935 if (i.first->shared_blob->get_sbid() > blobid_max) {
3efd9988 5936 derr << "fsck error: " << oid << " blob " << blob
7c673cae
FG
5937 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
5938 << blobid_max << dendl;
5939 ++errors;
5940 } else if (i.first->shared_blob->get_sbid() == 0) {
3efd9988 5941 derr << "fsck error: " << oid << " blob " << blob
7c673cae
FG
5942 << " marked as shared but has uninitialized sbid"
5943 << dendl;
5944 ++errors;
5945 }
5946 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
5947 sbi.sb = i.first->shared_blob;
5948 sbi.oids.push_back(oid);
5949 sbi.compressed = blob.is_compressed();
5950 for (auto e : blob.get_extents()) {
5951 if (e.is_valid()) {
5952 sbi.ref_map.get(e.offset, e.length);
5953 }
5954 }
5955 } else {
5956 errors += _fsck_check_extents(oid, blob.get_extents(),
5957 blob.is_compressed(),
5958 used_blocks,
b32b8144 5959 fm->get_alloc_size(),
7c673cae
FG
5960 expected_statfs);
5961 }
5962 }
5963 if (deep) {
5964 bufferlist bl;
5965 int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
5966 if (r < 0) {
5967 ++errors;
3efd9988 5968 derr << "fsck error: " << oid << " error during read: "
7c673cae
FG
5969 << cpp_strerror(r) << dendl;
5970 }
5971 }
5972 // omap
5973 if (o->onode.has_omap()) {
5974 if (used_omap_head.count(o->onode.nid)) {
3efd9988 5975 derr << "fsck error: " << oid << " omap_head " << o->onode.nid
7c673cae
FG
5976 << " already in use" << dendl;
5977 ++errors;
5978 } else {
5979 used_omap_head.insert(o->onode.nid);
5980 }
5981 }
7c673cae
FG
5982 }
5983 }
5984 dout(1) << __func__ << " checking shared_blobs" << dendl;
5985 it = db->get_iterator(PREFIX_SHARED_BLOB);
5986 if (it) {
5987 for (it->lower_bound(string()); it->valid(); it->next()) {
5988 string key = it->key();
5989 uint64_t sbid;
5990 if (get_key_shared_blob(key, &sbid)) {
3efd9988 5991 derr << "fsck error: bad key '" << key
7c673cae
FG
5992 << "' in shared blob namespace" << dendl;
5993 ++errors;
5994 continue;
5995 }
5996 auto p = sb_info.find(sbid);
5997 if (p == sb_info.end()) {
3efd9988 5998 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae
FG
5999 << std::hex << sbid << std::dec << dendl;
6000 ++errors;
6001 } else {
6002 ++num_shared_blobs;
6003 sb_info_t& sbi = p->second;
6004 bluestore_shared_blob_t shared_blob(sbid);
6005 bufferlist bl = it->value();
6006 bufferlist::iterator blp = bl.begin();
6007 ::decode(shared_blob, blp);
6008 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
6009 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 6010 derr << "fsck error: shared blob 0x" << std::hex << sbid
7c673cae
FG
6011 << std::dec << " ref_map " << shared_blob.ref_map
6012 << " != expected " << sbi.ref_map << dendl;
6013 ++errors;
6014 }
6015 PExtentVector extents;
6016 for (auto &r : shared_blob.ref_map.ref_map) {
6017 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
6018 }
6019 errors += _fsck_check_extents(p->second.oids.front(),
6020 extents,
6021 p->second.compressed,
b32b8144
FG
6022 used_blocks,
6023 fm->get_alloc_size(),
6024 expected_statfs);
7c673cae
FG
6025 sb_info.erase(p);
6026 }
6027 }
6028 }
6029 for (auto &p : sb_info) {
3efd9988 6030 derr << "fsck error: shared_blob 0x" << p.first
7c673cae
FG
6031 << " key is missing (" << *p.second.sb << ")" << dendl;
6032 ++errors;
6033 }
6034 if (!(actual_statfs == expected_statfs)) {
3efd9988 6035 derr << "fsck error: actual " << actual_statfs
7c673cae
FG
6036 << " != expected " << expected_statfs << dendl;
6037 ++errors;
6038 }
6039
6040 dout(1) << __func__ << " checking for stray omap data" << dendl;
6041 it = db->get_iterator(PREFIX_OMAP);
6042 if (it) {
6043 for (it->lower_bound(string()); it->valid(); it->next()) {
6044 uint64_t omap_head;
6045 _key_decode_u64(it->key().c_str(), &omap_head);
6046 if (used_omap_head.count(omap_head) == 0) {
3efd9988 6047 derr << "fsck error: found stray omap data on omap_head "
7c673cae
FG
6048 << omap_head << dendl;
6049 ++errors;
6050 }
6051 }
6052 }
6053
6054 dout(1) << __func__ << " checking deferred events" << dendl;
6055 it = db->get_iterator(PREFIX_DEFERRED);
6056 if (it) {
6057 for (it->lower_bound(string()); it->valid(); it->next()) {
6058 bufferlist bl = it->value();
6059 bufferlist::iterator p = bl.begin();
6060 bluestore_deferred_transaction_t wt;
6061 try {
6062 ::decode(wt, p);
6063 } catch (buffer::error& e) {
3efd9988 6064 derr << "fsck error: failed to decode deferred txn "
7c673cae
FG
6065 << pretty_binary_string(it->key()) << dendl;
6066 r = -EIO;
6067 goto out_scan;
6068 }
6069 dout(20) << __func__ << " deferred " << wt.seq
6070 << " ops " << wt.ops.size()
6071 << " released 0x" << std::hex << wt.released << std::dec << dendl;
6072 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
6073 apply(
b32b8144 6074 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 6075 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 6076 assert(pos < bs.size());
7c673cae
FG
6077 bs.set(pos);
6078 }
6079 );
6080 }
6081 }
6082 }
6083
6084 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
6085 {
6086 // remove bluefs_extents from used set since the freelist doesn't
6087 // know they are allocated.
6088 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
6089 apply(
b32b8144 6090 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 6091 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 6092 assert(pos < bs.size());
7c673cae
FG
6093 bs.reset(pos);
6094 }
6095 );
6096 }
6097 fm->enumerate_reset();
6098 uint64_t offset, length;
6099 while (fm->enumerate_next(&offset, &length)) {
6100 bool intersects = false;
6101 apply(
b32b8144 6102 offset, length, fm->get_alloc_size(), used_blocks,
7c673cae 6103 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 6104 assert(pos < bs.size());
7c673cae
FG
6105 if (bs.test(pos)) {
6106 intersects = true;
6107 } else {
6108 bs.set(pos);
6109 }
6110 }
6111 );
6112 if (intersects) {
3efd9988
FG
6113 if (offset == SUPER_RESERVED &&
6114 length == min_alloc_size - SUPER_RESERVED) {
6115 // this is due to the change just after luminous to min_alloc_size
6116 // granularity allocations, and our baked in assumption at the top
6117 // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
6118 // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)). harmless,
6119 // since we will never allocate this region below min_alloc_size.
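	// (e.g., with a 0x10000 min_alloc_size the freelist may legitimately
	// report SUPER_RESERVED~(0x10000 - SUPER_RESERVED) as free even though
	// we pre-marked the whole first allocation unit as used above.)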
6120 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
6121 << " and min_alloc_size, 0x" << std::hex << offset << "~"
6122 << length << dendl;
b5b8bbf5 6123 } else {
3efd9988
FG
6124 derr << "fsck error: free extent 0x" << std::hex << offset
6125 << "~" << length << std::dec
6126 << " intersects allocated blocks" << dendl;
6127 ++errors;
b5b8bbf5 6128 }
b5b8bbf5
FG
6129 }
6130 }
3efd9988
FG
6131 fm->enumerate_reset();
6132 size_t count = used_blocks.count();
7c673cae
FG
6133 if (used_blocks.size() != count) {
6134 assert(used_blocks.size() > count);
7c673cae 6135 ++errors;
b5b8bbf5
FG
6136 used_blocks.flip();
6137 size_t start = used_blocks.find_first();
6138 while (start != decltype(used_blocks)::npos) {
6139 size_t cur = start;
6140 while (true) {
6141 size_t next = used_blocks.find_next(cur);
6142 if (next != cur + 1) {
3efd9988 6143 derr << "fsck error: leaked extent 0x" << std::hex
b32b8144
FG
6144 << ((uint64_t)start * fm->get_alloc_size()) << "~"
6145 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
b5b8bbf5
FG
6146 << dendl;
6147 start = next;
6148 break;
6149 }
6150 cur = next;
6151 }
6152 }
6153 used_blocks.flip();
7c673cae
FG
6154 }
6155 }
6156
6157 out_scan:
6158 mempool_thread.shutdown();
31f18b77 6159 _flush_cache();
7c673cae
FG
6160 out_alloc:
6161 _close_alloc();
6162 out_fm:
6163 _close_fm();
6164 out_db:
6165 it.reset(); // before db is closed
6166 _close_db();
6167 out_bdev:
6168 _close_bdev();
6169 out_fsid:
6170 _close_fsid();
6171 out_path:
6172 _close_path();
6173
6174 // fatal errors take precedence
6175 if (r < 0)
6176 return r;
6177
6178 dout(2) << __func__ << " " << num_objects << " objects, "
6179 << num_sharded_objects << " of them sharded. "
6180 << dendl;
6181 dout(2) << __func__ << " " << num_extents << " extents to "
6182 << num_blobs << " blobs, "
6183 << num_spanning_blobs << " spanning, "
6184 << num_shared_blobs << " shared."
6185 << dendl;
6186
6187 utime_t duration = ceph_clock_now() - start;
3efd9988
FG
6188 dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
6189 << " repaired, " << (errors - repaired) << " remaining in "
7c673cae 6190 << duration << " seconds" << dendl;
3efd9988 6191 return errors - repaired;
7c673cae
FG
6192}
6193
6194void BlueStore::collect_metadata(map<string,string> *pm)
6195{
6196 dout(10) << __func__ << dendl;
6197 bdev->collect_metadata("bluestore_bdev_", pm);
6198 if (bluefs) {
6199 (*pm)["bluefs"] = "1";
6200 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
6201 bluefs->collect_metadata(pm);
6202 } else {
6203 (*pm)["bluefs"] = "0";
6204 }
6205}
6206
6207int BlueStore::statfs(struct store_statfs_t *buf)
6208{
6209 buf->reset();
6210 buf->total = bdev->get_size();
6211 buf->available = alloc->get_free();
6212
6213 if (bluefs) {
94b18763
FG
6214 // part of our shared device is "free" according to BlueFS, but we
6215 // can't touch bluestore_bluefs_min of it.
6216 int64_t shared_available = std::min(
6217 bluefs->get_free(bluefs_shared_bdev),
6218 bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
6219 if (shared_available > 0) {
6220 buf->available += shared_available;
7c673cae
FG
6221 }
6222 }
6223
31f18b77
FG
6224 {
6225 std::lock_guard<std::mutex> l(vstatfs_lock);
6226
6227 buf->allocated = vstatfs.allocated();
6228 buf->stored = vstatfs.stored();
6229 buf->compressed = vstatfs.compressed();
6230 buf->compressed_original = vstatfs.compressed_original();
6231 buf->compressed_allocated = vstatfs.compressed_allocated();
7c673cae
FG
6232 }
6233
7c673cae
FG
6234 dout(20) << __func__ << *buf << dendl;
6235 return 0;
6236}
6237
6238// ---------------
6239// cache
6240
6241BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
6242{
6243 RWLock::RLocker l(coll_lock);
6244 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
6245 if (cp == coll_map.end())
6246 return CollectionRef();
6247 return cp->second;
6248}
6249
6250void BlueStore::_queue_reap_collection(CollectionRef& c)
6251{
6252 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
 6253  // _reap_collections and this run in the same thread,
 6254  // so no lock is needed.
7c673cae
FG
6255 removed_collections.push_back(c);
6256}
6257
6258void BlueStore::_reap_collections()
6259{
94b18763 6260
7c673cae
FG
6261 list<CollectionRef> removed_colls;
6262 {
94b18763
FG
 6263    // _queue_reap_collection and this run in the same thread,
 6264    // so no lock is needed.
6265 if (!removed_collections.empty())
6266 removed_colls.swap(removed_collections);
6267 else
6268 return;
7c673cae
FG
6269 }
6270
94b18763
FG
6271 list<CollectionRef>::iterator p = removed_colls.begin();
6272 while (p != removed_colls.end()) {
7c673cae
FG
6273 CollectionRef c = *p;
6274 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6275 if (c->onode_map.map_any([&](OnodeRef o) {
6276 assert(!o->exists);
6277 if (o->flushing_count.load()) {
6278 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
6279 << " flush_txns " << o->flushing_count << dendl;
94b18763 6280 return true;
7c673cae 6281 }
94b18763 6282 return false;
7c673cae 6283 })) {
94b18763 6284 ++p;
7c673cae
FG
6285 continue;
6286 }
6287 c->onode_map.clear();
94b18763 6288 p = removed_colls.erase(p);
7c673cae
FG
6289 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
6290 }
94b18763 6291 if (removed_colls.empty()) {
7c673cae 6292 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
6293 } else {
6294 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
6295 }
6296}
6297
6298void BlueStore::_update_cache_logger()
6299{
6300 uint64_t num_onodes = 0;
6301 uint64_t num_extents = 0;
6302 uint64_t num_blobs = 0;
6303 uint64_t num_buffers = 0;
6304 uint64_t num_buffer_bytes = 0;
6305 for (auto c : cache_shards) {
6306 c->add_stats(&num_onodes, &num_extents, &num_blobs,
6307 &num_buffers, &num_buffer_bytes);
6308 }
6309 logger->set(l_bluestore_onodes, num_onodes);
6310 logger->set(l_bluestore_extents, num_extents);
6311 logger->set(l_bluestore_blobs, num_blobs);
6312 logger->set(l_bluestore_buffers, num_buffers);
6313 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
6314}
6315
6316// ---------------
6317// read operations
6318
6319ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
6320{
6321 return _get_collection(cid);
6322}
6323
6324bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
6325{
6326 CollectionHandle c = _get_collection(cid);
6327 if (!c)
6328 return false;
6329 return exists(c, oid);
6330}
6331
6332bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
6333{
6334 Collection *c = static_cast<Collection *>(c_.get());
6335 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
6336 if (!c->exists)
6337 return false;
6338
6339 bool r = true;
6340
6341 {
6342 RWLock::RLocker l(c->lock);
6343 OnodeRef o = c->get_onode(oid, false);
6344 if (!o || !o->exists)
6345 r = false;
6346 }
6347
7c673cae
FG
6348 return r;
6349}
6350
6351int BlueStore::stat(
6352 const coll_t& cid,
6353 const ghobject_t& oid,
6354 struct stat *st,
6355 bool allow_eio)
6356{
6357 CollectionHandle c = _get_collection(cid);
6358 if (!c)
6359 return -ENOENT;
6360 return stat(c, oid, st, allow_eio);
6361}
6362
6363int BlueStore::stat(
6364 CollectionHandle &c_,
6365 const ghobject_t& oid,
6366 struct stat *st,
6367 bool allow_eio)
6368{
6369 Collection *c = static_cast<Collection *>(c_.get());
6370 if (!c->exists)
6371 return -ENOENT;
6372 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
6373
6374 {
6375 RWLock::RLocker l(c->lock);
6376 OnodeRef o = c->get_onode(oid, false);
6377 if (!o || !o->exists)
6378 return -ENOENT;
6379 st->st_size = o->onode.size;
6380 st->st_blksize = 4096;
6381 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
6382 st->st_nlink = 1;
6383 }
6384
7c673cae
FG
6385 int r = 0;
6386 if (_debug_mdata_eio(oid)) {
6387 r = -EIO;
6388 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6389 }
6390 return r;
6391}
6392int BlueStore::set_collection_opts(
6393 const coll_t& cid,
6394 const pool_opts_t& opts)
6395{
6396 CollectionHandle ch = _get_collection(cid);
6397 if (!ch)
6398 return -ENOENT;
6399 Collection *c = static_cast<Collection *>(ch.get());
6400 dout(15) << __func__ << " " << cid << " options " << opts << dendl;
6401 if (!c->exists)
6402 return -ENOENT;
6403 RWLock::WLocker l(c->lock);
6404 c->pool_opts = opts;
6405 return 0;
6406}
6407
6408int BlueStore::read(
6409 const coll_t& cid,
6410 const ghobject_t& oid,
6411 uint64_t offset,
6412 size_t length,
6413 bufferlist& bl,
224ce89b 6414 uint32_t op_flags)
7c673cae
FG
6415{
6416 CollectionHandle c = _get_collection(cid);
6417 if (!c)
6418 return -ENOENT;
224ce89b 6419 return read(c, oid, offset, length, bl, op_flags);
7c673cae
FG
6420}
6421
6422int BlueStore::read(
6423 CollectionHandle &c_,
6424 const ghobject_t& oid,
6425 uint64_t offset,
6426 size_t length,
6427 bufferlist& bl,
224ce89b 6428 uint32_t op_flags)
7c673cae
FG
6429{
6430 utime_t start = ceph_clock_now();
6431 Collection *c = static_cast<Collection *>(c_.get());
6432 const coll_t &cid = c->get_cid();
6433 dout(15) << __func__ << " " << cid << " " << oid
6434 << " 0x" << std::hex << offset << "~" << length << std::dec
6435 << dendl;
6436 if (!c->exists)
6437 return -ENOENT;
6438
6439 bl.clear();
6440 int r;
6441 {
6442 RWLock::RLocker l(c->lock);
6443 utime_t start1 = ceph_clock_now();
6444 OnodeRef o = c->get_onode(oid, false);
6445 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
6446 if (!o || !o->exists) {
6447 r = -ENOENT;
6448 goto out;
6449 }
6450
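    // an offset and length of 0 means "read the whole object"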
6451 if (offset == length && offset == 0)
6452 length = o->onode.size;
6453
6454 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
6455 if (r == -EIO) {
6456 logger->inc(l_bluestore_read_eio);
6457 }
7c673cae
FG
6458 }
6459
6460 out:
28e407b8 6461 if (r >= 0 && _debug_data_eio(oid)) {
7c673cae
FG
6462 r = -EIO;
6463 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
224ce89b
WB
6464 } else if (cct->_conf->bluestore_debug_random_read_err &&
6465 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) {
6466 dout(0) << __func__ << ": inject random EIO" << dendl;
6467 r = -EIO;
7c673cae
FG
6468 }
6469 dout(10) << __func__ << " " << cid << " " << oid
6470 << " 0x" << std::hex << offset << "~" << length << std::dec
6471 << " = " << r << dendl;
6472 logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
6473 return r;
6474}
6475
6476// --------------------------------------------------------
6477// intermediate data structures used while reading
6478struct region_t {
6479 uint64_t logical_offset;
 6480  uint64_t blob_xoffset; // region offset within the blob
6481 uint64_t length;
6482 bufferlist bl;
6483
6484 // used later in read process
6485 uint64_t front = 0;
6486 uint64_t r_off = 0;
6487
6488 region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
6489 : logical_offset(offset),
6490 blob_xoffset(b_offs),
6491 length(len){}
6492 region_t(const region_t& from)
6493 : logical_offset(from.logical_offset),
6494 blob_xoffset(from.blob_xoffset),
6495 length(from.length){}
6496
6497 friend ostream& operator<<(ostream& out, const region_t& r) {
6498 return out << "0x" << std::hex << r.logical_offset << ":"
6499 << r.blob_xoffset << "~" << r.length << std::dec;
6500 }
6501};
6502
6503typedef list<region_t> regions2read_t;
6504typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
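// blobs2read_t is the read plan assembled by _do_read() below: for each blob
// it holds the (logical offset, blob offset, length) regions that could not
// be satisfied from the buffer cache and must be fetched from disk.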
6505
6506int BlueStore::_do_read(
6507 Collection *c,
6508 OnodeRef o,
6509 uint64_t offset,
6510 size_t length,
6511 bufferlist& bl,
6512 uint32_t op_flags)
6513{
6514 FUNCTRACE();
7c673cae
FG
6515 int r = 0;
6516
6517 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6518 << " size 0x" << o->onode.size << " (" << std::dec
6519 << o->onode.size << ")" << dendl;
6520 bl.clear();
6521
6522 if (offset >= o->onode.size) {
6523 return r;
6524 }
6525
6526 // generally, don't buffer anything, unless the client explicitly requests
6527 // it.
6528 bool buffered = false;
6529 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
6530 dout(20) << __func__ << " will do buffered read" << dendl;
6531 buffered = true;
6532 } else if (cct->_conf->bluestore_default_buffered_read &&
6533 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
6534 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
6535 dout(20) << __func__ << " defaulting to buffered read" << dendl;
6536 buffered = true;
6537 }
6538
6539 if (offset + length > o->onode.size) {
6540 length = o->onode.size - offset;
6541 }
6542
6543 utime_t start = ceph_clock_now();
6544 o->extent_map.fault_range(db, offset, length);
6545 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
6546 _dump_onode(o);
6547
6548 ready_regions_t ready_regions;
6549
 6550  // build a blob-wise list of the regions to read (those not in cache)
6551 blobs2read_t blobs2read;
6552 unsigned left = length;
6553 uint64_t pos = offset;
6554 unsigned num_regions = 0;
6555 auto lp = o->extent_map.seek_lextent(offset);
6556 while (left > 0 && lp != o->extent_map.extent_map.end()) {
6557 if (pos < lp->logical_offset) {
6558 unsigned hole = lp->logical_offset - pos;
6559 if (hole >= left) {
6560 break;
6561 }
6562 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
6563 << std::dec << dendl;
6564 pos += hole;
6565 left -= hole;
6566 }
94b18763 6567 BlobRef& bptr = lp->blob;
7c673cae
FG
6568 unsigned l_off = pos - lp->logical_offset;
6569 unsigned b_off = l_off + lp->blob_offset;
6570 unsigned b_len = std::min(left, lp->length - l_off);
6571
6572 ready_regions_t cache_res;
6573 interval_set<uint32_t> cache_interval;
6574 bptr->shared_blob->bc.read(
6575 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
6576 dout(20) << __func__ << " blob " << *bptr << std::hex
6577 << " need 0x" << b_off << "~" << b_len
6578 << " cache has 0x" << cache_interval
6579 << std::dec << dendl;
6580
6581 auto pc = cache_res.begin();
6582 while (b_len > 0) {
6583 unsigned l;
6584 if (pc != cache_res.end() &&
6585 pc->first == b_off) {
6586 l = pc->second.length();
6587 ready_regions[pos].claim(pc->second);
6588 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
6589 << b_off << "~" << l << std::dec << dendl;
6590 ++pc;
6591 } else {
6592 l = b_len;
6593 if (pc != cache_res.end()) {
6594 assert(pc->first > b_off);
6595 l = pc->first - b_off;
6596 }
6597 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
6598 << b_off << "~" << l << std::dec << dendl;
6599 blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
6600 ++num_regions;
6601 }
6602 pos += l;
6603 b_off += l;
6604 left -= l;
6605 b_len -= l;
6606 }
6607 ++lp;
6608 }
6609
6610 // read raw blob data. use aio if we have >1 blobs to read.
 6611  start = ceph_clock_now(); // for the sake of simplicity,
 6612                            // measure the whole block below;
 6613                            // the inaccuracy is negligible.
6614 vector<bufferlist> compressed_blob_bls;
b32b8144 6615 IOContext ioc(cct, NULL, true); // allow EIO
7c673cae 6616 for (auto& p : blobs2read) {
94b18763 6617 const BlobRef& bptr = p.first;
7c673cae
FG
6618 dout(20) << __func__ << " blob " << *bptr << std::hex
6619 << " need " << p.second << std::dec << dendl;
6620 if (bptr->get_blob().is_compressed()) {
6621 // read the whole thing
6622 if (compressed_blob_bls.empty()) {
6623 // ensure we avoid any reallocation on subsequent blobs
6624 compressed_blob_bls.reserve(blobs2read.size());
6625 }
6626 compressed_blob_bls.push_back(bufferlist());
6627 bufferlist& bl = compressed_blob_bls.back();
6628 r = bptr->get_blob().map(
6629 0, bptr->get_blob().get_ondisk_length(),
6630 [&](uint64_t offset, uint64_t length) {
6631 int r;
6632 // use aio if there are more regions to read than those in this blob
6633 if (num_regions > p.second.size()) {
6634 r = bdev->aio_read(offset, length, &bl, &ioc);
6635 } else {
6636 r = bdev->read(offset, length, &bl, &ioc, false);
6637 }
6638 if (r < 0)
6639 return r;
6640 return 0;
6641 });
b32b8144
FG
6642 if (r < 0) {
6643 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
6644 if (r == -EIO) {
6645 // propagate EIO to caller
6646 return r;
6647 }
7c673cae 6648 assert(r == 0);
b32b8144 6649 }
7c673cae
FG
6650 } else {
6651 // read the pieces
6652 for (auto& reg : p.second) {
6653 // determine how much of the blob to read
6654 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
6655 reg.r_off = reg.blob_xoffset;
6656 uint64_t r_len = reg.length;
6657 reg.front = reg.r_off % chunk_size;
6658 if (reg.front) {
6659 reg.r_off -= reg.front;
6660 r_len += reg.front;
6661 }
6662 unsigned tail = r_len % chunk_size;
6663 if (tail) {
6664 r_len += chunk_size - tail;
6665 }
6666 dout(20) << __func__ << " region 0x" << std::hex
6667 << reg.logical_offset
6668 << ": 0x" << reg.blob_xoffset << "~" << reg.length
6669 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
6670 << dendl;
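	// e.g. (illustrative numbers): with chunk_size 0x1000, a region at
	// blob_xoffset 0x1800 of length 0x1000 gets front=0x800, r_off=0x1000
	// and r_len=0x2000 -- whole chunks are read and the extra bytes are
	// trimmed off again below, after checksum verification.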
6671
6672 // read it
6673 r = bptr->get_blob().map(
6674 reg.r_off, r_len,
6675 [&](uint64_t offset, uint64_t length) {
6676 int r;
6677 // use aio if there is more than one region to read
6678 if (num_regions > 1) {
6679 r = bdev->aio_read(offset, length, &reg.bl, &ioc);
6680 } else {
6681 r = bdev->read(offset, length, &reg.bl, &ioc, false);
6682 }
6683 if (r < 0)
6684 return r;
6685 return 0;
6686 });
b32b8144
FG
6687 if (r < 0) {
6688 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
6689 << dendl;
6690 if (r == -EIO) {
6691 // propagate EIO to caller
6692 return r;
6693 }
6694 assert(r == 0);
6695 }
7c673cae
FG
6696 assert(reg.bl.length() == r_len);
6697 }
6698 }
6699 }
6700 if (ioc.has_pending_aios()) {
6701 bdev->aio_submit(&ioc);
6702 dout(20) << __func__ << " waiting for aio" << dendl;
6703 ioc.aio_wait();
b32b8144
FG
6704 r = ioc.get_return_value();
6705 if (r < 0) {
6706 assert(r == -EIO); // no other errors allowed
6707 return -EIO;
6708 }
7c673cae
FG
6709 }
6710 logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
6711
6712 // enumerate and decompress desired blobs
6713 auto p = compressed_blob_bls.begin();
6714 blobs2read_t::iterator b2r_it = blobs2read.begin();
6715 while (b2r_it != blobs2read.end()) {
94b18763 6716 const BlobRef& bptr = b2r_it->first;
7c673cae
FG
6717 dout(20) << __func__ << " blob " << *bptr << std::hex
6718 << " need 0x" << b2r_it->second << std::dec << dendl;
6719 if (bptr->get_blob().is_compressed()) {
6720 assert(p != compressed_blob_bls.end());
6721 bufferlist& compressed_bl = *p++;
6722 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
6723 b2r_it->second.front().logical_offset) < 0) {
6724 return -EIO;
6725 }
6726 bufferlist raw_bl;
6727 r = _decompress(compressed_bl, &raw_bl);
6728 if (r < 0)
6729 return r;
6730 if (buffered) {
6731 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
6732 raw_bl);
6733 }
6734 for (auto& i : b2r_it->second) {
6735 ready_regions[i.logical_offset].substr_of(
6736 raw_bl, i.blob_xoffset, i.length);
6737 }
6738 } else {
6739 for (auto& reg : b2r_it->second) {
6740 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
6741 reg.logical_offset) < 0) {
6742 return -EIO;
6743 }
6744 if (buffered) {
6745 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
6746 reg.r_off, reg.bl);
6747 }
6748
6749 // prune and keep result
6750 ready_regions[reg.logical_offset].substr_of(
6751 reg.bl, reg.front, reg.length);
6752 }
6753 }
6754 ++b2r_it;
6755 }
6756
6757 // generate a resulting buffer
6758 auto pr = ready_regions.begin();
6759 auto pr_end = ready_regions.end();
6760 pos = 0;
6761 while (pos < length) {
6762 if (pr != pr_end && pr->first == pos + offset) {
6763 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6764 << ": data from 0x" << pr->first << "~" << pr->second.length()
6765 << std::dec << dendl;
6766 pos += pr->second.length();
6767 bl.claim_append(pr->second);
6768 ++pr;
6769 } else {
6770 uint64_t l = length - pos;
6771 if (pr != pr_end) {
6772 assert(pr->first > pos + offset);
6773 l = pr->first - (pos + offset);
6774 }
6775 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6776 << ": zeros for 0x" << (pos + offset) << "~" << l
6777 << std::dec << dendl;
6778 bl.append_zero(l);
6779 pos += l;
6780 }
6781 }
6782 assert(bl.length() == length);
6783 assert(pos == length);
6784 assert(pr == pr_end);
6785 r = bl.length();
6786 return r;
6787}
6788
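// Verify checksums for the (chunk-aligned) data just read from a blob.  On a
// mismatch, log the failing csum chunk, its device location and the affected
// logical extent of the object; the caller treats any negative return as EIO.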
6789int BlueStore::_verify_csum(OnodeRef& o,
6790 const bluestore_blob_t* blob, uint64_t blob_xoffset,
6791 const bufferlist& bl,
6792 uint64_t logical_offset) const
6793{
6794 int bad;
6795 uint64_t bad_csum;
6796 utime_t start = ceph_clock_now();
6797 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
6798 if (r < 0) {
6799 if (r == -1) {
6800 PExtentVector pex;
6801 blob->map(
6802 bad,
6803 blob->get_csum_chunk_size(),
6804 [&](uint64_t offset, uint64_t length) {
6805 pex.emplace_back(bluestore_pextent_t(offset, length));
6806 return 0;
6807 });
6808 derr << __func__ << " bad "
6809 << Checksummer::get_csum_type_string(blob->csum_type)
6810 << "/0x" << std::hex << blob->get_csum_chunk_size()
6811 << " checksum at blob offset 0x" << bad
6812 << ", got 0x" << bad_csum << ", expected 0x"
6813 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
6814 << ", device location " << pex
6815 << ", logical extent 0x" << std::hex
6816 << (logical_offset + bad - blob_xoffset) << "~"
6817 << blob->get_csum_chunk_size() << std::dec
6818 << ", object " << o->oid
6819 << dendl;
6820 } else {
 6821      derr << __func__ << " failed: " << cpp_strerror(r) << dendl;
6822 }
6823 }
6824 logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
6825 return r;
6826}
6827
6828int BlueStore::_decompress(bufferlist& source, bufferlist* result)
6829{
6830 int r = 0;
6831 utime_t start = ceph_clock_now();
6832 bufferlist::iterator i = source.begin();
6833 bluestore_compression_header_t chdr;
6834 ::decode(chdr, i);
6835 int alg = int(chdr.type);
6836 CompressorRef cp = compressor;
6837 if (!cp || (int)cp->get_type() != alg) {
6838 cp = Compressor::create(cct, alg);
6839 }
6840
6841 if (!cp.get()) {
 6842    // if the compressor isn't available we must return an error, because we
 6843    // cannot produce the decompressed data
6844 derr << __func__ << " can't load decompressor " << alg << dendl;
6845 r = -EIO;
6846 } else {
6847 r = cp->decompress(i, chdr.length, *result);
6848 if (r < 0) {
 6849      derr << __func__ << " decompression failed with error code " << r << dendl;
6850 r = -EIO;
6851 }
6852 }
6853 logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
6854 return r;
6855}
6856
 6857// this stores the fiemap result into an interval_set; the other variants
 6858// below use it internally
6859int BlueStore::_fiemap(
6860 CollectionHandle &c_,
6861 const ghobject_t& oid,
6862 uint64_t offset,
6863 size_t length,
6864 interval_set<uint64_t>& destset)
6865{
6866 Collection *c = static_cast<Collection *>(c_.get());
6867 if (!c->exists)
6868 return -ENOENT;
6869 {
6870 RWLock::RLocker l(c->lock);
6871
6872 OnodeRef o = c->get_onode(oid, false);
6873 if (!o || !o->exists) {
6874 return -ENOENT;
6875 }
6876 _dump_onode(o);
6877
6878 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6879 << " size 0x" << o->onode.size << std::dec << dendl;
6880
6881 boost::intrusive::set<Extent>::iterator ep, eend;
6882 if (offset >= o->onode.size)
6883 goto out;
6884
6885 if (offset + length > o->onode.size) {
6886 length = o->onode.size - offset;
6887 }
6888
6889 o->extent_map.fault_range(db, offset, length);
6890 eend = o->extent_map.extent_map.end();
6891 ep = o->extent_map.seek_lextent(offset);
6892 while (length > 0) {
6893 dout(20) << __func__ << " offset " << offset << dendl;
6894 if (ep != eend && ep->logical_offset + ep->length <= offset) {
6895 ++ep;
6896 continue;
6897 }
6898
6899 uint64_t x_len = length;
6900 if (ep != eend && ep->logical_offset <= offset) {
6901 uint64_t x_off = offset - ep->logical_offset;
6902 x_len = MIN(x_len, ep->length - x_off);
6903 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
6904 << x_len << std::dec << " blob " << ep->blob << dendl;
6905 destset.insert(offset, x_len);
6906 length -= x_len;
6907 offset += x_len;
6908 if (x_off + x_len == ep->length)
6909 ++ep;
6910 continue;
6911 }
6912 if (ep != eend &&
6913 ep->logical_offset > offset &&
6914 ep->logical_offset - offset < x_len) {
6915 x_len = ep->logical_offset - offset;
6916 }
6917 offset += x_len;
6918 length -= x_len;
6919 }
6920 }
6921
6922 out:
7c673cae
FG
6923 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6924 << " size = 0x(" << destset << ")" << std::dec << dendl;
6925 return 0;
6926}
6927
6928int BlueStore::fiemap(
6929 const coll_t& cid,
6930 const ghobject_t& oid,
6931 uint64_t offset,
6932 size_t len,
6933 bufferlist& bl)
6934{
6935 CollectionHandle c = _get_collection(cid);
6936 if (!c)
6937 return -ENOENT;
6938 return fiemap(c, oid, offset, len, bl);
6939}
6940
6941int BlueStore::fiemap(
6942 CollectionHandle &c_,
6943 const ghobject_t& oid,
6944 uint64_t offset,
6945 size_t length,
6946 bufferlist& bl)
6947{
6948 interval_set<uint64_t> m;
6949 int r = _fiemap(c_, oid, offset, length, m);
6950 if (r >= 0) {
6951 ::encode(m, bl);
6952 }
6953 return r;
6954}
6955
6956int BlueStore::fiemap(
6957 const coll_t& cid,
6958 const ghobject_t& oid,
6959 uint64_t offset,
6960 size_t len,
6961 map<uint64_t, uint64_t>& destmap)
6962{
6963 CollectionHandle c = _get_collection(cid);
6964 if (!c)
6965 return -ENOENT;
6966 return fiemap(c, oid, offset, len, destmap);
6967}
6968
6969int BlueStore::fiemap(
6970 CollectionHandle &c_,
6971 const ghobject_t& oid,
6972 uint64_t offset,
6973 size_t length,
6974 map<uint64_t, uint64_t>& destmap)
6975{
6976 interval_set<uint64_t> m;
6977 int r = _fiemap(c_, oid, offset, length, m);
6978 if (r >= 0) {
6979 m.move_into(destmap);
6980 }
6981 return r;
6982}
6983
6984int BlueStore::getattr(
6985 const coll_t& cid,
6986 const ghobject_t& oid,
6987 const char *name,
6988 bufferptr& value)
6989{
6990 CollectionHandle c = _get_collection(cid);
6991 if (!c)
6992 return -ENOENT;
6993 return getattr(c, oid, name, value);
6994}
6995
6996int BlueStore::getattr(
6997 CollectionHandle &c_,
6998 const ghobject_t& oid,
6999 const char *name,
7000 bufferptr& value)
7001{
7002 Collection *c = static_cast<Collection *>(c_.get());
7003 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
7004 if (!c->exists)
7005 return -ENOENT;
7006
7007 int r;
7008 {
7009 RWLock::RLocker l(c->lock);
31f18b77 7010 mempool::bluestore_cache_other::string k(name);
7c673cae
FG
7011
7012 OnodeRef o = c->get_onode(oid, false);
7013 if (!o || !o->exists) {
7014 r = -ENOENT;
7015 goto out;
7016 }
7017
7018 if (!o->onode.attrs.count(k)) {
7019 r = -ENODATA;
7020 goto out;
7021 }
7022 value = o->onode.attrs[k];
7023 r = 0;
7024 }
7025 out:
7c673cae
FG
7026 if (r == 0 && _debug_mdata_eio(oid)) {
7027 r = -EIO;
7028 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7029 }
7030 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
7031 << " = " << r << dendl;
7032 return r;
7033}
7034
7035
7036int BlueStore::getattrs(
7037 const coll_t& cid,
7038 const ghobject_t& oid,
7039 map<string,bufferptr>& aset)
7040{
7041 CollectionHandle c = _get_collection(cid);
7042 if (!c)
7043 return -ENOENT;
7044 return getattrs(c, oid, aset);
7045}
7046
7047int BlueStore::getattrs(
7048 CollectionHandle &c_,
7049 const ghobject_t& oid,
7050 map<string,bufferptr>& aset)
7051{
7052 Collection *c = static_cast<Collection *>(c_.get());
7053 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
7054 if (!c->exists)
7055 return -ENOENT;
7056
7057 int r;
7058 {
7059 RWLock::RLocker l(c->lock);
7060
7061 OnodeRef o = c->get_onode(oid, false);
7062 if (!o || !o->exists) {
7063 r = -ENOENT;
7064 goto out;
7065 }
7066 for (auto& i : o->onode.attrs) {
7067 aset.emplace(i.first.c_str(), i.second);
7068 }
7069 r = 0;
7070 }
7071
7072 out:
7c673cae
FG
7073 if (r == 0 && _debug_mdata_eio(oid)) {
7074 r = -EIO;
7075 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7076 }
7077 dout(10) << __func__ << " " << c->cid << " " << oid
7078 << " = " << r << dendl;
7079 return r;
7080}
7081
7082int BlueStore::list_collections(vector<coll_t>& ls)
7083{
7084 RWLock::RLocker l(coll_lock);
7085 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
7086 p != coll_map.end();
7087 ++p)
7088 ls.push_back(p->first);
7089 return 0;
7090}
7091
7092bool BlueStore::collection_exists(const coll_t& c)
7093{
7094 RWLock::RLocker l(coll_lock);
7095 return coll_map.count(c);
7096}
7097
7098int BlueStore::collection_empty(const coll_t& cid, bool *empty)
7099{
7100 dout(15) << __func__ << " " << cid << dendl;
7101 vector<ghobject_t> ls;
7102 ghobject_t next;
7103 int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
7104 &ls, &next);
7105 if (r < 0) {
7106 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
7107 << dendl;
7108 return r;
7109 }
7110 *empty = ls.empty();
7111 dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
7112 return 0;
7113}
7114
7115int BlueStore::collection_bits(const coll_t& cid)
7116{
7117 dout(15) << __func__ << " " << cid << dendl;
7118 CollectionRef c = _get_collection(cid);
7119 if (!c)
7120 return -ENOENT;
7121 RWLock::RLocker l(c->lock);
7122 dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
7123 return c->cnode.bits;
7124}
7125
7126int BlueStore::collection_list(
7127 const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
7128 vector<ghobject_t> *ls, ghobject_t *pnext)
7129{
7130 CollectionHandle c = _get_collection(cid);
7131 if (!c)
7132 return -ENOENT;
7133 return collection_list(c, start, end, max, ls, pnext);
7134}
7135
7136int BlueStore::collection_list(
7137 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
7138 vector<ghobject_t> *ls, ghobject_t *pnext)
7139{
7140 Collection *c = static_cast<Collection *>(c_.get());
7141 dout(15) << __func__ << " " << c->cid
7142 << " start " << start << " end " << end << " max " << max << dendl;
7143 int r;
7144 {
7145 RWLock::RLocker l(c->lock);
7146 r = _collection_list(c, start, end, max, ls, pnext);
7147 }
7148
7c673cae
FG
7149 dout(10) << __func__ << " " << c->cid
7150 << " start " << start << " end " << end << " max " << max
7151 << " = " << r << ", ls.size() = " << ls->size()
7152 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
7153 return r;
7154}
7155
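// Object keys for a collection are split across a "temp" key range and a
// normal key range (see get_coll_key_range()).  Enumeration starts in the
// temp range and, once it is exhausted, switches to the normal range, so
// temp objects are listed before regular ones.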
7156int BlueStore::_collection_list(
7157 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
7158 vector<ghobject_t> *ls, ghobject_t *pnext)
7159{
7160
7161 if (!c->exists)
7162 return -ENOENT;
7163
7164 int r = 0;
7165 ghobject_t static_next;
7166 KeyValueDB::Iterator it;
7167 string temp_start_key, temp_end_key;
7168 string start_key, end_key;
7169 bool set_next = false;
7170 string pend;
7171 bool temp;
7172
7173 if (!pnext)
7174 pnext = &static_next;
7175
7176 if (start == ghobject_t::get_max() ||
7177 start.hobj.is_max()) {
7178 goto out;
7179 }
7180 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
7181 &start_key, &end_key);
7182 dout(20) << __func__
7183 << " range " << pretty_binary_string(temp_start_key)
7184 << " to " << pretty_binary_string(temp_end_key)
7185 << " and " << pretty_binary_string(start_key)
7186 << " to " << pretty_binary_string(end_key)
7187 << " start " << start << dendl;
7188 it = db->get_iterator(PREFIX_OBJ);
7189 if (start == ghobject_t() ||
7190 start.hobj == hobject_t() ||
7191 start == c->cid.get_min_hobj()) {
7192 it->upper_bound(temp_start_key);
7193 temp = true;
7194 } else {
7195 string k;
7196 get_object_key(cct, start, &k);
7197 if (start.hobj.is_temp()) {
7198 temp = true;
7199 assert(k >= temp_start_key && k < temp_end_key);
7200 } else {
7201 temp = false;
7202 assert(k >= start_key && k < end_key);
7203 }
7204 dout(20) << " start from " << pretty_binary_string(k)
7205 << " temp=" << (int)temp << dendl;
7206 it->lower_bound(k);
7207 }
7208 if (end.hobj.is_max()) {
7209 pend = temp ? temp_end_key : end_key;
7210 } else {
7211 get_object_key(cct, end, &end_key);
7212 if (end.hobj.is_temp()) {
7213 if (temp)
7214 pend = end_key;
7215 else
7216 goto out;
7217 } else {
7218 pend = temp ? temp_end_key : end_key;
7219 }
7220 }
7221 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7222 while (true) {
7223 if (!it->valid() || it->key() >= pend) {
7224 if (!it->valid())
7225 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
7226 else
7227 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
7228 << " >= " << end << dendl;
7229 if (temp) {
7230 if (end.hobj.is_temp()) {
7231 break;
7232 }
7233 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
7234 temp = false;
7235 it->upper_bound(start_key);
7236 pend = end_key;
7237 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7238 continue;
7239 }
7240 break;
7241 }
7242 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
7243 if (is_extent_shard_key(it->key())) {
7244 it->next();
7245 continue;
7246 }
7247 ghobject_t oid;
7248 int r = get_key_object(it->key(), &oid);
7249 assert(r == 0);
7250 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
7251 if (ls->size() >= (unsigned)max) {
7252 dout(20) << __func__ << " reached max " << max << dendl;
7253 *pnext = oid;
7254 set_next = true;
7255 break;
7256 }
7257 ls->push_back(oid);
7258 it->next();
7259 }
7260out:
7261 if (!set_next) {
7262 *pnext = ghobject_t::get_max();
7263 }
7264
7265 return r;
7266}
7267
7c673cae
FG
7268int BlueStore::omap_get(
7269 const coll_t& cid, ///< [in] Collection containing oid
7270 const ghobject_t &oid, ///< [in] Object containing omap
7271 bufferlist *header, ///< [out] omap header
 7272  map<string, bufferlist> *out ///< [out] Key to value map
7273 )
7274{
7275 CollectionHandle c = _get_collection(cid);
7276 if (!c)
7277 return -ENOENT;
7278 return omap_get(c, oid, header, out);
7279}
7280
7281int BlueStore::omap_get(
7282 CollectionHandle &c_, ///< [in] Collection containing oid
7283 const ghobject_t &oid, ///< [in] Object containing omap
7284 bufferlist *header, ///< [out] omap header
 7285  map<string, bufferlist> *out ///< [out] Key to value map
7286 )
7287{
7288 Collection *c = static_cast<Collection *>(c_.get());
7289 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7290 if (!c->exists)
7291 return -ENOENT;
7292 RWLock::RLocker l(c->lock);
7293 int r = 0;
7294 OnodeRef o = c->get_onode(oid, false);
7295 if (!o || !o->exists) {
7296 r = -ENOENT;
7297 goto out;
7298 }
7299 if (!o->onode.has_omap())
7300 goto out;
7301 o->flush();
7302 {
7303 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7304 string head, tail;
7305 get_omap_header(o->onode.nid, &head);
7306 get_omap_tail(o->onode.nid, &tail);
7307 it->lower_bound(head);
7308 while (it->valid()) {
7309 if (it->key() == head) {
7310 dout(30) << __func__ << " got header" << dendl;
7311 *header = it->value();
7312 } else if (it->key() >= tail) {
7313 dout(30) << __func__ << " reached tail" << dendl;
7314 break;
7315 } else {
7316 string user_key;
7317 decode_omap_key(it->key(), &user_key);
7318 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7319 << " -> " << user_key << dendl;
7320 (*out)[user_key] = it->value();
7321 }
7322 it->next();
7323 }
7324 }
7325 out:
7326 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7327 << dendl;
7328 return r;
7329}
7330
7331int BlueStore::omap_get_header(
7332 const coll_t& cid, ///< [in] Collection containing oid
7333 const ghobject_t &oid, ///< [in] Object containing omap
7334 bufferlist *header, ///< [out] omap header
7335 bool allow_eio ///< [in] don't assert on eio
7336 )
7337{
7338 CollectionHandle c = _get_collection(cid);
7339 if (!c)
7340 return -ENOENT;
7341 return omap_get_header(c, oid, header, allow_eio);
7342}
7343
7344int BlueStore::omap_get_header(
7345 CollectionHandle &c_, ///< [in] Collection containing oid
7346 const ghobject_t &oid, ///< [in] Object containing omap
7347 bufferlist *header, ///< [out] omap header
7348 bool allow_eio ///< [in] don't assert on eio
7349 )
7350{
7351 Collection *c = static_cast<Collection *>(c_.get());
7352 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7353 if (!c->exists)
7354 return -ENOENT;
7355 RWLock::RLocker l(c->lock);
7356 int r = 0;
7357 OnodeRef o = c->get_onode(oid, false);
7358 if (!o || !o->exists) {
7359 r = -ENOENT;
7360 goto out;
7361 }
7362 if (!o->onode.has_omap())
7363 goto out;
7364 o->flush();
7365 {
7366 string head;
7367 get_omap_header(o->onode.nid, &head);
7368 if (db->get(PREFIX_OMAP, head, header) >= 0) {
7369 dout(30) << __func__ << " got header" << dendl;
7370 } else {
7371 dout(30) << __func__ << " no header" << dendl;
7372 }
7373 }
7374 out:
7375 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7376 << dendl;
7377 return r;
7378}
7379
7380int BlueStore::omap_get_keys(
7381 const coll_t& cid, ///< [in] Collection containing oid
7382 const ghobject_t &oid, ///< [in] Object containing omap
7383 set<string> *keys ///< [out] Keys defined on oid
7384 )
7385{
7386 CollectionHandle c = _get_collection(cid);
7387 if (!c)
7388 return -ENOENT;
7389 return omap_get_keys(c, oid, keys);
7390}
7391
7392int BlueStore::omap_get_keys(
7393 CollectionHandle &c_, ///< [in] Collection containing oid
7394 const ghobject_t &oid, ///< [in] Object containing omap
7395 set<string> *keys ///< [out] Keys defined on oid
7396 )
7397{
7398 Collection *c = static_cast<Collection *>(c_.get());
7399 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7400 if (!c->exists)
7401 return -ENOENT;
7402 RWLock::RLocker l(c->lock);
7403 int r = 0;
7404 OnodeRef o = c->get_onode(oid, false);
7405 if (!o || !o->exists) {
7406 r = -ENOENT;
7407 goto out;
7408 }
7409 if (!o->onode.has_omap())
7410 goto out;
7411 o->flush();
7412 {
7413 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7414 string head, tail;
7415 get_omap_key(o->onode.nid, string(), &head);
7416 get_omap_tail(o->onode.nid, &tail);
7417 it->lower_bound(head);
7418 while (it->valid()) {
7419 if (it->key() >= tail) {
7420 dout(30) << __func__ << " reached tail" << dendl;
7421 break;
7422 }
7423 string user_key;
7424 decode_omap_key(it->key(), &user_key);
7425 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7426 << " -> " << user_key << dendl;
7427 keys->insert(user_key);
7428 it->next();
7429 }
7430 }
7431 out:
7432 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7433 << dendl;
7434 return r;
7435}
7436
7437int BlueStore::omap_get_values(
7438 const coll_t& cid, ///< [in] Collection containing oid
7439 const ghobject_t &oid, ///< [in] Object containing omap
7440 const set<string> &keys, ///< [in] Keys to get
7441 map<string, bufferlist> *out ///< [out] Returned keys and values
7442 )
7443{
7444 CollectionHandle c = _get_collection(cid);
7445 if (!c)
7446 return -ENOENT;
7447 return omap_get_values(c, oid, keys, out);
7448}
7449
7450int BlueStore::omap_get_values(
7451 CollectionHandle &c_, ///< [in] Collection containing oid
7452 const ghobject_t &oid, ///< [in] Object containing omap
7453 const set<string> &keys, ///< [in] Keys to get
7454 map<string, bufferlist> *out ///< [out] Returned keys and values
7455 )
7456{
7457 Collection *c = static_cast<Collection *>(c_.get());
7458 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7459 if (!c->exists)
7460 return -ENOENT;
7461 RWLock::RLocker l(c->lock);
7462 int r = 0;
7463 string final_key;
7464 OnodeRef o = c->get_onode(oid, false);
7465 if (!o || !o->exists) {
7466 r = -ENOENT;
7467 goto out;
7468 }
7469 if (!o->onode.has_omap())
7470 goto out;
7471 o->flush();
7472 _key_encode_u64(o->onode.nid, &final_key);
7473 final_key.push_back('.');
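  // final_key now holds the 8-byte encoded nid followed by '.'; each user
  // key below is appended after this 9-byte prefix (hence resize(9) in the loop).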
7474 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7475 final_key.resize(9); // keep prefix
7476 final_key += *p;
7477 bufferlist val;
7478 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7479 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
7480 << " -> " << *p << dendl;
7481 out->insert(make_pair(*p, val));
7482 }
7483 }
7484 out:
7485 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7486 << dendl;
7487 return r;
7488}
7489
7490int BlueStore::omap_check_keys(
7491 const coll_t& cid, ///< [in] Collection containing oid
7492 const ghobject_t &oid, ///< [in] Object containing omap
7493 const set<string> &keys, ///< [in] Keys to check
7494 set<string> *out ///< [out] Subset of keys defined on oid
7495 )
7496{
7497 CollectionHandle c = _get_collection(cid);
7498 if (!c)
7499 return -ENOENT;
7500 return omap_check_keys(c, oid, keys, out);
7501}
7502
7503int BlueStore::omap_check_keys(
7504 CollectionHandle &c_, ///< [in] Collection containing oid
7505 const ghobject_t &oid, ///< [in] Object containing omap
7506 const set<string> &keys, ///< [in] Keys to check
7507 set<string> *out ///< [out] Subset of keys defined on oid
7508 )
7509{
7510 Collection *c = static_cast<Collection *>(c_.get());
7511 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7512 if (!c->exists)
7513 return -ENOENT;
7514 RWLock::RLocker l(c->lock);
7515 int r = 0;
7516 string final_key;
7517 OnodeRef o = c->get_onode(oid, false);
7518 if (!o || !o->exists) {
7519 r = -ENOENT;
7520 goto out;
7521 }
7522 if (!o->onode.has_omap())
7523 goto out;
7524 o->flush();
7525 _key_encode_u64(o->onode.nid, &final_key);
7526 final_key.push_back('.');
7527 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7528 final_key.resize(9); // keep prefix
7529 final_key += *p;
7530 bufferlist val;
7531 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7532 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
7533 << " -> " << *p << dendl;
7534 out->insert(*p);
7535 } else {
7536 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
7537 << " -> " << *p << dendl;
7538 }
7539 }
7540 out:
7541 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7542 << dendl;
7543 return r;
7544}
7545
7546ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7547 const coll_t& cid, ///< [in] collection
7548 const ghobject_t &oid ///< [in] object
7549 )
7550{
7551 CollectionHandle c = _get_collection(cid);
7552 if (!c) {
7553    dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
7554 return ObjectMap::ObjectMapIterator();
7555 }
7556 return get_omap_iterator(c, oid);
7557}
7558
7559ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7560 CollectionHandle &c_, ///< [in] collection
7561 const ghobject_t &oid ///< [in] object
7562 )
7563{
7564 Collection *c = static_cast<Collection *>(c_.get());
7565 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
7566 if (!c->exists) {
7567 return ObjectMap::ObjectMapIterator();
7568 }
7569 RWLock::RLocker l(c->lock);
7570 OnodeRef o = c->get_onode(oid, false);
7571 if (!o || !o->exists) {
7572    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
7573 return ObjectMap::ObjectMapIterator();
7574 }
7575 o->flush();
7576  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
7577 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7578 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
7579}
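// Illustrative usage sketch (not part of the original source): how a caller
// might read an object's omap through the interfaces above. 'store', 'cid'
// and 'oid' are hypothetical values owned by the caller.
//
//   bufferlist header;
//   map<string, bufferlist> values;
//   int r = store->omap_get(cid, oid, &header, &values);
//   if (r == 0) {
//     for (auto& kv : values) {
//       // kv.first is the user key, kv.second its value
//     }
//   }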
7580
7581// -----------------
7582// write helpers
7583
7584void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
7585{
7586 dout(10) << __func__ << " ondisk_format " << ondisk_format
7587 << " min_compat_ondisk_format " << min_compat_ondisk_format
7588 << dendl;
7589 assert(ondisk_format == latest_ondisk_format);
7590 {
7591 bufferlist bl;
7592 ::encode(ondisk_format, bl);
7593 t->set(PREFIX_SUPER, "ondisk_format", bl);
7594 }
7595 {
7596 bufferlist bl;
7597 ::encode(min_compat_ondisk_format, bl);
7598 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
7599 }
7600}
7601
7602int BlueStore::_open_super_meta()
7603{
7604 // nid
7605 {
7606 nid_max = 0;
7607 bufferlist bl;
7608 db->get(PREFIX_SUPER, "nid_max", &bl);
7609 bufferlist::iterator p = bl.begin();
7610 try {
7611 uint64_t v;
7612 ::decode(v, p);
7613 nid_max = v;
7614 } catch (buffer::error& e) {
7615 derr << __func__ << " unable to read nid_max" << dendl;
7616 return -EIO;
7617 }
7618 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
7619 nid_last = nid_max.load();
7620 }
7621
7622 // blobid
7623 {
7624 blobid_max = 0;
7625 bufferlist bl;
7626 db->get(PREFIX_SUPER, "blobid_max", &bl);
7627 bufferlist::iterator p = bl.begin();
7628 try {
7629 uint64_t v;
7630 ::decode(v, p);
7631 blobid_max = v;
7632 } catch (buffer::error& e) {
7633 derr << __func__ << " unable to read blobid_max" << dendl;
7634 return -EIO;
7635 }
7636 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
7637 blobid_last = blobid_max.load();
7638 }
7639
7640 // freelist
7641 {
7642 bufferlist bl;
7643 db->get(PREFIX_SUPER, "freelist_type", &bl);
7644 if (bl.length()) {
7645 freelist_type = std::string(bl.c_str(), bl.length());
7646 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
7647 } else {
7648      assert("unsupported extent freelist manager" == 0);
7649 }
7650 }
7651
7652 // bluefs alloc
7653 if (cct->_conf->bluestore_bluefs) {
7654 bluefs_extents.clear();
7655 bufferlist bl;
7656 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7657 bufferlist::iterator p = bl.begin();
7658 try {
7659 ::decode(bluefs_extents, p);
7660 }
7661 catch (buffer::error& e) {
7662 derr << __func__ << " unable to read bluefs_extents" << dendl;
7663 return -EIO;
7664 }
7665 dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
7666 << std::dec << dendl;
7667 }
7668
7669 // ondisk format
7670 int32_t compat_ondisk_format = 0;
7671 {
7672 bufferlist bl;
7673 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
7674 if (r < 0) {
7675 // base case: kraken bluestore is v1 and readable by v1
7676 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
7677 << dendl;
7678 ondisk_format = 1;
7679 compat_ondisk_format = 1;
7680 } else {
7681 auto p = bl.begin();
7682 try {
7683 ::decode(ondisk_format, p);
7684 } catch (buffer::error& e) {
7685 derr << __func__ << " unable to read ondisk_format" << dendl;
7686 return -EIO;
7687 }
7688 bl.clear();
7689 {
7690 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
7691 assert(!r);
7692 auto p = bl.begin();
7693 try {
7694 ::decode(compat_ondisk_format, p);
7695 } catch (buffer::error& e) {
7696 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
7697 return -EIO;
7698 }
7699 }
7700 }
7701 dout(10) << __func__ << " ondisk_format " << ondisk_format
7702 << " compat_ondisk_format " << compat_ondisk_format
7703 << dendl;
7704 }
7705
7706 if (latest_ondisk_format < compat_ondisk_format) {
7707 derr << __func__ << " compat_ondisk_format is "
7708 << compat_ondisk_format << " but we only understand version "
7709 << latest_ondisk_format << dendl;
7710 return -EPERM;
7711 }
7712 if (ondisk_format < latest_ondisk_format) {
7713 int r = _upgrade_super();
7714 if (r < 0) {
7715 return r;
7716 }
7717 }
7718
7719 {
7720 bufferlist bl;
7721 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
7722 auto p = bl.begin();
7723 try {
7724 uint64_t val;
7725 ::decode(val, p);
7726 min_alloc_size = val;
224ce89b
WB
7727 min_alloc_size_order = ctz(val);
7728 assert(min_alloc_size == 1u << min_alloc_size_order);
7c673cae
FG
7729 } catch (buffer::error& e) {
7730 derr << __func__ << " unable to read min_alloc_size" << dendl;
7731 return -EIO;
7732 }
7733 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7734 << std::dec << dendl;
7735 }
224ce89b 7736 _open_statfs();
7c673cae
FG
7737 _set_alloc_sizes();
7738 _set_throttle_params();
7739
7740 _set_csum();
7741 _set_compression();
7742 _set_blob_size();
7743
1adf2230
AA
7744 _set_finisher_num();
7745
7c673cae
FG
7746 return 0;
7747}
7748
7749int BlueStore::_upgrade_super()
7750{
7751 dout(1) << __func__ << " from " << ondisk_format << ", latest "
7752 << latest_ondisk_format << dendl;
7753 assert(ondisk_format > 0);
7754 assert(ondisk_format < latest_ondisk_format);
7755
7756 if (ondisk_format == 1) {
7757 // changes:
7758 // - super: added ondisk_format
7759 // - super: added min_readable_ondisk_format
7760 // - super: added min_compat_ondisk_format
7761 // - super: added min_alloc_size
7762 // - super: removed min_min_alloc_size
7763 KeyValueDB::Transaction t = db->get_transaction();
7764 {
7765 bufferlist bl;
7766 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
7767 auto p = bl.begin();
7768 try {
7769 uint64_t val;
7770 ::decode(val, p);
7771 min_alloc_size = val;
7772 } catch (buffer::error& e) {
7773 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
7774 return -EIO;
7775 }
7776 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7777 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7778 }
7779 ondisk_format = 2;
7780 _prepare_ondisk_format_super(t);
7781 int r = db->submit_transaction_sync(t);
7782 assert(r == 0);
7783 }
7784
7785 // done
7786 dout(1) << __func__ << " done" << dendl;
7787 return 0;
7788}
7789
7790void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
7791{
224ce89b
WB
7792 if (o->onode.nid) {
7793 assert(o->exists);
7c673cae 7794 return;
224ce89b 7795 }
7c673cae
FG
7796 uint64_t nid = ++nid_last;
7797 dout(20) << __func__ << " " << nid << dendl;
7798 o->onode.nid = nid;
7799 txc->last_nid = nid;
224ce89b 7800 o->exists = true;
7c673cae
FG
7801}
7802
7803uint64_t BlueStore::_assign_blobid(TransContext *txc)
7804{
7805 uint64_t bid = ++blobid_last;
7806 dout(20) << __func__ << " " << bid << dendl;
7807 txc->last_blobid = bid;
7808 return bid;
7809}
7810
7811void BlueStore::get_db_statistics(Formatter *f)
7812{
7813 db->get_statistics(f);
7814}
7815
7816BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
7817{
7818 TransContext *txc = new TransContext(cct, osr);
7819 txc->t = db->get_transaction();
7820 osr->queue_new(txc);
7821 dout(20) << __func__ << " osr " << osr << " = " << txc
7822 << " seq " << txc->seq << dendl;
7823 return txc;
7824}
7825
7826void BlueStore::_txc_calc_cost(TransContext *txc)
7827{
7828 // this is about the simplest model for transaction cost you can
7829  // imagine. there is a fixed overhead, modeled as a minimum of one
7830  // "io". each "io" then carries a configurable cost (with different
7831  // hdd and ssd defaults), and that per-io cost times the io count is
7832  // added to the bytes value.
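  // For example (illustrative numbers only): with a per-io cost of 4000
  // and a txc whose pending aios carry 3 iovs in total plus 8192 bytes,
  // the cost works out to (1 + 3) * 4000 + 8192 = 24192.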
7833 int ios = 1; // one "io" for the kv commit
7834 for (auto& p : txc->ioc.pending_aios) {
7835 ios += p.iov.size();
7836 }
7837 auto cost = throttle_cost_per_io.load();
7838 txc->cost = ios * cost + txc->bytes;
7839 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
7840 << ios << " ios * " << cost << " + " << txc->bytes
7841 << " bytes)" << dendl;
7842}
7843
7844void BlueStore::_txc_update_store_statfs(TransContext *txc)
7845{
7846 if (txc->statfs_delta.is_empty())
7847 return;
7848
7849 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
7850 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
7851 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
7852 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
7853 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
7854
31f18b77
FG
7855 {
7856 std::lock_guard<std::mutex> l(vstatfs_lock);
7857 vstatfs += txc->statfs_delta;
7858 }
7859
7c673cae
FG
7860 bufferlist bl;
7861 txc->statfs_delta.encode(bl);
7862
7863 txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
7864 txc->statfs_delta.reset();
7865}
7866
7867void BlueStore::_txc_state_proc(TransContext *txc)
7868{
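  // Overview (derived from the cases below): the common progression is
  // PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
  // -> FINISHING -> DONE, with DEFERRED_QUEUED/DEFERRED_CLEANUP inserted
  // between KV_DONE and FINISHING when txc->deferred_txn is set, and
  // AIO_WAIT skipped when there are no pending aios.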
7869 while (true) {
7870 dout(10) << __func__ << " txc " << txc
7871 << " " << txc->get_state_name() << dendl;
7872 switch (txc->state) {
7873 case TransContext::STATE_PREPARE:
7874 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
7875 if (txc->ioc.has_pending_aios()) {
7876 txc->state = TransContext::STATE_AIO_WAIT;
7877 txc->had_ios = true;
7878 _txc_aio_submit(txc);
7879 return;
7880 }
7881 // ** fall-thru **
7882
7883 case TransContext::STATE_AIO_WAIT:
7884 txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
7885 _txc_finish_io(txc); // may trigger blocked txc's too
7886 return;
7887
7888 case TransContext::STATE_IO_DONE:
7889 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
7890 if (txc->had_ios) {
7891 ++txc->osr->txc_with_unstable_io;
7892 }
7893 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
7894 txc->state = TransContext::STATE_KV_QUEUED;
7895 if (cct->_conf->bluestore_sync_submit_transaction) {
7896 if (txc->last_nid >= nid_max ||
7897 txc->last_blobid >= blobid_max) {
7898 dout(20) << __func__
7899 << " last_{nid,blobid} exceeds max, submit via kv thread"
7900 << dendl;
7901 } else if (txc->osr->kv_committing_serially) {
7902 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
7903 << dendl;
7904 // note: this is starvation-prone. once we have a txc in a busy
7905 // sequencer that is committing serially it is possible to keep
7906 // submitting new transactions fast enough that we get stuck doing
7907 // so. the alternative is to block here... fixme?
7908 } else if (txc->osr->txc_with_unstable_io) {
7909 dout(20) << __func__ << " prior txc(s) with unstable ios "
7910 << txc->osr->txc_with_unstable_io.load() << dendl;
7911 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
7912 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
7913 == 0) {
7914 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
7915 << dendl;
7916 } else {
7917 txc->state = TransContext::STATE_KV_SUBMITTED;
31f18b77 7918 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
7c673cae
FG
7919 assert(r == 0);
7920 _txc_applied_kv(txc);
7921 }
7922 }
7923 {
7924 std::lock_guard<std::mutex> l(kv_lock);
7925 kv_queue.push_back(txc);
7926 kv_cond.notify_one();
7927 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
7928 kv_queue_unsubmitted.push_back(txc);
7929 ++txc->osr->kv_committing_serially;
7930 }
31f18b77
FG
7931 if (txc->had_ios)
7932 kv_ios++;
7933 kv_throttle_costs += txc->cost;
7c673cae
FG
7934 }
7935 return;
7936 case TransContext::STATE_KV_SUBMITTED:
7c673cae
FG
7937 _txc_committed_kv(txc);
7938 // ** fall-thru **
7939
7940 case TransContext::STATE_KV_DONE:
7941 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
7942 if (txc->deferred_txn) {
7943 txc->state = TransContext::STATE_DEFERRED_QUEUED;
7944 _deferred_queue(txc);
7945 return;
7946 }
7947 txc->state = TransContext::STATE_FINISHING;
7948 break;
7949
7950 case TransContext::STATE_DEFERRED_CLEANUP:
7951 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
7952 txc->state = TransContext::STATE_FINISHING;
7953 // ** fall-thru **
7954
7955 case TransContext::STATE_FINISHING:
7956 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
7957 _txc_finish(txc);
7958 return;
7959
7960 default:
7961 derr << __func__ << " unexpected txc " << txc
7962 << " state " << txc->get_state_name() << dendl;
7963 assert(0 == "unexpected txc state");
7964 return;
7965 }
7966 }
7967}
7968
7969void BlueStore::_txc_finish_io(TransContext *txc)
7970{
7971 dout(20) << __func__ << " " << txc << dendl;
7972
7973 /*
7974 * we need to preserve the order of kv transactions,
7975 * even though aio will complete in any order.
7976 */
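  // For example: if txcs A, B, C were queued in that order and C's aio
  // completes first, C only moves to IO_DONE here; _txc_state_proc is not
  // called for it until A and B have also reached IO_DONE, which preserves
  // the kv submission order.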
7977
7978 OpSequencer *osr = txc->osr.get();
7979 std::lock_guard<std::mutex> l(osr->qlock);
7980 txc->state = TransContext::STATE_IO_DONE;
7981
31f18b77
FG
7982 // release aio contexts (including pinned buffers).
7983 txc->ioc.running_aios.clear();
7984
7c673cae
FG
7985 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
7986 while (p != osr->q.begin()) {
7987 --p;
7988 if (p->state < TransContext::STATE_IO_DONE) {
7989 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
7990 << p->get_state_name() << dendl;
7991 return;
7992 }
7993 if (p->state > TransContext::STATE_IO_DONE) {
7994 ++p;
7995 break;
7996 }
7997 }
7998 do {
7999 _txc_state_proc(&*p++);
8000 } while (p != osr->q.end() &&
8001 p->state == TransContext::STATE_IO_DONE);
8002
8003 if (osr->kv_submitted_waiters &&
8004 osr->_is_all_kv_submitted()) {
8005 osr->qcond.notify_all();
8006 }
8007}
8008
8009void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
8010{
8011 dout(20) << __func__ << " txc " << txc
8012 << " onodes " << txc->onodes
8013 << " shared_blobs " << txc->shared_blobs
8014 << dendl;
8015
8016 // finalize onodes
8017 for (auto o : txc->onodes) {
8018 // finalize extent_map shards
8019 o->extent_map.update(t, false);
8020 if (o->extent_map.needs_reshard()) {
8021 o->extent_map.reshard(db, t);
8022 o->extent_map.update(t, true);
8023 if (o->extent_map.needs_reshard()) {
8024 dout(20) << __func__ << " warning: still wants reshard, check options?"
8025 << dendl;
8026 o->extent_map.clear_needs_reshard();
8027 }
8028 logger->inc(l_bluestore_onode_reshard);
8029 }
8030
8031 // bound encode
8032 size_t bound = 0;
8033 denc(o->onode, bound);
8034 o->extent_map.bound_encode_spanning_blobs(bound);
8035 if (o->onode.extent_map_shards.empty()) {
8036 denc(o->extent_map.inline_bl, bound);
8037 }
8038
8039 // encode
8040 bufferlist bl;
8041 unsigned onode_part, blob_part, extent_part;
8042 {
8043 auto p = bl.get_contiguous_appender(bound, true);
8044 denc(o->onode, p);
8045 onode_part = p.get_logical_offset();
8046 o->extent_map.encode_spanning_blobs(p);
8047 blob_part = p.get_logical_offset() - onode_part;
8048 if (o->onode.extent_map_shards.empty()) {
8049 denc(o->extent_map.inline_bl, p);
8050 }
8051 extent_part = p.get_logical_offset() - onode_part - blob_part;
8052 }
8053
8054 dout(20) << " onode " << o->oid << " is " << bl.length()
8055 << " (" << onode_part << " bytes onode + "
8056 << blob_part << " bytes spanning blobs + "
8057 << extent_part << " bytes inline extents)"
8058 << dendl;
8059 t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
8060 o->flushing_count++;
8061 }
8062
8063 // objects we modified but didn't affect the onode
8064 auto p = txc->modified_objects.begin();
8065 while (p != txc->modified_objects.end()) {
8066 if (txc->onodes.count(*p) == 0) {
8067 (*p)->flushing_count++;
8068 ++p;
8069 } else {
8070 // remove dups with onodes list to avoid problems in _txc_finish
8071 p = txc->modified_objects.erase(p);
8072 }
8073 }
8074
8075 // finalize shared_blobs
8076 for (auto sb : txc->shared_blobs) {
8077 string key;
8078 auto sbid = sb->get_sbid();
8079 get_shared_blob_key(sbid, &key);
8080 if (sb->persistent->empty()) {
8081 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
8082 << " is empty" << dendl;
8083 t->rmkey(PREFIX_SHARED_BLOB, key);
8084 } else {
8085 bufferlist bl;
8086 ::encode(*(sb->persistent), bl);
8087 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
31f18b77 8088 << " is " << bl.length() << " " << *sb << dendl;
7c673cae
FG
8089 t->set(PREFIX_SHARED_BLOB, key, bl);
8090 }
8091 }
8092}
8093
8094void BlueStore::BSPerfTracker::update_from_perfcounters(
8095 PerfCounters &logger)
8096{
8097 os_commit_latency.consume_next(
8098 logger.get_tavg_ms(
8099 l_bluestore_commit_lat));
8100 os_apply_latency.consume_next(
8101 logger.get_tavg_ms(
8102 l_bluestore_commit_lat));
8103}
8104
8105void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
8106{
8107 dout(20) << __func__ << " txc " << txc << std::hex
8108 << " allocated 0x" << txc->allocated
8109 << " released 0x" << txc->released
8110 << std::dec << dendl;
8111
8112 // We have to handle the case where we allocate *and* deallocate the
8113 // same region in this transaction. The freelist doesn't like that.
8114 // (Actually, the only thing that cares is the BitmapFreelistManager
8115 // debug check. But that's important.)
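  // For example (illustrative offsets): if this txc allocated 0x10000~0x8000
  // and released 0x14000~0x8000, the overlap 0x14000~0x4000 is subtracted
  // from both sets, leaving allocated 0x10000~0x4000 and released
  // 0x18000~0x4000 for the freelist update below.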
8116 interval_set<uint64_t> tmp_allocated, tmp_released;
8117 interval_set<uint64_t> *pallocated = &txc->allocated;
8118 interval_set<uint64_t> *preleased = &txc->released;
8119 if (!txc->allocated.empty() && !txc->released.empty()) {
8120 interval_set<uint64_t> overlap;
8121 overlap.intersection_of(txc->allocated, txc->released);
8122 if (!overlap.empty()) {
8123 tmp_allocated = txc->allocated;
8124 tmp_allocated.subtract(overlap);
8125 tmp_released = txc->released;
8126 tmp_released.subtract(overlap);
8127 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
8128 << ", new allocated 0x" << tmp_allocated
8129 << " released 0x" << tmp_released << std::dec
8130 << dendl;
8131 pallocated = &tmp_allocated;
8132 preleased = &tmp_released;
8133 }
8134 }
8135
8136 // update freelist with non-overlap sets
8137 for (interval_set<uint64_t>::iterator p = pallocated->begin();
8138 p != pallocated->end();
8139 ++p) {
8140 fm->allocate(p.get_start(), p.get_len(), t);
8141 }
8142 for (interval_set<uint64_t>::iterator p = preleased->begin();
8143 p != preleased->end();
8144 ++p) {
8145 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
8146 << "~" << p.get_len() << std::dec << dendl;
8147 fm->release(p.get_start(), p.get_len(), t);
8148 }
8149
8150 _txc_update_store_statfs(txc);
8151}
8152
8153void BlueStore::_txc_applied_kv(TransContext *txc)
8154{
8155 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
8156 for (auto& o : *ls) {
8157 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
8158 << dendl;
8159 if (--o->flushing_count == 0) {
8160 std::lock_guard<std::mutex> l(o->flush_lock);
8161 o->flush_cond.notify_all();
8162 }
8163 }
8164 }
8165}
8166
8167void BlueStore::_txc_committed_kv(TransContext *txc)
8168{
8169 dout(20) << __func__ << " txc " << txc << dendl;
8170
8171 // warning: we're calling onreadable_sync inside the sequencer lock
8172 if (txc->onreadable_sync) {
8173 txc->onreadable_sync->complete(0);
8174 txc->onreadable_sync = NULL;
8175 }
8176 unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
8177 if (txc->oncommit) {
8178 logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
8179 finishers[n]->queue(txc->oncommit);
8180 txc->oncommit = NULL;
8181 }
8182 if (txc->onreadable) {
8183 finishers[n]->queue(txc->onreadable);
8184 txc->onreadable = NULL;
8185 }
8186
1adf2230
AA
8187 {
8188 std::lock_guard<std::mutex> l(txc->osr->qlock);
8189 txc->state = TransContext::STATE_KV_DONE;
8190 if (!txc->oncommits.empty()) {
8191 finishers[n]->queue(txc->oncommits);
8192 }
7c673cae 8193 }
1adf2230 8194 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
7c673cae
FG
8195}
8196
8197void BlueStore::_txc_finish(TransContext *txc)
8198{
8199 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
8200 assert(txc->state == TransContext::STATE_FINISHING);
8201
8202 for (auto& sb : txc->shared_blobs_written) {
8203 sb->bc.finish_write(sb->get_cache(), txc->seq);
8204 }
8205 txc->shared_blobs_written.clear();
8206
8207 while (!txc->removed_collections.empty()) {
8208 _queue_reap_collection(txc->removed_collections.front());
8209 txc->removed_collections.pop_front();
8210 }
8211
8212 OpSequencerRef osr = txc->osr;
7c673cae 8213 bool empty = false;
31f18b77 8214 bool submit_deferred = false;
7c673cae
FG
8215 OpSequencer::q_list_t releasing_txc;
8216 {
8217 std::lock_guard<std::mutex> l(osr->qlock);
8218 txc->state = TransContext::STATE_DONE;
8219 bool notify = false;
8220 while (!osr->q.empty()) {
8221 TransContext *txc = &osr->q.front();
8222 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
8223 << dendl;
8224 if (txc->state != TransContext::STATE_DONE) {
8225 if (txc->state == TransContext::STATE_PREPARE &&
8226 deferred_aggressive) {
8227 // for _osr_drain_preceding()
8228 notify = true;
8229 }
31f18b77
FG
8230 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
8231 osr->q.size() > g_conf->bluestore_max_deferred_txc) {
8232 submit_deferred = true;
8233 }
7c673cae
FG
8234 break;
8235 }
8236
7c673cae
FG
8237 osr->q.pop_front();
8238 releasing_txc.push_back(*txc);
8239 notify = true;
8240 }
8241 if (notify) {
8242 osr->qcond.notify_all();
8243 }
8244 if (osr->q.empty()) {
8245 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
8246 empty = true;
8247 }
8248 }
8249 while (!releasing_txc.empty()) {
8250 // release to allocator only after all preceding txc's have also
8251 // finished any deferred writes that potentially land in these
8252 // blocks
8253 auto txc = &releasing_txc.front();
8254 _txc_release_alloc(txc);
8255 releasing_txc.pop_front();
8256 txc->log_state_latency(logger, l_bluestore_state_done_lat);
8257 delete txc;
8258 }
8259
31f18b77
FG
8260 if (submit_deferred) {
8261 // we're pinning memory; flush! we could be more fine-grained here but
8262 // i'm not sure it's worth the bother.
8263 deferred_try_submit();
7c673cae
FG
8264 }
8265
7c673cae
FG
8266 if (empty && osr->zombie) {
8267 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
8268 osr->_unregister();
8269 }
8270}
8271
8272void BlueStore::_txc_release_alloc(TransContext *txc)
8273{
8274 // update allocator with full released set
8275 if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
94b18763
FG
8276 dout(10) << __func__ << " " << txc << " " << std::hex
8277 << txc->released << std::dec << dendl;
7c673cae
FG
8278 for (interval_set<uint64_t>::iterator p = txc->released.begin();
8279 p != txc->released.end();
8280 ++p) {
8281 alloc->release(p.get_start(), p.get_len());
8282 }
8283 }
8284
8285 txc->allocated.clear();
8286 txc->released.clear();
8287}
8288
8289void BlueStore::_osr_drain_preceding(TransContext *txc)
8290{
8291 OpSequencer *osr = txc->osr.get();
8292 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
8293 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
8294 {
8295 // submit anything pending
224ce89b 8296 deferred_lock.lock();
7c673cae 8297 if (osr->deferred_pending) {
224ce89b
WB
8298 _deferred_submit_unlock(osr);
8299 } else {
8300 deferred_lock.unlock();
7c673cae
FG
8301 }
8302 }
8303 {
8304 // wake up any previously finished deferred events
8305 std::lock_guard<std::mutex> l(kv_lock);
8306 kv_cond.notify_one();
8307 }
8308 osr->drain_preceding(txc);
8309 --deferred_aggressive;
8310 dout(10) << __func__ << " " << osr << " done" << dendl;
8311}
8312
8313void BlueStore::_osr_drain_all()
8314{
8315 dout(10) << __func__ << dendl;
8316
8317 set<OpSequencerRef> s;
8318 {
8319 std::lock_guard<std::mutex> l(osr_lock);
8320 s = osr_set;
8321 }
8322 dout(20) << __func__ << " osr_set " << s << dendl;
8323
8324 ++deferred_aggressive;
8325 {
8326 // submit anything pending
224ce89b 8327 deferred_try_submit();
7c673cae
FG
8328 }
8329 {
8330 // wake up any previously finished deferred events
8331 std::lock_guard<std::mutex> l(kv_lock);
8332 kv_cond.notify_one();
8333 }
31f18b77
FG
8334 {
8335 std::lock_guard<std::mutex> l(kv_finalize_lock);
8336 kv_finalize_cond.notify_one();
8337 }
7c673cae
FG
8338 for (auto osr : s) {
8339 dout(20) << __func__ << " drain " << osr << dendl;
8340 osr->drain();
8341 }
8342 --deferred_aggressive;
8343
8344 dout(10) << __func__ << " done" << dendl;
8345}
8346
8347void BlueStore::_osr_unregister_all()
8348{
8349 set<OpSequencerRef> s;
8350 {
8351 std::lock_guard<std::mutex> l(osr_lock);
8352 s = osr_set;
8353 }
8354 dout(10) << __func__ << " " << s << dendl;
8355 for (auto osr : s) {
8356 osr->_unregister();
8357
8358 if (!osr->zombie) {
8359 // break link from Sequencer to us so that this OpSequencer
8360 // instance can die with this mount/umount cycle. note that
8361 // we assume umount() will not race against ~Sequencer.
8362 assert(osr->parent);
8363 osr->parent->p.reset();
8364 }
8365 }
8366 // nobody should be creating sequencers during umount either.
8367 {
8368 std::lock_guard<std::mutex> l(osr_lock);
8369 assert(osr_set.empty());
8370 }
8371}
8372
31f18b77
FG
8373void BlueStore::_kv_start()
8374{
8375 dout(10) << __func__ << dendl;
8376
31f18b77
FG
8377 for (int i = 0; i < m_finisher_num; ++i) {
8378 ostringstream oss;
8379 oss << "finisher-" << i;
8380 Finisher *f = new Finisher(cct, oss.str(), "finisher");
8381 finishers.push_back(f);
8382 }
8383
181888fb 8384 deferred_finisher.start();
31f18b77
FG
8385 for (auto f : finishers) {
8386 f->start();
8387 }
8388 kv_sync_thread.create("bstore_kv_sync");
8389 kv_finalize_thread.create("bstore_kv_final");
8390}
8391
8392void BlueStore::_kv_stop()
8393{
8394 dout(10) << __func__ << dendl;
8395 {
8396 std::unique_lock<std::mutex> l(kv_lock);
8397 while (!kv_sync_started) {
8398 kv_cond.wait(l);
8399 }
8400 kv_stop = true;
8401 kv_cond.notify_all();
8402 }
8403 {
8404 std::unique_lock<std::mutex> l(kv_finalize_lock);
8405 while (!kv_finalize_started) {
8406 kv_finalize_cond.wait(l);
8407 }
8408 kv_finalize_stop = true;
8409 kv_finalize_cond.notify_all();
8410 }
8411 kv_sync_thread.join();
8412 kv_finalize_thread.join();
94b18763 8413 assert(removed_collections.empty());
31f18b77
FG
8414 {
8415 std::lock_guard<std::mutex> l(kv_lock);
8416 kv_stop = false;
8417 }
8418 {
8419 std::lock_guard<std::mutex> l(kv_finalize_lock);
8420 kv_finalize_stop = false;
8421 }
8422 dout(10) << __func__ << " stopping finishers" << dendl;
181888fb
FG
8423 deferred_finisher.wait_for_empty();
8424 deferred_finisher.stop();
31f18b77
FG
8425 for (auto f : finishers) {
8426 f->wait_for_empty();
8427 f->stop();
8428 }
8429 dout(10) << __func__ << " stopped" << dendl;
8430}
8431
7c673cae
FG
8432void BlueStore::_kv_sync_thread()
8433{
8434 dout(10) << __func__ << " start" << dendl;
8435 std::unique_lock<std::mutex> l(kv_lock);
31f18b77
FG
8436 assert(!kv_sync_started);
8437 kv_sync_started = true;
8438 kv_cond.notify_all();
7c673cae
FG
8439 while (true) {
8440 assert(kv_committing.empty());
8441 if (kv_queue.empty() &&
8442 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
8443 !deferred_aggressive)) {
8444 if (kv_stop)
8445 break;
8446 dout(20) << __func__ << " sleep" << dendl;
8447 kv_cond.wait(l);
8448 dout(20) << __func__ << " wake" << dendl;
8449 } else {
8450 deque<TransContext*> kv_submitting;
8451 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
8452 uint64_t aios = 0, costs = 0;
8453
7c673cae
FG
8454 dout(20) << __func__ << " committing " << kv_queue.size()
8455 << " submitting " << kv_queue_unsubmitted.size()
8456 << " deferred done " << deferred_done_queue.size()
8457 << " stable " << deferred_stable_queue.size()
8458 << dendl;
8459 kv_committing.swap(kv_queue);
8460 kv_submitting.swap(kv_queue_unsubmitted);
8461 deferred_done.swap(deferred_done_queue);
8462 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
8463 aios = kv_ios;
8464 costs = kv_throttle_costs;
8465 kv_ios = 0;
8466 kv_throttle_costs = 0;
7c673cae
FG
8467 utime_t start = ceph_clock_now();
8468 l.unlock();
8469
8470 dout(30) << __func__ << " committing " << kv_committing << dendl;
8471 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
8472 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
8473 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
8474
7c673cae
FG
8475 bool force_flush = false;
8476 // if bluefs is sharing the same device as data (only), then we
8477 // can rely on the bluefs commit to flush the device and make
8478      // deferred aios stable. that means that if we have completed ("done")
8479      // deferred txcs AND we are not on a single device, we need to force a flush.
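      // Summarizing the checks below: force_flush ends up true whenever
      // bluefs is absent or not the sole sharer of the data device, and on
      // a single shared device whenever this batch had aios, there is
      // nothing else to commit, or deferred_aggressive is set.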
8480 if (bluefs_single_shared_device && bluefs) {
31f18b77 8481 if (aios) {
7c673cae
FG
8482 force_flush = true;
8483 } else if (kv_committing.empty() && kv_submitting.empty() &&
8484 deferred_stable.empty()) {
8485 force_flush = true; // there's nothing else to commit!
8486 } else if (deferred_aggressive) {
8487 force_flush = true;
8488 }
8489 } else
8490 force_flush = true;
8491
8492 if (force_flush) {
31f18b77 8493 dout(20) << __func__ << " num_aios=" << aios
7c673cae
FG
8494 << " force_flush=" << (int)force_flush
8495 << ", flushing, deferred done->stable" << dendl;
8496 // flush/barrier on block device
8497 bdev->flush();
8498
8499 // if we flush then deferred done are now deferred stable
8500 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
8501 deferred_done.end());
8502 deferred_done.clear();
8503 }
8504 utime_t after_flush = ceph_clock_now();
8505
8506 // we will use one final transaction to force a sync
8507 KeyValueDB::Transaction synct = db->get_transaction();
8508
8509 // increase {nid,blobid}_max? note that this covers both the
8510 // case where we are approaching the max and the case we passed
8511 // it. in either case, we increase the max in the earlier txn
8512 // we submit.
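      // For example (illustrative numbers): with bluestore_nid_prealloc = 1024,
      // nid_last = 1000 and nid_max = 1024, 1000 + 512 > 1024, so new_nid_max
      // becomes 1000 + 1024 = 2024, raising the ceiling well before it is reached.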
8513 uint64_t new_nid_max = 0, new_blobid_max = 0;
8514 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
8515 KeyValueDB::Transaction t =
8516 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8517 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
8518 bufferlist bl;
8519 ::encode(new_nid_max, bl);
8520 t->set(PREFIX_SUPER, "nid_max", bl);
8521 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
8522 }
8523 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
8524 KeyValueDB::Transaction t =
8525 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8526 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
8527 bufferlist bl;
8528 ::encode(new_blobid_max, bl);
8529 t->set(PREFIX_SUPER, "blobid_max", bl);
8530 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
8531 }
c07f9fc5
FG
8532
8533 for (auto txc : kv_committing) {
8534 if (txc->state == TransContext::STATE_KV_QUEUED) {
8535 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8536 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8537 assert(r == 0);
8538 _txc_applied_kv(txc);
8539 --txc->osr->kv_committing_serially;
8540 txc->state = TransContext::STATE_KV_SUBMITTED;
8541 if (txc->osr->kv_submitted_waiters) {
8542 std::lock_guard<std::mutex> l(txc->osr->qlock);
8543 if (txc->osr->_is_all_kv_submitted()) {
8544 txc->osr->qcond.notify_all();
8545 }
7c673cae 8546 }
c07f9fc5
FG
8547
8548 } else {
8549 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8550 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
7c673cae 8551 }
7c673cae
FG
8552 if (txc->had_ios) {
8553 --txc->osr->txc_with_unstable_io;
8554 }
7c673cae
FG
8555 }
8556
31f18b77
FG
8557 // release throttle *before* we commit. this allows new ops
8558 // to be prepared and enter pipeline while we are waiting on
8559 // the kv commit sync/flush. then hopefully on the next
8560 // iteration there will already be ops awake. otherwise, we
8561 // end up going to sleep, and then wake up when the very first
8562 // transaction is ready for commit.
8563 throttle_bytes.put(costs);
8564
7c673cae
FG
8565 PExtentVector bluefs_gift_extents;
8566 if (bluefs &&
8567 after_flush - bluefs_last_balance >
8568 cct->_conf->bluestore_bluefs_balance_interval) {
8569 bluefs_last_balance = after_flush;
8570 int r = _balance_bluefs_freespace(&bluefs_gift_extents);
8571 assert(r >= 0);
8572 if (r > 0) {
8573 for (auto& p : bluefs_gift_extents) {
8574 bluefs_extents.insert(p.offset, p.length);
8575 }
8576 bufferlist bl;
8577 ::encode(bluefs_extents, bl);
8578 dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
8579 << bluefs_extents << std::dec << dendl;
8580 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
8581 }
8582 }
8583
8584 // cleanup sync deferred keys
8585 for (auto b : deferred_stable) {
8586 for (auto& txc : b->txcs) {
8587 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
8588 if (!wt.released.empty()) {
8589 // kraken replay compat only
8590 txc.released = wt.released;
8591 dout(10) << __func__ << " deferred txn has released "
8592 << txc.released
8593 << " (we just upgraded from kraken) on " << &txc << dendl;
8594 _txc_finalize_kv(&txc, synct);
8595 }
8596 // cleanup the deferred
8597 string key;
8598 get_deferred_key(wt.seq, &key);
8599 synct->rm_single_key(PREFIX_DEFERRED, key);
8600 }
8601 }
8602
8603 // submit synct synchronously (block and wait for it to commit)
31f18b77 8604 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
7c673cae
FG
8605 assert(r == 0);
8606
8607 if (new_nid_max) {
8608 nid_max = new_nid_max;
8609 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
8610 }
8611 if (new_blobid_max) {
8612 blobid_max = new_blobid_max;
8613 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
8614 }
8615
224ce89b
WB
8616 {
8617 utime_t finish = ceph_clock_now();
8618 utime_t dur_flush = after_flush - start;
8619 utime_t dur_kv = finish - after_flush;
8620 utime_t dur = finish - start;
8621 dout(20) << __func__ << " committed " << kv_committing.size()
8622 << " cleaned " << deferred_stable.size()
8623 << " in " << dur
8624 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
8625 << dendl;
7c673cae
FG
8626 logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
8627 logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
8628 logger->tinc(l_bluestore_kv_lat, dur);
8629 }
31f18b77
FG
8630
8631 if (bluefs) {
8632 if (!bluefs_gift_extents.empty()) {
8633 _commit_bluefs_freespace(bluefs_gift_extents);
8634 }
8635 for (auto p = bluefs_extents_reclaiming.begin();
8636 p != bluefs_extents_reclaiming.end();
8637 ++p) {
8638 dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
8639 << p.get_start() << "~" << p.get_len() << std::dec
8640 << dendl;
8641 alloc->release(p.get_start(), p.get_len());
8642 }
8643 bluefs_extents_reclaiming.clear();
8644 }
8645
8646 {
8647 std::unique_lock<std::mutex> m(kv_finalize_lock);
8648 if (kv_committing_to_finalize.empty()) {
8649 kv_committing_to_finalize.swap(kv_committing);
8650 } else {
8651 kv_committing_to_finalize.insert(
8652 kv_committing_to_finalize.end(),
8653 kv_committing.begin(),
8654 kv_committing.end());
8655 kv_committing.clear();
8656 }
8657 if (deferred_stable_to_finalize.empty()) {
8658 deferred_stable_to_finalize.swap(deferred_stable);
8659 } else {
8660 deferred_stable_to_finalize.insert(
8661 deferred_stable_to_finalize.end(),
8662 deferred_stable.begin(),
8663 deferred_stable.end());
8664 deferred_stable.clear();
8665 }
8666 kv_finalize_cond.notify_one();
8667 }
8668
8669 l.lock();
8670 // previously deferred "done" are now "stable" by virtue of this
8671 // commit cycle.
8672 deferred_stable_queue.swap(deferred_done);
8673 }
8674 }
8675 dout(10) << __func__ << " finish" << dendl;
8676 kv_sync_started = false;
8677}
8678
8679void BlueStore::_kv_finalize_thread()
8680{
8681 deque<TransContext*> kv_committed;
8682 deque<DeferredBatch*> deferred_stable;
8683 dout(10) << __func__ << " start" << dendl;
8684 std::unique_lock<std::mutex> l(kv_finalize_lock);
8685 assert(!kv_finalize_started);
8686 kv_finalize_started = true;
8687 kv_finalize_cond.notify_all();
8688 while (true) {
8689 assert(kv_committed.empty());
8690 assert(deferred_stable.empty());
8691 if (kv_committing_to_finalize.empty() &&
8692 deferred_stable_to_finalize.empty()) {
8693 if (kv_finalize_stop)
8694 break;
8695 dout(20) << __func__ << " sleep" << dendl;
8696 kv_finalize_cond.wait(l);
8697 dout(20) << __func__ << " wake" << dendl;
8698 } else {
8699 kv_committed.swap(kv_committing_to_finalize);
8700 deferred_stable.swap(deferred_stable_to_finalize);
8701 l.unlock();
8702 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
8703 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
8704
8705 while (!kv_committed.empty()) {
8706 TransContext *txc = kv_committed.front();
7c673cae
FG
8707 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8708 _txc_state_proc(txc);
31f18b77 8709 kv_committed.pop_front();
7c673cae 8710 }
31f18b77 8711
7c673cae
FG
8712 for (auto b : deferred_stable) {
8713 auto p = b->txcs.begin();
8714 while (p != b->txcs.end()) {
8715 TransContext *txc = &*p;
8716 p = b->txcs.erase(p); // unlink here because
8717 _txc_state_proc(txc); // this may destroy txc
8718 }
8719 delete b;
8720 }
31f18b77 8721 deferred_stable.clear();
7c673cae
FG
8722
8723 if (!deferred_aggressive) {
31f18b77 8724 if (deferred_queue_size >= deferred_batch_ops.load() ||
7c673cae 8725 throttle_deferred_bytes.past_midpoint()) {
224ce89b 8726 deferred_try_submit();
7c673cae
FG
8727 }
8728 }
8729
8730 // this is as good a place as any ...
8731 _reap_collections();
8732
7c673cae 8733 l.lock();
7c673cae
FG
8734 }
8735 }
8736 dout(10) << __func__ << " finish" << dendl;
31f18b77 8737 kv_finalize_started = false;
7c673cae
FG
8738}
8739
8740bluestore_deferred_op_t *BlueStore::_get_deferred_op(
8741 TransContext *txc, OnodeRef o)
8742{
8743 if (!txc->deferred_txn) {
8744 txc->deferred_txn = new bluestore_deferred_transaction_t;
8745 }
8746 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
8747 return &txc->deferred_txn->ops.back();
8748}
8749
8750void BlueStore::_deferred_queue(TransContext *txc)
8751{
8752 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
224ce89b 8753 deferred_lock.lock();
7c673cae
FG
8754 if (!txc->osr->deferred_pending &&
8755 !txc->osr->deferred_running) {
8756 deferred_queue.push_back(*txc->osr);
8757 }
8758 if (!txc->osr->deferred_pending) {
8759 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
8760 }
8761 ++deferred_queue_size;
8762 txc->osr->deferred_pending->txcs.push_back(*txc);
8763 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
8764 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
8765 const auto& op = *opi;
8766 assert(op.op == bluestore_deferred_op_t::OP_WRITE);
8767 bufferlist::const_iterator p = op.data.begin();
8768 for (auto e : op.extents) {
8769 txc->osr->deferred_pending->prepare_write(
8770 cct, wt.seq, e.offset, e.length, p);
8771 }
8772 }
8773 if (deferred_aggressive &&
8774 !txc->osr->deferred_running) {
224ce89b
WB
8775 _deferred_submit_unlock(txc->osr.get());
8776 } else {
8777 deferred_lock.unlock();
7c673cae
FG
8778 }
8779}
8780
224ce89b 8781void BlueStore::deferred_try_submit()
7c673cae
FG
8782{
8783 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
8784 << deferred_queue_size << " txcs" << dendl;
224ce89b
WB
8785 std::lock_guard<std::mutex> l(deferred_lock);
8786 vector<OpSequencerRef> osrs;
8787 osrs.reserve(deferred_queue.size());
7c673cae 8788 for (auto& osr : deferred_queue) {
224ce89b
WB
8789 osrs.push_back(&osr);
8790 }
8791 for (auto& osr : osrs) {
181888fb
FG
8792 if (osr->deferred_pending) {
8793 if (!osr->deferred_running) {
8794 _deferred_submit_unlock(osr.get());
8795 deferred_lock.lock();
8796 } else {
8797        dout(20) << __func__ << " osr " << osr << " already has a running batch"
8798 << dendl;
8799 }
8800 } else {
8801 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
7c673cae
FG
8802 }
8803 }
8804}
8805
224ce89b 8806void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
7c673cae
FG
8807{
8808 dout(10) << __func__ << " osr " << osr
8809 << " " << osr->deferred_pending->iomap.size() << " ios pending "
8810 << dendl;
8811 assert(osr->deferred_pending);
8812 assert(!osr->deferred_running);
8813
8814 auto b = osr->deferred_pending;
8815 deferred_queue_size -= b->seq_bytes.size();
8816 assert(deferred_queue_size >= 0);
8817
8818 osr->deferred_running = osr->deferred_pending;
8819 osr->deferred_pending = nullptr;
8820
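  // Walk the offset-sorted iomap, coalescing runs of contiguous extents into
  // one buffer and issuing a single aio_write per contiguous run.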
8821 uint64_t start = 0, pos = 0;
8822 bufferlist bl;
8823 auto i = b->iomap.begin();
8824 while (true) {
8825 if (i == b->iomap.end() || i->first != pos) {
8826 if (bl.length()) {
8827 dout(20) << __func__ << " write 0x" << std::hex
8828 << start << "~" << bl.length()
8829 << " crc " << bl.crc32c(-1) << std::dec << dendl;
8830 if (!g_conf->bluestore_debug_omit_block_device_write) {
8831 logger->inc(l_bluestore_deferred_write_ops);
8832 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
8833 int r = bdev->aio_write(start, bl, &b->ioc, false);
8834 assert(r == 0);
8835 }
8836 }
8837 if (i == b->iomap.end()) {
8838 break;
8839 }
8840 start = 0;
8841 pos = i->first;
8842 bl.clear();
8843 }
8844 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
8845 << std::hex << pos << "~" << i->second.bl.length() << std::dec
8846 << dendl;
8847 if (!bl.length()) {
8848 start = pos;
8849 }
8850 pos += i->second.bl.length();
8851 bl.claim_append(i->second.bl);
8852 ++i;
8853 }
224ce89b 8854
224ce89b 8855 deferred_lock.unlock();
7c673cae
FG
8856 bdev->aio_submit(&b->ioc);
8857}
8858
3efd9988
FG
8859struct C_DeferredTrySubmit : public Context {
8860 BlueStore *store;
8861 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
8862 void finish(int r) {
8863 store->deferred_try_submit();
8864 }
8865};
8866
7c673cae
FG
8867void BlueStore::_deferred_aio_finish(OpSequencer *osr)
8868{
8869 dout(10) << __func__ << " osr " << osr << dendl;
8870 assert(osr->deferred_running);
8871 DeferredBatch *b = osr->deferred_running;
8872
8873 {
8874 std::lock_guard<std::mutex> l(deferred_lock);
8875 assert(osr->deferred_running == b);
8876 osr->deferred_running = nullptr;
8877 if (!osr->deferred_pending) {
181888fb 8878 dout(20) << __func__ << " dequeueing" << dendl;
7c673cae
FG
8879 auto q = deferred_queue.iterator_to(*osr);
8880 deferred_queue.erase(q);
8881 } else if (deferred_aggressive) {
224ce89b 8882 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
3efd9988 8883 deferred_finisher.queue(new C_DeferredTrySubmit(this));
181888fb
FG
8884 } else {
8885 dout(20) << __func__ << " leaving queued, more pending" << dendl;
7c673cae
FG
8886 }
8887 }
8888
8889 {
31f18b77 8890 uint64_t costs = 0;
7c673cae
FG
8891 std::lock_guard<std::mutex> l2(osr->qlock);
8892 for (auto& i : b->txcs) {
8893 TransContext *txc = &i;
8894 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
31f18b77 8895 costs += txc->cost;
7c673cae 8896 }
31f18b77
FG
8897 osr->qcond.notify_all();
8898 throttle_deferred_bytes.put(costs);
7c673cae
FG
8899 std::lock_guard<std::mutex> l(kv_lock);
8900 deferred_done_queue.emplace_back(b);
8901 }
8902
8903 // in the normal case, do not bother waking up the kv thread; it will
8904 // catch us on the next commit anyway.
8905 if (deferred_aggressive) {
8906 std::lock_guard<std::mutex> l(kv_lock);
8907 kv_cond.notify_one();
8908 }
8909}
8910
8911int BlueStore::_deferred_replay()
8912{
8913 dout(10) << __func__ << " start" << dendl;
8914 OpSequencerRef osr = new OpSequencer(cct, this);
8915 int count = 0;
8916 int r = 0;
8917 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
8918 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
8919 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
8920 << dendl;
8921 bluestore_deferred_transaction_t *deferred_txn =
8922 new bluestore_deferred_transaction_t;
8923 bufferlist bl = it->value();
8924 bufferlist::iterator p = bl.begin();
8925 try {
8926 ::decode(*deferred_txn, p);
8927 } catch (buffer::error& e) {
8928 derr << __func__ << " failed to decode deferred txn "
8929 << pretty_binary_string(it->key()) << dendl;
8930 delete deferred_txn;
8931 r = -EIO;
8932 goto out;
8933 }
8934 TransContext *txc = _txc_create(osr.get());
8935 txc->deferred_txn = deferred_txn;
8936 txc->state = TransContext::STATE_KV_DONE;
8937 _txc_state_proc(txc);
8938 }
8939 out:
8940 dout(20) << __func__ << " draining osr" << dendl;
8941 _osr_drain_all();
8942 osr->discard();
8943 dout(10) << __func__ << " completed " << count << " events" << dendl;
8944 return r;
8945}
8946
8947// ---------------------------
8948// transactions
8949
8950int BlueStore::queue_transactions(
8951 Sequencer *posr,
8952 vector<Transaction>& tls,
8953 TrackedOpRef op,
8954 ThreadPool::TPHandle *handle)
8955{
8956 FUNCTRACE();
8957 Context *onreadable;
8958 Context *ondisk;
8959 Context *onreadable_sync;
8960 ObjectStore::Transaction::collect_contexts(
8961 tls, &onreadable, &ondisk, &onreadable_sync);
8962
8963 if (cct->_conf->objectstore_blackhole) {
8964 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
8965 << dendl;
8966 delete ondisk;
8967 delete onreadable;
8968 delete onreadable_sync;
8969 return 0;
8970 }
8971 utime_t start = ceph_clock_now();
8972 // set up the sequencer
8973 OpSequencer *osr;
8974 assert(posr);
8975 if (posr->p) {
8976 osr = static_cast<OpSequencer *>(posr->p.get());
8977 dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
8978 } else {
8979 osr = new OpSequencer(cct, this);
8980 osr->parent = posr;
8981 posr->p = osr;
8982 dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
8983 }
8984
8985 // prepare
8986 TransContext *txc = _txc_create(osr);
8987 txc->onreadable = onreadable;
8988 txc->onreadable_sync = onreadable_sync;
8989 txc->oncommit = ondisk;
8990
8991 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
8992 (*p).set_osr(osr);
8993 txc->bytes += (*p).get_num_bytes();
8994 _txc_add_transaction(txc, &(*p));
8995 }
8996 _txc_calc_cost(txc);
8997
8998 _txc_write_nodes(txc, txc->t);
8999
9000 // journal deferred items
9001 if (txc->deferred_txn) {
9002 txc->deferred_txn->seq = ++deferred_seq;
9003 bufferlist bl;
9004 ::encode(*txc->deferred_txn, bl);
9005 string key;
9006 get_deferred_key(txc->deferred_txn->seq, &key);
9007 txc->t->set(PREFIX_DEFERRED, key, bl);
9008 }
9009
9010 _txc_finalize_kv(txc, txc->t);
9011 if (handle)
9012 handle->suspend_tp_timeout();
9013
9014 utime_t tstart = ceph_clock_now();
9015 throttle_bytes.get(txc->cost);
9016 if (txc->deferred_txn) {
9017 // ensure we do not block here because of deferred writes
9018 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
d2e6a577
FG
9019 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
9020 << dendl;
9021 ++deferred_aggressive;
7c673cae 9022 deferred_try_submit();
3efd9988
FG
9023 {
9024 // wake up any previously finished deferred events
9025 std::lock_guard<std::mutex> l(kv_lock);
9026 kv_cond.notify_one();
9027 }
7c673cae 9028 throttle_deferred_bytes.get(txc->cost);
d2e6a577
FG
9029 --deferred_aggressive;
9030 }
7c673cae
FG
9031 }
9032 utime_t tend = ceph_clock_now();
9033
9034 if (handle)
9035 handle->reset_tp_timeout();
9036
9037 logger->inc(l_bluestore_txc);
9038
9039 // execute (start)
9040 _txc_state_proc(txc);
9041
9042 logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
9043 logger->tinc(l_bluestore_throttle_lat, tend - tstart);
9044 return 0;
9045}
9046
9047void BlueStore::_txc_aio_submit(TransContext *txc)
9048{
9049 dout(10) << __func__ << " txc " << txc << dendl;
9050 bdev->aio_submit(&txc->ioc);
9051}
9052
9053void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
9054{
9055 Transaction::iterator i = t->begin();
9056
9057 _dump_transaction(t);
9058
9059 vector<CollectionRef> cvec(i.colls.size());
9060 unsigned j = 0;
9061 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
9062 ++p, ++j) {
9063 cvec[j] = _get_collection(*p);
7c673cae
FG
9064 }
9065 vector<OnodeRef> ovec(i.objects.size());
9066
9067 for (int pos = 0; i.have_op(); ++pos) {
9068 Transaction::Op *op = i.decode_op();
9069 int r = 0;
9070
9071 // no coll or obj
9072 if (op->op == Transaction::OP_NOP)
9073 continue;
9074
9075 // collection operations
9076 CollectionRef &c = cvec[op->cid];
9077 switch (op->op) {
9078 case Transaction::OP_RMCOLL:
9079 {
9080 const coll_t &cid = i.get_cid(op->cid);
9081 r = _remove_collection(txc, cid, &c);
9082 if (!r)
9083 continue;
9084 }
9085 break;
9086
9087 case Transaction::OP_MKCOLL:
9088 {
9089 assert(!c);
9090 const coll_t &cid = i.get_cid(op->cid);
9091 r = _create_collection(txc, cid, op->split_bits, &c);
9092 if (!r)
9093 continue;
9094 }
9095 break;
9096
9097 case Transaction::OP_SPLIT_COLLECTION:
9098 assert(0 == "deprecated");
9099 break;
9100
9101 case Transaction::OP_SPLIT_COLLECTION2:
9102 {
9103 uint32_t bits = op->split_bits;
9104 uint32_t rem = op->split_rem;
9105 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
9106 if (!r)
9107 continue;
9108 }
9109 break;
9110
9111 case Transaction::OP_COLL_HINT:
9112 {
9113 uint32_t type = op->hint_type;
9114 bufferlist hint;
9115 i.decode_bl(hint);
9116 bufferlist::iterator hiter = hint.begin();
9117 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
9118 uint32_t pg_num;
9119 uint64_t num_objs;
9120 ::decode(pg_num, hiter);
9121 ::decode(num_objs, hiter);
9122 dout(10) << __func__ << " collection hint objects is a no-op, "
9123 << " pg_num " << pg_num << " num_objects " << num_objs
9124 << dendl;
9125 } else {
9126 // Ignore the hint
9127 dout(10) << __func__ << " unknown collection hint " << type << dendl;
9128 }
9129 continue;
9130 }
9131 break;
9132
9133 case Transaction::OP_COLL_SETATTR:
9134 r = -EOPNOTSUPP;
9135 break;
9136
9137 case Transaction::OP_COLL_RMATTR:
9138 r = -EOPNOTSUPP;
9139 break;
9140
9141 case Transaction::OP_COLL_RENAME:
9142 assert(0 == "not implemented");
9143 break;
9144 }
9145 if (r < 0) {
9146 derr << __func__ << " error " << cpp_strerror(r)
9147 << " not handled on operation " << op->op
9148 << " (op " << pos << ", counting from 0)" << dendl;
9149 _dump_transaction(t, 0);
9150 assert(0 == "unexpected error");
9151 }
9152
9153 // these operations implicity create the object
9154 bool create = false;
9155 if (op->op == Transaction::OP_TOUCH ||
9156 op->op == Transaction::OP_WRITE ||
9157 op->op == Transaction::OP_ZERO) {
9158 create = true;
9159 }
9160
9161 // object operations
9162 RWLock::WLocker l(c->lock);
9163 OnodeRef &o = ovec[op->oid];
9164 if (!o) {
9165 ghobject_t oid = i.get_oid(op->oid);
9166 o = c->get_onode(oid, create);
9167 }
9168 if (!create && (!o || !o->exists)) {
9169 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
9170 << i.get_oid(op->oid) << dendl;
9171 r = -ENOENT;
9172 goto endop;
9173 }
9174
9175 switch (op->op) {
9176 case Transaction::OP_TOUCH:
9177 r = _touch(txc, c, o);
9178 break;
9179
9180 case Transaction::OP_WRITE:
9181 {
9182 uint64_t off = op->off;
9183 uint64_t len = op->len;
9184 uint32_t fadvise_flags = i.get_fadvise_flags();
9185 bufferlist bl;
9186 i.decode_bl(bl);
9187 r = _write(txc, c, o, off, len, bl, fadvise_flags);
9188 }
9189 break;
9190
9191 case Transaction::OP_ZERO:
9192 {
9193 uint64_t off = op->off;
9194 uint64_t len = op->len;
9195 r = _zero(txc, c, o, off, len);
9196 }
9197 break;
9198
9199 case Transaction::OP_TRIMCACHE:
9200 {
9201 // deprecated, no-op
9202 }
9203 break;
9204
9205 case Transaction::OP_TRUNCATE:
9206 {
9207 uint64_t off = op->off;
35e4c445 9208 r = _truncate(txc, c, o, off);
7c673cae
FG
9209 }
9210 break;
9211
9212 case Transaction::OP_REMOVE:
9213 {
9214 r = _remove(txc, c, o);
9215 }
9216 break;
9217
9218 case Transaction::OP_SETATTR:
9219 {
9220 string name = i.decode_string();
9221 bufferptr bp;
9222 i.decode_bp(bp);
9223 r = _setattr(txc, c, o, name, bp);
9224 }
9225 break;
9226
9227 case Transaction::OP_SETATTRS:
9228 {
9229 map<string, bufferptr> aset;
9230 i.decode_attrset(aset);
9231 r = _setattrs(txc, c, o, aset);
9232 }
9233 break;
9234
9235 case Transaction::OP_RMATTR:
9236 {
9237 string name = i.decode_string();
9238 r = _rmattr(txc, c, o, name);
9239 }
9240 break;
9241
9242 case Transaction::OP_RMATTRS:
9243 {
9244 r = _rmattrs(txc, c, o);
9245 }
9246 break;
9247
9248 case Transaction::OP_CLONE:
9249 {
9250 OnodeRef& no = ovec[op->dest_oid];
9251 if (!no) {
9252 const ghobject_t& noid = i.get_oid(op->dest_oid);
9253 no = c->get_onode(noid, true);
9254 }
9255 r = _clone(txc, c, o, no);
9256 }
9257 break;
9258
9259 case Transaction::OP_CLONERANGE:
9260 assert(0 == "deprecated");
9261 break;
9262
9263 case Transaction::OP_CLONERANGE2:
9264 {
9265 OnodeRef& no = ovec[op->dest_oid];
9266 if (!no) {
9267 const ghobject_t& noid = i.get_oid(op->dest_oid);
9268 no = c->get_onode(noid, true);
9269 }
9270 uint64_t srcoff = op->off;
9271 uint64_t len = op->len;
9272 uint64_t dstoff = op->dest_off;
9273 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
9274 }
9275 break;
9276
9277 case Transaction::OP_COLL_ADD:
9278 assert(0 == "not implemented");
9279 break;
9280
9281 case Transaction::OP_COLL_REMOVE:
9282 assert(0 == "not implemented");
9283 break;
9284
9285 case Transaction::OP_COLL_MOVE:
9286 assert(0 == "deprecated");
9287 break;
9288
9289 case Transaction::OP_COLL_MOVE_RENAME:
9290 case Transaction::OP_TRY_RENAME:
9291 {
9292 assert(op->cid == op->dest_cid);
9293 const ghobject_t& noid = i.get_oid(op->dest_oid);
9294 OnodeRef& no = ovec[op->dest_oid];
9295 if (!no) {
9296 no = c->get_onode(noid, false);
9297 }
9298 r = _rename(txc, c, o, no, noid);
9299 }
9300 break;
9301
9302 case Transaction::OP_OMAP_CLEAR:
9303 {
9304 r = _omap_clear(txc, c, o);
9305 }
9306 break;
9307 case Transaction::OP_OMAP_SETKEYS:
9308 {
9309 bufferlist aset_bl;
9310 i.decode_attrset_bl(&aset_bl);
9311 r = _omap_setkeys(txc, c, o, aset_bl);
9312 }
9313 break;
9314 case Transaction::OP_OMAP_RMKEYS:
9315 {
9316 bufferlist keys_bl;
9317 i.decode_keyset_bl(&keys_bl);
9318 r = _omap_rmkeys(txc, c, o, keys_bl);
9319 }
9320 break;
9321 case Transaction::OP_OMAP_RMKEYRANGE:
9322 {
9323 string first, last;
9324 first = i.decode_string();
9325 last = i.decode_string();
9326 r = _omap_rmkey_range(txc, c, o, first, last);
9327 }
9328 break;
9329 case Transaction::OP_OMAP_SETHEADER:
9330 {
9331 bufferlist bl;
9332 i.decode_bl(bl);
9333 r = _omap_setheader(txc, c, o, bl);
9334 }
9335 break;
9336
9337 case Transaction::OP_SETALLOCHINT:
9338 {
9339 r = _set_alloc_hint(txc, c, o,
9340 op->expected_object_size,
9341 op->expected_write_size,
9342 op->alloc_hint_flags);
9343 }
9344 break;
9345
9346 default:
9347 derr << __func__ << " bad op " << op->op << dendl;
9348 ceph_abort();
9349 }
9350
9351 endop:
9352 if (r < 0) {
9353 bool ok = false;
9354
9355 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
9356 op->op == Transaction::OP_CLONE ||
9357 op->op == Transaction::OP_CLONERANGE2 ||
9358 op->op == Transaction::OP_COLL_ADD ||
9359 op->op == Transaction::OP_SETATTR ||
9360 op->op == Transaction::OP_SETATTRS ||
9361 op->op == Transaction::OP_RMATTR ||
9362 op->op == Transaction::OP_OMAP_SETKEYS ||
9363 op->op == Transaction::OP_OMAP_RMKEYS ||
9364 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
9365 op->op == Transaction::OP_OMAP_SETHEADER))
9366 // -ENOENT is usually okay
9367 ok = true;
9368 if (r == -ENODATA)
9369 ok = true;
9370
9371 if (!ok) {
9372 const char *msg = "unexpected error code";
9373
9374 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
9375 op->op == Transaction::OP_CLONE ||
9376 op->op == Transaction::OP_CLONERANGE2))
9377 msg = "ENOENT on clone suggests osd bug";
9378
9379 if (r == -ENOSPC)
9380 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
9381 // by partially applying transactions.
9382 msg = "ENOSPC from bluestore, misconfigured cluster";
9383
9384 if (r == -ENOTEMPTY) {
9385 msg = "ENOTEMPTY suggests garbage data in osd data dir";
9386 }
9387
9388 derr << __func__ << " error " << cpp_strerror(r)
9389 << " not handled on operation " << op->op
9390 << " (op " << pos << ", counting from 0)"
9391 << dendl;
9392 derr << msg << dendl;
9393 _dump_transaction(t, 0);
9394 assert(0 == "unexpected error");
9395 }
9396 }
9397 }
9398}
9399
9400
9401
9402// -----------------
9403// write operations
9404
9405int BlueStore::_touch(TransContext *txc,
9406 CollectionRef& c,
9407 OnodeRef &o)
9408{
9409 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
9410 int r = 0;
7c673cae
FG
9411 _assign_nid(txc, o);
9412 txc->write_onode(o);
9413 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
9414 return r;
9415}
9416
94b18763 9417void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
7c673cae
FG
9418{
9419 if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
9420 return;
9421 dout(log_level) << __func__ << " " << o << " " << o->oid
9422 << " nid " << o->onode.nid
9423 << " size 0x" << std::hex << o->onode.size
9424 << " (" << std::dec << o->onode.size << ")"
9425 << " expected_object_size " << o->onode.expected_object_size
9426 << " expected_write_size " << o->onode.expected_write_size
9427 << " in " << o->onode.extent_map_shards.size() << " shards"
9428 << ", " << o->extent_map.spanning_blob_map.size()
9429 << " spanning blobs"
9430 << dendl;
9431 for (auto p = o->onode.attrs.begin();
9432 p != o->onode.attrs.end();
9433 ++p) {
9434 dout(log_level) << __func__ << " attr " << p->first
9435 << " len " << p->second.length() << dendl;
9436 }
9437 _dump_extent_map(o->extent_map, log_level);
9438}
9439
9440void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
9441{
9442 uint64_t pos = 0;
9443 for (auto& s : em.shards) {
9444 dout(log_level) << __func__ << " shard " << *s.shard_info
9445 << (s.loaded ? " (loaded)" : "")
9446 << (s.dirty ? " (dirty)" : "")
9447 << dendl;
9448 }
9449 for (auto& e : em.extent_map) {
9450 dout(log_level) << __func__ << " " << e << dendl;
9451 assert(e.logical_offset >= pos);
9452 pos = e.logical_offset + e.length;
9453 const bluestore_blob_t& blob = e.blob->get_blob();
9454 if (blob.has_csum()) {
9455 vector<uint64_t> v;
9456 unsigned n = blob.get_csum_count();
9457 for (unsigned i = 0; i < n; ++i)
9458 v.push_back(blob.get_csum_item(i));
9459 dout(log_level) << __func__ << " csum: " << std::hex << v << std::dec
9460 << dendl;
9461 }
9462 std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
9463 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
9464 dout(log_level) << __func__ << " 0x" << std::hex << i.first
9465 << "~" << i.second->length << std::dec
9466 << " " << *i.second << dendl;
9467 }
9468 }
9469}
9470
9471void BlueStore::_dump_transaction(Transaction *t, int log_level)
9472{
9473 dout(log_level) << " transaction dump:\n";
9474 JSONFormatter f(true);
9475 f.open_object_section("transaction");
9476 t->dump(&f);
9477 f.close_section();
9478 f.flush(*_dout);
9479 *_dout << dendl;
9480}
9481
9482void BlueStore::_pad_zeros(
9483 bufferlist *bl, uint64_t *offset,
9484 uint64_t chunk_size)
9485{
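  // Zero-fill the front and back of *bl so that both ends land on
  // chunk_size boundaries, adjusting *offset accordingly.  For example,
  // with chunk_size = 0x1000, a 0x200-byte buffer aimed at *offset 0x1100
  // gets 0x100 zero bytes prepended and 0xd00 appended, *offset becomes
  // 0x1000 and bl->length() becomes 0x1000.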
9486 auto length = bl->length();
9487 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
9488 << " chunk_size 0x" << chunk_size << std::dec << dendl;
9489 dout(40) << "before:\n";
9490 bl->hexdump(*_dout);
9491 *_dout << dendl;
9492 // front
9493 size_t front_pad = *offset % chunk_size;
9494 size_t back_pad = 0;
9495 size_t pad_count = 0;
9496 if (front_pad) {
9497 size_t front_copy = MIN(chunk_size - front_pad, length);
9498 bufferptr z = buffer::create_page_aligned(chunk_size);
224ce89b 9499 z.zero(0, front_pad, false);
7c673cae 9500 pad_count += front_pad;
224ce89b 9501 bl->copy(0, front_copy, z.c_str() + front_pad);
7c673cae
FG
9502 if (front_copy + front_pad < chunk_size) {
9503 back_pad = chunk_size - (length + front_pad);
224ce89b 9504 z.zero(front_pad + length, back_pad, false);
7c673cae
FG
9505 pad_count += back_pad;
9506 }
9507 bufferlist old, t;
9508 old.swap(*bl);
9509 t.substr_of(old, front_copy, length - front_copy);
9510 bl->append(z);
9511 bl->claim_append(t);
9512 *offset -= front_pad;
224ce89b 9513 length += pad_count;
7c673cae
FG
9514 }
9515
9516 // back
9517 uint64_t end = *offset + length;
9518 unsigned back_copy = end % chunk_size;
9519 if (back_copy) {
9520 assert(back_pad == 0);
9521 back_pad = chunk_size - back_copy;
9522 assert(back_copy <= length);
9523 bufferptr tail(chunk_size);
224ce89b
WB
9524 bl->copy(length - back_copy, back_copy, tail.c_str());
9525 tail.zero(back_copy, back_pad, false);
7c673cae
FG
9526 bufferlist old;
9527 old.swap(*bl);
9528 bl->substr_of(old, 0, length - back_copy);
9529 bl->append(tail);
9530 length += back_pad;
9531 pad_count += back_pad;
9532 }
9533 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
9534 << back_pad << " on front/back, now 0x" << *offset << "~"
9535 << length << std::dec << dendl;
9536 dout(40) << "after:\n";
9537 bl->hexdump(*_dout);
9538 *_dout << dendl;
9539 if (pad_count)
9540 logger->inc(l_bluestore_write_pad_bytes, pad_count);
9541 assert(bl->length() == length);
9542}
9543
9544void BlueStore::_do_write_small(
9545 TransContext *txc,
9546 CollectionRef &c,
9547 OnodeRef o,
9548 uint64_t offset, uint64_t length,
9549 bufferlist::iterator& blp,
9550 WriteContext *wctx)
9551{
9552 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9553 << std::dec << dendl;
9554 assert(length < min_alloc_size);
9555 uint64_t end_offs = offset + length;
9556
9557 logger->inc(l_bluestore_write_small);
9558 logger->inc(l_bluestore_write_small_bytes, length);
9559
9560 bufferlist bl;
9561 blp.copy(length, bl);
9562
9563 // Look for an existing mutable blob we can use.
9564 auto begin = o->extent_map.extent_map.begin();
9565 auto end = o->extent_map.extent_map.end();
9566 auto ep = o->extent_map.seek_lextent(offset);
9567 if (ep != begin) {
9568 --ep;
9569 if (ep->blob_end() <= offset) {
9570 ++ep;
9571 }
9572 }
9573 auto prev_ep = ep;
9574 if (prev_ep != begin) {
9575 --prev_ep;
9576 } else {
9577 prev_ep = end; // to avoid this extent check as it's a duplicate
9578 }
9579
9580 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9581 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9582 uint32_t alloc_len = min_alloc_size;
9583 auto offset0 = P2ALIGN(offset, alloc_len);
9584
9585 bool any_change;
9586
9587 // search suitable extent in both forward and reverse direction in
9588 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 9589 // then check if blob can be reused via can_reuse_blob func or apply
7c673cae
FG
9590 // direct/deferred write (the latter for extents including or higher
9591 // than 'offset' only).
9592 do {
9593 any_change = false;
9594
9595 if (ep != end && ep->logical_offset < offset + max_bsize) {
9596 BlobRef b = ep->blob;
9597 auto bstart = ep->blob_start();
9598 dout(20) << __func__ << " considering " << *b
9599 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9600 if (bstart >= end_offs) {
9601 dout(20) << __func__ << " ignoring distant " << *b << dendl;
9602 } else if (!b->get_blob().is_mutable()) {
9603 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
9604 } else if (ep->logical_offset % min_alloc_size !=
9605 ep->blob_offset % min_alloc_size) {
9606 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
9607 } else {
9608 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9609 // can we pad our head/tail out with zeros?
9610 uint64_t head_pad, tail_pad;
9611 head_pad = P2PHASE(offset, chunk_size);
9612 tail_pad = P2NPHASE(end_offs, chunk_size);
9613 if (head_pad || tail_pad) {
9614 o->extent_map.fault_range(db, offset - head_pad,
9615 end_offs - offset + head_pad + tail_pad);
9616 }
9617 if (head_pad &&
9618 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
9619 head_pad = 0;
9620 }
9621 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
9622 tail_pad = 0;
9623 }
9624
9625 uint64_t b_off = offset - head_pad - bstart;
9626 uint64_t b_len = length + head_pad + tail_pad;
9627
9628 // direct write into unused blocks of an existing mutable blob?
9629 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
9630 b->get_blob().get_ondisk_length() >= b_off + b_len &&
9631 b->get_blob().is_unused(b_off, b_len) &&
9632 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 9633 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
9634
9635 dout(20) << __func__ << " write to unused 0x" << std::hex
9636 << b_off << "~" << b_len
9637 << " pad 0x" << head_pad << " + 0x" << tail_pad
9638 << std::dec << " of mutable " << *b << dendl;
224ce89b 9639 _buffer_cache_write(txc, b, b_off, bl,
9640 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9641
9642 if (!g_conf->bluestore_debug_omit_block_device_write) {
9643 if (b_len <= prefer_deferred_size) {
9644 dout(20) << __func__ << " deferring small 0x" << std::hex
9645 << b_len << std::dec << " unused write via deferred" << dendl;
9646 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9647 op->op = bluestore_deferred_op_t::OP_WRITE;
9648 b->get_blob().map(
9649 b_off, b_len,
9650 [&](uint64_t offset, uint64_t length) {
9651 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9652 return 0;
9653 });
224ce89b 9654 op->data = bl;
7c673cae
FG
9655 } else {
9656 b->get_blob().map_bl(
224ce89b 9657 b_off, bl,
9658 [&](uint64_t offset, bufferlist& t) {
9659 bdev->aio_write(offset, t,
9660 &txc->ioc, wctx->buffered);
9661 });
9662 }
9663 }
224ce89b 9664 b->dirty_blob().calc_csum(b_off, bl);
7c673cae
FG
9665 dout(20) << __func__ << " lex old " << *ep << dendl;
9666 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
9667 b,
9668 &wctx->old_extents);
9669 b->dirty_blob().mark_used(le->blob_offset, le->length);
9670 txc->statfs_delta.stored() += le->length;
9671 dout(20) << __func__ << " lex " << *le << dendl;
9672 logger->inc(l_bluestore_write_small_unused);
9673 return;
9674 }
9675 // read some data to fill out the chunk?
9676 uint64_t head_read = P2PHASE(b_off, chunk_size);
9677 uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
9678 if ((head_read || tail_read) &&
9679 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
9680 head_read + tail_read < min_alloc_size) {
9681 b_off -= head_read;
9682 b_len += head_read + tail_read;
9683
9684 } else {
9685 head_read = tail_read = 0;
9686 }
9687
9688 // chunk-aligned deferred overwrite?
9689 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
9690 b_off % chunk_size == 0 &&
9691 b_len % chunk_size == 0 &&
9692 b->get_blob().is_allocated(b_off, b_len)) {
9693
224ce89b 9694 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
9695
9696 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
9697 << " and tail 0x" << tail_read << std::dec << dendl;
9698 if (head_read) {
9699 bufferlist head_bl;
9700 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
9701 head_bl, 0);
9702 assert(r >= 0 && r <= (int)head_read);
9703 size_t zlen = head_read - r;
9704 if (zlen) {
9705 head_bl.append_zero(zlen);
9706 logger->inc(l_bluestore_write_pad_bytes, zlen);
9707 }
224ce89b 9708 bl.claim_prepend(head_bl);
7c673cae
FG
9709 logger->inc(l_bluestore_write_penalty_read_ops);
9710 }
9711 if (tail_read) {
9712 bufferlist tail_bl;
9713 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
9714 tail_bl, 0);
9715 assert(r >= 0 && r <= (int)tail_read);
9716 size_t zlen = tail_read - r;
9717 if (zlen) {
9718 tail_bl.append_zero(zlen);
9719 logger->inc(l_bluestore_write_pad_bytes, zlen);
9720 }
224ce89b 9721 bl.claim_append(tail_bl);
7c673cae
FG
9722 logger->inc(l_bluestore_write_penalty_read_ops);
9723 }
9724 logger->inc(l_bluestore_write_small_pre_read);
9725
9726 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9727 op->op = bluestore_deferred_op_t::OP_WRITE;
224ce89b 9728 _buffer_cache_write(txc, b, b_off, bl,
9729 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9730
9731 int r = b->get_blob().map(
9732 b_off, b_len,
9733 [&](uint64_t offset, uint64_t length) {
9734 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9735 return 0;
9736 });
9737 assert(r == 0);
9738 if (b->get_blob().csum_type) {
224ce89b 9739 b->dirty_blob().calc_csum(b_off, bl);
7c673cae 9740 }
224ce89b 9741 op->data.claim(bl);
7c673cae
FG
9742 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
9743 << b_len << std::dec << " of mutable " << *b
9744 << " at " << op->extents << dendl;
9745 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
9746 b, &wctx->old_extents);
9747 b->dirty_blob().mark_used(le->blob_offset, le->length);
9748 txc->statfs_delta.stored() += le->length;
9749 dout(20) << __func__ << " lex " << *le << dendl;
9750 logger->inc(l_bluestore_write_small_deferred);
9751 return;
9752 }
224ce89b
WB
9753 // try to reuse blob if we can
9754 if (b->can_reuse_blob(min_alloc_size,
9755 max_bsize,
9756 offset0 - bstart,
9757 &alloc_len)) {
9758 assert(alloc_len == min_alloc_size); // expecting data to always
9759 // fit into the reused blob
9760 // Need to check for pending writes that want to
9761 // reuse the same pextent. The rationale is that during GC two chunks
9762 // from garbage (possibly compressed) blobs can share logical space within
9763 // the same AU. That, in turn, might be caused by an unaligned len in
9764 // clone_range2. Hence the second write would fail when attempting to
9765 // reuse the blob in do_alloc_write().
9766 if (!wctx->has_conflict(b,
9767 offset0,
9768 offset0 + alloc_len,
9769 min_alloc_size)) {
9770
9771 // we can't reuse pad_head/pad_tail since they might be truncated
9772 // due to existing extents
9773 uint64_t b_off = offset - bstart;
9774 uint64_t b_off0 = b_off;
9775 _pad_zeros(&bl, &b_off0, chunk_size);
9776
9777 dout(20) << __func__ << " reuse blob " << *b << std::hex
9778 << " (0x" << b_off0 << "~" << bl.length() << ")"
9779 << " (0x" << b_off << "~" << length << ")"
9780 << std::dec << dendl;
9781
9782 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9783 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9784 false, false);
9785 logger->inc(l_bluestore_write_small_unused);
9786 return;
9787 }
9788 }
9789 }
9790 ++ep;
9791 any_change = true;
9792 } // if (ep != end && ep->logical_offset < offset + max_bsize)
9793
9794 // check extent for reuse in reverse order
9795 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9796 BlobRef b = prev_ep->blob;
9797 auto bstart = prev_ep->blob_start();
9798 dout(20) << __func__ << " considering " << *b
9799 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 9800 if (b->can_reuse_blob(min_alloc_size,
9801 max_bsize,
9802 offset0 - bstart,
9803 &alloc_len)) {
9804 assert(alloc_len == min_alloc_size); // expecting data to always
9805 // fit into the reused blob
9806 // Need to check for pending writes that want to
9807 // reuse the same pextent. The rationale is that during GC two chunks
9808 // from garbage (possibly compressed) blobs can share logical space within
9809 // the same AU. That, in turn, might be caused by an unaligned len in
9810 // clone_range2. Hence the second write would fail when attempting to
9811 // reuse the blob in do_alloc_write().
9812 if (!wctx->has_conflict(b,
9813 offset0,
9814 offset0 + alloc_len,
9815 min_alloc_size)) {
9816
9817 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9818 uint64_t b_off = offset - bstart;
9819 uint64_t b_off0 = b_off;
9820 _pad_zeros(&bl, &b_off0, chunk_size);
9821
9822 dout(20) << __func__ << " reuse blob " << *b << std::hex
9823 << " (0x" << b_off0 << "~" << bl.length() << ")"
9824 << " (0x" << b_off << "~" << length << ")"
9825 << std::dec << dendl;
9826
9827 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9828 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9829 false, false);
9830 logger->inc(l_bluestore_write_small_unused);
9831 return;
9832 }
9833 }
9834 if (prev_ep != begin) {
9835 --prev_ep;
9836 any_change = true;
9837 } else {
9838 prev_ep = end; // to avoid useless first extent re-check
9839 }
9840 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
9841 } while (any_change);
9842
9843 // new blob.
9844
9845 BlobRef b = c->new_blob();
9846 uint64_t b_off = P2PHASE(offset, alloc_len);
9847 uint64_t b_off0 = b_off;
9848 _pad_zeros(&bl, &b_off0, block_size);
9849 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9850 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
9851 logger->inc(l_bluestore_write_small_new);
9852
9853 return;
9854}
9855
9856void BlueStore::_do_write_big(
9857 TransContext *txc,
9858 CollectionRef &c,
9859 OnodeRef o,
9860 uint64_t offset, uint64_t length,
9861 bufferlist::iterator& blp,
9862 WriteContext *wctx)
9863{
9864 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9865 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
9866 << " compress " << (int)wctx->compress
9867 << dendl;
9868 logger->inc(l_bluestore_write_big);
9869 logger->inc(l_bluestore_write_big_bytes, length);
9870 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9871 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9872 while (length > 0) {
9873 bool new_blob = false;
9874 uint32_t l = MIN(max_bsize, length);
9875 BlobRef b;
9876 uint32_t b_off = 0;
9877
9878 //attempting to reuse existing blob
9879 if (!wctx->compress) {
9880 // look for an existing mutable blob we can reuse
9881 auto begin = o->extent_map.extent_map.begin();
9882 auto end = o->extent_map.extent_map.end();
9883 auto ep = o->extent_map.seek_lextent(offset);
9884 auto prev_ep = ep;
9885 if (prev_ep != begin) {
9886 --prev_ep;
9887 } else {
9888 prev_ep = end; // to avoid this extent check as it's a duplicate
9889 }
9890 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9891 // search suitable extent in both forward and reverse direction in
9892 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 9893 // then check if blob can be reused via can_reuse_blob func.
7c673cae
FG
9894 bool any_change;
9895 do {
9896 any_change = false;
9897 if (ep != end && ep->logical_offset < offset + max_bsize) {
9898 if (offset >= ep->blob_start() &&
224ce89b 9899 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
9900 offset - ep->blob_start(),
9901 &l)) {
9902 b = ep->blob;
9903 b_off = offset - ep->blob_start();
9904 prev_ep = end; // to avoid check below
9905 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 9906 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
9907 } else {
9908 ++ep;
9909 any_change = true;
9910 }
9911 }
9912
9913 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
224ce89b 9914 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
9915 offset - prev_ep->blob_start(),
9916 &l)) {
9917 b = prev_ep->blob;
9918 b_off = offset - prev_ep->blob_start();
9919 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 9920 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
9921 } else if (prev_ep != begin) {
9922 --prev_ep;
9923 any_change = true;
9924 } else {
9925 prev_ep = end; // to avoid useless first extent re-check
9926 }
9927 }
9928 } while (b == nullptr && any_change);
9929 }
9930 if (b == nullptr) {
9931 b = c->new_blob();
9932 b_off = 0;
9933 new_blob = true;
9934 }
9935
9936 bufferlist t;
9937 blp.copy(l, t);
9938 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
9939 offset += l;
9940 length -= l;
9941 logger->inc(l_bluestore_write_big_blobs);
9942 }
9943}
9944
9945int BlueStore::_do_alloc_write(
9946 TransContext *txc,
9947 CollectionRef coll,
9948 OnodeRef o,
9949 WriteContext *wctx)
9950{
9951 dout(20) << __func__ << " txc " << txc
9952 << " " << wctx->writes.size() << " blobs"
9953 << dendl;
3efd9988
FG
9954 if (wctx->writes.empty()) {
9955 return 0;
7c673cae
FG
9956 }
9957
7c673cae
FG
9958 CompressorRef c;
9959 double crr = 0;
9960 if (wctx->compress) {
9961 c = select_option(
9962 "compression_algorithm",
9963 compressor,
9964 [&]() {
9965 string val;
9966 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
9967 CompressorRef cp = compressor;
9968 if (!cp || cp->get_type_name() != val) {
9969 cp = Compressor::create(cct, val);
9970 }
9971 return boost::optional<CompressorRef>(cp);
9972 }
9973 return boost::optional<CompressorRef>();
9974 }
9975 );
9976
9977 crr = select_option(
9978 "compression_required_ratio",
9979 cct->_conf->bluestore_compression_required_ratio,
9980 [&]() {
9981 double val;
3efd9988 9982 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
7c673cae
FG
9983 return boost::optional<double>(val);
9984 }
9985 return boost::optional<double>();
9986 }
9987 );
9988 }
9989
9990 // checksum
9991 int csum = csum_type.load();
9992 csum = select_option(
9993 "csum_type",
9994 csum,
9995 [&]() {
9996 int val;
3efd9988 9997 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
7c673cae
FG
9998 return boost::optional<int>(val);
9999 }
10000 return boost::optional<int>();
10001 }
10002 );
10003
3efd9988
FG
10004 // compress (as needed) and calc needed space
10005 uint64_t need = 0;
10006 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
7c673cae 10007 for (auto& wi : wctx->writes) {
3efd9988 10008 if (c && wi.blob_length > min_alloc_size) {
7c673cae
FG
10009 utime_t start = ceph_clock_now();
10010
10011 // compress
3efd9988
FG
10012 assert(wi.b_off == 0);
10013 assert(wi.blob_length == wi.bl.length());
10014
7c673cae
FG
10015 // FIXME: memory alignment here is bad
10016 bufferlist t;
3efd9988 10017 int r = c->compress(wi.bl, t);
7c673cae
FG
10018 assert(r == 0);
10019
3efd9988
FG
10020 bluestore_compression_header_t chdr;
10021 chdr.type = c->get_type();
7c673cae 10022 chdr.length = t.length();
3efd9988
FG
10023 ::encode(chdr, wi.compressed_bl);
10024 wi.compressed_bl.claim_append(t);
10025
10026 wi.compressed_len = wi.compressed_bl.length();
10027 uint64_t newlen = P2ROUNDUP(wi.compressed_len, min_alloc_size);
10028 uint64_t want_len_raw = wi.blob_length * crr;
7c673cae 10029 uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
3efd9988
FG
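      // Keep the compressed copy only if it fits within
      // blob_length * compression_required_ratio (rounded up to the
      // allocation unit) and is strictly smaller than the raw blob; since
      // newlen is already a multiple of min_alloc_size, compression has to
      // save at least one allocation unit to be accepted.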
10030 if (newlen <= want_len && newlen < wi.blob_length) {
10031 // Cool. We compressed at least as much as we were hoping to.
10032 // pad out to min_alloc_size
10033 wi.compressed_bl.append_zero(newlen - wi.compressed_len);
10034 logger->inc(l_bluestore_write_pad_bytes, newlen - wi.compressed_len);
7c673cae 10035 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
3efd9988 10036 << " -> 0x" << wi.compressed_len << " => 0x" << newlen
10037 << " with " << c->get_type()
10038 << std::dec << dendl;
3efd9988
FG
10039 txc->statfs_delta.compressed() += wi.compressed_len;
10040 txc->statfs_delta.compressed_original() += wi.blob_length;
7c673cae 10041 txc->statfs_delta.compressed_allocated() += newlen;
3efd9988
FG
10042 logger->inc(l_bluestore_compress_success_count);
10043 wi.compressed = true;
10044 need += newlen;
7c673cae 10045 } else {
3efd9988
FG
10046 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
10047 << " compressed to 0x" << wi.compressed_len << " -> 0x" << newlen
10048 << " with " << c->get_type()
10049 << ", which is more than required 0x" << want_len_raw
7c673cae 10050 << " -> 0x" << want_len
10051 << ", leaving uncompressed"
10052 << std::dec << dendl;
10053 logger->inc(l_bluestore_compress_rejected_count);
10054 need += wi.blob_length;
7c673cae
FG
10055 }
10056 logger->tinc(l_bluestore_compress_lat,
10057 ceph_clock_now() - start);
3efd9988
FG
10058 } else {
10059 need += wi.blob_length;
7c673cae 10060 }
3efd9988
FG
10061 }
10062 int r = alloc->reserve(need);
10063 if (r < 0) {
10064 derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
10065 << dendl;
10066 return r;
10067 }
10068 AllocExtentVector prealloc;
10069 prealloc.reserve(2 * wctx->writes.size());
10070 int prealloc_left = 0;
10071 prealloc_left = alloc->allocate(
10072 need, min_alloc_size, need,
10073 0, &prealloc);
10074 assert(prealloc_left == (int64_t)need);
10075 dout(20) << __func__ << " prealloc " << prealloc << dendl;
10076 auto prealloc_pos = prealloc.begin();
10077
10078 for (auto& wi : wctx->writes) {
10079 BlobRef b = wi.b;
10080 bluestore_blob_t& dblob = b->dirty_blob();
10081 uint64_t b_off = wi.b_off;
10082 bufferlist *l = &wi.bl;
10083 uint64_t final_length = wi.blob_length;
10084 uint64_t csum_length = wi.blob_length;
10085 unsigned csum_order = block_size_order;
10086 if (wi.compressed) {
10087 final_length = wi.compressed_bl.length();
10088 csum_length = final_length;
10089 csum_order = ctz(csum_length);
10090 l = &wi.compressed_bl;
10091 dblob.set_compressed(wi.blob_length, wi.compressed_len);
10092 } else if (wi.new_blob) {
7c673cae 10093 // initialize newly created blob only
31f18b77 10094 assert(dblob.is_mutable());
7c673cae
FG
10095 if (l->length() != wi.blob_length) {
10096 // hrm, maybe we could do better here, but let's not bother.
10097 dout(20) << __func__ << " forcing csum_order to block_size_order "
10098 << block_size_order << dendl;
31f18b77 10099 csum_order = block_size_order;
7c673cae
FG
10100 } else {
10101 csum_order = std::min(wctx->csum_order, ctz(l->length()));
10102 }
10103 // try to align blob with max_blob_size to improve
10104 // its reuse ratio, e.g. in case of reverse write
10105 uint32_t suggested_boff =
10106 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
10107 if ((suggested_boff % (1 << csum_order)) == 0 &&
10108 suggested_boff + final_length <= max_bsize &&
10109 suggested_boff > b_off) {
181888fb 10110 dout(20) << __func__ << " forcing blob_offset to 0x"
10111 << std::hex << suggested_boff << std::dec << dendl;
10112 assert(suggested_boff >= b_off);
10113 csum_length += suggested_boff - b_off;
10114 b_off = suggested_boff;
10115 }
181888fb
FG
10116 if (csum != Checksummer::CSUM_NONE) {
10117 dout(20) << __func__ << " initialize csum setting for new blob " << *b
10118 << " csum_type " << Checksummer::get_csum_type_string(csum)
10119 << " csum_order " << csum_order
10120 << " csum_length 0x" << std::hex << csum_length << std::dec
10121 << dendl;
10122 dblob.init_csum(csum, csum_order, csum_length);
10123 }
7c673cae
FG
10124 }
10125
10126 AllocExtentVector extents;
3efd9988
FG
10127 int64_t left = final_length;
10128 while (left > 0) {
10129 assert(prealloc_left > 0);
10130 if (prealloc_pos->length <= left) {
10131 prealloc_left -= prealloc_pos->length;
10132 left -= prealloc_pos->length;
10133 txc->statfs_delta.allocated() += prealloc_pos->length;
10134 extents.push_back(*prealloc_pos);
10135 ++prealloc_pos;
10136 } else {
10137 extents.emplace_back(prealloc_pos->offset, left);
10138 prealloc_pos->offset += left;
10139 prealloc_pos->length -= left;
10140 prealloc_left -= left;
10141 txc->statfs_delta.allocated() += left;
10142 left = 0;
10143 break;
10144 }
10145 }
7c673cae 10146 for (auto& p : extents) {
3efd9988 10147 txc->allocated.insert(p.offset, p.length);
7c673cae
FG
10148 }
10149 dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
10150
181888fb
FG
10151 dout(20) << __func__ << " blob " << *b << dendl;
10152 if (dblob.has_csum()) {
7c673cae
FG
10153 dblob.calc_csum(b_off, *l);
10154 }
181888fb 10155
7c673cae
FG
10156 if (wi.mark_unused) {
10157 auto b_end = b_off + wi.bl.length();
10158 if (b_off) {
10159 dblob.add_unused(0, b_off);
10160 }
10161 if (b_end < wi.blob_length) {
10162 dblob.add_unused(b_end, wi.blob_length - b_end);
10163 }
10164 }
10165
10166 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
10167 b_off + (wi.b_off0 - wi.b_off),
10168 wi.length0,
10169 wi.b,
10170 nullptr);
10171 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
10172 txc->statfs_delta.stored() += le->length;
10173 dout(20) << __func__ << " lex " << *le << dendl;
10174 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
10175 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
10176
10177 // queue io
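    // Writes up to prefer_deferred_size are recorded as deferred ops and
    // persisted with the kv commit for later replay; anything larger is
    // submitted directly to the block device via aio_write().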
10178 if (!g_conf->bluestore_debug_omit_block_device_write) {
10179 if (l->length() <= prefer_deferred_size.load()) {
10180 dout(20) << __func__ << " deferring small 0x" << std::hex
10181 << l->length() << std::dec << " write via deferred" << dendl;
10182 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
10183 op->op = bluestore_deferred_op_t::OP_WRITE;
10184 int r = b->get_blob().map(
10185 b_off, l->length(),
10186 [&](uint64_t offset, uint64_t length) {
10187 op->extents.emplace_back(bluestore_pextent_t(offset, length));
10188 return 0;
10189 });
10190 assert(r == 0);
10191 op->data = *l;
10192 } else {
10193 b->get_blob().map_bl(
10194 b_off, *l,
10195 [&](uint64_t offset, bufferlist& t) {
10196 bdev->aio_write(offset, t, &txc->ioc, false);
10197 });
10198 }
10199 }
10200 }
3efd9988
FG
10201 assert(prealloc_pos == prealloc.end());
10202 assert(prealloc_left == 0);
7c673cae
FG
10203 return 0;
10204}
10205
10206void BlueStore::_wctx_finish(
10207 TransContext *txc,
10208 CollectionRef& c,
10209 OnodeRef o,
31f18b77
FG
10210 WriteContext *wctx,
10211 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
10212{
10213 auto oep = wctx->old_extents.begin();
10214 while (oep != wctx->old_extents.end()) {
10215 auto &lo = *oep;
10216 oep = wctx->old_extents.erase(oep);
10217 dout(20) << __func__ << " lex_old " << lo.e << dendl;
10218 BlobRef b = lo.e.blob;
10219 const bluestore_blob_t& blob = b->get_blob();
10220 if (blob.is_compressed()) {
10221 if (lo.blob_empty) {
10222 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
10223 }
10224 txc->statfs_delta.compressed_original() -= lo.e.length;
10225 }
10226 auto& r = lo.r;
10227 txc->statfs_delta.stored() -= lo.e.length;
10228 if (!r.empty()) {
10229 dout(20) << __func__ << " blob release " << r << dendl;
10230 if (blob.is_shared()) {
10231 PExtentVector final;
10232 c->load_shared_blob(b->shared_blob);
10233 for (auto e : r) {
31f18b77
FG
10234 b->shared_blob->put_ref(
10235 e.offset, e.length, &final,
10236 b->is_referenced() ? nullptr : maybe_unshared_blobs);
7c673cae
FG
10237 }
10238 dout(20) << __func__ << " shared_blob release " << final
10239 << " from " << *b->shared_blob << dendl;
10240 txc->write_shared_blob(b->shared_blob);
10241 r.clear();
10242 r.swap(final);
10243 }
10244 }
10245 // we can't invalidate our logical extents as we drop them because
10246 // other lextents (either in our onode or others) may still
10247 // reference them. but we can throw out anything that is no
10248 // longer allocated. Note that this will leave behind edge bits
10249 // that are no longer referenced but not deallocated (until they
10250 // age out of the cache naturally).
10251 b->discard_unallocated(c.get());
10252 for (auto e : r) {
10253 dout(20) << __func__ << " release " << e << dendl;
10254 txc->released.insert(e.offset, e.length);
10255 txc->statfs_delta.allocated() -= e.length;
10256 if (blob.is_compressed()) {
10257 txc->statfs_delta.compressed_allocated() -= e.length;
10258 }
10259 }
10260 delete &lo;
10261 if (b->is_spanning() && !b->is_referenced()) {
10262 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
10263 << dendl;
10264 o->extent_map.spanning_blob_map.erase(b->id);
10265 }
10266 }
10267}
10268
10269void BlueStore::_do_write_data(
10270 TransContext *txc,
10271 CollectionRef& c,
10272 OnodeRef o,
10273 uint64_t offset,
10274 uint64_t length,
10275 bufferlist& bl,
10276 WriteContext *wctx)
10277{
10278 uint64_t end = offset + length;
10279 bufferlist::iterator p = bl.begin();
10280
10281 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
10282 (length != min_alloc_size)) {
10283 // we fall within the same block
10284 _do_write_small(txc, c, o, offset, length, p, wctx);
10285 } else {
10286 uint64_t head_offset, head_length;
10287 uint64_t middle_offset, middle_length;
10288 uint64_t tail_offset, tail_length;
10289
10290 head_offset = offset;
10291 head_length = P2NPHASE(offset, min_alloc_size);
10292
10293 tail_offset = P2ALIGN(end, min_alloc_size);
10294 tail_length = P2PHASE(end, min_alloc_size);
10295
10296 middle_offset = head_offset + head_length;
10297 middle_length = length - head_length - tail_length;
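    // For example, with min_alloc_size = 0x10000 a write of 0x6000~0x1c000
    // splits into a small head 0x6000~0xa000, a big middle 0x10000~0x10000
    // and a small tail 0x20000~0x2000, so only the unaligned edges go
    // through _do_write_small().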
10298
10299 if (head_length) {
10300 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
10301 }
10302
10303 if (middle_length) {
10304 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
10305 }
10306
10307 if (tail_length) {
10308 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
10309 }
10310 }
10311}
10312
31f18b77
FG
10313void BlueStore::_choose_write_options(
10314 CollectionRef& c,
10315 OnodeRef o,
10316 uint32_t fadvise_flags,
10317 WriteContext *wctx)
7c673cae 10318{
7c673cae
FG
10319 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10320 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 10321 wctx->buffered = true;
7c673cae
FG
10322 } else if (cct->_conf->bluestore_default_buffered_write &&
10323 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10324 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10325 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 10326 wctx->buffered = true;
7c673cae
FG
10327 }
10328
31f18b77
FG
10329 // apply basic csum block size
10330 wctx->csum_order = block_size_order;
7c673cae
FG
10331
10332 // compression parameters
10333 unsigned alloc_hints = o->onode.alloc_hint_flags;
10334 auto cm = select_option(
10335 "compression_mode",
31f18b77 10336 comp_mode.load(),
7c673cae
FG
10337 [&]() {
10338 string val;
10339 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
10340 return boost::optional<Compressor::CompressionMode>(
10341 Compressor::get_comp_mode_type(val));
7c673cae
FG
10342 }
10343 return boost::optional<Compressor::CompressionMode>();
10344 }
10345 );
31f18b77
FG
10346
10347 wctx->compress = (cm != Compressor::COMP_NONE) &&
10348 ((cm == Compressor::COMP_FORCE) ||
10349 (cm == Compressor::COMP_AGGRESSIVE &&
10350 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
10351 (cm == Compressor::COMP_PASSIVE &&
10352 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
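  // i.e. compress when the mode is 'force'; when it is 'aggressive' and the
  // client did not hint the object INCOMPRESSIBLE; or when it is 'passive'
  // and the client explicitly hinted it COMPRESSIBLE.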
10353
10354 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
10355 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
10356 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
10357 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 10358 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 10359
7c673cae 10360 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 10361
7c673cae 10362 if (o->onode.expected_write_size) {
224ce89b 10363 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 10364 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 10365 } else {
224ce89b 10366 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
10367 }
10368
31f18b77
FG
10369 if (wctx->compress) {
10370 wctx->target_blob_size = select_option(
7c673cae 10371 "compression_max_blob_size",
31f18b77 10372 comp_max_blob_size.load(),
10373 [&]() {
10374 int val;
10375 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
10376 return boost::optional<uint64_t>((uint64_t)val);
10377 }
10378 return boost::optional<uint64_t>();
10379 }
10380 );
10381 }
10382 } else {
31f18b77
FG
10383 if (wctx->compress) {
10384 wctx->target_blob_size = select_option(
7c673cae 10385 "compression_min_blob_size",
31f18b77 10386 comp_min_blob_size.load(),
10387 [&]() {
10388 int val;
10389 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
10390 return boost::optional<uint64_t>((uint64_t)val);
10391 }
10392 return boost::optional<uint64_t>();
10393 }
10394 );
10395 }
10396 }
31f18b77 10397
7c673cae 10398 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
10399 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
10400 wctx->target_blob_size = max_bsize;
7c673cae 10401 }
31f18b77 10402
7c673cae
FG
10403 // set the min blob size floor at 2x the min_alloc_size, or else we
10404 // won't be able to allocate a smaller extent for the compressed
10405 // data.
31f18b77
FG
10406 if (wctx->compress &&
10407 wctx->target_blob_size < min_alloc_size * 2) {
10408 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 10409 }
31f18b77
FG
10410
10411 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
10412 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
10413 << std::dec << dendl;
10414}
10415
10416int BlueStore::_do_gc(
10417 TransContext *txc,
10418 CollectionRef& c,
10419 OnodeRef o,
10420 const GarbageCollector& gc,
10421 const WriteContext& wctx,
10422 uint64_t *dirty_start,
10423 uint64_t *dirty_end)
10424{
10425 auto& extents_to_collect = gc.get_extents_to_collect();
10426
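  // Rewrite every range the GarbageCollector flagged: read the data back
  // and push it through a forked WriteContext so it lands in new blobs,
  // letting the old (partially dead, possibly compressed) blobs be
  // released, and widen the dirty range to cover whatever we touched.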
1adf2230 10427 bool dirty_range_updated = false;
31f18b77 10428 WriteContext wctx_gc;
7c673cae 10429 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 10430
31f18b77
FG
10431 for (auto it = extents_to_collect.begin();
10432 it != extents_to_collect.end();
10433 ++it) {
10434 bufferlist bl;
10435 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
10436 assert(r == (int)it->length);
10437
10438 o->extent_map.fault_range(db, it->offset, it->length);
10439 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
10440 logger->inc(l_bluestore_gc_merged, it->length);
10441
10442 if (*dirty_start > it->offset) {
10443 *dirty_start = it->offset;
1adf2230 10444 dirty_range_updated = true;
31f18b77
FG
10445 }
10446
10447 if (*dirty_end < it->offset + it->length) {
10448 *dirty_end = it->offset + it->length;
1adf2230 10449 dirty_range_updated = true;
31f18b77
FG
10450 }
10451 }
1adf2230
AA
10452 if (dirty_range_updated) {
10453 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
10454 }
31f18b77
FG
10455
10456 dout(30) << __func__ << " alloc write" << dendl;
10457 int r = _do_alloc_write(txc, c, o, &wctx_gc);
10458 if (r < 0) {
10459 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10460 << dendl;
10461 return r;
10462 }
10463
10464 _wctx_finish(txc, c, o, &wctx_gc);
10465 return 0;
10466}
10467
10468int BlueStore::_do_write(
10469 TransContext *txc,
10470 CollectionRef& c,
10471 OnodeRef o,
10472 uint64_t offset,
10473 uint64_t length,
10474 bufferlist& bl,
10475 uint32_t fadvise_flags)
10476{
10477 int r = 0;
10478
10479 dout(20) << __func__
10480 << " " << o->oid
10481 << " 0x" << std::hex << offset << "~" << length
10482 << " - have 0x" << o->onode.size
10483 << " (" << std::dec << o->onode.size << ")"
10484 << " bytes"
10485 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
10486 << dendl;
10487 _dump_onode(o);
10488
10489 if (length == 0) {
10490 return 0;
10491 }
10492
10493 uint64_t end = offset + length;
10494
10495 GarbageCollector gc(c->store->cct);
10496 int64_t benefit;
10497 auto dirty_start = offset;
10498 auto dirty_end = end;
10499
10500 WriteContext wctx;
10501 _choose_write_options(c, o, fadvise_flags, &wctx);
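  // Write pipeline: options are chosen above; next fault in the affected
  // extent-map range, carve the write into small/big chunks
  // (_do_write_data), allocate space and queue the io (_do_alloc_write),
  // then drop the overwritten extents in _wctx_finish() below.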
7c673cae
FG
10502 o->extent_map.fault_range(db, offset, length);
10503 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
10504 r = _do_alloc_write(txc, c, o, &wctx);
10505 if (r < 0) {
10506 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10507 << dendl;
10508 goto out;
10509 }
10510
31f18b77
FG
10511 // NB: _wctx_finish() will empty old_extents
10512 // so we must do gc estimation before that
7c673cae 10513 benefit = gc.estimate(offset,
10514 length,
10515 o->extent_map,
10516 wctx.old_extents,
10517 min_alloc_size);
7c673cae
FG
10518
10519 _wctx_finish(txc, c, o, &wctx);
10520 if (end > o->onode.size) {
10521 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 10522 << std::dec << dendl;
7c673cae
FG
10523 o->onode.size = end;
10524 }
10525
10526 if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
31f18b77
FG
10527 if (!gc.get_extents_to_collect().empty()) {
10528 dout(20) << __func__ << " perform garbage collection, "
10529 << "expected benefit = " << benefit << " AUs" << dendl;
10530 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
10531 if (r < 0) {
10532 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
10533 << dendl;
10534 goto out;
7c673cae 10535 }
1adf2230
AA
10536 dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
10537 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae
FG
10538 }
10539 }
7c673cae 10540 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
10541 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
10542
7c673cae
FG
10543 r = 0;
10544
10545 out:
10546 return r;
10547}
10548
10549int BlueStore::_write(TransContext *txc,
10550 CollectionRef& c,
10551 OnodeRef& o,
31f18b77
FG
10552 uint64_t offset, size_t length,
10553 bufferlist& bl,
10554 uint32_t fadvise_flags)
7c673cae
FG
10555{
10556 dout(15) << __func__ << " " << c->cid << " " << o->oid
10557 << " 0x" << std::hex << offset << "~" << length << std::dec
10558 << dendl;
35e4c445
FG
10559 int r = 0;
10560 if (offset + length >= OBJECT_MAX_SIZE) {
10561 r = -E2BIG;
10562 } else {
10563 _assign_nid(txc, o);
10564 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
10565 txc->write_onode(o);
10566 }
7c673cae
FG
10567 dout(10) << __func__ << " " << c->cid << " " << o->oid
10568 << " 0x" << std::hex << offset << "~" << length << std::dec
10569 << " = " << r << dendl;
10570 return r;
10571}
10572
10573int BlueStore::_zero(TransContext *txc,
10574 CollectionRef& c,
10575 OnodeRef& o,
10576 uint64_t offset, size_t length)
10577{
10578 dout(15) << __func__ << " " << c->cid << " " << o->oid
10579 << " 0x" << std::hex << offset << "~" << length << std::dec
10580 << dendl;
35e4c445
FG
10581 int r = 0;
10582 if (offset + length >= OBJECT_MAX_SIZE) {
10583 r = -E2BIG;
10584 } else {
10585 _assign_nid(txc, o);
10586 r = _do_zero(txc, c, o, offset, length);
10587 }
7c673cae
FG
10588 dout(10) << __func__ << " " << c->cid << " " << o->oid
10589 << " 0x" << std::hex << offset << "~" << length << std::dec
10590 << " = " << r << dendl;
10591 return r;
10592}
10593
10594int BlueStore::_do_zero(TransContext *txc,
10595 CollectionRef& c,
10596 OnodeRef& o,
10597 uint64_t offset, size_t length)
10598{
10599 dout(15) << __func__ << " " << c->cid << " " << o->oid
10600 << " 0x" << std::hex << offset << "~" << length << std::dec
10601 << dendl;
10602 int r = 0;
10603
10604 _dump_onode(o);
10605
10606 WriteContext wctx;
10607 o->extent_map.fault_range(db, offset, length);
10608 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 10609 o->extent_map.dirty_range(offset, length);
7c673cae
FG
10610 _wctx_finish(txc, c, o, &wctx);
10611
b32b8144 10612 if (length > 0 && offset + length > o->onode.size) {
7c673cae
FG
10613 o->onode.size = offset + length;
10614 dout(20) << __func__ << " extending size to " << offset + length
10615 << dendl;
10616 }
10617 txc->write_onode(o);
10618
10619 dout(10) << __func__ << " " << c->cid << " " << o->oid
10620 << " 0x" << std::hex << offset << "~" << length << std::dec
10621 << " = " << r << dendl;
10622 return r;
10623}
10624
10625void BlueStore::_do_truncate(
31f18b77
FG
10626 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
10627 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
10628{
10629 dout(15) << __func__ << " " << c->cid << " " << o->oid
10630 << " 0x" << std::hex << offset << std::dec << dendl;
10631
10632 _dump_onode(o, 30);
10633
10634 if (offset == o->onode.size)
31f18b77 10635 return;
7c673cae
FG
10636
10637 if (offset < o->onode.size) {
10638 WriteContext wctx;
10639 uint64_t length = o->onode.size - offset;
10640 o->extent_map.fault_range(db, offset, length);
10641 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77
FG
10642 o->extent_map.dirty_range(offset, length);
10643 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
10644
10645 // if we have shards past EOF, ask for a reshard
10646 if (!o->onode.extent_map_shards.empty() &&
10647 o->onode.extent_map_shards.back().offset >= offset) {
10648 dout(10) << __func__ << " request reshard past EOF" << dendl;
10649 if (offset) {
10650 o->extent_map.request_reshard(offset - 1, offset + length);
10651 } else {
10652 o->extent_map.request_reshard(0, length);
10653 }
10654 }
10655 }
10656
10657 o->onode.size = offset;
10658
10659 txc->write_onode(o);
10660}
10661
35e4c445 10662int BlueStore::_truncate(TransContext *txc,
7c673cae
FG
10663 CollectionRef& c,
10664 OnodeRef& o,
10665 uint64_t offset)
10666{
10667 dout(15) << __func__ << " " << c->cid << " " << o->oid
10668 << " 0x" << std::hex << offset << std::dec
10669 << dendl;
35e4c445
FG
10670 int r = 0;
10671 if (offset >= OBJECT_MAX_SIZE) {
10672 r = -E2BIG;
10673 } else {
10674 _do_truncate(txc, c, o, offset);
10675 }
10676 dout(10) << __func__ << " " << c->cid << " " << o->oid
10677 << " 0x" << std::hex << offset << std::dec
10678 << " = " << r << dendl;
10679 return r;
7c673cae
FG
10680}
10681
10682int BlueStore::_do_remove(
10683 TransContext *txc,
10684 CollectionRef& c,
10685 OnodeRef o)
10686{
31f18b77 10687 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
10688 bool is_gen = !o->oid.is_no_gen();
10689 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
10690 if (o->onode.has_omap()) {
10691 o->flush();
10692 _do_omap_clear(txc, o->onode.nid);
10693 }
10694 o->exists = false;
10695 string key;
10696 for (auto &s : o->extent_map.shards) {
10697 dout(20) << __func__ << " removing shard 0x" << std::hex
10698 << s.shard_info->offset << std::dec << dendl;
10699 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
10700 [&](const string& final_key) {
10701 txc->t->rmkey(PREFIX_OBJ, final_key);
10702 }
10703 );
10704 }
10705 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
10706 txc->removed(o);
10707 o->extent_map.clear();
10708 o->onode = bluestore_onode_t();
10709 _debug_obj_on_delete(o->oid);
31f18b77 10710
224ce89b
WB
10711 if (!is_gen || maybe_unshared_blobs.empty()) {
10712 return 0;
10713 }
31f18b77 10714
224ce89b
WB
10715 // see if we can unshare blobs still referenced by the head
10716 dout(10) << __func__ << " gen and maybe_unshared_blobs "
10717 << maybe_unshared_blobs << dendl;
10718 ghobject_t nogen = o->oid;
10719 nogen.generation = ghobject_t::NO_GEN;
10720 OnodeRef h = c->onode_map.lookup(nogen);
10721
10722 if (!h || !h->exists) {
10723 return 0;
10724 }
10725
10726 dout(20) << __func__ << " checking for unshareable blobs on " << h
10727 << " " << h->oid << dendl;
10728 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
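  // Accumulate the references the head object still holds on each candidate
  // shared blob; if that exactly matches the blob's persistent ref_map
  // below, the clone we just removed held the only other references and the
  // blob can be unshared.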
10729 for (auto& e : h->extent_map.extent_map) {
10730 const bluestore_blob_t& b = e.blob->get_blob();
10731 SharedBlob *sb = e.blob->shared_blob.get();
10732 if (b.is_shared() &&
10733 sb->loaded &&
10734 maybe_unshared_blobs.count(sb)) {
3efd9988
FG
10735 if (b.is_compressed()) {
10736 expect[sb].get(0, b.get_ondisk_length());
10737 } else {
10738 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
10739 expect[sb].get(off, len);
10740 return 0;
10741 });
10742 }
224ce89b
WB
10743 }
10744 }
31f18b77 10745
224ce89b
WB
10746 vector<SharedBlob*> unshared_blobs;
10747 unshared_blobs.reserve(maybe_unshared_blobs.size());
10748 for (auto& p : expect) {
10749 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
10750 if (p.first->persistent->ref_map == p.second) {
10751 SharedBlob *sb = p.first;
10752 dout(20) << __func__ << " unsharing " << *sb << dendl;
10753 unshared_blobs.push_back(sb);
10754 txc->unshare_blob(sb);
10755 uint64_t sbid = c->make_blob_unshared(sb);
10756 string key;
10757 get_shared_blob_key(sbid, &key);
10758 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
10759 }
10760 }
10761
10762 if (unshared_blobs.empty()) {
10763 return 0;
10764 }
10765
224ce89b
WB
10766 for (auto& e : h->extent_map.extent_map) {
10767 const bluestore_blob_t& b = e.blob->get_blob();
10768 SharedBlob *sb = e.blob->shared_blob.get();
10769 if (b.is_shared() &&
10770 std::find(unshared_blobs.begin(), unshared_blobs.end(),
10771 sb) != unshared_blobs.end()) {
10772 dout(20) << __func__ << " unsharing " << e << dendl;
10773 bluestore_blob_t& blob = e.blob->dirty_blob();
10774 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 10775 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
10776 }
10777 }
224ce89b
WB
10778 txc->write_onode(h);
10779
7c673cae
FG
10780 return 0;
10781}
10782
10783int BlueStore::_remove(TransContext *txc,
10784 CollectionRef& c,
10785 OnodeRef &o)
10786{
10787 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10788 int r = _do_remove(txc, c, o);
10789 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10790 return r;
10791}
10792
10793int BlueStore::_setattr(TransContext *txc,
10794 CollectionRef& c,
10795 OnodeRef& o,
10796 const string& name,
10797 bufferptr& val)
10798{
10799 dout(15) << __func__ << " " << c->cid << " " << o->oid
10800 << " " << name << " (" << val.length() << " bytes)"
10801 << dendl;
10802 int r = 0;
3efd9988
FG
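  // A partial bufferptr references only a slice of a larger raw buffer, so
  // store a compact copy rather than keeping the whole backing buffer
  // alive; either way the attr is moved into the bluestore cache mempool.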
10803 if (val.is_partial()) {
10804 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
10805 val.length());
10806 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10807 } else {
10808 auto& b = o->onode.attrs[name.c_str()] = val;
10809 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10810 }
7c673cae
FG
10811 txc->write_onode(o);
10812 dout(10) << __func__ << " " << c->cid << " " << o->oid
10813 << " " << name << " (" << val.length() << " bytes)"
10814 << " = " << r << dendl;
10815 return r;
10816}
10817
10818int BlueStore::_setattrs(TransContext *txc,
10819 CollectionRef& c,
10820 OnodeRef& o,
10821 const map<string,bufferptr>& aset)
10822{
10823 dout(15) << __func__ << " " << c->cid << " " << o->oid
10824 << " " << aset.size() << " keys"
10825 << dendl;
10826 int r = 0;
10827 for (map<string,bufferptr>::const_iterator p = aset.begin();
10828 p != aset.end(); ++p) {
3efd9988
FG
10829 if (p->second.is_partial()) {
10830 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 10831 bufferptr(p->second.c_str(), p->second.length());
3efd9988
FG
10832 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10833 } else {
10834 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
10835 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10836 }
7c673cae
FG
10837 }
10838 txc->write_onode(o);
10839 dout(10) << __func__ << " " << c->cid << " " << o->oid
10840 << " " << aset.size() << " keys"
10841 << " = " << r << dendl;
10842 return r;
10843}
10844
10845
10846int BlueStore::_rmattr(TransContext *txc,
10847 CollectionRef& c,
10848 OnodeRef& o,
10849 const string& name)
10850{
10851 dout(15) << __func__ << " " << c->cid << " " << o->oid
10852 << " " << name << dendl;
10853 int r = 0;
10854 auto it = o->onode.attrs.find(name.c_str());
10855 if (it == o->onode.attrs.end())
10856 goto out;
10857
10858 o->onode.attrs.erase(it);
10859 txc->write_onode(o);
10860
10861 out:
10862 dout(10) << __func__ << " " << c->cid << " " << o->oid
10863 << " " << name << " = " << r << dendl;
10864 return r;
10865}
10866
10867int BlueStore::_rmattrs(TransContext *txc,
10868 CollectionRef& c,
10869 OnodeRef& o)
10870{
10871 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10872 int r = 0;
10873
10874 if (o->onode.attrs.empty())
10875 goto out;
10876
10877 o->onode.attrs.clear();
10878 txc->write_onode(o);
10879
10880 out:
10881 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10882 return r;
10883}
10884
10885void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
10886{
10887 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10888 string prefix, tail;
10889 get_omap_header(id, &prefix);
10890 get_omap_tail(id, &tail);
10891 it->lower_bound(prefix);
10892 while (it->valid()) {
10893 if (it->key() >= tail) {
10894 dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
10895 << dendl;
10896 break;
10897 }
10898 txc->t->rmkey(PREFIX_OMAP, it->key());
10899 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10900 it->next();
10901 }
10902}
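// Sketch of the omap key layout assumed above (the authoritative encoding
// lives in the get_omap_header/get_omap_key/get_omap_tail helpers): every
// key belonging to an object is prefixed with that object's 64-bit nid, so
//
//   header key : <encoded nid> + separator        (lowest key for the object)
//   data keys  : <encoded nid> + '.' + user key   (see _omap_setkeys below)
//   tail key   : <encoded nid> + separator        (exclusive upper bound)
//
// and _do_omap_clear simply deletes everything in [header, tail), issuing one
// rmkey per entry into the current transaction.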
10903
10904int BlueStore::_omap_clear(TransContext *txc,
10905 CollectionRef& c,
10906 OnodeRef& o)
10907{
10908 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10909 int r = 0;
10910 if (o->onode.has_omap()) {
10911 o->flush();
10912 _do_omap_clear(txc, o->onode.nid);
10913 o->onode.clear_omap_flag();
10914 txc->write_onode(o);
10915 }
10916 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10917 return r;
10918}
10919
10920int BlueStore::_omap_setkeys(TransContext *txc,
10921 CollectionRef& c,
10922 OnodeRef& o,
10923 bufferlist &bl)
10924{
10925 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10926 int r;
10927 bufferlist::iterator p = bl.begin();
10928 __u32 num;
10929 if (!o->onode.has_omap()) {
10930 o->onode.set_omap_flag();
10931 txc->write_onode(o);
10932 } else {
10933 txc->note_modified_object(o);
10934 }
10935 string final_key;
10936 _key_encode_u64(o->onode.nid, &final_key);
10937 final_key.push_back('.');
10938 ::decode(num, p);
10939 while (num--) {
10940 string key;
10941 bufferlist value;
10942 ::decode(key, p);
10943 ::decode(value, p);
10944 final_key.resize(9); // keep prefix
10945 final_key += key;
10946 dout(30) << __func__ << " " << pretty_binary_string(final_key)
10947 << " <- " << key << dendl;
10948 txc->t->set(PREFIX_OMAP, final_key, value);
10949 }
10950 r = 0;
10951 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10952 return r;
10953}
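// Illustrative example of the key construction above (values are made up):
// for nid == 0x12 and user key "foo", final_key is the 8-byte encoding of
// 0x12 (big-endian, so keys sort by nid), then '.', then "foo" -- a 9-byte
// per-object prefix plus the user key.  final_key.resize(9) keeps only that
// prefix between loop iterations so each user key is appended to a clean
// prefix.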
10954
10955int BlueStore::_omap_setheader(TransContext *txc,
10956 CollectionRef& c,
10957 OnodeRef &o,
10958 bufferlist& bl)
10959{
10960 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10961 int r;
10962 string key;
10963 if (!o->onode.has_omap()) {
10964 o->onode.set_omap_flag();
10965 txc->write_onode(o);
10966 } else {
10967 txc->note_modified_object(o);
10968 }
10969 get_omap_header(o->onode.nid, &key);
10970 txc->t->set(PREFIX_OMAP, key, bl);
10971 r = 0;
10972 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10973 return r;
10974}
10975
10976int BlueStore::_omap_rmkeys(TransContext *txc,
10977 CollectionRef& c,
10978 OnodeRef& o,
10979 bufferlist& bl)
10980{
10981 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10982 int r = 0;
10983 bufferlist::iterator p = bl.begin();
10984 __u32 num;
10985 string final_key;
10986
10987 if (!o->onode.has_omap()) {
10988 goto out;
10989 }
10990 _key_encode_u64(o->onode.nid, &final_key);
10991 final_key.push_back('.');
10992 ::decode(num, p);
10993 while (num--) {
10994 string key;
10995 ::decode(key, p);
10996 final_key.resize(9); // keep prefix
10997 final_key += key;
10998 dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
10999 << " <- " << key << dendl;
11000 txc->t->rmkey(PREFIX_OMAP, final_key);
11001 }
11002 txc->note_modified_object(o);
11003
11004 out:
11005 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11006 return r;
11007}
11008
11009int BlueStore::_omap_rmkey_range(TransContext *txc,
11010 CollectionRef& c,
11011 OnodeRef& o,
11012 const string& first, const string& last)
11013{
11014 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11015 KeyValueDB::Iterator it;
11016 string key_first, key_last;
11017 int r = 0;
11018 if (!o->onode.has_omap()) {
11019 goto out;
11020 }
11021 o->flush();
11022 it = db->get_iterator(PREFIX_OMAP);
11023 get_omap_key(o->onode.nid, first, &key_first);
11024 get_omap_key(o->onode.nid, last, &key_last);
11025 it->lower_bound(key_first);
11026 while (it->valid()) {
11027 if (it->key() >= key_last) {
11028 dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
11029 << dendl;
11030 break;
11031 }
11032 txc->t->rmkey(PREFIX_OMAP, it->key());
11033 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
11034 it->next();
11035 }
11036 txc->note_modified_object(o);
11037
11038 out:
11039 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11040 return r;
11041}
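// The range removed above is half-open: [get_omap_key(first),
// get_omap_key(last)), i.e. 'last' itself is not removed.  o->flush() is
// called first so that in-flight transactions touching this onode have
// committed before we iterate the db to enumerate the keys to delete.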
11042
11043int BlueStore::_set_alloc_hint(
11044 TransContext *txc,
11045 CollectionRef& c,
11046 OnodeRef& o,
11047 uint64_t expected_object_size,
11048 uint64_t expected_write_size,
11049 uint32_t flags)
11050{
11051 dout(15) << __func__ << " " << c->cid << " " << o->oid
11052 << " object_size " << expected_object_size
11053 << " write_size " << expected_write_size
11054 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11055 << dendl;
11056 int r = 0;
11057 o->onode.expected_object_size = expected_object_size;
11058 o->onode.expected_write_size = expected_write_size;
11059 o->onode.alloc_hint_flags = flags;
11060 txc->write_onode(o);
11061 dout(10) << __func__ << " " << c->cid << " " << o->oid
11062 << " object_size " << expected_object_size
11063 << " write_size " << expected_write_size
11064 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11065 << " = " << r << dendl;
11066 return r;
11067}
11068
11069int BlueStore::_clone(TransContext *txc,
11070 CollectionRef& c,
11071 OnodeRef& oldo,
11072 OnodeRef& newo)
11073{
11074 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11075 << newo->oid << dendl;
11076 int r = 0;
11077 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
11078 derr << __func__ << " mismatched hash on " << oldo->oid
11079 << " and " << newo->oid << dendl;
11080 return -EINVAL;
11081 }
11082
7c673cae
FG
11083 _assign_nid(txc, newo);
11084
11085 // clone data
11086 oldo->flush();
11087 _do_truncate(txc, c, newo, 0);
11088 if (cct->_conf->bluestore_clone_cow) {
11089 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
11090 } else {
11091 bufferlist bl;
11092 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
11093 if (r < 0)
11094 goto out;
11095 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
11096 if (r < 0)
11097 goto out;
11098 }
11099
11100 // clone attrs
11101 newo->onode.attrs = oldo->onode.attrs;
11102
11103 // clone omap
11104 if (newo->onode.has_omap()) {
11105 dout(20) << __func__ << " clearing old omap data" << dendl;
11106 newo->flush();
11107 _do_omap_clear(txc, newo->onode.nid);
11108 }
11109 if (oldo->onode.has_omap()) {
11110 dout(20) << __func__ << " copying omap data" << dendl;
11111 if (!newo->onode.has_omap()) {
11112 newo->onode.set_omap_flag();
11113 }
11114 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
11115 string head, tail;
11116 get_omap_header(oldo->onode.nid, &head);
11117 get_omap_tail(oldo->onode.nid, &tail);
11118 it->lower_bound(head);
11119 while (it->valid()) {
11120 if (it->key() >= tail) {
11121 dout(30) << __func__ << " reached tail" << dendl;
11122 break;
11123 } else {
11124 dout(30) << __func__ << " got header/data "
11125 << pretty_binary_string(it->key()) << dendl;
11126 string key;
11127 rewrite_omap_key(newo->onode.nid, it->key(), &key);
11128 txc->t->set(PREFIX_OMAP, key, it->value());
11129 }
11130 it->next();
11131 }
11132 } else {
11133 newo->onode.clear_omap_flag();
11134 }
11135
11136 txc->write_onode(newo);
11137 r = 0;
11138
11139 out:
11140 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11141 << newo->oid << " = " << r << dendl;
11142 return r;
11143}
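// Rough sketch of what _clone does (see the code above for the details):
// newo is truncated to 0, then either
//   - bluestore_clone_cow: _do_clone_range() shares the source blobs and
//     duplicates the extent map into newo (copy-on-write), or
//   - otherwise: the whole object is read and rewritten via _do_write().
// xattrs are copied wholesale, and every omap key of oldo is re-keyed from
// oldo's nid to newo's nid via rewrite_omap_key() and written back.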
11144
11145int BlueStore::_do_clone_range(
11146 TransContext *txc,
11147 CollectionRef& c,
11148 OnodeRef& oldo,
11149 OnodeRef& newo,
224ce89b
WB
11150 uint64_t srcoff,
11151 uint64_t length,
11152 uint64_t dstoff)
7c673cae
FG
11153{
11154 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11155 << newo->oid
11156 << " 0x" << std::hex << srcoff << "~" << length << " -> "
11157 << " 0x" << dstoff << "~" << length << std::dec << dendl;
11158 oldo->extent_map.fault_range(db, srcoff, length);
11159 newo->extent_map.fault_range(db, dstoff, length);
11160 _dump_onode(oldo);
11161 _dump_onode(newo);
11162
11163 // hmm, this could go into an ExtentMap::dup() method.
11164 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
11165 for (auto &e : oldo->extent_map.extent_map) {
11166 e.blob->last_encoded_id = -1;
11167 }
11168 int n = 0;
7c673cae 11169 uint64_t end = srcoff + length;
224ce89b
WB
11170 uint32_t dirty_range_begin = 0;
11171 uint32_t dirty_range_end = 0;
35e4c445 11172 bool src_dirty = false;
7c673cae
FG
11173 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
11174 ep != oldo->extent_map.extent_map.end();
11175 ++ep) {
11176 auto& e = *ep;
11177 if (e.logical_offset >= end) {
11178 break;
11179 }
11180 dout(20) << __func__ << " src " << e << dendl;
11181 BlobRef cb;
11182 bool blob_duped = true;
11183 if (e.blob->last_encoded_id >= 0) {
11184 // blob is already duped
11185 cb = id_to_blob[e.blob->last_encoded_id];
11186 blob_duped = false;
11187 } else {
11188 // dup the blob
11189 const bluestore_blob_t& blob = e.blob->get_blob();
11190 // make sure it is shared
11191 if (!blob.is_shared()) {
11192 c->make_blob_shared(_assign_blobid(txc), e.blob);
35e4c445
FG
11193 if (!src_dirty) {
11194 src_dirty = true;
224ce89b
WB
11195 dirty_range_begin = e.logical_offset;
11196 }
11197 assert(e.logical_end() > 0);
11198 // -1 to exclude next potential shard
11199 dirty_range_end = e.logical_end() - 1;
7c673cae
FG
11200 } else {
11201 c->load_shared_blob(e.blob->shared_blob);
11202 }
11203 cb = new Blob();
11204 e.blob->last_encoded_id = n;
11205 id_to_blob[n] = cb;
11206 e.blob->dup(*cb);
11207 // bump the extent refs on the copied blob's extents
11208 for (auto p : blob.get_extents()) {
11209 if (p.is_valid()) {
11210 e.blob->shared_blob->get_ref(p.offset, p.length);
11211 }
11212 }
11213 txc->write_shared_blob(e.blob->shared_blob);
11214 dout(20) << __func__ << " new " << *cb << dendl;
11215 }
11216 // dup extent
11217 int skip_front, skip_back;
11218 if (e.logical_offset < srcoff) {
11219 skip_front = srcoff - e.logical_offset;
11220 } else {
11221 skip_front = 0;
11222 }
11223 if (e.logical_end() > end) {
11224 skip_back = e.logical_end() - end;
11225 } else {
11226 skip_back = 0;
11227 }
11228 Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
11229 e.blob_offset + skip_front,
11230 e.length - skip_front - skip_back, cb);
11231 newo->extent_map.extent_map.insert(*ne);
11232 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
11233 // fixme: we may leave parts of new blob unreferenced that could
11234 // be freed (relative to the shared_blob).
11235 txc->statfs_delta.stored() += ne->length;
11236 if (e.blob->get_blob().is_compressed()) {
11237 txc->statfs_delta.compressed_original() += ne->length;
11238 if (blob_duped){
11239 txc->statfs_delta.compressed() +=
11240 cb->get_blob().get_compressed_payload_length();
11241 }
11242 }
11243 dout(20) << __func__ << " dst " << *ne << dendl;
11244 ++n;
11245 }
35e4c445 11246 if (src_dirty) {
224ce89b
WB
11247 oldo->extent_map.dirty_range(dirty_range_begin,
11248 dirty_range_end - dirty_range_begin);
7c673cae
FG
11249 txc->write_onode(oldo);
11250 }
11251 txc->write_onode(newo);
11252
11253 if (dstoff + length > newo->onode.size) {
11254 newo->onode.size = dstoff + length;
11255 }
31f18b77 11256 newo->extent_map.dirty_range(dstoff, length);
7c673cae
FG
11257 _dump_onode(oldo);
11258 _dump_onode(newo);
11259 return 0;
11260}
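// Blob bookkeeping in the loop above, in short: a source blob that is not yet
// shared gets a shared-blob id (_assign_blobid + make_blob_shared) and its
// logical range is remembered so the source extent map shards can be marked
// dirty; the physical extents of each duplicated blob take an extra reference
// in the SharedBlob ref_map so the space is not released while either object
// still points at it.  Only the overlapping part of each source extent
// (trimmed by skip_front/skip_back) is inserted into the destination map.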
11261
11262int BlueStore::_clone_range(TransContext *txc,
11263 CollectionRef& c,
11264 OnodeRef& oldo,
11265 OnodeRef& newo,
11266 uint64_t srcoff, uint64_t length, uint64_t dstoff)
11267{
11268 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11269 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11270 << " to offset 0x" << dstoff << std::dec << dendl;
11271 int r = 0;
11272
35e4c445
FG
11273 if (srcoff + length >= OBJECT_MAX_SIZE ||
11274 dstoff + length >= OBJECT_MAX_SIZE) {
11275 r = -E2BIG;
11276 goto out;
11277 }
7c673cae
FG
11278 if (srcoff + length > oldo->onode.size) {
11279 r = -EINVAL;
11280 goto out;
11281 }
11282
7c673cae
FG
11283 _assign_nid(txc, newo);
11284
11285 if (length > 0) {
11286 if (cct->_conf->bluestore_clone_cow) {
11287 _do_zero(txc, c, newo, dstoff, length);
11288 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
11289 } else {
11290 bufferlist bl;
11291 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
11292 if (r < 0)
11293 goto out;
11294 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
11295 if (r < 0)
11296 goto out;
11297 }
11298 }
11299
11300 txc->write_onode(newo);
11301 r = 0;
11302
11303 out:
11304 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11305 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11306 << " to offset 0x" << dstoff << std::dec
11307 << " = " << r << dendl;
11308 return r;
11309}
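// In the copy-on-write path above the destination range is zeroed first
// (_do_zero) so that any pre-existing extents of newo in
// [dstoff, dstoff+length) are punched out before _do_clone_range() inserts
// the cloned extents; otherwise stale extents could overlap the new ones.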
11310
11311int BlueStore::_rename(TransContext *txc,
11312 CollectionRef& c,
11313 OnodeRef& oldo,
11314 OnodeRef& newo,
11315 const ghobject_t& new_oid)
11316{
11317 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11318 << new_oid << dendl;
11319 int r;
11320 ghobject_t old_oid = oldo->oid;
31f18b77 11321 mempool::bluestore_cache_other::string new_okey;
7c673cae
FG
11322
11323 if (newo) {
11324 if (newo->exists) {
11325 r = -EEXIST;
11326 goto out;
11327 }
11328 assert(txc->onodes.count(newo) == 0);
11329 }
11330
11331 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
11332
11333 // rewrite shards
11334 {
11335 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
11336 get_object_key(cct, new_oid, &new_okey);
11337 string key;
11338 for (auto &s : oldo->extent_map.shards) {
11339 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
11340 [&](const string& final_key) {
11341 txc->t->rmkey(PREFIX_OBJ, final_key);
11342 }
11343 );
11344 s.dirty = true;
11345 }
11346 }
11347
11348 newo = oldo;
11349 txc->write_onode(newo);
11350
11351 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
11352 // Onode in the old slot
11353 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
11354 r = 0;
11355
11356 out:
11357 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
11358 << new_oid << " = " << r << dendl;
11359 return r;
11360}
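// Rename is purely a key rewrite: the old onode key and its extent-shard keys
// are removed from PREFIX_OBJ, every shard is marked dirty so write_onode()
// re-emits them under the new key, and onode_map.rename() re-keys the cached
// Onode (leaving a fresh empty Onode behind in the old slot).  No object data
// is moved.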
11361
11362// collections
11363
11364int BlueStore::_create_collection(
11365 TransContext *txc,
11366 const coll_t &cid,
11367 unsigned bits,
11368 CollectionRef *c)
11369{
11370 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
11371 int r;
11372 bufferlist bl;
11373
11374 {
11375 RWLock::WLocker l(coll_lock);
11376 if (*c) {
11377 r = -EEXIST;
11378 goto out;
11379 }
11380 c->reset(
11381 new Collection(
11382 this,
11383 cache_shards[cid.hash_to_shard(cache_shards.size())],
11384 cid));
11385 (*c)->cnode.bits = bits;
11386 coll_map[cid] = *c;
11387 }
11388 ::encode((*c)->cnode, bl);
11389 txc->t->set(PREFIX_COLL, stringify(cid), bl);
11390 r = 0;
11391
11392 out:
11393 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
11394 return r;
11395}
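// The collection record itself is just the encoded cnode_t stored under
// PREFIX_COLL with the stringified coll_t as the key; the in-memory
// Collection is placed on a cache shard chosen by hashing the cid, so
// collections spread across the cache shards.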
11396
11397int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
11398 CollectionRef *c)
11399{
11400 dout(15) << __func__ << " " << cid << dendl;
11401 int r;
11402
11403 {
11404 RWLock::WLocker l(coll_lock);
11405 if (!*c) {
11406 r = -ENOENT;
11407 goto out;
11408 }
11409 size_t nonexistent_count = 0;
11410 assert((*c)->exists);
11411 if ((*c)->onode_map.map_any([&](OnodeRef o) {
11412 if (o->exists) {
11413 dout(10) << __func__ << " " << o->oid << " " << o
11414 << " exists in onode_map" << dendl;
11415 return true;
11416 }
11417 ++nonexistent_count;
11418 return false;
11419 })) {
11420 r = -ENOTEMPTY;
11421 goto out;
11422 }
11423
11424 vector<ghobject_t> ls;
11425 ghobject_t next;
11426 // Enumerate onodes in the db, up to nonexistent_count + 1,
11427 // then check whether all of them are marked as non-existent.
11428 // Bypass the check if the returned number is greater than nonexistent_count.
11429 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
11430 nonexistent_count + 1, &ls, &next);
11431 if (r >= 0) {
11432 bool exists = false; //ls.size() > nonexistent_count;
11433 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
11434 dout(10) << __func__ << " oid " << *it << dendl;
11435 auto onode = (*c)->onode_map.lookup(*it);
11436 exists = !onode || onode->exists;
11437 if (exists) {
11438 dout(10) << __func__ << " " << *it
11439 << " exists in db" << dendl;
11440 }
11441 }
11442 if (!exists) {
11443 coll_map.erase(cid);
11444 txc->removed_collections.push_back(*c);
11445 (*c)->exists = false;
11446 c->reset();
11447 txc->t->rmkey(PREFIX_COLL, stringify(cid));
11448 r = 0;
11449 } else {
11450 dout(10) << __func__ << " " << cid
11451 << " is non-empty" << dendl;
11452 r = -ENOTEMPTY;
11453 }
11454 }
11455 }
11456
11457 out:
11458 dout(10) << __func__ << " " << cid << " = " << r << dendl;
11459 return r;
11460}
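// The emptiness check above is two-phase: any Onode in the in-memory
// onode_map that still exists fails the removal outright (while the cached
// non-existent ones are counted), then _collection_list() enumerates at most
// nonexistent_count + 1 objects from the db, and the collection is removed
// only if every listed object maps to a cached onode known not to exist.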
11461
11462int BlueStore::_split_collection(TransContext *txc,
11463 CollectionRef& c,
11464 CollectionRef& d,
11465 unsigned bits, int rem)
11466{
11467 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
11468 << " bits " << bits << dendl;
11469 RWLock::WLocker l(c->lock);
11470 RWLock::WLocker l2(d->lock);
11471 int r;
11472
11473 // flush all previous deferred writes on this sequencer. this is a bit
11474 // heavyweight, but we need to make sure all deferred writes complete
11475 // before we split as the new collection's sequencer may need to order
11476 // this after those writes, and we don't bother with the complexity of
11477 // moving those TransContexts over to the new osr.
11478 _osr_drain_preceding(txc);
11479
11480 // move any cached items (onodes and referenced shared blobs) that will
11481 // belong to the child collection post-split. leave everything else behind.
11482 // this may include things that don't strictly belong to the now-smaller
11483 // parent split, but the OSD will always send us a split for every new
11484 // child.
11485
11486 spg_t pgid, dest_pgid;
11487 bool is_pg = c->cid.is_pg(&pgid);
11488 assert(is_pg);
11489 is_pg = d->cid.is_pg(&dest_pgid);
11490 assert(is_pg);
11491
11492 // the destination should initially be empty.
11493 assert(d->onode_map.empty());
11494 assert(d->shared_blob_set.empty());
11495 assert(d->cnode.bits == bits);
11496
11497 c->split_cache(d.get());
11498
11499 // adjust bits. note that this will be redundant for all but the first
11500 // split call for this parent (first child).
11501 c->cnode.bits = bits;
11502 assert(d->cnode.bits == bits);
11503 r = 0;
11504
11505 bufferlist bl;
11506 ::encode(c->cnode, bl);
11507 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
11508
11509 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
11510 << " bits " << bits << " = " << r << dendl;
11511 return r;
11512}
11513
11514// DB key value Histogram
11515#define KEY_SLAB 32
11516#define VALUE_SLAB 64
11517
11518const string prefix_onode = "o";
11519const string prefix_onode_shard = "x";
11520const string prefix_other = "Z";
11521
11522int BlueStore::DBHistogram::get_key_slab(size_t sz)
11523{
11524 return (sz/KEY_SLAB);
11525}
11526
11527string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
11528{
11529 int lower_bound = slab * KEY_SLAB;
11530 int upper_bound = (slab + 1) * KEY_SLAB;
11531 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11532 return ret;
11533}
11534
11535int BlueStore::DBHistogram::get_value_slab(size_t sz)
11536{
11537 return (sz/VALUE_SLAB);
11538}
11539
11540string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
11541{
11542 int lower_bound = slab * VALUE_SLAB;
11543 int upper_bound = (slab + 1) * VALUE_SLAB;
11544 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11545 return ret;
11546}
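// Worked example of the slab arithmetic above: with KEY_SLAB == 32, a 70-byte
// key lands in slab 70/32 == 2, reported as the range "[64,96)"; with
// VALUE_SLAB == 64, a 100-byte value lands in slab 1, reported as "[64,128)".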
11547
11548void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
11549 const string &prefix, size_t key_size, size_t value_size)
11550{
11551 uint32_t key_slab = get_key_slab(key_size);
11552 uint32_t value_slab = get_value_slab(value_size);
11553 key_hist[prefix][key_slab].count++;
11554 key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len);
11555 key_hist[prefix][key_slab].val_map[value_slab].count++;
11556 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11557 MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
11558}
11559
11560void BlueStore::DBHistogram::dump(Formatter *f)
11561{
11562 f->open_object_section("rocksdb_value_distribution");
11563 for (auto i : value_hist) {
11564 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
11565 }
11566 f->close_section();
11567
11568 f->open_object_section("rocksdb_key_value_histogram");
11569 for (auto i : key_hist) {
11570 f->dump_string("prefix", i.first);
11571 f->open_object_section("key_hist");
11572 for (auto k : i.second) {
11573 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
11574 f->dump_unsigned("max_len", k.second.max_len);
11575 f->open_object_section("value_hist");
11576 for (auto j : k.second.val_map) {
11577 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
11578 f->dump_unsigned("max_len", j.second.max_len);
11579 }
11580 f->close_section();
11581 }
11582 f->close_section();
11583 }
11584 f->close_section();
11585}
11586
11587 //Iterates through the db and collects the stats
11588void BlueStore::generate_db_histogram(Formatter *f)
11589{
11590 //globals
11591 uint64_t num_onodes = 0;
11592 uint64_t num_shards = 0;
11593 uint64_t num_super = 0;
11594 uint64_t num_coll = 0;
11595 uint64_t num_omap = 0;
11596 uint64_t num_deferred = 0;
11597 uint64_t num_alloc = 0;
11598 uint64_t num_stat = 0;
11599 uint64_t num_others = 0;
11600 uint64_t num_shared_shards = 0;
11601 size_t max_key_size = 0, max_value_size = 0;
11602 uint64_t total_key_size = 0, total_value_size = 0;
11603 size_t key_size = 0, value_size = 0;
11604 DBHistogram hist;
11605
11606 utime_t start = ceph_clock_now();
11607
11608 KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
11609 iter->seek_to_first();
11610 while (iter->valid()) {
11611 dout(30) << __func__ << " Key: " << iter->key() << dendl;
11612 key_size = iter->key_size();
11613 value_size = iter->value_size();
11614 hist.value_hist[hist.get_value_slab(value_size)]++;
11615 max_key_size = MAX(max_key_size, key_size);
11616 max_value_size = MAX(max_value_size, value_size);
11617 total_key_size += key_size;
11618 total_value_size += value_size;
11619
11620 pair<string,string> key(iter->raw_key());
11621
11622 if (key.first == PREFIX_SUPER) {
11623 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
11624 num_super++;
11625 } else if (key.first == PREFIX_STAT) {
11626 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
11627 num_stat++;
11628 } else if (key.first == PREFIX_COLL) {
11629 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
11630 num_coll++;
11631 } else if (key.first == PREFIX_OBJ) {
11632 if (key.second.back() == ONODE_KEY_SUFFIX) {
11633 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
11634 num_onodes++;
11635 } else {
11636 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
11637 num_shards++;
11638 }
11639 } else if (key.first == PREFIX_OMAP) {
11640 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
11641 num_omap++;
11642 } else if (key.first == PREFIX_DEFERRED) {
11643 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
11644 num_deferred++;
11645 } else if (key.first == PREFIX_ALLOC || key.first == "b" ) {
11646 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
11647 num_alloc++;
11648 } else if (key.first == PREFIX_SHARED_BLOB) {
11649 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
11650 num_shared_shards++;
11651 } else {
11652 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
11653 num_others++;
11654 }
11655 iter->next();
11656 }
11657
11658 utime_t duration = ceph_clock_now() - start;
11659 f->open_object_section("rocksdb_key_value_stats");
11660 f->dump_unsigned("num_onodes", num_onodes);
11661 f->dump_unsigned("num_shards", num_shards);
11662 f->dump_unsigned("num_super", num_super);
11663 f->dump_unsigned("num_coll", num_coll);
11664 f->dump_unsigned("num_omap", num_omap);
11665 f->dump_unsigned("num_deferred", num_deferred);
11666 f->dump_unsigned("num_alloc", num_alloc);
11667 f->dump_unsigned("num_stat", num_stat);
11668 f->dump_unsigned("num_shared_shards", num_shared_shards);
11669 f->dump_unsigned("num_others", num_others);
11670 f->dump_unsigned("max_key_size", max_key_size);
11671 f->dump_unsigned("max_value_size", max_value_size);
11672 f->dump_unsigned("total_key_size", total_key_size);
11673 f->dump_unsigned("total_value_size", total_value_size);
11674 f->close_section();
11675
11676 hist.dump(f);
11677
11678 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
11679
11680}
11681
31f18b77 11682void BlueStore::_flush_cache()
7c673cae
FG
11683{
11684 dout(10) << __func__ << dendl;
11685 for (auto i : cache_shards) {
11686 i->trim_all();
31f18b77 11687 assert(i->empty());
7c673cae
FG
11688 }
11689 for (auto& p : coll_map) {
3efd9988
FG
11690 if (!p.second->onode_map.empty()) {
11691 derr << __func__ << " stray onodes on " << p.first << dendl;
11692 p.second->onode_map.dump(cct, 0);
11693 }
11694 if (!p.second->shared_blob_set.empty()) {
11695 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11696 p.second->shared_blob_set.dump(cct, 0);
11697 }
7c673cae
FG
11698 assert(p.second->onode_map.empty());
11699 assert(p.second->shared_blob_set.empty());
11700 }
11701 coll_map.clear();
11702}
11703
31f18b77
FG
11704 // For external callers.
11705 // We use a best-effort policy here, i.e.,
11706 // we don't care if there are still some pinned onodes/data in the cache
11707 // after this command is completed.
11708void BlueStore::flush_cache()
11709{
11710 dout(10) << __func__ << dendl;
11711 for (auto i : cache_shards) {
11712 i->trim_all();
11713 }
11714}
11715
7c673cae
FG
11716void BlueStore::_apply_padding(uint64_t head_pad,
11717 uint64_t tail_pad,
7c673cae
FG
11718 bufferlist& padded)
11719{
7c673cae 11720 if (head_pad) {
224ce89b 11721 padded.prepend_zero(head_pad);
7c673cae
FG
11722 }
11723 if (tail_pad) {
11724 padded.append_zero(tail_pad);
11725 }
11726 if (head_pad || tail_pad) {
11727 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
11728 << " tail 0x" << tail_pad << std::dec << dendl;
11729 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
11730 }
11731}
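// Example with made-up numbers: to turn a 0x800-byte payload that starts
// 0x200 bytes into a 0x1000 block into a block-aligned write, the caller
// passes head_pad = 0x200 and tail_pad = 0x600; the zeros are
// prepended/appended here and the 0x800 total padding bytes are counted in
// l_bluestore_write_pad_bytes.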
11732
11733// ===========================================