1// vim: ts=8 sw=2 smarttab
2/*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2014 Red Hat
6 *
7 * This is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License version 2.1, as published by the Free Software
10 * Foundation. See file COPYING.
11 *
12 */
13
14#include <unistd.h>
15#include <stdlib.h>
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <fcntl.h>
19
20#include "include/cpp-btree/btree_set.h"
21
22#include "BlueStore.h"
23#include "os/kv.h"
24#include "include/compat.h"
25#include "include/intarith.h"
26#include "include/stringify.h"
27#include "common/errno.h"
28#include "common/safe_io.h"
29#include "Allocator.h"
30#include "FreelistManager.h"
31#include "BlueFS.h"
32#include "BlueRocksEnv.h"
33#include "auth/Crypto.h"
34#include "common/EventTrace.h"
35
36#define dout_context cct
37#define dout_subsys ceph_subsys_bluestore
38
39using bid_t = decltype(BlueStore::Blob::id);
40
41// bluestore_cache_onode
42MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
43 bluestore_cache_onode);
44
45// bluestore_cache_other
46MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
47 bluestore_cache_other);
48MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
49 bluestore_cache_other);
50MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
51 bluestore_cache_other);
52MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
53 bluestore_cache_other);
54
55// bluestore_txc
56MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
57 bluestore_txc);
58
59
60// kv store prefixes
61const string PREFIX_SUPER = "S"; // field -> value
62const string PREFIX_STAT = "T"; // field -> value(int64 array)
63const string PREFIX_COLL = "C"; // collection name -> cnode_t
64const string PREFIX_OBJ = "O"; // object name -> onode_t
65const string PREFIX_OMAP = "M"; // u64 + keyname -> value
66const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
67const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
68const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
69
70// write a label in the first block. always use this size. note that
71// bluefs makes a matching assumption about the location of its
72// superblock (always the second block of the device).
73#define BDEV_LABEL_BLOCK_SIZE 4096
74
75// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
76#define SUPER_RESERVED 8192
77
78#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
79
80
81/*
82 * extent map blob encoding
83 *
84 * we use the low bits of the blobid field to indicate some common scenarios
85 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
86 */
87#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
88#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
89#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
90#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
91#define BLOBID_SHIFT_BITS 4
92
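// Illustrative sketch (an assumption about how ExtentMap::{encode,decode}_some()
// consume these bits, not a definition used elsewhere): the spanning blob id,
// if any, lives above BLOBID_SHIFT_BITS and the flags occupy the low bits, e.g.
//
//   uint64_t packed  = (spanning_id << BLOBID_SHIFT_BITS)
//                    | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_SPANNING;
//   uint64_t id_back = packed >> BLOBID_SHIFT_BITS;
//   bool contiguous  = packed & BLOBID_FLAG_CONTIGUOUS;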
93/*
94 * object name key structure
95 *
96 * encoded u8: shard + 2^7 (so that it sorts properly)
97 * encoded u64: poolid + 2^63 (so that it sorts properly)
98 * encoded u32: hash (bit reversed)
99 *
100 * escaped string: namespace
101 *
102 * escaped string: key or object name
103 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
104 * we are done. otherwise, we are followed by the object name.
105 * escaped string: object name (unless '=' above)
106 *
107 * encoded u64: snap
108 * encoded u64: generation
109 * 'o'
110 */
111#define ONODE_KEY_SUFFIX 'o'
112
113/*
114 * extent shard key
115 *
116 * object prefix key
117 * u32
118 * 'x'
119 */
120#define EXTENT_SHARD_KEY_SUFFIX 'x'
121
122/*
123 * string encoding in the key
124 *
125 * The key string needs to lexicographically sort the same way that
126 * ghobject_t does. We do this by escaping anything <= to '#' with #
127 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
128 * hex digits.
129 *
130 * We use ! as a terminator for strings; this works because it is < #
131 * and will get escaped if it is present in the string.
132 *
133 */
134template<typename S>
135static void append_escaped(const string &in, S *out)
136{
137 char hexbyte[in.length() * 3 + 1];
138 char* ptr = &hexbyte[0];
139 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
140 if (*i <= '#') {
141 *ptr++ = '#';
142 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
143 *ptr++ = "0123456789abcdef"[*i & 0x0f];
144 } else if (*i >= '~') {
145 *ptr++ = '~';
146 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
147 *ptr++ = "0123456789abcdef"[*i & 0x0f];
148 } else {
149 *ptr++ = *i;
150 }
151 }
152 *ptr++ = '!';
153 out->append(hexbyte, ptr - &hexbyte[0]);
154}
155
156inline unsigned h2i(char c)
157{
158 if ((c >= '0') && (c <= '9')) {
159 return c - 0x30;
160 } else if ((c >= 'a') && (c <= 'f')) {
161 return c - 'a' + 10;
162 } else if ((c >= 'A') && (c <= 'F')) {
163 return c - 'A' + 10;
164 } else {
165 return 256; // make it always larger than 255
166 }
167}
168
169static int decode_escaped(const char *p, string *out)
170{
171 char buff[256];
172 char* ptr = &buff[0];
173 char* max = &buff[252];
174 const char *orig_p = p;
175 while (*p && *p != '!') {
176 if (*p == '#' || *p == '~') {
177 unsigned hex = 0;
178 p++;
179 hex = h2i(*p++) << 4;
180 if (hex > 255) {
181 return -EINVAL;
182 }
183 hex |= h2i(*p++);
184 if (hex > 255) {
185 return -EINVAL;
186 }
187 *ptr++ = hex;
188 } else {
189 *ptr++ = *p++;
190 }
191 if (ptr > max) {
192 out->append(buff, ptr-buff);
193 ptr = &buff[0];
194 }
195 }
196 if (ptr != buff) {
197 out->append(buff, ptr-buff);
198 }
199 return p - orig_p;
200}
201
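// Illustrative sketch (not called anywhere in BlueStore): round-trip a string
// through append_escaped()/decode_escaped().  The '#' in the sample value is
// escaped on encode and restored on decode; the '!' terminator is appended by
// append_escaped() and consumed by decode_escaped().
static void example_escape_roundtrip()
{
  string encoded, decoded;
  append_escaped(string("rbd_data.1234#05"), &encoded);
  int used = decode_escaped(encoded.c_str(), &decoded);
  assert(used >= 0);
  assert(decoded == "rbd_data.1234#05");
}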
202// some things we encode in binary (as le32 or le64); print the
203// resulting key strings nicely
204template<typename S>
205static string pretty_binary_string(const S& in)
206{
207 char buf[10];
208 string out;
209 out.reserve(in.length() * 3);
210 enum { NONE, HEX, STRING } mode = NONE;
211 unsigned from = 0, i;
212 for (i=0; i < in.length(); ++i) {
213 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
214 (mode == HEX && in.length() - i >= 4 &&
215 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
216 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
217 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
218 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
219 if (mode == STRING) {
220 out.append(in.c_str() + from, i - from);
221 out.push_back('\'');
222 }
223 if (mode != HEX) {
224 out.append("0x");
225 mode = HEX;
226 }
227 if (in.length() - i >= 4) {
228 // print a whole u32 at once
229 snprintf(buf, sizeof(buf), "%08x",
230 (uint32_t)(((unsigned char)in[i] << 24) |
231 ((unsigned char)in[i+1] << 16) |
232 ((unsigned char)in[i+2] << 8) |
233 ((unsigned char)in[i+3] << 0)));
234 i += 3;
235 } else {
236 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
237 }
238 out.append(buf);
239 } else {
240 if (mode != STRING) {
241 out.push_back('\'');
242 mode = STRING;
243 from = i;
244 }
245 }
246 }
247 if (mode == STRING) {
248 out.append(in.c_str() + from, i - from);
249 out.push_back('\'');
250 }
251 return out;
252}
253
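// For example (illustrative, following the logic of pretty_binary_string()
// above): an input whose bytes are {0x01, 0x02, 0x03, 0x04, 'a', 'b'} renders
// as
//
//   0x01020304'ab'
//
// i.e. non-printable runs are hex-dumped (a whole u32 at a time when possible)
// and printable runs are shown quoted.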
254template<typename T>
255static void _key_encode_shard(shard_id_t shard, T *key)
256{
257 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
258}
259
260static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
261{
262 pshard->id = (uint8_t)*key - (uint8_t)0x80;
263 return key + 1;
264}
265
266static void get_coll_key_range(const coll_t& cid, int bits,
267 string *temp_start, string *temp_end,
268 string *start, string *end)
269{
270 temp_start->clear();
271 temp_end->clear();
272 start->clear();
273 end->clear();
274
275 spg_t pgid;
276 if (cid.is_pg(&pgid)) {
277 _key_encode_shard(pgid.shard, start);
278 *temp_start = *start;
279
280 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
281 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
282
283 *end = *start;
284 *temp_end = *temp_start;
285
286 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
287 _key_encode_u32(reverse_hash, start);
288 _key_encode_u32(reverse_hash, temp_start);
289
290 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
291 if (end_hash > 0xffffffffull)
292 end_hash = 0xffffffffull;
293
294 _key_encode_u32(end_hash, end);
295 _key_encode_u32(end_hash, temp_end);
296 } else {
297 _key_encode_shard(shard_id_t::NO_SHARD, start);
298 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
299 *end = *start;
300 _key_encode_u32(0, start);
301 _key_encode_u32(0xffffffff, end);
302
303 // no separate temp section
304 *temp_start = *end;
305 *temp_end = *end;
306 }
307}
308
309static void get_shared_blob_key(uint64_t sbid, string *key)
310{
311 key->clear();
312 _key_encode_u64(sbid, key);
313}
314
315static int get_key_shared_blob(const string& key, uint64_t *sbid)
316{
317 const char *p = key.c_str();
318 if (key.length() < sizeof(uint64_t))
319 return -1;
320 _key_decode_u64(p, sbid);
321 return 0;
322}
323
324template<typename S>
325static int get_key_object(const S& key, ghobject_t *oid)
326{
327 int r;
328 const char *p = key.c_str();
329
330 if (key.length() < 1 + 8 + 4)
331 return -1;
332 p = _key_decode_shard(p, &oid->shard_id);
333
334 uint64_t pool;
335 p = _key_decode_u64(p, &pool);
336 oid->hobj.pool = pool - 0x8000000000000000ull;
337
338 unsigned hash;
339 p = _key_decode_u32(p, &hash);
340
341 oid->hobj.set_bitwise_key_u32(hash);
342
343 r = decode_escaped(p, &oid->hobj.nspace);
344 if (r < 0)
345 return -2;
346 p += r + 1;
347
348 string k;
349 r = decode_escaped(p, &k);
350 if (r < 0)
351 return -3;
352 p += r + 1;
353 if (*p == '=') {
354 // no key
355 ++p;
356 oid->hobj.oid.name = k;
357 } else if (*p == '<' || *p == '>') {
358 // key + name
359 ++p;
360 r = decode_escaped(p, &oid->hobj.oid.name);
361 if (r < 0)
362 return -5;
363 p += r + 1;
364 oid->hobj.set_key(k);
365 } else {
366 // malformed
367 return -6;
368 }
369
370 p = _key_decode_u64(p, &oid->hobj.snap.val);
371 p = _key_decode_u64(p, &oid->generation);
372
373 if (*p != ONODE_KEY_SUFFIX) {
374 return -7;
375 }
376 p++;
377 if (*p) {
378 // if we get something other than a null terminator here,
379 // something is wrong.
380 return -8;
381 }
382
383 return 0;
384}
385
386template<typename S>
387static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
388{
389 key->clear();
390
391 size_t max_len = 1 + 8 + 4 +
392 (oid.hobj.nspace.length() * 3 + 1) +
393 (oid.hobj.get_key().length() * 3 + 1) +
394 1 + // for '<', '=', or '>'
395 (oid.hobj.oid.name.length() * 3 + 1) +
396 8 + 8 + 1;
397 key->reserve(max_len);
398
399 _key_encode_shard(oid.shard_id, key);
400 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
401 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
402
403 append_escaped(oid.hobj.nspace, key);
404
405 if (oid.hobj.get_key().length()) {
406 // is a key... could be < = or >.
407 append_escaped(oid.hobj.get_key(), key);
408 // (ASCII chars < = and > sort in that order, yay)
409 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
410 if (r) {
411 key->append(r > 0 ? ">" : "<");
412 append_escaped(oid.hobj.oid.name, key);
413 } else {
414 // same as no key
415 key->append("=");
416 }
417 } else {
418 // no key
419 append_escaped(oid.hobj.oid.name, key);
420 key->append("=");
421 }
422
423 _key_encode_u64(oid.hobj.snap, key);
424 _key_encode_u64(oid.generation, key);
425
426 key->push_back(ONODE_KEY_SUFFIX);
427
428 // sanity check
429 if (true) {
430 ghobject_t t;
431 int r = get_key_object(*key, &t);
432 if (r || t != oid) {
433 derr << " r " << r << dendl;
434 derr << "key " << pretty_binary_string(*key) << dendl;
435 derr << "oid " << oid << dendl;
436 derr << " t " << t << dendl;
437 assert(r == 0 && t == oid);
438 }
439 }
440}
441
442
443// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
444// char lets us quickly test whether it is a shard key without decoding any
445// of the prefix bytes.
446template<typename S>
447static void get_extent_shard_key(const S& onode_key, uint32_t offset,
448 string *key)
449{
450 key->clear();
451 key->reserve(onode_key.length() + 4 + 1);
452 key->append(onode_key.c_str(), onode_key.size());
453 _key_encode_u32(offset, key);
454 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
455}
456
457static void rewrite_extent_shard_key(uint32_t offset, string *key)
458{
459 assert(key->size() > sizeof(uint32_t) + 1);
460 assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
461 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
462}
463
464template<typename S>
465static void generate_extent_shard_key_and_apply(
466 const S& onode_key,
467 uint32_t offset,
468 string *key,
469 std::function<void(const string& final_key)> apply)
470{
471 if (key->empty()) { // make full key
472 assert(!onode_key.empty());
473 get_extent_shard_key(onode_key, offset, key);
474 } else {
475 rewrite_extent_shard_key(offset, key);
476 }
477 apply(*key);
478}
479
480int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
481{
482 assert(key.size() > sizeof(uint32_t) + 1);
483 assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
484 int okey_len = key.size() - sizeof(uint32_t) - 1;
485 *onode_key = key.substr(0, okey_len);
486 const char *p = key.data() + okey_len;
487 _key_decode_u32(p, offset);
488 return 0;
489}
490
491static bool is_extent_shard_key(const string& key)
492{
493 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
494}
495
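// Illustrative sketch (not called anywhere in BlueStore; "ONODE" is a stand-in
// for a real onode key): build an extent shard key and decode it back.
static void example_extent_shard_key_roundtrip()
{
  string onode_key("ONODE"), shard_key, decoded_onode_key;
  uint32_t offset = 0;
  get_extent_shard_key(onode_key, 0x10000, &shard_key);
  assert(is_extent_shard_key(shard_key));
  get_key_extent_shard(shard_key, &decoded_onode_key, &offset);
  assert(decoded_onode_key == onode_key);
  assert(offset == 0x10000);
}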
496// '-' < '.' < '~'
497static void get_omap_header(uint64_t id, string *out)
498{
499 _key_encode_u64(id, out);
500 out->push_back('-');
501}
502
503// hmm, I don't think there's any need to escape the user key since we
504// have a clean prefix.
505static void get_omap_key(uint64_t id, const string& key, string *out)
506{
507 _key_encode_u64(id, out);
508 out->push_back('.');
509 out->append(key);
510}
511
512static void rewrite_omap_key(uint64_t id, string old, string *out)
513{
514 _key_encode_u64(id, out);
515 out->append(old.c_str() + out->length(), old.size() - out->length());
516}
517
518static void decode_omap_key(const string& key, string *user_key)
519{
520 *user_key = key.substr(sizeof(uint64_t) + 1);
521}
522
523static void get_omap_tail(uint64_t id, string *out)
524{
525 _key_encode_u64(id, out);
526 out->push_back('~');
527}
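// Illustrative ordering (follows from '-' < '.' < '~'): for a given onode id,
// the header key (id + '-') sorts before every user key (id + '.' + name), and
// the tail key (id + '~') sorts after all of them, so one key range from the
// header to the tail brackets the entire omap of that onode.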
528
529static void get_deferred_key(uint64_t seq, string *out)
530{
531 _key_encode_u64(seq, out);
532}
533
534
535// merge operators
536
537struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
538 void merge_nonexistent(
539 const char *rdata, size_t rlen, std::string *new_value) override {
540 *new_value = std::string(rdata, rlen);
541 }
542 void merge(
543 const char *ldata, size_t llen,
544 const char *rdata, size_t rlen,
545 std::string *new_value) override {
546 assert(llen == rlen);
547 assert((rlen % 8) == 0);
548 new_value->resize(rlen);
549 const __le64* lv = (const __le64*)ldata;
550 const __le64* rv = (const __le64*)rdata;
551 __le64* nv = &(__le64&)new_value->at(0);
552 for (size_t i = 0; i < rlen >> 3; ++i) {
553 nv[i] = lv[i] + rv[i];
554 }
555 }
556 // We use each operator name and each prefix to construct the
557 // overall RocksDB operator name for consistency check at open time.
558 string name() const override {
559 return "int64_array";
560 }
561};
562
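// Illustrative sketch (not called anywhere in BlueStore; assumes a
// little-endian host, matching the stored le64 values): merging {1, 2} with
// {10, 20} yields {11, 22}, which is how the PREFIX_STAT counters get
// accumulated without a read-modify-write cycle.
static void example_int64_array_merge()
{
  Int64ArrayMergeOperator op;
  uint64_t l[2] = {1, 2};
  uint64_t r[2] = {10, 20};
  std::string merged;
  op.merge(reinterpret_cast<const char*>(l), sizeof(l),
           reinterpret_cast<const char*>(r), sizeof(r), &merged);
  const uint64_t* v = reinterpret_cast<const uint64_t*>(merged.data());
  assert(v[0] == 11 && v[1] == 22);
}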
563
564// Buffer
565
566ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
567{
568 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
569 << b.offset << "~" << b.length << std::dec
570 << " " << BlueStore::Buffer::get_state_name(b.state);
571 if (b.flags)
572 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
573 return out << ")";
574}
575
576// Garbage Collector
577
578void BlueStore::GarbageCollector::process_protrusive_extents(
579 const BlueStore::ExtentMap& extent_map,
580 uint64_t start_offset,
581 uint64_t end_offset,
582 uint64_t start_touch_offset,
583 uint64_t end_touch_offset,
584 uint64_t min_alloc_size)
585{
586 assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
587
588 uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
589 uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);
590
591 dout(30) << __func__ << " (hex): [" << std::hex
592 << lookup_start_offset << ", " << lookup_end_offset
593 << ")" << std::dec << dendl;
594
595 for (auto it = extent_map.seek_lextent(lookup_start_offset);
596 it != extent_map.extent_map.end() &&
597 it->logical_offset < lookup_end_offset;
598 ++it) {
599 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
600 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
601
602 dout(30) << __func__ << " " << *it
603 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
604 << dendl;
605
606 Blob* b = it->blob.get();
607
608 if (it->logical_offset >= start_touch_offset &&
609 it->logical_end() <= end_touch_offset) {
610 // Process extents within the range affected by
611 // the current write request.
612 // Need to take into account if existing extents
613 // can be merged with them (uncompressed case)
614 if (!b->get_blob().is_compressed()) {
615 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
616 --blob_info_counted->expected_allocations; // don't need to allocate
617 // new AU for compressed
618 // data since another
619 // collocated uncompressed
620 // blob already exists
621 dout(30) << __func__ << " --expected:"
622 << alloc_unit_start << dendl;
623 }
624 used_alloc_unit = alloc_unit_end;
625 blob_info_counted = nullptr;
626 }
627 } else if (b->get_blob().is_compressed()) {
628
629 // additionally we take compressed blobs that were not impacted
630 // by the write into account too
631 BlobInfo& bi =
632 affected_blobs.emplace(
633 b, BlobInfo(b->get_referenced_bytes())).first->second;
634
635 int adjust =
636 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
637 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
638 dout(30) << __func__ << " expected_allocations="
639 << bi.expected_allocations << " end_au:"
640 << alloc_unit_end << dendl;
641
642 blob_info_counted = &bi;
643 used_alloc_unit = alloc_unit_end;
644
645 assert(it->length <= bi.referenced_bytes);
646 bi.referenced_bytes -= it->length;
647 dout(30) << __func__ << " affected_blob:" << *b
648 << " unref 0x" << std::hex << it->length
649 << " referenced = 0x" << bi.referenced_bytes
650 << std::dec << dendl;
651 // NOTE: we can't move a specific blob to the resulting GC list here
652 // when its reference counter reaches 0, since subsequent extents might
653 // still decrement its expected_allocations.
654 // Hence we need to enumerate all the extents first.
655 if (!bi.collect_candidate) {
656 bi.first_lextent = it;
657 bi.collect_candidate = true;
658 }
659 bi.last_lextent = it;
660 } else {
661 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
662 // don't need to allocate new AU for compressed data since another
663 // collocated uncompressed blob already exists
664 --blob_info_counted->expected_allocations;
665 dout(30) << __func__ << " --expected_allocations:"
666 << alloc_unit_start << dendl;
667 }
668 used_alloc_unit = alloc_unit_end;
669 blob_info_counted = nullptr;
670 }
671 }
672
673 for (auto b_it = affected_blobs.begin();
674 b_it != affected_blobs.end();
675 ++b_it) {
676 Blob* b = b_it->first;
677 BlobInfo& bi = b_it->second;
678 if (bi.referenced_bytes == 0) {
679 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
680 int64_t blob_expected_for_release =
681 ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;
682
683 dout(30) << __func__ << " " << *(b_it->first)
684 << " expected4release=" << blob_expected_for_release
685 << " expected_allocations=" << bi.expected_allocations
686 << dendl;
687 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
688 if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
689 if (bi.collect_candidate) {
690 auto it = bi.first_lextent;
691 bool bExit = false;
692 do {
693 if (it->blob.get() == b) {
694 extents_to_collect.emplace_back(it->logical_offset, it->length);
695 }
696 bExit = it == bi.last_lextent;
697 ++it;
698 } while (!bExit);
699 }
700 expected_for_release += blob_expected_for_release;
701 expected_allocations += bi.expected_allocations;
702 }
703 }
704 }
705}
706
707int64_t BlueStore::GarbageCollector::estimate(
708 uint64_t start_offset,
709 uint64_t length,
710 const BlueStore::ExtentMap& extent_map,
711 const BlueStore::old_extent_map_t& old_extents,
712 uint64_t min_alloc_size)
713{
714
715 affected_blobs.clear();
716 extents_to_collect.clear();
717 used_alloc_unit = boost::optional<uint64_t >();
718 blob_info_counted = nullptr;
719
720 gc_start_offset = start_offset;
721 gc_end_offset = start_offset + length;
722
723 uint64_t end_offset = start_offset + length;
724
725 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
726 Blob* b = it->e.blob.get();
727 if (b->get_blob().is_compressed()) {
728
729 // update gc_start_offset/gc_end_offset if needed
730 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
731 gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());
732
733 auto o = it->e.logical_offset;
734 auto l = it->e.length;
735
736 uint64_t ref_bytes = b->get_referenced_bytes();
737 // micro optimization to bypass blobs that have no more references
738 if (ref_bytes != 0) {
739 dout(30) << __func__ << " affected_blob:" << *b
740 << " unref 0x" << std::hex << o << "~" << l
741 << std::dec << dendl;
742 affected_blobs.emplace(b, BlobInfo(ref_bytes));
743 }
744 }
745 }
746 dout(30) << __func__ << " gc range(hex): [" << std::hex
747 << gc_start_offset << ", " << gc_end_offset
748 << ")" << std::dec << dendl;
749
750 // enumerate preceding extents to check if they reference affected blobs
751 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
752 process_protrusive_extents(extent_map,
753 gc_start_offset,
754 gc_end_offset,
755 start_offset,
756 end_offset,
757 min_alloc_size);
758 }
759 return expected_for_release - expected_allocations;
760}
761
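// Worked example (illustrative numbers): if a compressed blob protruding
// beyond the write occupies 3 allocation units on disk
// (blob_expected_for_release == 3) and rewriting its still-referenced data
// would cost only 1 new allocation (expected_allocations == 1), the benefit is
// 2 AUs; with bluestore_gc_enable_blob_threshold <= 2 the blob's extents are
// queued in extents_to_collect and estimate() returns a positive value.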
762// Cache
763
764BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
765 PerfCounters *logger)
766{
767 Cache *c = nullptr;
768
769 if (type == "lru")
770 c = new LRUCache(cct);
771 else if (type == "2q")
772 c = new TwoQCache(cct);
773 else
774 assert(0 == "unrecognized cache type");
775
776 c->logger = logger;
777 return c;
778}
779
780void BlueStore::Cache::trim_all()
781{
782 std::lock_guard<std::recursive_mutex> l(lock);
783 _trim(0, 0);
784}
785
786void BlueStore::Cache::trim(
787 uint64_t target_bytes,
788 float target_meta_ratio,
789 float target_data_ratio,
790 float bytes_per_onode)
791{
792 std::lock_guard<std::recursive_mutex> l(lock);
793 uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
794 uint64_t current_buffer = _get_buffer_bytes();
795 uint64_t current = current_meta + current_buffer;
796
797 uint64_t target_meta = target_bytes * target_meta_ratio;
798 uint64_t target_buffer = target_bytes * target_data_ratio;
799
800 // correct for overflow or float imprecision
801 target_meta = min(target_bytes, target_meta);
802 target_buffer = min(target_bytes - target_meta, target_buffer);
803
804 if (current <= target_bytes) {
805 dout(10) << __func__
806 << " shard target " << pretty_si_t(target_bytes)
807 << " meta/data ratios " << target_meta_ratio
808 << " + " << target_data_ratio << " ("
809 << pretty_si_t(target_meta) << " + "
810 << pretty_si_t(target_buffer) << "), "
811 << " current " << pretty_si_t(current) << " ("
812 << pretty_si_t(current_meta) << " + "
813 << pretty_si_t(current_buffer) << ")"
814 << dendl;
815 return;
816 }
817
818 uint64_t need_to_free = current - target_bytes;
819 uint64_t free_buffer = 0;
820 uint64_t free_meta = 0;
821 if (current_buffer > target_buffer) {
822 free_buffer = current_buffer - target_buffer;
823 if (free_buffer > need_to_free) {
824 free_buffer = need_to_free;
825 }
826 }
827 free_meta = need_to_free - free_buffer;
828
829 // start bounds at what we have now
830 uint64_t max_buffer = current_buffer - free_buffer;
831 uint64_t max_meta = current_meta - free_meta;
832 uint64_t max_onodes = max_meta / bytes_per_onode;
833
834 dout(10) << __func__
835 << " shard target " << pretty_si_t(target_bytes)
836 << " ratio " << target_meta_ratio << " ("
837 << pretty_si_t(target_meta) << " + "
838 << pretty_si_t(target_buffer) << "), "
839 << " current " << pretty_si_t(current) << " ("
840 << pretty_si_t(current_meta) << " + "
841 << pretty_si_t(current_buffer) << "),"
842 << " need_to_free " << pretty_si_t(need_to_free) << " ("
843 << pretty_si_t(free_meta) << " + "
844 << pretty_si_t(free_buffer) << ")"
845 << " -> max " << max_onodes << " onodes + "
846 << max_buffer << " buffer"
847 << dendl;
848 _trim(max_onodes, max_buffer);
849}
850
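// Worked example (illustrative numbers): with target_bytes = 100 MB,
// target_meta_ratio = 0.6, target_data_ratio = 0.4 and bytes_per_onode = 4 KB,
// the targets are 60 MB of metadata and 40 MB of buffers.  If the shard
// currently holds 20000 onodes (~80 MB of estimated metadata) plus 50 MB of
// buffers (130 MB total), need_to_free is 30 MB: buffers give up
// min(50 - 40, 30) = 10 MB, onodes the remaining 20 MB, so _trim() is called
// with roughly 15000 onodes and 40 MB of buffers as the new bounds.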
851
852// LRUCache
853#undef dout_prefix
854#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
855
856void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
857{
858 auto p = onode_lru.iterator_to(*o);
859 onode_lru.erase(p);
860 onode_lru.push_front(*o);
861}
862
863void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
864{
865 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
866 << " buffers " << buffer_size << " / " << buffer_max
867 << dendl;
868
869 _audit("trim start");
870
871 // buffers
872 while (buffer_size > buffer_max) {
873 auto i = buffer_lru.rbegin();
874 if (i == buffer_lru.rend()) {
875 // stop if buffer_lru is now empty
876 break;
877 }
878
879 Buffer *b = &*i;
880 assert(b->is_clean());
881 dout(20) << __func__ << " rm " << *b << dendl;
882 b->space->_rm_buffer(this, b);
883 }
884
885 // onodes
886 int num = onode_lru.size() - onode_max;
887 if (num <= 0)
888 return; // don't even try
889
890 auto p = onode_lru.end();
891 assert(p != onode_lru.begin());
892 --p;
893 int skipped = 0;
894 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
895 while (num > 0) {
896 Onode *o = &*p;
897 int refs = o->nref.load();
898 if (refs > 1) {
899 dout(20) << __func__ << " " << o->oid << " has " << refs
900 << " refs, skipping" << dendl;
901 if (++skipped >= max_skipped) {
902 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
903 << num << " left to trim" << dendl;
904 break;
905 }
906
907 if (p == onode_lru.begin()) {
908 break;
909 } else {
910 p--;
911 num--;
912 continue;
913 }
914 }
915 dout(30) << __func__ << " rm " << o->oid << dendl;
916 if (p != onode_lru.begin()) {
917 onode_lru.erase(p--);
918 } else {
919 onode_lru.erase(p);
920 assert(num == 1);
921 }
922 o->get(); // paranoia
923 o->c->onode_map.remove(o->oid);
924 o->put();
925 --num;
926 }
927}
928
929#ifdef DEBUG_CACHE
930void BlueStore::LRUCache::_audit(const char *when)
931{
932 dout(10) << __func__ << " " << when << " start" << dendl;
933 uint64_t s = 0;
934 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
935 s += i->length;
936 }
937 if (s != buffer_size) {
938 derr << __func__ << " buffer_size " << buffer_size << " actual " << s
939 << dendl;
940 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
941 derr << __func__ << " " << *i << dendl;
942 }
943 assert(s == buffer_size);
944 }
945 dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
946 << " ok" << dendl;
947}
948#endif
949
950// TwoQCache
951#undef dout_prefix
952#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
953
954
955void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
956{
957 auto p = onode_lru.iterator_to(*o);
958 onode_lru.erase(p);
959 onode_lru.push_front(*o);
960}
961
962void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
963{
964 dout(20) << __func__ << " level " << level << " near " << near
965 << " on " << *b
966 << " which has cache_private " << b->cache_private << dendl;
967 if (near) {
968 b->cache_private = near->cache_private;
969 switch (b->cache_private) {
970 case BUFFER_WARM_IN:
971 buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
972 break;
973 case BUFFER_WARM_OUT:
974 assert(b->is_empty());
975 buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
976 break;
977 case BUFFER_HOT:
978 buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
979 break;
980 default:
981 assert(0 == "bad cache_private");
982 }
983 } else if (b->cache_private == BUFFER_NEW) {
984 b->cache_private = BUFFER_WARM_IN;
985 if (level > 0) {
986 buffer_warm_in.push_front(*b);
987 } else {
988 // take caller hint to start at the back of the warm queue
989 buffer_warm_in.push_back(*b);
990 }
991 } else {
992 // we got a hint from discard
993 switch (b->cache_private) {
994 case BUFFER_WARM_IN:
995 // stay in warm_in. move to front, even though 2Q doesn't actually
996 // do this.
997 dout(20) << __func__ << " move to front of warm " << *b << dendl;
998 buffer_warm_in.push_front(*b);
999 break;
1000 case BUFFER_WARM_OUT:
1001 b->cache_private = BUFFER_HOT;
1002 // move to hot. fall-thru
1003 case BUFFER_HOT:
1004 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1005 buffer_hot.push_front(*b);
1006 break;
1007 default:
1008 assert(0 == "bad cache_private");
1009 }
1010 }
1011 if (!b->is_empty()) {
1012 buffer_bytes += b->length;
1013 buffer_list_bytes[b->cache_private] += b->length;
1014 }
1015}
1016
1017void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
1018{
1019 dout(20) << __func__ << " " << *b << dendl;
1020 if (!b->is_empty()) {
1021 assert(buffer_bytes >= b->length);
1022 buffer_bytes -= b->length;
1023 assert(buffer_list_bytes[b->cache_private] >= b->length);
1024 buffer_list_bytes[b->cache_private] -= b->length;
1025 }
1026 switch (b->cache_private) {
1027 case BUFFER_WARM_IN:
1028 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1029 break;
1030 case BUFFER_WARM_OUT:
1031 buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
1032 break;
1033 case BUFFER_HOT:
1034 buffer_hot.erase(buffer_hot.iterator_to(*b));
1035 break;
1036 default:
1037 assert(0 == "bad cache_private");
1038 }
1039}
1040
1041void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
1042{
1043 TwoQCache *src = static_cast<TwoQCache*>(srcc);
1044 src->_rm_buffer(b);
1045
1046 // preserve which list we're on (even if we can't preserve the order!)
1047 switch (b->cache_private) {
1048 case BUFFER_WARM_IN:
1049 assert(!b->is_empty());
1050 buffer_warm_in.push_back(*b);
1051 break;
1052 case BUFFER_WARM_OUT:
1053 assert(b->is_empty());
1054 buffer_warm_out.push_back(*b);
1055 break;
1056 case BUFFER_HOT:
1057 assert(!b->is_empty());
1058 buffer_hot.push_back(*b);
1059 break;
1060 default:
1061 assert(0 == "bad cache_private");
1062 }
1063 if (!b->is_empty()) {
1064 buffer_bytes += b->length;
1065 buffer_list_bytes[b->cache_private] += b->length;
1066 }
1067}
1068
1069void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
1070{
1071 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1072 if (!b->is_empty()) {
1073 assert((int64_t)buffer_bytes + delta >= 0);
1074 buffer_bytes += delta;
1075 assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
1076 buffer_list_bytes[b->cache_private] += delta;
1077 }
1078}
1079
1080void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
1081{
1082 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
1083 << " buffers " << buffer_bytes << " / " << buffer_max
1084 << dendl;
1085
1086 _audit("trim start");
1087
1088 // buffers
1089 if (buffer_bytes > buffer_max) {
1090 uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
1091 uint64_t khot = buffer_max - kin;
1092
1093 // pre-calculate kout based on the current average buffer size; this is
1094 // only an estimate, since the warm_in and hot lists may change later
1095 uint64_t kout = 0;
1096 uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
1097 if (buffer_num) {
1098 uint64_t buffer_avg_size = buffer_bytes / buffer_num;
1099 assert(buffer_avg_size);
1100 uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
1101 kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1102 }
1103
1104 if (buffer_list_bytes[BUFFER_HOT] < khot) {
1105 // hot is small, give slack to warm_in
1106 kin += khot - buffer_list_bytes[BUFFER_HOT];
1107 } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
1108 // warm_in is small, give slack to hot
1109 khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
1110 }
1111
1112 // adjust warm_in list
1113 int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
1114 uint64_t evicted = 0;
1115
1116 while (to_evict_bytes > 0) {
1117 auto p = buffer_warm_in.rbegin();
1118 if (p == buffer_warm_in.rend()) {
1119 // stop if warm_in list is now empty
1120 break;
1121 }
1122
1123 Buffer *b = &*p;
1124 assert(b->is_clean());
1125 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1126 assert(buffer_bytes >= b->length);
1127 buffer_bytes -= b->length;
1128 assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
1129 buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
1130 to_evict_bytes -= b->length;
1131 evicted += b->length;
1132 b->state = Buffer::STATE_EMPTY;
1133 b->data.clear();
1134 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1135 buffer_warm_out.push_front(*b);
1136 b->cache_private = BUFFER_WARM_OUT;
1137 }
1138
1139 if (evicted > 0) {
1140 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1141 << " from warm_in list, done evicting warm_in buffers"
1142 << dendl;
1143 }
1144
1145 // adjust hot list
1146 to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
1147 evicted = 0;
1148
1149 while (to_evict_bytes > 0) {
1150 auto p = buffer_hot.rbegin();
1151 if (p == buffer_hot.rend()) {
1152 // stop if hot list is now empty
1153 break;
1154 }
1155
1156 Buffer *b = &*p;
1157 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1158 assert(b->is_clean());
1159 // adjust evict size before buffer goes invalid
1160 to_evict_bytes -= b->length;
1161 evicted += b->length;
1162 b->space->_rm_buffer(this, b);
1163 }
1164
1165 if (evicted > 0) {
1166 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1167 << " from hot list, done evicting hot buffers"
1168 << dendl;
1169 }
1170
1171 // adjust warm out list too, if necessary
1172 int64_t num = buffer_warm_out.size() - kout;
1173 while (num-- > 0) {
1174 Buffer *b = &*buffer_warm_out.rbegin();
1175 assert(b->is_empty());
1176 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1177 b->space->_rm_buffer(this, b);
1178 }
1179 }
1180
1181 // onodes
1182 int num = onode_lru.size() - onode_max;
1183 if (num <= 0)
1184 return; // don't even try
1185
1186 auto p = onode_lru.end();
1187 assert(p != onode_lru.begin());
1188 --p;
1189 int skipped = 0;
1190 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
1191 while (num > 0) {
1192 Onode *o = &*p;
1193 dout(20) << __func__ << " considering " << o << dendl;
1194 int refs = o->nref.load();
1195 if (refs > 1) {
1196 dout(20) << __func__ << " " << o->oid << " has " << refs
1197 << " refs; skipping" << dendl;
1198 if (++skipped >= max_skipped) {
1199 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
1200 << num << " left to trim" << dendl;
1201 break;
1202 }
1203
1204 if (p == onode_lru.begin()) {
1205 break;
1206 } else {
1207 p--;
1208 num--;
1209 continue;
1210 }
1211 }
1212 dout(30) << __func__ << " " << o->oid << " num=" << num << " lru size=" << onode_lru.size() << dendl;
1213 if (p != onode_lru.begin()) {
1214 onode_lru.erase(p--);
1215 } else {
1216 onode_lru.erase(p);
1217 assert(num == 1);
1218 }
1219 o->get(); // paranoia
1220 o->c->onode_map.remove(o->oid);
1221 o->put();
1222 --num;
1223 }
1224}
1225
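// Worked example for the buffer partitioning above (illustrative numbers):
// with buffer_max = 100 MB and bluestore_2q_cache_kin_ratio = 0.5, the warm_in
// target kin and the hot target khot are 50 MB each.  If the hot list holds
// only 20 MB, its unused 30 MB of slack is granted to warm_in (kin = 80 MB).
// warm_in buffers beyond kin are demoted to warm_out (data dropped, entry
// kept), hot buffers beyond khot are freed outright, and warm_out is then
// capped at kout entries derived from the average buffer size.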
1226#ifdef DEBUG_CACHE
1227void BlueStore::TwoQCache::_audit(const char *when)
1228{
1229 dout(10) << __func__ << " " << when << " start" << dendl;
1230 uint64_t s = 0;
1231 for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
1232 s += i->length;
1233 }
1234
1235 uint64_t hot_bytes = s;
1236 if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
1237 derr << __func__ << " hot_list_bytes "
1238 << buffer_list_bytes[BUFFER_HOT]
1239 << " != actual " << hot_bytes
1240 << dendl;
1241 assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
1242 }
1243
1244 for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
1245 s += i->length;
1246 }
1247
1248 uint64_t warm_in_bytes = s - hot_bytes;
1249 if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
1250 derr << __func__ << " warm_in_list_bytes "
1251 << buffer_list_bytes[BUFFER_WARM_IN]
1252 << " != actual " << warm_in_bytes
1253 << dendl;
1254 assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
1255 }
1256
1257 if (s != buffer_bytes) {
1258 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1259 << dendl;
1260 assert(s == buffer_bytes);
1261 }
1262
1263 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1264 << " ok" << dendl;
1265}
1266#endif
1267
1268
1269// BufferSpace
1270
1271#undef dout_prefix
1272#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1273
1274void BlueStore::BufferSpace::_clear(Cache* cache)
1275{
1276 // note: we already hold cache->lock
1277 ldout(cache->cct, 20) << __func__ << dendl;
1278 while (!buffer_map.empty()) {
1279 _rm_buffer(cache, buffer_map.begin());
1280 }
1281}
1282
1283int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
1284{
1285 // note: we already hold cache->lock
1286 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1287 << std::dec << dendl;
1288 int cache_private = 0;
1289 cache->_audit("discard start");
1290 auto i = _data_lower_bound(offset);
1291 uint32_t end = offset + length;
1292 while (i != buffer_map.end()) {
1293 Buffer *b = i->second.get();
1294 if (b->offset >= end) {
1295 break;
1296 }
1297 if (b->cache_private > cache_private) {
1298 cache_private = b->cache_private;
1299 }
1300 if (b->offset < offset) {
1301 int64_t front = offset - b->offset;
1302 if (b->end() > end) {
1303 // drop middle (split)
1304 uint32_t tail = b->end() - end;
1305 if (b->data.length()) {
1306 bufferlist bl;
1307 bl.substr_of(b->data, b->length - tail, tail);
1308 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1309 nb->maybe_rebuild();
1310 _add_buffer(cache, nb, 0, b);
1311 } else {
1312 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1313 0, b);
1314 }
1315 if (!b->is_writing()) {
1316 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1317 }
1318 b->truncate(front);
1319 b->maybe_rebuild();
1320 cache->_audit("discard end 1");
1321 break;
1322 } else {
1323 // drop tail
1324 if (!b->is_writing()) {
1325 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1326 }
1327 b->truncate(front);
1328 b->maybe_rebuild();
1329 ++i;
1330 continue;
1331 }
1332 }
1333 if (b->end() <= end) {
1334 // drop entire buffer
1335 _rm_buffer(cache, i++);
1336 continue;
1337 }
1338 // drop front
1339 uint32_t keep = b->end() - end;
1340 if (b->data.length()) {
1341 bufferlist bl;
1342 bl.substr_of(b->data, b->length - keep, keep);
1343 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1344 nb->maybe_rebuild();
1345 _add_buffer(cache, nb, 0, b);
1346 } else {
1347 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1348 }
1349 _rm_buffer(cache, i);
1350 cache->_audit("discard end 2");
1351 break;
1352 }
1353 return cache_private;
1354}
1355
1356void BlueStore::BufferSpace::read(
1357 Cache* cache,
1358 uint32_t offset,
1359 uint32_t length,
1360 BlueStore::ready_regions_t& res,
1361 interval_set<uint32_t>& res_intervals)
1362{
1363 res.clear();
1364 res_intervals.clear();
1365 uint32_t want_bytes = length;
1366 uint32_t end = offset + length;
1367
1368 {
1369 std::lock_guard<std::recursive_mutex> l(cache->lock);
1370 for (auto i = _data_lower_bound(offset);
1371 i != buffer_map.end() && offset < end && i->first < end;
1372 ++i) {
1373 Buffer *b = i->second.get();
1374 assert(b->end() > offset);
1375 if (b->is_writing() || b->is_clean()) {
1376 if (b->offset < offset) {
1377 uint32_t skip = offset - b->offset;
1378 uint32_t l = MIN(length, b->length - skip);
1379 res[offset].substr_of(b->data, skip, l);
1380 res_intervals.insert(offset, l);
1381 offset += l;
1382 length -= l;
1383 if (!b->is_writing()) {
1384 cache->_touch_buffer(b);
1385 }
1386 continue;
1387 }
1388 if (b->offset > offset) {
1389 uint32_t gap = b->offset - offset;
1390 if (length <= gap) {
1391 break;
1392 }
1393 offset += gap;
1394 length -= gap;
1395 }
1396 if (!b->is_writing()) {
1397 cache->_touch_buffer(b);
1398 }
1399 if (b->length > length) {
1400 res[offset].substr_of(b->data, 0, length);
1401 res_intervals.insert(offset, length);
1402 break;
1403 } else {
1404 res[offset].append(b->data);
1405 res_intervals.insert(offset, b->length);
1406 if (b->length == length)
1407 break;
1408 offset += b->length;
1409 length -= b->length;
1410 }
1411 }
1412 }
1413 }
1414
1415 uint64_t hit_bytes = res_intervals.size();
1416 assert(hit_bytes <= want_bytes);
1417 uint64_t miss_bytes = want_bytes - hit_bytes;
1418 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1419 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1420}
1421
1422void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
1423{
1424 std::lock_guard<std::recursive_mutex> l(cache->lock);
1425
1426 auto i = writing.begin();
1427 while (i != writing.end()) {
1428 if (i->seq > seq) {
1429 break;
1430 }
1431 if (i->seq < seq) {
1432 ++i;
1433 continue;
1434 }
1435
1436 Buffer *b = &*i;
1437 assert(b->is_writing());
1438
1439 if (b->flags & Buffer::FLAG_NOCACHE) {
1440 writing.erase(i++);
1441 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1442 buffer_map.erase(b->offset);
1443 } else {
1444 b->state = Buffer::STATE_CLEAN;
1445 writing.erase(i++);
1446 b->maybe_rebuild();
1447 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1448 cache->_add_buffer(b, 1, nullptr);
1449 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1450 }
1451 }
1452
1453 cache->_audit("finish_write end");
1454}
1455
1456void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1457{
1458 std::lock_guard<std::recursive_mutex> lk(cache->lock);
1459 if (buffer_map.empty())
1460 return;
1461
1462 auto p = --buffer_map.end();
1463 while (true) {
1464 if (p->second->end() <= pos)
1465 break;
1466
1467 if (p->second->offset < pos) {
1468 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1469 size_t left = pos - p->second->offset;
1470 size_t right = p->second->length - left;
1471 if (p->second->data.length()) {
1472 bufferlist bl;
1473 bl.substr_of(p->second->data, left, right);
1474 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1475 0, p->second.get());
1476 } else {
1477 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1478 0, p->second.get());
1479 }
1480 cache->_adjust_buffer_size(p->second.get(), -right);
1481 p->second->truncate(left);
1482 break;
1483 }
1484
1485 assert(p->second->end() > pos);
1486 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1487 if (p->second->data.length()) {
1488 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1489 p->second->offset - pos, p->second->data),
1490 0, p->second.get());
1491 } else {
1492 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1493 p->second->offset - pos, p->second->length),
1494 0, p->second.get());
1495 }
1496 if (p == buffer_map.begin()) {
1497 _rm_buffer(cache, p);
1498 break;
1499 } else {
1500 _rm_buffer(cache, p--);
1501 }
1502 }
1503 assert(writing.empty());
1504}
1505
1506// OnodeSpace
1507
1508#undef dout_prefix
1509#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1510
1511BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1512{
1513 std::lock_guard<std::recursive_mutex> l(cache->lock);
1514 auto p = onode_map.find(oid);
1515 if (p != onode_map.end()) {
1516 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1517 << " raced, returning existing " << p->second
1518 << dendl;
1519 return p->second;
1520 }
1521 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1522 onode_map[oid] = o;
1523 cache->_add_onode(o, 1);
1524 return o;
1525}
1526
1527BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1528{
1529 ldout(cache->cct, 30) << __func__ << dendl;
1530 OnodeRef o;
1531 bool hit = false;
1532
1533 {
1534 std::lock_guard<std::recursive_mutex> l(cache->lock);
1535 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1536 if (p == onode_map.end()) {
1537 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1538 } else {
1539 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1540 << dendl;
1541 cache->_touch_onode(p->second);
1542 hit = true;
1543 o = p->second;
1544 }
1545 }
1546
1547 if (hit) {
1548 cache->logger->inc(l_bluestore_onode_hits);
1549 } else {
1550 cache->logger->inc(l_bluestore_onode_misses);
1551 }
1552 return o;
1553}
1554
1555void BlueStore::OnodeSpace::clear()
1556{
1557 std::lock_guard<std::recursive_mutex> l(cache->lock);
1558 ldout(cache->cct, 10) << __func__ << dendl;
1559 for (auto &p : onode_map) {
1560 cache->_rm_onode(p.second);
1561 }
1562 onode_map.clear();
1563}
1564
1565bool BlueStore::OnodeSpace::empty()
1566{
1567 std::lock_guard<std::recursive_mutex> l(cache->lock);
1568 return onode_map.empty();
1569}
1570
1571void BlueStore::OnodeSpace::rename(
1572 OnodeRef& oldo,
1573 const ghobject_t& old_oid,
1574 const ghobject_t& new_oid,
1575 const mempool::bluestore_cache_other::string& new_okey)
1576{
1577 std::lock_guard<std::recursive_mutex> l(cache->lock);
1578 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1579 << dendl;
1580 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1581 po = onode_map.find(old_oid);
1582 pn = onode_map.find(new_oid);
1583 assert(po != pn);
1584
1585 assert(po != onode_map.end());
1586 if (pn != onode_map.end()) {
1587 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1588 << dendl;
1589 cache->_rm_onode(pn->second);
1590 onode_map.erase(pn);
1591 }
1592 OnodeRef o = po->second;
1593
1594 // install a non-existent onode at old location
1595 oldo.reset(new Onode(o->c, old_oid, o->key));
1596 po->second = oldo;
1597 cache->_add_onode(po->second, 1);
1598
1599 // add at new position and fix oid, key
1600 onode_map.insert(make_pair(new_oid, o));
1601 cache->_touch_onode(o);
1602 o->oid = new_oid;
1603 o->key = new_okey;
1604}
1605
1606bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1607{
1608 std::lock_guard<std::recursive_mutex> l(cache->lock);
1609 ldout(cache->cct, 20) << __func__ << dendl;
1610 for (auto& i : onode_map) {
1611 if (f(i.second)) {
1612 return true;
1613 }
1614 }
1615 return false;
1616}
1617
1618void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
1619{
1620 for (auto& i : onode_map) {
1621 ldout(cct, lvl) << i.first << " : " << i.second << dendl;
1622 }
1623}
1624
1625// SharedBlob
1626
1627#undef dout_prefix
1628#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1629
1630ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1631{
1632 out << "SharedBlob(" << &sb;
1633
1634 if (sb.loaded) {
1635 out << " loaded " << *sb.persistent;
1636 } else {
1637 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1638 }
1639 return out << ")";
1640}
1641
1642BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1643 : coll(_coll), sbid_unloaded(i)
1644{
1645 assert(sbid_unloaded > 0);
1646 if (get_cache()) {
1647 get_cache()->add_blob();
1648 }
1649}
1650
1651BlueStore::SharedBlob::~SharedBlob()
1652{
1653 if (get_cache()) { // the dummy instances have a nullptr
1654 std::lock_guard<std::recursive_mutex> l(get_cache()->lock);
1655 bc._clear(get_cache());
1656 get_cache()->rm_blob();
1657 }
1658 if (loaded && persistent) {
1659 delete persistent;
1660 }
1661}
1662
1663void BlueStore::SharedBlob::put()
1664{
1665 if (--nref == 0) {
1666 ldout(coll->store->cct, 20) << __func__ << " " << this
1667 << " removing self from set " << get_parent()
1668 << dendl;
1669 if (get_parent()) {
1670 if (get_parent()->try_remove(this)) {
1671 delete this;
1672 } else {
1673 ldout(coll->store->cct, 20)
1674 << __func__ << " " << this << " lost race to remove myself from set"
1675 << dendl;
1676 }
1677 } else {
1678 delete this;
1679 }
1680 }
1681}
1682
1683void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1684{
1685 assert(persistent);
1686 persistent->ref_map.get(offset, length);
1687}
1688
1689void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
1690 PExtentVector *r,
1691 set<SharedBlob*> *maybe_unshared)
1692{
1693 assert(persistent);
1694 bool maybe = false;
1695 persistent->ref_map.put(offset, length, r, maybe_unshared ? &maybe : nullptr);
1696 if (maybe_unshared && maybe) {
1697 maybe_unshared->insert(this);
1698 }
1699}
1700
1701// SharedBlobSet
1702
1703#undef dout_prefix
1704#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
1705
1706void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
1707{
1708 std::lock_guard<std::mutex> l(lock);
1709 for (auto& i : sb_map) {
1710 ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
1711 }
1712}
1713
1714// Blob
1715
1716#undef dout_prefix
1717#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1718
1719ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1720{
1721 out << "Blob(" << &b;
1722 if (b.is_spanning()) {
1723 out << " spanning " << b.id;
1724 }
1725 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1726 if (b.shared_blob) {
1727 out << " " << *b.shared_blob;
1728 } else {
1729 out << " (shared_blob=NULL)";
1730 }
1731 out << ")";
1732 return out;
1733}
1734
1735void BlueStore::Blob::discard_unallocated(Collection *coll)
1736{
1737 if (get_blob().is_shared()) {
1738 return;
1739 }
1740 if (get_blob().is_compressed()) {
1741 bool discard = false;
1742 bool all_invalid = true;
1743 for (auto e : get_blob().get_extents()) {
1744 if (!e.is_valid()) {
1745 discard = true;
1746 } else {
1747 all_invalid = false;
1748 }
1749 }
1750 assert(discard == all_invalid); // in case of compressed blob all
1751 // or none pextents are invalid.
1752 if (discard) {
1753 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1754 get_blob().get_logical_length());
1755 }
1756 } else {
1757 size_t pos = 0;
1758 for (auto e : get_blob().get_extents()) {
1759 if (!e.is_valid()) {
1760 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1761 << "~" << e.length
1762 << std::dec << dendl;
1763 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1764 }
1765 pos += e.length;
1766 }
1767 if (get_blob().can_prune_tail()) {
1768 dirty_blob().prune_tail();
1769 used_in_blob.prune_tail(get_blob().get_ondisk_length());
1770 auto cct = coll->store->cct; //used by dout
1771 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
1772 }
1773 }
1774}
1775
1776void BlueStore::Blob::get_ref(
1777 Collection *coll,
1778 uint32_t offset,
1779 uint32_t length)
1780{
1781 // Caller has to initialize Blob's logical length prior to incrementing
1782 // references. Otherwise one can neither determine the required
1783 // amount of counters in case of per-au tracking nor obtain min_release_size
1784 // for single counter mode.
1785 assert(get_blob().get_logical_length() != 0);
1786 auto cct = coll->store->cct;
1787 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1788 << std::dec << " " << *this << dendl;
1789
1790 if (used_in_blob.is_empty()) {
1791 uint32_t min_release_size =
1792 get_blob().get_release_size(coll->store->min_alloc_size);
1793 uint64_t l = get_blob().get_logical_length();
1794 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1795 << min_release_size << std::dec << dendl;
1796 used_in_blob.init(l, min_release_size);
1797 }
1798 used_in_blob.get(
1799 offset,
1800 length);
1801}
1802
1803bool BlueStore::Blob::put_ref(
1804 Collection *coll,
1805 uint32_t offset,
1806 uint32_t length,
1807 PExtentVector *r)
1808{
1809 PExtentVector logical;
1810
1811 auto cct = coll->store->cct;
1812 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1813 << std::dec << " " << *this << dendl;
1814
1815 bool empty = used_in_blob.put(
1816 offset,
1817 length,
1818 &logical);
1819 r->clear();
1820 // nothing to release
1821 if (!empty && logical.empty()) {
1822 return false;
1823 }
1824
1825 bluestore_blob_t& b = dirty_blob();
1826 return b.release_extents(empty, logical, r);
1827}
1828
1829 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
1830 uint32_t target_blob_size,
1831 uint32_t b_offset,
1832 uint32_t *length0) {
1833 assert(min_alloc_size);
1834 assert(target_blob_size);
1835 if (!get_blob().is_mutable()) {
1836 return false;
1837 }
1838
1839 uint32_t length = *length0;
1840 uint32_t end = b_offset + length;
1841
1842 // Currently for the sake of simplicity we omit blob reuse if data is
1843 // unaligned with csum chunk. Later we can perform padding if needed.
1844 if (get_blob().has_csum() &&
1845 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1846 (end % get_blob().get_csum_chunk_size()) != 0)) {
1847 return false;
1848 }
1849
1850 auto blen = get_blob().get_logical_length();
1851 uint32_t new_blen = blen;
1852
1853 // make sure target_blob_size isn't less than current blob len
1854 target_blob_size = MAX(blen, target_blob_size);
1855
1856 if (b_offset >= blen) {
1857 // new data totally stands out of the existing blob
1858 new_blen = end;
1859 } else {
1860 // new data overlaps with the existing blob
1861 new_blen = MAX(blen, end);
1862
1863 uint32_t overlap = 0;
1864 if (new_blen > blen) {
1865 overlap = blen - b_offset;
1866 } else {
1867 overlap = length;
1868 }
1869
1870 if (!get_blob().is_unallocated(b_offset, overlap)) {
1871 // abort if any piece of the overlap has already been allocated
1872 return false;
1873 }
1874 }
1875
1876 if (new_blen > blen) {
1877 int64_t overflow = int64_t(new_blen) - target_blob_size;
1878 // Unable to decrease the provided length to fit into max_blob_size
1879 if (overflow >= length) {
1880 return false;
1881 }
1882
1883 // FIXME: in some cases we could reduce unused resolution
1884 if (get_blob().has_unused()) {
1885 return false;
1886 }
1887
1888 if (overflow > 0) {
1889 new_blen -= overflow;
1890 length -= overflow;
1891 *length0 = length;
1892 }
1893
1894 if (new_blen > blen) {
1895 dirty_blob().add_tail(new_blen);
1896 used_in_blob.add_tail(new_blen,
1897 get_blob().get_release_size(min_alloc_size));
1898 }
1899 }
1900 return true;
1901}
1902
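// Worked example for the tail-extension path above (illustrative numbers,
// assuming the overlapped region is still unallocated and the blob tracks no
// "unused" bitmap): with blen = 0x8000, target_blob_size = 0x10000,
// b_offset = 0x7000 and length = 0xa000, end = 0x11000 so new_blen = 0x11000
// and overflow = 0x1000.  The request is trimmed to length = 0x9000,
// new_blen becomes 0x10000, the blob grows a tail to that size, and the first
// 0x9000 bytes of the write reuse this blob.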
1903void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1904{
1905 auto cct = coll->store->cct; //used by dout
1906 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1907 << " start " << *this << dendl;
1908 assert(blob.can_split());
1909 assert(used_in_blob.can_split());
1910 bluestore_blob_t &lb = dirty_blob();
1911 bluestore_blob_t &rb = r->dirty_blob();
1912
1913 used_in_blob.split(
1914 blob_offset,
1915 &(r->used_in_blob));
1916
1917 lb.split(blob_offset, rb);
1918 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1919
1920 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1921 << " finish " << *this << dendl;
1922 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1923 << " and " << *r << dendl;
1924}
1925
1926#ifndef CACHE_BLOB_BL
1927void BlueStore::Blob::decode(
1928 Collection *coll,
1929 bufferptr::iterator& p,
1930 uint64_t struct_v,
1931 uint64_t* sbid,
1932 bool include_ref_map)
1933{
1934 denc(blob, p, struct_v);
1935 if (blob.is_shared()) {
1936 denc(*sbid, p);
1937 }
1938 if (include_ref_map) {
1939 if (struct_v > 1) {
1940 used_in_blob.decode(p);
1941 } else {
1942 used_in_blob.clear();
1943 bluestore_extent_ref_map_t legacy_ref_map;
1944 legacy_ref_map.decode(p);
1945 for (auto r : legacy_ref_map.ref_map) {
1946 get_ref(
1947 coll,
1948 r.first,
1949 r.second.refs * r.second.length);
1950 }
1951 }
1952 }
1953}
1954#endif
1955
1956// Extent
1957
1958ostream& operator<<(ostream& out, const BlueStore::Extent& e)
1959{
1960 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
1961 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
1962 << " " << *e.blob;
1963}
1964
1965// OldExtent
1966BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
1967 uint32_t lo,
1968 uint32_t o,
1969 uint32_t l,
1970 BlobRef& b) {
1971 OldExtent* oe = new OldExtent(lo, o, l, b);
1972 b->put_ref(c.get(), o, l, &(oe->r));
1973 oe->blob_empty = b->get_referenced_bytes() == 0;
1974 return oe;
1975}
1976
1977// ExtentMap
1978
1979#undef dout_prefix
1980#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1981
1982BlueStore::ExtentMap::ExtentMap(Onode *o)
1983 : onode(o),
1984 inline_bl(
1985 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
1986}
1987
1988void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
1989 bool force)
1990{
1991 auto cct = onode->c->store->cct; //used by dout
1992 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
1993 if (onode->onode.extent_map_shards.empty()) {
1994 if (inline_bl.length() == 0) {
1995 unsigned n;
1996 // we need to encode inline_bl to measure encoded length
1997 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
3efd9988 1998 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
7c673cae
FG
1999 assert(!never_happen);
2000 size_t len = inline_bl.length();
2001 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2002 << " extents" << dendl;
2003 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2004 request_reshard(0, OBJECT_MAX_SIZE);
2005 return;
2006 }
2007 }
2008    // the inline extent map will persist in the onode key.
2009 } else {
2010 // pending shard update
2011 struct dirty_shard_t {
2012 Shard *shard;
2013 bufferlist bl;
2014 dirty_shard_t(Shard *s) : shard(s) {}
2015 };
2016 vector<dirty_shard_t> encoded_shards;
2017 // allocate slots for all shards in a single call instead of
2018    // doing multiple allocations - one per dirty shard
2019 encoded_shards.reserve(shards.size());
2020
2021 auto p = shards.begin();
2022 auto prev_p = p;
2023 while (p != shards.end()) {
31f18b77 2024 assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2025 auto n = p;
2026 ++n;
2027 if (p->dirty) {
2028 uint32_t endoff;
2029 if (n == shards.end()) {
2030 endoff = OBJECT_MAX_SIZE;
2031 } else {
2032 endoff = n->shard_info->offset;
2033 }
2034 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2035 bufferlist& bl = encoded_shards.back().bl;
2036 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2037 bl, &p->extents)) {
2038 if (force) {
2039 derr << __func__ << " encode_some needs reshard" << dendl;
2040 assert(!force);
2041 }
2042 }
2043 size_t len = bl.length();
2044
2045 dout(20) << __func__ << " shard 0x" << std::hex
2046 << p->shard_info->offset << std::dec << " is " << len
2047 << " bytes (was " << p->shard_info->bytes << ") from "
2048 << p->extents << " extents" << dendl;
2049
2050 if (!force) {
2051 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2052 // we are big; reshard ourselves
2053 request_reshard(p->shard_info->offset, endoff);
2054 }
2055 // avoid resharding the trailing shard, even if it is small
2056 else if (n != shards.end() &&
2057 len < g_conf->bluestore_extent_map_shard_min_size) {
31f18b77
FG
2058 assert(endoff != OBJECT_MAX_SIZE);
2059 if (p == shards.begin()) {
2060 // we are the first shard, combine with next shard
7c673cae 2061 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2062 } else {
31f18b77
FG
2063 // combine either with the previous shard or the next,
2064 // whichever is smaller
7c673cae
FG
2065 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2066 request_reshard(p->shard_info->offset, endoff + 1);
2067 } else {
2068 request_reshard(prev_p->shard_info->offset, endoff);
2069 }
2070 }
2071 }
2072 }
2073 }
2074 prev_p = p;
2075 p = n;
2076 }
2077 if (needs_reshard()) {
2078 return;
2079 }
2080
2081 // schedule DB update for dirty shards
2082 string key;
2083 for (auto& it : encoded_shards) {
2084 it.shard->dirty = false;
2085 it.shard->shard_info->bytes = it.bl.length();
2086 generate_extent_shard_key_and_apply(
2087 onode->key,
2088 it.shard->shard_info->offset,
2089 &key,
2090 [&](const string& final_key) {
2091 t->set(PREFIX_OBJ, final_key, it.bl);
2092 }
2093 );
2094 }
2095 }
2096}
2097
31f18b77
FG
2098bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2099{
2100 if (spanning_blob_map.empty())
2101 return 0;
2102 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2103 // bid is valid and available.
2104 if (bid >= 0)
2105 return bid;
2106  // Find the next unused bid.
2107 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2108 const auto begin_bid = bid;
2109 do {
2110 if (!spanning_blob_map.count(bid))
2111 return bid;
2112 else {
2113 bid++;
2114 if (bid < 0) bid = 0;
2115 }
2116 } while (bid != begin_bid);
2117 assert(0 == "no available blob id");
2118}
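// Note: a negative id means "not spanning" (reshard() below resets id to -1
// when a blob stops spanning shards), which is why the overflow path above
// must keep probing until it finds a non-negative, unused bid.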
2119
7c673cae
FG
2120void BlueStore::ExtentMap::reshard(
2121 KeyValueDB *db,
2122 KeyValueDB::Transaction t)
2123{
2124 auto cct = onode->c->store->cct; // used by dout
2125
2126 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2127 << needs_reshard_end << ")" << std::dec
2128 << " of " << onode->onode.extent_map_shards.size()
2129 << " shards on " << onode->oid << dendl;
2130 for (auto& p : spanning_blob_map) {
2131 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2132 << dendl;
2133 }
2134 // determine shard index range
2135 unsigned si_begin = 0, si_end = 0;
2136 if (!shards.empty()) {
2137 while (si_begin + 1 < shards.size() &&
2138 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2139 ++si_begin;
2140 }
2141 needs_reshard_begin = shards[si_begin].shard_info->offset;
2142 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2143 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2144 needs_reshard_end = shards[si_end].shard_info->offset;
2145 break;
2146 }
2147 }
2148 if (si_end == shards.size()) {
2149 needs_reshard_end = OBJECT_MAX_SIZE;
2150 }
2151 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2152 << " over 0x[" << std::hex << needs_reshard_begin << ","
2153 << needs_reshard_end << ")" << std::dec << dendl;
2154 }
2155
181888fb 2156 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
7c673cae
FG
2157
2158  // we may need to fault in a larger interval later; we must have all
2159  // referring extents for spanning blobs loaded in order to have
2160  // accurate use_tracker values.
2161 uint32_t spanning_scan_begin = needs_reshard_begin;
2162 uint32_t spanning_scan_end = needs_reshard_end;
2163
2164 // remove old keys
2165 string key;
2166 for (unsigned i = si_begin; i < si_end; ++i) {
2167 generate_extent_shard_key_and_apply(
2168 onode->key, shards[i].shard_info->offset, &key,
2169 [&](const string& final_key) {
2170 t->rmkey(PREFIX_OBJ, final_key);
2171 }
2172 );
2173 }
2174
2175 // calculate average extent size
2176 unsigned bytes = 0;
2177 unsigned extents = 0;
2178 if (onode->onode.extent_map_shards.empty()) {
2179 bytes = inline_bl.length();
2180 extents = extent_map.size();
2181 } else {
2182 for (unsigned i = si_begin; i < si_end; ++i) {
2183 bytes += shards[i].shard_info->bytes;
2184 extents += shards[i].extents;
2185 }
2186 }
2187 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2188 unsigned slop = target *
2189 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2190 unsigned extent_avg = bytes / MAX(1, extents);
2191 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2192 << ", slop " << slop << dendl;
2193
2194 // reshard
2195 unsigned estimate = 0;
31f18b77 2196 unsigned offset = needs_reshard_begin;
7c673cae
FG
2197 vector<bluestore_onode_t::shard_info> new_shard_info;
2198 unsigned max_blob_end = 0;
2199 Extent dummy(needs_reshard_begin);
2200 for (auto e = extent_map.lower_bound(dummy);
2201 e != extent_map.end();
2202 ++e) {
2203 if (e->logical_offset >= needs_reshard_end) {
2204 break;
2205 }
2206 dout(30) << " extent " << *e << dendl;
2207
2208 // disfavor shard boundaries that span a blob
2209 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2210 if (estimate &&
2211 estimate + extent_avg > target + (would_span ? slop : 0)) {
2212 // new shard
31f18b77 2213 if (offset == needs_reshard_begin) {
7c673cae
FG
2214 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2215 new_shard_info.back().offset = offset;
2216 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2217 << std::dec << dendl;
7c673cae
FG
2218 }
2219 offset = e->logical_offset;
2220 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2221 new_shard_info.back().offset = offset;
2222 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2223 << std::dec << dendl;
2224 estimate = 0;
2225 }
2226 estimate += extent_avg;
31f18b77
FG
2227 unsigned bs = e->blob_start();
2228 if (bs < spanning_scan_begin) {
2229 spanning_scan_begin = bs;
7c673cae
FG
2230 }
2231 uint32_t be = e->blob_end();
2232 if (be > max_blob_end) {
2233 max_blob_end = be;
2234 }
2235 if (be > spanning_scan_end) {
2236 spanning_scan_end = be;
2237 }
2238 }
2239 if (new_shard_info.empty() && (si_begin > 0 ||
2240 si_end < shards.size())) {
2241 // we resharded a partial range; we must produce at least one output
2242 // shard
2243 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2244 new_shard_info.back().offset = needs_reshard_begin;
2245 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2246 << std::dec << " (singleton degenerate case)" << dendl;
2247 }
2248
2249 auto& sv = onode->onode.extent_map_shards;
2250 dout(20) << __func__ << " new " << new_shard_info << dendl;
2251 dout(20) << __func__ << " old " << sv << dendl;
2252 if (sv.empty()) {
2253 // no old shards to keep
2254 sv.swap(new_shard_info);
2255 init_shards(true, true);
2256 } else {
2257 // splice in new shards
2258 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2259 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2260 sv.insert(
2261 sv.begin() + si_begin,
2262 new_shard_info.begin(),
2263 new_shard_info.end());
2264 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2265 si_end = si_begin + new_shard_info.size();
31f18b77
FG
2266
2267 assert(sv.size() == shards.size());
2268
2269 // note that we need to update every shard_info of shards here,
2270 // as sv might have been totally re-allocated above
2271 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2272 shards[i].shard_info = &sv[i];
31f18b77
FG
2273 }
2274
2275 // mark newly added shards as dirty
2276 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2277 shards[i].loaded = true;
2278 shards[i].dirty = true;
2279 }
7c673cae
FG
2280 }
2281 dout(20) << __func__ << " fin " << sv << dendl;
2282 inline_bl.clear();
2283
2284 if (sv.empty()) {
2285 // no more shards; unspan all previously spanning blobs
2286 auto p = spanning_blob_map.begin();
2287 while (p != spanning_blob_map.end()) {
2288 p->second->id = -1;
2289 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2290 p = spanning_blob_map.erase(p);
2291 }
2292 } else {
2293 // identify new spanning blobs
2294 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2295 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2296 if (spanning_scan_begin < needs_reshard_begin) {
2297 fault_range(db, spanning_scan_begin,
2298 needs_reshard_begin - spanning_scan_begin);
2299 }
2300 if (spanning_scan_end > needs_reshard_end) {
2301 fault_range(db, needs_reshard_end,
31f18b77 2302 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2303 }
2304 auto sp = sv.begin() + si_begin;
2305 auto esp = sv.end();
2306 unsigned shard_start = sp->offset;
2307 unsigned shard_end;
2308 ++sp;
2309 if (sp == esp) {
2310 shard_end = OBJECT_MAX_SIZE;
2311 } else {
2312 shard_end = sp->offset;
2313 }
7c673cae
FG
2314 Extent dummy(needs_reshard_begin);
2315 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2316 if (e->logical_offset >= needs_reshard_end) {
2317 break;
2318 }
2319 dout(30) << " extent " << *e << dendl;
2320 while (e->logical_offset >= shard_end) {
2321 shard_start = shard_end;
2322 assert(sp != esp);
2323 ++sp;
2324 if (sp == esp) {
2325 shard_end = OBJECT_MAX_SIZE;
2326 } else {
2327 shard_end = sp->offset;
2328 }
2329 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2330 << " to 0x" << shard_end << std::dec << dendl;
2331 }
2332 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2333 if (!e->blob->is_spanning()) {
2334 // We have two options: (1) split the blob into pieces at the
2335 // shard boundaries (and adjust extents accordingly), or (2)
2336 // mark it spanning. We prefer to cut the blob if we can. Note that
2337 // we may have to split it multiple times--potentially at every
2338 // shard boundary.
2339 bool must_span = false;
2340 BlobRef b = e->blob;
2341 if (b->can_split()) {
2342 uint32_t bstart = e->blob_start();
2343 uint32_t bend = e->blob_end();
2344 for (const auto& sh : shards) {
2345 if (bstart < sh.shard_info->offset &&
2346 bend > sh.shard_info->offset) {
2347 uint32_t blob_offset = sh.shard_info->offset - bstart;
2348 if (b->can_split_at(blob_offset)) {
2349 dout(20) << __func__ << " splitting blob, bstart 0x"
2350 << std::hex << bstart << " blob_offset 0x"
2351 << blob_offset << std::dec << " " << *b << dendl;
2352 b = split_blob(b, blob_offset, sh.shard_info->offset);
2353 // switch b to the new right-hand side, in case it
2354 // *also* has to get split.
2355 bstart += blob_offset;
2356 onode->c->store->logger->inc(l_bluestore_blob_split);
2357 } else {
2358 must_span = true;
2359 break;
2360 }
2361 }
2362 }
2363 } else {
2364 must_span = true;
2365 }
2366 if (must_span) {
31f18b77
FG
2367 auto bid = allocate_spanning_blob_id();
2368 b->id = bid;
7c673cae
FG
2369 spanning_blob_map[b->id] = b;
2370 dout(20) << __func__ << " adding spanning " << *b << dendl;
2371 }
2372 }
2373 } else {
2374 if (e->blob->is_spanning()) {
2375 spanning_blob_map.erase(e->blob->id);
2376 e->blob->id = -1;
2377 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2378 }
2379 }
2380 }
2381 }
2382
2383 clear_needs_reshard();
2384}
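// Sizing example (hypothetical numbers): with
// bluestore_extent_map_shard_target_size = 500, a slop factor of 0.2
// (i.e. slop = 100) and extent_avg = 50 bytes, a shard boundary is normally
// cut after roughly 10 extents; if that boundary would fall inside a blob
// (would_span), the extra slop lets the shard run about two extents longer so
// the cut can land between blobs instead.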
2385
2386bool BlueStore::ExtentMap::encode_some(
2387 uint32_t offset,
2388 uint32_t length,
2389 bufferlist& bl,
2390 unsigned *pn)
2391{
2392 auto cct = onode->c->store->cct; //used by dout
2393 Extent dummy(offset);
2394 auto start = extent_map.lower_bound(dummy);
2395 uint32_t end = offset + length;
2396
2397 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2398 // serialization only. Hence there is no specific
2399 // handling at ExtentMap level.
2400
2401 unsigned n = 0;
2402 size_t bound = 0;
7c673cae
FG
2403 bool must_reshard = false;
2404 for (auto p = start;
2405 p != extent_map.end() && p->logical_offset < end;
2406 ++p, ++n) {
2407 assert(p->logical_offset >= offset);
2408 p->blob->last_encoded_id = -1;
2409 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2410 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2411 << std::dec << " hit new spanning blob " << *p << dendl;
2412 request_reshard(p->blob_start(), p->blob_end());
2413 must_reshard = true;
2414 }
31f18b77
FG
2415 if (!must_reshard) {
2416 denc_varint(0, bound); // blobid
2417 denc_varint(0, bound); // logical_offset
2418 denc_varint(0, bound); // len
2419 denc_varint(0, bound); // blob_offset
7c673cae 2420
31f18b77
FG
2421 p->blob->bound_encode(
2422 bound,
2423 struct_v,
2424 p->blob->shared_blob->get_sbid(),
2425 false);
2426 }
7c673cae
FG
2427 }
2428 if (must_reshard) {
2429 return true;
2430 }
2431
31f18b77
FG
2432 denc(struct_v, bound);
2433 denc_varint(0, bound); // number of extents
2434
7c673cae
FG
2435 {
2436 auto app = bl.get_contiguous_appender(bound);
2437 denc(struct_v, app);
2438 denc_varint(n, app);
2439 if (pn) {
2440 *pn = n;
2441 }
2442
2443 n = 0;
2444 uint64_t pos = 0;
2445 uint64_t prev_len = 0;
2446 for (auto p = start;
2447 p != extent_map.end() && p->logical_offset < end;
2448 ++p, ++n) {
2449 unsigned blobid;
2450 bool include_blob = false;
2451 if (p->blob->is_spanning()) {
2452 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2453 blobid |= BLOBID_FLAG_SPANNING;
2454 } else if (p->blob->last_encoded_id < 0) {
2455 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2456 include_blob = true;
2457 blobid = 0; // the decoder will infer the id from n
2458 } else {
2459 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2460 }
2461 if (p->logical_offset == pos) {
2462 blobid |= BLOBID_FLAG_CONTIGUOUS;
2463 }
2464 if (p->blob_offset == 0) {
2465 blobid |= BLOBID_FLAG_ZEROOFFSET;
2466 }
2467 if (p->length == prev_len) {
2468 blobid |= BLOBID_FLAG_SAMELENGTH;
2469 } else {
2470 prev_len = p->length;
2471 }
2472 denc_varint(blobid, app);
2473 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2474 denc_varint_lowz(p->logical_offset - pos, app);
2475 }
2476 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2477 denc_varint_lowz(p->blob_offset, app);
2478 }
2479 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2480 denc_varint_lowz(p->length, app);
2481 }
2482 pos = p->logical_end();
2483 if (include_blob) {
2484 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2485 }
2486 }
2487 }
2488 /*derr << __func__ << bl << dendl;
2489 derr << __func__ << ":";
2490 bl.hexdump(*_dout);
2491 *_dout << dendl;
2492 */
2493 return false;
2494}
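// Encoding example: a spanning blob with id 3 whose extent starts where the
// previous one ended, has blob_offset 0 and a new length is encoded as
// blobid = (3 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_SPANNING |
//          BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET = 0x3b,
// followed only by the varint-lowz length; the logical-offset delta and
// blob_offset are omitted, and the blob body is not re-encoded here because
// spanning blobs are persisted with the onode via encode_spanning_blobs().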
2495
2496unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2497{
2498 auto cct = onode->c->store->cct; //used by dout
2499 /*
2500 derr << __func__ << ":";
2501 bl.hexdump(*_dout);
2502 *_dout << dendl;
2503 */
2504
2505 assert(bl.get_num_buffers() <= 1);
2506 auto p = bl.front().begin_deep();
2507 __u8 struct_v;
2508 denc(struct_v, p);
2509 // Version 2 differs from v1 in blob's ref_map
2510 // serialization only. Hence there is no specific
2511 // handling at ExtentMap level below.
2512 assert(struct_v == 1 || struct_v == 2);
2513
2514 uint32_t num;
2515 denc_varint(num, p);
2516 vector<BlobRef> blobs(num);
2517 uint64_t pos = 0;
2518 uint64_t prev_len = 0;
2519 unsigned n = 0;
2520
2521 while (!p.end()) {
2522 Extent *le = new Extent();
2523 uint64_t blobid;
2524 denc_varint(blobid, p);
2525 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2526 uint64_t gap;
2527 denc_varint_lowz(gap, p);
2528 pos += gap;
2529 }
2530 le->logical_offset = pos;
2531 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2532 denc_varint_lowz(le->blob_offset, p);
2533 } else {
2534 le->blob_offset = 0;
2535 }
2536 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2537 denc_varint_lowz(prev_len, p);
2538 }
2539 le->length = prev_len;
2540
2541 if (blobid & BLOBID_FLAG_SPANNING) {
2542 dout(30) << __func__ << " getting spanning blob "
2543 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2544 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2545 } else {
2546 blobid >>= BLOBID_SHIFT_BITS;
2547 if (blobid) {
2548 le->assign_blob(blobs[blobid - 1]);
2549 assert(le->blob);
2550 } else {
2551 Blob *b = new Blob();
2552 uint64_t sbid = 0;
2553 b->decode(onode->c, p, struct_v, &sbid, false);
2554 blobs[n] = b;
2555 onode->c->open_shared_blob(sbid, b);
2556 le->assign_blob(b);
2557 }
2558 // we build ref_map dynamically for non-spanning blobs
2559 le->blob->get_ref(
2560 onode->c,
2561 le->blob_offset,
2562 le->length);
2563 }
2564 pos += prev_len;
2565 ++n;
2566 extent_map.insert(*le);
2567 }
2568
2569 assert(n == num);
2570 return num;
2571}
2572
2573void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2574{
2575 // Version 2 differs from v1 in blob's ref_map
2576 // serialization only. Hence there is no specific
2577 // handling at ExtentMap level.
2578 __u8 struct_v = 2;
2579
2580 denc(struct_v, p);
2581 denc_varint((uint32_t)0, p);
2582 size_t key_size = 0;
2583 denc_varint((uint32_t)0, key_size);
2584 p += spanning_blob_map.size() * key_size;
2585 for (const auto& i : spanning_blob_map) {
2586 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2587 }
2588}
2589
2590void BlueStore::ExtentMap::encode_spanning_blobs(
2591 bufferlist::contiguous_appender& p)
2592{
2593 // Version 2 differs from v1 in blob's ref_map
2594 // serialization only. Hence there is no specific
2595 // handling at ExtentMap level.
2596 __u8 struct_v = 2;
2597
2598 denc(struct_v, p);
2599 denc_varint(spanning_blob_map.size(), p);
2600 for (auto& i : spanning_blob_map) {
2601 denc_varint(i.second->id, p);
2602 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2603 }
2604}
2605
2606void BlueStore::ExtentMap::decode_spanning_blobs(
2607 bufferptr::iterator& p)
2608{
2609 __u8 struct_v;
2610 denc(struct_v, p);
2611 // Version 2 differs from v1 in blob's ref_map
2612 // serialization only. Hence there is no specific
2613 // handling at ExtentMap level.
2614 assert(struct_v == 1 || struct_v == 2);
2615
2616 unsigned n;
2617 denc_varint(n, p);
2618 while (n--) {
2619 BlobRef b(new Blob());
2620 denc_varint(b->id, p);
2621 spanning_blob_map[b->id] = b;
2622 uint64_t sbid = 0;
2623 b->decode(onode->c, p, struct_v, &sbid, true);
2624 onode->c->open_shared_blob(sbid, b);
2625 }
2626}
2627
2628void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2629{
2630 shards.resize(onode->onode.extent_map_shards.size());
2631 unsigned i = 0;
2632 for (auto &s : onode->onode.extent_map_shards) {
2633 shards[i].shard_info = &s;
2634 shards[i].loaded = loaded;
2635 shards[i].dirty = dirty;
2636 ++i;
2637 }
2638}
2639
2640void BlueStore::ExtentMap::fault_range(
2641 KeyValueDB *db,
2642 uint32_t offset,
2643 uint32_t length)
2644{
2645 auto cct = onode->c->store->cct; //used by dout
2646 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2647 << std::dec << dendl;
2648 auto start = seek_shard(offset);
2649 auto last = seek_shard(offset + length);
2650
2651 if (start < 0)
2652 return;
2653
2654 assert(last >= start);
2655 string key;
2656 while (start <= last) {
2657 assert((size_t)start < shards.size());
2658 auto p = &shards[start];
2659 if (!p->loaded) {
2660 dout(30) << __func__ << " opening shard 0x" << std::hex
2661 << p->shard_info->offset << std::dec << dendl;
2662 bufferlist v;
2663 generate_extent_shard_key_and_apply(
2664 onode->key, p->shard_info->offset, &key,
2665 [&](const string& final_key) {
2666 int r = db->get(PREFIX_OBJ, final_key, &v);
2667 if (r < 0) {
2668 derr << __func__ << " missing shard 0x" << std::hex
2669 << p->shard_info->offset << std::dec << " for " << onode->oid
2670 << dendl;
2671 assert(r >= 0);
2672 }
2673 }
2674 );
2675 p->extents = decode_some(v);
2676 p->loaded = true;
2677 dout(20) << __func__ << " open shard 0x" << std::hex
2678 << p->shard_info->offset << std::dec
2679 << " (" << v.length() << " bytes)" << dendl;
2680 assert(p->dirty == false);
2681 assert(v.length() == p->shard_info->bytes);
2682 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2683 } else {
2684 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2685 }
2686 ++start;
2687 }
2688}
2689
2690void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
2691 uint32_t offset,
2692 uint32_t length)
2693{
2694 auto cct = onode->c->store->cct; //used by dout
2695 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2696 << std::dec << dendl;
2697 if (shards.empty()) {
2698 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2699 inline_bl.clear();
2700 return;
2701 }
2702 auto start = seek_shard(offset);
2703 auto last = seek_shard(offset + length);
2704 if (start < 0)
2705 return;
2706
2707 assert(last >= start);
2708 while (start <= last) {
2709 assert((size_t)start < shards.size());
2710 auto p = &shards[start];
2711 if (!p->loaded) {
2712 dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
2713 << std::dec << " is not loaded, can't mark dirty" << dendl;
2714 assert(0 == "can't mark unloaded shard dirty");
2715 }
2716 if (!p->dirty) {
2717 dout(20) << __func__ << " mark shard 0x" << std::hex
2718 << p->shard_info->offset << std::dec << " dirty" << dendl;
2719 p->dirty = true;
2720 }
2721 ++start;
2722 }
2723}
2724
2725BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2726 uint64_t offset)
2727{
2728 Extent dummy(offset);
2729 return extent_map.find(dummy);
2730}
2731
7c673cae
FG
2732BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2733 uint64_t offset)
2734{
2735 Extent dummy(offset);
2736 auto fp = extent_map.lower_bound(dummy);
2737 if (fp != extent_map.begin()) {
2738 --fp;
2739 if (fp->logical_end() <= offset) {
2740 ++fp;
2741 }
2742 }
2743 return fp;
2744}
2745
2746BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2747 uint64_t offset) const
2748{
2749 Extent dummy(offset);
2750 auto fp = extent_map.lower_bound(dummy);
2751 if (fp != extent_map.begin()) {
2752 --fp;
2753 if (fp->logical_end() <= offset) {
2754 ++fp;
2755 }
2756 }
2757 return fp;
2758}
2759
2760bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2761{
2762 auto fp = seek_lextent(offset);
2763 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2764 return false;
2765 }
2766 return true;
2767}
2768
2769int BlueStore::ExtentMap::compress_extent_map(
2770 uint64_t offset,
2771 uint64_t length)
2772{
2773 auto cct = onode->c->store->cct; //used by dout
2774 if (extent_map.empty())
2775 return 0;
2776 int removed = 0;
2777 auto p = seek_lextent(offset);
2778 if (p != extent_map.begin()) {
2779 --p; // start to the left of offset
2780 }
2781 // the caller should have just written to this region
2782 assert(p != extent_map.end());
2783
2784 // identify the *next* shard
2785 auto pshard = shards.begin();
2786 while (pshard != shards.end() &&
2787 p->logical_offset >= pshard->shard_info->offset) {
2788 ++pshard;
2789 }
2790 uint64_t shard_end;
2791 if (pshard != shards.end()) {
2792 shard_end = pshard->shard_info->offset;
2793 } else {
2794 shard_end = OBJECT_MAX_SIZE;
2795 }
2796
2797 auto n = p;
2798 for (++n; n != extent_map.end(); p = n++) {
2799 if (n->logical_offset > offset + length) {
2800 break; // stop after end
2801 }
2802 while (n != extent_map.end() &&
2803 p->logical_end() == n->logical_offset &&
2804 p->blob == n->blob &&
2805 p->blob_offset + p->length == n->blob_offset &&
2806 n->logical_offset < shard_end) {
2807 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2808 << " next shard 0x" << shard_end << std::dec
2809 << " merging " << *p << " and " << *n << dendl;
2810 p->length += n->length;
2811 rm(n++);
2812 ++removed;
2813 }
2814 if (n == extent_map.end()) {
2815 break;
2816 }
2817 if (n->logical_offset >= shard_end) {
2818 assert(pshard != shards.end());
2819 ++pshard;
2820 if (pshard != shards.end()) {
2821 shard_end = pshard->shard_info->offset;
2822 } else {
2823 shard_end = OBJECT_MAX_SIZE;
2824 }
2825 }
2826 }
2827 if (removed && onode) {
2828 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2829 }
2830 return removed;
2831}
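// Merge example: two lextents are combined above only when they are adjacent
// both logically and within the same blob and do not straddle a shard
// boundary, e.g. 0x0~0x1000 -> blob A @0x0 and 0x1000~0x1000 -> blob A
// @0x1000 collapse into 0x0~0x2000 -> blob A @0x0, provided no shard starts
// at 0x1000.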
2832
2833void BlueStore::ExtentMap::punch_hole(
2834 CollectionRef &c,
2835 uint64_t offset,
2836 uint64_t length,
2837 old_extent_map_t *old_extents)
2838{
2839 auto p = seek_lextent(offset);
2840 uint64_t end = offset + length;
2841 while (p != extent_map.end()) {
2842 if (p->logical_offset >= end) {
2843 break;
2844 }
2845 if (p->logical_offset < offset) {
2846 if (p->logical_end() > end) {
2847 // split and deref middle
2848 uint64_t front = offset - p->logical_offset;
2849 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
2850 length, p->blob);
2851 old_extents->push_back(*oe);
2852 add(end,
2853 p->blob_offset + front + length,
2854 p->length - front - length,
2855 p->blob);
2856 p->length = front;
2857 break;
2858 } else {
2859 // deref tail
2860 assert(p->logical_end() > offset); // else seek_lextent bug
2861 uint64_t keep = offset - p->logical_offset;
2862 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
2863 p->length - keep, p->blob);
2864 old_extents->push_back(*oe);
2865 p->length = keep;
2866 ++p;
2867 continue;
2868 }
2869 }
2870 if (p->logical_offset + p->length <= end) {
2871 // deref whole lextent
2872 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2873 p->length, p->blob);
2874 old_extents->push_back(*oe);
2875 rm(p++);
2876 continue;
2877 }
2878 // deref head
2879 uint64_t keep = p->logical_end() - end;
2880 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2881 p->length - keep, p->blob);
2882 old_extents->push_back(*oe);
2883
2884 add(end, p->blob_offset + p->length - keep, keep, p->blob);
2885 rm(p);
2886 break;
2887 }
2888}
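// Example: punching 0x1000~0x2000 out of a single lextent 0x0~0x4000
// (blob_offset 0) takes the first branch above: the head keeps 0x0~0x1000,
// the punched middle goes onto old_extents for later deref, and a new lextent
// is re-added at logical 0x3000 with blob_offset 0x3000 for the surviving
// tail.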
2889
2890BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
2891 CollectionRef &c,
2892 uint64_t logical_offset,
2893 uint64_t blob_offset, uint64_t length, BlobRef b,
2894 old_extent_map_t *old_extents)
2895{
2896  // We need a completely initialized Blob to increment its ref counters.
2897 assert(b->get_blob().get_logical_length() != 0);
2898
2899  // Do get_ref prior to punch_hole to prevent putting a reused blob into the
2900  // old_extents list if we overwrite the blob totally.
2901  // This might happen during a WAL overwrite.
2902 b->get_ref(onode->c, blob_offset, length);
2903
2904 if (old_extents) {
2905 punch_hole(c, logical_offset, length, old_extents);
2906 }
2907
2908 Extent *le = new Extent(logical_offset, blob_offset, length, b);
2909 extent_map.insert(*le);
2910 if (spans_shard(logical_offset, length)) {
2911 request_reshard(logical_offset, logical_offset + length);
2912 }
2913 return le;
2914}
2915
2916BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
2917 BlobRef lb,
2918 uint32_t blob_offset,
2919 uint32_t pos)
2920{
2921 auto cct = onode->c->store->cct; //used by dout
2922
2923 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
2924 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
2925 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
2926 << dendl;
2927 BlobRef rb = onode->c->new_blob();
2928 lb->split(onode->c, blob_offset, rb.get());
2929
2930 for (auto ep = seek_lextent(pos);
2931 ep != extent_map.end() && ep->logical_offset < end_pos;
2932 ++ep) {
2933 if (ep->blob != lb) {
2934 continue;
2935 }
2936 if (ep->logical_offset < pos) {
2937 // split extent
2938 size_t left = pos - ep->logical_offset;
2939 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
2940 extent_map.insert(*ne);
2941 ep->length = left;
2942 dout(30) << __func__ << " split " << *ep << dendl;
2943 dout(30) << __func__ << " to " << *ne << dendl;
2944 } else {
2945 // switch blob
2946 assert(ep->blob_offset >= blob_offset);
2947
2948 ep->blob = rb;
2949 ep->blob_offset -= blob_offset;
2950 dout(30) << __func__ << " adjusted " << *ep << dendl;
2951 }
2952 }
2953 return rb;
2954}
2955
2956// Onode
2957
2958#undef dout_prefix
2959#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
2960
2961void BlueStore::Onode::flush()
2962{
2963 if (flushing_count.load()) {
2964 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
2965 std::unique_lock<std::mutex> l(flush_lock);
2966 while (flushing_count.load()) {
2967 flush_cond.wait(l);
2968 }
2969 }
2970 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
2971}
2972
2973// =======================================================
2974// WriteContext
2975
2976/// Checks for writes to the same pextent within a blob
2977bool BlueStore::WriteContext::has_conflict(
2978 BlobRef b,
2979 uint64_t loffs,
2980 uint64_t loffs_end,
2981 uint64_t min_alloc_size)
2982{
2983 assert((loffs % min_alloc_size) == 0);
2984 assert((loffs_end % min_alloc_size) == 0);
2985 for (auto w : writes) {
2986 if (b == w.b) {
2987 auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
224ce89b 2988 auto loffs2_end = P2ROUNDUP(w.logical_offset + w.length0, min_alloc_size);
7c673cae 2989 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 2990 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
2991 return true;
2992 }
2993 }
2994 }
2995 return false;
2996}
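// Example: with min_alloc_size = 0x1000 a queued write to blob b at logical
// 0x1800~0x400 is rounded out to [0x1000, 0x2000); a second write to the same
// blob whose aligned range overlaps that interval (say 0x1000~0x1000) is
// reported as a conflict because both would touch the same allocation unit of
// the blob.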
2997
2998// =======================================================
2999
3000// DeferredBatch
3001#undef dout_prefix
3002#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3003
3004void BlueStore::DeferredBatch::prepare_write(
3005 CephContext *cct,
3006 uint64_t seq, uint64_t offset, uint64_t length,
3007 bufferlist::const_iterator& blp)
3008{
3009 _discard(cct, offset, length);
3010 auto i = iomap.insert(make_pair(offset, deferred_io()));
3011 assert(i.second); // this should be a new insertion
3012 i.first->second.seq = seq;
3013 blp.copy(length, i.first->second.bl);
31f18b77
FG
3014 i.first->second.bl.reassign_to_mempool(
3015 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3016 dout(20) << __func__ << " seq " << seq
3017 << " 0x" << std::hex << offset << "~" << length
3018 << " crc " << i.first->second.bl.crc32c(-1)
3019 << std::dec << dendl;
3020 seq_bytes[seq] += length;
3021#ifdef DEBUG_DEFERRED
3022 _audit(cct);
3023#endif
3024}
3025
3026void BlueStore::DeferredBatch::_discard(
3027 CephContext *cct, uint64_t offset, uint64_t length)
3028{
3029 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3030 << std::dec << dendl;
3031 auto p = iomap.lower_bound(offset);
3032 if (p != iomap.begin()) {
3033 --p;
3034 auto end = p->first + p->second.bl.length();
3035 if (end > offset) {
3036 bufferlist head;
3037 head.substr_of(p->second.bl, 0, offset - p->first);
3038 dout(20) << __func__ << " keep head " << p->second.seq
3039 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3040 << " -> 0x" << head.length() << std::dec << dendl;
3041 auto i = seq_bytes.find(p->second.seq);
224ce89b 3042 assert(i != seq_bytes.end());
7c673cae
FG
3043 if (end > offset + length) {
3044 bufferlist tail;
3045 tail.substr_of(p->second.bl, offset + length - p->first,
3046 end - (offset + length));
3047 dout(20) << __func__ << " keep tail " << p->second.seq
3048 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3049 << " -> 0x" << tail.length() << std::dec << dendl;
3050 auto &n = iomap[offset + length];
3051 n.bl.swap(tail);
3052 n.seq = p->second.seq;
3053 i->second -= length;
3054 } else {
3055 i->second -= end - offset;
3056 }
224ce89b 3057 assert(i->second >= 0);
7c673cae
FG
3058 p->second.bl.swap(head);
3059 }
3060 ++p;
3061 }
3062 while (p != iomap.end()) {
3063 if (p->first >= offset + length) {
3064 break;
3065 }
3066 auto i = seq_bytes.find(p->second.seq);
224ce89b 3067 assert(i != seq_bytes.end());
7c673cae
FG
3068 auto end = p->first + p->second.bl.length();
3069 if (end > offset + length) {
3070 unsigned drop_front = offset + length - p->first;
3071 unsigned keep_tail = end - (offset + length);
3072 dout(20) << __func__ << " truncate front " << p->second.seq
3073 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3074 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3075 << " to 0x" << (offset + length) << "~" << keep_tail
3076 << std::dec << dendl;
3077 auto &s = iomap[offset + length];
3078 s.seq = p->second.seq;
3079 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3080 i->second -= drop_front;
3081 } else {
3082 dout(20) << __func__ << " drop " << p->second.seq
3083 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3084 << std::dec << dendl;
3085 i->second -= p->second.bl.length();
3086 }
224ce89b 3087 assert(i->second >= 0);
7c673cae
FG
3088 p = iomap.erase(p);
3089 }
3090}
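// Example: given a queued deferred io of 0x3000 bytes at offset 0x0,
// _discard(cct, 0x1000, 0x1000) keeps a 0x1000-byte head at 0x0, re-inserts
// the surviving 0x1000-byte tail under key 0x2000 with the same seq, and
// reduces that seq's entry in seq_bytes by the 0x1000 bytes dropped.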
3091
3092void BlueStore::DeferredBatch::_audit(CephContext *cct)
3093{
3094 map<uint64_t,int> sb;
3095 for (auto p : seq_bytes) {
3096 sb[p.first] = 0; // make sure we have the same set of keys
3097 }
3098 uint64_t pos = 0;
3099 for (auto& p : iomap) {
3100 assert(p.first >= pos);
3101 sb[p.second.seq] += p.second.bl.length();
3102 pos = p.first + p.second.bl.length();
3103 }
3104 assert(sb == seq_bytes);
3105}
3106
3107
3108// Collection
3109
3110#undef dout_prefix
3111#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3112
3113BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
3114 : store(ns),
3115 cache(c),
3116 cid(cid),
3117 lock("BlueStore::Collection::lock", true, false),
3118 exists(true),
3119 onode_map(c)
3120{
3121}
3122
3123void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3124{
3125 assert(!b->shared_blob);
3126 const bluestore_blob_t& blob = b->get_blob();
3127 if (!blob.is_shared()) {
3128 b->shared_blob = new SharedBlob(this);
3129 return;
3130 }
3131
3132 b->shared_blob = shared_blob_set.lookup(sbid);
3133 if (b->shared_blob) {
3134 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3135 << std::dec << " had " << *b->shared_blob << dendl;
3136 } else {
3137 b->shared_blob = new SharedBlob(sbid, this);
3138 shared_blob_set.add(this, b->shared_blob.get());
3139 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3140 << std::dec << " opened " << *b->shared_blob
3141 << dendl;
3142 }
3143}
3144
3145void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3146{
3147 if (!sb->is_loaded()) {
3148
3149 bufferlist v;
3150 string key;
3151 auto sbid = sb->get_sbid();
3152 get_shared_blob_key(sbid, &key);
3153 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3154 if (r < 0) {
3155 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3156 << std::dec << " not found at key "
3157 << pretty_binary_string(key) << dendl;
3158 assert(0 == "uh oh, missing shared_blob");
3159 }
3160
3161 sb->loaded = true;
3162 sb->persistent = new bluestore_shared_blob_t(sbid);
3163 bufferlist::iterator p = v.begin();
3164 ::decode(*(sb->persistent), p);
3165 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3166 << std::dec << " loaded shared_blob " << *sb << dendl;
3167 }
3168}
3169
3170void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3171{
7c673cae 3172 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
31f18b77 3173 assert(!b->shared_blob->is_loaded());
7c673cae
FG
3174
3175 // update blob
31f18b77 3176 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 3177 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
3178
3179 // update shared blob
3180 b->shared_blob->loaded = true;
3181 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3182 shared_blob_set.add(this, b->shared_blob.get());
3183 for (auto p : blob.get_extents()) {
3184 if (p.is_valid()) {
3185 b->shared_blob->get_ref(
3186 p.offset,
3187 p.length);
3188 }
3189 }
3190 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3191}
3192
31f18b77
FG
3193uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3194{
3195 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3196 assert(sb->is_loaded());
3197
3198 uint64_t sbid = sb->get_sbid();
3199 shared_blob_set.remove(sb);
3200 sb->loaded = false;
3201 delete sb->persistent;
3202 sb->sbid_unloaded = 0;
3203 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3204 return sbid;
3205}
3206
7c673cae
FG
3207BlueStore::OnodeRef BlueStore::Collection::get_onode(
3208 const ghobject_t& oid,
3209 bool create)
3210{
3211 assert(create ? lock.is_wlocked() : lock.is_locked());
3212
3213 spg_t pgid;
3214 if (cid.is_pg(&pgid)) {
3215 if (!oid.match(cnode.bits, pgid.ps())) {
3216 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3217 << pgid << " bits " << cnode.bits << dendl;
3218 ceph_abort();
3219 }
3220 }
3221
3222 OnodeRef o = onode_map.lookup(oid);
3223 if (o)
3224 return o;
3225
31f18b77 3226 mempool::bluestore_cache_other::string key;
7c673cae
FG
3227 get_object_key(store->cct, oid, &key);
3228
3229 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3230 << pretty_binary_string(key) << dendl;
3231
3232 bufferlist v;
3233 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3234 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3235 Onode *on;
3236 if (v.length() == 0) {
3237 assert(r == -ENOENT);
3238 if (!store->cct->_conf->bluestore_debug_misc &&
3239 !create)
3240 return OnodeRef();
3241
3242 // new object, new onode
3243 on = new Onode(this, oid, key);
3244 } else {
3245 // loaded
3246 assert(r >= 0);
3247 on = new Onode(this, oid, key);
3248 on->exists = true;
31f18b77 3249 bufferptr::iterator p = v.front().begin_deep();
7c673cae 3250 on->onode.decode(p);
3efd9988
FG
3251 for (auto& i : on->onode.attrs) {
3252 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3253 }
7c673cae
FG
3254
3255 // initialize extent_map
3256 on->extent_map.decode_spanning_blobs(p);
3257 if (on->onode.extent_map_shards.empty()) {
3258 denc(on->extent_map.inline_bl, p);
3259 on->extent_map.decode_some(on->extent_map.inline_bl);
3efd9988
FG
3260 on->extent_map.inline_bl.reassign_to_mempool(
3261 mempool::mempool_bluestore_cache_other);
7c673cae
FG
3262 } else {
3263 on->extent_map.init_shards(false, false);
3264 }
3265 }
3266 o.reset(on);
3267 return onode_map.add(oid, o);
3268}
3269
3270void BlueStore::Collection::split_cache(
3271 Collection *dest)
3272{
3273 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3274
3275 // lock (one or both) cache shards
3276 std::lock(cache->lock, dest->cache->lock);
3277 std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
3278 std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);
3279
3280 int destbits = dest->cnode.bits;
3281 spg_t destpg;
3282 bool is_pg = dest->cid.is_pg(&destpg);
3283 assert(is_pg);
3284
3285 auto p = onode_map.onode_map.begin();
3286 while (p != onode_map.onode_map.end()) {
3287 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3288 // onode does not belong to this child
3289 ++p;
3290 } else {
3291 OnodeRef o = p->second;
3292 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3293 << dendl;
3294
3295 cache->_rm_onode(p->second);
3296 p = onode_map.onode_map.erase(p);
3297
3298 o->c = dest;
3299 dest->cache->_add_onode(o, 1);
3300 dest->onode_map.onode_map[o->oid] = o;
3301 dest->onode_map.cache = dest->cache;
3302
3303 // move over shared blobs and buffers. cover shared blobs from
3304 // both extent map and spanning blob map (the full extent map
3305 // may not be faulted in)
3306 vector<SharedBlob*> sbvec;
3307 for (auto& e : o->extent_map.extent_map) {
3308 sbvec.push_back(e.blob->shared_blob.get());
3309 }
3310 for (auto& b : o->extent_map.spanning_blob_map) {
3311 sbvec.push_back(b.second->shared_blob.get());
3312 }
3313 for (auto sb : sbvec) {
3314 if (sb->coll == dest) {
3315 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3316 << dendl;
3317 continue;
3318 }
3319 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
31f18b77
FG
3320 if (sb->get_sbid()) {
3321 ldout(store->cct, 20) << __func__
3322 << " moving registration " << *sb << dendl;
3323 shared_blob_set.remove(sb);
3324 dest->shared_blob_set.add(dest, sb);
3325 }
3efd9988 3326 sb->coll = dest;
7c673cae 3327 if (dest->cache != cache) {
7c673cae
FG
3328 for (auto& i : sb->bc.buffer_map) {
3329 if (!i.second->is_writing()) {
3330 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3331 << dendl;
3332 dest->cache->_move_buffer(cache, i.second.get());
3333 }
3334 }
3335 }
3336 }
7c673cae
FG
3337 }
3338 }
3339}
3340
7c673cae
FG
3341// =======================================================
3342
3343void *BlueStore::MempoolThread::entry()
3344{
3345 Mutex::Locker l(lock);
3346 while (!stop) {
31f18b77
FG
3347 uint64_t meta_bytes =
3348 mempool::bluestore_cache_other::allocated_bytes() +
3349 mempool::bluestore_cache_onode::allocated_bytes();
3350 uint64_t onode_num =
3351 mempool::bluestore_cache_onode::allocated_items();
3352
3353 if (onode_num < 2) {
3354 onode_num = 2;
3355 }
3356
3357 float bytes_per_onode = (float)meta_bytes / (float)onode_num;
3358 size_t num_shards = store->cache_shards.size();
3359 float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
3360 // A little sloppy but should be close enough
224ce89b 3361 uint64_t shard_target = target_ratio * (store->cache_size / num_shards);
31f18b77
FG
3362
3363 for (auto i : store->cache_shards) {
3364 i->trim(shard_target,
3365 store->cache_meta_ratio,
3366 store->cache_data_ratio,
3367 bytes_per_onode);
3368 }
3369
3370 store->_update_cache_logger();
3371
7c673cae
FG
3372 utime_t wait;
3373 wait += store->cct->_conf->bluestore_cache_trim_interval;
3374 cond.WaitInterval(lock, wait);
3375 }
3376 stop = false;
3377 return NULL;
3378}
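// Trim example (hypothetical numbers): with cache_size = 1 GiB split over
// 4 cache shards and cache_meta_ratio + cache_data_ratio = 0.6, each shard is
// trimmed toward 0.6 * (1 GiB / 4) ~= 154 MiB every
// bluestore_cache_trim_interval seconds; bytes_per_onode is passed to each
// shard's trim() so it can size the onode cache from that byte budget.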
3379
3380// =======================================================
3381
31f18b77
FG
3382// OmapIteratorImpl
3383
3384#undef dout_prefix
3385#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3386
3387BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3388 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3389 : c(c), o(o), it(it)
3390{
3391 RWLock::RLocker l(c->lock);
3392 if (o->onode.has_omap()) {
3393 get_omap_key(o->onode.nid, string(), &head);
3394 get_omap_tail(o->onode.nid, &tail);
3395 it->lower_bound(head);
3396 }
3397}
3398
3399int BlueStore::OmapIteratorImpl::seek_to_first()
3400{
3401 RWLock::RLocker l(c->lock);
3402 if (o->onode.has_omap()) {
3403 it->lower_bound(head);
3404 } else {
3405 it = KeyValueDB::Iterator();
3406 }
3407 return 0;
3408}
3409
3410int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3411{
3412 RWLock::RLocker l(c->lock);
3413 if (o->onode.has_omap()) {
3414 string key;
3415 get_omap_key(o->onode.nid, after, &key);
3416 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3417 << pretty_binary_string(key) << dendl;
3418 it->upper_bound(key);
3419 } else {
3420 it = KeyValueDB::Iterator();
3421 }
3422 return 0;
3423}
3424
3425int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3426{
3427 RWLock::RLocker l(c->lock);
3428 if (o->onode.has_omap()) {
3429 string key;
3430 get_omap_key(o->onode.nid, to, &key);
3431 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3432 << pretty_binary_string(key) << dendl;
3433 it->lower_bound(key);
3434 } else {
3435 it = KeyValueDB::Iterator();
3436 }
3437 return 0;
3438}
3439
3440bool BlueStore::OmapIteratorImpl::valid()
3441{
3442 RWLock::RLocker l(c->lock);
3443 bool r = o->onode.has_omap() && it && it->valid() &&
3444 it->raw_key().second <= tail;
3445 if (it && it->valid()) {
3446 ldout(c->store->cct,20) << __func__ << " is at "
3447 << pretty_binary_string(it->raw_key().second)
3448 << dendl;
3449 }
3450 return r;
3451}
3452
3453int BlueStore::OmapIteratorImpl::next(bool validate)
3454{
3455 RWLock::RLocker l(c->lock);
3456 if (o->onode.has_omap()) {
3457 it->next();
3458 return 0;
3459 } else {
3460 return -1;
3461 }
3462}
3463
3464string BlueStore::OmapIteratorImpl::key()
3465{
3466 RWLock::RLocker l(c->lock);
3467 assert(it->valid());
3468 string db_key = it->raw_key().second;
3469 string user_key;
3470 decode_omap_key(db_key, &user_key);
3471 return user_key;
3472}
3473
3474bufferlist BlueStore::OmapIteratorImpl::value()
3475{
3476 RWLock::RLocker l(c->lock);
3477 assert(it->valid());
3478 return it->value();
3479}
3480
3481
3482// =====================================
3483
7c673cae
FG
3484#undef dout_prefix
3485#define dout_prefix *_dout << "bluestore(" << path << ") "
3486
3487
3488static void aio_cb(void *priv, void *priv2)
3489{
3490 BlueStore *store = static_cast<BlueStore*>(priv);
3491 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3492 c->aio_finish(store);
3493}
3494
3495BlueStore::BlueStore(CephContext *cct, const string& path)
3496 : ObjectStore(cct, path),
3497 throttle_bytes(cct, "bluestore_throttle_bytes",
3498 cct->_conf->bluestore_throttle_bytes),
3499 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3500 cct->_conf->bluestore_throttle_bytes +
3501 cct->_conf->bluestore_throttle_deferred_bytes),
181888fb 3502 deferred_finisher(cct, "defered_finisher", "dfin"),
7c673cae 3503 kv_sync_thread(this),
31f18b77 3504 kv_finalize_thread(this),
7c673cae
FG
3505 mempool_thread(this)
3506{
3507 _init_logger();
3508 cct->_conf->add_observer(this);
3509 set_cache_shards(1);
7c673cae
FG
3510}
3511
3512BlueStore::BlueStore(CephContext *cct,
3513 const string& path,
3514 uint64_t _min_alloc_size)
3515 : ObjectStore(cct, path),
3516 throttle_bytes(cct, "bluestore_throttle_bytes",
3517 cct->_conf->bluestore_throttle_bytes),
3518 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3519 cct->_conf->bluestore_throttle_bytes +
3520 cct->_conf->bluestore_throttle_deferred_bytes),
181888fb 3521 deferred_finisher(cct, "defered_finisher", "dfin"),
7c673cae 3522 kv_sync_thread(this),
31f18b77 3523 kv_finalize_thread(this),
7c673cae
FG
3524 min_alloc_size(_min_alloc_size),
3525 min_alloc_size_order(ctz(_min_alloc_size)),
3526 mempool_thread(this)
3527{
3528 _init_logger();
3529 cct->_conf->add_observer(this);
3530 set_cache_shards(1);
7c673cae
FG
3531}
3532
3533BlueStore::~BlueStore()
3534{
3535 for (auto f : finishers) {
3536 delete f;
3537 }
3538 finishers.clear();
3539
3540 cct->_conf->remove_observer(this);
3541 _shutdown_logger();
3542 assert(!mounted);
3543 assert(db == NULL);
3544 assert(bluefs == NULL);
3545 assert(fsid_fd < 0);
3546 assert(path_fd < 0);
3547 for (auto i : cache_shards) {
3548 delete i;
3549 }
3550 cache_shards.clear();
3551}
3552
3553const char **BlueStore::get_tracked_conf_keys() const
3554{
3555 static const char* KEYS[] = {
3556 "bluestore_csum_type",
3557 "bluestore_compression_mode",
3558 "bluestore_compression_algorithm",
3559 "bluestore_compression_min_blob_size",
3560 "bluestore_compression_min_blob_size_ssd",
3561 "bluestore_compression_min_blob_size_hdd",
3562 "bluestore_compression_max_blob_size",
3563 "bluestore_compression_max_blob_size_ssd",
3564 "bluestore_compression_max_blob_size_hdd",
c07f9fc5 3565 "bluestore_compression_required_ratio",
7c673cae
FG
3566 "bluestore_max_alloc_size",
3567 "bluestore_prefer_deferred_size",
181888fb
FG
3568 "bluestore_prefer_deferred_size_hdd",
3569 "bluestore_prefer_deferred_size_ssd",
31f18b77
FG
3570 "bluestore_deferred_batch_ops",
3571 "bluestore_deferred_batch_ops_hdd",
3572 "bluestore_deferred_batch_ops_ssd",
7c673cae
FG
3573 "bluestore_throttle_bytes",
3574 "bluestore_throttle_deferred_bytes",
3575 "bluestore_throttle_cost_per_io_hdd",
3576 "bluestore_throttle_cost_per_io_ssd",
3577 "bluestore_throttle_cost_per_io",
3578 "bluestore_max_blob_size",
3579 "bluestore_max_blob_size_ssd",
3580 "bluestore_max_blob_size_hdd",
3581 NULL
3582 };
3583 return KEYS;
3584}
3585
3586void BlueStore::handle_conf_change(const struct md_config_t *conf,
3587 const std::set<std::string> &changed)
3588{
3589 if (changed.count("bluestore_csum_type")) {
3590 _set_csum();
3591 }
3592 if (changed.count("bluestore_compression_mode") ||
3593 changed.count("bluestore_compression_algorithm") ||
3594 changed.count("bluestore_compression_min_blob_size") ||
3595 changed.count("bluestore_compression_max_blob_size")) {
3596 if (bdev) {
3597 _set_compression();
3598 }
3599 }
3600 if (changed.count("bluestore_max_blob_size") ||
3601 changed.count("bluestore_max_blob_size_ssd") ||
3602 changed.count("bluestore_max_blob_size_hdd")) {
3603 if (bdev) {
3604 // only after startup
3605 _set_blob_size();
3606 }
3607 }
3608 if (changed.count("bluestore_prefer_deferred_size") ||
181888fb
FG
3609 changed.count("bluestore_prefer_deferred_size_hdd") ||
3610 changed.count("bluestore_prefer_deferred_size_ssd") ||
7c673cae
FG
3611 changed.count("bluestore_max_alloc_size") ||
3612 changed.count("bluestore_deferred_batch_ops") ||
3613 changed.count("bluestore_deferred_batch_ops_hdd") ||
3614 changed.count("bluestore_deferred_batch_ops_ssd")) {
3615 if (bdev) {
3616 // only after startup
3617 _set_alloc_sizes();
3618 }
3619 }
3620 if (changed.count("bluestore_throttle_cost_per_io") ||
3621 changed.count("bluestore_throttle_cost_per_io_hdd") ||
3622 changed.count("bluestore_throttle_cost_per_io_ssd")) {
3623 if (bdev) {
3624 _set_throttle_params();
3625 }
3626 }
3627 if (changed.count("bluestore_throttle_bytes")) {
3628 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
3629 throttle_deferred_bytes.reset_max(
3630 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3631 }
3632 if (changed.count("bluestore_throttle_deferred_bytes")) {
3633 throttle_deferred_bytes.reset_max(
3634 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3635 }
3636}
3637
3638void BlueStore::_set_compression()
3639{
224ce89b
WB
3640 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
3641 if (m) {
3642 comp_mode = *m;
3643 } else {
3644 derr << __func__ << " unrecognized value '"
3645 << cct->_conf->bluestore_compression_mode
3646 << "' for bluestore_compression_mode, reverting to 'none'"
3647 << dendl;
3648 comp_mode = Compressor::COMP_NONE;
3649 }
3650
3651 compressor = nullptr;
3652
3653 if (comp_mode == Compressor::COMP_NONE) {
3654 dout(10) << __func__ << " compression mode set to 'none', "
3655	     << "ignoring other compression settings" << dendl;
3656 return;
3657 }
3658
3efd9988
FG
3659 if (cct->_conf->bluestore_compression_min_blob_size) {
3660 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
7c673cae
FG
3661 } else {
3662 assert(bdev);
3663 if (bdev->is_rotational()) {
3664 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
3665 } else {
3666 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
3667 }
3668 }
3669
3670 if (cct->_conf->bluestore_compression_max_blob_size) {
3671 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
3672 } else {
3673 assert(bdev);
3674 if (bdev->is_rotational()) {
3675 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
3676 } else {
3677 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
3678 }
3679 }
3680
7c673cae
FG
3681 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
3682 if (!alg_name.empty()) {
3683 compressor = Compressor::create(cct, alg_name);
3684 if (!compressor) {
3685 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
3686 << dendl;
3687 }
3688 }
3689
3690 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
3691 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
3692 << dendl;
3693}
3694
3695void BlueStore::_set_csum()
3696{
3697 csum_type = Checksummer::CSUM_NONE;
3698 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
3699 if (t > Checksummer::CSUM_NONE)
3700 csum_type = t;
3701
3702 dout(10) << __func__ << " csum_type "
3703 << Checksummer::get_csum_type_string(csum_type)
3704 << dendl;
3705}
3706
3707void BlueStore::_set_throttle_params()
3708{
3709 if (cct->_conf->bluestore_throttle_cost_per_io) {
3710 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
3711 } else {
3712 assert(bdev);
3713 if (bdev->is_rotational()) {
3714 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
3715 } else {
3716 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
3717 }
3718 }
3719
3720 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
3721 << dendl;
3722}
3723void BlueStore::_set_blob_size()
3724{
3725 if (cct->_conf->bluestore_max_blob_size) {
3726 max_blob_size = cct->_conf->bluestore_max_blob_size;
3727 } else {
3728 assert(bdev);
3729 if (bdev->is_rotational()) {
3730 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
3731 } else {
3732 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
3733 }
3734 }
3735 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
3736 << std::dec << dendl;
3737}
3738
31f18b77
FG
3739int BlueStore::_set_cache_sizes()
3740{
224ce89b
WB
3741 assert(bdev);
3742 if (cct->_conf->bluestore_cache_size) {
3743 cache_size = cct->_conf->bluestore_cache_size;
3744 } else {
3745 // choose global cache size based on backend type
3746 if (bdev->is_rotational()) {
3747 cache_size = cct->_conf->bluestore_cache_size_hdd;
3748 } else {
3749 cache_size = cct->_conf->bluestore_cache_size_ssd;
3750 }
3751 }
31f18b77
FG
3752 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
3753 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
224ce89b
WB
3754
3755 double cache_kv_max = cct->_conf->bluestore_cache_kv_max;
3756 double cache_kv_max_ratio = 0;
3757
3758 // if cache_kv_max is negative, disable it
3759 if (cache_size > 0 && cache_kv_max >= 0) {
3760 cache_kv_max_ratio = (double) cache_kv_max / (double) cache_size;
3761 if (cache_kv_max_ratio < 1.0 && cache_kv_max_ratio < cache_kv_ratio) {
3762 dout(1) << __func__ << " max " << cache_kv_max_ratio
3763 << " < ratio " << cache_kv_ratio
3764 << dendl;
3765 cache_meta_ratio = cache_meta_ratio + cache_kv_ratio - cache_kv_max_ratio;
3766 cache_kv_ratio = cache_kv_max_ratio;
3767 }
3768 }
3769
31f18b77
FG
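  // whatever is not claimed by the metadata and kv shares is left for cached object data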
3770 cache_data_ratio =
3771 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
3772
224ce89b 3773 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
d2e6a577 3774 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
224ce89b 3775 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
3776 return -EINVAL;
3777 }
224ce89b 3778 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
d2e6a577 3779 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
224ce89b 3780 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
3781 return -EINVAL;
3782 }
3783 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
d2e6a577 3784 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
31f18b77
FG
3785 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
3786 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
3787 << dendl;
3788 return -EINVAL;
3789 }
3790 if (cache_data_ratio < 0) {
3791 // deal with floating point imprecision
3792 cache_data_ratio = 0;
3793 }
224ce89b
WB
3794 dout(1) << __func__ << " cache_size " << cache_size
3795 << " meta " << cache_meta_ratio
31f18b77
FG
3796 << " kv " << cache_kv_ratio
3797 << " data " << cache_data_ratio
3798 << dendl;
3799 return 0;
3800}
3801
3efd9988
FG
3802int BlueStore::write_meta(const std::string& key, const std::string& value)
3803{
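  // mirror the key/value into the main block device label when one is present,
  // then fall through to the regular file-based metadata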
3804 bluestore_bdev_label_t label;
3805 string p = path + "/block";
3806 int r = _read_bdev_label(cct, p, &label);
3807 if (r < 0) {
3808 return ObjectStore::write_meta(key, value);
3809 }
3810 label.meta[key] = value;
3811 r = _write_bdev_label(cct, p, label);
3812 assert(r == 0);
3813 return ObjectStore::write_meta(key, value);
3814}
3815
3816int BlueStore::read_meta(const std::string& key, std::string *value)
3817{
3818 bluestore_bdev_label_t label;
3819 string p = path + "/block";
3820 int r = _read_bdev_label(cct, p, &label);
3821 if (r < 0) {
3822 return ObjectStore::read_meta(key, value);
3823 }
3824 auto i = label.meta.find(key);
3825 if (i == label.meta.end()) {
3826 return ObjectStore::read_meta(key, value);
3827 }
3828 *value = i->second;
3829 return 0;
3830}
3831
7c673cae
FG
3832void BlueStore::_init_logger()
3833{
3834 PerfCountersBuilder b(cct, "bluestore",
3835 l_bluestore_first, l_bluestore_last);
3836 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
3837 "Average kv_thread flush latency",
3838 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
3839 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
3840 "Average kv_thread commit latency");
3841 b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
3842 "Average kv_thread sync latency",
3843 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
3844 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
3845 "Average prepare state latency");
3846 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
3847 "Average aio_wait state latency",
3848 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
3849 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
3850 "Average io_done state latency");
3851 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
3852 "Average kv_queued state latency");
3853 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
3854 "Average kv_commiting state latency");
3855 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
3856 "Average kv_done state latency");
3857 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
3858 "Average deferred_queued state latency");
3859 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
3860 "Average aio_wait state latency");
3861 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
3862 "Average cleanup state latency");
3863 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
3864 "Average finishing state latency");
3865 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
3866 "Average done state latency");
3867 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
3868 "Average submit throttle latency",
3869 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
3870 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
3871 "Average submit latency",
3872 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
3873 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
3874 "Average commit latency",
3875 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
3876 b.add_time_avg(l_bluestore_read_lat, "read_lat",
3877 "Average read latency",
3878 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
3879 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
3880 "Average read onode metadata latency");
3881 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
3882 "Average read latency");
3883 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
3884 "Average compress latency");
3885 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
3886 "Average decompress latency");
3887 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
3888 "Average checksum latency");
3889 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
3890 "Sum for beneficial compress ops");
3891 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
3892 "Sum for compress ops rejected due to low net gain of space");
3893 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
3894 "Sum for write-op padded bytes");
3895 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
3896 "Sum for deferred write op");
3897 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
3898 "Sum for deferred write bytes", "def");
3899 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
3900 "Sum for write penalty read ops");
3901 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
3902 "Sum for allocated bytes");
3903 b.add_u64(l_bluestore_stored, "bluestore_stored",
3904 "Sum for stored bytes");
3905 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
3906 "Sum for stored compressed bytes");
3907 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
3908 "Sum for bytes allocated for compressed data");
3909 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
3910 "Sum for original bytes that were compressed");
3911
3912 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
3913 "Number of onodes in cache");
3914 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
3915 "Sum for onode-lookups hit in the cache");
3916 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
3917 "Sum for onode-lookups missed in the cache");
3918 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
3919 "Sum for onode-shard lookups hit in the cache");
3920 b.add_u64_counter(l_bluestore_onode_shard_misses,
3921 "bluestore_onode_shard_misses",
3922 "Sum for onode-shard lookups missed in the cache");
3923 b.add_u64(l_bluestore_extents, "bluestore_extents",
3924 "Number of extents in cache");
3925 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
3926 "Number of blobs in cache");
3927 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
3928 "Number of buffers in cache");
3929 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
3930 "Number of buffer bytes in cache");
3931 b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
3932 "Sum for bytes of read hit in the cache");
3933 b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
3934 "Sum for bytes of read missed in the cache");
3935
3936 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
3937 "Large aligned writes into fresh blobs");
3938 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
3939 "Large aligned writes into fresh blobs (bytes)");
3940 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
3941 "Large aligned writes into fresh blobs (blobs)");
3942 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
3943 "Small writes into existing or sparse small blobs");
3944 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
3945 "Small writes into existing or sparse small blobs (bytes)");
3946 b.add_u64_counter(l_bluestore_write_small_unused,
3947 "bluestore_write_small_unused",
3948 "Small writes into unused portion of existing blob");
3949 b.add_u64_counter(l_bluestore_write_small_deferred,
3950 "bluestore_write_small_deferred",
3951 "Small overwrites using deferred");
3952 b.add_u64_counter(l_bluestore_write_small_pre_read,
3953 "bluestore_write_small_pre_read",
3954 "Small writes that required we read some data (possibly "
3955 "cached) to fill out the block");
3956 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
3957 "Small write into new (sparse) blob");
3958
3959 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
3960 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
3961 "Onode extent map reshard events");
3962 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
3963 "Sum for blob splitting due to resharding");
3964 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
3965 "Sum for extents that have been removed due to compression");
3966 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
3967 "Sum for extents that have been merged due to garbage "
3968 "collection");
b32b8144
FG
3969 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
3970 "Read EIO errors propagated to high level callers");
7c673cae
FG
3971 logger = b.create_perf_counters();
3972 cct->get_perfcounters_collection()->add(logger);
3973}
3974
3975int BlueStore::_reload_logger()
3976{
3977 struct store_statfs_t store_statfs;
3978
3979 int r = statfs(&store_statfs);
3980 if(r >= 0) {
3981 logger->set(l_bluestore_allocated, store_statfs.allocated);
3982 logger->set(l_bluestore_stored, store_statfs.stored);
3983 logger->set(l_bluestore_compressed, store_statfs.compressed);
3984 logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
3985 logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
3986 }
3987 return r;
3988}
3989
3990void BlueStore::_shutdown_logger()
3991{
3992 cct->get_perfcounters_collection()->remove(logger);
3993 delete logger;
3994}
3995
3996int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
3997 uuid_d *fsid)
3998{
3999 bluestore_bdev_label_t label;
4000 int r = _read_bdev_label(cct, path, &label);
4001 if (r < 0)
4002 return r;
4003 *fsid = label.osd_uuid;
4004 return 0;
4005}
4006
4007int BlueStore::_open_path()
4008{
b32b8144
FG
4009 // sanity check(s)
4010 if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
4011 4*1024*1024*1024ull) {
4012 derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has hard limit of 4GB." << dendl;
4013 return -EINVAL;
4014 }
7c673cae 4015 assert(path_fd < 0);
224ce89b 4016 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY));
7c673cae
FG
4017 if (path_fd < 0) {
4018 int r = -errno;
4019 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
4020 << dendl;
4021 return r;
4022 }
4023 return 0;
4024}
4025
4026void BlueStore::_close_path()
4027{
4028 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
4029 path_fd = -1;
4030}
4031
3efd9988
FG
4032int BlueStore::_write_bdev_label(CephContext *cct,
4033 string path, bluestore_bdev_label_t label)
7c673cae
FG
4034{
4035 dout(10) << __func__ << " path " << path << " label " << label << dendl;
4036 bufferlist bl;
4037 ::encode(label, bl);
4038 uint32_t crc = bl.crc32c(-1);
4039 ::encode(crc, bl);
4040 assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
4041 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
4042 z.zero();
4043 bl.append(std::move(z));
4044
224ce89b 4045 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY));
7c673cae
FG
4046 if (fd < 0) {
4047 fd = -errno;
4048 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4049 << dendl;
4050 return fd;
4051 }
4052 int r = bl.write_fd(fd);
4053 if (r < 0) {
4054 derr << __func__ << " failed to write to " << path
4055 << ": " << cpp_strerror(r) << dendl;
4056 }
3efd9988
FG
4057 r = ::fsync(fd);
4058 if (r < 0) {
4059 derr << __func__ << " failed to fsync " << path
4060 << ": " << cpp_strerror(r) << dendl;
4061 }
7c673cae
FG
4062 VOID_TEMP_FAILURE_RETRY(::close(fd));
4063 return r;
4064}
4065
4066int BlueStore::_read_bdev_label(CephContext* cct, string path,
4067 bluestore_bdev_label_t *label)
4068{
4069 dout(10) << __func__ << dendl;
224ce89b 4070 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY));
7c673cae
FG
4071 if (fd < 0) {
4072 fd = -errno;
4073 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4074 << dendl;
4075 return fd;
4076 }
4077 bufferlist bl;
4078 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4079 VOID_TEMP_FAILURE_RETRY(::close(fd));
4080 if (r < 0) {
4081 derr << __func__ << " failed to read from " << path
4082 << ": " << cpp_strerror(r) << dendl;
4083 return r;
4084 }
4085
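  // the encoded label is followed on disk by a crc32c over the label bytes; verify it below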
4086 uint32_t crc, expected_crc;
4087 bufferlist::iterator p = bl.begin();
4088 try {
4089 ::decode(*label, p);
4090 bufferlist t;
4091 t.substr_of(bl, 0, p.get_off());
4092 crc = t.crc32c(-1);
4093 ::decode(expected_crc, p);
4094 }
4095 catch (buffer::error& e) {
b32b8144 4096 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
7c673cae
FG
4097 << ": " << e.what()
4098 << dendl;
b32b8144 4099 return -ENOENT;
7c673cae
FG
4100 }
4101 if (crc != expected_crc) {
4102 derr << __func__ << " bad crc on label, expected " << expected_crc
4103 << " != actual " << crc << dendl;
4104 return -EIO;
4105 }
4106 dout(10) << __func__ << " got " << *label << dendl;
4107 return 0;
4108}
4109
4110int BlueStore::_check_or_set_bdev_label(
4111 string path, uint64_t size, string desc, bool create)
4112{
4113 bluestore_bdev_label_t label;
4114 if (create) {
4115 label.osd_uuid = fsid;
4116 label.size = size;
4117 label.btime = ceph_clock_now();
4118 label.description = desc;
3efd9988 4119 int r = _write_bdev_label(cct, path, label);
7c673cae
FG
4120 if (r < 0)
4121 return r;
4122 } else {
4123 int r = _read_bdev_label(cct, path, &label);
4124 if (r < 0)
4125 return r;
31f18b77
FG
4126 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4127 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4128 << " and fsid " << fsid << " check bypassed" << dendl;
4129 }
4130 else if (label.osd_uuid != fsid) {
7c673cae
FG
4131 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4132 << " does not match our fsid " << fsid << dendl;
4133 return -EIO;
4134 }
4135 }
4136 return 0;
4137}
4138
4139void BlueStore::_set_alloc_sizes(void)
4140{
7c673cae
FG
4141 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4142
4143 if (cct->_conf->bluestore_prefer_deferred_size) {
4144 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4145 } else {
4146 assert(bdev);
4147 if (bdev->is_rotational()) {
4148 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4149 } else {
4150 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4151 }
4152 }
4153
4154 if (cct->_conf->bluestore_deferred_batch_ops) {
4155 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4156 } else {
4157 assert(bdev);
4158 if (bdev->is_rotational()) {
4159 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4160 } else {
4161 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4162 }
4163 }
4164
4165 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
4166 << std::dec << " order " << min_alloc_size_order
4167 << " max_alloc_size 0x" << std::hex << max_alloc_size
4168 << " prefer_deferred_size 0x" << prefer_deferred_size
4169 << std::dec
4170 << " deferred_batch_ops " << deferred_batch_ops
4171 << dendl;
4172}
4173
4174int BlueStore::_open_bdev(bool create)
4175{
4176 assert(bdev == NULL);
4177 string p = path + "/block";
4178 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
4179 int r = bdev->open(p);
4180 if (r < 0)
4181 goto fail;
4182
4183 if (bdev->supported_bdev_label()) {
4184 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4185 if (r < 0)
4186 goto fail_close;
4187 }
4188
4189 // initialize global block parameters
4190 block_size = bdev->get_block_size();
4191 block_mask = ~(block_size - 1);
4192 block_size_order = ctz(block_size);
4193 assert(block_size == 1u << block_size_order);
224ce89b
WB
4194 // and set cache_size based on device type
4195 r = _set_cache_sizes();
4196 if (r < 0) {
4197 goto fail_close;
4198 }
7c673cae
FG
4199 return 0;
4200
4201 fail_close:
4202 bdev->close();
4203 fail:
4204 delete bdev;
4205 bdev = NULL;
4206 return r;
4207}
4208
4209void BlueStore::_close_bdev()
4210{
4211 assert(bdev);
4212 bdev->close();
4213 delete bdev;
4214 bdev = NULL;
4215}
4216
4217int BlueStore::_open_fm(bool create)
4218{
4219 assert(fm == NULL);
4220 fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
4221
4222 if (create) {
4223 // initialize freespace
4224 dout(20) << __func__ << " initializing freespace" << dendl;
4225 KeyValueDB::Transaction t = db->get_transaction();
4226 {
4227 bufferlist bl;
4228 bl.append(freelist_type);
4229 t->set(PREFIX_SUPER, "freelist_type", bl);
4230 }
b32b8144
FG
4231 // being able to allocate in units less than bdev block size
4232 // seems to be a bad idea.
4233 assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
4234 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
7c673cae
FG
4235
4236 // allocate superblock reserved space. note that we do not mark
4237 // bluefs space as allocated in the freelist; we instead rely on
4238 // bluefs_extents.
3efd9988
FG
4239 uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
4240 min_alloc_size);
4241 fm->allocate(0, reserved, t);
7c673cae 4242
7c673cae
FG
4243 if (cct->_conf->bluestore_bluefs) {
4244 assert(bluefs_extents.num_intervals() == 1);
4245 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
3efd9988 4246 reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
7c673cae
FG
4247 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4248 << " for bluefs" << dendl;
4249 bufferlist bl;
4250 ::encode(bluefs_extents, bl);
4251 t->set(PREFIX_SUPER, "bluefs_extents", bl);
4252 dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
4253 << std::dec << dendl;
7c673cae
FG
4254 }
4255
4256 if (cct->_conf->bluestore_debug_prefill > 0) {
4257 uint64_t end = bdev->get_size() - reserved;
4258 dout(1) << __func__ << " pre-fragmenting freespace, using "
4259 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4260 << cct->_conf->bluestore_debug_prefragment_max << dendl;
4261 uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
4262 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4263 float r = cct->_conf->bluestore_debug_prefill;
4264 r /= 1.0 - r;
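      // r is now the used:free length ratio implied by the requested prefill fraction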
4265 bool stop = false;
4266
4267 while (!stop && start < end) {
4268 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4269 if (start + l > end) {
4270 l = end - start;
4271 l = P2ALIGN(l, min_alloc_size);
4272 }
4273 assert(start + l <= end);
4274
4275 uint64_t u = 1 + (uint64_t)(r * (double)l);
4276 u = P2ROUNDUP(u, min_alloc_size);
4277 if (start + l + u > end) {
4278 u = end - (start + l);
4279 // trim to align so we don't overflow again
4280 u = P2ALIGN(u, min_alloc_size);
4281 stop = true;
4282 }
4283 assert(start + l + u <= end);
4284
4285 dout(20) << " free 0x" << std::hex << start << "~" << l
4286 << " use 0x" << u << std::dec << dendl;
4287
4288 if (u == 0) {
4289 // break if u has been trimmed to nothing
4290 break;
4291 }
4292
4293 fm->allocate(start + l, u, t);
4294 start += l + u;
4295 }
4296 }
4297 db->submit_transaction_sync(t);
4298 }
4299
3efd9988 4300 int r = fm->init(bdev->get_size());
7c673cae
FG
4301 if (r < 0) {
4302 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4303 delete fm;
4304 fm = NULL;
4305 return r;
4306 }
4307 return 0;
4308}
4309
4310void BlueStore::_close_fm()
4311{
4312 dout(10) << __func__ << dendl;
4313 assert(fm);
4314 fm->shutdown();
4315 delete fm;
4316 fm = NULL;
4317}
4318
4319int BlueStore::_open_alloc()
4320{
4321 assert(alloc == NULL);
4322 assert(bdev->get_size());
4323 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4324 bdev->get_size(),
4325 min_alloc_size);
4326 if (!alloc) {
4327 lderr(cct) << __func__ << " unknown allocator type "
4328 << cct->_conf->bluestore_allocator
4329 << dendl;
4330 return -EINVAL;
4331 }
4332
4333 uint64_t num = 0, bytes = 0;
4334
4335 dout(1) << __func__ << " opening allocation metadata" << dendl;
4336 // initialize from freelist
4337 fm->enumerate_reset();
4338 uint64_t offset, length;
4339 while (fm->enumerate_next(&offset, &length)) {
4340 alloc->init_add_free(offset, length);
4341 ++num;
4342 bytes += length;
4343 }
224ce89b 4344 fm->enumerate_reset();
7c673cae
FG
4345 dout(1) << __func__ << " loaded " << pretty_si_t(bytes)
4346 << " in " << num << " extents"
4347 << dendl;
4348
4349 // also mark bluefs space as allocated
4350 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4351 alloc->init_rm_free(e.get_start(), e.get_len());
4352 }
4353 dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
4354 << bluefs_extents << std::dec << " as allocated" << dendl;
4355
4356 return 0;
4357}
4358
4359void BlueStore::_close_alloc()
4360{
4361 assert(alloc);
4362 alloc->shutdown();
4363 delete alloc;
4364 alloc = NULL;
4365}
4366
4367int BlueStore::_open_fsid(bool create)
4368{
4369 assert(fsid_fd < 0);
4370 int flags = O_RDWR;
4371 if (create)
4372 flags |= O_CREAT;
4373 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4374 if (fsid_fd < 0) {
4375 int err = -errno;
4376 derr << __func__ << " " << cpp_strerror(err) << dendl;
4377 return err;
4378 }
4379 return 0;
4380}
4381
4382int BlueStore::_read_fsid(uuid_d *uuid)
4383{
4384 char fsid_str[40];
4385 memset(fsid_str, 0, sizeof(fsid_str));
4386 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4387 if (ret < 0) {
4388 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4389 return ret;
4390 }
4391 if (ret > 36)
4392 fsid_str[36] = 0;
4393 else
4394 fsid_str[ret] = 0;
4395 if (!uuid->parse(fsid_str)) {
4396 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4397 return -EINVAL;
4398 }
4399 return 0;
4400}
4401
4402int BlueStore::_write_fsid()
4403{
4404 int r = ::ftruncate(fsid_fd, 0);
4405 if (r < 0) {
4406 r = -errno;
4407 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4408 return r;
4409 }
4410 string str = stringify(fsid) + "\n";
4411 r = safe_write(fsid_fd, str.c_str(), str.length());
4412 if (r < 0) {
4413 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4414 return r;
4415 }
4416 r = ::fsync(fsid_fd);
4417 if (r < 0) {
4418 r = -errno;
4419 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4420 return r;
4421 }
4422 return 0;
4423}
4424
4425void BlueStore::_close_fsid()
4426{
4427 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4428 fsid_fd = -1;
4429}
4430
4431int BlueStore::_lock_fsid()
4432{
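  // take a non-blocking exclusive advisory lock over the whole fsid file;
  // failure usually means another ceph-osd still has this store open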
4433 struct flock l;
4434 memset(&l, 0, sizeof(l));
4435 l.l_type = F_WRLCK;
4436 l.l_whence = SEEK_SET;
4437 int r = ::fcntl(fsid_fd, F_SETLK, &l);
4438 if (r < 0) {
4439 int err = errno;
4440 derr << __func__ << " failed to lock " << path << "/fsid"
4441 << " (is another ceph-osd still running?)"
4442 << cpp_strerror(err) << dendl;
4443 return -err;
4444 }
4445 return 0;
4446}
4447
31f18b77
FG
4448bool BlueStore::is_rotational()
4449{
4450 if (bdev) {
4451 return bdev->is_rotational();
4452 }
4453
4454 bool rotational = true;
4455 int r = _open_path();
4456 if (r < 0)
4457 goto out;
4458 r = _open_fsid(false);
4459 if (r < 0)
4460 goto out_path;
4461 r = _read_fsid(&fsid);
4462 if (r < 0)
4463 goto out_fsid;
4464 r = _lock_fsid();
4465 if (r < 0)
4466 goto out_fsid;
4467 r = _open_bdev(false);
4468 if (r < 0)
4469 goto out_fsid;
4470 rotational = bdev->is_rotational();
4471 _close_bdev();
4472 out_fsid:
4473 _close_fsid();
4474 out_path:
4475 _close_path();
4476 out:
4477 return rotational;
4478}
4479
d2e6a577
FG
4480bool BlueStore::is_journal_rotational()
4481{
4482 if (!bluefs) {
4483 dout(5) << __func__ << " bluefs disabled, defaulting to store media type"
4484 << dendl;
4485 return is_rotational();
4486 }
4487 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
4488 return bluefs->wal_is_rotational();
4489}
4490
7c673cae
FG
4491bool BlueStore::test_mount_in_use()
4492{
4493 // most error conditions mean the mount is not in use (e.g., because
4494 // it doesn't exist). only if we fail to lock do we conclude it is
4495 // in use.
4496 bool ret = false;
4497 int r = _open_path();
4498 if (r < 0)
4499 return false;
4500 r = _open_fsid(false);
4501 if (r < 0)
4502 goto out_path;
4503 r = _lock_fsid();
4504 if (r < 0)
4505 ret = true; // if we can't lock, it is in use
4506 _close_fsid();
4507 out_path:
4508 _close_path();
4509 return ret;
4510}
4511
4512int BlueStore::_open_db(bool create)
4513{
4514 int r;
4515 assert(!db);
4516 string fn = path + "/db";
4517 string options;
4518 stringstream err;
4519 ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
4520
4521 string kv_backend;
4522 if (create) {
4523 kv_backend = cct->_conf->bluestore_kvbackend;
4524 } else {
4525 r = read_meta("kv_backend", &kv_backend);
4526 if (r < 0) {
4527 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
4528 return -EIO;
4529 }
4530 }
4531 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
4532
4533 bool do_bluefs;
4534 if (create) {
4535 do_bluefs = cct->_conf->bluestore_bluefs;
4536 } else {
4537 string s;
4538 r = read_meta("bluefs", &s);
4539 if (r < 0) {
4540 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
4541 return -EIO;
4542 }
4543 if (s == "1") {
4544 do_bluefs = true;
4545 } else if (s == "0") {
4546 do_bluefs = false;
4547 } else {
4548 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
4549 << dendl;
4550 return -EIO;
4551 }
4552 }
4553 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
4554
4555 rocksdb::Env *env = NULL;
4556 if (do_bluefs) {
4557 dout(10) << __func__ << " initializing bluefs" << dendl;
4558 if (kv_backend != "rocksdb") {
4559 derr << " backend must be rocksdb to use bluefs" << dendl;
4560 return -EINVAL;
4561 }
4562 bluefs = new BlueFS(cct);
4563
4564 string bfn;
4565 struct stat st;
4566
3efd9988
FG
4567 if (read_meta("path_block.db", &bfn) < 0) {
4568 bfn = path + "/block.db";
4569 }
7c673cae
FG
4570 if (::stat(bfn.c_str(), &st) == 0) {
4571 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
4572 if (r < 0) {
4573 derr << __func__ << " add block device(" << bfn << ") returned: "
4574 << cpp_strerror(r) << dendl;
4575 goto free_bluefs;
4576 }
4577
4578 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
4579 r = _check_or_set_bdev_label(
4580 bfn,
4581 bluefs->get_block_device_size(BlueFS::BDEV_DB),
4582 "bluefs db", create);
4583 if (r < 0) {
4584 derr << __func__
4585 << " check block device(" << bfn << ") label returned: "
4586 << cpp_strerror(r) << dendl;
4587 goto free_bluefs;
4588 }
4589 }
4590 if (create) {
4591 bluefs->add_block_extent(
4592 BlueFS::BDEV_DB,
4593 SUPER_RESERVED,
4594 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
4595 }
4596 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
4597 bluefs_single_shared_device = false;
31f18b77 4598 } else if (::lstat(bfn.c_str(), &st) == -1) {
7c673cae 4599 bluefs_shared_bdev = BlueFS::BDEV_DB;
31f18b77
FG
4600 } else {
4601 // a symlink that exists but whose target is missing is an error
4602 derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
4603 r = -errno;
4604 goto free_bluefs;
7c673cae
FG
4605 }
4606
4607 // shared device
3efd9988
FG
4608 if (read_meta("path_block", &bfn) < 0) {
4609 bfn = path + "/block";
4610 }
7c673cae
FG
4611 r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
4612 if (r < 0) {
4613 derr << __func__ << " add block device(" << bfn << ") returned: "
4614 << cpp_strerror(r) << dendl;
4615 goto free_bluefs;
4616 }
4617 if (create) {
4618 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4619 uint64_t initial =
4620 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
4621 cct->_conf->bluestore_bluefs_gift_ratio);
4622 initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
3efd9988
FG
4623 if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
4624 derr << __func__ << " bluefs_alloc_size 0x" << std::hex
4625 << cct->_conf->bluefs_alloc_size << " is not a multiple of "
4626 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4627 r = -EINVAL;
4628 goto free_bluefs;
4629 }
7c673cae
FG
4630 // align to bluefs's alloc_size
4631 initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
31f18b77
FG
4632 // put bluefs in the middle of the device in case it is an HDD
4633 uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
4634 cct->_conf->bluefs_alloc_size);
4635 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
4636 bluefs_extents.insert(start, initial);
7c673cae
FG
4637 }
4638
3efd9988
FG
4639 if (read_meta("path_block.wal", &bfn) < 0) {
4640 bfn = path + "/block.wal";
4641 }
7c673cae
FG
4642 if (::stat(bfn.c_str(), &st) == 0) {
4643 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
4644 if (r < 0) {
4645 derr << __func__ << " add block device(" << bfn << ") returned: "
4646 << cpp_strerror(r) << dendl;
4647 goto free_bluefs;
4648 }
4649
4650 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
4651 r = _check_or_set_bdev_label(
4652 bfn,
4653 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
4654 "bluefs wal", create);
4655 if (r < 0) {
4656 derr << __func__ << " check block device(" << bfn
4657 << ") label returned: " << cpp_strerror(r) << dendl;
4658 goto free_bluefs;
4659 }
4660 }
4661
4662 if (create) {
4663 bluefs->add_block_extent(
4664 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
4665 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
4666 BDEV_LABEL_BLOCK_SIZE);
4667 }
4668 cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
4669 bluefs_single_shared_device = false;
31f18b77 4670 } else if (::lstat(bfn.c_str(), &st) == -1) {
7c673cae 4671 cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
31f18b77
FG
4672 } else {
4673 // a symlink that exists but whose target is missing is an error
4674 derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
4675 r = -errno;
4676 goto free_bluefs;
7c673cae
FG
4677 }
4678
4679 if (create) {
4680 bluefs->mkfs(fsid);
4681 }
4682 r = bluefs->mount();
4683 if (r < 0) {
4684 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
4685 goto free_bluefs;
4686 }
4687 if (cct->_conf->bluestore_bluefs_env_mirror) {
4688 rocksdb::Env *a = new BlueRocksEnv(bluefs);
4689 rocksdb::Env *b = rocksdb::Env::Default();
4690 if (create) {
4691 string cmd = "rm -rf " + path + "/db " +
4692 path + "/db.slow " +
4693 path + "/db.wal";
4694 int r = system(cmd.c_str());
4695 (void)r;
4696 }
4697 env = new rocksdb::EnvMirror(b, a, false, true);
4698 } else {
4699 env = new BlueRocksEnv(bluefs);
4700
4701 // simplify the dir names, too, as "seen" by rocksdb
4702 fn = "db";
4703 }
4704
4705 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
4706 // we have both block.db and block; tell rocksdb!
4707 // note: the second (last) size value doesn't really matter
4708 ostringstream db_paths;
4709 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
4710 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
4711 db_paths << fn << ","
4712 << (uint64_t)(db_size * 95 / 100) << " "
4713 << fn + ".slow" << ","
4714 << (uint64_t)(slow_size * 95 / 100);
4715 cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
4716 dout(10) << __func__ << " set rocksdb_db_paths to "
4717 << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
4718 }
4719
4720 if (create) {
4721 env->CreateDir(fn);
4722 if (cct->_conf->rocksdb_separate_wal_dir)
4723 env->CreateDir(fn + ".wal");
4724 if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
4725 env->CreateDir(fn + ".slow");
4726 }
4727 } else if (create) {
4728 int r = ::mkdir(fn.c_str(), 0755);
4729 if (r < 0)
4730 r = -errno;
4731 if (r < 0 && r != -EEXIST) {
4732 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
4733 << dendl;
4734 return r;
4735 }
4736
4737 // wal_dir, too!
4738 if (cct->_conf->rocksdb_separate_wal_dir) {
4739 string walfn = path + "/db.wal";
4740 r = ::mkdir(walfn.c_str(), 0755);
4741 if (r < 0)
4742 r = -errno;
4743 if (r < 0 && r != -EEXIST) {
4744 derr << __func__ << " failed to create " << walfn
4745 << ": " << cpp_strerror(r)
4746 << dendl;
4747 return r;
4748 }
4749 }
4750 }
4751
4752 db = KeyValueDB::create(cct,
4753 kv_backend,
4754 fn,
4755 static_cast<void*>(env));
4756 if (!db) {
4757 derr << __func__ << " error creating db" << dendl;
4758 if (bluefs) {
4759 bluefs->umount();
4760 delete bluefs;
4761 bluefs = NULL;
4762 }
4763 // delete env manually here since we can't depend on db to do this
4764 // under this case
4765 delete env;
4766 env = NULL;
4767 return -EIO;
4768 }
4769
4770 FreelistManager::setup_merge_operators(db);
4771 db->set_merge_operator(PREFIX_STAT, merge_op);
4772
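  // give rocksdb its configured share of the global bluestore cache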
224ce89b 4773 db->set_cache_size(cache_size * cache_kv_ratio);
31f18b77 4774
7c673cae
FG
4775 if (kv_backend == "rocksdb")
4776 options = cct->_conf->bluestore_rocksdb_options;
4777 db->init(options);
4778 if (create)
4779 r = db->create_and_open(err);
4780 else
4781 r = db->open(err);
4782 if (r) {
4783 derr << __func__ << " erroring opening db: " << err.str() << dendl;
4784 if (bluefs) {
4785 bluefs->umount();
4786 delete bluefs;
4787 bluefs = NULL;
4788 }
4789 delete db;
4790 db = NULL;
4791 return -EIO;
4792 }
4793 dout(1) << __func__ << " opened " << kv_backend
4794 << " path " << fn << " options " << options << dendl;
4795 return 0;
4796
4797free_bluefs:
4798 assert(bluefs);
4799 delete bluefs;
4800 bluefs = NULL;
4801 return r;
4802}
4803
4804void BlueStore::_close_db()
4805{
4806 assert(db);
4807 delete db;
4808 db = NULL;
4809 if (bluefs) {
4810 bluefs->umount();
4811 delete bluefs;
4812 bluefs = NULL;
4813 }
4814}
4815
4816int BlueStore::_reconcile_bluefs_freespace()
4817{
4818 dout(10) << __func__ << dendl;
4819 interval_set<uint64_t> bset;
4820 int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
4821 assert(r == 0);
4822 if (bset == bluefs_extents) {
4823 dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
4824 << std::dec << dendl;
4825 return 0;
4826 }
4827 dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
4828 << dendl;
4829 dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
4830 << std::dec << dendl;
4831
4832 interval_set<uint64_t> overlap;
4833 overlap.intersection_of(bset, bluefs_extents);
4834
4835 bset.subtract(overlap);
4836 if (!bset.empty()) {
4837 derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
4838 << dendl;
4839 return -EIO;
4840 }
4841
4842 interval_set<uint64_t> super_extra;
4843 super_extra = bluefs_extents;
4844 super_extra.subtract(overlap);
4845 if (!super_extra.empty()) {
4846 // This is normal: it can happen if we commit to give extents to
4847 // bluefs and we crash before bluefs commits that it owns them.
4848 dout(10) << __func__ << " super extra " << super_extra << dendl;
4849 for (interval_set<uint64_t>::iterator p = super_extra.begin();
4850 p != super_extra.end();
4851 ++p) {
4852 bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
4853 }
4854 }
4855
4856 return 0;
4857}
4858
4859int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
4860{
4861 int ret = 0;
4862 assert(bluefs);
4863
4864 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
4865 bluefs->get_usage(&bluefs_usage);
4866 assert(bluefs_usage.size() > bluefs_shared_bdev);
4867
4868 // fixme: look at primary bdev only for now
4869 uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
4870 uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
4871 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
4872
4873 uint64_t my_free = alloc->get_free();
4874 uint64_t total = bdev->get_size();
4875 float my_free_ratio = (float)my_free / (float)total;
4876
4877 uint64_t total_free = bluefs_free + my_free;
4878
4879 float bluefs_ratio = (float)bluefs_free / (float)total_free;
4880
4881 dout(10) << __func__
4882 << " bluefs " << pretty_si_t(bluefs_free)
4883 << " free (" << bluefs_free_ratio
4884 << ") bluestore " << pretty_si_t(my_free)
4885 << " free (" << my_free_ratio
4886 << "), bluefs_ratio " << bluefs_ratio
4887 << dendl;
4888
4889 uint64_t gift = 0;
4890 uint64_t reclaim = 0;
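  // gift free space to bluefs when its share of the overall free space is too
  // small, or reclaim from it when its share is too large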
4891 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
4892 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
4893 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4894 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
4895 << ", should gift " << pretty_si_t(gift) << dendl;
4896 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
4897 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
4898 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
4899 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
4900 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4901 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
4902 << ", should reclaim " << pretty_si_t(reclaim) << dendl;
4903 }
3efd9988
FG
4904
4905 // don't take over too much of the freespace
4906 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
7c673cae 4907 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
3efd9988 4908 cct->_conf->bluestore_bluefs_min < free_cap) {
7c673cae
FG
4909 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
4910 dout(10) << __func__ << " bluefs_total " << bluefs_total
4911 << " < min " << cct->_conf->bluestore_bluefs_min
4912 << ", should gift " << pretty_si_t(g) << dendl;
4913 if (g > gift)
4914 gift = g;
4915 reclaim = 0;
4916 }
3efd9988
FG
4917 uint64_t min_free = cct->_conf->get_val<uint64_t>("bluestore_bluefs_min_free");
4918 if (bluefs_free < min_free &&
4919 min_free < free_cap) {
4920 uint64_t g = min_free - bluefs_free;
4921 dout(10) << __func__ << " bluefs_free " << bluefs_total
4922 << " < min " << min_free
4923 << ", should gift " << pretty_si_t(g) << dendl;
4924 if (g > gift)
4925 gift = g;
4926 reclaim = 0;
4927 }
7c673cae
FG
4928
4929 if (gift) {
4930 // round up to alloc size
4931 gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
4932
4933 // hard cap to fit into 32 bits
4934 gift = MIN(gift, 1ull<<31);
4935 dout(10) << __func__ << " gifting " << gift
4936 << " (" << pretty_si_t(gift) << ")" << dendl;
4937
4938 // fixme: just do one allocation to start...
4939 int r = alloc->reserve(gift);
4940 assert(r == 0);
4941
4942 AllocExtentVector exts;
4943 int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
4944 0, 0, &exts);
4945
94b18763
FG
4946 if (alloc_len <= 0) {
4947 dout(1) << __func__ << " no allocate on 0x" << std::hex << gift
4948 << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4949 alloc->unreserve(gift);
4950 alloc->dump();
4951 return 0;
4952 } else if (alloc_len < (int64_t)gift) {
4953 dout(1) << __func__ << " insufficient allocate on 0x" << std::hex << gift
4954 << " min_alloc_size 0x" << min_alloc_size
4955 << " allocated 0x" << alloc_len
4956 << std::dec << dendl;
4957 alloc->unreserve(gift - alloc_len);
7c673cae 4958 alloc->dump();
7c673cae
FG
4959 }
4960 for (auto& p : exts) {
4961 bluestore_pextent_t e = bluestore_pextent_t(p);
4962 dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
4963 extents->push_back(e);
4964 }
4965 gift = 0;
4966
4967 ret = 1;
4968 }
4969
4970 // reclaim from bluefs?
4971 if (reclaim) {
4972 // round up to alloc size
4973 reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);
4974
4975 // hard cap to fit into 32 bits
4976 reclaim = MIN(reclaim, 1ull<<31);
4977 dout(10) << __func__ << " reclaiming " << reclaim
4978 << " (" << pretty_si_t(reclaim) << ")" << dendl;
4979
4980 while (reclaim > 0) {
4981 // NOTE: this will block and do IO.
4982 AllocExtentVector extents;
4983 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
4984 &extents);
4985 if (r < 0) {
4986 derr << __func__ << " failed to reclaim space from bluefs"
4987 << dendl;
4988 break;
4989 }
4990 for (auto e : extents) {
4991 bluefs_extents.erase(e.offset, e.length);
4992 bluefs_extents_reclaiming.insert(e.offset, e.length);
4993 reclaim -= e.length;
4994 }
4995 }
4996
4997 ret = 1;
4998 }
4999
5000 return ret;
5001}
5002
5003void BlueStore::_commit_bluefs_freespace(
5004 const PExtentVector& bluefs_gift_extents)
5005{
5006 dout(10) << __func__ << dendl;
5007 for (auto& p : bluefs_gift_extents) {
5008 bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
5009 }
5010}
5011
5012int BlueStore::_open_collections(int *errors)
5013{
5014 assert(coll_map.empty());
5015 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
5016 for (it->upper_bound(string());
5017 it->valid();
5018 it->next()) {
5019 coll_t cid;
5020 if (cid.parse(it->key())) {
5021 CollectionRef c(
5022 new Collection(
5023 this,
5024 cache_shards[cid.hash_to_shard(cache_shards.size())],
5025 cid));
5026 bufferlist bl = it->value();
5027 bufferlist::iterator p = bl.begin();
5028 try {
5029 ::decode(c->cnode, p);
5030 } catch (buffer::error& e) {
5031 derr << __func__ << " failed to decode cnode, key:"
5032 << pretty_binary_string(it->key()) << dendl;
5033 return -EIO;
5034 }
5035 dout(20) << __func__ << " opened " << cid << " " << c << dendl;
5036 coll_map[cid] = c;
5037 } else {
5038 derr << __func__ << " unrecognized collection " << it->key() << dendl;
5039 if (errors)
5040 (*errors)++;
5041 }
5042 }
5043 return 0;
5044}
5045
224ce89b 5046void BlueStore::_open_statfs()
31f18b77
FG
5047{
5048 bufferlist bl;
5049 int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
5050 if (r >= 0) {
5051 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
5052 auto it = bl.begin();
5053 vstatfs.decode(it);
224ce89b 5054 } else {
31f18b77
FG
5055 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
5056 }
5057 }
5058 else {
5059 dout(10) << __func__ << " store_statfs missed, using empty" << dendl;
5060 }
5061}
5062
7c673cae
FG
5063int BlueStore::_setup_block_symlink_or_file(
5064 string name,
5065 string epath,
5066 uint64_t size,
5067 bool create)
5068{
5069 dout(20) << __func__ << " name " << name << " path " << epath
5070 << " size " << size << " create=" << (int)create << dendl;
5071 int r = 0;
5072 int flags = O_RDWR;
5073 if (create)
5074 flags |= O_CREAT;
5075 if (epath.length()) {
5076 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
5077 if (r < 0) {
5078 r = -errno;
5079 derr << __func__ << " failed to create " << name << " symlink to "
5080 << epath << ": " << cpp_strerror(r) << dendl;
5081 return r;
5082 }
5083
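    // SPDK (userspace NVMe) targets are not regular block devices; record the
    // controller serial number taken from the path instead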
5084 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
5085 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
5086 if (fd < 0) {
5087 r = -errno;
5088 derr << __func__ << " failed to open " << epath << " file: "
5089 << cpp_strerror(r) << dendl;
5090 return r;
5091 }
5092 string serial_number = epath.substr(strlen(SPDK_PREFIX));
5093 r = ::write(fd, serial_number.c_str(), serial_number.size());
5094 assert(r == (int)serial_number.size());
5095 dout(1) << __func__ << " created " << name << " symlink to "
5096 << epath << dendl;
5097 VOID_TEMP_FAILURE_RETRY(::close(fd));
5098 }
5099 }
5100 if (size) {
5101 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
5102 if (fd >= 0) {
5103 // block file is present
5104 struct stat st;
5105 int r = ::fstat(fd, &st);
5106 if (r == 0 &&
5107 S_ISREG(st.st_mode) && // if it is a regular file
5108 st.st_size == 0) { // and is 0 bytes
5109 r = ::ftruncate(fd, size);
5110 if (r < 0) {
5111 r = -errno;
5112 derr << __func__ << " failed to resize " << name << " file to "
5113 << size << ": " << cpp_strerror(r) << dendl;
5114 VOID_TEMP_FAILURE_RETRY(::close(fd));
5115 return r;
5116 }
5117
5118 if (cct->_conf->bluestore_block_preallocate_file) {
5119#ifdef HAVE_POSIX_FALLOCATE
5120 r = ::posix_fallocate(fd, 0, size);
5121 if (r) {
5122 derr << __func__ << " failed to prefallocate " << name << " file to "
5123 << size << ": " << cpp_strerror(r) << dendl;
5124 VOID_TEMP_FAILURE_RETRY(::close(fd));
5125 return -r;
5126 }
5127#else
5128 char data[1024*128];
5129 for (uint64_t off = 0; off < size; off += sizeof(data)) {
5130 if (off + sizeof(data) > size)
5131 r = ::write(fd, data, size - off);
5132 else
5133 r = ::write(fd, data, sizeof(data));
5134 if (r < 0) {
5135 r = -errno;
5136 derr << __func__ << " failed to prefallocate w/ write " << name << " file to "
5137 << size << ": " << cpp_strerror(r) << dendl;
5138 VOID_TEMP_FAILURE_RETRY(::close(fd));
5139 return r;
5140 }
5141 }
5142#endif
5143 }
5144 dout(1) << __func__ << " resized " << name << " file to "
5145 << pretty_si_t(size) << "B" << dendl;
5146 }
5147 VOID_TEMP_FAILURE_RETRY(::close(fd));
5148 } else {
5149 int r = -errno;
5150 if (r != -ENOENT) {
5151 derr << __func__ << " failed to open " << name << " file: "
5152 << cpp_strerror(r) << dendl;
5153 return r;
5154 }
5155 }
5156 }
5157 return 0;
5158}
5159
5160int BlueStore::mkfs()
5161{
5162 dout(1) << __func__ << " path " << path << dendl;
5163 int r;
5164 uuid_d old_fsid;
5165
5166 {
5167 string done;
5168 r = read_meta("mkfs_done", &done);
5169 if (r == 0) {
5170 dout(1) << __func__ << " already created" << dendl;
5171 if (cct->_conf->bluestore_fsck_on_mkfs) {
5172 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5173 if (r < 0) {
5174 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
5175 << dendl;
5176 return r;
5177 }
5178 if (r > 0) {
5179 derr << __func__ << " fsck found " << r << " errors" << dendl;
5180 r = -EIO;
5181 }
5182 }
5183 return r; // idempotent
5184 }
5185 }
5186
5187 {
5188 string type;
5189 r = read_meta("type", &type);
5190 if (r == 0) {
5191 if (type != "bluestore") {
5192 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5193 return -EIO;
5194 }
5195 } else {
5196 r = write_meta("type", "bluestore");
5197 if (r < 0)
5198 return r;
5199 }
5200 }
5201
5202 freelist_type = "bitmap";
5203
5204 r = _open_path();
5205 if (r < 0)
5206 return r;
5207
5208 r = _open_fsid(true);
5209 if (r < 0)
5210 goto out_path_fd;
5211
5212 r = _lock_fsid();
5213 if (r < 0)
5214 goto out_close_fsid;
5215
5216 r = _read_fsid(&old_fsid);
5217 if (r < 0 || old_fsid.is_zero()) {
5218 if (fsid.is_zero()) {
5219 fsid.generate_random();
5220 dout(1) << __func__ << " generated fsid " << fsid << dendl;
5221 } else {
5222 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
5223 }
5224 // we'll write it later.
5225 } else {
5226 if (!fsid.is_zero() && fsid != old_fsid) {
5227 derr << __func__ << " on-disk fsid " << old_fsid
5228 << " != provided " << fsid << dendl;
5229 r = -EINVAL;
5230 goto out_close_fsid;
5231 }
5232 fsid = old_fsid;
5233 }
5234
5235 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
5236 cct->_conf->bluestore_block_size,
5237 cct->_conf->bluestore_block_create);
5238 if (r < 0)
5239 goto out_close_fsid;
5240 if (cct->_conf->bluestore_bluefs) {
5241 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
5242 cct->_conf->bluestore_block_wal_size,
5243 cct->_conf->bluestore_block_wal_create);
5244 if (r < 0)
5245 goto out_close_fsid;
5246 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
5247 cct->_conf->bluestore_block_db_size,
5248 cct->_conf->bluestore_block_db_create);
5249 if (r < 0)
5250 goto out_close_fsid;
5251 }
5252
5253 r = _open_bdev(true);
5254 if (r < 0)
5255 goto out_close_fsid;
5256
3efd9988
FG
5257 {
5258 string wal_path = cct->_conf->get_val<string>("bluestore_block_wal_path");
5259 if (wal_path.size()) {
5260 write_meta("path_block.wal", wal_path);
5261 }
5262 string db_path = cct->_conf->get_val<string>("bluestore_block_db_path");
5263 if (db_path.size()) {
5264 write_meta("path_block.db", db_path);
5265 }
5266 }
5267
5268 // choose min_alloc_size
5269 if (cct->_conf->bluestore_min_alloc_size) {
5270 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
5271 } else {
5272 assert(bdev);
5273 if (bdev->is_rotational()) {
5274 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
5275 } else {
5276 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
5277 }
5278 }
5279
5280 // make sure min_alloc_size is power of 2 aligned.
5281 if (!ISP2(min_alloc_size)) {
5282 derr << __func__ << " min_alloc_size 0x"
5283 << std::hex << min_alloc_size << std::dec
5284 << " is not power of 2 aligned!"
5285 << dendl;
5286 r = -EINVAL;
5287 goto out_close_bdev;
5288 }
5289
7c673cae
FG
5290 r = _open_db(true);
5291 if (r < 0)
5292 goto out_close_bdev;
5293
5294 r = _open_fm(true);
5295 if (r < 0)
5296 goto out_close_db;
5297
5298 {
5299 KeyValueDB::Transaction t = db->get_transaction();
5300 {
5301 bufferlist bl;
5302 ::encode((uint64_t)0, bl);
5303 t->set(PREFIX_SUPER, "nid_max", bl);
5304 t->set(PREFIX_SUPER, "blobid_max", bl);
5305 }
5306
7c673cae
FG
5307 {
5308 bufferlist bl;
5309 ::encode((uint64_t)min_alloc_size, bl);
5310 t->set(PREFIX_SUPER, "min_alloc_size", bl);
5311 }
5312
5313 ondisk_format = latest_ondisk_format;
5314 _prepare_ondisk_format_super(t);
5315 db->submit_transaction_sync(t);
5316 }
5317
7c673cae
FG
5318
5319 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
5320 if (r < 0)
224ce89b
WB
5321 goto out_close_fm;
5322
3efd9988 5323 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 5324 if (r < 0)
224ce89b 5325 goto out_close_fm;
7c673cae
FG
5326
5327 if (fsid != old_fsid) {
5328 r = _write_fsid();
5329 if (r < 0) {
5330 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 5331 goto out_close_fm;
7c673cae
FG
5332 }
5333 }
5334
7c673cae
FG
5335 out_close_fm:
5336 _close_fm();
5337 out_close_db:
5338 _close_db();
5339 out_close_bdev:
5340 _close_bdev();
5341 out_close_fsid:
5342 _close_fsid();
5343 out_path_fd:
5344 _close_path();
5345
5346 if (r == 0 &&
5347 cct->_conf->bluestore_fsck_on_mkfs) {
5348 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5349 if (rc < 0)
5350 return rc;
5351 if (rc > 0) {
5352 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5353 r = -EIO;
5354 }
5355 }
31f18b77
FG
5356
5357 if (r == 0) {
5358 // indicate success by writing the 'mkfs_done' file
5359 r = write_meta("mkfs_done", "yes");
5360 }
5361
7c673cae
FG
5362 if (r < 0) {
5363 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
31f18b77
FG
5364 } else {
5365 dout(0) << __func__ << " success" << dendl;
7c673cae
FG
5366 }
5367 return r;
5368}
5369
5370void BlueStore::set_cache_shards(unsigned num)
5371{
5372 dout(10) << __func__ << " " << num << dendl;
5373 size_t old = cache_shards.size();
5374 assert(num >= old);
5375 cache_shards.resize(num);
5376 for (unsigned i = old; i < num; ++i) {
5377 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
5378 logger);
5379 }
5380}
5381
5382int BlueStore::_mount(bool kv_only)
5383{
5384 dout(1) << __func__ << " path " << path << dendl;
5385
3efd9988
FG
5386 _kv_only = kv_only;
5387
7c673cae
FG
5388 {
5389 string type;
5390 int r = read_meta("type", &type);
5391 if (r < 0) {
5392 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5393 << dendl;
5394 return r;
5395 }
5396
5397 if (type != "bluestore") {
5398 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5399 return -EIO;
5400 }
5401 }
5402
5403 if (cct->_conf->bluestore_fsck_on_mount) {
5404 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
5405 if (rc < 0)
5406 return rc;
5407 if (rc > 0) {
5408 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5409 return -EIO;
5410 }
5411 }
5412
5413 int r = _open_path();
5414 if (r < 0)
5415 return r;
5416 r = _open_fsid(false);
5417 if (r < 0)
5418 goto out_path;
5419
5420 r = _read_fsid(&fsid);
5421 if (r < 0)
5422 goto out_fsid;
5423
5424 r = _lock_fsid();
5425 if (r < 0)
5426 goto out_fsid;
5427
5428 r = _open_bdev(false);
5429 if (r < 0)
5430 goto out_fsid;
5431
5432 r = _open_db(false);
5433 if (r < 0)
5434 goto out_bdev;
5435
5436 if (kv_only)
5437 return 0;
5438
5439 r = _open_super_meta();
5440 if (r < 0)
5441 goto out_db;
5442
5443 r = _open_fm(false);
5444 if (r < 0)
5445 goto out_db;
5446
5447 r = _open_alloc();
5448 if (r < 0)
5449 goto out_fm;
5450
5451 r = _open_collections();
5452 if (r < 0)
5453 goto out_alloc;
5454
5455 r = _reload_logger();
5456 if (r < 0)
5457 goto out_coll;
5458
5459 if (bluefs) {
5460 r = _reconcile_bluefs_freespace();
5461 if (r < 0)
5462 goto out_coll;
5463 }
5464
31f18b77 5465 _kv_start();
7c673cae
FG
5466
5467 r = _deferred_replay();
5468 if (r < 0)
5469 goto out_stop;
5470
5471 mempool_thread.init();
5472
7c673cae
FG
5473 mounted = true;
5474 return 0;
5475
5476 out_stop:
5477 _kv_stop();
7c673cae 5478 out_coll:
31f18b77 5479 _flush_cache();
7c673cae
FG
5480 out_alloc:
5481 _close_alloc();
5482 out_fm:
5483 _close_fm();
5484 out_db:
5485 _close_db();
5486 out_bdev:
5487 _close_bdev();
5488 out_fsid:
5489 _close_fsid();
5490 out_path:
5491 _close_path();
5492 return r;
5493}
5494
5495int BlueStore::umount()
5496{
3efd9988 5497 assert(_kv_only || mounted);
7c673cae
FG
5498 dout(1) << __func__ << dendl;
5499
5500 _osr_drain_all();
5501 _osr_unregister_all();
5502
7c673cae 5503 mounted = false;
3efd9988
FG
5504 if (!_kv_only) {
5505 mempool_thread.shutdown();
5506 dout(20) << __func__ << " stopping kv thread" << dendl;
5507 _kv_stop();
3efd9988
FG
5508 _flush_cache();
5509 dout(20) << __func__ << " closing" << dendl;
5510
5511 _close_alloc();
5512 _close_fm();
5513 }
7c673cae
FG
5514 _close_db();
5515 _close_bdev();
5516 _close_fsid();
5517 _close_path();
5518
5519 if (cct->_conf->bluestore_fsck_on_umount) {
5520 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
5521 if (rc < 0)
5522 return rc;
5523 if (rc > 0) {
5524 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5525 return -EIO;
5526 }
5527 }
5528 return 0;
5529}
5530
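// fsck helper: walk [off, off+len), rounded up to 'granularity'-sized units,
// and call f once per unit index so the caller can test or set the matching
// bit in the allocation bitset.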
5531static void apply(uint64_t off,
5532 uint64_t len,
5533 uint64_t granularity,
5534 BlueStore::mempool_dynamic_bitset &bitset,
7c673cae
FG
5535 std::function<void(uint64_t,
5536 BlueStore::mempool_dynamic_bitset &)> f) {
5537 auto end = ROUND_UP_TO(off + len, granularity);
5538 while (off < end) {
5539 uint64_t pos = off / granularity;
5540 f(pos, bitset);
5541 off += granularity;
5542 }
5543}
5544
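// fsck helper: account every valid pextent of a blob in expected_statfs and
// mark its allocation units in used_blocks; report an error if a unit was
// already marked (overlapping allocation) or the extent ends past the end of
// the block device.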
5545int BlueStore::_fsck_check_extents(
5546 const ghobject_t& oid,
5547 const PExtentVector& extents,
5548 bool compressed,
5549 mempool_dynamic_bitset &used_blocks,
b32b8144 5550 uint64_t granularity,
7c673cae
FG
5551 store_statfs_t& expected_statfs)
5552{
5553 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
5554 int errors = 0;
5555 for (auto e : extents) {
5556 if (!e.is_valid())
5557 continue;
5558 expected_statfs.allocated += e.length;
5559 if (compressed) {
5560 expected_statfs.compressed_allocated += e.length;
5561 }
5562 bool already = false;
5563 apply(
b32b8144 5564 e.offset, e.length, granularity, used_blocks,
7c673cae 5565 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 5566 assert(pos < bs.size());
7c673cae
FG
5567 if (bs.test(pos))
5568 already = true;
5569 else
5570 bs.set(pos);
5571 });
5572 if (already) {
5573 derr << " " << oid << " extent " << e
5574 << " or a subset is already allocated" << dendl;
5575 ++errors;
5576 }
5577 if (e.end() > bdev->get_size()) {
5578 derr << " " << oid << " extent " << e
5579 << " past end of block device" << dendl;
5580 ++errors;
5581 }
5582 }
5583 return errors;
5584}
5585
3efd9988 5586int BlueStore::_fsck(bool deep, bool repair)
7c673cae 5587{
3efd9988
FG
5588 dout(1) << __func__
 5589 << (repair ? " repair" : " fsck")
5590 << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
7c673cae 5591 int errors = 0;
3efd9988 5592 int repaired = 0;
31f18b77
FG
5593
5594 typedef btree::btree_set<
5595 uint64_t,std::less<uint64_t>,
5596 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
5597 uint64_t_btree_t used_nids;
5598 uint64_t_btree_t used_omap_head;
5599 uint64_t_btree_t used_sbids;
5600
7c673cae 5601 mempool_dynamic_bitset used_blocks;
7c673cae
FG
5602 KeyValueDB::Iterator it;
5603 store_statfs_t expected_statfs, actual_statfs;
5604 struct sb_info_t {
5605 list<ghobject_t> oids;
5606 SharedBlobRef sb;
5607 bluestore_extent_ref_map_t ref_map;
5608 bool compressed;
5609 };
5610 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
5611
5612 uint64_t num_objects = 0;
5613 uint64_t num_extents = 0;
5614 uint64_t num_blobs = 0;
5615 uint64_t num_spanning_blobs = 0;
5616 uint64_t num_shared_blobs = 0;
5617 uint64_t num_sharded_objects = 0;
5618 uint64_t num_object_shards = 0;
5619
5620 utime_t start = ceph_clock_now();
5621
5622 int r = _open_path();
5623 if (r < 0)
5624 return r;
5625 r = _open_fsid(false);
5626 if (r < 0)
5627 goto out_path;
5628
5629 r = _read_fsid(&fsid);
5630 if (r < 0)
5631 goto out_fsid;
5632
5633 r = _lock_fsid();
5634 if (r < 0)
5635 goto out_fsid;
5636
5637 r = _open_bdev(false);
5638 if (r < 0)
5639 goto out_fsid;
5640
5641 r = _open_db(false);
5642 if (r < 0)
5643 goto out_bdev;
5644
5645 r = _open_super_meta();
5646 if (r < 0)
5647 goto out_db;
5648
5649 r = _open_fm(false);
5650 if (r < 0)
5651 goto out_db;
5652
5653 r = _open_alloc();
5654 if (r < 0)
5655 goto out_fm;
5656
5657 r = _open_collections(&errors);
5658 if (r < 0)
5659 goto out_alloc;
5660
5661 mempool_thread.init();
5662
31f18b77
FG
5663 // we need finishers and kv_{sync,finalize}_thread *just* for replay
5664 _kv_start();
7c673cae 5665 r = _deferred_replay();
31f18b77 5666 _kv_stop();
7c673cae
FG
5667 if (r < 0)
5668 goto out_scan;
5669
b32b8144 5670 used_blocks.resize(fm->get_alloc_units());
7c673cae 5671 apply(
b32b8144 5672 0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
7c673cae 5673 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 5674 assert(pos < bs.size());
7c673cae
FG
5675 bs.set(pos);
5676 }
5677 );
5678
5679 if (bluefs) {
5680 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5681 apply(
b32b8144 5682 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 5683 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 5684 assert(pos < bs.size());
7c673cae
FG
5685 bs.set(pos);
5686 }
5687 );
5688 }
5689 r = bluefs->fsck();
5690 if (r < 0) {
5691 goto out_scan;
5692 }
5693 if (r > 0)
5694 errors += r;
5695 }
5696
5697 // get expected statfs; fill unaffected fields to be able to compare
5698 // structs
5699 statfs(&actual_statfs);
5700 expected_statfs.total = actual_statfs.total;
5701 expected_statfs.available = actual_statfs.available;
5702
5703 // walk PREFIX_OBJ
5704 dout(1) << __func__ << " walking object keyspace" << dendl;
5705 it = db->get_iterator(PREFIX_OBJ);
5706 if (it) {
5707 CollectionRef c;
5708 spg_t pgid;
5709 mempool::bluestore_fsck::list<string> expecting_shards;
5710 for (it->lower_bound(string()); it->valid(); it->next()) {
31f18b77
FG
5711 if (g_conf->bluestore_debug_fsck_abort) {
5712 goto out_scan;
5713 }
7c673cae
FG
5714 dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
5715 if (is_extent_shard_key(it->key())) {
5716 while (!expecting_shards.empty() &&
5717 expecting_shards.front() < it->key()) {
3efd9988 5718 derr << "fsck error: missing shard key "
7c673cae
FG
5719 << pretty_binary_string(expecting_shards.front())
5720 << dendl;
5721 ++errors;
5722 expecting_shards.pop_front();
5723 }
5724 if (!expecting_shards.empty() &&
5725 expecting_shards.front() == it->key()) {
5726 // all good
5727 expecting_shards.pop_front();
5728 continue;
5729 }
5730
5731 uint32_t offset;
5732 string okey;
5733 get_key_extent_shard(it->key(), &okey, &offset);
3efd9988 5734 derr << "fsck error: stray shard 0x" << std::hex << offset
7c673cae
FG
5735 << std::dec << dendl;
5736 if (expecting_shards.empty()) {
3efd9988 5737 derr << "fsck error: " << pretty_binary_string(it->key())
7c673cae
FG
5738 << " is unexpected" << dendl;
5739 ++errors;
5740 continue;
5741 }
5742 while (expecting_shards.front() > it->key()) {
3efd9988 5743 derr << "fsck error: saw " << pretty_binary_string(it->key())
7c673cae 5744 << dendl;
3efd9988 5745 derr << "fsck error: exp "
7c673cae
FG
5746 << pretty_binary_string(expecting_shards.front()) << dendl;
5747 ++errors;
5748 expecting_shards.pop_front();
5749 if (expecting_shards.empty()) {
5750 break;
5751 }
5752 }
5753 continue;
5754 }
5755
5756 ghobject_t oid;
5757 int r = get_key_object(it->key(), &oid);
5758 if (r < 0) {
3efd9988 5759 derr << "fsck error: bad object key "
7c673cae
FG
5760 << pretty_binary_string(it->key()) << dendl;
5761 ++errors;
5762 continue;
5763 }
5764 if (!c ||
5765 oid.shard_id != pgid.shard ||
5766 oid.hobj.pool != (int64_t)pgid.pool() ||
5767 !c->contains(oid)) {
5768 c = nullptr;
5769 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
5770 coll_map.begin();
5771 p != coll_map.end();
5772 ++p) {
5773 if (p->second->contains(oid)) {
5774 c = p->second;
5775 break;
5776 }
5777 }
5778 if (!c) {
3efd9988 5779 derr << "fsck error: stray object " << oid
7c673cae
FG
5780 << " not owned by any collection" << dendl;
5781 ++errors;
5782 continue;
5783 }
5784 c->cid.is_pg(&pgid);
5785 dout(20) << __func__ << " collection " << c->cid << dendl;
5786 }
5787
5788 if (!expecting_shards.empty()) {
5789 for (auto &k : expecting_shards) {
3efd9988 5790 derr << "fsck error: missing shard key "
7c673cae
FG
5791 << pretty_binary_string(k) << dendl;
5792 }
5793 ++errors;
5794 expecting_shards.clear();
5795 }
5796
5797 dout(10) << __func__ << " " << oid << dendl;
5798 RWLock::RLocker l(c->lock);
5799 OnodeRef o = c->get_onode(oid, false);
5800 if (o->onode.nid) {
5801 if (o->onode.nid > nid_max) {
3efd9988 5802 derr << "fsck error: " << oid << " nid " << o->onode.nid
7c673cae
FG
5803 << " > nid_max " << nid_max << dendl;
5804 ++errors;
5805 }
5806 if (used_nids.count(o->onode.nid)) {
3efd9988 5807 derr << "fsck error: " << oid << " nid " << o->onode.nid
7c673cae
FG
5808 << " already in use" << dendl;
5809 ++errors;
5810 continue; // go for next object
5811 }
5812 used_nids.insert(o->onode.nid);
5813 }
5814 ++num_objects;
5815 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
5816 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
5817 _dump_onode(o, 30);
5818 // shards
5819 if (!o->extent_map.shards.empty()) {
5820 ++num_sharded_objects;
5821 num_object_shards += o->extent_map.shards.size();
5822 }
5823 for (auto& s : o->extent_map.shards) {
5824 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
5825 expecting_shards.push_back(string());
5826 get_extent_shard_key(o->key, s.shard_info->offset,
5827 &expecting_shards.back());
5828 if (s.shard_info->offset >= o->onode.size) {
3efd9988 5829 derr << "fsck error: " << oid << " shard 0x" << std::hex
7c673cae
FG
5830 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
5831 << std::dec << dendl;
5832 ++errors;
5833 }
5834 }
5835 // lextents
5836 map<BlobRef,bluestore_blob_t::unused_t> referenced;
5837 uint64_t pos = 0;
5838 mempool::bluestore_fsck::map<BlobRef,
5839 bluestore_blob_use_tracker_t> ref_map;
5840 for (auto& l : o->extent_map.extent_map) {
5841 dout(20) << __func__ << " " << l << dendl;
5842 if (l.logical_offset < pos) {
3efd9988 5843 derr << "fsck error: " << oid << " lextent at 0x"
7c673cae
FG
5844 << std::hex << l.logical_offset
5845 << " overlaps with the previous, which ends at 0x" << pos
5846 << std::dec << dendl;
5847 ++errors;
5848 }
5849 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
3efd9988 5850 derr << "fsck error: " << oid << " lextent at 0x"
7c673cae
FG
5851 << std::hex << l.logical_offset << "~" << l.length
5852 << " spans a shard boundary"
5853 << std::dec << dendl;
5854 ++errors;
5855 }
5856 pos = l.logical_offset + l.length;
5857 expected_statfs.stored += l.length;
5858 assert(l.blob);
5859 const bluestore_blob_t& blob = l.blob->get_blob();
5860
5861 auto& ref = ref_map[l.blob];
5862 if (ref.is_empty()) {
5863 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
5864 uint32_t l = blob.get_logical_length();
5865 ref.init(l, min_release_size);
5866 }
5867 ref.get(
5868 l.blob_offset,
5869 l.length);
5870 ++num_extents;
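 // each bit of blob.unused covers an equal share of the blob's logical
 // length; record which of those chunks this lextent actually touches so
 // the per-blob unused bitmap can be cross-checked below.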
5871 if (blob.has_unused()) {
5872 auto p = referenced.find(l.blob);
5873 bluestore_blob_t::unused_t *pu;
5874 if (p == referenced.end()) {
5875 pu = &referenced[l.blob];
5876 } else {
5877 pu = &p->second;
5878 }
5879 uint64_t blob_len = blob.get_logical_length();
5880 assert((blob_len % (sizeof(*pu)*8)) == 0);
5881 assert(l.blob_offset + l.length <= blob_len);
5882 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
5883 uint64_t start = l.blob_offset / chunk_size;
5884 uint64_t end =
5885 ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
5886 for (auto i = start; i < end; ++i) {
5887 (*pu) |= (1u << i);
5888 }
5889 }
5890 }
5891 for (auto &i : referenced) {
5892 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
5893 << std::dec << " for " << *i.first << dendl;
5894 const bluestore_blob_t& blob = i.first->get_blob();
5895 if (i.second & blob.unused) {
3efd9988 5896 derr << "fsck error: " << oid << " blob claims unused 0x"
7c673cae
FG
5897 << std::hex << blob.unused
5898 << " but extents reference 0x" << i.second
5899 << " on blob " << *i.first << dendl;
5900 ++errors;
5901 }
5902 if (blob.has_csum()) {
5903 uint64_t blob_len = blob.get_logical_length();
5904 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
5905 unsigned csum_count = blob.get_csum_count();
5906 unsigned csum_chunk_size = blob.get_csum_chunk_size();
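 // a csum chunk may straddle one or more unused-bitmap chunks; build a mask
 // of every unused bit it touches, and only if all of them are set do we
 // require the stored checksum for that chunk to be zero.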
5907 for (unsigned p = 0; p < csum_count; ++p) {
5908 unsigned pos = p * csum_chunk_size;
5909 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
5910 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
5911 unsigned mask = 1u << firstbit;
5912 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
5913 mask |= 1u << b;
5914 }
5915 if ((blob.unused & mask) == mask) {
5916 // this csum chunk region is marked unused
5917 if (blob.get_csum_item(p) != 0) {
3efd9988 5918 derr << "fsck error: " << oid
7c673cae
FG
5919 << " blob claims csum chunk 0x" << std::hex << pos
5920 << "~" << csum_chunk_size
5921 << " is unused (mask 0x" << mask << " of unused 0x"
5922 << blob.unused << ") but csum is non-zero 0x"
5923 << blob.get_csum_item(p) << std::dec << " on blob "
5924 << *i.first << dendl;
5925 ++errors;
5926 }
5927 }
5928 }
5929 }
5930 }
5931 for (auto &i : ref_map) {
5932 ++num_blobs;
5933 const bluestore_blob_t& blob = i.first->get_blob();
5934 bool equal = i.first->get_blob_use_tracker().equal(i.second);
5935 if (!equal) {
3efd9988 5936 derr << "fsck error: " << oid << " blob " << *i.first
7c673cae
FG
5937 << " doesn't match expected ref_map " << i.second << dendl;
5938 ++errors;
5939 }
5940 if (blob.is_compressed()) {
5941 expected_statfs.compressed += blob.get_compressed_payload_length();
5942 expected_statfs.compressed_original +=
5943 i.first->get_referenced_bytes();
5944 }
5945 if (blob.is_shared()) {
5946 if (i.first->shared_blob->get_sbid() > blobid_max) {
3efd9988 5947 derr << "fsck error: " << oid << " blob " << blob
7c673cae
FG
5948 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
5949 << blobid_max << dendl;
5950 ++errors;
5951 } else if (i.first->shared_blob->get_sbid() == 0) {
3efd9988 5952 derr << "fsck error: " << oid << " blob " << blob
7c673cae
FG
5953 << " marked as shared but has uninitialized sbid"
5954 << dendl;
5955 ++errors;
5956 }
5957 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
5958 sbi.sb = i.first->shared_blob;
5959 sbi.oids.push_back(oid);
5960 sbi.compressed = blob.is_compressed();
5961 for (auto e : blob.get_extents()) {
5962 if (e.is_valid()) {
5963 sbi.ref_map.get(e.offset, e.length);
5964 }
5965 }
5966 } else {
5967 errors += _fsck_check_extents(oid, blob.get_extents(),
5968 blob.is_compressed(),
5969 used_blocks,
b32b8144 5970 fm->get_alloc_size(),
7c673cae
FG
5971 expected_statfs);
5972 }
5973 }
5974 if (deep) {
5975 bufferlist bl;
5976 int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
5977 if (r < 0) {
5978 ++errors;
3efd9988 5979 derr << "fsck error: " << oid << " error during read: "
7c673cae
FG
5980 << cpp_strerror(r) << dendl;
5981 }
5982 }
5983 // omap
5984 if (o->onode.has_omap()) {
5985 if (used_omap_head.count(o->onode.nid)) {
3efd9988 5986 derr << "fsck error: " << oid << " omap_head " << o->onode.nid
7c673cae
FG
5987 << " already in use" << dendl;
5988 ++errors;
5989 } else {
5990 used_omap_head.insert(o->onode.nid);
5991 }
5992 }
7c673cae
FG
5993 }
5994 }
5995 dout(1) << __func__ << " checking shared_blobs" << dendl;
5996 it = db->get_iterator(PREFIX_SHARED_BLOB);
5997 if (it) {
5998 for (it->lower_bound(string()); it->valid(); it->next()) {
5999 string key = it->key();
6000 uint64_t sbid;
6001 if (get_key_shared_blob(key, &sbid)) {
3efd9988 6002 derr << "fsck error: bad key '" << key
7c673cae
FG
6003 << "' in shared blob namespace" << dendl;
6004 ++errors;
6005 continue;
6006 }
6007 auto p = sb_info.find(sbid);
6008 if (p == sb_info.end()) {
3efd9988 6009 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae
FG
6010 << std::hex << sbid << std::dec << dendl;
6011 ++errors;
6012 } else {
6013 ++num_shared_blobs;
6014 sb_info_t& sbi = p->second;
6015 bluestore_shared_blob_t shared_blob(sbid);
6016 bufferlist bl = it->value();
6017 bufferlist::iterator blp = bl.begin();
6018 ::decode(shared_blob, blp);
6019 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
6020 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 6021 derr << "fsck error: shared blob 0x" << std::hex << sbid
7c673cae
FG
6022 << std::dec << " ref_map " << shared_blob.ref_map
6023 << " != expected " << sbi.ref_map << dendl;
6024 ++errors;
6025 }
6026 PExtentVector extents;
6027 for (auto &r : shared_blob.ref_map.ref_map) {
6028 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
6029 }
6030 errors += _fsck_check_extents(p->second.oids.front(),
6031 extents,
6032 p->second.compressed,
b32b8144
FG
6033 used_blocks,
6034 fm->get_alloc_size(),
6035 expected_statfs);
7c673cae
FG
6036 sb_info.erase(p);
6037 }
6038 }
6039 }
6040 for (auto &p : sb_info) {
3efd9988 6041 derr << "fsck error: shared_blob 0x" << p.first
7c673cae
FG
6042 << " key is missing (" << *p.second.sb << ")" << dendl;
6043 ++errors;
6044 }
6045 if (!(actual_statfs == expected_statfs)) {
3efd9988 6046 derr << "fsck error: actual " << actual_statfs
7c673cae
FG
6047 << " != expected " << expected_statfs << dendl;
6048 ++errors;
6049 }
6050
6051 dout(1) << __func__ << " checking for stray omap data" << dendl;
6052 it = db->get_iterator(PREFIX_OMAP);
6053 if (it) {
6054 for (it->lower_bound(string()); it->valid(); it->next()) {
6055 uint64_t omap_head;
6056 _key_decode_u64(it->key().c_str(), &omap_head);
6057 if (used_omap_head.count(omap_head) == 0) {
3efd9988 6058 derr << "fsck error: found stray omap data on omap_head "
7c673cae
FG
6059 << omap_head << dendl;
6060 ++errors;
6061 }
6062 }
6063 }
6064
6065 dout(1) << __func__ << " checking deferred events" << dendl;
6066 it = db->get_iterator(PREFIX_DEFERRED);
6067 if (it) {
6068 for (it->lower_bound(string()); it->valid(); it->next()) {
6069 bufferlist bl = it->value();
6070 bufferlist::iterator p = bl.begin();
6071 bluestore_deferred_transaction_t wt;
6072 try {
6073 ::decode(wt, p);
6074 } catch (buffer::error& e) {
3efd9988 6075 derr << "fsck error: failed to decode deferred txn "
7c673cae
FG
6076 << pretty_binary_string(it->key()) << dendl;
6077 r = -EIO;
6078 goto out_scan;
6079 }
6080 dout(20) << __func__ << " deferred " << wt.seq
6081 << " ops " << wt.ops.size()
6082 << " released 0x" << std::hex << wt.released << std::dec << dendl;
6083 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
6084 apply(
b32b8144 6085 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 6086 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 6087 assert(pos < bs.size());
7c673cae
FG
6088 bs.set(pos);
6089 }
6090 );
6091 }
6092 }
6093 }
6094
6095 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
6096 {
6097 // remove bluefs_extents from used set since the freelist doesn't
6098 // know they are allocated.
6099 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
6100 apply(
b32b8144 6101 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 6102 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 6103 assert(pos < bs.size());
7c673cae
FG
6104 bs.reset(pos);
6105 }
6106 );
6107 }
6108 fm->enumerate_reset();
6109 uint64_t offset, length;
6110 while (fm->enumerate_next(&offset, &length)) {
6111 bool intersects = false;
6112 apply(
b32b8144 6113 offset, length, fm->get_alloc_size(), used_blocks,
7c673cae 6114 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
b32b8144 6115 assert(pos < bs.size());
7c673cae
FG
6116 if (bs.test(pos)) {
6117 intersects = true;
6118 } else {
6119 bs.set(pos);
6120 }
6121 }
6122 );
6123 if (intersects) {
3efd9988
FG
6124 if (offset == SUPER_RESERVED &&
6125 length == min_alloc_size - SUPER_RESERVED) {
6126 // this is due to the change just after luminous to min_alloc_size
6127 // granularity allocations, and our baked in assumption at the top
6128 // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
6129 // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)). harmless,
6130 // since we will never allocate this region below min_alloc_size.
6131 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
6132 << " and min_alloc_size, 0x" << std::hex << offset << "~"
6133 << length << dendl;
b5b8bbf5 6134 } else {
3efd9988
FG
6135 derr << "fsck error: free extent 0x" << std::hex << offset
6136 << "~" << length << std::dec
6137 << " intersects allocated blocks" << dendl;
6138 ++errors;
b5b8bbf5 6139 }
b5b8bbf5
FG
6140 }
6141 }
3efd9988
FG
6142 fm->enumerate_reset();
6143 size_t count = used_blocks.count();
7c673cae
FG
6144 if (used_blocks.size() != count) {
6145 assert(used_blocks.size() > count);
7c673cae 6146 ++errors;
b5b8bbf5
FG
6147 used_blocks.flip();
6148 size_t start = used_blocks.find_first();
6149 while (start != decltype(used_blocks)::npos) {
6150 size_t cur = start;
6151 while (true) {
6152 size_t next = used_blocks.find_next(cur);
6153 if (next != cur + 1) {
3efd9988 6154 derr << "fsck error: leaked extent 0x" << std::hex
b32b8144
FG
6155 << ((uint64_t)start * fm->get_alloc_size()) << "~"
6156 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
b5b8bbf5
FG
6157 << dendl;
6158 start = next;
6159 break;
6160 }
6161 cur = next;
6162 }
6163 }
6164 used_blocks.flip();
7c673cae
FG
6165 }
6166 }
6167
6168 out_scan:
6169 mempool_thread.shutdown();
31f18b77 6170 _flush_cache();
7c673cae
FG
6171 out_alloc:
6172 _close_alloc();
6173 out_fm:
6174 _close_fm();
6175 out_db:
6176 it.reset(); // before db is closed
6177 _close_db();
6178 out_bdev:
6179 _close_bdev();
6180 out_fsid:
6181 _close_fsid();
6182 out_path:
6183 _close_path();
6184
6185 // fatal errors take precedence
6186 if (r < 0)
6187 return r;
6188
6189 dout(2) << __func__ << " " << num_objects << " objects, "
6190 << num_sharded_objects << " of them sharded. "
6191 << dendl;
6192 dout(2) << __func__ << " " << num_extents << " extents to "
6193 << num_blobs << " blobs, "
6194 << num_spanning_blobs << " spanning, "
6195 << num_shared_blobs << " shared."
6196 << dendl;
6197
6198 utime_t duration = ceph_clock_now() - start;
3efd9988
FG
6199 dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
6200 << " repaired, " << (errors - repaired) << " remaining in "
7c673cae 6201 << duration << " seconds" << dendl;
3efd9988 6202 return errors - repaired;
7c673cae
FG
6203}
6204
6205void BlueStore::collect_metadata(map<string,string> *pm)
6206{
6207 dout(10) << __func__ << dendl;
6208 bdev->collect_metadata("bluestore_bdev_", pm);
6209 if (bluefs) {
6210 (*pm)["bluefs"] = "1";
6211 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
6212 bluefs->collect_metadata(pm);
6213 } else {
6214 (*pm)["bluefs"] = "0";
6215 }
6216}
6217
6218int BlueStore::statfs(struct store_statfs_t *buf)
6219{
6220 buf->reset();
6221 buf->total = bdev->get_size();
6222 buf->available = alloc->get_free();
6223
6224 if (bluefs) {
94b18763
FG
6225 // part of our shared device is "free" according to BlueFS, but we
6226 // can't touch bluestore_bluefs_min of it.
6227 int64_t shared_available = std::min(
6228 bluefs->get_free(bluefs_shared_bdev),
6229 bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
6230 if (shared_available > 0) {
6231 buf->available += shared_available;
7c673cae
FG
6232 }
6233 }
6234
31f18b77
FG
6235 {
6236 std::lock_guard<std::mutex> l(vstatfs_lock);
6237
6238 buf->allocated = vstatfs.allocated();
6239 buf->stored = vstatfs.stored();
6240 buf->compressed = vstatfs.compressed();
6241 buf->compressed_original = vstatfs.compressed_original();
6242 buf->compressed_allocated = vstatfs.compressed_allocated();
7c673cae
FG
6243 }
6244
7c673cae
FG
6245 dout(20) << __func__ << *buf << dendl;
6246 return 0;
6247}
6248
6249// ---------------
6250// cache
6251
6252BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
6253{
6254 RWLock::RLocker l(coll_lock);
6255 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
6256 if (cp == coll_map.end())
6257 return CollectionRef();
6258 return cp->second;
6259}
6260
6261void BlueStore::_queue_reap_collection(CollectionRef& c)
6262{
6263 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
 6264 // _reap_collections and this run in the same thread,
 6265 // so no lock is needed.
7c673cae
FG
6266 removed_collections.push_back(c);
6267}
6268
6269void BlueStore::_reap_collections()
6270{
94b18763 6271
7c673cae
FG
6272 list<CollectionRef> removed_colls;
6273 {
94b18763
FG
 6274 // _queue_reap_collection and this run in the same thread,
 6275 // so no lock is needed.
6276 if (!removed_collections.empty())
6277 removed_colls.swap(removed_collections);
6278 else
6279 return;
7c673cae
FG
6280 }
6281
94b18763
FG
6282 list<CollectionRef>::iterator p = removed_colls.begin();
6283 while (p != removed_colls.end()) {
7c673cae
FG
6284 CollectionRef c = *p;
6285 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6286 if (c->onode_map.map_any([&](OnodeRef o) {
6287 assert(!o->exists);
6288 if (o->flushing_count.load()) {
6289 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
6290 << " flush_txns " << o->flushing_count << dendl;
94b18763 6291 return true;
7c673cae 6292 }
94b18763 6293 return false;
7c673cae 6294 })) {
94b18763 6295 ++p;
7c673cae
FG
6296 continue;
6297 }
6298 c->onode_map.clear();
94b18763 6299 p = removed_colls.erase(p);
7c673cae
FG
6300 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
6301 }
94b18763 6302 if (removed_colls.empty()) {
7c673cae 6303 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
6304 } else {
6305 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
6306 }
6307}
6308
6309void BlueStore::_update_cache_logger()
6310{
6311 uint64_t num_onodes = 0;
6312 uint64_t num_extents = 0;
6313 uint64_t num_blobs = 0;
6314 uint64_t num_buffers = 0;
6315 uint64_t num_buffer_bytes = 0;
6316 for (auto c : cache_shards) {
6317 c->add_stats(&num_onodes, &num_extents, &num_blobs,
6318 &num_buffers, &num_buffer_bytes);
6319 }
6320 logger->set(l_bluestore_onodes, num_onodes);
6321 logger->set(l_bluestore_extents, num_extents);
6322 logger->set(l_bluestore_blobs, num_blobs);
6323 logger->set(l_bluestore_buffers, num_buffers);
6324 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
6325}
6326
6327// ---------------
6328// read operations
6329
6330ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
6331{
6332 return _get_collection(cid);
6333}
6334
6335bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
6336{
6337 CollectionHandle c = _get_collection(cid);
6338 if (!c)
6339 return false;
6340 return exists(c, oid);
6341}
6342
6343bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
6344{
6345 Collection *c = static_cast<Collection *>(c_.get());
6346 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
6347 if (!c->exists)
6348 return false;
6349
6350 bool r = true;
6351
6352 {
6353 RWLock::RLocker l(c->lock);
6354 OnodeRef o = c->get_onode(oid, false);
6355 if (!o || !o->exists)
6356 r = false;
6357 }
6358
7c673cae
FG
6359 return r;
6360}
6361
6362int BlueStore::stat(
6363 const coll_t& cid,
6364 const ghobject_t& oid,
6365 struct stat *st,
6366 bool allow_eio)
6367{
6368 CollectionHandle c = _get_collection(cid);
6369 if (!c)
6370 return -ENOENT;
6371 return stat(c, oid, st, allow_eio);
6372}
6373
6374int BlueStore::stat(
6375 CollectionHandle &c_,
6376 const ghobject_t& oid,
6377 struct stat *st,
6378 bool allow_eio)
6379{
6380 Collection *c = static_cast<Collection *>(c_.get());
6381 if (!c->exists)
6382 return -ENOENT;
6383 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
6384
6385 {
6386 RWLock::RLocker l(c->lock);
6387 OnodeRef o = c->get_onode(oid, false);
6388 if (!o || !o->exists)
6389 return -ENOENT;
6390 st->st_size = o->onode.size;
6391 st->st_blksize = 4096;
6392 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
6393 st->st_nlink = 1;
6394 }
6395
7c673cae
FG
6396 int r = 0;
6397 if (_debug_mdata_eio(oid)) {
6398 r = -EIO;
6399 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6400 }
6401 return r;
6402}
6403int BlueStore::set_collection_opts(
6404 const coll_t& cid,
6405 const pool_opts_t& opts)
6406{
6407 CollectionHandle ch = _get_collection(cid);
6408 if (!ch)
6409 return -ENOENT;
6410 Collection *c = static_cast<Collection *>(ch.get());
6411 dout(15) << __func__ << " " << cid << " options " << opts << dendl;
6412 if (!c->exists)
6413 return -ENOENT;
6414 RWLock::WLocker l(c->lock);
6415 c->pool_opts = opts;
6416 return 0;
6417}
6418
6419int BlueStore::read(
6420 const coll_t& cid,
6421 const ghobject_t& oid,
6422 uint64_t offset,
6423 size_t length,
6424 bufferlist& bl,
224ce89b 6425 uint32_t op_flags)
7c673cae
FG
6426{
6427 CollectionHandle c = _get_collection(cid);
6428 if (!c)
6429 return -ENOENT;
224ce89b 6430 return read(c, oid, offset, length, bl, op_flags);
7c673cae
FG
6431}
6432
6433int BlueStore::read(
6434 CollectionHandle &c_,
6435 const ghobject_t& oid,
6436 uint64_t offset,
6437 size_t length,
6438 bufferlist& bl,
224ce89b 6439 uint32_t op_flags)
7c673cae
FG
6440{
6441 utime_t start = ceph_clock_now();
6442 Collection *c = static_cast<Collection *>(c_.get());
6443 const coll_t &cid = c->get_cid();
6444 dout(15) << __func__ << " " << cid << " " << oid
6445 << " 0x" << std::hex << offset << "~" << length << std::dec
6446 << dendl;
6447 if (!c->exists)
6448 return -ENOENT;
6449
6450 bl.clear();
6451 int r;
6452 {
6453 RWLock::RLocker l(c->lock);
6454 utime_t start1 = ceph_clock_now();
6455 OnodeRef o = c->get_onode(oid, false);
6456 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
6457 if (!o || !o->exists) {
6458 r = -ENOENT;
6459 goto out;
6460 }
6461
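 // a read of 0~0 means "read the whole object"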
6462 if (offset == length && offset == 0)
6463 length = o->onode.size;
6464
6465 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
6466 if (r == -EIO) {
6467 logger->inc(l_bluestore_read_eio);
6468 }
7c673cae
FG
6469 }
6470
6471 out:
7c673cae
FG
6472 if (r == 0 && _debug_data_eio(oid)) {
6473 r = -EIO;
6474 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
224ce89b
WB
6475 } else if (cct->_conf->bluestore_debug_random_read_err &&
6476 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) {
6477 dout(0) << __func__ << ": inject random EIO" << dendl;
6478 r = -EIO;
7c673cae
FG
6479 }
6480 dout(10) << __func__ << " " << cid << " " << oid
6481 << " 0x" << std::hex << offset << "~" << length << std::dec
6482 << " = " << r << dendl;
6483 logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
6484 return r;
6485}
6486
6487// --------------------------------------------------------
6488// intermediate data structures used while reading
6489struct region_t {
6490 uint64_t logical_offset;
 6491 uint64_t blob_xoffset; // region offset within the blob
6492 uint64_t length;
6493 bufferlist bl;
6494
 6496 // used later in the read process
6496 uint64_t front = 0;
6497 uint64_t r_off = 0;
6498
6499 region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
6500 : logical_offset(offset),
6501 blob_xoffset(b_offs),
6502 length(len){}
6503 region_t(const region_t& from)
6504 : logical_offset(from.logical_offset),
6505 blob_xoffset(from.blob_xoffset),
6506 length(from.length){}
6507
6508 friend ostream& operator<<(ostream& out, const region_t& r) {
6509 return out << "0x" << std::hex << r.logical_offset << ":"
6510 << r.blob_xoffset << "~" << r.length << std::dec;
6511 }
6512};
6513
6514typedef list<region_t> regions2read_t;
6515typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
6516
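// Read path overview: _do_read() faults in the extent map for the range,
// satisfies what it can from the buffer cache, groups the remaining regions
// per blob in a blobs2read_t, issues the device reads (via aio when more
// than one region is needed), verifies checksums and decompresses compressed
// blobs, then assembles the result, zero-filling any holes.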
6517int BlueStore::_do_read(
6518 Collection *c,
6519 OnodeRef o,
6520 uint64_t offset,
6521 size_t length,
6522 bufferlist& bl,
6523 uint32_t op_flags)
6524{
6525 FUNCTRACE();
7c673cae
FG
6526 int r = 0;
6527
6528 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6529 << " size 0x" << o->onode.size << " (" << std::dec
6530 << o->onode.size << ")" << dendl;
6531 bl.clear();
6532
6533 if (offset >= o->onode.size) {
6534 return r;
6535 }
6536
6537 // generally, don't buffer anything, unless the client explicitly requests
6538 // it.
6539 bool buffered = false;
6540 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
6541 dout(20) << __func__ << " will do buffered read" << dendl;
6542 buffered = true;
6543 } else if (cct->_conf->bluestore_default_buffered_read &&
6544 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
6545 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
6546 dout(20) << __func__ << " defaulting to buffered read" << dendl;
6547 buffered = true;
6548 }
6549
6550 if (offset + length > o->onode.size) {
6551 length = o->onode.size - offset;
6552 }
6553
6554 utime_t start = ceph_clock_now();
6555 o->extent_map.fault_range(db, offset, length);
6556 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
6557 _dump_onode(o);
6558
6559 ready_regions_t ready_regions;
6560
 6561 // build a blob-wise list of the regions to read (those not already cached)
6562 blobs2read_t blobs2read;
6563 unsigned left = length;
6564 uint64_t pos = offset;
6565 unsigned num_regions = 0;
6566 auto lp = o->extent_map.seek_lextent(offset);
6567 while (left > 0 && lp != o->extent_map.extent_map.end()) {
6568 if (pos < lp->logical_offset) {
6569 unsigned hole = lp->logical_offset - pos;
6570 if (hole >= left) {
6571 break;
6572 }
6573 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
6574 << std::dec << dendl;
6575 pos += hole;
6576 left -= hole;
6577 }
94b18763 6578 BlobRef& bptr = lp->blob;
7c673cae
FG
6579 unsigned l_off = pos - lp->logical_offset;
6580 unsigned b_off = l_off + lp->blob_offset;
6581 unsigned b_len = std::min(left, lp->length - l_off);
6582
6583 ready_regions_t cache_res;
6584 interval_set<uint32_t> cache_interval;
6585 bptr->shared_blob->bc.read(
6586 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
6587 dout(20) << __func__ << " blob " << *bptr << std::hex
6588 << " need 0x" << b_off << "~" << b_len
6589 << " cache has 0x" << cache_interval
6590 << std::dec << dendl;
6591
6592 auto pc = cache_res.begin();
6593 while (b_len > 0) {
6594 unsigned l;
6595 if (pc != cache_res.end() &&
6596 pc->first == b_off) {
6597 l = pc->second.length();
6598 ready_regions[pos].claim(pc->second);
6599 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
6600 << b_off << "~" << l << std::dec << dendl;
6601 ++pc;
6602 } else {
6603 l = b_len;
6604 if (pc != cache_res.end()) {
6605 assert(pc->first > b_off);
6606 l = pc->first - b_off;
6607 }
6608 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
6609 << b_off << "~" << l << std::dec << dendl;
6610 blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
6611 ++num_regions;
6612 }
6613 pos += l;
6614 b_off += l;
6615 left -= l;
6616 b_len -= l;
6617 }
6618 ++lp;
6619 }
6620
6621 // read raw blob data. use aio if we have >1 blobs to read.
 6622 start = ceph_clock_now(); // for simplicity we measure the
 6623 // whole block below; the resulting
 6624 // inaccuracy is negligible.
6625 vector<bufferlist> compressed_blob_bls;
b32b8144 6626 IOContext ioc(cct, NULL, true); // allow EIO
7c673cae 6627 for (auto& p : blobs2read) {
94b18763 6628 const BlobRef& bptr = p.first;
7c673cae
FG
6629 dout(20) << __func__ << " blob " << *bptr << std::hex
6630 << " need " << p.second << std::dec << dendl;
6631 if (bptr->get_blob().is_compressed()) {
6632 // read the whole thing
6633 if (compressed_blob_bls.empty()) {
6634 // ensure we avoid any reallocation on subsequent blobs
6635 compressed_blob_bls.reserve(blobs2read.size());
6636 }
6637 compressed_blob_bls.push_back(bufferlist());
6638 bufferlist& bl = compressed_blob_bls.back();
6639 r = bptr->get_blob().map(
6640 0, bptr->get_blob().get_ondisk_length(),
6641 [&](uint64_t offset, uint64_t length) {
6642 int r;
6643 // use aio if there are more regions to read than those in this blob
6644 if (num_regions > p.second.size()) {
6645 r = bdev->aio_read(offset, length, &bl, &ioc);
6646 } else {
6647 r = bdev->read(offset, length, &bl, &ioc, false);
6648 }
6649 if (r < 0)
6650 return r;
6651 return 0;
6652 });
b32b8144
FG
6653 if (r < 0) {
6654 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
6655 if (r == -EIO) {
6656 // propagate EIO to caller
6657 return r;
6658 }
7c673cae 6659 assert(r == 0);
b32b8144 6660 }
7c673cae
FG
6661 } else {
6662 // read the pieces
6663 for (auto& reg : p.second) {
6664 // determine how much of the blob to read
6665 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
6666 reg.r_off = reg.blob_xoffset;
6667 uint64_t r_len = reg.length;
6668 reg.front = reg.r_off % chunk_size;
6669 if (reg.front) {
6670 reg.r_off -= reg.front;
6671 r_len += reg.front;
6672 }
6673 unsigned tail = r_len % chunk_size;
6674 if (tail) {
6675 r_len += chunk_size - tail;
6676 }
6677 dout(20) << __func__ << " region 0x" << std::hex
6678 << reg.logical_offset
6679 << ": 0x" << reg.blob_xoffset << "~" << reg.length
6680 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
6681 << dendl;
6682
6683 // read it
6684 r = bptr->get_blob().map(
6685 reg.r_off, r_len,
6686 [&](uint64_t offset, uint64_t length) {
6687 int r;
6688 // use aio if there is more than one region to read
6689 if (num_regions > 1) {
6690 r = bdev->aio_read(offset, length, &reg.bl, &ioc);
6691 } else {
6692 r = bdev->read(offset, length, &reg.bl, &ioc, false);
6693 }
6694 if (r < 0)
6695 return r;
6696 return 0;
6697 });
b32b8144
FG
6698 if (r < 0) {
6699 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
6700 << dendl;
6701 if (r == -EIO) {
6702 // propagate EIO to caller
6703 return r;
6704 }
6705 assert(r == 0);
6706 }
7c673cae
FG
6707 assert(reg.bl.length() == r_len);
6708 }
6709 }
6710 }
6711 if (ioc.has_pending_aios()) {
6712 bdev->aio_submit(&ioc);
6713 dout(20) << __func__ << " waiting for aio" << dendl;
6714 ioc.aio_wait();
b32b8144
FG
6715 r = ioc.get_return_value();
6716 if (r < 0) {
6717 assert(r == -EIO); // no other errors allowed
6718 return -EIO;
6719 }
7c673cae
FG
6720 }
6721 logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
6722
6723 // enumerate and decompress desired blobs
6724 auto p = compressed_blob_bls.begin();
6725 blobs2read_t::iterator b2r_it = blobs2read.begin();
6726 while (b2r_it != blobs2read.end()) {
94b18763 6727 const BlobRef& bptr = b2r_it->first;
7c673cae
FG
6728 dout(20) << __func__ << " blob " << *bptr << std::hex
6729 << " need 0x" << b2r_it->second << std::dec << dendl;
6730 if (bptr->get_blob().is_compressed()) {
6731 assert(p != compressed_blob_bls.end());
6732 bufferlist& compressed_bl = *p++;
6733 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
6734 b2r_it->second.front().logical_offset) < 0) {
6735 return -EIO;
6736 }
6737 bufferlist raw_bl;
6738 r = _decompress(compressed_bl, &raw_bl);
6739 if (r < 0)
6740 return r;
6741 if (buffered) {
6742 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
6743 raw_bl);
6744 }
6745 for (auto& i : b2r_it->second) {
6746 ready_regions[i.logical_offset].substr_of(
6747 raw_bl, i.blob_xoffset, i.length);
6748 }
6749 } else {
6750 for (auto& reg : b2r_it->second) {
6751 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
6752 reg.logical_offset) < 0) {
6753 return -EIO;
6754 }
6755 if (buffered) {
6756 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
6757 reg.r_off, reg.bl);
6758 }
6759
6760 // prune and keep result
6761 ready_regions[reg.logical_offset].substr_of(
6762 reg.bl, reg.front, reg.length);
6763 }
6764 }
6765 ++b2r_it;
6766 }
6767
6768 // generate a resulting buffer
6769 auto pr = ready_regions.begin();
6770 auto pr_end = ready_regions.end();
6771 pos = 0;
6772 while (pos < length) {
6773 if (pr != pr_end && pr->first == pos + offset) {
6774 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6775 << ": data from 0x" << pr->first << "~" << pr->second.length()
6776 << std::dec << dendl;
6777 pos += pr->second.length();
6778 bl.claim_append(pr->second);
6779 ++pr;
6780 } else {
6781 uint64_t l = length - pos;
6782 if (pr != pr_end) {
6783 assert(pr->first > pos + offset);
6784 l = pr->first - (pos + offset);
6785 }
6786 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6787 << ": zeros for 0x" << (pos + offset) << "~" << l
6788 << std::dec << dendl;
6789 bl.append_zero(l);
6790 pos += l;
6791 }
6792 }
6793 assert(bl.length() == length);
6794 assert(pos == length);
6795 assert(pr == pr_end);
6796 r = bl.length();
6797 return r;
6798}
6799
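// Verify the blob checksum covering the data in 'bl' starting at blob offset
// 'blob_xoffset'.  A return of -1 means a checksum mismatch (the bad device
// and logical locations are logged); other negative values are internal
// errors.  Callers treat any negative return as an I/O error, e.g.:
//
//   if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
//                    reg.logical_offset) < 0)
//     return -EIO;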
6800int BlueStore::_verify_csum(OnodeRef& o,
6801 const bluestore_blob_t* blob, uint64_t blob_xoffset,
6802 const bufferlist& bl,
6803 uint64_t logical_offset) const
6804{
6805 int bad;
6806 uint64_t bad_csum;
6807 utime_t start = ceph_clock_now();
6808 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
6809 if (r < 0) {
6810 if (r == -1) {
6811 PExtentVector pex;
6812 blob->map(
6813 bad,
6814 blob->get_csum_chunk_size(),
6815 [&](uint64_t offset, uint64_t length) {
6816 pex.emplace_back(bluestore_pextent_t(offset, length));
6817 return 0;
6818 });
6819 derr << __func__ << " bad "
6820 << Checksummer::get_csum_type_string(blob->csum_type)
6821 << "/0x" << std::hex << blob->get_csum_chunk_size()
6822 << " checksum at blob offset 0x" << bad
6823 << ", got 0x" << bad_csum << ", expected 0x"
6824 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
6825 << ", device location " << pex
6826 << ", logical extent 0x" << std::hex
6827 << (logical_offset + bad - blob_xoffset) << "~"
6828 << blob->get_csum_chunk_size() << std::dec
6829 << ", object " << o->oid
6830 << dendl;
6831 } else {
6832 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
6833 }
6834 }
6835 logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
6836 return r;
6837}
6838
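// Decompress 'source' into 'result': the bluestore_compression_header_t at
// the front of the buffer names the algorithm and the expected raw length;
// if the currently loaded compressor doesn't match, one is created on the
// fly, and a missing decompressor or a decode failure is reported as -EIO.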
6839int BlueStore::_decompress(bufferlist& source, bufferlist* result)
6840{
6841 int r = 0;
6842 utime_t start = ceph_clock_now();
6843 bufferlist::iterator i = source.begin();
6844 bluestore_compression_header_t chdr;
6845 ::decode(chdr, i);
6846 int alg = int(chdr.type);
6847 CompressorRef cp = compressor;
6848 if (!cp || (int)cp->get_type() != alg) {
6849 cp = Compressor::create(cct, alg);
6850 }
6851
6852 if (!cp.get()) {
 6853 // if the compressor isn't available we must fail, because we cannot
 6854 // return the decompressed data
6855 derr << __func__ << " can't load decompressor " << alg << dendl;
6856 r = -EIO;
6857 } else {
6858 r = cp->decompress(i, chdr.length, *result);
6859 if (r < 0) {
6860 derr << __func__ << " decompression failed with exit code " << r << dendl;
6861 r = -EIO;
6862 }
6863 }
6864 logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
6865 return r;
6866}
6867
6868// this stores fiemap into interval_set, other variations
6869// use it internally
6870int BlueStore::_fiemap(
6871 CollectionHandle &c_,
6872 const ghobject_t& oid,
6873 uint64_t offset,
6874 size_t length,
6875 interval_set<uint64_t>& destset)
6876{
6877 Collection *c = static_cast<Collection *>(c_.get());
6878 if (!c->exists)
6879 return -ENOENT;
6880 {
6881 RWLock::RLocker l(c->lock);
6882
6883 OnodeRef o = c->get_onode(oid, false);
6884 if (!o || !o->exists) {
6885 return -ENOENT;
6886 }
6887 _dump_onode(o);
6888
6889 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6890 << " size 0x" << o->onode.size << std::dec << dendl;
6891
6892 boost::intrusive::set<Extent>::iterator ep, eend;
6893 if (offset >= o->onode.size)
6894 goto out;
6895
6896 if (offset + length > o->onode.size) {
6897 length = o->onode.size - offset;
6898 }
6899
6900 o->extent_map.fault_range(db, offset, length);
6901 eend = o->extent_map.extent_map.end();
6902 ep = o->extent_map.seek_lextent(offset);
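 // walk the lextents: skip extents that end at or before 'offset', record
 // the overlapping part of an extent that covers 'offset', and otherwise
 // advance across the hole up to the next extent (or the end of the range).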
6903 while (length > 0) {
6904 dout(20) << __func__ << " offset " << offset << dendl;
6905 if (ep != eend && ep->logical_offset + ep->length <= offset) {
6906 ++ep;
6907 continue;
6908 }
6909
6910 uint64_t x_len = length;
6911 if (ep != eend && ep->logical_offset <= offset) {
6912 uint64_t x_off = offset - ep->logical_offset;
6913 x_len = MIN(x_len, ep->length - x_off);
6914 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
6915 << x_len << std::dec << " blob " << ep->blob << dendl;
6916 destset.insert(offset, x_len);
6917 length -= x_len;
6918 offset += x_len;
6919 if (x_off + x_len == ep->length)
6920 ++ep;
6921 continue;
6922 }
6923 if (ep != eend &&
6924 ep->logical_offset > offset &&
6925 ep->logical_offset - offset < x_len) {
6926 x_len = ep->logical_offset - offset;
6927 }
6928 offset += x_len;
6929 length -= x_len;
6930 }
6931 }
6932
6933 out:
7c673cae
FG
6934 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6935 << " size = 0x(" << destset << ")" << std::dec << dendl;
6936 return 0;
6937}
6938
6939int BlueStore::fiemap(
6940 const coll_t& cid,
6941 const ghobject_t& oid,
6942 uint64_t offset,
6943 size_t len,
6944 bufferlist& bl)
6945{
6946 CollectionHandle c = _get_collection(cid);
6947 if (!c)
6948 return -ENOENT;
6949 return fiemap(c, oid, offset, len, bl);
6950}
6951
6952int BlueStore::fiemap(
6953 CollectionHandle &c_,
6954 const ghobject_t& oid,
6955 uint64_t offset,
6956 size_t length,
6957 bufferlist& bl)
6958{
6959 interval_set<uint64_t> m;
6960 int r = _fiemap(c_, oid, offset, length, m);
6961 if (r >= 0) {
6962 ::encode(m, bl);
6963 }
6964 return r;
6965}
6966
6967int BlueStore::fiemap(
6968 const coll_t& cid,
6969 const ghobject_t& oid,
6970 uint64_t offset,
6971 size_t len,
6972 map<uint64_t, uint64_t>& destmap)
6973{
6974 CollectionHandle c = _get_collection(cid);
6975 if (!c)
6976 return -ENOENT;
6977 return fiemap(c, oid, offset, len, destmap);
6978}
6979
6980int BlueStore::fiemap(
6981 CollectionHandle &c_,
6982 const ghobject_t& oid,
6983 uint64_t offset,
6984 size_t length,
6985 map<uint64_t, uint64_t>& destmap)
6986{
6987 interval_set<uint64_t> m;
6988 int r = _fiemap(c_, oid, offset, length, m);
6989 if (r >= 0) {
6990 m.move_into(destmap);
6991 }
6992 return r;
6993}
6994
6995int BlueStore::getattr(
6996 const coll_t& cid,
6997 const ghobject_t& oid,
6998 const char *name,
6999 bufferptr& value)
7000{
7001 CollectionHandle c = _get_collection(cid);
7002 if (!c)
7003 return -ENOENT;
7004 return getattr(c, oid, name, value);
7005}
7006
7007int BlueStore::getattr(
7008 CollectionHandle &c_,
7009 const ghobject_t& oid,
7010 const char *name,
7011 bufferptr& value)
7012{
7013 Collection *c = static_cast<Collection *>(c_.get());
7014 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
7015 if (!c->exists)
7016 return -ENOENT;
7017
7018 int r;
7019 {
7020 RWLock::RLocker l(c->lock);
31f18b77 7021 mempool::bluestore_cache_other::string k(name);
7c673cae
FG
7022
7023 OnodeRef o = c->get_onode(oid, false);
7024 if (!o || !o->exists) {
7025 r = -ENOENT;
7026 goto out;
7027 }
7028
7029 if (!o->onode.attrs.count(k)) {
7030 r = -ENODATA;
7031 goto out;
7032 }
7033 value = o->onode.attrs[k];
7034 r = 0;
7035 }
7036 out:
7c673cae
FG
7037 if (r == 0 && _debug_mdata_eio(oid)) {
7038 r = -EIO;
7039 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7040 }
7041 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
7042 << " = " << r << dendl;
7043 return r;
7044}
7045
7046
7047int BlueStore::getattrs(
7048 const coll_t& cid,
7049 const ghobject_t& oid,
7050 map<string,bufferptr>& aset)
7051{
7052 CollectionHandle c = _get_collection(cid);
7053 if (!c)
7054 return -ENOENT;
7055 return getattrs(c, oid, aset);
7056}
7057
7058int BlueStore::getattrs(
7059 CollectionHandle &c_,
7060 const ghobject_t& oid,
7061 map<string,bufferptr>& aset)
7062{
7063 Collection *c = static_cast<Collection *>(c_.get());
7064 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
7065 if (!c->exists)
7066 return -ENOENT;
7067
7068 int r;
7069 {
7070 RWLock::RLocker l(c->lock);
7071
7072 OnodeRef o = c->get_onode(oid, false);
7073 if (!o || !o->exists) {
7074 r = -ENOENT;
7075 goto out;
7076 }
7077 for (auto& i : o->onode.attrs) {
7078 aset.emplace(i.first.c_str(), i.second);
7079 }
7080 r = 0;
7081 }
7082
7083 out:
7c673cae
FG
7084 if (r == 0 && _debug_mdata_eio(oid)) {
7085 r = -EIO;
7086 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7087 }
7088 dout(10) << __func__ << " " << c->cid << " " << oid
7089 << " = " << r << dendl;
7090 return r;
7091}
7092
7093int BlueStore::list_collections(vector<coll_t>& ls)
7094{
7095 RWLock::RLocker l(coll_lock);
7096 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
7097 p != coll_map.end();
7098 ++p)
7099 ls.push_back(p->first);
7100 return 0;
7101}
7102
7103bool BlueStore::collection_exists(const coll_t& c)
7104{
7105 RWLock::RLocker l(coll_lock);
7106 return coll_map.count(c);
7107}
7108
7109int BlueStore::collection_empty(const coll_t& cid, bool *empty)
7110{
7111 dout(15) << __func__ << " " << cid << dendl;
7112 vector<ghobject_t> ls;
7113 ghobject_t next;
7114 int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
7115 &ls, &next);
7116 if (r < 0) {
7117 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
7118 << dendl;
7119 return r;
7120 }
7121 *empty = ls.empty();
7122 dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
7123 return 0;
7124}
7125
7126int BlueStore::collection_bits(const coll_t& cid)
7127{
7128 dout(15) << __func__ << " " << cid << dendl;
7129 CollectionRef c = _get_collection(cid);
7130 if (!c)
7131 return -ENOENT;
7132 RWLock::RLocker l(c->lock);
7133 dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
7134 return c->cnode.bits;
7135}
7136
7137int BlueStore::collection_list(
7138 const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
7139 vector<ghobject_t> *ls, ghobject_t *pnext)
7140{
7141 CollectionHandle c = _get_collection(cid);
7142 if (!c)
7143 return -ENOENT;
7144 return collection_list(c, start, end, max, ls, pnext);
7145}
7146
7147int BlueStore::collection_list(
7148 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
7149 vector<ghobject_t> *ls, ghobject_t *pnext)
7150{
7151 Collection *c = static_cast<Collection *>(c_.get());
7152 dout(15) << __func__ << " " << c->cid
7153 << " start " << start << " end " << end << " max " << max << dendl;
7154 int r;
7155 {
7156 RWLock::RLocker l(c->lock);
7157 r = _collection_list(c, start, end, max, ls, pnext);
7158 }
7159
7c673cae
FG
7160 dout(10) << __func__ << " " << c->cid
7161 << " start " << start << " end " << end << " max " << max
7162 << " = " << r << ", ls.size() = " << ls->size()
7163 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
7164 return r;
7165}
7166
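// List objects by scanning the PREFIX_OBJ keyspace.  A collection owns two
// key ranges, a temp range and a regular one; the scan starts in whichever
// range 'start' falls into, switches from temp to non-temp when the temp
// range is exhausted, skips extent-shard keys, and stops at 'end', at 'max'
// results (setting *pnext), or at the end of the collection's range.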
7167int BlueStore::_collection_list(
7168 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
7169 vector<ghobject_t> *ls, ghobject_t *pnext)
7170{
7171
7172 if (!c->exists)
7173 return -ENOENT;
7174
7175 int r = 0;
7176 ghobject_t static_next;
7177 KeyValueDB::Iterator it;
7178 string temp_start_key, temp_end_key;
7179 string start_key, end_key;
7180 bool set_next = false;
7181 string pend;
7182 bool temp;
7183
7184 if (!pnext)
7185 pnext = &static_next;
7186
7187 if (start == ghobject_t::get_max() ||
7188 start.hobj.is_max()) {
7189 goto out;
7190 }
7191 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
7192 &start_key, &end_key);
7193 dout(20) << __func__
7194 << " range " << pretty_binary_string(temp_start_key)
7195 << " to " << pretty_binary_string(temp_end_key)
7196 << " and " << pretty_binary_string(start_key)
7197 << " to " << pretty_binary_string(end_key)
7198 << " start " << start << dendl;
7199 it = db->get_iterator(PREFIX_OBJ);
7200 if (start == ghobject_t() ||
7201 start.hobj == hobject_t() ||
7202 start == c->cid.get_min_hobj()) {
7203 it->upper_bound(temp_start_key);
7204 temp = true;
7205 } else {
7206 string k;
7207 get_object_key(cct, start, &k);
7208 if (start.hobj.is_temp()) {
7209 temp = true;
7210 assert(k >= temp_start_key && k < temp_end_key);
7211 } else {
7212 temp = false;
7213 assert(k >= start_key && k < end_key);
7214 }
7215 dout(20) << " start from " << pretty_binary_string(k)
7216 << " temp=" << (int)temp << dendl;
7217 it->lower_bound(k);
7218 }
7219 if (end.hobj.is_max()) {
7220 pend = temp ? temp_end_key : end_key;
7221 } else {
7222 get_object_key(cct, end, &end_key);
7223 if (end.hobj.is_temp()) {
7224 if (temp)
7225 pend = end_key;
7226 else
7227 goto out;
7228 } else {
7229 pend = temp ? temp_end_key : end_key;
7230 }
7231 }
7232 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7233 while (true) {
7234 if (!it->valid() || it->key() >= pend) {
7235 if (!it->valid())
7236 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
7237 else
7238 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
7239 << " >= " << end << dendl;
7240 if (temp) {
7241 if (end.hobj.is_temp()) {
7242 break;
7243 }
7244 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
7245 temp = false;
7246 it->upper_bound(start_key);
7247 pend = end_key;
7248 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7249 continue;
7250 }
7251 break;
7252 }
7253 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
7254 if (is_extent_shard_key(it->key())) {
7255 it->next();
7256 continue;
7257 }
7258 ghobject_t oid;
7259 int r = get_key_object(it->key(), &oid);
7260 assert(r == 0);
7261 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
7262 if (ls->size() >= (unsigned)max) {
7263 dout(20) << __func__ << " reached max " << max << dendl;
7264 *pnext = oid;
7265 set_next = true;
7266 break;
7267 }
7268 ls->push_back(oid);
7269 it->next();
7270 }
7271out:
7272 if (!set_next) {
7273 *pnext = ghobject_t::get_max();
7274 }
7275
7276 return r;
7277}
7278
7c673cae
FG
7279int BlueStore::omap_get(
7280 const coll_t& cid, ///< [in] Collection containing oid
7281 const ghobject_t &oid, ///< [in] Object containing omap
7282 bufferlist *header, ///< [out] omap header
 7283 map<string, bufferlist> *out ///< [out] Key to value map
7284 )
7285{
7286 CollectionHandle c = _get_collection(cid);
7287 if (!c)
7288 return -ENOENT;
7289 return omap_get(c, oid, header, out);
7290}
7291
7292int BlueStore::omap_get(
7293 CollectionHandle &c_, ///< [in] Collection containing oid
7294 const ghobject_t &oid, ///< [in] Object containing omap
7295 bufferlist *header, ///< [out] omap header
 7296 map<string, bufferlist> *out ///< [out] Key to value map
7297 )
7298{
7299 Collection *c = static_cast<Collection *>(c_.get());
7300 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7301 if (!c->exists)
7302 return -ENOENT;
7303 RWLock::RLocker l(c->lock);
7304 int r = 0;
7305 OnodeRef o = c->get_onode(oid, false);
7306 if (!o || !o->exists) {
7307 r = -ENOENT;
7308 goto out;
7309 }
7310 if (!o->onode.has_omap())
7311 goto out;
7312 o->flush();
7313 {
7314 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7315 string head, tail;
7316 get_omap_header(o->onode.nid, &head);
7317 get_omap_tail(o->onode.nid, &tail);
7318 it->lower_bound(head);
7319 while (it->valid()) {
7320 if (it->key() == head) {
7321 dout(30) << __func__ << " got header" << dendl;
7322 *header = it->value();
7323 } else if (it->key() >= tail) {
7324 dout(30) << __func__ << " reached tail" << dendl;
7325 break;
7326 } else {
7327 string user_key;
7328 decode_omap_key(it->key(), &user_key);
7329 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7330 << " -> " << user_key << dendl;
7331 (*out)[user_key] = it->value();
7332 }
7333 it->next();
7334 }
7335 }
7336 out:
7337 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7338 << dendl;
7339 return r;
7340}
7341
7342int BlueStore::omap_get_header(
7343 const coll_t& cid, ///< [in] Collection containing oid
7344 const ghobject_t &oid, ///< [in] Object containing omap
7345 bufferlist *header, ///< [out] omap header
7346 bool allow_eio ///< [in] don't assert on eio
7347 )
7348{
7349 CollectionHandle c = _get_collection(cid);
7350 if (!c)
7351 return -ENOENT;
7352 return omap_get_header(c, oid, header, allow_eio);
7353}
7354
7355int BlueStore::omap_get_header(
7356 CollectionHandle &c_, ///< [in] Collection containing oid
7357 const ghobject_t &oid, ///< [in] Object containing omap
7358 bufferlist *header, ///< [out] omap header
7359 bool allow_eio ///< [in] don't assert on eio
7360 )
7361{
7362 Collection *c = static_cast<Collection *>(c_.get());
7363 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7364 if (!c->exists)
7365 return -ENOENT;
7366 RWLock::RLocker l(c->lock);
7367 int r = 0;
7368 OnodeRef o = c->get_onode(oid, false);
7369 if (!o || !o->exists) {
7370 r = -ENOENT;
7371 goto out;
7372 }
7373 if (!o->onode.has_omap())
7374 goto out;
7375 o->flush();
7376 {
7377 string head;
7378 get_omap_header(o->onode.nid, &head);
7379 if (db->get(PREFIX_OMAP, head, header) >= 0) {
7380 dout(30) << __func__ << " got header" << dendl;
7381 } else {
7382 dout(30) << __func__ << " no header" << dendl;
7383 }
7384 }
7385 out:
7386 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7387 << dendl;
7388 return r;
7389}
7390
7391int BlueStore::omap_get_keys(
7392 const coll_t& cid, ///< [in] Collection containing oid
7393 const ghobject_t &oid, ///< [in] Object containing omap
7394 set<string> *keys ///< [out] Keys defined on oid
7395 )
7396{
7397 CollectionHandle c = _get_collection(cid);
7398 if (!c)
7399 return -ENOENT;
7400 return omap_get_keys(c, oid, keys);
7401}
7402
7403int BlueStore::omap_get_keys(
7404 CollectionHandle &c_, ///< [in] Collection containing oid
7405 const ghobject_t &oid, ///< [in] Object containing omap
7406 set<string> *keys ///< [out] Keys defined on oid
7407 )
7408{
7409 Collection *c = static_cast<Collection *>(c_.get());
7410 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7411 if (!c->exists)
7412 return -ENOENT;
7413 RWLock::RLocker l(c->lock);
7414 int r = 0;
7415 OnodeRef o = c->get_onode(oid, false);
7416 if (!o || !o->exists) {
7417 r = -ENOENT;
7418 goto out;
7419 }
7420 if (!o->onode.has_omap())
7421 goto out;
7422 o->flush();
7423 {
7424 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7425 string head, tail;
7426 get_omap_key(o->onode.nid, string(), &head);
7427 get_omap_tail(o->onode.nid, &tail);
7428 it->lower_bound(head);
7429 while (it->valid()) {
7430 if (it->key() >= tail) {
7431 dout(30) << __func__ << " reached tail" << dendl;
7432 break;
7433 }
7434 string user_key;
7435 decode_omap_key(it->key(), &user_key);
7436 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7437 << " -> " << user_key << dendl;
7438 keys->insert(user_key);
7439 it->next();
7440 }
7441 }
7442 out:
7443 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7444 << dendl;
7445 return r;
7446}
7447
7448int BlueStore::omap_get_values(
7449 const coll_t& cid, ///< [in] Collection containing oid
7450 const ghobject_t &oid, ///< [in] Object containing omap
7451 const set<string> &keys, ///< [in] Keys to get
7452 map<string, bufferlist> *out ///< [out] Returned keys and values
7453 )
7454{
7455 CollectionHandle c = _get_collection(cid);
7456 if (!c)
7457 return -ENOENT;
7458 return omap_get_values(c, oid, keys, out);
7459}
7460
7461int BlueStore::omap_get_values(
7462 CollectionHandle &c_, ///< [in] Collection containing oid
7463 const ghobject_t &oid, ///< [in] Object containing omap
7464 const set<string> &keys, ///< [in] Keys to get
7465 map<string, bufferlist> *out ///< [out] Returned keys and values
7466 )
7467{
7468 Collection *c = static_cast<Collection *>(c_.get());
7469 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7470 if (!c->exists)
7471 return -ENOENT;
7472 RWLock::RLocker l(c->lock);
7473 int r = 0;
7474 string final_key;
7475 OnodeRef o = c->get_onode(oid, false);
7476 if (!o || !o->exists) {
7477 r = -ENOENT;
7478 goto out;
7479 }
7480 if (!o->onode.has_omap())
7481 goto out;
7482 o->flush();
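  // build the per-object omap key prefix once: the encoded nid (8 bytes)
  // followed by '.'; resize(9) below trims back to this prefix before
  // appending each requested user key.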
7483 _key_encode_u64(o->onode.nid, &final_key);
7484 final_key.push_back('.');
7485 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7486 final_key.resize(9); // keep prefix
7487 final_key += *p;
7488 bufferlist val;
7489 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7490 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
7491 << " -> " << *p << dendl;
7492 out->insert(make_pair(*p, val));
7493 }
7494 }
7495 out:
7496 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7497 << dendl;
7498 return r;
7499}
7500
7501int BlueStore::omap_check_keys(
7502 const coll_t& cid, ///< [in] Collection containing oid
7503 const ghobject_t &oid, ///< [in] Object containing omap
7504 const set<string> &keys, ///< [in] Keys to check
7505 set<string> *out ///< [out] Subset of keys defined on oid
7506 )
7507{
7508 CollectionHandle c = _get_collection(cid);
7509 if (!c)
7510 return -ENOENT;
7511 return omap_check_keys(c, oid, keys, out);
7512}
7513
7514int BlueStore::omap_check_keys(
7515 CollectionHandle &c_, ///< [in] Collection containing oid
7516 const ghobject_t &oid, ///< [in] Object containing omap
7517 const set<string> &keys, ///< [in] Keys to check
7518 set<string> *out ///< [out] Subset of keys defined on oid
7519 )
7520{
7521 Collection *c = static_cast<Collection *>(c_.get());
7522 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7523 if (!c->exists)
7524 return -ENOENT;
7525 RWLock::RLocker l(c->lock);
7526 int r = 0;
7527 string final_key;
7528 OnodeRef o = c->get_onode(oid, false);
7529 if (!o || !o->exists) {
7530 r = -ENOENT;
7531 goto out;
7532 }
7533 if (!o->onode.has_omap())
7534 goto out;
7535 o->flush();
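  // same prefix trick as omap_get_values: encoded nid + '.' then the user key.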
7536 _key_encode_u64(o->onode.nid, &final_key);
7537 final_key.push_back('.');
7538 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7539 final_key.resize(9); // keep prefix
7540 final_key += *p;
7541 bufferlist val;
7542 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7543 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
7544 << " -> " << *p << dendl;
7545 out->insert(*p);
7546 } else {
7547 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
7548 << " -> " << *p << dendl;
7549 }
7550 }
7551 out:
7552 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7553 << dendl;
7554 return r;
7555}
7556
7557ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7558 const coll_t& cid, ///< [in] collection
7559 const ghobject_t &oid ///< [in] object
7560 )
7561{
7562 CollectionHandle c = _get_collection(cid);
7563 if (!c) {
7564 dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
7565 return ObjectMap::ObjectMapIterator();
7566 }
7567 return get_omap_iterator(c, oid);
7568}
7569
7570ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7571 CollectionHandle &c_, ///< [in] collection
7572 const ghobject_t &oid ///< [in] object
7573 )
7574{
7575 Collection *c = static_cast<Collection *>(c_.get());
7576 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
7577 if (!c->exists) {
7578 return ObjectMap::ObjectMapIterator();
7579 }
7580 RWLock::RLocker l(c->lock);
7581 OnodeRef o = c->get_onode(oid, false);
7582 if (!o || !o->exists) {
7583 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
7584 return ObjectMap::ObjectMapIterator();
7585 }
7586 o->flush();
7587 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
7588 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7589 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
7590}
7591
7592// -----------------
7593// write helpers
7594
7595void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
7596{
7597 dout(10) << __func__ << " ondisk_format " << ondisk_format
7598 << " min_compat_ondisk_format " << min_compat_ondisk_format
7599 << dendl;
7600 assert(ondisk_format == latest_ondisk_format);
7601 {
7602 bufferlist bl;
7603 ::encode(ondisk_format, bl);
7604 t->set(PREFIX_SUPER, "ondisk_format", bl);
7605 }
7606 {
7607 bufferlist bl;
7608 ::encode(min_compat_ondisk_format, bl);
7609 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
7610 }
7611}
7612
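// _open_super_meta() below loads the mutable metadata kept under
// PREFIX_SUPER: nid/blobid watermarks, freelist type, bluefs extents,
// on-disk format/compat versions and min_alloc_size.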
7613int BlueStore::_open_super_meta()
7614{
7615 // nid
7616 {
7617 nid_max = 0;
7618 bufferlist bl;
7619 db->get(PREFIX_SUPER, "nid_max", &bl);
7620 bufferlist::iterator p = bl.begin();
7621 try {
7622 uint64_t v;
7623 ::decode(v, p);
7624 nid_max = v;
7625 } catch (buffer::error& e) {
7626 derr << __func__ << " unable to read nid_max" << dendl;
7627 return -EIO;
7628 }
7629 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
7630 nid_last = nid_max.load();
7631 }
7632
7633 // blobid
7634 {
7635 blobid_max = 0;
7636 bufferlist bl;
7637 db->get(PREFIX_SUPER, "blobid_max", &bl);
7638 bufferlist::iterator p = bl.begin();
7639 try {
7640 uint64_t v;
7641 ::decode(v, p);
7642 blobid_max = v;
7643 } catch (buffer::error& e) {
7644 derr << __func__ << " unable to read blobid_max" << dendl;
7645 return -EIO;
7646 }
7647 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
7648 blobid_last = blobid_max.load();
7649 }
7650
7651 // freelist
7652 {
7653 bufferlist bl;
7654 db->get(PREFIX_SUPER, "freelist_type", &bl);
7655 if (bl.length()) {
7656 freelist_type = std::string(bl.c_str(), bl.length());
7657 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
7658 } else {
7659 assert("extent freelist manager is not supported" == 0);
7660 }
7661 }
7662
7663 // bluefs alloc
7664 if (cct->_conf->bluestore_bluefs) {
7665 bluefs_extents.clear();
7666 bufferlist bl;
7667 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7668 bufferlist::iterator p = bl.begin();
7669 try {
7670 ::decode(bluefs_extents, p);
7671 }
7672 catch (buffer::error& e) {
7673 derr << __func__ << " unable to read bluefs_extents" << dendl;
7674 return -EIO;
7675 }
7676 dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
7677 << std::dec << dendl;
7678 }
7679
7680 // ondisk format
7681 int32_t compat_ondisk_format = 0;
7682 {
7683 bufferlist bl;
7684 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
7685 if (r < 0) {
7686 // base case: kraken bluestore is v1 and readable by v1
7687 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
7688 << dendl;
7689 ondisk_format = 1;
7690 compat_ondisk_format = 1;
7691 } else {
7692 auto p = bl.begin();
7693 try {
7694 ::decode(ondisk_format, p);
7695 } catch (buffer::error& e) {
7696 derr << __func__ << " unable to read ondisk_format" << dendl;
7697 return -EIO;
7698 }
7699 bl.clear();
7700 {
7701 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
7702 assert(!r);
7703 auto p = bl.begin();
7704 try {
7705 ::decode(compat_ondisk_format, p);
7706 } catch (buffer::error& e) {
7707 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
7708 return -EIO;
7709 }
7710 }
7711 }
7712 dout(10) << __func__ << " ondisk_format " << ondisk_format
7713 << " compat_ondisk_format " << compat_ondisk_format
7714 << dendl;
7715 }
7716
7717 if (latest_ondisk_format < compat_ondisk_format) {
7718 derr << __func__ << " compat_ondisk_format is "
7719 << compat_ondisk_format << " but we only understand version "
7720 << latest_ondisk_format << dendl;
7721 return -EPERM;
7722 }
7723 if (ondisk_format < latest_ondisk_format) {
7724 int r = _upgrade_super();
7725 if (r < 0) {
7726 return r;
7727 }
7728 }
7729
7730 {
7731 bufferlist bl;
7732 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
7733 auto p = bl.begin();
7734 try {
7735 uint64_t val;
7736 ::decode(val, p);
7737 min_alloc_size = val;
7738 min_alloc_size_order = ctz(val);
7739 assert(min_alloc_size == 1u << min_alloc_size_order);
7740 } catch (buffer::error& e) {
7741 derr << __func__ << " unable to read min_alloc_size" << dendl;
7742 return -EIO;
7743 }
7744 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7745 << std::dec << dendl;
7746 }
224ce89b 7747 _open_statfs();
7748 _set_alloc_sizes();
7749 _set_throttle_params();
7750
7751 _set_csum();
7752 _set_compression();
7753 _set_blob_size();
7754
7755 return 0;
7756}
7757
7758int BlueStore::_upgrade_super()
7759{
7760 dout(1) << __func__ << " from " << ondisk_format << ", latest "
7761 << latest_ondisk_format << dendl;
7762 assert(ondisk_format > 0);
7763 assert(ondisk_format < latest_ondisk_format);
7764
7765 if (ondisk_format == 1) {
7766 // changes:
7767 // - super: added ondisk_format
7768 // - super: added min_readable_ondisk_format
7769 // - super: added min_compat_ondisk_format
7770 // - super: added min_alloc_size
7771 // - super: removed min_min_alloc_size
7772 KeyValueDB::Transaction t = db->get_transaction();
7773 {
7774 bufferlist bl;
7775 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
7776 auto p = bl.begin();
7777 try {
7778 uint64_t val;
7779 ::decode(val, p);
7780 min_alloc_size = val;
7781 } catch (buffer::error& e) {
7782 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
7783 return -EIO;
7784 }
7785 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7786 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7787 }
7788 ondisk_format = 2;
7789 _prepare_ondisk_format_super(t);
7790 int r = db->submit_transaction_sync(t);
7791 assert(r == 0);
7792 }
7793
7794 // done
7795 dout(1) << __func__ << " done" << dendl;
7796 return 0;
7797}
7798
7799void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
7800{
7801 if (o->onode.nid) {
7802 assert(o->exists);
7c673cae 7803 return;
224ce89b 7804 }
7805 uint64_t nid = ++nid_last;
7806 dout(20) << __func__ << " " << nid << dendl;
7807 o->onode.nid = nid;
7808 txc->last_nid = nid;
224ce89b 7809 o->exists = true;
7810}
7811
7812uint64_t BlueStore::_assign_blobid(TransContext *txc)
7813{
7814 uint64_t bid = ++blobid_last;
7815 dout(20) << __func__ << " " << bid << dendl;
7816 txc->last_blobid = bid;
7817 return bid;
7818}
7819
7820void BlueStore::get_db_statistics(Formatter *f)
7821{
7822 db->get_statistics(f);
7823}
7824
7825BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
7826{
7827 TransContext *txc = new TransContext(cct, osr);
7828 txc->t = db->get_transaction();
7829 osr->queue_new(txc);
7830 dout(20) << __func__ << " osr " << osr << " = " << txc
7831 << " seq " << txc->seq << dendl;
7832 return txc;
7833}
7834
7835void BlueStore::_txc_calc_cost(TransContext *txc)
7836{
7837 // this is about the simplest model for transaction cost you can
7838 // imagine. there is some fixed overhead cost by saying there is a
7839 // minimum of one "io". and then we have some cost per "io" that is
7840 // a configurable (with different hdd and ssd defaults), and add
7841 // that to the bytes value.
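  // for example (hypothetical numbers): a txc whose pending aios carry 3
  // iovecs in total and which wrote 64 KiB would, with a cost_per_io of
  // 4000, cost (1 + 3) * 4000 + 65536 = 81536.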
7842 int ios = 1; // one "io" for the kv commit
7843 for (auto& p : txc->ioc.pending_aios) {
7844 ios += p.iov.size();
7845 }
7846 auto cost = throttle_cost_per_io.load();
7847 txc->cost = ios * cost + txc->bytes;
7848 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
7849 << ios << " ios * " << cost << " + " << txc->bytes
7850 << " bytes)" << dendl;
7851}
7852
7853void BlueStore::_txc_update_store_statfs(TransContext *txc)
7854{
7855 if (txc->statfs_delta.is_empty())
7856 return;
7857
7858 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
7859 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
7860 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
7861 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
7862 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
7863
7864 {
7865 std::lock_guard<std::mutex> l(vstatfs_lock);
7866 vstatfs += txc->statfs_delta;
7867 }
7868
7869 bufferlist bl;
7870 txc->statfs_delta.encode(bl);
7871
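  // the stat prefix is maintained with a kv merge operator (assumption:
  // registered when the db is opened), so each txc's delta accumulates into
  // the single "bluestore_statfs" key without a read-modify-write here.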
7872 txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
7873 txc->statfs_delta.reset();
7874}
7875
7876void BlueStore::_txc_state_proc(TransContext *txc)
7877{
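  // drive the txc state machine; the typical path is PREPARE -> AIO_WAIT
  // -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE -> FINISHING, with
  // a DEFERRED_QUEUED/DEFERRED_CLEANUP detour when the txc carries a
  // deferred write transaction.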
7878 while (true) {
7879 dout(10) << __func__ << " txc " << txc
7880 << " " << txc->get_state_name() << dendl;
7881 switch (txc->state) {
7882 case TransContext::STATE_PREPARE:
7883 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
7884 if (txc->ioc.has_pending_aios()) {
7885 txc->state = TransContext::STATE_AIO_WAIT;
7886 txc->had_ios = true;
7887 _txc_aio_submit(txc);
7888 return;
7889 }
7890 // ** fall-thru **
7891
7892 case TransContext::STATE_AIO_WAIT:
7893 txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
7894 _txc_finish_io(txc); // may trigger blocked txc's too
7895 return;
7896
7897 case TransContext::STATE_IO_DONE:
7898 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
7899 if (txc->had_ios) {
7900 ++txc->osr->txc_with_unstable_io;
7901 }
7902 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
7903 txc->state = TransContext::STATE_KV_QUEUED;
7904 if (cct->_conf->bluestore_sync_submit_transaction) {
7905 if (txc->last_nid >= nid_max ||
7906 txc->last_blobid >= blobid_max) {
7907 dout(20) << __func__
7908 << " last_{nid,blobid} exceeds max, submit via kv thread"
7909 << dendl;
7910 } else if (txc->osr->kv_committing_serially) {
7911 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
7912 << dendl;
7913 // note: this is starvation-prone. once we have a txc in a busy
7914 // sequencer that is committing serially it is possible to keep
7915 // submitting new transactions fast enough that we get stuck doing
7916 // so. the alternative is to block here... fixme?
7917 } else if (txc->osr->txc_with_unstable_io) {
7918 dout(20) << __func__ << " prior txc(s) with unstable ios "
7919 << txc->osr->txc_with_unstable_io.load() << dendl;
7920 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
7921 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
7922 == 0) {
7923 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
7924 << dendl;
7925 } else {
7926 txc->state = TransContext::STATE_KV_SUBMITTED;
31f18b77 7927 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
7928 assert(r == 0);
7929 _txc_applied_kv(txc);
7930 }
7931 }
7932 {
7933 std::lock_guard<std::mutex> l(kv_lock);
7934 kv_queue.push_back(txc);
7935 kv_cond.notify_one();
7936 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
7937 kv_queue_unsubmitted.push_back(txc);
7938 ++txc->osr->kv_committing_serially;
7939 }
7940 if (txc->had_ios)
7941 kv_ios++;
7942 kv_throttle_costs += txc->cost;
7943 }
7944 return;
7945 case TransContext::STATE_KV_SUBMITTED:
7946 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
7947 txc->state = TransContext::STATE_KV_DONE;
7948 _txc_committed_kv(txc);
7949 // ** fall-thru **
7950
7951 case TransContext::STATE_KV_DONE:
7952 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
7953 if (txc->deferred_txn) {
7954 txc->state = TransContext::STATE_DEFERRED_QUEUED;
7955 _deferred_queue(txc);
7956 return;
7957 }
7958 txc->state = TransContext::STATE_FINISHING;
7959 break;
7960
7961 case TransContext::STATE_DEFERRED_CLEANUP:
7962 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
7963 txc->state = TransContext::STATE_FINISHING;
7964 // ** fall-thru **
7965
7966 case TransContext::STATE_FINISHING:
7967 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
7968 _txc_finish(txc);
7969 return;
7970
7971 default:
7972 derr << __func__ << " unexpected txc " << txc
7973 << " state " << txc->get_state_name() << dendl;
7974 assert(0 == "unexpected txc state");
7975 return;
7976 }
7977 }
7978}
7979
7980void BlueStore::_txc_finish_io(TransContext *txc)
7981{
7982 dout(20) << __func__ << " " << txc << dendl;
7983
7984 /*
7985 * we need to preserve the order of kv transactions,
7986 * even though aio will complete in any order.
7987 */
7988
7989 OpSequencer *osr = txc->osr.get();
7990 std::lock_guard<std::mutex> l(osr->qlock);
7991 txc->state = TransContext::STATE_IO_DONE;
7992
7993 // release aio contexts (including pinned buffers).
7994 txc->ioc.running_aios.clear();
7995
7996 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
7997 while (p != osr->q.begin()) {
7998 --p;
7999 if (p->state < TransContext::STATE_IO_DONE) {
8000 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
8001 << p->get_state_name() << dendl;
8002 return;
8003 }
8004 if (p->state > TransContext::STATE_IO_DONE) {
8005 ++p;
8006 break;
8007 }
8008 }
8009 do {
8010 _txc_state_proc(&*p++);
8011 } while (p != osr->q.end() &&
8012 p->state == TransContext::STATE_IO_DONE);
8013
8014 if (osr->kv_submitted_waiters &&
8015 osr->_is_all_kv_submitted()) {
8016 osr->qcond.notify_all();
8017 }
8018}
8019
8020void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
8021{
8022 dout(20) << __func__ << " txc " << txc
8023 << " onodes " << txc->onodes
8024 << " shared_blobs " << txc->shared_blobs
8025 << dendl;
8026
8027 // finalize onodes
8028 for (auto o : txc->onodes) {
8029 // finalize extent_map shards
8030 o->extent_map.update(t, false);
8031 if (o->extent_map.needs_reshard()) {
8032 o->extent_map.reshard(db, t);
8033 o->extent_map.update(t, true);
8034 if (o->extent_map.needs_reshard()) {
8035 dout(20) << __func__ << " warning: still wants reshard, check options?"
8036 << dendl;
8037 o->extent_map.clear_needs_reshard();
8038 }
8039 logger->inc(l_bluestore_onode_reshard);
8040 }
8041
8042 // bound encode
8043 size_t bound = 0;
8044 denc(o->onode, bound);
8045 o->extent_map.bound_encode_spanning_blobs(bound);
8046 if (o->onode.extent_map_shards.empty()) {
8047 denc(o->extent_map.inline_bl, bound);
8048 }
8049
8050 // encode
8051 bufferlist bl;
8052 unsigned onode_part, blob_part, extent_part;
8053 {
8054 auto p = bl.get_contiguous_appender(bound, true);
8055 denc(o->onode, p);
8056 onode_part = p.get_logical_offset();
8057 o->extent_map.encode_spanning_blobs(p);
8058 blob_part = p.get_logical_offset() - onode_part;
8059 if (o->onode.extent_map_shards.empty()) {
8060 denc(o->extent_map.inline_bl, p);
8061 }
8062 extent_part = p.get_logical_offset() - onode_part - blob_part;
8063 }
8064
8065 dout(20) << " onode " << o->oid << " is " << bl.length()
8066 << " (" << onode_part << " bytes onode + "
8067 << blob_part << " bytes spanning blobs + "
8068 << extent_part << " bytes inline extents)"
8069 << dendl;
8070 t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
8071 o->flushing_count++;
8072 }
8073
8074 // objects we modified but didn't affect the onode
8075 auto p = txc->modified_objects.begin();
8076 while (p != txc->modified_objects.end()) {
8077 if (txc->onodes.count(*p) == 0) {
8078 (*p)->flushing_count++;
8079 ++p;
8080 } else {
8081 // remove dups with onodes list to avoid problems in _txc_finish
8082 p = txc->modified_objects.erase(p);
8083 }
8084 }
8085
8086 // finalize shared_blobs
8087 for (auto sb : txc->shared_blobs) {
8088 string key;
8089 auto sbid = sb->get_sbid();
8090 get_shared_blob_key(sbid, &key);
8091 if (sb->persistent->empty()) {
8092 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
8093 << " is empty" << dendl;
8094 t->rmkey(PREFIX_SHARED_BLOB, key);
8095 } else {
8096 bufferlist bl;
8097 ::encode(*(sb->persistent), bl);
8098 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
31f18b77 8099 << " is " << bl.length() << " " << *sb << dendl;
8100 t->set(PREFIX_SHARED_BLOB, key, bl);
8101 }
8102 }
8103}
8104
8105void BlueStore::BSPerfTracker::update_from_perfcounters(
8106 PerfCounters &logger)
8107{
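  // bluestore has no separate apply stage, so both exported latencies are
  // fed from the commit latency counter.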
8108 os_commit_latency.consume_next(
8109 logger.get_tavg_ms(
8110 l_bluestore_commit_lat));
8111 os_apply_latency.consume_next(
8112 logger.get_tavg_ms(
8113 l_bluestore_commit_lat));
8114}
8115
8116void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
8117{
8118 dout(20) << __func__ << " txc " << txc << std::hex
8119 << " allocated 0x" << txc->allocated
8120 << " released 0x" << txc->released
8121 << std::dec << dendl;
8122
8123 // We have to handle the case where we allocate *and* deallocate the
8124 // same region in this transaction. The freelist doesn't like that.
8125 // (Actually, the only thing that cares is the BitmapFreelistManager
8126 // debug check. But that's important.)
8127 interval_set<uint64_t> tmp_allocated, tmp_released;
8128 interval_set<uint64_t> *pallocated = &txc->allocated;
8129 interval_set<uint64_t> *preleased = &txc->released;
8130 if (!txc->allocated.empty() && !txc->released.empty()) {
8131 interval_set<uint64_t> overlap;
8132 overlap.intersection_of(txc->allocated, txc->released);
8133 if (!overlap.empty()) {
8134 tmp_allocated = txc->allocated;
8135 tmp_allocated.subtract(overlap);
8136 tmp_released = txc->released;
8137 tmp_released.subtract(overlap);
8138 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
8139 << ", new allocated 0x" << tmp_allocated
8140 << " released 0x" << tmp_released << std::dec
8141 << dendl;
8142 pallocated = &tmp_allocated;
8143 preleased = &tmp_released;
8144 }
8145 }
8146
8147 // update freelist with non-overlap sets
8148 for (interval_set<uint64_t>::iterator p = pallocated->begin();
8149 p != pallocated->end();
8150 ++p) {
8151 fm->allocate(p.get_start(), p.get_len(), t);
8152 }
8153 for (interval_set<uint64_t>::iterator p = preleased->begin();
8154 p != preleased->end();
8155 ++p) {
8156 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
8157 << "~" << p.get_len() << std::dec << dendl;
8158 fm->release(p.get_start(), p.get_len(), t);
8159 }
8160
8161 _txc_update_store_statfs(txc);
8162}
8163
8164void BlueStore::_txc_applied_kv(TransContext *txc)
8165{
8166 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
8167 for (auto& o : *ls) {
8168 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
8169 << dendl;
8170 if (--o->flushing_count == 0) {
8171 std::lock_guard<std::mutex> l(o->flush_lock);
8172 o->flush_cond.notify_all();
8173 }
8174 }
8175 }
8176}
8177
8178void BlueStore::_txc_committed_kv(TransContext *txc)
8179{
8180 dout(20) << __func__ << " txc " << txc << dendl;
8181
8182 // warning: we're calling onreadable_sync inside the sequencer lock
8183 if (txc->onreadable_sync) {
8184 txc->onreadable_sync->complete(0);
8185 txc->onreadable_sync = NULL;
8186 }
8187 unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
8188 if (txc->oncommit) {
8189 logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
8190 finishers[n]->queue(txc->oncommit);
8191 txc->oncommit = NULL;
8192 }
8193 if (txc->onreadable) {
8194 finishers[n]->queue(txc->onreadable);
8195 txc->onreadable = NULL;
8196 }
8197
8198 if (!txc->oncommits.empty()) {
8199 finishers[n]->queue(txc->oncommits);
8200 }
8201}
8202
8203void BlueStore::_txc_finish(TransContext *txc)
8204{
8205 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
8206 assert(txc->state == TransContext::STATE_FINISHING);
8207
8208 for (auto& sb : txc->shared_blobs_written) {
8209 sb->bc.finish_write(sb->get_cache(), txc->seq);
8210 }
8211 txc->shared_blobs_written.clear();
8212
8213 while (!txc->removed_collections.empty()) {
8214 _queue_reap_collection(txc->removed_collections.front());
8215 txc->removed_collections.pop_front();
8216 }
8217
8218 OpSequencerRef osr = txc->osr;
7c673cae 8219 bool empty = false;
31f18b77 8220 bool submit_deferred = false;
8221 OpSequencer::q_list_t releasing_txc;
8222 {
8223 std::lock_guard<std::mutex> l(osr->qlock);
8224 txc->state = TransContext::STATE_DONE;
8225 bool notify = false;
8226 while (!osr->q.empty()) {
8227 TransContext *txc = &osr->q.front();
8228 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
8229 << dendl;
8230 if (txc->state != TransContext::STATE_DONE) {
8231 if (txc->state == TransContext::STATE_PREPARE &&
8232 deferred_aggressive) {
8233 // for _osr_drain_preceding()
8234 notify = true;
8235 }
8236 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
8237 osr->q.size() > g_conf->bluestore_max_deferred_txc) {
8238 submit_deferred = true;
8239 }
8240 break;
8241 }
8242
8243 osr->q.pop_front();
8244 releasing_txc.push_back(*txc);
8245 notify = true;
8246 }
8247 if (notify) {
8248 osr->qcond.notify_all();
8249 }
8250 if (osr->q.empty()) {
8251 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
8252 empty = true;
8253 }
8254 }
8255 while (!releasing_txc.empty()) {
8256 // release to allocator only after all preceding txc's have also
8257 // finished any deferred writes that potentially land in these
8258 // blocks
8259 auto txc = &releasing_txc.front();
8260 _txc_release_alloc(txc);
8261 releasing_txc.pop_front();
8262 txc->log_state_latency(logger, l_bluestore_state_done_lat);
8263 delete txc;
8264 }
8265
8266 if (submit_deferred) {
8267 // we're pinning memory; flush! we could be more fine-grained here but
8268 // i'm not sure it's worth the bother.
8269 deferred_try_submit();
8270 }
8271
8272 if (empty && osr->zombie) {
8273 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
8274 osr->_unregister();
8275 }
8276}
8277
8278void BlueStore::_txc_release_alloc(TransContext *txc)
8279{
8280 // update allocator with full released set
8281 if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
8282 dout(10) << __func__ << " " << txc << " " << std::hex
8283 << txc->released << std::dec << dendl;
8284 for (interval_set<uint64_t>::iterator p = txc->released.begin();
8285 p != txc->released.end();
8286 ++p) {
8287 alloc->release(p.get_start(), p.get_len());
8288 }
8289 }
8290
8291 txc->allocated.clear();
8292 txc->released.clear();
8293}
8294
8295void BlueStore::_osr_drain_preceding(TransContext *txc)
8296{
8297 OpSequencer *osr = txc->osr.get();
8298 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
8299 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
8300 {
8301 // submit anything pending
224ce89b 8302 deferred_lock.lock();
7c673cae 8303 if (osr->deferred_pending) {
8304 _deferred_submit_unlock(osr);
8305 } else {
8306 deferred_lock.unlock();
8307 }
8308 }
8309 {
8310 // wake up any previously finished deferred events
8311 std::lock_guard<std::mutex> l(kv_lock);
8312 kv_cond.notify_one();
8313 }
8314 osr->drain_preceding(txc);
8315 --deferred_aggressive;
8316 dout(10) << __func__ << " " << osr << " done" << dendl;
8317}
8318
8319void BlueStore::_osr_drain_all()
8320{
8321 dout(10) << __func__ << dendl;
8322
8323 set<OpSequencerRef> s;
8324 {
8325 std::lock_guard<std::mutex> l(osr_lock);
8326 s = osr_set;
8327 }
8328 dout(20) << __func__ << " osr_set " << s << dendl;
8329
8330 ++deferred_aggressive;
8331 {
8332 // submit anything pending
224ce89b 8333 deferred_try_submit();
8334 }
8335 {
8336 // wake up any previously finished deferred events
8337 std::lock_guard<std::mutex> l(kv_lock);
8338 kv_cond.notify_one();
8339 }
8340 {
8341 std::lock_guard<std::mutex> l(kv_finalize_lock);
8342 kv_finalize_cond.notify_one();
8343 }
8344 for (auto osr : s) {
8345 dout(20) << __func__ << " drain " << osr << dendl;
8346 osr->drain();
8347 }
8348 --deferred_aggressive;
8349
8350 dout(10) << __func__ << " done" << dendl;
8351}
8352
8353void BlueStore::_osr_unregister_all()
8354{
8355 set<OpSequencerRef> s;
8356 {
8357 std::lock_guard<std::mutex> l(osr_lock);
8358 s = osr_set;
8359 }
8360 dout(10) << __func__ << " " << s << dendl;
8361 for (auto osr : s) {
8362 osr->_unregister();
8363
8364 if (!osr->zombie) {
8365 // break link from Sequencer to us so that this OpSequencer
8366 // instance can die with this mount/umount cycle. note that
8367 // we assume umount() will not race against ~Sequencer.
8368 assert(osr->parent);
8369 osr->parent->p.reset();
8370 }
8371 }
8372 // nobody should be creating sequencers during umount either.
8373 {
8374 std::lock_guard<std::mutex> l(osr_lock);
8375 assert(osr_set.empty());
8376 }
8377}
8378
8379void BlueStore::_kv_start()
8380{
8381 dout(10) << __func__ << dendl;
8382
8383 if (cct->_conf->bluestore_shard_finishers) {
8384 if (cct->_conf->osd_op_num_shards) {
8385 m_finisher_num = cct->_conf->osd_op_num_shards;
8386 } else {
8387 assert(bdev);
8388 if (bdev->is_rotational()) {
8389 m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
8390 } else {
8391 m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
8392 }
8393 }
8394 }
8395
8396 assert(m_finisher_num != 0);
8397
8398 for (int i = 0; i < m_finisher_num; ++i) {
8399 ostringstream oss;
8400 oss << "finisher-" << i;
8401 Finisher *f = new Finisher(cct, oss.str(), "finisher");
8402 finishers.push_back(f);
8403 }
8404
181888fb 8405 deferred_finisher.start();
8406 for (auto f : finishers) {
8407 f->start();
8408 }
8409 kv_sync_thread.create("bstore_kv_sync");
8410 kv_finalize_thread.create("bstore_kv_final");
8411}
8412
8413void BlueStore::_kv_stop()
8414{
8415 dout(10) << __func__ << dendl;
8416 {
8417 std::unique_lock<std::mutex> l(kv_lock);
8418 while (!kv_sync_started) {
8419 kv_cond.wait(l);
8420 }
8421 kv_stop = true;
8422 kv_cond.notify_all();
8423 }
8424 {
8425 std::unique_lock<std::mutex> l(kv_finalize_lock);
8426 while (!kv_finalize_started) {
8427 kv_finalize_cond.wait(l);
8428 }
8429 kv_finalize_stop = true;
8430 kv_finalize_cond.notify_all();
8431 }
8432 kv_sync_thread.join();
8433 kv_finalize_thread.join();
94b18763 8434 assert(removed_collections.empty());
8435 {
8436 std::lock_guard<std::mutex> l(kv_lock);
8437 kv_stop = false;
8438 }
8439 {
8440 std::lock_guard<std::mutex> l(kv_finalize_lock);
8441 kv_finalize_stop = false;
8442 }
8443 dout(10) << __func__ << " stopping finishers" << dendl;
8444 deferred_finisher.wait_for_empty();
8445 deferred_finisher.stop();
8446 for (auto f : finishers) {
8447 f->wait_for_empty();
8448 f->stop();
8449 }
8450 dout(10) << __func__ << " stopped" << dendl;
8451}
8452
8453void BlueStore::_kv_sync_thread()
8454{
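  // each loop iteration swaps in the queued txcs and deferred batches,
  // flushes the block device when needed so finished deferred aios become
  // stable, submits any not-yet-submitted kv transactions, commits one
  // synchronous "synct" transaction, and hands the batch to the finalize
  // thread.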
8455 dout(10) << __func__ << " start" << dendl;
8456 std::unique_lock<std::mutex> l(kv_lock);
8457 assert(!kv_sync_started);
8458 kv_sync_started = true;
8459 kv_cond.notify_all();
8460 while (true) {
8461 assert(kv_committing.empty());
8462 if (kv_queue.empty() &&
8463 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
8464 !deferred_aggressive)) {
8465 if (kv_stop)
8466 break;
8467 dout(20) << __func__ << " sleep" << dendl;
8468 kv_cond.wait(l);
8469 dout(20) << __func__ << " wake" << dendl;
8470 } else {
8471 deque<TransContext*> kv_submitting;
8472 deque<DeferredBatch*> deferred_done, deferred_stable;
8473 uint64_t aios = 0, costs = 0;
8474
8475 dout(20) << __func__ << " committing " << kv_queue.size()
8476 << " submitting " << kv_queue_unsubmitted.size()
8477 << " deferred done " << deferred_done_queue.size()
8478 << " stable " << deferred_stable_queue.size()
8479 << dendl;
8480 kv_committing.swap(kv_queue);
8481 kv_submitting.swap(kv_queue_unsubmitted);
8482 deferred_done.swap(deferred_done_queue);
8483 deferred_stable.swap(deferred_stable_queue);
8484 aios = kv_ios;
8485 costs = kv_throttle_costs;
8486 kv_ios = 0;
8487 kv_throttle_costs = 0;
8488 utime_t start = ceph_clock_now();
8489 l.unlock();
8490
8491 dout(30) << __func__ << " committing " << kv_committing << dendl;
8492 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
8493 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
8494 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
8495
8496 bool force_flush = false;
8497 // if bluefs is sharing the same device as data (only), then we
8498 // can rely on the bluefs commit to flush the device and make
8499 // deferred aios stable. that means that if we do have done deferred
8500 // txcs AND we are not on a single device, we need to force a flush.
8501 if (bluefs_single_shared_device && bluefs) {
31f18b77 8502 if (aios) {
8503 force_flush = true;
8504 } else if (kv_committing.empty() && kv_submitting.empty() &&
8505 deferred_stable.empty()) {
8506 force_flush = true; // there's nothing else to commit!
8507 } else if (deferred_aggressive) {
8508 force_flush = true;
8509 }
8510 } else
8511 force_flush = true;
8512
8513 if (force_flush) {
31f18b77 8514 dout(20) << __func__ << " num_aios=" << aios
8515 << " force_flush=" << (int)force_flush
8516 << ", flushing, deferred done->stable" << dendl;
8517 // flush/barrier on block device
8518 bdev->flush();
8519
8520 // if we flush then deferred done are now deferred stable
8521 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
8522 deferred_done.end());
8523 deferred_done.clear();
8524 }
8525 utime_t after_flush = ceph_clock_now();
8526
8527 // we will use one final transaction to force a sync
8528 KeyValueDB::Transaction synct = db->get_transaction();
8529
8530 // increase {nid,blobid}_max? note that this covers both the
8531 // case where we are approaching the max and the case we passed
8532 // it. in either case, we increase the max in the earlier txn
8533 // we submit.
8534 uint64_t new_nid_max = 0, new_blobid_max = 0;
8535 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
8536 KeyValueDB::Transaction t =
8537 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8538 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
8539 bufferlist bl;
8540 ::encode(new_nid_max, bl);
8541 t->set(PREFIX_SUPER, "nid_max", bl);
8542 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
8543 }
8544 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
8545 KeyValueDB::Transaction t =
8546 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8547 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
8548 bufferlist bl;
8549 ::encode(new_blobid_max, bl);
8550 t->set(PREFIX_SUPER, "blobid_max", bl);
8551 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
8552 }
8553
8554 for (auto txc : kv_committing) {
8555 if (txc->state == TransContext::STATE_KV_QUEUED) {
8556 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8557 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8558 assert(r == 0);
8559 _txc_applied_kv(txc);
8560 --txc->osr->kv_committing_serially;
8561 txc->state = TransContext::STATE_KV_SUBMITTED;
8562 if (txc->osr->kv_submitted_waiters) {
8563 std::lock_guard<std::mutex> l(txc->osr->qlock);
8564 if (txc->osr->_is_all_kv_submitted()) {
8565 txc->osr->qcond.notify_all();
8566 }
7c673cae 8567 }
8568
8569 } else {
8570 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8571 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
7c673cae 8572 }
8573 if (txc->had_ios) {
8574 --txc->osr->txc_with_unstable_io;
8575 }
8576 }
8577
8578 // release throttle *before* we commit. this allows new ops
8579 // to be prepared and enter pipeline while we are waiting on
8580 // the kv commit sync/flush. then hopefully on the next
8581 // iteration there will already be ops awake. otherwise, we
8582 // end up going to sleep, and then wake up when the very first
8583 // transaction is ready for commit.
8584 throttle_bytes.put(costs);
8585
8586 PExtentVector bluefs_gift_extents;
8587 if (bluefs &&
8588 after_flush - bluefs_last_balance >
8589 cct->_conf->bluestore_bluefs_balance_interval) {
8590 bluefs_last_balance = after_flush;
8591 int r = _balance_bluefs_freespace(&bluefs_gift_extents);
8592 assert(r >= 0);
8593 if (r > 0) {
8594 for (auto& p : bluefs_gift_extents) {
8595 bluefs_extents.insert(p.offset, p.length);
8596 }
8597 bufferlist bl;
8598 ::encode(bluefs_extents, bl);
8599 dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
8600 << bluefs_extents << std::dec << dendl;
8601 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
8602 }
8603 }
8604
8605 // cleanup sync deferred keys
8606 for (auto b : deferred_stable) {
8607 for (auto& txc : b->txcs) {
8608 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
8609 if (!wt.released.empty()) {
8610 // kraken replay compat only
8611 txc.released = wt.released;
8612 dout(10) << __func__ << " deferred txn has released "
8613 << txc.released
8614 << " (we just upgraded from kraken) on " << &txc << dendl;
8615 _txc_finalize_kv(&txc, synct);
8616 }
8617 // cleanup the deferred
8618 string key;
8619 get_deferred_key(wt.seq, &key);
8620 synct->rm_single_key(PREFIX_DEFERRED, key);
8621 }
8622 }
8623
8624 // submit synct synchronously (block and wait for it to commit)
31f18b77 8625 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
8626 assert(r == 0);
8627
8628 if (new_nid_max) {
8629 nid_max = new_nid_max;
8630 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
8631 }
8632 if (new_blobid_max) {
8633 blobid_max = new_blobid_max;
8634 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
8635 }
8636
8637 {
8638 utime_t finish = ceph_clock_now();
8639 utime_t dur_flush = after_flush - start;
8640 utime_t dur_kv = finish - after_flush;
8641 utime_t dur = finish - start;
8642 dout(20) << __func__ << " committed " << kv_committing.size()
8643 << " cleaned " << deferred_stable.size()
8644 << " in " << dur
8645 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
8646 << dendl;
8647 logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
8648 logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
8649 logger->tinc(l_bluestore_kv_lat, dur);
8650 }
8651
8652 if (bluefs) {
8653 if (!bluefs_gift_extents.empty()) {
8654 _commit_bluefs_freespace(bluefs_gift_extents);
8655 }
8656 for (auto p = bluefs_extents_reclaiming.begin();
8657 p != bluefs_extents_reclaiming.end();
8658 ++p) {
8659 dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
8660 << p.get_start() << "~" << p.get_len() << std::dec
8661 << dendl;
8662 alloc->release(p.get_start(), p.get_len());
8663 }
8664 bluefs_extents_reclaiming.clear();
8665 }
8666
8667 {
8668 std::unique_lock<std::mutex> m(kv_finalize_lock);
8669 if (kv_committing_to_finalize.empty()) {
8670 kv_committing_to_finalize.swap(kv_committing);
8671 } else {
8672 kv_committing_to_finalize.insert(
8673 kv_committing_to_finalize.end(),
8674 kv_committing.begin(),
8675 kv_committing.end());
8676 kv_committing.clear();
8677 }
8678 if (deferred_stable_to_finalize.empty()) {
8679 deferred_stable_to_finalize.swap(deferred_stable);
8680 } else {
8681 deferred_stable_to_finalize.insert(
8682 deferred_stable_to_finalize.end(),
8683 deferred_stable.begin(),
8684 deferred_stable.end());
8685 deferred_stable.clear();
8686 }
8687 kv_finalize_cond.notify_one();
8688 }
8689
8690 l.lock();
8691 // previously deferred "done" are now "stable" by virtue of this
8692 // commit cycle.
8693 deferred_stable_queue.swap(deferred_done);
8694 }
8695 }
8696 dout(10) << __func__ << " finish" << dendl;
8697 kv_sync_started = false;
8698}
8699
8700void BlueStore::_kv_finalize_thread()
8701{
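  // completes txcs whose kv commit has landed and deferred batches that
  // have become stable, then opportunistically flushes the deferred queue
  // and reaps removed collections.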
8702 deque<TransContext*> kv_committed;
8703 deque<DeferredBatch*> deferred_stable;
8704 dout(10) << __func__ << " start" << dendl;
8705 std::unique_lock<std::mutex> l(kv_finalize_lock);
8706 assert(!kv_finalize_started);
8707 kv_finalize_started = true;
8708 kv_finalize_cond.notify_all();
8709 while (true) {
8710 assert(kv_committed.empty());
8711 assert(deferred_stable.empty());
8712 if (kv_committing_to_finalize.empty() &&
8713 deferred_stable_to_finalize.empty()) {
8714 if (kv_finalize_stop)
8715 break;
8716 dout(20) << __func__ << " sleep" << dendl;
8717 kv_finalize_cond.wait(l);
8718 dout(20) << __func__ << " wake" << dendl;
8719 } else {
8720 kv_committed.swap(kv_committing_to_finalize);
8721 deferred_stable.swap(deferred_stable_to_finalize);
8722 l.unlock();
8723 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
8724 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
8725
8726 while (!kv_committed.empty()) {
8727 TransContext *txc = kv_committed.front();
8728 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8729 _txc_state_proc(txc);
31f18b77 8730 kv_committed.pop_front();
7c673cae 8731 }
31f18b77 8732
8733 for (auto b : deferred_stable) {
8734 auto p = b->txcs.begin();
8735 while (p != b->txcs.end()) {
8736 TransContext *txc = &*p;
8737 p = b->txcs.erase(p); // unlink here because
8738 _txc_state_proc(txc); // this may destroy txc
8739 }
8740 delete b;
8741 }
31f18b77 8742 deferred_stable.clear();
8743
8744 if (!deferred_aggressive) {
31f18b77 8745 if (deferred_queue_size >= deferred_batch_ops.load() ||
7c673cae 8746 throttle_deferred_bytes.past_midpoint()) {
224ce89b 8747 deferred_try_submit();
8748 }
8749 }
8750
8751 // this is as good a place as any ...
8752 _reap_collections();
8753
7c673cae 8754 l.lock();
8755 }
8756 }
8757 dout(10) << __func__ << " finish" << dendl;
31f18b77 8758 kv_finalize_started = false;
8759}
8760
8761bluestore_deferred_op_t *BlueStore::_get_deferred_op(
8762 TransContext *txc, OnodeRef o)
8763{
8764 if (!txc->deferred_txn) {
8765 txc->deferred_txn = new bluestore_deferred_transaction_t;
8766 }
8767 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
8768 return &txc->deferred_txn->ops.back();
8769}
8770
8771void BlueStore::_deferred_queue(TransContext *txc)
8772{
8773 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
224ce89b 8774 deferred_lock.lock();
8775 if (!txc->osr->deferred_pending &&
8776 !txc->osr->deferred_running) {
8777 deferred_queue.push_back(*txc->osr);
8778 }
8779 if (!txc->osr->deferred_pending) {
8780 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
8781 }
8782 ++deferred_queue_size;
8783 txc->osr->deferred_pending->txcs.push_back(*txc);
8784 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
8785 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
8786 const auto& op = *opi;
8787 assert(op.op == bluestore_deferred_op_t::OP_WRITE);
8788 bufferlist::const_iterator p = op.data.begin();
8789 for (auto e : op.extents) {
8790 txc->osr->deferred_pending->prepare_write(
8791 cct, wt.seq, e.offset, e.length, p);
8792 }
8793 }
8794 if (deferred_aggressive &&
8795 !txc->osr->deferred_running) {
8796 _deferred_submit_unlock(txc->osr.get());
8797 } else {
8798 deferred_lock.unlock();
8799 }
8800}
8801
224ce89b 8802void BlueStore::deferred_try_submit()
8803{
8804 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
8805 << deferred_queue_size << " txcs" << dendl;
8806 std::lock_guard<std::mutex> l(deferred_lock);
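  // take refs to the queued osrs first: _deferred_submit_unlock() drops
  // deferred_lock while submitting, so we must not iterate deferred_queue
  // itself across that call.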
8807 vector<OpSequencerRef> osrs;
8808 osrs.reserve(deferred_queue.size());
7c673cae 8809 for (auto& osr : deferred_queue) {
8810 osrs.push_back(&osr);
8811 }
8812 for (auto& osr : osrs) {
8813 if (osr->deferred_pending) {
8814 if (!osr->deferred_running) {
8815 _deferred_submit_unlock(osr.get());
8816 deferred_lock.lock();
8817 } else {
8818 dout(20) << __func__ << " osr " << osr << " already has running"
8819 << dendl;
8820 }
8821 } else {
8822 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
8823 }
8824 }
8825}
8826
224ce89b 8827void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
8828{
8829 dout(10) << __func__ << " osr " << osr
8830 << " " << osr->deferred_pending->iomap.size() << " ios pending "
8831 << dendl;
8832 assert(osr->deferred_pending);
8833 assert(!osr->deferred_running);
8834
8835 auto b = osr->deferred_pending;
8836 deferred_queue_size -= b->seq_bytes.size();
8837 assert(deferred_queue_size >= 0);
8838
8839 osr->deferred_running = osr->deferred_pending;
8840 osr->deferred_pending = nullptr;
8841
8842 uint64_t start = 0, pos = 0;
8843 bufferlist bl;
8844 auto i = b->iomap.begin();
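  // coalesce physically contiguous writes from the offset-sorted iomap
  // into a single buffer and issue one aio_write per contiguous run.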
8845 while (true) {
8846 if (i == b->iomap.end() || i->first != pos) {
8847 if (bl.length()) {
8848 dout(20) << __func__ << " write 0x" << std::hex
8849 << start << "~" << bl.length()
8850 << " crc " << bl.crc32c(-1) << std::dec << dendl;
8851 if (!g_conf->bluestore_debug_omit_block_device_write) {
8852 logger->inc(l_bluestore_deferred_write_ops);
8853 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
8854 int r = bdev->aio_write(start, bl, &b->ioc, false);
8855 assert(r == 0);
8856 }
8857 }
8858 if (i == b->iomap.end()) {
8859 break;
8860 }
8861 start = 0;
8862 pos = i->first;
8863 bl.clear();
8864 }
8865 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
8866 << std::hex << pos << "~" << i->second.bl.length() << std::dec
8867 << dendl;
8868 if (!bl.length()) {
8869 start = pos;
8870 }
8871 pos += i->second.bl.length();
8872 bl.claim_append(i->second.bl);
8873 ++i;
8874 }
224ce89b 8875
224ce89b 8876 deferred_lock.unlock();
8877 bdev->aio_submit(&b->ioc);
8878}
8879
8880struct C_DeferredTrySubmit : public Context {
8881 BlueStore *store;
8882 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
8883 void finish(int r) {
8884 store->deferred_try_submit();
8885 }
8886};
8887
8888void BlueStore::_deferred_aio_finish(OpSequencer *osr)
8889{
8890 dout(10) << __func__ << " osr " << osr << dendl;
8891 assert(osr->deferred_running);
8892 DeferredBatch *b = osr->deferred_running;
8893
8894 {
8895 std::lock_guard<std::mutex> l(deferred_lock);
8896 assert(osr->deferred_running == b);
8897 osr->deferred_running = nullptr;
8898 if (!osr->deferred_pending) {
181888fb 8899 dout(20) << __func__ << " dequeueing" << dendl;
8900 auto q = deferred_queue.iterator_to(*osr);
8901 deferred_queue.erase(q);
8902 } else if (deferred_aggressive) {
224ce89b 8903 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
3efd9988 8904 deferred_finisher.queue(new C_DeferredTrySubmit(this));
8905 } else {
8906 dout(20) << __func__ << " leaving queued, more pending" << dendl;
8907 }
8908 }
8909
8910 {
31f18b77 8911 uint64_t costs = 0;
8912 std::lock_guard<std::mutex> l2(osr->qlock);
8913 for (auto& i : b->txcs) {
8914 TransContext *txc = &i;
8915 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
31f18b77 8916 costs += txc->cost;
7c673cae 8917 }
8918 osr->qcond.notify_all();
8919 throttle_deferred_bytes.put(costs);
8920 std::lock_guard<std::mutex> l(kv_lock);
8921 deferred_done_queue.emplace_back(b);
8922 }
8923
8924 // in the normal case, do not bother waking up the kv thread; it will
8925 // catch us on the next commit anyway.
8926 if (deferred_aggressive) {
8927 std::lock_guard<std::mutex> l(kv_lock);
8928 kv_cond.notify_one();
8929 }
8930}
8931
8932int BlueStore::_deferred_replay()
8933{
8934 dout(10) << __func__ << " start" << dendl;
8935 OpSequencerRef osr = new OpSequencer(cct, this);
8936 int count = 0;
8937 int r = 0;
8938 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
8939 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
8940 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
8941 << dendl;
8942 bluestore_deferred_transaction_t *deferred_txn =
8943 new bluestore_deferred_transaction_t;
8944 bufferlist bl = it->value();
8945 bufferlist::iterator p = bl.begin();
8946 try {
8947 ::decode(*deferred_txn, p);
8948 } catch (buffer::error& e) {
8949 derr << __func__ << " failed to decode deferred txn "
8950 << pretty_binary_string(it->key()) << dendl;
8951 delete deferred_txn;
8952 r = -EIO;
8953 goto out;
8954 }
8955 TransContext *txc = _txc_create(osr.get());
8956 txc->deferred_txn = deferred_txn;
8957 txc->state = TransContext::STATE_KV_DONE;
8958 _txc_state_proc(txc);
8959 }
8960 out:
8961 dout(20) << __func__ << " draining osr" << dendl;
8962 _osr_drain_all();
8963 osr->discard();
8964 dout(10) << __func__ << " completed " << count << " events" << dendl;
8965 return r;
8966}
8967
8968// ---------------------------
8969// transactions
8970
8971int BlueStore::queue_transactions(
8972 Sequencer *posr,
8973 vector<Transaction>& tls,
8974 TrackedOpRef op,
8975 ThreadPool::TPHandle *handle)
8976{
8977 FUNCTRACE();
8978 Context *onreadable;
8979 Context *ondisk;
8980 Context *onreadable_sync;
8981 ObjectStore::Transaction::collect_contexts(
8982 tls, &onreadable, &ondisk, &onreadable_sync);
8983
8984 if (cct->_conf->objectstore_blackhole) {
8985 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
8986 << dendl;
8987 delete ondisk;
8988 delete onreadable;
8989 delete onreadable_sync;
8990 return 0;
8991 }
8992 utime_t start = ceph_clock_now();
8993 // set up the sequencer
8994 OpSequencer *osr;
8995 assert(posr);
8996 if (posr->p) {
8997 osr = static_cast<OpSequencer *>(posr->p.get());
8998 dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
8999 } else {
9000 osr = new OpSequencer(cct, this);
9001 osr->parent = posr;
9002 posr->p = osr;
9003 dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
9004 }
9005
9006 // prepare
9007 TransContext *txc = _txc_create(osr);
9008 txc->onreadable = onreadable;
9009 txc->onreadable_sync = onreadable_sync;
9010 txc->oncommit = ondisk;
9011
9012 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
9013 (*p).set_osr(osr);
9014 txc->bytes += (*p).get_num_bytes();
9015 _txc_add_transaction(txc, &(*p));
9016 }
9017 _txc_calc_cost(txc);
9018
9019 _txc_write_nodes(txc, txc->t);
9020
9021 // journal deferred items
9022 if (txc->deferred_txn) {
9023 txc->deferred_txn->seq = ++deferred_seq;
9024 bufferlist bl;
9025 ::encode(*txc->deferred_txn, bl);
9026 string key;
9027 get_deferred_key(txc->deferred_txn->seq, &key);
9028 txc->t->set(PREFIX_DEFERRED, key, bl);
9029 }
9030
9031 _txc_finalize_kv(txc, txc->t);
9032 if (handle)
9033 handle->suspend_tp_timeout();
9034
9035 utime_t tstart = ceph_clock_now();
9036 throttle_bytes.get(txc->cost);
9037 if (txc->deferred_txn) {
9038 // ensure we do not block here because of deferred writes
9039 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
9040 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
9041 << dendl;
9042 ++deferred_aggressive;
7c673cae 9043 deferred_try_submit();
9044 {
9045 // wake up any previously finished deferred events
9046 std::lock_guard<std::mutex> l(kv_lock);
9047 kv_cond.notify_one();
9048 }
7c673cae 9049 throttle_deferred_bytes.get(txc->cost);
9050 --deferred_aggressive;
9051 }
9052 }
9053 utime_t tend = ceph_clock_now();
9054
9055 if (handle)
9056 handle->reset_tp_timeout();
9057
9058 logger->inc(l_bluestore_txc);
9059
9060 // execute (start)
9061 _txc_state_proc(txc);
9062
9063 logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
9064 logger->tinc(l_bluestore_throttle_lat, tend - tstart);
9065 return 0;
9066}
9067
9068void BlueStore::_txc_aio_submit(TransContext *txc)
9069{
9070 dout(10) << __func__ << " txc " << txc << dendl;
9071 bdev->aio_submit(&txc->ioc);
9072}
9073
9074void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
9075{
9076 Transaction::iterator i = t->begin();
9077
9078 _dump_transaction(t);
9079
9080 vector<CollectionRef> cvec(i.colls.size());
9081 unsigned j = 0;
9082 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
9083 ++p, ++j) {
9084 cvec[j] = _get_collection(*p);
9085 }
9086 vector<OnodeRef> ovec(i.objects.size());
9087
9088 for (int pos = 0; i.have_op(); ++pos) {
9089 Transaction::Op *op = i.decode_op();
9090 int r = 0;
9091
9092 // no coll or obj
9093 if (op->op == Transaction::OP_NOP)
9094 continue;
9095
9096 // collection operations
9097 CollectionRef &c = cvec[op->cid];
9098 switch (op->op) {
9099 case Transaction::OP_RMCOLL:
9100 {
9101 const coll_t &cid = i.get_cid(op->cid);
9102 r = _remove_collection(txc, cid, &c);
9103 if (!r)
9104 continue;
9105 }
9106 break;
9107
9108 case Transaction::OP_MKCOLL:
9109 {
9110 assert(!c);
9111 const coll_t &cid = i.get_cid(op->cid);
9112 r = _create_collection(txc, cid, op->split_bits, &c);
9113 if (!r)
9114 continue;
9115 }
9116 break;
9117
9118 case Transaction::OP_SPLIT_COLLECTION:
9119 assert(0 == "deprecated");
9120 break;
9121
9122 case Transaction::OP_SPLIT_COLLECTION2:
9123 {
9124 uint32_t bits = op->split_bits;
9125 uint32_t rem = op->split_rem;
9126 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
9127 if (!r)
9128 continue;
9129 }
9130 break;
9131
9132 case Transaction::OP_COLL_HINT:
9133 {
9134 uint32_t type = op->hint_type;
9135 bufferlist hint;
9136 i.decode_bl(hint);
9137 bufferlist::iterator hiter = hint.begin();
9138 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
9139 uint32_t pg_num;
9140 uint64_t num_objs;
9141 ::decode(pg_num, hiter);
9142 ::decode(num_objs, hiter);
9143 dout(10) << __func__ << " collection hint objects is a no-op, "
9144 << " pg_num " << pg_num << " num_objects " << num_objs
9145 << dendl;
9146 } else {
9147 // Ignore the hint
9148 dout(10) << __func__ << " unknown collection hint " << type << dendl;
9149 }
9150 continue;
9151 }
9152 break;
9153
9154 case Transaction::OP_COLL_SETATTR:
9155 r = -EOPNOTSUPP;
9156 break;
9157
9158 case Transaction::OP_COLL_RMATTR:
9159 r = -EOPNOTSUPP;
9160 break;
9161
9162 case Transaction::OP_COLL_RENAME:
9163 assert(0 == "not implemented");
9164 break;
9165 }
9166 if (r < 0) {
9167 derr << __func__ << " error " << cpp_strerror(r)
9168 << " not handled on operation " << op->op
9169 << " (op " << pos << ", counting from 0)" << dendl;
9170 _dump_transaction(t, 0);
9171 assert(0 == "unexpected error");
9172 }
9173
9174 // these operations implicitly create the object
9175 bool create = false;
9176 if (op->op == Transaction::OP_TOUCH ||
9177 op->op == Transaction::OP_WRITE ||
9178 op->op == Transaction::OP_ZERO) {
9179 create = true;
9180 }
9181
9182 // object operations
9183 RWLock::WLocker l(c->lock);
9184 OnodeRef &o = ovec[op->oid];
9185 if (!o) {
9186 ghobject_t oid = i.get_oid(op->oid);
9187 o = c->get_onode(oid, create);
9188 }
9189 if (!create && (!o || !o->exists)) {
9190 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
9191 << i.get_oid(op->oid) << dendl;
9192 r = -ENOENT;
9193 goto endop;
9194 }
9195
9196 switch (op->op) {
9197 case Transaction::OP_TOUCH:
9198 r = _touch(txc, c, o);
9199 break;
9200
9201 case Transaction::OP_WRITE:
9202 {
9203 uint64_t off = op->off;
9204 uint64_t len = op->len;
9205 uint32_t fadvise_flags = i.get_fadvise_flags();
9206 bufferlist bl;
9207 i.decode_bl(bl);
9208 r = _write(txc, c, o, off, len, bl, fadvise_flags);
9209 }
9210 break;
9211
9212 case Transaction::OP_ZERO:
9213 {
9214 uint64_t off = op->off;
9215 uint64_t len = op->len;
9216 r = _zero(txc, c, o, off, len);
9217 }
9218 break;
9219
9220 case Transaction::OP_TRIMCACHE:
9221 {
9222 // deprecated, no-op
9223 }
9224 break;
9225
9226 case Transaction::OP_TRUNCATE:
9227 {
9228 uint64_t off = op->off;
35e4c445 9229 r = _truncate(txc, c, o, off);
7c673cae
FG
9230 }
9231 break;
9232
9233 case Transaction::OP_REMOVE:
9234 {
9235 r = _remove(txc, c, o);
9236 }
9237 break;
9238
9239 case Transaction::OP_SETATTR:
9240 {
9241 string name = i.decode_string();
9242 bufferptr bp;
9243 i.decode_bp(bp);
9244 r = _setattr(txc, c, o, name, bp);
9245 }
9246 break;
9247
9248 case Transaction::OP_SETATTRS:
9249 {
9250 map<string, bufferptr> aset;
9251 i.decode_attrset(aset);
9252 r = _setattrs(txc, c, o, aset);
9253 }
9254 break;
9255
9256 case Transaction::OP_RMATTR:
9257 {
9258 string name = i.decode_string();
9259 r = _rmattr(txc, c, o, name);
9260 }
9261 break;
9262
9263 case Transaction::OP_RMATTRS:
9264 {
9265 r = _rmattrs(txc, c, o);
9266 }
9267 break;
9268
9269 case Transaction::OP_CLONE:
9270 {
9271 OnodeRef& no = ovec[op->dest_oid];
9272 if (!no) {
9273 const ghobject_t& noid = i.get_oid(op->dest_oid);
9274 no = c->get_onode(noid, true);
9275 }
9276 r = _clone(txc, c, o, no);
9277 }
9278 break;
9279
9280 case Transaction::OP_CLONERANGE:
9281 assert(0 == "deprecated");
9282 break;
9283
9284 case Transaction::OP_CLONERANGE2:
9285 {
9286 OnodeRef& no = ovec[op->dest_oid];
9287 if (!no) {
9288 const ghobject_t& noid = i.get_oid(op->dest_oid);
9289 no = c->get_onode(noid, true);
9290 }
9291 uint64_t srcoff = op->off;
9292 uint64_t len = op->len;
9293 uint64_t dstoff = op->dest_off;
9294 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
9295 }
9296 break;
9297
9298 case Transaction::OP_COLL_ADD:
9299 assert(0 == "not implemented");
9300 break;
9301
9302 case Transaction::OP_COLL_REMOVE:
9303 assert(0 == "not implemented");
9304 break;
9305
9306 case Transaction::OP_COLL_MOVE:
9307 assert(0 == "deprecated");
9308 break;
9309
9310 case Transaction::OP_COLL_MOVE_RENAME:
9311 case Transaction::OP_TRY_RENAME:
9312 {
9313 assert(op->cid == op->dest_cid);
9314 const ghobject_t& noid = i.get_oid(op->dest_oid);
9315 OnodeRef& no = ovec[op->dest_oid];
9316 if (!no) {
9317 no = c->get_onode(noid, false);
9318 }
9319 r = _rename(txc, c, o, no, noid);
9320 }
9321 break;
9322
9323 case Transaction::OP_OMAP_CLEAR:
9324 {
9325 r = _omap_clear(txc, c, o);
9326 }
9327 break;
9328 case Transaction::OP_OMAP_SETKEYS:
9329 {
9330 bufferlist aset_bl;
9331 i.decode_attrset_bl(&aset_bl);
9332 r = _omap_setkeys(txc, c, o, aset_bl);
9333 }
9334 break;
9335 case Transaction::OP_OMAP_RMKEYS:
9336 {
9337 bufferlist keys_bl;
9338 i.decode_keyset_bl(&keys_bl);
9339 r = _omap_rmkeys(txc, c, o, keys_bl);
9340 }
9341 break;
9342 case Transaction::OP_OMAP_RMKEYRANGE:
9343 {
9344 string first, last;
9345 first = i.decode_string();
9346 last = i.decode_string();
9347 r = _omap_rmkey_range(txc, c, o, first, last);
9348 }
9349 break;
9350 case Transaction::OP_OMAP_SETHEADER:
9351 {
9352 bufferlist bl;
9353 i.decode_bl(bl);
9354 r = _omap_setheader(txc, c, o, bl);
9355 }
9356 break;
9357
9358 case Transaction::OP_SETALLOCHINT:
9359 {
9360 r = _set_alloc_hint(txc, c, o,
9361 op->expected_object_size,
9362 op->expected_write_size,
9363 op->alloc_hint_flags);
9364 }
9365 break;
9366
9367 default:
9368      derr << __func__ << " bad op " << op->op << dendl;
9369 ceph_abort();
9370 }
9371
9372 endop:
9373 if (r < 0) {
9374 bool ok = false;
9375
9376 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
9377 op->op == Transaction::OP_CLONE ||
9378 op->op == Transaction::OP_CLONERANGE2 ||
9379 op->op == Transaction::OP_COLL_ADD ||
9380 op->op == Transaction::OP_SETATTR ||
9381 op->op == Transaction::OP_SETATTRS ||
9382 op->op == Transaction::OP_RMATTR ||
9383 op->op == Transaction::OP_OMAP_SETKEYS ||
9384 op->op == Transaction::OP_OMAP_RMKEYS ||
9385 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
9386 op->op == Transaction::OP_OMAP_SETHEADER))
9387 // -ENOENT is usually okay
9388 ok = true;
9389 if (r == -ENODATA)
9390 ok = true;
9391
9392 if (!ok) {
9393 const char *msg = "unexpected error code";
9394
9395 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
9396 op->op == Transaction::OP_CLONE ||
9397 op->op == Transaction::OP_CLONERANGE2))
9398 msg = "ENOENT on clone suggests osd bug";
9399
9400 if (r == -ENOSPC)
9401 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
9402 // by partially applying transactions.
9403 msg = "ENOSPC from bluestore, misconfigured cluster";
9404
9405 if (r == -ENOTEMPTY) {
9406 msg = "ENOTEMPTY suggests garbage data in osd data dir";
9407 }
9408
9409 derr << __func__ << " error " << cpp_strerror(r)
9410 << " not handled on operation " << op->op
9411 << " (op " << pos << ", counting from 0)"
9412 << dendl;
9413 derr << msg << dendl;
9414 _dump_transaction(t, 0);
9415 assert(0 == "unexpected error");
9416 }
9417 }
9418 }
9419}
9420
9421
9422
9423// -----------------
9424// write operations
9425
9426int BlueStore::_touch(TransContext *txc,
9427 CollectionRef& c,
9428 OnodeRef &o)
9429{
9430 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
9431 int r = 0;
7c673cae
FG
9432 _assign_nid(txc, o);
9433 txc->write_onode(o);
9434 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
9435 return r;
9436}
9437
94b18763 9438void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
7c673cae
FG
9439{
9440 if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
9441 return;
9442 dout(log_level) << __func__ << " " << o << " " << o->oid
9443 << " nid " << o->onode.nid
9444 << " size 0x" << std::hex << o->onode.size
9445 << " (" << std::dec << o->onode.size << ")"
9446 << " expected_object_size " << o->onode.expected_object_size
9447 << " expected_write_size " << o->onode.expected_write_size
9448 << " in " << o->onode.extent_map_shards.size() << " shards"
9449 << ", " << o->extent_map.spanning_blob_map.size()
9450 << " spanning blobs"
9451 << dendl;
9452 for (auto p = o->onode.attrs.begin();
9453 p != o->onode.attrs.end();
9454 ++p) {
9455 dout(log_level) << __func__ << " attr " << p->first
9456 << " len " << p->second.length() << dendl;
9457 }
9458 _dump_extent_map(o->extent_map, log_level);
9459}
9460
9461void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
9462{
9463 uint64_t pos = 0;
9464 for (auto& s : em.shards) {
9465 dout(log_level) << __func__ << " shard " << *s.shard_info
9466 << (s.loaded ? " (loaded)" : "")
9467 << (s.dirty ? " (dirty)" : "")
9468 << dendl;
9469 }
9470 for (auto& e : em.extent_map) {
9471 dout(log_level) << __func__ << " " << e << dendl;
9472 assert(e.logical_offset >= pos);
9473 pos = e.logical_offset + e.length;
9474 const bluestore_blob_t& blob = e.blob->get_blob();
9475 if (blob.has_csum()) {
9476 vector<uint64_t> v;
9477 unsigned n = blob.get_csum_count();
9478 for (unsigned i = 0; i < n; ++i)
9479 v.push_back(blob.get_csum_item(i));
9480 dout(log_level) << __func__ << " csum: " << std::hex << v << std::dec
9481 << dendl;
9482 }
9483 std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
9484 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
9485 dout(log_level) << __func__ << " 0x" << std::hex << i.first
9486 << "~" << i.second->length << std::dec
9487 << " " << *i.second << dendl;
9488 }
9489 }
9490}
9491
9492void BlueStore::_dump_transaction(Transaction *t, int log_level)
9493{
9494 dout(log_level) << " transaction dump:\n";
9495 JSONFormatter f(true);
9496 f.open_object_section("transaction");
9497 t->dump(&f);
9498 f.close_section();
9499 f.flush(*_dout);
9500 *_dout << dendl;
9501}
9502
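// Zero-pad *bl on the front and/or back out to chunk_size alignment,
// adjusting *offset so the padded buffer starts on a chunk boundary.
// Used by the small-write path before writing into a reused or new blob.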
9503void BlueStore::_pad_zeros(
9504 bufferlist *bl, uint64_t *offset,
9505 uint64_t chunk_size)
9506{
9507 auto length = bl->length();
9508 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
9509 << " chunk_size 0x" << chunk_size << std::dec << dendl;
9510 dout(40) << "before:\n";
9511 bl->hexdump(*_dout);
9512 *_dout << dendl;
9513 // front
9514 size_t front_pad = *offset % chunk_size;
9515 size_t back_pad = 0;
9516 size_t pad_count = 0;
9517 if (front_pad) {
9518 size_t front_copy = MIN(chunk_size - front_pad, length);
9519 bufferptr z = buffer::create_page_aligned(chunk_size);
224ce89b 9520 z.zero(0, front_pad, false);
7c673cae 9521 pad_count += front_pad;
224ce89b 9522 bl->copy(0, front_copy, z.c_str() + front_pad);
7c673cae
FG
9523 if (front_copy + front_pad < chunk_size) {
9524 back_pad = chunk_size - (length + front_pad);
224ce89b 9525 z.zero(front_pad + length, back_pad, false);
7c673cae
FG
9526 pad_count += back_pad;
9527 }
9528 bufferlist old, t;
9529 old.swap(*bl);
9530 t.substr_of(old, front_copy, length - front_copy);
9531 bl->append(z);
9532 bl->claim_append(t);
9533 *offset -= front_pad;
224ce89b 9534 length += pad_count;
7c673cae
FG
9535 }
9536
9537 // back
9538 uint64_t end = *offset + length;
9539 unsigned back_copy = end % chunk_size;
9540 if (back_copy) {
9541 assert(back_pad == 0);
9542 back_pad = chunk_size - back_copy;
9543 assert(back_copy <= length);
9544 bufferptr tail(chunk_size);
224ce89b
WB
9545 bl->copy(length - back_copy, back_copy, tail.c_str());
9546 tail.zero(back_copy, back_pad, false);
7c673cae
FG
9547 bufferlist old;
9548 old.swap(*bl);
9549 bl->substr_of(old, 0, length - back_copy);
9550 bl->append(tail);
9551 length += back_pad;
9552 pad_count += back_pad;
9553 }
9554 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
9555 << back_pad << " on front/back, now 0x" << *offset << "~"
9556 << length << std::dec << dendl;
9557 dout(40) << "after:\n";
9558 bl->hexdump(*_dout);
9559 *_dout << dendl;
9560 if (pad_count)
9561 logger->inc(l_bluestore_write_pad_bytes, pad_count);
9562 assert(bl->length() == length);
9563}
9564
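// Small-write path (length < min_alloc_size). In order, try to:
//  1. write directly into unused, allocated space of a nearby mutable
//     blob (deferred if small enough, otherwise direct aio),
//  2. do a chunk-aligned deferred overwrite, filling out the chunk's
//     head/tail from existing data, or
//  3. reuse a neighbouring blob via can_reuse_blob();
// otherwise fall through and write into a brand new blob.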
9565void BlueStore::_do_write_small(
9566 TransContext *txc,
9567 CollectionRef &c,
9568 OnodeRef o,
9569 uint64_t offset, uint64_t length,
9570 bufferlist::iterator& blp,
9571 WriteContext *wctx)
9572{
9573 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9574 << std::dec << dendl;
9575 assert(length < min_alloc_size);
9576 uint64_t end_offs = offset + length;
9577
9578 logger->inc(l_bluestore_write_small);
9579 logger->inc(l_bluestore_write_small_bytes, length);
9580
9581 bufferlist bl;
9582 blp.copy(length, bl);
9583
9584 // Look for an existing mutable blob we can use.
9585 auto begin = o->extent_map.extent_map.begin();
9586 auto end = o->extent_map.extent_map.end();
9587 auto ep = o->extent_map.seek_lextent(offset);
9588 if (ep != begin) {
9589 --ep;
9590 if (ep->blob_end() <= offset) {
9591 ++ep;
9592 }
9593 }
9594 auto prev_ep = ep;
9595 if (prev_ep != begin) {
9596 --prev_ep;
9597 } else {
9598 prev_ep = end; // to avoid this extent check as it's a duplicate
9599 }
9600
9601 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9602 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9603 uint32_t alloc_len = min_alloc_size;
9604 auto offset0 = P2ALIGN(offset, alloc_len);
9605
9606 bool any_change;
9607
9608      // search for a suitable extent in both forward and reverse directions in
9609 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 9610 // then check if blob can be reused via can_reuse_blob func or apply
7c673cae
FG
9611 // direct/deferred write (the latter for extents including or higher
9612 // than 'offset' only).
9613 do {
9614 any_change = false;
9615
9616 if (ep != end && ep->logical_offset < offset + max_bsize) {
9617 BlobRef b = ep->blob;
9618 auto bstart = ep->blob_start();
9619 dout(20) << __func__ << " considering " << *b
9620 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9621 if (bstart >= end_offs) {
9622 dout(20) << __func__ << " ignoring distant " << *b << dendl;
9623 } else if (!b->get_blob().is_mutable()) {
9624 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
9625 } else if (ep->logical_offset % min_alloc_size !=
9626 ep->blob_offset % min_alloc_size) {
9627 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
9628 } else {
9629 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9630 // can we pad our head/tail out with zeros?
9631 uint64_t head_pad, tail_pad;
9632 head_pad = P2PHASE(offset, chunk_size);
9633 tail_pad = P2NPHASE(end_offs, chunk_size);
9634 if (head_pad || tail_pad) {
9635 o->extent_map.fault_range(db, offset - head_pad,
9636 end_offs - offset + head_pad + tail_pad);
9637 }
9638 if (head_pad &&
9639 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
9640 head_pad = 0;
9641 }
9642 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
9643 tail_pad = 0;
9644 }
9645
9646 uint64_t b_off = offset - head_pad - bstart;
9647 uint64_t b_len = length + head_pad + tail_pad;
9648
9649 // direct write into unused blocks of an existing mutable blob?
9650 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
9651 b->get_blob().get_ondisk_length() >= b_off + b_len &&
9652 b->get_blob().is_unused(b_off, b_len) &&
9653 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 9654 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
9655
9656 dout(20) << __func__ << " write to unused 0x" << std::hex
9657 << b_off << "~" << b_len
9658 << " pad 0x" << head_pad << " + 0x" << tail_pad
9659 << std::dec << " of mutable " << *b << dendl;
224ce89b 9660 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
9661 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9662
9663 if (!g_conf->bluestore_debug_omit_block_device_write) {
9664 if (b_len <= prefer_deferred_size) {
9665 dout(20) << __func__ << " deferring small 0x" << std::hex
9666 << b_len << std::dec << " unused write via deferred" << dendl;
9667 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9668 op->op = bluestore_deferred_op_t::OP_WRITE;
9669 b->get_blob().map(
9670 b_off, b_len,
9671 [&](uint64_t offset, uint64_t length) {
9672 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9673 return 0;
9674 });
224ce89b 9675 op->data = bl;
7c673cae
FG
9676 } else {
9677 b->get_blob().map_bl(
224ce89b 9678 b_off, bl,
7c673cae
FG
9679 [&](uint64_t offset, bufferlist& t) {
9680 bdev->aio_write(offset, t,
9681 &txc->ioc, wctx->buffered);
9682 });
9683 }
9684 }
224ce89b 9685 b->dirty_blob().calc_csum(b_off, bl);
7c673cae
FG
9686 dout(20) << __func__ << " lex old " << *ep << dendl;
9687 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
9688 b,
9689 &wctx->old_extents);
9690 b->dirty_blob().mark_used(le->blob_offset, le->length);
9691 txc->statfs_delta.stored() += le->length;
9692 dout(20) << __func__ << " lex " << *le << dendl;
9693 logger->inc(l_bluestore_write_small_unused);
9694 return;
9695 }
9696 // read some data to fill out the chunk?
9697 uint64_t head_read = P2PHASE(b_off, chunk_size);
9698 uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
9699 if ((head_read || tail_read) &&
9700 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
9701 head_read + tail_read < min_alloc_size) {
9702 b_off -= head_read;
9703 b_len += head_read + tail_read;
9704
9705 } else {
9706 head_read = tail_read = 0;
9707 }
9708
9709 // chunk-aligned deferred overwrite?
9710 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
9711 b_off % chunk_size == 0 &&
9712 b_len % chunk_size == 0 &&
9713 b->get_blob().is_allocated(b_off, b_len)) {
9714
224ce89b 9715 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
9716
9717 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
9718 << " and tail 0x" << tail_read << std::dec << dendl;
9719 if (head_read) {
9720 bufferlist head_bl;
9721 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
9722 head_bl, 0);
9723 assert(r >= 0 && r <= (int)head_read);
9724 size_t zlen = head_read - r;
9725 if (zlen) {
9726 head_bl.append_zero(zlen);
9727 logger->inc(l_bluestore_write_pad_bytes, zlen);
9728 }
224ce89b 9729 bl.claim_prepend(head_bl);
7c673cae
FG
9730 logger->inc(l_bluestore_write_penalty_read_ops);
9731 }
9732 if (tail_read) {
9733 bufferlist tail_bl;
9734 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
9735 tail_bl, 0);
9736 assert(r >= 0 && r <= (int)tail_read);
9737 size_t zlen = tail_read - r;
9738 if (zlen) {
9739 tail_bl.append_zero(zlen);
9740 logger->inc(l_bluestore_write_pad_bytes, zlen);
9741 }
224ce89b 9742 bl.claim_append(tail_bl);
7c673cae
FG
9743 logger->inc(l_bluestore_write_penalty_read_ops);
9744 }
9745 logger->inc(l_bluestore_write_small_pre_read);
9746
9747 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9748 op->op = bluestore_deferred_op_t::OP_WRITE;
224ce89b 9749 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
9750 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9751
9752 int r = b->get_blob().map(
9753 b_off, b_len,
9754 [&](uint64_t offset, uint64_t length) {
9755 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9756 return 0;
9757 });
9758 assert(r == 0);
9759 if (b->get_blob().csum_type) {
224ce89b 9760 b->dirty_blob().calc_csum(b_off, bl);
7c673cae 9761 }
224ce89b 9762 op->data.claim(bl);
7c673cae
FG
9763 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
9764 << b_len << std::dec << " of mutable " << *b
9765 << " at " << op->extents << dendl;
9766 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
9767 b, &wctx->old_extents);
9768 b->dirty_blob().mark_used(le->blob_offset, le->length);
9769 txc->statfs_delta.stored() += le->length;
9770 dout(20) << __func__ << " lex " << *le << dendl;
9771 logger->inc(l_bluestore_write_small_deferred);
9772 return;
9773 }
224ce89b
WB
9774 // try to reuse blob if we can
9775 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
9776 max_bsize,
9777 offset0 - bstart,
9778 &alloc_len)) {
9779 assert(alloc_len == min_alloc_size); // expecting data always
9780 // fit into reused blob
9781 // Need to check for pending writes desiring to
9782        // reuse the same pextent. The rationale is that during GC two chunks
9783        // from garbage (possibly compressed) blobs can share logical space
9784        // within the same AU. That in turn might be caused by an unaligned
9785        // length in clone_range2. Hence the second write would fail when
9786        // attempting to reuse the blob in _do_alloc_write().
9787 if (!wctx->has_conflict(b,
9788 offset0,
9789 offset0 + alloc_len,
9790 min_alloc_size)) {
9791
9792 // we can't reuse pad_head/pad_tail since they might be truncated
9793          // due to existing extents
9794 uint64_t b_off = offset - bstart;
9795 uint64_t b_off0 = b_off;
9796 _pad_zeros(&bl, &b_off0, chunk_size);
9797
9798 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
9799 << " (0x" << b_off0 << "~" << bl.length() << ")"
9800 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
9801 << std::dec << dendl;
9802
9803 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9804 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9805 false, false);
9806 logger->inc(l_bluestore_write_small_unused);
9807 return;
9808 }
9809 }
9810 }
9811 ++ep;
9812 any_change = true;
9813 } // if (ep != end && ep->logical_offset < offset + max_bsize)
9814
9815 // check extent for reuse in reverse order
9816 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9817 BlobRef b = prev_ep->blob;
9818 auto bstart = prev_ep->blob_start();
9819 dout(20) << __func__ << " considering " << *b
9820 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 9821 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
9822 max_bsize,
9823 offset0 - bstart,
9824 &alloc_len)) {
9825 assert(alloc_len == min_alloc_size); // expecting data always
9826 // fit into reused blob
9827 // Need to check for pending writes desiring to
9828        // reuse the same pextent. The rationale is that during GC two chunks
9829        // from garbage (possibly compressed) blobs can share logical space
9830        // within the same AU. That in turn might be caused by an unaligned
9831        // length in clone_range2. Hence the second write would fail when
9832        // attempting to reuse the blob in _do_alloc_write().
9833 if (!wctx->has_conflict(b,
9834 offset0,
9835 offset0 + alloc_len,
9836 min_alloc_size)) {
9837
9838 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9839 uint64_t b_off = offset - bstart;
9840 uint64_t b_off0 = b_off;
9841 _pad_zeros(&bl, &b_off0, chunk_size);
9842
9843 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
9844 << " (0x" << b_off0 << "~" << bl.length() << ")"
9845 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
9846 << std::dec << dendl;
9847
9848 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9849 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9850 false, false);
9851 logger->inc(l_bluestore_write_small_unused);
9852 return;
9853 }
9854 }
9855 if (prev_ep != begin) {
9856 --prev_ep;
9857 any_change = true;
9858 } else {
9859 prev_ep = end; // to avoid useless first extent re-check
9860 }
9861 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
9862 } while (any_change);
9863
9864 // new blob.
9865
9866 BlobRef b = c->new_blob();
9867 uint64_t b_off = P2PHASE(offset, alloc_len);
9868 uint64_t b_off0 = b_off;
9869 _pad_zeros(&bl, &b_off0, block_size);
9870 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9871 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
9872 logger->inc(l_bluestore_write_small_new);
9873
9874 return;
9875}
9876
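// Big-write path: punch out the target range, then carve the write
// into pieces of up to target_blob_size, reusing neighbouring mutable
// blobs when not compressing, and queue each piece via wctx->write().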
9877void BlueStore::_do_write_big(
9878 TransContext *txc,
9879 CollectionRef &c,
9880 OnodeRef o,
9881 uint64_t offset, uint64_t length,
9882 bufferlist::iterator& blp,
9883 WriteContext *wctx)
9884{
9885 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9886 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
9887 << " compress " << (int)wctx->compress
9888 << dendl;
9889 logger->inc(l_bluestore_write_big);
9890 logger->inc(l_bluestore_write_big_bytes, length);
9891 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9892 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9893 while (length > 0) {
9894 bool new_blob = false;
9895 uint32_t l = MIN(max_bsize, length);
9896 BlobRef b;
9897 uint32_t b_off = 0;
9898
9899    // attempt to reuse an existing blob
9900 if (!wctx->compress) {
9901 // look for an existing mutable blob we can reuse
9902 auto begin = o->extent_map.extent_map.begin();
9903 auto end = o->extent_map.extent_map.end();
9904 auto ep = o->extent_map.seek_lextent(offset);
9905 auto prev_ep = ep;
9906 if (prev_ep != begin) {
9907 --prev_ep;
9908 } else {
9909 prev_ep = end; // to avoid this extent check as it's a duplicate
9910 }
9911 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9912      // search for a suitable extent in both forward and reverse directions in
9913 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 9914 // then check if blob can be reused via can_reuse_blob func.
7c673cae
FG
9915 bool any_change;
9916 do {
9917 any_change = false;
9918 if (ep != end && ep->logical_offset < offset + max_bsize) {
9919 if (offset >= ep->blob_start() &&
224ce89b 9920 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
9921 offset - ep->blob_start(),
9922 &l)) {
9923 b = ep->blob;
9924 b_off = offset - ep->blob_start();
9925 prev_ep = end; // to avoid check below
9926 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 9927 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
9928 } else {
9929 ++ep;
9930 any_change = true;
9931 }
9932 }
9933
9934 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
224ce89b 9935 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
9936 offset - prev_ep->blob_start(),
9937 &l)) {
9938 b = prev_ep->blob;
9939 b_off = offset - prev_ep->blob_start();
9940 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 9941 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
9942 } else if (prev_ep != begin) {
9943 --prev_ep;
9944 any_change = true;
9945 } else {
9946 prev_ep = end; // to avoid useless first extent re-check
9947 }
9948 }
9949 } while (b == nullptr && any_change);
9950 }
9951 if (b == nullptr) {
9952 b = c->new_blob();
9953 b_off = 0;
9954 new_blob = true;
9955 }
9956
9957 bufferlist t;
9958 blp.copy(l, t);
9959 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
9960 offset += l;
9961 length -= l;
9962 logger->inc(l_bluestore_write_big_blobs);
9963 }
9964}
9965
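// For each queued write: compress it if requested and worthwhile,
// reserve and allocate space, initialize checksums on new blobs,
// update the extent map and statfs deltas, and queue the io either as
// a deferred write (small) or a direct aio_write (large).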
9966int BlueStore::_do_alloc_write(
9967 TransContext *txc,
9968 CollectionRef coll,
9969 OnodeRef o,
9970 WriteContext *wctx)
9971{
9972 dout(20) << __func__ << " txc " << txc
9973 << " " << wctx->writes.size() << " blobs"
9974 << dendl;
3efd9988
FG
9975 if (wctx->writes.empty()) {
9976 return 0;
7c673cae
FG
9977 }
9978
7c673cae
FG
9979 CompressorRef c;
9980 double crr = 0;
9981 if (wctx->compress) {
9982 c = select_option(
9983 "compression_algorithm",
9984 compressor,
9985 [&]() {
9986 string val;
9987 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
9988 CompressorRef cp = compressor;
9989 if (!cp || cp->get_type_name() != val) {
9990 cp = Compressor::create(cct, val);
9991 }
9992 return boost::optional<CompressorRef>(cp);
9993 }
9994 return boost::optional<CompressorRef>();
9995 }
9996 );
9997
9998 crr = select_option(
9999 "compression_required_ratio",
10000 cct->_conf->bluestore_compression_required_ratio,
10001 [&]() {
10002 double val;
3efd9988 10003 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
7c673cae
FG
10004 return boost::optional<double>(val);
10005 }
10006 return boost::optional<double>();
10007 }
10008 );
10009 }
10010
10011 // checksum
10012 int csum = csum_type.load();
10013 csum = select_option(
10014 "csum_type",
10015 csum,
10016 [&]() {
10017 int val;
3efd9988 10018 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
7c673cae
FG
10019 return boost::optional<int>(val);
10020 }
10021 return boost::optional<int>();
10022 }
10023 );
10024
3efd9988
FG
10025 // compress (as needed) and calc needed space
10026 uint64_t need = 0;
10027 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
7c673cae 10028 for (auto& wi : wctx->writes) {
3efd9988 10029 if (c && wi.blob_length > min_alloc_size) {
7c673cae
FG
10030 utime_t start = ceph_clock_now();
10031
10032 // compress
3efd9988
FG
10033 assert(wi.b_off == 0);
10034 assert(wi.blob_length == wi.bl.length());
10035
7c673cae
FG
10036 // FIXME: memory alignment here is bad
10037 bufferlist t;
3efd9988 10038 int r = c->compress(wi.bl, t);
7c673cae
FG
10039 assert(r == 0);
10040
3efd9988
FG
10041 bluestore_compression_header_t chdr;
10042 chdr.type = c->get_type();
7c673cae 10043 chdr.length = t.length();
3efd9988
FG
10044 ::encode(chdr, wi.compressed_bl);
10045 wi.compressed_bl.claim_append(t);
10046
10047 wi.compressed_len = wi.compressed_bl.length();
10048 uint64_t newlen = P2ROUNDUP(wi.compressed_len, min_alloc_size);
10049 uint64_t want_len_raw = wi.blob_length * crr;
7c673cae 10050 uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
3efd9988
FG
10051 if (newlen <= want_len && newlen < wi.blob_length) {
10052 // Cool. We compressed at least as much as we were hoping to.
10053 // pad out to min_alloc_size
10054 wi.compressed_bl.append_zero(newlen - wi.compressed_len);
10055 logger->inc(l_bluestore_write_pad_bytes, newlen - wi.compressed_len);
7c673cae 10056 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
3efd9988 10057 << " -> 0x" << wi.compressed_len << " => 0x" << newlen
7c673cae
FG
10058 << " with " << c->get_type()
10059 << std::dec << dendl;
3efd9988
FG
10060 txc->statfs_delta.compressed() += wi.compressed_len;
10061 txc->statfs_delta.compressed_original() += wi.blob_length;
7c673cae 10062 txc->statfs_delta.compressed_allocated() += newlen;
3efd9988
FG
10063 logger->inc(l_bluestore_compress_success_count);
10064 wi.compressed = true;
10065 need += newlen;
7c673cae 10066 } else {
3efd9988
FG
10067 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
10068 << " compressed to 0x" << wi.compressed_len << " -> 0x" << newlen
10069 << " with " << c->get_type()
10070 << ", which is more than required 0x" << want_len_raw
7c673cae 10071 << " -> 0x" << want_len
3efd9988
FG
10072 << ", leaving uncompressed"
10073 << std::dec << dendl;
10074 logger->inc(l_bluestore_compress_rejected_count);
10075 need += wi.blob_length;
7c673cae
FG
10076 }
10077 logger->tinc(l_bluestore_compress_lat,
10078 ceph_clock_now() - start);
3efd9988
FG
10079 } else {
10080 need += wi.blob_length;
7c673cae 10081 }
3efd9988
FG
10082 }
10083 int r = alloc->reserve(need);
10084 if (r < 0) {
10085 derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
10086 << dendl;
10087 return r;
10088 }
10089 AllocExtentVector prealloc;
10090  prealloc.reserve(2 * wctx->writes.size());
10091 int prealloc_left = 0;
10092 prealloc_left = alloc->allocate(
10093 need, min_alloc_size, need,
10094 0, &prealloc);
10095 assert(prealloc_left == (int64_t)need);
10096 dout(20) << __func__ << " prealloc " << prealloc << dendl;
10097 auto prealloc_pos = prealloc.begin();
10098
10099 for (auto& wi : wctx->writes) {
10100 BlobRef b = wi.b;
10101 bluestore_blob_t& dblob = b->dirty_blob();
10102 uint64_t b_off = wi.b_off;
10103 bufferlist *l = &wi.bl;
10104 uint64_t final_length = wi.blob_length;
10105 uint64_t csum_length = wi.blob_length;
10106 unsigned csum_order = block_size_order;
10107 if (wi.compressed) {
10108 final_length = wi.compressed_bl.length();
10109 csum_length = final_length;
10110 csum_order = ctz(csum_length);
10111 l = &wi.compressed_bl;
10112 dblob.set_compressed(wi.blob_length, wi.compressed_len);
10113 } else if (wi.new_blob) {
7c673cae 10114 // initialize newly created blob only
31f18b77 10115 assert(dblob.is_mutable());
7c673cae
FG
10116 if (l->length() != wi.blob_length) {
10117 // hrm, maybe we could do better here, but let's not bother.
10118 dout(20) << __func__ << " forcing csum_order to block_size_order "
10119 << block_size_order << dendl;
31f18b77 10120 csum_order = block_size_order;
7c673cae
FG
10121 } else {
10122 csum_order = std::min(wctx->csum_order, ctz(l->length()));
10123 }
10124 // try to align blob with max_blob_size to improve
10125 // its reuse ratio, e.g. in case of reverse write
10126 uint32_t suggested_boff =
10127 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
10128 if ((suggested_boff % (1 << csum_order)) == 0 &&
10129 suggested_boff + final_length <= max_bsize &&
10130 suggested_boff > b_off) {
181888fb 10131 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae
FG
10132 << std::hex << suggested_boff << std::dec << dendl;
10133 assert(suggested_boff >= b_off);
10134 csum_length += suggested_boff - b_off;
10135 b_off = suggested_boff;
10136 }
181888fb
FG
10137 if (csum != Checksummer::CSUM_NONE) {
10138 dout(20) << __func__ << " initialize csum setting for new blob " << *b
10139 << " csum_type " << Checksummer::get_csum_type_string(csum)
10140 << " csum_order " << csum_order
10141 << " csum_length 0x" << std::hex << csum_length << std::dec
10142 << dendl;
10143 dblob.init_csum(csum, csum_order, csum_length);
10144 }
7c673cae
FG
10145 }
10146
10147 AllocExtentVector extents;
3efd9988
FG
10148 int64_t left = final_length;
10149 while (left > 0) {
10150 assert(prealloc_left > 0);
10151 if (prealloc_pos->length <= left) {
10152 prealloc_left -= prealloc_pos->length;
10153 left -= prealloc_pos->length;
10154 txc->statfs_delta.allocated() += prealloc_pos->length;
10155 extents.push_back(*prealloc_pos);
10156 ++prealloc_pos;
10157 } else {
10158 extents.emplace_back(prealloc_pos->offset, left);
10159 prealloc_pos->offset += left;
10160 prealloc_pos->length -= left;
10161 prealloc_left -= left;
10162 txc->statfs_delta.allocated() += left;
10163 left = 0;
10164 break;
10165 }
10166 }
7c673cae 10167 for (auto& p : extents) {
3efd9988 10168 txc->allocated.insert(p.offset, p.length);
7c673cae
FG
10169 }
10170 dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
10171
181888fb
FG
10172 dout(20) << __func__ << " blob " << *b << dendl;
10173 if (dblob.has_csum()) {
7c673cae
FG
10174 dblob.calc_csum(b_off, *l);
10175 }
181888fb 10176
7c673cae
FG
10177 if (wi.mark_unused) {
10178 auto b_end = b_off + wi.bl.length();
10179 if (b_off) {
10180 dblob.add_unused(0, b_off);
10181 }
10182 if (b_end < wi.blob_length) {
10183 dblob.add_unused(b_end, wi.blob_length - b_end);
10184 }
10185 }
10186
10187 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
10188 b_off + (wi.b_off0 - wi.b_off),
10189 wi.length0,
10190 wi.b,
10191 nullptr);
10192 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
10193 txc->statfs_delta.stored() += le->length;
10194 dout(20) << __func__ << " lex " << *le << dendl;
10195 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
10196 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
10197
10198 // queue io
10199 if (!g_conf->bluestore_debug_omit_block_device_write) {
10200 if (l->length() <= prefer_deferred_size.load()) {
10201 dout(20) << __func__ << " deferring small 0x" << std::hex
10202 << l->length() << std::dec << " write via deferred" << dendl;
10203 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
10204 op->op = bluestore_deferred_op_t::OP_WRITE;
10205 int r = b->get_blob().map(
10206 b_off, l->length(),
10207 [&](uint64_t offset, uint64_t length) {
10208 op->extents.emplace_back(bluestore_pextent_t(offset, length));
10209 return 0;
10210 });
10211 assert(r == 0);
10212 op->data = *l;
10213 } else {
10214 b->get_blob().map_bl(
10215 b_off, *l,
10216 [&](uint64_t offset, bufferlist& t) {
10217 bdev->aio_write(offset, t, &txc->ioc, false);
10218 });
10219 }
10220 }
10221 }
3efd9988
FG
10222 assert(prealloc_pos == prealloc.end());
10223 assert(prealloc_left == 0);
7c673cae
FG
10224 return 0;
10225}
10226
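// Release the logical extents displaced by this write: update statfs
// counters, drop shared-blob references (collecting blobs that may now
// be unshareable), discard buffers for no-longer-allocated regions, and
// hand freed space back via txc->released.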
10227void BlueStore::_wctx_finish(
10228 TransContext *txc,
10229 CollectionRef& c,
10230 OnodeRef o,
31f18b77
FG
10231 WriteContext *wctx,
10232 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
10233{
10234 auto oep = wctx->old_extents.begin();
10235 while (oep != wctx->old_extents.end()) {
10236 auto &lo = *oep;
10237 oep = wctx->old_extents.erase(oep);
10238 dout(20) << __func__ << " lex_old " << lo.e << dendl;
10239 BlobRef b = lo.e.blob;
10240 const bluestore_blob_t& blob = b->get_blob();
10241 if (blob.is_compressed()) {
10242 if (lo.blob_empty) {
10243 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
10244 }
10245 txc->statfs_delta.compressed_original() -= lo.e.length;
10246 }
10247 auto& r = lo.r;
10248 txc->statfs_delta.stored() -= lo.e.length;
10249 if (!r.empty()) {
10250 dout(20) << __func__ << " blob release " << r << dendl;
10251 if (blob.is_shared()) {
10252 PExtentVector final;
10253 c->load_shared_blob(b->shared_blob);
10254 for (auto e : r) {
31f18b77
FG
10255 b->shared_blob->put_ref(
10256 e.offset, e.length, &final,
10257 b->is_referenced() ? nullptr : maybe_unshared_blobs);
7c673cae
FG
10258 }
10259 dout(20) << __func__ << " shared_blob release " << final
10260 << " from " << *b->shared_blob << dendl;
10261 txc->write_shared_blob(b->shared_blob);
10262 r.clear();
10263 r.swap(final);
10264 }
10265 }
10266 // we can't invalidate our logical extents as we drop them because
10267 // other lextents (either in our onode or others) may still
10268 // reference them. but we can throw out anything that is no
10269 // longer allocated. Note that this will leave behind edge bits
10270 // that are no longer referenced but not deallocated (until they
10271 // age out of the cache naturally).
10272 b->discard_unallocated(c.get());
10273 for (auto e : r) {
10274 dout(20) << __func__ << " release " << e << dendl;
10275 txc->released.insert(e.offset, e.length);
10276 txc->statfs_delta.allocated() -= e.length;
10277 if (blob.is_compressed()) {
10278 txc->statfs_delta.compressed_allocated() -= e.length;
10279 }
10280 }
10281 delete &lo;
10282 if (b->is_spanning() && !b->is_referenced()) {
10283 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
10284 << dendl;
10285 o->extent_map.spanning_blob_map.erase(b->id);
10286 }
10287 }
10288}
10289
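// Split a write at min_alloc_size boundaries: the unaligned head and
// tail go through _do_write_small(), the aligned middle through
// _do_write_big().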
10290void BlueStore::_do_write_data(
10291 TransContext *txc,
10292 CollectionRef& c,
10293 OnodeRef o,
10294 uint64_t offset,
10295 uint64_t length,
10296 bufferlist& bl,
10297 WriteContext *wctx)
10298{
10299 uint64_t end = offset + length;
10300 bufferlist::iterator p = bl.begin();
10301
10302 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
10303 (length != min_alloc_size)) {
10304 // we fall within the same block
10305 _do_write_small(txc, c, o, offset, length, p, wctx);
10306 } else {
10307 uint64_t head_offset, head_length;
10308 uint64_t middle_offset, middle_length;
10309 uint64_t tail_offset, tail_length;
10310
10311 head_offset = offset;
10312 head_length = P2NPHASE(offset, min_alloc_size);
10313
10314 tail_offset = P2ALIGN(end, min_alloc_size);
10315 tail_length = P2PHASE(end, min_alloc_size);
10316
10317 middle_offset = head_offset + head_length;
10318 middle_length = length - head_length - tail_length;
10319
10320 if (head_length) {
10321 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
10322 }
10323
10324 if (middle_length) {
10325 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
10326 }
10327
10328 if (tail_length) {
10329 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
10330 }
10331 }
10332}
10333
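// Derive per-write options (buffered vs direct io, compression mode,
// checksum order, target blob size) from the fadvise flags, pool
// options and the object's allocation hints.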
31f18b77
FG
10334void BlueStore::_choose_write_options(
10335 CollectionRef& c,
10336 OnodeRef o,
10337 uint32_t fadvise_flags,
10338 WriteContext *wctx)
7c673cae 10339{
7c673cae
FG
10340 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10341 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 10342 wctx->buffered = true;
7c673cae
FG
10343 } else if (cct->_conf->bluestore_default_buffered_write &&
10344 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10345 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10346 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 10347 wctx->buffered = true;
7c673cae
FG
10348 }
10349
31f18b77
FG
10350 // apply basic csum block size
10351 wctx->csum_order = block_size_order;
7c673cae
FG
10352
10353 // compression parameters
10354 unsigned alloc_hints = o->onode.alloc_hint_flags;
10355 auto cm = select_option(
10356 "compression_mode",
31f18b77 10357 comp_mode.load(),
7c673cae
FG
10358 [&]() {
10359 string val;
10360 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
10361 return boost::optional<Compressor::CompressionMode>(
10362 Compressor::get_comp_mode_type(val));
7c673cae
FG
10363 }
10364 return boost::optional<Compressor::CompressionMode>();
10365 }
10366 );
31f18b77
FG
10367
10368 wctx->compress = (cm != Compressor::COMP_NONE) &&
7c673cae
FG
10369 ((cm == Compressor::COMP_FORCE) ||
10370 (cm == Compressor::COMP_AGGRESSIVE &&
10371 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
10372 (cm == Compressor::COMP_PASSIVE &&
10373 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
10374
10375 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
10376 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
31f18b77
FG
10377 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
10378 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 10379 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 10380
7c673cae 10381 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 10382
7c673cae 10383 if (o->onode.expected_write_size) {
224ce89b 10384 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 10385 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 10386 } else {
224ce89b 10387 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
10388 }
10389
31f18b77
FG
10390 if (wctx->compress) {
10391 wctx->target_blob_size = select_option(
7c673cae 10392 "compression_max_blob_size",
31f18b77 10393 comp_max_blob_size.load(),
7c673cae
FG
10394 [&]() {
10395 int val;
10396 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
10397 return boost::optional<uint64_t>((uint64_t)val);
10398 }
10399 return boost::optional<uint64_t>();
10400 }
10401 );
10402 }
10403 } else {
31f18b77
FG
10404 if (wctx->compress) {
10405 wctx->target_blob_size = select_option(
7c673cae 10406 "compression_min_blob_size",
31f18b77 10407 comp_min_blob_size.load(),
7c673cae
FG
10408 [&]() {
10409 int val;
10410 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
10411 return boost::optional<uint64_t>((uint64_t)val);
10412 }
10413 return boost::optional<uint64_t>();
10414 }
10415 );
10416 }
10417 }
31f18b77 10418
7c673cae 10419 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
10420 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
10421 wctx->target_blob_size = max_bsize;
7c673cae 10422 }
31f18b77 10423
7c673cae
FG
10424 // set the min blob size floor at 2x the min_alloc_size, or else we
10425 // won't be able to allocate a smaller extent for the compressed
10426 // data.
31f18b77
FG
10427 if (wctx->compress &&
10428 wctx->target_blob_size < min_alloc_size * 2) {
10429 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 10430 }
31f18b77
FG
10431
10432 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
10433 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
10434 << std::dec << dendl;
10435}
10436
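// Rewrite the extents chosen by the GarbageCollector through a cloned
// WriteContext so partially dead blobs are copied out and can be
// freed; widens [*dirty_start, *dirty_end) to cover the rewritten range.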
10437int BlueStore::_do_gc(
10438 TransContext *txc,
10439 CollectionRef& c,
10440 OnodeRef o,
10441 const GarbageCollector& gc,
10442 const WriteContext& wctx,
10443 uint64_t *dirty_start,
10444 uint64_t *dirty_end)
10445{
10446 auto& extents_to_collect = gc.get_extents_to_collect();
10447
10448 WriteContext wctx_gc;
7c673cae 10449 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 10450
31f18b77
FG
10451 for (auto it = extents_to_collect.begin();
10452 it != extents_to_collect.end();
10453 ++it) {
10454 bufferlist bl;
10455 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
10456 assert(r == (int)it->length);
10457
10458 o->extent_map.fault_range(db, it->offset, it->length);
10459 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
10460 logger->inc(l_bluestore_gc_merged, it->length);
10461
10462 if (*dirty_start > it->offset) {
10463 *dirty_start = it->offset;
10464 }
10465
10466 if (*dirty_end < it->offset + it->length) {
10467 *dirty_end = it->offset + it->length;
10468 }
10469 }
10470
10471 dout(30) << __func__ << " alloc write" << dendl;
10472 int r = _do_alloc_write(txc, c, o, &wctx_gc);
10473 if (r < 0) {
10474 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10475 << dendl;
10476 return r;
10477 }
10478
10479 _wctx_finish(txc, c, o, &wctx_gc);
10480 return 0;
10481}
10482
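// Top-level write: choose write options, stage the data via
// _do_write_data()/_do_alloc_write(), estimate garbage-collection
// benefit before _wctx_finish() empties old_extents, run _do_gc() if
// worthwhile, then compress and dirty the affected extent-map range.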
10483int BlueStore::_do_write(
10484 TransContext *txc,
10485 CollectionRef& c,
10486 OnodeRef o,
10487 uint64_t offset,
10488 uint64_t length,
10489 bufferlist& bl,
10490 uint32_t fadvise_flags)
10491{
10492 int r = 0;
10493
10494 dout(20) << __func__
10495 << " " << o->oid
10496 << " 0x" << std::hex << offset << "~" << length
10497 << " - have 0x" << o->onode.size
10498 << " (" << std::dec << o->onode.size << ")"
10499 << " bytes"
10500 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
10501 << dendl;
10502 _dump_onode(o);
10503
10504 if (length == 0) {
10505 return 0;
10506 }
10507
10508 uint64_t end = offset + length;
10509
10510 GarbageCollector gc(c->store->cct);
10511 int64_t benefit;
10512 auto dirty_start = offset;
10513 auto dirty_end = end;
10514
10515 WriteContext wctx;
10516 _choose_write_options(c, o, fadvise_flags, &wctx);
7c673cae
FG
10517 o->extent_map.fault_range(db, offset, length);
10518 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
10519 r = _do_alloc_write(txc, c, o, &wctx);
10520 if (r < 0) {
10521 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10522 << dendl;
10523 goto out;
10524 }
10525
31f18b77
FG
10526 // NB: _wctx_finish() will empty old_extents
10527 // so we must do gc estimation before that
7c673cae 10528 benefit = gc.estimate(offset,
31f18b77
FG
10529 length,
10530 o->extent_map,
10531 wctx.old_extents,
10532 min_alloc_size);
7c673cae
FG
10533
10534 _wctx_finish(txc, c, o, &wctx);
10535 if (end > o->onode.size) {
10536 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 10537 << std::dec << dendl;
7c673cae
FG
10538 o->onode.size = end;
10539 }
10540
10541 if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
31f18b77
FG
10542 if (!gc.get_extents_to_collect().empty()) {
10543 dout(20) << __func__ << " perform garbage collection, "
10544 << "expected benefit = " << benefit << " AUs" << dendl;
10545 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
10546 if (r < 0) {
10547 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
10548 << dendl;
10549 goto out;
7c673cae
FG
10550 }
10551 }
10552 }
7c673cae
FG
10553
10554 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
10555 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
10556
7c673cae
FG
10557 r = 0;
10558
10559 out:
10560 return r;
10561}
10562
10563int BlueStore::_write(TransContext *txc,
10564 CollectionRef& c,
10565 OnodeRef& o,
31f18b77
FG
10566 uint64_t offset, size_t length,
10567 bufferlist& bl,
10568 uint32_t fadvise_flags)
7c673cae
FG
10569{
10570 dout(15) << __func__ << " " << c->cid << " " << o->oid
10571 << " 0x" << std::hex << offset << "~" << length << std::dec
10572 << dendl;
35e4c445
FG
10573 int r = 0;
10574 if (offset + length >= OBJECT_MAX_SIZE) {
10575 r = -E2BIG;
10576 } else {
10577 _assign_nid(txc, o);
10578 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
10579 txc->write_onode(o);
10580 }
7c673cae
FG
10581 dout(10) << __func__ << " " << c->cid << " " << o->oid
10582 << " 0x" << std::hex << offset << "~" << length << std::dec
10583 << " = " << r << dendl;
10584 return r;
10585}
10586
10587int BlueStore::_zero(TransContext *txc,
10588 CollectionRef& c,
10589 OnodeRef& o,
10590 uint64_t offset, size_t length)
10591{
10592 dout(15) << __func__ << " " << c->cid << " " << o->oid
10593 << " 0x" << std::hex << offset << "~" << length << std::dec
10594 << dendl;
35e4c445
FG
10595 int r = 0;
10596 if (offset + length >= OBJECT_MAX_SIZE) {
10597 r = -E2BIG;
10598 } else {
10599 _assign_nid(txc, o);
10600 r = _do_zero(txc, c, o, offset, length);
10601 }
7c673cae
FG
10602 dout(10) << __func__ << " " << c->cid << " " << o->oid
10603 << " 0x" << std::hex << offset << "~" << length << std::dec
10604 << " = " << r << dendl;
10605 return r;
10606}
10607
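// Zero a range by punching a hole in the extent map; no data blocks
// are written, and the object size is extended if the range ends past
// the current EOF.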
10608int BlueStore::_do_zero(TransContext *txc,
10609 CollectionRef& c,
10610 OnodeRef& o,
10611 uint64_t offset, size_t length)
10612{
10613 dout(15) << __func__ << " " << c->cid << " " << o->oid
10614 << " 0x" << std::hex << offset << "~" << length << std::dec
10615 << dendl;
10616 int r = 0;
10617
10618 _dump_onode(o);
10619
10620 WriteContext wctx;
10621 o->extent_map.fault_range(db, offset, length);
10622 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 10623 o->extent_map.dirty_range(offset, length);
7c673cae
FG
10624 _wctx_finish(txc, c, o, &wctx);
10625
b32b8144 10626 if (length > 0 && offset + length > o->onode.size) {
7c673cae
FG
10627 o->onode.size = offset + length;
10628 dout(20) << __func__ << " extending size to " << offset + length
10629 << dendl;
10630 }
10631 txc->write_onode(o);
10632
10633 dout(10) << __func__ << " " << c->cid << " " << o->oid
10634 << " 0x" << std::hex << offset << "~" << length << std::dec
10635 << " = " << r << dendl;
10636 return r;
10637}
10638
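// Truncate to 'offset': punch out everything past the new size,
// request an extent-map reshard if shards now lie past EOF, and record
// the new size on the onode.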
10639void BlueStore::_do_truncate(
31f18b77
FG
10640 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
10641 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
10642{
10643 dout(15) << __func__ << " " << c->cid << " " << o->oid
10644 << " 0x" << std::hex << offset << std::dec << dendl;
10645
10646 _dump_onode(o, 30);
10647
10648 if (offset == o->onode.size)
31f18b77 10649 return;
7c673cae
FG
10650
10651 if (offset < o->onode.size) {
10652 WriteContext wctx;
10653 uint64_t length = o->onode.size - offset;
10654 o->extent_map.fault_range(db, offset, length);
10655 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77
FG
10656 o->extent_map.dirty_range(offset, length);
10657 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
10658
10659 // if we have shards past EOF, ask for a reshard
10660 if (!o->onode.extent_map_shards.empty() &&
10661 o->onode.extent_map_shards.back().offset >= offset) {
10662 dout(10) << __func__ << " request reshard past EOF" << dendl;
10663 if (offset) {
10664 o->extent_map.request_reshard(offset - 1, offset + length);
10665 } else {
10666 o->extent_map.request_reshard(0, length);
10667 }
10668 }
10669 }
10670
10671 o->onode.size = offset;
10672
10673 txc->write_onode(o);
10674}
10675
35e4c445 10676int BlueStore::_truncate(TransContext *txc,
7c673cae
FG
10677 CollectionRef& c,
10678 OnodeRef& o,
10679 uint64_t offset)
10680{
10681 dout(15) << __func__ << " " << c->cid << " " << o->oid
10682 << " 0x" << std::hex << offset << std::dec
10683 << dendl;
35e4c445
FG
10684 int r = 0;
10685 if (offset >= OBJECT_MAX_SIZE) {
10686 r = -E2BIG;
10687 } else {
10688 _do_truncate(txc, c, o, offset);
10689 }
10690 dout(10) << __func__ << " " << c->cid << " " << o->oid
10691 << " 0x" << std::hex << offset << std::dec
10692 << " = " << r << dendl;
10693 return r;
7c673cae
FG
10694}
10695
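// Remove an object: truncate to zero (tracking blobs that may become
// unshared when a clone/gen object goes away), clear omap, delete the
// onode key and its extent shards, then try to unshare any shared
// blobs now wholly referenced by the head object.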
10696int BlueStore::_do_remove(
10697 TransContext *txc,
10698 CollectionRef& c,
10699 OnodeRef o)
10700{
31f18b77 10701 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
10702 bool is_gen = !o->oid.is_no_gen();
10703 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
10704 if (o->onode.has_omap()) {
10705 o->flush();
10706 _do_omap_clear(txc, o->onode.nid);
10707 }
10708 o->exists = false;
10709 string key;
10710 for (auto &s : o->extent_map.shards) {
10711 dout(20) << __func__ << " removing shard 0x" << std::hex
10712 << s.shard_info->offset << std::dec << dendl;
10713 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
10714 [&](const string& final_key) {
10715 txc->t->rmkey(PREFIX_OBJ, final_key);
10716 }
10717 );
10718 }
10719 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
10720 txc->removed(o);
10721 o->extent_map.clear();
10722 o->onode = bluestore_onode_t();
10723 _debug_obj_on_delete(o->oid);
31f18b77 10724
224ce89b
WB
10725 if (!is_gen || maybe_unshared_blobs.empty()) {
10726 return 0;
10727 }
31f18b77 10728
224ce89b
WB
10729 // see if we can unshare blobs still referenced by the head
10730 dout(10) << __func__ << " gen and maybe_unshared_blobs "
10731 << maybe_unshared_blobs << dendl;
10732 ghobject_t nogen = o->oid;
10733 nogen.generation = ghobject_t::NO_GEN;
10734 OnodeRef h = c->onode_map.lookup(nogen);
10735
10736 if (!h || !h->exists) {
10737 return 0;
10738 }
10739
10740 dout(20) << __func__ << " checking for unshareable blobs on " << h
10741 << " " << h->oid << dendl;
10742 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
10743 for (auto& e : h->extent_map.extent_map) {
10744 const bluestore_blob_t& b = e.blob->get_blob();
10745 SharedBlob *sb = e.blob->shared_blob.get();
10746 if (b.is_shared() &&
10747 sb->loaded &&
10748 maybe_unshared_blobs.count(sb)) {
3efd9988
FG
10749 if (b.is_compressed()) {
10750 expect[sb].get(0, b.get_ondisk_length());
10751 } else {
10752 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
10753 expect[sb].get(off, len);
10754 return 0;
10755 });
10756 }
224ce89b
WB
10757 }
10758 }
31f18b77 10759
224ce89b
WB
10760 vector<SharedBlob*> unshared_blobs;
10761 unshared_blobs.reserve(maybe_unshared_blobs.size());
10762 for (auto& p : expect) {
10763 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
10764 if (p.first->persistent->ref_map == p.second) {
10765 SharedBlob *sb = p.first;
10766 dout(20) << __func__ << " unsharing " << *sb << dendl;
10767 unshared_blobs.push_back(sb);
10768 txc->unshare_blob(sb);
10769 uint64_t sbid = c->make_blob_unshared(sb);
10770 string key;
10771 get_shared_blob_key(sbid, &key);
10772 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
10773 }
10774 }
10775
10776 if (unshared_blobs.empty()) {
10777 return 0;
10778 }
10779
224ce89b
WB
10780 for (auto& e : h->extent_map.extent_map) {
10781 const bluestore_blob_t& b = e.blob->get_blob();
10782 SharedBlob *sb = e.blob->shared_blob.get();
10783 if (b.is_shared() &&
10784 std::find(unshared_blobs.begin(), unshared_blobs.end(),
10785 sb) != unshared_blobs.end()) {
10786 dout(20) << __func__ << " unsharing " << e << dendl;
10787 bluestore_blob_t& blob = e.blob->dirty_blob();
10788 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 10789 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
10790 }
10791 }
224ce89b
WB
10792 txc->write_onode(h);
10793
7c673cae
FG
10794 return 0;
10795}
10796
10797int BlueStore::_remove(TransContext *txc,
10798 CollectionRef& c,
10799 OnodeRef &o)
10800{
10801 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10802 int r = _do_remove(txc, c, o);
10803 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10804 return r;
10805}
10806
10807int BlueStore::_setattr(TransContext *txc,
10808 CollectionRef& c,
10809 OnodeRef& o,
10810 const string& name,
10811 bufferptr& val)
10812{
10813 dout(15) << __func__ << " " << c->cid << " " << o->oid
10814 << " " << name << " (" << val.length() << " bytes)"
10815 << dendl;
10816 int r = 0;
3efd9988
FG
10817 if (val.is_partial()) {
10818 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
10819 val.length());
10820 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10821 } else {
10822 auto& b = o->onode.attrs[name.c_str()] = val;
10823 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10824 }
7c673cae
FG
10825 txc->write_onode(o);
10826 dout(10) << __func__ << " " << c->cid << " " << o->oid
10827 << " " << name << " (" << val.length() << " bytes)"
10828 << " = " << r << dendl;
10829 return r;
10830}
10831
10832int BlueStore::_setattrs(TransContext *txc,
10833 CollectionRef& c,
10834 OnodeRef& o,
10835 const map<string,bufferptr>& aset)
10836{
10837 dout(15) << __func__ << " " << c->cid << " " << o->oid
10838 << " " << aset.size() << " keys"
10839 << dendl;
10840 int r = 0;
10841 for (map<string,bufferptr>::const_iterator p = aset.begin();
10842 p != aset.end(); ++p) {
3efd9988
FG
10843 if (p->second.is_partial()) {
10844 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 10845 bufferptr(p->second.c_str(), p->second.length());
3efd9988
FG
10846 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10847 } else {
10848 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
10849 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10850 }
7c673cae
FG
10851 }
10852 txc->write_onode(o);
10853 dout(10) << __func__ << " " << c->cid << " " << o->oid
10854 << " " << aset.size() << " keys"
10855 << " = " << r << dendl;
10856 return r;
10857}
10858
10859
10860int BlueStore::_rmattr(TransContext *txc,
10861 CollectionRef& c,
10862 OnodeRef& o,
10863 const string& name)
10864{
10865 dout(15) << __func__ << " " << c->cid << " " << o->oid
10866 << " " << name << dendl;
10867 int r = 0;
10868 auto it = o->onode.attrs.find(name.c_str());
10869 if (it == o->onode.attrs.end())
10870 goto out;
10871
10872 o->onode.attrs.erase(it);
10873 txc->write_onode(o);
10874
10875 out:
10876 dout(10) << __func__ << " " << c->cid << " " << o->oid
10877 << " " << name << " = " << r << dendl;
10878 return r;
10879}
10880
10881int BlueStore::_rmattrs(TransContext *txc,
10882 CollectionRef& c,
10883 OnodeRef& o)
10884{
10885 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10886 int r = 0;
10887
10888 if (o->onode.attrs.empty())
10889 goto out;
10890
10891 o->onode.attrs.clear();
10892 txc->write_onode(o);
10893
10894 out:
10895 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10896 return r;
10897}
10898
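// All omap rows for an onode live under PREFIX_OMAP with a key prefix derived
// from the onode's nid; get_omap_header()/get_omap_tail() bracket that range,
// so clearing omap is a simple iterate-and-rmkey over [header, tail).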
10899void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
10900{
10901 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10902 string prefix, tail;
10903 get_omap_header(id, &prefix);
10904 get_omap_tail(id, &tail);
10905 it->lower_bound(prefix);
10906 while (it->valid()) {
10907 if (it->key() >= tail) {
10908 dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
10909 << dendl;
10910 break;
10911 }
10912 txc->t->rmkey(PREFIX_OMAP, it->key());
10913 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10914 it->next();
10915 }
10916}
10917
10918int BlueStore::_omap_clear(TransContext *txc,
10919 CollectionRef& c,
10920 OnodeRef& o)
10921{
10922 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10923 int r = 0;
10924 if (o->onode.has_omap()) {
10925 o->flush();
10926 _do_omap_clear(txc, o->onode.nid);
10927 o->onode.clear_omap_flag();
10928 txc->write_onode(o);
10929 }
10930 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10931 return r;
10932}
10933
10934int BlueStore::_omap_setkeys(TransContext *txc,
10935 CollectionRef& c,
10936 OnodeRef& o,
10937 bufferlist &bl)
10938{
10939 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10940 int r;
10941 bufferlist::iterator p = bl.begin();
10942 __u32 num;
10943 if (!o->onode.has_omap()) {
10944 o->onode.set_omap_flag();
10945 txc->write_onode(o);
10946 } else {
10947 txc->note_modified_object(o);
10948 }
10949 string final_key;
10950 _key_encode_u64(o->onode.nid, &final_key);
10951 final_key.push_back('.');
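  // the encoded u64 nid (8 bytes) plus the '.' separator forms a fixed
  // 9-byte per-object prefix; resize(9) below trims back to that prefix
  // before appending each user key.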
10952 ::decode(num, p);
10953 while (num--) {
10954 string key;
10955 bufferlist value;
10956 ::decode(key, p);
10957 ::decode(value, p);
10958 final_key.resize(9); // keep prefix
10959 final_key += key;
10960 dout(30) << __func__ << " " << pretty_binary_string(final_key)
10961 << " <- " << key << dendl;
10962 txc->t->set(PREFIX_OMAP, final_key, value);
10963 }
10964 r = 0;
10965 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10966 return r;
10967}
10968
10969int BlueStore::_omap_setheader(TransContext *txc,
10970 CollectionRef& c,
10971 OnodeRef &o,
10972 bufferlist& bl)
10973{
10974 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10975 int r;
10976 string key;
10977 if (!o->onode.has_omap()) {
10978 o->onode.set_omap_flag();
10979 txc->write_onode(o);
10980 } else {
10981 txc->note_modified_object(o);
10982 }
10983 get_omap_header(o->onode.nid, &key);
10984 txc->t->set(PREFIX_OMAP, key, bl);
10985 r = 0;
10986 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10987 return r;
10988}
10989
10990int BlueStore::_omap_rmkeys(TransContext *txc,
10991 CollectionRef& c,
10992 OnodeRef& o,
10993 bufferlist& bl)
10994{
10995 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10996 int r = 0;
10997 bufferlist::iterator p = bl.begin();
10998 __u32 num;
10999 string final_key;
11000
11001 if (!o->onode.has_omap()) {
11002 goto out;
11003 }
11004 _key_encode_u64(o->onode.nid, &final_key);
11005 final_key.push_back('.');
11006 ::decode(num, p);
11007 while (num--) {
11008 string key;
11009 ::decode(key, p);
11010 final_key.resize(9); // keep prefix
11011 final_key += key;
11012 dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
11013 << " <- " << key << dendl;
11014 txc->t->rmkey(PREFIX_OMAP, final_key);
11015 }
11016 txc->note_modified_object(o);
11017
11018 out:
11019 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11020 return r;
11021}
11022
11023int BlueStore::_omap_rmkey_range(TransContext *txc,
11024 CollectionRef& c,
11025 OnodeRef& o,
11026 const string& first, const string& last)
11027{
11028 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11029 KeyValueDB::Iterator it;
11030 string key_first, key_last;
11031 int r = 0;
11032 if (!o->onode.has_omap()) {
11033 goto out;
11034 }
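  // flush so previously queued transactions touching this onode have
  // committed before we iterate the db, then remove every key in
  // [first, last).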
11035 o->flush();
11036 it = db->get_iterator(PREFIX_OMAP);
11037 get_omap_key(o->onode.nid, first, &key_first);
11038 get_omap_key(o->onode.nid, last, &key_last);
11039 it->lower_bound(key_first);
11040 while (it->valid()) {
11041 if (it->key() >= key_last) {
11042 dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
11043 << dendl;
11044 break;
11045 }
11046 txc->t->rmkey(PREFIX_OMAP, it->key());
11047 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
11048 it->next();
11049 }
11050 txc->note_modified_object(o);
11051
11052 out:
11053 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11054 return r;
11055}
11056
11057int BlueStore::_set_alloc_hint(
11058 TransContext *txc,
11059 CollectionRef& c,
11060 OnodeRef& o,
11061 uint64_t expected_object_size,
11062 uint64_t expected_write_size,
11063 uint32_t flags)
11064{
11065 dout(15) << __func__ << " " << c->cid << " " << o->oid
11066 << " object_size " << expected_object_size
11067 << " write_size " << expected_write_size
11068 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11069 << dendl;
11070 int r = 0;
11071 o->onode.expected_object_size = expected_object_size;
11072 o->onode.expected_write_size = expected_write_size;
11073 o->onode.alloc_hint_flags = flags;
11074 txc->write_onode(o);
11075 dout(10) << __func__ << " " << c->cid << " " << o->oid
11076 << " object_size " << expected_object_size
11077 << " write_size " << expected_write_size
11078 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11079 << " = " << r << dendl;
11080 return r;
11081}
11082
11083int BlueStore::_clone(TransContext *txc,
11084 CollectionRef& c,
11085 OnodeRef& oldo,
11086 OnodeRef& newo)
11087{
11088 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11089 << newo->oid << dendl;
11090 int r = 0;
11091 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
11092 derr << __func__ << " mismatched hash on " << oldo->oid
11093 << " and " << newo->oid << dendl;
11094 return -EINVAL;
11095 }
11096
7c673cae
FG
11097 _assign_nid(txc, newo);
11098
11099 // clone data
11100 oldo->flush();
11101 _do_truncate(txc, c, newo, 0);
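  // bluestore_clone_cow chooses between a metadata-only clone that shares the
  // underlying blobs (copy-on-write) and a plain read of the source followed
  // by a full rewrite into the destination object.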
11102 if (cct->_conf->bluestore_clone_cow) {
11103 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
11104 } else {
11105 bufferlist bl;
11106 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
11107 if (r < 0)
11108 goto out;
11109 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
11110 if (r < 0)
11111 goto out;
11112 }
11113
11114 // clone attrs
11115 newo->onode.attrs = oldo->onode.attrs;
11116
11117 // clone omap
11118 if (newo->onode.has_omap()) {
11119 dout(20) << __func__ << " clearing old omap data" << dendl;
11120 newo->flush();
11121 _do_omap_clear(txc, newo->onode.nid);
11122 }
11123 if (oldo->onode.has_omap()) {
11124 dout(20) << __func__ << " copying omap data" << dendl;
11125 if (!newo->onode.has_omap()) {
11126 newo->onode.set_omap_flag();
11127 }
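    // copy every omap row, rewriting the nid prefix of each source key to the
    // destination onode's nid.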
11128 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
11129 string head, tail;
11130 get_omap_header(oldo->onode.nid, &head);
11131 get_omap_tail(oldo->onode.nid, &tail);
11132 it->lower_bound(head);
11133 while (it->valid()) {
11134 if (it->key() >= tail) {
11135 dout(30) << __func__ << " reached tail" << dendl;
11136 break;
11137 } else {
11138 dout(30) << __func__ << " got header/data "
11139 << pretty_binary_string(it->key()) << dendl;
11140 string key;
11141 rewrite_omap_key(newo->onode.nid, it->key(), &key);
11142 txc->t->set(PREFIX_OMAP, key, it->value());
11143 }
11144 it->next();
11145 }
11146 } else {
11147 newo->onode.clear_omap_flag();
11148 }
11149
11150 txc->write_onode(newo);
11151 r = 0;
11152
11153 out:
11154 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11155 << newo->oid << " = " << r << dendl;
11156 return r;
11157}
11158
11159int BlueStore::_do_clone_range(
11160 TransContext *txc,
11161 CollectionRef& c,
11162 OnodeRef& oldo,
11163 OnodeRef& newo,
224ce89b
WB
11164 uint64_t srcoff,
11165 uint64_t length,
11166 uint64_t dstoff)
7c673cae
FG
11167{
11168 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11169 << newo->oid
11170 << " 0x" << std::hex << srcoff << "~" << length << " -> "
11171 << " 0x" << dstoff << "~" << length << std::dec << dendl;
11172 oldo->extent_map.fault_range(db, srcoff, length);
11173 newo->extent_map.fault_range(db, dstoff, length);
11174 _dump_onode(oldo);
11175 _dump_onode(newo);
11176
11177 // hmm, this could go into an ExtentMap::dup() method.
11178 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
11179 for (auto &e : oldo->extent_map.extent_map) {
11180 e.blob->last_encoded_id = -1;
11181 }
11182 int n = 0;
7c673cae 11183 uint64_t end = srcoff + length;
224ce89b
WB
11184 uint32_t dirty_range_begin = 0;
11185 uint32_t dirty_range_end = 0;
35e4c445 11186 bool src_dirty = false;
7c673cae
FG
11187 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
11188 ep != oldo->extent_map.extent_map.end();
11189 ++ep) {
11190 auto& e = *ep;
11191 if (e.logical_offset >= end) {
11192 break;
11193 }
11194 dout(20) << __func__ << " src " << e << dendl;
11195 BlobRef cb;
11196 bool blob_duped = true;
11197 if (e.blob->last_encoded_id >= 0) {
11198 // blob is already duped
11199 cb = id_to_blob[e.blob->last_encoded_id];
11200 blob_duped = false;
11201 } else {
11202 // dup the blob
11203 const bluestore_blob_t& blob = e.blob->get_blob();
11204 // make sure it is shared
11205 if (!blob.is_shared()) {
11206 c->make_blob_shared(_assign_blobid(txc), e.blob);
35e4c445
FG
11207 if (!src_dirty) {
11208 src_dirty = true;
224ce89b
WB
11209 dirty_range_begin = e.logical_offset;
11210 }
11211 assert(e.logical_end() > 0);
11212 // -1 to exclude next potential shard
11213 dirty_range_end = e.logical_end() - 1;
7c673cae
FG
11214 } else {
11215 c->load_shared_blob(e.blob->shared_blob);
11216 }
11217 cb = new Blob();
11218 e.blob->last_encoded_id = n;
11219 id_to_blob[n] = cb;
11220 e.blob->dup(*cb);
11221 // bump the extent refs on the copied blob's extents
11222 for (auto p : blob.get_extents()) {
11223 if (p.is_valid()) {
11224 e.blob->shared_blob->get_ref(p.offset, p.length);
11225 }
11226 }
11227 txc->write_shared_blob(e.blob->shared_blob);
11228 dout(20) << __func__ << " new " << *cb << dendl;
11229 }
11230 // dup extent
11231 int skip_front, skip_back;
11232 if (e.logical_offset < srcoff) {
11233 skip_front = srcoff - e.logical_offset;
11234 } else {
11235 skip_front = 0;
11236 }
11237 if (e.logical_end() > end) {
11238 skip_back = e.logical_end() - end;
11239 } else {
11240 skip_back = 0;
11241 }
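    // skip_front/skip_back clip the source extent to the [srcoff, end) window;
    // the new extent lands at the same relative position shifted by
    // (dstoff - srcoff).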
11242 Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
11243 e.blob_offset + skip_front,
11244 e.length - skip_front - skip_back, cb);
11245 newo->extent_map.extent_map.insert(*ne);
11246 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
11247 // fixme: we may leave parts of new blob unreferenced that could
11248 // be freed (relative to the shared_blob).
11249 txc->statfs_delta.stored() += ne->length;
11250 if (e.blob->get_blob().is_compressed()) {
11251 txc->statfs_delta.compressed_original() += ne->length;
11252 if (blob_duped){
11253 txc->statfs_delta.compressed() +=
11254 cb->get_blob().get_compressed_payload_length();
11255 }
11256 }
11257 dout(20) << __func__ << " dst " << *ne << dendl;
11258 ++n;
11259 }
35e4c445 11260 if (src_dirty) {
224ce89b
WB
11261 oldo->extent_map.dirty_range(dirty_range_begin,
11262 dirty_range_end - dirty_range_begin);
7c673cae
FG
11263 txc->write_onode(oldo);
11264 }
11265 txc->write_onode(newo);
11266
11267 if (dstoff + length > newo->onode.size) {
11268 newo->onode.size = dstoff + length;
11269 }
31f18b77 11270 newo->extent_map.dirty_range(dstoff, length);
7c673cae
FG
11271 _dump_onode(oldo);
11272 _dump_onode(newo);
11273 return 0;
11274}
11275
11276int BlueStore::_clone_range(TransContext *txc,
11277 CollectionRef& c,
11278 OnodeRef& oldo,
11279 OnodeRef& newo,
11280 uint64_t srcoff, uint64_t length, uint64_t dstoff)
11281{
11282 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11283 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11284 << " to offset 0x" << dstoff << std::dec << dendl;
11285 int r = 0;
11286
35e4c445
FG
11287 if (srcoff + length >= OBJECT_MAX_SIZE ||
11288 dstoff + length >= OBJECT_MAX_SIZE) {
11289 r = -E2BIG;
11290 goto out;
11291 }
7c673cae
FG
11292 if (srcoff + length > oldo->onode.size) {
11293 r = -EINVAL;
11294 goto out;
11295 }
11296
7c673cae
FG
11297 _assign_nid(txc, newo);
11298
11299 if (length > 0) {
11300 if (cct->_conf->bluestore_clone_cow) {
11301 _do_zero(txc, c, newo, dstoff, length);
11302 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
11303 } else {
11304 bufferlist bl;
11305 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
11306 if (r < 0)
11307 goto out;
11308 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
11309 if (r < 0)
11310 goto out;
11311 }
11312 }
11313
11314 txc->write_onode(newo);
11315 r = 0;
11316
11317 out:
11318 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11319 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11320 << " to offset 0x" << dstoff << std::dec
11321 << " = " << r << dendl;
11322 return r;
11323}
11324
11325int BlueStore::_rename(TransContext *txc,
11326 CollectionRef& c,
11327 OnodeRef& oldo,
11328 OnodeRef& newo,
11329 const ghobject_t& new_oid)
11330{
11331 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11332 << new_oid << dendl;
11333 int r;
11334 ghobject_t old_oid = oldo->oid;
31f18b77 11335 mempool::bluestore_cache_other::string new_okey;
7c673cae
FG
11336
11337 if (newo) {
11338 if (newo->exists) {
11339 r = -EEXIST;
11340 goto out;
11341 }
11342 assert(txc->onodes.count(newo) == 0);
11343 }
11344
11345 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
11346
11347 // rewrite shards
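  // remove each extent-map shard stored under the old object key and mark it
  // dirty so write_onode() re-emits it under the new key.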
11348 {
11349 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
11350 get_object_key(cct, new_oid, &new_okey);
11351 string key;
11352 for (auto &s : oldo->extent_map.shards) {
11353 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
11354 [&](const string& final_key) {
11355 txc->t->rmkey(PREFIX_OBJ, final_key);
11356 }
11357 );
11358 s.dirty = true;
11359 }
11360 }
11361
11362 newo = oldo;
11363 txc->write_onode(newo);
11364
11365 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
11366 // Onode in the old slot
11367 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
11368 r = 0;
11369
11370 out:
11371 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
11372 << new_oid << " = " << r << dendl;
11373 return r;
11374}
11375
11376// collections
11377
11378int BlueStore::_create_collection(
11379 TransContext *txc,
11380 const coll_t &cid,
11381 unsigned bits,
11382 CollectionRef *c)
11383{
11384 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
11385 int r;
11386 bufferlist bl;
11387
11388 {
11389 RWLock::WLocker l(coll_lock);
11390 if (*c) {
11391 r = -EEXIST;
11392 goto out;
11393 }
11394 c->reset(
11395 new Collection(
11396 this,
11397 cache_shards[cid.hash_to_shard(cache_shards.size())],
11398 cid));
11399 (*c)->cnode.bits = bits;
11400 coll_map[cid] = *c;
11401 }
11402 ::encode((*c)->cnode, bl);
11403 txc->t->set(PREFIX_COLL, stringify(cid), bl);
11404 r = 0;
11405
11406 out:
11407 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
11408 return r;
11409}
11410
11411int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
11412 CollectionRef *c)
11413{
11414 dout(15) << __func__ << " " << cid << dendl;
11415 int r;
11416
11417 {
11418 RWLock::WLocker l(coll_lock);
11419 if (!*c) {
11420 r = -ENOENT;
11421 goto out;
11422 }
11423 size_t nonexistent_count = 0;
11424 assert((*c)->exists);
11425 if ((*c)->onode_map.map_any([&](OnodeRef o) {
11426 if (o->exists) {
11427 dout(10) << __func__ << " " << o->oid << " " << o
11428 << " exists in onode_map" << dendl;
11429 return true;
11430 }
11431 ++nonexistent_count;
11432 return false;
11433 })) {
11434 r = -ENOTEMPTY;
11435 goto out;
11436 }
11437
11438 vector<ghobject_t> ls;
11439 ghobject_t next;
11440 // Enumerate onodes in the db, up to nonexistent_count + 1,
11441 // then check whether all of them are marked as non-existent.
11442 // Bypass the check if the returned number is greater than nonexistent_count.
11443 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
11444 nonexistent_count + 1, &ls, &next);
11445 if (r >= 0) {
11446 bool exists = false; //ls.size() > nonexistent_count;
11447 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
11448 dout(10) << __func__ << " oid " << *it << dendl;
11449 auto onode = (*c)->onode_map.lookup(*it);
11450 exists = !onode || onode->exists;
11451 if (exists) {
11452 dout(10) << __func__ << " " << *it
11453 << " exists in db" << dendl;
11454 }
11455 }
11456 if (!exists) {
11457 coll_map.erase(cid);
11458 txc->removed_collections.push_back(*c);
11459 (*c)->exists = false;
11460 c->reset();
11461 txc->t->rmkey(PREFIX_COLL, stringify(cid));
11462 r = 0;
11463 } else {
11464 dout(10) << __func__ << " " << cid
11465 << " is non-empty" << dendl;
11466 r = -ENOTEMPTY;
11467 }
11468 }
11469 }
11470
11471 out:
11472 dout(10) << __func__ << " " << cid << " = " << r << dendl;
11473 return r;
11474}
11475
11476int BlueStore::_split_collection(TransContext *txc,
11477 CollectionRef& c,
11478 CollectionRef& d,
11479 unsigned bits, int rem)
11480{
11481 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
11482 << " bits " << bits << dendl;
11483 RWLock::WLocker l(c->lock);
11484 RWLock::WLocker l2(d->lock);
11485 int r;
11486
11487 // flush all previous deferred writes on this sequencer. this is a bit
11488 // heavyweight, but we need to make sure all deferred writes complete
11489 // before we split as the new collection's sequencer may need to order
11490 // this after those writes, and we don't bother with the complexity of
11491 // moving those TransContexts over to the new osr.
11492 _osr_drain_preceding(txc);
11493
11494 // move any cached items (onodes and referenced shared blobs) that will
11495 // belong to the child collection post-split. leave everything else behind.
11496 // this may include things that don't strictly belong to the now-smaller
11497 // parent split, but the OSD will always send us a split for every new
11498 // child.
11499
11500 spg_t pgid, dest_pgid;
11501 bool is_pg = c->cid.is_pg(&pgid);
11502 assert(is_pg);
11503 is_pg = d->cid.is_pg(&dest_pgid);
11504 assert(is_pg);
11505
11506 // the destination should initially be empty.
11507 assert(d->onode_map.empty());
11508 assert(d->shared_blob_set.empty());
11509 assert(d->cnode.bits == bits);
11510
11511 c->split_cache(d.get());
11512
11513 // adjust bits. note that this will be redundant for all but the first
11514 // split call for this parent (first child).
11515 c->cnode.bits = bits;
11516 assert(d->cnode.bits == bits);
11517 r = 0;
11518
11519 bufferlist bl;
11520 ::encode(c->cnode, bl);
11521 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
11522
11523 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
11524 << " bits " << bits << " = " << r << dendl;
11525 return r;
11526}
11527
11528// DB key value Histogram
11529#define KEY_SLAB 32
11530#define VALUE_SLAB 64
11531
11532const string prefix_onode = "o";
11533const string prefix_onode_shard = "x";
11534const string prefix_other = "Z";
11535
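// keys are binned into 32-byte slabs and values into 64-byte slabs; e.g. a
// 70-byte key maps to slab 70/32 = 2, reported as the range "[64,96)".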
11536int BlueStore::DBHistogram::get_key_slab(size_t sz)
11537{
11538 return (sz/KEY_SLAB);
11539}
11540
11541string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
11542{
11543 int lower_bound = slab * KEY_SLAB;
11544 int upper_bound = (slab + 1) * KEY_SLAB;
11545 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11546 return ret;
11547}
11548
11549int BlueStore::DBHistogram::get_value_slab(size_t sz)
11550{
11551 return (sz/VALUE_SLAB);
11552}
11553
11554string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
11555{
11556 int lower_bound = slab * VALUE_SLAB;
11557 int upper_bound = (slab + 1) * VALUE_SLAB;
11558 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11559 return ret;
11560}
11561
11562void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
11563 const string &prefix, size_t key_size, size_t value_size)
11564{
11565 uint32_t key_slab = get_key_slab(key_size);
11566 uint32_t value_slab = get_value_slab(value_size);
11567 key_hist[prefix][key_slab].count++;
11568 key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len);
11569 key_hist[prefix][key_slab].val_map[value_slab].count++;
11570 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11571 MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
11572}
11573
11574void BlueStore::DBHistogram::dump(Formatter *f)
11575{
11576 f->open_object_section("rocksdb_value_distribution");
11577 for (auto i : value_hist) {
11578 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
11579 }
11580 f->close_section();
11581
11582 f->open_object_section("rocksdb_key_value_histogram");
11583 for (auto i : key_hist) {
11584 f->dump_string("prefix", i.first);
11585 f->open_object_section("key_hist");
11586 for ( auto k : i.second) {
11587 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
11588 f->dump_unsigned("max_len", k.second.max_len);
11589 f->open_object_section("value_hist");
11590 for ( auto j : k.second.val_map) {
11591 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
11592 f->dump_unsigned("max_len", j.second.max_len);
11593 }
11594 f->close_section();
11595 }
11596 f->close_section();
11597 }
11598 f->close_section();
11599}
11600
11601 // Iterates through the db and collects the stats
11602void BlueStore::generate_db_histogram(Formatter *f)
11603{
11604 //globals
11605 uint64_t num_onodes = 0;
11606 uint64_t num_shards = 0;
11607 uint64_t num_super = 0;
11608 uint64_t num_coll = 0;
11609 uint64_t num_omap = 0;
11610 uint64_t num_deferred = 0;
11611 uint64_t num_alloc = 0;
11612 uint64_t num_stat = 0;
11613 uint64_t num_others = 0;
11614 uint64_t num_shared_shards = 0;
11615 size_t max_key_size =0, max_value_size = 0;
11616 uint64_t total_key_size = 0, total_value_size = 0;
11617 size_t key_size = 0, value_size = 0;
11618 DBHistogram hist;
11619
11620 utime_t start = ceph_clock_now();
11621
11622 KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
11623 iter->seek_to_first();
11624 while (iter->valid()) {
11625 dout(30) << __func__ << " Key: " << iter->key() << dendl;
11626 key_size = iter->key_size();
11627 value_size = iter->value_size();
11628 hist.value_hist[hist.get_value_slab(value_size)]++;
11629 max_key_size = MAX(max_key_size, key_size);
11630 max_value_size = MAX(max_value_size, value_size);
11631 total_key_size += key_size;
11632 total_value_size += value_size;
11633
11634 pair<string,string> key(iter->raw_key());
11635
11636 if (key.first == PREFIX_SUPER) {
11637 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
11638 num_super++;
11639 } else if (key.first == PREFIX_STAT) {
11640 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
11641 num_stat++;
11642 } else if (key.first == PREFIX_COLL) {
11643 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
11644 num_coll++;
11645 } else if (key.first == PREFIX_OBJ) {
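      // object keys ending in the onode suffix are full onodes; all others are
      // extent-map shard keys.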
11646 if (key.second.back() == ONODE_KEY_SUFFIX) {
11647 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
11648 num_onodes++;
11649 } else {
11650 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
11651 num_shards++;
11652 }
11653 } else if (key.first == PREFIX_OMAP) {
11654 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
11655 num_omap++;
11656 } else if (key.first == PREFIX_DEFERRED) {
11657 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
11658 num_deferred++;
11659 } else if (key.first == PREFIX_ALLOC || key.first == "b" ) {
11660 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
11661 num_alloc++;
11662 } else if (key.first == PREFIX_SHARED_BLOB) {
11663 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
11664 num_shared_shards++;
11665 } else {
11666 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
11667 num_others++;
11668 }
11669 iter->next();
11670 }
11671
11672 utime_t duration = ceph_clock_now() - start;
11673 f->open_object_section("rocksdb_key_value_stats");
11674 f->dump_unsigned("num_onodes", num_onodes);
11675 f->dump_unsigned("num_shards", num_shards);
11676 f->dump_unsigned("num_super", num_super);
11677 f->dump_unsigned("num_coll", num_coll);
11678 f->dump_unsigned("num_omap", num_omap);
11679 f->dump_unsigned("num_deferred", num_deferred);
11680 f->dump_unsigned("num_alloc", num_alloc);
11681 f->dump_unsigned("num_stat", num_stat);
11682 f->dump_unsigned("num_shared_shards", num_shared_shards);
11683 f->dump_unsigned("num_others", num_others);
11684 f->dump_unsigned("max_key_size", max_key_size);
11685 f->dump_unsigned("max_value_size", max_value_size);
11686 f->dump_unsigned("total_key_size", total_key_size);
11687 f->dump_unsigned("total_value_size", total_value_size);
11688 f->close_section();
11689
11690 hist.dump(f);
11691
11692 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
11693
11694}
11695
31f18b77 11696void BlueStore::_flush_cache()
7c673cae
FG
11697{
11698 dout(10) << __func__ << dendl;
11699 for (auto i : cache_shards) {
11700 i->trim_all();
31f18b77 11701 assert(i->empty());
7c673cae
FG
11702 }
11703 for (auto& p : coll_map) {
3efd9988
FG
11704 if (!p.second->onode_map.empty()) {
11705 derr << __func__ << " stray onodes on " << p.first << dendl;
11706 p.second->onode_map.dump(cct, 0);
11707 }
11708 if (!p.second->shared_blob_set.empty()) {
11709 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11710 p.second->shared_blob_set.dump(cct, 0);
11711 }
7c673cae
FG
11712 assert(p.second->onode_map.empty());
11713 assert(p.second->shared_blob_set.empty());
11714 }
11715 coll_map.clear();
11716}
11717
31f18b77
FG
11718 // For external callers.
11719 // Unlike _flush_cache(), this is best-effort:
11720 // we don't care if there are still some pinned onodes/data in the cache
11721 // after this call completes.
11722void BlueStore::flush_cache()
11723{
11724 dout(10) << __func__ << dendl;
11725 for (auto i : cache_shards) {
11726 i->trim_all();
11727 }
11728}
11729
7c673cae
FG
11730void BlueStore::_apply_padding(uint64_t head_pad,
11731 uint64_t tail_pad,
7c673cae
FG
11732 bufferlist& padded)
11733{
7c673cae 11734 if (head_pad) {
224ce89b 11735 padded.prepend_zero(head_pad);
7c673cae
FG
11736 }
11737 if (tail_pad) {
11738 padded.append_zero(tail_pad);
11739 }
11740 if (head_pad || tail_pad) {
11741 dout(20) << __func__ << " pad head 0x" << std::hex << head_pad
11742 << " tail 0x" << tail_pad << std::dec << dendl;
11743 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
11744 }
11745}
11746
11747// ===========================================