1// vim: ts=8 sw=2 smarttab
2/*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2014 Red Hat
6 *
7 * This is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License version 2.1, as published by the Free Software
10 * Foundation. See file COPYING.
11 *
12 */
13
14#include <unistd.h>
15#include <stdlib.h>
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <fcntl.h>
19
20#include "BlueStore.h"
21#include "os/kv.h"
22#include "include/compat.h"
23#include "include/intarith.h"
24#include "include/stringify.h"
25#include "common/errno.h"
26#include "common/safe_io.h"
27#include "Allocator.h"
28#include "FreelistManager.h"
29#include "BlueFS.h"
30#include "BlueRocksEnv.h"
31#include "auth/Crypto.h"
32#include "common/EventTrace.h"
33
34#define dout_context cct
35#define dout_subsys ceph_subsys_bluestore
36
37// bluestore_meta_onode
38MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
39 bluestore_meta_onode);
40
41// bluestore_meta_other
42MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
43 bluestore_meta_other);
44MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
45 bluestore_meta_other);
46MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
47 bluestore_meta_other);
48MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
49 bluestore_meta_other);
50
51// kv store prefixes
52const string PREFIX_SUPER = "S"; // field -> value
53const string PREFIX_STAT = "T"; // field -> value(int64 array)
54const string PREFIX_COLL = "C"; // collection name -> cnode_t
55const string PREFIX_OBJ = "O"; // object name -> onode_t
56const string PREFIX_OMAP = "M"; // u64 + keyname -> value
57const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
58const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
59const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
60
61// write a label in the first block. always use this size. note that
62// bluefs makes a matching assumption about the location of its
63// superblock (always the second block of the device).
64#define BDEV_LABEL_BLOCK_SIZE 4096
65
66// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
67#define SUPER_RESERVED 8192
68
69#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
70
71
72/*
73 * extent map blob encoding
74 *
75 * we use the low bits of the blobid field to indicate some common scenarios
76 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
77 */
78#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
79#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
80#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
81#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
82#define BLOBID_SHIFT_BITS 4
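// Illustrative note (not part of the original source): per the flag
// definitions above and ExtentMap::{encode,decode}_some(), a spanning blob
// with id 5 whose extent starts at the end of the previous one at blob
// offset 0 would be packed roughly as
//   blobid = (5 << BLOBID_SHIFT_BITS)
//          | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET
//          | BLOBID_FLAG_SPANNING;   // == 0x5b
// i.e. the low 4 bits carry the flags, the remaining bits the blob id.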
83
84/*
85 * object name key structure
86 *
87 * encoded u8: shard + 2^7 (so that it sorts properly)
88 * encoded u64: poolid + 2^63 (so that it sorts properly)
89 * encoded u32: hash (bit reversed)
90 *
91 * escaped string: namespace
92 *
93 * escaped string: key or object name
94 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
95 * we are done. otherwise, it is followed by the object name.
96 * escaped string: object name (unless '=' above)
97 *
98 * encoded u64: snap
99 * encoded u64: generation
100 * 'o'
101 */
102#define ONODE_KEY_SUFFIX 'o'
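// Illustrative layout (assumption, not in the original source): for an
// object in pool 1 with shard NO_SHARD, an empty namespace and a key equal
// to its name, the onode key built by get_object_key() below is,
// schematically:
//   0x7f                       shard   (-1 + 2^7)
//   0x8000000000000001         pool    (1 + 2^63)
//   <bit-reversed 32-bit hash>
//   '!'                        empty namespace, terminated
//   <escaped name> '!' '='     key == object name
//   <snap, u64> <generation, u64>
//   'o'                        ONODE_KEY_SUFFIX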
103
104/*
105 * extent shard key
106 *
107 * object prefix key
108 * u32
109 * 'x'
110 */
111#define EXTENT_SHARD_KEY_SUFFIX 'x'
112
113/*
114 * string encoding in the key
115 *
116 * The key string needs to lexicographically sort the same way that
117 * ghobject_t does. We do this by escaping anything <= '#' with #
118 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
119 * hex digits.
120 *
121 * We use ! as a terminator for strings; this works because it is < #
122 * and will get escaped if it is present in the string.
123 *
124 */
125template<typename S>
126static void append_escaped(const string &in, S *out)
127{
128 char hexbyte[8];
129 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
130 if (*i <= '#') {
131 snprintf(hexbyte, sizeof(hexbyte), "#%02x", (uint8_t)*i);
132 out->append(hexbyte);
133 } else if (*i >= '~') {
134 snprintf(hexbyte, sizeof(hexbyte), "~%02x", (uint8_t)*i);
135 out->append(hexbyte);
136 } else {
137 out->push_back(*i);
138 }
139 }
140 out->push_back('!');
141}
142
143static int decode_escaped(const char *p, string *out)
144{
145 const char *orig_p = p;
146 while (*p && *p != '!') {
147 if (*p == '#' || *p == '~') {
148 unsigned hex;
149 int r = sscanf(++p, "%2x", &hex);
150 if (r < 1)
151 return -EINVAL;
152 out->push_back((char)hex);
153 p += 2;
154 } else {
155 out->push_back(*p++);
156 }
157 }
158 return p - orig_p;
159}
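// Sketch (illustrative, not in the original source) of a round trip through
// the escaping above: '!' (0x21) is <= '#', so it is escaped, while ordinary
// characters pass through and the string is terminated with '!':
//   string k;
//   append_escaped("foo!bar", &k);        // k == "foo#21bar!"
//   string out;
//   int r = decode_escaped(k.c_str(), &out);
//   // out == "foo!bar", r == 9 (bytes consumed before the terminator)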
160
161// some things we encode in binary (as le32 or le64); print the
162// resulting key strings nicely
163template<typename S>
164static string pretty_binary_string(const S& in)
165{
166 char buf[10];
167 string out;
168 out.reserve(in.length() * 3);
169 enum { NONE, HEX, STRING } mode = NONE;
170 unsigned from = 0, i;
171 for (i=0; i < in.length(); ++i) {
172 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
173 (mode == HEX && in.length() - i >= 4 &&
174 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
175 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
176 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
177 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
178 if (mode == STRING) {
179 out.append(in.c_str() + from, i - from);
180 out.push_back('\'');
181 }
182 if (mode != HEX) {
183 out.append("0x");
184 mode = HEX;
185 }
186 if (in.length() - i >= 4) {
187 // print a whole u32 at once
188 snprintf(buf, sizeof(buf), "%08x",
189 (uint32_t)(((unsigned char)in[i] << 24) |
190 ((unsigned char)in[i+1] << 16) |
191 ((unsigned char)in[i+2] << 8) |
192 ((unsigned char)in[i+3] << 0)));
193 i += 3;
194 } else {
195 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
196 }
197 out.append(buf);
198 } else {
199 if (mode != STRING) {
200 out.push_back('\'');
201 mode = STRING;
202 from = i;
203 }
204 }
205 }
206 if (mode == STRING) {
207 out.append(in.c_str() + from, i - from);
208 out.push_back('\'');
209 }
210 return out;
211}
212
213template<typename T>
214static void _key_encode_shard(shard_id_t shard, T *key)
215{
216 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
217}
218
219static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
220{
221 pshard->id = (uint8_t)*key - (uint8_t)0x80;
222 return key + 1;
223}
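// Worked example (illustrative, not in the original source): the +0x80 bias
// makes shard ids sort correctly as unsigned key bytes.  shard_id_t::NO_SHARD
// (-1) encodes as (uint8_t)0xff + 0x80 -> 0x7f, shard 0 as 0x80 and shard 1
// as 0x81, so NO_SHARD sorts before every real shard.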
224
225static void get_coll_key_range(const coll_t& cid, int bits,
226 string *temp_start, string *temp_end,
227 string *start, string *end)
228{
229 temp_start->clear();
230 temp_end->clear();
231 start->clear();
232 end->clear();
233
234 spg_t pgid;
235 if (cid.is_pg(&pgid)) {
236 _key_encode_shard(pgid.shard, start);
237 *temp_start = *start;
238
239 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
240 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
241
242 *end = *start;
243 *temp_end = *temp_start;
244
245 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
246 _key_encode_u32(reverse_hash, start);
247 _key_encode_u32(reverse_hash, temp_start);
248
249 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
250 if (end_hash > 0xffffffffull)
251 end_hash = 0xffffffffull;
252
253 _key_encode_u32(end_hash, end);
254 _key_encode_u32(end_hash, temp_end);
255 } else {
256 _key_encode_shard(shard_id_t::NO_SHARD, start);
257 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
258 *end = *start;
259 _key_encode_u32(0, start);
260 _key_encode_u32(0xffffffff, end);
261
262 // no separate temp section
263 *temp_start = *end;
264 *temp_end = *end;
265 }
266}
267
268static void get_shared_blob_key(uint64_t sbid, string *key)
269{
270 key->clear();
271 _key_encode_u64(sbid, key);
272}
273
274static int get_key_shared_blob(const string& key, uint64_t *sbid)
275{
276 const char *p = key.c_str();
277 if (key.length() < sizeof(uint64_t))
278 return -1;
279 p = _key_decode_u64(p, sbid);
280 return 0;
281}
282
283template<typename S>
284static int get_key_object(const S& key, ghobject_t *oid)
285{
286 int r;
287 const char *p = key.c_str();
288
289 if (key.length() < 1 + 8 + 4)
290 return -1;
291 p = _key_decode_shard(p, &oid->shard_id);
292
293 uint64_t pool;
294 p = _key_decode_u64(p, &pool);
295 oid->hobj.pool = pool - 0x8000000000000000ull;
296
297 unsigned hash;
298 p = _key_decode_u32(p, &hash);
299
300 oid->hobj.set_bitwise_key_u32(hash);
301
302 r = decode_escaped(p, &oid->hobj.nspace);
303 if (r < 0)
304 return -2;
305 p += r + 1;
306
307 string k;
308 r = decode_escaped(p, &k);
309 if (r < 0)
310 return -3;
311 p += r + 1;
312 if (*p == '=') {
313 // no key
314 ++p;
315 oid->hobj.oid.name = k;
316 } else if (*p == '<' || *p == '>') {
317 // key + name
318 ++p;
319 r = decode_escaped(p, &oid->hobj.oid.name);
320 if (r < 0)
321 return -5;
322 p += r + 1;
323 oid->hobj.set_key(k);
324 } else {
325 // malformed
326 return -6;
327 }
328
329 p = _key_decode_u64(p, &oid->hobj.snap.val);
330 p = _key_decode_u64(p, &oid->generation);
331
332 if (*p != ONODE_KEY_SUFFIX) {
333 return -7;
334 }
335 p++;
336 if (*p) {
337 // if we get something other than a null terminator here,
338 // something is wrong.
339 return -8;
340 }
341
342 return 0;
343}
344
345template<typename S>
346static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
347{
348 key->clear();
349
350 size_t max_len = 1 + 8 + 4 +
351 (oid.hobj.nspace.length() * 3 + 1) +
352 (oid.hobj.get_key().length() * 3 + 1) +
353 1 + // for '<', '=', or '>'
354 (oid.hobj.oid.name.length() * 3 + 1) +
355 8 + 8 + 1;
356 key->reserve(max_len);
357
358 _key_encode_shard(oid.shard_id, key);
359 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
360 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
361
362 append_escaped(oid.hobj.nspace, key);
363
364 if (oid.hobj.get_key().length()) {
365 // is a key... could be < = or >.
366 append_escaped(oid.hobj.get_key(), key);
367 // (ASCII chars < = and > sort in that order, yay)
368 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
369 if (r) {
370 key->append(r > 0 ? ">" : "<");
371 append_escaped(oid.hobj.oid.name, key);
372 } else {
373 // same as no key
374 key->append("=");
375 }
376 } else {
377 // no key
378 append_escaped(oid.hobj.oid.name, key);
379 key->append("=");
380 }
381
382 _key_encode_u64(oid.hobj.snap, key);
383 _key_encode_u64(oid.generation, key);
384
385 key->push_back(ONODE_KEY_SUFFIX);
386
387 // sanity check
388 if (true) {
389 ghobject_t t;
390 int r = get_key_object(*key, &t);
391 if (r || t != oid) {
392 derr << " r " << r << dendl;
393 derr << "key " << pretty_binary_string(*key) << dendl;
394 derr << "oid " << oid << dendl;
395 derr << " t " << t << dendl;
396 assert(r == 0 && t == oid);
397 }
398 }
399}
400
401
402// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
403// char lets us quickly test whether it is a shard key without decoding any
404// of the prefix bytes.
405template<typename S>
406static void get_extent_shard_key(const S& onode_key, uint32_t offset,
407 string *key)
408{
409 key->clear();
410 key->reserve(onode_key.length() + 4 + 1);
411 key->append(onode_key.c_str(), onode_key.size());
412 _key_encode_u32(offset, key);
413 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
414}
415
416static void rewrite_extent_shard_key(uint32_t offset, string *key)
417{
418 assert(key->size() > sizeof(uint32_t) + 1);
419 assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
420 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
421}
422
423template<typename S>
424static void generate_extent_shard_key_and_apply(
425 const S& onode_key,
426 uint32_t offset,
427 string *key,
428 std::function<void(const string& final_key)> apply)
429{
430 if (key->empty()) { // make full key
431 assert(!onode_key.empty());
432 get_extent_shard_key(onode_key, offset, key);
433 } else {
434 rewrite_extent_shard_key(offset, key);
435 }
436 apply(*key);
437}
438
439int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
440{
441 assert(key.size() > sizeof(uint32_t) + 1);
442 assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
443 int okey_len = key.size() - sizeof(uint32_t) - 1;
444 *onode_key = key.substr(0, okey_len);
445 const char *p = key.data() + okey_len;
446 p = _key_decode_u32(p, offset);
447 return 0;
448}
449
450static bool is_extent_shard_key(const string& key)
451{
452 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
453}
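// Illustrative layout (assumption, not in the original source): an extent
// shard key is the full onode key with the u32 shard offset (encoded so that
// byte-wise ordering matches numeric order) and the 'x' suffix appended:
//   <onode key bytes> <u32 offset> 'x'
// which is why is_extent_shard_key() only needs to inspect the final byte
// and rewrite_extent_shard_key() can patch the 4 offset bytes in place.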
454
455// '-' < '.' < '~'
456static void get_omap_header(uint64_t id, string *out)
457{
458 _key_encode_u64(id, out);
459 out->push_back('-');
460}
461
462// hmm, I don't think there's any need to escape the user key since we
463// have a clean prefix.
464static void get_omap_key(uint64_t id, const string& key, string *out)
465{
466 _key_encode_u64(id, out);
467 out->push_back('.');
468 out->append(key);
469}
470
471static void rewrite_omap_key(uint64_t id, string old, string *out)
472{
473 _key_encode_u64(id, out);
474 out->append(old.c_str() + out->length(), old.size() - out->length());
475}
476
477static void decode_omap_key(const string& key, string *user_key)
478{
479 *user_key = key.substr(sizeof(uint64_t) + 1);
480}
481
482static void get_omap_tail(uint64_t id, string *out)
483{
484 _key_encode_u64(id, out);
485 out->push_back('~');
486}
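// Illustrative note (not in the original source): for a given omap id the
// rows are bracketed by the header and tail keys, because the separator
// bytes sort as '-' < '.' < '~':
//   <id>'-'              omap header
//   <id>'.'<user key>    omap entries
//   <id>'~'              omap tail
// so a range scan from get_omap_header() to get_omap_tail() covers exactly
// one object's omap data.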
487
488static void get_deferred_key(uint64_t seq, string *out)
489{
490 _key_encode_u64(seq, out);
491}
492
493
494// merge operators
495
496struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
497 void merge_nonexistent(
498 const char *rdata, size_t rlen, std::string *new_value) override {
499 *new_value = std::string(rdata, rlen);
500 }
501 void merge(
502 const char *ldata, size_t llen,
503 const char *rdata, size_t rlen,
504 std::string *new_value) override {
505 assert(llen == rlen);
506 assert((rlen % 8) == 0);
507 new_value->resize(rlen);
508 const __le64* lv = (const __le64*)ldata;
509 const __le64* rv = (const __le64*)rdata;
510 __le64* nv = &(__le64&)new_value->at(0);
511 for (size_t i = 0; i < rlen >> 3; ++i) {
512 nv[i] = lv[i] + rv[i];
513 }
514 }
515 // We use each operator name and each prefix to construct the
516 // overall RocksDB operator name for the consistency check at open time.
517 string name() const override {
518 return "int64_array";
519 }
520};
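// Worked example (illustrative, not in the original source): the merge
// operator adds serialized int64 arrays element-wise, which lets counters
// (e.g. the PREFIX_STAT values) be accumulated without a read-modify-write:
//   existing value {1, 2} merged with operand {3, 4} -> {4, 6}
// merge_nonexistent() simply adopts the operand unchanged.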
521
522
523// Buffer
524
525ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
526{
527 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
528 << b.offset << "~" << b.length << std::dec
529 << " " << BlueStore::Buffer::get_state_name(b.state);
530 if (b.flags)
531 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
532 return out << ")";
533}
534
535// Garbage Collector
536
537void BlueStore::GarbageCollector::process_protrusive_extents(
538 const BlueStore::ExtentMap& extent_map,
539 uint64_t start_offset,
540 uint64_t end_offset,
541 uint64_t start_touch_offset,
542 uint64_t end_touch_offset,
543 uint64_t min_alloc_size)
544{
545 assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
546
547 uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
548 uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);
549
550 dout(30) << __func__ << " (hex): [" << std::hex
551 << lookup_start_offset << ", " << lookup_end_offset
552 << ")" << std::dec << dendl;
553
554 for (auto it = extent_map.seek_lextent(lookup_start_offset);
555 it != extent_map.extent_map.end() &&
556 it->logical_offset < lookup_end_offset;
557 ++it) {
558 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
559 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
560
561 dout(30) << __func__ << " " << *it
562 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
563 << dendl;
564
565 Blob* b = it->blob.get();
566
567 if (it->logical_offset >= start_touch_offset &&
568 it->logical_end() <= end_touch_offset) {
569 // Process extents within the range affected by
570 // the current write request.
571 // Need to take into account if existing extents
572 // can be merged with them (uncompressed case)
573 if (!b->get_blob().is_compressed()) {
574 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
575 --blob_info_counted->expected_allocations; // don't need to allocate
576 // new AU for compressed
577 // data since another
578 // collocated uncompressed
579 // blob already exists
580 dout(30) << __func__ << " --expected:"
581 << alloc_unit_start << dendl;
582 }
583 used_alloc_unit = alloc_unit_end;
584 blob_info_counted = nullptr;
585 }
586 } else if (b->get_blob().is_compressed()) {
587
588 // additionally we take compressed blobs that were not impacted
589 // by the write into account too
590 BlobInfo& bi =
591 affected_blobs.emplace(
592 b, BlobInfo(b->get_referenced_bytes())).first->second;
593
594 int adjust =
595 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
596 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
597 dout(30) << __func__ << " expected_allocations="
598 << bi.expected_allocations << " end_au:"
599 << alloc_unit_end << dendl;
600
601 blob_info_counted = &bi;
602 used_alloc_unit = alloc_unit_end;
603
604 assert(it->length <= bi.referenced_bytes);
605 bi.referenced_bytes -= it->length;
606 dout(30) << __func__ << " affected_blob:" << *b
607 << " unref 0x" << std::hex << it->length
608 << " referenced = 0x" << bi.referenced_bytes
609 << std::dec << dendl;
610 // NOTE: we can't move a specific blob to the resulting GC list here
611 // when its reference counter reaches 0, since subsequent extents might
612 // decrement its expected_allocations.
613 // Hence we need to enumerate all the extents first.
614 if (!bi.collect_candidate) {
615 bi.first_lextent = it;
616 bi.collect_candidate = true;
617 }
618 bi.last_lextent = it;
619 } else {
620 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
621 // don't need to allocate new AU for compressed data since another
622 // collocated uncompressed blob already exists
623 --blob_info_counted->expected_allocations;
624 dout(30) << __func__ << " --expected_allocations:"
625 << alloc_unit_start << dendl;
626 }
627 used_alloc_unit = alloc_unit_end;
628 blob_info_counted = nullptr;
629 }
630 }
631
632 for (auto b_it = affected_blobs.begin();
633 b_it != affected_blobs.end();
634 ++b_it) {
635 Blob* b = b_it->first;
636 BlobInfo& bi = b_it->second;
637 if (bi.referenced_bytes == 0) {
638 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
639 int64_t blob_expected_for_release =
640 ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;
641
642 dout(30) << __func__ << " " << *(b_it->first)
643 << " expected4release=" << blob_expected_for_release
644 << " expected_allocations=" << bi.expected_allocations
645 << dendl;
646 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
647 if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
648 if (bi.collect_candidate) {
649 auto it = bi.first_lextent;
650 bool bExit = false;
651 do {
652 if (it->blob.get() == b) {
653 extents_to_collect.emplace_back(it->logical_offset, it->length);
654 }
655 bExit = it == bi.last_lextent;
656 ++it;
657 } while(!bExit);
658 }
659 expected_for_release += blob_expected_for_release;
660 expected_allocations += bi.expected_allocations;
661 }
662 }
663 }
664}
665
666int64_t BlueStore::GarbageCollector::estimate(
667 uint64_t start_offset,
668 uint64_t length,
669 const BlueStore::ExtentMap& extent_map,
670 const BlueStore::old_extent_map_t& old_extents,
671 uint64_t min_alloc_size)
672{
673
674 affected_blobs.clear();
675 extents_to_collect.clear();
676 used_alloc_unit = boost::optional<uint64_t >();
677 blob_info_counted = nullptr;
678
679 gc_start_offset = start_offset;
680 gc_end_offset = start_offset + length;
681
682 uint64_t end_offset = start_offset + length;
683
684 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
685 Blob* b = it->e.blob.get();
686 if (b->get_blob().is_compressed()) {
687
688 // update gc_start_offset/gc_end_offset if needed
689 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
690 gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());
691
692 auto o = it->e.logical_offset;
693 auto l = it->e.length;
694
695 uint64_t ref_bytes = b->get_referenced_bytes();
696 // micro optimization to bypass blobs that have no more references
697 if (ref_bytes != 0) {
698 dout(30) << __func__ << " affected_blob:" << *b
699 << " unref 0x" << std::hex << o << "~" << l
700 << std::dec << dendl;
701 affected_blobs.emplace(b, BlobInfo(ref_bytes));
702 }
703 }
704 }
705 dout(30) << __func__ << " gc range(hex): [" << std::hex
706 << gc_start_offset << ", " << gc_end_offset
707 << ")" << std::dec << dendl;
708
709 // enumerate preceding extents to check if they reference affected blobs
710 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
711 process_protrusive_extents(extent_map,
712 gc_start_offset,
713 gc_end_offset,
714 start_offset,
715 end_offset,
716 min_alloc_size);
717 }
718 return expected_for_release - expected_allocations;
719}
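// Illustrative arithmetic (not in the original source): estimate() returns
// expected_for_release - expected_allocations.  For example, if collecting
// the affected compressed blobs would release 8 allocation units on disk
// while rewriting the protrusive extents is expected to consume 3 new units,
// the result is 8 - 3 = 5; a positive value means GC is a net win, and a
// blob's extents are only queued when its own benefit reaches
// bluestore_gc_enable_blob_threshold.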
720
721// Cache
722
723BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
724 PerfCounters *logger)
725{
726 Cache *c = nullptr;
727
728 if (type == "lru")
729 c = new LRUCache(cct);
730 else if (type == "2q")
731 c = new TwoQCache(cct);
732 else
733 assert(0 == "unrecognized cache type");
734
735 c->logger = logger;
736 return c;
737}
738
739void BlueStore::Cache::trim_all()
740{
741 std::lock_guard<std::recursive_mutex> l(lock);
742 _trim(0, 0);
743 assert(_get_num_onodes() == 0);
744 assert(_get_buffer_bytes() == 0);
745}
746
747void BlueStore::Cache::trim(
748 uint64_t target_bytes,
749 float target_meta_ratio,
750 float bytes_per_onode)
751{
752 std::lock_guard<std::recursive_mutex> l(lock);
753 uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
754 uint64_t current_buffer = _get_buffer_bytes();
755 uint64_t current = current_meta + current_buffer;
756
757 uint64_t target_meta = target_bytes * (double)target_meta_ratio; // need to cast to double
758 // since multiplying by float(1) may yield a target_meta slightly greater
759 // than target_bytes, and the unsigned subtraction for target_buffer below
760 // would then wrap around.
761 // Consider the following code:
762 //uint64_t i =(uint64_t)227*1024*1024*1024 + 1;
763 //float f = 1;
764 //uint64_t i2 = i*f;
765 //assert(i == i2);
766
767 target_meta = min(target_bytes, target_meta); //and just in case that ratio is > 1
768 uint64_t target_buffer = target_bytes - target_meta;
769
770 if (current <= target_bytes) {
771 dout(10) << __func__
772 << " shard target " << pretty_si_t(target_bytes)
773 << " ratio " << target_meta_ratio << " ("
774 << pretty_si_t(target_meta) << " + "
775 << pretty_si_t(target_buffer) << "), "
776 << " current " << pretty_si_t(current) << " ("
777 << pretty_si_t(current_meta) << " + "
778 << pretty_si_t(current_buffer) << ")"
779 << dendl;
780 return;
781 }
782
783 uint64_t need_to_free = current - target_bytes;
784 uint64_t free_buffer = 0;
785 uint64_t free_meta = 0;
786 if (current_buffer > target_buffer) {
787 free_buffer = current_buffer - target_buffer;
788 if (free_buffer > need_to_free) {
789 free_buffer = need_to_free;
790 }
791 }
792 free_meta = need_to_free - free_buffer;
793
794 // start bounds at what we have now
795 uint64_t max_buffer = current_buffer - free_buffer;
796 uint64_t max_meta = current_meta - free_meta;
797 uint64_t max_onodes = max_meta / bytes_per_onode;
798
799 dout(10) << __func__
800 << " shard target " << pretty_si_t(target_bytes)
801 << " ratio " << target_meta_ratio << " ("
802 << pretty_si_t(target_meta) << " + "
803 << pretty_si_t(target_buffer) << "), "
804 << " current " << pretty_si_t(current) << " ("
805 << pretty_si_t(current_meta) << " + "
806 << pretty_si_t(current_buffer) << "),"
807 << " need_to_free " << pretty_si_t(need_to_free) << " ("
808 << pretty_si_t(free_meta) << " + "
809 << pretty_si_t(free_buffer) << ")"
810 << " -> max " << max_onodes << " onodes + "
811 << max_buffer << " buffer"
812 << dendl;
813 _trim(max_onodes, max_buffer);
814}
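// Worked example (illustrative, not in the original source): with
// target_bytes = 1 GiB, target_meta_ratio = 0.25 and bytes_per_onode = 4 KiB,
// the budgets are target_meta = 256 MiB and target_buffer = 768 MiB.  If the
// shard currently holds 300 MiB of onode metadata and 800 MiB of buffers,
// need_to_free = 1100 - 1024 = 76 MiB; buffers give up
// min(800 - 768, 76) = 32 MiB and the remaining 44 MiB comes out of metadata,
// so _trim() is asked to keep at most 768 MiB of buffers and
// (300 - 44) MiB / 4 KiB = 65536 onodes.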
815
816
817// LRUCache
818#undef dout_prefix
819#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
820
821void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
822{
823 auto p = onode_lru.iterator_to(*o);
824 onode_lru.erase(p);
825 onode_lru.push_front(*o);
826}
827
828void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
829{
830 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
831 << " buffers " << buffer_size << " / " << buffer_max
832 << dendl;
833
834 _audit("trim start");
835
836 // buffers
837 while (buffer_size > buffer_max) {
838 auto i = buffer_lru.rbegin();
839 if (i == buffer_lru.rend()) {
840 // stop if buffer_lru is now empty
841 break;
842 }
843
844 Buffer *b = &*i;
845 assert(b->is_clean());
846 dout(20) << __func__ << " rm " << *b << dendl;
847 b->space->_rm_buffer(this, b);
848 }
849
850 // onodes
851 int num = onode_lru.size() - onode_max;
852 if (num <= 0)
853 return; // don't even try
854
855 auto p = onode_lru.end();
856 assert(p != onode_lru.begin());
857 --p;
858 int skipped = 0;
859 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
860 while (num > 0) {
861 Onode *o = &*p;
862 int refs = o->nref.load();
863 if (refs > 1) {
864 dout(20) << __func__ << " " << o->oid << " has " << refs
865 << " refs, skipping" << dendl;
866 if (++skipped >= max_skipped) {
867 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
868 << num << " left to trim" << dendl;
869 break;
870 }
871
872 if (p == onode_lru.begin()) {
873 break;
874 } else {
875 p--;
876 num--;
877 continue;
878 }
879 }
880 dout(30) << __func__ << " rm " << o->oid << dendl;
881 if (p != onode_lru.begin()) {
882 onode_lru.erase(p--);
883 } else {
884 onode_lru.erase(p);
885 assert(num == 1);
886 }
887 o->get(); // paranoia
888 o->c->onode_map.remove(o->oid);
889 o->put();
890 --num;
891 }
892}
893
894#ifdef DEBUG_CACHE
895void BlueStore::LRUCache::_audit(const char *when)
896{
897 dout(10) << __func__ << " " << when << " start" << dendl;
898 uint64_t s = 0;
899 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
900 s += i->length;
901 }
902 if (s != buffer_size) {
903 derr << __func__ << " buffer_size " << buffer_size << " actual " << s
904 << dendl;
905 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
906 derr << __func__ << " " << *i << dendl;
907 }
908 assert(s == buffer_size);
909 }
910 dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
911 << " ok" << dendl;
912}
913#endif
914
915// TwoQCache
916#undef dout_prefix
917#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
918
919
920void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
921{
922 auto p = onode_lru.iterator_to(*o);
923 onode_lru.erase(p);
924 onode_lru.push_front(*o);
925}
926
927void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
928{
929 dout(20) << __func__ << " level " << level << " near " << near
930 << " on " << *b
931 << " which has cache_private " << b->cache_private << dendl;
932 if (near) {
933 b->cache_private = near->cache_private;
934 switch (b->cache_private) {
935 case BUFFER_WARM_IN:
936 buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
937 break;
938 case BUFFER_WARM_OUT:
939 assert(b->is_empty());
940 buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
941 break;
942 case BUFFER_HOT:
943 buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
944 break;
945 default:
946 assert(0 == "bad cache_private");
947 }
948 } else if (b->cache_private == BUFFER_NEW) {
949 b->cache_private = BUFFER_WARM_IN;
950 if (level > 0) {
951 buffer_warm_in.push_front(*b);
952 } else {
953 // take caller hint to start at the back of the warm queue
954 buffer_warm_in.push_back(*b);
955 }
956 } else {
957 // we got a hint from discard
958 switch (b->cache_private) {
959 case BUFFER_WARM_IN:
960 // stay in warm_in. move to front, even though 2Q doesn't actually
961 // do this.
962 dout(20) << __func__ << " move to front of warm " << *b << dendl;
963 buffer_warm_in.push_front(*b);
964 break;
965 case BUFFER_WARM_OUT:
966 b->cache_private = BUFFER_HOT;
967 // move to hot. fall-thru
968 case BUFFER_HOT:
969 dout(20) << __func__ << " move to front of hot " << *b << dendl;
970 buffer_hot.push_front(*b);
971 break;
972 default:
973 assert(0 == "bad cache_private");
974 }
975 }
976 if (!b->is_empty()) {
977 buffer_bytes += b->length;
978 buffer_list_bytes[b->cache_private] += b->length;
979 }
980}
981
982void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
983{
984 dout(20) << __func__ << " " << *b << dendl;
985 if (!b->is_empty()) {
986 assert(buffer_bytes >= b->length);
987 buffer_bytes -= b->length;
988 assert(buffer_list_bytes[b->cache_private] >= b->length);
989 buffer_list_bytes[b->cache_private] -= b->length;
990 }
991 switch (b->cache_private) {
992 case BUFFER_WARM_IN:
993 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
994 break;
995 case BUFFER_WARM_OUT:
996 buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
997 break;
998 case BUFFER_HOT:
999 buffer_hot.erase(buffer_hot.iterator_to(*b));
1000 break;
1001 default:
1002 assert(0 == "bad cache_private");
1003 }
1004}
1005
1006void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
1007{
1008 TwoQCache *src = static_cast<TwoQCache*>(srcc);
1009 src->_rm_buffer(b);
1010
1011 // preserve which list we're on (even if we can't preserve the order!)
1012 switch (b->cache_private) {
1013 case BUFFER_WARM_IN:
1014 assert(!b->is_empty());
1015 buffer_warm_in.push_back(*b);
1016 break;
1017 case BUFFER_WARM_OUT:
1018 assert(b->is_empty());
1019 buffer_warm_out.push_back(*b);
1020 break;
1021 case BUFFER_HOT:
1022 assert(!b->is_empty());
1023 buffer_hot.push_back(*b);
1024 break;
1025 default:
1026 assert(0 == "bad cache_private");
1027 }
1028 if (!b->is_empty()) {
1029 buffer_bytes += b->length;
1030 buffer_list_bytes[b->cache_private] += b->length;
1031 }
1032}
1033
1034void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
1035{
1036 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1037 if (!b->is_empty()) {
1038 assert((int64_t)buffer_bytes + delta >= 0);
1039 buffer_bytes += delta;
1040 assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
1041 buffer_list_bytes[b->cache_private] += delta;
1042 }
1043}
1044
1045void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
1046{
1047 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
1048 << " buffers " << buffer_bytes << " / " << buffer_max
1049 << dendl;
1050
1051 _audit("trim start");
1052
1053 // buffers
1054 if (buffer_bytes > buffer_max) {
1055 uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
1056 uint64_t khot = buffer_max - kin;
1057
1058 // pre-calculate kout based on the average buffer size too, which is
1059 // representative for now (the warm_in and hot lists may change later)
1060 uint64_t kout = 0;
1061 uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
1062 if (buffer_num) {
1063 uint64_t buffer_avg_size = buffer_bytes / buffer_num;
1064 assert(buffer_avg_size);
1065 uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
1066 kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1067 }
1068
1069 if (buffer_list_bytes[BUFFER_HOT] < khot) {
1070 // hot is small, give slack to warm_in
1071 kin += khot - buffer_list_bytes[BUFFER_HOT];
1072 } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
1073 // warm_in is small, give slack to hot
1074 khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
1075 }
1076
1077 // adjust warm_in list
1078 int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
1079 uint64_t evicted = 0;
1080
1081 while (to_evict_bytes > 0) {
1082 auto p = buffer_warm_in.rbegin();
1083 if (p == buffer_warm_in.rend()) {
1084 // stop if warm_in list is now empty
1085 break;
1086 }
1087
1088 Buffer *b = &*p;
1089 assert(b->is_clean());
1090 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1091 assert(buffer_bytes >= b->length);
1092 buffer_bytes -= b->length;
1093 assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
1094 buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
1095 to_evict_bytes -= b->length;
1096 evicted += b->length;
1097 b->state = Buffer::STATE_EMPTY;
1098 b->data.clear();
1099 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1100 buffer_warm_out.push_front(*b);
1101 b->cache_private = BUFFER_WARM_OUT;
1102 }
1103
1104 if (evicted > 0) {
1105 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1106 << " from warm_in list, done evicting warm_in buffers"
1107 << dendl;
1108 }
1109
1110 // adjust hot list
1111 to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
1112 evicted = 0;
1113
1114 while (to_evict_bytes > 0) {
1115 auto p = buffer_hot.rbegin();
1116 if (p == buffer_hot.rend()) {
1117 // stop if hot list is now empty
1118 break;
1119 }
1120
1121 Buffer *b = &*p;
1122 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1123 assert(b->is_clean());
1124 // adjust evict size before buffer goes invalid
1125 to_evict_bytes -= b->length;
1126 evicted += b->length;
1127 b->space->_rm_buffer(this, b);
1128 }
1129
1130 if (evicted > 0) {
1131 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1132 << " from hot list, done evicting hot buffers"
1133 << dendl;
1134 }
1135
1136 // adjust warm out list too, if necessary
1137 int64_t num = buffer_warm_out.size() - kout;
1138 while (num-- > 0) {
1139 Buffer *b = &*buffer_warm_out.rbegin();
1140 assert(b->is_empty());
1141 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1142 b->space->_rm_buffer(this, b);
1143 }
1144 }
1145
1146 // onodes
1147 int num = onode_lru.size() - onode_max;
1148 if (num <= 0)
1149 return; // don't even try
1150
1151 auto p = onode_lru.end();
1152 assert(p != onode_lru.begin());
1153 --p;
1154 int skipped = 0;
1155 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
1156 while (num > 0) {
1157 Onode *o = &*p;
1158 dout(20) << __func__ << " considering " << o << dendl;
1159 int refs = o->nref.load();
1160 if (refs > 1) {
1161 dout(20) << __func__ << " " << o->oid << " has " << refs
1162 << " refs; skipping" << dendl;
1163 if (++skipped >= max_skipped) {
1164 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
1165 << num << " left to trim" << dendl;
1166 break;
1167 }
1168
1169 if (p == onode_lru.begin()) {
1170 break;
1171 } else {
1172 p--;
1173 num--;
1174 continue;
1175 }
1176 }
1177 dout(30) << __func__ << " " << o->oid << " num=" << num << " lru size=" << onode_lru.size() << dendl;
1178 if (p != onode_lru.begin()) {
1179 onode_lru.erase(p--);
1180 } else {
1181 onode_lru.erase(p);
1182 assert(num == 1);
1183 }
1184 o->get(); // paranoia
1185 o->c->onode_map.remove(o->oid);
1186 o->put();
1187 --num;
1188 }
1189}
1190
1191#ifdef DEBUG_CACHE
1192void BlueStore::TwoQCache::_audit(const char *when)
1193{
1194 dout(10) << __func__ << " " << when << " start" << dendl;
1195 uint64_t s = 0;
1196 for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
1197 s += i->length;
1198 }
1199
1200 uint64_t hot_bytes = s;
1201 if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
1202 derr << __func__ << " hot_list_bytes "
1203 << buffer_list_bytes[BUFFER_HOT]
1204 << " != actual " << hot_bytes
1205 << dendl;
1206 assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
1207 }
1208
1209 for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
1210 s += i->length;
1211 }
1212
1213 uint64_t warm_in_bytes = s - hot_bytes;
1214 if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
1215 derr << __func__ << " warm_in_list_bytes "
1216 << buffer_list_bytes[BUFFER_WARM_IN]
1217 << " != actual " << warm_in_bytes
1218 << dendl;
1219 assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
1220 }
1221
1222 if (s != buffer_bytes) {
1223 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1224 << dendl;
1225 assert(s == buffer_bytes);
1226 }
1227
1228 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1229 << " ok" << dendl;
1230}
1231#endif
1232
1233
1234// BufferSpace
1235
1236#undef dout_prefix
1237#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1238
1239void BlueStore::BufferSpace::_clear(Cache* cache)
1240{
1241 // note: we already hold cache->lock
1242 ldout(cache->cct, 20) << __func__ << dendl;
1243 while (!buffer_map.empty()) {
1244 _rm_buffer(cache, buffer_map.begin());
1245 }
1246}
1247
1248int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
1249{
1250 // note: we already hold cache->lock
1251 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1252 << std::dec << dendl;
1253 int cache_private = 0;
1254 cache->_audit("discard start");
1255 auto i = _data_lower_bound(offset);
1256 uint32_t end = offset + length;
1257 while (i != buffer_map.end()) {
1258 Buffer *b = i->second.get();
1259 if (b->offset >= end) {
1260 break;
1261 }
1262 if (b->cache_private > cache_private) {
1263 cache_private = b->cache_private;
1264 }
1265 if (b->offset < offset) {
1266 int64_t front = offset - b->offset;
1267 if (b->end() > end) {
1268 // drop middle (split)
1269 uint32_t tail = b->end() - end;
1270 if (b->data.length()) {
1271 bufferlist bl;
1272 bl.substr_of(b->data, b->length - tail, tail);
1273 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, bl), 0, b);
1274 } else {
1275 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail), 0, b);
1276 }
1277 if (!b->is_writing()) {
1278 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1279 }
1280 b->truncate(front);
1281 cache->_audit("discard end 1");
1282 break;
1283 } else {
1284 // drop tail
1285 if (!b->is_writing()) {
1286 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1287 }
1288 b->truncate(front);
1289 ++i;
1290 continue;
1291 }
1292 }
1293 if (b->end() <= end) {
1294 // drop entire buffer
1295 _rm_buffer(cache, i++);
1296 continue;
1297 }
1298 // drop front
1299 uint32_t keep = b->end() - end;
1300 if (b->data.length()) {
1301 bufferlist bl;
1302 bl.substr_of(b->data, b->length - keep, keep);
1303 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, bl), 0, b);
1304 } else {
1305 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1306 }
1307 _rm_buffer(cache, i);
1308 cache->_audit("discard end 2");
1309 break;
1310 }
1311 return cache_private;
1312}
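// Illustrative summary (not in the original source) of the cases _discard()
// handles for a cached buffer B and a discard range [offset, end):
//   - B starts at or past end:            stop
//   - range falls inside B:               keep the head, re-add the tail, stop
//   - range covers the tail of B:         truncate B, continue
//   - range covers B entirely:            remove B, continue
//   - range covers the head of B:         re-add the surviving tail, stop
// The highest cache_private seen in the range is returned to the caller.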
1313
1314void BlueStore::BufferSpace::read(
1315 Cache* cache,
1316 uint32_t offset, uint32_t length,
1317 BlueStore::ready_regions_t& res,
1318 interval_set<uint32_t>& res_intervals)
1319{
1320 std::lock_guard<std::recursive_mutex> l(cache->lock);
1321 res.clear();
1322 res_intervals.clear();
1323 uint32_t want_bytes = length;
1324 uint32_t end = offset + length;
1325 for (auto i = _data_lower_bound(offset);
1326 i != buffer_map.end() && offset < end && i->first < end;
1327 ++i) {
1328 Buffer *b = i->second.get();
1329 assert(b->end() > offset);
1330 if (b->is_writing() || b->is_clean()) {
1331 if (b->offset < offset) {
1332 uint32_t skip = offset - b->offset;
1333 uint32_t l = MIN(length, b->length - skip);
1334 res[offset].substr_of(b->data, skip, l);
1335 res_intervals.insert(offset, l);
1336 offset += l;
1337 length -= l;
1338 if (!b->is_writing()) {
1339 cache->_touch_buffer(b);
1340 }
1341 continue;
1342 }
1343 if (b->offset > offset) {
1344 uint32_t gap = b->offset - offset;
1345 if (length <= gap) {
1346 break;
1347 }
1348 offset += gap;
1349 length -= gap;
1350 }
1351 if (!b->is_writing()) {
1352 cache->_touch_buffer(b);
1353 }
1354 if (b->length > length) {
1355 res[offset].substr_of(b->data, 0, length);
1356 res_intervals.insert(offset, length);
1357 break;
1358 } else {
1359 res[offset].append(b->data);
1360 res_intervals.insert(offset, b->length);
1361 if (b->length == length)
1362 break;
1363 offset += b->length;
1364 length -= b->length;
1365 }
1366 }
1367 }
1368
1369 uint64_t hit_bytes = res_intervals.size();
1370 assert(hit_bytes <= want_bytes);
1371 uint64_t miss_bytes = want_bytes - hit_bytes;
1372 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1373 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1374}
1375
1376void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
1377{
1378 std::lock_guard<std::recursive_mutex> l(cache->lock);
1379
1380 auto i = writing.begin();
1381 while (i != writing.end()) {
1382 if (i->seq > seq) {
1383 break;
1384 }
1385 if (i->seq < seq) {
1386 ++i;
1387 continue;
1388 }
1389
1390 Buffer *b = &*i;
1391 assert(b->is_writing());
1392
1393 if (b->flags & Buffer::FLAG_NOCACHE) {
1394 writing.erase(i++);
1395 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1396 buffer_map.erase(b->offset);
1397 } else {
1398 b->state = Buffer::STATE_CLEAN;
1399 writing.erase(i++);
1400 cache->_add_buffer(b, 1, nullptr);
1401 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1402 }
1403 }
1404
1405 cache->_audit("finish_write end");
1406}
1407
1408void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1409{
1410 std::lock_guard<std::recursive_mutex> lk(cache->lock);
1411 if (buffer_map.empty())
1412 return;
1413
1414 auto p = --buffer_map.end();
1415 while (true) {
1416 if (p->second->end() <= pos)
1417 break;
1418
1419 if (p->second->offset < pos) {
1420 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1421 size_t left = pos - p->second->offset;
1422 size_t right = p->second->length - left;
1423 if (p->second->data.length()) {
1424 bufferlist bl;
1425 bl.substr_of(p->second->data, left, right);
1426 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1427 0, p->second.get());
1428 } else {
1429 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1430 0, p->second.get());
1431 }
1432 cache->_adjust_buffer_size(p->second.get(), -right);
1433 p->second->truncate(left);
1434 break;
1435 }
1436
1437 assert(p->second->end() > pos);
1438 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1439 if (p->second->data.length()) {
1440 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1441 p->second->offset - pos, p->second->data),
1442 0, p->second.get());
1443 } else {
1444 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1445 p->second->offset - pos, p->second->length),
1446 0, p->second.get());
1447 }
1448 if (p == buffer_map.begin()) {
1449 _rm_buffer(cache, p);
1450 break;
1451 } else {
1452 _rm_buffer(cache, p--);
1453 }
1454 }
1455 assert(writing.empty());
1456}
1457
1458// OnodeSpace
1459
1460#undef dout_prefix
1461#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1462
1463BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1464{
1465 std::lock_guard<std::recursive_mutex> l(cache->lock);
1466 auto p = onode_map.find(oid);
1467 if (p != onode_map.end()) {
1468 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1469 << " raced, returning existing " << p->second
1470 << dendl;
1471 return p->second;
1472 }
1473 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1474 onode_map[oid] = o;
1475 cache->_add_onode(o, 1);
1476 return o;
1477}
1478
1479BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1480{
1481 std::lock_guard<std::recursive_mutex> l(cache->lock);
1482 ldout(cache->cct, 30) << __func__ << dendl;
1483 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1484 if (p == onode_map.end()) {
1485 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1486 cache->logger->inc(l_bluestore_onode_misses);
1487 return OnodeRef();
1488 }
1489 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1490 << dendl;
1491 cache->_touch_onode(p->second);
1492 cache->logger->inc(l_bluestore_onode_hits);
1493 return p->second;
1494}
1495
1496void BlueStore::OnodeSpace::clear()
1497{
1498 std::lock_guard<std::recursive_mutex> l(cache->lock);
1499 ldout(cache->cct, 10) << __func__ << dendl;
1500 for (auto &p : onode_map) {
1501 cache->_rm_onode(p.second);
1502 }
1503 onode_map.clear();
1504}
1505
1506bool BlueStore::OnodeSpace::empty()
1507{
1508 std::lock_guard<std::recursive_mutex> l(cache->lock);
1509 return onode_map.empty();
1510}
1511
1512void BlueStore::OnodeSpace::rename(
1513 OnodeRef& oldo,
1514 const ghobject_t& old_oid,
1515 const ghobject_t& new_oid,
1516 const mempool::bluestore_meta_other::string& new_okey)
1517{
1518 std::lock_guard<std::recursive_mutex> l(cache->lock);
1519 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1520 << dendl;
1521 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1522 po = onode_map.find(old_oid);
1523 pn = onode_map.find(new_oid);
1524 assert(po != pn);
1525
1526 assert(po != onode_map.end());
1527 if (pn != onode_map.end()) {
1528 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1529 << dendl;
1530 cache->_rm_onode(pn->second);
1531 onode_map.erase(pn);
1532 }
1533 OnodeRef o = po->second;
1534
1535 // install a non-existent onode at old location
1536 oldo.reset(new Onode(o->c, old_oid, o->key));
1537 po->second = oldo;
1538 cache->_add_onode(po->second, 1);
1539
1540 // add at new position and fix oid, key
1541 onode_map.insert(make_pair(new_oid, o));
1542 cache->_touch_onode(o);
1543 o->oid = new_oid;
1544 o->key = new_okey;
1545}
1546
1547bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1548{
1549 std::lock_guard<std::recursive_mutex> l(cache->lock);
1550 ldout(cache->cct, 20) << __func__ << dendl;
1551 for (auto& i : onode_map) {
1552 if (f(i.second)) {
1553 return true;
1554 }
1555 }
1556 return false;
1557}
1558
1559
1560// SharedBlob
1561
1562#undef dout_prefix
1563#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1564
1565ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1566{
1567 out << "SharedBlob(" << &sb;
1568
1569 if (sb.loaded) {
1570 out << " loaded " << *sb.persistent;
1571 } else {
1572 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1573 }
1574 return out << ")";
1575}
1576
1577BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1578 : coll(_coll), sbid_unloaded(i)
1579{
1580 assert(sbid_unloaded > 0);
1581 if (get_cache()) {
1582 get_cache()->add_blob();
1583 }
1584}
1585
1586BlueStore::SharedBlob::~SharedBlob()
1587{
1588 if (get_cache()) { // the dummy instances have a nullptr
1589 std::lock_guard<std::recursive_mutex> l(get_cache()->lock);
1590 bc._clear(get_cache());
1591 get_cache()->rm_blob();
1592 }
1593 if (loaded && persistent) {
1594 delete persistent;
1595 }
1596}
1597
1598void BlueStore::SharedBlob::put()
1599{
1600 if (--nref == 0) {
1601 ldout(coll->store->cct, 20) << __func__ << " " << this
1602 << " removing self from set " << get_parent()
1603 << dendl;
1604 if (get_parent()) {
1605 if (get_parent()->remove(this)) {
1606 delete this;
1607 } else {
1608 ldout(coll->store->cct, 20)
1609 << __func__ << " " << this << " lost race to remove myself from set"
1610 << dendl;
1611 }
1612 } else {
1613 delete this;
1614 }
1615 }
1616}
1617
1618void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1619{
1620 assert(persistent);
1621 persistent->ref_map.get(offset, length);
1622}
1623
1624void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
1625 PExtentVector *r)
1626{
1627 assert(persistent);
1628 persistent->ref_map.put(offset, length, r);
1629}
1630
1631// Blob
1632
1633#undef dout_prefix
1634#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1635
1636ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1637{
1638 out << "Blob(" << &b;
1639 if (b.is_spanning()) {
1640 out << " spanning " << b.id;
1641 }
1642 out << " " << b.get_blob() << " " << b.get_blob_use_tracker()
1643 << " " << *b.shared_blob
1644 << ")";
1645 return out;
1646}
1647
1648void BlueStore::Blob::discard_unallocated(Collection *coll)
1649{
1650 if (blob.is_shared()) {
1651 return;
1652 }
1653 if (blob.is_compressed()) {
1654 bool discard = false;
1655 bool all_invalid = true;
1656 for (auto e : blob.get_extents()) {
1657 if (!e.is_valid()) {
1658 discard = true;
1659 } else {
1660 all_invalid = false;
1661 }
1662 }
1663 assert(discard == all_invalid); // for a compressed blob either all
1664 // or none of the pextents are invalid.
1665 if (discard) {
1666 shared_blob->bc.discard(shared_blob->get_cache(), 0, blob.get_logical_length());
1667 }
1668 } else {
1669 size_t pos = 0;
1670 for (auto e : blob.get_extents()) {
1671 if (!e.is_valid()) {
1672 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1673 << "~" << e.length
1674 << std::dec << dendl;
1675 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1676 }
1677 pos += e.length;
1678 }
1679 if (blob.can_prune_tail()) {
1680 dirty_blob();
1681 blob.prune_tail();
1682 used_in_blob.prune_tail(blob.get_ondisk_length());
1683 auto cct = coll->store->cct; //used by dout
1684 dout(20) << __func__ << " pruned tail, now " << blob << dendl;
1685 }
1686 }
1687}
1688
1689void BlueStore::Blob::get_ref(
1690 Collection *coll,
1691 uint32_t offset,
1692 uint32_t length)
1693{
1694 // The caller has to initialize the Blob's logical length prior to incrementing
1695 // references. Otherwise one can neither determine the required number of
1696 // counters in the per-au tracking case nor obtain min_release_size for the
1697 // single counter mode.
1698 assert(get_blob().get_logical_length() != 0);
1699 auto cct = coll->store->cct;
1700 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1701 << std::dec << " " << *this << dendl;
1702
1703 if (used_in_blob.is_empty()) {
1704 uint32_t min_release_size =
1705 blob.get_release_size(coll->store->min_alloc_size);
1706 uint64_t l = blob.get_logical_length();
1707 dout(20) << __func__ << " init 0x" << std::hex << l << ", " << min_release_size
1708 << std::dec << dendl;
1709 used_in_blob.init(l, min_release_size);
1710 }
1711 used_in_blob.get(
1712 offset,
1713 length);
1714}
1715
1716bool BlueStore::Blob::put_ref(
1717 Collection *coll,
1718 uint32_t offset,
1719 uint32_t length,
1720 PExtentVector *r)
1721{
1722 PExtentVector logical;
1723
1724 auto cct = coll->store->cct;
1725 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1726 << std::dec << " " << *this << dendl;
1727
1728 bool empty = used_in_blob.put(
1729 offset,
1730 length,
1731 &logical);
1732 r->clear();
1733 // nothing to release
1734 if (!empty && logical.empty()) {
1735 return false;
1736 }
1737
1738 bluestore_blob_t& b = dirty_blob();
1739 return b.release_extents(empty, logical, r);
1740}
1741
1742bool BlueStore::Blob::try_reuse_blob(uint32_t min_alloc_size,
1743 uint32_t target_blob_size,
1744 uint32_t b_offset,
1745 uint32_t *length0) {
1746 assert(min_alloc_size);
1747 assert(target_blob_size);
1748 if (!get_blob().is_mutable()) {
1749 return false;
1750 }
1751
1752 uint32_t length = *length0;
1753 uint32_t end = b_offset + length;
1754
1755 // Currently for the sake of simplicity we omit blob reuse if data is
1756 // unaligned with csum chunk. Later we can perform padding if needed.
1757 if (get_blob().has_csum() &&
1758 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1759 (end % get_blob().get_csum_chunk_size()) != 0)) {
1760 return false;
1761 }
1762
1763 auto blen = get_blob().get_logical_length();
1764 uint32_t new_blen = blen;
1765
1766 // make sure target_blob_size isn't less than current blob len
1767 target_blob_size = MAX(blen, target_blob_size);
1768
1769 if (b_offset >= blen) {
1770 //new data totally stands out of the existing blob
1771 new_blen = b_offset + length;
1772 } else {
1773 //new data overlaps with the existing blob
1774 new_blen = MAX(blen, length + b_offset);
1775 if (!get_blob().is_unallocated(
1776 b_offset,
1777 new_blen > blen ? blen - b_offset : length)) {
1778 return false;
1779 }
1780 }
1781 if (new_blen > blen) {
1782 int64_t overflow = int64_t(new_blen) - target_blob_size;
1783 // Unable to decrease the provided length to fit into max_blob_size
1784 if (overflow >= length) {
1785 return false;
1786 }
1787
1788 // FIXME: in some cases we could reduce unused resolution
1789 if (get_blob().has_unused()) {
1790 return false;
1791 }
1792
1793 if (overflow > 0) {
1794 new_blen -= overflow;
1795 length -= overflow;
1796 *length0 = length;
1797 }
1798 if (new_blen > blen) {
1799 dirty_blob().add_tail(new_blen);
1800 used_in_blob.add_tail(new_blen,
1801 blob.get_release_size(min_alloc_size));
1802 }
1803 }
1804 return true;
1805}
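// Worked example (illustrative, not in the original source): with a mutable
// blob of logical length blen = 0x10000, target_blob_size = 0x20000 and a
// write of length 0x18000 at b_offset = 0x10000, the new data lies entirely
// beyond the blob, so new_blen = 0x28000.  That exceeds the target by
// 0x8000 (< length), so new_blen and *length0 are trimmed to 0x20000 and
// 0x10000 respectively, the blob gets a tail added up to 0x20000, and the
// caller must place the remaining 0x8000 bytes elsewhere.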
1806
1807void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1808{
1809 auto cct = coll->store->cct; //used by dout
1810 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1811 << " start " << *this << dendl;
1812 assert(blob.can_split());
1813 assert(used_in_blob.can_split());
1814 bluestore_blob_t &lb = dirty_blob();
1815 bluestore_blob_t &rb = r->dirty_blob();
1816
1817 used_in_blob.split(
1818 blob_offset,
1819 &(r->used_in_blob));
1820
1821 lb.split(blob_offset, rb);
1822 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1823
1824 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1825 << " finish " << *this << dendl;
1826 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1827 << " and " << *r << dendl;
1828}
1829
1830#ifndef CACHE_BLOB_BL
1831void BlueStore::Blob::decode(
1832 Collection *coll,
1833 bufferptr::iterator& p,
1834 uint64_t struct_v,
1835 uint64_t* sbid,
1836 bool include_ref_map)
1837{
1838 denc(blob, p, struct_v);
1839 if (blob.is_shared()) {
1840 denc(*sbid, p);
1841 }
1842 if (include_ref_map) {
1843 if (struct_v > 1) {
1844 used_in_blob.decode(p);
1845 } else {
1846 used_in_blob.clear();
1847 bluestore_extent_ref_map_t legacy_ref_map;
1848 legacy_ref_map.decode(p);
1849 for (auto r : legacy_ref_map.ref_map) {
1850 get_ref(
1851 coll,
1852 r.first,
1853 r.second.refs * r.second.length);
1854 }
1855 }
1856 }
1857}
1858#endif
1859
1860// Extent
1861
1862ostream& operator<<(ostream& out, const BlueStore::Extent& e)
1863{
1864 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
1865 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
1866 << " " << *e.blob;
1867}
1868
1869// OldExtent
1870BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
1871 uint32_t lo,
1872 uint32_t o,
1873 uint32_t l,
1874 BlobRef& b) {
1875 OldExtent* oe = new OldExtent(lo, o, l, b);
1876 b->put_ref(c.get(), o, l, &(oe->r));
1877 oe->blob_empty = b->get_referenced_bytes() == 0;
1878 return oe;
1879}
1880
1881// ExtentMap
1882
1883#undef dout_prefix
1884#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1885
1886BlueStore::ExtentMap::ExtentMap(Onode *o)
1887 : onode(o),
1888 inline_bl(
1889 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
1890}
1891
1892void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
1893 bool force)
1894{
1895 auto cct = onode->c->store->cct; //used by dout
1896 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
1897 if (onode->onode.extent_map_shards.empty()) {
1898 if (inline_bl.length() == 0) {
1899 unsigned n;
1900 // we need to encode inline_bl to measure encoded length
1901 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
1902 assert(!never_happen);
1903 size_t len = inline_bl.length();
1904 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
1905 << " extents" << dendl;
1906 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
1907 request_reshard(0, OBJECT_MAX_SIZE);
1908 return;
1909 }
1910 }
1911 // will persist in the onode key.
1912 } else {
1913 // pending shard update
1914 struct dirty_shard_t {
1915 Shard *shard;
1916 bufferlist bl;
1917 dirty_shard_t(Shard *s) : shard(s) {}
1918 };
1919 vector<dirty_shard_t> encoded_shards;
1920 // allocate slots for all shards in a single call instead of
1921 // doing multiple allocations - one per dirty shard
1922 encoded_shards.reserve(shards.size());
1923
1924 auto p = shards.begin();
1925 auto prev_p = p;
1926 while (p != shards.end()) {
1927 auto n = p;
1928 ++n;
1929 if (p->dirty) {
1930 uint32_t endoff;
1931 if (n == shards.end()) {
1932 endoff = OBJECT_MAX_SIZE;
1933 } else {
1934 endoff = n->shard_info->offset;
1935 }
1936 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
1937 bufferlist& bl = encoded_shards.back().bl;
1938 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
1939 bl, &p->extents)) {
1940 if (force) {
1941 derr << __func__ << " encode_some needs reshard" << dendl;
1942 assert(!force);
1943 }
1944 }
1945 size_t len = bl.length();
1946
1947 dout(20) << __func__ << " shard 0x" << std::hex
1948 << p->shard_info->offset << std::dec << " is " << len
1949 << " bytes (was " << p->shard_info->bytes << ") from "
1950 << p->extents << " extents" << dendl;
1951
1952 if (!force) {
1953 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
1954 // we are big; reshard ourselves
1955 request_reshard(p->shard_info->offset, endoff);
1956 }
1957 // avoid resharding the trailing shard, even if it is small
1958 else if (n != shards.end() &&
1959 len < g_conf->bluestore_extent_map_shard_min_size) {
1960 // we are small; combine with a neighbor
1961 if (p == shards.begin() && endoff == OBJECT_MAX_SIZE) {
1962 // we are an only shard
1963 request_reshard(0, OBJECT_MAX_SIZE);
1964 return;
1965 } else if (p == shards.begin()) {
1966 // combine with next shard
1967 request_reshard(p->shard_info->offset, endoff + 1);
1968 } else if (endoff == OBJECT_MAX_SIZE) {
1969 // combine with previous shard
1970 request_reshard(prev_p->shard_info->offset, endoff);
1971 return;
1972 } else {
1973 // combine with the smaller of the two
1974 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
1975 request_reshard(p->shard_info->offset, endoff + 1);
1976 } else {
1977 request_reshard(prev_p->shard_info->offset, endoff);
1978 }
1979 }
1980 }
1981 }
1982 }
1983 prev_p = p;
1984 p = n;
1985 }
1986 if (needs_reshard()) {
1987 return;
1988 }
1989
1990 // schedule DB update for dirty shards
1991 string key;
1992 for (auto& it : encoded_shards) {
1993 it.shard->dirty = false;
1994 it.shard->shard_info->bytes = it.bl.length();
1995 generate_extent_shard_key_and_apply(
1996 onode->key,
1997 it.shard->shard_info->offset,
1998 &key,
1999 [&](const string& final_key) {
2000 t->set(PREFIX_OBJ, final_key, it.bl);
2001 }
2002 );
2003 }
2004 }
2005}
2006
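// Recompute the extent map sharding over [needs_reshard_begin,
// needs_reshard_end).  Roughly: fault in the affected range, delete the old
// shard keys, estimate an average encoded extent size, walk the extents and
// start a new shard whenever the running estimate would exceed the target
// size, splice the new shard_info entries into the onode, and finally decide
// for each blob in the range whether it can be cut at the new shard
// boundaries or must be marked spanning (spanning blobs are effectively
// stored with the onode rather than inside any one shard).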
2007void BlueStore::ExtentMap::reshard(
2008 KeyValueDB *db,
2009 KeyValueDB::Transaction t)
2010{
2011 auto cct = onode->c->store->cct; // used by dout
2012
2013 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2014 << needs_reshard_end << ")" << std::dec
2015 << " of " << onode->onode.extent_map_shards.size()
2016 << " shards on " << onode->oid << dendl;
2017 for (auto& p : spanning_blob_map) {
2018 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2019 << dendl;
2020 }
2021 // determine shard index range
2022 unsigned si_begin = 0, si_end = 0;
2023 if (!shards.empty()) {
2024 while (si_begin + 1 < shards.size() &&
2025 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2026 ++si_begin;
2027 }
2028 needs_reshard_begin = shards[si_begin].shard_info->offset;
2029 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2030 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2031 needs_reshard_end = shards[si_end].shard_info->offset;
2032 break;
2033 }
2034 }
2035 if (si_end == shards.size()) {
2036 needs_reshard_end = OBJECT_MAX_SIZE;
2037 }
2038 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2039 << " over 0x[" << std::hex << needs_reshard_begin << ","
2040 << needs_reshard_end << ")" << std::dec << dendl;
2041 }
2042
2043 fault_range(db, needs_reshard_begin, needs_reshard_end);
2044
2045  // we may need to fault in a larger interval later: we must have all
2046  // referring extents for spanning blobs loaded in order to have
2047  // accurate use_tracker values.

2048 uint32_t spanning_scan_begin = needs_reshard_begin;
2049 uint32_t spanning_scan_end = needs_reshard_end;
2050
2051 // remove old keys
2052 string key;
2053 for (unsigned i = si_begin; i < si_end; ++i) {
2054 generate_extent_shard_key_and_apply(
2055 onode->key, shards[i].shard_info->offset, &key,
2056 [&](const string& final_key) {
2057 t->rmkey(PREFIX_OBJ, final_key);
2058 }
2059 );
2060 }
2061
2062 // calculate average extent size
2063 unsigned bytes = 0;
2064 unsigned extents = 0;
2065 if (onode->onode.extent_map_shards.empty()) {
2066 bytes = inline_bl.length();
2067 extents = extent_map.size();
2068 } else {
2069 for (unsigned i = si_begin; i < si_end; ++i) {
2070 bytes += shards[i].shard_info->bytes;
2071 extents += shards[i].extents;
2072 }
2073 }
2074 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2075 unsigned slop = target *
2076 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2077 unsigned extent_avg = bytes / MAX(1, extents);
2078 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2079 << ", slop " << slop << dendl;
2080
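  // Boundary selection: each extent is assumed to cost extent_avg encoded
  // bytes, and a new shard is started once the running estimate would exceed
  // the target; if cutting before the current extent would split a blob, the
  // cut is postponed until the estimate would exceed target + slop.
  // Illustrative numbers only: with target 500 and a 0.2 slop factor, slop is
  // 100, so a blob-splitting boundary is deferred until roughly 600 bytes.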
2081 // reshard
2082 unsigned estimate = 0;
2083 unsigned offset = 0;
2084 vector<bluestore_onode_t::shard_info> new_shard_info;
2085 unsigned max_blob_end = 0;
2086 Extent dummy(needs_reshard_begin);
2087 for (auto e = extent_map.lower_bound(dummy);
2088 e != extent_map.end();
2089 ++e) {
2090 if (e->logical_offset >= needs_reshard_end) {
2091 break;
2092 }
2093 dout(30) << " extent " << *e << dendl;
2094
2095 // disfavor shard boundaries that span a blob
2096 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2097 if (estimate &&
2098 estimate + extent_avg > target + (would_span ? slop : 0)) {
2099 // new shard
2100 if (offset == 0) {
2101 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2102 new_shard_info.back().offset = offset;
2103 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2104 << std::dec << dendl;
2105 }
2106 offset = e->logical_offset;
2107 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2108 new_shard_info.back().offset = offset;
2109 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2110 << std::dec << dendl;
2111 estimate = 0;
2112 }
2113 estimate += extent_avg;
2114 unsigned bb = e->blob_start();
2115 if (bb < spanning_scan_begin) {
2116 spanning_scan_begin = bb;
2117 }
2118 uint32_t be = e->blob_end();
2119 if (be > max_blob_end) {
2120 max_blob_end = be;
2121 }
2122 if (be > spanning_scan_end) {
2123 spanning_scan_end = be;
2124 }
2125 }
2126 if (new_shard_info.empty() && (si_begin > 0 ||
2127 si_end < shards.size())) {
2128 // we resharded a partial range; we must produce at least one output
2129 // shard
2130 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2131 new_shard_info.back().offset = needs_reshard_begin;
2132 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2133 << std::dec << " (singleton degenerate case)" << dendl;
2134 }
2135
2136 auto& sv = onode->onode.extent_map_shards;
2137 dout(20) << __func__ << " new " << new_shard_info << dendl;
2138 dout(20) << __func__ << " old " << sv << dendl;
2139 if (sv.empty()) {
2140 // no old shards to keep
2141 sv.swap(new_shard_info);
2142 init_shards(true, true);
2143 } else {
2144 // splice in new shards
2145 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2146 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2147 sv.insert(
2148 sv.begin() + si_begin,
2149 new_shard_info.begin(),
2150 new_shard_info.end());
2151 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2152 unsigned n = sv.size();
2153 si_end = si_begin + new_shard_info.size();
2154 for (unsigned i = si_begin; i < si_end; ++i) {
2155 shards[i].shard_info = &sv[i];
2156 shards[i].loaded = true;
2157 shards[i].dirty = true;
2158 }
2159 for (unsigned i = si_end; i < n; ++i) {
2160 shards[i].shard_info = &sv[i];
2161 }
2162 }
2163 dout(20) << __func__ << " fin " << sv << dendl;
2164 inline_bl.clear();
2165
2166 if (sv.empty()) {
2167 // no more shards; unspan all previously spanning blobs
2168 auto p = spanning_blob_map.begin();
2169 while (p != spanning_blob_map.end()) {
2170 p->second->id = -1;
2171 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2172 p = spanning_blob_map.erase(p);
2173 }
2174 } else {
2175 // identify new spanning blobs
2176 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2177 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2178 if (spanning_scan_begin < needs_reshard_begin) {
2179 fault_range(db, spanning_scan_begin,
2180 needs_reshard_begin - spanning_scan_begin);
2181 }
2182 if (spanning_scan_end > needs_reshard_end) {
2183 fault_range(db, needs_reshard_end,
2184                  spanning_scan_end - needs_reshard_end);
2185 }
2186 auto sp = sv.begin() + si_begin;
2187 auto esp = sv.end();
2188 unsigned shard_start = sp->offset;
2189 unsigned shard_end;
2190 ++sp;
2191 if (sp == esp) {
2192 shard_end = OBJECT_MAX_SIZE;
2193 } else {
2194 shard_end = sp->offset;
2195 }
2196 int bid;
2197 if (spanning_blob_map.empty()) {
2198 bid = 0;
2199 } else {
2200 bid = spanning_blob_map.rbegin()->first + 1;
2201 }
2202 Extent dummy(needs_reshard_begin);
2203 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2204 if (e->logical_offset >= needs_reshard_end) {
2205 break;
2206 }
2207 dout(30) << " extent " << *e << dendl;
2208 while (e->logical_offset >= shard_end) {
2209 shard_start = shard_end;
2210 assert(sp != esp);
2211 ++sp;
2212 if (sp == esp) {
2213 shard_end = OBJECT_MAX_SIZE;
2214 } else {
2215 shard_end = sp->offset;
2216 }
2217 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2218 << " to 0x" << shard_end << std::dec << dendl;
2219 }
2220 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2221 if (!e->blob->is_spanning()) {
2222 // We have two options: (1) split the blob into pieces at the
2223 // shard boundaries (and adjust extents accordingly), or (2)
2224 // mark it spanning. We prefer to cut the blob if we can. Note that
2225 // we may have to split it multiple times--potentially at every
2226 // shard boundary.
2227 bool must_span = false;
2228 BlobRef b = e->blob;
2229 if (b->can_split()) {
2230 uint32_t bstart = e->blob_start();
2231 uint32_t bend = e->blob_end();
2232 for (const auto& sh : shards) {
2233 if (bstart < sh.shard_info->offset &&
2234 bend > sh.shard_info->offset) {
2235 uint32_t blob_offset = sh.shard_info->offset - bstart;
2236 if (b->can_split_at(blob_offset)) {
2237 dout(20) << __func__ << " splitting blob, bstart 0x"
2238 << std::hex << bstart << " blob_offset 0x"
2239 << blob_offset << std::dec << " " << *b << dendl;
2240 b = split_blob(b, blob_offset, sh.shard_info->offset);
2241 // switch b to the new right-hand side, in case it
2242 // *also* has to get split.
2243 bstart += blob_offset;
2244 onode->c->store->logger->inc(l_bluestore_blob_split);
2245 } else {
2246 must_span = true;
2247 break;
2248 }
2249 }
2250 }
2251 } else {
2252 must_span = true;
2253 }
2254 if (must_span) {
2255 b->id = bid++;
2256 spanning_blob_map[b->id] = b;
2257 dout(20) << __func__ << " adding spanning " << *b << dendl;
2258 }
2259 }
2260 } else {
2261 if (e->blob->is_spanning()) {
2262 spanning_blob_map.erase(e->blob->id);
2263 e->blob->id = -1;
2264 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2265 }
2266 }
2267 }
2268 }
2269
2270 clear_needs_reshard();
2271}
2272
2273bool BlueStore::ExtentMap::encode_some(
2274 uint32_t offset,
2275 uint32_t length,
2276 bufferlist& bl,
2277 unsigned *pn)
2278{
2279 auto cct = onode->c->store->cct; //used by dout
2280 Extent dummy(offset);
2281 auto start = extent_map.lower_bound(dummy);
2282 uint32_t end = offset + length;
2283
2284 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2285 // serialization only. Hence there is no specific
2286 // handling at ExtentMap level.
2287
2288 unsigned n = 0;
2289 size_t bound = 0;
2290 denc(struct_v, bound);
2291 denc_varint(0, bound);
2292 bool must_reshard = false;
2293 for (auto p = start;
2294 p != extent_map.end() && p->logical_offset < end;
2295 ++p, ++n) {
2296 assert(p->logical_offset >= offset);
2297 p->blob->last_encoded_id = -1;
2298 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2299 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2300 << std::dec << " hit new spanning blob " << *p << dendl;
2301 request_reshard(p->blob_start(), p->blob_end());
2302 must_reshard = true;
2303 }
2304 denc_varint(0, bound); // blobid
2305 denc_varint(0, bound); // logical_offset
2306 denc_varint(0, bound); // len
2307 denc_varint(0, bound); // blob_offset
2308
2309 p->blob->bound_encode(
2310 bound,
2311 struct_v,
2312 p->blob->shared_blob->get_sbid(),
2313 false);
2314 }
2315 if (must_reshard) {
2316 return true;
2317 }
2318
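  // Second pass: actually encode.  Each extent is introduced by a varint
  // 'blobid' whose low BLOBID_SHIFT_BITS carry the flags and whose high bits
  // identify the blob: a spanning blob is referenced by its stable id, a
  // locally encoded blob by the (1-based) position at which it was first
  // emitted, and 0 means the blob itself follows inline.  Illustrative
  // example: an extent reusing the blob first emitted at position 2, which is
  // contiguous with its predecessor, has blob_offset 0 and the same length,
  // encodes as the single varint
  // (2 << BLOBID_SHIFT_BITS) | CONTIGUOUS | ZEROOFFSET | SAMELENGTH (= 0x27)
  // with no further varints needed.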
2319 {
2320 auto app = bl.get_contiguous_appender(bound);
2321 denc(struct_v, app);
2322 denc_varint(n, app);
2323 if (pn) {
2324 *pn = n;
2325 }
2326
2327 n = 0;
2328 uint64_t pos = 0;
2329 uint64_t prev_len = 0;
2330 for (auto p = start;
2331 p != extent_map.end() && p->logical_offset < end;
2332 ++p, ++n) {
2333 unsigned blobid;
2334 bool include_blob = false;
2335 if (p->blob->is_spanning()) {
2336 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2337 blobid |= BLOBID_FLAG_SPANNING;
2338 } else if (p->blob->last_encoded_id < 0) {
2339 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2340 include_blob = true;
2341 blobid = 0; // the decoder will infer the id from n
2342 } else {
2343 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2344 }
2345 if (p->logical_offset == pos) {
2346 blobid |= BLOBID_FLAG_CONTIGUOUS;
2347 }
2348 if (p->blob_offset == 0) {
2349 blobid |= BLOBID_FLAG_ZEROOFFSET;
2350 }
2351 if (p->length == prev_len) {
2352 blobid |= BLOBID_FLAG_SAMELENGTH;
2353 } else {
2354 prev_len = p->length;
2355 }
2356 denc_varint(blobid, app);
2357 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2358 denc_varint_lowz(p->logical_offset - pos, app);
2359 }
2360 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2361 denc_varint_lowz(p->blob_offset, app);
2362 }
2363 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2364 denc_varint_lowz(p->length, app);
2365 }
2366 pos = p->logical_end();
2367 if (include_blob) {
2368 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2369 }
2370 }
2371 }
2372 /*derr << __func__ << bl << dendl;
2373 derr << __func__ << ":";
2374 bl.hexdump(*_dout);
2375 *_dout << dendl;
2376 */
2377 return false;
2378}
2379
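// Inverse of encode_some(): walk the encoded extents, reconstructing
// logical_offset/blob_offset/length from the flag bits, resolving blob
// references either through the spanning blob map or through the vector of
// blobs decoded so far, and rebuilding the per-blob ref_map on the fly for
// non-spanning blobs (spanning blobs carry their own persisted use tracker).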
2380unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2381{
2382 auto cct = onode->c->store->cct; //used by dout
2383 /*
2384 derr << __func__ << ":";
2385 bl.hexdump(*_dout);
2386 *_dout << dendl;
2387 */
2388
2389 assert(bl.get_num_buffers() <= 1);
2390 auto p = bl.front().begin_deep();
2391 __u8 struct_v;
2392 denc(struct_v, p);
2393 // Version 2 differs from v1 in blob's ref_map
2394 // serialization only. Hence there is no specific
2395 // handling at ExtentMap level below.
2396 assert(struct_v == 1 || struct_v == 2);
2397
2398 uint32_t num;
2399 denc_varint(num, p);
2400 vector<BlobRef> blobs(num);
2401 uint64_t pos = 0;
2402 uint64_t prev_len = 0;
2403 unsigned n = 0;
2404
2405 while (!p.end()) {
2406 Extent *le = new Extent();
2407 uint64_t blobid;
2408 denc_varint(blobid, p);
2409 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2410 uint64_t gap;
2411 denc_varint_lowz(gap, p);
2412 pos += gap;
2413 }
2414 le->logical_offset = pos;
2415 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2416 denc_varint_lowz(le->blob_offset, p);
2417 } else {
2418 le->blob_offset = 0;
2419 }
2420 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2421 denc_varint_lowz(prev_len, p);
2422 }
2423 le->length = prev_len;
2424
2425 if (blobid & BLOBID_FLAG_SPANNING) {
2426 dout(30) << __func__ << " getting spanning blob "
2427 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2428 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2429 } else {
2430 blobid >>= BLOBID_SHIFT_BITS;
2431 if (blobid) {
2432 le->assign_blob(blobs[blobid - 1]);
2433 assert(le->blob);
2434 } else {
2435 Blob *b = new Blob();
2436 uint64_t sbid = 0;
2437 b->decode(onode->c, p, struct_v, &sbid, false);
2438 blobs[n] = b;
2439 onode->c->open_shared_blob(sbid, b);
2440 le->assign_blob(b);
2441 }
2442 // we build ref_map dynamically for non-spanning blobs
2443 le->blob->get_ref(
2444 onode->c,
2445 le->blob_offset,
2446 le->length);
2447 }
2448 pos += prev_len;
2449 ++n;
2450 extent_map.insert(*le);
2451 }
2452
2453 assert(n == num);
2454 return num;
2455}
2456
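// Spanning blobs (blobs referenced from more than one shard) are not encoded
// inside any single shard; the three helpers below size, encode and decode
// them as a separate id -> blob table kept alongside the onode, including
// each blob's ref map (the 'true' passed as include_ref_map).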
2457void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2458{
2459 // Version 2 differs from v1 in blob's ref_map
2460 // serialization only. Hence there is no specific
2461 // handling at ExtentMap level.
2462 __u8 struct_v = 2;
2463
2464 denc(struct_v, p);
2465 denc_varint((uint32_t)0, p);
2466 size_t key_size = 0;
2467 denc_varint((uint32_t)0, key_size);
2468 p += spanning_blob_map.size() * key_size;
2469 for (const auto& i : spanning_blob_map) {
2470 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2471 }
2472}
2473
2474void BlueStore::ExtentMap::encode_spanning_blobs(
2475 bufferlist::contiguous_appender& p)
2476{
2477 // Version 2 differs from v1 in blob's ref_map
2478 // serialization only. Hence there is no specific
2479 // handling at ExtentMap level.
2480 __u8 struct_v = 2;
2481
2482 denc(struct_v, p);
2483 denc_varint(spanning_blob_map.size(), p);
2484 for (auto& i : spanning_blob_map) {
2485 denc_varint(i.second->id, p);
2486 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2487 }
2488}
2489
2490void BlueStore::ExtentMap::decode_spanning_blobs(
2491 bufferptr::iterator& p)
2492{
2493 __u8 struct_v;
2494 denc(struct_v, p);
2495 // Version 2 differs from v1 in blob's ref_map
2496 // serialization only. Hence there is no specific
2497 // handling at ExtentMap level.
2498 assert(struct_v == 1 || struct_v == 2);
2499
2500 unsigned n;
2501 denc_varint(n, p);
2502 while (n--) {
2503 BlobRef b(new Blob());
2504 denc_varint(b->id, p);
2505 spanning_blob_map[b->id] = b;
2506 uint64_t sbid = 0;
2507 b->decode(onode->c, p, struct_v, &sbid, true);
2508 onode->c->open_shared_blob(sbid, b);
2509 }
2510}
2511
2512void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2513{
2514 shards.resize(onode->onode.extent_map_shards.size());
2515 unsigned i = 0;
2516 for (auto &s : onode->onode.extent_map_shards) {
2517 shards[i].shard_info = &s;
2518 shards[i].loaded = loaded;
2519 shards[i].dirty = dirty;
2520 ++i;
2521 }
2522}
2523
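// Ensure every shard overlapping [offset, offset+length) is loaded, reading
// any missing shard from its extent-shard key under PREFIX_OBJ and decoding
// it into the in-memory extent map; hits and misses feed the
// onode_shard_hits/onode_shard_misses perf counters.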
2524void BlueStore::ExtentMap::fault_range(
2525 KeyValueDB *db,
2526 uint32_t offset,
2527 uint32_t length)
2528{
2529 auto cct = onode->c->store->cct; //used by dout
2530 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2531 << std::dec << dendl;
2532 auto start = seek_shard(offset);
2533 auto last = seek_shard(offset + length);
2534
2535 if (start < 0)
2536 return;
2537
2538 assert(last >= start);
2539 string key;
2540 while (start <= last) {
2541 assert((size_t)start < shards.size());
2542 auto p = &shards[start];
2543 if (!p->loaded) {
2544 dout(30) << __func__ << " opening shard 0x" << std::hex
2545 << p->shard_info->offset << std::dec << dendl;
2546 bufferlist v;
2547 generate_extent_shard_key_and_apply(
2548 onode->key, p->shard_info->offset, &key,
2549 [&](const string& final_key) {
2550 int r = db->get(PREFIX_OBJ, final_key, &v);
2551 if (r < 0) {
2552 derr << __func__ << " missing shard 0x" << std::hex
2553 << p->shard_info->offset << std::dec << " for " << onode->oid
2554 << dendl;
2555 assert(r >= 0);
2556 }
2557 }
2558 );
2559 p->extents = decode_some(v);
2560 p->loaded = true;
2561 dout(20) << __func__ << " open shard 0x" << std::hex
2562 << p->shard_info->offset << std::dec
2563 << " (" << v.length() << " bytes)" << dendl;
2564 assert(p->dirty == false);
2565 assert(v.length() == p->shard_info->bytes);
2566 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2567 } else {
2568 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2569 }
2570 ++start;
2571 }
2572}
2573
2574void BlueStore::ExtentMap::dirty_range(
2575 KeyValueDB::Transaction t,
2576 uint32_t offset,
2577 uint32_t length)
2578{
2579 auto cct = onode->c->store->cct; //used by dout
2580 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2581 << std::dec << dendl;
2582 if (shards.empty()) {
2583 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2584 inline_bl.clear();
2585 return;
2586 }
2587 auto start = seek_shard(offset);
2588 auto last = seek_shard(offset + length);
2589 if (start < 0)
2590 return;
2591
2592 assert(last >= start);
2593 while (start <= last) {
2594 assert((size_t)start < shards.size());
2595 auto p = &shards[start];
2596 if (!p->loaded) {
2597 dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
2598 << std::dec << " is not loaded, can't mark dirty" << dendl;
2599 assert(0 == "can't mark unloaded shard dirty");
2600 }
2601 if (!p->dirty) {
2602 dout(20) << __func__ << " mark shard 0x" << std::hex
2603 << p->shard_info->offset << std::dec << " dirty" << dendl;
2604 p->dirty = true;
2605 }
2606 ++start;
2607 }
2608}
2609
2610BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2611 uint64_t offset)
2612{
2613 Extent dummy(offset);
2614 return extent_map.find(dummy);
2615}
2616
2617BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find_lextent(
2618 uint64_t offset)
2619{
2620 auto fp = seek_lextent(offset);
2621 if (fp != extent_map.end() && fp->logical_offset > offset)
2622 return extent_map.end(); // extent is past offset
2623 return fp;
2624}
2625
2626BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2627 uint64_t offset)
2628{
2629 Extent dummy(offset);
2630 auto fp = extent_map.lower_bound(dummy);
2631 if (fp != extent_map.begin()) {
2632 --fp;
2633 if (fp->logical_end() <= offset) {
2634 ++fp;
2635 }
2636 }
2637 return fp;
2638}
2639
2640BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2641 uint64_t offset) const
2642{
2643 Extent dummy(offset);
2644 auto fp = extent_map.lower_bound(dummy);
2645 if (fp != extent_map.begin()) {
2646 --fp;
2647 if (fp->logical_end() <= offset) {
2648 ++fp;
2649 }
2650 }
2651 return fp;
2652}
2653
2654bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2655{
2656 auto fp = seek_lextent(offset);
2657 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2658 return false;
2659 }
2660 return true;
2661}
2662
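// Merge adjacent lextents in [offset, offset+length) that are logically
// contiguous, reference the same blob, and are contiguous within that blob
// as well.  Merging never crosses a shard boundary, so each shard's encoding
// stays self-contained.  Returns the number of extents removed.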
2663int BlueStore::ExtentMap::compress_extent_map(
2664 uint64_t offset,
2665 uint64_t length)
2666{
2667 auto cct = onode->c->store->cct; //used by dout
2668 if (extent_map.empty())
2669 return 0;
2670 int removed = 0;
2671 auto p = seek_lextent(offset);
2672 if (p != extent_map.begin()) {
2673 --p; // start to the left of offset
2674 }
2675 // the caller should have just written to this region
2676 assert(p != extent_map.end());
2677
2678 // identify the *next* shard
2679 auto pshard = shards.begin();
2680 while (pshard != shards.end() &&
2681 p->logical_offset >= pshard->shard_info->offset) {
2682 ++pshard;
2683 }
2684 uint64_t shard_end;
2685 if (pshard != shards.end()) {
2686 shard_end = pshard->shard_info->offset;
2687 } else {
2688 shard_end = OBJECT_MAX_SIZE;
2689 }
2690
2691 auto n = p;
2692 for (++n; n != extent_map.end(); p = n++) {
2693 if (n->logical_offset > offset + length) {
2694 break; // stop after end
2695 }
2696 while (n != extent_map.end() &&
2697 p->logical_end() == n->logical_offset &&
2698 p->blob == n->blob &&
2699 p->blob_offset + p->length == n->blob_offset &&
2700 n->logical_offset < shard_end) {
2701 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2702 << " next shard 0x" << shard_end << std::dec
2703 << " merging " << *p << " and " << *n << dendl;
2704 p->length += n->length;
2705 rm(n++);
2706 ++removed;
2707 }
2708 if (n == extent_map.end()) {
2709 break;
2710 }
2711 if (n->logical_offset >= shard_end) {
2712 assert(pshard != shards.end());
2713 ++pshard;
2714 if (pshard != shards.end()) {
2715 shard_end = pshard->shard_info->offset;
2716 } else {
2717 shard_end = OBJECT_MAX_SIZE;
2718 }
2719 }
2720 }
2721 if (removed && onode) {
2722 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2723 }
2724 return removed;
2725}
2726
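// Remove (deref) the logical range [offset, offset+length) from the map.
// Four cases per overlapping extent: an extent covering the whole hole is
// split (keep the front, re-add the tail); an extent overlapping only the
// start keeps its head; an extent fully inside the hole is removed; an
// extent overlapping only the end keeps its tail.  Every dereferenced piece
// is recorded in old_extents so the caller can release blob space later.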
2727void BlueStore::ExtentMap::punch_hole(
2728 CollectionRef &c,
2729 uint64_t offset,
2730 uint64_t length,
2731 old_extent_map_t *old_extents)
2732{
2733 auto p = seek_lextent(offset);
2734 uint64_t end = offset + length;
2735 while (p != extent_map.end()) {
2736 if (p->logical_offset >= end) {
2737 break;
2738 }
2739 if (p->logical_offset < offset) {
2740 if (p->logical_end() > end) {
2741 // split and deref middle
2742 uint64_t front = offset - p->logical_offset;
2743 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
2744 length, p->blob);
2745 old_extents->push_back(*oe);
2746 add(end,
2747 p->blob_offset + front + length,
2748 p->length - front - length,
2749 p->blob);
2750 p->length = front;
2751 break;
2752 } else {
2753 // deref tail
2754 assert(p->logical_end() > offset); // else seek_lextent bug
2755 uint64_t keep = offset - p->logical_offset;
2756 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
2757 p->length - keep, p->blob);
2758 old_extents->push_back(*oe);
2759 p->length = keep;
2760 ++p;
2761 continue;
2762 }
2763 }
2764 if (p->logical_offset + p->length <= end) {
2765 // deref whole lextent
2766 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2767 p->length, p->blob);
2768 old_extents->push_back(*oe);
2769 rm(p++);
2770 continue;
2771 }
2772 // deref head
2773 uint64_t keep = p->logical_end() - end;
2774 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2775 p->length - keep, p->blob);
2776 old_extents->push_back(*oe);
2777
2778 add(end, p->blob_offset + p->length - keep, keep, p->blob);
2779 rm(p);
2780 break;
2781 }
2782}
2783
2784BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
2785 CollectionRef &c,
2786 uint64_t logical_offset,
2787 uint64_t blob_offset, uint64_t length, BlobRef b,
2788 old_extent_map_t *old_extents)
2789{
2790  // We need a completely initialized Blob to increment its ref counters.
2791 assert(b->get_blob().get_logical_length() != 0);
2792
2793  // Do get_ref prior to punch_hole to prevent putting a reused blob into
2794  // the old_extents list if we overwrite the blob completely.
2795  // This might happen during WAL overwrite.
2796 b->get_ref(onode->c, blob_offset, length);
2797
2798 if (old_extents) {
2799 punch_hole(c, logical_offset, length, old_extents);
2800 }
2801
2802 Extent *le = new Extent(logical_offset, blob_offset, length, b);
2803 extent_map.insert(*le);
2804 if (spans_shard(logical_offset, length)) {
2805 request_reshard(logical_offset, logical_offset + length);
2806 }
2807 return le;
2808}
2809
2810BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
2811 BlobRef lb,
2812 uint32_t blob_offset,
2813 uint32_t pos)
2814{
2815 auto cct = onode->c->store->cct; //used by dout
2816
2817 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
2818 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
2819 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
2820 << dendl;
2821 BlobRef rb = onode->c->new_blob();
2822 lb->split(onode->c, blob_offset, rb.get());
2823
2824 for (auto ep = seek_lextent(pos);
2825 ep != extent_map.end() && ep->logical_offset < end_pos;
2826 ++ep) {
2827 if (ep->blob != lb) {
2828 continue;
2829 }
2830 if (ep->logical_offset < pos) {
2831 // split extent
2832 size_t left = pos - ep->logical_offset;
2833 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
2834 extent_map.insert(*ne);
2835 ep->length = left;
2836 dout(30) << __func__ << " split " << *ep << dendl;
2837 dout(30) << __func__ << " to " << *ne << dendl;
2838 } else {
2839 // switch blob
2840 assert(ep->blob_offset >= blob_offset);
2841
2842 ep->blob = rb;
2843 ep->blob_offset -= blob_offset;
2844 dout(30) << __func__ << " adjusted " << *ep << dendl;
2845 }
2846 }
2847 return rb;
2848}
2849
2850// Onode
2851
2852#undef dout_prefix
2853#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
2854
2855void BlueStore::Onode::flush()
2856{
2857 if (flushing_count.load()) {
2858 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
2859 std::unique_lock<std::mutex> l(flush_lock);
2860 while (flushing_count.load()) {
2861 flush_cond.wait(l);
2862 }
2863 }
2864 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
2865}
2866
2867// =======================================================
2868// WriteContext
2869
2870/// Checks for writes to the same pextent within a blob
2871bool BlueStore::WriteContext::has_conflict(
2872 BlobRef b,
2873 uint64_t loffs,
2874 uint64_t loffs_end,
2875 uint64_t min_alloc_size)
2876{
2877 assert((loffs % min_alloc_size) == 0);
2878 assert((loffs_end % min_alloc_size) == 0);
2879 for (auto w : writes) {
2880 if (b == w.b) {
2881 auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
2882 auto loffs2_end = ROUND_UP_TO( w.logical_offset + w.length0, min_alloc_size);
2883 if ((loffs <= loffs2 && loffs_end > loffs2) ||
2884 (loffs >= loffs2 && loffs < loffs2_end)) {
2885 return true;
2886 }
2887 }
2888 }
2889 return false;
2890}
2891
2892// =======================================================
2893
2894// DeferredBatch
2895#undef dout_prefix
2896#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
2897
2898void BlueStore::DeferredBatch::prepare_write(
2899 CephContext *cct,
2900 uint64_t seq, uint64_t offset, uint64_t length,
2901 bufferlist::const_iterator& blp)
2902{
2903 _discard(cct, offset, length);
2904 auto i = iomap.insert(make_pair(offset, deferred_io()));
2905 assert(i.second); // this should be a new insertion
2906 i.first->second.seq = seq;
2907 blp.copy(length, i.first->second.bl);
2908 dout(20) << __func__ << " seq " << seq
2909 << " 0x" << std::hex << offset << "~" << length
2910 << " crc " << i.first->second.bl.crc32c(-1)
2911 << std::dec << dendl;
2912 seq_bytes[seq] += length;
2913#ifdef DEBUG_DEFERRED
2914 _audit(cct);
2915#endif
2916}
2917
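// Drop any queued deferred I/O that overlaps [offset, offset+length).  The
// preceding entry (if it extends into the range) is trimmed back to its
// head, an entry extending past the range contributes a new tail entry at
// offset+length, and fully covered entries are erased; seq_bytes is kept in
// sync so _audit() can cross-check the per-sequence byte counts.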
2918void BlueStore::DeferredBatch::_discard(
2919 CephContext *cct, uint64_t offset, uint64_t length)
2920{
2921 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2922 << std::dec << dendl;
2923 auto p = iomap.lower_bound(offset);
2924 if (p != iomap.begin()) {
2925 --p;
2926 auto end = p->first + p->second.bl.length();
2927 if (end > offset) {
2928 bufferlist head;
2929 head.substr_of(p->second.bl, 0, offset - p->first);
2930 dout(20) << __func__ << " keep head " << p->second.seq
2931 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
2932 << " -> 0x" << head.length() << std::dec << dendl;
2933 auto i = seq_bytes.find(p->second.seq);
2934 if (end > offset + length) {
2935 bufferlist tail;
2936 tail.substr_of(p->second.bl, offset + length - p->first,
2937 end - (offset + length));
2938 dout(20) << __func__ << " keep tail " << p->second.seq
2939 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
2940 << " -> 0x" << tail.length() << std::dec << dendl;
2941 auto &n = iomap[offset + length];
2942 n.bl.swap(tail);
2943 n.seq = p->second.seq;
2944 i->second -= length;
2945 } else {
2946 i->second -= end - offset;
2947 }
2948 p->second.bl.swap(head);
2949 }
2950 ++p;
2951 }
2952 while (p != iomap.end()) {
2953 if (p->first >= offset + length) {
2954 break;
2955 }
2956 auto i = seq_bytes.find(p->second.seq);
2957 auto end = p->first + p->second.bl.length();
2958 if (end > offset + length) {
2959 unsigned drop_front = offset + length - p->first;
2960 unsigned keep_tail = end - (offset + length);
2961 dout(20) << __func__ << " truncate front " << p->second.seq
2962 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
2963 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
2964 << " to 0x" << (offset + length) << "~" << keep_tail
2965 << std::dec << dendl;
2966 auto &s = iomap[offset + length];
2967 s.seq = p->second.seq;
2968 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
2969 i->second -= drop_front;
2970 } else {
2971 dout(20) << __func__ << " drop " << p->second.seq
2972 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
2973 << std::dec << dendl;
2974 i->second -= p->second.bl.length();
2975 }
2976 p = iomap.erase(p);
2977 }
2978}
2979
2980void BlueStore::DeferredBatch::_audit(CephContext *cct)
2981{
2982 map<uint64_t,int> sb;
2983 for (auto p : seq_bytes) {
2984 sb[p.first] = 0; // make sure we have the same set of keys
2985 }
2986 uint64_t pos = 0;
2987 for (auto& p : iomap) {
2988 assert(p.first >= pos);
2989 sb[p.second.seq] += p.second.bl.length();
2990 pos = p.first + p.second.bl.length();
2991 }
2992 assert(sb == seq_bytes);
2993}
2994
2995
2996// Collection
2997
2998#undef dout_prefix
2999#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3000
3001BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
3002 : store(ns),
3003 cache(c),
3004 cid(cid),
3005 lock("BlueStore::Collection::lock", true, false),
3006 exists(true),
3007 onode_map(c)
3008{
3009}
3010
3011void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3012{
3013 assert(!b->shared_blob);
3014 const bluestore_blob_t& blob = b->get_blob();
3015 if (!blob.is_shared()) {
3016 b->shared_blob = new SharedBlob(this);
3017 return;
3018 }
3019
3020 b->shared_blob = shared_blob_set.lookup(sbid);
3021 if (b->shared_blob) {
3022 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3023 << std::dec << " had " << *b->shared_blob << dendl;
3024 } else {
3025 b->shared_blob = new SharedBlob(sbid, this);
3026 shared_blob_set.add(this, b->shared_blob.get());
3027 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3028 << std::dec << " opened " << *b->shared_blob
3029 << dendl;
3030 }
3031}
3032
3033void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3034{
3035 if (!sb->is_loaded()) {
3036
3037 bufferlist v;
3038 string key;
3039 auto sbid = sb->get_sbid();
3040 get_shared_blob_key(sbid, &key);
3041 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3042 if (r < 0) {
3043 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3044 << std::dec << " not found at key "
3045 << pretty_binary_string(key) << dendl;
3046 assert(0 == "uh oh, missing shared_blob");
3047 }
3048
3049 sb->loaded = true;
3050 sb->persistent = new bluestore_shared_blob_t(sbid);
3051 bufferlist::iterator p = v.begin();
3052 ::decode(*(sb->persistent), p);
3053 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3054 << std::dec << " loaded shared_blob " << *sb << dendl;
3055 }
3056}
3057
3058void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3059{
3060 assert(!b->shared_blob->is_loaded());
3061
3062 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
3063 bluestore_blob_t& blob = b->dirty_blob();
3064
3065 // update blob
3066 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
3067 blob.clear_flag(bluestore_blob_t::FLAG_MUTABLE);
3068
3069 // update shared blob
3070 b->shared_blob->loaded = true;
3071 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3072 shared_blob_set.add(this, b->shared_blob.get());
3073 for (auto p : blob.get_extents()) {
3074 if (p.is_valid()) {
3075 b->shared_blob->get_ref(
3076 p.offset,
3077 p.length);
3078 }
3079 }
3080 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3081}
3082
3083BlueStore::OnodeRef BlueStore::Collection::get_onode(
3084 const ghobject_t& oid,
3085 bool create)
3086{
3087 assert(create ? lock.is_wlocked() : lock.is_locked());
3088
3089 spg_t pgid;
3090 if (cid.is_pg(&pgid)) {
3091 if (!oid.match(cnode.bits, pgid.ps())) {
3092 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3093 << pgid << " bits " << cnode.bits << dendl;
3094 ceph_abort();
3095 }
3096 }
3097
3098 OnodeRef o = onode_map.lookup(oid);
3099 if (o)
3100 return o;
3101
3102 mempool::bluestore_meta_other::string key;
3103 get_object_key(store->cct, oid, &key);
3104
3105 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3106 << pretty_binary_string(key) << dendl;
3107
3108 bufferlist v;
3109 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3110 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3111 Onode *on;
3112 if (v.length() == 0) {
3113 assert(r == -ENOENT);
3114 if (!store->cct->_conf->bluestore_debug_misc &&
3115 !create)
3116 return OnodeRef();
3117
3118 // new object, new onode
3119 on = new Onode(this, oid, key);
3120 } else {
3121 // loaded
3122 assert(r >= 0);
3123 on = new Onode(this, oid, key);
3124 on->exists = true;
3125 bufferptr::iterator p = v.front().begin();
3126 on->onode.decode(p);
3127
3128 // initialize extent_map
3129 on->extent_map.decode_spanning_blobs(p);
3130 if (on->onode.extent_map_shards.empty()) {
3131 denc(on->extent_map.inline_bl, p);
3132 on->extent_map.decode_some(on->extent_map.inline_bl);
3133 } else {
3134 on->extent_map.init_shards(false, false);
3135 }
3136 }
3137 o.reset(on);
3138 return onode_map.add(oid, o);
3139}
3140
3141void BlueStore::Collection::split_cache(
3142 Collection *dest)
3143{
3144 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3145
3146 // lock (one or both) cache shards
3147 std::lock(cache->lock, dest->cache->lock);
3148 std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
3149 std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);
3150
3151 int destbits = dest->cnode.bits;
3152 spg_t destpg;
3153 bool is_pg = dest->cid.is_pg(&destpg);
3154 assert(is_pg);
3155
3156 auto p = onode_map.onode_map.begin();
3157 while (p != onode_map.onode_map.end()) {
3158 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3159 // onode does not belong to this child
3160 ++p;
3161 } else {
3162 OnodeRef o = p->second;
3163 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3164 << dendl;
3165
3166 cache->_rm_onode(p->second);
3167 p = onode_map.onode_map.erase(p);
3168
3169 o->c = dest;
3170 dest->cache->_add_onode(o, 1);
3171 dest->onode_map.onode_map[o->oid] = o;
3172 dest->onode_map.cache = dest->cache;
3173
3174 // move over shared blobs and buffers. cover shared blobs from
3175 // both extent map and spanning blob map (the full extent map
3176 // may not be faulted in)
3177 vector<SharedBlob*> sbvec;
3178 for (auto& e : o->extent_map.extent_map) {
3179 sbvec.push_back(e.blob->shared_blob.get());
3180 }
3181 for (auto& b : o->extent_map.spanning_blob_map) {
3182 sbvec.push_back(b.second->shared_blob.get());
3183 }
3184 for (auto sb : sbvec) {
3185 if (sb->coll == dest) {
3186 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3187 << dendl;
3188 continue;
3189 }
3190 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
3191 sb->coll = dest;
3192 if (dest->cache != cache) {
3193 if (sb->get_sbid()) {
3194 ldout(store->cct, 20) << __func__ << " moving registration " << *sb << dendl;
3195 shared_blob_set.remove(sb);
3196 dest->shared_blob_set.add(dest, sb);
3197 }
3198 for (auto& i : sb->bc.buffer_map) {
3199 if (!i.second->is_writing()) {
3200 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3201 << dendl;
3202 dest->cache->_move_buffer(cache, i.second.get());
3203 }
3204 }
3205 }
3206 }
3207
3208
3209 }
3210 }
3211}
3212
3213void BlueStore::Collection::trim_cache()
3214{
3215 // see if mempool stats have updated
3216 uint64_t total_bytes;
3217 uint64_t total_onodes;
3218 size_t seq;
3219 store->get_mempool_stats(&seq, &total_bytes, &total_onodes);
3220 if (seq == cache->last_trim_seq) {
3221 ldout(store->cct, 30) << __func__ << " no new mempool stats; nothing to do"
3222 << dendl;
3223 return;
3224 }
3225 cache->last_trim_seq = seq;
3226
3227 // trim
3228 if (total_onodes < 2) {
3229 total_onodes = 2;
3230 }
3231 float bytes_per_onode = (float)total_bytes / (float)total_onodes;
3232 size_t num_shards = store->cache_shards.size();
3233 uint64_t shard_target = store->cct->_conf->bluestore_cache_size / num_shards;
3234 ldout(store->cct, 30) << __func__
3235 << " total meta bytes " << total_bytes
3236 << ", total onodes " << total_onodes
3237 << ", bytes_per_onode " << bytes_per_onode
3238 << dendl;
3239 cache->trim(shard_target, store->cct->_conf->bluestore_cache_meta_ratio,
3240 bytes_per_onode);
3241
3242 store->_update_cache_logger();
3243}
3244
3245// =======================================================
3246
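// Periodically snapshot the bluestore mempool allocation totals (bytes and
// onode count) and bump mempool_seq; Collection::trim_cache() uses the
// changing sequence number to decide when a new trim pass is worthwhile and
// derives bytes_per_onode from these totals.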
3247void *BlueStore::MempoolThread::entry()
3248{
3249 Mutex::Locker l(lock);
3250 while (!stop) {
3251 store->mempool_bytes = mempool::bluestore_meta_other::allocated_bytes() +
3252 mempool::bluestore_meta_onode::allocated_bytes();
3253 store->mempool_onodes = mempool::bluestore_meta_onode::allocated_items();
3254 ++store->mempool_seq;
3255 utime_t wait;
3256 wait += store->cct->_conf->bluestore_cache_trim_interval;
3257 cond.WaitInterval(lock, wait);
3258 }
3259 stop = false;
3260 return NULL;
3261}
3262
3263// =======================================================
3264
3265#undef dout_prefix
3266#define dout_prefix *_dout << "bluestore(" << path << ") "
3267
3268
3269static void aio_cb(void *priv, void *priv2)
3270{
3271 BlueStore *store = static_cast<BlueStore*>(priv);
3272 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3273 c->aio_finish(store);
3274}
3275
3276BlueStore::BlueStore(CephContext *cct, const string& path)
3277 : ObjectStore(cct, path),
3278 throttle_bytes(cct, "bluestore_throttle_bytes",
3279 cct->_conf->bluestore_throttle_bytes),
3280 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3281 cct->_conf->bluestore_throttle_bytes +
3282 cct->_conf->bluestore_throttle_deferred_bytes),
3283 kv_sync_thread(this),
3284 mempool_thread(this)
3285{
3286 _init_logger();
3287 cct->_conf->add_observer(this);
3288 set_cache_shards(1);
3289
3290 if (cct->_conf->bluestore_shard_finishers) {
3291 m_finisher_num = cct->_conf->osd_op_num_shards;
3292 }
3293
3294 for (int i = 0; i < m_finisher_num; ++i) {
3295 ostringstream oss;
3296 oss << "finisher-" << i;
3297 Finisher *f = new Finisher(cct, oss.str(), "finisher");
3298 finishers.push_back(f);
3299 }
3300}
3301
3302BlueStore::BlueStore(CephContext *cct,
3303 const string& path,
3304 uint64_t _min_alloc_size)
3305 : ObjectStore(cct, path),
3306 throttle_bytes(cct, "bluestore_throttle_bytes",
3307 cct->_conf->bluestore_throttle_bytes),
3308 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3309 cct->_conf->bluestore_throttle_bytes +
3310 cct->_conf->bluestore_throttle_deferred_bytes),
3311 kv_sync_thread(this),
3312 min_alloc_size(_min_alloc_size),
3313 min_alloc_size_order(ctz(_min_alloc_size)),
3314 mempool_thread(this)
3315{
3316 _init_logger();
3317 cct->_conf->add_observer(this);
3318 set_cache_shards(1);
3319
3320 if (cct->_conf->bluestore_shard_finishers) {
3321 m_finisher_num = cct->_conf->osd_op_num_shards;
3322 }
3323
3324 for (int i = 0; i < m_finisher_num; ++i) {
3325 ostringstream oss;
3326 oss << "finisher-" << i;
3327 Finisher *f = new Finisher(cct, oss.str(), "finisher");
3328 finishers.push_back(f);
3329 }
3330}
3331
3332BlueStore::~BlueStore()
3333{
3334 for (auto f : finishers) {
3335 delete f;
3336 }
3337 finishers.clear();
3338
3339 cct->_conf->remove_observer(this);
3340 _shutdown_logger();
3341 assert(!mounted);
3342 assert(db == NULL);
3343 assert(bluefs == NULL);
3344 assert(fsid_fd < 0);
3345 assert(path_fd < 0);
3346 for (auto i : cache_shards) {
3347 delete i;
3348 }
3349 cache_shards.clear();
3350}
3351
3352const char **BlueStore::get_tracked_conf_keys() const
3353{
3354 static const char* KEYS[] = {
3355 "bluestore_csum_type",
3356 "bluestore_compression_mode",
3357 "bluestore_compression_algorithm",
3358 "bluestore_compression_min_blob_size",
3359 "bluestore_compression_min_blob_size_ssd",
3360 "bluestore_compression_min_blob_size_hdd",
3361 "bluestore_compression_max_blob_size",
3362 "bluestore_compression_max_blob_size_ssd",
3363 "bluestore_compression_max_blob_size_hdd",
3364 "bluestore_max_alloc_size",
3365 "bluestore_prefer_deferred_size",
3366    "bluestore_deferred_batch_ops",
3367    "bluestore_deferred_batch_ops_hdd",
3368    "bluestore_deferred_batch_ops_ssd",
3369 "bluestore_throttle_bytes",
3370 "bluestore_throttle_deferred_bytes",
3371 "bluestore_throttle_cost_per_io_hdd",
3372 "bluestore_throttle_cost_per_io_ssd",
3373 "bluestore_throttle_cost_per_io",
3374 "bluestore_max_blob_size",
3375 "bluestore_max_blob_size_ssd",
3376 "bluestore_max_blob_size_hdd",
3377 NULL
3378 };
3379 return KEYS;
3380}
3381
3382void BlueStore::handle_conf_change(const struct md_config_t *conf,
3383 const std::set<std::string> &changed)
3384{
3385 if (changed.count("bluestore_csum_type")) {
3386 _set_csum();
3387 }
3388 if (changed.count("bluestore_compression_mode") ||
3389 changed.count("bluestore_compression_algorithm") ||
3390 changed.count("bluestore_compression_min_blob_size") ||
3391 changed.count("bluestore_compression_max_blob_size")) {
3392 if (bdev) {
3393 _set_compression();
3394 }
3395 }
3396 if (changed.count("bluestore_max_blob_size") ||
3397 changed.count("bluestore_max_blob_size_ssd") ||
3398 changed.count("bluestore_max_blob_size_hdd")) {
3399 if (bdev) {
3400 // only after startup
3401 _set_blob_size();
3402 }
3403 }
3404 if (changed.count("bluestore_prefer_deferred_size") ||
3405 changed.count("bluestore_max_alloc_size") ||
3406 changed.count("bluestore_deferred_batch_ops") ||
3407 changed.count("bluestore_deferred_batch_ops_hdd") ||
3408 changed.count("bluestore_deferred_batch_ops_ssd")) {
3409 if (bdev) {
3410 // only after startup
3411 _set_alloc_sizes();
3412 }
3413 }
3414 if (changed.count("bluestore_throttle_cost_per_io") ||
3415 changed.count("bluestore_throttle_cost_per_io_hdd") ||
3416 changed.count("bluestore_throttle_cost_per_io_ssd")) {
3417 if (bdev) {
3418 _set_throttle_params();
3419 }
3420 }
3421 if (changed.count("bluestore_throttle_bytes")) {
3422 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
3423 throttle_deferred_bytes.reset_max(
3424 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3425 }
3426 if (changed.count("bluestore_throttle_deferred_bytes")) {
3427 throttle_deferred_bytes.reset_max(
3428 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3429 }
3430}
3431
3432void BlueStore::_set_compression()
3433{
3434  if (cct->_conf->bluestore_compression_min_blob_size) {
3435    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
3436 } else {
3437 assert(bdev);
3438 if (bdev->is_rotational()) {
3439 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
3440 } else {
3441 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
3442 }
3443 }
3444
3445 if (cct->_conf->bluestore_compression_max_blob_size) {
3446 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
3447 } else {
3448 assert(bdev);
3449 if (bdev->is_rotational()) {
3450 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
3451 } else {
3452 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
3453 }
3454 }
3455
3456 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
3457 if (m) {
3458 comp_mode = *m;
3459 } else {
3460 derr << __func__ << " unrecognized value '"
3461 << cct->_conf->bluestore_compression_mode
3462 << "' for bluestore_compression_mode, reverting to 'none'"
3463 << dendl;
3464 comp_mode = Compressor::COMP_NONE;
3465 }
3466
3467 compressor = nullptr;
3468
3469 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
3470 if (!alg_name.empty()) {
3471 compressor = Compressor::create(cct, alg_name);
3472 if (!compressor) {
3473 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
3474 << dendl;
3475 }
3476 }
3477
3478 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
3479 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
3480 << dendl;
3481}
3482
3483void BlueStore::_set_csum()
3484{
3485 csum_type = Checksummer::CSUM_NONE;
3486 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
3487 if (t > Checksummer::CSUM_NONE)
3488 csum_type = t;
3489
3490 dout(10) << __func__ << " csum_type "
3491 << Checksummer::get_csum_type_string(csum_type)
3492 << dendl;
3493}
3494
3495void BlueStore::_set_throttle_params()
3496{
3497 if (cct->_conf->bluestore_throttle_cost_per_io) {
3498 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
3499 } else {
3500 assert(bdev);
3501 if (bdev->is_rotational()) {
3502 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
3503 } else {
3504 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
3505 }
3506 }
3507
3508 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
3509 << dendl;
3510}

3511void BlueStore::_set_blob_size()
3512{
3513 if (cct->_conf->bluestore_max_blob_size) {
3514 max_blob_size = cct->_conf->bluestore_max_blob_size;
3515 } else {
3516 assert(bdev);
3517 if (bdev->is_rotational()) {
3518 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
3519 } else {
3520 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
3521 }
3522 }
3523 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
3524 << std::dec << dendl;
3525}
3526
3527void BlueStore::_init_logger()
3528{
3529 PerfCountersBuilder b(cct, "bluestore",
3530 l_bluestore_first, l_bluestore_last);
3531 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
3532 "Average kv_thread flush latency",
3533 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
3534 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
3535 "Average kv_thread commit latency");
3536 b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
3537 "Average kv_thread sync latency",
3538 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
3539 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
3540 "Average prepare state latency");
3541 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
3542 "Average aio_wait state latency",
3543 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
3544 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
3545 "Average io_done state latency");
3546 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
3547 "Average kv_queued state latency");
3548 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
3549                 "Average kv_committing state latency");
3550 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
3551 "Average kv_done state latency");
3552 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
3553 "Average deferred_queued state latency");
3554 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
3555                 "Average deferred_aio_wait state latency");
3556 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
3557 "Average cleanup state latency");
3558 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
3559 "Average finishing state latency");
3560 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
3561 "Average done state latency");
3562 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
3563 "Average submit throttle latency",
3564 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
3565 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
3566 "Average submit latency",
3567 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
3568 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
3569 "Average commit latency",
3570 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
3571 b.add_time_avg(l_bluestore_read_lat, "read_lat",
3572 "Average read latency",
3573 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
3574 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
3575 "Average read onode metadata latency");
3576 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
3577                 "Average read wait-for-aio latency");
3578 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
3579 "Average compress latency");
3580 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
3581 "Average decompress latency");
3582 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
3583 "Average checksum latency");
3584 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
3585 "Sum for beneficial compress ops");
3586 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
3587 "Sum for compress ops rejected due to low net gain of space");
3588 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
3589 "Sum for write-op padded bytes");
3590 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
3591 "Sum for deferred write op");
3592 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
3593 "Sum for deferred write bytes", "def");
3594 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
3595 "Sum for write penalty read ops");
3596 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
3597 "Sum for allocated bytes");
3598 b.add_u64(l_bluestore_stored, "bluestore_stored",
3599 "Sum for stored bytes");
3600 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
3601 "Sum for stored compressed bytes");
3602 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
3603 "Sum for bytes allocated for compressed data");
3604 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
3605 "Sum for original bytes that were compressed");
3606
3607 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
3608 "Number of onodes in cache");
3609 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
3610 "Sum for onode-lookups hit in the cache");
3611 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
3612 "Sum for onode-lookups missed in the cache");
3613 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
3614 "Sum for onode-shard lookups hit in the cache");
3615 b.add_u64_counter(l_bluestore_onode_shard_misses,
3616 "bluestore_onode_shard_misses",
3617 "Sum for onode-shard lookups missed in the cache");
3618 b.add_u64(l_bluestore_extents, "bluestore_extents",
3619 "Number of extents in cache");
3620 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
3621 "Number of blobs in cache");
3622 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
3623 "Number of buffers in cache");
3624 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
3625 "Number of buffer bytes in cache");
3626 b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
3627 "Sum for bytes of read hit in the cache");
3628 b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
3629 "Sum for bytes of read missed in the cache");
3630
3631 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
3632 "Large aligned writes into fresh blobs");
3633 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
3634 "Large aligned writes into fresh blobs (bytes)");
3635 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
3636 "Large aligned writes into fresh blobs (blobs)");
3637 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
3638 "Small writes into existing or sparse small blobs");
3639 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
3640 "Small writes into existing or sparse small blobs (bytes)");
3641 b.add_u64_counter(l_bluestore_write_small_unused,
3642 "bluestore_write_small_unused",
3643 "Small writes into unused portion of existing blob");
3644 b.add_u64_counter(l_bluestore_write_small_deferred,
3645 "bluestore_write_small_deferred",
3646 "Small overwrites using deferred");
3647 b.add_u64_counter(l_bluestore_write_small_pre_read,
3648 "bluestore_write_small_pre_read",
3649 "Small writes that required we read some data (possibly "
3650 "cached) to fill out the block");
3651 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
3652 "Small write into new (sparse) blob");
3653
3654 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
3655 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
3656 "Onode extent map reshard events");
3657 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
3658 "Sum for blob splitting due to resharding");
3659 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
3660 "Sum for extents that have been removed due to compression");
3661 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
3662 "Sum for extents that have been merged due to garbage "
3663 "collection");
3664 logger = b.create_perf_counters();
3665 cct->get_perfcounters_collection()->add(logger);
3666}
3667
3668int BlueStore::_reload_logger()
3669{
3670 struct store_statfs_t store_statfs;
3671
3672 int r = statfs(&store_statfs);
3673 if(r >= 0) {
3674 logger->set(l_bluestore_allocated, store_statfs.allocated);
3675 logger->set(l_bluestore_stored, store_statfs.stored);
3676 logger->set(l_bluestore_compressed, store_statfs.compressed);
3677 logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
3678 logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
3679 }
3680 return r;
3681}
3682
3683void BlueStore::_shutdown_logger()
3684{
3685 cct->get_perfcounters_collection()->remove(logger);
3686 delete logger;
3687}
3688
3689int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
3690 uuid_d *fsid)
3691{
3692 bluestore_bdev_label_t label;
3693 int r = _read_bdev_label(cct, path, &label);
3694 if (r < 0)
3695 return r;
3696 *fsid = label.osd_uuid;
3697 return 0;
3698}
3699
3700int BlueStore::_open_path()
3701{
3702 assert(path_fd < 0);
3703 path_fd = ::open(path.c_str(), O_DIRECTORY);
3704 if (path_fd < 0) {
3705 int r = -errno;
3706 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
3707 << dendl;
3708 return r;
3709 }
3710 return 0;
3711}
3712
3713void BlueStore::_close_path()
3714{
3715 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
3716 path_fd = -1;
3717}
3718
3719int BlueStore::_write_bdev_label(string path, bluestore_bdev_label_t label)
3720{
3721 dout(10) << __func__ << " path " << path << " label " << label << dendl;
3722 bufferlist bl;
3723 ::encode(label, bl);
3724 uint32_t crc = bl.crc32c(-1);
3725 ::encode(crc, bl);
3726 assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
3727 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
3728 z.zero();
3729 bl.append(std::move(z));
3730
3731 int fd = ::open(path.c_str(), O_WRONLY);
3732 if (fd < 0) {
3733 fd = -errno;
3734 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
3735 << dendl;
3736 return fd;
3737 }
3738 int r = bl.write_fd(fd);
3739 if (r < 0) {
3740 derr << __func__ << " failed to write to " << path
3741 << ": " << cpp_strerror(r) << dendl;
3742 }
3743 VOID_TEMP_FAILURE_RETRY(::close(fd));
3744 return r;
3745}
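// A rough sketch of the label block written above (derived from the encode
// calls; widths are illustrative, not to scale):
//
//   [ encoded bluestore_bdev_label_t | u32 crc32c of the encoded bytes | zero pad ]
//   |<-------------------- BDEV_LABEL_BLOCK_SIZE (4096 bytes) ------------------->|
//
// _read_bdev_label() below reverses this: it decodes the label, recomputes the
// crc over exactly the bytes it consumed, and compares it with the stored value.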
3746
3747int BlueStore::_read_bdev_label(CephContext* cct, string path,
3748 bluestore_bdev_label_t *label)
3749{
3750 dout(10) << __func__ << dendl;
3751 int fd = ::open(path.c_str(), O_RDONLY);
3752 if (fd < 0) {
3753 fd = -errno;
3754 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
3755 << dendl;
3756 return fd;
3757 }
3758 bufferlist bl;
3759 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
3760 VOID_TEMP_FAILURE_RETRY(::close(fd));
3761 if (r < 0) {
3762 derr << __func__ << " failed to read from " << path
3763 << ": " << cpp_strerror(r) << dendl;
3764 return r;
3765 }
3766
3767 uint32_t crc, expected_crc;
3768 bufferlist::iterator p = bl.begin();
3769 try {
3770 ::decode(*label, p);
3771 bufferlist t;
3772 t.substr_of(bl, 0, p.get_off());
3773 crc = t.crc32c(-1);
3774 ::decode(expected_crc, p);
3775 }
3776 catch (buffer::error& e) {
3777 derr << __func__ << " unable to decode label at offset " << p.get_off()
3778 << ": " << e.what()
3779 << dendl;
3780 return -EINVAL;
3781 }
3782 if (crc != expected_crc) {
3783 derr << __func__ << " bad crc on label, expected " << expected_crc
3784 << " != actual " << crc << dendl;
3785 return -EIO;
3786 }
3787 dout(10) << __func__ << " got " << *label << dendl;
3788 return 0;
3789}
3790
3791int BlueStore::_check_or_set_bdev_label(
3792 string path, uint64_t size, string desc, bool create)
3793{
3794 bluestore_bdev_label_t label;
3795 if (create) {
3796 label.osd_uuid = fsid;
3797 label.size = size;
3798 label.btime = ceph_clock_now();
3799 label.description = desc;
3800 int r = _write_bdev_label(path, label);
3801 if (r < 0)
3802 return r;
3803 } else {
3804 int r = _read_bdev_label(cct, path, &label);
3805 if (r < 0)
3806 return r;
3807 if (label.osd_uuid != fsid) {
3808 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
3809 << " does not match our fsid " << fsid << dendl;
3810 return -EIO;
3811 }
3812 }
3813 return 0;
3814}
3815
3816void BlueStore::_set_alloc_sizes(void)
3817{
3818 min_alloc_size_order = ctz(min_alloc_size);
3819 assert(min_alloc_size == 1u << min_alloc_size_order);
3820
3821 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
3822
3823 if (cct->_conf->bluestore_prefer_deferred_size) {
3824 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
3825 } else {
3826 assert(bdev);
3827 if (bdev->is_rotational()) {
3828 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
3829 } else {
3830 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
3831 }
3832 }
3833
3834 if (cct->_conf->bluestore_deferred_batch_ops) {
3835 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
3836 } else {
3837 assert(bdev);
3838 if (bdev->is_rotational()) {
3839 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
3840 } else {
3841 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
3842 }
3843 }
3844
3845 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
3846 << std::dec << " order " << min_alloc_size_order
3847 << " max_alloc_size 0x" << std::hex << max_alloc_size
3848 << " prefer_deferred_size 0x" << prefer_deferred_size
3849 << std::dec
3850 << " deferred_batch_ops " << deferred_batch_ops
3851 << dendl;
3852}
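// Worked example for the function above (hypothetical value): with
// min_alloc_size = 0x10000 (64 KiB), ctz() gives min_alloc_size_order = 16 and
// the assert checks 1u << 16 == 0x10000; a non-power-of-two min_alloc_size
// would trip it. prefer_deferred_size and deferred_batch_ops fall back to
// their _hdd/_ssd variants (chosen via bdev->is_rotational()) only when the
// generic option is left at 0.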
3853
3854int BlueStore::_open_bdev(bool create)
3855{
3856 assert(bdev == NULL);
3857 string p = path + "/block";
3858 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
3859 int r = bdev->open(p);
3860 if (r < 0)
3861 goto fail;
3862
3863 if (bdev->supported_bdev_label()) {
3864 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
3865 if (r < 0)
3866 goto fail_close;
3867 }
3868
3869 // initialize global block parameters
3870 block_size = bdev->get_block_size();
3871 block_mask = ~(block_size - 1);
3872 block_size_order = ctz(block_size);
3873 assert(block_size == 1u << block_size_order);
3874 return 0;
3875
3876 fail_close:
3877 bdev->close();
3878 fail:
3879 delete bdev;
3880 bdev = NULL;
3881 return r;
3882}
3883
3884void BlueStore::_close_bdev()
3885{
3886 assert(bdev);
3887 bdev->close();
3888 delete bdev;
3889 bdev = NULL;
3890}
3891
3892int BlueStore::_open_fm(bool create)
3893{
3894 assert(fm == NULL);
3895 fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
3896
3897 if (create) {
3898 // initialize freespace
3899 dout(20) << __func__ << " initializing freespace" << dendl;
3900 KeyValueDB::Transaction t = db->get_transaction();
3901 {
3902 bufferlist bl;
3903 bl.append(freelist_type);
3904 t->set(PREFIX_SUPER, "freelist_type", bl);
3905 }
3906 fm->create(bdev->get_size(), t);
3907
3908 // allocate superblock reserved space. note that we do not mark
3909 // bluefs space as allocated in the freelist; we instead rely on
3910 // bluefs_extents.
3911 fm->allocate(0, SUPER_RESERVED, t);
3912
3913 uint64_t reserved = 0;
3914 if (cct->_conf->bluestore_bluefs) {
3915 assert(bluefs_extents.num_intervals() == 1);
3916 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
3917 reserved = p.get_start() + p.get_len();
3918 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
3919 << " for bluefs" << dendl;
3920 bufferlist bl;
3921 ::encode(bluefs_extents, bl);
3922 t->set(PREFIX_SUPER, "bluefs_extents", bl);
3923 dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
3924 << std::dec << dendl;
3925 } else {
3926 reserved = SUPER_RESERVED;
3927 }
3928
3929 if (cct->_conf->bluestore_debug_prefill > 0) {
3930 uint64_t end = bdev->get_size() - reserved;
3931 dout(1) << __func__ << " pre-fragmenting freespace, using "
3932 << cct->_conf->bluestore_debug_prefill << " with max free extent "
3933 << cct->_conf->bluestore_debug_prefragment_max << dendl;
3934 uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
3935 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
3936 float r = cct->_conf->bluestore_debug_prefill;
3937 r /= 1.0 - r;
3938 bool stop = false;
3939
3940 while (!stop && start < end) {
3941 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
3942 if (start + l > end) {
3943 l = end - start;
3944 l = P2ALIGN(l, min_alloc_size);
3945 }
3946 assert(start + l <= end);
3947
3948 uint64_t u = 1 + (uint64_t)(r * (double)l);
3949 u = P2ROUNDUP(u, min_alloc_size);
3950 if (start + l + u > end) {
3951 u = end - (start + l);
3952 // trim to align so we don't overflow again
3953 u = P2ALIGN(u, min_alloc_size);
3954 stop = true;
3955 }
3956 assert(start + l + u <= end);
3957
3958 dout(20) << " free 0x" << std::hex << start << "~" << l
3959 << " use 0x" << u << std::dec << dendl;
3960
3961 if (u == 0) {
3962 // break if u has been trimmed to nothing
3963 break;
3964 }
3965
3966 fm->allocate(start + l, u, t);
3967 start += l + u;
3968 }
3969 }
3970 db->submit_transaction_sync(t);
3971 }
3972
3973 int r = fm->init();
3974 if (r < 0) {
3975 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
3976 delete fm;
3977 fm = NULL;
3978 return r;
3979 }
3980 return 0;
3981}
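// Rough math for the debug prefill above (numbers illustrative): with
// bluestore_debug_prefill = 0.2, r = 0.2 / (1 - 0.2) = 0.25, so after each
// free run of length l roughly u = 0.25 * l is marked allocated, leaving a
// used fraction of u / (l + u) ~= 0.2 of the device (modulo rounding of both
// l and u to min_alloc_size).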
3982
3983void BlueStore::_close_fm()
3984{
3985 dout(10) << __func__ << dendl;
3986 assert(fm);
3987 fm->shutdown();
3988 delete fm;
3989 fm = NULL;
3990}
3991
3992int BlueStore::_open_alloc()
3993{
3994 assert(alloc == NULL);
3995 assert(bdev->get_size());
3996 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
3997 bdev->get_size(),
3998 min_alloc_size);
3999 if (!alloc) {
4000    lderr(cct) << __func__ << " unknown allocator type "
4001 << cct->_conf->bluestore_allocator
4002 << dendl;
4003 return -EINVAL;
4004 }
4005
4006 uint64_t num = 0, bytes = 0;
4007
4008 dout(1) << __func__ << " opening allocation metadata" << dendl;
4009 // initialize from freelist
4010 fm->enumerate_reset();
4011 uint64_t offset, length;
4012 while (fm->enumerate_next(&offset, &length)) {
4013 alloc->init_add_free(offset, length);
4014 ++num;
4015 bytes += length;
4016 }
4017 dout(1) << __func__ << " loaded " << pretty_si_t(bytes)
4018 << " in " << num << " extents"
4019 << dendl;
4020
4021 // also mark bluefs space as allocated
4022 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4023 alloc->init_rm_free(e.get_start(), e.get_len());
4024 }
4025 dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
4026 << bluefs_extents << std::dec << " as allocated" << dendl;
4027
4028 return 0;
4029}
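// Note on the above: the allocator starts empty and is seeded solely from the
// freelist enumeration; bluefs-owned extents are then removed with
// init_rm_free() because the freelist deliberately does not track them (see
// the bluefs_extents handling in _open_fm()).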
4030
4031void BlueStore::_close_alloc()
4032{
4033 assert(alloc);
4034 alloc->shutdown();
4035 delete alloc;
4036 alloc = NULL;
4037}
4038
4039int BlueStore::_open_fsid(bool create)
4040{
4041 assert(fsid_fd < 0);
4042 int flags = O_RDWR;
4043 if (create)
4044 flags |= O_CREAT;
4045 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4046 if (fsid_fd < 0) {
4047 int err = -errno;
4048 derr << __func__ << " " << cpp_strerror(err) << dendl;
4049 return err;
4050 }
4051 return 0;
4052}
4053
4054int BlueStore::_read_fsid(uuid_d *uuid)
4055{
4056 char fsid_str[40];
4057 memset(fsid_str, 0, sizeof(fsid_str));
4058 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4059 if (ret < 0) {
4060 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4061 return ret;
4062 }
4063 if (ret > 36)
4064 fsid_str[36] = 0;
4065 else
4066 fsid_str[ret] = 0;
4067 if (!uuid->parse(fsid_str)) {
4068 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4069 return -EINVAL;
4070 }
4071 return 0;
4072}
4073
4074int BlueStore::_write_fsid()
4075{
4076 int r = ::ftruncate(fsid_fd, 0);
4077 if (r < 0) {
4078 r = -errno;
4079 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4080 return r;
4081 }
4082 string str = stringify(fsid) + "\n";
4083 r = safe_write(fsid_fd, str.c_str(), str.length());
4084 if (r < 0) {
4085 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4086 return r;
4087 }
4088 r = ::fsync(fsid_fd);
4089 if (r < 0) {
4090 r = -errno;
4091 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4092 return r;
4093 }
4094 return 0;
4095}
4096
4097void BlueStore::_close_fsid()
4098{
4099 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4100 fsid_fd = -1;
4101}
4102
4103int BlueStore::_lock_fsid()
4104{
4105 struct flock l;
4106 memset(&l, 0, sizeof(l));
4107 l.l_type = F_WRLCK;
4108 l.l_whence = SEEK_SET;
4109 int r = ::fcntl(fsid_fd, F_SETLK, &l);
4110 if (r < 0) {
4111 int err = errno;
4112 derr << __func__ << " failed to lock " << path << "/fsid"
4113	 << " (is another ceph-osd still running?) "
4114 << cpp_strerror(err) << dendl;
4115 return -err;
4116 }
4117 return 0;
4118}
4119
4120bool BlueStore::test_mount_in_use()
4121{
4122 // most error conditions mean the mount is not in use (e.g., because
4123 // it doesn't exist). only if we fail to lock do we conclude it is
4124 // in use.
4125 bool ret = false;
4126 int r = _open_path();
4127 if (r < 0)
4128 return false;
4129 r = _open_fsid(false);
4130 if (r < 0)
4131 goto out_path;
4132 r = _lock_fsid();
4133 if (r < 0)
4134 ret = true; // if we can't lock, it is in use
4135 _close_fsid();
4136 out_path:
4137 _close_path();
4138 return ret;
4139}
4140
4141int BlueStore::_open_db(bool create)
4142{
4143 int r;
4144 assert(!db);
4145 string fn = path + "/db";
4146 string options;
4147 stringstream err;
4148 ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
4149
4150 string kv_backend;
4151 if (create) {
4152 kv_backend = cct->_conf->bluestore_kvbackend;
4153 } else {
4154 r = read_meta("kv_backend", &kv_backend);
4155 if (r < 0) {
4156 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
4157 return -EIO;
4158 }
4159 }
4160 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
4161
4162 bool do_bluefs;
4163 if (create) {
4164 do_bluefs = cct->_conf->bluestore_bluefs;
4165 } else {
4166 string s;
4167 r = read_meta("bluefs", &s);
4168 if (r < 0) {
4169 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
4170 return -EIO;
4171 }
4172 if (s == "1") {
4173 do_bluefs = true;
4174 } else if (s == "0") {
4175 do_bluefs = false;
4176 } else {
4177 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
4178 << dendl;
4179 return -EIO;
4180 }
4181 }
4182 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
4183
4184 rocksdb::Env *env = NULL;
4185 if (do_bluefs) {
4186 dout(10) << __func__ << " initializing bluefs" << dendl;
4187 if (kv_backend != "rocksdb") {
4188 derr << " backend must be rocksdb to use bluefs" << dendl;
4189 return -EINVAL;
4190 }
4191 bluefs = new BlueFS(cct);
4192
4193 string bfn;
4194 struct stat st;
4195
4196 bfn = path + "/block.db";
4197 if (::stat(bfn.c_str(), &st) == 0) {
4198 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
4199 if (r < 0) {
4200 derr << __func__ << " add block device(" << bfn << ") returned: "
4201 << cpp_strerror(r) << dendl;
4202 goto free_bluefs;
4203 }
4204
4205 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
4206 r = _check_or_set_bdev_label(
4207 bfn,
4208 bluefs->get_block_device_size(BlueFS::BDEV_DB),
4209 "bluefs db", create);
4210 if (r < 0) {
4211 derr << __func__
4212 << " check block device(" << bfn << ") label returned: "
4213 << cpp_strerror(r) << dendl;
4214 goto free_bluefs;
4215 }
4216 }
4217 if (create) {
4218 bluefs->add_block_extent(
4219 BlueFS::BDEV_DB,
4220 SUPER_RESERVED,
4221 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
4222 }
4223 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
4224 bluefs_single_shared_device = false;
4225 } else {
4226 bluefs_shared_bdev = BlueFS::BDEV_DB;
4227 }
4228
4229 // shared device
4230 bfn = path + "/block";
4231 r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
4232 if (r < 0) {
4233 derr << __func__ << " add block device(" << bfn << ") returned: "
4234 << cpp_strerror(r) << dendl;
4235 goto free_bluefs;
4236 }
4237 if (create) {
4238 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4239 uint64_t initial =
4240 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
4241 cct->_conf->bluestore_bluefs_gift_ratio);
4242 initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
4243 // align to bluefs's alloc_size
4244 initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
4245 initial += cct->_conf->bluefs_alloc_size - SUPER_RESERVED;
4246 bluefs->add_block_extent(bluefs_shared_bdev, SUPER_RESERVED, initial);
4247 bluefs_extents.insert(SUPER_RESERVED, initial);
4248 }
4249
4250 bfn = path + "/block.wal";
4251 if (::stat(bfn.c_str(), &st) == 0) {
4252 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
4253 if (r < 0) {
4254 derr << __func__ << " add block device(" << bfn << ") returned: "
4255 << cpp_strerror(r) << dendl;
4256 goto free_bluefs;
4257 }
4258
4259 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
4260 r = _check_or_set_bdev_label(
4261 bfn,
4262 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
4263 "bluefs wal", create);
4264 if (r < 0) {
4265 derr << __func__ << " check block device(" << bfn
4266 << ") label returned: " << cpp_strerror(r) << dendl;
4267 goto free_bluefs;
4268 }
4269 }
4270
4271 if (create) {
4272 bluefs->add_block_extent(
4273 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
4274 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
4275 BDEV_LABEL_BLOCK_SIZE);
4276 }
4277 cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
4278 bluefs_single_shared_device = false;
4279 } else {
4280 cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
4281 }
4282
4283 if (create) {
4284 bluefs->mkfs(fsid);
4285 }
4286 r = bluefs->mount();
4287 if (r < 0) {
4288 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
4289 goto free_bluefs;
4290 }
4291 if (cct->_conf->bluestore_bluefs_env_mirror) {
4292 rocksdb::Env *a = new BlueRocksEnv(bluefs);
4293 rocksdb::Env *b = rocksdb::Env::Default();
4294 if (create) {
4295 string cmd = "rm -rf " + path + "/db " +
4296 path + "/db.slow " +
4297 path + "/db.wal";
4298 int r = system(cmd.c_str());
4299 (void)r;
4300 }
4301 env = new rocksdb::EnvMirror(b, a, false, true);
4302 } else {
4303 env = new BlueRocksEnv(bluefs);
4304
4305 // simplify the dir names, too, as "seen" by rocksdb
4306 fn = "db";
4307 }
4308
4309 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
4310 // we have both block.db and block; tell rocksdb!
4311 // note: the second (last) size value doesn't really matter
4312 ostringstream db_paths;
4313 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
4314 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
4315 db_paths << fn << ","
4316 << (uint64_t)(db_size * 95 / 100) << " "
4317 << fn + ".slow" << ","
4318 << (uint64_t)(slow_size * 95 / 100);
4319 cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
4320 dout(10) << __func__ << " set rocksdb_db_paths to "
4321 << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
4322 }
4323
4324 if (create) {
4325 env->CreateDir(fn);
4326 if (cct->_conf->rocksdb_separate_wal_dir)
4327 env->CreateDir(fn + ".wal");
4328 if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
4329 env->CreateDir(fn + ".slow");
4330 }
4331 } else if (create) {
4332 int r = ::mkdir(fn.c_str(), 0755);
4333 if (r < 0)
4334 r = -errno;
4335 if (r < 0 && r != -EEXIST) {
4336 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
4337 << dendl;
4338 return r;
4339 }
4340
4341 // wal_dir, too!
4342 if (cct->_conf->rocksdb_separate_wal_dir) {
4343 string walfn = path + "/db.wal";
4344 r = ::mkdir(walfn.c_str(), 0755);
4345 if (r < 0)
4346 r = -errno;
4347 if (r < 0 && r != -EEXIST) {
4348 derr << __func__ << " failed to create " << walfn
4349 << ": " << cpp_strerror(r)
4350 << dendl;
4351 return r;
4352 }
4353 }
4354 }
4355
4356 db = KeyValueDB::create(cct,
4357 kv_backend,
4358 fn,
4359 static_cast<void*>(env));
4360 if (!db) {
4361 derr << __func__ << " error creating db" << dendl;
4362 if (bluefs) {
4363 bluefs->umount();
4364 delete bluefs;
4365 bluefs = NULL;
4366 }
4367 // delete env manually here since we can't depend on db to do this
4368 // under this case
4369 delete env;
4370 env = NULL;
4371 return -EIO;
4372 }
4373
4374 FreelistManager::setup_merge_operators(db);
4375 db->set_merge_operator(PREFIX_STAT, merge_op);
4376
4377 if (kv_backend == "rocksdb")
4378 options = cct->_conf->bluestore_rocksdb_options;
4379 db->init(options);
4380 if (create)
4381 r = db->create_and_open(err);
4382 else
4383 r = db->open(err);
4384 if (r) {
4385    derr << __func__ << " error opening db: " << err.str() << dendl;
4386 if (bluefs) {
4387 bluefs->umount();
4388 delete bluefs;
4389 bluefs = NULL;
4390 }
4391 delete db;
4392 db = NULL;
4393 return -EIO;
4394 }
4395 dout(1) << __func__ << " opened " << kv_backend
4396 << " path " << fn << " options " << options << dendl;
4397 return 0;
4398
4399free_bluefs:
4400 assert(bluefs);
4401 delete bluefs;
4402 bluefs = NULL;
4403 return r;
4404}
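// Summary of the device roles chosen above, as the code reads:
//  - block.db present: it becomes BlueFS::BDEV_DB, the main "block" device is
//    registered as BDEV_SLOW (bluefs_shared_bdev = BDEV_SLOW), and rocksdb is
//    told about both locations via rocksdb_db_paths.
//  - no block.db: the main device doubles as BDEV_DB (the shared bdev).
//  - block.wal present: it becomes BDEV_WAL and rocksdb_separate_wal_dir is
//    switched on so the rocksdb WAL lands there.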
4405
4406void BlueStore::_close_db()
4407{
4408 assert(db);
4409 delete db;
4410 db = NULL;
4411 if (bluefs) {
4412 bluefs->umount();
4413 delete bluefs;
4414 bluefs = NULL;
4415 }
4416}
4417
4418int BlueStore::_reconcile_bluefs_freespace()
4419{
4420 dout(10) << __func__ << dendl;
4421 interval_set<uint64_t> bset;
4422 int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
4423 assert(r == 0);
4424 if (bset == bluefs_extents) {
4425 dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
4426 << std::dec << dendl;
4427 return 0;
4428 }
4429 dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
4430 << dendl;
4431 dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
4432 << std::dec << dendl;
4433
4434 interval_set<uint64_t> overlap;
4435 overlap.intersection_of(bset, bluefs_extents);
4436
4437 bset.subtract(overlap);
4438 if (!bset.empty()) {
4439 derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
4440 << dendl;
4441 return -EIO;
4442 }
4443
4444 interval_set<uint64_t> super_extra;
4445 super_extra = bluefs_extents;
4446 super_extra.subtract(overlap);
4447 if (!super_extra.empty()) {
4448 // This is normal: it can happen if we commit to give extents to
4449 // bluefs and we crash before bluefs commits that it owns them.
4450 dout(10) << __func__ << " super extra " << super_extra << dendl;
4451 for (interval_set<uint64_t>::iterator p = super_extra.begin();
4452 p != super_extra.end();
4453 ++p) {
4454 bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
4455 }
4456 }
4457
4458 return 0;
4459}
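// The asymmetry above is deliberate: extents that bluefs claims but the
// superblock does not are unexplained and therefore fatal (-EIO), whereas
// extents the superblock granted but bluefs never recorded are simply handed
// back via add_block_extent(), since the superblock update commits before
// bluefs persists its ownership.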
4460
4461int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
4462{
4463 int ret = 0;
4464 assert(bluefs);
4465
4466 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
4467 bluefs->get_usage(&bluefs_usage);
4468 assert(bluefs_usage.size() > bluefs_shared_bdev);
4469
4470 // fixme: look at primary bdev only for now
4471 uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
4472 uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
4473 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
4474
4475 uint64_t my_free = alloc->get_free();
4476 uint64_t total = bdev->get_size();
4477 float my_free_ratio = (float)my_free / (float)total;
4478
4479 uint64_t total_free = bluefs_free + my_free;
4480
4481 float bluefs_ratio = (float)bluefs_free / (float)total_free;
4482
4483 dout(10) << __func__
4484 << " bluefs " << pretty_si_t(bluefs_free)
4485 << " free (" << bluefs_free_ratio
4486 << ") bluestore " << pretty_si_t(my_free)
4487 << " free (" << my_free_ratio
4488 << "), bluefs_ratio " << bluefs_ratio
4489 << dendl;
4490
4491 uint64_t gift = 0;
4492 uint64_t reclaim = 0;
4493 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
4494 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
4495 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4496 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
4497 << ", should gift " << pretty_si_t(gift) << dendl;
4498 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
4499 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
4500 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
4501 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
4502 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4503 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
4504 << ", should reclaim " << pretty_si_t(reclaim) << dendl;
4505 }
4506 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
4507 cct->_conf->bluestore_bluefs_min <
4508 (uint64_t)(cct->_conf->bluestore_bluefs_max_ratio * total_free)) {
4509 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
4510 dout(10) << __func__ << " bluefs_total " << bluefs_total
4511 << " < min " << cct->_conf->bluestore_bluefs_min
4512 << ", should gift " << pretty_si_t(g) << dendl;
4513 if (g > gift)
4514 gift = g;
4515 reclaim = 0;
4516 }
4517
4518 if (gift) {
4519 // round up to alloc size
4520 gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
4521
4522 // hard cap to fit into 32 bits
4523 gift = MIN(gift, 1ull<<31);
4524 dout(10) << __func__ << " gifting " << gift
4525 << " (" << pretty_si_t(gift) << ")" << dendl;
4526
4527 // fixme: just do one allocation to start...
4528 int r = alloc->reserve(gift);
4529 assert(r == 0);
4530
4531 AllocExtentVector exts;
4532 int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
4533 0, 0, &exts);
4534
4535 if (alloc_len < (int64_t)gift) {
4536 derr << __func__ << " allocate failed on 0x" << std::hex << gift
4537 << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4538 alloc->dump();
4539 assert(0 == "allocate failed, wtf");
4540 return -ENOSPC;
4541 }
4542 for (auto& p : exts) {
4543 bluestore_pextent_t e = bluestore_pextent_t(p);
4544 dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
4545 extents->push_back(e);
4546 }
4547 gift = 0;
4548
4549 ret = 1;
4550 }
4551
4552 // reclaim from bluefs?
4553 if (reclaim) {
4554 // round up to alloc size
4555 reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);
4556
4557 // hard cap to fit into 32 bits
4558 reclaim = MIN(reclaim, 1ull<<31);
4559 dout(10) << __func__ << " reclaiming " << reclaim
4560 << " (" << pretty_si_t(reclaim) << ")" << dendl;
4561
4562 while (reclaim > 0) {
4563 // NOTE: this will block and do IO.
4564 AllocExtentVector extents;
4565 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
4566 &extents);
4567 if (r < 0) {
4568 derr << __func__ << " failed to reclaim space from bluefs"
4569 << dendl;
4570 break;
4571 }
4572 for (auto e : extents) {
4573 bluefs_extents.erase(e.offset, e.length);
4574 bluefs_extents_reclaiming.insert(e.offset, e.length);
4575 reclaim -= e.length;
4576 }
4577 }
4578
4579 ret = 1;
4580 }
4581
4582 return ret;
4583}
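// Illustrative gift calculation (made-up numbers, not the shipped defaults):
// with bluefs_free = 1 GiB and my_free = 99 GiB, total_free = 100 GiB and
// bluefs_ratio = 0.01. If bluestore_bluefs_min_ratio were 0.02 and
// bluestore_bluefs_gift_ratio 0.02, gift = 0.02 * 100 GiB = 2 GiB, which is
// then rounded up to bluefs_alloc_size, capped at 2^31 bytes, allocated from
// the main device, and appended to *extents for _commit_bluefs_freespace().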
4584
4585void BlueStore::_commit_bluefs_freespace(
4586 const PExtentVector& bluefs_gift_extents)
4587{
4588 dout(10) << __func__ << dendl;
4589 for (auto& p : bluefs_gift_extents) {
4590 bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
4591 }
4592}
4593
4594int BlueStore::_open_collections(int *errors)
4595{
4596 assert(coll_map.empty());
4597 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
4598 for (it->upper_bound(string());
4599 it->valid();
4600 it->next()) {
4601 coll_t cid;
4602 if (cid.parse(it->key())) {
4603 CollectionRef c(
4604 new Collection(
4605 this,
4606 cache_shards[cid.hash_to_shard(cache_shards.size())],
4607 cid));
4608 bufferlist bl = it->value();
4609 bufferlist::iterator p = bl.begin();
4610 try {
4611 ::decode(c->cnode, p);
4612 } catch (buffer::error& e) {
4613 derr << __func__ << " failed to decode cnode, key:"
4614 << pretty_binary_string(it->key()) << dendl;
4615 return -EIO;
4616 }
4617 dout(20) << __func__ << " opened " << cid << " " << c << dendl;
4618 coll_map[cid] = c;
4619 } else {
4620 derr << __func__ << " unrecognized collection " << it->key() << dendl;
4621 if (errors)
4622 (*errors)++;
4623 }
4624 }
4625 return 0;
4626}
4627
4628int BlueStore::_setup_block_symlink_or_file(
4629 string name,
4630 string epath,
4631 uint64_t size,
4632 bool create)
4633{
4634 dout(20) << __func__ << " name " << name << " path " << epath
4635 << " size " << size << " create=" << (int)create << dendl;
4636 int r = 0;
4637 int flags = O_RDWR;
4638 if (create)
4639 flags |= O_CREAT;
4640 if (epath.length()) {
4641 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
4642 if (r < 0) {
4643 r = -errno;
4644 derr << __func__ << " failed to create " << name << " symlink to "
4645 << epath << ": " << cpp_strerror(r) << dendl;
4646 return r;
4647 }
4648
4649 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
4650 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
4651 if (fd < 0) {
4652 r = -errno;
4653 derr << __func__ << " failed to open " << epath << " file: "
4654 << cpp_strerror(r) << dendl;
4655 return r;
4656 }
4657 string serial_number = epath.substr(strlen(SPDK_PREFIX));
4658 r = ::write(fd, serial_number.c_str(), serial_number.size());
4659 assert(r == (int)serial_number.size());
4660 dout(1) << __func__ << " created " << name << " symlink to "
4661 << epath << dendl;
4662 VOID_TEMP_FAILURE_RETRY(::close(fd));
4663 }
4664 }
4665 if (size) {
4666 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
4667 if (fd >= 0) {
4668 // block file is present
4669 struct stat st;
4670 int r = ::fstat(fd, &st);
4671 if (r == 0 &&
4672 S_ISREG(st.st_mode) && // if it is a regular file
4673 st.st_size == 0) { // and is 0 bytes
4674 r = ::ftruncate(fd, size);
4675 if (r < 0) {
4676 r = -errno;
4677 derr << __func__ << " failed to resize " << name << " file to "
4678 << size << ": " << cpp_strerror(r) << dendl;
4679 VOID_TEMP_FAILURE_RETRY(::close(fd));
4680 return r;
4681 }
4682
4683 if (cct->_conf->bluestore_block_preallocate_file) {
4684#ifdef HAVE_POSIX_FALLOCATE
4685 r = ::posix_fallocate(fd, 0, size);
4686 if (r) {
4687	  derr << __func__ << " failed to preallocate " << name << " file to "
4688 << size << ": " << cpp_strerror(r) << dendl;
4689 VOID_TEMP_FAILURE_RETRY(::close(fd));
4690 return -r;
4691 }
4692#else
4693 char data[1024*128];
4694 for (uint64_t off = 0; off < size; off += sizeof(data)) {
4695 if (off + sizeof(data) > size)
4696 r = ::write(fd, data, size - off);
4697 else
4698 r = ::write(fd, data, sizeof(data));
4699 if (r < 0) {
4700 r = -errno;
4701	    derr << __func__ << " failed to preallocate w/ write " << name << " file to "
4702 << size << ": " << cpp_strerror(r) << dendl;
4703 VOID_TEMP_FAILURE_RETRY(::close(fd));
4704 return r;
4705 }
4706 }
4707#endif
4708 }
4709 dout(1) << __func__ << " resized " << name << " file to "
4710 << pretty_si_t(size) << "B" << dendl;
4711 }
4712 VOID_TEMP_FAILURE_RETRY(::close(fd));
4713 } else {
4714 int r = -errno;
4715 if (r != -ENOENT) {
4716 derr << __func__ << " failed to open " << name << " file: "
4717 << cpp_strerror(r) << dendl;
4718 return r;
4719 }
4720 }
4721 }
4722 return 0;
4723}
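// Typical uses of the helper above (paths illustrative): mkfs with
// bluestore_block_path = "/dev/sdb" creates a "block" symlink pointing at the
// device, while an empty path plus bluestore_block_size and
// bluestore_block_create = true yields a regular "block" file truncated (and
// optionally preallocated) to that size for file-backed setups.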
4724
4725int BlueStore::mkfs()
4726{
4727 dout(1) << __func__ << " path " << path << dendl;
4728 int r;
4729 uuid_d old_fsid;
4730
4731 {
4732 string done;
4733 r = read_meta("mkfs_done", &done);
4734 if (r == 0) {
4735 dout(1) << __func__ << " already created" << dendl;
4736 if (cct->_conf->bluestore_fsck_on_mkfs) {
4737 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
4738 if (r < 0) {
4739 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
4740 << dendl;
4741 return r;
4742 }
4743 if (r > 0) {
4744 derr << __func__ << " fsck found " << r << " errors" << dendl;
4745 r = -EIO;
4746 }
4747 }
4748 return r; // idempotent
4749 }
4750 }
4751
4752 {
4753 string type;
4754 r = read_meta("type", &type);
4755 if (r == 0) {
4756 if (type != "bluestore") {
4757 derr << __func__ << " expected bluestore, but type is " << type << dendl;
4758 return -EIO;
4759 }
4760 } else {
4761 r = write_meta("type", "bluestore");
4762 if (r < 0)
4763 return r;
4764 }
4765 }
4766
4767 freelist_type = "bitmap";
4768
4769 r = _open_path();
4770 if (r < 0)
4771 return r;
4772
4773 r = _open_fsid(true);
4774 if (r < 0)
4775 goto out_path_fd;
4776
4777 r = _lock_fsid();
4778 if (r < 0)
4779 goto out_close_fsid;
4780
4781 r = _read_fsid(&old_fsid);
4782 if (r < 0 || old_fsid.is_zero()) {
4783 if (fsid.is_zero()) {
4784 fsid.generate_random();
4785 dout(1) << __func__ << " generated fsid " << fsid << dendl;
4786 } else {
4787 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
4788 }
4789 // we'll write it later.
4790 } else {
4791 if (!fsid.is_zero() && fsid != old_fsid) {
4792 derr << __func__ << " on-disk fsid " << old_fsid
4793 << " != provided " << fsid << dendl;
4794 r = -EINVAL;
4795 goto out_close_fsid;
4796 }
4797 fsid = old_fsid;
4798 }
4799
4800 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
4801 cct->_conf->bluestore_block_size,
4802 cct->_conf->bluestore_block_create);
4803 if (r < 0)
4804 goto out_close_fsid;
4805 if (cct->_conf->bluestore_bluefs) {
4806 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
4807 cct->_conf->bluestore_block_wal_size,
4808 cct->_conf->bluestore_block_wal_create);
4809 if (r < 0)
4810 goto out_close_fsid;
4811 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
4812 cct->_conf->bluestore_block_db_size,
4813 cct->_conf->bluestore_block_db_create);
4814 if (r < 0)
4815 goto out_close_fsid;
4816 }
4817
4818 r = _open_bdev(true);
4819 if (r < 0)
4820 goto out_close_fsid;
4821
4822 r = _open_db(true);
4823 if (r < 0)
4824 goto out_close_bdev;
4825
4826 r = _open_fm(true);
4827 if (r < 0)
4828 goto out_close_db;
4829
4830 {
4831 KeyValueDB::Transaction t = db->get_transaction();
4832 {
4833 bufferlist bl;
4834 ::encode((uint64_t)0, bl);
4835 t->set(PREFIX_SUPER, "nid_max", bl);
4836 t->set(PREFIX_SUPER, "blobid_max", bl);
4837 }
4838
4839 // choose min_alloc_size
4840 if (cct->_conf->bluestore_min_alloc_size) {
4841 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
4842 } else {
4843 assert(bdev);
4844 if (bdev->is_rotational()) {
4845 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
4846 } else {
4847 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
4848 }
4849 }
4850 _set_alloc_sizes();
4851 {
4852 bufferlist bl;
4853 ::encode((uint64_t)min_alloc_size, bl);
4854 t->set(PREFIX_SUPER, "min_alloc_size", bl);
4855 }
4856
4857 ondisk_format = latest_ondisk_format;
4858 _prepare_ondisk_format_super(t);
4859 db->submit_transaction_sync(t);
4860 }
4861
4862 r = _open_alloc();
4863 if (r < 0)
4864 goto out_close_fm;
4865
4866 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
4867 if (r < 0)
4868 goto out_close_alloc;
4869 r = write_meta("bluefs", stringify((int)cct->_conf->bluestore_bluefs));
4870 if (r < 0)
4871 goto out_close_alloc;
4872
4873 if (fsid != old_fsid) {
4874 r = _write_fsid();
4875 if (r < 0) {
4876 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
4877 goto out_close_alloc;
4878 }
4879 }
4880
4881 // indicate success by writing the 'mkfs_done' file
4882 r = write_meta("mkfs_done", "yes");
4883 if (r < 0)
4884 goto out_close_alloc;
4885 dout(10) << __func__ << " success" << dendl;
4886
4887 out_close_alloc:
4888 _close_alloc();
4889 out_close_fm:
4890 _close_fm();
4891 out_close_db:
4892 _close_db();
4893 out_close_bdev:
4894 _close_bdev();
4895 out_close_fsid:
4896 _close_fsid();
4897 out_path_fd:
4898 _close_path();
4899
4900 if (r == 0 &&
4901 cct->_conf->bluestore_fsck_on_mkfs) {
4902 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
4903 if (rc < 0)
4904 return rc;
4905 if (rc > 0) {
4906 derr << __func__ << " fsck found " << rc << " errors" << dendl;
4907 r = -EIO;
4908 }
4909 }
4910 if (r < 0) {
4911 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
4912 }
4913 return r;
4914}
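// mkfs() is idempotent: a prior successful run leaves the "mkfs_done" meta
// key, in which case only the optional fsck is re-run. Success and failure
// both fall through the label ladder above, closing alloc, fm, db, bdev,
// fsid and path in reverse-open order.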
4915
4916void BlueStore::set_cache_shards(unsigned num)
4917{
4918 dout(10) << __func__ << " " << num << dendl;
4919 size_t old = cache_shards.size();
4920 assert(num >= old);
4921 cache_shards.resize(num);
4922 for (unsigned i = old; i < num; ++i) {
4923 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
4924 logger);
4925 }
4926}
4927
4928int BlueStore::_mount(bool kv_only)
4929{
4930 dout(1) << __func__ << " path " << path << dendl;
4931
4932 {
4933 string type;
4934 int r = read_meta("type", &type);
4935 if (r < 0) {
4936 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
4937 << dendl;
4938 return r;
4939 }
4940
4941 if (type != "bluestore") {
4942 derr << __func__ << " expected bluestore, but type is " << type << dendl;
4943 return -EIO;
4944 }
4945 }
4946
4947 if (cct->_conf->bluestore_fsck_on_mount) {
4948 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
4949 if (rc < 0)
4950 return rc;
4951 if (rc > 0) {
4952 derr << __func__ << " fsck found " << rc << " errors" << dendl;
4953 return -EIO;
4954 }
4955 }
4956
4957 int r = _open_path();
4958 if (r < 0)
4959 return r;
4960 r = _open_fsid(false);
4961 if (r < 0)
4962 goto out_path;
4963
4964 r = _read_fsid(&fsid);
4965 if (r < 0)
4966 goto out_fsid;
4967
4968 r = _lock_fsid();
4969 if (r < 0)
4970 goto out_fsid;
4971
4972 r = _open_bdev(false);
4973 if (r < 0)
4974 goto out_fsid;
4975
4976 r = _open_db(false);
4977 if (r < 0)
4978 goto out_bdev;
4979
4980 if (kv_only)
4981 return 0;
4982
4983 r = _open_super_meta();
4984 if (r < 0)
4985 goto out_db;
4986
4987 r = _open_fm(false);
4988 if (r < 0)
4989 goto out_db;
4990
4991 r = _open_alloc();
4992 if (r < 0)
4993 goto out_fm;
4994
4995 r = _open_collections();
4996 if (r < 0)
4997 goto out_alloc;
4998
4999 r = _reload_logger();
5000 if (r < 0)
5001 goto out_coll;
5002
5003 if (bluefs) {
5004 r = _reconcile_bluefs_freespace();
5005 if (r < 0)
5006 goto out_coll;
5007 }
5008
5009 for (auto f : finishers) {
5010 f->start();
5011 }
5012 kv_sync_thread.create("bstore_kv_sync");
5013
5014 r = _deferred_replay();
5015 if (r < 0)
5016 goto out_stop;
5017
5018 mempool_thread.init();
5019
5020
5021 mounted = true;
5022 return 0;
5023
5024 out_stop:
5025 _kv_stop();
5026 for (auto f : finishers) {
5027 f->wait_for_empty();
5028 f->stop();
5029 }
5030 out_coll:
5031 flush_cache();
5032 out_alloc:
5033 _close_alloc();
5034 out_fm:
5035 _close_fm();
5036 out_db:
5037 _close_db();
5038 out_bdev:
5039 _close_bdev();
5040 out_fsid:
5041 _close_fsid();
5042 out_path:
5043 _close_path();
5044 return r;
5045}
5046
5047int BlueStore::umount()
5048{
5049 assert(mounted);
5050 dout(1) << __func__ << dendl;
5051
5052 _osr_drain_all();
5053 _osr_unregister_all();
5054
5055 mempool_thread.shutdown();
5056
5057 dout(20) << __func__ << " stopping kv thread" << dendl;
5058 _kv_stop();
5059 for (auto f : finishers) {
5060 dout(20) << __func__ << " draining finisher" << dendl;
5061 f->wait_for_empty();
5062 dout(20) << __func__ << " stopping finisher" << dendl;
5063 f->stop();
5064 }
5065 _reap_collections();
5066 flush_cache();
5067 dout(20) << __func__ << " closing" << dendl;
5068
5069 mounted = false;
5070 _close_alloc();
5071 _close_fm();
5072 _close_db();
5073 _close_bdev();
5074 _close_fsid();
5075 _close_path();
5076
5077 if (cct->_conf->bluestore_fsck_on_umount) {
5078 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
5079 if (rc < 0)
5080 return rc;
5081 if (rc > 0) {
5082 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5083 return -EIO;
5084 }
5085 }
5086 return 0;
5087}
5088
5089static void apply(uint64_t off,
5090 uint64_t len,
5091 uint64_t granularity,
5092 BlueStore::mempool_dynamic_bitset &bitset,
5093 const char *what,
5094 std::function<void(uint64_t,
5095 BlueStore::mempool_dynamic_bitset &)> f) {
5096 auto end = ROUND_UP_TO(off + len, granularity);
5097 while (off < end) {
5098 uint64_t pos = off / granularity;
5099 f(pos, bitset);
5100 off += granularity;
5101 }
5102}
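// Example for apply() (hypothetical arguments): apply(0x1000, 0x2000, 0x1000,
// bs, "x", f) rounds the end up to 0x3000 and invokes f(1, bs) and f(2, bs),
// i.e. one call per granularity-sized chunk touched by [off, off+len).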
5103
5104int BlueStore::_fsck_check_extents(
5105 const ghobject_t& oid,
5106 const PExtentVector& extents,
5107 bool compressed,
5108 mempool_dynamic_bitset &used_blocks,
5109 store_statfs_t& expected_statfs)
5110{
5111 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
5112 int errors = 0;
5113 for (auto e : extents) {
5114 if (!e.is_valid())
5115 continue;
5116 expected_statfs.allocated += e.length;
5117 if (compressed) {
5118 expected_statfs.compressed_allocated += e.length;
5119 }
5120 bool already = false;
5121 apply(
5122 e.offset, e.length, block_size, used_blocks, __func__,
5123 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5124 if (bs.test(pos))
5125 already = true;
5126 else
5127 bs.set(pos);
5128 });
5129 if (already) {
5130 derr << " " << oid << " extent " << e
5131 << " or a subset is already allocated" << dendl;
5132 ++errors;
5133 }
5134 if (e.end() > bdev->get_size()) {
5135 derr << " " << oid << " extent " << e
5136 << " past end of block device" << dendl;
5137 ++errors;
5138 }
5139 }
5140 return errors;
5141}
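// The helper above accumulates per-extent statfs expectations and flags two
// error classes: an extent whose blocks are already set in used_blocks (i.e.
// referenced twice), and an extent that ends beyond the block device size.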
5142
5143int BlueStore::fsck(bool deep)
5144{
5145 dout(1) << __func__ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
5146 int errors = 0;
5147 mempool::bluestore_fsck::set<uint64_t> used_nids;
5148 mempool::bluestore_fsck::set<uint64_t> used_omap_head;
5149 mempool_dynamic_bitset used_blocks;
5150 mempool::bluestore_fsck::set<uint64_t> used_sbids;
5151 KeyValueDB::Iterator it;
5152 store_statfs_t expected_statfs, actual_statfs;
5153 struct sb_info_t {
5154 list<ghobject_t> oids;
5155 SharedBlobRef sb;
5156 bluestore_extent_ref_map_t ref_map;
5157 bool compressed;
5158 };
5159 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
5160
5161 uint64_t num_objects = 0;
5162 uint64_t num_extents = 0;
5163 uint64_t num_blobs = 0;
5164 uint64_t num_spanning_blobs = 0;
5165 uint64_t num_shared_blobs = 0;
5166 uint64_t num_sharded_objects = 0;
5167 uint64_t num_object_shards = 0;
5168
5169 utime_t start = ceph_clock_now();
5170
5171 int r = _open_path();
5172 if (r < 0)
5173 return r;
5174 r = _open_fsid(false);
5175 if (r < 0)
5176 goto out_path;
5177
5178 r = _read_fsid(&fsid);
5179 if (r < 0)
5180 goto out_fsid;
5181
5182 r = _lock_fsid();
5183 if (r < 0)
5184 goto out_fsid;
5185
5186 r = _open_bdev(false);
5187 if (r < 0)
5188 goto out_fsid;
5189
5190 r = _open_db(false);
5191 if (r < 0)
5192 goto out_bdev;
5193
5194 r = _open_super_meta();
5195 if (r < 0)
5196 goto out_db;
5197
5198 r = _open_fm(false);
5199 if (r < 0)
5200 goto out_db;
5201
5202 r = _open_alloc();
5203 if (r < 0)
5204 goto out_fm;
5205
5206 r = _open_collections(&errors);
5207 if (r < 0)
5208 goto out_alloc;
5209
5210 mempool_thread.init();
5211
5212 r = _deferred_replay();
5213 if (r < 0)
5214 goto out_scan;
5215
5216 used_blocks.resize(bdev->get_size() / block_size);
5217 apply(
5218 0, SUPER_RESERVED, block_size, used_blocks, "0~SUPER_RESERVED",
5219 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5220 bs.set(pos);
5221 }
5222 );
5223
5224 if (bluefs) {
5225 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5226 apply(
5227 e.get_start(), e.get_len(), block_size, used_blocks, "bluefs",
5228 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5229 bs.set(pos);
5230 }
5231 );
5232 }
5233 r = bluefs->fsck();
5234 if (r < 0) {
5235 goto out_scan;
5236 }
5237 if (r > 0)
5238 errors += r;
5239 }
5240
5241 // get expected statfs; fill unaffected fields to be able to compare
5242 // structs
5243 statfs(&actual_statfs);
5244 expected_statfs.total = actual_statfs.total;
5245 expected_statfs.available = actual_statfs.available;
5246
5247 // walk PREFIX_OBJ
5248 dout(1) << __func__ << " walking object keyspace" << dendl;
5249 it = db->get_iterator(PREFIX_OBJ);
5250 if (it) {
5251 CollectionRef c;
5252 spg_t pgid;
5253 mempool::bluestore_fsck::list<string> expecting_shards;
5254 for (it->lower_bound(string()); it->valid(); it->next()) {
5255 dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
5256 if (is_extent_shard_key(it->key())) {
5257 while (!expecting_shards.empty() &&
5258 expecting_shards.front() < it->key()) {
5259 derr << __func__ << " error: missing shard key "
5260 << pretty_binary_string(expecting_shards.front())
5261 << dendl;
5262 ++errors;
5263 expecting_shards.pop_front();
5264 }
5265 if (!expecting_shards.empty() &&
5266 expecting_shards.front() == it->key()) {
5267 // all good
5268 expecting_shards.pop_front();
5269 continue;
5270 }
5271
5272 uint32_t offset;
5273 string okey;
5274 get_key_extent_shard(it->key(), &okey, &offset);
5275 derr << __func__ << " error: stray shard 0x" << std::hex << offset
5276 << std::dec << dendl;
5277 if (expecting_shards.empty()) {
5278 derr << __func__ << " error: " << pretty_binary_string(it->key())
5279 << " is unexpected" << dendl;
5280 ++errors;
5281 continue;
5282 }
5283 while (expecting_shards.front() > it->key()) {
5284 derr << __func__ << " error: saw " << pretty_binary_string(it->key())
5285 << dendl;
5286 derr << __func__ << " error: exp "
5287 << pretty_binary_string(expecting_shards.front()) << dendl;
5288 ++errors;
5289 expecting_shards.pop_front();
5290 if (expecting_shards.empty()) {
5291 break;
5292 }
5293 }
5294 continue;
5295 }
5296
5297 ghobject_t oid;
5298 int r = get_key_object(it->key(), &oid);
5299 if (r < 0) {
5300 derr << __func__ << " error: bad object key "
5301 << pretty_binary_string(it->key()) << dendl;
5302 ++errors;
5303 continue;
5304 }
5305 if (!c ||
5306 oid.shard_id != pgid.shard ||
5307 oid.hobj.pool != (int64_t)pgid.pool() ||
5308 !c->contains(oid)) {
5309 c = nullptr;
5310 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
5311 coll_map.begin();
5312 p != coll_map.end();
5313 ++p) {
5314 if (p->second->contains(oid)) {
5315 c = p->second;
5316 break;
5317 }
5318 }
5319 if (!c) {
5320 derr << __func__ << " error: stray object " << oid
5321 << " not owned by any collection" << dendl;
5322 ++errors;
5323 continue;
5324 }
5325 c->cid.is_pg(&pgid);
5326 dout(20) << __func__ << " collection " << c->cid << dendl;
5327 }
5328
5329 if (!expecting_shards.empty()) {
5330 for (auto &k : expecting_shards) {
5331 derr << __func__ << " error: missing shard key "
5332 << pretty_binary_string(k) << dendl;
5333 }
5334 ++errors;
5335 expecting_shards.clear();
5336 }
5337
5338 dout(10) << __func__ << " " << oid << dendl;
5339 RWLock::RLocker l(c->lock);
5340 OnodeRef o = c->get_onode(oid, false);
5341 if (o->onode.nid) {
5342 if (o->onode.nid > nid_max) {
5343 derr << __func__ << " error: " << oid << " nid " << o->onode.nid
5344 << " > nid_max " << nid_max << dendl;
5345 ++errors;
5346 }
5347 if (used_nids.count(o->onode.nid)) {
5348 derr << __func__ << " error: " << oid << " nid " << o->onode.nid
5349 << " already in use" << dendl;
5350 ++errors;
5351 continue; // go for next object
5352 }
5353 used_nids.insert(o->onode.nid);
5354 }
5355 ++num_objects;
5356 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
5357 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
5358 _dump_onode(o, 30);
5359 // shards
5360 if (!o->extent_map.shards.empty()) {
5361 ++num_sharded_objects;
5362 num_object_shards += o->extent_map.shards.size();
5363 }
5364 for (auto& s : o->extent_map.shards) {
5365 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
5366 expecting_shards.push_back(string());
5367 get_extent_shard_key(o->key, s.shard_info->offset,
5368 &expecting_shards.back());
5369 if (s.shard_info->offset >= o->onode.size) {
5370 derr << __func__ << " error: " << oid << " shard 0x" << std::hex
5371 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
5372 << std::dec << dendl;
5373 ++errors;
5374 }
5375 }
5376 // lextents
5377 map<BlobRef,bluestore_blob_t::unused_t> referenced;
5378 uint64_t pos = 0;
5379 mempool::bluestore_fsck::map<BlobRef,
5380 bluestore_blob_use_tracker_t> ref_map;
5381 for (auto& l : o->extent_map.extent_map) {
5382 dout(20) << __func__ << " " << l << dendl;
5383 if (l.logical_offset < pos) {
5384 derr << __func__ << " error: " << oid << " lextent at 0x"
5385 << std::hex << l.logical_offset
5386 << " overlaps with the previous, which ends at 0x" << pos
5387 << std::dec << dendl;
5388 ++errors;
5389 }
5390 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
5391 derr << __func__ << " error: " << oid << " lextent at 0x"
5392 << std::hex << l.logical_offset << "~" << l.length
5393 << " spans a shard boundary"
5394 << std::dec << dendl;
5395 ++errors;
5396 }
5397 pos = l.logical_offset + l.length;
5398 expected_statfs.stored += l.length;
5399 assert(l.blob);
5400 const bluestore_blob_t& blob = l.blob->get_blob();
5401
5402 auto& ref = ref_map[l.blob];
5403 if (ref.is_empty()) {
5404 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
5405 uint32_t l = blob.get_logical_length();
5406 ref.init(l, min_release_size);
5407 }
5408 ref.get(
5409 l.blob_offset,
5410 l.length);
5411 ++num_extents;
5412 if (blob.has_unused()) {
5413 auto p = referenced.find(l.blob);
5414 bluestore_blob_t::unused_t *pu;
5415 if (p == referenced.end()) {
5416 pu = &referenced[l.blob];
5417 } else {
5418 pu = &p->second;
5419 }
5420 uint64_t blob_len = blob.get_logical_length();
5421 assert((blob_len % (sizeof(*pu)*8)) == 0);
5422 assert(l.blob_offset + l.length <= blob_len);
5423 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
5424 uint64_t start = l.blob_offset / chunk_size;
5425 uint64_t end =
5426 ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
5427 for (auto i = start; i < end; ++i) {
5428 (*pu) |= (1u << i);
5429 }
5430 }
5431 }
5432 for (auto &i : referenced) {
5433 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
5434 << std::dec << " for " << *i.first << dendl;
5435 const bluestore_blob_t& blob = i.first->get_blob();
5436 if (i.second & blob.unused) {
5437 derr << __func__ << " error: " << oid << " blob claims unused 0x"
5438 << std::hex << blob.unused
5439 << " but extents reference 0x" << i.second
5440 << " on blob " << *i.first << dendl;
5441 ++errors;
5442 }
5443 if (blob.has_csum()) {
5444 uint64_t blob_len = blob.get_logical_length();
5445 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
5446 unsigned csum_count = blob.get_csum_count();
5447 unsigned csum_chunk_size = blob.get_csum_chunk_size();
5448 for (unsigned p = 0; p < csum_count; ++p) {
5449 unsigned pos = p * csum_chunk_size;
5450 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
5451 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
5452 unsigned mask = 1u << firstbit;
5453 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
5454 mask |= 1u << b;
5455 }
5456 if ((blob.unused & mask) == mask) {
5457 // this csum chunk region is marked unused
5458 if (blob.get_csum_item(p) != 0) {
5459 derr << __func__ << " error: " << oid
5460 << " blob claims csum chunk 0x" << std::hex << pos
5461 << "~" << csum_chunk_size
5462 << " is unused (mask 0x" << mask << " of unused 0x"
5463 << blob.unused << ") but csum is non-zero 0x"
5464 << blob.get_csum_item(p) << std::dec << " on blob "
5465 << *i.first << dendl;
5466 ++errors;
5467 }
5468 }
5469 }
5470 }
5471 }
5472 for (auto &i : ref_map) {
5473 ++num_blobs;
5474 const bluestore_blob_t& blob = i.first->get_blob();
5475 bool equal = i.first->get_blob_use_tracker().equal(i.second);
5476 if (!equal) {
5477 derr << __func__ << " error: " << oid << " blob " << *i.first
5478 << " doesn't match expected ref_map " << i.second << dendl;
5479 ++errors;
5480 }
5481 if (blob.is_compressed()) {
5482 expected_statfs.compressed += blob.get_compressed_payload_length();
5483 expected_statfs.compressed_original +=
5484 i.first->get_referenced_bytes();
5485 }
5486 if (blob.is_shared()) {
5487 if (i.first->shared_blob->get_sbid() > blobid_max) {
5488 derr << __func__ << " error: " << oid << " blob " << blob
5489 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
5490 << blobid_max << dendl;
5491 ++errors;
5492 } else if (i.first->shared_blob->get_sbid() == 0) {
5493 derr << __func__ << " error: " << oid << " blob " << blob
5494 << " marked as shared but has uninitialized sbid"
5495 << dendl;
5496 ++errors;
5497 }
5498 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
5499 sbi.sb = i.first->shared_blob;
5500 sbi.oids.push_back(oid);
5501 sbi.compressed = blob.is_compressed();
5502 for (auto e : blob.get_extents()) {
5503 if (e.is_valid()) {
5504 sbi.ref_map.get(e.offset, e.length);
5505 }
5506 }
5507 } else {
5508 errors += _fsck_check_extents(oid, blob.get_extents(),
5509 blob.is_compressed(),
5510 used_blocks,
5511 expected_statfs);
5512 }
5513 }
5514 if (deep) {
5515 bufferlist bl;
5516 int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
5517 if (r < 0) {
5518 ++errors;
5519 derr << __func__ << " error: " << oid << " error during read: "
5520 << cpp_strerror(r) << dendl;
5521 }
5522 }
5523 // omap
5524 if (o->onode.has_omap()) {
5525 if (used_omap_head.count(o->onode.nid)) {
5526 derr << __func__ << " error: " << oid << " omap_head " << o->onode.nid
5527 << " already in use" << dendl;
5528 ++errors;
5529 } else {
5530 used_omap_head.insert(o->onode.nid);
5531 }
5532 }
5533 c->trim_cache();
5534 }
5535 }
5536 dout(1) << __func__ << " checking shared_blobs" << dendl;
5537 it = db->get_iterator(PREFIX_SHARED_BLOB);
5538 if (it) {
5539 for (it->lower_bound(string()); it->valid(); it->next()) {
5540 string key = it->key();
5541 uint64_t sbid;
5542 if (get_key_shared_blob(key, &sbid)) {
5543 derr << __func__ << " error: bad key '" << key
5544 << "' in shared blob namespace" << dendl;
5545 ++errors;
5546 continue;
5547 }
5548 auto p = sb_info.find(sbid);
5549 if (p == sb_info.end()) {
5550 derr << __func__ << " error: found stray shared blob data for sbid 0x"
5551 << std::hex << sbid << std::dec << dendl;
5552 ++errors;
5553 } else {
5554 ++num_shared_blobs;
5555 sb_info_t& sbi = p->second;
5556 bluestore_shared_blob_t shared_blob(sbid);
5557 bufferlist bl = it->value();
5558 bufferlist::iterator blp = bl.begin();
5559 ::decode(shared_blob, blp);
5560 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
5561 if (shared_blob.ref_map != sbi.ref_map) {
5562 derr << __func__ << " error: shared blob 0x" << std::hex << sbid
5563 << std::dec << " ref_map " << shared_blob.ref_map
5564 << " != expected " << sbi.ref_map << dendl;
5565 ++errors;
5566 }
5567 PExtentVector extents;
5568 for (auto &r : shared_blob.ref_map.ref_map) {
5569 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
5570 }
5571 errors += _fsck_check_extents(p->second.oids.front(),
5572 extents,
5573 p->second.compressed,
5574 used_blocks, expected_statfs);
5575 sb_info.erase(p);
5576 }
5577 }
5578 }
5579 for (auto &p : sb_info) {
5580    derr << __func__ << " error: shared_blob 0x" << std::hex << p.first << std::dec
5581 << " key is missing (" << *p.second.sb << ")" << dendl;
5582 ++errors;
5583 }
5584 if (!(actual_statfs == expected_statfs)) {
5585 derr << __func__ << " error: actual " << actual_statfs
5586 << " != expected " << expected_statfs << dendl;
5587 ++errors;
5588 }
5589
5590 dout(1) << __func__ << " checking for stray omap data" << dendl;
5591 it = db->get_iterator(PREFIX_OMAP);
5592 if (it) {
5593 for (it->lower_bound(string()); it->valid(); it->next()) {
5594 uint64_t omap_head;
5595 _key_decode_u64(it->key().c_str(), &omap_head);
5596 if (used_omap_head.count(omap_head) == 0) {
5597 derr << __func__ << " error: found stray omap data on omap_head "
5598 << omap_head << dendl;
5599 ++errors;
5600 }
5601 }
5602 }
5603
5604 dout(1) << __func__ << " checking deferred events" << dendl;
5605 it = db->get_iterator(PREFIX_DEFERRED);
5606 if (it) {
5607 for (it->lower_bound(string()); it->valid(); it->next()) {
5608 bufferlist bl = it->value();
5609 bufferlist::iterator p = bl.begin();
5610 bluestore_deferred_transaction_t wt;
5611 try {
5612 ::decode(wt, p);
5613 } catch (buffer::error& e) {
5614 derr << __func__ << " error: failed to decode deferred txn "
5615 << pretty_binary_string(it->key()) << dendl;
5616 r = -EIO;
5617 goto out_scan;
5618 }
5619 dout(20) << __func__ << " deferred " << wt.seq
5620 << " ops " << wt.ops.size()
5621 << " released 0x" << std::hex << wt.released << std::dec << dendl;
5622 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
5623 apply(
5624 e.get_start(), e.get_len(), block_size, used_blocks, "deferred",
5625 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5626 bs.set(pos);
5627 }
5628 );
5629 }
5630 }
5631 }
5632
5633 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
5634 {
5635 // remove bluefs_extents from used set since the freelist doesn't
5636 // know they are allocated.
5637 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5638 apply(
5639 e.get_start(), e.get_len(), block_size, used_blocks, "bluefs_extents",
5640 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5641 bs.reset(pos);
5642 }
5643 );
5644 }
5645 fm->enumerate_reset();
5646 uint64_t offset, length;
5647 while (fm->enumerate_next(&offset, &length)) {
5648 bool intersects = false;
5649 apply(
5650 offset, length, block_size, used_blocks, "free",
5651 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5652 if (bs.test(pos)) {
5653 intersects = true;
5654 } else {
5655 bs.set(pos);
5656 }
5657 }
5658 );
5659 if (intersects) {
5660 derr << __func__ << " error: free extent 0x" << std::hex << offset
5661 << "~" << length << std::dec
5662 << " intersects allocated blocks" << dendl;
5663 ++errors;
5664 }
5665 }
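// every block should now be set: either marked used by the preceding
// scans or marked free by the freelist; any bit still clear is space
// that is neither referenced nor free, i.e. leaked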
5666 size_t count = used_blocks.count();
5667 if (used_blocks.size() != count) {
5668 assert(used_blocks.size() > count);
5669 derr << __func__ << " error: leaked some space; "
5670 << (used_blocks.size() - count) * min_alloc_size
5671 << " bytes leaked" << dendl;
5672 ++errors;
5673 }
5674 }
5675
5676 out_scan:
5677 mempool_thread.shutdown();
5678 flush_cache();
5679 out_alloc:
5680 _close_alloc();
5681 out_fm:
5682 _close_fm();
5683 out_db:
5684 it.reset(); // before db is closed
5685 _close_db();
5686 out_bdev:
5687 _close_bdev();
5688 out_fsid:
5689 _close_fsid();
5690 out_path:
5691 _close_path();
5692
5693 // fatal errors take precedence
5694 if (r < 0)
5695 return r;
5696
5697 dout(2) << __func__ << " " << num_objects << " objects, "
5698 << num_sharded_objects << " of them sharded. "
5699 << dendl;
5700 dout(2) << __func__ << " " << num_extents << " extents to "
5701 << num_blobs << " blobs, "
5702 << num_spanning_blobs << " spanning, "
5703 << num_shared_blobs << " shared."
5704 << dendl;
5705
5706 utime_t duration = ceph_clock_now() - start;
5707 dout(1) << __func__ << " finish with " << errors << " errors in "
5708 << duration << " seconds" << dendl;
5709 return errors;
5710}
5711
5712void BlueStore::collect_metadata(map<string,string> *pm)
5713{
5714 dout(10) << __func__ << dendl;
5715 bdev->collect_metadata("bluestore_bdev_", pm);
5716 if (bluefs) {
5717 (*pm)["bluefs"] = "1";
5718 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
5719 bluefs->collect_metadata(pm);
5720 } else {
5721 (*pm)["bluefs"] = "0";
5722 }
5723}
5724
5725int BlueStore::statfs(struct store_statfs_t *buf)
5726{
5727 buf->reset();
5728 buf->total = bdev->get_size();
5729 buf->available = alloc->get_free();
5730
5731 if (bluefs) {
5732 // part of our shared device is "free" according to BlueFS
5733 // Don't include bluestore_bluefs_min because that space can't
5734 // be used for any other purpose.
5735 buf->available += bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;
5736
5737 // include dedicated db, too, if that isn't the shared device.
5738 if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
5739 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
5740 }
5741 }
5742
5743 bufferlist bl;
5744 int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
5745 if (r >= 0) {
5746 TransContext::volatile_statfs vstatfs;
5747 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
5748 auto it = bl.begin();
5749 vstatfs.decode(it);
5750
5751 buf->allocated = vstatfs.allocated();
5752 buf->stored = vstatfs.stored();
5753 buf->compressed = vstatfs.compressed();
5754 buf->compressed_original = vstatfs.compressed_original();
5755 buf->compressed_allocated = vstatfs.compressed_allocated();
5756 } else {
5757 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
5758 }
5759 } else {
5760 dout(10) << __func__ << " store_statfs missing, using empty" << dendl;
5761 }
5762
5763
5764 dout(20) << __func__ << " " << *buf << dendl;
5765 return 0;
5766}
5767
5768// ---------------
5769// cache
5770
5771BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
5772{
5773 RWLock::RLocker l(coll_lock);
5774 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
5775 if (cp == coll_map.end())
5776 return CollectionRef();
5777 return cp->second;
5778}
5779
5780void BlueStore::_queue_reap_collection(CollectionRef& c)
5781{
5782 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
5783 std::lock_guard<std::mutex> l(reap_lock);
5784 removed_collections.push_back(c);
5785}
5786
5787void BlueStore::_reap_collections()
5788{
5789 list<CollectionRef> removed_colls;
5790 {
5791 std::lock_guard<std::mutex> l(reap_lock);
5792 removed_colls.swap(removed_collections);
5793 }
5794
5795 bool all_reaped = true;
5796
5797 for (list<CollectionRef>::iterator p = removed_colls.begin();
5798 p != removed_colls.end();
5799 ++p) {
5800 CollectionRef c = *p;
5801 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
5802 if (c->onode_map.map_any([&](OnodeRef o) {
5803 assert(!o->exists);
5804 if (o->flushing_count.load()) {
5805 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
5806 << " flush_txns " << o->flushing_count << dendl;
5807 return false;
5808 }
5809 return true;
5810 })) {
5811 all_reaped = false;
5812 continue;
5813 }
5814 c->onode_map.clear();
5815 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
5816 }
5817
5818 if (all_reaped) {
5819 dout(10) << __func__ << " all reaped" << dendl;
5820 }
5821}
5822
5823void BlueStore::_update_cache_logger()
5824{
5825 uint64_t num_onodes = 0;
5826 uint64_t num_extents = 0;
5827 uint64_t num_blobs = 0;
5828 uint64_t num_buffers = 0;
5829 uint64_t num_buffer_bytes = 0;
5830 for (auto c : cache_shards) {
5831 c->add_stats(&num_onodes, &num_extents, &num_blobs,
5832 &num_buffers, &num_buffer_bytes);
5833 }
5834 logger->set(l_bluestore_onodes, num_onodes);
5835 logger->set(l_bluestore_extents, num_extents);
5836 logger->set(l_bluestore_blobs, num_blobs);
5837 logger->set(l_bluestore_buffers, num_buffers);
5838 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
5839}
5840
5841// ---------------
5842// read operations
5843
5844ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
5845{
5846 return _get_collection(cid);
5847}
5848
5849bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
5850{
5851 CollectionHandle c = _get_collection(cid);
5852 if (!c)
5853 return false;
5854 return exists(c, oid);
5855}
5856
5857bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
5858{
5859 Collection *c = static_cast<Collection *>(c_.get());
5860 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
5861 if (!c->exists)
5862 return false;
5863
5864 bool r = true;
5865
5866 {
5867 RWLock::RLocker l(c->lock);
5868 OnodeRef o = c->get_onode(oid, false);
5869 if (!o || !o->exists)
5870 r = false;
5871 }
5872
5873 c->trim_cache();
5874 return r;
5875}
5876
5877int BlueStore::stat(
5878 const coll_t& cid,
5879 const ghobject_t& oid,
5880 struct stat *st,
5881 bool allow_eio)
5882{
5883 CollectionHandle c = _get_collection(cid);
5884 if (!c)
5885 return -ENOENT;
5886 return stat(c, oid, st, allow_eio);
5887}
5888
5889int BlueStore::stat(
5890 CollectionHandle &c_,
5891 const ghobject_t& oid,
5892 struct stat *st,
5893 bool allow_eio)
5894{
5895 Collection *c = static_cast<Collection *>(c_.get());
5896 if (!c->exists)
5897 return -ENOENT;
5898 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
5899
5900 {
5901 RWLock::RLocker l(c->lock);
5902 OnodeRef o = c->get_onode(oid, false);
5903 if (!o || !o->exists)
5904 return -ENOENT;
5905 st->st_size = o->onode.size;
5906 st->st_blksize = 4096;
5907 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
5908 st->st_nlink = 1;
5909 }
5910
5911 c->trim_cache();
5912 int r = 0;
5913 if (_debug_mdata_eio(oid)) {
5914 r = -EIO;
5915 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
5916 }
5917 return r;
5918}
5919int BlueStore::set_collection_opts(
5920 const coll_t& cid,
5921 const pool_opts_t& opts)
5922{
5923 CollectionHandle ch = _get_collection(cid);
5924 if (!ch)
5925 return -ENOENT;
5926 Collection *c = static_cast<Collection *>(ch.get());
5927 dout(15) << __func__ << " " << cid << " options " << opts << dendl;
5928 if (!c->exists)
5929 return -ENOENT;
5930 RWLock::WLocker l(c->lock);
5931 c->pool_opts = opts;
5932 return 0;
5933}
5934
5935int BlueStore::read(
5936 const coll_t& cid,
5937 const ghobject_t& oid,
5938 uint64_t offset,
5939 size_t length,
5940 bufferlist& bl,
5941 uint32_t op_flags,
5942 bool allow_eio)
5943{
5944 CollectionHandle c = _get_collection(cid);
5945 if (!c)
5946 return -ENOENT;
5947 return read(c, oid, offset, length, bl, op_flags, allow_eio);
5948}
5949
5950int BlueStore::read(
5951 CollectionHandle &c_,
5952 const ghobject_t& oid,
5953 uint64_t offset,
5954 size_t length,
5955 bufferlist& bl,
5956 uint32_t op_flags,
5957 bool allow_eio)
5958{
5959 utime_t start = ceph_clock_now();
5960 Collection *c = static_cast<Collection *>(c_.get());
5961 const coll_t &cid = c->get_cid();
5962 dout(15) << __func__ << " " << cid << " " << oid
5963 << " 0x" << std::hex << offset << "~" << length << std::dec
5964 << dendl;
5965 if (!c->exists)
5966 return -ENOENT;
5967
5968 bl.clear();
5969 int r;
5970 {
5971 RWLock::RLocker l(c->lock);
5972 utime_t start1 = ceph_clock_now();
5973 OnodeRef o = c->get_onode(oid, false);
5974 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
5975 if (!o || !o->exists) {
5976 r = -ENOENT;
5977 goto out;
5978 }
5979
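// offset == 0 and length == 0 means "read the whole object"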
5980 if (offset == length && offset == 0)
5981 length = o->onode.size;
5982
5983 r = _do_read(c, o, offset, length, bl, op_flags);
5984 }
5985
5986 out:
5987 assert(allow_eio || r != -EIO);
5988 c->trim_cache();
5989 if (r == 0 && _debug_data_eio(oid)) {
5990 r = -EIO;
5991 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
5992 }
5993 dout(10) << __func__ << " " << cid << " " << oid
5994 << " 0x" << std::hex << offset << "~" << length << std::dec
5995 << " = " << r << dendl;
5996 logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
5997 return r;
5998}
5999
6000// --------------------------------------------------------
6001// intermediate data structures used while reading
6002struct region_t {
6003 uint64_t logical_offset;
6004 uint64_t blob_xoffset; //region offset within the blob
6005 uint64_t length;
6006 bufferlist bl;
6007
6008 // used later in read process
6009 uint64_t front = 0;
6010 uint64_t r_off = 0;
6011
6012 region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
6013 : logical_offset(offset),
6014 blob_xoffset(b_offs),
6015 length(len){}
6016 region_t(const region_t& from)
6017 : logical_offset(from.logical_offset),
6018 blob_xoffset(from.blob_xoffset),
6019 length(from.length){}
6020
6021 friend ostream& operator<<(ostream& out, const region_t& r) {
6022 return out << "0x" << std::hex << r.logical_offset << ":"
6023 << r.blob_xoffset << "~" << r.length << std::dec;
6024 }
6025};
6026
6027typedef list<region_t> regions2read_t;
6028typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
6029
6030int BlueStore::_do_read(
6031 Collection *c,
6032 OnodeRef o,
6033 uint64_t offset,
6034 size_t length,
6035 bufferlist& bl,
6036 uint32_t op_flags)
6037{
6038 FUNCTRACE();
6039 boost::intrusive::set<Extent>::iterator ep, eend;
6040 int r = 0;
6041
6042 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6043 << " size 0x" << o->onode.size << " (" << std::dec
6044 << o->onode.size << ")" << dendl;
6045 bl.clear();
6046
6047 if (offset >= o->onode.size) {
6048 return r;
6049 }
6050
6051 // generally, don't buffer anything, unless the client explicitly requests
6052 // it.
6053 bool buffered = false;
6054 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
6055 dout(20) << __func__ << " will do buffered read" << dendl;
6056 buffered = true;
6057 } else if (cct->_conf->bluestore_default_buffered_read &&
6058 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
6059 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
6060 dout(20) << __func__ << " defaulting to buffered read" << dendl;
6061 buffered = true;
6062 }
6063
6064 if (offset + length > o->onode.size) {
6065 length = o->onode.size - offset;
6066 }
6067
6068 utime_t start = ceph_clock_now();
6069 o->extent_map.fault_range(db, offset, length);
6070 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
6071 _dump_onode(o);
6072
6073 ready_regions_t ready_regions;
6074
6075 // build blob-wise list of the data to read (that isn't already cached)
6076 blobs2read_t blobs2read;
6077 unsigned left = length;
6078 uint64_t pos = offset;
6079 unsigned num_regions = 0;
6080 auto lp = o->extent_map.seek_lextent(offset);
6081 while (left > 0 && lp != o->extent_map.extent_map.end()) {
6082 if (pos < lp->logical_offset) {
6083 unsigned hole = lp->logical_offset - pos;
6084 if (hole >= left) {
6085 break;
6086 }
6087 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
6088 << std::dec << dendl;
6089 pos += hole;
6090 left -= hole;
6091 }
6092 BlobRef bptr = lp->blob;
6093 unsigned l_off = pos - lp->logical_offset;
6094 unsigned b_off = l_off + lp->blob_offset;
6095 unsigned b_len = std::min(left, lp->length - l_off);
6096
6097 ready_regions_t cache_res;
6098 interval_set<uint32_t> cache_interval;
6099 bptr->shared_blob->bc.read(
6100 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
6101 dout(20) << __func__ << " blob " << *bptr << std::hex
6102 << " need 0x" << b_off << "~" << b_len
6103 << " cache has 0x" << cache_interval
6104 << std::dec << dendl;
6105
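// walk the needed blob range: pieces already in the buffer cache are
// claimed into ready_regions; the gaps are queued in blobs2read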
6106 auto pc = cache_res.begin();
6107 while (b_len > 0) {
6108 unsigned l;
6109 if (pc != cache_res.end() &&
6110 pc->first == b_off) {
6111 l = pc->second.length();
6112 ready_regions[pos].claim(pc->second);
6113 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
6114 << b_off << "~" << l << std::dec << dendl;
6115 ++pc;
6116 } else {
6117 l = b_len;
6118 if (pc != cache_res.end()) {
6119 assert(pc->first > b_off);
6120 l = pc->first - b_off;
6121 }
6122 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
6123 << b_off << "~" << l << std::dec << dendl;
6124 blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
6125 ++num_regions;
6126 }
6127 pos += l;
6128 b_off += l;
6129 left -= l;
6130 b_len -= l;
6131 }
6132 ++lp;
6133 }
6134
6135 // read raw blob data. use aio if we have >1 blobs to read.
6136 start = ceph_clock_now(); // for simplicity, measure the whole
6137 // read block below; the resulting error
6138 // in the latency metric is small.
6139 vector<bufferlist> compressed_blob_bls;
6140 IOContext ioc(cct, NULL);
6141 for (auto& p : blobs2read) {
6142 BlobRef bptr = p.first;
6143 dout(20) << __func__ << " blob " << *bptr << std::hex
6144 << " need " << p.second << std::dec << dendl;
6145 if (bptr->get_blob().is_compressed()) {
6146 // read the whole thing
6147 if (compressed_blob_bls.empty()) {
6148 // ensure we avoid any reallocation on subsequent blobs
6149 compressed_blob_bls.reserve(blobs2read.size());
6150 }
6151 compressed_blob_bls.push_back(bufferlist());
6152 bufferlist& bl = compressed_blob_bls.back();
6153 r = bptr->get_blob().map(
6154 0, bptr->get_blob().get_ondisk_length(),
6155 [&](uint64_t offset, uint64_t length) {
6156 int r;
6157 // use aio if there are more regions to read than those in this blob
6158 if (num_regions > p.second.size()) {
6159 r = bdev->aio_read(offset, length, &bl, &ioc);
6160 } else {
6161 r = bdev->read(offset, length, &bl, &ioc, false);
6162 }
6163 if (r < 0)
6164 return r;
6165 return 0;
6166 });
6167 assert(r == 0);
6168 } else {
6169 // read the pieces
6170 for (auto& reg : p.second) {
6171 // determine how much of the blob to read
6172 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
6173 reg.r_off = reg.blob_xoffset;
6174 uint64_t r_len = reg.length;
6175 reg.front = reg.r_off % chunk_size;
6176 if (reg.front) {
6177 reg.r_off -= reg.front;
6178 r_len += reg.front;
6179 }
6180 unsigned tail = r_len % chunk_size;
6181 if (tail) {
6182 r_len += chunk_size - tail;
6183 }
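// e.g. (hypothetical numbers) with chunk_size 0x1000, a request at blob
// offset 0x1100 for 0x300 bytes becomes a read of 0x1000~0x1000; the
// extra 0x100 bytes of 'front' are pruned from the result further below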
6184 dout(20) << __func__ << " region 0x" << std::hex
6185 << reg.logical_offset
6186 << ": 0x" << reg.blob_xoffset << "~" << reg.length
6187 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
6188 << dendl;
6189
6190 // read it
6191 r = bptr->get_blob().map(
6192 reg.r_off, r_len,
6193 [&](uint64_t offset, uint64_t length) {
6194 int r;
6195 // use aio if there is more than one region to read
6196 if (num_regions > 1) {
6197 r = bdev->aio_read(offset, length, &reg.bl, &ioc);
6198 } else {
6199 r = bdev->read(offset, length, &reg.bl, &ioc, false);
6200 }
6201 if (r < 0)
6202 return r;
6203 return 0;
6204 });
6205 assert(r == 0);
6206 assert(reg.bl.length() == r_len);
6207 }
6208 }
6209 }
6210 if (ioc.has_pending_aios()) {
6211 bdev->aio_submit(&ioc);
6212 dout(20) << __func__ << " waiting for aio" << dendl;
6213 ioc.aio_wait();
6214 }
6215 logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
6216
6217 // enumerate and decompress desired blobs
6218 auto p = compressed_blob_bls.begin();
6219 blobs2read_t::iterator b2r_it = blobs2read.begin();
6220 while (b2r_it != blobs2read.end()) {
6221 BlobRef bptr = b2r_it->first;
6222 dout(20) << __func__ << " blob " << *bptr << std::hex
6223 << " need 0x" << b2r_it->second << std::dec << dendl;
6224 if (bptr->get_blob().is_compressed()) {
6225 assert(p != compressed_blob_bls.end());
6226 bufferlist& compressed_bl = *p++;
6227 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
6228 b2r_it->second.front().logical_offset) < 0) {
6229 return -EIO;
6230 }
6231 bufferlist raw_bl;
6232 r = _decompress(compressed_bl, &raw_bl);
6233 if (r < 0)
6234 return r;
6235 if (buffered) {
6236 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
6237 raw_bl);
6238 }
6239 for (auto& i : b2r_it->second) {
6240 ready_regions[i.logical_offset].substr_of(
6241 raw_bl, i.blob_xoffset, i.length);
6242 }
6243 } else {
6244 for (auto& reg : b2r_it->second) {
6245 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
6246 reg.logical_offset) < 0) {
6247 return -EIO;
6248 }
6249 if (buffered) {
6250 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
6251 reg.r_off, reg.bl);
6252 }
6253
6254 // prune and keep result
6255 ready_regions[reg.logical_offset].substr_of(
6256 reg.bl, reg.front, reg.length);
6257 }
6258 }
6259 ++b2r_it;
6260 }
6261
6262 // generate a resulting buffer
6263 auto pr = ready_regions.begin();
6264 auto pr_end = ready_regions.end();
6265 pos = 0;
6266 while (pos < length) {
6267 if (pr != pr_end && pr->first == pos + offset) {
6268 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6269 << ": data from 0x" << pr->first << "~" << pr->second.length()
6270 << std::dec << dendl;
6271 pos += pr->second.length();
6272 bl.claim_append(pr->second);
6273 ++pr;
6274 } else {
6275 uint64_t l = length - pos;
6276 if (pr != pr_end) {
6277 assert(pr->first > pos + offset);
6278 l = pr->first - (pos + offset);
6279 }
6280 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6281 << ": zeros for 0x" << (pos + offset) << "~" << l
6282 << std::dec << dendl;
6283 bl.append_zero(l);
6284 pos += l;
6285 }
6286 }
6287 assert(bl.length() == length);
6288 assert(pos == length);
6289 assert(pr == pr_end);
6290 r = bl.length();
6291 return r;
6292}
6293
6294int BlueStore::_verify_csum(OnodeRef& o,
6295 const bluestore_blob_t* blob, uint64_t blob_xoffset,
6296 const bufferlist& bl,
6297 uint64_t logical_offset) const
6298{
6299 int bad;
6300 uint64_t bad_csum;
6301 utime_t start = ceph_clock_now();
6302 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
6303 if (r < 0) {
6304 if (r == -1) {
6305 PExtentVector pex;
6306 blob->map(
6307 bad,
6308 blob->get_csum_chunk_size(),
6309 [&](uint64_t offset, uint64_t length) {
6310 pex.emplace_back(bluestore_pextent_t(offset, length));
6311 return 0;
6312 });
6313 derr << __func__ << " bad "
6314 << Checksummer::get_csum_type_string(blob->csum_type)
6315 << "/0x" << std::hex << blob->get_csum_chunk_size()
6316 << " checksum at blob offset 0x" << bad
6317 << ", got 0x" << bad_csum << ", expected 0x"
6318 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
6319 << ", device location " << pex
6320 << ", logical extent 0x" << std::hex
6321 << (logical_offset + bad - blob_xoffset) << "~"
6322 << blob->get_csum_chunk_size() << std::dec
6323 << ", object " << o->oid
6324 << dendl;
6325 } else {
6326 derr << __func__ << " failed with error: " << cpp_strerror(r) << dendl;
6327 }
6328 }
6329 logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
6330 return r;
6331}
6332
6333int BlueStore::_decompress(bufferlist& source, bufferlist* result)
6334{
6335 int r = 0;
6336 utime_t start = ceph_clock_now();
6337 bufferlist::iterator i = source.begin();
6338 bluestore_compression_header_t chdr;
6339 ::decode(chdr, i);
6340 int alg = int(chdr.type);
6341 CompressorRef cp = compressor;
6342 if (!cp || (int)cp->get_type() != alg) {
6343 cp = Compressor::create(cct, alg);
6344 }
6345
6346 if (!cp.get()) {
6347 // if the compressor isn't available, fail with -EIO: we cannot
6348 // reconstruct the decompressed data without it
6349 derr << __func__ << " can't load decompressor " << alg << dendl;
6350 r = -EIO;
6351 } else {
6352 r = cp->decompress(i, chdr.length, *result);
6353 if (r < 0) {
6354 derr << __func__ << " decompression failed with error code " << r << dendl;
6355 r = -EIO;
6356 }
6357 }
6358 logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
6359 return r;
6360}
6361
6362 // this computes the fiemap into an interval_set; the other variants
6363 // are implemented on top of it
6364int BlueStore::_fiemap(
6365 CollectionHandle &c_,
6366 const ghobject_t& oid,
6367 uint64_t offset,
6368 size_t length,
6369 interval_set<uint64_t>& destset)
6370{
6371 Collection *c = static_cast<Collection *>(c_.get());
6372 if (!c->exists)
6373 return -ENOENT;
6374 {
6375 RWLock::RLocker l(c->lock);
6376
6377 OnodeRef o = c->get_onode(oid, false);
6378 if (!o || !o->exists) {
6379 return -ENOENT;
6380 }
6381 _dump_onode(o);
6382
6383 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6384 << " size 0x" << o->onode.size << std::dec << dendl;
6385
6386 boost::intrusive::set<Extent>::iterator ep, eend;
6387 if (offset >= o->onode.size)
6388 goto out;
6389
6390 if (offset + length > o->onode.size) {
6391 length = o->onode.size - offset;
6392 }
6393
6394 o->extent_map.fault_range(db, offset, length);
6395 eend = o->extent_map.extent_map.end();
6396 ep = o->extent_map.seek_lextent(offset);
6397 while (length > 0) {
6398 dout(20) << __func__ << " offset " << offset << dendl;
6399 if (ep != eend && ep->logical_offset + ep->length <= offset) {
6400 ++ep;
6401 continue;
6402 }
6403
6404 uint64_t x_len = length;
6405 if (ep != eend && ep->logical_offset <= offset) {
6406 uint64_t x_off = offset - ep->logical_offset;
6407 x_len = MIN(x_len, ep->length - x_off);
6408 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
6409 << x_len << std::dec << " blob " << ep->blob << dendl;
6410 destset.insert(offset, x_len);
6411 length -= x_len;
6412 offset += x_len;
6413 if (x_off + x_len == ep->length)
6414 ++ep;
6415 continue;
6416 }
6417 if (ep != eend &&
6418 ep->logical_offset > offset &&
6419 ep->logical_offset - offset < x_len) {
6420 x_len = ep->logical_offset - offset;
6421 }
6422 offset += x_len;
6423 length -= x_len;
6424 }
6425 }
6426
6427 out:
6428 c->trim_cache();
6429 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6430 << " size = 0x(" << destset << ")" << std::dec << dendl;
6431 return 0;
6432}
6433
6434int BlueStore::fiemap(
6435 const coll_t& cid,
6436 const ghobject_t& oid,
6437 uint64_t offset,
6438 size_t len,
6439 bufferlist& bl)
6440{
6441 CollectionHandle c = _get_collection(cid);
6442 if (!c)
6443 return -ENOENT;
6444 return fiemap(c, oid, offset, len, bl);
6445}
6446
6447int BlueStore::fiemap(
6448 CollectionHandle &c_,
6449 const ghobject_t& oid,
6450 uint64_t offset,
6451 size_t length,
6452 bufferlist& bl)
6453{
6454 interval_set<uint64_t> m;
6455 int r = _fiemap(c_, oid, offset, length, m);
6456 if (r >= 0) {
6457 ::encode(m, bl);
6458 }
6459 return r;
6460}
6461
6462int BlueStore::fiemap(
6463 const coll_t& cid,
6464 const ghobject_t& oid,
6465 uint64_t offset,
6466 size_t len,
6467 map<uint64_t, uint64_t>& destmap)
6468{
6469 CollectionHandle c = _get_collection(cid);
6470 if (!c)
6471 return -ENOENT;
6472 return fiemap(c, oid, offset, len, destmap);
6473}
6474
6475int BlueStore::fiemap(
6476 CollectionHandle &c_,
6477 const ghobject_t& oid,
6478 uint64_t offset,
6479 size_t length,
6480 map<uint64_t, uint64_t>& destmap)
6481{
6482 interval_set<uint64_t> m;
6483 int r = _fiemap(c_, oid, offset, length, m);
6484 if (r >= 0) {
6485 m.move_into(destmap);
6486 }
6487 return r;
6488}
6489
6490int BlueStore::getattr(
6491 const coll_t& cid,
6492 const ghobject_t& oid,
6493 const char *name,
6494 bufferptr& value)
6495{
6496 CollectionHandle c = _get_collection(cid);
6497 if (!c)
6498 return -ENOENT;
6499 return getattr(c, oid, name, value);
6500}
6501
6502int BlueStore::getattr(
6503 CollectionHandle &c_,
6504 const ghobject_t& oid,
6505 const char *name,
6506 bufferptr& value)
6507{
6508 Collection *c = static_cast<Collection *>(c_.get());
6509 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
6510 if (!c->exists)
6511 return -ENOENT;
6512
6513 int r;
6514 {
6515 RWLock::RLocker l(c->lock);
6516 mempool::bluestore_meta_other::string k(name);
6517
6518 OnodeRef o = c->get_onode(oid, false);
6519 if (!o || !o->exists) {
6520 r = -ENOENT;
6521 goto out;
6522 }
6523
6524 if (!o->onode.attrs.count(k)) {
6525 r = -ENODATA;
6526 goto out;
6527 }
6528 value = o->onode.attrs[k];
6529 r = 0;
6530 }
6531 out:
6532 c->trim_cache();
6533 if (r == 0 && _debug_mdata_eio(oid)) {
6534 r = -EIO;
6535 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6536 }
6537 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
6538 << " = " << r << dendl;
6539 return r;
6540}
6541
6542
6543int BlueStore::getattrs(
6544 const coll_t& cid,
6545 const ghobject_t& oid,
6546 map<string,bufferptr>& aset)
6547{
6548 CollectionHandle c = _get_collection(cid);
6549 if (!c)
6550 return -ENOENT;
6551 return getattrs(c, oid, aset);
6552}
6553
6554int BlueStore::getattrs(
6555 CollectionHandle &c_,
6556 const ghobject_t& oid,
6557 map<string,bufferptr>& aset)
6558{
6559 Collection *c = static_cast<Collection *>(c_.get());
6560 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
6561 if (!c->exists)
6562 return -ENOENT;
6563
6564 int r;
6565 {
6566 RWLock::RLocker l(c->lock);
6567
6568 OnodeRef o = c->get_onode(oid, false);
6569 if (!o || !o->exists) {
6570 r = -ENOENT;
6571 goto out;
6572 }
6573 for (auto& i : o->onode.attrs) {
6574 aset.emplace(i.first.c_str(), i.second);
6575 }
6576 r = 0;
6577 }
6578
6579 out:
6580 c->trim_cache();
6581 if (r == 0 && _debug_mdata_eio(oid)) {
6582 r = -EIO;
6583 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6584 }
6585 dout(10) << __func__ << " " << c->cid << " " << oid
6586 << " = " << r << dendl;
6587 return r;
6588}
6589
6590int BlueStore::list_collections(vector<coll_t>& ls)
6591{
6592 RWLock::RLocker l(coll_lock);
6593 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
6594 p != coll_map.end();
6595 ++p)
6596 ls.push_back(p->first);
6597 return 0;
6598}
6599
6600bool BlueStore::collection_exists(const coll_t& c)
6601{
6602 RWLock::RLocker l(coll_lock);
6603 return coll_map.count(c);
6604}
6605
6606int BlueStore::collection_empty(const coll_t& cid, bool *empty)
6607{
6608 dout(15) << __func__ << " " << cid << dendl;
6609 vector<ghobject_t> ls;
6610 ghobject_t next;
6611 int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
6612 &ls, &next);
6613 if (r < 0) {
6614 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
6615 << dendl;
6616 return r;
6617 }
6618 *empty = ls.empty();
6619 dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
6620 return 0;
6621}
6622
6623int BlueStore::collection_bits(const coll_t& cid)
6624{
6625 dout(15) << __func__ << " " << cid << dendl;
6626 CollectionRef c = _get_collection(cid);
6627 if (!c)
6628 return -ENOENT;
6629 RWLock::RLocker l(c->lock);
6630 dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
6631 return c->cnode.bits;
6632}
6633
6634int BlueStore::collection_list(
6635 const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
6636 vector<ghobject_t> *ls, ghobject_t *pnext)
6637{
6638 CollectionHandle c = _get_collection(cid);
6639 if (!c)
6640 return -ENOENT;
6641 return collection_list(c, start, end, max, ls, pnext);
6642}
6643
6644int BlueStore::collection_list(
6645 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
6646 vector<ghobject_t> *ls, ghobject_t *pnext)
6647{
6648 Collection *c = static_cast<Collection *>(c_.get());
6649 dout(15) << __func__ << " " << c->cid
6650 << " start " << start << " end " << end << " max " << max << dendl;
6651 int r;
6652 {
6653 RWLock::RLocker l(c->lock);
6654 r = _collection_list(c, start, end, max, ls, pnext);
6655 }
6656
6657 c->trim_cache();
6658 dout(10) << __func__ << " " << c->cid
6659 << " start " << start << " end " << end << " max " << max
6660 << " = " << r << ", ls.size() = " << ls->size()
6661 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
6662 return r;
6663}
6664
6665int BlueStore::_collection_list(
6666 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
6667 vector<ghobject_t> *ls, ghobject_t *pnext)
6668{
6669
6670 if (!c->exists)
6671 return -ENOENT;
6672
6673 int r = 0;
6674 ghobject_t static_next;
6675 KeyValueDB::Iterator it;
6676 string temp_start_key, temp_end_key;
6677 string start_key, end_key;
6678 bool set_next = false;
6679 string pend;
6680 bool temp;
6681
6682 if (!pnext)
6683 pnext = &static_next;
6684
6685 if (start == ghobject_t::get_max() ||
6686 start.hobj.is_max()) {
6687 goto out;
6688 }
6689 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
6690 &start_key, &end_key);
6691 dout(20) << __func__
6692 << " range " << pretty_binary_string(temp_start_key)
6693 << " to " << pretty_binary_string(temp_end_key)
6694 << " and " << pretty_binary_string(start_key)
6695 << " to " << pretty_binary_string(end_key)
6696 << " start " << start << dendl;
6697 it = db->get_iterator(PREFIX_OBJ);
6698 if (start == ghobject_t() ||
6699 start.hobj == hobject_t() ||
6700 start == c->cid.get_min_hobj()) {
6701 it->upper_bound(temp_start_key);
6702 temp = true;
6703 } else {
6704 string k;
6705 get_object_key(cct, start, &k);
6706 if (start.hobj.is_temp()) {
6707 temp = true;
6708 assert(k >= temp_start_key && k < temp_end_key);
6709 } else {
6710 temp = false;
6711 assert(k >= start_key && k < end_key);
6712 }
6713 dout(20) << " start from " << pretty_binary_string(k)
6714 << " temp=" << (int)temp << dendl;
6715 it->lower_bound(k);
6716 }
6717 if (end.hobj.is_max()) {
6718 pend = temp ? temp_end_key : end_key;
6719 } else {
6720 get_object_key(cct, end, &end_key);
6721 if (end.hobj.is_temp()) {
6722 if (temp)
6723 pend = end_key;
6724 else
6725 goto out;
6726 } else {
6727 pend = temp ? temp_end_key : end_key;
6728 }
6729 }
6730 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
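// walk the temp namespace first (when applicable), then fall through to
// the non-temp namespace; 'pend' bounds the keys of the current namespace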
6731 while (true) {
6732 if (!it->valid() || it->key() >= pend) {
6733 if (!it->valid())
6734 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
6735 else
6736 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
6737 << " >= " << pretty_binary_string(pend) << dendl;
6738 if (temp) {
6739 if (end.hobj.is_temp()) {
6740 break;
6741 }
6742 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
6743 temp = false;
6744 it->upper_bound(start_key);
6745 pend = end_key;
6746 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
6747 continue;
6748 }
6749 break;
6750 }
6751 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
6752 if (is_extent_shard_key(it->key())) {
6753 it->next();
6754 continue;
6755 }
6756 ghobject_t oid;
6757 int r = get_key_object(it->key(), &oid);
6758 assert(r == 0);
6759 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
6760 if (ls->size() >= (unsigned)max) {
6761 dout(20) << __func__ << " reached max " << max << dendl;
6762 *pnext = oid;
6763 set_next = true;
6764 break;
6765 }
6766 ls->push_back(oid);
6767 it->next();
6768 }
6769out:
6770 if (!set_next) {
6771 *pnext = ghobject_t::get_max();
6772 }
6773
6774 return r;
6775}
6776
6777// omap reads
6778
6779BlueStore::OmapIteratorImpl::OmapIteratorImpl(
6780 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
6781 : c(c), o(o), it(it)
6782{
6783 RWLock::RLocker l(c->lock);
6784 if (o->onode.has_omap()) {
6785 get_omap_key(o->onode.nid, string(), &head);
6786 get_omap_tail(o->onode.nid, &tail);
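// head and tail bracket this object's keys within the shared
// PREFIX_OMAP keyspace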
6787 it->lower_bound(head);
6788 }
6789}
6790
6791int BlueStore::OmapIteratorImpl::seek_to_first()
6792{
6793 RWLock::RLocker l(c->lock);
6794 if (o->onode.has_omap()) {
6795 it->lower_bound(head);
6796 } else {
6797 it = KeyValueDB::Iterator();
6798 }
6799 return 0;
6800}
6801
6802int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
6803{
6804 RWLock::RLocker l(c->lock);
6805 if (o->onode.has_omap()) {
6806 string key;
6807 get_omap_key(o->onode.nid, after, &key);
6808 it->upper_bound(key);
6809 } else {
6810 it = KeyValueDB::Iterator();
6811 }
6812 return 0;
6813}
6814
6815int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
6816{
6817 RWLock::RLocker l(c->lock);
6818 if (o->onode.has_omap()) {
6819 string key;
6820 get_omap_key(o->onode.nid, to, &key);
6821 it->lower_bound(key);
6822 } else {
6823 it = KeyValueDB::Iterator();
6824 }
6825 return 0;
6826}
6827
6828bool BlueStore::OmapIteratorImpl::valid()
6829{
6830 RWLock::RLocker l(c->lock);
6831 return o->onode.has_omap() && it && it->valid() && it->raw_key().second <= tail;
6832}
6833
6834int BlueStore::OmapIteratorImpl::next(bool validate)
6835{
6836 RWLock::RLocker l(c->lock);
6837 if (o->onode.has_omap()) {
6838 it->next();
6839 return 0;
6840 } else {
6841 return -1;
6842 }
6843}
6844
6845string BlueStore::OmapIteratorImpl::key()
6846{
6847 RWLock::RLocker l(c->lock);
6848 assert(it->valid());
6849 string db_key = it->raw_key().second;
6850 string user_key;
6851 decode_omap_key(db_key, &user_key);
6852 return user_key;
6853}
6854
6855bufferlist BlueStore::OmapIteratorImpl::value()
6856{
6857 RWLock::RLocker l(c->lock);
6858 assert(it->valid());
6859 return it->value();
6860}
6861
6862int BlueStore::omap_get(
6863 const coll_t& cid, ///< [in] Collection containing oid
6864 const ghobject_t &oid, ///< [in] Object containing omap
6865 bufferlist *header, ///< [out] omap header
6866 map<string, bufferlist> *out ///< [out] Key to value map
6867 )
6868{
6869 CollectionHandle c = _get_collection(cid);
6870 if (!c)
6871 return -ENOENT;
6872 return omap_get(c, oid, header, out);
6873}
6874
6875int BlueStore::omap_get(
6876 CollectionHandle &c_, ///< [in] Collection containing oid
6877 const ghobject_t &oid, ///< [in] Object containing omap
6878 bufferlist *header, ///< [out] omap header
6879 map<string, bufferlist> *out ///< [out] Key to value map
6880 )
6881{
6882 Collection *c = static_cast<Collection *>(c_.get());
6883 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
6884 if (!c->exists)
6885 return -ENOENT;
6886 RWLock::RLocker l(c->lock);
6887 int r = 0;
6888 OnodeRef o = c->get_onode(oid, false);
6889 if (!o || !o->exists) {
6890 r = -ENOENT;
6891 goto out;
6892 }
6893 if (!o->onode.has_omap())
6894 goto out;
6895 o->flush();
6896 {
6897 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
6898 string head, tail;
6899 get_omap_header(o->onode.nid, &head);
6900 get_omap_tail(o->onode.nid, &tail);
6901 it->lower_bound(head);
6902 while (it->valid()) {
6903 if (it->key() == head) {
6904 dout(30) << __func__ << " got header" << dendl;
6905 *header = it->value();
6906 } else if (it->key() >= tail) {
6907 dout(30) << __func__ << " reached tail" << dendl;
6908 break;
6909 } else {
6910 string user_key;
6911 decode_omap_key(it->key(), &user_key);
6912 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
6913 << " -> " << user_key << dendl;
6914 (*out)[user_key] = it->value();
6915 }
6916 it->next();
6917 }
6918 }
6919 out:
6920 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
6921 << dendl;
6922 return r;
6923}
6924
6925int BlueStore::omap_get_header(
6926 const coll_t& cid, ///< [in] Collection containing oid
6927 const ghobject_t &oid, ///< [in] Object containing omap
6928 bufferlist *header, ///< [out] omap header
6929 bool allow_eio ///< [in] don't assert on eio
6930 )
6931{
6932 CollectionHandle c = _get_collection(cid);
6933 if (!c)
6934 return -ENOENT;
6935 return omap_get_header(c, oid, header, allow_eio);
6936}
6937
6938int BlueStore::omap_get_header(
6939 CollectionHandle &c_, ///< [in] Collection containing oid
6940 const ghobject_t &oid, ///< [in] Object containing omap
6941 bufferlist *header, ///< [out] omap header
6942 bool allow_eio ///< [in] don't assert on eio
6943 )
6944{
6945 Collection *c = static_cast<Collection *>(c_.get());
6946 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
6947 if (!c->exists)
6948 return -ENOENT;
6949 RWLock::RLocker l(c->lock);
6950 int r = 0;
6951 OnodeRef o = c->get_onode(oid, false);
6952 if (!o || !o->exists) {
6953 r = -ENOENT;
6954 goto out;
6955 }
6956 if (!o->onode.has_omap())
6957 goto out;
6958 o->flush();
6959 {
6960 string head;
6961 get_omap_header(o->onode.nid, &head);
6962 if (db->get(PREFIX_OMAP, head, header) >= 0) {
6963 dout(30) << __func__ << " got header" << dendl;
6964 } else {
6965 dout(30) << __func__ << " no header" << dendl;
6966 }
6967 }
6968 out:
6969 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
6970 << dendl;
6971 return r;
6972}
6973
6974int BlueStore::omap_get_keys(
6975 const coll_t& cid, ///< [in] Collection containing oid
6976 const ghobject_t &oid, ///< [in] Object containing omap
6977 set<string> *keys ///< [out] Keys defined on oid
6978 )
6979{
6980 CollectionHandle c = _get_collection(cid);
6981 if (!c)
6982 return -ENOENT;
6983 return omap_get_keys(c, oid, keys);
6984}
6985
6986int BlueStore::omap_get_keys(
6987 CollectionHandle &c_, ///< [in] Collection containing oid
6988 const ghobject_t &oid, ///< [in] Object containing omap
6989 set<string> *keys ///< [out] Keys defined on oid
6990 )
6991{
6992 Collection *c = static_cast<Collection *>(c_.get());
6993 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
6994 if (!c->exists)
6995 return -ENOENT;
6996 RWLock::RLocker l(c->lock);
6997 int r = 0;
6998 OnodeRef o = c->get_onode(oid, false);
6999 if (!o || !o->exists) {
7000 r = -ENOENT;
7001 goto out;
7002 }
7003 if (!o->onode.has_omap())
7004 goto out;
7005 o->flush();
7006 {
7007 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7008 string head, tail;
7009 get_omap_key(o->onode.nid, string(), &head);
7010 get_omap_tail(o->onode.nid, &tail);
7011 it->lower_bound(head);
7012 while (it->valid()) {
7013 if (it->key() >= tail) {
7014 dout(30) << __func__ << " reached tail" << dendl;
7015 break;
7016 }
7017 string user_key;
7018 decode_omap_key(it->key(), &user_key);
7019 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7020 << " -> " << user_key << dendl;
7021 keys->insert(user_key);
7022 it->next();
7023 }
7024 }
7025 out:
7026 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7027 << dendl;
7028 return r;
7029}
7030
7031int BlueStore::omap_get_values(
7032 const coll_t& cid, ///< [in] Collection containing oid
7033 const ghobject_t &oid, ///< [in] Object containing omap
7034 const set<string> &keys, ///< [in] Keys to get
7035 map<string, bufferlist> *out ///< [out] Returned keys and values
7036 )
7037{
7038 CollectionHandle c = _get_collection(cid);
7039 if (!c)
7040 return -ENOENT;
7041 return omap_get_values(c, oid, keys, out);
7042}
7043
7044int BlueStore::omap_get_values(
7045 CollectionHandle &c_, ///< [in] Collection containing oid
7046 const ghobject_t &oid, ///< [in] Object containing omap
7047 const set<string> &keys, ///< [in] Keys to get
7048 map<string, bufferlist> *out ///< [out] Returned keys and values
7049 )
7050{
7051 Collection *c = static_cast<Collection *>(c_.get());
7052 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7053 if (!c->exists)
7054 return -ENOENT;
7055 RWLock::RLocker l(c->lock);
7056 int r = 0;
7057 string final_key;
7058 OnodeRef o = c->get_onode(oid, false);
7059 if (!o || !o->exists) {
7060 r = -ENOENT;
7061 goto out;
7062 }
7063 if (!o->onode.has_omap())
7064 goto out;
7065 o->flush();
7066 _key_encode_u64(o->onode.nid, &final_key);
7067 final_key.push_back('.');
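// the encoded u64 nid plus the '.' form a fixed 9-byte prefix; the
// resize(9) below keeps that prefix while appending each requested key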
7068 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7069 final_key.resize(9); // keep prefix
7070 final_key += *p;
7071 bufferlist val;
7072 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7073 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
7074 << " -> " << *p << dendl;
7075 out->insert(make_pair(*p, val));
7076 }
7077 }
7078 out:
7079 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7080 << dendl;
7081 return r;
7082}
7083
7084int BlueStore::omap_check_keys(
7085 const coll_t& cid, ///< [in] Collection containing oid
7086 const ghobject_t &oid, ///< [in] Object containing omap
7087 const set<string> &keys, ///< [in] Keys to check
7088 set<string> *out ///< [out] Subset of keys defined on oid
7089 )
7090{
7091 CollectionHandle c = _get_collection(cid);
7092 if (!c)
7093 return -ENOENT;
7094 return omap_check_keys(c, oid, keys, out);
7095}
7096
7097int BlueStore::omap_check_keys(
7098 CollectionHandle &c_, ///< [in] Collection containing oid
7099 const ghobject_t &oid, ///< [in] Object containing omap
7100 const set<string> &keys, ///< [in] Keys to check
7101 set<string> *out ///< [out] Subset of keys defined on oid
7102 )
7103{
7104 Collection *c = static_cast<Collection *>(c_.get());
7105 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7106 if (!c->exists)
7107 return -ENOENT;
7108 RWLock::RLocker l(c->lock);
7109 int r = 0;
7110 string final_key;
7111 OnodeRef o = c->get_onode(oid, false);
7112 if (!o || !o->exists) {
7113 r = -ENOENT;
7114 goto out;
7115 }
7116 if (!o->onode.has_omap())
7117 goto out;
7118 o->flush();
7119 _key_encode_u64(o->onode.nid, &final_key);
7120 final_key.push_back('.');
7121 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7122 final_key.resize(9); // keep prefix
7123 final_key += *p;
7124 bufferlist val;
7125 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7126 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
7127 << " -> " << *p << dendl;
7128 out->insert(*p);
7129 } else {
7130 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
7131 << " -> " << *p << dendl;
7132 }
7133 }
7134 out:
7135 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7136 << dendl;
7137 return r;
7138}
7139
7140ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7141 const coll_t& cid, ///< [in] collection
7142 const ghobject_t &oid ///< [in] object
7143 )
7144{
7145 CollectionHandle c = _get_collection(cid);
7146 if (!c) {
7147 dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
7148 return ObjectMap::ObjectMapIterator();
7149 }
7150 return get_omap_iterator(c, oid);
7151}
7152
7153ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7154 CollectionHandle &c_, ///< [in] collection
7155 const ghobject_t &oid ///< [in] object
7156 )
7157{
7158 Collection *c = static_cast<Collection *>(c_.get());
7159 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
7160 if (!c->exists) {
7161 return ObjectMap::ObjectMapIterator();
7162 }
7163 RWLock::RLocker l(c->lock);
7164 OnodeRef o = c->get_onode(oid, false);
7165 if (!o || !o->exists) {
7166 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
7167 return ObjectMap::ObjectMapIterator();
7168 }
7169 o->flush();
7170 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
7171 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7172 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
7173}
7174
7175// -----------------
7176// write helpers
7177
7178void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
7179{
7180 dout(10) << __func__ << " ondisk_format " << ondisk_format
7181 << " min_compat_ondisk_format " << min_compat_ondisk_format
7182 << dendl;
7183 assert(ondisk_format == latest_ondisk_format);
7184 {
7185 bufferlist bl;
7186 ::encode(ondisk_format, bl);
7187 t->set(PREFIX_SUPER, "ondisk_format", bl);
7188 }
7189 {
7190 bufferlist bl;
7191 ::encode(min_compat_ondisk_format, bl);
7192 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
7193 }
7194}
7195
7196int BlueStore::_open_super_meta()
7197{
7198 // nid
7199 {
7200 nid_max = 0;
7201 bufferlist bl;
7202 db->get(PREFIX_SUPER, "nid_max", &bl);
7203 bufferlist::iterator p = bl.begin();
7204 try {
7205 uint64_t v;
7206 ::decode(v, p);
7207 nid_max = v;
7208 } catch (buffer::error& e) {
7209 derr << __func__ << " unable to read nid_max" << dendl;
7210 return -EIO;
7211 }
7212 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
7213 nid_last = nid_max.load();
7214 }
7215
7216 // blobid
7217 {
7218 blobid_max = 0;
7219 bufferlist bl;
7220 db->get(PREFIX_SUPER, "blobid_max", &bl);
7221 bufferlist::iterator p = bl.begin();
7222 try {
7223 uint64_t v;
7224 ::decode(v, p);
7225 blobid_max = v;
7226 } catch (buffer::error& e) {
7227 derr << __func__ << " unable to read blobid_max" << dendl;
7228 return -EIO;
7229 }
7230 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
7231 blobid_last = blobid_max.load();
7232 }
7233
7234 // freelist
7235 {
7236 bufferlist bl;
7237 db->get(PREFIX_SUPER, "freelist_type", &bl);
7238 if (bl.length()) {
7239 freelist_type = std::string(bl.c_str(), bl.length());
7240 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
7241 } else {
7242 assert("extent freelist manager is not supported" == 0);
7243 }
7244 }
7245
7246 // bluefs alloc
7247 if (cct->_conf->bluestore_bluefs) {
7248 bluefs_extents.clear();
7249 bufferlist bl;
7250 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7251 bufferlist::iterator p = bl.begin();
7252 try {
7253 ::decode(bluefs_extents, p);
7254 }
7255 catch (buffer::error& e) {
7256 derr << __func__ << " unable to read bluefs_extents" << dendl;
7257 return -EIO;
7258 }
7259 dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
7260 << std::dec << dendl;
7261 }
7262
7263 // ondisk format
7264 int32_t compat_ondisk_format = 0;
7265 {
7266 bufferlist bl;
7267 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
7268 if (r < 0) {
7269 // base case: kraken bluestore is v1 and readable by v1
7270 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
7271 << dendl;
7272 ondisk_format = 1;
7273 compat_ondisk_format = 1;
7274 } else {
7275 auto p = bl.begin();
7276 try {
7277 ::decode(ondisk_format, p);
7278 } catch (buffer::error& e) {
7279 derr << __func__ << " unable to read ondisk_format" << dendl;
7280 return -EIO;
7281 }
7282 bl.clear();
7283 {
7284 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
7285 assert(!r);
7286 auto p = bl.begin();
7287 try {
7288 ::decode(compat_ondisk_format, p);
7289 } catch (buffer::error& e) {
7290 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
7291 return -EIO;
7292 }
7293 }
7294 }
7295 dout(10) << __func__ << " ondisk_format " << ondisk_format
7296 << " compat_ondisk_format " << compat_ondisk_format
7297 << dendl;
7298 }
7299
7300 if (latest_ondisk_format < compat_ondisk_format) {
7301 derr << __func__ << " compat_ondisk_format is "
7302 << compat_ondisk_format << " but we only understand version "
7303 << latest_ondisk_format << dendl;
7304 return -EPERM;
7305 }
7306 if (ondisk_format < latest_ondisk_format) {
7307 int r = _upgrade_super();
7308 if (r < 0) {
7309 return r;
7310 }
7311 }
7312
7313 {
7314 bufferlist bl;
7315 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
7316 auto p = bl.begin();
7317 try {
7318 uint64_t val;
7319 ::decode(val, p);
7320 min_alloc_size = val;
7321 } catch (buffer::error& e) {
7322 derr << __func__ << " unable to read min_alloc_size" << dendl;
7323 return -EIO;
7324 }
7325 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7326 << std::dec << dendl;
7327 }
7328 _set_alloc_sizes();
7329 _set_throttle_params();
7330
7331 _set_csum();
7332 _set_compression();
7333 _set_blob_size();
7334
7335 return 0;
7336}
7337
7338int BlueStore::_upgrade_super()
7339{
7340 dout(1) << __func__ << " from " << ondisk_format << ", latest "
7341 << latest_ondisk_format << dendl;
7342 assert(ondisk_format > 0);
7343 assert(ondisk_format < latest_ondisk_format);
7344
7345 if (ondisk_format == 1) {
7346 // changes:
7347 // - super: added ondisk_format
7348 // - super: added min_readable_ondisk_format
7349 // - super: added min_compat_ondisk_format
7350 // - super: added min_alloc_size
7351 // - super: removed min_min_alloc_size
7352 KeyValueDB::Transaction t = db->get_transaction();
7353 {
7354 bufferlist bl;
7355 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
7356 auto p = bl.begin();
7357 try {
7358 uint64_t val;
7359 ::decode(val, p);
7360 min_alloc_size = val;
7361 } catch (buffer::error& e) {
7362 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
7363 return -EIO;
7364 }
7365 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7366 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7367 }
7368 ondisk_format = 2;
7369 _prepare_ondisk_format_super(t);
7370 int r = db->submit_transaction_sync(t);
7371 assert(r == 0);
7372 }
7373
7374 // done
7375 dout(1) << __func__ << " done" << dendl;
7376 return 0;
7377}
7378
7379void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
7380{
7381 if (o->onode.nid)
7382 return;
7383 uint64_t nid = ++nid_last;
7384 dout(20) << __func__ << " " << nid << dendl;
7385 o->onode.nid = nid;
7386 txc->last_nid = nid;
7387}
7388
7389uint64_t BlueStore::_assign_blobid(TransContext *txc)
7390{
7391 uint64_t bid = ++blobid_last;
7392 dout(20) << __func__ << " " << bid << dendl;
7393 txc->last_blobid = bid;
7394 return bid;
7395}
7396
7397void BlueStore::get_db_statistics(Formatter *f)
7398{
7399 db->get_statistics(f);
7400}
7401
7402BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
7403{
7404 TransContext *txc = new TransContext(cct, osr);
7405 txc->t = db->get_transaction();
7406 osr->queue_new(txc);
7407 dout(20) << __func__ << " osr " << osr << " = " << txc
7408 << " seq " << txc->seq << dendl;
7409 return txc;
7410}
7411
7412void BlueStore::_txc_calc_cost(TransContext *txc)
7413{
7414 // this is about the simplest model for transaction cost you can
7415 // imagine: a fixed overhead of at least one "io" (the kv commit),
7416 // plus a configurable per-"io" cost (with different hdd and ssd
7417 // defaults) for each pending aio iovec, all added to the byte
7418 // count of the transaction.
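// worked example (hypothetical numbers): two pending aios with 3 iovecs
// each give ios = 1 + 6 = 7; with a per-io cost of 4000 and 8192 bytes
// in the transaction, cost = 7 * 4000 + 8192 = 36192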
7419 int ios = 1; // one "io" for the kv commit
7420 for (auto& p : txc->ioc.pending_aios) {
7421 ios += p.iov.size();
7422 }
7423 auto cost = throttle_cost_per_io.load();
7424 txc->cost = ios * cost + txc->bytes;
7425 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
7426 << ios << " ios * " << cost << " + " << txc->bytes
7427 << " bytes)" << dendl;
7428}
7429
7430void BlueStore::_txc_update_store_statfs(TransContext *txc)
7431{
7432 if (txc->statfs_delta.is_empty())
7433 return;
7434
7435 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
7436 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
7437 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
7438 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
7439 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
7440
7441 bufferlist bl;
7442 txc->statfs_delta.encode(bl);
7443
7444 txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
7445 txc->statfs_delta.reset();
7446}
7447
7448void BlueStore::_txc_state_proc(TransContext *txc)
7449{
7450 while (true) {
7451 dout(10) << __func__ << " txc " << txc
7452 << " " << txc->get_state_name() << dendl;
7453 switch (txc->state) {
7454 case TransContext::STATE_PREPARE:
7455 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
7456 if (txc->ioc.has_pending_aios()) {
7457 txc->state = TransContext::STATE_AIO_WAIT;
7458 txc->had_ios = true;
7459 _txc_aio_submit(txc);
7460 return;
7461 }
7462 // ** fall-thru **
7463
7464 case TransContext::STATE_AIO_WAIT:
7465 txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
7466 _txc_finish_io(txc); // may trigger blocked txc's too
7467 return;
7468
7469 case TransContext::STATE_IO_DONE:
7470 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
7471 if (txc->had_ios) {
7472 ++txc->osr->txc_with_unstable_io;
7473 }
7474 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
7475 txc->state = TransContext::STATE_KV_QUEUED;
7476 if (cct->_conf->bluestore_sync_submit_transaction) {
7477 if (txc->last_nid >= nid_max ||
7478 txc->last_blobid >= blobid_max) {
7479 dout(20) << __func__
7480 << " last_{nid,blobid} exceeds max, submit via kv thread"
7481 << dendl;
7482 } else if (txc->osr->kv_committing_serially) {
7483 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
7484 << dendl;
7485 // note: this is starvation-prone. once we have a txc in a busy
7486 // sequencer that is committing serially it is possible to keep
7487 // submitting new transactions fast enough that we get stuck doing
7488 // so. the alternative is to block here... fixme?
7489 } else if (txc->osr->txc_with_unstable_io) {
7490 dout(20) << __func__ << " prior txc(s) with unstable ios "
7491 << txc->osr->txc_with_unstable_io.load() << dendl;
7492 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
7493 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
7494 == 0) {
7495 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
7496 << dendl;
7497 } else {
7498 txc->state = TransContext::STATE_KV_SUBMITTED;
7499 int r = db->submit_transaction(txc->t);
7500 assert(r == 0);
7501 _txc_applied_kv(txc);
7502 }
7503 }
7504 {
7505 std::lock_guard<std::mutex> l(kv_lock);
7506 kv_queue.push_back(txc);
7507 kv_cond.notify_one();
7508 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
7509 kv_queue_unsubmitted.push_back(txc);
7510 ++txc->osr->kv_committing_serially;
7511 }
7512 }
7513 return;
7514 case TransContext::STATE_KV_SUBMITTED:
7515 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
7516 txc->state = TransContext::STATE_KV_DONE;
7517 _txc_committed_kv(txc);
7518 // ** fall-thru **
7519
7520 case TransContext::STATE_KV_DONE:
7521 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
7522 if (txc->deferred_txn) {
7523 txc->state = TransContext::STATE_DEFERRED_QUEUED;
7524 _deferred_queue(txc);
7525 return;
7526 }
7527 txc->state = TransContext::STATE_FINISHING;
7528 break;
7529
7530 case TransContext::STATE_DEFERRED_CLEANUP:
7531 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
7532 txc->state = TransContext::STATE_FINISHING;
7533 // ** fall-thru **
7534
7535 case TransContext::STATE_FINISHING:
7536 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
7537 _txc_finish(txc);
7538 return;
7539
7540 default:
7541 derr << __func__ << " unexpected txc " << txc
7542 << " state " << txc->get_state_name() << dendl;
7543 assert(0 == "unexpected txc state");
7544 return;
7545 }
7546 }
7547}
7548
7549void BlueStore::_txc_finish_io(TransContext *txc)
7550{
7551 dout(20) << __func__ << " " << txc << dendl;
7552
7553 /*
7554 * we need to preserve the order of kv transactions,
7555 * even though aio will complete in any order.
7556 */
7557
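// mark this txc IO_DONE, then walk backwards to the oldest contiguous
// run of IO_DONE txcs in the sequencer and advance them in submission
// order, so kv submission follows queue order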
7558 OpSequencer *osr = txc->osr.get();
7559 std::lock_guard<std::mutex> l(osr->qlock);
7560 txc->state = TransContext::STATE_IO_DONE;
7561
7562 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
7563 while (p != osr->q.begin()) {
7564 --p;
7565 if (p->state < TransContext::STATE_IO_DONE) {
7566 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
7567 << p->get_state_name() << dendl;
7568 return;
7569 }
7570 if (p->state > TransContext::STATE_IO_DONE) {
7571 ++p;
7572 break;
7573 }
7574 }
7575 do {
7576 _txc_state_proc(&*p++);
7577 } while (p != osr->q.end() &&
7578 p->state == TransContext::STATE_IO_DONE);
7579
7580 if (osr->kv_submitted_waiters &&
7581 osr->_is_all_kv_submitted()) {
7582 osr->qcond.notify_all();
7583 }
7584}
7585
7586void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
7587{
7588 dout(20) << __func__ << " txc " << txc
7589 << " onodes " << txc->onodes
7590 << " shared_blobs " << txc->shared_blobs
7591 << dendl;
7592
7593 // finalize onodes
7594 for (auto o : txc->onodes) {
7595 // finalize extent_map shards
7596 o->extent_map.update(t, false);
7597 if (o->extent_map.needs_reshard()) {
7598 o->extent_map.reshard(db, t);
7599 o->extent_map.update(t, true);
7600 if (o->extent_map.needs_reshard()) {
7601 dout(20) << __func__ << " warning: still wants reshard, check options?"
7602 << dendl;
7603 o->extent_map.clear_needs_reshard();
7604 }
7605 logger->inc(l_bluestore_onode_reshard);
7606 }
7607
7608 // bound encode
7609 size_t bound = 0;
7610 denc(o->onode, bound);
7611 o->extent_map.bound_encode_spanning_blobs(bound);
7612 if (o->onode.extent_map_shards.empty()) {
7613 denc(o->extent_map.inline_bl, bound);
7614 }
7615
7616 // encode
7617 bufferlist bl;
7618 unsigned onode_part, blob_part, extent_part;
7619 {
7620 auto p = bl.get_contiguous_appender(bound, true);
7621 denc(o->onode, p);
7622 onode_part = p.get_logical_offset();
7623 o->extent_map.encode_spanning_blobs(p);
7624 blob_part = p.get_logical_offset() - onode_part;
7625 if (o->onode.extent_map_shards.empty()) {
7626 denc(o->extent_map.inline_bl, p);
7627 }
7628 extent_part = p.get_logical_offset() - onode_part - blob_part;
7629 }
7630
7631 dout(20) << " onode " << o->oid << " is " << bl.length()
7632 << " (" << onode_part << " bytes onode + "
7633 << blob_part << " bytes spanning blobs + "
7634 << extent_part << " bytes inline extents)"
7635 << dendl;
7636 t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
7637 o->flushing_count++;
7638 }
7639
7640 // objects we modified but didn't affect the onode
7641 auto p = txc->modified_objects.begin();
7642 while (p != txc->modified_objects.end()) {
7643 if (txc->onodes.count(*p) == 0) {
7644 (*p)->flushing_count++;
7645 ++p;
7646 } else {
7647 // remove dups with onodes list to avoid problems in _txc_finish
7648 p = txc->modified_objects.erase(p);
7649 }
7650 }
7651
7652 // finalize shared_blobs
7653 for (auto sb : txc->shared_blobs) {
7654 string key;
7655 auto sbid = sb->get_sbid();
7656 get_shared_blob_key(sbid, &key);
7657 if (sb->persistent->empty()) {
7658 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
7659 << " is empty" << dendl;
7660 t->rmkey(PREFIX_SHARED_BLOB, key);
7661 } else {
7662 bufferlist bl;
7663 ::encode(*(sb->persistent), bl);
7664 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
7665 << " is " << bl.length() << dendl;
7666 t->set(PREFIX_SHARED_BLOB, key, bl);
7667 }
7668 }
7669}
7670
7671void BlueStore::BSPerfTracker::update_from_perfcounters(
7672 PerfCounters &logger)
7673{
7674 os_commit_latency.consume_next(
7675 logger.get_tavg_ms(
7676 l_bluestore_commit_lat));
7677 os_apply_latency.consume_next(
7678 logger.get_tavg_ms(
7679 l_bluestore_commit_lat));
7680}
7681
7682void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
7683{
7684 dout(20) << __func__ << " txc " << txc << std::hex
7685 << " allocated 0x" << txc->allocated
7686 << " released 0x" << txc->released
7687 << std::dec << dendl;
7688
7689 // We have to handle the case where we allocate *and* deallocate the
7690 // same region in this transaction. The freelist doesn't like that.
7691 // (Actually, the only thing that cares is the BitmapFreelistManager
7692 // debug check. But that's important.)
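  // For example, if this txn both allocated and released the 4 KiB range
  // [0x1000, 0x2000), that overlap is subtracted from both working sets
  // below so the freelist only ever sees the net allocations and releases.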
7693 interval_set<uint64_t> tmp_allocated, tmp_released;
7694 interval_set<uint64_t> *pallocated = &txc->allocated;
7695 interval_set<uint64_t> *preleased = &txc->released;
7696 if (!txc->allocated.empty() && !txc->released.empty()) {
7697 interval_set<uint64_t> overlap;
7698 overlap.intersection_of(txc->allocated, txc->released);
7699 if (!overlap.empty()) {
7700 tmp_allocated = txc->allocated;
7701 tmp_allocated.subtract(overlap);
7702 tmp_released = txc->released;
7703 tmp_released.subtract(overlap);
7704 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
7705 << ", new allocated 0x" << tmp_allocated
7706 << " released 0x" << tmp_released << std::dec
7707 << dendl;
7708 pallocated = &tmp_allocated;
7709 preleased = &tmp_released;
7710 }
7711 }
7712
7713 // update freelist with non-overlap sets
7714 for (interval_set<uint64_t>::iterator p = pallocated->begin();
7715 p != pallocated->end();
7716 ++p) {
7717 fm->allocate(p.get_start(), p.get_len(), t);
7718 }
7719 for (interval_set<uint64_t>::iterator p = preleased->begin();
7720 p != preleased->end();
7721 ++p) {
7722 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
7723 << "~" << p.get_len() << std::dec << dendl;
7724 fm->release(p.get_start(), p.get_len(), t);
7725 }
7726
7727 _txc_update_store_statfs(txc);
7728}
7729
7730void BlueStore::_txc_applied_kv(TransContext *txc)
7731{
7732 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
7733 for (auto& o : *ls) {
7734 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
7735 << dendl;
7736 if (--o->flushing_count == 0) {
7737 std::lock_guard<std::mutex> l(o->flush_lock);
7738 o->flush_cond.notify_all();
7739 }
7740 }
7741 }
7742}
7743
7744void BlueStore::_txc_committed_kv(TransContext *txc)
7745{
7746 dout(20) << __func__ << " txc " << txc << dendl;
7747
7748 // warning: we're calling onreadable_sync inside the sequencer lock
7749 if (txc->onreadable_sync) {
7750 txc->onreadable_sync->complete(0);
7751 txc->onreadable_sync = NULL;
7752 }
7753 unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
7754 if (txc->oncommit) {
7755 logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
7756 finishers[n]->queue(txc->oncommit);
7757 txc->oncommit = NULL;
7758 }
7759 if (txc->onreadable) {
7760 finishers[n]->queue(txc->onreadable);
7761 txc->onreadable = NULL;
7762 }
7763
7764 if (!txc->oncommits.empty()) {
7765 finishers[n]->queue(txc->oncommits);
7766 }
7767}
7768
7769void BlueStore::_txc_finish(TransContext *txc)
7770{
7771 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
7772 assert(txc->state == TransContext::STATE_FINISHING);
7773
7774 for (auto& sb : txc->shared_blobs_written) {
7775 sb->bc.finish_write(sb->get_cache(), txc->seq);
7776 }
7777 txc->shared_blobs_written.clear();
7778
7779 while (!txc->removed_collections.empty()) {
7780 _queue_reap_collection(txc->removed_collections.front());
7781 txc->removed_collections.pop_front();
7782 }
7783
7784 OpSequencerRef osr = txc->osr;
7785 CollectionRef c;
7786 bool empty = false;
7787 OpSequencer::q_list_t releasing_txc;
7788 {
7789 std::lock_guard<std::mutex> l(osr->qlock);
7790 txc->state = TransContext::STATE_DONE;
7791 bool notify = false;
7792 while (!osr->q.empty()) {
7793 TransContext *txc = &osr->q.front();
7794 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
7795 << dendl;
7796 if (txc->state != TransContext::STATE_DONE) {
7797 if (txc->state == TransContext::STATE_PREPARE &&
7798 deferred_aggressive) {
7799 // for _osr_drain_preceding()
7800 notify = true;
7801 }
7802 break;
7803 }
7804
7805 if (!c && txc->first_collection) {
7806 c = txc->first_collection;
7807 }
7808 osr->q.pop_front();
7809 releasing_txc.push_back(*txc);
7810 notify = true;
7811 }
7812 if (notify) {
7813 osr->qcond.notify_all();
7814 }
7815 if (osr->q.empty()) {
7816 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
7817 empty = true;
7818 }
7819 }
7820 while (!releasing_txc.empty()) {
7821 // release to allocator only after all preceding txc's have also
7822 // finished any deferred writes that potentially land in these
7823 // blocks
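    // (otherwise a not-yet-stable deferred write could still land on blocks
    //  that a later transaction has already re-allocated and overwritten)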
7824 auto txc = &releasing_txc.front();
7825 _txc_release_alloc(txc);
7826 releasing_txc.pop_front();
7827 txc->log_state_latency(logger, l_bluestore_state_done_lat);
7828 delete txc;
7829 }
7830
7831 if (c) {
7832 c->trim_cache();
7833 }
7834
7835
7836 if (empty && osr->zombie) {
7837 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
7838 osr->_unregister();
7839 }
7840}
7841
7842void BlueStore::_txc_release_alloc(TransContext *txc)
7843{
7844 // update allocator with full released set
7845 if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
7846 dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
7847 for (interval_set<uint64_t>::iterator p = txc->released.begin();
7848 p != txc->released.end();
7849 ++p) {
7850 alloc->release(p.get_start(), p.get_len());
7851 }
7852 }
7853
7854 txc->allocated.clear();
7855 txc->released.clear();
7856}
7857
7858void BlueStore::_osr_drain_preceding(TransContext *txc)
7859{
7860 OpSequencer *osr = txc->osr.get();
7861 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
7862 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
7863 {
7864 // submit anything pending
7865 std::lock_guard<std::mutex> l(deferred_lock);
7866 if (osr->deferred_pending) {
7867 _deferred_submit(osr);
7868 }
7869 }
7870 {
7871 // wake up any previously finished deferred events
7872 std::lock_guard<std::mutex> l(kv_lock);
7873 kv_cond.notify_one();
7874 }
7875 osr->drain_preceding(txc);
7876 --deferred_aggressive;
7877 dout(10) << __func__ << " " << osr << " done" << dendl;
7878}
7879
7880void BlueStore::_osr_drain_all()
7881{
7882 dout(10) << __func__ << dendl;
7883
7884 set<OpSequencerRef> s;
7885 {
7886 std::lock_guard<std::mutex> l(osr_lock);
7887 s = osr_set;
7888 }
7889 dout(20) << __func__ << " osr_set " << s << dendl;
7890
7891 ++deferred_aggressive;
7892 {
7893 // submit anything pending
7894 std::lock_guard<std::mutex> l(deferred_lock);
7895 _deferred_try_submit();
7896 }
7897 {
7898 // wake up any previously finished deferred events
7899 std::lock_guard<std::mutex> l(kv_lock);
7900 kv_cond.notify_one();
7901 }
7902 for (auto osr : s) {
7903 dout(20) << __func__ << " drain " << osr << dendl;
7904 osr->drain();
7905 }
7906 --deferred_aggressive;
7907
7908 dout(10) << __func__ << " done" << dendl;
7909}
7910
7911void BlueStore::_osr_unregister_all()
7912{
7913 set<OpSequencerRef> s;
7914 {
7915 std::lock_guard<std::mutex> l(osr_lock);
7916 s = osr_set;
7917 }
7918 dout(10) << __func__ << " " << s << dendl;
7919 for (auto osr : s) {
7920 osr->_unregister();
7921
7922 if (!osr->zombie) {
7923 // break link from Sequencer to us so that this OpSequencer
7924 // instance can die with this mount/umount cycle. note that
7925 // we assume umount() will not race against ~Sequencer.
7926 assert(osr->parent);
7927 osr->parent->p.reset();
7928 }
7929 }
7930 // nobody should be creating sequencers during umount either.
7931 {
7932 std::lock_guard<std::mutex> l(osr_lock);
7933 assert(osr_set.empty());
7934 }
7935}
7936
7937void BlueStore::_kv_sync_thread()
7938{
7939 dout(10) << __func__ << " start" << dendl;
7940 std::unique_lock<std::mutex> l(kv_lock);
7941 while (true) {
7942 assert(kv_committing.empty());
7943 if (kv_queue.empty() &&
7944 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
7945 !deferred_aggressive)) {
7946 if (kv_stop)
7947 break;
7948 dout(20) << __func__ << " sleep" << dendl;
7949 kv_cond.wait(l);
7950 dout(20) << __func__ << " wake" << dendl;
7951 } else {
7952 deque<TransContext*> kv_submitting;
7953 deque<DeferredBatch*> deferred_done, deferred_stable;
7954 dout(20) << __func__ << " committing " << kv_queue.size()
7955 << " submitting " << kv_queue_unsubmitted.size()
7956 << " deferred done " << deferred_done_queue.size()
7957 << " stable " << deferred_stable_queue.size()
7958 << dendl;
7959 kv_committing.swap(kv_queue);
7960 kv_submitting.swap(kv_queue_unsubmitted);
7961 deferred_done.swap(deferred_done_queue);
7962 deferred_stable.swap(deferred_stable_queue);
7963 utime_t start = ceph_clock_now();
7964 l.unlock();
7965
7966 dout(30) << __func__ << " committing " << kv_committing << dendl;
7967 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
7968 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
7969 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
7970
7971 int num_aios = 0;
7972 for (auto txc : kv_committing) {
7973 if (txc->had_ios) {
7974 ++num_aios;
7975 }
7976 }
7977
7978 bool force_flush = false;
7979 // if bluefs is sharing the same device as data (only), then we
7980 // can rely on the bluefs commit to flush the device and make
7981 // deferred aios stable. that means that if we do have completed deferred
7982 // txcs AND we are not on a single device, we need to force a flush.
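      // So, on a single shared device we only force a flush when this batch
      // did aio writes, when there is nothing else to commit (so we cannot
      // count on the kv/bluefs commit happening anyway), or while
      // deferred_aggressive draining is in progress; with separate devices
      // we always flush before the kv commit below.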
7983 if (bluefs_single_shared_device && bluefs) {
7984 if (num_aios) {
7985 force_flush = true;
7986 } else if (kv_committing.empty() && kv_submitting.empty() &&
7987 deferred_stable.empty()) {
7988 force_flush = true; // there's nothing else to commit!
7989 } else if (deferred_aggressive) {
7990 force_flush = true;
7991 }
7992 } else
7993 force_flush = true;
7994
7995 if (force_flush) {
7996 dout(20) << __func__ << " num_aios=" << num_aios
7997 << " force_flush=" << (int)force_flush
7998 << ", flushing, deferred done->stable" << dendl;
7999 // flush/barrier on block device
8000 bdev->flush();
8001
8002 // if we flush then deferred done are now deferred stable
8003 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
8004 deferred_done.end());
8005 deferred_done.clear();
8006 }
8007 utime_t after_flush = ceph_clock_now();
8008
8009 // we will use one final transaction to force a sync
8010 KeyValueDB::Transaction synct = db->get_transaction();
8011
8012 // increase {nid,blobid}_max? note that this covers both the
8013 // case where we are approaching the max and the case we passed
8014 // it. in either case, we increase the max in the earlier txn
8015 // we submit.
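      // e.g. with bluestore_nid_prealloc = 1024, once nid_last passes
      // nid_max - 512 we persist new_nid_max = nid_last + 1024 via the
      // earliest transaction submitted below, and only bump the in-memory
      // nid_max after that commit succeeds.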
8016 uint64_t new_nid_max = 0, new_blobid_max = 0;
8017 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
8018 KeyValueDB::Transaction t =
8019 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8020 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
8021 bufferlist bl;
8022 ::encode(new_nid_max, bl);
8023 t->set(PREFIX_SUPER, "nid_max", bl);
8024 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
8025 }
8026 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
8027 KeyValueDB::Transaction t =
8028 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8029 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
8030 bufferlist bl;
8031 ::encode(new_blobid_max, bl);
8032 t->set(PREFIX_SUPER, "blobid_max", bl);
8033 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
8034 }
8035 for (auto txc : kv_submitting) {
8036 assert(txc->state == TransContext::STATE_KV_QUEUED);
8037 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8038 int r = db->submit_transaction(txc->t);
8039 assert(r == 0);
8040 _txc_applied_kv(txc);
8041 --txc->osr->kv_committing_serially;
8042 txc->state = TransContext::STATE_KV_SUBMITTED;
8043 if (txc->osr->kv_submitted_waiters) {
8044 std::lock_guard<std::mutex> l(txc->osr->qlock);
8045 if (txc->osr->_is_all_kv_submitted()) {
8046 txc->osr->qcond.notify_all();
8047 }
8048 }
8049 }
8050 for (auto txc : kv_committing) {
8051 if (txc->had_ios) {
8052 --txc->osr->txc_with_unstable_io;
8053 }
8054 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8055 // release throttle *before* we commit. this allows new ops
8056 // to be prepared and enter pipeline while we are waiting on
8057 // the kv commit sync/flush. then hopefully on the next
8058 // iteration there will already be ops awake. otherwise, we
8059 // end up going to sleep, and then wake up when the very first
8060 // transaction is ready for commit.
8061 throttle_bytes.put(txc->cost);
8062 }
8063
8064 PExtentVector bluefs_gift_extents;
8065 if (bluefs &&
8066 after_flush - bluefs_last_balance >
8067 cct->_conf->bluestore_bluefs_balance_interval) {
8068 bluefs_last_balance = after_flush;
8069 int r = _balance_bluefs_freespace(&bluefs_gift_extents);
8070 assert(r >= 0);
8071 if (r > 0) {
8072 for (auto& p : bluefs_gift_extents) {
8073 bluefs_extents.insert(p.offset, p.length);
8074 }
8075 bufferlist bl;
8076 ::encode(bluefs_extents, bl);
8077 dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
8078 << bluefs_extents << std::dec << dendl;
8079 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
8080 }
8081 }
8082
8083 // cleanup sync deferred keys
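      // each now-stable deferred batch has its PREFIX_DEFERRED record
      // removed as part of the synchronous commit below, so a later replay
      // will not re-apply those writes; the released-extents branch only
      // exists for records written by kraken.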
8084 for (auto b : deferred_stable) {
8085 for (auto& txc : b->txcs) {
8086 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
8087 if (!wt.released.empty()) {
8088 // kraken replay compat only
8089 txc.released = wt.released;
8090 dout(10) << __func__ << " deferred txn has released "
8091 << txc.released
8092 << " (we just upgraded from kraken) on " << &txc << dendl;
8093 _txc_finalize_kv(&txc, synct);
8094 }
8095 // cleanup the deferred
8096 string key;
8097 get_deferred_key(wt.seq, &key);
8098 synct->rm_single_key(PREFIX_DEFERRED, key);
8099 }
8100 }
8101
8102 // submit synct synchronously (block and wait for it to commit)
8103 int r = db->submit_transaction_sync(synct);
8104 assert(r == 0);
8105
8106 if (new_nid_max) {
8107 nid_max = new_nid_max;
8108 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
8109 }
8110 if (new_blobid_max) {
8111 blobid_max = new_blobid_max;
8112 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
8113 }
8114
8115 utime_t finish = ceph_clock_now();
8116 utime_t dur_flush = after_flush - start;
8117 utime_t dur_kv = finish - after_flush;
8118 utime_t dur = finish - start;
8119 dout(20) << __func__ << " committed " << kv_committing.size()
8120 << " cleaned " << deferred_stable.size()
8121 << " in " << dur
8122 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
8123 << dendl;
8124 if (logger) {
8125 logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
8126 logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
8127 logger->tinc(l_bluestore_kv_lat, dur);
8128 }
8129 while (!kv_committing.empty()) {
8130 TransContext *txc = kv_committing.front();
8131 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8132 _txc_state_proc(txc);
8133 kv_committing.pop_front();
8134 }
8135 for (auto b : deferred_stable) {
8136 auto p = b->txcs.begin();
8137 while (p != b->txcs.end()) {
8138 TransContext *txc = &*p;
8139 p = b->txcs.erase(p); // unlink here because
8140 _txc_state_proc(txc); // this may destroy txc
8141 }
8142 delete b;
8143 }
8144
8145 if (!deferred_aggressive) {
8146 std::lock_guard<std::mutex> l(deferred_lock);
8147 if (deferred_queue_size >= deferred_batch_ops ||
8148 throttle_deferred_bytes.past_midpoint()) {
8149 _deferred_try_submit();
8150 }
8151 }
8152
8153 // this is as good a place as any ...
8154 _reap_collections();
8155
8156 if (bluefs) {
8157 if (!bluefs_gift_extents.empty()) {
8158 _commit_bluefs_freespace(bluefs_gift_extents);
8159 }
8160 for (auto p = bluefs_extents_reclaiming.begin();
8161 p != bluefs_extents_reclaiming.end();
8162 ++p) {
8163 dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
8164 << p.get_start() << "~" << p.get_len() << std::dec
8165 << dendl;
8166 alloc->release(p.get_start(), p.get_len());
8167 }
8168 bluefs_extents_reclaiming.clear();
8169 }
8170
8171 l.lock();
8172 // previously deferred "done" are now "stable" by virtue of this
8173 // commit cycle.
8174 deferred_stable_queue.swap(deferred_done);
8175 }
8176 }
8177 dout(10) << __func__ << " finish" << dendl;
8178}
8179
8180bluestore_deferred_op_t *BlueStore::_get_deferred_op(
8181 TransContext *txc, OnodeRef o)
8182{
8183 if (!txc->deferred_txn) {
8184 txc->deferred_txn = new bluestore_deferred_transaction_t;
8185 }
8186 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
8187 return &txc->deferred_txn->ops.back();
8188}
8189
8190void BlueStore::_deferred_queue(TransContext *txc)
8191{
8192 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
8193 std::lock_guard<std::mutex> l(deferred_lock);
8194 if (!txc->osr->deferred_pending &&
8195 !txc->osr->deferred_running) {
8196 deferred_queue.push_back(*txc->osr);
8197 }
8198 if (!txc->osr->deferred_pending) {
8199 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
8200 }
8201 ++deferred_queue_size;
8202 txc->osr->deferred_pending->txcs.push_back(*txc);
8203 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
8204 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
8205 const auto& op = *opi;
8206 assert(op.op == bluestore_deferred_op_t::OP_WRITE);
8207 bufferlist::const_iterator p = op.data.begin();
8208 for (auto e : op.extents) {
8209 txc->osr->deferred_pending->prepare_write(
8210 cct, wt.seq, e.offset, e.length, p);
8211 }
8212 }
8213 if (deferred_aggressive &&
8214 !txc->osr->deferred_running) {
8215 _deferred_submit(txc->osr.get());
8216 }
8217}
8218
8219void BlueStore::_deferred_try_submit()
8220{
8221 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
8222 << deferred_queue_size << " txcs" << dendl;
8223 for (auto& osr : deferred_queue) {
8224 if (!osr.deferred_running) {
8225 _deferred_submit(&osr);
8226 }
8227 }
8228}
8229
8230void BlueStore::_deferred_submit(OpSequencer *osr)
8231{
8232 dout(10) << __func__ << " osr " << osr
8233 << " " << osr->deferred_pending->iomap.size() << " ios pending "
8234 << dendl;
8235 assert(osr->deferred_pending);
8236 assert(!osr->deferred_running);
8237
8238 auto b = osr->deferred_pending;
8239 deferred_queue_size -= b->seq_bytes.size();
8240 assert(deferred_queue_size >= 0);
8241
8242 osr->deferred_running = osr->deferred_pending;
8243 osr->deferred_pending = nullptr;
8244
8245 uint64_t start = 0, pos = 0;
8246 bufferlist bl;
8247 auto i = b->iomap.begin();
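  // iomap is keyed by device offset; the loop below coalesces runs of
  // contiguous entries into a single aio_write, flushing the accumulated
  // bufferlist whenever it reaches a gap or the end of the map.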
8248 while (true) {
8249 if (i == b->iomap.end() || i->first != pos) {
8250 if (bl.length()) {
8251 dout(20) << __func__ << " write 0x" << std::hex
8252 << start << "~" << bl.length()
8253 << " crc " << bl.crc32c(-1) << std::dec << dendl;
8254 if (!g_conf->bluestore_debug_omit_block_device_write) {
8255 logger->inc(l_bluestore_deferred_write_ops);
8256 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
8257 int r = bdev->aio_write(start, bl, &b->ioc, false);
8258 assert(r == 0);
8259 }
8260 }
8261 if (i == b->iomap.end()) {
8262 break;
8263 }
8264 start = 0;
8265 pos = i->first;
8266 bl.clear();
8267 }
8268 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
8269 << std::hex << pos << "~" << i->second.bl.length() << std::dec
8270 << dendl;
8271 if (!bl.length()) {
8272 start = pos;
8273 }
8274 pos += i->second.bl.length();
8275 bl.claim_append(i->second.bl);
8276 ++i;
8277 }
8278 bdev->aio_submit(&b->ioc);
8279}
8280
8281void BlueStore::_deferred_aio_finish(OpSequencer *osr)
8282{
8283 dout(10) << __func__ << " osr " << osr << dendl;
8284 assert(osr->deferred_running);
8285 DeferredBatch *b = osr->deferred_running;
8286
8287 {
8288 std::lock_guard<std::mutex> l(deferred_lock);
8289 assert(osr->deferred_running == b);
8290 osr->deferred_running = nullptr;
8291 if (!osr->deferred_pending) {
8292 auto q = deferred_queue.iterator_to(*osr);
8293 deferred_queue.erase(q);
8294 } else if (deferred_aggressive) {
8295 _deferred_submit(osr);
8296 }
8297 }
8298
8299 {
8300 std::lock_guard<std::mutex> l2(osr->qlock);
8301 for (auto& i : b->txcs) {
8302 TransContext *txc = &i;
8303 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
8304 txc->osr->qcond.notify_all();
8305 throttle_deferred_bytes.put(txc->cost);
8306 }
8307 std::lock_guard<std::mutex> l(kv_lock);
8308 deferred_done_queue.emplace_back(b);
8309 }
8310
8311 // in the normal case, do not bother waking up the kv thread; it will
8312 // catch us on the next commit anyway.
8313 if (deferred_aggressive) {
8314 std::lock_guard<std::mutex> l(kv_lock);
8315 kv_cond.notify_one();
8316 }
8317}
8318
8319int BlueStore::_deferred_replay()
8320{
8321 dout(10) << __func__ << " start" << dendl;
8322 OpSequencerRef osr = new OpSequencer(cct, this);
8323 int count = 0;
8324 int r = 0;
8325 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
8326 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
8327 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
8328 << dendl;
8329 bluestore_deferred_transaction_t *deferred_txn =
8330 new bluestore_deferred_transaction_t;
8331 bufferlist bl = it->value();
8332 bufferlist::iterator p = bl.begin();
8333 try {
8334 ::decode(*deferred_txn, p);
8335 } catch (buffer::error& e) {
8336 derr << __func__ << " failed to decode deferred txn "
8337 << pretty_binary_string(it->key()) << dendl;
8338 delete deferred_txn;
8339 r = -EIO;
8340 goto out;
8341 }
8342 TransContext *txc = _txc_create(osr.get());
8343 txc->deferred_txn = deferred_txn;
8344 txc->state = TransContext::STATE_KV_DONE;
8345 _txc_state_proc(txc);
8346 }
8347 out:
8348 dout(20) << __func__ << " draining osr" << dendl;
8349 _osr_drain_all();
8350 osr->discard();
8351 dout(10) << __func__ << " completed " << count << " events" << dendl;
8352 return r;
8353}
8354
8355// ---------------------------
8356// transactions
8357
8358int BlueStore::queue_transactions(
8359 Sequencer *posr,
8360 vector<Transaction>& tls,
8361 TrackedOpRef op,
8362 ThreadPool::TPHandle *handle)
8363{
8364 FUNCTRACE();
8365 Context *onreadable;
8366 Context *ondisk;
8367 Context *onreadable_sync;
8368 ObjectStore::Transaction::collect_contexts(
8369 tls, &onreadable, &ondisk, &onreadable_sync);
8370
8371 if (cct->_conf->objectstore_blackhole) {
8372 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
8373 << dendl;
8374 delete ondisk;
8375 delete onreadable;
8376 delete onreadable_sync;
8377 return 0;
8378 }
8379 utime_t start = ceph_clock_now();
8380 // set up the sequencer
8381 OpSequencer *osr;
8382 assert(posr);
8383 if (posr->p) {
8384 osr = static_cast<OpSequencer *>(posr->p.get());
8385 dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
8386 } else {
8387 osr = new OpSequencer(cct, this);
8388 osr->parent = posr;
8389 posr->p = osr;
8390 dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
8391 }
8392
8393 // prepare
8394 TransContext *txc = _txc_create(osr);
8395 txc->onreadable = onreadable;
8396 txc->onreadable_sync = onreadable_sync;
8397 txc->oncommit = ondisk;
8398
8399 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
8400 (*p).set_osr(osr);
8401 txc->bytes += (*p).get_num_bytes();
8402 _txc_add_transaction(txc, &(*p));
8403 }
8404 _txc_calc_cost(txc);
8405
8406 _txc_write_nodes(txc, txc->t);
8407
8408 // journal deferred items
8409 if (txc->deferred_txn) {
8410 txc->deferred_txn->seq = ++deferred_seq;
8411 bufferlist bl;
8412 ::encode(*txc->deferred_txn, bl);
8413 string key;
8414 get_deferred_key(txc->deferred_txn->seq, &key);
8415 txc->t->set(PREFIX_DEFERRED, key, bl);
8416 }
8417
8418 _txc_finalize_kv(txc, txc->t);
8419 if (handle)
8420 handle->suspend_tp_timeout();
8421
8422 utime_t tstart = ceph_clock_now();
8423 throttle_bytes.get(txc->cost);
8424 if (txc->deferred_txn) {
8425 // ensure we do not block here because of deferred writes
8426 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
8427 deferred_try_submit();
8428 throttle_deferred_bytes.get(txc->cost);
8429 }
8430 }
8431 utime_t tend = ceph_clock_now();
8432
8433 if (handle)
8434 handle->reset_tp_timeout();
8435
8436 logger->inc(l_bluestore_txc);
8437
8438 // execute (start)
8439 _txc_state_proc(txc);
8440
8441 logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
8442 logger->tinc(l_bluestore_throttle_lat, tend - tstart);
8443 return 0;
8444}
8445
8446void BlueStore::_txc_aio_submit(TransContext *txc)
8447{
8448 dout(10) << __func__ << " txc " << txc << dendl;
8449 bdev->aio_submit(&txc->ioc);
8450}
8451
8452void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
8453{
8454 Transaction::iterator i = t->begin();
8455
8456 _dump_transaction(t);
8457
8458 vector<CollectionRef> cvec(i.colls.size());
8459 unsigned j = 0;
8460 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
8461 ++p, ++j) {
8462 cvec[j] = _get_collection(*p);
8463
8464 // note first collection we reference
8465 if (!txc->first_collection)
8466 txc->first_collection = cvec[j];
8467 }
8468 vector<OnodeRef> ovec(i.objects.size());
8469
8470 for (int pos = 0; i.have_op(); ++pos) {
8471 Transaction::Op *op = i.decode_op();
8472 int r = 0;
8473
8474 // no coll or obj
8475 if (op->op == Transaction::OP_NOP)
8476 continue;
8477
8478 // collection operations
8479 CollectionRef &c = cvec[op->cid];
8480 switch (op->op) {
8481 case Transaction::OP_RMCOLL:
8482 {
8483 const coll_t &cid = i.get_cid(op->cid);
8484 r = _remove_collection(txc, cid, &c);
8485 if (!r)
8486 continue;
8487 }
8488 break;
8489
8490 case Transaction::OP_MKCOLL:
8491 {
8492 assert(!c);
8493 const coll_t &cid = i.get_cid(op->cid);
8494 r = _create_collection(txc, cid, op->split_bits, &c);
8495 if (!r)
8496 continue;
8497 }
8498 break;
8499
8500 case Transaction::OP_SPLIT_COLLECTION:
8501 assert(0 == "deprecated");
8502 break;
8503
8504 case Transaction::OP_SPLIT_COLLECTION2:
8505 {
8506 uint32_t bits = op->split_bits;
8507 uint32_t rem = op->split_rem;
8508 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
8509 if (!r)
8510 continue;
8511 }
8512 break;
8513
8514 case Transaction::OP_COLL_HINT:
8515 {
8516 uint32_t type = op->hint_type;
8517 bufferlist hint;
8518 i.decode_bl(hint);
8519 bufferlist::iterator hiter = hint.begin();
8520 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
8521 uint32_t pg_num;
8522 uint64_t num_objs;
8523 ::decode(pg_num, hiter);
8524 ::decode(num_objs, hiter);
8525 dout(10) << __func__ << " collection hint objects is a no-op, "
8526 << " pg_num " << pg_num << " num_objects " << num_objs
8527 << dendl;
8528 } else {
8529 // Ignore the hint
8530 dout(10) << __func__ << " unknown collection hint " << type << dendl;
8531 }
8532 continue;
8533 }
8534 break;
8535
8536 case Transaction::OP_COLL_SETATTR:
8537 r = -EOPNOTSUPP;
8538 break;
8539
8540 case Transaction::OP_COLL_RMATTR:
8541 r = -EOPNOTSUPP;
8542 break;
8543
8544 case Transaction::OP_COLL_RENAME:
8545 assert(0 == "not implemented");
8546 break;
8547 }
8548 if (r < 0) {
8549 derr << __func__ << " error " << cpp_strerror(r)
8550 << " not handled on operation " << op->op
8551 << " (op " << pos << ", counting from 0)" << dendl;
8552 _dump_transaction(t, 0);
8553 assert(0 == "unexpected error");
8554 }
8555
8556 // these operations implicitly create the object
8557 bool create = false;
8558 if (op->op == Transaction::OP_TOUCH ||
8559 op->op == Transaction::OP_WRITE ||
8560 op->op == Transaction::OP_ZERO) {
8561 create = true;
8562 }
8563
8564 // object operations
8565 RWLock::WLocker l(c->lock);
8566 OnodeRef &o = ovec[op->oid];
8567 if (!o) {
8568 ghobject_t oid = i.get_oid(op->oid);
8569 o = c->get_onode(oid, create);
8570 }
8571 if (!create && (!o || !o->exists)) {
8572 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
8573 << i.get_oid(op->oid) << dendl;
8574 r = -ENOENT;
8575 goto endop;
8576 }
8577
8578 switch (op->op) {
8579 case Transaction::OP_TOUCH:
8580 r = _touch(txc, c, o);
8581 break;
8582
8583 case Transaction::OP_WRITE:
8584 {
8585 uint64_t off = op->off;
8586 uint64_t len = op->len;
8587 uint32_t fadvise_flags = i.get_fadvise_flags();
8588 bufferlist bl;
8589 i.decode_bl(bl);
8590 r = _write(txc, c, o, off, len, bl, fadvise_flags);
8591 }
8592 break;
8593
8594 case Transaction::OP_ZERO:
8595 {
8596 uint64_t off = op->off;
8597 uint64_t len = op->len;
8598 r = _zero(txc, c, o, off, len);
8599 }
8600 break;
8601
8602 case Transaction::OP_TRIMCACHE:
8603 {
8604 // deprecated, no-op
8605 }
8606 break;
8607
8608 case Transaction::OP_TRUNCATE:
8609 {
8610 uint64_t off = op->off;
8611 r = _truncate(txc, c, o, off);
8612 }
8613 break;
8614
8615 case Transaction::OP_REMOVE:
8616 {
8617 r = _remove(txc, c, o);
8618 }
8619 break;
8620
8621 case Transaction::OP_SETATTR:
8622 {
8623 string name = i.decode_string();
8624 bufferptr bp;
8625 i.decode_bp(bp);
8626 r = _setattr(txc, c, o, name, bp);
8627 }
8628 break;
8629
8630 case Transaction::OP_SETATTRS:
8631 {
8632 map<string, bufferptr> aset;
8633 i.decode_attrset(aset);
8634 r = _setattrs(txc, c, o, aset);
8635 }
8636 break;
8637
8638 case Transaction::OP_RMATTR:
8639 {
8640 string name = i.decode_string();
8641 r = _rmattr(txc, c, o, name);
8642 }
8643 break;
8644
8645 case Transaction::OP_RMATTRS:
8646 {
8647 r = _rmattrs(txc, c, o);
8648 }
8649 break;
8650
8651 case Transaction::OP_CLONE:
8652 {
8653 OnodeRef& no = ovec[op->dest_oid];
8654 if (!no) {
8655 const ghobject_t& noid = i.get_oid(op->dest_oid);
8656 no = c->get_onode(noid, true);
8657 }
8658 r = _clone(txc, c, o, no);
8659 }
8660 break;
8661
8662 case Transaction::OP_CLONERANGE:
8663 assert(0 == "deprecated");
8664 break;
8665
8666 case Transaction::OP_CLONERANGE2:
8667 {
8668 OnodeRef& no = ovec[op->dest_oid];
8669 if (!no) {
8670 const ghobject_t& noid = i.get_oid(op->dest_oid);
8671 no = c->get_onode(noid, true);
8672 }
8673 uint64_t srcoff = op->off;
8674 uint64_t len = op->len;
8675 uint64_t dstoff = op->dest_off;
8676 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
8677 }
8678 break;
8679
8680 case Transaction::OP_COLL_ADD:
8681 assert(0 == "not implemented");
8682 break;
8683
8684 case Transaction::OP_COLL_REMOVE:
8685 assert(0 == "not implemented");
8686 break;
8687
8688 case Transaction::OP_COLL_MOVE:
8689 assert(0 == "deprecated");
8690 break;
8691
8692 case Transaction::OP_COLL_MOVE_RENAME:
8693 case Transaction::OP_TRY_RENAME:
8694 {
8695 assert(op->cid == op->dest_cid);
8696 const ghobject_t& noid = i.get_oid(op->dest_oid);
8697 OnodeRef& no = ovec[op->dest_oid];
8698 if (!no) {
8699 no = c->get_onode(noid, false);
8700 }
8701 r = _rename(txc, c, o, no, noid);
8702 }
8703 break;
8704
8705 case Transaction::OP_OMAP_CLEAR:
8706 {
8707 r = _omap_clear(txc, c, o);
8708 }
8709 break;
8710 case Transaction::OP_OMAP_SETKEYS:
8711 {
8712 bufferlist aset_bl;
8713 i.decode_attrset_bl(&aset_bl);
8714 r = _omap_setkeys(txc, c, o, aset_bl);
8715 }
8716 break;
8717 case Transaction::OP_OMAP_RMKEYS:
8718 {
8719 bufferlist keys_bl;
8720 i.decode_keyset_bl(&keys_bl);
8721 r = _omap_rmkeys(txc, c, o, keys_bl);
8722 }
8723 break;
8724 case Transaction::OP_OMAP_RMKEYRANGE:
8725 {
8726 string first, last;
8727 first = i.decode_string();
8728 last = i.decode_string();
8729 r = _omap_rmkey_range(txc, c, o, first, last);
8730 }
8731 break;
8732 case Transaction::OP_OMAP_SETHEADER:
8733 {
8734 bufferlist bl;
8735 i.decode_bl(bl);
8736 r = _omap_setheader(txc, c, o, bl);
8737 }
8738 break;
8739
8740 case Transaction::OP_SETALLOCHINT:
8741 {
8742 r = _set_alloc_hint(txc, c, o,
8743 op->expected_object_size,
8744 op->expected_write_size,
8745 op->alloc_hint_flags);
8746 }
8747 break;
8748
8749 default:
8750 derr << __func__ << " bad op " << op->op << dendl;
8751 ceph_abort();
8752 }
8753
8754 endop:
8755 if (r < 0) {
8756 bool ok = false;
8757
8758 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
8759 op->op == Transaction::OP_CLONE ||
8760 op->op == Transaction::OP_CLONERANGE2 ||
8761 op->op == Transaction::OP_COLL_ADD ||
8762 op->op == Transaction::OP_SETATTR ||
8763 op->op == Transaction::OP_SETATTRS ||
8764 op->op == Transaction::OP_RMATTR ||
8765 op->op == Transaction::OP_OMAP_SETKEYS ||
8766 op->op == Transaction::OP_OMAP_RMKEYS ||
8767 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
8768 op->op == Transaction::OP_OMAP_SETHEADER))
8769 // -ENOENT is usually okay
8770 ok = true;
8771 if (r == -ENODATA)
8772 ok = true;
8773
8774 if (!ok) {
8775 const char *msg = "unexpected error code";
8776
8777 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
8778 op->op == Transaction::OP_CLONE ||
8779 op->op == Transaction::OP_CLONERANGE2))
8780 msg = "ENOENT on clone suggests osd bug";
8781
8782 if (r == -ENOSPC)
8783 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
8784 // by partially applying transactions.
8785 msg = "ENOSPC from bluestore, misconfigured cluster";
8786
8787 if (r == -ENOTEMPTY) {
8788 msg = "ENOTEMPTY suggests garbage data in osd data dir";
8789 }
8790
8791 derr << __func__ << " error " << cpp_strerror(r)
8792 << " not handled on operation " << op->op
8793 << " (op " << pos << ", counting from 0)"
8794 << dendl;
8795 derr << msg << dendl;
8796 _dump_transaction(t, 0);
8797 assert(0 == "unexpected error");
8798 }
8799 }
8800 }
8801}
8802
8803
8804
8805// -----------------
8806// write operations
8807
8808int BlueStore::_touch(TransContext *txc,
8809 CollectionRef& c,
8810 OnodeRef &o)
8811{
8812 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
8813 int r = 0;
8814 o->exists = true;
8815 _assign_nid(txc, o);
8816 txc->write_onode(o);
8817 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
8818 return r;
8819}
8820
8821void BlueStore::_dump_onode(OnodeRef o, int log_level)
8822{
8823 if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
8824 return;
8825 dout(log_level) << __func__ << " " << o << " " << o->oid
8826 << " nid " << o->onode.nid
8827 << " size 0x" << std::hex << o->onode.size
8828 << " (" << std::dec << o->onode.size << ")"
8829 << " expected_object_size " << o->onode.expected_object_size
8830 << " expected_write_size " << o->onode.expected_write_size
8831 << " in " << o->onode.extent_map_shards.size() << " shards"
8832 << ", " << o->extent_map.spanning_blob_map.size()
8833 << " spanning blobs"
8834 << dendl;
8835 for (auto p = o->onode.attrs.begin();
8836 p != o->onode.attrs.end();
8837 ++p) {
8838 dout(log_level) << __func__ << " attr " << p->first
8839 << " len " << p->second.length() << dendl;
8840 }
8841 _dump_extent_map(o->extent_map, log_level);
8842}
8843
8844void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
8845{
8846 uint64_t pos = 0;
8847 for (auto& s : em.shards) {
8848 dout(log_level) << __func__ << " shard " << *s.shard_info
8849 << (s.loaded ? " (loaded)" : "")
8850 << (s.dirty ? " (dirty)" : "")
8851 << dendl;
8852 }
8853 for (auto& e : em.extent_map) {
8854 dout(log_level) << __func__ << " " << e << dendl;
8855 assert(e.logical_offset >= pos);
8856 pos = e.logical_offset + e.length;
8857 const bluestore_blob_t& blob = e.blob->get_blob();
8858 if (blob.has_csum()) {
8859 vector<uint64_t> v;
8860 unsigned n = blob.get_csum_count();
8861 for (unsigned i = 0; i < n; ++i)
8862 v.push_back(blob.get_csum_item(i));
8863 dout(log_level) << __func__ << " csum: " << std::hex << v << std::dec
8864 << dendl;
8865 }
8866 std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
8867 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
8868 dout(log_level) << __func__ << " 0x" << std::hex << i.first
8869 << "~" << i.second->length << std::dec
8870 << " " << *i.second << dendl;
8871 }
8872 }
8873}
8874
8875void BlueStore::_dump_transaction(Transaction *t, int log_level)
8876{
8877 dout(log_level) << " transaction dump:\n";
8878 JSONFormatter f(true);
8879 f.open_object_section("transaction");
8880 t->dump(&f);
8881 f.close_section();
8882 f.flush(*_dout);
8883 *_dout << dendl;
8884}
8885
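// A worked example of what _pad_zeros() below does (assuming chunk_size =
// 0x1000): a 0x200-byte bufferlist at *offset 0x1f00 gets front_pad 0xf00
// and, after the tail pass, back_pad 0xf00, leaving the caller with a
// 0x2000-byte, chunk-aligned bufferlist at offset 0x1000 with zeros around
// the original payload.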
8886void BlueStore::_pad_zeros(
8887 bufferlist *bl, uint64_t *offset,
8888 uint64_t chunk_size)
8889{
8890 auto length = bl->length();
8891 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
8892 << " chunk_size 0x" << chunk_size << std::dec << dendl;
8893 dout(40) << "before:\n";
8894 bl->hexdump(*_dout);
8895 *_dout << dendl;
8896 // front
8897 size_t front_pad = *offset % chunk_size;
8898 size_t back_pad = 0;
8899 size_t pad_count = 0;
8900 if (front_pad) {
8901 size_t front_copy = MIN(chunk_size - front_pad, length);
8902 bufferptr z = buffer::create_page_aligned(chunk_size);
8903 memset(z.c_str(), 0, front_pad);
8904 pad_count += front_pad;
8905 memcpy(z.c_str() + front_pad, bl->get_contiguous(0, front_copy), front_copy);
8906 if (front_copy + front_pad < chunk_size) {
8907 back_pad = chunk_size - (length + front_pad);
8908 memset(z.c_str() + front_pad + length, 0, back_pad);
8909 pad_count += back_pad;
8910 }
8911 bufferlist old, t;
8912 old.swap(*bl);
8913 t.substr_of(old, front_copy, length - front_copy);
8914 bl->append(z);
8915 bl->claim_append(t);
8916 *offset -= front_pad;
8917 length += front_pad + back_pad;
8918 }
8919
8920 // back
8921 uint64_t end = *offset + length;
8922 unsigned back_copy = end % chunk_size;
8923 if (back_copy) {
8924 assert(back_pad == 0);
8925 back_pad = chunk_size - back_copy;
8926 assert(back_copy <= length);
8927 bufferptr tail(chunk_size);
8928 memcpy(tail.c_str(), bl->get_contiguous(length - back_copy, back_copy),
8929 back_copy);
8930 memset(tail.c_str() + back_copy, 0, back_pad);
8931 bufferlist old;
8932 old.swap(*bl);
8933 bl->substr_of(old, 0, length - back_copy);
8934 bl->append(tail);
8935 length += back_pad;
8936 pad_count += back_pad;
8937 }
8938 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
8939 << back_pad << " on front/back, now 0x" << *offset << "~"
8940 << length << std::dec << dendl;
8941 dout(40) << "after:\n";
8942 bl->hexdump(*_dout);
8943 *_dout << dendl;
8944 if (pad_count)
8945 logger->inc(l_bluestore_write_pad_bytes, pad_count);
8946 assert(bl->length() == length);
8947}
8948
8949void BlueStore::_do_write_small(
8950 TransContext *txc,
8951 CollectionRef &c,
8952 OnodeRef o,
8953 uint64_t offset, uint64_t length,
8954 bufferlist::iterator& blp,
8955 WriteContext *wctx)
8956{
8957 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
8958 << std::dec << dendl;
8959 assert(length < min_alloc_size);
8960 uint64_t end_offs = offset + length;
8961
8962 logger->inc(l_bluestore_write_small);
8963 logger->inc(l_bluestore_write_small_bytes, length);
8964
8965 bufferlist bl;
8966 blp.copy(length, bl);
8967
8968 // Look for an existing mutable blob we can use.
8969 auto begin = o->extent_map.extent_map.begin();
8970 auto end = o->extent_map.extent_map.end();
8971 auto ep = o->extent_map.seek_lextent(offset);
8972 if (ep != begin) {
8973 --ep;
8974 if (ep->blob_end() <= offset) {
8975 ++ep;
8976 }
8977 }
8978 auto prev_ep = ep;
8979 if (prev_ep != begin) {
8980 --prev_ep;
8981 } else {
8982 prev_ep = end; // to avoid this extent check as it's a duplicate
8983 }
8984
8985 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
8986 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
8987 uint32_t alloc_len = min_alloc_size;
8988 auto offset0 = P2ALIGN(offset, alloc_len);
8989
8990 bool any_change;
8991
8992 // search suitable extent in both forward and reverse direction in
8993 // [offset - target_max_blob_size, offset + target_max_blob_size] range
8994 // then check if blob can be reused via try_reuse_blob func or apply
8995 // direct/deferred write (the latter for extents including or higher
8996 // than 'offset' only).
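  // Roughly, the forward candidate is tried in this order: (1) if the
  // target range falls on allocated-but-unused space in the blob, write it
  // there directly (via a deferred op when small enough); (2) if the range
  // can be made chunk-aligned with a head/tail read-fill, do a deferred
  // read-modify-write; (3) otherwise see whether try_reuse_blob() can
  // extend the blob.  The reverse candidate only gets the try_reuse_blob()
  // check.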
8997 do {
8998 any_change = false;
8999
9000 if (ep != end && ep->logical_offset < offset + max_bsize) {
9001 BlobRef b = ep->blob;
9002 auto bstart = ep->blob_start();
9003 dout(20) << __func__ << " considering " << *b
9004 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9005 if (bstart >= end_offs) {
9006 dout(20) << __func__ << " ignoring distant " << *b << dendl;
9007 } else if (!b->get_blob().is_mutable()) {
9008 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
9009 } else if (ep->logical_offset % min_alloc_size !=
9010 ep->blob_offset % min_alloc_size) {
9011 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
9012 } else {
9013 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9014 // can we pad our head/tail out with zeros?
9015 uint64_t head_pad, tail_pad;
9016 head_pad = P2PHASE(offset, chunk_size);
9017 tail_pad = P2NPHASE(end_offs, chunk_size);
9018 if (head_pad || tail_pad) {
9019 o->extent_map.fault_range(db, offset - head_pad,
9020 end_offs - offset + head_pad + tail_pad);
9021 }
9022 if (head_pad &&
9023 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
9024 head_pad = 0;
9025 }
9026 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
9027 tail_pad = 0;
9028 }
9029
9030 uint64_t b_off = offset - head_pad - bstart;
9031 uint64_t b_len = length + head_pad + tail_pad;
9032
9033 // direct write into unused blocks of an existing mutable blob?
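        // ("unused" = blocks already allocated to the blob but never
        //  written, so we can fill them without a read-modify-write)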
9034 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
9035 b->get_blob().get_ondisk_length() >= b_off + b_len &&
9036 b->get_blob().is_unused(b_off, b_len) &&
9037 b->get_blob().is_allocated(b_off, b_len)) {
9038 bufferlist padded;
9039 _apply_padding(head_pad, tail_pad, bl, padded);
9040
9041 dout(20) << __func__ << " write to unused 0x" << std::hex
9042 << b_off << "~" << b_len
9043 << " pad 0x" << head_pad << " + 0x" << tail_pad
9044 << std::dec << " of mutable " << *b << dendl;
9045 _buffer_cache_write(txc, b, b_off, padded,
9046 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9047
9048 if (!g_conf->bluestore_debug_omit_block_device_write) {
9049 if (b_len <= prefer_deferred_size) {
9050 dout(20) << __func__ << " deferring small 0x" << std::hex
9051 << b_len << std::dec << " unused write via deferred" << dendl;
9052 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9053 op->op = bluestore_deferred_op_t::OP_WRITE;
9054 b->get_blob().map(
9055 b_off, b_len,
9056 [&](uint64_t offset, uint64_t length) {
9057 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9058 return 0;
9059 });
9060 op->data = padded;
9061 } else {
9062 b->get_blob().map_bl(
9063 b_off, padded,
9064 [&](uint64_t offset, bufferlist& t) {
9065 bdev->aio_write(offset, t,
9066 &txc->ioc, wctx->buffered);
9067 });
9068 }
9069 }
9070 b->dirty_blob().calc_csum(b_off, padded);
9071 dout(20) << __func__ << " lex old " << *ep << dendl;
9072 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
9073 b,
9074 &wctx->old_extents);
9075 b->dirty_blob().mark_used(le->blob_offset, le->length);
9076 txc->statfs_delta.stored() += le->length;
9077 dout(20) << __func__ << " lex " << *le << dendl;
9078 logger->inc(l_bluestore_write_small_unused);
9079 return;
9080 }
9081 // read some data to fill out the chunk?
9082 uint64_t head_read = P2PHASE(b_off, chunk_size);
9083 uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
9084 if ((head_read || tail_read) &&
9085 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
9086 head_read + tail_read < min_alloc_size) {
9087 b_off -= head_read;
9088 b_len += head_read + tail_read;
9089
9090 } else {
9091 head_read = tail_read = 0;
9092 }
9093
9094 // chunk-aligned deferred overwrite?
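        // (the data is journaled in the kv transaction and applied to the
        //  device after the kv commit, so a small overwrite of allocated
        //  space stays crash-safe and avoids a tiny direct aio)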
9095 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
9096 b_off % chunk_size == 0 &&
9097 b_len % chunk_size == 0 &&
9098 b->get_blob().is_allocated(b_off, b_len)) {
9099
9100 bufferlist padded;
9101 _apply_padding(head_pad, tail_pad, bl, padded);
9102
9103 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
9104 << " and tail 0x" << tail_read << std::dec << dendl;
9105 if (head_read) {
9106 bufferlist head_bl;
9107 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
9108 head_bl, 0);
9109 assert(r >= 0 && r <= (int)head_read);
9110 size_t zlen = head_read - r;
9111 if (zlen) {
9112 head_bl.append_zero(zlen);
9113 logger->inc(l_bluestore_write_pad_bytes, zlen);
9114 }
9115 head_bl.claim_append(padded);
9116 padded.swap(head_bl);
9117 logger->inc(l_bluestore_write_penalty_read_ops);
9118 }
9119 if (tail_read) {
9120 bufferlist tail_bl;
9121 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
9122 tail_bl, 0);
9123 assert(r >= 0 && r <= (int)tail_read);
9124 size_t zlen = tail_read - r;
9125 if (zlen) {
9126 tail_bl.append_zero(zlen);
9127 logger->inc(l_bluestore_write_pad_bytes, zlen);
9128 }
9129 padded.claim_append(tail_bl);
9130 logger->inc(l_bluestore_write_penalty_read_ops);
9131 }
9132 logger->inc(l_bluestore_write_small_pre_read);
9133
9134 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9135 op->op = bluestore_deferred_op_t::OP_WRITE;
9136 _buffer_cache_write(txc, b, b_off, padded,
9137 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9138
9139 int r = b->get_blob().map(
9140 b_off, b_len,
9141 [&](uint64_t offset, uint64_t length) {
9142 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9143 return 0;
9144 });
9145 assert(r == 0);
9146 if (b->get_blob().csum_type) {
9147 b->dirty_blob().calc_csum(b_off, padded);
9148 }
9149 op->data.claim(padded);
9150 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
9151 << b_len << std::dec << " of mutable " << *b
9152 << " at " << op->extents << dendl;
9153 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
9154 b, &wctx->old_extents);
9155 b->dirty_blob().mark_used(le->blob_offset, le->length);
9156 txc->statfs_delta.stored() += le->length;
9157 dout(20) << __func__ << " lex " << *le << dendl;
9158 logger->inc(l_bluestore_write_small_deferred);
9159 return;
9160 }
9161 //try to reuse blob
9162 if (b->try_reuse_blob(min_alloc_size,
9163 max_bsize,
9164 offset0 - bstart,
9165 &alloc_len)) {
9166 assert(alloc_len == min_alloc_size); // expecting data to always
9167 // fit into the reused blob
9168 // Need to check for pending writes desiring to
9169 // reuse the same pextent. The rationale is that during GC two chunks
9170 // from garbage blobs (compressed?) can share logical space within the same
9171 // AU. That in turn might be caused by an unaligned len in clone_range2.
9172 // Hence the second write will fail in an attempt to reuse blob at
9173 // do_alloc_write().
9174 if (!wctx->has_conflict(b,
9175 offset0,
9176 offset0 + alloc_len,
9177 min_alloc_size)) {
9178
9179 // we can't reuse pad_head/pad_tail since they might be truncated
9180 // due to existent extents
9181 uint64_t b_off = offset - bstart;
9182 uint64_t b_off0 = b_off;
9183 _pad_zeros(&bl, &b_off0, chunk_size);
9184
9185 dout(20) << __func__ << " reuse blob " << *b << std::hex
9186 << " (" << b_off0 << "~" << bl.length() << ")"
9187 << " (" << b_off << "~" << length << ")"
9188 << std::dec << dendl;
9189
9190 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9191 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9192 false, false);
9193 logger->inc(l_bluestore_write_small_unused);
9194 return;
9195 }
9196 }
9197 }
9198 ++ep;
9199 any_change = true;
9200 } // if (ep != end && ep->logical_offset < offset + max_bsize)
9201
9202 // check extent for reuse in reverse order
9203 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9204 BlobRef b = prev_ep->blob;
9205 auto bstart = prev_ep->blob_start();
9206 dout(20) << __func__ << " considering " << *b
9207 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9208 if (b->try_reuse_blob(min_alloc_size,
9209 max_bsize,
9210 offset0 - bstart,
9211 &alloc_len)) {
9212 assert(alloc_len == min_alloc_size); // expecting data to always
9213 // fit into the reused blob
9214 // Need to check for pending writes desiring to
9215 // reuse the same pextent. The rationale is that during GC two chunks
9216 // from garbage blobs (compressed?) can share logical space within the same
9217 // AU. That in turn might be caused by an unaligned len in clone_range2.
9218 // Hence the second write will fail in an attempt to reuse blob at
9219 // do_alloc_write().
9220 if (!wctx->has_conflict(b,
9221 offset0,
9222 offset0 + alloc_len,
9223 min_alloc_size)) {
9224
9225 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9226 uint64_t b_off = offset - bstart;
9227 uint64_t b_off0 = b_off;
9228 _pad_zeros(&bl, &b_off0, chunk_size);
9229
9230 dout(20) << __func__ << " reuse blob " << *b << std::hex
9231 << " (" << b_off0 << "~" << bl.length() << ")"
9232 << " (" << b_off << "~" << length << ")"
9233 << std::dec << dendl;
9234
9235 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9236 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9237 false, false);
9238 logger->inc(l_bluestore_write_small_unused);
9239 return;
9240 }
9241 }
9242 if (prev_ep != begin) {
9243 --prev_ep;
9244 any_change = true;
9245 } else {
9246 prev_ep = end; // to avoid useless first extent re-check
9247 }
9248 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
9249 } while (any_change);
9250
9251 // new blob.
9252
9253 BlobRef b = c->new_blob();
9254 uint64_t b_off = P2PHASE(offset, alloc_len);
9255 uint64_t b_off0 = b_off;
9256 _pad_zeros(&bl, &b_off0, block_size);
9257 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9258 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
9259 logger->inc(l_bluestore_write_small_new);
9260
9261 return;
9262}
9263
9264void BlueStore::_do_write_big(
9265 TransContext *txc,
9266 CollectionRef &c,
9267 OnodeRef o,
9268 uint64_t offset, uint64_t length,
9269 bufferlist::iterator& blp,
9270 WriteContext *wctx)
9271{
9272 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9273 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
9274 << " compress " << (int)wctx->compress
9275 << dendl;
9276 logger->inc(l_bluestore_write_big);
9277 logger->inc(l_bluestore_write_big_bytes, length);
9278 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9279 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9280 while (length > 0) {
9281 bool new_blob = false;
9282 uint32_t l = MIN(max_bsize, length);
9283 BlobRef b;
9284 uint32_t b_off = 0;
9285
9286 //attempting to reuse existing blob
9287 if (!wctx->compress) {
9288 // look for an existing mutable blob we can reuse
9289 auto begin = o->extent_map.extent_map.begin();
9290 auto end = o->extent_map.extent_map.end();
9291 auto ep = o->extent_map.seek_lextent(offset);
9292 auto prev_ep = ep;
9293 if (prev_ep != begin) {
9294 --prev_ep;
9295 } else {
9296 prev_ep = end; // to avoid this extent check as it's a duplicate
9297 }
9298 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9299 // search suitable extent in both forward and reverse direction in
9300 // [offset - target_max_blob_size, offset + target_max_blob_size] range
9301 // then check if blob can be reused via try_reuse_blob func.
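      // Unlike the small-write path there is no in-place overwrite here:
      // whether the blob is reused or new, the data is queued via
      // wctx->write() and space is actually allocated later in
      // _do_alloc_write().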
9302 bool any_change;
9303 do {
9304 any_change = false;
9305 if (ep != end && ep->logical_offset < offset + max_bsize) {
9306 if (offset >= ep->blob_start() &&
9307 ep->blob->try_reuse_blob(min_alloc_size, max_bsize,
9308 offset - ep->blob_start(),
9309 &l)) {
9310 b = ep->blob;
9311 b_off = offset - ep->blob_start();
9312 prev_ep = end; // to avoid check below
9313 dout(20) << __func__ << " reuse blob " << *b << std::hex
9314 << " (" << b_off << "~" << l << ")" << std::dec << dendl;
9315 } else {
9316 ++ep;
9317 any_change = true;
9318 }
9319 }
9320
9321 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9322 if (prev_ep->blob->try_reuse_blob(min_alloc_size, max_bsize,
9323 offset - prev_ep->blob_start(),
9324 &l)) {
9325 b = prev_ep->blob;
9326 b_off = offset - prev_ep->blob_start();
9327 dout(20) << __func__ << " reuse blob " << *b << std::hex
9328 << " (" << b_off << "~" << l << ")" << std::dec << dendl;
9329 } else if (prev_ep != begin) {
9330 --prev_ep;
9331 any_change = true;
9332 } else {
9333 prev_ep = end; // to avoid useless first extent re-check
9334 }
9335 }
9336 } while (b == nullptr && any_change);
9337 }
9338 if (b == nullptr) {
9339 b = c->new_blob();
9340 b_off = 0;
9341 new_blob = true;
9342 }
9343
9344 bufferlist t;
9345 blp.copy(l, t);
9346 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
9347 offset += l;
9348 length -= l;
9349 logger->inc(l_bluestore_write_big_blobs);
9350 }
9351}
9352
9353int BlueStore::_do_alloc_write(
9354 TransContext *txc,
9355 CollectionRef coll,
9356 OnodeRef o,
9357 WriteContext *wctx)
9358{
9359 dout(20) << __func__ << " txc " << txc
9360 << " " << wctx->writes.size() << " blobs"
9361 << dendl;
9362
9363 uint64_t need = 0;
9364 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9365 for (auto &wi : wctx->writes) {
9366 need += wi.blob_length;
9367 }
9368 int r = alloc->reserve(need);
9369 if (r < 0) {
9370 derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
9371 << dendl;
9372 return r;
9373 }
9374
9375 uint64_t hint = 0;
9376 CompressorRef c;
9377 double crr = 0;
9378 if (wctx->compress) {
9379 c = select_option(
9380 "compression_algorithm",
9381 compressor,
9382 [&]() {
9383 string val;
9384 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
9385 CompressorRef cp = compressor;
9386 if (!cp || cp->get_type_name() != val) {
9387 cp = Compressor::create(cct, val);
9388 }
9389 return boost::optional<CompressorRef>(cp);
9390 }
9391 return boost::optional<CompressorRef>();
9392 }
9393 );
9394
9395 crr = select_option(
9396 "compression_required_ratio",
9397 cct->_conf->bluestore_compression_required_ratio,
9398 [&]() {
9399 double val;
9400 if(coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
9401 return boost::optional<double>(val);
9402 }
9403 return boost::optional<double>();
9404 }
9405 );
9406 }
9407
9408 // checksum
9409 int csum = csum_type.load();
9410 csum = select_option(
9411 "csum_type",
9412 csum,
9413 [&]() {
9414 int val;
9415 if(coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
9416 return boost::optional<int>(val);
9417 }
9418 return boost::optional<int>();
9419 }
9420 );
9421
9422 for (auto& wi : wctx->writes) {
9423 BlobRef b = wi.b;
9424 bluestore_blob_t& dblob = b->dirty_blob();
9425 uint64_t b_off = wi.b_off;
9426 bufferlist *l = &wi.bl;
9427 uint64_t final_length = wi.blob_length;
9428 uint64_t csum_length = wi.blob_length;
9429 unsigned csum_order = block_size_order;
9430 bufferlist compressed_bl;
9431 bool compressed = false;
9432 if(c && wi.blob_length > min_alloc_size) {
9433
9434 utime_t start = ceph_clock_now();
9435
9436 // compress
9437 assert(b_off == 0);
9438 assert(wi.blob_length == l->length());
9439 bluestore_compression_header_t chdr;
9440 chdr.type = c->get_type();
9441 // FIXME: memory alignment here is bad
9442 bufferlist t;
9443
9444 r = c->compress(*l, t);
9445 assert(r == 0);
9446
9447 chdr.length = t.length();
9448 ::encode(chdr, compressed_bl);
9449 compressed_bl.claim_append(t);
9450 uint64_t rawlen = compressed_bl.length();
9451 uint64_t newlen = P2ROUNDUP(rawlen, min_alloc_size);
9452 uint64_t want_len_raw = final_length * crr;
9453 uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
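      // e.g. (illustrative, assuming min_alloc_size = 0x10000 and crr = 0.875):
      // a 0x40000 blob compressed to rawlen = 0x2a000 gives newlen = 0x30000
      // and want_len = 0x40000, so the compressed copy is kept and padded
      // with 0x6000 zero bytes; had it only shrunk to 0x39000, newlen would
      // round up to 0x40000 and the blob would be left uncompressed.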
9454 if (newlen <= want_len && newlen < final_length) {
9455 // Cool. We compressed at least as much as we were hoping to.
9456 // pad out to min_alloc_size
9457 compressed_bl.append_zero(newlen - rawlen);
9458 logger->inc(l_bluestore_write_pad_bytes, newlen - rawlen);
9459 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
9460 << " -> 0x" << rawlen << " => 0x" << newlen
9461 << " with " << c->get_type()
9462 << std::dec << dendl;
9463 txc->statfs_delta.compressed() += rawlen;
9464 txc->statfs_delta.compressed_original() += l->length();
9465 txc->statfs_delta.compressed_allocated() += newlen;
9466 l = &compressed_bl;
9467 final_length = newlen;
9468 csum_length = newlen;
9469 csum_order = ctz(newlen);
9470 dblob.set_compressed(wi.blob_length, rawlen);
9471 compressed = true;
9472 logger->inc(l_bluestore_compress_success_count);
9473 } else {
9474 dout(20) << __func__ << std::hex << " 0x" << l->length()
9475 << " compressed to 0x" << rawlen << " -> 0x" << newlen
9476 << " with " << c->get_type()
9477 << ", which is more than required 0x" << want_len_raw
9478 << " -> 0x" << want_len
9479 << ", leaving uncompressed"
9480 << std::dec << dendl;
9481 logger->inc(l_bluestore_compress_rejected_count);
9482 }
9483 logger->tinc(l_bluestore_compress_lat,
9484 ceph_clock_now() - start);
9485 }
9486 if (!compressed && wi.new_blob) {
9487 // initialize newly created blob only
9488 assert(!dblob.has_flag(bluestore_blob_t::FLAG_MUTABLE));
9489 dblob.set_flag(bluestore_blob_t::FLAG_MUTABLE);
9490
9491 if (l->length() != wi.blob_length) {
9492 // hrm, maybe we could do better here, but let's not bother.
9493 dout(20) << __func__ << " forcing csum_order to block_size_order "
9494 << block_size_order << dendl;
9495 csum_order = block_size_order;
9496 } else {
9497 csum_order = std::min(wctx->csum_order, ctz(l->length()));
9498 }
9499 // try to align blob with max_blob_size to improve
9500 // its reuse ratio, e.g. in case of reverse write
9501 uint32_t suggested_boff =
9502 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
9503 if ((suggested_boff % (1 << csum_order)) == 0 &&
9504 suggested_boff + final_length <= max_bsize &&
9505 suggested_boff > b_off) {
9506 dout(20) << __func__ << " forcing blob_offset to "
9507 << std::hex << suggested_boff << std::dec << dendl;
9508 assert(suggested_boff >= b_off);
9509 csum_length += suggested_boff - b_off;
9510 b_off = suggested_boff;
9511 }
9512 }
9513
9514 AllocExtentVector extents;
9515 extents.reserve(4); // 4 should be (more than) enough for most allocations
9516 int64_t got = alloc->allocate(final_length, min_alloc_size,
9517 max_alloc_size.load(),
9518 hint, &extents);
9519 assert(got == (int64_t)final_length);
9520 need -= got;
9521 txc->statfs_delta.allocated() += got;
9522 for (auto& p : extents) {
9523 bluestore_pextent_t e = bluestore_pextent_t(p);
9524 txc->allocated.insert(e.offset, e.length);
9525 hint = p.end();
9526 }
9527 dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
9528
9529 dout(20) << __func__ << " blob " << *b
9530 << " csum_type " << Checksummer::get_csum_type_string(csum)
9531 << " csum_order " << csum_order
9532 << " csum_length 0x" << std::hex << csum_length << std::dec
9533 << dendl;
9534
9535 if (csum != Checksummer::CSUM_NONE) {
9536 if (!dblob.has_csum()) {
9537 dblob.init_csum(csum, csum_order, csum_length);
9538 }
9539 dblob.calc_csum(b_off, *l);
9540 }
9541 if (wi.mark_unused) {
9542 auto b_end = b_off + wi.bl.length();
9543 if (b_off) {
9544 dblob.add_unused(0, b_off);
9545 }
9546 if (b_end < wi.blob_length) {
9547 dblob.add_unused(b_end, wi.blob_length - b_end);
9548 }
9549 }
9550
9551 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
9552 b_off + (wi.b_off0 - wi.b_off),
9553 wi.length0,
9554 wi.b,
9555 nullptr);
9556 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
9557 txc->statfs_delta.stored() += le->length;
9558 dout(20) << __func__ << " lex " << *le << dendl;
9559 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
9560 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9561
9562 // queue io
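    // Roughly: chunks no larger than prefer_deferred_size are queued as a
    // bluestore_deferred_op_t and written out to the device later, once the
    // containing kv transaction commits, while larger chunks are submitted to
    // the block device immediately via aio_write.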
9563 if (!g_conf->bluestore_debug_omit_block_device_write) {
9564 if (l->length() <= prefer_deferred_size.load()) {
9565 dout(20) << __func__ << " deferring small 0x" << std::hex
9566 << l->length() << std::dec << " write via deferred" << dendl;
9567 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9568 op->op = bluestore_deferred_op_t::OP_WRITE;
9569 int r = b->get_blob().map(
9570 b_off, l->length(),
9571 [&](uint64_t offset, uint64_t length) {
9572 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9573 return 0;
9574 });
9575 assert(r == 0);
9576 op->data = *l;
9577 } else {
9578 b->get_blob().map_bl(
9579 b_off, *l,
9580 [&](uint64_t offset, bufferlist& t) {
9581 bdev->aio_write(offset, t, &txc->ioc, false);
9582 });
9583 }
9584 }
9585 }
9586 if (need > 0) {
9587 alloc->unreserve(need);
9588 }
9589 return 0;
9590}
9591
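/*
 * Sketch of _wctx_finish(): for every extent punched out by the write, adjust
 * the statfs counters, drop shared-blob references (only space no longer
 * referenced by any user ends up in the release list), record the freed
 * pextents in txc->released, and prune spanning blobs that became
 * unreferenced.
 */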
9592void BlueStore::_wctx_finish(
9593 TransContext *txc,
9594 CollectionRef& c,
9595 OnodeRef o,
9596 WriteContext *wctx)
9597{
9598 auto oep = wctx->old_extents.begin();
9599 while (oep != wctx->old_extents.end()) {
9600 auto &lo = *oep;
9601 oep = wctx->old_extents.erase(oep);
9602 dout(20) << __func__ << " lex_old " << lo.e << dendl;
9603 BlobRef b = lo.e.blob;
9604 const bluestore_blob_t& blob = b->get_blob();
9605 if (blob.is_compressed()) {
9606 if (lo.blob_empty) {
9607 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
9608 }
9609 txc->statfs_delta.compressed_original() -= lo.e.length;
9610 }
9611 auto& r = lo.r;
9612 txc->statfs_delta.stored() -= lo.e.length;
9613 if (!r.empty()) {
9614 dout(20) << __func__ << " blob release " << r << dendl;
9615 if (blob.is_shared()) {
9616 PExtentVector final;
9617 c->load_shared_blob(b->shared_blob);
9618 for (auto e : r) {
9619 b->shared_blob->put_ref(e.offset, e.length, &final);
9620 }
9621 dout(20) << __func__ << " shared_blob release " << final
9622 << " from " << *b->shared_blob << dendl;
9623 txc->write_shared_blob(b->shared_blob);
9624 r.clear();
9625 r.swap(final);
9626 }
9627 }
9628 // we can't invalidate our logical extents as we drop them because
9629 // other lextents (either in our onode or others) may still
9630 // reference them. but we can throw out anything that is no
9631 // longer allocated. Note that this will leave behind edge bits
9632 // that are no longer referenced but not deallocated (until they
9633 // age out of the cache naturally).
9634 b->discard_unallocated(c.get());
9635 for (auto e : r) {
9636 dout(20) << __func__ << " release " << e << dendl;
9637 txc->released.insert(e.offset, e.length);
9638 txc->statfs_delta.allocated() -= e.length;
9639 if (blob.is_compressed()) {
9640 txc->statfs_delta.compressed_allocated() -= e.length;
9641 }
9642 }
9643 delete &lo;
9644 if (b->is_spanning() && !b->is_referenced()) {
9645 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
9646 << dendl;
9647 o->extent_map.spanning_blob_map.erase(b->id);
9648 }
9649 }
9650}
9651
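/*
 * Writes that fit within a single min_alloc_size block (and are not exactly
 * one full block) go through _do_write_small(); anything else is split into
 * an unaligned head, a block-aligned middle and an unaligned tail.  For
 * example (illustrative), with min_alloc_size = 0x10000, offset = 0x3000 and
 * length = 0x25000:
 *   head   0x3000~0xd000   -> _do_write_small
 *   middle 0x10000~0x10000 -> _do_write_big
 *   tail   0x20000~0x8000  -> _do_write_small
 */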
9652void BlueStore::_do_write_data(
9653 TransContext *txc,
9654 CollectionRef& c,
9655 OnodeRef o,
9656 uint64_t offset,
9657 uint64_t length,
9658 bufferlist& bl,
9659 WriteContext *wctx)
9660{
9661 uint64_t end = offset + length;
9662 bufferlist::iterator p = bl.begin();
9663
9664 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
9665 (length != min_alloc_size)) {
9666 // we fall within the same block
9667 _do_write_small(txc, c, o, offset, length, p, wctx);
9668 } else {
9669 uint64_t head_offset, head_length;
9670 uint64_t middle_offset, middle_length;
9671 uint64_t tail_offset, tail_length;
9672
9673 head_offset = offset;
9674 head_length = P2NPHASE(offset, min_alloc_size);
9675
9676 tail_offset = P2ALIGN(end, min_alloc_size);
9677 tail_length = P2PHASE(end, min_alloc_size);
9678
9679 middle_offset = head_offset + head_length;
9680 middle_length = length - head_length - tail_length;
9681
9682 if (head_length) {
9683 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
9684 }
9685
9686 if (middle_length) {
9687 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
9688 }
9689
9690 if (tail_length) {
9691 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
9692 }
9693 }
9694}
9695
9696int BlueStore::_do_write(
9697 TransContext *txc,
9698 CollectionRef& c,
9699 OnodeRef o,
9700 uint64_t offset,
9701 uint64_t length,
9702 bufferlist& bl,
9703 uint32_t fadvise_flags)
9704{
9705 int r = 0;
9706
9707 dout(20) << __func__
9708 << " " << o->oid
9709 << " 0x" << std::hex << offset << "~" << length
9710 << " - have 0x" << o->onode.size
9711 << " (" << std::dec << o->onode.size << ")"
9712 << " bytes"
9713 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
9714 << dendl;
9715 _dump_onode(o);
9716
9717 if (length == 0) {
9718 return 0;
9719 }
9720
9721 uint64_t end = offset + length;
9722 bool was_gc = false;
9723 GarbageCollector gc(c->store->cct);
9724 int64_t benefit;
9725 auto dirty_start = offset;
9726 auto dirty_end = offset + length;
9727
9728 WriteContext wctx, wctx_gc;
9729 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
9730 dout(20) << __func__ << " will do buffered write" << dendl;
9731 wctx.buffered = true;
9732 } else if (cct->_conf->bluestore_default_buffered_write &&
9733 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
9734 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
9735 dout(20) << __func__ << " defaulting to buffered write" << dendl;
9736 wctx.buffered = true;
9737 }
9738
9739 // FIXME: Using the MAX of the block_size_order and preferred_csum_order
9740 // results in poor small random read performance when data was initially
9741 // written out in large chunks. Reverting to previous behavior for now.
9742 wctx.csum_order = block_size_order;
9743
9744 // compression parameters
9745 unsigned alloc_hints = o->onode.alloc_hint_flags;
9746 auto cm = select_option(
9747 "compression_mode",
9748 comp_mode.load(),
9749 [&]() {
9750 string val;
9751 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
9752 return boost::optional<Compressor::CompressionMode>(Compressor::get_comp_mode_type(val));
9753 }
9754 return boost::optional<Compressor::CompressionMode>();
9755 }
9756 );
9757 wctx.compress = (cm != Compressor::COMP_NONE) &&
9758 ((cm == Compressor::COMP_FORCE) ||
9759 (cm == Compressor::COMP_AGGRESSIVE &&
9760 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
9761 (cm == Compressor::COMP_PASSIVE &&
9762 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
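  // In short: FORCE always compresses, AGGRESSIVE compresses unless the
  // object is hinted INCOMPRESSIBLE, PASSIVE compresses only when hinted
  // COMPRESSIBLE, and NONE never does.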
9763
9764 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
9765 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
9766 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE|
9767 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
9768 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
9769 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
9770 auto order = min_alloc_size_order.load();
9771 if (o->onode.expected_write_size) {
9772 wctx.csum_order = std::max(order,
9773 (uint8_t)ctz(o->onode.expected_write_size));
9774 } else {
9775 wctx.csum_order = order;
9776 }
9777
9778 if (wctx.compress) {
9779 wctx.target_blob_size = select_option(
9780 "compression_max_blob_size",
9781 comp_max_blob_size.load(),
9782 [&]() {
9783 int val;
9784 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
9785 return boost::optional<uint64_t>((uint64_t)val);
9786 }
9787 return boost::optional<uint64_t>();
9788 }
9789 );
9790 }
9791 } else {
9792 if (wctx.compress) {
9793 wctx.target_blob_size = select_option(
9794 "compression_min_blob_size",
9795 comp_min_blob_size.load(),
9796 [&]() {
9797 int val;
9798 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
9799 return boost::optional<uint64_t>((uint64_t)val);
9800 }
9801 return boost::optional<uint64_t>();
9802 }
9803 );
9804 }
9805 }
9806 uint64_t max_bsize = max_blob_size.load();
9807 if (wctx.target_blob_size == 0 || wctx.target_blob_size > max_bsize) {
9808 wctx.target_blob_size = max_bsize;
9809 }
9810 // set the min blob size floor at 2x the min_alloc_size, or else we
9811 // won't be able to allocate a smaller extent for the compressed
9812 // data.
9813 if (wctx.compress &&
9814 wctx.target_blob_size < min_alloc_size * 2) {
9815 wctx.target_blob_size = min_alloc_size * 2;
9816 }
9817 wctx_gc.fork(wctx); // make a clone for garbage collection
9818 dout(20) << __func__ << " prefer csum_order " << wctx.csum_order
9819 << " target_blob_size 0x" << std::hex << wctx.target_blob_size
9820 << std::dec << dendl;
9821
9822 o->extent_map.fault_range(db, offset, length);
9823 _do_write_data(txc, c, o, offset, length, bl, &wctx);
9824
9825 r = _do_alloc_write(txc, c, o, &wctx);
9826 if (r < 0) {
9827 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
9828 << dendl;
9829 goto out;
9830 }
9831
9832 benefit = gc.estimate(offset,
9833 length,
9834 o->extent_map,
9835 wctx.old_extents,
9836 min_alloc_size);
9837
9838 _wctx_finish(txc, c, o, &wctx);
9839 if (end > o->onode.size) {
9840 dout(20) << __func__ << " extending size to 0x" << std::hex << end
9841 << std::dec << dendl;
9842 o->onode.size = end;
9843 }
9844
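  // If the estimated benefit (counted in allocation units that rewriting
  // would free) reaches bluestore_gc_enable_total_threshold, re-read the
  // ranges the garbage collector flagged and rewrite them through wctx_gc
  // below.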
9845 if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
9846 dout(20) << __func__ << " perform garbage collection, expected benefit = "
9847 << benefit << " AUs" << dendl;
9848 auto& extents_to_collect = gc.get_extents_to_collect();
9849 for (auto it = extents_to_collect.begin();
9850 it != extents_to_collect.end();
9851 ++it) {
9852 bufferlist bl;
9853 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
9854 assert(r == (int)it->length);
9855 o->extent_map.fault_range(db, it->offset, it->length);
9856 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
9857 logger->inc(l_bluestore_gc_merged, it->length);
9858 was_gc = true;
9859 if (dirty_start > it->offset) {
9860 dirty_start = it->offset;
9861 }
9862 if (dirty_end < it->offset + it->length) {
9863 dirty_end = it->offset + it->length;
9864 }
9865 }
9866 }
9867 if (was_gc) {
9868 dout(30) << __func__ << " alloc write for GC" << dendl;
9869 r = _do_alloc_write(txc, c, o, &wctx_gc);
9870 if (r < 0) {
9871 derr << __func__ << " _do_alloc_write(gc) failed with " << cpp_strerror(r)
9872 << dendl;
9873 goto out;
9874 }
9875 _wctx_finish(txc, c, o, &wctx_gc);
9876 }
9877
9878 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
9879 o->extent_map.dirty_range(txc->t, dirty_start, dirty_end - dirty_start);
9880 r = 0;
9881
9882 out:
9883 return r;
9884}
9885
9886int BlueStore::_write(TransContext *txc,
9887 CollectionRef& c,
9888 OnodeRef& o,
9889 uint64_t offset, size_t length,
9890 bufferlist& bl,
9891 uint32_t fadvise_flags)
9892{
9893 dout(15) << __func__ << " " << c->cid << " " << o->oid
9894 << " 0x" << std::hex << offset << "~" << length << std::dec
9895 << dendl;
9896 o->exists = true;
9897 _assign_nid(txc, o);
9898 int r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
9899 txc->write_onode(o);
9900
9901 dout(10) << __func__ << " " << c->cid << " " << o->oid
9902 << " 0x" << std::hex << offset << "~" << length << std::dec
9903 << " = " << r << dendl;
9904 return r;
9905}
9906
9907int BlueStore::_zero(TransContext *txc,
9908 CollectionRef& c,
9909 OnodeRef& o,
9910 uint64_t offset, size_t length)
9911{
9912 dout(15) << __func__ << " " << c->cid << " " << o->oid
9913 << " 0x" << std::hex << offset << "~" << length << std::dec
9914 << dendl;
9915 o->exists = true;
9916 _assign_nid(txc, o);
9917 int r = _do_zero(txc, c, o, offset, length);
9918 dout(10) << __func__ << " " << c->cid << " " << o->oid
9919 << " 0x" << std::hex << offset << "~" << length << std::dec
9920 << " = " << r << dendl;
9921 return r;
9922}
9923
9924int BlueStore::_do_zero(TransContext *txc,
9925 CollectionRef& c,
9926 OnodeRef& o,
9927 uint64_t offset, size_t length)
9928{
9929 dout(15) << __func__ << " " << c->cid << " " << o->oid
9930 << " 0x" << std::hex << offset << "~" << length << std::dec
9931 << dendl;
9932 int r = 0;
9933
9934 _dump_onode(o);
9935
9936 WriteContext wctx;
9937 o->extent_map.fault_range(db, offset, length);
9938 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
9939 o->extent_map.dirty_range(txc->t, offset, length);
9940 _wctx_finish(txc, c, o, &wctx);
9941
9942 if (offset + length > o->onode.size) {
9943 o->onode.size = offset + length;
9944 dout(20) << __func__ << " extending size to " << offset + length
9945 << dendl;
9946 }
9947 txc->write_onode(o);
9948
9949 dout(10) << __func__ << " " << c->cid << " " << o->oid
9950 << " 0x" << std::hex << offset << "~" << length << std::dec
9951 << " = " << r << dendl;
9952 return r;
9953}
9954
9955void BlueStore::_do_truncate(
9956 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset)
9957{
9958 dout(15) << __func__ << " " << c->cid << " " << o->oid
9959 << " 0x" << std::hex << offset << std::dec << dendl;
9960
9961 _dump_onode(o, 30);
9962
9963 if (offset == o->onode.size)
9964    return;
9965
9966 if (offset < o->onode.size) {
9967 WriteContext wctx;
9968 uint64_t length = o->onode.size - offset;
9969 o->extent_map.fault_range(db, offset, length);
9970 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
9971 o->extent_map.dirty_range(txc->t, offset, length);
9972 _wctx_finish(txc, c, o, &wctx);
9973
9974 // if we have shards past EOF, ask for a reshard
9975 if (!o->onode.extent_map_shards.empty() &&
9976 o->onode.extent_map_shards.back().offset >= offset) {
9977 dout(10) << __func__ << " request reshard past EOF" << dendl;
9978 if (offset) {
9979 o->extent_map.request_reshard(offset - 1, offset + length);
9980 } else {
9981 o->extent_map.request_reshard(0, length);
9982 }
9983 }
9984 }
9985
9986 o->onode.size = offset;
9987
9988 txc->write_onode(o);
9989}
9990
9991void BlueStore::_truncate(TransContext *txc,
9992 CollectionRef& c,
9993 OnodeRef& o,
9994 uint64_t offset)
9995{
9996 dout(15) << __func__ << " " << c->cid << " " << o->oid
9997 << " 0x" << std::hex << offset << std::dec
9998 << dendl;
9999 _do_truncate(txc, c, o, offset);
10000}
10001
10002int BlueStore::_do_remove(
10003 TransContext *txc,
10004 CollectionRef& c,
10005 OnodeRef o)
10006{
10007 _do_truncate(txc, c, o, 0);
10008 if (o->onode.has_omap()) {
10009 o->flush();
10010 _do_omap_clear(txc, o->onode.nid);
10011 }
10012 o->exists = false;
10013 string key;
10014 for (auto &s : o->extent_map.shards) {
10015 dout(20) << __func__ << " removing shard 0x" << std::hex
10016 << s.shard_info->offset << std::dec << dendl;
10017 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
10018 [&](const string& final_key) {
10019 txc->t->rmkey(PREFIX_OBJ, final_key);
10020 }
10021 );
10022 }
10023 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
10024 txc->removed(o);
10025 o->extent_map.clear();
10026 o->onode = bluestore_onode_t();
10027 _debug_obj_on_delete(o->oid);
10028 return 0;
10029}
10030
10031int BlueStore::_remove(TransContext *txc,
10032 CollectionRef& c,
10033 OnodeRef &o)
10034{
10035 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10036 int r = _do_remove(txc, c, o);
10037 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10038 return r;
10039}
10040
10041int BlueStore::_setattr(TransContext *txc,
10042 CollectionRef& c,
10043 OnodeRef& o,
10044 const string& name,
10045 bufferptr& val)
10046{
10047 dout(15) << __func__ << " " << c->cid << " " << o->oid
10048 << " " << name << " (" << val.length() << " bytes)"
10049 << dendl;
10050 int r = 0;
10051 if (val.is_partial())
10052 o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), val.length());
10053 else
10054 o->onode.attrs[name.c_str()] = val;
10055 txc->write_onode(o);
10056 dout(10) << __func__ << " " << c->cid << " " << o->oid
10057 << " " << name << " (" << val.length() << " bytes)"
10058 << " = " << r << dendl;
10059 return r;
10060}
10061
10062int BlueStore::_setattrs(TransContext *txc,
10063 CollectionRef& c,
10064 OnodeRef& o,
10065 const map<string,bufferptr>& aset)
10066{
10067 dout(15) << __func__ << " " << c->cid << " " << o->oid
10068 << " " << aset.size() << " keys"
10069 << dendl;
10070 int r = 0;
10071 for (map<string,bufferptr>::const_iterator p = aset.begin();
10072 p != aset.end(); ++p) {
10073 if (p->second.is_partial())
10074 o->onode.attrs[p->first.c_str()] =
10075 bufferptr(p->second.c_str(), p->second.length());
10076 else
10077 o->onode.attrs[p->first.c_str()] = p->second;
10078 }
10079 txc->write_onode(o);
10080 dout(10) << __func__ << " " << c->cid << " " << o->oid
10081 << " " << aset.size() << " keys"
10082 << " = " << r << dendl;
10083 return r;
10084}
10085
10086
10087int BlueStore::_rmattr(TransContext *txc,
10088 CollectionRef& c,
10089 OnodeRef& o,
10090 const string& name)
10091{
10092 dout(15) << __func__ << " " << c->cid << " " << o->oid
10093 << " " << name << dendl;
10094 int r = 0;
10095 auto it = o->onode.attrs.find(name.c_str());
10096 if (it == o->onode.attrs.end())
10097 goto out;
10098
10099 o->onode.attrs.erase(it);
10100 txc->write_onode(o);
10101
10102 out:
10103 dout(10) << __func__ << " " << c->cid << " " << o->oid
10104 << " " << name << " = " << r << dendl;
10105 return r;
10106}
10107
10108int BlueStore::_rmattrs(TransContext *txc,
10109 CollectionRef& c,
10110 OnodeRef& o)
10111{
10112 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10113 int r = 0;
10114
10115 if (o->onode.attrs.empty())
10116 goto out;
10117
10118 o->onode.attrs.clear();
10119 txc->write_onode(o);
10120
10121 out:
10122 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10123 return r;
10124}
10125
10126void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
10127{
10128 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10129 string prefix, tail;
10130 get_omap_header(id, &prefix);
10131 get_omap_tail(id, &tail);
10132 it->lower_bound(prefix);
10133 while (it->valid()) {
10134 if (it->key() >= tail) {
10135 dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
10136 << dendl;
10137 break;
10138 }
10139 txc->t->rmkey(PREFIX_OMAP, it->key());
10140 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10141 it->next();
10142 }
10143}
10144
10145int BlueStore::_omap_clear(TransContext *txc,
10146 CollectionRef& c,
10147 OnodeRef& o)
10148{
10149 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10150 int r = 0;
10151 if (o->onode.has_omap()) {
10152 o->flush();
10153 _do_omap_clear(txc, o->onode.nid);
10154 o->onode.clear_omap_flag();
10155 txc->write_onode(o);
10156 }
10157 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10158 return r;
10159}
10160
10161int BlueStore::_omap_setkeys(TransContext *txc,
10162 CollectionRef& c,
10163 OnodeRef& o,
10164 bufferlist &bl)
10165{
10166 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10167 int r;
10168 bufferlist::iterator p = bl.begin();
10169 __u32 num;
10170 if (!o->onode.has_omap()) {
10171 o->onode.set_omap_flag();
10172 txc->write_onode(o);
10173 } else {
10174 txc->note_modified_object(o);
10175 }
10176 string final_key;
10177 _key_encode_u64(o->onode.nid, &final_key);
10178 final_key.push_back('.');
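  // final_key now holds the 9-byte prefix <8-byte encoded nid>'.'; the
  // resize(9) below trims back to this prefix before each user key is
  // appended.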
10179 ::decode(num, p);
10180 while (num--) {
10181 string key;
10182 bufferlist value;
10183 ::decode(key, p);
10184 ::decode(value, p);
10185 final_key.resize(9); // keep prefix
10186 final_key += key;
10187 dout(30) << __func__ << " " << pretty_binary_string(final_key)
10188 << " <- " << key << dendl;
10189 txc->t->set(PREFIX_OMAP, final_key, value);
10190 }
10191 r = 0;
10192 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10193 return r;
10194}
10195
10196int BlueStore::_omap_setheader(TransContext *txc,
10197 CollectionRef& c,
10198 OnodeRef &o,
10199 bufferlist& bl)
10200{
10201 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10202 int r;
10203 string key;
10204 if (!o->onode.has_omap()) {
10205 o->onode.set_omap_flag();
10206 txc->write_onode(o);
10207 } else {
10208 txc->note_modified_object(o);
10209 }
10210 get_omap_header(o->onode.nid, &key);
10211 txc->t->set(PREFIX_OMAP, key, bl);
10212 r = 0;
10213 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10214 return r;
10215}
10216
10217int BlueStore::_omap_rmkeys(TransContext *txc,
10218 CollectionRef& c,
10219 OnodeRef& o,
10220 bufferlist& bl)
10221{
10222 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10223 int r = 0;
10224 bufferlist::iterator p = bl.begin();
10225 __u32 num;
10226 string final_key;
10227
10228 if (!o->onode.has_omap()) {
10229 goto out;
10230 }
10231 _key_encode_u64(o->onode.nid, &final_key);
10232 final_key.push_back('.');
10233 ::decode(num, p);
10234 while (num--) {
10235 string key;
10236 ::decode(key, p);
10237 final_key.resize(9); // keep prefix
10238 final_key += key;
10239 dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
10240 << " <- " << key << dendl;
10241 txc->t->rmkey(PREFIX_OMAP, final_key);
10242 }
10243 txc->note_modified_object(o);
10244
10245 out:
10246 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10247 return r;
10248}
10249
10250int BlueStore::_omap_rmkey_range(TransContext *txc,
10251 CollectionRef& c,
10252 OnodeRef& o,
10253 const string& first, const string& last)
10254{
10255 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10256 KeyValueDB::Iterator it;
10257 string key_first, key_last;
10258 int r = 0;
10259 if (!o->onode.has_omap()) {
10260 goto out;
10261 }
10262 o->flush();
10263 it = db->get_iterator(PREFIX_OMAP);
10264 get_omap_key(o->onode.nid, first, &key_first);
10265 get_omap_key(o->onode.nid, last, &key_last);
10266 it->lower_bound(key_first);
10267 while (it->valid()) {
10268 if (it->key() >= key_last) {
10269 dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
10270 << dendl;
10271 break;
10272 }
10273 txc->t->rmkey(PREFIX_OMAP, it->key());
10274 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10275 it->next();
10276 }
10277 txc->note_modified_object(o);
10278
10279 out:
10280 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10281 return r;
10282}
10283
10284int BlueStore::_set_alloc_hint(
10285 TransContext *txc,
10286 CollectionRef& c,
10287 OnodeRef& o,
10288 uint64_t expected_object_size,
10289 uint64_t expected_write_size,
10290 uint32_t flags)
10291{
10292 dout(15) << __func__ << " " << c->cid << " " << o->oid
10293 << " object_size " << expected_object_size
10294 << " write_size " << expected_write_size
10295 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
10296 << dendl;
10297 int r = 0;
10298 o->onode.expected_object_size = expected_object_size;
10299 o->onode.expected_write_size = expected_write_size;
10300 o->onode.alloc_hint_flags = flags;
10301 txc->write_onode(o);
10302 dout(10) << __func__ << " " << c->cid << " " << o->oid
10303 << " object_size " << expected_object_size
10304 << " write_size " << expected_write_size
10305 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
10306 << " = " << r << dendl;
10307 return r;
10308}
10309
10310int BlueStore::_clone(TransContext *txc,
10311 CollectionRef& c,
10312 OnodeRef& oldo,
10313 OnodeRef& newo)
10314{
10315 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10316 << newo->oid << dendl;
10317 int r = 0;
10318 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
10319 derr << __func__ << " mismatched hash on " << oldo->oid
10320 << " and " << newo->oid << dendl;
10321 return -EINVAL;
10322 }
10323
10324 newo->exists = true;
10325 _assign_nid(txc, newo);
10326
10327 // clone data
10328 oldo->flush();
10329 _do_truncate(txc, c, newo, 0);
10330 if (cct->_conf->bluestore_clone_cow) {
10331 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
10332 } else {
10333 bufferlist bl;
10334 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
10335 if (r < 0)
10336 goto out;
10337 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
10338 if (r < 0)
10339 goto out;
10340 }
10341
10342 // clone attrs
10343 newo->onode.attrs = oldo->onode.attrs;
10344
10345 // clone omap
10346 if (newo->onode.has_omap()) {
10347 dout(20) << __func__ << " clearing old omap data" << dendl;
10348 newo->flush();
10349 _do_omap_clear(txc, newo->onode.nid);
10350 }
10351 if (oldo->onode.has_omap()) {
10352 dout(20) << __func__ << " copying omap data" << dendl;
10353 if (!newo->onode.has_omap()) {
10354 newo->onode.set_omap_flag();
10355 }
10356 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10357 string head, tail;
10358 get_omap_header(oldo->onode.nid, &head);
10359 get_omap_tail(oldo->onode.nid, &tail);
10360 it->lower_bound(head);
10361 while (it->valid()) {
10362 if (it->key() >= tail) {
10363 dout(30) << __func__ << " reached tail" << dendl;
10364 break;
10365 } else {
10366 dout(30) << __func__ << " got header/data "
10367 << pretty_binary_string(it->key()) << dendl;
10368 string key;
10369 rewrite_omap_key(newo->onode.nid, it->key(), &key);
10370 txc->t->set(PREFIX_OMAP, key, it->value());
10371 }
10372 it->next();
10373 }
10374 } else {
10375 newo->onode.clear_omap_flag();
10376 }
10377
10378 txc->write_onode(newo);
10379 r = 0;
10380
10381 out:
10382 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10383 << newo->oid << " = " << r << dendl;
10384 return r;
10385}
10386
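/*
 * Copy-on-write clone sketch: each source blob touched by the range is made
 * shared (its physical extents become refcounted), duplicated once per call,
 * and the destination onode gets new extents that reference the same physical
 * space, so no object data is rewritten on disk.
 */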
10387int BlueStore::_do_clone_range(
10388 TransContext *txc,
10389 CollectionRef& c,
10390 OnodeRef& oldo,
10391 OnodeRef& newo,
10392 uint64_t srcoff, uint64_t length, uint64_t dstoff)
10393{
10394 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10395 << newo->oid
10396 << " 0x" << std::hex << srcoff << "~" << length << " -> "
10397 << " 0x" << dstoff << "~" << length << std::dec << dendl;
10398 oldo->extent_map.fault_range(db, srcoff, length);
10399 newo->extent_map.fault_range(db, dstoff, length);
10400 _dump_onode(oldo);
10401 _dump_onode(newo);
10402
10403 // hmm, this could go into an ExtentMap::dup() method.
10404 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
10405 for (auto &e : oldo->extent_map.extent_map) {
10406 e.blob->last_encoded_id = -1;
10407 }
10408 int n = 0;
10409 bool dirtied_oldo = false;
10410 uint64_t end = srcoff + length;
10411 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
10412 ep != oldo->extent_map.extent_map.end();
10413 ++ep) {
10414 auto& e = *ep;
10415 if (e.logical_offset >= end) {
10416 break;
10417 }
10418 dout(20) << __func__ << " src " << e << dendl;
10419 BlobRef cb;
10420 bool blob_duped = true;
10421 if (e.blob->last_encoded_id >= 0) {
10422 // blob is already duped
10423 cb = id_to_blob[e.blob->last_encoded_id];
10424 blob_duped = false;
10425 } else {
10426 // dup the blob
10427 const bluestore_blob_t& blob = e.blob->get_blob();
10428 // make sure it is shared
10429 if (!blob.is_shared()) {
10430 c->make_blob_shared(_assign_blobid(txc), e.blob);
10431 dirtied_oldo = true; // fixme: overkill
10432 } else {
10433 c->load_shared_blob(e.blob->shared_blob);
10434 }
10435 cb = new Blob();
10436 e.blob->last_encoded_id = n;
10437 id_to_blob[n] = cb;
10438 e.blob->dup(*cb);
10439 // bump the extent refs on the copied blob's extents
10440 for (auto p : blob.get_extents()) {
10441 if (p.is_valid()) {
10442 e.blob->shared_blob->get_ref(p.offset, p.length);
10443 }
10444 }
10445 txc->write_shared_blob(e.blob->shared_blob);
10446 dout(20) << __func__ << " new " << *cb << dendl;
10447 }
10448 // dup extent
10449 int skip_front, skip_back;
10450 if (e.logical_offset < srcoff) {
10451 skip_front = srcoff - e.logical_offset;
10452 } else {
10453 skip_front = 0;
10454 }
10455 if (e.logical_end() > end) {
10456 skip_back = e.logical_end() - end;
10457 } else {
10458 skip_back = 0;
10459 }
10460 Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
10461 e.blob_offset + skip_front,
10462 e.length - skip_front - skip_back, cb);
10463 newo->extent_map.extent_map.insert(*ne);
10464 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
10465 // fixme: we may leave parts of new blob unreferenced that could
10466 // be freed (relative to the shared_blob).
10467 txc->statfs_delta.stored() += ne->length;
10468 if (e.blob->get_blob().is_compressed()) {
10469 txc->statfs_delta.compressed_original() += ne->length;
10470 if (blob_duped){
10471 txc->statfs_delta.compressed() +=
10472 cb->get_blob().get_compressed_payload_length();
10473 }
10474 }
10475 dout(20) << __func__ << " dst " << *ne << dendl;
10476 ++n;
10477 }
10478 if (dirtied_oldo) {
10479 oldo->extent_map.dirty_range(txc->t, srcoff, length); // overkill
10480 txc->write_onode(oldo);
10481 }
10482 txc->write_onode(newo);
10483
10484 if (dstoff + length > newo->onode.size) {
10485 newo->onode.size = dstoff + length;
10486 }
10487 newo->extent_map.dirty_range(txc->t, dstoff, length);
10488 _dump_onode(oldo);
10489 _dump_onode(newo);
10490 return 0;
10491}
10492
10493int BlueStore::_clone_range(TransContext *txc,
10494 CollectionRef& c,
10495 OnodeRef& oldo,
10496 OnodeRef& newo,
10497 uint64_t srcoff, uint64_t length, uint64_t dstoff)
10498{
10499 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10500 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
10501 << " to offset 0x" << dstoff << std::dec << dendl;
10502 int r = 0;
10503
10504 if (srcoff + length > oldo->onode.size) {
10505 r = -EINVAL;
10506 goto out;
10507 }
10508
10509 newo->exists = true;
10510 _assign_nid(txc, newo);
10511
10512 if (length > 0) {
10513 if (cct->_conf->bluestore_clone_cow) {
10514 _do_zero(txc, c, newo, dstoff, length);
10515 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
10516 } else {
10517 bufferlist bl;
10518 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
10519 if (r < 0)
10520 goto out;
10521 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
10522 if (r < 0)
10523 goto out;
10524 }
10525 }
10526
10527 txc->write_onode(newo);
10528 r = 0;
10529
10530 out:
10531 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10532 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
10533 << " to offset 0x" << dstoff << std::dec
10534 << " = " << r << dendl;
10535 return r;
10536}
10537
10538int BlueStore::_rename(TransContext *txc,
10539 CollectionRef& c,
10540 OnodeRef& oldo,
10541 OnodeRef& newo,
10542 const ghobject_t& new_oid)
10543{
10544 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10545 << new_oid << dendl;
10546 int r;
10547 ghobject_t old_oid = oldo->oid;
10548 mempool::bluestore_meta_other::string new_okey;
10549
10550 if (newo) {
10551 if (newo->exists) {
10552 r = -EEXIST;
10553 goto out;
10554 }
10555 assert(txc->onodes.count(newo) == 0);
10556 }
10557
10558 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
10559
10560 // rewrite shards
10561 {
10562 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
10563 get_object_key(cct, new_oid, &new_okey);
10564 string key;
10565 for (auto &s : oldo->extent_map.shards) {
10566 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
10567 [&](const string& final_key) {
10568 txc->t->rmkey(PREFIX_OBJ, final_key);
10569 }
10570 );
10571 s.dirty = true;
10572 }
10573 }
10574
10575 newo = oldo;
10576 txc->write_onode(newo);
10577
10578  // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
10579 // Onode in the old slot
10580 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
10581 r = 0;
10582
10583 out:
10584 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
10585 << new_oid << " = " << r << dendl;
10586 return r;
10587}
10588
10589// collections
10590
10591int BlueStore::_create_collection(
10592 TransContext *txc,
10593 const coll_t &cid,
10594 unsigned bits,
10595 CollectionRef *c)
10596{
10597 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
10598 int r;
10599 bufferlist bl;
10600
10601 {
10602 RWLock::WLocker l(coll_lock);
10603 if (*c) {
10604 r = -EEXIST;
10605 goto out;
10606 }
10607 c->reset(
10608 new Collection(
10609 this,
10610 cache_shards[cid.hash_to_shard(cache_shards.size())],
10611 cid));
10612 (*c)->cnode.bits = bits;
10613 coll_map[cid] = *c;
10614 }
10615 ::encode((*c)->cnode, bl);
10616 txc->t->set(PREFIX_COLL, stringify(cid), bl);
10617 r = 0;
10618
10619 out:
10620 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
10621 return r;
10622}
10623
10624int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
10625 CollectionRef *c)
10626{
10627 dout(15) << __func__ << " " << cid << dendl;
10628 int r;
10629
10630 {
10631 RWLock::WLocker l(coll_lock);
10632 if (!*c) {
10633 r = -ENOENT;
10634 goto out;
10635 }
10636 size_t nonexistent_count = 0;
10637 assert((*c)->exists);
10638 if ((*c)->onode_map.map_any([&](OnodeRef o) {
10639 if (o->exists) {
10640 dout(10) << __func__ << " " << o->oid << " " << o
10641 << " exists in onode_map" << dendl;
10642 return true;
10643 }
10644 ++nonexistent_count;
10645 return false;
10646 })) {
10647 r = -ENOTEMPTY;
10648 goto out;
10649 }
10650
10651 vector<ghobject_t> ls;
10652 ghobject_t next;
10653    // Enumerate onodes in the db, up to nonexistent_count + 1,
10654    // then check that all of them are marked as non-existent.
10655    // Bypass the check if the returned number is greater than nonexistent_count.
10656 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
10657 nonexistent_count + 1, &ls, &next);
10658 if (r >= 0) {
10659 bool exists = false; //ls.size() > nonexistent_count;
10660 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
10661 dout(10) << __func__ << " oid " << *it << dendl;
10662 auto onode = (*c)->onode_map.lookup(*it);
10663 exists = !onode || onode->exists;
10664 if (exists) {
10665 dout(10) << __func__ << " " << *it
10666 << " exists in db" << dendl;
10667 }
10668 }
10669 if (!exists) {
10670 coll_map.erase(cid);
10671 txc->removed_collections.push_back(*c);
10672 (*c)->exists = false;
10673 c->reset();
10674 txc->t->rmkey(PREFIX_COLL, stringify(cid));
10675 r = 0;
10676 } else {
10677 dout(10) << __func__ << " " << cid
10678 << " is non-empty" << dendl;
10679 r = -ENOTEMPTY;
10680 }
10681 }
10682 }
10683
10684 out:
10685 dout(10) << __func__ << " " << cid << " = " << r << dendl;
10686 return r;
10687}
10688
10689int BlueStore::_split_collection(TransContext *txc,
10690 CollectionRef& c,
10691 CollectionRef& d,
10692 unsigned bits, int rem)
10693{
10694 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
10695 << " bits " << bits << dendl;
10696 RWLock::WLocker l(c->lock);
10697 RWLock::WLocker l2(d->lock);
10698 int r;
10699
10700 // flush all previous deferred writes on this sequencer. this is a bit
10701 // heavyweight, but we need to make sure all deferred writes complete
10702 // before we split as the new collection's sequencer may need to order
10703 // this after those writes, and we don't bother with the complexity of
10704 // moving those TransContexts over to the new osr.
10705 _osr_drain_preceding(txc);
10706
10707 // move any cached items (onodes and referenced shared blobs) that will
10708 // belong to the child collection post-split. leave everything else behind.
10709 // this may include things that don't strictly belong to the now-smaller
10710 // parent split, but the OSD will always send us a split for every new
10711 // child.
10712
10713 spg_t pgid, dest_pgid;
10714 bool is_pg = c->cid.is_pg(&pgid);
10715 assert(is_pg);
10716 is_pg = d->cid.is_pg(&dest_pgid);
10717 assert(is_pg);
10718
10719 // the destination should initially be empty.
10720 assert(d->onode_map.empty());
10721 assert(d->shared_blob_set.empty());
10722 assert(d->cnode.bits == bits);
10723
10724 c->split_cache(d.get());
10725
10726 // adjust bits. note that this will be redundant for all but the first
10727 // split call for this parent (first child).
10728 c->cnode.bits = bits;
10729 assert(d->cnode.bits == bits);
10730 r = 0;
10731
10732 bufferlist bl;
10733 ::encode(c->cnode, bl);
10734 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
10735
10736 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
10737 << " bits " << bits << " = " << r << dendl;
10738 return r;
10739}
10740
10741// DB key value Histogram
10742#define KEY_SLAB 32
10743#define VALUE_SLAB 64
10744
10745const string prefix_onode = "o";
10746const string prefix_onode_shard = "x";
10747const string prefix_other = "Z";
10748
10749int BlueStore::DBHistogram::get_key_slab(size_t sz)
10750{
10751 return (sz/KEY_SLAB);
10752}
10753
10754string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
10755{
10756 int lower_bound = slab * KEY_SLAB;
10757 int upper_bound = (slab + 1) * KEY_SLAB;
10758 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
10759 return ret;
10760}
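// e.g. (illustrative) a 70-byte key lands in slab 70/KEY_SLAB == 2,
// reported by get_key_slab_to_range() as "[64,96)".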
10761
10762int BlueStore::DBHistogram::get_value_slab(size_t sz)
10763{
10764 return (sz/VALUE_SLAB);
10765}
10766
10767string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
10768{
10769 int lower_bound = slab * VALUE_SLAB;
10770 int upper_bound = (slab + 1) * VALUE_SLAB;
10771 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
10772 return ret;
10773}
10774
10775void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
10776 const string &prefix, size_t key_size, size_t value_size)
10777{
10778 uint32_t key_slab = get_key_slab(key_size);
10779 uint32_t value_slab = get_value_slab(value_size);
10780 key_hist[prefix][key_slab].count++;
10781 key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len);
10782 key_hist[prefix][key_slab].val_map[value_slab].count++;
10783 key_hist[prefix][key_slab].val_map[value_slab].max_len =
10784 MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
10785}
10786
10787void BlueStore::DBHistogram::dump(Formatter *f)
10788{
10789 f->open_object_section("rocksdb_value_distribution");
10790 for (auto i : value_hist) {
10791 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
10792 }
10793 f->close_section();
10794
10795 f->open_object_section("rocksdb_key_value_histogram");
10796 for (auto i : key_hist) {
10797 f->dump_string("prefix", i.first);
10798 f->open_object_section("key_hist");
10799 for ( auto k : i.second) {
10800 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
10801 f->dump_unsigned("max_len", k.second.max_len);
10802 f->open_object_section("value_hist");
10803 for ( auto j : k.second.val_map) {
10804 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
10805 f->dump_unsigned("max_len", j.second.max_len);
10806 }
10807 f->close_section();
10808 }
10809 f->close_section();
10810 }
10811 f->close_section();
10812}
10813
10814// Iterates through the db and collects the stats
10815void BlueStore::generate_db_histogram(Formatter *f)
10816{
10817 //globals
10818 uint64_t num_onodes = 0;
10819 uint64_t num_shards = 0;
10820 uint64_t num_super = 0;
10821 uint64_t num_coll = 0;
10822 uint64_t num_omap = 0;
10823 uint64_t num_deferred = 0;
10824 uint64_t num_alloc = 0;
10825 uint64_t num_stat = 0;
10826 uint64_t num_others = 0;
10827 uint64_t num_shared_shards = 0;
10828  size_t max_key_size = 0, max_value_size = 0;
10829 uint64_t total_key_size = 0, total_value_size = 0;
10830 size_t key_size = 0, value_size = 0;
10831 DBHistogram hist;
10832
10833 utime_t start = ceph_clock_now();
10834
10835 KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
10836 iter->seek_to_first();
10837 while (iter->valid()) {
10838 dout(30) << __func__ << " Key: " << iter->key() << dendl;
10839 key_size = iter->key_size();
10840 value_size = iter->value_size();
10841 hist.value_hist[hist.get_value_slab(value_size)]++;
10842 max_key_size = MAX(max_key_size, key_size);
10843 max_value_size = MAX(max_value_size, value_size);
10844 total_key_size += key_size;
10845 total_value_size += value_size;
10846
10847 pair<string,string> key(iter->raw_key());
10848
10849 if (key.first == PREFIX_SUPER) {
10850 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
10851 num_super++;
10852 } else if (key.first == PREFIX_STAT) {
10853 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
10854 num_stat++;
10855 } else if (key.first == PREFIX_COLL) {
10856 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
10857 num_coll++;
10858 } else if (key.first == PREFIX_OBJ) {
10859 if (key.second.back() == ONODE_KEY_SUFFIX) {
10860 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
10861 num_onodes++;
10862 } else {
10863 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
10864 num_shards++;
10865 }
10866 } else if (key.first == PREFIX_OMAP) {
10867 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
10868 num_omap++;
10869 } else if (key.first == PREFIX_DEFERRED) {
10870 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
10871 num_deferred++;
10872 } else if (key.first == PREFIX_ALLOC || key.first == "b" ) {
10873 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
10874 num_alloc++;
10875 } else if (key.first == PREFIX_SHARED_BLOB) {
10876 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
10877 num_shared_shards++;
10878 } else {
10879 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
10880 num_others++;
10881 }
10882 iter->next();
10883 }
10884
10885 utime_t duration = ceph_clock_now() - start;
10886 f->open_object_section("rocksdb_key_value_stats");
10887 f->dump_unsigned("num_onodes", num_onodes);
10888 f->dump_unsigned("num_shards", num_shards);
10889 f->dump_unsigned("num_super", num_super);
10890 f->dump_unsigned("num_coll", num_coll);
10891 f->dump_unsigned("num_omap", num_omap);
10892 f->dump_unsigned("num_deferred", num_deferred);
10893 f->dump_unsigned("num_alloc", num_alloc);
10894 f->dump_unsigned("num_stat", num_stat);
10895 f->dump_unsigned("num_shared_shards", num_shared_shards);
10896 f->dump_unsigned("num_others", num_others);
10897 f->dump_unsigned("max_key_size", max_key_size);
10898 f->dump_unsigned("max_value_size", max_value_size);
10899 f->dump_unsigned("total_key_size", total_key_size);
10900 f->dump_unsigned("total_value_size", total_value_size);
10901 f->close_section();
10902
10903 hist.dump(f);
10904
10905 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
10906
10907}
10908
10909void BlueStore::flush_cache()
10910{
10911 dout(10) << __func__ << dendl;
10912 for (auto i : cache_shards) {
10913 i->trim_all();
10914 }
10915 for (auto& p : coll_map) {
10916 assert(p.second->onode_map.empty());
10917 assert(p.second->shared_blob_set.empty());
10918 }
10919 coll_map.clear();
10920}
10921
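/*
 * _apply_padding() zero-fills the unaligned edges of a write, e.g.
 * (illustrative) head_pad = 0x1000 and tail_pad = 0x800 around a 0x2000-byte
 * buffer yield a 0x3800-byte padded buffer.
 */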
10922void BlueStore::_apply_padding(uint64_t head_pad,
10923 uint64_t tail_pad,
10924 bufferlist& bl,
10925 bufferlist& padded)
10926{
10927 padded = bl;
10928 if (head_pad) {
10929 bufferlist z;
10930 z.append_zero(head_pad);
10931 z.claim_append(padded);
10932 padded.claim(z);
10933 }
10934 if (tail_pad) {
10935 padded.append_zero(tail_pad);
10936 }
10937 if (head_pad || tail_pad) {
10938 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
10939 << " tail 0x" << tail_pad << std::dec << dendl;
10940 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
10941 }
10942}
10943
10944// ===========================================