1 // vim: ts=8 sw=2 smarttab
2 /*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2014 Red Hat
6 *
7 * This is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License version 2.1, as published by the Free Software
10 * Foundation. See file COPYING.
11 *
12 */
13
14 #include <unistd.h>
15 #include <stdlib.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <fcntl.h>
19
20 #include "include/cpp-btree/btree_set.h"
21
22 #include "BlueStore.h"
23 #include "os/kv.h"
24 #include "include/compat.h"
25 #include "include/intarith.h"
26 #include "include/stringify.h"
27 #include "common/errno.h"
28 #include "common/safe_io.h"
29 #include "Allocator.h"
30 #include "FreelistManager.h"
31 #include "BlueFS.h"
32 #include "BlueRocksEnv.h"
33 #include "auth/Crypto.h"
34 #include "common/EventTrace.h"
35
36 #define dout_context cct
37 #define dout_subsys ceph_subsys_bluestore
38
39 using bid_t = decltype(BlueStore::Blob::id);
40
41 // bluestore_cache_onode
42 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
43 bluestore_cache_onode);
44
45 // bluestore_cache_other
46 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
47 bluestore_cache_other);
48 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
49 bluestore_cache_other);
50 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
51 bluestore_cache_other);
52 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
53 bluestore_cache_other);
54
55 // bluestore_txc
56 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
57 bluestore_txc);
58
59
60 // kv store prefixes
61 const string PREFIX_SUPER = "S"; // field -> value
62 const string PREFIX_STAT = "T"; // field -> value(int64 array)
63 const string PREFIX_COLL = "C"; // collection name -> cnode_t
64 const string PREFIX_OBJ = "O"; // object name -> onode_t
65 const string PREFIX_OMAP = "M"; // u64 + keyname -> value
66 const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
67 const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
68 const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
69
70 // write a label in the first block. always use this size. note that
71 // bluefs makes a matching assumption about the location of its
72 // superblock (always the second block of the device).
73 #define BDEV_LABEL_BLOCK_SIZE 4096
74
75 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
76 #define SUPER_RESERVED 8192
77
78 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
79
80
81 /*
82 * extent map blob encoding
83 *
84 * we use the low bits of the blobid field to indicate some common scenarios
85 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
86 */
87 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
88 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
89 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
90 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
91 #define BLOBID_SHIFT_BITS 4
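// For example (illustrative; assuming encode_some() packs the id as
// (id << BLOBID_SHIFT_BITS) | flags): a spanning blob with id 5 whose extent
// is contiguous with the previous one and has blob_offset 0 would encode its
// blobid field as (5 << 4) | 0x1 | 0x2 | 0x8 = 0x5b.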
92
93 /*
94 * object name key structure
95 *
96 * encoded u8: shard + 2^7 (so that it sorts properly)
97 * encoded u64: poolid + 2^63 (so that it sorts properly)
98 * encoded u32: hash (bit reversed)
99 *
100 * escaped string: namespace
101 *
102 * escaped string: key or object name
103 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
104 * we are done. otherwise, we are followed by the object name.
105 * escaped string: object name (unless '=' above)
106 *
107 * encoded u64: snap
108 * encoded u64: generation
109 * 'o'
110 */
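//
// For example (illustrative): a head object named "foo" with an empty
// namespace and no separate key encodes roughly as
//   <shard+2^7> <pool+2^63> <reversed hash> "!" "foo!" "=" <snap> <gen> 'o'
// (the empty namespace escapes to just its '!' terminator).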
111 #define ONODE_KEY_SUFFIX 'o'
112
113 /*
114 * extent shard key
115 *
116 * object prefix key
117 * u32
118 * 'x'
119 */
120 #define EXTENT_SHARD_KEY_SUFFIX 'x'
121
122 /*
123 * string encoding in the key
124 *
125 * The key string needs to lexicographically sort the same way that
126  * ghobject_t does. We do this by escaping anything <= '#' with # plus
127  * a 2-digit hex string, and anything >= '~' with ~ plus the two hex
128  * digits.
129 *
130 * We use ! as a terminator for strings; this works because it is < #
131 * and will get escaped if it is present in the string.
132 *
133 */
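//
// For example (illustrative, following the rules above):
//   "a#b"  -> "a#23b!"   ('#' is escaped as #23, then the '!' terminator)
//   "x~y"  -> "x~7ey!"   ('~' is escaped as ~7e)
//   "name" -> "name!"    (nothing to escape)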
134 template<typename S>
135 static void append_escaped(const string &in, S *out)
136 {
137 char hexbyte[8];
138 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
139 if (*i <= '#') {
140 snprintf(hexbyte, sizeof(hexbyte), "#%02x", (uint8_t)*i);
141 out->append(hexbyte);
142 } else if (*i >= '~') {
143 snprintf(hexbyte, sizeof(hexbyte), "~%02x", (uint8_t)*i);
144 out->append(hexbyte);
145 } else {
146 out->push_back(*i);
147 }
148 }
149 out->push_back('!');
150 }
151
152 static int decode_escaped(const char *p, string *out)
153 {
154 const char *orig_p = p;
155 while (*p && *p != '!') {
156 if (*p == '#' || *p == '~') {
157 unsigned hex;
158 int r = sscanf(++p, "%2x", &hex);
159 if (r < 1)
160 return -EINVAL;
161 out->push_back((char)hex);
162 p += 2;
163 } else {
164 out->push_back(*p++);
165 }
166 }
167 return p - orig_p;
168 }
169
170 // some things we encode in binary (as le32 or le64); print the
171 // resulting key strings nicely
172 template<typename S>
173 static string pretty_binary_string(const S& in)
174 {
175 char buf[10];
176 string out;
177 out.reserve(in.length() * 3);
178 enum { NONE, HEX, STRING } mode = NONE;
179 unsigned from = 0, i;
180 for (i=0; i < in.length(); ++i) {
181 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
182 (mode == HEX && in.length() - i >= 4 &&
183 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
184 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
185 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
186 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
187 if (mode == STRING) {
188 out.append(in.c_str() + from, i - from);
189 out.push_back('\'');
190 }
191 if (mode != HEX) {
192 out.append("0x");
193 mode = HEX;
194 }
195 if (in.length() - i >= 4) {
196 // print a whole u32 at once
197 snprintf(buf, sizeof(buf), "%08x",
198 (uint32_t)(((unsigned char)in[i] << 24) |
199 ((unsigned char)in[i+1] << 16) |
200 ((unsigned char)in[i+2] << 8) |
201 ((unsigned char)in[i+3] << 0)));
202 i += 3;
203 } else {
204 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
205 }
206 out.append(buf);
207 } else {
208 if (mode != STRING) {
209 out.push_back('\'');
210 mode = STRING;
211 from = i;
212 }
213 }
214 }
215 if (mode == STRING) {
216 out.append(in.c_str() + from, i - from);
217 out.push_back('\'');
218 }
219 return out;
220 }
221
222 template<typename T>
223 static void _key_encode_shard(shard_id_t shard, T *key)
224 {
225 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
226 }
227
228 static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
229 {
230 pshard->id = (uint8_t)*key - (uint8_t)0x80;
231 return key + 1;
232 }
233
234 static void get_coll_key_range(const coll_t& cid, int bits,
235 string *temp_start, string *temp_end,
236 string *start, string *end)
237 {
238 temp_start->clear();
239 temp_end->clear();
240 start->clear();
241 end->clear();
242
243 spg_t pgid;
244 if (cid.is_pg(&pgid)) {
245 _key_encode_shard(pgid.shard, start);
246 *temp_start = *start;
247
248 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
249 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
250
251 *end = *start;
252 *temp_end = *temp_start;
253
254 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
255 _key_encode_u32(reverse_hash, start);
256 _key_encode_u32(reverse_hash, temp_start);
257
258 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
259 if (end_hash > 0xffffffffull)
260 end_hash = 0xffffffffull;
261
262 _key_encode_u32(end_hash, end);
263 _key_encode_u32(end_hash, temp_end);
264 } else {
265 _key_encode_shard(shard_id_t::NO_SHARD, start);
266 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
267 *end = *start;
268 _key_encode_u32(0, start);
269 _key_encode_u32(0xffffffff, end);
270
271 // no separate temp section
272 *temp_start = *end;
273 *temp_end = *end;
274 }
275 }
276
277 static void get_shared_blob_key(uint64_t sbid, string *key)
278 {
279 key->clear();
280 _key_encode_u64(sbid, key);
281 }
282
283 static int get_key_shared_blob(const string& key, uint64_t *sbid)
284 {
285 const char *p = key.c_str();
286 if (key.length() < sizeof(uint64_t))
287 return -1;
288 p = _key_decode_u64(p, sbid);
289 return 0;
290 }
291
292 template<typename S>
293 static int get_key_object(const S& key, ghobject_t *oid)
294 {
295 int r;
296 const char *p = key.c_str();
297
298 if (key.length() < 1 + 8 + 4)
299 return -1;
300 p = _key_decode_shard(p, &oid->shard_id);
301
302 uint64_t pool;
303 p = _key_decode_u64(p, &pool);
304 oid->hobj.pool = pool - 0x8000000000000000ull;
305
306 unsigned hash;
307 p = _key_decode_u32(p, &hash);
308
309 oid->hobj.set_bitwise_key_u32(hash);
310
311 r = decode_escaped(p, &oid->hobj.nspace);
312 if (r < 0)
313 return -2;
314 p += r + 1;
315
316 string k;
317 r = decode_escaped(p, &k);
318 if (r < 0)
319 return -3;
320 p += r + 1;
321 if (*p == '=') {
322 // no key
323 ++p;
324 oid->hobj.oid.name = k;
325 } else if (*p == '<' || *p == '>') {
326 // key + name
327 ++p;
328 r = decode_escaped(p, &oid->hobj.oid.name);
329 if (r < 0)
330 return -5;
331 p += r + 1;
332 oid->hobj.set_key(k);
333 } else {
334 // malformed
335 return -6;
336 }
337
338 p = _key_decode_u64(p, &oid->hobj.snap.val);
339 p = _key_decode_u64(p, &oid->generation);
340
341 if (*p != ONODE_KEY_SUFFIX) {
342 return -7;
343 }
344 p++;
345 if (*p) {
346     // if we get something other than a null terminator here,
347     // something went wrong.
348 return -8;
349 }
350
351 return 0;
352 }
353
354 template<typename S>
355 static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
356 {
357 key->clear();
358
359 size_t max_len = 1 + 8 + 4 +
360 (oid.hobj.nspace.length() * 3 + 1) +
361 (oid.hobj.get_key().length() * 3 + 1) +
362 1 + // for '<', '=', or '>'
363 (oid.hobj.oid.name.length() * 3 + 1) +
364 8 + 8 + 1;
365 key->reserve(max_len);
366
367 _key_encode_shard(oid.shard_id, key);
368 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
369 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
370
371 append_escaped(oid.hobj.nspace, key);
372
373 if (oid.hobj.get_key().length()) {
374 // is a key... could be < = or >.
375 append_escaped(oid.hobj.get_key(), key);
376 // (ASCII chars < = and > sort in that order, yay)
377 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
378 if (r) {
379 key->append(r > 0 ? ">" : "<");
380 append_escaped(oid.hobj.oid.name, key);
381 } else {
382 // same as no key
383 key->append("=");
384 }
385 } else {
386 // no key
387 append_escaped(oid.hobj.oid.name, key);
388 key->append("=");
389 }
390
391 _key_encode_u64(oid.hobj.snap, key);
392 _key_encode_u64(oid.generation, key);
393
394 key->push_back(ONODE_KEY_SUFFIX);
395
396 // sanity check
397 if (true) {
398 ghobject_t t;
399 int r = get_key_object(*key, &t);
400 if (r || t != oid) {
401 derr << " r " << r << dendl;
402 derr << "key " << pretty_binary_string(*key) << dendl;
403 derr << "oid " << oid << dendl;
404 derr << " t " << t << dendl;
405 assert(r == 0 && t == oid);
406 }
407 }
408 }
409
410
411 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
412 // char lets us quickly test whether it is a shard key without decoding any
413 // of the prefix bytes.
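//
// For example (illustrative): given an onode key K, the shard covering logical
// offset 0x30000 lives at K + encode_u32(0x30000) + 'x', and a key can be
// recognized as a shard key just by checking that trailing 'x'.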
414 template<typename S>
415 static void get_extent_shard_key(const S& onode_key, uint32_t offset,
416 string *key)
417 {
418 key->clear();
419 key->reserve(onode_key.length() + 4 + 1);
420 key->append(onode_key.c_str(), onode_key.size());
421 _key_encode_u32(offset, key);
422 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
423 }
424
425 static void rewrite_extent_shard_key(uint32_t offset, string *key)
426 {
427 assert(key->size() > sizeof(uint32_t) + 1);
428 assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
429 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
430 }
431
432 template<typename S>
433 static void generate_extent_shard_key_and_apply(
434 const S& onode_key,
435 uint32_t offset,
436 string *key,
437 std::function<void(const string& final_key)> apply)
438 {
439 if (key->empty()) { // make full key
440 assert(!onode_key.empty());
441 get_extent_shard_key(onode_key, offset, key);
442 } else {
443 rewrite_extent_shard_key(offset, key);
444 }
445 apply(*key);
446 }
447
448 int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
449 {
450 assert(key.size() > sizeof(uint32_t) + 1);
451 assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
452 int okey_len = key.size() - sizeof(uint32_t) - 1;
453 *onode_key = key.substr(0, okey_len);
454 const char *p = key.data() + okey_len;
455 p = _key_decode_u32(p, offset);
456 return 0;
457 }
458
459 static bool is_extent_shard_key(const string& key)
460 {
461 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
462 }
463
464 // '-' < '.' < '~'
465 static void get_omap_header(uint64_t id, string *out)
466 {
467 _key_encode_u64(id, out);
468 out->push_back('-');
469 }
470
471 // hmm, I don't think there's any need to escape the user key since we
472 // have a clean prefix.
473 static void get_omap_key(uint64_t id, const string& key, string *out)
474 {
475 _key_encode_u64(id, out);
476 out->push_back('.');
477 out->append(key);
478 }
479
480 static void rewrite_omap_key(uint64_t id, string old, string *out)
481 {
482 _key_encode_u64(id, out);
483 out->append(old.c_str() + out->length(), old.size() - out->length());
484 }
485
486 static void decode_omap_key(const string& key, string *user_key)
487 {
488 *user_key = key.substr(sizeof(uint64_t) + 1);
489 }
490
491 static void get_omap_tail(uint64_t id, string *out)
492 {
493 _key_encode_u64(id, out);
494 out->push_back('~');
495 }
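
// So, for a given omap id, the header sorts at <id>'-', user keys at
// <id>'.'<key>, and <id>'~' sorts after all of them; the key range
// [<id>'-', <id>'~'] therefore spans the header plus every user key in order.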
496
497 static void get_deferred_key(uint64_t seq, string *out)
498 {
499 _key_encode_u64(seq, out);
500 }
501
502
503 // merge operators
504
505 struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
506 void merge_nonexistent(
507 const char *rdata, size_t rlen, std::string *new_value) override {
508 *new_value = std::string(rdata, rlen);
509 }
510 void merge(
511 const char *ldata, size_t llen,
512 const char *rdata, size_t rlen,
513 std::string *new_value) override {
514 assert(llen == rlen);
515 assert((rlen % 8) == 0);
516 new_value->resize(rlen);
517 const __le64* lv = (const __le64*)ldata;
518 const __le64* rv = (const __le64*)rdata;
519 __le64* nv = &(__le64&)new_value->at(0);
520 for (size_t i = 0; i < rlen >> 3; ++i) {
521 nv[i] = lv[i] + rv[i];
522 }
523 }
524 // We use each operator name and each prefix to construct the
525 // overall RocksDB operator name for consistency check at open time.
526 string name() const override {
527 return "int64_array";
528 }
529 };
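
// For example (illustrative): merging an existing value encoding the two
// little-endian int64s {5, 7} with an operand encoding {1, 2} yields {6, 9};
// each 8-byte slot is summed independently.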
530
531
532 // Buffer
533
534 ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
535 {
536 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
537 << b.offset << "~" << b.length << std::dec
538 << " " << BlueStore::Buffer::get_state_name(b.state);
539 if (b.flags)
540 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
541 return out << ")";
542 }
543
544 // Garbage Collector
545
546 void BlueStore::GarbageCollector::process_protrusive_extents(
547 const BlueStore::ExtentMap& extent_map,
548 uint64_t start_offset,
549 uint64_t end_offset,
550 uint64_t start_touch_offset,
551 uint64_t end_touch_offset,
552 uint64_t min_alloc_size)
553 {
554   assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
555
556 uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
557 uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);
558
559 dout(30) << __func__ << " (hex): [" << std::hex
560 << lookup_start_offset << ", " << lookup_end_offset
561 << ")" << std::dec << dendl;
562
563 for (auto it = extent_map.seek_lextent(lookup_start_offset);
564 it != extent_map.extent_map.end() &&
565 it->logical_offset < lookup_end_offset;
566 ++it) {
567 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
568 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
569
570 dout(30) << __func__ << " " << *it
571 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
572 << dendl;
573
574 Blob* b = it->blob.get();
575
576     if (it->logical_offset >= start_touch_offset &&
577 it->logical_end() <= end_touch_offset) {
578 // Process extents within the range affected by
579 // the current write request.
580 // Need to take into account if existing extents
581 // can be merged with them (uncompressed case)
582 if (!b->get_blob().is_compressed()) {
583 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
584 --blob_info_counted->expected_allocations; // don't need to allocate
585 // new AU for compressed
586 // data since another
587 // collocated uncompressed
588 // blob already exists
589 dout(30) << __func__ << " --expected:"
590 << alloc_unit_start << dendl;
591 }
592 used_alloc_unit = alloc_unit_end;
593 blob_info_counted = nullptr;
594 }
595 } else if (b->get_blob().is_compressed()) {
596
597       // we also take into account compressed blobs that were not
598       // impacted by the write
599 BlobInfo& bi =
600 affected_blobs.emplace(
601 b, BlobInfo(b->get_referenced_bytes())).first->second;
602
603 int adjust =
604 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
605 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
606 dout(30) << __func__ << " expected_allocations="
607 << bi.expected_allocations << " end_au:"
608 << alloc_unit_end << dendl;
609
610 blob_info_counted = &bi;
611 used_alloc_unit = alloc_unit_end;
612
613 assert(it->length <= bi.referenced_bytes);
614 bi.referenced_bytes -= it->length;
615 dout(30) << __func__ << " affected_blob:" << *b
616 << " unref 0x" << std::hex << it->length
617 << " referenced = 0x" << bi.referenced_bytes
618 << std::dec << dendl;
619       // NOTE: we can't move a specific blob to the resulting GC list here
620       // when its reference counter reaches 0, since subsequent extents might
621       // decrement its expected_allocations.
622       // Hence we need to enumerate all the extents first.
623 if (!bi.collect_candidate) {
624 bi.first_lextent = it;
625 bi.collect_candidate = true;
626 }
627 bi.last_lextent = it;
628 } else {
629 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
630 // don't need to allocate new AU for compressed data since another
631 // collocated uncompressed blob already exists
632 --blob_info_counted->expected_allocations;
633 dout(30) << __func__ << " --expected_allocations:"
634 << alloc_unit_start << dendl;
635 }
636 used_alloc_unit = alloc_unit_end;
637 blob_info_counted = nullptr;
638 }
639 }
640
641 for (auto b_it = affected_blobs.begin();
642 b_it != affected_blobs.end();
643 ++b_it) {
644 Blob* b = b_it->first;
645 BlobInfo& bi = b_it->second;
646 if (bi.referenced_bytes == 0) {
647 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
648 int64_t blob_expected_for_release =
649 ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;
650
651 dout(30) << __func__ << " " << *(b_it->first)
652 << " expected4release=" << blob_expected_for_release
653 << " expected_allocations=" << bi.expected_allocations
654 << dendl;
655 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
656 if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
657 if (bi.collect_candidate) {
658 auto it = bi.first_lextent;
659 bool bExit = false;
660 do {
661 if (it->blob.get() == b) {
662 extents_to_collect.emplace_back(it->logical_offset, it->length);
663 }
664 bExit = it == bi.last_lextent;
665 ++it;
666 } while (!bExit);
667 }
668 expected_for_release += blob_expected_for_release;
669 expected_allocations += bi.expected_allocations;
670 }
671 }
672 }
673 }
674
675 int64_t BlueStore::GarbageCollector::estimate(
676 uint64_t start_offset,
677 uint64_t length,
678 const BlueStore::ExtentMap& extent_map,
679 const BlueStore::old_extent_map_t& old_extents,
680 uint64_t min_alloc_size)
681 {
682
683 affected_blobs.clear();
684 extents_to_collect.clear();
685 used_alloc_unit = boost::optional<uint64_t >();
686 blob_info_counted = nullptr;
687
688 gc_start_offset = start_offset;
689 gc_end_offset = start_offset + length;
690
691 uint64_t end_offset = start_offset + length;
692
693 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
694 Blob* b = it->e.blob.get();
695 if (b->get_blob().is_compressed()) {
696
697 // update gc_start_offset/gc_end_offset if needed
698 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
699 gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());
700
701 auto o = it->e.logical_offset;
702 auto l = it->e.length;
703
704 uint64_t ref_bytes = b->get_referenced_bytes();
705 // micro optimization to bypass blobs that have no more references
706 if (ref_bytes != 0) {
707 dout(30) << __func__ << " affected_blob:" << *b
708 << " unref 0x" << std::hex << o << "~" << l
709 << std::dec << dendl;
710 affected_blobs.emplace(b, BlobInfo(ref_bytes));
711 }
712 }
713 }
714 dout(30) << __func__ << " gc range(hex): [" << std::hex
715 << gc_start_offset << ", " << gc_end_offset
716 << ")" << std::dec << dendl;
717
718   // enumerate preceding extents to check if they reference affected blobs
719 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
720 process_protrusive_extents(extent_map,
721 gc_start_offset,
722 gc_end_offset,
723 start_offset,
724 end_offset,
725 min_alloc_size);
726 }
727 return expected_for_release - expected_allocations;
728 }
729
730 // Cache
731
732 BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
733 PerfCounters *logger)
734 {
735 Cache *c = nullptr;
736
737 if (type == "lru")
738 c = new LRUCache(cct);
739 else if (type == "2q")
740 c = new TwoQCache(cct);
741 else
742 assert(0 == "unrecognized cache type");
743
744 c->logger = logger;
745 return c;
746 }
747
748 void BlueStore::Cache::trim_all()
749 {
750 std::lock_guard<std::recursive_mutex> l(lock);
751 _trim(0, 0);
752 }
753
754 void BlueStore::Cache::trim(
755 uint64_t target_bytes,
756 float target_meta_ratio,
757 float target_data_ratio,
758 float bytes_per_onode)
759 {
760 std::lock_guard<std::recursive_mutex> l(lock);
761 uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
762 uint64_t current_buffer = _get_buffer_bytes();
763 uint64_t current = current_meta + current_buffer;
764
765 uint64_t target_meta = target_bytes * target_meta_ratio;
766 uint64_t target_buffer = target_bytes * target_data_ratio;
767
768 // correct for overflow or float imprecision
769 target_meta = min(target_bytes, target_meta);
770 target_buffer = min(target_bytes - target_meta, target_buffer);
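// e.g. (illustrative): with target_bytes = 1 GiB, target_meta_ratio = 0.25 and
// target_data_ratio = 0.75 we aim for ~256 MiB of onode metadata and ~768 MiB
// of buffered data, clamped so the two together never exceed target_bytes.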
771
772 if (current <= target_bytes) {
773 dout(10) << __func__
774 << " shard target " << pretty_si_t(target_bytes)
775 << " meta/data ratios " << target_meta_ratio
776 << " + " << target_data_ratio << " ("
777 << pretty_si_t(target_meta) << " + "
778 << pretty_si_t(target_buffer) << "), "
779 << " current " << pretty_si_t(current) << " ("
780 << pretty_si_t(current_meta) << " + "
781 << pretty_si_t(current_buffer) << ")"
782 << dendl;
783 return;
784 }
785
786 uint64_t need_to_free = current - target_bytes;
787 uint64_t free_buffer = 0;
788 uint64_t free_meta = 0;
789 if (current_buffer > target_buffer) {
790 free_buffer = current_buffer - target_buffer;
791 if (free_buffer > need_to_free) {
792 free_buffer = need_to_free;
793 }
794 }
795 free_meta = need_to_free - free_buffer;
796
797 // start bounds at what we have now
798 uint64_t max_buffer = current_buffer - free_buffer;
799 uint64_t max_meta = current_meta - free_meta;
800 uint64_t max_onodes = max_meta / bytes_per_onode;
801
802 dout(10) << __func__
803 << " shard target " << pretty_si_t(target_bytes)
804 << " ratio " << target_meta_ratio << " ("
805 << pretty_si_t(target_meta) << " + "
806 << pretty_si_t(target_buffer) << "), "
807 << " current " << pretty_si_t(current) << " ("
808 << pretty_si_t(current_meta) << " + "
809 << pretty_si_t(current_buffer) << "),"
810 << " need_to_free " << pretty_si_t(need_to_free) << " ("
811 << pretty_si_t(free_meta) << " + "
812 << pretty_si_t(free_buffer) << ")"
813 << " -> max " << max_onodes << " onodes + "
814 << max_buffer << " buffer"
815 << dendl;
816 _trim(max_onodes, max_buffer);
817 }
818
819
820 // LRUCache
821 #undef dout_prefix
822 #define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
823
824 void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
825 {
826 auto p = onode_lru.iterator_to(*o);
827 onode_lru.erase(p);
828 onode_lru.push_front(*o);
829 }
830
831 void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
832 {
833 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
834 << " buffers " << buffer_size << " / " << buffer_max
835 << dendl;
836
837 _audit("trim start");
838
839 // buffers
840 while (buffer_size > buffer_max) {
841 auto i = buffer_lru.rbegin();
842 if (i == buffer_lru.rend()) {
843 // stop if buffer_lru is now empty
844 break;
845 }
846
847 Buffer *b = &*i;
848 assert(b->is_clean());
849 dout(20) << __func__ << " rm " << *b << dendl;
850 b->space->_rm_buffer(this, b);
851 }
852
853 // onodes
854 int num = onode_lru.size() - onode_max;
855 if (num <= 0)
856 return; // don't even try
857
858 auto p = onode_lru.end();
859 assert(p != onode_lru.begin());
860 --p;
861 int skipped = 0;
862 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
863 while (num > 0) {
864 Onode *o = &*p;
865 int refs = o->nref.load();
866 if (refs > 1) {
867 dout(20) << __func__ << " " << o->oid << " has " << refs
868 << " refs, skipping" << dendl;
869 if (++skipped >= max_skipped) {
870 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
871 << num << " left to trim" << dendl;
872 break;
873 }
874
875 if (p == onode_lru.begin()) {
876 break;
877 } else {
878 p--;
879 num--;
880 continue;
881 }
882 }
883 dout(30) << __func__ << " rm " << o->oid << dendl;
884 if (p != onode_lru.begin()) {
885 onode_lru.erase(p--);
886 } else {
887 onode_lru.erase(p);
888 assert(num == 1);
889 }
890 o->get(); // paranoia
891 o->c->onode_map.remove(o->oid);
892 o->put();
893 --num;
894 }
895 }
896
897 #ifdef DEBUG_CACHE
898 void BlueStore::LRUCache::_audit(const char *when)
899 {
900 dout(10) << __func__ << " " << when << " start" << dendl;
901 uint64_t s = 0;
902 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
903 s += i->length;
904 }
905 if (s != buffer_size) {
906 derr << __func__ << " buffer_size " << buffer_size << " actual " << s
907 << dendl;
908 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
909 derr << __func__ << " " << *i << dendl;
910 }
911 assert(s == buffer_size);
912 }
913 dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
914 << " ok" << dendl;
915 }
916 #endif
917
918 // TwoQCache
919 #undef dout_prefix
920 #define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
921
922
923 void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
924 {
925 auto p = onode_lru.iterator_to(*o);
926 onode_lru.erase(p);
927 onode_lru.push_front(*o);
928 }
929
930 void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
931 {
932 dout(20) << __func__ << " level " << level << " near " << near
933 << " on " << *b
934 << " which has cache_private " << b->cache_private << dendl;
935 if (near) {
936 b->cache_private = near->cache_private;
937 switch (b->cache_private) {
938 case BUFFER_WARM_IN:
939 buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
940 break;
941 case BUFFER_WARM_OUT:
942 assert(b->is_empty());
943 buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
944 break;
945 case BUFFER_HOT:
946 buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
947 break;
948 default:
949 assert(0 == "bad cache_private");
950 }
951 } else if (b->cache_private == BUFFER_NEW) {
952 b->cache_private = BUFFER_WARM_IN;
953 if (level > 0) {
954 buffer_warm_in.push_front(*b);
955 } else {
956 // take caller hint to start at the back of the warm queue
957 buffer_warm_in.push_back(*b);
958 }
959 } else {
960 // we got a hint from discard
961 switch (b->cache_private) {
962 case BUFFER_WARM_IN:
963 // stay in warm_in. move to front, even though 2Q doesn't actually
964 // do this.
965 dout(20) << __func__ << " move to front of warm " << *b << dendl;
966 buffer_warm_in.push_front(*b);
967 break;
968 case BUFFER_WARM_OUT:
969 b->cache_private = BUFFER_HOT;
970 // move to hot. fall-thru
971 case BUFFER_HOT:
972 dout(20) << __func__ << " move to front of hot " << *b << dendl;
973 buffer_hot.push_front(*b);
974 break;
975 default:
976 assert(0 == "bad cache_private");
977 }
978 }
979 if (!b->is_empty()) {
980 buffer_bytes += b->length;
981 buffer_list_bytes[b->cache_private] += b->length;
982 }
983 }
984
985 void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
986 {
987 dout(20) << __func__ << " " << *b << dendl;
988 if (!b->is_empty()) {
989 assert(buffer_bytes >= b->length);
990 buffer_bytes -= b->length;
991 assert(buffer_list_bytes[b->cache_private] >= b->length);
992 buffer_list_bytes[b->cache_private] -= b->length;
993 }
994 switch (b->cache_private) {
995 case BUFFER_WARM_IN:
996 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
997 break;
998 case BUFFER_WARM_OUT:
999 buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
1000 break;
1001 case BUFFER_HOT:
1002 buffer_hot.erase(buffer_hot.iterator_to(*b));
1003 break;
1004 default:
1005 assert(0 == "bad cache_private");
1006 }
1007 }
1008
1009 void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
1010 {
1011 TwoQCache *src = static_cast<TwoQCache*>(srcc);
1012 src->_rm_buffer(b);
1013
1014 // preserve which list we're on (even if we can't preserve the order!)
1015 switch (b->cache_private) {
1016 case BUFFER_WARM_IN:
1017 assert(!b->is_empty());
1018 buffer_warm_in.push_back(*b);
1019 break;
1020 case BUFFER_WARM_OUT:
1021 assert(b->is_empty());
1022 buffer_warm_out.push_back(*b);
1023 break;
1024 case BUFFER_HOT:
1025 assert(!b->is_empty());
1026 buffer_hot.push_back(*b);
1027 break;
1028 default:
1029 assert(0 == "bad cache_private");
1030 }
1031 if (!b->is_empty()) {
1032 buffer_bytes += b->length;
1033 buffer_list_bytes[b->cache_private] += b->length;
1034 }
1035 }
1036
1037 void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
1038 {
1039 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1040 if (!b->is_empty()) {
1041 assert((int64_t)buffer_bytes + delta >= 0);
1042 buffer_bytes += delta;
1043 assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
1044 buffer_list_bytes[b->cache_private] += delta;
1045 }
1046 }
1047
1048 void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
1049 {
1050 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
1051 << " buffers " << buffer_bytes << " / " << buffer_max
1052 << dendl;
1053
1054 _audit("trim start");
1055
1056 // buffers
1057 if (buffer_bytes > buffer_max) {
1058 uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
1059 uint64_t khot = buffer_max - kin;
1060
1061     // pre-calculate kout based on the average buffer size too,
1062     // which is typical (the warm_in and hot lists may change later)
1063 uint64_t kout = 0;
1064 uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
1065 if (buffer_num) {
1066 uint64_t buffer_avg_size = buffer_bytes / buffer_num;
1067 assert(buffer_avg_size);
1068 uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
1069 kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1070 }
1071
1072 if (buffer_list_bytes[BUFFER_HOT] < khot) {
1073 // hot is small, give slack to warm_in
1074 kin += khot - buffer_list_bytes[BUFFER_HOT];
1075 } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
1076 // warm_in is small, give slack to hot
1077 khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
1078 }
1079
1080 // adjust warm_in list
1081 int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
1082 uint64_t evicted = 0;
1083
1084 while (to_evict_bytes > 0) {
1085 auto p = buffer_warm_in.rbegin();
1086 if (p == buffer_warm_in.rend()) {
1087 // stop if warm_in list is now empty
1088 break;
1089 }
1090
1091 Buffer *b = &*p;
1092 assert(b->is_clean());
1093 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1094 assert(buffer_bytes >= b->length);
1095 buffer_bytes -= b->length;
1096 assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
1097 buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
1098 to_evict_bytes -= b->length;
1099 evicted += b->length;
1100 b->state = Buffer::STATE_EMPTY;
1101 b->data.clear();
1102 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1103 buffer_warm_out.push_front(*b);
1104 b->cache_private = BUFFER_WARM_OUT;
1105 }
1106
1107 if (evicted > 0) {
1108 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1109 << " from warm_in list, done evicting warm_in buffers"
1110 << dendl;
1111 }
1112
1113 // adjust hot list
1114 to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
1115 evicted = 0;
1116
1117 while (to_evict_bytes > 0) {
1118 auto p = buffer_hot.rbegin();
1119 if (p == buffer_hot.rend()) {
1120 // stop if hot list is now empty
1121 break;
1122 }
1123
1124 Buffer *b = &*p;
1125 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1126 assert(b->is_clean());
1127 // adjust evict size before buffer goes invalid
1128 to_evict_bytes -= b->length;
1129 evicted += b->length;
1130 b->space->_rm_buffer(this, b);
1131 }
1132
1133 if (evicted > 0) {
1134 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1135 << " from hot list, done evicting hot buffers"
1136 << dendl;
1137 }
1138
1139 // adjust warm out list too, if necessary
1140 int64_t num = buffer_warm_out.size() - kout;
1141 while (num-- > 0) {
1142 Buffer *b = &*buffer_warm_out.rbegin();
1143 assert(b->is_empty());
1144 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1145 b->space->_rm_buffer(this, b);
1146 }
1147 }
1148
1149 // onodes
1150 int num = onode_lru.size() - onode_max;
1151 if (num <= 0)
1152 return; // don't even try
1153
1154 auto p = onode_lru.end();
1155 assert(p != onode_lru.begin());
1156 --p;
1157 int skipped = 0;
1158 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
1159 while (num > 0) {
1160 Onode *o = &*p;
1161 dout(20) << __func__ << " considering " << o << dendl;
1162 int refs = o->nref.load();
1163 if (refs > 1) {
1164 dout(20) << __func__ << " " << o->oid << " has " << refs
1165 << " refs; skipping" << dendl;
1166 if (++skipped >= max_skipped) {
1167 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
1168 << num << " left to trim" << dendl;
1169 break;
1170 }
1171
1172 if (p == onode_lru.begin()) {
1173 break;
1174 } else {
1175 p--;
1176 num--;
1177 continue;
1178 }
1179 }
1180     dout(30) << __func__ << " " << o->oid << " num=" << num << " lru size=" << onode_lru.size() << dendl;
1181 if (p != onode_lru.begin()) {
1182 onode_lru.erase(p--);
1183 } else {
1184 onode_lru.erase(p);
1185 assert(num == 1);
1186 }
1187 o->get(); // paranoia
1188 o->c->onode_map.remove(o->oid);
1189 o->put();
1190 --num;
1191 }
1192 }
1193
1194 #ifdef DEBUG_CACHE
1195 void BlueStore::TwoQCache::_audit(const char *when)
1196 {
1197 dout(10) << __func__ << " " << when << " start" << dendl;
1198 uint64_t s = 0;
1199 for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
1200 s += i->length;
1201 }
1202
1203 uint64_t hot_bytes = s;
1204 if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
1205 derr << __func__ << " hot_list_bytes "
1206 << buffer_list_bytes[BUFFER_HOT]
1207 << " != actual " << hot_bytes
1208 << dendl;
1209 assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
1210 }
1211
1212 for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
1213 s += i->length;
1214 }
1215
1216 uint64_t warm_in_bytes = s - hot_bytes;
1217 if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
1218 derr << __func__ << " warm_in_list_bytes "
1219 << buffer_list_bytes[BUFFER_WARM_IN]
1220 << " != actual " << warm_in_bytes
1221 << dendl;
1222 assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
1223 }
1224
1225 if (s != buffer_bytes) {
1226 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1227 << dendl;
1228 assert(s == buffer_bytes);
1229 }
1230
1231 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1232 << " ok" << dendl;
1233 }
1234 #endif
1235
1236
1237 // BufferSpace
1238
1239 #undef dout_prefix
1240 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1241
1242 void BlueStore::BufferSpace::_clear(Cache* cache)
1243 {
1244 // note: we already hold cache->lock
1245 ldout(cache->cct, 20) << __func__ << dendl;
1246 while (!buffer_map.empty()) {
1247 _rm_buffer(cache, buffer_map.begin());
1248 }
1249 }
1250
1251 int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
1252 {
1253 // note: we already hold cache->lock
1254 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1255 << std::dec << dendl;
1256 int cache_private = 0;
1257 cache->_audit("discard start");
1258 auto i = _data_lower_bound(offset);
1259 uint32_t end = offset + length;
1260 while (i != buffer_map.end()) {
1261 Buffer *b = i->second.get();
1262 if (b->offset >= end) {
1263 break;
1264 }
1265 if (b->cache_private > cache_private) {
1266 cache_private = b->cache_private;
1267 }
1268 if (b->offset < offset) {
1269 int64_t front = offset - b->offset;
1270 if (b->end() > end) {
1271 // drop middle (split)
1272 uint32_t tail = b->end() - end;
1273 if (b->data.length()) {
1274 bufferlist bl;
1275 bl.substr_of(b->data, b->length - tail, tail);
1276 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1277 nb->maybe_rebuild();
1278 _add_buffer(cache, nb, 0, b);
1279 } else {
1280 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1281 0, b);
1282 }
1283 if (!b->is_writing()) {
1284 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1285 }
1286 b->truncate(front);
1287 b->maybe_rebuild();
1288 cache->_audit("discard end 1");
1289 break;
1290 } else {
1291 // drop tail
1292 if (!b->is_writing()) {
1293 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1294 }
1295 b->truncate(front);
1296 b->maybe_rebuild();
1297 ++i;
1298 continue;
1299 }
1300 }
1301 if (b->end() <= end) {
1302 // drop entire buffer
1303 _rm_buffer(cache, i++);
1304 continue;
1305 }
1306 // drop front
1307 uint32_t keep = b->end() - end;
1308 if (b->data.length()) {
1309 bufferlist bl;
1310 bl.substr_of(b->data, b->length - keep, keep);
1311 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1312 nb->maybe_rebuild();
1313 _add_buffer(cache, nb, 0, b);
1314 } else {
1315 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1316 }
1317 _rm_buffer(cache, i);
1318 cache->_audit("discard end 2");
1319 break;
1320 }
1321 return cache_private;
1322 }
1323
1324 void BlueStore::BufferSpace::read(
1325 Cache* cache,
1326 uint32_t offset, uint32_t length,
1327 BlueStore::ready_regions_t& res,
1328 interval_set<uint32_t>& res_intervals)
1329 {
1330 std::lock_guard<std::recursive_mutex> l(cache->lock);
1331 res.clear();
1332 res_intervals.clear();
1333 uint32_t want_bytes = length;
1334 uint32_t end = offset + length;
1335 for (auto i = _data_lower_bound(offset);
1336 i != buffer_map.end() && offset < end && i->first < end;
1337 ++i) {
1338 Buffer *b = i->second.get();
1339 assert(b->end() > offset);
1340 if (b->is_writing() || b->is_clean()) {
1341 if (b->offset < offset) {
1342 uint32_t skip = offset - b->offset;
1343 uint32_t l = MIN(length, b->length - skip);
1344 res[offset].substr_of(b->data, skip, l);
1345 res_intervals.insert(offset, l);
1346 offset += l;
1347 length -= l;
1348 if (!b->is_writing()) {
1349 cache->_touch_buffer(b);
1350 }
1351 continue;
1352 }
1353 if (b->offset > offset) {
1354 uint32_t gap = b->offset - offset;
1355 if (length <= gap) {
1356 break;
1357 }
1358 offset += gap;
1359 length -= gap;
1360 }
1361 if (!b->is_writing()) {
1362 cache->_touch_buffer(b);
1363 }
1364 if (b->length > length) {
1365 res[offset].substr_of(b->data, 0, length);
1366 res_intervals.insert(offset, length);
1367 break;
1368 } else {
1369 res[offset].append(b->data);
1370 res_intervals.insert(offset, b->length);
1371 if (b->length == length)
1372 break;
1373 offset += b->length;
1374 length -= b->length;
1375 }
1376 }
1377 }
1378
1379 uint64_t hit_bytes = res_intervals.size();
1380 assert(hit_bytes <= want_bytes);
1381 uint64_t miss_bytes = want_bytes - hit_bytes;
1382 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1383 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1384 }
1385
1386 void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
1387 {
1388 std::lock_guard<std::recursive_mutex> l(cache->lock);
1389
1390 auto i = writing.begin();
1391 while (i != writing.end()) {
1392 if (i->seq > seq) {
1393 break;
1394 }
1395 if (i->seq < seq) {
1396 ++i;
1397 continue;
1398 }
1399
1400 Buffer *b = &*i;
1401 assert(b->is_writing());
1402
1403 if (b->flags & Buffer::FLAG_NOCACHE) {
1404 writing.erase(i++);
1405 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1406 buffer_map.erase(b->offset);
1407 } else {
1408 b->state = Buffer::STATE_CLEAN;
1409 writing.erase(i++);
1410 b->maybe_rebuild();
1411 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1412 cache->_add_buffer(b, 1, nullptr);
1413 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1414 }
1415 }
1416
1417 cache->_audit("finish_write end");
1418 }
1419
1420 void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1421 {
1422 std::lock_guard<std::recursive_mutex> lk(cache->lock);
1423 if (buffer_map.empty())
1424 return;
1425
1426 auto p = --buffer_map.end();
1427 while (true) {
1428 if (p->second->end() <= pos)
1429 break;
1430
1431 if (p->second->offset < pos) {
1432 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1433 size_t left = pos - p->second->offset;
1434 size_t right = p->second->length - left;
1435 if (p->second->data.length()) {
1436 bufferlist bl;
1437 bl.substr_of(p->second->data, left, right);
1438 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1439 0, p->second.get());
1440 } else {
1441 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1442 0, p->second.get());
1443 }
1444 cache->_adjust_buffer_size(p->second.get(), -right);
1445 p->second->truncate(left);
1446 break;
1447 }
1448
1449 assert(p->second->end() > pos);
1450 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1451 if (p->second->data.length()) {
1452 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1453 p->second->offset - pos, p->second->data),
1454 0, p->second.get());
1455 } else {
1456 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1457 p->second->offset - pos, p->second->length),
1458 0, p->second.get());
1459 }
1460 if (p == buffer_map.begin()) {
1461 _rm_buffer(cache, p);
1462 break;
1463 } else {
1464 _rm_buffer(cache, p--);
1465 }
1466 }
1467 assert(writing.empty());
1468 }
1469
1470 // OnodeSpace
1471
1472 #undef dout_prefix
1473 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1474
1475 BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1476 {
1477 std::lock_guard<std::recursive_mutex> l(cache->lock);
1478 auto p = onode_map.find(oid);
1479 if (p != onode_map.end()) {
1480 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1481 << " raced, returning existing " << p->second
1482 << dendl;
1483 return p->second;
1484 }
1485 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1486 onode_map[oid] = o;
1487 cache->_add_onode(o, 1);
1488 return o;
1489 }
1490
1491 BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1492 {
1493 std::lock_guard<std::recursive_mutex> l(cache->lock);
1494 ldout(cache->cct, 30) << __func__ << dendl;
1495 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1496 if (p == onode_map.end()) {
1497 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1498 cache->logger->inc(l_bluestore_onode_misses);
1499 return OnodeRef();
1500 }
1501 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1502 << dendl;
1503 cache->_touch_onode(p->second);
1504 cache->logger->inc(l_bluestore_onode_hits);
1505 return p->second;
1506 }
1507
1508 void BlueStore::OnodeSpace::clear()
1509 {
1510 std::lock_guard<std::recursive_mutex> l(cache->lock);
1511 ldout(cache->cct, 10) << __func__ << dendl;
1512 for (auto &p : onode_map) {
1513 cache->_rm_onode(p.second);
1514 }
1515 onode_map.clear();
1516 }
1517
1518 bool BlueStore::OnodeSpace::empty()
1519 {
1520 std::lock_guard<std::recursive_mutex> l(cache->lock);
1521 return onode_map.empty();
1522 }
1523
1524 void BlueStore::OnodeSpace::rename(
1525 OnodeRef& oldo,
1526 const ghobject_t& old_oid,
1527 const ghobject_t& new_oid,
1528 const mempool::bluestore_cache_other::string& new_okey)
1529 {
1530 std::lock_guard<std::recursive_mutex> l(cache->lock);
1531 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1532 << dendl;
1533 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1534 po = onode_map.find(old_oid);
1535 pn = onode_map.find(new_oid);
1536 assert(po != pn);
1537
1538 assert(po != onode_map.end());
1539 if (pn != onode_map.end()) {
1540 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1541 << dendl;
1542 cache->_rm_onode(pn->second);
1543 onode_map.erase(pn);
1544 }
1545 OnodeRef o = po->second;
1546
1547 // install a non-existent onode at old location
1548 oldo.reset(new Onode(o->c, old_oid, o->key));
1549 po->second = oldo;
1550 cache->_add_onode(po->second, 1);
1551
1552 // add at new position and fix oid, key
1553 onode_map.insert(make_pair(new_oid, o));
1554 cache->_touch_onode(o);
1555 o->oid = new_oid;
1556 o->key = new_okey;
1557 }
1558
1559 bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1560 {
1561 std::lock_guard<std::recursive_mutex> l(cache->lock);
1562 ldout(cache->cct, 20) << __func__ << dendl;
1563 for (auto& i : onode_map) {
1564 if (f(i.second)) {
1565 return true;
1566 }
1567 }
1568 return false;
1569 }
1570
1571
1572 // SharedBlob
1573
1574 #undef dout_prefix
1575 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1576
1577 ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1578 {
1579 out << "SharedBlob(" << &sb;
1580
1581 if (sb.loaded) {
1582 out << " loaded " << *sb.persistent;
1583 } else {
1584 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1585 }
1586 return out << ")";
1587 }
1588
1589 BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1590 : coll(_coll), sbid_unloaded(i)
1591 {
1592 assert(sbid_unloaded > 0);
1593 if (get_cache()) {
1594 get_cache()->add_blob();
1595 }
1596 }
1597
1598 BlueStore::SharedBlob::~SharedBlob()
1599 {
1600 if (get_cache()) { // the dummy instances have a nullptr
1601 std::lock_guard<std::recursive_mutex> l(get_cache()->lock);
1602 bc._clear(get_cache());
1603 get_cache()->rm_blob();
1604 }
1605 if (loaded && persistent) {
1606 delete persistent;
1607 }
1608 }
1609
1610 void BlueStore::SharedBlob::put()
1611 {
1612 if (--nref == 0) {
1613 ldout(coll->store->cct, 20) << __func__ << " " << this
1614 << " removing self from set " << get_parent()
1615 << dendl;
1616 if (get_parent()) {
1617 if (get_parent()->remove(this)) {
1618 delete this;
1619 } else {
1620 ldout(coll->store->cct, 20)
1621 << __func__ << " " << this << " lost race to remove myself from set"
1622 << dendl;
1623 }
1624 } else {
1625 delete this;
1626 }
1627 }
1628 }
1629
1630 void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1631 {
1632 assert(persistent);
1633 persistent->ref_map.get(offset, length);
1634 }
1635
1636 void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
1637 PExtentVector *r,
1638 set<SharedBlob*> *maybe_unshared)
1639 {
1640 assert(persistent);
1641 bool maybe = false;
1642 persistent->ref_map.put(offset, length, r, maybe_unshared ? &maybe : nullptr);
1643 if (maybe_unshared && maybe) {
1644 maybe_unshared->insert(this);
1645 }
1646 }
1647
1648 // Blob
1649
1650 #undef dout_prefix
1651 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1652
1653 ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1654 {
1655 out << "Blob(" << &b;
1656 if (b.is_spanning()) {
1657 out << " spanning " << b.id;
1658 }
1659 out << " " << b.get_blob() << " " << b.get_blob_use_tracker()
1660 << " " << *b.shared_blob
1661 << ")";
1662 return out;
1663 }
1664
1665 void BlueStore::Blob::discard_unallocated(Collection *coll)
1666 {
1667 if (blob.is_shared()) {
1668 return;
1669 }
1670 if (blob.is_compressed()) {
1671 bool discard = false;
1672 bool all_invalid = true;
1673 for (auto e : blob.get_extents()) {
1674 if (!e.is_valid()) {
1675 discard = true;
1676 } else {
1677 all_invalid = false;
1678 }
1679 }
1680     assert(discard == all_invalid); // for a compressed blob, either all
1681                                     // pextents are invalid or none are.
1682 if (discard) {
1683 shared_blob->bc.discard(shared_blob->get_cache(), 0, blob.get_logical_length());
1684 }
1685 } else {
1686 size_t pos = 0;
1687 for (auto e : blob.get_extents()) {
1688 if (!e.is_valid()) {
1689 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1690 << "~" << e.length
1691 << std::dec << dendl;
1692 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1693 }
1694 pos += e.length;
1695 }
1696 if (blob.can_prune_tail()) {
1697 dirty_blob();
1698 blob.prune_tail();
1699 used_in_blob.prune_tail(blob.get_ondisk_length());
1700 auto cct = coll->store->cct; //used by dout
1701 dout(20) << __func__ << " pruned tail, now " << blob << dendl;
1702 }
1703 }
1704 }
1705
1706 void BlueStore::Blob::get_ref(
1707 Collection *coll,
1708 uint32_t offset,
1709 uint32_t length)
1710 {
1711   // The caller has to initialize the Blob's logical length before
1712   // incrementing references. Otherwise we can neither determine the required
1713   // number of counters for per-au tracking nor obtain min_release_size for
1714   // single-counter mode.
1715 assert(get_blob().get_logical_length() != 0);
1716 auto cct = coll->store->cct;
1717 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1718 << std::dec << " " << *this << dendl;
1719
1720 if (used_in_blob.is_empty()) {
1721 uint32_t min_release_size =
1722 blob.get_release_size(coll->store->min_alloc_size);
1723 uint64_t l = blob.get_logical_length();
1724 dout(20) << __func__ << " init 0x" << std::hex << l << ", " << min_release_size
1725 << std::dec << dendl;
1726 used_in_blob.init(l, min_release_size);
1727 }
1728 used_in_blob.get(
1729 offset,
1730 length);
1731 }
1732
1733 bool BlueStore::Blob::put_ref(
1734 Collection *coll,
1735 uint32_t offset,
1736 uint32_t length,
1737 PExtentVector *r)
1738 {
1739 PExtentVector logical;
1740
1741 auto cct = coll->store->cct;
1742 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1743 << std::dec << " " << *this << dendl;
1744
1745 bool empty = used_in_blob.put(
1746 offset,
1747 length,
1748 &logical);
1749 r->clear();
1750 // nothing to release
1751 if (!empty && logical.empty()) {
1752 return false;
1753 }
1754
1755 bluestore_blob_t& b = dirty_blob();
1756 return b.release_extents(empty, logical, r);
1757 }
1758
1759 bool BlueStore::Blob::try_reuse_blob(uint32_t min_alloc_size,
1760 uint32_t target_blob_size,
1761 uint32_t b_offset,
1762 uint32_t *length0) {
1763 assert(min_alloc_size);
1764 assert(target_blob_size);
1765 if (!get_blob().is_mutable()) {
1766 return false;
1767 }
1768
1769 uint32_t length = *length0;
1770 uint32_t end = b_offset + length;
1771
1772   // For simplicity we currently skip blob reuse if the data is not aligned
1773   // to the csum chunk size. Later we can perform padding if needed.
1774 if (get_blob().has_csum() &&
1775 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1776 (end % get_blob().get_csum_chunk_size()) != 0)) {
1777 return false;
1778 }
1779
1780 auto blen = get_blob().get_logical_length();
1781 uint32_t new_blen = blen;
1782
1783 // make sure target_blob_size isn't less than current blob len
1784 target_blob_size = MAX(blen, target_blob_size);
1785
1786 if (b_offset >= blen) {
1787     // new data lies entirely beyond the existing blob
1788 new_blen = b_offset + length;
1789 } else {
1790 //new data overlaps with the existing blob
1791 new_blen = MAX(blen, length + b_offset);
1792 if (!get_blob().is_unallocated(
1793 b_offset,
1794 new_blen > blen ? blen - b_offset : length)) {
1795 return false;
1796 }
1797 }
1798 if (new_blen > blen) {
1799 int64_t overflow = int64_t(new_blen) - target_blob_size;
1800     // Unable to decrease the provided length enough to fit into target_blob_size
1801 if (overflow >= length) {
1802 return false;
1803 }
1804
1805 // FIXME: in some cases we could reduce unused resolution
1806 if (get_blob().has_unused()) {
1807 return false;
1808 }
1809
1810 if (overflow > 0) {
1811 new_blen -= overflow;
1812 length -= overflow;
1813 *length0 = length;
1814 }
1815 if (new_blen > blen) {
1816 dirty_blob().add_tail(new_blen);
1817 used_in_blob.add_tail(new_blen,
1818 blob.get_release_size(min_alloc_size));
1819 }
1820 }
1821 return true;
1822 }
1823
1824 void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1825 {
1826 auto cct = coll->store->cct; //used by dout
1827 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1828 << " start " << *this << dendl;
1829 assert(blob.can_split());
1830 assert(used_in_blob.can_split());
1831 bluestore_blob_t &lb = dirty_blob();
1832 bluestore_blob_t &rb = r->dirty_blob();
1833
1834 used_in_blob.split(
1835 blob_offset,
1836 &(r->used_in_blob));
1837
1838 lb.split(blob_offset, rb);
1839 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1840
1841 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1842 << " finish " << *this << dendl;
1843 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1844 << " and " << *r << dendl;
1845 }
1846
1847 #ifndef CACHE_BLOB_BL
1848 void BlueStore::Blob::decode(
1849 Collection *coll,
1850 bufferptr::iterator& p,
1851 uint64_t struct_v,
1852 uint64_t* sbid,
1853 bool include_ref_map)
1854 {
1855 denc(blob, p, struct_v);
1856 if (blob.is_shared()) {
1857 denc(*sbid, p);
1858 }
1859 if (include_ref_map) {
1860 if (struct_v > 1) {
1861 used_in_blob.decode(p);
1862 } else {
1863 used_in_blob.clear();
1864 bluestore_extent_ref_map_t legacy_ref_map;
1865 legacy_ref_map.decode(p);
1866 for (auto r : legacy_ref_map.ref_map) {
1867 get_ref(
1868 coll,
1869 r.first,
1870 r.second.refs * r.second.length);
1871 }
1872 }
1873 }
1874 }
1875 #endif
1876
1877 // Extent
1878
1879 ostream& operator<<(ostream& out, const BlueStore::Extent& e)
1880 {
1881 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
1882 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
1883 << " " << *e.blob;
1884 }
1885
1886 // OldExtent
1887 BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
1888 uint32_t lo,
1889 uint32_t o,
1890 uint32_t l,
1891 BlobRef& b) {
1892 OldExtent* oe = new OldExtent(lo, o, l, b);
1893 b->put_ref(c.get(), o, l, &(oe->r));
1894 oe->blob_empty = b->get_referenced_bytes() == 0;
1895 return oe;
1896 }
1897
1898 // ExtentMap
1899
1900 #undef dout_prefix
1901 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1902
1903 BlueStore::ExtentMap::ExtentMap(Onode *o)
1904 : onode(o),
1905 inline_bl(
1906 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
1907 }
1908
1909 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
1910 bool force)
1911 {
1912 auto cct = onode->c->store->cct; //used by dout
1913 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
1914 if (onode->onode.extent_map_shards.empty()) {
1915 if (inline_bl.length() == 0) {
1916 unsigned n;
1917 // we need to encode inline_bl to measure encoded length
1918 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
1919 assert(!never_happen);
1920 size_t len = inline_bl.length();
1921 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
1922 << " extents" << dendl;
1923 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
1924 request_reshard(0, OBJECT_MAX_SIZE);
1925 return;
1926 }
1927 }
1928 // will persist in the onode key.
1929 } else {
1930 // pending shard update
1931 struct dirty_shard_t {
1932 Shard *shard;
1933 bufferlist bl;
1934 dirty_shard_t(Shard *s) : shard(s) {}
1935 };
1936 vector<dirty_shard_t> encoded_shards;
1937 // allocate slots for all shards in a single call instead of
1938 // doing multiple allocations - one per dirty shard
1939 encoded_shards.reserve(shards.size());
1940
1941 auto p = shards.begin();
1942 auto prev_p = p;
1943 while (p != shards.end()) {
1944 assert(p->shard_info->offset >= prev_p->shard_info->offset);
1945 auto n = p;
1946 ++n;
1947 if (p->dirty) {
1948 uint32_t endoff;
1949 if (n == shards.end()) {
1950 endoff = OBJECT_MAX_SIZE;
1951 } else {
1952 endoff = n->shard_info->offset;
1953 }
1954 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
1955 bufferlist& bl = encoded_shards.back().bl;
1956 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
1957 bl, &p->extents)) {
1958 if (force) {
1959 derr << __func__ << " encode_some needs reshard" << dendl;
1960 assert(!force);
1961 }
1962 }
1963 size_t len = bl.length();
1964
1965 dout(20) << __func__ << " shard 0x" << std::hex
1966 << p->shard_info->offset << std::dec << " is " << len
1967 << " bytes (was " << p->shard_info->bytes << ") from "
1968 << p->extents << " extents" << dendl;
1969
1970 if (!force) {
1971 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
1972 // we are big; reshard ourselves
1973 request_reshard(p->shard_info->offset, endoff);
1974 }
1975 // avoid resharding the trailing shard, even if it is small
1976 else if (n != shards.end() &&
1977 len < cct->_conf->bluestore_extent_map_shard_min_size) {
1978 assert(endoff != OBJECT_MAX_SIZE);
1979 if (p == shards.begin()) {
1980 // we are the first shard, combine with next shard
1981 request_reshard(p->shard_info->offset, endoff + 1);
1982 } else {
1983 // combine either with the previous shard or the next,
1984 // whichever is smaller
1985 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
1986 request_reshard(p->shard_info->offset, endoff + 1);
1987 } else {
1988 request_reshard(prev_p->shard_info->offset, endoff);
1989 }
1990 }
1991 }
1992 }
1993 }
1994 prev_p = p;
1995 p = n;
1996 }
1997 if (needs_reshard()) {
1998 return;
1999 }
2000
2001 // schedule DB update for dirty shards
2002 string key;
2003 for (auto& it : encoded_shards) {
2004 it.shard->dirty = false;
2005 it.shard->shard_info->bytes = it.bl.length();
2006 generate_extent_shard_key_and_apply(
2007 onode->key,
2008 it.shard->shard_info->offset,
2009 &key,
2010 [&](const string& final_key) {
2011 t->set(PREFIX_OBJ, final_key, it.bl);
2012 }
2013 );
2014 }
2015 }
2016 }
2017
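// Pick an unused spanning blob id: normally one past the current maximum;
// if that overflows the signed id space, probe from a random starting point
// until a free id is found.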
2018 bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2019 {
2020 if (spanning_blob_map.empty())
2021 return 0;
2022 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2023 // bid is valid and available.
2024 if (bid >= 0)
2025 return bid;
2026 // Find next unused bid;
2027 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2028 const auto begin_bid = bid;
2029 do {
2030 if (!spanning_blob_map.count(bid))
2031 return bid;
2032 else {
2033 bid++;
2034 if (bid < 0) bid = 0;
2035 }
2036 } while (bid != begin_bid);
2037 assert(0 == "no available blob id");
2038 }
2039
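// Recompute shard boundaries over [needs_reshard_begin, needs_reshard_end),
// aiming for bluestore_extent_map_shard_target_size bytes per shard, splice
// the new shard_info into the onode, and then split (or mark as spanning)
// any blobs that cross the new boundaries.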
2040 void BlueStore::ExtentMap::reshard(
2041 KeyValueDB *db,
2042 KeyValueDB::Transaction t)
2043 {
2044 auto cct = onode->c->store->cct; // used by dout
2045
2046 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2047 << needs_reshard_end << ")" << std::dec
2048 << " of " << onode->onode.extent_map_shards.size()
2049 << " shards on " << onode->oid << dendl;
2050 for (auto& p : spanning_blob_map) {
2051 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2052 << dendl;
2053 }
2054 // determine shard index range
2055 unsigned si_begin = 0, si_end = 0;
2056 if (!shards.empty()) {
2057 while (si_begin + 1 < shards.size() &&
2058 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2059 ++si_begin;
2060 }
2061 needs_reshard_begin = shards[si_begin].shard_info->offset;
2062 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2063 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2064 needs_reshard_end = shards[si_end].shard_info->offset;
2065 break;
2066 }
2067 }
2068 if (si_end == shards.size()) {
2069 needs_reshard_end = OBJECT_MAX_SIZE;
2070 }
2071 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2072 << " over 0x[" << std::hex << needs_reshard_begin << ","
2073 << needs_reshard_end << ")" << std::dec << dendl;
2074 }
2075
2076 fault_range(db, needs_reshard_begin, needs_reshard_end);
2077
2078 // we may need to fault in a larger interval later; we must have all
2079 // referring extents for spanning blobs loaded in order to have
2080 // accurate use_tracker values.
2081 uint32_t spanning_scan_begin = needs_reshard_begin;
2082 uint32_t spanning_scan_end = needs_reshard_end;
2083
2084 // remove old keys
2085 string key;
2086 for (unsigned i = si_begin; i < si_end; ++i) {
2087 generate_extent_shard_key_and_apply(
2088 onode->key, shards[i].shard_info->offset, &key,
2089 [&](const string& final_key) {
2090 t->rmkey(PREFIX_OBJ, final_key);
2091 }
2092 );
2093 }
2094
2095 // calculate average extent size
2096 unsigned bytes = 0;
2097 unsigned extents = 0;
2098 if (onode->onode.extent_map_shards.empty()) {
2099 bytes = inline_bl.length();
2100 extents = extent_map.size();
2101 } else {
2102 for (unsigned i = si_begin; i < si_end; ++i) {
2103 bytes += shards[i].shard_info->bytes;
2104 extents += shards[i].extents;
2105 }
2106 }
2107 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2108 unsigned slop = target *
2109 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2110 unsigned extent_avg = bytes / MAX(1, extents);
2111 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2112 << ", slop " << slop << dendl;
2113
2114 // reshard
2115 unsigned estimate = 0;
2116 unsigned offset = needs_reshard_begin;
2117 vector<bluestore_onode_t::shard_info> new_shard_info;
2118 unsigned max_blob_end = 0;
2119 Extent dummy(needs_reshard_begin);
2120 for (auto e = extent_map.lower_bound(dummy);
2121 e != extent_map.end();
2122 ++e) {
2123 if (e->logical_offset >= needs_reshard_end) {
2124 break;
2125 }
2126 dout(30) << " extent " << *e << dendl;
2127
2128 // disfavor shard boundaries that span a blob
2129 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2130 if (estimate &&
2131 estimate + extent_avg > target + (would_span ? slop : 0)) {
2132 // new shard
2133 if (offset == needs_reshard_begin) {
2134 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2135 new_shard_info.back().offset = offset;
2136 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2137 << std::dec << dendl;
2138 }
2139 offset = e->logical_offset;
2140 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2141 new_shard_info.back().offset = offset;
2142 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2143 << std::dec << dendl;
2144 estimate = 0;
2145 }
2146 estimate += extent_avg;
2147 unsigned bs = e->blob_start();
2148 if (bs < spanning_scan_begin) {
2149 spanning_scan_begin = bs;
2150 }
2151 uint32_t be = e->blob_end();
2152 if (be > max_blob_end) {
2153 max_blob_end = be;
2154 }
2155 if (be > spanning_scan_end) {
2156 spanning_scan_end = be;
2157 }
2158 }
2159 if (new_shard_info.empty() && (si_begin > 0 ||
2160 si_end < shards.size())) {
2161 // we resharded a partial range; we must produce at least one output
2162 // shard
2163 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2164 new_shard_info.back().offset = needs_reshard_begin;
2165 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2166 << std::dec << " (singleton degenerate case)" << dendl;
2167 }
2168
2169 auto& sv = onode->onode.extent_map_shards;
2170 dout(20) << __func__ << " new " << new_shard_info << dendl;
2171 dout(20) << __func__ << " old " << sv << dendl;
2172 if (sv.empty()) {
2173 // no old shards to keep
2174 sv.swap(new_shard_info);
2175 init_shards(true, true);
2176 } else {
2177 // splice in new shards
2178 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2179 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2180 sv.insert(
2181 sv.begin() + si_begin,
2182 new_shard_info.begin(),
2183 new_shard_info.end());
2184 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2185 si_end = si_begin + new_shard_info.size();
2186
2187 assert(sv.size() == shards.size());
2188
2189 // note that we need to update every shard_info of shards here,
2190 // as sv might have been totally re-allocated above
2191 for (unsigned i = 0; i < shards.size(); i++) {
2192 shards[i].shard_info = &sv[i];
2193 }
2194
2195 // mark newly added shards as dirty
2196 for (unsigned i = si_begin; i < si_end; ++i) {
2197 shards[i].loaded = true;
2198 shards[i].dirty = true;
2199 }
2200 }
2201 dout(20) << __func__ << " fin " << sv << dendl;
2202 inline_bl.clear();
2203
2204 if (sv.empty()) {
2205 // no more shards; unspan all previously spanning blobs
2206 auto p = spanning_blob_map.begin();
2207 while (p != spanning_blob_map.end()) {
2208 p->second->id = -1;
2209 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2210 p = spanning_blob_map.erase(p);
2211 }
2212 } else {
2213 // identify new spanning blobs
2214 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2215 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2216 if (spanning_scan_begin < needs_reshard_begin) {
2217 fault_range(db, spanning_scan_begin,
2218 needs_reshard_begin - spanning_scan_begin);
2219 }
2220 if (spanning_scan_end > needs_reshard_end) {
2221 fault_range(db, needs_reshard_end,
2222 spanning_scan_end - needs_reshard_end);
2223 }
2224 auto sp = sv.begin() + si_begin;
2225 auto esp = sv.end();
2226 unsigned shard_start = sp->offset;
2227 unsigned shard_end;
2228 ++sp;
2229 if (sp == esp) {
2230 shard_end = OBJECT_MAX_SIZE;
2231 } else {
2232 shard_end = sp->offset;
2233 }
2234 Extent dummy(needs_reshard_begin);
2235 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2236 if (e->logical_offset >= needs_reshard_end) {
2237 break;
2238 }
2239 dout(30) << " extent " << *e << dendl;
2240 while (e->logical_offset >= shard_end) {
2241 shard_start = shard_end;
2242 assert(sp != esp);
2243 ++sp;
2244 if (sp == esp) {
2245 shard_end = OBJECT_MAX_SIZE;
2246 } else {
2247 shard_end = sp->offset;
2248 }
2249 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2250 << " to 0x" << shard_end << std::dec << dendl;
2251 }
2252 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2253 if (!e->blob->is_spanning()) {
2254 // We have two options: (1) split the blob into pieces at the
2255 // shard boundaries (and adjust extents accordingly), or (2)
2256 // mark it spanning. We prefer to cut the blob if we can. Note that
2257 // we may have to split it multiple times--potentially at every
2258 // shard boundary.
2259 bool must_span = false;
2260 BlobRef b = e->blob;
2261 if (b->can_split()) {
2262 uint32_t bstart = e->blob_start();
2263 uint32_t bend = e->blob_end();
2264 for (const auto& sh : shards) {
2265 if (bstart < sh.shard_info->offset &&
2266 bend > sh.shard_info->offset) {
2267 uint32_t blob_offset = sh.shard_info->offset - bstart;
2268 if (b->can_split_at(blob_offset)) {
2269 dout(20) << __func__ << " splitting blob, bstart 0x"
2270 << std::hex << bstart << " blob_offset 0x"
2271 << blob_offset << std::dec << " " << *b << dendl;
2272 b = split_blob(b, blob_offset, sh.shard_info->offset);
2273 // switch b to the new right-hand side, in case it
2274 // *also* has to get split.
2275 bstart += blob_offset;
2276 onode->c->store->logger->inc(l_bluestore_blob_split);
2277 } else {
2278 must_span = true;
2279 break;
2280 }
2281 }
2282 }
2283 } else {
2284 must_span = true;
2285 }
2286 if (must_span) {
2287 auto bid = allocate_spanning_blob_id();
2288 b->id = bid;
2289 spanning_blob_map[b->id] = b;
2290 dout(20) << __func__ << " adding spanning " << *b << dendl;
2291 }
2292 }
2293 } else {
2294 if (e->blob->is_spanning()) {
2295 spanning_blob_map.erase(e->blob->id);
2296 e->blob->id = -1;
2297 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2298 }
2299 }
2300 }
2301 }
2302
2303 clear_needs_reshard();
2304 }
2305
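// Encode extents overlapping [offset, offset+length) into bl using the
// compact blobid/flag encoding. Returns true (after requesting a reshard)
// if a non-spanning blob escapes the range and the shard cannot be encoded
// as-is; otherwise returns false and sets *pn (when provided) to the number
// of extents encoded.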
2306 bool BlueStore::ExtentMap::encode_some(
2307 uint32_t offset,
2308 uint32_t length,
2309 bufferlist& bl,
2310 unsigned *pn)
2311 {
2312 auto cct = onode->c->store->cct; //used by dout
2313 Extent dummy(offset);
2314 auto start = extent_map.lower_bound(dummy);
2315 uint32_t end = offset + length;
2316
2317 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2318 // serialization only. Hence there is no specific
2319 // handling at ExtentMap level.
2320
2321 unsigned n = 0;
2322 size_t bound = 0;
2323 bool must_reshard = false;
2324 for (auto p = start;
2325 p != extent_map.end() && p->logical_offset < end;
2326 ++p, ++n) {
2327 assert(p->logical_offset >= offset);
2328 p->blob->last_encoded_id = -1;
2329 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2330 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2331 << std::dec << " hit new spanning blob " << *p << dendl;
2332 request_reshard(p->blob_start(), p->blob_end());
2333 must_reshard = true;
2334 }
2335 if (!must_reshard) {
2336 denc_varint(0, bound); // blobid
2337 denc_varint(0, bound); // logical_offset
2338 denc_varint(0, bound); // len
2339 denc_varint(0, bound); // blob_offset
2340
2341 p->blob->bound_encode(
2342 bound,
2343 struct_v,
2344 p->blob->shared_blob->get_sbid(),
2345 false);
2346 }
2347 }
2348 if (must_reshard) {
2349 return true;
2350 }
2351
2352 denc(struct_v, bound);
2353 denc_varint(0, bound); // number of extents
2354
2355 {
2356 auto app = bl.get_contiguous_appender(bound);
2357 denc(struct_v, app);
2358 denc_varint(n, app);
2359 if (pn) {
2360 *pn = n;
2361 }
2362
2363 n = 0;
2364 uint64_t pos = 0;
2365 uint64_t prev_len = 0;
2366 for (auto p = start;
2367 p != extent_map.end() && p->logical_offset < end;
2368 ++p, ++n) {
2369 unsigned blobid;
2370 bool include_blob = false;
2371 if (p->blob->is_spanning()) {
2372 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2373 blobid |= BLOBID_FLAG_SPANNING;
2374 } else if (p->blob->last_encoded_id < 0) {
2375 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2376 include_blob = true;
2377 blobid = 0; // the decoder will infer the id from n
2378 } else {
2379 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2380 }
2381 if (p->logical_offset == pos) {
2382 blobid |= BLOBID_FLAG_CONTIGUOUS;
2383 }
2384 if (p->blob_offset == 0) {
2385 blobid |= BLOBID_FLAG_ZEROOFFSET;
2386 }
2387 if (p->length == prev_len) {
2388 blobid |= BLOBID_FLAG_SAMELENGTH;
2389 } else {
2390 prev_len = p->length;
2391 }
2392 denc_varint(blobid, app);
2393 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2394 denc_varint_lowz(p->logical_offset - pos, app);
2395 }
2396 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2397 denc_varint_lowz(p->blob_offset, app);
2398 }
2399 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2400 denc_varint_lowz(p->length, app);
2401 }
2402 pos = p->logical_end();
2403 if (include_blob) {
2404 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2405 }
2406 }
2407 }
2408 /*derr << __func__ << bl << dendl;
2409 derr << __func__ << ":";
2410 bl.hexdump(*_dout);
2411 *_dout << dendl;
2412 */
2413 return false;
2414 }
2415
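// Decode a shard produced by encode_some, rebuilding extents and the
// per-blob ref_maps for non-spanning blobs. Returns the number of extents
// decoded.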
2416 unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2417 {
2418 auto cct = onode->c->store->cct; //used by dout
2419 /*
2420 derr << __func__ << ":";
2421 bl.hexdump(*_dout);
2422 *_dout << dendl;
2423 */
2424
2425 assert(bl.get_num_buffers() <= 1);
2426 auto p = bl.front().begin_deep();
2427 __u8 struct_v;
2428 denc(struct_v, p);
2429 // Version 2 differs from v1 in blob's ref_map
2430 // serialization only. Hence there is no specific
2431 // handling at ExtentMap level below.
2432 assert(struct_v == 1 || struct_v == 2);
2433
2434 uint32_t num;
2435 denc_varint(num, p);
2436 vector<BlobRef> blobs(num);
2437 uint64_t pos = 0;
2438 uint64_t prev_len = 0;
2439 unsigned n = 0;
2440
2441 while (!p.end()) {
2442 Extent *le = new Extent();
2443 uint64_t blobid;
2444 denc_varint(blobid, p);
2445 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2446 uint64_t gap;
2447 denc_varint_lowz(gap, p);
2448 pos += gap;
2449 }
2450 le->logical_offset = pos;
2451 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2452 denc_varint_lowz(le->blob_offset, p);
2453 } else {
2454 le->blob_offset = 0;
2455 }
2456 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2457 denc_varint_lowz(prev_len, p);
2458 }
2459 le->length = prev_len;
2460
2461 if (blobid & BLOBID_FLAG_SPANNING) {
2462 dout(30) << __func__ << " getting spanning blob "
2463 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2464 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2465 } else {
2466 blobid >>= BLOBID_SHIFT_BITS;
2467 if (blobid) {
2468 le->assign_blob(blobs[blobid - 1]);
2469 assert(le->blob);
2470 } else {
2471 Blob *b = new Blob();
2472 uint64_t sbid = 0;
2473 b->decode(onode->c, p, struct_v, &sbid, false);
2474 blobs[n] = b;
2475 onode->c->open_shared_blob(sbid, b);
2476 le->assign_blob(b);
2477 }
2478 // we build ref_map dynamically for non-spanning blobs
2479 le->blob->get_ref(
2480 onode->c,
2481 le->blob_offset,
2482 le->length);
2483 }
2484 pos += prev_len;
2485 ++n;
2486 extent_map.insert(*le);
2487 }
2488
2489 assert(n == num);
2490 return num;
2491 }
2492
2493 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2494 {
2495 // Version 2 differs from v1 in blob's ref_map
2496 // serialization only. Hence there is no specific
2497 // handling at ExtentMap level.
2498 __u8 struct_v = 2;
2499
2500 denc(struct_v, p);
2501 denc_varint((uint32_t)0, p);
2502 size_t key_size = 0;
2503 denc_varint((uint32_t)0, key_size);
2504 p += spanning_blob_map.size() * key_size;
2505 for (const auto& i : spanning_blob_map) {
2506 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2507 }
2508 }
2509
2510 void BlueStore::ExtentMap::encode_spanning_blobs(
2511 bufferlist::contiguous_appender& p)
2512 {
2513 // Version 2 differs from v1 in blob's ref_map
2514 // serialization only. Hence there is no specific
2515 // handling at ExtentMap level.
2516 __u8 struct_v = 2;
2517
2518 denc(struct_v, p);
2519 denc_varint(spanning_blob_map.size(), p);
2520 for (auto& i : spanning_blob_map) {
2521 denc_varint(i.second->id, p);
2522 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2523 }
2524 }
2525
2526 void BlueStore::ExtentMap::decode_spanning_blobs(
2527 bufferptr::iterator& p)
2528 {
2529 __u8 struct_v;
2530 denc(struct_v, p);
2531 // Version 2 differs from v1 in blob's ref_map
2532 // serialization only. Hence there is no specific
2533 // handling at ExtentMap level.
2534 assert(struct_v == 1 || struct_v == 2);
2535
2536 unsigned n;
2537 denc_varint(n, p);
2538 while (n--) {
2539 BlobRef b(new Blob());
2540 denc_varint(b->id, p);
2541 spanning_blob_map[b->id] = b;
2542 uint64_t sbid = 0;
2543 b->decode(onode->c, p, struct_v, &sbid, true);
2544 onode->c->open_shared_blob(sbid, b);
2545 }
2546 }
2547
2548 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2549 {
2550 shards.resize(onode->onode.extent_map_shards.size());
2551 unsigned i = 0;
2552 for (auto &s : onode->onode.extent_map_shards) {
2553 shards[i].shard_info = &s;
2554 shards[i].loaded = loaded;
2555 shards[i].dirty = dirty;
2556 ++i;
2557 }
2558 }
2559
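// Ensure every shard overlapping [offset, offset+length) is loaded,
// fetching and decoding any missing shard from the KV store.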
2560 void BlueStore::ExtentMap::fault_range(
2561 KeyValueDB *db,
2562 uint32_t offset,
2563 uint32_t length)
2564 {
2565 auto cct = onode->c->store->cct; //used by dout
2566 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2567 << std::dec << dendl;
2568 auto start = seek_shard(offset);
2569 auto last = seek_shard(offset + length);
2570
2571 if (start < 0)
2572 return;
2573
2574 assert(last >= start);
2575 string key;
2576 while (start <= last) {
2577 assert((size_t)start < shards.size());
2578 auto p = &shards[start];
2579 if (!p->loaded) {
2580 dout(30) << __func__ << " opening shard 0x" << std::hex
2581 << p->shard_info->offset << std::dec << dendl;
2582 bufferlist v;
2583 generate_extent_shard_key_and_apply(
2584 onode->key, p->shard_info->offset, &key,
2585 [&](const string& final_key) {
2586 int r = db->get(PREFIX_OBJ, final_key, &v);
2587 if (r < 0) {
2588 derr << __func__ << " missing shard 0x" << std::hex
2589 << p->shard_info->offset << std::dec << " for " << onode->oid
2590 << dendl;
2591 assert(r >= 0);
2592 }
2593 }
2594 );
2595 p->extents = decode_some(v);
2596 p->loaded = true;
2597 dout(20) << __func__ << " open shard 0x" << std::hex
2598 << p->shard_info->offset << std::dec
2599 << " (" << v.length() << " bytes)" << dendl;
2600 assert(p->dirty == false);
2601 assert(v.length() == p->shard_info->bytes);
2602 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2603 } else {
2604 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2605 }
2606 ++start;
2607 }
2608 }
2609
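// Mark every shard overlapping [offset, offset+length) dirty (or invalidate
// the inline shard); the shards must already be loaded via fault_range.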
2610 void BlueStore::ExtentMap::dirty_range(
2611 uint32_t offset,
2612 uint32_t length)
2613 {
2614 auto cct = onode->c->store->cct; //used by dout
2615 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2616 << std::dec << dendl;
2617 if (shards.empty()) {
2618 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2619 inline_bl.clear();
2620 return;
2621 }
2622 auto start = seek_shard(offset);
2623 auto last = seek_shard(offset + length);
2624 if (start < 0)
2625 return;
2626
2627 assert(last >= start);
2628 while (start <= last) {
2629 assert((size_t)start < shards.size());
2630 auto p = &shards[start];
2631 if (!p->loaded) {
2632 dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
2633 << std::dec << " is not loaded, can't mark dirty" << dendl;
2634 assert(0 == "can't mark unloaded shard dirty");
2635 }
2636 if (!p->dirty) {
2637 dout(20) << __func__ << " mark shard 0x" << std::hex
2638 << p->shard_info->offset << std::dec << " dirty" << dendl;
2639 p->dirty = true;
2640 }
2641 ++start;
2642 }
2643 }
2644
2645 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2646 uint64_t offset)
2647 {
2648 Extent dummy(offset);
2649 return extent_map.find(dummy);
2650 }
2651
2652 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2653 uint64_t offset)
2654 {
2655 Extent dummy(offset);
2656 auto fp = extent_map.lower_bound(dummy);
2657 if (fp != extent_map.begin()) {
2658 --fp;
2659 if (fp->logical_end() <= offset) {
2660 ++fp;
2661 }
2662 }
2663 return fp;
2664 }
2665
2666 BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2667 uint64_t offset) const
2668 {
2669 Extent dummy(offset);
2670 auto fp = extent_map.lower_bound(dummy);
2671 if (fp != extent_map.begin()) {
2672 --fp;
2673 if (fp->logical_end() <= offset) {
2674 ++fp;
2675 }
2676 }
2677 return fp;
2678 }
2679
2680 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2681 {
2682 auto fp = seek_lextent(offset);
2683 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2684 return false;
2685 }
2686 return true;
2687 }
2688
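// Merge adjacent extents in [offset, offset+length) that are contiguous in
// both logical and blob space and share a blob, without merging across a
// shard boundary. Returns the number of extents removed.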
2689 int BlueStore::ExtentMap::compress_extent_map(
2690 uint64_t offset,
2691 uint64_t length)
2692 {
2693 auto cct = onode->c->store->cct; //used by dout
2694 if (extent_map.empty())
2695 return 0;
2696 int removed = 0;
2697 auto p = seek_lextent(offset);
2698 if (p != extent_map.begin()) {
2699 --p; // start to the left of offset
2700 }
2701 // the caller should have just written to this region
2702 assert(p != extent_map.end());
2703
2704 // identify the *next* shard
2705 auto pshard = shards.begin();
2706 while (pshard != shards.end() &&
2707 p->logical_offset >= pshard->shard_info->offset) {
2708 ++pshard;
2709 }
2710 uint64_t shard_end;
2711 if (pshard != shards.end()) {
2712 shard_end = pshard->shard_info->offset;
2713 } else {
2714 shard_end = OBJECT_MAX_SIZE;
2715 }
2716
2717 auto n = p;
2718 for (++n; n != extent_map.end(); p = n++) {
2719 if (n->logical_offset > offset + length) {
2720 break; // stop after end
2721 }
2722 while (n != extent_map.end() &&
2723 p->logical_end() == n->logical_offset &&
2724 p->blob == n->blob &&
2725 p->blob_offset + p->length == n->blob_offset &&
2726 n->logical_offset < shard_end) {
2727 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2728 << " next shard 0x" << shard_end << std::dec
2729 << " merging " << *p << " and " << *n << dendl;
2730 p->length += n->length;
2731 rm(n++);
2732 ++removed;
2733 }
2734 if (n == extent_map.end()) {
2735 break;
2736 }
2737 if (n->logical_offset >= shard_end) {
2738 assert(pshard != shards.end());
2739 ++pshard;
2740 if (pshard != shards.end()) {
2741 shard_end = pshard->shard_info->offset;
2742 } else {
2743 shard_end = OBJECT_MAX_SIZE;
2744 }
2745 }
2746 }
2747 if (removed && onode) {
2748 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2749 }
2750 return removed;
2751 }
2752
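// Remove the logical range [offset, offset+length) from the map, trimming
// or splitting any overlapping extents and recording the dereferenced
// pieces in old_extents for later cleanup.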
2753 void BlueStore::ExtentMap::punch_hole(
2754 CollectionRef &c,
2755 uint64_t offset,
2756 uint64_t length,
2757 old_extent_map_t *old_extents)
2758 {
2759 auto p = seek_lextent(offset);
2760 uint64_t end = offset + length;
2761 while (p != extent_map.end()) {
2762 if (p->logical_offset >= end) {
2763 break;
2764 }
2765 if (p->logical_offset < offset) {
2766 if (p->logical_end() > end) {
2767 // split and deref middle
2768 uint64_t front = offset - p->logical_offset;
2769 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
2770 length, p->blob);
2771 old_extents->push_back(*oe);
2772 add(end,
2773 p->blob_offset + front + length,
2774 p->length - front - length,
2775 p->blob);
2776 p->length = front;
2777 break;
2778 } else {
2779 // deref tail
2780 assert(p->logical_end() > offset); // else seek_lextent bug
2781 uint64_t keep = offset - p->logical_offset;
2782 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
2783 p->length - keep, p->blob);
2784 old_extents->push_back(*oe);
2785 p->length = keep;
2786 ++p;
2787 continue;
2788 }
2789 }
2790 if (p->logical_offset + p->length <= end) {
2791 // deref whole lextent
2792 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2793 p->length, p->blob);
2794 old_extents->push_back(*oe);
2795 rm(p++);
2796 continue;
2797 }
2798 // deref head
2799 uint64_t keep = p->logical_end() - end;
2800 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2801 p->length - keep, p->blob);
2802 old_extents->push_back(*oe);
2803
2804 add(end, p->blob_offset + p->length - keep, keep, p->blob);
2805 rm(p);
2806 break;
2807 }
2808 }
2809
2810 BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
2811 CollectionRef &c,
2812 uint64_t logical_offset,
2813 uint64_t blob_offset, uint64_t length, BlobRef b,
2814 old_extent_map_t *old_extents)
2815 {
2816 // We need to have completely initialized Blob to increment its ref counters.
2817 assert(b->get_blob().get_logical_length() != 0);
2818
2819 // Do get_ref prior to punch_hole to prevent putting a reused blob into the
2820 // old_extents list if we overwrite the blob completely.
2821 // This might happen during WAL overwrite.
2822 b->get_ref(onode->c, blob_offset, length);
2823
2824 if (old_extents) {
2825 punch_hole(c, logical_offset, length, old_extents);
2826 }
2827
2828 Extent *le = new Extent(logical_offset, blob_offset, length, b);
2829 extent_map.insert(*le);
2830 if (spans_shard(logical_offset, length)) {
2831 request_reshard(logical_offset, logical_offset + length);
2832 }
2833 return le;
2834 }
2835
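// Split blob lb at blob_offset (which maps to logical position pos):
// extents at or beyond pos are switched to the new right-hand blob,
// splitting any extent that straddles pos. Returns the new blob.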
2836 BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
2837 BlobRef lb,
2838 uint32_t blob_offset,
2839 uint32_t pos)
2840 {
2841 auto cct = onode->c->store->cct; //used by dout
2842
2843 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
2844 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
2845 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
2846 << dendl;
2847 BlobRef rb = onode->c->new_blob();
2848 lb->split(onode->c, blob_offset, rb.get());
2849
2850 for (auto ep = seek_lextent(pos);
2851 ep != extent_map.end() && ep->logical_offset < end_pos;
2852 ++ep) {
2853 if (ep->blob != lb) {
2854 continue;
2855 }
2856 if (ep->logical_offset < pos) {
2857 // split extent
2858 size_t left = pos - ep->logical_offset;
2859 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
2860 extent_map.insert(*ne);
2861 ep->length = left;
2862 dout(30) << __func__ << " split " << *ep << dendl;
2863 dout(30) << __func__ << " to " << *ne << dendl;
2864 } else {
2865 // switch blob
2866 assert(ep->blob_offset >= blob_offset);
2867
2868 ep->blob = rb;
2869 ep->blob_offset -= blob_offset;
2870 dout(30) << __func__ << " adjusted " << *ep << dendl;
2871 }
2872 }
2873 return rb;
2874 }
2875
2876 // Onode
2877
2878 #undef dout_prefix
2879 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
2880
2881 void BlueStore::Onode::flush()
2882 {
2883 if (flushing_count.load()) {
2884 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
2885 std::unique_lock<std::mutex> l(flush_lock);
2886 while (flushing_count.load()) {
2887 flush_cond.wait(l);
2888 }
2889 }
2890 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
2891 }
2892
2893 // =======================================================
2894 // WriteContext
2895
2896 /// Checks for writes to the same pextent within a blob
2897 bool BlueStore::WriteContext::has_conflict(
2898 BlobRef b,
2899 uint64_t loffs,
2900 uint64_t loffs_end,
2901 uint64_t min_alloc_size)
2902 {
2903 assert((loffs % min_alloc_size) == 0);
2904 assert((loffs_end % min_alloc_size) == 0);
2905 for (auto w : writes) {
2906 if (b == w.b) {
2907 auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
2908 auto loffs2_end = ROUND_UP_TO( w.logical_offset + w.length0, min_alloc_size);
2909 if ((loffs <= loffs2 && loffs_end > loffs2) ||
2910 (loffs >= loffs2 && loffs < loffs2_end)) {
2911 return true;
2912 }
2913 }
2914 }
2915 return false;
2916 }
2917
2918 // =======================================================
2919
2920 // DeferredBatch
2921 #undef dout_prefix
2922 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
2923
2924 void BlueStore::DeferredBatch::prepare_write(
2925 CephContext *cct,
2926 uint64_t seq, uint64_t offset, uint64_t length,
2927 bufferlist::const_iterator& blp)
2928 {
2929 _discard(cct, offset, length);
2930 auto i = iomap.insert(make_pair(offset, deferred_io()));
2931 assert(i.second); // this should be a new insertion
2932 i.first->second.seq = seq;
2933 blp.copy(length, i.first->second.bl);
2934 i.first->second.bl.reassign_to_mempool(
2935 mempool::mempool_bluestore_writing_deferred);
2936 dout(20) << __func__ << " seq " << seq
2937 << " 0x" << std::hex << offset << "~" << length
2938 << " crc " << i.first->second.bl.crc32c(-1)
2939 << std::dec << dendl;
2940 seq_bytes[seq] += length;
2941 #ifdef DEBUG_DEFERRED
2942 _audit(cct);
2943 #endif
2944 }
2945
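// Drop any queued deferred I/O overlapping [offset, offset+length), keeping
// head/tail fragments of partially overlapping entries and keeping the
// per-seq byte accounting in sync.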
2946 void BlueStore::DeferredBatch::_discard(
2947 CephContext *cct, uint64_t offset, uint64_t length)
2948 {
2949 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2950 << std::dec << dendl;
2951 auto p = iomap.lower_bound(offset);
2952 if (p != iomap.begin()) {
2953 --p;
2954 auto end = p->first + p->second.bl.length();
2955 if (end > offset) {
2956 bufferlist head;
2957 head.substr_of(p->second.bl, 0, offset - p->first);
2958 dout(20) << __func__ << " keep head " << p->second.seq
2959 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
2960 << " -> 0x" << head.length() << std::dec << dendl;
2961 auto i = seq_bytes.find(p->second.seq);
2962 if (end > offset + length) {
2963 bufferlist tail;
2964 tail.substr_of(p->second.bl, offset + length - p->first,
2965 end - (offset + length));
2966 dout(20) << __func__ << " keep tail " << p->second.seq
2967 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
2968 << " -> 0x" << tail.length() << std::dec << dendl;
2969 auto &n = iomap[offset + length];
2970 n.bl.swap(tail);
2971 n.seq = p->second.seq;
2972 i->second -= length;
2973 } else {
2974 i->second -= end - offset;
2975 }
2976 p->second.bl.swap(head);
2977 }
2978 ++p;
2979 }
2980 while (p != iomap.end()) {
2981 if (p->first >= offset + length) {
2982 break;
2983 }
2984 auto i = seq_bytes.find(p->second.seq);
2985 auto end = p->first + p->second.bl.length();
2986 if (end > offset + length) {
2987 unsigned drop_front = offset + length - p->first;
2988 unsigned keep_tail = end - (offset + length);
2989 dout(20) << __func__ << " truncate front " << p->second.seq
2990 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
2991 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
2992 << " to 0x" << (offset + length) << "~" << keep_tail
2993 << std::dec << dendl;
2994 auto &s = iomap[offset + length];
2995 s.seq = p->second.seq;
2996 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
2997 i->second -= drop_front;
2998 } else {
2999 dout(20) << __func__ << " drop " << p->second.seq
3000 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3001 << std::dec << dendl;
3002 i->second -= p->second.bl.length();
3003 }
3004 p = iomap.erase(p);
3005 }
3006 }
3007
3008 void BlueStore::DeferredBatch::_audit(CephContext *cct)
3009 {
3010 map<uint64_t,int> sb;
3011 for (auto p : seq_bytes) {
3012 sb[p.first] = 0; // make sure we have the same set of keys
3013 }
3014 uint64_t pos = 0;
3015 for (auto& p : iomap) {
3016 assert(p.first >= pos);
3017 sb[p.second.seq] += p.second.bl.length();
3018 pos = p.first + p.second.bl.length();
3019 }
3020 assert(sb == seq_bytes);
3021 }
3022
3023
3024 // Collection
3025
3026 #undef dout_prefix
3027 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3028
3029 BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
3030 : store(ns),
3031 cache(c),
3032 cid(cid),
3033 lock("BlueStore::Collection::lock", true, false),
3034 exists(true),
3035 onode_map(c)
3036 {
3037 }
3038
3039 void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3040 {
3041 assert(!b->shared_blob);
3042 const bluestore_blob_t& blob = b->get_blob();
3043 if (!blob.is_shared()) {
3044 b->shared_blob = new SharedBlob(this);
3045 return;
3046 }
3047
3048 b->shared_blob = shared_blob_set.lookup(sbid);
3049 if (b->shared_blob) {
3050 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3051 << std::dec << " had " << *b->shared_blob << dendl;
3052 } else {
3053 b->shared_blob = new SharedBlob(sbid, this);
3054 shared_blob_set.add(this, b->shared_blob.get());
3055 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3056 << std::dec << " opened " << *b->shared_blob
3057 << dendl;
3058 }
3059 }
3060
3061 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3062 {
3063 if (!sb->is_loaded()) {
3064
3065 bufferlist v;
3066 string key;
3067 auto sbid = sb->get_sbid();
3068 get_shared_blob_key(sbid, &key);
3069 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3070 if (r < 0) {
3071 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3072 << std::dec << " not found at key "
3073 << pretty_binary_string(key) << dendl;
3074 assert(0 == "uh oh, missing shared_blob");
3075 }
3076
3077 sb->loaded = true;
3078 sb->persistent = new bluestore_shared_blob_t(sbid);
3079 bufferlist::iterator p = v.begin();
3080 ::decode(*(sb->persistent), p);
3081 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3082 << std::dec << " loaded shared_blob " << *sb << dendl;
3083 }
3084 }
3085
3086 void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3087 {
3088 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
3089 assert(!b->shared_blob->is_loaded());
3090
3091 // update blob
3092 bluestore_blob_t& blob = b->dirty_blob();
3093 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
3094
3095 // update shared blob
3096 b->shared_blob->loaded = true;
3097 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3098 shared_blob_set.add(this, b->shared_blob.get());
3099 for (auto p : blob.get_extents()) {
3100 if (p.is_valid()) {
3101 b->shared_blob->get_ref(
3102 p.offset,
3103 p.length);
3104 }
3105 }
3106 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3107 }
3108
3109 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3110 {
3111 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3112 assert(sb->is_loaded());
3113
3114 uint64_t sbid = sb->get_sbid();
3115 shared_blob_set.remove(sb);
3116 sb->loaded = false;
3117 delete sb->persistent;
3118 sb->sbid_unloaded = 0;
3119 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3120 return sbid;
3121 }
3122
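// Look up (or, with create, instantiate) the in-memory Onode for oid,
// decoding the onode key plus its spanning blobs from the KV store on a
// cache miss before registering it in the onode cache.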
3123 BlueStore::OnodeRef BlueStore::Collection::get_onode(
3124 const ghobject_t& oid,
3125 bool create)
3126 {
3127 assert(create ? lock.is_wlocked() : lock.is_locked());
3128
3129 spg_t pgid;
3130 if (cid.is_pg(&pgid)) {
3131 if (!oid.match(cnode.bits, pgid.ps())) {
3132 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3133 << pgid << " bits " << cnode.bits << dendl;
3134 ceph_abort();
3135 }
3136 }
3137
3138 OnodeRef o = onode_map.lookup(oid);
3139 if (o)
3140 return o;
3141
3142 mempool::bluestore_cache_other::string key;
3143 get_object_key(store->cct, oid, &key);
3144
3145 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3146 << pretty_binary_string(key) << dendl;
3147
3148 bufferlist v;
3149 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3150 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3151 Onode *on;
3152 if (v.length() == 0) {
3153 assert(r == -ENOENT);
3154 if (!store->cct->_conf->bluestore_debug_misc &&
3155 !create)
3156 return OnodeRef();
3157
3158 // new object, new onode
3159 on = new Onode(this, oid, key);
3160 } else {
3161 // loaded
3162 assert(r >= 0);
3163 on = new Onode(this, oid, key);
3164 on->exists = true;
3165 bufferptr::iterator p = v.front().begin_deep();
3166 on->onode.decode(p);
3167
3168 // initialize extent_map
3169 on->extent_map.decode_spanning_blobs(p);
3170 if (on->onode.extent_map_shards.empty()) {
3171 denc(on->extent_map.inline_bl, p);
3172 on->extent_map.decode_some(on->extent_map.inline_bl);
3173 } else {
3174 on->extent_map.init_shards(false, false);
3175 }
3176 }
3177 o.reset(on);
3178 return onode_map.add(oid, o);
3179 }
3180
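// Move onodes (and their shared blobs and cached buffers) that now belong
// to the child collection dest out of this collection's cache structures.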
3181 void BlueStore::Collection::split_cache(
3182 Collection *dest)
3183 {
3184 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3185
3186 // lock (one or both) cache shards
3187 std::lock(cache->lock, dest->cache->lock);
3188 std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
3189 std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);
3190
3191 int destbits = dest->cnode.bits;
3192 spg_t destpg;
3193 bool is_pg = dest->cid.is_pg(&destpg);
3194 assert(is_pg);
3195
3196 auto p = onode_map.onode_map.begin();
3197 while (p != onode_map.onode_map.end()) {
3198 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3199 // onode does not belong to this child
3200 ++p;
3201 } else {
3202 OnodeRef o = p->second;
3203 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3204 << dendl;
3205
3206 cache->_rm_onode(p->second);
3207 p = onode_map.onode_map.erase(p);
3208
3209 o->c = dest;
3210 dest->cache->_add_onode(o, 1);
3211 dest->onode_map.onode_map[o->oid] = o;
3212 dest->onode_map.cache = dest->cache;
3213
3214 // move over shared blobs and buffers. cover shared blobs from
3215 // both extent map and spanning blob map (the full extent map
3216 // may not be faulted in)
3217 vector<SharedBlob*> sbvec;
3218 for (auto& e : o->extent_map.extent_map) {
3219 sbvec.push_back(e.blob->shared_blob.get());
3220 }
3221 for (auto& b : o->extent_map.spanning_blob_map) {
3222 sbvec.push_back(b.second->shared_blob.get());
3223 }
3224 for (auto sb : sbvec) {
3225 if (sb->coll == dest) {
3226 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3227 << dendl;
3228 continue;
3229 }
3230 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
3231 sb->coll = dest;
3232 if (sb->get_sbid()) {
3233 ldout(store->cct, 20) << __func__
3234 << " moving registration " << *sb << dendl;
3235 shared_blob_set.remove(sb);
3236 dest->shared_blob_set.add(dest, sb);
3237 }
3238 if (dest->cache != cache) {
3239 for (auto& i : sb->bc.buffer_map) {
3240 if (!i.second->is_writing()) {
3241 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3242 << dendl;
3243 dest->cache->_move_buffer(cache, i.second.get());
3244 }
3245 }
3246 }
3247 }
3248 }
3249 }
3250 }
3251
3252 // =======================================================
3253
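// Periodically estimate metadata bytes per onode from the mempool stats and
// trim each cache shard toward its share of bluestore_cache_size.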
3254 void *BlueStore::MempoolThread::entry()
3255 {
3256 Mutex::Locker l(lock);
3257 while (!stop) {
3258 uint64_t meta_bytes =
3259 mempool::bluestore_cache_other::allocated_bytes() +
3260 mempool::bluestore_cache_onode::allocated_bytes();
3261 uint64_t onode_num =
3262 mempool::bluestore_cache_onode::allocated_items();
3263
3264 if (onode_num < 2) {
3265 onode_num = 2;
3266 }
3267
3268 float bytes_per_onode = (float)meta_bytes / (float)onode_num;
3269 size_t num_shards = store->cache_shards.size();
3270 float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
3271 // A little sloppy but should be close enough
3272 uint64_t shard_target = target_ratio * (store->cct->_conf->bluestore_cache_size / num_shards);
3273
3274 for (auto i : store->cache_shards) {
3275 i->trim(shard_target,
3276 store->cache_meta_ratio,
3277 store->cache_data_ratio,
3278 bytes_per_onode);
3279 }
3280
3281 store->_update_cache_logger();
3282
3283 utime_t wait;
3284 wait += store->cct->_conf->bluestore_cache_trim_interval;
3285 cond.WaitInterval(lock, wait);
3286 }
3287 stop = false;
3288 return NULL;
3289 }
3290
3291 // =======================================================
3292
3293 // OmapIteratorImpl
3294
3295 #undef dout_prefix
3296 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3297
3298 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3299 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3300 : c(c), o(o), it(it)
3301 {
3302 RWLock::RLocker l(c->lock);
3303 if (o->onode.has_omap()) {
3304 get_omap_key(o->onode.nid, string(), &head);
3305 get_omap_tail(o->onode.nid, &tail);
3306 it->lower_bound(head);
3307 }
3308 }
3309
3310 int BlueStore::OmapIteratorImpl::seek_to_first()
3311 {
3312 RWLock::RLocker l(c->lock);
3313 if (o->onode.has_omap()) {
3314 it->lower_bound(head);
3315 } else {
3316 it = KeyValueDB::Iterator();
3317 }
3318 return 0;
3319 }
3320
3321 int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3322 {
3323 RWLock::RLocker l(c->lock);
3324 if (o->onode.has_omap()) {
3325 string key;
3326 get_omap_key(o->onode.nid, after, &key);
3327 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3328 << pretty_binary_string(key) << dendl;
3329 it->upper_bound(key);
3330 } else {
3331 it = KeyValueDB::Iterator();
3332 }
3333 return 0;
3334 }
3335
3336 int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3337 {
3338 RWLock::RLocker l(c->lock);
3339 if (o->onode.has_omap()) {
3340 string key;
3341 get_omap_key(o->onode.nid, to, &key);
3342 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3343 << pretty_binary_string(key) << dendl;
3344 it->lower_bound(key);
3345 } else {
3346 it = KeyValueDB::Iterator();
3347 }
3348 return 0;
3349 }
3350
3351 bool BlueStore::OmapIteratorImpl::valid()
3352 {
3353 RWLock::RLocker l(c->lock);
3354 bool r = o->onode.has_omap() && it && it->valid() &&
3355 it->raw_key().second <= tail;
3356 if (it && it->valid()) {
3357 ldout(c->store->cct,20) << __func__ << " is at "
3358 << pretty_binary_string(it->raw_key().second)
3359 << dendl;
3360 }
3361 return r;
3362 }
3363
3364 int BlueStore::OmapIteratorImpl::next(bool validate)
3365 {
3366 RWLock::RLocker l(c->lock);
3367 if (o->onode.has_omap()) {
3368 it->next();
3369 return 0;
3370 } else {
3371 return -1;
3372 }
3373 }
3374
3375 string BlueStore::OmapIteratorImpl::key()
3376 {
3377 RWLock::RLocker l(c->lock);
3378 assert(it->valid());
3379 string db_key = it->raw_key().second;
3380 string user_key;
3381 decode_omap_key(db_key, &user_key);
3382 return user_key;
3383 }
3384
3385 bufferlist BlueStore::OmapIteratorImpl::value()
3386 {
3387 RWLock::RLocker l(c->lock);
3388 assert(it->valid());
3389 return it->value();
3390 }
3391
3392
3393 // =====================================
3394
3395 #undef dout_prefix
3396 #define dout_prefix *_dout << "bluestore(" << path << ") "
3397
3398
3399 static void aio_cb(void *priv, void *priv2)
3400 {
3401 BlueStore *store = static_cast<BlueStore*>(priv);
3402 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3403 c->aio_finish(store);
3404 }
3405
3406 BlueStore::BlueStore(CephContext *cct, const string& path)
3407 : ObjectStore(cct, path),
3408 throttle_bytes(cct, "bluestore_throttle_bytes",
3409 cct->_conf->bluestore_throttle_bytes),
3410 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3411 cct->_conf->bluestore_throttle_bytes +
3412 cct->_conf->bluestore_throttle_deferred_bytes),
3413 kv_sync_thread(this),
3414 kv_finalize_thread(this),
3415 mempool_thread(this)
3416 {
3417 _init_logger();
3418 cct->_conf->add_observer(this);
3419 set_cache_shards(1);
3420 }
3421
3422 BlueStore::BlueStore(CephContext *cct,
3423 const string& path,
3424 uint64_t _min_alloc_size)
3425 : ObjectStore(cct, path),
3426 throttle_bytes(cct, "bluestore_throttle_bytes",
3427 cct->_conf->bluestore_throttle_bytes),
3428 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3429 cct->_conf->bluestore_throttle_bytes +
3430 cct->_conf->bluestore_throttle_deferred_bytes),
3431 kv_sync_thread(this),
3432 kv_finalize_thread(this),
3433 min_alloc_size(_min_alloc_size),
3434 min_alloc_size_order(ctz(_min_alloc_size)),
3435 mempool_thread(this)
3436 {
3437 _init_logger();
3438 cct->_conf->add_observer(this);
3439 set_cache_shards(1);
3440
3441 if (cct->_conf->bluestore_shard_finishers) {
3442 m_finisher_num = cct->_conf->osd_op_num_shards;
3443 }
3444
3445 for (int i = 0; i < m_finisher_num; ++i) {
3446 ostringstream oss;
3447 oss << "finisher-" << i;
3448 Finisher *f = new Finisher(cct, oss.str(), "finisher");
3449 finishers.push_back(f);
3450 }
3451 }
3452
3453 BlueStore::~BlueStore()
3454 {
3455 for (auto f : finishers) {
3456 delete f;
3457 }
3458 finishers.clear();
3459
3460 cct->_conf->remove_observer(this);
3461 _shutdown_logger();
3462 assert(!mounted);
3463 assert(db == NULL);
3464 assert(bluefs == NULL);
3465 assert(fsid_fd < 0);
3466 assert(path_fd < 0);
3467 for (auto i : cache_shards) {
3468 delete i;
3469 }
3470 cache_shards.clear();
3471 }
3472
3473 const char **BlueStore::get_tracked_conf_keys() const
3474 {
3475 static const char* KEYS[] = {
3476 "bluestore_csum_type",
3477 "bluestore_compression_mode",
3478 "bluestore_compression_algorithm",
3479 "bluestore_compression_min_blob_size",
3480 "bluestore_compression_min_blob_size_ssd",
3481 "bluestore_compression_min_blob_size_hdd",
3482 "bluestore_compression_max_blob_size",
3483 "bluestore_compression_max_blob_size_ssd",
3484 "bluestore_compression_max_blob_size_hdd",
3485 "bluestore_max_alloc_size",
3486 "bluestore_prefer_deferred_size",
3487 "bluestore_deferred_batch_ops",
3488 "bluestore_deferred_batch_ops_hdd",
3489 "bluestore_deferred_batch_ops_ssd",
3490 "bluestore_throttle_bytes",
3491 "bluestore_throttle_deferred_bytes",
3492 "bluestore_throttle_cost_per_io_hdd",
3493 "bluestore_throttle_cost_per_io_ssd",
3494 "bluestore_throttle_cost_per_io",
3495 "bluestore_max_blob_size",
3496 "bluestore_max_blob_size_ssd",
3497 "bluestore_max_blob_size_hdd",
3498 NULL
3499 };
3500 return KEYS;
3501 }
3502
3503 void BlueStore::handle_conf_change(const struct md_config_t *conf,
3504 const std::set<std::string> &changed)
3505 {
3506 if (changed.count("bluestore_csum_type")) {
3507 _set_csum();
3508 }
3509 if (changed.count("bluestore_compression_mode") ||
3510 changed.count("bluestore_compression_algorithm") ||
3511 changed.count("bluestore_compression_min_blob_size") ||
3512 changed.count("bluestore_compression_max_blob_size")) {
3513 if (bdev) {
3514 _set_compression();
3515 }
3516 }
3517 if (changed.count("bluestore_max_blob_size") ||
3518 changed.count("bluestore_max_blob_size_ssd") ||
3519 changed.count("bluestore_max_blob_size_hdd")) {
3520 if (bdev) {
3521 // only after startup
3522 _set_blob_size();
3523 }
3524 }
3525 if (changed.count("bluestore_prefer_deferred_size") ||
3526 changed.count("bluestore_max_alloc_size") ||
3527 changed.count("bluestore_deferred_batch_ops") ||
3528 changed.count("bluestore_deferred_batch_ops_hdd") ||
3529 changed.count("bluestore_deferred_batch_ops_ssd")) {
3530 if (bdev) {
3531 // only after startup
3532 _set_alloc_sizes();
3533 }
3534 }
3535 if (changed.count("bluestore_throttle_cost_per_io") ||
3536 changed.count("bluestore_throttle_cost_per_io_hdd") ||
3537 changed.count("bluestore_throttle_cost_per_io_ssd")) {
3538 if (bdev) {
3539 _set_throttle_params();
3540 }
3541 }
3542 if (changed.count("bluestore_throttle_bytes")) {
3543 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
3544 throttle_deferred_bytes.reset_max(
3545 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3546 }
3547 if (changed.count("bluestore_throttle_deferred_bytes")) {
3548 throttle_deferred_bytes.reset_max(
3549 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3550 }
3551 }
3552
3553 void BlueStore::_set_compression()
3554 {
3555 if (cct->_conf->bluestore_compression_min_blob_size) {
3556 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
3557 } else {
3558 assert(bdev);
3559 if (bdev->is_rotational()) {
3560 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
3561 } else {
3562 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
3563 }
3564 }
3565
3566 if (cct->_conf->bluestore_compression_max_blob_size) {
3567 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
3568 } else {
3569 assert(bdev);
3570 if (bdev->is_rotational()) {
3571 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
3572 } else {
3573 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
3574 }
3575 }
3576
3577 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
3578 if (m) {
3579 comp_mode = *m;
3580 } else {
3581 derr << __func__ << " unrecognized value '"
3582 << cct->_conf->bluestore_compression_mode
3583 << "' for bluestore_compression_mode, reverting to 'none'"
3584 << dendl;
3585 comp_mode = Compressor::COMP_NONE;
3586 }
3587
3588 compressor = nullptr;
3589
3590 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
3591 if (!alg_name.empty()) {
3592 compressor = Compressor::create(cct, alg_name);
3593 if (!compressor) {
3594 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
3595 << dendl;
3596 }
3597 }
3598
3599 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
3600 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
3601 << dendl;
3602 }
3603
3604 void BlueStore::_set_csum()
3605 {
3606 csum_type = Checksummer::CSUM_NONE;
3607 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
3608 if (t > Checksummer::CSUM_NONE)
3609 csum_type = t;
3610
3611 dout(10) << __func__ << " csum_type "
3612 << Checksummer::get_csum_type_string(csum_type)
3613 << dendl;
3614 }
3615
3616 void BlueStore::_set_throttle_params()
3617 {
3618 if (cct->_conf->bluestore_throttle_cost_per_io) {
3619 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
3620 } else {
3621 assert(bdev);
3622 if (bdev->is_rotational()) {
3623 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
3624 } else {
3625 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
3626 }
3627 }
3628
3629 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
3630 << dendl;
3631 }
3632 void BlueStore::_set_blob_size()
3633 {
3634 if (cct->_conf->bluestore_max_blob_size) {
3635 max_blob_size = cct->_conf->bluestore_max_blob_size;
3636 } else {
3637 assert(bdev);
3638 if (bdev->is_rotational()) {
3639 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
3640 } else {
3641 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
3642 }
3643 }
3644 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
3645 << std::dec << dendl;
3646 }
3647
3648 int BlueStore::_set_cache_sizes()
3649 {
3650 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
3651 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
3652 cache_data_ratio =
3653 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
3654
3655 if (cache_meta_ratio <= 0 || cache_meta_ratio > 1.0) {
3656 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3657 << ") must be in range (0,1.0]" << dendl;
3658 return -EINVAL;
3659 }
3660 if (cache_kv_ratio <= 0 || cache_kv_ratio > 1.0) {
3661 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
3662 << ") must be in range (0,1.0]" << dendl;
3663 return -EINVAL;
3664 }
3665 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
3666 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3667 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
3668 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
3669 << dendl;
3670 return -EINVAL;
3671 }
3672 if (cache_data_ratio < 0) {
3673 // deal with floating point imprecision
3674 cache_data_ratio = 0;
3675 }
3676 dout(1) << __func__ << " meta " << cache_meta_ratio
3677 << " kv " << cache_kv_ratio
3678 << " data " << cache_data_ratio
3679 << dendl;
3680 return 0;
3681 }
3682
3683 void BlueStore::_init_logger()
3684 {
3685 PerfCountersBuilder b(cct, "bluestore",
3686 l_bluestore_first, l_bluestore_last);
3687 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
3688 "Average kv_thread flush latency",
3689 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
3690 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
3691 "Average kv_thread commit latency");
3692 b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
3693 "Average kv_thread sync latency",
3694 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
3695 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
3696 "Average prepare state latency");
3697 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
3698 "Average aio_wait state latency",
3699 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
3700 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
3701 "Average io_done state latency");
3702 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
3703 "Average kv_queued state latency");
3704 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
3705 "Average kv_committing state latency");
3706 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
3707 "Average kv_done state latency");
3708 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
3709 "Average deferred_queued state latency");
3710 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
3711 "Average deferred aio_wait state latency");
3712 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
3713 "Average cleanup state latency");
3714 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
3715 "Average finishing state latency");
3716 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
3717 "Average done state latency");
3718 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
3719 "Average submit throttle latency",
3720 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
3721 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
3722 "Average submit latency",
3723 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
3724 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
3725 "Average commit latency",
3726 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
3727 b.add_time_avg(l_bluestore_read_lat, "read_lat",
3728 "Average read latency",
3729 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
3730 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
3731 "Average read onode metadata latency");
3732 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
3733 "Average read latency");
3734 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
3735 "Average compress latency");
3736 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
3737 "Average decompress latency");
3738 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
3739 "Average checksum latency");
3740 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
3741 "Sum for beneficial compress ops");
3742 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
3743 "Sum for compress ops rejected due to low net gain of space");
3744 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
3745 "Sum for write-op padded bytes");
3746 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
3747 "Sum for deferred write op");
3748 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
3749 "Sum for deferred write bytes", "def");
3750 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
3751 "Sum for write penalty read ops");
3752 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
3753 "Sum for allocated bytes");
3754 b.add_u64(l_bluestore_stored, "bluestore_stored",
3755 "Sum for stored bytes");
3756 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
3757 "Sum for stored compressed bytes");
3758 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
3759 "Sum for bytes allocated for compressed data");
3760 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
3761 "Sum for original bytes that were compressed");
3762
3763 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
3764 "Number of onodes in cache");
3765 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
3766 "Sum for onode-lookups hit in the cache");
3767 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
3768 "Sum for onode-lookups missed in the cache");
3769 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
3770 "Sum for onode-shard lookups hit in the cache");
3771 b.add_u64_counter(l_bluestore_onode_shard_misses,
3772 "bluestore_onode_shard_misses",
3773 "Sum for onode-shard lookups missed in the cache");
3774 b.add_u64(l_bluestore_extents, "bluestore_extents",
3775 "Number of extents in cache");
3776 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
3777 "Number of blobs in cache");
3778 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
3779 "Number of buffers in cache");
3780 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
3781 "Number of buffer bytes in cache");
3782 b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
3783 "Sum for bytes of read hit in the cache");
3784 b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
3785 "Sum for bytes of read missed in the cache");
3786
3787 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
3788 "Large aligned writes into fresh blobs");
3789 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
3790 "Large aligned writes into fresh blobs (bytes)");
3791 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
3792 "Large aligned writes into fresh blobs (blobs)");
3793 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
3794 "Small writes into existing or sparse small blobs");
3795 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
3796 "Small writes into existing or sparse small blobs (bytes)");
3797 b.add_u64_counter(l_bluestore_write_small_unused,
3798 "bluestore_write_small_unused",
3799 "Small writes into unused portion of existing blob");
3800 b.add_u64_counter(l_bluestore_write_small_deferred,
3801 "bluestore_write_small_deferred",
3802 "Small overwrites using deferred");
3803 b.add_u64_counter(l_bluestore_write_small_pre_read,
3804 "bluestore_write_small_pre_read",
3805 "Small writes that required we read some data (possibly "
3806 "cached) to fill out the block");
3807 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
3808 "Small write into new (sparse) blob");
3809
3810 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
3811 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
3812 "Onode extent map reshard events");
3813 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
3814 "Sum for blob splitting due to resharding");
3815 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
3816 "Sum for extents that have been removed due to compression");
3817 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
3818 "Sum for extents that have been merged due to garbage "
3819 "collection");
3820 logger = b.create_perf_counters();
3821 cct->get_perfcounters_collection()->add(logger);
3822 }
3823
3824 int BlueStore::_reload_logger()
3825 {
3826 struct store_statfs_t store_statfs;
3827
3828 int r = statfs(&store_statfs);
3829 if (r >= 0) {
3830 logger->set(l_bluestore_allocated, store_statfs.allocated);
3831 logger->set(l_bluestore_stored, store_statfs.stored);
3832 logger->set(l_bluestore_compressed, store_statfs.compressed);
3833 logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
3834 logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
3835 }
3836 return r;
3837 }
3838
3839 void BlueStore::_shutdown_logger()
3840 {
3841 cct->get_perfcounters_collection()->remove(logger);
3842 delete logger;
3843 }
3844
3845 int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
3846 uuid_d *fsid)
3847 {
3848 bluestore_bdev_label_t label;
3849 int r = _read_bdev_label(cct, path, &label);
3850 if (r < 0)
3851 return r;
3852 *fsid = label.osd_uuid;
3853 return 0;
3854 }
3855
3856 int BlueStore::_open_path()
3857 {
3858 // initial sanity check
3859 int r = _set_cache_sizes();
3860 if (r < 0) {
3861 return r;
3862 }
3863
3864 assert(path_fd < 0);
3865 path_fd = ::open(path.c_str(), O_DIRECTORY);
3866 if (path_fd < 0) {
3867 int r = -errno;
3868 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
3869 << dendl;
3870 return r;
3871 }
3872 return 0;
3873 }
3874
3875 void BlueStore::_close_path()
3876 {
3877 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
3878 path_fd = -1;
3879 }
3880
3881 int BlueStore::_write_bdev_label(string path, bluestore_bdev_label_t label)
3882 {
3883 dout(10) << __func__ << " path " << path << " label " << label << dendl;
3884 bufferlist bl;
3885 ::encode(label, bl);
3886 uint32_t crc = bl.crc32c(-1);
3887 ::encode(crc, bl);
3888 assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
3889 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
3890 z.zero();
3891 bl.append(std::move(z));
3892
3893 int fd = ::open(path.c_str(), O_WRONLY);
3894 if (fd < 0) {
3895 fd = -errno;
3896 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
3897 << dendl;
3898 return fd;
3899 }
3900 int r = bl.write_fd(fd);
3901 if (r < 0) {
3902 derr << __func__ << " failed to write to " << path
3903 << ": " << cpp_strerror(r) << dendl;
3904 }
3905 VOID_TEMP_FAILURE_RETRY(::close(fd));
3906 return r;
3907 }
3908
3909 int BlueStore::_read_bdev_label(CephContext* cct, string path,
3910 bluestore_bdev_label_t *label)
3911 {
3912 dout(10) << __func__ << dendl;
3913 int fd = ::open(path.c_str(), O_RDONLY);
3914 if (fd < 0) {
3915 fd = -errno;
3916 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
3917 << dendl;
3918 return fd;
3919 }
3920 bufferlist bl;
3921 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
3922 VOID_TEMP_FAILURE_RETRY(::close(fd));
3923 if (r < 0) {
3924 derr << __func__ << " failed to read from " << path
3925 << ": " << cpp_strerror(r) << dendl;
3926 return r;
3927 }
3928
3929 uint32_t crc, expected_crc;
3930 bufferlist::iterator p = bl.begin();
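// the on-disk label is followed by a crc32c over the encoded label bytes
// only; recompute it over the bytes consumed by the label decode and compare
// against the stored value that follows.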
3931 try {
3932 ::decode(*label, p);
3933 bufferlist t;
3934 t.substr_of(bl, 0, p.get_off());
3935 crc = t.crc32c(-1);
3936 ::decode(expected_crc, p);
3937 }
3938 catch (buffer::error& e) {
3939 derr << __func__ << " unable to decode label at offset " << p.get_off()
3940 << ": " << e.what()
3941 << dendl;
3942 return -EINVAL;
3943 }
3944 if (crc != expected_crc) {
3945 derr << __func__ << " bad crc on label, expected " << expected_crc
3946 << " != actual " << crc << dendl;
3947 return -EIO;
3948 }
3949 dout(10) << __func__ << " got " << *label << dendl;
3950 return 0;
3951 }
3952
3953 int BlueStore::_check_or_set_bdev_label(
3954 string path, uint64_t size, string desc, bool create)
3955 {
3956 bluestore_bdev_label_t label;
3957 if (create) {
3958 label.osd_uuid = fsid;
3959 label.size = size;
3960 label.btime = ceph_clock_now();
3961 label.description = desc;
3962 int r = _write_bdev_label(path, label);
3963 if (r < 0)
3964 return r;
3965 } else {
3966 int r = _read_bdev_label(cct, path, &label);
3967 if (r < 0)
3968 return r;
3969 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
3970 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
3971 << " and fsid " << fsid << " check bypassed" << dendl;
3972 }
3973 else if (label.osd_uuid != fsid) {
3974 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
3975 << " does not match our fsid " << fsid << dendl;
3976 return -EIO;
3977 }
3978 }
3979 return 0;
3980 }
3981
3982 void BlueStore::_set_alloc_sizes(void)
3983 {
3984 min_alloc_size_order = ctz(min_alloc_size);
3985 assert(min_alloc_size == 1u << min_alloc_size_order);
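// min_alloc_size is required to be a power of two, so ctz() of it yields
// log2(min_alloc_size).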
3986
3987 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
3988
3989 if (cct->_conf->bluestore_prefer_deferred_size) {
3990 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
3991 } else {
3992 assert(bdev);
3993 if (bdev->is_rotational()) {
3994 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
3995 } else {
3996 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
3997 }
3998 }
3999
4000 if (cct->_conf->bluestore_deferred_batch_ops) {
4001 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4002 } else {
4003 assert(bdev);
4004 if (bdev->is_rotational()) {
4005 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4006 } else {
4007 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4008 }
4009 }
4010
4011 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
4012 << std::dec << " order " << min_alloc_size_order
4013 << " max_alloc_size 0x" << std::hex << max_alloc_size
4014 << " prefer_deferred_size 0x" << prefer_deferred_size
4015 << std::dec
4016 << " deferred_batch_ops " << deferred_batch_ops
4017 << dendl;
4018 }
4019
4020 int BlueStore::_open_bdev(bool create)
4021 {
4022 assert(bdev == NULL);
4023 string p = path + "/block";
4024 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
4025 int r = bdev->open(p);
4026 if (r < 0)
4027 goto fail;
4028
4029 if (bdev->supported_bdev_label()) {
4030 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4031 if (r < 0)
4032 goto fail_close;
4033 }
4034
4035 // initialize global block parameters
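// block_mask clears the sub-block bits, so (offset & block_mask) rounds an
// offset down to a block boundary; block_size_order is log2(block_size).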
4036 block_size = bdev->get_block_size();
4037 block_mask = ~(block_size - 1);
4038 block_size_order = ctz(block_size);
4039 assert(block_size == 1u << block_size_order);
4040 return 0;
4041
4042 fail_close:
4043 bdev->close();
4044 fail:
4045 delete bdev;
4046 bdev = NULL;
4047 return r;
4048 }
4049
4050 void BlueStore::_close_bdev()
4051 {
4052 assert(bdev);
4053 bdev->close();
4054 delete bdev;
4055 bdev = NULL;
4056 }
4057
4058 int BlueStore::_open_fm(bool create)
4059 {
4060 assert(fm == NULL);
4061 fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
4062
4063 if (create) {
4064 // initialize freespace
4065 dout(20) << __func__ << " initializing freespace" << dendl;
4066 KeyValueDB::Transaction t = db->get_transaction();
4067 {
4068 bufferlist bl;
4069 bl.append(freelist_type);
4070 t->set(PREFIX_SUPER, "freelist_type", bl);
4071 }
4072 fm->create(bdev->get_size(), t);
4073
4074 // allocate superblock reserved space. note that we do not mark
4075 // bluefs space as allocated in the freelist; we instead rely on
4076 // bluefs_extents.
4077 fm->allocate(0, SUPER_RESERVED, t);
4078
4079 uint64_t reserved = 0;
4080 if (cct->_conf->bluestore_bluefs) {
4081 assert(bluefs_extents.num_intervals() == 1);
4082 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
4083 reserved = p.get_start() + p.get_len();
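// i.e. reserve everything from the start of the device through the end of
// the (single) initial bluefs extent.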
4084 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4085 << " for bluefs" << dendl;
4086 bufferlist bl;
4087 ::encode(bluefs_extents, bl);
4088 t->set(PREFIX_SUPER, "bluefs_extents", bl);
4089 dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
4090 << std::dec << dendl;
4091 } else {
4092 reserved = SUPER_RESERVED;
4093 }
4094
4095 if (cct->_conf->bluestore_debug_prefill > 0) {
4096 uint64_t end = bdev->get_size() - reserved;
4097 dout(1) << __func__ << " pre-fragmenting freespace, using "
4098 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4099 << cct->_conf->bluestore_debug_prefragment_max << dendl;
4100 uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
4101 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4102 float r = cct->_conf->bluestore_debug_prefill;
4103 r /= 1.0 - r;
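// for a target used fraction f, follow each free run of length l with an
// allocated run u ~= l * f / (1 - f), so that used / (used + free)
// converges on f across the device.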
4104 bool stop = false;
4105
4106 while (!stop && start < end) {
4107 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4108 if (start + l > end) {
4109 l = end - start;
4110 l = P2ALIGN(l, min_alloc_size);
4111 }
4112 assert(start + l <= end);
4113
4114 uint64_t u = 1 + (uint64_t)(r * (double)l);
4115 u = P2ROUNDUP(u, min_alloc_size);
4116 if (start + l + u > end) {
4117 u = end - (start + l);
4118 // trim to align so we don't overflow again
4119 u = P2ALIGN(u, min_alloc_size);
4120 stop = true;
4121 }
4122 assert(start + l + u <= end);
4123
4124 dout(20) << " free 0x" << std::hex << start << "~" << l
4125 << " use 0x" << u << std::dec << dendl;
4126
4127 if (u == 0) {
4128 // break if u has been trimmed to nothing
4129 break;
4130 }
4131
4132 fm->allocate(start + l, u, t);
4133 start += l + u;
4134 }
4135 }
4136 db->submit_transaction_sync(t);
4137 }
4138
4139 int r = fm->init();
4140 if (r < 0) {
4141 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4142 delete fm;
4143 fm = NULL;
4144 return r;
4145 }
4146 return 0;
4147 }
4148
4149 void BlueStore::_close_fm()
4150 {
4151 dout(10) << __func__ << dendl;
4152 assert(fm);
4153 fm->shutdown();
4154 delete fm;
4155 fm = NULL;
4156 }
4157
4158 int BlueStore::_open_alloc()
4159 {
4160 assert(alloc == NULL);
4161 assert(bdev->get_size());
4162 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4163 bdev->get_size(),
4164 min_alloc_size);
4165 if (!alloc) {
4166 lderr(cct) << __func__ << " unknown allocator type "
4167 << cct->_conf->bluestore_allocator
4168 << dendl;
4169 return -EINVAL;
4170 }
4171
4172 uint64_t num = 0, bytes = 0;
4173
4174 dout(1) << __func__ << " opening allocation metadata" << dendl;
4175 // initialize from freelist
4176 fm->enumerate_reset();
4177 uint64_t offset, length;
4178 while (fm->enumerate_next(&offset, &length)) {
4179 alloc->init_add_free(offset, length);
4180 ++num;
4181 bytes += length;
4182 }
4183 dout(1) << __func__ << " loaded " << pretty_si_t(bytes)
4184 << " in " << num << " extents"
4185 << dendl;
4186
4187 // also mark bluefs space as allocated
4188 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4189 alloc->init_rm_free(e.get_start(), e.get_len());
4190 }
4191 dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
4192 << bluefs_extents << std::dec << " as allocated" << dendl;
4193
4194 return 0;
4195 }
4196
4197 void BlueStore::_close_alloc()
4198 {
4199 assert(alloc);
4200 alloc->shutdown();
4201 delete alloc;
4202 alloc = NULL;
4203 }
4204
4205 int BlueStore::_open_fsid(bool create)
4206 {
4207 assert(fsid_fd < 0);
4208 int flags = O_RDWR;
4209 if (create)
4210 flags |= O_CREAT;
4211 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4212 if (fsid_fd < 0) {
4213 int err = -errno;
4214 derr << __func__ << " " << cpp_strerror(err) << dendl;
4215 return err;
4216 }
4217 return 0;
4218 }
4219
4220 int BlueStore::_read_fsid(uuid_d *uuid)
4221 {
4222 char fsid_str[40];
4223 memset(fsid_str, 0, sizeof(fsid_str));
4224 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4225 if (ret < 0) {
4226 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4227 return ret;
4228 }
4229 if (ret > 36)
4230 fsid_str[36] = 0;
4231 else
4232 fsid_str[ret] = 0;
4233 if (!uuid->parse(fsid_str)) {
4234 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4235 return -EINVAL;
4236 }
4237 return 0;
4238 }
4239
4240 int BlueStore::_write_fsid()
4241 {
4242 int r = ::ftruncate(fsid_fd, 0);
4243 if (r < 0) {
4244 r = -errno;
4245 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4246 return r;
4247 }
4248 string str = stringify(fsid) + "\n";
4249 r = safe_write(fsid_fd, str.c_str(), str.length());
4250 if (r < 0) {
4251 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4252 return r;
4253 }
4254 r = ::fsync(fsid_fd);
4255 if (r < 0) {
4256 r = -errno;
4257 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4258 return r;
4259 }
4260 return 0;
4261 }
4262
4263 void BlueStore::_close_fsid()
4264 {
4265 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4266 fsid_fd = -1;
4267 }
4268
4269 int BlueStore::_lock_fsid()
4270 {
4271 struct flock l;
4272 memset(&l, 0, sizeof(l));
4273 l.l_type = F_WRLCK;
4274 l.l_whence = SEEK_SET;
4275 int r = ::fcntl(fsid_fd, F_SETLK, &l);
4276 if (r < 0) {
4277 int err = errno;
4278 derr << __func__ << " failed to lock " << path << "/fsid"
4279 << " (is another ceph-osd still running?)"
4280 << cpp_strerror(err) << dendl;
4281 return -err;
4282 }
4283 return 0;
4284 }
4285
4286 bool BlueStore::is_rotational()
4287 {
4288 if (bdev) {
4289 return bdev->is_rotational();
4290 }
4291
4292 bool rotational = true;
4293 int r = _open_path();
4294 if (r < 0)
4295 goto out;
4296 r = _open_fsid(false);
4297 if (r < 0)
4298 goto out_path;
4299 r = _read_fsid(&fsid);
4300 if (r < 0)
4301 goto out_fsid;
4302 r = _lock_fsid();
4303 if (r < 0)
4304 goto out_fsid;
4305 r = _open_bdev(false);
4306 if (r < 0)
4307 goto out_fsid;
4308 rotational = bdev->is_rotational();
4309 _close_bdev();
4310 out_fsid:
4311 _close_fsid();
4312 out_path:
4313 _close_path();
4314 out:
4315 return rotational;
4316 }
4317
4318 bool BlueStore::test_mount_in_use()
4319 {
4320 // most error conditions mean the mount is not in use (e.g., because
4321 // it doesn't exist). only if we fail to lock do we conclude it is
4322 // in use.
4323 bool ret = false;
4324 int r = _open_path();
4325 if (r < 0)
4326 return false;
4327 r = _open_fsid(false);
4328 if (r < 0)
4329 goto out_path;
4330 r = _lock_fsid();
4331 if (r < 0)
4332 ret = true; // if we can't lock, it is in use
4333 _close_fsid();
4334 out_path:
4335 _close_path();
4336 return ret;
4337 }
4338
4339 int BlueStore::_open_db(bool create)
4340 {
4341 int r;
4342 assert(!db);
4343 string fn = path + "/db";
4344 string options;
4345 stringstream err;
4346 ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
4347
4348 string kv_backend;
4349 if (create) {
4350 kv_backend = cct->_conf->bluestore_kvbackend;
4351 } else {
4352 r = read_meta("kv_backend", &kv_backend);
4353 if (r < 0) {
4354 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
4355 return -EIO;
4356 }
4357 }
4358 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
4359
4360 bool do_bluefs;
4361 if (create) {
4362 do_bluefs = cct->_conf->bluestore_bluefs;
4363 } else {
4364 string s;
4365 r = read_meta("bluefs", &s);
4366 if (r < 0) {
4367 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
4368 return -EIO;
4369 }
4370 if (s == "1") {
4371 do_bluefs = true;
4372 } else if (s == "0") {
4373 do_bluefs = false;
4374 } else {
4375 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
4376 << dendl;
4377 return -EIO;
4378 }
4379 }
4380 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
4381
4382 rocksdb::Env *env = NULL;
4383 if (do_bluefs) {
4384 dout(10) << __func__ << " initializing bluefs" << dendl;
4385 if (kv_backend != "rocksdb") {
4386 derr << __func__ << " backend must be rocksdb to use bluefs" << dendl;
4387 return -EINVAL;
4388 }
4389 bluefs = new BlueFS(cct);
4390
4391 string bfn;
4392 struct stat st;
4393
4394 bfn = path + "/block.db";
4395 if (::stat(bfn.c_str(), &st) == 0) {
4396 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
4397 if (r < 0) {
4398 derr << __func__ << " add block device(" << bfn << ") returned: "
4399 << cpp_strerror(r) << dendl;
4400 goto free_bluefs;
4401 }
4402
4403 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
4404 r = _check_or_set_bdev_label(
4405 bfn,
4406 bluefs->get_block_device_size(BlueFS::BDEV_DB),
4407 "bluefs db", create);
4408 if (r < 0) {
4409 derr << __func__
4410 << " check block device(" << bfn << ") label returned: "
4411 << cpp_strerror(r) << dendl;
4412 goto free_bluefs;
4413 }
4414 }
4415 if (create) {
4416 bluefs->add_block_extent(
4417 BlueFS::BDEV_DB,
4418 SUPER_RESERVED,
4419 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
4420 }
4421 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
4422 bluefs_single_shared_device = false;
4423 } else if (::lstat(bfn.c_str(), &st) == -1) {
4424 bluefs_shared_bdev = BlueFS::BDEV_DB;
4425 } else {
4426 // the symlink exists but its target does not: treat as an error
4427 derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
4428 r = -errno;
4429 goto free_bluefs;
4430 }
4431
4432 // shared device
4433 bfn = path + "/block";
4434 r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
4435 if (r < 0) {
4436 derr << __func__ << " add block device(" << bfn << ") returned: "
4437 << cpp_strerror(r) << dendl;
4438 goto free_bluefs;
4439 }
4440 if (create) {
4441 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4442 uint64_t initial =
4443 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
4444 cct->_conf->bluestore_bluefs_gift_ratio);
4445 initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
4446 // align to bluefs's alloc_size
4447 initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
4448 // put bluefs in the middle of the device in case it is an HDD
4449 uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
4450 cct->_conf->bluefs_alloc_size);
4451 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
4452 bluefs_extents.insert(start, initial);
4453 }
4454
4455 bfn = path + "/block.wal";
4456 if (::stat(bfn.c_str(), &st) == 0) {
4457 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
4458 if (r < 0) {
4459 derr << __func__ << " add block device(" << bfn << ") returned: "
4460 << cpp_strerror(r) << dendl;
4461 goto free_bluefs;
4462 }
4463
4464 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
4465 r = _check_or_set_bdev_label(
4466 bfn,
4467 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
4468 "bluefs wal", create);
4469 if (r < 0) {
4470 derr << __func__ << " check block device(" << bfn
4471 << ") label returned: " << cpp_strerror(r) << dendl;
4472 goto free_bluefs;
4473 }
4474 }
4475
4476 if (create) {
4477 bluefs->add_block_extent(
4478 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
4479 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
4480 BDEV_LABEL_BLOCK_SIZE);
4481 }
4482 cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
4483 bluefs_single_shared_device = false;
4484 } else if (::lstat(bfn.c_str(), &st) == -1) {
4485 cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
4486 } else {
4487 // the symlink exists but its target does not: treat as an error
4488 derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
4489 r = -errno;
4490 goto free_bluefs;
4491 }
4492
4493 if (create) {
4494 bluefs->mkfs(fsid);
4495 }
4496 r = bluefs->mount();
4497 if (r < 0) {
4498 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
4499 goto free_bluefs;
4500 }
4501 if (cct->_conf->bluestore_bluefs_env_mirror) {
4502 rocksdb::Env *a = new BlueRocksEnv(bluefs);
4503 rocksdb::Env *b = rocksdb::Env::Default();
4504 if (create) {
4505 string cmd = "rm -rf " + path + "/db " +
4506 path + "/db.slow " +
4507 path + "/db.wal";
4508 int r = system(cmd.c_str());
4509 (void)r;
4510 }
4511 env = new rocksdb::EnvMirror(b, a, false, true);
4512 } else {
4513 env = new BlueRocksEnv(bluefs);
4514
4515 // simplify the dir names, too, as "seen" by rocksdb
4516 fn = "db";
4517 }
4518
4519 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
4520 // we have both block.db and block; tell rocksdb!
4521 // note: the second (last) size value doesn't really matter
4522 ostringstream db_paths;
4523 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
4524 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
4525 db_paths << fn << ","
4526 << (uint64_t)(db_size * 95 / 100) << " "
4527 << fn + ".slow" << ","
4528 << (uint64_t)(slow_size * 95 / 100);
4529 cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
4530 dout(10) << __func__ << " set rocksdb_db_paths to "
4531 << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
4532 }
4533
4534 if (create) {
4535 env->CreateDir(fn);
4536 if (cct->_conf->rocksdb_separate_wal_dir)
4537 env->CreateDir(fn + ".wal");
4538 if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
4539 env->CreateDir(fn + ".slow");
4540 }
4541 } else if (create) {
4542 int r = ::mkdir(fn.c_str(), 0755);
4543 if (r < 0)
4544 r = -errno;
4545 if (r < 0 && r != -EEXIST) {
4546 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
4547 << dendl;
4548 return r;
4549 }
4550
4551 // wal_dir, too!
4552 if (cct->_conf->rocksdb_separate_wal_dir) {
4553 string walfn = path + "/db.wal";
4554 r = ::mkdir(walfn.c_str(), 0755);
4555 if (r < 0)
4556 r = -errno;
4557 if (r < 0 && r != -EEXIST) {
4558 derr << __func__ << " failed to create " << walfn
4559 << ": " << cpp_strerror(r)
4560 << dendl;
4561 return r;
4562 }
4563 }
4564 }
4565
4566 db = KeyValueDB::create(cct,
4567 kv_backend,
4568 fn,
4569 static_cast<void*>(env));
4570 if (!db) {
4571 derr << __func__ << " error creating db" << dendl;
4572 if (bluefs) {
4573 bluefs->umount();
4574 delete bluefs;
4575 bluefs = NULL;
4576 }
4577 // delete env manually here since we can't depend on db to do this
4578 // under this case
4579 delete env;
4580 env = NULL;
4581 return -EIO;
4582 }
4583
4584 FreelistManager::setup_merge_operators(db);
4585 db->set_merge_operator(PREFIX_STAT, merge_op);
4586
4587 db->set_cache_size(cct->_conf->bluestore_cache_size * cache_kv_ratio);
4588
4589 if (kv_backend == "rocksdb")
4590 options = cct->_conf->bluestore_rocksdb_options;
4591 db->init(options);
4592 if (create)
4593 r = db->create_and_open(err);
4594 else
4595 r = db->open(err);
4596 if (r) {
4597 derr << __func__ << " error opening db: " << err.str() << dendl;
4598 if (bluefs) {
4599 bluefs->umount();
4600 delete bluefs;
4601 bluefs = NULL;
4602 }
4603 delete db;
4604 db = NULL;
4605 return -EIO;
4606 }
4607 dout(1) << __func__ << " opened " << kv_backend
4608 << " path " << fn << " options " << options << dendl;
4609 return 0;
4610
4611 free_bluefs:
4612 assert(bluefs);
4613 delete bluefs;
4614 bluefs = NULL;
4615 return r;
4616 }
4617
4618 void BlueStore::_close_db()
4619 {
4620 assert(db);
4621 delete db;
4622 db = NULL;
4623 if (bluefs) {
4624 bluefs->umount();
4625 delete bluefs;
4626 bluefs = NULL;
4627 }
4628 }
4629
4630 int BlueStore::_reconcile_bluefs_freespace()
4631 {
4632 dout(10) << __func__ << dendl;
4633 interval_set<uint64_t> bset;
4634 int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
4635 assert(r == 0);
4636 if (bset == bluefs_extents) {
4637 dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
4638 << std::dec << dendl;
4639 return 0;
4640 }
4641 dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
4642 << dendl;
4643 dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
4644 << std::dec << dendl;
4645
4646 interval_set<uint64_t> overlap;
4647 overlap.intersection_of(bset, bluefs_extents);
4648
4649 bset.subtract(overlap);
4650 if (!bset.empty()) {
4651 derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
4652 << dendl;
4653 return -EIO;
4654 }
4655
4656 interval_set<uint64_t> super_extra;
4657 super_extra = bluefs_extents;
4658 super_extra.subtract(overlap);
4659 if (!super_extra.empty()) {
4660 // This is normal: it can happen if we commit to give extents to
4661 // bluefs and we crash before bluefs commits that it owns them.
4662 dout(10) << __func__ << " super extra " << super_extra << dendl;
4663 for (interval_set<uint64_t>::iterator p = super_extra.begin();
4664 p != super_extra.end();
4665 ++p) {
4666 bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
4667 }
4668 }
4669
4670 return 0;
4671 }
4672
4673 int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
4674 {
4675 int ret = 0;
4676 assert(bluefs);
4677
4678 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
4679 bluefs->get_usage(&bluefs_usage);
4680 assert(bluefs_usage.size() > bluefs_shared_bdev);
4681
4682 // fixme: look at primary bdev only for now
4683 uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
4684 uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
4685 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
4686
4687 uint64_t my_free = alloc->get_free();
4688 uint64_t total = bdev->get_size();
4689 float my_free_ratio = (float)my_free / (float)total;
4690
4691 uint64_t total_free = bluefs_free + my_free;
4692
4693 float bluefs_ratio = (float)bluefs_free / (float)total_free;
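// bluefs_ratio is bluefs's share of the combined free space; it is compared
// below against bluestore_bluefs_min_ratio / max_ratio to decide whether to
// gift space to bluefs or reclaim space from it.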
4694
4695 dout(10) << __func__
4696 << " bluefs " << pretty_si_t(bluefs_free)
4697 << " free (" << bluefs_free_ratio
4698 << ") bluestore " << pretty_si_t(my_free)
4699 << " free (" << my_free_ratio
4700 << "), bluefs_ratio " << bluefs_ratio
4701 << dendl;
4702
4703 uint64_t gift = 0;
4704 uint64_t reclaim = 0;
4705 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
4706 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
4707 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4708 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
4709 << ", should gift " << pretty_si_t(gift) << dendl;
4710 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
4711 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
4712 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
4713 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
4714 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4715 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
4716 << ", should reclaim " << pretty_si_t(reclaim) << dendl;
4717 }
4718 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
4719 cct->_conf->bluestore_bluefs_min <
4720 (uint64_t)(cct->_conf->bluestore_bluefs_max_ratio * total_free)) {
4721 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
4722 dout(10) << __func__ << " bluefs_total " << bluefs_total
4723 << " < min " << cct->_conf->bluestore_bluefs_min
4724 << ", should gift " << pretty_si_t(g) << dendl;
4725 if (g > gift)
4726 gift = g;
4727 reclaim = 0;
4728 }
4729
4730 if (gift) {
4731 // round up to alloc size
4732 gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
4733
4734 // hard cap to fit into 32 bits
4735 gift = MIN(gift, 1ull<<31);
4736 dout(10) << __func__ << " gifting " << gift
4737 << " (" << pretty_si_t(gift) << ")" << dendl;
4738
4739 // fixme: just do one allocation to start...
4740 int r = alloc->reserve(gift);
4741 assert(r == 0);
4742
4743 AllocExtentVector exts;
4744 int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
4745 0, 0, &exts);
4746
4747 if (alloc_len < (int64_t)gift) {
4748 derr << __func__ << " allocate failed on 0x" << std::hex << gift
4749 << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4750 alloc->dump();
4751 assert(0 == "allocate failed, wtf");
4752 return -ENOSPC;
4753 }
4754 for (auto& p : exts) {
4755 bluestore_pextent_t e = bluestore_pextent_t(p);
4756 dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
4757 extents->push_back(e);
4758 }
4759 gift = 0;
4760
4761 ret = 1;
4762 }
4763
4764 // reclaim from bluefs?
4765 if (reclaim) {
4766 // round up to alloc size
4767 reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);
4768
4769 // hard cap to fit into 32 bits
4770 reclaim = MIN(reclaim, 1ull<<31);
4771 dout(10) << __func__ << " reclaiming " << reclaim
4772 << " (" << pretty_si_t(reclaim) << ")" << dendl;
4773
4774 while (reclaim > 0) {
4775 // NOTE: this will block and do IO.
4776 AllocExtentVector extents;
4777 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
4778 &extents);
4779 if (r < 0) {
4780 derr << __func__ << " failed to reclaim space from bluefs"
4781 << dendl;
4782 break;
4783 }
4784 for (auto e : extents) {
4785 bluefs_extents.erase(e.offset, e.length);
4786 bluefs_extents_reclaiming.insert(e.offset, e.length);
4787 reclaim -= e.length;
4788 }
4789 }
4790
4791 ret = 1;
4792 }
4793
4794 return ret;
4795 }
4796
4797 void BlueStore::_commit_bluefs_freespace(
4798 const PExtentVector& bluefs_gift_extents)
4799 {
4800 dout(10) << __func__ << dendl;
4801 for (auto& p : bluefs_gift_extents) {
4802 bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
4803 }
4804 }
4805
4806 int BlueStore::_open_collections(int *errors)
4807 {
4808 assert(coll_map.empty());
4809 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
4810 for (it->upper_bound(string());
4811 it->valid();
4812 it->next()) {
4813 coll_t cid;
4814 if (cid.parse(it->key())) {
4815 CollectionRef c(
4816 new Collection(
4817 this,
4818 cache_shards[cid.hash_to_shard(cache_shards.size())],
4819 cid));
4820 bufferlist bl = it->value();
4821 bufferlist::iterator p = bl.begin();
4822 try {
4823 ::decode(c->cnode, p);
4824 } catch (buffer::error& e) {
4825 derr << __func__ << " failed to decode cnode, key:"
4826 << pretty_binary_string(it->key()) << dendl;
4827 return -EIO;
4828 }
4829 dout(20) << __func__ << " opened " << cid << " " << c << dendl;
4830 coll_map[cid] = c;
4831 } else {
4832 derr << __func__ << " unrecognized collection " << it->key() << dendl;
4833 if (errors)
4834 (*errors)++;
4835 }
4836 }
4837 return 0;
4838 }
4839
4840 void BlueStore::open_statfs()
4841 {
4842 bufferlist bl;
4843 int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
4844 if (r >= 0) {
4845 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
4846 auto it = bl.begin();
4847 vstatfs.decode(it);
4848 }
4849 else {
4850 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
4851 }
4852 }
4853 else {
4854 dout(10) << __func__ << " store_statfs missing, using empty" << dendl;
4855 }
4856 }
4857
4858 int BlueStore::_setup_block_symlink_or_file(
4859 string name,
4860 string epath,
4861 uint64_t size,
4862 bool create)
4863 {
4864 dout(20) << __func__ << " name " << name << " path " << epath
4865 << " size " << size << " create=" << (int)create << dendl;
4866 int r = 0;
4867 int flags = O_RDWR;
4868 if (create)
4869 flags |= O_CREAT;
4870 if (epath.length()) {
4871 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
4872 if (r < 0) {
4873 r = -errno;
4874 derr << __func__ << " failed to create " << name << " symlink to "
4875 << epath << ": " << cpp_strerror(r) << dendl;
4876 return r;
4877 }
4878
4879 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
4880 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
4881 if (fd < 0) {
4882 r = -errno;
4883 derr << __func__ << " failed to open " << epath << " file: "
4884 << cpp_strerror(r) << dendl;
4885 return r;
4886 }
4887 string serial_number = epath.substr(strlen(SPDK_PREFIX));
4888 r = ::write(fd, serial_number.c_str(), serial_number.size());
4889 assert(r == (int)serial_number.size());
4890 dout(1) << __func__ << " created " << name << " symlink to "
4891 << epath << dendl;
4892 VOID_TEMP_FAILURE_RETRY(::close(fd));
4893 }
4894 }
4895 if (size) {
4896 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
4897 if (fd >= 0) {
4898 // block file is present
4899 struct stat st;
4900 int r = ::fstat(fd, &st);
4901 if (r == 0 &&
4902 S_ISREG(st.st_mode) && // if it is a regular file
4903 st.st_size == 0) { // and is 0 bytes
4904 r = ::ftruncate(fd, size);
4905 if (r < 0) {
4906 r = -errno;
4907 derr << __func__ << " failed to resize " << name << " file to "
4908 << size << ": " << cpp_strerror(r) << dendl;
4909 VOID_TEMP_FAILURE_RETRY(::close(fd));
4910 return r;
4911 }
4912
4913 if (cct->_conf->bluestore_block_preallocate_file) {
4914 #ifdef HAVE_POSIX_FALLOCATE
4915 r = ::posix_fallocate(fd, 0, size);
4916 if (r) {
4917 derr << __func__ << " failed to preallocate " << name << " file to "
4918 << size << ": " << cpp_strerror(r) << dendl;
4919 VOID_TEMP_FAILURE_RETRY(::close(fd));
4920 return -r;
4921 }
4922 #else
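// no posix_fallocate available: force block allocation by writing the file
// out in 128 KiB chunks (the buffer contents are irrelevant here).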
4923 char data[1024*128];
4924 for (uint64_t off = 0; off < size; off += sizeof(data)) {
4925 if (off + sizeof(data) > size)
4926 r = ::write(fd, data, size - off);
4927 else
4928 r = ::write(fd, data, sizeof(data));
4929 if (r < 0) {
4930 r = -errno;
4931 derr << __func__ << " failed to preallocate (via write) " << name << " file to "
4932 << size << ": " << cpp_strerror(r) << dendl;
4933 VOID_TEMP_FAILURE_RETRY(::close(fd));
4934 return r;
4935 }
4936 }
4937 #endif
4938 }
4939 dout(1) << __func__ << " resized " << name << " file to "
4940 << pretty_si_t(size) << "B" << dendl;
4941 }
4942 VOID_TEMP_FAILURE_RETRY(::close(fd));
4943 } else {
4944 int r = -errno;
4945 if (r != -ENOENT) {
4946 derr << __func__ << " failed to open " << name << " file: "
4947 << cpp_strerror(r) << dendl;
4948 return r;
4949 }
4950 }
4951 }
4952 return 0;
4953 }
4954
4955 int BlueStore::mkfs()
4956 {
4957 dout(1) << __func__ << " path " << path << dendl;
4958 int r;
4959 uuid_d old_fsid;
4960
4961 {
4962 string done;
4963 r = read_meta("mkfs_done", &done);
4964 if (r == 0) {
4965 dout(1) << __func__ << " already created" << dendl;
4966 if (cct->_conf->bluestore_fsck_on_mkfs) {
4967 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
4968 if (r < 0) {
4969 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
4970 << dendl;
4971 return r;
4972 }
4973 if (r > 0) {
4974 derr << __func__ << " fsck found " << r << " errors" << dendl;
4975 r = -EIO;
4976 }
4977 }
4978 return r; // idempotent
4979 }
4980 }
4981
4982 {
4983 string type;
4984 r = read_meta("type", &type);
4985 if (r == 0) {
4986 if (type != "bluestore") {
4987 derr << __func__ << " expected bluestore, but type is " << type << dendl;
4988 return -EIO;
4989 }
4990 } else {
4991 r = write_meta("type", "bluestore");
4992 if (r < 0)
4993 return r;
4994 }
4995 }
4996
4997 freelist_type = "bitmap";
4998
4999 r = _open_path();
5000 if (r < 0)
5001 return r;
5002
5003 r = _open_fsid(true);
5004 if (r < 0)
5005 goto out_path_fd;
5006
5007 r = _lock_fsid();
5008 if (r < 0)
5009 goto out_close_fsid;
5010
5011 r = _read_fsid(&old_fsid);
5012 if (r < 0 || old_fsid.is_zero()) {
5013 if (fsid.is_zero()) {
5014 fsid.generate_random();
5015 dout(1) << __func__ << " generated fsid " << fsid << dendl;
5016 } else {
5017 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
5018 }
5019 // we'll write it later.
5020 } else {
5021 if (!fsid.is_zero() && fsid != old_fsid) {
5022 derr << __func__ << " on-disk fsid " << old_fsid
5023 << " != provided " << fsid << dendl;
5024 r = -EINVAL;
5025 goto out_close_fsid;
5026 }
5027 fsid = old_fsid;
5028 }
5029
5030 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
5031 cct->_conf->bluestore_block_size,
5032 cct->_conf->bluestore_block_create);
5033 if (r < 0)
5034 goto out_close_fsid;
5035 if (cct->_conf->bluestore_bluefs) {
5036 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
5037 cct->_conf->bluestore_block_wal_size,
5038 cct->_conf->bluestore_block_wal_create);
5039 if (r < 0)
5040 goto out_close_fsid;
5041 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
5042 cct->_conf->bluestore_block_db_size,
5043 cct->_conf->bluestore_block_db_create);
5044 if (r < 0)
5045 goto out_close_fsid;
5046 }
5047
5048 r = _open_bdev(true);
5049 if (r < 0)
5050 goto out_close_fsid;
5051
5052 r = _open_db(true);
5053 if (r < 0)
5054 goto out_close_bdev;
5055
5056 r = _open_fm(true);
5057 if (r < 0)
5058 goto out_close_db;
5059
5060 {
5061 KeyValueDB::Transaction t = db->get_transaction();
5062 {
5063 bufferlist bl;
5064 ::encode((uint64_t)0, bl);
5065 t->set(PREFIX_SUPER, "nid_max", bl);
5066 t->set(PREFIX_SUPER, "blobid_max", bl);
5067 }
5068
5069 // choose min_alloc_size
5070 if (cct->_conf->bluestore_min_alloc_size) {
5071 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
5072 } else {
5073 assert(bdev);
5074 if (bdev->is_rotational()) {
5075 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
5076 } else {
5077 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
5078 }
5079 }
5080 _set_alloc_sizes();
5081 {
5082 bufferlist bl;
5083 ::encode((uint64_t)min_alloc_size, bl);
5084 t->set(PREFIX_SUPER, "min_alloc_size", bl);
5085 }
5086
5087 ondisk_format = latest_ondisk_format;
5088 _prepare_ondisk_format_super(t);
5089 db->submit_transaction_sync(t);
5090 }
5091
5092 r = _open_alloc();
5093 if (r < 0)
5094 goto out_close_fm;
5095
5096 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
5097 if (r < 0)
5098 goto out_close_alloc;
5099 r = write_meta("bluefs", stringify((int)cct->_conf->bluestore_bluefs));
5100 if (r < 0)
5101 goto out_close_alloc;
5102
5103 if (fsid != old_fsid) {
5104 r = _write_fsid();
5105 if (r < 0) {
5106 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
5107 goto out_close_alloc;
5108 }
5109 }
5110
5111 out_close_alloc:
5112 _close_alloc();
5113 out_close_fm:
5114 _close_fm();
5115 out_close_db:
5116 _close_db();
5117 out_close_bdev:
5118 _close_bdev();
5119 out_close_fsid:
5120 _close_fsid();
5121 out_path_fd:
5122 _close_path();
5123
5124 if (r == 0 &&
5125 cct->_conf->bluestore_fsck_on_mkfs) {
5126 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5127 if (rc < 0)
5128 return rc;
5129 if (rc > 0) {
5130 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5131 r = -EIO;
5132 }
5133 }
5134
5135 if (r == 0) {
5136 // indicate success by writing the 'mkfs_done' file
5137 r = write_meta("mkfs_done", "yes");
5138 }
5139
5140 if (r < 0) {
5141 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
5142 } else {
5143 dout(0) << __func__ << " success" << dendl;
5144 }
5145 return r;
5146 }
5147
5148 void BlueStore::set_cache_shards(unsigned num)
5149 {
5150 dout(10) << __func__ << " " << num << dendl;
5151 size_t old = cache_shards.size();
5152 assert(num >= old);
5153 cache_shards.resize(num);
5154 for (unsigned i = old; i < num; ++i) {
5155 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
5156 logger);
5157 }
5158 }
5159
5160 int BlueStore::_mount(bool kv_only)
5161 {
5162 dout(1) << __func__ << " path " << path << dendl;
5163
5164 {
5165 string type;
5166 int r = read_meta("type", &type);
5167 if (r < 0) {
5168 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5169 << dendl;
5170 return r;
5171 }
5172
5173 if (type != "bluestore") {
5174 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5175 return -EIO;
5176 }
5177 }
5178
5179 if (cct->_conf->bluestore_fsck_on_mount) {
5180 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
5181 if (rc < 0)
5182 return rc;
5183 if (rc > 0) {
5184 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5185 return -EIO;
5186 }
5187 }
5188
5189 int r = _open_path();
5190 if (r < 0)
5191 return r;
5192 r = _open_fsid(false);
5193 if (r < 0)
5194 goto out_path;
5195
5196 r = _read_fsid(&fsid);
5197 if (r < 0)
5198 goto out_fsid;
5199
5200 r = _lock_fsid();
5201 if (r < 0)
5202 goto out_fsid;
5203
5204 r = _open_bdev(false);
5205 if (r < 0)
5206 goto out_fsid;
5207
5208 r = _open_db(false);
5209 if (r < 0)
5210 goto out_bdev;
5211
5212 if (kv_only)
5213 return 0;
5214
5215 r = _open_super_meta();
5216 if (r < 0)
5217 goto out_db;
5218
5219 r = _open_fm(false);
5220 if (r < 0)
5221 goto out_db;
5222
5223 r = _open_alloc();
5224 if (r < 0)
5225 goto out_fm;
5226
5227 r = _open_collections();
5228 if (r < 0)
5229 goto out_alloc;
5230
5231 r = _reload_logger();
5232 if (r < 0)
5233 goto out_coll;
5234
5235 if (bluefs) {
5236 r = _reconcile_bluefs_freespace();
5237 if (r < 0)
5238 goto out_coll;
5239 }
5240
5241 _kv_start();
5242
5243 r = _deferred_replay();
5244 if (r < 0)
5245 goto out_stop;
5246
5247 mempool_thread.init();
5248
5249
5250 mounted = true;
5251 return 0;
5252
5253 out_stop:
5254 _kv_stop();
5255 out_coll:
5256 _flush_cache();
5257 out_alloc:
5258 _close_alloc();
5259 out_fm:
5260 _close_fm();
5261 out_db:
5262 _close_db();
5263 out_bdev:
5264 _close_bdev();
5265 out_fsid:
5266 _close_fsid();
5267 out_path:
5268 _close_path();
5269 return r;
5270 }
5271
5272 int BlueStore::umount()
5273 {
5274 assert(mounted);
5275 dout(1) << __func__ << dendl;
5276
5277 _osr_drain_all();
5278 _osr_unregister_all();
5279
5280 mempool_thread.shutdown();
5281
5282 dout(20) << __func__ << " stopping kv thread" << dendl;
5283 _kv_stop();
5284 _reap_collections();
5285 _flush_cache();
5286 dout(20) << __func__ << " closing" << dendl;
5287
5288 mounted = false;
5289 _close_alloc();
5290 _close_fm();
5291 _close_db();
5292 _close_bdev();
5293 _close_fsid();
5294 _close_path();
5295
5296 if (cct->_conf->bluestore_fsck_on_umount) {
5297 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
5298 if (rc < 0)
5299 return rc;
5300 if (rc > 0) {
5301 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5302 return -EIO;
5303 }
5304 }
5305 return 0;
5306 }
5307
5308 static void apply(uint64_t off,
5309 uint64_t len,
5310 uint64_t granularity,
5311 BlueStore::mempool_dynamic_bitset &bitset,
5312 const char *what,
5313 std::function<void(uint64_t,
5314 BlueStore::mempool_dynamic_bitset &)> f) {
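// walk the byte range [off, off+len) in steps of 'granularity' and invoke f
// once for every bitset position the range touches.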
5315 auto end = ROUND_UP_TO(off + len, granularity);
5316 while (off < end) {
5317 uint64_t pos = off / granularity;
5318 f(pos, bitset);
5319 off += granularity;
5320 }
5321 }
5322
5323 int BlueStore::_fsck_check_extents(
5324 const ghobject_t& oid,
5325 const PExtentVector& extents,
5326 bool compressed,
5327 mempool_dynamic_bitset &used_blocks,
5328 store_statfs_t& expected_statfs)
5329 {
5330 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
5331 int errors = 0;
5332 for (auto e : extents) {
5333 if (!e.is_valid())
5334 continue;
5335 expected_statfs.allocated += e.length;
5336 if (compressed) {
5337 expected_statfs.compressed_allocated += e.length;
5338 }
5339 bool already = false;
5340 apply(
5341 e.offset, e.length, block_size, used_blocks, __func__,
5342 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5343 if (bs.test(pos))
5344 already = true;
5345 else
5346 bs.set(pos);
5347 });
5348 if (already) {
5349 derr << " " << oid << " extent " << e
5350 << " or a subset is already allocated" << dendl;
5351 ++errors;
5352 }
5353 if (e.end() > bdev->get_size()) {
5354 derr << " " << oid << " extent " << e
5355 << " past end of block device" << dendl;
5356 ++errors;
5357 }
5358 }
5359 return errors;
5360 }
5361
5362 int BlueStore::fsck(bool deep)
5363 {
5364 dout(1) << __func__ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
5365 int errors = 0;
5366
5367 typedef btree::btree_set<
5368 uint64_t,std::less<uint64_t>,
5369 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
5370 uint64_t_btree_t used_nids;
5371 uint64_t_btree_t used_omap_head;
5372 uint64_t_btree_t used_sbids;
5373
5374 mempool_dynamic_bitset used_blocks;
5375 KeyValueDB::Iterator it;
5376 store_statfs_t expected_statfs, actual_statfs;
5377 struct sb_info_t {
5378 list<ghobject_t> oids;
5379 SharedBlobRef sb;
5380 bluestore_extent_ref_map_t ref_map;
5381 bool compressed;
5382 };
5383 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
5384
5385 uint64_t num_objects = 0;
5386 uint64_t num_extents = 0;
5387 uint64_t num_blobs = 0;
5388 uint64_t num_spanning_blobs = 0;
5389 uint64_t num_shared_blobs = 0;
5390 uint64_t num_sharded_objects = 0;
5391 uint64_t num_object_shards = 0;
5392
5393 utime_t start = ceph_clock_now();
5394
5395 int r = _open_path();
5396 if (r < 0)
5397 return r;
5398 r = _open_fsid(false);
5399 if (r < 0)
5400 goto out_path;
5401
5402 r = _read_fsid(&fsid);
5403 if (r < 0)
5404 goto out_fsid;
5405
5406 r = _lock_fsid();
5407 if (r < 0)
5408 goto out_fsid;
5409
5410 r = _open_bdev(false);
5411 if (r < 0)
5412 goto out_fsid;
5413
5414 r = _open_db(false);
5415 if (r < 0)
5416 goto out_bdev;
5417
5418 r = _open_super_meta();
5419 if (r < 0)
5420 goto out_db;
5421
5422 r = _open_fm(false);
5423 if (r < 0)
5424 goto out_db;
5425
5426 r = _open_alloc();
5427 if (r < 0)
5428 goto out_fm;
5429
5430 r = _open_collections(&errors);
5431 if (r < 0)
5432 goto out_alloc;
5433
5434 mempool_thread.init();
5435
5436 // we need finishers and kv_{sync,finalize}_thread *just* for replay
5437 _kv_start();
5438 r = _deferred_replay();
5439 _kv_stop();
5440 if (r < 0)
5441 goto out_scan;
5442
5443 used_blocks.resize(bdev->get_size() / block_size);
5444 apply(
5445 0, SUPER_RESERVED, block_size, used_blocks, "0~SUPER_RESERVED",
5446 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5447 bs.set(pos);
5448 }
5449 );
5450
5451 if (bluefs) {
5452 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5453 apply(
5454 e.get_start(), e.get_len(), block_size, used_blocks, "bluefs",
5455 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5456 bs.set(pos);
5457 }
5458 );
5459 }
5460 r = bluefs->fsck();
5461 if (r < 0) {
5462 goto out_scan;
5463 }
5464 if (r > 0)
5465 errors += r;
5466 }
5467
5468 // get expected statfs; fill unaffected fields to be able to compare
5469 // structs
5470 statfs(&actual_statfs);
5471 expected_statfs.total = actual_statfs.total;
5472 expected_statfs.available = actual_statfs.available;
5473
5474 // walk PREFIX_OBJ
5475 dout(1) << __func__ << " walking object keyspace" << dendl;
5476 it = db->get_iterator(PREFIX_OBJ);
5477 if (it) {
5478 CollectionRef c;
5479 spg_t pgid;
5480 mempool::bluestore_fsck::list<string> expecting_shards;
5481 for (it->lower_bound(string()); it->valid(); it->next()) {
5482 if (g_conf->bluestore_debug_fsck_abort) {
5483 goto out_scan;
5484 }
5485 dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
5486 if (is_extent_shard_key(it->key())) {
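// extent shard keys sort immediately after their onode key, so the keys
// queued up in expecting_shards (while scanning that onode below) should be
// consumed here in order; anything missing, unexpected or out of order is
// an error.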
5487 while (!expecting_shards.empty() &&
5488 expecting_shards.front() < it->key()) {
5489 derr << __func__ << " error: missing shard key "
5490 << pretty_binary_string(expecting_shards.front())
5491 << dendl;
5492 ++errors;
5493 expecting_shards.pop_front();
5494 }
5495 if (!expecting_shards.empty() &&
5496 expecting_shards.front() == it->key()) {
5497 // all good
5498 expecting_shards.pop_front();
5499 continue;
5500 }
5501
5502 uint32_t offset;
5503 string okey;
5504 get_key_extent_shard(it->key(), &okey, &offset);
5505 derr << __func__ << " error: stray shard 0x" << std::hex << offset
5506 << std::dec << dendl;
5507 if (expecting_shards.empty()) {
5508 derr << __func__ << " error: " << pretty_binary_string(it->key())
5509 << " is unexpected" << dendl;
5510 ++errors;
5511 continue;
5512 }
5513 while (expecting_shards.front() > it->key()) {
5514 derr << __func__ << " error: saw " << pretty_binary_string(it->key())
5515 << dendl;
5516 derr << __func__ << " error: exp "
5517 << pretty_binary_string(expecting_shards.front()) << dendl;
5518 ++errors;
5519 expecting_shards.pop_front();
5520 if (expecting_shards.empty()) {
5521 break;
5522 }
5523 }
5524 continue;
5525 }
5526
5527 ghobject_t oid;
5528 int r = get_key_object(it->key(), &oid);
5529 if (r < 0) {
5530 derr << __func__ << " error: bad object key "
5531 << pretty_binary_string(it->key()) << dendl;
5532 ++errors;
5533 continue;
5534 }
5535 if (!c ||
5536 oid.shard_id != pgid.shard ||
5537 oid.hobj.pool != (int64_t)pgid.pool() ||
5538 !c->contains(oid)) {
5539 c = nullptr;
5540 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
5541 coll_map.begin();
5542 p != coll_map.end();
5543 ++p) {
5544 if (p->second->contains(oid)) {
5545 c = p->second;
5546 break;
5547 }
5548 }
5549 if (!c) {
5550 derr << __func__ << " error: stray object " << oid
5551 << " not owned by any collection" << dendl;
5552 ++errors;
5553 continue;
5554 }
5555 c->cid.is_pg(&pgid);
5556 dout(20) << __func__ << " collection " << c->cid << dendl;
5557 }
5558
5559 if (!expecting_shards.empty()) {
5560 for (auto &k : expecting_shards) {
5561 derr << __func__ << " error: missing shard key "
5562 << pretty_binary_string(k) << dendl;
5563 }
5564 ++errors;
5565 expecting_shards.clear();
5566 }
5567
5568 dout(10) << __func__ << " " << oid << dendl;
5569 RWLock::RLocker l(c->lock);
5570 OnodeRef o = c->get_onode(oid, false);
5571 if (o->onode.nid) {
5572 if (o->onode.nid > nid_max) {
5573 derr << __func__ << " error: " << oid << " nid " << o->onode.nid
5574 << " > nid_max " << nid_max << dendl;
5575 ++errors;
5576 }
5577 if (used_nids.count(o->onode.nid)) {
5578 derr << __func__ << " error: " << oid << " nid " << o->onode.nid
5579 << " already in use" << dendl;
5580 ++errors;
5581 continue; // go for next object
5582 }
5583 used_nids.insert(o->onode.nid);
5584 }
5585 ++num_objects;
5586 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
5587 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
5588 _dump_onode(o, 30);
5589 // shards
5590 if (!o->extent_map.shards.empty()) {
5591 ++num_sharded_objects;
5592 num_object_shards += o->extent_map.shards.size();
5593 }
5594 for (auto& s : o->extent_map.shards) {
5595 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
5596 expecting_shards.push_back(string());
5597 get_extent_shard_key(o->key, s.shard_info->offset,
5598 &expecting_shards.back());
5599 if (s.shard_info->offset >= o->onode.size) {
5600 derr << __func__ << " error: " << oid << " shard 0x" << std::hex
5601 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
5602 << std::dec << dendl;
5603 ++errors;
5604 }
5605 }
5606 // lextents
5607 map<BlobRef,bluestore_blob_t::unused_t> referenced;
5608 uint64_t pos = 0;
5609 mempool::bluestore_fsck::map<BlobRef,
5610 bluestore_blob_use_tracker_t> ref_map;
5611 for (auto& l : o->extent_map.extent_map) {
5612 dout(20) << __func__ << " " << l << dendl;
5613 if (l.logical_offset < pos) {
5614 derr << __func__ << " error: " << oid << " lextent at 0x"
5615 << std::hex << l.logical_offset
5616 << " overlaps with the previous, which ends at 0x" << pos
5617 << std::dec << dendl;
5618 ++errors;
5619 }
5620 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
5621 derr << __func__ << " error: " << oid << " lextent at 0x"
5622 << std::hex << l.logical_offset << "~" << l.length
5623 << " spans a shard boundary"
5624 << std::dec << dendl;
5625 ++errors;
5626 }
5627 pos = l.logical_offset + l.length;
5628 expected_statfs.stored += l.length;
5629 assert(l.blob);
5630 const bluestore_blob_t& blob = l.blob->get_blob();
5631
5632 auto& ref = ref_map[l.blob];
5633 if (ref.is_empty()) {
5634 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
5635 uint32_t l = blob.get_logical_length();
5636 ref.init(l, min_release_size);
5637 }
5638 ref.get(
5639 l.blob_offset,
5640 l.length);
5641 ++num_extents;
5642 if (blob.has_unused()) {
5643 auto p = referenced.find(l.blob);
5644 bluestore_blob_t::unused_t *pu;
5645 if (p == referenced.end()) {
5646 pu = &referenced[l.blob];
5647 } else {
5648 pu = &p->second;
5649 }
5650 uint64_t blob_len = blob.get_logical_length();
5651 assert((blob_len % (sizeof(*pu)*8)) == 0);
5652 assert(l.blob_offset + l.length <= blob_len);
5653 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
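// each bit of the unused bitmap covers chunk_size bytes of the blob; flag
// every bit this lextent touches as referenced.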
5654 uint64_t start = l.blob_offset / chunk_size;
5655 uint64_t end =
5656 ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
5657 for (auto i = start; i < end; ++i) {
5658 (*pu) |= (1u << i);
5659 }
5660 }
5661 }
5662 for (auto &i : referenced) {
5663 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
5664 << std::dec << " for " << *i.first << dendl;
5665 const bluestore_blob_t& blob = i.first->get_blob();
5666 if (i.second & blob.unused) {
5667 derr << __func__ << " error: " << oid << " blob claims unused 0x"
5668 << std::hex << blob.unused
5669 << " but extents reference 0x" << i.second
5670 << " on blob " << *i.first << dendl;
5671 ++errors;
5672 }
5673 if (blob.has_csum()) {
5674 uint64_t blob_len = blob.get_logical_length();
5675 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
5676 unsigned csum_count = blob.get_csum_count();
5677 unsigned csum_chunk_size = blob.get_csum_chunk_size();
5678 for (unsigned p = 0; p < csum_count; ++p) {
5679 unsigned pos = p * csum_chunk_size;
5680 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
5681 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
5682 unsigned mask = 1u << firstbit;
5683 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
5684 mask |= 1u << b;
5685 }
5686 if ((blob.unused & mask) == mask) {
5687 // this csum chunk region is marked unused
5688 if (blob.get_csum_item(p) != 0) {
5689 derr << __func__ << " error: " << oid
5690 << " blob claims csum chunk 0x" << std::hex << pos
5691 << "~" << csum_chunk_size
5692 << " is unused (mask 0x" << mask << " of unused 0x"
5693 << blob.unused << ") but csum is non-zero 0x"
5694 << blob.get_csum_item(p) << std::dec << " on blob "
5695 << *i.first << dendl;
5696 ++errors;
5697 }
5698 }
5699 }
5700 }
5701 }
5702 for (auto &i : ref_map) {
5703 ++num_blobs;
5704 const bluestore_blob_t& blob = i.first->get_blob();
5705 bool equal = i.first->get_blob_use_tracker().equal(i.second);
5706 if (!equal) {
5707 derr << __func__ << " error: " << oid << " blob " << *i.first
5708 << " doesn't match expected ref_map " << i.second << dendl;
5709 ++errors;
5710 }
5711 if (blob.is_compressed()) {
5712 expected_statfs.compressed += blob.get_compressed_payload_length();
5713 expected_statfs.compressed_original +=
5714 i.first->get_referenced_bytes();
5715 }
5716 if (blob.is_shared()) {
5717 if (i.first->shared_blob->get_sbid() > blobid_max) {
5718 derr << __func__ << " error: " << oid << " blob " << blob
5719 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
5720 << blobid_max << dendl;
5721 ++errors;
5722 } else if (i.first->shared_blob->get_sbid() == 0) {
5723 derr << __func__ << " error: " << oid << " blob " << blob
5724 << " marked as shared but has uninitialized sbid"
5725 << dendl;
5726 ++errors;
5727 }
5728 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
5729 sbi.sb = i.first->shared_blob;
5730 sbi.oids.push_back(oid);
5731 sbi.compressed = blob.is_compressed();
5732 for (auto e : blob.get_extents()) {
5733 if (e.is_valid()) {
5734 sbi.ref_map.get(e.offset, e.length);
5735 }
5736 }
5737 } else {
5738 errors += _fsck_check_extents(oid, blob.get_extents(),
5739 blob.is_compressed(),
5740 used_blocks,
5741 expected_statfs);
5742 }
5743 }
5744 if (deep) {
5745 bufferlist bl;
5746 int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
5747 if (r < 0) {
5748 ++errors;
5749 derr << __func__ << " error: " << oid << " error during read: "
5750 << cpp_strerror(r) << dendl;
5751 }
5752 }
5753 // omap
5754 if (o->onode.has_omap()) {
5755 if (used_omap_head.count(o->onode.nid)) {
5756 derr << __func__ << " error: " << oid << " omap_head " << o->onode.nid
5757 << " already in use" << dendl;
5758 ++errors;
5759 } else {
5760 used_omap_head.insert(o->onode.nid);
5761 }
5762 }
5763 }
5764 }
5765 dout(1) << __func__ << " checking shared_blobs" << dendl;
5766 it = db->get_iterator(PREFIX_SHARED_BLOB);
5767 if (it) {
5768 for (it->lower_bound(string()); it->valid(); it->next()) {
5769 string key = it->key();
5770 uint64_t sbid;
5771 if (get_key_shared_blob(key, &sbid)) {
5772 derr << __func__ << " error: bad key '" << key
5773 << "' in shared blob namespace" << dendl;
5774 ++errors;
5775 continue;
5776 }
5777 auto p = sb_info.find(sbid);
5778 if (p == sb_info.end()) {
5779 derr << __func__ << " error: found stray shared blob data for sbid 0x"
5780 << std::hex << sbid << std::dec << dendl;
5781 ++errors;
5782 } else {
5783 ++num_shared_blobs;
5784 sb_info_t& sbi = p->second;
5785 bluestore_shared_blob_t shared_blob(sbid);
5786 bufferlist bl = it->value();
5787 bufferlist::iterator blp = bl.begin();
5788 ::decode(shared_blob, blp);
5789 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
5790 if (shared_blob.ref_map != sbi.ref_map) {
5791 derr << __func__ << " error: shared blob 0x" << std::hex << sbid
5792 << std::dec << " ref_map " << shared_blob.ref_map
5793 << " != expected " << sbi.ref_map << dendl;
5794 ++errors;
5795 }
5796 PExtentVector extents;
5797 for (auto &r : shared_blob.ref_map.ref_map) {
5798 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
5799 }
5800 errors += _fsck_check_extents(p->second.oids.front(),
5801 extents,
5802 p->second.compressed,
5803 used_blocks, expected_statfs);
5804 sb_info.erase(p);
5805 }
5806 }
5807 }
5808 for (auto &p : sb_info) {
5809 derr << __func__ << " error: shared_blob 0x" << std::hex << p.first
5810 << std::dec << " key is missing (" << *p.second.sb << ")" << dendl;
5811 ++errors;
5812 }
5813 if (!(actual_statfs == expected_statfs)) {
5814 derr << __func__ << " error: actual " << actual_statfs
5815 << " != expected " << expected_statfs << dendl;
5816 ++errors;
5817 }
5818
5819 dout(1) << __func__ << " checking for stray omap data" << dendl;
5820 it = db->get_iterator(PREFIX_OMAP);
5821 if (it) {
5822 for (it->lower_bound(string()); it->valid(); it->next()) {
5823 uint64_t omap_head;
5824 _key_decode_u64(it->key().c_str(), &omap_head);
5825 if (used_omap_head.count(omap_head) == 0) {
5826 derr << __func__ << " error: found stray omap data on omap_head "
5827 << omap_head << dendl;
5828 ++errors;
5829 }
5830 }
5831 }
5832
5833 dout(1) << __func__ << " checking deferred events" << dendl;
5834 it = db->get_iterator(PREFIX_DEFERRED);
5835 if (it) {
5836 for (it->lower_bound(string()); it->valid(); it->next()) {
5837 bufferlist bl = it->value();
5838 bufferlist::iterator p = bl.begin();
5839 bluestore_deferred_transaction_t wt;
5840 try {
5841 ::decode(wt, p);
5842 } catch (buffer::error& e) {
5843 derr << __func__ << " error: failed to decode deferred txn "
5844 << pretty_binary_string(it->key()) << dendl;
5845 r = -EIO;
5846 goto out_scan;
5847 }
5848 dout(20) << __func__ << " deferred " << wt.seq
5849 << " ops " << wt.ops.size()
5850 << " released 0x" << std::hex << wt.released << std::dec << dendl;
5851 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
5852 apply(
5853 e.get_start(), e.get_len(), block_size, used_blocks, "deferred",
5854 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5855 bs.set(pos);
5856 }
5857 );
5858 }
5859 }
5860 }
5861
5862 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
5863 {
5864 // remove bluefs_extents from used set since the freelist doesn't
5865 // know they are allocated.
5866 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5867 apply(
5868 e.get_start(), e.get_len(), block_size, used_blocks, "bluefs_extents",
5869 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5870 bs.reset(pos);
5871 }
5872 );
5873 }
5874 fm->enumerate_reset();
5875 uint64_t offset, length;
5876 while (fm->enumerate_next(&offset, &length)) {
5877 bool intersects = false;
5878 apply(
5879 offset, length, block_size, used_blocks, "free",
5880 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5881 if (bs.test(pos)) {
5882 intersects = true;
5883 } else {
5884 bs.set(pos);
5885 }
5886 }
5887 );
5888 if (intersects) {
5889 derr << __func__ << " error: free extent 0x" << std::hex << offset
5890 << "~" << length << std::dec
5891 << " intersects allocated blocks" << dendl;
5892 ++errors;
5893 }
5894 }
5895 size_t count = used_blocks.count();
5896 if (used_blocks.size() != count) {
5897 assert(used_blocks.size() > count);
5898 derr << __func__ << " error: leaked some space; "
5899 << (used_blocks.size() - count) * min_alloc_size
5900 << " bytes leaked" << dendl;
5901 ++errors;
5902 }
5903 }
5904
5905 out_scan:
5906 mempool_thread.shutdown();
5907 _flush_cache();
5908 out_alloc:
5909 _close_alloc();
5910 out_fm:
5911 _close_fm();
5912 out_db:
5913 it.reset(); // before db is closed
5914 _close_db();
5915 out_bdev:
5916 _close_bdev();
5917 out_fsid:
5918 _close_fsid();
5919 out_path:
5920 _close_path();
5921
5922 // fatal errors take precedence
5923 if (r < 0)
5924 return r;
5925
5926 dout(2) << __func__ << " " << num_objects << " objects, "
5927 << num_sharded_objects << " of them sharded. "
5928 << dendl;
5929 dout(2) << __func__ << " " << num_extents << " extents to "
5930 << num_blobs << " blobs, "
5931 << num_spanning_blobs << " spanning, "
5932 << num_shared_blobs << " shared."
5933 << dendl;
5934
5935 utime_t duration = ceph_clock_now() - start;
5936 dout(1) << __func__ << " finish with " << errors << " errors in "
5937 << duration << " seconds" << dendl;
5938 return errors;
5939 }
5940
5941 void BlueStore::collect_metadata(map<string,string> *pm)
5942 {
5943 dout(10) << __func__ << dendl;
5944 bdev->collect_metadata("bluestore_bdev_", pm);
5945 if (bluefs) {
5946 (*pm)["bluefs"] = "1";
5947 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
5948 bluefs->collect_metadata(pm);
5949 } else {
5950 (*pm)["bluefs"] = "0";
5951 }
5952 }
5953
5954 int BlueStore::statfs(struct store_statfs_t *buf)
5955 {
5956 buf->reset();
5957 buf->total = bdev->get_size();
5958 buf->available = alloc->get_free();
5959
5960 if (bluefs) {
5961 // part of our shared device is "free" according to BlueFS
5962 // Don't include bluestore_bluefs_min because that space can't
5963 // be used for any other purpose.
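// Illustrative (hypothetical) numbers for the adjustment below: if
// BlueFS reports 10 GiB free on the shared device and
// bluestore_bluefs_min is 1 GiB, only 9 GiB is added to the reported
// available space.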
5964 buf->available += bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;
5965
5966 // include dedicated db, too, if that isn't the shared device.
5967 if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
5968 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
5969 }
5970 }
5971
5972 {
5973 std::lock_guard<std::mutex> l(vstatfs_lock);
5974
5975 buf->allocated = vstatfs.allocated();
5976 buf->stored = vstatfs.stored();
5977 buf->compressed = vstatfs.compressed();
5978 buf->compressed_original = vstatfs.compressed_original();
5979 buf->compressed_allocated = vstatfs.compressed_allocated();
5980 }
5981
5982 dout(20) << __func__ << " " << *buf << dendl;
5983 return 0;
5984 }
5985
5986 // ---------------
5987 // cache
5988
5989 BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
5990 {
5991 RWLock::RLocker l(coll_lock);
5992 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
5993 if (cp == coll_map.end())
5994 return CollectionRef();
5995 return cp->second;
5996 }
5997
5998 void BlueStore::_queue_reap_collection(CollectionRef& c)
5999 {
6000 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6001 std::lock_guard<std::mutex> l(reap_lock);
6002 removed_collections.push_back(c);
6003 }
6004
6005 void BlueStore::_reap_collections()
6006 {
6007 list<CollectionRef> removed_colls;
6008 {
6009 std::lock_guard<std::mutex> l(reap_lock);
6010 removed_colls.swap(removed_collections);
6011 }
6012
6013 bool all_reaped = true;
6014
6015 for (list<CollectionRef>::iterator p = removed_colls.begin();
6016 p != removed_colls.end();
6017 ++p) {
6018 CollectionRef c = *p;
6019 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6020 if (c->onode_map.map_any([&](OnodeRef o) {
6021 assert(!o->exists);
6022 if (o->flushing_count.load()) {
6023 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
6024 << " flush_txns " << o->flushing_count << dendl;
6025 return false;
6026 }
6027 return true;
6028 })) {
6029 all_reaped = false;
6030 continue;
6031 }
6032 c->onode_map.clear();
6033 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
6034 }
6035
6036 if (all_reaped) {
6037 dout(10) << __func__ << " all reaped" << dendl;
6038 }
6039 }
6040
6041 void BlueStore::_update_cache_logger()
6042 {
6043 uint64_t num_onodes = 0;
6044 uint64_t num_extents = 0;
6045 uint64_t num_blobs = 0;
6046 uint64_t num_buffers = 0;
6047 uint64_t num_buffer_bytes = 0;
6048 for (auto c : cache_shards) {
6049 c->add_stats(&num_onodes, &num_extents, &num_blobs,
6050 &num_buffers, &num_buffer_bytes);
6051 }
6052 logger->set(l_bluestore_onodes, num_onodes);
6053 logger->set(l_bluestore_extents, num_extents);
6054 logger->set(l_bluestore_blobs, num_blobs);
6055 logger->set(l_bluestore_buffers, num_buffers);
6056 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
6057 }
6058
6059 // ---------------
6060 // read operations
6061
6062 ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
6063 {
6064 return _get_collection(cid);
6065 }
6066
6067 bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
6068 {
6069 CollectionHandle c = _get_collection(cid);
6070 if (!c)
6071 return false;
6072 return exists(c, oid);
6073 }
6074
6075 bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
6076 {
6077 Collection *c = static_cast<Collection *>(c_.get());
6078 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
6079 if (!c->exists)
6080 return false;
6081
6082 bool r = true;
6083
6084 {
6085 RWLock::RLocker l(c->lock);
6086 OnodeRef o = c->get_onode(oid, false);
6087 if (!o || !o->exists)
6088 r = false;
6089 }
6090
6091 return r;
6092 }
6093
6094 int BlueStore::stat(
6095 const coll_t& cid,
6096 const ghobject_t& oid,
6097 struct stat *st,
6098 bool allow_eio)
6099 {
6100 CollectionHandle c = _get_collection(cid);
6101 if (!c)
6102 return -ENOENT;
6103 return stat(c, oid, st, allow_eio);
6104 }
6105
6106 int BlueStore::stat(
6107 CollectionHandle &c_,
6108 const ghobject_t& oid,
6109 struct stat *st,
6110 bool allow_eio)
6111 {
6112 Collection *c = static_cast<Collection *>(c_.get());
6113 if (!c->exists)
6114 return -ENOENT;
6115 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
6116
6117 {
6118 RWLock::RLocker l(c->lock);
6119 OnodeRef o = c->get_onode(oid, false);
6120 if (!o || !o->exists)
6121 return -ENOENT;
6122 st->st_size = o->onode.size;
6123 st->st_blksize = 4096;
6124 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
6125 st->st_nlink = 1;
6126 }
6127
6128 int r = 0;
6129 if (_debug_mdata_eio(oid)) {
6130 r = -EIO;
6131 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6132 }
6133 return r;
6134 }
6135 int BlueStore::set_collection_opts(
6136 const coll_t& cid,
6137 const pool_opts_t& opts)
6138 {
6139 CollectionHandle ch = _get_collection(cid);
6140 if (!ch)
6141 return -ENOENT;
6142 Collection *c = static_cast<Collection *>(ch.get());
6143 dout(15) << __func__ << " " << cid << " options " << opts << dendl;
6144 if (!c->exists)
6145 return -ENOENT;
6146 RWLock::WLocker l(c->lock);
6147 c->pool_opts = opts;
6148 return 0;
6149 }
6150
6151 int BlueStore::read(
6152 const coll_t& cid,
6153 const ghobject_t& oid,
6154 uint64_t offset,
6155 size_t length,
6156 bufferlist& bl,
6157 uint32_t op_flags,
6158 bool allow_eio)
6159 {
6160 CollectionHandle c = _get_collection(cid);
6161 if (!c)
6162 return -ENOENT;
6163 return read(c, oid, offset, length, bl, op_flags, allow_eio);
6164 }
6165
6166 int BlueStore::read(
6167 CollectionHandle &c_,
6168 const ghobject_t& oid,
6169 uint64_t offset,
6170 size_t length,
6171 bufferlist& bl,
6172 uint32_t op_flags,
6173 bool allow_eio)
6174 {
6175 utime_t start = ceph_clock_now();
6176 Collection *c = static_cast<Collection *>(c_.get());
6177 const coll_t &cid = c->get_cid();
6178 dout(15) << __func__ << " " << cid << " " << oid
6179 << " 0x" << std::hex << offset << "~" << length << std::dec
6180 << dendl;
6181 if (!c->exists)
6182 return -ENOENT;
6183
6184 bl.clear();
6185 int r;
6186 {
6187 RWLock::RLocker l(c->lock);
6188 utime_t start1 = ceph_clock_now();
6189 OnodeRef o = c->get_onode(oid, false);
6190 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
6191 if (!o || !o->exists) {
6192 r = -ENOENT;
6193 goto out;
6194 }
6195
6196 if (offset == length && offset == 0)
6197 length = o->onode.size;
6198
6199 r = _do_read(c, o, offset, length, bl, op_flags);
6200 }
6201
6202 out:
6203 assert(allow_eio || r != -EIO);
6204 if (r == 0 && _debug_data_eio(oid)) {
6205 r = -EIO;
6206 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6207 }
6208 dout(10) << __func__ << " " << cid << " " << oid
6209 << " 0x" << std::hex << offset << "~" << length << std::dec
6210 << " = " << r << dendl;
6211 logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
6212 return r;
6213 }
6214
6215 // --------------------------------------------------------
6216 // intermediate data structures used while reading
6217 struct region_t {
6218 uint64_t logical_offset;
6219 uint64_t blob_xoffset; // region offset within the blob
6220 uint64_t length;
6221 bufferlist bl;
6222
6223 // used later in read process
6224 uint64_t front = 0;
6225 uint64_t r_off = 0;
6226
6227 region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
6228 : logical_offset(offset),
6229 blob_xoffset(b_offs),
6230 length(len){}
6231 region_t(const region_t& from)
6232 : logical_offset(from.logical_offset),
6233 blob_xoffset(from.blob_xoffset),
6234 length(from.length){}
6235
6236 friend ostream& operator<<(ostream& out, const region_t& r) {
6237 return out << "0x" << std::hex << r.logical_offset << ":"
6238 << r.blob_xoffset << "~" << r.length << std::dec;
6239 }
6240 };
6241
6242 typedef list<region_t> regions2read_t;
6243 typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
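// Illustrative example (hypothetical blobs and offsets): a logical read
// of 0x0~0x8000 that spans two blobs might build a blobs2read_t of
// { blobA -> [ 0x0:0x0~0x4000 ], blobB -> [ 0x4000:0x0~0x4000 ] },
// i.e. each region_t records the logical offset, the offset within the
// blob, and the length, in the format printed by region_t's operator<<.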
6244
6245 int BlueStore::_do_read(
6246 Collection *c,
6247 OnodeRef o,
6248 uint64_t offset,
6249 size_t length,
6250 bufferlist& bl,
6251 uint32_t op_flags)
6252 {
6253 FUNCTRACE();
6254 int r = 0;
6255
6256 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6257 << " size 0x" << o->onode.size << " (" << std::dec
6258 << o->onode.size << ")" << dendl;
6259 bl.clear();
6260
6261 if (offset >= o->onode.size) {
6262 return r;
6263 }
6264
6265 // generally, don't buffer anything, unless the client explicitly requests
6266 // it.
6267 bool buffered = false;
6268 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
6269 dout(20) << __func__ << " will do buffered read" << dendl;
6270 buffered = true;
6271 } else if (cct->_conf->bluestore_default_buffered_read &&
6272 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
6273 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
6274 dout(20) << __func__ << " defaulting to buffered read" << dendl;
6275 buffered = true;
6276 }
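// Example of the policy above: a read flagged FADVISE_WILLNEED is
// always buffered; a read flagged FADVISE_DONTNEED or FADVISE_NOCACHE
// (without WILLNEED) is never buffered, even when
// bluestore_default_buffered_read is set.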
6277
6278 if (offset + length > o->onode.size) {
6279 length = o->onode.size - offset;
6280 }
6281
6282 utime_t start = ceph_clock_now();
6283 o->extent_map.fault_range(db, offset, length);
6284 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
6285 _dump_onode(o);
6286
6287 ready_regions_t ready_regions;
6288
6289 // build a blob-wise list of the data to read (that isn't cached)
6290 blobs2read_t blobs2read;
6291 unsigned left = length;
6292 uint64_t pos = offset;
6293 unsigned num_regions = 0;
6294 auto lp = o->extent_map.seek_lextent(offset);
6295 while (left > 0 && lp != o->extent_map.extent_map.end()) {
6296 if (pos < lp->logical_offset) {
6297 unsigned hole = lp->logical_offset - pos;
6298 if (hole >= left) {
6299 break;
6300 }
6301 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
6302 << std::dec << dendl;
6303 pos += hole;
6304 left -= hole;
6305 }
6306 BlobRef bptr = lp->blob;
6307 unsigned l_off = pos - lp->logical_offset;
6308 unsigned b_off = l_off + lp->blob_offset;
6309 unsigned b_len = std::min(left, lp->length - l_off);
6310
6311 ready_regions_t cache_res;
6312 interval_set<uint32_t> cache_interval;
6313 bptr->shared_blob->bc.read(
6314 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
6315 dout(20) << __func__ << " blob " << *bptr << std::hex
6316 << " need 0x" << b_off << "~" << b_len
6317 << " cache has 0x" << cache_interval
6318 << std::dec << dendl;
6319
6320 auto pc = cache_res.begin();
6321 while (b_len > 0) {
6322 unsigned l;
6323 if (pc != cache_res.end() &&
6324 pc->first == b_off) {
6325 l = pc->second.length();
6326 ready_regions[pos].claim(pc->second);
6327 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
6328 << b_off << "~" << l << std::dec << dendl;
6329 ++pc;
6330 } else {
6331 l = b_len;
6332 if (pc != cache_res.end()) {
6333 assert(pc->first > b_off);
6334 l = pc->first - b_off;
6335 }
6336 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
6337 << b_off << "~" << l << std::dec << dendl;
6338 blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
6339 ++num_regions;
6340 }
6341 pos += l;
6342 b_off += l;
6343 left -= l;
6344 b_len -= l;
6345 }
6346 ++lp;
6347 }
6348
6349 // read raw blob data. use aio if we have >1 blobs to read.
6350 start = ceph_clock_now(); // for simplicity, measure the whole
6351 // block below; the error this
6352 // introduces is small.
6353 vector<bufferlist> compressed_blob_bls;
6354 IOContext ioc(cct, NULL);
6355 for (auto& p : blobs2read) {
6356 BlobRef bptr = p.first;
6357 dout(20) << __func__ << " blob " << *bptr << std::hex
6358 << " need " << p.second << std::dec << dendl;
6359 if (bptr->get_blob().is_compressed()) {
6360 // read the whole thing
6361 if (compressed_blob_bls.empty()) {
6362 // ensure we avoid any reallocation on subsequent blobs
6363 compressed_blob_bls.reserve(blobs2read.size());
6364 }
6365 compressed_blob_bls.push_back(bufferlist());
6366 bufferlist& bl = compressed_blob_bls.back();
6367 r = bptr->get_blob().map(
6368 0, bptr->get_blob().get_ondisk_length(),
6369 [&](uint64_t offset, uint64_t length) {
6370 int r;
6371 // use aio if there are more regions to read than those in this blob
6372 if (num_regions > p.second.size()) {
6373 r = bdev->aio_read(offset, length, &bl, &ioc);
6374 } else {
6375 r = bdev->read(offset, length, &bl, &ioc, false);
6376 }
6377 if (r < 0)
6378 return r;
6379 return 0;
6380 });
6381 assert(r == 0);
6382 } else {
6383 // read the pieces
6384 for (auto& reg : p.second) {
6385 // determine how much of the blob to read
6386 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
6387 reg.r_off = reg.blob_xoffset;
6388 uint64_t r_len = reg.length;
6389 reg.front = reg.r_off % chunk_size;
6390 if (reg.front) {
6391 reg.r_off -= reg.front;
6392 r_len += reg.front;
6393 }
6394 unsigned tail = r_len % chunk_size;
6395 if (tail) {
6396 r_len += chunk_size - tail;
6397 }
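// Hypothetical worked example of the alignment above: with a 0x1000
// chunk size and a region at blob offset 0x1388~0x7d0, front becomes
// 0x388, r_off is rounded down to 0x1000, r_len grows to 0xb58 and is
// then padded by the tail to 0x1000; we read 0x1000~0x1000 and later
// trim "front" bytes off the front of the result.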
6398 dout(20) << __func__ << " region 0x" << std::hex
6399 << reg.logical_offset
6400 << ": 0x" << reg.blob_xoffset << "~" << reg.length
6401 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
6402 << dendl;
6403
6404 // read it
6405 r = bptr->get_blob().map(
6406 reg.r_off, r_len,
6407 [&](uint64_t offset, uint64_t length) {
6408 int r;
6409 // use aio if there is more than one region to read
6410 if (num_regions > 1) {
6411 r = bdev->aio_read(offset, length, &reg.bl, &ioc);
6412 } else {
6413 r = bdev->read(offset, length, &reg.bl, &ioc, false);
6414 }
6415 if (r < 0)
6416 return r;
6417 return 0;
6418 });
6419 assert(r == 0);
6420 assert(reg.bl.length() == r_len);
6421 }
6422 }
6423 }
6424 if (ioc.has_pending_aios()) {
6425 bdev->aio_submit(&ioc);
6426 dout(20) << __func__ << " waiting for aio" << dendl;
6427 ioc.aio_wait();
6428 }
6429 logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
6430
6431 // enumerate and decompress desired blobs
6432 auto p = compressed_blob_bls.begin();
6433 blobs2read_t::iterator b2r_it = blobs2read.begin();
6434 while (b2r_it != blobs2read.end()) {
6435 BlobRef bptr = b2r_it->first;
6436 dout(20) << __func__ << " blob " << *bptr << std::hex
6437 << " need 0x" << b2r_it->second << std::dec << dendl;
6438 if (bptr->get_blob().is_compressed()) {
6439 assert(p != compressed_blob_bls.end());
6440 bufferlist& compressed_bl = *p++;
6441 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
6442 b2r_it->second.front().logical_offset) < 0) {
6443 return -EIO;
6444 }
6445 bufferlist raw_bl;
6446 r = _decompress(compressed_bl, &raw_bl);
6447 if (r < 0)
6448 return r;
6449 if (buffered) {
6450 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
6451 raw_bl);
6452 }
6453 for (auto& i : b2r_it->second) {
6454 ready_regions[i.logical_offset].substr_of(
6455 raw_bl, i.blob_xoffset, i.length);
6456 }
6457 } else {
6458 for (auto& reg : b2r_it->second) {
6459 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
6460 reg.logical_offset) < 0) {
6461 return -EIO;
6462 }
6463 if (buffered) {
6464 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
6465 reg.r_off, reg.bl);
6466 }
6467
6468 // prune and keep result
6469 ready_regions[reg.logical_offset].substr_of(
6470 reg.bl, reg.front, reg.length);
6471 }
6472 }
6473 ++b2r_it;
6474 }
6475
6476 // generate a resulting buffer
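// Holes (ranges with no ready region) are filled with zeros below.
// Hypothetical example: reading 0x0~0x3000 when only 0x1000~0x1000 has
// data yields 0x1000 of zeros, the 0x1000 data segment, then another
// 0x1000 of zeros.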
6477 auto pr = ready_regions.begin();
6478 auto pr_end = ready_regions.end();
6479 pos = 0;
6480 while (pos < length) {
6481 if (pr != pr_end && pr->first == pos + offset) {
6482 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6483 << ": data from 0x" << pr->first << "~" << pr->second.length()
6484 << std::dec << dendl;
6485 pos += pr->second.length();
6486 bl.claim_append(pr->second);
6487 ++pr;
6488 } else {
6489 uint64_t l = length - pos;
6490 if (pr != pr_end) {
6491 assert(pr->first > pos + offset);
6492 l = pr->first - (pos + offset);
6493 }
6494 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6495 << ": zeros for 0x" << (pos + offset) << "~" << l
6496 << std::dec << dendl;
6497 bl.append_zero(l);
6498 pos += l;
6499 }
6500 }
6501 assert(bl.length() == length);
6502 assert(pos == length);
6503 assert(pr == pr_end);
6504 r = bl.length();
6505 return r;
6506 }
6507
6508 int BlueStore::_verify_csum(OnodeRef& o,
6509 const bluestore_blob_t* blob, uint64_t blob_xoffset,
6510 const bufferlist& bl,
6511 uint64_t logical_offset) const
6512 {
6513 int bad;
6514 uint64_t bad_csum;
6515 utime_t start = ceph_clock_now();
6516 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
6517 if (r < 0) {
6518 if (r == -1) {
6519 PExtentVector pex;
6520 blob->map(
6521 bad,
6522 blob->get_csum_chunk_size(),
6523 [&](uint64_t offset, uint64_t length) {
6524 pex.emplace_back(bluestore_pextent_t(offset, length));
6525 return 0;
6526 });
6527 derr << __func__ << " bad "
6528 << Checksummer::get_csum_type_string(blob->csum_type)
6529 << "/0x" << std::hex << blob->get_csum_chunk_size()
6530 << " checksum at blob offset 0x" << bad
6531 << ", got 0x" << bad_csum << ", expected 0x"
6532 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
6533 << ", device location " << pex
6534 << ", logical extent 0x" << std::hex
6535 << (logical_offset + bad - blob_xoffset) << "~"
6536 << blob->get_csum_chunk_size() << std::dec
6537 << ", object " << o->oid
6538 << dendl;
6539 } else {
6540 derr << __func__ << " failed with error: " << cpp_strerror(r) << dendl;
6541 }
6542 }
6543 logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
6544 return r;
6545 }
6546
6547 int BlueStore::_decompress(bufferlist& source, bufferlist* result)
6548 {
6549 int r = 0;
6550 utime_t start = ceph_clock_now();
6551 bufferlist::iterator i = source.begin();
6552 bluestore_compression_header_t chdr;
6553 ::decode(chdr, i);
6554 int alg = int(chdr.type);
6555 CompressorRef cp = compressor;
6556 if (!cp || (int)cp->get_type() != alg) {
6557 cp = Compressor::create(cct, alg);
6558 }
6559
6560 if (!cp.get()) {
6561 // if the compressor isn't available we cannot return the
6562 // decompressed data, so treat this as an error
6563 derr << __func__ << " can't load decompressor " << alg << dendl;
6564 r = -EIO;
6565 } else {
6566 r = cp->decompress(i, chdr.length, *result);
6567 if (r < 0) {
6568 derr << __func__ << " decompression failed with error code " << r << dendl;
6569 r = -EIO;
6570 }
6571 }
6572 logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
6573 return r;
6574 }
6575
6576 // this variant stores the fiemap result into an interval_set; the
6577 // other variants use it internally
6578 int BlueStore::_fiemap(
6579 CollectionHandle &c_,
6580 const ghobject_t& oid,
6581 uint64_t offset,
6582 size_t length,
6583 interval_set<uint64_t>& destset)
6584 {
6585 Collection *c = static_cast<Collection *>(c_.get());
6586 if (!c->exists)
6587 return -ENOENT;
6588 {
6589 RWLock::RLocker l(c->lock);
6590
6591 OnodeRef o = c->get_onode(oid, false);
6592 if (!o || !o->exists) {
6593 return -ENOENT;
6594 }
6595 _dump_onode(o);
6596
6597 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6598 << " size 0x" << o->onode.size << std::dec << dendl;
6599
6600 boost::intrusive::set<Extent>::iterator ep, eend;
6601 if (offset >= o->onode.size)
6602 goto out;
6603
6604 if (offset + length > o->onode.size) {
6605 length = o->onode.size - offset;
6606 }
6607
6608 o->extent_map.fault_range(db, offset, length);
6609 eend = o->extent_map.extent_map.end();
6610 ep = o->extent_map.seek_lextent(offset);
6611 while (length > 0) {
6612 dout(20) << __func__ << " offset " << offset << dendl;
6613 if (ep != eend && ep->logical_offset + ep->length <= offset) {
6614 ++ep;
6615 continue;
6616 }
6617
6618 uint64_t x_len = length;
6619 if (ep != eend && ep->logical_offset <= offset) {
6620 uint64_t x_off = offset - ep->logical_offset;
6621 x_len = MIN(x_len, ep->length - x_off);
6622 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
6623 << x_len << std::dec << " blob " << ep->blob << dendl;
6624 destset.insert(offset, x_len);
6625 length -= x_len;
6626 offset += x_len;
6627 if (x_off + x_len == ep->length)
6628 ++ep;
6629 continue;
6630 }
6631 if (ep != eend &&
6632 ep->logical_offset > offset &&
6633 ep->logical_offset - offset < x_len) {
6634 x_len = ep->logical_offset - offset;
6635 }
6636 offset += x_len;
6637 length -= x_len;
6638 }
6639 }
6640
6641 out:
6642 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6643 << " size = 0x(" << destset << ")" << std::dec << dendl;
6644 return 0;
6645 }
6646
6647 int BlueStore::fiemap(
6648 const coll_t& cid,
6649 const ghobject_t& oid,
6650 uint64_t offset,
6651 size_t len,
6652 bufferlist& bl)
6653 {
6654 CollectionHandle c = _get_collection(cid);
6655 if (!c)
6656 return -ENOENT;
6657 return fiemap(c, oid, offset, len, bl);
6658 }
6659
6660 int BlueStore::fiemap(
6661 CollectionHandle &c_,
6662 const ghobject_t& oid,
6663 uint64_t offset,
6664 size_t length,
6665 bufferlist& bl)
6666 {
6667 interval_set<uint64_t> m;
6668 int r = _fiemap(c_, oid, offset, length, m);
6669 if (r >= 0) {
6670 ::encode(m, bl);
6671 }
6672 return r;
6673 }
6674
6675 int BlueStore::fiemap(
6676 const coll_t& cid,
6677 const ghobject_t& oid,
6678 uint64_t offset,
6679 size_t len,
6680 map<uint64_t, uint64_t>& destmap)
6681 {
6682 CollectionHandle c = _get_collection(cid);
6683 if (!c)
6684 return -ENOENT;
6685 return fiemap(c, oid, offset, len, destmap);
6686 }
6687
6688 int BlueStore::fiemap(
6689 CollectionHandle &c_,
6690 const ghobject_t& oid,
6691 uint64_t offset,
6692 size_t length,
6693 map<uint64_t, uint64_t>& destmap)
6694 {
6695 interval_set<uint64_t> m;
6696 int r = _fiemap(c_, oid, offset, length, m);
6697 if (r >= 0) {
6698 m.move_into(destmap);
6699 }
6700 return r;
6701 }
6702
6703 int BlueStore::getattr(
6704 const coll_t& cid,
6705 const ghobject_t& oid,
6706 const char *name,
6707 bufferptr& value)
6708 {
6709 CollectionHandle c = _get_collection(cid);
6710 if (!c)
6711 return -ENOENT;
6712 return getattr(c, oid, name, value);
6713 }
6714
6715 int BlueStore::getattr(
6716 CollectionHandle &c_,
6717 const ghobject_t& oid,
6718 const char *name,
6719 bufferptr& value)
6720 {
6721 Collection *c = static_cast<Collection *>(c_.get());
6722 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
6723 if (!c->exists)
6724 return -ENOENT;
6725
6726 int r;
6727 {
6728 RWLock::RLocker l(c->lock);
6729 mempool::bluestore_cache_other::string k(name);
6730
6731 OnodeRef o = c->get_onode(oid, false);
6732 if (!o || !o->exists) {
6733 r = -ENOENT;
6734 goto out;
6735 }
6736
6737 if (!o->onode.attrs.count(k)) {
6738 r = -ENODATA;
6739 goto out;
6740 }
6741 value = o->onode.attrs[k];
6742 r = 0;
6743 }
6744 out:
6745 if (r == 0 && _debug_mdata_eio(oid)) {
6746 r = -EIO;
6747 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6748 }
6749 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
6750 << " = " << r << dendl;
6751 return r;
6752 }
6753
6754
6755 int BlueStore::getattrs(
6756 const coll_t& cid,
6757 const ghobject_t& oid,
6758 map<string,bufferptr>& aset)
6759 {
6760 CollectionHandle c = _get_collection(cid);
6761 if (!c)
6762 return -ENOENT;
6763 return getattrs(c, oid, aset);
6764 }
6765
6766 int BlueStore::getattrs(
6767 CollectionHandle &c_,
6768 const ghobject_t& oid,
6769 map<string,bufferptr>& aset)
6770 {
6771 Collection *c = static_cast<Collection *>(c_.get());
6772 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
6773 if (!c->exists)
6774 return -ENOENT;
6775
6776 int r;
6777 {
6778 RWLock::RLocker l(c->lock);
6779
6780 OnodeRef o = c->get_onode(oid, false);
6781 if (!o || !o->exists) {
6782 r = -ENOENT;
6783 goto out;
6784 }
6785 for (auto& i : o->onode.attrs) {
6786 aset.emplace(i.first.c_str(), i.second);
6787 }
6788 r = 0;
6789 }
6790
6791 out:
6792 if (r == 0 && _debug_mdata_eio(oid)) {
6793 r = -EIO;
6794 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6795 }
6796 dout(10) << __func__ << " " << c->cid << " " << oid
6797 << " = " << r << dendl;
6798 return r;
6799 }
6800
6801 int BlueStore::list_collections(vector<coll_t>& ls)
6802 {
6803 RWLock::RLocker l(coll_lock);
6804 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
6805 p != coll_map.end();
6806 ++p)
6807 ls.push_back(p->first);
6808 return 0;
6809 }
6810
6811 bool BlueStore::collection_exists(const coll_t& c)
6812 {
6813 RWLock::RLocker l(coll_lock);
6814 return coll_map.count(c);
6815 }
6816
6817 int BlueStore::collection_empty(const coll_t& cid, bool *empty)
6818 {
6819 dout(15) << __func__ << " " << cid << dendl;
6820 vector<ghobject_t> ls;
6821 ghobject_t next;
6822 int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
6823 &ls, &next);
6824 if (r < 0) {
6825 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
6826 << dendl;
6827 return r;
6828 }
6829 *empty = ls.empty();
6830 dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
6831 return 0;
6832 }
6833
6834 int BlueStore::collection_bits(const coll_t& cid)
6835 {
6836 dout(15) << __func__ << " " << cid << dendl;
6837 CollectionRef c = _get_collection(cid);
6838 if (!c)
6839 return -ENOENT;
6840 RWLock::RLocker l(c->lock);
6841 dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
6842 return c->cnode.bits;
6843 }
6844
6845 int BlueStore::collection_list(
6846 const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
6847 vector<ghobject_t> *ls, ghobject_t *pnext)
6848 {
6849 CollectionHandle c = _get_collection(cid);
6850 if (!c)
6851 return -ENOENT;
6852 return collection_list(c, start, end, max, ls, pnext);
6853 }
6854
6855 int BlueStore::collection_list(
6856 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
6857 vector<ghobject_t> *ls, ghobject_t *pnext)
6858 {
6859 Collection *c = static_cast<Collection *>(c_.get());
6860 dout(15) << __func__ << " " << c->cid
6861 << " start " << start << " end " << end << " max " << max << dendl;
6862 int r;
6863 {
6864 RWLock::RLocker l(c->lock);
6865 r = _collection_list(c, start, end, max, ls, pnext);
6866 }
6867
6868 dout(10) << __func__ << " " << c->cid
6869 << " start " << start << " end " << end << " max " << max
6870 << " = " << r << ", ls.size() = " << ls->size()
6871 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
6872 return r;
6873 }
6874
6875 int BlueStore::_collection_list(
6876 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
6877 vector<ghobject_t> *ls, ghobject_t *pnext)
6878 {
6879
6880 if (!c->exists)
6881 return -ENOENT;
6882
6883 int r = 0;
6884 ghobject_t static_next;
6885 KeyValueDB::Iterator it;
6886 string temp_start_key, temp_end_key;
6887 string start_key, end_key;
6888 bool set_next = false;
6889 string pend;
6890 bool temp;
6891
6892 if (!pnext)
6893 pnext = &static_next;
6894
6895 if (start == ghobject_t::get_max() ||
6896 start.hobj.is_max()) {
6897 goto out;
6898 }
6899 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
6900 &start_key, &end_key);
6901 dout(20) << __func__
6902 << " range " << pretty_binary_string(temp_start_key)
6903 << " to " << pretty_binary_string(temp_end_key)
6904 << " and " << pretty_binary_string(start_key)
6905 << " to " << pretty_binary_string(end_key)
6906 << " start " << start << dendl;
6907 it = db->get_iterator(PREFIX_OBJ);
6908 if (start == ghobject_t() ||
6909 start.hobj == hobject_t() ||
6910 start == c->cid.get_min_hobj()) {
6911 it->upper_bound(temp_start_key);
6912 temp = true;
6913 } else {
6914 string k;
6915 get_object_key(cct, start, &k);
6916 if (start.hobj.is_temp()) {
6917 temp = true;
6918 assert(k >= temp_start_key && k < temp_end_key);
6919 } else {
6920 temp = false;
6921 assert(k >= start_key && k < end_key);
6922 }
6923 dout(20) << " start from " << pretty_binary_string(k)
6924 << " temp=" << (int)temp << dendl;
6925 it->lower_bound(k);
6926 }
6927 if (end.hobj.is_max()) {
6928 pend = temp ? temp_end_key : end_key;
6929 } else {
6930 get_object_key(cct, end, &end_key);
6931 if (end.hobj.is_temp()) {
6932 if (temp)
6933 pend = end_key;
6934 else
6935 goto out;
6936 } else {
6937 pend = temp ? temp_end_key : end_key;
6938 }
6939 }
6940 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
6941 while (true) {
6942 if (!it->valid() || it->key() >= pend) {
6943 if (!it->valid())
6944 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
6945 else
6946 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
6947 << " >= " << end << dendl;
6948 if (temp) {
6949 if (end.hobj.is_temp()) {
6950 break;
6951 }
6952 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
6953 temp = false;
6954 it->upper_bound(start_key);
6955 pend = end_key;
6956 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
6957 continue;
6958 }
6959 break;
6960 }
6961 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
6962 if (is_extent_shard_key(it->key())) {
6963 it->next();
6964 continue;
6965 }
6966 ghobject_t oid;
6967 int r = get_key_object(it->key(), &oid);
6968 assert(r == 0);
6969 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
6970 if (ls->size() >= (unsigned)max) {
6971 dout(20) << __func__ << " reached max " << max << dendl;
6972 *pnext = oid;
6973 set_next = true;
6974 break;
6975 }
6976 ls->push_back(oid);
6977 it->next();
6978 }
6979 out:
6980 if (!set_next) {
6981 *pnext = ghobject_t::get_max();
6982 }
6983
6984 return r;
6985 }
6986
6987 int BlueStore::omap_get(
6988 const coll_t& cid, ///< [in] Collection containing oid
6989 const ghobject_t &oid, ///< [in] Object containing omap
6990 bufferlist *header, ///< [out] omap header
6991 map<string, bufferlist> *out ///< [out] Key to value map
6992 )
6993 {
6994 CollectionHandle c = _get_collection(cid);
6995 if (!c)
6996 return -ENOENT;
6997 return omap_get(c, oid, header, out);
6998 }
6999
7000 int BlueStore::omap_get(
7001 CollectionHandle &c_, ///< [in] Collection containing oid
7002 const ghobject_t &oid, ///< [in] Object containing omap
7003 bufferlist *header, ///< [out] omap header
7004 map<string, bufferlist> *out ///< [out] Key to value map
7005 )
7006 {
7007 Collection *c = static_cast<Collection *>(c_.get());
7008 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7009 if (!c->exists)
7010 return -ENOENT;
7011 RWLock::RLocker l(c->lock);
7012 int r = 0;
7013 OnodeRef o = c->get_onode(oid, false);
7014 if (!o || !o->exists) {
7015 r = -ENOENT;
7016 goto out;
7017 }
7018 if (!o->onode.has_omap())
7019 goto out;
7020 o->flush();
7021 {
7022 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7023 string head, tail;
7024 get_omap_header(o->onode.nid, &head);
7025 get_omap_tail(o->onode.nid, &tail);
7026 it->lower_bound(head);
7027 while (it->valid()) {
7028 if (it->key() == head) {
7029 dout(30) << __func__ << " got header" << dendl;
7030 *header = it->value();
7031 } else if (it->key() >= tail) {
7032 dout(30) << __func__ << " reached tail" << dendl;
7033 break;
7034 } else {
7035 string user_key;
7036 decode_omap_key(it->key(), &user_key);
7037 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7038 << " -> " << user_key << dendl;
7039 (*out)[user_key] = it->value();
7040 }
7041 it->next();
7042 }
7043 }
7044 out:
7045 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7046 << dendl;
7047 return r;
7048 }
7049
7050 int BlueStore::omap_get_header(
7051 const coll_t& cid, ///< [in] Collection containing oid
7052 const ghobject_t &oid, ///< [in] Object containing omap
7053 bufferlist *header, ///< [out] omap header
7054 bool allow_eio ///< [in] don't assert on eio
7055 )
7056 {
7057 CollectionHandle c = _get_collection(cid);
7058 if (!c)
7059 return -ENOENT;
7060 return omap_get_header(c, oid, header, allow_eio);
7061 }
7062
7063 int BlueStore::omap_get_header(
7064 CollectionHandle &c_, ///< [in] Collection containing oid
7065 const ghobject_t &oid, ///< [in] Object containing omap
7066 bufferlist *header, ///< [out] omap header
7067 bool allow_eio ///< [in] don't assert on eio
7068 )
7069 {
7070 Collection *c = static_cast<Collection *>(c_.get());
7071 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7072 if (!c->exists)
7073 return -ENOENT;
7074 RWLock::RLocker l(c->lock);
7075 int r = 0;
7076 OnodeRef o = c->get_onode(oid, false);
7077 if (!o || !o->exists) {
7078 r = -ENOENT;
7079 goto out;
7080 }
7081 if (!o->onode.has_omap())
7082 goto out;
7083 o->flush();
7084 {
7085 string head;
7086 get_omap_header(o->onode.nid, &head);
7087 if (db->get(PREFIX_OMAP, head, header) >= 0) {
7088 dout(30) << __func__ << " got header" << dendl;
7089 } else {
7090 dout(30) << __func__ << " no header" << dendl;
7091 }
7092 }
7093 out:
7094 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7095 << dendl;
7096 return r;
7097 }
7098
7099 int BlueStore::omap_get_keys(
7100 const coll_t& cid, ///< [in] Collection containing oid
7101 const ghobject_t &oid, ///< [in] Object containing omap
7102 set<string> *keys ///< [out] Keys defined on oid
7103 )
7104 {
7105 CollectionHandle c = _get_collection(cid);
7106 if (!c)
7107 return -ENOENT;
7108 return omap_get_keys(c, oid, keys);
7109 }
7110
7111 int BlueStore::omap_get_keys(
7112 CollectionHandle &c_, ///< [in] Collection containing oid
7113 const ghobject_t &oid, ///< [in] Object containing omap
7114 set<string> *keys ///< [out] Keys defined on oid
7115 )
7116 {
7117 Collection *c = static_cast<Collection *>(c_.get());
7118 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7119 if (!c->exists)
7120 return -ENOENT;
7121 RWLock::RLocker l(c->lock);
7122 int r = 0;
7123 OnodeRef o = c->get_onode(oid, false);
7124 if (!o || !o->exists) {
7125 r = -ENOENT;
7126 goto out;
7127 }
7128 if (!o->onode.has_omap())
7129 goto out;
7130 o->flush();
7131 {
7132 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7133 string head, tail;
7134 get_omap_key(o->onode.nid, string(), &head);
7135 get_omap_tail(o->onode.nid, &tail);
7136 it->lower_bound(head);
7137 while (it->valid()) {
7138 if (it->key() >= tail) {
7139 dout(30) << __func__ << " reached tail" << dendl;
7140 break;
7141 }
7142 string user_key;
7143 decode_omap_key(it->key(), &user_key);
7144 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7145 << " -> " << user_key << dendl;
7146 keys->insert(user_key);
7147 it->next();
7148 }
7149 }
7150 out:
7151 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7152 << dendl;
7153 return r;
7154 }
7155
7156 int BlueStore::omap_get_values(
7157 const coll_t& cid, ///< [in] Collection containing oid
7158 const ghobject_t &oid, ///< [in] Object containing omap
7159 const set<string> &keys, ///< [in] Keys to get
7160 map<string, bufferlist> *out ///< [out] Returned keys and values
7161 )
7162 {
7163 CollectionHandle c = _get_collection(cid);
7164 if (!c)
7165 return -ENOENT;
7166 return omap_get_values(c, oid, keys, out);
7167 }
7168
7169 int BlueStore::omap_get_values(
7170 CollectionHandle &c_, ///< [in] Collection containing oid
7171 const ghobject_t &oid, ///< [in] Object containing omap
7172 const set<string> &keys, ///< [in] Keys to get
7173 map<string, bufferlist> *out ///< [out] Returned keys and values
7174 )
7175 {
7176 Collection *c = static_cast<Collection *>(c_.get());
7177 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7178 if (!c->exists)
7179 return -ENOENT;
7180 RWLock::RLocker l(c->lock);
7181 int r = 0;
7182 string final_key;
7183 OnodeRef o = c->get_onode(oid, false);
7184 if (!o || !o->exists) {
7185 r = -ENOENT;
7186 goto out;
7187 }
7188 if (!o->onode.has_omap())
7189 goto out;
7190 o->flush();
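// Omap keys are the u64-encoded nid (8 bytes) followed by '.' and the
// user key; the resize(9) below keeps just that 9-byte prefix while
// appending each requested key in turn.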
7191 _key_encode_u64(o->onode.nid, &final_key);
7192 final_key.push_back('.');
7193 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7194 final_key.resize(9); // keep prefix
7195 final_key += *p;
7196 bufferlist val;
7197 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7198 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
7199 << " -> " << *p << dendl;
7200 out->insert(make_pair(*p, val));
7201 }
7202 }
7203 out:
7204 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7205 << dendl;
7206 return r;
7207 }
7208
7209 int BlueStore::omap_check_keys(
7210 const coll_t& cid, ///< [in] Collection containing oid
7211 const ghobject_t &oid, ///< [in] Object containing omap
7212 const set<string> &keys, ///< [in] Keys to check
7213 set<string> *out ///< [out] Subset of keys defined on oid
7214 )
7215 {
7216 CollectionHandle c = _get_collection(cid);
7217 if (!c)
7218 return -ENOENT;
7219 return omap_check_keys(c, oid, keys, out);
7220 }
7221
7222 int BlueStore::omap_check_keys(
7223 CollectionHandle &c_, ///< [in] Collection containing oid
7224 const ghobject_t &oid, ///< [in] Object containing omap
7225 const set<string> &keys, ///< [in] Keys to check
7226 set<string> *out ///< [out] Subset of keys defined on oid
7227 )
7228 {
7229 Collection *c = static_cast<Collection *>(c_.get());
7230 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7231 if (!c->exists)
7232 return -ENOENT;
7233 RWLock::RLocker l(c->lock);
7234 int r = 0;
7235 string final_key;
7236 OnodeRef o = c->get_onode(oid, false);
7237 if (!o || !o->exists) {
7238 r = -ENOENT;
7239 goto out;
7240 }
7241 if (!o->onode.has_omap())
7242 goto out;
7243 o->flush();
7244 _key_encode_u64(o->onode.nid, &final_key);
7245 final_key.push_back('.');
7246 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7247 final_key.resize(9); // keep prefix
7248 final_key += *p;
7249 bufferlist val;
7250 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7251 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
7252 << " -> " << *p << dendl;
7253 out->insert(*p);
7254 } else {
7255 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
7256 << " -> " << *p << dendl;
7257 }
7258 }
7259 out:
7260 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7261 << dendl;
7262 return r;
7263 }
7264
7265 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7266 const coll_t& cid, ///< [in] collection
7267 const ghobject_t &oid ///< [in] object
7268 )
7269 {
7270 CollectionHandle c = _get_collection(cid);
7271 if (!c) {
7272 dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
7273 return ObjectMap::ObjectMapIterator();
7274 }
7275 return get_omap_iterator(c, oid);
7276 }
7277
7278 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7279 CollectionHandle &c_, ///< [in] collection
7280 const ghobject_t &oid ///< [in] object
7281 )
7282 {
7283 Collection *c = static_cast<Collection *>(c_.get());
7284 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
7285 if (!c->exists) {
7286 return ObjectMap::ObjectMapIterator();
7287 }
7288 RWLock::RLocker l(c->lock);
7289 OnodeRef o = c->get_onode(oid, false);
7290 if (!o || !o->exists) {
7291 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
7292 return ObjectMap::ObjectMapIterator();
7293 }
7294 o->flush();
7295 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
7296 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7297 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
7298 }
7299
7300 // -----------------
7301 // write helpers
7302
7303 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
7304 {
7305 dout(10) << __func__ << " ondisk_format " << ondisk_format
7306 << " min_compat_ondisk_format " << min_compat_ondisk_format
7307 << dendl;
7308 assert(ondisk_format == latest_ondisk_format);
7309 {
7310 bufferlist bl;
7311 ::encode(ondisk_format, bl);
7312 t->set(PREFIX_SUPER, "ondisk_format", bl);
7313 }
7314 {
7315 bufferlist bl;
7316 ::encode(min_compat_ondisk_format, bl);
7317 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
7318 }
7319 }
7320
7321 int BlueStore::_open_super_meta()
7322 {
7323 // nid
7324 {
7325 nid_max = 0;
7326 bufferlist bl;
7327 db->get(PREFIX_SUPER, "nid_max", &bl);
7328 bufferlist::iterator p = bl.begin();
7329 try {
7330 uint64_t v;
7331 ::decode(v, p);
7332 nid_max = v;
7333 } catch (buffer::error& e) {
7334 derr << __func__ << " unable to read nid_max" << dendl;
7335 return -EIO;
7336 }
7337 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
7338 nid_last = nid_max.load();
7339 }
7340
7341 // blobid
7342 {
7343 blobid_max = 0;
7344 bufferlist bl;
7345 db->get(PREFIX_SUPER, "blobid_max", &bl);
7346 bufferlist::iterator p = bl.begin();
7347 try {
7348 uint64_t v;
7349 ::decode(v, p);
7350 blobid_max = v;
7351 } catch (buffer::error& e) {
7352 derr << __func__ << " unable to read blobid_max" << dendl;
7353 return -EIO;
7354 }
7355 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
7356 blobid_last = blobid_max.load();
7357 }
7358
7359 // freelist
7360 {
7361 bufferlist bl;
7362 db->get(PREFIX_SUPER, "freelist_type", &bl);
7363 if (bl.length()) {
7364 freelist_type = std::string(bl.c_str(), bl.length());
7365 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
7366 } else {
7367 assert("Not Support extent freelist manager" == 0);
7368 }
7369 }
7370
7371 // bluefs alloc
7372 if (cct->_conf->bluestore_bluefs) {
7373 bluefs_extents.clear();
7374 bufferlist bl;
7375 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7376 bufferlist::iterator p = bl.begin();
7377 try {
7378 ::decode(bluefs_extents, p);
7379 }
7380 catch (buffer::error& e) {
7381 derr << __func__ << " unable to read bluefs_extents" << dendl;
7382 return -EIO;
7383 }
7384 dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
7385 << std::dec << dendl;
7386 }
7387
7388 // ondisk format
7389 int32_t compat_ondisk_format = 0;
7390 {
7391 bufferlist bl;
7392 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
7393 if (r < 0) {
7394 // base case: kraken bluestore is v1 and readable by v1
7395 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
7396 << dendl;
7397 ondisk_format = 1;
7398 compat_ondisk_format = 1;
7399 } else {
7400 auto p = bl.begin();
7401 try {
7402 ::decode(ondisk_format, p);
7403 } catch (buffer::error& e) {
7404 derr << __func__ << " unable to read ondisk_format" << dendl;
7405 return -EIO;
7406 }
7407 bl.clear();
7408 {
7409 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
7410 assert(!r);
7411 auto p = bl.begin();
7412 try {
7413 ::decode(compat_ondisk_format, p);
7414 } catch (buffer::error& e) {
7415 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
7416 return -EIO;
7417 }
7418 }
7419 }
7420 dout(10) << __func__ << " ondisk_format " << ondisk_format
7421 << " compat_ondisk_format " << compat_ondisk_format
7422 << dendl;
7423 }
7424
7425 if (latest_ondisk_format < compat_ondisk_format) {
7426 derr << __func__ << " compat_ondisk_format is "
7427 << compat_ondisk_format << " but we only understand version "
7428 << latest_ondisk_format << dendl;
7429 return -EPERM;
7430 }
7431 if (ondisk_format < latest_ondisk_format) {
7432 int r = _upgrade_super();
7433 if (r < 0) {
7434 return r;
7435 }
7436 }
7437
7438 {
7439 bufferlist bl;
7440 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
7441 auto p = bl.begin();
7442 try {
7443 uint64_t val;
7444 ::decode(val, p);
7445 min_alloc_size = val;
7446 } catch (buffer::error& e) {
7447 derr << __func__ << " unable to read min_alloc_size" << dendl;
7448 return -EIO;
7449 }
7450 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7451 << std::dec << dendl;
7452 }
7453 open_statfs();
7454 _set_alloc_sizes();
7455 _set_throttle_params();
7456
7457 _set_csum();
7458 _set_compression();
7459 _set_blob_size();
7460
7461 return 0;
7462 }
7463
7464 int BlueStore::_upgrade_super()
7465 {
7466 dout(1) << __func__ << " from " << ondisk_format << ", latest "
7467 << latest_ondisk_format << dendl;
7468 assert(ondisk_format > 0);
7469 assert(ondisk_format < latest_ondisk_format);
7470
7471 if (ondisk_format == 1) {
7472 // changes:
7473 // - super: added ondisk_format
7474 // - super: added min_readable_ondisk_format
7475 // - super: added min_compat_ondisk_format
7476 // - super: added min_alloc_size
7477 // - super: removed min_min_alloc_size
7478 KeyValueDB::Transaction t = db->get_transaction();
7479 {
7480 bufferlist bl;
7481 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
7482 auto p = bl.begin();
7483 try {
7484 uint64_t val;
7485 ::decode(val, p);
7486 min_alloc_size = val;
7487 } catch (buffer::error& e) {
7488 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
7489 return -EIO;
7490 }
7491 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7492 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7493 }
7494 ondisk_format = 2;
7495 _prepare_ondisk_format_super(t);
7496 int r = db->submit_transaction_sync(t);
7497 assert(r == 0);
7498 }
7499
7500 // done
7501 dout(1) << __func__ << " done" << dendl;
7502 return 0;
7503 }
7504
7505 void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
7506 {
7507 if (o->onode.nid)
7508 return;
7509 uint64_t nid = ++nid_last;
7510 dout(20) << __func__ << " " << nid << dendl;
7511 o->onode.nid = nid;
7512 txc->last_nid = nid;
7513 }
7514
7515 uint64_t BlueStore::_assign_blobid(TransContext *txc)
7516 {
7517 uint64_t bid = ++blobid_last;
7518 dout(20) << __func__ << " " << bid << dendl;
7519 txc->last_blobid = bid;
7520 return bid;
7521 }
7522
7523 void BlueStore::get_db_statistics(Formatter *f)
7524 {
7525 db->get_statistics(f);
7526 }
7527
7528 BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
7529 {
7530 TransContext *txc = new TransContext(cct, osr);
7531 txc->t = db->get_transaction();
7532 osr->queue_new(txc);
7533 dout(20) << __func__ << " osr " << osr << " = " << txc
7534 << " seq " << txc->seq << dendl;
7535 return txc;
7536 }
7537
7538 void BlueStore::_txc_calc_cost(TransContext *txc)
7539 {
7540 // this is about the simplest model for transaction cost you can
7541 // imagine: there is a fixed overhead (we always count at least one
7542 // "io", for the kv commit), plus a configurable cost per "io" (with
7543 // different hdd and ssd defaults), and the transaction's byte count
7544 // is added on top of that.
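// Hypothetical worked example (illustrative values only): a txc with
// two pending aios of 3 and 1 iovs gives ios = 1 + 3 + 1 = 5; with a
// cost-per-io of 670000 and 8192 bytes in the transaction,
// cost = 5 * 670000 + 8192 = 3358192.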
7545 int ios = 1; // one "io" for the kv commit
7546 for (auto& p : txc->ioc.pending_aios) {
7547 ios += p.iov.size();
7548 }
7549 auto cost = throttle_cost_per_io.load();
7550 txc->cost = ios * cost + txc->bytes;
7551 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
7552 << ios << " ios * " << cost << " + " << txc->bytes
7553 << " bytes)" << dendl;
7554 }
7555
7556 void BlueStore::_txc_update_store_statfs(TransContext *txc)
7557 {
7558 if (txc->statfs_delta.is_empty())
7559 return;
7560
7561 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
7562 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
7563 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
7564 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
7565 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
7566
7567 {
7568 std::lock_guard<std::mutex> l(vstatfs_lock);
7569 vstatfs += txc->statfs_delta;
7570 }
7571
7572 bufferlist bl;
7573 txc->statfs_delta.encode(bl);
7574
7575 txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
7576 txc->statfs_delta.reset();
7577 }
7578
7579 void BlueStore::_txc_state_proc(TransContext *txc)
7580 {
7581 while (true) {
7582 dout(10) << __func__ << " txc " << txc
7583 << " " << txc->get_state_name() << dendl;
7584 switch (txc->state) {
7585 case TransContext::STATE_PREPARE:
7586 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
7587 if (txc->ioc.has_pending_aios()) {
7588 txc->state = TransContext::STATE_AIO_WAIT;
7589 txc->had_ios = true;
7590 _txc_aio_submit(txc);
7591 return;
7592 }
7593 // ** fall-thru **
7594
7595 case TransContext::STATE_AIO_WAIT:
7596 txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
7597 _txc_finish_io(txc); // may trigger blocked txc's too
7598 return;
7599
7600 case TransContext::STATE_IO_DONE:
7601 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
7602 if (txc->had_ios) {
7603 ++txc->osr->txc_with_unstable_io;
7604 }
7605 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
7606 txc->state = TransContext::STATE_KV_QUEUED;
7607 if (cct->_conf->bluestore_sync_submit_transaction) {
7608 if (txc->last_nid >= nid_max ||
7609 txc->last_blobid >= blobid_max) {
7610 dout(20) << __func__
7611 << " last_{nid,blobid} exceeds max, submit via kv thread"
7612 << dendl;
7613 } else if (txc->osr->kv_committing_serially) {
7614 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
7615 << dendl;
7616 // note: this is starvation-prone. once we have a txc in a busy
7617 // sequencer that is committing serially it is possible to keep
7618 // submitting new transactions fast enough that we get stuck doing
7619 // so. the alternative is to block here... fixme?
7620 } else if (txc->osr->txc_with_unstable_io) {
7621 dout(20) << __func__ << " prior txc(s) with unstable ios "
7622 << txc->osr->txc_with_unstable_io.load() << dendl;
7623 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
7624 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
7625 == 0) {
7626 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
7627 << dendl;
7628 } else {
7629 txc->state = TransContext::STATE_KV_SUBMITTED;
7630 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
7631 assert(r == 0);
7632 _txc_applied_kv(txc);
7633 }
7634 }
7635 {
7636 std::lock_guard<std::mutex> l(kv_lock);
7637 kv_queue.push_back(txc);
7638 kv_cond.notify_one();
7639 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
7640 kv_queue_unsubmitted.push_back(txc);
7641 ++txc->osr->kv_committing_serially;
7642 }
7643 if (txc->had_ios)
7644 kv_ios++;
7645 kv_throttle_costs += txc->cost;
7646 }
7647 return;
7648 case TransContext::STATE_KV_SUBMITTED:
7649 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
7650 txc->state = TransContext::STATE_KV_DONE;
7651 _txc_committed_kv(txc);
7652 // ** fall-thru **
7653
7654 case TransContext::STATE_KV_DONE:
7655 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
7656 if (txc->deferred_txn) {
7657 txc->state = TransContext::STATE_DEFERRED_QUEUED;
7658 _deferred_queue(txc);
7659 return;
7660 }
7661 txc->state = TransContext::STATE_FINISHING;
7662 break;
7663
7664 case TransContext::STATE_DEFERRED_CLEANUP:
7665 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
7666 txc->state = TransContext::STATE_FINISHING;
7667 // ** fall-thru **
7668
7669 case TransContext::STATE_FINISHING:
7670 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
7671 _txc_finish(txc);
7672 return;
7673
7674 default:
7675 derr << __func__ << " unexpected txc " << txc
7676 << " state " << txc->get_state_name() << dendl;
7677 assert(0 == "unexpected txc state");
7678 return;
7679 }
7680 }
7681 }
7682
7683 void BlueStore::_txc_finish_io(TransContext *txc)
7684 {
7685 dout(20) << __func__ << " " << txc << dendl;
7686
7687 /*
7688 * we need to preserve the order of kv transactions,
7689 * even though aio will complete in any order.
7690 */
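// concretely: scan backward from this txc; if any earlier txc in the
// sequencer has not yet reached STATE_IO_DONE we are blocked behind it and
// return.  otherwise step forward again and run _txc_state_proc on every
// consecutive txc that is now in STATE_IO_DONE, in queue order.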
7691
7692 OpSequencer *osr = txc->osr.get();
7693 std::lock_guard<std::mutex> l(osr->qlock);
7694 txc->state = TransContext::STATE_IO_DONE;
7695
7696 // release aio contexts (including pinned buffers).
7697 txc->ioc.running_aios.clear();
7698
7699 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
7700 while (p != osr->q.begin()) {
7701 --p;
7702 if (p->state < TransContext::STATE_IO_DONE) {
7703 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
7704 << p->get_state_name() << dendl;
7705 return;
7706 }
7707 if (p->state > TransContext::STATE_IO_DONE) {
7708 ++p;
7709 break;
7710 }
7711 }
7712 do {
7713 _txc_state_proc(&*p++);
7714 } while (p != osr->q.end() &&
7715 p->state == TransContext::STATE_IO_DONE);
7716
7717 if (osr->kv_submitted_waiters &&
7718 osr->_is_all_kv_submitted()) {
7719 osr->qcond.notify_all();
7720 }
7721 }
7722
7723 void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
7724 {
7725 dout(20) << __func__ << " txc " << txc
7726 << " onodes " << txc->onodes
7727 << " shared_blobs " << txc->shared_blobs
7728 << dendl;
7729
7730 // finalize onodes
7731 for (auto o : txc->onodes) {
7732 // finalize extent_map shards
7733 o->extent_map.update(t, false);
7734 if (o->extent_map.needs_reshard()) {
7735 o->extent_map.reshard(db, t);
7736 o->extent_map.update(t, true);
7737 if (o->extent_map.needs_reshard()) {
7738 dout(20) << __func__ << " warning: still wants reshard, check options?"
7739 << dendl;
7740 o->extent_map.clear_needs_reshard();
7741 }
7742 logger->inc(l_bluestore_onode_reshard);
7743 }
7744
7745 // bound encode
7746 size_t bound = 0;
7747 denc(o->onode, bound);
7748 o->extent_map.bound_encode_spanning_blobs(bound);
7749 if (o->onode.extent_map_shards.empty()) {
7750 denc(o->extent_map.inline_bl, bound);
7751 }
7752
7753 // encode
7754 bufferlist bl;
7755 unsigned onode_part, blob_part, extent_part;
7756 {
7757 auto p = bl.get_contiguous_appender(bound, true);
7758 denc(o->onode, p);
7759 onode_part = p.get_logical_offset();
7760 o->extent_map.encode_spanning_blobs(p);
7761 blob_part = p.get_logical_offset() - onode_part;
7762 if (o->onode.extent_map_shards.empty()) {
7763 denc(o->extent_map.inline_bl, p);
7764 }
7765 extent_part = p.get_logical_offset() - onode_part - blob_part;
7766 }
7767
7768 dout(20) << " onode " << o->oid << " is " << bl.length()
7769 << " (" << onode_part << " bytes onode + "
7770 << blob_part << " bytes spanning blobs + "
7771 << extent_part << " bytes inline extents)"
7772 << dendl;
7773 t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
7774 o->flushing_count++;
7775 }
7776
7777 // objects we modified whose onode was not itself dirtied
7778 auto p = txc->modified_objects.begin();
7779 while (p != txc->modified_objects.end()) {
7780 if (txc->onodes.count(*p) == 0) {
7781 (*p)->flushing_count++;
7782 ++p;
7783 } else {
7784 // drop entries that duplicate the onodes list to avoid problems in _txc_finish
7785 p = txc->modified_objects.erase(p);
7786 }
7787 }
7788
7789 // finalize shared_blobs
7790 for (auto sb : txc->shared_blobs) {
7791 string key;
7792 auto sbid = sb->get_sbid();
7793 get_shared_blob_key(sbid, &key);
7794 if (sb->persistent->empty()) {
7795 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
7796 << " is empty" << dendl;
7797 t->rmkey(PREFIX_SHARED_BLOB, key);
7798 } else {
7799 bufferlist bl;
7800 ::encode(*(sb->persistent), bl);
7801 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
7802 << " is " << bl.length() << " " << *sb << dendl;
7803 t->set(PREFIX_SHARED_BLOB, key, bl);
7804 }
7805 }
7806 }
7807
7808 void BlueStore::BSPerfTracker::update_from_perfcounters(
7809 PerfCounters &logger)
7810 {
7811 os_commit_latency.consume_next(
7812 logger.get_tavg_ms(
7813 l_bluestore_commit_lat));
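// note: apply latency is fed from the same commit-latency counter below,
// presumably because bluestore has no separate "apply" stage to time.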
7814 os_apply_latency.consume_next(
7815 logger.get_tavg_ms(
7816 l_bluestore_commit_lat));
7817 }
7818
7819 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
7820 {
7821 dout(20) << __func__ << " txc " << txc << std::hex
7822 << " allocated 0x" << txc->allocated
7823 << " released 0x" << txc->released
7824 << std::dec << dendl;
7825
7826 // We have to handle the case where we allocate *and* deallocate the
7827 // same region in this transaction. The freelist doesn't like that.
7828 // (Actually, the only thing that cares is the BitmapFreelistManager
7829 // debug check. But that's important.)
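// illustrative only (hypothetical offsets): if this txc allocated 0x0~0x2000
// and also released 0x1000~0x1000, the overlap 0x1000~0x1000 is subtracted
// from both sets, leaving an allocation of 0x0~0x1000 and no release -- the
// net effect on the freelist is unchanged.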
7830 interval_set<uint64_t> tmp_allocated, tmp_released;
7831 interval_set<uint64_t> *pallocated = &txc->allocated;
7832 interval_set<uint64_t> *preleased = &txc->released;
7833 if (!txc->allocated.empty() && !txc->released.empty()) {
7834 interval_set<uint64_t> overlap;
7835 overlap.intersection_of(txc->allocated, txc->released);
7836 if (!overlap.empty()) {
7837 tmp_allocated = txc->allocated;
7838 tmp_allocated.subtract(overlap);
7839 tmp_released = txc->released;
7840 tmp_released.subtract(overlap);
7841 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
7842 << ", new allocated 0x" << tmp_allocated
7843 << " released 0x" << tmp_released << std::dec
7844 << dendl;
7845 pallocated = &tmp_allocated;
7846 preleased = &tmp_released;
7847 }
7848 }
7849
7850 // update freelist with non-overlap sets
7851 for (interval_set<uint64_t>::iterator p = pallocated->begin();
7852 p != pallocated->end();
7853 ++p) {
7854 fm->allocate(p.get_start(), p.get_len(), t);
7855 }
7856 for (interval_set<uint64_t>::iterator p = preleased->begin();
7857 p != preleased->end();
7858 ++p) {
7859 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
7860 << "~" << p.get_len() << std::dec << dendl;
7861 fm->release(p.get_start(), p.get_len(), t);
7862 }
7863
7864 _txc_update_store_statfs(txc);
7865 }
7866
7867 void BlueStore::_txc_applied_kv(TransContext *txc)
7868 {
7869 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
7870 for (auto& o : *ls) {
7871 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
7872 << dendl;
7873 if (--o->flushing_count == 0) {
7874 std::lock_guard<std::mutex> l(o->flush_lock);
7875 o->flush_cond.notify_all();
7876 }
7877 }
7878 }
7879 }
7880
7881 void BlueStore::_txc_committed_kv(TransContext *txc)
7882 {
7883 dout(20) << __func__ << " txc " << txc << dendl;
7884
7885 // warning: we're calling onreadable_sync inside the sequencer lock
7886 if (txc->onreadable_sync) {
7887 txc->onreadable_sync->complete(0);
7888 txc->onreadable_sync = NULL;
7889 }
7890 unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
7891 if (txc->oncommit) {
7892 logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
7893 finishers[n]->queue(txc->oncommit);
7894 txc->oncommit = NULL;
7895 }
7896 if (txc->onreadable) {
7897 finishers[n]->queue(txc->onreadable);
7898 txc->onreadable = NULL;
7899 }
7900
7901 if (!txc->oncommits.empty()) {
7902 finishers[n]->queue(txc->oncommits);
7903 }
7904 }
7905
7906 void BlueStore::_txc_finish(TransContext *txc)
7907 {
7908 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
7909 assert(txc->state == TransContext::STATE_FINISHING);
7910
7911 for (auto& sb : txc->shared_blobs_written) {
7912 sb->bc.finish_write(sb->get_cache(), txc->seq);
7913 }
7914 txc->shared_blobs_written.clear();
7915
7916 while (!txc->removed_collections.empty()) {
7917 _queue_reap_collection(txc->removed_collections.front());
7918 txc->removed_collections.pop_front();
7919 }
7920
7921 OpSequencerRef osr = txc->osr;
7922 CollectionRef c;
7923 bool empty = false;
7924 bool submit_deferred = false;
7925 OpSequencer::q_list_t releasing_txc;
7926 {
7927 std::lock_guard<std::mutex> l(osr->qlock);
7928 txc->state = TransContext::STATE_DONE;
7929 bool notify = false;
7930 while (!osr->q.empty()) {
7931 TransContext *txc = &osr->q.front();
7932 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
7933 << dendl;
7934 if (txc->state != TransContext::STATE_DONE) {
7935 if (txc->state == TransContext::STATE_PREPARE &&
7936 deferred_aggressive) {
7937 // for _osr_drain_preceding()
7938 notify = true;
7939 }
7940 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
7941 osr->q.size() > g_conf->bluestore_max_deferred_txc) {
7942 submit_deferred = true;
7943 }
7944 break;
7945 }
7946
7947 if (!c && txc->first_collection) {
7948 c = txc->first_collection;
7949 }
7950 osr->q.pop_front();
7951 releasing_txc.push_back(*txc);
7952 notify = true;
7953 }
7954 if (notify) {
7955 osr->qcond.notify_all();
7956 }
7957 if (osr->q.empty()) {
7958 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
7959 empty = true;
7960 }
7961 }
7962 while (!releasing_txc.empty()) {
7963 // release to allocator only after all preceding txc's have also
7964 // finished any deferred writes that potentially land in these
7965 // blocks
7966 auto txc = &releasing_txc.front();
7967 _txc_release_alloc(txc);
7968 releasing_txc.pop_front();
7969 txc->log_state_latency(logger, l_bluestore_state_done_lat);
7970 delete txc;
7971 }
7972
7973 if (submit_deferred) {
7974 // we're pinning memory; flush! we could be more fine-grained here but
7975 // i'm not sure it's worth the bother.
7976 deferred_try_submit();
7977 }
7978
7979 if (empty && osr->zombie) {
7980 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
7981 osr->_unregister();
7982 }
7983 }
7984
7985 void BlueStore::_txc_release_alloc(TransContext *txc)
7986 {
7987 // update allocator with full released set
7988 if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
7989 dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
7990 for (interval_set<uint64_t>::iterator p = txc->released.begin();
7991 p != txc->released.end();
7992 ++p) {
7993 alloc->release(p.get_start(), p.get_len());
7994 }
7995 }
7996
7997 txc->allocated.clear();
7998 txc->released.clear();
7999 }
8000
8001 void BlueStore::_osr_drain_preceding(TransContext *txc)
8002 {
8003 OpSequencer *osr = txc->osr.get();
8004 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
8005 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
8006 {
8007 // submit anything pending
8008 std::lock_guard<std::mutex> l(deferred_lock);
8009 if (osr->deferred_pending) {
8010 _deferred_submit(osr);
8011 }
8012 }
8013 {
8014 // wake up any previously finished deferred events
8015 std::lock_guard<std::mutex> l(kv_lock);
8016 kv_cond.notify_one();
8017 }
8018 osr->drain_preceding(txc);
8019 --deferred_aggressive;
8020 dout(10) << __func__ << " " << osr << " done" << dendl;
8021 }
8022
8023 void BlueStore::_osr_drain_all()
8024 {
8025 dout(10) << __func__ << dendl;
8026
8027 set<OpSequencerRef> s;
8028 {
8029 std::lock_guard<std::mutex> l(osr_lock);
8030 s = osr_set;
8031 }
8032 dout(20) << __func__ << " osr_set " << s << dendl;
8033
8034 ++deferred_aggressive;
8035 {
8036 // submit anything pending
8037 std::lock_guard<std::mutex> l(deferred_lock);
8038 _deferred_try_submit();
8039 }
8040 {
8041 // wake up any previously finished deferred events
8042 std::lock_guard<std::mutex> l(kv_lock);
8043 kv_cond.notify_one();
8044 }
8045 {
8046 std::lock_guard<std::mutex> l(kv_finalize_lock);
8047 kv_finalize_cond.notify_one();
8048 }
8049 for (auto osr : s) {
8050 dout(20) << __func__ << " drain " << osr << dendl;
8051 osr->drain();
8052 }
8053 --deferred_aggressive;
8054
8055 dout(10) << __func__ << " done" << dendl;
8056 }
8057
8058 void BlueStore::_osr_unregister_all()
8059 {
8060 set<OpSequencerRef> s;
8061 {
8062 std::lock_guard<std::mutex> l(osr_lock);
8063 s = osr_set;
8064 }
8065 dout(10) << __func__ << " " << s << dendl;
8066 for (auto osr : s) {
8067 osr->_unregister();
8068
8069 if (!osr->zombie) {
8070 // break link from Sequencer to us so that this OpSequencer
8071 // instance can die with this mount/umount cycle. note that
8072 // we assume umount() will not race against ~Sequencer.
8073 assert(osr->parent);
8074 osr->parent->p.reset();
8075 }
8076 }
8077 // nobody should be creating sequencers during umount either.
8078 {
8079 std::lock_guard<std::mutex> l(osr_lock);
8080 assert(osr_set.empty());
8081 }
8082 }
8083
8084 void BlueStore::_kv_start()
8085 {
8086 dout(10) << __func__ << dendl;
8087
8088 if (cct->_conf->bluestore_shard_finishers) {
8089 if (cct->_conf->osd_op_num_shards) {
8090 m_finisher_num = cct->_conf->osd_op_num_shards;
8091 } else {
8092 assert(bdev);
8093 if (bdev->is_rotational()) {
8094 m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
8095 } else {
8096 m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
8097 }
8098 }
8099 }
8100
8101 assert(m_finisher_num != 0);
8102
8103 for (int i = 0; i < m_finisher_num; ++i) {
8104 ostringstream oss;
8105 oss << "finisher-" << i;
8106 Finisher *f = new Finisher(cct, oss.str(), "finisher");
8107 finishers.push_back(f);
8108 }
8109
8110 for (auto f : finishers) {
8111 f->start();
8112 }
8113 kv_sync_thread.create("bstore_kv_sync");
8114 kv_finalize_thread.create("bstore_kv_final");
8115 }
8116
8117 void BlueStore::_kv_stop()
8118 {
8119 dout(10) << __func__ << dendl;
8120 {
8121 std::unique_lock<std::mutex> l(kv_lock);
8122 while (!kv_sync_started) {
8123 kv_cond.wait(l);
8124 }
8125 kv_stop = true;
8126 kv_cond.notify_all();
8127 }
8128 {
8129 std::unique_lock<std::mutex> l(kv_finalize_lock);
8130 while (!kv_finalize_started) {
8131 kv_finalize_cond.wait(l);
8132 }
8133 kv_finalize_stop = true;
8134 kv_finalize_cond.notify_all();
8135 }
8136 kv_sync_thread.join();
8137 kv_finalize_thread.join();
8138 {
8139 std::lock_guard<std::mutex> l(kv_lock);
8140 kv_stop = false;
8141 }
8142 {
8143 std::lock_guard<std::mutex> l(kv_finalize_lock);
8144 kv_finalize_stop = false;
8145 }
8146 dout(10) << __func__ << " stopping finishers" << dendl;
8147 for (auto f : finishers) {
8148 f->wait_for_empty();
8149 f->stop();
8150 }
8151 dout(10) << __func__ << " stopped" << dendl;
8152 }
8153
8154 void BlueStore::_kv_sync_thread()
8155 {
8156 dout(10) << __func__ << " start" << dendl;
8157 std::unique_lock<std::mutex> l(kv_lock);
8158 assert(!kv_sync_started);
8159 kv_sync_started = true;
8160 kv_cond.notify_all();
8161 while (true) {
8162 assert(kv_committing.empty());
8163 if (kv_queue.empty() &&
8164 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
8165 !deferred_aggressive)) {
8166 if (kv_stop)
8167 break;
8168 dout(20) << __func__ << " sleep" << dendl;
8169 kv_cond.wait(l);
8170 dout(20) << __func__ << " wake" << dendl;
8171 } else {
8172 deque<TransContext*> kv_submitting;
8173 deque<DeferredBatch*> deferred_done, deferred_stable;
8174 uint64_t aios = 0, costs = 0;
8175
8176 dout(20) << __func__ << " committing " << kv_queue.size()
8177 << " submitting " << kv_queue_unsubmitted.size()
8178 << " deferred done " << deferred_done_queue.size()
8179 << " stable " << deferred_stable_queue.size()
8180 << dendl;
8181 kv_committing.swap(kv_queue);
8182 kv_submitting.swap(kv_queue_unsubmitted);
8183 deferred_done.swap(deferred_done_queue);
8184 deferred_stable.swap(deferred_stable_queue);
8185 aios = kv_ios;
8186 costs = kv_throttle_costs;
8187 kv_ios = 0;
8188 kv_throttle_costs = 0;
8189 utime_t start = ceph_clock_now();
8190 l.unlock();
8191
8192 dout(30) << __func__ << " committing " << kv_committing << dendl;
8193 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
8194 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
8195 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
8196
8197 bool force_flush = false;
8198 // if bluefs shares its only device with the data, its commit below
8199 // flushes the device and makes completed deferred aios stable, so we
8200 // force a flush only if we issued aios, have nothing else to commit,
8201 // or are draining aggressively; on separate devices we always flush.
8202 if (bluefs_single_shared_device && bluefs) {
8203 if (aios) {
8204 force_flush = true;
8205 } else if (kv_committing.empty() && kv_submitting.empty() &&
8206 deferred_stable.empty()) {
8207 force_flush = true; // there's nothing else to commit!
8208 } else if (deferred_aggressive) {
8209 force_flush = true;
8210 }
8211 } else
8212 force_flush = true;
8213
8214 if (force_flush) {
8215 dout(20) << __func__ << " num_aios=" << aios
8216 << " force_flush=" << (int)force_flush
8217 << ", flushing, deferred done->stable" << dendl;
8218 // flush/barrier on block device
8219 bdev->flush();
8220
8221 // if we flush then deferred done are now deferred stable
8222 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
8223 deferred_done.end());
8224 deferred_done.clear();
8225 }
8226 utime_t after_flush = ceph_clock_now();
8227
8228 // we will use one final transaction to force a sync
8229 KeyValueDB::Transaction synct = db->get_transaction();
8230
8231 // increase {nid,blobid}_max? note that this covers both the
8232 // case where we are approaching the max and the case we passed
8233 // it. in either case, we increase the max in the earliest txn
8234 // we submit.
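// illustrative only: with a hypothetical bluestore_nid_prealloc of 1024,
// once nid_last passes nid_max - 512 (the halfway point) we persist
// new_nid_max = nid_last + 1024 via the first transaction of this cycle.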
8235 uint64_t new_nid_max = 0, new_blobid_max = 0;
8236 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
8237 KeyValueDB::Transaction t =
8238 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8239 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
8240 bufferlist bl;
8241 ::encode(new_nid_max, bl);
8242 t->set(PREFIX_SUPER, "nid_max", bl);
8243 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
8244 }
8245 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
8246 KeyValueDB::Transaction t =
8247 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8248 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
8249 bufferlist bl;
8250 ::encode(new_blobid_max, bl);
8251 t->set(PREFIX_SUPER, "blobid_max", bl);
8252 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
8253 }
8254 for (auto txc : kv_submitting) {
8255 assert(txc->state == TransContext::STATE_KV_QUEUED);
8256 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8257 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8258 assert(r == 0);
8259 _txc_applied_kv(txc);
8260 --txc->osr->kv_committing_serially;
8261 txc->state = TransContext::STATE_KV_SUBMITTED;
8262 if (txc->osr->kv_submitted_waiters) {
8263 std::lock_guard<std::mutex> l(txc->osr->qlock);
8264 if (txc->osr->_is_all_kv_submitted()) {
8265 txc->osr->qcond.notify_all();
8266 }
8267 }
8268 }
8269 for (auto txc : kv_committing) {
8270 if (txc->had_ios) {
8271 --txc->osr->txc_with_unstable_io;
8272 }
8273 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8274 }
8275
8276 // release throttle *before* we commit. this allows new ops
8277 // to be prepared and enter pipeline while we are waiting on
8278 // the kv commit sync/flush. then hopefully on the next
8279 // iteration there will already be ops awake. otherwise, we
8280 // end up going to sleep, and then wake up when the very first
8281 // transaction is ready for commit.
8282 throttle_bytes.put(costs);
8283
8284 PExtentVector bluefs_gift_extents;
8285 if (bluefs &&
8286 after_flush - bluefs_last_balance >
8287 cct->_conf->bluestore_bluefs_balance_interval) {
8288 bluefs_last_balance = after_flush;
8289 int r = _balance_bluefs_freespace(&bluefs_gift_extents);
8290 assert(r >= 0);
8291 if (r > 0) {
8292 for (auto& p : bluefs_gift_extents) {
8293 bluefs_extents.insert(p.offset, p.length);
8294 }
8295 bufferlist bl;
8296 ::encode(bluefs_extents, bl);
8297 dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
8298 << bluefs_extents << std::dec << dendl;
8299 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
8300 }
8301 }
8302
8303 // cleanup sync deferred keys
8304 for (auto b : deferred_stable) {
8305 for (auto& txc : b->txcs) {
8306 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
8307 if (!wt.released.empty()) {
8308 // kraken replay compat only
8309 txc.released = wt.released;
8310 dout(10) << __func__ << " deferred txn has released "
8311 << txc.released
8312 << " (we just upgraded from kraken) on " << &txc << dendl;
8313 _txc_finalize_kv(&txc, synct);
8314 }
8315 // cleanup the deferred
8316 string key;
8317 get_deferred_key(wt.seq, &key);
8318 synct->rm_single_key(PREFIX_DEFERRED, key);
8319 }
8320 }
8321
8322 // submit synct synchronously (block and wait for it to commit)
8323 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
8324 assert(r == 0);
8325
8326 if (new_nid_max) {
8327 nid_max = new_nid_max;
8328 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
8329 }
8330 if (new_blobid_max) {
8331 blobid_max = new_blobid_max;
8332 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
8333 }
8334
8335 utime_t finish = ceph_clock_now();
8336 utime_t dur_flush = after_flush - start;
8337 utime_t dur_kv = finish - after_flush;
8338 utime_t dur = finish - start;
8339 dout(20) << __func__ << " committed " << kv_committing.size()
8340 << " cleaned " << deferred_stable.size()
8341 << " in " << dur
8342 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
8343 << dendl;
8344 if (logger) {
8345 logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
8346 logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
8347 logger->tinc(l_bluestore_kv_lat, dur);
8348 }
8349
8350 if (bluefs) {
8351 if (!bluefs_gift_extents.empty()) {
8352 _commit_bluefs_freespace(bluefs_gift_extents);
8353 }
8354 for (auto p = bluefs_extents_reclaiming.begin();
8355 p != bluefs_extents_reclaiming.end();
8356 ++p) {
8357 dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
8358 << p.get_start() << "~" << p.get_len() << std::dec
8359 << dendl;
8360 alloc->release(p.get_start(), p.get_len());
8361 }
8362 bluefs_extents_reclaiming.clear();
8363 }
8364
8365 {
8366 std::unique_lock<std::mutex> m(kv_finalize_lock);
8367 if (kv_committing_to_finalize.empty()) {
8368 kv_committing_to_finalize.swap(kv_committing);
8369 } else {
8370 kv_committing_to_finalize.insert(
8371 kv_committing_to_finalize.end(),
8372 kv_committing.begin(),
8373 kv_committing.end());
8374 kv_committing.clear();
8375 }
8376 if (deferred_stable_to_finalize.empty()) {
8377 deferred_stable_to_finalize.swap(deferred_stable);
8378 } else {
8379 deferred_stable_to_finalize.insert(
8380 deferred_stable_to_finalize.end(),
8381 deferred_stable.begin(),
8382 deferred_stable.end());
8383 deferred_stable.clear();
8384 }
8385 kv_finalize_cond.notify_one();
8386 }
8387
8388 l.lock();
8389 // previously deferred "done" are now "stable" by virtue of this
8390 // commit cycle.
8391 deferred_stable_queue.swap(deferred_done);
8392 }
8393 }
8394 dout(10) << __func__ << " finish" << dendl;
8395 kv_sync_started = false;
8396 }
8397
8398 void BlueStore::_kv_finalize_thread()
8399 {
8400 deque<TransContext*> kv_committed;
8401 deque<DeferredBatch*> deferred_stable;
8402 dout(10) << __func__ << " start" << dendl;
8403 std::unique_lock<std::mutex> l(kv_finalize_lock);
8404 assert(!kv_finalize_started);
8405 kv_finalize_started = true;
8406 kv_finalize_cond.notify_all();
8407 while (true) {
8408 assert(kv_committed.empty());
8409 assert(deferred_stable.empty());
8410 if (kv_committing_to_finalize.empty() &&
8411 deferred_stable_to_finalize.empty()) {
8412 if (kv_finalize_stop)
8413 break;
8414 dout(20) << __func__ << " sleep" << dendl;
8415 kv_finalize_cond.wait(l);
8416 dout(20) << __func__ << " wake" << dendl;
8417 } else {
8418 kv_committed.swap(kv_committing_to_finalize);
8419 deferred_stable.swap(deferred_stable_to_finalize);
8420 l.unlock();
8421 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
8422 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
8423
8424 while (!kv_committed.empty()) {
8425 TransContext *txc = kv_committed.front();
8426 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8427 _txc_state_proc(txc);
8428 kv_committed.pop_front();
8429 }
8430
8431 for (auto b : deferred_stable) {
8432 auto p = b->txcs.begin();
8433 while (p != b->txcs.end()) {
8434 TransContext *txc = &*p;
8435 p = b->txcs.erase(p); // unlink here because
8436 _txc_state_proc(txc); // this may destroy txc
8437 }
8438 delete b;
8439 }
8440 deferred_stable.clear();
8441
8442 if (!deferred_aggressive) {
8443 std::lock_guard<std::mutex> l(deferred_lock);
8444 if (deferred_queue_size >= deferred_batch_ops.load() ||
8445 throttle_deferred_bytes.past_midpoint()) {
8446 _deferred_try_submit();
8447 }
8448 }
8449
8450 // this is as good a place as any ...
8451 _reap_collections();
8452
8453 l.lock();
8454 }
8455 }
8456 dout(10) << __func__ << " finish" << dendl;
8457 kv_finalize_started = false;
8458 }
8459
8460 bluestore_deferred_op_t *BlueStore::_get_deferred_op(
8461 TransContext *txc, OnodeRef o)
8462 {
8463 if (!txc->deferred_txn) {
8464 txc->deferred_txn = new bluestore_deferred_transaction_t;
8465 }
8466 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
8467 return &txc->deferred_txn->ops.back();
8468 }
8469
8470 void BlueStore::_deferred_queue(TransContext *txc)
8471 {
8472 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
8473 std::lock_guard<std::mutex> l(deferred_lock);
8474 if (!txc->osr->deferred_pending &&
8475 !txc->osr->deferred_running) {
8476 deferred_queue.push_back(*txc->osr);
8477 }
8478 if (!txc->osr->deferred_pending) {
8479 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
8480 }
8481 ++deferred_queue_size;
8482 txc->osr->deferred_pending->txcs.push_back(*txc);
8483 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
8484 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
8485 const auto& op = *opi;
8486 assert(op.op == bluestore_deferred_op_t::OP_WRITE);
8487 bufferlist::const_iterator p = op.data.begin();
8488 for (auto e : op.extents) {
8489 txc->osr->deferred_pending->prepare_write(
8490 cct, wt.seq, e.offset, e.length, p);
8491 }
8492 }
8493 if (deferred_aggressive &&
8494 !txc->osr->deferred_running) {
8495 _deferred_submit(txc->osr.get());
8496 }
8497 }
8498
8499 void BlueStore::_deferred_try_submit()
8500 {
8501 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
8502 << deferred_queue_size << " txcs" << dendl;
8503 for (auto& osr : deferred_queue) {
8504 if (!osr.deferred_running) {
8505 _deferred_submit(&osr);
8506 }
8507 }
8508 }
8509
8510 void BlueStore::_deferred_submit(OpSequencer *osr)
8511 {
8512 dout(10) << __func__ << " osr " << osr
8513 << " " << osr->deferred_pending->iomap.size() << " ios pending "
8514 << dendl;
8515 assert(osr->deferred_pending);
8516 assert(!osr->deferred_running);
8517
8518 auto b = osr->deferred_pending;
8519 deferred_queue_size -= b->seq_bytes.size();
8520 assert(deferred_queue_size >= 0);
8521
8522 osr->deferred_running = osr->deferred_pending;
8523 osr->deferred_pending = nullptr;
8524
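// walk the device-offset-sorted iomap, coalescing physically contiguous
// entries into a single bufferlist and issuing one aio_write per contiguous
// run (plus a final write once the map is exhausted).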
8525 uint64_t start = 0, pos = 0;
8526 bufferlist bl;
8527 auto i = b->iomap.begin();
8528 while (true) {
8529 if (i == b->iomap.end() || i->first != pos) {
8530 if (bl.length()) {
8531 dout(20) << __func__ << " write 0x" << std::hex
8532 << start << "~" << bl.length()
8533 << " crc " << bl.crc32c(-1) << std::dec << dendl;
8534 if (!g_conf->bluestore_debug_omit_block_device_write) {
8535 logger->inc(l_bluestore_deferred_write_ops);
8536 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
8537 int r = bdev->aio_write(start, bl, &b->ioc, false);
8538 assert(r == 0);
8539 }
8540 }
8541 if (i == b->iomap.end()) {
8542 break;
8543 }
8544 start = 0;
8545 pos = i->first;
8546 bl.clear();
8547 }
8548 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
8549 << std::hex << pos << "~" << i->second.bl.length() << std::dec
8550 << dendl;
8551 if (!bl.length()) {
8552 start = pos;
8553 }
8554 pos += i->second.bl.length();
8555 bl.claim_append(i->second.bl);
8556 ++i;
8557 }
8558 bdev->aio_submit(&b->ioc);
8559 }
8560
8561 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
8562 {
8563 dout(10) << __func__ << " osr " << osr << dendl;
8564 assert(osr->deferred_running);
8565 DeferredBatch *b = osr->deferred_running;
8566
8567 {
8568 std::lock_guard<std::mutex> l(deferred_lock);
8569 assert(osr->deferred_running == b);
8570 osr->deferred_running = nullptr;
8571 if (!osr->deferred_pending) {
8572 auto q = deferred_queue.iterator_to(*osr);
8573 deferred_queue.erase(q);
8574 } else if (deferred_aggressive) {
8575 _deferred_submit(osr);
8576 }
8577 }
8578
8579 {
8580 uint64_t costs = 0;
8581 std::lock_guard<std::mutex> l2(osr->qlock);
8582 for (auto& i : b->txcs) {
8583 TransContext *txc = &i;
8584 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
8585 costs += txc->cost;
8586 }
8587 osr->qcond.notify_all();
8588 throttle_deferred_bytes.put(costs);
8589 std::lock_guard<std::mutex> l(kv_lock);
8590 deferred_done_queue.emplace_back(b);
8591 }
8592
8593 // in the normal case, do not bother waking up the kv thread; it will
8594 // catch us on the next commit anyway.
8595 if (deferred_aggressive) {
8596 std::lock_guard<std::mutex> l(kv_lock);
8597 kv_cond.notify_one();
8598 }
8599 }
8600
8601 int BlueStore::_deferred_replay()
8602 {
8603 dout(10) << __func__ << " start" << dendl;
8604 OpSequencerRef osr = new OpSequencer(cct, this);
8605 int count = 0;
8606 int r = 0;
8607 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
8608 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
8609 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
8610 << dendl;
8611 bluestore_deferred_transaction_t *deferred_txn =
8612 new bluestore_deferred_transaction_t;
8613 bufferlist bl = it->value();
8614 bufferlist::iterator p = bl.begin();
8615 try {
8616 ::decode(*deferred_txn, p);
8617 } catch (buffer::error& e) {
8618 derr << __func__ << " failed to decode deferred txn "
8619 << pretty_binary_string(it->key()) << dendl;
8620 delete deferred_txn;
8621 r = -EIO;
8622 goto out;
8623 }
8624 TransContext *txc = _txc_create(osr.get());
8625 txc->deferred_txn = deferred_txn;
8626 txc->state = TransContext::STATE_KV_DONE;
8627 _txc_state_proc(txc);
8628 }
8629 out:
8630 dout(20) << __func__ << " draining osr" << dendl;
8631 _osr_drain_all();
8632 osr->discard();
8633 dout(10) << __func__ << " completed " << count << " events" << dendl;
8634 return r;
8635 }
8636
8637 // ---------------------------
8638 // transactions
8639
8640 int BlueStore::queue_transactions(
8641 Sequencer *posr,
8642 vector<Transaction>& tls,
8643 TrackedOpRef op,
8644 ThreadPool::TPHandle *handle)
8645 {
8646 FUNCTRACE();
8647 Context *onreadable;
8648 Context *ondisk;
8649 Context *onreadable_sync;
8650 ObjectStore::Transaction::collect_contexts(
8651 tls, &onreadable, &ondisk, &onreadable_sync);
8652
8653 if (cct->_conf->objectstore_blackhole) {
8654 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
8655 << dendl;
8656 delete ondisk;
8657 delete onreadable;
8658 delete onreadable_sync;
8659 return 0;
8660 }
8661 utime_t start = ceph_clock_now();
8662 // set up the sequencer
8663 OpSequencer *osr;
8664 assert(posr);
8665 if (posr->p) {
8666 osr = static_cast<OpSequencer *>(posr->p.get());
8667 dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
8668 } else {
8669 osr = new OpSequencer(cct, this);
8670 osr->parent = posr;
8671 posr->p = osr;
8672 dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
8673 }
8674
8675 // prepare
8676 TransContext *txc = _txc_create(osr);
8677 txc->onreadable = onreadable;
8678 txc->onreadable_sync = onreadable_sync;
8679 txc->oncommit = ondisk;
8680
8681 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
8682 (*p).set_osr(osr);
8683 txc->bytes += (*p).get_num_bytes();
8684 _txc_add_transaction(txc, &(*p));
8685 }
8686 _txc_calc_cost(txc);
8687
8688 _txc_write_nodes(txc, txc->t);
8689
8690 // journal deferred items
8691 if (txc->deferred_txn) {
8692 txc->deferred_txn->seq = ++deferred_seq;
8693 bufferlist bl;
8694 ::encode(*txc->deferred_txn, bl);
8695 string key;
8696 get_deferred_key(txc->deferred_txn->seq, &key);
8697 txc->t->set(PREFIX_DEFERRED, key, bl);
8698 }
8699
8700 _txc_finalize_kv(txc, txc->t);
8701 if (handle)
8702 handle->suspend_tp_timeout();
8703
8704 utime_t tstart = ceph_clock_now();
8705 throttle_bytes.get(txc->cost);
8706 if (txc->deferred_txn) {
8707 // ensure we do not block here because of deferred writes
8708 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
8709 deferred_try_submit();
8710 throttle_deferred_bytes.get(txc->cost);
8711 }
8712 }
8713 utime_t tend = ceph_clock_now();
8714
8715 if (handle)
8716 handle->reset_tp_timeout();
8717
8718 logger->inc(l_bluestore_txc);
8719
8720 // execute (start)
8721 _txc_state_proc(txc);
8722
8723 logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
8724 logger->tinc(l_bluestore_throttle_lat, tend - tstart);
8725 return 0;
8726 }
8727
8728 void BlueStore::_txc_aio_submit(TransContext *txc)
8729 {
8730 dout(10) << __func__ << " txc " << txc << dendl;
8731 bdev->aio_submit(&txc->ioc);
8732 }
8733
8734 void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
8735 {
8736 Transaction::iterator i = t->begin();
8737
8738 _dump_transaction(t);
8739
8740 vector<CollectionRef> cvec(i.colls.size());
8741 unsigned j = 0;
8742 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
8743 ++p, ++j) {
8744 cvec[j] = _get_collection(*p);
8745
8746 // note first collection we reference
8747 if (!txc->first_collection)
8748 txc->first_collection = cvec[j];
8749 }
8750 vector<OnodeRef> ovec(i.objects.size());
8751
8752 for (int pos = 0; i.have_op(); ++pos) {
8753 Transaction::Op *op = i.decode_op();
8754 int r = 0;
8755
8756 // no coll or obj
8757 if (op->op == Transaction::OP_NOP)
8758 continue;
8759
8760 // collection operations
8761 CollectionRef &c = cvec[op->cid];
8762 switch (op->op) {
8763 case Transaction::OP_RMCOLL:
8764 {
8765 const coll_t &cid = i.get_cid(op->cid);
8766 r = _remove_collection(txc, cid, &c);
8767 if (!r)
8768 continue;
8769 }
8770 break;
8771
8772 case Transaction::OP_MKCOLL:
8773 {
8774 assert(!c);
8775 const coll_t &cid = i.get_cid(op->cid);
8776 r = _create_collection(txc, cid, op->split_bits, &c);
8777 if (!r)
8778 continue;
8779 }
8780 break;
8781
8782 case Transaction::OP_SPLIT_COLLECTION:
8783 assert(0 == "deprecated");
8784 break;
8785
8786 case Transaction::OP_SPLIT_COLLECTION2:
8787 {
8788 uint32_t bits = op->split_bits;
8789 uint32_t rem = op->split_rem;
8790 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
8791 if (!r)
8792 continue;
8793 }
8794 break;
8795
8796 case Transaction::OP_COLL_HINT:
8797 {
8798 uint32_t type = op->hint_type;
8799 bufferlist hint;
8800 i.decode_bl(hint);
8801 bufferlist::iterator hiter = hint.begin();
8802 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
8803 uint32_t pg_num;
8804 uint64_t num_objs;
8805 ::decode(pg_num, hiter);
8806 ::decode(num_objs, hiter);
8807 dout(10) << __func__ << " collection hint objects is a no-op, "
8808 << " pg_num " << pg_num << " num_objects " << num_objs
8809 << dendl;
8810 } else {
8811 // Ignore the hint
8812 dout(10) << __func__ << " unknown collection hint " << type << dendl;
8813 }
8814 continue;
8815 }
8816 break;
8817
8818 case Transaction::OP_COLL_SETATTR:
8819 r = -EOPNOTSUPP;
8820 break;
8821
8822 case Transaction::OP_COLL_RMATTR:
8823 r = -EOPNOTSUPP;
8824 break;
8825
8826 case Transaction::OP_COLL_RENAME:
8827 assert(0 == "not implemented");
8828 break;
8829 }
8830 if (r < 0) {
8831 derr << __func__ << " error " << cpp_strerror(r)
8832 << " not handled on operation " << op->op
8833 << " (op " << pos << ", counting from 0)" << dendl;
8834 _dump_transaction(t, 0);
8835 assert(0 == "unexpected error");
8836 }
8837
8838 // these operations implicitly create the object
8839 bool create = false;
8840 if (op->op == Transaction::OP_TOUCH ||
8841 op->op == Transaction::OP_WRITE ||
8842 op->op == Transaction::OP_ZERO) {
8843 create = true;
8844 }
8845
8846 // object operations
8847 RWLock::WLocker l(c->lock);
8848 OnodeRef &o = ovec[op->oid];
8849 if (!o) {
8850 ghobject_t oid = i.get_oid(op->oid);
8851 o = c->get_onode(oid, create);
8852 }
8853 if (!create && (!o || !o->exists)) {
8854 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
8855 << i.get_oid(op->oid) << dendl;
8856 r = -ENOENT;
8857 goto endop;
8858 }
8859
8860 switch (op->op) {
8861 case Transaction::OP_TOUCH:
8862 r = _touch(txc, c, o);
8863 break;
8864
8865 case Transaction::OP_WRITE:
8866 {
8867 uint64_t off = op->off;
8868 uint64_t len = op->len;
8869 uint32_t fadvise_flags = i.get_fadvise_flags();
8870 bufferlist bl;
8871 i.decode_bl(bl);
8872 r = _write(txc, c, o, off, len, bl, fadvise_flags);
8873 }
8874 break;
8875
8876 case Transaction::OP_ZERO:
8877 {
8878 uint64_t off = op->off;
8879 uint64_t len = op->len;
8880 r = _zero(txc, c, o, off, len);
8881 }
8882 break;
8883
8884 case Transaction::OP_TRIMCACHE:
8885 {
8886 // deprecated, no-op
8887 }
8888 break;
8889
8890 case Transaction::OP_TRUNCATE:
8891 {
8892 uint64_t off = op->off;
8893 _truncate(txc, c, o, off);
8894 }
8895 break;
8896
8897 case Transaction::OP_REMOVE:
8898 {
8899 r = _remove(txc, c, o);
8900 }
8901 break;
8902
8903 case Transaction::OP_SETATTR:
8904 {
8905 string name = i.decode_string();
8906 bufferptr bp;
8907 i.decode_bp(bp);
8908 r = _setattr(txc, c, o, name, bp);
8909 }
8910 break;
8911
8912 case Transaction::OP_SETATTRS:
8913 {
8914 map<string, bufferptr> aset;
8915 i.decode_attrset(aset);
8916 r = _setattrs(txc, c, o, aset);
8917 }
8918 break;
8919
8920 case Transaction::OP_RMATTR:
8921 {
8922 string name = i.decode_string();
8923 r = _rmattr(txc, c, o, name);
8924 }
8925 break;
8926
8927 case Transaction::OP_RMATTRS:
8928 {
8929 r = _rmattrs(txc, c, o);
8930 }
8931 break;
8932
8933 case Transaction::OP_CLONE:
8934 {
8935 OnodeRef& no = ovec[op->dest_oid];
8936 if (!no) {
8937 const ghobject_t& noid = i.get_oid(op->dest_oid);
8938 no = c->get_onode(noid, true);
8939 }
8940 r = _clone(txc, c, o, no);
8941 }
8942 break;
8943
8944 case Transaction::OP_CLONERANGE:
8945 assert(0 == "deprecated");
8946 break;
8947
8948 case Transaction::OP_CLONERANGE2:
8949 {
8950 OnodeRef& no = ovec[op->dest_oid];
8951 if (!no) {
8952 const ghobject_t& noid = i.get_oid(op->dest_oid);
8953 no = c->get_onode(noid, true);
8954 }
8955 uint64_t srcoff = op->off;
8956 uint64_t len = op->len;
8957 uint64_t dstoff = op->dest_off;
8958 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
8959 }
8960 break;
8961
8962 case Transaction::OP_COLL_ADD:
8963 assert(0 == "not implemented");
8964 break;
8965
8966 case Transaction::OP_COLL_REMOVE:
8967 assert(0 == "not implemented");
8968 break;
8969
8970 case Transaction::OP_COLL_MOVE:
8971 assert(0 == "deprecated");
8972 break;
8973
8974 case Transaction::OP_COLL_MOVE_RENAME:
8975 case Transaction::OP_TRY_RENAME:
8976 {
8977 assert(op->cid == op->dest_cid);
8978 const ghobject_t& noid = i.get_oid(op->dest_oid);
8979 OnodeRef& no = ovec[op->dest_oid];
8980 if (!no) {
8981 no = c->get_onode(noid, false);
8982 }
8983 r = _rename(txc, c, o, no, noid);
8984 }
8985 break;
8986
8987 case Transaction::OP_OMAP_CLEAR:
8988 {
8989 r = _omap_clear(txc, c, o);
8990 }
8991 break;
8992 case Transaction::OP_OMAP_SETKEYS:
8993 {
8994 bufferlist aset_bl;
8995 i.decode_attrset_bl(&aset_bl);
8996 r = _omap_setkeys(txc, c, o, aset_bl);
8997 }
8998 break;
8999 case Transaction::OP_OMAP_RMKEYS:
9000 {
9001 bufferlist keys_bl;
9002 i.decode_keyset_bl(&keys_bl);
9003 r = _omap_rmkeys(txc, c, o, keys_bl);
9004 }
9005 break;
9006 case Transaction::OP_OMAP_RMKEYRANGE:
9007 {
9008 string first, last;
9009 first = i.decode_string();
9010 last = i.decode_string();
9011 r = _omap_rmkey_range(txc, c, o, first, last);
9012 }
9013 break;
9014 case Transaction::OP_OMAP_SETHEADER:
9015 {
9016 bufferlist bl;
9017 i.decode_bl(bl);
9018 r = _omap_setheader(txc, c, o, bl);
9019 }
9020 break;
9021
9022 case Transaction::OP_SETALLOCHINT:
9023 {
9024 r = _set_alloc_hint(txc, c, o,
9025 op->expected_object_size,
9026 op->expected_write_size,
9027 op->alloc_hint_flags);
9028 }
9029 break;
9030
9031 default:
9032 derr << __func__ << " bad op " << op->op << dendl;
9033 ceph_abort();
9034 }
9035
9036 endop:
9037 if (r < 0) {
9038 bool ok = false;
9039
9040 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
9041 op->op == Transaction::OP_CLONE ||
9042 op->op == Transaction::OP_CLONERANGE2 ||
9043 op->op == Transaction::OP_COLL_ADD ||
9044 op->op == Transaction::OP_SETATTR ||
9045 op->op == Transaction::OP_SETATTRS ||
9046 op->op == Transaction::OP_RMATTR ||
9047 op->op == Transaction::OP_OMAP_SETKEYS ||
9048 op->op == Transaction::OP_OMAP_RMKEYS ||
9049 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
9050 op->op == Transaction::OP_OMAP_SETHEADER))
9051 // -ENOENT is usually okay
9052 ok = true;
9053 if (r == -ENODATA)
9054 ok = true;
9055
9056 if (!ok) {
9057 const char *msg = "unexpected error code";
9058
9059 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
9060 op->op == Transaction::OP_CLONE ||
9061 op->op == Transaction::OP_CLONERANGE2))
9062 msg = "ENOENT on clone suggests osd bug";
9063
9064 if (r == -ENOSPC)
9065 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
9066 // by partially applying transactions.
9067 msg = "ENOSPC from bluestore, misconfigured cluster";
9068
9069 if (r == -ENOTEMPTY) {
9070 msg = "ENOTEMPTY suggests garbage data in osd data dir";
9071 }
9072
9073 derr << __func__ << " error " << cpp_strerror(r)
9074 << " not handled on operation " << op->op
9075 << " (op " << pos << ", counting from 0)"
9076 << dendl;
9077 derr << msg << dendl;
9078 _dump_transaction(t, 0);
9079 assert(0 == "unexpected error");
9080 }
9081 }
9082 }
9083 }
9084
9085
9086
9087 // -----------------
9088 // write operations
9089
9090 int BlueStore::_touch(TransContext *txc,
9091 CollectionRef& c,
9092 OnodeRef &o)
9093 {
9094 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
9095 int r = 0;
9096 o->exists = true;
9097 _assign_nid(txc, o);
9098 txc->write_onode(o);
9099 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
9100 return r;
9101 }
9102
9103 void BlueStore::_dump_onode(OnodeRef o, int log_level)
9104 {
9105 if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
9106 return;
9107 dout(log_level) << __func__ << " " << o << " " << o->oid
9108 << " nid " << o->onode.nid
9109 << " size 0x" << std::hex << o->onode.size
9110 << " (" << std::dec << o->onode.size << ")"
9111 << " expected_object_size " << o->onode.expected_object_size
9112 << " expected_write_size " << o->onode.expected_write_size
9113 << " in " << o->onode.extent_map_shards.size() << " shards"
9114 << ", " << o->extent_map.spanning_blob_map.size()
9115 << " spanning blobs"
9116 << dendl;
9117 for (auto p = o->onode.attrs.begin();
9118 p != o->onode.attrs.end();
9119 ++p) {
9120 dout(log_level) << __func__ << " attr " << p->first
9121 << " len " << p->second.length() << dendl;
9122 }
9123 _dump_extent_map(o->extent_map, log_level);
9124 }
9125
9126 void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
9127 {
9128 uint64_t pos = 0;
9129 for (auto& s : em.shards) {
9130 dout(log_level) << __func__ << " shard " << *s.shard_info
9131 << (s.loaded ? " (loaded)" : "")
9132 << (s.dirty ? " (dirty)" : "")
9133 << dendl;
9134 }
9135 for (auto& e : em.extent_map) {
9136 dout(log_level) << __func__ << " " << e << dendl;
9137 assert(e.logical_offset >= pos);
9138 pos = e.logical_offset + e.length;
9139 const bluestore_blob_t& blob = e.blob->get_blob();
9140 if (blob.has_csum()) {
9141 vector<uint64_t> v;
9142 unsigned n = blob.get_csum_count();
9143 for (unsigned i = 0; i < n; ++i)
9144 v.push_back(blob.get_csum_item(i));
9145 dout(log_level) << __func__ << " csum: " << std::hex << v << std::dec
9146 << dendl;
9147 }
9148 std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
9149 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
9150 dout(log_level) << __func__ << " 0x" << std::hex << i.first
9151 << "~" << i.second->length << std::dec
9152 << " " << *i.second << dendl;
9153 }
9154 }
9155 }
9156
9157 void BlueStore::_dump_transaction(Transaction *t, int log_level)
9158 {
9159 dout(log_level) << " transaction dump:\n";
9160 JSONFormatter f(true);
9161 f.open_object_section("transaction");
9162 t->dump(&f);
9163 f.close_section();
9164 f.flush(*_dout);
9165 *_dout << dendl;
9166 }
9167
9168 void BlueStore::_pad_zeros(
9169 bufferlist *bl, uint64_t *offset,
9170 uint64_t chunk_size)
9171 {
9172 auto length = bl->length();
9173 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
9174 << " chunk_size 0x" << chunk_size << std::dec << dendl;
9175 dout(40) << "before:\n";
9176 bl->hexdump(*_dout);
9177 *_dout << dendl;
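// illustrative only (hypothetical sizes): with a 0x1000 chunk_size, a
// 0x400-byte bl at offset 0x1e00 comes back as offset 0x1000 and length
// 0x2000, i.e. 0xe00 zero bytes padded in front and 0xe00 padded behind.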
9178 // front
9179 size_t front_pad = *offset % chunk_size;
9180 size_t back_pad = 0;
9181 size_t pad_count = 0;
9182 if (front_pad) {
9183 size_t front_copy = MIN(chunk_size - front_pad, length);
9184 bufferptr z = buffer::create_page_aligned(chunk_size);
9185 memset(z.c_str(), 0, front_pad);
9186 pad_count += front_pad;
9187 memcpy(z.c_str() + front_pad, bl->get_contiguous(0, front_copy), front_copy);
9188 if (front_copy + front_pad < chunk_size) {
9189 back_pad = chunk_size - (length + front_pad);
9190 memset(z.c_str() + front_pad + length, 0, back_pad);
9191 pad_count += back_pad;
9192 }
9193 bufferlist old, t;
9194 old.swap(*bl);
9195 t.substr_of(old, front_copy, length - front_copy);
9196 bl->append(z);
9197 bl->claim_append(t);
9198 *offset -= front_pad;
9199 length += front_pad + back_pad;
9200 }
9201
9202 // back
9203 uint64_t end = *offset + length;
9204 unsigned back_copy = end % chunk_size;
9205 if (back_copy) {
9206 assert(back_pad == 0);
9207 back_pad = chunk_size - back_copy;
9208 assert(back_copy <= length);
9209 bufferptr tail(chunk_size);
9210 memcpy(tail.c_str(), bl->get_contiguous(length - back_copy, back_copy),
9211 back_copy);
9212 memset(tail.c_str() + back_copy, 0, back_pad);
9213 bufferlist old;
9214 old.swap(*bl);
9215 bl->substr_of(old, 0, length - back_copy);
9216 bl->append(tail);
9217 length += back_pad;
9218 pad_count += back_pad;
9219 }
9220 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
9221 << back_pad << " on front/back, now 0x" << *offset << "~"
9222 << length << std::dec << dendl;
9223 dout(40) << "after:\n";
9224 bl->hexdump(*_dout);
9225 *_dout << dendl;
9226 if (pad_count)
9227 logger->inc(l_bluestore_write_pad_bytes, pad_count);
9228 assert(bl->length() == length);
9229 }
9230
9231 void BlueStore::_do_write_small(
9232 TransContext *txc,
9233 CollectionRef &c,
9234 OnodeRef o,
9235 uint64_t offset, uint64_t length,
9236 bufferlist::iterator& blp,
9237 WriteContext *wctx)
9238 {
9239 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9240 << std::dec << dendl;
9241 assert(length < min_alloc_size);
9242 uint64_t end_offs = offset + length;
9243
9244 logger->inc(l_bluestore_write_small);
9245 logger->inc(l_bluestore_write_small_bytes, length);
9246
9247 bufferlist bl;
9248 blp.copy(length, bl);
9249
9250 // Look for an existing mutable blob we can use.
9251 auto begin = o->extent_map.extent_map.begin();
9252 auto end = o->extent_map.extent_map.end();
9253 auto ep = o->extent_map.seek_lextent(offset);
9254 if (ep != begin) {
9255 --ep;
9256 if (ep->blob_end() <= offset) {
9257 ++ep;
9258 }
9259 }
9260 auto prev_ep = ep;
9261 if (prev_ep != begin) {
9262 --prev_ep;
9263 } else {
9264 prev_ep = end; // to avoid this extent check as it's a duplicate
9265 }
9266
9267 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9268 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9269 uint32_t alloc_len = min_alloc_size;
9270 auto offset0 = P2ALIGN(offset, alloc_len);
9271
9272 bool any_change;
9273
9274 // search for a suitable extent in both the forward and reverse
9275 // directions within the [offset - target_max_blob_size, offset +
9276 // target_max_blob_size] range, then check whether the blob can be
9277 // reused via try_reuse_blob() or a direct/deferred write applied
9278 // (the latter only for extents covering 'offset' or beyond it).
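// illustrative only: with a hypothetical 64K (0x10000) target blob size,
// a small write at offset 0x11000 scans ep forward over lextents whose
// logical offset is below 0x21000, and prev_ep backward down to 0x1000.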
9279 do {
9280 any_change = false;
9281
9282 if (ep != end && ep->logical_offset < offset + max_bsize) {
9283 BlobRef b = ep->blob;
9284 auto bstart = ep->blob_start();
9285 dout(20) << __func__ << " considering " << *b
9286 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9287 if (bstart >= end_offs) {
9288 dout(20) << __func__ << " ignoring distant " << *b << dendl;
9289 } else if (!b->get_blob().is_mutable()) {
9290 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
9291 } else if (ep->logical_offset % min_alloc_size !=
9292 ep->blob_offset % min_alloc_size) {
9293 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
9294 } else {
9295 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9296 // can we pad our head/tail out with zeros?
9297 uint64_t head_pad, tail_pad;
9298 head_pad = P2PHASE(offset, chunk_size);
9299 tail_pad = P2NPHASE(end_offs, chunk_size);
9300 if (head_pad || tail_pad) {
9301 o->extent_map.fault_range(db, offset - head_pad,
9302 end_offs - offset + head_pad + tail_pad);
9303 }
9304 if (head_pad &&
9305 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
9306 head_pad = 0;
9307 }
9308 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
9309 tail_pad = 0;
9310 }
9311
9312 uint64_t b_off = offset - head_pad - bstart;
9313 uint64_t b_len = length + head_pad + tail_pad;
9314
9315 // direct write into unused blocks of an existing mutable blob?
9316 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
9317 b->get_blob().get_ondisk_length() >= b_off + b_len &&
9318 b->get_blob().is_unused(b_off, b_len) &&
9319 b->get_blob().is_allocated(b_off, b_len)) {
9320 bufferlist padded;
9321 _apply_padding(head_pad, tail_pad, bl, padded);
9322
9323 dout(20) << __func__ << " write to unused 0x" << std::hex
9324 << b_off << "~" << b_len
9325 << " pad 0x" << head_pad << " + 0x" << tail_pad
9326 << std::dec << " of mutable " << *b << dendl;
9327 _buffer_cache_write(txc, b, b_off, padded,
9328 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9329
9330 if (!g_conf->bluestore_debug_omit_block_device_write) {
9331 if (b_len <= prefer_deferred_size) {
9332 dout(20) << __func__ << " deferring small 0x" << std::hex
9333 << b_len << std::dec << " unused write via deferred" << dendl;
9334 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9335 op->op = bluestore_deferred_op_t::OP_WRITE;
9336 b->get_blob().map(
9337 b_off, b_len,
9338 [&](uint64_t offset, uint64_t length) {
9339 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9340 return 0;
9341 });
9342 op->data = padded;
9343 } else {
9344 b->get_blob().map_bl(
9345 b_off, padded,
9346 [&](uint64_t offset, bufferlist& t) {
9347 bdev->aio_write(offset, t,
9348 &txc->ioc, wctx->buffered);
9349 });
9350 }
9351 }
9352 b->dirty_blob().calc_csum(b_off, padded);
9353 dout(20) << __func__ << " lex old " << *ep << dendl;
9354 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
9355 b,
9356 &wctx->old_extents);
9357 b->dirty_blob().mark_used(le->blob_offset, le->length);
9358 txc->statfs_delta.stored() += le->length;
9359 dout(20) << __func__ << " lex " << *le << dendl;
9360 logger->inc(l_bluestore_write_small_unused);
9361 return;
9362 }
9363 // read some data to fill out the chunk?
9364 uint64_t head_read = P2PHASE(b_off, chunk_size);
9365 uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
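// illustrative arithmetic only: with a hypothetical 0x1000 chunk_size,
// b_off 0x1800 and b_len 0x1000 give head_read = 0x800 and tail_read =
// 0x800, i.e. the chunk-aligned span 0x1000~0x2000.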
9366 if ((head_read || tail_read) &&
9367 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
9368 head_read + tail_read < min_alloc_size) {
9369 b_off -= head_read;
9370 b_len += head_read + tail_read;
9371
9372 } else {
9373 head_read = tail_read = 0;
9374 }
9375
9376 // chunk-aligned deferred overwrite?
9377 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
9378 b_off % chunk_size == 0 &&
9379 b_len % chunk_size == 0 &&
9380 b->get_blob().is_allocated(b_off, b_len)) {
9381
9382 bufferlist padded;
9383 _apply_padding(head_pad, tail_pad, bl, padded);
9384
9385 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
9386 << " and tail 0x" << tail_read << std::dec << dendl;
9387 if (head_read) {
9388 bufferlist head_bl;
9389 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
9390 head_bl, 0);
9391 assert(r >= 0 && r <= (int)head_read);
9392 size_t zlen = head_read - r;
9393 if (zlen) {
9394 head_bl.append_zero(zlen);
9395 logger->inc(l_bluestore_write_pad_bytes, zlen);
9396 }
9397 head_bl.claim_append(padded);
9398 padded.swap(head_bl);
9399 logger->inc(l_bluestore_write_penalty_read_ops);
9400 }
9401 if (tail_read) {
9402 bufferlist tail_bl;
9403 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
9404 tail_bl, 0);
9405 assert(r >= 0 && r <= (int)tail_read);
9406 size_t zlen = tail_read - r;
9407 if (zlen) {
9408 tail_bl.append_zero(zlen);
9409 logger->inc(l_bluestore_write_pad_bytes, zlen);
9410 }
9411 padded.claim_append(tail_bl);
9412 logger->inc(l_bluestore_write_penalty_read_ops);
9413 }
9414 logger->inc(l_bluestore_write_small_pre_read);
9415
9416 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9417 op->op = bluestore_deferred_op_t::OP_WRITE;
9418 _buffer_cache_write(txc, b, b_off, padded,
9419 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9420
9421 int r = b->get_blob().map(
9422 b_off, b_len,
9423 [&](uint64_t offset, uint64_t length) {
9424 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9425 return 0;
9426 });
9427 assert(r == 0);
9428 if (b->get_blob().csum_type) {
9429 b->dirty_blob().calc_csum(b_off, padded);
9430 }
9431 op->data.claim(padded);
9432 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
9433 << b_len << std::dec << " of mutable " << *b
9434 << " at " << op->extents << dendl;
9435 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
9436 b, &wctx->old_extents);
9437 b->dirty_blob().mark_used(le->blob_offset, le->length);
9438 txc->statfs_delta.stored() += le->length;
9439 dout(20) << __func__ << " lex " << *le << dendl;
9440 logger->inc(l_bluestore_write_small_deferred);
9441 return;
9442 }
9443 // try to reuse blob
9444 if (b->try_reuse_blob(min_alloc_size,
9445 max_bsize,
9446 offset0 - bstart,
9447 &alloc_len)) {
9448 assert(alloc_len == min_alloc_size); // expecting the data to always
9449 // fit into the reused blob
9450 // Need to check for pending writes that want to
9451 // reuse the same pextent. The rationale is that during GC two chunks
9452 // from garbage blobs (compressed?) can share logical space within the same
9453 // AU. That, in turn, might be caused by an unaligned len in clone_range2.
9454 // Hence the second write would fail when attempting to reuse the blob in
9455 // do_alloc_write().
9456 if (!wctx->has_conflict(b,
9457 offset0,
9458 offset0 + alloc_len,
9459 min_alloc_size)) {
9460
9461 // we can't reuse pad_head/pad_tail since they might be truncated
9462 // due to existing extents
9463 uint64_t b_off = offset - bstart;
9464 uint64_t b_off0 = b_off;
9465 _pad_zeros(&bl, &b_off0, chunk_size);
9466
9467 dout(20) << __func__ << " reuse blob " << *b << std::hex
9468 << " (" << b_off0 << "~" << bl.length() << ")"
9469 << " (" << b_off << "~" << length << ")"
9470 << std::dec << dendl;
9471
9472 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9473 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9474 false, false);
9475 logger->inc(l_bluestore_write_small_unused);
9476 return;
9477 }
9478 }
9479 }
9480 ++ep;
9481 any_change = true;
9482 } // if (ep != end && ep->logical_offset < offset + max_bsize)
9483
9484 // check extent for reuse in reverse order
9485 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9486 BlobRef b = prev_ep->blob;
9487 auto bstart = prev_ep->blob_start();
9488 dout(20) << __func__ << " considering " << *b
9489 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9490 if (b->try_reuse_blob(min_alloc_size,
9491 max_bsize,
9492 offset0 - bstart,
9493 &alloc_len)) {
9494 assert(alloc_len == min_alloc_size); // expecting the data to always
9495 // fit into the reused blob
9496 // Need to check for pending writes that want to
9497 // reuse the same pextent. The rationale is that during GC two chunks
9498 // from garbage blobs (compressed?) can share logical space within the same
9499 // AU. That, in turn, might be caused by an unaligned len in clone_range2.
9500 // Hence the second write would fail when attempting to reuse the blob in
9501 // do_alloc_write().
9502 if (!wctx->has_conflict(b,
9503 offset0,
9504 offset0 + alloc_len,
9505 min_alloc_size)) {
9506
9507 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9508 uint64_t b_off = offset - bstart;
9509 uint64_t b_off0 = b_off;
9510 _pad_zeros(&bl, &b_off0, chunk_size);
9511
9512 dout(20) << __func__ << " reuse blob " << *b << std::hex
9513 << " (" << b_off0 << "~" << bl.length() << ")"
9514 << " (" << b_off << "~" << length << ")"
9515 << std::dec << dendl;
9516
9517 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9518 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9519 false, false);
9520 logger->inc(l_bluestore_write_small_unused);
9521 return;
9522 }
9523 }
9524 if (prev_ep != begin) {
9525 --prev_ep;
9526 any_change = true;
9527 } else {
9528 prev_ep = end; // to avoid useless first extent re-check
9529 }
9530 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
9531 } while (any_change);
9532
9533 // new blob.
9534
9535 BlobRef b = c->new_blob();
9536 uint64_t b_off = P2PHASE(offset, alloc_len);
9537 uint64_t b_off0 = b_off;
9538 _pad_zeros(&bl, &b_off0, block_size);
9539 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9540 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
9541 logger->inc(l_bluestore_write_small_new);
9542
9543 return;
9544 }
9545
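// Write the min_alloc_size-aligned middle of an object write: punch a hole
// over the range, then carve the data into pieces of at most max_bsize
// (target_blob_size, but at least min_alloc_size). When not compressing,
// each piece first tries to land in a nearby reusable mutable blob;
// otherwise it gets a fresh blob.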
9546 void BlueStore::_do_write_big(
9547 TransContext *txc,
9548 CollectionRef &c,
9549 OnodeRef o,
9550 uint64_t offset, uint64_t length,
9551 bufferlist::iterator& blp,
9552 WriteContext *wctx)
9553 {
9554 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9555 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
9556 << " compress " << (int)wctx->compress
9557 << dendl;
9558 logger->inc(l_bluestore_write_big);
9559 logger->inc(l_bluestore_write_big_bytes, length);
9560 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9561 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9562 while (length > 0) {
9563 bool new_blob = false;
9564 uint32_t l = MIN(max_bsize, length);
9565 BlobRef b;
9566 uint32_t b_off = 0;
9567
9568 // attempting to reuse existing blob
9569 if (!wctx->compress) {
9570 // look for an existing mutable blob we can reuse
9571 auto begin = o->extent_map.extent_map.begin();
9572 auto end = o->extent_map.extent_map.end();
9573 auto ep = o->extent_map.seek_lextent(offset);
9574 auto prev_ep = ep;
9575 if (prev_ep != begin) {
9576 --prev_ep;
9577 } else {
9578 prev_ep = end; // skip the reverse-direction check; it would duplicate the forward one
9579 }
9580 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9581 // search for a suitable extent in both the forward and reverse directions
9582 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
9583 // range, then check whether the blob can be reused via try_reuse_blob().
9584 bool any_change;
9585 do {
9586 any_change = false;
9587 if (ep != end && ep->logical_offset < offset + max_bsize) {
9588 if (offset >= ep->blob_start() &&
9589 ep->blob->try_reuse_blob(min_alloc_size, max_bsize,
9590 offset - ep->blob_start(),
9591 &l)) {
9592 b = ep->blob;
9593 b_off = offset - ep->blob_start();
9594 prev_ep = end; // to avoid check below
9595 dout(20) << __func__ << " reuse blob " << *b << std::hex
9596 << " (" << b_off << "~" << l << ")" << std::dec << dendl;
9597 } else {
9598 ++ep;
9599 any_change = true;
9600 }
9601 }
9602
9603 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9604 if (prev_ep->blob->try_reuse_blob(min_alloc_size, max_bsize,
9605 offset - prev_ep->blob_start(),
9606 &l)) {
9607 b = prev_ep->blob;
9608 b_off = offset - prev_ep->blob_start();
9609 dout(20) << __func__ << " reuse blob " << *b << std::hex
9610 << " (" << b_off << "~" << l << ")" << std::dec << dendl;
9611 } else if (prev_ep != begin) {
9612 --prev_ep;
9613 any_change = true;
9614 } else {
9615 prev_ep = end; // to avoid useless first extent re-check
9616 }
9617 }
9618 } while (b == nullptr && any_change);
9619 }
9620 if (b == nullptr) {
9621 b = c->new_blob();
9622 b_off = 0;
9623 new_blob = true;
9624 }
9625
9626 bufferlist t;
9627 blp.copy(l, t);
9628 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
9629 offset += l;
9630 length -= l;
9631 logger->inc(l_bluestore_write_big_blobs);
9632 }
9633 }
9634
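// Reserve space for all queued writes up front, then for each blob in wctx:
// optionally compress the data, allocate physical extents, initialize
// checksums, record the logical extent in the onode, and queue the I/O
// (deferred for small writes, aio otherwise).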
9635 int BlueStore::_do_alloc_write(
9636 TransContext *txc,
9637 CollectionRef coll,
9638 OnodeRef o,
9639 WriteContext *wctx)
9640 {
9641 dout(20) << __func__ << " txc " << txc
9642 << " " << wctx->writes.size() << " blobs"
9643 << dendl;
9644
9645 uint64_t need = 0;
9646 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9647 for (auto &wi : wctx->writes) {
9648 need += wi.blob_length;
9649 }
9650 int r = alloc->reserve(need);
9651 if (r < 0) {
9652 derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
9653 << dendl;
9654 return r;
9655 }
9656
9657 uint64_t hint = 0;
9658 CompressorRef c;
9659 double crr = 0;
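// the compression algorithm and required ratio can be overridden per pool;
// select_option() prefers the pool's value (when the lambda finds one) over
// the global default.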
9660 if (wctx->compress) {
9661 c = select_option(
9662 "compression_algorithm",
9663 compressor,
9664 [&]() {
9665 string val;
9666 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
9667 CompressorRef cp = compressor;
9668 if (!cp || cp->get_type_name() != val) {
9669 cp = Compressor::create(cct, val);
9670 }
9671 return boost::optional<CompressorRef>(cp);
9672 }
9673 return boost::optional<CompressorRef>();
9674 }
9675 );
9676
9677 crr = select_option(
9678 "compression_required_ratio",
9679 cct->_conf->bluestore_compression_required_ratio,
9680 [&]() {
9681 double val;
9682 if(coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
9683 return boost::optional<double>(val);
9684 }
9685 return boost::optional<double>();
9686 }
9687 );
9688 }
9689
9690 // checksum
9691 int csum = csum_type.load();
9692 csum = select_option(
9693 "csum_type",
9694 csum,
9695 [&]() {
9696 int val;
9697 if(coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
9698 return boost::optional<int>(val);
9699 }
9700 return boost::optional<int>();
9701 }
9702 );
9703
9704 for (auto& wi : wctx->writes) {
9705 BlobRef b = wi.b;
9706 bluestore_blob_t& dblob = b->dirty_blob();
9707 uint64_t b_off = wi.b_off;
9708 bufferlist *l = &wi.bl;
9709 uint64_t final_length = wi.blob_length;
9710 uint64_t csum_length = wi.blob_length;
9711 unsigned csum_order = block_size_order;
9712 bufferlist compressed_bl;
9713 bool compressed = false;
9714 if(c && wi.blob_length > min_alloc_size) {
9715
9716 utime_t start = ceph_clock_now();
9717
9718 // compress
9719 assert(b_off == 0);
9720 assert(wi.blob_length == l->length());
9721 bluestore_compression_header_t chdr;
9722 chdr.type = c->get_type();
9723 // FIXME: memory alignment here is bad
9724 bufferlist t;
9725
9726 r = c->compress(*l, t);
9727 assert(r == 0);
9728
9729 chdr.length = t.length();
9730 ::encode(chdr, compressed_bl);
9731 compressed_bl.claim_append(t);
9732 uint64_t rawlen = compressed_bl.length();
9733 uint64_t newlen = P2ROUNDUP(rawlen, min_alloc_size);
9734 uint64_t want_len_raw = final_length * crr;
9735 uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
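// e.g. (illustrative numbers) with crr = 0.875, final_length = 0x40000 and
// min_alloc_size = 0x10000: want_len = 0x38000, so a compressed payload
// that rounds up to 0x30000 is kept, while one that rounds up past 0x38000
// is rejected below.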
9736 if (newlen <= want_len && newlen < final_length) {
9737 // Cool. We compressed at least as much as we were hoping to.
9738 // pad out to min_alloc_size
9739 compressed_bl.append_zero(newlen - rawlen);
9740 logger->inc(l_bluestore_write_pad_bytes, newlen - rawlen);
9741 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
9742 << " -> 0x" << rawlen << " => 0x" << newlen
9743 << " with " << c->get_type()
9744 << std::dec << dendl;
9745 txc->statfs_delta.compressed() += rawlen;
9746 txc->statfs_delta.compressed_original() += l->length();
9747 txc->statfs_delta.compressed_allocated() += newlen;
9748 l = &compressed_bl;
9749 final_length = newlen;
9750 csum_length = newlen;
9751 csum_order = ctz(newlen);
9752 dblob.set_compressed(wi.blob_length, rawlen);
9753 compressed = true;
9754 logger->inc(l_bluestore_compress_success_count);
9755 } else {
9756 dout(20) << __func__ << std::hex << " 0x" << l->length()
9757 << " compressed to 0x" << rawlen << " -> 0x" << newlen
9758 << " with " << c->get_type()
9759 << ", which is more than required 0x" << want_len_raw
9760 << " -> 0x" << want_len
9761 << ", leaving uncompressed"
9762 << std::dec << dendl;
9763 logger->inc(l_bluestore_compress_rejected_count);
9764 }
9765 logger->tinc(l_bluestore_compress_lat,
9766 ceph_clock_now() - start);
9767 }
9768 if (!compressed && wi.new_blob) {
9769 // initialize newly created blob only
9770 assert(dblob.is_mutable());
9771 if (l->length() != wi.blob_length) {
9772 // hrm, maybe we could do better here, but let's not bother.
9773 dout(20) << __func__ << " forcing csum_order to block_size_order "
9774 << block_size_order << dendl;
9775 csum_order = block_size_order;
9776 } else {
9777 csum_order = std::min(wctx->csum_order, ctz(l->length()));
9778 }
9779 // try to align the blob with max_blob_size to improve
9780 // its reuse ratio, e.g. in the case of a reverse write
9781 uint32_t suggested_boff =
9782 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
9783 if ((suggested_boff % (1 << csum_order)) == 0 &&
9784 suggested_boff + final_length <= max_bsize &&
9785 suggested_boff > b_off) {
9786 dout(20) << __func__ << " forcing blob_offset to "
9787 << std::hex << suggested_boff << std::dec << dendl;
9788 assert(suggested_boff >= b_off);
9789 csum_length += suggested_boff - b_off;
9790 b_off = suggested_boff;
9791 }
9792 }
9793
9794 AllocExtentVector extents;
9795 extents.reserve(4); // 4 should be (more than) enough for most allocations
9796 int64_t got = alloc->allocate(final_length, min_alloc_size,
9797 max_alloc_size.load(),
9798 hint, &extents);
9799 assert(got == (int64_t)final_length);
9800 need -= got;
9801 txc->statfs_delta.allocated() += got;
9802 for (auto& p : extents) {
9803 bluestore_pextent_t e = bluestore_pextent_t(p);
9804 txc->allocated.insert(e.offset, e.length);
9805 hint = p.end();
9806 }
9807 dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
9808
9809 dout(20) << __func__ << " blob " << *b
9810 << " csum_type " << Checksummer::get_csum_type_string(csum)
9811 << " csum_order " << csum_order
9812 << " csum_length 0x" << std::hex << csum_length << std::dec
9813 << dendl;
9814
9815 if (csum != Checksummer::CSUM_NONE) {
9816 if (!dblob.has_csum()) {
9817 dblob.init_csum(csum, csum_order, csum_length);
9818 }
9819 dblob.calc_csum(b_off, *l);
9820 }
9821 if (wi.mark_unused) {
9822 auto b_end = b_off + wi.bl.length();
9823 if (b_off) {
9824 dblob.add_unused(0, b_off);
9825 }
9826 if (b_end < wi.blob_length) {
9827 dblob.add_unused(b_end, wi.blob_length - b_end);
9828 }
9829 }
9830
9831 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
9832 b_off + (wi.b_off0 - wi.b_off),
9833 wi.length0,
9834 wi.b,
9835 nullptr);
9836 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
9837 txc->statfs_delta.stored() += le->length;
9838 dout(20) << __func__ << " lex " << *le << dendl;
9839 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
9840 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9841
9842 // queue io
9843 if (!g_conf->bluestore_debug_omit_block_device_write) {
9844 if (l->length() <= prefer_deferred_size.load()) {
9845 dout(20) << __func__ << " deferring small 0x" << std::hex
9846 << l->length() << std::dec << " write via deferred" << dendl;
9847 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9848 op->op = bluestore_deferred_op_t::OP_WRITE;
9849 int r = b->get_blob().map(
9850 b_off, l->length(),
9851 [&](uint64_t offset, uint64_t length) {
9852 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9853 return 0;
9854 });
9855 assert(r == 0);
9856 op->data = *l;
9857 } else {
9858 b->get_blob().map_bl(
9859 b_off, *l,
9860 [&](uint64_t offset, bufferlist& t) {
9861 bdev->aio_write(offset, t, &txc->ioc, false);
9862 });
9863 }
9864 }
9865 }
9866 if (need > 0) {
9867 alloc->unreserve(need);
9868 }
9869 return 0;
9870 }
9871
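// Release the old extents displaced by a write: adjust compressed/stored
// statistics, drop shared-blob references (collecting blobs that may have
// become unshared), hand freed space back via txc->released, and prune
// spanning blobs that are no longer referenced.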
9872 void BlueStore::_wctx_finish(
9873 TransContext *txc,
9874 CollectionRef& c,
9875 OnodeRef o,
9876 WriteContext *wctx,
9877 set<SharedBlob*> *maybe_unshared_blobs)
9878 {
9879 auto oep = wctx->old_extents.begin();
9880 while (oep != wctx->old_extents.end()) {
9881 auto &lo = *oep;
9882 oep = wctx->old_extents.erase(oep);
9883 dout(20) << __func__ << " lex_old " << lo.e << dendl;
9884 BlobRef b = lo.e.blob;
9885 const bluestore_blob_t& blob = b->get_blob();
9886 if (blob.is_compressed()) {
9887 if (lo.blob_empty) {
9888 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
9889 }
9890 txc->statfs_delta.compressed_original() -= lo.e.length;
9891 }
9892 auto& r = lo.r;
9893 txc->statfs_delta.stored() -= lo.e.length;
9894 if (!r.empty()) {
9895 dout(20) << __func__ << " blob release " << r << dendl;
9896 if (blob.is_shared()) {
9897 PExtentVector final;
9898 c->load_shared_blob(b->shared_blob);
9899 for (auto e : r) {
9900 b->shared_blob->put_ref(
9901 e.offset, e.length, &final,
9902 b->is_referenced() ? nullptr : maybe_unshared_blobs);
9903 }
9904 dout(20) << __func__ << " shared_blob release " << final
9905 << " from " << *b->shared_blob << dendl;
9906 txc->write_shared_blob(b->shared_blob);
9907 r.clear();
9908 r.swap(final);
9909 }
9910 }
9911 // we can't invalidate our logical extents as we drop them because
9912 // other lextents (either in our onode or others) may still
9913 // reference them. But we can throw out anything that is no
9914 // longer allocated. Note that this will leave behind edge bits
9915 // that are no longer referenced but not deallocated (until they
9916 // age out of the cache naturally).
9917 b->discard_unallocated(c.get());
9918 for (auto e : r) {
9919 dout(20) << __func__ << " release " << e << dendl;
9920 txc->released.insert(e.offset, e.length);
9921 txc->statfs_delta.allocated() -= e.length;
9922 if (blob.is_compressed()) {
9923 txc->statfs_delta.compressed_allocated() -= e.length;
9924 }
9925 }
9926 delete &lo;
9927 if (b->is_spanning() && !b->is_referenced()) {
9928 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
9929 << dendl;
9930 o->extent_map.spanning_blob_map.erase(b->id);
9931 }
9932 }
9933 }
9934
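// Split a write into an unaligned head and tail (handled by _do_write_small)
// and a min_alloc_size-aligned middle (handled by _do_write_big).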
9935 void BlueStore::_do_write_data(
9936 TransContext *txc,
9937 CollectionRef& c,
9938 OnodeRef o,
9939 uint64_t offset,
9940 uint64_t length,
9941 bufferlist& bl,
9942 WriteContext *wctx)
9943 {
9944 uint64_t end = offset + length;
9945 bufferlist::iterator p = bl.begin();
9946
9947 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
9948 (length != min_alloc_size)) {
9949 // we fall within the same block
9950 _do_write_small(txc, c, o, offset, length, p, wctx);
9951 } else {
9952 uint64_t head_offset, head_length;
9953 uint64_t middle_offset, middle_length;
9954 uint64_t tail_offset, tail_length;
9955
9956 head_offset = offset;
9957 head_length = P2NPHASE(offset, min_alloc_size);
9958
9959 tail_offset = P2ALIGN(end, min_alloc_size);
9960 tail_length = P2PHASE(end, min_alloc_size);
9961
9962 middle_offset = head_offset + head_length;
9963 middle_length = length - head_length - tail_length;
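// e.g. (illustrative values) min_alloc_size 0x10000, offset 0x12345,
// length 0x25000: head = 0x12345~0xdcbb, middle = 0x20000~0x10000,
// tail = 0x30000~0x7345.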
9964
9965 if (head_length) {
9966 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
9967 }
9968
9969 if (middle_length) {
9970 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
9971 }
9972
9973 if (tail_length) {
9974 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
9975 }
9976 }
9977 }
9978
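// Decide buffered vs direct I/O, checksum granularity, whether to compress,
// and the target blob size, based on the fadvise flags, per-pool options and
// the object's allocation hints.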
9979 void BlueStore::_choose_write_options(
9980 CollectionRef& c,
9981 OnodeRef o,
9982 uint32_t fadvise_flags,
9983 WriteContext *wctx)
9984 {
9985 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
9986 dout(20) << __func__ << " will do buffered write" << dendl;
9987 wctx->buffered = true;
9988 } else if (cct->_conf->bluestore_default_buffered_write &&
9989 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
9990 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
9991 dout(20) << __func__ << " defaulting to buffered write" << dendl;
9992 wctx->buffered = true;
9993 }
9994
9995 // apply basic csum block size
9996 wctx->csum_order = block_size_order;
9997
9998 // compression parameters
9999 unsigned alloc_hints = o->onode.alloc_hint_flags;
10000 auto cm = select_option(
10001 "compression_mode",
10002 comp_mode.load(),
10003 [&]() {
10004 string val;
10005 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
10006 return boost::optional<Compressor::CompressionMode>(
10007 Compressor::get_comp_mode_type(val));
10008 }
10009 return boost::optional<Compressor::CompressionMode>();
10010 }
10011 );
10012
10013 wctx->compress = (cm != Compressor::COMP_NONE) &&
10014 ((cm == Compressor::COMP_FORCE) ||
10015 (cm == Compressor::COMP_AGGRESSIVE &&
10016 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
10017 (cm == Compressor::COMP_PASSIVE &&
10018 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
10019
10020 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
10021 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
10022 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
10023 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
10024 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
10025
10026 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
10027
10028 auto order = min_alloc_size_order.load();
10029 if (o->onode.expected_write_size) {
10030 wctx->csum_order = std::max(order,
10031 (uint8_t)ctz(o->onode.expected_write_size));
10032 } else {
10033 wctx->csum_order = order;
10034 }
10035
10036 if (wctx->compress) {
10037 wctx->target_blob_size = select_option(
10038 "compression_max_blob_size",
10039 comp_max_blob_size.load(),
10040 [&]() {
10041 int val;
10042 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
10043 return boost::optional<uint64_t>((uint64_t)val);
10044 }
10045 return boost::optional<uint64_t>();
10046 }
10047 );
10048 }
10049 } else {
10050 if (wctx->compress) {
10051 wctx->target_blob_size = select_option(
10052 "compression_min_blob_size",
10053 comp_min_blob_size.load(),
10054 [&]() {
10055 int val;
10056 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
10057 return boost::optional<uint64_t>((uint64_t)val);
10058 }
10059 return boost::optional<uint64_t>();
10060 }
10061 );
10062 }
10063 }
10064
10065 uint64_t max_bsize = max_blob_size.load();
10066 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
10067 wctx->target_blob_size = max_bsize;
10068 }
10069
10070 // set the min blob size floor at 2x the min_alloc_size, or else we
10071 // won't be able to allocate a smaller extent for the compressed
10072 // data.
10073 if (wctx->compress &&
10074 wctx->target_blob_size < min_alloc_size * 2) {
10075 wctx->target_blob_size = min_alloc_size * 2;
10076 }
10077
10078 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
10079 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
10080 << std::dec << dendl;
10081 }
10082
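// Rewrite the extents nominated by the GarbageCollector: read each one back,
// push it through a WriteContext forked from the current write (so the same
// options apply), and widen the caller's dirty range to cover what was
// rewritten.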
10083 int BlueStore::_do_gc(
10084 TransContext *txc,
10085 CollectionRef& c,
10086 OnodeRef o,
10087 const GarbageCollector& gc,
10088 const WriteContext& wctx,
10089 uint64_t *dirty_start,
10090 uint64_t *dirty_end)
10091 {
10092 auto& extents_to_collect = gc.get_extents_to_collect();
10093
10094 WriteContext wctx_gc;
10095 wctx_gc.fork(wctx); // make a clone for garbage collection
10096
10097 for (auto it = extents_to_collect.begin();
10098 it != extents_to_collect.end();
10099 ++it) {
10100 bufferlist bl;
10101 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
10102 assert(r == (int)it->length);
10103
10104 o->extent_map.fault_range(db, it->offset, it->length);
10105 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
10106 logger->inc(l_bluestore_gc_merged, it->length);
10107
10108 if (*dirty_start > it->offset) {
10109 *dirty_start = it->offset;
10110 }
10111
10112 if (*dirty_end < it->offset + it->length) {
10113 *dirty_end = it->offset + it->length;
10114 }
10115 }
10116
10117 dout(30) << __func__ << " alloc write" << dendl;
10118 int r = _do_alloc_write(txc, c, o, &wctx_gc);
10119 if (r < 0) {
10120 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10121 << dendl;
10122 return r;
10123 }
10124
10125 _wctx_finish(txc, c, o, &wctx_gc);
10126 return 0;
10127 }
10128
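// Top-level write path: choose write options, stage the data into a
// WriteContext, allocate and queue the I/O, then, if the estimated GC
// benefit is high enough, rewrite neighbouring garbage extents and mark
// the affected extent-map range dirty.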
10129 int BlueStore::_do_write(
10130 TransContext *txc,
10131 CollectionRef& c,
10132 OnodeRef o,
10133 uint64_t offset,
10134 uint64_t length,
10135 bufferlist& bl,
10136 uint32_t fadvise_flags)
10137 {
10138 int r = 0;
10139
10140 dout(20) << __func__
10141 << " " << o->oid
10142 << " 0x" << std::hex << offset << "~" << length
10143 << " - have 0x" << o->onode.size
10144 << " (" << std::dec << o->onode.size << ")"
10145 << " bytes"
10146 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
10147 << dendl;
10148 _dump_onode(o);
10149
10150 if (length == 0) {
10151 return 0;
10152 }
10153
10154 uint64_t end = offset + length;
10155
10156 GarbageCollector gc(c->store->cct);
10157 int64_t benefit;
10158 auto dirty_start = offset;
10159 auto dirty_end = end;
10160
10161 WriteContext wctx;
10162 _choose_write_options(c, o, fadvise_flags, &wctx);
10163 o->extent_map.fault_range(db, offset, length);
10164 _do_write_data(txc, c, o, offset, length, bl, &wctx);
10165 r = _do_alloc_write(txc, c, o, &wctx);
10166 if (r < 0) {
10167 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10168 << dendl;
10169 goto out;
10170 }
10171
10172 // NB: _wctx_finish() will empty old_extents
10173 // so we must do gc estimation before that
10174 benefit = gc.estimate(offset,
10175 length,
10176 o->extent_map,
10177 wctx.old_extents,
10178 min_alloc_size);
10179
10180 _wctx_finish(txc, c, o, &wctx);
10181 if (end > o->onode.size) {
10182 dout(20) << __func__ << " extending size to 0x" << std::hex << end
10183 << std::dec << dendl;
10184 o->onode.size = end;
10185 }
10186
10187 if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
10188 if (!gc.get_extents_to_collect().empty()) {
10189 dout(20) << __func__ << " perform garbage collection, "
10190 << "expected benefit = " << benefit << " AUs" << dendl;
10191 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
10192 if (r < 0) {
10193 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
10194 << dendl;
10195 goto out;
10196 }
10197 }
10198 }
10199
10200 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
10201 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
10202
10203 r = 0;
10204
10205 out:
10206 return r;
10207 }
10208
10209 int BlueStore::_write(TransContext *txc,
10210 CollectionRef& c,
10211 OnodeRef& o,
10212 uint64_t offset, size_t length,
10213 bufferlist& bl,
10214 uint32_t fadvise_flags)
10215 {
10216 dout(15) << __func__ << " " << c->cid << " " << o->oid
10217 << " 0x" << std::hex << offset << "~" << length << std::dec
10218 << dendl;
10219 o->exists = true;
10220 _assign_nid(txc, o);
10221 int r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
10222 txc->write_onode(o);
10223
10224 dout(10) << __func__ << " " << c->cid << " " << o->oid
10225 << " 0x" << std::hex << offset << "~" << length << std::dec
10226 << " = " << r << dendl;
10227 return r;
10228 }
10229
10230 int BlueStore::_zero(TransContext *txc,
10231 CollectionRef& c,
10232 OnodeRef& o,
10233 uint64_t offset, size_t length)
10234 {
10235 dout(15) << __func__ << " " << c->cid << " " << o->oid
10236 << " 0x" << std::hex << offset << "~" << length << std::dec
10237 << dendl;
10238 o->exists = true;
10239 _assign_nid(txc, o);
10240 int r = _do_zero(txc, c, o, offset, length);
10241 dout(10) << __func__ << " " << c->cid << " " << o->oid
10242 << " 0x" << std::hex << offset << "~" << length << std::dec
10243 << " = " << r << dendl;
10244 return r;
10245 }
10246
10247 int BlueStore::_do_zero(TransContext *txc,
10248 CollectionRef& c,
10249 OnodeRef& o,
10250 uint64_t offset, size_t length)
10251 {
10252 dout(15) << __func__ << " " << c->cid << " " << o->oid
10253 << " 0x" << std::hex << offset << "~" << length << std::dec
10254 << dendl;
10255 int r = 0;
10256
10257 _dump_onode(o);
10258
10259 WriteContext wctx;
10260 o->extent_map.fault_range(db, offset, length);
10261 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10262 o->extent_map.dirty_range(offset, length);
10263 _wctx_finish(txc, c, o, &wctx);
10264
10265 if (offset + length > o->onode.size) {
10266 o->onode.size = offset + length;
10267 dout(20) << __func__ << " extending size to " << offset + length
10268 << dendl;
10269 }
10270 txc->write_onode(o);
10271
10272 dout(10) << __func__ << " " << c->cid << " " << o->oid
10273 << " 0x" << std::hex << offset << "~" << length << std::dec
10274 << " = " << r << dendl;
10275 return r;
10276 }
10277
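// Truncate to 'offset': punch a hole over the dropped tail, release the old
// extents, request a reshard if extent-map shards now sit past EOF, and
// record the new object size.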
10278 void BlueStore::_do_truncate(
10279 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
10280 set<SharedBlob*> *maybe_unshared_blobs)
10281 {
10282 dout(15) << __func__ << " " << c->cid << " " << o->oid
10283 << " 0x" << std::hex << offset << std::dec << dendl;
10284
10285 _dump_onode(o, 30);
10286
10287 if (offset == o->onode.size)
10288 return;
10289
10290 if (offset < o->onode.size) {
10291 WriteContext wctx;
10292 uint64_t length = o->onode.size - offset;
10293 o->extent_map.fault_range(db, offset, length);
10294 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10295 o->extent_map.dirty_range(offset, length);
10296 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
10297
10298 // if we have shards past EOF, ask for a reshard
10299 if (!o->onode.extent_map_shards.empty() &&
10300 o->onode.extent_map_shards.back().offset >= offset) {
10301 dout(10) << __func__ << " request reshard past EOF" << dendl;
10302 if (offset) {
10303 o->extent_map.request_reshard(offset - 1, offset + length);
10304 } else {
10305 o->extent_map.request_reshard(0, length);
10306 }
10307 }
10308 }
10309
10310 o->onode.size = offset;
10311
10312 txc->write_onode(o);
10313 }
10314
10315 void BlueStore::_truncate(TransContext *txc,
10316 CollectionRef& c,
10317 OnodeRef& o,
10318 uint64_t offset)
10319 {
10320 dout(15) << __func__ << " " << c->cid << " " << o->oid
10321 << " 0x" << std::hex << offset << std::dec
10322 << dendl;
10323 _do_truncate(txc, c, o, offset);
10324 }
10325
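// Remove an object: truncate to zero (collecting blobs that might become
// unshared), clear its omap, delete its extent-shard and onode keys and, for
// clone (generation) objects, unshare any blobs now referenced solely by the
// head object.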
10326 int BlueStore::_do_remove(
10327 TransContext *txc,
10328 CollectionRef& c,
10329 OnodeRef o)
10330 {
10331 set<SharedBlob*> maybe_unshared_blobs;
10332 _do_truncate(txc, c, o, 0, &maybe_unshared_blobs);
10333 if (o->onode.has_omap()) {
10334 o->flush();
10335 _do_omap_clear(txc, o->onode.nid);
10336 }
10337 o->exists = false;
10338 string key;
10339 for (auto &s : o->extent_map.shards) {
10340 dout(20) << __func__ << " removing shard 0x" << std::hex
10341 << s.shard_info->offset << std::dec << dendl;
10342 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
10343 [&](const string& final_key) {
10344 txc->t->rmkey(PREFIX_OBJ, final_key);
10345 }
10346 );
10347 }
10348 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
10349 txc->removed(o);
10350 o->extent_map.clear();
10351 o->onode = bluestore_onode_t();
10352 _debug_obj_on_delete(o->oid);
10353
10354 if (!o->oid.is_no_gen() &&
10355 !maybe_unshared_blobs.empty()) {
10356 // see if we can unshare blobs still referenced by the head
10357 dout(10) << __func__ << " gen and maybe_unshared_blobs "
10358 << maybe_unshared_blobs << dendl;
10359 ghobject_t nogen = o->oid;
10360 nogen.generation = ghobject_t::NO_GEN;
10361 OnodeRef h = c->onode_map.lookup(nogen);
10362 if (h && h->exists) {
10363 dout(20) << __func__ << " checking for unshareable blobs on " << h
10364 << " " << h->oid << dendl;
10365 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
10366 for (auto& e : h->extent_map.extent_map) {
10367 const bluestore_blob_t& b = e.blob->get_blob();
10368 SharedBlob *sb = e.blob->shared_blob.get();
10369 if (b.is_shared() &&
10370 sb->loaded &&
10371 maybe_unshared_blobs.count(sb)) {
10372 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
10373 expect[sb].get(off, len);
10374 return 0;
10375 });
10376 }
10377 }
10378 vector<SharedBlob*> unshared_blobs;
10379 unshared_blobs.reserve(maybe_unshared_blobs.size());
10380 for (auto& p : expect) {
10381 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
10382 if (p.first->persistent->ref_map == p.second) {
10383 SharedBlob *sb = p.first;
10384 dout(20) << __func__ << " unsharing " << *sb << dendl;
10385 unshared_blobs.push_back(sb);
10386 txc->unshare_blob(sb);
10387 uint64_t sbid = c->make_blob_unshared(sb);
10388 string key;
10389 get_shared_blob_key(sbid, &key);
10390 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
10391 }
10392 }
10393
10394 if (!unshared_blobs.empty()) {
10395 uint32_t b_start = OBJECT_MAX_SIZE;
10396 uint32_t b_end = 0;
10397 for (auto& e : h->extent_map.extent_map) {
10398 const bluestore_blob_t& b = e.blob->get_blob();
10399 SharedBlob *sb = e.blob->shared_blob.get();
10400 if (b.is_shared() &&
10401 std::find(unshared_blobs.begin(), unshared_blobs.end(),
10402 sb) != unshared_blobs.end()) {
10403 dout(20) << __func__ << " unsharing " << e << dendl;
10404 bluestore_blob_t& blob = e.blob->dirty_blob();
10405 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
10406 if (e.logical_offset < b_start) {
10407 b_start = e.logical_offset;
10408 }
10409 if (e.logical_end() > b_end) {
10410 b_end = e.logical_end();
10411 }
10412 }
10413 }
10414
10415 h->extent_map.dirty_range(b_start, b_end - b_start);
10416 txc->write_onode(h);
10417 }
10418 }
10419 }
10420 return 0;
10421 }
10422
10423 int BlueStore::_remove(TransContext *txc,
10424 CollectionRef& c,
10425 OnodeRef &o)
10426 {
10427 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10428 int r = _do_remove(txc, c, o);
10429 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10430 return r;
10431 }
10432
10433 int BlueStore::_setattr(TransContext *txc,
10434 CollectionRef& c,
10435 OnodeRef& o,
10436 const string& name,
10437 bufferptr& val)
10438 {
10439 dout(15) << __func__ << " " << c->cid << " " << o->oid
10440 << " " << name << " (" << val.length() << " bytes)"
10441 << dendl;
10442 int r = 0;
10443 if (val.is_partial())
10444 o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), val.length());
10445 else
10446 o->onode.attrs[name.c_str()] = val;
10447 txc->write_onode(o);
10448 dout(10) << __func__ << " " << c->cid << " " << o->oid
10449 << " " << name << " (" << val.length() << " bytes)"
10450 << " = " << r << dendl;
10451 return r;
10452 }
10453
10454 int BlueStore::_setattrs(TransContext *txc,
10455 CollectionRef& c,
10456 OnodeRef& o,
10457 const map<string,bufferptr>& aset)
10458 {
10459 dout(15) << __func__ << " " << c->cid << " " << o->oid
10460 << " " << aset.size() << " keys"
10461 << dendl;
10462 int r = 0;
10463 for (map<string,bufferptr>::const_iterator p = aset.begin();
10464 p != aset.end(); ++p) {
10465 if (p->second.is_partial())
10466 o->onode.attrs[p->first.c_str()] =
10467 bufferptr(p->second.c_str(), p->second.length());
10468 else
10469 o->onode.attrs[p->first.c_str()] = p->second;
10470 }
10471 txc->write_onode(o);
10472 dout(10) << __func__ << " " << c->cid << " " << o->oid
10473 << " " << aset.size() << " keys"
10474 << " = " << r << dendl;
10475 return r;
10476 }
10477
10478
10479 int BlueStore::_rmattr(TransContext *txc,
10480 CollectionRef& c,
10481 OnodeRef& o,
10482 const string& name)
10483 {
10484 dout(15) << __func__ << " " << c->cid << " " << o->oid
10485 << " " << name << dendl;
10486 int r = 0;
10487 auto it = o->onode.attrs.find(name.c_str());
10488 if (it == o->onode.attrs.end())
10489 goto out;
10490
10491 o->onode.attrs.erase(it);
10492 txc->write_onode(o);
10493
10494 out:
10495 dout(10) << __func__ << " " << c->cid << " " << o->oid
10496 << " " << name << " = " << r << dendl;
10497 return r;
10498 }
10499
10500 int BlueStore::_rmattrs(TransContext *txc,
10501 CollectionRef& c,
10502 OnodeRef& o)
10503 {
10504 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10505 int r = 0;
10506
10507 if (o->onode.attrs.empty())
10508 goto out;
10509
10510 o->onode.attrs.clear();
10511 txc->write_onode(o);
10512
10513 out:
10514 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10515 return r;
10516 }
10517
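// An object's omap keys all share its nid-derived prefix; the header and
// tail keys from get_omap_header()/get_omap_tail() bracket that range, so a
// bounded iterator scan between them visits (and here removes) every entry.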
10518 void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
10519 {
10520 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10521 string prefix, tail;
10522 get_omap_header(id, &prefix);
10523 get_omap_tail(id, &tail);
10524 it->lower_bound(prefix);
10525 while (it->valid()) {
10526 if (it->key() >= tail) {
10527 dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
10528 << dendl;
10529 break;
10530 }
10531 txc->t->rmkey(PREFIX_OMAP, it->key());
10532 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10533 it->next();
10534 }
10535 }
10536
10537 int BlueStore::_omap_clear(TransContext *txc,
10538 CollectionRef& c,
10539 OnodeRef& o)
10540 {
10541 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10542 int r = 0;
10543 if (o->onode.has_omap()) {
10544 o->flush();
10545 _do_omap_clear(txc, o->onode.nid);
10546 o->onode.clear_omap_flag();
10547 txc->write_onode(o);
10548 }
10549 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10550 return r;
10551 }
10552
10553 int BlueStore::_omap_setkeys(TransContext *txc,
10554 CollectionRef& c,
10555 OnodeRef& o,
10556 bufferlist &bl)
10557 {
10558 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10559 int r;
10560 bufferlist::iterator p = bl.begin();
10561 __u32 num;
10562 if (!o->onode.has_omap()) {
10563 o->onode.set_omap_flag();
10564 txc->write_onode(o);
10565 } else {
10566 txc->note_modified_object(o);
10567 }
10568 string final_key;
10569 _key_encode_u64(o->onode.nid, &final_key);
10570 final_key.push_back('.');
10571 ::decode(num, p);
10572 while (num--) {
10573 string key;
10574 bufferlist value;
10575 ::decode(key, p);
10576 ::decode(value, p);
10577 final_key.resize(9); // keep prefix
10578 final_key += key;
10579 dout(30) << __func__ << " " << pretty_binary_string(final_key)
10580 << " <- " << key << dendl;
10581 txc->t->set(PREFIX_OMAP, final_key, value);
10582 }
10583 r = 0;
10584 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10585 return r;
10586 }
10587
10588 int BlueStore::_omap_setheader(TransContext *txc,
10589 CollectionRef& c,
10590 OnodeRef &o,
10591 bufferlist& bl)
10592 {
10593 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10594 int r;
10595 string key;
10596 if (!o->onode.has_omap()) {
10597 o->onode.set_omap_flag();
10598 txc->write_onode(o);
10599 } else {
10600 txc->note_modified_object(o);
10601 }
10602 get_omap_header(o->onode.nid, &key);
10603 txc->t->set(PREFIX_OMAP, key, bl);
10604 r = 0;
10605 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10606 return r;
10607 }
10608
10609 int BlueStore::_omap_rmkeys(TransContext *txc,
10610 CollectionRef& c,
10611 OnodeRef& o,
10612 bufferlist& bl)
10613 {
10614 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10615 int r = 0;
10616 bufferlist::iterator p = bl.begin();
10617 __u32 num;
10618 string final_key;
10619
10620 if (!o->onode.has_omap()) {
10621 goto out;
10622 }
10623 _key_encode_u64(o->onode.nid, &final_key);
10624 final_key.push_back('.');
10625 ::decode(num, p);
10626 while (num--) {
10627 string key;
10628 ::decode(key, p);
10629 final_key.resize(9); // keep prefix
10630 final_key += key;
10631 dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
10632 << " <- " << key << dendl;
10633 txc->t->rmkey(PREFIX_OMAP, final_key);
10634 }
10635 txc->note_modified_object(o);
10636
10637 out:
10638 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10639 return r;
10640 }
10641
10642 int BlueStore::_omap_rmkey_range(TransContext *txc,
10643 CollectionRef& c,
10644 OnodeRef& o,
10645 const string& first, const string& last)
10646 {
10647 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10648 KeyValueDB::Iterator it;
10649 string key_first, key_last;
10650 int r = 0;
10651 if (!o->onode.has_omap()) {
10652 goto out;
10653 }
10654 o->flush();
10655 it = db->get_iterator(PREFIX_OMAP);
10656 get_omap_key(o->onode.nid, first, &key_first);
10657 get_omap_key(o->onode.nid, last, &key_last);
10658 it->lower_bound(key_first);
10659 while (it->valid()) {
10660 if (it->key() >= key_last) {
10661 dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
10662 << dendl;
10663 break;
10664 }
10665 txc->t->rmkey(PREFIX_OMAP, it->key());
10666 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10667 it->next();
10668 }
10669 txc->note_modified_object(o);
10670
10671 out:
10672 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10673 return r;
10674 }
10675
10676 int BlueStore::_set_alloc_hint(
10677 TransContext *txc,
10678 CollectionRef& c,
10679 OnodeRef& o,
10680 uint64_t expected_object_size,
10681 uint64_t expected_write_size,
10682 uint32_t flags)
10683 {
10684 dout(15) << __func__ << " " << c->cid << " " << o->oid
10685 << " object_size " << expected_object_size
10686 << " write_size " << expected_write_size
10687 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
10688 << dendl;
10689 int r = 0;
10690 o->onode.expected_object_size = expected_object_size;
10691 o->onode.expected_write_size = expected_write_size;
10692 o->onode.alloc_hint_flags = flags;
10693 txc->write_onode(o);
10694 dout(10) << __func__ << " " << c->cid << " " << o->oid
10695 << " object_size " << expected_object_size
10696 << " write_size " << expected_write_size
10697 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
10698 << " = " << r << dendl;
10699 return r;
10700 }
10701
10702 int BlueStore::_clone(TransContext *txc,
10703 CollectionRef& c,
10704 OnodeRef& oldo,
10705 OnodeRef& newo)
10706 {
10707 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10708 << newo->oid << dendl;
10709 int r = 0;
10710 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
10711 derr << __func__ << " mismatched hash on " << oldo->oid
10712 << " and " << newo->oid << dendl;
10713 return -EINVAL;
10714 }
10715
10716 newo->exists = true;
10717 _assign_nid(txc, newo);
10718
10719 // clone data
10720 oldo->flush();
10721 _do_truncate(txc, c, newo, 0);
10722 if (cct->_conf->bluestore_clone_cow) {
10723 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
10724 } else {
10725 bufferlist bl;
10726 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
10727 if (r < 0)
10728 goto out;
10729 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
10730 if (r < 0)
10731 goto out;
10732 }
10733
10734 // clone attrs
10735 newo->onode.attrs = oldo->onode.attrs;
10736
10737 // clone omap
10738 if (newo->onode.has_omap()) {
10739 dout(20) << __func__ << " clearing old omap data" << dendl;
10740 newo->flush();
10741 _do_omap_clear(txc, newo->onode.nid);
10742 }
10743 if (oldo->onode.has_omap()) {
10744 dout(20) << __func__ << " copying omap data" << dendl;
10745 if (!newo->onode.has_omap()) {
10746 newo->onode.set_omap_flag();
10747 }
10748 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10749 string head, tail;
10750 get_omap_header(oldo->onode.nid, &head);
10751 get_omap_tail(oldo->onode.nid, &tail);
10752 it->lower_bound(head);
10753 while (it->valid()) {
10754 if (it->key() >= tail) {
10755 dout(30) << __func__ << " reached tail" << dendl;
10756 break;
10757 } else {
10758 dout(30) << __func__ << " got header/data "
10759 << pretty_binary_string(it->key()) << dendl;
10760 string key;
10761 rewrite_omap_key(newo->onode.nid, it->key(), &key);
10762 txc->t->set(PREFIX_OMAP, key, it->value());
10763 }
10764 it->next();
10765 }
10766 } else {
10767 newo->onode.clear_omap_flag();
10768 }
10769
10770 txc->write_onode(newo);
10771 r = 0;
10772
10773 out:
10774 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10775 << newo->oid << " = " << r << dendl;
10776 return r;
10777 }
10778
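// Copy-on-write clone: mark each source blob shared (bumping refs on its
// physical extents), duplicate the blob descriptors, and insert trimmed
// copies of the source extents into the destination extent map at the
// shifted offset; no object data is rewritten.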
10779 int BlueStore::_do_clone_range(
10780 TransContext *txc,
10781 CollectionRef& c,
10782 OnodeRef& oldo,
10783 OnodeRef& newo,
10784 uint64_t srcoff, uint64_t length, uint64_t dstoff)
10785 {
10786 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10787 << newo->oid
10788 << " 0x" << std::hex << srcoff << "~" << length << " -> "
10789 << " 0x" << dstoff << "~" << length << std::dec << dendl;
10790 oldo->extent_map.fault_range(db, srcoff, length);
10791 newo->extent_map.fault_range(db, dstoff, length);
10792 _dump_onode(oldo);
10793 _dump_onode(newo);
10794
10795 // hmm, this could go into an ExtentMap::dup() method.
10796 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
10797 for (auto &e : oldo->extent_map.extent_map) {
10798 e.blob->last_encoded_id = -1;
10799 }
10800 int n = 0;
10801 bool dirtied_oldo = false;
10802 uint64_t end = srcoff + length;
10803 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
10804 ep != oldo->extent_map.extent_map.end();
10805 ++ep) {
10806 auto& e = *ep;
10807 if (e.logical_offset >= end) {
10808 break;
10809 }
10810 dout(20) << __func__ << " src " << e << dendl;
10811 BlobRef cb;
10812 bool blob_duped = true;
10813 if (e.blob->last_encoded_id >= 0) {
10814 // blob is already duped
10815 cb = id_to_blob[e.blob->last_encoded_id];
10816 blob_duped = false;
10817 } else {
10818 // dup the blob
10819 const bluestore_blob_t& blob = e.blob->get_blob();
10820 // make sure it is shared
10821 if (!blob.is_shared()) {
10822 c->make_blob_shared(_assign_blobid(txc), e.blob);
10823 dirtied_oldo = true; // fixme: overkill
10824 } else {
10825 c->load_shared_blob(e.blob->shared_blob);
10826 }
10827 cb = new Blob();
10828 e.blob->last_encoded_id = n;
10829 id_to_blob[n] = cb;
10830 e.blob->dup(*cb);
10831 // bump the extent refs on the copied blob's extents
10832 for (auto p : blob.get_extents()) {
10833 if (p.is_valid()) {
10834 e.blob->shared_blob->get_ref(p.offset, p.length);
10835 }
10836 }
10837 txc->write_shared_blob(e.blob->shared_blob);
10838 dout(20) << __func__ << " new " << *cb << dendl;
10839 }
10840 // dup extent
10841 int skip_front, skip_back;
10842 if (e.logical_offset < srcoff) {
10843 skip_front = srcoff - e.logical_offset;
10844 } else {
10845 skip_front = 0;
10846 }
10847 if (e.logical_end() > end) {
10848 skip_back = e.logical_end() - end;
10849 } else {
10850 skip_back = 0;
10851 }
10852 Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
10853 e.blob_offset + skip_front,
10854 e.length - skip_front - skip_back, cb);
10855 newo->extent_map.extent_map.insert(*ne);
10856 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
10857 // fixme: we may leave parts of new blob unreferenced that could
10858 // be freed (relative to the shared_blob).
10859 txc->statfs_delta.stored() += ne->length;
10860 if (e.blob->get_blob().is_compressed()) {
10861 txc->statfs_delta.compressed_original() += ne->length;
10862 if (blob_duped){
10863 txc->statfs_delta.compressed() +=
10864 cb->get_blob().get_compressed_payload_length();
10865 }
10866 }
10867 dout(20) << __func__ << " dst " << *ne << dendl;
10868 ++n;
10869 }
10870 if (dirtied_oldo) {
10871 oldo->extent_map.dirty_range(srcoff, length); // overkill
10872 txc->write_onode(oldo);
10873 }
10874 txc->write_onode(newo);
10875
10876 if (dstoff + length > newo->onode.size) {
10877 newo->onode.size = dstoff + length;
10878 }
10879 newo->extent_map.dirty_range(dstoff, length);
10880 _dump_onode(oldo);
10881 _dump_onode(newo);
10882 return 0;
10883 }
10884
10885 int BlueStore::_clone_range(TransContext *txc,
10886 CollectionRef& c,
10887 OnodeRef& oldo,
10888 OnodeRef& newo,
10889 uint64_t srcoff, uint64_t length, uint64_t dstoff)
10890 {
10891 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10892 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
10893 << " to offset 0x" << dstoff << std::dec << dendl;
10894 int r = 0;
10895
10896 if (srcoff + length > oldo->onode.size) {
10897 r = -EINVAL;
10898 goto out;
10899 }
10900
10901 newo->exists = true;
10902 _assign_nid(txc, newo);
10903
10904 if (length > 0) {
10905 if (cct->_conf->bluestore_clone_cow) {
10906 _do_zero(txc, c, newo, dstoff, length);
10907 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
10908 } else {
10909 bufferlist bl;
10910 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
10911 if (r < 0)
10912 goto out;
10913 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
10914 if (r < 0)
10915 goto out;
10916 }
10917 }
10918
10919 txc->write_onode(newo);
10920 r = 0;
10921
10922 out:
10923 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10924 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
10925 << " to offset 0x" << dstoff << std::dec
10926 << " = " << r << dendl;
10927 return r;
10928 }
10929
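// Rename: drop the onode and extent-shard keys stored under the old object
// key, mark the shards dirty so they are rewritten under the new key, and
// move the cached Onode to its new slot via onode_map.rename().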
10930 int BlueStore::_rename(TransContext *txc,
10931 CollectionRef& c,
10932 OnodeRef& oldo,
10933 OnodeRef& newo,
10934 const ghobject_t& new_oid)
10935 {
10936 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10937 << new_oid << dendl;
10938 int r;
10939 ghobject_t old_oid = oldo->oid;
10940 mempool::bluestore_cache_other::string new_okey;
10941
10942 if (newo) {
10943 if (newo->exists) {
10944 r = -EEXIST;
10945 goto out;
10946 }
10947 assert(txc->onodes.count(newo) == 0);
10948 }
10949
10950 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
10951
10952 // rewrite shards
10953 {
10954 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
10955 get_object_key(cct, new_oid, &new_okey);
10956 string key;
10957 for (auto &s : oldo->extent_map.shards) {
10958 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
10959 [&](const string& final_key) {
10960 txc->t->rmkey(PREFIX_OBJ, final_key);
10961 }
10962 );
10963 s.dirty = true;
10964 }
10965 }
10966
10967 newo = oldo;
10968 txc->write_onode(newo);
10969
10970 // this adjusts oldo->{oid,key}, and resets oldo to a fresh, empty
10971 // Onode in the old slot
10972 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
10973 r = 0;
10974
10975 out:
10976 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
10977 << new_oid << " = " << r << dendl;
10978 return r;
10979 }
10980
10981 // collections
10982
10983 int BlueStore::_create_collection(
10984 TransContext *txc,
10985 const coll_t &cid,
10986 unsigned bits,
10987 CollectionRef *c)
10988 {
10989 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
10990 int r;
10991 bufferlist bl;
10992
10993 {
10994 RWLock::WLocker l(coll_lock);
10995 if (*c) {
10996 r = -EEXIST;
10997 goto out;
10998 }
10999 c->reset(
11000 new Collection(
11001 this,
11002 cache_shards[cid.hash_to_shard(cache_shards.size())],
11003 cid));
11004 (*c)->cnode.bits = bits;
11005 coll_map[cid] = *c;
11006 }
11007 ::encode((*c)->cnode, bl);
11008 txc->t->set(PREFIX_COLL, stringify(cid), bl);
11009 r = 0;
11010
11011 out:
11012 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
11013 return r;
11014 }
11015
11016 int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
11017 CollectionRef *c)
11018 {
11019 dout(15) << __func__ << " " << cid << dendl;
11020 int r;
11021
11022 {
11023 RWLock::WLocker l(coll_lock);
11024 if (!*c) {
11025 r = -ENOENT;
11026 goto out;
11027 }
11028 size_t nonexistent_count = 0;
11029 assert((*c)->exists);
11030 if ((*c)->onode_map.map_any([&](OnodeRef o) {
11031 if (o->exists) {
11032 dout(10) << __func__ << " " << o->oid << " " << o
11033 << " exists in onode_map" << dendl;
11034 return true;
11035 }
11036 ++nonexistent_count;
11037 return false;
11038 })) {
11039 r = -ENOTEMPTY;
11040 goto out;
11041 }
11042
11043 vector<ghobject_t> ls;
11044 ghobject_t next;
11045 // Enumerate onodes in the db, up to nonexistent_count + 1,
11046 // then check whether all of them are marked as non-existent.
11047 // Bypass the check if the returned number is greater than nonexistent_count.
11048 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
11049 nonexistent_count + 1, &ls, &next);
11050 if (r >= 0) {
11051 bool exists = false; //ls.size() > nonexistent_count;
11052 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
11053 dout(10) << __func__ << " oid " << *it << dendl;
11054 auto onode = (*c)->onode_map.lookup(*it);
11055 exists = !onode || onode->exists;
11056 if (exists) {
11057 dout(10) << __func__ << " " << *it
11058 << " exists in db" << dendl;
11059 }
11060 }
11061 if (!exists) {
11062 coll_map.erase(cid);
11063 txc->removed_collections.push_back(*c);
11064 (*c)->exists = false;
11065 c->reset();
11066 txc->t->rmkey(PREFIX_COLL, stringify(cid));
11067 r = 0;
11068 } else {
11069 dout(10) << __func__ << " " << cid
11070 << " is non-empty" << dendl;
11071 r = -ENOTEMPTY;
11072 }
11073 }
11074 }
11075
11076 out:
11077 dout(10) << __func__ << " " << cid << " = " << r << dendl;
11078 return r;
11079 }
11080
11081 int BlueStore::_split_collection(TransContext *txc,
11082 CollectionRef& c,
11083 CollectionRef& d,
11084 unsigned bits, int rem)
11085 {
11086 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
11087 << " bits " << bits << dendl;
11088 RWLock::WLocker l(c->lock);
11089 RWLock::WLocker l2(d->lock);
11090 int r;
11091
11092 // flush all previous deferred writes on this sequencer. this is a bit
11093 // heavyweight, but we need to make sure all deferred writes complete
11094 // before we split as the new collection's sequencer may need to order
11095 // this after those writes, and we don't bother with the complexity of
11096 // moving those TransContexts over to the new osr.
11097 _osr_drain_preceding(txc);
11098
11099 // move any cached items (onodes and referenced shared blobs) that will
11100 // belong to the child collection post-split. leave everything else behind.
11101 // this may include things that don't strictly belong to the now-smaller
11102 // parent split, but the OSD will always send us a split for every new
11103 // child.
11104
11105 spg_t pgid, dest_pgid;
11106 bool is_pg = c->cid.is_pg(&pgid);
11107 assert(is_pg);
11108 is_pg = d->cid.is_pg(&dest_pgid);
11109 assert(is_pg);
11110
11111 // the destination should initially be empty.
11112 assert(d->onode_map.empty());
11113 assert(d->shared_blob_set.empty());
11114 assert(d->cnode.bits == bits);
11115
11116 c->split_cache(d.get());
11117
11118 // adjust bits. note that this will be redundant for all but the first
11119 // split call for this parent (first child).
11120 c->cnode.bits = bits;
11121 assert(d->cnode.bits == bits);
11122 r = 0;
11123
11124 bufferlist bl;
11125 ::encode(c->cnode, bl);
11126 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
11127
11128 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
11129 << " bits " << bits << " = " << r << dendl;
11130 return r;
11131 }
11132
11133 // DB key/value histogram
11134 #define KEY_SLAB 32
11135 #define VALUE_SLAB 64
11136
11137 const string prefix_onode = "o";
11138 const string prefix_onode_shard = "x";
11139 const string prefix_other = "Z";
11140
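// Key and value sizes are bucketed into fixed-width slabs (32 bytes for
// keys, 64 for values); e.g. a 45-byte key lands in slab 1, reported as the
// range "[32,64)".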
11141 int BlueStore::DBHistogram::get_key_slab(size_t sz)
11142 {
11143 return (sz/KEY_SLAB);
11144 }
11145
11146 string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
11147 {
11148 int lower_bound = slab * KEY_SLAB;
11149 int upper_bound = (slab + 1) * KEY_SLAB;
11150 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11151 return ret;
11152 }
11153
11154 int BlueStore::DBHistogram::get_value_slab(size_t sz)
11155 {
11156 return (sz/VALUE_SLAB);
11157 }
11158
11159 string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
11160 {
11161 int lower_bound = slab * VALUE_SLAB;
11162 int upper_bound = (slab + 1) * VALUE_SLAB;
11163 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11164 return ret;
11165 }
11166
11167 void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
11168 const string &prefix, size_t key_size, size_t value_size)
11169 {
11170 uint32_t key_slab = get_key_slab(key_size);
11171 uint32_t value_slab = get_value_slab(value_size);
11172 key_hist[prefix][key_slab].count++;
11173 key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len);
11174 key_hist[prefix][key_slab].val_map[value_slab].count++;
11175 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11176 MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
11177 }
11178
11179 void BlueStore::DBHistogram::dump(Formatter *f)
11180 {
11181 f->open_object_section("rocksdb_value_distribution");
11182 for (const auto& i : value_hist) {
11183 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
11184 }
11185 f->close_section();
11186
11187 f->open_object_section("rocksdb_key_value_histogram");
11188 for (const auto& i : key_hist) {
11189 f->dump_string("prefix", i.first);
11190 f->open_object_section("key_hist");
11191 for (const auto& k : i.second) {
11192 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
11193 f->dump_unsigned("max_len", k.second.max_len);
11194 f->open_object_section("value_hist");
11195 for (const auto& j : k.second.val_map) {
11196 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
11197 f->dump_unsigned("max_len", j.second.max_len);
11198 }
11199 f->close_section();
11200 }
11201 f->close_section();
11202 }
11203 f->close_section();
11204 }
11205
11206 // Iterates through the db and collects the stats.
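// The output is a "rocksdb_key_value_stats" section with per-category record
// counts and overall key/value size totals, followed by the DBHistogram dump.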
11207 void BlueStore::generate_db_histogram(Formatter *f)
11208 {
11209 //globals
11210 uint64_t num_onodes = 0;
11211 uint64_t num_shards = 0;
11212 uint64_t num_super = 0;
11213 uint64_t num_coll = 0;
11214 uint64_t num_omap = 0;
11215 uint64_t num_deferred = 0;
11216 uint64_t num_alloc = 0;
11217 uint64_t num_stat = 0;
11218 uint64_t num_others = 0;
11219 uint64_t num_shared_shards = 0;
11220 size_t max_key_size = 0, max_value_size = 0;
11221 uint64_t total_key_size = 0, total_value_size = 0;
11222 size_t key_size = 0, value_size = 0;
11223 DBHistogram hist;
11224
11225 utime_t start = ceph_clock_now();
11226
11227 KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
11228 iter->seek_to_first();
11229 while (iter->valid()) {
11230 dout(30) << __func__ << " Key: " << iter->key() << dendl;
11231 key_size = iter->key_size();
11232 value_size = iter->value_size();
11233 hist.value_hist[hist.get_value_slab(value_size)]++;
11234 max_key_size = MAX(max_key_size, key_size);
11235 max_value_size = MAX(max_value_size, value_size);
11236 total_key_size += key_size;
11237 total_value_size += value_size;
11238
11239 pair<string,string> key(iter->raw_key());
11240
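// bucket each record by its key prefix; PREFIX_OBJ entries are further
// split into onodes vs. extent shards based on the key's suffix byte.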
11241 if (key.first == PREFIX_SUPER) {
11242 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
11243 num_super++;
11244 } else if (key.first == PREFIX_STAT) {
11245 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
11246 num_stat++;
11247 } else if (key.first == PREFIX_COLL) {
11248 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
11249 num_coll++;
11250 } else if (key.first == PREFIX_OBJ) {
11251 if (key.second.back() == ONODE_KEY_SUFFIX) {
11252 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
11253 num_onodes++;
11254 } else {
11255 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
11256 num_shards++;
11257 }
11258 } else if (key.first == PREFIX_OMAP) {
11259 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
11260 num_omap++;
11261 } else if (key.first == PREFIX_DEFERRED) {
11262 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
11263 num_deferred++;
11264 } else if (key.first == PREFIX_ALLOC || key.first == "b" ) {
11265 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
11266 num_alloc++;
11267 } else if (key.first == PREFIX_SHARED_BLOB) {
11268 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
11269 num_shared_shards++;
11270 } else {
11271 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
11272 num_others++;
11273 }
11274 iter->next();
11275 }
11276
11277 utime_t duration = ceph_clock_now() - start;
11278 f->open_object_section("rocksdb_key_value_stats");
11279 f->dump_unsigned("num_onodes", num_onodes);
11280 f->dump_unsigned("num_shards", num_shards);
11281 f->dump_unsigned("num_super", num_super);
11282 f->dump_unsigned("num_coll", num_coll);
11283 f->dump_unsigned("num_omap", num_omap);
11284 f->dump_unsigned("num_deferred", num_deferred);
11285 f->dump_unsigned("num_alloc", num_alloc);
11286 f->dump_unsigned("num_stat", num_stat);
11287 f->dump_unsigned("num_shared_shards", num_shared_shards);
11288 f->dump_unsigned("num_others", num_others);
11289 f->dump_unsigned("max_key_size", max_key_size);
11290 f->dump_unsigned("max_value_size", max_value_size);
11291 f->dump_unsigned("total_key_size", total_key_size);
11292 f->dump_unsigned("total_value_size", total_value_size);
11293 f->close_section();
11294
11295 hist.dump(f);
11296
11297 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
11298
11299 }
11300
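// Internal shutdown path: trim every cache shard completely and verify that
// all onodes and shared blobs have actually been released; anything left
// behind here would indicate a leaked reference.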
11301 void BlueStore::_flush_cache()
11302 {
11303 dout(10) << __func__ << dendl;
11304 for (auto i : cache_shards) {
11305 i->trim_all();
11306 assert(i->empty());
11307 }
11308 for (auto& p : coll_map) {
11309 assert(p.second->onode_map.empty());
11310 assert(p.second->shared_blob_set.empty());
11311 }
11312 coll_map.clear();
11313 }
11314
11315 // For external callers.
11316 // Unlike _flush_cache(), this is best-effort: we don't care if some
11317 // pinned onodes/data are still in the cache after this call completes,
11318 // and we make no emptiness assertions.
11319 void BlueStore::flush_cache()
11320 {
11321 dout(10) << __func__ << dendl;
11322 for (auto i : cache_shards) {
11323 i->trim_all();
11324 }
11325 }
11326
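// Build "padded" as [head_pad zero bytes] + bl + [tail_pad zero bytes];
// e.g. head_pad=0x800 and tail_pad=0x300 around a 0x500-byte bl yields a
// 0x1000-byte padded buffer.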
11327 void BlueStore::_apply_padding(uint64_t head_pad,
11328 uint64_t tail_pad,
11329 bufferlist& bl,
11330 bufferlist& padded)
11331 {
11332 padded = bl;
11333 if (head_pad) {
11334 bufferlist z;
11335 z.append_zero(head_pad);
11336 z.claim_append(padded);
11337 padded.claim(z);
11338 }
11339 if (tail_pad) {
11340 padded.append_zero(tail_pad);
11341 }
11342 if (head_pad || tail_pad) {
11343 dout(20) << __func__ << " padded head 0x" << std::hex << head_pad
11344 << " tail 0x" << tail_pad << std::dec << dendl;
11345 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
11346 }
11347 }
11348
11349 // ===========================================