ceph/src/os/bluestore/BlueStore.cc
1 // vim: ts=8 sw=2 smarttab
2 /*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2014 Red Hat
6 *
7 * This is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License version 2.1, as published by the Free Software
10 * Foundation. See file COPYING.
11 *
12 */
13
14 #include <unistd.h>
15 #include <stdlib.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <fcntl.h>
19
20 #include "include/cpp-btree/btree_set.h"
21
22 #include "BlueStore.h"
23 #include "os/kv.h"
24 #include "include/compat.h"
25 #include "include/intarith.h"
26 #include "include/stringify.h"
27 #include "common/errno.h"
28 #include "common/safe_io.h"
29 #include "common/PriorityCache.h"
30 #include "Allocator.h"
31 #include "FreelistManager.h"
32 #include "BlueFS.h"
33 #include "BlueRocksEnv.h"
34 #include "auth/Crypto.h"
35 #include "common/EventTrace.h"
36 #include "perfglue/heap_profiler.h"
37
38 #define dout_context cct
39 #define dout_subsys ceph_subsys_bluestore
40
41 using bid_t = decltype(BlueStore::Blob::id);
42
43 // bluestore_cache_onode
44 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
45 bluestore_cache_onode);
46
47 // bluestore_cache_other
48 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
49 bluestore_cache_other);
50 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
51 bluestore_cache_other);
52 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
53 bluestore_cache_other);
54 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
55 bluestore_cache_other);
56
57 // bluestore_txc
58 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
59 bluestore_txc);
60
61
62 // kv store prefixes
63 const string PREFIX_SUPER = "S"; // field -> value
64 const string PREFIX_STAT = "T"; // field -> value(int64 array)
65 const string PREFIX_COLL = "C"; // collection name -> cnode_t
66 const string PREFIX_OBJ = "O"; // object name -> onode_t
67 const string PREFIX_OMAP = "M"; // u64 + keyname -> value
68 const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
69 const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
70 const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
71
72 // write a label in the first block. always use this size. note that
73 // bluefs makes a matching assumption about the location of its
74 // superblock (always the second block of the device).
75 #define BDEV_LABEL_BLOCK_SIZE 4096
76
77 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
78 #define SUPER_RESERVED 8192
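// A layout sketch (not part of the original file) implied by the two
// constants above:
//
//   [0x0000, 0x1000)  bdev label           (BDEV_LABEL_BLOCK_SIZE)
//   [0x1000, 0x2000)  bluefs superblock    (second block of the device)
//   [0x2000, ... )    usable space         (starts at SUPER_RESERVED = 8192)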
79
80 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
81
82
83 /*
84 * extent map blob encoding
85 *
86 * we use the low bits of the blobid field to indicate some common scenarios
87 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
88 */
89 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
90 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
91 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
92 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
93 #define BLOBID_SHIFT_BITS 4
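// Illustrative sketch only (not in the original source): given the flags and
// shift above, ExtentMap::{encode,decode}_some() presumably pack a blobid
// word with the flags in the low BLOBID_SHIFT_BITS bits and the (spanning or
// local) blob id in the remaining bits.
#if 0
static inline uint64_t example_pack_blobid(uint64_t id, uint64_t flags)
{
  // e.g. id = 3 with CONTIGUOUS|ZEROOFFSET -> (3 << 4) | 0x3 = 0x33
  return (id << BLOBID_SHIFT_BITS) | flags;
}
#endif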
94
95 /*
96 * object name key structure
97 *
98 * encoded u8: shard + 2^7 (so that it sorts properly)
99 * encoded u64: poolid + 2^63 (so that it sorts properly)
100 * encoded u32: hash (bit reversed)
101 *
102 * escaped string: namespace
103 *
104 * escaped string: key or object name
105 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
106 * we are done. otherwise, we are followed by the object name.
107 * escaped string: object name (unless '=' above)
108 *
109 * encoded u64: snap
110 * encoded u64: generation
111 * 'o'
112 */
113 #define ONODE_KEY_SUFFIX 'o'
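// A worked example (not in the original source, names are hypothetical): an
// object "rbd_header.1234" in pool 1, shard NO_SHARD, empty namespace and no
// key would encode roughly as
//
//   0x7f                      shard (NO_SHARD = -1, biased by +0x80)
//   0x8000000000000001        pool 1 + 2^63
//   <4 bytes>                 bit-reversed hash
//   '!'                       empty namespace, terminated
//   "rbd_header.1234" '!'     escaped object name, terminated
//   '='                       key == object name (no separate key)
//   <8 bytes> <8 bytes>       snap, generation
//   'o'                       ONODE_KEY_SUFFIX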
114
115 /*
116 * extent shard key
117 *
118 * object prefix key
119 * u32
120 * 'x'
121 */
122 #define EXTENT_SHARD_KEY_SUFFIX 'x'
123
124 /*
125 * string encoding in the key
126 *
127 * The key string needs to lexicographically sort the same way that
128 * ghobject_t does. We do this by escaping anything <= '#' with #
129 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
130 * hex digits.
131 *
132 * We use ! as a terminator for strings; this works because it is < #
133 * and will get escaped if it is present in the string.
134 *
135 */
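// A worked example (not in the original source): append_escaped() below turns
// "a#b~c" into
//
//   'a' '#' '2' '3' 'b' '~' '7' 'e' 'c' '!'
//
// i.e. '#' (0x23) and '~' (0x7e) are emitted as themselves plus two hex
// digits, everything in between is copied verbatim, and '!' terminates the
// string; decode_escaped() reverses the transformation.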
136 template<typename S>
137 static void append_escaped(const string &in, S *out)
138 {
139 char hexbyte[in.length() * 3 + 1];
140 char* ptr = &hexbyte[0];
141 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
142 if (*i <= '#') {
143 *ptr++ = '#';
144 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
145 *ptr++ = "0123456789abcdef"[*i & 0x0f];
146 } else if (*i >= '~') {
147 *ptr++ = '~';
148 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
149 *ptr++ = "0123456789abcdef"[*i & 0x0f];
150 } else {
151 *ptr++ = *i;
152 }
153 }
154 *ptr++ = '!';
155 out->append(hexbyte, ptr - &hexbyte[0]);
156 }
157
158 inline unsigned h2i(char c)
159 {
160 if ((c >= '0') && (c <= '9')) {
161 return c - 0x30;
162 } else if ((c >= 'a') && (c <= 'f')) {
163 return c - 'a' + 10;
164 } else if ((c >= 'A') && (c <= 'F')) {
165 return c - 'A' + 10;
166 } else {
167 return 256; // make it always larger than 255
168 }
169 }
170
171 static int decode_escaped(const char *p, string *out)
172 {
173 char buff[256];
174 char* ptr = &buff[0];
175 char* max = &buff[252];
176 const char *orig_p = p;
177 while (*p && *p != '!') {
178 if (*p == '#' || *p == '~') {
179 unsigned hex = 0;
180 p++;
181 hex = h2i(*p++) << 4;
182 if (hex > 255) {
183 return -EINVAL;
184 }
185 hex |= h2i(*p++);
186 if (hex > 255) {
187 return -EINVAL;
188 }
189 *ptr++ = hex;
190 } else {
191 *ptr++ = *p++;
192 }
193 if (ptr > max) {
194 out->append(buff, ptr-buff);
195 ptr = &buff[0];
196 }
197 }
198 if (ptr != buff) {
199 out->append(buff, ptr-buff);
200 }
201 return p - orig_p;
202 }
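// Illustrative round-trip sketch only (not in the original source), showing
// how the two helpers above are meant to pair up:
#if 0
static void example_escape_roundtrip()
{
  string encoded;
  append_escaped("foo#bar", &encoded);              // -> "foo#23bar!"
  string decoded;
  int consumed = decode_escaped(encoded.c_str(), &decoded);
  assert(consumed >= 0 && decoded == "foo#bar");    // consumed excludes '!'
}
#endif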
203
204 // some things we encode in binary (as le32 or le64); print the
205 // resulting key strings nicely
206 template<typename S>
207 static string pretty_binary_string(const S& in)
208 {
209 char buf[10];
210 string out;
211 out.reserve(in.length() * 3);
212 enum { NONE, HEX, STRING } mode = NONE;
213 unsigned from = 0, i;
214 for (i=0; i < in.length(); ++i) {
215 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
216 (mode == HEX && in.length() - i >= 4 &&
217 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
218 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
219 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
220 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
221 if (mode == STRING) {
222 out.append(in.c_str() + from, i - from);
223 out.push_back('\'');
224 }
225 if (mode != HEX) {
226 out.append("0x");
227 mode = HEX;
228 }
229 if (in.length() - i >= 4) {
230 // print a whole u32 at once
231 snprintf(buf, sizeof(buf), "%08x",
232 (uint32_t)(((unsigned char)in[i] << 24) |
233 ((unsigned char)in[i+1] << 16) |
234 ((unsigned char)in[i+2] << 8) |
235 ((unsigned char)in[i+3] << 0)));
236 i += 3;
237 } else {
238 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
239 }
240 out.append(buf);
241 } else {
242 if (mode != STRING) {
243 out.push_back('\'');
244 mode = STRING;
245 from = i;
246 }
247 }
248 }
249 if (mode == STRING) {
250 out.append(in.c_str() + from, i - from);
251 out.push_back('\'');
252 }
253 return out;
254 }
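// Example output (not in the original source): an input of the bytes
// 00 00 00 7b followed by "abc" is rendered by pretty_binary_string() as
// roughly
//
//   0x0000007b'abc'
//
// i.e. runs containing non-printable bytes are grouped into hex u32s and
// printable runs are quoted.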
255
256 template<typename T>
257 static void _key_encode_shard(shard_id_t shard, T *key)
258 {
259 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
260 }
261
262 static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
263 {
264 pshard->id = (uint8_t)*key - (uint8_t)0x80;
265 return key + 1;
266 }
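// Example (not in the original source): with the +0x80 bias above,
// shard_id_t::NO_SHARD (-1) encodes to byte 0x7f while shard 0 encodes to
// 0x80 and shard 1 to 0x81, so keys for NO_SHARD sort before those of any
// real shard under a plain memcmp ordering.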
267
268 static void get_coll_key_range(const coll_t& cid, int bits,
269 string *temp_start, string *temp_end,
270 string *start, string *end)
271 {
272 temp_start->clear();
273 temp_end->clear();
274 start->clear();
275 end->clear();
276
277 spg_t pgid;
278 if (cid.is_pg(&pgid)) {
279 _key_encode_shard(pgid.shard, start);
280 *temp_start = *start;
281
282 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
283 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
284
285 *end = *start;
286 *temp_end = *temp_start;
287
288 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
289 _key_encode_u32(reverse_hash, start);
290 _key_encode_u32(reverse_hash, temp_start);
291
292 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
293 if (end_hash > 0xffffffffull)
294 end_hash = 0xffffffffull;
295
296 _key_encode_u32(end_hash, end);
297 _key_encode_u32(end_hash, temp_end);
298 } else {
299 _key_encode_shard(shard_id_t::NO_SHARD, start);
300 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
301 *end = *start;
302 _key_encode_u32(0, start);
303 _key_encode_u32(0xffffffff, end);
304
305 // no separate temp section
306 *temp_start = *end;
307 *temp_end = *end;
308 }
309 }
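// A worked example (not in the original source): for a PG in pool 1, regular
// objects are keyed under biased pool 2^63 + 1 while temp objects are keyed
// under 2^63 + (-2 - 1) = 2^63 - 3, so the temp range sorts just before the
// regular range for the same shard.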
310
311 static void get_shared_blob_key(uint64_t sbid, string *key)
312 {
313 key->clear();
314 _key_encode_u64(sbid, key);
315 }
316
317 static int get_key_shared_blob(const string& key, uint64_t *sbid)
318 {
319 const char *p = key.c_str();
320 if (key.length() < sizeof(uint64_t))
321 return -1;
322 _key_decode_u64(p, sbid);
323 return 0;
324 }
325
326 template<typename S>
327 static int get_key_object(const S& key, ghobject_t *oid)
328 {
329 int r;
330 const char *p = key.c_str();
331
332 if (key.length() < 1 + 8 + 4)
333 return -1;
334 p = _key_decode_shard(p, &oid->shard_id);
335
336 uint64_t pool;
337 p = _key_decode_u64(p, &pool);
338 oid->hobj.pool = pool - 0x8000000000000000ull;
339
340 unsigned hash;
341 p = _key_decode_u32(p, &hash);
342
343 oid->hobj.set_bitwise_key_u32(hash);
344
345 r = decode_escaped(p, &oid->hobj.nspace);
346 if (r < 0)
347 return -2;
348 p += r + 1;
349
350 string k;
351 r = decode_escaped(p, &k);
352 if (r < 0)
353 return -3;
354 p += r + 1;
355 if (*p == '=') {
356 // no key
357 ++p;
358 oid->hobj.oid.name = k;
359 } else if (*p == '<' || *p == '>') {
360 // key + name
361 ++p;
362 r = decode_escaped(p, &oid->hobj.oid.name);
363 if (r < 0)
364 return -5;
365 p += r + 1;
366 oid->hobj.set_key(k);
367 } else {
368 // malformed
369 return -6;
370 }
371
372 p = _key_decode_u64(p, &oid->hobj.snap.val);
373 p = _key_decode_u64(p, &oid->generation);
374
375 if (*p != ONODE_KEY_SUFFIX) {
376 return -7;
377 }
378 p++;
379 if (*p) {
380 // if we get something other than a null terminator here,
381 // something went wrong.
382 return -8;
383 }
384
385 return 0;
386 }
387
388 template<typename S>
389 static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
390 {
391 key->clear();
392
393 size_t max_len = 1 + 8 + 4 +
394 (oid.hobj.nspace.length() * 3 + 1) +
395 (oid.hobj.get_key().length() * 3 + 1) +
396 1 + // for '<', '=', or '>'
397 (oid.hobj.oid.name.length() * 3 + 1) +
398 8 + 8 + 1;
399 key->reserve(max_len);
400
401 _key_encode_shard(oid.shard_id, key);
402 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
403 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
404
405 append_escaped(oid.hobj.nspace, key);
406
407 if (oid.hobj.get_key().length()) {
408 // is a key... could be < = or >.
409 append_escaped(oid.hobj.get_key(), key);
410 // (ASCII chars < = and > sort in that order, yay)
411 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
412 if (r) {
413 key->append(r > 0 ? ">" : "<");
414 append_escaped(oid.hobj.oid.name, key);
415 } else {
416 // same as no key
417 key->append("=");
418 }
419 } else {
420 // no key
421 append_escaped(oid.hobj.oid.name, key);
422 key->append("=");
423 }
424
425 _key_encode_u64(oid.hobj.snap, key);
426 _key_encode_u64(oid.generation, key);
427
428 key->push_back(ONODE_KEY_SUFFIX);
429
430 // sanity check
431 if (true) {
432 ghobject_t t;
433 int r = get_key_object(*key, &t);
434 if (r || t != oid) {
435 derr << " r " << r << dendl;
436 derr << "key " << pretty_binary_string(*key) << dendl;
437 derr << "oid " << oid << dendl;
438 derr << " t " << t << dendl;
439 assert(r == 0 && t == oid);
440 }
441 }
442 }
443
444
445 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
446 // char lets us quickly test whether it is a shard key without decoding any
447 // of the prefix bytes.
448 template<typename S>
449 static void get_extent_shard_key(const S& onode_key, uint32_t offset,
450 string *key)
451 {
452 key->clear();
453 key->reserve(onode_key.length() + 4 + 1);
454 key->append(onode_key.c_str(), onode_key.size());
455 _key_encode_u32(offset, key);
456 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
457 }
458
459 static void rewrite_extent_shard_key(uint32_t offset, string *key)
460 {
461 assert(key->size() > sizeof(uint32_t) + 1);
462 assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
463 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
464 }
465
466 template<typename S>
467 static void generate_extent_shard_key_and_apply(
468 const S& onode_key,
469 uint32_t offset,
470 string *key,
471 std::function<void(const string& final_key)> apply)
472 {
473 if (key->empty()) { // make full key
474 assert(!onode_key.empty());
475 get_extent_shard_key(onode_key, offset, key);
476 } else {
477 rewrite_extent_shard_key(offset, key);
478 }
479 apply(*key);
480 }
481
482 int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
483 {
484 assert(key.size() > sizeof(uint32_t) + 1);
485 assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
486 int okey_len = key.size() - sizeof(uint32_t) - 1;
487 *onode_key = key.substr(0, okey_len);
488 const char *p = key.data() + okey_len;
489 _key_decode_u32(p, offset);
490 return 0;
491 }
492
493 static bool is_extent_shard_key(const string& key)
494 {
495 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
496 }
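// Illustrative sketch only (not in the original source, the onode key bytes
// are a placeholder): building a shard key for extent shard offset 0x30000
// and recovering its pieces with the helpers above.
#if 0
static void example_extent_shard_key()
{
  string onode_key = "<onode key bytes>";  // hypothetical
  string shard_key;
  get_extent_shard_key(onode_key, 0x30000, &shard_key);
  assert(is_extent_shard_key(shard_key));

  string okey;
  uint32_t offset;
  get_key_extent_shard(shard_key, &okey, &offset);
  assert(okey == onode_key && offset == 0x30000);
}
#endif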
497
498 // '-' < '.' < '~'
499 static void get_omap_header(uint64_t id, string *out)
500 {
501 _key_encode_u64(id, out);
502 out->push_back('-');
503 }
504
505 // hmm, I don't think there's any need to escape the user key since we
506 // have a clean prefix.
507 static void get_omap_key(uint64_t id, const string& key, string *out)
508 {
509 _key_encode_u64(id, out);
510 out->push_back('.');
511 out->append(key);
512 }
513
514 static void rewrite_omap_key(uint64_t id, string old, string *out)
515 {
516 _key_encode_u64(id, out);
517 out->append(old.c_str() + out->length(), old.size() - out->length());
518 }
519
520 static void decode_omap_key(const string& key, string *user_key)
521 {
522 *user_key = key.substr(sizeof(uint64_t) + 1);
523 }
524
525 static void get_omap_tail(uint64_t id, string *out)
526 {
527 _key_encode_u64(id, out);
528 out->push_back('~');
529 }
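// A layout sketch (not in the original source): for an onode with omap id N,
// every omap row falls between the header and the tail because '-' < '.' < '~':
//
//   N '-'               omap header
//   N '.' <user key>    one row per user key
//   N '~'               tail / iteration sentinel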
530
531 static void get_deferred_key(uint64_t seq, string *out)
532 {
533 _key_encode_u64(seq, out);
534 }
535
536
537 // merge operators
538
539 struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
540 void merge_nonexistent(
541 const char *rdata, size_t rlen, std::string *new_value) override {
542 *new_value = std::string(rdata, rlen);
543 }
544 void merge(
545 const char *ldata, size_t llen,
546 const char *rdata, size_t rlen,
547 std::string *new_value) override {
548 assert(llen == rlen);
549 assert((rlen % 8) == 0);
550 new_value->resize(rlen);
551 const __le64* lv = (const __le64*)ldata;
552 const __le64* rv = (const __le64*)rdata;
553 __le64* nv = &(__le64&)new_value->at(0);
554 for (size_t i = 0; i < rlen >> 3; ++i) {
555 nv[i] = lv[i] + rv[i];
556 }
557 }
558 // We use each operator name and each prefix to construct the
559 // overall RocksDB operator name for a consistency check at open time.
560 const char *name() const override {
561 return "int64_array";
562 }
563 };
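// Illustrative sketch only (not in the original source): the operator above
// adds two little-endian int64 arrays element-wise, which is what allows
// int64-array values (e.g. under PREFIX_STAT) to be updated with a RocksDB
// merge instead of a read-modify-write. A little-endian host is assumed for
// the casts below.
#if 0
static void example_int64_array_merge()
{
  Int64ArrayMergeOperator op;
  int64_t l[2] = {10, -4};
  int64_t r[2] = {1, 1};
  std::string out;
  op.merge((const char*)l, sizeof(l), (const char*)r, sizeof(r), &out);
  // out now holds {11, -3} encoded as two little-endian int64s
}
#endif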
564
565
566 // Buffer
567
568 ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
569 {
570 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
571 << b.offset << "~" << b.length << std::dec
572 << " " << BlueStore::Buffer::get_state_name(b.state);
573 if (b.flags)
574 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
575 return out << ")";
576 }
577
578 // Garbage Collector
579
580 void BlueStore::GarbageCollector::process_protrusive_extents(
581 const BlueStore::ExtentMap& extent_map,
582 uint64_t start_offset,
583 uint64_t end_offset,
584 uint64_t start_touch_offset,
585 uint64_t end_touch_offset,
586 uint64_t min_alloc_size)
587 {
588 assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
589
590 uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
591 uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);
592
593 dout(30) << __func__ << " (hex): [" << std::hex
594 << lookup_start_offset << ", " << lookup_end_offset
595 << ")" << std::dec << dendl;
596
597 for (auto it = extent_map.seek_lextent(lookup_start_offset);
598 it != extent_map.extent_map.end() &&
599 it->logical_offset < lookup_end_offset;
600 ++it) {
601 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
602 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
603
604 dout(30) << __func__ << " " << *it
605 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
606 << dendl;
607
608 Blob* b = it->blob.get();
609
610 if (it->logical_offset >= start_touch_offset &&
611 it->logical_end() <= end_touch_offset) {
612 // Process extents within the range affected by
613 // the current write request.
614 // Need to take into account if existing extents
615 // can be merged with them (uncompressed case)
616 if (!b->get_blob().is_compressed()) {
617 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
618 --blob_info_counted->expected_allocations; // don't need to allocate
619 // new AU for compressed
620 // data since another
621 // collocated uncompressed
622 // blob already exists
623 dout(30) << __func__ << " --expected:"
624 << alloc_unit_start << dendl;
625 }
626 used_alloc_unit = alloc_unit_end;
627 blob_info_counted = nullptr;
628 }
629 } else if (b->get_blob().is_compressed()) {
630
631 // additionally we take compressed blobs that were not impacted
632 // by the write into account too
633 BlobInfo& bi =
634 affected_blobs.emplace(
635 b, BlobInfo(b->get_referenced_bytes())).first->second;
636
637 int adjust =
638 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
639 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
640 dout(30) << __func__ << " expected_allocations="
641 << bi.expected_allocations << " end_au:"
642 << alloc_unit_end << dendl;
643
644 blob_info_counted = &bi;
645 used_alloc_unit = alloc_unit_end;
646
647 assert(it->length <= bi.referenced_bytes);
648 bi.referenced_bytes -= it->length;
649 dout(30) << __func__ << " affected_blob:" << *b
650 << " unref 0x" << std::hex << it->length
651 << " referenced = 0x" << bi.referenced_bytes
652 << std::dec << dendl;
653 // NOTE: we can't move a specific blob to the resulting GC list here
654 // when its reference counter reaches 0, since subsequent extents might
655 // decrement its expected_allocations.
656 // Hence we need to enumerate all the extents first.
657 if (!bi.collect_candidate) {
658 bi.first_lextent = it;
659 bi.collect_candidate = true;
660 }
661 bi.last_lextent = it;
662 } else {
663 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
664 // don't need to allocate new AU for compressed data since another
665 // collocated uncompressed blob already exists
666 --blob_info_counted->expected_allocations;
667 dout(30) << __func__ << " --expected_allocations:"
668 << alloc_unit_start << dendl;
669 }
670 used_alloc_unit = alloc_unit_end;
671 blob_info_counted = nullptr;
672 }
673 }
674
675 for (auto b_it = affected_blobs.begin();
676 b_it != affected_blobs.end();
677 ++b_it) {
678 Blob* b = b_it->first;
679 BlobInfo& bi = b_it->second;
680 if (bi.referenced_bytes == 0) {
681 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
682 int64_t blob_expected_for_release =
683 ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;
684
685 dout(30) << __func__ << " " << *(b_it->first)
686 << " expected4release=" << blob_expected_for_release
687 << " expected_allocations=" << bi.expected_allocations
688 << dendl;
689 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
690 if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
691 if (bi.collect_candidate) {
692 auto it = bi.first_lextent;
693 bool bExit = false;
694 do {
695 if (it->blob.get() == b) {
696 extents_to_collect.emplace_back(it->logical_offset, it->length);
697 }
698 bExit = it == bi.last_lextent;
699 ++it;
700 } while (!bExit);
701 }
702 expected_for_release += blob_expected_for_release;
703 expected_allocations += bi.expected_allocations;
704 }
705 }
706 }
707 }
708
709 int64_t BlueStore::GarbageCollector::estimate(
710 uint64_t start_offset,
711 uint64_t length,
712 const BlueStore::ExtentMap& extent_map,
713 const BlueStore::old_extent_map_t& old_extents,
714 uint64_t min_alloc_size)
715 {
716
717 affected_blobs.clear();
718 extents_to_collect.clear();
719 used_alloc_unit = boost::optional<uint64_t >();
720 blob_info_counted = nullptr;
721
722 gc_start_offset = start_offset;
723 gc_end_offset = start_offset + length;
724
725 uint64_t end_offset = start_offset + length;
726
727 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
728 Blob* b = it->e.blob.get();
729 if (b->get_blob().is_compressed()) {
730
731 // update gc_start_offset/gc_end_offset if needed
732 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
733 gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());
734
735 auto o = it->e.logical_offset;
736 auto l = it->e.length;
737
738 uint64_t ref_bytes = b->get_referenced_bytes();
739 // micro optimization to bypass blobs that have no more references
740 if (ref_bytes != 0) {
741 dout(30) << __func__ << " affected_blob:" << *b
742 << " unref 0x" << std::hex << o << "~" << l
743 << std::dec << dendl;
744 affected_blobs.emplace(b, BlobInfo(ref_bytes));
745 }
746 }
747 }
748 dout(30) << __func__ << " gc range(hex): [" << std::hex
749 << gc_start_offset << ", " << gc_end_offset
750 << ")" << std::dec << dendl;
751
752 // enumerate preceding extents to check if they reference affected blobs
753 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
754 process_protrusive_extents(extent_map,
755 gc_start_offset,
756 gc_end_offset,
757 start_offset,
758 end_offset,
759 min_alloc_size);
760 }
761 return expected_for_release - expected_allocations;
762 }
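// A worked example (not in the original source): if rewriting the remaining
// references of a compressed blob would let us release 4 allocation units
// (expected_for_release) but is expected to consume 1 new allocation unit
// (expected_allocations), the per-blob benefit is 4 - 1 = 3; the blob is only
// queued for collection once that benefit reaches
// bluestore_gc_enable_blob_threshold, and estimate() returns the same
// difference summed over all such blobs.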
763
764 // Cache
765
766 BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
767 PerfCounters *logger)
768 {
769 Cache *c = nullptr;
770
771 if (type == "lru")
772 c = new LRUCache(cct);
773 else if (type == "2q")
774 c = new TwoQCache(cct);
775 else
776 assert(0 == "unrecognized cache type");
777
778 c->logger = logger;
779 return c;
780 }
781
782 void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max)
783 {
784 std::lock_guard<std::recursive_mutex> l(lock);
785 _trim(onode_max, buffer_max);
786 }
787
788 void BlueStore::Cache::trim_all()
789 {
790 std::lock_guard<std::recursive_mutex> l(lock);
791 _trim(0, 0);
792 }
793
794 // LRUCache
795 #undef dout_prefix
796 #define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
797
798 void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
799 {
800 auto p = onode_lru.iterator_to(*o);
801 onode_lru.erase(p);
802 onode_lru.push_front(*o);
803 }
804
805 void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
806 {
807 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
808 << " buffers " << buffer_size << " / " << buffer_max
809 << dendl;
810
811 _audit("trim start");
812
813 // buffers
814 while (buffer_size > buffer_max) {
815 auto i = buffer_lru.rbegin();
816 if (i == buffer_lru.rend()) {
817 // stop if buffer_lru is now empty
818 break;
819 }
820
821 Buffer *b = &*i;
822 assert(b->is_clean());
823 dout(20) << __func__ << " rm " << *b << dendl;
824 b->space->_rm_buffer(this, b);
825 }
826
827 // onodes
828 if (onode_max >= onode_lru.size()) {
829 return; // don't even try
830 }
831 uint64_t num = onode_lru.size() - onode_max;
832
833 auto p = onode_lru.end();
834 assert(p != onode_lru.begin());
835 --p;
836 int skipped = 0;
837 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
838 while (num > 0) {
839 Onode *o = &*p;
840 int refs = o->nref.load();
841 if (refs > 1) {
842 dout(20) << __func__ << " " << o->oid << " has " << refs
843 << " refs, skipping" << dendl;
844 if (++skipped >= max_skipped) {
845 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
846 << num << " left to trim" << dendl;
847 break;
848 }
849
850 if (p == onode_lru.begin()) {
851 break;
852 } else {
853 p--;
854 num--;
855 continue;
856 }
857 }
858 dout(30) << __func__ << " rm " << o->oid << dendl;
859 if (p != onode_lru.begin()) {
860 onode_lru.erase(p--);
861 } else {
862 onode_lru.erase(p);
863 assert(num == 1);
864 }
865 o->get(); // paranoia
866 o->c->onode_map.remove(o->oid);
867 o->put();
868 --num;
869 }
870 }
871
872 #ifdef DEBUG_CACHE
873 void BlueStore::LRUCache::_audit(const char *when)
874 {
875 dout(10) << __func__ << " " << when << " start" << dendl;
876 uint64_t s = 0;
877 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
878 s += i->length;
879 }
880 if (s != buffer_size) {
881 derr << __func__ << " buffer_size " << buffer_size << " actual " << s
882 << dendl;
883 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
884 derr << __func__ << " " << *i << dendl;
885 }
886 assert(s == buffer_size);
887 }
888 dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
889 << " ok" << dendl;
890 }
891 #endif
892
893 // TwoQCache
894 #undef dout_prefix
895 #define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
896
897
898 void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
899 {
900 auto p = onode_lru.iterator_to(*o);
901 onode_lru.erase(p);
902 onode_lru.push_front(*o);
903 }
904
905 void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
906 {
907 dout(20) << __func__ << " level " << level << " near " << near
908 << " on " << *b
909 << " which has cache_private " << b->cache_private << dendl;
910 if (near) {
911 b->cache_private = near->cache_private;
912 switch (b->cache_private) {
913 case BUFFER_WARM_IN:
914 buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
915 break;
916 case BUFFER_WARM_OUT:
917 assert(b->is_empty());
918 buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
919 break;
920 case BUFFER_HOT:
921 buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
922 break;
923 default:
924 assert(0 == "bad cache_private");
925 }
926 } else if (b->cache_private == BUFFER_NEW) {
927 b->cache_private = BUFFER_WARM_IN;
928 if (level > 0) {
929 buffer_warm_in.push_front(*b);
930 } else {
931 // take caller hint to start at the back of the warm queue
932 buffer_warm_in.push_back(*b);
933 }
934 } else {
935 // we got a hint from discard
936 switch (b->cache_private) {
937 case BUFFER_WARM_IN:
938 // stay in warm_in. move to front, even though 2Q doesn't actually
939 // do this.
940 dout(20) << __func__ << " move to front of warm " << *b << dendl;
941 buffer_warm_in.push_front(*b);
942 break;
943 case BUFFER_WARM_OUT:
944 b->cache_private = BUFFER_HOT;
945 // move to hot. fall-thru
946 case BUFFER_HOT:
947 dout(20) << __func__ << " move to front of hot " << *b << dendl;
948 buffer_hot.push_front(*b);
949 break;
950 default:
951 assert(0 == "bad cache_private");
952 }
953 }
954 if (!b->is_empty()) {
955 buffer_bytes += b->length;
956 buffer_list_bytes[b->cache_private] += b->length;
957 }
958 }
959
960 void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
961 {
962 dout(20) << __func__ << " " << *b << dendl;
963 if (!b->is_empty()) {
964 assert(buffer_bytes >= b->length);
965 buffer_bytes -= b->length;
966 assert(buffer_list_bytes[b->cache_private] >= b->length);
967 buffer_list_bytes[b->cache_private] -= b->length;
968 }
969 switch (b->cache_private) {
970 case BUFFER_WARM_IN:
971 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
972 break;
973 case BUFFER_WARM_OUT:
974 buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
975 break;
976 case BUFFER_HOT:
977 buffer_hot.erase(buffer_hot.iterator_to(*b));
978 break;
979 default:
980 assert(0 == "bad cache_private");
981 }
982 }
983
984 void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
985 {
986 TwoQCache *src = static_cast<TwoQCache*>(srcc);
987 src->_rm_buffer(b);
988
989 // preserve which list we're on (even if we can't preserve the order!)
990 switch (b->cache_private) {
991 case BUFFER_WARM_IN:
992 assert(!b->is_empty());
993 buffer_warm_in.push_back(*b);
994 break;
995 case BUFFER_WARM_OUT:
996 assert(b->is_empty());
997 buffer_warm_out.push_back(*b);
998 break;
999 case BUFFER_HOT:
1000 assert(!b->is_empty());
1001 buffer_hot.push_back(*b);
1002 break;
1003 default:
1004 assert(0 == "bad cache_private");
1005 }
1006 if (!b->is_empty()) {
1007 buffer_bytes += b->length;
1008 buffer_list_bytes[b->cache_private] += b->length;
1009 }
1010 }
1011
1012 void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
1013 {
1014 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1015 if (!b->is_empty()) {
1016 assert((int64_t)buffer_bytes + delta >= 0);
1017 buffer_bytes += delta;
1018 assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
1019 buffer_list_bytes[b->cache_private] += delta;
1020 }
1021 }
1022
1023 void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
1024 {
1025 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
1026 << " buffers " << buffer_bytes << " / " << buffer_max
1027 << dendl;
1028
1029 _audit("trim start");
1030
1031 // buffers
1032 if (buffer_bytes > buffer_max) {
1033 uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
1034 uint64_t khot = buffer_max - kin;
1035
1036 // pre-calculate kout based on the average buffer size too; this is only
1037 // an approximation (the warm_in and hot lists may change later)
1038 uint64_t kout = 0;
1039 uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
1040 if (buffer_num) {
1041 uint64_t buffer_avg_size = buffer_bytes / buffer_num;
1042 assert(buffer_avg_size);
1043 uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
1044 kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1045 }
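// Worked example (not in the original source, values hypothetical): with
// buffer_max = 1 GiB, an average buffer size of 64 KiB and a kout ratio of
// 0.5, calculated_buffer_num = 16384 and kout = 8192 entries are allowed on
// the warm_out list.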
1046
1047 if (buffer_list_bytes[BUFFER_HOT] < khot) {
1048 // hot is small, give slack to warm_in
1049 kin += khot - buffer_list_bytes[BUFFER_HOT];
1050 } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
1051 // warm_in is small, give slack to hot
1052 khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
1053 }
1054
1055 // adjust warm_in list
1056 int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
1057 uint64_t evicted = 0;
1058
1059 while (to_evict_bytes > 0) {
1060 auto p = buffer_warm_in.rbegin();
1061 if (p == buffer_warm_in.rend()) {
1062 // stop if warm_in list is now empty
1063 break;
1064 }
1065
1066 Buffer *b = &*p;
1067 assert(b->is_clean());
1068 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1069 assert(buffer_bytes >= b->length);
1070 buffer_bytes -= b->length;
1071 assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
1072 buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
1073 to_evict_bytes -= b->length;
1074 evicted += b->length;
1075 b->state = Buffer::STATE_EMPTY;
1076 b->data.clear();
1077 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1078 buffer_warm_out.push_front(*b);
1079 b->cache_private = BUFFER_WARM_OUT;
1080 }
1081
1082 if (evicted > 0) {
1083 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1084 << " from warm_in list, done evicting warm_in buffers"
1085 << dendl;
1086 }
1087
1088 // adjust hot list
1089 to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
1090 evicted = 0;
1091
1092 while (to_evict_bytes > 0) {
1093 auto p = buffer_hot.rbegin();
1094 if (p == buffer_hot.rend()) {
1095 // stop if hot list is now empty
1096 break;
1097 }
1098
1099 Buffer *b = &*p;
1100 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1101 assert(b->is_clean());
1102 // adjust evict size before buffer goes invalid
1103 to_evict_bytes -= b->length;
1104 evicted += b->length;
1105 b->space->_rm_buffer(this, b);
1106 }
1107
1108 if (evicted > 0) {
1109 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1110 << " from hot list, done evicting hot buffers"
1111 << dendl;
1112 }
1113
1114 // adjust warm out list too, if necessary
1115 int64_t num = buffer_warm_out.size() - kout;
1116 while (num-- > 0) {
1117 Buffer *b = &*buffer_warm_out.rbegin();
1118 assert(b->is_empty());
1119 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1120 b->space->_rm_buffer(this, b);
1121 }
1122 }
1123
1124 // onodes
1125 if (onode_max >= onode_lru.size()) {
1126 return; // don't even try
1127 }
1128 uint64_t num = onode_lru.size() - onode_max;
1129
1130 auto p = onode_lru.end();
1131 assert(p != onode_lru.begin());
1132 --p;
1133 int skipped = 0;
1134 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
1135 while (num > 0) {
1136 Onode *o = &*p;
1137 dout(20) << __func__ << " considering " << o << dendl;
1138 int refs = o->nref.load();
1139 if (refs > 1) {
1140 dout(20) << __func__ << " " << o->oid << " has " << refs
1141 << " refs; skipping" << dendl;
1142 if (++skipped >= max_skipped) {
1143 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
1144 << num << " left to trim" << dendl;
1145 break;
1146 }
1147
1148 if (p == onode_lru.begin()) {
1149 break;
1150 } else {
1151 p--;
1152 num--;
1153 continue;
1154 }
1155 }
1156 dout(30) << __func__ << " " << o->oid << " num=" << num << " lru size=" << onode_lru.size() << dendl;
1157 if (p != onode_lru.begin()) {
1158 onode_lru.erase(p--);
1159 } else {
1160 onode_lru.erase(p);
1161 assert(num == 1);
1162 }
1163 o->get(); // paranoia
1164 o->c->onode_map.remove(o->oid);
1165 o->put();
1166 --num;
1167 }
1168 }
1169
1170 #ifdef DEBUG_CACHE
1171 void BlueStore::TwoQCache::_audit(const char *when)
1172 {
1173 dout(10) << __func__ << " " << when << " start" << dendl;
1174 uint64_t s = 0;
1175 for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
1176 s += i->length;
1177 }
1178
1179 uint64_t hot_bytes = s;
1180 if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
1181 derr << __func__ << " hot_list_bytes "
1182 << buffer_list_bytes[BUFFER_HOT]
1183 << " != actual " << hot_bytes
1184 << dendl;
1185 assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
1186 }
1187
1188 for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
1189 s += i->length;
1190 }
1191
1192 uint64_t warm_in_bytes = s - hot_bytes;
1193 if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
1194 derr << __func__ << " warm_in_list_bytes "
1195 << buffer_list_bytes[BUFFER_WARM_IN]
1196 << " != actual " << warm_in_bytes
1197 << dendl;
1198 assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
1199 }
1200
1201 if (s != buffer_bytes) {
1202 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1203 << dendl;
1204 assert(s == buffer_bytes);
1205 }
1206
1207 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1208 << " ok" << dendl;
1209 }
1210 #endif
1211
1212
1213 // BufferSpace
1214
1215 #undef dout_prefix
1216 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1217
1218 void BlueStore::BufferSpace::_clear(Cache* cache)
1219 {
1220 // note: we already hold cache->lock
1221 ldout(cache->cct, 20) << __func__ << dendl;
1222 while (!buffer_map.empty()) {
1223 _rm_buffer(cache, buffer_map.begin());
1224 }
1225 }
1226
1227 int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
1228 {
1229 // note: we already hold cache->lock
1230 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1231 << std::dec << dendl;
1232 int cache_private = 0;
1233 cache->_audit("discard start");
1234 auto i = _data_lower_bound(offset);
1235 uint32_t end = offset + length;
1236 while (i != buffer_map.end()) {
1237 Buffer *b = i->second.get();
1238 if (b->offset >= end) {
1239 break;
1240 }
1241 if (b->cache_private > cache_private) {
1242 cache_private = b->cache_private;
1243 }
1244 if (b->offset < offset) {
1245 int64_t front = offset - b->offset;
1246 if (b->end() > end) {
1247 // drop middle (split)
1248 uint32_t tail = b->end() - end;
1249 if (b->data.length()) {
1250 bufferlist bl;
1251 bl.substr_of(b->data, b->length - tail, tail);
1252 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1253 nb->maybe_rebuild();
1254 _add_buffer(cache, nb, 0, b);
1255 } else {
1256 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1257 0, b);
1258 }
1259 if (!b->is_writing()) {
1260 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1261 }
1262 b->truncate(front);
1263 b->maybe_rebuild();
1264 cache->_audit("discard end 1");
1265 break;
1266 } else {
1267 // drop tail
1268 if (!b->is_writing()) {
1269 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1270 }
1271 b->truncate(front);
1272 b->maybe_rebuild();
1273 ++i;
1274 continue;
1275 }
1276 }
1277 if (b->end() <= end) {
1278 // drop entire buffer
1279 _rm_buffer(cache, i++);
1280 continue;
1281 }
1282 // drop front
1283 uint32_t keep = b->end() - end;
1284 if (b->data.length()) {
1285 bufferlist bl;
1286 bl.substr_of(b->data, b->length - keep, keep);
1287 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1288 nb->maybe_rebuild();
1289 _add_buffer(cache, nb, 0, b);
1290 } else {
1291 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1292 }
1293 _rm_buffer(cache, i);
1294 cache->_audit("discard end 2");
1295 break;
1296 }
1297 return cache_private;
1298 }
1299
1300 void BlueStore::BufferSpace::read(
1301 Cache* cache,
1302 uint32_t offset,
1303 uint32_t length,
1304 BlueStore::ready_regions_t& res,
1305 interval_set<uint32_t>& res_intervals,
1306 int flags)
1307 {
1308 res.clear();
1309 res_intervals.clear();
1310 uint32_t want_bytes = length;
1311 uint32_t end = offset + length;
1312
1313 {
1314 std::lock_guard<std::recursive_mutex> l(cache->lock);
1315 for (auto i = _data_lower_bound(offset);
1316 i != buffer_map.end() && offset < end && i->first < end;
1317 ++i) {
1318 Buffer *b = i->second.get();
1319 assert(b->end() > offset);
1320
1321 bool val = false;
1322 if (flags & BYPASS_CLEAN_CACHE)
1323 val = b->is_writing();
1324 else
1325 val = b->is_writing() || b->is_clean();
1326 if (val) {
1327 if (b->offset < offset) {
1328 uint32_t skip = offset - b->offset;
1329 uint32_t l = MIN(length, b->length - skip);
1330 res[offset].substr_of(b->data, skip, l);
1331 res_intervals.insert(offset, l);
1332 offset += l;
1333 length -= l;
1334 if (!b->is_writing()) {
1335 cache->_touch_buffer(b);
1336 }
1337 continue;
1338 }
1339 if (b->offset > offset) {
1340 uint32_t gap = b->offset - offset;
1341 if (length <= gap) {
1342 break;
1343 }
1344 offset += gap;
1345 length -= gap;
1346 }
1347 if (!b->is_writing()) {
1348 cache->_touch_buffer(b);
1349 }
1350 if (b->length > length) {
1351 res[offset].substr_of(b->data, 0, length);
1352 res_intervals.insert(offset, length);
1353 break;
1354 } else {
1355 res[offset].append(b->data);
1356 res_intervals.insert(offset, b->length);
1357 if (b->length == length)
1358 break;
1359 offset += b->length;
1360 length -= b->length;
1361 }
1362 }
1363 }
1364 }
1365
1366 uint64_t hit_bytes = res_intervals.size();
1367 assert(hit_bytes <= want_bytes);
1368 uint64_t miss_bytes = want_bytes - hit_bytes;
1369 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1370 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1371 }
1372
1373 void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
1374 {
1375 std::lock_guard<std::recursive_mutex> l(cache->lock);
1376
1377 auto i = writing.begin();
1378 while (i != writing.end()) {
1379 if (i->seq > seq) {
1380 break;
1381 }
1382 if (i->seq < seq) {
1383 ++i;
1384 continue;
1385 }
1386
1387 Buffer *b = &*i;
1388 assert(b->is_writing());
1389
1390 if (b->flags & Buffer::FLAG_NOCACHE) {
1391 writing.erase(i++);
1392 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1393 buffer_map.erase(b->offset);
1394 } else {
1395 b->state = Buffer::STATE_CLEAN;
1396 writing.erase(i++);
1397 b->maybe_rebuild();
1398 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1399 cache->_add_buffer(b, 1, nullptr);
1400 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1401 }
1402 }
1403
1404 cache->_audit("finish_write end");
1405 }
1406
1407 void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1408 {
1409 std::lock_guard<std::recursive_mutex> lk(cache->lock);
1410 if (buffer_map.empty())
1411 return;
1412
1413 auto p = --buffer_map.end();
1414 while (true) {
1415 if (p->second->end() <= pos)
1416 break;
1417
1418 if (p->second->offset < pos) {
1419 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1420 size_t left = pos - p->second->offset;
1421 size_t right = p->second->length - left;
1422 if (p->second->data.length()) {
1423 bufferlist bl;
1424 bl.substr_of(p->second->data, left, right);
1425 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1426 0, p->second.get());
1427 } else {
1428 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1429 0, p->second.get());
1430 }
1431 cache->_adjust_buffer_size(p->second.get(), -right);
1432 p->second->truncate(left);
1433 break;
1434 }
1435
1436 assert(p->second->end() > pos);
1437 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1438 if (p->second->data.length()) {
1439 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1440 p->second->offset - pos, p->second->data),
1441 0, p->second.get());
1442 } else {
1443 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1444 p->second->offset - pos, p->second->length),
1445 0, p->second.get());
1446 }
1447 if (p == buffer_map.begin()) {
1448 _rm_buffer(cache, p);
1449 break;
1450 } else {
1451 _rm_buffer(cache, p--);
1452 }
1453 }
1454 assert(writing.empty());
1455 }
1456
1457 // OnodeSpace
1458
1459 #undef dout_prefix
1460 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1461
1462 BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1463 {
1464 std::lock_guard<std::recursive_mutex> l(cache->lock);
1465 auto p = onode_map.find(oid);
1466 if (p != onode_map.end()) {
1467 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1468 << " raced, returning existing " << p->second
1469 << dendl;
1470 return p->second;
1471 }
1472 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1473 onode_map[oid] = o;
1474 cache->_add_onode(o, 1);
1475 return o;
1476 }
1477
1478 BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1479 {
1480 ldout(cache->cct, 30) << __func__ << dendl;
1481 OnodeRef o;
1482 bool hit = false;
1483
1484 {
1485 std::lock_guard<std::recursive_mutex> l(cache->lock);
1486 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1487 if (p == onode_map.end()) {
1488 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1489 } else {
1490 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1491 << dendl;
1492 cache->_touch_onode(p->second);
1493 hit = true;
1494 o = p->second;
1495 }
1496 }
1497
1498 if (hit) {
1499 cache->logger->inc(l_bluestore_onode_hits);
1500 } else {
1501 cache->logger->inc(l_bluestore_onode_misses);
1502 }
1503 return o;
1504 }
1505
1506 void BlueStore::OnodeSpace::clear()
1507 {
1508 std::lock_guard<std::recursive_mutex> l(cache->lock);
1509 ldout(cache->cct, 10) << __func__ << dendl;
1510 for (auto &p : onode_map) {
1511 cache->_rm_onode(p.second);
1512 }
1513 onode_map.clear();
1514 }
1515
1516 bool BlueStore::OnodeSpace::empty()
1517 {
1518 std::lock_guard<std::recursive_mutex> l(cache->lock);
1519 return onode_map.empty();
1520 }
1521
1522 void BlueStore::OnodeSpace::rename(
1523 OnodeRef& oldo,
1524 const ghobject_t& old_oid,
1525 const ghobject_t& new_oid,
1526 const mempool::bluestore_cache_other::string& new_okey)
1527 {
1528 std::lock_guard<std::recursive_mutex> l(cache->lock);
1529 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1530 << dendl;
1531 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1532 po = onode_map.find(old_oid);
1533 pn = onode_map.find(new_oid);
1534 assert(po != pn);
1535
1536 assert(po != onode_map.end());
1537 if (pn != onode_map.end()) {
1538 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1539 << dendl;
1540 cache->_rm_onode(pn->second);
1541 onode_map.erase(pn);
1542 }
1543 OnodeRef o = po->second;
1544
1545 // install a non-existent onode at old location
1546 oldo.reset(new Onode(o->c, old_oid, o->key));
1547 po->second = oldo;
1548 cache->_add_onode(po->second, 1);
1549
1550 // add at new position and fix oid, key
1551 onode_map.insert(make_pair(new_oid, o));
1552 cache->_touch_onode(o);
1553 o->oid = new_oid;
1554 o->key = new_okey;
1555 }
1556
1557 bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1558 {
1559 std::lock_guard<std::recursive_mutex> l(cache->lock);
1560 ldout(cache->cct, 20) << __func__ << dendl;
1561 for (auto& i : onode_map) {
1562 if (f(i.second)) {
1563 return true;
1564 }
1565 }
1566 return false;
1567 }
1568
1569 void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
1570 {
1571 for (auto& i : onode_map) {
1572 ldout(cct, lvl) << i.first << " : " << i.second << dendl;
1573 }
1574 }
1575
1576 // SharedBlob
1577
1578 #undef dout_prefix
1579 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1580
1581 ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1582 {
1583 out << "SharedBlob(" << &sb;
1584
1585 if (sb.loaded) {
1586 out << " loaded " << *sb.persistent;
1587 } else {
1588 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1589 }
1590 return out << ")";
1591 }
1592
1593 BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1594 : coll(_coll), sbid_unloaded(i)
1595 {
1596 assert(sbid_unloaded > 0);
1597 if (get_cache()) {
1598 get_cache()->add_blob();
1599 }
1600 }
1601
1602 BlueStore::SharedBlob::~SharedBlob()
1603 {
1604 if (loaded && persistent) {
1605 delete persistent;
1606 }
1607 }
1608
1609 void BlueStore::SharedBlob::put()
1610 {
1611 if (--nref == 0) {
1612 ldout(coll->store->cct, 20) << __func__ << " " << this
1613 << " removing self from set " << get_parent()
1614 << dendl;
1615 again:
1616 auto coll_snap = coll;
1617 if (coll_snap) {
1618 std::lock_guard<std::recursive_mutex> l(coll_snap->cache->lock);
1619 if (coll_snap != coll) {
1620 goto again;
1621 }
1622 if (!coll_snap->shared_blob_set.remove(this, true)) {
1623 // race with lookup
1624 return;
1625 }
1626 bc._clear(coll_snap->cache);
1627 coll_snap->cache->rm_blob();
1628 }
1629 delete this;
1630 }
1631 }
1632
1633 void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1634 {
1635 assert(persistent);
1636 persistent->ref_map.get(offset, length);
1637 }
1638
1639 void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
1640 PExtentVector *r,
1641 set<SharedBlob*> *maybe_unshared)
1642 {
1643 assert(persistent);
1644 bool maybe = false;
1645 persistent->ref_map.put(offset, length, r, maybe_unshared ? &maybe : nullptr);
1646 if (maybe_unshared && maybe) {
1647 maybe_unshared->insert(this);
1648 }
1649 }
1650
1651 // SharedBlobSet
1652
1653 #undef dout_prefix
1654 #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
1655
1656 void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
1657 {
1658 std::lock_guard<std::mutex> l(lock);
1659 for (auto& i : sb_map) {
1660 ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
1661 }
1662 }
1663
1664 // Blob
1665
1666 #undef dout_prefix
1667 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1668
1669 ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1670 {
1671 out << "Blob(" << &b;
1672 if (b.is_spanning()) {
1673 out << " spanning " << b.id;
1674 }
1675 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1676 if (b.shared_blob) {
1677 out << " " << *b.shared_blob;
1678 } else {
1679 out << " (shared_blob=NULL)";
1680 }
1681 out << ")";
1682 return out;
1683 }
1684
1685 void BlueStore::Blob::discard_unallocated(Collection *coll)
1686 {
1687 if (get_blob().is_shared()) {
1688 return;
1689 }
1690 if (get_blob().is_compressed()) {
1691 bool discard = false;
1692 bool all_invalid = true;
1693 for (auto e : get_blob().get_extents()) {
1694 if (!e.is_valid()) {
1695 discard = true;
1696 } else {
1697 all_invalid = false;
1698 }
1699 }
1700 assert(discard == all_invalid); // for a compressed blob either all
1701 // pextents are invalid or none are.
1702 if (discard) {
1703 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1704 get_blob().get_logical_length());
1705 }
1706 } else {
1707 size_t pos = 0;
1708 for (auto e : get_blob().get_extents()) {
1709 if (!e.is_valid()) {
1710 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1711 << "~" << e.length
1712 << std::dec << dendl;
1713 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1714 }
1715 pos += e.length;
1716 }
1717 if (get_blob().can_prune_tail()) {
1718 dirty_blob().prune_tail();
1719 used_in_blob.prune_tail(get_blob().get_ondisk_length());
1720 auto cct = coll->store->cct; //used by dout
1721 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
1722 }
1723 }
1724 }
1725
1726 void BlueStore::Blob::get_ref(
1727 Collection *coll,
1728 uint32_t offset,
1729 uint32_t length)
1730 {
1731 // The caller has to initialize the Blob's logical length prior to
1732 // incrementing references. Otherwise it is impossible to determine the
1733 // required number of counters for per-au tracking, or to obtain
1734 // min_release_size for single-counter mode.
1735 assert(get_blob().get_logical_length() != 0);
1736 auto cct = coll->store->cct;
1737 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1738 << std::dec << " " << *this << dendl;
1739
1740 if (used_in_blob.is_empty()) {
1741 uint32_t min_release_size =
1742 get_blob().get_release_size(coll->store->min_alloc_size);
1743 uint64_t l = get_blob().get_logical_length();
1744 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1745 << min_release_size << std::dec << dendl;
1746 used_in_blob.init(l, min_release_size);
1747 }
1748 used_in_blob.get(
1749 offset,
1750 length);
1751 }
1752
1753 bool BlueStore::Blob::put_ref(
1754 Collection *coll,
1755 uint32_t offset,
1756 uint32_t length,
1757 PExtentVector *r)
1758 {
1759 PExtentVector logical;
1760
1761 auto cct = coll->store->cct;
1762 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1763 << std::dec << " " << *this << dendl;
1764
1765 bool empty = used_in_blob.put(
1766 offset,
1767 length,
1768 &logical);
1769 r->clear();
1770 // nothing to release
1771 if (!empty && logical.empty()) {
1772 return false;
1773 }
1774
1775 bluestore_blob_t& b = dirty_blob();
1776 return b.release_extents(empty, logical, r);
1777 }
1778
1779 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
1780 uint32_t target_blob_size,
1781 uint32_t b_offset,
1782 uint32_t *length0) {
1783 assert(min_alloc_size);
1784 assert(target_blob_size);
1785 if (!get_blob().is_mutable()) {
1786 return false;
1787 }
1788
1789 uint32_t length = *length0;
1790 uint32_t end = b_offset + length;
1791
1792 // Currently, for the sake of simplicity, we omit blob reuse if the data is
1793 // unaligned with the csum chunk. Later we can perform padding if needed.
1794 if (get_blob().has_csum() &&
1795 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1796 (end % get_blob().get_csum_chunk_size()) != 0)) {
1797 return false;
1798 }
1799
1800 auto blen = get_blob().get_logical_length();
1801 uint32_t new_blen = blen;
1802
1803 // make sure target_blob_size isn't less than current blob len
1804 target_blob_size = MAX(blen, target_blob_size);
1805
1806 if (b_offset >= blen) {
1807 // new data totally stands out of the existing blob
1808 new_blen = end;
1809 } else {
1810 // new data overlaps with the existing blob
1811 new_blen = MAX(blen, end);
1812
1813 uint32_t overlap = 0;
1814 if (new_blen > blen) {
1815 overlap = blen - b_offset;
1816 } else {
1817 overlap = length;
1818 }
1819
1820 if (!get_blob().is_unallocated(b_offset, overlap)) {
1821 // abort if any piece of the overlap has already been allocated
1822 return false;
1823 }
1824 }
1825
1826 if (new_blen > blen) {
1827 int64_t overflow = int64_t(new_blen) - target_blob_size;
1828 // Unable to decrease the provided length enough to fit into target_blob_size
1829 if (overflow >= length) {
1830 return false;
1831 }
1832
1833 // FIXME: in some cases we could reduce unused resolution
1834 if (get_blob().has_unused()) {
1835 return false;
1836 }
1837
1838 if (overflow > 0) {
1839 new_blen -= overflow;
1840 length -= overflow;
1841 *length0 = length;
1842 }
1843
1844 if (new_blen > blen) {
1845 dirty_blob().add_tail(new_blen);
1846 used_in_blob.add_tail(new_blen,
1847 get_blob().get_release_size(min_alloc_size));
1848 }
1849 }
1850 return true;
1851 }
1852
1853 void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1854 {
1855 auto cct = coll->store->cct; //used by dout
1856 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1857 << " start " << *this << dendl;
1858 assert(blob.can_split());
1859 assert(used_in_blob.can_split());
1860 bluestore_blob_t &lb = dirty_blob();
1861 bluestore_blob_t &rb = r->dirty_blob();
1862
1863 used_in_blob.split(
1864 blob_offset,
1865 &(r->used_in_blob));
1866
1867 lb.split(blob_offset, rb);
1868 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1869
1870 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1871 << " finish " << *this << dendl;
1872 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1873 << " and " << *r << dendl;
1874 }
1875
1876 #ifndef CACHE_BLOB_BL
1877 void BlueStore::Blob::decode(
1878 Collection *coll,
1879 bufferptr::iterator& p,
1880 uint64_t struct_v,
1881 uint64_t* sbid,
1882 bool include_ref_map)
1883 {
1884 denc(blob, p, struct_v);
1885 if (blob.is_shared()) {
1886 denc(*sbid, p);
1887 }
1888 if (include_ref_map) {
1889 if (struct_v > 1) {
1890 used_in_blob.decode(p);
1891 } else {
1892 used_in_blob.clear();
1893 bluestore_extent_ref_map_t legacy_ref_map;
1894 legacy_ref_map.decode(p);
1895 for (auto r : legacy_ref_map.ref_map) {
1896 get_ref(
1897 coll,
1898 r.first,
1899 r.second.refs * r.second.length);
1900 }
1901 }
1902 }
1903 }
1904 #endif
1905
1906 // Extent
1907
1908 ostream& operator<<(ostream& out, const BlueStore::Extent& e)
1909 {
1910 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
1911 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
1912 << " " << *e.blob;
1913 }
1914
1915 // OldExtent
1916 BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
1917 uint32_t lo,
1918 uint32_t o,
1919 uint32_t l,
1920 BlobRef& b) {
1921 OldExtent* oe = new OldExtent(lo, o, l, b);
1922 b->put_ref(c.get(), o, l, &(oe->r));
1923 oe->blob_empty = b->get_referenced_bytes() == 0;
1924 return oe;
1925 }
1926
1927 // ExtentMap
1928
1929 #undef dout_prefix
1930 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1931
1932 BlueStore::ExtentMap::ExtentMap(Onode *o)
1933 : onode(o),
1934 inline_bl(
1935 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
1936 }
1937
1938 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
1939 bool force)
1940 {
1941 auto cct = onode->c->store->cct; //used by dout
1942 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
1943 if (onode->onode.extent_map_shards.empty()) {
1944 if (inline_bl.length() == 0) {
1945 unsigned n;
1946 // we need to encode inline_bl to measure encoded length
1947 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
1948 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
1949 assert(!never_happen);
1950 size_t len = inline_bl.length();
1951 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
1952 << " extents" << dendl;
1953 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
1954 request_reshard(0, OBJECT_MAX_SIZE);
1955 return;
1956 }
1957 }
1958 // will persist in the onode key.
1959 } else {
1960 // pending shard update
1961 struct dirty_shard_t {
1962 Shard *shard;
1963 bufferlist bl;
1964 dirty_shard_t(Shard *s) : shard(s) {}
1965 };
1966 vector<dirty_shard_t> encoded_shards;
1967 // allocate slots for all shards in a single call instead of
1968 // doing multiple allocations - one per each dirty shard
1969 encoded_shards.reserve(shards.size());
1970
1971 auto p = shards.begin();
1972 auto prev_p = p;
1973 while (p != shards.end()) {
1974 assert(p->shard_info->offset >= prev_p->shard_info->offset);
1975 auto n = p;
1976 ++n;
1977 if (p->dirty) {
1978 uint32_t endoff;
1979 if (n == shards.end()) {
1980 endoff = OBJECT_MAX_SIZE;
1981 } else {
1982 endoff = n->shard_info->offset;
1983 }
1984 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
1985 bufferlist& bl = encoded_shards.back().bl;
1986 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
1987 bl, &p->extents)) {
1988 if (force) {
1989 derr << __func__ << " encode_some needs reshard" << dendl;
1990 assert(!force);
1991 }
1992 }
1993 size_t len = bl.length();
1994
1995 dout(20) << __func__ << " shard 0x" << std::hex
1996 << p->shard_info->offset << std::dec << " is " << len
1997 << " bytes (was " << p->shard_info->bytes << ") from "
1998 << p->extents << " extents" << dendl;
1999
2000 if (!force) {
2001 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2002 // we are big; reshard ourselves
2003 request_reshard(p->shard_info->offset, endoff);
2004 }
2005 // avoid resharding the trailing shard, even if it is small
2006 else if (n != shards.end() &&
2007 len < cct->_conf->bluestore_extent_map_shard_min_size) {
2008 assert(endoff != OBJECT_MAX_SIZE);
2009 if (p == shards.begin()) {
2010 // we are the first shard, combine with next shard
2011 request_reshard(p->shard_info->offset, endoff + 1);
2012 } else {
2013 // combine either with the previous shard or the next,
2014 // whichever is smaller
2015 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2016 request_reshard(p->shard_info->offset, endoff + 1);
2017 } else {
2018 request_reshard(prev_p->shard_info->offset, endoff);
2019 }
2020 }
2021 }
2022 }
2023 }
2024 prev_p = p;
2025 p = n;
2026 }
2027 if (needs_reshard()) {
2028 return;
2029 }
2030
2031 // schedule DB update for dirty shards
2032 string key;
2033 for (auto& it : encoded_shards) {
2034 it.shard->dirty = false;
2035 it.shard->shard_info->bytes = it.bl.length();
2036 generate_extent_shard_key_and_apply(
2037 onode->key,
2038 it.shard->shard_info->offset,
2039 &key,
2040 [&](const string& final_key) {
2041 t->set(PREFIX_OBJ, final_key, it.bl);
2042 }
2043 );
2044 }
2045 }
2046 }
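// Rough sketch of the sizing policy above (illustrative values, not taken
// from the source): with bluestore_extent_map_shard_max_size = 1200 and
// bluestore_extent_map_shard_min_size = 150, an inline map that encodes to
// 1500 bytes requests a full reshard of [0, OBJECT_MAX_SIZE), while a dirty
// 120-byte middle shard asks to be merged with whichever neighbour
// (previous or next) currently holds fewer bytes.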
2047
2048 bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2049 {
2050 if (spanning_blob_map.empty())
2051 return 0;
2052 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2053 // if bid did not wrap around, it is valid and available.
2054 if (bid >= 0)
2055 return bid;
2056 // Otherwise find the next unused bid.
2057 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2058 const auto begin_bid = bid;
2059 do {
2060 if (!spanning_blob_map.count(bid))
2061 return bid;
2062 else {
2063 bid++;
2064 if (bid < 0) bid = 0;
2065 }
2066 } while (bid != begin_bid);
2067 assert(0 == "no available blob id");
2068 }
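// Allocation sketch: ids normally grow monotonically from the largest key in
// spanning_blob_map (e.g. keys {0, 1} yield 2). Only when that counter would
// wrap negative (bid_t is signed) do we fall back to a linear probe from a
// random starting id; the probe either finds a free id or walks the whole id
// space back to begin_bid and asserts.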
2069
2070 void BlueStore::ExtentMap::reshard(
2071 KeyValueDB *db,
2072 KeyValueDB::Transaction t)
2073 {
2074 auto cct = onode->c->store->cct; // used by dout
2075
2076 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2077 << needs_reshard_end << ")" << std::dec
2078 << " of " << onode->onode.extent_map_shards.size()
2079 << " shards on " << onode->oid << dendl;
2080 for (auto& p : spanning_blob_map) {
2081 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2082 << dendl;
2083 }
2084 // determine shard index range
2085 unsigned si_begin = 0, si_end = 0;
2086 if (!shards.empty()) {
2087 while (si_begin + 1 < shards.size() &&
2088 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2089 ++si_begin;
2090 }
2091 needs_reshard_begin = shards[si_begin].shard_info->offset;
2092 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2093 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2094 needs_reshard_end = shards[si_end].shard_info->offset;
2095 break;
2096 }
2097 }
2098 if (si_end == shards.size()) {
2099 needs_reshard_end = OBJECT_MAX_SIZE;
2100 }
2101 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2102 << " over 0x[" << std::hex << needs_reshard_begin << ","
2103 << needs_reshard_end << ")" << std::dec << dendl;
2104 }
2105
2106 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2107
2108 // we may need to fault in a larger interval later: we must have all
2109 // referring extents for spanning blobs loaded in order to have
2110 // accurate use_tracker values.
2111 uint32_t spanning_scan_begin = needs_reshard_begin;
2112 uint32_t spanning_scan_end = needs_reshard_end;
2113
2114 // remove old keys
2115 string key;
2116 for (unsigned i = si_begin; i < si_end; ++i) {
2117 generate_extent_shard_key_and_apply(
2118 onode->key, shards[i].shard_info->offset, &key,
2119 [&](const string& final_key) {
2120 t->rmkey(PREFIX_OBJ, final_key);
2121 }
2122 );
2123 }
2124
2125 // calculate average extent size
2126 unsigned bytes = 0;
2127 unsigned extents = 0;
2128 if (onode->onode.extent_map_shards.empty()) {
2129 bytes = inline_bl.length();
2130 extents = extent_map.size();
2131 } else {
2132 for (unsigned i = si_begin; i < si_end; ++i) {
2133 bytes += shards[i].shard_info->bytes;
2134 extents += shards[i].extents;
2135 }
2136 }
2137 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2138 unsigned slop = target *
2139 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2140 unsigned extent_avg = bytes / MAX(1, extents);
2141 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2142 << ", slop " << slop << dendl;
2143
2144 // reshard
2145 unsigned estimate = 0;
2146 unsigned offset = needs_reshard_begin;
2147 vector<bluestore_onode_t::shard_info> new_shard_info;
2148 unsigned max_blob_end = 0;
2149 Extent dummy(needs_reshard_begin);
2150 for (auto e = extent_map.lower_bound(dummy);
2151 e != extent_map.end();
2152 ++e) {
2153 if (e->logical_offset >= needs_reshard_end) {
2154 break;
2155 }
2156 dout(30) << " extent " << *e << dendl;
2157
2158 // disfavor shard boundaries that span a blob
2159 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2160 if (estimate &&
2161 estimate + extent_avg > target + (would_span ? slop : 0)) {
2162 // new shard
2163 if (offset == needs_reshard_begin) {
2164 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2165 new_shard_info.back().offset = offset;
2166 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2167 << std::dec << dendl;
2168 }
2169 offset = e->logical_offset;
2170 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2171 new_shard_info.back().offset = offset;
2172 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2173 << std::dec << dendl;
2174 estimate = 0;
2175 }
2176 estimate += extent_avg;
2177 unsigned bs = e->blob_start();
2178 if (bs < spanning_scan_begin) {
2179 spanning_scan_begin = bs;
2180 }
2181 uint32_t be = e->blob_end();
2182 if (be > max_blob_end) {
2183 max_blob_end = be;
2184 }
2185 if (be > spanning_scan_end) {
2186 spanning_scan_end = be;
2187 }
2188 }
2189 if (new_shard_info.empty() && (si_begin > 0 ||
2190 si_end < shards.size())) {
2191 // we resharded a partial range; we must produce at least one output
2192 // shard
2193 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2194 new_shard_info.back().offset = needs_reshard_begin;
2195 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2196 << std::dec << " (singleton degenerate case)" << dendl;
2197 }
2198
2199 auto& sv = onode->onode.extent_map_shards;
2200 dout(20) << __func__ << " new " << new_shard_info << dendl;
2201 dout(20) << __func__ << " old " << sv << dendl;
2202 if (sv.empty()) {
2203 // no old shards to keep
2204 sv.swap(new_shard_info);
2205 init_shards(true, true);
2206 } else {
2207 // splice in new shards
2208 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2209 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2210 sv.insert(
2211 sv.begin() + si_begin,
2212 new_shard_info.begin(),
2213 new_shard_info.end());
2214 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2215 si_end = si_begin + new_shard_info.size();
2216
2217 assert(sv.size() == shards.size());
2218
2219 // note that we need to update every shard_info of shards here,
2220 // as sv might have been totally re-allocated above
2221 for (unsigned i = 0; i < shards.size(); i++) {
2222 shards[i].shard_info = &sv[i];
2223 }
2224
2225 // mark newly added shards as dirty
2226 for (unsigned i = si_begin; i < si_end; ++i) {
2227 shards[i].loaded = true;
2228 shards[i].dirty = true;
2229 }
2230 }
2231 dout(20) << __func__ << " fin " << sv << dendl;
2232 inline_bl.clear();
2233
2234 if (sv.empty()) {
2235 // no more shards; unspan all previously spanning blobs
2236 auto p = spanning_blob_map.begin();
2237 while (p != spanning_blob_map.end()) {
2238 p->second->id = -1;
2239 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2240 p = spanning_blob_map.erase(p);
2241 }
2242 } else {
2243 // identify new spanning blobs
2244 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2245 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2246 if (spanning_scan_begin < needs_reshard_begin) {
2247 fault_range(db, spanning_scan_begin,
2248 needs_reshard_begin - spanning_scan_begin);
2249 }
2250 if (spanning_scan_end > needs_reshard_end) {
2251 fault_range(db, needs_reshard_end,
2252 spanning_scan_end - needs_reshard_end);
2253 }
2254 auto sp = sv.begin() + si_begin;
2255 auto esp = sv.end();
2256 unsigned shard_start = sp->offset;
2257 unsigned shard_end;
2258 ++sp;
2259 if (sp == esp) {
2260 shard_end = OBJECT_MAX_SIZE;
2261 } else {
2262 shard_end = sp->offset;
2263 }
2264 Extent dummy(needs_reshard_begin);
2265 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2266 if (e->logical_offset >= needs_reshard_end) {
2267 break;
2268 }
2269 dout(30) << " extent " << *e << dendl;
2270 while (e->logical_offset >= shard_end) {
2271 shard_start = shard_end;
2272 assert(sp != esp);
2273 ++sp;
2274 if (sp == esp) {
2275 shard_end = OBJECT_MAX_SIZE;
2276 } else {
2277 shard_end = sp->offset;
2278 }
2279 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2280 << " to 0x" << shard_end << std::dec << dendl;
2281 }
2282 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2283 if (!e->blob->is_spanning()) {
2284 // We have two options: (1) split the blob into pieces at the
2285 // shard boundaries (and adjust extents accordingly), or (2)
2286 // mark it spanning. We prefer to cut the blob if we can. Note that
2287 // we may have to split it multiple times--potentially at every
2288 // shard boundary.
2289 bool must_span = false;
2290 BlobRef b = e->blob;
2291 if (b->can_split()) {
2292 uint32_t bstart = e->blob_start();
2293 uint32_t bend = e->blob_end();
2294 for (const auto& sh : shards) {
2295 if (bstart < sh.shard_info->offset &&
2296 bend > sh.shard_info->offset) {
2297 uint32_t blob_offset = sh.shard_info->offset - bstart;
2298 if (b->can_split_at(blob_offset)) {
2299 dout(20) << __func__ << " splitting blob, bstart 0x"
2300 << std::hex << bstart << " blob_offset 0x"
2301 << blob_offset << std::dec << " " << *b << dendl;
2302 b = split_blob(b, blob_offset, sh.shard_info->offset);
2303 // switch b to the new right-hand side, in case it
2304 // *also* has to get split.
2305 bstart += blob_offset;
2306 onode->c->store->logger->inc(l_bluestore_blob_split);
2307 } else {
2308 must_span = true;
2309 break;
2310 }
2311 }
2312 }
2313 } else {
2314 must_span = true;
2315 }
2316 if (must_span) {
2317 auto bid = allocate_spanning_blob_id();
2318 b->id = bid;
2319 spanning_blob_map[b->id] = b;
2320 dout(20) << __func__ << " adding spanning " << *b << dendl;
2321 }
2322 }
2323 } else {
2324 if (e->blob->is_spanning()) {
2325 spanning_blob_map.erase(e->blob->id);
2326 e->blob->id = -1;
2327 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2328 }
2329 }
2330 }
2331 }
2332
2333 clear_needs_reshard();
2334 }
2335
2336 bool BlueStore::ExtentMap::encode_some(
2337 uint32_t offset,
2338 uint32_t length,
2339 bufferlist& bl,
2340 unsigned *pn)
2341 {
2342 auto cct = onode->c->store->cct; //used by dout
2343 Extent dummy(offset);
2344 auto start = extent_map.lower_bound(dummy);
2345 uint32_t end = offset + length;
2346
2347 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2348 // serialization only. Hence there is no specific
2349 // handling at ExtentMap level.
2350
2351 unsigned n = 0;
2352 size_t bound = 0;
2353 bool must_reshard = false;
2354 for (auto p = start;
2355 p != extent_map.end() && p->logical_offset < end;
2356 ++p, ++n) {
2357 assert(p->logical_offset >= offset);
2358 p->blob->last_encoded_id = -1;
2359 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2360 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2361 << std::dec << " hit new spanning blob " << *p << dendl;
2362 request_reshard(p->blob_start(), p->blob_end());
2363 must_reshard = true;
2364 }
2365 if (!must_reshard) {
2366 denc_varint(0, bound); // blobid
2367 denc_varint(0, bound); // logical_offset
2368 denc_varint(0, bound); // len
2369 denc_varint(0, bound); // blob_offset
2370
2371 p->blob->bound_encode(
2372 bound,
2373 struct_v,
2374 p->blob->shared_blob->get_sbid(),
2375 false);
2376 }
2377 }
2378 if (must_reshard) {
2379 return true;
2380 }
2381
2382 denc(struct_v, bound);
2383 denc_varint(0, bound); // number of extents
2384
2385 {
2386 auto app = bl.get_contiguous_appender(bound);
2387 denc(struct_v, app);
2388 denc_varint(n, app);
2389 if (pn) {
2390 *pn = n;
2391 }
2392
2393 n = 0;
2394 uint64_t pos = 0;
2395 uint64_t prev_len = 0;
2396 for (auto p = start;
2397 p != extent_map.end() && p->logical_offset < end;
2398 ++p, ++n) {
2399 unsigned blobid;
2400 bool include_blob = false;
2401 if (p->blob->is_spanning()) {
2402 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2403 blobid |= BLOBID_FLAG_SPANNING;
2404 } else if (p->blob->last_encoded_id < 0) {
2405 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2406 include_blob = true;
2407 blobid = 0; // the decoder will infer the id from n
2408 } else {
2409 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2410 }
2411 if (p->logical_offset == pos) {
2412 blobid |= BLOBID_FLAG_CONTIGUOUS;
2413 }
2414 if (p->blob_offset == 0) {
2415 blobid |= BLOBID_FLAG_ZEROOFFSET;
2416 }
2417 if (p->length == prev_len) {
2418 blobid |= BLOBID_FLAG_SAMELENGTH;
2419 } else {
2420 prev_len = p->length;
2421 }
2422 denc_varint(blobid, app);
2423 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2424 denc_varint_lowz(p->logical_offset - pos, app);
2425 }
2426 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2427 denc_varint_lowz(p->blob_offset, app);
2428 }
2429 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2430 denc_varint_lowz(p->length, app);
2431 }
2432 pos = p->logical_end();
2433 if (include_blob) {
2434 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2435 }
2436 }
2437 }
2438 /*derr << __func__ << bl << dendl;
2439 derr << __func__ << ":";
2440 bl.hexdump(*_dout);
2441 *_dout << dendl;
2442 */
2443 return false;
2444 }
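// Example of the per-extent encoding above (illustrative): a non-spanning
// blob already emitted with last_encoded_id = 3, for an extent that starts
// exactly where the previous one ended, has blob_offset 0 and a new length,
// encodes blobid = (3 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_CONTIGUOUS |
// BLOBID_FLAG_ZEROOFFSET = 0x33 followed only by a varint_lowz length; the
// logical_offset delta and blob_offset fields are omitted entirely.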
2445
2446 unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2447 {
2448 auto cct = onode->c->store->cct; //used by dout
2449 /*
2450 derr << __func__ << ":";
2451 bl.hexdump(*_dout);
2452 *_dout << dendl;
2453 */
2454
2455 assert(bl.get_num_buffers() <= 1);
2456 auto p = bl.front().begin_deep();
2457 __u8 struct_v;
2458 denc(struct_v, p);
2459 // Version 2 differs from v1 in blob's ref_map
2460 // serialization only. Hence there is no specific
2461 // handling at ExtentMap level below.
2462 assert(struct_v == 1 || struct_v == 2);
2463
2464 uint32_t num;
2465 denc_varint(num, p);
2466 vector<BlobRef> blobs(num);
2467 uint64_t pos = 0;
2468 uint64_t prev_len = 0;
2469 unsigned n = 0;
2470
2471 while (!p.end()) {
2472 Extent *le = new Extent();
2473 uint64_t blobid;
2474 denc_varint(blobid, p);
2475 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2476 uint64_t gap;
2477 denc_varint_lowz(gap, p);
2478 pos += gap;
2479 }
2480 le->logical_offset = pos;
2481 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2482 denc_varint_lowz(le->blob_offset, p);
2483 } else {
2484 le->blob_offset = 0;
2485 }
2486 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2487 denc_varint_lowz(prev_len, p);
2488 }
2489 le->length = prev_len;
2490
2491 if (blobid & BLOBID_FLAG_SPANNING) {
2492 dout(30) << __func__ << " getting spanning blob "
2493 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2494 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2495 } else {
2496 blobid >>= BLOBID_SHIFT_BITS;
2497 if (blobid) {
2498 le->assign_blob(blobs[blobid - 1]);
2499 assert(le->blob);
2500 } else {
2501 Blob *b = new Blob();
2502 uint64_t sbid = 0;
2503 b->decode(onode->c, p, struct_v, &sbid, false);
2504 blobs[n] = b;
2505 onode->c->open_shared_blob(sbid, b);
2506 le->assign_blob(b);
2507 }
2508 // we build ref_map dynamically for non-spanning blobs
2509 le->blob->get_ref(
2510 onode->c,
2511 le->blob_offset,
2512 le->length);
2513 }
2514 pos += prev_len;
2515 ++n;
2516 extent_map.insert(*le);
2517 }
2518
2519 assert(n == num);
2520 return num;
2521 }
2522
2523 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2524 {
2525 // Version 2 differs from v1 in blob's ref_map
2526 // serialization only. Hence there is no specific
2527 // handling at ExtentMap level.
2528 __u8 struct_v = 2;
2529
2530 denc(struct_v, p);
2531 denc_varint((uint32_t)0, p);
2532 size_t key_size = 0;
2533 denc_varint((uint32_t)0, key_size);
2534 p += spanning_blob_map.size() * key_size;
2535 for (const auto& i : spanning_blob_map) {
2536 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2537 }
2538 }
2539
2540 void BlueStore::ExtentMap::encode_spanning_blobs(
2541 bufferlist::contiguous_appender& p)
2542 {
2543 // Version 2 differs from v1 in blob's ref_map
2544 // serialization only. Hence there is no specific
2545 // handling at ExtentMap level.
2546 __u8 struct_v = 2;
2547
2548 denc(struct_v, p);
2549 denc_varint(spanning_blob_map.size(), p);
2550 for (auto& i : spanning_blob_map) {
2551 denc_varint(i.second->id, p);
2552 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2553 }
2554 }
2555
2556 void BlueStore::ExtentMap::decode_spanning_blobs(
2557 bufferptr::iterator& p)
2558 {
2559 __u8 struct_v;
2560 denc(struct_v, p);
2561 // Version 2 differs from v1 in blob's ref_map
2562 // serialization only. Hence there is no specific
2563 // handling at ExtentMap level.
2564 assert(struct_v == 1 || struct_v == 2);
2565
2566 unsigned n;
2567 denc_varint(n, p);
2568 while (n--) {
2569 BlobRef b(new Blob());
2570 denc_varint(b->id, p);
2571 spanning_blob_map[b->id] = b;
2572 uint64_t sbid = 0;
2573 b->decode(onode->c, p, struct_v, &sbid, true);
2574 onode->c->open_shared_blob(sbid, b);
2575 }
2576 }
2577
2578 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2579 {
2580 shards.resize(onode->onode.extent_map_shards.size());
2581 unsigned i = 0;
2582 for (auto &s : onode->onode.extent_map_shards) {
2583 shards[i].shard_info = &s;
2584 shards[i].loaded = loaded;
2585 shards[i].dirty = dirty;
2586 ++i;
2587 }
2588 }
2589
2590 void BlueStore::ExtentMap::fault_range(
2591 KeyValueDB *db,
2592 uint32_t offset,
2593 uint32_t length)
2594 {
2595 auto cct = onode->c->store->cct; //used by dout
2596 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2597 << std::dec << dendl;
2598 auto start = seek_shard(offset);
2599 auto last = seek_shard(offset + length);
2600
2601 if (start < 0)
2602 return;
2603
2604 assert(last >= start);
2605 string key;
2606 while (start <= last) {
2607 assert((size_t)start < shards.size());
2608 auto p = &shards[start];
2609 if (!p->loaded) {
2610 dout(30) << __func__ << " opening shard 0x" << std::hex
2611 << p->shard_info->offset << std::dec << dendl;
2612 bufferlist v;
2613 generate_extent_shard_key_and_apply(
2614 onode->key, p->shard_info->offset, &key,
2615 [&](const string& final_key) {
2616 int r = db->get(PREFIX_OBJ, final_key, &v);
2617 if (r < 0) {
2618 derr << __func__ << " missing shard 0x" << std::hex
2619 << p->shard_info->offset << std::dec << " for " << onode->oid
2620 << dendl;
2621 assert(r >= 0);
2622 }
2623 }
2624 );
2625 p->extents = decode_some(v);
2626 p->loaded = true;
2627 dout(20) << __func__ << " open shard 0x" << std::hex
2628 << p->shard_info->offset << std::dec
2629 << " (" << v.length() << " bytes)" << dendl;
2630 assert(p->dirty == false);
2631 assert(v.length() == p->shard_info->bytes);
2632 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2633 } else {
2634 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2635 }
2636 ++start;
2637 }
2638 }
2639
2640 void BlueStore::ExtentMap::dirty_range(
2641 uint32_t offset,
2642 uint32_t length)
2643 {
2644 auto cct = onode->c->store->cct; //used by dout
2645 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2646 << std::dec << dendl;
2647 if (shards.empty()) {
2648 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2649 inline_bl.clear();
2650 return;
2651 }
2652 auto start = seek_shard(offset);
2653 auto last = seek_shard(offset + length);
2654 if (start < 0)
2655 return;
2656
2657 assert(last >= start);
2658 while (start <= last) {
2659 assert((size_t)start < shards.size());
2660 auto p = &shards[start];
2661 if (!p->loaded) {
2662 dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
2663 << std::dec << " is not loaded, can't mark dirty" << dendl;
2664 assert(0 == "can't mark unloaded shard dirty");
2665 }
2666 if (!p->dirty) {
2667 dout(20) << __func__ << " mark shard 0x" << std::hex
2668 << p->shard_info->offset << std::dec << " dirty" << dendl;
2669 p->dirty = true;
2670 }
2671 ++start;
2672 }
2673 }
2674
2675 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2676 uint64_t offset)
2677 {
2678 Extent dummy(offset);
2679 return extent_map.find(dummy);
2680 }
2681
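// seek_lextent returns the first lextent whose logical range ends after
// 'offset': either the extent containing 'offset' itself or, if 'offset'
// falls in a hole, the next extent to its right (possibly end()).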
2682 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2683 uint64_t offset)
2684 {
2685 Extent dummy(offset);
2686 auto fp = extent_map.lower_bound(dummy);
2687 if (fp != extent_map.begin()) {
2688 --fp;
2689 if (fp->logical_end() <= offset) {
2690 ++fp;
2691 }
2692 }
2693 return fp;
2694 }
2695
2696 BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2697 uint64_t offset) const
2698 {
2699 Extent dummy(offset);
2700 auto fp = extent_map.lower_bound(dummy);
2701 if (fp != extent_map.begin()) {
2702 --fp;
2703 if (fp->logical_end() <= offset) {
2704 ++fp;
2705 }
2706 }
2707 return fp;
2708 }
2709
2710 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2711 {
2712 auto fp = seek_lextent(offset);
2713 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2714 return false;
2715 }
2716 return true;
2717 }
2718
2719 int BlueStore::ExtentMap::compress_extent_map(
2720 uint64_t offset,
2721 uint64_t length)
2722 {
2723 auto cct = onode->c->store->cct; //used by dout
2724 if (extent_map.empty())
2725 return 0;
2726 int removed = 0;
2727 auto p = seek_lextent(offset);
2728 if (p != extent_map.begin()) {
2729 --p; // start to the left of offset
2730 }
2731 // the caller should have just written to this region
2732 assert(p != extent_map.end());
2733
2734 // identify the *next* shard
2735 auto pshard = shards.begin();
2736 while (pshard != shards.end() &&
2737 p->logical_offset >= pshard->shard_info->offset) {
2738 ++pshard;
2739 }
2740 uint64_t shard_end;
2741 if (pshard != shards.end()) {
2742 shard_end = pshard->shard_info->offset;
2743 } else {
2744 shard_end = OBJECT_MAX_SIZE;
2745 }
2746
2747 auto n = p;
2748 for (++n; n != extent_map.end(); p = n++) {
2749 if (n->logical_offset > offset + length) {
2750 break; // stop after end
2751 }
2752 while (n != extent_map.end() &&
2753 p->logical_end() == n->logical_offset &&
2754 p->blob == n->blob &&
2755 p->blob_offset + p->length == n->blob_offset &&
2756 n->logical_offset < shard_end) {
2757 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2758 << " next shard 0x" << shard_end << std::dec
2759 << " merging " << *p << " and " << *n << dendl;
2760 p->length += n->length;
2761 rm(n++);
2762 ++removed;
2763 }
2764 if (n == extent_map.end()) {
2765 break;
2766 }
2767 if (n->logical_offset >= shard_end) {
2768 assert(pshard != shards.end());
2769 ++pshard;
2770 if (pshard != shards.end()) {
2771 shard_end = pshard->shard_info->offset;
2772 } else {
2773 shard_end = OBJECT_MAX_SIZE;
2774 }
2775 }
2776 }
2777 if (removed && onode) {
2778 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2779 }
2780 return removed;
2781 }
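// Merge sketch (illustrative offsets): lextents 0x0~0x1000 and 0x1000~0x1000
// referencing the same blob at blob offsets 0x0 and 0x1000 collapse into a
// single 0x0~0x2000 extent, but only while both sit in the same shard; a
// merge is never allowed to cross shard_end.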
2782
2783 void BlueStore::ExtentMap::punch_hole(
2784 CollectionRef &c,
2785 uint64_t offset,
2786 uint64_t length,
2787 old_extent_map_t *old_extents)
2788 {
2789 auto p = seek_lextent(offset);
2790 uint64_t end = offset + length;
2791 while (p != extent_map.end()) {
2792 if (p->logical_offset >= end) {
2793 break;
2794 }
2795 if (p->logical_offset < offset) {
2796 if (p->logical_end() > end) {
2797 // split and deref middle
2798 uint64_t front = offset - p->logical_offset;
2799 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
2800 length, p->blob);
2801 old_extents->push_back(*oe);
2802 add(end,
2803 p->blob_offset + front + length,
2804 p->length - front - length,
2805 p->blob);
2806 p->length = front;
2807 break;
2808 } else {
2809 // deref tail
2810 assert(p->logical_end() > offset); // else seek_lextent bug
2811 uint64_t keep = offset - p->logical_offset;
2812 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
2813 p->length - keep, p->blob);
2814 old_extents->push_back(*oe);
2815 p->length = keep;
2816 ++p;
2817 continue;
2818 }
2819 }
2820 if (p->logical_offset + p->length <= end) {
2821 // deref whole lextent
2822 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2823 p->length, p->blob);
2824 old_extents->push_back(*oe);
2825 rm(p++);
2826 continue;
2827 }
2828 // deref head
2829 uint64_t keep = p->logical_end() - end;
2830 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2831 p->length - keep, p->blob);
2832 old_extents->push_back(*oe);
2833
2834 add(end, p->blob_offset + p->length - keep, keep, p->blob);
2835 rm(p);
2836 break;
2837 }
2838 }
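// punch_hole cases, with an existing lextent 0x0~0x4000 (illustrative):
//  - punch 0x1000~0x1000: split into a 0x0~0x1000 head and a 0x2000~0x2000
//    tail, the middle is dereferenced;
//  - punch 0x3000~0x2000: keep only the 0x0~0x3000 head, the tail is
//    dereferenced;
//  - punch 0x0~0x1000: the head is dereferenced and the remainder is
//    re-added at 0x1000~0x3000;
//  - punch 0x0~0x4000 (or larger): the whole lextent is removed.
// Every dereferenced piece is queued on old_extents for later cleanup.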
2839
2840 BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
2841 CollectionRef &c,
2842 uint64_t logical_offset,
2843 uint64_t blob_offset, uint64_t length, BlobRef b,
2844 old_extent_map_t *old_extents)
2845 {
2846 // We need a completely initialized Blob to increment its ref counters.
2847 assert(b->get_blob().get_logical_length() != 0);
2848
2849 // Do get_ref prior to punch_hole to prevent putting a reused blob into the
2850 // old_extents list if we overwrite the blob completely.
2851 // This might happen during WAL overwrite.
2852 b->get_ref(onode->c, blob_offset, length);
2853
2854 if (old_extents) {
2855 punch_hole(c, logical_offset, length, old_extents);
2856 }
2857
2858 Extent *le = new Extent(logical_offset, blob_offset, length, b);
2859 extent_map.insert(*le);
2860 if (spans_shard(logical_offset, length)) {
2861 request_reshard(logical_offset, logical_offset + length);
2862 }
2863 return le;
2864 }
2865
2866 BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
2867 BlobRef lb,
2868 uint32_t blob_offset,
2869 uint32_t pos)
2870 {
2871 auto cct = onode->c->store->cct; //used by dout
2872
2873 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
2874 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
2875 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
2876 << dendl;
2877 BlobRef rb = onode->c->new_blob();
2878 lb->split(onode->c, blob_offset, rb.get());
2879
2880 for (auto ep = seek_lextent(pos);
2881 ep != extent_map.end() && ep->logical_offset < end_pos;
2882 ++ep) {
2883 if (ep->blob != lb) {
2884 continue;
2885 }
2886 if (ep->logical_offset < pos) {
2887 // split extent
2888 size_t left = pos - ep->logical_offset;
2889 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
2890 extent_map.insert(*ne);
2891 ep->length = left;
2892 dout(30) << __func__ << " split " << *ep << dendl;
2893 dout(30) << __func__ << " to " << *ne << dendl;
2894 } else {
2895 // switch blob
2896 assert(ep->blob_offset >= blob_offset);
2897
2898 ep->blob = rb;
2899 ep->blob_offset -= blob_offset;
2900 dout(30) << __func__ << " adjusted " << *ep << dendl;
2901 }
2902 }
2903 return rb;
2904 }
2905
2906 // Onode
2907
2908 #undef dout_prefix
2909 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
2910
2911 void BlueStore::Onode::flush()
2912 {
2913 if (flushing_count.load()) {
2914 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
2915 std::unique_lock<std::mutex> l(flush_lock);
2916 while (flushing_count.load()) {
2917 flush_cond.wait(l);
2918 }
2919 }
2920 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
2921 }
2922
2923 // =======================================================
2924 // WriteContext
2925
2926 /// Checks for writes to the same pextent within a blob
2927 bool BlueStore::WriteContext::has_conflict(
2928 BlobRef b,
2929 uint64_t loffs,
2930 uint64_t loffs_end,
2931 uint64_t min_alloc_size)
2932 {
2933 assert((loffs % min_alloc_size) == 0);
2934 assert((loffs_end % min_alloc_size) == 0);
2935 for (auto w : writes) {
2936 if (b == w.b) {
2937 auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
2938 auto loffs2_end = P2ROUNDUP(w.logical_offset + w.length0, min_alloc_size);
2939 if ((loffs <= loffs2 && loffs_end > loffs2) ||
2940 (loffs >= loffs2 && loffs < loffs2_end)) {
2941 return true;
2942 }
2943 }
2944 }
2945 return false;
2946 }
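// Illustrative check with min_alloc_size = 0x1000: a queued write to the
// same blob at logical 0x1800~0x100 is rounded out to [0x1000, 0x2000); a
// new write covering [0x0, 0x2000) therefore conflicts, while one covering
// [0x2000, 0x3000) does not.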
2947
2948 // =======================================================
2949
2950 // DeferredBatch
2951 #undef dout_prefix
2952 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
2953
2954 void BlueStore::DeferredBatch::prepare_write(
2955 CephContext *cct,
2956 uint64_t seq, uint64_t offset, uint64_t length,
2957 bufferlist::const_iterator& blp)
2958 {
2959 _discard(cct, offset, length);
2960 auto i = iomap.insert(make_pair(offset, deferred_io()));
2961 assert(i.second); // this should be a new insertion
2962 i.first->second.seq = seq;
2963 blp.copy(length, i.first->second.bl);
2964 i.first->second.bl.reassign_to_mempool(
2965 mempool::mempool_bluestore_writing_deferred);
2966 dout(20) << __func__ << " seq " << seq
2967 << " 0x" << std::hex << offset << "~" << length
2968 << " crc " << i.first->second.bl.crc32c(-1)
2969 << std::dec << dendl;
2970 seq_bytes[seq] += length;
2971 #ifdef DEBUG_DEFERRED
2972 _audit(cct);
2973 #endif
2974 }
2975
2976 void BlueStore::DeferredBatch::_discard(
2977 CephContext *cct, uint64_t offset, uint64_t length)
2978 {
2979 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2980 << std::dec << dendl;
2981 auto p = iomap.lower_bound(offset);
2982 if (p != iomap.begin()) {
2983 --p;
2984 auto end = p->first + p->second.bl.length();
2985 if (end > offset) {
2986 bufferlist head;
2987 head.substr_of(p->second.bl, 0, offset - p->first);
2988 dout(20) << __func__ << " keep head " << p->second.seq
2989 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
2990 << " -> 0x" << head.length() << std::dec << dendl;
2991 auto i = seq_bytes.find(p->second.seq);
2992 assert(i != seq_bytes.end());
2993 if (end > offset + length) {
2994 bufferlist tail;
2995 tail.substr_of(p->second.bl, offset + length - p->first,
2996 end - (offset + length));
2997 dout(20) << __func__ << " keep tail " << p->second.seq
2998 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
2999 << " -> 0x" << tail.length() << std::dec << dendl;
3000 auto &n = iomap[offset + length];
3001 n.bl.swap(tail);
3002 n.seq = p->second.seq;
3003 i->second -= length;
3004 } else {
3005 i->second -= end - offset;
3006 }
3007 assert(i->second >= 0);
3008 p->second.bl.swap(head);
3009 }
3010 ++p;
3011 }
3012 while (p != iomap.end()) {
3013 if (p->first >= offset + length) {
3014 break;
3015 }
3016 auto i = seq_bytes.find(p->second.seq);
3017 assert(i != seq_bytes.end());
3018 auto end = p->first + p->second.bl.length();
3019 if (end > offset + length) {
3020 unsigned drop_front = offset + length - p->first;
3021 unsigned keep_tail = end - (offset + length);
3022 dout(20) << __func__ << " truncate front " << p->second.seq
3023 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3024 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3025 << " to 0x" << (offset + length) << "~" << keep_tail
3026 << std::dec << dendl;
3027 auto &s = iomap[offset + length];
3028 s.seq = p->second.seq;
3029 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3030 i->second -= drop_front;
3031 } else {
3032 dout(20) << __func__ << " drop " << p->second.seq
3033 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3034 << std::dec << dendl;
3035 i->second -= p->second.bl.length();
3036 }
3037 assert(i->second >= 0);
3038 p = iomap.erase(p);
3039 }
3040 }
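// Illustrative trim: a queued io at offset 0x0 of length 0x3000, partially
// overwritten by a new write at 0x1000~0x1000, is cut into a 0x0~0x1000 head
// (kept under the old key) and a 0x2000~0x1000 tail (re-inserted at key
// 0x2000); seq_bytes for its sequence is reduced by the 0x1000 bytes that
// were dropped.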
3041
3042 void BlueStore::DeferredBatch::_audit(CephContext *cct)
3043 {
3044 map<uint64_t,int> sb;
3045 for (auto p : seq_bytes) {
3046 sb[p.first] = 0; // make sure we have the same set of keys
3047 }
3048 uint64_t pos = 0;
3049 for (auto& p : iomap) {
3050 assert(p.first >= pos);
3051 sb[p.second.seq] += p.second.bl.length();
3052 pos = p.first + p.second.bl.length();
3053 }
3054 assert(sb == seq_bytes);
3055 }
3056
3057
3058 // Collection
3059
3060 #undef dout_prefix
3061 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3062
3063 BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
3064 : store(ns),
3065 cache(c),
3066 cid(cid),
3067 lock("BlueStore::Collection::lock", true, false),
3068 exists(true),
3069 onode_map(c)
3070 {
3071 }
3072
3073 void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3074 {
3075 assert(!b->shared_blob);
3076 const bluestore_blob_t& blob = b->get_blob();
3077 if (!blob.is_shared()) {
3078 b->shared_blob = new SharedBlob(this);
3079 return;
3080 }
3081
3082 b->shared_blob = shared_blob_set.lookup(sbid);
3083 if (b->shared_blob) {
3084 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3085 << std::dec << " had " << *b->shared_blob << dendl;
3086 } else {
3087 b->shared_blob = new SharedBlob(sbid, this);
3088 shared_blob_set.add(this, b->shared_blob.get());
3089 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3090 << std::dec << " opened " << *b->shared_blob
3091 << dendl;
3092 }
3093 }
3094
3095 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3096 {
3097 if (!sb->is_loaded()) {
3098
3099 bufferlist v;
3100 string key;
3101 auto sbid = sb->get_sbid();
3102 get_shared_blob_key(sbid, &key);
3103 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3104 if (r < 0) {
3105 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3106 << std::dec << " not found at key "
3107 << pretty_binary_string(key) << dendl;
3108 assert(0 == "uh oh, missing shared_blob");
3109 }
3110
3111 sb->loaded = true;
3112 sb->persistent = new bluestore_shared_blob_t(sbid);
3113 bufferlist::iterator p = v.begin();
3114 ::decode(*(sb->persistent), p);
3115 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3116 << std::dec << " loaded shared_blob " << *sb << dendl;
3117 }
3118 }
3119
3120 void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3121 {
3122 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
3123 assert(!b->shared_blob->is_loaded());
3124
3125 // update blob
3126 bluestore_blob_t& blob = b->dirty_blob();
3127 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
3128
3129 // update shared blob
3130 b->shared_blob->loaded = true;
3131 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3132 shared_blob_set.add(this, b->shared_blob.get());
3133 for (auto p : blob.get_extents()) {
3134 if (p.is_valid()) {
3135 b->shared_blob->get_ref(
3136 p.offset,
3137 p.length);
3138 }
3139 }
3140 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3141 }
3142
3143 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3144 {
3145 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3146 assert(sb->is_loaded());
3147
3148 uint64_t sbid = sb->get_sbid();
3149 shared_blob_set.remove(sb);
3150 sb->loaded = false;
3151 delete sb->persistent;
3152 sb->sbid_unloaded = 0;
3153 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3154 return sbid;
3155 }
3156
3157 BlueStore::OnodeRef BlueStore::Collection::get_onode(
3158 const ghobject_t& oid,
3159 bool create)
3160 {
3161 assert(create ? lock.is_wlocked() : lock.is_locked());
3162
3163 spg_t pgid;
3164 if (cid.is_pg(&pgid)) {
3165 if (!oid.match(cnode.bits, pgid.ps())) {
3166 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3167 << pgid << " bits " << cnode.bits << dendl;
3168 ceph_abort();
3169 }
3170 }
3171
3172 OnodeRef o = onode_map.lookup(oid);
3173 if (o)
3174 return o;
3175
3176 mempool::bluestore_cache_other::string key;
3177 get_object_key(store->cct, oid, &key);
3178
3179 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3180 << pretty_binary_string(key) << dendl;
3181
3182 bufferlist v;
3183 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3184 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3185 Onode *on;
3186 if (v.length() == 0) {
3187 assert(r == -ENOENT);
3188 if (!store->cct->_conf->bluestore_debug_misc &&
3189 !create)
3190 return OnodeRef();
3191
3192 // new object, new onode
3193 on = new Onode(this, oid, key);
3194 } else {
3195 // loaded
3196 assert(r >= 0);
3197 on = new Onode(this, oid, key);
3198 on->exists = true;
3199 bufferptr::iterator p = v.front().begin_deep();
3200 on->onode.decode(p);
3201 for (auto& i : on->onode.attrs) {
3202 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3203 }
3204
3205 // initialize extent_map
3206 on->extent_map.decode_spanning_blobs(p);
3207 if (on->onode.extent_map_shards.empty()) {
3208 denc(on->extent_map.inline_bl, p);
3209 on->extent_map.decode_some(on->extent_map.inline_bl);
3210 on->extent_map.inline_bl.reassign_to_mempool(
3211 mempool::mempool_bluestore_cache_other);
3212 } else {
3213 on->extent_map.init_shards(false, false);
3214 }
3215 }
3216 o.reset(on);
3217 return onode_map.add(oid, o);
3218 }
3219
3220 void BlueStore::Collection::split_cache(
3221 Collection *dest)
3222 {
3223 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3224
3225 // lock (one or both) cache shards
3226 std::lock(cache->lock, dest->cache->lock);
3227 std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
3228 std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);
3229
3230 int destbits = dest->cnode.bits;
3231 spg_t destpg;
3232 bool is_pg = dest->cid.is_pg(&destpg);
3233 assert(is_pg);
3234
3235 auto p = onode_map.onode_map.begin();
3236 while (p != onode_map.onode_map.end()) {
3237 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3238 // onode does not belong to this child
3239 ++p;
3240 } else {
3241 OnodeRef o = p->second;
3242 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3243 << dendl;
3244
3245 cache->_rm_onode(p->second);
3246 p = onode_map.onode_map.erase(p);
3247
3248 o->c = dest;
3249 dest->cache->_add_onode(o, 1);
3250 dest->onode_map.onode_map[o->oid] = o;
3251 dest->onode_map.cache = dest->cache;
3252
3253 // move over shared blobs and buffers. cover shared blobs from
3254 // both extent map and spanning blob map (the full extent map
3255 // may not be faulted in)
3256 vector<SharedBlob*> sbvec;
3257 for (auto& e : o->extent_map.extent_map) {
3258 sbvec.push_back(e.blob->shared_blob.get());
3259 }
3260 for (auto& b : o->extent_map.spanning_blob_map) {
3261 sbvec.push_back(b.second->shared_blob.get());
3262 }
3263 for (auto sb : sbvec) {
3264 if (sb->coll == dest) {
3265 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3266 << dendl;
3267 continue;
3268 }
3269 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
3270 if (sb->get_sbid()) {
3271 ldout(store->cct, 20) << __func__
3272 << " moving registration " << *sb << dendl;
3273 shared_blob_set.remove(sb);
3274 dest->shared_blob_set.add(dest, sb);
3275 }
3276 sb->coll = dest;
3277 if (dest->cache != cache) {
3278 for (auto& i : sb->bc.buffer_map) {
3279 if (!i.second->is_writing()) {
3280 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3281 << dendl;
3282 dest->cache->_move_buffer(cache, i.second.get());
3283 }
3284 }
3285 }
3286 }
3287 }
3288 }
3289 }
3290
3291 // =======================================================
3292
3293 // MempoolThread
3294
3295 #undef dout_prefix
3296 #define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
3297
3298 void *BlueStore::MempoolThread::entry()
3299 {
3300 Mutex::Locker l(lock);
3301
3302 std::list<PriorityCache::PriCache *> caches;
3303 caches.push_back(store->db);
3304 caches.push_back(&meta_cache);
3305 caches.push_back(&data_cache);
3306 autotune_cache_size = store->osd_memory_cache_min;
3307
3308 utime_t next_balance = ceph_clock_now();
3309 utime_t next_resize = ceph_clock_now();
3310
3311 bool interval_stats_trim = false;
3312 bool interval_stats_resize = false;
3313 while (!stop) {
3314 _adjust_cache_settings();
3315
3316 // Before we trim, check and see if it's time to rebalance/resize.
3317 double autotune_interval = store->cache_autotune_interval;
3318 double resize_interval = store->osd_memory_cache_resize_interval;
3319
3320 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
3321 // Log events at 5 instead of 20 when balance happens.
3322 interval_stats_resize = true;
3323 interval_stats_trim = true;
3324 if (store->cache_autotune) {
3325 _balance_cache(caches);
3326 }
3327
3328 next_balance = ceph_clock_now();
3329 next_balance += autotune_interval;
3330 }
3331 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
3332 if (ceph_using_tcmalloc() && store->cache_autotune) {
3333 _tune_cache_size(interval_stats_resize);
3334 interval_stats_resize = false;
3335 }
3336 next_resize = ceph_clock_now();
3337 next_resize += resize_interval;
3338 }
3339
3340 // Now Trim
3341 _trim_shards(interval_stats_trim);
3342 interval_stats_trim = false;
3343
3344 store->_update_cache_logger();
3345 utime_t wait;
3346 wait += store->cct->_conf->bluestore_cache_trim_interval;
3347 cond.WaitInterval(lock, wait);
3348 }
3349 stop = false;
3350 return NULL;
3351 }
3352
3353 void BlueStore::MempoolThread::_adjust_cache_settings()
3354 {
3355 store->db->set_cache_ratio(store->cache_kv_ratio);
3356 meta_cache.set_cache_ratio(store->cache_meta_ratio);
3357 data_cache.set_cache_ratio(store->cache_data_ratio);
3358 }
3359
3360 void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
3361 {
3362 auto cct = store->cct;
3363 size_t num_shards = store->cache_shards.size();
3364
3365 int64_t kv_used = store->db->get_cache_usage();
3366 int64_t meta_used = meta_cache._get_used_bytes();
3367 int64_t data_used = data_cache._get_used_bytes();
3368
3369 uint64_t cache_size = store->cache_size;
3370 int64_t kv_alloc =
3371 static_cast<int64_t>(store->db->get_cache_ratio() * cache_size);
3372 int64_t meta_alloc =
3373 static_cast<int64_t>(meta_cache.get_cache_ratio() * cache_size);
3374 int64_t data_alloc =
3375 static_cast<int64_t>(data_cache.get_cache_ratio() * cache_size);
3376
3377 if (store->cache_autotune) {
3378 cache_size = autotune_cache_size;
3379
3380 kv_alloc = store->db->get_cache_bytes();
3381 meta_alloc = meta_cache.get_cache_bytes();
3382 data_alloc = data_cache.get_cache_bytes();
3383 }
3384
3385 if (interval_stats) {
3386 ldout(cct, 5) << __func__ << " cache_size: " << cache_size
3387 << " kv_alloc: " << kv_alloc
3388 << " kv_used: " << kv_used
3389 << " meta_alloc: " << meta_alloc
3390 << " meta_used: " << meta_used
3391 << " data_alloc: " << data_alloc
3392 << " data_used: " << data_used << dendl;
3393 } else {
3394 ldout(cct, 20) << __func__ << " cache_size: " << cache_size
3395 << " kv_alloc: " << kv_alloc
3396 << " kv_used: " << kv_used
3397 << " meta_alloc: " << meta_alloc
3398 << " meta_used: " << meta_used
3399 << " data_alloc: " << data_alloc
3400 << " data_used: " << data_used << dendl;
3401 }
3402
3403 uint64_t max_shard_onodes = static_cast<uint64_t>(
3404 (meta_alloc / (double) num_shards) / meta_cache.get_bytes_per_onode());
3405 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);
3406
3407 ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
3408 << " max_shard_buffer: " << max_shard_buffer << dendl;
3409
3410 for (auto i : store->cache_shards) {
3411 i->trim(max_shard_onodes, max_shard_buffer);
3412 }
3413 }
3414
3415 void BlueStore::MempoolThread::_tune_cache_size(bool interval_stats)
3416 {
3417 auto cct = store->cct;
3418 uint64_t target = store->osd_memory_target;
3419 uint64_t base = store->osd_memory_base;
3420 double fragmentation = store->osd_memory_expected_fragmentation;
3421 uint64_t cache_max = ((1.0 - fragmentation) * target) - base;
3422 uint64_t cache_min = store->osd_memory_cache_min;
3423
3424 size_t heap_size = 0;
3425 size_t unmapped = 0;
3426 uint64_t mapped = 0;
3427
3428 ceph_heap_release_free_memory();
3429 ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
3430 ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
3431 mapped = heap_size - unmapped;
3432
3433 uint64_t new_size = autotune_cache_size;
3434 new_size = (new_size < cache_max) ? new_size : cache_max;
3435 new_size = (new_size > cache_min) ? new_size : cache_min;
3436
3437 // Approach the min/max slowly, but bounce away quickly.
3438 if ((uint64_t) mapped < target) {
3439 double ratio = 1 - ((double) mapped / target);
3440 new_size += ratio * (cache_max - new_size);
3441 } else {
3442 double ratio = 1 - ((double) target / mapped);
3443 new_size -= ratio * (new_size - cache_min);
3444 }
3445
3446 if (interval_stats) {
3447 ldout(cct, 5) << __func__
3448 << " target: " << target
3449 << " heap: " << heap_size
3450 << " unmapped: " << unmapped
3451 << " mapped: " << mapped
3452 << " old cache_size: " << autotune_cache_size
3453 << " new cache size: " << new_size << dendl;
3454 } else {
3455 ldout(cct, 20) << __func__
3456 << " target: " << target
3457 << " heap: " << heap_size
3458 << " unmapped: " << unmapped
3459 << " mapped: " << mapped
3460 << " old cache_size: " << autotune_cache_size
3461 << " new cache size: " << new_size << dendl;
3462 }
3463 autotune_cache_size = new_size;
3464 }
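// Worked example (illustrative numbers): with osd_memory_target = 4 GiB and
// mapped = 3 GiB, ratio = 1 - 3/4 = 0.25, so the autotuned size grows by a
// quarter of the remaining headroom towards cache_max; with mapped = 5 GiB,
// ratio = 1 - 4/5 = 0.2 and it shrinks by a fifth of the distance down to
// cache_min.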
3465
3466 void BlueStore::MempoolThread::_balance_cache(
3467 const std::list<PriorityCache::PriCache *>& caches)
3468 {
3469 int64_t mem_avail = autotune_cache_size;
3470
3471 // Assign memory for each priority level
3472 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
3473 ldout(store->cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;
3474 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
3475 _balance_cache_pri(&mem_avail, caches, pri);
3476 }
3477 // Assign any leftover memory based on the default ratios.
3478 if (mem_avail > 0) {
3479 for (auto it = caches.begin(); it != caches.end(); it++) {
3480 int64_t fair_share =
3481 static_cast<int64_t>((*it)->get_cache_ratio() * mem_avail);
3482 if (fair_share > 0) {
3483 (*it)->add_cache_bytes(PriorityCache::Priority::LAST, fair_share);
3484 }
3485 }
3486 }
3487 // assert if we assigned more memory than is available.
3488 assert(mem_avail >= 0);
3489
3490 // Finally commit the new cache sizes
3491 for (auto it = caches.begin(); it != caches.end(); it++) {
3492 (*it)->commit_cache_size();
3493 }
3494 }
3495
3496 void BlueStore::MempoolThread::_balance_cache_pri(int64_t *mem_avail,
3497 const std::list<PriorityCache::PriCache *>& caches, PriorityCache::Priority pri)
3498 {
3499 std::list<PriorityCache::PriCache *> tmp_caches = caches;
3500 double cur_ratios = 0;
3501 double new_ratios = 0;
3502
3503 // Zero this priority's bytes, sum the initial ratios.
3504 for (auto it = tmp_caches.begin(); it != tmp_caches.end(); it++) {
3505 (*it)->set_cache_bytes(pri, 0);
3506 cur_ratios += (*it)->get_cache_ratio();
3507 }
3508
3509 // For this priority, loop until caches are satisfied or we run out of memory.
3510 // Since we can't allocate fractional bytes, stop if we have fewer bytes left
3511 // than the number of participating caches.
3512 while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
3513 uint64_t total_assigned = 0;
3514
3515 for (auto it = tmp_caches.begin(); it != tmp_caches.end(); ) {
3516 int64_t cache_wants = (*it)->request_cache_bytes(pri, store->cache_autotune_chunk_size);
3517
3518 // Usually the ratio should be set to the fraction of the current caches'
3519 // assigned ratio compared to the total ratio of all caches that still
3520 // want memory. There is a special case where the only caches left are
3521 // all assigned 0% ratios but still want memory. In that case, give
3522 // them an equal shot at the remaining memory for this priority.
3523 double ratio = 1.0 / tmp_caches.size();
3524 if (cur_ratios > 0) {
3525 ratio = (*it)->get_cache_ratio() / cur_ratios;
3526 }
3527 int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
3528
3529 if (cache_wants > fair_share) {
3530 // If we want too much, take what we can get but stick around for more
3531 (*it)->add_cache_bytes(pri, fair_share);
3532 total_assigned += fair_share;
3533
3534 new_ratios += (*it)->get_cache_ratio();
3535 ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
3536 << " wanted: " << cache_wants << " fair_share: " << fair_share
3537 << " mem_avail: " << *mem_avail
3538 << " staying in list. Size: " << tmp_caches.size()
3539 << dendl;
3540 ++it;
3541 } else {
3542 // Otherwise assign only what we want
3543 if (cache_wants > 0) {
3544 (*it)->add_cache_bytes(pri, cache_wants);
3545 total_assigned += cache_wants;
3546
3547 ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
3548 << " wanted: " << cache_wants << " fair_share: " << fair_share
3549 << " mem_avail: " << *mem_avail
3550 << " removing from list. New size: " << tmp_caches.size() - 1
3551 << dendl;
3552
3553 }
3554 // Either the cache didn't want anything or got what it wanted, so remove it from the tmp list.
3555 it = tmp_caches.erase(it);
3556 }
3557 }
3558 // Reset the ratios
3559 *mem_avail -= total_assigned;
3560 cur_ratios = new_ratios;
3561 new_ratios = 0;
3562 }
3563 }
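// Illustrative round: with 100 MiB available and two caches at ratios
// 0.6 / 0.4, the first is offered a 60 MiB fair share and the second 40 MiB.
// A cache wanting less than its share takes only what it wants and drops out
// of tmp_caches; the leftover is redistributed among the remaining caches,
// with their ratios re-normalized, on the next pass of the while loop.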
3564
3565 // =======================================================
3566
3567 // OmapIteratorImpl
3568
3569 #undef dout_prefix
3570 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3571
3572 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3573 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3574 : c(c), o(o), it(it)
3575 {
3576 RWLock::RLocker l(c->lock);
3577 if (o->onode.has_omap()) {
3578 get_omap_key(o->onode.nid, string(), &head);
3579 get_omap_tail(o->onode.nid, &tail);
3580 it->lower_bound(head);
3581 }
3582 }
3583
3584 int BlueStore::OmapIteratorImpl::seek_to_first()
3585 {
3586 RWLock::RLocker l(c->lock);
3587 if (o->onode.has_omap()) {
3588 it->lower_bound(head);
3589 } else {
3590 it = KeyValueDB::Iterator();
3591 }
3592 return 0;
3593 }
3594
3595 int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3596 {
3597 RWLock::RLocker l(c->lock);
3598 if (o->onode.has_omap()) {
3599 string key;
3600 get_omap_key(o->onode.nid, after, &key);
3601 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3602 << pretty_binary_string(key) << dendl;
3603 it->upper_bound(key);
3604 } else {
3605 it = KeyValueDB::Iterator();
3606 }
3607 return 0;
3608 }
3609
3610 int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3611 {
3612 RWLock::RLocker l(c->lock);
3613 if (o->onode.has_omap()) {
3614 string key;
3615 get_omap_key(o->onode.nid, to, &key);
3616 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3617 << pretty_binary_string(key) << dendl;
3618 it->lower_bound(key);
3619 } else {
3620 it = KeyValueDB::Iterator();
3621 }
3622 return 0;
3623 }
3624
3625 bool BlueStore::OmapIteratorImpl::valid()
3626 {
3627 RWLock::RLocker l(c->lock);
3628 bool r = o->onode.has_omap() && it && it->valid() &&
3629 it->raw_key().second <= tail;
3630 if (it && it->valid()) {
3631 ldout(c->store->cct,20) << __func__ << " is at "
3632 << pretty_binary_string(it->raw_key().second)
3633 << dendl;
3634 }
3635 return r;
3636 }
3637
3638 int BlueStore::OmapIteratorImpl::next(bool validate)
3639 {
3640 RWLock::RLocker l(c->lock);
3641 if (o->onode.has_omap()) {
3642 it->next();
3643 return 0;
3644 } else {
3645 return -1;
3646 }
3647 }
3648
3649 string BlueStore::OmapIteratorImpl::key()
3650 {
3651 RWLock::RLocker l(c->lock);
3652 assert(it->valid());
3653 string db_key = it->raw_key().second;
3654 string user_key;
3655 decode_omap_key(db_key, &user_key);
3656 return user_key;
3657 }
3658
3659 bufferlist BlueStore::OmapIteratorImpl::value()
3660 {
3661 RWLock::RLocker l(c->lock);
3662 assert(it->valid());
3663 return it->value();
3664 }
3665
3666
3667 // =====================================
3668
3669 #undef dout_prefix
3670 #define dout_prefix *_dout << "bluestore(" << path << ") "
3671
3672
3673 static void aio_cb(void *priv, void *priv2)
3674 {
3675 BlueStore *store = static_cast<BlueStore*>(priv);
3676 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3677 c->aio_finish(store);
3678 }
3679
3680 BlueStore::BlueStore(CephContext *cct, const string& path)
3681 : ObjectStore(cct, path),
3682 throttle_bytes(cct, "bluestore_throttle_bytes",
3683 cct->_conf->bluestore_throttle_bytes),
3684 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3685 cct->_conf->bluestore_throttle_bytes +
3686 cct->_conf->bluestore_throttle_deferred_bytes),
3687 deferred_finisher(cct, "defered_finisher", "dfin"),
3688 kv_sync_thread(this),
3689 kv_finalize_thread(this),
3690 mempool_thread(this)
3691 {
3692 _init_logger();
3693 cct->_conf->add_observer(this);
3694 set_cache_shards(1);
3695 }
3696
3697 BlueStore::BlueStore(CephContext *cct,
3698 const string& path,
3699 uint64_t _min_alloc_size)
3700 : ObjectStore(cct, path),
3701 throttle_bytes(cct, "bluestore_throttle_bytes",
3702 cct->_conf->bluestore_throttle_bytes),
3703 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3704 cct->_conf->bluestore_throttle_bytes +
3705 cct->_conf->bluestore_throttle_deferred_bytes),
3706 deferred_finisher(cct, "defered_finisher", "dfin"),
3707 kv_sync_thread(this),
3708 kv_finalize_thread(this),
3709 min_alloc_size(_min_alloc_size),
3710 min_alloc_size_order(ctz(_min_alloc_size)),
3711 mempool_thread(this)
3712 {
3713 _init_logger();
3714 cct->_conf->add_observer(this);
3715 set_cache_shards(1);
3716 }
3717
3718 BlueStore::~BlueStore()
3719 {
3720 for (auto f : finishers) {
3721 delete f;
3722 }
3723 finishers.clear();
3724
3725 cct->_conf->remove_observer(this);
3726 _shutdown_logger();
3727 assert(!mounted);
3728 assert(db == NULL);
3729 assert(bluefs == NULL);
3730 assert(fsid_fd < 0);
3731 assert(path_fd < 0);
3732 for (auto i : cache_shards) {
3733 delete i;
3734 }
3735 cache_shards.clear();
3736 }
3737
3738 const char **BlueStore::get_tracked_conf_keys() const
3739 {
3740 static const char* KEYS[] = {
3741 "bluestore_csum_type",
3742 "bluestore_compression_mode",
3743 "bluestore_compression_algorithm",
3744 "bluestore_compression_min_blob_size",
3745 "bluestore_compression_min_blob_size_ssd",
3746 "bluestore_compression_min_blob_size_hdd",
3747 "bluestore_compression_max_blob_size",
3748 "bluestore_compression_max_blob_size_ssd",
3749 "bluestore_compression_max_blob_size_hdd",
3750 "bluestore_compression_required_ratio",
3751 "bluestore_max_alloc_size",
3752 "bluestore_prefer_deferred_size",
3753 "bluestore_prefer_deferred_size_hdd",
3754 "bluestore_prefer_deferred_size_ssd",
3755 "bluestore_deferred_batch_ops",
3756 "bluestore_deferred_batch_ops_hdd",
3757 "bluestore_deferred_batch_ops_ssd",
3758 "bluestore_throttle_bytes",
3759 "bluestore_throttle_deferred_bytes",
3760 "bluestore_throttle_cost_per_io_hdd",
3761 "bluestore_throttle_cost_per_io_ssd",
3762 "bluestore_throttle_cost_per_io",
3763 "bluestore_max_blob_size",
3764 "bluestore_max_blob_size_ssd",
3765 "bluestore_max_blob_size_hdd",
3766 NULL
3767 };
3768 return KEYS;
3769 }
3770
3771 void BlueStore::handle_conf_change(const struct md_config_t *conf,
3772 const std::set<std::string> &changed)
3773 {
3774 if (changed.count("bluestore_csum_type")) {
3775 _set_csum();
3776 }
3777 if (changed.count("bluestore_compression_mode") ||
3778 changed.count("bluestore_compression_algorithm") ||
3779 changed.count("bluestore_compression_min_blob_size") ||
3780 changed.count("bluestore_compression_max_blob_size")) {
3781 if (bdev) {
3782 _set_compression();
3783 }
3784 }
3785 if (changed.count("bluestore_max_blob_size") ||
3786 changed.count("bluestore_max_blob_size_ssd") ||
3787 changed.count("bluestore_max_blob_size_hdd")) {
3788 if (bdev) {
3789 // only after startup
3790 _set_blob_size();
3791 }
3792 }
3793 if (changed.count("bluestore_prefer_deferred_size") ||
3794 changed.count("bluestore_prefer_deferred_size_hdd") ||
3795 changed.count("bluestore_prefer_deferred_size_ssd") ||
3796 changed.count("bluestore_max_alloc_size") ||
3797 changed.count("bluestore_deferred_batch_ops") ||
3798 changed.count("bluestore_deferred_batch_ops_hdd") ||
3799 changed.count("bluestore_deferred_batch_ops_ssd")) {
3800 if (bdev) {
3801 // only after startup
3802 _set_alloc_sizes();
3803 }
3804 }
3805 if (changed.count("bluestore_throttle_cost_per_io") ||
3806 changed.count("bluestore_throttle_cost_per_io_hdd") ||
3807 changed.count("bluestore_throttle_cost_per_io_ssd")) {
3808 if (bdev) {
3809 _set_throttle_params();
3810 }
3811 }
3812 if (changed.count("bluestore_throttle_bytes")) {
3813 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
3814 throttle_deferred_bytes.reset_max(
3815 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3816 }
3817 if (changed.count("bluestore_throttle_deferred_bytes")) {
3818 throttle_deferred_bytes.reset_max(
3819 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3820 }
3821 }
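
// The keys listed in get_tracked_conf_keys() can be changed at runtime and are
// applied through the observer above, e.g. (illustrative command):
//
//   ceph daemon osd.0 config set bluestore_csum_type crc32c
//
// Options guarded by the `if (bdev)` checks (blob sizes, alloc sizes, throttle
// cost per io) only take effect once the device is open, while the throttle
// byte limits are re-applied immediately via reset_max().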
3822
3823 void BlueStore::_set_compression()
3824 {
3825 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
3826 if (m) {
3827 comp_mode = *m;
3828 } else {
3829 derr << __func__ << " unrecognized value '"
3830 << cct->_conf->bluestore_compression_mode
3831 << "' for bluestore_compression_mode, reverting to 'none'"
3832 << dendl;
3833 comp_mode = Compressor::COMP_NONE;
3834 }
3835
3836 compressor = nullptr;
3837
3838 if (comp_mode == Compressor::COMP_NONE) {
3839 dout(10) << __func__ << " compression mode set to 'none', "
3840 << "ignoring other compression settings" << dendl;
3841 return;
3842 }
3843
3844 if (cct->_conf->bluestore_compression_min_blob_size) {
3845 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
3846 } else {
3847 assert(bdev);
3848 if (bdev->is_rotational()) {
3849 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
3850 } else {
3851 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
3852 }
3853 }
3854
3855 if (cct->_conf->bluestore_compression_max_blob_size) {
3856 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
3857 } else {
3858 assert(bdev);
3859 if (bdev->is_rotational()) {
3860 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
3861 } else {
3862 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
3863 }
3864 }
3865
3866 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
3867 if (!alg_name.empty()) {
3868 compressor = Compressor::create(cct, alg_name);
3869 if (!compressor) {
3870 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
3871 << dendl;
3872 }
3873 }
3874
3875 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
3876 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
3877 << dendl;
3878 }
3879
3880 void BlueStore::_set_csum()
3881 {
3882 csum_type = Checksummer::CSUM_NONE;
3883 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
3884 if (t > Checksummer::CSUM_NONE)
3885 csum_type = t;
3886
3887 dout(10) << __func__ << " csum_type "
3888 << Checksummer::get_csum_type_string(csum_type)
3889 << dendl;
3890 }
3891
3892 void BlueStore::_set_throttle_params()
3893 {
3894 if (cct->_conf->bluestore_throttle_cost_per_io) {
3895 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
3896 } else {
3897 assert(bdev);
3898 if (bdev->is_rotational()) {
3899 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
3900 } else {
3901 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
3902 }
3903 }
3904
3905 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
3906 << dendl;
3907 }
3908 void BlueStore::_set_blob_size()
3909 {
3910 if (cct->_conf->bluestore_max_blob_size) {
3911 max_blob_size = cct->_conf->bluestore_max_blob_size;
3912 } else {
3913 assert(bdev);
3914 if (bdev->is_rotational()) {
3915 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
3916 } else {
3917 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
3918 }
3919 }
3920 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
3921 << std::dec << dendl;
3922 }
3923
3924 void BlueStore::_set_finisher_num()
3925 {
3926 if (cct->_conf->bluestore_shard_finishers) {
3927 if (cct->_conf->osd_op_num_shards) {
3928 m_finisher_num = cct->_conf->osd_op_num_shards;
3929 } else {
3930 assert(bdev);
3931 if (bdev->is_rotational()) {
3932 m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
3933 } else {
3934 m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
3935 }
3936 }
3937 }
3938 assert(m_finisher_num != 0);
3939 }
3940
3941 int BlueStore::_set_cache_sizes()
3942 {
3943 assert(bdev);
3944 cache_autotune = cct->_conf->get_val<bool>("bluestore_cache_autotune");
3945 cache_autotune_chunk_size =
3946 cct->_conf->get_val<uint64_t>("bluestore_cache_autotune_chunk_size");
3947 cache_autotune_interval =
3948 cct->_conf->get_val<double>("bluestore_cache_autotune_interval");
3949 osd_memory_target = cct->_conf->get_val<uint64_t>("osd_memory_target");
3950 osd_memory_base = cct->_conf->get_val<uint64_t>("osd_memory_base");
3951 osd_memory_expected_fragmentation =
3952 cct->_conf->get_val<double>("osd_memory_expected_fragmentation");
3953 osd_memory_cache_min = cct->_conf->get_val<uint64_t>("osd_memory_cache_min");
3954 osd_memory_cache_resize_interval =
3955 cct->_conf->get_val<double>("osd_memory_cache_resize_interval");
3956
3957 if (cct->_conf->bluestore_cache_size) {
3958 cache_size = cct->_conf->bluestore_cache_size;
3959 } else {
3960 // choose global cache size based on backend type
3961 if (bdev->is_rotational()) {
3962 cache_size = cct->_conf->bluestore_cache_size_hdd;
3963 } else {
3964 cache_size = cct->_conf->bluestore_cache_size_ssd;
3965 }
3966 }
3967
3968 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
3969 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
3970 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3971 << ") must be in range [0,1.0]" << dendl;
3972 return -EINVAL;
3973 }
3974
3975 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
3976 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
3977 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
3978 << ") must be in range [0,1.0]" << dendl;
3979 return -EINVAL;
3980 }
3981
3982 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
3983 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3984 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
3985 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
3986 << dendl;
3987 return -EINVAL;
3988 }
3989
3990 cache_data_ratio =
3991 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
3992 if (cache_data_ratio < 0) {
3993 // deal with floating point imprecision
3994 cache_data_ratio = 0;
3995 }
3996
3997 dout(1) << __func__ << " cache_size " << cache_size
3998 << " meta " << cache_meta_ratio
3999 << " kv " << cache_kv_ratio
4000 << " data " << cache_data_ratio
4001 << dendl;
4002 return 0;
4003 }
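
// Worked example of the ratio arithmetic above (numbers are illustrative, not
// the shipped defaults): with cache_size = 3 GiB, cache_meta_ratio = 0.4 and
// cache_kv_ratio = 0.4, both ratios pass the [0, 1.0] checks and their sum
// (0.8) is <= 1.0, so
//
//   cache_data_ratio = 1.0 - 0.4 - 0.4 = 0.2
//
// i.e. roughly 1.2 GiB each for onode metadata and the kv cache and 0.6 GiB
// for data buffers. A sum of meta + kv above 1.0 returns -EINVAL; a tiny
// negative data ratio caused by floating point imprecision is clamped to 0.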
4004
4005 int BlueStore::write_meta(const std::string& key, const std::string& value)
4006 {
4007 bluestore_bdev_label_t label;
4008 string p = path + "/block";
4009 int r = _read_bdev_label(cct, p, &label);
4010 if (r < 0) {
4011 return ObjectStore::write_meta(key, value);
4012 }
4013 label.meta[key] = value;
4014 r = _write_bdev_label(cct, p, label);
4015 assert(r == 0);
4016 return ObjectStore::write_meta(key, value);
4017 }
4018
4019 int BlueStore::read_meta(const std::string& key, std::string *value)
4020 {
4021 bluestore_bdev_label_t label;
4022 string p = path + "/block";
4023 int r = _read_bdev_label(cct, p, &label);
4024 if (r < 0) {
4025 return ObjectStore::read_meta(key, value);
4026 }
4027 auto i = label.meta.find(key);
4028 if (i == label.meta.end()) {
4029 return ObjectStore::read_meta(key, value);
4030 }
4031 *value = i->second;
4032 return 0;
4033 }
4034
4035 void BlueStore::_init_logger()
4036 {
4037 PerfCountersBuilder b(cct, "bluestore",
4038 l_bluestore_first, l_bluestore_last);
4039 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
4040 "Average kv_thread flush latency",
4041 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
4042 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
4043 "Average kv_thread commit latency");
4044 b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
4045 "Average kv_thread sync latency",
4046 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
4047 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
4048 "Average prepare state latency");
4049 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
4050 "Average aio_wait state latency",
4051 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
4052 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
4053 "Average io_done state latency");
4054 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
4055 "Average kv_queued state latency");
4056 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
4057 "Average kv_committing state latency");
4058 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
4059 "Average kv_done state latency");
4060 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
4061 "Average deferred_queued state latency");
4062 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
4063 "Average deferred_aio_wait state latency");
4064 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
4065 "Average cleanup state latency");
4066 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
4067 "Average finishing state latency");
4068 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
4069 "Average done state latency");
4070 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
4071 "Average submit throttle latency",
4072 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
4073 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
4074 "Average submit latency",
4075 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
4076 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
4077 "Average commit latency",
4078 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
4079 b.add_time_avg(l_bluestore_read_lat, "read_lat",
4080 "Average read latency",
4081 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
4082 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
4083 "Average read onode metadata latency");
4084 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
4085 "Average latency waiting for aio during reads");
4086 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
4087 "Average compress latency");
4088 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
4089 "Average decompress latency");
4090 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
4091 "Average checksum latency");
4092 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
4093 "Sum for beneficial compress ops");
4094 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
4095 "Sum for compress ops rejected due to low net gain of space");
4096 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
4097 "Sum for write-op padded bytes", NULL, 0, unit_t(BYTES));
4098 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
4099 "Sum for deferred write op");
4100 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
4101 "Sum for deferred write bytes", "def", 0, unit_t(BYTES));
4102 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
4103 "Sum for write penalty read ops");
4104 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
4105 "Sum for allocated bytes");
4106 b.add_u64(l_bluestore_stored, "bluestore_stored",
4107 "Sum for stored bytes");
4108 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
4109 "Sum for stored compressed bytes");
4110 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
4111 "Sum for bytes allocated for compressed data");
4112 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
4113 "Sum for original bytes that were compressed");
4114
4115 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
4116 "Number of onodes in cache");
4117 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
4118 "Sum for onode-lookups hit in the cache");
4119 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
4120 "Sum for onode-lookups missed in the cache");
4121 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
4122 "Sum for onode-shard lookups hit in the cache");
4123 b.add_u64_counter(l_bluestore_onode_shard_misses,
4124 "bluestore_onode_shard_misses",
4125 "Sum for onode-shard lookups missed in the cache");
4126 b.add_u64(l_bluestore_extents, "bluestore_extents",
4127 "Number of extents in cache");
4128 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
4129 "Number of blobs in cache");
4130 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
4131 "Number of buffers in cache");
4132 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
4133 "Number of buffer bytes in cache", NULL, 0, unit_t(BYTES));
4134 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
4135 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(BYTES));
4136 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
4137 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(BYTES));
4138
4139 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
4140 "Large aligned writes into fresh blobs");
4141 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
4142 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(BYTES));
4143 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
4144 "Large aligned writes into fresh blobs (blobs)");
4145 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
4146 "Small writes into existing or sparse small blobs");
4147 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
4148 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(BYTES));
4149 b.add_u64_counter(l_bluestore_write_small_unused,
4150 "bluestore_write_small_unused",
4151 "Small writes into unused portion of existing blob");
4152 b.add_u64_counter(l_bluestore_write_small_deferred,
4153 "bluestore_write_small_deferred",
4154 "Small overwrites using deferred");
4155 b.add_u64_counter(l_bluestore_write_small_pre_read,
4156 "bluestore_write_small_pre_read",
4157 "Small writes that required reading some data (possibly "
4158 "cached) to fill out the block");
4159 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
4160 "Small write into new (sparse) blob");
4161
4162 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
4163 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
4164 "Onode extent map reshard events");
4165 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
4166 "Sum for blob splitting due to resharding");
4167 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
4168 "Sum for extents that have been removed due to compression");
4169 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
4170 "Sum for extents that have been merged due to garbage "
4171 "collection");
4172 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
4173 "Read EIO errors propagated to high level callers");
4174 logger = b.create_perf_counters();
4175 cct->get_perfcounters_collection()->add(logger);
4176 }
4177
4178 int BlueStore::_reload_logger()
4179 {
4180 struct store_statfs_t store_statfs;
4181
4182 int r = statfs(&store_statfs);
4183 if(r >= 0) {
4184 logger->set(l_bluestore_allocated, store_statfs.allocated);
4185 logger->set(l_bluestore_stored, store_statfs.stored);
4186 logger->set(l_bluestore_compressed, store_statfs.compressed);
4187 logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
4188 logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
4189 }
4190 return r;
4191 }
4192
4193 void BlueStore::_shutdown_logger()
4194 {
4195 cct->get_perfcounters_collection()->remove(logger);
4196 delete logger;
4197 }
4198
4199 int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
4200 uuid_d *fsid)
4201 {
4202 bluestore_bdev_label_t label;
4203 int r = _read_bdev_label(cct, path, &label);
4204 if (r < 0)
4205 return r;
4206 *fsid = label.osd_uuid;
4207 return 0;
4208 }
4209
4210 int BlueStore::_open_path()
4211 {
4212 // sanity check(s)
4213 if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
4214 4*1024*1024*1024ull) {
4215 derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has a hard limit of 4GB." << dendl;
4216 return -EINVAL;
4217 }
4218 assert(path_fd < 0);
4219 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
4220 if (path_fd < 0) {
4221 int r = -errno;
4222 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
4223 << dendl;
4224 return r;
4225 }
4226 return 0;
4227 }
4228
4229 void BlueStore::_close_path()
4230 {
4231 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
4232 path_fd = -1;
4233 }
4234
4235 int BlueStore::_write_bdev_label(CephContext *cct,
4236 string path, bluestore_bdev_label_t label)
4237 {
4238 dout(10) << __func__ << " path " << path << " label " << label << dendl;
4239 bufferlist bl;
4240 ::encode(label, bl);
4241 uint32_t crc = bl.crc32c(-1);
4242 ::encode(crc, bl);
4243 assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
4244 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
4245 z.zero();
4246 bl.append(std::move(z));
4247
4248 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
4249 if (fd < 0) {
4250 fd = -errno;
4251 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4252 << dendl;
4253 return fd;
4254 }
4255 int r = bl.write_fd(fd);
4256 if (r < 0) {
4257 derr << __func__ << " failed to write to " << path
4258 << ": " << cpp_strerror(r) << dendl;
4259 }
4260 r = ::fsync(fd);
4261 if (r < 0) {
4262 derr << __func__ << " failed to fsync " << path
4263 << ": " << cpp_strerror(r) << dendl;
4264 }
4265 VOID_TEMP_FAILURE_RETRY(::close(fd));
4266 return r;
4267 }
4268
4269 int BlueStore::_read_bdev_label(CephContext* cct, string path,
4270 bluestore_bdev_label_t *label)
4271 {
4272 dout(10) << __func__ << dendl;
4273 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
4274 if (fd < 0) {
4275 fd = -errno;
4276 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4277 << dendl;
4278 return fd;
4279 }
4280 bufferlist bl;
4281 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4282 VOID_TEMP_FAILURE_RETRY(::close(fd));
4283 if (r < 0) {
4284 derr << __func__ << " failed to read from " << path
4285 << ": " << cpp_strerror(r) << dendl;
4286 return r;
4287 }
4288
4289 uint32_t crc, expected_crc;
4290 bufferlist::iterator p = bl.begin();
4291 try {
4292 ::decode(*label, p);
4293 bufferlist t;
4294 t.substr_of(bl, 0, p.get_off());
4295 crc = t.crc32c(-1);
4296 ::decode(expected_crc, p);
4297 }
4298 catch (buffer::error& e) {
4299 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
4300 << ": " << e.what()
4301 << dendl;
4302 return -ENOENT;
4303 }
4304 if (crc != expected_crc) {
4305 derr << __func__ << " bad crc on label, expected " << expected_crc
4306 << " != actual " << crc << dendl;
4307 return -EIO;
4308 }
4309 dout(10) << __func__ << " got " << *label << dendl;
4310 return 0;
4311 }
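
// Sketch of the label block that _write_bdev_label() produces and
// _read_bdev_label() parses (offsets assume BDEV_LABEL_BLOCK_SIZE = 4096 as
// defined near the top of this file):
//
//   [ encoded bluestore_bdev_label_t ]   // osd_uuid, size, btime, description, meta
//   [ u32 crc32c of the encoded bytes ]  // seeded with -1, verified on read
//   [ zero padding up to 4096 bytes   ]
//
// A crc mismatch returns -EIO and a decode failure returns -ENOENT, so callers
// such as write_meta()/read_meta() can fall back to the plain file-based
// ObjectStore meta.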
4312
4313 int BlueStore::_check_or_set_bdev_label(
4314 string path, uint64_t size, string desc, bool create)
4315 {
4316 bluestore_bdev_label_t label;
4317 if (create) {
4318 label.osd_uuid = fsid;
4319 label.size = size;
4320 label.btime = ceph_clock_now();
4321 label.description = desc;
4322 int r = _write_bdev_label(cct, path, label);
4323 if (r < 0)
4324 return r;
4325 } else {
4326 int r = _read_bdev_label(cct, path, &label);
4327 if (r < 0)
4328 return r;
4329 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4330 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4331 << " and fsid " << fsid << " check bypassed" << dendl;
4332 }
4333 else if (label.osd_uuid != fsid) {
4334 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4335 << " does not match our fsid " << fsid << dendl;
4336 return -EIO;
4337 }
4338 }
4339 return 0;
4340 }
4341
4342 void BlueStore::_set_alloc_sizes(void)
4343 {
4344 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4345
4346 if (cct->_conf->bluestore_prefer_deferred_size) {
4347 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4348 } else {
4349 assert(bdev);
4350 if (bdev->is_rotational()) {
4351 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4352 } else {
4353 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4354 }
4355 }
4356
4357 if (cct->_conf->bluestore_deferred_batch_ops) {
4358 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4359 } else {
4360 assert(bdev);
4361 if (bdev->is_rotational()) {
4362 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4363 } else {
4364 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4365 }
4366 }
4367
4368 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
4369 << std::dec << " order " << min_alloc_size_order
4370 << " max_alloc_size 0x" << std::hex << max_alloc_size
4371 << " prefer_deferred_size 0x" << prefer_deferred_size
4372 << std::dec
4373 << " deferred_batch_ops " << deferred_batch_ops
4374 << dendl;
4375 }
4376
4377 int BlueStore::_open_bdev(bool create)
4378 {
4379 assert(bdev == NULL);
4380 string p = path + "/block";
4381 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
4382 int r = bdev->open(p);
4383 if (r < 0)
4384 goto fail;
4385
4386 if (bdev->supported_bdev_label()) {
4387 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4388 if (r < 0)
4389 goto fail_close;
4390 }
4391
4392 // initialize global block parameters
4393 block_size = bdev->get_block_size();
4394 block_mask = ~(block_size - 1);
4395 block_size_order = ctz(block_size);
4396 assert(block_size == 1u << block_size_order);
4397 // and set cache_size based on device type
4398 r = _set_cache_sizes();
4399 if (r < 0) {
4400 goto fail_close;
4401 }
4402 return 0;
4403
4404 fail_close:
4405 bdev->close();
4406 fail:
4407 delete bdev;
4408 bdev = NULL;
4409 return r;
4410 }
4411
4412 void BlueStore::_close_bdev()
4413 {
4414 assert(bdev);
4415 bdev->close();
4416 delete bdev;
4417 bdev = NULL;
4418 }
4419
4420 int BlueStore::_open_fm(bool create)
4421 {
4422 assert(fm == NULL);
4423 fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
4424
4425 if (create) {
4426 // initialize freespace
4427 dout(20) << __func__ << " initializing freespace" << dendl;
4428 KeyValueDB::Transaction t = db->get_transaction();
4429 {
4430 bufferlist bl;
4431 bl.append(freelist_type);
4432 t->set(PREFIX_SUPER, "freelist_type", bl);
4433 }
4434 // being able to allocate in units less than bdev block size
4435 // seems to be a bad idea.
4436 assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
4437 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
4438
4439 // allocate superblock reserved space. note that we do not mark
4440 // bluefs space as allocated in the freelist; we instead rely on
4441 // bluefs_extents.
4442 uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
4443 min_alloc_size);
4444 fm->allocate(0, reserved, t);
4445
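// Worked example for the reservation above: SUPER_RESERVED is 8192 (device
// label + bluefs superblock). With a hypothetical min_alloc_size of 0x10000
// (64 KiB) the reserved region becomes
// ROUND_UP_TO(MAX(8192, 65536), 65536) = 65536 bytes, while with a 4 KiB
// min_alloc_size it stays at 8192; either way [0, reserved) is marked
// allocated in the freelist so nothing else can land there.
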
4446 if (cct->_conf->bluestore_bluefs) {
4447 assert(bluefs_extents.num_intervals() == 1);
4448 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
4449 reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
4450 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4451 << " for bluefs" << dendl;
4452 bufferlist bl;
4453 ::encode(bluefs_extents, bl);
4454 t->set(PREFIX_SUPER, "bluefs_extents", bl);
4455 dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
4456 << std::dec << dendl;
4457 }
4458
4459 if (cct->_conf->bluestore_debug_prefill > 0) {
4460 uint64_t end = bdev->get_size() - reserved;
4461 dout(1) << __func__ << " pre-fragmenting freespace, using "
4462 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4463 << cct->_conf->bluestore_debug_prefragment_max << dendl;
4464 uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
4465 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4466 float r = cct->_conf->bluestore_debug_prefill;
4467 r /= 1.0 - r;
4468 bool stop = false;
4469
4470 while (!stop && start < end) {
4471 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4472 if (start + l > end) {
4473 l = end - start;
4474 l = P2ALIGN(l, min_alloc_size);
4475 }
4476 assert(start + l <= end);
4477
4478 uint64_t u = 1 + (uint64_t)(r * (double)l);
4479 u = P2ROUNDUP(u, min_alloc_size);
4480 if (start + l + u > end) {
4481 u = end - (start + l);
4482 // trim to align so we don't overflow again
4483 u = P2ALIGN(u, min_alloc_size);
4484 stop = true;
4485 }
4486 assert(start + l + u <= end);
4487
4488 dout(20) << " free 0x" << std::hex << start << "~" << l
4489 << " use 0x" << u << std::dec << dendl;
4490
4491 if (u == 0) {
4492 // break if u has been trimmed to nothing
4493 break;
4494 }
4495
4496 fm->allocate(start + l, u, t);
4497 start += l + u;
4498 }
4499 }
4500 db->submit_transaction_sync(t);
4501 }
4502
4503 int r = fm->init(bdev->get_size());
4504 if (r < 0) {
4505 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4506 delete fm;
4507 fm = NULL;
4508 return r;
4509 }
4510 return 0;
4511 }
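
// Worked example of the debug pre-fragmentation loop above (values are
// hypothetical): with bluestore_debug_prefill = 0.2 the ratio becomes
// r = 0.2 / (1 - 0.2) = 0.25, so after each randomly sized free run of length
// l the loop marks roughly u = 0.25 * l as allocated. The used fraction is
// then u / (l + u) = r / (1 + r) = 0.2, i.e. the requested prefill, while
// bluestore_debug_prefragment_max bounds each free run so the remaining space
// is fragmented rather than one contiguous region.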
4512
4513 void BlueStore::_close_fm()
4514 {
4515 dout(10) << __func__ << dendl;
4516 assert(fm);
4517 fm->shutdown();
4518 delete fm;
4519 fm = NULL;
4520 }
4521
4522 int BlueStore::_open_alloc()
4523 {
4524 assert(alloc == NULL);
4525 assert(bdev->get_size());
4526 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4527 bdev->get_size(),
4528 min_alloc_size);
4529 if (!alloc) {
4530 lderr(cct) << __func__ << " unknown allocator type "
4531 << cct->_conf->bluestore_allocator
4532 << dendl;
4533 return -EINVAL;
4534 }
4535
4536 uint64_t num = 0, bytes = 0;
4537
4538 dout(1) << __func__ << " opening allocation metadata" << dendl;
4539 // initialize from freelist
4540 fm->enumerate_reset();
4541 uint64_t offset, length;
4542 while (fm->enumerate_next(&offset, &length)) {
4543 alloc->init_add_free(offset, length);
4544 ++num;
4545 bytes += length;
4546 }
4547 fm->enumerate_reset();
4548 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
4549 << " in " << num << " extents"
4550 << dendl;
4551
4552 // also mark bluefs space as allocated
4553 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4554 alloc->init_rm_free(e.get_start(), e.get_len());
4555 }
4556 dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
4557 << bluefs_extents << std::dec << " as allocated" << dendl;
4558
4559 return 0;
4560 }
4561
4562 void BlueStore::_close_alloc()
4563 {
4564 assert(alloc);
4565 alloc->shutdown();
4566 delete alloc;
4567 alloc = NULL;
4568 }
4569
4570 int BlueStore::_open_fsid(bool create)
4571 {
4572 assert(fsid_fd < 0);
4573 int flags = O_RDWR|O_CLOEXEC;
4574 if (create)
4575 flags |= O_CREAT;
4576 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4577 if (fsid_fd < 0) {
4578 int err = -errno;
4579 derr << __func__ << " " << cpp_strerror(err) << dendl;
4580 return err;
4581 }
4582 return 0;
4583 }
4584
4585 int BlueStore::_read_fsid(uuid_d *uuid)
4586 {
4587 char fsid_str[40];
4588 memset(fsid_str, 0, sizeof(fsid_str));
4589 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4590 if (ret < 0) {
4591 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4592 return ret;
4593 }
4594 if (ret > 36)
4595 fsid_str[36] = 0;
4596 else
4597 fsid_str[ret] = 0;
4598 if (!uuid->parse(fsid_str)) {
4599 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4600 return -EINVAL;
4601 }
4602 return 0;
4603 }
4604
4605 int BlueStore::_write_fsid()
4606 {
4607 int r = ::ftruncate(fsid_fd, 0);
4608 if (r < 0) {
4609 r = -errno;
4610 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4611 return r;
4612 }
4613 string str = stringify(fsid) + "\n";
4614 r = safe_write(fsid_fd, str.c_str(), str.length());
4615 if (r < 0) {
4616 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4617 return r;
4618 }
4619 r = ::fsync(fsid_fd);
4620 if (r < 0) {
4621 r = -errno;
4622 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4623 return r;
4624 }
4625 return 0;
4626 }
4627
4628 void BlueStore::_close_fsid()
4629 {
4630 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4631 fsid_fd = -1;
4632 }
4633
4634 int BlueStore::_lock_fsid()
4635 {
4636 struct flock l;
4637 memset(&l, 0, sizeof(l));
4638 l.l_type = F_WRLCK;
4639 l.l_whence = SEEK_SET;
4640 int r = ::fcntl(fsid_fd, F_SETLK, &l);
4641 if (r < 0) {
4642 int err = errno;
4643 derr << __func__ << " failed to lock " << path << "/fsid"
4644 << " (is another ceph-osd still running?)"
4645 << ": " << cpp_strerror(err) << dendl;
4646 return -err;
4647 }
4648 return 0;
4649 }
4650
4651 bool BlueStore::is_rotational()
4652 {
4653 if (bdev) {
4654 return bdev->is_rotational();
4655 }
4656
4657 bool rotational = true;
4658 int r = _open_path();
4659 if (r < 0)
4660 goto out;
4661 r = _open_fsid(false);
4662 if (r < 0)
4663 goto out_path;
4664 r = _read_fsid(&fsid);
4665 if (r < 0)
4666 goto out_fsid;
4667 r = _lock_fsid();
4668 if (r < 0)
4669 goto out_fsid;
4670 r = _open_bdev(false);
4671 if (r < 0)
4672 goto out_fsid;
4673 rotational = bdev->is_rotational();
4674 _close_bdev();
4675 out_fsid:
4676 _close_fsid();
4677 out_path:
4678 _close_path();
4679 out:
4680 return rotational;
4681 }
4682
4683 bool BlueStore::is_journal_rotational()
4684 {
4685 if (!bluefs) {
4686 dout(5) << __func__ << " bluefs disabled, default to store media type"
4687 << dendl;
4688 return is_rotational();
4689 }
4690 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
4691 return bluefs->wal_is_rotational();
4692 }
4693
4694 bool BlueStore::test_mount_in_use()
4695 {
4696 // most error conditions mean the mount is not in use (e.g., because
4697 // it doesn't exist). only if we fail to lock do we conclude it is
4698 // in use.
4699 bool ret = false;
4700 int r = _open_path();
4701 if (r < 0)
4702 return false;
4703 r = _open_fsid(false);
4704 if (r < 0)
4705 goto out_path;
4706 r = _lock_fsid();
4707 if (r < 0)
4708 ret = true; // if we can't lock, it is in use
4709 _close_fsid();
4710 out_path:
4711 _close_path();
4712 return ret;
4713 }
4714
4715 int BlueStore::_open_db(bool create)
4716 {
4717 int r;
4718 assert(!db);
4719 string fn = path + "/db";
4720 string options;
4721 stringstream err;
4722 ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
4723
4724 string kv_backend;
4725 if (create) {
4726 kv_backend = cct->_conf->bluestore_kvbackend;
4727 } else {
4728 r = read_meta("kv_backend", &kv_backend);
4729 if (r < 0) {
4730 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
4731 return -EIO;
4732 }
4733 }
4734 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
4735
4736 bool do_bluefs;
4737 if (create) {
4738 do_bluefs = cct->_conf->bluestore_bluefs;
4739 } else {
4740 string s;
4741 r = read_meta("bluefs", &s);
4742 if (r < 0) {
4743 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
4744 return -EIO;
4745 }
4746 if (s == "1") {
4747 do_bluefs = true;
4748 } else if (s == "0") {
4749 do_bluefs = false;
4750 } else {
4751 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
4752 << dendl;
4753 return -EIO;
4754 }
4755 }
4756 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
4757
4758 rocksdb::Env *env = NULL;
4759 if (do_bluefs) {
4760 dout(10) << __func__ << " initializing bluefs" << dendl;
4761 if (kv_backend != "rocksdb") {
4762 derr << " backend must be rocksdb to use bluefs" << dendl;
4763 return -EINVAL;
4764 }
4765 bluefs = new BlueFS(cct);
4766
4767 string bfn;
4768 struct stat st;
4769
4770 bfn = path + "/block.db";
4771 if (::stat(bfn.c_str(), &st) == 0) {
4772 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
4773 if (r < 0) {
4774 derr << __func__ << " add block device(" << bfn << ") returned: "
4775 << cpp_strerror(r) << dendl;
4776 goto free_bluefs;
4777 }
4778
4779 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
4780 r = _check_or_set_bdev_label(
4781 bfn,
4782 bluefs->get_block_device_size(BlueFS::BDEV_DB),
4783 "bluefs db", create);
4784 if (r < 0) {
4785 derr << __func__
4786 << " check block device(" << bfn << ") label returned: "
4787 << cpp_strerror(r) << dendl;
4788 goto free_bluefs;
4789 }
4790 }
4791 if (create) {
4792 bluefs->add_block_extent(
4793 BlueFS::BDEV_DB,
4794 SUPER_RESERVED,
4795 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
4796 }
4797 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
4798 bluefs_single_shared_device = false;
4799 } else {
4800 r = -errno;
4801 if (::lstat(bfn.c_str(), &st) == -1) {
4802 r = 0;
4803 bluefs_shared_bdev = BlueFS::BDEV_DB;
4804 } else {
4805 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
4806 << cpp_strerror(r) << dendl;
4807 goto free_bluefs;
4808 }
4809 }
4810
4811 // shared device
4812 bfn = path + "/block";
4813 r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
4814 if (r < 0) {
4815 derr << __func__ << " add block device(" << bfn << ") returned: "
4816 << cpp_strerror(r) << dendl;
4817 goto free_bluefs;
4818 }
4819 if (create) {
4820 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4821 uint64_t initial =
4822 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
4823 cct->_conf->bluestore_bluefs_gift_ratio);
4824 initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
4825 if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
4826 derr << __func__ << " bluefs_alloc_size 0x" << std::hex
4827 << cct->_conf->bluefs_alloc_size << " is not a multiple of "
4828 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4829 r = -EINVAL;
4830 goto free_bluefs;
4831 }
4832 // align to bluefs's alloc_size
4833 initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
4834 // put bluefs in the middle of the device in case it is an HDD
4835 uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
4836 cct->_conf->bluefs_alloc_size);
4837 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
4838 bluefs_extents.insert(start, initial);
4839 }
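// Worked example of the sizing above (all values hypothetical, not asserted
// defaults): for a 1 TiB main device with bluestore_bluefs_min_ratio = 0.02,
// bluestore_bluefs_gift_ratio = 0.02, bluestore_bluefs_min = 1 GiB and
// bluefs_alloc_size = 1 MiB:
//
//   initial = 1 TiB * (0.02 + 0.02) ~= 40.96 GiB    (already above the 1 GiB floor)
//   initial = P2ROUNDUP(initial, 1 MiB)             (align to the bluefs allocation unit)
//   start   = P2ALIGN((1 TiB - initial) / 2, 1 MiB) ~= 491.5 GiB
//
// so bluefs receives a ~41 GiB extent placed near the middle of the device,
// per the HDD-friendly placement noted in the comment above.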
4840
4841 bfn = path + "/block.wal";
4842 if (::stat(bfn.c_str(), &st) == 0) {
4843 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
4844 if (r < 0) {
4845 derr << __func__ << " add block device(" << bfn << ") returned: "
4846 << cpp_strerror(r) << dendl;
4847 goto free_bluefs;
4848 }
4849
4850 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
4851 r = _check_or_set_bdev_label(
4852 bfn,
4853 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
4854 "bluefs wal", create);
4855 if (r < 0) {
4856 derr << __func__ << " check block device(" << bfn
4857 << ") label returned: " << cpp_strerror(r) << dendl;
4858 goto free_bluefs;
4859 }
4860 }
4861
4862 if (create) {
4863 bluefs->add_block_extent(
4864 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
4865 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
4866 BDEV_LABEL_BLOCK_SIZE);
4867 }
4868 cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
4869 bluefs_single_shared_device = false;
4870 } else {
4871 r = -errno;
4872 if (::lstat(bfn.c_str(), &st) == -1) {
4873 r = 0;
4874 cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
4875 } else {
4876 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
4877 << cpp_strerror(r) << dendl;
4878 goto free_bluefs;
4879 }
4880 }
4881
4882 if (create) {
4883 bluefs->mkfs(fsid);
4884 }
4885 r = bluefs->mount();
4886 if (r < 0) {
4887 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
4888 goto free_bluefs;
4889 }
4890 if (cct->_conf->bluestore_bluefs_env_mirror) {
4891 rocksdb::Env *a = new BlueRocksEnv(bluefs);
4892 rocksdb::Env *b = rocksdb::Env::Default();
4893 if (create) {
4894 string cmd = "rm -rf " + path + "/db " +
4895 path + "/db.slow " +
4896 path + "/db.wal";
4897 int r = system(cmd.c_str());
4898 (void)r;
4899 }
4900 env = new rocksdb::EnvMirror(b, a, false, true);
4901 } else {
4902 env = new BlueRocksEnv(bluefs);
4903
4904 // simplify the dir names, too, as "seen" by rocksdb
4905 fn = "db";
4906 }
4907
4908 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
4909 // we have both block.db and block; tell rocksdb!
4910 // note: the second (last) size value doesn't really matter
4911 ostringstream db_paths;
4912 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
4913 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
4914 db_paths << fn << ","
4915 << (uint64_t)(db_size * 95 / 100) << " "
4916 << fn + ".slow" << ","
4917 << (uint64_t)(slow_size * 95 / 100);
4918 cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
4919 dout(10) << __func__ << " set rocksdb_db_paths to "
4920 << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
4921 }
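
// Example of the resulting option value (illustrative sizes): with a 10 GiB
// block.db and a 1 TiB shared main device the computed string would be
//
//   rocksdb_db_paths = "db,10200547328 db.slow,1044536046387"
//
// i.e. 95% of each device, so rocksdb starts placing SST files on the slow
// path once the fast db path reaches its target size (the final size value is
// largely informational, per the note above).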
4922
4923 if (create) {
4924 env->CreateDir(fn);
4925 if (cct->_conf->rocksdb_separate_wal_dir)
4926 env->CreateDir(fn + ".wal");
4927 if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
4928 env->CreateDir(fn + ".slow");
4929 }
4930 } else if (create) {
4931 int r = ::mkdir(fn.c_str(), 0755);
4932 if (r < 0)
4933 r = -errno;
4934 if (r < 0 && r != -EEXIST) {
4935 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
4936 << dendl;
4937 return r;
4938 }
4939
4940 // wal_dir, too!
4941 if (cct->_conf->rocksdb_separate_wal_dir) {
4942 string walfn = path + "/db.wal";
4943 r = ::mkdir(walfn.c_str(), 0755);
4944 if (r < 0)
4945 r = -errno;
4946 if (r < 0 && r != -EEXIST) {
4947 derr << __func__ << " failed to create " << walfn
4948 << ": " << cpp_strerror(r)
4949 << dendl;
4950 return r;
4951 }
4952 }
4953 }
4954
4955
4956 db = KeyValueDB::create(cct,
4957 kv_backend,
4958 fn,
4959 static_cast<void*>(env));
4960 if (!db) {
4961 derr << __func__ << " error creating db" << dendl;
4962 if (bluefs) {
4963 bluefs->umount();
4964 delete bluefs;
4965 bluefs = NULL;
4966 }
4967 // delete env manually here since we can't depend on db to do this
4968 // under this case
4969 delete env;
4970 env = NULL;
4971 return -EIO;
4972 }
4973
4974 FreelistManager::setup_merge_operators(db);
4975 db->set_merge_operator(PREFIX_STAT, merge_op);
4976 db->set_cache_size(cache_kv_ratio * cache_size);
4977
4978 if (kv_backend == "rocksdb")
4979 options = cct->_conf->bluestore_rocksdb_options;
4980 db->init(options);
4981 if (create)
4982 r = db->create_and_open(err);
4983 else
4984 r = db->open(err);
4985 if (r) {
4986 derr << __func__ << " error opening db: " << err.str() << dendl;
4987 if (bluefs) {
4988 bluefs->umount();
4989 delete bluefs;
4990 bluefs = NULL;
4991 }
4992 delete db;
4993 db = NULL;
4994 return -EIO;
4995 }
4996 dout(1) << __func__ << " opened " << kv_backend
4997 << " path " << fn << " options " << options << dendl;
4998 return 0;
4999
5000 free_bluefs:
5001 assert(bluefs);
5002 delete bluefs;
5003 bluefs = NULL;
5004 return r;
5005 }
5006
5007 void BlueStore::_close_db()
5008 {
5009 assert(db);
5010 delete db;
5011 db = NULL;
5012 if (bluefs) {
5013 bluefs->umount();
5014 delete bluefs;
5015 bluefs = NULL;
5016 }
5017 }
5018
5019 int BlueStore::_reconcile_bluefs_freespace()
5020 {
5021 dout(10) << __func__ << dendl;
5022 interval_set<uint64_t> bset;
5023 int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
5024 assert(r == 0);
5025 if (bset == bluefs_extents) {
5026 dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
5027 << std::dec << dendl;
5028 return 0;
5029 }
5030 dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
5031 << dendl;
5032 dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
5033 << std::dec << dendl;
5034
5035 interval_set<uint64_t> overlap;
5036 overlap.intersection_of(bset, bluefs_extents);
5037
5038 bset.subtract(overlap);
5039 if (!bset.empty()) {
5040 derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
5041 << dendl;
5042 return -EIO;
5043 }
5044
5045 interval_set<uint64_t> super_extra;
5046 super_extra = bluefs_extents;
5047 super_extra.subtract(overlap);
5048 if (!super_extra.empty()) {
5049 // This is normal: it can happen if we commit to give extents to
5050 // bluefs and we crash before bluefs commits that it owns them.
5051 dout(10) << __func__ << " super extra " << super_extra << dendl;
5052 for (interval_set<uint64_t>::iterator p = super_extra.begin();
5053 p != super_extra.end();
5054 ++p) {
5055 bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
5056 }
5057 }
5058
5059 return 0;
5060 }
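
// Worked example of the reconciliation above (hypothetical extents): suppose
// the superblock's bluefs_extents is {0x8000000~0x8000000} while bluefs itself
// reports only {0x8000000~0x4000000}. The overlap is the smaller extent, the
// "bluefs extra" set is empty (bluefs never owns space the superblock does not
// know about), and the "super extra" set is {0xc000000~0x4000000}, which is
// simply handed back to bluefs via add_block_extent(). The reverse case, a
// non-empty "bluefs extra" set, is treated as corruption and returns -EIO.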
5061
5062 int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
5063 {
5064 int ret = 0;
5065 assert(bluefs);
5066
5067 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
5068 bluefs->get_usage(&bluefs_usage);
5069 assert(bluefs_usage.size() > bluefs_shared_bdev);
5070
5071 // fixme: look at primary bdev only for now
5072 uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
5073 uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
5074 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
5075
5076 uint64_t my_free = alloc->get_free();
5077 uint64_t total = bdev->get_size();
5078 float my_free_ratio = (float)my_free / (float)total;
5079
5080 uint64_t total_free = bluefs_free + my_free;
5081
5082 float bluefs_ratio = (float)bluefs_free / (float)total_free;
5083
5084 dout(10) << __func__
5085 << " bluefs " << byte_u_t(bluefs_free)
5086 << " free (" << bluefs_free_ratio
5087 << ") bluestore " << byte_u_t(my_free)
5088 << " free (" << my_free_ratio
5089 << "), bluefs_ratio " << bluefs_ratio
5090 << dendl;
5091
5092 uint64_t gift = 0;
5093 uint64_t reclaim = 0;
5094 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
5095 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
5096 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5097 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
5098 << ", should gift " << byte_u_t(gift) << dendl;
5099 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
5100 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
5101 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
5102 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
5103 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5104 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
5105 << ", should reclaim " << byte_u_t(reclaim) << dendl;
5106 }
5107
5108 // don't take over too much of the freespace
5109 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
5110 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
5111 cct->_conf->bluestore_bluefs_min < free_cap) {
5112 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
5113 dout(10) << __func__ << " bluefs_total " << bluefs_total
5114 << " < min " << cct->_conf->bluestore_bluefs_min
5115 << ", should gift " << byte_u_t(g) << dendl;
5116 if (g > gift)
5117 gift = g;
5118 reclaim = 0;
5119 }
5120 uint64_t min_free = cct->_conf->get_val<uint64_t>("bluestore_bluefs_min_free");
5121 if (bluefs_free < min_free &&
5122 min_free < free_cap) {
5123 uint64_t g = min_free - bluefs_free;
5124 dout(10) << __func__ << " bluefs_free " << bluefs_free
5125 << " < min " << min_free
5126 << ", should gift " << byte_u_t(g) << dendl;
5127 if (g > gift)
5128 gift = g;
5129 reclaim = 0;
5130 }
5131
5132 if (gift) {
5133 // round up to alloc size
5134 gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
5135
5136 // hard cap to fit into 32 bits
5137 gift = MIN(gift, 1ull<<31);
5138 dout(10) << __func__ << " gifting " << gift
5139 << " (" << byte_u_t(gift) << ")" << dendl;
5140
5141 // fixme: just do one allocation to start...
5142 int r = alloc->reserve(gift);
5143 assert(r == 0);
5144
5145 AllocExtentVector exts;
5146 int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
5147 0, 0, &exts);
5148
5149 if (alloc_len <= 0) {
5150 dout(1) << __func__ << " failed to allocate 0x" << std::hex << gift
5151 << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
5152 alloc->unreserve(gift);
5153 alloc->dump();
5154 return 0;
5155 } else if (alloc_len < (int64_t)gift) {
5156 dout(1) << __func__ << " insufficient allocation on 0x" << std::hex << gift
5157 << " min_alloc_size 0x" << min_alloc_size
5158 << " allocated 0x" << alloc_len
5159 << std::dec << dendl;
5160 alloc->unreserve(gift - alloc_len);
5161 alloc->dump();
5162 }
5163 for (auto& p : exts) {
5164 bluestore_pextent_t e = bluestore_pextent_t(p);
5165 dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
5166 extents->push_back(e);
5167 }
5168 gift = 0;
5169
5170 ret = 1;
5171 }
5172
5173 // reclaim from bluefs?
5174 if (reclaim) {
5175 // round up to alloc size
5176 reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);
5177
5178 // hard cap to fit into 32 bits
5179 reclaim = MIN(reclaim, 1ull<<31);
5180 dout(10) << __func__ << " reclaiming " << reclaim
5181 << " (" << byte_u_t(reclaim) << ")" << dendl;
5182
5183 while (reclaim > 0) {
5184 // NOTE: this will block and do IO.
5185 AllocExtentVector extents;
5186 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
5187 &extents);
5188 if (r < 0) {
5189 derr << __func__ << " failed to reclaim space from bluefs"
5190 << dendl;
5191 break;
5192 }
5193 for (auto e : extents) {
5194 bluefs_extents.erase(e.offset, e.length);
5195 bluefs_extents_reclaiming.insert(e.offset, e.length);
5196 reclaim -= e.length;
5197 }
5198 }
5199
5200 ret = 1;
5201 }
5202
5203 return ret;
5204 }
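
// Worked example of the gift/reclaim arithmetic above (hypothetical numbers
// and config values): with bluefs_free = 1 GiB, bluestore free = 99 GiB,
// bluestore_bluefs_min_ratio = 0.02 and bluestore_bluefs_gift_ratio = 0.02:
//
//   total_free   = 100 GiB
//   bluefs_ratio = 1 GiB / 100 GiB = 0.01 < 0.02   -> gift
//   gift         = 0.02 * 100 GiB  = 2 GiB
//
// The gift is rounded up to bluefs_alloc_size, capped at 2^31 bytes, carved
// out of the main allocator and returned in *extents for the caller to record
// in bluefs_extents and hand to bluefs via _commit_bluefs_freespace(). The
// reclaim path works in reverse, using bluestore_bluefs_reclaim_ratio and
// bluefs->reclaim_blocks().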
5205
5206 void BlueStore::_commit_bluefs_freespace(
5207 const PExtentVector& bluefs_gift_extents)
5208 {
5209 dout(10) << __func__ << dendl;
5210 for (auto& p : bluefs_gift_extents) {
5211 bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
5212 }
5213 }
5214
5215 int BlueStore::_open_collections(int *errors)
5216 {
5217 dout(10) << __func__ << dendl;
5218 assert(coll_map.empty());
5219 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
5220 for (it->upper_bound(string());
5221 it->valid();
5222 it->next()) {
5223 coll_t cid;
5224 if (cid.parse(it->key())) {
5225 CollectionRef c(
5226 new Collection(
5227 this,
5228 cache_shards[cid.hash_to_shard(cache_shards.size())],
5229 cid));
5230 bufferlist bl = it->value();
5231 bufferlist::iterator p = bl.begin();
5232 try {
5233 ::decode(c->cnode, p);
5234 } catch (buffer::error& e) {
5235 derr << __func__ << " failed to decode cnode, key:"
5236 << pretty_binary_string(it->key()) << dendl;
5237 return -EIO;
5238 }
5239 dout(20) << __func__ << " opened " << cid << " " << c
5240 << " " << c->cnode << dendl;
5241 coll_map[cid] = c;
5242 } else {
5243 derr << __func__ << " unrecognized collection " << it->key() << dendl;
5244 if (errors)
5245 (*errors)++;
5246 }
5247 }
5248 return 0;
5249 }
5250
5251 void BlueStore::_open_statfs()
5252 {
5253 bufferlist bl;
5254 int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
5255 if (r >= 0) {
5256 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
5257 auto it = bl.begin();
5258 vstatfs.decode(it);
5259 } else {
5260 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
5261 }
5262 }
5263 else {
5264 dout(10) << __func__ << " store_statfs not found, using empty" << dendl;
5265 }
5266 }
5267
5268 int BlueStore::_setup_block_symlink_or_file(
5269 string name,
5270 string epath,
5271 uint64_t size,
5272 bool create)
5273 {
5274 dout(20) << __func__ << " name " << name << " path " << epath
5275 << " size " << size << " create=" << (int)create << dendl;
5276 int r = 0;
5277 int flags = O_RDWR|O_CLOEXEC;
5278 if (create)
5279 flags |= O_CREAT;
5280 if (epath.length()) {
5281 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
5282 if (r < 0) {
5283 r = -errno;
5284 derr << __func__ << " failed to create " << name << " symlink to "
5285 << epath << ": " << cpp_strerror(r) << dendl;
5286 return r;
5287 }
5288
5289 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
5290 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
5291 if (fd < 0) {
5292 r = -errno;
5293 derr << __func__ << " failed to open " << epath << " file: "
5294 << cpp_strerror(r) << dendl;
5295 return r;
5296 }
5297 string serial_number = epath.substr(strlen(SPDK_PREFIX));
5298 r = ::write(fd, serial_number.c_str(), serial_number.size());
5299 assert(r == (int)serial_number.size());
5300 dout(1) << __func__ << " created " << name << " symlink to "
5301 << epath << dendl;
5302 VOID_TEMP_FAILURE_RETRY(::close(fd));
5303 }
5304 }
5305 if (size) {
5306 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
5307 if (fd >= 0) {
5308 // block file is present
5309 struct stat st;
5310 int r = ::fstat(fd, &st);
5311 if (r == 0 &&
5312 S_ISREG(st.st_mode) && // if it is a regular file
5313 st.st_size == 0) { // and is 0 bytes
5314 r = ::ftruncate(fd, size);
5315 if (r < 0) {
5316 r = -errno;
5317 derr << __func__ << " failed to resize " << name << " file to "
5318 << size << ": " << cpp_strerror(r) << dendl;
5319 VOID_TEMP_FAILURE_RETRY(::close(fd));
5320 return r;
5321 }
5322
5323 if (cct->_conf->bluestore_block_preallocate_file) {
5324 r = ::ceph_posix_fallocate(fd, 0, size);
5325 if (r > 0) {
5326 derr << __func__ << " failed to preallocate " << name << " file to "
5327 << size << ": " << cpp_strerror(r) << dendl;
5328 VOID_TEMP_FAILURE_RETRY(::close(fd));
5329 return -r;
5330 }
5331 }
5332 dout(1) << __func__ << " resized " << name << " file to "
5333 << byte_u_t(size) << dendl;
5334 }
5335 VOID_TEMP_FAILURE_RETRY(::close(fd));
5336 } else {
5337 int r = -errno;
5338 if (r != -ENOENT) {
5339 derr << __func__ << " failed to open " << name << " file: "
5340 << cpp_strerror(r) << dendl;
5341 return r;
5342 }
5343 }
5344 }
5345 return 0;
5346 }
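
// Example of the resulting on-disk layout after mkfs (paths are illustrative):
//
//   $osd_data/block     -> /dev/sdb         (main device, always present)
//   $osd_data/block.db  -> /dev/nvme0n1p1   (optional fast db device)
//   $osd_data/block.wal -> /dev/nvme0n1p2   (optional bluefs wal device)
//
// When no target path is given but a size and the matching *_create option
// are set, a regular file of that size is created in the osd data directory
// instead, truncated to the requested size and optionally preallocated when
// bluestore_block_preallocate_file is enabled.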
5347
5348 int BlueStore::mkfs()
5349 {
5350 dout(1) << __func__ << " path " << path << dendl;
5351 int r;
5352 uuid_d old_fsid;
5353
5354 {
5355 string done;
5356 r = read_meta("mkfs_done", &done);
5357 if (r == 0) {
5358 dout(1) << __func__ << " already created" << dendl;
5359 if (cct->_conf->bluestore_fsck_on_mkfs) {
5360 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5361 if (r < 0) {
5362 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
5363 << dendl;
5364 return r;
5365 }
5366 if (r > 0) {
5367 derr << __func__ << " fsck found " << r << " errors" << dendl;
5368 r = -EIO;
5369 }
5370 }
5371 return r; // idempotent
5372 }
5373 }
5374
5375 {
5376 string type;
5377 r = read_meta("type", &type);
5378 if (r == 0) {
5379 if (type != "bluestore") {
5380 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5381 return -EIO;
5382 }
5383 } else {
5384 r = write_meta("type", "bluestore");
5385 if (r < 0)
5386 return r;
5387 }
5388 }
5389
5390 freelist_type = "bitmap";
5391
5392 r = _open_path();
5393 if (r < 0)
5394 return r;
5395
5396 r = _open_fsid(true);
5397 if (r < 0)
5398 goto out_path_fd;
5399
5400 r = _lock_fsid();
5401 if (r < 0)
5402 goto out_close_fsid;
5403
5404 r = _read_fsid(&old_fsid);
5405 if (r < 0 || old_fsid.is_zero()) {
5406 if (fsid.is_zero()) {
5407 fsid.generate_random();
5408 dout(1) << __func__ << " generated fsid " << fsid << dendl;
5409 } else {
5410 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
5411 }
5412 // we'll write it later.
5413 } else {
5414 if (!fsid.is_zero() && fsid != old_fsid) {
5415 derr << __func__ << " on-disk fsid " << old_fsid
5416 << " != provided " << fsid << dendl;
5417 r = -EINVAL;
5418 goto out_close_fsid;
5419 }
5420 fsid = old_fsid;
5421 }
5422
5423 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
5424 cct->_conf->bluestore_block_size,
5425 cct->_conf->bluestore_block_create);
5426 if (r < 0)
5427 goto out_close_fsid;
5428 if (cct->_conf->bluestore_bluefs) {
5429 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
5430 cct->_conf->bluestore_block_wal_size,
5431 cct->_conf->bluestore_block_wal_create);
5432 if (r < 0)
5433 goto out_close_fsid;
5434 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
5435 cct->_conf->bluestore_block_db_size,
5436 cct->_conf->bluestore_block_db_create);
5437 if (r < 0)
5438 goto out_close_fsid;
5439 }
5440
5441 r = _open_bdev(true);
5442 if (r < 0)
5443 goto out_close_fsid;
5444
5445 // choose min_alloc_size
5446 if (cct->_conf->bluestore_min_alloc_size) {
5447 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
5448 } else {
5449 assert(bdev);
5450 if (bdev->is_rotational()) {
5451 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
5452 } else {
5453 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
5454 }
5455 }
5456
5457 // make sure min_alloc_size is power of 2 aligned.
5458 if (!ISP2(min_alloc_size)) {
5459 derr << __func__ << " min_alloc_size 0x"
5460 << std::hex << min_alloc_size << std::dec
5461 << " is not power of 2 aligned!"
5462 << dendl;
5463 r = -EINVAL;
5464 goto out_close_bdev;
5465 }
5466
5467 r = _open_db(true);
5468 if (r < 0)
5469 goto out_close_bdev;
5470
5471 r = _open_fm(true);
5472 if (r < 0)
5473 goto out_close_db;
5474
5475 {
5476 KeyValueDB::Transaction t = db->get_transaction();
5477 {
5478 bufferlist bl;
5479 ::encode((uint64_t)0, bl);
5480 t->set(PREFIX_SUPER, "nid_max", bl);
5481 t->set(PREFIX_SUPER, "blobid_max", bl);
5482 }
5483
5484 {
5485 bufferlist bl;
5486 ::encode((uint64_t)min_alloc_size, bl);
5487 t->set(PREFIX_SUPER, "min_alloc_size", bl);
5488 }
5489
5490 ondisk_format = latest_ondisk_format;
5491 _prepare_ondisk_format_super(t);
5492 db->submit_transaction_sync(t);
5493 }
5494
5495
5496 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
5497 if (r < 0)
5498 goto out_close_fm;
5499
5500 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
5501 if (r < 0)
5502 goto out_close_fm;
5503
5504 if (fsid != old_fsid) {
5505 r = _write_fsid();
5506 if (r < 0) {
5507 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
5508 goto out_close_fm;
5509 }
5510 }
5511
5512 out_close_fm:
5513 _close_fm();
5514 out_close_db:
5515 _close_db();
5516 out_close_bdev:
5517 _close_bdev();
5518 out_close_fsid:
5519 _close_fsid();
5520 out_path_fd:
5521 _close_path();
5522
5523 if (r == 0 &&
5524 cct->_conf->bluestore_fsck_on_mkfs) {
5525 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5526 if (rc < 0)
5527 return rc;
5528 if (rc > 0) {
5529 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5530 r = -EIO;
5531 }
5532 }
5533
5534 if (r == 0) {
5535 // indicate success by writing the 'mkfs_done' file
5536 r = write_meta("mkfs_done", "yes");
5537 }
5538
5539 if (r < 0) {
5540 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
5541 } else {
5542 dout(0) << __func__ << " success" << dendl;
5543 }
5544 return r;
5545 }
5546
5547 void BlueStore::set_cache_shards(unsigned num)
5548 {
5549 dout(10) << __func__ << " " << num << dendl;
5550 size_t old = cache_shards.size();
5551 assert(num >= old);
5552 cache_shards.resize(num);
5553 for (unsigned i = old; i < num; ++i) {
5554 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
5555 logger);
5556 }
5557 }
5558
5559 int BlueStore::_mount(bool kv_only)
5560 {
5561 dout(1) << __func__ << " path " << path << dendl;
5562
5563 _kv_only = kv_only;
5564
5565 {
5566 string type;
5567 int r = read_meta("type", &type);
5568 if (r < 0) {
5569 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5570 << dendl;
5571 return r;
5572 }
5573
5574 if (type != "bluestore") {
5575 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5576 return -EIO;
5577 }
5578 }
5579
5580 if (cct->_conf->bluestore_fsck_on_mount) {
5581 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
5582 if (rc < 0)
5583 return rc;
5584 if (rc > 0) {
5585 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5586 return -EIO;
5587 }
5588 }
5589
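// Bring the store up in dependency order: path, fsid (read + lock), block
// device, kv db, super meta, freelist, allocator, then collections. The
// out_* labels below unwind in reverse order on failure.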
5590 int r = _open_path();
5591 if (r < 0)
5592 return r;
5593 r = _open_fsid(false);
5594 if (r < 0)
5595 goto out_path;
5596
5597 r = _read_fsid(&fsid);
5598 if (r < 0)
5599 goto out_fsid;
5600
5601 r = _lock_fsid();
5602 if (r < 0)
5603 goto out_fsid;
5604
5605 r = _open_bdev(false);
5606 if (r < 0)
5607 goto out_fsid;
5608
5609 r = _open_db(false);
5610 if (r < 0)
5611 goto out_bdev;
5612
5613 if (kv_only)
5614 return 0;
5615
5616 r = _open_super_meta();
5617 if (r < 0)
5618 goto out_db;
5619
5620 r = _open_fm(false);
5621 if (r < 0)
5622 goto out_db;
5623
5624 r = _open_alloc();
5625 if (r < 0)
5626 goto out_fm;
5627
5628 r = _open_collections();
5629 if (r < 0)
5630 goto out_alloc;
5631
5632 r = _reload_logger();
5633 if (r < 0)
5634 goto out_coll;
5635
5636 if (bluefs) {
5637 r = _reconcile_bluefs_freespace();
5638 if (r < 0)
5639 goto out_coll;
5640 }
5641
5642 _kv_start();
5643
5644 r = _deferred_replay();
5645 if (r < 0)
5646 goto out_stop;
5647
5648 mempool_thread.init();
5649
5650 mounted = true;
5651 return 0;
5652
5653 out_stop:
5654 _kv_stop();
5655 out_coll:
5656 _flush_cache();
5657 out_alloc:
5658 _close_alloc();
5659 out_fm:
5660 _close_fm();
5661 out_db:
5662 _close_db();
5663 out_bdev:
5664 _close_bdev();
5665 out_fsid:
5666 _close_fsid();
5667 out_path:
5668 _close_path();
5669 return r;
5670 }
5671
5672 int BlueStore::umount()
5673 {
5674 assert(_kv_only || mounted);
5675 dout(1) << __func__ << dendl;
5676
5677 _osr_drain_all();
5678 _osr_unregister_all();
5679
5680 mounted = false;
5681 if (!_kv_only) {
5682 mempool_thread.shutdown();
5683 dout(20) << __func__ << " stopping kv thread" << dendl;
5684 _kv_stop();
5685 _flush_cache();
5686 dout(20) << __func__ << " closing" << dendl;
5687
5688 _close_alloc();
5689 _close_fm();
5690 }
5691 _close_db();
5692 _close_bdev();
5693 _close_fsid();
5694 _close_path();
5695
5696 if (cct->_conf->bluestore_fsck_on_umount) {
5697 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
5698 if (rc < 0)
5699 return rc;
5700 if (rc > 0) {
5701 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5702 return -EIO;
5703 }
5704 }
5705 return 0;
5706 }
5707
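// Helper used by fsck below: walk [off, off+len) in steps of 'granularity'
// and invoke f() with the corresponding bit position in the bitset, so
// callers can mark or clear whole allocation units at a time.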
5708 static void apply(uint64_t off,
5709 uint64_t len,
5710 uint64_t granularity,
5711 BlueStore::mempool_dynamic_bitset &bitset,
5712 std::function<void(uint64_t,
5713 BlueStore::mempool_dynamic_bitset &)> f) {
5714 auto end = ROUND_UP_TO(off + len, granularity);
5715 while (off < end) {
5716 uint64_t pos = off / granularity;
5717 f(pos, bitset);
5718 off += granularity;
5719 }
5720 }
5721
5722 int BlueStore::_fsck_check_extents(
5723 const ghobject_t& oid,
5724 const PExtentVector& extents,
5725 bool compressed,
5726 mempool_dynamic_bitset &used_blocks,
5727 uint64_t granularity,
5728 store_statfs_t& expected_statfs)
5729 {
5730 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
5731 int errors = 0;
5732 for (auto e : extents) {
5733 if (!e.is_valid())
5734 continue;
5735 expected_statfs.allocated += e.length;
5736 if (compressed) {
5737 expected_statfs.compressed_allocated += e.length;
5738 }
5739 bool already = false;
5740 apply(
5741 e.offset, e.length, granularity, used_blocks,
5742 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5743 assert(pos < bs.size());
5744 if (bs.test(pos))
5745 already = true;
5746 else
5747 bs.set(pos);
5748 });
5749 if (already) {
5750 derr << " " << oid << " extent " << e
5751 << " or a subset is already allocated" << dendl;
5752 ++errors;
5753 }
5754 if (e.end() > bdev->get_size()) {
5755 derr << " " << oid << " extent " << e
5756 << " past end of block device" << dendl;
5757 ++errors;
5758 }
5759 }
5760 return errors;
5761 }
5762
5763 int BlueStore::_fsck(bool deep, bool repair)
5764 {
5765 dout(1) << __func__
5766 << (repair ? " repair" : " fsck")
5767 << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
5768 int errors = 0;
5769 int repaired = 0;
5770
5771 typedef btree::btree_set<
5772 uint64_t,std::less<uint64_t>,
5773 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
5774 uint64_t_btree_t used_nids;
5775 uint64_t_btree_t used_omap_head;
5776 uint64_t_btree_t used_sbids;
5777
5778 mempool_dynamic_bitset used_blocks;
5779 KeyValueDB::Iterator it;
5780 store_statfs_t expected_statfs, actual_statfs;
5781 struct sb_info_t {
5782 list<ghobject_t> oids;
5783 SharedBlobRef sb;
5784 bluestore_extent_ref_map_t ref_map;
5785 bool compressed;
5786 };
5787 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
5788
5789 uint64_t num_objects = 0;
5790 uint64_t num_extents = 0;
5791 uint64_t num_blobs = 0;
5792 uint64_t num_spanning_blobs = 0;
5793 uint64_t num_shared_blobs = 0;
5794 uint64_t num_sharded_objects = 0;
5795 uint64_t num_object_shards = 0;
5796
5797 utime_t start = ceph_clock_now();
5798
5799 int r = _open_path();
5800 if (r < 0)
5801 return r;
5802 r = _open_fsid(false);
5803 if (r < 0)
5804 goto out_path;
5805
5806 r = _read_fsid(&fsid);
5807 if (r < 0)
5808 goto out_fsid;
5809
5810 r = _lock_fsid();
5811 if (r < 0)
5812 goto out_fsid;
5813
5814 r = _open_bdev(false);
5815 if (r < 0)
5816 goto out_fsid;
5817
5818 r = _open_db(false);
5819 if (r < 0)
5820 goto out_bdev;
5821
5822 r = _open_super_meta();
5823 if (r < 0)
5824 goto out_db;
5825
5826 r = _open_fm(false);
5827 if (r < 0)
5828 goto out_db;
5829
5830 r = _open_alloc();
5831 if (r < 0)
5832 goto out_fm;
5833
5834 r = _open_collections(&errors);
5835 if (r < 0)
5836 goto out_alloc;
5837
5838 mempool_thread.init();
5839
5840 // we need finishers and kv_{sync,finalize}_thread *just* for replay
5841 _kv_start();
5842 r = _deferred_replay();
5843 _kv_stop();
5844 if (r < 0)
5845 goto out_scan;
5846
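// Track disk usage with one bit per allocation unit. The reserved region at
// the start of the device, bluefs extents, object extents, deferred releases
// and the freelist are all folded into used_blocks below; anything referenced
// twice, or missed entirely, is reported as an fsck error.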
5847 used_blocks.resize(fm->get_alloc_units());
5848 apply(
5849 0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
5850 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5851 assert(pos < bs.size());
5852 bs.set(pos);
5853 }
5854 );
5855
5856 if (bluefs) {
5857 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5858 apply(
5859 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
5860 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5861 assert(pos < bs.size());
5862 bs.set(pos);
5863 }
5864 );
5865 }
5866 r = bluefs->fsck();
5867 if (r < 0) {
5868 goto out_scan;
5869 }
5870 if (r > 0)
5871 errors += r;
5872 }
5873
5874 // get expected statfs; fill unaffected fields to be able to compare
5875 // structs
5876 statfs(&actual_statfs);
5877 expected_statfs.total = actual_statfs.total;
5878 expected_statfs.available = actual_statfs.available;
5879
5880 // walk PREFIX_OBJ
5881 dout(1) << __func__ << " walking object keyspace" << dendl;
5882 it = db->get_iterator(PREFIX_OBJ);
5883 if (it) {
5884 CollectionRef c;
5885 spg_t pgid;
5886 mempool::bluestore_fsck::list<string> expecting_shards;
5887 for (it->lower_bound(string()); it->valid(); it->next()) {
5888 if (g_conf->bluestore_debug_fsck_abort) {
5889 goto out_scan;
5890 }
5891 dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
5892 if (is_extent_shard_key(it->key())) {
5893 while (!expecting_shards.empty() &&
5894 expecting_shards.front() < it->key()) {
5895 derr << "fsck error: missing shard key "
5896 << pretty_binary_string(expecting_shards.front())
5897 << dendl;
5898 ++errors;
5899 expecting_shards.pop_front();
5900 }
5901 if (!expecting_shards.empty() &&
5902 expecting_shards.front() == it->key()) {
5903 // all good
5904 expecting_shards.pop_front();
5905 continue;
5906 }
5907
5908 uint32_t offset;
5909 string okey;
5910 get_key_extent_shard(it->key(), &okey, &offset);
5911 derr << "fsck error: stray shard 0x" << std::hex << offset
5912 << std::dec << dendl;
5913 if (expecting_shards.empty()) {
5914 derr << "fsck error: " << pretty_binary_string(it->key())
5915 << " is unexpected" << dendl;
5916 ++errors;
5917 continue;
5918 }
5919 while (expecting_shards.front() > it->key()) {
5920 derr << "fsck error: saw " << pretty_binary_string(it->key())
5921 << dendl;
5922 derr << "fsck error: exp "
5923 << pretty_binary_string(expecting_shards.front()) << dendl;
5924 ++errors;
5925 expecting_shards.pop_front();
5926 if (expecting_shards.empty()) {
5927 break;
5928 }
5929 }
5930 continue;
5931 }
5932
5933 ghobject_t oid;
5934 int r = get_key_object(it->key(), &oid);
5935 if (r < 0) {
5936 derr << "fsck error: bad object key "
5937 << pretty_binary_string(it->key()) << dendl;
5938 ++errors;
5939 continue;
5940 }
5941 if (!c ||
5942 oid.shard_id != pgid.shard ||
5943 oid.hobj.pool != (int64_t)pgid.pool() ||
5944 !c->contains(oid)) {
5945 c = nullptr;
5946 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
5947 coll_map.begin();
5948 p != coll_map.end();
5949 ++p) {
5950 if (p->second->contains(oid)) {
5951 c = p->second;
5952 break;
5953 }
5954 }
5955 if (!c) {
5956 derr << "fsck error: stray object " << oid
5957 << " not owned by any collection" << dendl;
5958 ++errors;
5959 continue;
5960 }
5961 c->cid.is_pg(&pgid);
5962 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
5963 << dendl;
5964 }
5965
5966 if (!expecting_shards.empty()) {
5967 for (auto &k : expecting_shards) {
5968 derr << "fsck error: missing shard key "
5969 << pretty_binary_string(k) << dendl;
5970 }
5971 ++errors;
5972 expecting_shards.clear();
5973 }
5974
5975 dout(10) << __func__ << " " << oid << dendl;
5976 RWLock::RLocker l(c->lock);
5977 OnodeRef o = c->get_onode(oid, false);
5978 if (o->onode.nid) {
5979 if (o->onode.nid > nid_max) {
5980 derr << "fsck error: " << oid << " nid " << o->onode.nid
5981 << " > nid_max " << nid_max << dendl;
5982 ++errors;
5983 }
5984 if (used_nids.count(o->onode.nid)) {
5985 derr << "fsck error: " << oid << " nid " << o->onode.nid
5986 << " already in use" << dendl;
5987 ++errors;
5988 continue; // go for next object
5989 }
5990 used_nids.insert(o->onode.nid);
5991 }
5992 ++num_objects;
5993 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
5994 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
5995 _dump_onode(o, 30);
5996 // shards
5997 if (!o->extent_map.shards.empty()) {
5998 ++num_sharded_objects;
5999 num_object_shards += o->extent_map.shards.size();
6000 }
6001 for (auto& s : o->extent_map.shards) {
6002 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
6003 expecting_shards.push_back(string());
6004 get_extent_shard_key(o->key, s.shard_info->offset,
6005 &expecting_shards.back());
6006 if (s.shard_info->offset >= o->onode.size) {
6007 derr << "fsck error: " << oid << " shard 0x" << std::hex
6008 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
6009 << std::dec << dendl;
6010 ++errors;
6011 }
6012 }
6013 // lextents
6014 map<BlobRef,bluestore_blob_t::unused_t> referenced;
6015 uint64_t pos = 0;
6016 mempool::bluestore_fsck::map<BlobRef,
6017 bluestore_blob_use_tracker_t> ref_map;
6018 for (auto& l : o->extent_map.extent_map) {
6019 dout(20) << __func__ << " " << l << dendl;
6020 if (l.logical_offset < pos) {
6021 derr << "fsck error: " << oid << " lextent at 0x"
6022 << std::hex << l.logical_offset
6023 << " overlaps with the previous, which ends at 0x" << pos
6024 << std::dec << dendl;
6025 ++errors;
6026 }
6027 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
6028 derr << "fsck error: " << oid << " lextent at 0x"
6029 << std::hex << l.logical_offset << "~" << l.length
6030 << " spans a shard boundary"
6031 << std::dec << dendl;
6032 ++errors;
6033 }
6034 pos = l.logical_offset + l.length;
6035 expected_statfs.stored += l.length;
6036 assert(l.blob);
6037 const bluestore_blob_t& blob = l.blob->get_blob();
6038
6039 auto& ref = ref_map[l.blob];
6040 if (ref.is_empty()) {
6041 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
6042 uint32_t l = blob.get_logical_length();
6043 ref.init(l, min_release_size);
6044 }
6045 ref.get(
6046 l.blob_offset,
6047 l.length);
6048 ++num_extents;
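// For blobs that carry an 'unused' bitmap, remember which chunks this
// lextent actually references; the union is compared against the blob's
// unused bits (and its csums) once all lextents of the object have been walked.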
6049 if (blob.has_unused()) {
6050 auto p = referenced.find(l.blob);
6051 bluestore_blob_t::unused_t *pu;
6052 if (p == referenced.end()) {
6053 pu = &referenced[l.blob];
6054 } else {
6055 pu = &p->second;
6056 }
6057 uint64_t blob_len = blob.get_logical_length();
6058 assert((blob_len % (sizeof(*pu)*8)) == 0);
6059 assert(l.blob_offset + l.length <= blob_len);
6060 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
6061 uint64_t start = l.blob_offset / chunk_size;
6062 uint64_t end =
6063 ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
6064 for (auto i = start; i < end; ++i) {
6065 (*pu) |= (1u << i);
6066 }
6067 }
6068 }
6069 for (auto &i : referenced) {
6070 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
6071 << std::dec << " for " << *i.first << dendl;
6072 const bluestore_blob_t& blob = i.first->get_blob();
6073 if (i.second & blob.unused) {
6074 derr << "fsck error: " << oid << " blob claims unused 0x"
6075 << std::hex << blob.unused
6076 << " but extents reference 0x" << i.second
6077 << " on blob " << *i.first << dendl;
6078 ++errors;
6079 }
6080 if (blob.has_csum()) {
6081 uint64_t blob_len = blob.get_logical_length();
6082 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
6083 unsigned csum_count = blob.get_csum_count();
6084 unsigned csum_chunk_size = blob.get_csum_chunk_size();
6085 for (unsigned p = 0; p < csum_count; ++p) {
6086 unsigned pos = p * csum_chunk_size;
6087 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
6088 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
6089 unsigned mask = 1u << firstbit;
6090 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
6091 mask |= 1u << b;
6092 }
6093 if ((blob.unused & mask) == mask) {
6094 // this csum chunk region is marked unused
6095 if (blob.get_csum_item(p) != 0) {
6096 derr << "fsck error: " << oid
6097 << " blob claims csum chunk 0x" << std::hex << pos
6098 << "~" << csum_chunk_size
6099 << " is unused (mask 0x" << mask << " of unused 0x"
6100 << blob.unused << ") but csum is non-zero 0x"
6101 << blob.get_csum_item(p) << std::dec << " on blob "
6102 << *i.first << dendl;
6103 ++errors;
6104 }
6105 }
6106 }
6107 }
6108 }
6109 for (auto &i : ref_map) {
6110 ++num_blobs;
6111 const bluestore_blob_t& blob = i.first->get_blob();
6112 bool equal = i.first->get_blob_use_tracker().equal(i.second);
6113 if (!equal) {
6114 derr << "fsck error: " << oid << " blob " << *i.first
6115 << " doesn't match expected ref_map " << i.second << dendl;
6116 ++errors;
6117 }
6118 if (blob.is_compressed()) {
6119 expected_statfs.compressed += blob.get_compressed_payload_length();
6120 expected_statfs.compressed_original +=
6121 i.first->get_referenced_bytes();
6122 }
6123 if (blob.is_shared()) {
6124 if (i.first->shared_blob->get_sbid() > blobid_max) {
6125 derr << "fsck error: " << oid << " blob " << blob
6126 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
6127 << blobid_max << dendl;
6128 ++errors;
6129 } else if (i.first->shared_blob->get_sbid() == 0) {
6130 derr << "fsck error: " << oid << " blob " << blob
6131 << " marked as shared but has uninitialized sbid"
6132 << dendl;
6133 ++errors;
6134 }
6135 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
6136 sbi.sb = i.first->shared_blob;
6137 sbi.oids.push_back(oid);
6138 sbi.compressed = blob.is_compressed();
6139 for (auto e : blob.get_extents()) {
6140 if (e.is_valid()) {
6141 sbi.ref_map.get(e.offset, e.length);
6142 }
6143 }
6144 } else {
6145 errors += _fsck_check_extents(oid, blob.get_extents(),
6146 blob.is_compressed(),
6147 used_blocks,
6148 fm->get_alloc_size(),
6149 expected_statfs);
6150 }
6151 }
6152 if (deep) {
6153 bufferlist bl;
6154 int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
6155 if (r < 0) {
6156 ++errors;
6157 derr << "fsck error: " << oid << " error during read: "
6158 << cpp_strerror(r) << dendl;
6159 }
6160 }
6161 // omap
6162 if (o->onode.has_omap()) {
6163 if (used_omap_head.count(o->onode.nid)) {
6164 derr << "fsck error: " << oid << " omap_head " << o->onode.nid
6165 << " already in use" << dendl;
6166 ++errors;
6167 } else {
6168 used_omap_head.insert(o->onode.nid);
6169 }
6170 }
6171 }
6172 }
6173 dout(1) << __func__ << " checking shared_blobs" << dendl;
6174 it = db->get_iterator(PREFIX_SHARED_BLOB);
6175 if (it) {
6176 for (it->lower_bound(string()); it->valid(); it->next()) {
6177 string key = it->key();
6178 uint64_t sbid;
6179 if (get_key_shared_blob(key, &sbid)) {
6180 derr << "fsck error: bad key '" << key
6181 << "' in shared blob namespace" << dendl;
6182 ++errors;
6183 continue;
6184 }
6185 auto p = sb_info.find(sbid);
6186 if (p == sb_info.end()) {
6187 derr << "fsck error: found stray shared blob data for sbid 0x"
6188 << std::hex << sbid << std::dec << dendl;
6189 ++errors;
6190 } else {
6191 ++num_shared_blobs;
6192 sb_info_t& sbi = p->second;
6193 bluestore_shared_blob_t shared_blob(sbid);
6194 bufferlist bl = it->value();
6195 bufferlist::iterator blp = bl.begin();
6196 ::decode(shared_blob, blp);
6197 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
6198 if (shared_blob.ref_map != sbi.ref_map) {
6199 derr << "fsck error: shared blob 0x" << std::hex << sbid
6200 << std::dec << " ref_map " << shared_blob.ref_map
6201 << " != expected " << sbi.ref_map << dendl;
6202 ++errors;
6203 }
6204 PExtentVector extents;
6205 for (auto &r : shared_blob.ref_map.ref_map) {
6206 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
6207 }
6208 errors += _fsck_check_extents(p->second.oids.front(),
6209 extents,
6210 p->second.compressed,
6211 used_blocks,
6212 fm->get_alloc_size(),
6213 expected_statfs);
6214 sb_info.erase(p);
6215 }
6216 }
6217 }
6218 for (auto &p : sb_info) {
6219 derr << "fsck error: shared_blob 0x" << p.first
6220 << " key is missing (" << *p.second.sb << ")" << dendl;
6221 ++errors;
6222 }
6223 if (!(actual_statfs == expected_statfs)) {
6224 derr << "fsck error: actual " << actual_statfs
6225 << " != expected " << expected_statfs << dendl;
6226 ++errors;
6227 }
6228
6229 dout(1) << __func__ << " checking for stray omap data" << dendl;
6230 it = db->get_iterator(PREFIX_OMAP);
6231 if (it) {
6232 for (it->lower_bound(string()); it->valid(); it->next()) {
6233 uint64_t omap_head;
6234 _key_decode_u64(it->key().c_str(), &omap_head);
6235 if (used_omap_head.count(omap_head) == 0) {
6236 derr << "fsck error: found stray omap data on omap_head "
6237 << omap_head << dendl;
6238 ++errors;
6239 }
6240 }
6241 }
6242
6243 dout(1) << __func__ << " checking deferred events" << dendl;
6244 it = db->get_iterator(PREFIX_DEFERRED);
6245 if (it) {
6246 for (it->lower_bound(string()); it->valid(); it->next()) {
6247 bufferlist bl = it->value();
6248 bufferlist::iterator p = bl.begin();
6249 bluestore_deferred_transaction_t wt;
6250 try {
6251 ::decode(wt, p);
6252 } catch (buffer::error& e) {
6253 derr << "fsck error: failed to decode deferred txn "
6254 << pretty_binary_string(it->key()) << dendl;
6255 r = -EIO;
6256 goto out_scan;
6257 }
6258 dout(20) << __func__ << " deferred " << wt.seq
6259 << " ops " << wt.ops.size()
6260 << " released 0x" << std::hex << wt.released << std::dec << dendl;
6261 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
6262 apply(
6263 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
6264 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6265 assert(pos < bs.size());
6266 bs.set(pos);
6267 }
6268 );
6269 }
6270 }
6271 }
6272
6273 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
6274 {
6275 // remove bluefs_extents from used set since the freelist doesn't
6276 // know they are allocated.
6277 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
6278 apply(
6279 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
6280 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6281 assert(pos < bs.size());
6282 bs.reset(pos);
6283 }
6284 );
6285 }
6286 fm->enumerate_reset();
6287 uint64_t offset, length;
6288 while (fm->enumerate_next(&offset, &length)) {
6289 bool intersects = false;
6290 apply(
6291 offset, length, fm->get_alloc_size(), used_blocks,
6292 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6293 assert(pos < bs.size());
6294 if (bs.test(pos)) {
6295 intersects = true;
6296 } else {
6297 bs.set(pos);
6298 }
6299 }
6300 );
6301 if (intersects) {
6302 if (offset == SUPER_RESERVED &&
6303 length == min_alloc_size - SUPER_RESERVED) {
6304 // this is due to the change just after luminous to min_alloc_size
6305 // granularity allocations, and our baked in assumption at the top
6306 // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
6307 // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)). harmless,
6308 // since we will never allocate this region below min_alloc_size.
6309 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
6310 << " and min_alloc_size, 0x" << std::hex << offset << "~"
6311 << length << dendl;
6312 } else {
6313 derr << "fsck error: free extent 0x" << std::hex << offset
6314 << "~" << length << std::dec
6315 << " intersects allocated blocks" << dendl;
6316 ++errors;
6317 }
6318 }
6319 }
6320 fm->enumerate_reset();
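// At this point every allocation unit should be accounted for: either in use
// by some object, bluefs, or a deferred txn, or listed in the freelist. Any
// bit still clear is space that is neither allocated nor free, i.e. leaked;
// invert the bitmap and report each such run as a leaked extent.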
6321 size_t count = used_blocks.count();
6322 if (used_blocks.size() != count) {
6323 assert(used_blocks.size() > count);
6324 ++errors;
6325 used_blocks.flip();
6326 size_t start = used_blocks.find_first();
6327 while (start != decltype(used_blocks)::npos) {
6328 size_t cur = start;
6329 while (true) {
6330 size_t next = used_blocks.find_next(cur);
6331 if (next != cur + 1) {
6332 derr << "fsck error: leaked extent 0x" << std::hex
6333 << ((uint64_t)start * fm->get_alloc_size()) << "~"
6334 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
6335 << dendl;
6336 start = next;
6337 break;
6338 }
6339 cur = next;
6340 }
6341 }
6342 used_blocks.flip();
6343 }
6344 }
6345
6346 out_scan:
6347 mempool_thread.shutdown();
6348 _flush_cache();
6349 out_alloc:
6350 _close_alloc();
6351 out_fm:
6352 _close_fm();
6353 out_db:
6354 it.reset(); // before db is closed
6355 _close_db();
6356 out_bdev:
6357 _close_bdev();
6358 out_fsid:
6359 _close_fsid();
6360 out_path:
6361 _close_path();
6362
6363 // fatal errors take precedence
6364 if (r < 0)
6365 return r;
6366
6367 dout(2) << __func__ << " " << num_objects << " objects, "
6368 << num_sharded_objects << " of them sharded. "
6369 << dendl;
6370 dout(2) << __func__ << " " << num_extents << " extents to "
6371 << num_blobs << " blobs, "
6372 << num_spanning_blobs << " spanning, "
6373 << num_shared_blobs << " shared."
6374 << dendl;
6375
6376 utime_t duration = ceph_clock_now() - start;
6377 dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
6378 << " repaired, " << (errors - repaired) << " remaining in "
6379 << duration << " seconds" << dendl;
6380 return errors - repaired;
6381 }
6382
6383 void BlueStore::collect_metadata(map<string,string> *pm)
6384 {
6385 dout(10) << __func__ << dendl;
6386 bdev->collect_metadata("bluestore_bdev_", pm);
6387 if (bluefs) {
6388 (*pm)["bluefs"] = "1";
6389 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
6390 bluefs->collect_metadata(pm);
6391 } else {
6392 (*pm)["bluefs"] = "0";
6393 }
6394 }
6395
6396 int BlueStore::statfs(struct store_statfs_t *buf)
6397 {
6398 buf->reset();
6399 buf->total = bdev->get_size();
6400 buf->available = alloc->get_free();
6401
6402 if (bluefs) {
6403 // part of our shared device is "free" according to BlueFS, but we
6404 // can't touch bluestore_bluefs_min of it.
6405 int64_t shared_available = std::min(
6406 bluefs->get_free(bluefs_shared_bdev),
6407 bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
6408 if (shared_available > 0) {
6409 buf->available += shared_available;
6410 }
6411 }
6412
6413 {
6414 std::lock_guard<std::mutex> l(vstatfs_lock);
6415
6416 buf->allocated = vstatfs.allocated();
6417 buf->stored = vstatfs.stored();
6418 buf->compressed = vstatfs.compressed();
6419 buf->compressed_original = vstatfs.compressed_original();
6420 buf->compressed_allocated = vstatfs.compressed_allocated();
6421 }
6422
6423 dout(20) << __func__ << *buf << dendl;
6424 return 0;
6425 }
6426
6427 // ---------------
6428 // cache
6429
6430 BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
6431 {
6432 RWLock::RLocker l(coll_lock);
6433 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
6434 if (cp == coll_map.end())
6435 return CollectionRef();
6436 return cp->second;
6437 }
6438
6439 void BlueStore::_queue_reap_collection(CollectionRef& c)
6440 {
6441 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6442 // _reap_collections and this run in the same thread,
6443 // so no lock is needed.
6444 removed_collections.push_back(c);
6445 }
6446
6447 void BlueStore::_reap_collections()
6448 {
6449
6450 list<CollectionRef> removed_colls;
6451 {
6452 // _queue_reap_collection and this run in the same thread,
6453 // so no lock is needed.
6454 if (!removed_collections.empty())
6455 removed_colls.swap(removed_collections);
6456 else
6457 return;
6458 }
6459
6460 list<CollectionRef>::iterator p = removed_colls.begin();
6461 while (p != removed_colls.end()) {
6462 CollectionRef c = *p;
6463 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6464 if (c->onode_map.map_any([&](OnodeRef o) {
6465 assert(!o->exists);
6466 if (o->flushing_count.load()) {
6467 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
6468 << " flush_txns " << o->flushing_count << dendl;
6469 return true;
6470 }
6471 return false;
6472 })) {
6473 ++p;
6474 continue;
6475 }
6476 c->onode_map.clear();
6477 p = removed_colls.erase(p);
6478 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
6479 }
6480 if (removed_colls.empty()) {
6481 dout(10) << __func__ << " all reaped" << dendl;
6482 } else {
6483 removed_collections.splice(removed_collections.begin(), removed_colls);
6484 }
6485 }
6486
6487 void BlueStore::_update_cache_logger()
6488 {
6489 uint64_t num_onodes = 0;
6490 uint64_t num_extents = 0;
6491 uint64_t num_blobs = 0;
6492 uint64_t num_buffers = 0;
6493 uint64_t num_buffer_bytes = 0;
6494 for (auto c : cache_shards) {
6495 c->add_stats(&num_onodes, &num_extents, &num_blobs,
6496 &num_buffers, &num_buffer_bytes);
6497 }
6498 logger->set(l_bluestore_onodes, num_onodes);
6499 logger->set(l_bluestore_extents, num_extents);
6500 logger->set(l_bluestore_blobs, num_blobs);
6501 logger->set(l_bluestore_buffers, num_buffers);
6502 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
6503 }
6504
6505 // ---------------
6506 // read operations
6507
6508 ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
6509 {
6510 return _get_collection(cid);
6511 }
6512
6513 bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
6514 {
6515 CollectionHandle c = _get_collection(cid);
6516 if (!c)
6517 return false;
6518 return exists(c, oid);
6519 }
6520
6521 bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
6522 {
6523 Collection *c = static_cast<Collection *>(c_.get());
6524 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
6525 if (!c->exists)
6526 return false;
6527
6528 bool r = true;
6529
6530 {
6531 RWLock::RLocker l(c->lock);
6532 OnodeRef o = c->get_onode(oid, false);
6533 if (!o || !o->exists)
6534 r = false;
6535 }
6536
6537 return r;
6538 }
6539
6540 int BlueStore::stat(
6541 const coll_t& cid,
6542 const ghobject_t& oid,
6543 struct stat *st,
6544 bool allow_eio)
6545 {
6546 CollectionHandle c = _get_collection(cid);
6547 if (!c)
6548 return -ENOENT;
6549 return stat(c, oid, st, allow_eio);
6550 }
6551
6552 int BlueStore::stat(
6553 CollectionHandle &c_,
6554 const ghobject_t& oid,
6555 struct stat *st,
6556 bool allow_eio)
6557 {
6558 Collection *c = static_cast<Collection *>(c_.get());
6559 if (!c->exists)
6560 return -ENOENT;
6561 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
6562
6563 {
6564 RWLock::RLocker l(c->lock);
6565 OnodeRef o = c->get_onode(oid, false);
6566 if (!o || !o->exists)
6567 return -ENOENT;
6568 st->st_size = o->onode.size;
6569 st->st_blksize = 4096;
6570 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
6571 st->st_nlink = 1;
6572 }
6573
6574 int r = 0;
6575 if (_debug_mdata_eio(oid)) {
6576 r = -EIO;
6577 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6578 }
6579 return r;
6580 }
6581 int BlueStore::set_collection_opts(
6582 const coll_t& cid,
6583 const pool_opts_t& opts)
6584 {
6585 CollectionHandle ch = _get_collection(cid);
6586 if (!ch)
6587 return -ENOENT;
6588 Collection *c = static_cast<Collection *>(ch.get());
6589 dout(15) << __func__ << " " << cid << " options " << opts << dendl;
6590 if (!c->exists)
6591 return -ENOENT;
6592 RWLock::WLocker l(c->lock);
6593 c->pool_opts = opts;
6594 return 0;
6595 }
6596
6597 int BlueStore::read(
6598 const coll_t& cid,
6599 const ghobject_t& oid,
6600 uint64_t offset,
6601 size_t length,
6602 bufferlist& bl,
6603 uint32_t op_flags)
6604 {
6605 CollectionHandle c = _get_collection(cid);
6606 if (!c)
6607 return -ENOENT;
6608 return read(c, oid, offset, length, bl, op_flags);
6609 }
6610
6611 int BlueStore::read(
6612 CollectionHandle &c_,
6613 const ghobject_t& oid,
6614 uint64_t offset,
6615 size_t length,
6616 bufferlist& bl,
6617 uint32_t op_flags)
6618 {
6619 utime_t start = ceph_clock_now();
6620 Collection *c = static_cast<Collection *>(c_.get());
6621 const coll_t &cid = c->get_cid();
6622 dout(15) << __func__ << " " << cid << " " << oid
6623 << " 0x" << std::hex << offset << "~" << length << std::dec
6624 << dendl;
6625 if (!c->exists)
6626 return -ENOENT;
6627
6628 bl.clear();
6629 int r;
6630 {
6631 RWLock::RLocker l(c->lock);
6632 utime_t start1 = ceph_clock_now();
6633 OnodeRef o = c->get_onode(oid, false);
6634 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
6635 if (!o || !o->exists) {
6636 r = -ENOENT;
6637 goto out;
6638 }
6639
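// a zero offset together with a zero length means "read the whole object"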
6640 if (offset == length && offset == 0)
6641 length = o->onode.size;
6642
6643 r = _do_read(c, o, offset, length, bl, op_flags);
6644 if (r == -EIO) {
6645 logger->inc(l_bluestore_read_eio);
6646 }
6647 }
6648
6649 out:
6650 if (r >= 0 && _debug_data_eio(oid)) {
6651 r = -EIO;
6652 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6653 } else if (cct->_conf->bluestore_debug_random_read_err &&
6654 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) {
6655 dout(0) << __func__ << ": inject random EIO" << dendl;
6656 r = -EIO;
6657 }
6658 dout(10) << __func__ << " " << cid << " " << oid
6659 << " 0x" << std::hex << offset << "~" << length << std::dec
6660 << " = " << r << dendl;
6661 logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
6662 return r;
6663 }
6664
6665 // --------------------------------------------------------
6666 // intermediate data structures used while reading
6667 struct region_t {
6668 uint64_t logical_offset;
6669 uint64_t blob_xoffset; // region offset within the blob
6670 uint64_t length;
6671 bufferlist bl;
6672
6673 // used later in read process
6674 uint64_t front = 0;
6675 uint64_t r_off = 0;
6676
6677 region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
6678 : logical_offset(offset),
6679 blob_xoffset(b_offs),
6680 length(len){}
6681 region_t(const region_t& from)
6682 : logical_offset(from.logical_offset),
6683 blob_xoffset(from.blob_xoffset),
6684 length(from.length){}
6685
6686 friend ostream& operator<<(ostream& out, const region_t& r) {
6687 return out << "0x" << std::hex << r.logical_offset << ":"
6688 << r.blob_xoffset << "~" << r.length << std::dec;
6689 }
6690 };
6691
6692 typedef list<region_t> regions2read_t;
6693 typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
6694
6695 int BlueStore::_do_read(
6696 Collection *c,
6697 OnodeRef o,
6698 uint64_t offset,
6699 size_t length,
6700 bufferlist& bl,
6701 uint32_t op_flags)
6702 {
6703 FUNCTRACE();
6704 int r = 0;
6705 int read_cache_policy = 0; // do not bypass clean or dirty cache
6706
6707 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6708 << " size 0x" << o->onode.size << " (" << std::dec
6709 << o->onode.size << ")" << dendl;
6710 bl.clear();
6711
6712 if (offset >= o->onode.size) {
6713 return r;
6714 }
6715
6716 // generally, don't buffer anything, unless the client explicitly requests
6717 // it.
6718 bool buffered = false;
6719 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
6720 dout(20) << __func__ << " will do buffered read" << dendl;
6721 buffered = true;
6722 } else if (cct->_conf->bluestore_default_buffered_read &&
6723 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
6724 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
6725 dout(20) << __func__ << " defaulting to buffered read" << dendl;
6726 buffered = true;
6727 }
6728
6729 if (offset + length > o->onode.size) {
6730 length = o->onode.size - offset;
6731 }
6732
6733 utime_t start = ceph_clock_now();
6734 o->extent_map.fault_range(db, offset, length);
6735 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
6736 _dump_onode(o);
6737
6738 ready_regions_t ready_regions;
6739
6740 // for deep-scrub, we only read dirty cache and bypass clean cache in
6741 // order to read the underlying block device in case there are silent disk errors.
6742 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
6743 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
6744 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
6745 }
6746
6747 // build blob-wise list of stuff to read (that isn't cached)
6748 blobs2read_t blobs2read;
6749 unsigned left = length;
6750 uint64_t pos = offset;
6751 unsigned num_regions = 0;
6752 auto lp = o->extent_map.seek_lextent(offset);
6753 while (left > 0 && lp != o->extent_map.extent_map.end()) {
6754 if (pos < lp->logical_offset) {
6755 unsigned hole = lp->logical_offset - pos;
6756 if (hole >= left) {
6757 break;
6758 }
6759 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
6760 << std::dec << dendl;
6761 pos += hole;
6762 left -= hole;
6763 }
6764 BlobRef& bptr = lp->blob;
6765 unsigned l_off = pos - lp->logical_offset;
6766 unsigned b_off = l_off + lp->blob_offset;
6767 unsigned b_len = std::min(left, lp->length - l_off);
6768
6769 ready_regions_t cache_res;
6770 interval_set<uint32_t> cache_interval;
6771 bptr->shared_blob->bc.read(
6772 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
6773 read_cache_policy);
6774 dout(20) << __func__ << " blob " << *bptr << std::hex
6775 << " need 0x" << b_off << "~" << b_len
6776 << " cache has 0x" << cache_interval
6777 << std::dec << dendl;
6778
6779 auto pc = cache_res.begin();
6780 while (b_len > 0) {
6781 unsigned l;
6782 if (pc != cache_res.end() &&
6783 pc->first == b_off) {
6784 l = pc->second.length();
6785 ready_regions[pos].claim(pc->second);
6786 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
6787 << b_off << "~" << l << std::dec << dendl;
6788 ++pc;
6789 } else {
6790 l = b_len;
6791 if (pc != cache_res.end()) {
6792 assert(pc->first > b_off);
6793 l = pc->first - b_off;
6794 }
6795 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
6796 << b_off << "~" << l << std::dec << dendl;
6797 blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
6798 ++num_regions;
6799 }
6800 pos += l;
6801 b_off += l;
6802 left -= l;
6803 b_len -= l;
6804 }
6805 ++lp;
6806 }
6807
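// ready_regions now holds everything satisfied from the buffer cache;
// blobs2read holds, per blob, the regions that still have to come from disk.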
6808 // read raw blob data. use aio if we have >1 blobs to read.
6809 start = ceph_clock_now(); // for simplicity, time the whole
6810 // read-and-wait block below; the
6811 // measurement error is negligible.
6812 vector<bufferlist> compressed_blob_bls;
6813 IOContext ioc(cct, NULL, true); // allow EIO
6814 for (auto& p : blobs2read) {
6815 const BlobRef& bptr = p.first;
6816 dout(20) << __func__ << " blob " << *bptr << std::hex
6817 << " need " << p.second << std::dec << dendl;
6818 if (bptr->get_blob().is_compressed()) {
6819 // read the whole thing
6820 if (compressed_blob_bls.empty()) {
6821 // ensure we avoid any reallocation on subsequent blobs
6822 compressed_blob_bls.reserve(blobs2read.size());
6823 }
6824 compressed_blob_bls.push_back(bufferlist());
6825 bufferlist& bl = compressed_blob_bls.back();
6826 r = bptr->get_blob().map(
6827 0, bptr->get_blob().get_ondisk_length(),
6828 [&](uint64_t offset, uint64_t length) {
6829 int r;
6830 // use aio if there are more regions to read than those in this blob
6831 if (num_regions > p.second.size()) {
6832 r = bdev->aio_read(offset, length, &bl, &ioc);
6833 } else {
6834 r = bdev->read(offset, length, &bl, &ioc, false);
6835 }
6836 if (r < 0)
6837 return r;
6838 return 0;
6839 });
6840 if (r < 0) {
6841 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
6842 if (r == -EIO) {
6843 // propagate EIO to caller
6844 return r;
6845 }
6846 assert(r == 0);
6847 }
6848 } else {
6849 // read the pieces
6850 for (auto& reg : p.second) {
6851 // determine how much of the blob to read
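// reads must be aligned to the blob's chunk size, so round r_off down and
// r_len up; the extra leading bytes are remembered in reg.front and trimmed
// off again when the result is assembled below.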
6852 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
6853 reg.r_off = reg.blob_xoffset;
6854 uint64_t r_len = reg.length;
6855 reg.front = reg.r_off % chunk_size;
6856 if (reg.front) {
6857 reg.r_off -= reg.front;
6858 r_len += reg.front;
6859 }
6860 unsigned tail = r_len % chunk_size;
6861 if (tail) {
6862 r_len += chunk_size - tail;
6863 }
6864 dout(20) << __func__ << " region 0x" << std::hex
6865 << reg.logical_offset
6866 << ": 0x" << reg.blob_xoffset << "~" << reg.length
6867 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
6868 << dendl;
6869
6870 // read it
6871 r = bptr->get_blob().map(
6872 reg.r_off, r_len,
6873 [&](uint64_t offset, uint64_t length) {
6874 int r;
6875 // use aio if there is more than one region to read
6876 if (num_regions > 1) {
6877 r = bdev->aio_read(offset, length, &reg.bl, &ioc);
6878 } else {
6879 r = bdev->read(offset, length, &reg.bl, &ioc, false);
6880 }
6881 if (r < 0)
6882 return r;
6883 return 0;
6884 });
6885 if (r < 0) {
6886 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
6887 << dendl;
6888 if (r == -EIO) {
6889 // propagate EIO to caller
6890 return r;
6891 }
6892 assert(r == 0);
6893 }
6894 assert(reg.bl.length() == r_len);
6895 }
6896 }
6897 }
6898 if (ioc.has_pending_aios()) {
6899 bdev->aio_submit(&ioc);
6900 dout(20) << __func__ << " waiting for aio" << dendl;
6901 ioc.aio_wait();
6902 r = ioc.get_return_value();
6903 if (r < 0) {
6904 assert(r == -EIO); // no other errors allowed
6905 return -EIO;
6906 }
6907 }
6908 logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
6909
6910 // enumerate and decompress desired blobs
6911 auto p = compressed_blob_bls.begin();
6912 blobs2read_t::iterator b2r_it = blobs2read.begin();
6913 while (b2r_it != blobs2read.end()) {
6914 const BlobRef& bptr = b2r_it->first;
6915 dout(20) << __func__ << " blob " << *bptr << std::hex
6916 << " need 0x" << b2r_it->second << std::dec << dendl;
6917 if (bptr->get_blob().is_compressed()) {
6918 assert(p != compressed_blob_bls.end());
6919 bufferlist& compressed_bl = *p++;
6920 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
6921 b2r_it->second.front().logical_offset) < 0) {
6922 return -EIO;
6923 }
6924 bufferlist raw_bl;
6925 r = _decompress(compressed_bl, &raw_bl);
6926 if (r < 0)
6927 return r;
6928 if (buffered) {
6929 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
6930 raw_bl);
6931 }
6932 for (auto& i : b2r_it->second) {
6933 ready_regions[i.logical_offset].substr_of(
6934 raw_bl, i.blob_xoffset, i.length);
6935 }
6936 } else {
6937 for (auto& reg : b2r_it->second) {
6938 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
6939 reg.logical_offset) < 0) {
6940 return -EIO;
6941 }
6942 if (buffered) {
6943 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
6944 reg.r_off, reg.bl);
6945 }
6946
6947 // prune and keep result
6948 ready_regions[reg.logical_offset].substr_of(
6949 reg.bl, reg.front, reg.length);
6950 }
6951 }
6952 ++b2r_it;
6953 }
6954
6955 // generate a resulting buffer
6956 auto pr = ready_regions.begin();
6957 auto pr_end = ready_regions.end();
6958 pos = 0;
6959 while (pos < length) {
6960 if (pr != pr_end && pr->first == pos + offset) {
6961 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6962 << ": data from 0x" << pr->first << "~" << pr->second.length()
6963 << std::dec << dendl;
6964 pos += pr->second.length();
6965 bl.claim_append(pr->second);
6966 ++pr;
6967 } else {
6968 uint64_t l = length - pos;
6969 if (pr != pr_end) {
6970 assert(pr->first > pos + offset);
6971 l = pr->first - (pos + offset);
6972 }
6973 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6974 << ": zeros for 0x" << (pos + offset) << "~" << l
6975 << std::dec << dendl;
6976 bl.append_zero(l);
6977 pos += l;
6978 }
6979 }
6980 assert(bl.length() == length);
6981 assert(pos == length);
6982 assert(pr == pr_end);
6983 r = bl.length();
6984 return r;
6985 }
6986
6987 int BlueStore::_verify_csum(OnodeRef& o,
6988 const bluestore_blob_t* blob, uint64_t blob_xoffset,
6989 const bufferlist& bl,
6990 uint64_t logical_offset) const
6991 {
6992 int bad;
6993 uint64_t bad_csum;
6994 utime_t start = ceph_clock_now();
6995 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
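// as handled below, -1 from verify_csum() indicates a checksum mismatch
// (with *bad set to the offending blob offset); other negative values are
// internal failures and are simply reported.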
6996 if (r < 0) {
6997 if (r == -1) {
6998 PExtentVector pex;
6999 blob->map(
7000 bad,
7001 blob->get_csum_chunk_size(),
7002 [&](uint64_t offset, uint64_t length) {
7003 pex.emplace_back(bluestore_pextent_t(offset, length));
7004 return 0;
7005 });
7006 derr << __func__ << " bad "
7007 << Checksummer::get_csum_type_string(blob->csum_type)
7008 << "/0x" << std::hex << blob->get_csum_chunk_size()
7009 << " checksum at blob offset 0x" << bad
7010 << ", got 0x" << bad_csum << ", expected 0x"
7011 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
7012 << ", device location " << pex
7013 << ", logical extent 0x" << std::hex
7014 << (logical_offset + bad - blob_xoffset) << "~"
7015 << blob->get_csum_chunk_size() << std::dec
7016 << ", object " << o->oid
7017 << dendl;
7018 } else {
7019 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
7020 }
7021 }
7022 logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
7023 return r;
7024 }
7025
7026 int BlueStore::_decompress(bufferlist& source, bufferlist* result)
7027 {
7028 int r = 0;
7029 utime_t start = ceph_clock_now();
7030 bufferlist::iterator i = source.begin();
7031 bluestore_compression_header_t chdr;
7032 ::decode(chdr, i);
7033 int alg = int(chdr.type);
7034 CompressorRef cp = compressor;
7035 if (!cp || (int)cp->get_type() != alg) {
7036 cp = Compressor::create(cct, alg);
7037 }
7038
7039 if (!cp.get()) {
7040 // if the compressor isn't available we have to fail: there is no way
7041 // to return the decompressed data without it.
7042 derr << __func__ << " can't load decompressor " << alg << dendl;
7043 r = -EIO;
7044 } else {
7045 r = cp->decompress(i, chdr.length, *result);
7046 if (r < 0) {
7047 derr << __func__ << " decompression failed with error code " << r << dendl;
7048 r = -EIO;
7049 }
7050 }
7051 logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
7052 return r;
7053 }
7054
7055 // this variant stores the fiemap result in an interval_set; the other
7056 // fiemap() overloads use it internally
7057 int BlueStore::_fiemap(
7058 CollectionHandle &c_,
7059 const ghobject_t& oid,
7060 uint64_t offset,
7061 size_t length,
7062 interval_set<uint64_t>& destset)
7063 {
7064 Collection *c = static_cast<Collection *>(c_.get());
7065 if (!c->exists)
7066 return -ENOENT;
7067 {
7068 RWLock::RLocker l(c->lock);
7069
7070 OnodeRef o = c->get_onode(oid, false);
7071 if (!o || !o->exists) {
7072 return -ENOENT;
7073 }
7074 _dump_onode(o);
7075
7076 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
7077 << " size 0x" << o->onode.size << std::dec << dendl;
7078
7079 boost::intrusive::set<Extent>::iterator ep, eend;
7080 if (offset >= o->onode.size)
7081 goto out;
7082
7083 if (offset + length > o->onode.size) {
7084 length = o->onode.size - offset;
7085 }
7086
7087 o->extent_map.fault_range(db, offset, length);
7088 eend = o->extent_map.extent_map.end();
7089 ep = o->extent_map.seek_lextent(offset);
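// walk the logical extent map: ranges backed by an lextent are added to
// destset, while holes are simply skipped until the next extent (or the
// end of the requested range).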
7090 while (length > 0) {
7091 dout(20) << __func__ << " offset " << offset << dendl;
7092 if (ep != eend && ep->logical_offset + ep->length <= offset) {
7093 ++ep;
7094 continue;
7095 }
7096
7097 uint64_t x_len = length;
7098 if (ep != eend && ep->logical_offset <= offset) {
7099 uint64_t x_off = offset - ep->logical_offset;
7100 x_len = MIN(x_len, ep->length - x_off);
7101 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
7102 << x_len << std::dec << " blob " << ep->blob << dendl;
7103 destset.insert(offset, x_len);
7104 length -= x_len;
7105 offset += x_len;
7106 if (x_off + x_len == ep->length)
7107 ++ep;
7108 continue;
7109 }
7110 if (ep != eend &&
7111 ep->logical_offset > offset &&
7112 ep->logical_offset - offset < x_len) {
7113 x_len = ep->logical_offset - offset;
7114 }
7115 offset += x_len;
7116 length -= x_len;
7117 }
7118 }
7119
7120 out:
7121 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
7122 << " size = 0x(" << destset << ")" << std::dec << dendl;
7123 return 0;
7124 }
7125
7126 int BlueStore::fiemap(
7127 const coll_t& cid,
7128 const ghobject_t& oid,
7129 uint64_t offset,
7130 size_t len,
7131 bufferlist& bl)
7132 {
7133 CollectionHandle c = _get_collection(cid);
7134 if (!c)
7135 return -ENOENT;
7136 return fiemap(c, oid, offset, len, bl);
7137 }
7138
7139 int BlueStore::fiemap(
7140 CollectionHandle &c_,
7141 const ghobject_t& oid,
7142 uint64_t offset,
7143 size_t length,
7144 bufferlist& bl)
7145 {
7146 interval_set<uint64_t> m;
7147 int r = _fiemap(c_, oid, offset, length, m);
7148 if (r >= 0) {
7149 ::encode(m, bl);
7150 }
7151 return r;
7152 }
7153
7154 int BlueStore::fiemap(
7155 const coll_t& cid,
7156 const ghobject_t& oid,
7157 uint64_t offset,
7158 size_t len,
7159 map<uint64_t, uint64_t>& destmap)
7160 {
7161 CollectionHandle c = _get_collection(cid);
7162 if (!c)
7163 return -ENOENT;
7164 return fiemap(c, oid, offset, len, destmap);
7165 }
7166
7167 int BlueStore::fiemap(
7168 CollectionHandle &c_,
7169 const ghobject_t& oid,
7170 uint64_t offset,
7171 size_t length,
7172 map<uint64_t, uint64_t>& destmap)
7173 {
7174 interval_set<uint64_t> m;
7175 int r = _fiemap(c_, oid, offset, length, m);
7176 if (r >= 0) {
7177 m.move_into(destmap);
7178 }
7179 return r;
7180 }
7181
7182 int BlueStore::getattr(
7183 const coll_t& cid,
7184 const ghobject_t& oid,
7185 const char *name,
7186 bufferptr& value)
7187 {
7188 CollectionHandle c = _get_collection(cid);
7189 if (!c)
7190 return -ENOENT;
7191 return getattr(c, oid, name, value);
7192 }
7193
7194 int BlueStore::getattr(
7195 CollectionHandle &c_,
7196 const ghobject_t& oid,
7197 const char *name,
7198 bufferptr& value)
7199 {
7200 Collection *c = static_cast<Collection *>(c_.get());
7201 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
7202 if (!c->exists)
7203 return -ENOENT;
7204
7205 int r;
7206 {
7207 RWLock::RLocker l(c->lock);
7208 mempool::bluestore_cache_other::string k(name);
7209
7210 OnodeRef o = c->get_onode(oid, false);
7211 if (!o || !o->exists) {
7212 r = -ENOENT;
7213 goto out;
7214 }
7215
7216 if (!o->onode.attrs.count(k)) {
7217 r = -ENODATA;
7218 goto out;
7219 }
7220 value = o->onode.attrs[k];
7221 r = 0;
7222 }
7223 out:
7224 if (r == 0 && _debug_mdata_eio(oid)) {
7225 r = -EIO;
7226 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7227 }
7228 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
7229 << " = " << r << dendl;
7230 return r;
7231 }
7232
7233
7234 int BlueStore::getattrs(
7235 const coll_t& cid,
7236 const ghobject_t& oid,
7237 map<string,bufferptr>& aset)
7238 {
7239 CollectionHandle c = _get_collection(cid);
7240 if (!c)
7241 return -ENOENT;
7242 return getattrs(c, oid, aset);
7243 }
7244
7245 int BlueStore::getattrs(
7246 CollectionHandle &c_,
7247 const ghobject_t& oid,
7248 map<string,bufferptr>& aset)
7249 {
7250 Collection *c = static_cast<Collection *>(c_.get());
7251 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
7252 if (!c->exists)
7253 return -ENOENT;
7254
7255 int r;
7256 {
7257 RWLock::RLocker l(c->lock);
7258
7259 OnodeRef o = c->get_onode(oid, false);
7260 if (!o || !o->exists) {
7261 r = -ENOENT;
7262 goto out;
7263 }
7264 for (auto& i : o->onode.attrs) {
7265 aset.emplace(i.first.c_str(), i.second);
7266 }
7267 r = 0;
7268 }
7269
7270 out:
7271 if (r == 0 && _debug_mdata_eio(oid)) {
7272 r = -EIO;
7273 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7274 }
7275 dout(10) << __func__ << " " << c->cid << " " << oid
7276 << " = " << r << dendl;
7277 return r;
7278 }
7279
7280 int BlueStore::list_collections(vector<coll_t>& ls)
7281 {
7282 RWLock::RLocker l(coll_lock);
7283 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
7284 p != coll_map.end();
7285 ++p)
7286 ls.push_back(p->first);
7287 return 0;
7288 }
7289
7290 bool BlueStore::collection_exists(const coll_t& c)
7291 {
7292 RWLock::RLocker l(coll_lock);
7293 return coll_map.count(c);
7294 }
7295
7296 int BlueStore::collection_empty(const coll_t& cid, bool *empty)
7297 {
7298 dout(15) << __func__ << " " << cid << dendl;
7299 vector<ghobject_t> ls;
7300 ghobject_t next;
7301 int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
7302 &ls, &next);
7303 if (r < 0) {
7304 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
7305 << dendl;
7306 return r;
7307 }
7308 *empty = ls.empty();
7309 dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
7310 return 0;
7311 }
7312
7313 int BlueStore::collection_bits(const coll_t& cid)
7314 {
7315 dout(15) << __func__ << " " << cid << dendl;
7316 CollectionRef c = _get_collection(cid);
7317 if (!c)
7318 return -ENOENT;
7319 RWLock::RLocker l(c->lock);
7320 dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
7321 return c->cnode.bits;
7322 }
7323
7324 int BlueStore::collection_list(
7325 const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
7326 vector<ghobject_t> *ls, ghobject_t *pnext)
7327 {
7328 CollectionHandle c = _get_collection(cid);
7329 if (!c)
7330 return -ENOENT;
7331 return collection_list(c, start, end, max, ls, pnext);
7332 }
7333
7334 int BlueStore::collection_list(
7335 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
7336 vector<ghobject_t> *ls, ghobject_t *pnext)
7337 {
7338 Collection *c = static_cast<Collection *>(c_.get());
7339 dout(15) << __func__ << " " << c->cid
7340 << " start " << start << " end " << end << " max " << max << dendl;
7341 int r;
7342 {
7343 RWLock::RLocker l(c->lock);
7344 r = _collection_list(c, start, end, max, ls, pnext);
7345 }
7346
7347 dout(10) << __func__ << " " << c->cid
7348 << " start " << start << " end " << end << " max " << max
7349 << " = " << r << ", ls.size() = " << ls->size()
7350 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
7351 return r;
7352 }
7353
7354 int BlueStore::_collection_list(
7355 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
7356 vector<ghobject_t> *ls, ghobject_t *pnext)
7357 {
7358
7359 if (!c->exists)
7360 return -ENOENT;
7361
7362 int r = 0;
7363 ghobject_t static_next;
7364 KeyValueDB::Iterator it;
7365 string temp_start_key, temp_end_key;
7366 string start_key, end_key;
7367 bool set_next = false;
7368 string pend;
7369 bool temp;
7370
7371 if (!pnext)
7372 pnext = &static_next;
7373
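// objects are stored in two key ranges, a temp namespace and the normal one;
// the scan starts in the temp range and, unless the end bound is itself a
// temp object, switches to the normal range once the temp keys are exhausted.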
7374 if (start == ghobject_t::get_max() ||
7375 start.hobj.is_max()) {
7376 goto out;
7377 }
7378 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
7379 &start_key, &end_key);
7380 dout(20) << __func__
7381 << " range " << pretty_binary_string(temp_start_key)
7382 << " to " << pretty_binary_string(temp_end_key)
7383 << " and " << pretty_binary_string(start_key)
7384 << " to " << pretty_binary_string(end_key)
7385 << " start " << start << dendl;
7386 it = db->get_iterator(PREFIX_OBJ);
7387 if (start == ghobject_t() ||
7388 start.hobj == hobject_t() ||
7389 start == c->cid.get_min_hobj()) {
7390 it->upper_bound(temp_start_key);
7391 temp = true;
7392 } else {
7393 string k;
7394 get_object_key(cct, start, &k);
7395 if (start.hobj.is_temp()) {
7396 temp = true;
7397 assert(k >= temp_start_key && k < temp_end_key);
7398 } else {
7399 temp = false;
7400 assert(k >= start_key && k < end_key);
7401 }
7402 dout(20) << " start from " << pretty_binary_string(k)
7403 << " temp=" << (int)temp << dendl;
7404 it->lower_bound(k);
7405 }
7406 if (end.hobj.is_max()) {
7407 pend = temp ? temp_end_key : end_key;
7408 } else {
7409 get_object_key(cct, end, &end_key);
7410 if (end.hobj.is_temp()) {
7411 if (temp)
7412 pend = end_key;
7413 else
7414 goto out;
7415 } else {
7416 pend = temp ? temp_end_key : end_key;
7417 }
7418 }
7419 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7420 while (true) {
7421 if (!it->valid() || it->key() >= pend) {
7422 if (!it->valid())
7423 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
7424 else
7425 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
7426 << " >= " << end << dendl;
7427 if (temp) {
7428 if (end.hobj.is_temp()) {
7429 break;
7430 }
7431 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
7432 temp = false;
7433 it->upper_bound(start_key);
7434 pend = end_key;
7435 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7436 continue;
7437 }
7438 break;
7439 }
7440 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
7441 if (is_extent_shard_key(it->key())) {
7442 it->next();
7443 continue;
7444 }
7445 ghobject_t oid;
7446 int r = get_key_object(it->key(), &oid);
7447 assert(r == 0);
7448 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
7449 if (ls->size() >= (unsigned)max) {
7450 dout(20) << __func__ << " reached max " << max << dendl;
7451 *pnext = oid;
7452 set_next = true;
7453 break;
7454 }
7455 ls->push_back(oid);
7456 it->next();
7457 }
7458 out:
7459 if (!set_next) {
7460 *pnext = ghobject_t::get_max();
7461 }
7462
7463 return r;
7464 }
7465
7466 int BlueStore::omap_get(
7467 const coll_t& cid, ///< [in] Collection containing oid
7468 const ghobject_t &oid, ///< [in] Object containing omap
7469 bufferlist *header, ///< [out] omap header
7470 map<string, bufferlist> *out ///< [out] Key to value map
7471 )
7472 {
7473 CollectionHandle c = _get_collection(cid);
7474 if (!c)
7475 return -ENOENT;
7476 return omap_get(c, oid, header, out);
7477 }
7478
7479 int BlueStore::omap_get(
7480 CollectionHandle &c_, ///< [in] Collection containing oid
7481 const ghobject_t &oid, ///< [in] Object containing omap
7482 bufferlist *header, ///< [out] omap header
7483 map<string, bufferlist> *out ///< [out] Key to value map
7484 )
7485 {
7486 Collection *c = static_cast<Collection *>(c_.get());
7487 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7488 if (!c->exists)
7489 return -ENOENT;
7490 RWLock::RLocker l(c->lock);
7491 int r = 0;
7492 OnodeRef o = c->get_onode(oid, false);
7493 if (!o || !o->exists) {
7494 r = -ENOENT;
7495 goto out;
7496 }
7497 if (!o->onode.has_omap())
7498 goto out;
7499 o->flush();
7500 {
7501 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7502 string head, tail;
7503 get_omap_header(o->onode.nid, &head);
7504 get_omap_tail(o->onode.nid, &tail);
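// All omap rows for this onode share the same encoded-nid prefix: the
// header key sorts before every "<nid>.<user key>" row and the tail key
// sorts after them, so scanning [head, tail) yields the header (if any)
// followed by the user keys in order.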
7505 it->lower_bound(head);
7506 while (it->valid()) {
7507 if (it->key() == head) {
7508 dout(30) << __func__ << " got header" << dendl;
7509 *header = it->value();
7510 } else if (it->key() >= tail) {
7511 dout(30) << __func__ << " reached tail" << dendl;
7512 break;
7513 } else {
7514 string user_key;
7515 decode_omap_key(it->key(), &user_key);
7516 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7517 << " -> " << user_key << dendl;
7518 (*out)[user_key] = it->value();
7519 }
7520 it->next();
7521 }
7522 }
7523 out:
7524 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7525 << dendl;
7526 return r;
7527 }
7528
7529 int BlueStore::omap_get_header(
7530 const coll_t& cid, ///< [in] Collection containing oid
7531 const ghobject_t &oid, ///< [in] Object containing omap
7532 bufferlist *header, ///< [out] omap header
7533 bool allow_eio ///< [in] don't assert on eio
7534 )
7535 {
7536 CollectionHandle c = _get_collection(cid);
7537 if (!c)
7538 return -ENOENT;
7539 return omap_get_header(c, oid, header, allow_eio);
7540 }
7541
7542 int BlueStore::omap_get_header(
7543 CollectionHandle &c_, ///< [in] Collection containing oid
7544 const ghobject_t &oid, ///< [in] Object containing omap
7545 bufferlist *header, ///< [out] omap header
7546 bool allow_eio ///< [in] don't assert on eio
7547 )
7548 {
7549 Collection *c = static_cast<Collection *>(c_.get());
7550 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7551 if (!c->exists)
7552 return -ENOENT;
7553 RWLock::RLocker l(c->lock);
7554 int r = 0;
7555 OnodeRef o = c->get_onode(oid, false);
7556 if (!o || !o->exists) {
7557 r = -ENOENT;
7558 goto out;
7559 }
7560 if (!o->onode.has_omap())
7561 goto out;
7562 o->flush();
7563 {
7564 string head;
7565 get_omap_header(o->onode.nid, &head);
7566 if (db->get(PREFIX_OMAP, head, header) >= 0) {
7567 dout(30) << __func__ << " got header" << dendl;
7568 } else {
7569 dout(30) << __func__ << " no header" << dendl;
7570 }
7571 }
7572 out:
7573 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7574 << dendl;
7575 return r;
7576 }
7577
7578 int BlueStore::omap_get_keys(
7579 const coll_t& cid, ///< [in] Collection containing oid
7580 const ghobject_t &oid, ///< [in] Object containing omap
7581 set<string> *keys ///< [out] Keys defined on oid
7582 )
7583 {
7584 CollectionHandle c = _get_collection(cid);
7585 if (!c)
7586 return -ENOENT;
7587 return omap_get_keys(c, oid, keys);
7588 }
7589
7590 int BlueStore::omap_get_keys(
7591 CollectionHandle &c_, ///< [in] Collection containing oid
7592 const ghobject_t &oid, ///< [in] Object containing omap
7593 set<string> *keys ///< [out] Keys defined on oid
7594 )
7595 {
7596 Collection *c = static_cast<Collection *>(c_.get());
7597 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7598 if (!c->exists)
7599 return -ENOENT;
7600 RWLock::RLocker l(c->lock);
7601 int r = 0;
7602 OnodeRef o = c->get_onode(oid, false);
7603 if (!o || !o->exists) {
7604 r = -ENOENT;
7605 goto out;
7606 }
7607 if (!o->onode.has_omap())
7608 goto out;
7609 o->flush();
7610 {
7611 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7612 string head, tail;
7613 get_omap_key(o->onode.nid, string(), &head);
7614 get_omap_tail(o->onode.nid, &tail);
7615 it->lower_bound(head);
7616 while (it->valid()) {
7617 if (it->key() >= tail) {
7618 dout(30) << __func__ << " reached tail" << dendl;
7619 break;
7620 }
7621 string user_key;
7622 decode_omap_key(it->key(), &user_key);
7623 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7624 << " -> " << user_key << dendl;
7625 keys->insert(user_key);
7626 it->next();
7627 }
7628 }
7629 out:
7630 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7631 << dendl;
7632 return r;
7633 }
7634
7635 int BlueStore::omap_get_values(
7636 const coll_t& cid, ///< [in] Collection containing oid
7637 const ghobject_t &oid, ///< [in] Object containing omap
7638 const set<string> &keys, ///< [in] Keys to get
7639 map<string, bufferlist> *out ///< [out] Returned keys and values
7640 )
7641 {
7642 CollectionHandle c = _get_collection(cid);
7643 if (!c)
7644 return -ENOENT;
7645 return omap_get_values(c, oid, keys, out);
7646 }
7647
7648 int BlueStore::omap_get_values(
7649 CollectionHandle &c_, ///< [in] Collection containing oid
7650 const ghobject_t &oid, ///< [in] Object containing omap
7651 const set<string> &keys, ///< [in] Keys to get
7652 map<string, bufferlist> *out ///< [out] Returned keys and values
7653 )
7654 {
7655 Collection *c = static_cast<Collection *>(c_.get());
7656 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7657 if (!c->exists)
7658 return -ENOENT;
7659 RWLock::RLocker l(c->lock);
7660 int r = 0;
7661 string final_key;
7662 OnodeRef o = c->get_onode(oid, false);
7663 if (!o || !o->exists) {
7664 r = -ENOENT;
7665 goto out;
7666 }
7667 if (!o->onode.has_omap())
7668 goto out;
7669 o->flush();
7670 _key_encode_u64(o->onode.nid, &final_key);
7671 final_key.push_back('.');
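// final_key now holds the 9-byte prefix shared by this onode's omap rows:
// the 8-byte encoded nid plus the '.' separator. Each lookup below
// truncates back to this prefix and appends the user key.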
7672 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7673 final_key.resize(9); // keep prefix
7674 final_key += *p;
7675 bufferlist val;
7676 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7677 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
7678 << " -> " << *p << dendl;
7679 out->insert(make_pair(*p, val));
7680 }
7681 }
7682 out:
7683 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7684 << dendl;
7685 return r;
7686 }
7687
7688 int BlueStore::omap_check_keys(
7689 const coll_t& cid, ///< [in] Collection containing oid
7690 const ghobject_t &oid, ///< [in] Object containing omap
7691 const set<string> &keys, ///< [in] Keys to check
7692 set<string> *out ///< [out] Subset of keys defined on oid
7693 )
7694 {
7695 CollectionHandle c = _get_collection(cid);
7696 if (!c)
7697 return -ENOENT;
7698 return omap_check_keys(c, oid, keys, out);
7699 }
7700
7701 int BlueStore::omap_check_keys(
7702 CollectionHandle &c_, ///< [in] Collection containing oid
7703 const ghobject_t &oid, ///< [in] Object containing omap
7704 const set<string> &keys, ///< [in] Keys to check
7705 set<string> *out ///< [out] Subset of keys defined on oid
7706 )
7707 {
7708 Collection *c = static_cast<Collection *>(c_.get());
7709 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7710 if (!c->exists)
7711 return -ENOENT;
7712 RWLock::RLocker l(c->lock);
7713 int r = 0;
7714 string final_key;
7715 OnodeRef o = c->get_onode(oid, false);
7716 if (!o || !o->exists) {
7717 r = -ENOENT;
7718 goto out;
7719 }
7720 if (!o->onode.has_omap())
7721 goto out;
7722 o->flush();
7723 _key_encode_u64(o->onode.nid, &final_key);
7724 final_key.push_back('.');
7725 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7726 final_key.resize(9); // keep prefix
7727 final_key += *p;
7728 bufferlist val;
7729 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7730 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
7731 << " -> " << *p << dendl;
7732 out->insert(*p);
7733 } else {
7734 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
7735 << " -> " << *p << dendl;
7736 }
7737 }
7738 out:
7739 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7740 << dendl;
7741 return r;
7742 }
7743
7744 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7745 const coll_t& cid, ///< [in] collection
7746 const ghobject_t &oid ///< [in] object
7747 )
7748 {
7749 CollectionHandle c = _get_collection(cid);
7750 if (!c) {
7751 dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
7752 return ObjectMap::ObjectMapIterator();
7753 }
7754 return get_omap_iterator(c, oid);
7755 }
7756
7757 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7758 CollectionHandle &c_, ///< [in] collection
7759 const ghobject_t &oid ///< [in] object
7760 )
7761 {
7762 Collection *c = static_cast<Collection *>(c_.get());
7763 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
7764 if (!c->exists) {
7765 return ObjectMap::ObjectMapIterator();
7766 }
7767 RWLock::RLocker l(c->lock);
7768 OnodeRef o = c->get_onode(oid, false);
7769 if (!o || !o->exists) {
7770 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
7771 return ObjectMap::ObjectMapIterator();
7772 }
7773 o->flush();
7774 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
7775 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7776 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
7777 }
7778
7779 // -----------------
7780 // write helpers
7781
7782 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
7783 {
7784 dout(10) << __func__ << " ondisk_format " << ondisk_format
7785 << " min_compat_ondisk_format " << min_compat_ondisk_format
7786 << dendl;
7787 assert(ondisk_format == latest_ondisk_format);
7788 {
7789 bufferlist bl;
7790 ::encode(ondisk_format, bl);
7791 t->set(PREFIX_SUPER, "ondisk_format", bl);
7792 }
7793 {
7794 bufferlist bl;
7795 ::encode(min_compat_ondisk_format, bl);
7796 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
7797 }
7798 }
7799
7800 int BlueStore::_open_super_meta()
7801 {
7802 // nid
7803 {
7804 nid_max = 0;
7805 bufferlist bl;
7806 db->get(PREFIX_SUPER, "nid_max", &bl);
7807 bufferlist::iterator p = bl.begin();
7808 try {
7809 uint64_t v;
7810 ::decode(v, p);
7811 nid_max = v;
7812 } catch (buffer::error& e) {
7813 derr << __func__ << " unable to read nid_max" << dendl;
7814 return -EIO;
7815 }
7816 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
7817 nid_last = nid_max.load();
7818 }
7819
7820 // blobid
7821 {
7822 blobid_max = 0;
7823 bufferlist bl;
7824 db->get(PREFIX_SUPER, "blobid_max", &bl);
7825 bufferlist::iterator p = bl.begin();
7826 try {
7827 uint64_t v;
7828 ::decode(v, p);
7829 blobid_max = v;
7830 } catch (buffer::error& e) {
7831 derr << __func__ << " unable to read blobid_max" << dendl;
7832 return -EIO;
7833 }
7834 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
7835 blobid_last = blobid_max.load();
7836 }
7837
7838 // freelist
7839 {
7840 bufferlist bl;
7841 db->get(PREFIX_SUPER, "freelist_type", &bl);
7842 if (bl.length()) {
7843 freelist_type = std::string(bl.c_str(), bl.length());
7844 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
7845 } else {
7846 assert("extent freelist manager is not supported" == 0);
7847 }
7848 }
7849
7850 // bluefs alloc
7851 if (cct->_conf->bluestore_bluefs) {
7852 bluefs_extents.clear();
7853 bufferlist bl;
7854 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7855 bufferlist::iterator p = bl.begin();
7856 try {
7857 ::decode(bluefs_extents, p);
7858 }
7859 catch (buffer::error& e) {
7860 derr << __func__ << " unable to read bluefs_extents" << dendl;
7861 return -EIO;
7862 }
7863 dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
7864 << std::dec << dendl;
7865 }
7866
7867 // ondisk format
7868 int32_t compat_ondisk_format = 0;
7869 {
7870 bufferlist bl;
7871 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
7872 if (r < 0) {
7873 // base case: kraken bluestore is v1 and readable by v1
7874 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
7875 << dendl;
7876 ondisk_format = 1;
7877 compat_ondisk_format = 1;
7878 } else {
7879 auto p = bl.begin();
7880 try {
7881 ::decode(ondisk_format, p);
7882 } catch (buffer::error& e) {
7883 derr << __func__ << " unable to read ondisk_format" << dendl;
7884 return -EIO;
7885 }
7886 bl.clear();
7887 {
7888 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
7889 assert(!r);
7890 auto p = bl.begin();
7891 try {
7892 ::decode(compat_ondisk_format, p);
7893 } catch (buffer::error& e) {
7894 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
7895 return -EIO;
7896 }
7897 }
7898 }
7899 dout(10) << __func__ << " ondisk_format " << ondisk_format
7900 << " compat_ondisk_format " << compat_ondisk_format
7901 << dendl;
7902 }
7903
7904 if (latest_ondisk_format < compat_ondisk_format) {
7905 derr << __func__ << " compat_ondisk_format is "
7906 << compat_ondisk_format << " but we only understand version "
7907 << latest_ondisk_format << dendl;
7908 return -EPERM;
7909 }
7910 if (ondisk_format < latest_ondisk_format) {
7911 int r = _upgrade_super();
7912 if (r < 0) {
7913 return r;
7914 }
7915 }
7916
7917 {
7918 bufferlist bl;
7919 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
7920 auto p = bl.begin();
7921 try {
7922 uint64_t val;
7923 ::decode(val, p);
7924 min_alloc_size = val;
7925 min_alloc_size_order = ctz(val);
7926 assert(min_alloc_size == 1u << min_alloc_size_order);
7927 } catch (buffer::error& e) {
7928 derr << __func__ << " unable to read min_alloc_size" << dendl;
7929 return -EIO;
7930 }
7931 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7932 << std::dec << dendl;
7933 }
7934 _open_statfs();
7935 _set_alloc_sizes();
7936 _set_throttle_params();
7937
7938 _set_csum();
7939 _set_compression();
7940 _set_blob_size();
7941
7942 _set_finisher_num();
7943
7944 return 0;
7945 }
7946
7947 int BlueStore::_upgrade_super()
7948 {
7949 dout(1) << __func__ << " from " << ondisk_format << ", latest "
7950 << latest_ondisk_format << dendl;
7951 assert(ondisk_format > 0);
7952 assert(ondisk_format < latest_ondisk_format);
7953
7954 if (ondisk_format == 1) {
7955 // changes:
7956 // - super: added ondisk_format
7957 // - super: added min_readable_ondisk_format
7958 // - super: added min_compat_ondisk_format
7959 // - super: added min_alloc_size
7960 // - super: removed min_min_alloc_size
7961 KeyValueDB::Transaction t = db->get_transaction();
7962 {
7963 bufferlist bl;
7964 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
7965 auto p = bl.begin();
7966 try {
7967 uint64_t val;
7968 ::decode(val, p);
7969 min_alloc_size = val;
7970 } catch (buffer::error& e) {
7971 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
7972 return -EIO;
7973 }
7974 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7975 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7976 }
7977 ondisk_format = 2;
7978 _prepare_ondisk_format_super(t);
7979 int r = db->submit_transaction_sync(t);
7980 assert(r == 0);
7981 }
7982
7983 // done
7984 dout(1) << __func__ << " done" << dendl;
7985 return 0;
7986 }
7987
7988 void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
7989 {
7990 if (o->onode.nid) {
7991 assert(o->exists);
7992 return;
7993 }
7994 uint64_t nid = ++nid_last;
7995 dout(20) << __func__ << " " << nid << dendl;
7996 o->onode.nid = nid;
7997 txc->last_nid = nid;
7998 o->exists = true;
7999 }
8000
8001 uint64_t BlueStore::_assign_blobid(TransContext *txc)
8002 {
8003 uint64_t bid = ++blobid_last;
8004 dout(20) << __func__ << " " << bid << dendl;
8005 txc->last_blobid = bid;
8006 return bid;
8007 }
8008
8009 void BlueStore::get_db_statistics(Formatter *f)
8010 {
8011 db->get_statistics(f);
8012 }
8013
8014 BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
8015 {
8016 TransContext *txc = new TransContext(cct, osr);
8017 txc->t = db->get_transaction();
8018 osr->queue_new(txc);
8019 dout(20) << __func__ << " osr " << osr << " = " << txc
8020 << " seq " << txc->seq << dendl;
8021 return txc;
8022 }
8023
8024 void BlueStore::_txc_calc_cost(TransContext *txc)
8025 {
8026 // this is about the simplest model for transaction cost you can
8027 // imagine: there is some fixed overhead cost, modeled by assuming a
8028 // minimum of one "io", plus a configurable cost per "io" (with
8029 // different hdd and ssd defaults), and we add that to the bytes
8030 // value.
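// For example (illustrative numbers only): a txc with two pending aios of
// one iovec each and 4096 bytes of data, with a per-io cost of 670000,
// gets cost = (1 + 2) * 670000 + 4096 = 2014096.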
8031 int ios = 1; // one "io" for the kv commit
8032 for (auto& p : txc->ioc.pending_aios) {
8033 ios += p.iov.size();
8034 }
8035 auto cost = throttle_cost_per_io.load();
8036 txc->cost = ios * cost + txc->bytes;
8037 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
8038 << ios << " ios * " << cost << " + " << txc->bytes
8039 << " bytes)" << dendl;
8040 }
8041
8042 void BlueStore::_txc_update_store_statfs(TransContext *txc)
8043 {
8044 if (txc->statfs_delta.is_empty())
8045 return;
8046
8047 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
8048 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
8049 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
8050 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
8051 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
8052
8053 {
8054 std::lock_guard<std::mutex> l(vstatfs_lock);
8055 vstatfs += txc->statfs_delta;
8056 }
8057
8058 bufferlist bl;
8059 txc->statfs_delta.encode(bl);
8060
8061 txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
8062 txc->statfs_delta.reset();
8063 }
8064
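// Sketch of the usual TransContext state flow driven from here (see the
// switch below for the authoritative handling):
//   PREPARE -> (AIO_WAIT ->) IO_DONE -> KV_QUEUED -> KV_SUBMITTED ->
//   KV_DONE -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP ->] FINISHING -> DONE.
// A txc with no pending aios falls straight through PREPARE into the
// AIO_WAIT handling; only txcs carrying a deferred_txn take the
// DEFERRED_* detour before FINISHING.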
8065 void BlueStore::_txc_state_proc(TransContext *txc)
8066 {
8067 while (true) {
8068 dout(10) << __func__ << " txc " << txc
8069 << " " << txc->get_state_name() << dendl;
8070 switch (txc->state) {
8071 case TransContext::STATE_PREPARE:
8072 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
8073 if (txc->ioc.has_pending_aios()) {
8074 txc->state = TransContext::STATE_AIO_WAIT;
8075 txc->had_ios = true;
8076 _txc_aio_submit(txc);
8077 return;
8078 }
8079 // ** fall-thru **
8080
8081 case TransContext::STATE_AIO_WAIT:
8082 txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
8083 _txc_finish_io(txc); // may trigger blocked txc's too
8084 return;
8085
8086 case TransContext::STATE_IO_DONE:
8087 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
8088 if (txc->had_ios) {
8089 ++txc->osr->txc_with_unstable_io;
8090 }
8091 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
8092 txc->state = TransContext::STATE_KV_QUEUED;
8093 if (cct->_conf->bluestore_sync_submit_transaction) {
8094 if (txc->last_nid >= nid_max ||
8095 txc->last_blobid >= blobid_max) {
8096 dout(20) << __func__
8097 << " last_{nid,blobid} exceeds max, submit via kv thread"
8098 << dendl;
8099 } else if (txc->osr->kv_committing_serially) {
8100 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
8101 << dendl;
8102 // note: this is starvation-prone. once we have a txc in a busy
8103 // sequencer that is committing serially it is possible to keep
8104 // submitting new transactions fast enough that we get stuck doing
8105 // so. the alternative is to block here... fixme?
8106 } else if (txc->osr->txc_with_unstable_io) {
8107 dout(20) << __func__ << " prior txc(s) with unstable ios "
8108 << txc->osr->txc_with_unstable_io.load() << dendl;
8109 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
8110 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
8111 == 0) {
8112 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
8113 << dendl;
8114 } else {
8115 txc->state = TransContext::STATE_KV_SUBMITTED;
8116 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8117 assert(r == 0);
8118 _txc_applied_kv(txc);
8119 }
8120 }
8121 {
8122 std::lock_guard<std::mutex> l(kv_lock);
8123 kv_queue.push_back(txc);
8124 kv_cond.notify_one();
8125 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
8126 kv_queue_unsubmitted.push_back(txc);
8127 ++txc->osr->kv_committing_serially;
8128 }
8129 if (txc->had_ios)
8130 kv_ios++;
8131 kv_throttle_costs += txc->cost;
8132 }
8133 return;
8134 case TransContext::STATE_KV_SUBMITTED:
8135 _txc_committed_kv(txc);
8136 // ** fall-thru **
8137
8138 case TransContext::STATE_KV_DONE:
8139 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
8140 if (txc->deferred_txn) {
8141 txc->state = TransContext::STATE_DEFERRED_QUEUED;
8142 _deferred_queue(txc);
8143 return;
8144 }
8145 txc->state = TransContext::STATE_FINISHING;
8146 break;
8147
8148 case TransContext::STATE_DEFERRED_CLEANUP:
8149 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
8150 txc->state = TransContext::STATE_FINISHING;
8151 // ** fall-thru **
8152
8153 case TransContext::STATE_FINISHING:
8154 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
8155 _txc_finish(txc);
8156 return;
8157
8158 default:
8159 derr << __func__ << " unexpected txc " << txc
8160 << " state " << txc->get_state_name() << dendl;
8161 assert(0 == "unexpected txc state");
8162 return;
8163 }
8164 }
8165 }
8166
8167 void BlueStore::_txc_finish_io(TransContext *txc)
8168 {
8169 dout(20) << __func__ << " " << txc << dendl;
8170
8171 /*
8172 * we need to preserve the order of kv transactions,
8173 * even though aio will complete in any order.
8174 */
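// Mark this txc IO_DONE, then look backwards through the sequencer queue:
// if any earlier txc has not reached IO_DONE yet we return and let it
// drive further processing when its io completes; otherwise we advance
// the whole run of consecutive IO_DONE txcs starting from the earliest one.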
8175
8176 OpSequencer *osr = txc->osr.get();
8177 std::lock_guard<std::mutex> l(osr->qlock);
8178 txc->state = TransContext::STATE_IO_DONE;
8179
8180 // release aio contexts (including pinned buffers).
8181 txc->ioc.running_aios.clear();
8182
8183 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
8184 while (p != osr->q.begin()) {
8185 --p;
8186 if (p->state < TransContext::STATE_IO_DONE) {
8187 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
8188 << p->get_state_name() << dendl;
8189 return;
8190 }
8191 if (p->state > TransContext::STATE_IO_DONE) {
8192 ++p;
8193 break;
8194 }
8195 }
8196 do {
8197 _txc_state_proc(&*p++);
8198 } while (p != osr->q.end() &&
8199 p->state == TransContext::STATE_IO_DONE);
8200
8201 if (osr->kv_submitted_waiters &&
8202 osr->_is_all_kv_submitted()) {
8203 osr->qcond.notify_all();
8204 }
8205 }
8206
8207 void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
8208 {
8209 dout(20) << __func__ << " txc " << txc
8210 << " onodes " << txc->onodes
8211 << " shared_blobs " << txc->shared_blobs
8212 << dendl;
8213
8214 // finalize onodes
8215 for (auto o : txc->onodes) {
8216 // finalize extent_map shards
8217 o->extent_map.update(t, false);
8218 if (o->extent_map.needs_reshard()) {
8219 o->extent_map.reshard(db, t);
8220 o->extent_map.update(t, true);
8221 if (o->extent_map.needs_reshard()) {
8222 dout(20) << __func__ << " warning: still wants reshard, check options?"
8223 << dendl;
8224 o->extent_map.clear_needs_reshard();
8225 }
8226 logger->inc(l_bluestore_onode_reshard);
8227 }
8228
8229 // bound encode
8230 size_t bound = 0;
8231 denc(o->onode, bound);
8232 o->extent_map.bound_encode_spanning_blobs(bound);
8233 if (o->onode.extent_map_shards.empty()) {
8234 denc(o->extent_map.inline_bl, bound);
8235 }
8236
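// We first compute an upper bound on the encoded size so that a single
// contiguous appender can be reserved, then do the real encode into it;
// the onode / spanning-blob / inline-extent portions are measured only
// for the debug output below.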
8237 // encode
8238 bufferlist bl;
8239 unsigned onode_part, blob_part, extent_part;
8240 {
8241 auto p = bl.get_contiguous_appender(bound, true);
8242 denc(o->onode, p);
8243 onode_part = p.get_logical_offset();
8244 o->extent_map.encode_spanning_blobs(p);
8245 blob_part = p.get_logical_offset() - onode_part;
8246 if (o->onode.extent_map_shards.empty()) {
8247 denc(o->extent_map.inline_bl, p);
8248 }
8249 extent_part = p.get_logical_offset() - onode_part - blob_part;
8250 }
8251
8252 dout(20) << " onode " << o->oid << " is " << bl.length()
8253 << " (" << onode_part << " bytes onode + "
8254 << blob_part << " bytes spanning blobs + "
8255 << extent_part << " bytes inline extents)"
8256 << dendl;
8257 t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
8258 o->flushing_count++;
8259 }
8260
8261 // objects we modified but didn't affect the onode
8262 auto p = txc->modified_objects.begin();
8263 while (p != txc->modified_objects.end()) {
8264 if (txc->onodes.count(*p) == 0) {
8265 (*p)->flushing_count++;
8266 ++p;
8267 } else {
8268 // remove dups with onodes list to avoid problems in _txc_finish
8269 p = txc->modified_objects.erase(p);
8270 }
8271 }
8272
8273 // finalize shared_blobs
8274 for (auto sb : txc->shared_blobs) {
8275 string key;
8276 auto sbid = sb->get_sbid();
8277 get_shared_blob_key(sbid, &key);
8278 if (sb->persistent->empty()) {
8279 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
8280 << " is empty" << dendl;
8281 t->rmkey(PREFIX_SHARED_BLOB, key);
8282 } else {
8283 bufferlist bl;
8284 ::encode(*(sb->persistent), bl);
8285 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
8286 << " is " << bl.length() << " " << *sb << dendl;
8287 t->set(PREFIX_SHARED_BLOB, key, bl);
8288 }
8289 }
8290 }
8291
8292 void BlueStore::BSPerfTracker::update_from_perfcounters(
8293 PerfCounters &logger)
8294 {
8295 os_commit_latency.consume_next(
8296 logger.get_tavg_ms(
8297 l_bluestore_commit_lat));
8298 os_apply_latency.consume_next(
8299 logger.get_tavg_ms(
8300 l_bluestore_commit_lat));
8301 }
8302
8303 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
8304 {
8305 dout(20) << __func__ << " txc " << txc << std::hex
8306 << " allocated 0x" << txc->allocated
8307 << " released 0x" << txc->released
8308 << std::dec << dendl;
8309
8310 // We have to handle the case where we allocate *and* deallocate the
8311 // same region in this transaction. The freelist doesn't like that.
8312 // (Actually, the only thing that cares is the BitmapFreelistManager
8313 // debug check. But that's important.)
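// For example, if this txc both allocated and released the range
// 0x18000~0x8000, that overlap is subtracted from both sets below so the
// freelist only sees the net allocation and the net release.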
8314 interval_set<uint64_t> tmp_allocated, tmp_released;
8315 interval_set<uint64_t> *pallocated = &txc->allocated;
8316 interval_set<uint64_t> *preleased = &txc->released;
8317 if (!txc->allocated.empty() && !txc->released.empty()) {
8318 interval_set<uint64_t> overlap;
8319 overlap.intersection_of(txc->allocated, txc->released);
8320 if (!overlap.empty()) {
8321 tmp_allocated = txc->allocated;
8322 tmp_allocated.subtract(overlap);
8323 tmp_released = txc->released;
8324 tmp_released.subtract(overlap);
8325 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
8326 << ", new allocated 0x" << tmp_allocated
8327 << " released 0x" << tmp_released << std::dec
8328 << dendl;
8329 pallocated = &tmp_allocated;
8330 preleased = &tmp_released;
8331 }
8332 }
8333
8334 // update freelist with non-overlap sets
8335 for (interval_set<uint64_t>::iterator p = pallocated->begin();
8336 p != pallocated->end();
8337 ++p) {
8338 fm->allocate(p.get_start(), p.get_len(), t);
8339 }
8340 for (interval_set<uint64_t>::iterator p = preleased->begin();
8341 p != preleased->end();
8342 ++p) {
8343 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
8344 << "~" << p.get_len() << std::dec << dendl;
8345 fm->release(p.get_start(), p.get_len(), t);
8346 }
8347
8348 _txc_update_store_statfs(txc);
8349 }
8350
8351 void BlueStore::_txc_applied_kv(TransContext *txc)
8352 {
8353 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
8354 for (auto& o : *ls) {
8355 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
8356 << dendl;
8357 if (--o->flushing_count == 0) {
8358 std::lock_guard<std::mutex> l(o->flush_lock);
8359 o->flush_cond.notify_all();
8360 }
8361 }
8362 }
8363 }
8364
8365 void BlueStore::_txc_committed_kv(TransContext *txc)
8366 {
8367 dout(20) << __func__ << " txc " << txc << dendl;
8368
8369 // warning: we're calling onreadable_sync inside the sequencer lock
8370 if (txc->onreadable_sync) {
8371 txc->onreadable_sync->complete(0);
8372 txc->onreadable_sync = NULL;
8373 }
8374 unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
8375 if (txc->oncommit) {
8376 logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
8377 finishers[n]->queue(txc->oncommit);
8378 txc->oncommit = NULL;
8379 }
8380 if (txc->onreadable) {
8381 finishers[n]->queue(txc->onreadable);
8382 txc->onreadable = NULL;
8383 }
8384
8385 {
8386 std::lock_guard<std::mutex> l(txc->osr->qlock);
8387 txc->state = TransContext::STATE_KV_DONE;
8388 if (!txc->oncommits.empty()) {
8389 finishers[n]->queue(txc->oncommits);
8390 }
8391 }
8392 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
8393 }
8394
8395 void BlueStore::_txc_finish(TransContext *txc)
8396 {
8397 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
8398 assert(txc->state == TransContext::STATE_FINISHING);
8399
8400 for (auto& sb : txc->shared_blobs_written) {
8401 sb->bc.finish_write(sb->get_cache(), txc->seq);
8402 }
8403 txc->shared_blobs_written.clear();
8404
8405 while (!txc->removed_collections.empty()) {
8406 _queue_reap_collection(txc->removed_collections.front());
8407 txc->removed_collections.pop_front();
8408 }
8409
8410 OpSequencerRef osr = txc->osr;
8411 bool empty = false;
8412 bool submit_deferred = false;
8413 OpSequencer::q_list_t releasing_txc;
8414 {
8415 std::lock_guard<std::mutex> l(osr->qlock);
8416 txc->state = TransContext::STATE_DONE;
8417 bool notify = false;
8418 while (!osr->q.empty()) {
8419 TransContext *txc = &osr->q.front();
8420 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
8421 << dendl;
8422 if (txc->state != TransContext::STATE_DONE) {
8423 if (txc->state == TransContext::STATE_PREPARE &&
8424 deferred_aggressive) {
8425 // for _osr_drain_preceding()
8426 notify = true;
8427 }
8428 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
8429 osr->q.size() > g_conf->bluestore_max_deferred_txc) {
8430 submit_deferred = true;
8431 }
8432 break;
8433 }
8434
8435 osr->q.pop_front();
8436 releasing_txc.push_back(*txc);
8437 notify = true;
8438 }
8439 if (notify) {
8440 osr->qcond.notify_all();
8441 }
8442 if (osr->q.empty()) {
8443 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
8444 empty = true;
8445 }
8446 }
8447 while (!releasing_txc.empty()) {
8448 // release to allocator only after all preceding txc's have also
8449 // finished any deferred writes that potentially land in these
8450 // blocks
8451 auto txc = &releasing_txc.front();
8452 _txc_release_alloc(txc);
8453 releasing_txc.pop_front();
8454 txc->log_state_latency(logger, l_bluestore_state_done_lat);
8455 delete txc;
8456 }
8457
8458 if (submit_deferred) {
8459 // we're pinning memory; flush! we could be more fine-grained here but
8460 // i'm not sure it's worth the bother.
8461 deferred_try_submit();
8462 }
8463
8464 if (empty && osr->zombie) {
8465 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
8466 osr->_unregister();
8467 }
8468 }
8469
8470 void BlueStore::_txc_release_alloc(TransContext *txc)
8471 {
8472 // update allocator with full released set
8473 if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
8474 dout(10) << __func__ << " " << txc << " " << std::hex
8475 << txc->released << std::dec << dendl;
8476 for (interval_set<uint64_t>::iterator p = txc->released.begin();
8477 p != txc->released.end();
8478 ++p) {
8479 alloc->release(p.get_start(), p.get_len());
8480 }
8481 }
8482
8483 txc->allocated.clear();
8484 txc->released.clear();
8485 }
8486
8487 void BlueStore::_osr_drain_preceding(TransContext *txc)
8488 {
8489 OpSequencer *osr = txc->osr.get();
8490 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
8491 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
8492 {
8493 // submit anything pending
8494 deferred_lock.lock();
8495 if (osr->deferred_pending) {
8496 _deferred_submit_unlock(osr);
8497 } else {
8498 deferred_lock.unlock();
8499 }
8500 }
8501 {
8502 // wake up any previously finished deferred events
8503 std::lock_guard<std::mutex> l(kv_lock);
8504 kv_cond.notify_one();
8505 }
8506 osr->drain_preceding(txc);
8507 --deferred_aggressive;
8508 dout(10) << __func__ << " " << osr << " done" << dendl;
8509 }
8510
8511 void BlueStore::_osr_drain_all()
8512 {
8513 dout(10) << __func__ << dendl;
8514
8515 set<OpSequencerRef> s;
8516 {
8517 std::lock_guard<std::mutex> l(osr_lock);
8518 s = osr_set;
8519 }
8520 dout(20) << __func__ << " osr_set " << s << dendl;
8521
8522 ++deferred_aggressive;
8523 {
8524 // submit anything pending
8525 deferred_try_submit();
8526 }
8527 {
8528 // wake up any previously finished deferred events
8529 std::lock_guard<std::mutex> l(kv_lock);
8530 kv_cond.notify_one();
8531 }
8532 {
8533 std::lock_guard<std::mutex> l(kv_finalize_lock);
8534 kv_finalize_cond.notify_one();
8535 }
8536 for (auto osr : s) {
8537 dout(20) << __func__ << " drain " << osr << dendl;
8538 osr->drain();
8539 }
8540 --deferred_aggressive;
8541
8542 dout(10) << __func__ << " done" << dendl;
8543 }
8544
8545 void BlueStore::_osr_unregister_all()
8546 {
8547 set<OpSequencerRef> s;
8548 {
8549 std::lock_guard<std::mutex> l(osr_lock);
8550 s = osr_set;
8551 }
8552 dout(10) << __func__ << " " << s << dendl;
8553 for (auto osr : s) {
8554 osr->_unregister();
8555
8556 if (!osr->zombie) {
8557 // break link from Sequencer to us so that this OpSequencer
8558 // instance can die with this mount/umount cycle. note that
8559 // we assume umount() will not race against ~Sequencer.
8560 assert(osr->parent);
8561 osr->parent->p.reset();
8562 }
8563 }
8564 // nobody should be creating sequencers during umount either.
8565 {
8566 std::lock_guard<std::mutex> l(osr_lock);
8567 assert(osr_set.empty());
8568 }
8569 }
8570
8571 void BlueStore::_kv_start()
8572 {
8573 dout(10) << __func__ << dendl;
8574
8575 for (int i = 0; i < m_finisher_num; ++i) {
8576 ostringstream oss;
8577 oss << "finisher-" << i;
8578 Finisher *f = new Finisher(cct, oss.str(), "finisher");
8579 finishers.push_back(f);
8580 }
8581
8582 deferred_finisher.start();
8583 for (auto f : finishers) {
8584 f->start();
8585 }
8586 kv_sync_thread.create("bstore_kv_sync");
8587 kv_finalize_thread.create("bstore_kv_final");
8588 }
8589
8590 void BlueStore::_kv_stop()
8591 {
8592 dout(10) << __func__ << dendl;
8593 {
8594 std::unique_lock<std::mutex> l(kv_lock);
8595 while (!kv_sync_started) {
8596 kv_cond.wait(l);
8597 }
8598 kv_stop = true;
8599 kv_cond.notify_all();
8600 }
8601 {
8602 std::unique_lock<std::mutex> l(kv_finalize_lock);
8603 while (!kv_finalize_started) {
8604 kv_finalize_cond.wait(l);
8605 }
8606 kv_finalize_stop = true;
8607 kv_finalize_cond.notify_all();
8608 }
8609 kv_sync_thread.join();
8610 kv_finalize_thread.join();
8611 assert(removed_collections.empty());
8612 {
8613 std::lock_guard<std::mutex> l(kv_lock);
8614 kv_stop = false;
8615 }
8616 {
8617 std::lock_guard<std::mutex> l(kv_finalize_lock);
8618 kv_finalize_stop = false;
8619 }
8620 dout(10) << __func__ << " stopping finishers" << dendl;
8621 deferred_finisher.wait_for_empty();
8622 deferred_finisher.stop();
8623 for (auto f : finishers) {
8624 f->wait_for_empty();
8625 f->stop();
8626 }
8627 dout(10) << __func__ << " stopped" << dendl;
8628 }
8629
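// Overview (descriptive only): the kv sync thread drains kv_queue in
// batches. For each batch it flushes the block device when required (so
// deferred "done" writes become "stable"), submits any not-yet-submitted
// kv transactions, bumps nid_max/blobid_max when we are running low,
// commits one final synchronous kv transaction, and then hands the batch
// to the kv finalize thread via kv_committing_to_finalize.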
8630 void BlueStore::_kv_sync_thread()
8631 {
8632 dout(10) << __func__ << " start" << dendl;
8633 std::unique_lock<std::mutex> l(kv_lock);
8634 assert(!kv_sync_started);
8635 kv_sync_started = true;
8636 kv_cond.notify_all();
8637 while (true) {
8638 assert(kv_committing.empty());
8639 if (kv_queue.empty() &&
8640 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
8641 !deferred_aggressive)) {
8642 if (kv_stop)
8643 break;
8644 dout(20) << __func__ << " sleep" << dendl;
8645 kv_cond.wait(l);
8646 dout(20) << __func__ << " wake" << dendl;
8647 } else {
8648 deque<TransContext*> kv_submitting;
8649 deque<DeferredBatch*> deferred_done, deferred_stable;
8650 uint64_t aios = 0, costs = 0;
8651
8652 dout(20) << __func__ << " committing " << kv_queue.size()
8653 << " submitting " << kv_queue_unsubmitted.size()
8654 << " deferred done " << deferred_done_queue.size()
8655 << " stable " << deferred_stable_queue.size()
8656 << dendl;
8657 kv_committing.swap(kv_queue);
8658 kv_submitting.swap(kv_queue_unsubmitted);
8659 deferred_done.swap(deferred_done_queue);
8660 deferred_stable.swap(deferred_stable_queue);
8661 aios = kv_ios;
8662 costs = kv_throttle_costs;
8663 kv_ios = 0;
8664 kv_throttle_costs = 0;
8665 utime_t start = ceph_clock_now();
8666 l.unlock();
8667
8668 dout(30) << __func__ << " committing " << kv_committing << dendl;
8669 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
8670 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
8671 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
8672
8673 bool force_flush = false;
8674 // if bluefs is sharing the same device as data (only), then we
8675 // can rely on the bluefs commit to flush the device and make
8676 // deferred aios stable. that means that if we do have "done" deferred
8677 // txcs AND we are not on a single shared device, we need to force a flush.
8678 if (bluefs_single_shared_device && bluefs) {
8679 if (aios) {
8680 force_flush = true;
8681 } else if (kv_committing.empty() && kv_submitting.empty() &&
8682 deferred_stable.empty()) {
8683 force_flush = true; // there's nothing else to commit!
8684 } else if (deferred_aggressive) {
8685 force_flush = true;
8686 }
8687 } else
8688 force_flush = true;
8689
8690 if (force_flush) {
8691 dout(20) << __func__ << " num_aios=" << aios
8692 << " force_flush=" << (int)force_flush
8693 << ", flushing, deferred done->stable" << dendl;
8694 // flush/barrier on block device
8695 bdev->flush();
8696
8697 // if we flush then deferred done are now deferred stable
8698 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
8699 deferred_done.end());
8700 deferred_done.clear();
8701 }
8702 utime_t after_flush = ceph_clock_now();
8703
8704 // we will use one final transaction to force a sync
8705 KeyValueDB::Transaction synct = db->get_transaction();
8706
8707 // increase {nid,blobid}_max? note that this covers both the
8708 // case where we are approaching the max and the case we passed
8709 // it. in either case, we increase the max in the earliest txn
8710 // we submit.
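// For example (illustrative values): with bluestore_nid_prealloc = 1024,
// once nid_last climbs past nid_max - 512 we record a new nid_max of
// nid_last + 1024 in the earliest transaction of this batch.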
8711 uint64_t new_nid_max = 0, new_blobid_max = 0;
8712 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
8713 KeyValueDB::Transaction t =
8714 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8715 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
8716 bufferlist bl;
8717 ::encode(new_nid_max, bl);
8718 t->set(PREFIX_SUPER, "nid_max", bl);
8719 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
8720 }
8721 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
8722 KeyValueDB::Transaction t =
8723 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8724 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
8725 bufferlist bl;
8726 ::encode(new_blobid_max, bl);
8727 t->set(PREFIX_SUPER, "blobid_max", bl);
8728 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
8729 }
8730
8731 for (auto txc : kv_committing) {
8732 if (txc->state == TransContext::STATE_KV_QUEUED) {
8733 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8734 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8735 assert(r == 0);
8736 _txc_applied_kv(txc);
8737 --txc->osr->kv_committing_serially;
8738 txc->state = TransContext::STATE_KV_SUBMITTED;
8739 if (txc->osr->kv_submitted_waiters) {
8740 std::lock_guard<std::mutex> l(txc->osr->qlock);
8741 if (txc->osr->_is_all_kv_submitted()) {
8742 txc->osr->qcond.notify_all();
8743 }
8744 }
8745
8746 } else {
8747 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8748 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8749 }
8750 if (txc->had_ios) {
8751 --txc->osr->txc_with_unstable_io;
8752 }
8753 }
8754
8755 // release throttle *before* we commit. this allows new ops
8756 // to be prepared and enter the pipeline while we are waiting on
8757 // the kv commit sync/flush. then hopefully on the next
8758 // iteration there will already be ops awake. otherwise, we
8759 // end up going to sleep, and then wake up when the very first
8760 // transaction is ready for commit.
8761 throttle_bytes.put(costs);
8762
8763 PExtentVector bluefs_gift_extents;
8764 if (bluefs &&
8765 after_flush - bluefs_last_balance >
8766 cct->_conf->bluestore_bluefs_balance_interval) {
8767 bluefs_last_balance = after_flush;
8768 int r = _balance_bluefs_freespace(&bluefs_gift_extents);
8769 assert(r >= 0);
8770 if (r > 0) {
8771 for (auto& p : bluefs_gift_extents) {
8772 bluefs_extents.insert(p.offset, p.length);
8773 }
8774 bufferlist bl;
8775 ::encode(bluefs_extents, bl);
8776 dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
8777 << bluefs_extents << std::dec << dendl;
8778 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
8779 }
8780 }
8781
8782 // cleanup sync deferred keys
8783 for (auto b : deferred_stable) {
8784 for (auto& txc : b->txcs) {
8785 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
8786 if (!wt.released.empty()) {
8787 // kraken replay compat only
8788 txc.released = wt.released;
8789 dout(10) << __func__ << " deferred txn has released "
8790 << txc.released
8791 << " (we just upgraded from kraken) on " << &txc << dendl;
8792 _txc_finalize_kv(&txc, synct);
8793 }
8794 // cleanup the deferred
8795 string key;
8796 get_deferred_key(wt.seq, &key);
8797 synct->rm_single_key(PREFIX_DEFERRED, key);
8798 }
8799 }
8800
8801 // submit synct synchronously (block and wait for it to commit)
8802 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
8803 assert(r == 0);
8804
8805 if (new_nid_max) {
8806 nid_max = new_nid_max;
8807 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
8808 }
8809 if (new_blobid_max) {
8810 blobid_max = new_blobid_max;
8811 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
8812 }
8813
8814 {
8815 utime_t finish = ceph_clock_now();
8816 utime_t dur_flush = after_flush - start;
8817 utime_t dur_kv = finish - after_flush;
8818 utime_t dur = finish - start;
8819 dout(20) << __func__ << " committed " << kv_committing.size()
8820 << " cleaned " << deferred_stable.size()
8821 << " in " << dur
8822 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
8823 << dendl;
8824 logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
8825 logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
8826 logger->tinc(l_bluestore_kv_lat, dur);
8827 }
8828
8829 if (bluefs) {
8830 if (!bluefs_gift_extents.empty()) {
8831 _commit_bluefs_freespace(bluefs_gift_extents);
8832 }
8833 for (auto p = bluefs_extents_reclaiming.begin();
8834 p != bluefs_extents_reclaiming.end();
8835 ++p) {
8836 dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
8837 << p.get_start() << "~" << p.get_len() << std::dec
8838 << dendl;
8839 alloc->release(p.get_start(), p.get_len());
8840 }
8841 bluefs_extents_reclaiming.clear();
8842 }
8843
8844 {
8845 std::unique_lock<std::mutex> m(kv_finalize_lock);
8846 if (kv_committing_to_finalize.empty()) {
8847 kv_committing_to_finalize.swap(kv_committing);
8848 } else {
8849 kv_committing_to_finalize.insert(
8850 kv_committing_to_finalize.end(),
8851 kv_committing.begin(),
8852 kv_committing.end());
8853 kv_committing.clear();
8854 }
8855 if (deferred_stable_to_finalize.empty()) {
8856 deferred_stable_to_finalize.swap(deferred_stable);
8857 } else {
8858 deferred_stable_to_finalize.insert(
8859 deferred_stable_to_finalize.end(),
8860 deferred_stable.begin(),
8861 deferred_stable.end());
8862 deferred_stable.clear();
8863 }
8864 kv_finalize_cond.notify_one();
8865 }
8866
8867 l.lock();
8868 // previously deferred "done" are now "stable" by virtue of this
8869 // commit cycle.
8870 deferred_stable_queue.swap(deferred_done);
8871 }
8872 }
8873 dout(10) << __func__ << " finish" << dendl;
8874 kv_sync_started = false;
8875 }
8876
8877 void BlueStore::_kv_finalize_thread()
8878 {
8879 deque<TransContext*> kv_committed;
8880 deque<DeferredBatch*> deferred_stable;
8881 dout(10) << __func__ << " start" << dendl;
8882 std::unique_lock<std::mutex> l(kv_finalize_lock);
8883 assert(!kv_finalize_started);
8884 kv_finalize_started = true;
8885 kv_finalize_cond.notify_all();
8886 while (true) {
8887 assert(kv_committed.empty());
8888 assert(deferred_stable.empty());
8889 if (kv_committing_to_finalize.empty() &&
8890 deferred_stable_to_finalize.empty()) {
8891 if (kv_finalize_stop)
8892 break;
8893 dout(20) << __func__ << " sleep" << dendl;
8894 kv_finalize_cond.wait(l);
8895 dout(20) << __func__ << " wake" << dendl;
8896 } else {
8897 kv_committed.swap(kv_committing_to_finalize);
8898 deferred_stable.swap(deferred_stable_to_finalize);
8899 l.unlock();
8900 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
8901 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
8902
8903 while (!kv_committed.empty()) {
8904 TransContext *txc = kv_committed.front();
8905 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8906 _txc_state_proc(txc);
8907 kv_committed.pop_front();
8908 }
8909
8910 for (auto b : deferred_stable) {
8911 auto p = b->txcs.begin();
8912 while (p != b->txcs.end()) {
8913 TransContext *txc = &*p;
8914 p = b->txcs.erase(p); // unlink here because
8915 _txc_state_proc(txc); // this may destroy txc
8916 }
8917 delete b;
8918 }
8919 deferred_stable.clear();
8920
8921 if (!deferred_aggressive) {
8922 if (deferred_queue_size >= deferred_batch_ops.load() ||
8923 throttle_deferred_bytes.past_midpoint()) {
8924 deferred_try_submit();
8925 }
8926 }
8927
8928 // this is as good a place as any ...
8929 _reap_collections();
8930
8931 l.lock();
8932 }
8933 }
8934 dout(10) << __func__ << " finish" << dendl;
8935 kv_finalize_started = false;
8936 }
8937
8938 bluestore_deferred_op_t *BlueStore::_get_deferred_op(
8939 TransContext *txc, OnodeRef o)
8940 {
8941 if (!txc->deferred_txn) {
8942 txc->deferred_txn = new bluestore_deferred_transaction_t;
8943 }
8944 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
8945 return &txc->deferred_txn->ops.back();
8946 }
8947
8948 void BlueStore::_deferred_queue(TransContext *txc)
8949 {
8950 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
8951 deferred_lock.lock();
8952 if (!txc->osr->deferred_pending &&
8953 !txc->osr->deferred_running) {
8954 deferred_queue.push_back(*txc->osr);
8955 }
8956 if (!txc->osr->deferred_pending) {
8957 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
8958 }
8959 ++deferred_queue_size;
8960 txc->osr->deferred_pending->txcs.push_back(*txc);
8961 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
8962 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
8963 const auto& op = *opi;
8964 assert(op.op == bluestore_deferred_op_t::OP_WRITE);
8965 bufferlist::const_iterator p = op.data.begin();
8966 for (auto e : op.extents) {
8967 txc->osr->deferred_pending->prepare_write(
8968 cct, wt.seq, e.offset, e.length, p);
8969 }
8970 }
8971 if (deferred_aggressive &&
8972 !txc->osr->deferred_running) {
8973 _deferred_submit_unlock(txc->osr.get());
8974 } else {
8975 deferred_lock.unlock();
8976 }
8977 }
8978
8979 void BlueStore::deferred_try_submit()
8980 {
8981 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
8982 << deferred_queue_size << " txcs" << dendl;
8983 std::lock_guard<std::mutex> l(deferred_lock);
8984 vector<OpSequencerRef> osrs;
8985 osrs.reserve(deferred_queue.size());
8986 for (auto& osr : deferred_queue) {
8987 osrs.push_back(&osr);
8988 }
8989 for (auto& osr : osrs) {
8990 if (osr->deferred_pending) {
8991 if (!osr->deferred_running) {
8992 _deferred_submit_unlock(osr.get());
8993 deferred_lock.lock();
8994 } else {
8995 dout(20) << __func__ << " osr " << osr << " already has running"
8996 << dendl;
8997 }
8998 } else {
8999 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
9000 }
9001 }
9002 }
9003
9004 void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
9005 {
9006 dout(10) << __func__ << " osr " << osr
9007 << " " << osr->deferred_pending->iomap.size() << " ios pending "
9008 << dendl;
9009 assert(osr->deferred_pending);
9010 assert(!osr->deferred_running);
9011
9012 auto b = osr->deferred_pending;
9013 deferred_queue_size -= b->seq_bytes.size();
9014 assert(deferred_queue_size >= 0);
9015
9016 osr->deferred_running = osr->deferred_pending;
9017 osr->deferred_pending = nullptr;
9018
9019 uint64_t start = 0, pos = 0;
9020 bufferlist bl;
9021 auto i = b->iomap.begin();
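// Walk the offset-sorted iomap, coalescing physically contiguous extents
// into a single buffer and issuing one aio_write per contiguous run (plus
// a final one when we reach the end of the map).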
9022 while (true) {
9023 if (i == b->iomap.end() || i->first != pos) {
9024 if (bl.length()) {
9025 dout(20) << __func__ << " write 0x" << std::hex
9026 << start << "~" << bl.length()
9027 << " crc " << bl.crc32c(-1) << std::dec << dendl;
9028 if (!g_conf->bluestore_debug_omit_block_device_write) {
9029 logger->inc(l_bluestore_deferred_write_ops);
9030 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
9031 int r = bdev->aio_write(start, bl, &b->ioc, false);
9032 assert(r == 0);
9033 }
9034 }
9035 if (i == b->iomap.end()) {
9036 break;
9037 }
9038 start = 0;
9039 pos = i->first;
9040 bl.clear();
9041 }
9042 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
9043 << std::hex << pos << "~" << i->second.bl.length() << std::dec
9044 << dendl;
9045 if (!bl.length()) {
9046 start = pos;
9047 }
9048 pos += i->second.bl.length();
9049 bl.claim_append(i->second.bl);
9050 ++i;
9051 }
9052
9053 deferred_lock.unlock();
9054 bdev->aio_submit(&b->ioc);
9055 }
9056
9057 struct C_DeferredTrySubmit : public Context {
9058 BlueStore *store;
9059 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
9060 void finish(int r) override {
9061 store->deferred_try_submit();
9062 }
9063 };
9064
9065 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
9066 {
9067 dout(10) << __func__ << " osr " << osr << dendl;
9068 assert(osr->deferred_running);
9069 DeferredBatch *b = osr->deferred_running;
9070
9071 {
9072 std::lock_guard<std::mutex> l(deferred_lock);
9073 assert(osr->deferred_running == b);
9074 osr->deferred_running = nullptr;
9075 if (!osr->deferred_pending) {
9076 dout(20) << __func__ << " dequeueing" << dendl;
9077 auto q = deferred_queue.iterator_to(*osr);
9078 deferred_queue.erase(q);
9079 } else if (deferred_aggressive) {
9080 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
9081 deferred_finisher.queue(new C_DeferredTrySubmit(this));
9082 } else {
9083 dout(20) << __func__ << " leaving queued, more pending" << dendl;
9084 }
9085 }
9086
9087 {
9088 uint64_t costs = 0;
9089 std::lock_guard<std::mutex> l2(osr->qlock);
9090 for (auto& i : b->txcs) {
9091 TransContext *txc = &i;
9092 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
9093 costs += txc->cost;
9094 }
9095 osr->qcond.notify_all();
9096 throttle_deferred_bytes.put(costs);
9097 std::lock_guard<std::mutex> l(kv_lock);
9098 deferred_done_queue.emplace_back(b);
9099 }
9100
9101 // in the normal case, do not bother waking up the kv thread; it will
9102 // catch us on the next commit anyway.
9103 if (deferred_aggressive) {
9104 std::lock_guard<std::mutex> l(kv_lock);
9105 kv_cond.notify_one();
9106 }
9107 }
9108
9109 int BlueStore::_deferred_replay()
9110 {
9111 dout(10) << __func__ << " start" << dendl;
9112 OpSequencerRef osr = new OpSequencer(cct, this);
9113 int count = 0;
9114 int r = 0;
9115 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
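// Each surviving PREFIX_DEFERRED record is a deferred transaction whose
// kv commit landed but whose raw block writes may not have; decode it and
// re-inject it as a txc already in STATE_KV_DONE so it flows through the
// normal deferred queue/submit/cleanup path.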
9116 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
9117 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
9118 << dendl;
9119 bluestore_deferred_transaction_t *deferred_txn =
9120 new bluestore_deferred_transaction_t;
9121 bufferlist bl = it->value();
9122 bufferlist::iterator p = bl.begin();
9123 try {
9124 ::decode(*deferred_txn, p);
9125 } catch (buffer::error& e) {
9126 derr << __func__ << " failed to decode deferred txn "
9127 << pretty_binary_string(it->key()) << dendl;
9128 delete deferred_txn;
9129 r = -EIO;
9130 goto out;
9131 }
9132 TransContext *txc = _txc_create(osr.get());
9133 txc->deferred_txn = deferred_txn;
9134 txc->state = TransContext::STATE_KV_DONE;
9135 _txc_state_proc(txc);
9136 }
9137 out:
9138 dout(20) << __func__ << " draining osr" << dendl;
9139 _osr_drain_all();
9140 osr->discard();
9141 dout(10) << __func__ << " completed " << count << " events" << dendl;
9142 return r;
9143 }
9144
9145 // ---------------------------
9146 // transactions
9147
9148 int BlueStore::queue_transactions(
9149 Sequencer *posr,
9150 vector<Transaction>& tls,
9151 TrackedOpRef op,
9152 ThreadPool::TPHandle *handle)
9153 {
9154 FUNCTRACE();
9155 Context *onreadable;
9156 Context *ondisk;
9157 Context *onreadable_sync;
9158 ObjectStore::Transaction::collect_contexts(
9159 tls, &onreadable, &ondisk, &onreadable_sync);
9160
9161 if (cct->_conf->objectstore_blackhole) {
9162 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
9163 << dendl;
9164 delete ondisk;
9165 delete onreadable;
9166 delete onreadable_sync;
9167 return 0;
9168 }
9169 utime_t start = ceph_clock_now();
9170 // set up the sequencer
9171 OpSequencer *osr;
9172 assert(posr);
9173 if (posr->p) {
9174 osr = static_cast<OpSequencer *>(posr->p.get());
9175 dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
9176 } else {
9177 osr = new OpSequencer(cct, this);
9178 osr->parent = posr;
9179 posr->p = osr;
9180 dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
9181 }
9182
9183 // prepare
9184 TransContext *txc = _txc_create(osr);
9185 txc->onreadable = onreadable;
9186 txc->onreadable_sync = onreadable_sync;
9187 txc->oncommit = ondisk;
9188
9189 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
9190 (*p).set_osr(osr);
9191 txc->bytes += (*p).get_num_bytes();
9192 _txc_add_transaction(txc, &(*p));
9193 }
9194 _txc_calc_cost(txc);
9195
9196 _txc_write_nodes(txc, txc->t);
9197
9198 // journal deferred items
9199 if (txc->deferred_txn) {
9200 txc->deferred_txn->seq = ++deferred_seq;
9201 bufferlist bl;
9202 ::encode(*txc->deferred_txn, bl);
9203 string key;
9204 get_deferred_key(txc->deferred_txn->seq, &key);
9205 txc->t->set(PREFIX_DEFERRED, key, bl);
9206 }
9207
9208 _txc_finalize_kv(txc, txc->t);
9209 if (handle)
9210 handle->suspend_tp_timeout();
9211
9212 utime_t tstart = ceph_clock_now();
9213 throttle_bytes.get(txc->cost);
9214 if (txc->deferred_txn) {
9215 // ensure we do not block here because of deferred writes
9216 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
9217 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
9218 << dendl;
9219 ++deferred_aggressive;
9220 deferred_try_submit();
9221 {
9222 // wake up any previously finished deferred events
9223 std::lock_guard<std::mutex> l(kv_lock);
9224 kv_cond.notify_one();
9225 }
9226 throttle_deferred_bytes.get(txc->cost);
9227 --deferred_aggressive;
9228 }
9229 }
9230 utime_t tend = ceph_clock_now();
9231
9232 if (handle)
9233 handle->reset_tp_timeout();
9234
9235 logger->inc(l_bluestore_txc);
9236
9237 // execute (start)
9238 _txc_state_proc(txc);
9239
9240 logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
9241 logger->tinc(l_bluestore_throttle_lat, tend - tstart);
9242 return 0;
9243 }
9244
9245 void BlueStore::_txc_aio_submit(TransContext *txc)
9246 {
9247 dout(10) << __func__ << " txc " << txc << dendl;
9248 bdev->aio_submit(&txc->ioc);
9249 }
9250
9251 void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
9252 {
9253 Transaction::iterator i = t->begin();
9254
9255 _dump_transaction(t);
9256
9257 vector<CollectionRef> cvec(i.colls.size());
9258 unsigned j = 0;
9259 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
9260 ++p, ++j) {
9261 cvec[j] = _get_collection(*p);
9262 }
9263 vector<OnodeRef> ovec(i.objects.size());
9264
9265 for (int pos = 0; i.have_op(); ++pos) {
9266 Transaction::Op *op = i.decode_op();
9267 int r = 0;
9268
9269 // no coll or obj
9270 if (op->op == Transaction::OP_NOP)
9271 continue;
9272
9273 // collection operations
9274 CollectionRef &c = cvec[op->cid];
9275 switch (op->op) {
9276 case Transaction::OP_RMCOLL:
9277 {
9278 const coll_t &cid = i.get_cid(op->cid);
9279 r = _remove_collection(txc, cid, &c);
9280 if (!r)
9281 continue;
9282 }
9283 break;
9284
9285 case Transaction::OP_MKCOLL:
9286 {
9287 assert(!c);
9288 const coll_t &cid = i.get_cid(op->cid);
9289 r = _create_collection(txc, cid, op->split_bits, &c);
9290 if (!r)
9291 continue;
9292 }
9293 break;
9294
9295 case Transaction::OP_SPLIT_COLLECTION:
9296 assert(0 == "deprecated");
9297 break;
9298
9299 case Transaction::OP_SPLIT_COLLECTION2:
9300 {
9301 uint32_t bits = op->split_bits;
9302 uint32_t rem = op->split_rem;
9303 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
9304 if (!r)
9305 continue;
9306 }
9307 break;
9308
9309 case Transaction::OP_COLL_HINT:
9310 {
9311 uint32_t type = op->hint_type;
9312 bufferlist hint;
9313 i.decode_bl(hint);
9314 bufferlist::iterator hiter = hint.begin();
9315 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
9316 uint32_t pg_num;
9317 uint64_t num_objs;
9318 ::decode(pg_num, hiter);
9319 ::decode(num_objs, hiter);
9320 dout(10) << __func__ << " collection hint expected_num_objects is a no-op,"
9321 << " pg_num " << pg_num << " num_objects " << num_objs
9322 << dendl;
9323 } else {
9324 // Ignore the hint
9325 dout(10) << __func__ << " unknown collection hint " << type << dendl;
9326 }
9327 continue;
9328 }
9329 break;
9330
9331 case Transaction::OP_COLL_SETATTR:
9332 r = -EOPNOTSUPP;
9333 break;
9334
9335 case Transaction::OP_COLL_RMATTR:
9336 r = -EOPNOTSUPP;
9337 break;
9338
9339 case Transaction::OP_COLL_RENAME:
9340 assert(0 == "not implemented");
9341 break;
9342 }
9343 if (r < 0) {
9344 derr << __func__ << " error " << cpp_strerror(r)
9345 << " not handled on operation " << op->op
9346 << " (op " << pos << ", counting from 0)" << dendl;
9347 _dump_transaction(t, 0);
9348 assert(0 == "unexpected error");
9349 }
9350
9351 // these operations implicitly create the object
9352 bool create = false;
9353 if (op->op == Transaction::OP_TOUCH ||
9354 op->op == Transaction::OP_WRITE ||
9355 op->op == Transaction::OP_ZERO) {
9356 create = true;
9357 }
9358
9359 // object operations
9360 RWLock::WLocker l(c->lock);
9361 OnodeRef &o = ovec[op->oid];
9362 if (!o) {
9363 ghobject_t oid = i.get_oid(op->oid);
9364 o = c->get_onode(oid, create);
9365 }
9366 if (!create && (!o || !o->exists)) {
9367 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
9368 << i.get_oid(op->oid) << dendl;
9369 r = -ENOENT;
9370 goto endop;
9371 }
9372
9373 switch (op->op) {
9374 case Transaction::OP_TOUCH:
9375 r = _touch(txc, c, o);
9376 break;
9377
9378 case Transaction::OP_WRITE:
9379 {
9380 uint64_t off = op->off;
9381 uint64_t len = op->len;
9382 uint32_t fadvise_flags = i.get_fadvise_flags();
9383 bufferlist bl;
9384 i.decode_bl(bl);
9385 r = _write(txc, c, o, off, len, bl, fadvise_flags);
9386 }
9387 break;
9388
9389 case Transaction::OP_ZERO:
9390 {
9391 uint64_t off = op->off;
9392 uint64_t len = op->len;
9393 r = _zero(txc, c, o, off, len);
9394 }
9395 break;
9396
9397 case Transaction::OP_TRIMCACHE:
9398 {
9399 // deprecated, no-op
9400 }
9401 break;
9402
9403 case Transaction::OP_TRUNCATE:
9404 {
9405 uint64_t off = op->off;
9406 r = _truncate(txc, c, o, off);
9407 }
9408 break;
9409
9410 case Transaction::OP_REMOVE:
9411 {
9412 r = _remove(txc, c, o);
9413 }
9414 break;
9415
9416 case Transaction::OP_SETATTR:
9417 {
9418 string name = i.decode_string();
9419 bufferptr bp;
9420 i.decode_bp(bp);
9421 r = _setattr(txc, c, o, name, bp);
9422 }
9423 break;
9424
9425 case Transaction::OP_SETATTRS:
9426 {
9427 map<string, bufferptr> aset;
9428 i.decode_attrset(aset);
9429 r = _setattrs(txc, c, o, aset);
9430 }
9431 break;
9432
9433 case Transaction::OP_RMATTR:
9434 {
9435 string name = i.decode_string();
9436 r = _rmattr(txc, c, o, name);
9437 }
9438 break;
9439
9440 case Transaction::OP_RMATTRS:
9441 {
9442 r = _rmattrs(txc, c, o);
9443 }
9444 break;
9445
9446 case Transaction::OP_CLONE:
9447 {
9448 OnodeRef& no = ovec[op->dest_oid];
9449 if (!no) {
9450 const ghobject_t& noid = i.get_oid(op->dest_oid);
9451 no = c->get_onode(noid, true);
9452 }
9453 r = _clone(txc, c, o, no);
9454 }
9455 break;
9456
9457 case Transaction::OP_CLONERANGE:
9458 assert(0 == "deprecated");
9459 break;
9460
9461 case Transaction::OP_CLONERANGE2:
9462 {
9463 OnodeRef& no = ovec[op->dest_oid];
9464 if (!no) {
9465 const ghobject_t& noid = i.get_oid(op->dest_oid);
9466 no = c->get_onode(noid, true);
9467 }
9468 uint64_t srcoff = op->off;
9469 uint64_t len = op->len;
9470 uint64_t dstoff = op->dest_off;
9471 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
9472 }
9473 break;
9474
9475 case Transaction::OP_COLL_ADD:
9476 assert(0 == "not implemented");
9477 break;
9478
9479 case Transaction::OP_COLL_REMOVE:
9480 assert(0 == "not implemented");
9481 break;
9482
9483 case Transaction::OP_COLL_MOVE:
9484 assert(0 == "deprecated");
9485 break;
9486
9487 case Transaction::OP_COLL_MOVE_RENAME:
9488 case Transaction::OP_TRY_RENAME:
9489 {
9490 assert(op->cid == op->dest_cid);
9491 const ghobject_t& noid = i.get_oid(op->dest_oid);
9492 OnodeRef& no = ovec[op->dest_oid];
9493 if (!no) {
9494 no = c->get_onode(noid, false);
9495 }
9496 r = _rename(txc, c, o, no, noid);
9497 }
9498 break;
9499
9500 case Transaction::OP_OMAP_CLEAR:
9501 {
9502 r = _omap_clear(txc, c, o);
9503 }
9504 break;
9505 case Transaction::OP_OMAP_SETKEYS:
9506 {
9507 bufferlist aset_bl;
9508 i.decode_attrset_bl(&aset_bl);
9509 r = _omap_setkeys(txc, c, o, aset_bl);
9510 }
9511 break;
9512 case Transaction::OP_OMAP_RMKEYS:
9513 {
9514 bufferlist keys_bl;
9515 i.decode_keyset_bl(&keys_bl);
9516 r = _omap_rmkeys(txc, c, o, keys_bl);
9517 }
9518 break;
9519 case Transaction::OP_OMAP_RMKEYRANGE:
9520 {
9521 string first, last;
9522 first = i.decode_string();
9523 last = i.decode_string();
9524 r = _omap_rmkey_range(txc, c, o, first, last);
9525 }
9526 break;
9527 case Transaction::OP_OMAP_SETHEADER:
9528 {
9529 bufferlist bl;
9530 i.decode_bl(bl);
9531 r = _omap_setheader(txc, c, o, bl);
9532 }
9533 break;
9534
9535 case Transaction::OP_SETALLOCHINT:
9536 {
9537 r = _set_alloc_hint(txc, c, o,
9538 op->expected_object_size,
9539 op->expected_write_size,
9540 op->alloc_hint_flags);
9541 }
9542 break;
9543
9544 default:
9545 derr << __func__ << " bad op " << op->op << dendl;
9546 ceph_abort();
9547 }
9548
9549 endop:
9550 if (r < 0) {
9551 bool ok = false;
9552
9553 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
9554 op->op == Transaction::OP_CLONE ||
9555 op->op == Transaction::OP_CLONERANGE2 ||
9556 op->op == Transaction::OP_COLL_ADD ||
9557 op->op == Transaction::OP_SETATTR ||
9558 op->op == Transaction::OP_SETATTRS ||
9559 op->op == Transaction::OP_RMATTR ||
9560 op->op == Transaction::OP_OMAP_SETKEYS ||
9561 op->op == Transaction::OP_OMAP_RMKEYS ||
9562 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
9563 op->op == Transaction::OP_OMAP_SETHEADER))
9564 // -ENOENT is usually okay
9565 ok = true;
9566 if (r == -ENODATA)
9567 ok = true;
9568
9569 if (!ok) {
9570 const char *msg = "unexpected error code";
9571
9572 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
9573 op->op == Transaction::OP_CLONE ||
9574 op->op == Transaction::OP_CLONERANGE2))
9575 msg = "ENOENT on clone suggests osd bug";
9576
9577 if (r == -ENOSPC)
9578 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
9579 // by partially applying transactions.
9580 msg = "ENOSPC from bluestore, misconfigured cluster";
9581
9582 if (r == -ENOTEMPTY) {
9583 msg = "ENOTEMPTY suggests garbage data in osd data dir";
9584 }
9585
9586 derr << __func__ << " error " << cpp_strerror(r)
9587 << " not handled on operation " << op->op
9588 << " (op " << pos << ", counting from 0)"
9589 << dendl;
9590 derr << msg << dendl;
9591 _dump_transaction(t, 0);
9592 assert(0 == "unexpected error");
9593 }
9594 }
9595 }
9596 }
9597
9598
9599
9600 // -----------------
9601 // write operations
9602
9603 int BlueStore::_touch(TransContext *txc,
9604 CollectionRef& c,
9605 OnodeRef &o)
9606 {
9607 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
9608 int r = 0;
9609 _assign_nid(txc, o);
9610 txc->write_onode(o);
9611 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
9612 return r;
9613 }
9614
9615 void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
9616 {
9617 if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
9618 return;
9619 dout(log_level) << __func__ << " " << o << " " << o->oid
9620 << " nid " << o->onode.nid
9621 << " size 0x" << std::hex << o->onode.size
9622 << " (" << std::dec << o->onode.size << ")"
9623 << " expected_object_size " << o->onode.expected_object_size
9624 << " expected_write_size " << o->onode.expected_write_size
9625 << " in " << o->onode.extent_map_shards.size() << " shards"
9626 << ", " << o->extent_map.spanning_blob_map.size()
9627 << " spanning blobs"
9628 << dendl;
9629 for (auto p = o->onode.attrs.begin();
9630 p != o->onode.attrs.end();
9631 ++p) {
9632 dout(log_level) << __func__ << " attr " << p->first
9633 << " len " << p->second.length() << dendl;
9634 }
9635 _dump_extent_map(o->extent_map, log_level);
9636 }
9637
9638 void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
9639 {
9640 uint64_t pos = 0;
9641 for (auto& s : em.shards) {
9642 dout(log_level) << __func__ << " shard " << *s.shard_info
9643 << (s.loaded ? " (loaded)" : "")
9644 << (s.dirty ? " (dirty)" : "")
9645 << dendl;
9646 }
9647 for (auto& e : em.extent_map) {
9648 dout(log_level) << __func__ << " " << e << dendl;
9649 assert(e.logical_offset >= pos);
9650 pos = e.logical_offset + e.length;
9651 const bluestore_blob_t& blob = e.blob->get_blob();
9652 if (blob.has_csum()) {
9653 vector<uint64_t> v;
9654 unsigned n = blob.get_csum_count();
9655 for (unsigned i = 0; i < n; ++i)
9656 v.push_back(blob.get_csum_item(i));
9657 dout(log_level) << __func__ << " csum: " << std::hex << v << std::dec
9658 << dendl;
9659 }
9660 std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
9661 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
9662 dout(log_level) << __func__ << " 0x" << std::hex << i.first
9663 << "~" << i.second->length << std::dec
9664 << " " << *i.second << dendl;
9665 }
9666 }
9667 }
9668
9669 void BlueStore::_dump_transaction(Transaction *t, int log_level)
9670 {
9671 dout(log_level) << " transaction dump:\n";
9672 JSONFormatter f(true);
9673 f.open_object_section("transaction");
9674 t->dump(&f);
9675 f.close_section();
9676 f.flush(*_dout);
9677 *_dout << dendl;
9678 }
9679
9680 void BlueStore::_pad_zeros(
9681 bufferlist *bl, uint64_t *offset,
9682 uint64_t chunk_size)
9683 {
9684 auto length = bl->length();
9685 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
9686 << " chunk_size 0x" << chunk_size << std::dec << dendl;
9687 dout(40) << "before:\n";
9688 bl->hexdump(*_dout);
9689 *_dout << dendl;
9690 // front
9691 size_t front_pad = *offset % chunk_size;
9692 size_t back_pad = 0;
9693 size_t pad_count = 0;
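// e.g. (illustrative values): with *offset = 0x1234, length = 0x100 and
// chunk_size = 0x1000, front_pad = 0x234 and back_pad = 0xccc, so the
// result is one fully aligned chunk: *offset = 0x1000, length = 0x1000.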
9694 if (front_pad) {
9695 size_t front_copy = MIN(chunk_size - front_pad, length);
9696 bufferptr z = buffer::create_page_aligned(chunk_size);
9697 z.zero(0, front_pad, false);
9698 pad_count += front_pad;
9699 bl->copy(0, front_copy, z.c_str() + front_pad);
9700 if (front_copy + front_pad < chunk_size) {
9701 back_pad = chunk_size - (length + front_pad);
9702 z.zero(front_pad + length, back_pad, false);
9703 pad_count += back_pad;
9704 }
9705 bufferlist old, t;
9706 old.swap(*bl);
9707 t.substr_of(old, front_copy, length - front_copy);
9708 bl->append(z);
9709 bl->claim_append(t);
9710 *offset -= front_pad;
9711 length += pad_count;
9712 }
9713
9714 // back
9715 uint64_t end = *offset + length;
9716 unsigned back_copy = end % chunk_size;
9717 if (back_copy) {
9718 assert(back_pad == 0);
9719 back_pad = chunk_size - back_copy;
9720 assert(back_copy <= length);
9721 bufferptr tail(chunk_size);
9722 bl->copy(length - back_copy, back_copy, tail.c_str());
9723 tail.zero(back_copy, back_pad, false);
9724 bufferlist old;
9725 old.swap(*bl);
9726 bl->substr_of(old, 0, length - back_copy);
9727 bl->append(tail);
9728 length += back_pad;
9729 pad_count += back_pad;
9730 }
9731 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
9732 << back_pad << " on front/back, now 0x" << *offset << "~"
9733 << length << std::dec << dendl;
9734 dout(40) << "after:\n";
9735 bl->hexdump(*_dout);
9736 *_dout << dendl;
9737 if (pad_count)
9738 logger->inc(l_bluestore_write_pad_bytes, pad_count);
9739 assert(bl->length() == length);
9740 }
9741
9742 void BlueStore::_do_write_small(
9743 TransContext *txc,
9744 CollectionRef &c,
9745 OnodeRef o,
9746 uint64_t offset, uint64_t length,
9747 bufferlist::iterator& blp,
9748 WriteContext *wctx)
9749 {
9750 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9751 << std::dec << dendl;
9752 assert(length < min_alloc_size);
9753 uint64_t end_offs = offset + length;
9754
9755 logger->inc(l_bluestore_write_small);
9756 logger->inc(l_bluestore_write_small_bytes, length);
9757
9758 bufferlist bl;
9759 blp.copy(length, bl);
9760
9761 // Look for an existing mutable blob we can use.
9762 auto begin = o->extent_map.extent_map.begin();
9763 auto end = o->extent_map.extent_map.end();
9764 auto ep = o->extent_map.seek_lextent(offset);
9765 if (ep != begin) {
9766 --ep;
9767 if (ep->blob_end() <= offset) {
9768 ++ep;
9769 }
9770 }
9771 auto prev_ep = ep;
9772 if (prev_ep != begin) {
9773 --prev_ep;
9774 } else {
9775 prev_ep = end; // to avoid this extent check as it's a duplicate
9776 }
9777
9778 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9779 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9780 uint32_t alloc_len = min_alloc_size;
9781 auto offset0 = P2ALIGN(offset, alloc_len);
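// (P2ALIGN rounds down to an alignment boundary, P2PHASE gives the offset
//  within the unit, and P2NPHASE the distance to the next boundary; e.g.
//  P2ALIGN(0x1234, 0x1000) == 0x1000, P2PHASE(0x1234, 0x1000) == 0x234,
//  P2NPHASE(0x1234, 0x1000) == 0xdcc, all assuming power-of-two alignment)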
9782
9783 bool any_change;
9784
9785 // search suitable extent in both forward and reverse direction in
9786 // [offset - target_max_blob_size, offset + target_max_blob_size] range
9787 // then check if blob can be reused via can_reuse_blob func or apply
9788 // direct/deferred write (the latter for extents including or higher
9789 // than 'offset' only).
9790 do {
9791 any_change = false;
9792
9793 if (ep != end && ep->logical_offset < offset + max_bsize) {
9794 BlobRef b = ep->blob;
9795 auto bstart = ep->blob_start();
9796 dout(20) << __func__ << " considering " << *b
9797 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9798 if (bstart >= end_offs) {
9799 dout(20) << __func__ << " ignoring distant " << *b << dendl;
9800 } else if (!b->get_blob().is_mutable()) {
9801 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
9802 } else if (ep->logical_offset % min_alloc_size !=
9803 ep->blob_offset % min_alloc_size) {
9804 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
9805 } else {
9806 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9807 // can we pad our head/tail out with zeros?
9808 uint64_t head_pad, tail_pad;
9809 head_pad = P2PHASE(offset, chunk_size);
9810 tail_pad = P2NPHASE(end_offs, chunk_size);
9811 if (head_pad || tail_pad) {
9812 o->extent_map.fault_range(db, offset - head_pad,
9813 end_offs - offset + head_pad + tail_pad);
9814 }
9815 if (head_pad &&
9816 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
9817 head_pad = 0;
9818 }
9819 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
9820 tail_pad = 0;
9821 }
9822
9823 uint64_t b_off = offset - head_pad - bstart;
9824 uint64_t b_len = length + head_pad + tail_pad;
9825
9826 // direct write into unused blocks of an existing mutable blob?
9827 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
9828 b->get_blob().get_ondisk_length() >= b_off + b_len &&
9829 b->get_blob().is_unused(b_off, b_len) &&
9830 b->get_blob().is_allocated(b_off, b_len)) {
9831 _apply_padding(head_pad, tail_pad, bl);
9832
9833 dout(20) << __func__ << " write to unused 0x" << std::hex
9834 << b_off << "~" << b_len
9835 << " pad 0x" << head_pad << " + 0x" << tail_pad
9836 << std::dec << " of mutable " << *b << dendl;
9837 _buffer_cache_write(txc, b, b_off, bl,
9838 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9839
9840 if (!g_conf->bluestore_debug_omit_block_device_write) {
9841 if (b_len <= prefer_deferred_size) {
9842 dout(20) << __func__ << " deferring small 0x" << std::hex
9843 << b_len << std::dec << " unused write via deferred" << dendl;
9844 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9845 op->op = bluestore_deferred_op_t::OP_WRITE;
9846 b->get_blob().map(
9847 b_off, b_len,
9848 [&](uint64_t offset, uint64_t length) {
9849 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9850 return 0;
9851 });
9852 op->data = bl;
9853 } else {
9854 b->get_blob().map_bl(
9855 b_off, bl,
9856 [&](uint64_t offset, bufferlist& t) {
9857 bdev->aio_write(offset, t,
9858 &txc->ioc, wctx->buffered);
9859 });
9860 }
9861 }
9862 b->dirty_blob().calc_csum(b_off, bl);
9863 dout(20) << __func__ << " lex old " << *ep << dendl;
9864 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
9865 b,
9866 &wctx->old_extents);
9867 b->dirty_blob().mark_used(le->blob_offset, le->length);
9868 txc->statfs_delta.stored() += le->length;
9869 dout(20) << __func__ << " lex " << *le << dendl;
9870 logger->inc(l_bluestore_write_small_unused);
9871 return;
9872 }
9873 // read some data to fill out the chunk?
9874 uint64_t head_read = P2PHASE(b_off, chunk_size);
9875 uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
9876 if ((head_read || tail_read) &&
9877 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
9878 head_read + tail_read < min_alloc_size) {
9879 b_off -= head_read;
9880 b_len += head_read + tail_read;
9881
9882 } else {
9883 head_read = tail_read = 0;
9884 }
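// (i.e. an unaligned overwrite becomes a small read-modify-write: we read
//  back the surrounding bytes so the deferred write below is chunk-aligned,
//  provided the extra head+tail read stays under min_alloc_size)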
9885
9886 // chunk-aligned deferred overwrite?
9887 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
9888 b_off % chunk_size == 0 &&
9889 b_len % chunk_size == 0 &&
9890 b->get_blob().is_allocated(b_off, b_len)) {
9891
9892 _apply_padding(head_pad, tail_pad, bl);
9893
9894 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
9895 << " and tail 0x" << tail_read << std::dec << dendl;
9896 if (head_read) {
9897 bufferlist head_bl;
9898 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
9899 head_bl, 0);
9900 assert(r >= 0 && r <= (int)head_read);
9901 size_t zlen = head_read - r;
9902 if (zlen) {
9903 head_bl.append_zero(zlen);
9904 logger->inc(l_bluestore_write_pad_bytes, zlen);
9905 }
9906 bl.claim_prepend(head_bl);
9907 logger->inc(l_bluestore_write_penalty_read_ops);
9908 }
9909 if (tail_read) {
9910 bufferlist tail_bl;
9911 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
9912 tail_bl, 0);
9913 assert(r >= 0 && r <= (int)tail_read);
9914 size_t zlen = tail_read - r;
9915 if (zlen) {
9916 tail_bl.append_zero(zlen);
9917 logger->inc(l_bluestore_write_pad_bytes, zlen);
9918 }
9919 bl.claim_append(tail_bl);
9920 logger->inc(l_bluestore_write_penalty_read_ops);
9921 }
9922 logger->inc(l_bluestore_write_small_pre_read);
9923
9924 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9925 op->op = bluestore_deferred_op_t::OP_WRITE;
9926 _buffer_cache_write(txc, b, b_off, bl,
9927 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9928
9929 int r = b->get_blob().map(
9930 b_off, b_len,
9931 [&](uint64_t offset, uint64_t length) {
9932 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9933 return 0;
9934 });
9935 assert(r == 0);
9936 if (b->get_blob().csum_type) {
9937 b->dirty_blob().calc_csum(b_off, bl);
9938 }
9939 op->data.claim(bl);
9940 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
9941 << b_len << std::dec << " of mutable " << *b
9942 << " at " << op->extents << dendl;
9943 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
9944 b, &wctx->old_extents);
9945 b->dirty_blob().mark_used(le->blob_offset, le->length);
9946 txc->statfs_delta.stored() += le->length;
9947 dout(20) << __func__ << " lex " << *le << dendl;
9948 logger->inc(l_bluestore_write_small_deferred);
9949 return;
9950 }
9951 // try to reuse blob if we can
9952 if (b->can_reuse_blob(min_alloc_size,
9953 max_bsize,
9954 offset0 - bstart,
9955 &alloc_len)) {
9956 assert(alloc_len == min_alloc_size); // expecting data to always
9957 // fit into the reused blob
9958 // Need to check for pending writes desiring to
9959 // reuse the same pextent. The rationale is that during GC two chunks
9960 // from garbage blobs (compressed?) can share logical space within the same
9961 // AU. That in turn might be caused by an unaligned len in clone_range2.
9962 // Hence the second write would fail when attempting to reuse the blob in
9963 // _do_alloc_write().
9964 if (!wctx->has_conflict(b,
9965 offset0,
9966 offset0 + alloc_len,
9967 min_alloc_size)) {
9968
9969 // we can't reuse pad_head/pad_tail since they might be truncated
9970 // due to existing extents
9971 uint64_t b_off = offset - bstart;
9972 uint64_t b_off0 = b_off;
9973 _pad_zeros(&bl, &b_off0, chunk_size);
9974
9975 dout(20) << __func__ << " reuse blob " << *b << std::hex
9976 << " (0x" << b_off0 << "~" << bl.length() << ")"
9977 << " (0x" << b_off << "~" << length << ")"
9978 << std::dec << dendl;
9979
9980 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9981 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9982 false, false);
9983 logger->inc(l_bluestore_write_small_unused);
9984 return;
9985 }
9986 }
9987 }
9988 ++ep;
9989 any_change = true;
9990 } // if (ep != end && ep->logical_offset < offset + max_bsize)
9991
9992 // check extent for reuse in reverse order
9993 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9994 BlobRef b = prev_ep->blob;
9995 auto bstart = prev_ep->blob_start();
9996 dout(20) << __func__ << " considering " << *b
9997 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9998 if (b->can_reuse_blob(min_alloc_size,
9999 max_bsize,
10000 offset0 - bstart,
10001 &alloc_len)) {
10002 assert(alloc_len == min_alloc_size); // expecting data to always
10003 // fit into the reused blob
10004 // Need to check for pending writes desiring to
10005 // reuse the same pextent. The rationale is that during GC two chunks
10006 // from garbage blobs (compressed?) can share logical space within the same
10007 // AU. That in turn might be caused by an unaligned len in clone_range2.
10008 // Hence the second write would fail when attempting to reuse the blob in
10009 // _do_alloc_write().
10010 if (!wctx->has_conflict(b,
10011 offset0,
10012 offset0 + alloc_len,
10013 min_alloc_size)) {
10014
10015 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
10016 uint64_t b_off = offset - bstart;
10017 uint64_t b_off0 = b_off;
10018 _pad_zeros(&bl, &b_off0, chunk_size);
10019
10020 dout(20) << __func__ << " reuse blob " << *b << std::hex
10021 << " (0x" << b_off0 << "~" << bl.length() << ")"
10022 << " (0x" << b_off << "~" << length << ")"
10023 << std::dec << dendl;
10024
10025 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
10026 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
10027 false, false);
10028 logger->inc(l_bluestore_write_small_unused);
10029 return;
10030 }
10031 }
10032 if (prev_ep != begin) {
10033 --prev_ep;
10034 any_change = true;
10035 } else {
10036 prev_ep = end; // to avoid useless first extent re-check
10037 }
10038 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
10039 } while (any_change);
10040
10041 // new blob.
10042
10043 BlobRef b = c->new_blob();
10044 uint64_t b_off = P2PHASE(offset, alloc_len);
10045 uint64_t b_off0 = b_off;
10046 _pad_zeros(&bl, &b_off0, block_size);
10047 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
10048 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
10049 logger->inc(l_bluestore_write_small_new);
10050
10051 return;
10052 }
10053
10054 void BlueStore::_do_write_big(
10055 TransContext *txc,
10056 CollectionRef &c,
10057 OnodeRef o,
10058 uint64_t offset, uint64_t length,
10059 bufferlist::iterator& blp,
10060 WriteContext *wctx)
10061 {
10062 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
10063 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
10064 << " compress " << (int)wctx->compress
10065 << dendl;
10066 logger->inc(l_bluestore_write_big);
10067 logger->inc(l_bluestore_write_big_bytes, length);
10068 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
10069 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
10070 while (length > 0) {
10071 bool new_blob = false;
10072 uint32_t l = MIN(max_bsize, length);
10073 BlobRef b;
10074 uint32_t b_off = 0;
10075
10076 // attempting to reuse an existing blob
10077 if (!wctx->compress) {
10078 // look for an existing mutable blob we can reuse
10079 auto begin = o->extent_map.extent_map.begin();
10080 auto end = o->extent_map.extent_map.end();
10081 auto ep = o->extent_map.seek_lextent(offset);
10082 auto prev_ep = ep;
10083 if (prev_ep != begin) {
10084 --prev_ep;
10085 } else {
10086 prev_ep = end; // to avoid this extent check as it's a duplicate
10087 }
10088 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
10089 // search suitable extent in both forward and reverse direction in
10090 // [offset - target_max_blob_size, offset + target_max_blob_size] range
10091 // then check if blob can be reused via can_reuse_blob func.
10092 bool any_change;
10093 do {
10094 any_change = false;
10095 if (ep != end && ep->logical_offset < offset + max_bsize) {
10096 if (offset >= ep->blob_start() &&
10097 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
10098 offset - ep->blob_start(),
10099 &l)) {
10100 b = ep->blob;
10101 b_off = offset - ep->blob_start();
10102 prev_ep = end; // to avoid check below
10103 dout(20) << __func__ << " reuse blob " << *b << std::hex
10104 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
10105 } else {
10106 ++ep;
10107 any_change = true;
10108 }
10109 }
10110
10111 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
10112 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
10113 offset - prev_ep->blob_start(),
10114 &l)) {
10115 b = prev_ep->blob;
10116 b_off = offset - prev_ep->blob_start();
10117 dout(20) << __func__ << " reuse blob " << *b << std::hex
10118 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
10119 } else if (prev_ep != begin) {
10120 --prev_ep;
10121 any_change = true;
10122 } else {
10123 prev_ep = end; // to avoid useless first extent re-check
10124 }
10125 }
10126 } while (b == nullptr && any_change);
10127 }
10128 if (b == nullptr) {
10129 b = c->new_blob();
10130 b_off = 0;
10131 new_blob = true;
10132 }
10133
10134 bufferlist t;
10135 blp.copy(l, t);
10136 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
10137 offset += l;
10138 length -= l;
10139 logger->inc(l_bluestore_write_big_blobs);
10140 }
10141 }
10142
10143 int BlueStore::_do_alloc_write(
10144 TransContext *txc,
10145 CollectionRef coll,
10146 OnodeRef o,
10147 WriteContext *wctx)
10148 {
10149 dout(20) << __func__ << " txc " << txc
10150 << " " << wctx->writes.size() << " blobs"
10151 << dendl;
10152 if (wctx->writes.empty()) {
10153 return 0;
10154 }
10155
10156 CompressorRef c;
10157 double crr = 0;
10158 if (wctx->compress) {
10159 c = select_option(
10160 "compression_algorithm",
10161 compressor,
10162 [&]() {
10163 string val;
10164 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
10165 CompressorRef cp = compressor;
10166 if (!cp || cp->get_type_name() != val) {
10167 cp = Compressor::create(cct, val);
10168 }
10169 return boost::optional<CompressorRef>(cp);
10170 }
10171 return boost::optional<CompressorRef>();
10172 }
10173 );
10174
10175 crr = select_option(
10176 "compression_required_ratio",
10177 cct->_conf->bluestore_compression_required_ratio,
10178 [&]() {
10179 double val;
10180 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
10181 return boost::optional<double>(val);
10182 }
10183 return boost::optional<double>();
10184 }
10185 );
10186 }
10187
10188 // checksum
10189 int csum = csum_type.load();
10190 csum = select_option(
10191 "csum_type",
10192 csum,
10193 [&]() {
10194 int val;
10195 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
10196 return boost::optional<int>(val);
10197 }
10198 return boost::optional<int>();
10199 }
10200 );
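// (the select_option calls above follow a common pattern: the per-pool
//  option, when present, overrides the global default passed as the second
//  argument; the lambda returns an empty boost::optional when the pool has
//  no override)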
10201
10202 // compress (as needed) and calc needed space
10203 uint64_t need = 0;
10204 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
10205 for (auto& wi : wctx->writes) {
10206 if (c && wi.blob_length > min_alloc_size) {
10207 utime_t start = ceph_clock_now();
10208
10209 // compress
10210 assert(wi.b_off == 0);
10211 assert(wi.blob_length == wi.bl.length());
10212
10213 // FIXME: memory alignment here is bad
10214 bufferlist t;
10215 int r = c->compress(wi.bl, t);
10216 assert(r == 0);
10217
10218 bluestore_compression_header_t chdr;
10219 chdr.type = c->get_type();
10220 chdr.length = t.length();
10221 ::encode(chdr, wi.compressed_bl);
10222 wi.compressed_bl.claim_append(t);
10223
10224 wi.compressed_len = wi.compressed_bl.length();
10225 uint64_t newlen = P2ROUNDUP(wi.compressed_len, min_alloc_size);
10226 uint64_t want_len_raw = wi.blob_length * crr;
10227 uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
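// e.g. (illustrative values, assuming min_alloc_size = 0x4000, crr = 0.875):
// for a 0x10000 blob, want_len_raw = 0xe000 rounds up to want_len = 0x10000;
// a compressed length of 0x9000 rounds up to newlen = 0xc000, which is
// <= want_len and < blob_length, so the compressed copy is kept below.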
10228 if (newlen <= want_len && newlen < wi.blob_length) {
10229 // Cool. We compressed at least as much as we were hoping to.
10230 // pad out to min_alloc_size
10231 wi.compressed_bl.append_zero(newlen - wi.compressed_len);
10232 logger->inc(l_bluestore_write_pad_bytes, newlen - wi.compressed_len);
10233 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
10234 << " -> 0x" << wi.compressed_len << " => 0x" << newlen
10235 << " with " << c->get_type()
10236 << std::dec << dendl;
10237 txc->statfs_delta.compressed() += wi.compressed_len;
10238 txc->statfs_delta.compressed_original() += wi.blob_length;
10239 txc->statfs_delta.compressed_allocated() += newlen;
10240 logger->inc(l_bluestore_compress_success_count);
10241 wi.compressed = true;
10242 need += newlen;
10243 } else {
10244 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
10245 << " compressed to 0x" << wi.compressed_len << " -> 0x" << newlen
10246 << " with " << c->get_type()
10247 << ", which is more than required 0x" << want_len_raw
10248 << " -> 0x" << want_len
10249 << ", leaving uncompressed"
10250 << std::dec << dendl;
10251 logger->inc(l_bluestore_compress_rejected_count);
10252 need += wi.blob_length;
10253 }
10254 logger->tinc(l_bluestore_compress_lat,
10255 ceph_clock_now() - start);
10256 } else {
10257 need += wi.blob_length;
10258 }
10259 }
10260 int r = alloc->reserve(need);
10261 if (r < 0) {
10262 derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
10263 << dendl;
10264 return r;
10265 }
10266 AllocExtentVector prealloc;
10267 prealloc.reserve(2 * wctx->writes.size());
10268 int prealloc_left = 0;
10269 prealloc_left = alloc->allocate(
10270 need, min_alloc_size, need,
10271 0, &prealloc);
10272 assert(prealloc_left == (int64_t)need);
10273 dout(20) << __func__ << " prealloc " << prealloc << dendl;
10274 auto prealloc_pos = prealloc.begin();
10275
10276 for (auto& wi : wctx->writes) {
10277 BlobRef b = wi.b;
10278 bluestore_blob_t& dblob = b->dirty_blob();
10279 uint64_t b_off = wi.b_off;
10280 bufferlist *l = &wi.bl;
10281 uint64_t final_length = wi.blob_length;
10282 uint64_t csum_length = wi.blob_length;
10283 unsigned csum_order = block_size_order;
10284 if (wi.compressed) {
10285 final_length = wi.compressed_bl.length();
10286 csum_length = final_length;
10287 csum_order = ctz(csum_length);
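// (ctz = count trailing zeros: the accepted compressed buffer was padded to
//  a multiple of min_alloc_size above, so this picks the largest power-of-two
//  chunk size that evenly divides the compressed length)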
10288 l = &wi.compressed_bl;
10289 dblob.set_compressed(wi.blob_length, wi.compressed_len);
10290 } else if (wi.new_blob) {
10291 // initialize newly created blob only
10292 assert(dblob.is_mutable());
10293 if (l->length() != wi.blob_length) {
10294 // hrm, maybe we could do better here, but let's not bother.
10295 dout(20) << __func__ << " forcing csum_order to block_size_order "
10296 << block_size_order << dendl;
10297 csum_order = block_size_order;
10298 } else {
10299 csum_order = std::min(wctx->csum_order, ctz(l->length()));
10300 }
10301 // try to align blob with max_blob_size to improve
10302 // its reuse ratio, e.g. in case of reverse write
10303 uint32_t suggested_boff =
10304 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
10305 if ((suggested_boff % (1 << csum_order)) == 0 &&
10306 suggested_boff + final_length <= max_bsize &&
10307 suggested_boff > b_off) {
10308 dout(20) << __func__ << " forcing blob_offset to 0x"
10309 << std::hex << suggested_boff << std::dec << dendl;
10310 assert(suggested_boff >= b_off);
10311 csum_length += suggested_boff - b_off;
10312 b_off = suggested_boff;
10313 }
10314 if (csum != Checksummer::CSUM_NONE) {
10315 dout(20) << __func__ << " initialize csum setting for new blob " << *b
10316 << " csum_type " << Checksummer::get_csum_type_string(csum)
10317 << " csum_order " << csum_order
10318 << " csum_length 0x" << std::hex << csum_length << std::dec
10319 << dendl;
10320 dblob.init_csum(csum, csum_order, csum_length);
10321 }
10322 }
10323
10324 AllocExtentVector extents;
10325 int64_t left = final_length;
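// carve this blob's extents out of the shared preallocation, in order; a
// prealloc extent larger than what this blob still needs is split and the
// remainder is carried over to the next blob.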
10326 while (left > 0) {
10327 assert(prealloc_left > 0);
10328 if (prealloc_pos->length <= left) {
10329 prealloc_left -= prealloc_pos->length;
10330 left -= prealloc_pos->length;
10331 txc->statfs_delta.allocated() += prealloc_pos->length;
10332 extents.push_back(*prealloc_pos);
10333 ++prealloc_pos;
10334 } else {
10335 extents.emplace_back(prealloc_pos->offset, left);
10336 prealloc_pos->offset += left;
10337 prealloc_pos->length -= left;
10338 prealloc_left -= left;
10339 txc->statfs_delta.allocated() += left;
10340 left = 0;
10341 break;
10342 }
10343 }
10344 for (auto& p : extents) {
10345 txc->allocated.insert(p.offset, p.length);
10346 }
10347 dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
10348
10349 dout(20) << __func__ << " blob " << *b << dendl;
10350 if (dblob.has_csum()) {
10351 dblob.calc_csum(b_off, *l);
10352 }
10353
10354 if (wi.mark_unused) {
10355 auto b_end = b_off + wi.bl.length();
10356 if (b_off) {
10357 dblob.add_unused(0, b_off);
10358 }
10359 if (b_end < wi.blob_length) {
10360 dblob.add_unused(b_end, wi.blob_length - b_end);
10361 }
10362 }
10363
10364 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
10365 b_off + (wi.b_off0 - wi.b_off),
10366 wi.length0,
10367 wi.b,
10368 nullptr);
10369 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
10370 txc->statfs_delta.stored() += le->length;
10371 dout(20) << __func__ << " lex " << *le << dendl;
10372 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
10373 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
10374
10375 // queue io
10376 if (!g_conf->bluestore_debug_omit_block_device_write) {
10377 if (l->length() <= prefer_deferred_size.load()) {
10378 dout(20) << __func__ << " deferring small 0x" << std::hex
10379 << l->length() << std::dec << " write via deferred" << dendl;
10380 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
10381 op->op = bluestore_deferred_op_t::OP_WRITE;
10382 int r = b->get_blob().map(
10383 b_off, l->length(),
10384 [&](uint64_t offset, uint64_t length) {
10385 op->extents.emplace_back(bluestore_pextent_t(offset, length));
10386 return 0;
10387 });
10388 assert(r == 0);
10389 op->data = *l;
10390 } else {
10391 b->get_blob().map_bl(
10392 b_off, *l,
10393 [&](uint64_t offset, bufferlist& t) {
10394 bdev->aio_write(offset, t, &txc->ioc, false);
10395 });
10396 }
10397 }
10398 }
10399 assert(prealloc_pos == prealloc.end());
10400 assert(prealloc_left == 0);
10401 return 0;
10402 }
10403
10404 void BlueStore::_wctx_finish(
10405 TransContext *txc,
10406 CollectionRef& c,
10407 OnodeRef o,
10408 WriteContext *wctx,
10409 set<SharedBlob*> *maybe_unshared_blobs)
10410 {
10411 auto oep = wctx->old_extents.begin();
10412 while (oep != wctx->old_extents.end()) {
10413 auto &lo = *oep;
10414 oep = wctx->old_extents.erase(oep);
10415 dout(20) << __func__ << " lex_old " << lo.e << dendl;
10416 BlobRef b = lo.e.blob;
10417 const bluestore_blob_t& blob = b->get_blob();
10418 if (blob.is_compressed()) {
10419 if (lo.blob_empty) {
10420 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
10421 }
10422 txc->statfs_delta.compressed_original() -= lo.e.length;
10423 }
10424 auto& r = lo.r;
10425 txc->statfs_delta.stored() -= lo.e.length;
10426 if (!r.empty()) {
10427 dout(20) << __func__ << " blob release " << r << dendl;
10428 if (blob.is_shared()) {
10429 PExtentVector final;
10430 c->load_shared_blob(b->shared_blob);
10431 for (auto e : r) {
10432 b->shared_blob->put_ref(
10433 e.offset, e.length, &final,
10434 b->is_referenced() ? nullptr : maybe_unshared_blobs);
10435 }
10436 dout(20) << __func__ << " shared_blob release " << final
10437 << " from " << *b->shared_blob << dendl;
10438 txc->write_shared_blob(b->shared_blob);
10439 r.clear();
10440 r.swap(final);
10441 }
10442 }
10443 // we can't invalidate our logical extents as we drop them because
10444 // other lextents (either in our onode or others) may still
10445 // reference them. but we can throw out anything that is no
10446 // longer allocated. Note that this will leave behind edge bits
10447 // that are no longer referenced but not deallocated (until they
10448 // age out of the cache naturally).
10449 b->discard_unallocated(c.get());
10450 for (auto e : r) {
10451 dout(20) << __func__ << " release " << e << dendl;
10452 txc->released.insert(e.offset, e.length);
10453 txc->statfs_delta.allocated() -= e.length;
10454 if (blob.is_compressed()) {
10455 txc->statfs_delta.compressed_allocated() -= e.length;
10456 }
10457 }
10458 delete &lo;
10459 if (b->is_spanning() && !b->is_referenced()) {
10460 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
10461 << dendl;
10462 o->extent_map.spanning_blob_map.erase(b->id);
10463 }
10464 }
10465 }
10466
10467 void BlueStore::_do_write_data(
10468 TransContext *txc,
10469 CollectionRef& c,
10470 OnodeRef o,
10471 uint64_t offset,
10472 uint64_t length,
10473 bufferlist& bl,
10474 WriteContext *wctx)
10475 {
10476 uint64_t end = offset + length;
10477 bufferlist::iterator p = bl.begin();
10478
10479 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
10480 (length != min_alloc_size)) {
10481 // we fall within the same block
10482 _do_write_small(txc, c, o, offset, length, p, wctx);
10483 } else {
10484 uint64_t head_offset, head_length;
10485 uint64_t middle_offset, middle_length;
10486 uint64_t tail_offset, tail_length;
10487
10488 head_offset = offset;
10489 head_length = P2NPHASE(offset, min_alloc_size);
10490
10491 tail_offset = P2ALIGN(end, min_alloc_size);
10492 tail_length = P2PHASE(end, min_alloc_size);
10493
10494 middle_offset = head_offset + head_length;
10495 middle_length = length - head_length - tail_length;
10496
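// e.g. (illustrative values, min_alloc_size = 0x1000): a write at
// 0x2345~0x10000 splits into a small head 0x2345~0xcbb, a big middle
// 0x3000~0xf000 and a small tail 0x12000~0x345.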
10497 if (head_length) {
10498 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
10499 }
10500
10501 if (middle_length) {
10502 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
10503 }
10504
10505 if (tail_length) {
10506 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
10507 }
10508 }
10509 }
10510
10511 void BlueStore::_choose_write_options(
10512 CollectionRef& c,
10513 OnodeRef o,
10514 uint32_t fadvise_flags,
10515 WriteContext *wctx)
10516 {
10517 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10518 dout(20) << __func__ << " will do buffered write" << dendl;
10519 wctx->buffered = true;
10520 } else if (cct->_conf->bluestore_default_buffered_write &&
10521 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10522 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10523 dout(20) << __func__ << " defaulting to buffered write" << dendl;
10524 wctx->buffered = true;
10525 }
10526
10527 // apply basic csum block size
10528 wctx->csum_order = block_size_order;
10529
10530 // compression parameters
10531 unsigned alloc_hints = o->onode.alloc_hint_flags;
10532 auto cm = select_option(
10533 "compression_mode",
10534 comp_mode.load(),
10535 [&]() {
10536 string val;
10537 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
10538 return boost::optional<Compressor::CompressionMode>(
10539 Compressor::get_comp_mode_type(val));
10540 }
10541 return boost::optional<Compressor::CompressionMode>();
10542 }
10543 );
10544
10545 wctx->compress = (cm != Compressor::COMP_NONE) &&
10546 ((cm == Compressor::COMP_FORCE) ||
10547 (cm == Compressor::COMP_AGGRESSIVE &&
10548 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
10549 (cm == Compressor::COMP_PASSIVE &&
10550 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
10551
10552 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
10553 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
10554 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
10555 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
10556 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
10557
10558 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
10559
10560 if (o->onode.expected_write_size) {
10561 wctx->csum_order = std::max(min_alloc_size_order,
10562 (uint8_t)ctz(o->onode.expected_write_size));
10563 } else {
10564 wctx->csum_order = min_alloc_size_order;
10565 }
10566
10567 if (wctx->compress) {
10568 wctx->target_blob_size = select_option(
10569 "compression_max_blob_size",
10570 comp_max_blob_size.load(),
10571 [&]() {
10572 int val;
10573 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
10574 return boost::optional<uint64_t>((uint64_t)val);
10575 }
10576 return boost::optional<uint64_t>();
10577 }
10578 );
10579 }
10580 } else {
10581 if (wctx->compress) {
10582 wctx->target_blob_size = select_option(
10583 "compression_min_blob_size",
10584 comp_min_blob_size.load(),
10585 [&]() {
10586 int val;
10587 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
10588 return boost::optional<uint64_t>((uint64_t)val);
10589 }
10590 return boost::optional<uint64_t>();
10591 }
10592 );
10593 }
10594 }
10595
10596 uint64_t max_bsize = max_blob_size.load();
10597 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
10598 wctx->target_blob_size = max_bsize;
10599 }
10600
10601 // set the min blob size floor at 2x the min_alloc_size, or else we
10602 // won't be able to allocate a smaller extent for the compressed
10603 // data.
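// (e.g. with min_alloc_size = 0x10000 a 0x10000 target blob could never save
// space: even a perfectly compressed copy still needs one full allocation
// unit, whereas a 0x20000 blob compressed 2:1 frees one AU)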
10604 if (wctx->compress &&
10605 wctx->target_blob_size < min_alloc_size * 2) {
10606 wctx->target_blob_size = min_alloc_size * 2;
10607 }
10608
10609 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
10610 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
10611 << std::dec << dendl;
10612 }
10613
10614 int BlueStore::_do_gc(
10615 TransContext *txc,
10616 CollectionRef& c,
10617 OnodeRef o,
10618 const GarbageCollector& gc,
10619 const WriteContext& wctx,
10620 uint64_t *dirty_start,
10621 uint64_t *dirty_end)
10622 {
10623 auto& extents_to_collect = gc.get_extents_to_collect();
10624
10625 bool dirty_range_updated = false;
10626 WriteContext wctx_gc;
10627 wctx_gc.fork(wctx); // make a clone for garbage collection
10628
10629 for (auto it = extents_to_collect.begin();
10630 it != extents_to_collect.end();
10631 ++it) {
10632 bufferlist bl;
10633 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
10634 assert(r == (int)it->length);
10635
10636 o->extent_map.fault_range(db, it->offset, it->length);
10637 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
10638 logger->inc(l_bluestore_gc_merged, it->length);
10639
10640 if (*dirty_start > it->offset) {
10641 *dirty_start = it->offset;
10642 dirty_range_updated = true;
10643 }
10644
10645 if (*dirty_end < it->offset + it->length) {
10646 *dirty_end = it->offset + it->length;
10647 dirty_range_updated = true;
10648 }
10649 }
10650 if (dirty_range_updated) {
10651 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
10652 }
10653
10654 dout(30) << __func__ << " alloc write" << dendl;
10655 int r = _do_alloc_write(txc, c, o, &wctx_gc);
10656 if (r < 0) {
10657 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10658 << dendl;
10659 return r;
10660 }
10661
10662 _wctx_finish(txc, c, o, &wctx_gc);
10663 return 0;
10664 }
10665
10666 int BlueStore::_do_write(
10667 TransContext *txc,
10668 CollectionRef& c,
10669 OnodeRef o,
10670 uint64_t offset,
10671 uint64_t length,
10672 bufferlist& bl,
10673 uint32_t fadvise_flags)
10674 {
10675 int r = 0;
10676
10677 dout(20) << __func__
10678 << " " << o->oid
10679 << " 0x" << std::hex << offset << "~" << length
10680 << " - have 0x" << o->onode.size
10681 << " (" << std::dec << o->onode.size << ")"
10682 << " bytes"
10683 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
10684 << dendl;
10685 _dump_onode(o);
10686
10687 if (length == 0) {
10688 return 0;
10689 }
10690
10691 uint64_t end = offset + length;
10692
10693 GarbageCollector gc(c->store->cct);
10694 int64_t benefit;
10695 auto dirty_start = offset;
10696 auto dirty_end = end;
10697
10698 WriteContext wctx;
10699 _choose_write_options(c, o, fadvise_flags, &wctx);
10700 o->extent_map.fault_range(db, offset, length);
10701 _do_write_data(txc, c, o, offset, length, bl, &wctx);
10702 r = _do_alloc_write(txc, c, o, &wctx);
10703 if (r < 0) {
10704 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10705 << dendl;
10706 goto out;
10707 }
10708
10709 // NB: _wctx_finish() will empty old_extents
10710 // so we must do gc estimation before that
10711 benefit = gc.estimate(offset,
10712 length,
10713 o->extent_map,
10714 wctx.old_extents,
10715 min_alloc_size);
10716
10717 _wctx_finish(txc, c, o, &wctx);
10718 if (end > o->onode.size) {
10719 dout(20) << __func__ << " extending size to 0x" << std::hex << end
10720 << std::dec << dendl;
10721 o->onode.size = end;
10722 }
10723
10724 if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
10725 if (!gc.get_extents_to_collect().empty()) {
10726 dout(20) << __func__ << " perform garbage collection, "
10727 << "expected benefit = " << benefit << " AUs" << dendl;
10728 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
10729 if (r < 0) {
10730 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
10731 << dendl;
10732 goto out;
10733 }
10734 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
10735 << "~" << dirty_end - dirty_start << std::dec << dendl;
10736 }
10737 }
10738 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
10739 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
10740
10741 r = 0;
10742
10743 out:
10744 return r;
10745 }
10746
10747 int BlueStore::_write(TransContext *txc,
10748 CollectionRef& c,
10749 OnodeRef& o,
10750 uint64_t offset, size_t length,
10751 bufferlist& bl,
10752 uint32_t fadvise_flags)
10753 {
10754 dout(15) << __func__ << " " << c->cid << " " << o->oid
10755 << " 0x" << std::hex << offset << "~" << length << std::dec
10756 << dendl;
10757 int r = 0;
10758 if (offset + length >= OBJECT_MAX_SIZE) {
10759 r = -E2BIG;
10760 } else {
10761 _assign_nid(txc, o);
10762 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
10763 txc->write_onode(o);
10764 }
10765 dout(10) << __func__ << " " << c->cid << " " << o->oid
10766 << " 0x" << std::hex << offset << "~" << length << std::dec
10767 << " = " << r << dendl;
10768 return r;
10769 }
10770
10771 int BlueStore::_zero(TransContext *txc,
10772 CollectionRef& c,
10773 OnodeRef& o,
10774 uint64_t offset, size_t length)
10775 {
10776 dout(15) << __func__ << " " << c->cid << " " << o->oid
10777 << " 0x" << std::hex << offset << "~" << length << std::dec
10778 << dendl;
10779 int r = 0;
10780 if (offset + length >= OBJECT_MAX_SIZE) {
10781 r = -E2BIG;
10782 } else {
10783 _assign_nid(txc, o);
10784 r = _do_zero(txc, c, o, offset, length);
10785 }
10786 dout(10) << __func__ << " " << c->cid << " " << o->oid
10787 << " 0x" << std::hex << offset << "~" << length << std::dec
10788 << " = " << r << dendl;
10789 return r;
10790 }
10791
10792 int BlueStore::_do_zero(TransContext *txc,
10793 CollectionRef& c,
10794 OnodeRef& o,
10795 uint64_t offset, size_t length)
10796 {
10797 dout(15) << __func__ << " " << c->cid << " " << o->oid
10798 << " 0x" << std::hex << offset << "~" << length << std::dec
10799 << dendl;
10800 int r = 0;
10801
10802 _dump_onode(o);
10803
10804 WriteContext wctx;
10805 o->extent_map.fault_range(db, offset, length);
10806 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10807 o->extent_map.dirty_range(offset, length);
10808 _wctx_finish(txc, c, o, &wctx);
10809
10810 if (length > 0 && offset + length > o->onode.size) {
10811 o->onode.size = offset + length;
10812 dout(20) << __func__ << " extending size to " << offset + length
10813 << dendl;
10814 }
10815 txc->write_onode(o);
10816
10817 dout(10) << __func__ << " " << c->cid << " " << o->oid
10818 << " 0x" << std::hex << offset << "~" << length << std::dec
10819 << " = " << r << dendl;
10820 return r;
10821 }
10822
10823 void BlueStore::_do_truncate(
10824 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
10825 set<SharedBlob*> *maybe_unshared_blobs)
10826 {
10827 dout(15) << __func__ << " " << c->cid << " " << o->oid
10828 << " 0x" << std::hex << offset << std::dec << dendl;
10829
10830 _dump_onode(o, 30);
10831
10832 if (offset == o->onode.size)
10833 return;
10834
10835 if (offset < o->onode.size) {
10836 WriteContext wctx;
10837 uint64_t length = o->onode.size - offset;
10838 o->extent_map.fault_range(db, offset, length);
10839 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10840 o->extent_map.dirty_range(offset, length);
10841 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
10842
10843 // if we have shards past EOF, ask for a reshard
10844 if (!o->onode.extent_map_shards.empty() &&
10845 o->onode.extent_map_shards.back().offset >= offset) {
10846 dout(10) << __func__ << " request reshard past EOF" << dendl;
10847 if (offset) {
10848 o->extent_map.request_reshard(offset - 1, offset + length);
10849 } else {
10850 o->extent_map.request_reshard(0, length);
10851 }
10852 }
10853 }
10854
10855 o->onode.size = offset;
10856
10857 txc->write_onode(o);
10858 }
10859
10860 int BlueStore::_truncate(TransContext *txc,
10861 CollectionRef& c,
10862 OnodeRef& o,
10863 uint64_t offset)
10864 {
10865 dout(15) << __func__ << " " << c->cid << " " << o->oid
10866 << " 0x" << std::hex << offset << std::dec
10867 << dendl;
10868 int r = 0;
10869 if (offset >= OBJECT_MAX_SIZE) {
10870 r = -E2BIG;
10871 } else {
10872 _do_truncate(txc, c, o, offset);
10873 }
10874 dout(10) << __func__ << " " << c->cid << " " << o->oid
10875 << " 0x" << std::hex << offset << std::dec
10876 << " = " << r << dendl;
10877 return r;
10878 }
10879
10880 int BlueStore::_do_remove(
10881 TransContext *txc,
10882 CollectionRef& c,
10883 OnodeRef o)
10884 {
10885 set<SharedBlob*> maybe_unshared_blobs;
10886 bool is_gen = !o->oid.is_no_gen();
10887 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
10888 if (o->onode.has_omap()) {
10889 o->flush();
10890 _do_omap_clear(txc, o->onode.nid);
10891 }
10892 o->exists = false;
10893 string key;
10894 for (auto &s : o->extent_map.shards) {
10895 dout(20) << __func__ << " removing shard 0x" << std::hex
10896 << s.shard_info->offset << std::dec << dendl;
10897 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
10898 [&](const string& final_key) {
10899 txc->t->rmkey(PREFIX_OBJ, final_key);
10900 }
10901 );
10902 }
10903 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
10904 txc->removed(o);
10905 o->extent_map.clear();
10906 o->onode = bluestore_onode_t();
10907 _debug_obj_on_delete(o->oid);
10908
10909 if (!is_gen || maybe_unshared_blobs.empty()) {
10910 return 0;
10911 }
10912
10913 // see if we can unshare blobs still referenced by the head
10914 dout(10) << __func__ << " gen and maybe_unshared_blobs "
10915 << maybe_unshared_blobs << dendl;
10916 ghobject_t nogen = o->oid;
10917 nogen.generation = ghobject_t::NO_GEN;
10918 OnodeRef h = c->onode_map.lookup(nogen);
10919
10920 if (!h || !h->exists) {
10921 return 0;
10922 }
10923
10924 dout(20) << __func__ << " checking for unshareable blobs on " << h
10925 << " " << h->oid << dendl;
10926 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
10927 for (auto& e : h->extent_map.extent_map) {
10928 const bluestore_blob_t& b = e.blob->get_blob();
10929 SharedBlob *sb = e.blob->shared_blob.get();
10930 if (b.is_shared() &&
10931 sb->loaded &&
10932 maybe_unshared_blobs.count(sb)) {
10933 if (b.is_compressed()) {
10934 expect[sb].get(0, b.get_ondisk_length());
10935 } else {
10936 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
10937 expect[sb].get(off, len);
10938 return 0;
10939 });
10940 }
10941 }
10942 }
10943
10944 vector<SharedBlob*> unshared_blobs;
10945 unshared_blobs.reserve(maybe_unshared_blobs.size());
10946 for (auto& p : expect) {
10947 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
10948 if (p.first->persistent->ref_map == p.second) {
10949 SharedBlob *sb = p.first;
10950 dout(20) << __func__ << " unsharing " << *sb << dendl;
10951 unshared_blobs.push_back(sb);
10952 txc->unshare_blob(sb);
10953 uint64_t sbid = c->make_blob_unshared(sb);
10954 string key;
10955 get_shared_blob_key(sbid, &key);
10956 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
10957 }
10958 }
10959
10960 if (unshared_blobs.empty()) {
10961 return 0;
10962 }
10963
10964 for (auto& e : h->extent_map.extent_map) {
10965 const bluestore_blob_t& b = e.blob->get_blob();
10966 SharedBlob *sb = e.blob->shared_blob.get();
10967 if (b.is_shared() &&
10968 std::find(unshared_blobs.begin(), unshared_blobs.end(),
10969 sb) != unshared_blobs.end()) {
10970 dout(20) << __func__ << " unsharing " << e << dendl;
10971 bluestore_blob_t& blob = e.blob->dirty_blob();
10972 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
10973 h->extent_map.dirty_range(e.logical_offset, 1);
10974 }
10975 }
10976 txc->write_onode(h);
10977
10978 return 0;
10979 }
10980
10981 int BlueStore::_remove(TransContext *txc,
10982 CollectionRef& c,
10983 OnodeRef &o)
10984 {
10985 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10986 int r = _do_remove(txc, c, o);
10987 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10988 return r;
10989 }
10990
10991 int BlueStore::_setattr(TransContext *txc,
10992 CollectionRef& c,
10993 OnodeRef& o,
10994 const string& name,
10995 bufferptr& val)
10996 {
10997 dout(15) << __func__ << " " << c->cid << " " << o->oid
10998 << " " << name << " (" << val.length() << " bytes)"
10999 << dendl;
11000 int r = 0;
11001 if (val.is_partial()) {
11002 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
11003 val.length());
11004 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11005 } else {
11006 auto& b = o->onode.attrs[name.c_str()] = val;
11007 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11008 }
11009 txc->write_onode(o);
11010 dout(10) << __func__ << " " << c->cid << " " << o->oid
11011 << " " << name << " (" << val.length() << " bytes)"
11012 << " = " << r << dendl;
11013 return r;
11014 }
11015
11016 int BlueStore::_setattrs(TransContext *txc,
11017 CollectionRef& c,
11018 OnodeRef& o,
11019 const map<string,bufferptr>& aset)
11020 {
11021 dout(15) << __func__ << " " << c->cid << " " << o->oid
11022 << " " << aset.size() << " keys"
11023 << dendl;
11024 int r = 0;
11025 for (map<string,bufferptr>::const_iterator p = aset.begin();
11026 p != aset.end(); ++p) {
11027 if (p->second.is_partial()) {
11028 auto& b = o->onode.attrs[p->first.c_str()] =
11029 bufferptr(p->second.c_str(), p->second.length());
11030 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11031 } else {
11032 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
11033 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11034 }
11035 }
11036 txc->write_onode(o);
11037 dout(10) << __func__ << " " << c->cid << " " << o->oid
11038 << " " << aset.size() << " keys"
11039 << " = " << r << dendl;
11040 return r;
11041 }
11042
11043
11044 int BlueStore::_rmattr(TransContext *txc,
11045 CollectionRef& c,
11046 OnodeRef& o,
11047 const string& name)
11048 {
11049 dout(15) << __func__ << " " << c->cid << " " << o->oid
11050 << " " << name << dendl;
11051 int r = 0;
11052 auto it = o->onode.attrs.find(name.c_str());
11053 if (it == o->onode.attrs.end())
11054 goto out;
11055
11056 o->onode.attrs.erase(it);
11057 txc->write_onode(o);
11058
11059 out:
11060 dout(10) << __func__ << " " << c->cid << " " << o->oid
11061 << " " << name << " = " << r << dendl;
11062 return r;
11063 }
11064
11065 int BlueStore::_rmattrs(TransContext *txc,
11066 CollectionRef& c,
11067 OnodeRef& o)
11068 {
11069 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11070 int r = 0;
11071
11072 if (o->onode.attrs.empty())
11073 goto out;
11074
11075 o->onode.attrs.clear();
11076 txc->write_onode(o);
11077
11078 out:
11079 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11080 return r;
11081 }
11082
11083 void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
11084 {
11085 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
11086 string prefix, tail;
11087 get_omap_header(id, &prefix);
11088 get_omap_tail(id, &tail);
11089 it->lower_bound(prefix);
11090 while (it->valid()) {
11091 if (it->key() >= tail) {
11092 dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
11093 << dendl;
11094 break;
11095 }
11096 txc->t->rmkey(PREFIX_OMAP, it->key());
11097 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
11098 it->next();
11099 }
11100 }
11101
11102 int BlueStore::_omap_clear(TransContext *txc,
11103 CollectionRef& c,
11104 OnodeRef& o)
11105 {
11106 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11107 int r = 0;
11108 if (o->onode.has_omap()) {
11109 o->flush();
11110 _do_omap_clear(txc, o->onode.nid);
11111 o->onode.clear_omap_flag();
11112 txc->write_onode(o);
11113 }
11114 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11115 return r;
11116 }
11117
11118 int BlueStore::_omap_setkeys(TransContext *txc,
11119 CollectionRef& c,
11120 OnodeRef& o,
11121 bufferlist &bl)
11122 {
11123 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11124 int r;
11125 bufferlist::iterator p = bl.begin();
11126 __u32 num;
11127 if (!o->onode.has_omap()) {
11128 o->onode.set_omap_flag();
11129 txc->write_onode(o);
11130 } else {
11131 txc->note_modified_object(o);
11132 }
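  // omap data keys are <8-byte encoded nid> + '.' + <user key>; build the
  // 9-byte prefix once and re-truncate to it for every key below.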
11133 string final_key;
11134 _key_encode_u64(o->onode.nid, &final_key);
11135 final_key.push_back('.');
11136 ::decode(num, p);
11137 while (num--) {
11138 string key;
11139 bufferlist value;
11140 ::decode(key, p);
11141 ::decode(value, p);
11142 final_key.resize(9); // keep prefix
11143 final_key += key;
11144 dout(30) << __func__ << " " << pretty_binary_string(final_key)
11145 << " <- " << key << dendl;
11146 txc->t->set(PREFIX_OMAP, final_key, value);
11147 }
11148 r = 0;
11149 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11150 return r;
11151 }
11152
11153 int BlueStore::_omap_setheader(TransContext *txc,
11154 CollectionRef& c,
11155 OnodeRef &o,
11156 bufferlist& bl)
11157 {
11158 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11159 int r;
11160 string key;
11161 if (!o->onode.has_omap()) {
11162 o->onode.set_omap_flag();
11163 txc->write_onode(o);
11164 } else {
11165 txc->note_modified_object(o);
11166 }
11167 get_omap_header(o->onode.nid, &key);
11168 txc->t->set(PREFIX_OMAP, key, bl);
11169 r = 0;
11170 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11171 return r;
11172 }
11173
11174 int BlueStore::_omap_rmkeys(TransContext *txc,
11175 CollectionRef& c,
11176 OnodeRef& o,
11177 bufferlist& bl)
11178 {
11179 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11180 int r = 0;
11181 bufferlist::iterator p = bl.begin();
11182 __u32 num;
11183 string final_key;
11184
11185 if (!o->onode.has_omap()) {
11186 goto out;
11187 }
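  // final_key uses the same layout as in _omap_setkeys: 8-byte encoded nid
  // prefix, a '.' separator, then the user key.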
11188 _key_encode_u64(o->onode.nid, &final_key);
11189 final_key.push_back('.');
11190 ::decode(num, p);
11191 while (num--) {
11192 string key;
11193 ::decode(key, p);
11194 final_key.resize(9); // keep prefix
11195 final_key += key;
11196 dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
11197 << " <- " << key << dendl;
11198 txc->t->rmkey(PREFIX_OMAP, final_key);
11199 }
11200 txc->note_modified_object(o);
11201
11202 out:
11203 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11204 return r;
11205 }
11206
11207 int BlueStore::_omap_rmkey_range(TransContext *txc,
11208 CollectionRef& c,
11209 OnodeRef& o,
11210 const string& first, const string& last)
11211 {
11212 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11213 KeyValueDB::Iterator it;
11214 string key_first, key_last;
11215 int r = 0;
11216 if (!o->onode.has_omap()) {
11217 goto out;
11218 }
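  // wait for in-flight txcs touching this onode to commit so the db iterator
  // below sees all of the object's omap keys.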
11219 o->flush();
11220 it = db->get_iterator(PREFIX_OMAP);
11221 get_omap_key(o->onode.nid, first, &key_first);
11222 get_omap_key(o->onode.nid, last, &key_last);
11223 it->lower_bound(key_first);
11224 while (it->valid()) {
11225 if (it->key() >= key_last) {
11226 dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
11227 << dendl;
11228 break;
11229 }
11230 txc->t->rmkey(PREFIX_OMAP, it->key());
11231 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
11232 it->next();
11233 }
11234 txc->note_modified_object(o);
11235
11236 out:
11237 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11238 return r;
11239 }
11240
11241 int BlueStore::_set_alloc_hint(
11242 TransContext *txc,
11243 CollectionRef& c,
11244 OnodeRef& o,
11245 uint64_t expected_object_size,
11246 uint64_t expected_write_size,
11247 uint32_t flags)
11248 {
11249 dout(15) << __func__ << " " << c->cid << " " << o->oid
11250 << " object_size " << expected_object_size
11251 << " write_size " << expected_write_size
11252 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11253 << dendl;
11254 int r = 0;
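  // the hints are only recorded on the onode here; the write path consults
  // them later when choosing blob and allocation sizes.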
11255 o->onode.expected_object_size = expected_object_size;
11256 o->onode.expected_write_size = expected_write_size;
11257 o->onode.alloc_hint_flags = flags;
11258 txc->write_onode(o);
11259 dout(10) << __func__ << " " << c->cid << " " << o->oid
11260 << " object_size " << expected_object_size
11261 << " write_size " << expected_write_size
11262 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11263 << " = " << r << dendl;
11264 return r;
11265 }
11266
11267 int BlueStore::_clone(TransContext *txc,
11268 CollectionRef& c,
11269 OnodeRef& oldo,
11270 OnodeRef& newo)
11271 {
11272 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11273 << newo->oid << dendl;
11274 int r = 0;
11275 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
11276 derr << __func__ << " mismatched hash on " << oldo->oid
11277 << " and " << newo->oid << dendl;
11278 return -EINVAL;
11279 }
11280
11281 _assign_nid(txc, newo);
11282
11283 // clone data
11284 oldo->flush();
11285 _do_truncate(txc, c, newo, 0);
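  // with bluestore_clone_cow the data is cloned by sharing the source blobs
  // (copy-on-write via _do_clone_range); otherwise fall back to a full
  // read-and-rewrite of the object contents.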
11286 if (cct->_conf->bluestore_clone_cow) {
11287 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
11288 } else {
11289 bufferlist bl;
11290 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
11291 if (r < 0)
11292 goto out;
11293 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
11294 if (r < 0)
11295 goto out;
11296 }
11297
11298 // clone attrs
11299 newo->onode.attrs = oldo->onode.attrs;
11300
11301 // clone omap
11302 if (newo->onode.has_omap()) {
11303 dout(20) << __func__ << " clearing old omap data" << dendl;
11304 newo->flush();
11305 _do_omap_clear(txc, newo->onode.nid);
11306 }
11307 if (oldo->onode.has_omap()) {
11308 dout(20) << __func__ << " copying omap data" << dendl;
11309 if (!newo->onode.has_omap()) {
11310 newo->onode.set_omap_flag();
11311 }
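    // omap keys embed the owning nid, so each of the source's keys (header
    // included) is rewritten under the destination's nid as it is copied.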
11312 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
11313 string head, tail;
11314 get_omap_header(oldo->onode.nid, &head);
11315 get_omap_tail(oldo->onode.nid, &tail);
11316 it->lower_bound(head);
11317 while (it->valid()) {
11318 if (it->key() >= tail) {
11319 dout(30) << __func__ << " reached tail" << dendl;
11320 break;
11321 } else {
11322 dout(30) << __func__ << " got header/data "
11323 << pretty_binary_string(it->key()) << dendl;
11324 string key;
11325 rewrite_omap_key(newo->onode.nid, it->key(), &key);
11326 txc->t->set(PREFIX_OMAP, key, it->value());
11327 }
11328 it->next();
11329 }
11330 } else {
11331 newo->onode.clear_omap_flag();
11332 }
11333
11334 txc->write_onode(newo);
11335 r = 0;
11336
11337 out:
11338 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11339 << newo->oid << " = " << r << dendl;
11340 return r;
11341 }
11342
11343 int BlueStore::_do_clone_range(
11344 TransContext *txc,
11345 CollectionRef& c,
11346 OnodeRef& oldo,
11347 OnodeRef& newo,
11348 uint64_t srcoff,
11349 uint64_t length,
11350 uint64_t dstoff)
11351 {
11352 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11353 << newo->oid
11354 << " 0x" << std::hex << srcoff << "~" << length << " -> "
11355 << " 0x" << dstoff << "~" << length << std::dec << dendl;
11356 oldo->extent_map.fault_range(db, srcoff, length);
11357 newo->extent_map.fault_range(db, dstoff, length);
11358 _dump_onode(oldo);
11359 _dump_onode(newo);
11360
11361 // hmm, this could go into an ExtentMap::dup() method.
11362 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
11363 for (auto &e : oldo->extent_map.extent_map) {
11364 e.blob->last_encoded_id = -1;
11365 }
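  // last_encoded_id is reused here as a scratch marker: -1 means the blob has
  // not been duplicated yet; otherwise it indexes into id_to_blob for the
  // copy made earlier in this loop.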
11366 int n = 0;
11367 uint64_t end = srcoff + length;
11368 uint32_t dirty_range_begin = 0;
11369 uint32_t dirty_range_end = 0;
11370 bool src_dirty = false;
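  // dirty_range_{begin,end} track the span of source extents whose blobs get
  // converted to shared below, so only that part of oldo's extent map needs
  // to be re-encoded afterwards.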
11371 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
11372 ep != oldo->extent_map.extent_map.end();
11373 ++ep) {
11374 auto& e = *ep;
11375 if (e.logical_offset >= end) {
11376 break;
11377 }
11378 dout(20) << __func__ << " src " << e << dendl;
11379 BlobRef cb;
11380 bool blob_duped = true;
11381 if (e.blob->last_encoded_id >= 0) {
11382 // blob is already duped
11383 cb = id_to_blob[e.blob->last_encoded_id];
11384 blob_duped = false;
11385 } else {
11386 // dup the blob
11387 const bluestore_blob_t& blob = e.blob->get_blob();
11388 // make sure it is shared
11389 if (!blob.is_shared()) {
11390 c->make_blob_shared(_assign_blobid(txc), e.blob);
11391 if (!src_dirty) {
11392 src_dirty = true;
11393 dirty_range_begin = e.logical_offset;
11394 }
11395 assert(e.logical_end() > 0);
11396 // -1 to exclude next potential shard
11397 dirty_range_end = e.logical_end() - 1;
11398 } else {
11399 c->load_shared_blob(e.blob->shared_blob);
11400 }
11401 cb = new Blob();
11402 e.blob->last_encoded_id = n;
11403 id_to_blob[n] = cb;
11404 e.blob->dup(*cb);
11405 // bump the extent refs on the copied blob's extents
11406 for (auto p : blob.get_extents()) {
11407 if (p.is_valid()) {
11408 e.blob->shared_blob->get_ref(p.offset, p.length);
11409 }
11410 }
11411 txc->write_shared_blob(e.blob->shared_blob);
11412 dout(20) << __func__ << " new " << *cb << dendl;
11413 }
11414 // dup extent
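    // clip the source extent to [srcoff, end) and shift it by
    // (dstoff - srcoff) into the destination's logical space.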
11415 int skip_front, skip_back;
11416 if (e.logical_offset < srcoff) {
11417 skip_front = srcoff - e.logical_offset;
11418 } else {
11419 skip_front = 0;
11420 }
11421 if (e.logical_end() > end) {
11422 skip_back = e.logical_end() - end;
11423 } else {
11424 skip_back = 0;
11425 }
11426 Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
11427 e.blob_offset + skip_front,
11428 e.length - skip_front - skip_back, cb);
11429 newo->extent_map.extent_map.insert(*ne);
11430 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
11431 // fixme: we may leave parts of new blob unreferenced that could
11432 // be freed (relative to the shared_blob).
11433 txc->statfs_delta.stored() += ne->length;
11434 if (e.blob->get_blob().is_compressed()) {
11435 txc->statfs_delta.compressed_original() += ne->length;
11436 if (blob_duped){
11437 txc->statfs_delta.compressed() +=
11438 cb->get_blob().get_compressed_payload_length();
11439 }
11440 }
11441 dout(20) << __func__ << " dst " << *ne << dendl;
11442 ++n;
11443 }
11444 if (src_dirty) {
11445 oldo->extent_map.dirty_range(dirty_range_begin,
11446 dirty_range_end - dirty_range_begin);
11447 txc->write_onode(oldo);
11448 }
11449 txc->write_onode(newo);
11450
11451 if (dstoff + length > newo->onode.size) {
11452 newo->onode.size = dstoff + length;
11453 }
11454 newo->extent_map.dirty_range(dstoff, length);
11455 _dump_onode(oldo);
11456 _dump_onode(newo);
11457 return 0;
11458 }
11459
11460 int BlueStore::_clone_range(TransContext *txc,
11461 CollectionRef& c,
11462 OnodeRef& oldo,
11463 OnodeRef& newo,
11464 uint64_t srcoff, uint64_t length, uint64_t dstoff)
11465 {
11466 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11467 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11468 << " to offset 0x" << dstoff << std::dec << dendl;
11469 int r = 0;
11470
11471 if (srcoff + length >= OBJECT_MAX_SIZE ||
11472 dstoff + length >= OBJECT_MAX_SIZE) {
11473 r = -E2BIG;
11474 goto out;
11475 }
11476 if (srcoff + length > oldo->onode.size) {
11477 r = -EINVAL;
11478 goto out;
11479 }
11480
11481 _assign_nid(txc, newo);
11482
11483 if (length > 0) {
11484 if (cct->_conf->bluestore_clone_cow) {
11485 _do_zero(txc, c, newo, dstoff, length);
11486 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
11487 } else {
11488 bufferlist bl;
11489 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
11490 if (r < 0)
11491 goto out;
11492 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
11493 if (r < 0)
11494 goto out;
11495 }
11496 }
11497
11498 txc->write_onode(newo);
11499 r = 0;
11500
11501 out:
11502 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11503 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11504 << " to offset 0x" << dstoff << std::dec
11505 << " = " << r << dendl;
11506 return r;
11507 }
11508
11509 int BlueStore::_rename(TransContext *txc,
11510 CollectionRef& c,
11511 OnodeRef& oldo,
11512 OnodeRef& newo,
11513 const ghobject_t& new_oid)
11514 {
11515 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11516 << new_oid << dendl;
11517 int r;
11518 ghobject_t old_oid = oldo->oid;
11519 mempool::bluestore_cache_other::string new_okey;
11520
11521 if (newo) {
11522 if (newo->exists) {
11523 r = -EEXIST;
11524 goto out;
11525 }
11526 assert(txc->onodes.count(newo) == 0);
11527 }
11528
11529 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
11530
11531 // rewrite shards
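  // extent-map shard keys embed the onode key, so drop the old shard keys and
  // mark every shard dirty; they get re-written under the new object key when
  // the onode is encoded at commit.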
11532 {
11533 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
11534 get_object_key(cct, new_oid, &new_okey);
11535 string key;
11536 for (auto &s : oldo->extent_map.shards) {
11537 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
11538 [&](const string& final_key) {
11539 txc->t->rmkey(PREFIX_OBJ, final_key);
11540 }
11541 );
11542 s.dirty = true;
11543 }
11544 }
11545
11546 newo = oldo;
11547 txc->write_onode(newo);
11548
11549 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
11550 // Onode in the old slot
11551 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
11552 r = 0;
11553
11554 out:
11555 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
11556 << new_oid << " = " << r << dendl;
11557 return r;
11558 }
11559
11560 // collections
11561
11562 int BlueStore::_create_collection(
11563 TransContext *txc,
11564 const coll_t &cid,
11565 unsigned bits,
11566 CollectionRef *c)
11567 {
11568 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
11569 int r;
11570 bufferlist bl;
11571
11572 {
11573 RWLock::WLocker l(coll_lock);
11574 if (*c) {
11575 r = -EEXIST;
11576 goto out;
11577 }
11578 c->reset(
11579 new Collection(
11580 this,
11581 cache_shards[cid.hash_to_shard(cache_shards.size())],
11582 cid));
11583 (*c)->cnode.bits = bits;
11584 coll_map[cid] = *c;
11585 }
11586 ::encode((*c)->cnode, bl);
11587 txc->t->set(PREFIX_COLL, stringify(cid), bl);
11588 r = 0;
11589
11590 out:
11591 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
11592 return r;
11593 }
11594
11595 int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
11596 CollectionRef *c)
11597 {
11598 dout(15) << __func__ << " " << cid << dendl;
11599 int r;
11600
11601 {
11602 RWLock::WLocker l(coll_lock);
11603 if (!*c) {
11604 r = -ENOENT;
11605 goto out;
11606 }
11607 size_t nonexistent_count = 0;
11608 assert((*c)->exists);
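    // first pass: scan the in-memory onode cache. any onode still marked as
    // existing means the collection is not empty; otherwise count the cached
    // tombstones so the db listing below can be cross-checked against them.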
11609 if ((*c)->onode_map.map_any([&](OnodeRef o) {
11610 if (o->exists) {
11611 dout(10) << __func__ << " " << o->oid << " " << o
11612 << " exists in onode_map" << dendl;
11613 return true;
11614 }
11615 ++nonexistent_count;
11616 return false;
11617 })) {
11618 r = -ENOTEMPTY;
11619 goto out;
11620 }
11621
11622 vector<ghobject_t> ls;
11623 ghobject_t next;
11624 // Enumerate onodes in db, up to nonexistent_count + 1
11625 // then check if all of them are marked as non-existent.
11626 // Bypass the check if returned number is greater than nonexistent_count
11627 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
11628 nonexistent_count + 1, &ls, &next);
11629 if (r >= 0) {
11630 bool exists = false; //ls.size() > nonexistent_count;
11631 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
11632 dout(10) << __func__ << " oid " << *it << dendl;
11633 auto onode = (*c)->onode_map.lookup(*it);
11634 exists = !onode || onode->exists;
11635 if (exists) {
11636 dout(10) << __func__ << " " << *it
11637 << " exists in db" << dendl;
11638 }
11639 }
11640 if (!exists) {
11641 coll_map.erase(cid);
11642 txc->removed_collections.push_back(*c);
11643 (*c)->exists = false;
11644 c->reset();
11645 txc->t->rmkey(PREFIX_COLL, stringify(cid));
11646 r = 0;
11647 } else {
11648 dout(10) << __func__ << " " << cid
11649 << " is non-empty" << dendl;
11650 r = -ENOTEMPTY;
11651 }
11652 }
11653 }
11654
11655 out:
11656 dout(10) << __func__ << " " << cid << " = " << r << dendl;
11657 return r;
11658 }
11659
11660 int BlueStore::_split_collection(TransContext *txc,
11661 CollectionRef& c,
11662 CollectionRef& d,
11663 unsigned bits, int rem)
11664 {
11665 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
11666 << " bits " << bits << dendl;
11667 RWLock::WLocker l(c->lock);
11668 RWLock::WLocker l2(d->lock);
11669 int r;
11670
11671 // flush all previous deferred writes on this sequencer. this is a bit
11672 // heavyweight, but we need to make sure all deferred writes complete
11673 // before we split as the new collection's sequencer may need to order
11674 // this after those writes, and we don't bother with the complexity of
11675 // moving those TransContexts over to the new osr.
11676 _osr_drain_preceding(txc);
11677
11678 // move any cached items (onodes and referenced shared blobs) that will
11679 // belong to the child collection post-split. leave everything else behind.
11680 // this may include things that don't strictly belong to the now-smaller
11681 // parent split, but the OSD will always send us a split for every new
11682 // child.
11683
11684 spg_t pgid, dest_pgid;
11685 bool is_pg = c->cid.is_pg(&pgid);
11686 assert(is_pg);
11687 is_pg = d->cid.is_pg(&dest_pgid);
11688 assert(is_pg);
11689
11690 // the destination should initially be empty.
11691 assert(d->onode_map.empty());
11692 assert(d->shared_blob_set.empty());
11693 assert(d->cnode.bits == bits);
11694
11695 c->split_cache(d.get());
11696
11697 // adjust bits. note that this will be redundant for all but the first
11698 // split call for this parent (first child).
11699 c->cnode.bits = bits;
11700 assert(d->cnode.bits == bits);
11701 r = 0;
11702
11703 bufferlist bl;
11704 ::encode(c->cnode, bl);
11705 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
11706
11707 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
11708 << " bits " << bits << " = " << r << dendl;
11709 return r;
11710 }
11711
11712 // DB key value Histogram
11713 #define KEY_SLAB 32
11714 #define VALUE_SLAB 64
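// keys and values are bucketed into fixed-width slabs for the histogram;
// e.g. with KEY_SLAB == 32 a 70-byte key falls in slab 2, reported as the
// range [64,96).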
11715
11716 const string prefix_onode = "o";
11717 const string prefix_onode_shard = "x";
11718 const string prefix_other = "Z";
11719
11720 int BlueStore::DBHistogram::get_key_slab(size_t sz)
11721 {
11722 return (sz/KEY_SLAB);
11723 }
11724
11725 string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
11726 {
11727 int lower_bound = slab * KEY_SLAB;
11728 int upper_bound = (slab + 1) * KEY_SLAB;
11729 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11730 return ret;
11731 }
11732
11733 int BlueStore::DBHistogram::get_value_slab(size_t sz)
11734 {
11735 return (sz/VALUE_SLAB);
11736 }
11737
11738 string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
11739 {
11740 int lower_bound = slab * VALUE_SLAB;
11741 int upper_bound = (slab + 1) * VALUE_SLAB;
11742 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11743 return ret;
11744 }
11745
11746 void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
11747 const string &prefix, size_t key_size, size_t value_size)
11748 {
11749 uint32_t key_slab = get_key_slab(key_size);
11750 uint32_t value_slab = get_value_slab(value_size);
11751 key_hist[prefix][key_slab].count++;
11752 key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len);
11753 key_hist[prefix][key_slab].val_map[value_slab].count++;
11754 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11755 MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
11756 }
11757
11758 void BlueStore::DBHistogram::dump(Formatter *f)
11759 {
11760 f->open_object_section("rocksdb_value_distribution");
11761 for (auto i : value_hist) {
11762 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
11763 }
11764 f->close_section();
11765
11766 f->open_object_section("rocksdb_key_value_histogram");
11767 for (auto i : key_hist) {
11768 f->dump_string("prefix", i.first);
11769 f->open_object_section("key_hist");
11770 for ( auto k : i.second) {
11771 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
11772 f->dump_unsigned("max_len", k.second.max_len);
11773 f->open_object_section("value_hist");
11774 for ( auto j : k.second.val_map) {
11775 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
11776 f->dump_unsigned("max_len", j.second.max_len);
11777 }
11778 f->close_section();
11779 }
11780 f->close_section();
11781 }
11782 f->close_section();
11783 }
11784
11785 // Iterates through the db and collects the stats
11786 void BlueStore::generate_db_histogram(Formatter *f)
11787 {
11788 //globals
11789 uint64_t num_onodes = 0;
11790 uint64_t num_shards = 0;
11791 uint64_t num_super = 0;
11792 uint64_t num_coll = 0;
11793 uint64_t num_omap = 0;
11794 uint64_t num_deferred = 0;
11795 uint64_t num_alloc = 0;
11796 uint64_t num_stat = 0;
11797 uint64_t num_others = 0;
11798 uint64_t num_shared_shards = 0;
11799 size_t max_key_size =0, max_value_size = 0;
11800 uint64_t total_key_size = 0, total_value_size = 0;
11801 size_t key_size = 0, value_size = 0;
11802 DBHistogram hist;
11803
11804 utime_t start = ceph_clock_now();
11805
11806 KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
11807 iter->seek_to_first();
11808 while (iter->valid()) {
11809 dout(30) << __func__ << " Key: " << iter->key() << dendl;
11810 key_size = iter->key_size();
11811 value_size = iter->value_size();
11812 hist.value_hist[hist.get_value_slab(value_size)]++;
11813 max_key_size = MAX(max_key_size, key_size);
11814 max_value_size = MAX(max_value_size, value_size);
11815 total_key_size += key_size;
11816 total_value_size += value_size;
11817
11818 pair<string,string> key(iter->raw_key());
11819
11820 if (key.first == PREFIX_SUPER) {
11821 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
11822 num_super++;
11823 } else if (key.first == PREFIX_STAT) {
11824 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
11825 num_stat++;
11826 } else if (key.first == PREFIX_COLL) {
11827 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
11828 num_coll++;
11829 } else if (key.first == PREFIX_OBJ) {
11830 if (key.second.back() == ONODE_KEY_SUFFIX) {
11831 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
11832 num_onodes++;
11833 } else {
11834 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
11835 num_shards++;
11836 }
11837 } else if (key.first == PREFIX_OMAP) {
11838 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
11839 num_omap++;
11840 } else if (key.first == PREFIX_DEFERRED) {
11841 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
11842 num_deferred++;
11843 } else if (key.first == PREFIX_ALLOC || key.first == "b" ) {
11844 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
11845 num_alloc++;
11846 } else if (key.first == PREFIX_SHARED_BLOB) {
11847 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
11848 num_shared_shards++;
11849 } else {
11850 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
11851 num_others++;
11852 }
11853 iter->next();
11854 }
11855
11856 utime_t duration = ceph_clock_now() - start;
11857 f->open_object_section("rocksdb_key_value_stats");
11858 f->dump_unsigned("num_onodes", num_onodes);
11859 f->dump_unsigned("num_shards", num_shards);
11860 f->dump_unsigned("num_super", num_super);
11861 f->dump_unsigned("num_coll", num_coll);
11862 f->dump_unsigned("num_omap", num_omap);
11863 f->dump_unsigned("num_deferred", num_deferred);
11864 f->dump_unsigned("num_alloc", num_alloc);
11865 f->dump_unsigned("num_stat", num_stat);
11866 f->dump_unsigned("num_shared_shards", num_shared_shards);
11867 f->dump_unsigned("num_others", num_others);
11868 f->dump_unsigned("max_key_size", max_key_size);
11869 f->dump_unsigned("max_value_size", max_value_size);
11870 f->dump_unsigned("total_key_size", total_key_size);
11871 f->dump_unsigned("total_value_size", total_value_size);
11872 f->close_section();
11873
11874 hist.dump(f);
11875
11876 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
11877
11878 }
11879
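// internal variant: the cache is expected to drain completely here and we
// assert if anything is left behind, in contrast to the best-effort
// flush_cache() for external callers below.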
11880 void BlueStore::_flush_cache()
11881 {
11882 dout(10) << __func__ << dendl;
11883 for (auto i : cache_shards) {
11884 i->trim_all();
11885 assert(i->empty());
11886 }
11887 for (auto& p : coll_map) {
11888 if (!p.second->onode_map.empty()) {
11889 derr << __func__ << " stray onodes on " << p.first << dendl;
11890 p.second->onode_map.dump(cct, 0);
11891 }
11892 if (!p.second->shared_blob_set.empty()) {
11893 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11894 p.second->shared_blob_set.dump(cct, 0);
11895 }
11896 assert(p.second->onode_map.empty());
11897 assert(p.second->shared_blob_set.empty());
11898 }
11899 coll_map.clear();
11900 }
11901
11902 // For external callers.
11903 // We use a best-effort policy instead, i.e.,
11904 // we don't care if there are still some pinned onodes/data in the cache
11905 // after this command is completed.
11906 void BlueStore::flush_cache()
11907 {
11908 dout(10) << __func__ << dendl;
11909 for (auto i : cache_shards) {
11910 i->trim_all();
11911 }
11912 }
11913
11914 void BlueStore::_apply_padding(uint64_t head_pad,
11915 uint64_t tail_pad,
11916 bufferlist& padded)
11917 {
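  // zero-fill the front and back of the caller's buffer; head_pad/tail_pad
  // are presumably computed by the caller to align the write to its chosen
  // granularity (e.g. the blob's checksum/allocation chunk).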
11918 if (head_pad) {
11919 padded.prepend_zero(head_pad);
11920 }
11921 if (tail_pad) {
11922 padded.append_zero(tail_pad);
11923 }
11924 if (head_pad || tail_pad) {
11925 dout(20) << __func__ << " padded head 0x" << std::hex << head_pad
11926 << " tail 0x" << tail_pad << std::dec << dendl;
11927 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
11928 }
11929 }
11930
11931 // ===========================================