[ceph.git] / ceph / src / os / bluestore / BlueStore.cc (ceph 12.2.12)
1 // vim: ts=8 sw=2 smarttab
2 /*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2014 Red Hat
6 *
7 * This is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License version 2.1, as published by the Free Software
10 * Foundation. See file COPYING.
11 *
12 */
13
14 #include <unistd.h>
15 #include <stdlib.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <fcntl.h>
19
20 #include "include/cpp-btree/btree_set.h"
21
22 #include "BlueStore.h"
23 #include "os/kv.h"
24 #include "include/compat.h"
25 #include "include/intarith.h"
26 #include "include/stringify.h"
27 #include "common/errno.h"
28 #include "common/safe_io.h"
29 #include "common/PriorityCache.h"
30 #include "Allocator.h"
31 #include "FreelistManager.h"
32 #include "BlueFS.h"
33 #include "BlueRocksEnv.h"
34 #include "auth/Crypto.h"
35 #include "common/EventTrace.h"
36 #include "perfglue/heap_profiler.h"
37
38 #define dout_context cct
39 #define dout_subsys ceph_subsys_bluestore
40
41 using bid_t = decltype(BlueStore::Blob::id);
42
43 // bluestore_cache_onode
44 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
45 bluestore_cache_onode);
46
47 // bluestore_cache_other
48 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
49 bluestore_cache_other);
50 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
51 bluestore_cache_other);
52 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
53 bluestore_cache_other);
54 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
55 bluestore_cache_other);
56
57 // bluestore_txc
58 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
59 bluestore_txc);
60
61
62 // kv store prefixes
63 const string PREFIX_SUPER = "S"; // field -> value
64 const string PREFIX_STAT = "T"; // field -> value(int64 array)
65 const string PREFIX_COLL = "C"; // collection name -> cnode_t
66 const string PREFIX_OBJ = "O"; // object name -> onode_t
67 const string PREFIX_OMAP = "M"; // u64 + keyname -> value
68 const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
69 const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
70 const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
71
72 // write a label in the first block. always use this size. note that
73 // bluefs makes a matching assumption about the location of its
74 // superblock (always the second block of the device).
75 #define BDEV_LABEL_BLOCK_SIZE 4096
76
77 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
78 #define SUPER_RESERVED 8192
79
80 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
81
82
83 /*
84 * extent map blob encoding
85 *
86 * we use the low bits of the blobid field to indicate some common scenarios
87 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
88 */
89 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
90 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
91 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
92 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
93 #define BLOBID_SHIFT_BITS 4
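// e.g. a spanning blob with id 3 whose lextent continues the previous one
// and starts at blob_offset 0 would encode its blobid field as roughly
// (3 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_SPANNING | BLOBID_FLAG_CONTIGUOUS
// | BLOBID_FLAG_ZEROOFFSET == 0x3b; decode_some() recovers the id from the
// bits above the low 4 flag bits.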
94
95 /*
96 * object name key structure
97 *
98 * encoded u8: shard + 2^7 (so that it sorts properly)
99 * encoded u64: poolid + 2^63 (so that it sorts properly)
100 * encoded u32: hash (bit reversed)
101 *
102 * escaped string: namespace
103 *
104 * escaped string: key or object name
105 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
106 * we are done. otherwise, it is followed by the object name.
107 * escaped string: object name (unless '=' above)
108 *
109 * encoded u64: snap
110 * encoded u64: generation
111 * 'o'
112 */
113 #define ONODE_KEY_SUFFIX 'o'
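// e.g. for an object with no locator key, get_object_key() below emits
// roughly:
//   <shard+0x80> <pool+2^63> <reversed hash>
//   <escaped nspace>'!' <escaped name>'!' '=' <snap> <generation> 'o'
// and get_key_object() reverses the same layout.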
114
115 /*
116 * extent shard key
117 *
118 * object prefix key
119 * u32
120 * 'x'
121 */
122 #define EXTENT_SHARD_KEY_SUFFIX 'x'
123
124 /*
125 * string encoding in the key
126 *
127 * The key string needs to lexicographically sort the same way that
128 * ghobject_t does. We do this by escaping anything <= '#' with '#'
129 * plus a 2-digit hex string, and anything >= '~' with '~' plus the two
130 * hex digits.
131 *
132 * We use ! as a terminator for strings; this works because it is < #
133 * and will get escaped if it is present in the string.
134 *
135 */
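// e.g. append_escaped() below maps "a#b" to 'a' '#' '2' '3' 'b' '!'
// ('#' is 0x23) and "x~y" to 'x' '~' '7' 'e' 'y' '!', so escaped strings
// still compare in the same relative order as the originals.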
136 template<typename S>
137 static void append_escaped(const string &in, S *out)
138 {
139 char hexbyte[in.length() * 3 + 1];
140 char* ptr = &hexbyte[0];
141 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
142 if (*i <= '#') {
143 *ptr++ = '#';
144 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
145 *ptr++ = "0123456789abcdef"[*i & 0x0f];
146 } else if (*i >= '~') {
147 *ptr++ = '~';
148 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
149 *ptr++ = "0123456789abcdef"[*i & 0x0f];
150 } else {
151 *ptr++ = *i;
152 }
153 }
154 *ptr++ = '!';
155 out->append(hexbyte, ptr - &hexbyte[0]);
156 }
157
158 inline unsigned h2i(char c)
159 {
160 if ((c >= '0') && (c <= '9')) {
161 return c - 0x30;
162 } else if ((c >= 'a') && (c <= 'f')) {
163 return c - 'a' + 10;
164 } else if ((c >= 'A') && (c <= 'F')) {
165 return c - 'A' + 10;
166 } else {
167 return 256; // make it always larger than 255
168 }
169 }
170
171 static int decode_escaped(const char *p, string *out)
172 {
173 char buff[256];
174 char* ptr = &buff[0];
175 char* max = &buff[252];
176 const char *orig_p = p;
177 while (*p && *p != '!') {
178 if (*p == '#' || *p == '~') {
179 unsigned hex = 0;
180 p++;
181 hex = h2i(*p++) << 4;
182 if (hex > 255) {
183 return -EINVAL;
184 }
185 hex |= h2i(*p++);
186 if (hex > 255) {
187 return -EINVAL;
188 }
189 *ptr++ = hex;
190 } else {
191 *ptr++ = *p++;
192 }
193 if (ptr > max) {
194 out->append(buff, ptr-buff);
195 ptr = &buff[0];
196 }
197 }
198 if (ptr != buff) {
199 out->append(buff, ptr-buff);
200 }
201 return p - orig_p;
202 }
203
204 // some things we encode in binary (as le32 or le64); print the
205 // resulting key strings nicely
206 template<typename S>
207 static string pretty_binary_string(const S& in)
208 {
209 char buf[10];
210 string out;
211 out.reserve(in.length() * 3);
212 enum { NONE, HEX, STRING } mode = NONE;
213 unsigned from = 0, i;
214 for (i=0; i < in.length(); ++i) {
215 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
216 (mode == HEX && in.length() - i >= 4 &&
217 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
218 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
219 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
220 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
221 if (mode == STRING) {
222 out.append(in.c_str() + from, i - from);
223 out.push_back('\'');
224 }
225 if (mode != HEX) {
226 out.append("0x");
227 mode = HEX;
228 }
229 if (in.length() - i >= 4) {
230 // print a whole u32 at once
231 snprintf(buf, sizeof(buf), "%08x",
232 (uint32_t)(((unsigned char)in[i] << 24) |
233 ((unsigned char)in[i+1] << 16) |
234 ((unsigned char)in[i+2] << 8) |
235 ((unsigned char)in[i+3] << 0)));
236 i += 3;
237 } else {
238 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
239 }
240 out.append(buf);
241 } else {
242 if (mode != STRING) {
243 out.push_back('\'');
244 mode = STRING;
245 from = i;
246 }
247 }
248 }
249 if (mode == STRING) {
250 out.append(in.c_str() + from, i - from);
251 out.push_back('\'');
252 }
253 return out;
254 }
255
256 template<typename T>
257 static void _key_encode_shard(shard_id_t shard, T *key)
258 {
259 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
260 }
261
262 static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
263 {
264 pshard->id = (uint8_t)*key - (uint8_t)0x80;
265 return key + 1;
266 }
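// Note: with the +0x80 bias, shard_id_t::NO_SHARD (nominally -1) wraps to
// 0x7f, so objects without a shard sort just before shard 0 (encoded 0x80).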
267
268 static void get_coll_key_range(const coll_t& cid, int bits,
269 string *temp_start, string *temp_end,
270 string *start, string *end)
271 {
272 temp_start->clear();
273 temp_end->clear();
274 start->clear();
275 end->clear();
276
277 spg_t pgid;
278 if (cid.is_pg(&pgid)) {
279 _key_encode_shard(pgid.shard, start);
280 *temp_start = *start;
281
282 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
283 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
284
285 *end = *start;
286 *temp_end = *temp_start;
287
288 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
289 _key_encode_u32(reverse_hash, start);
290 _key_encode_u32(reverse_hash, temp_start);
291
292 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
293 if (end_hash > 0xffffffffull)
294 end_hash = 0xffffffffull;
295
296 _key_encode_u32(end_hash, end);
297 _key_encode_u32(end_hash, temp_end);
298 } else {
299 _key_encode_shard(shard_id_t::NO_SHARD, start);
300 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
301 *end = *start;
302 _key_encode_u32(0, start);
303 _key_encode_u32(0xffffffff, end);
304
305 // no separate temp section
306 *temp_start = *end;
307 *temp_end = *end;
308 }
309 }
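// e.g. with bits == 4 and pgid.ps() == 0xb, reverse_hash is 0xd0000000 and
// end_hash is 0xe0000000, so [start, end) covers the 2^(32-bits) hash values
// owned by the PG; the temp range is the same hash window encoded under the
// mirrored pool id (-2 - pool).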
310
311 static void get_shared_blob_key(uint64_t sbid, string *key)
312 {
313 key->clear();
314 _key_encode_u64(sbid, key);
315 }
316
317 static int get_key_shared_blob(const string& key, uint64_t *sbid)
318 {
319 const char *p = key.c_str();
320 if (key.length() < sizeof(uint64_t))
321 return -1;
322 _key_decode_u64(p, sbid);
323 return 0;
324 }
325
326 template<typename S>
327 static int get_key_object(const S& key, ghobject_t *oid)
328 {
329 int r;
330 const char *p = key.c_str();
331
332 if (key.length() < 1 + 8 + 4)
333 return -1;
334 p = _key_decode_shard(p, &oid->shard_id);
335
336 uint64_t pool;
337 p = _key_decode_u64(p, &pool);
338 oid->hobj.pool = pool - 0x8000000000000000ull;
339
340 unsigned hash;
341 p = _key_decode_u32(p, &hash);
342
343 oid->hobj.set_bitwise_key_u32(hash);
344
345 r = decode_escaped(p, &oid->hobj.nspace);
346 if (r < 0)
347 return -2;
348 p += r + 1;
349
350 string k;
351 r = decode_escaped(p, &k);
352 if (r < 0)
353 return -3;
354 p += r + 1;
355 if (*p == '=') {
356 // no key
357 ++p;
358 oid->hobj.oid.name = k;
359 } else if (*p == '<' || *p == '>') {
360 // key + name
361 ++p;
362 r = decode_escaped(p, &oid->hobj.oid.name);
363 if (r < 0)
364 return -5;
365 p += r + 1;
366 oid->hobj.set_key(k);
367 } else {
368 // malformed
369 return -6;
370 }
371
372 p = _key_decode_u64(p, &oid->hobj.snap.val);
373 p = _key_decode_u64(p, &oid->generation);
374
375 if (*p != ONODE_KEY_SUFFIX) {
376 return -7;
377 }
378 p++;
379 if (*p) {
380 // if we get something other than a null terminator here,
381 // something went wrong.
382 return -8;
383 }
384
385 return 0;
386 }
387
388 template<typename S>
389 static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
390 {
391 key->clear();
392
393 size_t max_len = 1 + 8 + 4 +
394 (oid.hobj.nspace.length() * 3 + 1) +
395 (oid.hobj.get_key().length() * 3 + 1) +
396 1 + // for '<', '=', or '>'
397 (oid.hobj.oid.name.length() * 3 + 1) +
398 8 + 8 + 1;
399 key->reserve(max_len);
400
401 _key_encode_shard(oid.shard_id, key);
402 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
403 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
404
405 append_escaped(oid.hobj.nspace, key);
406
407 if (oid.hobj.get_key().length()) {
408 // is a key... could be < = or >.
409 append_escaped(oid.hobj.get_key(), key);
410 // (ASCII chars < = and > sort in that order, yay)
411 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
412 if (r) {
413 key->append(r > 0 ? ">" : "<");
414 append_escaped(oid.hobj.oid.name, key);
415 } else {
416 // same as no key
417 key->append("=");
418 }
419 } else {
420 // no key
421 append_escaped(oid.hobj.oid.name, key);
422 key->append("=");
423 }
424
425 _key_encode_u64(oid.hobj.snap, key);
426 _key_encode_u64(oid.generation, key);
427
428 key->push_back(ONODE_KEY_SUFFIX);
429
430 // sanity check
431 if (true) {
432 ghobject_t t;
433 int r = get_key_object(*key, &t);
434 if (r || t != oid) {
435 derr << " r " << r << dendl;
436 derr << "key " << pretty_binary_string(*key) << dendl;
437 derr << "oid " << oid << dendl;
438 derr << " t " << t << dendl;
439 assert(r == 0 && t == oid);
440 }
441 }
442 }
443
444
445 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
446 // char lets us quickly test whether it is a shard key without decoding any
447 // of the prefix bytes.
448 template<typename S>
449 static void get_extent_shard_key(const S& onode_key, uint32_t offset,
450 string *key)
451 {
452 key->clear();
453 key->reserve(onode_key.length() + 4 + 1);
454 key->append(onode_key.c_str(), onode_key.size());
455 _key_encode_u32(offset, key);
456 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
457 }
458
459 static void rewrite_extent_shard_key(uint32_t offset, string *key)
460 {
461 assert(key->size() > sizeof(uint32_t) + 1);
462 assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
463 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
464 }
465
466 template<typename S>
467 static void generate_extent_shard_key_and_apply(
468 const S& onode_key,
469 uint32_t offset,
470 string *key,
471 std::function<void(const string& final_key)> apply)
472 {
473 if (key->empty()) { // make full key
474 assert(!onode_key.empty());
475 get_extent_shard_key(onode_key, offset, key);
476 } else {
477 rewrite_extent_shard_key(offset, key);
478 }
479 apply(*key);
480 }
481
482 int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
483 {
484 assert(key.size() > sizeof(uint32_t) + 1);
485 assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
486 int okey_len = key.size() - sizeof(uint32_t) - 1;
487 *onode_key = key.substr(0, okey_len);
488 const char *p = key.data() + okey_len;
489 _key_decode_u32(p, offset);
490 return 0;
491 }
492
493 static bool is_extent_shard_key(const string& key)
494 {
495 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
496 }
497
498 // '-' < '.' < '~'
499 static void get_omap_header(uint64_t id, string *out)
500 {
501 _key_encode_u64(id, out);
502 out->push_back('-');
503 }
504
505 // hmm, I don't think there's any need to escape the user key since we
506 // have a clean prefix.
507 static void get_omap_key(uint64_t id, const string& key, string *out)
508 {
509 _key_encode_u64(id, out);
510 out->push_back('.');
511 out->append(key);
512 }
513
514 static void rewrite_omap_key(uint64_t id, string old, string *out)
515 {
516 _key_encode_u64(id, out);
517 out->append(old.c_str() + out->length(), old.size() - out->length());
518 }
519
520 static void decode_omap_key(const string& key, string *user_key)
521 {
522 *user_key = key.substr(sizeof(uint64_t) + 1);
523 }
524
525 static void get_omap_tail(uint64_t id, string *out)
526 {
527 _key_encode_u64(id, out);
528 out->push_back('~');
529 }
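// e.g. all omap data for onode id N is clustered as N'-' (header), then
// N'.'<user key> entries, then N'~' (tail), since '-' < '.' < '~'; the tail
// key gives iteration a convenient upper bound.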
530
531 static void get_deferred_key(uint64_t seq, string *out)
532 {
533 _key_encode_u64(seq, out);
534 }
535
536
537 // merge operators
538
539 struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
540 void merge_nonexistent(
541 const char *rdata, size_t rlen, std::string *new_value) override {
542 *new_value = std::string(rdata, rlen);
543 }
544 void merge(
545 const char *ldata, size_t llen,
546 const char *rdata, size_t rlen,
547 std::string *new_value) override {
548 assert(llen == rlen);
549 assert((rlen % 8) == 0);
550 new_value->resize(rlen);
551 const __le64* lv = (const __le64*)ldata;
552 const __le64* rv = (const __le64*)rdata;
553 __le64* nv = &(__le64&)new_value->at(0);
554 for (size_t i = 0; i < rlen >> 3; ++i) {
555 nv[i] = lv[i] + rv[i];
556 }
557 }
558 // We use each operator name and each prefix to construct the
559 // overall RocksDB operator name for a consistency check at open time.
560 const char *name() const override {
561 return "int64_array";
562 }
563 };
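// e.g. merging an existing value {10, 4} (two le64s) with an operand {-1, 2}
// yields {9, 6}: each 8-byte slot is summed independently, so counter deltas
// can be applied without a read-modify-write. Sketch of the intended use
// (key name illustrative):
//   db->set_merge_operator(PREFIX_STAT, merge_op);
//   t->merge(PREFIX_STAT, "bluestore_statfs", encoded_int64_array);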
564
565
566 // Buffer
567
568 ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
569 {
570 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
571 << b.offset << "~" << b.length << std::dec
572 << " " << BlueStore::Buffer::get_state_name(b.state);
573 if (b.flags)
574 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
575 return out << ")";
576 }
577
578 // Garbage Collector
579
580 void BlueStore::GarbageCollector::process_protrusive_extents(
581 const BlueStore::ExtentMap& extent_map,
582 uint64_t start_offset,
583 uint64_t end_offset,
584 uint64_t start_touch_offset,
585 uint64_t end_touch_offset,
586 uint64_t min_alloc_size)
587 {
588 assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
589
590 uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
591 uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);
592
593 dout(30) << __func__ << " (hex): [" << std::hex
594 << lookup_start_offset << ", " << lookup_end_offset
595 << ")" << std::dec << dendl;
596
597 for (auto it = extent_map.seek_lextent(lookup_start_offset);
598 it != extent_map.extent_map.end() &&
599 it->logical_offset < lookup_end_offset;
600 ++it) {
601 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
602 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
603
604 dout(30) << __func__ << " " << *it
605 << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
606 << dendl;
607
608 Blob* b = it->blob.get();
609
610 if (it->logical_offset >= start_touch_offset &&
611 it->logical_end() <= end_touch_offset) {
612 // Process extents within the range affected by
613 // the current write request.
614 // Need to take into account if existing extents
615 // can be merged with them (uncompressed case)
616 if (!b->get_blob().is_compressed()) {
617 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
618 --blob_info_counted->expected_allocations; // don't need to allocate
619 // new AU for compressed
620 // data since another
621 // collocated uncompressed
622 // blob already exists
623 dout(30) << __func__ << " --expected:"
624 << alloc_unit_start << dendl;
625 }
626 used_alloc_unit = alloc_unit_end;
627 blob_info_counted = nullptr;
628 }
629 } else if (b->get_blob().is_compressed()) {
630
631 // additionally we take compressed blobs that were not impacted
632 // by the write into account too
633 BlobInfo& bi =
634 affected_blobs.emplace(
635 b, BlobInfo(b->get_referenced_bytes())).first->second;
636
637 int adjust =
638 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
639 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
640 dout(30) << __func__ << " expected_allocations="
641 << bi.expected_allocations << " end_au:"
642 << alloc_unit_end << dendl;
643
644 blob_info_counted = &bi;
645 used_alloc_unit = alloc_unit_end;
646
647 assert(it->length <= bi.referenced_bytes);
648 bi.referenced_bytes -= it->length;
649 dout(30) << __func__ << " affected_blob:" << *b
650 << " unref 0x" << std::hex << it->length
651 << " referenced = 0x" << bi.referenced_bytes
652 << std::dec << dendl;
653 // NOTE: we can't move specific blob to resulting GC list here
654 // when reference counter == 0 since subsequent extents might
655 // decrement its expected_allocation.
656 // Hence need to enumerate all the extents first.
657 if (!bi.collect_candidate) {
658 bi.first_lextent = it;
659 bi.collect_candidate = true;
660 }
661 bi.last_lextent = it;
662 } else {
663 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
664 // don't need to allocate new AU for compressed data since another
665 // collocated uncompressed blob already exists
666 --blob_info_counted->expected_allocations;
667 dout(30) << __func__ << " --expected_allocations:"
668 << alloc_unit_start << dendl;
669 }
670 used_alloc_unit = alloc_unit_end;
671 blob_info_counted = nullptr;
672 }
673 }
674
675 for (auto b_it = affected_blobs.begin();
676 b_it != affected_blobs.end();
677 ++b_it) {
678 Blob* b = b_it->first;
679 BlobInfo& bi = b_it->second;
680 if (bi.referenced_bytes == 0) {
681 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
682 int64_t blob_expected_for_release =
683 ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;
684
685 dout(30) << __func__ << " " << *(b_it->first)
686 << " expected4release=" << blob_expected_for_release
687 << " expected_allocations=" << bi.expected_allocations
688 << dendl;
689 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
690 if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
691 if (bi.collect_candidate) {
692 auto it = bi.first_lextent;
693 bool bExit = false;
694 do {
695 if (it->blob.get() == b) {
696 extents_to_collect.emplace_back(it->logical_offset, it->length);
697 }
698 bExit = it == bi.last_lextent;
699 ++it;
700 } while (!bExit);
701 }
702 expected_for_release += blob_expected_for_release;
703 expected_allocations += bi.expected_allocations;
704 }
705 }
706 }
707 }
708
709 int64_t BlueStore::GarbageCollector::estimate(
710 uint64_t start_offset,
711 uint64_t length,
712 const BlueStore::ExtentMap& extent_map,
713 const BlueStore::old_extent_map_t& old_extents,
714 uint64_t min_alloc_size)
715 {
716
717 affected_blobs.clear();
718 extents_to_collect.clear();
719 used_alloc_unit = boost::optional<uint64_t >();
720 blob_info_counted = nullptr;
721
722 gc_start_offset = start_offset;
723 gc_end_offset = start_offset + length;
724
725 uint64_t end_offset = start_offset + length;
726
727 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
728 Blob* b = it->e.blob.get();
729 if (b->get_blob().is_compressed()) {
730
731 // update gc_start_offset/gc_end_offset if needed
732 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
733 gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());
734
735 auto o = it->e.logical_offset;
736 auto l = it->e.length;
737
738 uint64_t ref_bytes = b->get_referenced_bytes();
739 // micro optimization to bypass blobs that have no more references
740 if (ref_bytes != 0) {
741 dout(30) << __func__ << " affected_blob:" << *b
742 << " unref 0x" << std::hex << o << "~" << l
743 << std::dec << dendl;
744 affected_blobs.emplace(b, BlobInfo(ref_bytes));
745 }
746 }
747 }
748 dout(30) << __func__ << " gc range(hex): [" << std::hex
749 << gc_start_offset << ", " << gc_end_offset
750 << ")" << std::dec << dendl;
751
752 // enumerate preceding extents to check if they reference affected blobs
753 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
754 process_protrusive_extents(extent_map,
755 gc_start_offset,
756 gc_end_offset,
757 start_offset,
758 end_offset,
759 min_alloc_size);
760 }
761 return expected_for_release - expected_allocations;
762 }
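// A positive result means collecting the affected compressed blobs should
// release more allocation units than the rewrite is expected to consume,
// i.e. garbage collection looks worthwhile for this write.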
763
764 // Cache
765
766 BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
767 PerfCounters *logger)
768 {
769 Cache *c = nullptr;
770
771 if (type == "lru")
772 c = new LRUCache(cct);
773 else if (type == "2q")
774 c = new TwoQCache(cct);
775 else
776 assert(0 == "unrecognized cache type");
777
778 c->logger = logger;
779 return c;
780 }
781
782 void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max)
783 {
784 std::lock_guard<std::recursive_mutex> l(lock);
785 _trim(onode_max, buffer_max);
786 }
787
788 void BlueStore::Cache::trim_all()
789 {
790 std::lock_guard<std::recursive_mutex> l(lock);
791 _trim(0, 0);
792 }
793
794 // LRUCache
795 #undef dout_prefix
796 #define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
797
798 void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
799 {
800 auto p = onode_lru.iterator_to(*o);
801 onode_lru.erase(p);
802 onode_lru.push_front(*o);
803 }
804
805 void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
806 {
807 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
808 << " buffers " << buffer_size << " / " << buffer_max
809 << dendl;
810
811 _audit("trim start");
812
813 // buffers
814 while (buffer_size > buffer_max) {
815 auto i = buffer_lru.rbegin();
816 if (i == buffer_lru.rend()) {
817 // stop if buffer_lru is now empty
818 break;
819 }
820
821 Buffer *b = &*i;
822 assert(b->is_clean());
823 dout(20) << __func__ << " rm " << *b << dendl;
824 b->space->_rm_buffer(this, b);
825 }
826
827 // onodes
828 if (onode_max >= onode_lru.size()) {
829 return; // don't even try
830 }
831 uint64_t num = onode_lru.size() - onode_max;
832
833 auto p = onode_lru.end();
834 assert(p != onode_lru.begin());
835 --p;
836 int skipped = 0;
837 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
838 while (num > 0) {
839 Onode *o = &*p;
840 int refs = o->nref.load();
841 if (refs > 1) {
842 dout(20) << __func__ << " " << o->oid << " has " << refs
843 << " refs, skipping" << dendl;
844 if (++skipped >= max_skipped) {
845 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
846 << num << " left to trim" << dendl;
847 break;
848 }
849
850 if (p == onode_lru.begin()) {
851 break;
852 } else {
853 p--;
854 num--;
855 continue;
856 }
857 }
858 dout(30) << __func__ << " rm " << o->oid << dendl;
859 if (p != onode_lru.begin()) {
860 onode_lru.erase(p--);
861 } else {
862 onode_lru.erase(p);
863 assert(num == 1);
864 }
865 o->get(); // paranoia
866 o->c->onode_map.remove(o->oid);
867 o->put();
868 --num;
869 }
870 }
871
872 #ifdef DEBUG_CACHE
873 void BlueStore::LRUCache::_audit(const char *when)
874 {
875 dout(10) << __func__ << " " << when << " start" << dendl;
876 uint64_t s = 0;
877 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
878 s += i->length;
879 }
880 if (s != buffer_size) {
881 derr << __func__ << " buffer_size " << buffer_size << " actual " << s
882 << dendl;
883 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
884 derr << __func__ << " " << *i << dendl;
885 }
886 assert(s == buffer_size);
887 }
888 dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
889 << " ok" << dendl;
890 }
891 #endif
892
893 // TwoQCache
894 #undef dout_prefix
895 #define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
896
897
898 void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
899 {
900 auto p = onode_lru.iterator_to(*o);
901 onode_lru.erase(p);
902 onode_lru.push_front(*o);
903 }
904
905 void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
906 {
907 dout(20) << __func__ << " level " << level << " near " << near
908 << " on " << *b
909 << " which has cache_private " << b->cache_private << dendl;
910 if (near) {
911 b->cache_private = near->cache_private;
912 switch (b->cache_private) {
913 case BUFFER_WARM_IN:
914 buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
915 break;
916 case BUFFER_WARM_OUT:
917 assert(b->is_empty());
918 buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
919 break;
920 case BUFFER_HOT:
921 buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
922 break;
923 default:
924 assert(0 == "bad cache_private");
925 }
926 } else if (b->cache_private == BUFFER_NEW) {
927 b->cache_private = BUFFER_WARM_IN;
928 if (level > 0) {
929 buffer_warm_in.push_front(*b);
930 } else {
931 // take caller hint to start at the back of the warm queue
932 buffer_warm_in.push_back(*b);
933 }
934 } else {
935 // we got a hint from discard
936 switch (b->cache_private) {
937 case BUFFER_WARM_IN:
938 // stay in warm_in. move to front, even though 2Q doesn't actually
939 // do this.
940 dout(20) << __func__ << " move to front of warm " << *b << dendl;
941 buffer_warm_in.push_front(*b);
942 break;
943 case BUFFER_WARM_OUT:
944 b->cache_private = BUFFER_HOT;
945 // move to hot. fall-thru
946 case BUFFER_HOT:
947 dout(20) << __func__ << " move to front of hot " << *b << dendl;
948 buffer_hot.push_front(*b);
949 break;
950 default:
951 assert(0 == "bad cache_private");
952 }
953 }
954 if (!b->is_empty()) {
955 buffer_bytes += b->length;
956 buffer_list_bytes[b->cache_private] += b->length;
957 }
958 }
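// Net effect (roughly the classic 2Q scheme): brand-new buffers enter
// warm_in (A1in); a buffer re-added while its range was recently tracked in
// warm_out (A1out) is promoted to hot (Am); and the near hint keeps
// split/moved buffers adjacent to their source on whichever list it is on.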
959
960 void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
961 {
962 dout(20) << __func__ << " " << *b << dendl;
963 if (!b->is_empty()) {
964 assert(buffer_bytes >= b->length);
965 buffer_bytes -= b->length;
966 assert(buffer_list_bytes[b->cache_private] >= b->length);
967 buffer_list_bytes[b->cache_private] -= b->length;
968 }
969 switch (b->cache_private) {
970 case BUFFER_WARM_IN:
971 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
972 break;
973 case BUFFER_WARM_OUT:
974 buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
975 break;
976 case BUFFER_HOT:
977 buffer_hot.erase(buffer_hot.iterator_to(*b));
978 break;
979 default:
980 assert(0 == "bad cache_private");
981 }
982 }
983
984 void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
985 {
986 TwoQCache *src = static_cast<TwoQCache*>(srcc);
987 src->_rm_buffer(b);
988
989 // preserve which list we're on (even if we can't preserve the order!)
990 switch (b->cache_private) {
991 case BUFFER_WARM_IN:
992 assert(!b->is_empty());
993 buffer_warm_in.push_back(*b);
994 break;
995 case BUFFER_WARM_OUT:
996 assert(b->is_empty());
997 buffer_warm_out.push_back(*b);
998 break;
999 case BUFFER_HOT:
1000 assert(!b->is_empty());
1001 buffer_hot.push_back(*b);
1002 break;
1003 default:
1004 assert(0 == "bad cache_private");
1005 }
1006 if (!b->is_empty()) {
1007 buffer_bytes += b->length;
1008 buffer_list_bytes[b->cache_private] += b->length;
1009 }
1010 }
1011
1012 void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
1013 {
1014 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1015 if (!b->is_empty()) {
1016 assert((int64_t)buffer_bytes + delta >= 0);
1017 buffer_bytes += delta;
1018 assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
1019 buffer_list_bytes[b->cache_private] += delta;
1020 }
1021 }
1022
1023 void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
1024 {
1025 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
1026 << " buffers " << buffer_bytes << " / " << buffer_max
1027 << dendl;
1028
1029 _audit("trim start");
1030
1031 // buffers
1032 if (buffer_bytes > buffer_max) {
1033 uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
1034 uint64_t khot = buffer_max - kin;
1035
1036 // pre-calculate kout based on the average buffer size as a typical
1037 // value (the warm_in and hot lists may change later)
1038 uint64_t kout = 0;
1039 uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
1040 if (buffer_num) {
1041 uint64_t buffer_avg_size = buffer_bytes / buffer_num;
1042 assert(buffer_avg_size);
1043 uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
1044 kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1045 }
1046
1047 if (buffer_list_bytes[BUFFER_HOT] < khot) {
1048 // hot is small, give slack to warm_in
1049 kin += khot - buffer_list_bytes[BUFFER_HOT];
1050 } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
1051 // warm_in is small, give slack to hot
1052 khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
1053 }
1054
1055 // adjust warm_in list
1056 int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
1057 uint64_t evicted = 0;
1058
1059 while (to_evict_bytes > 0) {
1060 auto p = buffer_warm_in.rbegin();
1061 if (p == buffer_warm_in.rend()) {
1062 // stop if warm_in list is now empty
1063 break;
1064 }
1065
1066 Buffer *b = &*p;
1067 assert(b->is_clean());
1068 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1069 assert(buffer_bytes >= b->length);
1070 buffer_bytes -= b->length;
1071 assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
1072 buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
1073 to_evict_bytes -= b->length;
1074 evicted += b->length;
1075 b->state = Buffer::STATE_EMPTY;
1076 b->data.clear();
1077 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1078 buffer_warm_out.push_front(*b);
1079 b->cache_private = BUFFER_WARM_OUT;
1080 }
1081
1082 if (evicted > 0) {
1083 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1084 << " from warm_in list, done evicting warm_in buffers"
1085 << dendl;
1086 }
1087
1088 // adjust hot list
1089 to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
1090 evicted = 0;
1091
1092 while (to_evict_bytes > 0) {
1093 auto p = buffer_hot.rbegin();
1094 if (p == buffer_hot.rend()) {
1095 // stop if hot list is now empty
1096 break;
1097 }
1098
1099 Buffer *b = &*p;
1100 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1101 assert(b->is_clean());
1102 // adjust evict size before buffer goes invalid
1103 to_evict_bytes -= b->length;
1104 evicted += b->length;
1105 b->space->_rm_buffer(this, b);
1106 }
1107
1108 if (evicted > 0) {
1109 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1110 << " from hot list, done evicting hot buffers"
1111 << dendl;
1112 }
1113
1114 // adjust warm out list too, if necessary
1115 int64_t num = buffer_warm_out.size() - kout;
1116 while (num-- > 0) {
1117 Buffer *b = &*buffer_warm_out.rbegin();
1118 assert(b->is_empty());
1119 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1120 b->space->_rm_buffer(this, b);
1121 }
1122 }
1123
1124 // onodes
1125 if (onode_max >= onode_lru.size()) {
1126 return; // don't even try
1127 }
1128 uint64_t num = onode_lru.size() - onode_max;
1129
1130 auto p = onode_lru.end();
1131 assert(p != onode_lru.begin());
1132 --p;
1133 int skipped = 0;
1134 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
1135 while (num > 0) {
1136 Onode *o = &*p;
1137 dout(20) << __func__ << " considering " << o << dendl;
1138 int refs = o->nref.load();
1139 if (refs > 1) {
1140 dout(20) << __func__ << " " << o->oid << " has " << refs
1141 << " refs; skipping" << dendl;
1142 if (++skipped >= max_skipped) {
1143 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
1144 << num << " left to trim" << dendl;
1145 break;
1146 }
1147
1148 if (p == onode_lru.begin()) {
1149 break;
1150 } else {
1151 p--;
1152 num--;
1153 continue;
1154 }
1155 }
1156 dout(30) << __func__ << " " << o->oid << " num=" << num << " lru size=" << onode_lru.size() << dendl;
1157 if (p != onode_lru.begin()) {
1158 onode_lru.erase(p--);
1159 } else {
1160 onode_lru.erase(p);
1161 assert(num == 1);
1162 }
1163 o->get(); // paranoia
1164 o->c->onode_map.remove(o->oid);
1165 o->put();
1166 --num;
1167 }
1168 }
1169
1170 #ifdef DEBUG_CACHE
1171 void BlueStore::TwoQCache::_audit(const char *when)
1172 {
1173 dout(10) << __func__ << " " << when << " start" << dendl;
1174 uint64_t s = 0;
1175 for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
1176 s += i->length;
1177 }
1178
1179 uint64_t hot_bytes = s;
1180 if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
1181 derr << __func__ << " hot_list_bytes "
1182 << buffer_list_bytes[BUFFER_HOT]
1183 << " != actual " << hot_bytes
1184 << dendl;
1185 assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
1186 }
1187
1188 for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
1189 s += i->length;
1190 }
1191
1192 uint64_t warm_in_bytes = s - hot_bytes;
1193 if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
1194 derr << __func__ << " warm_in_list_bytes "
1195 << buffer_list_bytes[BUFFER_WARM_IN]
1196 << " != actual " << warm_in_bytes
1197 << dendl;
1198 assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
1199 }
1200
1201 if (s != buffer_bytes) {
1202 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1203 << dendl;
1204 assert(s == buffer_bytes);
1205 }
1206
1207 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1208 << " ok" << dendl;
1209 }
1210 #endif
1211
1212
1213 // BufferSpace
1214
1215 #undef dout_prefix
1216 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1217
1218 void BlueStore::BufferSpace::_clear(Cache* cache)
1219 {
1220 // note: we already hold cache->lock
1221 ldout(cache->cct, 20) << __func__ << dendl;
1222 while (!buffer_map.empty()) {
1223 _rm_buffer(cache, buffer_map.begin());
1224 }
1225 }
1226
1227 int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
1228 {
1229 // note: we already hold cache->lock
1230 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1231 << std::dec << dendl;
1232 int cache_private = 0;
1233 cache->_audit("discard start");
1234 auto i = _data_lower_bound(offset);
1235 uint32_t end = offset + length;
1236 while (i != buffer_map.end()) {
1237 Buffer *b = i->second.get();
1238 if (b->offset >= end) {
1239 break;
1240 }
1241 if (b->cache_private > cache_private) {
1242 cache_private = b->cache_private;
1243 }
1244 if (b->offset < offset) {
1245 int64_t front = offset - b->offset;
1246 if (b->end() > end) {
1247 // drop middle (split)
1248 uint32_t tail = b->end() - end;
1249 if (b->data.length()) {
1250 bufferlist bl;
1251 bl.substr_of(b->data, b->length - tail, tail);
1252 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1253 nb->maybe_rebuild();
1254 _add_buffer(cache, nb, 0, b);
1255 } else {
1256 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1257 0, b);
1258 }
1259 if (!b->is_writing()) {
1260 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1261 }
1262 b->truncate(front);
1263 b->maybe_rebuild();
1264 cache->_audit("discard end 1");
1265 break;
1266 } else {
1267 // drop tail
1268 if (!b->is_writing()) {
1269 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1270 }
1271 b->truncate(front);
1272 b->maybe_rebuild();
1273 ++i;
1274 continue;
1275 }
1276 }
1277 if (b->end() <= end) {
1278 // drop entire buffer
1279 _rm_buffer(cache, i++);
1280 continue;
1281 }
1282 // drop front
1283 uint32_t keep = b->end() - end;
1284 if (b->data.length()) {
1285 bufferlist bl;
1286 bl.substr_of(b->data, b->length - keep, keep);
1287 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1288 nb->maybe_rebuild();
1289 _add_buffer(cache, nb, 0, b);
1290 } else {
1291 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1292 }
1293 _rm_buffer(cache, i);
1294 cache->_audit("discard end 2");
1295 break;
1296 }
1297 return cache_private;
1298 }
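// The returned cache_private is the highest 2Q list hint seen among the
// discarded buffers; a caller that immediately rewrites the range (e.g.
// BufferSpace::write() in the header) typically assigns it to the replacement
// buffer so _add_buffer() keeps the data on a comparable list.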
1299
1300 void BlueStore::BufferSpace::read(
1301 Cache* cache,
1302 uint32_t offset,
1303 uint32_t length,
1304 BlueStore::ready_regions_t& res,
1305 interval_set<uint32_t>& res_intervals,
1306 int flags)
1307 {
1308 res.clear();
1309 res_intervals.clear();
1310 uint32_t want_bytes = length;
1311 uint32_t end = offset + length;
1312
1313 {
1314 std::lock_guard<std::recursive_mutex> l(cache->lock);
1315 for (auto i = _data_lower_bound(offset);
1316 i != buffer_map.end() && offset < end && i->first < end;
1317 ++i) {
1318 Buffer *b = i->second.get();
1319 assert(b->end() > offset);
1320
1321 bool val = false;
1322 if (flags & BYPASS_CLEAN_CACHE)
1323 val = b->is_writing();
1324 else
1325 val = b->is_writing() || b->is_clean();
1326 if (val) {
1327 if (b->offset < offset) {
1328 uint32_t skip = offset - b->offset;
1329 uint32_t l = MIN(length, b->length - skip);
1330 res[offset].substr_of(b->data, skip, l);
1331 res_intervals.insert(offset, l);
1332 offset += l;
1333 length -= l;
1334 if (!b->is_writing()) {
1335 cache->_touch_buffer(b);
1336 }
1337 continue;
1338 }
1339 if (b->offset > offset) {
1340 uint32_t gap = b->offset - offset;
1341 if (length <= gap) {
1342 break;
1343 }
1344 offset += gap;
1345 length -= gap;
1346 }
1347 if (!b->is_writing()) {
1348 cache->_touch_buffer(b);
1349 }
1350 if (b->length > length) {
1351 res[offset].substr_of(b->data, 0, length);
1352 res_intervals.insert(offset, length);
1353 break;
1354 } else {
1355 res[offset].append(b->data);
1356 res_intervals.insert(offset, b->length);
1357 if (b->length == length)
1358 break;
1359 offset += b->length;
1360 length -= b->length;
1361 }
1362 }
1363 }
1364 }
1365
1366 uint64_t hit_bytes = res_intervals.size();
1367 assert(hit_bytes <= want_bytes);
1368 uint64_t miss_bytes = want_bytes - hit_bytes;
1369 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1370 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1371 }
1372
1373 void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq)
1374 {
1375 auto i = writing.begin();
1376 while (i != writing.end()) {
1377 if (i->seq > seq) {
1378 break;
1379 }
1380 if (i->seq < seq) {
1381 ++i;
1382 continue;
1383 }
1384
1385 Buffer *b = &*i;
1386 assert(b->is_writing());
1387
1388 if (b->flags & Buffer::FLAG_NOCACHE) {
1389 writing.erase(i++);
1390 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1391 buffer_map.erase(b->offset);
1392 } else {
1393 b->state = Buffer::STATE_CLEAN;
1394 writing.erase(i++);
1395 b->maybe_rebuild();
1396 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1397 cache->_add_buffer(b, 1, nullptr);
1398 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1399 }
1400 }
1401
1402 cache->_audit("finish_write end");
1403 }
1404
1405 void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1406 {
1407 std::lock_guard<std::recursive_mutex> lk(cache->lock);
1408 if (buffer_map.empty())
1409 return;
1410
1411 auto p = --buffer_map.end();
1412 while (true) {
1413 if (p->second->end() <= pos)
1414 break;
1415
1416 if (p->second->offset < pos) {
1417 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1418 size_t left = pos - p->second->offset;
1419 size_t right = p->second->length - left;
1420 if (p->second->data.length()) {
1421 bufferlist bl;
1422 bl.substr_of(p->second->data, left, right);
1423 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1424 0, p->second.get());
1425 } else {
1426 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1427 0, p->second.get());
1428 }
1429 cache->_adjust_buffer_size(p->second.get(), -right);
1430 p->second->truncate(left);
1431 break;
1432 }
1433
1434 assert(p->second->end() > pos);
1435 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1436 if (p->second->data.length()) {
1437 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1438 p->second->offset - pos, p->second->data),
1439 0, p->second.get());
1440 } else {
1441 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1442 p->second->offset - pos, p->second->length),
1443 0, p->second.get());
1444 }
1445 if (p == buffer_map.begin()) {
1446 _rm_buffer(cache, p);
1447 break;
1448 } else {
1449 _rm_buffer(cache, p--);
1450 }
1451 }
1452 assert(writing.empty());
1453 }
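// Buffers entirely below pos stay put, a buffer straddling pos is cut at the
// boundary, and buffers at or above pos move into r with their offsets
// rebased by -pos; the final assert expects the writing list to be empty,
// i.e. all writes flushed before the split.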
1454
1455 // OnodeSpace
1456
1457 #undef dout_prefix
1458 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1459
1460 BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1461 {
1462 std::lock_guard<std::recursive_mutex> l(cache->lock);
1463 auto p = onode_map.find(oid);
1464 if (p != onode_map.end()) {
1465 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1466 << " raced, returning existing " << p->second
1467 << dendl;
1468 return p->second;
1469 }
1470 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1471 onode_map[oid] = o;
1472 cache->_add_onode(o, 1);
1473 return o;
1474 }
1475
1476 BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1477 {
1478 ldout(cache->cct, 30) << __func__ << dendl;
1479 OnodeRef o;
1480 bool hit = false;
1481
1482 {
1483 std::lock_guard<std::recursive_mutex> l(cache->lock);
1484 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1485 if (p == onode_map.end()) {
1486 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1487 } else {
1488 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1489 << dendl;
1490 cache->_touch_onode(p->second);
1491 hit = true;
1492 o = p->second;
1493 }
1494 }
1495
1496 if (hit) {
1497 cache->logger->inc(l_bluestore_onode_hits);
1498 } else {
1499 cache->logger->inc(l_bluestore_onode_misses);
1500 }
1501 return o;
1502 }
1503
1504 void BlueStore::OnodeSpace::clear()
1505 {
1506 std::lock_guard<std::recursive_mutex> l(cache->lock);
1507 ldout(cache->cct, 10) << __func__ << dendl;
1508 for (auto &p : onode_map) {
1509 cache->_rm_onode(p.second);
1510 }
1511 onode_map.clear();
1512 }
1513
1514 bool BlueStore::OnodeSpace::empty()
1515 {
1516 std::lock_guard<std::recursive_mutex> l(cache->lock);
1517 return onode_map.empty();
1518 }
1519
1520 void BlueStore::OnodeSpace::rename(
1521 OnodeRef& oldo,
1522 const ghobject_t& old_oid,
1523 const ghobject_t& new_oid,
1524 const mempool::bluestore_cache_other::string& new_okey)
1525 {
1526 std::lock_guard<std::recursive_mutex> l(cache->lock);
1527 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1528 << dendl;
1529 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1530 po = onode_map.find(old_oid);
1531 pn = onode_map.find(new_oid);
1532 assert(po != pn);
1533
1534 assert(po != onode_map.end());
1535 if (pn != onode_map.end()) {
1536 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1537 << dendl;
1538 cache->_rm_onode(pn->second);
1539 onode_map.erase(pn);
1540 }
1541 OnodeRef o = po->second;
1542
1543 // install a non-existent onode at old location
1544 oldo.reset(new Onode(o->c, old_oid, o->key));
1545 po->second = oldo;
1546 cache->_add_onode(po->second, 1);
1547
1548 // add at new position and fix oid, key
1549 onode_map.insert(make_pair(new_oid, o));
1550 cache->_touch_onode(o);
1551 o->oid = new_oid;
1552 o->key = new_okey;
1553 }
1554
1555 bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1556 {
1557 std::lock_guard<std::recursive_mutex> l(cache->lock);
1558 ldout(cache->cct, 20) << __func__ << dendl;
1559 for (auto& i : onode_map) {
1560 if (f(i.second)) {
1561 return true;
1562 }
1563 }
1564 return false;
1565 }
1566
1567 void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
1568 {
1569 for (auto& i : onode_map) {
1570 ldout(cct, lvl) << i.first << " : " << i.second << dendl;
1571 }
1572 }
1573
1574 // SharedBlob
1575
1576 #undef dout_prefix
1577 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1578
1579 ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1580 {
1581 out << "SharedBlob(" << &sb;
1582
1583 if (sb.loaded) {
1584 out << " loaded " << *sb.persistent;
1585 } else {
1586 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1587 }
1588 return out << ")";
1589 }
1590
1591 BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1592 : coll(_coll), sbid_unloaded(i)
1593 {
1594 assert(sbid_unloaded > 0);
1595 if (get_cache()) {
1596 get_cache()->add_blob();
1597 }
1598 }
1599
1600 BlueStore::SharedBlob::~SharedBlob()
1601 {
1602 if (loaded && persistent) {
1603 delete persistent;
1604 }
1605 }
1606
1607 void BlueStore::SharedBlob::put()
1608 {
1609 if (--nref == 0) {
1610 ldout(coll->store->cct, 20) << __func__ << " " << this
1611 << " removing self from set " << get_parent()
1612 << dendl;
1613 again:
1614 auto coll_snap = coll;
1615 if (coll_snap) {
1616 std::lock_guard<std::recursive_mutex> l(coll_snap->cache->lock);
1617 if (coll_snap != coll) {
1618 goto again;
1619 }
1620 if (!coll_snap->shared_blob_set.remove(this, true)) {
1621 // race with lookup
1622 return;
1623 }
1624 bc._clear(coll_snap->cache);
1625 coll_snap->cache->rm_blob();
1626 }
1627 delete this;
1628 }
1629 }
1630
1631 void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1632 {
1633 assert(persistent);
1634 persistent->ref_map.get(offset, length);
1635 }
1636
1637 void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
1638 PExtentVector *r,
1639 set<SharedBlob*> *maybe_unshared)
1640 {
1641 assert(persistent);
1642 bool maybe = false;
1643 persistent->ref_map.put(offset, length, r, maybe_unshared ? &maybe : nullptr);
1644 if (maybe_unshared && maybe) {
1645 maybe_unshared->insert(this);
1646 }
1647 }
1648
1649 void BlueStore::SharedBlob::finish_write(uint64_t seq)
1650 {
1651 while (true) {
1652 Cache *cache = coll->cache;
1653 std::lock_guard<std::recursive_mutex> l(cache->lock);
1654 if (coll->cache != cache) {
1655 ldout(coll->store->cct, 20) << __func__
1656 << " raced with sb cache update, was " << cache
1657 << ", now " << coll->cache << ", retrying"
1658 << dendl;
1659 continue;
1660 }
1661 bc._finish_write(cache, seq);
1662 break;
1663 }
1664 }
1665
1666 // SharedBlobSet
1667
1668 #undef dout_prefix
1669 #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
1670
1671 void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
1672 {
1673 std::lock_guard<std::mutex> l(lock);
1674 for (auto& i : sb_map) {
1675 ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
1676 }
1677 }
1678
1679 // Blob
1680
1681 #undef dout_prefix
1682 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1683
1684 ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1685 {
1686 out << "Blob(" << &b;
1687 if (b.is_spanning()) {
1688 out << " spanning " << b.id;
1689 }
1690 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1691 if (b.shared_blob) {
1692 out << " " << *b.shared_blob;
1693 } else {
1694 out << " (shared_blob=NULL)";
1695 }
1696 out << ")";
1697 return out;
1698 }
1699
1700 void BlueStore::Blob::discard_unallocated(Collection *coll)
1701 {
1702 if (get_blob().is_shared()) {
1703 return;
1704 }
1705 if (get_blob().is_compressed()) {
1706 bool discard = false;
1707 bool all_invalid = true;
1708 for (auto e : get_blob().get_extents()) {
1709 if (!e.is_valid()) {
1710 discard = true;
1711 } else {
1712 all_invalid = false;
1713 }
1714 }
1715 assert(discard == all_invalid); // for a compressed blob either all
1716 // pextents are invalid or none are.
1717 if (discard) {
1718 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1719 get_blob().get_logical_length());
1720 }
1721 } else {
1722 size_t pos = 0;
1723 for (auto e : get_blob().get_extents()) {
1724 if (!e.is_valid()) {
1725 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1726 << "~" << e.length
1727 << std::dec << dendl;
1728 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1729 }
1730 pos += e.length;
1731 }
1732 if (get_blob().can_prune_tail()) {
1733 dirty_blob().prune_tail();
1734 used_in_blob.prune_tail(get_blob().get_ondisk_length());
1735 auto cct = coll->store->cct; //used by dout
1736 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
1737 }
1738 }
1739 }
1740
1741 void BlueStore::Blob::get_ref(
1742 Collection *coll,
1743 uint32_t offset,
1744 uint32_t length)
1745 {
1746 // The caller has to initialize the Blob's logical length prior to incrementing
1747 // references. Otherwise it is impossible to determine the required
1748 // number of counters for per-au tracking, or to obtain min_release_size
1749 // for single-counter mode.
1750 assert(get_blob().get_logical_length() != 0);
1751 auto cct = coll->store->cct;
1752 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1753 << std::dec << " " << *this << dendl;
1754
1755 if (used_in_blob.is_empty()) {
1756 uint32_t min_release_size =
1757 get_blob().get_release_size(coll->store->min_alloc_size);
1758 uint64_t l = get_blob().get_logical_length();
1759 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1760 << min_release_size << std::dec << dendl;
1761 used_in_blob.init(l, min_release_size);
1762 }
1763 used_in_blob.get(
1764 offset,
1765 length);
1766 }
1767
1768 bool BlueStore::Blob::put_ref(
1769 Collection *coll,
1770 uint32_t offset,
1771 uint32_t length,
1772 PExtentVector *r)
1773 {
1774 PExtentVector logical;
1775
1776 auto cct = coll->store->cct;
1777 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1778 << std::dec << " " << *this << dendl;
1779
1780 bool empty = used_in_blob.put(
1781 offset,
1782 length,
1783 &logical);
1784 r->clear();
1785 // nothing to release
1786 if (!empty && logical.empty()) {
1787 return false;
1788 }
1789
1790 bluestore_blob_t& b = dirty_blob();
1791 return b.release_extents(empty, logical, r);
1792 }
1793
1794 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
1795 uint32_t target_blob_size,
1796 uint32_t b_offset,
1797 uint32_t *length0) {
1798 assert(min_alloc_size);
1799 assert(target_blob_size);
1800 if (!get_blob().is_mutable()) {
1801 return false;
1802 }
1803
1804 uint32_t length = *length0;
1805 uint32_t end = b_offset + length;
1806
1807 // Currently for the sake of simplicity we omit blob reuse if data is
1808 // unaligned with csum chunk. Later we can perform padding if needed.
1809 if (get_blob().has_csum() &&
1810 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1811 (end % get_blob().get_csum_chunk_size()) != 0)) {
1812 return false;
1813 }
1814
1815 auto blen = get_blob().get_logical_length();
1816 uint32_t new_blen = blen;
1817
1818 // make sure target_blob_size isn't less than current blob len
1819 target_blob_size = MAX(blen, target_blob_size);
1820
1821 if (b_offset >= blen) {
1822 // new data totally stands out of the existing blob
1823 new_blen = end;
1824 } else {
1825 // new data overlaps with the existing blob
1826 new_blen = MAX(blen, end);
1827
1828 uint32_t overlap = 0;
1829 if (new_blen > blen) {
1830 overlap = blen - b_offset;
1831 } else {
1832 overlap = length;
1833 }
1834
1835 if (!get_blob().is_unallocated(b_offset, overlap)) {
1836 // abort if any piece of the overlap has already been allocated
1837 return false;
1838 }
1839 }
1840
1841 if (new_blen > blen) {
1842 int64_t overflow = int64_t(new_blen) - target_blob_size;
1843 // Unable to decrease the provided length enough to fit into target_blob_size
1844 if (overflow >= length) {
1845 return false;
1846 }
1847
1848 // FIXME: in some cases we could reduce unused resolution
1849 if (get_blob().has_unused()) {
1850 return false;
1851 }
1852
1853 if (overflow > 0) {
1854 new_blen -= overflow;
1855 length -= overflow;
1856 *length0 = length;
1857 }
1858
1859 if (new_blen > blen) {
1860 dirty_blob().add_tail(new_blen);
1861 used_in_blob.add_tail(new_blen,
1862 get_blob().get_release_size(min_alloc_size));
1863 }
1864 }
1865 return true;
1866 }
1867
1868 void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1869 {
1870 auto cct = coll->store->cct; //used by dout
1871 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1872 << " start " << *this << dendl;
1873 assert(blob.can_split());
1874 assert(used_in_blob.can_split());
1875 bluestore_blob_t &lb = dirty_blob();
1876 bluestore_blob_t &rb = r->dirty_blob();
1877
1878 used_in_blob.split(
1879 blob_offset,
1880 &(r->used_in_blob));
1881
1882 lb.split(blob_offset, rb);
1883 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1884
1885 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1886 << " finish " << *this << dendl;
1887 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1888 << " and " << *r << dendl;
1889 }
1890
1891 #ifndef CACHE_BLOB_BL
1892 void BlueStore::Blob::decode(
1893 Collection *coll,
1894 bufferptr::iterator& p,
1895 uint64_t struct_v,
1896 uint64_t* sbid,
1897 bool include_ref_map)
1898 {
1899 denc(blob, p, struct_v);
1900 if (blob.is_shared()) {
1901 denc(*sbid, p);
1902 }
1903 if (include_ref_map) {
1904 if (struct_v > 1) {
1905 used_in_blob.decode(p);
1906 } else {
1907 used_in_blob.clear();
1908 bluestore_extent_ref_map_t legacy_ref_map;
1909 legacy_ref_map.decode(p);
1910 for (auto r : legacy_ref_map.ref_map) {
1911 get_ref(
1912 coll,
1913 r.first,
1914 r.second.refs * r.second.length);
1915 }
1916 }
1917 }
1918 }
1919 #endif
1920
1921 // Extent
1922
1923 ostream& operator<<(ostream& out, const BlueStore::Extent& e)
1924 {
1925 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
1926 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
1927 << " " << *e.blob;
1928 }
1929
1930 // OldExtent
1931 BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
1932 uint32_t lo,
1933 uint32_t o,
1934 uint32_t l,
1935 BlobRef& b) {
1936 OldExtent* oe = new OldExtent(lo, o, l, b);
1937 b->put_ref(c.get(), o, l, &(oe->r));
1938 oe->blob_empty = b->get_referenced_bytes() == 0;
1939 return oe;
1940 }
1941
1942 // ExtentMap
1943
1944 #undef dout_prefix
1945 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1946
1947 BlueStore::ExtentMap::ExtentMap(Onode *o)
1948 : onode(o),
1949 inline_bl(
1950 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
1951 }
1952
1953 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
1954 bool force)
1955 {
1956 auto cct = onode->c->store->cct; //used by dout
1957 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
1958 if (onode->onode.extent_map_shards.empty()) {
1959 if (inline_bl.length() == 0) {
1960 unsigned n;
1961 // we need to encode inline_bl to measure encoded length
1962 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
1963 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
1964 assert(!never_happen);
1965 size_t len = inline_bl.length();
1966 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
1967 << " extents" << dendl;
1968 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
1969 request_reshard(0, OBJECT_MAX_SIZE);
1970 return;
1971 }
1972 }
1973 // will persist in the onode key.
1974 } else {
1975 // pending shard update
1976 struct dirty_shard_t {
1977 Shard *shard;
1978 bufferlist bl;
1979 dirty_shard_t(Shard *s) : shard(s) {}
1980 };
1981 vector<dirty_shard_t> encoded_shards;
1982 // allocate slots for all shards in a single call instead of
1983 // doing multiple allocations - one per dirty shard
1984 encoded_shards.reserve(shards.size());
1985
1986 auto p = shards.begin();
1987 auto prev_p = p;
1988 while (p != shards.end()) {
1989 assert(p->shard_info->offset >= prev_p->shard_info->offset);
1990 auto n = p;
1991 ++n;
1992 if (p->dirty) {
1993 uint32_t endoff;
1994 if (n == shards.end()) {
1995 endoff = OBJECT_MAX_SIZE;
1996 } else {
1997 endoff = n->shard_info->offset;
1998 }
1999 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2000 bufferlist& bl = encoded_shards.back().bl;
2001 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2002 bl, &p->extents)) {
2003 if (force) {
2004 derr << __func__ << " encode_some needs reshard" << dendl;
2005 assert(!force);
2006 }
2007 }
2008 size_t len = bl.length();
2009
2010 dout(20) << __func__ << " shard 0x" << std::hex
2011 << p->shard_info->offset << std::dec << " is " << len
2012 << " bytes (was " << p->shard_info->bytes << ") from "
2013 << p->extents << " extents" << dendl;
2014
2015 if (!force) {
2016 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2017 // we are big; reshard ourselves
2018 request_reshard(p->shard_info->offset, endoff);
2019 }
2020 // avoid resharding the trailing shard, even if it is small
2021 else if (n != shards.end() &&
2022 len < g_conf->bluestore_extent_map_shard_min_size) {
2023 assert(endoff != OBJECT_MAX_SIZE);
2024 if (p == shards.begin()) {
2025 // we are the first shard, combine with next shard
2026 request_reshard(p->shard_info->offset, endoff + 1);
2027 } else {
2028 // combine either with the previous shard or the next,
2029 // whichever is smaller
2030 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2031 request_reshard(p->shard_info->offset, endoff + 1);
2032 } else {
2033 request_reshard(prev_p->shard_info->offset, endoff);
2034 }
2035 }
2036 }
2037 }
2038 }
2039 prev_p = p;
2040 p = n;
2041 }
2042 if (needs_reshard()) {
2043 return;
2044 }
2045
2046 // schedule DB update for dirty shards
2047 string key;
2048 for (auto& it : encoded_shards) {
2049 it.shard->dirty = false;
2050 it.shard->shard_info->bytes = it.bl.length();
2051 generate_extent_shard_key_and_apply(
2052 onode->key,
2053 it.shard->shard_info->offset,
2054 &key,
2055 [&](const string& final_key) {
2056 t->set(PREFIX_OBJ, final_key, it.bl);
2057 }
2058 );
2059 }
2060 }
2061 }
2062
2063 bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2064 {
2065 if (spanning_blob_map.empty())
2066 return 0;
2067 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2068 // if the id did not wrap, it is valid and available.
2069 if (bid >= 0)
2070 return bid;
2071 // Find next unused bid;
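// The dense fast path wrapped (the largest possible id is already in use),
// so probe linearly from a random non-negative starting point, wrapping to 0,
// until a free id turns up.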
2072 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2073 const auto begin_bid = bid;
2074 do {
2075 if (!spanning_blob_map.count(bid))
2076 return bid;
2077 else {
2078 bid++;
2079 if (bid < 0) bid = 0;
2080 }
2081 } while (bid != begin_bid);
2082 assert(0 == "no available blob id");
2083 }
2084
2085 void BlueStore::ExtentMap::reshard(
2086 KeyValueDB *db,
2087 KeyValueDB::Transaction t)
2088 {
2089 auto cct = onode->c->store->cct; // used by dout
2090
2091 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2092 << needs_reshard_end << ")" << std::dec
2093 << " of " << onode->onode.extent_map_shards.size()
2094 << " shards on " << onode->oid << dendl;
2095 for (auto& p : spanning_blob_map) {
2096 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2097 << dendl;
2098 }
2099 // determine shard index range
2100 unsigned si_begin = 0, si_end = 0;
2101 if (!shards.empty()) {
2102 while (si_begin + 1 < shards.size() &&
2103 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2104 ++si_begin;
2105 }
2106 needs_reshard_begin = shards[si_begin].shard_info->offset;
2107 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2108 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2109 needs_reshard_end = shards[si_end].shard_info->offset;
2110 break;
2111 }
2112 }
2113 if (si_end == shards.size()) {
2114 needs_reshard_end = OBJECT_MAX_SIZE;
2115 }
2116 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2117 << " over 0x[" << std::hex << needs_reshard_begin << ","
2118 << needs_reshard_end << ")" << std::dec << dendl;
2119 }
2120
2121 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2122
2123 // we may need to fault in a larger interval later: we must have all
2124 // referring extents for spanning blobs loaded in order to have
2125 // accurate use_tracker values.
2126 uint32_t spanning_scan_begin = needs_reshard_begin;
2127 uint32_t spanning_scan_end = needs_reshard_end;
2128
2129 // remove old keys
2130 string key;
2131 for (unsigned i = si_begin; i < si_end; ++i) {
2132 generate_extent_shard_key_and_apply(
2133 onode->key, shards[i].shard_info->offset, &key,
2134 [&](const string& final_key) {
2135 t->rmkey(PREFIX_OBJ, final_key);
2136 }
2137 );
2138 }
2139
2140 // calculate average extent size
2141 unsigned bytes = 0;
2142 unsigned extents = 0;
2143 if (onode->onode.extent_map_shards.empty()) {
2144 bytes = inline_bl.length();
2145 extents = extent_map.size();
2146 } else {
2147 for (unsigned i = si_begin; i < si_end; ++i) {
2148 bytes += shards[i].shard_info->bytes;
2149 extents += shards[i].extents;
2150 }
2151 }
2152 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2153 unsigned slop = target *
2154 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2155 unsigned extent_avg = bytes / MAX(1, extents);
2156 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2157 << ", slop " << slop << dendl;
2158
2159 // reshard
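// Walk the extents in the reshard range, charging extent_avg per extent; when
// the running estimate would exceed the target (plus slop if cutting here
// would make a blob span shards), start a new shard at that extent's offset.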
2160 unsigned estimate = 0;
2161 unsigned offset = needs_reshard_begin;
2162 vector<bluestore_onode_t::shard_info> new_shard_info;
2163 unsigned max_blob_end = 0;
2164 Extent dummy(needs_reshard_begin);
2165 for (auto e = extent_map.lower_bound(dummy);
2166 e != extent_map.end();
2167 ++e) {
2168 if (e->logical_offset >= needs_reshard_end) {
2169 break;
2170 }
2171 dout(30) << " extent " << *e << dendl;
2172
2173 // disfavor shard boundaries that span a blob
2174 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
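// would_span: this extent begins inside a blob that started earlier, or maps
// into the middle of its blob, so a shard boundary here would force that blob
// to span shards.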
2175 if (estimate &&
2176 estimate + extent_avg > target + (would_span ? slop : 0)) {
2177 // new shard
2178 if (offset == needs_reshard_begin) {
2179 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2180 new_shard_info.back().offset = offset;
2181 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2182 << std::dec << dendl;
2183 }
2184 offset = e->logical_offset;
2185 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2186 new_shard_info.back().offset = offset;
2187 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2188 << std::dec << dendl;
2189 estimate = 0;
2190 }
2191 estimate += extent_avg;
2192 unsigned bs = e->blob_start();
2193 if (bs < spanning_scan_begin) {
2194 spanning_scan_begin = bs;
2195 }
2196 uint32_t be = e->blob_end();
2197 if (be > max_blob_end) {
2198 max_blob_end = be;
2199 }
2200 if (be > spanning_scan_end) {
2201 spanning_scan_end = be;
2202 }
2203 }
2204 if (new_shard_info.empty() && (si_begin > 0 ||
2205 si_end < shards.size())) {
2206 // we resharded a partial range; we must produce at least one output
2207 // shard
2208 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2209 new_shard_info.back().offset = needs_reshard_begin;
2210 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2211 << std::dec << " (singleton degenerate case)" << dendl;
2212 }
2213
2214 auto& sv = onode->onode.extent_map_shards;
2215 dout(20) << __func__ << " new " << new_shard_info << dendl;
2216 dout(20) << __func__ << " old " << sv << dendl;
2217 if (sv.empty()) {
2218 // no old shards to keep
2219 sv.swap(new_shard_info);
2220 init_shards(true, true);
2221 } else {
2222 // splice in new shards
2223 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2224 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2225 sv.insert(
2226 sv.begin() + si_begin,
2227 new_shard_info.begin(),
2228 new_shard_info.end());
2229 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2230 si_end = si_begin + new_shard_info.size();
2231
2232 assert(sv.size() == shards.size());
2233
2234 // note that we need to update every shard_info of shards here,
2235 // as sv might have been totally re-allocated above
2236 for (unsigned i = 0; i < shards.size(); i++) {
2237 shards[i].shard_info = &sv[i];
2238 }
2239
2240 // mark newly added shards as dirty
2241 for (unsigned i = si_begin; i < si_end; ++i) {
2242 shards[i].loaded = true;
2243 shards[i].dirty = true;
2244 }
2245 }
2246 dout(20) << __func__ << " fin " << sv << dendl;
2247 inline_bl.clear();
2248
2249 if (sv.empty()) {
2250 // no more shards; unspan all previously spanning blobs
2251 auto p = spanning_blob_map.begin();
2252 while (p != spanning_blob_map.end()) {
2253 p->second->id = -1;
2254 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2255 p = spanning_blob_map.erase(p);
2256 }
2257 } else {
2258 // identify new spanning blobs
2259 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2260 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2261 if (spanning_scan_begin < needs_reshard_begin) {
2262 fault_range(db, spanning_scan_begin,
2263 needs_reshard_begin - spanning_scan_begin);
2264 }
2265 if (spanning_scan_end > needs_reshard_end) {
2266 fault_range(db, needs_reshard_end,
2267 spanning_scan_end - needs_reshard_end);
2268 }
2269 auto sp = sv.begin() + si_begin;
2270 auto esp = sv.end();
2271 unsigned shard_start = sp->offset;
2272 unsigned shard_end;
2273 ++sp;
2274 if (sp == esp) {
2275 shard_end = OBJECT_MAX_SIZE;
2276 } else {
2277 shard_end = sp->offset;
2278 }
2279 Extent dummy(needs_reshard_begin);
2280 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2281 if (e->logical_offset >= needs_reshard_end) {
2282 break;
2283 }
2284 dout(30) << " extent " << *e << dendl;
2285 while (e->logical_offset >= shard_end) {
2286 shard_start = shard_end;
2287 assert(sp != esp);
2288 ++sp;
2289 if (sp == esp) {
2290 shard_end = OBJECT_MAX_SIZE;
2291 } else {
2292 shard_end = sp->offset;
2293 }
2294 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2295 << " to 0x" << shard_end << std::dec << dendl;
2296 }
2297 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2298 if (!e->blob->is_spanning()) {
2299 // We have two options: (1) split the blob into pieces at the
2300 // shard boundaries (and adjust extents accordingly), or (2)
2301 // mark it spanning. We prefer to cut the blob if we can. Note that
2302 // we may have to split it multiple times--potentially at every
2303 // shard boundary.
2304 bool must_span = false;
2305 BlobRef b = e->blob;
2306 if (b->can_split()) {
2307 uint32_t bstart = e->blob_start();
2308 uint32_t bend = e->blob_end();
2309 for (const auto& sh : shards) {
2310 if (bstart < sh.shard_info->offset &&
2311 bend > sh.shard_info->offset) {
2312 uint32_t blob_offset = sh.shard_info->offset - bstart;
2313 if (b->can_split_at(blob_offset)) {
2314 dout(20) << __func__ << " splitting blob, bstart 0x"
2315 << std::hex << bstart << " blob_offset 0x"
2316 << blob_offset << std::dec << " " << *b << dendl;
2317 b = split_blob(b, blob_offset, sh.shard_info->offset);
2318 // switch b to the new right-hand side, in case it
2319 // *also* has to get split.
2320 bstart += blob_offset;
2321 onode->c->store->logger->inc(l_bluestore_blob_split);
2322 } else {
2323 must_span = true;
2324 break;
2325 }
2326 }
2327 }
2328 } else {
2329 must_span = true;
2330 }
2331 if (must_span) {
2332 auto bid = allocate_spanning_blob_id();
2333 b->id = bid;
2334 spanning_blob_map[b->id] = b;
2335 dout(20) << __func__ << " adding spanning " << *b << dendl;
2336 }
2337 }
2338 } else {
2339 if (e->blob->is_spanning()) {
2340 spanning_blob_map.erase(e->blob->id);
2341 e->blob->id = -1;
2342 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2343 }
2344 }
2345 }
2346 }
2347
2348 clear_needs_reshard();
2349 }
2350
2351 bool BlueStore::ExtentMap::encode_some(
2352 uint32_t offset,
2353 uint32_t length,
2354 bufferlist& bl,
2355 unsigned *pn)
2356 {
2357 auto cct = onode->c->store->cct; //used by dout
2358 Extent dummy(offset);
2359 auto start = extent_map.lower_bound(dummy);
2360 uint32_t end = offset + length;
2361
2362 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2363 // serialization only. Hence there is no specific
2364 // handling at ExtentMap level.
2365
2366 unsigned n = 0;
2367 size_t bound = 0;
2368 bool must_reshard = false;
2369 for (auto p = start;
2370 p != extent_map.end() && p->logical_offset < end;
2371 ++p, ++n) {
2372 assert(p->logical_offset >= offset);
2373 p->blob->last_encoded_id = -1;
2374 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2375 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2376 << std::dec << " hit new spanning blob " << *p << dendl;
2377 request_reshard(p->blob_start(), p->blob_end());
2378 must_reshard = true;
2379 }
2380 if (!must_reshard) {
2381 denc_varint(0, bound); // blobid
2382 denc_varint(0, bound); // logical_offset
2383 denc_varint(0, bound); // len
2384 denc_varint(0, bound); // blob_offset
2385
2386 p->blob->bound_encode(
2387 bound,
2388 struct_v,
2389 p->blob->shared_blob->get_sbid(),
2390 false);
2391 }
2392 }
2393 if (must_reshard) {
2394 return true;
2395 }
2396
2397 denc(struct_v, bound);
2398 denc_varint(0, bound); // number of extents
2399
2400 {
2401 auto app = bl.get_contiguous_appender(bound);
2402 denc(struct_v, app);
2403 denc_varint(n, app);
2404 if (pn) {
2405 *pn = n;
2406 }
2407
2408 n = 0;
2409 uint64_t pos = 0;
2410 uint64_t prev_len = 0;
2411 for (auto p = start;
2412 p != extent_map.end() && p->logical_offset < end;
2413 ++p, ++n) {
2414 unsigned blobid;
2415 bool include_blob = false;
2416 if (p->blob->is_spanning()) {
2417 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2418 blobid |= BLOBID_FLAG_SPANNING;
2419 } else if (p->blob->last_encoded_id < 0) {
2420 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2421 include_blob = true;
2422 blobid = 0; // the decoder will infer the id from n
2423 } else {
2424 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2425 }
2426 if (p->logical_offset == pos) {
2427 blobid |= BLOBID_FLAG_CONTIGUOUS;
2428 }
2429 if (p->blob_offset == 0) {
2430 blobid |= BLOBID_FLAG_ZEROOFFSET;
2431 }
2432 if (p->length == prev_len) {
2433 blobid |= BLOBID_FLAG_SAMELENGTH;
2434 } else {
2435 prev_len = p->length;
2436 }
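// Example: a non-spanning extent that reuses already-encoded blob #3, starts
// where the previous extent ended and has blob_offset 0 encodes
// blobid = (3 << BLOBID_SHIFT_BITS) | CONTIGUOUS | ZEROOFFSET = 0x33,
// followed only by a varint length (omitted too if it matches the previous).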
2437 denc_varint(blobid, app);
2438 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2439 denc_varint_lowz(p->logical_offset - pos, app);
2440 }
2441 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2442 denc_varint_lowz(p->blob_offset, app);
2443 }
2444 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2445 denc_varint_lowz(p->length, app);
2446 }
2447 pos = p->logical_end();
2448 if (include_blob) {
2449 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2450 }
2451 }
2452 }
2453 /*derr << __func__ << bl << dendl;
2454 derr << __func__ << ":";
2455 bl.hexdump(*_dout);
2456 *_dout << dendl;
2457 */
2458 return false;
2459 }
2460
2461 unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2462 {
2463 auto cct = onode->c->store->cct; //used by dout
2464 /*
2465 derr << __func__ << ":";
2466 bl.hexdump(*_dout);
2467 *_dout << dendl;
2468 */
2469
2470 assert(bl.get_num_buffers() <= 1);
2471 auto p = bl.front().begin_deep();
2472 __u8 struct_v;
2473 denc(struct_v, p);
2474 // Version 2 differs from v1 in blob's ref_map
2475 // serialization only. Hence there is no specific
2476 // handling at ExtentMap level below.
2477 assert(struct_v == 1 || struct_v == 2);
2478
2479 uint32_t num;
2480 denc_varint(num, p);
2481 vector<BlobRef> blobs(num);
2482 uint64_t pos = 0;
2483 uint64_t prev_len = 0;
2484 unsigned n = 0;
2485
2486 while (!p.end()) {
2487 Extent *le = new Extent();
2488 uint64_t blobid;
2489 denc_varint(blobid, p);
2490 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2491 uint64_t gap;
2492 denc_varint_lowz(gap, p);
2493 pos += gap;
2494 }
2495 le->logical_offset = pos;
2496 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2497 denc_varint_lowz(le->blob_offset, p);
2498 } else {
2499 le->blob_offset = 0;
2500 }
2501 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2502 denc_varint_lowz(prev_len, p);
2503 }
2504 le->length = prev_len;
2505
2506 if (blobid & BLOBID_FLAG_SPANNING) {
2507 dout(30) << __func__ << " getting spanning blob "
2508 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2509 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2510 } else {
2511 blobid >>= BLOBID_SHIFT_BITS;
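// a zero blobid means the blob payload follows inline and is indexed by its
// decode order (n); otherwise blobid - 1 indexes a blob already decoded from
// this shard.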
2512 if (blobid) {
2513 le->assign_blob(blobs[blobid - 1]);
2514 assert(le->blob);
2515 } else {
2516 Blob *b = new Blob();
2517 uint64_t sbid = 0;
2518 b->decode(onode->c, p, struct_v, &sbid, false);
2519 blobs[n] = b;
2520 onode->c->open_shared_blob(sbid, b);
2521 le->assign_blob(b);
2522 }
2523 // we build ref_map dynamically for non-spanning blobs
2524 le->blob->get_ref(
2525 onode->c,
2526 le->blob_offset,
2527 le->length);
2528 }
2529 pos += prev_len;
2530 ++n;
2531 extent_map.insert(*le);
2532 }
2533
2534 assert(n == num);
2535 return num;
2536 }
2537
2538 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2539 {
2540 // Version 2 differs from v1 in blob's ref_map
2541 // serialization only. Hence there is no specific
2542 // handling at ExtentMap level.
2543 __u8 struct_v = 2;
2544
2545 denc(struct_v, p);
2546 denc_varint((uint32_t)0, p);
2547 size_t key_size = 0;
2548 denc_varint((uint32_t)0, key_size);
2549 p += spanning_blob_map.size() * key_size;
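// the line above reserves key_size bytes of bound per spanning blob for its
// varint id; the per-blob payload bounds are accumulated below.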
2550 for (const auto& i : spanning_blob_map) {
2551 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2552 }
2553 }
2554
2555 void BlueStore::ExtentMap::encode_spanning_blobs(
2556 bufferlist::contiguous_appender& p)
2557 {
2558 // Version 2 differs from v1 in blob's ref_map
2559 // serialization only. Hence there is no specific
2560 // handling at ExtentMap level.
2561 __u8 struct_v = 2;
2562
2563 denc(struct_v, p);
2564 denc_varint(spanning_blob_map.size(), p);
2565 for (auto& i : spanning_blob_map) {
2566 denc_varint(i.second->id, p);
2567 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2568 }
2569 }
2570
2571 void BlueStore::ExtentMap::decode_spanning_blobs(
2572 bufferptr::iterator& p)
2573 {
2574 __u8 struct_v;
2575 denc(struct_v, p);
2576 // Version 2 differs from v1 in blob's ref_map
2577 // serialization only. Hence there is no specific
2578 // handling at ExtentMap level.
2579 assert(struct_v == 1 || struct_v == 2);
2580
2581 unsigned n;
2582 denc_varint(n, p);
2583 while (n--) {
2584 BlobRef b(new Blob());
2585 denc_varint(b->id, p);
2586 spanning_blob_map[b->id] = b;
2587 uint64_t sbid = 0;
2588 b->decode(onode->c, p, struct_v, &sbid, true);
2589 onode->c->open_shared_blob(sbid, b);
2590 }
2591 }
2592
2593 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2594 {
2595 shards.resize(onode->onode.extent_map_shards.size());
2596 unsigned i = 0;
2597 for (auto &s : onode->onode.extent_map_shards) {
2598 shards[i].shard_info = &s;
2599 shards[i].loaded = loaded;
2600 shards[i].dirty = dirty;
2601 ++i;
2602 }
2603 }
2604
2605 void BlueStore::ExtentMap::fault_range(
2606 KeyValueDB *db,
2607 uint32_t offset,
2608 uint32_t length)
2609 {
2610 auto cct = onode->c->store->cct; //used by dout
2611 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2612 << std::dec << dendl;
2613 auto start = seek_shard(offset);
2614 auto last = seek_shard(offset + length);
2615
2616 if (start < 0)
2617 return;
2618
2619 assert(last >= start);
2620 string key;
2621 while (start <= last) {
2622 assert((size_t)start < shards.size());
2623 auto p = &shards[start];
2624 if (!p->loaded) {
2625 dout(30) << __func__ << " opening shard 0x" << std::hex
2626 << p->shard_info->offset << std::dec << dendl;
2627 bufferlist v;
2628 generate_extent_shard_key_and_apply(
2629 onode->key, p->shard_info->offset, &key,
2630 [&](const string& final_key) {
2631 int r = db->get(PREFIX_OBJ, final_key, &v);
2632 if (r < 0) {
2633 derr << __func__ << " missing shard 0x" << std::hex
2634 << p->shard_info->offset << std::dec << " for " << onode->oid
2635 << dendl;
2636 assert(r >= 0);
2637 }
2638 }
2639 );
2640 p->extents = decode_some(v);
2641 p->loaded = true;
2642 dout(20) << __func__ << " open shard 0x" << std::hex
2643 << p->shard_info->offset << std::dec
2644 << " (" << v.length() << " bytes)" << dendl;
2645 assert(p->dirty == false);
2646 assert(v.length() == p->shard_info->bytes);
2647 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2648 } else {
2649 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2650 }
2651 ++start;
2652 }
2653 }
2654
2655 void BlueStore::ExtentMap::dirty_range(
2656 uint32_t offset,
2657 uint32_t length)
2658 {
2659 auto cct = onode->c->store->cct; //used by dout
2660 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2661 << std::dec << dendl;
2662 if (shards.empty()) {
2663 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2664 inline_bl.clear();
2665 return;
2666 }
2667 auto start = seek_shard(offset);
2668 auto last = seek_shard(offset + length);
2669 if (start < 0)
2670 return;
2671
2672 assert(last >= start);
2673 while (start <= last) {
2674 assert((size_t)start < shards.size());
2675 auto p = &shards[start];
2676 if (!p->loaded) {
2677 dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
2678 << std::dec << " is not loaded, can't mark dirty" << dendl;
2679 assert(0 == "can't mark unloaded shard dirty");
2680 }
2681 if (!p->dirty) {
2682 dout(20) << __func__ << " mark shard 0x" << std::hex
2683 << p->shard_info->offset << std::dec << " dirty" << dendl;
2684 p->dirty = true;
2685 }
2686 ++start;
2687 }
2688 }
2689
2690 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2691 uint64_t offset)
2692 {
2693 Extent dummy(offset);
2694 return extent_map.find(dummy);
2695 }
2696
2697 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2698 uint64_t offset)
2699 {
2700 Extent dummy(offset);
2701 auto fp = extent_map.lower_bound(dummy);
2702 if (fp != extent_map.begin()) {
2703 --fp;
2704 if (fp->logical_end() <= offset) {
2705 ++fp;
2706 }
2707 }
2708 return fp;
2709 }
2710
2711 BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2712 uint64_t offset) const
2713 {
2714 Extent dummy(offset);
2715 auto fp = extent_map.lower_bound(dummy);
2716 if (fp != extent_map.begin()) {
2717 --fp;
2718 if (fp->logical_end() <= offset) {
2719 ++fp;
2720 }
2721 }
2722 return fp;
2723 }
2724
2725 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2726 {
2727 auto fp = seek_lextent(offset);
2728 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2729 return false;
2730 }
2731 return true;
2732 }
2733
2734 int BlueStore::ExtentMap::compress_extent_map(
2735 uint64_t offset,
2736 uint64_t length)
2737 {
2738 auto cct = onode->c->store->cct; //used by dout
2739 if (extent_map.empty())
2740 return 0;
2741 int removed = 0;
2742 auto p = seek_lextent(offset);
2743 if (p != extent_map.begin()) {
2744 --p; // start to the left of offset
2745 }
2746 // the caller should have just written to this region
2747 assert(p != extent_map.end());
2748
2749 // identify the *next* shard
2750 auto pshard = shards.begin();
2751 while (pshard != shards.end() &&
2752 p->logical_offset >= pshard->shard_info->offset) {
2753 ++pshard;
2754 }
2755 uint64_t shard_end;
2756 if (pshard != shards.end()) {
2757 shard_end = pshard->shard_info->offset;
2758 } else {
2759 shard_end = OBJECT_MAX_SIZE;
2760 }
2761
2762 auto n = p;
2763 for (++n; n != extent_map.end(); p = n++) {
2764 if (n->logical_offset > offset + length) {
2765 break; // stop after end
2766 }
2767 while (n != extent_map.end() &&
2768 p->logical_end() == n->logical_offset &&
2769 p->blob == n->blob &&
2770 p->blob_offset + p->length == n->blob_offset &&
2771 n->logical_offset < shard_end) {
2772 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2773 << " next shard 0x" << shard_end << std::dec
2774 << " merging " << *p << " and " << *n << dendl;
2775 p->length += n->length;
2776 rm(n++);
2777 ++removed;
2778 }
2779 if (n == extent_map.end()) {
2780 break;
2781 }
2782 if (n->logical_offset >= shard_end) {
2783 assert(pshard != shards.end());
2784 ++pshard;
2785 if (pshard != shards.end()) {
2786 shard_end = pshard->shard_info->offset;
2787 } else {
2788 shard_end = OBJECT_MAX_SIZE;
2789 }
2790 }
2791 }
2792 if (removed && onode) {
2793 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2794 }
2795 return removed;
2796 }
2797
2798 void BlueStore::ExtentMap::punch_hole(
2799 CollectionRef &c,
2800 uint64_t offset,
2801 uint64_t length,
2802 old_extent_map_t *old_extents)
2803 {
2804 auto p = seek_lextent(offset);
2805 uint64_t end = offset + length;
2806 while (p != extent_map.end()) {
2807 if (p->logical_offset >= end) {
2808 break;
2809 }
2810 if (p->logical_offset < offset) {
2811 if (p->logical_end() > end) {
2812 // split and deref middle
2813 uint64_t front = offset - p->logical_offset;
2814 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
2815 length, p->blob);
2816 old_extents->push_back(*oe);
2817 add(end,
2818 p->blob_offset + front + length,
2819 p->length - front - length,
2820 p->blob);
2821 p->length = front;
2822 break;
2823 } else {
2824 // deref tail
2825 assert(p->logical_end() > offset); // else seek_lextent bug
2826 uint64_t keep = offset - p->logical_offset;
2827 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
2828 p->length - keep, p->blob);
2829 old_extents->push_back(*oe);
2830 p->length = keep;
2831 ++p;
2832 continue;
2833 }
2834 }
2835 if (p->logical_offset + p->length <= end) {
2836 // deref whole lextent
2837 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2838 p->length, p->blob);
2839 old_extents->push_back(*oe);
2840 rm(p++);
2841 continue;
2842 }
2843 // deref head
2844 uint64_t keep = p->logical_end() - end;
2845 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2846 p->length - keep, p->blob);
2847 old_extents->push_back(*oe);
2848
2849 add(end, p->blob_offset + p->length - keep, keep, p->blob);
2850 rm(p);
2851 break;
2852 }
2853 }
2854
2855 BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
2856 CollectionRef &c,
2857 uint64_t logical_offset,
2858 uint64_t blob_offset, uint64_t length, BlobRef b,
2859 old_extent_map_t *old_extents)
2860 {
2861 // We need a completely initialized Blob to increment its ref counters.
2862 assert(b->get_blob().get_logical_length() != 0);
2863
2864 // Do get_ref prior to punch_hole to prevent putting a reused blob into the
2865 // old_extents list if we overwrite the blob totally.
2866 // This might happen during WAL overwrite.
2867 b->get_ref(onode->c, blob_offset, length);
2868
2869 if (old_extents) {
2870 punch_hole(c, logical_offset, length, old_extents);
2871 }
2872
2873 Extent *le = new Extent(logical_offset, blob_offset, length, b);
2874 extent_map.insert(*le);
2875 if (spans_shard(logical_offset, length)) {
2876 request_reshard(logical_offset, logical_offset + length);
2877 }
2878 return le;
2879 }
2880
2881 BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
2882 BlobRef lb,
2883 uint32_t blob_offset,
2884 uint32_t pos)
2885 {
2886 auto cct = onode->c->store->cct; //used by dout
2887
2888 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
2889 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
2890 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
2891 << dendl;
2892 BlobRef rb = onode->c->new_blob();
2893 lb->split(onode->c, blob_offset, rb.get());
2894
2895 for (auto ep = seek_lextent(pos);
2896 ep != extent_map.end() && ep->logical_offset < end_pos;
2897 ++ep) {
2898 if (ep->blob != lb) {
2899 continue;
2900 }
2901 if (ep->logical_offset < pos) {
2902 // split extent
2903 size_t left = pos - ep->logical_offset;
2904 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
2905 extent_map.insert(*ne);
2906 ep->length = left;
2907 dout(30) << __func__ << " split " << *ep << dendl;
2908 dout(30) << __func__ << " to " << *ne << dendl;
2909 } else {
2910 // switch blob
2911 assert(ep->blob_offset >= blob_offset);
2912
2913 ep->blob = rb;
2914 ep->blob_offset -= blob_offset;
2915 dout(30) << __func__ << " adjusted " << *ep << dendl;
2916 }
2917 }
2918 return rb;
2919 }
2920
2921 // Onode
2922
2923 #undef dout_prefix
2924 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
2925
2926 void BlueStore::Onode::flush()
2927 {
2928 if (flushing_count.load()) {
2929 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
2930 std::unique_lock<std::mutex> l(flush_lock);
2931 while (flushing_count.load()) {
2932 flush_cond.wait(l);
2933 }
2934 }
2935 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
2936 }
2937
2938 // =======================================================
2939 // WriteContext
2940
2941 /// Checks for writes to the same pextent within a blob
2942 bool BlueStore::WriteContext::has_conflict(
2943 BlobRef b,
2944 uint64_t loffs,
2945 uint64_t loffs_end,
2946 uint64_t min_alloc_size)
2947 {
2948 assert((loffs % min_alloc_size) == 0);
2949 assert((loffs_end % min_alloc_size) == 0);
2950 for (auto w : writes) {
2951 if (b == w.b) {
2952 auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
2953 auto loffs2_end = P2ROUNDUP(w.logical_offset + w.length0, min_alloc_size);
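// round the previously queued write out to min_alloc_size boundaries and
// report a conflict if the new range [loffs, loffs_end) overlaps it.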
2954 if ((loffs <= loffs2 && loffs_end > loffs2) ||
2955 (loffs >= loffs2 && loffs < loffs2_end)) {
2956 return true;
2957 }
2958 }
2959 }
2960 return false;
2961 }
2962
2963 // =======================================================
2964
2965 // DeferredBatch
2966 #undef dout_prefix
2967 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
2968
2969 void BlueStore::DeferredBatch::prepare_write(
2970 CephContext *cct,
2971 uint64_t seq, uint64_t offset, uint64_t length,
2972 bufferlist::const_iterator& blp)
2973 {
2974 _discard(cct, offset, length);
2975 auto i = iomap.insert(make_pair(offset, deferred_io()));
2976 assert(i.second); // this should be a new insertion
2977 i.first->second.seq = seq;
2978 blp.copy(length, i.first->second.bl);
2979 i.first->second.bl.reassign_to_mempool(
2980 mempool::mempool_bluestore_writing_deferred);
2981 dout(20) << __func__ << " seq " << seq
2982 << " 0x" << std::hex << offset << "~" << length
2983 << " crc " << i.first->second.bl.crc32c(-1)
2984 << std::dec << dendl;
2985 seq_bytes[seq] += length;
2986 #ifdef DEBUG_DEFERRED
2987 _audit(cct);
2988 #endif
2989 }
2990
2991 void BlueStore::DeferredBatch::_discard(
2992 CephContext *cct, uint64_t offset, uint64_t length)
2993 {
2994 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2995 << std::dec << dendl;
2996 auto p = iomap.lower_bound(offset);
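// the entry just before lower_bound(offset) may still overlap the discard
// range: keep its head, and re-insert its tail if it extends past the range.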
2997 if (p != iomap.begin()) {
2998 --p;
2999 auto end = p->first + p->second.bl.length();
3000 if (end > offset) {
3001 bufferlist head;
3002 head.substr_of(p->second.bl, 0, offset - p->first);
3003 dout(20) << __func__ << " keep head " << p->second.seq
3004 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3005 << " -> 0x" << head.length() << std::dec << dendl;
3006 auto i = seq_bytes.find(p->second.seq);
3007 assert(i != seq_bytes.end());
3008 if (end > offset + length) {
3009 bufferlist tail;
3010 tail.substr_of(p->second.bl, offset + length - p->first,
3011 end - (offset + length));
3012 dout(20) << __func__ << " keep tail " << p->second.seq
3013 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3014 << " -> 0x" << tail.length() << std::dec << dendl;
3015 auto &n = iomap[offset + length];
3016 n.bl.swap(tail);
3017 n.seq = p->second.seq;
3018 i->second -= length;
3019 } else {
3020 i->second -= end - offset;
3021 }
3022 assert(i->second >= 0);
3023 p->second.bl.swap(head);
3024 }
3025 ++p;
3026 }
3027 while (p != iomap.end()) {
3028 if (p->first >= offset + length) {
3029 break;
3030 }
3031 auto i = seq_bytes.find(p->second.seq);
3032 assert(i != seq_bytes.end());
3033 auto end = p->first + p->second.bl.length();
3034 if (end > offset + length) {
3035 unsigned drop_front = offset + length - p->first;
3036 unsigned keep_tail = end - (offset + length);
3037 dout(20) << __func__ << " truncate front " << p->second.seq
3038 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3039 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3040 << " to 0x" << (offset + length) << "~" << keep_tail
3041 << std::dec << dendl;
3042 auto &s = iomap[offset + length];
3043 s.seq = p->second.seq;
3044 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3045 i->second -= drop_front;
3046 } else {
3047 dout(20) << __func__ << " drop " << p->second.seq
3048 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3049 << std::dec << dendl;
3050 i->second -= p->second.bl.length();
3051 }
3052 assert(i->second >= 0);
3053 p = iomap.erase(p);
3054 }
3055 }
3056
3057 void BlueStore::DeferredBatch::_audit(CephContext *cct)
3058 {
3059 map<uint64_t,int> sb;
3060 for (auto p : seq_bytes) {
3061 sb[p.first] = 0; // make sure we have the same set of keys
3062 }
3063 uint64_t pos = 0;
3064 for (auto& p : iomap) {
3065 assert(p.first >= pos);
3066 sb[p.second.seq] += p.second.bl.length();
3067 pos = p.first + p.second.bl.length();
3068 }
3069 assert(sb == seq_bytes);
3070 }
3071
3072
3073 // Collection
3074
3075 #undef dout_prefix
3076 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3077
3078 BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
3079 : store(ns),
3080 cache(c),
3081 cid(cid),
3082 lock("BlueStore::Collection::lock", true, false),
3083 exists(true),
3084 onode_map(c)
3085 {
3086 }
3087
3088 void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3089 {
3090 assert(!b->shared_blob);
3091 const bluestore_blob_t& blob = b->get_blob();
3092 if (!blob.is_shared()) {
3093 b->shared_blob = new SharedBlob(this);
3094 return;
3095 }
3096
3097 b->shared_blob = shared_blob_set.lookup(sbid);
3098 if (b->shared_blob) {
3099 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3100 << std::dec << " had " << *b->shared_blob << dendl;
3101 } else {
3102 b->shared_blob = new SharedBlob(sbid, this);
3103 shared_blob_set.add(this, b->shared_blob.get());
3104 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3105 << std::dec << " opened " << *b->shared_blob
3106 << dendl;
3107 }
3108 }
3109
3110 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3111 {
3112 if (!sb->is_loaded()) {
3113
3114 bufferlist v;
3115 string key;
3116 auto sbid = sb->get_sbid();
3117 get_shared_blob_key(sbid, &key);
3118 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3119 if (r < 0) {
3120 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3121 << std::dec << " not found at key "
3122 << pretty_binary_string(key) << dendl;
3123 assert(0 == "uh oh, missing shared_blob");
3124 }
3125
3126 sb->loaded = true;
3127 sb->persistent = new bluestore_shared_blob_t(sbid);
3128 bufferlist::iterator p = v.begin();
3129 ::decode(*(sb->persistent), p);
3130 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3131 << std::dec << " loaded shared_blob " << *sb << dendl;
3132 }
3133 }
3134
3135 void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3136 {
3137 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
3138 assert(!b->shared_blob->is_loaded());
3139
3140 // update blob
3141 bluestore_blob_t& blob = b->dirty_blob();
3142 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
3143
3144 // update shared blob
3145 b->shared_blob->loaded = true;
3146 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3147 shared_blob_set.add(this, b->shared_blob.get());
3148 for (auto p : blob.get_extents()) {
3149 if (p.is_valid()) {
3150 b->shared_blob->get_ref(
3151 p.offset,
3152 p.length);
3153 }
3154 }
3155 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3156 }
3157
3158 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3159 {
3160 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3161 assert(sb->is_loaded());
3162
3163 uint64_t sbid = sb->get_sbid();
3164 shared_blob_set.remove(sb);
3165 sb->loaded = false;
3166 delete sb->persistent;
3167 sb->sbid_unloaded = 0;
3168 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3169 return sbid;
3170 }
3171
3172 BlueStore::OnodeRef BlueStore::Collection::get_onode(
3173 const ghobject_t& oid,
3174 bool create)
3175 {
3176 assert(create ? lock.is_wlocked() : lock.is_locked());
3177
3178 spg_t pgid;
3179 if (cid.is_pg(&pgid)) {
3180 if (!oid.match(cnode.bits, pgid.ps())) {
3181 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3182 << pgid << " bits " << cnode.bits << dendl;
3183 ceph_abort();
3184 }
3185 }
3186
3187 OnodeRef o = onode_map.lookup(oid);
3188 if (o)
3189 return o;
3190
3191 mempool::bluestore_cache_other::string key;
3192 get_object_key(store->cct, oid, &key);
3193
3194 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3195 << pretty_binary_string(key) << dendl;
3196
3197 bufferlist v;
3198 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3199 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3200 Onode *on;
3201 if (v.length() == 0) {
3202 assert(r == -ENOENT);
3203 if (!store->cct->_conf->bluestore_debug_misc &&
3204 !create)
3205 return OnodeRef();
3206
3207 // new object, new onode
3208 on = new Onode(this, oid, key);
3209 } else {
3210 // loaded
3211 assert(r >= 0);
3212 on = new Onode(this, oid, key);
3213 on->exists = true;
3214 bufferptr::iterator p = v.front().begin_deep();
3215 on->onode.decode(p);
3216 for (auto& i : on->onode.attrs) {
3217 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3218 }
3219
3220 // initialize extent_map
3221 on->extent_map.decode_spanning_blobs(p);
3222 if (on->onode.extent_map_shards.empty()) {
3223 denc(on->extent_map.inline_bl, p);
3224 on->extent_map.decode_some(on->extent_map.inline_bl);
3225 on->extent_map.inline_bl.reassign_to_mempool(
3226 mempool::mempool_bluestore_cache_other);
3227 } else {
3228 on->extent_map.init_shards(false, false);
3229 }
3230 }
3231 o.reset(on);
3232 return onode_map.add(oid, o);
3233 }
3234
3235 void BlueStore::Collection::split_cache(
3236 Collection *dest)
3237 {
3238 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3239
3240 // lock (one or both) cache shards
3241 std::lock(cache->lock, dest->cache->lock);
3242 std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
3243 std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);
3244
3245 int destbits = dest->cnode.bits;
3246 spg_t destpg;
3247 bool is_pg = dest->cid.is_pg(&destpg);
3248 assert(is_pg);
3249
3250 auto p = onode_map.onode_map.begin();
3251 while (p != onode_map.onode_map.end()) {
3252 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3253 // onode does not belong to this child
3254 ++p;
3255 } else {
3256 OnodeRef o = p->second;
3257 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3258 << dendl;
3259
3260 cache->_rm_onode(p->second);
3261 p = onode_map.onode_map.erase(p);
3262
3263 o->c = dest;
3264 dest->cache->_add_onode(o, 1);
3265 dest->onode_map.onode_map[o->oid] = o;
3266 dest->onode_map.cache = dest->cache;
3267
3268 // move over shared blobs and buffers. cover shared blobs from
3269 // both extent map and spanning blob map (the full extent map
3270 // may not be faulted in)
3271 vector<SharedBlob*> sbvec;
3272 for (auto& e : o->extent_map.extent_map) {
3273 sbvec.push_back(e.blob->shared_blob.get());
3274 }
3275 for (auto& b : o->extent_map.spanning_blob_map) {
3276 sbvec.push_back(b.second->shared_blob.get());
3277 }
3278 for (auto sb : sbvec) {
3279 if (sb->coll == dest) {
3280 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3281 << dendl;
3282 continue;
3283 }
3284 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
3285 if (sb->get_sbid()) {
3286 ldout(store->cct, 20) << __func__
3287 << " moving registration " << *sb << dendl;
3288 shared_blob_set.remove(sb);
3289 dest->shared_blob_set.add(dest, sb);
3290 }
3291 sb->coll = dest;
3292 if (dest->cache != cache) {
3293 for (auto& i : sb->bc.buffer_map) {
3294 if (!i.second->is_writing()) {
3295 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3296 << dendl;
3297 dest->cache->_move_buffer(cache, i.second.get());
3298 }
3299 }
3300 }
3301 }
3302 }
3303 }
3304 }
3305
3306 // =======================================================
3307
3308 // MempoolThread
3309
3310 #undef dout_prefix
3311 #define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
3312
3313 void *BlueStore::MempoolThread::entry()
3314 {
3315 Mutex::Locker l(lock);
3316
3317 std::list<PriorityCache::PriCache *> caches;
3318 caches.push_back(store->db);
3319 caches.push_back(&meta_cache);
3320 caches.push_back(&data_cache);
3321 autotune_cache_size = store->osd_memory_cache_min;
3322
3323 utime_t next_balance = ceph_clock_now();
3324 utime_t next_resize = ceph_clock_now();
3325
3326 bool interval_stats_trim = false;
3327 bool interval_stats_resize = false;
3328 while (!stop) {
3329 _adjust_cache_settings();
3330
3331 // Before we trim, check and see if it's time to rebalance/resize.
3332 double autotune_interval = store->cache_autotune_interval;
3333 double resize_interval = store->osd_memory_cache_resize_interval;
3334
3335 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
3336 // Log events at 5 instead of 20 when balance happens.
3337 interval_stats_resize = true;
3338 interval_stats_trim = true;
3339 if (store->cache_autotune) {
3340 _balance_cache(caches);
3341 }
3342
3343 next_balance = ceph_clock_now();
3344 next_balance += autotune_interval;
3345 }
3346 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
3347 if (ceph_using_tcmalloc() && store->cache_autotune) {
3348 _tune_cache_size(interval_stats_resize);
3349 interval_stats_resize = false;
3350 }
3351 next_resize = ceph_clock_now();
3352 next_resize += resize_interval;
3353 }
3354
3355 // Now Trim
3356 _trim_shards(interval_stats_trim);
3357 interval_stats_trim = false;
3358
3359 store->_update_cache_logger();
3360 utime_t wait;
3361 wait += store->cct->_conf->bluestore_cache_trim_interval;
3362 cond.WaitInterval(lock, wait);
3363 }
3364 stop = false;
3365 return NULL;
3366 }
3367
3368 void BlueStore::MempoolThread::_adjust_cache_settings()
3369 {
3370 store->db->set_cache_ratio(store->cache_kv_ratio);
3371 meta_cache.set_cache_ratio(store->cache_meta_ratio);
3372 data_cache.set_cache_ratio(store->cache_data_ratio);
3373 }
3374
3375 void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
3376 {
3377 auto cct = store->cct;
3378 size_t num_shards = store->cache_shards.size();
3379
3380 int64_t kv_used = store->db->get_cache_usage();
3381 int64_t meta_used = meta_cache._get_used_bytes();
3382 int64_t data_used = data_cache._get_used_bytes();
3383
3384 uint64_t cache_size = store->cache_size;
3385 int64_t kv_alloc =
3386 static_cast<int64_t>(store->db->get_cache_ratio() * cache_size);
3387 int64_t meta_alloc =
3388 static_cast<int64_t>(meta_cache.get_cache_ratio() * cache_size);
3389 int64_t data_alloc =
3390 static_cast<int64_t>(data_cache.get_cache_ratio() * cache_size);
3391
3392 if (store->cache_autotune) {
3393 cache_size = autotune_cache_size;
3394
3395 kv_alloc = store->db->get_cache_bytes();
3396 meta_alloc = meta_cache.get_cache_bytes();
3397 data_alloc = data_cache.get_cache_bytes();
3398 }
3399
3400 if (interval_stats) {
3401 ldout(cct, 5) << __func__ << " cache_size: " << cache_size
3402 << " kv_alloc: " << kv_alloc
3403 << " kv_used: " << kv_used
3404 << " meta_alloc: " << meta_alloc
3405 << " meta_used: " << meta_used
3406 << " data_alloc: " << data_alloc
3407 << " data_used: " << data_used << dendl;
3408 } else {
3409 ldout(cct, 20) << __func__ << " cache_size: " << cache_size
3410 << " kv_alloc: " << kv_alloc
3411 << " kv_used: " << kv_used
3412 << " meta_alloc: " << meta_alloc
3413 << " meta_used: " << meta_used
3414 << " data_alloc: " << data_alloc
3415 << " data_used: " << data_used << dendl;
3416 }
3417
3418 uint64_t max_shard_onodes = static_cast<uint64_t>(
3419 (meta_alloc / (double) num_shards) / meta_cache.get_bytes_per_onode());
3420 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);
3421
3422 ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
3423 << " max_shard_buffer: " << max_shard_buffer << dendl;
3424
3425 for (auto i : store->cache_shards) {
3426 i->trim(max_shard_onodes, max_shard_buffer);
3427 }
3428 }
3429
3430 void BlueStore::MempoolThread::_tune_cache_size(bool interval_stats)
3431 {
3432 auto cct = store->cct;
3433 uint64_t target = store->osd_memory_target;
3434 uint64_t base = store->osd_memory_base;
3435 double fragmentation = store->osd_memory_expected_fragmentation;
3436 uint64_t cache_min = store->osd_memory_cache_min;
3437 uint64_t cache_max = cache_min;
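// cache_max defaults to cache_min; if the fragmentation-adjusted memory
// target leaves room beyond the base (non-cache) footprint, raise it to that.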
3438 uint64_t limited_target = (1.0 - fragmentation) * target;
3439 if (limited_target > base + cache_min) {
3440 cache_max = limited_target - base;
3441 }
3442
3443 size_t heap_size = 0;
3444 size_t unmapped = 0;
3445 uint64_t mapped = 0;
3446
3447 ceph_heap_release_free_memory();
3448 ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
3449 ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
3450 mapped = heap_size - unmapped;
3451
3452 uint64_t new_size = autotune_cache_size;
3453 new_size = (new_size < cache_max) ? new_size : cache_max;
3454 new_size = (new_size > cache_min) ? new_size : cache_min;
3455
3456 // Approach the min/max slowly, but bounce away quickly.
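// e.g. if mapped is 80% of target we grow by 20% of the headroom up to
// cache_max; if target is 80% of mapped we shrink by 20% of the slack
// above cache_min.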
3457 if ((uint64_t) mapped < target) {
3458 double ratio = 1 - ((double) mapped / target);
3459 new_size += ratio * (cache_max - new_size);
3460 } else {
3461 double ratio = 1 - ((double) target / mapped);
3462 new_size -= ratio * (new_size - cache_min);
3463 }
3464
3465 if (interval_stats) {
3466 ldout(cct, 5) << __func__
3467 << " target: " << target
3468 << " heap: " << heap_size
3469 << " unmapped: " << unmapped
3470 << " mapped: " << mapped
3471 << " old cache_size: " << autotune_cache_size
3472 << " new cache size: " << new_size << dendl;
3473 } else {
3474 ldout(cct, 20) << __func__
3475 << " target: " << target
3476 << " heap: " << heap_size
3477 << " unmapped: " << unmapped
3478 << " mapped: " << mapped
3479 << " old cache_size: " << autotune_cache_size
3480 << " new cache size: " << new_size << dendl;
3481 }
3482 autotune_cache_size = new_size;
3483 }
3484
3485 void BlueStore::MempoolThread::_balance_cache(
3486 const std::list<PriorityCache::PriCache *>& caches)
3487 {
3488 int64_t mem_avail = autotune_cache_size;
3489
3490 // Assign memory for each priority level
3491 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
3492 ldout(store->cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;
3493 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
3494 _balance_cache_pri(&mem_avail, caches, pri);
3495 }
3496 // Assign any leftover memory based on the default ratios.
3497 if (mem_avail > 0) {
3498 for (auto it = caches.begin(); it != caches.end(); it++) {
3499 int64_t fair_share =
3500 static_cast<int64_t>((*it)->get_cache_ratio() * mem_avail);
3501 if (fair_share > 0) {
3502 (*it)->add_cache_bytes(PriorityCache::Priority::LAST, fair_share);
3503 }
3504 }
3505 }
3506 // assert if we assigned more memory than is available.
3507 assert(mem_avail >= 0);
3508
3509 // Finally commit the new cache sizes
3510 for (auto it = caches.begin(); it != caches.end(); it++) {
3511 (*it)->commit_cache_size();
3512 }
3513 }
3514
3515 void BlueStore::MempoolThread::_balance_cache_pri(int64_t *mem_avail,
3516 const std::list<PriorityCache::PriCache *>& caches, PriorityCache::Priority pri)
3517 {
3518 std::list<PriorityCache::PriCache *> tmp_caches = caches;
3519 double cur_ratios = 0;
3520 double new_ratios = 0;
3521
3522 // Zero this priority's bytes, sum the initial ratios.
3523 for (auto it = tmp_caches.begin(); it != tmp_caches.end(); it++) {
3524 (*it)->set_cache_bytes(pri, 0);
3525 cur_ratios += (*it)->get_cache_ratio();
3526 }
3527
3528 // For this priority, loop until caches are satisfied or we run out of memory.
3529 // Since we can't allocate fractional bytes, stop if we have fewer bytes left
3530 // than the number of participating caches.
3531 while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
3532 uint64_t total_assigned = 0;
3533
3534 for (auto it = tmp_caches.begin(); it != tmp_caches.end(); ) {
3535 int64_t cache_wants = (*it)->request_cache_bytes(pri, store->cache_autotune_chunk_size);
3536
3537 // Usually the ratio should be the current cache's assigned ratio divided
3538 // by the total ratio of all caches that still
3539 // want memory. There is a special case where the only caches left are
3540 // all assigned 0% ratios but still want memory. In that case, give
3541 // them an equal shot at the remaining memory for this priority.
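// e.g. three caches with ratios 0.4/0.4/0.2 and 100 units available are
// offered fair shares of 40/40/20 on the first pass of this priority.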
3542 double ratio = 1.0 / tmp_caches.size();
3543 if (cur_ratios > 0) {
3544 ratio = (*it)->get_cache_ratio() / cur_ratios;
3545 }
3546 int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
3547
3548 if (cache_wants > fair_share) {
3549 // If we want too much, take what we can get but stick around for more
3550 (*it)->add_cache_bytes(pri, fair_share);
3551 total_assigned += fair_share;
3552
3553 new_ratios += (*it)->get_cache_ratio();
3554 ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
3555 << " wanted: " << cache_wants << " fair_share: " << fair_share
3556 << " mem_avail: " << *mem_avail
3557 << " staying in list. Size: " << tmp_caches.size()
3558 << dendl;
3559 ++it;
3560 } else {
3561 // Otherwise assign only what we want
3562 if (cache_wants > 0) {
3563 (*it)->add_cache_bytes(pri, cache_wants);
3564 total_assigned += cache_wants;
3565
3566 ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
3567 << " wanted: " << cache_wants << " fair_share: " << fair_share
3568 << " mem_avail: " << *mem_avail
3569 << " removing from list. New size: " << tmp_caches.size() - 1
3570 << dendl;
3571
3572 }
3573 // Either the cache didn't want anything or got what it wanted, so remove it from the tmp list.
3574 it = tmp_caches.erase(it);
3575 }
3576 }
3577 // Reset the ratios
3578 *mem_avail -= total_assigned;
3579 cur_ratios = new_ratios;
3580 new_ratios = 0;
3581 }
3582 }
3583
3584 // =======================================================
3585
3586 // OmapIteratorImpl
3587
3588 #undef dout_prefix
3589 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3590
3591 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3592 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3593 : c(c), o(o), it(it)
3594 {
3595 RWLock::RLocker l(c->lock);
3596 if (o->onode.has_omap()) {
3597 get_omap_key(o->onode.nid, string(), &head);
3598 get_omap_tail(o->onode.nid, &tail);
3599 it->lower_bound(head);
3600 }
3601 }
3602
3603 int BlueStore::OmapIteratorImpl::seek_to_first()
3604 {
3605 RWLock::RLocker l(c->lock);
3606 if (o->onode.has_omap()) {
3607 it->lower_bound(head);
3608 } else {
3609 it = KeyValueDB::Iterator();
3610 }
3611 return 0;
3612 }
3613
3614 int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3615 {
3616 RWLock::RLocker l(c->lock);
3617 if (o->onode.has_omap()) {
3618 string key;
3619 get_omap_key(o->onode.nid, after, &key);
3620 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3621 << pretty_binary_string(key) << dendl;
3622 it->upper_bound(key);
3623 } else {
3624 it = KeyValueDB::Iterator();
3625 }
3626 return 0;
3627 }
3628
3629 int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3630 {
3631 RWLock::RLocker l(c->lock);
3632 if (o->onode.has_omap()) {
3633 string key;
3634 get_omap_key(o->onode.nid, to, &key);
3635 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3636 << pretty_binary_string(key) << dendl;
3637 it->lower_bound(key);
3638 } else {
3639 it = KeyValueDB::Iterator();
3640 }
3641 return 0;
3642 }
3643
3644 bool BlueStore::OmapIteratorImpl::valid()
3645 {
3646 RWLock::RLocker l(c->lock);
3647 bool r = o->onode.has_omap() && it && it->valid() &&
3648 it->raw_key().second <= tail;
3649 if (it && it->valid()) {
3650 ldout(c->store->cct,20) << __func__ << " is at "
3651 << pretty_binary_string(it->raw_key().second)
3652 << dendl;
3653 }
3654 return r;
3655 }
3656
3657 int BlueStore::OmapIteratorImpl::next(bool validate)
3658 {
3659 RWLock::RLocker l(c->lock);
3660 if (o->onode.has_omap()) {
3661 it->next();
3662 return 0;
3663 } else {
3664 return -1;
3665 }
3666 }
3667
3668 string BlueStore::OmapIteratorImpl::key()
3669 {
3670 RWLock::RLocker l(c->lock);
3671 assert(it->valid());
3672 string db_key = it->raw_key().second;
3673 string user_key;
3674 decode_omap_key(db_key, &user_key);
3675 return user_key;
3676 }
3677
3678 bufferlist BlueStore::OmapIteratorImpl::value()
3679 {
3680 RWLock::RLocker l(c->lock);
3681 assert(it->valid());
3682 return it->value();
3683 }
3684
3685
3686 // =====================================
3687
3688 #undef dout_prefix
3689 #define dout_prefix *_dout << "bluestore(" << path << ") "
3690
3691
3692 static void aio_cb(void *priv, void *priv2)
3693 {
3694 BlueStore *store = static_cast<BlueStore*>(priv);
3695 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3696 c->aio_finish(store);
3697 }
3698
3699 BlueStore::BlueStore(CephContext *cct, const string& path)
3700 : ObjectStore(cct, path),
3701 throttle_bytes(cct, "bluestore_throttle_bytes",
3702 cct->_conf->bluestore_throttle_bytes),
3703 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3704 cct->_conf->bluestore_throttle_bytes +
3705 cct->_conf->bluestore_throttle_deferred_bytes),
3706 deferred_finisher(cct, "defered_finisher", "dfin"),
3707 kv_sync_thread(this),
3708 kv_finalize_thread(this),
3709 mempool_thread(this)
3710 {
3711 _init_logger();
3712 cct->_conf->add_observer(this);
3713 set_cache_shards(1);
3714 }
3715
3716 BlueStore::BlueStore(CephContext *cct,
3717 const string& path,
3718 uint64_t _min_alloc_size)
3719 : ObjectStore(cct, path),
3720 throttle_bytes(cct, "bluestore_throttle_bytes",
3721 cct->_conf->bluestore_throttle_bytes),
3722 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3723 cct->_conf->bluestore_throttle_bytes +
3724 cct->_conf->bluestore_throttle_deferred_bytes),
3725 deferred_finisher(cct, "defered_finisher", "dfin"),
3726 kv_sync_thread(this),
3727 kv_finalize_thread(this),
3728 min_alloc_size(_min_alloc_size),
3729 min_alloc_size_order(ctz(_min_alloc_size)),
3730 mempool_thread(this)
3731 {
3732 _init_logger();
3733 cct->_conf->add_observer(this);
3734 set_cache_shards(1);
3735 }
3736
3737 BlueStore::~BlueStore()
3738 {
3739 for (auto f : finishers) {
3740 delete f;
3741 }
3742 finishers.clear();
3743
3744 cct->_conf->remove_observer(this);
3745 _shutdown_logger();
3746 assert(!mounted);
3747 assert(db == NULL);
3748 assert(bluefs == NULL);
3749 assert(fsid_fd < 0);
3750 assert(path_fd < 0);
3751 for (auto i : cache_shards) {
3752 delete i;
3753 }
3754 cache_shards.clear();
3755 }
3756
3757 const char **BlueStore::get_tracked_conf_keys() const
3758 {
3759 static const char* KEYS[] = {
3760 "bluestore_csum_type",
3761 "bluestore_compression_mode",
3762 "bluestore_compression_algorithm",
3763 "bluestore_compression_min_blob_size",
3764 "bluestore_compression_min_blob_size_ssd",
3765 "bluestore_compression_min_blob_size_hdd",
3766 "bluestore_compression_max_blob_size",
3767 "bluestore_compression_max_blob_size_ssd",
3768 "bluestore_compression_max_blob_size_hdd",
3769 "bluestore_compression_required_ratio",
3770 "bluestore_max_alloc_size",
3771 "bluestore_prefer_deferred_size",
3772 "bluestore_prefer_deferred_size_hdd",
3773 "bluestore_prefer_deferred_size_ssd",
3774 "bluestore_deferred_batch_ops",
3775 "bluestore_deferred_batch_ops_hdd",
3776 "bluestore_deferred_batch_ops_ssd",
3777 "bluestore_throttle_bytes",
3778 "bluestore_throttle_deferred_bytes",
3779 "bluestore_throttle_cost_per_io_hdd",
3780 "bluestore_throttle_cost_per_io_ssd",
3781 "bluestore_throttle_cost_per_io",
3782 "bluestore_max_blob_size",
3783 "bluestore_max_blob_size_ssd",
3784 "bluestore_max_blob_size_hdd",
3785 NULL
3786 };
3787 return KEYS;
3788 }
3789
3790 void BlueStore::handle_conf_change(const struct md_config_t *conf,
3791 const std::set<std::string> &changed)
3792 {
3793 if (changed.count("bluestore_csum_type")) {
3794 _set_csum();
3795 }
3796 if (changed.count("bluestore_compression_mode") ||
3797 changed.count("bluestore_compression_algorithm") ||
3798 changed.count("bluestore_compression_min_blob_size") ||
3799 changed.count("bluestore_compression_max_blob_size")) {
3800 if (bdev) {
3801 _set_compression();
3802 }
3803 }
3804 if (changed.count("bluestore_max_blob_size") ||
3805 changed.count("bluestore_max_blob_size_ssd") ||
3806 changed.count("bluestore_max_blob_size_hdd")) {
3807 if (bdev) {
3808 // only after startup
3809 _set_blob_size();
3810 }
3811 }
3812 if (changed.count("bluestore_prefer_deferred_size") ||
3813 changed.count("bluestore_prefer_deferred_size_hdd") ||
3814 changed.count("bluestore_prefer_deferred_size_ssd") ||
3815 changed.count("bluestore_max_alloc_size") ||
3816 changed.count("bluestore_deferred_batch_ops") ||
3817 changed.count("bluestore_deferred_batch_ops_hdd") ||
3818 changed.count("bluestore_deferred_batch_ops_ssd")) {
3819 if (bdev) {
3820 // only after startup
3821 _set_alloc_sizes();
3822 }
3823 }
3824 if (changed.count("bluestore_throttle_cost_per_io") ||
3825 changed.count("bluestore_throttle_cost_per_io_hdd") ||
3826 changed.count("bluestore_throttle_cost_per_io_ssd")) {
3827 if (bdev) {
3828 _set_throttle_params();
3829 }
3830 }
3831 if (changed.count("bluestore_throttle_bytes")) {
3832 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
3833 throttle_deferred_bytes.reset_max(
3834 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3835 }
3836 if (changed.count("bluestore_throttle_deferred_bytes")) {
3837 throttle_deferred_bytes.reset_max(
3838 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3839 }
3840 }
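// Only keys listed in get_tracked_conf_keys() above are delivered to
// handle_conf_change(); settings whose value depends on the device type
// (hdd vs ssd) are re-applied only once bdev is open, i.e. after startup.
// A hedged example of changing one of these on a running daemon (assuming
// an osd.0 exists):
//
//   ceph tell osd.0 injectargs '--bluestore_csum_type crc32c'
//
// which ends up in _set_csum() below.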
3841
3842 void BlueStore::_set_compression()
3843 {
3844 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
3845 if (m) {
3846 comp_mode = *m;
3847 } else {
3848 derr << __func__ << " unrecognized value '"
3849 << cct->_conf->bluestore_compression_mode
3850 << "' for bluestore_compression_mode, reverting to 'none'"
3851 << dendl;
3852 comp_mode = Compressor::COMP_NONE;
3853 }
3854
3855 compressor = nullptr;
3856
3857 if (comp_mode == Compressor::COMP_NONE) {
3858 dout(10) << __func__ << " compression mode set to 'none', "
3859 << "ignore other compression setttings" << dendl;
3860 return;
3861 }
3862
3863 if (cct->_conf->bluestore_compression_min_blob_size) {
3864 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
3865 } else {
3866 assert(bdev);
3867 if (bdev->is_rotational()) {
3868 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
3869 } else {
3870 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
3871 }
3872 }
3873
3874 if (cct->_conf->bluestore_compression_max_blob_size) {
3875 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
3876 } else {
3877 assert(bdev);
3878 if (bdev->is_rotational()) {
3879 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
3880 } else {
3881 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
3882 }
3883 }
3884
3885 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
3886 if (!alg_name.empty()) {
3887 compressor = Compressor::create(cct, alg_name);
3888 if (!compressor) {
3889 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
3890 << dendl;
3891 }
3892 }
3893
3894 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
3895 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
3896 << dendl;
3897 }
3898
3899 void BlueStore::_set_csum()
3900 {
3901 csum_type = Checksummer::CSUM_NONE;
3902 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
3903 if (t > Checksummer::CSUM_NONE)
3904 csum_type = t;
3905
3906 dout(10) << __func__ << " csum_type "
3907 << Checksummer::get_csum_type_string(csum_type)
3908 << dendl;
3909 }
3910
3911 void BlueStore::_set_throttle_params()
3912 {
3913 if (cct->_conf->bluestore_throttle_cost_per_io) {
3914 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
3915 } else {
3916 assert(bdev);
3917 if (bdev->is_rotational()) {
3918 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
3919 } else {
3920 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
3921 }
3922 }
3923
3924 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
3925 << dendl;
3926 }
3927 void BlueStore::_set_blob_size()
3928 {
3929 if (cct->_conf->bluestore_max_blob_size) {
3930 max_blob_size = cct->_conf->bluestore_max_blob_size;
3931 } else {
3932 assert(bdev);
3933 if (bdev->is_rotational()) {
3934 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
3935 } else {
3936 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
3937 }
3938 }
3939 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
3940 << std::dec << dendl;
3941 }
3942
3943 void BlueStore::_set_finisher_num()
3944 {
3945 if (cct->_conf->bluestore_shard_finishers) {
3946 if (cct->_conf->osd_op_num_shards) {
3947 m_finisher_num = cct->_conf->osd_op_num_shards;
3948 } else {
3949 assert(bdev);
3950 if (bdev->is_rotational()) {
3951 m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
3952 } else {
3953 m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
3954 }
3955 }
3956 }
3957 assert(m_finisher_num != 0);
3958 }
3959
3960 int BlueStore::_set_cache_sizes()
3961 {
3962 assert(bdev);
3963 cache_autotune = cct->_conf->get_val<bool>("bluestore_cache_autotune");
3964 cache_autotune_chunk_size =
3965 cct->_conf->get_val<uint64_t>("bluestore_cache_autotune_chunk_size");
3966 cache_autotune_interval =
3967 cct->_conf->get_val<double>("bluestore_cache_autotune_interval");
3968 osd_memory_target = cct->_conf->get_val<uint64_t>("osd_memory_target");
3969 osd_memory_base = cct->_conf->get_val<uint64_t>("osd_memory_base");
3970 osd_memory_expected_fragmentation =
3971 cct->_conf->get_val<double>("osd_memory_expected_fragmentation");
3972 osd_memory_cache_min = cct->_conf->get_val<uint64_t>("osd_memory_cache_min");
3973 osd_memory_cache_resize_interval =
3974 cct->_conf->get_val<double>("osd_memory_cache_resize_interval");
3975
3976 if (cct->_conf->bluestore_cache_size) {
3977 cache_size = cct->_conf->bluestore_cache_size;
3978 } else {
3979 // choose global cache size based on backend type
3980 if (bdev->is_rotational()) {
3981 cache_size = cct->_conf->bluestore_cache_size_hdd;
3982 } else {
3983 cache_size = cct->_conf->bluestore_cache_size_ssd;
3984 }
3985 }
3986
3987 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
3988 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
3989 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3990 << ") must be in range [0,1.0]" << dendl;
3991 return -EINVAL;
3992 }
3993
3994 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
3995 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
3996 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
3997 << ") must be in range [0,1.0]" << dendl;
3998 return -EINVAL;
3999 }
4000
4001 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
4002 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4003 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4004 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4005 << dendl;
4006 return -EINVAL;
4007 }
4008
4009 cache_data_ratio =
4010 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
4011 if (cache_data_ratio < 0) {
4012 // deal with floating point imprecision
4013 cache_data_ratio = 0;
4014 }
4015
4016 dout(1) << __func__ << " cache_size " << cache_size
4017 << " meta " << cache_meta_ratio
4018 << " kv " << cache_kv_ratio
4019 << " data " << cache_data_ratio
4020 << dendl;
4021 return 0;
4022 }
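// A worked example of the ratio math above, with illustrative (not
// default) numbers: cache_size = 1 GiB, cache_meta_ratio = 0.4,
// cache_kv_ratio = 0.2 gives
//   cache_data_ratio = 1.0 - 0.4 - 0.2 = 0.4
// so ~410 MiB goes to onode metadata, ~205 MiB to the rocksdb block cache
// (see _open_db: db->set_cache_size(cache_kv_ratio * cache_size)), and
// ~410 MiB to cached data buffers.  Each ratio must lie in [0, 1] and the
// two together must not exceed 1.0, otherwise we fail with -EINVAL.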
4023
4024 int BlueStore::write_meta(const std::string& key, const std::string& value)
4025 {
4026 bluestore_bdev_label_t label;
4027 string p = path + "/block";
4028 int r = _read_bdev_label(cct, p, &label);
4029 if (r < 0) {
4030 return ObjectStore::write_meta(key, value);
4031 }
4032 label.meta[key] = value;
4033 r = _write_bdev_label(cct, p, label);
4034 assert(r == 0);
4035 return ObjectStore::write_meta(key, value);
4036 }
4037
4038 int BlueStore::read_meta(const std::string& key, std::string *value)
4039 {
4040 bluestore_bdev_label_t label;
4041 string p = path + "/block";
4042 int r = _read_bdev_label(cct, p, &label);
4043 if (r < 0) {
4044 return ObjectStore::read_meta(key, value);
4045 }
4046 auto i = label.meta.find(key);
4047 if (i == label.meta.end()) {
4048 return ObjectStore::read_meta(key, value);
4049 }
4050 *value = i->second;
4051 return 0;
4052 }
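// write_meta()/read_meta() mirror selected metadata into the block device
// label in addition to the usual files in the osd data directory; on read
// the label wins, and we only fall back to ObjectStore::read_meta() when
// the label is unreadable or lacks the key.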
4053
4054 void BlueStore::_init_logger()
4055 {
4056 PerfCountersBuilder b(cct, "bluestore",
4057 l_bluestore_first, l_bluestore_last);
4058 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
4059 "Average kv_thread flush latency",
4060 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
4061 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
4062 "Average kv_thread commit latency");
4063 b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
4064 "Average kv_thread sync latency",
4065 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
4066 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
4067 "Average prepare state latency");
4068 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
4069 "Average aio_wait state latency",
4070 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
4071 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
4072 "Average io_done state latency");
4073 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
4074 "Average kv_queued state latency");
4075 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
4076 "Average kv_commiting state latency");
4077 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
4078 "Average kv_done state latency");
4079 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
4080 "Average deferred_queued state latency");
4081 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
4082 "Average aio_wait state latency");
4083 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
4084 "Average cleanup state latency");
4085 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
4086 "Average finishing state latency");
4087 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
4088 "Average done state latency");
4089 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
4090 "Average submit throttle latency",
4091 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
4092 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
4093 "Average submit latency",
4094 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
4095 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
4096 "Average commit latency",
4097 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
4098 b.add_time_avg(l_bluestore_read_lat, "read_lat",
4099 "Average read latency",
4100 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
4101 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
4102 "Average read onode metadata latency");
4103 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
4104 "Average read latency");
4105 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
4106 "Average compress latency");
4107 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
4108 "Average decompress latency");
4109 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
4110 "Average checksum latency");
4111 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
4112 "Sum for beneficial compress ops");
4113 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
4114 "Sum for compress ops rejected due to low net gain of space");
4115 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
4116 "Sum for write-op padded bytes", NULL, 0, unit_t(BYTES));
4117 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
4118 "Sum for deferred write op");
4119 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
4120 "Sum for deferred write bytes", "def", 0, unit_t(BYTES));
4121 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
4122 "Sum for write penalty read ops");
4123 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
4124 "Sum for allocated bytes");
4125 b.add_u64(l_bluestore_stored, "bluestore_stored",
4126 "Sum for stored bytes");
4127 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
4128 "Sum for stored compressed bytes");
4129 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
4130 "Sum for bytes allocated for compressed data");
4131 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
4132 "Sum for original bytes that were compressed");
4133
4134 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
4135 "Number of onodes in cache");
4136 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
4137 "Sum for onode-lookups hit in the cache");
4138 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
4139 "Sum for onode-lookups missed in the cache");
4140 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
4141 "Sum for onode-shard lookups hit in the cache");
4142 b.add_u64_counter(l_bluestore_onode_shard_misses,
4143 "bluestore_onode_shard_misses",
4144 "Sum for onode-shard lookups missed in the cache");
4145 b.add_u64(l_bluestore_extents, "bluestore_extents",
4146 "Number of extents in cache");
4147 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
4148 "Number of blobs in cache");
4149 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
4150 "Number of buffers in cache");
4151 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
4152 "Number of buffer bytes in cache", NULL, 0, unit_t(BYTES));
4153 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
4154 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(BYTES));
4155 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
4156 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(BYTES));
4157
4158 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
4159 "Large aligned writes into fresh blobs");
4160 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
4161 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(BYTES));
4162 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
4163 "Large aligned writes into fresh blobs (blobs)");
4164 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
4165 "Small writes into existing or sparse small blobs");
4166 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
4167 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(BYTES));
4168 b.add_u64_counter(l_bluestore_write_small_unused,
4169 "bluestore_write_small_unused",
4170 "Small writes into unused portion of existing blob");
4171 b.add_u64_counter(l_bluestore_write_small_deferred,
4172 "bluestore_write_small_deferred",
4173 "Small overwrites using deferred");
4174 b.add_u64_counter(l_bluestore_write_small_pre_read,
4175 "bluestore_write_small_pre_read",
4176 "Small writes that required we read some data (possibly "
4177 "cached) to fill out the block");
4178 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
4179 "Small write into new (sparse) blob");
4180
4181 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
4182 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
4183 "Onode extent map reshard events");
4184 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
4185 "Sum for blob splitting due to resharding");
4186 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
4187 "Sum for extents that have been removed due to compression");
4188 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
4189 "Sum for extents that have been merged due to garbage "
4190 "collection");
4191 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
4192 "Read EIO errors propagated to high level callers");
4193 b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
4194 "Read operations that required at least one retry due to failed checksum validation");
4195 b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
4196 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
4197 logger = b.create_perf_counters();
4198 cct->get_perfcounters_collection()->add(logger);
4199 }
4200
4201 int BlueStore::_reload_logger()
4202 {
4203 struct store_statfs_t store_statfs;
4204
4205 int r = statfs(&store_statfs);
4206 if(r >= 0) {
4207 logger->set(l_bluestore_allocated, store_statfs.allocated);
4208 logger->set(l_bluestore_stored, store_statfs.stored);
4209 logger->set(l_bluestore_compressed, store_statfs.compressed);
4210 logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
4211 logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
4212 }
4213 return r;
4214 }
4215
4216 void BlueStore::_shutdown_logger()
4217 {
4218 cct->get_perfcounters_collection()->remove(logger);
4219 delete logger;
4220 }
4221
4222 int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
4223 uuid_d *fsid)
4224 {
4225 bluestore_bdev_label_t label;
4226 int r = _read_bdev_label(cct, path, &label);
4227 if (r < 0)
4228 return r;
4229 *fsid = label.osd_uuid;
4230 return 0;
4231 }
4232
4233 int BlueStore::_open_path()
4234 {
4235 // sanity check(s)
4236 if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
4237 4*1024*1024*1024ull) {
4238 derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has a hard limit of 4GB." << dendl;
4239 return -EINVAL;
4240 }
4241 assert(path_fd < 0);
4242 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
4243 if (path_fd < 0) {
4244 int r = -errno;
4245 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
4246 << dendl;
4247 return r;
4248 }
4249 return 0;
4250 }
4251
4252 void BlueStore::_close_path()
4253 {
4254 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
4255 path_fd = -1;
4256 }
4257
4258 int BlueStore::_write_bdev_label(CephContext *cct,
4259 string path, bluestore_bdev_label_t label)
4260 {
4261 dout(10) << __func__ << " path " << path << " label " << label << dendl;
4262 bufferlist bl;
4263 ::encode(label, bl);
4264 uint32_t crc = bl.crc32c(-1);
4265 ::encode(crc, bl);
4266 assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
4267 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
4268 z.zero();
4269 bl.append(std::move(z));
4270
4271 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
4272 if (fd < 0) {
4273 fd = -errno;
4274 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4275 << dendl;
4276 return fd;
4277 }
4278 int r = bl.write_fd(fd);
4279 if (r < 0) {
4280 derr << __func__ << " failed to write to " << path
4281 << ": " << cpp_strerror(r) << dendl;
4282 }
4283 r = ::fsync(fd);
4284 if (r < 0) {
4285 derr << __func__ << " failed to fsync " << path
4286 << ": " << cpp_strerror(r) << dendl;
4287 }
4288 VOID_TEMP_FAILURE_RETRY(::close(fd));
4289 return r;
4290 }
4291
4292 int BlueStore::_read_bdev_label(CephContext* cct, string path,
4293 bluestore_bdev_label_t *label)
4294 {
4295 dout(10) << __func__ << dendl;
4296 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
4297 if (fd < 0) {
4298 fd = -errno;
4299 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4300 << dendl;
4301 return fd;
4302 }
4303 bufferlist bl;
4304 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4305 VOID_TEMP_FAILURE_RETRY(::close(fd));
4306 if (r < 0) {
4307 derr << __func__ << " failed to read from " << path
4308 << ": " << cpp_strerror(r) << dendl;
4309 return r;
4310 }
4311
4312 uint32_t crc, expected_crc;
4313 bufferlist::iterator p = bl.begin();
4314 try {
4315 ::decode(*label, p);
4316 bufferlist t;
4317 t.substr_of(bl, 0, p.get_off());
4318 crc = t.crc32c(-1);
4319 ::decode(expected_crc, p);
4320 }
4321 catch (buffer::error& e) {
4322 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
4323 << ": " << e.what()
4324 << dendl;
4325 return -ENOENT;
4326 }
4327 if (crc != expected_crc) {
4328 derr << __func__ << " bad crc on label, expected " << expected_crc
4329 << " != actual " << crc << dendl;
4330 return -EIO;
4331 }
4332 dout(10) << __func__ << " got " << *label << dendl;
4333 return 0;
4334 }
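// The label written/read above occupies the first BDEV_LABEL_BLOCK_SIZE
// (4 KiB) of the device, laid out as
//
//   [ encoded bluestore_bdev_label_t ][ u32 crc32c of the encoded label ]
//   [ zero padding up to 4096 bytes ]
//
// The crc is seeded with -1 and covers only the encoded label, which is
// why _read_bdev_label() recomputes it over bl[0, p.get_off()) before
// decoding the stored value.  A crc mismatch returns -EIO; a decode
// failure returns -ENOENT so callers like write_meta() can fall back
// gracefully.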
4335
4336 int BlueStore::_check_or_set_bdev_label(
4337 string path, uint64_t size, string desc, bool create)
4338 {
4339 bluestore_bdev_label_t label;
4340 if (create) {
4341 label.osd_uuid = fsid;
4342 label.size = size;
4343 label.btime = ceph_clock_now();
4344 label.description = desc;
4345 int r = _write_bdev_label(cct, path, label);
4346 if (r < 0)
4347 return r;
4348 } else {
4349 int r = _read_bdev_label(cct, path, &label);
4350 if (r < 0)
4351 return r;
4352 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4353 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4354 << " and fsid " << fsid << " check bypassed" << dendl;
4355 }
4356 else if (label.osd_uuid != fsid) {
4357 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4358 << " does not match our fsid " << fsid << dendl;
4359 return -EIO;
4360 }
4361 }
4362 return 0;
4363 }
4364
4365 void BlueStore::_set_alloc_sizes(void)
4366 {
4367 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4368
4369 if (cct->_conf->bluestore_prefer_deferred_size) {
4370 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4371 } else {
4372 assert(bdev);
4373 if (bdev->is_rotational()) {
4374 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4375 } else {
4376 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4377 }
4378 }
4379
4380 if (cct->_conf->bluestore_deferred_batch_ops) {
4381 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4382 } else {
4383 assert(bdev);
4384 if (bdev->is_rotational()) {
4385 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4386 } else {
4387 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4388 }
4389 }
4390
4391 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
4392 << std::dec << " order " << min_alloc_size_order
4393 << " max_alloc_size 0x" << std::hex << max_alloc_size
4394 << " prefer_deferred_size 0x" << prefer_deferred_size
4395 << std::dec
4396 << " deferred_batch_ops " << deferred_batch_ops
4397 << dendl;
4398 }
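// prefer_deferred_size is the threshold below which overwrites are
// journaled through the deferred (PREFIX_DEFERRED) path in the kv store
// and applied to the block device later, in batches of deferred_batch_ops,
// instead of being issued directly with aio.  The hdd/ssd fallbacks above
// mean rotational devices defer much larger writes by default, trading a
// second write for fewer seeks.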
4399
4400 int BlueStore::_open_bdev(bool create)
4401 {
4402 assert(bdev == NULL);
4403 string p = path + "/block";
4404 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
4405 int r = bdev->open(p);
4406 if (r < 0)
4407 goto fail;
4408
4409 if (bdev->supported_bdev_label()) {
4410 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4411 if (r < 0)
4412 goto fail_close;
4413 }
4414
4415 // initialize global block parameters
4416 block_size = bdev->get_block_size();
4417 block_mask = ~(block_size - 1);
4418 block_size_order = ctz(block_size);
4419 assert(block_size == 1u << block_size_order);
4420 // and set cache_size based on device type
4421 r = _set_cache_sizes();
4422 if (r < 0) {
4423 goto fail_close;
4424 }
4425 return 0;
4426
4427 fail_close:
4428 bdev->close();
4429 fail:
4430 delete bdev;
4431 bdev = NULL;
4432 return r;
4433 }
4434
4435 void BlueStore::_close_bdev()
4436 {
4437 assert(bdev);
4438 bdev->close();
4439 delete bdev;
4440 bdev = NULL;
4441 }
4442
4443 int BlueStore::_open_fm(bool create)
4444 {
4445 assert(fm == NULL);
4446 fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
4447
4448 if (create) {
4449 // initialize freespace
4450 dout(20) << __func__ << " initializing freespace" << dendl;
4451 KeyValueDB::Transaction t = db->get_transaction();
4452 {
4453 bufferlist bl;
4454 bl.append(freelist_type);
4455 t->set(PREFIX_SUPER, "freelist_type", bl);
4456 }
4457 // being able to allocate in units less than bdev block size
4458 // seems to be a bad idea.
4459 assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
4460 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
4461
4462 // allocate superblock reserved space. note that we do not mark
4463 // bluefs space as allocated in the freelist; we instead rely on
4464 // bluefs_extents.
4465 uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
4466 min_alloc_size);
4467 fm->allocate(0, reserved, t);
4468
4469 if (cct->_conf->bluestore_bluefs) {
4470 assert(bluefs_extents.num_intervals() == 1);
4471 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
4472 reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
4473 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4474 << " for bluefs" << dendl;
4475 bufferlist bl;
4476 ::encode(bluefs_extents, bl);
4477 t->set(PREFIX_SUPER, "bluefs_extents", bl);
4478 dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
4479 << std::dec << dendl;
4480 }
4481
4482 if (cct->_conf->bluestore_debug_prefill > 0) {
4483 uint64_t end = bdev->get_size() - reserved;
4484 dout(1) << __func__ << " pre-fragmenting freespace, using "
4485 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4486 << cct->_conf->bluestore_debug_prefragment_max << dendl;
4487 uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
4488 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4489 float r = cct->_conf->bluestore_debug_prefill;
4490 r /= 1.0 - r;
4491 bool stop = false;
4492
4493 while (!stop && start < end) {
4494 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4495 if (start + l > end) {
4496 l = end - start;
4497 l = P2ALIGN(l, min_alloc_size);
4498 }
4499 assert(start + l <= end);
4500
4501 uint64_t u = 1 + (uint64_t)(r * (double)l);
4502 u = P2ROUNDUP(u, min_alloc_size);
4503 if (start + l + u > end) {
4504 u = end - (start + l);
4505 // trim to align so we don't overflow again
4506 u = P2ALIGN(u, min_alloc_size);
4507 stop = true;
4508 }
4509 assert(start + l + u <= end);
4510
4511 dout(20) << " free 0x" << std::hex << start << "~" << l
4512 << " use 0x" << u << std::dec << dendl;
4513
4514 if (u == 0) {
4515 // break if u has been trimmed to nothing
4516 break;
4517 }
4518
4519 fm->allocate(start + l, u, t);
4520 start += l + u;
4521 }
4522 }
4523 db->submit_transaction_sync(t);
4524 }
4525
4526 int r = fm->init(bdev->get_size());
4527 if (r < 0) {
4528 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4529 delete fm;
4530 fm = NULL;
4531 return r;
4532 }
4533 return 0;
4534 }
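// The debug prefill loop above alternates random free runs (l) with
// allocated runs (u) so that on average the allocated fraction of the
// device matches bluestore_debug_prefill.  With
//   r = prefill / (1 - prefill)
// a prefill of 0.2 gives r = 0.25, i.e. each free extent of length l is
// followed by an allocated extent of roughly 0.25 * l, which leaves ~80%
// of the space free but fragmented into extents no larger than
// bluestore_debug_prefragment_max.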
4535
4536 void BlueStore::_close_fm()
4537 {
4538 dout(10) << __func__ << dendl;
4539 assert(fm);
4540 fm->shutdown();
4541 delete fm;
4542 fm = NULL;
4543 }
4544
4545 int BlueStore::_open_alloc()
4546 {
4547 assert(alloc == NULL);
4548 assert(bdev->get_size());
4549 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4550 bdev->get_size(),
4551 min_alloc_size);
4552 if (!alloc) {
4553 lderr(cct) << __func__ << " Allocator::unknown alloc type "
4554 << cct->_conf->bluestore_allocator
4555 << dendl;
4556 return -EINVAL;
4557 }
4558
4559 uint64_t num = 0, bytes = 0;
4560
4561 dout(1) << __func__ << " opening allocation metadata" << dendl;
4562 // initialize from freelist
4563 fm->enumerate_reset();
4564 uint64_t offset, length;
4565 while (fm->enumerate_next(&offset, &length)) {
4566 alloc->init_add_free(offset, length);
4567 ++num;
4568 bytes += length;
4569 }
4570 fm->enumerate_reset();
4571 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
4572 << " in " << num << " extents"
4573 << dendl;
4574
4575 // also mark bluefs space as allocated
4576 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4577 alloc->init_rm_free(e.get_start(), e.get_len());
4578 }
4579 dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
4580 << bluefs_extents << std::dec << " as allocated" << dendl;
4581
4582 return 0;
4583 }
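// The allocator is seeded purely from the freelist manager, which (per the
// comment in _open_fm) never marks bluefs space as allocated; that is why
// the loop above must subtract bluefs_extents explicitly before the
// allocator can be used.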
4584
4585 void BlueStore::_close_alloc()
4586 {
4587 assert(alloc);
4588 alloc->shutdown();
4589 delete alloc;
4590 alloc = NULL;
4591 }
4592
4593 int BlueStore::_open_fsid(bool create)
4594 {
4595 assert(fsid_fd < 0);
4596 int flags = O_RDWR|O_CLOEXEC;
4597 if (create)
4598 flags |= O_CREAT;
4599 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4600 if (fsid_fd < 0) {
4601 int err = -errno;
4602 derr << __func__ << " " << cpp_strerror(err) << dendl;
4603 return err;
4604 }
4605 return 0;
4606 }
4607
4608 int BlueStore::_read_fsid(uuid_d *uuid)
4609 {
4610 char fsid_str[40];
4611 memset(fsid_str, 0, sizeof(fsid_str));
4612 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4613 if (ret < 0) {
4614 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4615 return ret;
4616 }
4617 if (ret > 36)
4618 fsid_str[36] = 0;
4619 else
4620 fsid_str[ret] = 0;
4621 if (!uuid->parse(fsid_str)) {
4622 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4623 return -EINVAL;
4624 }
4625 return 0;
4626 }
4627
4628 int BlueStore::_write_fsid()
4629 {
4630 int r = ::ftruncate(fsid_fd, 0);
4631 if (r < 0) {
4632 r = -errno;
4633 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4634 return r;
4635 }
4636 string str = stringify(fsid) + "\n";
4637 r = safe_write(fsid_fd, str.c_str(), str.length());
4638 if (r < 0) {
4639 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4640 return r;
4641 }
4642 r = ::fsync(fsid_fd);
4643 if (r < 0) {
4644 r = -errno;
4645 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4646 return r;
4647 }
4648 return 0;
4649 }
4650
4651 void BlueStore::_close_fsid()
4652 {
4653 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4654 fsid_fd = -1;
4655 }
4656
4657 int BlueStore::_lock_fsid()
4658 {
4659 struct flock l;
4660 memset(&l, 0, sizeof(l));
4661 l.l_type = F_WRLCK;
4662 l.l_whence = SEEK_SET;
4663 int r = ::fcntl(fsid_fd, F_SETLK, &l);
4664 if (r < 0) {
4665 int err = errno;
4666 derr << __func__ << " failed to lock " << path << "/fsid"
4667 << " (is another ceph-osd still running?)"
4668 << cpp_strerror(err) << dendl;
4669 return -err;
4670 }
4671 return 0;
4672 }
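// This is an advisory whole-file write lock (F_WRLCK with l_len == 0 from
// the memset), taken non-blocking with F_SETLK, so it fails immediately if
// another ceph-osd already holds it and is released automatically when
// fsid_fd is closed or the process exits.  test_mount_in_use() below
// relies on exactly that failure to detect a live mount.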
4673
4674 bool BlueStore::is_rotational()
4675 {
4676 if (bdev) {
4677 return bdev->is_rotational();
4678 }
4679
4680 bool rotational = true;
4681 int r = _open_path();
4682 if (r < 0)
4683 goto out;
4684 r = _open_fsid(false);
4685 if (r < 0)
4686 goto out_path;
4687 r = _read_fsid(&fsid);
4688 if (r < 0)
4689 goto out_fsid;
4690 r = _lock_fsid();
4691 if (r < 0)
4692 goto out_fsid;
4693 r = _open_bdev(false);
4694 if (r < 0)
4695 goto out_fsid;
4696 rotational = bdev->is_rotational();
4697 _close_bdev();
4698 out_fsid:
4699 _close_fsid();
4700 out_path:
4701 _close_path();
4702 out:
4703 return rotational;
4704 }
4705
4706 bool BlueStore::is_journal_rotational()
4707 {
4708 if (!bluefs) {
4709 dout(5) << __func__ << " bluefs disabled, default to store media type"
4710 << dendl;
4711 return is_rotational();
4712 }
4713 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
4714 return bluefs->wal_is_rotational();
4715 }
4716
4717 bool BlueStore::test_mount_in_use()
4718 {
4719 // most error conditions mean the mount is not in use (e.g., because
4720 // it doesn't exist). only if we fail to lock do we conclude it is
4721 // in use.
4722 bool ret = false;
4723 int r = _open_path();
4724 if (r < 0)
4725 return false;
4726 r = _open_fsid(false);
4727 if (r < 0)
4728 goto out_path;
4729 r = _lock_fsid();
4730 if (r < 0)
4731 ret = true; // if we can't lock, it is in use
4732 _close_fsid();
4733 out_path:
4734 _close_path();
4735 return ret;
4736 }
4737
4738 int BlueStore::_open_db(bool create)
4739 {
4740 int r;
4741 assert(!db);
4742 string fn = path + "/db";
4743 string options;
4744 stringstream err;
4745 ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
4746
4747 string kv_backend;
4748 if (create) {
4749 kv_backend = cct->_conf->bluestore_kvbackend;
4750 } else {
4751 r = read_meta("kv_backend", &kv_backend);
4752 if (r < 0) {
4753 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
4754 return -EIO;
4755 }
4756 }
4757 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
4758
4759 bool do_bluefs;
4760 if (create) {
4761 do_bluefs = cct->_conf->bluestore_bluefs;
4762 } else {
4763 string s;
4764 r = read_meta("bluefs", &s);
4765 if (r < 0) {
4766 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
4767 return -EIO;
4768 }
4769 if (s == "1") {
4770 do_bluefs = true;
4771 } else if (s == "0") {
4772 do_bluefs = false;
4773 } else {
4774 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
4775 << dendl;
4776 return -EIO;
4777 }
4778 }
4779 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
4780
4781 rocksdb::Env *env = NULL;
4782 if (do_bluefs) {
4783 dout(10) << __func__ << " initializing bluefs" << dendl;
4784 if (kv_backend != "rocksdb") {
4785 derr << " backend must be rocksdb to use bluefs" << dendl;
4786 return -EINVAL;
4787 }
4788 bluefs = new BlueFS(cct);
4789
4790 string bfn;
4791 struct stat st;
4792
4793 bfn = path + "/block.db";
4794 if (::stat(bfn.c_str(), &st) == 0) {
4795 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
4796 if (r < 0) {
4797 derr << __func__ << " add block device(" << bfn << ") returned: "
4798 << cpp_strerror(r) << dendl;
4799 goto free_bluefs;
4800 }
4801
4802 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
4803 r = _check_or_set_bdev_label(
4804 bfn,
4805 bluefs->get_block_device_size(BlueFS::BDEV_DB),
4806 "bluefs db", create);
4807 if (r < 0) {
4808 derr << __func__
4809 << " check block device(" << bfn << ") label returned: "
4810 << cpp_strerror(r) << dendl;
4811 goto free_bluefs;
4812 }
4813 }
4814 if (create) {
4815 bluefs->add_block_extent(
4816 BlueFS::BDEV_DB,
4817 SUPER_RESERVED,
4818 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
4819 }
4820 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
4821 bluefs_single_shared_device = false;
4822 } else {
4823 r = -errno;
4824 if (::lstat(bfn.c_str(), &st) == -1) {
4825 r = 0;
4826 bluefs_shared_bdev = BlueFS::BDEV_DB;
4827 } else {
4828 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
4829 << cpp_strerror(r) << dendl;
4830 goto free_bluefs;
4831 }
4832 }
4833
4834 // shared device
4835 bfn = path + "/block";
4836 r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
4837 if (r < 0) {
4838 derr << __func__ << " add block device(" << bfn << ") returned: "
4839 << cpp_strerror(r) << dendl;
4840 goto free_bluefs;
4841 }
4842 if (create) {
4843 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4844 uint64_t initial =
4845 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
4846 cct->_conf->bluestore_bluefs_gift_ratio);
4847 initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
4848 if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
4849 derr << __func__ << " bluefs_alloc_size 0x" << std::hex
4850 << cct->_conf->bluefs_alloc_size << " is not a multiple of "
4851 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4852 r = -EINVAL;
4853 goto free_bluefs;
4854 }
4855 // align to bluefs's alloc_size
4856 initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
4857 // put bluefs in the middle of the device in case it is an HDD
4858 uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
4859 cct->_conf->bluefs_alloc_size);
4860 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
4861 bluefs_extents.insert(start, initial);
4862 }
4863
4864 bfn = path + "/block.wal";
4865 if (::stat(bfn.c_str(), &st) == 0) {
4866 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
4867 if (r < 0) {
4868 derr << __func__ << " add block device(" << bfn << ") returned: "
4869 << cpp_strerror(r) << dendl;
4870 goto free_bluefs;
4871 }
4872
4873 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
4874 r = _check_or_set_bdev_label(
4875 bfn,
4876 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
4877 "bluefs wal", create);
4878 if (r < 0) {
4879 derr << __func__ << " check block device(" << bfn
4880 << ") label returned: " << cpp_strerror(r) << dendl;
4881 goto free_bluefs;
4882 }
4883 }
4884
4885 if (create) {
4886 bluefs->add_block_extent(
4887 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
4888 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
4889 BDEV_LABEL_BLOCK_SIZE);
4890 }
4891 cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
4892 bluefs_single_shared_device = false;
4893 } else {
4894 r = -errno;
4895 if (::lstat(bfn.c_str(), &st) == -1) {
4896 r = 0;
4897 cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
4898 } else {
4899 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
4900 << cpp_strerror(r) << dendl;
4901 goto free_bluefs;
4902 }
4903 }
4904
4905 if (create) {
4906 bluefs->mkfs(fsid);
4907 }
4908 r = bluefs->mount();
4909 if (r < 0) {
4910 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
4911 goto free_bluefs;
4912 }
4913 if (cct->_conf->bluestore_bluefs_env_mirror) {
4914 rocksdb::Env *a = new BlueRocksEnv(bluefs);
4915 rocksdb::Env *b = rocksdb::Env::Default();
4916 if (create) {
4917 string cmd = "rm -rf " + path + "/db " +
4918 path + "/db.slow " +
4919 path + "/db.wal";
4920 int r = system(cmd.c_str());
4921 (void)r;
4922 }
4923 env = new rocksdb::EnvMirror(b, a, false, true);
4924 } else {
4925 env = new BlueRocksEnv(bluefs);
4926
4927 // simplify the dir names, too, as "seen" by rocksdb
4928 fn = "db";
4929 }
4930
4931 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
4932 // we have both block.db and block; tell rocksdb!
4933 // note: the second (last) size value doesn't really matter
4934 ostringstream db_paths;
4935 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
4936 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
4937 db_paths << fn << ","
4938 << (uint64_t)(db_size * 95 / 100) << " "
4939 << fn + ".slow" << ","
4940 << (uint64_t)(slow_size * 95 / 100);
4941 cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
4942 dout(10) << __func__ << " set rocksdb_db_paths to "
4943 << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
4944 }
4945
4946 if (create) {
4947 env->CreateDir(fn);
4948 if (cct->_conf->rocksdb_separate_wal_dir)
4949 env->CreateDir(fn + ".wal");
4950 if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
4951 env->CreateDir(fn + ".slow");
4952 }
4953 } else if (create) {
4954 int r = ::mkdir(fn.c_str(), 0755);
4955 if (r < 0)
4956 r = -errno;
4957 if (r < 0 && r != -EEXIST) {
4958 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
4959 << dendl;
4960 return r;
4961 }
4962
4963 // wal_dir, too!
4964 if (cct->_conf->rocksdb_separate_wal_dir) {
4965 string walfn = path + "/db.wal";
4966 r = ::mkdir(walfn.c_str(), 0755);
4967 if (r < 0)
4968 r = -errno;
4969 if (r < 0 && r != -EEXIST) {
4970 derr << __func__ << " failed to create " << walfn
4971 << ": " << cpp_strerror(r)
4972 << dendl;
4973 return r;
4974 }
4975 }
4976 }
4977
4978
4979 db = KeyValueDB::create(cct,
4980 kv_backend,
4981 fn,
4982 static_cast<void*>(env));
4983 if (!db) {
4984 derr << __func__ << " error creating db" << dendl;
4985 if (bluefs) {
4986 bluefs->umount();
4987 delete bluefs;
4988 bluefs = NULL;
4989 }
4990 // delete env manually here since we can't depend on db to do this
4991 // under this case
4992 delete env;
4993 env = NULL;
4994 return -EIO;
4995 }
4996
4997 FreelistManager::setup_merge_operators(db);
4998 db->set_merge_operator(PREFIX_STAT, merge_op);
4999 db->set_cache_size(cache_kv_ratio * cache_size);
5000
5001 if (kv_backend == "rocksdb")
5002 options = cct->_conf->bluestore_rocksdb_options;
5003 db->init(options);
5004 if (create)
5005 r = db->create_and_open(err);
5006 else
5007 r = db->open(err);
5008 if (r) {
5009 derr << __func__ << " erroring opening db: " << err.str() << dendl;
5010 if (bluefs) {
5011 bluefs->umount();
5012 delete bluefs;
5013 bluefs = NULL;
5014 }
5015 delete db;
5016 db = NULL;
5017 return -EIO;
5018 }
5019 dout(1) << __func__ << " opened " << kv_backend
5020 << " path " << fn << " options " << options << dendl;
5021 return 0;
5022
5023 free_bluefs:
5024 assert(bluefs);
5025 delete bluefs;
5026 bluefs = NULL;
5027 return r;
5028 }
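// A worked example of the bluefs sizing in _open_db, with illustrative
// values (these are config options, not necessarily the shipped
// defaults): on a 1 TiB shared device with bluestore_bluefs_min_ratio =
// 0.02 and bluestore_bluefs_gift_ratio = 0.02,
//   initial = 1 TiB * (0.02 + 0.02) = ~41 GiB,
// raised to bluestore_bluefs_min if that is larger, rounded up to a
// multiple of bluefs_alloc_size, and carved out of the middle of the
// device so that, on an HDD, seeks between bluefs and object data stay
// short.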
5029
5030 void BlueStore::_close_db()
5031 {
5032 assert(db);
5033 delete db;
5034 db = NULL;
5035 if (bluefs) {
5036 bluefs->umount();
5037 delete bluefs;
5038 bluefs = NULL;
5039 }
5040 }
5041
5042 int BlueStore::_reconcile_bluefs_freespace()
5043 {
5044 dout(10) << __func__ << dendl;
5045 interval_set<uint64_t> bset;
5046 int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
5047 assert(r == 0);
5048 if (bset == bluefs_extents) {
5049 dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
5050 << std::dec << dendl;
5051 return 0;
5052 }
5053 dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
5054 << dendl;
5055 dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
5056 << std::dec << dendl;
5057
5058 interval_set<uint64_t> overlap;
5059 overlap.intersection_of(bset, bluefs_extents);
5060
5061 bset.subtract(overlap);
5062 if (!bset.empty()) {
5063 derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
5064 << dendl;
5065 return -EIO;
5066 }
5067
5068 interval_set<uint64_t> super_extra;
5069 super_extra = bluefs_extents;
5070 super_extra.subtract(overlap);
5071 if (!super_extra.empty()) {
5072 // This is normal: it can happen if we commit to give extents to
5073 // bluefs and we crash before bluefs commits that it owns them.
5074 dout(10) << __func__ << " super extra " << super_extra << dendl;
5075 for (interval_set<uint64_t>::iterator p = super_extra.begin();
5076 p != super_extra.end();
5077 ++p) {
5078 bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
5079 }
5080 }
5081
5082 return 0;
5083 }
5084
5085 void BlueStore::_dump_alloc_on_rebalance_failure()
5086 {
5087 auto dump_interval =
5088 cct->_conf->bluestore_bluefs_balance_failure_dump_interval;
5089 if (dump_interval > 0 &&
5090 next_dump_on_bluefs_balance_failure <= ceph_clock_now()) {
5091 alloc->dump();
5092 next_dump_on_bluefs_balance_failure = ceph_clock_now();
5093 next_dump_on_bluefs_balance_failure += dump_interval;
5094 }
5095 }
5096
5097 int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
5098 {
5099 int ret = 0;
5100 assert(bluefs);
5101
5102 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
5103 bluefs->get_usage(&bluefs_usage);
5104 assert(bluefs_usage.size() > bluefs_shared_bdev);
5105
5106 // fixme: look at primary bdev only for now
5107 uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
5108 uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
5109 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
5110
5111 uint64_t my_free = alloc->get_free();
5112 uint64_t total = bdev->get_size();
5113 float my_free_ratio = (float)my_free / (float)total;
5114
5115 uint64_t total_free = bluefs_free + my_free;
5116
5117 float bluefs_ratio = (float)bluefs_free / (float)total_free;
5118
5119 dout(10) << __func__
5120 << " bluefs " << byte_u_t(bluefs_free)
5121 << " free (" << bluefs_free_ratio
5122 << ") bluestore " << byte_u_t(my_free)
5123 << " free (" << my_free_ratio
5124 << "), bluefs_ratio " << bluefs_ratio
5125 << dendl;
5126
5127 uint64_t gift = 0;
5128 uint64_t reclaim = 0;
5129 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
5130 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
5131 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5132 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
5133 << ", should gift " << byte_u_t(gift) << dendl;
5134 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
5135 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
5136 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
5137 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
5138 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5139 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
5140 << ", should reclaim " << byte_u_t(reclaim) << dendl;
5141 }
5142
5143 // don't take over too much of the freespace
5144 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
5145 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
5146 cct->_conf->bluestore_bluefs_min < free_cap) {
5147 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
5148 dout(10) << __func__ << " bluefs_total " << bluefs_total
5149 << " < min " << cct->_conf->bluestore_bluefs_min
5150 << ", should gift " << byte_u_t(g) << dendl;
5151 if (g > gift)
5152 gift = g;
5153 reclaim = 0;
5154 }
5155 uint64_t min_free = cct->_conf->get_val<uint64_t>("bluestore_bluefs_min_free");
5156 if (bluefs_free < min_free &&
5157 min_free < free_cap) {
5158 uint64_t g = min_free - bluefs_free;
5159 dout(10) << __func__ << " bluefs_free " << bluefs_free
5160 << " < min " << min_free
5161 << ", should gift " << byte_u_t(g) << dendl;
5162 if (g > gift)
5163 gift = g;
5164 reclaim = 0;
5165 }
5166
5167 if (gift) {
5168 // round up to alloc size
5169 gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
5170
5171 // hard cap to fit into 32 bits
5172 gift = MIN(gift, 1ull<<31);
5173 dout(10) << __func__ << " gifting " << gift
5174 << " (" << byte_u_t(gift) << ")" << dendl;
5175
5176 int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
5177 0, 0, extents);
5178
5179 if (alloc_len <= 0) {
5180 dout(0) << __func__ << " no allocate on 0x" << std::hex << gift
5181 << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
5182 _dump_alloc_on_rebalance_failure();
5183 return 0;
5184 } else if (alloc_len < (int64_t)gift) {
5185 dout(0) << __func__ << " insufficient allocate on 0x" << std::hex << gift
5186 << " min_alloc_size 0x" << min_alloc_size
5187 << " allocated 0x" << alloc_len
5188 << std::dec << dendl;
5189 _dump_alloc_on_rebalance_failure();
5190 }
5191 for (auto& e : *extents) {
5192 dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
5193 }
5194 gift = 0;
5195
5196 ret = 1;
5197 }
5198
5199 // reclaim from bluefs?
5200 if (reclaim) {
5201 // round up to alloc size
5202 reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);
5203
5204 // hard cap to fit into 32 bits
5205 reclaim = MIN(reclaim, 1ull<<31);
5206 dout(10) << __func__ << " reclaiming " << reclaim
5207 << " (" << byte_u_t(reclaim) << ")" << dendl;
5208
5209 while (reclaim > 0) {
5210 // NOTE: this will block and do IO.
5211 PExtentVector extents;
5212 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
5213 &extents);
5214 if (r < 0) {
5215 derr << __func__ << " failed to reclaim space from bluefs"
5216 << dendl;
5217 break;
5218 }
5219 for (auto e : extents) {
5220 bluefs_extents.erase(e.offset, e.length);
5221 bluefs_extents_reclaiming.insert(e.offset, e.length);
5222 reclaim -= e.length;
5223 }
5224 }
5225
5226 ret = 1;
5227 }
5228
5229 return ret;
5230 }
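// A worked example of the balance logic, with illustrative numbers: if
// bluefs has 1 GiB free and the main allocator 99 GiB free, then
// bluefs_ratio = 1 / 100 = 0.01.  If that falls below
// bluestore_bluefs_min_ratio we gift bluestore_bluefs_gift_ratio *
// total_free (rounded up to bluefs_alloc_size and hard-capped at 2 GiB);
// if it exceeds bluestore_bluefs_max_ratio we reclaim instead.  Gifted
// extents are only allocated here (ret = 1); the caller later hands them
// to bluefs via _commit_bluefs_freespace() below once the allocation is
// persisted.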
5231
5232 void BlueStore::_commit_bluefs_freespace(
5233 const PExtentVector& bluefs_gift_extents)
5234 {
5235 dout(10) << __func__ << dendl;
5236 for (auto& p : bluefs_gift_extents) {
5237 bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
5238 }
5239 }
5240
5241 int BlueStore::_open_collections(int *errors)
5242 {
5243 dout(10) << __func__ << dendl;
5244 assert(coll_map.empty());
5245 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
5246 for (it->upper_bound(string());
5247 it->valid();
5248 it->next()) {
5249 coll_t cid;
5250 if (cid.parse(it->key())) {
5251 CollectionRef c(
5252 new Collection(
5253 this,
5254 cache_shards[cid.hash_to_shard(cache_shards.size())],
5255 cid));
5256 bufferlist bl = it->value();
5257 bufferlist::iterator p = bl.begin();
5258 try {
5259 ::decode(c->cnode, p);
5260 } catch (buffer::error& e) {
5261 derr << __func__ << " failed to decode cnode, key:"
5262 << pretty_binary_string(it->key()) << dendl;
5263 return -EIO;
5264 }
5265 dout(20) << __func__ << " opened " << cid << " " << c
5266 << " " << c->cnode << dendl;
5267 coll_map[cid] = c;
5268 } else {
5269 derr << __func__ << " unrecognized collection " << it->key() << dendl;
5270 if (errors)
5271 (*errors)++;
5272 }
5273 }
5274 return 0;
5275 }
5276
5277 void BlueStore::_open_statfs()
5278 {
5279 bufferlist bl;
5280 int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
5281 if (r >= 0) {
5282 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
5283 auto it = bl.begin();
5284 vstatfs.decode(it);
5285 } else {
5286 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
5287 }
5288 }
5289 else {
5290 dout(10) << __func__ << " store_statfs missing, using empty" << dendl;
5291 }
5292 }
5293
5294 int BlueStore::_setup_block_symlink_or_file(
5295 string name,
5296 string epath,
5297 uint64_t size,
5298 bool create)
5299 {
5300 dout(20) << __func__ << " name " << name << " path " << epath
5301 << " size " << size << " create=" << (int)create << dendl;
5302 int r = 0;
5303 int flags = O_RDWR|O_CLOEXEC;
5304 if (create)
5305 flags |= O_CREAT;
5306 if (epath.length()) {
5307 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
5308 if (r < 0) {
5309 r = -errno;
5310 derr << __func__ << " failed to create " << name << " symlink to "
5311 << epath << ": " << cpp_strerror(r) << dendl;
5312 return r;
5313 }
5314
5315 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
5316 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
5317 if (fd < 0) {
5318 r = -errno;
5319 derr << __func__ << " failed to open " << epath << " file: "
5320 << cpp_strerror(r) << dendl;
5321 return r;
5322 }
5323 string serial_number = epath.substr(strlen(SPDK_PREFIX));
5324 r = ::write(fd, serial_number.c_str(), serial_number.size());
5325 assert(r == (int)serial_number.size());
5326 dout(1) << __func__ << " created " << name << " symlink to "
5327 << epath << dendl;
5328 VOID_TEMP_FAILURE_RETRY(::close(fd));
5329 }
5330 }
5331 if (size) {
5332 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
5333 if (fd >= 0) {
5334 // block file is present
5335 struct stat st;
5336 int r = ::fstat(fd, &st);
5337 if (r == 0 &&
5338 S_ISREG(st.st_mode) && // if it is a regular file
5339 st.st_size == 0) { // and is 0 bytes
5340 r = ::ftruncate(fd, size);
5341 if (r < 0) {
5342 r = -errno;
5343 derr << __func__ << " failed to resize " << name << " file to "
5344 << size << ": " << cpp_strerror(r) << dendl;
5345 VOID_TEMP_FAILURE_RETRY(::close(fd));
5346 return r;
5347 }
5348
5349 if (cct->_conf->bluestore_block_preallocate_file) {
5350 r = ::ceph_posix_fallocate(fd, 0, size);
5351 if (r > 0) {
5352 derr << __func__ << " failed to preallocate " << name << " file to "
5353 << size << ": " << cpp_strerror(r) << dendl;
5354 VOID_TEMP_FAILURE_RETRY(::close(fd));
5355 return -r;
5356 }
5357 }
5358 dout(1) << __func__ << " resized " << name << " file to "
5359 << byte_u_t(size) << dendl;
5360 }
5361 VOID_TEMP_FAILURE_RETRY(::close(fd));
5362 } else {
5363 int r = -errno;
5364 if (r != -ENOENT) {
5365 derr << __func__ << " failed to open " << name << " file: "
5366 << cpp_strerror(r) << dendl;
5367 return r;
5368 }
5369 }
5370 }
5371 return 0;
5372 }
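// At mkfs time this is what turns bluestore_block_path / _db_path /
// _wal_path into the familiar on-disk layout: "block", "block.db" and
// "block.wal" become symlinks to raw devices (or SPDK identifiers) when a
// path is configured, and plain files truncated to the requested size when
// only a size is given; the file-backed mode is mainly useful for testing.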
5373
5374 int BlueStore::mkfs()
5375 {
5376 dout(1) << __func__ << " path " << path << dendl;
5377 int r;
5378 uuid_d old_fsid;
5379
5380 {
5381 string done;
5382 r = read_meta("mkfs_done", &done);
5383 if (r == 0) {
5384 dout(1) << __func__ << " already created" << dendl;
5385 if (cct->_conf->bluestore_fsck_on_mkfs) {
5386 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5387 if (r < 0) {
5388 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
5389 << dendl;
5390 return r;
5391 }
5392 if (r > 0) {
5393 derr << __func__ << " fsck found " << r << " errors" << dendl;
5394 r = -EIO;
5395 }
5396 }
5397 return r; // idempotent
5398 }
5399 }
5400
5401 {
5402 string type;
5403 r = read_meta("type", &type);
5404 if (r == 0) {
5405 if (type != "bluestore") {
5406 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5407 return -EIO;
5408 }
5409 } else {
5410 r = write_meta("type", "bluestore");
5411 if (r < 0)
5412 return r;
5413 }
5414 }
5415
5416 freelist_type = "bitmap";
5417
5418 r = _open_path();
5419 if (r < 0)
5420 return r;
5421
5422 r = _open_fsid(true);
5423 if (r < 0)
5424 goto out_path_fd;
5425
5426 r = _lock_fsid();
5427 if (r < 0)
5428 goto out_close_fsid;
5429
5430 r = _read_fsid(&old_fsid);
5431 if (r < 0 || old_fsid.is_zero()) {
5432 if (fsid.is_zero()) {
5433 fsid.generate_random();
5434 dout(1) << __func__ << " generated fsid " << fsid << dendl;
5435 } else {
5436 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
5437 }
5438 // we'll write it later.
5439 } else {
5440 if (!fsid.is_zero() && fsid != old_fsid) {
5441 derr << __func__ << " on-disk fsid " << old_fsid
5442 << " != provided " << fsid << dendl;
5443 r = -EINVAL;
5444 goto out_close_fsid;
5445 }
5446 fsid = old_fsid;
5447 }
5448
5449 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
5450 cct->_conf->bluestore_block_size,
5451 cct->_conf->bluestore_block_create);
5452 if (r < 0)
5453 goto out_close_fsid;
5454 if (cct->_conf->bluestore_bluefs) {
5455 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
5456 cct->_conf->bluestore_block_wal_size,
5457 cct->_conf->bluestore_block_wal_create);
5458 if (r < 0)
5459 goto out_close_fsid;
5460 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
5461 cct->_conf->bluestore_block_db_size,
5462 cct->_conf->bluestore_block_db_create);
5463 if (r < 0)
5464 goto out_close_fsid;
5465 }
5466
5467 r = _open_bdev(true);
5468 if (r < 0)
5469 goto out_close_fsid;
5470
5471 // choose min_alloc_size
5472 if (cct->_conf->bluestore_min_alloc_size) {
5473 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
5474 } else {
5475 assert(bdev);
5476 if (bdev->is_rotational()) {
5477 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
5478 } else {
5479 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
5480 }
5481 }
5482
5483 // make sure min_alloc_size is power of 2 aligned.
5484 if (!ISP2(min_alloc_size)) {
5485 derr << __func__ << " min_alloc_size 0x"
5486 << std::hex << min_alloc_size << std::dec
5487 << " is not power of 2 aligned!"
5488 << dendl;
5489 r = -EINVAL;
5490 goto out_close_bdev;
5491 }
5492
5493 r = _open_db(true);
5494 if (r < 0)
5495 goto out_close_bdev;
5496
5497 r = _open_fm(true);
5498 if (r < 0)
5499 goto out_close_db;
5500
5501 {
5502 KeyValueDB::Transaction t = db->get_transaction();
5503 {
5504 bufferlist bl;
5505 ::encode((uint64_t)0, bl);
5506 t->set(PREFIX_SUPER, "nid_max", bl);
5507 t->set(PREFIX_SUPER, "blobid_max", bl);
5508 }
5509
5510 {
5511 bufferlist bl;
5512 ::encode((uint64_t)min_alloc_size, bl);
5513 t->set(PREFIX_SUPER, "min_alloc_size", bl);
5514 }
5515
5516 ondisk_format = latest_ondisk_format;
5517 _prepare_ondisk_format_super(t);
5518 db->submit_transaction_sync(t);
5519 }
5520
5521
5522 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
5523 if (r < 0)
5524 goto out_close_fm;
5525
5526 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
5527 if (r < 0)
5528 goto out_close_fm;
5529
5530 if (fsid != old_fsid) {
5531 r = _write_fsid();
5532 if (r < 0) {
5533 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
5534 goto out_close_fm;
5535 }
5536 }
5537
5538 out_close_fm:
5539 _close_fm();
5540 out_close_db:
5541 _close_db();
5542 out_close_bdev:
5543 _close_bdev();
5544 out_close_fsid:
5545 _close_fsid();
5546 out_path_fd:
5547 _close_path();
5548
5549 if (r == 0 &&
5550 cct->_conf->bluestore_fsck_on_mkfs) {
5551 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5552 if (rc < 0)
5553 return rc;
5554 if (rc > 0) {
5555 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5556 r = -EIO;
5557 }
5558 }
5559
5560 if (r == 0) {
5561 // indicate success by writing the 'mkfs_done' file
5562 r = write_meta("mkfs_done", "yes");
5563 }
5564
5565 if (r < 0) {
5566 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
5567 } else {
5568 dout(0) << __func__ << " success" << dendl;
5569 }
5570 return r;
5571 }
5572
5573 void BlueStore::set_cache_shards(unsigned num)
5574 {
5575 dout(10) << __func__ << " " << num << dendl;
5576 size_t old = cache_shards.size();
5577 assert(num >= old);
5578 cache_shards.resize(num);
5579 for (unsigned i = old; i < num; ++i) {
5580 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
5581 logger);
5582 }
5583 }
5584
5585 int BlueStore::_mount(bool kv_only)
5586 {
5587 dout(1) << __func__ << " path " << path << dendl;
5588
5589 _kv_only = kv_only;
5590
5591 {
5592 string type;
5593 int r = read_meta("type", &type);
5594 if (r < 0) {
5595 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5596 << dendl;
5597 return r;
5598 }
5599
5600 if (type != "bluestore") {
5601 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5602 return -EIO;
5603 }
5604 }
5605
5606 if (cct->_conf->bluestore_fsck_on_mount) {
5607 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
5608 if (rc < 0)
5609 return rc;
5610 if (rc > 0) {
5611 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5612 return -EIO;
5613 }
5614 }
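// Bring the store up in layers: path -> fsid -> block device -> kv db ->
// super meta -> freelist -> allocator -> collections. Each failure unwinds
// only the layers opened so far via the goto labels below, in reverse order.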
5615
5616 int r = _open_path();
5617 if (r < 0)
5618 return r;
5619 r = _open_fsid(false);
5620 if (r < 0)
5621 goto out_path;
5622
5623 r = _read_fsid(&fsid);
5624 if (r < 0)
5625 goto out_fsid;
5626
5627 r = _lock_fsid();
5628 if (r < 0)
5629 goto out_fsid;
5630
5631 r = _open_bdev(false);
5632 if (r < 0)
5633 goto out_fsid;
5634
5635 r = _open_db(false);
5636 if (r < 0)
5637 goto out_bdev;
5638
5639 if (kv_only)
5640 return 0;
5641
5642 r = _open_super_meta();
5643 if (r < 0)
5644 goto out_db;
5645
5646 r = _open_fm(false);
5647 if (r < 0)
5648 goto out_db;
5649
5650 r = _open_alloc();
5651 if (r < 0)
5652 goto out_fm;
5653
5654 r = _open_collections();
5655 if (r < 0)
5656 goto out_alloc;
5657
5658 r = _reload_logger();
5659 if (r < 0)
5660 goto out_coll;
5661
5662 if (bluefs) {
5663 r = _reconcile_bluefs_freespace();
5664 if (r < 0)
5665 goto out_coll;
5666 }
5667
5668 _kv_start();
5669
5670 r = _deferred_replay();
5671 if (r < 0)
5672 goto out_stop;
5673
5674 mempool_thread.init();
5675
5676 mounted = true;
5677 return 0;
5678
5679 out_stop:
5680 _kv_stop();
5681 out_coll:
5682 _flush_cache();
5683 out_alloc:
5684 _close_alloc();
5685 out_fm:
5686 _close_fm();
5687 out_db:
5688 _close_db();
5689 out_bdev:
5690 _close_bdev();
5691 out_fsid:
5692 _close_fsid();
5693 out_path:
5694 _close_path();
5695 return r;
5696 }
5697
5698 int BlueStore::umount()
5699 {
5700 assert(_kv_only || mounted);
5701 dout(1) << __func__ << dendl;
5702
5703 _osr_drain_all();
5704 _osr_unregister_all();
5705
5706 mounted = false;
5707 if (!_kv_only) {
5708 mempool_thread.shutdown();
5709 dout(20) << __func__ << " stopping kv thread" << dendl;
5710 _kv_stop();
5711 _flush_cache();
5712 dout(20) << __func__ << " closing" << dendl;
5713
5714 _close_alloc();
5715 _close_fm();
5716 }
5717 _close_db();
5718 _close_bdev();
5719 _close_fsid();
5720 _close_path();
5721
5722 if (cct->_conf->bluestore_fsck_on_umount) {
5723 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
5724 if (rc < 0)
5725 return rc;
5726 if (rc > 0) {
5727 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5728 return -EIO;
5729 }
5730 }
5731 return 0;
5732 }
5733
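// Helper used by fsck: walk [off, off+len) in granularity-sized steps and
// invoke f once per bitmap position (off / granularity). For example, with
// purely illustrative values off=0x3000, len=0x2000, granularity=0x1000,
// f is called for positions 3 and 4.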
5734 static void apply(uint64_t off,
5735 uint64_t len,
5736 uint64_t granularity,
5737 BlueStore::mempool_dynamic_bitset &bitset,
5738 std::function<void(uint64_t,
5739 BlueStore::mempool_dynamic_bitset &)> f) {
5740 auto end = ROUND_UP_TO(off + len, granularity);
5741 while (off < end) {
5742 uint64_t pos = off / granularity;
5743 f(pos, bitset);
5744 off += granularity;
5745 }
5746 }
5747
5748 int BlueStore::_fsck_check_extents(
5749 const ghobject_t& oid,
5750 const PExtentVector& extents,
5751 bool compressed,
5752 mempool_dynamic_bitset &used_blocks,
5753 uint64_t granularity,
5754 store_statfs_t& expected_statfs)
5755 {
5756 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
5757 int errors = 0;
5758 for (auto e : extents) {
5759 if (!e.is_valid())
5760 continue;
5761 expected_statfs.allocated += e.length;
5762 if (compressed) {
5763 expected_statfs.compressed_allocated += e.length;
5764 }
5765 bool already = false;
5766 apply(
5767 e.offset, e.length, granularity, used_blocks,
5768 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5769 assert(pos < bs.size());
5770 if (bs.test(pos))
5771 already = true;
5772 else
5773 bs.set(pos);
5774 });
5775 if (already) {
5776 derr << " " << oid << " extent " << e
5777 << " or a subset is already allocated" << dendl;
5778 ++errors;
5779 }
5780 if (e.end() > bdev->get_size()) {
5781 derr << " " << oid << " extent " << e
5782 << " past end of block device" << dendl;
5783 ++errors;
5784 }
5785 }
5786 return errors;
5787 }
5788
5789 int BlueStore::_fsck(bool deep, bool repair)
5790 {
5791 dout(1) << __func__
5792 << (repair ? " repair" : " fsck")
5793 << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
5794 int errors = 0;
5795 int repaired = 0;
5796
5797 typedef btree::btree_set<
5798 uint64_t,std::less<uint64_t>,
5799 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
5800 uint64_t_btree_t used_nids;
5801 uint64_t_btree_t used_omap_head;
5802 uint64_t_btree_t used_sbids;
5803
5804 mempool_dynamic_bitset used_blocks;
5805 KeyValueDB::Iterator it;
5806 store_statfs_t expected_statfs, actual_statfs;
5807 struct sb_info_t {
5808 list<ghobject_t> oids;
5809 SharedBlobRef sb;
5810 bluestore_extent_ref_map_t ref_map;
5811 bool compressed;
5812 };
5813 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
5814
5815 uint64_t num_objects = 0;
5816 uint64_t num_extents = 0;
5817 uint64_t num_blobs = 0;
5818 uint64_t num_spanning_blobs = 0;
5819 uint64_t num_shared_blobs = 0;
5820 uint64_t num_sharded_objects = 0;
5821 uint64_t num_object_shards = 0;
5822
5823 utime_t start = ceph_clock_now();
5824
5825 int r = _open_path();
5826 if (r < 0)
5827 return r;
5828 r = _open_fsid(false);
5829 if (r < 0)
5830 goto out_path;
5831
5832 r = _read_fsid(&fsid);
5833 if (r < 0)
5834 goto out_fsid;
5835
5836 r = _lock_fsid();
5837 if (r < 0)
5838 goto out_fsid;
5839
5840 r = _open_bdev(false);
5841 if (r < 0)
5842 goto out_fsid;
5843
5844 r = _open_db(false);
5845 if (r < 0)
5846 goto out_bdev;
5847
5848 r = _open_super_meta();
5849 if (r < 0)
5850 goto out_db;
5851
5852 r = _open_fm(false);
5853 if (r < 0)
5854 goto out_db;
5855
5856 r = _open_alloc();
5857 if (r < 0)
5858 goto out_fm;
5859
5860 r = _open_collections(&errors);
5861 if (r < 0)
5862 goto out_alloc;
5863
5864 mempool_thread.init();
5865
5866 // we need finishers and kv_{sync,finalize}_thread *just* for replay
5867 _kv_start();
5868 r = _deferred_replay();
5869 _kv_stop();
5870 if (r < 0)
5871 goto out_scan;
5872
5873 used_blocks.resize(fm->get_alloc_units());
5874 apply(
5875 0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
5876 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5877 assert(pos < bs.size());
5878 bs.set(pos);
5879 }
5880 );
5881
5882 if (bluefs) {
5883 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5884 apply(
5885 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
5886 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5887 assert(pos < bs.size());
5888 bs.set(pos);
5889 }
5890 );
5891 }
5892 r = bluefs->fsck();
5893 if (r < 0) {
5894 goto out_scan;
5895 }
5896 if (r > 0)
5897 errors += r;
5898 }
5899
5900 // get expected statfs; fill unaffected fields to be able to compare
5901 // structs
5902 statfs(&actual_statfs);
5903 expected_statfs.total = actual_statfs.total;
5904 expected_statfs.available = actual_statfs.available;
5905
5906 // walk PREFIX_OBJ
5907 dout(1) << __func__ << " walking object keyspace" << dendl;
5908 it = db->get_iterator(PREFIX_OBJ);
5909 if (it) {
5910 CollectionRef c;
5911 spg_t pgid;
5912 mempool::bluestore_fsck::list<string> expecting_shards;
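// expecting_shards tracks the extent-shard keys that the onodes seen so far
// say should exist. Because an object's shard keys sort immediately after
// its onode key in PREFIX_OBJ, any expected key we skip past is missing and
// any shard key with no pending expectation is stray.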
5913 for (it->lower_bound(string()); it->valid(); it->next()) {
5914 if (g_conf->bluestore_debug_fsck_abort) {
5915 goto out_scan;
5916 }
5917 dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
5918 if (is_extent_shard_key(it->key())) {
5919 while (!expecting_shards.empty() &&
5920 expecting_shards.front() < it->key()) {
5921 derr << "fsck error: missing shard key "
5922 << pretty_binary_string(expecting_shards.front())
5923 << dendl;
5924 ++errors;
5925 expecting_shards.pop_front();
5926 }
5927 if (!expecting_shards.empty() &&
5928 expecting_shards.front() == it->key()) {
5929 // all good
5930 expecting_shards.pop_front();
5931 continue;
5932 }
5933
5934 uint32_t offset;
5935 string okey;
5936 get_key_extent_shard(it->key(), &okey, &offset);
5937 derr << "fsck error: stray shard 0x" << std::hex << offset
5938 << std::dec << dendl;
5939 if (expecting_shards.empty()) {
5940 derr << "fsck error: " << pretty_binary_string(it->key())
5941 << " is unexpected" << dendl;
5942 ++errors;
5943 continue;
5944 }
5945 while (expecting_shards.front() > it->key()) {
5946 derr << "fsck error: saw " << pretty_binary_string(it->key())
5947 << dendl;
5948 derr << "fsck error: exp "
5949 << pretty_binary_string(expecting_shards.front()) << dendl;
5950 ++errors;
5951 expecting_shards.pop_front();
5952 if (expecting_shards.empty()) {
5953 break;
5954 }
5955 }
5956 continue;
5957 }
5958
5959 ghobject_t oid;
5960 int r = get_key_object(it->key(), &oid);
5961 if (r < 0) {
5962 derr << "fsck error: bad object key "
5963 << pretty_binary_string(it->key()) << dendl;
5964 ++errors;
5965 continue;
5966 }
5967 if (!c ||
5968 oid.shard_id != pgid.shard ||
5969 oid.hobj.pool != (int64_t)pgid.pool() ||
5970 !c->contains(oid)) {
5971 c = nullptr;
5972 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
5973 coll_map.begin();
5974 p != coll_map.end();
5975 ++p) {
5976 if (p->second->contains(oid)) {
5977 c = p->second;
5978 break;
5979 }
5980 }
5981 if (!c) {
5982 derr << "fsck error: stray object " << oid
5983 << " not owned by any collection" << dendl;
5984 ++errors;
5985 continue;
5986 }
5987 c->cid.is_pg(&pgid);
5988 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
5989 << dendl;
5990 }
5991
5992 if (!expecting_shards.empty()) {
5993 for (auto &k : expecting_shards) {
5994 derr << "fsck error: missing shard key "
5995 << pretty_binary_string(k) << dendl;
5996 }
5997 ++errors;
5998 expecting_shards.clear();
5999 }
6000
6001 dout(10) << __func__ << " " << oid << dendl;
6002 RWLock::RLocker l(c->lock);
6003 OnodeRef o = c->get_onode(oid, false);
6004 if (o->onode.nid) {
6005 if (o->onode.nid > nid_max) {
6006 derr << "fsck error: " << oid << " nid " << o->onode.nid
6007 << " > nid_max " << nid_max << dendl;
6008 ++errors;
6009 }
6010 if (used_nids.count(o->onode.nid)) {
6011 derr << "fsck error: " << oid << " nid " << o->onode.nid
6012 << " already in use" << dendl;
6013 ++errors;
6014 continue; // go for next object
6015 }
6016 used_nids.insert(o->onode.nid);
6017 }
6018 ++num_objects;
6019 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
6020 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
6021 _dump_onode(o, 30);
6022 // shards
6023 if (!o->extent_map.shards.empty()) {
6024 ++num_sharded_objects;
6025 num_object_shards += o->extent_map.shards.size();
6026 }
6027 for (auto& s : o->extent_map.shards) {
6028 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
6029 expecting_shards.push_back(string());
6030 get_extent_shard_key(o->key, s.shard_info->offset,
6031 &expecting_shards.back());
6032 if (s.shard_info->offset >= o->onode.size) {
6033 derr << "fsck error: " << oid << " shard 0x" << std::hex
6034 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
6035 << std::dec << dendl;
6036 ++errors;
6037 }
6038 }
6039 // lextents
6040 map<BlobRef,bluestore_blob_t::unused_t> referenced;
6041 uint64_t pos = 0;
6042 mempool::bluestore_fsck::map<BlobRef,
6043 bluestore_blob_use_tracker_t> ref_map;
6044 for (auto& l : o->extent_map.extent_map) {
6045 dout(20) << __func__ << " " << l << dendl;
6046 if (l.logical_offset < pos) {
6047 derr << "fsck error: " << oid << " lextent at 0x"
6048 << std::hex << l.logical_offset
6049 << " overlaps with the previous, which ends at 0x" << pos
6050 << std::dec << dendl;
6051 ++errors;
6052 }
6053 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
6054 derr << "fsck error: " << oid << " lextent at 0x"
6055 << std::hex << l.logical_offset << "~" << l.length
6056 << " spans a shard boundary"
6057 << std::dec << dendl;
6058 ++errors;
6059 }
6060 pos = l.logical_offset + l.length;
6061 expected_statfs.stored += l.length;
6062 assert(l.blob);
6063 const bluestore_blob_t& blob = l.blob->get_blob();
6064
6065 auto& ref = ref_map[l.blob];
6066 if (ref.is_empty()) {
6067 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
6068 uint32_t l = blob.get_logical_length();
6069 ref.init(l, min_release_size);
6070 }
6071 ref.get(
6072 l.blob_offset,
6073 l.length);
6074 ++num_extents;
6075 if (blob.has_unused()) {
6076 auto p = referenced.find(l.blob);
6077 bluestore_blob_t::unused_t *pu;
6078 if (p == referenced.end()) {
6079 pu = &referenced[l.blob];
6080 } else {
6081 pu = &p->second;
6082 }
6083 uint64_t blob_len = blob.get_logical_length();
6084 assert((blob_len % (sizeof(*pu)*8)) == 0);
6085 assert(l.blob_offset + l.length <= blob_len);
6086 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
6087 uint64_t start = l.blob_offset / chunk_size;
6088 uint64_t end =
6089 ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
6090 for (auto i = start; i < end; ++i) {
6091 (*pu) |= (1u << i);
6092 }
6093 }
6094 }
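// 'referenced' now holds, per blob with an unused bitmap, the bits actually
// covered by lextents. Illustration: if unused_t were 16 bits and
// blob_len=0x8000, chunk_size would be 0x800 and an lextent at blob offset
// 0x1000~0x800 would set bit 2. Any overlap between these bits and
// blob.unused is reported as an error below.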
6095 for (auto &i : referenced) {
6096 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
6097 << std::dec << " for " << *i.first << dendl;
6098 const bluestore_blob_t& blob = i.first->get_blob();
6099 if (i.second & blob.unused) {
6100 derr << "fsck error: " << oid << " blob claims unused 0x"
6101 << std::hex << blob.unused
6102 << " but extents reference 0x" << i.second
6103 << " on blob " << *i.first << dendl;
6104 ++errors;
6105 }
6106 if (blob.has_csum()) {
6107 uint64_t blob_len = blob.get_logical_length();
6108 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
6109 unsigned csum_count = blob.get_csum_count();
6110 unsigned csum_chunk_size = blob.get_csum_chunk_size();
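// For each csum chunk, build the mask of unused-bitmap bits the chunk
// overlaps; if every one of those bits is set (the whole chunk is marked
// unused) the check below expects the stored checksum for it to be zero.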
6111 for (unsigned p = 0; p < csum_count; ++p) {
6112 unsigned pos = p * csum_chunk_size;
6113 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
6114 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
6115 unsigned mask = 1u << firstbit;
6116 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
6117 mask |= 1u << b;
6118 }
6119 if ((blob.unused & mask) == mask) {
6120 // this csum chunk region is marked unused
6121 if (blob.get_csum_item(p) != 0) {
6122 derr << "fsck error: " << oid
6123 << " blob claims csum chunk 0x" << std::hex << pos
6124 << "~" << csum_chunk_size
6125 << " is unused (mask 0x" << mask << " of unused 0x"
6126 << blob.unused << ") but csum is non-zero 0x"
6127 << blob.get_csum_item(p) << std::dec << " on blob "
6128 << *i.first << dendl;
6129 ++errors;
6130 }
6131 }
6132 }
6133 }
6134 }
6135 for (auto &i : ref_map) {
6136 ++num_blobs;
6137 const bluestore_blob_t& blob = i.first->get_blob();
6138 bool equal = i.first->get_blob_use_tracker().equal(i.second);
6139 if (!equal) {
6140 derr << "fsck error: " << oid << " blob " << *i.first
6141 << " doesn't match expected ref_map " << i.second << dendl;
6142 ++errors;
6143 }
6144 if (blob.is_compressed()) {
6145 expected_statfs.compressed += blob.get_compressed_payload_length();
6146 expected_statfs.compressed_original +=
6147 i.first->get_referenced_bytes();
6148 }
6149 if (blob.is_shared()) {
6150 if (i.first->shared_blob->get_sbid() > blobid_max) {
6151 derr << "fsck error: " << oid << " blob " << blob
6152 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
6153 << blobid_max << dendl;
6154 ++errors;
6155 } else if (i.first->shared_blob->get_sbid() == 0) {
6156 derr << "fsck error: " << oid << " blob " << blob
6157 << " marked as shared but has uninitialized sbid"
6158 << dendl;
6159 ++errors;
6160 }
6161 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
6162 sbi.sb = i.first->shared_blob;
6163 sbi.oids.push_back(oid);
6164 sbi.compressed = blob.is_compressed();
6165 for (auto e : blob.get_extents()) {
6166 if (e.is_valid()) {
6167 sbi.ref_map.get(e.offset, e.length);
6168 }
6169 }
6170 } else {
6171 errors += _fsck_check_extents(oid, blob.get_extents(),
6172 blob.is_compressed(),
6173 used_blocks,
6174 fm->get_alloc_size(),
6175 expected_statfs);
6176 }
6177 }
6178 if (deep) {
6179 bufferlist bl;
6180 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
6181 uint64_t offset = 0;
6182 do {
6183 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
6184 int r = _do_read(c.get(), o, offset, l, bl,
6185 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
6186 if (r < 0) {
6187 ++errors;
6188 derr << "fsck error: " << oid << std::hex
6189 << " error during read: "
6190 << " " << offset << "~" << l
6191 << " " << cpp_strerror(r) << std::dec
6192 << dendl;
6193 break;
6194 }
6195 offset += l;
6196 } while (offset < o->onode.size);
6197 }
6198 // omap
6199 if (o->onode.has_omap()) {
6200 if (used_omap_head.count(o->onode.nid)) {
6201 derr << "fsck error: " << oid << " omap_head " << o->onode.nid
6202 << " already in use" << dendl;
6203 ++errors;
6204 } else {
6205 used_omap_head.insert(o->onode.nid);
6206 }
6207 }
6208 }
6209 }
6210 dout(1) << __func__ << " checking shared_blobs" << dendl;
6211 it = db->get_iterator(PREFIX_SHARED_BLOB);
6212 if (it) {
6213 for (it->lower_bound(string()); it->valid(); it->next()) {
6214 string key = it->key();
6215 uint64_t sbid;
6216 if (get_key_shared_blob(key, &sbid)) {
6217 derr << "fsck error: bad key '" << key
6218 << "' in shared blob namespace" << dendl;
6219 ++errors;
6220 continue;
6221 }
6222 auto p = sb_info.find(sbid);
6223 if (p == sb_info.end()) {
6224 derr << "fsck error: found stray shared blob data for sbid 0x"
6225 << std::hex << sbid << std::dec << dendl;
6226 ++errors;
6227 } else {
6228 ++num_shared_blobs;
6229 sb_info_t& sbi = p->second;
6230 bluestore_shared_blob_t shared_blob(sbid);
6231 bufferlist bl = it->value();
6232 bufferlist::iterator blp = bl.begin();
6233 ::decode(shared_blob, blp);
6234 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
6235 if (shared_blob.ref_map != sbi.ref_map) {
6236 derr << "fsck error: shared blob 0x" << std::hex << sbid
6237 << std::dec << " ref_map " << shared_blob.ref_map
6238 << " != expected " << sbi.ref_map << dendl;
6239 ++errors;
6240 }
6241 PExtentVector extents;
6242 for (auto &r : shared_blob.ref_map.ref_map) {
6243 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
6244 }
6245 errors += _fsck_check_extents(p->second.oids.front(),
6246 extents,
6247 p->second.compressed,
6248 used_blocks,
6249 fm->get_alloc_size(),
6250 expected_statfs);
6251 sb_info.erase(p);
6252 }
6253 }
6254 }
6255 for (auto &p : sb_info) {
6256 derr << "fsck error: shared_blob 0x" << p.first
6257 << " key is missing (" << *p.second.sb << ")" << dendl;
6258 ++errors;
6259 }
6260 if (!(actual_statfs == expected_statfs)) {
6261 derr << "fsck error: actual " << actual_statfs
6262 << " != expected " << expected_statfs << dendl;
6263 ++errors;
6264 }
6265
6266 dout(1) << __func__ << " checking for stray omap data" << dendl;
6267 it = db->get_iterator(PREFIX_OMAP);
6268 if (it) {
6269 for (it->lower_bound(string()); it->valid(); it->next()) {
6270 uint64_t omap_head;
6271 _key_decode_u64(it->key().c_str(), &omap_head);
6272 if (used_omap_head.count(omap_head) == 0) {
6273 derr << "fsck error: found stray omap data on omap_head "
6274 << omap_head << dendl;
6275 ++errors;
6276 }
6277 }
6278 }
6279
6280 dout(1) << __func__ << " checking deferred events" << dendl;
6281 it = db->get_iterator(PREFIX_DEFERRED);
6282 if (it) {
6283 for (it->lower_bound(string()); it->valid(); it->next()) {
6284 bufferlist bl = it->value();
6285 bufferlist::iterator p = bl.begin();
6286 bluestore_deferred_transaction_t wt;
6287 try {
6288 ::decode(wt, p);
6289 } catch (buffer::error& e) {
6290 derr << "fsck error: failed to decode deferred txn "
6291 << pretty_binary_string(it->key()) << dendl;
6292 r = -EIO;
6293 goto out_scan;
6294 }
6295 dout(20) << __func__ << " deferred " << wt.seq
6296 << " ops " << wt.ops.size()
6297 << " released 0x" << std::hex << wt.released << std::dec << dendl;
6298 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
6299 apply(
6300 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
6301 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6302 assert(pos < bs.size());
6303 bs.set(pos);
6304 }
6305 );
6306 }
6307 }
6308 }
6309
6310 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
6311 {
6312 // remove bluefs_extents from used set since the freelist doesn't
6313 // know they are allocated.
6314 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
6315 apply(
6316 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
6317 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6318 assert(pos < bs.size());
6319 bs.reset(pos);
6320 }
6321 );
6322 }
6323 fm->enumerate_reset();
6324 uint64_t offset, length;
6325 while (fm->enumerate_next(&offset, &length)) {
6326 bool intersects = false;
6327 apply(
6328 offset, length, fm->get_alloc_size(), used_blocks,
6329 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6330 assert(pos < bs.size());
6331 if (bs.test(pos)) {
6332 intersects = true;
6333 } else {
6334 bs.set(pos);
6335 }
6336 }
6337 );
6338 if (intersects) {
6339 if (offset == SUPER_RESERVED &&
6340 length == min_alloc_size - SUPER_RESERVED) {
6341 // this is due to the change just after luminous to min_alloc_size
6342 // granularity allocations, and our baked in assumption at the top
6343 // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
6344 // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)). harmless,
6345 // since we will never allocate this region below min_alloc_size.
6346 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
6347 << " and min_alloc_size, 0x" << std::hex << offset << "~"
6348 << length << dendl;
6349 } else {
6350 derr << "fsck error: free extent 0x" << std::hex << offset
6351 << "~" << length << std::dec
6352 << " intersects allocated blocks" << dendl;
6353 ++errors;
6354 }
6355 }
6356 }
6357 fm->enumerate_reset();
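// At this point every block known to be allocated (super reserve, bluefs,
// onode extents, deferred releases) has its bit set, and the freelist walk
// above set bits for free space too, flagging overlaps. Any bit still clear
// is space that is neither referenced nor free: a leak. The flip and
// find_first scan below report those runs as leaked extents.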
6358 size_t count = used_blocks.count();
6359 if (used_blocks.size() != count) {
6360 assert(used_blocks.size() > count);
6361 ++errors;
6362 used_blocks.flip();
6363 size_t start = used_blocks.find_first();
6364 while (start != decltype(used_blocks)::npos) {
6365 size_t cur = start;
6366 while (true) {
6367 size_t next = used_blocks.find_next(cur);
6368 if (next != cur + 1) {
6369 derr << "fsck error: leaked extent 0x" << std::hex
6370 << ((uint64_t)start * fm->get_alloc_size()) << "~"
6371 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
6372 << dendl;
6373 start = next;
6374 break;
6375 }
6376 cur = next;
6377 }
6378 }
6379 used_blocks.flip();
6380 }
6381 }
6382
6383 out_scan:
6384 mempool_thread.shutdown();
6385 _flush_cache();
6386 out_alloc:
6387 _close_alloc();
6388 out_fm:
6389 _close_fm();
6390 out_db:
6391 it.reset(); // before db is closed
6392 _close_db();
6393 out_bdev:
6394 _close_bdev();
6395 out_fsid:
6396 _close_fsid();
6397 out_path:
6398 _close_path();
6399
6400 // fatal errors take precedence
6401 if (r < 0)
6402 return r;
6403
6404 dout(2) << __func__ << " " << num_objects << " objects, "
6405 << num_sharded_objects << " of them sharded. "
6406 << dendl;
6407 dout(2) << __func__ << " " << num_extents << " extents to "
6408 << num_blobs << " blobs, "
6409 << num_spanning_blobs << " spanning, "
6410 << num_shared_blobs << " shared."
6411 << dendl;
6412
6413 utime_t duration = ceph_clock_now() - start;
6414 dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
6415 << " repaired, " << (errors - repaired) << " remaining in "
6416 << duration << " seconds" << dendl;
6417 return errors - repaired;
6418 }
6419
6420 void BlueStore::collect_metadata(map<string,string> *pm)
6421 {
6422 dout(10) << __func__ << dendl;
6423 bdev->collect_metadata("bluestore_bdev_", pm);
6424 if (bluefs) {
6425 (*pm)["bluefs"] = "1";
6426 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
6427 bluefs->collect_metadata(pm);
6428 } else {
6429 (*pm)["bluefs"] = "0";
6430 }
6431 }
6432
6433 int BlueStore::statfs(struct store_statfs_t *buf)
6434 {
6435 buf->reset();
6436 buf->total = bdev->get_size();
6437 buf->available = alloc->get_free();
6438
6439 if (bluefs) {
6440 // part of our shared device is "free" according to BlueFS, but we
6441 // must leave at least bluestore_bluefs_min of it to BlueFS.
6442 int64_t shared_available = std::min(
6443 bluefs->get_free(bluefs_shared_bdev),
6444 bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
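// e.g. (illustrative numbers): if BlueFS reports 10 GiB free on the shared
// device, its total there is 40 GiB and bluestore_bluefs_min is 1 GiB, then
// shared_available = min(10 GiB, 39 GiB) = 10 GiB is added to 'available'.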
6445 if (shared_available > 0) {
6446 buf->available += shared_available;
6447 }
6448 }
6449
6450 {
6451 std::lock_guard<std::mutex> l(vstatfs_lock);
6452
6453 buf->allocated = vstatfs.allocated();
6454 buf->stored = vstatfs.stored();
6455 buf->compressed = vstatfs.compressed();
6456 buf->compressed_original = vstatfs.compressed_original();
6457 buf->compressed_allocated = vstatfs.compressed_allocated();
6458 }
6459
6460 dout(20) << __func__ << *buf << dendl;
6461 return 0;
6462 }
6463
6464 // ---------------
6465 // cache
6466
6467 BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
6468 {
6469 RWLock::RLocker l(coll_lock);
6470 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
6471 if (cp == coll_map.end())
6472 return CollectionRef();
6473 return cp->second;
6474 }
6475
6476 void BlueStore::_queue_reap_collection(CollectionRef& c)
6477 {
6478 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6479 // _reap_collections and this run in the same thread,
6480 // so no lock is needed.
6481 removed_collections.push_back(c);
6482 }
6483
6484 void BlueStore::_reap_collections()
6485 {
6486
6487 list<CollectionRef> removed_colls;
6488 {
6489 // _queue_reap_collection and this run in the same thread,
6490 // so no lock is needed.
6491 if (!removed_collections.empty())
6492 removed_colls.swap(removed_collections);
6493 else
6494 return;
6495 }
6496
6497 list<CollectionRef>::iterator p = removed_colls.begin();
6498 while (p != removed_colls.end()) {
6499 CollectionRef c = *p;
6500 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6501 if (c->onode_map.map_any([&](OnodeRef o) {
6502 assert(!o->exists);
6503 if (o->flushing_count.load()) {
6504 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
6505 << " flush_txns " << o->flushing_count << dendl;
6506 return true;
6507 }
6508 return false;
6509 })) {
6510 ++p;
6511 continue;
6512 }
6513 c->onode_map.clear();
6514 p = removed_colls.erase(p);
6515 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
6516 }
6517 if (removed_colls.empty()) {
6518 dout(10) << __func__ << " all reaped" << dendl;
6519 } else {
6520 removed_collections.splice(removed_collections.begin(), removed_colls);
6521 }
6522 }
6523
6524 void BlueStore::_update_cache_logger()
6525 {
6526 uint64_t num_onodes = 0;
6527 uint64_t num_extents = 0;
6528 uint64_t num_blobs = 0;
6529 uint64_t num_buffers = 0;
6530 uint64_t num_buffer_bytes = 0;
6531 for (auto c : cache_shards) {
6532 c->add_stats(&num_onodes, &num_extents, &num_blobs,
6533 &num_buffers, &num_buffer_bytes);
6534 }
6535 logger->set(l_bluestore_onodes, num_onodes);
6536 logger->set(l_bluestore_extents, num_extents);
6537 logger->set(l_bluestore_blobs, num_blobs);
6538 logger->set(l_bluestore_buffers, num_buffers);
6539 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
6540 }
6541
6542 // ---------------
6543 // read operations
6544
6545 ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
6546 {
6547 return _get_collection(cid);
6548 }
6549
6550 bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
6551 {
6552 CollectionHandle c = _get_collection(cid);
6553 if (!c)
6554 return false;
6555 return exists(c, oid);
6556 }
6557
6558 bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
6559 {
6560 Collection *c = static_cast<Collection *>(c_.get());
6561 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
6562 if (!c->exists)
6563 return false;
6564
6565 bool r = true;
6566
6567 {
6568 RWLock::RLocker l(c->lock);
6569 OnodeRef o = c->get_onode(oid, false);
6570 if (!o || !o->exists)
6571 r = false;
6572 }
6573
6574 return r;
6575 }
6576
6577 int BlueStore::stat(
6578 const coll_t& cid,
6579 const ghobject_t& oid,
6580 struct stat *st,
6581 bool allow_eio)
6582 {
6583 CollectionHandle c = _get_collection(cid);
6584 if (!c)
6585 return -ENOENT;
6586 return stat(c, oid, st, allow_eio);
6587 }
6588
6589 int BlueStore::stat(
6590 CollectionHandle &c_,
6591 const ghobject_t& oid,
6592 struct stat *st,
6593 bool allow_eio)
6594 {
6595 Collection *c = static_cast<Collection *>(c_.get());
6596 if (!c->exists)
6597 return -ENOENT;
6598 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
6599
6600 {
6601 RWLock::RLocker l(c->lock);
6602 OnodeRef o = c->get_onode(oid, false);
6603 if (!o || !o->exists)
6604 return -ENOENT;
6605 st->st_size = o->onode.size;
6606 st->st_blksize = 4096;
6607 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
6608 st->st_nlink = 1;
6609 }
6610
6611 int r = 0;
6612 if (_debug_mdata_eio(oid)) {
6613 r = -EIO;
6614 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6615 }
6616 return r;
6617 }
6618 int BlueStore::set_collection_opts(
6619 const coll_t& cid,
6620 const pool_opts_t& opts)
6621 {
6622 CollectionHandle ch = _get_collection(cid);
6623 if (!ch)
6624 return -ENOENT;
6625 Collection *c = static_cast<Collection *>(ch.get());
6626 dout(15) << __func__ << " " << cid << " options " << opts << dendl;
6627 if (!c->exists)
6628 return -ENOENT;
6629 RWLock::WLocker l(c->lock);
6630 c->pool_opts = opts;
6631 return 0;
6632 }
6633
6634 int BlueStore::read(
6635 const coll_t& cid,
6636 const ghobject_t& oid,
6637 uint64_t offset,
6638 size_t length,
6639 bufferlist& bl,
6640 uint32_t op_flags)
6641 {
6642 CollectionHandle c = _get_collection(cid);
6643 if (!c)
6644 return -ENOENT;
6645 return read(c, oid, offset, length, bl, op_flags);
6646 }
6647
6648 int BlueStore::read(
6649 CollectionHandle &c_,
6650 const ghobject_t& oid,
6651 uint64_t offset,
6652 size_t length,
6653 bufferlist& bl,
6654 uint32_t op_flags)
6655 {
6656 utime_t start = ceph_clock_now();
6657 Collection *c = static_cast<Collection *>(c_.get());
6658 const coll_t &cid = c->get_cid();
6659 dout(15) << __func__ << " " << cid << " " << oid
6660 << " 0x" << std::hex << offset << "~" << length << std::dec
6661 << dendl;
6662 if (!c->exists)
6663 return -ENOENT;
6664
6665 bl.clear();
6666 int r;
6667 {
6668 RWLock::RLocker l(c->lock);
6669 utime_t start1 = ceph_clock_now();
6670 OnodeRef o = c->get_onode(oid, false);
6671 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
6672 if (!o || !o->exists) {
6673 r = -ENOENT;
6674 goto out;
6675 }
6676
6677 if (offset == length && offset == 0)
6678 length = o->onode.size;
6679
6680 r = _do_read(c, o, offset, length, bl, op_flags);
6681 if (r == -EIO) {
6682 logger->inc(l_bluestore_read_eio);
6683 }
6684 }
6685
6686 out:
6687 if (r >= 0 && _debug_data_eio(oid)) {
6688 r = -EIO;
6689 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6690 } else if (cct->_conf->bluestore_debug_random_read_err &&
6691 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) {
6692 dout(0) << __func__ << ": inject random EIO" << dendl;
6693 r = -EIO;
6694 }
6695 dout(10) << __func__ << " " << cid << " " << oid
6696 << " 0x" << std::hex << offset << "~" << length << std::dec
6697 << " = " << r << dendl;
6698 logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
6699 return r;
6700 }
6701
6702 // --------------------------------------------------------
6703 // intermediate data structures used while reading
6704 struct region_t {
6705 uint64_t logical_offset;
6706 uint64_t blob_xoffset; // region offset within the blob
6707 uint64_t length;
6708 bufferlist bl;
6709
6710 // used later in read process
6711 uint64_t front = 0;
6712 uint64_t r_off = 0;
6713
6714 region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
6715 : logical_offset(offset),
6716 blob_xoffset(b_offs),
6717 length(len){}
6718 region_t(const region_t& from)
6719 : logical_offset(from.logical_offset),
6720 blob_xoffset(from.blob_xoffset),
6721 length(from.length){}
6722
6723 friend ostream& operator<<(ostream& out, const region_t& r) {
6724 return out << "0x" << std::hex << r.logical_offset << ":"
6725 << r.blob_xoffset << "~" << r.length << std::dec;
6726 }
6727 };
6728
6729 typedef list<region_t> regions2read_t;
6730 typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
6731
6732 int BlueStore::_do_read(
6733 Collection *c,
6734 OnodeRef o,
6735 uint64_t offset,
6736 size_t length,
6737 bufferlist& bl,
6738 uint32_t op_flags,
6739 uint64_t retry_count)
6740 {
6741 FUNCTRACE();
6742 int r = 0;
6743 int read_cache_policy = 0; // do not bypass clean or dirty cache
6744
6745 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6746 << " size 0x" << o->onode.size << " (" << std::dec
6747 << o->onode.size << ")" << dendl;
6748 bl.clear();
6749
6750 if (offset >= o->onode.size) {
6751 return r;
6752 }
6753
6754 // generally, don't buffer anything, unless the client explicitly requests
6755 // it.
6756 bool buffered = false;
6757 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
6758 dout(20) << __func__ << " will do buffered read" << dendl;
6759 buffered = true;
6760 } else if (cct->_conf->bluestore_default_buffered_read &&
6761 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
6762 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
6763 dout(20) << __func__ << " defaulting to buffered read" << dendl;
6764 buffered = true;
6765 }
6766
6767 if (offset + length > o->onode.size) {
6768 length = o->onode.size - offset;
6769 }
6770
6771 utime_t start = ceph_clock_now();
6772 o->extent_map.fault_range(db, offset, length);
6773 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
6774 _dump_onode(o);
6775
6776 ready_regions_t ready_regions;
6777
6778 // for deep-scrub, we only read dirty cache and bypass clean cache in
6779 // order to read underlying block device in case there are silent disk errors.
6780 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
6781 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
6782 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
6783 }
6784
6785 // build a blob-wise list of regions to read (those that aren't cached)
6786 blobs2read_t blobs2read;
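// Walk the lextents covering [offset, offset+length): satisfy what we can
// from the per-blob buffer cache, and queue everything else as region_t
// entries in blobs2read, keyed by blob, for a single (possibly aio) pass
// over the device below.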
6787 unsigned left = length;
6788 uint64_t pos = offset;
6789 unsigned num_regions = 0;
6790 auto lp = o->extent_map.seek_lextent(offset);
6791 while (left > 0 && lp != o->extent_map.extent_map.end()) {
6792 if (pos < lp->logical_offset) {
6793 unsigned hole = lp->logical_offset - pos;
6794 if (hole >= left) {
6795 break;
6796 }
6797 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
6798 << std::dec << dendl;
6799 pos += hole;
6800 left -= hole;
6801 }
6802 BlobRef& bptr = lp->blob;
6803 unsigned l_off = pos - lp->logical_offset;
6804 unsigned b_off = l_off + lp->blob_offset;
6805 unsigned b_len = std::min(left, lp->length - l_off);
6806
6807 ready_regions_t cache_res;
6808 interval_set<uint32_t> cache_interval;
6809 bptr->shared_blob->bc.read(
6810 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
6811 read_cache_policy);
6812 dout(20) << __func__ << " blob " << *bptr << std::hex
6813 << " need 0x" << b_off << "~" << b_len
6814 << " cache has 0x" << cache_interval
6815 << std::dec << dendl;
6816
6817 auto pc = cache_res.begin();
6818 while (b_len > 0) {
6819 unsigned l;
6820 if (pc != cache_res.end() &&
6821 pc->first == b_off) {
6822 l = pc->second.length();
6823 ready_regions[pos].claim(pc->second);
6824 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
6825 << b_off << "~" << l << std::dec << dendl;
6826 ++pc;
6827 } else {
6828 l = b_len;
6829 if (pc != cache_res.end()) {
6830 assert(pc->first > b_off);
6831 l = pc->first - b_off;
6832 }
6833 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
6834 << b_off << "~" << l << std::dec << dendl;
6835 blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
6836 ++num_regions;
6837 }
6838 pos += l;
6839 b_off += l;
6840 left -= l;
6841 b_len -= l;
6842 }
6843 ++lp;
6844 }
6845
6846 // read raw blob data. use aio if we have more than one blob to read.
6847 start = ceph_clock_now(); // for simplicity, time the whole
6848 // block below; the measurement
6849 // error this introduces is negligible.
6850 vector<bufferlist> compressed_blob_bls;
6851 IOContext ioc(cct, NULL, true); // allow EIO
6852 for (auto& p : blobs2read) {
6853 const BlobRef& bptr = p.first;
6854 dout(20) << __func__ << " blob " << *bptr << std::hex
6855 << " need " << p.second << std::dec << dendl;
6856 if (bptr->get_blob().is_compressed()) {
6857 // read the whole thing
6858 if (compressed_blob_bls.empty()) {
6859 // ensure we avoid any reallocation on subsequent blobs
6860 compressed_blob_bls.reserve(blobs2read.size());
6861 }
6862 compressed_blob_bls.push_back(bufferlist());
6863 bufferlist& bl = compressed_blob_bls.back();
6864 r = bptr->get_blob().map(
6865 0, bptr->get_blob().get_ondisk_length(),
6866 [&](uint64_t offset, uint64_t length) {
6867 int r;
6868 // use aio if there are more regions to read than those in this blob
6869 if (num_regions > p.second.size()) {
6870 r = bdev->aio_read(offset, length, &bl, &ioc);
6871 } else {
6872 r = bdev->read(offset, length, &bl, &ioc, false);
6873 }
6874 if (r < 0)
6875 return r;
6876 return 0;
6877 });
6878 if (r < 0) {
6879 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
6880 if (r == -EIO) {
6881 // propagate EIO to caller
6882 return r;
6883 }
6884 assert(r == 0);
6885 }
6886 } else {
6887 // read the pieces
6888 for (auto& reg : p.second) {
6889 // determine how much of the blob to read
6890 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
6891 reg.r_off = reg.blob_xoffset;
6892 uint64_t r_len = reg.length;
6893 reg.front = reg.r_off % chunk_size;
6894 if (reg.front) {
6895 reg.r_off -= reg.front;
6896 r_len += reg.front;
6897 }
6898 unsigned tail = r_len % chunk_size;
6899 if (tail) {
6900 r_len += chunk_size - tail;
6901 }
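// r_off/r_len are expanded to chunk_size alignment so checksums can be
// verified; 'front' remembers how much padding precedes the wanted bytes.
// Illustration: with chunk_size 0x1000, a request for blob offset
// 0x1800~0x400 becomes r_off=0x1000, front=0x800, r_len=0x1000.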
6902 dout(20) << __func__ << " region 0x" << std::hex
6903 << reg.logical_offset
6904 << ": 0x" << reg.blob_xoffset << "~" << reg.length
6905 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
6906 << dendl;
6907
6908 // read it
6909 r = bptr->get_blob().map(
6910 reg.r_off, r_len,
6911 [&](uint64_t offset, uint64_t length) {
6912 int r;
6913 // use aio if there is more than one region to read
6914 if (num_regions > 1) {
6915 r = bdev->aio_read(offset, length, &reg.bl, &ioc);
6916 } else {
6917 r = bdev->read(offset, length, &reg.bl, &ioc, false);
6918 }
6919 if (r < 0)
6920 return r;
6921 return 0;
6922 });
6923 if (r < 0) {
6924 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
6925 << dendl;
6926 if (r == -EIO) {
6927 // propagate EIO to caller
6928 return r;
6929 }
6930 assert(r == 0);
6931 }
6932 assert(reg.bl.length() == r_len);
6933 }
6934 }
6935 }
6936 if (ioc.has_pending_aios()) {
6937 bdev->aio_submit(&ioc);
6938 dout(20) << __func__ << " waiting for aio" << dendl;
6939 ioc.aio_wait();
6940 r = ioc.get_return_value();
6941 if (r < 0) {
6942 assert(r == -EIO); // no other errors allowed
6943 return -EIO;
6944 }
6945 }
6946 logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
6947
6948 // enumerate and decompress desired blobs
6949 auto p = compressed_blob_bls.begin();
6950 blobs2read_t::iterator b2r_it = blobs2read.begin();
6951 while (b2r_it != blobs2read.end()) {
6952 const BlobRef& bptr = b2r_it->first;
6953 dout(20) << __func__ << " blob " << *bptr << std::hex
6954 << " need 0x" << b2r_it->second << std::dec << dendl;
6955 if (bptr->get_blob().is_compressed()) {
6956 assert(p != compressed_blob_bls.end());
6957 bufferlist& compressed_bl = *p++;
6958 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
6959 b2r_it->second.front().logical_offset) < 0) {
6960 // Handles spurious read errors caused by a kernel bug.
6961 // We sometimes get all-zero pages as a result of the read under
6962 // high memory pressure. Retrying the failing read succeeds in most cases.
6963 // See also: http://tracker.ceph.com/issues/22464
6964 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
6965 return -EIO;
6966 }
6967 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
6968 }
6969 bufferlist raw_bl;
6970 r = _decompress(compressed_bl, &raw_bl);
6971 if (r < 0)
6972 return r;
6973 if (buffered) {
6974 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
6975 raw_bl);
6976 }
6977 for (auto& i : b2r_it->second) {
6978 ready_regions[i.logical_offset].substr_of(
6979 raw_bl, i.blob_xoffset, i.length);
6980 }
6981 } else {
6982 for (auto& reg : b2r_it->second) {
6983 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
6984 reg.logical_offset) < 0) {
6985 // Handles spurious read errors caused by a kernel bug.
6986 // We sometimes get all-zero pages as a result of the read under
6987 // high memory pressure. Retrying the failing read succeeds in most cases.
6988 // See also: http://tracker.ceph.com/issues/22464
6989 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
6990 return -EIO;
6991 }
6992 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
6993 }
6994 if (buffered) {
6995 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
6996 reg.r_off, reg.bl);
6997 }
6998
6999 // prune and keep result
7000 ready_regions[reg.logical_offset].substr_of(
7001 reg.bl, reg.front, reg.length);
7002 }
7003 }
7004 ++b2r_it;
7005 }
7006
7007 // generate a resulting buffer
7008 auto pr = ready_regions.begin();
7009 auto pr_end = ready_regions.end();
7010 pos = 0;
7011 while (pos < length) {
7012 if (pr != pr_end && pr->first == pos + offset) {
7013 dout(30) << __func__ << " assemble 0x" << std::hex << pos
7014 << ": data from 0x" << pr->first << "~" << pr->second.length()
7015 << std::dec << dendl;
7016 pos += pr->second.length();
7017 bl.claim_append(pr->second);
7018 ++pr;
7019 } else {
7020 uint64_t l = length - pos;
7021 if (pr != pr_end) {
7022 assert(pr->first > pos + offset);
7023 l = pr->first - (pos + offset);
7024 }
7025 dout(30) << __func__ << " assemble 0x" << std::hex << pos
7026 << ": zeros for 0x" << (pos + offset) << "~" << l
7027 << std::dec << dendl;
7028 bl.append_zero(l);
7029 pos += l;
7030 }
7031 }
7032 assert(bl.length() == length);
7033 assert(pos == length);
7034 assert(pr == pr_end);
7035 r = bl.length();
7036 if (retry_count) {
7037 logger->inc(l_bluestore_reads_with_retries);
7038 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
7039 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
7040 }
7041 return r;
7042 }
7043
7044 int BlueStore::_verify_csum(OnodeRef& o,
7045 const bluestore_blob_t* blob, uint64_t blob_xoffset,
7046 const bufferlist& bl,
7047 uint64_t logical_offset) const
7048 {
7049 int bad;
7050 uint64_t bad_csum;
7051 utime_t start = ceph_clock_now();
7052 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
7053 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
7054 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
7055 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
7056 bad = blob_xoffset;
7057 r = -1;
7058 bad_csum = 0xDEADBEEF;
7059 }
7060 if (r < 0) {
7061 if (r == -1) {
7062 PExtentVector pex;
7063 blob->map(
7064 bad,
7065 blob->get_csum_chunk_size(),
7066 [&](uint64_t offset, uint64_t length) {
7067 pex.emplace_back(bluestore_pextent_t(offset, length));
7068 return 0;
7069 });
7070 derr << __func__ << " bad "
7071 << Checksummer::get_csum_type_string(blob->csum_type)
7072 << "/0x" << std::hex << blob->get_csum_chunk_size()
7073 << " checksum at blob offset 0x" << bad
7074 << ", got 0x" << bad_csum << ", expected 0x"
7075 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
7076 << ", device location " << pex
7077 << ", logical extent 0x" << std::hex
7078 << (logical_offset + bad - blob_xoffset) << "~"
7079 << blob->get_csum_chunk_size() << std::dec
7080 << ", object " << o->oid
7081 << dendl;
7082 } else {
7083 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
7084 }
7085 }
7086 logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
7087 return r;
7088 }
7089
7090 int BlueStore::_decompress(bufferlist& source, bufferlist* result)
7091 {
7092 int r = 0;
7093 utime_t start = ceph_clock_now();
7094 bufferlist::iterator i = source.begin();
7095 bluestore_compression_header_t chdr;
7096 ::decode(chdr, i);
7097 int alg = int(chdr.type);
7098 CompressorRef cp = compressor;
7099 if (!cp || (int)cp->get_type() != alg) {
7100 cp = Compressor::create(cct, alg);
7101 }
7102
7103 if (!cp.get()) {
7104 // if the compressor isn't available, fail: we cannot return
7105 // the decompressed data.
7106 derr << __func__ << " can't load decompressor " << alg << dendl;
7107 r = -EIO;
7108 } else {
7109 r = cp->decompress(i, chdr.length, *result);
7110 if (r < 0) {
7111 derr << __func__ << " decompression failed with exit code " << r << dendl;
7112 r = -EIO;
7113 }
7114 }
7115 logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
7116 return r;
7117 }
7118
7119 // this variant stores the fiemap result in an interval_set; the other
7120 // variations use it internally
7121 int BlueStore::_fiemap(
7122 CollectionHandle &c_,
7123 const ghobject_t& oid,
7124 uint64_t offset,
7125 size_t length,
7126 interval_set<uint64_t>& destset)
7127 {
7128 Collection *c = static_cast<Collection *>(c_.get());
7129 if (!c->exists)
7130 return -ENOENT;
7131 {
7132 RWLock::RLocker l(c->lock);
7133
7134 OnodeRef o = c->get_onode(oid, false);
7135 if (!o || !o->exists) {
7136 return -ENOENT;
7137 }
7138 _dump_onode(o);
7139
7140 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
7141 << " size 0x" << o->onode.size << std::dec << dendl;
7142
7143 boost::intrusive::set<Extent>::iterator ep, eend;
7144 if (offset >= o->onode.size)
7145 goto out;
7146
7147 if (offset + length > o->onode.size) {
7148 length = o->onode.size - offset;
7149 }
7150
7151 o->extent_map.fault_range(db, offset, length);
7152 eend = o->extent_map.extent_map.end();
7153 ep = o->extent_map.seek_lextent(offset);
7154 while (length > 0) {
7155 dout(20) << __func__ << " offset " << offset << dendl;
7156 if (ep != eend && ep->logical_offset + ep->length <= offset) {
7157 ++ep;
7158 continue;
7159 }
7160
7161 uint64_t x_len = length;
7162 if (ep != eend && ep->logical_offset <= offset) {
7163 uint64_t x_off = offset - ep->logical_offset;
7164 x_len = MIN(x_len, ep->length - x_off);
7165 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
7166 << x_len << std::dec << " blob " << ep->blob << dendl;
7167 destset.insert(offset, x_len);
7168 length -= x_len;
7169 offset += x_len;
7170 if (x_off + x_len == ep->length)
7171 ++ep;
7172 continue;
7173 }
7174 if (ep != eend &&
7175 ep->logical_offset > offset &&
7176 ep->logical_offset - offset < x_len) {
7177 x_len = ep->logical_offset - offset;
7178 }
7179 offset += x_len;
7180 length -= x_len;
7181 }
7182 }
7183
7184 out:
7185 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
7186 << " size = 0x(" << destset << ")" << std::dec << dendl;
7187 return 0;
7188 }
7189
7190 int BlueStore::fiemap(
7191 const coll_t& cid,
7192 const ghobject_t& oid,
7193 uint64_t offset,
7194 size_t len,
7195 bufferlist& bl)
7196 {
7197 CollectionHandle c = _get_collection(cid);
7198 if (!c)
7199 return -ENOENT;
7200 return fiemap(c, oid, offset, len, bl);
7201 }
7202
7203 int BlueStore::fiemap(
7204 CollectionHandle &c_,
7205 const ghobject_t& oid,
7206 uint64_t offset,
7207 size_t length,
7208 bufferlist& bl)
7209 {
7210 interval_set<uint64_t> m;
7211 int r = _fiemap(c_, oid, offset, length, m);
7212 if (r >= 0) {
7213 ::encode(m, bl);
7214 }
7215 return r;
7216 }
7217
7218 int BlueStore::fiemap(
7219 const coll_t& cid,
7220 const ghobject_t& oid,
7221 uint64_t offset,
7222 size_t len,
7223 map<uint64_t, uint64_t>& destmap)
7224 {
7225 CollectionHandle c = _get_collection(cid);
7226 if (!c)
7227 return -ENOENT;
7228 return fiemap(c, oid, offset, len, destmap);
7229 }
7230
7231 int BlueStore::fiemap(
7232 CollectionHandle &c_,
7233 const ghobject_t& oid,
7234 uint64_t offset,
7235 size_t length,
7236 map<uint64_t, uint64_t>& destmap)
7237 {
7238 interval_set<uint64_t> m;
7239 int r = _fiemap(c_, oid, offset, length, m);
7240 if (r >= 0) {
7241 m.move_into(destmap);
7242 }
7243 return r;
7244 }
7245
7246 int BlueStore::getattr(
7247 const coll_t& cid,
7248 const ghobject_t& oid,
7249 const char *name,
7250 bufferptr& value)
7251 {
7252 CollectionHandle c = _get_collection(cid);
7253 if (!c)
7254 return -ENOENT;
7255 return getattr(c, oid, name, value);
7256 }
7257
7258 int BlueStore::getattr(
7259 CollectionHandle &c_,
7260 const ghobject_t& oid,
7261 const char *name,
7262 bufferptr& value)
7263 {
7264 Collection *c = static_cast<Collection *>(c_.get());
7265 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
7266 if (!c->exists)
7267 return -ENOENT;
7268
7269 int r;
7270 {
7271 RWLock::RLocker l(c->lock);
7272 mempool::bluestore_cache_other::string k(name);
7273
7274 OnodeRef o = c->get_onode(oid, false);
7275 if (!o || !o->exists) {
7276 r = -ENOENT;
7277 goto out;
7278 }
7279
7280 if (!o->onode.attrs.count(k)) {
7281 r = -ENODATA;
7282 goto out;
7283 }
7284 value = o->onode.attrs[k];
7285 r = 0;
7286 }
7287 out:
7288 if (r == 0 && _debug_mdata_eio(oid)) {
7289 r = -EIO;
7290 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7291 }
7292 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
7293 << " = " << r << dendl;
7294 return r;
7295 }
7296
7297
7298 int BlueStore::getattrs(
7299 const coll_t& cid,
7300 const ghobject_t& oid,
7301 map<string,bufferptr>& aset)
7302 {
7303 CollectionHandle c = _get_collection(cid);
7304 if (!c)
7305 return -ENOENT;
7306 return getattrs(c, oid, aset);
7307 }
7308
7309 int BlueStore::getattrs(
7310 CollectionHandle &c_,
7311 const ghobject_t& oid,
7312 map<string,bufferptr>& aset)
7313 {
7314 Collection *c = static_cast<Collection *>(c_.get());
7315 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
7316 if (!c->exists)
7317 return -ENOENT;
7318
7319 int r;
7320 {
7321 RWLock::RLocker l(c->lock);
7322
7323 OnodeRef o = c->get_onode(oid, false);
7324 if (!o || !o->exists) {
7325 r = -ENOENT;
7326 goto out;
7327 }
7328 for (auto& i : o->onode.attrs) {
7329 aset.emplace(i.first.c_str(), i.second);
7330 }
7331 r = 0;
7332 }
7333
7334 out:
7335 if (r == 0 && _debug_mdata_eio(oid)) {
7336 r = -EIO;
7337 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7338 }
7339 dout(10) << __func__ << " " << c->cid << " " << oid
7340 << " = " << r << dendl;
7341 return r;
7342 }
7343
7344 int BlueStore::list_collections(vector<coll_t>& ls)
7345 {
7346 RWLock::RLocker l(coll_lock);
7347 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
7348 p != coll_map.end();
7349 ++p)
7350 ls.push_back(p->first);
7351 return 0;
7352 }
7353
7354 bool BlueStore::collection_exists(const coll_t& c)
7355 {
7356 RWLock::RLocker l(coll_lock);
7357 return coll_map.count(c);
7358 }
7359
7360 int BlueStore::collection_empty(const coll_t& cid, bool *empty)
7361 {
7362 dout(15) << __func__ << " " << cid << dendl;
7363 vector<ghobject_t> ls;
7364 ghobject_t next;
7365 int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
7366 &ls, &next);
7367 if (r < 0) {
7368 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
7369 << dendl;
7370 return r;
7371 }
7372 *empty = ls.empty();
7373 dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
7374 return 0;
7375 }
7376
7377 int BlueStore::collection_bits(const coll_t& cid)
7378 {
7379 dout(15) << __func__ << " " << cid << dendl;
7380 CollectionRef c = _get_collection(cid);
7381 if (!c)
7382 return -ENOENT;
7383 RWLock::RLocker l(c->lock);
7384 dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
7385 return c->cnode.bits;
7386 }
7387
7388 int BlueStore::collection_list(
7389 const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
7390 vector<ghobject_t> *ls, ghobject_t *pnext)
7391 {
7392 CollectionHandle c = _get_collection(cid);
7393 if (!c)
7394 return -ENOENT;
7395 return collection_list(c, start, end, max, ls, pnext);
7396 }
7397
7398 int BlueStore::collection_list(
7399 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
7400 vector<ghobject_t> *ls, ghobject_t *pnext)
7401 {
7402 Collection *c = static_cast<Collection *>(c_.get());
7403 dout(15) << __func__ << " " << c->cid
7404 << " start " << start << " end " << end << " max " << max << dendl;
7405 int r;
7406 {
7407 RWLock::RLocker l(c->lock);
7408 r = _collection_list(c, start, end, max, ls, pnext);
7409 }
7410
7411 dout(10) << __func__ << " " << c->cid
7412 << " start " << start << " end " << end << " max " << max
7413 << " = " << r << ", ls.size() = " << ls->size()
7414 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
7415 return r;
7416 }
7417
7418 int BlueStore::_collection_list(
7419 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
7420 vector<ghobject_t> *ls, ghobject_t *pnext)
7421 {
7422
7423 if (!c->exists)
7424 return -ENOENT;
7425
7426 int r = 0;
7427 ghobject_t static_next;
7428 KeyValueDB::Iterator it;
7429 string temp_start_key, temp_end_key;
7430 string start_key, end_key;
7431 bool set_next = false;
7432 string pend;
7433 bool temp;
7434
7435 if (!pnext)
7436 pnext = &static_next;
7437
7438 if (start == ghobject_t::get_max() ||
7439 start.hobj.is_max()) {
7440 goto out;
7441 }
7442 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
7443 &start_key, &end_key);
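// Objects for a collection live in two key ranges: a "temp" range and the
// normal range (see get_coll_key_range). We iterate the temp range first
// and, unless the end bound is itself a temp object, fall through to the
// normal range once it is exhausted.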
7444 dout(20) << __func__
7445 << " range " << pretty_binary_string(temp_start_key)
7446 << " to " << pretty_binary_string(temp_end_key)
7447 << " and " << pretty_binary_string(start_key)
7448 << " to " << pretty_binary_string(end_key)
7449 << " start " << start << dendl;
7450 it = db->get_iterator(PREFIX_OBJ);
7451 if (start == ghobject_t() ||
7452 start.hobj == hobject_t() ||
7453 start == c->cid.get_min_hobj()) {
7454 it->upper_bound(temp_start_key);
7455 temp = true;
7456 } else {
7457 string k;
7458 get_object_key(cct, start, &k);
7459 if (start.hobj.is_temp()) {
7460 temp = true;
7461 assert(k >= temp_start_key && k < temp_end_key);
7462 } else {
7463 temp = false;
7464 assert(k >= start_key && k < end_key);
7465 }
7466 dout(20) << " start from " << pretty_binary_string(k)
7467 << " temp=" << (int)temp << dendl;
7468 it->lower_bound(k);
7469 }
7470 if (end.hobj.is_max()) {
7471 pend = temp ? temp_end_key : end_key;
7472 } else {
7473 get_object_key(cct, end, &end_key);
7474 if (end.hobj.is_temp()) {
7475 if (temp)
7476 pend = end_key;
7477 else
7478 goto out;
7479 } else {
7480 pend = temp ? temp_end_key : end_key;
7481 }
7482 }
7483 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7484 while (true) {
7485 if (!it->valid() || it->key() >= pend) {
7486 if (!it->valid())
7487 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
7488 else
7489 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
7490 << " >= " << end << dendl;
7491 if (temp) {
7492 if (end.hobj.is_temp()) {
7493 break;
7494 }
7495 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
7496 temp = false;
7497 it->upper_bound(start_key);
7498 pend = end_key;
7499 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7500 continue;
7501 }
7502 break;
7503 }
7504 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
7505 if (is_extent_shard_key(it->key())) {
7506 it->next();
7507 continue;
7508 }
7509 ghobject_t oid;
7510 int r = get_key_object(it->key(), &oid);
7511 assert(r == 0);
7512 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
7513 if (ls->size() >= (unsigned)max) {
7514 dout(20) << __func__ << " reached max " << max << dendl;
7515 *pnext = oid;
7516 set_next = true;
7517 break;
7518 }
7519 ls->push_back(oid);
7520 it->next();
7521 }
7522 out:
7523 if (!set_next) {
7524 *pnext = ghobject_t::get_max();
7525 }
7526
7527 return r;
7528 }
7529
7530 int BlueStore::omap_get(
7531 const coll_t& cid, ///< [in] Collection containing oid
7532 const ghobject_t &oid, ///< [in] Object containing omap
7533 bufferlist *header, ///< [out] omap header
7534 map<string, bufferlist> *out ///< [out] Key to value map
7535 )
7536 {
7537 CollectionHandle c = _get_collection(cid);
7538 if (!c)
7539 return -ENOENT;
7540 return omap_get(c, oid, header, out);
7541 }
7542
7543 int BlueStore::omap_get(
7544 CollectionHandle &c_, ///< [in] Collection containing oid
7545 const ghobject_t &oid, ///< [in] Object containing omap
7546 bufferlist *header, ///< [out] omap header
7547 map<string, bufferlist> *out ///< [out] Key to value map
7548 )
7549 {
7550 Collection *c = static_cast<Collection *>(c_.get());
7551 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7552 if (!c->exists)
7553 return -ENOENT;
7554 RWLock::RLocker l(c->lock);
7555 int r = 0;
7556 OnodeRef o = c->get_onode(oid, false);
7557 if (!o || !o->exists) {
7558 r = -ENOENT;
7559 goto out;
7560 }
7561 if (!o->onode.has_omap())
7562 goto out;
7563 o->flush();
7564 {
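// All omap rows for this onode share a keyspace prefixed by its nid: the
// header key sorts first and the tail key bounds the range, so a single
// forward scan from head to tail yields the header (if any) followed by
// every user key/value pair.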
7565 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7566 string head, tail;
7567 get_omap_header(o->onode.nid, &head);
7568 get_omap_tail(o->onode.nid, &tail);
7569 it->lower_bound(head);
7570 while (it->valid()) {
7571 if (it->key() == head) {
7572 dout(30) << __func__ << " got header" << dendl;
7573 *header = it->value();
7574 } else if (it->key() >= tail) {
7575 dout(30) << __func__ << " reached tail" << dendl;
7576 break;
7577 } else {
7578 string user_key;
7579 decode_omap_key(it->key(), &user_key);
7580 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7581 << " -> " << user_key << dendl;
7582 (*out)[user_key] = it->value();
7583 }
7584 it->next();
7585 }
7586 }
7587 out:
7588 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7589 << dendl;
7590 return r;
7591 }
7592
7593 int BlueStore::omap_get_header(
7594 const coll_t& cid, ///< [in] Collection containing oid
7595 const ghobject_t &oid, ///< [in] Object containing omap
7596 bufferlist *header, ///< [out] omap header
7597 bool allow_eio ///< [in] don't assert on eio
7598 )
7599 {
7600 CollectionHandle c = _get_collection(cid);
7601 if (!c)
7602 return -ENOENT;
7603 return omap_get_header(c, oid, header, allow_eio);
7604 }
7605
7606 int BlueStore::omap_get_header(
7607 CollectionHandle &c_, ///< [in] Collection containing oid
7608 const ghobject_t &oid, ///< [in] Object containing omap
7609 bufferlist *header, ///< [out] omap header
7610 bool allow_eio ///< [in] don't assert on eio
7611 )
7612 {
7613 Collection *c = static_cast<Collection *>(c_.get());
7614 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7615 if (!c->exists)
7616 return -ENOENT;
7617 RWLock::RLocker l(c->lock);
7618 int r = 0;
7619 OnodeRef o = c->get_onode(oid, false);
7620 if (!o || !o->exists) {
7621 r = -ENOENT;
7622 goto out;
7623 }
7624 if (!o->onode.has_omap())
7625 goto out;
7626 o->flush();
7627 {
7628 string head;
7629 get_omap_header(o->onode.nid, &head);
7630 if (db->get(PREFIX_OMAP, head, header) >= 0) {
7631 dout(30) << __func__ << " got header" << dendl;
7632 } else {
7633 dout(30) << __func__ << " no header" << dendl;
7634 }
7635 }
7636 out:
7637 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7638 << dendl;
7639 return r;
7640 }
7641
7642 int BlueStore::omap_get_keys(
7643 const coll_t& cid, ///< [in] Collection containing oid
7644 const ghobject_t &oid, ///< [in] Object containing omap
7645 set<string> *keys ///< [out] Keys defined on oid
7646 )
7647 {
7648 CollectionHandle c = _get_collection(cid);
7649 if (!c)
7650 return -ENOENT;
7651 return omap_get_keys(c, oid, keys);
7652 }
7653
7654 int BlueStore::omap_get_keys(
7655 CollectionHandle &c_, ///< [in] Collection containing oid
7656 const ghobject_t &oid, ///< [in] Object containing omap
7657 set<string> *keys ///< [out] Keys defined on oid
7658 )
7659 {
7660 Collection *c = static_cast<Collection *>(c_.get());
7661 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7662 if (!c->exists)
7663 return -ENOENT;
7664 RWLock::RLocker l(c->lock);
7665 int r = 0;
7666 OnodeRef o = c->get_onode(oid, false);
7667 if (!o || !o->exists) {
7668 r = -ENOENT;
7669 goto out;
7670 }
7671 if (!o->onode.has_omap())
7672 goto out;
7673 o->flush();
7674 {
7675 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7676 string head, tail;
7677 get_omap_key(o->onode.nid, string(), &head);
7678 get_omap_tail(o->onode.nid, &tail);
7679 it->lower_bound(head);
7680 while (it->valid()) {
7681 if (it->key() >= tail) {
7682 dout(30) << __func__ << " reached tail" << dendl;
7683 break;
7684 }
7685 string user_key;
7686 decode_omap_key(it->key(), &user_key);
7687 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7688 << " -> " << user_key << dendl;
7689 keys->insert(user_key);
7690 it->next();
7691 }
7692 }
7693 out:
7694 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7695 << dendl;
7696 return r;
7697 }
7698
7699 int BlueStore::omap_get_values(
7700 const coll_t& cid, ///< [in] Collection containing oid
7701 const ghobject_t &oid, ///< [in] Object containing omap
7702 const set<string> &keys, ///< [in] Keys to get
7703 map<string, bufferlist> *out ///< [out] Returned keys and values
7704 )
7705 {
7706 CollectionHandle c = _get_collection(cid);
7707 if (!c)
7708 return -ENOENT;
7709 return omap_get_values(c, oid, keys, out);
7710 }
7711
7712 int BlueStore::omap_get_values(
7713 CollectionHandle &c_, ///< [in] Collection containing oid
7714 const ghobject_t &oid, ///< [in] Object containing omap
7715 const set<string> &keys, ///< [in] Keys to get
7716 map<string, bufferlist> *out ///< [out] Returned keys and values
7717 )
7718 {
7719 Collection *c = static_cast<Collection *>(c_.get());
7720 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7721 if (!c->exists)
7722 return -ENOENT;
7723 RWLock::RLocker l(c->lock);
7724 int r = 0;
7725 string final_key;
7726 OnodeRef o = c->get_onode(oid, false);
7727 if (!o || !o->exists) {
7728 r = -ENOENT;
7729 goto out;
7730 }
7731 if (!o->onode.has_omap())
7732 goto out;
7733 o->flush();
7734 _key_encode_u64(o->onode.nid, &final_key);
7735 final_key.push_back('.');
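// final_key now holds the 9-byte per-object prefix: the 8-byte encoded nid
// plus the '.' separator. resize(9) below trims back to this prefix before
// appending each requested user key.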
7736 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7737 final_key.resize(9); // keep prefix
7738 final_key += *p;
7739 bufferlist val;
7740 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7741 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
7742 << " -> " << *p << dendl;
7743 out->insert(make_pair(*p, val));
7744 }
7745 }
7746 out:
7747 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7748 << dendl;
7749 return r;
7750 }
7751
7752 int BlueStore::omap_check_keys(
7753 const coll_t& cid, ///< [in] Collection containing oid
7754 const ghobject_t &oid, ///< [in] Object containing omap
7755 const set<string> &keys, ///< [in] Keys to check
7756 set<string> *out ///< [out] Subset of keys defined on oid
7757 )
7758 {
7759 CollectionHandle c = _get_collection(cid);
7760 if (!c)
7761 return -ENOENT;
7762 return omap_check_keys(c, oid, keys, out);
7763 }
7764
7765 int BlueStore::omap_check_keys(
7766 CollectionHandle &c_, ///< [in] Collection containing oid
7767 const ghobject_t &oid, ///< [in] Object containing omap
7768 const set<string> &keys, ///< [in] Keys to check
7769 set<string> *out ///< [out] Subset of keys defined on oid
7770 )
7771 {
7772 Collection *c = static_cast<Collection *>(c_.get());
7773 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7774 if (!c->exists)
7775 return -ENOENT;
7776 RWLock::RLocker l(c->lock);
7777 int r = 0;
7778 string final_key;
7779 OnodeRef o = c->get_onode(oid, false);
7780 if (!o || !o->exists) {
7781 r = -ENOENT;
7782 goto out;
7783 }
7784 if (!o->onode.has_omap())
7785 goto out;
7786 o->flush();
7787 _key_encode_u64(o->onode.nid, &final_key);
7788 final_key.push_back('.');
7789 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7790 final_key.resize(9); // keep prefix
7791 final_key += *p;
7792 bufferlist val;
7793 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7794 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
7795 << " -> " << *p << dendl;
7796 out->insert(*p);
7797 } else {
7798 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
7799 << " -> " << *p << dendl;
7800 }
7801 }
7802 out:
7803 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7804 << dendl;
7805 return r;
7806 }
7807
7808 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7809 const coll_t& cid, ///< [in] collection
7810 const ghobject_t &oid ///< [in] object
7811 )
7812 {
7813 CollectionHandle c = _get_collection(cid);
7814 if (!c) {
7815 dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
7816 return ObjectMap::ObjectMapIterator();
7817 }
7818 return get_omap_iterator(c, oid);
7819 }
7820
7821 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7822 CollectionHandle &c_, ///< [in] collection
7823 const ghobject_t &oid ///< [in] object
7824 )
7825 {
7826 Collection *c = static_cast<Collection *>(c_.get());
7827 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
7828 if (!c->exists) {
7829 return ObjectMap::ObjectMapIterator();
7830 }
7831 RWLock::RLocker l(c->lock);
7832 OnodeRef o = c->get_onode(oid, false);
7833 if (!o || !o->exists) {
7834 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
7835 return ObjectMap::ObjectMapIterator();
7836 }
7837 o->flush();
7838 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
7839 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7840 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
7841 }
7842
7843 // -----------------
7844 // write helpers
7845
7846 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
7847 {
7848 dout(10) << __func__ << " ondisk_format " << ondisk_format
7849 << " min_compat_ondisk_format " << min_compat_ondisk_format
7850 << dendl;
7851 assert(ondisk_format == latest_ondisk_format);
7852 {
7853 bufferlist bl;
7854 ::encode(ondisk_format, bl);
7855 t->set(PREFIX_SUPER, "ondisk_format", bl);
7856 }
7857 {
7858 bufferlist bl;
7859 ::encode(min_compat_ondisk_format, bl);
7860 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
7861 }
7862 }
7863
7864 int BlueStore::_open_super_meta()
7865 {
7866 // nid
7867 {
7868 nid_max = 0;
7869 bufferlist bl;
7870 db->get(PREFIX_SUPER, "nid_max", &bl);
7871 bufferlist::iterator p = bl.begin();
7872 try {
7873 uint64_t v;
7874 ::decode(v, p);
7875 nid_max = v;
7876 } catch (buffer::error& e) {
7877 derr << __func__ << " unable to read nid_max" << dendl;
7878 return -EIO;
7879 }
7880 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
7881 nid_last = nid_max.load();
7882 }
7883
7884 // blobid
7885 {
7886 blobid_max = 0;
7887 bufferlist bl;
7888 db->get(PREFIX_SUPER, "blobid_max", &bl);
7889 bufferlist::iterator p = bl.begin();
7890 try {
7891 uint64_t v;
7892 ::decode(v, p);
7893 blobid_max = v;
7894 } catch (buffer::error& e) {
7895 derr << __func__ << " unable to read blobid_max" << dendl;
7896 return -EIO;
7897 }
7898 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
7899 blobid_last = blobid_max.load();
7900 }
7901
7902 // freelist
7903 {
7904 bufferlist bl;
7905 db->get(PREFIX_SUPER, "freelist_type", &bl);
7906 if (bl.length()) {
7907 freelist_type = std::string(bl.c_str(), bl.length());
7908 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
7909 } else {
7910 assert("extent freelist manager is not supported" == 0);
7911 }
7912 }
7913
7914 // bluefs alloc
7915 if (cct->_conf->bluestore_bluefs) {
7916 bluefs_extents.clear();
7917 bufferlist bl;
7918 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7919 bufferlist::iterator p = bl.begin();
7920 try {
7921 ::decode(bluefs_extents, p);
7922 }
7923 catch (buffer::error& e) {
7924 derr << __func__ << " unable to read bluefs_extents" << dendl;
7925 return -EIO;
7926 }
7927 dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
7928 << std::dec << dendl;
7929 }
7930
7931 // ondisk format
7932 int32_t compat_ondisk_format = 0;
7933 {
7934 bufferlist bl;
7935 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
7936 if (r < 0) {
7937 // base case: kraken bluestore is v1 and readable by v1
7938 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
7939 << dendl;
7940 ondisk_format = 1;
7941 compat_ondisk_format = 1;
7942 } else {
7943 auto p = bl.begin();
7944 try {
7945 ::decode(ondisk_format, p);
7946 } catch (buffer::error& e) {
7947 derr << __func__ << " unable to read ondisk_format" << dendl;
7948 return -EIO;
7949 }
7950 bl.clear();
7951 {
7952 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
7953 assert(!r);
7954 auto p = bl.begin();
7955 try {
7956 ::decode(compat_ondisk_format, p);
7957 } catch (buffer::error& e) {
7958 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
7959 return -EIO;
7960 }
7961 }
7962 }
7963 dout(10) << __func__ << " ondisk_format " << ondisk_format
7964 << " compat_ondisk_format " << compat_ondisk_format
7965 << dendl;
7966 }
7967
7968 if (latest_ondisk_format < compat_ondisk_format) {
7969 derr << __func__ << " compat_ondisk_format is "
7970 << compat_ondisk_format << " but we only understand version "
7971 << latest_ondisk_format << dendl;
7972 return -EPERM;
7973 }
7974 if (ondisk_format < latest_ondisk_format) {
7975 int r = _upgrade_super();
7976 if (r < 0) {
7977 return r;
7978 }
7979 }
7980
7981 {
7982 bufferlist bl;
7983 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
7984 auto p = bl.begin();
7985 try {
7986 uint64_t val;
7987 ::decode(val, p);
7988 min_alloc_size = val;
7989 min_alloc_size_order = ctz(val);
7990 assert(min_alloc_size == 1u << min_alloc_size_order);
7991 } catch (buffer::error& e) {
7992 derr << __func__ << " unable to read min_alloc_size" << dendl;
7993 return -EIO;
7994 }
7995 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7996 << std::dec << dendl;
7997 }
7998 _open_statfs();
7999 _set_alloc_sizes();
8000 _set_throttle_params();
8001
8002 _set_csum();
8003 _set_compression();
8004 _set_blob_size();
8005
8006 _set_finisher_num();
8007
8008 return 0;
8009 }
8010
8011 int BlueStore::_upgrade_super()
8012 {
8013 dout(1) << __func__ << " from " << ondisk_format << ", latest "
8014 << latest_ondisk_format << dendl;
8015 assert(ondisk_format > 0);
8016 assert(ondisk_format < latest_ondisk_format);
8017
8018 if (ondisk_format == 1) {
8019 // changes:
8020 // - super: added ondisk_format
8021 // - super: added min_readable_ondisk_format
8022 // - super: added min_compat_ondisk_format
8023 // - super: added min_alloc_size
8024 // - super: removed min_min_alloc_size
8025 KeyValueDB::Transaction t = db->get_transaction();
8026 {
8027 bufferlist bl;
8028 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
8029 auto p = bl.begin();
8030 try {
8031 uint64_t val;
8032 ::decode(val, p);
8033 min_alloc_size = val;
8034 } catch (buffer::error& e) {
8035 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
8036 return -EIO;
8037 }
8038 t->set(PREFIX_SUPER, "min_alloc_size", bl);
8039 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
8040 }
8041 ondisk_format = 2;
8042 _prepare_ondisk_format_super(t);
8043 int r = db->submit_transaction_sync(t);
8044 assert(r == 0);
8045 }
8046
8047 // done
8048 dout(1) << __func__ << " done" << dendl;
8049 return 0;
8050 }
8051
8052 void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
8053 {
8054 if (o->onode.nid) {
8055 assert(o->exists);
8056 return;
8057 }
8058 uint64_t nid = ++nid_last;
8059 dout(20) << __func__ << " " << nid << dendl;
8060 o->onode.nid = nid;
8061 txc->last_nid = nid;
8062 o->exists = true;
8063 }
8064
8065 uint64_t BlueStore::_assign_blobid(TransContext *txc)
8066 {
8067 uint64_t bid = ++blobid_last;
8068 dout(20) << __func__ << " " << bid << dendl;
8069 txc->last_blobid = bid;
8070 return bid;
8071 }
8072
8073 void BlueStore::get_db_statistics(Formatter *f)
8074 {
8075 db->get_statistics(f);
8076 }
8077
8078 BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
8079 {
8080 TransContext *txc = new TransContext(cct, osr);
8081 txc->t = db->get_transaction();
8082 osr->queue_new(txc);
8083 dout(20) << __func__ << " osr " << osr << " = " << txc
8084 << " seq " << txc->seq << dendl;
8085 return txc;
8086 }
8087
8088 void BlueStore::_txc_calc_cost(TransContext *txc)
8089 {
8090 // this is about the simplest model for transaction cost you can
8091 // imagine. there is some fixed overhead cost, since we assume a
8092 // minimum of one "io", plus a configurable cost per "io" (with
8093 // different hdd and ssd defaults), and we add that to the bytes
8094 // value.
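// e.g. (hypothetical numbers): a txc whose pending aios carry 3 iovs, plus
// the one implicit kv-commit io, gives ios = 4; with a per-io cost of 4000
// and 8192 bytes written, cost = 4 * 4000 + 8192 = 24192.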
8095 int ios = 1; // one "io" for the kv commit
8096 for (auto& p : txc->ioc.pending_aios) {
8097 ios += p.iov.size();
8098 }
8099 auto cost = throttle_cost_per_io.load();
8100 txc->cost = ios * cost + txc->bytes;
8101 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
8102 << ios << " ios * " << cost << " + " << txc->bytes
8103 << " bytes)" << dendl;
8104 }
8105
8106 void BlueStore::_txc_update_store_statfs(TransContext *txc)
8107 {
8108 if (txc->statfs_delta.is_empty())
8109 return;
8110
8111 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
8112 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
8113 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
8114 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
8115 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
8116
8117 {
8118 std::lock_guard<std::mutex> l(vstatfs_lock);
8119 vstatfs += txc->statfs_delta;
8120 }
8121
8122 bufferlist bl;
8123 txc->statfs_delta.encode(bl);
8124
8125 txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
8126 txc->statfs_delta.reset();
8127 }
8128
8129 void BlueStore::_txc_state_proc(TransContext *txc)
8130 {
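// Drive the txc through its lifecycle:
//   PREPARE -> [AIO_WAIT ->] IO_DONE -> KV_QUEUED -> KV_SUBMITTED ->
//   KV_DONE -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP ->] FINISHING -> DONE.
// Several cases return instead of looping because the next transition is
// driven elsewhere (aio completion, the kv sync/finalize threads, or
// deferred write completion).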
8131 while (true) {
8132 dout(10) << __func__ << " txc " << txc
8133 << " " << txc->get_state_name() << dendl;
8134 switch (txc->state) {
8135 case TransContext::STATE_PREPARE:
8136 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
8137 if (txc->ioc.has_pending_aios()) {
8138 txc->state = TransContext::STATE_AIO_WAIT;
8139 txc->had_ios = true;
8140 _txc_aio_submit(txc);
8141 return;
8142 }
8143 // ** fall-thru **
8144
8145 case TransContext::STATE_AIO_WAIT:
8146 txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
8147 _txc_finish_io(txc); // may trigger blocked txc's too
8148 return;
8149
8150 case TransContext::STATE_IO_DONE:
8151 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
8152 if (txc->had_ios) {
8153 ++txc->osr->txc_with_unstable_io;
8154 }
8155 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
8156 txc->state = TransContext::STATE_KV_QUEUED;
8157 if (cct->_conf->bluestore_sync_submit_transaction) {
8158 if (txc->last_nid >= nid_max ||
8159 txc->last_blobid >= blobid_max) {
8160 dout(20) << __func__
8161 << " last_{nid,blobid} exceeds max, submit via kv thread"
8162 << dendl;
8163 } else if (txc->osr->kv_committing_serially) {
8164 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
8165 << dendl;
8166 // note: this is starvation-prone. once we have a txc in a busy
8167 // sequencer that is committing serially it is possible to keep
8168 // submitting new transactions fast enough that we get stuck doing
8169 // so. the alternative is to block here... fixme?
8170 } else if (txc->osr->txc_with_unstable_io) {
8171 dout(20) << __func__ << " prior txc(s) with unstable ios "
8172 << txc->osr->txc_with_unstable_io.load() << dendl;
8173 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
8174 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
8175 == 0) {
8176 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
8177 << dendl;
8178 } else {
8179 txc->state = TransContext::STATE_KV_SUBMITTED;
8180 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8181 assert(r == 0);
8182 _txc_applied_kv(txc);
8183 }
8184 }
8185 {
8186 std::lock_guard<std::mutex> l(kv_lock);
8187 kv_queue.push_back(txc);
8188 kv_cond.notify_one();
8189 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
8190 kv_queue_unsubmitted.push_back(txc);
8191 ++txc->osr->kv_committing_serially;
8192 }
8193 if (txc->had_ios)
8194 kv_ios++;
8195 kv_throttle_costs += txc->cost;
8196 }
8197 return;
8198 case TransContext::STATE_KV_SUBMITTED:
8199 _txc_committed_kv(txc);
8200 // ** fall-thru **
8201
8202 case TransContext::STATE_KV_DONE:
8203 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
8204 if (txc->deferred_txn) {
8205 txc->state = TransContext::STATE_DEFERRED_QUEUED;
8206 _deferred_queue(txc);
8207 return;
8208 }
8209 txc->state = TransContext::STATE_FINISHING;
8210 break;
8211
8212 case TransContext::STATE_DEFERRED_CLEANUP:
8213 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
8214 txc->state = TransContext::STATE_FINISHING;
8215 // ** fall-thru **
8216
8217 case TransContext::STATE_FINISHING:
8218 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
8219 _txc_finish(txc);
8220 return;
8221
8222 default:
8223 derr << __func__ << " unexpected txc " << txc
8224 << " state " << txc->get_state_name() << dendl;
8225 assert(0 == "unexpected txc state");
8226 return;
8227 }
8228 }
8229 }
8230
8231 void BlueStore::_txc_finish_io(TransContext *txc)
8232 {
8233 dout(20) << __func__ << " " << txc << dendl;
8234
8235 /*
8236 * we need to preserve the order of kv transactions,
8237 * even though aio will complete in any order.
8238 */
8239
8240 OpSequencer *osr = txc->osr.get();
8241 std::lock_guard<std::mutex> l(osr->qlock);
8242 txc->state = TransContext::STATE_IO_DONE;
8243
8244 // release aio contexts (including pinned buffers).
8245 txc->ioc.running_aios.clear();
8246
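// scan backwards: if any earlier txc on this sequencer has not finished its
// io we must wait for it; otherwise advance to the first consecutive txc in
// STATE_IO_DONE and kick each of them forward in queue order.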
8247 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
8248 while (p != osr->q.begin()) {
8249 --p;
8250 if (p->state < TransContext::STATE_IO_DONE) {
8251 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
8252 << p->get_state_name() << dendl;
8253 return;
8254 }
8255 if (p->state > TransContext::STATE_IO_DONE) {
8256 ++p;
8257 break;
8258 }
8259 }
8260 do {
8261 _txc_state_proc(&*p++);
8262 } while (p != osr->q.end() &&
8263 p->state == TransContext::STATE_IO_DONE);
8264
8265 if (osr->kv_submitted_waiters &&
8266 osr->_is_all_kv_submitted()) {
8267 osr->qcond.notify_all();
8268 }
8269 }
8270
8271 void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
8272 {
8273 dout(20) << __func__ << " txc " << txc
8274 << " onodes " << txc->onodes
8275 << " shared_blobs " << txc->shared_blobs
8276 << dendl;
8277
8278 // finalize onodes
8279 for (auto o : txc->onodes) {
8280 // finalize extent_map shards
8281 o->extent_map.update(t, false);
8282 if (o->extent_map.needs_reshard()) {
8283 o->extent_map.reshard(db, t);
8284 o->extent_map.update(t, true);
8285 if (o->extent_map.needs_reshard()) {
8286 dout(20) << __func__ << " warning: still wants reshard, check options?"
8287 << dendl;
8288 o->extent_map.clear_needs_reshard();
8289 }
8290 logger->inc(l_bluestore_onode_reshard);
8291 }
8292
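// Two-pass encode: first compute an upper bound on the encoded size, then
// encode into a contiguous appender reserved to that bound.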
8293 // bound encode
8294 size_t bound = 0;
8295 denc(o->onode, bound);
8296 o->extent_map.bound_encode_spanning_blobs(bound);
8297 if (o->onode.extent_map_shards.empty()) {
8298 denc(o->extent_map.inline_bl, bound);
8299 }
8300
8301 // encode
8302 bufferlist bl;
8303 unsigned onode_part, blob_part, extent_part;
8304 {
8305 auto p = bl.get_contiguous_appender(bound, true);
8306 denc(o->onode, p);
8307 onode_part = p.get_logical_offset();
8308 o->extent_map.encode_spanning_blobs(p);
8309 blob_part = p.get_logical_offset() - onode_part;
8310 if (o->onode.extent_map_shards.empty()) {
8311 denc(o->extent_map.inline_bl, p);
8312 }
8313 extent_part = p.get_logical_offset() - onode_part - blob_part;
8314 }
8315
8316 dout(20) << " onode " << o->oid << " is " << bl.length()
8317 << " (" << onode_part << " bytes onode + "
8318 << blob_part << " bytes spanning blobs + "
8319 << extent_part << " bytes inline extents)"
8320 << dendl;
8321 t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
8322 o->flushing_count++;
8323 }
8324
8325 // objects we modified but didn't affect the onode
8326 auto p = txc->modified_objects.begin();
8327 while (p != txc->modified_objects.end()) {
8328 if (txc->onodes.count(*p) == 0) {
8329 (*p)->flushing_count++;
8330 ++p;
8331 } else {
8332 // remove dups with onodes list to avoid problems in _txc_finish
8333 p = txc->modified_objects.erase(p);
8334 }
8335 }
8336
8337 // finalize shared_blobs
8338 for (auto sb : txc->shared_blobs) {
8339 string key;
8340 auto sbid = sb->get_sbid();
8341 get_shared_blob_key(sbid, &key);
8342 if (sb->persistent->empty()) {
8343 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
8344 << " is empty" << dendl;
8345 t->rmkey(PREFIX_SHARED_BLOB, key);
8346 } else {
8347 bufferlist bl;
8348 ::encode(*(sb->persistent), bl);
8349 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
8350 << " is " << bl.length() << " " << *sb << dendl;
8351 t->set(PREFIX_SHARED_BLOB, key, bl);
8352 }
8353 }
8354 }
8355
8356 void BlueStore::BSPerfTracker::update_from_perfcounters(
8357 PerfCounters &logger)
8358 {
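// note: both gauges are fed from l_bluestore_commit_lat; BlueStore has no
// separate journal/apply stage, so apply latency is reported as commit
// latency.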
8359 os_commit_latency.consume_next(
8360 logger.get_tavg_ms(
8361 l_bluestore_commit_lat));
8362 os_apply_latency.consume_next(
8363 logger.get_tavg_ms(
8364 l_bluestore_commit_lat));
8365 }
8366
8367 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
8368 {
8369 dout(20) << __func__ << " txc " << txc << std::hex
8370 << " allocated 0x" << txc->allocated
8371 << " released 0x" << txc->released
8372 << std::dec << dendl;
8373
8374 // We have to handle the case where we allocate *and* deallocate the
8375 // same region in this transaction. The freelist doesn't like that.
8376 // (Actually, the only thing that cares is the BitmapFreelistManager
8377 // debug check. But that's important.)
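// e.g. if this txn both allocated and released the (hypothetical) region
// 0x10000~0x1000, that extent is subtracted from both sets below so the
// freelist sees neither an allocate nor a release for it.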
8378 interval_set<uint64_t> tmp_allocated, tmp_released;
8379 interval_set<uint64_t> *pallocated = &txc->allocated;
8380 interval_set<uint64_t> *preleased = &txc->released;
8381 if (!txc->allocated.empty() && !txc->released.empty()) {
8382 interval_set<uint64_t> overlap;
8383 overlap.intersection_of(txc->allocated, txc->released);
8384 if (!overlap.empty()) {
8385 tmp_allocated = txc->allocated;
8386 tmp_allocated.subtract(overlap);
8387 tmp_released = txc->released;
8388 tmp_released.subtract(overlap);
8389 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
8390 << ", new allocated 0x" << tmp_allocated
8391 << " released 0x" << tmp_released << std::dec
8392 << dendl;
8393 pallocated = &tmp_allocated;
8394 preleased = &tmp_released;
8395 }
8396 }
8397
8398 // update freelist with non-overlap sets
8399 for (interval_set<uint64_t>::iterator p = pallocated->begin();
8400 p != pallocated->end();
8401 ++p) {
8402 fm->allocate(p.get_start(), p.get_len(), t);
8403 }
8404 for (interval_set<uint64_t>::iterator p = preleased->begin();
8405 p != preleased->end();
8406 ++p) {
8407 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
8408 << "~" << p.get_len() << std::dec << dendl;
8409 fm->release(p.get_start(), p.get_len(), t);
8410 }
8411
8412 _txc_update_store_statfs(txc);
8413 }
8414
8415 void BlueStore::_txc_applied_kv(TransContext *txc)
8416 {
8417 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
8418 for (auto& o : *ls) {
8419 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
8420 << dendl;
8421 if (--o->flushing_count == 0) {
8422 std::lock_guard<std::mutex> l(o->flush_lock);
8423 o->flush_cond.notify_all();
8424 }
8425 }
8426 }
8427 }
8428
8429 void BlueStore::_txc_committed_kv(TransContext *txc)
8430 {
8431 dout(20) << __func__ << " txc " << txc << dendl;
8432
8433 // warning: we're calling onreadable_sync inside the sequencer lock
8434 if (txc->onreadable_sync) {
8435 txc->onreadable_sync->complete(0);
8436 txc->onreadable_sync = NULL;
8437 }
8438 unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
8439 if (txc->oncommit) {
8440 logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
8441 finishers[n]->queue(txc->oncommit);
8442 txc->oncommit = NULL;
8443 }
8444 if (txc->onreadable) {
8445 finishers[n]->queue(txc->onreadable);
8446 txc->onreadable = NULL;
8447 }
8448
8449 {
8450 std::lock_guard<std::mutex> l(txc->osr->qlock);
8451 txc->state = TransContext::STATE_KV_DONE;
8452 if (!txc->oncommits.empty()) {
8453 finishers[n]->queue(txc->oncommits);
8454 }
8455 }
8456 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
8457 }
8458
8459 void BlueStore::_txc_finish(TransContext *txc)
8460 {
8461 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
8462 assert(txc->state == TransContext::STATE_FINISHING);
8463
8464 for (auto& sb : txc->shared_blobs_written) {
8465 sb->finish_write(txc->seq);
8466 }
8467 txc->shared_blobs_written.clear();
8468
8469 while (!txc->removed_collections.empty()) {
8470 _queue_reap_collection(txc->removed_collections.front());
8471 txc->removed_collections.pop_front();
8472 }
8473
8474 OpSequencerRef osr = txc->osr;
8475 bool empty = false;
8476 bool submit_deferred = false;
8477 OpSequencer::q_list_t releasing_txc;
8478 {
8479 std::lock_guard<std::mutex> l(osr->qlock);
8480 txc->state = TransContext::STATE_DONE;
8481 bool notify = false;
8482 while (!osr->q.empty()) {
8483 TransContext *txc = &osr->q.front();
8484 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
8485 << dendl;
8486 if (txc->state != TransContext::STATE_DONE) {
8487 if (txc->state == TransContext::STATE_PREPARE &&
8488 deferred_aggressive) {
8489 // for _osr_drain_preceding()
8490 notify = true;
8491 }
8492 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
8493 osr->q.size() > g_conf->bluestore_max_deferred_txc) {
8494 submit_deferred = true;
8495 }
8496 break;
8497 }
8498
8499 osr->q.pop_front();
8500 releasing_txc.push_back(*txc);
8501 notify = true;
8502 }
8503 if (notify) {
8504 osr->qcond.notify_all();
8505 }
8506 if (osr->q.empty()) {
8507 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
8508 empty = true;
8509 }
8510 }
8511 while (!releasing_txc.empty()) {
8512 // release to allocator only after all preceding txc's have also
8513 // finished any deferred writes that potentially land in these
8514 // blocks
8515 auto txc = &releasing_txc.front();
8516 _txc_release_alloc(txc);
8517 releasing_txc.pop_front();
8518 txc->log_state_latency(logger, l_bluestore_state_done_lat);
8519 delete txc;
8520 }
8521
8522 if (submit_deferred) {
8523 // we're pinning memory; flush! we could be more fine-grained here but
8524 // i'm not sure it's worth the bother.
8525 deferred_try_submit();
8526 }
8527
8528 if (empty && osr->zombie) {
8529 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
8530 osr->_unregister();
8531 }
8532 logger->set(l_bluestore_fragmentation,
8533 (uint64_t)(alloc->get_fragmentation(min_alloc_size) * 1000));
8534 }
8535
8536 void BlueStore::_txc_release_alloc(TransContext *txc)
8537 {
8538 interval_set<uint64_t> bulk_release_extents;
8539 // it's expected we're called with lazy_release_lock already taken!
8540 if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
8541 dout(10) << __func__ << " " << txc << " " << std::hex
8542 << txc->released << std::dec << dendl;
8543 // interval_set seems to be too costly for inserting things in
8544 // bstore_kv_final. We could serialize in simpler format and perform
8545 // the merge separately, maybe even in a dedicated thread.
8546 bulk_release_extents.insert(txc->released);
8547 }
8548
8549 alloc->release(bulk_release_extents);
8550 txc->allocated.clear();
8551 txc->released.clear();
8552 }
8553
8554 void BlueStore::_osr_drain_preceding(TransContext *txc)
8555 {
8556 OpSequencer *osr = txc->osr.get();
8557 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
8558 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
8559 {
8560 // submit anything pending
8561 deferred_lock.lock();
8562 if (osr->deferred_pending) {
8563 _deferred_submit_unlock(osr);
8564 } else {
8565 deferred_lock.unlock();
8566 }
8567 }
8568 {
8569 // wake up any previously finished deferred events
8570 std::lock_guard<std::mutex> l(kv_lock);
8571 kv_cond.notify_one();
8572 }
8573 osr->drain_preceding(txc);
8574 --deferred_aggressive;
8575 dout(10) << __func__ << " " << osr << " done" << dendl;
8576 }
8577
8578 void BlueStore::_osr_drain_all()
8579 {
8580 dout(10) << __func__ << dendl;
8581
8582 set<OpSequencerRef> s;
8583 {
8584 std::lock_guard<std::mutex> l(osr_lock);
8585 s = osr_set;
8586 }
8587 dout(20) << __func__ << " osr_set " << s << dendl;
8588
8589 ++deferred_aggressive;
8590 {
8591 // submit anything pending
8592 deferred_try_submit();
8593 }
8594 {
8595 // wake up any previously finished deferred events
8596 std::lock_guard<std::mutex> l(kv_lock);
8597 kv_cond.notify_one();
8598 }
8599 {
8600 std::lock_guard<std::mutex> l(kv_finalize_lock);
8601 kv_finalize_cond.notify_one();
8602 }
8603 for (auto osr : s) {
8604 dout(20) << __func__ << " drain " << osr << dendl;
8605 osr->drain();
8606 }
8607 --deferred_aggressive;
8608
8609 dout(10) << __func__ << " done" << dendl;
8610 }
8611
8612 void BlueStore::_osr_unregister_all()
8613 {
8614 set<OpSequencerRef> s;
8615 {
8616 std::lock_guard<std::mutex> l(osr_lock);
8617 s = osr_set;
8618 }
8619 dout(10) << __func__ << " " << s << dendl;
8620 for (auto osr : s) {
8621 osr->_unregister();
8622
8623 if (!osr->zombie) {
8624 // break link from Sequencer to us so that this OpSequencer
8625 // instance can die with this mount/umount cycle. note that
8626 // we assume umount() will not race against ~Sequencer.
8627 assert(osr->parent);
8628 osr->parent->p.reset();
8629 }
8630 }
8631 // nobody should be creating sequencers during umount either.
8632 {
8633 std::lock_guard<std::mutex> l(osr_lock);
8634 assert(osr_set.empty());
8635 }
8636 }
8637
8638 void BlueStore::_kv_start()
8639 {
8640 dout(10) << __func__ << dendl;
8641
8642 for (int i = 0; i < m_finisher_num; ++i) {
8643 ostringstream oss;
8644 oss << "finisher-" << i;
8645 Finisher *f = new Finisher(cct, oss.str(), "finisher");
8646 finishers.push_back(f);
8647 }
8648
8649 deferred_finisher.start();
8650 for (auto f : finishers) {
8651 f->start();
8652 }
8653 kv_sync_thread.create("bstore_kv_sync");
8654 kv_finalize_thread.create("bstore_kv_final");
8655 }
8656
8657 void BlueStore::_kv_stop()
8658 {
8659 dout(10) << __func__ << dendl;
8660 {
8661 std::unique_lock<std::mutex> l(kv_lock);
8662 while (!kv_sync_started) {
8663 kv_cond.wait(l);
8664 }
8665 kv_stop = true;
8666 kv_cond.notify_all();
8667 }
8668 {
8669 std::unique_lock<std::mutex> l(kv_finalize_lock);
8670 while (!kv_finalize_started) {
8671 kv_finalize_cond.wait(l);
8672 }
8673 kv_finalize_stop = true;
8674 kv_finalize_cond.notify_all();
8675 }
8676 kv_sync_thread.join();
8677 kv_finalize_thread.join();
8678 assert(removed_collections.empty());
8679 {
8680 std::lock_guard<std::mutex> l(kv_lock);
8681 kv_stop = false;
8682 }
8683 {
8684 std::lock_guard<std::mutex> l(kv_finalize_lock);
8685 kv_finalize_stop = false;
8686 }
8687 dout(10) << __func__ << " stopping finishers" << dendl;
8688 deferred_finisher.wait_for_empty();
8689 deferred_finisher.stop();
8690 for (auto f : finishers) {
8691 f->wait_for_empty();
8692 f->stop();
8693 }
8694 dout(10) << __func__ << " stopped" << dendl;
8695 }
8696
8697 void BlueStore::_kv_sync_thread()
8698 {
8699 dout(10) << __func__ << " start" << dendl;
8700 std::unique_lock<std::mutex> l(kv_lock);
8701 assert(!kv_sync_started);
8702 bool bluefs_do_check_balance = false;
8703 kv_sync_started = true;
8704 kv_cond.notify_all();
8705 while (true) {
8706 assert(kv_committing.empty());
8707 if (kv_queue.empty() &&
8708 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
8709 !deferred_aggressive) &&
8710 (bluefs_do_check_balance == false)) {
8711 if (kv_stop)
8712 break;
8713 dout(20) << __func__ << " sleep" << dendl;
8714 std::cv_status status = kv_cond.wait_for(l,
8715 std::chrono::milliseconds(int64_t(cct->_conf->bluestore_bluefs_balance_interval * 1000)));
8716 dout(20) << __func__ << " wake" << dendl;
8717 if (status == std::cv_status::timeout) {
8718 bluefs_do_check_balance = true;
8719 }
8720 } else {
8721 deque<TransContext*> kv_submitting;
8722 deque<DeferredBatch*> deferred_done, deferred_stable;
8723 uint64_t aios = 0, costs = 0;
8724
8725 dout(20) << __func__ << " committing " << kv_queue.size()
8726 << " submitting " << kv_queue_unsubmitted.size()
8727 << " deferred done " << deferred_done_queue.size()
8728 << " stable " << deferred_stable_queue.size()
8729 << dendl;
8730 kv_committing.swap(kv_queue);
8731 kv_submitting.swap(kv_queue_unsubmitted);
8732 deferred_done.swap(deferred_done_queue);
8733 deferred_stable.swap(deferred_stable_queue);
8734 aios = kv_ios;
8735 costs = kv_throttle_costs;
8736 kv_ios = 0;
8737 kv_throttle_costs = 0;
8738 utime_t start = ceph_clock_now();
8739 l.unlock();
8740
8741 dout(30) << __func__ << " committing " << kv_committing << dendl;
8742 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
8743 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
8744 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
8745
8746 bool force_flush = false;
8747 // if bluefs is sharing the same device as data (only), then we
8748 // can rely on the bluefs commit to flush the device and make
8749 // deferred aios stable. that means that if we do have finished deferred
8750 // txcs AND we are not on a single shared device, we need to force a flush.
8751 if (bluefs_single_shared_device && bluefs) {
8752 if (aios) {
8753 force_flush = true;
8754 } else if (kv_committing.empty() && kv_submitting.empty() &&
8755 deferred_stable.empty()) {
8756 force_flush = true; // there's nothing else to commit!
8757 } else if (deferred_aggressive) {
8758 force_flush = true;
8759 }
8760 } else
8761 force_flush = true;
8762
8763 if (force_flush) {
8764 dout(20) << __func__ << " num_aios=" << aios
8765 << " force_flush=" << (int)force_flush
8766 << ", flushing, deferred done->stable" << dendl;
8767 // flush/barrier on block device
8768 bdev->flush();
8769
8770 // if we flush then deferred done are now deferred stable
8771 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
8772 deferred_done.end());
8773 deferred_done.clear();
8774 }
8775 utime_t after_flush = ceph_clock_now();
8776
8777 // we will use one final transaction to force a sync
8778 KeyValueDB::Transaction synct = db->get_transaction();
8779
8780 // increase {nid,blobid}_max? note that this covers both the
8781 // case where we are approaching the max and the case we passed
8782 // it. in either case, we increase the max in the earlier txn
8783 // we submit.
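// e.g. if bluestore_nid_prealloc were 1024 (illustrative value only), then
// once nid_last climbs past nid_max - 512 the earliest transaction in this
// batch would bump nid_max to nid_last + 1024.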
8784 uint64_t new_nid_max = 0, new_blobid_max = 0;
8785 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
8786 KeyValueDB::Transaction t =
8787 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8788 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
8789 bufferlist bl;
8790 ::encode(new_nid_max, bl);
8791 t->set(PREFIX_SUPER, "nid_max", bl);
8792 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
8793 }
8794 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
8795 KeyValueDB::Transaction t =
8796 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8797 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
8798 bufferlist bl;
8799 ::encode(new_blobid_max, bl);
8800 t->set(PREFIX_SUPER, "blobid_max", bl);
8801 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
8802 }
8803
8804 for (auto txc : kv_committing) {
8805 if (txc->state == TransContext::STATE_KV_QUEUED) {
8806 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8807 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8808 assert(r == 0);
8809 _txc_applied_kv(txc);
8810 --txc->osr->kv_committing_serially;
8811 txc->state = TransContext::STATE_KV_SUBMITTED;
8812 if (txc->osr->kv_submitted_waiters) {
8813 std::lock_guard<std::mutex> l(txc->osr->qlock);
8814 if (txc->osr->_is_all_kv_submitted()) {
8815 txc->osr->qcond.notify_all();
8816 }
8817 }
8818
8819 } else {
8820 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8821 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8822 }
8823 if (txc->had_ios) {
8824 --txc->osr->txc_with_unstable_io;
8825 }
8826 }
8827
8828 // release throttle *before* we commit. this allows new ops
8829 // to be prepared and enter pipeline while we are waiting on
8830 // the kv commit sync/flush. then hopefully on the next
8831 // iteration there will already be ops awake. otherwise, we
8832 // end up going to sleep, and then wake up when the very first
8833 // transaction is ready for commit.
8834 throttle_bytes.put(costs);
8835
8836 PExtentVector bluefs_gift_extents;
8837 if (bluefs &&
8838 after_flush - bluefs_last_balance >
8839 cct->_conf->bluestore_bluefs_balance_interval) {
8840 bluefs_last_balance = after_flush;
8841 int r = _balance_bluefs_freespace(&bluefs_gift_extents);
8842 assert(r >= 0);
8843 if (r > 0) {
8844 for (auto& p : bluefs_gift_extents) {
8845 bluefs_extents.insert(p.offset, p.length);
8846 }
8847 bufferlist bl;
8848 ::encode(bluefs_extents, bl);
8849 dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
8850 << bluefs_extents << std::dec << dendl;
8851 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
8852 }
8853 }
8854 bluefs_do_check_balance = false;
8855
8856 // cleanup sync deferred keys
8857 for (auto b : deferred_stable) {
8858 for (auto& txc : b->txcs) {
8859 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
8860 if (!wt.released.empty()) {
8861 // kraken replay compat only
8862 txc.released = wt.released;
8863 dout(10) << __func__ << " deferred txn has released "
8864 << txc.released
8865 << " (we just upgraded from kraken) on " << &txc << dendl;
8866 _txc_finalize_kv(&txc, synct);
8867 }
8868 // cleanup the deferred
8869 string key;
8870 get_deferred_key(wt.seq, &key);
8871 synct->rm_single_key(PREFIX_DEFERRED, key);
8872 }
8873 }
8874
8875 // submit synct synchronously (block and wait for it to commit)
8876 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
8877 assert(r == 0);
8878
8879 if (new_nid_max) {
8880 nid_max = new_nid_max;
8881 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
8882 }
8883 if (new_blobid_max) {
8884 blobid_max = new_blobid_max;
8885 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
8886 }
8887
8888 {
8889 utime_t finish = ceph_clock_now();
8890 utime_t dur_flush = after_flush - start;
8891 utime_t dur_kv = finish - after_flush;
8892 utime_t dur = finish - start;
8893 dout(20) << __func__ << " committed " << kv_committing.size()
8894 << " cleaned " << deferred_stable.size()
8895 << " in " << dur
8896 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
8897 << dendl;
8898 logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
8899 logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
8900 logger->tinc(l_bluestore_kv_lat, dur);
8901 }
8902
8903 if (bluefs) {
8904 if (!bluefs_gift_extents.empty()) {
8905 _commit_bluefs_freespace(bluefs_gift_extents);
8906 }
8907 dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
8908 << bluefs_extents_reclaiming << std::dec << dendl;
8909 alloc->release(bluefs_extents_reclaiming);
8910 bluefs_extents_reclaiming.clear();
8911 }
8912
8913 {
8914 std::unique_lock<std::mutex> m(kv_finalize_lock);
8915 if (kv_committing_to_finalize.empty()) {
8916 kv_committing_to_finalize.swap(kv_committing);
8917 } else {
8918 kv_committing_to_finalize.insert(
8919 kv_committing_to_finalize.end(),
8920 kv_committing.begin(),
8921 kv_committing.end());
8922 kv_committing.clear();
8923 }
8924 if (deferred_stable_to_finalize.empty()) {
8925 deferred_stable_to_finalize.swap(deferred_stable);
8926 } else {
8927 deferred_stable_to_finalize.insert(
8928 deferred_stable_to_finalize.end(),
8929 deferred_stable.begin(),
8930 deferred_stable.end());
8931 deferred_stable.clear();
8932 }
8933 kv_finalize_cond.notify_one();
8934 }
8935
8936 l.lock();
8937 // previously deferred "done" are now "stable" by virtue of this
8938 // commit cycle.
8939 deferred_stable_queue.swap(deferred_done);
8940 }
8941 }
8942 dout(10) << __func__ << " finish" << dendl;
8943 kv_sync_started = false;
8944 }
8945
8946 void BlueStore::_kv_finalize_thread()
8947 {
8948 deque<TransContext*> kv_committed;
8949 deque<DeferredBatch*> deferred_stable;
8950 dout(10) << __func__ << " start" << dendl;
8951 std::unique_lock<std::mutex> l(kv_finalize_lock);
8952 assert(!kv_finalize_started);
8953 kv_finalize_started = true;
8954 kv_finalize_cond.notify_all();
8955 while (true) {
8956 assert(kv_committed.empty());
8957 assert(deferred_stable.empty());
8958 if (kv_committing_to_finalize.empty() &&
8959 deferred_stable_to_finalize.empty()) {
8960 if (kv_finalize_stop)
8961 break;
8962 dout(20) << __func__ << " sleep" << dendl;
8963 kv_finalize_cond.wait(l);
8964 dout(20) << __func__ << " wake" << dendl;
8965 } else {
8966 kv_committed.swap(kv_committing_to_finalize);
8967 deferred_stable.swap(deferred_stable_to_finalize);
8968 l.unlock();
8969 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
8970 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
8971
8972 while (!kv_committed.empty()) {
8973 TransContext *txc = kv_committed.front();
8974 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8975 _txc_state_proc(txc);
8976 kv_committed.pop_front();
8977 }
8978
8979 for (auto b : deferred_stable) {
8980 auto p = b->txcs.begin();
8981 while (p != b->txcs.end()) {
8982 TransContext *txc = &*p;
8983 p = b->txcs.erase(p); // unlink here because
8984 _txc_state_proc(txc); // this may destroy txc
8985 }
8986 delete b;
8987 }
8988 deferred_stable.clear();
8989
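// opportunistically kick pending deferred writes once enough ops have
// batched up or the deferred-write throttle is past its midpoint.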
8990 if (!deferred_aggressive) {
8991 if (deferred_queue_size >= deferred_batch_ops.load() ||
8992 throttle_deferred_bytes.past_midpoint()) {
8993 deferred_try_submit();
8994 }
8995 }
8996
8997 // this is as good a place as any ...
8998 _reap_collections();
8999
9000 l.lock();
9001 }
9002 }
9003 dout(10) << __func__ << " finish" << dendl;
9004 kv_finalize_started = false;
9005 }
9006
9007 bluestore_deferred_op_t *BlueStore::_get_deferred_op(
9008 TransContext *txc, OnodeRef o)
9009 {
9010 if (!txc->deferred_txn) {
9011 txc->deferred_txn = new bluestore_deferred_transaction_t;
9012 }
9013 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
9014 return &txc->deferred_txn->ops.back();
9015 }
9016
9017 void BlueStore::_deferred_queue(TransContext *txc)
9018 {
9019 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
9020 deferred_lock.lock();
9021 if (!txc->osr->deferred_pending &&
9022 !txc->osr->deferred_running) {
9023 deferred_queue.push_back(*txc->osr);
9024 }
9025 if (!txc->osr->deferred_pending) {
9026 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
9027 }
9028 ++deferred_queue_size;
9029 txc->osr->deferred_pending->txcs.push_back(*txc);
9030 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
9031 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
9032 const auto& op = *opi;
9033 assert(op.op == bluestore_deferred_op_t::OP_WRITE);
9034 bufferlist::const_iterator p = op.data.begin();
9035 for (auto e : op.extents) {
9036 txc->osr->deferred_pending->prepare_write(
9037 cct, wt.seq, e.offset, e.length, p);
9038 }
9039 }
9040 if (deferred_aggressive &&
9041 !txc->osr->deferred_running) {
9042 _deferred_submit_unlock(txc->osr.get());
9043 } else {
9044 deferred_lock.unlock();
9045 }
9046 }
9047
9048 void BlueStore::deferred_try_submit()
9049 {
9050 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
9051 << deferred_queue_size << " txcs" << dendl;
9052 std::lock_guard<std::mutex> l(deferred_lock);
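// snapshot refs to every queued sequencer first: _deferred_submit_unlock()
// drops deferred_lock while submitting, so we must not be iterating the
// intrusive deferred_queue at that point.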
9053 vector<OpSequencerRef> osrs;
9054 osrs.reserve(deferred_queue.size());
9055 for (auto& osr : deferred_queue) {
9056 osrs.push_back(&osr);
9057 }
9058 for (auto& osr : osrs) {
9059 if (osr->deferred_pending) {
9060 if (!osr->deferred_running) {
9061 _deferred_submit_unlock(osr.get());
9062 deferred_lock.lock();
9063 } else {
9064 dout(20) << __func__ << " osr " << osr << " already has a running batch"
9065 << dendl;
9066 }
9067 } else {
9068 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
9069 }
9070 }
9071 }
9072
9073 void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
9074 {
9075 dout(10) << __func__ << " osr " << osr
9076 << " " << osr->deferred_pending->iomap.size() << " ios pending "
9077 << dendl;
9078 assert(osr->deferred_pending);
9079 assert(!osr->deferred_running);
9080
9081 auto b = osr->deferred_pending;
9082 deferred_queue_size -= b->seq_bytes.size();
9083 assert(deferred_queue_size >= 0);
9084
9085 osr->deferred_running = osr->deferred_pending;
9086 osr->deferred_pending = nullptr;
9087
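// walk the offset-sorted iomap, claiming contiguous extents into a single
// buffer and issuing one aio_write per contiguous run.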
9088 uint64_t start = 0, pos = 0;
9089 bufferlist bl;
9090 auto i = b->iomap.begin();
9091 while (true) {
9092 if (i == b->iomap.end() || i->first != pos) {
9093 if (bl.length()) {
9094 dout(20) << __func__ << " write 0x" << std::hex
9095 << start << "~" << bl.length()
9096 << " crc " << bl.crc32c(-1) << std::dec << dendl;
9097 if (!g_conf->bluestore_debug_omit_block_device_write) {
9098 logger->inc(l_bluestore_deferred_write_ops);
9099 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
9100 int r = bdev->aio_write(start, bl, &b->ioc, false);
9101 assert(r == 0);
9102 }
9103 }
9104 if (i == b->iomap.end()) {
9105 break;
9106 }
9107 start = 0;
9108 pos = i->first;
9109 bl.clear();
9110 }
9111 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
9112 << std::hex << pos << "~" << i->second.bl.length() << std::dec
9113 << dendl;
9114 if (!bl.length()) {
9115 start = pos;
9116 }
9117 pos += i->second.bl.length();
9118 bl.claim_append(i->second.bl);
9119 ++i;
9120 }
9121
9122 deferred_lock.unlock();
9123 bdev->aio_submit(&b->ioc);
9124 }
9125
9126 struct C_DeferredTrySubmit : public Context {
9127 BlueStore *store;
9128 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
9129 void finish(int r) {
9130 store->deferred_try_submit();
9131 }
9132 };
9133
9134 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
9135 {
9136 dout(10) << __func__ << " osr " << osr << dendl;
9137 assert(osr->deferred_running);
9138 DeferredBatch *b = osr->deferred_running;
9139
9140 {
9141 std::lock_guard<std::mutex> l(deferred_lock);
9142 assert(osr->deferred_running == b);
9143 osr->deferred_running = nullptr;
9144 if (!osr->deferred_pending) {
9145 dout(20) << __func__ << " dequeueing" << dendl;
9146 auto q = deferred_queue.iterator_to(*osr);
9147 deferred_queue.erase(q);
9148 } else if (deferred_aggressive) {
9149 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
9150 deferred_finisher.queue(new C_DeferredTrySubmit(this));
9151 } else {
9152 dout(20) << __func__ << " leaving queued, more pending" << dendl;
9153 }
9154 }
9155
9156 {
9157 uint64_t costs = 0;
9158 std::lock_guard<std::mutex> l2(osr->qlock);
9159 for (auto& i : b->txcs) {
9160 TransContext *txc = &i;
9161 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
9162 costs += txc->cost;
9163 }
9164 osr->qcond.notify_all();
9165 throttle_deferred_bytes.put(costs);
9166 std::lock_guard<std::mutex> l(kv_lock);
9167 deferred_done_queue.emplace_back(b);
9168 }
9169
9170 // in the normal case, do not bother waking up the kv thread; it will
9171 // catch us on the next commit anyway.
9172 if (deferred_aggressive) {
9173 std::lock_guard<std::mutex> l(kv_lock);
9174 kv_cond.notify_one();
9175 }
9176 }
9177
9178 int BlueStore::_deferred_replay()
9179 {
9180 dout(10) << __func__ << " start" << dendl;
9181 OpSequencerRef osr = new OpSequencer(cct, this);
9182 int count = 0;
9183 int r = 0;
9184 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
9185 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
9186 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
9187 << dendl;
9188 bluestore_deferred_transaction_t *deferred_txn =
9189 new bluestore_deferred_transaction_t;
9190 bufferlist bl = it->value();
9191 bufferlist::iterator p = bl.begin();
9192 try {
9193 ::decode(*deferred_txn, p);
9194 } catch (buffer::error& e) {
9195 derr << __func__ << " failed to decode deferred txn "
9196 << pretty_binary_string(it->key()) << dendl;
9197 delete deferred_txn;
9198 r = -EIO;
9199 goto out;
9200 }
9201 TransContext *txc = _txc_create(osr.get());
9202 txc->deferred_txn = deferred_txn;
9203 txc->state = TransContext::STATE_KV_DONE;
9204 _txc_state_proc(txc);
9205 }
9206 out:
9207 dout(20) << __func__ << " draining osr" << dendl;
9208 _osr_drain_all();
9209 osr->discard();
9210 dout(10) << __func__ << " completed " << count << " events" << dendl;
9211 return r;
9212 }
9213
9214 // ---------------------------
9215 // transactions
9216
9217 int BlueStore::queue_transactions(
9218 Sequencer *posr,
9219 vector<Transaction>& tls,
9220 TrackedOpRef op,
9221 ThreadPool::TPHandle *handle)
9222 {
9223 FUNCTRACE();
9224 Context *onreadable;
9225 Context *ondisk;
9226 Context *onreadable_sync;
9227 ObjectStore::Transaction::collect_contexts(
9228 tls, &onreadable, &ondisk, &onreadable_sync);
9229
9230 if (cct->_conf->objectstore_blackhole) {
9231 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
9232 << dendl;
9233 delete ondisk;
9234 delete onreadable;
9235 delete onreadable_sync;
9236 return 0;
9237 }
9238 utime_t start = ceph_clock_now();
9239 // set up the sequencer
9240 OpSequencer *osr;
9241 assert(posr);
9242 if (posr->p) {
9243 osr = static_cast<OpSequencer *>(posr->p.get());
9244 dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
9245 } else {
9246 osr = new OpSequencer(cct, this);
9247 osr->parent = posr;
9248 posr->p = osr;
9249 dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
9250 }
9251
9252 // prepare
9253 TransContext *txc = _txc_create(osr);
9254 txc->onreadable = onreadable;
9255 txc->onreadable_sync = onreadable_sync;
9256 txc->oncommit = ondisk;
9257
9258 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
9259 (*p).set_osr(osr);
9260 txc->bytes += (*p).get_num_bytes();
9261 _txc_add_transaction(txc, &(*p));
9262 }
9263 _txc_calc_cost(txc);
9264
9265 _txc_write_nodes(txc, txc->t);
9266
9267 // journal deferred items
9268 if (txc->deferred_txn) {
9269 txc->deferred_txn->seq = ++deferred_seq;
9270 bufferlist bl;
9271 ::encode(*txc->deferred_txn, bl);
9272 string key;
9273 get_deferred_key(txc->deferred_txn->seq, &key);
9274 txc->t->set(PREFIX_DEFERRED, key, bl);
9275 }
9276
9277 _txc_finalize_kv(txc, txc->t);
9278 if (handle)
9279 handle->suspend_tp_timeout();
9280
9281 utime_t tstart = ceph_clock_now();
9282 throttle_bytes.get(txc->cost);
9283 if (txc->deferred_txn) {
9284 // ensure we do not block here because of deferred writes
9285 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
9286 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
9287 << dendl;
9288 ++deferred_aggressive;
9289 deferred_try_submit();
9290 {
9291 // wake up any previously finished deferred events
9292 std::lock_guard<std::mutex> l(kv_lock);
9293 kv_cond.notify_one();
9294 }
9295 throttle_deferred_bytes.get(txc->cost);
9296 --deferred_aggressive;
9297 }
9298 }
9299 utime_t tend = ceph_clock_now();
9300
9301 if (handle)
9302 handle->reset_tp_timeout();
9303
9304 logger->inc(l_bluestore_txc);
9305
9306 // execute (start)
9307 _txc_state_proc(txc);
9308
9309 logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
9310 logger->tinc(l_bluestore_throttle_lat, tend - tstart);
9311 return 0;
9312 }
9313
9314 void BlueStore::_txc_aio_submit(TransContext *txc)
9315 {
9316 dout(10) << __func__ << " txc " << txc << dendl;
9317 bdev->aio_submit(&txc->ioc);
9318 }
9319
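// _txc_add_transaction decodes the ops serialized in a Transaction and
// dispatches each to the matching handler (_write, _zero, _clone, the
// _omap_* family, ...). Collection ops are handled first; for TOUCH, WRITE
// and ZERO the target onode is created implicitly. Any error other than
// the tolerated ENOENT/ENODATA cases below dumps the transaction and
// asserts, since partially applying a transaction could corrupt the store.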
9320 void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
9321 {
9322 Transaction::iterator i = t->begin();
9323
9324 _dump_transaction(t);
9325
9326 vector<CollectionRef> cvec(i.colls.size());
9327 unsigned j = 0;
9328 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
9329 ++p, ++j) {
9330 cvec[j] = _get_collection(*p);
9331 }
9332 vector<OnodeRef> ovec(i.objects.size());
9333
9334 for (int pos = 0; i.have_op(); ++pos) {
9335 Transaction::Op *op = i.decode_op();
9336 int r = 0;
9337
9338 // no coll or obj
9339 if (op->op == Transaction::OP_NOP)
9340 continue;
9341
9342 // collection operations
9343 CollectionRef &c = cvec[op->cid];
9344 switch (op->op) {
9345 case Transaction::OP_RMCOLL:
9346 {
9347 const coll_t &cid = i.get_cid(op->cid);
9348 r = _remove_collection(txc, cid, &c);
9349 if (!r)
9350 continue;
9351 }
9352 break;
9353
9354 case Transaction::OP_MKCOLL:
9355 {
9356 assert(!c);
9357 const coll_t &cid = i.get_cid(op->cid);
9358 r = _create_collection(txc, cid, op->split_bits, &c);
9359 if (!r)
9360 continue;
9361 }
9362 break;
9363
9364 case Transaction::OP_SPLIT_COLLECTION:
9365 assert(0 == "deprecated");
9366 break;
9367
9368 case Transaction::OP_SPLIT_COLLECTION2:
9369 {
9370 uint32_t bits = op->split_bits;
9371 uint32_t rem = op->split_rem;
9372 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
9373 if (!r)
9374 continue;
9375 }
9376 break;
9377
9378 case Transaction::OP_COLL_HINT:
9379 {
9380 uint32_t type = op->hint_type;
9381 bufferlist hint;
9382 i.decode_bl(hint);
9383 bufferlist::iterator hiter = hint.begin();
9384 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
9385 uint32_t pg_num;
9386 uint64_t num_objs;
9387 ::decode(pg_num, hiter);
9388 ::decode(num_objs, hiter);
9389 dout(10) << __func__ << " expected_num_objects collection hint is a no-op,"
9390 << " pg_num " << pg_num << " num_objects " << num_objs
9391 << dendl;
9392 } else {
9393 // Ignore the hint
9394 dout(10) << __func__ << " unknown collection hint " << type << dendl;
9395 }
9396 continue;
9397 }
9398 break;
9399
9400 case Transaction::OP_COLL_SETATTR:
9401 r = -EOPNOTSUPP;
9402 break;
9403
9404 case Transaction::OP_COLL_RMATTR:
9405 r = -EOPNOTSUPP;
9406 break;
9407
9408 case Transaction::OP_COLL_RENAME:
9409 assert(0 == "not implemented");
9410 break;
9411 }
9412 if (r < 0) {
9413 derr << __func__ << " error " << cpp_strerror(r)
9414 << " not handled on operation " << op->op
9415 << " (op " << pos << ", counting from 0)" << dendl;
9416 _dump_transaction(t, 0);
9417 assert(0 == "unexpected error");
9418 }
9419
9420 // these operations implicitly create the object
9421 bool create = false;
9422 if (op->op == Transaction::OP_TOUCH ||
9423 op->op == Transaction::OP_WRITE ||
9424 op->op == Transaction::OP_ZERO) {
9425 create = true;
9426 }
9427
9428 // object operations
9429 RWLock::WLocker l(c->lock);
9430 OnodeRef &o = ovec[op->oid];
9431 if (!o) {
9432 ghobject_t oid = i.get_oid(op->oid);
9433 o = c->get_onode(oid, create);
9434 }
9435 if (!create && (!o || !o->exists)) {
9436 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
9437 << i.get_oid(op->oid) << dendl;
9438 r = -ENOENT;
9439 goto endop;
9440 }
9441
9442 switch (op->op) {
9443 case Transaction::OP_TOUCH:
9444 r = _touch(txc, c, o);
9445 break;
9446
9447 case Transaction::OP_WRITE:
9448 {
9449 uint64_t off = op->off;
9450 uint64_t len = op->len;
9451 uint32_t fadvise_flags = i.get_fadvise_flags();
9452 bufferlist bl;
9453 i.decode_bl(bl);
9454 r = _write(txc, c, o, off, len, bl, fadvise_flags);
9455 }
9456 break;
9457
9458 case Transaction::OP_ZERO:
9459 {
9460 uint64_t off = op->off;
9461 uint64_t len = op->len;
9462 r = _zero(txc, c, o, off, len);
9463 }
9464 break;
9465
9466 case Transaction::OP_TRIMCACHE:
9467 {
9468 // deprecated, no-op
9469 }
9470 break;
9471
9472 case Transaction::OP_TRUNCATE:
9473 {
9474 uint64_t off = op->off;
9475 r = _truncate(txc, c, o, off);
9476 }
9477 break;
9478
9479 case Transaction::OP_REMOVE:
9480 {
9481 r = _remove(txc, c, o);
9482 }
9483 break;
9484
9485 case Transaction::OP_SETATTR:
9486 {
9487 string name = i.decode_string();
9488 bufferptr bp;
9489 i.decode_bp(bp);
9490 r = _setattr(txc, c, o, name, bp);
9491 }
9492 break;
9493
9494 case Transaction::OP_SETATTRS:
9495 {
9496 map<string, bufferptr> aset;
9497 i.decode_attrset(aset);
9498 r = _setattrs(txc, c, o, aset);
9499 }
9500 break;
9501
9502 case Transaction::OP_RMATTR:
9503 {
9504 string name = i.decode_string();
9505 r = _rmattr(txc, c, o, name);
9506 }
9507 break;
9508
9509 case Transaction::OP_RMATTRS:
9510 {
9511 r = _rmattrs(txc, c, o);
9512 }
9513 break;
9514
9515 case Transaction::OP_CLONE:
9516 {
9517 OnodeRef& no = ovec[op->dest_oid];
9518 if (!no) {
9519 const ghobject_t& noid = i.get_oid(op->dest_oid);
9520 no = c->get_onode(noid, true);
9521 }
9522 r = _clone(txc, c, o, no);
9523 }
9524 break;
9525
9526 case Transaction::OP_CLONERANGE:
9527 assert(0 == "deprecated");
9528 break;
9529
9530 case Transaction::OP_CLONERANGE2:
9531 {
9532 OnodeRef& no = ovec[op->dest_oid];
9533 if (!no) {
9534 const ghobject_t& noid = i.get_oid(op->dest_oid);
9535 no = c->get_onode(noid, true);
9536 }
9537 uint64_t srcoff = op->off;
9538 uint64_t len = op->len;
9539 uint64_t dstoff = op->dest_off;
9540 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
9541 }
9542 break;
9543
9544 case Transaction::OP_COLL_ADD:
9545 assert(0 == "not implemented");
9546 break;
9547
9548 case Transaction::OP_COLL_REMOVE:
9549 assert(0 == "not implemented");
9550 break;
9551
9552 case Transaction::OP_COLL_MOVE:
9553 assert(0 == "deprecated");
9554 break;
9555
9556 case Transaction::OP_COLL_MOVE_RENAME:
9557 case Transaction::OP_TRY_RENAME:
9558 {
9559 assert(op->cid == op->dest_cid);
9560 const ghobject_t& noid = i.get_oid(op->dest_oid);
9561 OnodeRef& no = ovec[op->dest_oid];
9562 if (!no) {
9563 no = c->get_onode(noid, false);
9564 }
9565 r = _rename(txc, c, o, no, noid);
9566 }
9567 break;
9568
9569 case Transaction::OP_OMAP_CLEAR:
9570 {
9571 r = _omap_clear(txc, c, o);
9572 }
9573 break;
9574 case Transaction::OP_OMAP_SETKEYS:
9575 {
9576 bufferlist aset_bl;
9577 i.decode_attrset_bl(&aset_bl);
9578 r = _omap_setkeys(txc, c, o, aset_bl);
9579 }
9580 break;
9581 case Transaction::OP_OMAP_RMKEYS:
9582 {
9583 bufferlist keys_bl;
9584 i.decode_keyset_bl(&keys_bl);
9585 r = _omap_rmkeys(txc, c, o, keys_bl);
9586 }
9587 break;
9588 case Transaction::OP_OMAP_RMKEYRANGE:
9589 {
9590 string first, last;
9591 first = i.decode_string();
9592 last = i.decode_string();
9593 r = _omap_rmkey_range(txc, c, o, first, last);
9594 }
9595 break;
9596 case Transaction::OP_OMAP_SETHEADER:
9597 {
9598 bufferlist bl;
9599 i.decode_bl(bl);
9600 r = _omap_setheader(txc, c, o, bl);
9601 }
9602 break;
9603
9604 case Transaction::OP_SETALLOCHINT:
9605 {
9606 r = _set_alloc_hint(txc, c, o,
9607 op->expected_object_size,
9608 op->expected_write_size,
9609 op->alloc_hint_flags);
9610 }
9611 break;
9612
9613 default:
9614 derr << __func__ << " bad op " << op->op << dendl;
9615 ceph_abort();
9616 }
9617
9618 endop:
9619 if (r < 0) {
9620 bool ok = false;
9621
9622 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
9623 op->op == Transaction::OP_CLONE ||
9624 op->op == Transaction::OP_CLONERANGE2 ||
9625 op->op == Transaction::OP_COLL_ADD ||
9626 op->op == Transaction::OP_SETATTR ||
9627 op->op == Transaction::OP_SETATTRS ||
9628 op->op == Transaction::OP_RMATTR ||
9629 op->op == Transaction::OP_OMAP_SETKEYS ||
9630 op->op == Transaction::OP_OMAP_RMKEYS ||
9631 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
9632 op->op == Transaction::OP_OMAP_SETHEADER))
9633 // -ENOENT is usually okay
9634 ok = true;
9635 if (r == -ENODATA)
9636 ok = true;
9637
9638 if (!ok) {
9639 const char *msg = "unexpected error code";
9640
9641 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
9642 op->op == Transaction::OP_CLONE ||
9643 op->op == Transaction::OP_CLONERANGE2))
9644 msg = "ENOENT on clone suggests osd bug";
9645
9646 if (r == -ENOSPC)
9647 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
9648 // by partially applying transactions.
9649 msg = "ENOSPC from bluestore, misconfigured cluster";
9650
9651 if (r == -ENOTEMPTY) {
9652 msg = "ENOTEMPTY suggests garbage data in osd data dir";
9653 }
9654
9655 derr << __func__ << " error " << cpp_strerror(r)
9656 << " not handled on operation " << op->op
9657 << " (op " << pos << ", counting from 0)"
9658 << dendl;
9659 derr << msg << dendl;
9660 _dump_transaction(t, 0);
9661 assert(0 == "unexpected error");
9662 }
9663 }
9664 }
9665 }
9666
9667
9668
9669 // -----------------
9670 // write operations
9671
9672 int BlueStore::_touch(TransContext *txc,
9673 CollectionRef& c,
9674 OnodeRef &o)
9675 {
9676 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
9677 int r = 0;
9678 _assign_nid(txc, o);
9679 txc->write_onode(o);
9680 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
9681 return r;
9682 }
9683
9684 void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
9685 {
9686 if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
9687 return;
9688 dout(log_level) << __func__ << " " << o << " " << o->oid
9689 << " nid " << o->onode.nid
9690 << " size 0x" << std::hex << o->onode.size
9691 << " (" << std::dec << o->onode.size << ")"
9692 << " expected_object_size " << o->onode.expected_object_size
9693 << " expected_write_size " << o->onode.expected_write_size
9694 << " in " << o->onode.extent_map_shards.size() << " shards"
9695 << ", " << o->extent_map.spanning_blob_map.size()
9696 << " spanning blobs"
9697 << dendl;
9698 for (auto p = o->onode.attrs.begin();
9699 p != o->onode.attrs.end();
9700 ++p) {
9701 dout(log_level) << __func__ << " attr " << p->first
9702 << " len " << p->second.length() << dendl;
9703 }
9704 _dump_extent_map(o->extent_map, log_level);
9705 }
9706
9707 void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
9708 {
9709 uint64_t pos = 0;
9710 for (auto& s : em.shards) {
9711 dout(log_level) << __func__ << " shard " << *s.shard_info
9712 << (s.loaded ? " (loaded)" : "")
9713 << (s.dirty ? " (dirty)" : "")
9714 << dendl;
9715 }
9716 for (auto& e : em.extent_map) {
9717 dout(log_level) << __func__ << " " << e << dendl;
9718 assert(e.logical_offset >= pos);
9719 pos = e.logical_offset + e.length;
9720 const bluestore_blob_t& blob = e.blob->get_blob();
9721 if (blob.has_csum()) {
9722 vector<uint64_t> v;
9723 unsigned n = blob.get_csum_count();
9724 for (unsigned i = 0; i < n; ++i)
9725 v.push_back(blob.get_csum_item(i));
9726 dout(log_level) << __func__ << " csum: " << std::hex << v << std::dec
9727 << dendl;
9728 }
9729 std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
9730 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
9731 dout(log_level) << __func__ << " 0x" << std::hex << i.first
9732 << "~" << i.second->length << std::dec
9733 << " " << *i.second << dendl;
9734 }
9735 }
9736 }
9737
9738 void BlueStore::_dump_transaction(Transaction *t, int log_level)
9739 {
9740 dout(log_level) << " transaction dump:\n";
9741 JSONFormatter f(true);
9742 f.open_object_section("transaction");
9743 t->dump(&f);
9744 f.close_section();
9745 f.flush(*_dout);
9746 *_dout << dendl;
9747 }
9748
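// _pad_zeros expands a write so both ends are aligned to chunk_size (the
// blob's checksum/IO granularity), zero-filling the added bytes. A rough
// worked example (illustrative numbers), chunk_size = 0x1000:
//   in:  *offset = 0x1a00, bl->length() = 0x0c00  -> covers 0x1a00..0x2600
//   out: *offset = 0x1000, bl->length() = 0x2000  -> covers 0x1000..0x3000
// i.e. 0xa00 zero bytes are prepended and 0xa00 appended, and the caller's
// offset is pulled back to the chunk boundary.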
9749 void BlueStore::_pad_zeros(
9750 bufferlist *bl, uint64_t *offset,
9751 uint64_t chunk_size)
9752 {
9753 auto length = bl->length();
9754 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
9755 << " chunk_size 0x" << chunk_size << std::dec << dendl;
9756 dout(40) << "before:\n";
9757 bl->hexdump(*_dout);
9758 *_dout << dendl;
9759 // front
9760 size_t front_pad = *offset % chunk_size;
9761 size_t back_pad = 0;
9762 size_t pad_count = 0;
9763 if (front_pad) {
9764 size_t front_copy = MIN(chunk_size - front_pad, length);
9765 bufferptr z = buffer::create_page_aligned(chunk_size);
9766 z.zero(0, front_pad, false);
9767 pad_count += front_pad;
9768 bl->copy(0, front_copy, z.c_str() + front_pad);
9769 if (front_copy + front_pad < chunk_size) {
9770 back_pad = chunk_size - (length + front_pad);
9771 z.zero(front_pad + length, back_pad, false);
9772 pad_count += back_pad;
9773 }
9774 bufferlist old, t;
9775 old.swap(*bl);
9776 t.substr_of(old, front_copy, length - front_copy);
9777 bl->append(z);
9778 bl->claim_append(t);
9779 *offset -= front_pad;
9780 length += pad_count;
9781 }
9782
9783 // back
9784 uint64_t end = *offset + length;
9785 unsigned back_copy = end % chunk_size;
9786 if (back_copy) {
9787 assert(back_pad == 0);
9788 back_pad = chunk_size - back_copy;
9789 assert(back_copy <= length);
9790 bufferptr tail(chunk_size);
9791 bl->copy(length - back_copy, back_copy, tail.c_str());
9792 tail.zero(back_copy, back_pad, false);
9793 bufferlist old;
9794 old.swap(*bl);
9795 bl->substr_of(old, 0, length - back_copy);
9796 bl->append(tail);
9797 length += back_pad;
9798 pad_count += back_pad;
9799 }
9800 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
9801 << back_pad << " on front/back, now 0x" << *offset << "~"
9802 << length << std::dec << dendl;
9803 dout(40) << "after:\n";
9804 bl->hexdump(*_dout);
9805 *_dout << dendl;
9806 if (pad_count)
9807 logger->inc(l_bluestore_write_pad_bytes, pad_count);
9808 assert(bl->length() == length);
9809 }
9810
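// _do_write_small handles writes shorter than min_alloc_size. Approximate
// strategy, in order of preference:
//   1. write directly into never-written (is_unused) space of a nearby
//      mutable blob, deferring the IO when it is small enough
//   2. read-modify-write whole chunks of an existing blob and queue the
//      result as a deferred overwrite
//   3. reuse spare space in a neighbouring blob via can_reuse_blob()
//   4. fall back to a brand new blob of min_alloc_size
// Candidate blobs are searched both forward and backward from the write
// offset, within roughly one target blob size on either side.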
9811 void BlueStore::_do_write_small(
9812 TransContext *txc,
9813 CollectionRef &c,
9814 OnodeRef o,
9815 uint64_t offset, uint64_t length,
9816 bufferlist::iterator& blp,
9817 WriteContext *wctx)
9818 {
9819 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9820 << std::dec << dendl;
9821 assert(length < min_alloc_size);
9822 uint64_t end_offs = offset + length;
9823
9824 logger->inc(l_bluestore_write_small);
9825 logger->inc(l_bluestore_write_small_bytes, length);
9826
9827 bufferlist bl;
9828 blp.copy(length, bl);
9829
9830 // Look for an existing mutable blob we can use.
9831 auto begin = o->extent_map.extent_map.begin();
9832 auto end = o->extent_map.extent_map.end();
9833 auto ep = o->extent_map.seek_lextent(offset);
9834 if (ep != begin) {
9835 --ep;
9836 if (ep->blob_end() <= offset) {
9837 ++ep;
9838 }
9839 }
9840 auto prev_ep = ep;
9841 if (prev_ep != begin) {
9842 --prev_ep;
9843 } else {
9844 prev_ep = end; // to avoid this extent check as it's a duplicate
9845 }
9846
9847 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9848 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9849 uint32_t alloc_len = min_alloc_size;
9850 auto offset0 = P2ALIGN(offset, alloc_len);
9851
9852 bool any_change;
9853
9854 // search suitable extent in both forward and reverse direction in
9855 // [offset - target_max_blob_size, offset + target_max_blob_size] range
9856 // then check if blob can be reused via can_reuse_blob func or apply
9857 // direct/deferred write (the latter for extents including or higher
9858 // than 'offset' only).
9859 do {
9860 any_change = false;
9861
9862 if (ep != end && ep->logical_offset < offset + max_bsize) {
9863 BlobRef b = ep->blob;
9864 auto bstart = ep->blob_start();
9865 dout(20) << __func__ << " considering " << *b
9866 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9867 if (bstart >= end_offs) {
9868 dout(20) << __func__ << " ignoring distant " << *b << dendl;
9869 } else if (!b->get_blob().is_mutable()) {
9870 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
9871 } else if (ep->logical_offset % min_alloc_size !=
9872 ep->blob_offset % min_alloc_size) {
9873 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
9874 } else {
9875 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9876 // can we pad our head/tail out with zeros?
9877 uint64_t head_pad, tail_pad;
9878 head_pad = P2PHASE(offset, chunk_size);
9879 tail_pad = P2NPHASE(end_offs, chunk_size);
9880 if (head_pad || tail_pad) {
9881 o->extent_map.fault_range(db, offset - head_pad,
9882 end_offs - offset + head_pad + tail_pad);
9883 }
9884 if (head_pad &&
9885 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
9886 head_pad = 0;
9887 }
9888 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
9889 tail_pad = 0;
9890 }
9891
9892 uint64_t b_off = offset - head_pad - bstart;
9893 uint64_t b_len = length + head_pad + tail_pad;
9894
9895 // direct write into unused blocks of an existing mutable blob?
9896 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
9897 b->get_blob().get_ondisk_length() >= b_off + b_len &&
9898 b->get_blob().is_unused(b_off, b_len) &&
9899 b->get_blob().is_allocated(b_off, b_len)) {
9900 _apply_padding(head_pad, tail_pad, bl);
9901
9902 dout(20) << __func__ << " write to unused 0x" << std::hex
9903 << b_off << "~" << b_len
9904 << " pad 0x" << head_pad << " + 0x" << tail_pad
9905 << std::dec << " of mutable " << *b << dendl;
9906 _buffer_cache_write(txc, b, b_off, bl,
9907 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9908
9909 if (!g_conf->bluestore_debug_omit_block_device_write) {
9910 if (b_len <= prefer_deferred_size) {
9911 dout(20) << __func__ << " deferring small 0x" << std::hex
9912 << b_len << std::dec << " unused write via deferred" << dendl;
9913 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9914 op->op = bluestore_deferred_op_t::OP_WRITE;
9915 b->get_blob().map(
9916 b_off, b_len,
9917 [&](uint64_t offset, uint64_t length) {
9918 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9919 return 0;
9920 });
9921 op->data = bl;
9922 } else {
9923 b->get_blob().map_bl(
9924 b_off, bl,
9925 [&](uint64_t offset, bufferlist& t) {
9926 bdev->aio_write(offset, t,
9927 &txc->ioc, wctx->buffered);
9928 });
9929 }
9930 }
9931 b->dirty_blob().calc_csum(b_off, bl);
9932 dout(20) << __func__ << " lex old " << *ep << dendl;
9933 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
9934 b,
9935 &wctx->old_extents);
9936 b->dirty_blob().mark_used(le->blob_offset, le->length);
9937 txc->statfs_delta.stored() += le->length;
9938 dout(20) << __func__ << " lex " << *le << dendl;
9939 logger->inc(l_bluestore_write_small_unused);
9940 return;
9941 }
9942 // read some data to fill out the chunk?
9943 uint64_t head_read = P2PHASE(b_off, chunk_size);
9944 uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
9945 if ((head_read || tail_read) &&
9946 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
9947 head_read + tail_read < min_alloc_size) {
9948 b_off -= head_read;
9949 b_len += head_read + tail_read;
9950
9951 } else {
9952 head_read = tail_read = 0;
9953 }
9954
9955 // chunk-aligned deferred overwrite?
9956 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
9957 b_off % chunk_size == 0 &&
9958 b_len % chunk_size == 0 &&
9959 b->get_blob().is_allocated(b_off, b_len)) {
9960
9961 _apply_padding(head_pad, tail_pad, bl);
9962
9963 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
9964 << " and tail 0x" << tail_read << std::dec << dendl;
9965 if (head_read) {
9966 bufferlist head_bl;
9967 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
9968 head_bl, 0);
9969 assert(r >= 0 && r <= (int)head_read);
9970 size_t zlen = head_read - r;
9971 if (zlen) {
9972 head_bl.append_zero(zlen);
9973 logger->inc(l_bluestore_write_pad_bytes, zlen);
9974 }
9975 bl.claim_prepend(head_bl);
9976 logger->inc(l_bluestore_write_penalty_read_ops);
9977 }
9978 if (tail_read) {
9979 bufferlist tail_bl;
9980 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
9981 tail_bl, 0);
9982 assert(r >= 0 && r <= (int)tail_read);
9983 size_t zlen = tail_read - r;
9984 if (zlen) {
9985 tail_bl.append_zero(zlen);
9986 logger->inc(l_bluestore_write_pad_bytes, zlen);
9987 }
9988 bl.claim_append(tail_bl);
9989 logger->inc(l_bluestore_write_penalty_read_ops);
9990 }
9991 logger->inc(l_bluestore_write_small_pre_read);
9992
9993 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9994 op->op = bluestore_deferred_op_t::OP_WRITE;
9995 _buffer_cache_write(txc, b, b_off, bl,
9996 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9997
9998 int r = b->get_blob().map(
9999 b_off, b_len,
10000 [&](uint64_t offset, uint64_t length) {
10001 op->extents.emplace_back(bluestore_pextent_t(offset, length));
10002 return 0;
10003 });
10004 assert(r == 0);
10005 if (b->get_blob().csum_type) {
10006 b->dirty_blob().calc_csum(b_off, bl);
10007 }
10008 op->data.claim(bl);
10009 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
10010 << b_len << std::dec << " of mutable " << *b
10011 << " at " << op->extents << dendl;
10012 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
10013 b, &wctx->old_extents);
10014 b->dirty_blob().mark_used(le->blob_offset, le->length);
10015 txc->statfs_delta.stored() += le->length;
10016 dout(20) << __func__ << " lex " << *le << dendl;
10017 logger->inc(l_bluestore_write_small_deferred);
10018 return;
10019 }
10020 // try to reuse blob if we can
10021 if (b->can_reuse_blob(min_alloc_size,
10022 max_bsize,
10023 offset0 - bstart,
10024 &alloc_len)) {
10025 assert(alloc_len == min_alloc_size); // expecting the data to always
10026 // fit into the reused blob
10027 // Need to check for pending writes that want to reuse the same
10028 // pextent. The rationale is that during GC two chunks from garbage
10029 // blobs (compressed?) can share logical space within the same AU;
10030 // that, in turn, might be caused by an unaligned len in clone_range2.
10031 // Hence the second write would fail when attempting to reuse the blob
10032 // in _do_alloc_write().
10033 if (!wctx->has_conflict(b,
10034 offset0,
10035 offset0 + alloc_len,
10036 min_alloc_size)) {
10037
10038 // we can't reuse pad_head/pad_tail since they might be truncated
10039 // due to existing extents
10040 uint64_t b_off = offset - bstart;
10041 uint64_t b_off0 = b_off;
10042 _pad_zeros(&bl, &b_off0, chunk_size);
10043
10044 dout(20) << __func__ << " reuse blob " << *b << std::hex
10045 << " (0x" << b_off0 << "~" << bl.length() << ")"
10046 << " (0x" << b_off << "~" << length << ")"
10047 << std::dec << dendl;
10048
10049 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
10050 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
10051 false, false);
10052 logger->inc(l_bluestore_write_small_unused);
10053 return;
10054 }
10055 }
10056 }
10057 ++ep;
10058 any_change = true;
10059 } // if (ep != end && ep->logical_offset < offset + max_bsize)
10060
10061 // check extent for reuse in reverse order
10062 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
10063 BlobRef b = prev_ep->blob;
10064 auto bstart = prev_ep->blob_start();
10065 dout(20) << __func__ << " considering " << *b
10066 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
10067 if (b->can_reuse_blob(min_alloc_size,
10068 max_bsize,
10069 offset0 - bstart,
10070 &alloc_len)) {
10071 assert(alloc_len == min_alloc_size); // expecting the data to always
10072 // fit into the reused blob
10073 // Need to check for pending writes that want to reuse the same
10074 // pextent. The rationale is that during GC two chunks from garbage
10075 // blobs (compressed?) can share logical space within the same AU;
10076 // that, in turn, might be caused by an unaligned len in clone_range2.
10077 // Hence the second write would fail when attempting to reuse the blob
10078 // in _do_alloc_write().
10079 if (!wctx->has_conflict(b,
10080 offset0,
10081 offset0 + alloc_len,
10082 min_alloc_size)) {
10083
10084 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
10085 uint64_t b_off = offset - bstart;
10086 uint64_t b_off0 = b_off;
10087 _pad_zeros(&bl, &b_off0, chunk_size);
10088
10089 dout(20) << __func__ << " reuse blob " << *b << std::hex
10090 << " (0x" << b_off0 << "~" << bl.length() << ")"
10091 << " (0x" << b_off << "~" << length << ")"
10092 << std::dec << dendl;
10093
10094 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
10095 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
10096 false, false);
10097 logger->inc(l_bluestore_write_small_unused);
10098 return;
10099 }
10100 }
10101 if (prev_ep != begin) {
10102 --prev_ep;
10103 any_change = true;
10104 } else {
10105 prev_ep = end; // to avoid useless first extent re-check
10106 }
10107 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
10108 } while (any_change);
10109
10110 // new blob.
10111
10112 BlobRef b = c->new_blob();
10113 uint64_t b_off = P2PHASE(offset, alloc_len);
10114 uint64_t b_off0 = b_off;
10115 _pad_zeros(&bl, &b_off0, block_size);
10116 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
10117 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
10118 logger->inc(l_bluestore_write_small_new);
10119
10120 return;
10121 }
10122
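// _do_write_big handles the min_alloc_size-aligned portion of a write. The
// range is punched out of the extent map first, then carved into pieces of
// at most target_blob_size; each piece either reuses a nearby mutable blob
// (can_reuse_blob, uncompressed writes only) or gets a fresh blob, and is
// queued on the WriteContext for allocation in _do_alloc_write().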
10123 void BlueStore::_do_write_big(
10124 TransContext *txc,
10125 CollectionRef &c,
10126 OnodeRef o,
10127 uint64_t offset, uint64_t length,
10128 bufferlist::iterator& blp,
10129 WriteContext *wctx)
10130 {
10131 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
10132 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
10133 << " compress " << (int)wctx->compress
10134 << dendl;
10135 logger->inc(l_bluestore_write_big);
10136 logger->inc(l_bluestore_write_big_bytes, length);
10137 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
10138 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
10139 while (length > 0) {
10140 bool new_blob = false;
10141 uint32_t l = MIN(max_bsize, length);
10142 BlobRef b;
10143 uint32_t b_off = 0;
10144
10145 // attempt to reuse an existing blob
10146 if (!wctx->compress) {
10147 // look for an existing mutable blob we can reuse
10148 auto begin = o->extent_map.extent_map.begin();
10149 auto end = o->extent_map.extent_map.end();
10150 auto ep = o->extent_map.seek_lextent(offset);
10151 auto prev_ep = ep;
10152 if (prev_ep != begin) {
10153 --prev_ep;
10154 } else {
10155 prev_ep = end; // to avoid this extent check as it's a duplicate
10156 }
10157 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
10158 // search suitable extent in both forward and reverse direction in
10159 // [offset - target_max_blob_size, offset + target_max_blob_size] range
10160 // then check if blob can be reused via can_reuse_blob func.
10161 bool any_change;
10162 do {
10163 any_change = false;
10164 if (ep != end && ep->logical_offset < offset + max_bsize) {
10165 if (offset >= ep->blob_start() &&
10166 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
10167 offset - ep->blob_start(),
10168 &l)) {
10169 b = ep->blob;
10170 b_off = offset - ep->blob_start();
10171 prev_ep = end; // to avoid check below
10172 dout(20) << __func__ << " reuse blob " << *b << std::hex
10173 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
10174 } else {
10175 ++ep;
10176 any_change = true;
10177 }
10178 }
10179
10180 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
10181 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
10182 offset - prev_ep->blob_start(),
10183 &l)) {
10184 b = prev_ep->blob;
10185 b_off = offset - prev_ep->blob_start();
10186 dout(20) << __func__ << " reuse blob " << *b << std::hex
10187 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
10188 } else if (prev_ep != begin) {
10189 --prev_ep;
10190 any_change = true;
10191 } else {
10192 prev_ep = end; // to avoid useless first extent re-check
10193 }
10194 }
10195 } while (b == nullptr && any_change);
10196 }
10197 if (b == nullptr) {
10198 b = c->new_blob();
10199 b_off = 0;
10200 new_blob = true;
10201 }
10202
10203 bufferlist t;
10204 blp.copy(l, t);
10205 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
10206 offset += l;
10207 length -= l;
10208 logger->inc(l_bluestore_write_big_blobs);
10209 }
10210 }
10211
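// _do_alloc_write turns the blobs queued in wctx->writes into real disk
// extents: optionally compress each blob, total up the space needed,
// allocate it in one shot, then set up checksums, record the lextents and
// queue the data either as deferred IO (small) or direct aio (large).
//
// Compression acceptance, roughly: with
//   crr      = compression_required_ratio (e.g. the 0.875 default), and
//   want_len = P2ROUNDUP(blob_length * crr, min_alloc_size),
// a compressed result is kept only if
//   P2ROUNDUP(compressed_len, min_alloc_size) <= want_len
// e.g. a 0x10000 blob with min_alloc_size 0x1000 and crr 0.875 must fit in
// 0xe000 once rounded up to the allocation unit, otherwise it is stored
// uncompressed.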
10212 int BlueStore::_do_alloc_write(
10213 TransContext *txc,
10214 CollectionRef coll,
10215 OnodeRef o,
10216 WriteContext *wctx)
10217 {
10218 dout(20) << __func__ << " txc " << txc
10219 << " " << wctx->writes.size() << " blobs"
10220 << dendl;
10221 if (wctx->writes.empty()) {
10222 return 0;
10223 }
10224
10225 CompressorRef c;
10226 double crr = 0;
10227 if (wctx->compress) {
10228 c = select_option(
10229 "compression_algorithm",
10230 compressor,
10231 [&]() {
10232 string val;
10233 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
10234 CompressorRef cp = compressor;
10235 if (!cp || cp->get_type_name() != val) {
10236 cp = Compressor::create(cct, val);
10237 }
10238 return boost::optional<CompressorRef>(cp);
10239 }
10240 return boost::optional<CompressorRef>();
10241 }
10242 );
10243
10244 crr = select_option(
10245 "compression_required_ratio",
10246 cct->_conf->bluestore_compression_required_ratio,
10247 [&]() {
10248 double val;
10249 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
10250 return boost::optional<double>(val);
10251 }
10252 return boost::optional<double>();
10253 }
10254 );
10255 }
10256
10257 // checksum
10258 int csum = csum_type.load();
10259 csum = select_option(
10260 "csum_type",
10261 csum,
10262 [&]() {
10263 int val;
10264 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
10265 return boost::optional<int>(val);
10266 }
10267 return boost::optional<int>();
10268 }
10269 );
10270
10271 // compress (as needed) and calc needed space
10272 uint64_t need = 0;
10273 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
10274 for (auto& wi : wctx->writes) {
10275 if (c && wi.blob_length > min_alloc_size) {
10276 utime_t start = ceph_clock_now();
10277
10278 // compress
10279 assert(wi.b_off == 0);
10280 assert(wi.blob_length == wi.bl.length());
10281
10282 // FIXME: memory alignment here is bad
10283 bufferlist t;
10284 int r = c->compress(wi.bl, t);
10285
10286 uint64_t want_len_raw = wi.blob_length * crr;
10287 uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
10288 bool rejected = false;
10289 uint64_t compressed_len = t.length();
10290 // do an approximate (fast) estimation for resulting blob size
10291 // that doesn't take header overhead into account
10292 uint64_t result_len = P2ROUNDUP(compressed_len, min_alloc_size);
10293 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
10294 bluestore_compression_header_t chdr;
10295 chdr.type = c->get_type();
10296 chdr.length = t.length();
10297 ::encode(chdr, wi.compressed_bl);
10298 wi.compressed_bl.claim_append(t);
10299
10300 compressed_len = wi.compressed_bl.length();
10301 result_len = P2ROUNDUP(compressed_len, min_alloc_size);
10302 if (result_len <= want_len && result_len < wi.blob_length) {
10303 // Cool. We compressed at least as much as we were hoping to.
10304 // pad out to min_alloc_size
10305 wi.compressed_bl.append_zero(result_len - compressed_len);
10306 wi.compressed_len = compressed_len;
10307 wi.compressed = true;
10308 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
10309 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
10310 << " -> 0x" << compressed_len << " => 0x" << result_len
10311 << " with " << c->get_type()
10312 << std::dec << dendl;
10313 txc->statfs_delta.compressed() += compressed_len;
10314 txc->statfs_delta.compressed_original() += wi.blob_length;
10315 txc->statfs_delta.compressed_allocated() += result_len;
10316 logger->inc(l_bluestore_compress_success_count);
10317 need += result_len;
10318 } else {
10319 rejected = true;
10320 }
10321 } else if (r != 0) {
10322 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
10323 << " bytes compressed using " << c->get_type_name()
10324 << std::dec
10325 << " failed with errcode = " << r
10326 << ", leaving uncompressed"
10327 << dendl;
10328 logger->inc(l_bluestore_compress_rejected_count);
10329 need += wi.blob_length;
10330 } else {
10331 rejected = true;
10332 }
10333
10334 if (rejected) {
10335 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
10336 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
10337 << " with " << c->get_type()
10338 << ", which is more than required 0x" << want_len_raw
10339 << " -> 0x" << want_len
10340 << ", leaving uncompressed"
10341 << std::dec << dendl;
10342 logger->inc(l_bluestore_compress_rejected_count);
10343 need += wi.blob_length;
10344 }
10345 logger->tinc(l_bluestore_compress_lat,
10346 ceph_clock_now() - start);
10347 } else {
10348 need += wi.blob_length;
10349 }
10350 }
10351 PExtentVector prealloc;
10352 prealloc.reserve(2 * wctx->writes.size());
10353 int prealloc_left = 0;
10354 prealloc_left = alloc->allocate(
10355 need, min_alloc_size, need,
10356 0, &prealloc);
10357 if (prealloc_left < 0) {
10358 derr << __func__ << " failed to allocate 0x" << std::hex << need << std::dec
10359 << dendl;
10360 return -ENOSPC;
10361 }
10362 assert(prealloc_left == (int64_t)need);
10363
10364 dout(20) << __func__ << " prealloc " << prealloc << dendl;
10365 auto prealloc_pos = prealloc.begin();
10366
10367 for (auto& wi : wctx->writes) {
10368 BlobRef b = wi.b;
10369 bluestore_blob_t& dblob = b->dirty_blob();
10370 uint64_t b_off = wi.b_off;
10371 bufferlist *l = &wi.bl;
10372 uint64_t final_length = wi.blob_length;
10373 uint64_t csum_length = wi.blob_length;
10374 unsigned csum_order = block_size_order;
10375 if (wi.compressed) {
10376 final_length = wi.compressed_bl.length();
10377 csum_length = final_length;
10378 csum_order = ctz(csum_length);
10379 l = &wi.compressed_bl;
10380 dblob.set_compressed(wi.blob_length, wi.compressed_len);
10381 } else if (wi.new_blob) {
10382 // initialize newly created blob only
10383 assert(dblob.is_mutable());
10384 if (l->length() != wi.blob_length) {
10385 // hrm, maybe we could do better here, but let's not bother.
10386 dout(20) << __func__ << " forcing csum_order to block_size_order "
10387 << block_size_order << dendl;
10388 csum_order = block_size_order;
10389 } else {
10390 csum_order = std::min(wctx->csum_order, ctz(l->length()));
10391 }
10392 // try to align blob with max_blob_size to improve
10393 // its reuse ratio, e.g. in case of reverse write
10394 uint32_t suggested_boff =
10395 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
10396 if ((suggested_boff % (1 << csum_order)) == 0 &&
10397 suggested_boff + final_length <= max_bsize &&
10398 suggested_boff > b_off) {
10399 dout(20) << __func__ << " forcing blob_offset to 0x"
10400 << std::hex << suggested_boff << std::dec << dendl;
10401 assert(suggested_boff >= b_off);
10402 csum_length += suggested_boff - b_off;
10403 b_off = suggested_boff;
10404 }
10405 if (csum != Checksummer::CSUM_NONE) {
10406 dout(20) << __func__ << " initialize csum setting for new blob " << *b
10407 << " csum_type " << Checksummer::get_csum_type_string(csum)
10408 << " csum_order " << csum_order
10409 << " csum_length 0x" << std::hex << csum_length << std::dec
10410 << dendl;
10411 dblob.init_csum(csum, csum_order, csum_length);
10412 }
10413 }
10414
10415 PExtentVector extents;
10416 int64_t left = final_length;
10417 while (left > 0) {
10418 assert(prealloc_left > 0);
10419 if (prealloc_pos->length <= left) {
10420 prealloc_left -= prealloc_pos->length;
10421 left -= prealloc_pos->length;
10422 txc->statfs_delta.allocated() += prealloc_pos->length;
10423 extents.push_back(*prealloc_pos);
10424 ++prealloc_pos;
10425 } else {
10426 extents.emplace_back(prealloc_pos->offset, left);
10427 prealloc_pos->offset += left;
10428 prealloc_pos->length -= left;
10429 prealloc_left -= left;
10430 txc->statfs_delta.allocated() += left;
10431 left = 0;
10432 break;
10433 }
10434 }
10435 for (auto& p : extents) {
10436 txc->allocated.insert(p.offset, p.length);
10437 }
10438 dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
10439
10440 dout(20) << __func__ << " blob " << *b << dendl;
10441 if (dblob.has_csum()) {
10442 dblob.calc_csum(b_off, *l);
10443 }
10444
10445 if (wi.mark_unused) {
10446 auto b_end = b_off + wi.bl.length();
10447 if (b_off) {
10448 dblob.add_unused(0, b_off);
10449 }
10450 if (b_end < wi.blob_length) {
10451 dblob.add_unused(b_end, wi.blob_length - b_end);
10452 }
10453 }
10454
10455 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
10456 b_off + (wi.b_off0 - wi.b_off),
10457 wi.length0,
10458 wi.b,
10459 nullptr);
10460 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
10461 txc->statfs_delta.stored() += le->length;
10462 dout(20) << __func__ << " lex " << *le << dendl;
10463 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
10464 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
10465
10466 // queue io
10467 if (!g_conf->bluestore_debug_omit_block_device_write) {
10468 if (l->length() <= prefer_deferred_size.load()) {
10469 dout(20) << __func__ << " deferring small 0x" << std::hex
10470 << l->length() << std::dec << " write via deferred" << dendl;
10471 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
10472 op->op = bluestore_deferred_op_t::OP_WRITE;
10473 int r = b->get_blob().map(
10474 b_off, l->length(),
10475 [&](uint64_t offset, uint64_t length) {
10476 op->extents.emplace_back(bluestore_pextent_t(offset, length));
10477 return 0;
10478 });
10479 assert(r == 0);
10480 op->data = *l;
10481 } else {
10482 b->get_blob().map_bl(
10483 b_off, *l,
10484 [&](uint64_t offset, bufferlist& t) {
10485 bdev->aio_write(offset, t, &txc->ioc, false);
10486 });
10487 }
10488 }
10489 }
10490 assert(prealloc_pos == prealloc.end());
10491 assert(prealloc_left == 0);
10492 return 0;
10493 }
10494
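// _wctx_finish releases whatever the write displaced: for each old extent
// collected in wctx->old_extents it adjusts the statfs counters, drops
// shared-blob references (collecting blobs that may have become
// unshared), returns now-unreferenced space to the freelist via
// txc->released, and prunes spanning blobs that no longer carry any
// references.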
10495 void BlueStore::_wctx_finish(
10496 TransContext *txc,
10497 CollectionRef& c,
10498 OnodeRef o,
10499 WriteContext *wctx,
10500 set<SharedBlob*> *maybe_unshared_blobs)
10501 {
10502 auto oep = wctx->old_extents.begin();
10503 while (oep != wctx->old_extents.end()) {
10504 auto &lo = *oep;
10505 oep = wctx->old_extents.erase(oep);
10506 dout(20) << __func__ << " lex_old " << lo.e << dendl;
10507 BlobRef b = lo.e.blob;
10508 const bluestore_blob_t& blob = b->get_blob();
10509 if (blob.is_compressed()) {
10510 if (lo.blob_empty) {
10511 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
10512 }
10513 txc->statfs_delta.compressed_original() -= lo.e.length;
10514 }
10515 auto& r = lo.r;
10516 txc->statfs_delta.stored() -= lo.e.length;
10517 if (!r.empty()) {
10518 dout(20) << __func__ << " blob release " << r << dendl;
10519 if (blob.is_shared()) {
10520 PExtentVector final;
10521 c->load_shared_blob(b->shared_blob);
10522 for (auto e : r) {
10523 b->shared_blob->put_ref(
10524 e.offset, e.length, &final,
10525 b->is_referenced() ? nullptr : maybe_unshared_blobs);
10526 }
10527 dout(20) << __func__ << " shared_blob release " << final
10528 << " from " << *b->shared_blob << dendl;
10529 txc->write_shared_blob(b->shared_blob);
10530 r.clear();
10531 r.swap(final);
10532 }
10533 }
10534 // we can't invalidate our logical extents as we drop them because
10535 // other lextents (either in our onode or others) may still
10536 // reference them. but we can throw out anything that is no
10537 // longer allocated. Note that this will leave behind edge bits
10538 // that are no longer referenced but not deallocated (until they
10539 // age out of the cache naturally).
10540 b->discard_unallocated(c.get());
10541 for (auto e : r) {
10542 dout(20) << __func__ << " release " << e << dendl;
10543 txc->released.insert(e.offset, e.length);
10544 txc->statfs_delta.allocated() -= e.length;
10545 if (blob.is_compressed()) {
10546 txc->statfs_delta.compressed_allocated() -= e.length;
10547 }
10548 }
10549 delete &lo;
10550 if (b->is_spanning() && !b->is_referenced()) {
10551 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
10552 << dendl;
10553 o->extent_map.spanning_blob_map.erase(b->id);
10554 }
10555 }
10556 }
10557
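// _do_write_data splits a write at min_alloc_size boundaries and routes the
// pieces to _do_write_small / _do_write_big. For illustration, with
// min_alloc_size = 0x1000, a write at offset 0x0e00 of length 0x3400 is:
//   head   0x0e00~0x0200  -> _do_write_small
//   middle 0x1000~0x3000  -> _do_write_big
//   tail   0x4000~0x0200  -> _do_write_small
// A write that stays inside a single allocation unit (and is not exactly
// min_alloc_size long) goes straight to _do_write_small.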
10558 void BlueStore::_do_write_data(
10559 TransContext *txc,
10560 CollectionRef& c,
10561 OnodeRef o,
10562 uint64_t offset,
10563 uint64_t length,
10564 bufferlist& bl,
10565 WriteContext *wctx)
10566 {
10567 uint64_t end = offset + length;
10568 bufferlist::iterator p = bl.begin();
10569
10570 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
10571 (length != min_alloc_size)) {
10572 // we fall within the same block
10573 _do_write_small(txc, c, o, offset, length, p, wctx);
10574 } else {
10575 uint64_t head_offset, head_length;
10576 uint64_t middle_offset, middle_length;
10577 uint64_t tail_offset, tail_length;
10578
10579 head_offset = offset;
10580 head_length = P2NPHASE(offset, min_alloc_size);
10581
10582 tail_offset = P2ALIGN(end, min_alloc_size);
10583 tail_length = P2PHASE(end, min_alloc_size);
10584
10585 middle_offset = head_offset + head_length;
10586 middle_length = length - head_length - tail_length;
10587
10588 if (head_length) {
10589 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
10590 }
10591
10592 if (middle_length) {
10593 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
10594 }
10595
10596 if (tail_length) {
10597 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
10598 }
10599 }
10600 }
10601
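// _choose_write_options derives per-write policy from the fadvise flags,
// the object's allocation hints and the pool options: whether to go
// through the buffer cache, whether and how eagerly to compress (none /
// passive / aggressive / force), the checksum block order, and the target
// blob size (larger for sequential, append-only workloads; at least
// 2 * min_alloc_size when compressing so a smaller extent can be allocated
// for the compressed result).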
10602 void BlueStore::_choose_write_options(
10603 CollectionRef& c,
10604 OnodeRef o,
10605 uint32_t fadvise_flags,
10606 WriteContext *wctx)
10607 {
10608 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10609 dout(20) << __func__ << " will do buffered write" << dendl;
10610 wctx->buffered = true;
10611 } else if (cct->_conf->bluestore_default_buffered_write &&
10612 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10613 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10614 dout(20) << __func__ << " defaulting to buffered write" << dendl;
10615 wctx->buffered = true;
10616 }
10617
10618 // apply basic csum block size
10619 wctx->csum_order = block_size_order;
10620
10621 // compression parameters
10622 unsigned alloc_hints = o->onode.alloc_hint_flags;
10623 auto cm = select_option(
10624 "compression_mode",
10625 comp_mode.load(),
10626 [&]() {
10627 string val;
10628 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
10629 return boost::optional<Compressor::CompressionMode>(
10630 Compressor::get_comp_mode_type(val));
10631 }
10632 return boost::optional<Compressor::CompressionMode>();
10633 }
10634 );
10635
10636 wctx->compress = (cm != Compressor::COMP_NONE) &&
10637 ((cm == Compressor::COMP_FORCE) ||
10638 (cm == Compressor::COMP_AGGRESSIVE &&
10639 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
10640 (cm == Compressor::COMP_PASSIVE &&
10641 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
10642
10643 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
10644 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
10645 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
10646 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
10647 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
10648
10649 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
10650
10651 if (o->onode.expected_write_size) {
10652 wctx->csum_order = std::max(min_alloc_size_order,
10653 (uint8_t)ctz(o->onode.expected_write_size));
10654 } else {
10655 wctx->csum_order = min_alloc_size_order;
10656 }
10657
10658 if (wctx->compress) {
10659 wctx->target_blob_size = select_option(
10660 "compression_max_blob_size",
10661 comp_max_blob_size.load(),
10662 [&]() {
10663 int val;
10664 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
10665 return boost::optional<uint64_t>((uint64_t)val);
10666 }
10667 return boost::optional<uint64_t>();
10668 }
10669 );
10670 }
10671 } else {
10672 if (wctx->compress) {
10673 wctx->target_blob_size = select_option(
10674 "compression_min_blob_size",
10675 comp_min_blob_size.load(),
10676 [&]() {
10677 int val;
10678 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
10679 return boost::optional<uint64_t>((uint64_t)val);
10680 }
10681 return boost::optional<uint64_t>();
10682 }
10683 );
10684 }
10685 }
10686
10687 uint64_t max_bsize = max_blob_size.load();
10688 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
10689 wctx->target_blob_size = max_bsize;
10690 }
10691
10692 // set the min blob size floor at 2x the min_alloc_size, or else we
10693 // won't be able to allocate a smaller extent for the compressed
10694 // data.
10695 if (wctx->compress &&
10696 wctx->target_blob_size < min_alloc_size * 2) {
10697 wctx->target_blob_size = min_alloc_size * 2;
10698 }
10699
10700 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
10701 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
10702 << " compress=" << (int)wctx->compress
10703 << " buffered=" << (int)wctx->buffered
10704 << std::dec << dendl;
10705 }
10706
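// _do_gc rewrites the ranges the GarbageCollector flagged as worth
// collecting: each range is read back and rewritten through a forked
// WriteContext (so blob/compression decisions are re-made), and the dirty
// range reported back to the caller is widened to cover whatever was
// touched.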
10707 int BlueStore::_do_gc(
10708 TransContext *txc,
10709 CollectionRef& c,
10710 OnodeRef o,
10711 const GarbageCollector& gc,
10712 const WriteContext& wctx,
10713 uint64_t *dirty_start,
10714 uint64_t *dirty_end)
10715 {
10716 auto& extents_to_collect = gc.get_extents_to_collect();
10717
10718 bool dirty_range_updated = false;
10719 WriteContext wctx_gc;
10720 wctx_gc.fork(wctx); // make a clone for garbage collection
10721
10722 for (auto it = extents_to_collect.begin();
10723 it != extents_to_collect.end();
10724 ++it) {
10725 bufferlist bl;
10726 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
10727 assert(r == (int)it->length);
10728
10729 o->extent_map.fault_range(db, it->offset, it->length);
10730 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
10731 logger->inc(l_bluestore_gc_merged, it->length);
10732
10733 if (*dirty_start > it->offset) {
10734 *dirty_start = it->offset;
10735 dirty_range_updated = true;
10736 }
10737
10738 if (*dirty_end < it->offset + it->length) {
10739 *dirty_end = it->offset + it->length;
10740 dirty_range_updated = true;
10741 }
10742 }
10743 if (dirty_range_updated) {
10744 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
10745 }
10746
10747 dout(30) << __func__ << " alloc write" << dendl;
10748 int r = _do_alloc_write(txc, c, o, &wctx_gc);
10749 if (r < 0) {
10750 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10751 << dendl;
10752 return r;
10753 }
10754
10755 _wctx_finish(txc, c, o, &wctx_gc);
10756 return 0;
10757 }
10758
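// _do_write is the top-level path for one (offset, length, bl) span:
// choose write options, fault in the affected extent-map range, stage the
// data (_do_write_data), allocate and queue IO (_do_alloc_write), estimate
// the benefit of collecting adjacent partially-dead blobs, release the
// displaced extents (_wctx_finish), extend the object size if needed, run
// GC when the estimated benefit reaches
// bluestore_gc_enable_total_threshold, and finally compress and dirty the
// touched extent-map range.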
10759 int BlueStore::_do_write(
10760 TransContext *txc,
10761 CollectionRef& c,
10762 OnodeRef o,
10763 uint64_t offset,
10764 uint64_t length,
10765 bufferlist& bl,
10766 uint32_t fadvise_flags)
10767 {
10768 int r = 0;
10769
10770 dout(20) << __func__
10771 << " " << o->oid
10772 << " 0x" << std::hex << offset << "~" << length
10773 << " - have 0x" << o->onode.size
10774 << " (" << std::dec << o->onode.size << ")"
10775 << " bytes"
10776 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
10777 << dendl;
10778 _dump_onode(o);
10779
10780 if (length == 0) {
10781 return 0;
10782 }
10783
10784 uint64_t end = offset + length;
10785
10786 GarbageCollector gc(c->store->cct);
10787 int64_t benefit;
10788 auto dirty_start = offset;
10789 auto dirty_end = end;
10790
10791 WriteContext wctx;
10792 _choose_write_options(c, o, fadvise_flags, &wctx);
10793 o->extent_map.fault_range(db, offset, length);
10794 _do_write_data(txc, c, o, offset, length, bl, &wctx);
10795 r = _do_alloc_write(txc, c, o, &wctx);
10796 if (r < 0) {
10797 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10798 << dendl;
10799 goto out;
10800 }
10801
10802 // NB: _wctx_finish() will empty old_extents
10803 // so we must do gc estimation before that
10804 benefit = gc.estimate(offset,
10805 length,
10806 o->extent_map,
10807 wctx.old_extents,
10808 min_alloc_size);
10809
10810 _wctx_finish(txc, c, o, &wctx);
10811 if (end > o->onode.size) {
10812 dout(20) << __func__ << " extending size to 0x" << std::hex << end
10813 << std::dec << dendl;
10814 o->onode.size = end;
10815 }
10816
10817 if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
10818 if (!gc.get_extents_to_collect().empty()) {
10819 dout(20) << __func__ << " perform garbage collection, "
10820 << "expected benefit = " << benefit << " AUs" << dendl;
10821 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
10822 if (r < 0) {
10823 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
10824 << dendl;
10825 goto out;
10826 }
10827 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
10828 << "~" << dirty_end - dirty_start << std::dec << dendl;
10829 }
10830 }
10831 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
10832 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
10833
10834 r = 0;
10835
10836 out:
10837 return r;
10838 }
10839
10840 int BlueStore::_write(TransContext *txc,
10841 CollectionRef& c,
10842 OnodeRef& o,
10843 uint64_t offset, size_t length,
10844 bufferlist& bl,
10845 uint32_t fadvise_flags)
10846 {
10847 dout(15) << __func__ << " " << c->cid << " " << o->oid
10848 << " 0x" << std::hex << offset << "~" << length << std::dec
10849 << dendl;
10850 int r = 0;
10851 if (offset + length >= OBJECT_MAX_SIZE) {
10852 r = -E2BIG;
10853 } else {
10854 _assign_nid(txc, o);
10855 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
10856 txc->write_onode(o);
10857 }
10858 dout(10) << __func__ << " " << c->cid << " " << o->oid
10859 << " 0x" << std::hex << offset << "~" << length << std::dec
10860 << " = " << r << dendl;
10861 return r;
10862 }
10863
10864 int BlueStore::_zero(TransContext *txc,
10865 CollectionRef& c,
10866 OnodeRef& o,
10867 uint64_t offset, size_t length)
10868 {
10869 dout(15) << __func__ << " " << c->cid << " " << o->oid
10870 << " 0x" << std::hex << offset << "~" << length << std::dec
10871 << dendl;
10872 int r = 0;
10873 if (offset + length >= OBJECT_MAX_SIZE) {
10874 r = -E2BIG;
10875 } else {
10876 _assign_nid(txc, o);
10877 r = _do_zero(txc, c, o, offset, length);
10878 }
10879 dout(10) << __func__ << " " << c->cid << " " << o->oid
10880 << " 0x" << std::hex << offset << "~" << length << std::dec
10881 << " = " << r << dendl;
10882 return r;
10883 }
10884
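// _do_zero is a metadata-only operation: the range is punched out of the
// extent map and the backing extents are released, so subsequent reads of
// the hole return zeros without any data having been written. The object
// size is extended if the zeroed range ends past the current EOF.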
10885 int BlueStore::_do_zero(TransContext *txc,
10886 CollectionRef& c,
10887 OnodeRef& o,
10888 uint64_t offset, size_t length)
10889 {
10890 dout(15) << __func__ << " " << c->cid << " " << o->oid
10891 << " 0x" << std::hex << offset << "~" << length << std::dec
10892 << dendl;
10893 int r = 0;
10894
10895 _dump_onode(o);
10896
10897 WriteContext wctx;
10898 o->extent_map.fault_range(db, offset, length);
10899 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10900 o->extent_map.dirty_range(offset, length);
10901 _wctx_finish(txc, c, o, &wctx);
10902
10903 if (length > 0 && offset + length > o->onode.size) {
10904 o->onode.size = offset + length;
10905 dout(20) << __func__ << " extending size to " << offset + length
10906 << dendl;
10907 }
10908 txc->write_onode(o);
10909
10910 dout(10) << __func__ << " " << c->cid << " " << o->oid
10911 << " 0x" << std::hex << offset << "~" << length << std::dec
10912 << " = " << r << dendl;
10913 return r;
10914 }
10915
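// _do_truncate punches a hole from the new size to the old EOF, releases
// the displaced extents, and, if any extent-map shards now start at or
// past the new EOF, requests a reshard so the shard layout is rebuilt for
// the smaller object.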
10916 void BlueStore::_do_truncate(
10917 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
10918 set<SharedBlob*> *maybe_unshared_blobs)
10919 {
10920 dout(15) << __func__ << " " << c->cid << " " << o->oid
10921 << " 0x" << std::hex << offset << std::dec << dendl;
10922
10923 _dump_onode(o, 30);
10924
10925 if (offset == o->onode.size)
10926 return;
10927
10928 if (offset < o->onode.size) {
10929 WriteContext wctx;
10930 uint64_t length = o->onode.size - offset;
10931 o->extent_map.fault_range(db, offset, length);
10932 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10933 o->extent_map.dirty_range(offset, length);
10934 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
10935
10936 // if we have shards past EOF, ask for a reshard
10937 if (!o->onode.extent_map_shards.empty() &&
10938 o->onode.extent_map_shards.back().offset >= offset) {
10939 dout(10) << __func__ << " request reshard past EOF" << dendl;
10940 if (offset) {
10941 o->extent_map.request_reshard(offset - 1, offset + length);
10942 } else {
10943 o->extent_map.request_reshard(0, length);
10944 }
10945 }
10946 }
10947
10948 o->onode.size = offset;
10949
10950 txc->write_onode(o);
10951 }
10952
10953 int BlueStore::_truncate(TransContext *txc,
10954 CollectionRef& c,
10955 OnodeRef& o,
10956 uint64_t offset)
10957 {
10958 dout(15) << __func__ << " " << c->cid << " " << o->oid
10959 << " 0x" << std::hex << offset << std::dec
10960 << dendl;
10961 int r = 0;
10962 if (offset >= OBJECT_MAX_SIZE) {
10963 r = -E2BIG;
10964 } else {
10965 _do_truncate(txc, c, o, offset);
10966 }
10967 dout(10) << __func__ << " " << c->cid << " " << o->oid
10968 << " 0x" << std::hex << offset << std::dec
10969 << " = " << r << dendl;
10970 return r;
10971 }
10972
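// _do_remove truncates the object to zero, clears its omap and extent-map
// shards, and removes its onode key. If the removed object was a clone
// (generation != NO_GEN), any shared blobs whose last external reference
// it held are un-shared on the surviving head object so future overwrites
// can skip the shared-blob bookkeeping.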
10973 int BlueStore::_do_remove(
10974 TransContext *txc,
10975 CollectionRef& c,
10976 OnodeRef o)
10977 {
10978 set<SharedBlob*> maybe_unshared_blobs;
10979 bool is_gen = !o->oid.is_no_gen();
10980 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
10981 if (o->onode.has_omap()) {
10982 o->flush();
10983 _do_omap_clear(txc, o->onode.nid);
10984 }
10985 o->exists = false;
10986 string key;
10987 for (auto &s : o->extent_map.shards) {
10988 dout(20) << __func__ << " removing shard 0x" << std::hex
10989 << s.shard_info->offset << std::dec << dendl;
10990 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
10991 [&](const string& final_key) {
10992 txc->t->rmkey(PREFIX_OBJ, final_key);
10993 }
10994 );
10995 }
10996 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
10997 txc->note_removed_object(o);
10998 o->extent_map.clear();
10999 o->onode = bluestore_onode_t();
11000 _debug_obj_on_delete(o->oid);
11001
11002 if (!is_gen || maybe_unshared_blobs.empty()) {
11003 return 0;
11004 }
11005
11006 // see if we can unshare blobs still referenced by the head
11007 dout(10) << __func__ << " gen and maybe_unshared_blobs "
11008 << maybe_unshared_blobs << dendl;
11009 ghobject_t nogen = o->oid;
11010 nogen.generation = ghobject_t::NO_GEN;
11011 OnodeRef h = c->onode_map.lookup(nogen);
11012
11013 if (!h || !h->exists) {
11014 return 0;
11015 }
11016
11017 dout(20) << __func__ << " checking for unshareable blobs on " << h
11018 << " " << h->oid << dendl;
11019 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
11020 for (auto& e : h->extent_map.extent_map) {
11021 const bluestore_blob_t& b = e.blob->get_blob();
11022 SharedBlob *sb = e.blob->shared_blob.get();
11023 if (b.is_shared() &&
11024 sb->loaded &&
11025 maybe_unshared_blobs.count(sb)) {
11026 if (b.is_compressed()) {
11027 expect[sb].get(0, b.get_ondisk_length());
11028 } else {
11029 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
11030 expect[sb].get(off, len);
11031 return 0;
11032 });
11033 }
11034 }
11035 }
11036
11037 vector<SharedBlob*> unshared_blobs;
11038 unshared_blobs.reserve(maybe_unshared_blobs.size());
11039 for (auto& p : expect) {
11040 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
11041 if (p.first->persistent->ref_map == p.second) {
11042 SharedBlob *sb = p.first;
11043 dout(20) << __func__ << " unsharing " << *sb << dendl;
11044 unshared_blobs.push_back(sb);
11045 txc->unshare_blob(sb);
11046 uint64_t sbid = c->make_blob_unshared(sb);
11047 string key;
11048 get_shared_blob_key(sbid, &key);
11049 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
11050 }
11051 }
11052
11053 if (unshared_blobs.empty()) {
11054 return 0;
11055 }
11056
11057 for (auto& e : h->extent_map.extent_map) {
11058 const bluestore_blob_t& b = e.blob->get_blob();
11059 SharedBlob *sb = e.blob->shared_blob.get();
11060 if (b.is_shared() &&
11061 std::find(unshared_blobs.begin(), unshared_blobs.end(),
11062 sb) != unshared_blobs.end()) {
11063 dout(20) << __func__ << " unsharing " << e << dendl;
11064 bluestore_blob_t& blob = e.blob->dirty_blob();
11065 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
11066 h->extent_map.dirty_range(e.logical_offset, 1);
11067 }
11068 }
11069 txc->write_onode(h);
11070
11071 return 0;
11072 }
11073
11074 int BlueStore::_remove(TransContext *txc,
11075 CollectionRef& c,
11076 OnodeRef &o)
11077 {
11078 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11079 int r = _do_remove(txc, c, o);
11080 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11081 return r;
11082 }
11083
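// note: in _setattr/_setattrs below, partial bufferptrs are copied into a
// fresh buffer before being stored in onode.attrs, presumably so the attr
// does not pin a larger underlying buffer; either way the result is
// reassigned to the bluestore cache mempool.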
11084 int BlueStore::_setattr(TransContext *txc,
11085 CollectionRef& c,
11086 OnodeRef& o,
11087 const string& name,
11088 bufferptr& val)
11089 {
11090 dout(15) << __func__ << " " << c->cid << " " << o->oid
11091 << " " << name << " (" << val.length() << " bytes)"
11092 << dendl;
11093 int r = 0;
11094 if (val.is_partial()) {
11095 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
11096 val.length());
11097 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11098 } else {
11099 auto& b = o->onode.attrs[name.c_str()] = val;
11100 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11101 }
11102 txc->write_onode(o);
11103 dout(10) << __func__ << " " << c->cid << " " << o->oid
11104 << " " << name << " (" << val.length() << " bytes)"
11105 << " = " << r << dendl;
11106 return r;
11107 }
11108
11109 int BlueStore::_setattrs(TransContext *txc,
11110 CollectionRef& c,
11111 OnodeRef& o,
11112 const map<string,bufferptr>& aset)
11113 {
11114 dout(15) << __func__ << " " << c->cid << " " << o->oid
11115 << " " << aset.size() << " keys"
11116 << dendl;
11117 int r = 0;
11118 for (map<string,bufferptr>::const_iterator p = aset.begin();
11119 p != aset.end(); ++p) {
11120 if (p->second.is_partial()) {
11121 auto& b = o->onode.attrs[p->first.c_str()] =
11122 bufferptr(p->second.c_str(), p->second.length());
11123 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11124 } else {
11125 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
11126 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11127 }
11128 }
11129 txc->write_onode(o);
11130 dout(10) << __func__ << " " << c->cid << " " << o->oid
11131 << " " << aset.size() << " keys"
11132 << " = " << r << dendl;
11133 return r;
11134 }
11135
11136
11137 int BlueStore::_rmattr(TransContext *txc,
11138 CollectionRef& c,
11139 OnodeRef& o,
11140 const string& name)
11141 {
11142 dout(15) << __func__ << " " << c->cid << " " << o->oid
11143 << " " << name << dendl;
11144 int r = 0;
11145 auto it = o->onode.attrs.find(name.c_str());
11146 if (it == o->onode.attrs.end())
11147 goto out;
11148
11149 o->onode.attrs.erase(it);
11150 txc->write_onode(o);
11151
11152 out:
11153 dout(10) << __func__ << " " << c->cid << " " << o->oid
11154 << " " << name << " = " << r << dendl;
11155 return r;
11156 }
11157
11158 int BlueStore::_rmattrs(TransContext *txc,
11159 CollectionRef& c,
11160 OnodeRef& o)
11161 {
11162 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11163 int r = 0;
11164
11165 if (o->onode.attrs.empty())
11166 goto out;
11167
11168 o->onode.attrs.clear();
11169 txc->write_onode(o);
11170
11171 out:
11172 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11173 return r;
11174 }
11175
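// note: omap keys for an onode all share a prefix derived from its nid, so
// clearing omap is an iteration over [omap header key, omap tail key) in
// PREFIX_OMAP, issuing an rmkey for each entry found.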
11176 void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
11177 {
11178 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
11179 string prefix, tail;
11180 get_omap_header(id, &prefix);
11181 get_omap_tail(id, &tail);
11182 it->lower_bound(prefix);
11183 while (it->valid()) {
11184 if (it->key() >= tail) {
11185 dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
11186 << dendl;
11187 break;
11188 }
11189 txc->t->rmkey(PREFIX_OMAP, it->key());
11190 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
11191 it->next();
11192 }
11193 }
11194
11195 int BlueStore::_omap_clear(TransContext *txc,
11196 CollectionRef& c,
11197 OnodeRef& o)
11198 {
11199 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11200 int r = 0;
11201 if (o->onode.has_omap()) {
11202 o->flush();
11203 _do_omap_clear(txc, o->onode.nid);
11204 o->onode.clear_omap_flag();
11205 txc->write_onode(o);
11206 }
11207 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11208 return r;
11209 }
11210
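// note: omap keys are laid out as <8-byte encoded nid> + '.' + <user key>;
// the final_key.resize(9) below trims final_key back to that 9-byte prefix
// (encoded nid plus the '.' separator) before appending each user key.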
11211 int BlueStore::_omap_setkeys(TransContext *txc,
11212 CollectionRef& c,
11213 OnodeRef& o,
11214 bufferlist &bl)
11215 {
11216 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11217 int r;
11218 bufferlist::iterator p = bl.begin();
11219 __u32 num;
11220 if (!o->onode.has_omap()) {
11221 o->onode.set_omap_flag();
11222 txc->write_onode(o);
11223 } else {
11224 txc->note_modified_object(o);
11225 }
11226 string final_key;
11227 _key_encode_u64(o->onode.nid, &final_key);
11228 final_key.push_back('.');
11229 ::decode(num, p);
11230 while (num--) {
11231 string key;
11232 bufferlist value;
11233 ::decode(key, p);
11234 ::decode(value, p);
11235 final_key.resize(9); // keep prefix
11236 final_key += key;
11237 dout(30) << __func__ << " " << pretty_binary_string(final_key)
11238 << " <- " << key << dendl;
11239 txc->t->set(PREFIX_OMAP, final_key, value);
11240 }
11241 r = 0;
11242 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11243 return r;
11244 }
11245
11246 int BlueStore::_omap_setheader(TransContext *txc,
11247 CollectionRef& c,
11248 OnodeRef &o,
11249 bufferlist& bl)
11250 {
11251 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11252 int r;
11253 string key;
11254 if (!o->onode.has_omap()) {
11255 o->onode.set_omap_flag();
11256 txc->write_onode(o);
11257 } else {
11258 txc->note_modified_object(o);
11259 }
11260 get_omap_header(o->onode.nid, &key);
11261 txc->t->set(PREFIX_OMAP, key, bl);
11262 r = 0;
11263 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11264 return r;
11265 }
11266
11267 int BlueStore::_omap_rmkeys(TransContext *txc,
11268 CollectionRef& c,
11269 OnodeRef& o,
11270 bufferlist& bl)
11271 {
11272 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11273 int r = 0;
11274 bufferlist::iterator p = bl.begin();
11275 __u32 num;
11276 string final_key;
11277
11278 if (!o->onode.has_omap()) {
11279 goto out;
11280 }
11281 _key_encode_u64(o->onode.nid, &final_key);
11282 final_key.push_back('.');
11283 ::decode(num, p);
11284 while (num--) {
11285 string key;
11286 ::decode(key, p);
11287 final_key.resize(9); // keep prefix
11288 final_key += key;
11289 dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
11290 << " <- " << key << dendl;
11291 txc->t->rmkey(PREFIX_OMAP, final_key);
11292 }
11293 txc->note_modified_object(o);
11294
11295 out:
11296 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11297 return r;
11298 }
11299
11300 int BlueStore::_omap_rmkey_range(TransContext *txc,
11301 CollectionRef& c,
11302 OnodeRef& o,
11303 const string& first, const string& last)
11304 {
11305 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
11306 KeyValueDB::Iterator it;
11307 string key_first, key_last;
11308 int r = 0;
11309 if (!o->onode.has_omap()) {
11310 goto out;
11311 }
11312 o->flush();
11313 it = db->get_iterator(PREFIX_OMAP);
11314 get_omap_key(o->onode.nid, first, &key_first);
11315 get_omap_key(o->onode.nid, last, &key_last);
11316 it->lower_bound(key_first);
11317 while (it->valid()) {
11318 if (it->key() >= key_last) {
11319 dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
11320 << dendl;
11321 break;
11322 }
11323 txc->t->rmkey(PREFIX_OMAP, it->key());
11324 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
11325 it->next();
11326 }
11327 txc->note_modified_object(o);
11328
11329 out:
11330 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11331 return r;
11332 }
11333
11334 int BlueStore::_set_alloc_hint(
11335 TransContext *txc,
11336 CollectionRef& c,
11337 OnodeRef& o,
11338 uint64_t expected_object_size,
11339 uint64_t expected_write_size,
11340 uint32_t flags)
11341 {
11342 dout(15) << __func__ << " " << c->cid << " " << o->oid
11343 << " object_size " << expected_object_size
11344 << " write_size " << expected_write_size
11345 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11346 << dendl;
11347 int r = 0;
11348 o->onode.expected_object_size = expected_object_size;
11349 o->onode.expected_write_size = expected_write_size;
11350 o->onode.alloc_hint_flags = flags;
11351 txc->write_onode(o);
11352 dout(10) << __func__ << " " << c->cid << " " << o->oid
11353 << " object_size " << expected_object_size
11354 << " write_size " << expected_write_size
11355 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11356 << " = " << r << dendl;
11357 return r;
11358 }
11359
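// note: with bluestore_clone_cow the data is cloned by sharing blobs via
// _do_clone_range; otherwise we fall back to a full read of the source and a
// rewrite into the destination. attrs are copied wholesale, and omap data is
// copied by rewriting each key from the old nid to the new one.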
11360 int BlueStore::_clone(TransContext *txc,
11361 CollectionRef& c,
11362 OnodeRef& oldo,
11363 OnodeRef& newo)
11364 {
11365 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11366 << newo->oid << dendl;
11367 int r = 0;
11368 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
11369 derr << __func__ << " mismatched hash on " << oldo->oid
11370 << " and " << newo->oid << dendl;
11371 return -EINVAL;
11372 }
11373
11374 _assign_nid(txc, newo);
11375
11376 // clone data
11377 oldo->flush();
11378 _do_truncate(txc, c, newo, 0);
11379 if (cct->_conf->bluestore_clone_cow) {
11380 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
11381 } else {
11382 bufferlist bl;
11383 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
11384 if (r < 0)
11385 goto out;
11386 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
11387 if (r < 0)
11388 goto out;
11389 }
11390
11391 // clone attrs
11392 newo->onode.attrs = oldo->onode.attrs;
11393
11394 // clone omap
11395 if (newo->onode.has_omap()) {
11396 dout(20) << __func__ << " clearing old omap data" << dendl;
11397 newo->flush();
11398 _do_omap_clear(txc, newo->onode.nid);
11399 }
11400 if (oldo->onode.has_omap()) {
11401 dout(20) << __func__ << " copying omap data" << dendl;
11402 if (!newo->onode.has_omap()) {
11403 newo->onode.set_omap_flag();
11404 }
11405 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
11406 string head, tail;
11407 get_omap_header(oldo->onode.nid, &head);
11408 get_omap_tail(oldo->onode.nid, &tail);
11409 it->lower_bound(head);
11410 while (it->valid()) {
11411 if (it->key() >= tail) {
11412 dout(30) << __func__ << " reached tail" << dendl;
11413 break;
11414 } else {
11415 dout(30) << __func__ << " got header/data "
11416 << pretty_binary_string(it->key()) << dendl;
11417 string key;
11418 rewrite_omap_key(newo->onode.nid, it->key(), &key);
11419 txc->t->set(PREFIX_OMAP, key, it->value());
11420 }
11421 it->next();
11422 }
11423 } else {
11424 newo->onode.clear_omap_flag();
11425 }
11426
11427 txc->write_onode(newo);
11428 r = 0;
11429
11430 out:
11431 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11432 << newo->oid << " = " << r << dendl;
11433 return r;
11434 }
11435
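// note: each source blob in the range is marked shared (getting a shared blob
// id) if it isn't already, duplicated once per blob (tracked via
// last_encoded_id), and its extent refs are bumped; the destination then gets
// new Extents pointing at the duplicated blobs, shifted by dstoff - srcoff.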
11436 int BlueStore::_do_clone_range(
11437 TransContext *txc,
11438 CollectionRef& c,
11439 OnodeRef& oldo,
11440 OnodeRef& newo,
11441 uint64_t srcoff,
11442 uint64_t length,
11443 uint64_t dstoff)
11444 {
11445 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11446 << newo->oid
11447 << " 0x" << std::hex << srcoff << "~" << length << " -> "
11448 << "0x" << dstoff << "~" << length << std::dec << dendl;
11449 oldo->extent_map.fault_range(db, srcoff, length);
11450 newo->extent_map.fault_range(db, dstoff, length);
11451 _dump_onode(oldo);
11452 _dump_onode(newo);
11453
11454 // hmm, this could go into an ExtentMap::dup() method.
11455 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
11456 for (auto &e : oldo->extent_map.extent_map) {
11457 e.blob->last_encoded_id = -1;
11458 }
11459 int n = 0;
11460 uint64_t end = srcoff + length;
11461 uint32_t dirty_range_begin = 0;
11462 uint32_t dirty_range_end = 0;
11463 bool src_dirty = false;
11464 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
11465 ep != oldo->extent_map.extent_map.end();
11466 ++ep) {
11467 auto& e = *ep;
11468 if (e.logical_offset >= end) {
11469 break;
11470 }
11471 dout(20) << __func__ << " src " << e << dendl;
11472 BlobRef cb;
11473 bool blob_duped = true;
11474 if (e.blob->last_encoded_id >= 0) {
11475 // blob is already duped
11476 cb = id_to_blob[e.blob->last_encoded_id];
11477 blob_duped = false;
11478 } else {
11479 // dup the blob
11480 const bluestore_blob_t& blob = e.blob->get_blob();
11481 // make sure it is shared
11482 if (!blob.is_shared()) {
11483 c->make_blob_shared(_assign_blobid(txc), e.blob);
11484 if (!src_dirty) {
11485 src_dirty = true;
11486 dirty_range_begin = e.logical_offset;
11487 }
11488 assert(e.logical_end() > 0);
11489 // -1 to exclude next potential shard
11490 dirty_range_end = e.logical_end() - 1;
11491 } else {
11492 c->load_shared_blob(e.blob->shared_blob);
11493 }
11494 cb = new Blob();
11495 e.blob->last_encoded_id = n;
11496 id_to_blob[n] = cb;
11497 e.blob->dup(*cb);
11498 // bump the extent refs on the copied blob's extents
11499 for (auto p : blob.get_extents()) {
11500 if (p.is_valid()) {
11501 e.blob->shared_blob->get_ref(p.offset, p.length);
11502 }
11503 }
11504 txc->write_shared_blob(e.blob->shared_blob);
11505 dout(20) << __func__ << " new " << *cb << dendl;
11506 }
11507 // dup extent
11508 int skip_front, skip_back;
11509 if (e.logical_offset < srcoff) {
11510 skip_front = srcoff - e.logical_offset;
11511 } else {
11512 skip_front = 0;
11513 }
11514 if (e.logical_end() > end) {
11515 skip_back = e.logical_end() - end;
11516 } else {
11517 skip_back = 0;
11518 }
11519 Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
11520 e.blob_offset + skip_front,
11521 e.length - skip_front - skip_back, cb);
11522 newo->extent_map.extent_map.insert(*ne);
11523 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
11524 // fixme: we may leave parts of new blob unreferenced that could
11525 // be freed (relative to the shared_blob).
11526 txc->statfs_delta.stored() += ne->length;
11527 if (e.blob->get_blob().is_compressed()) {
11528 txc->statfs_delta.compressed_original() += ne->length;
11529 if (blob_duped){
11530 txc->statfs_delta.compressed() +=
11531 cb->get_blob().get_compressed_payload_length();
11532 }
11533 }
11534 dout(20) << __func__ << " dst " << *ne << dendl;
11535 ++n;
11536 }
11537 if (src_dirty) {
11538 oldo->extent_map.dirty_range(dirty_range_begin,
11539 dirty_range_end - dirty_range_begin);
11540 txc->write_onode(oldo);
11541 }
11542 txc->write_onode(newo);
11543
11544 if (dstoff + length > newo->onode.size) {
11545 newo->onode.size = dstoff + length;
11546 }
11547 newo->extent_map.dirty_range(dstoff, length);
11548 _dump_onode(oldo);
11549 _dump_onode(newo);
11550 return 0;
11551 }
11552
11553 int BlueStore::_clone_range(TransContext *txc,
11554 CollectionRef& c,
11555 OnodeRef& oldo,
11556 OnodeRef& newo,
11557 uint64_t srcoff, uint64_t length, uint64_t dstoff)
11558 {
11559 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11560 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11561 << " to offset 0x" << dstoff << std::dec << dendl;
11562 int r = 0;
11563
11564 if (srcoff + length >= OBJECT_MAX_SIZE ||
11565 dstoff + length >= OBJECT_MAX_SIZE) {
11566 r = -E2BIG;
11567 goto out;
11568 }
11569 if (srcoff + length > oldo->onode.size) {
11570 r = -EINVAL;
11571 goto out;
11572 }
11573
11574 _assign_nid(txc, newo);
11575
11576 if (length > 0) {
11577 if (cct->_conf->bluestore_clone_cow) {
11578 _do_zero(txc, c, newo, dstoff, length);
11579 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
11580 } else {
11581 bufferlist bl;
11582 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
11583 if (r < 0)
11584 goto out;
11585 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
11586 if (r < 0)
11587 goto out;
11588 }
11589 }
11590
11591 txc->write_onode(newo);
11592 r = 0;
11593
11594 out:
11595 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11596 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11597 << " to offset 0x" << dstoff << std::dec
11598 << " = " << r << dendl;
11599 return r;
11600 }
11601
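// note: rename reuses the existing Onode: the old object key and its extent
// shard keys are deleted, the shards are marked dirty so they are rewritten
// under the new key, and the cache entry is moved to the new oid.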
11602 int BlueStore::_rename(TransContext *txc,
11603 CollectionRef& c,
11604 OnodeRef& oldo,
11605 OnodeRef& newo,
11606 const ghobject_t& new_oid)
11607 {
11608 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11609 << new_oid << dendl;
11610 int r;
11611 ghobject_t old_oid = oldo->oid;
11612 mempool::bluestore_cache_other::string new_okey;
11613
11614 if (newo) {
11615 if (newo->exists) {
11616 r = -EEXIST;
11617 goto out;
11618 }
11619 assert(txc->onodes.count(newo) == 0);
11620 }
11621
11622 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
11623
11624 // rewrite shards
11625 {
11626 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
11627 get_object_key(cct, new_oid, &new_okey);
11628 string key;
11629 for (auto &s : oldo->extent_map.shards) {
11630 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
11631 [&](const string& final_key) {
11632 txc->t->rmkey(PREFIX_OBJ, final_key);
11633 }
11634 );
11635 s.dirty = true;
11636 }
11637 }
11638
11639 newo = oldo;
11640 txc->write_onode(newo);
11641
11642 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
11643 // Onode in the old slot
11644 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
11645 r = 0;
11646
11647 // hold a ref to new Onode in old name position, to ensure we don't drop
11648 // it from the cache before this txc commits (or else someone may come along
11649 // and read newo's metadata via the old name).
11650 txc->note_modified_object(oldo);
11651
11652 out:
11653 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
11654 << new_oid << " = " << r << dendl;
11655 return r;
11656 }
11657
11658 // collections
11659
11660 int BlueStore::_create_collection(
11661 TransContext *txc,
11662 const coll_t &cid,
11663 unsigned bits,
11664 CollectionRef *c)
11665 {
11666 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
11667 int r;
11668 bufferlist bl;
11669
11670 {
11671 RWLock::WLocker l(coll_lock);
11672 if (*c) {
11673 r = -EEXIST;
11674 goto out;
11675 }
11676 c->reset(
11677 new Collection(
11678 this,
11679 cache_shards[cid.hash_to_shard(cache_shards.size())],
11680 cid));
11681 (*c)->cnode.bits = bits;
11682 coll_map[cid] = *c;
11683 }
11684 ::encode((*c)->cnode, bl);
11685 txc->t->set(PREFIX_COLL, stringify(cid), bl);
11686 r = 0;
11687
11688 out:
11689 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
11690 return r;
11691 }
11692
11693 int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
11694 CollectionRef *c)
11695 {
11696 dout(15) << __func__ << " " << cid << dendl;
11697 int r;
11698
11699 {
11700 RWLock::WLocker l(coll_lock);
11701 if (!*c) {
11702 r = -ENOENT;
11703 goto out;
11704 }
11705 size_t nonexistent_count = 0;
11706 assert((*c)->exists);
11707 if ((*c)->onode_map.map_any([&](OnodeRef o) {
11708 if (o->exists) {
11709 dout(10) << __func__ << " " << o->oid << " " << o
11710 << " exists in onode_map" << dendl;
11711 return true;
11712 }
11713 ++nonexistent_count;
11714 return false;
11715 })) {
11716 r = -ENOTEMPTY;
11717 goto out;
11718 }
11719
11720 vector<ghobject_t> ls;
11721 ghobject_t next;
11722 // Enumerate onodes in db, up to nonexistent_count + 1
11723 // then check if all of them are marked as non-existent.
11724 // Bypass the check if returned number is greater than nonexistent_count
11725 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
11726 nonexistent_count + 1, &ls, &next);
11727 if (r >= 0) {
11728 bool exists = false; //ls.size() > nonexistent_count;
11729 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
11730 dout(10) << __func__ << " oid " << *it << dendl;
11731 auto onode = (*c)->onode_map.lookup(*it);
11732 exists = !onode || onode->exists;
11733 if (exists) {
11734 dout(10) << __func__ << " " << *it
11735 << " exists in db" << dendl;
11736 }
11737 }
11738 if (!exists) {
11739 coll_map.erase(cid);
11740 txc->removed_collections.push_back(*c);
11741 (*c)->exists = false;
11742 c->reset();
11743 txc->t->rmkey(PREFIX_COLL, stringify(cid));
11744 r = 0;
11745 } else {
11746 dout(10) << __func__ << " " << cid
11747 << " is non-empty" << dendl;
11748 r = -ENOTEMPTY;
11749 }
11750 }
11751 }
11752
11753 out:
11754 dout(10) << __func__ << " " << cid << " = " << r << dendl;
11755 return r;
11756 }
11757
11758 int BlueStore::_split_collection(TransContext *txc,
11759 CollectionRef& c,
11760 CollectionRef& d,
11761 unsigned bits, int rem)
11762 {
11763 dout(15) << __func__ << " " << c->cid << " to " << d->cid
11764 << " bits " << bits << dendl;
11765 RWLock::WLocker l(c->lock);
11766 RWLock::WLocker l2(d->lock);
11767 int r;
11768
11769 // flush all previous deferred writes on this sequencer. this is a bit
11770 // heavyweight, but we need to make sure all deferred writes complete
11771 // before we split as the new collection's sequencer may need to order
11772 // this after those writes, and we don't bother with the complexity of
11773 // moving those TransContexts over to the new osr.
11774 _osr_drain_preceding(txc);
11775
11776 // move any cached items (onodes and referenced shared blobs) that will
11777 // belong to the child collection post-split. leave everything else behind.
11778 // this may include things that don't strictly belong to the now-smaller
11779 // parent split, but the OSD will always send us a split for every new
11780 // child.
11781
11782 spg_t pgid, dest_pgid;
11783 bool is_pg = c->cid.is_pg(&pgid);
11784 assert(is_pg);
11785 is_pg = d->cid.is_pg(&dest_pgid);
11786 assert(is_pg);
11787
11788 // the destination should initially be empty.
11789 assert(d->onode_map.empty());
11790 assert(d->shared_blob_set.empty());
11791 assert(d->cnode.bits == bits);
11792
11793 c->split_cache(d.get());
11794
11795 // adjust bits. note that this will be redundant for all but the first
11796 // split call for this parent (first child).
11797 c->cnode.bits = bits;
11798 assert(d->cnode.bits == bits);
11799 r = 0;
11800
11801 bufferlist bl;
11802 ::encode(c->cnode, bl);
11803 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
11804
11805 dout(10) << __func__ << " " << c->cid << " to " << d->cid
11806 << " bits " << bits << " = " << r << dendl;
11807 return r;
11808 }
11809
11810 // DB key value Histogram
11811 #define KEY_SLAB 32
11812 #define VALUE_SLAB 64
11813
11814 const string prefix_onode = "o";
11815 const string prefix_onode_shard = "x";
11816 const string prefix_other = "Z";
11817
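// note: key and value sizes are bucketed into fixed-width slabs (KEY_SLAB and
// VALUE_SLAB bytes wide); slab n covers the byte range [n*SLAB, (n+1)*SLAB).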
11818 int BlueStore::DBHistogram::get_key_slab(size_t sz)
11819 {
11820 return (sz/KEY_SLAB);
11821 }
11822
11823 string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
11824 {
11825 int lower_bound = slab * KEY_SLAB;
11826 int upper_bound = (slab + 1) * KEY_SLAB;
11827 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11828 return ret;
11829 }
11830
11831 int BlueStore::DBHistogram::get_value_slab(size_t sz)
11832 {
11833 return (sz/VALUE_SLAB);
11834 }
11835
11836 string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
11837 {
11838 int lower_bound = slab * VALUE_SLAB;
11839 int upper_bound = (slab + 1) * VALUE_SLAB;
11840 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11841 return ret;
11842 }
11843
11844 void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
11845 const string &prefix, size_t key_size, size_t value_size)
11846 {
11847 uint32_t key_slab = get_key_slab(key_size);
11848 uint32_t value_slab = get_value_slab(value_size);
11849 key_hist[prefix][key_slab].count++;
11850 key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len);
11851 key_hist[prefix][key_slab].val_map[value_slab].count++;
11852 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11853 MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
11854 }
11855
11856 void BlueStore::DBHistogram::dump(Formatter *f)
11857 {
11858 f->open_object_section("rocksdb_value_distribution");
11859 for (auto i : value_hist) {
11860 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
11861 }
11862 f->close_section();
11863
11864 f->open_object_section("rocksdb_key_value_histogram");
11865 for (auto i : key_hist) {
11866 f->dump_string("prefix", i.first);
11867 f->open_object_section("key_hist");
11868 for ( auto k : i.second) {
11869 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
11870 f->dump_unsigned("max_len", k.second.max_len);
11871 f->open_object_section("value_hist");
11872 for ( auto j : k.second.val_map) {
11873 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
11874 f->dump_unsigned("max_len", j.second.max_len);
11875 }
11876 f->close_section();
11877 }
11878 f->close_section();
11879 }
11880 f->close_section();
11881 }
11882
11883 // Iterates through the db and collects the stats
11884 void BlueStore::generate_db_histogram(Formatter *f)
11885 {
11886 //globals
11887 uint64_t num_onodes = 0;
11888 uint64_t num_shards = 0;
11889 uint64_t num_super = 0;
11890 uint64_t num_coll = 0;
11891 uint64_t num_omap = 0;
11892 uint64_t num_deferred = 0;
11893 uint64_t num_alloc = 0;
11894 uint64_t num_stat = 0;
11895 uint64_t num_others = 0;
11896 uint64_t num_shared_shards = 0;
11897 size_t max_key_size = 0, max_value_size = 0;
11898 uint64_t total_key_size = 0, total_value_size = 0;
11899 size_t key_size = 0, value_size = 0;
11900 DBHistogram hist;
11901
11902 utime_t start = ceph_clock_now();
11903
11904 KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
11905 iter->seek_to_first();
11906 while (iter->valid()) {
11907 dout(30) << __func__ << " Key: " << iter->key() << dendl;
11908 key_size = iter->key_size();
11909 value_size = iter->value_size();
11910 hist.value_hist[hist.get_value_slab(value_size)]++;
11911 max_key_size = MAX(max_key_size, key_size);
11912 max_value_size = MAX(max_value_size, value_size);
11913 total_key_size += key_size;
11914 total_value_size += value_size;
11915
11916 pair<string,string> key(iter->raw_key());
11917
11918 if (key.first == PREFIX_SUPER) {
11919 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
11920 num_super++;
11921 } else if (key.first == PREFIX_STAT) {
11922 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
11923 num_stat++;
11924 } else if (key.first == PREFIX_COLL) {
11925 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
11926 num_coll++;
11927 } else if (key.first == PREFIX_OBJ) {
11928 if (key.second.back() == ONODE_KEY_SUFFIX) {
11929 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
11930 num_onodes++;
11931 } else {
11932 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
11933 num_shards++;
11934 }
11935 } else if (key.first == PREFIX_OMAP) {
11936 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
11937 num_omap++;
11938 } else if (key.first == PREFIX_DEFERRED) {
11939 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
11940 num_deferred++;
11941 } else if (key.first == PREFIX_ALLOC || key.first == "b" ) {
11942 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
11943 num_alloc++;
11944 } else if (key.first == PREFIX_SHARED_BLOB) {
11945 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
11946 num_shared_shards++;
11947 } else {
11948 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
11949 num_others++;
11950 }
11951 iter->next();
11952 }
11953
11954 utime_t duration = ceph_clock_now() - start;
11955 f->open_object_section("rocksdb_key_value_stats");
11956 f->dump_unsigned("num_onodes", num_onodes);
11957 f->dump_unsigned("num_shards", num_shards);
11958 f->dump_unsigned("num_super", num_super);
11959 f->dump_unsigned("num_coll", num_coll);
11960 f->dump_unsigned("num_omap", num_omap);
11961 f->dump_unsigned("num_deferred", num_deferred);
11962 f->dump_unsigned("num_alloc", num_alloc);
11963 f->dump_unsigned("num_stat", num_stat);
11964 f->dump_unsigned("num_shared_shards", num_shared_shards);
11965 f->dump_unsigned("num_others", num_others);
11966 f->dump_unsigned("max_key_size", max_key_size);
11967 f->dump_unsigned("max_value_size", max_value_size);
11968 f->dump_unsigned("total_key_size", total_key_size);
11969 f->dump_unsigned("total_value_size", total_value_size);
11970 f->close_section();
11971
11972 hist.dump(f);
11973
11974 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
11975
11976 }
11977
11978 void BlueStore::_flush_cache()
11979 {
11980 dout(10) << __func__ << dendl;
11981 for (auto i : cache_shards) {
11982 i->trim_all();
11983 assert(i->empty());
11984 }
11985 for (auto& p : coll_map) {
11986 if (!p.second->onode_map.empty()) {
11987 derr << __func__ << " stray onodes on " << p.first << dendl;
11988 p.second->onode_map.dump(cct, 0);
11989 }
11990 if (!p.second->shared_blob_set.empty()) {
11991 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11992 p.second->shared_blob_set.dump(cct, 0);
11993 }
11994 assert(p.second->onode_map.empty());
11995 assert(p.second->shared_blob_set.empty());
11996 }
11997 coll_map.clear();
11998 }
11999
12000 // For external callers.
12001 // Unlike _flush_cache(), we use a best-effort policy here:
12002 // we don't care if some pinned onodes/data remain in the cache
12003 // after this command completes.
12004 void BlueStore::flush_cache()
12005 {
12006 dout(10) << __func__ << dendl;
12007 for (auto i : cache_shards) {
12008 i->trim_all();
12009 }
12010 }
12011
12012 void BlueStore::_apply_padding(uint64_t head_pad,
12013 uint64_t tail_pad,
12014 bufferlist& padded)
12015 {
12016 if (head_pad) {
12017 padded.prepend_zero(head_pad);
12018 }
12019 if (tail_pad) {
12020 padded.append_zero(tail_pad);
12021 }
12022 if (head_pad || tail_pad) {
12023 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
12024 << " tail 0x" << tail_pad << std::dec << dendl;
12025 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
12026 }
12027 }
12028
12029 // ===========================================