[ceph.git] / ceph / src / os / bluestore / BlueStore.cc (update sources to 12.2.7)
1 // vim: ts=8 sw=2 smarttab
2 /*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2014 Red Hat
6 *
7 * This is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License version 2.1, as published by the Free Software
10 * Foundation. See file COPYING.
11 *
12 */
13
14 #include <unistd.h>
15 #include <stdlib.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <fcntl.h>
19
20 #include "include/cpp-btree/btree_set.h"
21
22 #include "BlueStore.h"
23 #include "os/kv.h"
24 #include "include/compat.h"
25 #include "include/intarith.h"
26 #include "include/stringify.h"
27 #include "common/errno.h"
28 #include "common/safe_io.h"
29 #include "Allocator.h"
30 #include "FreelistManager.h"
31 #include "BlueFS.h"
32 #include "BlueRocksEnv.h"
33 #include "auth/Crypto.h"
34 #include "common/EventTrace.h"
35
36 #define dout_context cct
37 #define dout_subsys ceph_subsys_bluestore
38
39 using bid_t = decltype(BlueStore::Blob::id);
40
41 // bluestore_cache_onode
42 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
43 bluestore_cache_onode);
44
45 // bluestore_cache_other
46 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
47 bluestore_cache_other);
48 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
49 bluestore_cache_other);
50 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
51 bluestore_cache_other);
52 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
53 bluestore_cache_other);
54
55 // bluestore_txc
56 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
57 bluestore_txc);
58
59
60 // kv store prefixes
61 const string PREFIX_SUPER = "S"; // field -> value
62 const string PREFIX_STAT = "T"; // field -> value(int64 array)
63 const string PREFIX_COLL = "C"; // collection name -> cnode_t
64 const string PREFIX_OBJ = "O"; // object name -> onode_t
65 const string PREFIX_OMAP = "M"; // u64 + keyname -> value
66 const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
67 const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
68 const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
69
70 // write a label in the first block. always use this size. note that
71 // bluefs makes a matching assumption about the location of its
72 // superblock (always the second block of the device).
73 #define BDEV_LABEL_BLOCK_SIZE 4096
74
75 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
76 #define SUPER_RESERVED 8192
77
78 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
79
80
81 /*
82 * extent map blob encoding
83 *
84 * we use the low bits of the blobid field to indicate some common scenarios
85 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
86 */
87 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
88 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
89 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
90 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
91 #define BLOBID_SHIFT_BITS 4
92
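// --- editor's illustrative sketch (not part of the original BlueStore.cc) ---
// A minimal example of how a blobid word with the flag bits above in its low
// nibble could be packed and unpacked; the authoritative encoding lives in
// ExtentMap::{encode,decode}_some() further down in this file.
static inline uint64_t example_pack_blobid(uint64_t id, uint64_t flags)
{
  return (id << BLOBID_SHIFT_BITS) | (flags & 0xf);
}
static inline void example_blobid_usage()
{
  uint64_t v = example_pack_blobid(3, BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_SPANNING);
  assert(v == 0x39);                      // id 3 sits above the 4 flag bits
  assert((v >> BLOBID_SHIFT_BITS) == 3);  // recover the id
  assert(v & BLOBID_FLAG_SPANNING);       // spanning flag is set
}
// --- end of sketch ---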
93 /*
94 * object name key structure
95 *
96 * encoded u8: shard + 2^7 (so that it sorts properly)
97 * encoded u64: poolid + 2^63 (so that it sorts properly)
98 * encoded u32: hash (bit reversed)
99 *
100 * escaped string: namespace
101 *
102 * escaped string: key or object name
103 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
104 * we are done. otherwise, we are followed by the object name.
105 * escaped string: object name (unless '=' above)
106 *
107 * encoded u64: snap
108 * encoded u64: generation
109 * 'o'
110 */
111 #define ONODE_KEY_SUFFIX 'o'
112
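// --- editor's illustrative sketch (not part of the original BlueStore.cc) ---
// A hypothetical layout following the comment above, for an object named
// "foo" in pool 1, shard 0, empty namespace and no locator key (schematic;
// the integer fields go through _key_encode_*):
//
//   u8   shard + 0x80                 -> 0x80
//   u64  pool 1 + 2^63                -> 0x8000000000000001
//   u32  bit-reversed hash of the object
//   escaped namespace                 -> "!"      (empty string + terminator)
//   escaped name + '='                -> "foo!="  (key == name, so '=')
//   u64  snap, u64  generation
//   'o'  (ONODE_KEY_SUFFIX)
// --- end of sketch ---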
113 /*
114 * extent shard key
115 *
116 * object prefix key
117 * u32
118 * 'x'
119 */
120 #define EXTENT_SHARD_KEY_SUFFIX 'x'
121
122 /*
123 * string encoding in the key
124 *
125 * The key string needs to lexicographically sort the same way that
126  * ghobject_t does. We do this by escaping anything <= '#' with #
127 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
128 * hex digits.
129 *
130 * We use ! as a terminator for strings; this works because it is < #
131 * and will get escaped if it is present in the string.
132 *
133 */
134 template<typename S>
135 static void append_escaped(const string &in, S *out)
136 {
137 char hexbyte[in.length() * 3 + 1];
138 char* ptr = &hexbyte[0];
139 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
140 if (*i <= '#') {
141 *ptr++ = '#';
142 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
143 *ptr++ = "0123456789abcdef"[*i & 0x0f];
144 } else if (*i >= '~') {
145 *ptr++ = '~';
146 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
147 *ptr++ = "0123456789abcdef"[*i & 0x0f];
148 } else {
149 *ptr++ = *i;
150 }
151 }
152 *ptr++ = '!';
153 out->append(hexbyte, ptr - &hexbyte[0]);
154 }
155
156 inline unsigned h2i(char c)
157 {
158 if ((c >= '0') && (c <= '9')) {
159 return c - 0x30;
160 } else if ((c >= 'a') && (c <= 'f')) {
161 return c - 'a' + 10;
162 } else if ((c >= 'A') && (c <= 'F')) {
163 return c - 'A' + 10;
164 } else {
165 return 256; // make it always larger than 255
166 }
167 }
168
169 static int decode_escaped(const char *p, string *out)
170 {
171 char buff[256];
172 char* ptr = &buff[0];
173 char* max = &buff[252];
174 const char *orig_p = p;
175 while (*p && *p != '!') {
176 if (*p == '#' || *p == '~') {
177 unsigned hex = 0;
178 p++;
179 hex = h2i(*p++) << 4;
180 if (hex > 255) {
181 return -EINVAL;
182 }
183 hex |= h2i(*p++);
184 if (hex > 255) {
185 return -EINVAL;
186 }
187 *ptr++ = hex;
188 } else {
189 *ptr++ = *p++;
190 }
191 if (ptr > max) {
192 out->append(buff, ptr-buff);
193 ptr = &buff[0];
194 }
195 }
196 if (ptr != buff) {
197 out->append(buff, ptr-buff);
198 }
199 return p - orig_p;
200 }
201
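// --- editor's illustrative sketch (not part of the original BlueStore.cc) ---
// Round trip of the escaping scheme above: bytes <= '#' and >= '~' are
// replaced by '#'/'~' plus two hex digits, and '!' terminates the string.
static inline void example_escape_roundtrip()
{
  string escaped;
  append_escaped("a#b", &escaped);
  assert(escaped == "a#23b!");       // '#' (0x23) escaped, '!' terminator appended

  string decoded;
  int consumed = decode_escaped(escaped.c_str(), &decoded);
  assert(decoded == "a#b");
  assert(escaped[consumed] == '!');  // decode stops at the terminator
}
// --- end of sketch ---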
202 // some things we encode in binary (as le32 or le64); print the
203 // resulting key strings nicely
204 template<typename S>
205 static string pretty_binary_string(const S& in)
206 {
207 char buf[10];
208 string out;
209 out.reserve(in.length() * 3);
210 enum { NONE, HEX, STRING } mode = NONE;
211 unsigned from = 0, i;
212 for (i=0; i < in.length(); ++i) {
213 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
214 (mode == HEX && in.length() - i >= 4 &&
215 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
216 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
217 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
218 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
219 if (mode == STRING) {
220 out.append(in.c_str() + from, i - from);
221 out.push_back('\'');
222 }
223 if (mode != HEX) {
224 out.append("0x");
225 mode = HEX;
226 }
227 if (in.length() - i >= 4) {
228 // print a whole u32 at once
229 snprintf(buf, sizeof(buf), "%08x",
230 (uint32_t)(((unsigned char)in[i] << 24) |
231 ((unsigned char)in[i+1] << 16) |
232 ((unsigned char)in[i+2] << 8) |
233 ((unsigned char)in[i+3] << 0)));
234 i += 3;
235 } else {
236 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
237 }
238 out.append(buf);
239 } else {
240 if (mode != STRING) {
241 out.push_back('\'');
242 mode = STRING;
243 from = i;
244 }
245 }
246 }
247 if (mode == STRING) {
248 out.append(in.c_str() + from, i - from);
249 out.push_back('\'');
250 }
251 return out;
252 }
253
254 template<typename T>
255 static void _key_encode_shard(shard_id_t shard, T *key)
256 {
257 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
258 }
259
260 static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
261 {
262 pshard->id = (uint8_t)*key - (uint8_t)0x80;
263 return key + 1;
264 }
265
266 static void get_coll_key_range(const coll_t& cid, int bits,
267 string *temp_start, string *temp_end,
268 string *start, string *end)
269 {
270 temp_start->clear();
271 temp_end->clear();
272 start->clear();
273 end->clear();
274
275 spg_t pgid;
276 if (cid.is_pg(&pgid)) {
277 _key_encode_shard(pgid.shard, start);
278 *temp_start = *start;
279
280 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
281 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
282
283 *end = *start;
284 *temp_end = *temp_start;
285
286 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
287 _key_encode_u32(reverse_hash, start);
288 _key_encode_u32(reverse_hash, temp_start);
289
290 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
291 if (end_hash > 0xffffffffull)
292 end_hash = 0xffffffffull;
293
294 _key_encode_u32(end_hash, end);
295 _key_encode_u32(end_hash, temp_end);
296 } else {
297 _key_encode_shard(shard_id_t::NO_SHARD, start);
298 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
299 *end = *start;
300 _key_encode_u32(0, start);
301 _key_encode_u32(0xffffffff, end);
302
303 // no separate temp section
304 *temp_start = *end;
305 *temp_end = *end;
306 }
307 }
308
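// --- editor's worked example (not part of the original BlueStore.cc) ---
// The hash-range math above, assuming hobject_t::_reverse_bits() is a plain
// 32-bit bit reversal: for a PG with ps() == 1 and bits == 4,
//
//   reverse_hash = _reverse_bits(0x00000001)      = 0x80000000
//   end_hash     = reverse_hash + (1 << (32 - 4)) = 0x90000000
//
// so the collection's onode keys live in the reversed-hash interval
// [0x80000000, 0x90000000).
// --- end of example ---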
309 static void get_shared_blob_key(uint64_t sbid, string *key)
310 {
311 key->clear();
312 _key_encode_u64(sbid, key);
313 }
314
315 static int get_key_shared_blob(const string& key, uint64_t *sbid)
316 {
317 const char *p = key.c_str();
318 if (key.length() < sizeof(uint64_t))
319 return -1;
320 _key_decode_u64(p, sbid);
321 return 0;
322 }
323
324 template<typename S>
325 static int get_key_object(const S& key, ghobject_t *oid)
326 {
327 int r;
328 const char *p = key.c_str();
329
330 if (key.length() < 1 + 8 + 4)
331 return -1;
332 p = _key_decode_shard(p, &oid->shard_id);
333
334 uint64_t pool;
335 p = _key_decode_u64(p, &pool);
336 oid->hobj.pool = pool - 0x8000000000000000ull;
337
338 unsigned hash;
339 p = _key_decode_u32(p, &hash);
340
341 oid->hobj.set_bitwise_key_u32(hash);
342
343 r = decode_escaped(p, &oid->hobj.nspace);
344 if (r < 0)
345 return -2;
346 p += r + 1;
347
348 string k;
349 r = decode_escaped(p, &k);
350 if (r < 0)
351 return -3;
352 p += r + 1;
353 if (*p == '=') {
354 // no key
355 ++p;
356 oid->hobj.oid.name = k;
357 } else if (*p == '<' || *p == '>') {
358 // key + name
359 ++p;
360 r = decode_escaped(p, &oid->hobj.oid.name);
361 if (r < 0)
362 return -5;
363 p += r + 1;
364 oid->hobj.set_key(k);
365 } else {
366 // malformed
367 return -6;
368 }
369
370 p = _key_decode_u64(p, &oid->hobj.snap.val);
371 p = _key_decode_u64(p, &oid->generation);
372
373 if (*p != ONODE_KEY_SUFFIX) {
374 return -7;
375 }
376 p++;
377 if (*p) {
378 // if we get something other than a null terminator here,
379     // something went wrong.
380 return -8;
381 }
382
383 return 0;
384 }
385
386 template<typename S>
387 static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
388 {
389 key->clear();
390
391 size_t max_len = 1 + 8 + 4 +
392 (oid.hobj.nspace.length() * 3 + 1) +
393 (oid.hobj.get_key().length() * 3 + 1) +
394 1 + // for '<', '=', or '>'
395 (oid.hobj.oid.name.length() * 3 + 1) +
396 8 + 8 + 1;
397 key->reserve(max_len);
398
399 _key_encode_shard(oid.shard_id, key);
400 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
401 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
402
403 append_escaped(oid.hobj.nspace, key);
404
405 if (oid.hobj.get_key().length()) {
406 // is a key... could be < = or >.
407 append_escaped(oid.hobj.get_key(), key);
408 // (ASCII chars < = and > sort in that order, yay)
409 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
410 if (r) {
411 key->append(r > 0 ? ">" : "<");
412 append_escaped(oid.hobj.oid.name, key);
413 } else {
414 // same as no key
415 key->append("=");
416 }
417 } else {
418 // no key
419 append_escaped(oid.hobj.oid.name, key);
420 key->append("=");
421 }
422
423 _key_encode_u64(oid.hobj.snap, key);
424 _key_encode_u64(oid.generation, key);
425
426 key->push_back(ONODE_KEY_SUFFIX);
427
428 // sanity check
429 if (true) {
430 ghobject_t t;
431 int r = get_key_object(*key, &t);
432 if (r || t != oid) {
433 derr << " r " << r << dendl;
434 derr << "key " << pretty_binary_string(*key) << dendl;
435 derr << "oid " << oid << dendl;
436 derr << " t " << t << dendl;
437 assert(r == 0 && t == oid);
438 }
439 }
440 }
441
442
443 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
444 // char lets us quickly test whether it is a shard key without decoding any
445 // of the prefix bytes.
446 template<typename S>
447 static void get_extent_shard_key(const S& onode_key, uint32_t offset,
448 string *key)
449 {
450 key->clear();
451 key->reserve(onode_key.length() + 4 + 1);
452 key->append(onode_key.c_str(), onode_key.size());
453 _key_encode_u32(offset, key);
454 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
455 }
456
457 static void rewrite_extent_shard_key(uint32_t offset, string *key)
458 {
459 assert(key->size() > sizeof(uint32_t) + 1);
460 assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
461 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
462 }
463
464 template<typename S>
465 static void generate_extent_shard_key_and_apply(
466 const S& onode_key,
467 uint32_t offset,
468 string *key,
469 std::function<void(const string& final_key)> apply)
470 {
471 if (key->empty()) { // make full key
472 assert(!onode_key.empty());
473 get_extent_shard_key(onode_key, offset, key);
474 } else {
475 rewrite_extent_shard_key(offset, key);
476 }
477 apply(*key);
478 }
479
480 int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
481 {
482 assert(key.size() > sizeof(uint32_t) + 1);
483 assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
484 int okey_len = key.size() - sizeof(uint32_t) - 1;
485 *onode_key = key.substr(0, okey_len);
486 const char *p = key.data() + okey_len;
487 _key_decode_u32(p, offset);
488 return 0;
489 }
490
491 static bool is_extent_shard_key(const string& key)
492 {
493 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
494 }
495
496 // '-' < '.' < '~'
497 static void get_omap_header(uint64_t id, string *out)
498 {
499 _key_encode_u64(id, out);
500 out->push_back('-');
501 }
502
503 // hmm, I don't think there's any need to escape the user key since we
504 // have a clean prefix.
505 static void get_omap_key(uint64_t id, const string& key, string *out)
506 {
507 _key_encode_u64(id, out);
508 out->push_back('.');
509 out->append(key);
510 }
511
512 static void rewrite_omap_key(uint64_t id, string old, string *out)
513 {
514 _key_encode_u64(id, out);
515 out->append(old.c_str() + out->length(), old.size() - out->length());
516 }
517
518 static void decode_omap_key(const string& key, string *user_key)
519 {
520 *user_key = key.substr(sizeof(uint64_t) + 1);
521 }
522
523 static void get_omap_tail(uint64_t id, string *out)
524 {
525 _key_encode_u64(id, out);
526 out->push_back('~');
527 }
528
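// --- editor's illustrative sketch (not part of the original BlueStore.cc) ---
// For an onode with omap id N, all omap data sorts between the header and
// tail keys built above, because '-' < '.' < '~' in ASCII:
//
//   encode_u64(N) + '-'            omap header
//   encode_u64(N) + '.' + "k1"     user key "k1"
//   encode_u64(N) + '.' + "k2"     user key "k2"
//   encode_u64(N) + '~'            omap tail (upper bound for iteration)
// --- end of sketch ---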
529 static void get_deferred_key(uint64_t seq, string *out)
530 {
531 _key_encode_u64(seq, out);
532 }
533
534
535 // merge operators
536
537 struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
538 void merge_nonexistent(
539 const char *rdata, size_t rlen, std::string *new_value) override {
540 *new_value = std::string(rdata, rlen);
541 }
542 void merge(
543 const char *ldata, size_t llen,
544 const char *rdata, size_t rlen,
545 std::string *new_value) override {
546 assert(llen == rlen);
547 assert((rlen % 8) == 0);
548 new_value->resize(rlen);
549 const __le64* lv = (const __le64*)ldata;
550 const __le64* rv = (const __le64*)rdata;
551 __le64* nv = &(__le64&)new_value->at(0);
552 for (size_t i = 0; i < rlen >> 3; ++i) {
553 nv[i] = lv[i] + rv[i];
554 }
555 }
556 // We use each operator name and each prefix to construct the
557 // overall RocksDB operator name for consistency check at open time.
558 string name() const override {
559 return "int64_array";
560 }
561 };
562
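// --- editor's note (not part of the original BlueStore.cc) ---
// The merge operator above treats both operands as arrays of little-endian
// int64 and adds them element-wise; merging into a nonexistent key simply
// adopts the right-hand operand. For example, merging {10, -3} into an
// existing {5, 4} yields {15, 1}, which lets int64-array counters (such as
// the PREFIX_STAT values) be updated with additive merges instead of
// read-modify-write cycles.
// --- end of note ---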
563
564 // Buffer
565
566 ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
567 {
568 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
569 << b.offset << "~" << b.length << std::dec
570 << " " << BlueStore::Buffer::get_state_name(b.state);
571 if (b.flags)
572 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
573 return out << ")";
574 }
575
576 // Garbage Collector
577
578 void BlueStore::GarbageCollector::process_protrusive_extents(
579 const BlueStore::ExtentMap& extent_map,
580 uint64_t start_offset,
581 uint64_t end_offset,
582 uint64_t start_touch_offset,
583 uint64_t end_touch_offset,
584 uint64_t min_alloc_size)
585 {
586   assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
587
588 uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
589 uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);
590
591 dout(30) << __func__ << " (hex): [" << std::hex
592 << lookup_start_offset << ", " << lookup_end_offset
593 << ")" << std::dec << dendl;
594
595 for (auto it = extent_map.seek_lextent(lookup_start_offset);
596 it != extent_map.extent_map.end() &&
597 it->logical_offset < lookup_end_offset;
598 ++it) {
599 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
600 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
601
602 dout(30) << __func__ << " " << *it
603 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
604 << dendl;
605
606 Blob* b = it->blob.get();
607
608     if (it->logical_offset >= start_touch_offset &&
609 it->logical_end() <= end_touch_offset) {
610 // Process extents within the range affected by
611 // the current write request.
612 // Need to take into account if existing extents
613 // can be merged with them (uncompressed case)
614 if (!b->get_blob().is_compressed()) {
615 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
616 --blob_info_counted->expected_allocations; // don't need to allocate
617 // new AU for compressed
618 // data since another
619 // collocated uncompressed
620 // blob already exists
621 dout(30) << __func__ << " --expected:"
622 << alloc_unit_start << dendl;
623 }
624 used_alloc_unit = alloc_unit_end;
625 blob_info_counted = nullptr;
626 }
627 } else if (b->get_blob().is_compressed()) {
628
629 // additionally we take compressed blobs that were not impacted
630 // by the write into account too
631 BlobInfo& bi =
632 affected_blobs.emplace(
633 b, BlobInfo(b->get_referenced_bytes())).first->second;
634
635 int adjust =
636 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
637 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
638 dout(30) << __func__ << " expected_allocations="
639 << bi.expected_allocations << " end_au:"
640 << alloc_unit_end << dendl;
641
642 blob_info_counted = &bi;
643 used_alloc_unit = alloc_unit_end;
644
645 assert(it->length <= bi.referenced_bytes);
646 bi.referenced_bytes -= it->length;
647 dout(30) << __func__ << " affected_blob:" << *b
648 << " unref 0x" << std::hex << it->length
649 << " referenced = 0x" << bi.referenced_bytes
650 << std::dec << dendl;
651 // NOTE: we can't move specific blob to resulting GC list here
652 // when reference counter == 0 since subsequent extents might
653 // decrement its expected_allocation.
654 // Hence need to enumerate all the extents first.
655 if (!bi.collect_candidate) {
656 bi.first_lextent = it;
657 bi.collect_candidate = true;
658 }
659 bi.last_lextent = it;
660 } else {
661 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
662 // don't need to allocate new AU for compressed data since another
663 // collocated uncompressed blob already exists
664 --blob_info_counted->expected_allocations;
665 dout(30) << __func__ << " --expected_allocations:"
666 << alloc_unit_start << dendl;
667 }
668 used_alloc_unit = alloc_unit_end;
669 blob_info_counted = nullptr;
670 }
671 }
672
673 for (auto b_it = affected_blobs.begin();
674 b_it != affected_blobs.end();
675 ++b_it) {
676 Blob* b = b_it->first;
677 BlobInfo& bi = b_it->second;
678 if (bi.referenced_bytes == 0) {
679 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
680 int64_t blob_expected_for_release =
681 ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;
682
683 dout(30) << __func__ << " " << *(b_it->first)
684 << " expected4release=" << blob_expected_for_release
685 << " expected_allocations=" << bi.expected_allocations
686 << dendl;
687 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
688 if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
689 if (bi.collect_candidate) {
690 auto it = bi.first_lextent;
691 bool bExit = false;
692 do {
693 if (it->blob.get() == b) {
694 extents_to_collect.emplace_back(it->logical_offset, it->length);
695 }
696 bExit = it == bi.last_lextent;
697 ++it;
698 } while (!bExit);
699 }
700 expected_for_release += blob_expected_for_release;
701 expected_allocations += bi.expected_allocations;
702 }
703 }
704 }
705 }
706
707 int64_t BlueStore::GarbageCollector::estimate(
708 uint64_t start_offset,
709 uint64_t length,
710 const BlueStore::ExtentMap& extent_map,
711 const BlueStore::old_extent_map_t& old_extents,
712 uint64_t min_alloc_size)
713 {
714
715 affected_blobs.clear();
716 extents_to_collect.clear();
717 used_alloc_unit = boost::optional<uint64_t >();
718 blob_info_counted = nullptr;
719
720 gc_start_offset = start_offset;
721 gc_end_offset = start_offset + length;
722
723 uint64_t end_offset = start_offset + length;
724
725 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
726 Blob* b = it->e.blob.get();
727 if (b->get_blob().is_compressed()) {
728
729 // update gc_start_offset/gc_end_offset if needed
730 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
731 gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());
732
733 auto o = it->e.logical_offset;
734 auto l = it->e.length;
735
736 uint64_t ref_bytes = b->get_referenced_bytes();
737 // micro optimization to bypass blobs that have no more references
738 if (ref_bytes != 0) {
739 dout(30) << __func__ << " affected_blob:" << *b
740 << " unref 0x" << std::hex << o << "~" << l
741 << std::dec << dendl;
742 affected_blobs.emplace(b, BlobInfo(ref_bytes));
743 }
744 }
745 }
746 dout(30) << __func__ << " gc range(hex): [" << std::hex
747 << gc_start_offset << ", " << gc_end_offset
748 << ")" << std::dec << dendl;
749
750   // enumerate preceding extents to check if they reference affected blobs
751 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
752 process_protrusive_extents(extent_map,
753 gc_start_offset,
754 gc_end_offset,
755 start_offset,
756 end_offset,
757 min_alloc_size);
758 }
759 return expected_for_release - expected_allocations;
760 }
761
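// --- editor's worked example (not part of the original BlueStore.cc) ---
// The value returned above is expected_for_release - expected_allocations.
// For instance, with min_alloc_size = 0x10000 (64 KiB), a compressed blob
// whose references all drop to zero and whose on-disk length is 0x30000
// would release ROUND_UP_TO(0x30000, 0x10000) / 0x10000 = 3 allocation
// units; if rewriting the protrusive extents is expected to cost 1 new unit,
// the net benefit is 3 - 1 = 2, and the blob becomes a GC candidate once
// that benefit reaches bluestore_gc_enable_blob_threshold.
// --- end of example ---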
762 // Cache
763
764 BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
765 PerfCounters *logger)
766 {
767 Cache *c = nullptr;
768
769 if (type == "lru")
770 c = new LRUCache(cct);
771 else if (type == "2q")
772 c = new TwoQCache(cct);
773 else
774 assert(0 == "unrecognized cache type");
775
776 c->logger = logger;
777 return c;
778 }
779
780 void BlueStore::Cache::trim_all()
781 {
782 std::lock_guard<std::recursive_mutex> l(lock);
783 _trim(0, 0);
784 }
785
786 void BlueStore::Cache::trim(
787 uint64_t target_bytes,
788 float target_meta_ratio,
789 float target_data_ratio,
790 float bytes_per_onode)
791 {
792 std::lock_guard<std::recursive_mutex> l(lock);
793 uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
794 uint64_t current_buffer = _get_buffer_bytes();
795 uint64_t current = current_meta + current_buffer;
796
797 uint64_t target_meta = target_bytes * target_meta_ratio;
798 uint64_t target_buffer = target_bytes * target_data_ratio;
799
800 // correct for overflow or float imprecision
801 target_meta = min(target_bytes, target_meta);
802 target_buffer = min(target_bytes - target_meta, target_buffer);
803
804 if (current <= target_bytes) {
805 dout(10) << __func__
806 << " shard target " << pretty_si_t(target_bytes)
807 << " meta/data ratios " << target_meta_ratio
808 << " + " << target_data_ratio << " ("
809 << pretty_si_t(target_meta) << " + "
810 << pretty_si_t(target_buffer) << "), "
811 << " current " << pretty_si_t(current) << " ("
812 << pretty_si_t(current_meta) << " + "
813 << pretty_si_t(current_buffer) << ")"
814 << dendl;
815 return;
816 }
817
818 uint64_t need_to_free = current - target_bytes;
819 uint64_t free_buffer = 0;
820 uint64_t free_meta = 0;
821 if (current_buffer > target_buffer) {
822 free_buffer = current_buffer - target_buffer;
823 if (free_buffer > need_to_free) {
824 free_buffer = need_to_free;
825 }
826 }
827 free_meta = need_to_free - free_buffer;
828
829 // start bounds at what we have now
830 uint64_t max_buffer = current_buffer - free_buffer;
831 uint64_t max_meta = current_meta - free_meta;
832 uint64_t max_onodes = max_meta / bytes_per_onode;
833
834 dout(10) << __func__
835 << " shard target " << pretty_si_t(target_bytes)
836 << " ratio " << target_meta_ratio << " ("
837 << pretty_si_t(target_meta) << " + "
838 << pretty_si_t(target_buffer) << "), "
839 << " current " << pretty_si_t(current) << " ("
840 << pretty_si_t(current_meta) << " + "
841 << pretty_si_t(current_buffer) << "),"
842 << " need_to_free " << pretty_si_t(need_to_free) << " ("
843 << pretty_si_t(free_meta) << " + "
844 << pretty_si_t(free_buffer) << ")"
845 << " -> max " << max_onodes << " onodes + "
846 << max_buffer << " buffer"
847 << dendl;
848 _trim(max_onodes, max_buffer);
849 }
850
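// --- editor's worked example (not part of the original BlueStore.cc) ---
// Suppose a shard target of 1000 MB with meta/data ratios 0.4/0.6, giving
// target_meta = 400 MB and target_buffer = 600 MB. With current_meta =
// 500 MB and current_buffer = 700 MB, current = 1200 MB exceeds the target,
// so need_to_free = 200 MB. Buffers are over budget by 100 MB, so
// free_buffer = 100 MB and free_meta = 100 MB; _trim() is then called with
// max_buffer = 600 MB and max_onodes = 400 MB / bytes_per_onode.
// --- end of example ---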
851
852 // LRUCache
853 #undef dout_prefix
854 #define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
855
856 void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
857 {
858 auto p = onode_lru.iterator_to(*o);
859 onode_lru.erase(p);
860 onode_lru.push_front(*o);
861 }
862
863 void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
864 {
865 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
866 << " buffers " << buffer_size << " / " << buffer_max
867 << dendl;
868
869 _audit("trim start");
870
871 // buffers
872 while (buffer_size > buffer_max) {
873 auto i = buffer_lru.rbegin();
874 if (i == buffer_lru.rend()) {
875 // stop if buffer_lru is now empty
876 break;
877 }
878
879 Buffer *b = &*i;
880 assert(b->is_clean());
881 dout(20) << __func__ << " rm " << *b << dendl;
882 b->space->_rm_buffer(this, b);
883 }
884
885 // onodes
886 int num = onode_lru.size() - onode_max;
887 if (num <= 0)
888 return; // don't even try
889
890 auto p = onode_lru.end();
891 assert(p != onode_lru.begin());
892 --p;
893 int skipped = 0;
894 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
895 while (num > 0) {
896 Onode *o = &*p;
897 int refs = o->nref.load();
898 if (refs > 1) {
899 dout(20) << __func__ << " " << o->oid << " has " << refs
900 << " refs, skipping" << dendl;
901 if (++skipped >= max_skipped) {
902 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
903 << num << " left to trim" << dendl;
904 break;
905 }
906
907 if (p == onode_lru.begin()) {
908 break;
909 } else {
910 p--;
911 num--;
912 continue;
913 }
914 }
915 dout(30) << __func__ << " rm " << o->oid << dendl;
916 if (p != onode_lru.begin()) {
917 onode_lru.erase(p--);
918 } else {
919 onode_lru.erase(p);
920 assert(num == 1);
921 }
922 o->get(); // paranoia
923 o->c->onode_map.remove(o->oid);
924 o->put();
925 --num;
926 }
927 }
928
929 #ifdef DEBUG_CACHE
930 void BlueStore::LRUCache::_audit(const char *when)
931 {
932 dout(10) << __func__ << " " << when << " start" << dendl;
933 uint64_t s = 0;
934 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
935 s += i->length;
936 }
937 if (s != buffer_size) {
938 derr << __func__ << " buffer_size " << buffer_size << " actual " << s
939 << dendl;
940 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
941 derr << __func__ << " " << *i << dendl;
942 }
943 assert(s == buffer_size);
944 }
945 dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
946 << " ok" << dendl;
947 }
948 #endif
949
950 // TwoQCache
951 #undef dout_prefix
952 #define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
953
954
955 void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
956 {
957 auto p = onode_lru.iterator_to(*o);
958 onode_lru.erase(p);
959 onode_lru.push_front(*o);
960 }
961
962 void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
963 {
964 dout(20) << __func__ << " level " << level << " near " << near
965 << " on " << *b
966 << " which has cache_private " << b->cache_private << dendl;
967 if (near) {
968 b->cache_private = near->cache_private;
969 switch (b->cache_private) {
970 case BUFFER_WARM_IN:
971 buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
972 break;
973 case BUFFER_WARM_OUT:
974 assert(b->is_empty());
975 buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
976 break;
977 case BUFFER_HOT:
978 buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
979 break;
980 default:
981 assert(0 == "bad cache_private");
982 }
983 } else if (b->cache_private == BUFFER_NEW) {
984 b->cache_private = BUFFER_WARM_IN;
985 if (level > 0) {
986 buffer_warm_in.push_front(*b);
987 } else {
988 // take caller hint to start at the back of the warm queue
989 buffer_warm_in.push_back(*b);
990 }
991 } else {
992 // we got a hint from discard
993 switch (b->cache_private) {
994 case BUFFER_WARM_IN:
995 // stay in warm_in. move to front, even though 2Q doesn't actually
996 // do this.
997 dout(20) << __func__ << " move to front of warm " << *b << dendl;
998 buffer_warm_in.push_front(*b);
999 break;
1000 case BUFFER_WARM_OUT:
1001 b->cache_private = BUFFER_HOT;
1002 // move to hot. fall-thru
1003 case BUFFER_HOT:
1004 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1005 buffer_hot.push_front(*b);
1006 break;
1007 default:
1008 assert(0 == "bad cache_private");
1009 }
1010 }
1011 if (!b->is_empty()) {
1012 buffer_bytes += b->length;
1013 buffer_list_bytes[b->cache_private] += b->length;
1014 }
1015 }
1016
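// --- editor's note (not part of the original BlueStore.cc) ---
// Buffer lifecycle implied by _add_buffer() above (a variant of the 2Q
// policy): a new buffer enters warm_in; when _trim() evicts it from warm_in
// its data is dropped and it is parked in warm_out as a "ghost" entry; if the
// same region is cached again while its ghost entry sits in warm_out, the new
// buffer goes straight to hot. For example: write A -> warm_in; trim evicts
// A -> warm_out (empty); write A again -> hot.
// --- end of note ---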
1017 void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
1018 {
1019 dout(20) << __func__ << " " << *b << dendl;
1020 if (!b->is_empty()) {
1021 assert(buffer_bytes >= b->length);
1022 buffer_bytes -= b->length;
1023 assert(buffer_list_bytes[b->cache_private] >= b->length);
1024 buffer_list_bytes[b->cache_private] -= b->length;
1025 }
1026 switch (b->cache_private) {
1027 case BUFFER_WARM_IN:
1028 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1029 break;
1030 case BUFFER_WARM_OUT:
1031 buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
1032 break;
1033 case BUFFER_HOT:
1034 buffer_hot.erase(buffer_hot.iterator_to(*b));
1035 break;
1036 default:
1037 assert(0 == "bad cache_private");
1038 }
1039 }
1040
1041 void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
1042 {
1043 TwoQCache *src = static_cast<TwoQCache*>(srcc);
1044 src->_rm_buffer(b);
1045
1046 // preserve which list we're on (even if we can't preserve the order!)
1047 switch (b->cache_private) {
1048 case BUFFER_WARM_IN:
1049 assert(!b->is_empty());
1050 buffer_warm_in.push_back(*b);
1051 break;
1052 case BUFFER_WARM_OUT:
1053 assert(b->is_empty());
1054 buffer_warm_out.push_back(*b);
1055 break;
1056 case BUFFER_HOT:
1057 assert(!b->is_empty());
1058 buffer_hot.push_back(*b);
1059 break;
1060 default:
1061 assert(0 == "bad cache_private");
1062 }
1063 if (!b->is_empty()) {
1064 buffer_bytes += b->length;
1065 buffer_list_bytes[b->cache_private] += b->length;
1066 }
1067 }
1068
1069 void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
1070 {
1071 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1072 if (!b->is_empty()) {
1073 assert((int64_t)buffer_bytes + delta >= 0);
1074 buffer_bytes += delta;
1075 assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
1076 buffer_list_bytes[b->cache_private] += delta;
1077 }
1078 }
1079
1080 void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
1081 {
1082 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
1083 << " buffers " << buffer_bytes << " / " << buffer_max
1084 << dendl;
1085
1086 _audit("trim start");
1087
1088 // buffers
1089 if (buffer_bytes > buffer_max) {
1090 uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
1091 uint64_t khot = buffer_max - kin;
1092
1093     // pre-calculate kout based on the average buffer size, which is
1094     // typically representative (the warm_in and hot lists may change later)
1095 uint64_t kout = 0;
1096 uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
1097 if (buffer_num) {
1098 uint64_t buffer_avg_size = buffer_bytes / buffer_num;
1099 assert(buffer_avg_size);
1100 uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
1101 kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1102 }
1103
1104 if (buffer_list_bytes[BUFFER_HOT] < khot) {
1105 // hot is small, give slack to warm_in
1106 kin += khot - buffer_list_bytes[BUFFER_HOT];
1107 } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
1108 // warm_in is small, give slack to hot
1109 khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
1110 }
1111
1112 // adjust warm_in list
1113 int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
1114 uint64_t evicted = 0;
1115
1116 while (to_evict_bytes > 0) {
1117 auto p = buffer_warm_in.rbegin();
1118 if (p == buffer_warm_in.rend()) {
1119 // stop if warm_in list is now empty
1120 break;
1121 }
1122
1123 Buffer *b = &*p;
1124 assert(b->is_clean());
1125 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1126 assert(buffer_bytes >= b->length);
1127 buffer_bytes -= b->length;
1128 assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
1129 buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
1130 to_evict_bytes -= b->length;
1131 evicted += b->length;
1132 b->state = Buffer::STATE_EMPTY;
1133 b->data.clear();
1134 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1135 buffer_warm_out.push_front(*b);
1136 b->cache_private = BUFFER_WARM_OUT;
1137 }
1138
1139 if (evicted > 0) {
1140 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1141 << " from warm_in list, done evicting warm_in buffers"
1142 << dendl;
1143 }
1144
1145 // adjust hot list
1146 to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
1147 evicted = 0;
1148
1149 while (to_evict_bytes > 0) {
1150 auto p = buffer_hot.rbegin();
1151 if (p == buffer_hot.rend()) {
1152 // stop if hot list is now empty
1153 break;
1154 }
1155
1156 Buffer *b = &*p;
1157 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1158 assert(b->is_clean());
1159 // adjust evict size before buffer goes invalid
1160 to_evict_bytes -= b->length;
1161 evicted += b->length;
1162 b->space->_rm_buffer(this, b);
1163 }
1164
1165 if (evicted > 0) {
1166 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1167 << " from hot list, done evicting hot buffers"
1168 << dendl;
1169 }
1170
1171 // adjust warm out list too, if necessary
1172 int64_t num = buffer_warm_out.size() - kout;
1173 while (num-- > 0) {
1174 Buffer *b = &*buffer_warm_out.rbegin();
1175 assert(b->is_empty());
1176 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1177 b->space->_rm_buffer(this, b);
1178 }
1179 }
1180
1181 // onodes
1182 int num = onode_lru.size() - onode_max;
1183 if (num <= 0)
1184 return; // don't even try
1185
1186 auto p = onode_lru.end();
1187 assert(p != onode_lru.begin());
1188 --p;
1189 int skipped = 0;
1190 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
1191 while (num > 0) {
1192 Onode *o = &*p;
1193 dout(20) << __func__ << " considering " << o << dendl;
1194 int refs = o->nref.load();
1195 if (refs > 1) {
1196 dout(20) << __func__ << " " << o->oid << " has " << refs
1197 << " refs; skipping" << dendl;
1198 if (++skipped >= max_skipped) {
1199 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
1200 << num << " left to trim" << dendl;
1201 break;
1202 }
1203
1204 if (p == onode_lru.begin()) {
1205 break;
1206 } else {
1207 p--;
1208 num--;
1209 continue;
1210 }
1211 }
1212     dout(30) << __func__ << " " << o->oid << " num=" << num << " lru size=" << onode_lru.size() << dendl;
1213 if (p != onode_lru.begin()) {
1214 onode_lru.erase(p--);
1215 } else {
1216 onode_lru.erase(p);
1217 assert(num == 1);
1218 }
1219 o->get(); // paranoia
1220 o->c->onode_map.remove(o->oid);
1221 o->put();
1222 --num;
1223 }
1224 }
1225
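// --- editor's worked example (not part of the original BlueStore.cc) ---
// With a hypothetical buffer_max of 100 MB and bluestore_2q_cache_kin_ratio
// of 0.5, the warm_in budget kin is 50 MB and the hot budget khot is 50 MB.
// If the hot list currently holds only 10 MB, its 40 MB of slack is handed
// to warm_in (kin becomes 90 MB). The warm_out length kout is sized from the
// average buffer: 64 MB across 1024 buffers gives 64 KB each, so
// 100 MB / 64 KB = 1600 entries, scaled by bluestore_2q_cache_kout_ratio.
// --- end of example ---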
1226 #ifdef DEBUG_CACHE
1227 void BlueStore::TwoQCache::_audit(const char *when)
1228 {
1229 dout(10) << __func__ << " " << when << " start" << dendl;
1230 uint64_t s = 0;
1231 for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
1232 s += i->length;
1233 }
1234
1235 uint64_t hot_bytes = s;
1236 if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
1237 derr << __func__ << " hot_list_bytes "
1238 << buffer_list_bytes[BUFFER_HOT]
1239 << " != actual " << hot_bytes
1240 << dendl;
1241 assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
1242 }
1243
1244 for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
1245 s += i->length;
1246 }
1247
1248 uint64_t warm_in_bytes = s - hot_bytes;
1249 if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
1250 derr << __func__ << " warm_in_list_bytes "
1251 << buffer_list_bytes[BUFFER_WARM_IN]
1252 << " != actual " << warm_in_bytes
1253 << dendl;
1254 assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
1255 }
1256
1257 if (s != buffer_bytes) {
1258 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1259 << dendl;
1260 assert(s == buffer_bytes);
1261 }
1262
1263 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1264 << " ok" << dendl;
1265 }
1266 #endif
1267
1268
1269 // BufferSpace
1270
1271 #undef dout_prefix
1272 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1273
1274 void BlueStore::BufferSpace::_clear(Cache* cache)
1275 {
1276 // note: we already hold cache->lock
1277 ldout(cache->cct, 20) << __func__ << dendl;
1278 while (!buffer_map.empty()) {
1279 _rm_buffer(cache, buffer_map.begin());
1280 }
1281 }
1282
1283 int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
1284 {
1285 // note: we already hold cache->lock
1286 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1287 << std::dec << dendl;
1288 int cache_private = 0;
1289 cache->_audit("discard start");
1290 auto i = _data_lower_bound(offset);
1291 uint32_t end = offset + length;
1292 while (i != buffer_map.end()) {
1293 Buffer *b = i->second.get();
1294 if (b->offset >= end) {
1295 break;
1296 }
1297 if (b->cache_private > cache_private) {
1298 cache_private = b->cache_private;
1299 }
1300 if (b->offset < offset) {
1301 int64_t front = offset - b->offset;
1302 if (b->end() > end) {
1303 // drop middle (split)
1304 uint32_t tail = b->end() - end;
1305 if (b->data.length()) {
1306 bufferlist bl;
1307 bl.substr_of(b->data, b->length - tail, tail);
1308 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1309 nb->maybe_rebuild();
1310 _add_buffer(cache, nb, 0, b);
1311 } else {
1312 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1313 0, b);
1314 }
1315 if (!b->is_writing()) {
1316 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1317 }
1318 b->truncate(front);
1319 b->maybe_rebuild();
1320 cache->_audit("discard end 1");
1321 break;
1322 } else {
1323 // drop tail
1324 if (!b->is_writing()) {
1325 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1326 }
1327 b->truncate(front);
1328 b->maybe_rebuild();
1329 ++i;
1330 continue;
1331 }
1332 }
1333 if (b->end() <= end) {
1334 // drop entire buffer
1335 _rm_buffer(cache, i++);
1336 continue;
1337 }
1338 // drop front
1339 uint32_t keep = b->end() - end;
1340 if (b->data.length()) {
1341 bufferlist bl;
1342 bl.substr_of(b->data, b->length - keep, keep);
1343 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1344 nb->maybe_rebuild();
1345 _add_buffer(cache, nb, 0, b);
1346 } else {
1347 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1348 }
1349 _rm_buffer(cache, i);
1350 cache->_audit("discard end 2");
1351 break;
1352 }
1353 return cache_private;
1354 }
1355
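// --- editor's worked example (not part of the original BlueStore.cc) ---
// How _discard() above handles the possible overlaps, for a cached buffer
// spanning [0x0000, 0x1000):
//
//   discard [0x0400, 0x0800) -> split: the front keeps [0x0000, 0x0400) and a
//                               new buffer is added for the tail [0x0800, 0x1000)
//   discard [0x0800, 0x2000) -> the buffer is truncated to [0x0000, 0x0800)
//   discard [0x0000, 0x1000) -> the buffer is removed entirely
//   discard [0x0000, 0x0800) -> the front is dropped; a new buffer holds the
//                               remaining [0x0800, 0x1000)
// --- end of example ---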
1356 void BlueStore::BufferSpace::read(
1357 Cache* cache,
1358 uint32_t offset,
1359 uint32_t length,
1360 BlueStore::ready_regions_t& res,
1361 interval_set<uint32_t>& res_intervals)
1362 {
1363 res.clear();
1364 res_intervals.clear();
1365 uint32_t want_bytes = length;
1366 uint32_t end = offset + length;
1367
1368 {
1369 std::lock_guard<std::recursive_mutex> l(cache->lock);
1370 for (auto i = _data_lower_bound(offset);
1371 i != buffer_map.end() && offset < end && i->first < end;
1372 ++i) {
1373 Buffer *b = i->second.get();
1374 assert(b->end() > offset);
1375 if (b->is_writing() || b->is_clean()) {
1376 if (b->offset < offset) {
1377 uint32_t skip = offset - b->offset;
1378 uint32_t l = MIN(length, b->length - skip);
1379 res[offset].substr_of(b->data, skip, l);
1380 res_intervals.insert(offset, l);
1381 offset += l;
1382 length -= l;
1383 if (!b->is_writing()) {
1384 cache->_touch_buffer(b);
1385 }
1386 continue;
1387 }
1388 if (b->offset > offset) {
1389 uint32_t gap = b->offset - offset;
1390 if (length <= gap) {
1391 break;
1392 }
1393 offset += gap;
1394 length -= gap;
1395 }
1396 if (!b->is_writing()) {
1397 cache->_touch_buffer(b);
1398 }
1399 if (b->length > length) {
1400 res[offset].substr_of(b->data, 0, length);
1401 res_intervals.insert(offset, length);
1402 break;
1403 } else {
1404 res[offset].append(b->data);
1405 res_intervals.insert(offset, b->length);
1406 if (b->length == length)
1407 break;
1408 offset += b->length;
1409 length -= b->length;
1410 }
1411 }
1412 }
1413 }
1414
1415 uint64_t hit_bytes = res_intervals.size();
1416 assert(hit_bytes <= want_bytes);
1417 uint64_t miss_bytes = want_bytes - hit_bytes;
1418 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1419 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1420 }
1421
1422 void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
1423 {
1424 std::lock_guard<std::recursive_mutex> l(cache->lock);
1425
1426 auto i = writing.begin();
1427 while (i != writing.end()) {
1428 if (i->seq > seq) {
1429 break;
1430 }
1431 if (i->seq < seq) {
1432 ++i;
1433 continue;
1434 }
1435
1436 Buffer *b = &*i;
1437 assert(b->is_writing());
1438
1439 if (b->flags & Buffer::FLAG_NOCACHE) {
1440 writing.erase(i++);
1441 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1442 buffer_map.erase(b->offset);
1443 } else {
1444 b->state = Buffer::STATE_CLEAN;
1445 writing.erase(i++);
1446 b->maybe_rebuild();
1447 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1448 cache->_add_buffer(b, 1, nullptr);
1449 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1450 }
1451 }
1452
1453 cache->_audit("finish_write end");
1454 }
1455
1456 void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1457 {
1458 std::lock_guard<std::recursive_mutex> lk(cache->lock);
1459 if (buffer_map.empty())
1460 return;
1461
1462 auto p = --buffer_map.end();
1463 while (true) {
1464 if (p->second->end() <= pos)
1465 break;
1466
1467 if (p->second->offset < pos) {
1468 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1469 size_t left = pos - p->second->offset;
1470 size_t right = p->second->length - left;
1471 if (p->second->data.length()) {
1472 bufferlist bl;
1473 bl.substr_of(p->second->data, left, right);
1474 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1475 0, p->second.get());
1476 } else {
1477 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1478 0, p->second.get());
1479 }
1480 cache->_adjust_buffer_size(p->second.get(), -right);
1481 p->second->truncate(left);
1482 break;
1483 }
1484
1485 assert(p->second->end() > pos);
1486 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1487 if (p->second->data.length()) {
1488 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1489 p->second->offset - pos, p->second->data),
1490 0, p->second.get());
1491 } else {
1492 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1493 p->second->offset - pos, p->second->length),
1494 0, p->second.get());
1495 }
1496 if (p == buffer_map.begin()) {
1497 _rm_buffer(cache, p);
1498 break;
1499 } else {
1500 _rm_buffer(cache, p--);
1501 }
1502 }
1503 assert(writing.empty());
1504 }
1505
1506 // OnodeSpace
1507
1508 #undef dout_prefix
1509 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1510
1511 BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1512 {
1513 std::lock_guard<std::recursive_mutex> l(cache->lock);
1514 auto p = onode_map.find(oid);
1515 if (p != onode_map.end()) {
1516 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1517 << " raced, returning existing " << p->second
1518 << dendl;
1519 return p->second;
1520 }
1521 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1522 onode_map[oid] = o;
1523 cache->_add_onode(o, 1);
1524 return o;
1525 }
1526
1527 BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1528 {
1529 ldout(cache->cct, 30) << __func__ << dendl;
1530 OnodeRef o;
1531 bool hit = false;
1532
1533 {
1534 std::lock_guard<std::recursive_mutex> l(cache->lock);
1535 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1536 if (p == onode_map.end()) {
1537 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1538 } else {
1539 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1540 << dendl;
1541 cache->_touch_onode(p->second);
1542 hit = true;
1543 o = p->second;
1544 }
1545 }
1546
1547 if (hit) {
1548 cache->logger->inc(l_bluestore_onode_hits);
1549 } else {
1550 cache->logger->inc(l_bluestore_onode_misses);
1551 }
1552 return o;
1553 }
1554
1555 void BlueStore::OnodeSpace::clear()
1556 {
1557 std::lock_guard<std::recursive_mutex> l(cache->lock);
1558 ldout(cache->cct, 10) << __func__ << dendl;
1559 for (auto &p : onode_map) {
1560 cache->_rm_onode(p.second);
1561 }
1562 onode_map.clear();
1563 }
1564
1565 bool BlueStore::OnodeSpace::empty()
1566 {
1567 std::lock_guard<std::recursive_mutex> l(cache->lock);
1568 return onode_map.empty();
1569 }
1570
1571 void BlueStore::OnodeSpace::rename(
1572 OnodeRef& oldo,
1573 const ghobject_t& old_oid,
1574 const ghobject_t& new_oid,
1575 const mempool::bluestore_cache_other::string& new_okey)
1576 {
1577 std::lock_guard<std::recursive_mutex> l(cache->lock);
1578 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1579 << dendl;
1580 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1581 po = onode_map.find(old_oid);
1582 pn = onode_map.find(new_oid);
1583 assert(po != pn);
1584
1585 assert(po != onode_map.end());
1586 if (pn != onode_map.end()) {
1587 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1588 << dendl;
1589 cache->_rm_onode(pn->second);
1590 onode_map.erase(pn);
1591 }
1592 OnodeRef o = po->second;
1593
1594 // install a non-existent onode at old location
1595 oldo.reset(new Onode(o->c, old_oid, o->key));
1596 po->second = oldo;
1597 cache->_add_onode(po->second, 1);
1598
1599 // add at new position and fix oid, key
1600 onode_map.insert(make_pair(new_oid, o));
1601 cache->_touch_onode(o);
1602 o->oid = new_oid;
1603 o->key = new_okey;
1604 }
1605
1606 bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1607 {
1608 std::lock_guard<std::recursive_mutex> l(cache->lock);
1609 ldout(cache->cct, 20) << __func__ << dendl;
1610 for (auto& i : onode_map) {
1611 if (f(i.second)) {
1612 return true;
1613 }
1614 }
1615 return false;
1616 }
1617
1618 void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
1619 {
1620 for (auto& i : onode_map) {
1621 ldout(cct, lvl) << i.first << " : " << i.second << dendl;
1622 }
1623 }
1624
1625 // SharedBlob
1626
1627 #undef dout_prefix
1628 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1629
1630 ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1631 {
1632 out << "SharedBlob(" << &sb;
1633
1634 if (sb.loaded) {
1635 out << " loaded " << *sb.persistent;
1636 } else {
1637 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1638 }
1639 return out << ")";
1640 }
1641
1642 BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1643 : coll(_coll), sbid_unloaded(i)
1644 {
1645 assert(sbid_unloaded > 0);
1646 if (get_cache()) {
1647 get_cache()->add_blob();
1648 }
1649 }
1650
1651 BlueStore::SharedBlob::~SharedBlob()
1652 {
1653 if (get_cache()) { // the dummy instances have a nullptr
1654 std::lock_guard<std::recursive_mutex> l(get_cache()->lock);
1655 bc._clear(get_cache());
1656 get_cache()->rm_blob();
1657 }
1658 if (loaded && persistent) {
1659 delete persistent;
1660 }
1661 }
1662
1663 void BlueStore::SharedBlob::put()
1664 {
1665 if (--nref == 0) {
1666 ldout(coll->store->cct, 20) << __func__ << " " << this
1667 << " removing self from set " << get_parent()
1668 << dendl;
1669 if (get_parent()) {
1670 get_parent()->remove(this);
1671 }
1672 delete this;
1673 }
1674 }
1675
1676 void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1677 {
1678 assert(persistent);
1679 persistent->ref_map.get(offset, length);
1680 }
1681
1682 void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
1683 PExtentVector *r,
1684 set<SharedBlob*> *maybe_unshared)
1685 {
1686 assert(persistent);
1687 bool maybe = false;
1688 persistent->ref_map.put(offset, length, r, maybe_unshared ? &maybe : nullptr);
1689 if (maybe_unshared && maybe) {
1690 maybe_unshared->insert(this);
1691 }
1692 }
1693
1694 // SharedBlobSet
1695
1696 #undef dout_prefix
1697 #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
1698
1699 void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
1700 {
1701 std::lock_guard<std::mutex> l(lock);
1702 for (auto& i : sb_map) {
1703 ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
1704 }
1705 }
1706
1707 // Blob
1708
1709 #undef dout_prefix
1710 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1711
1712 ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1713 {
1714 out << "Blob(" << &b;
1715 if (b.is_spanning()) {
1716 out << " spanning " << b.id;
1717 }
1718 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1719 if (b.shared_blob) {
1720 out << " " << *b.shared_blob;
1721 } else {
1722 out << " (shared_blob=NULL)";
1723 }
1724 out << ")";
1725 return out;
1726 }
1727
1728 void BlueStore::Blob::discard_unallocated(Collection *coll)
1729 {
1730 if (get_blob().is_shared()) {
1731 return;
1732 }
1733 if (get_blob().is_compressed()) {
1734 bool discard = false;
1735 bool all_invalid = true;
1736 for (auto e : get_blob().get_extents()) {
1737 if (!e.is_valid()) {
1738 discard = true;
1739 } else {
1740 all_invalid = false;
1741 }
1742 }
1743     assert(discard == all_invalid); // in a compressed blob, either all
1744                                     // or none of the pextents are invalid.
1745 if (discard) {
1746 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1747 get_blob().get_logical_length());
1748 }
1749 } else {
1750 size_t pos = 0;
1751 for (auto e : get_blob().get_extents()) {
1752 if (!e.is_valid()) {
1753 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1754 << "~" << e.length
1755 << std::dec << dendl;
1756 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1757 }
1758 pos += e.length;
1759 }
1760 if (get_blob().can_prune_tail()) {
1761 dirty_blob().prune_tail();
1762 used_in_blob.prune_tail(get_blob().get_ondisk_length());
1763 auto cct = coll->store->cct; //used by dout
1764 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
1765 }
1766 }
1767 }
1768
1769 void BlueStore::Blob::get_ref(
1770 Collection *coll,
1771 uint32_t offset,
1772 uint32_t length)
1773 {
1774   // The caller has to initialize the Blob's logical length prior to incrementing
1775   // references. Otherwise we can neither determine the required number of
1776   // counters for per-au tracking nor obtain min_release_size for
1777   // single-counter mode.
1778 assert(get_blob().get_logical_length() != 0);
1779 auto cct = coll->store->cct;
1780 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1781 << std::dec << " " << *this << dendl;
1782
1783 if (used_in_blob.is_empty()) {
1784 uint32_t min_release_size =
1785 get_blob().get_release_size(coll->store->min_alloc_size);
1786 uint64_t l = get_blob().get_logical_length();
1787 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1788 << min_release_size << std::dec << dendl;
1789 used_in_blob.init(l, min_release_size);
1790 }
1791 used_in_blob.get(
1792 offset,
1793 length);
1794 }
1795
1796 bool BlueStore::Blob::put_ref(
1797 Collection *coll,
1798 uint32_t offset,
1799 uint32_t length,
1800 PExtentVector *r)
1801 {
1802 PExtentVector logical;
1803
1804 auto cct = coll->store->cct;
1805 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1806 << std::dec << " " << *this << dendl;
1807
1808 bool empty = used_in_blob.put(
1809 offset,
1810 length,
1811 &logical);
1812 r->clear();
1813 // nothing to release
1814 if (!empty && logical.empty()) {
1815 return false;
1816 }
1817
1818 bluestore_blob_t& b = dirty_blob();
1819 return b.release_extents(empty, logical, r);
1820 }
1821
1822 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
1823 uint32_t target_blob_size,
1824 uint32_t b_offset,
1825 uint32_t *length0) {
1826 assert(min_alloc_size);
1827 assert(target_blob_size);
1828 if (!get_blob().is_mutable()) {
1829 return false;
1830 }
1831
1832 uint32_t length = *length0;
1833 uint32_t end = b_offset + length;
1834
1835 // Currently for the sake of simplicity we omit blob reuse if data is
1836   // unaligned with the csum chunk. Later we can perform padding if needed.
1837 if (get_blob().has_csum() &&
1838 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1839 (end % get_blob().get_csum_chunk_size()) != 0)) {
1840 return false;
1841 }
1842
1843 auto blen = get_blob().get_logical_length();
1844 uint32_t new_blen = blen;
1845
1846 // make sure target_blob_size isn't less than current blob len
1847 target_blob_size = MAX(blen, target_blob_size);
1848
1849 if (b_offset >= blen) {
1850 // new data totally stands out of the existing blob
1851 new_blen = end;
1852 } else {
1853 // new data overlaps with the existing blob
1854 new_blen = MAX(blen, end);
1855
1856 uint32_t overlap = 0;
1857 if (new_blen > blen) {
1858 overlap = blen - b_offset;
1859 } else {
1860 overlap = length;
1861 }
1862
1863 if (!get_blob().is_unallocated(b_offset, overlap)) {
1864 // abort if any piece of the overlap has already been allocated
1865 return false;
1866 }
1867 }
1868
1869 if (new_blen > blen) {
1870 int64_t overflow = int64_t(new_blen) - target_blob_size;
1871     // Unable to decrease the provided length enough to fit into target_blob_size
1872 if (overflow >= length) {
1873 return false;
1874 }
1875
1876 // FIXME: in some cases we could reduce unused resolution
1877 if (get_blob().has_unused()) {
1878 return false;
1879 }
1880
1881 if (overflow > 0) {
1882 new_blen -= overflow;
1883 length -= overflow;
1884 *length0 = length;
1885 }
1886
1887 if (new_blen > blen) {
1888 dirty_blob().add_tail(new_blen);
1889 used_in_blob.add_tail(new_blen,
1890 get_blob().get_release_size(min_alloc_size));
1891 }
1892 }
1893 return true;
1894 }
1895
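// --- editor's worked example (not part of the original BlueStore.cc) ---
// Suppose min_alloc_size = 0x10000, target_blob_size = 0x20000, and a mutable
// blob of logical length blen = 0x10000 with no csum and no unused bitmap.
// A write of 0x8000 bytes at b_offset = 0x10000 stands entirely beyond the
// current blob, so new_blen = 0x18000; that fits within the target, the blob
// grows a tail to 0x18000, and can_reuse_blob() returns true. If the write
// were 0x18000 bytes instead, new_blen = 0x28000 would overflow the target by
// 0x8000, so the write is trimmed to 0x10000 (*length0 is updated) and the
// blob grows to exactly 0x20000.
// --- end of example ---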
1896 void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1897 {
1898 auto cct = coll->store->cct; //used by dout
1899 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1900 << " start " << *this << dendl;
1901 assert(blob.can_split());
1902 assert(used_in_blob.can_split());
1903 bluestore_blob_t &lb = dirty_blob();
1904 bluestore_blob_t &rb = r->dirty_blob();
1905
1906 used_in_blob.split(
1907 blob_offset,
1908 &(r->used_in_blob));
1909
1910 lb.split(blob_offset, rb);
1911 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1912
1913 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1914 << " finish " << *this << dendl;
1915 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1916 << " and " << *r << dendl;
1917 }
1918
1919 #ifndef CACHE_BLOB_BL
1920 void BlueStore::Blob::decode(
1921 Collection *coll,
1922 bufferptr::iterator& p,
1923 uint64_t struct_v,
1924 uint64_t* sbid,
1925 bool include_ref_map)
1926 {
1927 denc(blob, p, struct_v);
1928 if (blob.is_shared()) {
1929 denc(*sbid, p);
1930 }
1931 if (include_ref_map) {
1932 if (struct_v > 1) {
1933 used_in_blob.decode(p);
1934 } else {
1935 used_in_blob.clear();
1936 bluestore_extent_ref_map_t legacy_ref_map;
1937 legacy_ref_map.decode(p);
1938 for (auto r : legacy_ref_map.ref_map) {
1939 get_ref(
1940 coll,
1941 r.first,
1942 r.second.refs * r.second.length);
1943 }
1944 }
1945 }
1946 }
1947 #endif
1948
1949 // Extent
1950
1951 ostream& operator<<(ostream& out, const BlueStore::Extent& e)
1952 {
1953 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
1954 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
1955 << " " << *e.blob;
1956 }
1957
1958 // OldExtent
1959 BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
1960 uint32_t lo,
1961 uint32_t o,
1962 uint32_t l,
1963 BlobRef& b) {
1964 OldExtent* oe = new OldExtent(lo, o, l, b);
1965 b->put_ref(c.get(), o, l, &(oe->r));
1966 oe->blob_empty = b->get_referenced_bytes() == 0;
1967 return oe;
1968 }
1969
1970 // ExtentMap
1971
1972 #undef dout_prefix
1973 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1974
1975 BlueStore::ExtentMap::ExtentMap(Onode *o)
1976 : onode(o),
1977 inline_bl(
1978 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
1979 }
1980
1981 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
1982 bool force)
1983 {
1984 auto cct = onode->c->store->cct; //used by dout
1985 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
1986 if (onode->onode.extent_map_shards.empty()) {
1987 if (inline_bl.length() == 0) {
1988 unsigned n;
1989 // we need to encode inline_bl to measure encoded length
1990 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
1991 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
1992 assert(!never_happen);
1993 size_t len = inline_bl.length();
1994 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
1995 << " extents" << dendl;
1996 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
1997 request_reshard(0, OBJECT_MAX_SIZE);
1998 return;
1999 }
2000 }
2001 // will persist in the onode key.
2002 } else {
2003 // pending shard update
2004 struct dirty_shard_t {
2005 Shard *shard;
2006 bufferlist bl;
2007 dirty_shard_t(Shard *s) : shard(s) {}
2008 };
2009 vector<dirty_shard_t> encoded_shards;
2010 // allocate slots for all shards in a single call instead of
2011 // doing multiple allocations - one for each dirty shard
2012 encoded_shards.reserve(shards.size());
2013
2014 auto p = shards.begin();
2015 auto prev_p = p;
2016 while (p != shards.end()) {
2017 assert(p->shard_info->offset >= prev_p->shard_info->offset);
2018 auto n = p;
2019 ++n;
2020 if (p->dirty) {
2021 uint32_t endoff;
2022 if (n == shards.end()) {
2023 endoff = OBJECT_MAX_SIZE;
2024 } else {
2025 endoff = n->shard_info->offset;
2026 }
2027 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2028 bufferlist& bl = encoded_shards.back().bl;
2029 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2030 bl, &p->extents)) {
2031 if (force) {
2032 derr << __func__ << " encode_some needs reshard" << dendl;
2033 assert(!force);
2034 }
2035 }
2036 size_t len = bl.length();
2037
2038 dout(20) << __func__ << " shard 0x" << std::hex
2039 << p->shard_info->offset << std::dec << " is " << len
2040 << " bytes (was " << p->shard_info->bytes << ") from "
2041 << p->extents << " extents" << dendl;
2042
2043 if (!force) {
2044 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2045 // we are big; reshard ourselves
2046 request_reshard(p->shard_info->offset, endoff);
2047 }
2048 // avoid resharding the trailing shard, even if it is small
2049 else if (n != shards.end() &&
2050 len < cct->_conf->bluestore_extent_map_shard_min_size) {
2051 assert(endoff != OBJECT_MAX_SIZE);
2052 if (p == shards.begin()) {
2053 // we are the first shard, combine with next shard
2054 request_reshard(p->shard_info->offset, endoff + 1);
2055 } else {
2056 // combine either with the previous shard or the next,
2057 // whichever is smaller
2058 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2059 request_reshard(p->shard_info->offset, endoff + 1);
2060 } else {
2061 request_reshard(prev_p->shard_info->offset, endoff);
2062 }
2063 }
2064 }
2065 }
2066 }
2067 prev_p = p;
2068 p = n;
2069 }
2070 if (needs_reshard()) {
2071 return;
2072 }
2073
2074 // schedule DB update for dirty shards
2075 string key;
2076 for (auto& it : encoded_shards) {
2077 it.shard->dirty = false;
2078 it.shard->shard_info->bytes = it.bl.length();
2079 generate_extent_shard_key_and_apply(
2080 onode->key,
2081 it.shard->shard_info->offset,
2082 &key,
2083 [&](const string& final_key) {
2084 t->set(PREFIX_OBJ, final_key, it.bl);
2085 }
2086 );
2087 }
2088 }
2089 }
2090
2091 bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2092 {
2093 if (spanning_blob_map.empty())
2094 return 0;
2095 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2096 // bid is valid and available.
2097 if (bid >= 0)
2098 return bid;
2099 // Find next unused bid;
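// (bid_t is a small signed type, so the sequential counter above can wrap
// negative; fall back to probing from a random starting point until a free
// slot is found.)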
2100 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2101 const auto begin_bid = bid;
2102 do {
2103 if (!spanning_blob_map.count(bid))
2104 return bid;
2105 else {
2106 bid++;
2107 if (bid < 0) bid = 0;
2108 }
2109 } while (bid != begin_bid);
2110 assert(0 == "no available blob id");
2111 }
2112
2113 void BlueStore::ExtentMap::reshard(
2114 KeyValueDB *db,
2115 KeyValueDB::Transaction t)
2116 {
2117 auto cct = onode->c->store->cct; // used by dout
2118
2119 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2120 << needs_reshard_end << ")" << std::dec
2121 << " of " << onode->onode.extent_map_shards.size()
2122 << " shards on " << onode->oid << dendl;
2123 for (auto& p : spanning_blob_map) {
2124 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2125 << dendl;
2126 }
2127 // determine shard index range
2128 unsigned si_begin = 0, si_end = 0;
2129 if (!shards.empty()) {
2130 while (si_begin + 1 < shards.size() &&
2131 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2132 ++si_begin;
2133 }
2134 needs_reshard_begin = shards[si_begin].shard_info->offset;
2135 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2136 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2137 needs_reshard_end = shards[si_end].shard_info->offset;
2138 break;
2139 }
2140 }
2141 if (si_end == shards.size()) {
2142 needs_reshard_end = OBJECT_MAX_SIZE;
2143 }
2144 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2145 << " over 0x[" << std::hex << needs_reshard_begin << ","
2146 << needs_reshard_end << ")" << std::dec << dendl;
2147 }
2148
2149 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2150
2151 // we may need to fault in a larger interval later; we must have all
2152 // referring extents for spanning blobs loaded in order to have
2153 // accurate use_tracker values.
2154 uint32_t spanning_scan_begin = needs_reshard_begin;
2155 uint32_t spanning_scan_end = needs_reshard_end;
2156
2157 // remove old keys
2158 string key;
2159 for (unsigned i = si_begin; i < si_end; ++i) {
2160 generate_extent_shard_key_and_apply(
2161 onode->key, shards[i].shard_info->offset, &key,
2162 [&](const string& final_key) {
2163 t->rmkey(PREFIX_OBJ, final_key);
2164 }
2165 );
2166 }
2167
2168 // calculate average extent size
2169 unsigned bytes = 0;
2170 unsigned extents = 0;
2171 if (onode->onode.extent_map_shards.empty()) {
2172 bytes = inline_bl.length();
2173 extents = extent_map.size();
2174 } else {
2175 for (unsigned i = si_begin; i < si_end; ++i) {
2176 bytes += shards[i].shard_info->bytes;
2177 extents += shards[i].extents;
2178 }
2179 }
2180 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2181 unsigned slop = target *
2182 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2183 unsigned extent_avg = bytes / MAX(1, extents);
2184 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2185 << ", slop " << slop << dendl;
2186
2187 // reshard
2188 unsigned estimate = 0;
2189 unsigned offset = needs_reshard_begin;
2190 vector<bluestore_onode_t::shard_info> new_shard_info;
2191 unsigned max_blob_end = 0;
2192 Extent dummy(needs_reshard_begin);
2193 for (auto e = extent_map.lower_bound(dummy);
2194 e != extent_map.end();
2195 ++e) {
2196 if (e->logical_offset >= needs_reshard_end) {
2197 break;
2198 }
2199 dout(30) << " extent " << *e << dendl;
2200
2201 // disfavor shard boundaries that span a blob
2202 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2203 if (estimate &&
2204 estimate + extent_avg > target + (would_span ? slop : 0)) {
2205 // new shard
2206 if (offset == needs_reshard_begin) {
2207 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2208 new_shard_info.back().offset = offset;
2209 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2210 << std::dec << dendl;
2211 }
2212 offset = e->logical_offset;
2213 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2214 new_shard_info.back().offset = offset;
2215 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2216 << std::dec << dendl;
2217 estimate = 0;
2218 }
2219 estimate += extent_avg;
2220 unsigned bs = e->blob_start();
2221 if (bs < spanning_scan_begin) {
2222 spanning_scan_begin = bs;
2223 }
2224 uint32_t be = e->blob_end();
2225 if (be > max_blob_end) {
2226 max_blob_end = be;
2227 }
2228 if (be > spanning_scan_end) {
2229 spanning_scan_end = be;
2230 }
2231 }
2232 if (new_shard_info.empty() && (si_begin > 0 ||
2233 si_end < shards.size())) {
2234 // we resharded a partial range; we must produce at least one output
2235 // shard
2236 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2237 new_shard_info.back().offset = needs_reshard_begin;
2238 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2239 << std::dec << " (singleton degenerate case)" << dendl;
2240 }
2241
2242 auto& sv = onode->onode.extent_map_shards;
2243 dout(20) << __func__ << " new " << new_shard_info << dendl;
2244 dout(20) << __func__ << " old " << sv << dendl;
2245 if (sv.empty()) {
2246 // no old shards to keep
2247 sv.swap(new_shard_info);
2248 init_shards(true, true);
2249 } else {
2250 // splice in new shards
2251 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2252 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2253 sv.insert(
2254 sv.begin() + si_begin,
2255 new_shard_info.begin(),
2256 new_shard_info.end());
2257 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2258 si_end = si_begin + new_shard_info.size();
2259
2260 assert(sv.size() == shards.size());
2261
2262 // note that we need to update the shard_info pointer of every shard here,
2263 // as sv might have been completely reallocated above
2264 for (unsigned i = 0; i < shards.size(); i++) {
2265 shards[i].shard_info = &sv[i];
2266 }
2267
2268 // mark newly added shards as dirty
2269 for (unsigned i = si_begin; i < si_end; ++i) {
2270 shards[i].loaded = true;
2271 shards[i].dirty = true;
2272 }
2273 }
2274 dout(20) << __func__ << " fin " << sv << dendl;
2275 inline_bl.clear();
2276
2277 if (sv.empty()) {
2278 // no more shards; unspan all previously spanning blobs
2279 auto p = spanning_blob_map.begin();
2280 while (p != spanning_blob_map.end()) {
2281 p->second->id = -1;
2282 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2283 p = spanning_blob_map.erase(p);
2284 }
2285 } else {
2286 // identify new spanning blobs
2287 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2288 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2289 if (spanning_scan_begin < needs_reshard_begin) {
2290 fault_range(db, spanning_scan_begin,
2291 needs_reshard_begin - spanning_scan_begin);
2292 }
2293 if (spanning_scan_end > needs_reshard_end) {
2294 fault_range(db, needs_reshard_end,
2295 spanning_scan_end - needs_reshard_end);
2296 }
2297 auto sp = sv.begin() + si_begin;
2298 auto esp = sv.end();
2299 unsigned shard_start = sp->offset;
2300 unsigned shard_end;
2301 ++sp;
2302 if (sp == esp) {
2303 shard_end = OBJECT_MAX_SIZE;
2304 } else {
2305 shard_end = sp->offset;
2306 }
2307 Extent dummy(needs_reshard_begin);
2308 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2309 if (e->logical_offset >= needs_reshard_end) {
2310 break;
2311 }
2312 dout(30) << " extent " << *e << dendl;
2313 while (e->logical_offset >= shard_end) {
2314 shard_start = shard_end;
2315 assert(sp != esp);
2316 ++sp;
2317 if (sp == esp) {
2318 shard_end = OBJECT_MAX_SIZE;
2319 } else {
2320 shard_end = sp->offset;
2321 }
2322 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2323 << " to 0x" << shard_end << std::dec << dendl;
2324 }
2325 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2326 if (!e->blob->is_spanning()) {
2327 // We have two options: (1) split the blob into pieces at the
2328 // shard boundaries (and adjust extents accordingly), or (2)
2329 // mark it spanning. We prefer to cut the blob if we can. Note that
2330 // we may have to split it multiple times--potentially at every
2331 // shard boundary.
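// For example, a blob whose extents cover 0x0~0x30000 with a shard boundary
// at 0x20000 would, when possible, be split at blob_offset 0x20000; if it
// cannot be split there (e.g. a compressed blob), it is marked spanning and
// keyed by its id instead.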
2332 bool must_span = false;
2333 BlobRef b = e->blob;
2334 if (b->can_split()) {
2335 uint32_t bstart = e->blob_start();
2336 uint32_t bend = e->blob_end();
2337 for (const auto& sh : shards) {
2338 if (bstart < sh.shard_info->offset &&
2339 bend > sh.shard_info->offset) {
2340 uint32_t blob_offset = sh.shard_info->offset - bstart;
2341 if (b->can_split_at(blob_offset)) {
2342 dout(20) << __func__ << " splitting blob, bstart 0x"
2343 << std::hex << bstart << " blob_offset 0x"
2344 << blob_offset << std::dec << " " << *b << dendl;
2345 b = split_blob(b, blob_offset, sh.shard_info->offset);
2346 // switch b to the new right-hand side, in case it
2347 // *also* has to get split.
2348 bstart += blob_offset;
2349 onode->c->store->logger->inc(l_bluestore_blob_split);
2350 } else {
2351 must_span = true;
2352 break;
2353 }
2354 }
2355 }
2356 } else {
2357 must_span = true;
2358 }
2359 if (must_span) {
2360 auto bid = allocate_spanning_blob_id();
2361 b->id = bid;
2362 spanning_blob_map[b->id] = b;
2363 dout(20) << __func__ << " adding spanning " << *b << dendl;
2364 }
2365 }
2366 } else {
2367 if (e->blob->is_spanning()) {
2368 spanning_blob_map.erase(e->blob->id);
2369 e->blob->id = -1;
2370 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2371 }
2372 }
2373 }
2374 }
2375
2376 clear_needs_reshard();
2377 }
2378
2379 bool BlueStore::ExtentMap::encode_some(
2380 uint32_t offset,
2381 uint32_t length,
2382 bufferlist& bl,
2383 unsigned *pn)
2384 {
2385 auto cct = onode->c->store->cct; //used by dout
2386 Extent dummy(offset);
2387 auto start = extent_map.lower_bound(dummy);
2388 uint32_t end = offset + length;
2389
2390 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2391 // serialization only. Hence there is no specific
2392 // handling at ExtentMap level.
2393
2394 unsigned n = 0;
2395 size_t bound = 0;
2396 bool must_reshard = false;
2397 for (auto p = start;
2398 p != extent_map.end() && p->logical_offset < end;
2399 ++p, ++n) {
2400 assert(p->logical_offset >= offset);
2401 p->blob->last_encoded_id = -1;
2402 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2403 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2404 << std::dec << " hit new spanning blob " << *p << dendl;
2405 request_reshard(p->blob_start(), p->blob_end());
2406 must_reshard = true;
2407 }
2408 if (!must_reshard) {
2409 denc_varint(0, bound); // blobid
2410 denc_varint(0, bound); // logical_offset
2411 denc_varint(0, bound); // len
2412 denc_varint(0, bound); // blob_offset
2413
2414 p->blob->bound_encode(
2415 bound,
2416 struct_v,
2417 p->blob->shared_blob->get_sbid(),
2418 false);
2419 }
2420 }
2421 if (must_reshard) {
2422 return true;
2423 }
2424
2425 denc(struct_v, bound);
2426 denc_varint(0, bound); // number of extents
2427
2428 {
2429 auto app = bl.get_contiguous_appender(bound);
2430 denc(struct_v, app);
2431 denc_varint(n, app);
2432 if (pn) {
2433 *pn = n;
2434 }
2435
2436 n = 0;
2437 uint64_t pos = 0;
2438 uint64_t prev_len = 0;
2439 for (auto p = start;
2440 p != extent_map.end() && p->logical_offset < end;
2441 ++p, ++n) {
2442 unsigned blobid;
2443 bool include_blob = false;
2444 if (p->blob->is_spanning()) {
2445 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2446 blobid |= BLOBID_FLAG_SPANNING;
2447 } else if (p->blob->last_encoded_id < 0) {
2448 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2449 include_blob = true;
2450 blobid = 0; // the decoder will infer the id from n
2451 } else {
2452 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2453 }
2454 if (p->logical_offset == pos) {
2455 blobid |= BLOBID_FLAG_CONTIGUOUS;
2456 }
2457 if (p->blob_offset == 0) {
2458 blobid |= BLOBID_FLAG_ZEROOFFSET;
2459 }
2460 if (p->length == prev_len) {
2461 blobid |= BLOBID_FLAG_SAMELENGTH;
2462 } else {
2463 prev_len = p->length;
2464 }
2465 denc_varint(blobid, app);
2466 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2467 denc_varint_lowz(p->logical_offset - pos, app);
2468 }
2469 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2470 denc_varint_lowz(p->blob_offset, app);
2471 }
2472 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2473 denc_varint_lowz(p->length, app);
2474 }
2475 pos = p->logical_end();
2476 if (include_blob) {
2477 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2478 }
2479 }
2480 }
2481 /*derr << __func__ << bl << dendl;
2482 derr << __func__ << ":";
2483 bl.hexdump(*_dout);
2484 *_dout << dendl;
2485 */
2486 return false;
2487 }
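// Encoding sketch (illustrative): an extent that directly follows the previous
// one, starts at blob_offset 0, has the same length as its predecessor, and
// re-uses a blob first carried by the third extent (last_encoded_id == 3)
// packs into a single varint:
//   blobid = (3 << BLOBID_SHIFT_BITS)
//          | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET
//          | BLOBID_FLAG_SAMELENGTH   // == 0x37
// with no extra offset/length varints and no inline blob body.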
2488
2489 unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2490 {
2491 auto cct = onode->c->store->cct; //used by dout
2492 /*
2493 derr << __func__ << ":";
2494 bl.hexdump(*_dout);
2495 *_dout << dendl;
2496 */
2497
2498 assert(bl.get_num_buffers() <= 1);
2499 auto p = bl.front().begin_deep();
2500 __u8 struct_v;
2501 denc(struct_v, p);
2502 // Version 2 differs from v1 in blob's ref_map
2503 // serialization only. Hence there is no specific
2504 // handling at ExtentMap level below.
2505 assert(struct_v == 1 || struct_v == 2);
2506
2507 uint32_t num;
2508 denc_varint(num, p);
2509 vector<BlobRef> blobs(num);
2510 uint64_t pos = 0;
2511 uint64_t prev_len = 0;
2512 unsigned n = 0;
2513
2514 while (!p.end()) {
2515 Extent *le = new Extent();
2516 uint64_t blobid;
2517 denc_varint(blobid, p);
2518 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2519 uint64_t gap;
2520 denc_varint_lowz(gap, p);
2521 pos += gap;
2522 }
2523 le->logical_offset = pos;
2524 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2525 denc_varint_lowz(le->blob_offset, p);
2526 } else {
2527 le->blob_offset = 0;
2528 }
2529 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2530 denc_varint_lowz(prev_len, p);
2531 }
2532 le->length = prev_len;
2533
2534 if (blobid & BLOBID_FLAG_SPANNING) {
2535 dout(30) << __func__ << " getting spanning blob "
2536 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2537 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2538 } else {
2539 blobid >>= BLOBID_SHIFT_BITS;
2540 if (blobid) {
2541 le->assign_blob(blobs[blobid - 1]);
2542 assert(le->blob);
2543 } else {
2544 Blob *b = new Blob();
2545 uint64_t sbid = 0;
2546 b->decode(onode->c, p, struct_v, &sbid, false);
2547 blobs[n] = b;
2548 onode->c->open_shared_blob(sbid, b);
2549 le->assign_blob(b);
2550 }
2551 // we build ref_map dynamically for non-spanning blobs
2552 le->blob->get_ref(
2553 onode->c,
2554 le->blob_offset,
2555 le->length);
2556 }
2557 pos += prev_len;
2558 ++n;
2559 extent_map.insert(*le);
2560 }
2561
2562 assert(n == num);
2563 return num;
2564 }
2565
2566 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2567 {
2568 // Version 2 differs from v1 in blob's ref_map
2569 // serialization only. Hence there is no specific
2570 // handling at ExtentMap level.
2571 __u8 struct_v = 2;
2572
2573 denc(struct_v, p);
2574 denc_varint((uint32_t)0, p);
2575 size_t key_size = 0;
2576 denc_varint((uint32_t)0, key_size);
2577 p += spanning_blob_map.size() * key_size;
2578 for (const auto& i : spanning_blob_map) {
2579 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2580 }
2581 }
2582
2583 void BlueStore::ExtentMap::encode_spanning_blobs(
2584 bufferlist::contiguous_appender& p)
2585 {
2586 // Version 2 differs from v1 in blob's ref_map
2587 // serialization only. Hence there is no specific
2588 // handling at ExtentMap level.
2589 __u8 struct_v = 2;
2590
2591 denc(struct_v, p);
2592 denc_varint(spanning_blob_map.size(), p);
2593 for (auto& i : spanning_blob_map) {
2594 denc_varint(i.second->id, p);
2595 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2596 }
2597 }
2598
2599 void BlueStore::ExtentMap::decode_spanning_blobs(
2600 bufferptr::iterator& p)
2601 {
2602 __u8 struct_v;
2603 denc(struct_v, p);
2604 // Version 2 differs from v1 in blob's ref_map
2605 // serialization only. Hence there is no specific
2606 // handling at ExtentMap level.
2607 assert(struct_v == 1 || struct_v == 2);
2608
2609 unsigned n;
2610 denc_varint(n, p);
2611 while (n--) {
2612 BlobRef b(new Blob());
2613 denc_varint(b->id, p);
2614 spanning_blob_map[b->id] = b;
2615 uint64_t sbid = 0;
2616 b->decode(onode->c, p, struct_v, &sbid, true);
2617 onode->c->open_shared_blob(sbid, b);
2618 }
2619 }
2620
2621 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2622 {
2623 shards.resize(onode->onode.extent_map_shards.size());
2624 unsigned i = 0;
2625 for (auto &s : onode->onode.extent_map_shards) {
2626 shards[i].shard_info = &s;
2627 shards[i].loaded = loaded;
2628 shards[i].dirty = dirty;
2629 ++i;
2630 }
2631 }
2632
2633 void BlueStore::ExtentMap::fault_range(
2634 KeyValueDB *db,
2635 uint32_t offset,
2636 uint32_t length)
2637 {
2638 auto cct = onode->c->store->cct; //used by dout
2639 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2640 << std::dec << dendl;
2641 auto start = seek_shard(offset);
2642 auto last = seek_shard(offset + length);
2643
2644 if (start < 0)
2645 return;
2646
2647 assert(last >= start);
2648 string key;
2649 while (start <= last) {
2650 assert((size_t)start < shards.size());
2651 auto p = &shards[start];
2652 if (!p->loaded) {
2653 dout(30) << __func__ << " opening shard 0x" << std::hex
2654 << p->shard_info->offset << std::dec << dendl;
2655 bufferlist v;
2656 generate_extent_shard_key_and_apply(
2657 onode->key, p->shard_info->offset, &key,
2658 [&](const string& final_key) {
2659 int r = db->get(PREFIX_OBJ, final_key, &v);
2660 if (r < 0) {
2661 derr << __func__ << " missing shard 0x" << std::hex
2662 << p->shard_info->offset << std::dec << " for " << onode->oid
2663 << dendl;
2664 assert(r >= 0);
2665 }
2666 }
2667 );
2668 p->extents = decode_some(v);
2669 p->loaded = true;
2670 dout(20) << __func__ << " open shard 0x" << std::hex
2671 << p->shard_info->offset << std::dec
2672 << " (" << v.length() << " bytes)" << dendl;
2673 assert(p->dirty == false);
2674 assert(v.length() == p->shard_info->bytes);
2675 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2676 } else {
2677 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2678 }
2679 ++start;
2680 }
2681 }
2682
2683 void BlueStore::ExtentMap::dirty_range(
2684 uint32_t offset,
2685 uint32_t length)
2686 {
2687 auto cct = onode->c->store->cct; //used by dout
2688 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2689 << std::dec << dendl;
2690 if (shards.empty()) {
2691 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2692 inline_bl.clear();
2693 return;
2694 }
2695 auto start = seek_shard(offset);
2696 auto last = seek_shard(offset + length);
2697 if (start < 0)
2698 return;
2699
2700 assert(last >= start);
2701 while (start <= last) {
2702 assert((size_t)start < shards.size());
2703 auto p = &shards[start];
2704 if (!p->loaded) {
2705 dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
2706 << std::dec << " is not loaded, can't mark dirty" << dendl;
2707 assert(0 == "can't mark unloaded shard dirty");
2708 }
2709 if (!p->dirty) {
2710 dout(20) << __func__ << " mark shard 0x" << std::hex
2711 << p->shard_info->offset << std::dec << " dirty" << dendl;
2712 p->dirty = true;
2713 }
2714 ++start;
2715 }
2716 }
2717
2718 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2719 uint64_t offset)
2720 {
2721 Extent dummy(offset);
2722 return extent_map.find(dummy);
2723 }
2724
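// seek_lextent returns an iterator to the first extent whose logical range
// ends beyond 'offset': the extent containing 'offset' if one exists,
// otherwise the next extent after it (or end() if there is none).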
2725 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2726 uint64_t offset)
2727 {
2728 Extent dummy(offset);
2729 auto fp = extent_map.lower_bound(dummy);
2730 if (fp != extent_map.begin()) {
2731 --fp;
2732 if (fp->logical_end() <= offset) {
2733 ++fp;
2734 }
2735 }
2736 return fp;
2737 }
2738
2739 BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2740 uint64_t offset) const
2741 {
2742 Extent dummy(offset);
2743 auto fp = extent_map.lower_bound(dummy);
2744 if (fp != extent_map.begin()) {
2745 --fp;
2746 if (fp->logical_end() <= offset) {
2747 ++fp;
2748 }
2749 }
2750 return fp;
2751 }
2752
2753 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2754 {
2755 auto fp = seek_lextent(offset);
2756 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2757 return false;
2758 }
2759 return true;
2760 }
2761
2762 int BlueStore::ExtentMap::compress_extent_map(
2763 uint64_t offset,
2764 uint64_t length)
2765 {
2766 auto cct = onode->c->store->cct; //used by dout
2767 if (extent_map.empty())
2768 return 0;
2769 int removed = 0;
2770 auto p = seek_lextent(offset);
2771 if (p != extent_map.begin()) {
2772 --p; // start to the left of offset
2773 }
2774 // the caller should have just written to this region
2775 assert(p != extent_map.end());
2776
2777 // identify the *next* shard
2778 auto pshard = shards.begin();
2779 while (pshard != shards.end() &&
2780 p->logical_offset >= pshard->shard_info->offset) {
2781 ++pshard;
2782 }
2783 uint64_t shard_end;
2784 if (pshard != shards.end()) {
2785 shard_end = pshard->shard_info->offset;
2786 } else {
2787 shard_end = OBJECT_MAX_SIZE;
2788 }
2789
2790 auto n = p;
2791 for (++n; n != extent_map.end(); p = n++) {
2792 if (n->logical_offset > offset + length) {
2793 break; // stop after end
2794 }
2795 while (n != extent_map.end() &&
2796 p->logical_end() == n->logical_offset &&
2797 p->blob == n->blob &&
2798 p->blob_offset + p->length == n->blob_offset &&
2799 n->logical_offset < shard_end) {
2800 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2801 << " next shard 0x" << shard_end << std::dec
2802 << " merging " << *p << " and " << *n << dendl;
2803 p->length += n->length;
2804 rm(n++);
2805 ++removed;
2806 }
2807 if (n == extent_map.end()) {
2808 break;
2809 }
2810 if (n->logical_offset >= shard_end) {
2811 assert(pshard != shards.end());
2812 ++pshard;
2813 if (pshard != shards.end()) {
2814 shard_end = pshard->shard_info->offset;
2815 } else {
2816 shard_end = OBJECT_MAX_SIZE;
2817 }
2818 }
2819 }
2820 if (removed && onode) {
2821 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2822 }
2823 return removed;
2824 }
2825
2826 void BlueStore::ExtentMap::punch_hole(
2827 CollectionRef &c,
2828 uint64_t offset,
2829 uint64_t length,
2830 old_extent_map_t *old_extents)
2831 {
2832 auto p = seek_lextent(offset);
2833 uint64_t end = offset + length;
2834 while (p != extent_map.end()) {
2835 if (p->logical_offset >= end) {
2836 break;
2837 }
2838 if (p->logical_offset < offset) {
2839 if (p->logical_end() > end) {
2840 // split and deref middle
2841 uint64_t front = offset - p->logical_offset;
2842 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
2843 length, p->blob);
2844 old_extents->push_back(*oe);
2845 add(end,
2846 p->blob_offset + front + length,
2847 p->length - front - length,
2848 p->blob);
2849 p->length = front;
2850 break;
2851 } else {
2852 // deref tail
2853 assert(p->logical_end() > offset); // else seek_lextent bug
2854 uint64_t keep = offset - p->logical_offset;
2855 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
2856 p->length - keep, p->blob);
2857 old_extents->push_back(*oe);
2858 p->length = keep;
2859 ++p;
2860 continue;
2861 }
2862 }
2863 if (p->logical_offset + p->length <= end) {
2864 // deref whole lextent
2865 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2866 p->length, p->blob);
2867 old_extents->push_back(*oe);
2868 rm(p++);
2869 continue;
2870 }
2871 // deref head
2872 uint64_t keep = p->logical_end() - end;
2873 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2874 p->length - keep, p->blob);
2875 old_extents->push_back(*oe);
2876
2877 add(end, p->blob_offset + p->length - keep, keep, p->blob);
2878 rm(p);
2879 break;
2880 }
2881 }
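// Worked example (illustrative): punching 0x1000~0x2000 out of a single
// lextent 0x0~0x4000 takes the "split and deref middle" branch above: the
// extent keeps its 0x0~0x1000 head, a new 0x3000~0x1000 tail extent is added
// back, and the 0x1000~0x2000 middle is recorded in old_extents.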
2882
2883 BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
2884 CollectionRef &c,
2885 uint64_t logical_offset,
2886 uint64_t blob_offset, uint64_t length, BlobRef b,
2887 old_extent_map_t *old_extents)
2888 {
2889 // We need a completely initialized Blob to increment its ref counters.
2890 assert(b->get_blob().get_logical_length() != 0);
2891
2892 // Do get_ref prior to punch_hole to prevent putting a reused blob into the
2893 // old_extents list if we overwrite the blob completely.
2894 // This might happen during WAL overwrite.
2895 b->get_ref(onode->c, blob_offset, length);
2896
2897 if (old_extents) {
2898 punch_hole(c, logical_offset, length, old_extents);
2899 }
2900
2901 Extent *le = new Extent(logical_offset, blob_offset, length, b);
2902 extent_map.insert(*le);
2903 if (spans_shard(logical_offset, length)) {
2904 request_reshard(logical_offset, logical_offset + length);
2905 }
2906 return le;
2907 }
2908
2909 BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
2910 BlobRef lb,
2911 uint32_t blob_offset,
2912 uint32_t pos)
2913 {
2914 auto cct = onode->c->store->cct; //used by dout
2915
2916 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
2917 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
2918 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
2919 << dendl;
2920 BlobRef rb = onode->c->new_blob();
2921 lb->split(onode->c, blob_offset, rb.get());
2922
2923 for (auto ep = seek_lextent(pos);
2924 ep != extent_map.end() && ep->logical_offset < end_pos;
2925 ++ep) {
2926 if (ep->blob != lb) {
2927 continue;
2928 }
2929 if (ep->logical_offset < pos) {
2930 // split extent
2931 size_t left = pos - ep->logical_offset;
2932 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
2933 extent_map.insert(*ne);
2934 ep->length = left;
2935 dout(30) << __func__ << " split " << *ep << dendl;
2936 dout(30) << __func__ << " to " << *ne << dendl;
2937 } else {
2938 // switch blob
2939 assert(ep->blob_offset >= blob_offset);
2940
2941 ep->blob = rb;
2942 ep->blob_offset -= blob_offset;
2943 dout(30) << __func__ << " adjusted " << *ep << dendl;
2944 }
2945 }
2946 return rb;
2947 }
2948
2949 // Onode
2950
2951 #undef dout_prefix
2952 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
2953
2954 void BlueStore::Onode::flush()
2955 {
2956 if (flushing_count.load()) {
2957 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
2958 std::unique_lock<std::mutex> l(flush_lock);
2959 while (flushing_count.load()) {
2960 flush_cond.wait(l);
2961 }
2962 }
2963 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
2964 }
2965
2966 // =======================================================
2967 // WriteContext
2968
2969 /// Checks for writes to the same pextent within a blob
2970 bool BlueStore::WriteContext::has_conflict(
2971 BlobRef b,
2972 uint64_t loffs,
2973 uint64_t loffs_end,
2974 uint64_t min_alloc_size)
2975 {
2976 assert((loffs % min_alloc_size) == 0);
2977 assert((loffs_end % min_alloc_size) == 0);
2978 for (auto w : writes) {
2979 if (b == w.b) {
2980 auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
2981 auto loffs2_end = P2ROUNDUP(w.logical_offset + w.length0, min_alloc_size);
2982 if ((loffs <= loffs2 && loffs_end > loffs2) ||
2983 (loffs >= loffs2 && loffs < loffs2_end)) {
2984 return true;
2985 }
2986 }
2987 }
2988 return false;
2989 }
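// Worked example (illustrative): with min_alloc_size 0x10000, a queued write
// at logical 0x12000~0x1000 occupies the aligned unit [0x10000, 0x20000); a
// later write to the same blob whose aligned range is also [0x10000, 0x20000)
// is reported as a conflict.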
2990
2991 // =======================================================
2992
2993 // DeferredBatch
2994 #undef dout_prefix
2995 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
2996
2997 void BlueStore::DeferredBatch::prepare_write(
2998 CephContext *cct,
2999 uint64_t seq, uint64_t offset, uint64_t length,
3000 bufferlist::const_iterator& blp)
3001 {
3002 _discard(cct, offset, length);
3003 auto i = iomap.insert(make_pair(offset, deferred_io()));
3004 assert(i.second); // this should be a new insertion
3005 i.first->second.seq = seq;
3006 blp.copy(length, i.first->second.bl);
3007 i.first->second.bl.reassign_to_mempool(
3008 mempool::mempool_bluestore_writing_deferred);
3009 dout(20) << __func__ << " seq " << seq
3010 << " 0x" << std::hex << offset << "~" << length
3011 << " crc " << i.first->second.bl.crc32c(-1)
3012 << std::dec << dendl;
3013 seq_bytes[seq] += length;
3014 #ifdef DEBUG_DEFERRED
3015 _audit(cct);
3016 #endif
3017 }
3018
3019 void BlueStore::DeferredBatch::_discard(
3020 CephContext *cct, uint64_t offset, uint64_t length)
3021 {
3022 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3023 << std::dec << dendl;
3024 auto p = iomap.lower_bound(offset);
3025 if (p != iomap.begin()) {
3026 --p;
3027 auto end = p->first + p->second.bl.length();
3028 if (end > offset) {
3029 bufferlist head;
3030 head.substr_of(p->second.bl, 0, offset - p->first);
3031 dout(20) << __func__ << " keep head " << p->second.seq
3032 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3033 << " -> 0x" << head.length() << std::dec << dendl;
3034 auto i = seq_bytes.find(p->second.seq);
3035 assert(i != seq_bytes.end());
3036 if (end > offset + length) {
3037 bufferlist tail;
3038 tail.substr_of(p->second.bl, offset + length - p->first,
3039 end - (offset + length));
3040 dout(20) << __func__ << " keep tail " << p->second.seq
3041 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3042 << " -> 0x" << tail.length() << std::dec << dendl;
3043 auto &n = iomap[offset + length];
3044 n.bl.swap(tail);
3045 n.seq = p->second.seq;
3046 i->second -= length;
3047 } else {
3048 i->second -= end - offset;
3049 }
3050 assert(i->second >= 0);
3051 p->second.bl.swap(head);
3052 }
3053 ++p;
3054 }
3055 while (p != iomap.end()) {
3056 if (p->first >= offset + length) {
3057 break;
3058 }
3059 auto i = seq_bytes.find(p->second.seq);
3060 assert(i != seq_bytes.end());
3061 auto end = p->first + p->second.bl.length();
3062 if (end > offset + length) {
3063 unsigned drop_front = offset + length - p->first;
3064 unsigned keep_tail = end - (offset + length);
3065 dout(20) << __func__ << " truncate front " << p->second.seq
3066 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3067 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3068 << " to 0x" << (offset + length) << "~" << keep_tail
3069 << std::dec << dendl;
3070 auto &s = iomap[offset + length];
3071 s.seq = p->second.seq;
3072 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3073 i->second -= drop_front;
3074 } else {
3075 dout(20) << __func__ << " drop " << p->second.seq
3076 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3077 << std::dec << dendl;
3078 i->second -= p->second.bl.length();
3079 }
3080 assert(i->second >= 0);
3081 p = iomap.erase(p);
3082 }
3083 }
3084
3085 void BlueStore::DeferredBatch::_audit(CephContext *cct)
3086 {
3087 map<uint64_t,int> sb;
3088 for (auto p : seq_bytes) {
3089 sb[p.first] = 0; // make sure we have the same set of keys
3090 }
3091 uint64_t pos = 0;
3092 for (auto& p : iomap) {
3093 assert(p.first >= pos);
3094 sb[p.second.seq] += p.second.bl.length();
3095 pos = p.first + p.second.bl.length();
3096 }
3097 assert(sb == seq_bytes);
3098 }
3099
3100
3101 // Collection
3102
3103 #undef dout_prefix
3104 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3105
3106 BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
3107 : store(ns),
3108 cache(c),
3109 cid(cid),
3110 lock("BlueStore::Collection::lock", true, false),
3111 exists(true),
3112 onode_map(c)
3113 {
3114 }
3115
3116 void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3117 {
3118 assert(!b->shared_blob);
3119 const bluestore_blob_t& blob = b->get_blob();
3120 if (!blob.is_shared()) {
3121 b->shared_blob = new SharedBlob(this);
3122 return;
3123 }
3124
3125 b->shared_blob = shared_blob_set.lookup(sbid);
3126 if (b->shared_blob) {
3127 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3128 << std::dec << " had " << *b->shared_blob << dendl;
3129 } else {
3130 b->shared_blob = new SharedBlob(sbid, this);
3131 shared_blob_set.add(this, b->shared_blob.get());
3132 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3133 << std::dec << " opened " << *b->shared_blob
3134 << dendl;
3135 }
3136 }
3137
3138 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3139 {
3140 if (!sb->is_loaded()) {
3141
3142 bufferlist v;
3143 string key;
3144 auto sbid = sb->get_sbid();
3145 get_shared_blob_key(sbid, &key);
3146 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3147 if (r < 0) {
3148 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3149 << std::dec << " not found at key "
3150 << pretty_binary_string(key) << dendl;
3151 assert(0 == "uh oh, missing shared_blob");
3152 }
3153
3154 sb->loaded = true;
3155 sb->persistent = new bluestore_shared_blob_t(sbid);
3156 bufferlist::iterator p = v.begin();
3157 ::decode(*(sb->persistent), p);
3158 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3159 << std::dec << " loaded shared_blob " << *sb << dendl;
3160 }
3161 }
3162
3163 void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3164 {
3165 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
3166 assert(!b->shared_blob->is_loaded());
3167
3168 // update blob
3169 bluestore_blob_t& blob = b->dirty_blob();
3170 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
3171
3172 // update shared blob
3173 b->shared_blob->loaded = true;
3174 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3175 shared_blob_set.add(this, b->shared_blob.get());
3176 for (auto p : blob.get_extents()) {
3177 if (p.is_valid()) {
3178 b->shared_blob->get_ref(
3179 p.offset,
3180 p.length);
3181 }
3182 }
3183 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3184 }
3185
3186 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3187 {
3188 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3189 assert(sb->is_loaded());
3190
3191 uint64_t sbid = sb->get_sbid();
3192 shared_blob_set.remove(sb);
3193 sb->loaded = false;
3194 delete sb->persistent;
3195 sb->sbid_unloaded = 0;
3196 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3197 return sbid;
3198 }
3199
3200 BlueStore::OnodeRef BlueStore::Collection::get_onode(
3201 const ghobject_t& oid,
3202 bool create)
3203 {
3204 assert(create ? lock.is_wlocked() : lock.is_locked());
3205
3206 spg_t pgid;
3207 if (cid.is_pg(&pgid)) {
3208 if (!oid.match(cnode.bits, pgid.ps())) {
3209 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3210 << pgid << " bits " << cnode.bits << dendl;
3211 ceph_abort();
3212 }
3213 }
3214
3215 OnodeRef o = onode_map.lookup(oid);
3216 if (o)
3217 return o;
3218
3219 mempool::bluestore_cache_other::string key;
3220 get_object_key(store->cct, oid, &key);
3221
3222 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3223 << pretty_binary_string(key) << dendl;
3224
3225 bufferlist v;
3226 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3227 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3228 Onode *on;
3229 if (v.length() == 0) {
3230 assert(r == -ENOENT);
3231 if (!store->cct->_conf->bluestore_debug_misc &&
3232 !create)
3233 return OnodeRef();
3234
3235 // new object, new onode
3236 on = new Onode(this, oid, key);
3237 } else {
3238 // loaded
3239 assert(r >= 0);
3240 on = new Onode(this, oid, key);
3241 on->exists = true;
3242 bufferptr::iterator p = v.front().begin_deep();
3243 on->onode.decode(p);
3244 for (auto& i : on->onode.attrs) {
3245 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3246 }
3247
3248 // initialize extent_map
3249 on->extent_map.decode_spanning_blobs(p);
3250 if (on->onode.extent_map_shards.empty()) {
3251 denc(on->extent_map.inline_bl, p);
3252 on->extent_map.decode_some(on->extent_map.inline_bl);
3253 on->extent_map.inline_bl.reassign_to_mempool(
3254 mempool::mempool_bluestore_cache_other);
3255 } else {
3256 on->extent_map.init_shards(false, false);
3257 }
3258 }
3259 o.reset(on);
3260 return onode_map.add(oid, o);
3261 }
3262
3263 void BlueStore::Collection::split_cache(
3264 Collection *dest)
3265 {
3266 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3267
3268 // lock (one or both) cache shards
3269 std::lock(cache->lock, dest->cache->lock);
3270 std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
3271 std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);
3272
3273 int destbits = dest->cnode.bits;
3274 spg_t destpg;
3275 bool is_pg = dest->cid.is_pg(&destpg);
3276 assert(is_pg);
3277
3278 auto p = onode_map.onode_map.begin();
3279 while (p != onode_map.onode_map.end()) {
3280 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3281 // onode does not belong to this child
3282 ++p;
3283 } else {
3284 OnodeRef o = p->second;
3285 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3286 << dendl;
3287
3288 cache->_rm_onode(p->second);
3289 p = onode_map.onode_map.erase(p);
3290
3291 o->c = dest;
3292 dest->cache->_add_onode(o, 1);
3293 dest->onode_map.onode_map[o->oid] = o;
3294 dest->onode_map.cache = dest->cache;
3295
3296 // move over shared blobs and buffers. cover shared blobs from
3297 // both extent map and spanning blob map (the full extent map
3298 // may not be faulted in)
3299 vector<SharedBlob*> sbvec;
3300 for (auto& e : o->extent_map.extent_map) {
3301 sbvec.push_back(e.blob->shared_blob.get());
3302 }
3303 for (auto& b : o->extent_map.spanning_blob_map) {
3304 sbvec.push_back(b.second->shared_blob.get());
3305 }
3306 for (auto sb : sbvec) {
3307 if (sb->coll == dest) {
3308 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3309 << dendl;
3310 continue;
3311 }
3312 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
3313 if (sb->get_sbid()) {
3314 ldout(store->cct, 20) << __func__
3315 << " moving registration " << *sb << dendl;
3316 shared_blob_set.remove(sb);
3317 dest->shared_blob_set.add(dest, sb);
3318 }
3319 sb->coll = dest;
3320 if (dest->cache != cache) {
3321 for (auto& i : sb->bc.buffer_map) {
3322 if (!i.second->is_writing()) {
3323 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3324 << dendl;
3325 dest->cache->_move_buffer(cache, i.second.get());
3326 }
3327 }
3328 }
3329 }
3330 }
3331 }
3332 }
3333
3334 // =======================================================
3335
3336 void *BlueStore::MempoolThread::entry()
3337 {
3338 Mutex::Locker l(lock);
3339 while (!stop) {
3340 uint64_t meta_bytes =
3341 mempool::bluestore_cache_other::allocated_bytes() +
3342 mempool::bluestore_cache_onode::allocated_bytes();
3343 uint64_t onode_num =
3344 mempool::bluestore_cache_onode::allocated_items();
3345
3346 if (onode_num < 2) {
3347 onode_num = 2;
3348 }
3349
3350 float bytes_per_onode = (float)meta_bytes / (float)onode_num;
3351 size_t num_shards = store->cache_shards.size();
3352 float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
3353 // A little sloppy but should be close enough
3354 uint64_t shard_target = target_ratio * (store->cache_size / num_shards);
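// e.g. (illustrative numbers) a 1 GiB cache split over 4 shards with
// meta+data ratios summing to 0.99 gives each shard a trim target of
// roughly 0.99 * (1 GiB / 4) ~= 253 MiB.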
3355
3356 for (auto i : store->cache_shards) {
3357 i->trim(shard_target,
3358 store->cache_meta_ratio,
3359 store->cache_data_ratio,
3360 bytes_per_onode);
3361 }
3362
3363 store->_update_cache_logger();
3364
3365 utime_t wait;
3366 wait += store->cct->_conf->bluestore_cache_trim_interval;
3367 cond.WaitInterval(lock, wait);
3368 }
3369 stop = false;
3370 return NULL;
3371 }
3372
3373 // =======================================================
3374
3375 // OmapIteratorImpl
3376
3377 #undef dout_prefix
3378 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3379
3380 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3381 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3382 : c(c), o(o), it(it)
3383 {
3384 RWLock::RLocker l(c->lock);
3385 if (o->onode.has_omap()) {
3386 get_omap_key(o->onode.nid, string(), &head);
3387 get_omap_tail(o->onode.nid, &tail);
3388 it->lower_bound(head);
3389 }
3390 }
3391
3392 int BlueStore::OmapIteratorImpl::seek_to_first()
3393 {
3394 RWLock::RLocker l(c->lock);
3395 if (o->onode.has_omap()) {
3396 it->lower_bound(head);
3397 } else {
3398 it = KeyValueDB::Iterator();
3399 }
3400 return 0;
3401 }
3402
3403 int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3404 {
3405 RWLock::RLocker l(c->lock);
3406 if (o->onode.has_omap()) {
3407 string key;
3408 get_omap_key(o->onode.nid, after, &key);
3409 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3410 << pretty_binary_string(key) << dendl;
3411 it->upper_bound(key);
3412 } else {
3413 it = KeyValueDB::Iterator();
3414 }
3415 return 0;
3416 }
3417
3418 int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3419 {
3420 RWLock::RLocker l(c->lock);
3421 if (o->onode.has_omap()) {
3422 string key;
3423 get_omap_key(o->onode.nid, to, &key);
3424 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3425 << pretty_binary_string(key) << dendl;
3426 it->lower_bound(key);
3427 } else {
3428 it = KeyValueDB::Iterator();
3429 }
3430 return 0;
3431 }
3432
3433 bool BlueStore::OmapIteratorImpl::valid()
3434 {
3435 RWLock::RLocker l(c->lock);
3436 bool r = o->onode.has_omap() && it && it->valid() &&
3437 it->raw_key().second <= tail;
3438 if (it && it->valid()) {
3439 ldout(c->store->cct,20) << __func__ << " is at "
3440 << pretty_binary_string(it->raw_key().second)
3441 << dendl;
3442 }
3443 return r;
3444 }
3445
3446 int BlueStore::OmapIteratorImpl::next(bool validate)
3447 {
3448 RWLock::RLocker l(c->lock);
3449 if (o->onode.has_omap()) {
3450 it->next();
3451 return 0;
3452 } else {
3453 return -1;
3454 }
3455 }
3456
3457 string BlueStore::OmapIteratorImpl::key()
3458 {
3459 RWLock::RLocker l(c->lock);
3460 assert(it->valid());
3461 string db_key = it->raw_key().second;
3462 string user_key;
3463 decode_omap_key(db_key, &user_key);
3464 return user_key;
3465 }
3466
3467 bufferlist BlueStore::OmapIteratorImpl::value()
3468 {
3469 RWLock::RLocker l(c->lock);
3470 assert(it->valid());
3471 return it->value();
3472 }
3473
3474
3475 // =====================================
3476
3477 #undef dout_prefix
3478 #define dout_prefix *_dout << "bluestore(" << path << ") "
3479
3480
3481 static void aio_cb(void *priv, void *priv2)
3482 {
3483 BlueStore *store = static_cast<BlueStore*>(priv);
3484 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3485 c->aio_finish(store);
3486 }
3487
3488 BlueStore::BlueStore(CephContext *cct, const string& path)
3489 : ObjectStore(cct, path),
3490 throttle_bytes(cct, "bluestore_throttle_bytes",
3491 cct->_conf->bluestore_throttle_bytes),
3492 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3493 cct->_conf->bluestore_throttle_bytes +
3494 cct->_conf->bluestore_throttle_deferred_bytes),
3495 deferred_finisher(cct, "defered_finisher", "dfin"),
3496 kv_sync_thread(this),
3497 kv_finalize_thread(this),
3498 mempool_thread(this)
3499 {
3500 _init_logger();
3501 cct->_conf->add_observer(this);
3502 set_cache_shards(1);
3503 }
3504
3505 BlueStore::BlueStore(CephContext *cct,
3506 const string& path,
3507 uint64_t _min_alloc_size)
3508 : ObjectStore(cct, path),
3509 throttle_bytes(cct, "bluestore_throttle_bytes",
3510 cct->_conf->bluestore_throttle_bytes),
3511 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3512 cct->_conf->bluestore_throttle_bytes +
3513 cct->_conf->bluestore_throttle_deferred_bytes),
3514 deferred_finisher(cct, "defered_finisher", "dfin"),
3515 kv_sync_thread(this),
3516 kv_finalize_thread(this),
3517 min_alloc_size(_min_alloc_size),
3518 min_alloc_size_order(ctz(_min_alloc_size)),
3519 mempool_thread(this)
3520 {
3521 _init_logger();
3522 cct->_conf->add_observer(this);
3523 set_cache_shards(1);
3524 }
3525
3526 BlueStore::~BlueStore()
3527 {
3528 for (auto f : finishers) {
3529 delete f;
3530 }
3531 finishers.clear();
3532
3533 cct->_conf->remove_observer(this);
3534 _shutdown_logger();
3535 assert(!mounted);
3536 assert(db == NULL);
3537 assert(bluefs == NULL);
3538 assert(fsid_fd < 0);
3539 assert(path_fd < 0);
3540 for (auto i : cache_shards) {
3541 delete i;
3542 }
3543 cache_shards.clear();
3544 }
3545
3546 const char **BlueStore::get_tracked_conf_keys() const
3547 {
3548 static const char* KEYS[] = {
3549 "bluestore_csum_type",
3550 "bluestore_compression_mode",
3551 "bluestore_compression_algorithm",
3552 "bluestore_compression_min_blob_size",
3553 "bluestore_compression_min_blob_size_ssd",
3554 "bluestore_compression_min_blob_size_hdd",
3555 "bluestore_compression_max_blob_size",
3556 "bluestore_compression_max_blob_size_ssd",
3557 "bluestore_compression_max_blob_size_hdd",
3558 "bluestore_compression_required_ratio",
3559 "bluestore_max_alloc_size",
3560 "bluestore_prefer_deferred_size",
3561 "bluestore_prefer_deferred_size_hdd",
3562 "bluestore_prefer_deferred_size_ssd",
3563 "bluestore_deferred_batch_ops",
3564 "bluestore_deferred_batch_ops_hdd",
3565 "bluestore_deferred_batch_ops_ssd",
3566 "bluestore_throttle_bytes",
3567 "bluestore_throttle_deferred_bytes",
3568 "bluestore_throttle_cost_per_io_hdd",
3569 "bluestore_throttle_cost_per_io_ssd",
3570 "bluestore_throttle_cost_per_io",
3571 "bluestore_max_blob_size",
3572 "bluestore_max_blob_size_ssd",
3573 "bluestore_max_blob_size_hdd",
3574 NULL
3575 };
3576 return KEYS;
3577 }
3578
3579 void BlueStore::handle_conf_change(const struct md_config_t *conf,
3580 const std::set<std::string> &changed)
3581 {
3582 if (changed.count("bluestore_csum_type")) {
3583 _set_csum();
3584 }
3585 if (changed.count("bluestore_compression_mode") ||
3586 changed.count("bluestore_compression_algorithm") ||
3587 changed.count("bluestore_compression_min_blob_size") ||
3588 changed.count("bluestore_compression_max_blob_size")) {
3589 if (bdev) {
3590 _set_compression();
3591 }
3592 }
3593 if (changed.count("bluestore_max_blob_size") ||
3594 changed.count("bluestore_max_blob_size_ssd") ||
3595 changed.count("bluestore_max_blob_size_hdd")) {
3596 if (bdev) {
3597 // only after startup
3598 _set_blob_size();
3599 }
3600 }
3601 if (changed.count("bluestore_prefer_deferred_size") ||
3602 changed.count("bluestore_prefer_deferred_size_hdd") ||
3603 changed.count("bluestore_prefer_deferred_size_ssd") ||
3604 changed.count("bluestore_max_alloc_size") ||
3605 changed.count("bluestore_deferred_batch_ops") ||
3606 changed.count("bluestore_deferred_batch_ops_hdd") ||
3607 changed.count("bluestore_deferred_batch_ops_ssd")) {
3608 if (bdev) {
3609 // only after startup
3610 _set_alloc_sizes();
3611 }
3612 }
3613 if (changed.count("bluestore_throttle_cost_per_io") ||
3614 changed.count("bluestore_throttle_cost_per_io_hdd") ||
3615 changed.count("bluestore_throttle_cost_per_io_ssd")) {
3616 if (bdev) {
3617 _set_throttle_params();
3618 }
3619 }
3620 if (changed.count("bluestore_throttle_bytes")) {
3621 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
3622 throttle_deferred_bytes.reset_max(
3623 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3624 }
3625 if (changed.count("bluestore_throttle_deferred_bytes")) {
3626 throttle_deferred_bytes.reset_max(
3627 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3628 }
3629 }
3630
3631 void BlueStore::_set_compression()
3632 {
3633 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
3634 if (m) {
3635 comp_mode = *m;
3636 } else {
3637 derr << __func__ << " unrecognized value '"
3638 << cct->_conf->bluestore_compression_mode
3639 << "' for bluestore_compression_mode, reverting to 'none'"
3640 << dendl;
3641 comp_mode = Compressor::COMP_NONE;
3642 }
3643
3644 compressor = nullptr;
3645
3646 if (comp_mode == Compressor::COMP_NONE) {
3647 dout(10) << __func__ << " compression mode set to 'none', "
3648 << "ignore other compression setttings" << dendl;
3649 return;
3650 }
3651
3652 if (cct->_conf->bluestore_compression_min_blob_size) {
3653 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
3654 } else {
3655 assert(bdev);
3656 if (bdev->is_rotational()) {
3657 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
3658 } else {
3659 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
3660 }
3661 }
3662
3663 if (cct->_conf->bluestore_compression_max_blob_size) {
3664 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
3665 } else {
3666 assert(bdev);
3667 if (bdev->is_rotational()) {
3668 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
3669 } else {
3670 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
3671 }
3672 }
3673
3674 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
3675 if (!alg_name.empty()) {
3676 compressor = Compressor::create(cct, alg_name);
3677 if (!compressor) {
3678 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
3679 << dendl;
3680 }
3681 }
3682
3683 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
3684 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
3685 << dendl;
3686 }
3687
3688 void BlueStore::_set_csum()
3689 {
3690 csum_type = Checksummer::CSUM_NONE;
3691 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
3692 if (t > Checksummer::CSUM_NONE)
3693 csum_type = t;
3694
3695 dout(10) << __func__ << " csum_type "
3696 << Checksummer::get_csum_type_string(csum_type)
3697 << dendl;
3698 }
3699
3700 void BlueStore::_set_throttle_params()
3701 {
3702 if (cct->_conf->bluestore_throttle_cost_per_io) {
3703 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
3704 } else {
3705 assert(bdev);
3706 if (bdev->is_rotational()) {
3707 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
3708 } else {
3709 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
3710 }
3711 }
3712
3713 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
3714 << dendl;
3715 }
3716 void BlueStore::_set_blob_size()
3717 {
3718 if (cct->_conf->bluestore_max_blob_size) {
3719 max_blob_size = cct->_conf->bluestore_max_blob_size;
3720 } else {
3721 assert(bdev);
3722 if (bdev->is_rotational()) {
3723 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
3724 } else {
3725 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
3726 }
3727 }
3728 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
3729 << std::dec << dendl;
3730 }
3731
3732 int BlueStore::_set_cache_sizes()
3733 {
3734 assert(bdev);
3735 if (cct->_conf->bluestore_cache_size) {
3736 cache_size = cct->_conf->bluestore_cache_size;
3737 } else {
3738 // choose global cache size based on backend type
3739 if (bdev->is_rotational()) {
3740 cache_size = cct->_conf->bluestore_cache_size_hdd;
3741 } else {
3742 cache_size = cct->_conf->bluestore_cache_size_ssd;
3743 }
3744 }
3745 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
3746 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
3747
3748 double cache_kv_max = cct->_conf->bluestore_cache_kv_max;
3749 double cache_kv_max_ratio = 0;
3750
3751 // if cache_kv_max is negative, disable it
3752 if (cache_size > 0 && cache_kv_max >= 0) {
3753 cache_kv_max_ratio = (double) cache_kv_max / (double) cache_size;
3754 if (cache_kv_max_ratio < 1.0 && cache_kv_max_ratio < cache_kv_ratio) {
3755 dout(1) << __func__ << " max " << cache_kv_max_ratio
3756 << " < ratio " << cache_kv_ratio
3757 << dendl;
3758 cache_meta_ratio = cache_meta_ratio + cache_kv_ratio - cache_kv_max_ratio;
3759 cache_kv_ratio = cache_kv_max_ratio;
3760 }
3761 }
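// e.g. (illustrative numbers) with a 3 GiB cache and cache_kv_max of 512 MiB,
// the kv ratio is capped at ~0.17 and the amount it was reduced by is shifted
// into the meta ratio, so the overall cache budget is unchanged.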
3762
3763 cache_data_ratio =
3764 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
3765
3766 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
3767 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3768 << ") must be in range [0,1.0]" << dendl;
3769 return -EINVAL;
3770 }
3771 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
3772 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
3773 << ") must be in range [0,1.0]" << dendl;
3774 return -EINVAL;
3775 }
3776 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
3777 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3778 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
3779 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
3780 << dendl;
3781 return -EINVAL;
3782 }
3783 if (cache_data_ratio < 0) {
3784 // deal with floating point imprecision
3785 cache_data_ratio = 0;
3786 }
3787 dout(1) << __func__ << " cache_size " << cache_size
3788 << " meta " << cache_meta_ratio
3789 << " kv " << cache_kv_ratio
3790 << " data " << cache_data_ratio
3791 << dendl;
3792 return 0;
3793 }
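
// Worked example for the clamp above (all numbers hypothetical):
// suppose cache_size = 3 GiB, cache_meta_ratio = 0.01,
// cache_kv_ratio = 0.99 and bluestore_cache_kv_max = 512 MiB.  Then
//   cache_kv_max_ratio = 512 MiB / 3 GiB ~= 0.167
// which is < 1.0 and < 0.99, so the kv share is clamped:
//   cache_meta_ratio = 0.01 + 0.99 - 0.167 ~= 0.833
//   cache_kv_ratio   = 0.167
//   cache_data_ratio = 1.0 - 0.833 - 0.167  = 0
// i.e. the rocksdb cache is capped at ~512 MiB (see db->set_cache_size()
// in _open_db() below) and the rest of the budget goes to onode/metadata
// caching.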
3794
3795 int BlueStore::write_meta(const std::string& key, const std::string& value)
3796 {
3797 bluestore_bdev_label_t label;
3798 string p = path + "/block";
3799 int r = _read_bdev_label(cct, p, &label);
3800 if (r < 0) {
3801 return ObjectStore::write_meta(key, value);
3802 }
3803 label.meta[key] = value;
3804 r = _write_bdev_label(cct, p, label);
3805 assert(r == 0);
3806 return ObjectStore::write_meta(key, value);
3807 }
3808
3809 int BlueStore::read_meta(const std::string& key, std::string *value)
3810 {
3811 bluestore_bdev_label_t label;
3812 string p = path + "/block";
3813 int r = _read_bdev_label(cct, p, &label);
3814 if (r < 0) {
3815 return ObjectStore::read_meta(key, value);
3816 }
3817 auto i = label.meta.find(key);
3818 if (i == label.meta.end()) {
3819 return ObjectStore::read_meta(key, value);
3820 }
3821 *value = i->second;
3822 return 0;
3823 }
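
// Note (a rough sketch of how these two helpers interact): write_meta()
// mirrors each key into the main block device label *and* the regular
// ObjectStore meta area, and read_meta() prefers the label copy, falling
// back to ObjectStore::read_meta() when the label is absent or the key
// is missing.  _open_db() below relies on this for keys such as
// "kv_backend" and "bluefs", e.g.
//
//   string kv_backend;
//   int r = read_meta("kv_backend", &kv_backend);  // label first, then meta dir
//
// which presumably lets an OSD recover these settings from the block
// device even if the small files in its data directory are lost.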
3824
3825 void BlueStore::_init_logger()
3826 {
3827 PerfCountersBuilder b(cct, "bluestore",
3828 l_bluestore_first, l_bluestore_last);
3829 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
3830 "Average kv_thread flush latency",
3831 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
3832 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
3833 "Average kv_thread commit latency");
3834 b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
3835 "Average kv_thread sync latency",
3836 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
3837 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
3838 "Average prepare state latency");
3839 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
3840 "Average aio_wait state latency",
3841 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
3842 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
3843 "Average io_done state latency");
3844 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
3845 "Average kv_queued state latency");
3846 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
3847 "Average kv_committing state latency");
3848 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
3849 "Average kv_done state latency");
3850 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
3851 "Average deferred_queued state latency");
3852 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
3853 "Average deferred_aio_wait state latency");
3854 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
3855 "Average cleanup state latency");
3856 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
3857 "Average finishing state latency");
3858 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
3859 "Average done state latency");
3860 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
3861 "Average submit throttle latency",
3862 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
3863 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
3864 "Average submit latency",
3865 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
3866 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
3867 "Average commit latency",
3868 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
3869 b.add_time_avg(l_bluestore_read_lat, "read_lat",
3870 "Average read latency",
3871 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
3872 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
3873 "Average read onode metadata latency");
3874 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
3875 "Average read wait-for-aio latency");
3876 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
3877 "Average compress latency");
3878 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
3879 "Average decompress latency");
3880 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
3881 "Average checksum latency");
3882 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
3883 "Sum for beneficial compress ops");
3884 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
3885 "Sum for compress ops rejected due to low net gain of space");
3886 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
3887 "Sum for write-op padded bytes");
3888 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
3889 "Sum for deferred write ops");
3890 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
3891 "Sum for deferred write bytes", "def");
3892 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
3893 "Sum for write penalty read ops");
3894 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
3895 "Sum for allocated bytes");
3896 b.add_u64(l_bluestore_stored, "bluestore_stored",
3897 "Sum for stored bytes");
3898 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
3899 "Sum for stored compressed bytes");
3900 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
3901 "Sum for bytes allocated for compressed data");
3902 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
3903 "Sum for original bytes that were compressed");
3904
3905 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
3906 "Number of onodes in cache");
3907 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
3908 "Sum for onode-lookups hit in the cache");
3909 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
3910 "Sum for onode-lookups missed in the cache");
3911 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
3912 "Sum for onode-shard lookups hit in the cache");
3913 b.add_u64_counter(l_bluestore_onode_shard_misses,
3914 "bluestore_onode_shard_misses",
3915 "Sum for onode-shard lookups missed in the cache");
3916 b.add_u64(l_bluestore_extents, "bluestore_extents",
3917 "Number of extents in cache");
3918 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
3919 "Number of blobs in cache");
3920 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
3921 "Number of buffers in cache");
3922 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
3923 "Number of buffer bytes in cache");
3924 b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
3925 "Sum for bytes of read hit in the cache");
3926 b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
3927 "Sum for bytes of read missed in the cache");
3928
3929 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
3930 "Large aligned writes into fresh blobs");
3931 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
3932 "Large aligned writes into fresh blobs (bytes)");
3933 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
3934 "Large aligned writes into fresh blobs (blobs)");
3935 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
3936 "Small writes into existing or sparse small blobs");
3937 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
3938 "Small writes into existing or sparse small blobs (bytes)");
3939 b.add_u64_counter(l_bluestore_write_small_unused,
3940 "bluestore_write_small_unused",
3941 "Small writes into unused portion of existing blob");
3942 b.add_u64_counter(l_bluestore_write_small_deferred,
3943 "bluestore_write_small_deferred",
3944 "Small overwrites using deferred");
3945 b.add_u64_counter(l_bluestore_write_small_pre_read,
3946 "bluestore_write_small_pre_read",
3947 "Small writes that required we read some data (possibly "
3948 "cached) to fill out the block");
3949 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
3950 "Small write into new (sparse) blob");
3951
3952 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
3953 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
3954 "Onode extent map reshard events");
3955 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
3956 "Sum for blob splitting due to resharding");
3957 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
3958 "Sum for extents that have been removed due to compression");
3959 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
3960 "Sum for extents that have been merged due to garbage "
3961 "collection");
3962 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
3963 "Read EIO errors propagated to high level callers");
3964 logger = b.create_perf_counters();
3965 cct->get_perfcounters_collection()->add(logger);
3966 }
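
// Example of consuming these counters (assumed admin-socket usage, not
// part of this file):
//
//   ceph daemon osd.0 perf dump
//
// prints them under the "bluestore" section; "throttle_lat",
// "submit_lat", "commit_lat" and "read_lat" are registered above with
// PRIO_CRITICAL and are typically the first values to check when
// diagnosing slow requests.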
3967
3968 int BlueStore::_reload_logger()
3969 {
3970 struct store_statfs_t store_statfs;
3971
3972 int r = statfs(&store_statfs);
3973 if(r >= 0) {
3974 logger->set(l_bluestore_allocated, store_statfs.allocated);
3975 logger->set(l_bluestore_stored, store_statfs.stored);
3976 logger->set(l_bluestore_compressed, store_statfs.compressed);
3977 logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
3978 logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
3979 }
3980 return r;
3981 }
3982
3983 void BlueStore::_shutdown_logger()
3984 {
3985 cct->get_perfcounters_collection()->remove(logger);
3986 delete logger;
3987 }
3988
3989 int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
3990 uuid_d *fsid)
3991 {
3992 bluestore_bdev_label_t label;
3993 int r = _read_bdev_label(cct, path, &label);
3994 if (r < 0)
3995 return r;
3996 *fsid = label.osd_uuid;
3997 return 0;
3998 }
3999
4000 int BlueStore::_open_path()
4001 {
4002 // sanity check(s)
4003 if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
4004 4*1024*1024*1024ull) {
4005 derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has a hard limit of 4GB." << dendl;
4006 return -EINVAL;
4007 }
4008 assert(path_fd < 0);
4009 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY));
4010 if (path_fd < 0) {
4011 int r = -errno;
4012 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
4013 << dendl;
4014 return r;
4015 }
4016 return 0;
4017 }
4018
4019 void BlueStore::_close_path()
4020 {
4021 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
4022 path_fd = -1;
4023 }
4024
4025 int BlueStore::_write_bdev_label(CephContext *cct,
4026 string path, bluestore_bdev_label_t label)
4027 {
4028 dout(10) << __func__ << " path " << path << " label " << label << dendl;
4029 bufferlist bl;
4030 ::encode(label, bl);
4031 uint32_t crc = bl.crc32c(-1);
4032 ::encode(crc, bl);
4033 assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
4034 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
4035 z.zero();
4036 bl.append(std::move(z));
4037
4038 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY));
4039 if (fd < 0) {
4040 fd = -errno;
4041 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4042 << dendl;
4043 return fd;
4044 }
4045 int r = bl.write_fd(fd);
4046 if (r < 0) {
4047 derr << __func__ << " failed to write to " << path
4048 << ": " << cpp_strerror(r) << dendl;
4049 }
4050 r = ::fsync(fd);
4051 if (r < 0) {
4052 derr << __func__ << " failed to fsync " << path
4053 << ": " << cpp_strerror(r) << dendl;
4054 }
4055 VOID_TEMP_FAILURE_RETRY(::close(fd));
4056 return r;
4057 }
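
// Label block layout as written above (offsets within the first
// BDEV_LABEL_BLOCK_SIZE bytes of the device):
//   [ encoded bluestore_bdev_label_t (osd_uuid, size, btime, description, meta) ]
//   [ uint32_t crc32c of the encoded label                                      ]
//   [ zero padding up to BDEV_LABEL_BLOCK_SIZE                                  ]
// _read_bdev_label() below reads one label block, recomputes the crc over
// the decoded portion and returns -EIO if it does not match the stored value.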
4058
4059 int BlueStore::_read_bdev_label(CephContext* cct, string path,
4060 bluestore_bdev_label_t *label)
4061 {
4062 dout(10) << __func__ << dendl;
4063 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY));
4064 if (fd < 0) {
4065 fd = -errno;
4066 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4067 << dendl;
4068 return fd;
4069 }
4070 bufferlist bl;
4071 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4072 VOID_TEMP_FAILURE_RETRY(::close(fd));
4073 if (r < 0) {
4074 derr << __func__ << " failed to read from " << path
4075 << ": " << cpp_strerror(r) << dendl;
4076 return r;
4077 }
4078
4079 uint32_t crc, expected_crc;
4080 bufferlist::iterator p = bl.begin();
4081 try {
4082 ::decode(*label, p);
4083 bufferlist t;
4084 t.substr_of(bl, 0, p.get_off());
4085 crc = t.crc32c(-1);
4086 ::decode(expected_crc, p);
4087 }
4088 catch (buffer::error& e) {
4089 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
4090 << ": " << e.what()
4091 << dendl;
4092 return -ENOENT;
4093 }
4094 if (crc != expected_crc) {
4095 derr << __func__ << " bad crc on label, expected " << expected_crc
4096 << " != actual " << crc << dendl;
4097 return -EIO;
4098 }
4099 dout(10) << __func__ << " got " << *label << dendl;
4100 return 0;
4101 }
4102
4103 int BlueStore::_check_or_set_bdev_label(
4104 string path, uint64_t size, string desc, bool create)
4105 {
4106 bluestore_bdev_label_t label;
4107 if (create) {
4108 label.osd_uuid = fsid;
4109 label.size = size;
4110 label.btime = ceph_clock_now();
4111 label.description = desc;
4112 int r = _write_bdev_label(cct, path, label);
4113 if (r < 0)
4114 return r;
4115 } else {
4116 int r = _read_bdev_label(cct, path, &label);
4117 if (r < 0)
4118 return r;
4119 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4120 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4121 << " and fsid " << fsid << " check bypassed" << dendl;
4122 }
4123 else if (label.osd_uuid != fsid) {
4124 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4125 << " does not match our fsid " << fsid << dendl;
4126 return -EIO;
4127 }
4128 }
4129 return 0;
4130 }
4131
4132 void BlueStore::_set_alloc_sizes(void)
4133 {
4134 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4135
4136 if (cct->_conf->bluestore_prefer_deferred_size) {
4137 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4138 } else {
4139 assert(bdev);
4140 if (bdev->is_rotational()) {
4141 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4142 } else {
4143 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4144 }
4145 }
4146
4147 if (cct->_conf->bluestore_deferred_batch_ops) {
4148 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4149 } else {
4150 assert(bdev);
4151 if (bdev->is_rotational()) {
4152 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4153 } else {
4154 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4155 }
4156 }
4157
4158 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
4159 << std::dec << " order " << min_alloc_size_order
4160 << " max_alloc_size 0x" << std::hex << max_alloc_size
4161 << " prefer_deferred_size 0x" << prefer_deferred_size
4162 << std::dec
4163 << " deferred_batch_ops " << deferred_batch_ops
4164 << dendl;
4165 }
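
// Interpretation (not stated in the original comments): writes no larger
// than prefer_deferred_size are expected to take the deferred path -- the
// payload is first committed into the KV store alongside the transaction
// and only written back to its final location later -- while larger
// writes allocate fresh space and go to disk directly.
// deferred_batch_ops bounds how many queued deferred writes are submitted
// together.  The *_hdd defaults are larger than the *_ssd ones since the
// extra copy is usually cheaper than a seek on rotational media.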
4166
4167 int BlueStore::_open_bdev(bool create)
4168 {
4169 assert(bdev == NULL);
4170 string p = path + "/block";
4171 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
4172 int r = bdev->open(p);
4173 if (r < 0)
4174 goto fail;
4175
4176 if (bdev->supported_bdev_label()) {
4177 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4178 if (r < 0)
4179 goto fail_close;
4180 }
4181
4182 // initialize global block parameters
4183 block_size = bdev->get_block_size();
4184 block_mask = ~(block_size - 1);
4185 block_size_order = ctz(block_size);
4186 assert(block_size == 1u << block_size_order);
4187 // and set cache_size based on device type
4188 r = _set_cache_sizes();
4189 if (r < 0) {
4190 goto fail_close;
4191 }
4192 return 0;
4193
4194 fail_close:
4195 bdev->close();
4196 fail:
4197 delete bdev;
4198 bdev = NULL;
4199 return r;
4200 }
4201
4202 void BlueStore::_close_bdev()
4203 {
4204 assert(bdev);
4205 bdev->close();
4206 delete bdev;
4207 bdev = NULL;
4208 }
4209
4210 int BlueStore::_open_fm(bool create)
4211 {
4212 assert(fm == NULL);
4213 fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
4214
4215 if (create) {
4216 // initialize freespace
4217 dout(20) << __func__ << " initializing freespace" << dendl;
4218 KeyValueDB::Transaction t = db->get_transaction();
4219 {
4220 bufferlist bl;
4221 bl.append(freelist_type);
4222 t->set(PREFIX_SUPER, "freelist_type", bl);
4223 }
4224 // being able to allocate in units less than bdev block size
4225 // seems to be a bad idea.
4226 assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
4227 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
4228
4229 // allocate superblock reserved space. note that we do not mark
4230 // bluefs space as allocated in the freelist; we instead rely on
4231 // bluefs_extents.
4232 uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
4233 min_alloc_size);
4234 fm->allocate(0, reserved, t);
4235
4236 if (cct->_conf->bluestore_bluefs) {
4237 assert(bluefs_extents.num_intervals() == 1);
4238 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
4239 reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
4240 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4241 << " for bluefs" << dendl;
4242 bufferlist bl;
4243 ::encode(bluefs_extents, bl);
4244 t->set(PREFIX_SUPER, "bluefs_extents", bl);
4245 dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
4246 << std::dec << dendl;
4247 }
4248
4249 if (cct->_conf->bluestore_debug_prefill > 0) {
4250 uint64_t end = bdev->get_size() - reserved;
4251 dout(1) << __func__ << " pre-fragmenting freespace, using "
4252 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4253 << cct->_conf->bluestore_debug_prefragment_max << dendl;
4254 uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
4255 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4256 float r = cct->_conf->bluestore_debug_prefill;
4257 r /= 1.0 - r;
4258 bool stop = false;
4259
4260 while (!stop && start < end) {
4261 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4262 if (start + l > end) {
4263 l = end - start;
4264 l = P2ALIGN(l, min_alloc_size);
4265 }
4266 assert(start + l <= end);
4267
4268 uint64_t u = 1 + (uint64_t)(r * (double)l);
4269 u = P2ROUNDUP(u, min_alloc_size);
4270 if (start + l + u > end) {
4271 u = end - (start + l);
4272 // trim to align so we don't overflow again
4273 u = P2ALIGN(u, min_alloc_size);
4274 stop = true;
4275 }
4276 assert(start + l + u <= end);
4277
4278 dout(20) << " free 0x" << std::hex << start << "~" << l
4279 << " use 0x" << u << std::dec << dendl;
4280
4281 if (u == 0) {
4282 // break if u has been trimmed to nothing
4283 break;
4284 }
4285
4286 fm->allocate(start + l, u, t);
4287 start += l + u;
4288 }
4289 }
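
// Worked example for the pre-fragmentation loop above: with
// bluestore_debug_prefill = 0.2 the code computes r = 0.2 / (1 - 0.2)
// = 0.25, so after each randomly sized free run of length l it marks
// roughly u = 0.25 * l as allocated.  The used fraction is therefore
// u / (l + u) = 0.25 / 1.25 = 0.2 -- the requested prefill ratio --
// with free and used runs alternating across the device to simulate a
// fragmented freelist.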
4290 db->submit_transaction_sync(t);
4291 }
4292
4293 int r = fm->init(bdev->get_size());
4294 if (r < 0) {
4295 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4296 delete fm;
4297 fm = NULL;
4298 return r;
4299 }
4300 return 0;
4301 }
4302
4303 void BlueStore::_close_fm()
4304 {
4305 dout(10) << __func__ << dendl;
4306 assert(fm);
4307 fm->shutdown();
4308 delete fm;
4309 fm = NULL;
4310 }
4311
4312 int BlueStore::_open_alloc()
4313 {
4314 assert(alloc == NULL);
4315 assert(bdev->get_size());
4316 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4317 bdev->get_size(),
4318 min_alloc_size);
4319 if (!alloc) {
4320 lderr(cct) << __func__ << " Allocator::unknown alloc type "
4321 << cct->_conf->bluestore_allocator
4322 << dendl;
4323 return -EINVAL;
4324 }
4325
4326 uint64_t num = 0, bytes = 0;
4327
4328 dout(1) << __func__ << " opening allocation metadata" << dendl;
4329 // initialize from freelist
4330 fm->enumerate_reset();
4331 uint64_t offset, length;
4332 while (fm->enumerate_next(&offset, &length)) {
4333 alloc->init_add_free(offset, length);
4334 ++num;
4335 bytes += length;
4336 }
4337 fm->enumerate_reset();
4338 dout(1) << __func__ << " loaded " << pretty_si_t(bytes)
4339 << " in " << num << " extents"
4340 << dendl;
4341
4342 // also mark bluefs space as allocated
4343 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4344 alloc->init_rm_free(e.get_start(), e.get_len());
4345 }
4346 dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
4347 << bluefs_extents << std::dec << " as allocated" << dendl;
4348
4349 return 0;
4350 }
4351
4352 void BlueStore::_close_alloc()
4353 {
4354 assert(alloc);
4355 alloc->shutdown();
4356 delete alloc;
4357 alloc = NULL;
4358 }
4359
4360 int BlueStore::_open_fsid(bool create)
4361 {
4362 assert(fsid_fd < 0);
4363 int flags = O_RDWR;
4364 if (create)
4365 flags |= O_CREAT;
4366 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4367 if (fsid_fd < 0) {
4368 int err = -errno;
4369 derr << __func__ << " " << cpp_strerror(err) << dendl;
4370 return err;
4371 }
4372 return 0;
4373 }
4374
4375 int BlueStore::_read_fsid(uuid_d *uuid)
4376 {
4377 char fsid_str[40];
4378 memset(fsid_str, 0, sizeof(fsid_str));
4379 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4380 if (ret < 0) {
4381 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4382 return ret;
4383 }
4384 if (ret > 36)
4385 fsid_str[36] = 0;
4386 else
4387 fsid_str[ret] = 0;
4388 if (!uuid->parse(fsid_str)) {
4389 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4390 return -EINVAL;
4391 }
4392 return 0;
4393 }
4394
4395 int BlueStore::_write_fsid()
4396 {
4397 int r = ::ftruncate(fsid_fd, 0);
4398 if (r < 0) {
4399 r = -errno;
4400 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4401 return r;
4402 }
4403 string str = stringify(fsid) + "\n";
4404 r = safe_write(fsid_fd, str.c_str(), str.length());
4405 if (r < 0) {
4406 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4407 return r;
4408 }
4409 r = ::fsync(fsid_fd);
4410 if (r < 0) {
4411 r = -errno;
4412 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4413 return r;
4414 }
4415 return 0;
4416 }
4417
4418 void BlueStore::_close_fsid()
4419 {
4420 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4421 fsid_fd = -1;
4422 }
4423
4424 int BlueStore::_lock_fsid()
4425 {
4426 struct flock l;
4427 memset(&l, 0, sizeof(l));
4428 l.l_type = F_WRLCK;
4429 l.l_whence = SEEK_SET;
4430 int r = ::fcntl(fsid_fd, F_SETLK, &l);
4431 if (r < 0) {
4432 int err = errno;
4433 derr << __func__ << " failed to lock " << path << "/fsid"
4434 << " (is another ceph-osd still running?)"
4435 << cpp_strerror(err) << dendl;
4436 return -err;
4437 }
4438 return 0;
4439 }
4440
4441 bool BlueStore::is_rotational()
4442 {
4443 if (bdev) {
4444 return bdev->is_rotational();
4445 }
4446
4447 bool rotational = true;
4448 int r = _open_path();
4449 if (r < 0)
4450 goto out;
4451 r = _open_fsid(false);
4452 if (r < 0)
4453 goto out_path;
4454 r = _read_fsid(&fsid);
4455 if (r < 0)
4456 goto out_fsid;
4457 r = _lock_fsid();
4458 if (r < 0)
4459 goto out_fsid;
4460 r = _open_bdev(false);
4461 if (r < 0)
4462 goto out_fsid;
4463 rotational = bdev->is_rotational();
4464 _close_bdev();
4465 out_fsid:
4466 _close_fsid();
4467 out_path:
4468 _close_path();
4469 out:
4470 return rotational;
4471 }
4472
4473 bool BlueStore::is_journal_rotational()
4474 {
4475 if (!bluefs) {
4476 dout(5) << __func__ << " bluefs disabled, default to store media type"
4477 << dendl;
4478 return is_rotational();
4479 }
4480 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
4481 return bluefs->wal_is_rotational();
4482 }
4483
4484 bool BlueStore::test_mount_in_use()
4485 {
4486 // most error conditions mean the mount is not in use (e.g., because
4487 // it doesn't exist). only if we fail to lock do we conclude it is
4488 // in use.
4489 bool ret = false;
4490 int r = _open_path();
4491 if (r < 0)
4492 return false;
4493 r = _open_fsid(false);
4494 if (r < 0)
4495 goto out_path;
4496 r = _lock_fsid();
4497 if (r < 0)
4498 ret = true; // if we can't lock, it is in use
4499 _close_fsid();
4500 out_path:
4501 _close_path();
4502 return ret;
4503 }
4504
4505 int BlueStore::_open_db(bool create)
4506 {
4507 int r;
4508 assert(!db);
4509 string fn = path + "/db";
4510 string options;
4511 stringstream err;
4512 ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
4513
4514 string kv_backend;
4515 if (create) {
4516 kv_backend = cct->_conf->bluestore_kvbackend;
4517 } else {
4518 r = read_meta("kv_backend", &kv_backend);
4519 if (r < 0) {
4520 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
4521 return -EIO;
4522 }
4523 }
4524 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
4525
4526 bool do_bluefs;
4527 if (create) {
4528 do_bluefs = cct->_conf->bluestore_bluefs;
4529 } else {
4530 string s;
4531 r = read_meta("bluefs", &s);
4532 if (r < 0) {
4533 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
4534 return -EIO;
4535 }
4536 if (s == "1") {
4537 do_bluefs = true;
4538 } else if (s == "0") {
4539 do_bluefs = false;
4540 } else {
4541 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
4542 << dendl;
4543 return -EIO;
4544 }
4545 }
4546 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
4547
4548 rocksdb::Env *env = NULL;
4549 if (do_bluefs) {
4550 dout(10) << __func__ << " initializing bluefs" << dendl;
4551 if (kv_backend != "rocksdb") {
4552 derr << " backend must be rocksdb to use bluefs" << dendl;
4553 return -EINVAL;
4554 }
4555 bluefs = new BlueFS(cct);
4556
4557 string bfn;
4558 struct stat st;
4559
4560 bfn = path + "/block.db";
4561 if (::stat(bfn.c_str(), &st) == 0) {
4562 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
4563 if (r < 0) {
4564 derr << __func__ << " add block device(" << bfn << ") returned: "
4565 << cpp_strerror(r) << dendl;
4566 goto free_bluefs;
4567 }
4568
4569 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
4570 r = _check_or_set_bdev_label(
4571 bfn,
4572 bluefs->get_block_device_size(BlueFS::BDEV_DB),
4573 "bluefs db", create);
4574 if (r < 0) {
4575 derr << __func__
4576 << " check block device(" << bfn << ") label returned: "
4577 << cpp_strerror(r) << dendl;
4578 goto free_bluefs;
4579 }
4580 }
4581 if (create) {
4582 bluefs->add_block_extent(
4583 BlueFS::BDEV_DB,
4584 SUPER_RESERVED,
4585 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
4586 }
4587 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
4588 bluefs_single_shared_device = false;
4589 } else {
4590 r = -errno;
4591 if (::lstat(bfn.c_str(), &st) == -1) {
4592 r = 0;
4593 bluefs_shared_bdev = BlueFS::BDEV_DB;
4594 } else {
4595 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
4596 << cpp_strerror(r) << dendl;
4597 goto free_bluefs;
4598 }
4599 }
4600
4601 // shared device
4602 bfn = path + "/block";
4603 r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
4604 if (r < 0) {
4605 derr << __func__ << " add block device(" << bfn << ") returned: "
4606 << cpp_strerror(r) << dendl;
4607 goto free_bluefs;
4608 }
4609 if (create) {
4610 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4611 uint64_t initial =
4612 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
4613 cct->_conf->bluestore_bluefs_gift_ratio);
4614 initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
4615 if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
4616 derr << __func__ << " bluefs_alloc_size 0x" << std::hex
4617 << cct->_conf->bluefs_alloc_size << " is not a multiple of "
4618 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4619 r = -EINVAL;
4620 goto free_bluefs;
4621 }
4622 // align to bluefs's alloc_size
4623 initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
4624 // put bluefs in the middle of the device in case it is an HDD
4625 uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
4626 cct->_conf->bluefs_alloc_size);
4627 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
4628 bluefs_extents.insert(start, initial);
4629 }
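
// Sizing sketch (the config values are assumed defaults, not read from
// this file): on a 100 GiB shared device with
// bluestore_bluefs_min_ratio = 0.02, bluestore_bluefs_gift_ratio = 0.02,
// bluestore_bluefs_min = 1 GiB and bluefs_alloc_size = 1 MiB:
//   initial = 100 GiB * (0.02 + 0.02) = 4 GiB    (above the 1 GiB floor)
//   initial is already 1 MiB aligned, so P2ROUNDUP leaves it unchanged
//   start   = P2ALIGN((100 GiB - 4 GiB) / 2, 1 MiB) = 48 GiB
// giving bluefs a 4 GiB extent in the middle of the device, which keeps
// DB traffic near the midpoint of an HDD.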
4630
4631 bfn = path + "/block.wal";
4632 if (::stat(bfn.c_str(), &st) == 0) {
4633 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
4634 if (r < 0) {
4635 derr << __func__ << " add block device(" << bfn << ") returned: "
4636 << cpp_strerror(r) << dendl;
4637 goto free_bluefs;
4638 }
4639
4640 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
4641 r = _check_or_set_bdev_label(
4642 bfn,
4643 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
4644 "bluefs wal", create);
4645 if (r < 0) {
4646 derr << __func__ << " check block device(" << bfn
4647 << ") label returned: " << cpp_strerror(r) << dendl;
4648 goto free_bluefs;
4649 }
4650 }
4651
4652 if (create) {
4653 bluefs->add_block_extent(
4654 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
4655 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
4656 BDEV_LABEL_BLOCK_SIZE);
4657 }
4658 cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
4659 bluefs_single_shared_device = false;
4660 } else {
4661 r = -errno;
4662 if (::lstat(bfn.c_str(), &st) == -1) {
4663 r = 0;
4664 cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
4665 } else {
4666 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
4667 << cpp_strerror(r) << dendl;
4668 goto free_bluefs;
4669 }
4670 }
4671
4672 if (create) {
4673 bluefs->mkfs(fsid);
4674 }
4675 r = bluefs->mount();
4676 if (r < 0) {
4677 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
4678 goto free_bluefs;
4679 }
4680 if (cct->_conf->bluestore_bluefs_env_mirror) {
4681 rocksdb::Env *a = new BlueRocksEnv(bluefs);
4682 rocksdb::Env *b = rocksdb::Env::Default();
4683 if (create) {
4684 string cmd = "rm -rf " + path + "/db " +
4685 path + "/db.slow " +
4686 path + "/db.wal";
4687 int r = system(cmd.c_str());
4688 (void)r;
4689 }
4690 env = new rocksdb::EnvMirror(b, a, false, true);
4691 } else {
4692 env = new BlueRocksEnv(bluefs);
4693
4694 // simplify the dir names, too, as "seen" by rocksdb
4695 fn = "db";
4696 }
4697
4698 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
4699 // we have both block.db and block; tell rocksdb!
4700 // note: the second (last) size value doesn't really matter
4701 ostringstream db_paths;
4702 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
4703 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
4704 db_paths << fn << ","
4705 << (uint64_t)(db_size * 95 / 100) << " "
4706 << fn + ".slow" << ","
4707 << (uint64_t)(slow_size * 95 / 100);
4708 cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
4709 dout(10) << __func__ << " set rocksdb_db_paths to "
4710 << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
4711 }
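
// Example of the resulting option (sizes are illustrative): with a
// 10 GiB block.db and a 1 TiB main device this becomes roughly
//   rocksdb_db_paths = "db,10200547328 db.slow,1044536046387"
// i.e. 95% of each device's capacity, with "db" served by the fast
// device and "db.slow" spilling over onto the main (slow) device.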
4712
4713 if (create) {
4714 env->CreateDir(fn);
4715 if (cct->_conf->rocksdb_separate_wal_dir)
4716 env->CreateDir(fn + ".wal");
4717 if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
4718 env->CreateDir(fn + ".slow");
4719 }
4720 } else if (create) {
4721 int r = ::mkdir(fn.c_str(), 0755);
4722 if (r < 0)
4723 r = -errno;
4724 if (r < 0 && r != -EEXIST) {
4725 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
4726 << dendl;
4727 return r;
4728 }
4729
4730 // wal_dir, too!
4731 if (cct->_conf->rocksdb_separate_wal_dir) {
4732 string walfn = path + "/db.wal";
4733 r = ::mkdir(walfn.c_str(), 0755);
4734 if (r < 0)
4735 r = -errno;
4736 if (r < 0 && r != -EEXIST) {
4737 derr << __func__ << " failed to create " << walfn
4738 << ": " << cpp_strerror(r)
4739 << dendl;
4740 return r;
4741 }
4742 }
4743 }
4744
4745 db = KeyValueDB::create(cct,
4746 kv_backend,
4747 fn,
4748 static_cast<void*>(env));
4749 if (!db) {
4750 derr << __func__ << " error creating db" << dendl;
4751 if (bluefs) {
4752 bluefs->umount();
4753 delete bluefs;
4754 bluefs = NULL;
4755 }
4756 // delete env manually here since we can't depend on db to do this
4757 // under this case
4758 delete env;
4759 env = NULL;
4760 return -EIO;
4761 }
4762
4763 FreelistManager::setup_merge_operators(db);
4764 db->set_merge_operator(PREFIX_STAT, merge_op);
4765
4766 db->set_cache_size(cache_size * cache_kv_ratio);
4767
4768 if (kv_backend == "rocksdb")
4769 options = cct->_conf->bluestore_rocksdb_options;
4770 db->init(options);
4771 if (create)
4772 r = db->create_and_open(err);
4773 else
4774 r = db->open(err);
4775 if (r) {
4776 derr << __func__ << " error opening db: " << err.str() << dendl;
4777 if (bluefs) {
4778 bluefs->umount();
4779 delete bluefs;
4780 bluefs = NULL;
4781 }
4782 delete db;
4783 db = NULL;
4784 return -EIO;
4785 }
4786 dout(1) << __func__ << " opened " << kv_backend
4787 << " path " << fn << " options " << options << dendl;
4788 return 0;
4789
4790 free_bluefs:
4791 assert(bluefs);
4792 delete bluefs;
4793 bluefs = NULL;
4794 return r;
4795 }
4796
4797 void BlueStore::_close_db()
4798 {
4799 assert(db);
4800 delete db;
4801 db = NULL;
4802 if (bluefs) {
4803 bluefs->umount();
4804 delete bluefs;
4805 bluefs = NULL;
4806 }
4807 }
4808
4809 int BlueStore::_reconcile_bluefs_freespace()
4810 {
4811 dout(10) << __func__ << dendl;
4812 interval_set<uint64_t> bset;
4813 int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
4814 assert(r == 0);
4815 if (bset == bluefs_extents) {
4816 dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
4817 << std::dec << dendl;
4818 return 0;
4819 }
4820 dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
4821 << dendl;
4822 dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
4823 << std::dec << dendl;
4824
4825 interval_set<uint64_t> overlap;
4826 overlap.intersection_of(bset, bluefs_extents);
4827
4828 bset.subtract(overlap);
4829 if (!bset.empty()) {
4830 derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
4831 << dendl;
4832 return -EIO;
4833 }
4834
4835 interval_set<uint64_t> super_extra;
4836 super_extra = bluefs_extents;
4837 super_extra.subtract(overlap);
4838 if (!super_extra.empty()) {
4839 // This is normal: it can happen if we commit to give extents to
4840 // bluefs and we crash before bluefs commits that it owns them.
4841 dout(10) << __func__ << " super extra " << super_extra << dendl;
4842 for (interval_set<uint64_t>::iterator p = super_extra.begin();
4843 p != super_extra.end();
4844 ++p) {
4845 bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
4846 }
4847 }
4848
4849 return 0;
4850 }
4851
4852 int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
4853 {
4854 int ret = 0;
4855 assert(bluefs);
4856
4857 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
4858 bluefs->get_usage(&bluefs_usage);
4859 assert(bluefs_usage.size() > bluefs_shared_bdev);
4860
4861 // fixme: look at primary bdev only for now
4862 uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
4863 uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
4864 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
4865
4866 uint64_t my_free = alloc->get_free();
4867 uint64_t total = bdev->get_size();
4868 float my_free_ratio = (float)my_free / (float)total;
4869
4870 uint64_t total_free = bluefs_free + my_free;
4871
4872 float bluefs_ratio = (float)bluefs_free / (float)total_free;
4873
4874 dout(10) << __func__
4875 << " bluefs " << pretty_si_t(bluefs_free)
4876 << " free (" << bluefs_free_ratio
4877 << ") bluestore " << pretty_si_t(my_free)
4878 << " free (" << my_free_ratio
4879 << "), bluefs_ratio " << bluefs_ratio
4880 << dendl;
4881
4882 uint64_t gift = 0;
4883 uint64_t reclaim = 0;
4884 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
4885 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
4886 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4887 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
4888 << ", should gift " << pretty_si_t(gift) << dendl;
4889 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
4890 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
4891 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
4892 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
4893 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4894 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
4895 << ", should reclaim " << pretty_si_t(reclaim) << dendl;
4896 }
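
// Worked example for the gift/reclaim decision above (ratio values are
// assumed defaults): with bluestore_bluefs_min_ratio = 0.02,
// bluestore_bluefs_max_ratio = 0.90 and bluestore_bluefs_gift_ratio = 0.02,
// suppose bluefs_free = 1 GiB and my_free = 99 GiB, so total_free =
// 100 GiB and bluefs_ratio = 0.01.  Since 0.01 < 0.02 we plan to gift
// 0.02 * 100 GiB = 2 GiB to bluefs; conversely, if bluefs held nearly all
// of the free space (bluefs_ratio > 0.90) a reclaim of
// bluestore_bluefs_reclaim_ratio * total_free would be scheduled instead.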
4897
4898 // don't take over too much of the freespace
4899 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
4900 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
4901 cct->_conf->bluestore_bluefs_min < free_cap) {
4902 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
4903 dout(10) << __func__ << " bluefs_total " << bluefs_total
4904 << " < min " << cct->_conf->bluestore_bluefs_min
4905 << ", should gift " << pretty_si_t(g) << dendl;
4906 if (g > gift)
4907 gift = g;
4908 reclaim = 0;
4909 }
4910 uint64_t min_free = cct->_conf->get_val<uint64_t>("bluestore_bluefs_min_free");
4911 if (bluefs_free < min_free &&
4912 min_free < free_cap) {
4913 uint64_t g = min_free - bluefs_free;
4914 dout(10) << __func__ << " bluefs_free " << bluefs_free
4915 << " < min " << min_free
4916 << ", should gift " << pretty_si_t(g) << dendl;
4917 if (g > gift)
4918 gift = g;
4919 reclaim = 0;
4920 }
4921
4922 if (gift) {
4923 // round up to alloc size
4924 gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
4925
4926 // hard cap to fit into 32 bits
4927 gift = MIN(gift, 1ull<<31);
4928 dout(10) << __func__ << " gifting " << gift
4929 << " (" << pretty_si_t(gift) << ")" << dendl;
4930
4931 // fixme: just do one allocation to start...
4932 int r = alloc->reserve(gift);
4933 assert(r == 0);
4934
4935 AllocExtentVector exts;
4936 int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
4937 0, 0, &exts);
4938
4939 if (alloc_len <= 0) {
4940 dout(1) << __func__ << " no allocate on 0x" << std::hex << gift
4941 << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4942 alloc->unreserve(gift);
4943 alloc->dump();
4944 return 0;
4945 } else if (alloc_len < (int64_t)gift) {
4946 dout(1) << __func__ << " insufficient allocate on 0x" << std::hex << gift
4947 << " min_alloc_size 0x" << min_alloc_size
4948 << " allocated 0x" << alloc_len
4949 << std::dec << dendl;
4950 alloc->unreserve(gift - alloc_len);
4951 alloc->dump();
4952 }
4953 for (auto& p : exts) {
4954 bluestore_pextent_t e = bluestore_pextent_t(p);
4955 dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
4956 extents->push_back(e);
4957 }
4958 gift = 0;
4959
4960 ret = 1;
4961 }
4962
4963 // reclaim from bluefs?
4964 if (reclaim) {
4965 // round up to alloc size
4966 reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);
4967
4968 // hard cap to fit into 32 bits
4969 reclaim = MIN(reclaim, 1ull<<31);
4970 dout(10) << __func__ << " reclaiming " << reclaim
4971 << " (" << pretty_si_t(reclaim) << ")" << dendl;
4972
4973 while (reclaim > 0) {
4974 // NOTE: this will block and do IO.
4975 AllocExtentVector extents;
4976 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
4977 &extents);
4978 if (r < 0) {
4979 derr << __func__ << " failed to reclaim space from bluefs"
4980 << dendl;
4981 break;
4982 }
4983 for (auto e : extents) {
4984 bluefs_extents.erase(e.offset, e.length);
4985 bluefs_extents_reclaiming.insert(e.offset, e.length);
4986 reclaim -= e.length;
4987 }
4988 }
4989
4990 ret = 1;
4991 }
4992
4993 return ret;
4994 }
4995
4996 void BlueStore::_commit_bluefs_freespace(
4997 const PExtentVector& bluefs_gift_extents)
4998 {
4999 dout(10) << __func__ << dendl;
5000 for (auto& p : bluefs_gift_extents) {
5001 bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
5002 }
5003 }
5004
5005 int BlueStore::_open_collections(int *errors)
5006 {
5007 dout(10) << __func__ << dendl;
5008 assert(coll_map.empty());
5009 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
5010 for (it->upper_bound(string());
5011 it->valid();
5012 it->next()) {
5013 coll_t cid;
5014 if (cid.parse(it->key())) {
5015 CollectionRef c(
5016 new Collection(
5017 this,
5018 cache_shards[cid.hash_to_shard(cache_shards.size())],
5019 cid));
5020 bufferlist bl = it->value();
5021 bufferlist::iterator p = bl.begin();
5022 try {
5023 ::decode(c->cnode, p);
5024 } catch (buffer::error& e) {
5025 derr << __func__ << " failed to decode cnode, key:"
5026 << pretty_binary_string(it->key()) << dendl;
5027 return -EIO;
5028 }
5029 dout(20) << __func__ << " opened " << cid << " " << c
5030 << " " << c->cnode << dendl;
5031 coll_map[cid] = c;
5032 } else {
5033 derr << __func__ << " unrecognized collection " << it->key() << dendl;
5034 if (errors)
5035 (*errors)++;
5036 }
5037 }
5038 return 0;
5039 }
5040
5041 void BlueStore::_open_statfs()
5042 {
5043 bufferlist bl;
5044 int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
5045 if (r >= 0) {
5046 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
5047 auto it = bl.begin();
5048 vstatfs.decode(it);
5049 } else {
5050 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
5051 }
5052 }
5053 else {
5054 dout(10) << __func__ << " store_statfs missing, using empty" << dendl;
5055 }
5056 }
5057
5058 int BlueStore::_setup_block_symlink_or_file(
5059 string name,
5060 string epath,
5061 uint64_t size,
5062 bool create)
5063 {
5064 dout(20) << __func__ << " name " << name << " path " << epath
5065 << " size " << size << " create=" << (int)create << dendl;
5066 int r = 0;
5067 int flags = O_RDWR;
5068 if (create)
5069 flags |= O_CREAT;
5070 if (epath.length()) {
5071 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
5072 if (r < 0) {
5073 r = -errno;
5074 derr << __func__ << " failed to create " << name << " symlink to "
5075 << epath << ": " << cpp_strerror(r) << dendl;
5076 return r;
5077 }
5078
5079 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
5080 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
5081 if (fd < 0) {
5082 r = -errno;
5083 derr << __func__ << " failed to open " << epath << " file: "
5084 << cpp_strerror(r) << dendl;
5085 return r;
5086 }
5087 string serial_number = epath.substr(strlen(SPDK_PREFIX));
5088 r = ::write(fd, serial_number.c_str(), serial_number.size());
5089 assert(r == (int)serial_number.size());
5090 dout(1) << __func__ << " created " << name << " symlink to "
5091 << epath << dendl;
5092 VOID_TEMP_FAILURE_RETRY(::close(fd));
5093 }
5094 }
5095 if (size) {
5096 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
5097 if (fd >= 0) {
5098 // block file is present
5099 struct stat st;
5100 int r = ::fstat(fd, &st);
5101 if (r == 0 &&
5102 S_ISREG(st.st_mode) && // if it is a regular file
5103 st.st_size == 0) { // and is 0 bytes
5104 r = ::ftruncate(fd, size);
5105 if (r < 0) {
5106 r = -errno;
5107 derr << __func__ << " failed to resize " << name << " file to "
5108 << size << ": " << cpp_strerror(r) << dendl;
5109 VOID_TEMP_FAILURE_RETRY(::close(fd));
5110 return r;
5111 }
5112
5113 if (cct->_conf->bluestore_block_preallocate_file) {
5114 r = ::ceph_posix_fallocate(fd, 0, size);
5115 if (r > 0) {
5116 derr << __func__ << " failed to preallocate " << name << " file to "
5117 << size << ": " << cpp_strerror(r) << dendl;
5118 VOID_TEMP_FAILURE_RETRY(::close(fd));
5119 return -r;
5120 }
5121 }
5122 dout(1) << __func__ << " resized " << name << " file to "
5123 << pretty_si_t(size) << "B" << dendl;
5124 }
5125 VOID_TEMP_FAILURE_RETRY(::close(fd));
5126 } else {
5127 int r = -errno;
5128 if (r != -ENOENT) {
5129 derr << __func__ << " failed to open " << name << " file: "
5130 << cpp_strerror(r) << dendl;
5131 return r;
5132 }
5133 }
5134 }
5135 return 0;
5136 }
5137
5138 int BlueStore::mkfs()
5139 {
5140 dout(1) << __func__ << " path " << path << dendl;
5141 int r;
5142 uuid_d old_fsid;
5143
5144 {
5145 string done;
5146 r = read_meta("mkfs_done", &done);
5147 if (r == 0) {
5148 dout(1) << __func__ << " already created" << dendl;
5149 if (cct->_conf->bluestore_fsck_on_mkfs) {
5150 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5151 if (r < 0) {
5152 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
5153 << dendl;
5154 return r;
5155 }
5156 if (r > 0) {
5157 derr << __func__ << " fsck found " << r << " errors" << dendl;
5158 r = -EIO;
5159 }
5160 }
5161 return r; // idempotent
5162 }
5163 }
5164
5165 {
5166 string type;
5167 r = read_meta("type", &type);
5168 if (r == 0) {
5169 if (type != "bluestore") {
5170 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5171 return -EIO;
5172 }
5173 } else {
5174 r = write_meta("type", "bluestore");
5175 if (r < 0)
5176 return r;
5177 }
5178 }
5179
5180 freelist_type = "bitmap";
5181
5182 r = _open_path();
5183 if (r < 0)
5184 return r;
5185
5186 r = _open_fsid(true);
5187 if (r < 0)
5188 goto out_path_fd;
5189
5190 r = _lock_fsid();
5191 if (r < 0)
5192 goto out_close_fsid;
5193
5194 r = _read_fsid(&old_fsid);
5195 if (r < 0 || old_fsid.is_zero()) {
5196 if (fsid.is_zero()) {
5197 fsid.generate_random();
5198 dout(1) << __func__ << " generated fsid " << fsid << dendl;
5199 } else {
5200 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
5201 }
5202 // we'll write it later.
5203 } else {
5204 if (!fsid.is_zero() && fsid != old_fsid) {
5205 derr << __func__ << " on-disk fsid " << old_fsid
5206 << " != provided " << fsid << dendl;
5207 r = -EINVAL;
5208 goto out_close_fsid;
5209 }
5210 fsid = old_fsid;
5211 }
5212
5213 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
5214 cct->_conf->bluestore_block_size,
5215 cct->_conf->bluestore_block_create);
5216 if (r < 0)
5217 goto out_close_fsid;
5218 if (cct->_conf->bluestore_bluefs) {
5219 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
5220 cct->_conf->bluestore_block_wal_size,
5221 cct->_conf->bluestore_block_wal_create);
5222 if (r < 0)
5223 goto out_close_fsid;
5224 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
5225 cct->_conf->bluestore_block_db_size,
5226 cct->_conf->bluestore_block_db_create);
5227 if (r < 0)
5228 goto out_close_fsid;
5229 }
5230
5231 r = _open_bdev(true);
5232 if (r < 0)
5233 goto out_close_fsid;
5234
5235 // choose min_alloc_size
5236 if (cct->_conf->bluestore_min_alloc_size) {
5237 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
5238 } else {
5239 assert(bdev);
5240 if (bdev->is_rotational()) {
5241 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
5242 } else {
5243 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
5244 }
5245 }
5246
5247 // make sure min_alloc_size is power of 2 aligned.
5248 if (!ISP2(min_alloc_size)) {
5249 derr << __func__ << " min_alloc_size 0x"
5250 << std::hex << min_alloc_size << std::dec
5251 << " is not power of 2 aligned!"
5252 << dendl;
5253 r = -EINVAL;
5254 goto out_close_bdev;
5255 }
5256
5257 r = _open_db(true);
5258 if (r < 0)
5259 goto out_close_bdev;
5260
5261 r = _open_fm(true);
5262 if (r < 0)
5263 goto out_close_db;
5264
5265 {
5266 KeyValueDB::Transaction t = db->get_transaction();
5267 {
5268 bufferlist bl;
5269 ::encode((uint64_t)0, bl);
5270 t->set(PREFIX_SUPER, "nid_max", bl);
5271 t->set(PREFIX_SUPER, "blobid_max", bl);
5272 }
5273
5274 {
5275 bufferlist bl;
5276 ::encode((uint64_t)min_alloc_size, bl);
5277 t->set(PREFIX_SUPER, "min_alloc_size", bl);
5278 }
5279
5280 ondisk_format = latest_ondisk_format;
5281 _prepare_ondisk_format_super(t);
5282 db->submit_transaction_sync(t);
5283 }
5284
5285
5286 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
5287 if (r < 0)
5288 goto out_close_fm;
5289
5290 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
5291 if (r < 0)
5292 goto out_close_fm;
5293
5294 if (fsid != old_fsid) {
5295 r = _write_fsid();
5296 if (r < 0) {
5297 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
5298 goto out_close_fm;
5299 }
5300 }
5301
5302 out_close_fm:
5303 _close_fm();
5304 out_close_db:
5305 _close_db();
5306 out_close_bdev:
5307 _close_bdev();
5308 out_close_fsid:
5309 _close_fsid();
5310 out_path_fd:
5311 _close_path();
5312
5313 if (r == 0 &&
5314 cct->_conf->bluestore_fsck_on_mkfs) {
5315 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5316 if (rc < 0)
5317 return rc;
5318 if (rc > 0) {
5319 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5320 r = -EIO;
5321 }
5322 }
5323
5324 if (r == 0) {
5325 // indicate success by writing the 'mkfs_done' file
5326 r = write_meta("mkfs_done", "yes");
5327 }
5328
5329 if (r < 0) {
5330 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
5331 } else {
5332 dout(0) << __func__ << " success" << dendl;
5333 }
5334 return r;
5335 }
5336
5337 void BlueStore::set_cache_shards(unsigned num)
5338 {
5339 dout(10) << __func__ << " " << num << dendl;
5340 size_t old = cache_shards.size();
5341 assert(num >= old);
5342 cache_shards.resize(num);
5343 for (unsigned i = old; i < num; ++i) {
5344 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
5345 logger);
5346 }
5347 }
5348
5349 int BlueStore::_mount(bool kv_only)
5350 {
5351 dout(1) << __func__ << " path " << path << dendl;
5352
5353 _kv_only = kv_only;
5354
5355 {
5356 string type;
5357 int r = read_meta("type", &type);
5358 if (r < 0) {
5359 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5360 << dendl;
5361 return r;
5362 }
5363
5364 if (type != "bluestore") {
5365 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5366 return -EIO;
5367 }
5368 }
5369
5370 if (cct->_conf->bluestore_fsck_on_mount) {
5371 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
5372 if (rc < 0)
5373 return rc;
5374 if (rc > 0) {
5375 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5376 return -EIO;
5377 }
5378 }
5379
5380 int r = _open_path();
5381 if (r < 0)
5382 return r;
5383 r = _open_fsid(false);
5384 if (r < 0)
5385 goto out_path;
5386
5387 r = _read_fsid(&fsid);
5388 if (r < 0)
5389 goto out_fsid;
5390
5391 r = _lock_fsid();
5392 if (r < 0)
5393 goto out_fsid;
5394
5395 r = _open_bdev(false);
5396 if (r < 0)
5397 goto out_fsid;
5398
5399 r = _open_db(false);
5400 if (r < 0)
5401 goto out_bdev;
5402
5403 if (kv_only)
5404 return 0;
5405
5406 r = _open_super_meta();
5407 if (r < 0)
5408 goto out_db;
5409
5410 r = _open_fm(false);
5411 if (r < 0)
5412 goto out_db;
5413
5414 r = _open_alloc();
5415 if (r < 0)
5416 goto out_fm;
5417
5418 r = _open_collections();
5419 if (r < 0)
5420 goto out_alloc;
5421
5422 r = _reload_logger();
5423 if (r < 0)
5424 goto out_coll;
5425
5426 if (bluefs) {
5427 r = _reconcile_bluefs_freespace();
5428 if (r < 0)
5429 goto out_coll;
5430 }
5431
5432 _kv_start();
5433
5434 r = _deferred_replay();
5435 if (r < 0)
5436 goto out_stop;
5437
5438 mempool_thread.init();
5439
5440 mounted = true;
5441 return 0;
5442
5443 out_stop:
5444 _kv_stop();
5445 out_coll:
5446 _flush_cache();
5447 out_alloc:
5448 _close_alloc();
5449 out_fm:
5450 _close_fm();
5451 out_db:
5452 _close_db();
5453 out_bdev:
5454 _close_bdev();
5455 out_fsid:
5456 _close_fsid();
5457 out_path:
5458 _close_path();
5459 return r;
5460 }
5461
5462 int BlueStore::umount()
5463 {
5464 assert(_kv_only || mounted);
5465 dout(1) << __func__ << dendl;
5466
5467 _osr_drain_all();
5468 _osr_unregister_all();
5469
5470 mounted = false;
5471 if (!_kv_only) {
5472 mempool_thread.shutdown();
5473 dout(20) << __func__ << " stopping kv thread" << dendl;
5474 _kv_stop();
5475 _flush_cache();
5476 dout(20) << __func__ << " closing" << dendl;
5477
5478 _close_alloc();
5479 _close_fm();
5480 }
5481 _close_db();
5482 _close_bdev();
5483 _close_fsid();
5484 _close_path();
5485
5486 if (cct->_conf->bluestore_fsck_on_umount) {
5487 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
5488 if (rc < 0)
5489 return rc;
5490 if (rc > 0) {
5491 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5492 return -EIO;
5493 }
5494 }
5495 return 0;
5496 }
5497
5498 static void apply(uint64_t off,
5499 uint64_t len,
5500 uint64_t granularity,
5501 BlueStore::mempool_dynamic_bitset &bitset,
5502 std::function<void(uint64_t,
5503 BlueStore::mempool_dynamic_bitset &)> f) {
5504 auto end = ROUND_UP_TO(off + len, granularity);
5505 while (off < end) {
5506 uint64_t pos = off / granularity;
5507 f(pos, bitset);
5508 off += granularity;
5509 }
5510 }
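
// Worked example for apply(): with off = 0x3000, len = 0x2000 and
// granularity = 0x1000, end rounds up to 0x5000 and f() is invoked for
// bitset positions 3 and 4 -- one call per allocation unit that the
// byte range [off, off + len) touches.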
5511
5512 int BlueStore::_fsck_check_extents(
5513 const ghobject_t& oid,
5514 const PExtentVector& extents,
5515 bool compressed,
5516 mempool_dynamic_bitset &used_blocks,
5517 uint64_t granularity,
5518 store_statfs_t& expected_statfs)
5519 {
5520 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
5521 int errors = 0;
5522 for (auto e : extents) {
5523 if (!e.is_valid())
5524 continue;
5525 expected_statfs.allocated += e.length;
5526 if (compressed) {
5527 expected_statfs.compressed_allocated += e.length;
5528 }
5529 bool already = false;
5530 apply(
5531 e.offset, e.length, granularity, used_blocks,
5532 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5533 assert(pos < bs.size());
5534 if (bs.test(pos))
5535 already = true;
5536 else
5537 bs.set(pos);
5538 });
5539 if (already) {
5540 derr << " " << oid << " extent " << e
5541 << " or a subset is already allocated" << dendl;
5542 ++errors;
5543 }
5544 if (e.end() > bdev->get_size()) {
5545 derr << " " << oid << " extent " << e
5546 << " past end of block device" << dendl;
5547 ++errors;
5548 }
5549 }
5550 return errors;
5551 }
5552
5553 int BlueStore::_fsck(bool deep, bool repair)
5554 {
5555 dout(1) << __func__
5556 << (repair ? " repair" : " fsck")
5557 << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
5558 int errors = 0;
5559 int repaired = 0;
5560
5561 typedef btree::btree_set<
5562 uint64_t,std::less<uint64_t>,
5563 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
5564 uint64_t_btree_t used_nids;
5565 uint64_t_btree_t used_omap_head;
5566 uint64_t_btree_t used_sbids;
5567
5568 mempool_dynamic_bitset used_blocks;
5569 KeyValueDB::Iterator it;
5570 store_statfs_t expected_statfs, actual_statfs;
5571 struct sb_info_t {
5572 list<ghobject_t> oids;
5573 SharedBlobRef sb;
5574 bluestore_extent_ref_map_t ref_map;
5575 bool compressed;
5576 };
5577 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
5578
5579 uint64_t num_objects = 0;
5580 uint64_t num_extents = 0;
5581 uint64_t num_blobs = 0;
5582 uint64_t num_spanning_blobs = 0;
5583 uint64_t num_shared_blobs = 0;
5584 uint64_t num_sharded_objects = 0;
5585 uint64_t num_object_shards = 0;
5586
5587 utime_t start = ceph_clock_now();
5588
5589 int r = _open_path();
5590 if (r < 0)
5591 return r;
5592 r = _open_fsid(false);
5593 if (r < 0)
5594 goto out_path;
5595
5596 r = _read_fsid(&fsid);
5597 if (r < 0)
5598 goto out_fsid;
5599
5600 r = _lock_fsid();
5601 if (r < 0)
5602 goto out_fsid;
5603
5604 r = _open_bdev(false);
5605 if (r < 0)
5606 goto out_fsid;
5607
5608 r = _open_db(false);
5609 if (r < 0)
5610 goto out_bdev;
5611
5612 r = _open_super_meta();
5613 if (r < 0)
5614 goto out_db;
5615
5616 r = _open_fm(false);
5617 if (r < 0)
5618 goto out_db;
5619
5620 r = _open_alloc();
5621 if (r < 0)
5622 goto out_fm;
5623
5624 r = _open_collections(&errors);
5625 if (r < 0)
5626 goto out_alloc;
5627
5628 mempool_thread.init();
5629
5630 // we need finishers and kv_{sync,finalize}_thread *just* for replay
5631 _kv_start();
5632 r = _deferred_replay();
5633 _kv_stop();
5634 if (r < 0)
5635 goto out_scan;
5636
5637 used_blocks.resize(fm->get_alloc_units());
5638 apply(
5639 0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
5640 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5641 assert(pos < bs.size());
5642 bs.set(pos);
5643 }
5644 );
5645
5646 if (bluefs) {
5647 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5648 apply(
5649 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
5650 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5651 assert(pos < bs.size());
5652 bs.set(pos);
5653 }
5654 );
5655 }
5656 r = bluefs->fsck();
5657 if (r < 0) {
5658 goto out_scan;
5659 }
5660 if (r > 0)
5661 errors += r;
5662 }
5663
5664 // get expected statfs; fill unaffected fields to be able to compare
5665 // structs
5666 statfs(&actual_statfs);
5667 expected_statfs.total = actual_statfs.total;
5668 expected_statfs.available = actual_statfs.available;
5669
5670 // walk PREFIX_OBJ
5671 dout(1) << __func__ << " walking object keyspace" << dendl;
5672 it = db->get_iterator(PREFIX_OBJ);
5673 if (it) {
5674 CollectionRef c;
5675 spg_t pgid;
5676 mempool::bluestore_fsck::list<string> expecting_shards;
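// extent shard keys sort immediately after their onode key; expecting_shards
// is filled while processing each onode and drained as the matching shard
// keys are encountered, so anything left over (or never expected) is an error.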
5677 for (it->lower_bound(string()); it->valid(); it->next()) {
5678 if (g_conf->bluestore_debug_fsck_abort) {
5679 goto out_scan;
5680 }
5681 dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
5682 if (is_extent_shard_key(it->key())) {
5683 while (!expecting_shards.empty() &&
5684 expecting_shards.front() < it->key()) {
5685 derr << "fsck error: missing shard key "
5686 << pretty_binary_string(expecting_shards.front())
5687 << dendl;
5688 ++errors;
5689 expecting_shards.pop_front();
5690 }
5691 if (!expecting_shards.empty() &&
5692 expecting_shards.front() == it->key()) {
5693 // all good
5694 expecting_shards.pop_front();
5695 continue;
5696 }
5697
5698 uint32_t offset;
5699 string okey;
5700 get_key_extent_shard(it->key(), &okey, &offset);
5701 derr << "fsck error: stray shard 0x" << std::hex << offset
5702 << std::dec << dendl;
5703 if (expecting_shards.empty()) {
5704 derr << "fsck error: " << pretty_binary_string(it->key())
5705 << " is unexpected" << dendl;
5706 ++errors;
5707 continue;
5708 }
5709 while (expecting_shards.front() > it->key()) {
5710 derr << "fsck error: saw " << pretty_binary_string(it->key())
5711 << dendl;
5712 derr << "fsck error: exp "
5713 << pretty_binary_string(expecting_shards.front()) << dendl;
5714 ++errors;
5715 expecting_shards.pop_front();
5716 if (expecting_shards.empty()) {
5717 break;
5718 }
5719 }
5720 continue;
5721 }
5722
5723 ghobject_t oid;
5724 int r = get_key_object(it->key(), &oid);
5725 if (r < 0) {
5726 derr << "fsck error: bad object key "
5727 << pretty_binary_string(it->key()) << dendl;
5728 ++errors;
5729 continue;
5730 }
5731 if (!c ||
5732 oid.shard_id != pgid.shard ||
5733 oid.hobj.pool != (int64_t)pgid.pool() ||
5734 !c->contains(oid)) {
5735 c = nullptr;
5736 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
5737 coll_map.begin();
5738 p != coll_map.end();
5739 ++p) {
5740 if (p->second->contains(oid)) {
5741 c = p->second;
5742 break;
5743 }
5744 }
5745 if (!c) {
5746 derr << "fsck error: stray object " << oid
5747 << " not owned by any collection" << dendl;
5748 ++errors;
5749 continue;
5750 }
5751 c->cid.is_pg(&pgid);
5752 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
5753 << dendl;
5754 }
5755
5756 if (!expecting_shards.empty()) {
5757 for (auto &k : expecting_shards) {
5758 derr << "fsck error: missing shard key "
5759 << pretty_binary_string(k) << dendl;
5760 }
5761 ++errors;
5762 expecting_shards.clear();
5763 }
5764
5765 dout(10) << __func__ << " " << oid << dendl;
5766 RWLock::RLocker l(c->lock);
5767 OnodeRef o = c->get_onode(oid, false);
5768 if (o->onode.nid) {
5769 if (o->onode.nid > nid_max) {
5770 derr << "fsck error: " << oid << " nid " << o->onode.nid
5771 << " > nid_max " << nid_max << dendl;
5772 ++errors;
5773 }
5774 if (used_nids.count(o->onode.nid)) {
5775 derr << "fsck error: " << oid << " nid " << o->onode.nid
5776 << " already in use" << dendl;
5777 ++errors;
5778 continue; // go for next object
5779 }
5780 used_nids.insert(o->onode.nid);
5781 }
5782 ++num_objects;
5783 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
5784 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
5785 _dump_onode(o, 30);
5786 // shards
5787 if (!o->extent_map.shards.empty()) {
5788 ++num_sharded_objects;
5789 num_object_shards += o->extent_map.shards.size();
5790 }
5791 for (auto& s : o->extent_map.shards) {
5792 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
5793 expecting_shards.push_back(string());
5794 get_extent_shard_key(o->key, s.shard_info->offset,
5795 &expecting_shards.back());
5796 if (s.shard_info->offset >= o->onode.size) {
5797 derr << "fsck error: " << oid << " shard 0x" << std::hex
5798 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
5799 << std::dec << dendl;
5800 ++errors;
5801 }
5802 }
5803 // lextents
5804 map<BlobRef,bluestore_blob_t::unused_t> referenced;
5805 uint64_t pos = 0;
5806 mempool::bluestore_fsck::map<BlobRef,
5807 bluestore_blob_use_tracker_t> ref_map;
5808 for (auto& l : o->extent_map.extent_map) {
5809 dout(20) << __func__ << " " << l << dendl;
5810 if (l.logical_offset < pos) {
5811 derr << "fsck error: " << oid << " lextent at 0x"
5812 << std::hex << l.logical_offset
5813 << " overlaps with the previous, which ends at 0x" << pos
5814 << std::dec << dendl;
5815 ++errors;
5816 }
5817 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
5818 derr << "fsck error: " << oid << " lextent at 0x"
5819 << std::hex << l.logical_offset << "~" << l.length
5820 << " spans a shard boundary"
5821 << std::dec << dendl;
5822 ++errors;
5823 }
5824 pos = l.logical_offset + l.length;
5825 expected_statfs.stored += l.length;
5826 assert(l.blob);
5827 const bluestore_blob_t& blob = l.blob->get_blob();
5828
5829 auto& ref = ref_map[l.blob];
5830 if (ref.is_empty()) {
5831 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
5832 uint32_t l = blob.get_logical_length();
5833 ref.init(l, min_release_size);
5834 }
5835 ref.get(
5836 l.blob_offset,
5837 l.length);
5838 ++num_extents;
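// when the blob tracks unused space, remember which chunks of its 'unused'
// bitmap this lextent actually references; the union is checked against
// blob.unused below.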
5839 if (blob.has_unused()) {
5840 auto p = referenced.find(l.blob);
5841 bluestore_blob_t::unused_t *pu;
5842 if (p == referenced.end()) {
5843 pu = &referenced[l.blob];
5844 } else {
5845 pu = &p->second;
5846 }
5847 uint64_t blob_len = blob.get_logical_length();
5848 assert((blob_len % (sizeof(*pu)*8)) == 0);
5849 assert(l.blob_offset + l.length <= blob_len);
5850 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
5851 uint64_t start = l.blob_offset / chunk_size;
5852 uint64_t end =
5853 ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
5854 for (auto i = start; i < end; ++i) {
5855 (*pu) |= (1u << i);
5856 }
5857 }
5858 }
5859 for (auto &i : referenced) {
5860 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
5861 << std::dec << " for " << *i.first << dendl;
5862 const bluestore_blob_t& blob = i.first->get_blob();
5863 if (i.second & blob.unused) {
5864 derr << "fsck error: " << oid << " blob claims unused 0x"
5865 << std::hex << blob.unused
5866 << " but extents reference 0x" << i.second
5867 << " on blob " << *i.first << dendl;
5868 ++errors;
5869 }
5870 if (blob.has_csum()) {
5871 uint64_t blob_len = blob.get_logical_length();
5872 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
5873 unsigned csum_count = blob.get_csum_count();
5874 unsigned csum_chunk_size = blob.get_csum_chunk_size();
5875 for (unsigned p = 0; p < csum_count; ++p) {
5876 unsigned pos = p * csum_chunk_size;
5877 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
5878 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
5879 unsigned mask = 1u << firstbit;
5880 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
5881 mask |= 1u << b;
5882 }
5883 if ((blob.unused & mask) == mask) {
5884 // this csum chunk region is marked unused
5885 if (blob.get_csum_item(p) != 0) {
5886 derr << "fsck error: " << oid
5887 << " blob claims csum chunk 0x" << std::hex << pos
5888 << "~" << csum_chunk_size
5889 << " is unused (mask 0x" << mask << " of unused 0x"
5890 << blob.unused << ") but csum is non-zero 0x"
5891 << blob.get_csum_item(p) << std::dec << " on blob "
5892 << *i.first << dendl;
5893 ++errors;
5894 }
5895 }
5896 }
5897 }
5898 }
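// compare the use tracker rebuilt from the lextents against the tracker
// stored in each blob, and accumulate the expected statfs and shared-blob
// references.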
5899 for (auto &i : ref_map) {
5900 ++num_blobs;
5901 const bluestore_blob_t& blob = i.first->get_blob();
5902 bool equal = i.first->get_blob_use_tracker().equal(i.second);
5903 if (!equal) {
5904 derr << "fsck error: " << oid << " blob " << *i.first
5905 << " doesn't match expected ref_map " << i.second << dendl;
5906 ++errors;
5907 }
5908 if (blob.is_compressed()) {
5909 expected_statfs.compressed += blob.get_compressed_payload_length();
5910 expected_statfs.compressed_original +=
5911 i.first->get_referenced_bytes();
5912 }
5913 if (blob.is_shared()) {
5914 if (i.first->shared_blob->get_sbid() > blobid_max) {
5915 derr << "fsck error: " << oid << " blob " << blob
5916 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
5917 << blobid_max << dendl;
5918 ++errors;
5919 } else if (i.first->shared_blob->get_sbid() == 0) {
5920 derr << "fsck error: " << oid << " blob " << blob
5921 << " marked as shared but has uninitialized sbid"
5922 << dendl;
5923 ++errors;
5924 }
5925 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
5926 sbi.sb = i.first->shared_blob;
5927 sbi.oids.push_back(oid);
5928 sbi.compressed = blob.is_compressed();
5929 for (auto e : blob.get_extents()) {
5930 if (e.is_valid()) {
5931 sbi.ref_map.get(e.offset, e.length);
5932 }
5933 }
5934 } else {
5935 errors += _fsck_check_extents(oid, blob.get_extents(),
5936 blob.is_compressed(),
5937 used_blocks,
5938 fm->get_alloc_size(),
5939 expected_statfs);
5940 }
5941 }
5942 if (deep) {
5943 bufferlist bl;
5944 int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
5945 if (r < 0) {
5946 ++errors;
5947 derr << "fsck error: " << oid << " error during read: "
5948 << cpp_strerror(r) << dendl;
5949 }
5950 }
5951 // omap
5952 if (o->onode.has_omap()) {
5953 if (used_omap_head.count(o->onode.nid)) {
5954 derr << "fsck error: " << oid << " omap_head " << o->onode.nid
5955 << " already in use" << dendl;
5956 ++errors;
5957 } else {
5958 used_omap_head.insert(o->onode.nid);
5959 }
5960 }
5961 }
5962 }
5963 dout(1) << __func__ << " checking shared_blobs" << dendl;
5964 it = db->get_iterator(PREFIX_SHARED_BLOB);
5965 if (it) {
5966 for (it->lower_bound(string()); it->valid(); it->next()) {
5967 string key = it->key();
5968 uint64_t sbid;
5969 if (get_key_shared_blob(key, &sbid)) {
5970 derr << "fsck error: bad key '" << key
5971 << "' in shared blob namespace" << dendl;
5972 ++errors;
5973 continue;
5974 }
5975 auto p = sb_info.find(sbid);
5976 if (p == sb_info.end()) {
5977 derr << "fsck error: found stray shared blob data for sbid 0x"
5978 << std::hex << sbid << std::dec << dendl;
5979 ++errors;
5980 } else {
5981 ++num_shared_blobs;
5982 sb_info_t& sbi = p->second;
5983 bluestore_shared_blob_t shared_blob(sbid);
5984 bufferlist bl = it->value();
5985 bufferlist::iterator blp = bl.begin();
5986 ::decode(shared_blob, blp);
5987 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
5988 if (shared_blob.ref_map != sbi.ref_map) {
5989 derr << "fsck error: shared blob 0x" << std::hex << sbid
5990 << std::dec << " ref_map " << shared_blob.ref_map
5991 << " != expected " << sbi.ref_map << dendl;
5992 ++errors;
5993 }
5994 PExtentVector extents;
5995 for (auto &r : shared_blob.ref_map.ref_map) {
5996 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
5997 }
5998 errors += _fsck_check_extents(p->second.oids.front(),
5999 extents,
6000 p->second.compressed,
6001 used_blocks,
6002 fm->get_alloc_size(),
6003 expected_statfs);
6004 sb_info.erase(p);
6005 }
6006 }
6007 }
6008 for (auto &p : sb_info) {
6009 derr << "fsck error: shared_blob 0x" << p.first
6010 << " key is missing (" << *p.second.sb << ")" << dendl;
6011 ++errors;
6012 }
6013 if (!(actual_statfs == expected_statfs)) {
6014 derr << "fsck error: actual " << actual_statfs
6015 << " != expected " << expected_statfs << dendl;
6016 ++errors;
6017 }
6018
6019 dout(1) << __func__ << " checking for stray omap data" << dendl;
6020 it = db->get_iterator(PREFIX_OMAP);
6021 if (it) {
6022 for (it->lower_bound(string()); it->valid(); it->next()) {
6023 uint64_t omap_head;
6024 _key_decode_u64(it->key().c_str(), &omap_head);
6025 if (used_omap_head.count(omap_head) == 0) {
6026 derr << "fsck error: found stray omap data on omap_head "
6027 << omap_head << dendl;
6028 ++errors;
6029 }
6030 }
6031 }
6032
6033 dout(1) << __func__ << " checking deferred events" << dendl;
6034 it = db->get_iterator(PREFIX_DEFERRED);
6035 if (it) {
6036 for (it->lower_bound(string()); it->valid(); it->next()) {
6037 bufferlist bl = it->value();
6038 bufferlist::iterator p = bl.begin();
6039 bluestore_deferred_transaction_t wt;
6040 try {
6041 ::decode(wt, p);
6042 } catch (buffer::error& e) {
6043 derr << "fsck error: failed to decode deferred txn "
6044 << pretty_binary_string(it->key()) << dendl;
6045 r = -EIO;
6046 goto out_scan;
6047 }
6048 dout(20) << __func__ << " deferred " << wt.seq
6049 << " ops " << wt.ops.size()
6050 << " released 0x" << std::hex << wt.released << std::dec << dendl;
6051 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
6052 apply(
6053 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
6054 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6055 assert(pos < bs.size());
6056 bs.set(pos);
6057 }
6058 );
6059 }
6060 }
6061 }
6062
6063 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
6064 {
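// used_blocks now has a bit set for every allocated unit; ORing in the
// freelist's free extents must cover the device exactly once. An overlap
// means a region is both free and allocated.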
6065 // remove bluefs_extents from used set since the freelist doesn't
6066 // know they are allocated.
6067 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
6068 apply(
6069 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
6070 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6071 assert(pos < bs.size());
6072 bs.reset(pos);
6073 }
6074 );
6075 }
6076 fm->enumerate_reset();
6077 uint64_t offset, length;
6078 while (fm->enumerate_next(&offset, &length)) {
6079 bool intersects = false;
6080 apply(
6081 offset, length, fm->get_alloc_size(), used_blocks,
6082 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
6083 assert(pos < bs.size());
6084 if (bs.test(pos)) {
6085 intersects = true;
6086 } else {
6087 bs.set(pos);
6088 }
6089 }
6090 );
6091 if (intersects) {
6092 if (offset == SUPER_RESERVED &&
6093 length == min_alloc_size - SUPER_RESERVED) {
6094 // this is due to the change just after luminous to min_alloc_size
6095 // granularity allocations, and our baked in assumption at the top
6096 // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
6097 // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)). harmless,
6098 // since we will never allocate this region below min_alloc_size.
6099 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
6100 << " and min_alloc_size, 0x" << std::hex << offset << "~"
6101 << length << dendl;
6102 } else {
6103 derr << "fsck error: free extent 0x" << std::hex << offset
6104 << "~" << length << std::dec
6105 << " intersects allocated blocks" << dendl;
6106 ++errors;
6107 }
6108 }
6109 }
6110 fm->enumerate_reset();
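// bits still clear are neither referenced by anything we saw nor free in
// the freelist, i.e. leaked; flip the bitmap and walk the runs to report
// the leaked extents.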
6111 size_t count = used_blocks.count();
6112 if (used_blocks.size() != count) {
6113 assert(used_blocks.size() > count);
6114 ++errors;
6115 used_blocks.flip();
6116 size_t start = used_blocks.find_first();
6117 while (start != decltype(used_blocks)::npos) {
6118 size_t cur = start;
6119 while (true) {
6120 size_t next = used_blocks.find_next(cur);
6121 if (next != cur + 1) {
6122 derr << "fsck error: leaked extent 0x" << std::hex
6123 << ((uint64_t)start * fm->get_alloc_size()) << "~"
6124 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
6125 << dendl;
6126 start = next;
6127 break;
6128 }
6129 cur = next;
6130 }
6131 }
6132 used_blocks.flip();
6133 }
6134 }
6135
6136 out_scan:
6137 mempool_thread.shutdown();
6138 _flush_cache();
6139 out_alloc:
6140 _close_alloc();
6141 out_fm:
6142 _close_fm();
6143 out_db:
6144 it.reset(); // before db is closed
6145 _close_db();
6146 out_bdev:
6147 _close_bdev();
6148 out_fsid:
6149 _close_fsid();
6150 out_path:
6151 _close_path();
6152
6153 // fatal errors take precedence
6154 if (r < 0)
6155 return r;
6156
6157 dout(2) << __func__ << " " << num_objects << " objects, "
6158 << num_sharded_objects << " of them sharded. "
6159 << dendl;
6160 dout(2) << __func__ << " " << num_extents << " extents to "
6161 << num_blobs << " blobs, "
6162 << num_spanning_blobs << " spanning, "
6163 << num_shared_blobs << " shared."
6164 << dendl;
6165
6166 utime_t duration = ceph_clock_now() - start;
6167 dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
6168 << " repaired, " << (errors - repaired) << " remaining in "
6169 << duration << " seconds" << dendl;
6170 return errors - repaired;
6171 }
6172
6173 void BlueStore::collect_metadata(map<string,string> *pm)
6174 {
6175 dout(10) << __func__ << dendl;
6176 bdev->collect_metadata("bluestore_bdev_", pm);
6177 if (bluefs) {
6178 (*pm)["bluefs"] = "1";
6179 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
6180 bluefs->collect_metadata(pm);
6181 } else {
6182 (*pm)["bluefs"] = "0";
6183 }
6184 }
6185
6186 int BlueStore::statfs(struct store_statfs_t *buf)
6187 {
6188 buf->reset();
6189 buf->total = bdev->get_size();
6190 buf->available = alloc->get_free();
6191
6192 if (bluefs) {
6193 // part of our shared device is "free" according to BlueFS, but we
6194 // can't touch bluestore_bluefs_min of it.
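// i.e. count at most (bluefs total - bluestore_bluefs_min) of the bluefs
// free space as available to bluestore.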
6195 int64_t shared_available = std::min(
6196 bluefs->get_free(bluefs_shared_bdev),
6197 bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
6198 if (shared_available > 0) {
6199 buf->available += shared_available;
6200 }
6201 }
6202
6203 {
6204 std::lock_guard<std::mutex> l(vstatfs_lock);
6205
6206 buf->allocated = vstatfs.allocated();
6207 buf->stored = vstatfs.stored();
6208 buf->compressed = vstatfs.compressed();
6209 buf->compressed_original = vstatfs.compressed_original();
6210 buf->compressed_allocated = vstatfs.compressed_allocated();
6211 }
6212
6213 dout(20) << __func__ << *buf << dendl;
6214 return 0;
6215 }
6216
6217 // ---------------
6218 // cache
6219
6220 BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
6221 {
6222 RWLock::RLocker l(coll_lock);
6223 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
6224 if (cp == coll_map.end())
6225 return CollectionRef();
6226 return cp->second;
6227 }
6228
6229 void BlueStore::_queue_reap_collection(CollectionRef& c)
6230 {
6231 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6232 // _reap_collections and this run in the same thread,
6233 // so no lock is needed.
6234 removed_collections.push_back(c);
6235 }
6236
6237 void BlueStore::_reap_collections()
6238 {
6239
6240 list<CollectionRef> removed_colls;
6241 {
6242 // _queue_reap_collection and this run in the same thread,
6243 // so no lock is needed.
6244 if (!removed_collections.empty())
6245 removed_colls.swap(removed_collections);
6246 else
6247 return;
6248 }
6249
6250 list<CollectionRef>::iterator p = removed_colls.begin();
6251 while (p != removed_colls.end()) {
6252 CollectionRef c = *p;
6253 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6254 if (c->onode_map.map_any([&](OnodeRef o) {
6255 assert(!o->exists);
6256 if (o->flushing_count.load()) {
6257 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
6258 << " flush_txns " << o->flushing_count << dendl;
6259 return true;
6260 }
6261 return false;
6262 })) {
6263 ++p;
6264 continue;
6265 }
6266 c->onode_map.clear();
6267 p = removed_colls.erase(p);
6268 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
6269 }
6270 if (removed_colls.empty()) {
6271 dout(10) << __func__ << " all reaped" << dendl;
6272 } else {
6273 removed_collections.splice(removed_collections.begin(), removed_colls);
6274 }
6275 }
6276
6277 void BlueStore::_update_cache_logger()
6278 {
6279 uint64_t num_onodes = 0;
6280 uint64_t num_extents = 0;
6281 uint64_t num_blobs = 0;
6282 uint64_t num_buffers = 0;
6283 uint64_t num_buffer_bytes = 0;
6284 for (auto c : cache_shards) {
6285 c->add_stats(&num_onodes, &num_extents, &num_blobs,
6286 &num_buffers, &num_buffer_bytes);
6287 }
6288 logger->set(l_bluestore_onodes, num_onodes);
6289 logger->set(l_bluestore_extents, num_extents);
6290 logger->set(l_bluestore_blobs, num_blobs);
6291 logger->set(l_bluestore_buffers, num_buffers);
6292 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
6293 }
6294
6295 // ---------------
6296 // read operations
6297
6298 ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
6299 {
6300 return _get_collection(cid);
6301 }
6302
6303 bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
6304 {
6305 CollectionHandle c = _get_collection(cid);
6306 if (!c)
6307 return false;
6308 return exists(c, oid);
6309 }
6310
6311 bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
6312 {
6313 Collection *c = static_cast<Collection *>(c_.get());
6314 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
6315 if (!c->exists)
6316 return false;
6317
6318 bool r = true;
6319
6320 {
6321 RWLock::RLocker l(c->lock);
6322 OnodeRef o = c->get_onode(oid, false);
6323 if (!o || !o->exists)
6324 r = false;
6325 }
6326
6327 return r;
6328 }
6329
6330 int BlueStore::stat(
6331 const coll_t& cid,
6332 const ghobject_t& oid,
6333 struct stat *st,
6334 bool allow_eio)
6335 {
6336 CollectionHandle c = _get_collection(cid);
6337 if (!c)
6338 return -ENOENT;
6339 return stat(c, oid, st, allow_eio);
6340 }
6341
6342 int BlueStore::stat(
6343 CollectionHandle &c_,
6344 const ghobject_t& oid,
6345 struct stat *st,
6346 bool allow_eio)
6347 {
6348 Collection *c = static_cast<Collection *>(c_.get());
6349 if (!c->exists)
6350 return -ENOENT;
6351 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
6352
6353 {
6354 RWLock::RLocker l(c->lock);
6355 OnodeRef o = c->get_onode(oid, false);
6356 if (!o || !o->exists)
6357 return -ENOENT;
6358 st->st_size = o->onode.size;
6359 st->st_blksize = 4096;
6360 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
6361 st->st_nlink = 1;
6362 }
6363
6364 int r = 0;
6365 if (_debug_mdata_eio(oid)) {
6366 r = -EIO;
6367 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6368 }
6369 return r;
6370 }
6371 int BlueStore::set_collection_opts(
6372 const coll_t& cid,
6373 const pool_opts_t& opts)
6374 {
6375 CollectionHandle ch = _get_collection(cid);
6376 if (!ch)
6377 return -ENOENT;
6378 Collection *c = static_cast<Collection *>(ch.get());
6379 dout(15) << __func__ << " " << cid << " options " << opts << dendl;
6380 if (!c->exists)
6381 return -ENOENT;
6382 RWLock::WLocker l(c->lock);
6383 c->pool_opts = opts;
6384 return 0;
6385 }
6386
6387 int BlueStore::read(
6388 const coll_t& cid,
6389 const ghobject_t& oid,
6390 uint64_t offset,
6391 size_t length,
6392 bufferlist& bl,
6393 uint32_t op_flags)
6394 {
6395 CollectionHandle c = _get_collection(cid);
6396 if (!c)
6397 return -ENOENT;
6398 return read(c, oid, offset, length, bl, op_flags);
6399 }
6400
6401 int BlueStore::read(
6402 CollectionHandle &c_,
6403 const ghobject_t& oid,
6404 uint64_t offset,
6405 size_t length,
6406 bufferlist& bl,
6407 uint32_t op_flags)
6408 {
6409 utime_t start = ceph_clock_now();
6410 Collection *c = static_cast<Collection *>(c_.get());
6411 const coll_t &cid = c->get_cid();
6412 dout(15) << __func__ << " " << cid << " " << oid
6413 << " 0x" << std::hex << offset << "~" << length << std::dec
6414 << dendl;
6415 if (!c->exists)
6416 return -ENOENT;
6417
6418 bl.clear();
6419 int r;
6420 {
6421 RWLock::RLocker l(c->lock);
6422 utime_t start1 = ceph_clock_now();
6423 OnodeRef o = c->get_onode(oid, false);
6424 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
6425 if (!o || !o->exists) {
6426 r = -ENOENT;
6427 goto out;
6428 }
6429
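// a read of 0~0 means read the whole object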
6430 if (offset == length && offset == 0)
6431 length = o->onode.size;
6432
6433 r = _do_read(c, o, offset, length, bl, op_flags);
6434 if (r == -EIO) {
6435 logger->inc(l_bluestore_read_eio);
6436 }
6437 }
6438
6439 out:
6440 if (r >= 0 && _debug_data_eio(oid)) {
6441 r = -EIO;
6442 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6443 } else if (cct->_conf->bluestore_debug_random_read_err &&
6444 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) {
6445 dout(0) << __func__ << ": inject random EIO" << dendl;
6446 r = -EIO;
6447 }
6448 dout(10) << __func__ << " " << cid << " " << oid
6449 << " 0x" << std::hex << offset << "~" << length << std::dec
6450 << " = " << r << dendl;
6451 logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
6452 return r;
6453 }
6454
6455 // --------------------------------------------------------
6456 // intermediate data structures used while reading
6457 struct region_t {
6458 uint64_t logical_offset;
6459 uint64_t blob_xoffset; // region offset within the blob
6460 uint64_t length;
6461 bufferlist bl;
6462
6463 // used later in read process
6464 uint64_t front = 0;
6465 uint64_t r_off = 0;
6466
6467 region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
6468 : logical_offset(offset),
6469 blob_xoffset(b_offs),
6470 length(len){}
6471 region_t(const region_t& from)
6472 : logical_offset(from.logical_offset),
6473 blob_xoffset(from.blob_xoffset),
6474 length(from.length){}
6475
6476 friend ostream& operator<<(ostream& out, const region_t& r) {
6477 return out << "0x" << std::hex << r.logical_offset << ":"
6478 << r.blob_xoffset << "~" << r.length << std::dec;
6479 }
6480 };
6481
6482 typedef list<region_t> regions2read_t;
6483 typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
6484
6485 int BlueStore::_do_read(
6486 Collection *c,
6487 OnodeRef o,
6488 uint64_t offset,
6489 size_t length,
6490 bufferlist& bl,
6491 uint32_t op_flags)
6492 {
6493 FUNCTRACE();
6494 int r = 0;
6495
6496 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6497 << " size 0x" << o->onode.size << " (" << std::dec
6498 << o->onode.size << ")" << dendl;
6499 bl.clear();
6500
6501 if (offset >= o->onode.size) {
6502 return r;
6503 }
6504
6505 // generally, don't buffer anything, unless the client explicitly requests
6506 // it.
6507 bool buffered = false;
6508 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
6509 dout(20) << __func__ << " will do buffered read" << dendl;
6510 buffered = true;
6511 } else if (cct->_conf->bluestore_default_buffered_read &&
6512 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
6513 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
6514 dout(20) << __func__ << " defaulting to buffered read" << dendl;
6515 buffered = true;
6516 }
6517
6518 if (offset + length > o->onode.size) {
6519 length = o->onode.size - offset;
6520 }
6521
6522 utime_t start = ceph_clock_now();
6523 o->extent_map.fault_range(db, offset, length);
6524 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
6525 _dump_onode(o);
6526
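// the read proceeds in three steps: split the range into cache hits and
// per-blob regions still to be read, issue the device reads (aio when more
// than one region is needed), then verify csums, decompress and assemble,
// zero-filling any holes.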
6527 ready_regions_t ready_regions;
6528
6529 // build a blob-wise list of regions to read (those not satisfied by the cache)
6530 blobs2read_t blobs2read;
6531 unsigned left = length;
6532 uint64_t pos = offset;
6533 unsigned num_regions = 0;
6534 auto lp = o->extent_map.seek_lextent(offset);
6535 while (left > 0 && lp != o->extent_map.extent_map.end()) {
6536 if (pos < lp->logical_offset) {
6537 unsigned hole = lp->logical_offset - pos;
6538 if (hole >= left) {
6539 break;
6540 }
6541 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
6542 << std::dec << dendl;
6543 pos += hole;
6544 left -= hole;
6545 }
6546 BlobRef& bptr = lp->blob;
6547 unsigned l_off = pos - lp->logical_offset;
6548 unsigned b_off = l_off + lp->blob_offset;
6549 unsigned b_len = std::min(left, lp->length - l_off);
6550
6551 ready_regions_t cache_res;
6552 interval_set<uint32_t> cache_interval;
6553 bptr->shared_blob->bc.read(
6554 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
6555 dout(20) << __func__ << " blob " << *bptr << std::hex
6556 << " need 0x" << b_off << "~" << b_len
6557 << " cache has 0x" << cache_interval
6558 << std::dec << dendl;
6559
6560 auto pc = cache_res.begin();
6561 while (b_len > 0) {
6562 unsigned l;
6563 if (pc != cache_res.end() &&
6564 pc->first == b_off) {
6565 l = pc->second.length();
6566 ready_regions[pos].claim(pc->second);
6567 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
6568 << b_off << "~" << l << std::dec << dendl;
6569 ++pc;
6570 } else {
6571 l = b_len;
6572 if (pc != cache_res.end()) {
6573 assert(pc->first > b_off);
6574 l = pc->first - b_off;
6575 }
6576 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
6577 << b_off << "~" << l << std::dec << dendl;
6578 blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
6579 ++num_regions;
6580 }
6581 pos += l;
6582 b_off += l;
6583 left -= l;
6584 b_len -= l;
6585 }
6586 ++lp;
6587 }
6588
6589 // read raw blob data. use aio if we have >1 blobs to read.
6590 start = ceph_clock_now(); // for simplicity, measure the whole
6591 // read block below as a single interval;
6592 // the resulting error is negligible.
6593 vector<bufferlist> compressed_blob_bls;
6594 IOContext ioc(cct, NULL, true); // allow EIO
6595 for (auto& p : blobs2read) {
6596 const BlobRef& bptr = p.first;
6597 dout(20) << __func__ << " blob " << *bptr << std::hex
6598 << " need " << p.second << std::dec << dendl;
6599 if (bptr->get_blob().is_compressed()) {
6600 // read the whole thing
6601 if (compressed_blob_bls.empty()) {
6602 // ensure we avoid any reallocation on subsequent blobs
6603 compressed_blob_bls.reserve(blobs2read.size());
6604 }
6605 compressed_blob_bls.push_back(bufferlist());
6606 bufferlist& bl = compressed_blob_bls.back();
6607 r = bptr->get_blob().map(
6608 0, bptr->get_blob().get_ondisk_length(),
6609 [&](uint64_t offset, uint64_t length) {
6610 int r;
6611 // use aio if there are more regions to read than those in this blob
6612 if (num_regions > p.second.size()) {
6613 r = bdev->aio_read(offset, length, &bl, &ioc);
6614 } else {
6615 r = bdev->read(offset, length, &bl, &ioc, false);
6616 }
6617 if (r < 0)
6618 return r;
6619 return 0;
6620 });
6621 if (r < 0) {
6622 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
6623 if (r == -EIO) {
6624 // propagate EIO to caller
6625 return r;
6626 }
6627 assert(r == 0);
6628 }
6629 } else {
6630 // read the pieces
6631 for (auto& reg : p.second) {
6632 // determine how much of the blob to read
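// round the region out to the blob's chunk size (csum/block granularity);
// reg.front records how far the requested offset sits past the rounded-down
// start so the extra bytes can be trimmed after the read.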
6633 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
6634 reg.r_off = reg.blob_xoffset;
6635 uint64_t r_len = reg.length;
6636 reg.front = reg.r_off % chunk_size;
6637 if (reg.front) {
6638 reg.r_off -= reg.front;
6639 r_len += reg.front;
6640 }
6641 unsigned tail = r_len % chunk_size;
6642 if (tail) {
6643 r_len += chunk_size - tail;
6644 }
6645 dout(20) << __func__ << " region 0x" << std::hex
6646 << reg.logical_offset
6647 << ": 0x" << reg.blob_xoffset << "~" << reg.length
6648 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
6649 << dendl;
6650
6651 // read it
6652 r = bptr->get_blob().map(
6653 reg.r_off, r_len,
6654 [&](uint64_t offset, uint64_t length) {
6655 int r;
6656 // use aio if there is more than one region to read
6657 if (num_regions > 1) {
6658 r = bdev->aio_read(offset, length, &reg.bl, &ioc);
6659 } else {
6660 r = bdev->read(offset, length, &reg.bl, &ioc, false);
6661 }
6662 if (r < 0)
6663 return r;
6664 return 0;
6665 });
6666 if (r < 0) {
6667 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
6668 << dendl;
6669 if (r == -EIO) {
6670 // propagate EIO to caller
6671 return r;
6672 }
6673 assert(r == 0);
6674 }
6675 assert(reg.bl.length() == r_len);
6676 }
6677 }
6678 }
6679 if (ioc.has_pending_aios()) {
6680 bdev->aio_submit(&ioc);
6681 dout(20) << __func__ << " waiting for aio" << dendl;
6682 ioc.aio_wait();
6683 r = ioc.get_return_value();
6684 if (r < 0) {
6685 assert(r == -EIO); // no other errors allowed
6686 return -EIO;
6687 }
6688 }
6689 logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
6690
6691 // enumerate and decompress desired blobs
6692 auto p = compressed_blob_bls.begin();
6693 blobs2read_t::iterator b2r_it = blobs2read.begin();
6694 while (b2r_it != blobs2read.end()) {
6695 const BlobRef& bptr = b2r_it->first;
6696 dout(20) << __func__ << " blob " << *bptr << std::hex
6697 << " need 0x" << b2r_it->second << std::dec << dendl;
6698 if (bptr->get_blob().is_compressed()) {
6699 assert(p != compressed_blob_bls.end());
6700 bufferlist& compressed_bl = *p++;
6701 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
6702 b2r_it->second.front().logical_offset) < 0) {
6703 return -EIO;
6704 }
6705 bufferlist raw_bl;
6706 r = _decompress(compressed_bl, &raw_bl);
6707 if (r < 0)
6708 return r;
6709 if (buffered) {
6710 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
6711 raw_bl);
6712 }
6713 for (auto& i : b2r_it->second) {
6714 ready_regions[i.logical_offset].substr_of(
6715 raw_bl, i.blob_xoffset, i.length);
6716 }
6717 } else {
6718 for (auto& reg : b2r_it->second) {
6719 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
6720 reg.logical_offset) < 0) {
6721 return -EIO;
6722 }
6723 if (buffered) {
6724 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
6725 reg.r_off, reg.bl);
6726 }
6727
6728 // prune and keep result
6729 ready_regions[reg.logical_offset].substr_of(
6730 reg.bl, reg.front, reg.length);
6731 }
6732 }
6733 ++b2r_it;
6734 }
6735
6736 // generate a resulting buffer
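// ready_regions is keyed by logical offset; append the pieces in order and
// zero-fill any gap between them (holes in the object).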
6737 auto pr = ready_regions.begin();
6738 auto pr_end = ready_regions.end();
6739 pos = 0;
6740 while (pos < length) {
6741 if (pr != pr_end && pr->first == pos + offset) {
6742 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6743 << ": data from 0x" << pr->first << "~" << pr->second.length()
6744 << std::dec << dendl;
6745 pos += pr->second.length();
6746 bl.claim_append(pr->second);
6747 ++pr;
6748 } else {
6749 uint64_t l = length - pos;
6750 if (pr != pr_end) {
6751 assert(pr->first > pos + offset);
6752 l = pr->first - (pos + offset);
6753 }
6754 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6755 << ": zeros for 0x" << (pos + offset) << "~" << l
6756 << std::dec << dendl;
6757 bl.append_zero(l);
6758 pos += l;
6759 }
6760 }
6761 assert(bl.length() == length);
6762 assert(pos == length);
6763 assert(pr == pr_end);
6764 r = bl.length();
6765 return r;
6766 }
6767
6768 int BlueStore::_verify_csum(OnodeRef& o,
6769 const bluestore_blob_t* blob, uint64_t blob_xoffset,
6770 const bufferlist& bl,
6771 uint64_t logical_offset) const
6772 {
6773 int bad;
6774 uint64_t bad_csum;
6775 utime_t start = ceph_clock_now();
6776 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
6777 if (r < 0) {
6778 if (r == -1) {
6779 PExtentVector pex;
6780 blob->map(
6781 bad,
6782 blob->get_csum_chunk_size(),
6783 [&](uint64_t offset, uint64_t length) {
6784 pex.emplace_back(bluestore_pextent_t(offset, length));
6785 return 0;
6786 });
6787 derr << __func__ << " bad "
6788 << Checksummer::get_csum_type_string(blob->csum_type)
6789 << "/0x" << std::hex << blob->get_csum_chunk_size()
6790 << " checksum at blob offset 0x" << bad
6791 << ", got 0x" << bad_csum << ", expected 0x"
6792 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
6793 << ", device location " << pex
6794 << ", logical extent 0x" << std::hex
6795 << (logical_offset + bad - blob_xoffset) << "~"
6796 << blob->get_csum_chunk_size() << std::dec
6797 << ", object " << o->oid
6798 << dendl;
6799 } else {
6800 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
6801 }
6802 }
6803 logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
6804 return r;
6805 }
6806
6807 int BlueStore::_decompress(bufferlist& source, bufferlist* result)
6808 {
6809 int r = 0;
6810 utime_t start = ceph_clock_now();
6811 bufferlist::iterator i = source.begin();
6812 bluestore_compression_header_t chdr;
6813 ::decode(chdr, i);
6814 int alg = int(chdr.type);
6815 CompressorRef cp = compressor;
6816 if (!cp || (int)cp->get_type() != alg) {
6817 cp = Compressor::create(cct, alg);
6818 }
6819
6820 if (!cp.get()) {
6821 // if the decompressor isn't available we have to fail: we cannot
6822 // return the decompressed data.
6823 derr << __func__ << " can't load decompressor " << alg << dendl;
6824 r = -EIO;
6825 } else {
6826 r = cp->decompress(i, chdr.length, *result);
6827 if (r < 0) {
6828 derr << __func__ << " decompression failed with exit code " << r << dendl;
6829 r = -EIO;
6830 }
6831 }
6832 logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
6833 return r;
6834 }
6835
6836 // this computes the fiemap into an interval_set; the other fiemap
6837 // variants use it internally
6838 int BlueStore::_fiemap(
6839 CollectionHandle &c_,
6840 const ghobject_t& oid,
6841 uint64_t offset,
6842 size_t length,
6843 interval_set<uint64_t>& destset)
6844 {
6845 Collection *c = static_cast<Collection *>(c_.get());
6846 if (!c->exists)
6847 return -ENOENT;
6848 {
6849 RWLock::RLocker l(c->lock);
6850
6851 OnodeRef o = c->get_onode(oid, false);
6852 if (!o || !o->exists) {
6853 return -ENOENT;
6854 }
6855 _dump_onode(o);
6856
6857 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6858 << " size 0x" << o->onode.size << std::dec << dendl;
6859
6860 boost::intrusive::set<Extent>::iterator ep, eend;
6861 if (offset >= o->onode.size)
6862 goto out;
6863
6864 if (offset + length > o->onode.size) {
6865 length = o->onode.size - offset;
6866 }
6867
6868 o->extent_map.fault_range(db, offset, length);
6869 eend = o->extent_map.extent_map.end();
6870 ep = o->extent_map.seek_lextent(offset);
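// walk lextents from 'offset': extents ending at or before the cursor are
// skipped, overlapping ranges are added to destset, and gaps between extents
// just advance the cursor.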
6871 while (length > 0) {
6872 dout(20) << __func__ << " offset " << offset << dendl;
6873 if (ep != eend && ep->logical_offset + ep->length <= offset) {
6874 ++ep;
6875 continue;
6876 }
6877
6878 uint64_t x_len = length;
6879 if (ep != eend && ep->logical_offset <= offset) {
6880 uint64_t x_off = offset - ep->logical_offset;
6881 x_len = MIN(x_len, ep->length - x_off);
6882 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
6883 << x_len << std::dec << " blob " << ep->blob << dendl;
6884 destset.insert(offset, x_len);
6885 length -= x_len;
6886 offset += x_len;
6887 if (x_off + x_len == ep->length)
6888 ++ep;
6889 continue;
6890 }
6891 if (ep != eend &&
6892 ep->logical_offset > offset &&
6893 ep->logical_offset - offset < x_len) {
6894 x_len = ep->logical_offset - offset;
6895 }
6896 offset += x_len;
6897 length -= x_len;
6898 }
6899 }
6900
6901 out:
6902 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6903 << " size = 0x(" << destset << ")" << std::dec << dendl;
6904 return 0;
6905 }
6906
6907 int BlueStore::fiemap(
6908 const coll_t& cid,
6909 const ghobject_t& oid,
6910 uint64_t offset,
6911 size_t len,
6912 bufferlist& bl)
6913 {
6914 CollectionHandle c = _get_collection(cid);
6915 if (!c)
6916 return -ENOENT;
6917 return fiemap(c, oid, offset, len, bl);
6918 }
6919
6920 int BlueStore::fiemap(
6921 CollectionHandle &c_,
6922 const ghobject_t& oid,
6923 uint64_t offset,
6924 size_t length,
6925 bufferlist& bl)
6926 {
6927 interval_set<uint64_t> m;
6928 int r = _fiemap(c_, oid, offset, length, m);
6929 if (r >= 0) {
6930 ::encode(m, bl);
6931 }
6932 return r;
6933 }
6934
6935 int BlueStore::fiemap(
6936 const coll_t& cid,
6937 const ghobject_t& oid,
6938 uint64_t offset,
6939 size_t len,
6940 map<uint64_t, uint64_t>& destmap)
6941 {
6942 CollectionHandle c = _get_collection(cid);
6943 if (!c)
6944 return -ENOENT;
6945 return fiemap(c, oid, offset, len, destmap);
6946 }
6947
6948 int BlueStore::fiemap(
6949 CollectionHandle &c_,
6950 const ghobject_t& oid,
6951 uint64_t offset,
6952 size_t length,
6953 map<uint64_t, uint64_t>& destmap)
6954 {
6955 interval_set<uint64_t> m;
6956 int r = _fiemap(c_, oid, offset, length, m);
6957 if (r >= 0) {
6958 m.move_into(destmap);
6959 }
6960 return r;
6961 }
6962
6963 int BlueStore::getattr(
6964 const coll_t& cid,
6965 const ghobject_t& oid,
6966 const char *name,
6967 bufferptr& value)
6968 {
6969 CollectionHandle c = _get_collection(cid);
6970 if (!c)
6971 return -ENOENT;
6972 return getattr(c, oid, name, value);
6973 }
6974
6975 int BlueStore::getattr(
6976 CollectionHandle &c_,
6977 const ghobject_t& oid,
6978 const char *name,
6979 bufferptr& value)
6980 {
6981 Collection *c = static_cast<Collection *>(c_.get());
6982 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
6983 if (!c->exists)
6984 return -ENOENT;
6985
6986 int r;
6987 {
6988 RWLock::RLocker l(c->lock);
6989 mempool::bluestore_cache_other::string k(name);
6990
6991 OnodeRef o = c->get_onode(oid, false);
6992 if (!o || !o->exists) {
6993 r = -ENOENT;
6994 goto out;
6995 }
6996
6997 if (!o->onode.attrs.count(k)) {
6998 r = -ENODATA;
6999 goto out;
7000 }
7001 value = o->onode.attrs[k];
7002 r = 0;
7003 }
7004 out:
7005 if (r == 0 && _debug_mdata_eio(oid)) {
7006 r = -EIO;
7007 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7008 }
7009 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
7010 << " = " << r << dendl;
7011 return r;
7012 }
7013
7014
7015 int BlueStore::getattrs(
7016 const coll_t& cid,
7017 const ghobject_t& oid,
7018 map<string,bufferptr>& aset)
7019 {
7020 CollectionHandle c = _get_collection(cid);
7021 if (!c)
7022 return -ENOENT;
7023 return getattrs(c, oid, aset);
7024 }
7025
7026 int BlueStore::getattrs(
7027 CollectionHandle &c_,
7028 const ghobject_t& oid,
7029 map<string,bufferptr>& aset)
7030 {
7031 Collection *c = static_cast<Collection *>(c_.get());
7032 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
7033 if (!c->exists)
7034 return -ENOENT;
7035
7036 int r;
7037 {
7038 RWLock::RLocker l(c->lock);
7039
7040 OnodeRef o = c->get_onode(oid, false);
7041 if (!o || !o->exists) {
7042 r = -ENOENT;
7043 goto out;
7044 }
7045 for (auto& i : o->onode.attrs) {
7046 aset.emplace(i.first.c_str(), i.second);
7047 }
7048 r = 0;
7049 }
7050
7051 out:
7052 if (r == 0 && _debug_mdata_eio(oid)) {
7053 r = -EIO;
7054 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
7055 }
7056 dout(10) << __func__ << " " << c->cid << " " << oid
7057 << " = " << r << dendl;
7058 return r;
7059 }
7060
7061 int BlueStore::list_collections(vector<coll_t>& ls)
7062 {
7063 RWLock::RLocker l(coll_lock);
7064 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
7065 p != coll_map.end();
7066 ++p)
7067 ls.push_back(p->first);
7068 return 0;
7069 }
7070
7071 bool BlueStore::collection_exists(const coll_t& c)
7072 {
7073 RWLock::RLocker l(coll_lock);
7074 return coll_map.count(c);
7075 }
7076
7077 int BlueStore::collection_empty(const coll_t& cid, bool *empty)
7078 {
7079 dout(15) << __func__ << " " << cid << dendl;
7080 vector<ghobject_t> ls;
7081 ghobject_t next;
7082 int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
7083 &ls, &next);
7084 if (r < 0) {
7085 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
7086 << dendl;
7087 return r;
7088 }
7089 *empty = ls.empty();
7090 dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
7091 return 0;
7092 }
7093
7094 int BlueStore::collection_bits(const coll_t& cid)
7095 {
7096 dout(15) << __func__ << " " << cid << dendl;
7097 CollectionRef c = _get_collection(cid);
7098 if (!c)
7099 return -ENOENT;
7100 RWLock::RLocker l(c->lock);
7101 dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
7102 return c->cnode.bits;
7103 }
7104
7105 int BlueStore::collection_list(
7106 const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
7107 vector<ghobject_t> *ls, ghobject_t *pnext)
7108 {
7109 CollectionHandle c = _get_collection(cid);
7110 if (!c)
7111 return -ENOENT;
7112 return collection_list(c, start, end, max, ls, pnext);
7113 }
7114
7115 int BlueStore::collection_list(
7116 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
7117 vector<ghobject_t> *ls, ghobject_t *pnext)
7118 {
7119 Collection *c = static_cast<Collection *>(c_.get());
7120 dout(15) << __func__ << " " << c->cid
7121 << " start " << start << " end " << end << " max " << max << dendl;
7122 int r;
7123 {
7124 RWLock::RLocker l(c->lock);
7125 r = _collection_list(c, start, end, max, ls, pnext);
7126 }
7127
7128 dout(10) << __func__ << " " << c->cid
7129 << " start " << start << " end " << end << " max " << max
7130 << " = " << r << ", ls.size() = " << ls->size()
7131 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
7132 return r;
7133 }
7134
7135 int BlueStore::_collection_list(
7136 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
7137 vector<ghobject_t> *ls, ghobject_t *pnext)
7138 {
7139
7140 if (!c->exists)
7141 return -ENOENT;
7142
7143 int r = 0;
7144 ghobject_t static_next;
7145 KeyValueDB::Iterator it;
7146 string temp_start_key, temp_end_key;
7147 string start_key, end_key;
7148 bool set_next = false;
7149 string pend;
7150 bool temp;
7151
7152 if (!pnext)
7153 pnext = &static_next;
7154
7155 if (start == ghobject_t::get_max() ||
7156 start.hobj.is_max()) {
7157 goto out;
7158 }
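// objects live in two key ranges, one for temp objects and one for regular
// ones; listing starts in the temp range (unless 'start' already falls in
// the regular range) and switches over once the temp keys are exhausted.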
7159 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
7160 &start_key, &end_key);
7161 dout(20) << __func__
7162 << " range " << pretty_binary_string(temp_start_key)
7163 << " to " << pretty_binary_string(temp_end_key)
7164 << " and " << pretty_binary_string(start_key)
7165 << " to " << pretty_binary_string(end_key)
7166 << " start " << start << dendl;
7167 it = db->get_iterator(PREFIX_OBJ);
7168 if (start == ghobject_t() ||
7169 start.hobj == hobject_t() ||
7170 start == c->cid.get_min_hobj()) {
7171 it->upper_bound(temp_start_key);
7172 temp = true;
7173 } else {
7174 string k;
7175 get_object_key(cct, start, &k);
7176 if (start.hobj.is_temp()) {
7177 temp = true;
7178 assert(k >= temp_start_key && k < temp_end_key);
7179 } else {
7180 temp = false;
7181 assert(k >= start_key && k < end_key);
7182 }
7183 dout(20) << " start from " << pretty_binary_string(k)
7184 << " temp=" << (int)temp << dendl;
7185 it->lower_bound(k);
7186 }
7187 if (end.hobj.is_max()) {
7188 pend = temp ? temp_end_key : end_key;
7189 } else {
7190 get_object_key(cct, end, &end_key);
7191 if (end.hobj.is_temp()) {
7192 if (temp)
7193 pend = end_key;
7194 else
7195 goto out;
7196 } else {
7197 pend = temp ? temp_end_key : end_key;
7198 }
7199 }
7200 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7201 while (true) {
7202 if (!it->valid() || it->key() >= pend) {
7203 if (!it->valid())
7204 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
7205 else
7206 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
7207 << " >= " << end << dendl;
7208 if (temp) {
7209 if (end.hobj.is_temp()) {
7210 break;
7211 }
7212 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
7213 temp = false;
7214 it->upper_bound(start_key);
7215 pend = end_key;
7216 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7217 continue;
7218 }
7219 break;
7220 }
7221 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
7222 if (is_extent_shard_key(it->key())) {
7223 it->next();
7224 continue;
7225 }
7226 ghobject_t oid;
7227 int r = get_key_object(it->key(), &oid);
7228 assert(r == 0);
7229 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
7230 if (ls->size() >= (unsigned)max) {
7231 dout(20) << __func__ << " reached max " << max << dendl;
7232 *pnext = oid;
7233 set_next = true;
7234 break;
7235 }
7236 ls->push_back(oid);
7237 it->next();
7238 }
7239 out:
7240 if (!set_next) {
7241 *pnext = ghobject_t::get_max();
7242 }
7243
7244 return r;
7245 }
7246
7247 int BlueStore::omap_get(
7248 const coll_t& cid, ///< [in] Collection containing oid
7249 const ghobject_t &oid, ///< [in] Object containing omap
7250 bufferlist *header, ///< [out] omap header
7251 map<string, bufferlist> *out ///< [out] Key to value map
7252 )
7253 {
7254 CollectionHandle c = _get_collection(cid);
7255 if (!c)
7256 return -ENOENT;
7257 return omap_get(c, oid, header, out);
7258 }
7259
7260 int BlueStore::omap_get(
7261 CollectionHandle &c_, ///< [in] Collection containing oid
7262 const ghobject_t &oid, ///< [in] Object containing omap
7263 bufferlist *header, ///< [out] omap header
7264 map<string, bufferlist> *out ///< [out] Key to value map
7265 )
7266 {
7267 Collection *c = static_cast<Collection *>(c_.get());
7268 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7269 if (!c->exists)
7270 return -ENOENT;
7271 RWLock::RLocker l(c->lock);
7272 int r = 0;
7273 OnodeRef o = c->get_onode(oid, false);
7274 if (!o || !o->exists) {
7275 r = -ENOENT;
7276 goto out;
7277 }
7278 if (!o->onode.has_omap())
7279 goto out;
7280 o->flush();
7281 {
7282 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7283 string head, tail;
7284 get_omap_header(o->onode.nid, &head);
7285 get_omap_tail(o->onode.nid, &tail);
7286 it->lower_bound(head);
7287 while (it->valid()) {
7288 if (it->key() == head) {
7289 dout(30) << __func__ << " got header" << dendl;
7290 *header = it->value();
7291 } else if (it->key() >= tail) {
7292 dout(30) << __func__ << " reached tail" << dendl;
7293 break;
7294 } else {
7295 string user_key;
7296 decode_omap_key(it->key(), &user_key);
7297 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7298 << " -> " << user_key << dendl;
7299 (*out)[user_key] = it->value();
7300 }
7301 it->next();
7302 }
7303 }
7304 out:
7305 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7306 << dendl;
7307 return r;
7308 }
7309
7310 int BlueStore::omap_get_header(
7311 const coll_t& cid, ///< [in] Collection containing oid
7312 const ghobject_t &oid, ///< [in] Object containing omap
7313 bufferlist *header, ///< [out] omap header
7314 bool allow_eio ///< [in] don't assert on eio
7315 )
7316 {
7317 CollectionHandle c = _get_collection(cid);
7318 if (!c)
7319 return -ENOENT;
7320 return omap_get_header(c, oid, header, allow_eio);
7321 }
7322
7323 int BlueStore::omap_get_header(
7324 CollectionHandle &c_, ///< [in] Collection containing oid
7325 const ghobject_t &oid, ///< [in] Object containing omap
7326 bufferlist *header, ///< [out] omap header
7327 bool allow_eio ///< [in] don't assert on eio
7328 )
7329 {
7330 Collection *c = static_cast<Collection *>(c_.get());
7331 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7332 if (!c->exists)
7333 return -ENOENT;
7334 RWLock::RLocker l(c->lock);
7335 int r = 0;
7336 OnodeRef o = c->get_onode(oid, false);
7337 if (!o || !o->exists) {
7338 r = -ENOENT;
7339 goto out;
7340 }
7341 if (!o->onode.has_omap())
7342 goto out;
7343 o->flush();
7344 {
7345 string head;
7346 get_omap_header(o->onode.nid, &head);
7347 if (db->get(PREFIX_OMAP, head, header) >= 0) {
7348 dout(30) << __func__ << " got header" << dendl;
7349 } else {
7350 dout(30) << __func__ << " no header" << dendl;
7351 }
7352 }
7353 out:
7354 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7355 << dendl;
7356 return r;
7357 }
7358
7359 int BlueStore::omap_get_keys(
7360 const coll_t& cid, ///< [in] Collection containing oid
7361 const ghobject_t &oid, ///< [in] Object containing omap
7362 set<string> *keys ///< [out] Keys defined on oid
7363 )
7364 {
7365 CollectionHandle c = _get_collection(cid);
7366 if (!c)
7367 return -ENOENT;
7368 return omap_get_keys(c, oid, keys);
7369 }
7370
7371 int BlueStore::omap_get_keys(
7372 CollectionHandle &c_, ///< [in] Collection containing oid
7373 const ghobject_t &oid, ///< [in] Object containing omap
7374 set<string> *keys ///< [out] Keys defined on oid
7375 )
7376 {
7377 Collection *c = static_cast<Collection *>(c_.get());
7378 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7379 if (!c->exists)
7380 return -ENOENT;
7381 RWLock::RLocker l(c->lock);
7382 int r = 0;
7383 OnodeRef o = c->get_onode(oid, false);
7384 if (!o || !o->exists) {
7385 r = -ENOENT;
7386 goto out;
7387 }
7388 if (!o->onode.has_omap())
7389 goto out;
7390 o->flush();
7391 {
7392 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7393 string head, tail;
7394 get_omap_key(o->onode.nid, string(), &head);
7395 get_omap_tail(o->onode.nid, &tail);
7396 it->lower_bound(head);
7397 while (it->valid()) {
7398 if (it->key() >= tail) {
7399 dout(30) << __func__ << " reached tail" << dendl;
7400 break;
7401 }
7402 string user_key;
7403 decode_omap_key(it->key(), &user_key);
7404 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7405 << " -> " << user_key << dendl;
7406 keys->insert(user_key);
7407 it->next();
7408 }
7409 }
7410 out:
7411 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7412 << dendl;
7413 return r;
7414 }
7415
7416 int BlueStore::omap_get_values(
7417 const coll_t& cid, ///< [in] Collection containing oid
7418 const ghobject_t &oid, ///< [in] Object containing omap
7419 const set<string> &keys, ///< [in] Keys to get
7420 map<string, bufferlist> *out ///< [out] Returned keys and values
7421 )
7422 {
7423 CollectionHandle c = _get_collection(cid);
7424 if (!c)
7425 return -ENOENT;
7426 return omap_get_values(c, oid, keys, out);
7427 }
7428
7429 int BlueStore::omap_get_values(
7430 CollectionHandle &c_, ///< [in] Collection containing oid
7431 const ghobject_t &oid, ///< [in] Object containing omap
7432 const set<string> &keys, ///< [in] Keys to get
7433 map<string, bufferlist> *out ///< [out] Returned keys and values
7434 )
7435 {
7436 Collection *c = static_cast<Collection *>(c_.get());
7437 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7438 if (!c->exists)
7439 return -ENOENT;
7440 RWLock::RLocker l(c->lock);
7441 int r = 0;
7442 string final_key;
7443 OnodeRef o = c->get_onode(oid, false);
7444 if (!o || !o->exists) {
7445 r = -ENOENT;
7446 goto out;
7447 }
7448 if (!o->onode.has_omap())
7449 goto out;
7450 o->flush();
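// omap keys are the u64-encoded nid followed by '.' and the user key; build
// the 9-byte prefix once and append each requested key to it.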
7451 _key_encode_u64(o->onode.nid, &final_key);
7452 final_key.push_back('.');
7453 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7454 final_key.resize(9); // keep prefix
7455 final_key += *p;
7456 bufferlist val;
7457 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7458 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
7459 << " -> " << *p << dendl;
7460 out->insert(make_pair(*p, val));
7461 }
7462 }
7463 out:
7464 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7465 << dendl;
7466 return r;
7467 }
7468
7469 int BlueStore::omap_check_keys(
7470 const coll_t& cid, ///< [in] Collection containing oid
7471 const ghobject_t &oid, ///< [in] Object containing omap
7472 const set<string> &keys, ///< [in] Keys to check
7473 set<string> *out ///< [out] Subset of keys defined on oid
7474 )
7475 {
7476 CollectionHandle c = _get_collection(cid);
7477 if (!c)
7478 return -ENOENT;
7479 return omap_check_keys(c, oid, keys, out);
7480 }
7481
7482 int BlueStore::omap_check_keys(
7483 CollectionHandle &c_, ///< [in] Collection containing oid
7484 const ghobject_t &oid, ///< [in] Object containing omap
7485 const set<string> &keys, ///< [in] Keys to check
7486 set<string> *out ///< [out] Subset of keys defined on oid
7487 )
7488 {
7489 Collection *c = static_cast<Collection *>(c_.get());
7490 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7491 if (!c->exists)
7492 return -ENOENT;
7493 RWLock::RLocker l(c->lock);
7494 int r = 0;
7495 string final_key;
7496 OnodeRef o = c->get_onode(oid, false);
7497 if (!o || !o->exists) {
7498 r = -ENOENT;
7499 goto out;
7500 }
7501 if (!o->onode.has_omap())
7502 goto out;
7503 o->flush();
7504 _key_encode_u64(o->onode.nid, &final_key);
7505 final_key.push_back('.');
7506 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7507 final_key.resize(9); // keep prefix
7508 final_key += *p;
7509 bufferlist val;
7510 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7511 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
7512 << " -> " << *p << dendl;
7513 out->insert(*p);
7514 } else {
7515 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
7516 << " -> " << *p << dendl;
7517 }
7518 }
7519 out:
7520 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7521 << dendl;
7522 return r;
7523 }
7524
7525 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7526 const coll_t& cid, ///< [in] collection
7527 const ghobject_t &oid ///< [in] object
7528 )
7529 {
7530 CollectionHandle c = _get_collection(cid);
7531 if (!c) {
7532 dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
7533 return ObjectMap::ObjectMapIterator();
7534 }
7535 return get_omap_iterator(c, oid);
7536 }
7537
7538 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7539 CollectionHandle &c_, ///< [in] collection
7540 const ghobject_t &oid ///< [in] object
7541 )
7542 {
7543 Collection *c = static_cast<Collection *>(c_.get());
7544 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
7545 if (!c->exists) {
7546 return ObjectMap::ObjectMapIterator();
7547 }
7548 RWLock::RLocker l(c->lock);
7549 OnodeRef o = c->get_onode(oid, false);
7550 if (!o || !o->exists) {
7551 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
7552 return ObjectMap::ObjectMapIterator();
7553 }
7554 o->flush();
7555 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
7556 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7557 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
7558 }
7559
7560 // -----------------
7561 // write helpers
7562
7563 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
7564 {
7565 dout(10) << __func__ << " ondisk_format " << ondisk_format
7566 << " min_compat_ondisk_format " << min_compat_ondisk_format
7567 << dendl;
7568 assert(ondisk_format == latest_ondisk_format);
7569 {
7570 bufferlist bl;
7571 ::encode(ondisk_format, bl);
7572 t->set(PREFIX_SUPER, "ondisk_format", bl);
7573 }
7574 {
7575 bufferlist bl;
7576 ::encode(min_compat_ondisk_format, bl);
7577 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
7578 }
7579 }
7580
7581 int BlueStore::_open_super_meta()
7582 {
7583 // nid
7584 {
7585 nid_max = 0;
7586 bufferlist bl;
7587 db->get(PREFIX_SUPER, "nid_max", &bl);
7588 bufferlist::iterator p = bl.begin();
7589 try {
7590 uint64_t v;
7591 ::decode(v, p);
7592 nid_max = v;
7593 } catch (buffer::error& e) {
7594 derr << __func__ << " unable to read nid_max" << dendl;
7595 return -EIO;
7596 }
7597 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
7598 nid_last = nid_max.load();
7599 }
7600
7601 // blobid
7602 {
7603 blobid_max = 0;
7604 bufferlist bl;
7605 db->get(PREFIX_SUPER, "blobid_max", &bl);
7606 bufferlist::iterator p = bl.begin();
7607 try {
7608 uint64_t v;
7609 ::decode(v, p);
7610 blobid_max = v;
7611 } catch (buffer::error& e) {
7612 derr << __func__ << " unable to read blobid_max" << dendl;
7613 return -EIO;
7614 }
7615 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
7616 blobid_last = blobid_max.load();
7617 }
7618
7619 // freelist
7620 {
7621 bufferlist bl;
7622 db->get(PREFIX_SUPER, "freelist_type", &bl);
7623 if (bl.length()) {
7624 freelist_type = std::string(bl.c_str(), bl.length());
7625 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
7626 } else {
7627 assert("extent freelist manager not supported" == 0);
7628 }
7629 }
7630
7631 // bluefs alloc
7632 if (cct->_conf->bluestore_bluefs) {
7633 bluefs_extents.clear();
7634 bufferlist bl;
7635 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7636 bufferlist::iterator p = bl.begin();
7637 try {
7638 ::decode(bluefs_extents, p);
7639 }
7640 catch (buffer::error& e) {
7641 derr << __func__ << " unable to read bluefs_extents" << dendl;
7642 return -EIO;
7643 }
7644 dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
7645 << std::dec << dendl;
7646 }
7647
7648 // ondisk format
7649 int32_t compat_ondisk_format = 0;
7650 {
7651 bufferlist bl;
7652 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
7653 if (r < 0) {
7654 // base case: kraken bluestore is v1 and readable by v1
7655 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
7656 << dendl;
7657 ondisk_format = 1;
7658 compat_ondisk_format = 1;
7659 } else {
7660 auto p = bl.begin();
7661 try {
7662 ::decode(ondisk_format, p);
7663 } catch (buffer::error& e) {
7664 derr << __func__ << " unable to read ondisk_format" << dendl;
7665 return -EIO;
7666 }
7667 bl.clear();
7668 {
7669 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
7670 assert(!r);
7671 auto p = bl.begin();
7672 try {
7673 ::decode(compat_ondisk_format, p);
7674 } catch (buffer::error& e) {
7675 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
7676 return -EIO;
7677 }
7678 }
7679 }
7680 dout(10) << __func__ << " ondisk_format " << ondisk_format
7681 << " compat_ondisk_format " << compat_ondisk_format
7682 << dendl;
7683 }
7684
7685 if (latest_ondisk_format < compat_ondisk_format) {
7686 derr << __func__ << " compat_ondisk_format is "
7687 << compat_ondisk_format << " but we only understand version "
7688 << latest_ondisk_format << dendl;
7689 return -EPERM;
7690 }
7691 if (ondisk_format < latest_ondisk_format) {
7692 int r = _upgrade_super();
7693 if (r < 0) {
7694 return r;
7695 }
7696 }
7697
7698 {
7699 bufferlist bl;
7700 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
7701 auto p = bl.begin();
7702 try {
7703 uint64_t val;
7704 ::decode(val, p);
7705 min_alloc_size = val;
7706 min_alloc_size_order = ctz(val);
7707 assert(min_alloc_size == 1u << min_alloc_size_order);
7708 } catch (buffer::error& e) {
7709 derr << __func__ << " unable to read min_alloc_size" << dendl;
7710 return -EIO;
7711 }
7712 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7713 << std::dec << dendl;
7714 }
7715 _open_statfs();
7716 _set_alloc_sizes();
7717 _set_throttle_params();
7718
7719 _set_csum();
7720 _set_compression();
7721 _set_blob_size();
7722
7723 return 0;
7724 }
7725
7726 int BlueStore::_upgrade_super()
7727 {
7728 dout(1) << __func__ << " from " << ondisk_format << ", latest "
7729 << latest_ondisk_format << dendl;
7730 assert(ondisk_format > 0);
7731 assert(ondisk_format < latest_ondisk_format);
7732
7733 if (ondisk_format == 1) {
7734 // changes:
7735 // - super: added ondisk_format
7736 // - super: added min_readable_ondisk_format
7737 // - super: added min_compat_ondisk_format
7738 // - super: added min_alloc_size
7739 // - super: removed min_min_alloc_size
7740 KeyValueDB::Transaction t = db->get_transaction();
7741 {
7742 bufferlist bl;
7743 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
7744 auto p = bl.begin();
7745 try {
7746 uint64_t val;
7747 ::decode(val, p);
7748 min_alloc_size = val;
7749 } catch (buffer::error& e) {
7750 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
7751 return -EIO;
7752 }
7753 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7754 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7755 }
7756 ondisk_format = 2;
7757 _prepare_ondisk_format_super(t);
7758 int r = db->submit_transaction_sync(t);
7759 assert(r == 0);
7760 }
7761
7762 // done
7763 dout(1) << __func__ << " done" << dendl;
7764 return 0;
7765 }
7766
7767 void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
7768 {
7769 if (o->onode.nid) {
7770 assert(o->exists);
7771 return;
7772 }
7773 uint64_t nid = ++nid_last;
7774 dout(20) << __func__ << " " << nid << dendl;
7775 o->onode.nid = nid;
7776 txc->last_nid = nid;
7777 o->exists = true;
7778 }
7779
7780 uint64_t BlueStore::_assign_blobid(TransContext *txc)
7781 {
7782 uint64_t bid = ++blobid_last;
7783 dout(20) << __func__ << " " << bid << dendl;
7784 txc->last_blobid = bid;
7785 return bid;
7786 }
7787
7788 void BlueStore::get_db_statistics(Formatter *f)
7789 {
7790 db->get_statistics(f);
7791 }
7792
7793 BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
7794 {
7795 TransContext *txc = new TransContext(cct, osr);
7796 txc->t = db->get_transaction();
7797 osr->queue_new(txc);
7798 dout(20) << __func__ << " osr " << osr << " = " << txc
7799 << " seq " << txc->seq << dendl;
7800 return txc;
7801 }
7802
7803 void BlueStore::_txc_calc_cost(TransContext *txc)
7804 {
7805 // this is about the simplest model for transaction cost you can
7806 // imagine: a fixed overhead expressed as a minimum of one "io",
7807 // plus a configurable cost per "io" (with different hdd and ssd
7808 // defaults) for each pending aio segment, plus the transaction's
7809 // byte count.
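  // e.g. (illustrative) a txc whose pending aios carry three iovecs in
  // total and 64KB of data would cost (1 + 3) * throttle_cost_per_io + 65536.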
7810 int ios = 1; // one "io" for the kv commit
7811 for (auto& p : txc->ioc.pending_aios) {
7812 ios += p.iov.size();
7813 }
7814 auto cost = throttle_cost_per_io.load();
7815 txc->cost = ios * cost + txc->bytes;
7816 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
7817 << ios << " ios * " << cost << " + " << txc->bytes
7818 << " bytes)" << dendl;
7819 }
7820
7821 void BlueStore::_txc_update_store_statfs(TransContext *txc)
7822 {
7823 if (txc->statfs_delta.is_empty())
7824 return;
7825
7826 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
7827 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
7828 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
7829 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
7830 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
7831
7832 {
7833 std::lock_guard<std::mutex> l(vstatfs_lock);
7834 vstatfs += txc->statfs_delta;
7835 }
7836
7837 bufferlist bl;
7838 txc->statfs_delta.encode(bl);
7839
7840 txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
7841 txc->statfs_delta.reset();
7842 }
7843
7844 void BlueStore::_txc_state_proc(TransContext *txc)
7845 {
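  // typical progression through the switch below:
  //   PREPARE -> [AIO_WAIT ->] IO_DONE -> KV_QUEUED -> KV_SUBMITTED
  //   -> KV_DONE -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP ->] FINISHING -> DONE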
7846 while (true) {
7847 dout(10) << __func__ << " txc " << txc
7848 << " " << txc->get_state_name() << dendl;
7849 switch (txc->state) {
7850 case TransContext::STATE_PREPARE:
7851 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
7852 if (txc->ioc.has_pending_aios()) {
7853 txc->state = TransContext::STATE_AIO_WAIT;
7854 txc->had_ios = true;
7855 _txc_aio_submit(txc);
7856 return;
7857 }
7858 // ** fall-thru **
7859
7860 case TransContext::STATE_AIO_WAIT:
7861 txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
7862 _txc_finish_io(txc); // may trigger blocked txc's too
7863 return;
7864
7865 case TransContext::STATE_IO_DONE:
7866 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
7867 if (txc->had_ios) {
7868 ++txc->osr->txc_with_unstable_io;
7869 }
7870 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
7871 txc->state = TransContext::STATE_KV_QUEUED;
7872 if (cct->_conf->bluestore_sync_submit_transaction) {
7873 if (txc->last_nid >= nid_max ||
7874 txc->last_blobid >= blobid_max) {
7875 dout(20) << __func__
7876 << " last_{nid,blobid} exceeds max, submit via kv thread"
7877 << dendl;
7878 } else if (txc->osr->kv_committing_serially) {
7879 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
7880 << dendl;
7881 // note: this is starvation-prone. once we have a txc in a busy
7882 // sequencer that is committing serially it is possible to keep
7883 // submitting new transactions fast enough that we get stuck doing
7884 // so. the alternative is to block here... fixme?
7885 } else if (txc->osr->txc_with_unstable_io) {
7886 dout(20) << __func__ << " prior txc(s) with unstable ios "
7887 << txc->osr->txc_with_unstable_io.load() << dendl;
7888 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
7889 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
7890 == 0) {
7891 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
7892 << dendl;
7893 } else {
7894 txc->state = TransContext::STATE_KV_SUBMITTED;
7895 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
7896 assert(r == 0);
7897 _txc_applied_kv(txc);
7898 }
7899 }
7900 {
7901 std::lock_guard<std::mutex> l(kv_lock);
7902 kv_queue.push_back(txc);
7903 kv_cond.notify_one();
7904 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
7905 kv_queue_unsubmitted.push_back(txc);
7906 ++txc->osr->kv_committing_serially;
7907 }
7908 if (txc->had_ios)
7909 kv_ios++;
7910 kv_throttle_costs += txc->cost;
7911 }
7912 return;
7913 case TransContext::STATE_KV_SUBMITTED:
7914 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
7915 txc->state = TransContext::STATE_KV_DONE;
7916 _txc_committed_kv(txc);
7917 // ** fall-thru **
7918
7919 case TransContext::STATE_KV_DONE:
7920 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
7921 if (txc->deferred_txn) {
7922 txc->state = TransContext::STATE_DEFERRED_QUEUED;
7923 _deferred_queue(txc);
7924 return;
7925 }
7926 txc->state = TransContext::STATE_FINISHING;
7927 break;
7928
7929 case TransContext::STATE_DEFERRED_CLEANUP:
7930 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
7931 txc->state = TransContext::STATE_FINISHING;
7932 // ** fall-thru **
7933
7934 case TransContext::STATE_FINISHING:
7935 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
7936 _txc_finish(txc);
7937 return;
7938
7939 default:
7940 derr << __func__ << " unexpected txc " << txc
7941 << " state " << txc->get_state_name() << dendl;
7942 assert(0 == "unexpected txc state");
7943 return;
7944 }
7945 }
7946 }
7947
7948 void BlueStore::_txc_finish_io(TransContext *txc)
7949 {
7950 dout(20) << __func__ << " " << txc << dendl;
7951
7952 /*
7953 * we need to preserve the order of kv transactions,
7954 * even though aio will complete in any order.
7955 */
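  // illustrative: if osr->q holds [A, B] and B's aio completes first, B is
  // only marked IO_DONE here and left queued; when A's aio later completes,
  // the loop below advances A and then B, preserving submission order.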
7956
7957 OpSequencer *osr = txc->osr.get();
7958 std::lock_guard<std::mutex> l(osr->qlock);
7959 txc->state = TransContext::STATE_IO_DONE;
7960
7961 // release aio contexts (including pinned buffers).
7962 txc->ioc.running_aios.clear();
7963
7964 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
7965 while (p != osr->q.begin()) {
7966 --p;
7967 if (p->state < TransContext::STATE_IO_DONE) {
7968 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
7969 << p->get_state_name() << dendl;
7970 return;
7971 }
7972 if (p->state > TransContext::STATE_IO_DONE) {
7973 ++p;
7974 break;
7975 }
7976 }
7977 do {
7978 _txc_state_proc(&*p++);
7979 } while (p != osr->q.end() &&
7980 p->state == TransContext::STATE_IO_DONE);
7981
7982 if (osr->kv_submitted_waiters &&
7983 osr->_is_all_kv_submitted()) {
7984 osr->qcond.notify_all();
7985 }
7986 }
7987
7988 void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
7989 {
7990 dout(20) << __func__ << " txc " << txc
7991 << " onodes " << txc->onodes
7992 << " shared_blobs " << txc->shared_blobs
7993 << dendl;
7994
7995 // finalize onodes
7996 for (auto o : txc->onodes) {
7997 // finalize extent_map shards
7998 o->extent_map.update(t, false);
7999 if (o->extent_map.needs_reshard()) {
8000 o->extent_map.reshard(db, t);
8001 o->extent_map.update(t, true);
8002 if (o->extent_map.needs_reshard()) {
8003 dout(20) << __func__ << " warning: still wants reshard, check options?"
8004 << dendl;
8005 o->extent_map.clear_needs_reshard();
8006 }
8007 logger->inc(l_bluestore_onode_reshard);
8008 }
8009
8010 // bound encode
8011 size_t bound = 0;
8012 denc(o->onode, bound);
8013 o->extent_map.bound_encode_spanning_blobs(bound);
8014 if (o->onode.extent_map_shards.empty()) {
8015 denc(o->extent_map.inline_bl, bound);
8016 }
8017
8018 // encode
8019 bufferlist bl;
8020 unsigned onode_part, blob_part, extent_part;
8021 {
8022 auto p = bl.get_contiguous_appender(bound, true);
8023 denc(o->onode, p);
8024 onode_part = p.get_logical_offset();
8025 o->extent_map.encode_spanning_blobs(p);
8026 blob_part = p.get_logical_offset() - onode_part;
8027 if (o->onode.extent_map_shards.empty()) {
8028 denc(o->extent_map.inline_bl, p);
8029 }
8030 extent_part = p.get_logical_offset() - onode_part - blob_part;
8031 }
8032
8033 dout(20) << " onode " << o->oid << " is " << bl.length()
8034 << " (" << onode_part << " bytes onode + "
8035 << blob_part << " bytes spanning blobs + "
8036 << extent_part << " bytes inline extents)"
8037 << dendl;
8038 t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
8039 o->flushing_count++;
8040 }
8041
8042 // objects we modified but whose onode we did not touch
8043 auto p = txc->modified_objects.begin();
8044 while (p != txc->modified_objects.end()) {
8045 if (txc->onodes.count(*p) == 0) {
8046 (*p)->flushing_count++;
8047 ++p;
8048 } else {
8049 // remove dups with onodes list to avoid problems in _txc_finish
8050 p = txc->modified_objects.erase(p);
8051 }
8052 }
8053
8054 // finalize shared_blobs
8055 for (auto sb : txc->shared_blobs) {
8056 string key;
8057 auto sbid = sb->get_sbid();
8058 get_shared_blob_key(sbid, &key);
8059 if (sb->persistent->empty()) {
8060 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
8061 << " is empty" << dendl;
8062 t->rmkey(PREFIX_SHARED_BLOB, key);
8063 } else {
8064 bufferlist bl;
8065 ::encode(*(sb->persistent), bl);
8066 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
8067 << " is " << bl.length() << " " << *sb << dendl;
8068 t->set(PREFIX_SHARED_BLOB, key, bl);
8069 }
8070 }
8071 }
8072
8073 void BlueStore::BSPerfTracker::update_from_perfcounters(
8074 PerfCounters &logger)
8075 {
8076 os_commit_latency.consume_next(
8077 logger.get_tavg_ms(
8078 l_bluestore_commit_lat));
8079 os_apply_latency.consume_next(
8080 logger.get_tavg_ms(
8081 l_bluestore_commit_lat));
8082 }
8083
8084 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
8085 {
8086 dout(20) << __func__ << " txc " << txc << std::hex
8087 << " allocated 0x" << txc->allocated
8088 << " released 0x" << txc->released
8089 << std::dec << dendl;
8090
8091 // We have to handle the case where we allocate *and* deallocate the
8092 // same region in this transaction. The freelist doesn't like that.
8093 // (Actually, the only thing that cares is the BitmapFreelistManager
8094 // debug check. But that's important.)
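  // illustrative: if this txc both allocated 0x10000~0x8000 and released
  // 0x14000~0x8000, the overlapping 0x14000~0x4000 is subtracted from both
  // sets, so the freelist sees only a net allocate of 0x10000~0x4000 and a
  // net release of 0x18000~0x4000.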
8095 interval_set<uint64_t> tmp_allocated, tmp_released;
8096 interval_set<uint64_t> *pallocated = &txc->allocated;
8097 interval_set<uint64_t> *preleased = &txc->released;
8098 if (!txc->allocated.empty() && !txc->released.empty()) {
8099 interval_set<uint64_t> overlap;
8100 overlap.intersection_of(txc->allocated, txc->released);
8101 if (!overlap.empty()) {
8102 tmp_allocated = txc->allocated;
8103 tmp_allocated.subtract(overlap);
8104 tmp_released = txc->released;
8105 tmp_released.subtract(overlap);
8106 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
8107 << ", new allocated 0x" << tmp_allocated
8108 << " released 0x" << tmp_released << std::dec
8109 << dendl;
8110 pallocated = &tmp_allocated;
8111 preleased = &tmp_released;
8112 }
8113 }
8114
8115 // update freelist with non-overlap sets
8116 for (interval_set<uint64_t>::iterator p = pallocated->begin();
8117 p != pallocated->end();
8118 ++p) {
8119 fm->allocate(p.get_start(), p.get_len(), t);
8120 }
8121 for (interval_set<uint64_t>::iterator p = preleased->begin();
8122 p != preleased->end();
8123 ++p) {
8124 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
8125 << "~" << p.get_len() << std::dec << dendl;
8126 fm->release(p.get_start(), p.get_len(), t);
8127 }
8128
8129 _txc_update_store_statfs(txc);
8130 }
8131
8132 void BlueStore::_txc_applied_kv(TransContext *txc)
8133 {
8134 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
8135 for (auto& o : *ls) {
8136 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
8137 << dendl;
8138 if (--o->flushing_count == 0) {
8139 std::lock_guard<std::mutex> l(o->flush_lock);
8140 o->flush_cond.notify_all();
8141 }
8142 }
8143 }
8144 }
8145
8146 void BlueStore::_txc_committed_kv(TransContext *txc)
8147 {
8148 dout(20) << __func__ << " txc " << txc << dendl;
8149
8150 // warning: we're calling onreadable_sync inside the sequencer lock
8151 if (txc->onreadable_sync) {
8152 txc->onreadable_sync->complete(0);
8153 txc->onreadable_sync = NULL;
8154 }
8155 unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
8156 if (txc->oncommit) {
8157 logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
8158 finishers[n]->queue(txc->oncommit);
8159 txc->oncommit = NULL;
8160 }
8161 if (txc->onreadable) {
8162 finishers[n]->queue(txc->onreadable);
8163 txc->onreadable = NULL;
8164 }
8165
8166 if (!txc->oncommits.empty()) {
8167 finishers[n]->queue(txc->oncommits);
8168 }
8169 }
8170
8171 void BlueStore::_txc_finish(TransContext *txc)
8172 {
8173 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
8174 assert(txc->state == TransContext::STATE_FINISHING);
8175
8176 for (auto& sb : txc->shared_blobs_written) {
8177 sb->bc.finish_write(sb->get_cache(), txc->seq);
8178 }
8179 txc->shared_blobs_written.clear();
8180
8181 while (!txc->removed_collections.empty()) {
8182 _queue_reap_collection(txc->removed_collections.front());
8183 txc->removed_collections.pop_front();
8184 }
8185
8186 OpSequencerRef osr = txc->osr;
8187 bool empty = false;
8188 bool submit_deferred = false;
8189 OpSequencer::q_list_t releasing_txc;
8190 {
8191 std::lock_guard<std::mutex> l(osr->qlock);
8192 txc->state = TransContext::STATE_DONE;
8193 bool notify = false;
8194 while (!osr->q.empty()) {
8195 TransContext *txc = &osr->q.front();
8196 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
8197 << dendl;
8198 if (txc->state != TransContext::STATE_DONE) {
8199 if (txc->state == TransContext::STATE_PREPARE &&
8200 deferred_aggressive) {
8201 // for _osr_drain_preceding()
8202 notify = true;
8203 }
8204 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
8205 osr->q.size() > g_conf->bluestore_max_deferred_txc) {
8206 submit_deferred = true;
8207 }
8208 break;
8209 }
8210
8211 osr->q.pop_front();
8212 releasing_txc.push_back(*txc);
8213 notify = true;
8214 }
8215 if (notify) {
8216 osr->qcond.notify_all();
8217 }
8218 if (osr->q.empty()) {
8219 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
8220 empty = true;
8221 }
8222 }
8223 while (!releasing_txc.empty()) {
8224 // release to allocator only after all preceding txc's have also
8225 // finished any deferred writes that potentially land in these
8226 // blocks
8227 auto txc = &releasing_txc.front();
8228 _txc_release_alloc(txc);
8229 releasing_txc.pop_front();
8230 txc->log_state_latency(logger, l_bluestore_state_done_lat);
8231 delete txc;
8232 }
8233
8234 if (submit_deferred) {
8235 // we're pinning memory; flush! we could be more fine-grained here but
8236 // i'm not sure it's worth the bother.
8237 deferred_try_submit();
8238 }
8239
8240 if (empty && osr->zombie) {
8241 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
8242 osr->_unregister();
8243 }
8244 }
8245
8246 void BlueStore::_txc_release_alloc(TransContext *txc)
8247 {
8248 // update allocator with full released set
8249 if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
8250 dout(10) << __func__ << " " << txc << " " << std::hex
8251 << txc->released << std::dec << dendl;
8252 for (interval_set<uint64_t>::iterator p = txc->released.begin();
8253 p != txc->released.end();
8254 ++p) {
8255 alloc->release(p.get_start(), p.get_len());
8256 }
8257 }
8258
8259 txc->allocated.clear();
8260 txc->released.clear();
8261 }
8262
8263 void BlueStore::_osr_drain_preceding(TransContext *txc)
8264 {
8265 OpSequencer *osr = txc->osr.get();
8266 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
8267 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
8268 {
8269 // submit anything pending
8270 deferred_lock.lock();
8271 if (osr->deferred_pending) {
8272 _deferred_submit_unlock(osr);
8273 } else {
8274 deferred_lock.unlock();
8275 }
8276 }
8277 {
8278 // wake up any previously finished deferred events
8279 std::lock_guard<std::mutex> l(kv_lock);
8280 kv_cond.notify_one();
8281 }
8282 osr->drain_preceding(txc);
8283 --deferred_aggressive;
8284 dout(10) << __func__ << " " << osr << " done" << dendl;
8285 }
8286
8287 void BlueStore::_osr_drain_all()
8288 {
8289 dout(10) << __func__ << dendl;
8290
8291 set<OpSequencerRef> s;
8292 {
8293 std::lock_guard<std::mutex> l(osr_lock);
8294 s = osr_set;
8295 }
8296 dout(20) << __func__ << " osr_set " << s << dendl;
8297
8298 ++deferred_aggressive;
8299 {
8300 // submit anything pending
8301 deferred_try_submit();
8302 }
8303 {
8304 // wake up any previously finished deferred events
8305 std::lock_guard<std::mutex> l(kv_lock);
8306 kv_cond.notify_one();
8307 }
8308 {
8309 std::lock_guard<std::mutex> l(kv_finalize_lock);
8310 kv_finalize_cond.notify_one();
8311 }
8312 for (auto osr : s) {
8313 dout(20) << __func__ << " drain " << osr << dendl;
8314 osr->drain();
8315 }
8316 --deferred_aggressive;
8317
8318 dout(10) << __func__ << " done" << dendl;
8319 }
8320
8321 void BlueStore::_osr_unregister_all()
8322 {
8323 set<OpSequencerRef> s;
8324 {
8325 std::lock_guard<std::mutex> l(osr_lock);
8326 s = osr_set;
8327 }
8328 dout(10) << __func__ << " " << s << dendl;
8329 for (auto osr : s) {
8330 osr->_unregister();
8331
8332 if (!osr->zombie) {
8333 // break link from Sequencer to us so that this OpSequencer
8334 // instance can die with this mount/umount cycle. note that
8335 // we assume umount() will not race against ~Sequencer.
8336 assert(osr->parent);
8337 osr->parent->p.reset();
8338 }
8339 }
8340 // nobody should be creating sequencers during umount either.
8341 {
8342 std::lock_guard<std::mutex> l(osr_lock);
8343 assert(osr_set.empty());
8344 }
8345 }
8346
8347 void BlueStore::_kv_start()
8348 {
8349 dout(10) << __func__ << dendl;
8350
8351 if (cct->_conf->bluestore_shard_finishers) {
8352 if (cct->_conf->osd_op_num_shards) {
8353 m_finisher_num = cct->_conf->osd_op_num_shards;
8354 } else {
8355 assert(bdev);
8356 if (bdev->is_rotational()) {
8357 m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
8358 } else {
8359 m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
8360 }
8361 }
8362 }
8363
8364 assert(m_finisher_num != 0);
8365
8366 for (int i = 0; i < m_finisher_num; ++i) {
8367 ostringstream oss;
8368 oss << "finisher-" << i;
8369 Finisher *f = new Finisher(cct, oss.str(), "finisher");
8370 finishers.push_back(f);
8371 }
8372
8373 deferred_finisher.start();
8374 for (auto f : finishers) {
8375 f->start();
8376 }
8377 kv_sync_thread.create("bstore_kv_sync");
8378 kv_finalize_thread.create("bstore_kv_final");
8379 }
8380
8381 void BlueStore::_kv_stop()
8382 {
8383 dout(10) << __func__ << dendl;
8384 {
8385 std::unique_lock<std::mutex> l(kv_lock);
8386 while (!kv_sync_started) {
8387 kv_cond.wait(l);
8388 }
8389 kv_stop = true;
8390 kv_cond.notify_all();
8391 }
8392 {
8393 std::unique_lock<std::mutex> l(kv_finalize_lock);
8394 while (!kv_finalize_started) {
8395 kv_finalize_cond.wait(l);
8396 }
8397 kv_finalize_stop = true;
8398 kv_finalize_cond.notify_all();
8399 }
8400 kv_sync_thread.join();
8401 kv_finalize_thread.join();
8402 assert(removed_collections.empty());
8403 {
8404 std::lock_guard<std::mutex> l(kv_lock);
8405 kv_stop = false;
8406 }
8407 {
8408 std::lock_guard<std::mutex> l(kv_finalize_lock);
8409 kv_finalize_stop = false;
8410 }
8411 dout(10) << __func__ << " stopping finishers" << dendl;
8412 deferred_finisher.wait_for_empty();
8413 deferred_finisher.stop();
8414 for (auto f : finishers) {
8415 f->wait_for_empty();
8416 f->stop();
8417 }
8418 dout(10) << __func__ << " stopped" << dendl;
8419 }
8420
8421 void BlueStore::_kv_sync_thread()
8422 {
8423 dout(10) << __func__ << " start" << dendl;
8424 std::unique_lock<std::mutex> l(kv_lock);
8425 assert(!kv_sync_started);
8426 kv_sync_started = true;
8427 kv_cond.notify_all();
8428 while (true) {
8429 assert(kv_committing.empty());
8430 if (kv_queue.empty() &&
8431 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
8432 !deferred_aggressive)) {
8433 if (kv_stop)
8434 break;
8435 dout(20) << __func__ << " sleep" << dendl;
8436 kv_cond.wait(l);
8437 dout(20) << __func__ << " wake" << dendl;
8438 } else {
8439 deque<TransContext*> kv_submitting;
8440 deque<DeferredBatch*> deferred_done, deferred_stable;
8441 uint64_t aios = 0, costs = 0;
8442
8443 dout(20) << __func__ << " committing " << kv_queue.size()
8444 << " submitting " << kv_queue_unsubmitted.size()
8445 << " deferred done " << deferred_done_queue.size()
8446 << " stable " << deferred_stable_queue.size()
8447 << dendl;
8448 kv_committing.swap(kv_queue);
8449 kv_submitting.swap(kv_queue_unsubmitted);
8450 deferred_done.swap(deferred_done_queue);
8451 deferred_stable.swap(deferred_stable_queue);
8452 aios = kv_ios;
8453 costs = kv_throttle_costs;
8454 kv_ios = 0;
8455 kv_throttle_costs = 0;
8456 utime_t start = ceph_clock_now();
8457 l.unlock();
8458
8459 dout(30) << __func__ << " committing " << kv_committing << dendl;
8460 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
8461 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
8462 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
8463
8464 bool force_flush = false;
8465 // if bluefs shares the one and only device with the data, we can
8466 // usually rely on the bluefs commit to flush the device and make
8467 // completed deferred aios stable; otherwise (separate devices, or no
8468 // bluefs) we must force an explicit flush below.
8469 if (bluefs_single_shared_device && bluefs) {
8470 if (aios) {
8471 force_flush = true;
8472 } else if (kv_committing.empty() && kv_submitting.empty() &&
8473 deferred_stable.empty()) {
8474 force_flush = true; // there's nothing else to commit!
8475 } else if (deferred_aggressive) {
8476 force_flush = true;
8477 }
8478 } else
8479 force_flush = true;
8480
8481 if (force_flush) {
8482 dout(20) << __func__ << " num_aios=" << aios
8483 << " force_flush=" << (int)force_flush
8484 << ", flushing, deferred done->stable" << dendl;
8485 // flush/barrier on block device
8486 bdev->flush();
8487
8488 // if we flush then deferred done are now deferred stable
8489 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
8490 deferred_done.end());
8491 deferred_done.clear();
8492 }
8493 utime_t after_flush = ceph_clock_now();
8494
8495 // we will use one final transaction to force a sync
8496 KeyValueDB::Transaction synct = db->get_transaction();
8497
8498 // increase {nid,blobid}_max? note that this covers both the
8499 // case where we are approaching the max and the case we passed
8500 // it. in either case, we increase the max in the earliest txn
8501 // we submit.
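      // illustrative: with bluestore_nid_prealloc = 1024 (defaults may
      // differ), once nid_last passes nid_max - 512 we persist a new
      // nid_max = nid_last + 1024 via the earliest transaction in this batch.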
8502 uint64_t new_nid_max = 0, new_blobid_max = 0;
8503 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
8504 KeyValueDB::Transaction t =
8505 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8506 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
8507 bufferlist bl;
8508 ::encode(new_nid_max, bl);
8509 t->set(PREFIX_SUPER, "nid_max", bl);
8510 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
8511 }
8512 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
8513 KeyValueDB::Transaction t =
8514 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8515 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
8516 bufferlist bl;
8517 ::encode(new_blobid_max, bl);
8518 t->set(PREFIX_SUPER, "blobid_max", bl);
8519 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
8520 }
8521
8522 for (auto txc : kv_committing) {
8523 if (txc->state == TransContext::STATE_KV_QUEUED) {
8524 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8525 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8526 assert(r == 0);
8527 _txc_applied_kv(txc);
8528 --txc->osr->kv_committing_serially;
8529 txc->state = TransContext::STATE_KV_SUBMITTED;
8530 if (txc->osr->kv_submitted_waiters) {
8531 std::lock_guard<std::mutex> l(txc->osr->qlock);
8532 if (txc->osr->_is_all_kv_submitted()) {
8533 txc->osr->qcond.notify_all();
8534 }
8535 }
8536
8537 } else {
8538 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8539 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8540 }
8541 if (txc->had_ios) {
8542 --txc->osr->txc_with_unstable_io;
8543 }
8544 }
8545
8546 // release throttle *before* we commit. this allows new ops
8547 // to be prepared and enter pipeline while we are waiting on
8548 // the kv commit sync/flush. then hopefully on the next
8549 // iteration there will already be ops awake. otherwise, we
8550 // end up going to sleep, and then wake up when the very first
8551 // transaction is ready for commit.
8552 throttle_bytes.put(costs);
8553
8554 PExtentVector bluefs_gift_extents;
8555 if (bluefs &&
8556 after_flush - bluefs_last_balance >
8557 cct->_conf->bluestore_bluefs_balance_interval) {
8558 bluefs_last_balance = after_flush;
8559 int r = _balance_bluefs_freespace(&bluefs_gift_extents);
8560 assert(r >= 0);
8561 if (r > 0) {
8562 for (auto& p : bluefs_gift_extents) {
8563 bluefs_extents.insert(p.offset, p.length);
8564 }
8565 bufferlist bl;
8566 ::encode(bluefs_extents, bl);
8567 dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
8568 << bluefs_extents << std::dec << dendl;
8569 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
8570 }
8571 }
8572
8573 // cleanup sync deferred keys
8574 for (auto b : deferred_stable) {
8575 for (auto& txc : b->txcs) {
8576 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
8577 if (!wt.released.empty()) {
8578 // kraken replay compat only
8579 txc.released = wt.released;
8580 dout(10) << __func__ << " deferred txn has released "
8581 << txc.released
8582 << " (we just upgraded from kraken) on " << &txc << dendl;
8583 _txc_finalize_kv(&txc, synct);
8584 }
8585 // cleanup the deferred
8586 string key;
8587 get_deferred_key(wt.seq, &key);
8588 synct->rm_single_key(PREFIX_DEFERRED, key);
8589 }
8590 }
8591
8592 // submit synct synchronously (block and wait for it to commit)
8593 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
8594 assert(r == 0);
8595
8596 if (new_nid_max) {
8597 nid_max = new_nid_max;
8598 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
8599 }
8600 if (new_blobid_max) {
8601 blobid_max = new_blobid_max;
8602 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
8603 }
8604
8605 {
8606 utime_t finish = ceph_clock_now();
8607 utime_t dur_flush = after_flush - start;
8608 utime_t dur_kv = finish - after_flush;
8609 utime_t dur = finish - start;
8610 dout(20) << __func__ << " committed " << kv_committing.size()
8611 << " cleaned " << deferred_stable.size()
8612 << " in " << dur
8613 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
8614 << dendl;
8615 logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
8616 logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
8617 logger->tinc(l_bluestore_kv_lat, dur);
8618 }
8619
8620 if (bluefs) {
8621 if (!bluefs_gift_extents.empty()) {
8622 _commit_bluefs_freespace(bluefs_gift_extents);
8623 }
8624 for (auto p = bluefs_extents_reclaiming.begin();
8625 p != bluefs_extents_reclaiming.end();
8626 ++p) {
8627 dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
8628 << p.get_start() << "~" << p.get_len() << std::dec
8629 << dendl;
8630 alloc->release(p.get_start(), p.get_len());
8631 }
8632 bluefs_extents_reclaiming.clear();
8633 }
8634
8635 {
8636 std::unique_lock<std::mutex> m(kv_finalize_lock);
8637 if (kv_committing_to_finalize.empty()) {
8638 kv_committing_to_finalize.swap(kv_committing);
8639 } else {
8640 kv_committing_to_finalize.insert(
8641 kv_committing_to_finalize.end(),
8642 kv_committing.begin(),
8643 kv_committing.end());
8644 kv_committing.clear();
8645 }
8646 if (deferred_stable_to_finalize.empty()) {
8647 deferred_stable_to_finalize.swap(deferred_stable);
8648 } else {
8649 deferred_stable_to_finalize.insert(
8650 deferred_stable_to_finalize.end(),
8651 deferred_stable.begin(),
8652 deferred_stable.end());
8653 deferred_stable.clear();
8654 }
8655 kv_finalize_cond.notify_one();
8656 }
8657
8658 l.lock();
8659 // previously deferred "done" are now "stable" by virtue of this
8660 // commit cycle.
8661 deferred_stable_queue.swap(deferred_done);
8662 }
8663 }
8664 dout(10) << __func__ << " finish" << dendl;
8665 kv_sync_started = false;
8666 }
8667
8668 void BlueStore::_kv_finalize_thread()
8669 {
8670 deque<TransContext*> kv_committed;
8671 deque<DeferredBatch*> deferred_stable;
8672 dout(10) << __func__ << " start" << dendl;
8673 std::unique_lock<std::mutex> l(kv_finalize_lock);
8674 assert(!kv_finalize_started);
8675 kv_finalize_started = true;
8676 kv_finalize_cond.notify_all();
8677 while (true) {
8678 assert(kv_committed.empty());
8679 assert(deferred_stable.empty());
8680 if (kv_committing_to_finalize.empty() &&
8681 deferred_stable_to_finalize.empty()) {
8682 if (kv_finalize_stop)
8683 break;
8684 dout(20) << __func__ << " sleep" << dendl;
8685 kv_finalize_cond.wait(l);
8686 dout(20) << __func__ << " wake" << dendl;
8687 } else {
8688 kv_committed.swap(kv_committing_to_finalize);
8689 deferred_stable.swap(deferred_stable_to_finalize);
8690 l.unlock();
8691 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
8692 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
8693
8694 while (!kv_committed.empty()) {
8695 TransContext *txc = kv_committed.front();
8696 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8697 _txc_state_proc(txc);
8698 kv_committed.pop_front();
8699 }
8700
8701 for (auto b : deferred_stable) {
8702 auto p = b->txcs.begin();
8703 while (p != b->txcs.end()) {
8704 TransContext *txc = &*p;
8705 p = b->txcs.erase(p); // unlink here because
8706 _txc_state_proc(txc); // this may destroy txc
8707 }
8708 delete b;
8709 }
8710 deferred_stable.clear();
8711
8712 if (!deferred_aggressive) {
8713 if (deferred_queue_size >= deferred_batch_ops.load() ||
8714 throttle_deferred_bytes.past_midpoint()) {
8715 deferred_try_submit();
8716 }
8717 }
8718
8719 // this is as good a place as any ...
8720 _reap_collections();
8721
8722 l.lock();
8723 }
8724 }
8725 dout(10) << __func__ << " finish" << dendl;
8726 kv_finalize_started = false;
8727 }
8728
8729 bluestore_deferred_op_t *BlueStore::_get_deferred_op(
8730 TransContext *txc, OnodeRef o)
8731 {
8732 if (!txc->deferred_txn) {
8733 txc->deferred_txn = new bluestore_deferred_transaction_t;
8734 }
8735 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
8736 return &txc->deferred_txn->ops.back();
8737 }
8738
8739 void BlueStore::_deferred_queue(TransContext *txc)
8740 {
8741 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
8742 deferred_lock.lock();
8743 if (!txc->osr->deferred_pending &&
8744 !txc->osr->deferred_running) {
8745 deferred_queue.push_back(*txc->osr);
8746 }
8747 if (!txc->osr->deferred_pending) {
8748 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
8749 }
8750 ++deferred_queue_size;
8751 txc->osr->deferred_pending->txcs.push_back(*txc);
8752 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
8753 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
8754 const auto& op = *opi;
8755 assert(op.op == bluestore_deferred_op_t::OP_WRITE);
8756 bufferlist::const_iterator p = op.data.begin();
8757 for (auto e : op.extents) {
8758 txc->osr->deferred_pending->prepare_write(
8759 cct, wt.seq, e.offset, e.length, p);
8760 }
8761 }
8762 if (deferred_aggressive &&
8763 !txc->osr->deferred_running) {
8764 _deferred_submit_unlock(txc->osr.get());
8765 } else {
8766 deferred_lock.unlock();
8767 }
8768 }
8769
8770 void BlueStore::deferred_try_submit()
8771 {
8772 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
8773 << deferred_queue_size << " txcs" << dendl;
8774 std::lock_guard<std::mutex> l(deferred_lock);
8775 vector<OpSequencerRef> osrs;
8776 osrs.reserve(deferred_queue.size());
8777 for (auto& osr : deferred_queue) {
8778 osrs.push_back(&osr);
8779 }
8780 for (auto& osr : osrs) {
8781 if (osr->deferred_pending) {
8782 if (!osr->deferred_running) {
8783 _deferred_submit_unlock(osr.get());
8784 deferred_lock.lock();
8785 } else {
8786 dout(20) << __func__ << " osr " << osr << " already has running"
8787 << dendl;
8788 }
8789 } else {
8790 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
8791 }
8792 }
8793 }
8794
8795 void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
8796 {
8797 dout(10) << __func__ << " osr " << osr
8798 << " " << osr->deferred_pending->iomap.size() << " ios pending "
8799 << dendl;
8800 assert(osr->deferred_pending);
8801 assert(!osr->deferred_running);
8802
8803 auto b = osr->deferred_pending;
8804 deferred_queue_size -= b->seq_bytes.size();
8805 assert(deferred_queue_size >= 0);
8806
8807 osr->deferred_running = osr->deferred_pending;
8808 osr->deferred_pending = nullptr;
8809
8810 uint64_t start = 0, pos = 0;
8811 bufferlist bl;
8812 auto i = b->iomap.begin();
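  // the loop below coalesces byte-contiguous iomap entries into one buffer
  // and issues a single aio_write per contiguous run; e.g. two queued 4KB
  // deferred writes at 0x1000 and 0x2000 become one 8KB aio at 0x1000.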
8813 while (true) {
8814 if (i == b->iomap.end() || i->first != pos) {
8815 if (bl.length()) {
8816 dout(20) << __func__ << " write 0x" << std::hex
8817 << start << "~" << bl.length()
8818 << " crc " << bl.crc32c(-1) << std::dec << dendl;
8819 if (!g_conf->bluestore_debug_omit_block_device_write) {
8820 logger->inc(l_bluestore_deferred_write_ops);
8821 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
8822 int r = bdev->aio_write(start, bl, &b->ioc, false);
8823 assert(r == 0);
8824 }
8825 }
8826 if (i == b->iomap.end()) {
8827 break;
8828 }
8829 start = 0;
8830 pos = i->first;
8831 bl.clear();
8832 }
8833 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
8834 << std::hex << pos << "~" << i->second.bl.length() << std::dec
8835 << dendl;
8836 if (!bl.length()) {
8837 start = pos;
8838 }
8839 pos += i->second.bl.length();
8840 bl.claim_append(i->second.bl);
8841 ++i;
8842 }
8843
8844 deferred_lock.unlock();
8845 bdev->aio_submit(&b->ioc);
8846 }
8847
8848 struct C_DeferredTrySubmit : public Context {
8849 BlueStore *store;
8850 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
8851 void finish(int r) {
8852 store->deferred_try_submit();
8853 }
8854 };
8855
8856 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
8857 {
8858 dout(10) << __func__ << " osr " << osr << dendl;
8859 assert(osr->deferred_running);
8860 DeferredBatch *b = osr->deferred_running;
8861
8862 {
8863 std::lock_guard<std::mutex> l(deferred_lock);
8864 assert(osr->deferred_running == b);
8865 osr->deferred_running = nullptr;
8866 if (!osr->deferred_pending) {
8867 dout(20) << __func__ << " dequeueing" << dendl;
8868 auto q = deferred_queue.iterator_to(*osr);
8869 deferred_queue.erase(q);
8870 } else if (deferred_aggressive) {
8871 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
8872 deferred_finisher.queue(new C_DeferredTrySubmit(this));
8873 } else {
8874 dout(20) << __func__ << " leaving queued, more pending" << dendl;
8875 }
8876 }
8877
8878 {
8879 uint64_t costs = 0;
8880 std::lock_guard<std::mutex> l2(osr->qlock);
8881 for (auto& i : b->txcs) {
8882 TransContext *txc = &i;
8883 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
8884 costs += txc->cost;
8885 }
8886 osr->qcond.notify_all();
8887 throttle_deferred_bytes.put(costs);
8888 std::lock_guard<std::mutex> l(kv_lock);
8889 deferred_done_queue.emplace_back(b);
8890 }
8891
8892 // in the normal case, do not bother waking up the kv thread; it will
8893 // catch us on the next commit anyway.
8894 if (deferred_aggressive) {
8895 std::lock_guard<std::mutex> l(kv_lock);
8896 kv_cond.notify_one();
8897 }
8898 }
8899
8900 int BlueStore::_deferred_replay()
8901 {
8902 dout(10) << __func__ << " start" << dendl;
8903 OpSequencerRef osr = new OpSequencer(cct, this);
8904 int count = 0;
8905 int r = 0;
8906 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
8907 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
8908 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
8909 << dendl;
8910 bluestore_deferred_transaction_t *deferred_txn =
8911 new bluestore_deferred_transaction_t;
8912 bufferlist bl = it->value();
8913 bufferlist::iterator p = bl.begin();
8914 try {
8915 ::decode(*deferred_txn, p);
8916 } catch (buffer::error& e) {
8917 derr << __func__ << " failed to decode deferred txn "
8918 << pretty_binary_string(it->key()) << dendl;
8919 delete deferred_txn;
8920 r = -EIO;
8921 goto out;
8922 }
8923 TransContext *txc = _txc_create(osr.get());
8924 txc->deferred_txn = deferred_txn;
8925 txc->state = TransContext::STATE_KV_DONE;
8926 _txc_state_proc(txc);
8927 }
8928 out:
8929 dout(20) << __func__ << " draining osr" << dendl;
8930 _osr_drain_all();
8931 osr->discard();
8932 dout(10) << __func__ << " completed " << count << " events" << dendl;
8933 return r;
8934 }
8935
8936 // ---------------------------
8937 // transactions
8938
8939 int BlueStore::queue_transactions(
8940 Sequencer *posr,
8941 vector<Transaction>& tls,
8942 TrackedOpRef op,
8943 ThreadPool::TPHandle *handle)
8944 {
8945 FUNCTRACE();
8946 Context *onreadable;
8947 Context *ondisk;
8948 Context *onreadable_sync;
8949 ObjectStore::Transaction::collect_contexts(
8950 tls, &onreadable, &ondisk, &onreadable_sync);
8951
8952 if (cct->_conf->objectstore_blackhole) {
8953 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
8954 << dendl;
8955 delete ondisk;
8956 delete onreadable;
8957 delete onreadable_sync;
8958 return 0;
8959 }
8960 utime_t start = ceph_clock_now();
8961 // set up the sequencer
8962 OpSequencer *osr;
8963 assert(posr);
8964 if (posr->p) {
8965 osr = static_cast<OpSequencer *>(posr->p.get());
8966 dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
8967 } else {
8968 osr = new OpSequencer(cct, this);
8969 osr->parent = posr;
8970 posr->p = osr;
8971 dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
8972 }
8973
8974 // prepare
8975 TransContext *txc = _txc_create(osr);
8976 txc->onreadable = onreadable;
8977 txc->onreadable_sync = onreadable_sync;
8978 txc->oncommit = ondisk;
8979
8980 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
8981 (*p).set_osr(osr);
8982 txc->bytes += (*p).get_num_bytes();
8983 _txc_add_transaction(txc, &(*p));
8984 }
8985 _txc_calc_cost(txc);
8986
8987 _txc_write_nodes(txc, txc->t);
8988
8989 // journal deferred items
8990 if (txc->deferred_txn) {
8991 txc->deferred_txn->seq = ++deferred_seq;
8992 bufferlist bl;
8993 ::encode(*txc->deferred_txn, bl);
8994 string key;
8995 get_deferred_key(txc->deferred_txn->seq, &key);
8996 txc->t->set(PREFIX_DEFERRED, key, bl);
8997 }
8998
8999 _txc_finalize_kv(txc, txc->t);
9000 if (handle)
9001 handle->suspend_tp_timeout();
9002
9003 utime_t tstart = ceph_clock_now();
9004 throttle_bytes.get(txc->cost);
9005 if (txc->deferred_txn) {
9006 // ensure we do not block here because of deferred writes
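    // illustrative: if the deferred throttle is exhausted we briefly raise
    // deferred_aggressive, kick pending deferred batches and the kv thread,
    // and only then take the blocking get() once capacity frees up.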
9007 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
9008 dout(10) << __func__ << " failed to get throttle_deferred_bytes, going aggressive"
9009 << dendl;
9010 ++deferred_aggressive;
9011 deferred_try_submit();
9012 {
9013 // wake up any previously finished deferred events
9014 std::lock_guard<std::mutex> l(kv_lock);
9015 kv_cond.notify_one();
9016 }
9017 throttle_deferred_bytes.get(txc->cost);
9018 --deferred_aggressive;
9019 }
9020 }
9021 utime_t tend = ceph_clock_now();
9022
9023 if (handle)
9024 handle->reset_tp_timeout();
9025
9026 logger->inc(l_bluestore_txc);
9027
9028 // execute (start)
9029 _txc_state_proc(txc);
9030
9031 logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
9032 logger->tinc(l_bluestore_throttle_lat, tend - tstart);
9033 return 0;
9034 }
9035
9036 void BlueStore::_txc_aio_submit(TransContext *txc)
9037 {
9038 dout(10) << __func__ << " txc " << txc << dendl;
9039 bdev->aio_submit(&txc->ioc);
9040 }
9041
9042 void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
9043 {
9044 Transaction::iterator i = t->begin();
9045
9046 _dump_transaction(t);
9047
9048 vector<CollectionRef> cvec(i.colls.size());
9049 unsigned j = 0;
9050 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
9051 ++p, ++j) {
9052 cvec[j] = _get_collection(*p);
9053 }
9054 vector<OnodeRef> ovec(i.objects.size());
9055
9056 for (int pos = 0; i.have_op(); ++pos) {
9057 Transaction::Op *op = i.decode_op();
9058 int r = 0;
9059
9060 // no coll or obj
9061 if (op->op == Transaction::OP_NOP)
9062 continue;
9063
9064 // collection operations
9065 CollectionRef &c = cvec[op->cid];
9066 switch (op->op) {
9067 case Transaction::OP_RMCOLL:
9068 {
9069 const coll_t &cid = i.get_cid(op->cid);
9070 r = _remove_collection(txc, cid, &c);
9071 if (!r)
9072 continue;
9073 }
9074 break;
9075
9076 case Transaction::OP_MKCOLL:
9077 {
9078 assert(!c);
9079 const coll_t &cid = i.get_cid(op->cid);
9080 r = _create_collection(txc, cid, op->split_bits, &c);
9081 if (!r)
9082 continue;
9083 }
9084 break;
9085
9086 case Transaction::OP_SPLIT_COLLECTION:
9087 assert(0 == "deprecated");
9088 break;
9089
9090 case Transaction::OP_SPLIT_COLLECTION2:
9091 {
9092 uint32_t bits = op->split_bits;
9093 uint32_t rem = op->split_rem;
9094 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
9095 if (!r)
9096 continue;
9097 }
9098 break;
9099
9100 case Transaction::OP_COLL_HINT:
9101 {
9102 uint32_t type = op->hint_type;
9103 bufferlist hint;
9104 i.decode_bl(hint);
9105 bufferlist::iterator hiter = hint.begin();
9106 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
9107 uint32_t pg_num;
9108 uint64_t num_objs;
9109 ::decode(pg_num, hiter);
9110 ::decode(num_objs, hiter);
9111 dout(10) << __func__ << " expected_num_objects collection hint is a no-op,"
9112 << " pg_num " << pg_num << " num_objects " << num_objs
9113 << dendl;
9114 } else {
9115 // Ignore the hint
9116 dout(10) << __func__ << " unknown collection hint " << type << dendl;
9117 }
9118 continue;
9119 }
9120 break;
9121
9122 case Transaction::OP_COLL_SETATTR:
9123 r = -EOPNOTSUPP;
9124 break;
9125
9126 case Transaction::OP_COLL_RMATTR:
9127 r = -EOPNOTSUPP;
9128 break;
9129
9130 case Transaction::OP_COLL_RENAME:
9131 assert(0 == "not implemented");
9132 break;
9133 }
9134 if (r < 0) {
9135 derr << __func__ << " error " << cpp_strerror(r)
9136 << " not handled on operation " << op->op
9137 << " (op " << pos << ", counting from 0)" << dendl;
9138 _dump_transaction(t, 0);
9139 assert(0 == "unexpected error");
9140 }
9141
9142 // these operations implicitly create the object
9143 bool create = false;
9144 if (op->op == Transaction::OP_TOUCH ||
9145 op->op == Transaction::OP_WRITE ||
9146 op->op == Transaction::OP_ZERO) {
9147 create = true;
9148 }
9149
9150 // object operations
9151 RWLock::WLocker l(c->lock);
9152 OnodeRef &o = ovec[op->oid];
9153 if (!o) {
9154 ghobject_t oid = i.get_oid(op->oid);
9155 o = c->get_onode(oid, create);
9156 }
9157 if (!create && (!o || !o->exists)) {
9158 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
9159 << i.get_oid(op->oid) << dendl;
9160 r = -ENOENT;
9161 goto endop;
9162 }
9163
9164 switch (op->op) {
9165 case Transaction::OP_TOUCH:
9166 r = _touch(txc, c, o);
9167 break;
9168
9169 case Transaction::OP_WRITE:
9170 {
9171 uint64_t off = op->off;
9172 uint64_t len = op->len;
9173 uint32_t fadvise_flags = i.get_fadvise_flags();
9174 bufferlist bl;
9175 i.decode_bl(bl);
9176 r = _write(txc, c, o, off, len, bl, fadvise_flags);
9177 }
9178 break;
9179
9180 case Transaction::OP_ZERO:
9181 {
9182 uint64_t off = op->off;
9183 uint64_t len = op->len;
9184 r = _zero(txc, c, o, off, len);
9185 }
9186 break;
9187
9188 case Transaction::OP_TRIMCACHE:
9189 {
9190 // deprecated, no-op
9191 }
9192 break;
9193
9194 case Transaction::OP_TRUNCATE:
9195 {
9196 uint64_t off = op->off;
9197 r = _truncate(txc, c, o, off);
9198 }
9199 break;
9200
9201 case Transaction::OP_REMOVE:
9202 {
9203 r = _remove(txc, c, o);
9204 }
9205 break;
9206
9207 case Transaction::OP_SETATTR:
9208 {
9209 string name = i.decode_string();
9210 bufferptr bp;
9211 i.decode_bp(bp);
9212 r = _setattr(txc, c, o, name, bp);
9213 }
9214 break;
9215
9216 case Transaction::OP_SETATTRS:
9217 {
9218 map<string, bufferptr> aset;
9219 i.decode_attrset(aset);
9220 r = _setattrs(txc, c, o, aset);
9221 }
9222 break;
9223
9224 case Transaction::OP_RMATTR:
9225 {
9226 string name = i.decode_string();
9227 r = _rmattr(txc, c, o, name);
9228 }
9229 break;
9230
9231 case Transaction::OP_RMATTRS:
9232 {
9233 r = _rmattrs(txc, c, o);
9234 }
9235 break;
9236
9237 case Transaction::OP_CLONE:
9238 {
9239 OnodeRef& no = ovec[op->dest_oid];
9240 if (!no) {
9241 const ghobject_t& noid = i.get_oid(op->dest_oid);
9242 no = c->get_onode(noid, true);
9243 }
9244 r = _clone(txc, c, o, no);
9245 }
9246 break;
9247
9248 case Transaction::OP_CLONERANGE:
9249 assert(0 == "deprecated");
9250 break;
9251
9252 case Transaction::OP_CLONERANGE2:
9253 {
9254 OnodeRef& no = ovec[op->dest_oid];
9255 if (!no) {
9256 const ghobject_t& noid = i.get_oid(op->dest_oid);
9257 no = c->get_onode(noid, true);
9258 }
9259 uint64_t srcoff = op->off;
9260 uint64_t len = op->len;
9261 uint64_t dstoff = op->dest_off;
9262 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
9263 }
9264 break;
9265
9266 case Transaction::OP_COLL_ADD:
9267 assert(0 == "not implemented");
9268 break;
9269
9270 case Transaction::OP_COLL_REMOVE:
9271 assert(0 == "not implemented");
9272 break;
9273
9274 case Transaction::OP_COLL_MOVE:
9275 assert(0 == "deprecated");
9276 break;
9277
9278 case Transaction::OP_COLL_MOVE_RENAME:
9279 case Transaction::OP_TRY_RENAME:
9280 {
9281 assert(op->cid == op->dest_cid);
9282 const ghobject_t& noid = i.get_oid(op->dest_oid);
9283 OnodeRef& no = ovec[op->dest_oid];
9284 if (!no) {
9285 no = c->get_onode(noid, false);
9286 }
9287 r = _rename(txc, c, o, no, noid);
9288 }
9289 break;
9290
9291 case Transaction::OP_OMAP_CLEAR:
9292 {
9293 r = _omap_clear(txc, c, o);
9294 }
9295 break;
9296 case Transaction::OP_OMAP_SETKEYS:
9297 {
9298 bufferlist aset_bl;
9299 i.decode_attrset_bl(&aset_bl);
9300 r = _omap_setkeys(txc, c, o, aset_bl);
9301 }
9302 break;
9303 case Transaction::OP_OMAP_RMKEYS:
9304 {
9305 bufferlist keys_bl;
9306 i.decode_keyset_bl(&keys_bl);
9307 r = _omap_rmkeys(txc, c, o, keys_bl);
9308 }
9309 break;
9310 case Transaction::OP_OMAP_RMKEYRANGE:
9311 {
9312 string first, last;
9313 first = i.decode_string();
9314 last = i.decode_string();
9315 r = _omap_rmkey_range(txc, c, o, first, last);
9316 }
9317 break;
9318 case Transaction::OP_OMAP_SETHEADER:
9319 {
9320 bufferlist bl;
9321 i.decode_bl(bl);
9322 r = _omap_setheader(txc, c, o, bl);
9323 }
9324 break;
9325
9326 case Transaction::OP_SETALLOCHINT:
9327 {
9328 r = _set_alloc_hint(txc, c, o,
9329 op->expected_object_size,
9330 op->expected_write_size,
9331 op->alloc_hint_flags);
9332 }
9333 break;
9334
9335 default:
9336 derr << __func__ << " bad op " << op->op << dendl;
9337 ceph_abort();
9338 }
9339
9340 endop:
9341 if (r < 0) {
9342 bool ok = false;
9343
9344 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
9345 op->op == Transaction::OP_CLONE ||
9346 op->op == Transaction::OP_CLONERANGE2 ||
9347 op->op == Transaction::OP_COLL_ADD ||
9348 op->op == Transaction::OP_SETATTR ||
9349 op->op == Transaction::OP_SETATTRS ||
9350 op->op == Transaction::OP_RMATTR ||
9351 op->op == Transaction::OP_OMAP_SETKEYS ||
9352 op->op == Transaction::OP_OMAP_RMKEYS ||
9353 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
9354 op->op == Transaction::OP_OMAP_SETHEADER))
9355 // -ENOENT is usually okay
9356 ok = true;
9357 if (r == -ENODATA)
9358 ok = true;
9359
9360 if (!ok) {
9361 const char *msg = "unexpected error code";
9362
9363 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
9364 op->op == Transaction::OP_CLONE ||
9365 op->op == Transaction::OP_CLONERANGE2))
9366 msg = "ENOENT on clone suggests osd bug";
9367
9368 if (r == -ENOSPC)
9369 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
9370 // by partially applying transactions.
9371 msg = "ENOSPC from bluestore, misconfigured cluster";
9372
9373 if (r == -ENOTEMPTY) {
9374 msg = "ENOTEMPTY suggests garbage data in osd data dir";
9375 }
9376
9377 derr << __func__ << " error " << cpp_strerror(r)
9378 << " not handled on operation " << op->op
9379 << " (op " << pos << ", counting from 0)"
9380 << dendl;
9381 derr << msg << dendl;
9382 _dump_transaction(t, 0);
9383 assert(0 == "unexpected error");
9384 }
9385 }
9386 }
9387 }
9388
9389
9390
9391 // -----------------
9392 // write operations
9393
9394 int BlueStore::_touch(TransContext *txc,
9395 CollectionRef& c,
9396 OnodeRef &o)
9397 {
9398 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
9399 int r = 0;
9400 _assign_nid(txc, o);
9401 txc->write_onode(o);
9402 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
9403 return r;
9404 }
9405
9406 void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
9407 {
9408 if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
9409 return;
9410 dout(log_level) << __func__ << " " << o << " " << o->oid
9411 << " nid " << o->onode.nid
9412 << " size 0x" << std::hex << o->onode.size
9413 << " (" << std::dec << o->onode.size << ")"
9414 << " expected_object_size " << o->onode.expected_object_size
9415 << " expected_write_size " << o->onode.expected_write_size
9416 << " in " << o->onode.extent_map_shards.size() << " shards"
9417 << ", " << o->extent_map.spanning_blob_map.size()
9418 << " spanning blobs"
9419 << dendl;
9420 for (auto p = o->onode.attrs.begin();
9421 p != o->onode.attrs.end();
9422 ++p) {
9423 dout(log_level) << __func__ << " attr " << p->first
9424 << " len " << p->second.length() << dendl;
9425 }
9426 _dump_extent_map(o->extent_map, log_level);
9427 }
9428
9429 void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
9430 {
9431 uint64_t pos = 0;
9432 for (auto& s : em.shards) {
9433 dout(log_level) << __func__ << " shard " << *s.shard_info
9434 << (s.loaded ? " (loaded)" : "")
9435 << (s.dirty ? " (dirty)" : "")
9436 << dendl;
9437 }
9438 for (auto& e : em.extent_map) {
9439 dout(log_level) << __func__ << " " << e << dendl;
9440 assert(e.logical_offset >= pos);
9441 pos = e.logical_offset + e.length;
9442 const bluestore_blob_t& blob = e.blob->get_blob();
9443 if (blob.has_csum()) {
9444 vector<uint64_t> v;
9445 unsigned n = blob.get_csum_count();
9446 for (unsigned i = 0; i < n; ++i)
9447 v.push_back(blob.get_csum_item(i));
9448 dout(log_level) << __func__ << " csum: " << std::hex << v << std::dec
9449 << dendl;
9450 }
9451 std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
9452 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
9453 dout(log_level) << __func__ << " 0x" << std::hex << i.first
9454 << "~" << i.second->length << std::dec
9455 << " " << *i.second << dendl;
9456 }
9457 }
9458 }
9459
9460 void BlueStore::_dump_transaction(Transaction *t, int log_level)
9461 {
9462 dout(log_level) << " transaction dump:\n";
9463 JSONFormatter f(true);
9464 f.open_object_section("transaction");
9465 t->dump(&f);
9466 f.close_section();
9467 f.flush(*_dout);
9468 *_dout << dendl;
9469 }
9470
9471 void BlueStore::_pad_zeros(
9472 bufferlist *bl, uint64_t *offset,
9473 uint64_t chunk_size)
9474 {
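// Illustrative example (assuming chunk_size = 0x1000): a write of 0x100
// bytes at *offset = 0x1234 has front_pad = 0x234 and, because the data
// ends at 0x1334 inside the same chunk, back_pad = 0xccc; the buffer is
// rebuilt as a single zero-padded 0x1000-byte chunk and *offset becomes
// 0x1000.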
9475 auto length = bl->length();
9476 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
9477 << " chunk_size 0x" << chunk_size << std::dec << dendl;
9478 dout(40) << "before:\n";
9479 bl->hexdump(*_dout);
9480 *_dout << dendl;
9481 // front
9482 size_t front_pad = *offset % chunk_size;
9483 size_t back_pad = 0;
9484 size_t pad_count = 0;
9485 if (front_pad) {
9486 size_t front_copy = MIN(chunk_size - front_pad, length);
9487 bufferptr z = buffer::create_page_aligned(chunk_size);
9488 z.zero(0, front_pad, false);
9489 pad_count += front_pad;
9490 bl->copy(0, front_copy, z.c_str() + front_pad);
9491 if (front_copy + front_pad < chunk_size) {
9492 back_pad = chunk_size - (length + front_pad);
9493 z.zero(front_pad + length, back_pad, false);
9494 pad_count += back_pad;
9495 }
9496 bufferlist old, t;
9497 old.swap(*bl);
9498 t.substr_of(old, front_copy, length - front_copy);
9499 bl->append(z);
9500 bl->claim_append(t);
9501 *offset -= front_pad;
9502 length += pad_count;
9503 }
9504
9505 // back
9506 uint64_t end = *offset + length;
9507 unsigned back_copy = end % chunk_size;
9508 if (back_copy) {
9509 assert(back_pad == 0);
9510 back_pad = chunk_size - back_copy;
9511 assert(back_copy <= length);
9512 bufferptr tail(chunk_size);
9513 bl->copy(length - back_copy, back_copy, tail.c_str());
9514 tail.zero(back_copy, back_pad, false);
9515 bufferlist old;
9516 old.swap(*bl);
9517 bl->substr_of(old, 0, length - back_copy);
9518 bl->append(tail);
9519 length += back_pad;
9520 pad_count += back_pad;
9521 }
9522 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
9523 << back_pad << " on front/back, now 0x" << *offset << "~"
9524 << length << std::dec << dendl;
9525 dout(40) << "after:\n";
9526 bl->hexdump(*_dout);
9527 *_dout << dendl;
9528 if (pad_count)
9529 logger->inc(l_bluestore_write_pad_bytes, pad_count);
9530 assert(bl->length() == length);
9531 }
9532
9533 void BlueStore::_do_write_small(
9534 TransContext *txc,
9535 CollectionRef &c,
9536 OnodeRef o,
9537 uint64_t offset, uint64_t length,
9538 bufferlist::iterator& blp,
9539 WriteContext *wctx)
9540 {
9541 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9542 << std::dec << dendl;
9543 assert(length < min_alloc_size);
9544 uint64_t end_offs = offset + length;
9545
9546 logger->inc(l_bluestore_write_small);
9547 logger->inc(l_bluestore_write_small_bytes, length);
9548
9549 bufferlist bl;
9550 blp.copy(length, bl);
9551
9552 // Look for an existing mutable blob we can use.
9553 auto begin = o->extent_map.extent_map.begin();
9554 auto end = o->extent_map.extent_map.end();
9555 auto ep = o->extent_map.seek_lextent(offset);
9556 if (ep != begin) {
9557 --ep;
9558 if (ep->blob_end() <= offset) {
9559 ++ep;
9560 }
9561 }
9562 auto prev_ep = ep;
9563 if (prev_ep != begin) {
9564 --prev_ep;
9565 } else {
9566 prev_ep = end; // to avoid this extent check as it's a duplicate
9567 }
9568
9569 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9570 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9571 uint32_t alloc_len = min_alloc_size;
9572 auto offset0 = P2ALIGN(offset, alloc_len);
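// e.g. (assuming min_alloc_size = 0x1000 and target_blob_size = 0x10000):
// for offset = 0x1234, offset0 = 0x1000 (the enclosing allocation unit)
// and min_off = 0, so candidate extents are scanned over roughly
// [offset - 0x10000, offset + 0x10000], clamped at 0.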
9573
9574 bool any_change;
9575
9576 // search for a suitable extent in both the forward and reverse directions
9577 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
9578 // range, then check whether the blob can be reused via can_reuse_blob() or
9579 // whether a direct/deferred write can be applied (the latter for extents
9580 // covering or above 'offset' only).
9581 do {
9582 any_change = false;
9583
9584 if (ep != end && ep->logical_offset < offset + max_bsize) {
9585 BlobRef b = ep->blob;
9586 auto bstart = ep->blob_start();
9587 dout(20) << __func__ << " considering " << *b
9588 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9589 if (bstart >= end_offs) {
9590 dout(20) << __func__ << " ignoring distant " << *b << dendl;
9591 } else if (!b->get_blob().is_mutable()) {
9592 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
9593 } else if (ep->logical_offset % min_alloc_size !=
9594 ep->blob_offset % min_alloc_size) {
9595 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
9596 } else {
9597 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9598 // can we pad our head/tail out with zeros?
9599 uint64_t head_pad, tail_pad;
9600 head_pad = P2PHASE(offset, chunk_size);
9601 tail_pad = P2NPHASE(end_offs, chunk_size);
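// e.g. with chunk_size = 0x1000, a 0x300-byte write at offset 0x1234
// (end_offs = 0x1534) gives head_pad = 0x234 and tail_pad = 0xacc, i.e.
// the zero padding needed to reach the surrounding chunk boundaries.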
9602 if (head_pad || tail_pad) {
9603 o->extent_map.fault_range(db, offset - head_pad,
9604 end_offs - offset + head_pad + tail_pad);
9605 }
9606 if (head_pad &&
9607 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
9608 head_pad = 0;
9609 }
9610 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
9611 tail_pad = 0;
9612 }
9613
9614 uint64_t b_off = offset - head_pad - bstart;
9615 uint64_t b_len = length + head_pad + tail_pad;
9616
9617 // direct write into unused blocks of an existing mutable blob?
9618 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
9619 b->get_blob().get_ondisk_length() >= b_off + b_len &&
9620 b->get_blob().is_unused(b_off, b_len) &&
9621 b->get_blob().is_allocated(b_off, b_len)) {
9622 _apply_padding(head_pad, tail_pad, bl);
9623
9624 dout(20) << __func__ << " write to unused 0x" << std::hex
9625 << b_off << "~" << b_len
9626 << " pad 0x" << head_pad << " + 0x" << tail_pad
9627 << std::dec << " of mutable " << *b << dendl;
9628 _buffer_cache_write(txc, b, b_off, bl,
9629 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9630
9631 if (!g_conf->bluestore_debug_omit_block_device_write) {
9632 if (b_len <= prefer_deferred_size) {
9633 dout(20) << __func__ << " deferring small 0x" << std::hex
9634 << b_len << std::dec << " unused write via deferred" << dendl;
9635 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9636 op->op = bluestore_deferred_op_t::OP_WRITE;
9637 b->get_blob().map(
9638 b_off, b_len,
9639 [&](uint64_t offset, uint64_t length) {
9640 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9641 return 0;
9642 });
9643 op->data = bl;
9644 } else {
9645 b->get_blob().map_bl(
9646 b_off, bl,
9647 [&](uint64_t offset, bufferlist& t) {
9648 bdev->aio_write(offset, t,
9649 &txc->ioc, wctx->buffered);
9650 });
9651 }
9652 }
9653 b->dirty_blob().calc_csum(b_off, bl);
9654 dout(20) << __func__ << " lex old " << *ep << dendl;
9655 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
9656 b,
9657 &wctx->old_extents);
9658 b->dirty_blob().mark_used(le->blob_offset, le->length);
9659 txc->statfs_delta.stored() += le->length;
9660 dout(20) << __func__ << " lex " << *le << dendl;
9661 logger->inc(l_bluestore_write_small_unused);
9662 return;
9663 }
9664 // read some data to fill out the chunk?
9665 uint64_t head_read = P2PHASE(b_off, chunk_size);
9666 uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
9667 if ((head_read || tail_read) &&
9668 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
9669 head_read + tail_read < min_alloc_size) {
9670 b_off -= head_read;
9671 b_len += head_read + tail_read;
9672
9673 } else {
9674 head_read = tail_read = 0;
9675 }
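// e.g. if existing lextents forced head_pad to 0 and b_off ended up at
// 0x200 with chunk_size = 0x1000, then head_read = 0x200: those existing
// bytes are read back below so the deferred overwrite stays chunk-aligned.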
9676
9677 // chunk-aligned deferred overwrite?
9678 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
9679 b_off % chunk_size == 0 &&
9680 b_len % chunk_size == 0 &&
9681 b->get_blob().is_allocated(b_off, b_len)) {
9682
9683 _apply_padding(head_pad, tail_pad, bl);
9684
9685 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
9686 << " and tail 0x" << tail_read << std::dec << dendl;
9687 if (head_read) {
9688 bufferlist head_bl;
9689 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
9690 head_bl, 0);
9691 assert(r >= 0 && r <= (int)head_read);
9692 size_t zlen = head_read - r;
9693 if (zlen) {
9694 head_bl.append_zero(zlen);
9695 logger->inc(l_bluestore_write_pad_bytes, zlen);
9696 }
9697 bl.claim_prepend(head_bl);
9698 logger->inc(l_bluestore_write_penalty_read_ops);
9699 }
9700 if (tail_read) {
9701 bufferlist tail_bl;
9702 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
9703 tail_bl, 0);
9704 assert(r >= 0 && r <= (int)tail_read);
9705 size_t zlen = tail_read - r;
9706 if (zlen) {
9707 tail_bl.append_zero(zlen);
9708 logger->inc(l_bluestore_write_pad_bytes, zlen);
9709 }
9710 bl.claim_append(tail_bl);
9711 logger->inc(l_bluestore_write_penalty_read_ops);
9712 }
9713 logger->inc(l_bluestore_write_small_pre_read);
9714
9715 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9716 op->op = bluestore_deferred_op_t::OP_WRITE;
9717 _buffer_cache_write(txc, b, b_off, bl,
9718 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9719
9720 int r = b->get_blob().map(
9721 b_off, b_len,
9722 [&](uint64_t offset, uint64_t length) {
9723 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9724 return 0;
9725 });
9726 assert(r == 0);
9727 if (b->get_blob().csum_type) {
9728 b->dirty_blob().calc_csum(b_off, bl);
9729 }
9730 op->data.claim(bl);
9731 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
9732 << b_len << std::dec << " of mutable " << *b
9733 << " at " << op->extents << dendl;
9734 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
9735 b, &wctx->old_extents);
9736 b->dirty_blob().mark_used(le->blob_offset, le->length);
9737 txc->statfs_delta.stored() += le->length;
9738 dout(20) << __func__ << " lex " << *le << dendl;
9739 logger->inc(l_bluestore_write_small_deferred);
9740 return;
9741 }
9742 // try to reuse blob if we can
9743 if (b->can_reuse_blob(min_alloc_size,
9744 max_bsize,
9745 offset0 - bstart,
9746 &alloc_len)) {
9747 assert(alloc_len == min_alloc_size); // expecting data to always
9748 // fit into the reused blob
9749 // Need to check for pending writes that want to
9750 // reuse the same pextent. The rationale is that during GC two chunks
9751 // from garbage (possibly compressed) blobs can share logical space within
9752 // the same AU. That in turn might be caused by an unaligned len in
9753 // clone_range2. Hence the second write would fail when attempting to
9754 // reuse the blob in _do_alloc_write().
9755 if (!wctx->has_conflict(b,
9756 offset0,
9757 offset0 + alloc_len,
9758 min_alloc_size)) {
9759
9760 // we can't reuse pad_head/pad_tail since they might be truncated
9761 // due to existing extents
9762 uint64_t b_off = offset - bstart;
9763 uint64_t b_off0 = b_off;
9764 _pad_zeros(&bl, &b_off0, chunk_size);
9765
9766 dout(20) << __func__ << " reuse blob " << *b << std::hex
9767 << " (0x" << b_off0 << "~" << bl.length() << ")"
9768 << " (0x" << b_off << "~" << length << ")"
9769 << std::dec << dendl;
9770
9771 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9772 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9773 false, false);
9774 logger->inc(l_bluestore_write_small_unused);
9775 return;
9776 }
9777 }
9778 }
9779 ++ep;
9780 any_change = true;
9781 } // if (ep != end && ep->logical_offset < offset + max_bsize)
9782
9783 // check extent for reuse in reverse order
9784 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9785 BlobRef b = prev_ep->blob;
9786 auto bstart = prev_ep->blob_start();
9787 dout(20) << __func__ << " considering " << *b
9788 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9789 if (b->can_reuse_blob(min_alloc_size,
9790 max_bsize,
9791 offset0 - bstart,
9792 &alloc_len)) {
9793 assert(alloc_len == min_alloc_size); // expecting data to always
9794 // fit into the reused blob
9795 // Need to check for pending writes that want to
9796 // reuse the same pextent. The rationale is that during GC two chunks
9797 // from garbage (possibly compressed) blobs can share logical space within
9798 // the same AU. That in turn might be caused by an unaligned len in
9799 // clone_range2. Hence the second write would fail when attempting to
9800 // reuse the blob in _do_alloc_write().
9801 if (!wctx->has_conflict(b,
9802 offset0,
9803 offset0 + alloc_len,
9804 min_alloc_size)) {
9805
9806 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9807 uint64_t b_off = offset - bstart;
9808 uint64_t b_off0 = b_off;
9809 _pad_zeros(&bl, &b_off0, chunk_size);
9810
9811 dout(20) << __func__ << " reuse blob " << *b << std::hex
9812 << " (0x" << b_off0 << "~" << bl.length() << ")"
9813 << " (0x" << b_off << "~" << length << ")"
9814 << std::dec << dendl;
9815
9816 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9817 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9818 false, false);
9819 logger->inc(l_bluestore_write_small_unused);
9820 return;
9821 }
9822 }
9823 if (prev_ep != begin) {
9824 --prev_ep;
9825 any_change = true;
9826 } else {
9827 prev_ep = end; // to avoid useless first extent re-check
9828 }
9829 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
9830 } while (any_change);
9831
9832 // new blob.
9833
9834 BlobRef b = c->new_blob();
9835 uint64_t b_off = P2PHASE(offset, alloc_len);
9836 uint64_t b_off0 = b_off;
9837 _pad_zeros(&bl, &b_off0, block_size);
9838 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9839 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
9840 logger->inc(l_bluestore_write_small_new);
9841
9842 return;
9843 }
9844
9845 void BlueStore::_do_write_big(
9846 TransContext *txc,
9847 CollectionRef &c,
9848 OnodeRef o,
9849 uint64_t offset, uint64_t length,
9850 bufferlist::iterator& blp,
9851 WriteContext *wctx)
9852 {
9853 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9854 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
9855 << " compress " << (int)wctx->compress
9856 << dendl;
9857 logger->inc(l_bluestore_write_big);
9858 logger->inc(l_bluestore_write_big_bytes, length);
9859 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9860 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9861 while (length > 0) {
9862 bool new_blob = false;
9863 uint32_t l = MIN(max_bsize, length);
9864 BlobRef b;
9865 uint32_t b_off = 0;
9866
9867 // attempt to reuse an existing blob
9868 if (!wctx->compress) {
9869 // look for an existing mutable blob we can reuse
9870 auto begin = o->extent_map.extent_map.begin();
9871 auto end = o->extent_map.extent_map.end();
9872 auto ep = o->extent_map.seek_lextent(offset);
9873 auto prev_ep = ep;
9874 if (prev_ep != begin) {
9875 --prev_ep;
9876 } else {
9877 prev_ep = end; // to avoid this extent check as it's a duplicate
9878 }
9879 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9880 // search for a suitable extent in both the forward and reverse directions
9881 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
9882 // range, then check whether the blob can be reused via can_reuse_blob().
9883 bool any_change;
9884 do {
9885 any_change = false;
9886 if (ep != end && ep->logical_offset < offset + max_bsize) {
9887 if (offset >= ep->blob_start() &&
9888 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
9889 offset - ep->blob_start(),
9890 &l)) {
9891 b = ep->blob;
9892 b_off = offset - ep->blob_start();
9893 prev_ep = end; // to avoid check below
9894 dout(20) << __func__ << " reuse blob " << *b << std::hex
9895 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
9896 } else {
9897 ++ep;
9898 any_change = true;
9899 }
9900 }
9901
9902 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9903 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
9904 offset - prev_ep->blob_start(),
9905 &l)) {
9906 b = prev_ep->blob;
9907 b_off = offset - prev_ep->blob_start();
9908 dout(20) << __func__ << " reuse blob " << *b << std::hex
9909 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
9910 } else if (prev_ep != begin) {
9911 --prev_ep;
9912 any_change = true;
9913 } else {
9914 prev_ep = end; // to avoid useless first extent re-check
9915 }
9916 }
9917 } while (b == nullptr && any_change);
9918 }
9919 if (b == nullptr) {
9920 b = c->new_blob();
9921 b_off = 0;
9922 new_blob = true;
9923 }
9924
9925 bufferlist t;
9926 blp.copy(l, t);
9927 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
9928 offset += l;
9929 length -= l;
9930 logger->inc(l_bluestore_write_big_blobs);
9931 }
9932 }
9933
9934 int BlueStore::_do_alloc_write(
9935 TransContext *txc,
9936 CollectionRef coll,
9937 OnodeRef o,
9938 WriteContext *wctx)
9939 {
9940 dout(20) << __func__ << " txc " << txc
9941 << " " << wctx->writes.size() << " blobs"
9942 << dendl;
9943 if (wctx->writes.empty()) {
9944 return 0;
9945 }
9946
9947 CompressorRef c;
9948 double crr = 0;
9949 if (wctx->compress) {
9950 c = select_option(
9951 "compression_algorithm",
9952 compressor,
9953 [&]() {
9954 string val;
9955 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
9956 CompressorRef cp = compressor;
9957 if (!cp || cp->get_type_name() != val) {
9958 cp = Compressor::create(cct, val);
9959 }
9960 return boost::optional<CompressorRef>(cp);
9961 }
9962 return boost::optional<CompressorRef>();
9963 }
9964 );
9965
9966 crr = select_option(
9967 "compression_required_ratio",
9968 cct->_conf->bluestore_compression_required_ratio,
9969 [&]() {
9970 double val;
9971 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
9972 return boost::optional<double>(val);
9973 }
9974 return boost::optional<double>();
9975 }
9976 );
9977 }
9978
9979 // checksum
9980 int csum = csum_type.load();
9981 csum = select_option(
9982 "csum_type",
9983 csum,
9984 [&]() {
9985 int val;
9986 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
9987 return boost::optional<int>(val);
9988 }
9989 return boost::optional<int>();
9990 }
9991 );
9992
9993 // compress (as needed) and calc needed space
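// e.g. (assuming min_alloc_size = 0x1000 and crr = 0.875): a 0x10000-byte
// blob that compresses to 0x9c00 bytes rounds up to newlen = 0xa000, which
// is <= want_len = 0xe000 and < 0x10000, so the compressed copy is kept;
// had it only shrunk to 0xe400 (newlen = 0xf000), it would be rejected.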
9994 uint64_t need = 0;
9995 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9996 for (auto& wi : wctx->writes) {
9997 if (c && wi.blob_length > min_alloc_size) {
9998 utime_t start = ceph_clock_now();
9999
10000 // compress
10001 assert(wi.b_off == 0);
10002 assert(wi.blob_length == wi.bl.length());
10003
10004 // FIXME: memory alignment here is bad
10005 bufferlist t;
10006 int r = c->compress(wi.bl, t);
10007 assert(r == 0);
10008
10009 bluestore_compression_header_t chdr;
10010 chdr.type = c->get_type();
10011 chdr.length = t.length();
10012 ::encode(chdr, wi.compressed_bl);
10013 wi.compressed_bl.claim_append(t);
10014
10015 wi.compressed_len = wi.compressed_bl.length();
10016 uint64_t newlen = P2ROUNDUP(wi.compressed_len, min_alloc_size);
10017 uint64_t want_len_raw = wi.blob_length * crr;
10018 uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
10019 if (newlen <= want_len && newlen < wi.blob_length) {
10020 // Cool. We compressed at least as much as we were hoping to.
10021 // pad out to min_alloc_size
10022 wi.compressed_bl.append_zero(newlen - wi.compressed_len);
10023 logger->inc(l_bluestore_write_pad_bytes, newlen - wi.compressed_len);
10024 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
10025 << " -> 0x" << wi.compressed_len << " => 0x" << newlen
10026 << " with " << c->get_type()
10027 << std::dec << dendl;
10028 txc->statfs_delta.compressed() += wi.compressed_len;
10029 txc->statfs_delta.compressed_original() += wi.blob_length;
10030 txc->statfs_delta.compressed_allocated() += newlen;
10031 logger->inc(l_bluestore_compress_success_count);
10032 wi.compressed = true;
10033 need += newlen;
10034 } else {
10035 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
10036 << " compressed to 0x" << wi.compressed_len << " -> 0x" << newlen
10037 << " with " << c->get_type()
10038 << ", which is more than required 0x" << want_len_raw
10039 << " -> 0x" << want_len
10040 << ", leaving uncompressed"
10041 << std::dec << dendl;
10042 logger->inc(l_bluestore_compress_rejected_count);
10043 need += wi.blob_length;
10044 }
10045 logger->tinc(l_bluestore_compress_lat,
10046 ceph_clock_now() - start);
10047 } else {
10048 need += wi.blob_length;
10049 }
10050 }
10051 int r = alloc->reserve(need);
10052 if (r < 0) {
10053 derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
10054 << dendl;
10055 return r;
10056 }
10057 AllocExtentVector prealloc;
10058 prealloc.reserve(2 * wctx->writes.size());
10059 int prealloc_left = 0;
10060 prealloc_left = alloc->allocate(
10061 need, min_alloc_size, need,
10062 0, &prealloc);
10063 assert(prealloc_left == (int64_t)need);
10064 dout(20) << __func__ << " prealloc " << prealloc << dendl;
10065 auto prealloc_pos = prealloc.begin();
10066
10067 for (auto& wi : wctx->writes) {
10068 BlobRef b = wi.b;
10069 bluestore_blob_t& dblob = b->dirty_blob();
10070 uint64_t b_off = wi.b_off;
10071 bufferlist *l = &wi.bl;
10072 uint64_t final_length = wi.blob_length;
10073 uint64_t csum_length = wi.blob_length;
10074 unsigned csum_order = block_size_order;
10075 if (wi.compressed) {
10076 final_length = wi.compressed_bl.length();
10077 csum_length = final_length;
10078 csum_order = ctz(csum_length);
10079 l = &wi.compressed_bl;
10080 dblob.set_compressed(wi.blob_length, wi.compressed_len);
10081 } else if (wi.new_blob) {
10082 // initialize newly created blob only
10083 assert(dblob.is_mutable());
10084 if (l->length() != wi.blob_length) {
10085 // hrm, maybe we could do better here, but let's not bother.
10086 dout(20) << __func__ << " forcing csum_order to block_size_order "
10087 << block_size_order << dendl;
10088 csum_order = block_size_order;
10089 } else {
10090 csum_order = std::min(wctx->csum_order, ctz(l->length()));
10091 }
10092 // try to align blob with max_blob_size to improve
10093 // its reuse ratio, e.g. in case of reverse write
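// e.g. a 0x1000-byte write at logical offset 0xf000 with b_off = 0 and
// max_bsize = 0x10000 suggests blob offset 0xf000, leaving room for the
// blob to be filled backwards by later writes (the "reverse write" case),
// provided the offset is csum-chunk aligned and the data still fits.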
10094 uint32_t suggested_boff =
10095 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
10096 if ((suggested_boff % (1 << csum_order)) == 0 &&
10097 suggested_boff + final_length <= max_bsize &&
10098 suggested_boff > b_off) {
10099 dout(20) << __func__ << " forcing blob_offset to 0x"
10100 << std::hex << suggested_boff << std::dec << dendl;
10101 assert(suggested_boff >= b_off);
10102 csum_length += suggested_boff - b_off;
10103 b_off = suggested_boff;
10104 }
10105 if (csum != Checksummer::CSUM_NONE) {
10106 dout(20) << __func__ << " initialize csum setting for new blob " << *b
10107 << " csum_type " << Checksummer::get_csum_type_string(csum)
10108 << " csum_order " << csum_order
10109 << " csum_length 0x" << std::hex << csum_length << std::dec
10110 << dendl;
10111 dblob.init_csum(csum, csum_order, csum_length);
10112 }
10113 }
10114
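// Carve this blob's final_length out of the preallocated extents,
// splitting a prealloc extent when a blob needs only part of it
// (e.g. one 0x8000 prealloc extent can serve two 0x4000 blobs).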
10115 AllocExtentVector extents;
10116 int64_t left = final_length;
10117 while (left > 0) {
10118 assert(prealloc_left > 0);
10119 if (prealloc_pos->length <= left) {
10120 prealloc_left -= prealloc_pos->length;
10121 left -= prealloc_pos->length;
10122 txc->statfs_delta.allocated() += prealloc_pos->length;
10123 extents.push_back(*prealloc_pos);
10124 ++prealloc_pos;
10125 } else {
10126 extents.emplace_back(prealloc_pos->offset, left);
10127 prealloc_pos->offset += left;
10128 prealloc_pos->length -= left;
10129 prealloc_left -= left;
10130 txc->statfs_delta.allocated() += left;
10131 left = 0;
10132 break;
10133 }
10134 }
10135 for (auto& p : extents) {
10136 txc->allocated.insert(p.offset, p.length);
10137 }
10138 dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
10139
10140 dout(20) << __func__ << " blob " << *b << dendl;
10141 if (dblob.has_csum()) {
10142 dblob.calc_csum(b_off, *l);
10143 }
10144
10145 if (wi.mark_unused) {
10146 auto b_end = b_off + wi.bl.length();
10147 if (b_off) {
10148 dblob.add_unused(0, b_off);
10149 }
10150 if (b_end < wi.blob_length) {
10151 dblob.add_unused(b_end, wi.blob_length - b_end);
10152 }
10153 }
10154
10155 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
10156 b_off + (wi.b_off0 - wi.b_off),
10157 wi.length0,
10158 wi.b,
10159 nullptr);
10160 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
10161 txc->statfs_delta.stored() += le->length;
10162 dout(20) << __func__ << " lex " << *le << dendl;
10163 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
10164 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
10165
10166 // queue io
10167 if (!g_conf->bluestore_debug_omit_block_device_write) {
10168 if (l->length() <= prefer_deferred_size.load()) {
10169 dout(20) << __func__ << " deferring small 0x" << std::hex
10170 << l->length() << std::dec << " write via deferred" << dendl;
10171 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
10172 op->op = bluestore_deferred_op_t::OP_WRITE;
10173 int r = b->get_blob().map(
10174 b_off, l->length(),
10175 [&](uint64_t offset, uint64_t length) {
10176 op->extents.emplace_back(bluestore_pextent_t(offset, length));
10177 return 0;
10178 });
10179 assert(r == 0);
10180 op->data = *l;
10181 } else {
10182 b->get_blob().map_bl(
10183 b_off, *l,
10184 [&](uint64_t offset, bufferlist& t) {
10185 bdev->aio_write(offset, t, &txc->ioc, false);
10186 });
10187 }
10188 }
10189 }
10190 assert(prealloc_pos == prealloc.end());
10191 assert(prealloc_left == 0);
10192 return 0;
10193 }
10194
10195 void BlueStore::_wctx_finish(
10196 TransContext *txc,
10197 CollectionRef& c,
10198 OnodeRef o,
10199 WriteContext *wctx,
10200 set<SharedBlob*> *maybe_unshared_blobs)
10201 {
10202 auto oep = wctx->old_extents.begin();
10203 while (oep != wctx->old_extents.end()) {
10204 auto &lo = *oep;
10205 oep = wctx->old_extents.erase(oep);
10206 dout(20) << __func__ << " lex_old " << lo.e << dendl;
10207 BlobRef b = lo.e.blob;
10208 const bluestore_blob_t& blob = b->get_blob();
10209 if (blob.is_compressed()) {
10210 if (lo.blob_empty) {
10211 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
10212 }
10213 txc->statfs_delta.compressed_original() -= lo.e.length;
10214 }
10215 auto& r = lo.r;
10216 txc->statfs_delta.stored() -= lo.e.length;
10217 if (!r.empty()) {
10218 dout(20) << __func__ << " blob release " << r << dendl;
10219 if (blob.is_shared()) {
10220 PExtentVector final;
10221 c->load_shared_blob(b->shared_blob);
10222 for (auto e : r) {
10223 b->shared_blob->put_ref(
10224 e.offset, e.length, &final,
10225 b->is_referenced() ? nullptr : maybe_unshared_blobs);
10226 }
10227 dout(20) << __func__ << " shared_blob release " << final
10228 << " from " << *b->shared_blob << dendl;
10229 txc->write_shared_blob(b->shared_blob);
10230 r.clear();
10231 r.swap(final);
10232 }
10233 }
10234 // we can't invalidate our logical extents as we drop them because
10235 // other lextents (either in our onode or others) may still
10236 // reference them. but we can throw out anything that is no
10237 // longer allocated. Note that this will leave behind edge bits
10238 // that are no longer referenced but not deallocated (until they
10239 // age out of the cache naturally).
10240 b->discard_unallocated(c.get());
10241 for (auto e : r) {
10242 dout(20) << __func__ << " release " << e << dendl;
10243 txc->released.insert(e.offset, e.length);
10244 txc->statfs_delta.allocated() -= e.length;
10245 if (blob.is_compressed()) {
10246 txc->statfs_delta.compressed_allocated() -= e.length;
10247 }
10248 }
10249 delete &lo;
10250 if (b->is_spanning() && !b->is_referenced()) {
10251 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
10252 << dendl;
10253 o->extent_map.spanning_blob_map.erase(b->id);
10254 }
10255 }
10256 }
10257
10258 void BlueStore::_do_write_data(
10259 TransContext *txc,
10260 CollectionRef& c,
10261 OnodeRef o,
10262 uint64_t offset,
10263 uint64_t length,
10264 bufferlist& bl,
10265 WriteContext *wctx)
10266 {
10267 uint64_t end = offset + length;
10268 bufferlist::iterator p = bl.begin();
10269
10270 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
10271 (length != min_alloc_size)) {
10272 // we fall within the same block
10273 _do_write_small(txc, c, o, offset, length, p, wctx);
10274 } else {
10275 uint64_t head_offset, head_length;
10276 uint64_t middle_offset, middle_length;
10277 uint64_t tail_offset, tail_length;
10278
10279 head_offset = offset;
10280 head_length = P2NPHASE(offset, min_alloc_size);
10281
10282 tail_offset = P2ALIGN(end, min_alloc_size);
10283 tail_length = P2PHASE(end, min_alloc_size);
10284
10285 middle_offset = head_offset + head_length;
10286 middle_length = length - head_length - tail_length;
10287
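// e.g. (assuming min_alloc_size = 0x1000): a write of 0x8000 bytes at
// offset 0x1234 splits into a 0xdcc head at 0x1234, a 0x7000 middle at
// 0x2000, and a 0x234 tail at 0x9000.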
10288 if (head_length) {
10289 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
10290 }
10291
10292 if (middle_length) {
10293 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
10294 }
10295
10296 if (tail_length) {
10297 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
10298 }
10299 }
10300 }
10301
10302 void BlueStore::_choose_write_options(
10303 CollectionRef& c,
10304 OnodeRef o,
10305 uint32_t fadvise_flags,
10306 WriteContext *wctx)
10307 {
10308 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10309 dout(20) << __func__ << " will do buffered write" << dendl;
10310 wctx->buffered = true;
10311 } else if (cct->_conf->bluestore_default_buffered_write &&
10312 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10313 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10314 dout(20) << __func__ << " defaulting to buffered write" << dendl;
10315 wctx->buffered = true;
10316 }
10317
10318 // apply basic csum block size
10319 wctx->csum_order = block_size_order;
10320
10321 // compression parameters
10322 unsigned alloc_hints = o->onode.alloc_hint_flags;
10323 auto cm = select_option(
10324 "compression_mode",
10325 comp_mode.load(),
10326 [&]() {
10327 string val;
10328 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
10329 return boost::optional<Compressor::CompressionMode>(
10330 Compressor::get_comp_mode_type(val));
10331 }
10332 return boost::optional<Compressor::CompressionMode>();
10333 }
10334 );
10335
10336 wctx->compress = (cm != Compressor::COMP_NONE) &&
10337 ((cm == Compressor::COMP_FORCE) ||
10338 (cm == Compressor::COMP_AGGRESSIVE &&
10339 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
10340 (cm == Compressor::COMP_PASSIVE &&
10341 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
10342
10343 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
10344 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
10345 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
10346 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
10347 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
10348
10349 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
10350
10351 if (o->onode.expected_write_size) {
10352 wctx->csum_order = std::max(min_alloc_size_order,
10353 (uint8_t)ctz(o->onode.expected_write_size));
10354 } else {
10355 wctx->csum_order = min_alloc_size_order;
10356 }
10357
10358 if (wctx->compress) {
10359 wctx->target_blob_size = select_option(
10360 "compression_max_blob_size",
10361 comp_max_blob_size.load(),
10362 [&]() {
10363 int val;
10364 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
10365 return boost::optional<uint64_t>((uint64_t)val);
10366 }
10367 return boost::optional<uint64_t>();
10368 }
10369 );
10370 }
10371 } else {
10372 if (wctx->compress) {
10373 wctx->target_blob_size = select_option(
10374 "compression_min_blob_size",
10375 comp_min_blob_size.load(),
10376 [&]() {
10377 int val;
10378 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
10379 return boost::optional<uint64_t>((uint64_t)val);
10380 }
10381 return boost::optional<uint64_t>();
10382 }
10383 );
10384 }
10385 }
10386
10387 uint64_t max_bsize = max_blob_size.load();
10388 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
10389 wctx->target_blob_size = max_bsize;
10390 }
10391
10392 // set the min blob size floor at 2x the min_alloc_size, or else we
10393 // won't be able to allocate a smaller extent for the compressed
10394 // data.
10395 if (wctx->compress &&
10396 wctx->target_blob_size < min_alloc_size * 2) {
10397 wctx->target_blob_size = min_alloc_size * 2;
10398 }
10399
10400 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
10401 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
10402 << std::dec << dendl;
10403 }
10404
10405 int BlueStore::_do_gc(
10406 TransContext *txc,
10407 CollectionRef& c,
10408 OnodeRef o,
10409 const GarbageCollector& gc,
10410 const WriteContext& wctx,
10411 uint64_t *dirty_start,
10412 uint64_t *dirty_end)
10413 {
10414 auto& extents_to_collect = gc.get_extents_to_collect();
10415
10416 WriteContext wctx_gc;
10417 wctx_gc.fork(wctx); // make a clone for garbage collection
10418
10419 for (auto it = extents_to_collect.begin();
10420 it != extents_to_collect.end();
10421 ++it) {
10422 bufferlist bl;
10423 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
10424 assert(r == (int)it->length);
10425
10426 o->extent_map.fault_range(db, it->offset, it->length);
10427 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
10428 logger->inc(l_bluestore_gc_merged, it->length);
10429
10430 if (*dirty_start > it->offset) {
10431 *dirty_start = it->offset;
10432 }
10433
10434 if (*dirty_end < it->offset + it->length) {
10435 *dirty_end = it->offset + it->length;
10436 }
10437 }
10438
10439 dout(30) << __func__ << " alloc write" << dendl;
10440 int r = _do_alloc_write(txc, c, o, &wctx_gc);
10441 if (r < 0) {
10442 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10443 << dendl;
10444 return r;
10445 }
10446
10447 _wctx_finish(txc, c, o, &wctx_gc);
10448 return 0;
10449 }
10450
10451 int BlueStore::_do_write(
10452 TransContext *txc,
10453 CollectionRef& c,
10454 OnodeRef o,
10455 uint64_t offset,
10456 uint64_t length,
10457 bufferlist& bl,
10458 uint32_t fadvise_flags)
10459 {
10460 int r = 0;
10461
10462 dout(20) << __func__
10463 << " " << o->oid
10464 << " 0x" << std::hex << offset << "~" << length
10465 << " - have 0x" << o->onode.size
10466 << " (" << std::dec << o->onode.size << ")"
10467 << " bytes"
10468 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
10469 << dendl;
10470 _dump_onode(o);
10471
10472 if (length == 0) {
10473 return 0;
10474 }
10475
10476 uint64_t end = offset + length;
10477
10478 GarbageCollector gc(c->store->cct);
10479 int64_t benefit;
10480 auto dirty_start = offset;
10481 auto dirty_end = end;
10482
10483 WriteContext wctx;
10484 _choose_write_options(c, o, fadvise_flags, &wctx);
10485 o->extent_map.fault_range(db, offset, length);
10486 _do_write_data(txc, c, o, offset, length, bl, &wctx);
10487 r = _do_alloc_write(txc, c, o, &wctx);
10488 if (r < 0) {
10489 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10490 << dendl;
10491 goto out;
10492 }
10493
10494 // NB: _wctx_finish() will empty old_extents
10495 // so we must do gc estimation before that
10496 benefit = gc.estimate(offset,
10497 length,
10498 o->extent_map,
10499 wctx.old_extents,
10500 min_alloc_size);
10501
10502 _wctx_finish(txc, c, o, &wctx);
10503 if (end > o->onode.size) {
10504 dout(20) << __func__ << " extending size to 0x" << std::hex << end
10505 << std::dec << dendl;
10506 o->onode.size = end;
10507 }
10508
10509 if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
10510 if (!gc.get_extents_to_collect().empty()) {
10511 dout(20) << __func__ << " perform garbage collection, "
10512 << "expected benefit = " << benefit << " AUs" << dendl;
10513 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
10514 if (r < 0) {
10515 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
10516 << dendl;
10517 goto out;
10518 }
10519 }
10520 }
10521
10522 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
10523 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
10524
10525 r = 0;
10526
10527 out:
10528 return r;
10529 }
10530
10531 int BlueStore::_write(TransContext *txc,
10532 CollectionRef& c,
10533 OnodeRef& o,
10534 uint64_t offset, size_t length,
10535 bufferlist& bl,
10536 uint32_t fadvise_flags)
10537 {
10538 dout(15) << __func__ << " " << c->cid << " " << o->oid
10539 << " 0x" << std::hex << offset << "~" << length << std::dec
10540 << dendl;
10541 int r = 0;
10542 if (offset + length >= OBJECT_MAX_SIZE) {
10543 r = -E2BIG;
10544 } else {
10545 _assign_nid(txc, o);
10546 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
10547 txc->write_onode(o);
10548 }
10549 dout(10) << __func__ << " " << c->cid << " " << o->oid
10550 << " 0x" << std::hex << offset << "~" << length << std::dec
10551 << " = " << r << dendl;
10552 return r;
10553 }
10554
10555 int BlueStore::_zero(TransContext *txc,
10556 CollectionRef& c,
10557 OnodeRef& o,
10558 uint64_t offset, size_t length)
10559 {
10560 dout(15) << __func__ << " " << c->cid << " " << o->oid
10561 << " 0x" << std::hex << offset << "~" << length << std::dec
10562 << dendl;
10563 int r = 0;
10564 if (offset + length >= OBJECT_MAX_SIZE) {
10565 r = -E2BIG;
10566 } else {
10567 _assign_nid(txc, o);
10568 r = _do_zero(txc, c, o, offset, length);
10569 }
10570 dout(10) << __func__ << " " << c->cid << " " << o->oid
10571 << " 0x" << std::hex << offset << "~" << length << std::dec
10572 << " = " << r << dendl;
10573 return r;
10574 }
10575
10576 int BlueStore::_do_zero(TransContext *txc,
10577 CollectionRef& c,
10578 OnodeRef& o,
10579 uint64_t offset, size_t length)
10580 {
10581 dout(15) << __func__ << " " << c->cid << " " << o->oid
10582 << " 0x" << std::hex << offset << "~" << length << std::dec
10583 << dendl;
10584 int r = 0;
10585
10586 _dump_onode(o);
10587
10588 WriteContext wctx;
10589 o->extent_map.fault_range(db, offset, length);
10590 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10591 o->extent_map.dirty_range(offset, length);
10592 _wctx_finish(txc, c, o, &wctx);
10593
10594 if (length > 0 && offset + length > o->onode.size) {
10595 o->onode.size = offset + length;
10596 dout(20) << __func__ << " extending size to " << offset + length
10597 << dendl;
10598 }
10599 txc->write_onode(o);
10600
10601 dout(10) << __func__ << " " << c->cid << " " << o->oid
10602 << " 0x" << std::hex << offset << "~" << length << std::dec
10603 << " = " << r << dendl;
10604 return r;
10605 }
10606
10607 void BlueStore::_do_truncate(
10608 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
10609 set<SharedBlob*> *maybe_unshared_blobs)
10610 {
10611 dout(15) << __func__ << " " << c->cid << " " << o->oid
10612 << " 0x" << std::hex << offset << std::dec << dendl;
10613
10614 _dump_onode(o, 30);
10615
10616 if (offset == o->onode.size)
10617 return;
10618
10619 if (offset < o->onode.size) {
10620 WriteContext wctx;
10621 uint64_t length = o->onode.size - offset;
10622 o->extent_map.fault_range(db, offset, length);
10623 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10624 o->extent_map.dirty_range(offset, length);
10625 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
10626
10627 // if we have shards past EOF, ask for a reshard
10628 if (!o->onode.extent_map_shards.empty() &&
10629 o->onode.extent_map_shards.back().offset >= offset) {
10630 dout(10) << __func__ << " request reshard past EOF" << dendl;
10631 if (offset) {
10632 o->extent_map.request_reshard(offset - 1, offset + length);
10633 } else {
10634 o->extent_map.request_reshard(0, length);
10635 }
10636 }
10637 }
10638
10639 o->onode.size = offset;
10640
10641 txc->write_onode(o);
10642 }
10643
10644 int BlueStore::_truncate(TransContext *txc,
10645 CollectionRef& c,
10646 OnodeRef& o,
10647 uint64_t offset)
10648 {
10649 dout(15) << __func__ << " " << c->cid << " " << o->oid
10650 << " 0x" << std::hex << offset << std::dec
10651 << dendl;
10652 int r = 0;
10653 if (offset >= OBJECT_MAX_SIZE) {
10654 r = -E2BIG;
10655 } else {
10656 _do_truncate(txc, c, o, offset);
10657 }
10658 dout(10) << __func__ << " " << c->cid << " " << o->oid
10659 << " 0x" << std::hex << offset << std::dec
10660 << " = " << r << dendl;
10661 return r;
10662 }
10663
10664 int BlueStore::_do_remove(
10665 TransContext *txc,
10666 CollectionRef& c,
10667 OnodeRef o)
10668 {
10669 set<SharedBlob*> maybe_unshared_blobs;
10670 bool is_gen = !o->oid.is_no_gen();
10671 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
10672 if (o->onode.has_omap()) {
10673 o->flush();
10674 _do_omap_clear(txc, o->onode.nid);
10675 }
10676 o->exists = false;
10677 string key;
10678 for (auto &s : o->extent_map.shards) {
10679 dout(20) << __func__ << " removing shard 0x" << std::hex
10680 << s.shard_info->offset << std::dec << dendl;
10681 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
10682 [&](const string& final_key) {
10683 txc->t->rmkey(PREFIX_OBJ, final_key);
10684 }
10685 );
10686 }
10687 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
10688 txc->removed(o);
10689 o->extent_map.clear();
10690 o->onode = bluestore_onode_t();
10691 _debug_obj_on_delete(o->oid);
10692
10693 if (!is_gen || maybe_unshared_blobs.empty()) {
10694 return 0;
10695 }
10696
10697 // see if we can unshare blobs still referenced by the head
10698 dout(10) << __func__ << " gen and maybe_unshared_blobs "
10699 << maybe_unshared_blobs << dendl;
10700 ghobject_t nogen = o->oid;
10701 nogen.generation = ghobject_t::NO_GEN;
10702 OnodeRef h = c->onode_map.lookup(nogen);
10703
10704 if (!h || !h->exists) {
10705 return 0;
10706 }
10707
10708 dout(20) << __func__ << " checking for unshareable blobs on " << h
10709 << " " << h->oid << dendl;
10710 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
10711 for (auto& e : h->extent_map.extent_map) {
10712 const bluestore_blob_t& b = e.blob->get_blob();
10713 SharedBlob *sb = e.blob->shared_blob.get();
10714 if (b.is_shared() &&
10715 sb->loaded &&
10716 maybe_unshared_blobs.count(sb)) {
10717 if (b.is_compressed()) {
10718 expect[sb].get(0, b.get_ondisk_length());
10719 } else {
10720 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
10721 expect[sb].get(off, len);
10722 return 0;
10723 });
10724 }
10725 }
10726 }
10727
10728 vector<SharedBlob*> unshared_blobs;
10729 unshared_blobs.reserve(maybe_unshared_blobs.size());
10730 for (auto& p : expect) {
10731 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
10732 if (p.first->persistent->ref_map == p.second) {
10733 SharedBlob *sb = p.first;
10734 dout(20) << __func__ << " unsharing " << *sb << dendl;
10735 unshared_blobs.push_back(sb);
10736 txc->unshare_blob(sb);
10737 uint64_t sbid = c->make_blob_unshared(sb);
10738 string key;
10739 get_shared_blob_key(sbid, &key);
10740 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
10741 }
10742 }
10743
10744 if (unshared_blobs.empty()) {
10745 return 0;
10746 }
10747
10748 for (auto& e : h->extent_map.extent_map) {
10749 const bluestore_blob_t& b = e.blob->get_blob();
10750 SharedBlob *sb = e.blob->shared_blob.get();
10751 if (b.is_shared() &&
10752 std::find(unshared_blobs.begin(), unshared_blobs.end(),
10753 sb) != unshared_blobs.end()) {
10754 dout(20) << __func__ << " unsharing " << e << dendl;
10755 bluestore_blob_t& blob = e.blob->dirty_blob();
10756 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
10757 h->extent_map.dirty_range(e.logical_offset, 1);
10758 }
10759 }
10760 txc->write_onode(h);
10761
10762 return 0;
10763 }
10764
10765 int BlueStore::_remove(TransContext *txc,
10766 CollectionRef& c,
10767 OnodeRef &o)
10768 {
10769 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10770 int r = _do_remove(txc, c, o);
10771 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10772 return r;
10773 }
10774
10775 int BlueStore::_setattr(TransContext *txc,
10776 CollectionRef& c,
10777 OnodeRef& o,
10778 const string& name,
10779 bufferptr& val)
10780 {
10781 dout(15) << __func__ << " " << c->cid << " " << o->oid
10782 << " " << name << " (" << val.length() << " bytes)"
10783 << dendl;
10784 int r = 0;
10785 if (val.is_partial()) {
10786 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
10787 val.length());
10788 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10789 } else {
10790 auto& b = o->onode.attrs[name.c_str()] = val;
10791 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10792 }
10793 txc->write_onode(o);
10794 dout(10) << __func__ << " " << c->cid << " " << o->oid
10795 << " " << name << " (" << val.length() << " bytes)"
10796 << " = " << r << dendl;
10797 return r;
10798 }
10799
10800 int BlueStore::_setattrs(TransContext *txc,
10801 CollectionRef& c,
10802 OnodeRef& o,
10803 const map<string,bufferptr>& aset)
10804 {
10805 dout(15) << __func__ << " " << c->cid << " " << o->oid
10806 << " " << aset.size() << " keys"
10807 << dendl;
10808 int r = 0;
10809 for (map<string,bufferptr>::const_iterator p = aset.begin();
10810 p != aset.end(); ++p) {
10811 if (p->second.is_partial()) {
10812 auto& b = o->onode.attrs[p->first.c_str()] =
10813 bufferptr(p->second.c_str(), p->second.length());
10814 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10815 } else {
10816 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
10817 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
10818 }
10819 }
10820 txc->write_onode(o);
10821 dout(10) << __func__ << " " << c->cid << " " << o->oid
10822 << " " << aset.size() << " keys"
10823 << " = " << r << dendl;
10824 return r;
10825 }
10826
10827
10828 int BlueStore::_rmattr(TransContext *txc,
10829 CollectionRef& c,
10830 OnodeRef& o,
10831 const string& name)
10832 {
10833 dout(15) << __func__ << " " << c->cid << " " << o->oid
10834 << " " << name << dendl;
10835 int r = 0;
10836 auto it = o->onode.attrs.find(name.c_str());
10837 if (it == o->onode.attrs.end())
10838 goto out;
10839
10840 o->onode.attrs.erase(it);
10841 txc->write_onode(o);
10842
10843 out:
10844 dout(10) << __func__ << " " << c->cid << " " << o->oid
10845 << " " << name << " = " << r << dendl;
10846 return r;
10847 }
10848
10849 int BlueStore::_rmattrs(TransContext *txc,
10850 CollectionRef& c,
10851 OnodeRef& o)
10852 {
10853 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10854 int r = 0;
10855
10856 if (o->onode.attrs.empty())
10857 goto out;
10858
10859 o->onode.attrs.clear();
10860 txc->write_onode(o);
10861
10862 out:
10863 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10864 return r;
10865 }
10866
10867 void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
10868 {
10869 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10870 string prefix, tail;
10871 get_omap_header(id, &prefix);
10872 get_omap_tail(id, &tail);
10873 it->lower_bound(prefix);
10874 while (it->valid()) {
10875 if (it->key() >= tail) {
10876 dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
10877 << dendl;
10878 break;
10879 }
10880 txc->t->rmkey(PREFIX_OMAP, it->key());
10881 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10882 it->next();
10883 }
10884 }
10885
10886 int BlueStore::_omap_clear(TransContext *txc,
10887 CollectionRef& c,
10888 OnodeRef& o)
10889 {
10890 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10891 int r = 0;
10892 if (o->onode.has_omap()) {
10893 o->flush();
10894 _do_omap_clear(txc, o->onode.nid);
10895 o->onode.clear_omap_flag();
10896 txc->write_onode(o);
10897 }
10898 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10899 return r;
10900 }
10901
10902 int BlueStore::_omap_setkeys(TransContext *txc,
10903 CollectionRef& c,
10904 OnodeRef& o,
10905 bufferlist &bl)
10906 {
10907 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10908 int r;
10909 bufferlist::iterator p = bl.begin();
10910 __u32 num;
10911 if (!o->onode.has_omap()) {
10912 o->onode.set_omap_flag();
10913 txc->write_onode(o);
10914 } else {
10915 txc->note_modified_object(o);
10916 }
10917 string final_key;
10918 _key_encode_u64(o->onode.nid, &final_key);
10919 final_key.push_back('.');
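// omap keys live under PREFIX_OMAP as <encoded u64 nid> '.' <user key>;
// final_key.resize(9) below keeps just that 9-byte prefix before each
// user key is appended.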
10920 ::decode(num, p);
10921 while (num--) {
10922 string key;
10923 bufferlist value;
10924 ::decode(key, p);
10925 ::decode(value, p);
10926 final_key.resize(9); // keep prefix
10927 final_key += key;
10928 dout(30) << __func__ << " " << pretty_binary_string(final_key)
10929 << " <- " << key << dendl;
10930 txc->t->set(PREFIX_OMAP, final_key, value);
10931 }
10932 r = 0;
10933 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10934 return r;
10935 }
10936
10937 int BlueStore::_omap_setheader(TransContext *txc,
10938 CollectionRef& c,
10939 OnodeRef &o,
10940 bufferlist& bl)
10941 {
10942 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10943 int r;
10944 string key;
10945 if (!o->onode.has_omap()) {
10946 o->onode.set_omap_flag();
10947 txc->write_onode(o);
10948 } else {
10949 txc->note_modified_object(o);
10950 }
10951 get_omap_header(o->onode.nid, &key);
10952 txc->t->set(PREFIX_OMAP, key, bl);
10953 r = 0;
10954 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10955 return r;
10956 }
10957
10958 int BlueStore::_omap_rmkeys(TransContext *txc,
10959 CollectionRef& c,
10960 OnodeRef& o,
10961 bufferlist& bl)
10962 {
10963 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10964 int r = 0;
10965 bufferlist::iterator p = bl.begin();
10966 __u32 num;
10967 string final_key;
10968
10969 if (!o->onode.has_omap()) {
10970 goto out;
10971 }
10972 _key_encode_u64(o->onode.nid, &final_key);
10973 final_key.push_back('.');
10974 ::decode(num, p);
10975 while (num--) {
10976 string key;
10977 ::decode(key, p);
10978 final_key.resize(9); // keep prefix
10979 final_key += key;
10980 dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
10981 << " <- " << key << dendl;
10982 txc->t->rmkey(PREFIX_OMAP, final_key);
10983 }
10984 txc->note_modified_object(o);
10985
10986 out:
10987 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10988 return r;
10989 }
10990
10991 int BlueStore::_omap_rmkey_range(TransContext *txc,
10992 CollectionRef& c,
10993 OnodeRef& o,
10994 const string& first, const string& last)
10995 {
10996 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10997 KeyValueDB::Iterator it;
10998 string key_first, key_last;
10999 int r = 0;
11000 if (!o->onode.has_omap()) {
11001 goto out;
11002 }
11003 o->flush();
11004 it = db->get_iterator(PREFIX_OMAP);
11005 get_omap_key(o->onode.nid, first, &key_first);
11006 get_omap_key(o->onode.nid, last, &key_last);
11007 it->lower_bound(key_first);
11008 while (it->valid()) {
11009 if (it->key() >= key_last) {
11010 dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
11011 << dendl;
11012 break;
11013 }
11014 txc->t->rmkey(PREFIX_OMAP, it->key());
11015 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
11016 it->next();
11017 }
11018 txc->note_modified_object(o);
11019
11020 out:
11021 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
11022 return r;
11023 }
11024
11025 int BlueStore::_set_alloc_hint(
11026 TransContext *txc,
11027 CollectionRef& c,
11028 OnodeRef& o,
11029 uint64_t expected_object_size,
11030 uint64_t expected_write_size,
11031 uint32_t flags)
11032 {
11033 dout(15) << __func__ << " " << c->cid << " " << o->oid
11034 << " object_size " << expected_object_size
11035 << " write_size " << expected_write_size
11036 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11037 << dendl;
11038 int r = 0;
11039 o->onode.expected_object_size = expected_object_size;
11040 o->onode.expected_write_size = expected_write_size;
11041 o->onode.alloc_hint_flags = flags;
11042 txc->write_onode(o);
11043 dout(10) << __func__ << " " << c->cid << " " << o->oid
11044 << " object_size " << expected_object_size
11045 << " write_size " << expected_write_size
11046 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
11047 << " = " << r << dendl;
11048 return r;
11049 }
11050
11051 int BlueStore::_clone(TransContext *txc,
11052 CollectionRef& c,
11053 OnodeRef& oldo,
11054 OnodeRef& newo)
11055 {
11056 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11057 << newo->oid << dendl;
11058 int r = 0;
11059 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
11060 derr << __func__ << " mismatched hash on " << oldo->oid
11061 << " and " << newo->oid << dendl;
11062 return -EINVAL;
11063 }
11064
11065 _assign_nid(txc, newo);
11066
11067 // clone data
11068 oldo->flush();
11069 _do_truncate(txc, c, newo, 0);
11070 if (cct->_conf->bluestore_clone_cow) {
11071 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
11072 } else {
11073 bufferlist bl;
11074 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
11075 if (r < 0)
11076 goto out;
11077 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
11078 if (r < 0)
11079 goto out;
11080 }
11081
11082 // clone attrs
11083 newo->onode.attrs = oldo->onode.attrs;
11084
11085 // clone omap
11086 if (newo->onode.has_omap()) {
11087 dout(20) << __func__ << " clearing old omap data" << dendl;
11088 newo->flush();
11089 _do_omap_clear(txc, newo->onode.nid);
11090 }
11091 if (oldo->onode.has_omap()) {
11092 dout(20) << __func__ << " copying omap data" << dendl;
11093 if (!newo->onode.has_omap()) {
11094 newo->onode.set_omap_flag();
11095 }
11096 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
11097 string head, tail;
11098 get_omap_header(oldo->onode.nid, &head);
11099 get_omap_tail(oldo->onode.nid, &tail);
11100 it->lower_bound(head);
11101 while (it->valid()) {
11102 if (it->key() >= tail) {
11103 dout(30) << __func__ << " reached tail" << dendl;
11104 break;
11105 } else {
11106 dout(30) << __func__ << " got header/data "
11107 << pretty_binary_string(it->key()) << dendl;
11108 string key;
11109 rewrite_omap_key(newo->onode.nid, it->key(), &key);
11110 txc->t->set(PREFIX_OMAP, key, it->value());
11111 }
11112 it->next();
11113 }
11114 } else {
11115 newo->onode.clear_omap_flag();
11116 }
11117
11118 txc->write_onode(newo);
11119 r = 0;
11120
11121 out:
11122 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11123 << newo->oid << " = " << r << dendl;
11124 return r;
11125 }
11126
11127 int BlueStore::_do_clone_range(
11128 TransContext *txc,
11129 CollectionRef& c,
11130 OnodeRef& oldo,
11131 OnodeRef& newo,
11132 uint64_t srcoff,
11133 uint64_t length,
11134 uint64_t dstoff)
11135 {
11136 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11137 << newo->oid
11138 << " 0x" << std::hex << srcoff << "~" << length << " -> "
11139 << " 0x" << dstoff << "~" << length << std::dec << dendl;
11140 oldo->extent_map.fault_range(db, srcoff, length);
11141 newo->extent_map.fault_range(db, dstoff, length);
11142 _dump_onode(oldo);
11143 _dump_onode(newo);
11144
11145 // hmm, this could go into an ExtentMap::dup() method.
11146 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
11147 for (auto &e : oldo->extent_map.extent_map) {
11148 e.blob->last_encoded_id = -1;
11149 }
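// each source blob is duplicated at most once: the first extent that
// references it records its slot via last_encoded_id / id_to_blob, and
// later extents in the range reuse that copy instead of duplicating again.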
11150 int n = 0;
11151 uint64_t end = srcoff + length;
11152 uint32_t dirty_range_begin = 0;
11153 uint32_t dirty_range_end = 0;
11154 bool src_dirty = false;
11155 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
11156 ep != oldo->extent_map.extent_map.end();
11157 ++ep) {
11158 auto& e = *ep;
11159 if (e.logical_offset >= end) {
11160 break;
11161 }
11162 dout(20) << __func__ << " src " << e << dendl;
11163 BlobRef cb;
11164 bool blob_duped = true;
11165 if (e.blob->last_encoded_id >= 0) {
11166 // blob is already duped
11167 cb = id_to_blob[e.blob->last_encoded_id];
11168 blob_duped = false;
11169 } else {
11170 // dup the blob
11171 const bluestore_blob_t& blob = e.blob->get_blob();
11172 // make sure it is shared
11173 if (!blob.is_shared()) {
11174 c->make_blob_shared(_assign_blobid(txc), e.blob);
11175 if (!src_dirty) {
11176 src_dirty = true;
11177 dirty_range_begin = e.logical_offset;
11178 }
11179 assert(e.logical_end() > 0);
11180 // -1 to exclude next potential shard
11181 dirty_range_end = e.logical_end() - 1;
11182 } else {
11183 c->load_shared_blob(e.blob->shared_blob);
11184 }
11185 cb = new Blob();
11186 e.blob->last_encoded_id = n;
11187 id_to_blob[n] = cb;
11188 e.blob->dup(*cb);
11189 // bump the extent refs on the copied blob's extents
11190 for (auto p : blob.get_extents()) {
11191 if (p.is_valid()) {
11192 e.blob->shared_blob->get_ref(p.offset, p.length);
11193 }
11194 }
11195 txc->write_shared_blob(e.blob->shared_blob);
11196 dout(20) << __func__ << " new " << *cb << dendl;
11197 }
11198 // dup extent
11199 int skip_front, skip_back;
11200 if (e.logical_offset < srcoff) {
11201 skip_front = srcoff - e.logical_offset;
11202 } else {
11203 skip_front = 0;
11204 }
11205 if (e.logical_end() > end) {
11206 skip_back = e.logical_end() - end;
11207 } else {
11208 skip_back = 0;
11209 }
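    // Illustrative example (hypothetical values): cloning srcoff=0x1000,
    // length=0x1000 (end=0x2000) to dstoff=0x5000, with a source extent at
    // logical_offset=0x800 of length 0x1000, gives skip_front=0x800 and
    // skip_back=0, so the new extent below covers 0x5000~0x800 in the
    // destination and starts 0x800 further into the shared blob.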
11210 Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
11211 e.blob_offset + skip_front,
11212 e.length - skip_front - skip_back, cb);
11213 newo->extent_map.extent_map.insert(*ne);
11214 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
11215 // fixme: we may leave parts of new blob unreferenced that could
11216 // be freed (relative to the shared_blob).
11217 txc->statfs_delta.stored() += ne->length;
11218 if (e.blob->get_blob().is_compressed()) {
11219 txc->statfs_delta.compressed_original() += ne->length;
11220 if (blob_duped){
11221 txc->statfs_delta.compressed() +=
11222 cb->get_blob().get_compressed_payload_length();
11223 }
11224 }
11225 dout(20) << __func__ << " dst " << *ne << dendl;
11226 ++n;
11227 }
11228 if (src_dirty) {
11229 oldo->extent_map.dirty_range(dirty_range_begin,
11230 dirty_range_end - dirty_range_begin);
11231 txc->write_onode(oldo);
11232 }
11233 txc->write_onode(newo);
11234
11235 if (dstoff + length > newo->onode.size) {
11236 newo->onode.size = dstoff + length;
11237 }
11238 newo->extent_map.dirty_range(dstoff, length);
11239 _dump_onode(oldo);
11240 _dump_onode(newo);
11241 return 0;
11242 }
11243
11244 int BlueStore::_clone_range(TransContext *txc,
11245 CollectionRef& c,
11246 OnodeRef& oldo,
11247 OnodeRef& newo,
11248 uint64_t srcoff, uint64_t length, uint64_t dstoff)
11249 {
11250 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11251 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11252 << " to offset 0x" << dstoff << std::dec << dendl;
11253 int r = 0;
11254
11255 if (srcoff + length >= OBJECT_MAX_SIZE ||
11256 dstoff + length >= OBJECT_MAX_SIZE) {
11257 r = -E2BIG;
11258 goto out;
11259 }
11260 if (srcoff + length > oldo->onode.size) {
11261 r = -EINVAL;
11262 goto out;
11263 }
11264
11265 _assign_nid(txc, newo);
11266
11267 if (length > 0) {
11268 if (cct->_conf->bluestore_clone_cow) {
11269 _do_zero(txc, c, newo, dstoff, length);
11270 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
11271 } else {
11272 bufferlist bl;
11273 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
11274 if (r < 0)
11275 goto out;
11276 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
11277 if (r < 0)
11278 goto out;
11279 }
11280 }
11281
11282 txc->write_onode(newo);
11283 r = 0;
11284
11285 out:
11286 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11287 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11288 << " to offset 0x" << dstoff << std::dec
11289 << " = " << r << dendl;
11290 return r;
11291 }
11292
11293 int BlueStore::_rename(TransContext *txc,
11294 CollectionRef& c,
11295 OnodeRef& oldo,
11296 OnodeRef& newo,
11297 const ghobject_t& new_oid)
11298 {
11299 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11300 << new_oid << dendl;
11301 int r;
11302 ghobject_t old_oid = oldo->oid;
11303 mempool::bluestore_cache_other::string new_okey;
11304
11305 if (newo) {
11306 if (newo->exists) {
11307 r = -EEXIST;
11308 goto out;
11309 }
11310 assert(txc->onodes.count(newo) == 0);
11311 }
11312
11313 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
11314
11315 // rewrite shards
11316 {
11317 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
11318 get_object_key(cct, new_oid, &new_okey);
11319 string key;
11320 for (auto &s : oldo->extent_map.shards) {
11321 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
11322 [&](const string& final_key) {
11323 txc->t->rmkey(PREFIX_OBJ, final_key);
11324 }
11325 );
11326 s.dirty = true;
11327 }
11328 }
11329
11330 newo = oldo;
11331 txc->write_onode(newo);
11332
11334 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
11335 // Onode in the old slot
11335 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
11336 r = 0;
11337
11338 out:
11339 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
11340 << new_oid << " = " << r << dendl;
11341 return r;
11342 }
11343
11344 // collections
11345
11346 int BlueStore::_create_collection(
11347 TransContext *txc,
11348 const coll_t &cid,
11349 unsigned bits,
11350 CollectionRef *c)
11351 {
11352 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
11353 int r;
11354 bufferlist bl;
11355
11356 {
11357 RWLock::WLocker l(coll_lock);
11358 if (*c) {
11359 r = -EEXIST;
11360 goto out;
11361 }
11362 c->reset(
11363 new Collection(
11364 this,
11365 cache_shards[cid.hash_to_shard(cache_shards.size())],
11366 cid));
11367 (*c)->cnode.bits = bits;
11368 coll_map[cid] = *c;
11369 }
11370 ::encode((*c)->cnode, bl);
11371 txc->t->set(PREFIX_COLL, stringify(cid), bl);
11372 r = 0;
11373
11374 out:
11375 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
11376 return r;
11377 }
11378
11379 int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
11380 CollectionRef *c)
11381 {
11382 dout(15) << __func__ << " " << cid << dendl;
11383 int r;
11384
11385 {
11386 RWLock::WLocker l(coll_lock);
11387 if (!*c) {
11388 r = -ENOENT;
11389 goto out;
11390 }
11391 size_t nonexistent_count = 0;
11392 assert((*c)->exists);
11393 if ((*c)->onode_map.map_any([&](OnodeRef o) {
11394 if (o->exists) {
11395 dout(10) << __func__ << " " << o->oid << " " << o
11396 << " exists in onode_map" << dendl;
11397 return true;
11398 }
11399 ++nonexistent_count;
11400 return false;
11401 })) {
11402 r = -ENOTEMPTY;
11403 goto out;
11404 }
11405
11406 vector<ghobject_t> ls;
11407 ghobject_t next;
11409 // Enumerate up to nonexistent_count + 1 onodes in the db,
11410 // then check that all of them are marked as non-existent.
11411 // Bypass the check if the returned number is greater than nonexistent_count.
11411 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
11412 nonexistent_count + 1, &ls, &next);
11413 if (r >= 0) {
11414 bool exists = false; //ls.size() > nonexistent_count;
11415 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
11416 dout(10) << __func__ << " oid " << *it << dendl;
11417 auto onode = (*c)->onode_map.lookup(*it);
11418 exists = !onode || onode->exists;
11419 if (exists) {
11420 dout(10) << __func__ << " " << *it
11421 << " exists in db" << dendl;
11422 }
11423 }
11424 if (!exists) {
11425 coll_map.erase(cid);
11426 txc->removed_collections.push_back(*c);
11427 (*c)->exists = false;
11428 c->reset();
11429 txc->t->rmkey(PREFIX_COLL, stringify(cid));
11430 r = 0;
11431 } else {
11432 dout(10) << __func__ << " " << cid
11433 << " is non-empty" << dendl;
11434 r = -ENOTEMPTY;
11435 }
11436 }
11437 }
11438
11439 out:
11440 dout(10) << __func__ << " " << cid << " = " << r << dendl;
11441 return r;
11442 }
11443
11444 int BlueStore::_split_collection(TransContext *txc,
11445 CollectionRef& c,
11446 CollectionRef& d,
11447 unsigned bits, int rem)
11448 {
11449 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
11450 << "bits " << bits << dendl;
11451 RWLock::WLocker l(c->lock);
11452 RWLock::WLocker l2(d->lock);
11453 int r;
11454
11455 // flush all previous deferred writes on this sequencer. this is a bit
11456 // heavyweight, but we need to make sure all deferred writes complete
11457 // before we split as the new collection's sequencer may need to order
11458 // this after those writes, and we don't bother with the complexity of
11459 // moving those TransContexts over to the new osr.
11460 _osr_drain_preceding(txc);
11461
11462 // move any cached items (onodes and referenced shared blobs) that will
11463 // belong to the child collection post-split. leave everything else behind.
11464 // this may include things that don't strictly belong to the now-smaller
11465 // parent split, but the OSD will always send us a split for every new
11466 // child.
11467
11468 spg_t pgid, dest_pgid;
11469 bool is_pg = c->cid.is_pg(&pgid);
11470 assert(is_pg);
11471 is_pg = d->cid.is_pg(&dest_pgid);
11472 assert(is_pg);
11473
11474 // the destination should initially be empty.
11475 assert(d->onode_map.empty());
11476 assert(d->shared_blob_set.empty());
11477 assert(d->cnode.bits == bits);
11478
11479 c->split_cache(d.get());
11480
11481 // adjust bits. note that this will be redundant for all but the first
11482 // split call for this parent (first child).
11483 c->cnode.bits = bits;
11484 assert(d->cnode.bits == bits);
11485 r = 0;
11486
11487 bufferlist bl;
11488 ::encode(c->cnode, bl);
11489 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
11490
11491 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
11492 << "bits " << bits << " = " << r << dendl;
11493 return r;
11494 }
11495
11496 // DB key value Histogram
11497 #define KEY_SLAB 32
11498 #define VALUE_SLAB 64
11499
11500 const string prefix_onode = "o";
11501 const string prefix_onode_shard = "x";
11502 const string prefix_other = "Z";
11503
11504 int BlueStore::DBHistogram::get_key_slab(size_t sz)
11505 {
11506 return (sz/KEY_SLAB);
11507 }
11508
11509 string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
11510 {
11511 int lower_bound = slab * KEY_SLAB;
11512 int upper_bound = (slab + 1) * KEY_SLAB;
11513 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11514 return ret;
11515 }
11516
11517 int BlueStore::DBHistogram::get_value_slab(size_t sz)
11518 {
11519 return (sz/VALUE_SLAB);
11520 }
11521
11522 string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
11523 {
11524 int lower_bound = slab * VALUE_SLAB;
11525 int upper_bound = (slab + 1) * VALUE_SLAB;
11526 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11527 return ret;
11528 }
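// Example: with KEY_SLAB 32, a 70-byte key falls in slab 70/32 = 2 and is
// reported as the range "[64,96)"; with VALUE_SLAB 64, a 200-byte value
// falls in slab 3, reported as "[192,256)".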
11529
11530 void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
11531 const string &prefix, size_t key_size, size_t value_size)
11532 {
11533 uint32_t key_slab = get_key_slab(key_size);
11534 uint32_t value_slab = get_value_slab(value_size);
11535 key_hist[prefix][key_slab].count++;
11536 key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len);
11537 key_hist[prefix][key_slab].val_map[value_slab].count++;
11538 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11539 MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
11540 }
11541
11542 void BlueStore::DBHistogram::dump(Formatter *f)
11543 {
11544 f->open_object_section("rocksdb_value_distribution");
11545 for (auto i : value_hist) {
11546 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
11547 }
11548 f->close_section();
11549
11550 f->open_object_section("rocksdb_key_value_histogram");
11551 for (auto i : key_hist) {
11552 f->dump_string("prefix", i.first);
11553 f->open_object_section("key_hist");
11554 for (auto k : i.second) {
11555 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
11556 f->dump_unsigned("max_len", k.second.max_len);
11557 f->open_object_section("value_hist");
11558 for (auto j : k.second.val_map) {
11559 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
11560 f->dump_unsigned("max_len", j.second.max_len);
11561 }
11562 f->close_section();
11563 }
11564 f->close_section();
11565 }
11566 f->close_section();
11567 }
11568
11569 // Iterates through the db and collects the stats
11570 void BlueStore::generate_db_histogram(Formatter *f)
11571 {
11572 //globals
11573 uint64_t num_onodes = 0;
11574 uint64_t num_shards = 0;
11575 uint64_t num_super = 0;
11576 uint64_t num_coll = 0;
11577 uint64_t num_omap = 0;
11578 uint64_t num_deferred = 0;
11579 uint64_t num_alloc = 0;
11580 uint64_t num_stat = 0;
11581 uint64_t num_others = 0;
11582 uint64_t num_shared_shards = 0;
11583 size_t max_key_size = 0, max_value_size = 0;
11584 uint64_t total_key_size = 0, total_value_size = 0;
11585 size_t key_size = 0, value_size = 0;
11586 DBHistogram hist;
11587
11588 utime_t start = ceph_clock_now();
11589
11590 KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
11591 iter->seek_to_first();
11592 while (iter->valid()) {
11593 dout(30) << __func__ << " Key: " << iter->key() << dendl;
11594 key_size = iter->key_size();
11595 value_size = iter->value_size();
11596 hist.value_hist[hist.get_value_slab(value_size)]++;
11597 max_key_size = MAX(max_key_size, key_size);
11598 max_value_size = MAX(max_value_size, value_size);
11599 total_key_size += key_size;
11600 total_value_size += value_size;
11601
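    // raw_key() returns a (prefix, key) pair; the prefix determines which
    // logical namespace (onodes, shards, omap, allocator, etc.) this entry
    // is counted under below.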
11602 pair<string,string> key(iter->raw_key());
11603
11604 if (key.first == PREFIX_SUPER) {
11605 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
11606 num_super++;
11607 } else if (key.first == PREFIX_STAT) {
11608 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
11609 num_stat++;
11610 } else if (key.first == PREFIX_COLL) {
11611 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
11612 num_coll++;
11613 } else if (key.first == PREFIX_OBJ) {
11614 if (key.second.back() == ONODE_KEY_SUFFIX) {
11615 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
11616 num_onodes++;
11617 } else {
11618 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
11619 num_shards++;
11620 }
11621 } else if (key.first == PREFIX_OMAP) {
11622 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
11623 num_omap++;
11624 } else if (key.first == PREFIX_DEFERRED) {
11625 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
11626 num_deferred++;
11627 } else if (key.first == PREFIX_ALLOC || key.first == "b" ) {
11628 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
11629 num_alloc++;
11630 } else if (key.first == PREFIX_SHARED_BLOB) {
11631 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
11632 num_shared_shards++;
11633 } else {
11634 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
11635 num_others++;
11636 }
11637 iter->next();
11638 }
11639
11640 utime_t duration = ceph_clock_now() - start;
11641 f->open_object_section("rocksdb_key_value_stats");
11642 f->dump_unsigned("num_onodes", num_onodes);
11643 f->dump_unsigned("num_shards", num_shards);
11644 f->dump_unsigned("num_super", num_super);
11645 f->dump_unsigned("num_coll", num_coll);
11646 f->dump_unsigned("num_omap", num_omap);
11647 f->dump_unsigned("num_deferred", num_deferred);
11648 f->dump_unsigned("num_alloc", num_alloc);
11649 f->dump_unsigned("num_stat", num_stat);
11650 f->dump_unsigned("num_shared_shards", num_shared_shards);
11651 f->dump_unsigned("num_others", num_others);
11652 f->dump_unsigned("max_key_size", max_key_size);
11653 f->dump_unsigned("max_value_size", max_value_size);
11654 f->dump_unsigned("total_key_size", total_key_size);
11655 f->dump_unsigned("total_value_size", total_value_size);
11656 f->close_section();
11657
11658 hist.dump(f);
11659
11660 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
11661
11662 }
11663
11664 void BlueStore::_flush_cache()
11665 {
11666 dout(10) << __func__ << dendl;
11667 for (auto i : cache_shards) {
11668 i->trim_all();
11669 assert(i->empty());
11670 }
11671 for (auto& p : coll_map) {
11672 if (!p.second->onode_map.empty()) {
11673 derr << __func__ << " stray onodes on " << p.first << dendl;
11674 p.second->onode_map.dump(cct, 0);
11675 }
11676 if (!p.second->shared_blob_set.empty()) {
11677 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11678 p.second->shared_blob_set.dump(cct, 0);
11679 }
11680 assert(p.second->onode_map.empty());
11681 assert(p.second->shared_blob_set.empty());
11682 }
11683 coll_map.clear();
11684 }
11685
11686 // For external callers.
11687 // Unlike _flush_cache(), this uses a best-effort policy: we don't care
11688 // if some pinned onodes/data are still in the cache after this call
11689 // completes.
11690 void BlueStore::flush_cache()
11691 {
11692 dout(10) << __func__ << dendl;
11693 for (auto i : cache_shards) {
11694 i->trim_all();
11695 }
11696 }
11697
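// Zero-fills the head and/or tail of a write payload. For example (made-up
// numbers), with head_pad=0x200 and tail_pad=0xb00 a 0x300-byte bufferlist
// is expanded to a zero-padded 0x1000-byte buffer.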
11698 void BlueStore::_apply_padding(uint64_t head_pad,
11699 uint64_t tail_pad,
11700 bufferlist& padded)
11701 {
11702 if (head_pad) {
11703 padded.prepend_zero(head_pad);
11704 }
11705 if (tail_pad) {
11706 padded.append_zero(tail_pad);
11707 }
11708 if (head_pad || tail_pad) {
11709 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
11710 << " tail 0x" << tail_pad << std::dec << dendl;
11711 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
11712 }
11713 }
11714
11715 // ===========================================