// ceph.git v12.2.1: ceph/src/os/bluestore/BlueStore.cc
1 // vim: ts=8 sw=2 smarttab
2 /*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2014 Red Hat
6 *
7 * This is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License version 2.1, as published by the Free Software
10 * Foundation. See file COPYING.
11 *
12 */
13
14 #include <unistd.h>
15 #include <stdlib.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <fcntl.h>
19
20 #include "include/cpp-btree/btree_set.h"
21
22 #include "BlueStore.h"
23 #include "os/kv.h"
24 #include "include/compat.h"
25 #include "include/intarith.h"
26 #include "include/stringify.h"
27 #include "common/errno.h"
28 #include "common/safe_io.h"
29 #include "Allocator.h"
30 #include "FreelistManager.h"
31 #include "BlueFS.h"
32 #include "BlueRocksEnv.h"
33 #include "auth/Crypto.h"
34 #include "common/EventTrace.h"
35
36 #define dout_context cct
37 #define dout_subsys ceph_subsys_bluestore
38
39 using bid_t = decltype(BlueStore::Blob::id);
40
41 // bluestore_cache_onode
42 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
43 bluestore_cache_onode);
44
45 // bluestore_cache_other
46 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
47 bluestore_cache_other);
48 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
49 bluestore_cache_other);
50 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
51 bluestore_cache_other);
52 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
53 bluestore_cache_other);
54
55 // bluestore_txc
56 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
57 bluestore_txc);
58
59
60 // kv store prefixes
61 const string PREFIX_SUPER = "S"; // field -> value
62 const string PREFIX_STAT = "T"; // field -> value(int64 array)
63 const string PREFIX_COLL = "C"; // collection name -> cnode_t
64 const string PREFIX_OBJ = "O"; // object name -> onode_t
65 const string PREFIX_OMAP = "M"; // u64 + keyname -> value
66 const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
67 const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
68 const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
69
70 // write a label in the first block. always use this size. note that
71 // bluefs makes a matching assumption about the location of its
72 // superblock (always the second block of the device).
73 #define BDEV_LABEL_BLOCK_SIZE 4096
74
75 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
76 #define SUPER_RESERVED 8192
77
78 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
79
80
81 /*
82 * extent map blob encoding
83 *
84 * we use the low bits of the blobid field to indicate some common scenarios
85 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
86 */
87 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
88 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
89 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
90 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
91 #define BLOBID_SHIFT_BITS 4
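// For orientation, an illustrative (assumed) encoding: a spanning blob with
// id 3 whose extent starts at the end of the previous one would be packed by
// ExtentMap::encode_some() as roughly
//   (3 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_SPANNING | BLOBID_FLAG_CONTIGUOUS = 0x39
// with the offset/length fields only encoded when the ZEROOFFSET/SAMELENGTH
// flags are not set.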
92
93 /*
94 * object name key structure
95 *
96 * encoded u8: shard + 2^7 (so that it sorts properly)
97 * encoded u64: poolid + 2^63 (so that it sorts properly)
98 * encoded u32: hash (bit reversed)
99 *
100 * escaped string: namespace
101 *
102 * escaped string: key or object name
103 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
104 * we are done. otherwise, the object name follows.
105 * escaped string: object name (unless '=' above)
106 *
107 * encoded u64: snap
108 * encoded u64: generation
109 * 'o'
110 */
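// Illustrative layout (assumed values): an object "foo" in pool 1 with no
// namespace, no key, shard NO_SHARD and hash 0x12345678 would yield a key of
// the rough form
//   <shard+0x80> <pool+2^63> <reversed hash> '!' "foo" '!' '=' <snap> <gen> 'o'
// where each '!' is the append_escaped() terminator and snap/generation are
// encoded as u64s.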
111 #define ONODE_KEY_SUFFIX 'o'
112
113 /*
114 * extent shard key
115 *
116 * object prefix key
117 * u32
118 * 'x'
119 */
120 #define EXTENT_SHARD_KEY_SUFFIX 'x'
121
122 /*
123 * string encoding in the key
124 *
125 * The key string needs to lexicographically sort the same way that
126 * ghobject_t does. We do this by escaping anything <= '#' with # plus
127 * a 2-digit hex string, and anything >= '~' with ~ plus the two
128 * hex digits.
129 *
130 * We use ! as a terminator for strings; this works because it is < #
131 * and will get escaped if it is present in the string.
132 *
133 */
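// Illustrative example (assumed input): "ab#c~" escapes to "ab#23c~7e!",
// since '#' (0x23) and '~' (0x7e) are replaced by the escape char plus two
// hex digits, and '!' terminates the string.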
134 template<typename S>
135 static void append_escaped(const string &in, S *out)
136 {
137 char hexbyte[in.length() * 3 + 1];
138 char* ptr = &hexbyte[0];
139 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
140 if (*i <= '#') {
141 *ptr++ = '#';
142 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
143 *ptr++ = "0123456789abcdef"[*i & 0x0f];
144 } else if (*i >= '~') {
145 *ptr++ = '~';
146 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
147 *ptr++ = "0123456789abcdef"[*i & 0x0f];
148 } else {
149 *ptr++ = *i;
150 }
151 }
152 *ptr++ = '!';
153 out->append(hexbyte, ptr - &hexbyte[0]);
154 }
155
156 inline unsigned h2i(char c)
157 {
158 if ((c >= '0') && (c <= '9')) {
159 return c - 0x30;
160 } else if ((c >= 'a') && (c <= 'f')) {
161 return c - 'a' + 10;
162 } else if ((c >= 'A') && (c <= 'F')) {
163 return c - 'A' + 10;
164 } else {
165 return 256; // make it always larger than 255
166 }
167 }
168
169 static int decode_escaped(const char *p, string *out)
170 {
171 char buff[256];
172 char* ptr = &buff[0];
173 char* max = &buff[252];
174 const char *orig_p = p;
175 while (*p && *p != '!') {
176 if (*p == '#' || *p == '~') {
177 unsigned hex = 0;
178 p++;
179 hex = h2i(*p++) << 4;
180 if (hex > 255) {
181 return -EINVAL;
182 }
183 hex |= h2i(*p++);
184 if (hex > 255) {
185 return -EINVAL;
186 }
187 *ptr++ = hex;
188 } else {
189 *ptr++ = *p++;
190 }
191 if (ptr > max) {
192 out->append(buff, ptr-buff);
193 ptr = &buff[0];
194 }
195 }
196 if (ptr != buff) {
197 out->append(buff, ptr-buff);
198 }
199 return p - orig_p;
200 }
201
202 // some things we encode in binary (as le32 or le64); print the
203 // resulting key strings nicely
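// e.g. (illustrative) the bytes 'f' 'o' 'o' 0x01 0x02 0x03 0x04 render as
// "'foo'0x01020304": printable runs are quoted, binary runs are hex, and
// 4-byte groups are printed as a single u32.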
204 template<typename S>
205 static string pretty_binary_string(const S& in)
206 {
207 char buf[10];
208 string out;
209 out.reserve(in.length() * 3);
210 enum { NONE, HEX, STRING } mode = NONE;
211 unsigned from = 0, i;
212 for (i=0; i < in.length(); ++i) {
213 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
214 (mode == HEX && in.length() - i >= 4 &&
215 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
216 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
217 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
218 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
219 if (mode == STRING) {
220 out.append(in.c_str() + from, i - from);
221 out.push_back('\'');
222 }
223 if (mode != HEX) {
224 out.append("0x");
225 mode = HEX;
226 }
227 if (in.length() - i >= 4) {
228 // print a whole u32 at once
229 snprintf(buf, sizeof(buf), "%08x",
230 (uint32_t)(((unsigned char)in[i] << 24) |
231 ((unsigned char)in[i+1] << 16) |
232 ((unsigned char)in[i+2] << 8) |
233 ((unsigned char)in[i+3] << 0)));
234 i += 3;
235 } else {
236 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
237 }
238 out.append(buf);
239 } else {
240 if (mode != STRING) {
241 out.push_back('\'');
242 mode = STRING;
243 from = i;
244 }
245 }
246 }
247 if (mode == STRING) {
248 out.append(in.c_str() + from, i - from);
249 out.push_back('\'');
250 }
251 return out;
252 }
253
254 template<typename T>
255 static void _key_encode_shard(shard_id_t shard, T *key)
256 {
257 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
258 }
259
260 static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
261 {
262 pshard->id = (uint8_t)*key - (uint8_t)0x80;
263 return key + 1;
264 }
265
266 static void get_coll_key_range(const coll_t& cid, int bits,
267 string *temp_start, string *temp_end,
268 string *start, string *end)
269 {
270 temp_start->clear();
271 temp_end->clear();
272 start->clear();
273 end->clear();
274
275 spg_t pgid;
276 if (cid.is_pg(&pgid)) {
277 _key_encode_shard(pgid.shard, start);
278 *temp_start = *start;
279
280 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
281 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
282
283 *end = *start;
284 *temp_end = *temp_start;
285
286 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
287 _key_encode_u32(reverse_hash, start);
288 _key_encode_u32(reverse_hash, temp_start);
289
290 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
291 if (end_hash > 0xffffffffull)
292 end_hash = 0xffffffffull;
293
294 _key_encode_u32(end_hash, end);
295 _key_encode_u32(end_hash, temp_end);
296 } else {
297 _key_encode_shard(shard_id_t::NO_SHARD, start);
298 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
299 *end = *start;
300 _key_encode_u32(0, start);
301 _key_encode_u32(0xffffffff, end);
302
303 // no separate temp section
304 *temp_start = *end;
305 *temp_end = *end;
306 }
307 }
308
309 static void get_shared_blob_key(uint64_t sbid, string *key)
310 {
311 key->clear();
312 _key_encode_u64(sbid, key);
313 }
314
315 static int get_key_shared_blob(const string& key, uint64_t *sbid)
316 {
317 const char *p = key.c_str();
318 if (key.length() < sizeof(uint64_t))
319 return -1;
320 _key_decode_u64(p, sbid);
321 return 0;
322 }
323
324 template<typename S>
325 static int get_key_object(const S& key, ghobject_t *oid)
326 {
327 int r;
328 const char *p = key.c_str();
329
330 if (key.length() < 1 + 8 + 4)
331 return -1;
332 p = _key_decode_shard(p, &oid->shard_id);
333
334 uint64_t pool;
335 p = _key_decode_u64(p, &pool);
336 oid->hobj.pool = pool - 0x8000000000000000ull;
337
338 unsigned hash;
339 p = _key_decode_u32(p, &hash);
340
341 oid->hobj.set_bitwise_key_u32(hash);
342
343 r = decode_escaped(p, &oid->hobj.nspace);
344 if (r < 0)
345 return -2;
346 p += r + 1;
347
348 string k;
349 r = decode_escaped(p, &k);
350 if (r < 0)
351 return -3;
352 p += r + 1;
353 if (*p == '=') {
354 // no key
355 ++p;
356 oid->hobj.oid.name = k;
357 } else if (*p == '<' || *p == '>') {
358 // key + name
359 ++p;
360 r = decode_escaped(p, &oid->hobj.oid.name);
361 if (r < 0)
362 return -5;
363 p += r + 1;
364 oid->hobj.set_key(k);
365 } else {
366 // malformed
367 return -6;
368 }
369
370 p = _key_decode_u64(p, &oid->hobj.snap.val);
371 p = _key_decode_u64(p, &oid->generation);
372
373 if (*p != ONODE_KEY_SUFFIX) {
374 return -7;
375 }
376 p++;
377 if (*p) {
378 // if we get something other than a null terminator here,
379 // something went wrong.
380 return -8;
381 }
382
383 return 0;
384 }
385
386 template<typename S>
387 static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
388 {
389 key->clear();
390
391 size_t max_len = 1 + 8 + 4 +
392 (oid.hobj.nspace.length() * 3 + 1) +
393 (oid.hobj.get_key().length() * 3 + 1) +
394 1 + // for '<', '=', or '>'
395 (oid.hobj.oid.name.length() * 3 + 1) +
396 8 + 8 + 1;
397 key->reserve(max_len);
398
399 _key_encode_shard(oid.shard_id, key);
400 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
401 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
402
403 append_escaped(oid.hobj.nspace, key);
404
405 if (oid.hobj.get_key().length()) {
406 // is a key... could be < = or >.
407 append_escaped(oid.hobj.get_key(), key);
408 // (ASCII chars < = and > sort in that order, yay)
409 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
410 if (r) {
411 key->append(r > 0 ? ">" : "<");
412 append_escaped(oid.hobj.oid.name, key);
413 } else {
414 // same as no key
415 key->append("=");
416 }
417 } else {
418 // no key
419 append_escaped(oid.hobj.oid.name, key);
420 key->append("=");
421 }
422
423 _key_encode_u64(oid.hobj.snap, key);
424 _key_encode_u64(oid.generation, key);
425
426 key->push_back(ONODE_KEY_SUFFIX);
427
428 // sanity check
429 if (true) {
430 ghobject_t t;
431 int r = get_key_object(*key, &t);
432 if (r || t != oid) {
433 derr << " r " << r << dendl;
434 derr << "key " << pretty_binary_string(*key) << dendl;
435 derr << "oid " << oid << dendl;
436 derr << " t " << t << dendl;
437 assert(r == 0 && t == oid);
438 }
439 }
440 }
441
442
443 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
444 // char lets us quickly test whether it is a shard key without decoding any
445 // of the prefix bytes.
446 template<typename S>
447 static void get_extent_shard_key(const S& onode_key, uint32_t offset,
448 string *key)
449 {
450 key->clear();
451 key->reserve(onode_key.length() + 4 + 1);
452 key->append(onode_key.c_str(), onode_key.size());
453 _key_encode_u32(offset, key);
454 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
455 }
456
457 static void rewrite_extent_shard_key(uint32_t offset, string *key)
458 {
459 assert(key->size() > sizeof(uint32_t) + 1);
460 assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
461 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
462 }
463
464 template<typename S>
465 static void generate_extent_shard_key_and_apply(
466 const S& onode_key,
467 uint32_t offset,
468 string *key,
469 std::function<void(const string& final_key)> apply)
470 {
471 if (key->empty()) { // make full key
472 assert(!onode_key.empty());
473 get_extent_shard_key(onode_key, offset, key);
474 } else {
475 rewrite_extent_shard_key(offset, key);
476 }
477 apply(*key);
478 }
479
480 int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
481 {
482 assert(key.size() > sizeof(uint32_t) + 1);
483 assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
484 int okey_len = key.size() - sizeof(uint32_t) - 1;
485 *onode_key = key.substr(0, okey_len);
486 const char *p = key.data() + okey_len;
487 _key_decode_u32(p, offset);
488 return 0;
489 }
490
491 static bool is_extent_shard_key(const string& key)
492 {
493 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
494 }
495
496 // '-' < '.' < '~'
497 static void get_omap_header(uint64_t id, string *out)
498 {
499 _key_encode_u64(id, out);
500 out->push_back('-');
501 }
502
503 // hmm, I don't think there's any need to escape the user key since we
504 // have a clean prefix.
505 static void get_omap_key(uint64_t id, const string& key, string *out)
506 {
507 _key_encode_u64(id, out);
508 out->push_back('.');
509 out->append(key);
510 }
511
512 static void rewrite_omap_key(uint64_t id, string old, string *out)
513 {
514 _key_encode_u64(id, out);
515 out->append(old.c_str() + out->length(), old.size() - out->length());
516 }
517
518 static void decode_omap_key(const string& key, string *user_key)
519 {
520 *user_key = key.substr(sizeof(uint64_t) + 1);
521 }
522
523 static void get_omap_tail(uint64_t id, string *out)
524 {
525 _key_encode_u64(id, out);
526 out->push_back('~');
527 }
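// Illustrative ordering for a single onode id (assumed user keys):
//   <id>'-'          omap header   (get_omap_header)
//   <id>'.'"key1"    user key      (get_omap_key)
//   <id>'.'"key2"    user key
//   <id>'~'          omap tail     (get_omap_tail)
// so a range scan from header to tail visits exactly this onode's omap rows.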
528
529 static void get_deferred_key(uint64_t seq, string *out)
530 {
531 _key_encode_u64(seq, out);
532 }
533
534
535 // merge operators
536
537 struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
538 void merge_nonexistent(
539 const char *rdata, size_t rlen, std::string *new_value) override {
540 *new_value = std::string(rdata, rlen);
541 }
542 void merge(
543 const char *ldata, size_t llen,
544 const char *rdata, size_t rlen,
545 std::string *new_value) override {
546 assert(llen == rlen);
547 assert((rlen % 8) == 0);
548 new_value->resize(rlen);
549 const __le64* lv = (const __le64*)ldata;
550 const __le64* rv = (const __le64*)rdata;
551 __le64* nv = &(__le64&)new_value->at(0);
552 for (size_t i = 0; i < rlen >> 3; ++i) {
553 nv[i] = lv[i] + rv[i];
554 }
555 }
556 // We use each operator name and each prefix to construct the
557 // overall RocksDB operator name for consistency check at open time.
558 string name() const override {
559 return "int64_array";
560 }
561 };
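// Illustrative merge (assumed values): an existing value encoding {10, 2} as
// an le64 array merged with operand {5, 1} yields {15, 3}; elements are
// summed pairwise. This lets counters (e.g. the statfs values kept under
// PREFIX_STAT) be updated with deltas instead of read-modify-write.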
562
563
564 // Buffer
565
566 ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
567 {
568 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
569 << b.offset << "~" << b.length << std::dec
570 << " " << BlueStore::Buffer::get_state_name(b.state);
571 if (b.flags)
572 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
573 return out << ")";
574 }
575
576 // Garbage Collector
577
578 void BlueStore::GarbageCollector::process_protrusive_extents(
579 const BlueStore::ExtentMap& extent_map,
580 uint64_t start_offset,
581 uint64_t end_offset,
582 uint64_t start_touch_offset,
583 uint64_t end_touch_offset,
584 uint64_t min_alloc_size)
585 {
586 assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
587
588 uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
589 uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);
590
591 dout(30) << __func__ << " (hex): [" << std::hex
592 << lookup_start_offset << ", " << lookup_end_offset
593 << ")" << std::dec << dendl;
594
595 for (auto it = extent_map.seek_lextent(lookup_start_offset);
596 it != extent_map.extent_map.end() &&
597 it->logical_offset < lookup_end_offset;
598 ++it) {
599 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
600 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
601
602 dout(30) << __func__ << " " << *it
603 << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
604 << dendl;
605
606 Blob* b = it->blob.get();
607
608 if (it->logical_offset >= start_touch_offset &&
609 it->logical_end() <= end_touch_offset) {
610 // Process extents within the range affected by
611 // the current write request.
612 // Need to take into account if existing extents
613 // can be merged with them (uncompressed case)
614 if (!b->get_blob().is_compressed()) {
615 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
616 --blob_info_counted->expected_allocations; // don't need to allocate
617 // new AU for compressed
618 // data since another
619 // collocated uncompressed
620 // blob already exists
621 dout(30) << __func__ << " --expected:"
622 << alloc_unit_start << dendl;
623 }
624 used_alloc_unit = alloc_unit_end;
625 blob_info_counted = nullptr;
626 }
627 } else if (b->get_blob().is_compressed()) {
628
629 // additionally we take into account compressed blobs that were not
630 // impacted by the write
631 BlobInfo& bi =
632 affected_blobs.emplace(
633 b, BlobInfo(b->get_referenced_bytes())).first->second;
634
635 int adjust =
636 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
637 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
638 dout(30) << __func__ << " expected_allocations="
639 << bi.expected_allocations << " end_au:"
640 << alloc_unit_end << dendl;
641
642 blob_info_counted = &bi;
643 used_alloc_unit = alloc_unit_end;
644
645 assert(it->length <= bi.referenced_bytes);
646 bi.referenced_bytes -= it->length;
647 dout(30) << __func__ << " affected_blob:" << *b
648 << " unref 0x" << std::hex << it->length
649 << " referenced = 0x" << bi.referenced_bytes
650 << std::dec << dendl;
651 // NOTE: we can't move a specific blob to the resulting GC list here
652 // when its reference counter == 0 since subsequent extents might
653 // decrement its expected_allocation.
654 // Hence we need to enumerate all the extents first.
655 if (!bi.collect_candidate) {
656 bi.first_lextent = it;
657 bi.collect_candidate = true;
658 }
659 bi.last_lextent = it;
660 } else {
661 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
662 // don't need to allocate new AU for compressed data since another
663 // collocated uncompressed blob already exists
664 --blob_info_counted->expected_allocations;
665 dout(30) << __func__ << " --expected_allocations:"
666 << alloc_unit_start << dendl;
667 }
668 used_alloc_unit = alloc_unit_end;
669 blob_info_counted = nullptr;
670 }
671 }
672
673 for (auto b_it = affected_blobs.begin();
674 b_it != affected_blobs.end();
675 ++b_it) {
676 Blob* b = b_it->first;
677 BlobInfo& bi = b_it->second;
678 if (bi.referenced_bytes == 0) {
679 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
680 int64_t blob_expected_for_release =
681 ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;
682
683 dout(30) << __func__ << " " << *(b_it->first)
684 << " expected4release=" << blob_expected_for_release
685 << " expected_allocations=" << bi.expected_allocations
686 << dendl;
687 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
688 if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
689 if (bi.collect_candidate) {
690 auto it = bi.first_lextent;
691 bool bExit = false;
692 do {
693 if (it->blob.get() == b) {
694 extents_to_collect.emplace_back(it->logical_offset, it->length);
695 }
696 bExit = it == bi.last_lextent;
697 ++it;
698 } while (!bExit);
699 }
700 expected_for_release += blob_expected_for_release;
701 expected_allocations += bi.expected_allocations;
702 }
703 }
704 }
705 }
706
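// Illustrative benefit arithmetic (assumed numbers) for the per-blob check in
// process_protrusive_extents(): a compressed blob occupying 4 AUs on disk
// whose surviving extents are expected to need 1 newly allocated AU gives
//   benefit = expected4release (4) - expected_allocations (1) = 3
// and its extents are queued for collection once benefit reaches
// bluestore_gc_enable_blob_threshold.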
707 int64_t BlueStore::GarbageCollector::estimate(
708 uint64_t start_offset,
709 uint64_t length,
710 const BlueStore::ExtentMap& extent_map,
711 const BlueStore::old_extent_map_t& old_extents,
712 uint64_t min_alloc_size)
713 {
714
715 affected_blobs.clear();
716 extents_to_collect.clear();
717 used_alloc_unit = boost::optional<uint64_t >();
718 blob_info_counted = nullptr;
719
720 gc_start_offset = start_offset;
721 gc_end_offset = start_offset + length;
722
723 uint64_t end_offset = start_offset + length;
724
725 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
726 Blob* b = it->e.blob.get();
727 if (b->get_blob().is_compressed()) {
728
729 // update gc_start_offset/gc_end_offset if needed
730 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
731 gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());
732
733 auto o = it->e.logical_offset;
734 auto l = it->e.length;
735
736 uint64_t ref_bytes = b->get_referenced_bytes();
737 // micro optimization to bypass blobs that have no more references
738 if (ref_bytes != 0) {
739 dout(30) << __func__ << " affected_blob:" << *b
740 << " unref 0x" << std::hex << o << "~" << l
741 << std::dec << dendl;
742 affected_blobs.emplace(b, BlobInfo(ref_bytes));
743 }
744 }
745 }
746 dout(30) << __func__ << " gc range(hex): [" << std::hex
747 << gc_start_offset << ", " << gc_end_offset
748 << ")" << std::dec << dendl;
749
750 // enumerate preceding extents to check if they reference affected blobs
751 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
752 process_protrusive_extents(extent_map,
753 gc_start_offset,
754 gc_end_offset,
755 start_offset,
756 end_offset,
757 min_alloc_size);
758 }
759 return expected_for_release - expected_allocations;
760 }
761
762 // Cache
763
764 BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
765 PerfCounters *logger)
766 {
767 Cache *c = nullptr;
768
769 if (type == "lru")
770 c = new LRUCache(cct);
771 else if (type == "2q")
772 c = new TwoQCache(cct);
773 else
774 assert(0 == "unrecognized cache type");
775
776 c->logger = logger;
777 return c;
778 }
779
780 void BlueStore::Cache::trim_all()
781 {
782 std::lock_guard<std::recursive_mutex> l(lock);
783 _trim(0, 0);
784 }
785
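// Illustrative budget split (assumed numbers): with target_bytes = 1 GiB and
// meta/data ratios of 0.5/0.5, trim() aims for ~512 MiB of onode metadata
// (converted to an onode count via bytes_per_onode) and ~512 MiB of buffer
// data; any excess is freed from buffers first, then from onodes.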
786 void BlueStore::Cache::trim(
787 uint64_t target_bytes,
788 float target_meta_ratio,
789 float target_data_ratio,
790 float bytes_per_onode)
791 {
792 std::lock_guard<std::recursive_mutex> l(lock);
793 uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
794 uint64_t current_buffer = _get_buffer_bytes();
795 uint64_t current = current_meta + current_buffer;
796
797 uint64_t target_meta = target_bytes * target_meta_ratio;
798 uint64_t target_buffer = target_bytes * target_data_ratio;
799
800 // correct for overflow or float imprecision
801 target_meta = min(target_bytes, target_meta);
802 target_buffer = min(target_bytes - target_meta, target_buffer);
803
804 if (current <= target_bytes) {
805 dout(10) << __func__
806 << " shard target " << pretty_si_t(target_bytes)
807 << " meta/data ratios " << target_meta_ratio
808 << " + " << target_data_ratio << " ("
809 << pretty_si_t(target_meta) << " + "
810 << pretty_si_t(target_buffer) << "), "
811 << " current " << pretty_si_t(current) << " ("
812 << pretty_si_t(current_meta) << " + "
813 << pretty_si_t(current_buffer) << ")"
814 << dendl;
815 return;
816 }
817
818 uint64_t need_to_free = current - target_bytes;
819 uint64_t free_buffer = 0;
820 uint64_t free_meta = 0;
821 if (current_buffer > target_buffer) {
822 free_buffer = current_buffer - target_buffer;
823 if (free_buffer > need_to_free) {
824 free_buffer = need_to_free;
825 }
826 }
827 free_meta = need_to_free - free_buffer;
828
829 // start bounds at what we have now
830 uint64_t max_buffer = current_buffer - free_buffer;
831 uint64_t max_meta = current_meta - free_meta;
832 uint64_t max_onodes = max_meta / bytes_per_onode;
833
834 dout(10) << __func__
835 << " shard target " << pretty_si_t(target_bytes)
836 << " ratio " << target_meta_ratio << " ("
837 << pretty_si_t(target_meta) << " + "
838 << pretty_si_t(target_buffer) << "), "
839 << " current " << pretty_si_t(current) << " ("
840 << pretty_si_t(current_meta) << " + "
841 << pretty_si_t(current_buffer) << "),"
842 << " need_to_free " << pretty_si_t(need_to_free) << " ("
843 << pretty_si_t(free_meta) << " + "
844 << pretty_si_t(free_buffer) << ")"
845 << " -> max " << max_onodes << " onodes + "
846 << max_buffer << " buffer"
847 << dendl;
848 _trim(max_onodes, max_buffer);
849 }
850
851
852 // LRUCache
853 #undef dout_prefix
854 #define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
855
856 void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
857 {
858 auto p = onode_lru.iterator_to(*o);
859 onode_lru.erase(p);
860 onode_lru.push_front(*o);
861 }
862
863 void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
864 {
865 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
866 << " buffers " << buffer_size << " / " << buffer_max
867 << dendl;
868
869 _audit("trim start");
870
871 // buffers
872 while (buffer_size > buffer_max) {
873 auto i = buffer_lru.rbegin();
874 if (i == buffer_lru.rend()) {
875 // stop if buffer_lru is now empty
876 break;
877 }
878
879 Buffer *b = &*i;
880 assert(b->is_clean());
881 dout(20) << __func__ << " rm " << *b << dendl;
882 b->space->_rm_buffer(this, b);
883 }
884
885 // onodes
886 int num = onode_lru.size() - onode_max;
887 if (num <= 0)
888 return; // don't even try
889
890 auto p = onode_lru.end();
891 assert(p != onode_lru.begin());
892 --p;
893 int skipped = 0;
894 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
895 while (num > 0) {
896 Onode *o = &*p;
897 int refs = o->nref.load();
898 if (refs > 1) {
899 dout(20) << __func__ << " " << o->oid << " has " << refs
900 << " refs, skipping" << dendl;
901 if (++skipped >= max_skipped) {
902 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
903 << num << " left to trim" << dendl;
904 break;
905 }
906
907 if (p == onode_lru.begin()) {
908 break;
909 } else {
910 p--;
911 num--;
912 continue;
913 }
914 }
915 dout(30) << __func__ << " rm " << o->oid << dendl;
916 if (p != onode_lru.begin()) {
917 onode_lru.erase(p--);
918 } else {
919 onode_lru.erase(p);
920 assert(num == 1);
921 }
922 o->get(); // paranoia
923 o->c->onode_map.remove(o->oid);
924 o->put();
925 --num;
926 }
927 }
928
929 #ifdef DEBUG_CACHE
930 void BlueStore::LRUCache::_audit(const char *when)
931 {
932 dout(10) << __func__ << " " << when << " start" << dendl;
933 uint64_t s = 0;
934 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
935 s += i->length;
936 }
937 if (s != buffer_size) {
938 derr << __func__ << " buffer_size " << buffer_size << " actual " << s
939 << dendl;
940 for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
941 derr << __func__ << " " << *i << dendl;
942 }
943 assert(s == buffer_size);
944 }
945 dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
946 << " ok" << dendl;
947 }
948 #endif
949
950 // TwoQCache
951 #undef dout_prefix
952 #define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
953
954
955 void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
956 {
957 auto p = onode_lru.iterator_to(*o);
958 onode_lru.erase(p);
959 onode_lru.push_front(*o);
960 }
961
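// Rough sketch of the 2Q buffer lifecycle implemented below:
//   new buffer                  -> warm_in  (front or back per caller hint)
//   evicted from warm_in        -> warm_out (data dropped, metadata kept)
//   re-added while in warm_out  -> hot
// so only data touched again after aging out of warm_in is promoted to hot.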
962 void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
963 {
964 dout(20) << __func__ << " level " << level << " near " << near
965 << " on " << *b
966 << " which has cache_private " << b->cache_private << dendl;
967 if (near) {
968 b->cache_private = near->cache_private;
969 switch (b->cache_private) {
970 case BUFFER_WARM_IN:
971 buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
972 break;
973 case BUFFER_WARM_OUT:
974 assert(b->is_empty());
975 buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
976 break;
977 case BUFFER_HOT:
978 buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
979 break;
980 default:
981 assert(0 == "bad cache_private");
982 }
983 } else if (b->cache_private == BUFFER_NEW) {
984 b->cache_private = BUFFER_WARM_IN;
985 if (level > 0) {
986 buffer_warm_in.push_front(*b);
987 } else {
988 // take caller hint to start at the back of the warm queue
989 buffer_warm_in.push_back(*b);
990 }
991 } else {
992 // we got a hint from discard
993 switch (b->cache_private) {
994 case BUFFER_WARM_IN:
995 // stay in warm_in. move to front, even though 2Q doesn't actually
996 // do this.
997 dout(20) << __func__ << " move to front of warm " << *b << dendl;
998 buffer_warm_in.push_front(*b);
999 break;
1000 case BUFFER_WARM_OUT:
1001 b->cache_private = BUFFER_HOT;
1002 // move to hot. fall-thru
1003 case BUFFER_HOT:
1004 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1005 buffer_hot.push_front(*b);
1006 break;
1007 default:
1008 assert(0 == "bad cache_private");
1009 }
1010 }
1011 if (!b->is_empty()) {
1012 buffer_bytes += b->length;
1013 buffer_list_bytes[b->cache_private] += b->length;
1014 }
1015 }
1016
1017 void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
1018 {
1019 dout(20) << __func__ << " " << *b << dendl;
1020 if (!b->is_empty()) {
1021 assert(buffer_bytes >= b->length);
1022 buffer_bytes -= b->length;
1023 assert(buffer_list_bytes[b->cache_private] >= b->length);
1024 buffer_list_bytes[b->cache_private] -= b->length;
1025 }
1026 switch (b->cache_private) {
1027 case BUFFER_WARM_IN:
1028 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1029 break;
1030 case BUFFER_WARM_OUT:
1031 buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
1032 break;
1033 case BUFFER_HOT:
1034 buffer_hot.erase(buffer_hot.iterator_to(*b));
1035 break;
1036 default:
1037 assert(0 == "bad cache_private");
1038 }
1039 }
1040
1041 void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
1042 {
1043 TwoQCache *src = static_cast<TwoQCache*>(srcc);
1044 src->_rm_buffer(b);
1045
1046 // preserve which list we're on (even if we can't preserve the order!)
1047 switch (b->cache_private) {
1048 case BUFFER_WARM_IN:
1049 assert(!b->is_empty());
1050 buffer_warm_in.push_back(*b);
1051 break;
1052 case BUFFER_WARM_OUT:
1053 assert(b->is_empty());
1054 buffer_warm_out.push_back(*b);
1055 break;
1056 case BUFFER_HOT:
1057 assert(!b->is_empty());
1058 buffer_hot.push_back(*b);
1059 break;
1060 default:
1061 assert(0 == "bad cache_private");
1062 }
1063 if (!b->is_empty()) {
1064 buffer_bytes += b->length;
1065 buffer_list_bytes[b->cache_private] += b->length;
1066 }
1067 }
1068
1069 void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
1070 {
1071 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1072 if (!b->is_empty()) {
1073 assert((int64_t)buffer_bytes + delta >= 0);
1074 buffer_bytes += delta;
1075 assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
1076 buffer_list_bytes[b->cache_private] += delta;
1077 }
1078 }
1079
1080 void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
1081 {
1082 dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
1083 << " buffers " << buffer_bytes << " / " << buffer_max
1084 << dendl;
1085
1086 _audit("trim start");
1087
1088 // buffers
1089 if (buffer_bytes > buffer_max) {
1090 uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
1091 uint64_t khot = buffer_max - kin;
1092
1093 // pre-calculate kout based on average buffer size too,
1094 // which is typical (the warm_in and hot lists may change later)
1095 uint64_t kout = 0;
1096 uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
1097 if (buffer_num) {
1098 uint64_t buffer_avg_size = buffer_bytes / buffer_num;
1099 assert(buffer_avg_size);
1100 uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
1101 kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1102 }
1103
1104 if (buffer_list_bytes[BUFFER_HOT] < khot) {
1105 // hot is small, give slack to warm_in
1106 kin += khot - buffer_list_bytes[BUFFER_HOT];
1107 } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
1108 // warm_in is small, give slack to hot
1109 khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
1110 }
1111
1112 // adjust warm_in list
1113 int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
1114 uint64_t evicted = 0;
1115
1116 while (to_evict_bytes > 0) {
1117 auto p = buffer_warm_in.rbegin();
1118 if (p == buffer_warm_in.rend()) {
1119 // stop if warm_in list is now empty
1120 break;
1121 }
1122
1123 Buffer *b = &*p;
1124 assert(b->is_clean());
1125 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1126 assert(buffer_bytes >= b->length);
1127 buffer_bytes -= b->length;
1128 assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
1129 buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
1130 to_evict_bytes -= b->length;
1131 evicted += b->length;
1132 b->state = Buffer::STATE_EMPTY;
1133 b->data.clear();
1134 buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
1135 buffer_warm_out.push_front(*b);
1136 b->cache_private = BUFFER_WARM_OUT;
1137 }
1138
1139 if (evicted > 0) {
1140 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1141 << " from warm_in list, done evicting warm_in buffers"
1142 << dendl;
1143 }
1144
1145 // adjust hot list
1146 to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
1147 evicted = 0;
1148
1149 while (to_evict_bytes > 0) {
1150 auto p = buffer_hot.rbegin();
1151 if (p == buffer_hot.rend()) {
1152 // stop if hot list is now empty
1153 break;
1154 }
1155
1156 Buffer *b = &*p;
1157 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1158 assert(b->is_clean());
1159 // adjust evict size before buffer goes invalid
1160 to_evict_bytes -= b->length;
1161 evicted += b->length;
1162 b->space->_rm_buffer(this, b);
1163 }
1164
1165 if (evicted > 0) {
1166 dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
1167 << " from hot list, done evicting hot buffers"
1168 << dendl;
1169 }
1170
1171 // adjust warm out list too, if necessary
1172 int64_t num = buffer_warm_out.size() - kout;
1173 while (num-- > 0) {
1174 Buffer *b = &*buffer_warm_out.rbegin();
1175 assert(b->is_empty());
1176 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1177 b->space->_rm_buffer(this, b);
1178 }
1179 }
1180
1181 // onodes
1182 int num = onode_lru.size() - onode_max;
1183 if (num <= 0)
1184 return; // don't even try
1185
1186 auto p = onode_lru.end();
1187 assert(p != onode_lru.begin());
1188 --p;
1189 int skipped = 0;
1190 int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
1191 while (num > 0) {
1192 Onode *o = &*p;
1193 dout(20) << __func__ << " considering " << o << dendl;
1194 int refs = o->nref.load();
1195 if (refs > 1) {
1196 dout(20) << __func__ << " " << o->oid << " has " << refs
1197 << " refs; skipping" << dendl;
1198 if (++skipped >= max_skipped) {
1199 dout(20) << __func__ << " maximum skip pinned reached; stopping with "
1200 << num << " left to trim" << dendl;
1201 break;
1202 }
1203
1204 if (p == onode_lru.begin()) {
1205 break;
1206 } else {
1207 p--;
1208 num--;
1209 continue;
1210 }
1211 }
1212 dout(30) << __func__ << " " << o->oid << " num=" << num << " lru size=" << onode_lru.size() << dendl;
1213 if (p != onode_lru.begin()) {
1214 onode_lru.erase(p--);
1215 } else {
1216 onode_lru.erase(p);
1217 assert(num == 1);
1218 }
1219 o->get(); // paranoia
1220 o->c->onode_map.remove(o->oid);
1221 o->put();
1222 --num;
1223 }
1224 }
1225
1226 #ifdef DEBUG_CACHE
1227 void BlueStore::TwoQCache::_audit(const char *when)
1228 {
1229 dout(10) << __func__ << " " << when << " start" << dendl;
1230 uint64_t s = 0;
1231 for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
1232 s += i->length;
1233 }
1234
1235 uint64_t hot_bytes = s;
1236 if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
1237 derr << __func__ << " hot_list_bytes "
1238 << buffer_list_bytes[BUFFER_HOT]
1239 << " != actual " << hot_bytes
1240 << dendl;
1241 assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
1242 }
1243
1244 for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
1245 s += i->length;
1246 }
1247
1248 uint64_t warm_in_bytes = s - hot_bytes;
1249 if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
1250 derr << __func__ << " warm_in_list_bytes "
1251 << buffer_list_bytes[BUFFER_WARM_IN]
1252 << " != actual " << warm_in_bytes
1253 << dendl;
1254 assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
1255 }
1256
1257 if (s != buffer_bytes) {
1258 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1259 << dendl;
1260 assert(s == buffer_bytes);
1261 }
1262
1263 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1264 << " ok" << dendl;
1265 }
1266 #endif
1267
1268
1269 // BufferSpace
1270
1271 #undef dout_prefix
1272 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1273
1274 void BlueStore::BufferSpace::_clear(Cache* cache)
1275 {
1276 // note: we already hold cache->lock
1277 ldout(cache->cct, 20) << __func__ << dendl;
1278 while (!buffer_map.empty()) {
1279 _rm_buffer(cache, buffer_map.begin());
1280 }
1281 }
1282
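// Rough summary of the per-buffer overlap cases handled below for a discard
// of [offset, offset+length):
//   buffer straddles both ends of the range -> split: keep the head, re-add
//     the tail as a new buffer, drop the middle
//   buffer overlaps only at its tail        -> truncate the tail
//   buffer entirely inside the range        -> remove it
//   buffer overlaps only at its head        -> re-add the surviving tail,
//     drop the rest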
1283 int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
1284 {
1285 // note: we already hold cache->lock
1286 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1287 << std::dec << dendl;
1288 int cache_private = 0;
1289 cache->_audit("discard start");
1290 auto i = _data_lower_bound(offset);
1291 uint32_t end = offset + length;
1292 while (i != buffer_map.end()) {
1293 Buffer *b = i->second.get();
1294 if (b->offset >= end) {
1295 break;
1296 }
1297 if (b->cache_private > cache_private) {
1298 cache_private = b->cache_private;
1299 }
1300 if (b->offset < offset) {
1301 int64_t front = offset - b->offset;
1302 if (b->end() > end) {
1303 // drop middle (split)
1304 uint32_t tail = b->end() - end;
1305 if (b->data.length()) {
1306 bufferlist bl;
1307 bl.substr_of(b->data, b->length - tail, tail);
1308 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1309 nb->maybe_rebuild();
1310 _add_buffer(cache, nb, 0, b);
1311 } else {
1312 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1313 0, b);
1314 }
1315 if (!b->is_writing()) {
1316 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1317 }
1318 b->truncate(front);
1319 b->maybe_rebuild();
1320 cache->_audit("discard end 1");
1321 break;
1322 } else {
1323 // drop tail
1324 if (!b->is_writing()) {
1325 cache->_adjust_buffer_size(b, front - (int64_t)b->length);
1326 }
1327 b->truncate(front);
1328 b->maybe_rebuild();
1329 ++i;
1330 continue;
1331 }
1332 }
1333 if (b->end() <= end) {
1334 // drop entire buffer
1335 _rm_buffer(cache, i++);
1336 continue;
1337 }
1338 // drop front
1339 uint32_t keep = b->end() - end;
1340 if (b->data.length()) {
1341 bufferlist bl;
1342 bl.substr_of(b->data, b->length - keep, keep);
1343 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1344 nb->maybe_rebuild();
1345 _add_buffer(cache, nb, 0, b);
1346 } else {
1347 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1348 }
1349 _rm_buffer(cache, i);
1350 cache->_audit("discard end 2");
1351 break;
1352 }
1353 return cache_private;
1354 }
1355
1356 void BlueStore::BufferSpace::read(
1357 Cache* cache,
1358 uint32_t offset,
1359 uint32_t length,
1360 BlueStore::ready_regions_t& res,
1361 interval_set<uint32_t>& res_intervals)
1362 {
1363 res.clear();
1364 res_intervals.clear();
1365 uint32_t want_bytes = length;
1366 uint32_t end = offset + length;
1367
1368 {
1369 std::lock_guard<std::recursive_mutex> l(cache->lock);
1370 for (auto i = _data_lower_bound(offset);
1371 i != buffer_map.end() && offset < end && i->first < end;
1372 ++i) {
1373 Buffer *b = i->second.get();
1374 assert(b->end() > offset);
1375 if (b->is_writing() || b->is_clean()) {
1376 if (b->offset < offset) {
1377 uint32_t skip = offset - b->offset;
1378 uint32_t l = MIN(length, b->length - skip);
1379 res[offset].substr_of(b->data, skip, l);
1380 res_intervals.insert(offset, l);
1381 offset += l;
1382 length -= l;
1383 if (!b->is_writing()) {
1384 cache->_touch_buffer(b);
1385 }
1386 continue;
1387 }
1388 if (b->offset > offset) {
1389 uint32_t gap = b->offset - offset;
1390 if (length <= gap) {
1391 break;
1392 }
1393 offset += gap;
1394 length -= gap;
1395 }
1396 if (!b->is_writing()) {
1397 cache->_touch_buffer(b);
1398 }
1399 if (b->length > length) {
1400 res[offset].substr_of(b->data, 0, length);
1401 res_intervals.insert(offset, length);
1402 break;
1403 } else {
1404 res[offset].append(b->data);
1405 res_intervals.insert(offset, b->length);
1406 if (b->length == length)
1407 break;
1408 offset += b->length;
1409 length -= b->length;
1410 }
1411 }
1412 }
1413 }
1414
1415 uint64_t hit_bytes = res_intervals.size();
1416 assert(hit_bytes <= want_bytes);
1417 uint64_t miss_bytes = want_bytes - hit_bytes;
1418 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1419 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1420 }
1421
1422 void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
1423 {
1424 std::lock_guard<std::recursive_mutex> l(cache->lock);
1425
1426 auto i = writing.begin();
1427 while (i != writing.end()) {
1428 if (i->seq > seq) {
1429 break;
1430 }
1431 if (i->seq < seq) {
1432 ++i;
1433 continue;
1434 }
1435
1436 Buffer *b = &*i;
1437 assert(b->is_writing());
1438
1439 if (b->flags & Buffer::FLAG_NOCACHE) {
1440 writing.erase(i++);
1441 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1442 buffer_map.erase(b->offset);
1443 } else {
1444 b->state = Buffer::STATE_CLEAN;
1445 writing.erase(i++);
1446 b->maybe_rebuild();
1447 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1448 cache->_add_buffer(b, 1, nullptr);
1449 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1450 }
1451 }
1452
1453 cache->_audit("finish_write end");
1454 }
1455
1456 void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
1457 {
1458 std::lock_guard<std::recursive_mutex> lk(cache->lock);
1459 if (buffer_map.empty())
1460 return;
1461
1462 auto p = --buffer_map.end();
1463 while (true) {
1464 if (p->second->end() <= pos)
1465 break;
1466
1467 if (p->second->offset < pos) {
1468 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1469 size_t left = pos - p->second->offset;
1470 size_t right = p->second->length - left;
1471 if (p->second->data.length()) {
1472 bufferlist bl;
1473 bl.substr_of(p->second->data, left, right);
1474 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1475 0, p->second.get());
1476 } else {
1477 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1478 0, p->second.get());
1479 }
1480 cache->_adjust_buffer_size(p->second.get(), -right);
1481 p->second->truncate(left);
1482 break;
1483 }
1484
1485 assert(p->second->end() > pos);
1486 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1487 if (p->second->data.length()) {
1488 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1489 p->second->offset - pos, p->second->data),
1490 0, p->second.get());
1491 } else {
1492 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1493 p->second->offset - pos, p->second->length),
1494 0, p->second.get());
1495 }
1496 if (p == buffer_map.begin()) {
1497 _rm_buffer(cache, p);
1498 break;
1499 } else {
1500 _rm_buffer(cache, p--);
1501 }
1502 }
1503 assert(writing.empty());
1504 }
1505
1506 // OnodeSpace
1507
1508 #undef dout_prefix
1509 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1510
1511 BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1512 {
1513 std::lock_guard<std::recursive_mutex> l(cache->lock);
1514 auto p = onode_map.find(oid);
1515 if (p != onode_map.end()) {
1516 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1517 << " raced, returning existing " << p->second
1518 << dendl;
1519 return p->second;
1520 }
1521 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1522 onode_map[oid] = o;
1523 cache->_add_onode(o, 1);
1524 return o;
1525 }
1526
1527 BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1528 {
1529 ldout(cache->cct, 30) << __func__ << dendl;
1530 OnodeRef o;
1531 bool hit = false;
1532
1533 {
1534 std::lock_guard<std::recursive_mutex> l(cache->lock);
1535 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1536 if (p == onode_map.end()) {
1537 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1538 } else {
1539 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1540 << dendl;
1541 cache->_touch_onode(p->second);
1542 hit = true;
1543 o = p->second;
1544 }
1545 }
1546
1547 if (hit) {
1548 cache->logger->inc(l_bluestore_onode_hits);
1549 } else {
1550 cache->logger->inc(l_bluestore_onode_misses);
1551 }
1552 return o;
1553 }
1554
1555 void BlueStore::OnodeSpace::clear()
1556 {
1557 std::lock_guard<std::recursive_mutex> l(cache->lock);
1558 ldout(cache->cct, 10) << __func__ << dendl;
1559 for (auto &p : onode_map) {
1560 cache->_rm_onode(p.second);
1561 }
1562 onode_map.clear();
1563 }
1564
1565 bool BlueStore::OnodeSpace::empty()
1566 {
1567 std::lock_guard<std::recursive_mutex> l(cache->lock);
1568 return onode_map.empty();
1569 }
1570
1571 void BlueStore::OnodeSpace::rename(
1572 OnodeRef& oldo,
1573 const ghobject_t& old_oid,
1574 const ghobject_t& new_oid,
1575 const mempool::bluestore_cache_other::string& new_okey)
1576 {
1577 std::lock_guard<std::recursive_mutex> l(cache->lock);
1578 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1579 << dendl;
1580 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1581 po = onode_map.find(old_oid);
1582 pn = onode_map.find(new_oid);
1583 assert(po != pn);
1584
1585 assert(po != onode_map.end());
1586 if (pn != onode_map.end()) {
1587 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1588 << dendl;
1589 cache->_rm_onode(pn->second);
1590 onode_map.erase(pn);
1591 }
1592 OnodeRef o = po->second;
1593
1594 // install a non-existent onode at old location
1595 oldo.reset(new Onode(o->c, old_oid, o->key));
1596 po->second = oldo;
1597 cache->_add_onode(po->second, 1);
1598
1599 // add at new position and fix oid, key
1600 onode_map.insert(make_pair(new_oid, o));
1601 cache->_touch_onode(o);
1602 o->oid = new_oid;
1603 o->key = new_okey;
1604 }
1605
1606 bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1607 {
1608 std::lock_guard<std::recursive_mutex> l(cache->lock);
1609 ldout(cache->cct, 20) << __func__ << dendl;
1610 for (auto& i : onode_map) {
1611 if (f(i.second)) {
1612 return true;
1613 }
1614 }
1615 return false;
1616 }
1617
1618
1619 // SharedBlob
1620
1621 #undef dout_prefix
1622 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1623
1624 ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1625 {
1626 out << "SharedBlob(" << &sb;
1627
1628 if (sb.loaded) {
1629 out << " loaded " << *sb.persistent;
1630 } else {
1631 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1632 }
1633 return out << ")";
1634 }
1635
1636 BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1637 : coll(_coll), sbid_unloaded(i)
1638 {
1639 assert(sbid_unloaded > 0);
1640 if (get_cache()) {
1641 get_cache()->add_blob();
1642 }
1643 }
1644
1645 BlueStore::SharedBlob::~SharedBlob()
1646 {
1647 if (get_cache()) { // the dummy instances have a nullptr
1648 std::lock_guard<std::recursive_mutex> l(get_cache()->lock);
1649 bc._clear(get_cache());
1650 get_cache()->rm_blob();
1651 }
1652 if (loaded && persistent) {
1653 delete persistent;
1654 }
1655 }
1656
1657 void BlueStore::SharedBlob::put()
1658 {
1659 if (--nref == 0) {
1660 ldout(coll->store->cct, 20) << __func__ << " " << this
1661 << " removing self from set " << get_parent()
1662 << dendl;
1663 if (get_parent()) {
1664 if (get_parent()->remove(this)) {
1665 delete this;
1666 } else {
1667 ldout(coll->store->cct, 20)
1668 << __func__ << " " << this << " lost race to remove myself from set"
1669 << dendl;
1670 }
1671 } else {
1672 delete this;
1673 }
1674 }
1675 }
1676
1677 void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1678 {
1679 assert(persistent);
1680 persistent->ref_map.get(offset, length);
1681 }
1682
1683 void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
1684 PExtentVector *r,
1685 set<SharedBlob*> *maybe_unshared)
1686 {
1687 assert(persistent);
1688 bool maybe = false;
1689 persistent->ref_map.put(offset, length, r, maybe_unshared ? &maybe : nullptr);
1690 if (maybe_unshared && maybe) {
1691 maybe_unshared->insert(this);
1692 }
1693 }
1694
1695 // Blob
1696
1697 #undef dout_prefix
1698 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1699
1700 ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1701 {
1702 out << "Blob(" << &b;
1703 if (b.is_spanning()) {
1704 out << " spanning " << b.id;
1705 }
1706 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1707 if (b.shared_blob) {
1708 out << " " << *b.shared_blob;
1709 } else {
1710 out << " (shared_blob=NULL)";
1711 }
1712 out << ")";
1713 return out;
1714 }
1715
1716 void BlueStore::Blob::discard_unallocated(Collection *coll)
1717 {
1718 if (get_blob().is_shared()) {
1719 return;
1720 }
1721 if (get_blob().is_compressed()) {
1722 bool discard = false;
1723 bool all_invalid = true;
1724 for (auto e : get_blob().get_extents()) {
1725 if (!e.is_valid()) {
1726 discard = true;
1727 } else {
1728 all_invalid = false;
1729 }
1730 }
1731 assert(discard == all_invalid); // for a compressed blob either all
1732 // pextents are invalid or none are.
1733 if (discard) {
1734 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1735 get_blob().get_logical_length());
1736 }
1737 } else {
1738 size_t pos = 0;
1739 for (auto e : get_blob().get_extents()) {
1740 if (!e.is_valid()) {
1741 ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
1742 << "~" << e.length
1743 << std::dec << dendl;
1744 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1745 }
1746 pos += e.length;
1747 }
1748 if (get_blob().can_prune_tail()) {
1749 dirty_blob().prune_tail();
1750 used_in_blob.prune_tail(get_blob().get_ondisk_length());
1751 auto cct = coll->store->cct; //used by dout
1752 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
1753 }
1754 }
1755 }
1756
1757 void BlueStore::Blob::get_ref(
1758 Collection *coll,
1759 uint32_t offset,
1760 uint32_t length)
1761 {
1762 // Caller has to initialize the Blob's logical length prior to incrementing
1763 // references. Otherwise one can neither determine the required number of
1764 // counters in case of per-au tracking nor obtain min_release_size
1765 // for single counter mode.
1766 assert(get_blob().get_logical_length() != 0);
1767 auto cct = coll->store->cct;
1768 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1769 << std::dec << " " << *this << dendl;
1770
1771 if (used_in_blob.is_empty()) {
1772 uint32_t min_release_size =
1773 get_blob().get_release_size(coll->store->min_alloc_size);
1774 uint64_t l = get_blob().get_logical_length();
1775 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1776 << min_release_size << std::dec << dendl;
1777 used_in_blob.init(l, min_release_size);
1778 }
1779 used_in_blob.get(
1780 offset,
1781 length);
1782 }
1783
1784 bool BlueStore::Blob::put_ref(
1785 Collection *coll,
1786 uint32_t offset,
1787 uint32_t length,
1788 PExtentVector *r)
1789 {
1790 PExtentVector logical;
1791
1792 auto cct = coll->store->cct;
1793 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1794 << std::dec << " " << *this << dendl;
1795
1796 bool empty = used_in_blob.put(
1797 offset,
1798 length,
1799 &logical);
1800 r->clear();
1801 // nothing to release
1802 if (!empty && logical.empty()) {
1803 return false;
1804 }
1805
1806 bluestore_blob_t& b = dirty_blob();
1807 return b.release_extents(empty, logical, r);
1808 }
1809
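// Illustrative sketch (assumed numbers, no csums or unused tracking involved):
// for an existing mutable blob of logical length 0x8000, a write at
// b_offset 0x8000 of length 0x6000 with target_blob_size 0x10000 lands
// entirely past the current end, so new_blen becomes 0xe000 (< target), no
// overflow trimming is needed, and the blob is grown via add_tail() so the
// write can reuse it.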
1810 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
1811 uint32_t target_blob_size,
1812 uint32_t b_offset,
1813 uint32_t *length0) {
1814 assert(min_alloc_size);
1815 assert(target_blob_size);
1816 if (!get_blob().is_mutable()) {
1817 return false;
1818 }
1819
1820 uint32_t length = *length0;
1821 uint32_t end = b_offset + length;
1822
1823 // Currently for the sake of simplicity we omit blob reuse if data is
1824 // unaligned with csum chunk. Later we can perform padding if needed.
1825 if (get_blob().has_csum() &&
1826 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
1827 (end % get_blob().get_csum_chunk_size()) != 0)) {
1828 return false;
1829 }
1830
1831 auto blen = get_blob().get_logical_length();
1832 uint32_t new_blen = blen;
1833
1834 // make sure target_blob_size isn't less than current blob len
1835 target_blob_size = MAX(blen, target_blob_size);
1836
1837 if (b_offset >= blen) {
1838 // new data lies entirely beyond the end of the existing blob
1839 new_blen = end;
1840 } else {
1841 // new data overlaps with the existing blob
1842 new_blen = MAX(blen, end);
1843
1844 uint32_t overlap = 0;
1845 if (new_blen > blen) {
1846 overlap = blen - b_offset;
1847 } else {
1848 overlap = length;
1849 }
1850
1851 if (!get_blob().is_unallocated(b_offset, overlap)) {
1852 // abort if any piece of the overlap has already been allocated
1853 return false;
1854 }
1855 }
1856
1857 if (new_blen > blen) {
1858 int64_t overflow = int64_t(new_blen) - target_blob_size;
1859 // Unable to decrease the provided length enough to fit into target_blob_size
1860 if (overflow >= length) {
1861 return false;
1862 }
1863
1864 // FIXME: in some cases we could reduce unused resolution
1865 if (get_blob().has_unused()) {
1866 return false;
1867 }
1868
1869 if (overflow > 0) {
1870 new_blen -= overflow;
1871 length -= overflow;
1872 *length0 = length;
1873 }
1874
1875 if (new_blen > blen) {
1876 dirty_blob().add_tail(new_blen);
1877 used_in_blob.add_tail(new_blen,
1878 get_blob().get_release_size(min_alloc_size));
1879 }
1880 }
1881 return true;
1882 }
1883
1884 void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
1885 {
1886 auto cct = coll->store->cct; //used by dout
1887 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1888 << " start " << *this << dendl;
1889 assert(blob.can_split());
1890 assert(used_in_blob.can_split());
1891 bluestore_blob_t &lb = dirty_blob();
1892 bluestore_blob_t &rb = r->dirty_blob();
1893
1894 used_in_blob.split(
1895 blob_offset,
1896 &(r->used_in_blob));
1897
1898 lb.split(blob_offset, rb);
1899 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
1900
1901 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1902 << " finish " << *this << dendl;
1903 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
1904 << " and " << *r << dendl;
1905 }
1906
1907 #ifndef CACHE_BLOB_BL
1908 void BlueStore::Blob::decode(
1909 Collection *coll,
1910 bufferptr::iterator& p,
1911 uint64_t struct_v,
1912 uint64_t* sbid,
1913 bool include_ref_map)
1914 {
1915 denc(blob, p, struct_v);
1916 if (blob.is_shared()) {
1917 denc(*sbid, p);
1918 }
1919 if (include_ref_map) {
1920 if (struct_v > 1) {
1921 used_in_blob.decode(p);
1922 } else {
1923 used_in_blob.clear();
1924 bluestore_extent_ref_map_t legacy_ref_map;
1925 legacy_ref_map.decode(p);
1926 for (auto r : legacy_ref_map.ref_map) {
1927 get_ref(
1928 coll,
1929 r.first,
1930 r.second.refs * r.second.length);
1931 }
1932 }
1933 }
1934 }
1935 #endif
1936
1937 // Extent
1938
1939 ostream& operator<<(ostream& out, const BlueStore::Extent& e)
1940 {
1941 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
1942 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
1943 << " " << *e.blob;
1944 }
1945
1946 // OldExtent
1947 BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
1948 uint32_t lo,
1949 uint32_t o,
1950 uint32_t l,
1951 BlobRef& b) {
1952 OldExtent* oe = new OldExtent(lo, o, l, b);
1953 b->put_ref(c.get(), o, l, &(oe->r));
1954 oe->blob_empty = b->get_referenced_bytes() == 0;
1955 return oe;
1956 }
1957
1958 // ExtentMap
1959
1960 #undef dout_prefix
1961 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1962
1963 BlueStore::ExtentMap::ExtentMap(Onode *o)
1964 : onode(o),
1965 inline_bl(
1966 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
1967 }
1968
1969 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
1970 bool force)
1971 {
1972 auto cct = onode->c->store->cct; //used by dout
1973 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
1974 if (onode->onode.extent_map_shards.empty()) {
1975 if (inline_bl.length() == 0) {
1976 unsigned n;
1977 // we need to encode inline_bl to measure encoded length
1978 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
1979 assert(!never_happen);
1980 size_t len = inline_bl.length();
1981 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
1982 << " extents" << dendl;
1983 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
1984 request_reshard(0, OBJECT_MAX_SIZE);
1985 return;
1986 }
1987 }
1988 // will persist in the onode key.
1989 } else {
1990 // pending shard update
1991 struct dirty_shard_t {
1992 Shard *shard;
1993 bufferlist bl;
1994 dirty_shard_t(Shard *s) : shard(s) {}
1995 };
1996 vector<dirty_shard_t> encoded_shards;
1997 // allocate slots for all shards in a single call instead of
1998 // doing multiple allocations, one per dirty shard
1999 encoded_shards.reserve(shards.size());
2000
2001 auto p = shards.begin();
2002 auto prev_p = p;
2003 while (p != shards.end()) {
2004 assert(p->shard_info->offset >= prev_p->shard_info->offset);
2005 auto n = p;
2006 ++n;
2007 if (p->dirty) {
2008 uint32_t endoff;
2009 if (n == shards.end()) {
2010 endoff = OBJECT_MAX_SIZE;
2011 } else {
2012 endoff = n->shard_info->offset;
2013 }
2014 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2015 bufferlist& bl = encoded_shards.back().bl;
2016 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2017 bl, &p->extents)) {
2018 if (force) {
2019 derr << __func__ << " encode_some needs reshard" << dendl;
2020 assert(!force);
2021 }
2022 }
2023 size_t len = bl.length();
2024
2025 dout(20) << __func__ << " shard 0x" << std::hex
2026 << p->shard_info->offset << std::dec << " is " << len
2027 << " bytes (was " << p->shard_info->bytes << ") from "
2028 << p->extents << " extents" << dendl;
2029
2030 if (!force) {
2031 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2032 // we are big; reshard ourselves
2033 request_reshard(p->shard_info->offset, endoff);
2034 }
2035 // avoid resharding the trailing shard, even if it is small
2036 else if (n != shards.end() &&
2037 len < g_conf->bluestore_extent_map_shard_min_size) {
2038 assert(endoff != OBJECT_MAX_SIZE);
2039 if (p == shards.begin()) {
2040 // we are the first shard, combine with next shard
2041 request_reshard(p->shard_info->offset, endoff + 1);
2042 } else {
2043 // combine either with the previous shard or the next,
2044 // whichever is smaller
2045 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2046 request_reshard(p->shard_info->offset, endoff + 1);
2047 } else {
2048 request_reshard(prev_p->shard_info->offset, endoff);
2049 }
2050 }
2051 }
2052 }
2053 }
2054 prev_p = p;
2055 p = n;
2056 }
2057 if (needs_reshard()) {
2058 return;
2059 }
2060
2061 // schedule DB update for dirty shards
2062 string key;
2063 for (auto& it : encoded_shards) {
2064 it.shard->dirty = false;
2065 it.shard->shard_info->bytes = it.bl.length();
2066 generate_extent_shard_key_and_apply(
2067 onode->key,
2068 it.shard->shard_info->offset,
2069 &key,
2070 [&](const string& final_key) {
2071 t->set(PREFIX_OBJ, final_key, it.bl);
2072 }
2073 );
2074 }
2075 }
2076 }
2077
2078 bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2079 {
2080 if (spanning_blob_map.empty())
2081 return 0;
2082 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2083 // if bid did not wrap around, it is unused and therefore available.
2084 if (bid >= 0)
2085 return bid;
2086 // Find next unused bid;
2087 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2088 const auto begin_bid = bid;
2089 do {
2090 if (!spanning_blob_map.count(bid))
2091 return bid;
2092 else {
2093 bid++;
2094 if (bid < 0) bid = 0;
2095 }
2096 } while (bid != begin_bid);
2097 assert(0 == "no available blob id");
2098 }
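  // Note: spanning blob ids are normally handed out sequentially as the
  // current maximum id plus one; only if that value wraps negative does the
  // code fall back to probing from a random id and scanning linearly (with
  // wrap-around) for a free slot, asserting if the id space is exhausted.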
2099
2100 void BlueStore::ExtentMap::reshard(
2101 KeyValueDB *db,
2102 KeyValueDB::Transaction t)
2103 {
2104 auto cct = onode->c->store->cct; // used by dout
2105
2106 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2107 << needs_reshard_end << ")" << std::dec
2108 << " of " << onode->onode.extent_map_shards.size()
2109 << " shards on " << onode->oid << dendl;
2110 for (auto& p : spanning_blob_map) {
2111 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2112 << dendl;
2113 }
2114 // determine shard index range
2115 unsigned si_begin = 0, si_end = 0;
2116 if (!shards.empty()) {
2117 while (si_begin + 1 < shards.size() &&
2118 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2119 ++si_begin;
2120 }
2121 needs_reshard_begin = shards[si_begin].shard_info->offset;
2122 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2123 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2124 needs_reshard_end = shards[si_end].shard_info->offset;
2125 break;
2126 }
2127 }
2128 if (si_end == shards.size()) {
2129 needs_reshard_end = OBJECT_MAX_SIZE;
2130 }
2131 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2132 << " over 0x[" << std::hex << needs_reshard_begin << ","
2133 << needs_reshard_end << ")" << std::dec << dendl;
2134 }
2135
2136 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2137
2138 // we may need to fault in a larger interval later; we must have all
2139 // referring extents for spanning blobs loaded in order to have
2140 // accurate use_tracker values.
2141 uint32_t spanning_scan_begin = needs_reshard_begin;
2142 uint32_t spanning_scan_end = needs_reshard_end;
2143
2144 // remove old keys
2145 string key;
2146 for (unsigned i = si_begin; i < si_end; ++i) {
2147 generate_extent_shard_key_and_apply(
2148 onode->key, shards[i].shard_info->offset, &key,
2149 [&](const string& final_key) {
2150 t->rmkey(PREFIX_OBJ, final_key);
2151 }
2152 );
2153 }
2154
2155 // calculate average extent size
2156 unsigned bytes = 0;
2157 unsigned extents = 0;
2158 if (onode->onode.extent_map_shards.empty()) {
2159 bytes = inline_bl.length();
2160 extents = extent_map.size();
2161 } else {
2162 for (unsigned i = si_begin; i < si_end; ++i) {
2163 bytes += shards[i].shard_info->bytes;
2164 extents += shards[i].extents;
2165 }
2166 }
2167 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2168 unsigned slop = target *
2169 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2170 unsigned extent_avg = bytes / MAX(1, extents);
2171 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2172 << ", slop " << slop << dendl;
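  // Illustration (values are only an example, not necessarily the defaults):
  // with target = 500, a slop ratio of 0.2 (slop = 100) and extent_avg = 50,
  // a shard boundary is cut before the extent that would push the running
  // estimate past 500 bytes, or past 600 bytes when the cut would otherwise
  // fall inside a blob (would_span), since boundaries that split a blob are
  // disfavored.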
2173
2174 // reshard
2175 unsigned estimate = 0;
2176 unsigned offset = needs_reshard_begin;
2177 vector<bluestore_onode_t::shard_info> new_shard_info;
2178 unsigned max_blob_end = 0;
2179 Extent dummy(needs_reshard_begin);
2180 for (auto e = extent_map.lower_bound(dummy);
2181 e != extent_map.end();
2182 ++e) {
2183 if (e->logical_offset >= needs_reshard_end) {
2184 break;
2185 }
2186 dout(30) << " extent " << *e << dendl;
2187
2188 // disfavor shard boundaries that span a blob
2189 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2190 if (estimate &&
2191 estimate + extent_avg > target + (would_span ? slop : 0)) {
2192 // new shard
2193 if (offset == needs_reshard_begin) {
2194 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2195 new_shard_info.back().offset = offset;
2196 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2197 << std::dec << dendl;
2198 }
2199 offset = e->logical_offset;
2200 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2201 new_shard_info.back().offset = offset;
2202 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2203 << std::dec << dendl;
2204 estimate = 0;
2205 }
2206 estimate += extent_avg;
2207 unsigned bs = e->blob_start();
2208 if (bs < spanning_scan_begin) {
2209 spanning_scan_begin = bs;
2210 }
2211 uint32_t be = e->blob_end();
2212 if (be > max_blob_end) {
2213 max_blob_end = be;
2214 }
2215 if (be > spanning_scan_end) {
2216 spanning_scan_end = be;
2217 }
2218 }
2219 if (new_shard_info.empty() && (si_begin > 0 ||
2220 si_end < shards.size())) {
2221 // we resharded a partial range; we must produce at least one output
2222 // shard
2223 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2224 new_shard_info.back().offset = needs_reshard_begin;
2225 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2226 << std::dec << " (singleton degenerate case)" << dendl;
2227 }
2228
2229 auto& sv = onode->onode.extent_map_shards;
2230 dout(20) << __func__ << " new " << new_shard_info << dendl;
2231 dout(20) << __func__ << " old " << sv << dendl;
2232 if (sv.empty()) {
2233 // no old shards to keep
2234 sv.swap(new_shard_info);
2235 init_shards(true, true);
2236 } else {
2237 // splice in new shards
2238 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2239 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2240 sv.insert(
2241 sv.begin() + si_begin,
2242 new_shard_info.begin(),
2243 new_shard_info.end());
2244 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2245 si_end = si_begin + new_shard_info.size();
2246
2247 assert(sv.size() == shards.size());
2248
2249 // note that we need to update every shard_info of shards here,
2250 // as sv might have been totally re-allocated above
2251 for (unsigned i = 0; i < shards.size(); i++) {
2252 shards[i].shard_info = &sv[i];
2253 }
2254
2255 // mark newly added shards as dirty
2256 for (unsigned i = si_begin; i < si_end; ++i) {
2257 shards[i].loaded = true;
2258 shards[i].dirty = true;
2259 }
2260 }
2261 dout(20) << __func__ << " fin " << sv << dendl;
2262 inline_bl.clear();
2263
2264 if (sv.empty()) {
2265 // no more shards; unspan all previously spanning blobs
2266 auto p = spanning_blob_map.begin();
2267 while (p != spanning_blob_map.end()) {
2268 p->second->id = -1;
2269 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2270 p = spanning_blob_map.erase(p);
2271 }
2272 } else {
2273 // identify new spanning blobs
2274 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2275 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2276 if (spanning_scan_begin < needs_reshard_begin) {
2277 fault_range(db, spanning_scan_begin,
2278 needs_reshard_begin - spanning_scan_begin);
2279 }
2280 if (spanning_scan_end > needs_reshard_end) {
2281 fault_range(db, needs_reshard_end,
2282 spanning_scan_end - needs_reshard_end);
2283 }
2284 auto sp = sv.begin() + si_begin;
2285 auto esp = sv.end();
2286 unsigned shard_start = sp->offset;
2287 unsigned shard_end;
2288 ++sp;
2289 if (sp == esp) {
2290 shard_end = OBJECT_MAX_SIZE;
2291 } else {
2292 shard_end = sp->offset;
2293 }
2294 Extent dummy(needs_reshard_begin);
2295 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2296 if (e->logical_offset >= needs_reshard_end) {
2297 break;
2298 }
2299 dout(30) << " extent " << *e << dendl;
2300 while (e->logical_offset >= shard_end) {
2301 shard_start = shard_end;
2302 assert(sp != esp);
2303 ++sp;
2304 if (sp == esp) {
2305 shard_end = OBJECT_MAX_SIZE;
2306 } else {
2307 shard_end = sp->offset;
2308 }
2309 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2310 << " to 0x" << shard_end << std::dec << dendl;
2311 }
2312 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2313 if (!e->blob->is_spanning()) {
2314 // We have two options: (1) split the blob into pieces at the
2315 // shard boundaries (and adjust extents accordingly), or (2)
2316 // mark it spanning. We prefer to cut the blob if we can. Note that
2317 // we may have to split it multiple times--potentially at every
2318 // shard boundary.
2319 bool must_span = false;
2320 BlobRef b = e->blob;
2321 if (b->can_split()) {
2322 uint32_t bstart = e->blob_start();
2323 uint32_t bend = e->blob_end();
2324 for (const auto& sh : shards) {
2325 if (bstart < sh.shard_info->offset &&
2326 bend > sh.shard_info->offset) {
2327 uint32_t blob_offset = sh.shard_info->offset - bstart;
2328 if (b->can_split_at(blob_offset)) {
2329 dout(20) << __func__ << " splitting blob, bstart 0x"
2330 << std::hex << bstart << " blob_offset 0x"
2331 << blob_offset << std::dec << " " << *b << dendl;
2332 b = split_blob(b, blob_offset, sh.shard_info->offset);
2333 // switch b to the new right-hand side, in case it
2334 // *also* has to get split.
2335 bstart += blob_offset;
2336 onode->c->store->logger->inc(l_bluestore_blob_split);
2337 } else {
2338 must_span = true;
2339 break;
2340 }
2341 }
2342 }
2343 } else {
2344 must_span = true;
2345 }
2346 if (must_span) {
2347 auto bid = allocate_spanning_blob_id();
2348 b->id = bid;
2349 spanning_blob_map[b->id] = b;
2350 dout(20) << __func__ << " adding spanning " << *b << dendl;
2351 }
2352 }
2353 } else {
2354 if (e->blob->is_spanning()) {
2355 spanning_blob_map.erase(e->blob->id);
2356 e->blob->id = -1;
2357 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2358 }
2359 }
2360 }
2361 }
2362
2363 clear_needs_reshard();
2364 }
2365
2366 bool BlueStore::ExtentMap::encode_some(
2367 uint32_t offset,
2368 uint32_t length,
2369 bufferlist& bl,
2370 unsigned *pn)
2371 {
2372 auto cct = onode->c->store->cct; //used by dout
2373 Extent dummy(offset);
2374 auto start = extent_map.lower_bound(dummy);
2375 uint32_t end = offset + length;
2376
2377 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2378 // serialization only. Hence there is no specific
2379 // handling at ExtentMap level.
2380
2381 unsigned n = 0;
2382 size_t bound = 0;
2383 bool must_reshard = false;
2384 for (auto p = start;
2385 p != extent_map.end() && p->logical_offset < end;
2386 ++p, ++n) {
2387 assert(p->logical_offset >= offset);
2388 p->blob->last_encoded_id = -1;
2389 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2390 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2391 << std::dec << " hit new spanning blob " << *p << dendl;
2392 request_reshard(p->blob_start(), p->blob_end());
2393 must_reshard = true;
2394 }
2395 if (!must_reshard) {
2396 denc_varint(0, bound); // blobid
2397 denc_varint(0, bound); // logical_offset
2398 denc_varint(0, bound); // len
2399 denc_varint(0, bound); // blob_offset
2400
2401 p->blob->bound_encode(
2402 bound,
2403 struct_v,
2404 p->blob->shared_blob->get_sbid(),
2405 false);
2406 }
2407 }
2408 if (must_reshard) {
2409 return true;
2410 }
2411
2412 denc(struct_v, bound);
2413 denc_varint(0, bound); // number of extents
2414
2415 {
2416 auto app = bl.get_contiguous_appender(bound);
2417 denc(struct_v, app);
2418 denc_varint(n, app);
2419 if (pn) {
2420 *pn = n;
2421 }
2422
2423 n = 0;
2424 uint64_t pos = 0;
2425 uint64_t prev_len = 0;
2426 for (auto p = start;
2427 p != extent_map.end() && p->logical_offset < end;
2428 ++p, ++n) {
2429 unsigned blobid;
2430 bool include_blob = false;
2431 if (p->blob->is_spanning()) {
2432 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2433 blobid |= BLOBID_FLAG_SPANNING;
2434 } else if (p->blob->last_encoded_id < 0) {
2435 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2436 include_blob = true;
2437 blobid = 0; // the decoder will infer the id from n
2438 } else {
2439 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2440 }
2441 if (p->logical_offset == pos) {
2442 blobid |= BLOBID_FLAG_CONTIGUOUS;
2443 }
2444 if (p->blob_offset == 0) {
2445 blobid |= BLOBID_FLAG_ZEROOFFSET;
2446 }
2447 if (p->length == prev_len) {
2448 blobid |= BLOBID_FLAG_SAMELENGTH;
2449 } else {
2450 prev_len = p->length;
2451 }
2452 denc_varint(blobid, app);
2453 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2454 denc_varint_lowz(p->logical_offset - pos, app);
2455 }
2456 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2457 denc_varint_lowz(p->blob_offset, app);
2458 }
2459 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2460 denc_varint_lowz(p->length, app);
2461 }
2462 pos = p->logical_end();
2463 if (include_blob) {
2464 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2465 }
2466 }
2467 }
2468 /*derr << __func__ << bl << dendl;
2469 derr << __func__ << ":";
2470 bl.hexdump(*_dout);
2471 *_dout << dendl;
2472 */
2473 return false;
2474 }
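  // Encoding note (summary of the format produced above): every extent begins
  // with a varint blobid whose low BLOBID_SHIFT_BITS bits are flags.  An
  // extent that starts where the previous one ended (CONTIGUOUS), has
  // blob_offset 0 (ZEROOFFSET), repeats the previous length (SAMELENGTH) and
  // refers to a blob already emitted in this shard therefore costs a single
  // varint; a first reference to a local blob sets the id bits to zero and is
  // followed by the encoded blob itself.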
2475
2476 unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2477 {
2478 auto cct = onode->c->store->cct; //used by dout
2479 /*
2480 derr << __func__ << ":";
2481 bl.hexdump(*_dout);
2482 *_dout << dendl;
2483 */
2484
2485 assert(bl.get_num_buffers() <= 1);
2486 auto p = bl.front().begin_deep();
2487 __u8 struct_v;
2488 denc(struct_v, p);
2489 // Version 2 differs from v1 in blob's ref_map
2490 // serialization only. Hence there is no specific
2491 // handling at ExtentMap level below.
2492 assert(struct_v == 1 || struct_v == 2);
2493
2494 uint32_t num;
2495 denc_varint(num, p);
2496 vector<BlobRef> blobs(num);
2497 uint64_t pos = 0;
2498 uint64_t prev_len = 0;
2499 unsigned n = 0;
2500
2501 while (!p.end()) {
2502 Extent *le = new Extent();
2503 uint64_t blobid;
2504 denc_varint(blobid, p);
2505 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2506 uint64_t gap;
2507 denc_varint_lowz(gap, p);
2508 pos += gap;
2509 }
2510 le->logical_offset = pos;
2511 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2512 denc_varint_lowz(le->blob_offset, p);
2513 } else {
2514 le->blob_offset = 0;
2515 }
2516 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2517 denc_varint_lowz(prev_len, p);
2518 }
2519 le->length = prev_len;
2520
2521 if (blobid & BLOBID_FLAG_SPANNING) {
2522 dout(30) << __func__ << " getting spanning blob "
2523 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2524 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2525 } else {
2526 blobid >>= BLOBID_SHIFT_BITS;
2527 if (blobid) {
2528 le->assign_blob(blobs[blobid - 1]);
2529 assert(le->blob);
2530 } else {
2531 Blob *b = new Blob();
2532 uint64_t sbid = 0;
2533 b->decode(onode->c, p, struct_v, &sbid, false);
2534 blobs[n] = b;
2535 onode->c->open_shared_blob(sbid, b);
2536 le->assign_blob(b);
2537 }
2538 // we build ref_map dynamically for non-spanning blobs
2539 le->blob->get_ref(
2540 onode->c,
2541 le->blob_offset,
2542 le->length);
2543 }
2544 pos += prev_len;
2545 ++n;
2546 extent_map.insert(*le);
2547 }
2548
2549 assert(n == num);
2550 return num;
2551 }
2552
2553 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2554 {
2555 // Version 2 differs from v1 in blob's ref_map
2556 // serialization only. Hence there is no specific
2557 // handling at ExtentMap level.
2558 __u8 struct_v = 2;
2559
2560 denc(struct_v, p);
2561 denc_varint((uint32_t)0, p);
2562 size_t key_size = 0;
2563 denc_varint((uint32_t)0, key_size);
2564 p += spanning_blob_map.size() * key_size;
2565 for (const auto& i : spanning_blob_map) {
2566 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2567 }
2568 }
2569
2570 void BlueStore::ExtentMap::encode_spanning_blobs(
2571 bufferlist::contiguous_appender& p)
2572 {
2573 // Version 2 differs from v1 in blob's ref_map
2574 // serialization only. Hence there is no specific
2575 // handling at ExtentMap level.
2576 __u8 struct_v = 2;
2577
2578 denc(struct_v, p);
2579 denc_varint(spanning_blob_map.size(), p);
2580 for (auto& i : spanning_blob_map) {
2581 denc_varint(i.second->id, p);
2582 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2583 }
2584 }
2585
2586 void BlueStore::ExtentMap::decode_spanning_blobs(
2587 bufferptr::iterator& p)
2588 {
2589 __u8 struct_v;
2590 denc(struct_v, p);
2591 // Version 2 differs from v1 in blob's ref_map
2592 // serialization only. Hence there is no specific
2593 // handling at ExtentMap level.
2594 assert(struct_v == 1 || struct_v == 2);
2595
2596 unsigned n;
2597 denc_varint(n, p);
2598 while (n--) {
2599 BlobRef b(new Blob());
2600 denc_varint(b->id, p);
2601 spanning_blob_map[b->id] = b;
2602 uint64_t sbid = 0;
2603 b->decode(onode->c, p, struct_v, &sbid, true);
2604 onode->c->open_shared_blob(sbid, b);
2605 }
2606 }
2607
2608 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2609 {
2610 shards.resize(onode->onode.extent_map_shards.size());
2611 unsigned i = 0;
2612 for (auto &s : onode->onode.extent_map_shards) {
2613 shards[i].shard_info = &s;
2614 shards[i].loaded = loaded;
2615 shards[i].dirty = dirty;
2616 ++i;
2617 }
2618 }
2619
2620 void BlueStore::ExtentMap::fault_range(
2621 KeyValueDB *db,
2622 uint32_t offset,
2623 uint32_t length)
2624 {
2625 auto cct = onode->c->store->cct; //used by dout
2626 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2627 << std::dec << dendl;
2628 auto start = seek_shard(offset);
2629 auto last = seek_shard(offset + length);
2630
2631 if (start < 0)
2632 return;
2633
2634 assert(last >= start);
2635 string key;
2636 while (start <= last) {
2637 assert((size_t)start < shards.size());
2638 auto p = &shards[start];
2639 if (!p->loaded) {
2640 dout(30) << __func__ << " opening shard 0x" << std::hex
2641 << p->shard_info->offset << std::dec << dendl;
2642 bufferlist v;
2643 generate_extent_shard_key_and_apply(
2644 onode->key, p->shard_info->offset, &key,
2645 [&](const string& final_key) {
2646 int r = db->get(PREFIX_OBJ, final_key, &v);
2647 if (r < 0) {
2648 derr << __func__ << " missing shard 0x" << std::hex
2649 << p->shard_info->offset << std::dec << " for " << onode->oid
2650 << dendl;
2651 assert(r >= 0);
2652 }
2653 }
2654 );
2655 p->extents = decode_some(v);
2656 p->loaded = true;
2657 dout(20) << __func__ << " open shard 0x" << std::hex
2658 << p->shard_info->offset << std::dec
2659 << " (" << v.length() << " bytes)" << dendl;
2660 assert(p->dirty == false);
2661 assert(v.length() == p->shard_info->bytes);
2662 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
2663 } else {
2664 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
2665 }
2666 ++start;
2667 }
2668 }
2669
2670 void BlueStore::ExtentMap::dirty_range(
2671 uint32_t offset,
2672 uint32_t length)
2673 {
2674 auto cct = onode->c->store->cct; //used by dout
2675 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2676 << std::dec << dendl;
2677 if (shards.empty()) {
2678 dout(20) << __func__ << " mark inline shard dirty" << dendl;
2679 inline_bl.clear();
2680 return;
2681 }
2682 auto start = seek_shard(offset);
2683 auto last = seek_shard(offset + length);
2684 if (start < 0)
2685 return;
2686
2687 assert(last >= start);
2688 while (start <= last) {
2689 assert((size_t)start < shards.size());
2690 auto p = &shards[start];
2691 if (!p->loaded) {
2692 dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
2693 << std::dec << " is not loaded, can't mark dirty" << dendl;
2694 assert(0 == "can't mark unloaded shard dirty");
2695 }
2696 if (!p->dirty) {
2697 dout(20) << __func__ << " mark shard 0x" << std::hex
2698 << p->shard_info->offset << std::dec << " dirty" << dendl;
2699 p->dirty = true;
2700 }
2701 ++start;
2702 }
2703 }
2704
2705 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
2706 uint64_t offset)
2707 {
2708 Extent dummy(offset);
2709 return extent_map.find(dummy);
2710 }
2711
2712 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
2713 uint64_t offset)
2714 {
2715 Extent dummy(offset);
2716 auto fp = extent_map.lower_bound(dummy);
2717 if (fp != extent_map.begin()) {
2718 --fp;
2719 if (fp->logical_end() <= offset) {
2720 ++fp;
2721 }
2722 }
2723 return fp;
2724 }
2725
2726 BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
2727 uint64_t offset) const
2728 {
2729 Extent dummy(offset);
2730 auto fp = extent_map.lower_bound(dummy);
2731 if (fp != extent_map.begin()) {
2732 --fp;
2733 if (fp->logical_end() <= offset) {
2734 ++fp;
2735 }
2736 }
2737 return fp;
2738 }
2739
2740 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
2741 {
2742 auto fp = seek_lextent(offset);
2743 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
2744 return false;
2745 }
2746 return true;
2747 }
2748
2749 int BlueStore::ExtentMap::compress_extent_map(
2750 uint64_t offset,
2751 uint64_t length)
2752 {
2753 auto cct = onode->c->store->cct; //used by dout
2754 if (extent_map.empty())
2755 return 0;
2756 int removed = 0;
2757 auto p = seek_lextent(offset);
2758 if (p != extent_map.begin()) {
2759 --p; // start to the left of offset
2760 }
2761 // the caller should have just written to this region
2762 assert(p != extent_map.end());
2763
2764 // identify the *next* shard
2765 auto pshard = shards.begin();
2766 while (pshard != shards.end() &&
2767 p->logical_offset >= pshard->shard_info->offset) {
2768 ++pshard;
2769 }
2770 uint64_t shard_end;
2771 if (pshard != shards.end()) {
2772 shard_end = pshard->shard_info->offset;
2773 } else {
2774 shard_end = OBJECT_MAX_SIZE;
2775 }
2776
2777 auto n = p;
2778 for (++n; n != extent_map.end(); p = n++) {
2779 if (n->logical_offset > offset + length) {
2780 break; // stop after end
2781 }
2782 while (n != extent_map.end() &&
2783 p->logical_end() == n->logical_offset &&
2784 p->blob == n->blob &&
2785 p->blob_offset + p->length == n->blob_offset &&
2786 n->logical_offset < shard_end) {
2787 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2788 << " next shard 0x" << shard_end << std::dec
2789 << " merging " << *p << " and " << *n << dendl;
2790 p->length += n->length;
2791 rm(n++);
2792 ++removed;
2793 }
2794 if (n == extent_map.end()) {
2795 break;
2796 }
2797 if (n->logical_offset >= shard_end) {
2798 assert(pshard != shards.end());
2799 ++pshard;
2800 if (pshard != shards.end()) {
2801 shard_end = pshard->shard_info->offset;
2802 } else {
2803 shard_end = OBJECT_MAX_SIZE;
2804 }
2805 }
2806 }
2807 if (removed && onode) {
2808 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
2809 }
2810 return removed;
2811 }
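  // Note: two logical extents are merged only when they are logically
  // adjacent, reference the same blob, map to contiguous blob offsets, and
  // the merge would not cross a shard boundary; each removal is counted in
  // l_bluestore_extent_compress.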
2812
2813 void BlueStore::ExtentMap::punch_hole(
2814 CollectionRef &c,
2815 uint64_t offset,
2816 uint64_t length,
2817 old_extent_map_t *old_extents)
2818 {
2819 auto p = seek_lextent(offset);
2820 uint64_t end = offset + length;
2821 while (p != extent_map.end()) {
2822 if (p->logical_offset >= end) {
2823 break;
2824 }
2825 if (p->logical_offset < offset) {
2826 if (p->logical_end() > end) {
2827 // split and deref middle
2828 uint64_t front = offset - p->logical_offset;
2829 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
2830 length, p->blob);
2831 old_extents->push_back(*oe);
2832 add(end,
2833 p->blob_offset + front + length,
2834 p->length - front - length,
2835 p->blob);
2836 p->length = front;
2837 break;
2838 } else {
2839 // deref tail
2840 assert(p->logical_end() > offset); // else seek_lextent bug
2841 uint64_t keep = offset - p->logical_offset;
2842 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
2843 p->length - keep, p->blob);
2844 old_extents->push_back(*oe);
2845 p->length = keep;
2846 ++p;
2847 continue;
2848 }
2849 }
2850 if (p->logical_offset + p->length <= end) {
2851 // deref whole lextent
2852 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2853 p->length, p->blob);
2854 old_extents->push_back(*oe);
2855 rm(p++);
2856 continue;
2857 }
2858 // deref head
2859 uint64_t keep = p->logical_end() - end;
2860 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
2861 p->length - keep, p->blob);
2862 old_extents->push_back(*oe);
2863
2864 add(end, p->blob_offset + p->length - keep, keep, p->blob);
2865 rm(p);
2866 break;
2867 }
2868 }
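  // Summary of the cases handled above (descriptive only): an extent that
  // straddles the whole hole is split, keeping a head and re-adding a tail
  // while the middle is dereferenced; an extent that only overlaps the start
  // of the hole is trimmed to its head; an extent fully inside the hole is
  // dereferenced and removed; an extent overlapping the end of the hole has
  // its head dereferenced and its tail re-added at the hole's end.  All
  // dereferenced pieces are queued on old_extents for later cleanup.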
2869
2870 BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
2871 CollectionRef &c,
2872 uint64_t logical_offset,
2873 uint64_t blob_offset, uint64_t length, BlobRef b,
2874 old_extent_map_t *old_extents)
2875 {
2876 // We need a completely initialized Blob to increment its ref counters.
2877 assert(b->get_blob().get_logical_length() != 0);
2878
2879 // Do get_ref prior to punch_hole to prevent putting a reused blob into
2880 // the old_extents list if we overwrite the blob completely.
2881 // This might happen during WAL overwrite.
2882 b->get_ref(onode->c, blob_offset, length);
2883
2884 if (old_extents) {
2885 punch_hole(c, logical_offset, length, old_extents);
2886 }
2887
2888 Extent *le = new Extent(logical_offset, blob_offset, length, b);
2889 extent_map.insert(*le);
2890 if (spans_shard(logical_offset, length)) {
2891 request_reshard(logical_offset, logical_offset + length);
2892 }
2893 return le;
2894 }
2895
2896 BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
2897 BlobRef lb,
2898 uint32_t blob_offset,
2899 uint32_t pos)
2900 {
2901 auto cct = onode->c->store->cct; //used by dout
2902
2903 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
2904 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
2905 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
2906 << dendl;
2907 BlobRef rb = onode->c->new_blob();
2908 lb->split(onode->c, blob_offset, rb.get());
2909
2910 for (auto ep = seek_lextent(pos);
2911 ep != extent_map.end() && ep->logical_offset < end_pos;
2912 ++ep) {
2913 if (ep->blob != lb) {
2914 continue;
2915 }
2916 if (ep->logical_offset < pos) {
2917 // split extent
2918 size_t left = pos - ep->logical_offset;
2919 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
2920 extent_map.insert(*ne);
2921 ep->length = left;
2922 dout(30) << __func__ << " split " << *ep << dendl;
2923 dout(30) << __func__ << " to " << *ne << dendl;
2924 } else {
2925 // switch blob
2926 assert(ep->blob_offset >= blob_offset);
2927
2928 ep->blob = rb;
2929 ep->blob_offset -= blob_offset;
2930 dout(30) << __func__ << " adjusted " << *ep << dendl;
2931 }
2932 }
2933 return rb;
2934 }
2935
2936 // Onode
2937
2938 #undef dout_prefix
2939 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
2940
2941 void BlueStore::Onode::flush()
2942 {
2943 if (flushing_count.load()) {
2944 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
2945 std::unique_lock<std::mutex> l(flush_lock);
2946 while (flushing_count.load()) {
2947 flush_cond.wait(l);
2948 }
2949 }
2950 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
2951 }
2952
2953 // =======================================================
2954 // WriteContext
2955
2956 /// Checks for writes to the same pextent within a blob
2957 bool BlueStore::WriteContext::has_conflict(
2958 BlobRef b,
2959 uint64_t loffs,
2960 uint64_t loffs_end,
2961 uint64_t min_alloc_size)
2962 {
2963 assert((loffs % min_alloc_size) == 0);
2964 assert((loffs_end % min_alloc_size) == 0);
2965 for (auto w : writes) {
2966 if (b == w.b) {
2967 auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
2968 auto loffs2_end = P2ROUNDUP(w.logical_offset + w.length0, min_alloc_size);
2969 if ((loffs <= loffs2 && loffs_end > loffs2) ||
2970 (loffs >= loffs2 && loffs < loffs2_end)) {
2971 return true;
2972 }
2973 }
2974 }
2975 return false;
2976 }
2977
2978 // =======================================================
2979
2980 // DeferredBatch
2981 #undef dout_prefix
2982 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
2983
2984 void BlueStore::DeferredBatch::prepare_write(
2985 CephContext *cct,
2986 uint64_t seq, uint64_t offset, uint64_t length,
2987 bufferlist::const_iterator& blp)
2988 {
2989 _discard(cct, offset, length);
2990 auto i = iomap.insert(make_pair(offset, deferred_io()));
2991 assert(i.second); // this should be a new insertion
2992 i.first->second.seq = seq;
2993 blp.copy(length, i.first->second.bl);
2994 i.first->second.bl.reassign_to_mempool(
2995 mempool::mempool_bluestore_writing_deferred);
2996 dout(20) << __func__ << " seq " << seq
2997 << " 0x" << std::hex << offset << "~" << length
2998 << " crc " << i.first->second.bl.crc32c(-1)
2999 << std::dec << dendl;
3000 seq_bytes[seq] += length;
3001 #ifdef DEBUG_DEFERRED
3002 _audit(cct);
3003 #endif
3004 }
3005
3006 void BlueStore::DeferredBatch::_discard(
3007 CephContext *cct, uint64_t offset, uint64_t length)
3008 {
3009 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3010 << std::dec << dendl;
3011 auto p = iomap.lower_bound(offset);
3012 if (p != iomap.begin()) {
3013 --p;
3014 auto end = p->first + p->second.bl.length();
3015 if (end > offset) {
3016 bufferlist head;
3017 head.substr_of(p->second.bl, 0, offset - p->first);
3018 dout(20) << __func__ << " keep head " << p->second.seq
3019 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3020 << " -> 0x" << head.length() << std::dec << dendl;
3021 auto i = seq_bytes.find(p->second.seq);
3022 assert(i != seq_bytes.end());
3023 if (end > offset + length) {
3024 bufferlist tail;
3025 tail.substr_of(p->second.bl, offset + length - p->first,
3026 end - (offset + length));
3027 dout(20) << __func__ << " keep tail " << p->second.seq
3028 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3029 << " -> 0x" << tail.length() << std::dec << dendl;
3030 auto &n = iomap[offset + length];
3031 n.bl.swap(tail);
3032 n.seq = p->second.seq;
3033 i->second -= length;
3034 } else {
3035 i->second -= end - offset;
3036 }
3037 assert(i->second >= 0);
3038 p->second.bl.swap(head);
3039 }
3040 ++p;
3041 }
3042 while (p != iomap.end()) {
3043 if (p->first >= offset + length) {
3044 break;
3045 }
3046 auto i = seq_bytes.find(p->second.seq);
3047 assert(i != seq_bytes.end());
3048 auto end = p->first + p->second.bl.length();
3049 if (end > offset + length) {
3050 unsigned drop_front = offset + length - p->first;
3051 unsigned keep_tail = end - (offset + length);
3052 dout(20) << __func__ << " truncate front " << p->second.seq
3053 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3054 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3055 << " to 0x" << (offset + length) << "~" << keep_tail
3056 << std::dec << dendl;
3057 auto &s = iomap[offset + length];
3058 s.seq = p->second.seq;
3059 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3060 i->second -= drop_front;
3061 } else {
3062 dout(20) << __func__ << " drop " << p->second.seq
3063 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3064 << std::dec << dendl;
3065 i->second -= p->second.bl.length();
3066 }
3067 assert(i->second >= 0);
3068 p = iomap.erase(p);
3069 }
3070 }
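  // Example (illustrative): given an existing deferred write at 0x0~0x3000
  // and a new write covering 0x1000~0x1000, _discard keeps the head
  // 0x0~0x1000 in place, re-inserts the tail as a new entry at 0x2000~0x1000
  // under the same seq, and reduces that seq's byte count by 0x1000; the new
  // write is then inserted by prepare_write without overlap.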
3071
3072 void BlueStore::DeferredBatch::_audit(CephContext *cct)
3073 {
3074 map<uint64_t,int> sb;
3075 for (auto p : seq_bytes) {
3076 sb[p.first] = 0; // make sure we have the same set of keys
3077 }
3078 uint64_t pos = 0;
3079 for (auto& p : iomap) {
3080 assert(p.first >= pos);
3081 sb[p.second.seq] += p.second.bl.length();
3082 pos = p.first + p.second.bl.length();
3083 }
3084 assert(sb == seq_bytes);
3085 }
3086
3087
3088 // Collection
3089
3090 #undef dout_prefix
3091 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3092
3093 BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
3094 : store(ns),
3095 cache(c),
3096 cid(cid),
3097 lock("BlueStore::Collection::lock", true, false),
3098 exists(true),
3099 onode_map(c)
3100 {
3101 }
3102
3103 void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3104 {
3105 assert(!b->shared_blob);
3106 const bluestore_blob_t& blob = b->get_blob();
3107 if (!blob.is_shared()) {
3108 b->shared_blob = new SharedBlob(this);
3109 return;
3110 }
3111
3112 b->shared_blob = shared_blob_set.lookup(sbid);
3113 if (b->shared_blob) {
3114 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3115 << std::dec << " had " << *b->shared_blob << dendl;
3116 } else {
3117 b->shared_blob = new SharedBlob(sbid, this);
3118 shared_blob_set.add(this, b->shared_blob.get());
3119 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3120 << std::dec << " opened " << *b->shared_blob
3121 << dendl;
3122 }
3123 }
3124
3125 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3126 {
3127 if (!sb->is_loaded()) {
3128
3129 bufferlist v;
3130 string key;
3131 auto sbid = sb->get_sbid();
3132 get_shared_blob_key(sbid, &key);
3133 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3134 if (r < 0) {
3135 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3136 << std::dec << " not found at key "
3137 << pretty_binary_string(key) << dendl;
3138 assert(0 == "uh oh, missing shared_blob");
3139 }
3140
3141 sb->loaded = true;
3142 sb->persistent = new bluestore_shared_blob_t(sbid);
3143 bufferlist::iterator p = v.begin();
3144 ::decode(*(sb->persistent), p);
3145 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3146 << std::dec << " loaded shared_blob " << *sb << dendl;
3147 }
3148 }
3149
3150 void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3151 {
3152 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
3153 assert(!b->shared_blob->is_loaded());
3154
3155 // update blob
3156 bluestore_blob_t& blob = b->dirty_blob();
3157 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
3158
3159 // update shared blob
3160 b->shared_blob->loaded = true;
3161 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3162 shared_blob_set.add(this, b->shared_blob.get());
3163 for (auto p : blob.get_extents()) {
3164 if (p.is_valid()) {
3165 b->shared_blob->get_ref(
3166 p.offset,
3167 p.length);
3168 }
3169 }
3170 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3171 }
3172
3173 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3174 {
3175 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
3176 assert(sb->is_loaded());
3177
3178 uint64_t sbid = sb->get_sbid();
3179 shared_blob_set.remove(sb);
3180 sb->loaded = false;
3181 delete sb->persistent;
3182 sb->sbid_unloaded = 0;
3183 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3184 return sbid;
3185 }
3186
3187 BlueStore::OnodeRef BlueStore::Collection::get_onode(
3188 const ghobject_t& oid,
3189 bool create)
3190 {
3191 assert(create ? lock.is_wlocked() : lock.is_locked());
3192
3193 spg_t pgid;
3194 if (cid.is_pg(&pgid)) {
3195 if (!oid.match(cnode.bits, pgid.ps())) {
3196 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3197 << pgid << " bits " << cnode.bits << dendl;
3198 ceph_abort();
3199 }
3200 }
3201
3202 OnodeRef o = onode_map.lookup(oid);
3203 if (o)
3204 return o;
3205
3206 mempool::bluestore_cache_other::string key;
3207 get_object_key(store->cct, oid, &key);
3208
3209 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3210 << pretty_binary_string(key) << dendl;
3211
3212 bufferlist v;
3213 int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3214 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3215 Onode *on;
3216 if (v.length() == 0) {
3217 assert(r == -ENOENT);
3218 if (!store->cct->_conf->bluestore_debug_misc &&
3219 !create)
3220 return OnodeRef();
3221
3222 // new object, new onode
3223 on = new Onode(this, oid, key);
3224 } else {
3225 // loaded
3226 assert(r >= 0);
3227 on = new Onode(this, oid, key);
3228 on->exists = true;
3229 bufferptr::iterator p = v.front().begin_deep();
3230 on->onode.decode(p);
3231
3232 // initialize extent_map
3233 on->extent_map.decode_spanning_blobs(p);
3234 if (on->onode.extent_map_shards.empty()) {
3235 denc(on->extent_map.inline_bl, p);
3236 on->extent_map.decode_some(on->extent_map.inline_bl);
3237 } else {
3238 on->extent_map.init_shards(false, false);
3239 }
3240 }
3241 o.reset(on);
3242 return onode_map.add(oid, o);
3243 }
3244
3245 void BlueStore::Collection::split_cache(
3246 Collection *dest)
3247 {
3248 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3249
3250 // lock (one or both) cache shards
3251 std::lock(cache->lock, dest->cache->lock);
3252 std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
3253 std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);
3254
3255 int destbits = dest->cnode.bits;
3256 spg_t destpg;
3257 bool is_pg = dest->cid.is_pg(&destpg);
3258 assert(is_pg);
3259
3260 auto p = onode_map.onode_map.begin();
3261 while (p != onode_map.onode_map.end()) {
3262 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3263 // onode does not belong to this child
3264 ++p;
3265 } else {
3266 OnodeRef o = p->second;
3267 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3268 << dendl;
3269
3270 cache->_rm_onode(p->second);
3271 p = onode_map.onode_map.erase(p);
3272
3273 o->c = dest;
3274 dest->cache->_add_onode(o, 1);
3275 dest->onode_map.onode_map[o->oid] = o;
3276 dest->onode_map.cache = dest->cache;
3277
3278 // move over shared blobs and buffers. cover shared blobs from
3279 // both extent map and spanning blob map (the full extent map
3280 // may not be faulted in)
3281 vector<SharedBlob*> sbvec;
3282 for (auto& e : o->extent_map.extent_map) {
3283 sbvec.push_back(e.blob->shared_blob.get());
3284 }
3285 for (auto& b : o->extent_map.spanning_blob_map) {
3286 sbvec.push_back(b.second->shared_blob.get());
3287 }
3288 for (auto sb : sbvec) {
3289 if (sb->coll == dest) {
3290 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3291 << dendl;
3292 continue;
3293 }
3294 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
3295 sb->coll = dest;
3296 if (sb->get_sbid()) {
3297 ldout(store->cct, 20) << __func__
3298 << " moving registration " << *sb << dendl;
3299 shared_blob_set.remove(sb);
3300 dest->shared_blob_set.add(dest, sb);
3301 }
3302 if (dest->cache != cache) {
3303 for (auto& i : sb->bc.buffer_map) {
3304 if (!i.second->is_writing()) {
3305 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3306 << dendl;
3307 dest->cache->_move_buffer(cache, i.second.get());
3308 }
3309 }
3310 }
3311 }
3312 }
3313 }
3314 }
3315
3316 // =======================================================
3317
3318 void *BlueStore::MempoolThread::entry()
3319 {
3320 Mutex::Locker l(lock);
3321 while (!stop) {
3322 uint64_t meta_bytes =
3323 mempool::bluestore_cache_other::allocated_bytes() +
3324 mempool::bluestore_cache_onode::allocated_bytes();
3325 uint64_t onode_num =
3326 mempool::bluestore_cache_onode::allocated_items();
3327
3328 if (onode_num < 2) {
3329 onode_num = 2;
3330 }
3331
3332 float bytes_per_onode = (float)meta_bytes / (float)onode_num;
3333 size_t num_shards = store->cache_shards.size();
3334 float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
3335 // A little sloppy but should be close enough
3336 uint64_t shard_target = target_ratio * (store->cache_size / num_shards);
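  // Each cache shard is trimmed toward an equal slice of the metadata plus
  // data share of the global cache size; the kv share (cache_kv_ratio) is
  // excluded from target_ratio and is not managed by this thread.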
3337
3338 for (auto i : store->cache_shards) {
3339 i->trim(shard_target,
3340 store->cache_meta_ratio,
3341 store->cache_data_ratio,
3342 bytes_per_onode);
3343 }
3344
3345 store->_update_cache_logger();
3346
3347 utime_t wait;
3348 wait += store->cct->_conf->bluestore_cache_trim_interval;
3349 cond.WaitInterval(lock, wait);
3350 }
3351 stop = false;
3352 return NULL;
3353 }
3354
3355 // =======================================================
3356
3357 // OmapIteratorImpl
3358
3359 #undef dout_prefix
3360 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3361
3362 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3363 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3364 : c(c), o(o), it(it)
3365 {
3366 RWLock::RLocker l(c->lock);
3367 if (o->onode.has_omap()) {
3368 get_omap_key(o->onode.nid, string(), &head);
3369 get_omap_tail(o->onode.nid, &tail);
3370 it->lower_bound(head);
3371 }
3372 }
3373
3374 int BlueStore::OmapIteratorImpl::seek_to_first()
3375 {
3376 RWLock::RLocker l(c->lock);
3377 if (o->onode.has_omap()) {
3378 it->lower_bound(head);
3379 } else {
3380 it = KeyValueDB::Iterator();
3381 }
3382 return 0;
3383 }
3384
3385 int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
3386 {
3387 RWLock::RLocker l(c->lock);
3388 if (o->onode.has_omap()) {
3389 string key;
3390 get_omap_key(o->onode.nid, after, &key);
3391 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
3392 << pretty_binary_string(key) << dendl;
3393 it->upper_bound(key);
3394 } else {
3395 it = KeyValueDB::Iterator();
3396 }
3397 return 0;
3398 }
3399
3400 int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
3401 {
3402 RWLock::RLocker l(c->lock);
3403 if (o->onode.has_omap()) {
3404 string key;
3405 get_omap_key(o->onode.nid, to, &key);
3406 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
3407 << pretty_binary_string(key) << dendl;
3408 it->lower_bound(key);
3409 } else {
3410 it = KeyValueDB::Iterator();
3411 }
3412 return 0;
3413 }
3414
3415 bool BlueStore::OmapIteratorImpl::valid()
3416 {
3417 RWLock::RLocker l(c->lock);
3418 bool r = o->onode.has_omap() && it && it->valid() &&
3419 it->raw_key().second <= tail;
3420 if (it && it->valid()) {
3421 ldout(c->store->cct,20) << __func__ << " is at "
3422 << pretty_binary_string(it->raw_key().second)
3423 << dendl;
3424 }
3425 return r;
3426 }
3427
3428 int BlueStore::OmapIteratorImpl::next(bool validate)
3429 {
3430 RWLock::RLocker l(c->lock);
3431 if (o->onode.has_omap()) {
3432 it->next();
3433 return 0;
3434 } else {
3435 return -1;
3436 }
3437 }
3438
3439 string BlueStore::OmapIteratorImpl::key()
3440 {
3441 RWLock::RLocker l(c->lock);
3442 assert(it->valid());
3443 string db_key = it->raw_key().second;
3444 string user_key;
3445 decode_omap_key(db_key, &user_key);
3446 return user_key;
3447 }
3448
3449 bufferlist BlueStore::OmapIteratorImpl::value()
3450 {
3451 RWLock::RLocker l(c->lock);
3452 assert(it->valid());
3453 return it->value();
3454 }
3455
3456
3457 // =====================================
3458
3459 #undef dout_prefix
3460 #define dout_prefix *_dout << "bluestore(" << path << ") "
3461
3462
3463 static void aio_cb(void *priv, void *priv2)
3464 {
3465 BlueStore *store = static_cast<BlueStore*>(priv);
3466 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
3467 c->aio_finish(store);
3468 }
3469
3470 BlueStore::BlueStore(CephContext *cct, const string& path)
3471 : ObjectStore(cct, path),
3472 throttle_bytes(cct, "bluestore_throttle_bytes",
3473 cct->_conf->bluestore_throttle_bytes),
3474 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3475 cct->_conf->bluestore_throttle_bytes +
3476 cct->_conf->bluestore_throttle_deferred_bytes),
3477 deferred_finisher(cct, "defered_finisher", "dfin"),
3478 kv_sync_thread(this),
3479 kv_finalize_thread(this),
3480 mempool_thread(this)
3481 {
3482 _init_logger();
3483 cct->_conf->add_observer(this);
3484 set_cache_shards(1);
3485 }
3486
3487 BlueStore::BlueStore(CephContext *cct,
3488 const string& path,
3489 uint64_t _min_alloc_size)
3490 : ObjectStore(cct, path),
3491 throttle_bytes(cct, "bluestore_throttle_bytes",
3492 cct->_conf->bluestore_throttle_bytes),
3493 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
3494 cct->_conf->bluestore_throttle_bytes +
3495 cct->_conf->bluestore_throttle_deferred_bytes),
3496 deferred_finisher(cct, "defered_finisher", "dfin"),
3497 kv_sync_thread(this),
3498 kv_finalize_thread(this),
3499 min_alloc_size(_min_alloc_size),
3500 min_alloc_size_order(ctz(_min_alloc_size)),
3501 mempool_thread(this)
3502 {
3503 _init_logger();
3504 cct->_conf->add_observer(this);
3505 set_cache_shards(1);
3506 }
3507
3508 BlueStore::~BlueStore()
3509 {
3510 for (auto f : finishers) {
3511 delete f;
3512 }
3513 finishers.clear();
3514
3515 cct->_conf->remove_observer(this);
3516 _shutdown_logger();
3517 assert(!mounted);
3518 assert(db == NULL);
3519 assert(bluefs == NULL);
3520 assert(fsid_fd < 0);
3521 assert(path_fd < 0);
3522 for (auto i : cache_shards) {
3523 delete i;
3524 }
3525 cache_shards.clear();
3526 }
3527
3528 const char **BlueStore::get_tracked_conf_keys() const
3529 {
3530 static const char* KEYS[] = {
3531 "bluestore_csum_type",
3532 "bluestore_compression_mode",
3533 "bluestore_compression_algorithm",
3534 "bluestore_compression_min_blob_size",
3535 "bluestore_compression_min_blob_size_ssd",
3536 "bluestore_compression_min_blob_size_hdd",
3537 "bluestore_compression_max_blob_size",
3538 "bluestore_compression_max_blob_size_ssd",
3539 "bluestore_compression_max_blob_size_hdd",
3540 "bluestore_compression_required_ratio",
3541 "bluestore_max_alloc_size",
3542 "bluestore_prefer_deferred_size",
3543 "bluestore_prefer_deferred_size_hdd",
3544 "bluestore_prefer_deferred_size_ssd",
3545 "bluestore_deferred_batch_ops",
3546 "bluestore_deferred_batch_ops_hdd",
3547 "bluestore_deferred_batch_ops_ssd",
3548 "bluestore_throttle_bytes",
3549 "bluestore_throttle_deferred_bytes",
3550 "bluestore_throttle_cost_per_io_hdd",
3551 "bluestore_throttle_cost_per_io_ssd",
3552 "bluestore_throttle_cost_per_io",
3553 "bluestore_max_blob_size",
3554 "bluestore_max_blob_size_ssd",
3555 "bluestore_max_blob_size_hdd",
3556 NULL
3557 };
3558 return KEYS;
3559 }
3560
3561 void BlueStore::handle_conf_change(const struct md_config_t *conf,
3562 const std::set<std::string> &changed)
3563 {
3564 if (changed.count("bluestore_csum_type")) {
3565 _set_csum();
3566 }
3567 if (changed.count("bluestore_compression_mode") ||
3568 changed.count("bluestore_compression_algorithm") ||
3569 changed.count("bluestore_compression_min_blob_size") ||
3570 changed.count("bluestore_compression_max_blob_size")) {
3571 if (bdev) {
3572 _set_compression();
3573 }
3574 }
3575 if (changed.count("bluestore_max_blob_size") ||
3576 changed.count("bluestore_max_blob_size_ssd") ||
3577 changed.count("bluestore_max_blob_size_hdd")) {
3578 if (bdev) {
3579 // only after startup
3580 _set_blob_size();
3581 }
3582 }
3583 if (changed.count("bluestore_prefer_deferred_size") ||
3584 changed.count("bluestore_prefer_deferred_size_hdd") ||
3585 changed.count("bluestore_prefer_deferred_size_ssd") ||
3586 changed.count("bluestore_max_alloc_size") ||
3587 changed.count("bluestore_deferred_batch_ops") ||
3588 changed.count("bluestore_deferred_batch_ops_hdd") ||
3589 changed.count("bluestore_deferred_batch_ops_ssd")) {
3590 if (bdev) {
3591 // only after startup
3592 _set_alloc_sizes();
3593 }
3594 }
3595 if (changed.count("bluestore_throttle_cost_per_io") ||
3596 changed.count("bluestore_throttle_cost_per_io_hdd") ||
3597 changed.count("bluestore_throttle_cost_per_io_ssd")) {
3598 if (bdev) {
3599 _set_throttle_params();
3600 }
3601 }
3602 if (changed.count("bluestore_throttle_bytes")) {
3603 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
3604 throttle_deferred_bytes.reset_max(
3605 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3606 }
3607 if (changed.count("bluestore_throttle_deferred_bytes")) {
3608 throttle_deferred_bytes.reset_max(
3609 conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
3610 }
3611 }
3612
3613 void BlueStore::_set_compression()
3614 {
3615 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
3616 if (m) {
3617 comp_mode = *m;
3618 } else {
3619 derr << __func__ << " unrecognized value '"
3620 << cct->_conf->bluestore_compression_mode
3621 << "' for bluestore_compression_mode, reverting to 'none'"
3622 << dendl;
3623 comp_mode = Compressor::COMP_NONE;
3624 }
3625
3626 compressor = nullptr;
3627
3628 if (comp_mode == Compressor::COMP_NONE) {
3629 dout(10) << __func__ << " compression mode set to 'none', "
3630 << "ignoring other compression settings" << dendl;
3631 return;
3632 }
3633
3634 if (cct->_conf->bluestore_compression_min_blob_size) {
3635 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
3636 } else {
3637 assert(bdev);
3638 if (bdev->is_rotational()) {
3639 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
3640 } else {
3641 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
3642 }
3643 }
3644
3645 if (cct->_conf->bluestore_compression_max_blob_size) {
3646 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
3647 } else {
3648 assert(bdev);
3649 if (bdev->is_rotational()) {
3650 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
3651 } else {
3652 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
3653 }
3654 }
3655
3656 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
3657 if (!alg_name.empty()) {
3658 compressor = Compressor::create(cct, alg_name);
3659 if (!compressor) {
3660 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
3661 << dendl;
3662 }
3663 }
3664
3665 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
3666 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
3667 << dendl;
3668 }
3669
3670 void BlueStore::_set_csum()
3671 {
3672 csum_type = Checksummer::CSUM_NONE;
3673 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
3674 if (t > Checksummer::CSUM_NONE)
3675 csum_type = t;
3676
3677 dout(10) << __func__ << " csum_type "
3678 << Checksummer::get_csum_type_string(csum_type)
3679 << dendl;
3680 }
3681
3682 void BlueStore::_set_throttle_params()
3683 {
3684 if (cct->_conf->bluestore_throttle_cost_per_io) {
3685 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
3686 } else {
3687 assert(bdev);
3688 if (bdev->is_rotational()) {
3689 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
3690 } else {
3691 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
3692 }
3693 }
3694
3695 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
3696 << dendl;
3697 }
3698 void BlueStore::_set_blob_size()
3699 {
3700 if (cct->_conf->bluestore_max_blob_size) {
3701 max_blob_size = cct->_conf->bluestore_max_blob_size;
3702 } else {
3703 assert(bdev);
3704 if (bdev->is_rotational()) {
3705 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
3706 } else {
3707 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
3708 }
3709 }
3710 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
3711 << std::dec << dendl;
3712 }
3713
3714 int BlueStore::_set_cache_sizes()
3715 {
3716 assert(bdev);
3717 if (cct->_conf->bluestore_cache_size) {
3718 cache_size = cct->_conf->bluestore_cache_size;
3719 } else {
3720 // choose global cache size based on backend type
3721 if (bdev->is_rotational()) {
3722 cache_size = cct->_conf->bluestore_cache_size_hdd;
3723 } else {
3724 cache_size = cct->_conf->bluestore_cache_size_ssd;
3725 }
3726 }
3727 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
3728 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
3729
3730 double cache_kv_max = cct->_conf->bluestore_cache_kv_max;
3731 double cache_kv_max_ratio = 0;
3732
3733 // if cache_kv_max is negative, disable it
3734 if (cache_size > 0 && cache_kv_max >= 0) {
3735 cache_kv_max_ratio = (double) cache_kv_max / (double) cache_size;
3736 if (cache_kv_max_ratio < 1.0 && cache_kv_max_ratio < cache_kv_ratio) {
3737 dout(1) << __func__ << " max " << cache_kv_max_ratio
3738 << " < ratio " << cache_kv_ratio
3739 << dendl;
3740 cache_meta_ratio = cache_meta_ratio + cache_kv_ratio - cache_kv_max_ratio;
3741 cache_kv_ratio = cache_kv_max_ratio;
3742 }
3743 }
3744
3745 cache_data_ratio =
3746 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
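  // Worked example (illustrative): with cache_size = 1 GiB, a configured
  // bluestore_cache_kv_max of 512 MiB, cache_kv_ratio = 0.99 and
  // cache_meta_ratio = 0.01, the kv ratio is clamped to 512M/1G = 0.5, the
  // difference (0.49) is shifted to the meta ratio, and cache_data_ratio
  // works out to 1.0 - 0.5 - 0.5 = 0.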
3747
3748 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
3749 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3750 << ") must be in range [0,1.0]" << dendl;
3751 return -EINVAL;
3752 }
3753 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
3754 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
3755 << ") must be in range [0,1.0]" << dendl;
3756 return -EINVAL;
3757 }
3758 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
3759 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
3760 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
3761 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
3762 << dendl;
3763 return -EINVAL;
3764 }
3765 if (cache_data_ratio < 0) {
3766 // deal with floating point imprecision
3767 cache_data_ratio = 0;
3768 }
3769 dout(1) << __func__ << " cache_size " << cache_size
3770 << " meta " << cache_meta_ratio
3771 << " kv " << cache_kv_ratio
3772 << " data " << cache_data_ratio
3773 << dendl;
3774 return 0;
3775 }
3776
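// The counters built below are registered under the "bluestore" logger;
// on a running OSD they should be visible through the admin socket, e.g.
// `ceph daemon osd.<id> perf dump` (usage sketch; the exact output layout
// depends on the release).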
3777 void BlueStore::_init_logger()
3778 {
3779 PerfCountersBuilder b(cct, "bluestore",
3780 l_bluestore_first, l_bluestore_last);
3781 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
3782 "Average kv_thread flush latency",
3783 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
3784 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
3785 "Average kv_thread commit latency");
3786 b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
3787 "Average kv_thread sync latency",
3788 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
3789 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
3790 "Average prepare state latency");
3791 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
3792 "Average aio_wait state latency",
3793 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
3794 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
3795 "Average io_done state latency");
3796 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
3797 "Average kv_queued state latency");
3798 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
3799 "Average kv_commiting state latency");
3800 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
3801 "Average kv_done state latency");
3802 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
3803 "Average deferred_queued state latency");
3804 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
3805 "Average aio_wait state latency");
3806 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
3807 "Average cleanup state latency");
3808 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
3809 "Average finishing state latency");
3810 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
3811 "Average done state latency");
3812 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
3813 "Average submit throttle latency",
3814 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
3815 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
3816 "Average submit latency",
3817 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
3818 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
3819 "Average commit latency",
3820 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
3821 b.add_time_avg(l_bluestore_read_lat, "read_lat",
3822 "Average read latency",
3823 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
3824 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
3825 "Average read onode metadata latency");
3826 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
3827 "Average read latency");
3828 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
3829 "Average compress latency");
3830 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
3831 "Average decompress latency");
3832 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
3833 "Average checksum latency");
3834 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
3835 "Sum for beneficial compress ops");
3836 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
3837 "Sum for compress ops rejected due to low net gain of space");
3838 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
3839 "Sum for write-op padded bytes");
3840 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
3841 "Sum for deferred write op");
3842 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
3843 "Sum for deferred write bytes", "def");
3844 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
3845 "Sum for write penalty read ops");
3846 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
3847 "Sum for allocated bytes");
3848 b.add_u64(l_bluestore_stored, "bluestore_stored",
3849 "Sum for stored bytes");
3850 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
3851 "Sum for stored compressed bytes");
3852 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
3853 "Sum for bytes allocated for compressed data");
3854 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
3855 "Sum for original bytes that were compressed");
3856
3857 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
3858 "Number of onodes in cache");
3859 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
3860 "Sum for onode-lookups hit in the cache");
3861 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
3862 "Sum for onode-lookups missed in the cache");
3863 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
3864 "Sum for onode-shard lookups hit in the cache");
3865 b.add_u64_counter(l_bluestore_onode_shard_misses,
3866 "bluestore_onode_shard_misses",
3867 "Sum for onode-shard lookups missed in the cache");
3868 b.add_u64(l_bluestore_extents, "bluestore_extents",
3869 "Number of extents in cache");
3870 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
3871 "Number of blobs in cache");
3872 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
3873 "Number of buffers in cache");
3874 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
3875 "Number of buffer bytes in cache");
3876 b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
3877 "Sum for bytes of read hit in the cache");
3878 b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
3879 "Sum for bytes of read missed in the cache");
3880
3881 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
3882 "Large aligned writes into fresh blobs");
3883 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
3884 "Large aligned writes into fresh blobs (bytes)");
3885 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
3886 "Large aligned writes into fresh blobs (blobs)");
3887 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
3888 "Small writes into existing or sparse small blobs");
3889 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
3890 "Small writes into existing or sparse small blobs (bytes)");
3891 b.add_u64_counter(l_bluestore_write_small_unused,
3892 "bluestore_write_small_unused",
3893 "Small writes into unused portion of existing blob");
3894 b.add_u64_counter(l_bluestore_write_small_deferred,
3895 "bluestore_write_small_deferred",
3896 "Small overwrites using deferred");
3897 b.add_u64_counter(l_bluestore_write_small_pre_read,
3898 "bluestore_write_small_pre_read",
3899 "Small writes that required we read some data (possibly "
3900 "cached) to fill out the block");
3901 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
3902 "Small write into new (sparse) blob");
3903
3904 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
3905 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
3906 "Onode extent map reshard events");
3907 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
3908 "Sum for blob splitting due to resharding");
3909 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
3910 "Sum for extents that have been removed due to compression");
3911 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
3912 "Sum for extents that have been merged due to garbage "
3913 "collection");
3914 logger = b.create_perf_counters();
3915 cct->get_perfcounters_collection()->add(logger);
3916 }
3917
3918 int BlueStore::_reload_logger()
3919 {
3920 struct store_statfs_t store_statfs;
3921
3922 int r = statfs(&store_statfs);
3923 if (r >= 0) {
3924 logger->set(l_bluestore_allocated, store_statfs.allocated);
3925 logger->set(l_bluestore_stored, store_statfs.stored);
3926 logger->set(l_bluestore_compressed, store_statfs.compressed);
3927 logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
3928 logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
3929 }
3930 return r;
3931 }
3932
3933 void BlueStore::_shutdown_logger()
3934 {
3935 cct->get_perfcounters_collection()->remove(logger);
3936 delete logger;
3937 }
3938
3939 int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
3940 uuid_d *fsid)
3941 {
3942 bluestore_bdev_label_t label;
3943 int r = _read_bdev_label(cct, path, &label);
3944 if (r < 0)
3945 return r;
3946 *fsid = label.osd_uuid;
3947 return 0;
3948 }
3949
3950 int BlueStore::_open_path()
3951 {
3952 assert(path_fd < 0);
3953 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY));
3954 if (path_fd < 0) {
3955 int r = -errno;
3956 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
3957 << dendl;
3958 return r;
3959 }
3960 return 0;
3961 }
3962
3963 void BlueStore::_close_path()
3964 {
3965 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
3966 path_fd = -1;
3967 }
3968
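// On-disk layout produced below (and parsed by _read_bdev_label): a single
// BDEV_LABEL_BLOCK_SIZE (4 KiB) block at offset 0 of the device containing
//   [encoded bluestore_bdev_label_t][crc32c of the encoded label, seed -1]
// followed by zero padding up to the 4 KiB boundary.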
3969 int BlueStore::_write_bdev_label(string path, bluestore_bdev_label_t label)
3970 {
3971 dout(10) << __func__ << " path " << path << " label " << label << dendl;
3972 bufferlist bl;
3973 ::encode(label, bl);
3974 uint32_t crc = bl.crc32c(-1);
3975 ::encode(crc, bl);
3976 assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
3977 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
3978 z.zero();
3979 bl.append(std::move(z));
3980
3981 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY));
3982 if (fd < 0) {
3983 fd = -errno;
3984 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
3985 << dendl;
3986 return fd;
3987 }
3988 int r = bl.write_fd(fd);
3989 if (r < 0) {
3990 derr << __func__ << " failed to write to " << path
3991 << ": " << cpp_strerror(r) << dendl;
3992 }
3993 VOID_TEMP_FAILURE_RETRY(::close(fd));
3994 return r;
3995 }
3996
3997 int BlueStore::_read_bdev_label(CephContext* cct, string path,
3998 bluestore_bdev_label_t *label)
3999 {
4000 dout(10) << __func__ << dendl;
4001 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY));
4002 if (fd < 0) {
4003 fd = -errno;
4004 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4005 << dendl;
4006 return fd;
4007 }
4008 bufferlist bl;
4009 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4010 VOID_TEMP_FAILURE_RETRY(::close(fd));
4011 if (r < 0) {
4012 derr << __func__ << " failed to read from " << path
4013 << ": " << cpp_strerror(r) << dendl;
4014 return r;
4015 }
4016
4017 uint32_t crc, expected_crc;
4018 bufferlist::iterator p = bl.begin();
4019 try {
4020 ::decode(*label, p);
4021 bufferlist t;
4022 t.substr_of(bl, 0, p.get_off());
4023 crc = t.crc32c(-1);
4024 ::decode(expected_crc, p);
4025 }
4026 catch (buffer::error& e) {
4027 derr << __func__ << " unable to decode label at offset " << p.get_off()
4028 << ": " << e.what()
4029 << dendl;
4030 return -EINVAL;
4031 }
4032 if (crc != expected_crc) {
4033 derr << __func__ << " bad crc on label, expected " << expected_crc
4034 << " != actual " << crc << dendl;
4035 return -EIO;
4036 }
4037 dout(10) << __func__ << " got " << *label << dendl;
4038 return 0;
4039 }
4040
4041 int BlueStore::_check_or_set_bdev_label(
4042 string path, uint64_t size, string desc, bool create)
4043 {
4044 bluestore_bdev_label_t label;
4045 if (create) {
4046 label.osd_uuid = fsid;
4047 label.size = size;
4048 label.btime = ceph_clock_now();
4049 label.description = desc;
4050 int r = _write_bdev_label(path, label);
4051 if (r < 0)
4052 return r;
4053 } else {
4054 int r = _read_bdev_label(cct, path, &label);
4055 if (r < 0)
4056 return r;
4057 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4058 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4059 << " and fsid " << fsid << " check bypassed" << dendl;
4060 }
4061 else if (label.osd_uuid != fsid) {
4062 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4063 << " does not match our fsid " << fsid << dendl;
4064 return -EIO;
4065 }
4066 }
4067 return 0;
4068 }
4069
4070 void BlueStore::_set_alloc_sizes(void)
4071 {
4072 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4073
4074 if (cct->_conf->bluestore_prefer_deferred_size) {
4075 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4076 } else {
4077 assert(bdev);
4078 if (bdev->is_rotational()) {
4079 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4080 } else {
4081 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4082 }
4083 }
4084
4085 if (cct->_conf->bluestore_deferred_batch_ops) {
4086 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4087 } else {
4088 assert(bdev);
4089 if (bdev->is_rotational()) {
4090 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4091 } else {
4092 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4093 }
4094 }
4095
4096 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
4097 << std::dec << " order " << min_alloc_size_order
4098 << " max_alloc_size 0x" << std::hex << max_alloc_size
4099 << " prefer_deferred_size 0x" << prefer_deferred_size
4100 << std::dec
4101 << " deferred_batch_ops " << deferred_batch_ops
4102 << dendl;
4103 }
4104
4105 int BlueStore::_open_bdev(bool create)
4106 {
4107 assert(bdev == NULL);
4108 string p = path + "/block";
4109 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
4110 int r = bdev->open(p);
4111 if (r < 0)
4112 goto fail;
4113
4114 if (bdev->supported_bdev_label()) {
4115 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4116 if (r < 0)
4117 goto fail_close;
4118 }
4119
4120 // initialize global block parameters
4121 block_size = bdev->get_block_size();
4122 block_mask = ~(block_size - 1);
4123 block_size_order = ctz(block_size);
4124 assert(block_size == 1u << block_size_order);
4125 // and set cache_size based on device type
4126 r = _set_cache_sizes();
4127 if (r < 0) {
4128 goto fail_close;
4129 }
4130 return 0;
4131
4132 fail_close:
4133 bdev->close();
4134 fail:
4135 delete bdev;
4136 bdev = NULL;
4137 return r;
4138 }
4139
4140 void BlueStore::_close_bdev()
4141 {
4142 assert(bdev);
4143 bdev->close();
4144 delete bdev;
4145 bdev = NULL;
4146 }
4147
4148 int BlueStore::_open_fm(bool create)
4149 {
4150 assert(fm == NULL);
4151 fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
4152
4153 if (create) {
4154 // initialize freespace
4155 dout(20) << __func__ << " initializing freespace" << dendl;
4156 KeyValueDB::Transaction t = db->get_transaction();
4157 {
4158 bufferlist bl;
4159 bl.append(freelist_type);
4160 t->set(PREFIX_SUPER, "freelist_type", bl);
4161 }
4162 fm->create(bdev->get_size(), t);
4163
4164 // allocate superblock reserved space. note that we do not mark
4165 // bluefs space as allocated in the freelist; we instead rely on
4166 // bluefs_extents.
4167 fm->allocate(0, SUPER_RESERVED, t);
4168
4169 uint64_t reserved = 0;
4170 if (cct->_conf->bluestore_bluefs) {
4171 assert(bluefs_extents.num_intervals() == 1);
4172 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
4173 reserved = p.get_start() + p.get_len();
4174 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4175 << " for bluefs" << dendl;
4176 bufferlist bl;
4177 ::encode(bluefs_extents, bl);
4178 t->set(PREFIX_SUPER, "bluefs_extents", bl);
4179 dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
4180 << std::dec << dendl;
4181 } else {
4182 reserved = SUPER_RESERVED;
4183 }
4184
4185 if (cct->_conf->bluestore_debug_prefill > 0) {
4186 uint64_t end = bdev->get_size() - reserved;
4187 dout(1) << __func__ << " pre-fragmenting freespace, using "
4188 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4189 << cct->_conf->bluestore_debug_prefragment_max << dendl;
4190 uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
4191 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4192 float r = cct->_conf->bluestore_debug_prefill;
4193 r /= 1.0 - r;
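// Worked example of the ratio above: with bluestore_debug_prefill p = 0.2,
// r = p / (1 - p) = 0.25, so after every freed run of length l we allocate
// roughly u = 0.25 * l right behind it; the used fraction is then
// u / (l + u) = r / (1 + r) = p, i.e. ~20% of the span ends up allocated.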
4194 bool stop = false;
4195
4196 while (!stop && start < end) {
4197 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4198 if (start + l > end) {
4199 l = end - start;
4200 l = P2ALIGN(l, min_alloc_size);
4201 }
4202 assert(start + l <= end);
4203
4204 uint64_t u = 1 + (uint64_t)(r * (double)l);
4205 u = P2ROUNDUP(u, min_alloc_size);
4206 if (start + l + u > end) {
4207 u = end - (start + l);
4208 // trim to align so we don't overflow again
4209 u = P2ALIGN(u, min_alloc_size);
4210 stop = true;
4211 }
4212 assert(start + l + u <= end);
4213
4214 dout(20) << " free 0x" << std::hex << start << "~" << l
4215 << " use 0x" << u << std::dec << dendl;
4216
4217 if (u == 0) {
4218 // break if u has been trimmed to nothing
4219 break;
4220 }
4221
4222 fm->allocate(start + l, u, t);
4223 start += l + u;
4224 }
4225 }
4226 db->submit_transaction_sync(t);
4227 }
4228
4229 int r = fm->init();
4230 if (r < 0) {
4231 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
4232 delete fm;
4233 fm = NULL;
4234 return r;
4235 }
4236 return 0;
4237 }
4238
4239 void BlueStore::_close_fm()
4240 {
4241 dout(10) << __func__ << dendl;
4242 assert(fm);
4243 fm->shutdown();
4244 delete fm;
4245 fm = NULL;
4246 }
4247
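// The allocator below keeps no persistent state of its own: at mount its
// in-memory free map is rebuilt from the FreelistManager (PREFIX_ALLOC),
// and the bluefs extents are then subtracted because, as noted in
// _open_fm(), bluefs space is tracked via bluefs_extents rather than the
// freelist.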
4248 int BlueStore::_open_alloc()
4249 {
4250 assert(alloc == NULL);
4251 assert(bdev->get_size());
4252 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
4253 bdev->get_size(),
4254 min_alloc_size);
4255 if (!alloc) {
4256 lderr(cct) << __func__ << " unknown allocator type "
4257 << cct->_conf->bluestore_allocator
4258 << dendl;
4259 return -EINVAL;
4260 }
4261
4262 uint64_t num = 0, bytes = 0;
4263
4264 dout(1) << __func__ << " opening allocation metadata" << dendl;
4265 // initialize from freelist
4266 fm->enumerate_reset();
4267 uint64_t offset, length;
4268 while (fm->enumerate_next(&offset, &length)) {
4269 alloc->init_add_free(offset, length);
4270 ++num;
4271 bytes += length;
4272 }
4273 fm->enumerate_reset();
4274 dout(1) << __func__ << " loaded " << pretty_si_t(bytes)
4275 << " in " << num << " extents"
4276 << dendl;
4277
4278 // also mark bluefs space as allocated
4279 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
4280 alloc->init_rm_free(e.get_start(), e.get_len());
4281 }
4282 dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
4283 << bluefs_extents << std::dec << " as allocated" << dendl;
4284
4285 return 0;
4286 }
4287
4288 void BlueStore::_close_alloc()
4289 {
4290 assert(alloc);
4291 alloc->shutdown();
4292 delete alloc;
4293 alloc = NULL;
4294 }
4295
4296 int BlueStore::_open_fsid(bool create)
4297 {
4298 assert(fsid_fd < 0);
4299 int flags = O_RDWR;
4300 if (create)
4301 flags |= O_CREAT;
4302 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
4303 if (fsid_fd < 0) {
4304 int err = -errno;
4305 derr << __func__ << " " << cpp_strerror(err) << dendl;
4306 return err;
4307 }
4308 return 0;
4309 }
4310
4311 int BlueStore::_read_fsid(uuid_d *uuid)
4312 {
4313 char fsid_str[40];
4314 memset(fsid_str, 0, sizeof(fsid_str));
4315 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
4316 if (ret < 0) {
4317 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
4318 return ret;
4319 }
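// A canonical uuid string is 36 characters; _write_fsid() appends a
// trailing "\n", so anything past 36 bytes is truncated before parsing.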
4320 if (ret > 36)
4321 fsid_str[36] = 0;
4322 else
4323 fsid_str[ret] = 0;
4324 if (!uuid->parse(fsid_str)) {
4325 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
4326 return -EINVAL;
4327 }
4328 return 0;
4329 }
4330
4331 int BlueStore::_write_fsid()
4332 {
4333 int r = ::ftruncate(fsid_fd, 0);
4334 if (r < 0) {
4335 r = -errno;
4336 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
4337 return r;
4338 }
4339 string str = stringify(fsid) + "\n";
4340 r = safe_write(fsid_fd, str.c_str(), str.length());
4341 if (r < 0) {
4342 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
4343 return r;
4344 }
4345 r = ::fsync(fsid_fd);
4346 if (r < 0) {
4347 r = -errno;
4348 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
4349 return r;
4350 }
4351 return 0;
4352 }
4353
4354 void BlueStore::_close_fsid()
4355 {
4356 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
4357 fsid_fd = -1;
4358 }
4359
4360 int BlueStore::_lock_fsid()
4361 {
4362 struct flock l;
4363 memset(&l, 0, sizeof(l));
4364 l.l_type = F_WRLCK;
4365 l.l_whence = SEEK_SET;
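// l_start and l_len remain 0 from the memset above, so this requests an
// exclusive (write) lock covering the whole fsid file.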
4366 int r = ::fcntl(fsid_fd, F_SETLK, &l);
4367 if (r < 0) {
4368 int err = errno;
4369 derr << __func__ << " failed to lock " << path << "/fsid"
4370 << " (is another ceph-osd still running?)"
4371 << cpp_strerror(err) << dendl;
4372 return -err;
4373 }
4374 return 0;
4375 }
4376
4377 bool BlueStore::is_rotational()
4378 {
4379 if (bdev) {
4380 return bdev->is_rotational();
4381 }
4382
4383 bool rotational = true;
4384 int r = _open_path();
4385 if (r < 0)
4386 goto out;
4387 r = _open_fsid(false);
4388 if (r < 0)
4389 goto out_path;
4390 r = _read_fsid(&fsid);
4391 if (r < 0)
4392 goto out_fsid;
4393 r = _lock_fsid();
4394 if (r < 0)
4395 goto out_fsid;
4396 r = _open_bdev(false);
4397 if (r < 0)
4398 goto out_fsid;
4399 rotational = bdev->is_rotational();
4400 _close_bdev();
4401 out_fsid:
4402 _close_fsid();
4403 out_path:
4404 _close_path();
4405 out:
4406 return rotational;
4407 }
4408
4409 bool BlueStore::is_journal_rotational()
4410 {
4411 if (!bluefs) {
4412 dout(5) << __func__ << " bluefs disabled, defaulting to store media type"
4413 << dendl;
4414 return is_rotational();
4415 }
4416 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
4417 return bluefs->wal_is_rotational();
4418 }
4419
4420 bool BlueStore::test_mount_in_use()
4421 {
4422 // most error conditions mean the mount is not in use (e.g., because
4423 // it doesn't exist). only if we fail to lock do we conclude it is
4424 // in use.
4425 bool ret = false;
4426 int r = _open_path();
4427 if (r < 0)
4428 return false;
4429 r = _open_fsid(false);
4430 if (r < 0)
4431 goto out_path;
4432 r = _lock_fsid();
4433 if (r < 0)
4434 ret = true; // if we can't lock, it is in use
4435 _close_fsid();
4436 out_path:
4437 _close_path();
4438 return ret;
4439 }
4440
4441 int BlueStore::_open_db(bool create)
4442 {
4443 int r;
4444 assert(!db);
4445 string fn = path + "/db";
4446 string options;
4447 stringstream err;
4448 ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
4449
4450 string kv_backend;
4451 if (create) {
4452 kv_backend = cct->_conf->bluestore_kvbackend;
4453 } else {
4454 r = read_meta("kv_backend", &kv_backend);
4455 if (r < 0) {
4456 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
4457 return -EIO;
4458 }
4459 }
4460 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
4461
4462 bool do_bluefs;
4463 if (create) {
4464 do_bluefs = cct->_conf->bluestore_bluefs;
4465 } else {
4466 string s;
4467 r = read_meta("bluefs", &s);
4468 if (r < 0) {
4469 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
4470 return -EIO;
4471 }
4472 if (s == "1") {
4473 do_bluefs = true;
4474 } else if (s == "0") {
4475 do_bluefs = false;
4476 } else {
4477 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
4478 << dendl;
4479 return -EIO;
4480 }
4481 }
4482 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
4483
4484 rocksdb::Env *env = NULL;
4485 if (do_bluefs) {
4486 dout(10) << __func__ << " initializing bluefs" << dendl;
4487 if (kv_backend != "rocksdb") {
4488 derr << " backend must be rocksdb to use bluefs" << dendl;
4489 return -EINVAL;
4490 }
4491 bluefs = new BlueFS(cct);
4492
4493 string bfn;
4494 struct stat st;
4495
4496 bfn = path + "/block.db";
4497 if (::stat(bfn.c_str(), &st) == 0) {
4498 r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
4499 if (r < 0) {
4500 derr << __func__ << " add block device(" << bfn << ") returned: "
4501 << cpp_strerror(r) << dendl;
4502 goto free_bluefs;
4503 }
4504
4505 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
4506 r = _check_or_set_bdev_label(
4507 bfn,
4508 bluefs->get_block_device_size(BlueFS::BDEV_DB),
4509 "bluefs db", create);
4510 if (r < 0) {
4511 derr << __func__
4512 << " check block device(" << bfn << ") label returned: "
4513 << cpp_strerror(r) << dendl;
4514 goto free_bluefs;
4515 }
4516 }
4517 if (create) {
4518 bluefs->add_block_extent(
4519 BlueFS::BDEV_DB,
4520 SUPER_RESERVED,
4521 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
4522 }
4523 bluefs_shared_bdev = BlueFS::BDEV_SLOW;
4524 bluefs_single_shared_device = false;
4525 } else if (::lstat(bfn.c_str(), &st) == -1) {
4526 bluefs_shared_bdev = BlueFS::BDEV_DB;
4527 } else {
4528 // a symlink exists but its target does not: treat as an error
4529 derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
4530 r = -errno;
4531 goto free_bluefs;
4532 }
4533
4534 // shared device
4535 bfn = path + "/block";
4536 r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
4537 if (r < 0) {
4538 derr << __func__ << " add block device(" << bfn << ") returned: "
4539 << cpp_strerror(r) << dendl;
4540 goto free_bluefs;
4541 }
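// Sizing sketch for the shared-device case below (illustrative numbers,
// not necessarily the configured defaults): with a 1 TiB main device and
// bluestore_bluefs_min_ratio + bluestore_bluefs_gift_ratio = 0.04, the
// initial bluefs extent is ~41 GiB (but at least bluestore_bluefs_min),
// rounded up to bluefs_alloc_size and placed starting near the middle of
// the device to keep HDD seeks short.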
4542 if (create) {
4543 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4544 uint64_t initial =
4545 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
4546 cct->_conf->bluestore_bluefs_gift_ratio);
4547 initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
4548 // align to bluefs's alloc_size
4549 initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
4550 // put bluefs in the middle of the device in case it is an HDD
4551 uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
4552 cct->_conf->bluefs_alloc_size);
4553 bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
4554 bluefs_extents.insert(start, initial);
4555 }
4556
4557 bfn = path + "/block.wal";
4558 if (::stat(bfn.c_str(), &st) == 0) {
4559 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
4560 if (r < 0) {
4561 derr << __func__ << " add block device(" << bfn << ") returned: "
4562 << cpp_strerror(r) << dendl;
4563 goto free_bluefs;
4564 }
4565
4566 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
4567 r = _check_or_set_bdev_label(
4568 bfn,
4569 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
4570 "bluefs wal", create);
4571 if (r < 0) {
4572 derr << __func__ << " check block device(" << bfn
4573 << ") label returned: " << cpp_strerror(r) << dendl;
4574 goto free_bluefs;
4575 }
4576 }
4577
4578 if (create) {
4579 bluefs->add_block_extent(
4580 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
4581 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
4582 BDEV_LABEL_BLOCK_SIZE);
4583 }
4584 cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
4585 bluefs_single_shared_device = false;
4586 } else if (::lstat(bfn.c_str(), &st) == -1) {
4587 cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
4588 } else {
4589 // a symlink exists but its target does not: treat as an error
4590 derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
4591 r = -errno;
4592 goto free_bluefs;
4593 }
4594
4595 if (create) {
4596 bluefs->mkfs(fsid);
4597 }
4598 r = bluefs->mount();
4599 if (r < 0) {
4600 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
4601 goto free_bluefs;
4602 }
4603 if (cct->_conf->bluestore_bluefs_env_mirror) {
4604 rocksdb::Env *a = new BlueRocksEnv(bluefs);
4605 rocksdb::Env *b = rocksdb::Env::Default();
4606 if (create) {
4607 string cmd = "rm -rf " + path + "/db " +
4608 path + "/db.slow " +
4609 path + "/db.wal";
4610 int r = system(cmd.c_str());
4611 (void)r;
4612 }
4613 env = new rocksdb::EnvMirror(b, a, false, true);
4614 } else {
4615 env = new BlueRocksEnv(bluefs);
4616
4617 // simplify the dir names, too, as "seen" by rocksdb
4618 fn = "db";
4619 }
4620
4621 if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
4622 // we have both block.db and block; tell rocksdb!
4623 // note: the second (last) size value doesn't really matter
4624 ostringstream db_paths;
4625 uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
4626 uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
4627 db_paths << fn << ","
4628 << (uint64_t)(db_size * 95 / 100) << " "
4629 << fn + ".slow" << ","
4630 << (uint64_t)(slow_size * 95 / 100);
4631 cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
4632 dout(10) << __func__ << " set rocksdb_db_paths to "
4633 << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
4634 }
4635
4636 if (create) {
4637 env->CreateDir(fn);
4638 if (cct->_conf->rocksdb_separate_wal_dir)
4639 env->CreateDir(fn + ".wal");
4640 if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
4641 env->CreateDir(fn + ".slow");
4642 }
4643 } else if (create) {
4644 int r = ::mkdir(fn.c_str(), 0755);
4645 if (r < 0)
4646 r = -errno;
4647 if (r < 0 && r != -EEXIST) {
4648 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
4649 << dendl;
4650 return r;
4651 }
4652
4653 // wal_dir, too!
4654 if (cct->_conf->rocksdb_separate_wal_dir) {
4655 string walfn = path + "/db.wal";
4656 r = ::mkdir(walfn.c_str(), 0755);
4657 if (r < 0)
4658 r = -errno;
4659 if (r < 0 && r != -EEXIST) {
4660 derr << __func__ << " failed to create " << walfn
4661 << ": " << cpp_strerror(r)
4662 << dendl;
4663 return r;
4664 }
4665 }
4666 }
4667
4668 db = KeyValueDB::create(cct,
4669 kv_backend,
4670 fn,
4671 static_cast<void*>(env));
4672 if (!db) {
4673 derr << __func__ << " error creating db" << dendl;
4674 if (bluefs) {
4675 bluefs->umount();
4676 delete bluefs;
4677 bluefs = NULL;
4678 }
4679 // delete env manually here since we can't depend on db to do this
4680 // under this case
4681 delete env;
4682 env = NULL;
4683 return -EIO;
4684 }
4685
4686 FreelistManager::setup_merge_operators(db);
4687 db->set_merge_operator(PREFIX_STAT, merge_op);
4688
4689 db->set_cache_size(cache_size * cache_kv_ratio);
4690
4691 if (kv_backend == "rocksdb")
4692 options = cct->_conf->bluestore_rocksdb_options;
4693 db->init(options);
4694 if (create)
4695 r = db->create_and_open(err);
4696 else
4697 r = db->open(err);
4698 if (r) {
4699 derr << __func__ << " error opening db: " << err.str() << dendl;
4700 if (bluefs) {
4701 bluefs->umount();
4702 delete bluefs;
4703 bluefs = NULL;
4704 }
4705 delete db;
4706 db = NULL;
4707 return -EIO;
4708 }
4709 dout(1) << __func__ << " opened " << kv_backend
4710 << " path " << fn << " options " << options << dendl;
4711 return 0;
4712
4713 free_bluefs:
4714 assert(bluefs);
4715 delete bluefs;
4716 bluefs = NULL;
4717 return r;
4718 }
4719
4720 void BlueStore::_close_db()
4721 {
4722 assert(db);
4723 delete db;
4724 db = NULL;
4725 if (bluefs) {
4726 bluefs->umount();
4727 delete bluefs;
4728 bluefs = NULL;
4729 }
4730 }
4731
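// Consistency check between two records of the same allocation: the extents
// bluefs itself believes it owns on the shared device versus the
// bluefs_extents set stored in bluestore's superblock. Extra space on the
// bluefs side is fatal; extra space on the superblock side can happen after
// a crash between the two commits and is simply re-gifted to bluefs.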
4732 int BlueStore::_reconcile_bluefs_freespace()
4733 {
4734 dout(10) << __func__ << dendl;
4735 interval_set<uint64_t> bset;
4736 int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
4737 assert(r == 0);
4738 if (bset == bluefs_extents) {
4739 dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
4740 << std::dec << dendl;
4741 return 0;
4742 }
4743 dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
4744 << dendl;
4745 dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
4746 << std::dec << dendl;
4747
4748 interval_set<uint64_t> overlap;
4749 overlap.intersection_of(bset, bluefs_extents);
4750
4751 bset.subtract(overlap);
4752 if (!bset.empty()) {
4753 derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
4754 << dendl;
4755 return -EIO;
4756 }
4757
4758 interval_set<uint64_t> super_extra;
4759 super_extra = bluefs_extents;
4760 super_extra.subtract(overlap);
4761 if (!super_extra.empty()) {
4762 // This is normal: it can happen if we commit to give extents to
4763 // bluefs and we crash before bluefs commits that it owns them.
4764 dout(10) << __func__ << " super extra " << super_extra << dendl;
4765 for (interval_set<uint64_t>::iterator p = super_extra.begin();
4766 p != super_extra.end();
4767 ++p) {
4768 bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
4769 }
4770 }
4771
4772 return 0;
4773 }
4774
4775 int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
4776 {
4777 int ret = 0;
4778 assert(bluefs);
4779
4780 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
4781 bluefs->get_usage(&bluefs_usage);
4782 assert(bluefs_usage.size() > bluefs_shared_bdev);
4783
4784 // fixme: look at primary bdev only for now
4785 uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
4786 uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
4787 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
4788
4789 uint64_t my_free = alloc->get_free();
4790 uint64_t total = bdev->get_size();
4791 float my_free_ratio = (float)my_free / (float)total;
4792
4793 uint64_t total_free = bluefs_free + my_free;
4794
4795 float bluefs_ratio = (float)bluefs_free / (float)total_free;
4796
4797 dout(10) << __func__
4798 << " bluefs " << pretty_si_t(bluefs_free)
4799 << " free (" << bluefs_free_ratio
4800 << ") bluestore " << pretty_si_t(my_free)
4801 << " free (" << my_free_ratio
4802 << "), bluefs_ratio " << bluefs_ratio
4803 << dendl;
4804
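// Balancing policy, roughly: if bluefs holds less than
// bluestore_bluefs_min_ratio of the combined free space it is gifted
// bluestore_bluefs_gift_ratio * total_free; if it holds more than
// bluestore_bluefs_max_ratio, bluestore_bluefs_reclaim_ratio * total_free
// is reclaimed, never shrinking bluefs below bluestore_bluefs_min.
// Example with illustrative ratios (min 0.02, gift 0.02): at bluefs_ratio
// 0.01 and total_free 100 GiB, the gift would be ~2 GiB.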
4805 uint64_t gift = 0;
4806 uint64_t reclaim = 0;
4807 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
4808 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
4809 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4810 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
4811 << ", should gift " << pretty_si_t(gift) << dendl;
4812 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
4813 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
4814 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
4815 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
4816 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
4817 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
4818 << ", should reclaim " << pretty_si_t(reclaim) << dendl;
4819 }
4820 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
4821 cct->_conf->bluestore_bluefs_min <
4822 (uint64_t)(cct->_conf->bluestore_bluefs_max_ratio * total_free)) {
4823 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
4824 dout(10) << __func__ << " bluefs_total " << bluefs_total
4825 << " < min " << cct->_conf->bluestore_bluefs_min
4826 << ", should gift " << pretty_si_t(g) << dendl;
4827 if (g > gift)
4828 gift = g;
4829 reclaim = 0;
4830 }
4831
4832 if (gift) {
4833 // round up to alloc size
4834 gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
4835
4836 // hard cap to fit into 32 bits
4837 gift = MIN(gift, 1ull<<31);
4838 dout(10) << __func__ << " gifting " << gift
4839 << " (" << pretty_si_t(gift) << ")" << dendl;
4840
4841 // fixme: just do one allocation to start...
4842 int r = alloc->reserve(gift);
4843 assert(r == 0);
4844
4845 AllocExtentVector exts;
4846 int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
4847 0, 0, &exts);
4848
4849 if (alloc_len < (int64_t)gift) {
4850 derr << __func__ << " allocate failed on 0x" << std::hex << gift
4851 << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
4852 alloc->dump();
4853 assert(0 == "allocate failed, wtf");
4854 return -ENOSPC;
4855 }
4856 for (auto& p : exts) {
4857 bluestore_pextent_t e = bluestore_pextent_t(p);
4858 dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
4859 extents->push_back(e);
4860 }
4861 gift = 0;
4862
4863 ret = 1;
4864 }
4865
4866 // reclaim from bluefs?
4867 if (reclaim) {
4868 // round up to alloc size
4869 reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);
4870
4871 // hard cap to fit into 32 bits
4872 reclaim = MIN(reclaim, 1ull<<31);
4873 dout(10) << __func__ << " reclaiming " << reclaim
4874 << " (" << pretty_si_t(reclaim) << ")" << dendl;
4875
4876 while (reclaim > 0) {
4877 // NOTE: this will block and do IO.
4878 AllocExtentVector extents;
4879 int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
4880 &extents);
4881 if (r < 0) {
4882 derr << __func__ << " failed to reclaim space from bluefs"
4883 << dendl;
4884 break;
4885 }
4886 for (auto e : extents) {
4887 bluefs_extents.erase(e.offset, e.length);
4888 bluefs_extents_reclaiming.insert(e.offset, e.length);
4889 reclaim -= e.length;
4890 }
4891 }
4892
4893 ret = 1;
4894 }
4895
4896 return ret;
4897 }
4898
4899 void BlueStore::_commit_bluefs_freespace(
4900 const PExtentVector& bluefs_gift_extents)
4901 {
4902 dout(10) << __func__ << dendl;
4903 for (auto& p : bluefs_gift_extents) {
4904 bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
4905 }
4906 }
4907
4908 int BlueStore::_open_collections(int *errors)
4909 {
4910 assert(coll_map.empty());
4911 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
4912 for (it->upper_bound(string());
4913 it->valid();
4914 it->next()) {
4915 coll_t cid;
4916 if (cid.parse(it->key())) {
4917 CollectionRef c(
4918 new Collection(
4919 this,
4920 cache_shards[cid.hash_to_shard(cache_shards.size())],
4921 cid));
4922 bufferlist bl = it->value();
4923 bufferlist::iterator p = bl.begin();
4924 try {
4925 ::decode(c->cnode, p);
4926 } catch (buffer::error& e) {
4927 derr << __func__ << " failed to decode cnode, key:"
4928 << pretty_binary_string(it->key()) << dendl;
4929 return -EIO;
4930 }
4931 dout(20) << __func__ << " opened " << cid << " " << c << dendl;
4932 coll_map[cid] = c;
4933 } else {
4934 derr << __func__ << " unrecognized collection " << it->key() << dendl;
4935 if (errors)
4936 (*errors)++;
4937 }
4938 }
4939 return 0;
4940 }
4941
4942 void BlueStore::_open_statfs()
4943 {
4944 bufferlist bl;
4945 int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
4946 if (r >= 0) {
4947 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
4948 auto it = bl.begin();
4949 vstatfs.decode(it);
4950 } else {
4951 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
4952 }
4953 }
4954 else {
4955 dout(10) << __func__ << " store_statfs missing, using empty" << dendl;
4956 }
4957 }
4958
4959 int BlueStore::_setup_block_symlink_or_file(
4960 string name,
4961 string epath,
4962 uint64_t size,
4963 bool create)
4964 {
4965 dout(20) << __func__ << " name " << name << " path " << epath
4966 << " size " << size << " create=" << (int)create << dendl;
4967 int r = 0;
4968 int flags = O_RDWR;
4969 if (create)
4970 flags |= O_CREAT;
4971 if (epath.length()) {
4972 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
4973 if (r < 0) {
4974 r = -errno;
4975 derr << __func__ << " failed to create " << name << " symlink to "
4976 << epath << ": " << cpp_strerror(r) << dendl;
4977 return r;
4978 }
4979
4980 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
4981 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
4982 if (fd < 0) {
4983 r = -errno;
4984 derr << __func__ << " failed to open " << epath << " file: "
4985 << cpp_strerror(r) << dendl;
4986 return r;
4987 }
4988 string serial_number = epath.substr(strlen(SPDK_PREFIX));
4989 r = ::write(fd, serial_number.c_str(), serial_number.size());
4990 assert(r == (int)serial_number.size());
4991 dout(1) << __func__ << " created " << name << " symlink to "
4992 << epath << dendl;
4993 VOID_TEMP_FAILURE_RETRY(::close(fd));
4994 }
4995 }
4996 if (size) {
4997 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
4998 if (fd >= 0) {
4999 // block file is present
5000 struct stat st;
5001 int r = ::fstat(fd, &st);
5002 if (r == 0 &&
5003 S_ISREG(st.st_mode) && // if it is a regular file
5004 st.st_size == 0) { // and is 0 bytes
5005 r = ::ftruncate(fd, size);
5006 if (r < 0) {
5007 r = -errno;
5008 derr << __func__ << " failed to resize " << name << " file to "
5009 << size << ": " << cpp_strerror(r) << dendl;
5010 VOID_TEMP_FAILURE_RETRY(::close(fd));
5011 return r;
5012 }
5013
5014 if (cct->_conf->bluestore_block_preallocate_file) {
5015 #ifdef HAVE_POSIX_FALLOCATE
5016 r = ::posix_fallocate(fd, 0, size);
5017 if (r) {
5018 derr << __func__ << " failed to preallocate " << name << " file to "
5019 << size << ": " << cpp_strerror(r) << dendl;
5020 VOID_TEMP_FAILURE_RETRY(::close(fd));
5021 return -r;
5022 }
5023 #else
5024 char data[1024*128] = {0}; // zero-fill so we don't write uninitialized stack memory
5025 for (uint64_t off = 0; off < size; off += sizeof(data)) {
5026 if (off + sizeof(data) > size)
5027 r = ::write(fd, data, size - off);
5028 else
5029 r = ::write(fd, data, sizeof(data));
5030 if (r < 0) {
5031 r = -errno;
5032 derr << __func__ << " failed to preallocate with write " << name << " file to "
5033 << size << ": " << cpp_strerror(r) << dendl;
5034 VOID_TEMP_FAILURE_RETRY(::close(fd));
5035 return r;
5036 }
5037 }
5038 #endif
5039 }
5040 dout(1) << __func__ << " resized " << name << " file to "
5041 << pretty_si_t(size) << "B" << dendl;
5042 }
5043 VOID_TEMP_FAILURE_RETRY(::close(fd));
5044 } else {
5045 int r = -errno;
5046 if (r != -ENOENT) {
5047 derr << __func__ << " failed to open " << name << " file: "
5048 << cpp_strerror(r) << dendl;
5049 return r;
5050 }
5051 }
5052 }
5053 return 0;
5054 }
5055
5056 int BlueStore::mkfs()
5057 {
5058 dout(1) << __func__ << " path " << path << dendl;
5059 int r;
5060 uuid_d old_fsid;
5061
5062 {
5063 string done;
5064 r = read_meta("mkfs_done", &done);
5065 if (r == 0) {
5066 dout(1) << __func__ << " already created" << dendl;
5067 if (cct->_conf->bluestore_fsck_on_mkfs) {
5068 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5069 if (r < 0) {
5070 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
5071 << dendl;
5072 return r;
5073 }
5074 if (r > 0) {
5075 derr << __func__ << " fsck found " << r << " errors" << dendl;
5076 r = -EIO;
5077 }
5078 }
5079 return r; // idempotent
5080 }
5081 }
5082
5083 {
5084 string type;
5085 r = read_meta("type", &type);
5086 if (r == 0) {
5087 if (type != "bluestore") {
5088 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5089 return -EIO;
5090 }
5091 } else {
5092 r = write_meta("type", "bluestore");
5093 if (r < 0)
5094 return r;
5095 }
5096 }
5097
5098 freelist_type = "bitmap";
5099
5100 r = _open_path();
5101 if (r < 0)
5102 return r;
5103
5104 r = _open_fsid(true);
5105 if (r < 0)
5106 goto out_path_fd;
5107
5108 r = _lock_fsid();
5109 if (r < 0)
5110 goto out_close_fsid;
5111
5112 r = _read_fsid(&old_fsid);
5113 if (r < 0 || old_fsid.is_zero()) {
5114 if (fsid.is_zero()) {
5115 fsid.generate_random();
5116 dout(1) << __func__ << " generated fsid " << fsid << dendl;
5117 } else {
5118 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
5119 }
5120 // we'll write it later.
5121 } else {
5122 if (!fsid.is_zero() && fsid != old_fsid) {
5123 derr << __func__ << " on-disk fsid " << old_fsid
5124 << " != provided " << fsid << dendl;
5125 r = -EINVAL;
5126 goto out_close_fsid;
5127 }
5128 fsid = old_fsid;
5129 }
5130
5131 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
5132 cct->_conf->bluestore_block_size,
5133 cct->_conf->bluestore_block_create);
5134 if (r < 0)
5135 goto out_close_fsid;
5136 if (cct->_conf->bluestore_bluefs) {
5137 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
5138 cct->_conf->bluestore_block_wal_size,
5139 cct->_conf->bluestore_block_wal_create);
5140 if (r < 0)
5141 goto out_close_fsid;
5142 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
5143 cct->_conf->bluestore_block_db_size,
5144 cct->_conf->bluestore_block_db_create);
5145 if (r < 0)
5146 goto out_close_fsid;
5147 }
5148
5149 r = _open_bdev(true);
5150 if (r < 0)
5151 goto out_close_fsid;
5152
5153 r = _open_db(true);
5154 if (r < 0)
5155 goto out_close_bdev;
5156
5157 r = _open_fm(true);
5158 if (r < 0)
5159 goto out_close_db;
5160
5161 {
5162 KeyValueDB::Transaction t = db->get_transaction();
5163 {
5164 bufferlist bl;
5165 ::encode((uint64_t)0, bl);
5166 t->set(PREFIX_SUPER, "nid_max", bl);
5167 t->set(PREFIX_SUPER, "blobid_max", bl);
5168 }
5169
5170 // choose min_alloc_size
5171 if (cct->_conf->bluestore_min_alloc_size) {
5172 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
5173 } else {
5174 assert(bdev);
5175 if (bdev->is_rotational()) {
5176 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
5177 } else {
5178 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
5179 }
5180 }
5181
5182 // make sure min_alloc_size is power of 2 aligned.
5183 if (!ISP2(min_alloc_size)) {
5184 derr << __func__ << " min_alloc_size 0x"
5185 << std::hex << min_alloc_size << std::dec
5186 << " is not power of 2 aligned!"
5187 << dendl;
5188 r = -EINVAL;
5189 goto out_close_fm;
5190 }
5191
5192 {
5193 bufferlist bl;
5194 ::encode((uint64_t)min_alloc_size, bl);
5195 t->set(PREFIX_SUPER, "min_alloc_size", bl);
5196 }
5197
5198 ondisk_format = latest_ondisk_format;
5199 _prepare_ondisk_format_super(t);
5200 db->submit_transaction_sync(t);
5201 }
5202
5203
5204 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
5205 if (r < 0)
5206 goto out_close_fm;
5207
5208 r = write_meta("bluefs", stringify((int)cct->_conf->bluestore_bluefs));
5209 if (r < 0)
5210 goto out_close_fm;
5211
5212 if (fsid != old_fsid) {
5213 r = _write_fsid();
5214 if (r < 0) {
5215 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
5216 goto out_close_fm;
5217 }
5218 }
5219
5220 out_close_fm:
5221 _close_fm();
5222 out_close_db:
5223 _close_db();
5224 out_close_bdev:
5225 _close_bdev();
5226 out_close_fsid:
5227 _close_fsid();
5228 out_path_fd:
5229 _close_path();
5230
5231 if (r == 0 &&
5232 cct->_conf->bluestore_fsck_on_mkfs) {
5233 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
5234 if (rc < 0)
5235 return rc;
5236 if (rc > 0) {
5237 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5238 r = -EIO;
5239 }
5240 }
5241
5242 if (r == 0) {
5243 // indicate success by writing the 'mkfs_done' file
5244 r = write_meta("mkfs_done", "yes");
5245 }
5246
5247 if (r < 0) {
5248 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
5249 } else {
5250 dout(0) << __func__ << " success" << dendl;
5251 }
5252 return r;
5253 }
5254
5255 void BlueStore::set_cache_shards(unsigned num)
5256 {
5257 dout(10) << __func__ << " " << num << dendl;
5258 size_t old = cache_shards.size();
5259 assert(num >= old);
5260 cache_shards.resize(num);
5261 for (unsigned i = old; i < num; ++i) {
5262 cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
5263 logger);
5264 }
5265 }
5266
5267 int BlueStore::_mount(bool kv_only)
5268 {
5269 dout(1) << __func__ << " path " << path << dendl;
5270
5271 {
5272 string type;
5273 int r = read_meta("type", &type);
5274 if (r < 0) {
5275 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5276 << dendl;
5277 return r;
5278 }
5279
5280 if (type != "bluestore") {
5281 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5282 return -EIO;
5283 }
5284 }
5285
5286 if (cct->_conf->bluestore_fsck_on_mount) {
5287 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
5288 if (rc < 0)
5289 return rc;
5290 if (rc > 0) {
5291 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5292 return -EIO;
5293 }
5294 }
5295
5296 int r = _open_path();
5297 if (r < 0)
5298 return r;
5299 r = _open_fsid(false);
5300 if (r < 0)
5301 goto out_path;
5302
5303 r = _read_fsid(&fsid);
5304 if (r < 0)
5305 goto out_fsid;
5306
5307 r = _lock_fsid();
5308 if (r < 0)
5309 goto out_fsid;
5310
5311 r = _open_bdev(false);
5312 if (r < 0)
5313 goto out_fsid;
5314
5315 r = _open_db(false);
5316 if (r < 0)
5317 goto out_bdev;
5318
5319 if (kv_only)
5320 return 0;
5321
5322 r = _open_super_meta();
5323 if (r < 0)
5324 goto out_db;
5325
5326 r = _open_fm(false);
5327 if (r < 0)
5328 goto out_db;
5329
5330 r = _open_alloc();
5331 if (r < 0)
5332 goto out_fm;
5333
5334 r = _open_collections();
5335 if (r < 0)
5336 goto out_alloc;
5337
5338 r = _reload_logger();
5339 if (r < 0)
5340 goto out_coll;
5341
5342 if (bluefs) {
5343 r = _reconcile_bluefs_freespace();
5344 if (r < 0)
5345 goto out_coll;
5346 }
5347
5348 _kv_start();
5349
5350 r = _deferred_replay();
5351 if (r < 0)
5352 goto out_stop;
5353
5354 mempool_thread.init();
5355
5356
5357 mounted = true;
5358 return 0;
5359
5360 out_stop:
5361 _kv_stop();
5362 out_coll:
5363 _flush_cache();
5364 out_alloc:
5365 _close_alloc();
5366 out_fm:
5367 _close_fm();
5368 out_db:
5369 _close_db();
5370 out_bdev:
5371 _close_bdev();
5372 out_fsid:
5373 _close_fsid();
5374 out_path:
5375 _close_path();
5376 return r;
5377 }
5378
5379 int BlueStore::umount()
5380 {
5381 assert(mounted);
5382 dout(1) << __func__ << dendl;
5383
5384 _osr_drain_all();
5385 _osr_unregister_all();
5386
5387 mempool_thread.shutdown();
5388
5389 dout(20) << __func__ << " stopping kv thread" << dendl;
5390 _kv_stop();
5391 _reap_collections();
5392 _flush_cache();
5393 dout(20) << __func__ << " closing" << dendl;
5394
5395 mounted = false;
5396 _close_alloc();
5397 _close_fm();
5398 _close_db();
5399 _close_bdev();
5400 _close_fsid();
5401 _close_path();
5402
5403 if (cct->_conf->bluestore_fsck_on_umount) {
5404 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
5405 if (rc < 0)
5406 return rc;
5407 if (rc > 0) {
5408 derr << __func__ << " fsck found " << rc << " errors" << dendl;
5409 return -EIO;
5410 }
5411 }
5412 return 0;
5413 }
5414
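// fsck helper: walks [off, off+len) in steps of `granularity` and invokes f
// on the corresponding bit position of the supplied bitset, so a single
// call can mark or test every block touched by an extent. The `what`
// argument is only descriptive context and is not otherwise used here.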
5415 static void apply(uint64_t off,
5416 uint64_t len,
5417 uint64_t granularity,
5418 BlueStore::mempool_dynamic_bitset &bitset,
5419 const char *what,
5420 std::function<void(uint64_t,
5421 BlueStore::mempool_dynamic_bitset &)> f) {
5422 auto end = ROUND_UP_TO(off + len, granularity);
5423 while (off < end) {
5424 uint64_t pos = off / granularity;
5425 f(pos, bitset);
5426 off += granularity;
5427 }
5428 }
5429
5430 int BlueStore::_fsck_check_extents(
5431 const ghobject_t& oid,
5432 const PExtentVector& extents,
5433 bool compressed,
5434 mempool_dynamic_bitset &used_blocks,
5435 store_statfs_t& expected_statfs)
5436 {
5437 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
5438 int errors = 0;
5439 for (auto e : extents) {
5440 if (!e.is_valid())
5441 continue;
5442 expected_statfs.allocated += e.length;
5443 if (compressed) {
5444 expected_statfs.compressed_allocated += e.length;
5445 }
5446 bool already = false;
5447 apply(
5448 e.offset, e.length, block_size, used_blocks, __func__,
5449 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5450 if (bs.test(pos))
5451 already = true;
5452 else
5453 bs.set(pos);
5454 });
5455 if (already) {
5456 derr << " " << oid << " extent " << e
5457 << " or a subset is already allocated" << dendl;
5458 ++errors;
5459 }
5460 if (e.end() > bdev->get_size()) {
5461 derr << " " << oid << " extent " << e
5462 << " past end of block device" << dendl;
5463 ++errors;
5464 }
5465 }
5466 return errors;
5467 }
5468
5469 int BlueStore::fsck(bool deep)
5470 {
5471 dout(1) << __func__ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
5472 int errors = 0;
5473
5474 typedef btree::btree_set<
5475 uint64_t,std::less<uint64_t>,
5476 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
5477 uint64_t_btree_t used_nids;
5478 uint64_t_btree_t used_omap_head;
5479 uint64_t_btree_t used_sbids;
5480
5481 mempool_dynamic_bitset used_blocks;
5482 KeyValueDB::Iterator it;
5483 store_statfs_t expected_statfs, actual_statfs;
5484 struct sb_info_t {
5485 list<ghobject_t> oids;
5486 SharedBlobRef sb;
5487 bluestore_extent_ref_map_t ref_map;
5488 bool compressed;
5489 };
5490 mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
5491
5492 uint64_t num_objects = 0;
5493 uint64_t num_extents = 0;
5494 uint64_t num_blobs = 0;
5495 uint64_t num_spanning_blobs = 0;
5496 uint64_t num_shared_blobs = 0;
5497 uint64_t num_sharded_objects = 0;
5498 uint64_t num_object_shards = 0;
5499
5500 utime_t start = ceph_clock_now();
5501
5502 int r = _open_path();
5503 if (r < 0)
5504 return r;
5505 r = _open_fsid(false);
5506 if (r < 0)
5507 goto out_path;
5508
5509 r = _read_fsid(&fsid);
5510 if (r < 0)
5511 goto out_fsid;
5512
5513 r = _lock_fsid();
5514 if (r < 0)
5515 goto out_fsid;
5516
5517 r = _open_bdev(false);
5518 if (r < 0)
5519 goto out_fsid;
5520
5521 r = _open_db(false);
5522 if (r < 0)
5523 goto out_bdev;
5524
5525 r = _open_super_meta();
5526 if (r < 0)
5527 goto out_db;
5528
5529 r = _open_fm(false);
5530 if (r < 0)
5531 goto out_db;
5532
5533 r = _open_alloc();
5534 if (r < 0)
5535 goto out_fm;
5536
5537 r = _open_collections(&errors);
5538 if (r < 0)
5539 goto out_alloc;
5540
5541 mempool_thread.init();
5542
5543 // we need finishers and kv_{sync,finalize}_thread *just* for replay
5544 _kv_start();
5545 r = _deferred_replay();
5546 _kv_stop();
5547 if (r < 0)
5548 goto out_scan;
5549
5550 used_blocks.resize(bdev->get_size() / block_size);
5551 apply(
5552 0, SUPER_RESERVED, block_size, used_blocks, "0~SUPER_RESERVED",
5553 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5554 bs.set(pos);
5555 }
5556 );
5557
5558 if (bluefs) {
5559 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5560 apply(
5561 e.get_start(), e.get_len(), block_size, used_blocks, "bluefs",
5562 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5563 bs.set(pos);
5564 }
5565 );
5566 }
5567 r = bluefs->fsck();
5568 if (r < 0) {
5569 goto out_scan;
5570 }
5571 if (r > 0)
5572 errors += r;
5573 }
5574
5575 // get expected statfs; fill unaffected fields to be able to compare
5576 // structs
5577 statfs(&actual_statfs);
5578 expected_statfs.total = actual_statfs.total;
5579 expected_statfs.available = actual_statfs.available;
5580
5581 // walk PREFIX_OBJ
5582 dout(1) << __func__ << " walking object keyspace" << dendl;
5583 it = db->get_iterator(PREFIX_OBJ);
5584 if (it) {
5585 CollectionRef c;
5586 spg_t pgid;
5587 mempool::bluestore_fsck::list<string> expecting_shards;
5588 for (it->lower_bound(string()); it->valid(); it->next()) {
5589 if (g_conf->bluestore_debug_fsck_abort) {
5590 goto out_scan;
5591 }
5592 dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
5593 if (is_extent_shard_key(it->key())) {
5594 while (!expecting_shards.empty() &&
5595 expecting_shards.front() < it->key()) {
5596 derr << __func__ << " error: missing shard key "
5597 << pretty_binary_string(expecting_shards.front())
5598 << dendl;
5599 ++errors;
5600 expecting_shards.pop_front();
5601 }
5602 if (!expecting_shards.empty() &&
5603 expecting_shards.front() == it->key()) {
5604 // all good
5605 expecting_shards.pop_front();
5606 continue;
5607 }
5608
5609 uint32_t offset;
5610 string okey;
5611 get_key_extent_shard(it->key(), &okey, &offset);
5612 derr << __func__ << " error: stray shard 0x" << std::hex << offset
5613 << std::dec << dendl;
5614 if (expecting_shards.empty()) {
5615 derr << __func__ << " error: " << pretty_binary_string(it->key())
5616 << " is unexpected" << dendl;
5617 ++errors;
5618 continue;
5619 }
5620 while (expecting_shards.front() > it->key()) {
5621 derr << __func__ << " error: saw " << pretty_binary_string(it->key())
5622 << dendl;
5623 derr << __func__ << " error: exp "
5624 << pretty_binary_string(expecting_shards.front()) << dendl;
5625 ++errors;
5626 expecting_shards.pop_front();
5627 if (expecting_shards.empty()) {
5628 break;
5629 }
5630 }
5631 continue;
5632 }
5633
5634 ghobject_t oid;
5635 int r = get_key_object(it->key(), &oid);
5636 if (r < 0) {
5637 derr << __func__ << " error: bad object key "
5638 << pretty_binary_string(it->key()) << dendl;
5639 ++errors;
5640 continue;
5641 }
5642 if (!c ||
5643 oid.shard_id != pgid.shard ||
5644 oid.hobj.pool != (int64_t)pgid.pool() ||
5645 !c->contains(oid)) {
5646 c = nullptr;
5647 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
5648 coll_map.begin();
5649 p != coll_map.end();
5650 ++p) {
5651 if (p->second->contains(oid)) {
5652 c = p->second;
5653 break;
5654 }
5655 }
5656 if (!c) {
5657 derr << __func__ << " error: stray object " << oid
5658 << " not owned by any collection" << dendl;
5659 ++errors;
5660 continue;
5661 }
5662 c->cid.is_pg(&pgid);
5663 dout(20) << __func__ << " collection " << c->cid << dendl;
5664 }
5665
5666 if (!expecting_shards.empty()) {
5667 for (auto &k : expecting_shards) {
5668 derr << __func__ << " error: missing shard key "
5669 << pretty_binary_string(k) << dendl;
5670 }
5671 ++errors;
5672 expecting_shards.clear();
5673 }
5674
5675 dout(10) << __func__ << " " << oid << dendl;
5676 RWLock::RLocker l(c->lock);
5677 OnodeRef o = c->get_onode(oid, false);
5678 if (o->onode.nid) {
5679 if (o->onode.nid > nid_max) {
5680 derr << __func__ << " error: " << oid << " nid " << o->onode.nid
5681 << " > nid_max " << nid_max << dendl;
5682 ++errors;
5683 }
5684 if (used_nids.count(o->onode.nid)) {
5685 derr << __func__ << " error: " << oid << " nid " << o->onode.nid
5686 << " already in use" << dendl;
5687 ++errors;
5688 continue; // go for next object
5689 }
5690 used_nids.insert(o->onode.nid);
5691 }
5692 ++num_objects;
5693 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
5694 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
5695 _dump_onode(o, 30);
5696 // shards
5697 if (!o->extent_map.shards.empty()) {
5698 ++num_sharded_objects;
5699 num_object_shards += o->extent_map.shards.size();
5700 }
5701 for (auto& s : o->extent_map.shards) {
5702 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
5703 expecting_shards.push_back(string());
5704 get_extent_shard_key(o->key, s.shard_info->offset,
5705 &expecting_shards.back());
5706 if (s.shard_info->offset >= o->onode.size) {
5707 derr << __func__ << " error: " << oid << " shard 0x" << std::hex
5708 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
5709 << std::dec << dendl;
5710 ++errors;
5711 }
5712 }
5713 // lextents
5714 map<BlobRef,bluestore_blob_t::unused_t> referenced;
5715 uint64_t pos = 0;
5716 mempool::bluestore_fsck::map<BlobRef,
5717 bluestore_blob_use_tracker_t> ref_map;
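// walk the lextents in logical order: check for overlaps and
// shard-boundary violations, and accumulate a per-blob use tracker that
// is compared against each blob's own tracker further down.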
5718 for (auto& l : o->extent_map.extent_map) {
5719 dout(20) << __func__ << " " << l << dendl;
5720 if (l.logical_offset < pos) {
5721 derr << __func__ << " error: " << oid << " lextent at 0x"
5722 << std::hex << l.logical_offset
5723 << " overlaps with the previous, which ends at 0x" << pos
5724 << std::dec << dendl;
5725 ++errors;
5726 }
5727 if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
5728 derr << __func__ << " error: " << oid << " lextent at 0x"
5729 << std::hex << l.logical_offset << "~" << l.length
5730 << " spans a shard boundary"
5731 << std::dec << dendl;
5732 ++errors;
5733 }
5734 pos = l.logical_offset + l.length;
5735 expected_statfs.stored += l.length;
5736 assert(l.blob);
5737 const bluestore_blob_t& blob = l.blob->get_blob();
5738
5739 auto& ref = ref_map[l.blob];
5740 if (ref.is_empty()) {
5741 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
5742 uint32_t l = blob.get_logical_length();
5743 ref.init(l, min_release_size);
5744 }
5745 ref.get(
5746 l.blob_offset,
5747 l.length);
5748 ++num_extents;
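// record, per blob, which "unused bitmap" bits this lextent actually
// touches; the bitmap divides the blob's logical length into
// 8*sizeof(unused_t) equal chunks, one bit each. E.g. (illustrative
// numbers only): a 64 KiB blob with a 64-bit mask has 1 KiB chunks, so an
// extent at blob offset 0x800, length 0x1000 covers bits 2..5.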
5749 if (blob.has_unused()) {
5750 auto p = referenced.find(l.blob);
5751 bluestore_blob_t::unused_t *pu;
5752 if (p == referenced.end()) {
5753 pu = &referenced[l.blob];
5754 } else {
5755 pu = &p->second;
5756 }
5757 uint64_t blob_len = blob.get_logical_length();
5758 assert((blob_len % (sizeof(*pu)*8)) == 0);
5759 assert(l.blob_offset + l.length <= blob_len);
5760 uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
5761 uint64_t start = l.blob_offset / chunk_size;
5762 uint64_t end =
5763 ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
5764 for (auto i = start; i < end; ++i) {
5765 (*pu) |= (1u << i);
5766 }
5767 }
5768 }
5769 for (auto &i : referenced) {
5770 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
5771 << std::dec << " for " << *i.first << dendl;
5772 const bluestore_blob_t& blob = i.first->get_blob();
5773 if (i.second & blob.unused) {
5774 derr << __func__ << " error: " << oid << " blob claims unused 0x"
5775 << std::hex << blob.unused
5776 << " but extents reference 0x" << i.second
5777 << " on blob " << *i.first << dendl;
5778 ++errors;
5779 }
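// cross-check against the checksum array: if every unused-bitmap bit
// covering a csum chunk is set, that chunk was never written, so its
// stored checksum should be zero.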
5780 if (blob.has_csum()) {
5781 uint64_t blob_len = blob.get_logical_length();
5782 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
5783 unsigned csum_count = blob.get_csum_count();
5784 unsigned csum_chunk_size = blob.get_csum_chunk_size();
5785 for (unsigned p = 0; p < csum_count; ++p) {
5786 unsigned pos = p * csum_chunk_size;
5787 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
5788 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
5789 unsigned mask = 1u << firstbit;
5790 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
5791 mask |= 1u << b;
5792 }
5793 if ((blob.unused & mask) == mask) {
5794 // this csum chunk region is marked unused
5795 if (blob.get_csum_item(p) != 0) {
5796 derr << __func__ << " error: " << oid
5797 << " blob claims csum chunk 0x" << std::hex << pos
5798 << "~" << csum_chunk_size
5799 << " is unused (mask 0x" << mask << " of unused 0x"
5800 << blob.unused << ") but csum is non-zero 0x"
5801 << blob.get_csum_item(p) << std::dec << " on blob "
5802 << *i.first << dendl;
5803 ++errors;
5804 }
5805 }
5806 }
5807 }
5808 }
5809 for (auto &i : ref_map) {
5810 ++num_blobs;
5811 const bluestore_blob_t& blob = i.first->get_blob();
5812 bool equal = i.first->get_blob_use_tracker().equal(i.second);
5813 if (!equal) {
5814 derr << __func__ << " error: " << oid << " blob " << *i.first
5815 << " doesn't match expected ref_map " << i.second << dendl;
5816 ++errors;
5817 }
5818 if (blob.is_compressed()) {
5819 expected_statfs.compressed += blob.get_compressed_payload_length();
5820 expected_statfs.compressed_original +=
5821 i.first->get_referenced_bytes();
5822 }
5823 if (blob.is_shared()) {
5824 if (i.first->shared_blob->get_sbid() > blobid_max) {
5825 derr << __func__ << " error: " << oid << " blob " << blob
5826 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
5827 << blobid_max << dendl;
5828 ++errors;
5829 } else if (i.first->shared_blob->get_sbid() == 0) {
5830 derr << __func__ << " error: " << oid << " blob " << blob
5831 << " marked as shared but has uninitialized sbid"
5832 << dendl;
5833 ++errors;
5834 }
5835 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
5836 sbi.sb = i.first->shared_blob;
5837 sbi.oids.push_back(oid);
5838 sbi.compressed = blob.is_compressed();
5839 for (auto e : blob.get_extents()) {
5840 if (e.is_valid()) {
5841 sbi.ref_map.get(e.offset, e.length);
5842 }
5843 }
5844 } else {
5845 errors += _fsck_check_extents(oid, blob.get_extents(),
5846 blob.is_compressed(),
5847 used_blocks,
5848 expected_statfs);
5849 }
5850 }
5851 if (deep) {
5852 bufferlist bl;
5853 int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
5854 if (r < 0) {
5855 ++errors;
5856 derr << __func__ << " error: " << oid << " error during read: "
5857 << cpp_strerror(r) << dendl;
5858 }
5859 }
5860 // omap
5861 if (o->onode.has_omap()) {
5862 if (used_omap_head.count(o->onode.nid)) {
5863 derr << __func__ << " error: " << oid << " omap_head " << o->onode.nid
5864 << " already in use" << dendl;
5865 ++errors;
5866 } else {
5867 used_omap_head.insert(o->onode.nid);
5868 }
5869 }
5870 }
5871 }
5872 dout(1) << __func__ << " checking shared_blobs" << dendl;
5873 it = db->get_iterator(PREFIX_SHARED_BLOB);
5874 if (it) {
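// every key in the shared-blob namespace must match an sb_info entry
// built during the object walk, and its on-disk ref_map must equal the
// refs accumulated from the referencing blobs; matched entries are
// erased so anything left in sb_info afterwards has a missing key.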
5875 for (it->lower_bound(string()); it->valid(); it->next()) {
5876 string key = it->key();
5877 uint64_t sbid;
5878 if (get_key_shared_blob(key, &sbid)) {
5879 derr << __func__ << " error: bad key '" << key
5880 << "' in shared blob namespace" << dendl;
5881 ++errors;
5882 continue;
5883 }
5884 auto p = sb_info.find(sbid);
5885 if (p == sb_info.end()) {
5886 derr << __func__ << " error: found stray shared blob data for sbid 0x"
5887 << std::hex << sbid << std::dec << dendl;
5888 ++errors;
5889 } else {
5890 ++num_shared_blobs;
5891 sb_info_t& sbi = p->second;
5892 bluestore_shared_blob_t shared_blob(sbid);
5893 bufferlist bl = it->value();
5894 bufferlist::iterator blp = bl.begin();
5895 ::decode(shared_blob, blp);
5896 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
5897 if (shared_blob.ref_map != sbi.ref_map) {
5898 derr << __func__ << " error: shared blob 0x" << std::hex << sbid
5899 << std::dec << " ref_map " << shared_blob.ref_map
5900 << " != expected " << sbi.ref_map << dendl;
5901 ++errors;
5902 }
5903 PExtentVector extents;
5904 for (auto &r : shared_blob.ref_map.ref_map) {
5905 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
5906 }
5907 errors += _fsck_check_extents(p->second.oids.front(),
5908 extents,
5909 p->second.compressed,
5910 used_blocks, expected_statfs);
5911 sb_info.erase(p);
5912 }
5913 }
5914 }
5915 for (auto &p : sb_info) {
5916 derr << __func__ << " error: shared_blob 0x" << p.first
5917 << " key is missing (" << *p.second.sb << ")" << dendl;
5918 ++errors;
5919 }
5920 if (!(actual_statfs == expected_statfs)) {
5921 derr << __func__ << " error: actual " << actual_statfs
5922 << " != expected " << expected_statfs << dendl;
5923 ++errors;
5924 }
5925
5926 dout(1) << __func__ << " checking for stray omap data" << dendl;
5927 it = db->get_iterator(PREFIX_OMAP);
5928 if (it) {
5929 for (it->lower_bound(string()); it->valid(); it->next()) {
5930 uint64_t omap_head;
5931 _key_decode_u64(it->key().c_str(), &omap_head);
5932 if (used_omap_head.count(omap_head) == 0) {
5933 derr << __func__ << " error: found stray omap data on omap_head "
5934 << omap_head << dendl;
5935 ++errors;
5936 }
5937 }
5938 }
5939
5940 dout(1) << __func__ << " checking deferred events" << dendl;
5941 it = db->get_iterator(PREFIX_DEFERRED);
5942 if (it) {
5943 for (it->lower_bound(string()); it->valid(); it->next()) {
5944 bufferlist bl = it->value();
5945 bufferlist::iterator p = bl.begin();
5946 bluestore_deferred_transaction_t wt;
5947 try {
5948 ::decode(wt, p);
5949 } catch (buffer::error& e) {
5950 derr << __func__ << " error: failed to decode deferred txn "
5951 << pretty_binary_string(it->key()) << dendl;
5952 r = -EIO;
5953 goto out_scan;
5954 }
5955 dout(20) << __func__ << " deferred " << wt.seq
5956 << " ops " << wt.ops.size()
5957 << " released 0x" << std::hex << wt.released << std::dec << dendl;
5958 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
5959 apply(
5960 e.get_start(), e.get_len(), block_size, used_blocks, "deferred",
5961 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5962 bs.set(pos);
5963 }
5964 );
5965 }
5966 }
5967 }
5968
5969 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
5970 {
5971 // remove bluefs_extents from used set since the freelist doesn't
5972 // know they are allocated.
5973 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5974 apply(
5975 e.get_start(), e.get_len(), block_size, used_blocks, "bluefs_extents",
5976 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5977 bs.reset(pos);
5978 }
5979 );
5980 }
5981 fm->enumerate_reset();
5982 uint64_t offset, length;
5983 while (fm->enumerate_next(&offset, &length)) {
5984 bool intersects = false;
5985 apply(
5986 offset, length, block_size, used_blocks, "free",
5987 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
5988 if (bs.test(pos)) {
5989 intersects = true;
5990 } else {
5991 bs.set(pos);
5992 }
5993 }
5994 );
5995 if (intersects) {
5996 derr << __func__ << " error: free extent 0x" << std::hex << offset
5997 << "~" << length << std::dec
5998 << " intersects allocated blocks" << dendl;
5999 ++errors;
6000 }
6001 }
6002 fm->enumerate_reset();
6003 size_t count = used_blocks.count();
6004 if (used_blocks.size() == count + 1) {
6005 // this is due to http://tracker.ceph.com/issues/21089
6006 bufferlist fm_bpb_bl, fm_blocks_bl, fm_bpk_bl;
6007 db->get(PREFIX_ALLOC, "bytes_per_block", &fm_bpb_bl);
6008 db->get(PREFIX_ALLOC, "blocks", &fm_blocks_bl);
6009 db->get(PREFIX_ALLOC, "blocks_per_key", &fm_bpk_bl);
6010 uint64_t fm_blocks = 0;
6011 uint64_t fm_bsize = 1;
6012 uint64_t fm_blocks_per_key = 1;
6013 try {
6014 auto p = fm_blocks_bl.begin();
6015 ::decode(fm_blocks, p);
6016 auto q = fm_bpb_bl.begin();
6017 ::decode(fm_bsize, q);
6018 auto r = fm_bpk_bl.begin();
6019 ::decode(fm_blocks_per_key, r);
6020 } catch (buffer::error& e) {
6021 }
6022 uint64_t dev_bsize = bdev->get_block_size();
6023 uint64_t bad_size = bdev->get_size() & ~fm_bsize;
6024 if (used_blocks.test(bad_size / dev_bsize) == 0) {
6025 // this is the last block of the device that we previously
6026 // (incorrectly) truncated off of the effective device size. this
6027 // prevented BitmapFreelistManager from marking it as used along with
6028 // the other "past-eof" blocks in the last key slot. mark it used
6029 // now.
6030 derr << __func__ << " warning: fixing leaked block 0x" << std::hex
6031 << bad_size << "~" << fm_bsize << std::dec << " due to old bug"
6032 << dendl;
6033 KeyValueDB::Transaction t = db->get_transaction();
6034 // fix freelistmanager metadata (the internal 'blocks' count is
6035 // rounded up to include the trailing key, past eof)
6036 uint64_t new_blocks = bdev->get_size() / fm_bsize;
6037 if (new_blocks / fm_blocks_per_key * fm_blocks_per_key != new_blocks) {
6038 new_blocks = (new_blocks / fm_blocks_per_key + 1) *
6039 fm_blocks_per_key;
6040 }
6041 if (new_blocks != fm_blocks) {
6042 // the fm block count increased
6043 derr << __func__ << " freelist block and key count changed, fixing 0x"
6044 << std::hex << bdev->get_size() << "~"
6045 << ((new_blocks * fm_bsize) - bdev->get_size()) << std::dec
6046 << dendl;
6047 bufferlist bl;
6048 ::encode(new_blocks, bl);
6049 t->set(PREFIX_ALLOC, "blocks", bl);
6050 fm->allocate(bdev->get_size(),
6051 (new_blocks * fm_bsize) - bdev->get_size(),
6052 t);
6053 } else {
6054 // block count is the same, but size changed; fix just the size
6055 derr << __func__ << " fixing just the stray block at 0x"
6056 << std::hex << bad_size << "~" << fm_bsize << std::dec << dendl;
6057 fm->allocate(bad_size, fm_bsize, t);
6058 }
6059 bufferlist sizebl;
6060 ::encode(bdev->get_size(), sizebl);
6061 t->set(PREFIX_ALLOC, "size", sizebl);
6062 int r = db->submit_transaction_sync(t);
6063 assert(r == 0);
6064
6065 used_blocks.set(bad_size / dev_bsize);
6066 ++count;
6067 }
6068 }
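// any remaining mismatch means some blocks are marked used but are not
// accounted for by objects, bluefs, deferred events or the freelist:
// invert the bitmap and report each contiguous run as a leaked extent.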
6069 if (used_blocks.size() != count) {
6070 assert(used_blocks.size() > count);
6071 ++errors;
6072 used_blocks.flip();
6073 size_t start = used_blocks.find_first();
6074 while (start != decltype(used_blocks)::npos) {
6075 size_t cur = start;
6076 while (true) {
6077 size_t next = used_blocks.find_next(cur);
6078 if (next != cur + 1) {
6079 derr << __func__ << " error: leaked extent 0x" << std::hex
6080 << ((uint64_t)start * block_size) << "~"
6081 << ((cur + 1 - start) * block_size) << std::dec
6082 << dendl;
6083 start = next;
6084 break;
6085 }
6086 cur = next;
6087 }
6088 }
6089 used_blocks.flip();
6090 }
6091 }
6092
6093 out_scan:
6094 mempool_thread.shutdown();
6095 _flush_cache();
6096 out_alloc:
6097 _close_alloc();
6098 out_fm:
6099 _close_fm();
6100 out_db:
6101 it.reset(); // before db is closed
6102 _close_db();
6103 out_bdev:
6104 _close_bdev();
6105 out_fsid:
6106 _close_fsid();
6107 out_path:
6108 _close_path();
6109
6110 // fatal errors take precedence
6111 if (r < 0)
6112 return r;
6113
6114 dout(2) << __func__ << " " << num_objects << " objects, "
6115 << num_sharded_objects << " of them sharded. "
6116 << dendl;
6117 dout(2) << __func__ << " " << num_extents << " extents to "
6118 << num_blobs << " blobs, "
6119 << num_spanning_blobs << " spanning, "
6120 << num_shared_blobs << " shared."
6121 << dendl;
6122
6123 utime_t duration = ceph_clock_now() - start;
6124 dout(1) << __func__ << " finish with " << errors << " errors in "
6125 << duration << " seconds" << dendl;
6126 return errors;
6127 }
6128
6129 void BlueStore::collect_metadata(map<string,string> *pm)
6130 {
6131 dout(10) << __func__ << dendl;
6132 bdev->collect_metadata("bluestore_bdev_", pm);
6133 if (bluefs) {
6134 (*pm)["bluefs"] = "1";
6135 (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
6136 bluefs->collect_metadata(pm);
6137 } else {
6138 (*pm)["bluefs"] = "0";
6139 }
6140 }
6141
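// report total/available space from the block device and allocator,
// folding in BlueFS free space on the shared device, plus the cached
// volatile statfs counters (stored/allocated/compressed).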
6142 int BlueStore::statfs(struct store_statfs_t *buf)
6143 {
6144 buf->reset();
6145 buf->total = bdev->get_size();
6146 buf->available = alloc->get_free();
6147
6148 if (bluefs) {
6149 // part of our shared device is "free" according to BlueFS
6150 // Don't include bluestore_bluefs_min because that space can't
6151 // be used for any other purpose.
6152 buf->available += bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;
6153
6154 // include dedicated db, too, if that isn't the shared device.
6155 if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
6156 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
6157 }
6158 }
6159
6160 {
6161 std::lock_guard<std::mutex> l(vstatfs_lock);
6162
6163 buf->allocated = vstatfs.allocated();
6164 buf->stored = vstatfs.stored();
6165 buf->compressed = vstatfs.compressed();
6166 buf->compressed_original = vstatfs.compressed_original();
6167 buf->compressed_allocated = vstatfs.compressed_allocated();
6168 }
6169
6170 dout(20) << __func__ << " " << *buf << dendl;
6171 return 0;
6172 }
6173
6174 // ---------------
6175 // cache
6176
6177 BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
6178 {
6179 RWLock::RLocker l(coll_lock);
6180 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
6181 if (cp == coll_map.end())
6182 return CollectionRef();
6183 return cp->second;
6184 }
6185
6186 void BlueStore::_queue_reap_collection(CollectionRef& c)
6187 {
6188 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6189 std::lock_guard<std::mutex> l(reap_lock);
6190 removed_collections.push_back(c);
6191 }
6192
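// process collections queued by _queue_reap_collection; a collection's
// onode cache is only cleared once none of its onodes still has flushes
// in flight.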
6193 void BlueStore::_reap_collections()
6194 {
6195 list<CollectionRef> removed_colls;
6196 {
6197 std::lock_guard<std::mutex> l(reap_lock);
6198 removed_colls.swap(removed_collections);
6199 }
6200
6201 bool all_reaped = true;
6202
6203 for (list<CollectionRef>::iterator p = removed_colls.begin();
6204 p != removed_colls.end();
6205 ++p) {
6206 CollectionRef c = *p;
6207 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
6208 if (c->onode_map.map_any([&](OnodeRef o) {
6209 assert(!o->exists);
6210 if (o->flushing_count.load()) {
6211 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
6212 << " flush_txns " << o->flushing_count << dendl;
6213 return false;
6214 }
6215 return true;
6216 })) {
6217 all_reaped = false;
6218 continue;
6219 }
6220 c->onode_map.clear();
6221 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
6222 }
6223
6224 if (all_reaped) {
6225 dout(10) << __func__ << " all reaped" << dendl;
6226 }
6227 }
6228
6229 void BlueStore::_update_cache_logger()
6230 {
6231 uint64_t num_onodes = 0;
6232 uint64_t num_extents = 0;
6233 uint64_t num_blobs = 0;
6234 uint64_t num_buffers = 0;
6235 uint64_t num_buffer_bytes = 0;
6236 for (auto c : cache_shards) {
6237 c->add_stats(&num_onodes, &num_extents, &num_blobs,
6238 &num_buffers, &num_buffer_bytes);
6239 }
6240 logger->set(l_bluestore_onodes, num_onodes);
6241 logger->set(l_bluestore_extents, num_extents);
6242 logger->set(l_bluestore_blobs, num_blobs);
6243 logger->set(l_bluestore_buffers, num_buffers);
6244 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
6245 }
6246
6247 // ---------------
6248 // read operations
6249
6250 ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
6251 {
6252 return _get_collection(cid);
6253 }
6254
6255 bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
6256 {
6257 CollectionHandle c = _get_collection(cid);
6258 if (!c)
6259 return false;
6260 return exists(c, oid);
6261 }
6262
6263 bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
6264 {
6265 Collection *c = static_cast<Collection *>(c_.get());
6266 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
6267 if (!c->exists)
6268 return false;
6269
6270 bool r = true;
6271
6272 {
6273 RWLock::RLocker l(c->lock);
6274 OnodeRef o = c->get_onode(oid, false);
6275 if (!o || !o->exists)
6276 r = false;
6277 }
6278
6279 return r;
6280 }
6281
6282 int BlueStore::stat(
6283 const coll_t& cid,
6284 const ghobject_t& oid,
6285 struct stat *st,
6286 bool allow_eio)
6287 {
6288 CollectionHandle c = _get_collection(cid);
6289 if (!c)
6290 return -ENOENT;
6291 return stat(c, oid, st, allow_eio);
6292 }
6293
6294 int BlueStore::stat(
6295 CollectionHandle &c_,
6296 const ghobject_t& oid,
6297 struct stat *st,
6298 bool allow_eio)
6299 {
6300 Collection *c = static_cast<Collection *>(c_.get());
6301 if (!c->exists)
6302 return -ENOENT;
6303 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
6304
6305 {
6306 RWLock::RLocker l(c->lock);
6307 OnodeRef o = c->get_onode(oid, false);
6308 if (!o || !o->exists)
6309 return -ENOENT;
6310 st->st_size = o->onode.size;
6311 st->st_blksize = 4096;
6312 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
6313 st->st_nlink = 1;
6314 }
6315
6316 int r = 0;
6317 if (_debug_mdata_eio(oid)) {
6318 r = -EIO;
6319 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6320 }
6321 return r;
6322 }
6323 int BlueStore::set_collection_opts(
6324 const coll_t& cid,
6325 const pool_opts_t& opts)
6326 {
6327 CollectionHandle ch = _get_collection(cid);
6328 if (!ch)
6329 return -ENOENT;
6330 Collection *c = static_cast<Collection *>(ch.get());
6331 dout(15) << __func__ << " " << cid << " options " << opts << dendl;
6332 if (!c->exists)
6333 return -ENOENT;
6334 RWLock::WLocker l(c->lock);
6335 c->pool_opts = opts;
6336 return 0;
6337 }
6338
6339 int BlueStore::read(
6340 const coll_t& cid,
6341 const ghobject_t& oid,
6342 uint64_t offset,
6343 size_t length,
6344 bufferlist& bl,
6345 uint32_t op_flags)
6346 {
6347 CollectionHandle c = _get_collection(cid);
6348 if (!c)
6349 return -ENOENT;
6350 return read(c, oid, offset, length, bl, op_flags);
6351 }
6352
6353 int BlueStore::read(
6354 CollectionHandle &c_,
6355 const ghobject_t& oid,
6356 uint64_t offset,
6357 size_t length,
6358 bufferlist& bl,
6359 uint32_t op_flags)
6360 {
6361 utime_t start = ceph_clock_now();
6362 Collection *c = static_cast<Collection *>(c_.get());
6363 const coll_t &cid = c->get_cid();
6364 dout(15) << __func__ << " " << cid << " " << oid
6365 << " 0x" << std::hex << offset << "~" << length << std::dec
6366 << dendl;
6367 if (!c->exists)
6368 return -ENOENT;
6369
6370 bl.clear();
6371 int r;
6372 {
6373 RWLock::RLocker l(c->lock);
6374 utime_t start1 = ceph_clock_now();
6375 OnodeRef o = c->get_onode(oid, false);
6376 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
6377 if (!o || !o->exists) {
6378 r = -ENOENT;
6379 goto out;
6380 }
6381
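// a read with offset == 0 and length == 0 is interpreted as "read the
// whole object": extend the length to the object size.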
6382 if (offset == length && offset == 0)
6383 length = o->onode.size;
6384
6385 r = _do_read(c, o, offset, length, bl, op_flags);
6386 }
6387
6388 out:
6389 if (r == 0 && _debug_data_eio(oid)) {
6390 r = -EIO;
6391 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6392 } else if (cct->_conf->bluestore_debug_random_read_err &&
6393 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) {
6394 dout(0) << __func__ << ": inject random EIO" << dendl;
6395 r = -EIO;
6396 }
6397 dout(10) << __func__ << " " << cid << " " << oid
6398 << " 0x" << std::hex << offset << "~" << length << std::dec
6399 << " = " << r << dendl;
6400 logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
6401 return r;
6402 }
6403
6404 // --------------------------------------------------------
6405 // intermediate data structures used while reading
6406 struct region_t {
6407 uint64_t logical_offset;
6408 uint64_t blob_xoffset; // region offset within the blob
6409 uint64_t length;
6410 bufferlist bl;
6411
6412 // used later in read process
6413 uint64_t front = 0;
6414 uint64_t r_off = 0;
6415
6416 region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
6417 : logical_offset(offset),
6418 blob_xoffset(b_offs),
6419 length(len){}
6420 region_t(const region_t& from)
6421 : logical_offset(from.logical_offset),
6422 blob_xoffset(from.blob_xoffset),
6423 length(from.length){}
6424
6425 friend ostream& operator<<(ostream& out, const region_t& r) {
6426 return out << "0x" << std::hex << r.logical_offset << ":"
6427 << r.blob_xoffset << "~" << r.length << std::dec;
6428 }
6429 };
6430
6431 typedef list<region_t> regions2read_t;
6432 typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
6433
6434 int BlueStore::_do_read(
6435 Collection *c,
6436 OnodeRef o,
6437 uint64_t offset,
6438 size_t length,
6439 bufferlist& bl,
6440 uint32_t op_flags)
6441 {
6442 FUNCTRACE();
6443 int r = 0;
6444
6445 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6446 << " size 0x" << o->onode.size << " (" << std::dec
6447 << o->onode.size << ")" << dendl;
6448 bl.clear();
6449
6450 if (offset >= o->onode.size) {
6451 return r;
6452 }
6453
6454 // generally, don't buffer anything, unless the client explicitly requests
6455 // it.
6456 bool buffered = false;
6457 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
6458 dout(20) << __func__ << " will do buffered read" << dendl;
6459 buffered = true;
6460 } else if (cct->_conf->bluestore_default_buffered_read &&
6461 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
6462 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
6463 dout(20) << __func__ << " defaulting to buffered read" << dendl;
6464 buffered = true;
6465 }
6466
6467 if (offset + length > o->onode.size) {
6468 length = o->onode.size - offset;
6469 }
6470
6471 utime_t start = ceph_clock_now();
6472 o->extent_map.fault_range(db, offset, length);
6473 logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
6474 _dump_onode(o);
6475
6476 ready_regions_t ready_regions;
6477
6478 // build a blob-wise list of the stuff to read (that isn't cached)
6479 blobs2read_t blobs2read;
6480 unsigned left = length;
6481 uint64_t pos = offset;
6482 unsigned num_regions = 0;
6483 auto lp = o->extent_map.seek_lextent(offset);
6484 while (left > 0 && lp != o->extent_map.extent_map.end()) {
6485 if (pos < lp->logical_offset) {
6486 unsigned hole = lp->logical_offset - pos;
6487 if (hole >= left) {
6488 break;
6489 }
6490 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
6491 << std::dec << dendl;
6492 pos += hole;
6493 left -= hole;
6494 }
6495 BlobRef bptr = lp->blob;
6496 unsigned l_off = pos - lp->logical_offset;
6497 unsigned b_off = l_off + lp->blob_offset;
6498 unsigned b_len = std::min(left, lp->length - l_off);
6499
6500 ready_regions_t cache_res;
6501 interval_set<uint32_t> cache_interval;
6502 bptr->shared_blob->bc.read(
6503 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
6504 dout(20) << __func__ << " blob " << *bptr << std::hex
6505 << " need 0x" << b_off << "~" << b_len
6506 << " cache has 0x" << cache_interval
6507 << std::dec << dendl;
6508
6509 auto pc = cache_res.begin();
6510 while (b_len > 0) {
6511 unsigned l;
6512 if (pc != cache_res.end() &&
6513 pc->first == b_off) {
6514 l = pc->second.length();
6515 ready_regions[pos].claim(pc->second);
6516 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
6517 << b_off << "~" << l << std::dec << dendl;
6518 ++pc;
6519 } else {
6520 l = b_len;
6521 if (pc != cache_res.end()) {
6522 assert(pc->first > b_off);
6523 l = pc->first - b_off;
6524 }
6525 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
6526 << b_off << "~" << l << std::dec << dendl;
6527 blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
6528 ++num_regions;
6529 }
6530 pos += l;
6531 b_off += l;
6532 left -= l;
6533 b_len -= l;
6534 }
6535 ++lp;
6536 }
6537
6538 // read raw blob data. use aio if we have >1 blobs to read.
6539 start = ceph_clock_now(); // for simplicity, time the whole
6540 // read-and-wait block below; the
6541 // measurement error is small.
6542 vector<bufferlist> compressed_blob_bls;
6543 IOContext ioc(cct, NULL);
6544 for (auto& p : blobs2read) {
6545 BlobRef bptr = p.first;
6546 dout(20) << __func__ << " blob " << *bptr << std::hex
6547 << " need " << p.second << std::dec << dendl;
6548 if (bptr->get_blob().is_compressed()) {
6549 // read the whole thing
6550 if (compressed_blob_bls.empty()) {
6551 // ensure we avoid any reallocation on subsequent blobs
6552 compressed_blob_bls.reserve(blobs2read.size());
6553 }
6554 compressed_blob_bls.push_back(bufferlist());
6555 bufferlist& bl = compressed_blob_bls.back();
6556 r = bptr->get_blob().map(
6557 0, bptr->get_blob().get_ondisk_length(),
6558 [&](uint64_t offset, uint64_t length) {
6559 int r;
6560 // use aio if there are more regions to read than those in this blob
6561 if (num_regions > p.second.size()) {
6562 r = bdev->aio_read(offset, length, &bl, &ioc);
6563 } else {
6564 r = bdev->read(offset, length, &bl, &ioc, false);
6565 }
6566 if (r < 0)
6567 return r;
6568 return 0;
6569 });
6570 assert(r == 0);
6571 } else {
6572 // read the pieces
6573 for (auto& reg : p.second) {
6574 // determine how much of the blob to read
6575 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
6576 reg.r_off = reg.blob_xoffset;
6577 uint64_t r_len = reg.length;
6578 reg.front = reg.r_off % chunk_size;
6579 if (reg.front) {
6580 reg.r_off -= reg.front;
6581 r_len += reg.front;
6582 }
6583 unsigned tail = r_len % chunk_size;
6584 if (tail) {
6585 r_len += chunk_size - tail;
6586 }
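// e.g. (illustrative numbers only): with a 0x1000 chunk size, a region
// at blob offset 0x1200, length 0x800 becomes a read of 0x1000~0x1000;
// front = 0x200 is trimmed off again when the result is assembled below.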
6587 dout(20) << __func__ << " region 0x" << std::hex
6588 << reg.logical_offset
6589 << ": 0x" << reg.blob_xoffset << "~" << reg.length
6590 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
6591 << dendl;
6592
6593 // read it
6594 r = bptr->get_blob().map(
6595 reg.r_off, r_len,
6596 [&](uint64_t offset, uint64_t length) {
6597 int r;
6598 // use aio if there is more than one region to read
6599 if (num_regions > 1) {
6600 r = bdev->aio_read(offset, length, &reg.bl, &ioc);
6601 } else {
6602 r = bdev->read(offset, length, &reg.bl, &ioc, false);
6603 }
6604 if (r < 0)
6605 return r;
6606 return 0;
6607 });
6608 assert(r == 0);
6609 assert(reg.bl.length() == r_len);
6610 }
6611 }
6612 }
6613 if (ioc.has_pending_aios()) {
6614 bdev->aio_submit(&ioc);
6615 dout(20) << __func__ << " waiting for aio" << dendl;
6616 ioc.aio_wait();
6617 }
6618 logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
6619
6620 // enumerate and decompress desired blobs
6621 auto p = compressed_blob_bls.begin();
6622 blobs2read_t::iterator b2r_it = blobs2read.begin();
6623 while (b2r_it != blobs2read.end()) {
6624 BlobRef bptr = b2r_it->first;
6625 dout(20) << __func__ << " blob " << *bptr << std::hex
6626 << " need 0x" << b2r_it->second << std::dec << dendl;
6627 if (bptr->get_blob().is_compressed()) {
6628 assert(p != compressed_blob_bls.end());
6629 bufferlist& compressed_bl = *p++;
6630 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
6631 b2r_it->second.front().logical_offset) < 0) {
6632 return -EIO;
6633 }
6634 bufferlist raw_bl;
6635 r = _decompress(compressed_bl, &raw_bl);
6636 if (r < 0)
6637 return r;
6638 if (buffered) {
6639 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
6640 raw_bl);
6641 }
6642 for (auto& i : b2r_it->second) {
6643 ready_regions[i.logical_offset].substr_of(
6644 raw_bl, i.blob_xoffset, i.length);
6645 }
6646 } else {
6647 for (auto& reg : b2r_it->second) {
6648 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
6649 reg.logical_offset) < 0) {
6650 return -EIO;
6651 }
6652 if (buffered) {
6653 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
6654 reg.r_off, reg.bl);
6655 }
6656
6657 // prune and keep result
6658 ready_regions[reg.logical_offset].substr_of(
6659 reg.bl, reg.front, reg.length);
6660 }
6661 }
6662 ++b2r_it;
6663 }
6664
6665 // generate a resulting buffer
6666 auto pr = ready_regions.begin();
6667 auto pr_end = ready_regions.end();
6668 pos = 0;
6669 while (pos < length) {
6670 if (pr != pr_end && pr->first == pos + offset) {
6671 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6672 << ": data from 0x" << pr->first << "~" << pr->second.length()
6673 << std::dec << dendl;
6674 pos += pr->second.length();
6675 bl.claim_append(pr->second);
6676 ++pr;
6677 } else {
6678 uint64_t l = length - pos;
6679 if (pr != pr_end) {
6680 assert(pr->first > pos + offset);
6681 l = pr->first - (pos + offset);
6682 }
6683 dout(30) << __func__ << " assemble 0x" << std::hex << pos
6684 << ": zeros for 0x" << (pos + offset) << "~" << l
6685 << std::dec << dendl;
6686 bl.append_zero(l);
6687 pos += l;
6688 }
6689 }
6690 assert(bl.length() == length);
6691 assert(pos == length);
6692 assert(pr == pr_end);
6693 r = bl.length();
6694 return r;
6695 }
6696
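// verify the checksums covering [blob_xoffset, blob_xoffset + bl.length())
// of the given blob. On a mismatch (r == -1) log the bad chunk, its
// device location and the logical extent it maps to.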
6697 int BlueStore::_verify_csum(OnodeRef& o,
6698 const bluestore_blob_t* blob, uint64_t blob_xoffset,
6699 const bufferlist& bl,
6700 uint64_t logical_offset) const
6701 {
6702 int bad;
6703 uint64_t bad_csum;
6704 utime_t start = ceph_clock_now();
6705 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
6706 if (r < 0) {
6707 if (r == -1) {
6708 PExtentVector pex;
6709 blob->map(
6710 bad,
6711 blob->get_csum_chunk_size(),
6712 [&](uint64_t offset, uint64_t length) {
6713 pex.emplace_back(bluestore_pextent_t(offset, length));
6714 return 0;
6715 });
6716 derr << __func__ << " bad "
6717 << Checksummer::get_csum_type_string(blob->csum_type)
6718 << "/0x" << std::hex << blob->get_csum_chunk_size()
6719 << " checksum at blob offset 0x" << bad
6720 << ", got 0x" << bad_csum << ", expected 0x"
6721 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
6722 << ", device location " << pex
6723 << ", logical extent 0x" << std::hex
6724 << (logical_offset + bad - blob_xoffset) << "~"
6725 << blob->get_csum_chunk_size() << std::dec
6726 << ", object " << o->oid
6727 << dendl;
6728 } else {
6729 derr << __func__ << " failed with error: " << cpp_strerror(r) << dendl;
6730 }
6731 }
6732 logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
6733 return r;
6734 }
6735
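// decode the compression header from the front of 'source', find (or
// lazily load) a matching compressor plugin, and decompress the
// remaining chdr.length bytes into *result.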
6736 int BlueStore::_decompress(bufferlist& source, bufferlist* result)
6737 {
6738 int r = 0;
6739 utime_t start = ceph_clock_now();
6740 bufferlist::iterator i = source.begin();
6741 bluestore_compression_header_t chdr;
6742 ::decode(chdr, i);
6743 int alg = int(chdr.type);
6744 CompressorRef cp = compressor;
6745 if (!cp || (int)cp->get_type() != alg) {
6746 cp = Compressor::create(cct, alg);
6747 }
6748
6749 if (!cp.get()) {
6750 // if the decompressor isn't available we have to fail: we cannot
6751 // return the decompressed data.
6752 derr << __func__ << " can't load decompressor " << alg << dendl;
6753 r = -EIO;
6754 } else {
6755 r = cp->decompress(i, chdr.length, *result);
6756 if (r < 0) {
6757 derr << __func__ << " decompression failed with error code " << r << dendl;
6758 r = -EIO;
6759 }
6760 }
6761 logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
6762 return r;
6763 }
6764
6765 // this variant stores the fiemap result in an interval_set; the other
6766 // fiemap overloads use it internally
6767 int BlueStore::_fiemap(
6768 CollectionHandle &c_,
6769 const ghobject_t& oid,
6770 uint64_t offset,
6771 size_t length,
6772 interval_set<uint64_t>& destset)
6773 {
6774 Collection *c = static_cast<Collection *>(c_.get());
6775 if (!c->exists)
6776 return -ENOENT;
6777 {
6778 RWLock::RLocker l(c->lock);
6779
6780 OnodeRef o = c->get_onode(oid, false);
6781 if (!o || !o->exists) {
6782 return -ENOENT;
6783 }
6784 _dump_onode(o);
6785
6786 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6787 << " size 0x" << o->onode.size << std::dec << dendl;
6788
6789 boost::intrusive::set<Extent>::iterator ep, eend;
6790 if (offset >= o->onode.size)
6791 goto out;
6792
6793 if (offset + length > o->onode.size) {
6794 length = o->onode.size - offset;
6795 }
6796
6797 o->extent_map.fault_range(db, offset, length);
6798 eend = o->extent_map.extent_map.end();
6799 ep = o->extent_map.seek_lextent(offset);
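// walk the logical extents covering [offset, offset+length), inserting
// each mapped range into destset and skipping over holes.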
6800 while (length > 0) {
6801 dout(20) << __func__ << " offset " << offset << dendl;
6802 if (ep != eend && ep->logical_offset + ep->length <= offset) {
6803 ++ep;
6804 continue;
6805 }
6806
6807 uint64_t x_len = length;
6808 if (ep != eend && ep->logical_offset <= offset) {
6809 uint64_t x_off = offset - ep->logical_offset;
6810 x_len = MIN(x_len, ep->length - x_off);
6811 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
6812 << x_len << std::dec << " blob " << ep->blob << dendl;
6813 destset.insert(offset, x_len);
6814 length -= x_len;
6815 offset += x_len;
6816 if (x_off + x_len == ep->length)
6817 ++ep;
6818 continue;
6819 }
6820 if (ep != eend &&
6821 ep->logical_offset > offset &&
6822 ep->logical_offset - offset < x_len) {
6823 x_len = ep->logical_offset - offset;
6824 }
6825 offset += x_len;
6826 length -= x_len;
6827 }
6828 }
6829
6830 out:
6831 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
6832 << " size = 0x(" << destset << ")" << std::dec << dendl;
6833 return 0;
6834 }
6835
6836 int BlueStore::fiemap(
6837 const coll_t& cid,
6838 const ghobject_t& oid,
6839 uint64_t offset,
6840 size_t len,
6841 bufferlist& bl)
6842 {
6843 CollectionHandle c = _get_collection(cid);
6844 if (!c)
6845 return -ENOENT;
6846 return fiemap(c, oid, offset, len, bl);
6847 }
6848
6849 int BlueStore::fiemap(
6850 CollectionHandle &c_,
6851 const ghobject_t& oid,
6852 uint64_t offset,
6853 size_t length,
6854 bufferlist& bl)
6855 {
6856 interval_set<uint64_t> m;
6857 int r = _fiemap(c_, oid, offset, length, m);
6858 if (r >= 0) {
6859 ::encode(m, bl);
6860 }
6861 return r;
6862 }
6863
6864 int BlueStore::fiemap(
6865 const coll_t& cid,
6866 const ghobject_t& oid,
6867 uint64_t offset,
6868 size_t len,
6869 map<uint64_t, uint64_t>& destmap)
6870 {
6871 CollectionHandle c = _get_collection(cid);
6872 if (!c)
6873 return -ENOENT;
6874 return fiemap(c, oid, offset, len, destmap);
6875 }
6876
6877 int BlueStore::fiemap(
6878 CollectionHandle &c_,
6879 const ghobject_t& oid,
6880 uint64_t offset,
6881 size_t length,
6882 map<uint64_t, uint64_t>& destmap)
6883 {
6884 interval_set<uint64_t> m;
6885 int r = _fiemap(c_, oid, offset, length, m);
6886 if (r >= 0) {
6887 m.move_into(destmap);
6888 }
6889 return r;
6890 }
6891
6892 int BlueStore::getattr(
6893 const coll_t& cid,
6894 const ghobject_t& oid,
6895 const char *name,
6896 bufferptr& value)
6897 {
6898 CollectionHandle c = _get_collection(cid);
6899 if (!c)
6900 return -ENOENT;
6901 return getattr(c, oid, name, value);
6902 }
6903
6904 int BlueStore::getattr(
6905 CollectionHandle &c_,
6906 const ghobject_t& oid,
6907 const char *name,
6908 bufferptr& value)
6909 {
6910 Collection *c = static_cast<Collection *>(c_.get());
6911 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
6912 if (!c->exists)
6913 return -ENOENT;
6914
6915 int r;
6916 {
6917 RWLock::RLocker l(c->lock);
6918 mempool::bluestore_cache_other::string k(name);
6919
6920 OnodeRef o = c->get_onode(oid, false);
6921 if (!o || !o->exists) {
6922 r = -ENOENT;
6923 goto out;
6924 }
6925
6926 if (!o->onode.attrs.count(k)) {
6927 r = -ENODATA;
6928 goto out;
6929 }
6930 value = o->onode.attrs[k];
6931 r = 0;
6932 }
6933 out:
6934 if (r == 0 && _debug_mdata_eio(oid)) {
6935 r = -EIO;
6936 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6937 }
6938 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
6939 << " = " << r << dendl;
6940 return r;
6941 }
6942
6943
6944 int BlueStore::getattrs(
6945 const coll_t& cid,
6946 const ghobject_t& oid,
6947 map<string,bufferptr>& aset)
6948 {
6949 CollectionHandle c = _get_collection(cid);
6950 if (!c)
6951 return -ENOENT;
6952 return getattrs(c, oid, aset);
6953 }
6954
6955 int BlueStore::getattrs(
6956 CollectionHandle &c_,
6957 const ghobject_t& oid,
6958 map<string,bufferptr>& aset)
6959 {
6960 Collection *c = static_cast<Collection *>(c_.get());
6961 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
6962 if (!c->exists)
6963 return -ENOENT;
6964
6965 int r;
6966 {
6967 RWLock::RLocker l(c->lock);
6968
6969 OnodeRef o = c->get_onode(oid, false);
6970 if (!o || !o->exists) {
6971 r = -ENOENT;
6972 goto out;
6973 }
6974 for (auto& i : o->onode.attrs) {
6975 aset.emplace(i.first.c_str(), i.second);
6976 }
6977 r = 0;
6978 }
6979
6980 out:
6981 if (r == 0 && _debug_mdata_eio(oid)) {
6982 r = -EIO;
6983 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
6984 }
6985 dout(10) << __func__ << " " << c->cid << " " << oid
6986 << " = " << r << dendl;
6987 return r;
6988 }
6989
6990 int BlueStore::list_collections(vector<coll_t>& ls)
6991 {
6992 RWLock::RLocker l(coll_lock);
6993 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
6994 p != coll_map.end();
6995 ++p)
6996 ls.push_back(p->first);
6997 return 0;
6998 }
6999
7000 bool BlueStore::collection_exists(const coll_t& c)
7001 {
7002 RWLock::RLocker l(coll_lock);
7003 return coll_map.count(c);
7004 }
7005
7006 int BlueStore::collection_empty(const coll_t& cid, bool *empty)
7007 {
7008 dout(15) << __func__ << " " << cid << dendl;
7009 vector<ghobject_t> ls;
7010 ghobject_t next;
7011 int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
7012 &ls, &next);
7013 if (r < 0) {
7014 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
7015 << dendl;
7016 return r;
7017 }
7018 *empty = ls.empty();
7019 dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
7020 return 0;
7021 }
7022
7023 int BlueStore::collection_bits(const coll_t& cid)
7024 {
7025 dout(15) << __func__ << " " << cid << dendl;
7026 CollectionRef c = _get_collection(cid);
7027 if (!c)
7028 return -ENOENT;
7029 RWLock::RLocker l(c->lock);
7030 dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
7031 return c->cnode.bits;
7032 }
7033
7034 int BlueStore::collection_list(
7035 const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
7036 vector<ghobject_t> *ls, ghobject_t *pnext)
7037 {
7038 CollectionHandle c = _get_collection(cid);
7039 if (!c)
7040 return -ENOENT;
7041 return collection_list(c, start, end, max, ls, pnext);
7042 }
7043
7044 int BlueStore::collection_list(
7045 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
7046 vector<ghobject_t> *ls, ghobject_t *pnext)
7047 {
7048 Collection *c = static_cast<Collection *>(c_.get());
7049 dout(15) << __func__ << " " << c->cid
7050 << " start " << start << " end " << end << " max " << max << dendl;
7051 int r;
7052 {
7053 RWLock::RLocker l(c->lock);
7054 r = _collection_list(c, start, end, max, ls, pnext);
7055 }
7056
7057 dout(10) << __func__ << " " << c->cid
7058 << " start " << start << " end " << end << " max " << max
7059 << " = " << r << ", ls.size() = " << ls->size()
7060 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
7061 return r;
7062 }
7063
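// list objects in [start, end) for a collection. The collection owns two
// key ranges in PREFIX_OBJ (a temp namespace and a normal one); iteration
// starts in the temp range and falls through to the normal range once
// the temp keys are exhausted, unless 'end' itself is a temp object.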
7064 int BlueStore::_collection_list(
7065 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
7066 vector<ghobject_t> *ls, ghobject_t *pnext)
7067 {
7068
7069 if (!c->exists)
7070 return -ENOENT;
7071
7072 int r = 0;
7073 ghobject_t static_next;
7074 KeyValueDB::Iterator it;
7075 string temp_start_key, temp_end_key;
7076 string start_key, end_key;
7077 bool set_next = false;
7078 string pend;
7079 bool temp;
7080
7081 if (!pnext)
7082 pnext = &static_next;
7083
7084 if (start == ghobject_t::get_max() ||
7085 start.hobj.is_max()) {
7086 goto out;
7087 }
7088 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
7089 &start_key, &end_key);
7090 dout(20) << __func__
7091 << " range " << pretty_binary_string(temp_start_key)
7092 << " to " << pretty_binary_string(temp_end_key)
7093 << " and " << pretty_binary_string(start_key)
7094 << " to " << pretty_binary_string(end_key)
7095 << " start " << start << dendl;
7096 it = db->get_iterator(PREFIX_OBJ);
7097 if (start == ghobject_t() ||
7098 start.hobj == hobject_t() ||
7099 start == c->cid.get_min_hobj()) {
7100 it->upper_bound(temp_start_key);
7101 temp = true;
7102 } else {
7103 string k;
7104 get_object_key(cct, start, &k);
7105 if (start.hobj.is_temp()) {
7106 temp = true;
7107 assert(k >= temp_start_key && k < temp_end_key);
7108 } else {
7109 temp = false;
7110 assert(k >= start_key && k < end_key);
7111 }
7112 dout(20) << " start from " << pretty_binary_string(k)
7113 << " temp=" << (int)temp << dendl;
7114 it->lower_bound(k);
7115 }
7116 if (end.hobj.is_max()) {
7117 pend = temp ? temp_end_key : end_key;
7118 } else {
7119 get_object_key(cct, end, &end_key);
7120 if (end.hobj.is_temp()) {
7121 if (temp)
7122 pend = end_key;
7123 else
7124 goto out;
7125 } else {
7126 pend = temp ? temp_end_key : end_key;
7127 }
7128 }
7129 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7130 while (true) {
7131 if (!it->valid() || it->key() >= pend) {
7132 if (!it->valid())
7133 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
7134 else
7135 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
7136 << " >= " << end << dendl;
7137 if (temp) {
7138 if (end.hobj.is_temp()) {
7139 break;
7140 }
7141 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
7142 temp = false;
7143 it->upper_bound(start_key);
7144 pend = end_key;
7145 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
7146 continue;
7147 }
7148 break;
7149 }
7150 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
7151 if (is_extent_shard_key(it->key())) {
7152 it->next();
7153 continue;
7154 }
7155 ghobject_t oid;
7156 int r = get_key_object(it->key(), &oid);
7157 assert(r == 0);
7158 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
7159 if (ls->size() >= (unsigned)max) {
7160 dout(20) << __func__ << " reached max " << max << dendl;
7161 *pnext = oid;
7162 set_next = true;
7163 break;
7164 }
7165 ls->push_back(oid);
7166 it->next();
7167 }
7168 out:
7169 if (!set_next) {
7170 *pnext = ghobject_t::get_max();
7171 }
7172
7173 return r;
7174 }
7175
7176 int BlueStore::omap_get(
7177 const coll_t& cid, ///< [in] Collection containing oid
7178 const ghobject_t &oid, ///< [in] Object containing omap
7179 bufferlist *header, ///< [out] omap header
7180 map<string, bufferlist> *out ///< [out] Key to value map
7181 )
7182 {
7183 CollectionHandle c = _get_collection(cid);
7184 if (!c)
7185 return -ENOENT;
7186 return omap_get(c, oid, header, out);
7187 }
7188
7189 int BlueStore::omap_get(
7190 CollectionHandle &c_, ///< [in] Collection containing oid
7191 const ghobject_t &oid, ///< [in] Object containing omap
7192 bufferlist *header, ///< [out] omap header
7193 map<string, bufferlist> *out ///< [out] Key to value map
7194 )
7195 {
7196 Collection *c = static_cast<Collection *>(c_.get());
7197 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7198 if (!c->exists)
7199 return -ENOENT;
7200 RWLock::RLocker l(c->lock);
7201 int r = 0;
7202 OnodeRef o = c->get_onode(oid, false);
7203 if (!o || !o->exists) {
7204 r = -ENOENT;
7205 goto out;
7206 }
7207 if (!o->onode.has_omap())
7208 goto out;
7209 o->flush();
7210 {
7211 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7212 string head, tail;
7213 get_omap_header(o->onode.nid, &head);
7214 get_omap_tail(o->onode.nid, &tail);
7215 it->lower_bound(head);
7216 while (it->valid()) {
7217 if (it->key() == head) {
7218 dout(30) << __func__ << " got header" << dendl;
7219 *header = it->value();
7220 } else if (it->key() >= tail) {
7221 dout(30) << __func__ << " reached tail" << dendl;
7222 break;
7223 } else {
7224 string user_key;
7225 decode_omap_key(it->key(), &user_key);
7226 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7227 << " -> " << user_key << dendl;
7228 (*out)[user_key] = it->value();
7229 }
7230 it->next();
7231 }
7232 }
7233 out:
7234 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7235 << dendl;
7236 return r;
7237 }
7238
7239 int BlueStore::omap_get_header(
7240 const coll_t& cid, ///< [in] Collection containing oid
7241 const ghobject_t &oid, ///< [in] Object containing omap
7242 bufferlist *header, ///< [out] omap header
7243 bool allow_eio ///< [in] don't assert on eio
7244 )
7245 {
7246 CollectionHandle c = _get_collection(cid);
7247 if (!c)
7248 return -ENOENT;
7249 return omap_get_header(c, oid, header, allow_eio);
7250 }
7251
7252 int BlueStore::omap_get_header(
7253 CollectionHandle &c_, ///< [in] Collection containing oid
7254 const ghobject_t &oid, ///< [in] Object containing omap
7255 bufferlist *header, ///< [out] omap header
7256 bool allow_eio ///< [in] don't assert on eio
7257 )
7258 {
7259 Collection *c = static_cast<Collection *>(c_.get());
7260 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7261 if (!c->exists)
7262 return -ENOENT;
7263 RWLock::RLocker l(c->lock);
7264 int r = 0;
7265 OnodeRef o = c->get_onode(oid, false);
7266 if (!o || !o->exists) {
7267 r = -ENOENT;
7268 goto out;
7269 }
7270 if (!o->onode.has_omap())
7271 goto out;
7272 o->flush();
7273 {
7274 string head;
7275 get_omap_header(o->onode.nid, &head);
7276 if (db->get(PREFIX_OMAP, head, header) >= 0) {
7277 dout(30) << __func__ << " got header" << dendl;
7278 } else {
7279 dout(30) << __func__ << " no header" << dendl;
7280 }
7281 }
7282 out:
7283 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7284 << dendl;
7285 return r;
7286 }
7287
7288 int BlueStore::omap_get_keys(
7289 const coll_t& cid, ///< [in] Collection containing oid
7290 const ghobject_t &oid, ///< [in] Object containing omap
7291 set<string> *keys ///< [out] Keys defined on oid
7292 )
7293 {
7294 CollectionHandle c = _get_collection(cid);
7295 if (!c)
7296 return -ENOENT;
7297 return omap_get_keys(c, oid, keys);
7298 }
7299
7300 int BlueStore::omap_get_keys(
7301 CollectionHandle &c_, ///< [in] Collection containing oid
7302 const ghobject_t &oid, ///< [in] Object containing omap
7303 set<string> *keys ///< [out] Keys defined on oid
7304 )
7305 {
7306 Collection *c = static_cast<Collection *>(c_.get());
7307 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7308 if (!c->exists)
7309 return -ENOENT;
7310 RWLock::RLocker l(c->lock);
7311 int r = 0;
7312 OnodeRef o = c->get_onode(oid, false);
7313 if (!o || !o->exists) {
7314 r = -ENOENT;
7315 goto out;
7316 }
7317 if (!o->onode.has_omap())
7318 goto out;
7319 o->flush();
7320 {
7321 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7322 string head, tail;
7323 get_omap_key(o->onode.nid, string(), &head);
7324 get_omap_tail(o->onode.nid, &tail);
7325 it->lower_bound(head);
7326 while (it->valid()) {
7327 if (it->key() >= tail) {
7328 dout(30) << __func__ << " reached tail" << dendl;
7329 break;
7330 }
7331 string user_key;
7332 decode_omap_key(it->key(), &user_key);
7333 dout(30) << __func__ << " got " << pretty_binary_string(it->key())
7334 << " -> " << user_key << dendl;
7335 keys->insert(user_key);
7336 it->next();
7337 }
7338 }
7339 out:
7340 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7341 << dendl;
7342 return r;
7343 }
7344
7345 int BlueStore::omap_get_values(
7346 const coll_t& cid, ///< [in] Collection containing oid
7347 const ghobject_t &oid, ///< [in] Object containing omap
7348 const set<string> &keys, ///< [in] Keys to get
7349 map<string, bufferlist> *out ///< [out] Returned keys and values
7350 )
7351 {
7352 CollectionHandle c = _get_collection(cid);
7353 if (!c)
7354 return -ENOENT;
7355 return omap_get_values(c, oid, keys, out);
7356 }
7357
7358 int BlueStore::omap_get_values(
7359 CollectionHandle &c_, ///< [in] Collection containing oid
7360 const ghobject_t &oid, ///< [in] Object containing omap
7361 const set<string> &keys, ///< [in] Keys to get
7362 map<string, bufferlist> *out ///< [out] Returned keys and values
7363 )
7364 {
7365 Collection *c = static_cast<Collection *>(c_.get());
7366 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7367 if (!c->exists)
7368 return -ENOENT;
7369 RWLock::RLocker l(c->lock);
7370 int r = 0;
7371 string final_key;
7372 OnodeRef o = c->get_onode(oid, false);
7373 if (!o || !o->exists) {
7374 r = -ENOENT;
7375 goto out;
7376 }
7377 if (!o->onode.has_omap())
7378 goto out;
7379 o->flush();
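// omap keys are "<8-byte nid><'.'><user key>"; build the 9-byte prefix
// once and reuse it (final_key.resize(9) below keeps just the prefix).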
7380 _key_encode_u64(o->onode.nid, &final_key);
7381 final_key.push_back('.');
7382 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7383 final_key.resize(9); // keep prefix
7384 final_key += *p;
7385 bufferlist val;
7386 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7387 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
7388 << " -> " << *p << dendl;
7389 out->insert(make_pair(*p, val));
7390 }
7391 }
7392 out:
7393 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7394 << dendl;
7395 return r;
7396 }
7397
7398 int BlueStore::omap_check_keys(
7399 const coll_t& cid, ///< [in] Collection containing oid
7400 const ghobject_t &oid, ///< [in] Object containing omap
7401 const set<string> &keys, ///< [in] Keys to check
7402 set<string> *out ///< [out] Subset of keys defined on oid
7403 )
7404 {
7405 CollectionHandle c = _get_collection(cid);
7406 if (!c)
7407 return -ENOENT;
7408 return omap_check_keys(c, oid, keys, out);
7409 }
7410
7411 int BlueStore::omap_check_keys(
7412 CollectionHandle &c_, ///< [in] Collection containing oid
7413 const ghobject_t &oid, ///< [in] Object containing omap
7414 const set<string> &keys, ///< [in] Keys to check
7415 set<string> *out ///< [out] Subset of keys defined on oid
7416 )
7417 {
7418 Collection *c = static_cast<Collection *>(c_.get());
7419 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
7420 if (!c->exists)
7421 return -ENOENT;
7422 RWLock::RLocker l(c->lock);
7423 int r = 0;
7424 string final_key;
7425 OnodeRef o = c->get_onode(oid, false);
7426 if (!o || !o->exists) {
7427 r = -ENOENT;
7428 goto out;
7429 }
7430 if (!o->onode.has_omap())
7431 goto out;
7432 o->flush();
7433 _key_encode_u64(o->onode.nid, &final_key);
7434 final_key.push_back('.');
7435 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
7436 final_key.resize(9); // keep prefix
7437 final_key += *p;
7438 bufferlist val;
7439 if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
7440 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
7441 << " -> " << *p << dendl;
7442 out->insert(*p);
7443 } else {
7444 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
7445 << " -> " << *p << dendl;
7446 }
7447 }
7448 out:
7449 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
7450 << dendl;
7451 return r;
7452 }
7453
7454 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7455 const coll_t& cid, ///< [in] collection
7456 const ghobject_t &oid ///< [in] object
7457 )
7458 {
7459 CollectionHandle c = _get_collection(cid);
7460 if (!c) {
7461 dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
7462 return ObjectMap::ObjectMapIterator();
7463 }
7464 return get_omap_iterator(c, oid);
7465 }
7466
7467 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
7468 CollectionHandle &c_, ///< [in] collection
7469 const ghobject_t &oid ///< [in] object
7470 )
7471 {
7472 Collection *c = static_cast<Collection *>(c_.get());
7473 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
7474 if (!c->exists) {
7475 return ObjectMap::ObjectMapIterator();
7476 }
7477 RWLock::RLocker l(c->lock);
7478 OnodeRef o = c->get_onode(oid, false);
7479 if (!o || !o->exists) {
7480 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
7481 return ObjectMap::ObjectMapIterator();
7482 }
7483 o->flush();
7484 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
7485 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
7486 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
7487 }
7488
7489 // -----------------
7490 // write helpers
7491
7492 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
7493 {
7494 dout(10) << __func__ << " ondisk_format " << ondisk_format
7495 << " min_compat_ondisk_format " << min_compat_ondisk_format
7496 << dendl;
7497 assert(ondisk_format == latest_ondisk_format);
7498 {
7499 bufferlist bl;
7500 ::encode(ondisk_format, bl);
7501 t->set(PREFIX_SUPER, "ondisk_format", bl);
7502 }
7503 {
7504 bufferlist bl;
7505 ::encode(min_compat_ondisk_format, bl);
7506 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
7507 }
7508 }
7509
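// load persistent metadata from the SUPER prefix: nid/blobid high-water
// marks, freelist type, bluefs extents, on-disk format (upgrading if
// needed) and min_alloc_size, then derive the runtime alloc/csum/
// compression/blob-size settings.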
7510 int BlueStore::_open_super_meta()
7511 {
7512 // nid
7513 {
7514 nid_max = 0;
7515 bufferlist bl;
7516 db->get(PREFIX_SUPER, "nid_max", &bl);
7517 bufferlist::iterator p = bl.begin();
7518 try {
7519 uint64_t v;
7520 ::decode(v, p);
7521 nid_max = v;
7522 } catch (buffer::error& e) {
7523 derr << __func__ << " unable to read nid_max" << dendl;
7524 return -EIO;
7525 }
7526 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
7527 nid_last = nid_max.load();
7528 }
7529
7530 // blobid
7531 {
7532 blobid_max = 0;
7533 bufferlist bl;
7534 db->get(PREFIX_SUPER, "blobid_max", &bl);
7535 bufferlist::iterator p = bl.begin();
7536 try {
7537 uint64_t v;
7538 ::decode(v, p);
7539 blobid_max = v;
7540 } catch (buffer::error& e) {
7541 derr << __func__ << " unable to read blobid_max" << dendl;
7542 return -EIO;
7543 }
7544 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
7545 blobid_last = blobid_max.load();
7546 }
7547
7548 // freelist
7549 {
7550 bufferlist bl;
7551 db->get(PREFIX_SUPER, "freelist_type", &bl);
7552 if (bl.length()) {
7553 freelist_type = std::string(bl.c_str(), bl.length());
7554 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
7555 } else {
7556 assert("Not Support extent freelist manager" == 0);
7557 }
7558 }
7559
7560 // bluefs alloc
7561 if (cct->_conf->bluestore_bluefs) {
7562 bluefs_extents.clear();
7563 bufferlist bl;
7564 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
7565 bufferlist::iterator p = bl.begin();
7566 try {
7567 ::decode(bluefs_extents, p);
7568 }
7569 catch (buffer::error& e) {
7570 derr << __func__ << " unable to read bluefs_extents" << dendl;
7571 return -EIO;
7572 }
7573 dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
7574 << std::dec << dendl;
7575 }
7576
7577 // ondisk format
7578 int32_t compat_ondisk_format = 0;
7579 {
7580 bufferlist bl;
7581 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
7582 if (r < 0) {
7583 // base case: kraken bluestore is v1 and readable by v1
7584 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
7585 << dendl;
7586 ondisk_format = 1;
7587 compat_ondisk_format = 1;
7588 } else {
7589 auto p = bl.begin();
7590 try {
7591 ::decode(ondisk_format, p);
7592 } catch (buffer::error& e) {
7593 derr << __func__ << " unable to read ondisk_format" << dendl;
7594 return -EIO;
7595 }
7596 bl.clear();
7597 {
7598 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
7599 assert(!r);
7600 auto p = bl.begin();
7601 try {
7602 ::decode(compat_ondisk_format, p);
7603 } catch (buffer::error& e) {
7604 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
7605 return -EIO;
7606 }
7607 }
7608 }
7609 dout(10) << __func__ << " ondisk_format " << ondisk_format
7610 << " compat_ondisk_format " << compat_ondisk_format
7611 << dendl;
7612 }
7613
7614 if (latest_ondisk_format < compat_ondisk_format) {
7615 derr << __func__ << " compat_ondisk_format is "
7616 << compat_ondisk_format << " but we only understand version "
7617 << latest_ondisk_format << dendl;
7618 return -EPERM;
7619 }
7620 if (ondisk_format < latest_ondisk_format) {
7621 int r = _upgrade_super();
7622 if (r < 0) {
7623 return r;
7624 }
7625 }
7626
7627 {
7628 bufferlist bl;
7629 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
7630 auto p = bl.begin();
7631 try {
7632 uint64_t val;
7633 ::decode(val, p);
7634 min_alloc_size = val;
7635 min_alloc_size_order = ctz(val);
7636 assert(min_alloc_size == 1u << min_alloc_size_order);
7637 } catch (buffer::error& e) {
7638 derr << __func__ << " unable to read min_alloc_size" << dendl;
7639 return -EIO;
7640 }
7641 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7642 << std::dec << dendl;
7643 }
7644 _open_statfs();
7645 _set_alloc_sizes();
7646 _set_throttle_params();
7647
7648 _set_csum();
7649 _set_compression();
7650 _set_blob_size();
7651
7652 return 0;
7653 }
7654
7655 int BlueStore::_upgrade_super()
7656 {
7657 dout(1) << __func__ << " from " << ondisk_format << ", latest "
7658 << latest_ondisk_format << dendl;
7659 assert(ondisk_format > 0);
7660 assert(ondisk_format < latest_ondisk_format);
7661
7662 if (ondisk_format == 1) {
7663 // changes:
7664 // - super: added ondisk_format
7665 // - super: added min_readable_ondisk_format
7666 // - super: added min_compat_ondisk_format
7667 // - super: added min_alloc_size
7668 // - super: removed min_min_alloc_size
7669 KeyValueDB::Transaction t = db->get_transaction();
7670 {
7671 bufferlist bl;
7672 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
7673 auto p = bl.begin();
7674 try {
7675 uint64_t val;
7676 ::decode(val, p);
7677 min_alloc_size = val;
7678 } catch (buffer::error& e) {
7679 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
7680 return -EIO;
7681 }
7682 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7683 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7684 }
7685 ondisk_format = 2;
7686 _prepare_ondisk_format_super(t);
7687 int r = db->submit_transaction_sync(t);
7688 assert(r == 0);
7689 }
7690
7691 // done
7692 dout(1) << __func__ << " done" << dendl;
7693 return 0;
7694 }
7695
7696 void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
7697 {
7698 if (o->onode.nid) {
7699 assert(o->exists);
7700 return;
7701 }
7702 uint64_t nid = ++nid_last;
7703 dout(20) << __func__ << " " << nid << dendl;
7704 o->onode.nid = nid;
7705 txc->last_nid = nid;
7706 o->exists = true;
7707 }
7708
7709 uint64_t BlueStore::_assign_blobid(TransContext *txc)
7710 {
7711 uint64_t bid = ++blobid_last;
7712 dout(20) << __func__ << " " << bid << dendl;
7713 txc->last_blobid = bid;
7714 return bid;
7715 }
7716
7717 void BlueStore::get_db_statistics(Formatter *f)
7718 {
7719 db->get_statistics(f);
7720 }
7721
7722 BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
7723 {
7724 TransContext *txc = new TransContext(cct, osr);
7725 txc->t = db->get_transaction();
7726 osr->queue_new(txc);
7727 dout(20) << __func__ << " osr " << osr << " = " << txc
7728 << " seq " << txc->seq << dendl;
7729 return txc;
7730 }
7731
7732 void BlueStore::_txc_calc_cost(TransContext *txc)
7733 {
7734 // this is about the simplest model for transaction cost you can
7735 // imagine: there is some fixed overhead, expressed by charging a
7736 // minimum of one "io"; each "io" then has a configurable cost
7737 // (with different hdd and ssd defaults), and we add the transaction's
7738 // byte count on top.
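// Example (illustrative numbers only): a txc with two pending aios of
// 3 and 2 iovecs carrying 64KB of data gives ios = 1 + 3 + 2 = 6, so
// cost = 6 * throttle_cost_per_io + 65536.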
7739 int ios = 1; // one "io" for the kv commit
7740 for (auto& p : txc->ioc.pending_aios) {
7741 ios += p.iov.size();
7742 }
7743 auto cost = throttle_cost_per_io.load();
7744 txc->cost = ios * cost + txc->bytes;
7745 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
7746 << ios << " ios * " << cost << " + " << txc->bytes
7747 << " bytes)" << dendl;
7748 }
7749
7750 void BlueStore::_txc_update_store_statfs(TransContext *txc)
7751 {
7752 if (txc->statfs_delta.is_empty())
7753 return;
7754
7755 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
7756 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
7757 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
7758 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
7759 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
7760
7761 {
7762 std::lock_guard<std::mutex> l(vstatfs_lock);
7763 vstatfs += txc->statfs_delta;
7764 }
7765
7766 bufferlist bl;
7767 txc->statfs_delta.encode(bl);
7768
7769 txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
7770 txc->statfs_delta.reset();
7771 }
7772
7773 void BlueStore::_txc_state_proc(TransContext *txc)
7774 {
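// Drive the txc through its lifecycle:
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED ->
//   KV_DONE -> (DEFERRED_QUEUED -> DEFERRED_CLEANUP ->) FINISHING -> DONE.
// We loop so that states which fall through are handled in a single
// call; states that must wait on aio completion or the kv thread
// return here and are resumed by a later call.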
7775 while (true) {
7776 dout(10) << __func__ << " txc " << txc
7777 << " " << txc->get_state_name() << dendl;
7778 switch (txc->state) {
7779 case TransContext::STATE_PREPARE:
7780 txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
7781 if (txc->ioc.has_pending_aios()) {
7782 txc->state = TransContext::STATE_AIO_WAIT;
7783 txc->had_ios = true;
7784 _txc_aio_submit(txc);
7785 return;
7786 }
7787 // ** fall-thru **
7788
7789 case TransContext::STATE_AIO_WAIT:
7790 txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
7791 _txc_finish_io(txc); // may trigger blocked txc's too
7792 return;
7793
7794 case TransContext::STATE_IO_DONE:
7795 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
7796 if (txc->had_ios) {
7797 ++txc->osr->txc_with_unstable_io;
7798 }
7799 txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
7800 txc->state = TransContext::STATE_KV_QUEUED;
7801 if (cct->_conf->bluestore_sync_submit_transaction) {
7802 if (txc->last_nid >= nid_max ||
7803 txc->last_blobid >= blobid_max) {
7804 dout(20) << __func__
7805 << " last_{nid,blobid} exceeds max, submit via kv thread"
7806 << dendl;
7807 } else if (txc->osr->kv_committing_serially) {
7808 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
7809 << dendl;
7810 // note: this is starvation-prone. once we have a txc in a busy
7811 // sequencer that is committing serially it is possible to keep
7812 // submitting new transactions fast enough that we get stuck doing
7813 // so. the alternative is to block here... fixme?
7814 } else if (txc->osr->txc_with_unstable_io) {
7815 dout(20) << __func__ << " prior txc(s) with unstable ios "
7816 << txc->osr->txc_with_unstable_io.load() << dendl;
7817 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
7818 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
7819 == 0) {
7820 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
7821 << dendl;
7822 } else {
7823 txc->state = TransContext::STATE_KV_SUBMITTED;
7824 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
7825 assert(r == 0);
7826 _txc_applied_kv(txc);
7827 }
7828 }
7829 {
7830 std::lock_guard<std::mutex> l(kv_lock);
7831 kv_queue.push_back(txc);
7832 kv_cond.notify_one();
7833 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
7834 kv_queue_unsubmitted.push_back(txc);
7835 ++txc->osr->kv_committing_serially;
7836 }
7837 if (txc->had_ios)
7838 kv_ios++;
7839 kv_throttle_costs += txc->cost;
7840 }
7841 return;
7842 case TransContext::STATE_KV_SUBMITTED:
7843 txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
7844 txc->state = TransContext::STATE_KV_DONE;
7845 _txc_committed_kv(txc);
7846 // ** fall-thru **
7847
7848 case TransContext::STATE_KV_DONE:
7849 txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
7850 if (txc->deferred_txn) {
7851 txc->state = TransContext::STATE_DEFERRED_QUEUED;
7852 _deferred_queue(txc);
7853 return;
7854 }
7855 txc->state = TransContext::STATE_FINISHING;
7856 break;
7857
7858 case TransContext::STATE_DEFERRED_CLEANUP:
7859 txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
7860 txc->state = TransContext::STATE_FINISHING;
7861 // ** fall-thru **
7862
7863 case TransContext::STATE_FINISHING:
7864 txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
7865 _txc_finish(txc);
7866 return;
7867
7868 default:
7869 derr << __func__ << " unexpected txc " << txc
7870 << " state " << txc->get_state_name() << dendl;
7871 assert(0 == "unexpected txc state");
7872 return;
7873 }
7874 }
7875 }
7876
7877 void BlueStore::_txc_finish_io(TransContext *txc)
7878 {
7879 dout(20) << __func__ << " " << txc << dendl;
7880
7881 /*
7882 * we need to preserve the order of kv transactions,
7883 * even though aio will complete in any order.
7884 */
7885
7886 OpSequencer *osr = txc->osr.get();
7887 std::lock_guard<std::mutex> l(osr->qlock);
7888 txc->state = TransContext::STATE_IO_DONE;
7889
7890 // release aio contexts (including pinned buffers).
7891 txc->ioc.running_aios.clear();
7892
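// Scan backwards from this txc: if any earlier txc on this sequencer
// has not finished its IO yet, stay queued behind it; otherwise advance
// every consecutive IO_DONE txc, starting from the earliest, so kv
// submission preserves the sequencer's ordering.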
7893 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
7894 while (p != osr->q.begin()) {
7895 --p;
7896 if (p->state < TransContext::STATE_IO_DONE) {
7897 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
7898 << p->get_state_name() << dendl;
7899 return;
7900 }
7901 if (p->state > TransContext::STATE_IO_DONE) {
7902 ++p;
7903 break;
7904 }
7905 }
7906 do {
7907 _txc_state_proc(&*p++);
7908 } while (p != osr->q.end() &&
7909 p->state == TransContext::STATE_IO_DONE);
7910
7911 if (osr->kv_submitted_waiters &&
7912 osr->_is_all_kv_submitted()) {
7913 osr->qcond.notify_all();
7914 }
7915 }
7916
7917 void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
7918 {
7919 dout(20) << __func__ << " txc " << txc
7920 << " onodes " << txc->onodes
7921 << " shared_blobs " << txc->shared_blobs
7922 << dendl;
7923
7924 // finalize onodes
7925 for (auto o : txc->onodes) {
7926 // finalize extent_map shards
7927 o->extent_map.update(t, false);
7928 if (o->extent_map.needs_reshard()) {
7929 o->extent_map.reshard(db, t);
7930 o->extent_map.update(t, true);
7931 if (o->extent_map.needs_reshard()) {
7932 dout(20) << __func__ << " warning: still wants reshard, check options?"
7933 << dendl;
7934 o->extent_map.clear_needs_reshard();
7935 }
7936 logger->inc(l_bluestore_onode_reshard);
7937 }
7938
7939 // bound encode
7940 size_t bound = 0;
7941 denc(o->onode, bound);
7942 o->extent_map.bound_encode_spanning_blobs(bound);
7943 if (o->onode.extent_map_shards.empty()) {
7944 denc(o->extent_map.inline_bl, bound);
7945 }
7946
7947 // encode
7948 bufferlist bl;
7949 unsigned onode_part, blob_part, extent_part;
7950 {
7951 auto p = bl.get_contiguous_appender(bound, true);
7952 denc(o->onode, p);
7953 onode_part = p.get_logical_offset();
7954 o->extent_map.encode_spanning_blobs(p);
7955 blob_part = p.get_logical_offset() - onode_part;
7956 if (o->onode.extent_map_shards.empty()) {
7957 denc(o->extent_map.inline_bl, p);
7958 }
7959 extent_part = p.get_logical_offset() - onode_part - blob_part;
7960 }
7961
7962 dout(20) << " onode " << o->oid << " is " << bl.length()
7963 << " (" << onode_part << " bytes onode + "
7964 << blob_part << " bytes spanning blobs + "
7965 << extent_part << " bytes inline extents)"
7966 << dendl;
7967 t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
7968 o->flushing_count++;
7969 }
7970
7971 // objects we modified but didn't affect the onode
7972 auto p = txc->modified_objects.begin();
7973 while (p != txc->modified_objects.end()) {
7974 if (txc->onodes.count(*p) == 0) {
7975 (*p)->flushing_count++;
7976 ++p;
7977 } else {
7978 // remove dups with onodes list to avoid problems in _txc_finish
7979 p = txc->modified_objects.erase(p);
7980 }
7981 }
7982
7983 // finalize shared_blobs
7984 for (auto sb : txc->shared_blobs) {
7985 string key;
7986 auto sbid = sb->get_sbid();
7987 get_shared_blob_key(sbid, &key);
7988 if (sb->persistent->empty()) {
7989 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
7990 << " is empty" << dendl;
7991 t->rmkey(PREFIX_SHARED_BLOB, key);
7992 } else {
7993 bufferlist bl;
7994 ::encode(*(sb->persistent), bl);
7995 dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
7996 << " is " << bl.length() << " " << *sb << dendl;
7997 t->set(PREFIX_SHARED_BLOB, key, bl);
7998 }
7999 }
8000 }
8001
8002 void BlueStore::BSPerfTracker::update_from_perfcounters(
8003 PerfCounters &logger)
8004 {
8005 os_commit_latency.consume_next(
8006 logger.get_tavg_ms(
8007 l_bluestore_commit_lat));
8008 os_apply_latency.consume_next(
8009 logger.get_tavg_ms(
8010 l_bluestore_commit_lat));
8011 }
8012
8013 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
8014 {
8015 dout(20) << __func__ << " txc " << txc << std::hex
8016 << " allocated 0x" << txc->allocated
8017 << " released 0x" << txc->released
8018 << std::dec << dendl;
8019
8020 // We have to handle the case where we allocate *and* deallocate the
8021 // same region in this transaction. The freelist doesn't like that.
8022 // (Actually, the only thing that cares is the BitmapFreelistManager
8023 // debug check. But that's important.)
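// E.g. if this txc both allocates and releases [0x10000, 0x20000), that
// range is dropped from both sets before the freelist is updated below.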
8024 interval_set<uint64_t> tmp_allocated, tmp_released;
8025 interval_set<uint64_t> *pallocated = &txc->allocated;
8026 interval_set<uint64_t> *preleased = &txc->released;
8027 if (!txc->allocated.empty() && !txc->released.empty()) {
8028 interval_set<uint64_t> overlap;
8029 overlap.intersection_of(txc->allocated, txc->released);
8030 if (!overlap.empty()) {
8031 tmp_allocated = txc->allocated;
8032 tmp_allocated.subtract(overlap);
8033 tmp_released = txc->released;
8034 tmp_released.subtract(overlap);
8035 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
8036 << ", new allocated 0x" << tmp_allocated
8037 << " released 0x" << tmp_released << std::dec
8038 << dendl;
8039 pallocated = &tmp_allocated;
8040 preleased = &tmp_released;
8041 }
8042 }
8043
8044 // update freelist with non-overlap sets
8045 for (interval_set<uint64_t>::iterator p = pallocated->begin();
8046 p != pallocated->end();
8047 ++p) {
8048 fm->allocate(p.get_start(), p.get_len(), t);
8049 }
8050 for (interval_set<uint64_t>::iterator p = preleased->begin();
8051 p != preleased->end();
8052 ++p) {
8053 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
8054 << "~" << p.get_len() << std::dec << dendl;
8055 fm->release(p.get_start(), p.get_len(), t);
8056 }
8057
8058 _txc_update_store_statfs(txc);
8059 }
8060
8061 void BlueStore::_txc_applied_kv(TransContext *txc)
8062 {
8063 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
8064 for (auto& o : *ls) {
8065 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
8066 << dendl;
8067 if (--o->flushing_count == 0) {
8068 std::lock_guard<std::mutex> l(o->flush_lock);
8069 o->flush_cond.notify_all();
8070 }
8071 }
8072 }
8073 }
8074
8075 void BlueStore::_txc_committed_kv(TransContext *txc)
8076 {
8077 dout(20) << __func__ << " txc " << txc << dendl;
8078
8079 // warning: we're calling onreadable_sync inside the sequencer lock
8080 if (txc->onreadable_sync) {
8081 txc->onreadable_sync->complete(0);
8082 txc->onreadable_sync = NULL;
8083 }
8084 unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
8085 if (txc->oncommit) {
8086 logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
8087 finishers[n]->queue(txc->oncommit);
8088 txc->oncommit = NULL;
8089 }
8090 if (txc->onreadable) {
8091 finishers[n]->queue(txc->onreadable);
8092 txc->onreadable = NULL;
8093 }
8094
8095 if (!txc->oncommits.empty()) {
8096 finishers[n]->queue(txc->oncommits);
8097 }
8098 }
8099
8100 void BlueStore::_txc_finish(TransContext *txc)
8101 {
8102 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
8103 assert(txc->state == TransContext::STATE_FINISHING);
8104
8105 for (auto& sb : txc->shared_blobs_written) {
8106 sb->bc.finish_write(sb->get_cache(), txc->seq);
8107 }
8108 txc->shared_blobs_written.clear();
8109
8110 while (!txc->removed_collections.empty()) {
8111 _queue_reap_collection(txc->removed_collections.front());
8112 txc->removed_collections.pop_front();
8113 }
8114
8115 OpSequencerRef osr = txc->osr;
8116 bool empty = false;
8117 bool submit_deferred = false;
8118 OpSequencer::q_list_t releasing_txc;
8119 {
8120 std::lock_guard<std::mutex> l(osr->qlock);
8121 txc->state = TransContext::STATE_DONE;
8122 bool notify = false;
8123 while (!osr->q.empty()) {
8124 TransContext *txc = &osr->q.front();
8125 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
8126 << dendl;
8127 if (txc->state != TransContext::STATE_DONE) {
8128 if (txc->state == TransContext::STATE_PREPARE &&
8129 deferred_aggressive) {
8130 // for _osr_drain_preceding()
8131 notify = true;
8132 }
8133 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
8134 osr->q.size() > g_conf->bluestore_max_deferred_txc) {
8135 submit_deferred = true;
8136 }
8137 break;
8138 }
8139
8140 osr->q.pop_front();
8141 releasing_txc.push_back(*txc);
8142 notify = true;
8143 }
8144 if (notify) {
8145 osr->qcond.notify_all();
8146 }
8147 if (osr->q.empty()) {
8148 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
8149 empty = true;
8150 }
8151 }
8152 while (!releasing_txc.empty()) {
8153 // release to allocator only after all preceding txc's have also
8154 // finished any deferred writes that potentially land in these
8155 // blocks
8156 auto txc = &releasing_txc.front();
8157 _txc_release_alloc(txc);
8158 releasing_txc.pop_front();
8159 txc->log_state_latency(logger, l_bluestore_state_done_lat);
8160 delete txc;
8161 }
8162
8163 if (submit_deferred) {
8164 // we're pinning memory; flush! we could be more fine-grained here but
8165 // i'm not sure it's worth the bother.
8166 deferred_try_submit();
8167 }
8168
8169 if (empty && osr->zombie) {
8170 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
8171 osr->_unregister();
8172 }
8173 }
8174
8175 void BlueStore::_txc_release_alloc(TransContext *txc)
8176 {
8177 // update allocator with full released set
8178 if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
8179 dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
8180 for (interval_set<uint64_t>::iterator p = txc->released.begin();
8181 p != txc->released.end();
8182 ++p) {
8183 alloc->release(p.get_start(), p.get_len());
8184 }
8185 }
8186
8187 txc->allocated.clear();
8188 txc->released.clear();
8189 }
8190
8191 void BlueStore::_osr_drain_preceding(TransContext *txc)
8192 {
8193 OpSequencer *osr = txc->osr.get();
8194 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
8195 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
8196 {
8197 // submit anything pending
8198 deferred_lock.lock();
8199 if (osr->deferred_pending) {
8200 _deferred_submit_unlock(osr);
8201 } else {
8202 deferred_lock.unlock();
8203 }
8204 }
8205 {
8206 // wake up any previously finished deferred events
8207 std::lock_guard<std::mutex> l(kv_lock);
8208 kv_cond.notify_one();
8209 }
8210 osr->drain_preceding(txc);
8211 --deferred_aggressive;
8212 dout(10) << __func__ << " " << osr << " done" << dendl;
8213 }
8214
8215 void BlueStore::_osr_drain_all()
8216 {
8217 dout(10) << __func__ << dendl;
8218
8219 set<OpSequencerRef> s;
8220 {
8221 std::lock_guard<std::mutex> l(osr_lock);
8222 s = osr_set;
8223 }
8224 dout(20) << __func__ << " osr_set " << s << dendl;
8225
8226 ++deferred_aggressive;
8227 {
8228 // submit anything pending
8229 deferred_try_submit();
8230 }
8231 {
8232 // wake up any previously finished deferred events
8233 std::lock_guard<std::mutex> l(kv_lock);
8234 kv_cond.notify_one();
8235 }
8236 {
8237 std::lock_guard<std::mutex> l(kv_finalize_lock);
8238 kv_finalize_cond.notify_one();
8239 }
8240 for (auto osr : s) {
8241 dout(20) << __func__ << " drain " << osr << dendl;
8242 osr->drain();
8243 }
8244 --deferred_aggressive;
8245
8246 dout(10) << __func__ << " done" << dendl;
8247 }
8248
8249 void BlueStore::_osr_unregister_all()
8250 {
8251 set<OpSequencerRef> s;
8252 {
8253 std::lock_guard<std::mutex> l(osr_lock);
8254 s = osr_set;
8255 }
8256 dout(10) << __func__ << " " << s << dendl;
8257 for (auto osr : s) {
8258 osr->_unregister();
8259
8260 if (!osr->zombie) {
8261 // break link from Sequencer to us so that this OpSequencer
8262 // instance can die with this mount/umount cycle. note that
8263 // we assume umount() will not race against ~Sequencer.
8264 assert(osr->parent);
8265 osr->parent->p.reset();
8266 }
8267 }
8268 // nobody should be creating sequencers during umount either.
8269 {
8270 std::lock_guard<std::mutex> l(osr_lock);
8271 assert(osr_set.empty());
8272 }
8273 }
8274
8275 void BlueStore::_kv_start()
8276 {
8277 dout(10) << __func__ << dendl;
8278
8279 if (cct->_conf->bluestore_shard_finishers) {
8280 if (cct->_conf->osd_op_num_shards) {
8281 m_finisher_num = cct->_conf->osd_op_num_shards;
8282 } else {
8283 assert(bdev);
8284 if (bdev->is_rotational()) {
8285 m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
8286 } else {
8287 m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
8288 }
8289 }
8290 }
8291
8292 assert(m_finisher_num != 0);
8293
8294 for (int i = 0; i < m_finisher_num; ++i) {
8295 ostringstream oss;
8296 oss << "finisher-" << i;
8297 Finisher *f = new Finisher(cct, oss.str(), "finisher");
8298 finishers.push_back(f);
8299 }
8300
8301 deferred_finisher.start();
8302 for (auto f : finishers) {
8303 f->start();
8304 }
8305 kv_sync_thread.create("bstore_kv_sync");
8306 kv_finalize_thread.create("bstore_kv_final");
8307 }
8308
8309 void BlueStore::_kv_stop()
8310 {
8311 dout(10) << __func__ << dendl;
8312 {
8313 std::unique_lock<std::mutex> l(kv_lock);
8314 while (!kv_sync_started) {
8315 kv_cond.wait(l);
8316 }
8317 kv_stop = true;
8318 kv_cond.notify_all();
8319 }
8320 {
8321 std::unique_lock<std::mutex> l(kv_finalize_lock);
8322 while (!kv_finalize_started) {
8323 kv_finalize_cond.wait(l);
8324 }
8325 kv_finalize_stop = true;
8326 kv_finalize_cond.notify_all();
8327 }
8328 kv_sync_thread.join();
8329 kv_finalize_thread.join();
8330 {
8331 std::lock_guard<std::mutex> l(kv_lock);
8332 kv_stop = false;
8333 }
8334 {
8335 std::lock_guard<std::mutex> l(kv_finalize_lock);
8336 kv_finalize_stop = false;
8337 }
8338 dout(10) << __func__ << " stopping finishers" << dendl;
8339 deferred_finisher.wait_for_empty();
8340 deferred_finisher.stop();
8341 for (auto f : finishers) {
8342 f->wait_for_empty();
8343 f->stop();
8344 }
8345 dout(10) << __func__ << " stopped" << dendl;
8346 }
8347
8348 void BlueStore::_kv_sync_thread()
8349 {
8350 dout(10) << __func__ << " start" << dendl;
8351 std::unique_lock<std::mutex> l(kv_lock);
8352 assert(!kv_sync_started);
8353 kv_sync_started = true;
8354 kv_cond.notify_all();
8355 while (true) {
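// Each iteration: grab everything queued so far (txcs and deferred
// batches) under kv_lock, flush the block device if needed so completed
// deferred writes become stable, submit any still-unsubmitted txcs, then
// commit one final synchronous transaction (synct) carrying bluefs
// extent gifts and deferred-key cleanups (plus the nid/blobid max bumps
// when there is no other txn to put them in), and finally hand the
// committed txcs over to the finalize thread.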
8356 assert(kv_committing.empty());
8357 if (kv_queue.empty() &&
8358 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
8359 !deferred_aggressive)) {
8360 if (kv_stop)
8361 break;
8362 dout(20) << __func__ << " sleep" << dendl;
8363 kv_cond.wait(l);
8364 dout(20) << __func__ << " wake" << dendl;
8365 } else {
8366 deque<TransContext*> kv_submitting;
8367 deque<DeferredBatch*> deferred_done, deferred_stable;
8368 uint64_t aios = 0, costs = 0;
8369
8370 dout(20) << __func__ << " committing " << kv_queue.size()
8371 << " submitting " << kv_queue_unsubmitted.size()
8372 << " deferred done " << deferred_done_queue.size()
8373 << " stable " << deferred_stable_queue.size()
8374 << dendl;
8375 kv_committing.swap(kv_queue);
8376 kv_submitting.swap(kv_queue_unsubmitted);
8377 deferred_done.swap(deferred_done_queue);
8378 deferred_stable.swap(deferred_stable_queue);
8379 aios = kv_ios;
8380 costs = kv_throttle_costs;
8381 kv_ios = 0;
8382 kv_throttle_costs = 0;
8383 utime_t start = ceph_clock_now();
8384 l.unlock();
8385
8386 dout(30) << __func__ << " committing " << kv_committing << dendl;
8387 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
8388 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
8389 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
8390
8391 bool force_flush = false;
8392 // if bluefs is sharing the same device as data (only), then we
8393 // can rely on the bluefs commit to flush the device and make
8394 // deferred aios stable. that means that if we do have done deferred
8395 // txcs AND we are not on a single device, we need to force a flush.
8396 if (bluefs_single_shared_device && bluefs) {
8397 if (aios) {
8398 force_flush = true;
8399 } else if (kv_committing.empty() && kv_submitting.empty() &&
8400 deferred_stable.empty()) {
8401 force_flush = true; // there's nothing else to commit!
8402 } else if (deferred_aggressive) {
8403 force_flush = true;
8404 }
8405 } else
8406 force_flush = true;
8407
8408 if (force_flush) {
8409 dout(20) << __func__ << " num_aios=" << aios
8410 << " force_flush=" << (int)force_flush
8411 << ", flushing, deferred done->stable" << dendl;
8412 // flush/barrier on block device
8413 bdev->flush();
8414
8415 // if we flush then deferred done are now deferred stable
8416 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
8417 deferred_done.end());
8418 deferred_done.clear();
8419 }
8420 utime_t after_flush = ceph_clock_now();
8421
8422 // we will use one final transaction to force a sync
8423 KeyValueDB::Transaction synct = db->get_transaction();
8424
8425 // increase {nid,blobid}_max? note that this covers both the
8426 // case where we are approaching the max and the case we passed
8427 // it. in either case, we increase the max in the earlier txn
8428 // we submit.
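// E.g. (illustrative value): with bluestore_nid_prealloc = 1024, once
// nid_last climbs past nid_max - 512 we persist nid_max = nid_last + 1024
// in the earliest submitting txn of this batch (or in synct if none).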
8429 uint64_t new_nid_max = 0, new_blobid_max = 0;
8430 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
8431 KeyValueDB::Transaction t =
8432 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8433 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
8434 bufferlist bl;
8435 ::encode(new_nid_max, bl);
8436 t->set(PREFIX_SUPER, "nid_max", bl);
8437 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
8438 }
8439 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
8440 KeyValueDB::Transaction t =
8441 kv_submitting.empty() ? synct : kv_submitting.front()->t;
8442 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
8443 bufferlist bl;
8444 ::encode(new_blobid_max, bl);
8445 t->set(PREFIX_SUPER, "blobid_max", bl);
8446 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
8447 }
8448
8449 for (auto txc : kv_committing) {
8450 if (txc->state == TransContext::STATE_KV_QUEUED) {
8451 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8452 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
8453 assert(r == 0);
8454 _txc_applied_kv(txc);
8455 --txc->osr->kv_committing_serially;
8456 txc->state = TransContext::STATE_KV_SUBMITTED;
8457 if (txc->osr->kv_submitted_waiters) {
8458 std::lock_guard<std::mutex> l(txc->osr->qlock);
8459 if (txc->osr->_is_all_kv_submitted()) {
8460 txc->osr->qcond.notify_all();
8461 }
8462 }
8463
8464 } else {
8465 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8466 txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
8467 }
8468 if (txc->had_ios) {
8469 --txc->osr->txc_with_unstable_io;
8470 }
8471 }
8472
8473 // release throttle *before* we commit. this allows new ops
8474 // to be prepared and enter pipeline while we are waiting on
8475 // the kv commit sync/flush. then hopefully on the next
8476 // iteration there will already be ops awake. otherwise, we
8477 // end up going to sleep, and then wake up when the very first
8478 // transaction is ready for commit.
8479 throttle_bytes.put(costs);
8480
8481 PExtentVector bluefs_gift_extents;
8482 if (bluefs &&
8483 after_flush - bluefs_last_balance >
8484 cct->_conf->bluestore_bluefs_balance_interval) {
8485 bluefs_last_balance = after_flush;
8486 int r = _balance_bluefs_freespace(&bluefs_gift_extents);
8487 assert(r >= 0);
8488 if (r > 0) {
8489 for (auto& p : bluefs_gift_extents) {
8490 bluefs_extents.insert(p.offset, p.length);
8491 }
8492 bufferlist bl;
8493 ::encode(bluefs_extents, bl);
8494 dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
8495 << bluefs_extents << std::dec << dendl;
8496 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
8497 }
8498 }
8499
8500 // cleanup sync deferred keys
8501 for (auto b : deferred_stable) {
8502 for (auto& txc : b->txcs) {
8503 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
8504 if (!wt.released.empty()) {
8505 // kraken replay compat only
8506 txc.released = wt.released;
8507 dout(10) << __func__ << " deferred txn has released "
8508 << txc.released
8509 << " (we just upgraded from kraken) on " << &txc << dendl;
8510 _txc_finalize_kv(&txc, synct);
8511 }
8512 // cleanup the deferred
8513 string key;
8514 get_deferred_key(wt.seq, &key);
8515 synct->rm_single_key(PREFIX_DEFERRED, key);
8516 }
8517 }
8518
8519 // submit synct synchronously (block and wait for it to commit)
8520 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
8521 assert(r == 0);
8522
8523 if (new_nid_max) {
8524 nid_max = new_nid_max;
8525 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
8526 }
8527 if (new_blobid_max) {
8528 blobid_max = new_blobid_max;
8529 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
8530 }
8531
8532 {
8533 utime_t finish = ceph_clock_now();
8534 utime_t dur_flush = after_flush - start;
8535 utime_t dur_kv = finish - after_flush;
8536 utime_t dur = finish - start;
8537 dout(20) << __func__ << " committed " << kv_committing.size()
8538 << " cleaned " << deferred_stable.size()
8539 << " in " << dur
8540 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
8541 << dendl;
8542 logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
8543 logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
8544 logger->tinc(l_bluestore_kv_lat, dur);
8545 }
8546
8547 if (bluefs) {
8548 if (!bluefs_gift_extents.empty()) {
8549 _commit_bluefs_freespace(bluefs_gift_extents);
8550 }
8551 for (auto p = bluefs_extents_reclaiming.begin();
8552 p != bluefs_extents_reclaiming.end();
8553 ++p) {
8554 dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
8555 << p.get_start() << "~" << p.get_len() << std::dec
8556 << dendl;
8557 alloc->release(p.get_start(), p.get_len());
8558 }
8559 bluefs_extents_reclaiming.clear();
8560 }
8561
8562 {
8563 std::unique_lock<std::mutex> m(kv_finalize_lock);
8564 if (kv_committing_to_finalize.empty()) {
8565 kv_committing_to_finalize.swap(kv_committing);
8566 } else {
8567 kv_committing_to_finalize.insert(
8568 kv_committing_to_finalize.end(),
8569 kv_committing.begin(),
8570 kv_committing.end());
8571 kv_committing.clear();
8572 }
8573 if (deferred_stable_to_finalize.empty()) {
8574 deferred_stable_to_finalize.swap(deferred_stable);
8575 } else {
8576 deferred_stable_to_finalize.insert(
8577 deferred_stable_to_finalize.end(),
8578 deferred_stable.begin(),
8579 deferred_stable.end());
8580 deferred_stable.clear();
8581 }
8582 kv_finalize_cond.notify_one();
8583 }
8584
8585 l.lock();
8586 // previously deferred "done" are now "stable" by virtue of this
8587 // commit cycle.
8588 deferred_stable_queue.swap(deferred_done);
8589 }
8590 }
8591 dout(10) << __func__ << " finish" << dendl;
8592 kv_sync_started = false;
8593 }
8594
8595 void BlueStore::_kv_finalize_thread()
8596 {
8597 deque<TransContext*> kv_committed;
8598 deque<DeferredBatch*> deferred_stable;
8599 dout(10) << __func__ << " start" << dendl;
8600 std::unique_lock<std::mutex> l(kv_finalize_lock);
8601 assert(!kv_finalize_started);
8602 kv_finalize_started = true;
8603 kv_finalize_cond.notify_all();
8604 while (true) {
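// Drain whatever the sync thread handed off: push each committed txc
// forward through the state machine, retire stable deferred batches
// (their txcs may be destroyed here), opportunistically submit more
// deferred IO when the queue or throttle is filling up, and reap any
// removed collections.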
8605 assert(kv_committed.empty());
8606 assert(deferred_stable.empty());
8607 if (kv_committing_to_finalize.empty() &&
8608 deferred_stable_to_finalize.empty()) {
8609 if (kv_finalize_stop)
8610 break;
8611 dout(20) << __func__ << " sleep" << dendl;
8612 kv_finalize_cond.wait(l);
8613 dout(20) << __func__ << " wake" << dendl;
8614 } else {
8615 kv_committed.swap(kv_committing_to_finalize);
8616 deferred_stable.swap(deferred_stable_to_finalize);
8617 l.unlock();
8618 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
8619 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
8620
8621 while (!kv_committed.empty()) {
8622 TransContext *txc = kv_committed.front();
8623 assert(txc->state == TransContext::STATE_KV_SUBMITTED);
8624 _txc_state_proc(txc);
8625 kv_committed.pop_front();
8626 }
8627
8628 for (auto b : deferred_stable) {
8629 auto p = b->txcs.begin();
8630 while (p != b->txcs.end()) {
8631 TransContext *txc = &*p;
8632 p = b->txcs.erase(p); // unlink here because
8633 _txc_state_proc(txc); // this may destroy txc
8634 }
8635 delete b;
8636 }
8637 deferred_stable.clear();
8638
8639 if (!deferred_aggressive) {
8640 if (deferred_queue_size >= deferred_batch_ops.load() ||
8641 throttle_deferred_bytes.past_midpoint()) {
8642 deferred_try_submit();
8643 }
8644 }
8645
8646 // this is as good a place as any ...
8647 _reap_collections();
8648
8649 l.lock();
8650 }
8651 }
8652 dout(10) << __func__ << " finish" << dendl;
8653 kv_finalize_started = false;
8654 }
8655
8656 bluestore_deferred_op_t *BlueStore::_get_deferred_op(
8657 TransContext *txc, OnodeRef o)
8658 {
8659 if (!txc->deferred_txn) {
8660 txc->deferred_txn = new bluestore_deferred_transaction_t;
8661 }
8662 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
8663 return &txc->deferred_txn->ops.back();
8664 }
8665
8666 void BlueStore::_deferred_queue(TransContext *txc)
8667 {
8668 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
8669 deferred_lock.lock();
8670 if (!txc->osr->deferred_pending &&
8671 !txc->osr->deferred_running) {
8672 deferred_queue.push_back(*txc->osr);
8673 }
8674 if (!txc->osr->deferred_pending) {
8675 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
8676 }
8677 ++deferred_queue_size;
8678 txc->osr->deferred_pending->txcs.push_back(*txc);
8679 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
8680 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
8681 const auto& op = *opi;
8682 assert(op.op == bluestore_deferred_op_t::OP_WRITE);
8683 bufferlist::const_iterator p = op.data.begin();
8684 for (auto e : op.extents) {
8685 txc->osr->deferred_pending->prepare_write(
8686 cct, wt.seq, e.offset, e.length, p);
8687 }
8688 }
8689 if (deferred_aggressive &&
8690 !txc->osr->deferred_running) {
8691 _deferred_submit_unlock(txc->osr.get());
8692 } else {
8693 deferred_lock.unlock();
8694 }
8695 }
8696
8697 void BlueStore::deferred_try_submit()
8698 {
8699 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
8700 << deferred_queue_size << " txcs" << dendl;
8701 std::lock_guard<std::mutex> l(deferred_lock);
8702 vector<OpSequencerRef> osrs;
8703 osrs.reserve(deferred_queue.size());
8704 for (auto& osr : deferred_queue) {
8705 osrs.push_back(&osr);
8706 }
8707 for (auto& osr : osrs) {
8708 if (osr->deferred_pending) {
8709 if (!osr->deferred_running) {
8710 _deferred_submit_unlock(osr.get());
8711 deferred_lock.lock();
8712 } else {
8713 dout(20) << __func__ << " osr " << osr << " already has running"
8714 << dendl;
8715 }
8716 } else {
8717 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
8718 }
8719 }
8720 }
8721
8722 void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
8723 {
8724 dout(10) << __func__ << " osr " << osr
8725 << " " << osr->deferred_pending->iomap.size() << " ios pending "
8726 << dendl;
8727 assert(osr->deferred_pending);
8728 assert(!osr->deferred_running);
8729
8730 auto b = osr->deferred_pending;
8731 deferred_queue_size -= b->seq_bytes.size();
8732 assert(deferred_queue_size >= 0);
8733
8734 osr->deferred_running = osr->deferred_pending;
8735 osr->deferred_pending = nullptr;
8736
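// b->iomap is keyed by device offset; walk it in order, coalescing
// physically contiguous entries into one buffer and issuing a single
// aio_write per contiguous run.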
8737 uint64_t start = 0, pos = 0;
8738 bufferlist bl;
8739 auto i = b->iomap.begin();
8740 while (true) {
8741 if (i == b->iomap.end() || i->first != pos) {
8742 if (bl.length()) {
8743 dout(20) << __func__ << " write 0x" << std::hex
8744 << start << "~" << bl.length()
8745 << " crc " << bl.crc32c(-1) << std::dec << dendl;
8746 if (!g_conf->bluestore_debug_omit_block_device_write) {
8747 logger->inc(l_bluestore_deferred_write_ops);
8748 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
8749 int r = bdev->aio_write(start, bl, &b->ioc, false);
8750 assert(r == 0);
8751 }
8752 }
8753 if (i == b->iomap.end()) {
8754 break;
8755 }
8756 start = 0;
8757 pos = i->first;
8758 bl.clear();
8759 }
8760 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
8761 << std::hex << pos << "~" << i->second.bl.length() << std::dec
8762 << dendl;
8763 if (!bl.length()) {
8764 start = pos;
8765 }
8766 pos += i->second.bl.length();
8767 bl.claim_append(i->second.bl);
8768 ++i;
8769 }
8770
8771 deferred_lock.unlock();
8772 bdev->aio_submit(&b->ioc);
8773 }
8774
8775 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
8776 {
8777 dout(10) << __func__ << " osr " << osr << dendl;
8778 assert(osr->deferred_running);
8779 DeferredBatch *b = osr->deferred_running;
8780
8781 {
8782 std::lock_guard<std::mutex> l(deferred_lock);
8783 assert(osr->deferred_running == b);
8784 osr->deferred_running = nullptr;
8785 if (!osr->deferred_pending) {
8786 dout(20) << __func__ << " dequeueing" << dendl;
8787 auto q = deferred_queue.iterator_to(*osr);
8788 deferred_queue.erase(q);
8789 } else if (deferred_aggressive) {
8790 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
8791 deferred_finisher.queue(new FunctionContext([&](int) {
8792 deferred_try_submit();
8793 }));
8794 } else {
8795 dout(20) << __func__ << " leaving queued, more pending" << dendl;
8796 }
8797 }
8798
8799 {
8800 uint64_t costs = 0;
8801 std::lock_guard<std::mutex> l2(osr->qlock);
8802 for (auto& i : b->txcs) {
8803 TransContext *txc = &i;
8804 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
8805 costs += txc->cost;
8806 }
8807 osr->qcond.notify_all();
8808 throttle_deferred_bytes.put(costs);
8809 std::lock_guard<std::mutex> l(kv_lock);
8810 deferred_done_queue.emplace_back(b);
8811 }
8812
8813 // in the normal case, do not bother waking up the kv thread; it will
8814 // catch us on the next commit anyway.
8815 if (deferred_aggressive) {
8816 std::lock_guard<std::mutex> l(kv_lock);
8817 kv_cond.notify_one();
8818 }
8819 }
8820
8821 int BlueStore::_deferred_replay()
8822 {
8823 dout(10) << __func__ << " start" << dendl;
8824 OpSequencerRef osr = new OpSequencer(cct, this);
8825 int count = 0;
8826 int r = 0;
8827 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
8828 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
8829 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
8830 << dendl;
8831 bluestore_deferred_transaction_t *deferred_txn =
8832 new bluestore_deferred_transaction_t;
8833 bufferlist bl = it->value();
8834 bufferlist::iterator p = bl.begin();
8835 try {
8836 ::decode(*deferred_txn, p);
8837 } catch (buffer::error& e) {
8838 derr << __func__ << " failed to decode deferred txn "
8839 << pretty_binary_string(it->key()) << dendl;
8840 delete deferred_txn;
8841 r = -EIO;
8842 goto out;
8843 }
8844 TransContext *txc = _txc_create(osr.get());
8845 txc->deferred_txn = deferred_txn;
8846 txc->state = TransContext::STATE_KV_DONE;
8847 _txc_state_proc(txc);
8848 }
8849 out:
8850 dout(20) << __func__ << " draining osr" << dendl;
8851 _osr_drain_all();
8852 osr->discard();
8853 dout(10) << __func__ << " completed " << count << " events" << dendl;
8854 return r;
8855 }
8856
8857 // ---------------------------
8858 // transactions
8859
8860 int BlueStore::queue_transactions(
8861 Sequencer *posr,
8862 vector<Transaction>& tls,
8863 TrackedOpRef op,
8864 ThreadPool::TPHandle *handle)
8865 {
8866 FUNCTRACE();
8867 Context *onreadable;
8868 Context *ondisk;
8869 Context *onreadable_sync;
8870 ObjectStore::Transaction::collect_contexts(
8871 tls, &onreadable, &ondisk, &onreadable_sync);
8872
8873 if (cct->_conf->objectstore_blackhole) {
8874 dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
8875 << dendl;
8876 delete ondisk;
8877 delete onreadable;
8878 delete onreadable_sync;
8879 return 0;
8880 }
8881 utime_t start = ceph_clock_now();
8882 // set up the sequencer
8883 OpSequencer *osr;
8884 assert(posr);
8885 if (posr->p) {
8886 osr = static_cast<OpSequencer *>(posr->p.get());
8887 dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
8888 } else {
8889 osr = new OpSequencer(cct, this);
8890 osr->parent = posr;
8891 posr->p = osr;
8892 dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
8893 }
8894
8895 // prepare
8896 TransContext *txc = _txc_create(osr);
8897 txc->onreadable = onreadable;
8898 txc->onreadable_sync = onreadable_sync;
8899 txc->oncommit = ondisk;
8900
8901 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
8902 (*p).set_osr(osr);
8903 txc->bytes += (*p).get_num_bytes();
8904 _txc_add_transaction(txc, &(*p));
8905 }
8906 _txc_calc_cost(txc);
8907
8908 _txc_write_nodes(txc, txc->t);
8909
8910 // journal deferred items
8911 if (txc->deferred_txn) {
8912 txc->deferred_txn->seq = ++deferred_seq;
8913 bufferlist bl;
8914 ::encode(*txc->deferred_txn, bl);
8915 string key;
8916 get_deferred_key(txc->deferred_txn->seq, &key);
8917 txc->t->set(PREFIX_DEFERRED, key, bl);
8918 }
8919
8920 _txc_finalize_kv(txc, txc->t);
8921 if (handle)
8922 handle->suspend_tp_timeout();
8923
8924 utime_t tstart = ceph_clock_now();
8925 throttle_bytes.get(txc->cost);
8926 if (txc->deferred_txn) {
8927 // ensure we do not block here because of deferred writes
8928 if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
8929 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
8930 << dendl;
8931 ++deferred_aggressive;
8932 deferred_try_submit();
8933 throttle_deferred_bytes.get(txc->cost);
8934 --deferred_aggressive;
8935 }
8936 }
8937 utime_t tend = ceph_clock_now();
8938
8939 if (handle)
8940 handle->reset_tp_timeout();
8941
8942 logger->inc(l_bluestore_txc);
8943
8944 // execute (start)
8945 _txc_state_proc(txc);
8946
8947 logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
8948 logger->tinc(l_bluestore_throttle_lat, tend - tstart);
8949 return 0;
8950 }
8951
8952 void BlueStore::_txc_aio_submit(TransContext *txc)
8953 {
8954 dout(10) << __func__ << " txc " << txc << dendl;
8955 bdev->aio_submit(&txc->ioc);
8956 }
8957
8958 void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
8959 {
8960 Transaction::iterator i = t->begin();
8961
8962 _dump_transaction(t);
8963
8964 vector<CollectionRef> cvec(i.colls.size());
8965 unsigned j = 0;
8966 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
8967 ++p, ++j) {
8968 cvec[j] = _get_collection(*p);
8969 }
8970 vector<OnodeRef> ovec(i.objects.size());
8971
8972 for (int pos = 0; i.have_op(); ++pos) {
8973 Transaction::Op *op = i.decode_op();
8974 int r = 0;
8975
8976 // no coll or obj
8977 if (op->op == Transaction::OP_NOP)
8978 continue;
8979
8980 // collection operations
8981 CollectionRef &c = cvec[op->cid];
8982 switch (op->op) {
8983 case Transaction::OP_RMCOLL:
8984 {
8985 const coll_t &cid = i.get_cid(op->cid);
8986 r = _remove_collection(txc, cid, &c);
8987 if (!r)
8988 continue;
8989 }
8990 break;
8991
8992 case Transaction::OP_MKCOLL:
8993 {
8994 assert(!c);
8995 const coll_t &cid = i.get_cid(op->cid);
8996 r = _create_collection(txc, cid, op->split_bits, &c);
8997 if (!r)
8998 continue;
8999 }
9000 break;
9001
9002 case Transaction::OP_SPLIT_COLLECTION:
9003 assert(0 == "deprecated");
9004 break;
9005
9006 case Transaction::OP_SPLIT_COLLECTION2:
9007 {
9008 uint32_t bits = op->split_bits;
9009 uint32_t rem = op->split_rem;
9010 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
9011 if (!r)
9012 continue;
9013 }
9014 break;
9015
9016 case Transaction::OP_COLL_HINT:
9017 {
9018 uint32_t type = op->hint_type;
9019 bufferlist hint;
9020 i.decode_bl(hint);
9021 bufferlist::iterator hiter = hint.begin();
9022 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
9023 uint32_t pg_num;
9024 uint64_t num_objs;
9025 ::decode(pg_num, hiter);
9026 ::decode(num_objs, hiter);
9027 dout(10) << __func__ << " collection hint objects is a no-op, "
9028 << " pg_num " << pg_num << " num_objects " << num_objs
9029 << dendl;
9030 } else {
9031 // Ignore the hint
9032 dout(10) << __func__ << " unknown collection hint " << type << dendl;
9033 }
9034 continue;
9035 }
9036 break;
9037
9038 case Transaction::OP_COLL_SETATTR:
9039 r = -EOPNOTSUPP;
9040 break;
9041
9042 case Transaction::OP_COLL_RMATTR:
9043 r = -EOPNOTSUPP;
9044 break;
9045
9046 case Transaction::OP_COLL_RENAME:
9047 assert(0 == "not implemented");
9048 break;
9049 }
9050 if (r < 0) {
9051 derr << __func__ << " error " << cpp_strerror(r)
9052 << " not handled on operation " << op->op
9053 << " (op " << pos << ", counting from 0)" << dendl;
9054 _dump_transaction(t, 0);
9055 assert(0 == "unexpected error");
9056 }
9057
9058 // these operations implicitly create the object
9059 bool create = false;
9060 if (op->op == Transaction::OP_TOUCH ||
9061 op->op == Transaction::OP_WRITE ||
9062 op->op == Transaction::OP_ZERO) {
9063 create = true;
9064 }
9065
9066 // object operations
9067 RWLock::WLocker l(c->lock);
9068 OnodeRef &o = ovec[op->oid];
9069 if (!o) {
9070 ghobject_t oid = i.get_oid(op->oid);
9071 o = c->get_onode(oid, create);
9072 }
9073 if (!create && (!o || !o->exists)) {
9074 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
9075 << i.get_oid(op->oid) << dendl;
9076 r = -ENOENT;
9077 goto endop;
9078 }
9079
9080 switch (op->op) {
9081 case Transaction::OP_TOUCH:
9082 r = _touch(txc, c, o);
9083 break;
9084
9085 case Transaction::OP_WRITE:
9086 {
9087 uint64_t off = op->off;
9088 uint64_t len = op->len;
9089 uint32_t fadvise_flags = i.get_fadvise_flags();
9090 bufferlist bl;
9091 i.decode_bl(bl);
9092 r = _write(txc, c, o, off, len, bl, fadvise_flags);
9093 }
9094 break;
9095
9096 case Transaction::OP_ZERO:
9097 {
9098 uint64_t off = op->off;
9099 uint64_t len = op->len;
9100 r = _zero(txc, c, o, off, len);
9101 }
9102 break;
9103
9104 case Transaction::OP_TRIMCACHE:
9105 {
9106 // deprecated, no-op
9107 }
9108 break;
9109
9110 case Transaction::OP_TRUNCATE:
9111 {
9112 uint64_t off = op->off;
9113 r = _truncate(txc, c, o, off);
9114 }
9115 break;
9116
9117 case Transaction::OP_REMOVE:
9118 {
9119 r = _remove(txc, c, o);
9120 }
9121 break;
9122
9123 case Transaction::OP_SETATTR:
9124 {
9125 string name = i.decode_string();
9126 bufferptr bp;
9127 i.decode_bp(bp);
9128 r = _setattr(txc, c, o, name, bp);
9129 }
9130 break;
9131
9132 case Transaction::OP_SETATTRS:
9133 {
9134 map<string, bufferptr> aset;
9135 i.decode_attrset(aset);
9136 r = _setattrs(txc, c, o, aset);
9137 }
9138 break;
9139
9140 case Transaction::OP_RMATTR:
9141 {
9142 string name = i.decode_string();
9143 r = _rmattr(txc, c, o, name);
9144 }
9145 break;
9146
9147 case Transaction::OP_RMATTRS:
9148 {
9149 r = _rmattrs(txc, c, o);
9150 }
9151 break;
9152
9153 case Transaction::OP_CLONE:
9154 {
9155 OnodeRef& no = ovec[op->dest_oid];
9156 if (!no) {
9157 const ghobject_t& noid = i.get_oid(op->dest_oid);
9158 no = c->get_onode(noid, true);
9159 }
9160 r = _clone(txc, c, o, no);
9161 }
9162 break;
9163
9164 case Transaction::OP_CLONERANGE:
9165 assert(0 == "deprecated");
9166 break;
9167
9168 case Transaction::OP_CLONERANGE2:
9169 {
9170 OnodeRef& no = ovec[op->dest_oid];
9171 if (!no) {
9172 const ghobject_t& noid = i.get_oid(op->dest_oid);
9173 no = c->get_onode(noid, true);
9174 }
9175 uint64_t srcoff = op->off;
9176 uint64_t len = op->len;
9177 uint64_t dstoff = op->dest_off;
9178 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
9179 }
9180 break;
9181
9182 case Transaction::OP_COLL_ADD:
9183 assert(0 == "not implemented");
9184 break;
9185
9186 case Transaction::OP_COLL_REMOVE:
9187 assert(0 == "not implemented");
9188 break;
9189
9190 case Transaction::OP_COLL_MOVE:
9191 assert(0 == "deprecated");
9192 break;
9193
9194 case Transaction::OP_COLL_MOVE_RENAME:
9195 case Transaction::OP_TRY_RENAME:
9196 {
9197 assert(op->cid == op->dest_cid);
9198 const ghobject_t& noid = i.get_oid(op->dest_oid);
9199 OnodeRef& no = ovec[op->dest_oid];
9200 if (!no) {
9201 no = c->get_onode(noid, false);
9202 }
9203 r = _rename(txc, c, o, no, noid);
9204 }
9205 break;
9206
9207 case Transaction::OP_OMAP_CLEAR:
9208 {
9209 r = _omap_clear(txc, c, o);
9210 }
9211 break;
9212 case Transaction::OP_OMAP_SETKEYS:
9213 {
9214 bufferlist aset_bl;
9215 i.decode_attrset_bl(&aset_bl);
9216 r = _omap_setkeys(txc, c, o, aset_bl);
9217 }
9218 break;
9219 case Transaction::OP_OMAP_RMKEYS:
9220 {
9221 bufferlist keys_bl;
9222 i.decode_keyset_bl(&keys_bl);
9223 r = _omap_rmkeys(txc, c, o, keys_bl);
9224 }
9225 break;
9226 case Transaction::OP_OMAP_RMKEYRANGE:
9227 {
9228 string first, last;
9229 first = i.decode_string();
9230 last = i.decode_string();
9231 r = _omap_rmkey_range(txc, c, o, first, last);
9232 }
9233 break;
9234 case Transaction::OP_OMAP_SETHEADER:
9235 {
9236 bufferlist bl;
9237 i.decode_bl(bl);
9238 r = _omap_setheader(txc, c, o, bl);
9239 }
9240 break;
9241
9242 case Transaction::OP_SETALLOCHINT:
9243 {
9244 r = _set_alloc_hint(txc, c, o,
9245 op->expected_object_size,
9246 op->expected_write_size,
9247 op->alloc_hint_flags);
9248 }
9249 break;
9250
9251 default:
9252 derr << __func__ << "bad op " << op->op << dendl;
9253 ceph_abort();
9254 }
9255
9256 endop:
9257 if (r < 0) {
9258 bool ok = false;
9259
9260 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
9261 op->op == Transaction::OP_CLONE ||
9262 op->op == Transaction::OP_CLONERANGE2 ||
9263 op->op == Transaction::OP_COLL_ADD ||
9264 op->op == Transaction::OP_SETATTR ||
9265 op->op == Transaction::OP_SETATTRS ||
9266 op->op == Transaction::OP_RMATTR ||
9267 op->op == Transaction::OP_OMAP_SETKEYS ||
9268 op->op == Transaction::OP_OMAP_RMKEYS ||
9269 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
9270 op->op == Transaction::OP_OMAP_SETHEADER))
9271 // -ENOENT is usually okay
9272 ok = true;
9273 if (r == -ENODATA)
9274 ok = true;
9275
9276 if (!ok) {
9277 const char *msg = "unexpected error code";
9278
9279 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
9280 op->op == Transaction::OP_CLONE ||
9281 op->op == Transaction::OP_CLONERANGE2))
9282 msg = "ENOENT on clone suggests osd bug";
9283
9284 if (r == -ENOSPC)
9285 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
9286 // by partially applying transactions.
9287 msg = "ENOSPC from bluestore, misconfigured cluster";
9288
9289 if (r == -ENOTEMPTY) {
9290 msg = "ENOTEMPTY suggests garbage data in osd data dir";
9291 }
9292
9293 derr << __func__ << " error " << cpp_strerror(r)
9294 << " not handled on operation " << op->op
9295 << " (op " << pos << ", counting from 0)"
9296 << dendl;
9297 derr << msg << dendl;
9298 _dump_transaction(t, 0);
9299 assert(0 == "unexpected error");
9300 }
9301 }
9302 }
9303 }
9304
9305
9306
9307 // -----------------
9308 // write operations
9309
9310 int BlueStore::_touch(TransContext *txc,
9311 CollectionRef& c,
9312 OnodeRef &o)
9313 {
9314 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
9315 int r = 0;
9316 _assign_nid(txc, o);
9317 txc->write_onode(o);
9318 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
9319 return r;
9320 }
9321
9322 void BlueStore::_dump_onode(OnodeRef o, int log_level)
9323 {
9324 if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
9325 return;
9326 dout(log_level) << __func__ << " " << o << " " << o->oid
9327 << " nid " << o->onode.nid
9328 << " size 0x" << std::hex << o->onode.size
9329 << " (" << std::dec << o->onode.size << ")"
9330 << " expected_object_size " << o->onode.expected_object_size
9331 << " expected_write_size " << o->onode.expected_write_size
9332 << " in " << o->onode.extent_map_shards.size() << " shards"
9333 << ", " << o->extent_map.spanning_blob_map.size()
9334 << " spanning blobs"
9335 << dendl;
9336 for (auto p = o->onode.attrs.begin();
9337 p != o->onode.attrs.end();
9338 ++p) {
9339 dout(log_level) << __func__ << " attr " << p->first
9340 << " len " << p->second.length() << dendl;
9341 }
9342 _dump_extent_map(o->extent_map, log_level);
9343 }
9344
9345 void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
9346 {
9347 uint64_t pos = 0;
9348 for (auto& s : em.shards) {
9349 dout(log_level) << __func__ << " shard " << *s.shard_info
9350 << (s.loaded ? " (loaded)" : "")
9351 << (s.dirty ? " (dirty)" : "")
9352 << dendl;
9353 }
9354 for (auto& e : em.extent_map) {
9355 dout(log_level) << __func__ << " " << e << dendl;
9356 assert(e.logical_offset >= pos);
9357 pos = e.logical_offset + e.length;
9358 const bluestore_blob_t& blob = e.blob->get_blob();
9359 if (blob.has_csum()) {
9360 vector<uint64_t> v;
9361 unsigned n = blob.get_csum_count();
9362 for (unsigned i = 0; i < n; ++i)
9363 v.push_back(blob.get_csum_item(i));
9364 dout(log_level) << __func__ << " csum: " << std::hex << v << std::dec
9365 << dendl;
9366 }
9367 std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
9368 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
9369 dout(log_level) << __func__ << " 0x" << std::hex << i.first
9370 << "~" << i.second->length << std::dec
9371 << " " << *i.second << dendl;
9372 }
9373 }
9374 }
9375
9376 void BlueStore::_dump_transaction(Transaction *t, int log_level)
9377 {
9378 dout(log_level) << " transaction dump:\n";
9379 JSONFormatter f(true);
9380 f.open_object_section("transaction");
9381 t->dump(&f);
9382 f.close_section();
9383 f.flush(*_dout);
9384 *_dout << dendl;
9385 }
9386
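// Pad *bl (and lower *offset) so that both ends are aligned to
// chunk_size, zero-filling the added head/tail bytes.  Example
// (illustrative): offset=0x1200, length=0x300, chunk_size=0x1000 yields
// offset=0x1000 and a 0x1000-byte buffer with 0x200 zeros in front and
// 0xb00 zeros at the back.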
9387 void BlueStore::_pad_zeros(
9388 bufferlist *bl, uint64_t *offset,
9389 uint64_t chunk_size)
9390 {
9391 auto length = bl->length();
9392 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
9393 << " chunk_size 0x" << chunk_size << std::dec << dendl;
9394 dout(40) << "before:\n";
9395 bl->hexdump(*_dout);
9396 *_dout << dendl;
9397 // front
9398 size_t front_pad = *offset % chunk_size;
9399 size_t back_pad = 0;
9400 size_t pad_count = 0;
9401 if (front_pad) {
9402 size_t front_copy = MIN(chunk_size - front_pad, length);
9403 bufferptr z = buffer::create_page_aligned(chunk_size);
9404 z.zero(0, front_pad, false);
9405 pad_count += front_pad;
9406 bl->copy(0, front_copy, z.c_str() + front_pad);
9407 if (front_copy + front_pad < chunk_size) {
9408 back_pad = chunk_size - (length + front_pad);
9409 z.zero(front_pad + length, back_pad, false);
9410 pad_count += back_pad;
9411 }
9412 bufferlist old, t;
9413 old.swap(*bl);
9414 t.substr_of(old, front_copy, length - front_copy);
9415 bl->append(z);
9416 bl->claim_append(t);
9417 *offset -= front_pad;
9418 length += pad_count;
9419 }
9420
9421 // back
9422 uint64_t end = *offset + length;
9423 unsigned back_copy = end % chunk_size;
9424 if (back_copy) {
9425 assert(back_pad == 0);
9426 back_pad = chunk_size - back_copy;
9427 assert(back_copy <= length);
9428 bufferptr tail(chunk_size);
9429 bl->copy(length - back_copy, back_copy, tail.c_str());
9430 tail.zero(back_copy, back_pad, false);
9431 bufferlist old;
9432 old.swap(*bl);
9433 bl->substr_of(old, 0, length - back_copy);
9434 bl->append(tail);
9435 length += back_pad;
9436 pad_count += back_pad;
9437 }
9438 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
9439 << back_pad << " on front/back, now 0x" << *offset << "~"
9440 << length << std::dec << dendl;
9441 dout(40) << "after:\n";
9442 bl->hexdump(*_dout);
9443 *_dout << dendl;
9444 if (pad_count)
9445 logger->inc(l_bluestore_write_pad_bytes, pad_count);
9446 assert(bl->length() == length);
9447 }
9448
9449 void BlueStore::_do_write_small(
9450 TransContext *txc,
9451 CollectionRef &c,
9452 OnodeRef o,
9453 uint64_t offset, uint64_t length,
9454 bufferlist::iterator& blp,
9455 WriteContext *wctx)
9456 {
9457 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9458 << std::dec << dendl;
9459 assert(length < min_alloc_size);
9460 uint64_t end_offs = offset + length;
9461
9462 logger->inc(l_bluestore_write_small);
9463 logger->inc(l_bluestore_write_small_bytes, length);
9464
9465 bufferlist bl;
9466 blp.copy(length, bl);
9467
9468 // Look for an existing mutable blob we can use.
9469 auto begin = o->extent_map.extent_map.begin();
9470 auto end = o->extent_map.extent_map.end();
9471 auto ep = o->extent_map.seek_lextent(offset);
9472 if (ep != begin) {
9473 --ep;
9474 if (ep->blob_end() <= offset) {
9475 ++ep;
9476 }
9477 }
9478 auto prev_ep = ep;
9479 if (prev_ep != begin) {
9480 --prev_ep;
9481 } else {
9482 prev_ep = end; // to avoid this extent check as it's a duplicate
9483 }
9484
9485 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9486 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9487 uint32_t alloc_len = min_alloc_size;
9488 auto offset0 = P2ALIGN(offset, alloc_len);
9489
9490 bool any_change;
9491
9492 // search for a suitable extent in both the forward and reverse directions
9493 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
9494 // range, then check whether the blob can be reused via can_reuse_blob() or
9495 // whether a direct/deferred write applies (the latter only for extents
9496 // covering 'offset' or above).
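// The loop below advances 'ep' forward while it stays below
// offset + max_bsize and walks 'prev_ep' backward while it stays at or
// above min_off; it terminates once neither iterator makes progress
// (any_change stays false) or one of the write paths returns early.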
9497 do {
9498 any_change = false;
9499
9500 if (ep != end && ep->logical_offset < offset + max_bsize) {
9501 BlobRef b = ep->blob;
9502 auto bstart = ep->blob_start();
9503 dout(20) << __func__ << " considering " << *b
9504 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9505 if (bstart >= end_offs) {
9506 dout(20) << __func__ << " ignoring distant " << *b << dendl;
9507 } else if (!b->get_blob().is_mutable()) {
9508 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
9509 } else if (ep->logical_offset % min_alloc_size !=
9510 ep->blob_offset % min_alloc_size) {
9511 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
9512 } else {
9513 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9514 // can we pad our head/tail out with zeros?
9515 uint64_t head_pad, tail_pad;
9516 head_pad = P2PHASE(offset, chunk_size);
9517 tail_pad = P2NPHASE(end_offs, chunk_size);
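// Illustrative values (not from the source): with chunk_size = 0x1000,
// offset = 0x1a00 and end_offs = 0x1f00, head_pad = P2PHASE(0x1a00, 0x1000)
// = 0xa00 and tail_pad = P2NPHASE(0x1f00, 0x1000) = 0x100, i.e. the write
// could be grown to the chunk-aligned range [0x1000, 0x2000), provided no
// existing lextents overlap the padded bytes (checked just below).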
9518 if (head_pad || tail_pad) {
9519 o->extent_map.fault_range(db, offset - head_pad,
9520 end_offs - offset + head_pad + tail_pad);
9521 }
9522 if (head_pad &&
9523 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
9524 head_pad = 0;
9525 }
9526 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
9527 tail_pad = 0;
9528 }
9529
9530 uint64_t b_off = offset - head_pad - bstart;
9531 uint64_t b_len = length + head_pad + tail_pad;
9532
9533 // direct write into unused blocks of an existing mutable blob?
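// All four conditions below must hold for this in-place path: the write is
// chunk-aligned after padding, it fits within the blob's on-disk extent,
// the target range is still marked unused, and that range is backed by
// allocated space. Otherwise we fall through to the read-modify-write and
// blob-reuse paths further down.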
9534 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
9535 b->get_blob().get_ondisk_length() >= b_off + b_len &&
9536 b->get_blob().is_unused(b_off, b_len) &&
9537 b->get_blob().is_allocated(b_off, b_len)) {
9538 _apply_padding(head_pad, tail_pad, bl);
9539
9540 dout(20) << __func__ << " write to unused 0x" << std::hex
9541 << b_off << "~" << b_len
9542 << " pad 0x" << head_pad << " + 0x" << tail_pad
9543 << std::dec << " of mutable " << *b << dendl;
9544 _buffer_cache_write(txc, b, b_off, bl,
9545 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9546
9547 if (!g_conf->bluestore_debug_omit_block_device_write) {
9548 if (b_len <= prefer_deferred_size) {
9549 dout(20) << __func__ << " deferring small 0x" << std::hex
9550 << b_len << std::dec << " unused write via deferred" << dendl;
9551 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9552 op->op = bluestore_deferred_op_t::OP_WRITE;
9553 b->get_blob().map(
9554 b_off, b_len,
9555 [&](uint64_t offset, uint64_t length) {
9556 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9557 return 0;
9558 });
9559 op->data = bl;
9560 } else {
9561 b->get_blob().map_bl(
9562 b_off, bl,
9563 [&](uint64_t offset, bufferlist& t) {
9564 bdev->aio_write(offset, t,
9565 &txc->ioc, wctx->buffered);
9566 });
9567 }
9568 }
9569 b->dirty_blob().calc_csum(b_off, bl);
9570 dout(20) << __func__ << " lex old " << *ep << dendl;
9571 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
9572 b,
9573 &wctx->old_extents);
9574 b->dirty_blob().mark_used(le->blob_offset, le->length);
9575 txc->statfs_delta.stored() += le->length;
9576 dout(20) << __func__ << " lex " << *le << dendl;
9577 logger->inc(l_bluestore_write_small_unused);
9578 return;
9579 }
9580 // read some data to fill out the chunk?
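// Illustrative values (not from the source): with chunk_size = 0x1000,
// b_off = 0x1200 and b_len = 0x300, head_read = 0x200 and tail_read = 0xb00,
// so (assuming the blob is long enough and head_read + tail_read stays under
// min_alloc_size) the write widens to the full chunk [0x1000, 0x2000) and the
// missing bytes are read back in before the deferred overwrite below.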
9581 uint64_t head_read = P2PHASE(b_off, chunk_size);
9582 uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
9583 if ((head_read || tail_read) &&
9584 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
9585 head_read + tail_read < min_alloc_size) {
9586 b_off -= head_read;
9587 b_len += head_read + tail_read;
9588
9589 } else {
9590 head_read = tail_read = 0;
9591 }
9592
9593 // chunk-aligned deferred overwrite?
9594 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
9595 b_off % chunk_size == 0 &&
9596 b_len % chunk_size == 0 &&
9597 b->get_blob().is_allocated(b_off, b_len)) {
9598
9599 _apply_padding(head_pad, tail_pad, bl);
9600
9601 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
9602 << " and tail 0x" << tail_read << std::dec << dendl;
9603 if (head_read) {
9604 bufferlist head_bl;
9605 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
9606 head_bl, 0);
9607 assert(r >= 0 && r <= (int)head_read);
9608 size_t zlen = head_read - r;
9609 if (zlen) {
9610 head_bl.append_zero(zlen);
9611 logger->inc(l_bluestore_write_pad_bytes, zlen);
9612 }
9613 bl.claim_prepend(head_bl);
9614 logger->inc(l_bluestore_write_penalty_read_ops);
9615 }
9616 if (tail_read) {
9617 bufferlist tail_bl;
9618 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
9619 tail_bl, 0);
9620 assert(r >= 0 && r <= (int)tail_read);
9621 size_t zlen = tail_read - r;
9622 if (zlen) {
9623 tail_bl.append_zero(zlen);
9624 logger->inc(l_bluestore_write_pad_bytes, zlen);
9625 }
9626 bl.claim_append(tail_bl);
9627 logger->inc(l_bluestore_write_penalty_read_ops);
9628 }
9629 logger->inc(l_bluestore_write_small_pre_read);
9630
9631 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
9632 op->op = bluestore_deferred_op_t::OP_WRITE;
9633 _buffer_cache_write(txc, b, b_off, bl,
9634 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
9635
9636 int r = b->get_blob().map(
9637 b_off, b_len,
9638 [&](uint64_t offset, uint64_t length) {
9639 op->extents.emplace_back(bluestore_pextent_t(offset, length));
9640 return 0;
9641 });
9642 assert(r == 0);
9643 if (b->get_blob().csum_type) {
9644 b->dirty_blob().calc_csum(b_off, bl);
9645 }
9646 op->data.claim(bl);
9647 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
9648 << b_len << std::dec << " of mutable " << *b
9649 << " at " << op->extents << dendl;
9650 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
9651 b, &wctx->old_extents);
9652 b->dirty_blob().mark_used(le->blob_offset, le->length);
9653 txc->statfs_delta.stored() += le->length;
9654 dout(20) << __func__ << " lex " << *le << dendl;
9655 logger->inc(l_bluestore_write_small_deferred);
9656 return;
9657 }
9658 // try to reuse blob if we can
9659 if (b->can_reuse_blob(min_alloc_size,
9660 max_bsize,
9661 offset0 - bstart,
9662 &alloc_len)) {
9663 assert(alloc_len == min_alloc_size); // expecting the data to always
9664 // fit into the reused blob
9665 // Need to check for pending writes that want to
9666 // reuse the same pextent. The rationale is that during GC two chunks
9667 // from garbage blobs (compressed?) can share logical space within the same
9668 // AU. That in turn might be caused by an unaligned len in clone_range2.
9669 // Hence the second write would fail when attempting to reuse the blob in
9670 // do_alloc_write().
9671 if (!wctx->has_conflict(b,
9672 offset0,
9673 offset0 + alloc_len,
9674 min_alloc_size)) {
9675
9676 // we can't reuse head_pad/tail_pad since they might have been truncated
9677 // due to existing extents
9678 uint64_t b_off = offset - bstart;
9679 uint64_t b_off0 = b_off;
9680 _pad_zeros(&bl, &b_off0, chunk_size);
9681
9682 dout(20) << __func__ << " reuse blob " << *b << std::hex
9683 << " (0x" << b_off0 << "~" << bl.length() << ")"
9684 << " (0x" << b_off << "~" << length << ")"
9685 << std::dec << dendl;
9686
9687 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9688 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9689 false, false);
9690 logger->inc(l_bluestore_write_small_unused);
9691 return;
9692 }
9693 }
9694 }
9695 ++ep;
9696 any_change = true;
9697 } // if (ep != end && ep->logical_offset < offset + max_bsize)
9698
9699 // check extent for reuse in reverse order
9700 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9701 BlobRef b = prev_ep->blob;
9702 auto bstart = prev_ep->blob_start();
9703 dout(20) << __func__ << " considering " << *b
9704 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
9705 if (b->can_reuse_blob(min_alloc_size,
9706 max_bsize,
9707 offset0 - bstart,
9708 &alloc_len)) {
9709 assert(alloc_len == min_alloc_size); // expecting the data to always
9710 // fit into the reused blob
9711 // Need to check for pending writes that want to
9712 // reuse the same pextent. The rationale is that during GC two chunks
9713 // from garbage blobs (compressed?) can share logical space within the same
9714 // AU. That in turn might be caused by an unaligned len in clone_range2.
9715 // Hence the second write would fail when attempting to reuse the blob in
9716 // do_alloc_write().
9717 if (!wctx->has_conflict(b,
9718 offset0,
9719 offset0 + alloc_len,
9720 min_alloc_size)) {
9721
9722 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
9723 uint64_t b_off = offset - bstart;
9724 uint64_t b_off0 = b_off;
9725 _pad_zeros(&bl, &b_off0, chunk_size);
9726
9727 dout(20) << __func__ << " reuse blob " << *b << std::hex
9728 << " (0x" << b_off0 << "~" << bl.length() << ")"
9729 << " (0x" << b_off << "~" << length << ")"
9730 << std::dec << dendl;
9731
9732 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9733 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
9734 false, false);
9735 logger->inc(l_bluestore_write_small_unused);
9736 return;
9737 }
9738 }
9739 if (prev_ep != begin) {
9740 --prev_ep;
9741 any_change = true;
9742 } else {
9743 prev_ep = end; // to avoid useless first extent re-check
9744 }
9745 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
9746 } while (any_change);
9747
9748 // new blob.
9749
9750 BlobRef b = c->new_blob();
9751 uint64_t b_off = P2PHASE(offset, alloc_len);
9752 uint64_t b_off0 = b_off;
9753 _pad_zeros(&bl, &b_off0, block_size);
9754 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9755 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
9756 logger->inc(l_bluestore_write_small_new);
9757
9758 return;
9759 }
9760
9761 void BlueStore::_do_write_big(
9762 TransContext *txc,
9763 CollectionRef &c,
9764 OnodeRef o,
9765 uint64_t offset, uint64_t length,
9766 bufferlist::iterator& blp,
9767 WriteContext *wctx)
9768 {
9769 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
9770 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
9771 << " compress " << (int)wctx->compress
9772 << dendl;
9773 logger->inc(l_bluestore_write_big);
9774 logger->inc(l_bluestore_write_big_bytes, length);
9775 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
9776 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
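// The big-write path carves the range into blob-sized pieces. For example
// (illustrative values, not from the source), with max_bsize = 0x10000 a
// 0x28000-byte write becomes two full 0x10000 blobs plus a final 0x8000
// piece; each piece either reuses a nearby mutable blob (only when not
// compressing) or gets a fresh blob from c->new_blob().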
9777 while (length > 0) {
9778 bool new_blob = false;
9779 uint32_t l = MIN(max_bsize, length);
9780 BlobRef b;
9781 uint32_t b_off = 0;
9782
9783 // attempt to reuse an existing blob
9784 if (!wctx->compress) {
9785 // look for an existing mutable blob we can reuse
9786 auto begin = o->extent_map.extent_map.begin();
9787 auto end = o->extent_map.extent_map.end();
9788 auto ep = o->extent_map.seek_lextent(offset);
9789 auto prev_ep = ep;
9790 if (prev_ep != begin) {
9791 --prev_ep;
9792 } else {
9793 prev_ep = end; // to avoid this extent check as it's a duplicate
9794 }
9795 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
9796 // search for a suitable extent in both the forward and reverse directions
9797 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
9798 // range, then check whether the blob can be reused via can_reuse_blob().
9799 bool any_change;
9800 do {
9801 any_change = false;
9802 if (ep != end && ep->logical_offset < offset + max_bsize) {
9803 if (offset >= ep->blob_start() &&
9804 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
9805 offset - ep->blob_start(),
9806 &l)) {
9807 b = ep->blob;
9808 b_off = offset - ep->blob_start();
9809 prev_ep = end; // to avoid check below
9810 dout(20) << __func__ << " reuse blob " << *b << std::hex
9811 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
9812 } else {
9813 ++ep;
9814 any_change = true;
9815 }
9816 }
9817
9818 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
9819 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
9820 offset - prev_ep->blob_start(),
9821 &l)) {
9822 b = prev_ep->blob;
9823 b_off = offset - prev_ep->blob_start();
9824 dout(20) << __func__ << " reuse blob " << *b << std::hex
9825 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
9826 } else if (prev_ep != begin) {
9827 --prev_ep;
9828 any_change = true;
9829 } else {
9830 prev_ep = end; // to avoid useless first extent re-check
9831 }
9832 }
9833 } while (b == nullptr && any_change);
9834 }
9835 if (b == nullptr) {
9836 b = c->new_blob();
9837 b_off = 0;
9838 new_blob = true;
9839 }
9840
9841 bufferlist t;
9842 blp.copy(l, t);
9843 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
9844 offset += l;
9845 length -= l;
9846 logger->inc(l_bluestore_write_big_blobs);
9847 }
9848 }
9849
9850 int BlueStore::_do_alloc_write(
9851 TransContext *txc,
9852 CollectionRef coll,
9853 OnodeRef o,
9854 WriteContext *wctx)
9855 {
9856 dout(20) << __func__ << " txc " << txc
9857 << " " << wctx->writes.size() << " blobs"
9858 << dendl;
9859
9860 uint64_t need = 0;
9861 auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
9862 for (auto &wi : wctx->writes) {
9863 need += wi.blob_length;
9864 }
9865 int r = alloc->reserve(need);
9866 if (r < 0) {
9867 derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
9868 << dendl;
9869 return r;
9870 }
9871
9872 uint64_t hint = 0;
9873 CompressorRef c;
9874 double crr = 0;
9875 if (wctx->compress) {
9876 c = select_option(
9877 "compression_algorithm",
9878 compressor,
9879 [&]() {
9880 string val;
9881 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
9882 CompressorRef cp = compressor;
9883 if (!cp || cp->get_type_name() != val) {
9884 cp = Compressor::create(cct, val);
9885 }
9886 return boost::optional<CompressorRef>(cp);
9887 }
9888 return boost::optional<CompressorRef>();
9889 }
9890 );
9891
9892 crr = select_option(
9893 "compression_required_ratio",
9894 cct->_conf->bluestore_compression_required_ratio,
9895 [&]() {
9896 double val;
9897 if(coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
9898 return boost::optional<double>(val);
9899 }
9900 return boost::optional<double>();
9901 }
9902 );
9903 }
9904
9905 // checksum
9906 int csum = csum_type.load();
9907 csum = select_option(
9908 "csum_type",
9909 csum,
9910 [&]() {
9911 int val;
9912 if(coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
9913 return boost::optional<int>(val);
9914 }
9915 return boost::optional<int>();
9916 }
9917 );
9918
9919 for (auto& wi : wctx->writes) {
9920 BlobRef b = wi.b;
9921 bluestore_blob_t& dblob = b->dirty_blob();
9922 uint64_t b_off = wi.b_off;
9923 bufferlist *l = &wi.bl;
9924 uint64_t final_length = wi.blob_length;
9925 uint64_t csum_length = wi.blob_length;
9926 unsigned csum_order = block_size_order;
9927 bufferlist compressed_bl;
9928 bool compressed = false;
9929 if(c && wi.blob_length > min_alloc_size) {
9930
9931 utime_t start = ceph_clock_now();
9932
9933 // compress
9934 assert(b_off == 0);
9935 assert(wi.blob_length == l->length());
9936 bluestore_compression_header_t chdr;
9937 chdr.type = c->get_type();
9938 // FIXME: memory alignment here is bad
9939 bufferlist t;
9940
9941 r = c->compress(*l, t);
9942 assert(r == 0);
9943
9944 chdr.length = t.length();
9945 ::encode(chdr, compressed_bl);
9946 compressed_bl.claim_append(t);
9947 uint64_t rawlen = compressed_bl.length();
9948 uint64_t newlen = P2ROUNDUP(rawlen, min_alloc_size);
9949 uint64_t want_len_raw = final_length * crr;
9950 uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
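// Illustrative numbers (not from the source): with min_alloc_size = 0x1000,
// final_length = 0x10000 and crr = 0.875, want_len = 0xe000; a compressed
// payload of rawlen = 0x8500 rounds up to newlen = 0x9000 <= 0xe000, so the
// compressed copy is kept, whereas rawlen = 0xe800 would round to 0xf000 >
// 0xe000 and the data would be written uncompressed.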
9951 if (newlen <= want_len && newlen < final_length) {
9952 // Cool. We compressed at least as much as we were hoping to.
9953 // pad out to min_alloc_size
9954 compressed_bl.append_zero(newlen - rawlen);
9955 logger->inc(l_bluestore_write_pad_bytes, newlen - rawlen);
9956 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
9957 << " -> 0x" << rawlen << " => 0x" << newlen
9958 << " with " << c->get_type()
9959 << std::dec << dendl;
9960 txc->statfs_delta.compressed() += rawlen;
9961 txc->statfs_delta.compressed_original() += l->length();
9962 txc->statfs_delta.compressed_allocated() += newlen;
9963 l = &compressed_bl;
9964 final_length = newlen;
9965 csum_length = newlen;
9966 csum_order = ctz(newlen);
9967 dblob.set_compressed(wi.blob_length, rawlen);
9968 compressed = true;
9969 logger->inc(l_bluestore_compress_success_count);
9970 } else {
9971 dout(20) << __func__ << std::hex << " 0x" << l->length()
9972 << " compressed to 0x" << rawlen << " -> 0x" << newlen
9973 << " with " << c->get_type()
9974 << ", which is more than required 0x" << want_len_raw
9975 << " -> 0x" << want_len
9976 << ", leaving uncompressed"
9977 << std::dec << dendl;
9978 logger->inc(l_bluestore_compress_rejected_count);
9979 }
9980 logger->tinc(l_bluestore_compress_lat,
9981 ceph_clock_now() - start);
9982 }
9983 if (!compressed && wi.new_blob) {
9984 // initialize newly created blob only
9985 assert(dblob.is_mutable());
9986 if (l->length() != wi.blob_length) {
9987 // hrm, maybe we could do better here, but let's not bother.
9988 dout(20) << __func__ << " forcing csum_order to block_size_order "
9989 << block_size_order << dendl;
9990 csum_order = block_size_order;
9991 } else {
9992 csum_order = std::min(wctx->csum_order, ctz(l->length()));
9993 }
9994 // try to align blob with max_blob_size to improve
9995 // its reuse ratio, e.g. in case of reverse write
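// Illustrative case (values not from the source): with max_bsize = 0x10000
// and a new blob created for a small write at logical offset 0x1c000,
// suggested_boff = 0xc000; placing the data at that blob offset leaves blob
// offsets 0x0-0xc000 free, so later writes to the preceding logical range
// (e.g. an object filled back-to-front) have a chance to reuse this blob.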
9996 uint32_t suggested_boff =
9997 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
9998 if ((suggested_boff % (1 << csum_order)) == 0 &&
9999 suggested_boff + final_length <= max_bsize &&
10000 suggested_boff > b_off) {
10001 dout(20) << __func__ << " forcing blob_offset to 0x"
10002 << std::hex << suggested_boff << std::dec << dendl;
10003 assert(suggested_boff >= b_off);
10004 csum_length += suggested_boff - b_off;
10005 b_off = suggested_boff;
10006 }
10007 if (csum != Checksummer::CSUM_NONE) {
10008 dout(20) << __func__ << " initialize csum setting for new blob " << *b
10009 << " csum_type " << Checksummer::get_csum_type_string(csum)
10010 << " csum_order " << csum_order
10011 << " csum_length 0x" << std::hex << csum_length << std::dec
10012 << dendl;
10013 dblob.init_csum(csum, csum_order, csum_length);
10014 }
10015 }
10016
10017 AllocExtentVector extents;
10018 extents.reserve(4); // 4 should be (more than) enough for most allocations
10019 int64_t got = alloc->allocate(final_length, min_alloc_size,
10020 max_alloc_size.load(),
10021 hint, &extents);
10022 assert(got == (int64_t)final_length);
10023 need -= got;
10024 txc->statfs_delta.allocated() += got;
10025 for (auto& p : extents) {
10026 bluestore_pextent_t e = bluestore_pextent_t(p);
10027 txc->allocated.insert(e.offset, e.length);
10028 hint = p.end();
10029 }
10030 dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
10031
10032 dout(20) << __func__ << " blob " << *b << dendl;
10033 if (dblob.has_csum()) {
10034 dblob.calc_csum(b_off, *l);
10035 }
10036
10037 if (wi.mark_unused) {
10038 auto b_end = b_off + wi.bl.length();
10039 if (b_off) {
10040 dblob.add_unused(0, b_off);
10041 }
10042 if (b_end < wi.blob_length) {
10043 dblob.add_unused(b_end, wi.blob_length - b_end);
10044 }
10045 }
10046
10047 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
10048 b_off + (wi.b_off0 - wi.b_off),
10049 wi.length0,
10050 wi.b,
10051 nullptr);
10052 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
10053 txc->statfs_delta.stored() += le->length;
10054 dout(20) << __func__ << " lex " << *le << dendl;
10055 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
10056 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
10057
10058 // queue io
10059 if (!g_conf->bluestore_debug_omit_block_device_write) {
10060 if (l->length() <= prefer_deferred_size.load()) {
10061 dout(20) << __func__ << " deferring small 0x" << std::hex
10062 << l->length() << std::dec << " write via deferred" << dendl;
10063 bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
10064 op->op = bluestore_deferred_op_t::OP_WRITE;
10065 int r = b->get_blob().map(
10066 b_off, l->length(),
10067 [&](uint64_t offset, uint64_t length) {
10068 op->extents.emplace_back(bluestore_pextent_t(offset, length));
10069 return 0;
10070 });
10071 assert(r == 0);
10072 op->data = *l;
10073 } else {
10074 b->get_blob().map_bl(
10075 b_off, *l,
10076 [&](uint64_t offset, bufferlist& t) {
10077 bdev->aio_write(offset, t, &txc->ioc, false);
10078 });
10079 }
10080 }
10081 }
10082 if (need > 0) {
10083 alloc->unreserve(need);
10084 }
10085 return 0;
10086 }
10087
10088 void BlueStore::_wctx_finish(
10089 TransContext *txc,
10090 CollectionRef& c,
10091 OnodeRef o,
10092 WriteContext *wctx,
10093 set<SharedBlob*> *maybe_unshared_blobs)
10094 {
10095 auto oep = wctx->old_extents.begin();
10096 while (oep != wctx->old_extents.end()) {
10097 auto &lo = *oep;
10098 oep = wctx->old_extents.erase(oep);
10099 dout(20) << __func__ << " lex_old " << lo.e << dendl;
10100 BlobRef b = lo.e.blob;
10101 const bluestore_blob_t& blob = b->get_blob();
10102 if (blob.is_compressed()) {
10103 if (lo.blob_empty) {
10104 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
10105 }
10106 txc->statfs_delta.compressed_original() -= lo.e.length;
10107 }
10108 auto& r = lo.r;
10109 txc->statfs_delta.stored() -= lo.e.length;
10110 if (!r.empty()) {
10111 dout(20) << __func__ << " blob release " << r << dendl;
10112 if (blob.is_shared()) {
10113 PExtentVector final;
10114 c->load_shared_blob(b->shared_blob);
10115 for (auto e : r) {
10116 b->shared_blob->put_ref(
10117 e.offset, e.length, &final,
10118 b->is_referenced() ? nullptr : maybe_unshared_blobs);
10119 }
10120 dout(20) << __func__ << " shared_blob release " << final
10121 << " from " << *b->shared_blob << dendl;
10122 txc->write_shared_blob(b->shared_blob);
10123 r.clear();
10124 r.swap(final);
10125 }
10126 }
10127 // we can't invalidate our logical extents as we drop them because
10128 // other lextents (either in our onode or others) may still
10129 // reference them. but we can throw out anything that is no
10130 // longer allocated. Note that this will leave behind edge bits
10131 // that are no longer referenced but not deallocated (until they
10132 // age out of the cache naturally).
10133 b->discard_unallocated(c.get());
10134 for (auto e : r) {
10135 dout(20) << __func__ << " release " << e << dendl;
10136 txc->released.insert(e.offset, e.length);
10137 txc->statfs_delta.allocated() -= e.length;
10138 if (blob.is_compressed()) {
10139 txc->statfs_delta.compressed_allocated() -= e.length;
10140 }
10141 }
10142 delete &lo;
10143 if (b->is_spanning() && !b->is_referenced()) {
10144 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
10145 << dendl;
10146 o->extent_map.spanning_blob_map.erase(b->id);
10147 }
10148 }
10149 }
10150
10151 void BlueStore::_do_write_data(
10152 TransContext *txc,
10153 CollectionRef& c,
10154 OnodeRef o,
10155 uint64_t offset,
10156 uint64_t length,
10157 bufferlist& bl,
10158 WriteContext *wctx)
10159 {
10160 uint64_t end = offset + length;
10161 bufferlist::iterator p = bl.begin();
10162
10163 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
10164 (length != min_alloc_size)) {
10165 // we fall within the same block
10166 _do_write_small(txc, c, o, offset, length, p, wctx);
10167 } else {
10168 uint64_t head_offset, head_length;
10169 uint64_t middle_offset, middle_length;
10170 uint64_t tail_offset, tail_length;
10171
10172 head_offset = offset;
10173 head_length = P2NPHASE(offset, min_alloc_size);
10174
10175 tail_offset = P2ALIGN(end, min_alloc_size);
10176 tail_length = P2PHASE(end, min_alloc_size);
10177
10178 middle_offset = head_offset + head_length;
10179 middle_length = length - head_length - tail_length;
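// Illustrative split (values not from the source): with min_alloc_size =
// 0x10000, a 0x23000-byte write at offset 0x2500 ends at 0x25500 and becomes
// head 0x2500~0xdb00 (small path), middle 0x10000~0x10000 (big path) and
// tail 0x20000~0x5500 (small path).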
10180
10181 if (head_length) {
10182 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
10183 }
10184
10185 if (middle_length) {
10186 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
10187 }
10188
10189 if (tail_length) {
10190 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
10191 }
10192 }
10193 }
10194
10195 void BlueStore::_choose_write_options(
10196 CollectionRef& c,
10197 OnodeRef o,
10198 uint32_t fadvise_flags,
10199 WriteContext *wctx)
10200 {
10201 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10202 dout(20) << __func__ << " will do buffered write" << dendl;
10203 wctx->buffered = true;
10204 } else if (cct->_conf->bluestore_default_buffered_write &&
10205 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10206 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10207 dout(20) << __func__ << " defaulting to buffered write" << dendl;
10208 wctx->buffered = true;
10209 }
10210
10211 // apply basic csum block size
10212 wctx->csum_order = block_size_order;
10213
10214 // compression parameters
10215 unsigned alloc_hints = o->onode.alloc_hint_flags;
10216 auto cm = select_option(
10217 "compression_mode",
10218 comp_mode.load(),
10219 [&]() {
10220 string val;
10221 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
10222 return boost::optional<Compressor::CompressionMode>(
10223 Compressor::get_comp_mode_type(val));
10224 }
10225 return boost::optional<Compressor::CompressionMode>();
10226 }
10227 );
10228
10229 wctx->compress = (cm != Compressor::COMP_NONE) &&
10230 ((cm == Compressor::COMP_FORCE) ||
10231 (cm == Compressor::COMP_AGGRESSIVE &&
10232 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
10233 (cm == Compressor::COMP_PASSIVE &&
10234 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
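// In other words: COMP_FORCE always compresses, COMP_AGGRESSIVE compresses
// unless the client hinted INCOMPRESSIBLE, COMP_PASSIVE compresses only when
// the client hinted COMPRESSIBLE, and COMP_NONE never compresses.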
10235
10236 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
10237 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
10238 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
10239 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
10240 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
10241
10242 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
10243
10244 if (o->onode.expected_write_size) {
10245 wctx->csum_order = std::max(min_alloc_size_order,
10246 (uint8_t)ctz(o->onode.expected_write_size));
10247 } else {
10248 wctx->csum_order = min_alloc_size_order;
10249 }
10250
10251 if (wctx->compress) {
10252 wctx->target_blob_size = select_option(
10253 "compression_max_blob_size",
10254 comp_max_blob_size.load(),
10255 [&]() {
10256 int val;
10257 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
10258 return boost::optional<uint64_t>((uint64_t)val);
10259 }
10260 return boost::optional<uint64_t>();
10261 }
10262 );
10263 }
10264 } else {
10265 if (wctx->compress) {
10266 wctx->target_blob_size = select_option(
10267 "compression_min_blob_size",
10268 comp_min_blob_size.load(),
10269 [&]() {
10270 int val;
10271 if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
10272 return boost::optional<uint64_t>((uint64_t)val);
10273 }
10274 return boost::optional<uint64_t>();
10275 }
10276 );
10277 }
10278 }
10279
10280 uint64_t max_bsize = max_blob_size.load();
10281 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
10282 wctx->target_blob_size = max_bsize;
10283 }
10284
10285 // set the min blob size floor at 2x the min_alloc_size, or else we
10286 // won't be able to allocate a smaller extent for the compressed
10287 // data.
10288 if (wctx->compress &&
10289 wctx->target_blob_size < min_alloc_size * 2) {
10290 wctx->target_blob_size = min_alloc_size * 2;
10291 }
10292
10293 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
10294 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
10295 << std::dec << dendl;
10296 }
10297
10298 int BlueStore::_do_gc(
10299 TransContext *txc,
10300 CollectionRef& c,
10301 OnodeRef o,
10302 const GarbageCollector& gc,
10303 const WriteContext& wctx,
10304 uint64_t *dirty_start,
10305 uint64_t *dirty_end)
10306 {
10307 auto& extents_to_collect = gc.get_extents_to_collect();
10308
10309 WriteContext wctx_gc;
10310 wctx_gc.fork(wctx); // make a clone for garbage collection
10311
10312 for (auto it = extents_to_collect.begin();
10313 it != extents_to_collect.end();
10314 ++it) {
10315 bufferlist bl;
10316 int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
10317 assert(r == (int)it->length);
10318
10319 o->extent_map.fault_range(db, it->offset, it->length);
10320 _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
10321 logger->inc(l_bluestore_gc_merged, it->length);
10322
10323 if (*dirty_start > it->offset) {
10324 *dirty_start = it->offset;
10325 }
10326
10327 if (*dirty_end < it->offset + it->length) {
10328 *dirty_end = it->offset + it->length;
10329 }
10330 }
10331
10332 dout(30) << __func__ << " alloc write" << dendl;
10333 int r = _do_alloc_write(txc, c, o, &wctx_gc);
10334 if (r < 0) {
10335 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10336 << dendl;
10337 return r;
10338 }
10339
10340 _wctx_finish(txc, c, o, &wctx_gc);
10341 return 0;
10342 }
10343
10344 int BlueStore::_do_write(
10345 TransContext *txc,
10346 CollectionRef& c,
10347 OnodeRef o,
10348 uint64_t offset,
10349 uint64_t length,
10350 bufferlist& bl,
10351 uint32_t fadvise_flags)
10352 {
10353 int r = 0;
10354
10355 dout(20) << __func__
10356 << " " << o->oid
10357 << " 0x" << std::hex << offset << "~" << length
10358 << " - have 0x" << o->onode.size
10359 << " (" << std::dec << o->onode.size << ")"
10360 << " bytes"
10361 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
10362 << dendl;
10363 _dump_onode(o);
10364
10365 if (length == 0) {
10366 return 0;
10367 }
10368
10369 uint64_t end = offset + length;
10370
10371 GarbageCollector gc(c->store->cct);
10372 int64_t benefit;
10373 auto dirty_start = offset;
10374 auto dirty_end = end;
10375
10376 WriteContext wctx;
10377 _choose_write_options(c, o, fadvise_flags, &wctx);
10378 o->extent_map.fault_range(db, offset, length);
10379 _do_write_data(txc, c, o, offset, length, bl, &wctx);
10380 r = _do_alloc_write(txc, c, o, &wctx);
10381 if (r < 0) {
10382 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
10383 << dendl;
10384 goto out;
10385 }
10386
10387 // NB: _wctx_finish() will empty old_extents
10388 // so we must do gc estimation before that
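// The returned benefit is an estimate, expressed in allocation units (AUs),
// of the space that could be reclaimed by rewriting the blobs overlapped by
// this write; it is compared against bluestore_gc_enable_total_threshold
// below before _do_gc() is invoked.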
10389 benefit = gc.estimate(offset,
10390 length,
10391 o->extent_map,
10392 wctx.old_extents,
10393 min_alloc_size);
10394
10395 _wctx_finish(txc, c, o, &wctx);
10396 if (end > o->onode.size) {
10397 dout(20) << __func__ << " extending size to 0x" << std::hex << end
10398 << std::dec << dendl;
10399 o->onode.size = end;
10400 }
10401
10402 if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
10403 if (!gc.get_extents_to_collect().empty()) {
10404 dout(20) << __func__ << " perform garbage collection, "
10405 << "expected benefit = " << benefit << " AUs" << dendl;
10406 r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
10407 if (r < 0) {
10408 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
10409 << dendl;
10410 goto out;
10411 }
10412 }
10413 }
10414
10415 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
10416 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
10417
10418 r = 0;
10419
10420 out:
10421 return r;
10422 }
10423
10424 int BlueStore::_write(TransContext *txc,
10425 CollectionRef& c,
10426 OnodeRef& o,
10427 uint64_t offset, size_t length,
10428 bufferlist& bl,
10429 uint32_t fadvise_flags)
10430 {
10431 dout(15) << __func__ << " " << c->cid << " " << o->oid
10432 << " 0x" << std::hex << offset << "~" << length << std::dec
10433 << dendl;
10434 int r = 0;
10435 if (offset + length >= OBJECT_MAX_SIZE) {
10436 r = -E2BIG;
10437 } else {
10438 _assign_nid(txc, o);
10439 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
10440 txc->write_onode(o);
10441 }
10442 dout(10) << __func__ << " " << c->cid << " " << o->oid
10443 << " 0x" << std::hex << offset << "~" << length << std::dec
10444 << " = " << r << dendl;
10445 return r;
10446 }
10447
10448 int BlueStore::_zero(TransContext *txc,
10449 CollectionRef& c,
10450 OnodeRef& o,
10451 uint64_t offset, size_t length)
10452 {
10453 dout(15) << __func__ << " " << c->cid << " " << o->oid
10454 << " 0x" << std::hex << offset << "~" << length << std::dec
10455 << dendl;
10456 int r = 0;
10457 if (offset + length >= OBJECT_MAX_SIZE) {
10458 r = -E2BIG;
10459 } else {
10460 _assign_nid(txc, o);
10461 r = _do_zero(txc, c, o, offset, length);
10462 }
10463 dout(10) << __func__ << " " << c->cid << " " << o->oid
10464 << " 0x" << std::hex << offset << "~" << length << std::dec
10465 << " = " << r << dendl;
10466 return r;
10467 }
10468
10469 int BlueStore::_do_zero(TransContext *txc,
10470 CollectionRef& c,
10471 OnodeRef& o,
10472 uint64_t offset, size_t length)
10473 {
10474 dout(15) << __func__ << " " << c->cid << " " << o->oid
10475 << " 0x" << std::hex << offset << "~" << length << std::dec
10476 << dendl;
10477 int r = 0;
10478
10479 _dump_onode(o);
10480
10481 WriteContext wctx;
10482 o->extent_map.fault_range(db, offset, length);
10483 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10484 o->extent_map.dirty_range(offset, length);
10485 _wctx_finish(txc, c, o, &wctx);
10486
10487 if (offset + length > o->onode.size) {
10488 o->onode.size = offset + length;
10489 dout(20) << __func__ << " extending size to " << offset + length
10490 << dendl;
10491 }
10492 txc->write_onode(o);
10493
10494 dout(10) << __func__ << " " << c->cid << " " << o->oid
10495 << " 0x" << std::hex << offset << "~" << length << std::dec
10496 << " = " << r << dendl;
10497 return r;
10498 }
10499
10500 void BlueStore::_do_truncate(
10501 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
10502 set<SharedBlob*> *maybe_unshared_blobs)
10503 {
10504 dout(15) << __func__ << " " << c->cid << " " << o->oid
10505 << " 0x" << std::hex << offset << std::dec << dendl;
10506
10507 _dump_onode(o, 30);
10508
10509 if (offset == o->onode.size)
10510 return;
10511
10512 if (offset < o->onode.size) {
10513 WriteContext wctx;
10514 uint64_t length = o->onode.size - offset;
10515 o->extent_map.fault_range(db, offset, length);
10516 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
10517 o->extent_map.dirty_range(offset, length);
10518 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
10519
10520 // if we have shards past EOF, ask for a reshard
10521 if (!o->onode.extent_map_shards.empty() &&
10522 o->onode.extent_map_shards.back().offset >= offset) {
10523 dout(10) << __func__ << " request reshard past EOF" << dendl;
10524 if (offset) {
10525 o->extent_map.request_reshard(offset - 1, offset + length);
10526 } else {
10527 o->extent_map.request_reshard(0, length);
10528 }
10529 }
10530 }
10531
10532 o->onode.size = offset;
10533
10534 txc->write_onode(o);
10535 }
10536
10537 int BlueStore::_truncate(TransContext *txc,
10538 CollectionRef& c,
10539 OnodeRef& o,
10540 uint64_t offset)
10541 {
10542 dout(15) << __func__ << " " << c->cid << " " << o->oid
10543 << " 0x" << std::hex << offset << std::dec
10544 << dendl;
10545 int r = 0;
10546 if (offset >= OBJECT_MAX_SIZE) {
10547 r = -E2BIG;
10548 } else {
10549 _do_truncate(txc, c, o, offset);
10550 }
10551 dout(10) << __func__ << " " << c->cid << " " << o->oid
10552 << " 0x" << std::hex << offset << std::dec
10553 << " = " << r << dendl;
10554 return r;
10555 }
10556
10557 int BlueStore::_do_remove(
10558 TransContext *txc,
10559 CollectionRef& c,
10560 OnodeRef o)
10561 {
10562 set<SharedBlob*> maybe_unshared_blobs;
10563 bool is_gen = !o->oid.is_no_gen();
10564 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
10565 if (o->onode.has_omap()) {
10566 o->flush();
10567 _do_omap_clear(txc, o->onode.nid);
10568 }
10569 o->exists = false;
10570 string key;
10571 for (auto &s : o->extent_map.shards) {
10572 dout(20) << __func__ << " removing shard 0x" << std::hex
10573 << s.shard_info->offset << std::dec << dendl;
10574 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
10575 [&](const string& final_key) {
10576 txc->t->rmkey(PREFIX_OBJ, final_key);
10577 }
10578 );
10579 }
10580 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
10581 txc->removed(o);
10582 o->extent_map.clear();
10583 o->onode = bluestore_onode_t();
10584 _debug_obj_on_delete(o->oid);
10585
10586 if (!is_gen || maybe_unshared_blobs.empty()) {
10587 return 0;
10588 }
10589
10590 // see if we can unshare blobs still referenced by the head
10591 dout(10) << __func__ << " gen and maybe_unshared_blobs "
10592 << maybe_unshared_blobs << dendl;
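// Strategy: for every shared blob in maybe_unshared_blobs that the head
// object still references, accumulate the refs the head would hold
// (expect[sb]); if that matches the blob's persistent ref_map exactly, the
// head is the sole user, so the blob can be unshared and its
// PREFIX_SHARED_BLOB key removed.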
10593 ghobject_t nogen = o->oid;
10594 nogen.generation = ghobject_t::NO_GEN;
10595 OnodeRef h = c->onode_map.lookup(nogen);
10596
10597 if (!h || !h->exists) {
10598 return 0;
10599 }
10600
10601 dout(20) << __func__ << " checking for unshareable blobs on " << h
10602 << " " << h->oid << dendl;
10603 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
10604 for (auto& e : h->extent_map.extent_map) {
10605 const bluestore_blob_t& b = e.blob->get_blob();
10606 SharedBlob *sb = e.blob->shared_blob.get();
10607 if (b.is_shared() &&
10608 sb->loaded &&
10609 maybe_unshared_blobs.count(sb)) {
10610 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
10611 expect[sb].get(off, len);
10612 return 0;
10613 });
10614 }
10615 }
10616
10617 vector<SharedBlob*> unshared_blobs;
10618 unshared_blobs.reserve(maybe_unshared_blobs.size());
10619 for (auto& p : expect) {
10620 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
10621 if (p.first->persistent->ref_map == p.second) {
10622 SharedBlob *sb = p.first;
10623 dout(20) << __func__ << " unsharing " << *sb << dendl;
10624 unshared_blobs.push_back(sb);
10625 txc->unshare_blob(sb);
10626 uint64_t sbid = c->make_blob_unshared(sb);
10627 string key;
10628 get_shared_blob_key(sbid, &key);
10629 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
10630 }
10631 }
10632
10633 if (unshared_blobs.empty()) {
10634 return 0;
10635 }
10636
10637 for (auto& e : h->extent_map.extent_map) {
10638 const bluestore_blob_t& b = e.blob->get_blob();
10639 SharedBlob *sb = e.blob->shared_blob.get();
10640 if (b.is_shared() &&
10641 std::find(unshared_blobs.begin(), unshared_blobs.end(),
10642 sb) != unshared_blobs.end()) {
10643 dout(20) << __func__ << " unsharing " << e << dendl;
10644 bluestore_blob_t& blob = e.blob->dirty_blob();
10645 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
10646 h->extent_map.dirty_range(e.logical_offset, 1);
10647 }
10648 }
10649 txc->write_onode(h);
10650
10651 return 0;
10652 }
10653
10654 int BlueStore::_remove(TransContext *txc,
10655 CollectionRef& c,
10656 OnodeRef &o)
10657 {
10658 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10659 int r = _do_remove(txc, c, o);
10660 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10661 return r;
10662 }
10663
10664 int BlueStore::_setattr(TransContext *txc,
10665 CollectionRef& c,
10666 OnodeRef& o,
10667 const string& name,
10668 bufferptr& val)
10669 {
10670 dout(15) << __func__ << " " << c->cid << " " << o->oid
10671 << " " << name << " (" << val.length() << " bytes)"
10672 << dendl;
10673 int r = 0;
10674 if (val.is_partial())
10675 o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), val.length());
10676 else
10677 o->onode.attrs[name.c_str()] = val;
10678 txc->write_onode(o);
10679 dout(10) << __func__ << " " << c->cid << " " << o->oid
10680 << " " << name << " (" << val.length() << " bytes)"
10681 << " = " << r << dendl;
10682 return r;
10683 }
10684
10685 int BlueStore::_setattrs(TransContext *txc,
10686 CollectionRef& c,
10687 OnodeRef& o,
10688 const map<string,bufferptr>& aset)
10689 {
10690 dout(15) << __func__ << " " << c->cid << " " << o->oid
10691 << " " << aset.size() << " keys"
10692 << dendl;
10693 int r = 0;
10694 for (map<string,bufferptr>::const_iterator p = aset.begin();
10695 p != aset.end(); ++p) {
10696 if (p->second.is_partial())
10697 o->onode.attrs[p->first.c_str()] =
10698 bufferptr(p->second.c_str(), p->second.length());
10699 else
10700 o->onode.attrs[p->first.c_str()] = p->second;
10701 }
10702 txc->write_onode(o);
10703 dout(10) << __func__ << " " << c->cid << " " << o->oid
10704 << " " << aset.size() << " keys"
10705 << " = " << r << dendl;
10706 return r;
10707 }
10708
10709
10710 int BlueStore::_rmattr(TransContext *txc,
10711 CollectionRef& c,
10712 OnodeRef& o,
10713 const string& name)
10714 {
10715 dout(15) << __func__ << " " << c->cid << " " << o->oid
10716 << " " << name << dendl;
10717 int r = 0;
10718 auto it = o->onode.attrs.find(name.c_str());
10719 if (it == o->onode.attrs.end())
10720 goto out;
10721
10722 o->onode.attrs.erase(it);
10723 txc->write_onode(o);
10724
10725 out:
10726 dout(10) << __func__ << " " << c->cid << " " << o->oid
10727 << " " << name << " = " << r << dendl;
10728 return r;
10729 }
10730
10731 int BlueStore::_rmattrs(TransContext *txc,
10732 CollectionRef& c,
10733 OnodeRef& o)
10734 {
10735 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10736 int r = 0;
10737
10738 if (o->onode.attrs.empty())
10739 goto out;
10740
10741 o->onode.attrs.clear();
10742 txc->write_onode(o);
10743
10744 out:
10745 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10746 return r;
10747 }
10748
10749 void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
10750 {
10751 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10752 string prefix, tail;
10753 get_omap_header(id, &prefix);
10754 get_omap_tail(id, &tail);
10755 it->lower_bound(prefix);
10756 while (it->valid()) {
10757 if (it->key() >= tail) {
10758 dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
10759 << dendl;
10760 break;
10761 }
10762 txc->t->rmkey(PREFIX_OMAP, it->key());
10763 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10764 it->next();
10765 }
10766 }
10767
10768 int BlueStore::_omap_clear(TransContext *txc,
10769 CollectionRef& c,
10770 OnodeRef& o)
10771 {
10772 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10773 int r = 0;
10774 if (o->onode.has_omap()) {
10775 o->flush();
10776 _do_omap_clear(txc, o->onode.nid);
10777 o->onode.clear_omap_flag();
10778 txc->write_onode(o);
10779 }
10780 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10781 return r;
10782 }
10783
10784 int BlueStore::_omap_setkeys(TransContext *txc,
10785 CollectionRef& c,
10786 OnodeRef& o,
10787 bufferlist &bl)
10788 {
10789 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10790 int r;
10791 bufferlist::iterator p = bl.begin();
10792 __u32 num;
10793 if (!o->onode.has_omap()) {
10794 o->onode.set_omap_flag();
10795 txc->write_onode(o);
10796 } else {
10797 txc->note_modified_object(o);
10798 }
10799 string final_key;
10800 _key_encode_u64(o->onode.nid, &final_key);
10801 final_key.push_back('.');
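// Omap keys are laid out as <u64 nid><'.'><user key>; the 9-byte prefix
// (8 bytes of encoded nid plus the '.') is kept across iterations via
// final_key.resize(9), so only the user-key portion is re-appended per entry.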
10802 ::decode(num, p);
10803 while (num--) {
10804 string key;
10805 bufferlist value;
10806 ::decode(key, p);
10807 ::decode(value, p);
10808 final_key.resize(9); // keep prefix
10809 final_key += key;
10810 dout(30) << __func__ << " " << pretty_binary_string(final_key)
10811 << " <- " << key << dendl;
10812 txc->t->set(PREFIX_OMAP, final_key, value);
10813 }
10814 r = 0;
10815 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10816 return r;
10817 }
10818
10819 int BlueStore::_omap_setheader(TransContext *txc,
10820 CollectionRef& c,
10821 OnodeRef &o,
10822 bufferlist& bl)
10823 {
10824 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10825 int r;
10826 string key;
10827 if (!o->onode.has_omap()) {
10828 o->onode.set_omap_flag();
10829 txc->write_onode(o);
10830 } else {
10831 txc->note_modified_object(o);
10832 }
10833 get_omap_header(o->onode.nid, &key);
10834 txc->t->set(PREFIX_OMAP, key, bl);
10835 r = 0;
10836 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10837 return r;
10838 }
10839
10840 int BlueStore::_omap_rmkeys(TransContext *txc,
10841 CollectionRef& c,
10842 OnodeRef& o,
10843 bufferlist& bl)
10844 {
10845 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10846 int r = 0;
10847 bufferlist::iterator p = bl.begin();
10848 __u32 num;
10849 string final_key;
10850
10851 if (!o->onode.has_omap()) {
10852 goto out;
10853 }
10854 _key_encode_u64(o->onode.nid, &final_key);
10855 final_key.push_back('.');
10856 ::decode(num, p);
10857 while (num--) {
10858 string key;
10859 ::decode(key, p);
10860 final_key.resize(9); // keep prefix
10861 final_key += key;
10862 dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
10863 << " <- " << key << dendl;
10864 txc->t->rmkey(PREFIX_OMAP, final_key);
10865 }
10866 txc->note_modified_object(o);
10867
10868 out:
10869 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10870 return r;
10871 }
10872
10873 int BlueStore::_omap_rmkey_range(TransContext *txc,
10874 CollectionRef& c,
10875 OnodeRef& o,
10876 const string& first, const string& last)
10877 {
10878 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
10879 KeyValueDB::Iterator it;
10880 string key_first, key_last;
10881 int r = 0;
10882 if (!o->onode.has_omap()) {
10883 goto out;
10884 }
10885 o->flush();
10886 it = db->get_iterator(PREFIX_OMAP);
10887 get_omap_key(o->onode.nid, first, &key_first);
10888 get_omap_key(o->onode.nid, last, &key_last);
10889 it->lower_bound(key_first);
10890 while (it->valid()) {
10891 if (it->key() >= key_last) {
10892 dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
10893 << dendl;
10894 break;
10895 }
10896 txc->t->rmkey(PREFIX_OMAP, it->key());
10897 dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
10898 it->next();
10899 }
10900 txc->note_modified_object(o);
10901
10902 out:
10903 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
10904 return r;
10905 }
10906
10907 int BlueStore::_set_alloc_hint(
10908 TransContext *txc,
10909 CollectionRef& c,
10910 OnodeRef& o,
10911 uint64_t expected_object_size,
10912 uint64_t expected_write_size,
10913 uint32_t flags)
10914 {
10915 dout(15) << __func__ << " " << c->cid << " " << o->oid
10916 << " object_size " << expected_object_size
10917 << " write_size " << expected_write_size
10918 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
10919 << dendl;
10920 int r = 0;
10921 o->onode.expected_object_size = expected_object_size;
10922 o->onode.expected_write_size = expected_write_size;
10923 o->onode.alloc_hint_flags = flags;
10924 txc->write_onode(o);
10925 dout(10) << __func__ << " " << c->cid << " " << o->oid
10926 << " object_size " << expected_object_size
10927 << " write_size " << expected_write_size
10928 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
10929 << " = " << r << dendl;
10930 return r;
10931 }
10932
10933 int BlueStore::_clone(TransContext *txc,
10934 CollectionRef& c,
10935 OnodeRef& oldo,
10936 OnodeRef& newo)
10937 {
10938 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
10939 << newo->oid << dendl;
10940 int r = 0;
10941 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
10942 derr << __func__ << " mismatched hash on " << oldo->oid
10943 << " and " << newo->oid << dendl;
10944 return -EINVAL;
10945 }
10946
10947 _assign_nid(txc, newo);
10948
10949 // clone data
10950 oldo->flush();
10951 _do_truncate(txc, c, newo, 0);
10952 if (cct->_conf->bluestore_clone_cow) {
10953 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
10954 } else {
10955 bufferlist bl;
10956 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
10957 if (r < 0)
10958 goto out;
10959 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
10960 if (r < 0)
10961 goto out;
10962 }
10963
10964 // clone attrs
10965 newo->onode.attrs = oldo->onode.attrs;
10966
10967 // clone omap
10968 if (newo->onode.has_omap()) {
10969 dout(20) << __func__ << " clearing old omap data" << dendl;
10970 newo->flush();
10971 _do_omap_clear(txc, newo->onode.nid);
10972 }
10973 if (oldo->onode.has_omap()) {
10974 dout(20) << __func__ << " copying omap data" << dendl;
10975 if (!newo->onode.has_omap()) {
10976 newo->onode.set_omap_flag();
10977 }
10978 KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
10979 string head, tail;
10980 get_omap_header(oldo->onode.nid, &head);
10981 get_omap_tail(oldo->onode.nid, &tail);
10982 it->lower_bound(head);
10983 while (it->valid()) {
10984 if (it->key() >= tail) {
10985 dout(30) << __func__ << " reached tail" << dendl;
10986 break;
10987 } else {
10988 dout(30) << __func__ << " got header/data "
10989 << pretty_binary_string(it->key()) << dendl;
10990 string key;
10991 rewrite_omap_key(newo->onode.nid, it->key(), &key);
10992 txc->t->set(PREFIX_OMAP, key, it->value());
10993 }
10994 it->next();
10995 }
10996 } else {
10997 newo->onode.clear_omap_flag();
10998 }
10999
11000 txc->write_onode(newo);
11001 r = 0;
11002
11003 out:
11004 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11005 << newo->oid << " = " << r << dendl;
11006 return r;
11007 }
11008
11009 int BlueStore::_do_clone_range(
11010 TransContext *txc,
11011 CollectionRef& c,
11012 OnodeRef& oldo,
11013 OnodeRef& newo,
11014 uint64_t srcoff,
11015 uint64_t length,
11016 uint64_t dstoff)
11017 {
11018 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11019 << newo->oid
11020 << " 0x" << std::hex << srcoff << "~" << length << " -> "
11021 << " 0x" << dstoff << "~" << length << std::dec << dendl;
11022 oldo->extent_map.fault_range(db, srcoff, length);
11023 newo->extent_map.fault_range(db, dstoff, length);
11024 _dump_onode(oldo);
11025 _dump_onode(newo);
11026
11027 // hmm, this could go into an ExtentMap::dup() method.
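// Outline of the copy: every source blob in the range is marked shared (if
// it is not already), duplicated exactly once (last_encoded_id/id_to_blob
// memoize the dup), the shared blob's refs are bumped for the copied
// pextents, and a trimmed Extent pointing at the duplicate is inserted into
// the destination map shifted by dstoff - srcoff.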
11028 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
11029 for (auto &e : oldo->extent_map.extent_map) {
11030 e.blob->last_encoded_id = -1;
11031 }
11032 int n = 0;
11033 uint64_t end = srcoff + length;
11034 uint32_t dirty_range_begin = 0;
11035 uint32_t dirty_range_end = 0;
11036 bool src_dirty = false;
11037 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
11038 ep != oldo->extent_map.extent_map.end();
11039 ++ep) {
11040 auto& e = *ep;
11041 if (e.logical_offset >= end) {
11042 break;
11043 }
11044 dout(20) << __func__ << " src " << e << dendl;
11045 BlobRef cb;
11046 bool blob_duped = true;
11047 if (e.blob->last_encoded_id >= 0) {
11048 // blob is already duped
11049 cb = id_to_blob[e.blob->last_encoded_id];
11050 blob_duped = false;
11051 } else {
11052 // dup the blob
11053 const bluestore_blob_t& blob = e.blob->get_blob();
11054 // make sure it is shared
11055 if (!blob.is_shared()) {
11056 c->make_blob_shared(_assign_blobid(txc), e.blob);
11057 if (!src_dirty) {
11058 src_dirty = true;
11059 dirty_range_begin = e.logical_offset;
11060 }
11061 assert(e.logical_end() > 0);
11062 // -1 to exclude next potential shard
11063 dirty_range_end = e.logical_end() - 1;
11064 } else {
11065 c->load_shared_blob(e.blob->shared_blob);
11066 }
11067 cb = new Blob();
11068 e.blob->last_encoded_id = n;
11069 id_to_blob[n] = cb;
11070 e.blob->dup(*cb);
11071 // bump the extent refs on the copied blob's extents
11072 for (auto p : blob.get_extents()) {
11073 if (p.is_valid()) {
11074 e.blob->shared_blob->get_ref(p.offset, p.length);
11075 }
11076 }
11077 txc->write_shared_blob(e.blob->shared_blob);
11078 dout(20) << __func__ << " new " << *cb << dendl;
11079 }
11080 // dup extent
11081 int skip_front, skip_back;
11082 if (e.logical_offset < srcoff) {
11083 skip_front = srcoff - e.logical_offset;
11084 } else {
11085 skip_front = 0;
11086 }
11087 if (e.logical_end() > end) {
11088 skip_back = e.logical_end() - end;
11089 } else {
11090 skip_back = 0;
11091 }
11092 Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
11093 e.blob_offset + skip_front,
11094 e.length - skip_front - skip_back, cb);
11095 newo->extent_map.extent_map.insert(*ne);
11096 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
11097 // fixme: we may leave parts of new blob unreferenced that could
11098 // be freed (relative to the shared_blob).
11099 txc->statfs_delta.stored() += ne->length;
11100 if (e.blob->get_blob().is_compressed()) {
11101 txc->statfs_delta.compressed_original() += ne->length;
11102 if (blob_duped){
11103 txc->statfs_delta.compressed() +=
11104 cb->get_blob().get_compressed_payload_length();
11105 }
11106 }
11107 dout(20) << __func__ << " dst " << *ne << dendl;
11108 ++n;
11109 }
11110 if (src_dirty) {
11111 oldo->extent_map.dirty_range(dirty_range_begin,
11112 dirty_range_end - dirty_range_begin);
11113 txc->write_onode(oldo);
11114 }
11115 txc->write_onode(newo);
11116
11117 if (dstoff + length > newo->onode.size) {
11118 newo->onode.size = dstoff + length;
11119 }
11120 newo->extent_map.dirty_range(dstoff, length);
11121 _dump_onode(oldo);
11122 _dump_onode(newo);
11123 return 0;
11124 }
11125
11126 int BlueStore::_clone_range(TransContext *txc,
11127 CollectionRef& c,
11128 OnodeRef& oldo,
11129 OnodeRef& newo,
11130 uint64_t srcoff, uint64_t length, uint64_t dstoff)
11131 {
11132 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11133 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11134 << " to offset 0x" << dstoff << std::dec << dendl;
11135 int r = 0;
11136
11137 if (srcoff + length >= OBJECT_MAX_SIZE ||
11138 dstoff + length >= OBJECT_MAX_SIZE) {
11139 r = -E2BIG;
11140 goto out;
11141 }
11142 if (srcoff + length > oldo->onode.size) {
11143 r = -EINVAL;
11144 goto out;
11145 }
11146
11147 _assign_nid(txc, newo);
11148
11149 if (length > 0) {
11150 if (cct->_conf->bluestore_clone_cow) {
11151 _do_zero(txc, c, newo, dstoff, length);
11152 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
11153 } else {
11154 bufferlist bl;
11155 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
11156 if (r < 0)
11157 goto out;
11158 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
11159 if (r < 0)
11160 goto out;
11161 }
11162 }
11163
11164 txc->write_onode(newo);
11165 r = 0;
11166
11167 out:
11168 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11169 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
11170 << " to offset 0x" << dstoff << std::dec
11171 << " = " << r << dendl;
11172 return r;
11173 }
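// A minimal caller-side sketch (not part of this file) of the transaction op
// that is routed to _clone_range() when BlueStore applies it. Assumes the
// ObjectStore::Transaction::clone_range() overload from this release and that
// `cid`, `src_oid` and `dst_oid` already exist.
#if 0
ObjectStore::Transaction t;
// copy 1 MiB from offset 0 of src_oid to offset 4 MiB of dst_oid
t.clone_range(cid, src_oid, dst_oid, 0, 1ull << 20, 4ull << 20);
#endif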
11174
11175 int BlueStore::_rename(TransContext *txc,
11176 CollectionRef& c,
11177 OnodeRef& oldo,
11178 OnodeRef& newo,
11179 const ghobject_t& new_oid)
11180 {
11181 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
11182 << new_oid << dendl;
11183 int r;
11184 ghobject_t old_oid = oldo->oid;
11185 mempool::bluestore_cache_other::string new_okey;
11186
11187 if (newo) {
11188 if (newo->exists) {
11189 r = -EEXIST;
11190 goto out;
11191 }
11192 assert(txc->onodes.count(newo) == 0);
11193 }
11194
11195 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
11196
11197 // rewrite shards
11198 {
11199 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
11200 get_object_key(cct, new_oid, &new_okey);
11201 string key;
11202 for (auto &s : oldo->extent_map.shards) {
11203 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
11204 [&](const string& final_key) {
11205 txc->t->rmkey(PREFIX_OBJ, final_key);
11206 }
11207 );
11208 s.dirty = true;
11209 }
11210 }
11211
11212 newo = oldo;
11213 txc->write_onode(newo);
11214
11215 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
11216 // Onode in the old slot
11217 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
11218 r = 0;
11219
11220 out:
11221 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
11222 << new_oid << " = " << r << dendl;
11223 return r;
11224 }
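// A minimal caller-side sketch (not part of this file): _rename() is reached
// when a transaction renames an object within the same collection, e.g. via
// ObjectStore::Transaction::try_rename() (assumed interface; `cid`, `old_oid`
// and `new_oid` are placeholders).
#if 0
ObjectStore::Transaction t;
t.try_rename(cid, old_oid, new_oid);
#endif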
11225
11226 // collections
11227
11228 int BlueStore::_create_collection(
11229 TransContext *txc,
11230 const coll_t &cid,
11231 unsigned bits,
11232 CollectionRef *c)
11233 {
11234 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
11235 int r;
11236 bufferlist bl;
11237
11238 {
11239 RWLock::WLocker l(coll_lock);
11240 if (*c) {
11241 r = -EEXIST;
11242 goto out;
11243 }
11244 c->reset(
11245 new Collection(
11246 this,
11247 cache_shards[cid.hash_to_shard(cache_shards.size())],
11248 cid));
11249 (*c)->cnode.bits = bits;
11250 coll_map[cid] = *c;
11251 }
11252 ::encode((*c)->cnode, bl);
11253 txc->t->set(PREFIX_COLL, stringify(cid), bl);
11254 r = 0;
11255
11256 out:
11257 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
11258 return r;
11259 }
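// A minimal caller-side sketch (not part of this file) of the op dispatched
// to _create_collection(). `cid` and `bits` are placeholders; the OSD
// normally passes the PG's current split bits.
#if 0
ObjectStore::Transaction t;
t.create_collection(cid, bits);
#endif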
11260
11261 int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
11262 CollectionRef *c)
11263 {
11264 dout(15) << __func__ << " " << cid << dendl;
11265 int r;
11266
11267 {
11268 RWLock::WLocker l(coll_lock);
11269 if (!*c) {
11270 r = -ENOENT;
11271 goto out;
11272 }
11273 size_t nonexistent_count = 0;
11274 assert((*c)->exists);
11275 if ((*c)->onode_map.map_any([&](OnodeRef o) {
11276 if (o->exists) {
11277 dout(10) << __func__ << " " << o->oid << " " << o
11278 << " exists in onode_map" << dendl;
11279 return true;
11280 }
11281 ++nonexistent_count;
11282 return false;
11283 })) {
11284 r = -ENOTEMPTY;
11285 goto out;
11286 }
11287
11288 vector<ghobject_t> ls;
11289 ghobject_t next;
11290 // Enumerate onodes in the db, up to nonexistent_count + 1,
11291 // then check whether all of them are marked as non-existent.
11292 // Bypass the check if the returned number is greater than nonexistent_count.
11293 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
11294 nonexistent_count + 1, &ls, &next);
11295 if (r >= 0) {
11296 bool exists = false; //ls.size() > nonexistent_count;
11297 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
11298 dout(10) << __func__ << " oid " << *it << dendl;
11299 auto onode = (*c)->onode_map.lookup(*it);
11300 exists = !onode || onode->exists;
11301 if (exists) {
11302 dout(10) << __func__ << " " << *it
11303 << " exists in db" << dendl;
11304 }
11305 }
11306 if (!exists) {
11307 coll_map.erase(cid);
11308 txc->removed_collections.push_back(*c);
11309 (*c)->exists = false;
11310 c->reset();
11311 txc->t->rmkey(PREFIX_COLL, stringify(cid));
11312 r = 0;
11313 } else {
11314 dout(10) << __func__ << " " << cid
11315 << " is non-empty" << dendl;
11316 r = -ENOTEMPTY;
11317 }
11318 }
11319 }
11320
11321 out:
11322 dout(10) << __func__ << " " << cid << " = " << r << dendl;
11323 return r;
11324 }
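// A sketch (illustrative only, hypothetical helper) of the emptiness test
// above: the collection is only removed if every onode returned by the db
// listing is also cached in onode_map and marked !exists.
#if 0
static bool collection_empty_for_removal(BlueStore::Collection *c,
                                         const vector<ghobject_t>& ls)
{
  for (auto& oid : ls) {
    auto o = c->onode_map.lookup(oid);
    if (!o || o->exists)
      return false;   // unknown to the cache, or still live -> not empty
  }
  return true;
}
#endif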
11325
11326 int BlueStore::_split_collection(TransContext *txc,
11327 CollectionRef& c,
11328 CollectionRef& d,
11329 unsigned bits, int rem)
11330 {
11331 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
11332 << " bits " << bits << dendl;
11333 RWLock::WLocker l(c->lock);
11334 RWLock::WLocker l2(d->lock);
11335 int r;
11336
11337 // flush all previous deferred writes on this sequencer. this is a bit
11338 // heavyweight, but we need to make sure all deferred writes complete
11339 // before we split as the new collection's sequencer may need to order
11340 // this after those writes, and we don't bother with the complexity of
11341 // moving those TransContexts over to the new osr.
11342 _osr_drain_preceding(txc);
11343
11344 // move any cached items (onodes and referenced shared blobs) that will
11345 // belong to the child collection post-split. leave everything else behind.
11346 // this may include things that don't strictly belong to the now-smaller
11347 // parent split, but the OSD will always send us a split for every new
11348 // child.
11349
11350 spg_t pgid, dest_pgid;
11351 bool is_pg = c->cid.is_pg(&pgid);
11352 assert(is_pg);
11353 is_pg = d->cid.is_pg(&dest_pgid);
11354 assert(is_pg);
11355
11356 // the destination should initially be empty.
11357 assert(d->onode_map.empty());
11358 assert(d->shared_blob_set.empty());
11359 assert(d->cnode.bits == bits);
11360
11361 c->split_cache(d.get());
11362
11363 // adjust bits. note that this will be redundant for all but the first
11364 // split call for this parent (first child).
11365 c->cnode.bits = bits;
11366 assert(d->cnode.bits == bits);
11367 r = 0;
11368
11369 bufferlist bl;
11370 ::encode(c->cnode, bl);
11371 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
11372
11373 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
11374 << " bits " << bits << " = " << r << dendl;
11375 return r;
11376 }
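// A minimal caller-side sketch (not part of this file) of the split op that
// reaches _split_collection(). Assumes the ObjectStore::Transaction::
// split_collection() overload from this release; after the split, objects
// whose hash matches `rem` in the low `bits` bits belong to the child.
#if 0
ObjectStore::Transaction t;
t.split_collection(parent_cid, bits, rem, child_cid);
#endif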
11377
11378 // DB key value Histogram
11379 #define KEY_SLAB 32
11380 #define VALUE_SLAB 64
11381
11382 const string prefix_onode = "o";
11383 const string prefix_onode_shard = "x";
11384 const string prefix_other = "Z";
11385
11386 int BlueStore::DBHistogram::get_key_slab(size_t sz)
11387 {
11388 return (sz/KEY_SLAB);
11389 }
11390
11391 string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
11392 {
11393 int lower_bound = slab * KEY_SLAB;
11394 int upper_bound = (slab + 1) * KEY_SLAB;
11395 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11396 return ret;
11397 }
11398
11399 int BlueStore::DBHistogram::get_value_slab(size_t sz)
11400 {
11401 return (sz/VALUE_SLAB);
11402 }
11403
11404 string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
11405 {
11406 int lower_bound = slab * VALUE_SLAB;
11407 int upper_bound = (slab + 1) * VALUE_SLAB;
11408 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
11409 return ret;
11410 }
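// Worked example of the slab bucketing above (KEY_SLAB = 32, VALUE_SLAB = 64):
// a 70-byte key lands in key slab 70/32 = 2, reported as "[64,96)", and a
// 200-byte value lands in value slab 200/64 = 3, reported as "[192,256)".
#if 0
BlueStore::DBHistogram h;
assert(h.get_key_slab(70) == 2);
assert(h.get_key_slab_to_range(2) == "[64,96)");
assert(h.get_value_slab(200) == 3);
assert(h.get_value_slab_to_range(3) == "[192,256)");
#endif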
11411
11412 void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
11413 const string &prefix, size_t key_size, size_t value_size)
11414 {
11415 uint32_t key_slab = get_key_slab(key_size);
11416 uint32_t value_slab = get_value_slab(value_size);
11417 key_hist[prefix][key_slab].count++;
11418 key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len);
11419 key_hist[prefix][key_slab].val_map[value_slab].count++;
11420 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11421 MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
11422 }
11423
11424 void BlueStore::DBHistogram::dump(Formatter *f)
11425 {
11426 f->open_object_section("rocksdb_value_distribution");
11427 for (auto i : value_hist) {
11428 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
11429 }
11430 f->close_section();
11431
11432 f->open_object_section("rocksdb_key_value_histogram");
11433 for (auto i : key_hist) {
11434 f->dump_string("prefix", i.first);
11435 f->open_object_section("key_hist");
11436 for ( auto k : i.second) {
11437 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
11438 f->dump_unsigned("max_len", k.second.max_len);
11439 f->open_object_section("value_hist");
11440 for ( auto j : k.second.val_map) {
11441 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
11442 f->dump_unsigned("max_len", j.second.max_len);
11443 }
11444 f->close_section();
11445 }
11446 f->close_section();
11447 }
11448 f->close_section();
11449 }
11450
11451 // Iterates through the db and collects the stats.
11452 void BlueStore::generate_db_histogram(Formatter *f)
11453 {
11454 //globals
11455 uint64_t num_onodes = 0;
11456 uint64_t num_shards = 0;
11457 uint64_t num_super = 0;
11458 uint64_t num_coll = 0;
11459 uint64_t num_omap = 0;
11460 uint64_t num_deferred = 0;
11461 uint64_t num_alloc = 0;
11462 uint64_t num_stat = 0;
11463 uint64_t num_others = 0;
11464 uint64_t num_shared_shards = 0;
11465 size_t max_key_size = 0, max_value_size = 0;
11466 uint64_t total_key_size = 0, total_value_size = 0;
11467 size_t key_size = 0, value_size = 0;
11468 DBHistogram hist;
11469
11470 utime_t start = ceph_clock_now();
11471
11472 KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
11473 iter->seek_to_first();
11474 while (iter->valid()) {
11475 dout(30) << __func__ << " Key: " << iter->key() << dendl;
11476 key_size = iter->key_size();
11477 value_size = iter->value_size();
11478 hist.value_hist[hist.get_value_slab(value_size)]++;
11479 max_key_size = MAX(max_key_size, key_size);
11480 max_value_size = MAX(max_value_size, value_size);
11481 total_key_size += key_size;
11482 total_value_size += value_size;
11483
11484 pair<string,string> key(iter->raw_key());
11485
11486 if (key.first == PREFIX_SUPER) {
11487 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
11488 num_super++;
11489 } else if (key.first == PREFIX_STAT) {
11490 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
11491 num_stat++;
11492 } else if (key.first == PREFIX_COLL) {
11493 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
11494 num_coll++;
11495 } else if (key.first == PREFIX_OBJ) {
11496 if (key.second.back() == ONODE_KEY_SUFFIX) {
11497 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
11498 num_onodes++;
11499 } else {
11500 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
11501 num_shards++;
11502 }
11503 } else if (key.first == PREFIX_OMAP) {
11504 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
11505 num_omap++;
11506 } else if (key.first == PREFIX_DEFERRED) {
11507 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
11508 num_deferred++;
11509 } else if (key.first == PREFIX_ALLOC || key.first == "b" ) {
11510 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
11511 num_alloc++;
11512 } else if (key.first == PREFIX_SHARED_BLOB) {
11513 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
11514 num_shared_shards++;
11515 } else {
11516 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
11517 num_others++;
11518 }
11519 iter->next();
11520 }
11521
11522 utime_t duration = ceph_clock_now() - start;
11523 f->open_object_section("rocksdb_key_value_stats");
11524 f->dump_unsigned("num_onodes", num_onodes);
11525 f->dump_unsigned("num_shards", num_shards);
11526 f->dump_unsigned("num_super", num_super);
11527 f->dump_unsigned("num_coll", num_coll);
11528 f->dump_unsigned("num_omap", num_omap);
11529 f->dump_unsigned("num_deferred", num_deferred);
11530 f->dump_unsigned("num_alloc", num_alloc);
11531 f->dump_unsigned("num_stat", num_stat);
11532 f->dump_unsigned("num_shared_shards", num_shared_shards);
11533 f->dump_unsigned("num_others", num_others);
11534 f->dump_unsigned("max_key_size", max_key_size);
11535 f->dump_unsigned("max_value_size", max_value_size);
11536 f->dump_unsigned("total_key_size", total_key_size);
11537 f->dump_unsigned("total_value_size", total_value_size);
11538 f->close_section();
11539
11540 hist.dump(f);
11541
11542 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
11543
11544 }
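// A minimal usage sketch (not part of this file): dumping the histogram as
// JSON from code that holds an open BlueStore instance (`store` is a
// placeholder). Assumes the JSONFormatter interface from common/Formatter.h.
#if 0
JSONFormatter f(true);  // pretty-printed output
store->generate_db_histogram(&f);
f.flush(std::cout);
#endif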
11545
11546 void BlueStore::_flush_cache()
11547 {
11548 dout(10) << __func__ << dendl;
11549 for (auto i : cache_shards) {
11550 i->trim_all();
11551 assert(i->empty());
11552 }
11553 for (auto& p : coll_map) {
11554 assert(p.second->onode_map.empty());
11555 assert(p.second->shared_blob_set.empty());
11556 }
11557 coll_map.clear();
11558 }
11559
11560 // For external callers.
11561 // Unlike _flush_cache(), this is best-effort, e.g.,
11562 // we don't care if there are still some pinned onodes/data in the cache
11563 // after this command has completed.
11564 void BlueStore::flush_cache()
11565 {
11566 dout(10) << __func__ << dendl;
11567 for (auto i : cache_shards) {
11568 i->trim_all();
11569 }
11570 }
11571
11572 void BlueStore::_apply_padding(uint64_t head_pad,
11573 uint64_t tail_pad,
11574 bufferlist& padded)
11575 {
11576 if (head_pad) {
11577 padded.prepend_zero(head_pad);
11578 }
11579 if (tail_pad) {
11580 padded.append_zero(tail_pad);
11581 }
11582 if (head_pad || tail_pad) {
11583 dout(20) << __func__ << " padded head 0x" << std::hex << head_pad
11584 << " tail 0x" << tail_pad << std::dec << dendl;
11585 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
11586 }
11587 }
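// Worked example (a sketch; the pads are computed by the write paths that
// call this helper): a 0x100-byte write at offset 0x1234 against a 0x1000-byte
// block needs head_pad = 0x234 and tail_pad = 0xccc, so that the padded
// bufferlist covers exactly one block: 0x234 + 0x100 + 0xccc = 0x1000.
#if 0
bufferlist padded;
padded.append(data);                  // 0x100 bytes of payload (placeholder)
_apply_padding(0x234, 0xccc, padded);
assert(padded.length() == 0x1000);
#endif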
11588
11589 // ===========================================