1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef CEPH_OSD_BLUESTORE_H
16#define CEPH_OSD_BLUESTORE_H
17
18#include "acconfig.h"
19
20#include <unistd.h>
21
22#include <atomic>
23#include <mutex>
24#include <condition_variable>
25
26#include <boost/intrusive/list.hpp>
27#include <boost/intrusive/unordered_set.hpp>
28#include <boost/intrusive/set.hpp>
29#include <boost/functional/hash.hpp>
30#include <boost/dynamic_bitset.hpp>
31
32#include "include/assert.h"
33#include "include/unordered_map.h"
34#include "include/memory.h"
35#include "include/mempool.h"
36#include "common/Finisher.h"
37#include "common/perf_counters.h"
38#include "common/PriorityCache.h"
39#include "compressor/Compressor.h"
40#include "os/ObjectStore.h"
41
42#include "bluestore_types.h"
43#include "BlockDevice.h"
44#include "common/EventTrace.h"
45
46class Allocator;
47class FreelistManager;
48class BlueFS;
49
50//#define DEBUG_CACHE
51//#define DEBUG_DEFERRED
52
53
54
55// constants for Buffer::optimize()
56#define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N
57
58
59enum {
60 l_bluestore_first = 732430,
61 l_bluestore_kv_flush_lat,
62 l_bluestore_kv_commit_lat,
63 l_bluestore_kv_lat,
64 l_bluestore_state_prepare_lat,
65 l_bluestore_state_aio_wait_lat,
66 l_bluestore_state_io_done_lat,
67 l_bluestore_state_kv_queued_lat,
68 l_bluestore_state_kv_committing_lat,
69 l_bluestore_state_kv_done_lat,
70 l_bluestore_state_deferred_queued_lat,
71 l_bluestore_state_deferred_aio_wait_lat,
72 l_bluestore_state_deferred_cleanup_lat,
73 l_bluestore_state_finishing_lat,
74 l_bluestore_state_done_lat,
75 l_bluestore_throttle_lat,
76 l_bluestore_submit_lat,
77 l_bluestore_commit_lat,
78 l_bluestore_read_lat,
79 l_bluestore_read_onode_meta_lat,
80 l_bluestore_read_wait_aio_lat,
81 l_bluestore_compress_lat,
82 l_bluestore_decompress_lat,
83 l_bluestore_csum_lat,
84 l_bluestore_compress_success_count,
85 l_bluestore_compress_rejected_count,
86 l_bluestore_write_pad_bytes,
87 l_bluestore_deferred_write_ops,
88 l_bluestore_deferred_write_bytes,
89 l_bluestore_write_penalty_read_ops,
90 l_bluestore_allocated,
91 l_bluestore_stored,
92 l_bluestore_compressed,
93 l_bluestore_compressed_allocated,
94 l_bluestore_compressed_original,
95 l_bluestore_onodes,
96 l_bluestore_onode_hits,
97 l_bluestore_onode_misses,
98 l_bluestore_onode_shard_hits,
99 l_bluestore_onode_shard_misses,
100 l_bluestore_extents,
101 l_bluestore_blobs,
102 l_bluestore_buffers,
103 l_bluestore_buffer_bytes,
104 l_bluestore_buffer_hit_bytes,
105 l_bluestore_buffer_miss_bytes,
106 l_bluestore_write_big,
107 l_bluestore_write_big_bytes,
108 l_bluestore_write_big_blobs,
109 l_bluestore_write_small,
110 l_bluestore_write_small_bytes,
111 l_bluestore_write_small_unused,
112 l_bluestore_write_small_deferred,
113 l_bluestore_write_small_pre_read,
114 l_bluestore_write_small_new,
115 l_bluestore_txc,
116 l_bluestore_onode_reshard,
117 l_bluestore_blob_split,
118 l_bluestore_extent_compress,
119 l_bluestore_gc_merged,
120 l_bluestore_read_eio,
121 l_bluestore_last
122};
123
124class BlueStore : public ObjectStore,
125 public md_config_obs_t {
126 // -----------------------------------------------------
127 // types
128public:
129 // config observer
130 const char** get_tracked_conf_keys() const override;
131 void handle_conf_change(const struct md_config_t *conf,
132 const std::set<std::string> &changed) override;
133
134 void _set_csum();
135 void _set_compression();
136 void _set_throttle_params();
137 int _set_cache_sizes();
138
139 class TransContext;
140
141 typedef map<uint64_t, bufferlist> ready_regions_t;
142
143 struct BufferSpace;
144 struct Collection;
145 typedef boost::intrusive_ptr<Collection> CollectionRef;
146
147 struct AioContext {
148 virtual void aio_finish(BlueStore *store) = 0;
149 virtual ~AioContext() {}
150 };
151
152 /// cached buffer
153 struct Buffer {
154 MEMPOOL_CLASS_HELPERS();
155
156 enum {
157 STATE_EMPTY, ///< empty buffer -- used for cache history
158 STATE_CLEAN, ///< clean data that is up to date
159 STATE_WRITING, ///< data that is being written (io not yet complete)
160 };
161 static const char *get_state_name(int s) {
162 switch (s) {
163 case STATE_EMPTY: return "empty";
164 case STATE_CLEAN: return "clean";
165 case STATE_WRITING: return "writing";
166 default: return "???";
167 }
168 }
169 enum {
170 FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN)
171 // NOTE: fix operator<< when you define a second flag
172 };
173 static const char *get_flag_name(int s) {
174 switch (s) {
175 case FLAG_NOCACHE: return "nocache";
176 default: return "???";
177 }
178 }
179
180 BufferSpace *space;
181 uint16_t state; ///< STATE_*
182 uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl
183 uint32_t flags; ///< FLAG_*
184 uint64_t seq;
185 uint32_t offset, length;
186 bufferlist data;
187
188 boost::intrusive::list_member_hook<> lru_item;
189 boost::intrusive::list_member_hook<> state_item;
190
191 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
192 unsigned f = 0)
193 : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
194 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
195 unsigned f = 0)
196 : space(space), state(s), flags(f), seq(q), offset(o),
197 length(b.length()), data(b) {}
198
199 bool is_empty() const {
200 return state == STATE_EMPTY;
201 }
202 bool is_clean() const {
203 return state == STATE_CLEAN;
204 }
205 bool is_writing() const {
206 return state == STATE_WRITING;
207 }
208
209 uint32_t end() const {
210 return offset + length;
211 }
212
213 void truncate(uint32_t newlen) {
214 assert(newlen < length);
215 if (data.length()) {
216 bufferlist t;
217 t.substr_of(data, 0, newlen);
218 data.claim(t);
219 }
220 length = newlen;
221 }
222 void maybe_rebuild() {
223 if (data.length() &&
224 (data.get_num_buffers() > 1 ||
225 data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
226 data.rebuild();
227 }
228 }
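    // Illustrative sketch (not part of the original source): with
    // MAX_BUFFER_SLOP_RATIO_DEN == 8, a buffer whose bufferlist holds 4096
    // bytes is rebuilt by maybe_rebuild() once it is fragmented into more
    // than one bufferptr, or once its front segment wastes more than
    // 4096 / 8 == 512 bytes, e.g. after truncate() shrank it in place.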
229
230 void dump(Formatter *f) const {
231 f->dump_string("state", get_state_name(state));
232 f->dump_unsigned("seq", seq);
233 f->dump_unsigned("offset", offset);
234 f->dump_unsigned("length", length);
235 f->dump_unsigned("data_length", data.length());
236 }
237 };
238
239 struct Cache;
240
241 /// map logical extent range (object) onto buffers
242 struct BufferSpace {
243 enum {
244 BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache
245 };
246
247 typedef boost::intrusive::list<
248 Buffer,
249 boost::intrusive::member_hook<
250 Buffer,
251 boost::intrusive::list_member_hook<>,
252 &Buffer::state_item> > state_list_t;
253
254 mempool::bluestore_cache_other::map<uint32_t, std::unique_ptr<Buffer>>
255 buffer_map;
256
257 // we use a bare intrusive list here instead of std::map because
258 // it uses less memory and we expect this to be very small (very
259 // few IOs in flight to the same Blob at the same time).
260 state_list_t writing; ///< writing buffers, sorted by seq, ascending
261
262 ~BufferSpace() {
263 assert(buffer_map.empty());
264 assert(writing.empty());
265 }
266
267 void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) {
268 cache->_audit("_add_buffer start");
269 buffer_map[b->offset].reset(b);
270 if (b->is_writing()) {
271 b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
272 if (writing.empty() || writing.rbegin()->seq <= b->seq) {
273 writing.push_back(*b);
274 } else {
275 auto it = writing.begin();
276 while (it->seq < b->seq) {
277 ++it;
278 }
279
280 assert(it->seq >= b->seq);
281 // note that this will insert b before it
282 // hence the order is maintained
283 writing.insert(it, *b);
284 }
285 } else {
286 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
287 cache->_add_buffer(b, level, near);
288 }
289 cache->_audit("_add_buffer end");
290 }
291 void _rm_buffer(Cache* cache, Buffer *b) {
292 _rm_buffer(cache, buffer_map.find(b->offset));
293 }
294 void _rm_buffer(Cache* cache,
295 map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
296 assert(p != buffer_map.end());
297 cache->_audit("_rm_buffer start");
298 if (p->second->is_writing()) {
299 writing.erase(writing.iterator_to(*p->second));
300 } else {
301 cache->_rm_buffer(p->second.get());
302 }
303 buffer_map.erase(p);
304 cache->_audit("_rm_buffer end");
305 }
306
307 map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
308 uint32_t offset) {
309 auto i = buffer_map.lower_bound(offset);
310 if (i != buffer_map.begin()) {
311 --i;
312 if (i->first + i->second->length <= offset)
313 ++i;
314 }
315 return i;
316 }
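    // Worked example (illustrative, not from the original source): with
    // buffers in buffer_map at offsets 0 (length 4096) and 8192 (length 4096),
    //   _data_lower_bound(100)  -> the buffer at 0 (it still covers offset 100)
    //   _data_lower_bound(5000) -> the buffer at 8192 (the buffer at 0 ends at
    //                              4096 <= 5000, so the iterator steps forward)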
317
318 // must be called under protection of the Cache lock
319 void _clear(Cache* cache);
320
321 // return value is the highest cache_private of a trimmed buffer, or 0.
322 int discard(Cache* cache, uint32_t offset, uint32_t length) {
323 std::lock_guard<std::recursive_mutex> l(cache->lock);
324 return _discard(cache, offset, length);
325 }
326 int _discard(Cache* cache, uint32_t offset, uint32_t length);
327
328 void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
329 unsigned flags) {
330 std::lock_guard<std::recursive_mutex> l(cache->lock);
331 Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
332 flags);
333 b->cache_private = _discard(cache, offset, bl.length());
334 _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
335 }
336 void finish_write(Cache* cache, uint64_t seq);
337 void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
338 std::lock_guard<std::recursive_mutex> l(cache->lock);
339 Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
340 b->cache_private = _discard(cache, offset, bl.length());
341 _add_buffer(cache, b, 1, nullptr);
342 }
343
344 void read(Cache* cache, uint32_t offset, uint32_t length,
345 BlueStore::ready_regions_t& res,
346 interval_set<uint32_t>& res_intervals,
347 int flags = 0);
348
349 void truncate(Cache* cache, uint32_t offset) {
350 discard(cache, offset, (uint32_t)-1 - offset);
351 }
352
353 void split(Cache* cache, size_t pos, BufferSpace &r);
354
355 void dump(Cache* cache, Formatter *f) const {
356 std::lock_guard<std::recursive_mutex> l(cache->lock);
357 f->open_array_section("buffers");
358 for (auto& i : buffer_map) {
359 f->open_object_section("buffer");
360 assert(i.first == i.second->offset);
361 i.second->dump(f);
362 f->close_section();
363 }
364 f->close_section();
365 }
366 };
367
368 struct SharedBlobSet;
369
370 /// in-memory shared blob state (incl cached buffers)
371 struct SharedBlob {
372 MEMPOOL_CLASS_HELPERS();
373
374 std::atomic_int nref = {0}; ///< reference count
375 bool loaded = false;
376
377 CollectionRef coll;
378 union {
379 uint64_t sbid_unloaded; ///< sbid if persistent isn't loaded
380 bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any
381 };
382 BufferSpace bc; ///< buffer cache
383
384 SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
385 if (get_cache()) {
386 get_cache()->add_blob();
387 }
388 }
389 SharedBlob(uint64_t i, Collection *_coll);
390 ~SharedBlob();
391
392 uint64_t get_sbid() const {
393 return loaded ? persistent->sbid : sbid_unloaded;
394 }
395
396 friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
397 friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }
398
399 friend ostream& operator<<(ostream& out, const SharedBlob& sb);
400
401 void get() {
402 ++nref;
403 }
404 void put();
405
406 /// get logical references
407 void get_ref(uint64_t offset, uint32_t length);
408
409 /// put logical references, and get back any released extents
410 void put_ref(uint64_t offset, uint32_t length,
411 PExtentVector *r, set<SharedBlob*> *maybe_unshared_blobs);
412
413 friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
414 return l.get_sbid() == r.get_sbid();
415 }
416 inline Cache* get_cache() {
417 return coll ? coll->cache : nullptr;
418 }
419 inline SharedBlobSet* get_parent() {
420 return coll ? &(coll->shared_blob_set) : nullptr;
421 }
422 inline bool is_loaded() const {
423 return loaded;
424 }
425
426 };
427 typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
428
429 /// a lookup table of SharedBlobs
430 struct SharedBlobSet {
431 std::mutex lock; ///< protect lookup, insertion, removal
432
433 // we use a bare pointer because we don't want to affect the ref
434 // count
435 mempool::bluestore_cache_other::unordered_map<uint64_t,SharedBlob*> sb_map;
436
437 SharedBlobRef lookup(uint64_t sbid) {
438 std::lock_guard<std::mutex> l(lock);
439 auto p = sb_map.find(sbid);
440 if (p == sb_map.end() ||
441 p->second->nref == 0) {
442 return nullptr;
443 }
444 return p->second;
445 }
446
447 void add(Collection* coll, SharedBlob *sb) {
448 std::lock_guard<std::mutex> l(lock);
449 sb_map[sb->get_sbid()] = sb;
450 sb->coll = coll;
451 }
452
453 bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
454 std::lock_guard<std::mutex> l(lock);
455 assert(sb->get_parent() == this);
456 if (verify_nref_is_zero && sb->nref != 0) {
457 return false;
458 }
459 // only remove if it still points to us
460 auto p = sb_map.find(sb->get_sbid());
461 if (p != sb_map.end() &&
462 p->second == sb) {
463 sb_map.erase(p);
464 }
465 return true;
466 }
467
468 bool empty() {
469 std::lock_guard<std::mutex> l(lock);
470 return sb_map.empty();
471 }
472
473 void dump(CephContext *cct, int lvl);
474 };
475
476//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/
477
478 /// in-memory blob metadata and associated cached buffers (if any)
479 struct Blob {
480 MEMPOOL_CLASS_HELPERS();
481
482 std::atomic_int nref = {0}; ///< reference count
483 int16_t id = -1; ///< id, for spanning blobs only, >= 0
484 int16_t last_encoded_id = -1; ///< (ephemeral) used during encoding only
485 SharedBlobRef shared_blob; ///< shared blob state (if any)
486
487 private:
488 mutable bluestore_blob_t blob; ///< decoded blob metadata
489#ifdef CACHE_BLOB_BL
490 mutable bufferlist blob_bl; ///< cached encoded blob, blob is dirty if empty
491#endif
492 /// refs from this shard. ephemeral if id<0, persisted if spanning.
493 bluestore_blob_use_tracker_t used_in_blob;
494
495 public:
496
497 friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
498 friend void intrusive_ptr_release(Blob *b) { b->put(); }
499
500 friend ostream& operator<<(ostream& out, const Blob &b);
501
502 const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
503 return used_in_blob;
504 }
505 bool is_referenced() const {
506 return used_in_blob.is_not_empty();
507 }
508 uint32_t get_referenced_bytes() const {
509 return used_in_blob.get_referenced_bytes();
510 }
511
512 bool is_spanning() const {
513 return id >= 0;
514 }
515
516 bool can_split() const {
517 std::lock_guard<std::recursive_mutex> l(shared_blob->get_cache()->lock);
518 // splitting a BufferSpace writing list is too hard; don't try.
519 return shared_blob->bc.writing.empty() &&
520 used_in_blob.can_split() &&
521 get_blob().can_split();
522 }
523
524 bool can_split_at(uint32_t blob_offset) const {
525 return used_in_blob.can_split_at(blob_offset) &&
526 get_blob().can_split_at(blob_offset);
527 }
528
529 bool can_reuse_blob(uint32_t min_alloc_size,
530 uint32_t target_blob_size,
531 uint32_t b_offset,
532 uint32_t *length0);
533
534 void dup(Blob& o) {
535 o.shared_blob = shared_blob;
536 o.blob = blob;
537#ifdef CACHE_BLOB_BL
538 o.blob_bl = blob_bl;
539#endif
540 }
541
542 inline const bluestore_blob_t& get_blob() const {
543 return blob;
544 }
545 inline bluestore_blob_t& dirty_blob() {
546#ifdef CACHE_BLOB_BL
547 blob_bl.clear();
548#endif
549 return blob;
550 }
551
552 /// discard buffers for unallocated regions
553 void discard_unallocated(Collection *coll);
554
555 /// get logical references
556 void get_ref(Collection *coll, uint32_t offset, uint32_t length);
557 /// put logical references, and get back any released extents
558 bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
559 PExtentVector *r);
560
561 /// split the blob
562 void split(Collection *coll, uint32_t blob_offset, Blob *o);
563
564 void get() {
565 ++nref;
566 }
567 void put() {
568 if (--nref == 0)
569 delete this;
570 }
571
572
573#ifdef CACHE_BLOB_BL
574 void _encode() const {
575 if (blob_bl.length() == 0 ) {
576 ::encode(blob, blob_bl);
577 } else {
578 assert(blob_bl.length());
579 }
580 }
581 void bound_encode(
582 size_t& p,
583 bool include_ref_map) const {
584 _encode();
585 p += blob_bl.length();
586 if (include_ref_map) {
587 used_in_blob.bound_encode(p);
588 }
589 }
590 void encode(
591 bufferlist::contiguous_appender& p,
592 bool include_ref_map) const {
593 _encode();
594 p.append(blob_bl);
595 if (include_ref_map) {
596 used_in_blob.encode(p);
597 }
598 }
599 void decode(
600 Collection */*coll*/,
601 bufferptr::iterator& p,
602 bool include_ref_map) {
603 const char *start = p.get_pos();
604 denc(blob, p);
605 const char *end = p.get_pos();
606 blob_bl.clear();
607 blob_bl.append(start, end - start);
608 if (include_ref_map) {
609 used_in_blob.decode(p);
610 }
611 }
612#else
613 void bound_encode(
614 size_t& p,
615 uint64_t struct_v,
616 uint64_t sbid,
617 bool include_ref_map) const {
618 denc(blob, p, struct_v);
619 if (blob.is_shared()) {
620 denc(sbid, p);
621 }
622 if (include_ref_map) {
623 used_in_blob.bound_encode(p);
624 }
625 }
626 void encode(
627 bufferlist::contiguous_appender& p,
628 uint64_t struct_v,
629 uint64_t sbid,
630 bool include_ref_map) const {
631 denc(blob, p, struct_v);
632 if (blob.is_shared()) {
633 denc(sbid, p);
634 }
635 if (include_ref_map) {
636 used_in_blob.encode(p);
637 }
638 }
639 void decode(
640 Collection *coll,
641 bufferptr::iterator& p,
642 uint64_t struct_v,
643 uint64_t* sbid,
644 bool include_ref_map);
645#endif
646 };
647 typedef boost::intrusive_ptr<Blob> BlobRef;
648 typedef mempool::bluestore_cache_other::map<int,BlobRef> blob_map_t;
649
650 /// a logical extent, pointing to (some portion of) a blob
651 typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings
652 struct Extent : public ExtentBase {
653 MEMPOOL_CLASS_HELPERS();
654
655 uint32_t logical_offset = 0; ///< logical offset
656 uint32_t blob_offset = 0; ///< blob offset
657 uint32_t length = 0; ///< length
658 BlobRef blob; ///< the blob with our data
659
660 /// ctor for lookup only
661 explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
662 /// ctor for delayed initialization (see decode_some())
663 explicit Extent() : ExtentBase() {
664 }
665 /// ctor for general usage
666 Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
667 : ExtentBase(),
668 logical_offset(lo), blob_offset(o), length(l) {
669 assign_blob(b);
670 }
671 ~Extent() {
672 if (blob) {
673 blob->shared_blob->get_cache()->rm_extent();
674 }
675 }
676
677 void assign_blob(const BlobRef& b) {
678 assert(!blob);
679 blob = b;
680 blob->shared_blob->get_cache()->add_extent();
681 }
682
683 // comparators for intrusive_set
684 friend bool operator<(const Extent &a, const Extent &b) {
685 return a.logical_offset < b.logical_offset;
686 }
687 friend bool operator>(const Extent &a, const Extent &b) {
688 return a.logical_offset > b.logical_offset;
689 }
690 friend bool operator==(const Extent &a, const Extent &b) {
691 return a.logical_offset == b.logical_offset;
692 }
693
694 uint32_t blob_start() const {
695 return logical_offset - blob_offset;
696 }
697
698 uint32_t blob_end() const {
699 return blob_start() + blob->get_blob().get_logical_length();
700 }
701
702 uint32_t logical_end() const {
703 return logical_offset + length;
704 }
705
706 // return true if any piece of the blob is out of
707 // the given range [o, o + l].
708 bool blob_escapes_range(uint32_t o, uint32_t l) const {
709 return blob_start() < o || blob_end() > o + l;
710 }
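    // Example (illustrative): with blob_start() == 0x1000 and
    // blob_end() == 0x3000, blob_escapes_range(0x1000, 0x1000) is true
    // (the blob extends past 0x2000), while blob_escapes_range(0x1000,
    // 0x2000) is false (the blob lies entirely inside [o, o + l]).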
711 };
712 typedef boost::intrusive::set<Extent> extent_map_t;
713
714
715 friend ostream& operator<<(ostream& out, const Extent& e);
716
717 struct OldExtent {
718 boost::intrusive::list_member_hook<> old_extent_item;
719 Extent e;
720 PExtentVector r;
721 bool blob_empty; // true if this is the last removed extent and it leaves the blob
722 // empty - required to update compression stats properly
723 OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
724 : e(lo, o, l, b), blob_empty(false) {
725 }
726 static OldExtent* create(CollectionRef c,
727 uint32_t lo,
728 uint32_t o,
729 uint32_t l,
730 BlobRef& b);
731 };
732 typedef boost::intrusive::list<
733 OldExtent,
734 boost::intrusive::member_hook<
735 OldExtent,
736 boost::intrusive::list_member_hook<>,
737 &OldExtent::old_extent_item> > old_extent_map_t;
738
739 struct Onode;
740
741 /// a sharded extent map, mapping offsets to lextents to blobs
742 struct ExtentMap {
743 Onode *onode;
744 extent_map_t extent_map; ///< map of Extents to Blobs
745 blob_map_t spanning_blob_map; ///< blobs that span shards
746
747 struct Shard {
748 bluestore_onode_t::shard_info *shard_info = nullptr;
749 unsigned extents = 0; ///< count extents in this shard
750 bool loaded = false; ///< true if shard is loaded
751 bool dirty = false; ///< true if shard is dirty and needs reencoding
752 };
753 mempool::bluestore_cache_other::vector<Shard> shards; ///< shards
754
755 bufferlist inline_bl; ///< cached encoded map, if unsharded; empty=>dirty
756
757 uint32_t needs_reshard_begin = 0;
758 uint32_t needs_reshard_end = 0;
759
760 bool needs_reshard() const {
761 return needs_reshard_end > needs_reshard_begin;
762 }
763 void clear_needs_reshard() {
764 needs_reshard_begin = needs_reshard_end = 0;
765 }
766 void request_reshard(uint32_t begin, uint32_t end) {
767 if (begin < needs_reshard_begin) {
768 needs_reshard_begin = begin;
769 }
770 if (end > needs_reshard_end) {
771 needs_reshard_end = end;
772 }
773 }
774
775 struct DeleteDisposer {
776 void operator()(Extent *e) { delete e; }
777 };
778
779 ExtentMap(Onode *o);
780 ~ExtentMap() {
781 extent_map.clear_and_dispose(DeleteDisposer());
782 }
783
784 void clear() {
785 extent_map.clear_and_dispose(DeleteDisposer());
786 shards.clear();
787 inline_bl.clear();
788 clear_needs_reshard();
789 }
790
791 bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
792 unsigned *pn);
793 unsigned decode_some(bufferlist& bl);
794
795 void bound_encode_spanning_blobs(size_t& p);
796 void encode_spanning_blobs(bufferlist::contiguous_appender& p);
797 void decode_spanning_blobs(bufferptr::iterator& p);
798
799 BlobRef get_spanning_blob(int id) {
800 auto p = spanning_blob_map.find(id);
801 assert(p != spanning_blob_map.end());
802 return p->second;
803 }
804
805 void update(KeyValueDB::Transaction t, bool force);
806 decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
807 void reshard(
808 KeyValueDB *db,
809 KeyValueDB::Transaction t);
810
811 /// initialize Shards from the onode
812 void init_shards(bool loaded, bool dirty);
813
814 /// return index of shard containing offset
815 /// or -1 if not found
816 int seek_shard(uint32_t offset) {
817 size_t end = shards.size();
818 size_t mid, left = 0;
819 size_t right = end; // one past the right end
820
821 while (left < right) {
822 mid = left + (right - left) / 2;
823 if (offset >= shards[mid].shard_info->offset) {
824 size_t next = mid + 1;
825 if (next >= end || offset < shards[next].shard_info->offset)
826 return mid;
827 //continue to search forwards
828 left = next;
829 } else {
830 //continue to search backwards
831 right = mid;
832 }
833 }
834
835 return -1; // not found
836 }
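    // Worked example (illustrative, not from the original source): with three
    // shards whose shard_info->offset values are {0, 0x10000, 0x20000}:
    //   seek_shard(0x00000) == 0, seek_shard(0x15000) == 1,
    //   seek_shard(0x2ffff) == 2 -- the last shard covers everything from its
    //   offset to the end of the object.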
837
838 /// check if a range spans a shard
839 bool spans_shard(uint32_t offset, uint32_t length) {
840 if (shards.empty()) {
841 return false;
842 }
843 int s = seek_shard(offset);
844 assert(s >= 0);
845 if (s == (int)shards.size() - 1) {
846 return false; // last shard
847 }
848 if (offset + length <= shards[s+1].shard_info->offset) {
849 return false;
850 }
851 return true;
852 }
853
854 /// ensure that a range of the map is loaded
855 void fault_range(KeyValueDB *db,
856 uint32_t offset, uint32_t length);
857
858 /// ensure a range of the map is marked dirty
859 void dirty_range(uint32_t offset, uint32_t length);
860
861 /// for seek_lextent test
862 extent_map_t::iterator find(uint64_t offset);
863
864 /// seek to the first lextent including or after offset
865 extent_map_t::iterator seek_lextent(uint64_t offset);
866 extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
867
868 /// add a new Extent
869 void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
870 extent_map.insert(*new Extent(lo, o, l, b));
871 }
872
873 /// remove (and delete) an Extent
874 void rm(extent_map_t::iterator p) {
875 extent_map.erase_and_dispose(p, DeleteDisposer());
876 }
877
878 bool has_any_lextents(uint64_t offset, uint64_t length);
879
880 /// consolidate adjacent lextents in extent_map
881 int compress_extent_map(uint64_t offset, uint64_t length);
882
883 /// punch a logical hole. add lextents to deref to target list.
884 void punch_hole(CollectionRef &c,
885 uint64_t offset, uint64_t length,
886 old_extent_map_t *old_extents);
887
888 /// put new lextent into lextent_map overwriting existing ones if
889 /// any and update references accordingly
890 Extent *set_lextent(CollectionRef &c,
891 uint64_t logical_offset,
892 uint64_t offset, uint64_t length,
893 BlobRef b,
894 old_extent_map_t *old_extents);
895
896 /// split a blob (and referring extents)
897 BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
898 };
899
900 /// Compressed Blob Garbage collector
901 /*
902 The primary idea of the collector is to estimate the difference between the
903 allocation units (AUs) currently used by compressed blobs and the new AUs
904 required to store that data uncompressed.
905 Estimation is performed for protrusive extents within a logical range
906 determined by a concatenation of the old_extents collection and the specific
907 (current) write request.
908 The root cause for using old_extents is the need to handle blob ref counts
909 properly. Old extents still hold blob refs, hence we need to traverse
910 the collection to determine whether the blob is to be released.
911 Protrusive extents are extents that fit into the blob set in action
912 (ones that are below the logical range from above) but are not totally removed
913 due to the current write.
914 E.g. for
915 extent1 <loffs = 100, boffs = 100, len = 100> ->
916 blob1<compressed, len_on_disk=4096, logical_len=8192>
917 extent2 <loffs = 200, boffs = 200, len = 100> ->
918 blob2<raw, len_on_disk=4096, llen=4096>
919 extent3 <loffs = 300, boffs = 300, len = 100> ->
920 blob1<compressed, len_on_disk=4096, llen=8192>
921 extent4 <loffs = 4096, boffs = 0, len = 100> ->
922 blob3<raw, len_on_disk=4096, llen=4096>
923 write(300~100)
924 protrusive extents are within the following ranges <0~300, 400~8192-400>
925 In this case existing AUs that might be removed due to GC (i.e. blob1)
926 use 2x4K bytes.
927 And new AUs expected after GC = 0 since extent1 is to be merged into blob2.
928 Hence we should perform the collection.
929 */
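 /*
  Rough arithmetic for the example above (an illustrative sketch, not text
  from the original source): blob1 currently occupies 2 x 4K AUs that could be
  released (expected_for_release == 2), while rewriting its still-referenced
  data uncompressed is expected to need no new AUs (expected_allocations == 0)
  because extent1 can be merged into blob2's existing AU. estimate() therefore
  sees a positive expected saving of 2 AUs, and the caller garbage-collects the
  ranges returned by get_extents_to_collect().
 */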
930 class GarbageCollector
931 {
932 public:
933 /// return amount of allocation units that might be saved due to GC
934 int64_t estimate(
935 uint64_t offset,
936 uint64_t length,
937 const ExtentMap& extent_map,
938 const old_extent_map_t& old_extents,
939 uint64_t min_alloc_size);
940
941 /// return a collection of extents to perform GC on
942 const vector<AllocExtent>& get_extents_to_collect() const {
943 return extents_to_collect;
944 }
945 GarbageCollector(CephContext* _cct) : cct(_cct) {}
946
947 private:
948 struct BlobInfo {
949 uint64_t referenced_bytes = 0; ///< amount of bytes referenced in blob
950 int64_t expected_allocations = 0; ///< new alloc units required
951 ///< in case of gc fulfilled
952 bool collect_candidate = false; ///< indicate if blob has any extents
953 ///< eligible for GC.
954 extent_map_t::const_iterator first_lextent; ///< points to the first
955 ///< lextent referring to
956 ///< the blob if any.
957 ///< collect_candidate flag
958 ///< determines the validity
959 extent_map_t::const_iterator last_lextent; ///< points to the last
960 ///< lextent referring to
961 ///< the blob if any.
962
963 BlobInfo(uint64_t ref_bytes) :
964 referenced_bytes(ref_bytes) {
965 }
966 };
967 CephContext* cct;
968 map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map
969 ///< copies that are affected by the
970 ///< specific write
971
972 vector<AllocExtent> extents_to_collect; ///< protrusive extents that should
973 ///< be collected if GC takes place
974
975 boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
976 ///< unit when traversing
977 ///< protrusive extents.
978 ///< Other extents mapped to
979 ///< this AU to be ignored
980 ///< (except the case where
981 ///< uncompressed extent follows
982 ///< compressed one - see below).
983 BlobInfo* blob_info_counted = nullptr; ///< set if previous allocation unit
984 ///< caused expected_allocations
985 ///< counter increment at this blob.
986 ///< if uncompressed extent follows
987 ///< a decrement for the
988 ///< expected_allocations counter
989 ///< is needed
990 int64_t expected_allocations = 0; ///< new alloc units required in case
991 ///< of gc fulfilled
992 int64_t expected_for_release = 0; ///< alloc units currently used by
993 ///< compressed blobs that might
994 ///< be gone after GC
995 uint64_t gc_start_offset; ///< starting offset for GC
996 uint64_t gc_end_offset; ///< ending offset for GC
997
998 protected:
999 void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
1000 uint64_t start_offset,
1001 uint64_t end_offset,
1002 uint64_t start_touch_offset,
1003 uint64_t end_touch_offset,
1004 uint64_t min_alloc_size);
1005 };
1006
1007 struct OnodeSpace;
1008
1009 /// an in-memory object
1010 struct Onode {
1011 MEMPOOL_CLASS_HELPERS();
1012
1013 std::atomic_int nref; ///< reference count
1014 Collection *c;
1015
1016 ghobject_t oid;
1017
1018 /// key under PREFIX_OBJ where we are stored
1019 mempool::bluestore_cache_other::string key;
1020
1021 boost::intrusive::list_member_hook<> lru_item;
1022
1023 bluestore_onode_t onode; ///< metadata stored as value in kv store
1024 bool exists; ///< true if object logically exists
1025
1026 ExtentMap extent_map;
1027
1028 // track txc's that have not been committed to kv store (and whose
1029 // effects cannot be read via the kvdb read methods)
1030 std::atomic<int> flushing_count = {0};
1031 std::mutex flush_lock; ///< protect flushing_count / flush_cond
1032 std::condition_variable flush_cond; ///< wait here for uncommitted txns
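    // A minimal sketch of the flush protocol implied by the members above
    // (assumed, not copied from the original source): a txc that touches this
    // onode bumps flushing_count while in flight and decrements it on commit,
    // signalling flush_cond; flush() then waits roughly like
    //
    //   std::unique_lock<std::mutex> l(flush_lock);
    //   while (flushing_count.load()) {
    //     flush_cond.wait(l);
    //   }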
1033
1034 Onode(Collection *c, const ghobject_t& o,
1035 const mempool::bluestore_cache_other::string& k)
1036 : nref(0),
1037 c(c),
1038 oid(o),
1039 key(k),
1040 exists(false),
1041 extent_map(this) {
1042 }
1043
1044 void flush();
1045 void get() {
1046 ++nref;
1047 }
1048 void put() {
1049 if (--nref == 0)
1050 delete this;
1051 }
1052 };
1053 typedef boost::intrusive_ptr<Onode> OnodeRef;
1054
1055
1056 /// a cache (shard) of onodes and buffers
1057 struct Cache {
1058 CephContext* cct;
1059 PerfCounters *logger;
1060 std::recursive_mutex lock; ///< protect lru and other structures
1061
1062 std::atomic<uint64_t> num_extents = {0};
1063 std::atomic<uint64_t> num_blobs = {0};
1064
1065 static Cache *create(CephContext* cct, string type, PerfCounters *logger);
1066
1067 Cache(CephContext* cct) : cct(cct), logger(nullptr) {}
1068 virtual ~Cache() {}
1069
1070 virtual void _add_onode(OnodeRef& o, int level) = 0;
1071 virtual void _rm_onode(OnodeRef& o) = 0;
1072 virtual void _touch_onode(OnodeRef& o) = 0;
1073
1074 virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0;
1075 virtual void _rm_buffer(Buffer *b) = 0;
1076 virtual void _move_buffer(Cache *src, Buffer *b) = 0;
1077 virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
1078 virtual void _touch_buffer(Buffer *b) = 0;
1079
1080 virtual uint64_t _get_num_onodes() = 0;
1081 virtual uint64_t _get_buffer_bytes() = 0;
1082
1083 void add_extent() {
1084 ++num_extents;
1085 }
1086 void rm_extent() {
1087 --num_extents;
1088 }
1089
1090 void add_blob() {
1091 ++num_blobs;
1092 }
1093 void rm_blob() {
1094 --num_blobs;
1095 }
1096
1097 void trim(uint64_t onode_max, uint64_t buffer_max);
1098
1099 void trim_all();
1100
1101 virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;
1102
1103 virtual void add_stats(uint64_t *onodes, uint64_t *extents,
1104 uint64_t *blobs,
1105 uint64_t *buffers,
1106 uint64_t *bytes) = 0;
1107
1108 bool empty() {
1109 std::lock_guard<std::recursive_mutex> l(lock);
1110 return _get_num_onodes() == 0 && _get_buffer_bytes() == 0;
1111 }
1112
1113#ifdef DEBUG_CACHE
1114 virtual void _audit(const char *s) = 0;
1115#else
1116 void _audit(const char *s) { /* no-op */ }
1117#endif
1118 };
1119
1120 /// simple LRU cache for onodes and buffers
1121 struct LRUCache : public Cache {
1122 private:
1123 typedef boost::intrusive::list<
1124 Onode,
1125 boost::intrusive::member_hook<
1126 Onode,
1127 boost::intrusive::list_member_hook<>,
1128 &Onode::lru_item> > onode_lru_list_t;
1129 typedef boost::intrusive::list<
1130 Buffer,
1131 boost::intrusive::member_hook<
1132 Buffer,
1133 boost::intrusive::list_member_hook<>,
1134 &Buffer::lru_item> > buffer_lru_list_t;
1135
1136 onode_lru_list_t onode_lru;
1137
1138 buffer_lru_list_t buffer_lru;
1139 uint64_t buffer_size = 0;
1140
1141 public:
1142 LRUCache(CephContext* cct) : Cache(cct) {}
1143 uint64_t _get_num_onodes() override {
1144 return onode_lru.size();
1145 }
1146 void _add_onode(OnodeRef& o, int level) override {
1147 if (level > 0)
1148 onode_lru.push_front(*o);
1149 else
1150 onode_lru.push_back(*o);
1151 }
1152 void _rm_onode(OnodeRef& o) override {
1153 auto q = onode_lru.iterator_to(*o);
1154 onode_lru.erase(q);
1155 }
1156 void _touch_onode(OnodeRef& o) override;
1157
1158 uint64_t _get_buffer_bytes() override {
1159 return buffer_size;
1160 }
1161 void _add_buffer(Buffer *b, int level, Buffer *near) override {
1162 if (near) {
1163 auto q = buffer_lru.iterator_to(*near);
1164 buffer_lru.insert(q, *b);
1165 } else if (level > 0) {
1166 buffer_lru.push_front(*b);
1167 } else {
1168 buffer_lru.push_back(*b);
1169 }
1170 buffer_size += b->length;
1171 }
1172 void _rm_buffer(Buffer *b) override {
1173 assert(buffer_size >= b->length);
1174 buffer_size -= b->length;
1175 auto q = buffer_lru.iterator_to(*b);
1176 buffer_lru.erase(q);
1177 }
1178 void _move_buffer(Cache *src, Buffer *b) override {
1179 src->_rm_buffer(b);
1180 _add_buffer(b, 0, nullptr);
1181 }
1182 void _adjust_buffer_size(Buffer *b, int64_t delta) override {
1183 assert((int64_t)buffer_size + delta >= 0);
1184 buffer_size += delta;
1185 }
1186 void _touch_buffer(Buffer *b) override {
1187 auto p = buffer_lru.iterator_to(*b);
1188 buffer_lru.erase(p);
1189 buffer_lru.push_front(*b);
1190 _audit("_touch_buffer end");
1191 }
1192
1193 void _trim(uint64_t onode_max, uint64_t buffer_max) override;
1194
1195 void add_stats(uint64_t *onodes, uint64_t *extents,
1196 uint64_t *blobs,
1197 uint64_t *buffers,
1198 uint64_t *bytes) override {
1199 std::lock_guard<std::recursive_mutex> l(lock);
1200 *onodes += onode_lru.size();
1201 *extents += num_extents;
1202 *blobs += num_blobs;
1203 *buffers += buffer_lru.size();
1204 *bytes += buffer_size;
1205 }
1206
1207#ifdef DEBUG_CACHE
1208 void _audit(const char *s) override;
1209#endif
1210 };
1211
1212 // 2Q cache for buffers, LRU for onodes
1213 struct TwoQCache : public Cache {
1214 private:
1215 // stick with LRU for onodes for now (fixme?)
1216 typedef boost::intrusive::list<
1217 Onode,
1218 boost::intrusive::member_hook<
1219 Onode,
1220 boost::intrusive::list_member_hook<>,
1221 &Onode::lru_item> > onode_lru_list_t;
1222 typedef boost::intrusive::list<
1223 Buffer,
1224 boost::intrusive::member_hook<
1225 Buffer,
1226 boost::intrusive::list_member_hook<>,
1227 &Buffer::lru_item> > buffer_list_t;
1228
1229 onode_lru_list_t onode_lru;
1230
1231 buffer_list_t buffer_hot; ///< "Am" hot buffers
1232 buffer_list_t buffer_warm_in; ///< "A1in" newly warm buffers
1233 buffer_list_t buffer_warm_out; ///< "A1out" empty buffers we've evicted
1234 uint64_t buffer_bytes = 0; ///< bytes
1235
1236 enum {
1237 BUFFER_NEW = 0,
1238 BUFFER_WARM_IN, ///< in buffer_warm_in
1239 BUFFER_WARM_OUT, ///< in buffer_warm_out
1240 BUFFER_HOT, ///< in buffer_hot
1241 BUFFER_TYPE_MAX
1242 };
1243
1244 uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
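    // How the 2Q lists are intended to interact (summary sketch): new buffers
    // enter buffer_warm_in ("A1in"); when trimmed out of warm_in their data is
    // dropped and an empty Buffer is parked in buffer_warm_out ("A1out") as
    // history; a buffer that is referenced again while its entry sits in
    // warm_out is promoted to buffer_hot ("Am"). buffer_bytes and
    // buffer_list_bytes[] feed _trim(), which keeps each list within its share
    // of the configured cache size.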
1245
1246 public:
1247 TwoQCache(CephContext* cct) : Cache(cct) {}
1248 uint64_t _get_num_onodes() override {
1249 return onode_lru.size();
1250 }
1251 void _add_onode(OnodeRef& o, int level) override {
1252 if (level > 0)
1253 onode_lru.push_front(*o);
1254 else
1255 onode_lru.push_back(*o);
1256 }
1257 void _rm_onode(OnodeRef& o) override {
1258 auto q = onode_lru.iterator_to(*o);
1259 onode_lru.erase(q);
1260 }
1261 void _touch_onode(OnodeRef& o) override;
1262
1263 uint64_t _get_buffer_bytes() override {
1264 return buffer_bytes;
1265 }
1266 void _add_buffer(Buffer *b, int level, Buffer *near) override;
1267 void _rm_buffer(Buffer *b) override;
1268 void _move_buffer(Cache *src, Buffer *b) override;
1269 void _adjust_buffer_size(Buffer *b, int64_t delta) override;
1270 void _touch_buffer(Buffer *b) override {
1271 switch (b->cache_private) {
1272 case BUFFER_WARM_IN:
1273 // do nothing (somewhat counter-intuitively!)
1274 break;
1275 case BUFFER_WARM_OUT:
1276 // move from warm_out to hot LRU
1277 assert(0 == "this happens via discard hint");
1278 break;
1279 case BUFFER_HOT:
1280 // move to front of hot LRU
1281 buffer_hot.erase(buffer_hot.iterator_to(*b));
1282 buffer_hot.push_front(*b);
1283 break;
1284 }
1285 _audit("_touch_buffer end");
1286 }
1287
1288 void _trim(uint64_t onode_max, uint64_t buffer_max) override;
1289
1290 void add_stats(uint64_t *onodes, uint64_t *extents,
1291 uint64_t *blobs,
1292 uint64_t *buffers,
1293 uint64_t *bytes) override {
1294 std::lock_guard<std::recursive_mutex> l(lock);
1295 *onodes += onode_lru.size();
1296 *extents += num_extents;
1297 *blobs += num_blobs;
1298 *buffers += buffer_hot.size() + buffer_warm_in.size();
1299 *bytes += buffer_bytes;
1300 }
1301
1302#ifdef DEBUG_CACHE
1303 void _audit(const char *s) override;
1304#endif
1305 };
1306
1307 struct OnodeSpace {
1308 private:
1309 Cache *cache;
1310
1311 /// forward lookups
1312 mempool::bluestore_cache_other::unordered_map<ghobject_t,OnodeRef> onode_map;
1313
1314 friend class Collection; // for split_cache()
1315
1316 public:
1317 OnodeSpace(Cache *c) : cache(c) {}
1318 ~OnodeSpace() {
1319 clear();
1320 }
1321
1322 OnodeRef add(const ghobject_t& oid, OnodeRef o);
1323 OnodeRef lookup(const ghobject_t& o);
1324 void remove(const ghobject_t& oid) {
1325 onode_map.erase(oid);
1326 }
1327 void rename(OnodeRef& o, const ghobject_t& old_oid,
1328 const ghobject_t& new_oid,
1329 const mempool::bluestore_cache_other::string& new_okey);
1330 void clear();
1331 bool empty();
1332
1333 void dump(CephContext *cct, int lvl);
1334
1335 /// return true if f true for any item
1336 bool map_any(std::function<bool(OnodeRef)> f);
1337 };
1338
1339 struct Collection : public CollectionImpl {
1340 BlueStore *store;
1341 Cache *cache; ///< our cache shard
1342 coll_t cid;
1343 bluestore_cnode_t cnode;
1344 RWLock lock;
1345
1346 bool exists;
1347
1348 SharedBlobSet shared_blob_set; ///< open SharedBlobs
1349
1350 // cache onodes on a per-collection basis to avoid lock
1351 // contention.
1352 OnodeSpace onode_map;
1353
1354 //pool options
1355 pool_opts_t pool_opts;
1356
1357 OnodeRef get_onode(const ghobject_t& oid, bool create);
1358
1359 // the terminology is confusing here, sorry!
1360 //
1361 // blob_t shared_blob_t
1362 // !shared unused -> open
1363 // shared !loaded -> open + shared
1364 // shared loaded -> open + shared + loaded
1365 //
1366 // i.e.,
1367 // open = SharedBlob is instantiated
1368 // shared = blob_t shared flag is set; SharedBlob is hashed.
1369 // loaded = SharedBlob::shared_blob_t is loaded from kv store
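    // Example lifecycle (illustrative): new_blob() starts with an open,
    // unshared SharedBlob (no sbid); make_blob_shared() is invoked when the
    // blob becomes referenced by more than one object (e.g. on clone), which
    // assigns an sbid and sets the blob_t shared flag; load_shared_blob()
    // later fills in SharedBlob::persistent from the kv store the first time
    // the shared ref map is actually needed.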
1370 void open_shared_blob(uint64_t sbid, BlobRef b);
1371 void load_shared_blob(SharedBlobRef sb);
1372 void make_blob_shared(uint64_t sbid, BlobRef b);
1373 uint64_t make_blob_unshared(SharedBlob *sb);
1374
1375 BlobRef new_blob() {
1376 BlobRef b = new Blob();
1377 b->shared_blob = new SharedBlob(this);
1378 return b;
1379 }
1380
1381 const coll_t &get_cid() override {
1382 return cid;
1383 }
1384
1385 bool contains(const ghobject_t& oid) {
1386 if (cid.is_meta())
1387 return oid.hobj.pool == -1;
1388 spg_t spgid;
1389 if (cid.is_pg(&spgid))
1390 return
1391 spgid.pgid.contains(cnode.bits, oid) &&
1392 oid.shard_id == spgid.shard;
1393 return false;
1394 }
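    // Example (illustrative): the meta collection accepts only objects with
    // pool == -1 (internal OSD metadata such as osdmaps), while a PG
    // collection accepts an object only if its hash falls within the PG per
    // cnode.bits and its shard_id matches the PG's shard.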
1395
1396 void split_cache(Collection *dest);
1397
1398 Collection(BlueStore *ns, Cache *ca, coll_t c);
1399 };
1400
1401 class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
1402 CollectionRef c;
1403 OnodeRef o;
1404 KeyValueDB::Iterator it;
1405 string head, tail;
1406 public:
1407 OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
1408 int seek_to_first() override;
1409 int upper_bound(const string &after) override;
1410 int lower_bound(const string &to) override;
1411 bool valid() override;
1412 int next(bool validate=true) override;
1413 string key() override;
1414 bufferlist value() override;
1415 int status() override {
1416 return 0;
1417 }
1418 };
1419
1420 class OpSequencer;
1421 typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
1422
1423 struct volatile_statfs{
1424 enum {
1425 STATFS_ALLOCATED = 0,
1426 STATFS_STORED,
1427 STATFS_COMPRESSED_ORIGINAL,
1428 STATFS_COMPRESSED,
1429 STATFS_COMPRESSED_ALLOCATED,
1430 STATFS_LAST
1431 };
1432 int64_t values[STATFS_LAST];
1433 volatile_statfs() {
1434 memset(this, 0, sizeof(volatile_statfs));
1435 }
1436 void reset() {
1437 *this = volatile_statfs();
1438 }
1439 volatile_statfs& operator+=(const volatile_statfs& other) {
1440 for (size_t i = 0; i < STATFS_LAST; ++i) {
1441 values[i] += other.values[i];
1442 }
1443 return *this;
1444 }
1445 int64_t& allocated() {
1446 return values[STATFS_ALLOCATED];
1447 }
1448 int64_t& stored() {
1449 return values[STATFS_STORED];
1450 }
1451 int64_t& compressed_original() {
1452 return values[STATFS_COMPRESSED_ORIGINAL];
1453 }
1454 int64_t& compressed() {
1455 return values[STATFS_COMPRESSED];
1456 }
1457 int64_t& compressed_allocated() {
1458 return values[STATFS_COMPRESSED_ALLOCATED];
1459 }
1460 bool is_empty() {
1461 return values[STATFS_ALLOCATED] == 0 &&
1462 values[STATFS_STORED] == 0 &&
1463 values[STATFS_COMPRESSED] == 0 &&
1464 values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
1465 values[STATFS_COMPRESSED_ALLOCATED] == 0;
1466 }
1467 void decode(bufferlist::iterator& it) {
1468 for (size_t i = 0; i < STATFS_LAST; i++) {
1469 ::decode(values[i], it);
1470 }
1471 }
1472
1473 void encode(bufferlist& bl) {
1474 for (size_t i = 0; i < STATFS_LAST; i++) {
1475 ::encode(values[i], bl);
1476 }
1477 }
1478 };
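 /*
  Usage sketch (illustrative, not from the original source): each TransContext
  accumulates its per-transaction deltas in a volatile_statfs (statfs_delta
  below), which is later folded into the store-wide totals, e.g.

    volatile_statfs delta;
    delta.allocated() += 0x10000;   // 64K newly allocated on disk
    delta.stored()    += 0x12000;   // logical bytes written
    bufferlist bl;
    delta.encode(bl);               // round-trips via encode()/decode()
    volatile_statfs total;
    total += delta;                 // aggregate with operator+=
 */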
1479
1480 struct TransContext : public AioContext {
1481 MEMPOOL_CLASS_HELPERS();
1482
1483 typedef enum {
1484 STATE_PREPARE,
1485 STATE_AIO_WAIT,
1486 STATE_IO_DONE,
1487 STATE_KV_QUEUED, // queued for kv_sync_thread submission
1488 STATE_KV_SUBMITTED, // submitted to kv; not yet synced
1489 STATE_KV_DONE,
1490 STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running)
1491 STATE_DEFERRED_CLEANUP, // remove deferred kv record
1492 STATE_DEFERRED_DONE,
1493 STATE_FINISHING,
1494 STATE_DONE,
1495 } state_t;
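    // Typical progression for a single write (sketch; the deferred states are
    // only visited when the txc carries a deferred_txn):
    //   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
    //     [-> DEFERRED_QUEUED -> DEFERRED_CLEANUP] -> FINISHING -> DONE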
1496
1497 state_t state = STATE_PREPARE;
1498
1499 const char *get_state_name() {
1500 switch (state) {
1501 case STATE_PREPARE: return "prepare";
1502 case STATE_AIO_WAIT: return "aio_wait";
1503 case STATE_IO_DONE: return "io_done";
1504 case STATE_KV_QUEUED: return "kv_queued";
1505 case STATE_KV_SUBMITTED: return "kv_submitted";
1506 case STATE_KV_DONE: return "kv_done";
1507 case STATE_DEFERRED_QUEUED: return "deferred_queued";
1508 case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
1509 case STATE_DEFERRED_DONE: return "deferred_done";
1510 case STATE_FINISHING: return "finishing";
1511 case STATE_DONE: return "done";
1512 }
1513 return "???";
1514 }
1515
1516#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1517 const char *get_state_latency_name(int state) {
1518 switch (state) {
1519 case l_bluestore_state_prepare_lat: return "prepare";
1520 case l_bluestore_state_aio_wait_lat: return "aio_wait";
1521 case l_bluestore_state_io_done_lat: return "io_done";
1522 case l_bluestore_state_kv_queued_lat: return "kv_queued";
1523 case l_bluestore_state_kv_committing_lat: return "kv_committing";
1524 case l_bluestore_state_kv_done_lat: return "kv_done";
1525 case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
1526 case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
1527 case l_bluestore_state_finishing_lat: return "finishing";
1528 case l_bluestore_state_done_lat: return "done";
1529 }
1530 return "???";
1531 }
1532#endif
1533
1534 void log_state_latency(PerfCounters *logger, int state) {
1535 utime_t lat, now = ceph_clock_now();
1536 lat = now - last_stamp;
1537 logger->tinc(state, lat);
1538#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1539 if (state >= l_bluestore_state_prepare_lat && state <= l_bluestore_state_done_lat) {
1540 double usecs = (now.to_nsec()-last_stamp.to_nsec())/1000;
1541 OID_ELAPSED("", usecs, get_state_latency_name(state));
1542 }
1543#endif
1544 last_stamp = now;
1545 }
1546
1547 OpSequencerRef osr;
1548 boost::intrusive::list_member_hook<> sequencer_item;
1549
1550 uint64_t bytes = 0, cost = 0;
1551
1552 set<OnodeRef> onodes; ///< these need to be updated/written
1553 set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
1554 set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
1555 set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
1556
1557 KeyValueDB::Transaction t; ///< then we will commit this
1558 Context *oncommit = nullptr; ///< signal on commit
1559 Context *onreadable = nullptr; ///< signal on readable
1560 Context *onreadable_sync = nullptr; ///< signal on readable
1561 list<Context*> oncommits; ///< more commit completions
1562 list<CollectionRef> removed_collections; ///< colls we removed
1563
1564 boost::intrusive::list_member_hook<> deferred_queue_item;
1565 bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any
1566
1567 interval_set<uint64_t> allocated, released;
1568 volatile_statfs statfs_delta;
1569
1570 IOContext ioc;
1571 bool had_ios = false; ///< true if we submitted IOs before our kv txn
1572
1573 uint64_t seq = 0;
1574 utime_t start;
1575 utime_t last_stamp;
1576
1577 uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated
1578 uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated
1579
1580 explicit TransContext(CephContext* cct, OpSequencer *o)
1581 : osr(o),
1582 ioc(cct, this),
1583 start(ceph_clock_now()) {
1584 last_stamp = start;
1585 }
1586 ~TransContext() {
1587 delete deferred_txn;
1588 }
1589
1590 void write_onode(OnodeRef &o) {
1591 onodes.insert(o);
1592 }
1593 void write_shared_blob(SharedBlobRef &sb) {
1594 shared_blobs.insert(sb);
1595 }
1596 void unshare_blob(SharedBlob *sb) {
1597 shared_blobs.erase(sb);
1598 }
1599
1600 /// note we logically modified object (when onode itself is unmodified)
1601 void note_modified_object(OnodeRef &o) {
1602 // onode itself isn't written, though
1603 modified_objects.insert(o);
1604 }
1605 void removed(OnodeRef& o) {
1606 onodes.erase(o);
1607 modified_objects.erase(o);
1608 }
1609
1610 void aio_finish(BlueStore *store) override {
1611 store->txc_aio_finish(this);
1612 }
1613 };
1614
1615 typedef boost::intrusive::list<
1616 TransContext,
1617 boost::intrusive::member_hook<
1618 TransContext,
1619 boost::intrusive::list_member_hook<>,
1620 &TransContext::deferred_queue_item> > deferred_queue_t;
1621
1622 struct DeferredBatch : public AioContext {
1623 OpSequencer *osr;
1624 struct deferred_io {
1625 bufferlist bl; ///< data
1626 uint64_t seq; ///< deferred transaction seq
1627 };
1628 map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
1629 deferred_queue_t txcs; ///< txcs in this batch
1630 IOContext ioc; ///< our aios
1631 /// bytes of pending io for each deferred seq (may be 0)
1632 map<uint64_t,int> seq_bytes;
1633
1634 void _discard(CephContext *cct, uint64_t offset, uint64_t length);
1635 void _audit(CephContext *cct);
1636
1637 DeferredBatch(CephContext *cct, OpSequencer *osr)
1638 : osr(osr), ioc(cct, this) {}
1639
1640 /// prepare a write
1641 void prepare_write(CephContext *cct,
1642 uint64_t seq, uint64_t offset, uint64_t length,
1643 bufferlist::const_iterator& p);
1644
1645 void aio_finish(BlueStore *store) override {
1646 store->_deferred_aio_finish(osr);
1647 }
1648 };
1649
1650 class OpSequencer : public Sequencer_impl {
1651 public:
1652 std::mutex qlock;
1653 std::condition_variable qcond;
1654 typedef boost::intrusive::list<
1655 TransContext,
1656 boost::intrusive::member_hook<
1657 TransContext,
1658 boost::intrusive::list_member_hook<>,
1659 &TransContext::sequencer_item> > q_list_t;
1660 q_list_t q; ///< transactions
1661
1662 boost::intrusive::list_member_hook<> deferred_osr_queue_item;
1663
1664 DeferredBatch *deferred_running = nullptr;
1665 DeferredBatch *deferred_pending = nullptr;
1666
1667 Sequencer *parent;
1668 BlueStore *store;
1669
1670 uint64_t last_seq = 0;
1671
1672 std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io
1673
1674 std::atomic_int kv_committing_serially = {0};
1675
1676 std::atomic_int kv_submitted_waiters = {0};
1677
1678 std::atomic_bool registered = {true}; ///< registered in BlueStore's osr_set
1679 std::atomic_bool zombie = {false}; ///< owning Sequencer has gone away
1680
1681 OpSequencer(CephContext* cct, BlueStore *store)
1682 : Sequencer_impl(cct),
1683 parent(NULL), store(store) {
1684 store->register_osr(this);
1685 }
1686 ~OpSequencer() override {
1687 assert(q.empty());
1688 _unregister();
1689 }
1690
1691 void discard() override {
1692 // Note that we may have txc's in flight when the parent Sequencer
1693 // goes away. Reflect this with zombie==registered==true and let
1694 // _osr_drain_all clean up later.
1695 assert(!zombie);
1696 zombie = true;
1697 parent = nullptr;
1698 bool empty;
1699 {
1700 std::lock_guard<std::mutex> l(qlock);
1701 empty = q.empty();
1702 }
1703 if (empty) {
1704 _unregister();
1705 }
1706 }
1707
1708 void _unregister() {
1709 if (registered) {
1710 store->unregister_osr(this);
1711 registered = false;
1712 }
1713 }
1714
1715 void queue_new(TransContext *txc) {
1716 std::lock_guard<std::mutex> l(qlock);
1717 txc->seq = ++last_seq;
1718 q.push_back(*txc);
1719 }
1720
1721 void drain() {
1722 std::unique_lock<std::mutex> l(qlock);
1723 while (!q.empty())
1724 qcond.wait(l);
1725 }
1726
1727 void drain_preceding(TransContext *txc) {
1728 std::unique_lock<std::mutex> l(qlock);
1729 while (!q.empty() && &q.front() != txc)
1730 qcond.wait(l);
1731 }
1732
1733 bool _is_all_kv_submitted() {
1734 // caller must hold qlock
1735 if (q.empty()) {
1736 return true;
1737 }
1738 TransContext *txc = &q.back();
1739 if (txc->state >= TransContext::STATE_KV_SUBMITTED) {
1740 return true;
1741 }
1742 return false;
1743 }
1744
1745 void flush() override {
1746 std::unique_lock<std::mutex> l(qlock);
1747 while (true) {
1748 // set flag before the check because the condition
1749 // may become true outside qlock, and we need to make
1750 // sure those threads see waiters and signal qcond.
1751 ++kv_submitted_waiters;
1752 if (_is_all_kv_submitted()) {
1753 return;
1754 }
1755 qcond.wait(l);
1756 --kv_submitted_waiters;
1757 }
1758 }
1759
1760 bool flush_commit(Context *c) override {
1761 std::lock_guard<std::mutex> l(qlock);
1762 if (q.empty()) {
1763 return true;
1764 }
1765 TransContext *txc = &q.back();
1766 if (txc->state >= TransContext::STATE_KV_DONE) {
1767 return true;
1768 }
1769 txc->oncommits.push_back(c);
1770 return false;
1771 }
1772 };
1773
1774 typedef boost::intrusive::list<
1775 OpSequencer,
1776 boost::intrusive::member_hook<
1777 OpSequencer,
1778 boost::intrusive::list_member_hook<>,
1779 &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;
1780
1781 struct KVSyncThread : public Thread {
1782 BlueStore *store;
1783 explicit KVSyncThread(BlueStore *s) : store(s) {}
1784 void *entry() override {
1785 store->_kv_sync_thread();
1786 return NULL;
1787 }
1788 };
1789 struct KVFinalizeThread : public Thread {
1790 BlueStore *store;
1791 explicit KVFinalizeThread(BlueStore *s) : store(s) {}
1792 void *entry() {
1793 store->_kv_finalize_thread();
1794 return NULL;
1795 }
1796 };
1797
1798 struct DBHistogram {
1799 struct value_dist {
1800 uint64_t count;
1801 uint32_t max_len;
1802 };
1803
1804 struct key_dist {
1805 uint64_t count;
1806 uint32_t max_len;
1807 map<int, struct value_dist> val_map; ///< slab id to count and max value length
1808 };
1809
1810 map<string, map<int, struct key_dist> > key_hist;
1811 map<int, uint64_t> value_hist;
1812 int get_key_slab(size_t sz);
1813 string get_key_slab_to_range(int slab);
1814 int get_value_slab(size_t sz);
1815 string get_value_slab_to_range(int slab);
1816 void update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
1817 const string &prefix, size_t key_size, size_t value_size);
1818 void dump(Formatter *f);
1819 };
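 /*
  Usage sketch (illustrative; the slab boundaries live in the .cc
  implementation, not in this header): while iterating the kv store, each
  key/value pair is folded into the histogram along the lines of

    DBHistogram hist;
    hist.update_hist_entry(hist.key_hist, prefix, key.size(), value.length());
    hist.value_hist[hist.get_value_slab(value.length())]++;
    // ... after the scan:
    hist.dump(f);   // per-prefix distribution of key and value sizes
 */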
1820
1821 // --------------------------------------------------------
1822 // members
1823private:
1824 BlueFS *bluefs = nullptr;
1825 unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
1826 bool bluefs_single_shared_device = true;
1827 utime_t bluefs_last_balance;
1828
1829 KeyValueDB *db = nullptr;
1830 BlockDevice *bdev = nullptr;
1831 std::string freelist_type;
1832 FreelistManager *fm = nullptr;
1833 Allocator *alloc = nullptr;
1834 uuid_d fsid;
1835 int path_fd = -1; ///< open handle to $path
1836 int fsid_fd = -1; ///< open handle (locked) to $path/fsid
1837 bool mounted = false;
1838
1839 RWLock coll_lock = {"BlueStore::coll_lock"}; ///< rwlock to protect coll_map
1840 mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
1841
1842 vector<Cache*> cache_shards;
1843
1844 std::mutex osr_lock; ///< protect osr_set
1845 std::set<OpSequencerRef> osr_set; ///< set of all OpSequencers
1846
1847 std::atomic<uint64_t> nid_last = {0};
1848 std::atomic<uint64_t> nid_max = {0};
1849 std::atomic<uint64_t> blobid_last = {0};
1850 std::atomic<uint64_t> blobid_max = {0};
1851
1852 Throttle throttle_bytes; ///< submit to commit
1853 Throttle throttle_deferred_bytes; ///< submit to deferred complete
1854
1855 interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs
1856 interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
1857
1858 std::mutex deferred_lock;
1859 std::atomic<uint64_t> deferred_seq = {0};
1860 deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
1861 int deferred_queue_size = 0; ///< num txc's queued across all osrs
1862 atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
1863 Finisher deferred_finisher;
1864
1865 int m_finisher_num = 1;
1866 vector<Finisher*> finishers;
1867
1868 KVSyncThread kv_sync_thread;
1869 std::mutex kv_lock;
1870 std::condition_variable kv_cond;
3efd9988 1871 bool _kv_only = false;
31f18b77 1872 bool kv_sync_started = false;
7c673cae 1873 bool kv_stop = false;
31f18b77
FG
1874 bool kv_finalize_started = false;
1875 bool kv_finalize_stop = false;
7c673cae
FG
1876 deque<TransContext*> kv_queue; ///< ready, already submitted
1877 deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
1878 deque<TransContext*> kv_committing; ///< currently syncing
1879 deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
1880 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
1881
31f18b77
FG
1882 KVFinalizeThread kv_finalize_thread;
1883 std::mutex kv_finalize_lock;
1884 std::condition_variable kv_finalize_cond;
1885 deque<TransContext*> kv_committing_to_finalize; ///< pending finalization
1886 deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
1887
7c673cae
FG
1888 PerfCounters *logger = nullptr;
1889
7c673cae
FG
1890 list<CollectionRef> removed_collections;
1891
1892 RWLock debug_read_error_lock = {"BlueStore::debug_read_error_lock"};
1893 set<ghobject_t> debug_data_error_objects;
1894 set<ghobject_t> debug_mdata_error_objects;
1895
1896 std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
1897
1898 uint64_t block_size = 0; ///< block size of block device (power of 2)
1899 uint64_t block_mask = 0; ///< mask to get just the block offset
1900 size_t block_size_order = 0; ///< bits to shift to get block size
1901
1902 uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2)
7c673cae 1903 ///< bits for min_alloc_size
224ce89b 1904 uint8_t min_alloc_size_order = 0;
7c673cae
FG
1905 static_assert(std::numeric_limits<uint8_t>::max() >
1906 std::numeric_limits<decltype(min_alloc_size)>::digits,
1907 "not enough bits for min_alloc_size");
1908
7c673cae
FG
1909 ///< maximum allocation unit (power of 2)
1910 std::atomic<uint64_t> max_alloc_size = {0};
1911
224ce89b
WB
1912 ///< count of queued deferred ops that forces submission of the deferred batch
1913 std::atomic<int> deferred_batch_ops = {0};
1914
1915 ///< writes smaller than this are forced through the deferred path
1916 std::atomic<uint64_t> prefer_deferred_size = {0};
1917
7c673cae
FG
1918 ///< approx cost per io, in bytes
1919 std::atomic<uint64_t> throttle_cost_per_io = {0};
1920
224ce89b
WB
1921 std::atomic<Compressor::CompressionMode> comp_mode =
1922 {Compressor::COMP_NONE}; ///< compression mode
7c673cae
FG
1923 CompressorRef compressor;
1924 std::atomic<uint64_t> comp_min_blob_size = {0};
1925 std::atomic<uint64_t> comp_max_blob_size = {0};
1926
1927 std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
1928
31f18b77
FG
1929 uint64_t kv_ios = 0;
1930 uint64_t kv_throttle_costs = 0;
1931
7c673cae 1932 // cache trim control
91327a77
AA
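  // These values are presumably populated from the corresponding
  // bluestore_cache_* and osd_memory_* configuration options and drive the
  // MempoolThread's cache trimming / autotuning below.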
1933 uint64_t cache_size = 0; ///< total cache size
1934 double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
1935 double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
1936 double cache_data_ratio = 0; ///< cache ratio dedicated to object data
1937 bool cache_autotune = false; ///< cache autotune setting
1938 uint64_t cache_autotune_chunk_size = 0; ///< cache autotune chunk size
1939 double cache_autotune_interval = 0; ///< time to wait between cache rebalancing passes
1940 uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
1941 uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
1942 double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
1943 uint64_t osd_memory_cache_min = 0; ///< min memory to assign when autotuning cache
1944 double osd_memory_cache_resize_interval = 0; ///< time to wait between cache resizes
31f18b77
FG
1945 std::mutex vstatfs_lock;
1946 volatile_statfs vstatfs;
7c673cae
FG
1947
1948 struct MempoolThread : public Thread {
91327a77 1949 public:
7c673cae 1950 BlueStore *store;
91327a77 1951
7c673cae
FG
1952 Cond cond;
1953 Mutex lock;
1954 bool stop = false;
91327a77
AA
1955 uint64_t autotune_cache_size = 0;
1956
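    // Adapter exposing BlueStore's mempool usage to the PriorityCache
    // balancer: the balancer asks each cache how many bytes it wants per
    // priority (request_cache_bytes), assigns bytes (set_cache_bytes /
    // add_cache_bytes) and finally commits the result (commit_cache_size).
    // Illustrative sketch of one balancing pass (not the actual caller):
    //
    //   std::list<PriorityCache::PriCache*> caches = { &meta_cache, &data_cache };
    //   int64_t avail = autotune_cache_size;
    //   _balance_cache_pri(&avail, caches, PriorityCache::Priority::LAST);
    //   for (auto c : caches) c->commit_cache_size();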
1957 struct MempoolCache : public PriorityCache::PriCache {
1958 BlueStore *store;
1959 int64_t cache_bytes[PriorityCache::Priority::LAST+1];
1960 double cache_ratio = 0;
1961
1962 MempoolCache(BlueStore *s) : store(s) {};
1963
1964 virtual uint64_t _get_used_bytes() const = 0;
1965
1966 virtual int64_t request_cache_bytes(
1967 PriorityCache::Priority pri, uint64_t chunk_bytes) const {
1968 int64_t assigned = get_cache_bytes(pri);
1969
1970 switch (pri) {
1971 // All cache items are currently shoved into the LAST priority
1972 case PriorityCache::Priority::LAST:
1973 {
1974 uint64_t usage = _get_used_bytes();
1975 int64_t request = PriorityCache::get_chunk(usage, chunk_bytes);
1976 return (request > assigned) ? request - assigned : 0;
1977 }
1978 default:
1979 break;
1980 }
1981 return -EOPNOTSUPP;
1982 }
1983
1984 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
1985 return cache_bytes[pri];
1986 }
1987 virtual int64_t get_cache_bytes() const {
1988 int64_t total = 0;
1989
1990 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
1991 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
1992 total += get_cache_bytes(pri);
1993 }
1994 return total;
1995 }
1996 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
1997 cache_bytes[pri] = bytes;
1998 }
1999 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2000 cache_bytes[pri] += bytes;
2001 }
2002 virtual int64_t commit_cache_size() {
2003 return get_cache_bytes();
2004 }
2005 virtual double get_cache_ratio() const {
2006 return cache_ratio;
2007 }
2008 virtual void set_cache_ratio(double ratio) {
2009 cache_ratio = ratio;
2010 }
2011 virtual string get_cache_name() const = 0;
2012 };
2013
2014 struct MetaCache : public MempoolCache {
2015 MetaCache(BlueStore *s) : MempoolCache(s) {};
2016
2017 virtual uint64_t _get_used_bytes() const {
2018 return mempool::bluestore_cache_other::allocated_bytes() +
2019 mempool::bluestore_cache_onode::allocated_bytes();
2020 }
2021
2022 virtual string get_cache_name() const {
2023 return "BlueStore Meta Cache";
2024 }
2025
2026 uint64_t _get_num_onodes() const {
2027 uint64_t onode_num =
2028 mempool::bluestore_cache_onode::allocated_items();
2029 return (2 > onode_num) ? 2 : onode_num;
2030 }
2031
2032 double get_bytes_per_onode() const {
2033 return (double)_get_used_bytes() / (double)_get_num_onodes();
2034 }
2035 } meta_cache;
2036
2037 struct DataCache : public MempoolCache {
2038 DataCache(BlueStore *s) : MempoolCache(s) {};
2039
2040 virtual uint64_t _get_used_bytes() const {
2041 uint64_t bytes = 0;
2042 for (auto i : store->cache_shards) {
2043 bytes += i->_get_buffer_bytes();
2044 }
2045 return bytes;
2046 }
2047 virtual string get_cache_name() const {
2048 return "BlueStore Data Cache";
2049 }
2050 } data_cache;
2051
7c673cae
FG
2052 public:
2053 explicit MempoolThread(BlueStore *s)
2054 : store(s),
91327a77
AA
2055 lock("BlueStore::MempoolThread::lock"),
2056 meta_cache(MetaCache(s)),
2057 data_cache(DataCache(s)) {}
2058
7c673cae
FG
2059 void *entry() override;
2060 void init() {
2061 assert(stop == false);
2062 create("bstore_mempool");
2063 }
2064 void shutdown() {
2065 lock.Lock();
2066 stop = true;
2067 cond.Signal();
2068 lock.Unlock();
2069 join();
2070 }
91327a77
AA
2071
2072 private:
2073 void _adjust_cache_settings();
2074 void _trim_shards(bool interval_stats);
2075 void _tune_cache_size(bool interval_stats);
2076 void _balance_cache(const std::list<PriorityCache::PriCache *>& caches);
2077 void _balance_cache_pri(int64_t *mem_avail,
2078 const std::list<PriorityCache::PriCache *>& caches,
2079 PriorityCache::Priority pri);
7c673cae
FG
2080 } mempool_thread;
2081
2082 // --------------------------------------------------------
2083 // private methods
2084
2085 void _init_logger();
2086 void _shutdown_logger();
2087 int _reload_logger();
2088
2089 int _open_path();
2090 void _close_path();
2091 int _open_fsid(bool create);
2092 int _lock_fsid();
2093 int _read_fsid(uuid_d *f);
2094 int _write_fsid();
2095 void _close_fsid();
2096 void _set_alloc_sizes();
2097 void _set_blob_size();
1adf2230 2098 void _set_finisher_num();
7c673cae
FG
2099
2100 int _open_bdev(bool create);
2101 void _close_bdev();
2102 int _open_db(bool create);
2103 void _close_db();
2104 int _open_fm(bool create);
2105 void _close_fm();
2106 int _open_alloc();
2107 void _close_alloc();
2108 int _open_collections(int *errors=0);
2109 void _close_collections();
2110
2111 int _setup_block_symlink_or_file(string name, string path, uint64_t size,
2112 bool create);
2113
7c673cae 2114public:
3efd9988
FG
2115 static int _write_bdev_label(CephContext* cct,
2116 string path, bluestore_bdev_label_t label);
7c673cae
FG
2117 static int _read_bdev_label(CephContext* cct, string path,
2118 bluestore_bdev_label_t *label);
2119private:
2120 int _check_or_set_bdev_label(string path, uint64_t size, string desc,
2121 bool create);
2122
2123 int _open_super_meta();
2124
224ce89b 2125 void _open_statfs();
31f18b77 2126
7c673cae
FG
2127 int _reconcile_bluefs_freespace();
2128 int _balance_bluefs_freespace(PExtentVector *extents);
2129 void _commit_bluefs_freespace(const PExtentVector& extents);
2130
2131 CollectionRef _get_collection(const coll_t& cid);
2132 void _queue_reap_collection(CollectionRef& c);
2133 void _reap_collections();
2134 void _update_cache_logger();
2135
2136 void _assign_nid(TransContext *txc, OnodeRef o);
2137 uint64_t _assign_blobid(TransContext *txc);
2138
94b18763 2139 void _dump_onode(const OnodeRef& o, int log_level=30);
7c673cae
FG
2140 void _dump_extent_map(ExtentMap& em, int log_level=30);
2141 void _dump_transaction(Transaction *t, int log_level = 30);
2142
2143 TransContext *_txc_create(OpSequencer *osr);
2144 void _txc_update_store_statfs(TransContext *txc);
2145 void _txc_add_transaction(TransContext *txc, Transaction *t);
2146 void _txc_calc_cost(TransContext *txc);
2147 void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
2148 void _txc_state_proc(TransContext *txc);
2149 void _txc_aio_submit(TransContext *txc);
2150public:
2151 void txc_aio_finish(void *p) {
2152 _txc_state_proc(static_cast<TransContext*>(p));
2153 }
2154private:
2155 void _txc_finish_io(TransContext *txc);
2156 void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
2157 void _txc_applied_kv(TransContext *txc);
2158 void _txc_committed_kv(TransContext *txc);
2159 void _txc_finish(TransContext *txc);
2160 void _txc_release_alloc(TransContext *txc);
2161
2162 void _osr_drain_preceding(TransContext *txc);
2163 void _osr_drain_all();
2164 void _osr_unregister_all();
2165
31f18b77
FG
2166 void _kv_start();
2167 void _kv_stop();
7c673cae 2168 void _kv_sync_thread();
31f18b77 2169 void _kv_finalize_thread();
7c673cae
FG
2170
2171 bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
2172 void _deferred_queue(TransContext *txc);
3efd9988 2173public:
224ce89b 2174 void deferred_try_submit();
3efd9988 2175private:
224ce89b 2176 void _deferred_submit_unlock(OpSequencer *osr);
7c673cae
FG
2177 void _deferred_aio_finish(OpSequencer *osr);
2178 int _deferred_replay();
2179
2180public:
2181 using mempool_dynamic_bitset =
2182 boost::dynamic_bitset<uint64_t,
2183 mempool::bluestore_fsck::pool_allocator<uint64_t>>;
2184
2185private:
2186 int _fsck_check_extents(
2187 const ghobject_t& oid,
2188 const PExtentVector& extents,
2189 bool compressed,
2190 mempool_dynamic_bitset &used_blocks,
b32b8144 2191 uint64_t granularity,
7c673cae
FG
2192 store_statfs_t& expected_statfs);
2193
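  // Write-through helper: stages the new data in the blob's shared buffer
  // cache and records the shared blob in the txc so its buffers can be
  // finalized (or discarded) once the transaction commits.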
2194 void _buffer_cache_write(
2195 TransContext *txc,
2196 BlobRef b,
2197 uint64_t offset,
2198 bufferlist& bl,
2199 unsigned flags) {
2200 b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
2201 flags);
2202 txc->shared_blobs_written.insert(b->shared_blob);
2203 }
2204
2205 int _collection_list(
2206 Collection *c, const ghobject_t& start, const ghobject_t& end,
2207 int max, vector<ghobject_t> *ls, ghobject_t *next);
2208
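  // Prefer an optional override supplied by the functor over the global
  // default val1. Hypothetical usage, for illustration only:
  //
  //   uint64_t blob_size = select_option(
  //     "compression_max_blob_size", comp_max_blob_size.load(),
  //     [&]() -> boost::optional<uint64_t> {
  //       return boost::optional<uint64_t>();  // no per-pool override
  //     });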
2209 template <typename T, typename F>
2210 T select_option(const std::string& opt_name, T val1, F f) {
2211 //NB: opt_name reserved for future use
2212 boost::optional<T> val2 = f();
2213 if (val2) {
2214 return *val2;
2215 }
2216 return val1;
2217 }
2218
2219 void _apply_padding(uint64_t head_pad,
2220 uint64_t tail_pad,
7c673cae
FG
2221 bufferlist& padded);
2222
2223 // -- ondisk version ---
2224public:
2225 const int32_t latest_ondisk_format = 2; ///< our version
2226 const int32_t min_readable_ondisk_format = 1; ///< what we can read
2227 const int32_t min_compat_ondisk_format = 2; ///< who can read us
2228
2229private:
2230 int32_t ondisk_format = 0; ///< value detected on mount
2231
2232 int _upgrade_super(); ///< upgrade (called during open_super)
2233 void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
2234
2235 // --- public interface ---
2236public:
2237 BlueStore(CephContext *cct, const string& path);
2238 BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // ctor for unit tests only
2239 ~BlueStore() override;
2240
2241 string get_type() override {
2242 return "bluestore";
2243 }
2244
2245 bool needs_journal() override { return false; };
2246 bool wants_journal() override { return false; };
2247 bool allows_journal() override { return false; };
2248
31f18b77 2249 bool is_rotational() override;
d2e6a577 2250 bool is_journal_rotational() override;
31f18b77 2251
224ce89b
WB
2252 string get_default_device_class() override {
2253 string device_class;
2254 map<string, string> metadata;
2255 collect_metadata(&metadata);
2256 auto it = metadata.find("bluestore_bdev_type");
2257 if (it != metadata.end()) {
2258 device_class = it->second;
2259 }
2260 return device_class;
2261 }
2262
7c673cae
FG
2263 static int get_block_device_fsid(CephContext* cct, const string& path,
2264 uuid_d *fsid);
2265
2266 bool test_mount_in_use() override;
2267
2268private:
2269 int _mount(bool kv_only);
2270public:
2271 int mount() override {
2272 return _mount(false);
2273 }
2274 int umount() override;
2275
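  // Mount just enough of the store to expose the underlying KeyValueDB
  // (sets _kv_only); useful for offline inspection tooling rather than
  // normal OSD operation.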
2276 int start_kv_only(KeyValueDB **pdb) {
2277 int r = _mount(true);
2278 if (r < 0)
2279 return r;
2280 *pdb = db;
2281 return 0;
2282 }
2283
3efd9988
FG
2284 int write_meta(const std::string& key, const std::string& value) override;
2285 int read_meta(const std::string& key, std::string *value) override;
2286
2287
2288 int fsck(bool deep) override {
2289 return _fsck(deep, false);
2290 }
2291 int repair(bool deep) override {
2292 return _fsck(deep, true);
2293 }
2294 int _fsck(bool deep, bool repair);
7c673cae
FG
2295
2296 void set_cache_shards(unsigned num) override;
2297
2298 int validate_hobject_key(const hobject_t &obj) const override {
2299 return 0;
2300 }
2301 unsigned get_max_attr_name_length() override {
2302 return 256; // arbitrary; there is no real limit internally
2303 }
2304
2305 int mkfs() override;
2306 int mkjournal() override {
2307 return 0;
2308 }
2309
2310 void get_db_statistics(Formatter *f) override;
2311 void generate_db_histogram(Formatter *f) override;
31f18b77 2312 void _flush_cache();
7c673cae
FG
2313 void flush_cache() override;
2314 void dump_perf_counters(Formatter *f) override {
2315 f->open_object_section("perf_counters");
2316 logger->dump_formatted(f, false);
2317 f->close_section();
2318 }
2319
2320 void register_osr(OpSequencer *osr) {
2321 std::lock_guard<std::mutex> l(osr_lock);
2322 osr_set.insert(osr);
2323 }
2324 void unregister_osr(OpSequencer *osr) {
2325 std::lock_guard<std::mutex> l(osr_lock);
2326 osr_set.erase(osr);
2327 }
2328
2329public:
2330 int statfs(struct store_statfs_t *buf) override;
2331
2332 void collect_metadata(map<string,string> *pm) override;
2333
2334 bool exists(const coll_t& cid, const ghobject_t& oid) override;
2335 bool exists(CollectionHandle &c, const ghobject_t& oid) override;
2336 int set_collection_opts(
2337 const coll_t& cid,
2338 const pool_opts_t& opts) override;
2339 int stat(
2340 const coll_t& cid,
2341 const ghobject_t& oid,
2342 struct stat *st,
2343 bool allow_eio = false) override;
2344 int stat(
2345 CollectionHandle &c,
2346 const ghobject_t& oid,
2347 struct stat *st,
2348 bool allow_eio = false) override;
2349 int read(
2350 const coll_t& cid,
2351 const ghobject_t& oid,
2352 uint64_t offset,
2353 size_t len,
2354 bufferlist& bl,
224ce89b 2355 uint32_t op_flags = 0) override;
7c673cae
FG
2356 int read(
2357 CollectionHandle &c,
2358 const ghobject_t& oid,
2359 uint64_t offset,
2360 size_t len,
2361 bufferlist& bl,
224ce89b 2362 uint32_t op_flags = 0) override;
7c673cae
FG
2363 int _do_read(
2364 Collection *c,
2365 OnodeRef o,
2366 uint64_t offset,
2367 size_t len,
2368 bufferlist& bl,
2369 uint32_t op_flags = 0);
2370
2371private:
2372 int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
2373 uint64_t offset, size_t len, interval_set<uint64_t>& destset);
2374public:
2375 int fiemap(const coll_t& cid, const ghobject_t& oid,
2376 uint64_t offset, size_t len, bufferlist& bl) override;
2377 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2378 uint64_t offset, size_t len, bufferlist& bl) override;
2379 int fiemap(const coll_t& cid, const ghobject_t& oid,
2380 uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
2381 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2382 uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
2383
2384
2385 int getattr(const coll_t& cid, const ghobject_t& oid, const char *name,
2386 bufferptr& value) override;
2387 int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
2388 bufferptr& value) override;
2389
2390 int getattrs(const coll_t& cid, const ghobject_t& oid,
2391 map<string,bufferptr>& aset) override;
2392 int getattrs(CollectionHandle &c, const ghobject_t& oid,
2393 map<string,bufferptr>& aset) override;
2394
2395 int list_collections(vector<coll_t>& ls) override;
2396
2397 CollectionHandle open_collection(const coll_t &c) override;
2398
2399 bool collection_exists(const coll_t& c) override;
2400 int collection_empty(const coll_t& c, bool *empty) override;
2401 int collection_bits(const coll_t& c) override;
2402
2403 int collection_list(const coll_t& cid,
2404 const ghobject_t& start,
2405 const ghobject_t& end,
2406 int max,
2407 vector<ghobject_t> *ls, ghobject_t *next) override;
2408 int collection_list(CollectionHandle &c,
2409 const ghobject_t& start,
2410 const ghobject_t& end,
2411 int max,
2412 vector<ghobject_t> *ls, ghobject_t *next) override;
2413
2414 int omap_get(
2415 const coll_t& cid, ///< [in] Collection containing oid
2416 const ghobject_t &oid, ///< [in] Object containing omap
2417 bufferlist *header, ///< [out] omap header
2418 map<string, bufferlist> *out ///< [out] Key to value map
2419 ) override;
2420 int omap_get(
2421 CollectionHandle &c, ///< [in] Collection containing oid
2422 const ghobject_t &oid, ///< [in] Object containing omap
2423 bufferlist *header, ///< [out] omap header
2424 map<string, bufferlist> *out ///< [out] Key to value map
2425 ) override;
2426
2427 /// Get omap header
2428 int omap_get_header(
2429 const coll_t& cid, ///< [in] Collection containing oid
2430 const ghobject_t &oid, ///< [in] Object containing omap
2431 bufferlist *header, ///< [out] omap header
2432 bool allow_eio = false ///< [in] don't assert on eio
2433 ) override;
2434 int omap_get_header(
2435 CollectionHandle &c, ///< [in] Collection containing oid
2436 const ghobject_t &oid, ///< [in] Object containing omap
2437 bufferlist *header, ///< [out] omap header
2438 bool allow_eio = false ///< [in] don't assert on eio
2439 ) override;
2440
2441 /// Get keys defined on oid
2442 int omap_get_keys(
2443 const coll_t& cid, ///< [in] Collection containing oid
2444 const ghobject_t &oid, ///< [in] Object containing omap
2445 set<string> *keys ///< [out] Keys defined on oid
2446 ) override;
2447 int omap_get_keys(
2448 CollectionHandle &c, ///< [in] Collection containing oid
2449 const ghobject_t &oid, ///< [in] Object containing omap
2450 set<string> *keys ///< [out] Keys defined on oid
2451 ) override;
2452
2453 /// Get key values
2454 int omap_get_values(
2455 const coll_t& cid, ///< [in] Collection containing oid
2456 const ghobject_t &oid, ///< [in] Object containing omap
2457 const set<string> &keys, ///< [in] Keys to get
2458 map<string, bufferlist> *out ///< [out] Returned keys and values
2459 ) override;
2460 int omap_get_values(
2461 CollectionHandle &c, ///< [in] Collection containing oid
2462 const ghobject_t &oid, ///< [in] Object containing omap
2463 const set<string> &keys, ///< [in] Keys to get
2464 map<string, bufferlist> *out ///< [out] Returned keys and values
2465 ) override;
2466
2467 /// Filters keys into out which are defined on oid
2468 int omap_check_keys(
2469 const coll_t& cid, ///< [in] Collection containing oid
2470 const ghobject_t &oid, ///< [in] Object containing omap
2471 const set<string> &keys, ///< [in] Keys to check
2472 set<string> *out ///< [out] Subset of keys defined on oid
2473 ) override;
2474 int omap_check_keys(
2475 CollectionHandle &c, ///< [in] Collection containing oid
2476 const ghobject_t &oid, ///< [in] Object containing omap
2477 const set<string> &keys, ///< [in] Keys to check
2478 set<string> *out ///< [out] Subset of keys defined on oid
2479 ) override;
2480
2481 ObjectMap::ObjectMapIterator get_omap_iterator(
2482 const coll_t& cid, ///< [in] collection
2483 const ghobject_t &oid ///< [in] object
2484 ) override;
2485 ObjectMap::ObjectMapIterator get_omap_iterator(
2486 CollectionHandle &c, ///< [in] collection
2487 const ghobject_t &oid ///< [in] object
2488 ) override;
2489
2490 void set_fsid(uuid_d u) override {
2491 fsid = u;
2492 }
2493 uuid_d get_fsid() override {
2494 return fsid;
2495 }
2496
2497 uint64_t estimate_objects_overhead(uint64_t num_objects) override {
2498 return num_objects * 300; // assuming per-object overhead is 300 bytes
2499 }
2500
2501 struct BSPerfTracker {
2502 PerfCounters::avg_tracker<uint64_t> os_commit_latency;
2503 PerfCounters::avg_tracker<uint64_t> os_apply_latency;
2504
2505 objectstore_perf_stat_t get_cur_stats() const {
2506 objectstore_perf_stat_t ret;
c07f9fc5
FG
2507 ret.os_commit_latency = os_commit_latency.current_avg();
2508 ret.os_apply_latency = os_apply_latency.current_avg();
7c673cae
FG
2509 return ret;
2510 }
2511
2512 void update_from_perfcounters(PerfCounters &logger);
2513 } perf_tracker;
2514
2515 objectstore_perf_stat_t get_cur_stats() override {
2516 perf_tracker.update_from_perfcounters(*logger);
2517 return perf_tracker.get_cur_stats();
2518 }
2519 const PerfCounters* get_perf_counters() const override {
2520 return logger;
2521 }
2522
2523 int queue_transactions(
2524 Sequencer *osr,
2525 vector<Transaction>& tls,
2526 TrackedOpRef op = TrackedOpRef(),
2527 ThreadPool::TPHandle *handle = NULL) override;
2528
2529 // error injection
2530 void inject_data_error(const ghobject_t& o) override {
2531 RWLock::WLocker l(debug_read_error_lock);
2532 debug_data_error_objects.insert(o);
2533 }
2534 void inject_mdata_error(const ghobject_t& o) override {
2535 RWLock::WLocker l(debug_read_error_lock);
2536 debug_mdata_error_objects.insert(o);
2537 }
224ce89b
WB
2538 void compact() override {
2539 assert(db);
2540 db->compact();
2541 }
28e407b8
AA
2542 bool has_builtin_csum() const override {
2543 return true;
2544 }
2545
7c673cae
FG
2546private:
2547 bool _debug_data_eio(const ghobject_t& o) {
2548 if (!cct->_conf->bluestore_debug_inject_read_err) {
2549 return false;
2550 }
2551 RWLock::RLocker l(debug_read_error_lock);
2552 return debug_data_error_objects.count(o);
2553 }
2554 bool _debug_mdata_eio(const ghobject_t& o) {
2555 if (!cct->_conf->bluestore_debug_inject_read_err) {
2556 return false;
2557 }
2558 RWLock::RLocker l(debug_read_error_lock);
2559 return debug_mdata_error_objects.count(o);
2560 }
2561 void _debug_obj_on_delete(const ghobject_t& o) {
2562 if (cct->_conf->bluestore_debug_inject_read_err) {
2563 RWLock::WLocker l(debug_read_error_lock);
2564 debug_data_error_objects.erase(o);
2565 debug_mdata_error_objects.erase(o);
2566 }
2567 }
2568
2569private:
2570
2571 // --------------------------------------------------------
2572 // read processing internal methods
2573 int _verify_csum(
2574 OnodeRef& o,
2575 const bluestore_blob_t* blob,
2576 uint64_t blob_xoffset,
2577 const bufferlist& bl,
2578 uint64_t logical_offset) const;
2579 int _decompress(bufferlist& source, bufferlist* result);
2580
2581
2582 // --------------------------------------------------------
2583 // write ops
2584
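  // Per-write bookkeeping: carries the options chosen by
  // _choose_write_options (buffered / compress / target_blob_size /
  // csum_order), the list of blobs being written (write_item), and the old
  // extents that must be dereferenced once the new data is in place.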
2585 struct WriteContext {
2586 bool buffered = false; ///< buffered write
2587 bool compress = false; ///< compressed write
2588 uint64_t target_blob_size = 0; ///< target (max) blob size
2589 unsigned csum_order = 0; ///< target checksum chunk order
2590
2591 old_extent_map_t old_extents; ///< must deref these blobs
2592
2593 struct write_item {
2594 uint64_t logical_offset; ///< write logical offset
2595 BlobRef b;
2596 uint64_t blob_length;
2597 uint64_t b_off;
2598 bufferlist bl;
2599 uint64_t b_off0; ///< original offset in a blob prior to padding
2600 uint64_t length0; ///< original data length prior to padding
2601
2602 bool mark_unused;
2603 bool new_blob; ///< whether new blob was created
2604
3efd9988
FG
2605 bool compressed = false;
2606 bufferlist compressed_bl;
2607 size_t compressed_len = 0;
2608
7c673cae
FG
2609 write_item(
2610 uint64_t logical_offs,
2611 BlobRef b,
2612 uint64_t blob_len,
2613 uint64_t o,
2614 bufferlist& bl,
2615 uint64_t o0,
2616 uint64_t l0,
2617 bool _mark_unused,
2618 bool _new_blob)
2619 :
2620 logical_offset(logical_offs),
2621 b(b),
2622 blob_length(blob_len),
2623 b_off(o),
2624 bl(bl),
2625 b_off0(o0),
2626 length0(l0),
2627 mark_unused(_mark_unused),
2628 new_blob(_new_blob) {}
2629 };
2630 vector<write_item> writes; ///< blobs we're writing
2631
2632 /// partial clone of the context
2633 void fork(const WriteContext& other) {
2634 buffered = other.buffered;
2635 compress = other.compress;
2636 target_blob_size = other.target_blob_size;
2637 csum_order = other.csum_order;
2638 }
2639 void write(
2640 uint64_t loffs,
2641 BlobRef b,
2642 uint64_t blob_len,
2643 uint64_t o,
2644 bufferlist& bl,
2645 uint64_t o0,
2646 uint64_t len0,
2647 bool _mark_unused,
2648 bool _new_blob) {
2649 writes.emplace_back(loffs,
2650 b,
2651 blob_len,
2652 o,
2653 bl,
2654 o0,
2655 len0,
2656 _mark_unused,
2657 _new_blob);
2658 }
2659 /// Checks for writes to the same pextent within a blob
2660 bool has_conflict(
2661 BlobRef b,
2662 uint64_t loffs,
2663 uint64_t loffs_end,
2664 uint64_t min_alloc_size);
2665 };
2666
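  // Write-path helpers; roughly: _do_write_small covers ranges below
  // min_alloc_size (small overwrites may become deferred IO), _do_write_big
  // covers min_alloc_size-aligned spans, and _do_alloc_write allocates space
  // and applies compression/checksums for the collected write_items.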
2667 void _do_write_small(
2668 TransContext *txc,
2669 CollectionRef &c,
2670 OnodeRef o,
2671 uint64_t offset, uint64_t length,
2672 bufferlist::iterator& blp,
2673 WriteContext *wctx);
2674 void _do_write_big(
2675 TransContext *txc,
2676 CollectionRef &c,
2677 OnodeRef o,
2678 uint64_t offset, uint64_t length,
2679 bufferlist::iterator& blp,
2680 WriteContext *wctx);
2681 int _do_alloc_write(
2682 TransContext *txc,
2683 CollectionRef c,
2684 OnodeRef o,
2685 WriteContext *wctx);
2686 void _wctx_finish(
2687 TransContext *txc,
2688 CollectionRef& c,
2689 OnodeRef o,
31f18b77
FG
2690 WriteContext *wctx,
2691 set<SharedBlob*> *maybe_unshared_blobs=0);
7c673cae
FG
2692
2693 int _do_transaction(Transaction *t,
2694 TransContext *txc,
2695 ThreadPool::TPHandle *handle);
2696
2697 int _write(TransContext *txc,
2698 CollectionRef& c,
2699 OnodeRef& o,
2700 uint64_t offset, size_t len,
2701 bufferlist& bl,
2702 uint32_t fadvise_flags);
2703 void _pad_zeros(bufferlist *bl, uint64_t *offset,
2704 uint64_t chunk_size);
2705
31f18b77
FG
2706 void _choose_write_options(CollectionRef& c,
2707 OnodeRef o,
2708 uint32_t fadvise_flags,
2709 WriteContext *wctx);
2710
2711 int _do_gc(TransContext *txc,
2712 CollectionRef& c,
2713 OnodeRef o,
2714 const GarbageCollector& gc,
2715 const WriteContext& wctx,
2716 uint64_t *dirty_start,
2717 uint64_t *dirty_end);
2718
7c673cae
FG
2719 int _do_write(TransContext *txc,
2720 CollectionRef &c,
2721 OnodeRef o,
2722 uint64_t offset, uint64_t length,
2723 bufferlist& bl,
2724 uint32_t fadvise_flags);
2725 void _do_write_data(TransContext *txc,
2726 CollectionRef& c,
2727 OnodeRef o,
2728 uint64_t offset,
2729 uint64_t length,
2730 bufferlist& bl,
2731 WriteContext *wctx);
2732
2733 int _touch(TransContext *txc,
2734 CollectionRef& c,
2735 OnodeRef& o);
2736 int _do_zero(TransContext *txc,
2737 CollectionRef& c,
2738 OnodeRef& o,
2739 uint64_t offset, size_t len);
2740 int _zero(TransContext *txc,
2741 CollectionRef& c,
2742 OnodeRef& o,
2743 uint64_t offset, size_t len);
2744 void _do_truncate(TransContext *txc,
2745 CollectionRef& c,
2746 OnodeRef o,
31f18b77
FG
2747 uint64_t offset,
2748 set<SharedBlob*> *maybe_unshared_blobs=0);
35e4c445 2749 int _truncate(TransContext *txc,
7c673cae
FG
2750 CollectionRef& c,
2751 OnodeRef& o,
2752 uint64_t offset);
2753 int _remove(TransContext *txc,
2754 CollectionRef& c,
2755 OnodeRef& o);
2756 int _do_remove(TransContext *txc,
2757 CollectionRef& c,
2758 OnodeRef o);
2759 int _setattr(TransContext *txc,
2760 CollectionRef& c,
2761 OnodeRef& o,
2762 const string& name,
2763 bufferptr& val);
2764 int _setattrs(TransContext *txc,
2765 CollectionRef& c,
2766 OnodeRef& o,
2767 const map<string,bufferptr>& aset);
2768 int _rmattr(TransContext *txc,
2769 CollectionRef& c,
2770 OnodeRef& o,
2771 const string& name);
2772 int _rmattrs(TransContext *txc,
2773 CollectionRef& c,
2774 OnodeRef& o);
2775 void _do_omap_clear(TransContext *txc, uint64_t id);
2776 int _omap_clear(TransContext *txc,
2777 CollectionRef& c,
2778 OnodeRef& o);
2779 int _omap_setkeys(TransContext *txc,
2780 CollectionRef& c,
2781 OnodeRef& o,
2782 bufferlist& bl);
2783 int _omap_setheader(TransContext *txc,
2784 CollectionRef& c,
2785 OnodeRef& o,
2786 bufferlist& header);
2787 int _omap_rmkeys(TransContext *txc,
2788 CollectionRef& c,
2789 OnodeRef& o,
2790 bufferlist& bl);
2791 int _omap_rmkey_range(TransContext *txc,
2792 CollectionRef& c,
2793 OnodeRef& o,
2794 const string& first, const string& last);
2795 int _set_alloc_hint(
2796 TransContext *txc,
2797 CollectionRef& c,
2798 OnodeRef& o,
2799 uint64_t expected_object_size,
2800 uint64_t expected_write_size,
2801 uint32_t flags);
2802 int _do_clone_range(TransContext *txc,
2803 CollectionRef& c,
2804 OnodeRef& oldo,
2805 OnodeRef& newo,
2806 uint64_t srcoff, uint64_t length, uint64_t dstoff);
2807 int _clone(TransContext *txc,
2808 CollectionRef& c,
2809 OnodeRef& oldo,
2810 OnodeRef& newo);
2811 int _clone_range(TransContext *txc,
2812 CollectionRef& c,
2813 OnodeRef& oldo,
2814 OnodeRef& newo,
2815 uint64_t srcoff, uint64_t length, uint64_t dstoff);
2816 int _rename(TransContext *txc,
2817 CollectionRef& c,
2818 OnodeRef& oldo,
2819 OnodeRef& newo,
2820 const ghobject_t& new_oid);
2821 int _create_collection(TransContext *txc, const coll_t &cid,
2822 unsigned bits, CollectionRef *c);
2823 int _remove_collection(TransContext *txc, const coll_t &cid,
2824 CollectionRef *c);
2825 int _split_collection(TransContext *txc,
2826 CollectionRef& c,
2827 CollectionRef& d,
2828 unsigned bits, int rem);
2829};
2830
2831inline ostream& operator<<(ostream& out, const BlueStore::OpSequencer& s) {
2832 return out << *s.parent;
2833}
2834
2835static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
2836 o->get();
2837}
2838static inline void intrusive_ptr_release(BlueStore::Onode *o) {
2839 o->put();
2840}
2841
2842static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
2843 o->get();
2844}
2845static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
2846 o->put();
2847}
2848
2849#endif