1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef CEPH_OSD_BLUESTORE_H
16#define CEPH_OSD_BLUESTORE_H
17
18#include "acconfig.h"
19
20#include <unistd.h>
21
22#include <atomic>
23#include <mutex>
24#include <condition_variable>
25
26#include <boost/intrusive/list.hpp>
27#include <boost/intrusive/unordered_set.hpp>
28#include <boost/intrusive/set.hpp>
29#include <boost/functional/hash.hpp>
30#include <boost/dynamic_bitset.hpp>
31
32#include "include/assert.h"
33#include "include/unordered_map.h"
34#include "include/memory.h"
35#include "include/mempool.h"
36#include "common/Finisher.h"
37#include "common/perf_counters.h"
38#include "compressor/Compressor.h"
39#include "os/ObjectStore.h"
40
41#include "bluestore_types.h"
42#include "BlockDevice.h"
43#include "common/EventTrace.h"
44
45class Allocator;
46class FreelistManager;
47class BlueFS;
48
49//#define DEBUG_CACHE
50//#define DEBUG_DEFERRED
51
52
53
54// constants for Buffer::optimize()
55#define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N
56
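// Illustrative note (not upstream code): with MAX_BUFFER_SLOP_RATIO_DEN == 8,
// Buffer::maybe_rebuild() below compacts its cached bufferlist once the front
// bufferptr wastes more than 1/8 of the cached length, e.g.:
//
//   // data.length() == 4096 and data.front().wasted() == 700:
//   // 700 > 4096 / MAX_BUFFER_SLOP_RATIO_DEN (= 512)  -> data.rebuild()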
57
58enum {
59 l_bluestore_first = 732430,
60 l_bluestore_kv_flush_lat,
61 l_bluestore_kv_commit_lat,
62 l_bluestore_kv_lat,
63 l_bluestore_state_prepare_lat,
64 l_bluestore_state_aio_wait_lat,
65 l_bluestore_state_io_done_lat,
66 l_bluestore_state_kv_queued_lat,
67 l_bluestore_state_kv_committing_lat,
68 l_bluestore_state_kv_done_lat,
69 l_bluestore_state_deferred_queued_lat,
70 l_bluestore_state_deferred_aio_wait_lat,
71 l_bluestore_state_deferred_cleanup_lat,
72 l_bluestore_state_finishing_lat,
73 l_bluestore_state_done_lat,
74 l_bluestore_throttle_lat,
75 l_bluestore_submit_lat,
76 l_bluestore_commit_lat,
77 l_bluestore_read_lat,
78 l_bluestore_read_onode_meta_lat,
79 l_bluestore_read_wait_aio_lat,
80 l_bluestore_compress_lat,
81 l_bluestore_decompress_lat,
82 l_bluestore_csum_lat,
83 l_bluestore_compress_success_count,
84 l_bluestore_compress_rejected_count,
85 l_bluestore_write_pad_bytes,
86 l_bluestore_deferred_write_ops,
87 l_bluestore_deferred_write_bytes,
88 l_bluestore_write_penalty_read_ops,
89 l_bluestore_allocated,
90 l_bluestore_stored,
91 l_bluestore_compressed,
92 l_bluestore_compressed_allocated,
93 l_bluestore_compressed_original,
94 l_bluestore_onodes,
95 l_bluestore_onode_hits,
96 l_bluestore_onode_misses,
97 l_bluestore_onode_shard_hits,
98 l_bluestore_onode_shard_misses,
99 l_bluestore_extents,
100 l_bluestore_blobs,
101 l_bluestore_buffers,
102 l_bluestore_buffer_bytes,
103 l_bluestore_buffer_hit_bytes,
104 l_bluestore_buffer_miss_bytes,
105 l_bluestore_write_big,
106 l_bluestore_write_big_bytes,
107 l_bluestore_write_big_blobs,
108 l_bluestore_write_small,
109 l_bluestore_write_small_bytes,
110 l_bluestore_write_small_unused,
111 l_bluestore_write_small_deferred,
112 l_bluestore_write_small_pre_read,
113 l_bluestore_write_small_new,
114 l_bluestore_txc,
115 l_bluestore_onode_reshard,
116 l_bluestore_blob_split,
117 l_bluestore_extent_compress,
118 l_bluestore_gc_merged,
119 l_bluestore_last
120};
121
122class BlueStore : public ObjectStore,
123 public md_config_obs_t {
124 // -----------------------------------------------------
125 // types
126public:
127 // config observer
128 const char** get_tracked_conf_keys() const override;
129 void handle_conf_change(const struct md_config_t *conf,
130 const std::set<std::string> &changed) override;
131
132 void _set_csum();
133 void _set_compression();
134 void _set_throttle_params();
 135 int _set_cache_sizes();
136
137 class TransContext;
138
139 typedef map<uint64_t, bufferlist> ready_regions_t;
140
141 struct BufferSpace;
142 struct Collection;
143 typedef boost::intrusive_ptr<Collection> CollectionRef;
144
145 struct AioContext {
146 virtual void aio_finish(BlueStore *store) = 0;
147 virtual ~AioContext() {}
148 };
149
150 /// cached buffer
151 struct Buffer {
152 MEMPOOL_CLASS_HELPERS();
153
154 enum {
155 STATE_EMPTY, ///< empty buffer -- used for cache history
156 STATE_CLEAN, ///< clean data that is up to date
157 STATE_WRITING, ///< data that is being written (io not yet complete)
158 };
159 static const char *get_state_name(int s) {
160 switch (s) {
161 case STATE_EMPTY: return "empty";
162 case STATE_CLEAN: return "clean";
163 case STATE_WRITING: return "writing";
164 default: return "???";
165 }
166 }
167 enum {
168 FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN)
169 // NOTE: fix operator<< when you define a second flag
170 };
171 static const char *get_flag_name(int s) {
172 switch (s) {
173 case FLAG_NOCACHE: return "nocache";
174 default: return "???";
175 }
176 }
177
178 BufferSpace *space;
179 uint16_t state; ///< STATE_*
180 uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl
181 uint32_t flags; ///< FLAG_*
182 uint64_t seq;
183 uint32_t offset, length;
184 bufferlist data;
185
186 boost::intrusive::list_member_hook<> lru_item;
187 boost::intrusive::list_member_hook<> state_item;
188
189 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
190 unsigned f = 0)
191 : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
192 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
193 unsigned f = 0)
194 : space(space), state(s), flags(f), seq(q), offset(o),
195 length(b.length()), data(b) {}
196
197 bool is_empty() const {
198 return state == STATE_EMPTY;
199 }
200 bool is_clean() const {
201 return state == STATE_CLEAN;
202 }
203 bool is_writing() const {
204 return state == STATE_WRITING;
205 }
206
207 uint32_t end() const {
208 return offset + length;
209 }
210
211 void truncate(uint32_t newlen) {
212 assert(newlen < length);
213 if (data.length()) {
214 bufferlist t;
215 t.substr_of(data, 0, newlen);
216 data.claim(t);
217 }
218 length = newlen;
219 }
220 void maybe_rebuild() {
221 if (data.length() &&
222 (data.get_num_buffers() > 1 ||
223 data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
224 data.rebuild();
225 }
226 }
227
228 void dump(Formatter *f) const {
229 f->dump_string("state", get_state_name(state));
230 f->dump_unsigned("seq", seq);
231 f->dump_unsigned("offset", offset);
232 f->dump_unsigned("length", length);
233 f->dump_unsigned("data_length", data.length());
234 }
235 };
236
237 struct Cache;
238
239 /// map logical extent range (object) onto buffers
240 struct BufferSpace {
241 typedef boost::intrusive::list<
242 Buffer,
243 boost::intrusive::member_hook<
244 Buffer,
245 boost::intrusive::list_member_hook<>,
246 &Buffer::state_item> > state_list_t;
247
 248 mempool::bluestore_cache_other::map<uint32_t, std::unique_ptr<Buffer>>
249 buffer_map;
250
251 // we use a bare intrusive list here instead of std::map because
252 // it uses less memory and we expect this to be very small (very
253 // few IOs in flight to the same Blob at the same time).
254 state_list_t writing; ///< writing buffers, sorted by seq, ascending
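  // Hedged usage sketch (illustrative, not upstream code): write() below parks
  // a new STATE_WRITING buffer on this list keyed by the caller's seq, and
  // finish_write() retires it once the data is stable, e.g.:
  //
  //   bc.write(cache, txc_seq, offset, bl, 0);  // buffer joins `writing`
  //   // ... aio completes and the data becomes stable ...
  //   bc.finish_write(cache, txc_seq);          // buffer becomes CLEAN, or is
  //                                             // dropped if FLAG_NOCACHE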
255
256 ~BufferSpace() {
257 assert(buffer_map.empty());
258 assert(writing.empty());
259 }
260
261 void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) {
262 cache->_audit("_add_buffer start");
263 buffer_map[b->offset].reset(b);
264 if (b->is_writing()) {
 265 b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
266 if (writing.empty() || writing.rbegin()->seq <= b->seq) {
267 writing.push_back(*b);
268 } else {
269 auto it = writing.begin();
270 while (it->seq < b->seq) {
271 ++it;
272 }
273
274 assert(it->seq >= b->seq);
275 // note that this will insert b before it
276 // hence the order is maintained
277 writing.insert(it, *b);
278 }
 279 } else {
 280 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
281 cache->_add_buffer(b, level, near);
282 }
283 cache->_audit("_add_buffer end");
284 }
285 void _rm_buffer(Cache* cache, Buffer *b) {
286 _rm_buffer(cache, buffer_map.find(b->offset));
287 }
288 void _rm_buffer(Cache* cache,
289 map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
290 assert(p != buffer_map.end());
291 cache->_audit("_rm_buffer start");
292 if (p->second->is_writing()) {
293 writing.erase(writing.iterator_to(*p->second));
294 } else {
295 cache->_rm_buffer(p->second.get());
296 }
297 buffer_map.erase(p);
298 cache->_audit("_rm_buffer end");
299 }
300
301 map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
302 uint32_t offset) {
303 auto i = buffer_map.lower_bound(offset);
304 if (i != buffer_map.begin()) {
305 --i;
306 if (i->first + i->second->length <= offset)
307 ++i;
308 }
309 return i;
310 }
311
312 // must be called under protection of the Cache lock
313 void _clear(Cache* cache);
314
315 // return value is the highest cache_private of a trimmed buffer, or 0.
316 int discard(Cache* cache, uint32_t offset, uint32_t length) {
317 std::lock_guard<std::recursive_mutex> l(cache->lock);
318 return _discard(cache, offset, length);
319 }
320 int _discard(Cache* cache, uint32_t offset, uint32_t length);
321
322 void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
323 unsigned flags) {
324 std::lock_guard<std::recursive_mutex> l(cache->lock);
325 Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
326 flags);
327 b->cache_private = _discard(cache, offset, bl.length());
328 _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
329 }
330 void finish_write(Cache* cache, uint64_t seq);
331 void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
332 std::lock_guard<std::recursive_mutex> l(cache->lock);
333 Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
334 b->cache_private = _discard(cache, offset, bl.length());
335 _add_buffer(cache, b, 1, nullptr);
336 }
337
338 void read(Cache* cache, uint32_t offset, uint32_t length,
339 BlueStore::ready_regions_t& res,
340 interval_set<uint32_t>& res_intervals);
341
342 void truncate(Cache* cache, uint32_t offset) {
343 discard(cache, offset, (uint32_t)-1 - offset);
344 }
345
346 void split(Cache* cache, size_t pos, BufferSpace &r);
347
348 void dump(Cache* cache, Formatter *f) const {
349 std::lock_guard<std::recursive_mutex> l(cache->lock);
350 f->open_array_section("buffers");
351 for (auto& i : buffer_map) {
352 f->open_object_section("buffer");
353 assert(i.first == i.second->offset);
354 i.second->dump(f);
355 f->close_section();
356 }
357 f->close_section();
358 }
359 };
360
361 struct SharedBlobSet;
362
363 /// in-memory shared blob state (incl cached buffers)
364 struct SharedBlob {
365 MEMPOOL_CLASS_HELPERS();
366
367 std::atomic_int nref = {0}; ///< reference count
368 bool loaded = false;
369
370 CollectionRef coll;
371 union {
372 uint64_t sbid_unloaded; ///< sbid if persistent isn't loaded
373 bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any
374 };
375 BufferSpace bc; ///< buffer cache
376
377 SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
378 if (get_cache()) {
379 get_cache()->add_blob();
380 }
381 }
382 SharedBlob(uint64_t i, Collection *_coll);
383 ~SharedBlob();
384
385 uint64_t get_sbid() const {
386 return loaded ? persistent->sbid : sbid_unloaded;
387 }
388
389 friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
390 friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }
391
392 friend ostream& operator<<(ostream& out, const SharedBlob& sb);
393
394 void get() {
395 ++nref;
396 }
397 void put();
398
399 /// get logical references
400 void get_ref(uint64_t offset, uint32_t length);
401
402 /// put logical references, and get back any released extents
403 void put_ref(uint64_t offset, uint32_t length,
 404 PExtentVector *r, set<SharedBlob*> *maybe_unshared_blobs);
405
406 friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
407 return l.get_sbid() == r.get_sbid();
408 }
409 inline Cache* get_cache() {
410 return coll ? coll->cache : nullptr;
411 }
412 inline SharedBlobSet* get_parent() {
413 return coll ? &(coll->shared_blob_set) : nullptr;
414 }
415 inline bool is_loaded() const {
416 return loaded;
417 }
418
419 };
420 typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
421
422 /// a lookup table of SharedBlobs
423 struct SharedBlobSet {
424 std::mutex lock; ///< protect lookup, insertion, removal
425
426 // we use a bare pointer because we don't want to affect the ref
427 // count
 428 mempool::bluestore_cache_other::unordered_map<uint64_t,SharedBlob*> sb_map;
429
430 SharedBlobRef lookup(uint64_t sbid) {
431 std::lock_guard<std::mutex> l(lock);
432 auto p = sb_map.find(sbid);
433 if (p == sb_map.end()) {
434 return nullptr;
435 }
436 return p->second;
437 }
438
439 void add(Collection* coll, SharedBlob *sb) {
440 std::lock_guard<std::mutex> l(lock);
441 sb_map[sb->get_sbid()] = sb;
442 sb->coll = coll;
443 }
444
445 bool remove(SharedBlob *sb) {
446 std::lock_guard<std::mutex> l(lock);
447 if (sb->nref == 0) {
448 assert(sb->get_parent() == this);
449 sb_map.erase(sb->get_sbid());
450 return true;
451 }
452 return false;
453 }
454
455 bool empty() {
456 std::lock_guard<std::mutex> l(lock);
457 return sb_map.empty();
458 }
459 };
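  // Hedged usage sketch (roughly what Collection::open_shared_blob() in
  // BlueStore.cc does; the authoritative logic lives there):
  //
  //   SharedBlobRef sb = shared_blob_set.lookup(sbid);
  //   if (!sb) {
  //     sb = new SharedBlob(sbid, this);      // unloaded placeholder
  //     shared_blob_set.add(this, sb.get());  // hash it for future lookups
  //   }
  //   b->shared_blob = sb;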
460
461//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/
462
463 /// in-memory blob metadata and associated cached buffers (if any)
464 struct Blob {
465 MEMPOOL_CLASS_HELPERS();
466
467 std::atomic_int nref = {0}; ///< reference count
468 int16_t id = -1; ///< id, for spanning blobs only, >= 0
469 int16_t last_encoded_id = -1; ///< (ephemeral) used during encoding only
470 SharedBlobRef shared_blob; ///< shared blob state (if any)
471
472 private:
473 mutable bluestore_blob_t blob; ///< decoded blob metadata
474#ifdef CACHE_BLOB_BL
475 mutable bufferlist blob_bl; ///< cached encoded blob, blob is dirty if empty
476#endif
477 /// refs from this shard. ephemeral if id<0, persisted if spanning.
478 bluestore_blob_use_tracker_t used_in_blob;
479
480 public:
481
482 friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
483 friend void intrusive_ptr_release(Blob *b) { b->put(); }
484
485 friend ostream& operator<<(ostream& out, const Blob &b);
486
487 const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
488 return used_in_blob;
489 }
490 bool is_referenced() const {
491 return used_in_blob.is_not_empty();
492 }
493 uint32_t get_referenced_bytes() const {
494 return used_in_blob.get_referenced_bytes();
495 }
496
497 bool is_spanning() const {
498 return id >= 0;
499 }
500
501 bool can_split() const {
502 std::lock_guard<std::recursive_mutex> l(shared_blob->get_cache()->lock);
503 // splitting a BufferSpace writing list is too hard; don't try.
504 return shared_blob->bc.writing.empty() &&
505 used_in_blob.can_split() &&
506 get_blob().can_split();
507 }
508
509 bool can_split_at(uint32_t blob_offset) const {
510 return used_in_blob.can_split_at(blob_offset) &&
511 get_blob().can_split_at(blob_offset);
512 }
513
 514 bool can_reuse_blob(uint32_t min_alloc_size,
515 uint32_t target_blob_size,
516 uint32_t b_offset,
517 uint32_t *length0);
518
519 void dup(Blob& o) {
520 o.shared_blob = shared_blob;
521 o.blob = blob;
522#ifdef CACHE_BLOB_BL
523 o.blob_bl = blob_bl;
524#endif
525 }
526
 527 inline const bluestore_blob_t& get_blob() const {
528 return blob;
529 }
 530 inline bluestore_blob_t& dirty_blob() {
531#ifdef CACHE_BLOB_BL
532 blob_bl.clear();
533#endif
534 return blob;
535 }
536
537 /// discard buffers for unallocated regions
538 void discard_unallocated(Collection *coll);
539
540 /// get logical references
541 void get_ref(Collection *coll, uint32_t offset, uint32_t length);
542 /// put logical references, and get back any released extents
543 bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
544 PExtentVector *r);
545
546 /// split the blob
547 void split(Collection *coll, uint32_t blob_offset, Blob *o);
548
549 void get() {
550 ++nref;
551 }
552 void put() {
553 if (--nref == 0)
554 delete this;
555 }
556
557
558#ifdef CACHE_BLOB_BL
559 void _encode() const {
560 if (blob_bl.length() == 0 ) {
561 ::encode(blob, blob_bl);
562 } else {
563 assert(blob_bl.length());
564 }
565 }
566 void bound_encode(
567 size_t& p,
568 bool include_ref_map) const {
569 _encode();
570 p += blob_bl.length();
571 if (include_ref_map) {
572 used_in_blob.bound_encode(p);
573 }
574 }
575 void encode(
576 bufferlist::contiguous_appender& p,
577 bool include_ref_map) const {
578 _encode();
579 p.append(blob_bl);
580 if (include_ref_map) {
581 used_in_blob.encode(p);
582 }
583 }
584 void decode(
585 Collection */*coll*/,
586 bufferptr::iterator& p,
587 bool include_ref_map) {
588 const char *start = p.get_pos();
589 denc(blob, p);
590 const char *end = p.get_pos();
591 blob_bl.clear();
592 blob_bl.append(start, end - start);
593 if (include_ref_map) {
594 used_in_blob.decode(p);
595 }
596 }
597#else
598 void bound_encode(
599 size_t& p,
600 uint64_t struct_v,
601 uint64_t sbid,
602 bool include_ref_map) const {
603 denc(blob, p, struct_v);
604 if (blob.is_shared()) {
605 denc(sbid, p);
606 }
607 if (include_ref_map) {
608 used_in_blob.bound_encode(p);
609 }
610 }
611 void encode(
612 bufferlist::contiguous_appender& p,
613 uint64_t struct_v,
614 uint64_t sbid,
615 bool include_ref_map) const {
616 denc(blob, p, struct_v);
617 if (blob.is_shared()) {
618 denc(sbid, p);
619 }
620 if (include_ref_map) {
621 used_in_blob.encode(p);
622 }
623 }
624 void decode(
625 Collection *coll,
626 bufferptr::iterator& p,
627 uint64_t struct_v,
628 uint64_t* sbid,
629 bool include_ref_map);
630#endif
631 };
632 typedef boost::intrusive_ptr<Blob> BlobRef;
 633 typedef mempool::bluestore_cache_other::map<int,BlobRef> blob_map_t;
634
635 /// a logical extent, pointing to (some portion of) a blob
636 typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings
637 struct Extent : public ExtentBase {
638 MEMPOOL_CLASS_HELPERS();
639
640 uint32_t logical_offset = 0; ///< logical offset
641 uint32_t blob_offset = 0; ///< blob offset
642 uint32_t length = 0; ///< length
643 BlobRef blob; ///< the blob with our data
644
645 /// ctor for lookup only
646 explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
647 /// ctor for delayed initialization (see decode_some())
648 explicit Extent() : ExtentBase() {
649 }
650 /// ctor for general usage
651 Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
652 : ExtentBase(),
653 logical_offset(lo), blob_offset(o), length(l) {
654 assign_blob(b);
655 }
656 ~Extent() {
657 if (blob) {
658 blob->shared_blob->get_cache()->rm_extent();
659 }
660 }
661
662 void assign_blob(const BlobRef& b) {
663 assert(!blob);
664 blob = b;
665 blob->shared_blob->get_cache()->add_extent();
666 }
667
668 // comparators for intrusive_set
669 friend bool operator<(const Extent &a, const Extent &b) {
670 return a.logical_offset < b.logical_offset;
671 }
672 friend bool operator>(const Extent &a, const Extent &b) {
673 return a.logical_offset > b.logical_offset;
674 }
675 friend bool operator==(const Extent &a, const Extent &b) {
676 return a.logical_offset == b.logical_offset;
677 }
678
679 uint32_t blob_start() const {
680 return logical_offset - blob_offset;
681 }
682
683 uint32_t blob_end() const {
684 return blob_start() + blob->get_blob().get_logical_length();
685 }
686
687 uint32_t logical_end() const {
688 return logical_offset + length;
689 }
690
691 // return true if any piece of the blob is out of
692 // the given range [o, o + l].
693 bool blob_escapes_range(uint32_t o, uint32_t l) const {
694 return blob_start() < o || blob_end() > o + l;
695 }
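    // Worked example (illustrative): an Extent with logical_offset=0x3000,
    // blob_offset=0x1000, length=0x1000 over a blob of logical length 0x4000
    // has blob_start() == 0x2000 and blob_end() == 0x6000, so
    // blob_escapes_range(0x2000, 0x3000) is true because 0x6000 > 0x5000.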
696 };
697 typedef boost::intrusive::set<Extent> extent_map_t;
698
699
700 friend ostream& operator<<(ostream& out, const Extent& e);
701
702 struct OldExtent {
703 boost::intrusive::list_member_hook<> old_extent_item;
704 Extent e;
705 PExtentVector r;
706 bool blob_empty; // flag to track the last removed extent that makes blob
707 // empty - required to update compression stat properly
708 OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
709 : e(lo, o, l, b), blob_empty(false) {
710 }
711 static OldExtent* create(CollectionRef c,
712 uint32_t lo,
713 uint32_t o,
714 uint32_t l,
715 BlobRef& b);
716 };
717 typedef boost::intrusive::list<
718 OldExtent,
719 boost::intrusive::member_hook<
720 OldExtent,
721 boost::intrusive::list_member_hook<>,
722 &OldExtent::old_extent_item> > old_extent_map_t;
723
724 struct Onode;
725
726 /// a sharded extent map, mapping offsets to lextents to blobs
727 struct ExtentMap {
728 Onode *onode;
729 extent_map_t extent_map; ///< map of Extents to Blobs
730 blob_map_t spanning_blob_map; ///< blobs that span shards
731
732 struct Shard {
733 bluestore_onode_t::shard_info *shard_info = nullptr;
734 unsigned extents = 0; ///< count extents in this shard
735 bool loaded = false; ///< true if shard is loaded
736 bool dirty = false; ///< true if shard is dirty and needs reencoding
737 };
 738 mempool::bluestore_cache_other::vector<Shard> shards; ///< shards
739
740 bufferlist inline_bl; ///< cached encoded map, if unsharded; empty=>dirty
741
742 uint32_t needs_reshard_begin = 0;
743 uint32_t needs_reshard_end = 0;
744
745 bool needs_reshard() const {
746 return needs_reshard_end > needs_reshard_begin;
747 }
748 void clear_needs_reshard() {
749 needs_reshard_begin = needs_reshard_end = 0;
750 }
751 void request_reshard(uint32_t begin, uint32_t end) {
752 if (begin < needs_reshard_begin) {
753 needs_reshard_begin = begin;
754 }
755 if (end > needs_reshard_end) {
756 needs_reshard_end = end;
757 }
758 }
759
760 struct DeleteDisposer {
761 void operator()(Extent *e) { delete e; }
762 };
763
764 ExtentMap(Onode *o);
765 ~ExtentMap() {
766 extent_map.clear_and_dispose(DeleteDisposer());
767 }
768
769 void clear() {
770 extent_map.clear_and_dispose(DeleteDisposer());
771 shards.clear();
772 inline_bl.clear();
773 clear_needs_reshard();
774 }
775
776 bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
777 unsigned *pn);
778 unsigned decode_some(bufferlist& bl);
779
780 void bound_encode_spanning_blobs(size_t& p);
781 void encode_spanning_blobs(bufferlist::contiguous_appender& p);
782 void decode_spanning_blobs(bufferptr::iterator& p);
783
784 BlobRef get_spanning_blob(int id) {
785 auto p = spanning_blob_map.find(id);
786 assert(p != spanning_blob_map.end());
787 return p->second;
788 }
789
790 void update(KeyValueDB::Transaction t, bool force);
 791 decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
792 void reshard(
793 KeyValueDB *db,
794 KeyValueDB::Transaction t);
795
796 /// initialize Shards from the onode
797 void init_shards(bool loaded, bool dirty);
798
799 /// return index of shard containing offset
800 /// or -1 if not found
801 int seek_shard(uint32_t offset) {
802 size_t end = shards.size();
803 size_t mid, left = 0;
 804 size_t right = end; // one past the right end
805
806 while (left < right) {
807 mid = left + (right - left) / 2;
808 if (offset >= shards[mid].shard_info->offset) {
809 size_t next = mid + 1;
810 if (next >= end || offset < shards[next].shard_info->offset)
811 return mid;
812 //continue to search forwards
813 left = next;
814 } else {
815 //continue to search backwards
816 right = mid;
817 }
818 }
819
820 return -1; // not found
821 }
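    // Example (illustrative): with shard_info offsets {0x0, 0x10000, 0x40000},
    // seek_shard(0xffff) == 0, seek_shard(0x12000) == 1, and
    // seek_shard(0x40000) == 2; -1 is returned only when offset precedes the
    // first shard's offset.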
822
823 /// check if a range spans a shard
824 bool spans_shard(uint32_t offset, uint32_t length) {
825 if (shards.empty()) {
826 return false;
827 }
828 int s = seek_shard(offset);
829 assert(s >= 0);
830 if (s == (int)shards.size() - 1) {
831 return false; // last shard
832 }
833 if (offset + length <= shards[s+1].shard_info->offset) {
834 return false;
835 }
836 return true;
837 }
838
839 /// ensure that a range of the map is loaded
840 void fault_range(KeyValueDB *db,
841 uint32_t offset, uint32_t length);
842
843 /// ensure a range of the map is marked dirty
 844 void dirty_range(uint32_t offset, uint32_t length);
 845
 846 /// for seek_lextent test
847 extent_map_t::iterator find(uint64_t offset);
848
849 /// seek to the first lextent including or after offset
850 extent_map_t::iterator seek_lextent(uint64_t offset);
851 extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
852
853 /// add a new Extent
854 void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
855 extent_map.insert(*new Extent(lo, o, l, b));
856 }
857
858 /// remove (and delete) an Extent
859 void rm(extent_map_t::iterator p) {
860 extent_map.erase_and_dispose(p, DeleteDisposer());
861 }
862
863 bool has_any_lextents(uint64_t offset, uint64_t length);
864
865 /// consolidate adjacent lextents in extent_map
866 int compress_extent_map(uint64_t offset, uint64_t length);
867
868 /// punch a logical hole. add lextents to deref to target list.
869 void punch_hole(CollectionRef &c,
870 uint64_t offset, uint64_t length,
871 old_extent_map_t *old_extents);
872
873 /// put new lextent into lextent_map overwriting existing ones if
874 /// any and update references accordingly
875 Extent *set_lextent(CollectionRef &c,
876 uint64_t logical_offset,
877 uint64_t offset, uint64_t length,
878 BlobRef b,
879 old_extent_map_t *old_extents);
880
881 /// split a blob (and referring extents)
882 BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
883 };
884
885 /// Compressed Blob Garbage collector
886 /*
 887 The primary idea of the collector is to estimate the difference between the
 888 allocation units (AUs) currently occupied by compressed blobs and the new AUs
 889 required to store that data uncompressed.
 890 The estimation is performed for protrusive extents within a logical range
 891 determined by the concatenation of the old_extents collection and the
 892 specific (current) write request.
 893 The reason for using old_extents is the need to handle blob ref counts
 894 properly: old extents still hold blob refs, so we need to traverse the
 895 collection to determine whether a blob is to be released.
 896 Protrusive extents are extents that fall into the blob set in action
 897 (i.e. within the logical range above) but are not totally removed
 898 by the current write.
899 E.g. for
900 extent1 <loffs = 100, boffs = 100, len = 100> ->
901 blob1<compressed, len_on_disk=4096, logical_len=8192>
902 extent2 <loffs = 200, boffs = 200, len = 100> ->
903 blob2<raw, len_on_disk=4096, llen=4096>
904 extent3 <loffs = 300, boffs = 300, len = 100> ->
905 blob1<compressed, len_on_disk=4096, llen=8192>
906 extent4 <loffs = 4096, boffs = 0, len = 100> ->
907 blob3<raw, len_on_disk=4096, llen=4096>
908 write(300~100)
909 protrusive extents are within the following ranges <0~300, 400~8192-400>
 910 In this case the existing AUs that might be released by GC (i.e. blob1's)
 911 occupy 2x4K bytes,
 912 while the number of new AUs expected after GC is 0, since extent1 is to be
 913 merged into blob2. Hence we should collect.
914 */
915 class GarbageCollector
916 {
917 public:
918 /// return amount of allocation units that might be saved due to GC
919 int64_t estimate(
920 uint64_t offset,
921 uint64_t length,
922 const ExtentMap& extent_map,
923 const old_extent_map_t& old_extents,
924 uint64_t min_alloc_size);
925
926 /// return a collection of extents to perform GC on
927 const vector<AllocExtent>& get_extents_to_collect() const {
928 return extents_to_collect;
929 }
930 GarbageCollector(CephContext* _cct) : cct(_cct) {}
931
932 private:
933 struct BlobInfo {
934 uint64_t referenced_bytes = 0; ///< amount of bytes referenced in blob
935 int64_t expected_allocations = 0; ///< new alloc units required
936 ///< in case of gc fulfilled
937 bool collect_candidate = false; ///< indicate if blob has any extents
938 ///< eligible for GC.
939 extent_map_t::const_iterator first_lextent; ///< points to the first
940 ///< lextent referring to
941 ///< the blob if any.
942 ///< collect_candidate flag
943 ///< determines the validity
944 extent_map_t::const_iterator last_lextent; ///< points to the last
945 ///< lextent referring to
946 ///< the blob if any.
947
948 BlobInfo(uint64_t ref_bytes) :
949 referenced_bytes(ref_bytes) {
950 }
951 };
952 CephContext* cct;
953 map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map
954 ///< copies that are affected by the
955 ///< specific write
956
957 vector<AllocExtent> extents_to_collect; ///< protrusive extents that should
958 ///< be collected if GC takes place
959
960 boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
961 ///< unit when traversing
962 ///< protrusive extents.
963 ///< Other extents mapped to
964 ///< this AU to be ignored
965 ///< (except the case where
966 ///< uncompressed extent follows
967 ///< compressed one - see below).
968 BlobInfo* blob_info_counted = nullptr; ///< set if previous allocation unit
969 ///< caused expected_allocations
970 ///< counter increment at this blob.
971 ///< if uncompressed extent follows
972 ///< a decrement for the
973 ///< expected_allocations counter
974 ///< is needed
975 int64_t expected_allocations = 0; ///< new alloc units required in case
976 ///< of gc fulfilled
977 int64_t expected_for_release = 0; ///< alloc units currently used by
978 ///< compressed blobs that might
979 ///< gone after GC
 980 uint64_t gc_start_offset; ///< starting offset for GC
 981 uint64_t gc_end_offset; ///< ending offset for GC
982
983 protected:
984 void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
985 uint64_t start_offset,
986 uint64_t end_offset,
987 uint64_t start_touch_offset,
988 uint64_t end_touch_offset,
989 uint64_t min_alloc_size);
990 };
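  // Hedged usage sketch (illustrative only; the real caller sits in the write
  // path in BlueStore.cc):
  //
  //   GarbageCollector gc(cct);
  //   if (gc.estimate(offset, length, o->extent_map, old_extents,
  //                   min_alloc_size) > 0) {
  //     // positive net saving in allocation units -> rewrite these ranges
  //     for (const AllocExtent& e : gc.get_extents_to_collect()) {
  //       // re-read e.offset~e.length and rewrite it uncompressed
  //     }
  //   }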
991
992 struct OnodeSpace;
993
994 /// an in-memory object
995 struct Onode {
996 MEMPOOL_CLASS_HELPERS();
997
998 std::atomic_int nref; ///< reference count
999 Collection *c;
1000
1001 ghobject_t oid;
1002
1003 /// key under PREFIX_OBJ where we are stored
 1004 mempool::bluestore_cache_other::string key;
1005
1006 boost::intrusive::list_member_hook<> lru_item;
1007
1008 bluestore_onode_t onode; ///< metadata stored as value in kv store
1009 bool exists; ///< true if object logically exists
1010
1011 ExtentMap extent_map;
1012
1013 // track txc's that have not been committed to kv store (and whose
1014 // effects cannot be read via the kvdb read methods)
1015 std::atomic<int> flushing_count = {0};
 1016 std::mutex flush_lock; ///< protect flush_cond
1017 std::condition_variable flush_cond; ///< wait here for uncommitted txns
1018
1019 Onode(Collection *c, const ghobject_t& o,
 1020 const mempool::bluestore_cache_other::string& k)
1021 : nref(0),
1022 c(c),
1023 oid(o),
1024 key(k),
1025 exists(false),
1026 extent_map(this) {
1027 }
1028
1029 void flush();
1030 void get() {
1031 ++nref;
1032 }
1033 void put() {
1034 if (--nref == 0)
1035 delete this;
1036 }
1037 };
1038 typedef boost::intrusive_ptr<Onode> OnodeRef;
1039
1040
1041 /// a cache (shard) of onodes and buffers
1042 struct Cache {
1043 CephContext* cct;
1044 PerfCounters *logger;
1045 std::recursive_mutex lock; ///< protect lru and other structures
1046
1047 std::atomic<uint64_t> num_extents = {0};
1048 std::atomic<uint64_t> num_blobs = {0};
1049
1050 static Cache *create(CephContext* cct, string type, PerfCounters *logger);
1051
1052 Cache(CephContext* cct) : cct(cct), logger(nullptr) {}
1053 virtual ~Cache() {}
1054
1055 virtual void _add_onode(OnodeRef& o, int level) = 0;
1056 virtual void _rm_onode(OnodeRef& o) = 0;
1057 virtual void _touch_onode(OnodeRef& o) = 0;
1058
1059 virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0;
1060 virtual void _rm_buffer(Buffer *b) = 0;
1061 virtual void _move_buffer(Cache *src, Buffer *b) = 0;
1062 virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
1063 virtual void _touch_buffer(Buffer *b) = 0;
1064
1065 virtual uint64_t _get_num_onodes() = 0;
1066 virtual uint64_t _get_buffer_bytes() = 0;
1067
1068 void add_extent() {
1069 ++num_extents;
1070 }
1071 void rm_extent() {
1072 --num_extents;
1073 }
1074
1075 void add_blob() {
1076 ++num_blobs;
1077 }
1078 void rm_blob() {
1079 --num_blobs;
1080 }
1081
1082 void trim(uint64_t target_bytes,
1083 float target_meta_ratio,
1084 float target_data_ratio,
1085 float bytes_per_onode);
1086
1087 void trim_all();
1088
1089 virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;
1090
1091 virtual void add_stats(uint64_t *onodes, uint64_t *extents,
1092 uint64_t *blobs,
1093 uint64_t *buffers,
1094 uint64_t *bytes) = 0;
1095
1096 bool empty() {
1097 std::lock_guard<std::recursive_mutex> l(lock);
1098 return _get_num_onodes() == 0 && _get_buffer_bytes() == 0;
1099 }
1100
1101#ifdef DEBUG_CACHE
1102 virtual void _audit(const char *s) = 0;
1103#else
1104 void _audit(const char *s) { /* no-op */ }
1105#endif
1106 };
1107
1108 /// simple LRU cache for onodes and buffers
1109 struct LRUCache : public Cache {
1110 private:
1111 typedef boost::intrusive::list<
1112 Onode,
1113 boost::intrusive::member_hook<
1114 Onode,
1115 boost::intrusive::list_member_hook<>,
1116 &Onode::lru_item> > onode_lru_list_t;
1117 typedef boost::intrusive::list<
1118 Buffer,
1119 boost::intrusive::member_hook<
1120 Buffer,
1121 boost::intrusive::list_member_hook<>,
1122 &Buffer::lru_item> > buffer_lru_list_t;
1123
1124 onode_lru_list_t onode_lru;
1125
1126 buffer_lru_list_t buffer_lru;
1127 uint64_t buffer_size = 0;
1128
1129 public:
1130 LRUCache(CephContext* cct) : Cache(cct) {}
1131 uint64_t _get_num_onodes() override {
1132 return onode_lru.size();
1133 }
1134 void _add_onode(OnodeRef& o, int level) override {
1135 if (level > 0)
1136 onode_lru.push_front(*o);
1137 else
1138 onode_lru.push_back(*o);
1139 }
1140 void _rm_onode(OnodeRef& o) override {
1141 auto q = onode_lru.iterator_to(*o);
1142 onode_lru.erase(q);
1143 }
1144 void _touch_onode(OnodeRef& o) override;
1145
1146 uint64_t _get_buffer_bytes() override {
1147 return buffer_size;
1148 }
1149 void _add_buffer(Buffer *b, int level, Buffer *near) override {
1150 if (near) {
1151 auto q = buffer_lru.iterator_to(*near);
1152 buffer_lru.insert(q, *b);
1153 } else if (level > 0) {
1154 buffer_lru.push_front(*b);
1155 } else {
1156 buffer_lru.push_back(*b);
1157 }
1158 buffer_size += b->length;
1159 }
1160 void _rm_buffer(Buffer *b) override {
1161 assert(buffer_size >= b->length);
1162 buffer_size -= b->length;
1163 auto q = buffer_lru.iterator_to(*b);
1164 buffer_lru.erase(q);
1165 }
1166 void _move_buffer(Cache *src, Buffer *b) override {
1167 src->_rm_buffer(b);
1168 _add_buffer(b, 0, nullptr);
1169 }
1170 void _adjust_buffer_size(Buffer *b, int64_t delta) override {
1171 assert((int64_t)buffer_size + delta >= 0);
1172 buffer_size += delta;
1173 }
1174 void _touch_buffer(Buffer *b) override {
1175 auto p = buffer_lru.iterator_to(*b);
1176 buffer_lru.erase(p);
1177 buffer_lru.push_front(*b);
1178 _audit("_touch_buffer end");
1179 }
1180
1181 void _trim(uint64_t onode_max, uint64_t buffer_max) override;
1182
1183 void add_stats(uint64_t *onodes, uint64_t *extents,
1184 uint64_t *blobs,
1185 uint64_t *buffers,
1186 uint64_t *bytes) override {
1187 std::lock_guard<std::recursive_mutex> l(lock);
1188 *onodes += onode_lru.size();
1189 *extents += num_extents;
1190 *blobs += num_blobs;
1191 *buffers += buffer_lru.size();
1192 *bytes += buffer_size;
1193 }
1194
1195#ifdef DEBUG_CACHE
1196 void _audit(const char *s) override;
1197#endif
1198 };
1199
1200 // 2Q cache for buffers, LRU for onodes
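  // Rough idea (hedged summary; see _add_buffer()/_trim() in BlueStore.cc for
  // the authoritative logic): new buffers land in buffer_warm_in ("A1in");
  // when trimmed, their data is dropped but an empty entry may linger in
  // buffer_warm_out ("A1out"); a buffer seen again while tracked in warm_out
  // is promoted to buffer_hot ("Am").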
1201 struct TwoQCache : public Cache {
1202 private:
1203 // stick with LRU for onodes for now (fixme?)
1204 typedef boost::intrusive::list<
1205 Onode,
1206 boost::intrusive::member_hook<
1207 Onode,
1208 boost::intrusive::list_member_hook<>,
1209 &Onode::lru_item> > onode_lru_list_t;
1210 typedef boost::intrusive::list<
1211 Buffer,
1212 boost::intrusive::member_hook<
1213 Buffer,
1214 boost::intrusive::list_member_hook<>,
1215 &Buffer::lru_item> > buffer_list_t;
1216
1217 onode_lru_list_t onode_lru;
1218
1219 buffer_list_t buffer_hot; ///< "Am" hot buffers
1220 buffer_list_t buffer_warm_in; ///< "A1in" newly warm buffers
1221 buffer_list_t buffer_warm_out; ///< "A1out" empty buffers we've evicted
1222 uint64_t buffer_bytes = 0; ///< bytes
1223
1224 enum {
1225 BUFFER_NEW = 0,
1226 BUFFER_WARM_IN, ///< in buffer_warm_in
1227 BUFFER_WARM_OUT, ///< in buffer_warm_out
1228 BUFFER_HOT, ///< in buffer_hot
1229 BUFFER_TYPE_MAX
1230 };
1231
1232 uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
1233
1234 public:
1235 TwoQCache(CephContext* cct) : Cache(cct) {}
1236 uint64_t _get_num_onodes() override {
1237 return onode_lru.size();
1238 }
1239 void _add_onode(OnodeRef& o, int level) override {
1240 if (level > 0)
1241 onode_lru.push_front(*o);
1242 else
1243 onode_lru.push_back(*o);
1244 }
1245 void _rm_onode(OnodeRef& o) override {
1246 auto q = onode_lru.iterator_to(*o);
1247 onode_lru.erase(q);
1248 }
1249 void _touch_onode(OnodeRef& o) override;
1250
1251 uint64_t _get_buffer_bytes() override {
1252 return buffer_bytes;
1253 }
1254 void _add_buffer(Buffer *b, int level, Buffer *near) override;
1255 void _rm_buffer(Buffer *b) override;
1256 void _move_buffer(Cache *src, Buffer *b) override;
1257 void _adjust_buffer_size(Buffer *b, int64_t delta) override;
1258 void _touch_buffer(Buffer *b) override {
1259 switch (b->cache_private) {
1260 case BUFFER_WARM_IN:
1261 // do nothing (somewhat counter-intuitively!)
1262 break;
1263 case BUFFER_WARM_OUT:
1264 // move from warm_out to hot LRU
1265 assert(0 == "this happens via discard hint");
1266 break;
1267 case BUFFER_HOT:
1268 // move to front of hot LRU
1269 buffer_hot.erase(buffer_hot.iterator_to(*b));
1270 buffer_hot.push_front(*b);
1271 break;
1272 }
1273 _audit("_touch_buffer end");
1274 }
1275
1276 void _trim(uint64_t onode_max, uint64_t buffer_max) override;
1277
1278 void add_stats(uint64_t *onodes, uint64_t *extents,
1279 uint64_t *blobs,
1280 uint64_t *buffers,
1281 uint64_t *bytes) override {
1282 std::lock_guard<std::recursive_mutex> l(lock);
1283 *onodes += onode_lru.size();
1284 *extents += num_extents;
1285 *blobs += num_blobs;
1286 *buffers += buffer_hot.size() + buffer_warm_in.size();
1287 *bytes += buffer_bytes;
1288 }
1289
1290#ifdef DEBUG_CACHE
1291 void _audit(const char *s) override;
1292#endif
1293 };
1294
1295 struct OnodeSpace {
1296 private:
1297 Cache *cache;
1298
1299 /// forward lookups
 1300 mempool::bluestore_cache_other::unordered_map<ghobject_t,OnodeRef> onode_map;
1301
1302 friend class Collection; // for split_cache()
1303
1304 public:
1305 OnodeSpace(Cache *c) : cache(c) {}
1306 ~OnodeSpace() {
1307 clear();
1308 }
1309
1310 OnodeRef add(const ghobject_t& oid, OnodeRef o);
1311 OnodeRef lookup(const ghobject_t& o);
1312 void remove(const ghobject_t& oid) {
1313 onode_map.erase(oid);
1314 }
1315 void rename(OnodeRef& o, const ghobject_t& old_oid,
1316 const ghobject_t& new_oid,
 1317 const mempool::bluestore_cache_other::string& new_okey);
1318 void clear();
1319 bool empty();
1320
1321 /// return true if f true for any item
1322 bool map_any(std::function<bool(OnodeRef)> f);
1323 };
1324
1325 struct Collection : public CollectionImpl {
1326 BlueStore *store;
1327 Cache *cache; ///< our cache shard
1328 coll_t cid;
1329 bluestore_cnode_t cnode;
1330 RWLock lock;
1331
1332 bool exists;
1333
1334 SharedBlobSet shared_blob_set; ///< open SharedBlobs
1335
1336 // cache onodes on a per-collection basis to avoid lock
1337 // contention.
1338 OnodeSpace onode_map;
1339
1340 //pool options
1341 pool_opts_t pool_opts;
1342
1343 OnodeRef get_onode(const ghobject_t& oid, bool create);
1344
1345 // the terminology is confusing here, sorry!
1346 //
1347 // blob_t shared_blob_t
1348 // !shared unused -> open
1349 // shared !loaded -> open + shared
1350 // shared loaded -> open + shared + loaded
1351 //
1352 // i.e.,
1353 // open = SharedBlob is instantiated
1354 // shared = blob_t shared flag is set; SharedBlob is hashed.
1355 // loaded = SharedBlob::shared_blob_t is loaded from kv store
1356 void open_shared_blob(uint64_t sbid, BlobRef b);
1357 void load_shared_blob(SharedBlobRef sb);
1358 void make_blob_shared(uint64_t sbid, BlobRef b);
 1359 uint64_t make_blob_unshared(SharedBlob *sb);
1360
1361 BlobRef new_blob() {
1362 BlobRef b = new Blob();
1363 b->shared_blob = new SharedBlob(this);
1364 return b;
1365 }
1366
1367 const coll_t &get_cid() override {
1368 return cid;
1369 }
1370
1371 bool contains(const ghobject_t& oid) {
1372 if (cid.is_meta())
1373 return oid.hobj.pool == -1;
1374 spg_t spgid;
1375 if (cid.is_pg(&spgid))
1376 return
1377 spgid.pgid.contains(cnode.bits, oid) &&
1378 oid.shard_id == spgid.shard;
1379 return false;
1380 }
1381
1382 void split_cache(Collection *dest);
1383
1384 Collection(BlueStore *ns, Cache *ca, coll_t c);
1385 };
1386
1387 class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
1388 CollectionRef c;
1389 OnodeRef o;
1390 KeyValueDB::Iterator it;
1391 string head, tail;
1392 public:
1393 OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
1394 int seek_to_first() override;
1395 int upper_bound(const string &after) override;
1396 int lower_bound(const string &to) override;
1397 bool valid() override;
1398 int next(bool validate=true) override;
1399 string key() override;
1400 bufferlist value() override;
1401 int status() override {
1402 return 0;
1403 }
1404 };
1405
1406 class OpSequencer;
1407 typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
1408
1409 struct volatile_statfs{
1410 enum {
1411 STATFS_ALLOCATED = 0,
1412 STATFS_STORED,
1413 STATFS_COMPRESSED_ORIGINAL,
1414 STATFS_COMPRESSED,
1415 STATFS_COMPRESSED_ALLOCATED,
1416 STATFS_LAST
1417 };
1418 int64_t values[STATFS_LAST];
1419 volatile_statfs() {
1420 memset(this, 0, sizeof(volatile_statfs));
1421 }
1422 void reset() {
1423 *this = volatile_statfs();
1424 }
1425 volatile_statfs& operator+=(const volatile_statfs& other) {
1426 for (size_t i = 0; i < STATFS_LAST; ++i) {
1427 values[i] += other.values[i];
1428 }
1429 return *this;
1430 }
1431 int64_t& allocated() {
1432 return values[STATFS_ALLOCATED];
1433 }
1434 int64_t& stored() {
1435 return values[STATFS_STORED];
1436 }
1437 int64_t& compressed_original() {
1438 return values[STATFS_COMPRESSED_ORIGINAL];
1439 }
1440 int64_t& compressed() {
1441 return values[STATFS_COMPRESSED];
1442 }
1443 int64_t& compressed_allocated() {
1444 return values[STATFS_COMPRESSED_ALLOCATED];
1445 }
1446 bool is_empty() {
1447 return values[STATFS_ALLOCATED] == 0 &&
1448 values[STATFS_STORED] == 0 &&
1449 values[STATFS_COMPRESSED] == 0 &&
1450 values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
1451 values[STATFS_COMPRESSED_ALLOCATED] == 0;
1452 }
1453 void decode(bufferlist::iterator& it) {
1454 for (size_t i = 0; i < STATFS_LAST; i++) {
1455 ::decode(values[i], it);
1456 }
1457 }
1458
1459 void encode(bufferlist& bl) {
1460 for (size_t i = 0; i < STATFS_LAST; i++) {
1461 ::encode(values[i], bl);
1462 }
1463 }
1464 };
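  // Hedged example (illustrative): each TransContext accumulates its effect on
  // the store-wide statistics in its statfs_delta, which is later folded into
  // vstatfs under vstatfs_lock, e.g.:
  //
  //   txc->statfs_delta.allocated()  += newly_allocated_bytes;
  //   txc->statfs_delta.stored()     += logical_bytes_written;
  //   txc->statfs_delta.compressed() += compressed_payload_bytes;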
1465
 1466 struct TransContext : public AioContext {
1467 MEMPOOL_CLASS_HELPERS();
1468
1469 typedef enum {
1470 STATE_PREPARE,
1471 STATE_AIO_WAIT,
1472 STATE_IO_DONE,
1473 STATE_KV_QUEUED, // queued for kv_sync_thread submission
1474 STATE_KV_SUBMITTED, // submitted to kv; not yet synced
1475 STATE_KV_DONE,
1476 STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running)
1477 STATE_DEFERRED_CLEANUP, // remove deferred kv record
1478 STATE_DEFERRED_DONE,
1479 STATE_FINISHING,
1480 STATE_DONE,
1481 } state_t;
1482
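    // Typical lifecycle (hedged sketch; see _txc_state_proc() in BlueStore.cc):
    //
    //   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
    //           -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP]  // deferred writes only
    //           -> FINISHING -> DONE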
1483 state_t state = STATE_PREPARE;
1484
1485 const char *get_state_name() {
1486 switch (state) {
1487 case STATE_PREPARE: return "prepare";
1488 case STATE_AIO_WAIT: return "aio_wait";
1489 case STATE_IO_DONE: return "io_done";
1490 case STATE_KV_QUEUED: return "kv_queued";
1491 case STATE_KV_SUBMITTED: return "kv_submitted";
1492 case STATE_KV_DONE: return "kv_done";
1493 case STATE_DEFERRED_QUEUED: return "deferred_queued";
1494 case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
1495 case STATE_DEFERRED_DONE: return "deferred_done";
1496 case STATE_FINISHING: return "finishing";
1497 case STATE_DONE: return "done";
1498 }
1499 return "???";
1500 }
1501
1502#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1503 const char *get_state_latency_name(int state) {
1504 switch (state) {
1505 case l_bluestore_state_prepare_lat: return "prepare";
1506 case l_bluestore_state_aio_wait_lat: return "aio_wait";
1507 case l_bluestore_state_io_done_lat: return "io_done";
1508 case l_bluestore_state_kv_queued_lat: return "kv_queued";
1509 case l_bluestore_state_kv_committing_lat: return "kv_committing";
1510 case l_bluestore_state_kv_done_lat: return "kv_done";
1511 case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
1512 case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
1513 case l_bluestore_state_finishing_lat: return "finishing";
1514 case l_bluestore_state_done_lat: return "done";
1515 }
1516 return "???";
1517 }
1518#endif
1519
1520 void log_state_latency(PerfCounters *logger, int state) {
1521 utime_t lat, now = ceph_clock_now();
1522 lat = now - last_stamp;
1523 logger->tinc(state, lat);
1524#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1525 if (state >= l_bluestore_state_prepare_lat && state <= l_bluestore_state_done_lat) {
1526 double usecs = (now.to_nsec()-last_stamp.to_nsec())/1000;
1527 OID_ELAPSED("", usecs, get_state_latency_name(state));
1528 }
1529#endif
1530 last_stamp = now;
1531 }
1532
1533 OpSequencerRef osr;
1534 boost::intrusive::list_member_hook<> sequencer_item;
1535
1536 uint64_t bytes = 0, cost = 0;
1537
1538 set<OnodeRef> onodes; ///< these need to be updated/written
1539 set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
1540 set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
1541 set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
1542
1543 KeyValueDB::Transaction t; ///< then we will commit this
1544 Context *oncommit = nullptr; ///< signal on commit
1545 Context *onreadable = nullptr; ///< signal on readable
1546 Context *onreadable_sync = nullptr; ///< signal on readable
1547 list<Context*> oncommits; ///< more commit completions
1548 list<CollectionRef> removed_collections; ///< colls we removed
1549
1550 boost::intrusive::list_member_hook<> deferred_queue_item;
1551 bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any
1552
1553 interval_set<uint64_t> allocated, released;
 1554 volatile_statfs statfs_delta;
1555
1556 IOContext ioc;
1557 bool had_ios = false; ///< true if we submitted IOs before our kv txn
1558
1559 uint64_t seq = 0;
1560 utime_t start;
1561 utime_t last_stamp;
1562
1563 uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated
1564 uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated
1565
1566 explicit TransContext(CephContext* cct, OpSequencer *o)
1567 : osr(o),
1568 ioc(cct, this),
1569 start(ceph_clock_now()) {
1570 last_stamp = start;
1571 }
1572 ~TransContext() {
1573 delete deferred_txn;
1574 }
1575
1576 void write_onode(OnodeRef &o) {
1577 onodes.insert(o);
1578 }
1579 void write_shared_blob(SharedBlobRef &sb) {
1580 shared_blobs.insert(sb);
1581 }
1582 void unshare_blob(SharedBlob *sb) {
1583 shared_blobs.erase(sb);
1584 }
1585
1586 /// note we logically modified object (when onode itself is unmodified)
1587 void note_modified_object(OnodeRef &o) {
1588 // onode itself isn't written, though
1589 modified_objects.insert(o);
1590 }
1591 void removed(OnodeRef& o) {
1592 onodes.erase(o);
1593 modified_objects.erase(o);
1594 }
1595
1596 void aio_finish(BlueStore *store) override {
1597 store->txc_aio_finish(this);
1598 }
1599 };
1600
1601 typedef boost::intrusive::list<
1602 TransContext,
1603 boost::intrusive::member_hook<
1604 TransContext,
1605 boost::intrusive::list_member_hook<>,
1606 &TransContext::deferred_queue_item> > deferred_queue_t;
1607
1608 struct DeferredBatch : public AioContext {
1609 OpSequencer *osr;
1610 struct deferred_io {
1611 bufferlist bl; ///< data
1612 uint64_t seq; ///< deferred transaction seq
1613 };
1614 map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
1615 deferred_queue_t txcs; ///< txcs in this batch
1616 IOContext ioc; ///< our aios
1617 /// bytes of pending io for each deferred seq (may be 0)
1618 map<uint64_t,int> seq_bytes;
1619
1620 void _discard(CephContext *cct, uint64_t offset, uint64_t length);
1621 void _audit(CephContext *cct);
1622
1623 DeferredBatch(CephContext *cct, OpSequencer *osr)
1624 : osr(osr), ioc(cct, this) {}
1625
1626 /// prepare a write
1627 void prepare_write(CephContext *cct,
1628 uint64_t seq, uint64_t offset, uint64_t length,
1629 bufferlist::const_iterator& p);
1630
1631 void aio_finish(BlueStore *store) override {
1632 store->_deferred_aio_finish(osr);
1633 }
1634 };
1635
1636 class OpSequencer : public Sequencer_impl {
1637 public:
1638 std::mutex qlock;
1639 std::condition_variable qcond;
1640 typedef boost::intrusive::list<
1641 TransContext,
1642 boost::intrusive::member_hook<
1643 TransContext,
1644 boost::intrusive::list_member_hook<>,
1645 &TransContext::sequencer_item> > q_list_t;
1646 q_list_t q; ///< transactions
1647
1648 boost::intrusive::list_member_hook<> deferred_osr_queue_item;
1649
1650 DeferredBatch *deferred_running = nullptr;
1651 DeferredBatch *deferred_pending = nullptr;
1652
1653 Sequencer *parent;
1654 BlueStore *store;
1655
1656 uint64_t last_seq = 0;
1657
1658 std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io
1659
1660 std::atomic_int kv_committing_serially = {0};
1661
1662 std::atomic_int kv_submitted_waiters = {0};
1663
1664 std::atomic_bool registered = {true}; ///< registered in BlueStore's osr_set
1665 std::atomic_bool zombie = {false}; ///< owning Sequencer has gone away
1666
1667 OpSequencer(CephContext* cct, BlueStore *store)
1668 : Sequencer_impl(cct),
1669 parent(NULL), store(store) {
1670 store->register_osr(this);
1671 }
1672 ~OpSequencer() override {
1673 assert(q.empty());
1674 _unregister();
1675 }
1676
1677 void discard() override {
1678 // Note that we may have txc's in flight when the parent Sequencer
1679 // goes away. Reflect this with zombie==registered==true and let
1680 // _osr_drain_all clean up later.
1681 assert(!zombie);
1682 zombie = true;
1683 parent = nullptr;
1684 bool empty;
1685 {
1686 std::lock_guard<std::mutex> l(qlock);
1687 empty = q.empty();
1688 }
1689 if (empty) {
1690 _unregister();
1691 }
1692 }
1693
1694 void _unregister() {
1695 if (registered) {
1696 store->unregister_osr(this);
1697 registered = false;
1698 }
1699 }
1700
1701 void queue_new(TransContext *txc) {
1702 std::lock_guard<std::mutex> l(qlock);
1703 txc->seq = ++last_seq;
1704 q.push_back(*txc);
1705 }
1706
1707 void drain() {
1708 std::unique_lock<std::mutex> l(qlock);
1709 while (!q.empty())
1710 qcond.wait(l);
1711 }
1712
1713 void drain_preceding(TransContext *txc) {
1714 std::unique_lock<std::mutex> l(qlock);
1715 while (!q.empty() && &q.front() != txc)
1716 qcond.wait(l);
1717 }
1718
1719 bool _is_all_kv_submitted() {
1720 // caller must hold qlock
1721 if (q.empty()) {
1722 return true;
1723 }
1724 TransContext *txc = &q.back();
1725 if (txc->state >= TransContext::STATE_KV_SUBMITTED) {
1726 return true;
1727 }
1728 return false;
1729 }
1730
1731 void flush() override {
1732 std::unique_lock<std::mutex> l(qlock);
1733 while (true) {
1734 // set flag before the check because the condition
1735 // may become true outside qlock, and we need to make
1736 // sure those threads see waiters and signal qcond.
1737 ++kv_submitted_waiters;
1738 if (_is_all_kv_submitted()) {
1739 return;
1740 }
1741 qcond.wait(l);
1742 --kv_submitted_waiters;
1743 }
1744 }
1745
1746 bool flush_commit(Context *c) override {
1747 std::lock_guard<std::mutex> l(qlock);
1748 if (q.empty()) {
1749 return true;
1750 }
1751 TransContext *txc = &q.back();
1752 if (txc->state >= TransContext::STATE_KV_DONE) {
1753 return true;
1754 }
1755 txc->oncommits.push_back(c);
1756 return false;
1757 }
1758 };
1759
1760 typedef boost::intrusive::list<
1761 OpSequencer,
1762 boost::intrusive::member_hook<
1763 OpSequencer,
1764 boost::intrusive::list_member_hook<>,
1765 &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;
1766
1767 struct KVSyncThread : public Thread {
1768 BlueStore *store;
1769 explicit KVSyncThread(BlueStore *s) : store(s) {}
1770 void *entry() override {
1771 store->_kv_sync_thread();
1772 return NULL;
1773 }
1774 };
1775 struct KVFinalizeThread : public Thread {
1776 BlueStore *store;
1777 explicit KVFinalizeThread(BlueStore *s) : store(s) {}
1778 void *entry() {
1779 store->_kv_finalize_thread();
1780 return NULL;
1781 }
1782 };
1783
1784 struct DBHistogram {
1785 struct value_dist {
1786 uint64_t count;
1787 uint32_t max_len;
1788 };
1789
1790 struct key_dist {
1791 uint64_t count;
1792 uint32_t max_len;
1793 map<int, struct value_dist> val_map; ///< slab id to count, max length of value and key
1794 };
1795
1796 map<string, map<int, struct key_dist> > key_hist;
1797 map<int, uint64_t> value_hist;
1798 int get_key_slab(size_t sz);
1799 string get_key_slab_to_range(int slab);
1800 int get_value_slab(size_t sz);
1801 string get_value_slab_to_range(int slab);
1802 void update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
1803 const string &prefix, size_t key_size, size_t value_size);
1804 void dump(Formatter *f);
1805 };
1806
1807 // --------------------------------------------------------
1808 // members
1809private:
1810 BlueFS *bluefs = nullptr;
1811 unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
1812 bool bluefs_single_shared_device = true;
1813 utime_t bluefs_last_balance;
1814
1815 KeyValueDB *db = nullptr;
1816 BlockDevice *bdev = nullptr;
1817 std::string freelist_type;
1818 FreelistManager *fm = nullptr;
1819 Allocator *alloc = nullptr;
1820 uuid_d fsid;
1821 int path_fd = -1; ///< open handle to $path
1822 int fsid_fd = -1; ///< open handle (locked) to $path/fsid
1823 bool mounted = false;
1824
1825 RWLock coll_lock = {"BlueStore::coll_lock"}; ///< rwlock to protect coll_map
 1826 mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
1827
1828 vector<Cache*> cache_shards;
1829
 1830 std::mutex osr_lock; ///< protect osr_set
1831 std::set<OpSequencerRef> osr_set; ///< set of all OpSequencers
1832
1833 std::atomic<uint64_t> nid_last = {0};
1834 std::atomic<uint64_t> nid_max = {0};
1835 std::atomic<uint64_t> blobid_last = {0};
1836 std::atomic<uint64_t> blobid_max = {0};
1837
1838 Throttle throttle_bytes; ///< submit to commit
1839 Throttle throttle_deferred_bytes; ///< submit to deferred complete
1840
1841 interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs
1842 interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
1843
 1844 std::mutex deferred_lock, deferred_submit_lock;
1845 std::atomic<uint64_t> deferred_seq = {0};
1846 deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
1847 int deferred_queue_size = 0; ///< num txc's queued across all osrs
1848 atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
1849
1850 int m_finisher_num = 1;
1851 vector<Finisher*> finishers;
1852
1853 KVSyncThread kv_sync_thread;
1854 std::mutex kv_lock;
1855 std::condition_variable kv_cond;
 1856 bool kv_sync_started = false;
 1857 bool kv_stop = false;
1858 bool kv_finalize_started = false;
1859 bool kv_finalize_stop = false;
1860 deque<TransContext*> kv_queue; ///< ready, already submitted
1861 deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
1862 deque<TransContext*> kv_committing; ///< currently syncing
1863 deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
1864 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
1865
1866 KVFinalizeThread kv_finalize_thread;
1867 std::mutex kv_finalize_lock;
1868 std::condition_variable kv_finalize_cond;
1869 deque<TransContext*> kv_committing_to_finalize; ///< pending finalization
1870 deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
1871
1872 PerfCounters *logger = nullptr;
1873
1874 std::mutex reap_lock;
1875 list<CollectionRef> removed_collections;
1876
1877 RWLock debug_read_error_lock = {"BlueStore::debug_read_error_lock"};
1878 set<ghobject_t> debug_data_error_objects;
1879 set<ghobject_t> debug_mdata_error_objects;
1880
1881 std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
1882
1883 uint64_t block_size = 0; ///< block size of block device (power of 2)
1884 uint64_t block_mask = 0; ///< mask to get just the block offset
1885 size_t block_size_order = 0; ///< bits to shift to get block size
1886
1887 uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2)
1888 ///< bits for min_alloc_size
1889 uint8_t min_alloc_size_order = 0;
1890 static_assert(std::numeric_limits<uint8_t>::max() >
1891 std::numeric_limits<decltype(min_alloc_size)>::digits,
1892 "not enough bits for min_alloc_size");
1893
1894 ///< maximum allocation unit (power of 2)
1895 std::atomic<uint64_t> max_alloc_size = {0};
1896
1897 ///< number threshold for forced deferred writes
1898 std::atomic<int> deferred_batch_ops = {0};
1899
1900 ///< size threshold for forced deferred writes
1901 std::atomic<uint64_t> prefer_deferred_size = {0};
1902
1903 ///< approx cost per io, in bytes
1904 std::atomic<uint64_t> throttle_cost_per_io = {0};
1905
1906 std::atomic<Compressor::CompressionMode> comp_mode =
1907 {Compressor::COMP_NONE}; ///< compression mode
1908 CompressorRef compressor;
1909 std::atomic<uint64_t> comp_min_blob_size = {0};
1910 std::atomic<uint64_t> comp_max_blob_size = {0};
1911
1912 std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
1913
1914 uint64_t kv_ios = 0;
1915 uint64_t kv_throttle_costs = 0;
1916
1917 // cache trim control
1918 uint64_t cache_size = 0; ///< total cache size
1919 float cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
1920 float cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
1921 float cache_data_ratio = 0; ///< cache ratio dedicated to object data
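  // The ratios partition cache_size: meta and kv are configured explicitly
  // and data gets (roughly) the remainder, e.g. a 1 GiB cache with meta=0.5
  // and kv=0.25 leaves about 256 MiB for object data.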
1922
1923 std::mutex vstatfs_lock;
1924 volatile_statfs vstatfs;
1925
1926 struct MempoolThread : public Thread {
1927 BlueStore *store;
1928 Cond cond;
1929 Mutex lock;
1930 bool stop = false;
1931 public:
1932 explicit MempoolThread(BlueStore *s)
1933 : store(s),
1934 lock("BlueStore::MempoolThread::lock") {}
1935 void *entry() override;
1936 void init() {
1937 assert(stop == false);
1938 create("bstore_mempool");
1939 }
1940 void shutdown() {
1941 lock.Lock();
1942 stop = true;
1943 cond.Signal();
1944 lock.Unlock();
1945 join();
1946 }
1947 } mempool_thread;
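  // mempool_thread wakes up periodically and resizes/trims the in-memory
  // caches so that overall usage stays within cache_size and the ratios
  // configured above.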
1948
1949 // --------------------------------------------------------
1950 // private methods
1951
1952 void _init_logger();
1953 void _shutdown_logger();
1954 int _reload_logger();
1955
1956 int _open_path();
1957 void _close_path();
1958 int _open_fsid(bool create);
1959 int _lock_fsid();
1960 int _read_fsid(uuid_d *f);
1961 int _write_fsid();
1962 void _close_fsid();
1963 void _set_alloc_sizes();
1964 void _set_blob_size();
1965
1966 int _open_bdev(bool create);
1967 void _close_bdev();
1968 int _open_db(bool create);
1969 void _close_db();
1970 int _open_fm(bool create);
1971 void _close_fm();
1972 int _open_alloc();
1973 void _close_alloc();
1974 int _open_collections(int *errors=0);
1975 void _close_collections();
1976
1977 int _setup_block_symlink_or_file(string name, string path, uint64_t size,
1978 bool create);
1979
1980 int _write_bdev_label(string path, bluestore_bdev_label_t label);
1981public:
1982 static int _read_bdev_label(CephContext* cct, string path,
1983 bluestore_bdev_label_t *label);
1984private:
1985 int _check_or_set_bdev_label(string path, uint64_t size, string desc,
1986 bool create);
1987
1988 int _open_super_meta();
1989
1990 void _open_statfs();
1991
1992 int _reconcile_bluefs_freespace();
1993 int _balance_bluefs_freespace(PExtentVector *extents);
1994 void _commit_bluefs_freespace(const PExtentVector& extents);
1995
1996 CollectionRef _get_collection(const coll_t& cid);
1997 void _queue_reap_collection(CollectionRef& c);
1998 void _reap_collections();
1999 void _update_cache_logger();
2000
2001 void _assign_nid(TransContext *txc, OnodeRef o);
2002 uint64_t _assign_blobid(TransContext *txc);
2003
2004 void _dump_onode(OnodeRef o, int log_level=30);
2005 void _dump_extent_map(ExtentMap& em, int log_level=30);
2006 void _dump_transaction(Transaction *t, int log_level = 30);
2007
2008 TransContext *_txc_create(OpSequencer *osr);
2009 void _txc_update_store_statfs(TransContext *txc);
2010 void _txc_add_transaction(TransContext *txc, Transaction *t);
2011 void _txc_calc_cost(TransContext *txc);
2012 void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
2013 void _txc_state_proc(TransContext *txc);
2014 void _txc_aio_submit(TransContext *txc);
2015public:
2016 void txc_aio_finish(void *p) {
2017 _txc_state_proc(static_cast<TransContext*>(p));
2018 }
2019private:
2020 void _txc_finish_io(TransContext *txc);
2021 void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
2022 void _txc_applied_kv(TransContext *txc);
2023 void _txc_committed_kv(TransContext *txc);
2024 void _txc_finish(TransContext *txc);
2025 void _txc_release_alloc(TransContext *txc);
2026
2027 void _osr_drain_preceding(TransContext *txc);
2028 void _osr_drain_all();
2029 void _osr_unregister_all();
2030
2031 void _kv_start();
2032 void _kv_stop();
2033 void _kv_sync_thread();
2034 void _kv_finalize_thread();
2035
2036 bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
2037 void _deferred_queue(TransContext *txc);
2038 void deferred_try_submit();
2039 void _deferred_submit_unlock(OpSequencer *osr);
2040 void _deferred_aio_finish(OpSequencer *osr);
2041 int _deferred_replay();
2042
2043public:
2044 using mempool_dynamic_bitset =
2045 boost::dynamic_bitset<uint64_t,
2046 mempool::bluestore_fsck::pool_allocator<uint64_t>>;
2047
2048private:
2049 int _fsck_check_extents(
2050 const ghobject_t& oid,
2051 const PExtentVector& extents,
2052 bool compressed,
2053 mempool_dynamic_bitset &used_blocks,
2054 store_statfs_t& expected_statfs);
2055
2056 void _buffer_cache_write(
2057 TransContext *txc,
2058 BlobRef b,
2059 uint64_t offset,
2060 bufferlist& bl,
2061 unsigned flags) {
2062 b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
2063 flags);
2064 txc->shared_blobs_written.insert(b->shared_blob);
2065 }
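  // _buffer_cache_write() stages freshly written data in the blob's buffer
  // cache so subsequent reads can be served from memory, and remembers the
  // shared blob so those buffers can be finalized once the transaction
  // commits.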
2066
2067 int _collection_list(
2068 Collection *c, const ghobject_t& start, const ghobject_t& end,
2069 int max, vector<ghobject_t> *ls, ghobject_t *next);
2070
2071 template <typename T, typename F>
2072 T select_option(const std::string& opt_name, T val1, F f) {
2073 //NB: opt_name reserved for future use
2074 boost::optional<T> val2 = f();
2075 if (val2) {
2076 return *val2;
2077 }
2078 return val1;
2079 }
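  // select_option() usage sketch (illustrative; 'global_default' and the
  // override check are placeholders, not code from this file):
  //
  //   uint64_t v = select_option(
  //     "compression_max_blob_size", global_default,
  //     [&]() -> boost::optional<uint64_t> {
  //       if (have_pool_override)
  //         return pool_override;              // per-pool value wins
  //       return boost::optional<uint64_t>();  // empty -> fall back to val1
  //     });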
2080
2081 void _apply_padding(uint64_t head_pad,
2082 uint64_t tail_pad,
2083 bufferlist& padded);
2084
2085 // -- ondisk version ---
2086public:
2087 const int32_t latest_ondisk_format = 2; ///< our version
2088 const int32_t min_readable_ondisk_format = 1; ///< what we can read
2089 const int32_t min_compat_ondisk_format = 2; ///< who can read us
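  // i.e. this code writes format 2, can still read a store left at format 1
  // (upgrading it via _upgrade_super() below), and anything it writes
  // requires a reader that understands format 2.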
2090
2091private:
2092 int32_t ondisk_format = 0; ///< value detected on mount
2093
2094 int _upgrade_super(); ///< upgrade (called during open_super)
2095 void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
2096
2097 // --- public interface ---
2098public:
2099 BlueStore(CephContext *cct, const string& path);
2100 BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only
2101 ~BlueStore() override;
2102
2103 string get_type() override {
2104 return "bluestore";
2105 }
2106
2107 bool needs_journal() override { return false; }
2108 bool wants_journal() override { return false; }
2109 bool allows_journal() override { return false; }
2110
2111 bool is_rotational() override;
2112 bool is_journal_rotational() override;
2113
2114 string get_default_device_class() override {
2115 string device_class;
2116 map<string, string> metadata;
2117 collect_metadata(&metadata);
2118 auto it = metadata.find("bluestore_bdev_type");
2119 if (it != metadata.end()) {
2120 device_class = it->second;
2121 }
2122 return device_class;
2123 }
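  // Reports whatever collect_metadata() recorded for the main block device
  // (typically "hdd" or "ssd"), or an empty string if unknown.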
2124
2125 static int get_block_device_fsid(CephContext* cct, const string& path,
2126 uuid_d *fsid);
2127
2128 bool test_mount_in_use() override;
2129
2130private:
2131 int _mount(bool kv_only);
2132public:
2133 int mount() override {
2134 return _mount(false);
2135 }
2136 int umount() override;
2137
2138 int start_kv_only(KeyValueDB **pdb) {
2139 int r = _mount(true);
2140 if (r < 0)
2141 return r;
2142 *pdb = db;
2143 return 0;
2144 }
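  // start_kv_only() mounts only enough of the store to hand back the embedded
  // KeyValueDB; it exists for offline tooling rather than normal OSD use.
  // Sketch (hypothetical caller, error handling elided):
  //
  //   KeyValueDB *kvdb = nullptr;
  //   if (store.start_kv_only(&kvdb) == 0) {
  //     // inspect or repair kvdb contents directly here
  //   }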
2145
2146 int fsck(bool deep) override;
2147
2148 void set_cache_shards(unsigned num) override;
2149
2150 int validate_hobject_key(const hobject_t &obj) const override {
2151 return 0;
2152 }
2153 unsigned get_max_attr_name_length() override {
2154 return 256; // arbitrary; there is no real limit internally
2155 }
2156
2157 int mkfs() override;
2158 int mkjournal() override {
2159 return 0;
2160 }
2161
2162 void get_db_statistics(Formatter *f) override;
2163 void generate_db_histogram(Formatter *f) override;
2164 void _flush_cache();
2165 void flush_cache() override;
2166 void dump_perf_counters(Formatter *f) override {
2167 f->open_object_section("perf_counters");
2168 logger->dump_formatted(f, false);
2169 f->close_section();
2170 }
2171
2172 void register_osr(OpSequencer *osr) {
2173 std::lock_guard<std::mutex> l(osr_lock);
2174 osr_set.insert(osr);
2175 }
2176 void unregister_osr(OpSequencer *osr) {
2177 std::lock_guard<std::mutex> l(osr_lock);
2178 osr_set.erase(osr);
2179 }
2180
2181public:
2182 int statfs(struct store_statfs_t *buf) override;
2183
2184 void collect_metadata(map<string,string> *pm) override;
2185
2186 bool exists(const coll_t& cid, const ghobject_t& oid) override;
2187 bool exists(CollectionHandle &c, const ghobject_t& oid) override;
2188 int set_collection_opts(
2189 const coll_t& cid,
2190 const pool_opts_t& opts) override;
2191 int stat(
2192 const coll_t& cid,
2193 const ghobject_t& oid,
2194 struct stat *st,
2195 bool allow_eio = false) override;
2196 int stat(
2197 CollectionHandle &c,
2198 const ghobject_t& oid,
2199 struct stat *st,
2200 bool allow_eio = false) override;
2201 int read(
2202 const coll_t& cid,
2203 const ghobject_t& oid,
2204 uint64_t offset,
2205 size_t len,
2206 bufferlist& bl,
2207 uint32_t op_flags = 0) override;
2208 int read(
2209 CollectionHandle &c,
2210 const ghobject_t& oid,
2211 uint64_t offset,
2212 size_t len,
2213 bufferlist& bl,
2214 uint32_t op_flags = 0) override;
2215 int _do_read(
2216 Collection *c,
2217 OnodeRef o,
2218 uint64_t offset,
2219 size_t len,
2220 bufferlist& bl,
2221 uint32_t op_flags = 0);
2222
2223private:
2224 int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
2225 uint64_t offset, size_t len, interval_set<uint64_t>& destset);
2226public:
2227 int fiemap(const coll_t& cid, const ghobject_t& oid,
2228 uint64_t offset, size_t len, bufferlist& bl) override;
2229 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2230 uint64_t offset, size_t len, bufferlist& bl) override;
2231 int fiemap(const coll_t& cid, const ghobject_t& oid,
2232 uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
2233 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2234 uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
2235
2236
2237 int getattr(const coll_t& cid, const ghobject_t& oid, const char *name,
2238 bufferptr& value) override;
2239 int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
2240 bufferptr& value) override;
2241
2242 int getattrs(const coll_t& cid, const ghobject_t& oid,
2243 map<string,bufferptr>& aset) override;
2244 int getattrs(CollectionHandle &c, const ghobject_t& oid,
2245 map<string,bufferptr>& aset) override;
2246
2247 int list_collections(vector<coll_t>& ls) override;
2248
2249 CollectionHandle open_collection(const coll_t &c) override;
2250
2251 bool collection_exists(const coll_t& c) override;
2252 int collection_empty(const coll_t& c, bool *empty) override;
2253 int collection_bits(const coll_t& c) override;
2254
2255 int collection_list(const coll_t& cid,
2256 const ghobject_t& start,
2257 const ghobject_t& end,
2258 int max,
2259 vector<ghobject_t> *ls, ghobject_t *next) override;
2260 int collection_list(CollectionHandle &c,
2261 const ghobject_t& start,
2262 const ghobject_t& end,
2263 int max,
2264 vector<ghobject_t> *ls, ghobject_t *next) override;
2265
2266 int omap_get(
2267 const coll_t& cid, ///< [in] Collection containing oid
2268 const ghobject_t &oid, ///< [in] Object containing omap
2269 bufferlist *header, ///< [out] omap header
2270 map<string, bufferlist> *out ///< [out] Key to value map
2271 ) override;
2272 int omap_get(
2273 CollectionHandle &c, ///< [in] Collection containing oid
2274 const ghobject_t &oid, ///< [in] Object containing omap
2275 bufferlist *header, ///< [out] omap header
2276 map<string, bufferlist> *out ///< [out] Key to value map
2277 ) override;
2278
2279 /// Get omap header
2280 int omap_get_header(
2281 const coll_t& cid, ///< [in] Collection containing oid
2282 const ghobject_t &oid, ///< [in] Object containing omap
2283 bufferlist *header, ///< [out] omap header
2284 bool allow_eio = false ///< [in] don't assert on eio
2285 ) override;
2286 int omap_get_header(
2287 CollectionHandle &c, ///< [in] Collection containing oid
2288 const ghobject_t &oid, ///< [in] Object containing omap
2289 bufferlist *header, ///< [out] omap header
2290 bool allow_eio = false ///< [in] don't assert on eio
2291 ) override;
2292
2293 /// Get keys defined on oid
2294 int omap_get_keys(
2295 const coll_t& cid, ///< [in] Collection containing oid
2296 const ghobject_t &oid, ///< [in] Object containing omap
2297 set<string> *keys ///< [out] Keys defined on oid
2298 ) override;
2299 int omap_get_keys(
2300 CollectionHandle &c, ///< [in] Collection containing oid
2301 const ghobject_t &oid, ///< [in] Object containing omap
2302 set<string> *keys ///< [out] Keys defined on oid
2303 ) override;
2304
2305 /// Get key values
2306 int omap_get_values(
2307 const coll_t& cid, ///< [in] Collection containing oid
2308 const ghobject_t &oid, ///< [in] Object containing omap
2309 const set<string> &keys, ///< [in] Keys to get
2310 map<string, bufferlist> *out ///< [out] Returned keys and values
2311 ) override;
2312 int omap_get_values(
2313 CollectionHandle &c, ///< [in] Collection containing oid
2314 const ghobject_t &oid, ///< [in] Object containing omap
2315 const set<string> &keys, ///< [in] Keys to get
2316 map<string, bufferlist> *out ///< [out] Returned keys and values
2317 ) override;
2318
2319 /// Filters keys into out which are defined on oid
2320 int omap_check_keys(
2321 const coll_t& cid, ///< [in] Collection containing oid
2322 const ghobject_t &oid, ///< [in] Object containing omap
2323 const set<string> &keys, ///< [in] Keys to check
2324 set<string> *out ///< [out] Subset of keys defined on oid
2325 ) override;
2326 int omap_check_keys(
2327 CollectionHandle &c, ///< [in] Collection containing oid
2328 const ghobject_t &oid, ///< [in] Object containing omap
2329 const set<string> &keys, ///< [in] Keys to check
2330 set<string> *out ///< [out] Subset of keys defined on oid
2331 ) override;
2332
2333 ObjectMap::ObjectMapIterator get_omap_iterator(
2334 const coll_t& cid, ///< [in] collection
2335 const ghobject_t &oid ///< [in] object
2336 ) override;
2337 ObjectMap::ObjectMapIterator get_omap_iterator(
2338 CollectionHandle &c, ///< [in] collection
2339 const ghobject_t &oid ///< [in] object
2340 ) override;
2341
2342 void set_fsid(uuid_d u) override {
2343 fsid = u;
2344 }
2345 uuid_d get_fsid() override {
2346 return fsid;
2347 }
2348
2349 uint64_t estimate_objects_overhead(uint64_t num_objects) override {
2350 return num_objects * 300; //assuming per-object overhead is 300 bytes
2351 }
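  // e.g. estimate_objects_overhead(1000000) yields ~300,000,000 bytes
  // (roughly 300 MB) of estimated per-object overhead.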
2352
2353 struct BSPerfTracker {
2354 PerfCounters::avg_tracker<uint64_t> os_commit_latency;
2355 PerfCounters::avg_tracker<uint64_t> os_apply_latency;
2356
2357 objectstore_perf_stat_t get_cur_stats() const {
2358 objectstore_perf_stat_t ret;
2359 ret.os_commit_latency = os_commit_latency.current_avg();
2360 ret.os_apply_latency = os_apply_latency.current_avg();
2361 return ret;
2362 }
2363
2364 void update_from_perfcounters(PerfCounters &logger);
2365 } perf_tracker;
2366
2367 objectstore_perf_stat_t get_cur_stats() override {
2368 perf_tracker.update_from_perfcounters(*logger);
2369 return perf_tracker.get_cur_stats();
2370 }
2371 const PerfCounters* get_perf_counters() const override {
2372 return logger;
2373 }
2374
2375 int queue_transactions(
2376 Sequencer *osr,
2377 vector<Transaction>& tls,
2378 TrackedOpRef op = TrackedOpRef(),
2379 ThreadPool::TPHandle *handle = NULL) override;
2380
2381 // error injection
2382 void inject_data_error(const ghobject_t& o) override {
2383 RWLock::WLocker l(debug_read_error_lock);
2384 debug_data_error_objects.insert(o);
2385 }
2386 void inject_mdata_error(const ghobject_t& o) override {
2387 RWLock::WLocker l(debug_read_error_lock);
2388 debug_mdata_error_objects.insert(o);
2389 }
2390 void compact() override {
2391 assert(db);
2392 db->compact();
2393 }
2394
2395private:
2396 bool _debug_data_eio(const ghobject_t& o) {
2397 if (!cct->_conf->bluestore_debug_inject_read_err) {
2398 return false;
2399 }
2400 RWLock::RLocker l(debug_read_error_lock);
2401 return debug_data_error_objects.count(o);
2402 }
2403 bool _debug_mdata_eio(const ghobject_t& o) {
2404 if (!cct->_conf->bluestore_debug_inject_read_err) {
2405 return false;
2406 }
2407 RWLock::RLocker l(debug_read_error_lock);
2408 return debug_mdata_error_objects.count(o);
2409 }
2410 void _debug_obj_on_delete(const ghobject_t& o) {
2411 if (cct->_conf->bluestore_debug_inject_read_err) {
2412 RWLock::WLocker l(debug_read_error_lock);
2413 debug_data_error_objects.erase(o);
2414 debug_mdata_error_objects.erase(o);
2415 }
2416 }
2417
2418private:
2419
2420 // --------------------------------------------------------
2421 // read processing internal methods
2422 int _verify_csum(
2423 OnodeRef& o,
2424 const bluestore_blob_t* blob,
2425 uint64_t blob_xoffset,
2426 const bufferlist& bl,
2427 uint64_t logical_offset) const;
2428 int _decompress(bufferlist& source, bufferlist* result);
2429
2430
2431 // --------------------------------------------------------
2432 // write ops
2433
2434 struct WriteContext {
2435 bool buffered = false; ///< buffered write
2436 bool compress = false; ///< compressed write
2437 uint64_t target_blob_size = 0; ///< target (max) blob size
2438 unsigned csum_order = 0; ///< target checksum chunk order
2439
2440 old_extent_map_t old_extents; ///< must deref these blobs
2441
2442 struct write_item {
2443 uint64_t logical_offset; ///< write logical offset
2444 BlobRef b;
2445 uint64_t blob_length;
2446 uint64_t b_off;
2447 bufferlist bl;
2448 uint64_t b_off0; ///< original offset in a blob prior to padding
2449 uint64_t length0; ///< original data length prior to padding
2450
2451 bool mark_unused;
2452 bool new_blob; ///< whether new blob was created
2453
2454 write_item(
2455 uint64_t logical_offs,
2456 BlobRef b,
2457 uint64_t blob_len,
2458 uint64_t o,
2459 bufferlist& bl,
2460 uint64_t o0,
2461 uint64_t l0,
2462 bool _mark_unused,
2463 bool _new_blob)
2464 :
2465 logical_offset(logical_offs),
2466 b(b),
2467 blob_length(blob_len),
2468 b_off(o),
2469 bl(bl),
2470 b_off0(o0),
2471 length0(l0),
2472 mark_unused(_mark_unused),
2473 new_blob(_new_blob) {}
2474 };
2475 vector<write_item> writes; ///< blobs we're writing
2476
2477 /// partial clone of the context
2478 void fork(const WriteContext& other) {
2479 buffered = other.buffered;
2480 compress = other.compress;
2481 target_blob_size = other.target_blob_size;
2482 csum_order = other.csum_order;
2483 }
2484 void write(
2485 uint64_t loffs,
2486 BlobRef b,
2487 uint64_t blob_len,
2488 uint64_t o,
2489 bufferlist& bl,
2490 uint64_t o0,
2491 uint64_t len0,
2492 bool _mark_unused,
2493 bool _new_blob) {
2494 writes.emplace_back(loffs,
2495 b,
2496 blob_len,
2497 o,
2498 bl,
2499 o0,
2500 len0,
2501 _mark_unused,
2502 _new_blob);
2503 }
2504 /// Checks for writes to the same pextent within a blob
2505 bool has_conflict(
2506 BlobRef b,
2507 uint64_t loffs,
2508 uint64_t loffs_end,
2509 uint64_t min_alloc_size);
2510 };
2511
2512 void _do_write_small(
2513 TransContext *txc,
2514 CollectionRef &c,
2515 OnodeRef o,
2516 uint64_t offset, uint64_t length,
2517 bufferlist::iterator& blp,
2518 WriteContext *wctx);
2519 void _do_write_big(
2520 TransContext *txc,
2521 CollectionRef &c,
2522 OnodeRef o,
2523 uint64_t offset, uint64_t length,
2524 bufferlist::iterator& blp,
2525 WriteContext *wctx);
2526 int _do_alloc_write(
2527 TransContext *txc,
2528 CollectionRef c,
2529 OnodeRef o,
2530 WriteContext *wctx);
2531 void _wctx_finish(
2532 TransContext *txc,
2533 CollectionRef& c,
2534 OnodeRef o,
2535 WriteContext *wctx,
2536 set<SharedBlob*> *maybe_unshared_blobs=0);
2537
2538 int _do_transaction(Transaction *t,
2539 TransContext *txc,
2540 ThreadPool::TPHandle *handle);
2541
2542 int _write(TransContext *txc,
2543 CollectionRef& c,
2544 OnodeRef& o,
2545 uint64_t offset, size_t len,
2546 bufferlist& bl,
2547 uint32_t fadvise_flags);
2548 void _pad_zeros(bufferlist *bl, uint64_t *offset,
2549 uint64_t chunk_size);
2550
2551 void _choose_write_options(CollectionRef& c,
2552 OnodeRef o,
2553 uint32_t fadvise_flags,
2554 WriteContext *wctx);
2555
2556 int _do_gc(TransContext *txc,
2557 CollectionRef& c,
2558 OnodeRef o,
2559 const GarbageCollector& gc,
2560 const WriteContext& wctx,
2561 uint64_t *dirty_start,
2562 uint64_t *dirty_end);
2563
2564 int _do_write(TransContext *txc,
2565 CollectionRef &c,
2566 OnodeRef o,
2567 uint64_t offset, uint64_t length,
2568 bufferlist& bl,
2569 uint32_t fadvise_flags);
2570 void _do_write_data(TransContext *txc,
2571 CollectionRef& c,
2572 OnodeRef o,
2573 uint64_t offset,
2574 uint64_t length,
2575 bufferlist& bl,
2576 WriteContext *wctx);
2577
2578 int _touch(TransContext *txc,
2579 CollectionRef& c,
2580 OnodeRef& o);
2581 int _do_zero(TransContext *txc,
2582 CollectionRef& c,
2583 OnodeRef& o,
2584 uint64_t offset, size_t len);
2585 int _zero(TransContext *txc,
2586 CollectionRef& c,
2587 OnodeRef& o,
2588 uint64_t offset, size_t len);
2589 void _do_truncate(TransContext *txc,
2590 CollectionRef& c,
2591 OnodeRef o,
2592 uint64_t offset,
2593 set<SharedBlob*> *maybe_unshared_blobs=0);
2594 void _truncate(TransContext *txc,
2595 CollectionRef& c,
2596 OnodeRef& o,
2597 uint64_t offset);
2598 int _remove(TransContext *txc,
2599 CollectionRef& c,
2600 OnodeRef& o);
2601 int _do_remove(TransContext *txc,
2602 CollectionRef& c,
2603 OnodeRef o);
2604 int _setattr(TransContext *txc,
2605 CollectionRef& c,
2606 OnodeRef& o,
2607 const string& name,
2608 bufferptr& val);
2609 int _setattrs(TransContext *txc,
2610 CollectionRef& c,
2611 OnodeRef& o,
2612 const map<string,bufferptr>& aset);
2613 int _rmattr(TransContext *txc,
2614 CollectionRef& c,
2615 OnodeRef& o,
2616 const string& name);
2617 int _rmattrs(TransContext *txc,
2618 CollectionRef& c,
2619 OnodeRef& o);
2620 void _do_omap_clear(TransContext *txc, uint64_t id);
2621 int _omap_clear(TransContext *txc,
2622 CollectionRef& c,
2623 OnodeRef& o);
2624 int _omap_setkeys(TransContext *txc,
2625 CollectionRef& c,
2626 OnodeRef& o,
2627 bufferlist& bl);
2628 int _omap_setheader(TransContext *txc,
2629 CollectionRef& c,
2630 OnodeRef& o,
2631 bufferlist& header);
2632 int _omap_rmkeys(TransContext *txc,
2633 CollectionRef& c,
2634 OnodeRef& o,
2635 bufferlist& bl);
2636 int _omap_rmkey_range(TransContext *txc,
2637 CollectionRef& c,
2638 OnodeRef& o,
2639 const string& first, const string& last);
2640 int _set_alloc_hint(
2641 TransContext *txc,
2642 CollectionRef& c,
2643 OnodeRef& o,
2644 uint64_t expected_object_size,
2645 uint64_t expected_write_size,
2646 uint32_t flags);
2647 int _do_clone_range(TransContext *txc,
2648 CollectionRef& c,
2649 OnodeRef& oldo,
2650 OnodeRef& newo,
2651 uint64_t srcoff, uint64_t length, uint64_t dstoff);
2652 int _clone(TransContext *txc,
2653 CollectionRef& c,
2654 OnodeRef& oldo,
2655 OnodeRef& newo);
2656 int _clone_range(TransContext *txc,
2657 CollectionRef& c,
2658 OnodeRef& oldo,
2659 OnodeRef& newo,
2660 uint64_t srcoff, uint64_t length, uint64_t dstoff);
2661 int _rename(TransContext *txc,
2662 CollectionRef& c,
2663 OnodeRef& oldo,
2664 OnodeRef& newo,
2665 const ghobject_t& new_oid);
2666 int _create_collection(TransContext *txc, const coll_t &cid,
2667 unsigned bits, CollectionRef *c);
2668 int _remove_collection(TransContext *txc, const coll_t &cid,
2669 CollectionRef *c);
2670 int _split_collection(TransContext *txc,
2671 CollectionRef& c,
2672 CollectionRef& d,
2673 unsigned bits, int rem);
2674};
2675
2676inline ostream& operator<<(ostream& out, const BlueStore::OpSequencer& s) {
2677 return out << *s.parent;
2678}
2679
2680static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
2681 o->get();
2682}
2683static inline void intrusive_ptr_release(BlueStore::Onode *o) {
2684 o->put();
2685}
2686
2687static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
2688 o->get();
2689}
2690static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
2691 o->put();
2692}
2693
2694#endif