// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#ifndef CEPH_FILESTORE_H
#define CEPH_FILESTORE_H
#include "include/types.h"

#include <boost/scoped_ptr.hpp>

#include "include/unordered_map.h"

#include "include/ceph_assert.h"

#include "os/ObjectStore.h"
#include "JournalingObjectStore.h"

#include "common/Timer.h"
#include "common/WorkQueue.h"
#include "common/perf_counters.h"
#include "common/zipkin_trace.h"

#include "common/ceph_mutex.h"
#include "HashIndex.h"
#include "IndexManager.h"
#include "os/ObjectMap.h"
#include "SequencerPosition.h"

#include "WBThrottle.h"

#include "include/uuid.h"
#if defined(__linux__)
# ifndef BTRFS_SUPER_MAGIC
#define BTRFS_SUPER_MAGIC 0x9123683EUL
# endif
# ifndef XFS_SUPER_MAGIC
#define XFS_SUPER_MAGIC 0x58465342UL
# endif
# ifndef ZFS_SUPER_MAGIC
#define ZFS_SUPER_MAGIC 0x2fc12fc1UL
# endif
#endif
class FileStoreBackend;
#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
enum {
  l_filestore_first = 84000,
  l_filestore_journal_queue_ops,
  l_filestore_journal_queue_bytes,
  l_filestore_journal_ops,
  l_filestore_journal_bytes,
  l_filestore_journal_latency,
  l_filestore_journal_wr,
  l_filestore_journal_wr_bytes,
  l_filestore_journal_full,
  l_filestore_committing,
  l_filestore_commitcycle,
  l_filestore_commitcycle_interval,
  l_filestore_commitcycle_latency,
  l_filestore_op_queue_max_ops,
  l_filestore_op_queue_ops,
  l_filestore_ops,
  l_filestore_op_queue_max_bytes,
  l_filestore_op_queue_bytes,
  l_filestore_bytes,
  l_filestore_apply_latency,
  l_filestore_queue_transaction_latency_avg,
  l_filestore_sync_pause_max_lat,
  l_filestore_last,
};
struct FSSuperblock {
  CompatSet compat_features;
  std::string omap_backend;

  FSSuperblock() { }

  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<FSSuperblock*>& o);
};
WRITE_CLASS_ENCODER(FSSuperblock)
inline std::ostream& operator<<(std::ostream& out, const FSSuperblock& sb)
{
  return out << "sb(" << sb.compat_features << "): "
             << sb.omap_backend;
}
class FileStore : public JournalingObjectStore,
                  public md_config_obs_t
{
  static const uint32_t target_version = 4;
public:
  uint32_t get_target_version() {
    return target_version;
  }
  static int get_block_device_fsid(CephContext* cct, const std::string& path,
                                   uuid_d *fsid);
  struct FSPerfTracker {
    PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
    PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;

    objectstore_perf_stat_t get_cur_stats() const {
      objectstore_perf_stat_t ret;
      ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
      ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
      return ret;
    }
    void update_from_perfcounters(PerfCounters &logger);
  } perf_tracker;
  objectstore_perf_stat_t get_cur_stats() override {
    perf_tracker.update_from_perfcounters(*logger);
    return perf_tracker.get_cur_stats();
  }
  const PerfCounters* get_perf_counters() const override {
    return logger;
  }
private:
  std::string internal_name;         ///< internal name, used to name the perfcounter instance
  std::string basedir, journalpath;
  osflagbits_t generic_flags;
  std::string current_fn;
  std::string current_op_seq_fn;
  std::string omap_dir;
  uuid_d fsid;
  size_t blk_size;                   ///< fs block size

  int fsid_fd, op_fd, basedir_fd, current_fd;
  FileStoreBackend *backend;

  void create_backend(unsigned long f_type);
  std::string vdo_name;
  deque<uint64_t> snaps;
  // Indexed Collections
  IndexManager index_manager;
  int get_index(const coll_t& c, Index *index);
  int init_index(const coll_t& c);
  bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) {
    // - normal temp case: cid is pg, object is temp (pool < -1)
    // - hammer temp case: cid is pg (or already temp), object pool is -1
    return cid.is_pg() && oid.hobj.pool <= -1;
  }
  void init_temp_collections();
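  // Worked example (pool ids illustrative, per the cases above): an object
  // whose hobj.pool is -3 inside a pg collection is a "normal" temp object
  // (pool < -1), while hobj.pool == -1 matches the legacy hammer temp
  // convention; both satisfy the predicate and are routed to the pg's temp
  // collection.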
  boost::scoped_ptr<ObjectMap> object_map;
  int get_cdir(const coll_t& cid, char *s, int len);
  /// read a uuid from fd
  int read_fsid(int fd, uuid_d *uuid);
  ceph::mutex lock = ceph::make_mutex("FileStore::lock");

  ceph::condition_variable sync_cond;

  ceph::mutex sync_entry_timeo_lock =
    ceph::make_mutex("FileStore::sync_entry_timeo_lock");

  std::list<Context*> sync_waiters;
  struct SyncThread : public Thread {
    FileStore *fs;
    explicit SyncThread(FileStore *f) : fs(f) {}
    void *entry() override {
      fs->sync_entry();
      return 0;
    }
  } sync_thread;
  // -- op workqueue --
  struct Op {
    utime_t start;
    uint64_t op;
    std::vector<Transaction> tls;
    Context *onreadable, *onreadable_sync;
    uint64_t ops, bytes;
    TrackedOpRef osd_op;
    ZTracer::Trace trace;
    bool registered_apply = false;
  };
  class OpSequencer : public CollectionImpl {
    CephContext *cct;
    // to protect q, for benefit of flush (peek/dequeue also protected by lock)
    ceph::mutex qlock =
      ceph::make_mutex("FileStore::OpSequencer::qlock", false);
    std::list<Op*> q;
    std::list<uint64_t> jq;
    std::list<std::pair<uint64_t, Context*> > flush_commit_waiters;
    ceph::condition_variable cond;
    std::string osr_name_str;
    /// hash of pointers to ghobject_t's for in-flight writes
    std::unordered_multimap<uint32_t,const ghobject_t*> applying;
  public:
    // for apply mutual exclusion
    ceph::mutex apply_lock =
      ceph::make_mutex("FileStore::OpSequencer::apply_lock", false);
    int id;
    const char *osr_name;
    /// get_max_uncompleted
    bool _get_max_uncompleted(
      uint64_t *seq ///< [out] max uncompleted seq
      ) {
      ceph_assert(ceph_mutex_is_locked(qlock));
      *seq = 0;
      if (q.empty() && jq.empty())
        return true;

      if (!q.empty())
        *seq = q.back()->op;
      if (!jq.empty() && jq.back() > *seq)
        *seq = jq.back();

      return false;
    } /// @returns true if both queues are empty
    /// get_min_uncompleted
    bool _get_min_uncompleted(
      uint64_t *seq ///< [out] min uncompleted seq
      ) {
      ceph_assert(ceph_mutex_is_locked(qlock));
      *seq = 0;
      if (q.empty() && jq.empty())
        return true;

      if (!q.empty())
        *seq = q.front()->op;
      if (!jq.empty() && jq.front() < *seq)
        *seq = jq.front();

      return false;
    } /// @returns true if both queues are empty
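    // Worked example (queue contents illustrative): with q holding ops
    // {4, 9} and jq holding {7}, _get_max_uncompleted() yields *seq == 9 and
    // _get_min_uncompleted() yields *seq == 4; with both queues empty either
    // helper returns true and *seq stays 0.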
    void _wake_flush_waiters(std::list<Context*> *to_queue) {
      uint64_t seq;
      if (_get_min_uncompleted(&seq))
        seq = -1;

      for (auto i = flush_commit_waiters.begin();
           i != flush_commit_waiters.end() && i->first < seq;
           flush_commit_waiters.erase(i++)) {
        to_queue->push_back(i->second);
      }
    }
    void queue_journal(Op *o) {
      std::lock_guard l{qlock};
      jq.push_back(o->op);
      _register_apply(o);
    }
    void dequeue_journal(std::list<Context*> *to_queue) {
      std::lock_guard l{qlock};
      jq.pop_front();
      cond.notify_all();
      _wake_flush_waiters(to_queue);
    }
    void queue(Op *o) {
      std::lock_guard l{qlock};
      q.push_back(o);
      _register_apply(o);
      cond.notify_all();
      o->trace.keyval("queue depth", q.size());
    }
    void _register_apply(Op *o);
    void _unregister_apply(Op *o);
    void wait_for_apply(const ghobject_t& oid);
    Op *peek_queue() {
      std::lock_guard l{qlock};
      ceph_assert(ceph_mutex_is_locked(apply_lock));
      return q.front();
    }
    Op *dequeue(std::list<Context*> *to_queue) {
      ceph_assert(to_queue);
      ceph_assert(ceph_mutex_is_locked(apply_lock));
      std::lock_guard l{qlock};
      Op *o = q.front();
      q.pop_front();
      cond.notify_all();

      _unregister_apply(o);
      _wake_flush_waiters(to_queue);
      return o;
    }
    void flush() override {
      std::unique_lock l{qlock};

      cond.wait(l, [this] { return !cct->_conf->filestore_blackhole; });

      // get max for journal _or_ op queues
      uint64_t seq = 0;
      if (!q.empty())
        seq = q.back()->op;
      if (!jq.empty() && jq.back() > seq)
        seq = jq.back();

      if (seq) {
        // everything prior to our watermark to drain through either/both queues
        cond.wait(l, [seq, this] {
            return ((q.empty() || q.front()->op > seq) &&
                    (jq.empty() || jq.front() > seq));
          });
      }
    }
    bool flush_commit(Context *c) override {
      std::lock_guard l{qlock};
      uint64_t seq = 0;
      if (_get_max_uncompleted(&seq)) {
        return true;
      } else {
        flush_commit_waiters.push_back(std::make_pair(seq, c));
        return false;
      }
    }
  private:
    FRIEND_MAKE_REF(OpSequencer);
    OpSequencer(CephContext* cct, int i, coll_t cid)
      : CollectionImpl(cct, cid),
        cct(cct),
        osr_name_str(stringify(cid)),
        id(i),
        osr_name(osr_name_str.c_str()) {}
    ~OpSequencer() override {
      ceph_assert(q.empty());
    }
  };
  typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
  ceph::mutex coll_lock = ceph::make_mutex("FileStore::coll_lock");
  std::map<coll_t,OpSequencerRef> coll_map;
  friend std::ostream& operator<<(std::ostream& out, const OpSequencer& s);
  WBThrottle wbthrottle;
  std::atomic<int64_t> next_osr_id = { 0 };
  bool m_disable_wbthrottle;
  deque<OpSequencer*> op_queue;
  BackoffThrottle throttle_ops, throttle_bytes;
  const int m_ondisk_finisher_num;
  const int m_apply_finisher_num;
  std::vector<Finisher*> ondisk_finishers;
  std::vector<Finisher*> apply_finishers;
  ThreadPool op_tp;
  struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
    FileStore *store;
    OpWQ(FileStore *fs,
         ceph::timespan timeout,
         ceph::timespan suicide_timeout,
         ThreadPool *tp)
      : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ",
                                           timeout, suicide_timeout, tp),
        store(fs) {}

    bool _enqueue(OpSequencer *osr) override {
      store->op_queue.push_back(osr);
      return true;
    }
    void _dequeue(OpSequencer *o) override {
      ceph_abort();
    }
    bool _empty() override {
      return store->op_queue.empty();
    }
    OpSequencer *_dequeue() override {
      if (store->op_queue.empty())
        return nullptr;
      OpSequencer *osr = store->op_queue.front();
      store->op_queue.pop_front();
      return osr;
    }
    void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override {
      store->_do_op(osr, handle);
    }
    void _process_finish(OpSequencer *osr) override {
      store->_finish_op(osr);
    }
    void _clear() override {
      ceph_assert(store->op_queue.empty());
    }
  } op_wq;
  void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
  void _finish_op(OpSequencer *o);
  Op *build_op(std::vector<Transaction>& tls,
               Context *onreadable, Context *onreadable_sync,
               TrackedOpRef osd_op);
  void queue_op(OpSequencer *osr, Op *o);
  void op_queue_reserve_throttle(Op *o);
  void op_queue_release_throttle(Op *o);
  void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
  friend struct C_JournaledAhead;
  PerfCounters *logger;

  ZTracer::Endpoint trace_endpoint;
  int lfn_find(const ghobject_t& oid, const Index& index,
               IndexedPath *path = nullptr);
  int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length);
  int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf);
  int lfn_open(
    const coll_t& cid,
    const ghobject_t& oid,
    bool create,
    FDRef *outfd,
    Index *index = nullptr);
  void lfn_close(FDRef fd);
  int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid);
  int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos,
                 bool force_clear_omap=false);
public:
  FileStore(CephContext* cct, const std::string &base, const std::string &jdev,
            osflagbits_t flags = 0,
            const char *internal_name = "filestore", bool update_to=false);
  ~FileStore() override;
  std::string get_type() override {
    return "filestore";
  }

  int _sanity_check_fs();
  bool test_mount_in_use() override;
  int read_op_seq(uint64_t *seq);
  int write_op_seq(int, uint64_t seq);
  int mount() override;
  int umount() override;

  int validate_hobject_key(const hobject_t &obj) const override;
  unsigned get_max_attr_name_length() override {
    // xattr limit is 128; leave room for our prefixes (user.ceph._),
    // some margin, and cap at 100
    return 100;
  }
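  // Worked arithmetic for the cap above: a 128-byte xattr name limit minus
  // the 11-byte "user.ceph._" prefix leaves 117 bytes; returning 100 keeps
  // an extra 17 bytes of margin.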
  int mkjournal() override;
  bool wants_journal() override {
    return true;
  }
  bool allows_journal() override {
    return true;
  }
  bool needs_journal() override {
    return false;
  }

  bool is_sync_onreadable() const override {
    return false;
  }

  bool is_rotational() override;
  bool is_journal_rotational() override;
  void dump_perf_counters(ceph::Formatter *f) override {
    f->open_object_section("perf_counters");
    logger->dump_formatted(f, false);
    f->close_section();
  }
  int flush_cache(std::ostream *os = NULL) override;
  int write_version_stamp();
  int version_stamp_is_valid(uint32_t *version);
  int update_version_stamp();
  int upgrade() override;
  bool can_sort_nibblewise() override {
    return true;    // i support legacy sort order
  }
  void collect_metadata(std::map<std::string,std::string> *pm) override;
  int get_devices(std::set<std::string> *ls) override;

  int statfs(struct store_statfs_t *buf,
             osd_alert_list_t* alerts = nullptr) override;
  int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
                  bool *per_pool_omap) override;
  int _do_transactions(
    std::vector<Transaction> &tls, uint64_t op_seq,
    ThreadPool::TPHandle *handle,
    const char *osr_name);
  int do_transactions(std::vector<Transaction> &tls, uint64_t op_seq) override {
    return _do_transactions(tls, op_seq, nullptr, "replay");
  }
  void _do_transaction(
    Transaction& t, uint64_t op_seq, int trans_num,
    ThreadPool::TPHandle *handle, const char *osr_name);
  CollectionHandle open_collection(const coll_t& c) override;
  CollectionHandle create_new_collection(const coll_t& c) override;
  void set_collection_commit_queue(const coll_t& cid,
                                   ContextQueue *commit_queue) override {
  }
  int queue_transactions(CollectionHandle& ch, std::vector<Transaction>& tls,
                         TrackedOpRef op = TrackedOpRef(),
                         ThreadPool::TPHandle *handle = nullptr) override;
  /**
   * set replay guard xattr on given file
   *
   * This will ensure that we will not replay this (or any previous) operation
   * against this particular inode/object.
   *
   * @param fd open file descriptor for the file/object
   * @param spos sequencer position of the last operation we should not replay
   */
  void _set_replay_guard(int fd,
                         const SequencerPosition& spos,
                         const ghobject_t *oid=0,
                         bool in_progress=false);
  void _set_replay_guard(const coll_t& cid,
                         const SequencerPosition& spos,
                         bool in_progress);
  void _set_global_replay_guard(const coll_t& cid,
                                const SequencerPosition &spos);
  /// close a replay guard opened with in_progress=true
  void _close_replay_guard(int fd, const SequencerPosition& spos,
                           const ghobject_t *oid=0);
  void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos);
  /**
   * check replay guard xattr on given file
   *
   * Check the current position against any marker on the file that
   * indicates which operations have already been applied.  If the
   * current or a newer operation has been marked as applied, we
   * should not replay the current operation again.
   *
   * If we are not replaying the journal, we always return true.  It
   * is only on replay that we might return false, indicating that the
   * operation should not be performed (again).
   *
   * @param fd open fd on the file/object in question
   * @param spos sequencerposition for an operation we could apply/replay
   * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
   */
  int _check_replay_guard(int fd, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos);
  int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos);
  // ------------------
  // objects
  int pick_object_revision_lt(ghobject_t& oid) {
    return 0;
  }
  using ObjectStore::exists;
  bool exists(CollectionHandle& c, const ghobject_t& oid) override;
  using ObjectStore::stat;
  int stat(
    CollectionHandle& c,
    const ghobject_t& oid,
    struct stat *st,
    bool allow_eio = false) override;
  using ObjectStore::set_collection_opts;
  int set_collection_opts(
    CollectionHandle& ch,
    const pool_opts_t& opts) override;
  using ObjectStore::read;
  int read(
    CollectionHandle& c,
    const ghobject_t& oid,
    uint64_t offset,
    size_t len,
    ceph::buffer::list& bl,
    uint32_t op_flags = 0) override;
  int _do_fiemap(int fd, uint64_t offset, size_t len,
                 std::map<uint64_t, uint64_t> *m);
  int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
                         std::map<uint64_t, uint64_t> *m);
  using ObjectStore::fiemap;
  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset,
             size_t len, ceph::buffer::list& bl) override;
  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset,
             size_t len, std::map<uint64_t, uint64_t>& destmap) override;
  int _touch(const coll_t& cid, const ghobject_t& oid);
  int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
             const ceph::buffer::list& bl, uint32_t fadvise_flags = 0);
  int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
  int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
  int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
             const SequencerPosition& spos);
  int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid,
                   const coll_t& newcid, const ghobject_t& newoid,
                   uint64_t srcoff, uint64_t len, uint64_t dstoff,
                   const SequencerPosition& spos);
  int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff,
                     bool skip_sloppycrc=false);
  int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos);
  int _fgetattr(int fd, const char *name, ceph::bufferptr& bp);
  int _fgetattrs(int fd, std::map<std::string, ceph::bufferptr,std::less<>>& aset);
  int _fsetattrs(int fd, std::map<std::string, ceph::bufferptr,std::less<>>& aset);
  void do_force_sync();
  void start_sync(Context *onsafe);
  void sync();
  void _flush_op_queue();
  void flush();
  void sync_and_flush();

  int flush_journal() override;
  int dump_journal(std::ostream& out) override;
  void set_fsid(uuid_d u) override {
    fsid = u;
  }
  uuid_d get_fsid() override { return fsid; }
  uint64_t estimate_objects_overhead(uint64_t num_objects) override;
  // DEBUG read error injection; an object is removed from both sets on delete()
  ceph::mutex read_error_lock = ceph::make_mutex("FileStore::read_error_lock");
  std::set<ghobject_t> data_error_set;  // read() will return -EIO
  std::set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
  void inject_data_error(const ghobject_t &oid) override;
  void inject_mdata_error(const ghobject_t &oid) override;
  void compact() override {
    ceph_assert(object_map);
    object_map->compact();
  }
  bool has_builtin_csum() const override {
    return false;
  }
  void debug_obj_on_delete(const ghobject_t &oid);
  bool debug_data_eio(const ghobject_t &oid);
  bool debug_mdata_eio(const ghobject_t &oid);

  int snapshot(const std::string& name) override;
  // attrs
  using ObjectStore::getattr;
  using ObjectStore::getattrs;
  int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name,
              ceph::bufferptr &bp) override;
  int getattrs(CollectionHandle& c, const ghobject_t& oid,
               std::map<std::string,ceph::bufferptr,std::less<>>& aset) override;
  int _setattrs(const coll_t& cid, const ghobject_t& oid,
                std::map<std::string,ceph::bufferptr>& aset,
                const SequencerPosition &spos);
  int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
              const SequencerPosition &spos);
  int _rmattrs(const coll_t& cid, const ghobject_t& oid,
               const SequencerPosition &spos);
  int _collection_remove_recursive(const coll_t &cid,
                                   const SequencerPosition &spos);
  int _collection_set_bits(const coll_t& cid, int bits);
  // collections
  using ObjectStore::collection_list;
  int collection_bits(CollectionHandle& c) override;
  int collection_list(CollectionHandle& c,
                      const ghobject_t& start, const ghobject_t& end, int max,
                      std::vector<ghobject_t> *ls, ghobject_t *next) override {
    c->flush();
    return collection_list(c->cid, start, end, max, ls, next);
  }
  int collection_list(const coll_t& cid,
                      const ghobject_t& start, const ghobject_t& end, int max,
                      std::vector<ghobject_t> *ls, ghobject_t *next);
  int list_collections(std::vector<coll_t>& ls) override;
  int list_collections(std::vector<coll_t>& ls, bool include_temp);
  int collection_stat(const coll_t& c, struct stat *st);
  bool collection_exists(const coll_t& c) override;
  int collection_empty(CollectionHandle& c, bool *empty) override {
    c->flush();
    return collection_empty(c->cid, empty);
  }
  int collection_empty(const coll_t& cid, bool *empty);
  // omap (see ObjectStore.h for documentation)
  using ObjectStore::omap_get;
  int omap_get(CollectionHandle& c, const ghobject_t &oid, ceph::buffer::list *header,
               std::map<std::string, ceph::buffer::list> *out) override;
  using ObjectStore::omap_get_header;
  int omap_get_header(
    CollectionHandle& c,
    const ghobject_t &oid,
    ceph::buffer::list *out,
    bool allow_eio = false) override;
  using ObjectStore::omap_get_keys;
  int omap_get_keys(CollectionHandle& c, const ghobject_t &oid,
                    std::set<std::string> *keys) override;
  using ObjectStore::omap_get_values;
  int omap_get_values(CollectionHandle& c, const ghobject_t &oid,
                      const std::set<std::string> &keys,
                      std::map<std::string, ceph::buffer::list> *out) override;
  using ObjectStore::omap_check_keys;
  int omap_check_keys(CollectionHandle& c, const ghobject_t &oid,
                      const std::set<std::string> &keys,
                      std::set<std::string> *out) override;
  using ObjectStore::get_omap_iterator;
  ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c,
                                                 const ghobject_t &oid) override;
  ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid,
                                                 const ghobject_t &oid);
  int _create_collection(const coll_t& c, int bits,
                         const SequencerPosition &spos);
  int _destroy_collection(const coll_t& c);
  /**
   * Give an expected number of objects hint to the collection.
   *
   * @param c - collection id.
   * @param pg_num - pg number of the pool this collection belongs to
   * @param expected_num_objs - expected number of objects in this collection
   * @param spos - sequence position
   *
   * @return 0 on success, an error code otherwise
   */
  int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
                                         uint64_t expected_num_objs,
                                         const SequencerPosition &spos);
  int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid,
                      const SequencerPosition& spos);
  int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
                              coll_t c, const ghobject_t& o,
                              const SequencerPosition& spos,
                              bool ignore_enoent = false);
  int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
                      uint64_t expected_object_size,
                      uint64_t expected_write_size);
  void dump_start(const std::string& file);
  void dump_stop();
  void dump_transactions(std::vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);
  virtual int apply_layout_settings(const coll_t &cid, int target_level);
  void get_db_statistics(ceph::Formatter* f) override;

private:
  void _inject_failure();
  int _omap_clear(const coll_t& cid, const ghobject_t &oid,
                  const SequencerPosition &spos);
  int _omap_setkeys(const coll_t& cid, const ghobject_t &oid,
                    const std::map<std::string, ceph::buffer::list> &aset,
                    const SequencerPosition &spos);
  int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid,
                   const std::set<std::string> &keys,
                   const SequencerPosition &spos);
  int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
                       const std::string& first, const std::string& last,
                       const SequencerPosition &spos);
  int _omap_setheader(const coll_t& cid, const ghobject_t &oid,
                      const ceph::buffer::list &bl,
                      const SequencerPosition &spos);
  int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest,
                        const SequencerPosition &spos);
  int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest,
                        const SequencerPosition &spos);
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string> &changed) override;
  int set_throttle_params();
  float m_filestore_commit_timeout;
  bool m_filestore_journal_parallel;
  bool m_filestore_journal_trailing;
  bool m_filestore_journal_writeahead;
  int m_filestore_fiemap_threshold;
  double m_filestore_max_sync_interval;
  double m_filestore_min_sync_interval;
  bool m_filestore_fail_eio;
  bool m_filestore_fadvise;
  int do_update;
  bool m_journal_dio, m_journal_aio, m_journal_force_aio;
  std::string m_osd_rollback_to_cluster_snap;
  bool m_osd_use_stale_snap;
  bool m_filestore_do_dump;
  std::ofstream m_filestore_dump;
  ceph::JSONFormatter m_filestore_dump_fmt;
  std::atomic<int64_t> m_filestore_kill_at = { 0 };
  bool m_filestore_sloppy_crc;
  int m_filestore_sloppy_crc_block_size;
  uint64_t m_filestore_max_alloc_hint_size;
  unsigned long m_fs_type;
  // Determine xattr handling based on fs type
  void set_xattr_limits_via_conf();
  uint32_t m_filestore_max_inline_xattr_size;
  uint32_t m_filestore_max_inline_xattrs;
  uint32_t m_filestore_max_xattr_value_size;
  FSSuperblock superblock;
  /**
   * write_superblock()
   *
   * Write superblock to persistent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int write_superblock();

  /**
   * read_superblock()
   *
   * Fill in FileStore::superblock by reading persistent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int read_superblock();
  friend class FileStoreBackend;
  friend class TestFileStore;
};
std::ostream& operator<<(std::ostream& out, const FileStore::OpSequencer& s);
class FileStoreBackend {
private:
  FileStore *filestore;
protected:
  int get_basedir_fd() {
    return filestore->basedir_fd;
  }
  int get_current_fd() {
    return filestore->current_fd;
  }
  int get_op_fd() {
    return filestore->op_fd;
  }
  size_t get_blksize() {
    return filestore->blk_size;
  }
  const std::string& get_basedir_path() {
    return filestore->basedir;
  }
  const std::string& get_journal_path() {
    return filestore->journalpath;
  }
  const std::string& get_current_path() {
    return filestore->current_fn;
  }
  int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
    if (has_fiemap() || has_seek_data_hole()) {
      return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
    } else {
      return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
    }
  }
  int get_crc_block_size() {
    return filestore->m_filestore_sloppy_crc_block_size;
  }
public:
  explicit FileStoreBackend(FileStore *fs) : filestore(fs) {}
  virtual ~FileStoreBackend() {}
  CephContext* cct() const {
    return filestore->cct;
  }
  static FileStoreBackend *create(unsigned long f_type, FileStore *fs);
  virtual const char *get_name() = 0;
  virtual int detect_features() = 0;
  virtual int create_current() = 0;
  virtual bool can_checkpoint() = 0;
  virtual int list_checkpoints(std::list<std::string>& ls) = 0;
  virtual int create_checkpoint(const std::string& name, uint64_t *cid) = 0;
  virtual int sync_checkpoint(uint64_t id) = 0;
  virtual int rollback_to(const std::string& name) = 0;
  virtual int destroy_checkpoint(const std::string& name) = 0;
  virtual int syncfs() = 0;
  virtual bool has_fiemap() = 0;
  virtual bool has_seek_data_hole() = 0;
  virtual bool is_rotational() = 0;
  virtual bool is_journal_rotational() = 0;
  virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
  virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
  virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
  virtual bool has_splice() const = 0;
  // hooks for (sloppy) crc tracking
  virtual int _crc_update_write(int fd, loff_t off, size_t len, const ceph::buffer::list& bl) = 0;
  virtual int _crc_update_truncate(int fd, loff_t off) = 0;
  virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
  virtual int _crc_update_clone_range(int srcfd, int destfd,
                                      loff_t srcoff, size_t len, loff_t dstoff) = 0;
  virtual int _crc_verify_read(int fd, loff_t off, size_t len, const ceph::buffer::list& bl,
                               std::ostream *out) = 0;
};

#endif