1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
16 #ifndef CEPH_FILESTORE_H
17 #define CEPH_FILESTORE_H
19 #include "include/types.h"
28 #include <boost/scoped_ptr.hpp>
30 #include "include/unordered_map.h"
32 #include "include/assert.h"
34 #include "os/ObjectStore.h"
35 #include "JournalingObjectStore.h"
37 #include "common/Timer.h"
38 #include "common/WorkQueue.h"
39 #include "common/perf_counters.h"
40 #include "common/zipkin_trace.h"
42 #include "common/Mutex.h"
43 #include "HashIndex.h"
44 #include "IndexManager.h"
45 #include "os/ObjectMap.h"
46 #include "SequencerPosition.h"
48 #include "WBThrottle.h"
50 #include "include/uuid.h"
53 // from include/linux/falloc.h:
54 #ifndef FALLOC_FL_PUNCH_HOLE
55 # define FALLOC_FL_PUNCH_HOLE 0x2
58 #if defined(__linux__)
59 # ifndef BTRFS_SUPER_MAGIC
60 #define BTRFS_SUPER_MAGIC 0x9123683EL
62 # ifndef XFS_SUPER_MAGIC
63 #define XFS_SUPER_MAGIC 0x58465342L
65 # ifndef ZFS_SUPER_MAGIC
66 #define ZFS_SUPER_MAGIC 0x2fc12fc1L
71 class FileStoreBackend
;
73 #define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
76 l_filestore_first
= 84000,
77 l_filestore_journal_queue_ops
,
78 l_filestore_journal_queue_bytes
,
79 l_filestore_journal_ops
,
80 l_filestore_journal_bytes
,
81 l_filestore_journal_latency
,
82 l_filestore_journal_wr
,
83 l_filestore_journal_wr_bytes
,
84 l_filestore_journal_full
,
85 l_filestore_committing
,
86 l_filestore_commitcycle
,
87 l_filestore_commitcycle_interval
,
88 l_filestore_commitcycle_latency
,
89 l_filestore_op_queue_max_ops
,
90 l_filestore_op_queue_ops
,
92 l_filestore_op_queue_max_bytes
,
93 l_filestore_op_queue_bytes
,
95 l_filestore_apply_latency
,
96 l_filestore_queue_transaction_latency_avg
,
97 l_filestore_sync_pause_max_lat
,
103 CompatSet compat_features
;
108 void encode(bufferlist
&bl
) const;
109 void decode(bufferlist::iterator
&bl
);
110 void dump(Formatter
*f
) const;
111 static void generate_test_instances(list
<FSSuperblock
*>& o
);
113 WRITE_CLASS_ENCODER(FSSuperblock
)
115 inline ostream
& operator<<(ostream
& out
, const FSSuperblock
& sb
)
117 return out
<< "sb(" << sb
.compat_features
<< "): "
121 class FileStore
: public JournalingObjectStore
,
122 public md_config_obs_t
124 static const uint32_t target_version
= 4;
126 uint32_t get_target_version() {
127 return target_version
;
130 static int get_block_device_fsid(CephContext
* cct
, const string
& path
,
132 struct FSPerfTracker
{
133 PerfCounters::avg_tracker
<uint64_t> os_commit_latency
;
134 PerfCounters::avg_tracker
<uint64_t> os_apply_latency
;
136 objectstore_perf_stat_t
get_cur_stats() const {
137 objectstore_perf_stat_t ret
;
138 ret
.os_commit_latency
= os_commit_latency
.current_avg();
139 ret
.os_apply_latency
= os_apply_latency
.current_avg();
143 void update_from_perfcounters(PerfCounters
&logger
);
145 objectstore_perf_stat_t
get_cur_stats() override
{
146 perf_tracker
.update_from_perfcounters(*logger
);
147 return perf_tracker
.get_cur_stats();
149 const PerfCounters
* get_perf_counters() const override
{
154 string internal_name
; ///< internal name, used to name the perfcounter instance
155 string basedir
, journalpath
;
156 osflagbits_t generic_flags
;
157 std::string current_fn
;
158 std::string current_op_seq_fn
;
159 std::string omap_dir
;
162 size_t blk_size
; ///< fs block size
164 int fsid_fd
, op_fd
, basedir_fd
, current_fd
;
166 FileStoreBackend
*backend
;
168 void create_backend(long f_type
);
170 deque
<uint64_t> snaps
;
172 // Indexed Collections
173 IndexManager index_manager
;
174 int get_index(const coll_t
& c
, Index
*index
);
175 int init_index(const coll_t
& c
);
177 bool _need_temp_object_collection(const coll_t
& cid
, const ghobject_t
& oid
) {
178 // - normal temp case: cid is pg, object is temp (pool < -1)
179 // - hammer temp case: cid is pg (or already temp), object pool is -1
180 return cid
.is_pg() && oid
.hobj
.pool
<= -1;
182 void init_temp_collections();
185 boost::scoped_ptr
<ObjectMap
> object_map
;
188 int get_cdir(const coll_t
& cid
, char *s
, int len
);
190 /// read a uuid from fd
191 int read_fsid(int fd
, uuid_d
*uuid
);
201 Mutex sync_entry_timeo_lock
;
204 list
<Context
*> sync_waiters
;
207 struct SyncThread
: public Thread
{
209 explicit SyncThread(FileStore
*f
) : fs(f
) {}
210 void *entry() override
{
216 // -- op workqueue --
220 vector
<Transaction
> tls
;
221 Context
*onreadable
, *onreadable_sync
;
224 ZTracer::Trace trace
;
226 class OpSequencer
: public Sequencer_impl
{
227 Mutex qlock
; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
230 list
<pair
<uint64_t, Context
*> > flush_commit_waiters
;
234 Mutex apply_lock
; // for apply mutual exclusion
237 /// get_max_uncompleted
238 bool _get_max_uncompleted(
239 uint64_t *seq
///< [out] max uncompleted seq
241 assert(qlock
.is_locked());
244 if (q
.empty() && jq
.empty())
249 if (!jq
.empty() && jq
.back() > *seq
)
253 } /// @returns true if both queues are empty
255 /// get_min_uncompleted
256 bool _get_min_uncompleted(
257 uint64_t *seq
///< [out] min uncompleted seq
259 assert(qlock
.is_locked());
262 if (q
.empty() && jq
.empty())
266 *seq
= q
.front()->op
;
267 if (!jq
.empty() && jq
.front() < *seq
)
271 } /// @returns true if both queues are empty
273 void _wake_flush_waiters(list
<Context
*> *to_queue
) {
275 if (_get_min_uncompleted(&seq
))
278 for (list
<pair
<uint64_t, Context
*> >::iterator i
=
279 flush_commit_waiters
.begin();
280 i
!= flush_commit_waiters
.end() && i
->first
< seq
;
281 flush_commit_waiters
.erase(i
++)) {
282 to_queue
->push_back(i
->second
);
286 void queue_journal(uint64_t s
) {
287 Mutex::Locker
l(qlock
);
290 void dequeue_journal(list
<Context
*> *to_queue
) {
291 Mutex::Locker
l(qlock
);
294 _wake_flush_waiters(to_queue
);
297 Mutex::Locker
l(qlock
);
299 o
->trace
.keyval("queue depth", q
.size());
302 Mutex::Locker
l(qlock
);
303 assert(apply_lock
.is_locked());
307 Op
*dequeue(list
<Context
*> *to_queue
) {
309 assert(apply_lock
.is_locked());
310 Mutex::Locker
l(qlock
);
315 _wake_flush_waiters(to_queue
);
319 void flush() override
{
320 Mutex::Locker
l(qlock
);
322 while (cct
->_conf
->filestore_blackhole
)
323 cond
.Wait(qlock
); // wait forever
326 // get max for journal _or_ op queues
330 if (!jq
.empty() && jq
.back() > seq
)
334 // everything prior to our watermark to drain through either/both queues
335 while ((!q
.empty() && q
.front()->op
<= seq
) ||
336 (!jq
.empty() && jq
.front() <= seq
))
340 bool flush_commit(Context
*c
) override
{
341 Mutex::Locker
l(qlock
);
343 if (_get_max_uncompleted(&seq
)) {
346 flush_commit_waiters
.push_back(make_pair(seq
, c
));
351 OpSequencer(CephContext
* cct
, int i
)
352 : Sequencer_impl(cct
),
353 qlock("FileStore::OpSequencer::qlock", false, false),
355 apply_lock("FileStore::OpSequencer::apply_lock", false, false),
357 ~OpSequencer() override
{
361 const string
& get_name() const {
362 return parent
->get_name();
366 friend ostream
& operator<<(ostream
& out
, const OpSequencer
& s
);
369 WBThrottle wbthrottle
;
371 std::atomic
<int64_t> next_osr_id
= { 0 };
372 bool m_disable_wbthrottle
;
373 deque
<OpSequencer
*> op_queue
;
374 BackoffThrottle throttle_ops
, throttle_bytes
;
375 const int m_ondisk_finisher_num
;
376 const int m_apply_finisher_num
;
377 vector
<Finisher
*> ondisk_finishers
;
378 vector
<Finisher
*> apply_finishers
;
381 struct OpWQ
: public ThreadPool::WorkQueue
<OpSequencer
> {
383 OpWQ(FileStore
*fs
, time_t timeout
, time_t suicide_timeout
, ThreadPool
*tp
)
384 : ThreadPool::WorkQueue
<OpSequencer
>("FileStore::OpWQ", timeout
, suicide_timeout
, tp
), store(fs
) {}
386 bool _enqueue(OpSequencer
*osr
) override
{
387 store
->op_queue
.push_back(osr
);
390 void _dequeue(OpSequencer
*o
) override
{
393 bool _empty() override
{
394 return store
->op_queue
.empty();
396 OpSequencer
*_dequeue() override
{
397 if (store
->op_queue
.empty())
399 OpSequencer
*osr
= store
->op_queue
.front();
400 store
->op_queue
.pop_front();
403 void _process(OpSequencer
*osr
, ThreadPool::TPHandle
&handle
) override
{
404 store
->_do_op(osr
, handle
);
406 void _process_finish(OpSequencer
*osr
) override
{
407 store
->_finish_op(osr
);
409 void _clear() override
{
410 assert(store
->op_queue
.empty());
414 void _do_op(OpSequencer
*o
, ThreadPool::TPHandle
&handle
);
415 void _finish_op(OpSequencer
*o
);
416 Op
*build_op(vector
<Transaction
>& tls
,
417 Context
*onreadable
, Context
*onreadable_sync
,
418 TrackedOpRef osd_op
);
419 void queue_op(OpSequencer
*osr
, Op
*o
);
420 void op_queue_reserve_throttle(Op
*o
);
421 void op_queue_release_throttle(Op
*o
);
422 void _journaled_ahead(OpSequencer
*osr
, Op
*o
, Context
*ondisk
);
423 friend struct C_JournaledAhead
;
427 PerfCounters
*logger
;
429 ZTracer::Endpoint trace_endpoint
;
432 int lfn_find(const ghobject_t
& oid
, const Index
& index
,
433 IndexedPath
*path
= NULL
);
434 int lfn_truncate(const coll_t
& cid
, const ghobject_t
& oid
, off_t length
);
435 int lfn_stat(const coll_t
& cid
, const ghobject_t
& oid
, struct stat
*buf
);
438 const ghobject_t
& oid
,
443 void lfn_close(FDRef fd
);
444 int lfn_link(const coll_t
& c
, const coll_t
& newcid
, const ghobject_t
& o
, const ghobject_t
& newoid
) ;
445 int lfn_unlink(const coll_t
& cid
, const ghobject_t
& o
, const SequencerPosition
&spos
,
446 bool force_clear_omap
=false);
449 FileStore(CephContext
* cct
, const std::string
&base
, const std::string
&jdev
,
450 osflagbits_t flags
= 0,
451 const char *internal_name
= "filestore", bool update_to
=false);
452 ~FileStore() override
;
454 string
get_type() override
{
459 int _sanity_check_fs();
461 bool test_mount_in_use() override
;
462 int read_op_seq(uint64_t *seq
);
463 int write_op_seq(int, uint64_t seq
);
464 int mount() override
;
465 int umount() override
;
467 int validate_hobject_key(const hobject_t
&obj
) const override
;
469 unsigned get_max_attr_name_length() override
{
470 // xattr limit is 128; leave room for our prefixes (user.ceph._),
471 // some margin, and cap at 100
475 int mkjournal() override
;
476 bool wants_journal() override
{
479 bool allows_journal() override
{
482 bool needs_journal() override
{
486 bool is_rotational() override
;
487 bool is_journal_rotational() override
;
489 void dump_perf_counters(Formatter
*f
) override
{
490 f
->open_object_section("perf_counters");
491 logger
->dump_formatted(f
, false);
495 int write_version_stamp();
496 int version_stamp_is_valid(uint32_t *version
);
497 int update_version_stamp();
498 int upgrade() override
;
500 bool can_sort_nibblewise() override
{
501 return true; // i support legacy sort order
504 void collect_metadata(map
<string
,string
> *pm
) override
;
506 int statfs(struct store_statfs_t
*buf
) override
;
508 int _do_transactions(
509 vector
<Transaction
> &tls
, uint64_t op_seq
,
510 ThreadPool::TPHandle
*handle
);
511 int do_transactions(vector
<Transaction
> &tls
, uint64_t op_seq
) override
{
512 return _do_transactions(tls
, op_seq
, 0);
514 void _do_transaction(
515 Transaction
& t
, uint64_t op_seq
, int trans_num
,
516 ThreadPool::TPHandle
*handle
);
518 int queue_transactions(Sequencer
*osr
, vector
<Transaction
>& tls
,
519 TrackedOpRef op
= TrackedOpRef(),
520 ThreadPool::TPHandle
*handle
= NULL
) override
;
/**
 * set replay guard xattr on given file
 *
 * This will ensure that we will not replay this (or any previous) operation
 * against this particular inode/object.
 *
 * @param fd open file descriptor for the file/object
 * @param spos sequencer position of the last operation we should not replay
 */
531 void _set_replay_guard(int fd
,
532 const SequencerPosition
& spos
,
533 const ghobject_t
*oid
=0,
534 bool in_progress
=false);
535 void _set_replay_guard(const coll_t
& cid
,
536 const SequencerPosition
& spos
,
538 void _set_global_replay_guard(const coll_t
& cid
,
539 const SequencerPosition
&spos
);
541 /// close a replay guard opened with in_progress=true
542 void _close_replay_guard(int fd
, const SequencerPosition
& spos
,
543 const ghobject_t
*oid
=0);
544 void _close_replay_guard(const coll_t
& cid
, const SequencerPosition
& spos
);
/**
 * check replay guard xattr on given file
 *
 * Check the current position against any marker on the file that
 * indicates which operations have already been applied.  If the
 * current or a newer operation has been marked as applied, we
 * should not replay the current operation again.
 *
 * If we are not replaying the journal, we already return true.  It
 * is only on replay that we might return false, indicating that the
 * operation should not be performed (again).
 *
 * @param fd open fd on the file/object in question
 * @param spos sequencerposition for an operation we could apply/replay
 * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
 */
562 int _check_replay_guard(int fd
, const SequencerPosition
& spos
);
563 int _check_replay_guard(const coll_t
& cid
, const SequencerPosition
& spos
);
564 int _check_replay_guard(const coll_t
& cid
, const ghobject_t
&oid
, const SequencerPosition
& pos
);
565 int _check_global_replay_guard(const coll_t
& cid
, const SequencerPosition
& spos
);
567 // ------------------
569 int pick_object_revision_lt(ghobject_t
& oid
) {
572 using ObjectStore::exists
;
573 bool exists(const coll_t
& cid
, const ghobject_t
& oid
) override
;
574 using ObjectStore::stat
;
577 const ghobject_t
& oid
,
579 bool allow_eio
= false) override
;
580 using ObjectStore::set_collection_opts
;
581 int set_collection_opts(
583 const pool_opts_t
& opts
) override
;
584 using ObjectStore::read
;
587 const ghobject_t
& oid
,
591 uint32_t op_flags
= 0) override
;
592 int _do_fiemap(int fd
, uint64_t offset
, size_t len
,
593 map
<uint64_t, uint64_t> *m
);
594 int _do_seek_hole_data(int fd
, uint64_t offset
, size_t len
,
595 map
<uint64_t, uint64_t> *m
);
596 using ObjectStore::fiemap
;
597 int fiemap(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t offset
, size_t len
, bufferlist
& bl
) override
;
598 int fiemap(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t offset
, size_t len
, map
<uint64_t, uint64_t>& destmap
) override
;
600 int _touch(const coll_t
& cid
, const ghobject_t
& oid
);
601 int _write(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t offset
, size_t len
,
602 const bufferlist
& bl
, uint32_t fadvise_flags
= 0);
603 int _zero(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t offset
, size_t len
);
604 int _truncate(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t size
);
605 int _clone(const coll_t
& cid
, const ghobject_t
& oldoid
, const ghobject_t
& newoid
,
606 const SequencerPosition
& spos
);
607 int _clone_range(const coll_t
& oldcid
, const ghobject_t
& oldoid
, const coll_t
& newcid
, const ghobject_t
& newoid
,
608 uint64_t srcoff
, uint64_t len
, uint64_t dstoff
,
609 const SequencerPosition
& spos
);
610 int _do_clone_range(int from
, int to
, uint64_t srcoff
, uint64_t len
, uint64_t dstoff
);
611 int _do_sparse_copy_range(int from
, int to
, uint64_t srcoff
, uint64_t len
, uint64_t dstoff
);
612 int _do_copy_range(int from
, int to
, uint64_t srcoff
, uint64_t len
, uint64_t dstoff
, bool skip_sloppycrc
=false);
613 int _remove(const coll_t
& cid
, const ghobject_t
& oid
, const SequencerPosition
&spos
);
615 int _fgetattr(int fd
, const char *name
, bufferptr
& bp
);
616 int _fgetattrs(int fd
, map
<string
,bufferptr
>& aset
);
617 int _fsetattrs(int fd
, map
<string
, bufferptr
> &aset
);
621 void do_force_sync();
622 void start_sync(Context
*onsafe
);
624 void _flush_op_queue();
626 void sync_and_flush();
628 int flush_journal() override
;
629 int dump_journal(ostream
& out
) override
;
631 void set_fsid(uuid_d u
) override
{
634 uuid_d
get_fsid() override
{ return fsid
; }
636 uint64_t estimate_objects_overhead(uint64_t num_objects
) override
;
638 // DEBUG read error injection, an object is removed from both on delete()
639 Mutex read_error_lock
;
640 set
<ghobject_t
> data_error_set
; // read() will return -EIO
641 set
<ghobject_t
> mdata_error_set
; // getattr(),stat() will return -EIO
642 void inject_data_error(const ghobject_t
&oid
) override
;
643 void inject_mdata_error(const ghobject_t
&oid
) override
;
645 void compact() override
{
647 object_map
->compact();
650 bool has_builtin_csum() const override
{
654 void debug_obj_on_delete(const ghobject_t
&oid
);
655 bool debug_data_eio(const ghobject_t
&oid
);
656 bool debug_mdata_eio(const ghobject_t
&oid
);
658 int snapshot(const string
& name
) override
;
661 using ObjectStore::getattr
;
662 using ObjectStore::getattrs
;
663 int getattr(const coll_t
& cid
, const ghobject_t
& oid
, const char *name
, bufferptr
&bp
) override
;
664 int getattrs(const coll_t
& cid
, const ghobject_t
& oid
, map
<string
,bufferptr
>& aset
) override
;
666 int _setattrs(const coll_t
& cid
, const ghobject_t
& oid
, map
<string
,bufferptr
>& aset
,
667 const SequencerPosition
&spos
);
668 int _rmattr(const coll_t
& cid
, const ghobject_t
& oid
, const char *name
,
669 const SequencerPosition
&spos
);
670 int _rmattrs(const coll_t
& cid
, const ghobject_t
& oid
,
671 const SequencerPosition
&spos
);
673 int _collection_remove_recursive(const coll_t
&cid
,
674 const SequencerPosition
&spos
);
676 int _collection_set_bits(const coll_t
& cid
, int bits
);
679 using ObjectStore::collection_list
;
680 int collection_bits(const coll_t
& c
) override
;
681 int collection_list(const coll_t
& c
,
682 const ghobject_t
& start
, const ghobject_t
& end
, int max
,
683 vector
<ghobject_t
> *ls
, ghobject_t
*next
) override
;
684 int list_collections(vector
<coll_t
>& ls
) override
;
685 int list_collections(vector
<coll_t
>& ls
, bool include_temp
);
686 int collection_stat(const coll_t
& c
, struct stat
*st
);
687 bool collection_exists(const coll_t
& c
) override
;
688 int collection_empty(const coll_t
& c
, bool *empty
) override
;
690 // omap (see ObjectStore.h for documentation)
691 using ObjectStore::omap_get
;
692 int omap_get(const coll_t
& c
, const ghobject_t
&oid
, bufferlist
*header
,
693 map
<string
, bufferlist
> *out
) override
;
694 using ObjectStore::omap_get_header
;
697 const ghobject_t
&oid
,
699 bool allow_eio
= false) override
;
700 using ObjectStore::omap_get_keys
;
701 int omap_get_keys(const coll_t
& c
, const ghobject_t
&oid
, set
<string
> *keys
) override
;
702 using ObjectStore::omap_get_values
;
703 int omap_get_values(const coll_t
& c
, const ghobject_t
&oid
, const set
<string
> &keys
,
704 map
<string
, bufferlist
> *out
) override
;
705 using ObjectStore::omap_check_keys
;
706 int omap_check_keys(const coll_t
& c
, const ghobject_t
&oid
, const set
<string
> &keys
,
707 set
<string
> *out
) override
;
708 using ObjectStore::get_omap_iterator
;
709 ObjectMap::ObjectMapIterator
get_omap_iterator(const coll_t
& c
, const ghobject_t
&oid
) override
;
711 int _create_collection(const coll_t
& c
, int bits
,
712 const SequencerPosition
&spos
);
713 int _destroy_collection(const coll_t
& c
);
/**
 * Give an expected number of objects hint to the collection.
 *
 * @param c - collection id.
 * @param pg_num - pg number of the pool this collection belongs to
 * @param expected_num_objs - expected number of objects in this collection
 * @param spos - sequence position
 *
 * @return 0 on success, an error code otherwise
 */
724 int _collection_hint_expected_num_objs(const coll_t
& c
, uint32_t pg_num
,
725 uint64_t expected_num_objs
,
726 const SequencerPosition
&spos
);
727 int _collection_add(const coll_t
& c
, const coll_t
& ocid
, const ghobject_t
& oid
,
728 const SequencerPosition
& spos
);
729 int _collection_move_rename(const coll_t
& oldcid
, const ghobject_t
& oldoid
,
730 coll_t c
, const ghobject_t
& o
,
731 const SequencerPosition
& spos
,
732 bool ignore_enoent
= false);
734 int _set_alloc_hint(const coll_t
& cid
, const ghobject_t
& oid
,
735 uint64_t expected_object_size
,
736 uint64_t expected_write_size
);
738 void dump_start(const std::string
& file
);
740 void dump_transactions(vector
<Transaction
>& ls
, uint64_t seq
, OpSequencer
*osr
);
742 virtual int apply_layout_settings(const coll_t
&cid
);
745 void _inject_failure();
748 int _omap_clear(const coll_t
& cid
, const ghobject_t
&oid
,
749 const SequencerPosition
&spos
);
750 int _omap_setkeys(const coll_t
& cid
, const ghobject_t
&oid
,
751 const map
<string
, bufferlist
> &aset
,
752 const SequencerPosition
&spos
);
753 int _omap_rmkeys(const coll_t
& cid
, const ghobject_t
&oid
, const set
<string
> &keys
,
754 const SequencerPosition
&spos
);
755 int _omap_rmkeyrange(const coll_t
& cid
, const ghobject_t
&oid
,
756 const string
& first
, const string
& last
,
757 const SequencerPosition
&spos
);
758 int _omap_setheader(const coll_t
& cid
, const ghobject_t
&oid
, const bufferlist
&bl
,
759 const SequencerPosition
&spos
);
760 int _split_collection(const coll_t
& cid
, uint32_t bits
, uint32_t rem
, coll_t dest
,
761 const SequencerPosition
&spos
);
762 int _split_collection_create(const coll_t
& cid
, uint32_t bits
, uint32_t rem
,
764 const SequencerPosition
&spos
);
766 const char** get_tracked_conf_keys() const override
;
767 void handle_conf_change(const struct md_config_t
*conf
,
768 const std::set
<std::string
> &changed
) override
;
769 int set_throttle_params();
770 float m_filestore_commit_timeout
;
771 bool m_filestore_journal_parallel
;
772 bool m_filestore_journal_trailing
;
773 bool m_filestore_journal_writeahead
;
774 int m_filestore_fiemap_threshold
;
775 double m_filestore_max_sync_interval
;
776 double m_filestore_min_sync_interval
;
777 bool m_filestore_fail_eio
;
778 bool m_filestore_fadvise
;
780 bool m_journal_dio
, m_journal_aio
, m_journal_force_aio
;
781 std::string m_osd_rollback_to_cluster_snap
;
782 bool m_osd_use_stale_snap
;
783 bool m_filestore_do_dump
;
784 std::ofstream m_filestore_dump
;
785 JSONFormatter m_filestore_dump_fmt
;
786 std::atomic
<int64_t> m_filestore_kill_at
= { 0 };
787 bool m_filestore_sloppy_crc
;
788 int m_filestore_sloppy_crc_block_size
;
789 uint64_t m_filestore_max_alloc_hint_size
;
792 //Determined xattr handling based on fs type
793 void set_xattr_limits_via_conf();
794 uint32_t m_filestore_max_inline_xattr_size
;
795 uint32_t m_filestore_max_inline_xattrs
;
796 uint32_t m_filestore_max_xattr_value_size
;
798 FSSuperblock superblock
;
/**
 * Write superblock to persistent storage
 *
 * return value: 0 on success, otherwise negative errno
 */
807 int write_superblock();
/**
 * Fill in FileStore::superblock by reading persistent storage
 *
 * return value: 0 on success, otherwise negative errno
 */
816 int read_superblock();
818 friend class FileStoreBackend
;
819 friend class TestFileStore
;
822 ostream
& operator<<(ostream
& out
, const FileStore::OpSequencer
& s
);
826 class FileStoreBackend
{
828 FileStore
*filestore
;
830 int get_basedir_fd() {
831 return filestore
->basedir_fd
;
833 int get_current_fd() {
834 return filestore
->current_fd
;
837 return filestore
->op_fd
;
839 size_t get_blksize() {
840 return filestore
->blk_size
;
842 const string
& get_basedir_path() {
843 return filestore
->basedir
;
845 const string
& get_journal_path() {
846 return filestore
->journalpath
;
848 const string
& get_current_path() {
849 return filestore
->current_fn
;
851 int _copy_range(int from
, int to
, uint64_t srcoff
, uint64_t len
, uint64_t dstoff
) {
852 if (has_fiemap() || has_seek_data_hole()) {
853 return filestore
->_do_sparse_copy_range(from
, to
, srcoff
, len
, dstoff
);
855 return filestore
->_do_copy_range(from
, to
, srcoff
, len
, dstoff
);
858 int get_crc_block_size() {
859 return filestore
->m_filestore_sloppy_crc_block_size
;
863 explicit FileStoreBackend(FileStore
*fs
) : filestore(fs
) {}
864 virtual ~FileStoreBackend() {}
866 CephContext
* cct() const {
867 return filestore
->cct
;
870 static FileStoreBackend
*create(long f_type
, FileStore
*fs
);
872 virtual const char *get_name() = 0;
873 virtual int detect_features() = 0;
874 virtual int create_current() = 0;
875 virtual bool can_checkpoint() = 0;
876 virtual int list_checkpoints(list
<string
>& ls
) = 0;
877 virtual int create_checkpoint(const string
& name
, uint64_t *cid
) = 0;
878 virtual int sync_checkpoint(uint64_t id
) = 0;
879 virtual int rollback_to(const string
& name
) = 0;
880 virtual int destroy_checkpoint(const string
& name
) = 0;
881 virtual int syncfs() = 0;
882 virtual bool has_fiemap() = 0;
883 virtual bool has_seek_data_hole() = 0;
884 virtual bool is_rotational() = 0;
885 virtual bool is_journal_rotational() = 0;
886 virtual int do_fiemap(int fd
, off_t start
, size_t len
, struct fiemap
**pfiemap
) = 0;
887 virtual int clone_range(int from
, int to
, uint64_t srcoff
, uint64_t len
, uint64_t dstoff
) = 0;
888 virtual int set_alloc_hint(int fd
, uint64_t hint
) = 0;
889 virtual bool has_splice() const = 0;
891 // hooks for (sloppy) crc tracking
892 virtual int _crc_update_write(int fd
, loff_t off
, size_t len
, const bufferlist
& bl
) = 0;
893 virtual int _crc_update_truncate(int fd
, loff_t off
) = 0;
894 virtual int _crc_update_zero(int fd
, loff_t off
, size_t len
) = 0;
895 virtual int _crc_update_clone_range(int srcfd
, int destfd
,
896 loff_t srcoff
, size_t len
, loff_t dstoff
) = 0;
897 virtual int _crc_verify_read(int fd
, loff_t off
, size_t len
, const bufferlist
& bl
,