// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */


#ifndef CEPH_FILESTORE_H
#define CEPH_FILESTORE_H

#include "include/types.h"

#include <map>
#include <deque>
#include <atomic>
#include <fstream>


#include <boost/scoped_ptr.hpp>

#include "include/unordered_map.h"

#include "include/ceph_assert.h"

#include "os/ObjectStore.h"
#include "JournalingObjectStore.h"

#include "common/Timer.h"
#include "common/WorkQueue.h"
#include "common/perf_counters.h"
#include "common/zipkin_trace.h"

#include "common/ceph_mutex.h"
#include "HashIndex.h"
#include "IndexManager.h"
#include "os/ObjectMap.h"
#include "SequencerPosition.h"
#include "FDCache.h"
#include "WBThrottle.h"

#include "include/uuid.h"

#if defined(__linux__)
# ifndef BTRFS_SUPER_MAGIC
#  define BTRFS_SUPER_MAGIC 0x9123683EUL
# endif
# ifndef XFS_SUPER_MAGIC
#  define XFS_SUPER_MAGIC 0x58465342UL
# endif
# ifndef ZFS_SUPER_MAGIC
#  define ZFS_SUPER_MAGIC 0x2fc12fc1UL
# endif
#endif
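// These magic numbers are the statfs(2)/fstatfs(2) f_type values for the
// filesystems FileStore knows how to specialize for; create_backend() below
// selects a FileStoreBackend implementation by matching against them.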


class FileStoreBackend;

#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")

enum {
  l_filestore_first = 84000,
  l_filestore_journal_queue_ops,
  l_filestore_journal_queue_bytes,
  l_filestore_journal_ops,
  l_filestore_journal_bytes,
  l_filestore_journal_latency,
  l_filestore_journal_wr,
  l_filestore_journal_wr_bytes,
  l_filestore_journal_full,
  l_filestore_committing,
  l_filestore_commitcycle,
  l_filestore_commitcycle_interval,
  l_filestore_commitcycle_latency,
  l_filestore_op_queue_max_ops,
  l_filestore_op_queue_ops,
  l_filestore_ops,
  l_filestore_op_queue_max_bytes,
  l_filestore_op_queue_bytes,
  l_filestore_bytes,
  l_filestore_apply_latency,
  l_filestore_queue_transaction_latency_avg,
  l_filestore_sync_pause_max_lat,
  l_filestore_last,
};
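// The first/last entries are sentinels: they bound the index range the
// PerfCounters instance for this FileStore is built with, so new counters
// belong between l_filestore_first and l_filestore_last.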

class FSSuperblock {
public:
  CompatSet compat_features;
  std::string omap_backend;

  FSSuperblock() { }

  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<FSSuperblock*>& o);
};
WRITE_CLASS_ENCODER(FSSuperblock)
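// WRITE_CLASS_ENCODER generates free encode()/decode() overloads that forward
// to the member functions above, so a superblock can round-trip through a
// bufferlist. A minimal sketch (illustrative locals only):
//
//   FSSuperblock sb;
//   ceph::buffer::list bl;
//   encode(sb, bl);        // forwards to FSSuperblock::encode
//   auto p = bl.cbegin();
//   decode(sb, p);         // forwards to FSSuperblock::decode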

inline std::ostream& operator<<(std::ostream& out, const FSSuperblock& sb)
{
  return out << "sb(" << sb.compat_features << "): "
             << sb.omap_backend;
}

class FileStore : public JournalingObjectStore,
                  public md_config_obs_t
{
  static const uint32_t target_version = 4;
public:
  uint32_t get_target_version() {
    return target_version;
  }

  static int get_block_device_fsid(CephContext* cct, const std::string& path,
                                   uuid_d *fsid);
  struct FSPerfTracker {
    PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
    PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;

    objectstore_perf_stat_t get_cur_stats() const {
      objectstore_perf_stat_t ret;
      ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
      ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
      return ret;
    }

    void update_from_perfcounters(PerfCounters &logger);
  } perf_tracker;
  objectstore_perf_stat_t get_cur_stats() override {
    perf_tracker.update_from_perfcounters(*logger);
    return perf_tracker.get_cur_stats();
  }
  const PerfCounters* get_perf_counters() const override {
    return logger;
  }

private:
  std::string internal_name; ///< internal name, used to name the perfcounter instance
  std::string basedir, journalpath;
  osflagbits_t generic_flags;
  std::string current_fn;
  std::string current_op_seq_fn;
  std::string omap_dir;
  uuid_d fsid;

  size_t blk_size; ///< fs block size

  int fsid_fd, op_fd, basedir_fd, current_fd;

  FileStoreBackend *backend;

  void create_backend(unsigned long f_type);

  std::string devname;

  int vdo_fd = -1;
  std::string vdo_name;

  std::deque<uint64_t> snaps;

  // Indexed Collections
  IndexManager index_manager;
  int get_index(const coll_t& c, Index *index);
  int init_index(const coll_t& c);

  bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) {
    // - normal temp case: cid is pg, object is temp (pool < -1)
    // - hammer temp case: cid is pg (or already temp), object pool is -1
    return cid.is_pg() && oid.hobj.pool <= -1;
  }
  void init_temp_collections();

  void handle_eio();

  // ObjectMap
  boost::scoped_ptr<ObjectMap> object_map;

  // helper fns
  int get_cdir(const coll_t& cid, char *s, int len);

  /// read a uuid from fd
  int read_fsid(int fd, uuid_d *uuid);

  /// lock fsid_fd
  int lock_fsid();

  // sync thread
  ceph::mutex lock = ceph::make_mutex("FileStore::lock");
  bool force_sync;
  ceph::condition_variable sync_cond;

  ceph::mutex sync_entry_timeo_lock = ceph::make_mutex("FileStore::sync_entry_timeo_lock");
  SafeTimer timer;

  std::list<Context*> sync_waiters;
  bool stop;
  void sync_entry();
  struct SyncThread : public Thread {
    FileStore *fs;
    explicit SyncThread(FileStore *f) : fs(f) {}
    void *entry() override {
      fs->sync_entry();
      return nullptr;
    }
  } sync_thread;

  // -- op workqueue --
  struct Op {
    utime_t start;
    uint64_t op;
    std::vector<Transaction> tls;
    Context *onreadable, *onreadable_sync;
    uint64_t ops, bytes;
    TrackedOpRef osd_op;
    ZTracer::Trace trace;
    bool registered_apply = false;
  };
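  // Each collection is driven by an OpSequencer: 'q' holds Ops queued for
  // apply to the filesystem while 'jq' holds the seqs of ops still in the
  // journal; flush() takes the max seq across both tails as a watermark and
  // waits until the fronts of both queues have drained past it.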
  class OpSequencer : public CollectionImpl {
    CephContext *cct;
    // to protect q, for benefit of flush (peek/dequeue are additionally
    // serialized by apply_lock)
    ceph::mutex qlock =
      ceph::make_mutex("FileStore::OpSequencer::qlock", false);
    std::list<Op*> q;
    std::list<uint64_t> jq;
    std::list<std::pair<uint64_t, Context*> > flush_commit_waiters;
    ceph::condition_variable cond;
    std::string osr_name_str;
    /// hash of pointers to ghobject_t's for in-flight writes
    std::unordered_multimap<uint32_t,const ghobject_t*> applying;
  public:
    // for apply mutual exclusion
    ceph::mutex apply_lock =
      ceph::make_mutex("FileStore::OpSequencer::apply_lock", false);
    int id;
    const char *osr_name;

    /// get_max_uncompleted
    bool _get_max_uncompleted(
      uint64_t *seq ///< [out] max uncompleted seq
      ) {
      ceph_assert(seq);
      *seq = 0;
      if (q.empty() && jq.empty())
        return true;

      if (!q.empty())
        *seq = q.back()->op;
      if (!jq.empty() && jq.back() > *seq)
        *seq = jq.back();

      return false;
    } /// @returns true if both queues are empty

    /// get_min_uncompleted
    bool _get_min_uncompleted(
      uint64_t *seq ///< [out] min uncompleted seq
      ) {
      ceph_assert(seq);
      *seq = 0;
      if (q.empty() && jq.empty())
        return true;

      if (!q.empty())
        *seq = q.front()->op;
      if (!jq.empty() && jq.front() < *seq)
        *seq = jq.front();

      return false;
    } /// @returns true if both queues are empty

    void _wake_flush_waiters(std::list<Context*> *to_queue) {
      uint64_t seq;
      if (_get_min_uncompleted(&seq))
        seq = -1;

      for (auto i = flush_commit_waiters.begin();
           i != flush_commit_waiters.end() && i->first < seq;
           flush_commit_waiters.erase(i++)) {
        to_queue->push_back(i->second);
      }
    }

    void queue_journal(Op *o) {
      std::lock_guard l{qlock};
      jq.push_back(o->op);
      _register_apply(o);
    }
    void dequeue_journal(std::list<Context*> *to_queue) {
      std::lock_guard l{qlock};
      jq.pop_front();
      cond.notify_all();
      _wake_flush_waiters(to_queue);
    }
    void queue(Op *o) {
      std::lock_guard l{qlock};
      q.push_back(o);
      _register_apply(o);
      o->trace.keyval("queue depth", q.size());
    }
    void _register_apply(Op *o);
    void _unregister_apply(Op *o);
    void wait_for_apply(const ghobject_t& oid);
    Op *peek_queue() {
      std::lock_guard l{qlock};
      ceph_assert(ceph_mutex_is_locked(apply_lock));
      return q.front();
    }

    Op *dequeue(std::list<Context*> *to_queue) {
      ceph_assert(to_queue);
      ceph_assert(ceph_mutex_is_locked(apply_lock));
      std::lock_guard l{qlock};
      Op *o = q.front();
      q.pop_front();
      cond.notify_all();
      _unregister_apply(o);
      _wake_flush_waiters(to_queue);
      return o;
    }

    void flush() override {
      std::unique_lock l{qlock};
      // if filestore_blackhole is set, wait here forever
      cond.wait(l, [this] { return !cct->_conf->filestore_blackhole; });

      // get max for journal _or_ op queues
      uint64_t seq = 0;
      if (!q.empty())
        seq = q.back()->op;
      if (!jq.empty() && jq.back() > seq)
        seq = jq.back();

      if (seq) {
        // wait for everything prior to our watermark to drain through
        // either/both queues
        cond.wait(l, [seq, this] {
          return ((q.empty() || q.front()->op > seq) &&
                  (jq.empty() || jq.front() > seq));
        });
      }
    }
    bool flush_commit(Context *c) override {
      std::lock_guard l{qlock};
      uint64_t seq = 0;
      if (_get_max_uncompleted(&seq)) {
        return true;
      } else {
        flush_commit_waiters.push_back(std::make_pair(seq, c));
        return false;
      }
    }

  private:
    FRIEND_MAKE_REF(OpSequencer);
    OpSequencer(CephContext* cct, int i, coll_t cid)
      : CollectionImpl(cct, cid),
        cct(cct),
        osr_name_str(stringify(cid)),
        id(i),
        osr_name(osr_name_str.c_str()) {}
    ~OpSequencer() override {
      ceph_assert(q.empty());
    }
  };
  typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;

  ceph::mutex coll_lock = ceph::make_mutex("FileStore::coll_lock");
  std::map<coll_t,OpSequencerRef> coll_map;

  friend std::ostream& operator<<(std::ostream& out, const OpSequencer& s);

  FDCache fdcache;
  WBThrottle wbthrottle;

  std::atomic<int64_t> next_osr_id = { 0 };
  bool m_disable_wbthrottle;
  std::deque<OpSequencer*> op_queue;
  BackoffThrottle throttle_ops, throttle_bytes;
  const int m_ondisk_finisher_num;
  const int m_apply_finisher_num;
  std::vector<Finisher*> ondisk_finishers;
  std::vector<Finisher*> apply_finishers;

  ThreadPool op_tp;
  struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
    FileStore *store;
    OpWQ(FileStore *fs,
         ceph::timespan timeout,
         ceph::timespan suicide_timeout,
         ThreadPool *tp)
      : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ",
                                           timeout, suicide_timeout, tp),
        store(fs) {}

    bool _enqueue(OpSequencer *osr) override {
      store->op_queue.push_back(osr);
      return true;
    }
    void _dequeue(OpSequencer *o) override {
      ceph_abort();
    }
    bool _empty() override {
      return store->op_queue.empty();
    }
    OpSequencer *_dequeue() override {
      if (store->op_queue.empty())
        return nullptr;
      OpSequencer *osr = store->op_queue.front();
      store->op_queue.pop_front();
      return osr;
    }
    void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override {
      store->_do_op(osr, handle);
    }
    void _process_finish(OpSequencer *osr) override {
      store->_finish_op(osr);
    }
    void _clear() override {
      ceph_assert(store->op_queue.empty());
    }
  } op_wq;

  void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
  void _finish_op(OpSequencer *o);
  Op *build_op(std::vector<Transaction>& tls,
               Context *onreadable, Context *onreadable_sync,
               TrackedOpRef osd_op);
  void queue_op(OpSequencer *osr, Op *o);
  void op_queue_reserve_throttle(Op *o);
  void op_queue_release_throttle(Op *o);
  void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
  friend struct C_JournaledAhead;

  void new_journal();

  PerfCounters *logger;

  ZTracer::Endpoint trace_endpoint;

public:
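  // lfn_* ("long file name") helpers resolve an object to its on-disk path
  // via the collection's index (names too long for the underlying filesystem
  // are hashed) and operate on the file behind it.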
  int lfn_find(const ghobject_t& oid, const Index& index,
               IndexedPath *path = nullptr);
  int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length);
  int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf);
  int lfn_open(
    const coll_t& cid,
    const ghobject_t& oid,
    bool create,
    FDRef *outfd,
    Index *index = nullptr);

  void lfn_close(FDRef fd);
  int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid);
  int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos,
                 bool force_clear_omap=false);

public:
  FileStore(CephContext* cct, const std::string &base, const std::string &jdev,
            osflagbits_t flags = 0,
            const char *internal_name = "filestore", bool update_to=false);
  ~FileStore() override;

  std::string get_type() override {
    return "filestore";
  }

  int _detect_fs();
  int _sanity_check_fs();

  bool test_mount_in_use() override;
  int read_op_seq(uint64_t *seq);
  int write_op_seq(int, uint64_t seq);
  int mount() override;
  int umount() override;

  int validate_hobject_key(const hobject_t &obj) const override;

  unsigned get_max_attr_name_length() override {
    // xattr limit is 128; leave room for our prefixes (user.ceph._),
    // some margin, and cap at 100
    return 100;
  }
  int mkfs() override;
  int mkjournal() override;
  bool wants_journal() override {
    return true;
  }
  bool allows_journal() override {
    return true;
  }
  bool needs_journal() override {
    return false;
  }

  bool is_sync_onreadable() const override {
    return false;
  }

  bool is_rotational() override;
  bool is_journal_rotational() override;

  void dump_perf_counters(ceph::Formatter *f) override {
    f->open_object_section("perf_counters");
    logger->dump_formatted(f, false);
    f->close_section();
  }

  int flush_cache(std::ostream *os = nullptr) override;
  int write_version_stamp();
  int version_stamp_is_valid(uint32_t *version);
  int update_version_stamp();
  int upgrade() override;

  bool can_sort_nibblewise() override {
    return true;    // i support legacy sort order
  }

  void collect_metadata(std::map<std::string,std::string> *pm) override;
  int get_devices(std::set<std::string> *ls) override;

  int statfs(struct store_statfs_t *buf,
             osd_alert_list_t* alerts = nullptr) override;
  int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
                  bool *per_pool_omap) override;

  int _do_transactions(
    std::vector<Transaction> &tls, uint64_t op_seq,
    ThreadPool::TPHandle *handle,
    const char *osr_name);
  int do_transactions(std::vector<Transaction> &tls, uint64_t op_seq) override {
    return _do_transactions(tls, op_seq, nullptr, "replay");
  }
  void _do_transaction(
    Transaction& t, uint64_t op_seq, int trans_num,
    ThreadPool::TPHandle *handle, const char *osr_name);

  CollectionHandle open_collection(const coll_t& c) override;
  CollectionHandle create_new_collection(const coll_t& c) override;
  void set_collection_commit_queue(const coll_t& cid,
                                   ContextQueue *commit_queue) override {
  }

  int queue_transactions(CollectionHandle& ch, std::vector<Transaction>& tls,
                         TrackedOpRef op = TrackedOpRef(),
                         ThreadPool::TPHandle *handle = nullptr) override;

  /**
   * set replay guard xattr on given file
   *
   * This will ensure that we will not replay this (or any previous) operation
   * against this particular inode/object.
   *
   * @param fd open file descriptor for the file/object
   * @param spos sequencer position of the last operation we should not replay
   */
  void _set_replay_guard(int fd,
                         const SequencerPosition& spos,
                         const ghobject_t *oid=nullptr,
                         bool in_progress=false);
  void _set_replay_guard(const coll_t& cid,
                         const SequencerPosition& spos,
                         bool in_progress);
  void _set_global_replay_guard(const coll_t& cid,
                                const SequencerPosition &spos);

  /// close a replay guard opened with in_progress=true
  void _close_replay_guard(int fd, const SequencerPosition& spos,
                           const ghobject_t *oid=nullptr);
  void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos);

  /**
   * check replay guard xattr on given file
   *
   * Check the current position against any marker on the file that
   * indicates which operations have already been applied.  If the
   * current or a newer operation has been marked as applied, we
   * should not replay the current operation again.
   *
   * If we are not replaying the journal, we always return 1.  It is
   * only on replay that we might return a non-positive value,
   * indicating that the operation should not be performed (again).
   *
   * @param fd open fd on the file/object in question
   * @param spos SequencerPosition for an operation we could apply/replay
   * @return 1 if we can apply (maybe replay) this operation, -1 if spos
   *         has already been applied, 0 if it was in progress
   */
  int _check_replay_guard(int fd, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos);
  int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos);
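  // Typical guard pattern, sketched from the contract above (the op body is
  // hypothetical):
  //
  //   if (_check_replay_guard(cid, oid, spos) < 0)
  //     return 0;                         // already applied; skip on replay
  //   ... perform the operation ...
  //   _set_replay_guard(fd, spos, &oid);  // mark applied for future replays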

  // ------------------
  // objects
  int pick_object_revision_lt(ghobject_t& oid) {
    return 0;
  }
  using ObjectStore::exists;
  bool exists(CollectionHandle& c, const ghobject_t& oid) override;
  using ObjectStore::stat;
  int stat(
    CollectionHandle& c,
    const ghobject_t& oid,
    struct stat *st,
    bool allow_eio = false) override;
  using ObjectStore::set_collection_opts;
  int set_collection_opts(
    CollectionHandle& c,
    const pool_opts_t& opts) override;
  using ObjectStore::read;
  int read(
    CollectionHandle& c,
    const ghobject_t& oid,
    uint64_t offset,
    size_t len,
    ceph::buffer::list& bl,
    uint32_t op_flags = 0) override;
  int _do_fiemap(int fd, uint64_t offset, size_t len,
                 std::map<uint64_t, uint64_t> *m);
  int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
                         std::map<uint64_t, uint64_t> *m);
  using ObjectStore::fiemap;
  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, ceph::buffer::list& bl) override;
  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override;

  int _touch(const coll_t& cid, const ghobject_t& oid);
  int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
             const ceph::buffer::list& bl, uint32_t fadvise_flags = 0);
  int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
  int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
  int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
             const SequencerPosition& spos);
  int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
                   uint64_t srcoff, uint64_t len, uint64_t dstoff,
                   const SequencerPosition& spos);
  int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
  int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos);

  int _fgetattr(int fd, const char *name, ceph::bufferptr& bp);
  int _fgetattrs(int fd, std::map<std::string, ceph::bufferptr,std::less<>>& aset);
  int _fsetattrs(int fd, std::map<std::string, ceph::bufferptr,std::less<>>& aset);

  void do_force_sync();
  void start_sync(Context *onsafe);
  void sync();
  void _flush_op_queue();
  void flush();
  void sync_and_flush();

  int flush_journal() override;
  int dump_journal(std::ostream& out) override;

  void set_fsid(uuid_d u) override {
    fsid = u;
  }
  uuid_d get_fsid() override { return fsid; }

  uint64_t estimate_objects_overhead(uint64_t num_objects) override;

  // DEBUG read error injection; an object is removed from both sets on delete()
  ceph::mutex read_error_lock = ceph::make_mutex("FileStore::read_error_lock");
  std::set<ghobject_t> data_error_set;  // read() will return -EIO
  std::set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
  void inject_data_error(const ghobject_t &oid) override;
  void inject_mdata_error(const ghobject_t &oid) override;

  void compact() override {
    ceph_assert(object_map);
    object_map->compact();
  }

  bool has_builtin_csum() const override {
    return false;
  }

  void debug_obj_on_delete(const ghobject_t &oid);
  bool debug_data_eio(const ghobject_t &oid);
  bool debug_mdata_eio(const ghobject_t &oid);

  int snapshot(const std::string& name) override;

  // attrs
  using ObjectStore::getattr;
  using ObjectStore::getattrs;
  int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, ceph::bufferptr &bp) override;
  int getattrs(CollectionHandle& c, const ghobject_t& oid, std::map<std::string,ceph::bufferptr,std::less<>>& aset) override;

  int _setattrs(const coll_t& cid, const ghobject_t& oid, std::map<std::string,ceph::bufferptr>& aset,
                const SequencerPosition &spos);
  int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
              const SequencerPosition &spos);
  int _rmattrs(const coll_t& cid, const ghobject_t& oid,
               const SequencerPosition &spos);

  int _collection_remove_recursive(const coll_t &cid,
                                   const SequencerPosition &spos);

  int _collection_set_bits(const coll_t& cid, int bits);

  // collections
  using ObjectStore::collection_list;
  int collection_bits(CollectionHandle& c) override;
  int collection_list(CollectionHandle& c,
                      const ghobject_t& start, const ghobject_t& end, int max,
                      std::vector<ghobject_t> *ls, ghobject_t *next) override {
    c->flush();
    return collection_list(c->cid, start, end, max, ls, next);
  }
  int collection_list(const coll_t& cid,
                      const ghobject_t& start, const ghobject_t& end, int max,
                      std::vector<ghobject_t> *ls, ghobject_t *next);
  int list_collections(std::vector<coll_t>& ls) override;
  int list_collections(std::vector<coll_t>& ls, bool include_temp);
  int collection_stat(const coll_t& c, struct stat *st);
  bool collection_exists(const coll_t& c) override;
  int collection_empty(CollectionHandle& c, bool *empty) override {
    c->flush();
    return collection_empty(c->cid, empty);
  }
  int collection_empty(const coll_t& cid, bool *empty);

  // omap (see ObjectStore.h for documentation)
  using ObjectStore::omap_get;
  int omap_get(CollectionHandle& c, const ghobject_t &oid, ceph::buffer::list *header,
               std::map<std::string, ceph::buffer::list> *out) override;
  using ObjectStore::omap_get_header;
  int omap_get_header(
    CollectionHandle& c,
    const ghobject_t &oid,
    ceph::buffer::list *out,
    bool allow_eio = false) override;
  using ObjectStore::omap_get_keys;
  int omap_get_keys(CollectionHandle& c, const ghobject_t &oid, std::set<std::string> *keys) override;
  using ObjectStore::omap_get_values;
  int omap_get_values(CollectionHandle& c, const ghobject_t &oid, const std::set<std::string> &keys,
                      std::map<std::string, ceph::buffer::list> *out) override;
  using ObjectStore::omap_check_keys;
  int omap_check_keys(CollectionHandle& c, const ghobject_t &oid, const std::set<std::string> &keys,
                      std::set<std::string> *out) override;
  using ObjectStore::get_omap_iterator;
  ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c, const ghobject_t &oid) override;
  ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid, const ghobject_t &oid);

  int _create_collection(const coll_t& c, int bits,
                         const SequencerPosition &spos);
  int _destroy_collection(const coll_t& c);
  /**
   * Give an expected number of objects hint to the collection.
   *
   * @param c - collection id.
   * @param pg_num - pg number of the pool this collection belongs to
   * @param expected_num_objs - expected number of objects in this collection
   * @param spos - sequence position
   *
   * @return 0 on success, an error code otherwise
   */
  int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
                                         uint64_t expected_num_objs,
                                         const SequencerPosition &spos);
  int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid,
                      const SequencerPosition& spos);
  int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
                              coll_t c, const ghobject_t& o,
                              const SequencerPosition& spos,
                              bool ignore_enoent = false);

  int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
                      uint64_t expected_object_size,
                      uint64_t expected_write_size);

  void dump_start(const std::string& file);
  void dump_stop();
  void dump_transactions(std::vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);

  virtual int apply_layout_settings(const coll_t &cid, int target_level);

  void get_db_statistics(ceph::Formatter* f) override;

private:
  void _inject_failure();

  // omap
  int _omap_clear(const coll_t& cid, const ghobject_t &oid,
                  const SequencerPosition &spos);
  int _omap_setkeys(const coll_t& cid, const ghobject_t &oid,
                    const std::map<std::string, ceph::buffer::list> &aset,
                    const SequencerPosition &spos);
  int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const std::set<std::string> &keys,
                   const SequencerPosition &spos);
  int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
                       const std::string& first, const std::string& last,
                       const SequencerPosition &spos);
  int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const ceph::buffer::list &bl,
                      const SequencerPosition &spos);
  int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest,
                        const SequencerPosition &spos);
  int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest,
                        const SequencerPosition &spos);

  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set <std::string> &changed) override;
  int set_throttle_params();
  float m_filestore_commit_timeout;
  bool m_filestore_journal_parallel;
  bool m_filestore_journal_trailing;
  bool m_filestore_journal_writeahead;
  int m_filestore_fiemap_threshold;
  double m_filestore_max_sync_interval;
  double m_filestore_min_sync_interval;
  bool m_filestore_fail_eio;
  bool m_filestore_fadvise;
  int do_update;
  bool m_journal_dio, m_journal_aio, m_journal_force_aio;
  std::string m_osd_rollback_to_cluster_snap;
  bool m_osd_use_stale_snap;
  bool m_filestore_do_dump;
  std::ofstream m_filestore_dump;
  ceph::JSONFormatter m_filestore_dump_fmt;
  std::atomic<int64_t> m_filestore_kill_at = { 0 };
  bool m_filestore_sloppy_crc;
  int m_filestore_sloppy_crc_block_size;
  uint64_t m_filestore_max_alloc_hint_size;
  unsigned long m_fs_type;

  // Determine xattr handling based on fs type
  void set_xattr_limits_via_conf();
  uint32_t m_filestore_max_inline_xattr_size;
  uint32_t m_filestore_max_inline_xattrs;
  uint32_t m_filestore_max_xattr_value_size;

  FSSuperblock superblock;

  /**
   * write_superblock()
   *
   * Write superblock to persistent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int write_superblock();

  /**
   * read_superblock()
   *
   * Fill in FileStore::superblock by reading persistent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int read_superblock();

  friend class FileStoreBackend;
  friend class TestFileStore;
};

std::ostream& operator<<(std::ostream& out, const FileStore::OpSequencer& s);

struct fiemap;

class FileStoreBackend {
private:
  FileStore *filestore;
protected:
  int get_basedir_fd() {
    return filestore->basedir_fd;
  }
  int get_current_fd() {
    return filestore->current_fd;
  }
  int get_op_fd() {
    return filestore->op_fd;
  }
  size_t get_blksize() {
    return filestore->blk_size;
  }
  const std::string& get_basedir_path() {
    return filestore->basedir;
  }
  const std::string& get_journal_path() {
    return filestore->journalpath;
  }
  const std::string& get_current_path() {
    return filestore->current_fn;
  }
  int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
    if (has_fiemap() || has_seek_data_hole()) {
      return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
    } else {
      return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
    }
  }
  int get_crc_block_size() {
    return filestore->m_filestore_sloppy_crc_block_size;
  }

public:
  explicit FileStoreBackend(FileStore *fs) : filestore(fs) {}
  virtual ~FileStoreBackend() {}

  CephContext* cct() const {
    return filestore->cct;
  }

  static FileStoreBackend *create(unsigned long f_type, FileStore *fs);

  virtual const char *get_name() = 0;
  virtual int detect_features() = 0;
  virtual int create_current() = 0;
  virtual bool can_checkpoint() = 0;
  virtual int list_checkpoints(std::list<std::string>& ls) = 0;
  virtual int create_checkpoint(const std::string& name, uint64_t *cid) = 0;
  virtual int sync_checkpoint(uint64_t id) = 0;
  virtual int rollback_to(const std::string& name) = 0;
  virtual int destroy_checkpoint(const std::string& name) = 0;
  virtual int syncfs() = 0;
  virtual bool has_fiemap() = 0;
  virtual bool has_seek_data_hole() = 0;
  virtual bool is_rotational() = 0;
  virtual bool is_journal_rotational() = 0;
  virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
  virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
  virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
  virtual bool has_splice() const = 0;

  // hooks for (sloppy) crc tracking
  virtual int _crc_update_write(int fd, loff_t off, size_t len, const ceph::buffer::list& bl) = 0;
  virtual int _crc_update_truncate(int fd, loff_t off) = 0;
  virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
  virtual int _crc_update_clone_range(int srcfd, int destfd,
                                      loff_t srcoff, size_t len, loff_t dstoff) = 0;
  virtual int _crc_verify_read(int fd, loff_t off, size_t len, const ceph::buffer::list& bl,
                               std::ostream *out) = 0;
};

#endif