// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


#ifndef CEPH_FILESTORE_H
#define CEPH_FILESTORE_H

#include "include/types.h"

#include <map>
#include <deque>
#include <atomic>
#include <fstream>

using namespace std;

#include <boost/scoped_ptr.hpp>

#include "include/unordered_map.h"

#include "include/assert.h"

#include "os/ObjectStore.h"
#include "JournalingObjectStore.h"

#include "common/Timer.h"
#include "common/WorkQueue.h"
#include "common/perf_counters.h"
#include "common/zipkin_trace.h"

#include "common/Mutex.h"
#include "HashIndex.h"
#include "IndexManager.h"
#include "os/ObjectMap.h"
#include "SequencerPosition.h"
#include "FDCache.h"
#include "WBThrottle.h"

#include "include/uuid.h"


// from include/linux/falloc.h:
#ifndef FALLOC_FL_PUNCH_HOLE
# define FALLOC_FL_PUNCH_HOLE 0x2
#endif

#if defined(__linux__)
# ifndef BTRFS_SUPER_MAGIC
#define BTRFS_SUPER_MAGIC 0x9123683EL
# endif
# ifndef XFS_SUPER_MAGIC
#define XFS_SUPER_MAGIC 0x58465342L
# endif
# ifndef ZFS_SUPER_MAGIC
#define ZFS_SUPER_MAGIC 0x2fc12fc1L
# endif
#endif
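
// Illustrative sketch only: FileStore identifies the backing filesystem
// by comparing the f_type reported by fstatfs(2) against the magics
// above (the actual detection logic lives in FileStore.cc, feeding
// create_backend() declared below):
//
//   struct statfs st;
//   if (::fstatfs(basedir_fd, &st) == 0 &&
//       st.f_type == BTRFS_SUPER_MAGIC) {
//     // select the btrfs-specific backend
//   }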


class FileStoreBackend;

#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")

enum {
  l_filestore_first = 84000,
  l_filestore_journal_queue_ops,
  l_filestore_journal_queue_bytes,
  l_filestore_journal_ops,
  l_filestore_journal_bytes,
  l_filestore_journal_latency,
  l_filestore_journal_wr,
  l_filestore_journal_wr_bytes,
  l_filestore_journal_full,
  l_filestore_committing,
  l_filestore_commitcycle,
  l_filestore_commitcycle_interval,
  l_filestore_commitcycle_latency,
  l_filestore_op_queue_max_ops,
  l_filestore_op_queue_ops,
  l_filestore_ops,
  l_filestore_op_queue_max_bytes,
  l_filestore_op_queue_bytes,
  l_filestore_bytes,
  l_filestore_apply_latency,
  l_filestore_queue_transaction_latency_avg,
  l_filestore_last,
};
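
// Illustrative sketch only: a PerfCounters instance covering this enum
// range is typically built with PerfCountersBuilder (the real set of
// counters and their registration lives in FileStore.cc):
//
//   PerfCountersBuilder plb(cct, internal_name,
//                           l_filestore_first, l_filestore_last);
//   plb.add_u64(l_filestore_journal_queue_ops, "journal_queue_ops");
//   logger = plb.create_perf_counters();
//   cct->get_perfcounters_collection()->add(logger);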

class FSSuperblock {
public:
  CompatSet compat_features;
  string omap_backend;

  FSSuperblock() { }

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<FSSuperblock*>& o);
};
WRITE_CLASS_ENCODER(FSSuperblock)

inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
{
  return out << "sb(" << sb.compat_features << "): "
             << sb.omap_backend;
}
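
// Illustrative sketch only: WRITE_CLASS_ENCODER generates the free
// encode()/decode() helpers, so a superblock round-trips like any other
// Ceph-encodable type:
//
//   bufferlist bl;
//   ::encode(superblock, bl);
//   bufferlist::iterator p = bl.begin();
//   FSSuperblock sb2;
//   ::decode(sb2, p);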

class FileStore : public JournalingObjectStore,
                  public md_config_obs_t
{
  static const uint32_t target_version = 4;
public:
  uint32_t get_target_version() {
    return target_version;
  }

  static int get_block_device_fsid(CephContext* cct, const string& path,
                                   uuid_d *fsid);
  struct FSPerfTracker {
    PerfCounters::avg_tracker<uint64_t> os_commit_latency;
    PerfCounters::avg_tracker<uint64_t> os_apply_latency;

    objectstore_perf_stat_t get_cur_stats() const {
      objectstore_perf_stat_t ret;
      ret.os_commit_latency = os_commit_latency.avg();
      ret.os_apply_latency = os_apply_latency.avg();
      return ret;
    }

    void update_from_perfcounters(PerfCounters &logger);
  } perf_tracker;
  objectstore_perf_stat_t get_cur_stats() override {
    perf_tracker.update_from_perfcounters(*logger);
    return perf_tracker.get_cur_stats();
  }
  const PerfCounters* get_perf_counters() const override {
    return logger;
  }

private:
  string internal_name; ///< internal name, used to name the perfcounter instance
  string basedir, journalpath;
  osflagbits_t generic_flags;
  std::string current_fn;
  std::string current_op_seq_fn;
  std::string omap_dir;
  uuid_d fsid;

  size_t blk_size; ///< fs block size

  int fsid_fd, op_fd, basedir_fd, current_fd;

  FileStoreBackend *backend;

  void create_backend(long f_type);

  deque<uint64_t> snaps;

  // Indexed Collections
  IndexManager index_manager;
  int get_index(const coll_t& c, Index *index);
  int init_index(const coll_t& c);

  bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) {
    // - normal temp case: cid is pg, object is temp (pool < -1)
    // - hammer temp case: cid is pg (or already temp), object pool is -1
    return cid.is_pg() && oid.hobj.pool <= -1;
  }
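  // Illustrative example (pool values are an assumption of this sketch):
  // an object written into pool 1's temp namespace typically carries a
  // negative temp pool id such as hobj.pool == -3, while hammer-era temp
  // objects used pool -1; both satisfy oid.hobj.pool <= -1 above.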
  void init_temp_collections();

  // ObjectMap
  boost::scoped_ptr<ObjectMap> object_map;

  // helper fns
  int get_cdir(const coll_t& cid, char *s, int len);

  /// read a uuid from fd
  int read_fsid(int fd, uuid_d *uuid);

  /// lock fsid_fd
  int lock_fsid();

  // sync thread
  Mutex lock;
  bool force_sync;
  Cond sync_cond;

  Mutex sync_entry_timeo_lock;
  SafeTimer timer;

  list<Context*> sync_waiters;
  bool stop;
  void sync_entry();
  struct SyncThread : public Thread {
    FileStore *fs;
    explicit SyncThread(FileStore *f) : fs(f) {}
    void *entry() override {
      fs->sync_entry();
      return 0;
    }
  } sync_thread;

  // -- op workqueue --
  struct Op {
    utime_t start;
    uint64_t op;
    vector<Transaction> tls;
    Context *onreadable, *onreadable_sync;
    uint64_t ops, bytes;
    TrackedOpRef osd_op;
    ZTracer::Trace trace;
  };
  class OpSequencer : public Sequencer_impl {
    Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
    list<Op*> q;
    list<uint64_t> jq;
    list<pair<uint64_t, Context*> > flush_commit_waiters;
    Cond cond;
  public:
    Sequencer *parent;
    Mutex apply_lock; // for apply mutual exclusion
    int id;

    /// get_max_uncompleted
    bool _get_max_uncompleted(
      uint64_t *seq ///< [out] max uncompleted seq
      ) {
      assert(qlock.is_locked());
      assert(seq);
      *seq = 0;
      if (q.empty() && jq.empty())
        return true;

      if (!q.empty())
        *seq = q.back()->op;
      if (!jq.empty() && jq.back() > *seq)
        *seq = jq.back();

      return false;
    } /// @returns true if both queues are empty

    /// get_min_uncompleted
    bool _get_min_uncompleted(
      uint64_t *seq ///< [out] min uncompleted seq
      ) {
      assert(qlock.is_locked());
      assert(seq);
      *seq = 0;
      if (q.empty() && jq.empty())
        return true;

      if (!q.empty())
        *seq = q.front()->op;
      if (!jq.empty() && jq.front() < *seq)
        *seq = jq.front();

      return false;
    } /// @returns true if both queues are empty

    void _wake_flush_waiters(list<Context*> *to_queue) {
      uint64_t seq;
      if (_get_min_uncompleted(&seq))
        seq = -1;

      for (list<pair<uint64_t, Context*> >::iterator i =
             flush_commit_waiters.begin();
           i != flush_commit_waiters.end() && i->first < seq;
           flush_commit_waiters.erase(i++)) {
        to_queue->push_back(i->second);
      }
    }

    void queue_journal(uint64_t s) {
      Mutex::Locker l(qlock);
      jq.push_back(s);
    }
    void dequeue_journal(list<Context*> *to_queue) {
      Mutex::Locker l(qlock);
      jq.pop_front();
      cond.Signal();
      _wake_flush_waiters(to_queue);
    }
    void queue(Op *o) {
      Mutex::Locker l(qlock);
      q.push_back(o);
      o->trace.keyval("queue depth", q.size());
    }
    Op *peek_queue() {
      Mutex::Locker l(qlock);
      assert(apply_lock.is_locked());
      return q.front();
    }

    Op *dequeue(list<Context*> *to_queue) {
      assert(to_queue);
      assert(apply_lock.is_locked());
      Mutex::Locker l(qlock);
      Op *o = q.front();
      q.pop_front();
      cond.Signal();

      _wake_flush_waiters(to_queue);
      return o;
    }

    void flush() override {
      Mutex::Locker l(qlock);

      while (cct->_conf->filestore_blackhole)
        cond.Wait(qlock); // wait forever


      // get max for journal _or_ op queues
      uint64_t seq = 0;
      if (!q.empty())
        seq = q.back()->op;
      if (!jq.empty() && jq.back() > seq)
        seq = jq.back();

      if (seq) {
        // wait for everything prior to our watermark to drain through either/both queues
        while ((!q.empty() && q.front()->op <= seq) ||
               (!jq.empty() && jq.front() <= seq))
          cond.Wait(qlock);
      }
    }
    bool flush_commit(Context *c) override {
      Mutex::Locker l(qlock);
      uint64_t seq = 0;
      if (_get_max_uncompleted(&seq)) {
        return true;
      } else {
        flush_commit_waiters.push_back(make_pair(seq, c));
        return false;
      }
    }

    OpSequencer(CephContext* cct, int i)
      : Sequencer_impl(cct),
        qlock("FileStore::OpSequencer::qlock", false, false),
        parent(0),
        apply_lock("FileStore::OpSequencer::apply_lock", false, false),
        id(i) {}
    ~OpSequencer() override {
      assert(q.empty());
    }

    const string& get_name() const {
      return parent->get_name();
    }
  };
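
  // Illustrative overview (not authoritative; see FileStore.cc): each
  // queued transaction batch becomes an Op on its Sequencer's
  // OpSequencer. Roughly:
  //
  //   queue_transactions() -> build_op() -> queue_op(osr, o)  // enqueue
  //   op_tp worker:  _do_op(osr)   -> osr->peek_queue() -> apply to fs
  //   afterwards:    _finish_op(osr) -> osr->dequeue()  -> completions
  //
  // flush()/flush_commit() above wait on these queues draining.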

  friend ostream& operator<<(ostream& out, const OpSequencer& s);

  FDCache fdcache;
  WBThrottle wbthrottle;

  std::atomic<int64_t> next_osr_id = { 0 };
  bool m_disable_wbthrottle;
  deque<OpSequencer*> op_queue;
  BackoffThrottle throttle_ops, throttle_bytes;
  const int m_ondisk_finisher_num;
  const int m_apply_finisher_num;
  vector<Finisher*> ondisk_finishers;
  vector<Finisher*> apply_finishers;

  ThreadPool op_tp;
  struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
    FileStore *store;
    OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
      : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {}

    bool _enqueue(OpSequencer *osr) override {
      store->op_queue.push_back(osr);
      return true;
    }
    void _dequeue(OpSequencer *o) override {
      ceph_abort();
    }
    bool _empty() override {
      return store->op_queue.empty();
    }
    OpSequencer *_dequeue() override {
      if (store->op_queue.empty())
        return NULL;
      OpSequencer *osr = store->op_queue.front();
      store->op_queue.pop_front();
      return osr;
    }
    void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override {
      store->_do_op(osr, handle);
    }
    void _process_finish(OpSequencer *osr) override {
      store->_finish_op(osr);
    }
    void _clear() override {
      assert(store->op_queue.empty());
    }
  } op_wq;

  void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
  void _finish_op(OpSequencer *o);
  Op *build_op(vector<Transaction>& tls,
               Context *onreadable, Context *onreadable_sync,
               TrackedOpRef osd_op);
  void queue_op(OpSequencer *osr, Op *o);
  void op_queue_reserve_throttle(Op *o);
  void op_queue_release_throttle(Op *o);
  void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
  friend struct C_JournaledAhead;

  void new_journal();

  PerfCounters *logger;

  ZTracer::Endpoint trace_endpoint;

public:
  int lfn_find(const ghobject_t& oid, const Index& index,
               IndexedPath *path = NULL);
  int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length);
  int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf);
  int lfn_open(
    const coll_t& cid,
    const ghobject_t& oid,
    bool create,
    FDRef *outfd,
    Index *index = 0);

  void lfn_close(FDRef fd);
  int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid);
  int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos,
                 bool force_clear_omap=false);

public:
  FileStore(CephContext* cct, const std::string &base, const std::string &jdev,
            osflagbits_t flags = 0,
            const char *internal_name = "filestore", bool update_to=false);
  ~FileStore() override;

  string get_type() override {
    return "filestore";
  }

  int _detect_fs();
  int _sanity_check_fs();

  bool test_mount_in_use() override;
  int read_op_seq(uint64_t *seq);
  int write_op_seq(int, uint64_t seq);
  int mount() override;
  int umount() override;

  int validate_hobject_key(const hobject_t &obj) const override;

  unsigned get_max_attr_name_length() override {
    // xattr limit is 128; leave room for our prefixes (user.ceph._),
    // some margin, and cap at 100
    return 100;
  }
  int mkfs() override;
  int mkjournal() override;
  bool wants_journal() override {
    return true;
  }
  bool allows_journal() override {
    return true;
  }
  bool needs_journal() override {
    return false;
  }

  bool is_rotational() override;

  void dump_perf_counters(Formatter *f) override {
    f->open_object_section("perf_counters");
    logger->dump_formatted(f, false);
    f->close_section();
  }

  int write_version_stamp();
  int version_stamp_is_valid(uint32_t *version);
  int update_version_stamp();
  int upgrade() override;

  bool can_sort_nibblewise() override {
    return true; // we support the legacy nibblewise sort order
  }

  void collect_metadata(map<string,string> *pm) override;

  int statfs(struct store_statfs_t *buf) override;

  int _do_transactions(
    vector<Transaction> &tls, uint64_t op_seq,
    ThreadPool::TPHandle *handle);
  int do_transactions(vector<Transaction> &tls, uint64_t op_seq) override {
    return _do_transactions(tls, op_seq, 0);
  }
  void _do_transaction(
    Transaction& t, uint64_t op_seq, int trans_num,
    ThreadPool::TPHandle *handle);

  int queue_transactions(Sequencer *osr, vector<Transaction>& tls,
                         TrackedOpRef op = TrackedOpRef(),
                         ThreadPool::TPHandle *handle = NULL) override;
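
  // Illustrative caller-side sketch (names are hypothetical; the real
  // callers are the OSD PG code and the objectstore test tools):
  //
  //   ObjectStore::Sequencer osr("example");
  //   vector<ObjectStore::Transaction> tls(1);
  //   tls[0].write(cid, oid, 0, bl.length(), bl);  // stage a write
  //   fs->queue_transactions(&osr, tls);           // async journal+apply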

  /**
   * set replay guard xattr on given file
   *
   * This will ensure that we will not replay this (or any previous) operation
   * against this particular inode/object.
   *
   * @param fd open file descriptor for the file/object
   * @param spos sequencer position of the last operation we should not replay
   */
  void _set_replay_guard(int fd,
                         const SequencerPosition& spos,
                         const ghobject_t *oid=0,
                         bool in_progress=false);
  void _set_replay_guard(const coll_t& cid,
                         const SequencerPosition& spos,
                         bool in_progress);
  void _set_global_replay_guard(const coll_t& cid,
                                const SequencerPosition &spos);

  /// close a replay guard opened with in_progress=true
  void _close_replay_guard(int fd, const SequencerPosition& spos,
                           const ghobject_t *oid=0);
  void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos);
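
  // Illustrative guard pattern for a multi-step, non-idempotent
  // operation (sketch only; see the actual call sites in FileStore.cc):
  //
  //   _set_replay_guard(fd, spos, &oid, true);  // mark "in progress"
  //   /* ...perform the compound steps... */
  //   _close_replay_guard(fd, spos);            // mark complete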

  /**
   * check replay guard xattr on given file
   *
   * Check the current position against any marker on the file that
   * indicates which operations have already been applied. If the
   * current or a newer operation has been marked as applied, we
   * should not replay the current operation again.
   *
   * If we are not replaying the journal, we always return true. It
   * is only on replay that we might return false, indicating that the
   * operation should not be performed (again).
   *
   * @param fd open fd on the file/object in question
   * @param spos sequencer position for an operation we could apply/replay
   * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
   */
  int _check_replay_guard(int fd, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos);
  int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos);
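
  // Illustrative replay-time usage (sketch; the actual handling is in
  // the per-operation implementations in FileStore.cc):
  //
  //   int r = _check_replay_guard(cid, oid, spos);
  //   if (r < 0)
  //     return 0;  // spos already applied; skip this replay
  //   // r > 0: safe to apply; r == 0: operation was in progress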

  // ------------------
  // objects
  int pick_object_revision_lt(ghobject_t& oid) {
    return 0;
  }
  using ObjectStore::exists;
  bool exists(const coll_t& cid, const ghobject_t& oid) override;
  using ObjectStore::stat;
  int stat(
    const coll_t& cid,
    const ghobject_t& oid,
    struct stat *st,
    bool allow_eio = false) override;
  using ObjectStore::set_collection_opts;
  int set_collection_opts(
    const coll_t& cid,
    const pool_opts_t& opts) override;
  using ObjectStore::read;
  int read(
    const coll_t& cid,
    const ghobject_t& oid,
    uint64_t offset,
    size_t len,
    bufferlist& bl,
    uint32_t op_flags = 0,
    bool allow_eio = false) override;
  int _do_fiemap(int fd, uint64_t offset, size_t len,
                 map<uint64_t, uint64_t> *m);
  int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
                         map<uint64_t, uint64_t> *m);
  using ObjectStore::fiemap;
  int fiemap(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) override;
  int fiemap(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
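
  // Illustrative note: the fiemap result maps extent offset -> length of
  // allocated data. E.g. (values assumed) a sparse object with data at
  // [0, 4096) and [1048576, 1052672) yields
  //   destmap == { {0, 4096}, {1048576, 4096} }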

  int _touch(const coll_t& cid, const ghobject_t& oid);
  int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
             const bufferlist& bl, uint32_t fadvise_flags = 0);
  int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
  int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
  int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
             const SequencerPosition& spos);
  int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
                   uint64_t srcoff, uint64_t len, uint64_t dstoff,
                   const SequencerPosition& spos);
  int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
  int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos);

  int _fgetattr(int fd, const char *name, bufferptr& bp);
  int _fgetattrs(int fd, map<string,bufferptr>& aset);
  int _fsetattrs(int fd, map<string, bufferptr> &aset);

  void _start_sync();

  void do_force_sync();
  void start_sync(Context *onsafe);
  void sync();
  void _flush_op_queue();
  void flush();
  void sync_and_flush();

  int flush_journal() override;
  int dump_journal(ostream& out) override;

  void set_fsid(uuid_d u) override {
    fsid = u;
  }
  uuid_d get_fsid() override { return fsid; }

  uint64_t estimate_objects_overhead(uint64_t num_objects) override;

  // DEBUG read error injection; an object is removed from both sets on delete()
  Mutex read_error_lock;
  set<ghobject_t> data_error_set;  // read() will return -EIO
  set<ghobject_t> mdata_error_set; // getattr(), stat() will return -EIO
  void inject_data_error(const ghobject_t &oid) override;
  void inject_mdata_error(const ghobject_t &oid) override;
  void debug_obj_on_delete(const ghobject_t &oid);
  bool debug_data_eio(const ghobject_t &oid);
  bool debug_mdata_eio(const ghobject_t &oid);

  int snapshot(const string& name) override;

  // attrs
  using ObjectStore::getattr;
  using ObjectStore::getattrs;
  int getattr(const coll_t& cid, const ghobject_t& oid, const char *name, bufferptr &bp) override;
  int getattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset) override;

  int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset,
                const SequencerPosition &spos);
  int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
              const SequencerPosition &spos);
  int _rmattrs(const coll_t& cid, const ghobject_t& oid,
               const SequencerPosition &spos);

  int _collection_remove_recursive(const coll_t &cid,
                                   const SequencerPosition &spos);

  int _collection_set_bits(const coll_t& cid, int bits);

  // collections
  using ObjectStore::collection_list;
  int collection_bits(const coll_t& c) override;
  int collection_list(const coll_t& c,
                      const ghobject_t& start, const ghobject_t& end, int max,
                      vector<ghobject_t> *ls, ghobject_t *next) override;
  int list_collections(vector<coll_t>& ls) override;
  int list_collections(vector<coll_t>& ls, bool include_temp);
  int collection_stat(const coll_t& c, struct stat *st);
  bool collection_exists(const coll_t& c) override;
  int collection_empty(const coll_t& c, bool *empty) override;

  // omap (see ObjectStore.h for documentation)
  using ObjectStore::omap_get;
  int omap_get(const coll_t& c, const ghobject_t &oid, bufferlist *header,
               map<string, bufferlist> *out) override;
  using ObjectStore::omap_get_header;
  int omap_get_header(
    const coll_t& c,
    const ghobject_t &oid,
    bufferlist *out,
    bool allow_eio = false) override;
  using ObjectStore::omap_get_keys;
  int omap_get_keys(const coll_t& c, const ghobject_t &oid, set<string> *keys) override;
  using ObjectStore::omap_get_values;
  int omap_get_values(const coll_t& c, const ghobject_t &oid, const set<string> &keys,
                      map<string, bufferlist> *out) override;
  using ObjectStore::omap_check_keys;
  int omap_check_keys(const coll_t& c, const ghobject_t &oid, const set<string> &keys,
                      set<string> *out) override;
  using ObjectStore::get_omap_iterator;
  ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& c, const ghobject_t &oid) override;

  int _create_collection(const coll_t& c, int bits,
                         const SequencerPosition &spos);
  int _destroy_collection(const coll_t& c);
  /**
   * Give an expected number of objects hint to the collection.
   *
   * @param c - collection id.
   * @param pg_num - pg number of the pool this collection belongs to
   * @param expected_num_objs - expected number of objects in this collection
   * @param spos - sequence position
   *
   * @return 0 on success, an error code otherwise
   */
  int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
                                         uint64_t expected_num_objs,
                                         const SequencerPosition &spos);
  int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid,
                      const SequencerPosition& spos);
  int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
                              coll_t c, const ghobject_t& o,
                              const SequencerPosition& spos,
                              bool ignore_enoent = false);

  int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
                      uint64_t expected_object_size,
                      uint64_t expected_write_size);

  void dump_start(const std::string& file);
  void dump_stop();
  void dump_transactions(vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);

  virtual int apply_layout_settings(const coll_t &cid);

private:
  void _inject_failure();

  // omap
  int _omap_clear(const coll_t& cid, const ghobject_t &oid,
                  const SequencerPosition &spos);
  int _omap_setkeys(const coll_t& cid, const ghobject_t &oid,
                    const map<string, bufferlist> &aset,
                    const SequencerPosition &spos);
  int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const set<string> &keys,
                   const SequencerPosition &spos);
  int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
                       const string& first, const string& last,
                       const SequencerPosition &spos);
  int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl,
                      const SequencerPosition &spos);
  int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest,
                        const SequencerPosition &spos);
  int _split_collection_create(const coll_t& cid, uint32_t bits, uint32_t rem,
                               coll_t dest,
                               const SequencerPosition &spos);

  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const struct md_config_t *conf,
                          const std::set <std::string> &changed) override;
  int set_throttle_params();
  float m_filestore_commit_timeout;
  bool m_filestore_journal_parallel;
  bool m_filestore_journal_trailing;
  bool m_filestore_journal_writeahead;
  int m_filestore_fiemap_threshold;
  double m_filestore_max_sync_interval;
  double m_filestore_min_sync_interval;
  bool m_filestore_fail_eio;
  bool m_filestore_fadvise;
  int do_update;
  bool m_journal_dio, m_journal_aio, m_journal_force_aio;
  std::string m_osd_rollback_to_cluster_snap;
  bool m_osd_use_stale_snap;
  bool m_filestore_do_dump;
  std::ofstream m_filestore_dump;
  JSONFormatter m_filestore_dump_fmt;
  std::atomic<int64_t> m_filestore_kill_at = { 0 };
  bool m_filestore_sloppy_crc;
  int m_filestore_sloppy_crc_block_size;
  uint64_t m_filestore_max_alloc_hint_size;
  long m_fs_type;
  // Determine xattr handling based on fs type
  void set_xattr_limits_via_conf();
  uint32_t m_filestore_max_inline_xattr_size;
  uint32_t m_filestore_max_inline_xattrs;
  uint32_t m_filestore_max_xattr_value_size;

  FSSuperblock superblock;

  /**
   * write_superblock()
   *
   * Write superblock to persistent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int write_superblock();

  /**
   * read_superblock()
   *
   * Fill in FileStore::superblock by reading persistent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int read_superblock();

  friend class FileStoreBackend;
  friend class TestFileStore;
};

ostream& operator<<(ostream& out, const FileStore::OpSequencer& s);

struct fiemap;

class FileStoreBackend {
private:
  FileStore *filestore;
protected:
  int get_basedir_fd() {
    return filestore->basedir_fd;
  }
  int get_current_fd() {
    return filestore->current_fd;
  }
  int get_op_fd() {
    return filestore->op_fd;
  }
  size_t get_blksize() {
    return filestore->blk_size;
  }
  const string& get_basedir_path() {
    return filestore->basedir;
  }
  const string& get_current_path() {
    return filestore->current_fn;
  }
  int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
    if (has_fiemap() || has_seek_data_hole()) {
      return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
    } else {
      return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
    }
  }
  int get_crc_block_size() {
    return filestore->m_filestore_sloppy_crc_block_size;
  }

public:
  explicit FileStoreBackend(FileStore *fs) : filestore(fs) {}
  virtual ~FileStoreBackend() {}

  CephContext* cct() const {
    return filestore->cct;
  }

  static FileStoreBackend *create(long f_type, FileStore *fs);

  virtual const char *get_name() = 0;
  virtual int detect_features() = 0;
  virtual int create_current() = 0;
  virtual bool can_checkpoint() = 0;
  virtual int list_checkpoints(list<string>& ls) = 0;
  virtual int create_checkpoint(const string& name, uint64_t *cid) = 0;
  virtual int sync_checkpoint(uint64_t id) = 0;
  virtual int rollback_to(const string& name) = 0;
  virtual int destroy_checkpoint(const string& name) = 0;
  virtual int syncfs() = 0;
  virtual bool has_fiemap() = 0;
  virtual bool has_seek_data_hole() = 0;
  virtual bool is_rotational() = 0;
  virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
  virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
  virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
  virtual bool has_splice() const = 0;

  // hooks for (sloppy) crc tracking
  virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
  virtual int _crc_update_truncate(int fd, loff_t off) = 0;
  virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
  virtual int _crc_update_clone_range(int srcfd, int destfd,
                                      loff_t srcoff, size_t len, loff_t dstoff) = 0;
  virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
                               ostream *out) = 0;
};
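
// Illustrative outline only: concrete backends (the generic, btrfs, xfs,
// and zfs backends instantiated by FileStoreBackend::create()) fill in
// the hooks above for one filesystem. Roughly (hypothetical name):
//
//   class GenericLikeBackend : public FileStoreBackend {
//   public:
//     explicit GenericLikeBackend(FileStore *fs) : FileStoreBackend(fs) {}
//     const char *get_name() override { return "generic"; }
//     bool can_checkpoint() override { return false; } // no fs snapshots
//     // ...remaining pure-virtual hooks...
//   };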

#endif