// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


#ifndef CEPH_FILESTORE_H
#define CEPH_FILESTORE_H

#include "include/types.h"

#include <map>
#include <deque>
#include <boost/scoped_ptr.hpp>
#include <fstream>
using namespace std;

#include "include/unordered_map.h"

#include "include/assert.h"

#include "os/ObjectStore.h"
#include "JournalingObjectStore.h"

#include "common/Timer.h"
#include "common/WorkQueue.h"
#include "common/perf_counters.h"
#include "common/zipkin_trace.h"

#include "common/Mutex.h"
#include "HashIndex.h"
#include "IndexManager.h"
#include "os/ObjectMap.h"
#include "SequencerPosition.h"
#include "FDCache.h"
#include "WBThrottle.h"

#include "include/uuid.h"


// from include/linux/falloc.h:
#ifndef FALLOC_FL_PUNCH_HOLE
# define FALLOC_FL_PUNCH_HOLE 0x2
#endif
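// (Typically used as fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
// offset, len) to deallocate a byte range without changing the file size.)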

#if defined(__linux__)
# ifndef BTRFS_SUPER_MAGIC
#define BTRFS_SUPER_MAGIC 0x9123683EL
# endif
# ifndef XFS_SUPER_MAGIC
#define XFS_SUPER_MAGIC 0x58465342L
# endif
# ifndef ZFS_SUPER_MAGIC
#define ZFS_SUPER_MAGIC 0x2fc12fc1L
# endif
#endif


class FileStoreBackend;

#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")

enum {
  l_filestore_first = 84000,
  l_filestore_journal_queue_ops,
  l_filestore_journal_queue_bytes,
  l_filestore_journal_ops,
  l_filestore_journal_bytes,
  l_filestore_journal_latency,
  l_filestore_journal_wr,
  l_filestore_journal_wr_bytes,
  l_filestore_journal_full,
  l_filestore_committing,
  l_filestore_commitcycle,
  l_filestore_commitcycle_interval,
  l_filestore_commitcycle_latency,
  l_filestore_op_queue_max_ops,
  l_filestore_op_queue_ops,
  l_filestore_ops,
  l_filestore_op_queue_max_bytes,
  l_filestore_op_queue_bytes,
  l_filestore_bytes,
  l_filestore_apply_latency,
  l_filestore_queue_transaction_latency_avg,
  l_filestore_last,
};
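
// A sketch of how these indices are consumed (the real registration lives in
// FileStore.cc; the counter name below is illustrative):
//   PerfCountersBuilder plb(cct, internal_name,
//                           l_filestore_first, l_filestore_last);
//   plb.add_u64(l_filestore_journal_queue_ops, "journal_queue_ops");
//   logger = plb.create_perf_counters();
//   cct->get_perfcounters_collection()->add(logger);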

class FSSuperblock {
public:
  CompatSet compat_features;
  string omap_backend;

  FSSuperblock() { }

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<FSSuperblock*>& o);
};
WRITE_CLASS_ENCODER(FSSuperblock)
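
// WRITE_CLASS_ENCODER generates the free encode()/decode() overloads for the
// class, so a round trip looks roughly like this sketch:
//   bufferlist bl;
//   ::encode(sb, bl);                    // sb is an FSSuperblock
//   bufferlist::iterator p = bl.begin();
//   FSSuperblock sb2;
//   ::decode(sb2, p);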

inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
{
  return out << "sb(" << sb.compat_features << "): "
             << sb.omap_backend;
}

class FileStore : public JournalingObjectStore,
                  public md_config_obs_t
{
  static const uint32_t target_version = 4;
public:
  uint32_t get_target_version() {
    return target_version;
  }

  static int get_block_device_fsid(CephContext* cct, const string& path,
                                   uuid_d *fsid);
  struct FSPerfTracker {
    PerfCounters::avg_tracker<uint64_t> os_commit_latency;
    PerfCounters::avg_tracker<uint64_t> os_apply_latency;

    objectstore_perf_stat_t get_cur_stats() const {
      objectstore_perf_stat_t ret;
      ret.os_commit_latency = os_commit_latency.avg();
      ret.os_apply_latency = os_apply_latency.avg();
      return ret;
    }

    void update_from_perfcounters(PerfCounters &logger);
  } perf_tracker;
  objectstore_perf_stat_t get_cur_stats() override {
    perf_tracker.update_from_perfcounters(*logger);
    return perf_tracker.get_cur_stats();
  }
  const PerfCounters* get_perf_counters() const override {
    return logger;
  }

private:
  string internal_name; ///< internal name, used to name the perfcounter instance
  string basedir, journalpath;
  osflagbits_t generic_flags;
  std::string current_fn;
  std::string current_op_seq_fn;
  std::string omap_dir;
  uuid_d fsid;

  size_t blk_size; ///< fs block size

  int fsid_fd, op_fd, basedir_fd, current_fd;

  FileStoreBackend *backend;

  void create_backend(long f_type);

  deque<uint64_t> snaps;

  // Indexed Collections
  IndexManager index_manager;
  int get_index(const coll_t& c, Index *index);
  int init_index(const coll_t& c);

  bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) {
    // - normal temp case: cid is pg, object is temp (pool < -1)
    // - hammer temp case: cid is pg (or already temp), object pool is -1
    return cid.is_pg() && oid.hobj.pool <= -1;
  }
  void init_temp_collections();

  // ObjectMap
  boost::scoped_ptr<ObjectMap> object_map;

  // helper fns
  int get_cdir(const coll_t& cid, char *s, int len);

  /// read a uuid from fd
  int read_fsid(int fd, uuid_d *uuid);

  /// lock fsid_fd
  int lock_fsid();

  // sync thread
  Mutex lock;
  bool force_sync;
  Cond sync_cond;

  Mutex sync_entry_timeo_lock;
  SafeTimer timer;

  list<Context*> sync_waiters;
  bool stop;
  void sync_entry();
  struct SyncThread : public Thread {
    FileStore *fs;
    explicit SyncThread(FileStore *f) : fs(f) {}
    void *entry() override {
      fs->sync_entry();
      return 0;
    }
  } sync_thread;

  // -- op workqueue --
  struct Op {
    utime_t start;
    uint64_t op;
    vector<Transaction> tls;
    Context *onreadable, *onreadable_sync;
    uint64_t ops, bytes;
    TrackedOpRef osd_op;
    ZTracer::Trace trace;
  };
  class OpSequencer : public Sequencer_impl {
    Mutex qlock; // protects q, for the benefit of flush() (peek/dequeue additionally require apply_lock)
    list<Op*> q;
    list<uint64_t> jq;
    list<pair<uint64_t, Context*> > flush_commit_waiters;
    Cond cond;
  public:
    Sequencer *parent;
    Mutex apply_lock; // for apply mutual exclusion
    int id;

    /// get_max_uncompleted
    bool _get_max_uncompleted(
      uint64_t *seq ///< [out] max uncompleted seq
      ) {
      assert(qlock.is_locked());
      assert(seq);
      *seq = 0;
      if (q.empty() && jq.empty())
        return true;

      if (!q.empty())
        *seq = q.back()->op;
      if (!jq.empty() && jq.back() > *seq)
        *seq = jq.back();

      return false;
    } /// @returns true if both queues are empty

    /// get_min_uncompleted
    bool _get_min_uncompleted(
      uint64_t *seq ///< [out] min uncompleted seq
      ) {
      assert(qlock.is_locked());
      assert(seq);
      *seq = 0;
      if (q.empty() && jq.empty())
        return true;

      if (!q.empty())
        *seq = q.front()->op;
      if (!jq.empty() && jq.front() < *seq)
        *seq = jq.front();

      return false;
    } /// @returns true if both queues are empty

    void _wake_flush_waiters(list<Context*> *to_queue) {
      uint64_t seq;
      if (_get_min_uncompleted(&seq))
        seq = -1;

      for (list<pair<uint64_t, Context*> >::iterator i =
             flush_commit_waiters.begin();
           i != flush_commit_waiters.end() && i->first < seq;
           flush_commit_waiters.erase(i++)) {
        to_queue->push_back(i->second);
      }
    }

    void queue_journal(uint64_t s) {
      Mutex::Locker l(qlock);
      jq.push_back(s);
    }
    void dequeue_journal(list<Context*> *to_queue) {
      Mutex::Locker l(qlock);
      jq.pop_front();
      cond.Signal();
      _wake_flush_waiters(to_queue);
    }
    void queue(Op *o) {
      Mutex::Locker l(qlock);
      q.push_back(o);
      o->trace.keyval("queue depth", q.size());
    }
    Op *peek_queue() {
      Mutex::Locker l(qlock);
      assert(apply_lock.is_locked());
      return q.front();
    }

    Op *dequeue(list<Context*> *to_queue) {
      assert(to_queue);
      assert(apply_lock.is_locked());
      Mutex::Locker l(qlock);
      Op *o = q.front();
      q.pop_front();
      cond.Signal();

      _wake_flush_waiters(to_queue);
      return o;
    }

    void flush() override {
      Mutex::Locker l(qlock);

      while (cct->_conf->filestore_blackhole)
        cond.Wait(qlock); // wait forever

      // get max for journal _or_ op queues
      uint64_t seq = 0;
      if (!q.empty())
        seq = q.back()->op;
      if (!jq.empty() && jq.back() > seq)
        seq = jq.back();
      if (seq) {
        // wait for everything prior to our watermark to drain through
        // either/both queues
        while ((!q.empty() && q.front()->op <= seq) ||
               (!jq.empty() && jq.front() <= seq))
          cond.Wait(qlock);
      }
    }
    bool flush_commit(Context *c) override {
      Mutex::Locker l(qlock);
      uint64_t seq = 0;
      if (_get_max_uncompleted(&seq)) {
        return true;
      } else {
        flush_commit_waiters.push_back(make_pair(seq, c));
        return false;
      }
    }
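
    // Caller-side sketch (illustrative; 'c' is a hypothetical Context): a
    // true return means everything queued so far is already durable and the
    // caller must complete c itself; on false, c is completed later via
    // _wake_flush_waiters() once the recorded seq commits:
    //   if (osr->flush_commit(c))
    //     c->complete(0);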

    OpSequencer(CephContext* cct, int i)
      : Sequencer_impl(cct),
        qlock("FileStore::OpSequencer::qlock", false, false),
        parent(0),
        apply_lock("FileStore::OpSequencer::apply_lock", false, false),
        id(i) {}
    ~OpSequencer() override {
      assert(q.empty());
    }

    const string& get_name() const {
      return parent->get_name();
    }
  };

  friend ostream& operator<<(ostream& out, const OpSequencer& s);

  FDCache fdcache;
  WBThrottle wbthrottle;

  atomic_t next_osr_id;
  bool m_disable_wbthrottle;
  deque<OpSequencer*> op_queue;
  BackoffThrottle throttle_ops, throttle_bytes;
  const int m_ondisk_finisher_num;
  const int m_apply_finisher_num;
  vector<Finisher*> ondisk_finishers;
  vector<Finisher*> apply_finishers;

  ThreadPool op_tp;
  struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
    FileStore *store;
    OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
      : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {}

    bool _enqueue(OpSequencer *osr) override {
      store->op_queue.push_back(osr);
      return true;
    }
    void _dequeue(OpSequencer *o) override {
      ceph_abort();
    }
    bool _empty() override {
      return store->op_queue.empty();
    }
    OpSequencer *_dequeue() override {
      if (store->op_queue.empty())
        return NULL;
      OpSequencer *osr = store->op_queue.front();
      store->op_queue.pop_front();
      return osr;
    }
    void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override {
      store->_do_op(osr, handle);
    }
    void _process_finish(OpSequencer *osr) override {
      store->_finish_op(osr);
    }
    void _clear() override {
      assert(store->op_queue.empty());
    }
  } op_wq;
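
  // Flow sketch (descriptive, not normative): queue_op() appends an Op to an
  // OpSequencer and enqueues that sequencer on op_wq; a ThreadPool worker then
  // runs _process() -> _do_op() (apply the transactions, serialized per
  // sequencer by osr->apply_lock) and _process_finish() -> _finish_op() (fire
  // completion contexts via the finishers).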

  void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
  void _finish_op(OpSequencer *o);
  Op *build_op(vector<Transaction>& tls,
               Context *onreadable, Context *onreadable_sync,
               TrackedOpRef osd_op);
  void queue_op(OpSequencer *osr, Op *o);
  void op_queue_reserve_throttle(Op *o);
  void op_queue_release_throttle(Op *o);
  void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
  friend struct C_JournaledAhead;

  void new_journal();

  PerfCounters *logger;

  ZTracer::Endpoint trace_endpoint;

public:
  int lfn_find(const ghobject_t& oid, const Index& index,
               IndexedPath *path = NULL);
  int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length);
  int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf);
  int lfn_open(
    const coll_t& cid,
    const ghobject_t& oid,
    bool create,
    FDRef *outfd,
    Index *index = 0);

  void lfn_close(FDRef fd);
  int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid);
  int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos,
                 bool force_clear_omap=false);

public:
  FileStore(CephContext* cct, const std::string &base, const std::string &jdev,
            osflagbits_t flags = 0,
            const char *internal_name = "filestore", bool update_to=false);
  ~FileStore() override;

  string get_type() override {
    return "filestore";
  }

  int _detect_fs();
  int _sanity_check_fs();

  bool test_mount_in_use() override;
  int read_op_seq(uint64_t *seq);
  int write_op_seq(int, uint64_t seq);
  int mount() override;
  int umount() override;

  int validate_hobject_key(const hobject_t &obj) const override;

  unsigned get_max_attr_name_length() override {
    // the xattr name limit is 128; leave room for our prefixes (user.ceph._),
    // some margin, and cap at 100
    return 100;
  }
  int mkfs() override;
  int mkjournal() override;
  bool wants_journal() override {
    return true;
  }
  bool allows_journal() override {
    return true;
  }
  bool needs_journal() override {
    return false;
  }
  void dump_perf_counters(Formatter *f) override {
    f->open_object_section("perf_counters");
    logger->dump_formatted(f, false);
    f->close_section();
  }

  int write_version_stamp();
  int version_stamp_is_valid(uint32_t *version);
  int update_version_stamp();
  int upgrade() override;

  bool can_sort_nibblewise() override {
    return true; // we support the legacy sort order
  }

  void collect_metadata(map<string,string> *pm) override;

  int statfs(struct store_statfs_t *buf) override;

  int _do_transactions(
    vector<Transaction> &tls, uint64_t op_seq,
    ThreadPool::TPHandle *handle);
  int do_transactions(vector<Transaction> &tls, uint64_t op_seq) override {
    return _do_transactions(tls, op_seq, 0);
  }
  void _do_transaction(
    Transaction& t, uint64_t op_seq, int trans_num,
    ThreadPool::TPHandle *handle);

  int queue_transactions(Sequencer *osr, vector<Transaction>& tls,
                         TrackedOpRef op = TrackedOpRef(),
                         ThreadPool::TPHandle *handle = NULL) override;

  /**
   * set replay guard xattr on given file
   *
   * This will ensure that we will not replay this (or any previous) operation
   * against this particular inode/object.
   *
   * @param fd open file descriptor for the file/object
   * @param spos sequencer position of the last operation we should not replay
   */
  void _set_replay_guard(int fd,
                         const SequencerPosition& spos,
                         const ghobject_t *oid=0,
                         bool in_progress=false);
  void _set_replay_guard(const coll_t& cid,
                         const SequencerPosition& spos,
                         bool in_progress);
  void _set_global_replay_guard(const coll_t& cid,
                                const SequencerPosition &spos);

  /// close a replay guard opened with in_progress=true
  void _close_replay_guard(int fd, const SequencerPosition& spos,
                           const ghobject_t *oid=0);
  void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos);

  /**
   * check replay guard xattr on given file
   *
   * Check the current position against any marker on the file that
   * indicates which operations have already been applied. If the
   * current or a newer operation has been marked as applied, we
   * should not replay the current operation again.
   *
   * If we are not replaying the journal, we always return true. It
   * is only on replay that we might return false, indicating that the
   * operation should not be performed (again).
   *
   * @param fd open fd on the file/object in question
   * @param spos sequencer position for an operation we could apply/replay
   * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
   */
  int _check_replay_guard(int fd, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos);
  int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos);
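
  // Replay pattern sketch (illustrative, not a prescribed call sequence):
  //   int r = _check_replay_guard(cid, spos);
  //   if (r < 0) return 0;   // spos (or newer) already applied; skip the op
  //   if (r == 0) { ... }    // guard was open: op was in progress at crash
  //   ... perform the operation ...
  //   _set_replay_guard(fd, spos);   // then mark it applied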

  // ------------------
  // objects
  int pick_object_revision_lt(ghobject_t& oid) {
    return 0;
  }
  using ObjectStore::exists;
  bool exists(const coll_t& cid, const ghobject_t& oid) override;
  using ObjectStore::stat;
  int stat(
    const coll_t& cid,
    const ghobject_t& oid,
    struct stat *st,
    bool allow_eio = false) override;
  using ObjectStore::set_collection_opts;
  int set_collection_opts(
    const coll_t& cid,
    const pool_opts_t& opts) override;
  using ObjectStore::read;
  int read(
    const coll_t& cid,
    const ghobject_t& oid,
    uint64_t offset,
    size_t len,
    bufferlist& bl,
    uint32_t op_flags = 0,
    bool allow_eio = false) override;
  int _do_fiemap(int fd, uint64_t offset, size_t len,
                 map<uint64_t, uint64_t> *m);
  int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
                         map<uint64_t, uint64_t> *m);
  using ObjectStore::fiemap;
  int fiemap(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) override;
  int fiemap(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;

  int _touch(const coll_t& cid, const ghobject_t& oid);
  int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
             const bufferlist& bl, uint32_t fadvise_flags = 0);
  int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
  int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
  int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
             const SequencerPosition& spos);
  int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
                   uint64_t srcoff, uint64_t len, uint64_t dstoff,
                   const SequencerPosition& spos);
  int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
  int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos);

  int _fgetattr(int fd, const char *name, bufferptr& bp);
  int _fgetattrs(int fd, map<string,bufferptr>& aset);
  int _fsetattrs(int fd, map<string, bufferptr> &aset);

  void _start_sync();

  void do_force_sync();
  void start_sync(Context *onsafe);
  void sync();
  void _flush_op_queue();
  void flush();
  void sync_and_flush();

  int flush_journal() override;
  int dump_journal(ostream& out) override;

  void set_fsid(uuid_d u) override {
    fsid = u;
  }
  uuid_d get_fsid() override { return fsid; }

  uint64_t estimate_objects_overhead(uint64_t num_objects) override;

  // DEBUG read error injection; an object is removed from both sets on delete()
  Mutex read_error_lock;
  set<ghobject_t> data_error_set;  // read() will return -EIO
  set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
  void inject_data_error(const ghobject_t &oid) override;
  void inject_mdata_error(const ghobject_t &oid) override;
  void debug_obj_on_delete(const ghobject_t &oid);
  bool debug_data_eio(const ghobject_t &oid);
  bool debug_mdata_eio(const ghobject_t &oid);

  int snapshot(const string& name) override;

  // attrs
  using ObjectStore::getattr;
  using ObjectStore::getattrs;
  int getattr(const coll_t& cid, const ghobject_t& oid, const char *name, bufferptr &bp) override;
  int getattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset) override;

  int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset,
                const SequencerPosition &spos);
  int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
              const SequencerPosition &spos);
  int _rmattrs(const coll_t& cid, const ghobject_t& oid,
               const SequencerPosition &spos);

  int _collection_remove_recursive(const coll_t &cid,
                                   const SequencerPosition &spos);

  int _collection_set_bits(const coll_t& cid, int bits);

  // collections
  using ObjectStore::collection_list;
  int collection_bits(const coll_t& c) override;
  int collection_list(const coll_t& c,
                      const ghobject_t& start, const ghobject_t& end, int max,
                      vector<ghobject_t> *ls, ghobject_t *next) override;
  int list_collections(vector<coll_t>& ls) override;
  int list_collections(vector<coll_t>& ls, bool include_temp);
  int collection_stat(const coll_t& c, struct stat *st);
  bool collection_exists(const coll_t& c) override;
  int collection_empty(const coll_t& c, bool *empty) override;

  // omap (see ObjectStore.h for documentation)
  using ObjectStore::omap_get;
  int omap_get(const coll_t& c, const ghobject_t &oid, bufferlist *header,
               map<string, bufferlist> *out) override;
  using ObjectStore::omap_get_header;
  int omap_get_header(
    const coll_t& c,
    const ghobject_t &oid,
    bufferlist *out,
    bool allow_eio = false) override;
  using ObjectStore::omap_get_keys;
  int omap_get_keys(const coll_t& c, const ghobject_t &oid, set<string> *keys) override;
  using ObjectStore::omap_get_values;
  int omap_get_values(const coll_t& c, const ghobject_t &oid, const set<string> &keys,
                      map<string, bufferlist> *out) override;
  using ObjectStore::omap_check_keys;
  int omap_check_keys(const coll_t& c, const ghobject_t &oid, const set<string> &keys,
                      set<string> *out) override;
  using ObjectStore::get_omap_iterator;
  ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& c, const ghobject_t &oid) override;

  int _create_collection(const coll_t& c, int bits,
                         const SequencerPosition &spos);
  int _destroy_collection(const coll_t& c);
  /**
   * Give an expected number of objects hint to the collection.
   *
   * @param c - collection id.
   * @param pg_num - pg number of the pool this collection belongs to
   * @param expected_num_objs - expected number of objects in this collection
   * @param spos - sequence position
   *
   * @return 0 on success, an error code otherwise
   */
  int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
                                         uint64_t expected_num_objs,
                                         const SequencerPosition &spos);
  int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid,
                      const SequencerPosition& spos);
  int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
                              coll_t c, const ghobject_t& o,
                              const SequencerPosition& spos,
                              bool ignore_enoent = false);

  int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
                      uint64_t expected_object_size,
                      uint64_t expected_write_size);

  void dump_start(const std::string& file);
  void dump_stop();
  void dump_transactions(vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);

  virtual int apply_layout_settings(const coll_t &cid);

private:
  void _inject_failure();

  // omap
  int _omap_clear(const coll_t& cid, const ghobject_t &oid,
                  const SequencerPosition &spos);
  int _omap_setkeys(const coll_t& cid, const ghobject_t &oid,
                    const map<string, bufferlist> &aset,
                    const SequencerPosition &spos);
  int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const set<string> &keys,
                   const SequencerPosition &spos);
  int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
                       const string& first, const string& last,
                       const SequencerPosition &spos);
  int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl,
                      const SequencerPosition &spos);
  int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest,
                        const SequencerPosition &spos);
  int _split_collection_create(const coll_t& cid, uint32_t bits, uint32_t rem,
                               coll_t dest,
                               const SequencerPosition &spos);

  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const struct md_config_t *conf,
                          const std::set <std::string> &changed) override;
  int set_throttle_params();
  float m_filestore_commit_timeout;
  bool m_filestore_journal_parallel;
  bool m_filestore_journal_trailing;
  bool m_filestore_journal_writeahead;
  int m_filestore_fiemap_threshold;
  double m_filestore_max_sync_interval;
  double m_filestore_min_sync_interval;
  bool m_filestore_fail_eio;
  bool m_filestore_fadvise;
  int do_update;
  bool m_journal_dio, m_journal_aio, m_journal_force_aio;
  std::string m_osd_rollback_to_cluster_snap;
  bool m_osd_use_stale_snap;
  bool m_filestore_do_dump;
  std::ofstream m_filestore_dump;
  JSONFormatter m_filestore_dump_fmt;
  atomic_t m_filestore_kill_at;
  bool m_filestore_sloppy_crc;
  int m_filestore_sloppy_crc_block_size;
  uint64_t m_filestore_max_alloc_hint_size;
  long m_fs_type;

  // Determine xattr handling based on fs type
  void set_xattr_limits_via_conf();
  uint32_t m_filestore_max_inline_xattr_size;
  uint32_t m_filestore_max_inline_xattrs;
  uint32_t m_filestore_max_xattr_value_size;

  FSSuperblock superblock;

  /**
   * write_superblock()
   *
   * Write superblock to persistent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int write_superblock();

  /**
   * read_superblock()
   *
   * Fill in FileStore::superblock by reading persistent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int read_superblock();

  friend class FileStoreBackend;
  friend class TestFileStore;
};

ostream& operator<<(ostream& out, const FileStore::OpSequencer& s);

struct fiemap;

class FileStoreBackend {
private:
  FileStore *filestore;
protected:
  int get_basedir_fd() {
    return filestore->basedir_fd;
  }
  int get_current_fd() {
    return filestore->current_fd;
  }
  int get_op_fd() {
    return filestore->op_fd;
  }
  size_t get_blksize() {
    return filestore->blk_size;
  }
  const string& get_basedir_path() {
    return filestore->basedir;
  }
  const string& get_current_path() {
    return filestore->current_fn;
  }
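
  // _copy_range prefers an extent-aware copy when the backend can enumerate
  // data extents (FIEMAP or SEEK_DATA/SEEK_HOLE), so holes in the source are
  // not materialized in the destination; otherwise it falls back to a plain
  // byte copy.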
  int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
    if (has_fiemap() || has_seek_data_hole()) {
      return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
    } else {
      return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
    }
  }
  int get_crc_block_size() {
    return filestore->m_filestore_sloppy_crc_block_size;
  }

public:
  explicit FileStoreBackend(FileStore *fs) : filestore(fs) {}
  virtual ~FileStoreBackend() {}

  CephContext* cct() const {
    return filestore->cct;
  }

  static FileStoreBackend *create(long f_type, FileStore *fs);

  virtual const char *get_name() = 0;
  virtual int detect_features() = 0;
  virtual int create_current() = 0;
  virtual bool can_checkpoint() = 0;
  virtual int list_checkpoints(list<string>& ls) = 0;
  virtual int create_checkpoint(const string& name, uint64_t *cid) = 0;
  virtual int sync_checkpoint(uint64_t id) = 0;
  virtual int rollback_to(const string& name) = 0;
  virtual int destroy_checkpoint(const string& name) = 0;
  virtual int syncfs() = 0;
  virtual bool has_fiemap() = 0;
  virtual bool has_seek_data_hole() = 0;
  virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
  virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
  virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
  virtual bool has_splice() const = 0;

  // hooks for (sloppy) crc tracking
  virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
  virtual int _crc_update_truncate(int fd, loff_t off) = 0;
  virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
  virtual int _crc_update_clone_range(int srcfd, int destfd,
                                      loff_t srcoff, size_t len, loff_t dstoff) = 0;
  virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
                               ostream *out) = 0;
};

#endif