]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/filestore/FileStore.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / os / filestore / FileStore.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16#ifndef CEPH_FILESTORE_H
17#define CEPH_FILESTORE_H
18
19#include "include/types.h"
20
21#include <map>
22#include <deque>
31f18b77 23#include <atomic>
7c673cae 24#include <fstream>
31f18b77 25
7c673cae 26
31f18b77
FG
27#include <boost/scoped_ptr.hpp>
28
7c673cae
FG
29#include "include/unordered_map.h"
30
11fdf7f2 31#include "include/ceph_assert.h"
7c673cae
FG
32
33#include "os/ObjectStore.h"
34#include "JournalingObjectStore.h"
35
36#include "common/Timer.h"
37#include "common/WorkQueue.h"
38#include "common/perf_counters.h"
39#include "common/zipkin_trace.h"
40
9f95a23c 41#include "common/ceph_mutex.h"
7c673cae
FG
42#include "HashIndex.h"
43#include "IndexManager.h"
44#include "os/ObjectMap.h"
45#include "SequencerPosition.h"
46#include "FDCache.h"
47#include "WBThrottle.h"
48
49#include "include/uuid.h"
50
7c673cae
FG
51#if defined(__linux__)
52# ifndef BTRFS_SUPER_MAGIC
11fdf7f2 53#define BTRFS_SUPER_MAGIC 0x9123683EUL
7c673cae
FG
54# endif
55# ifndef XFS_SUPER_MAGIC
11fdf7f2 56#define XFS_SUPER_MAGIC 0x58465342UL
7c673cae
FG
57# endif
58# ifndef ZFS_SUPER_MAGIC
11fdf7f2 59#define ZFS_SUPER_MAGIC 0x2fc12fc1UL
7c673cae
FG
60# endif
61#endif
62
63
64class FileStoreBackend;
65
66#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
67
/// Perf-counter indices for the FileStore PerfCounters instance
/// (FileStore::logger).  l_filestore_first/l_filestore_last bracket the
/// range; everything in between is a live counter.  Do not reorder or
/// remove entries — the numeric values are part of the registered
/// counter layout.
enum {
  l_filestore_first = 84000,
  // journal queue / commit pipeline
  l_filestore_journal_queue_ops,
  l_filestore_journal_queue_bytes,
  l_filestore_journal_ops,
  l_filestore_journal_bytes,
  l_filestore_journal_latency,
  l_filestore_journal_wr,
  l_filestore_journal_wr_bytes,
  l_filestore_journal_full,
  // sync/commit cycle
  l_filestore_committing,
  l_filestore_commitcycle,
  l_filestore_commitcycle_interval,
  l_filestore_commitcycle_latency,
  // op (apply) queue
  l_filestore_op_queue_max_ops,
  l_filestore_op_queue_ops,
  l_filestore_ops,
  l_filestore_op_queue_max_bytes,
  l_filestore_op_queue_bytes,
  l_filestore_bytes,
  l_filestore_apply_latency,
  l_filestore_queue_transaction_latency_avg,
  l_filestore_sync_pause_max_lat,
  l_filestore_last,
};
93
/**
 * FSSuperblock - persistent superblock for a FileStore instance.
 *
 * Records the compat feature set the store was created/upgraded with and
 * which omap backend (key/value store) it uses.  Serialized via the
 * standard Ceph encode/decode pair below (see WRITE_CLASS_ENCODER);
 * encode/decode/dump bodies are defined out-of-line.
 */
class FSSuperblock {
public:
  CompatSet compat_features;   // features required to mount this store
  std::string omap_backend;    // name of the omap key/value backend

  FSSuperblock() { }

  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<FSSuperblock*>& o);
};
WRITE_CLASS_ENCODER(FSSuperblock)
107
f67539c2 108inline std::ostream& operator<<(std::ostream& out, const FSSuperblock& sb)
7c673cae
FG
109{
110 return out << "sb(" << sb.compat_features << "): "
111 << sb.omap_backend;
112}
113
/**
 * FileStore - file-system-backed ObjectStore implementation.
 *
 * Objects live as files under `basedir` (per-collection directories via
 * HashIndex/IndexManager), omap data goes through an external ObjectMap,
 * and durability comes from the write-ahead journal inherited from
 * JournalingObjectStore (wants_journal() is true, needs_journal() false).
 * Transactions are queued per-collection on an OpSequencer and applied by
 * a thread pool (op_tp/op_wq); a dedicated sync thread periodically
 * commits the filesystem.  Also observes config changes via
 * md_config_obs_t.
 */
class FileStore : public JournalingObjectStore,
                  public md_config_obs_t
{
  // on-disk format version this code writes / can upgrade to
  static const uint32_t target_version = 4;
public:
  uint32_t get_target_version() {
    return target_version;
  }

  static int get_block_device_fsid(CephContext* cct, const std::string& path,
				   uuid_d *fsid);
  /// rolling average of commit/apply latencies, fed from the perf counters
  struct FSPerfTracker {
    PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
    PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;

    objectstore_perf_stat_t get_cur_stats() const {
      objectstore_perf_stat_t ret;
      ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
      ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
      return ret;
    }

    void update_from_perfcounters(PerfCounters &logger);
  } perf_tracker;
  objectstore_perf_stat_t get_cur_stats() override {
    perf_tracker.update_from_perfcounters(*logger);
    return perf_tracker.get_cur_stats();
  }
  const PerfCounters* get_perf_counters() const override {
    return logger;
  }

private:
  std::string internal_name; ///< internal name, used to name the perfcounter instance
  std::string basedir, journalpath;  ///< data directory and journal device/file paths
  osflagbits_t generic_flags;
  std::string current_fn;        ///< path of the "current" subdir
  std::string current_op_seq_fn; ///< path of the persisted op sequence file
  std::string omap_dir;          ///< directory backing the ObjectMap
  uuid_d fsid;

  size_t blk_size;            ///< fs block size

  // long-lived fds for the paths above
  int fsid_fd, op_fd, basedir_fd, current_fd;

  // filesystem-specific backend (btrfs/xfs/zfs/generic), chosen by f_type
  FileStoreBackend *backend;

  void create_backend(unsigned long f_type);

  std::string devname;

  // VDO (dedup block layer) detection state; -1 means no VDO device
  int vdo_fd = -1;
  std::string vdo_name;

  // outstanding checkpoint/snapshot sequence numbers
  // (used with checkpoint-capable backends — see FileStoreBackend)
  deque<uint64_t> snaps;

  // Indexed Collections
  IndexManager index_manager;
  int get_index(const coll_t& c, Index *index);
  int init_index(const coll_t& c);

  bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) {
    // - normal temp case: cid is pg, object is temp (pool < -1)
    // - hammer temp case: cid is pg (or already temp), object pool is -1
    return cid.is_pg() && oid.hobj.pool <= -1;
  }
  void init_temp_collections();

  // react to an unrecoverable I/O error (behavior configured elsewhere)
  void handle_eio();

  // ObjectMap
  boost::scoped_ptr<ObjectMap> object_map;

  // helper fns
  int get_cdir(const coll_t& cid, char *s, int len);

  /// read a uuid from fd
  int read_fsid(int fd, uuid_d *uuid);

  /// lock fsid_fd
  int lock_fsid();

  // sync thread
  ceph::mutex lock = ceph::make_mutex("FileStore::lock");
  bool force_sync;
  ceph::condition_variable sync_cond;

  ceph::mutex sync_entry_timeo_lock = ceph::make_mutex("FileStore::sync_entry_timeo_lock");
  SafeTimer timer;

  std::list<Context*> sync_waiters;  // completions waiting for the next sync
  bool stop;
  void sync_entry();
  struct SyncThread : public Thread {
    FileStore *fs;
    explicit SyncThread(FileStore *f) : fs(f) {}
    void *entry() override {
      fs->sync_entry();
      return 0;
    }
  } sync_thread;

  // -- op workqueue --
  /// one queued transaction batch plus its completion callbacks
  struct Op {
    utime_t start;
    uint64_t op;                    ///< sequence number
    std::vector<Transaction> tls;
    Context *onreadable, *onreadable_sync;
    uint64_t ops, bytes;
    TrackedOpRef osd_op;
    ZTracer::Trace trace;
    bool registered_apply = false;  ///< set once tracked in OpSequencer::applying
  };
  /**
   * OpSequencer - per-collection ordering of in-flight operations.
   *
   * q holds ops queued for apply, jq holds sequence numbers of ops that
   * are journaled but not yet applied.  flush()/flush_commit() let
   * callers wait for the watermark to drain; wait_for_apply() blocks on
   * writes to a specific object (tracked in `applying`).
   */
  class OpSequencer : public CollectionImpl {
    CephContext *cct;
    // to protect q, for benefit of flush (peek/dequeue also protected by lock)
    ceph::mutex qlock =
      ceph::make_mutex("FileStore::OpSequencer::qlock", false);
    std::list<Op*> q;
    std::list<uint64_t> jq;
    std::list<std::pair<uint64_t, Context*> > flush_commit_waiters;
    ceph::condition_variable cond;
    std::string osr_name_str;
    /// hash of pointers to ghobject_t's for in-flight writes
    std::unordered_multimap<uint32_t,const ghobject_t*> applying;
  public:
    // for apply mutual exclusion
    ceph::mutex apply_lock =
      ceph::make_mutex("FileStore::OpSequencer::apply_lock", false);
    int id;
    const char *osr_name;

    /// get_max_uncompleted
    bool _get_max_uncompleted(
      uint64_t *seq ///< [out] max uncompleted seq
      ) {
      ceph_assert(seq);
      *seq = 0;
      if (q.empty() && jq.empty())
	return true;

      if (!q.empty())
	*seq = q.back()->op;
      if (!jq.empty() && jq.back() > *seq)
	*seq = jq.back();

      return false;
    } /// @returns true if both queues are empty

    /// get_min_uncompleted
    bool _get_min_uncompleted(
      uint64_t *seq ///< [out] min uncompleted seq
      ) {
      ceph_assert(seq);
      *seq = 0;
      if (q.empty() && jq.empty())
	return true;

      if (!q.empty())
	*seq = q.front()->op;
      if (!jq.empty() && jq.front() < *seq)
	*seq = jq.front();

      return false;
    } /// @returns true if both queues are empty

    /// move waiters whose watermark has drained onto to_queue (caller holds qlock)
    void _wake_flush_waiters(std::list<Context*> *to_queue) {
      uint64_t seq;
      if (_get_min_uncompleted(&seq))
	seq = -1;

      for (auto i = flush_commit_waiters.begin();
	   i != flush_commit_waiters.end() && i->first < seq;
	   flush_commit_waiters.erase(i++)) {
	to_queue->push_back(i->second);
      }
    }

    void queue_journal(Op *o) {
      std::lock_guard l{qlock};
      jq.push_back(o->op);
      _register_apply(o);
    }
    void dequeue_journal(std::list<Context*> *to_queue) {
      std::lock_guard l{qlock};
      jq.pop_front();
      cond.notify_all();
      _wake_flush_waiters(to_queue);
    }
    void queue(Op *o) {
      std::lock_guard l{qlock};
      q.push_back(o);
      _register_apply(o);
      o->trace.keyval("queue depth", q.size());
    }
    void _register_apply(Op *o);
    void _unregister_apply(Op *o);
    void wait_for_apply(const ghobject_t& oid);
    Op *peek_queue() {
      std::lock_guard l{qlock};
      ceph_assert(ceph_mutex_is_locked(apply_lock));
      return q.front();
    }

    Op *dequeue(std::list<Context*> *to_queue) {
      ceph_assert(to_queue);
      ceph_assert(ceph_mutex_is_locked(apply_lock));
      std::lock_guard l{qlock};
      Op *o = q.front();
      q.pop_front();
      cond.notify_all();
      _unregister_apply(o);
      _wake_flush_waiters(to_queue);
      return o;
    }

    /// block until every op at or below the current watermark has drained
    void flush() override {
      std::unique_lock l{qlock};
      // wait forever
      cond.wait(l, [this] { return !cct->_conf->filestore_blackhole; });

      // get max for journal _or_ op queues
      uint64_t seq = 0;
      if (!q.empty())
	seq = q.back()->op;
      if (!jq.empty() && jq.back() > seq)
	seq = jq.back();

      if (seq) {
	// everything prior to our watermark to drain through either/both queues
	cond.wait(l, [seq, this] {
	  return ((q.empty() || q.front()->op > seq) &&
		  (jq.empty() || jq.front() > seq));
	});
      }
    }
    /// @returns true if already drained; otherwise queues c and returns false
    bool flush_commit(Context *c) override {
      std::lock_guard l{qlock};
      uint64_t seq = 0;
      if (_get_max_uncompleted(&seq)) {
	return true;
      } else {
	flush_commit_waiters.push_back(std::make_pair(seq, c));
	return false;
      }
    }

  private:
    FRIEND_MAKE_REF(OpSequencer);
    OpSequencer(CephContext* cct, int i, coll_t cid)
      : CollectionImpl(cct, cid),
	cct(cct),
	osr_name_str(stringify(cid)),
	id(i),
	osr_name(osr_name_str.c_str()) {}
    ~OpSequencer() override {
      ceph_assert(q.empty());
    }
  };
  typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;

  ceph::mutex coll_lock = ceph::make_mutex("FileStore::coll_lock");
  std::map<coll_t,OpSequencerRef> coll_map;  // one sequencer per collection

  friend std::ostream& operator<<(std::ostream& out, const OpSequencer& s);

  FDCache fdcache;
  WBThrottle wbthrottle;

  std::atomic<int64_t> next_osr_id = { 0 };
  bool m_disable_wbthrottle;
  deque<OpSequencer*> op_queue;   // sequencers with work pending, fed to op_wq
  BackoffThrottle throttle_ops, throttle_bytes;
  const int m_ondisk_finisher_num;
  const int m_apply_finisher_num;
  std::vector<Finisher*> ondisk_finishers;
  std::vector<Finisher*> apply_finishers;

  ThreadPool op_tp;
  /// work queue draining op_queue through _do_op/_finish_op
  struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
    FileStore *store;
    OpWQ(FileStore *fs,
	 ceph::timespan timeout,
	 ceph::timespan suicide_timeout,
	 ThreadPool *tp)
      : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ",
					   timeout, suicide_timeout, tp),
	store(fs) {}

    bool _enqueue(OpSequencer *osr) override {
      store->op_queue.push_back(osr);
      return true;
    }
    void _dequeue(OpSequencer *o) override {
      ceph_abort();  // targeted dequeue is not supported; drain via _dequeue()
    }
    bool _empty() override {
      return store->op_queue.empty();
    }
    OpSequencer *_dequeue() override {
      if (store->op_queue.empty())
	return nullptr;
      OpSequencer *osr = store->op_queue.front();
      store->op_queue.pop_front();
      return osr;
    }
    void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override {
      store->_do_op(osr, handle);
    }
    void _process_finish(OpSequencer *osr) override {
      store->_finish_op(osr);
    }
    void _clear() override {
      ceph_assert(store->op_queue.empty());
    }
  } op_wq;

  void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
  void _finish_op(OpSequencer *o);
  Op *build_op(std::vector<Transaction>& tls,
	       Context *onreadable, Context *onreadable_sync,
	       TrackedOpRef osd_op);
  void queue_op(OpSequencer *osr, Op *o);
  void op_queue_reserve_throttle(Op *o);
  void op_queue_release_throttle(Op *o);
  void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
  friend struct C_JournaledAhead;

  void new_journal();

  PerfCounters *logger;

  ZTracer::Endpoint trace_endpoint;

public:
  // long-filename (lfn) object file helpers
  int lfn_find(const ghobject_t& oid, const Index& index,
               IndexedPath *path = nullptr);
  int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length);
  int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf);
  int lfn_open(
    const coll_t& cid,
    const ghobject_t& oid,
    bool create,
    FDRef *outfd,
    Index *index = nullptr);

  void lfn_close(FDRef fd);
  int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) ;
  int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos,
		 bool force_clear_omap=false);

public:
  FileStore(CephContext* cct, const std::string &base, const std::string &jdev,
    osflagbits_t flags = 0,
    const char *internal_name = "filestore", bool update_to=false);
  ~FileStore() override;

  std::string get_type() override {
    return "filestore";
  }

  int _detect_fs();
  int _sanity_check_fs();

  bool test_mount_in_use() override;
  int read_op_seq(uint64_t *seq);
  int write_op_seq(int, uint64_t seq);
  int mount() override;
  int umount() override;

  int validate_hobject_key(const hobject_t &obj) const override;

  unsigned get_max_attr_name_length() override {
    // xattr limit is 128; leave room for our prefixes (user.ceph._),
    // some margin, and cap at 100
    return 100;
  }
  int mkfs() override;
  int mkjournal() override;
  bool wants_journal() override {
    return true;
  }
  bool allows_journal() override {
    return true;
  }
  bool needs_journal() override {
    return false;
  }

  bool is_sync_onreadable() const override {
    return false;
  }

  bool is_rotational() override;
  bool is_journal_rotational() override;

  void dump_perf_counters(ceph::Formatter *f) override {
    f->open_object_section("perf_counters");
    logger->dump_formatted(f, false);
    f->close_section();
  }

  int flush_cache(std::ostream *os = NULL) override;
  int write_version_stamp();
  int version_stamp_is_valid(uint32_t *version);
  int update_version_stamp();
  int upgrade() override;

  bool can_sort_nibblewise() override {
    return true;   // i support legacy sort order
  }

  void collect_metadata(std::map<std::string,std::string> *pm) override;
  int get_devices(std::set<std::string> *ls) override;

  int statfs(struct store_statfs_t *buf,
             osd_alert_list_t* alerts = nullptr) override;
  int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
		  bool *per_pool_omap) override;

  // transaction application
  int _do_transactions(
    std::vector<Transaction> &tls, uint64_t op_seq,
    ThreadPool::TPHandle *handle,
    const char *osr_name);
  int do_transactions(std::vector<Transaction> &tls, uint64_t op_seq) override {
    // journal-replay entry point: no TP handle, synthetic osr name
    return _do_transactions(tls, op_seq, nullptr, "replay");
  }
  void _do_transaction(
    Transaction& t, uint64_t op_seq, int trans_num,
    ThreadPool::TPHandle *handle, const char *osr_name);

  CollectionHandle open_collection(const coll_t& c) override;
  CollectionHandle create_new_collection(const coll_t& c) override;
  void set_collection_commit_queue(const coll_t& cid,
				   ContextQueue *commit_queue) override {
    // intentionally a no-op for FileStore
  }

  int queue_transactions(CollectionHandle& ch, std::vector<Transaction>& tls,
			 TrackedOpRef op = TrackedOpRef(),
			 ThreadPool::TPHandle *handle = nullptr) override;

  /**
   * set replay guard xattr on given file
   *
   * This will ensure that we will not replay this (or any previous) operation
   * against this particular inode/object.
   *
   * @param fd open file descriptor for the file/object
   * @param spos sequencer position of the last operation we should not replay
   */
  void _set_replay_guard(int fd,
			 const SequencerPosition& spos,
			 const ghobject_t *oid=0,
			 bool in_progress=false);
  void _set_replay_guard(const coll_t& cid,
                         const SequencerPosition& spos,
                         bool in_progress);
  void _set_global_replay_guard(const coll_t& cid,
				const SequencerPosition &spos);

  /// close a replay guard opened with in_progress=true
  void _close_replay_guard(int fd, const SequencerPosition& spos,
			   const ghobject_t *oid=0);
  void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos);

  /**
   * check replay guard xattr on given file
   *
   * Check the current position against any marker on the file that
   * indicates which operations have already been applied.  If the
   * current or a newer operation has been marked as applied, we
   * should not replay the current operation again.
   *
   * If we are not replaying the journal, we already return true.  It
   * is only on replay that we might return false, indicated that the
   * operation should not be performed (again).
   *
   * @param fd open fd on the file/object in question
   * @param spos sequencerposition for an operation we could apply/replay
   * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
   */
  int _check_replay_guard(int fd, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos);
  int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos);
  int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos);

  // ------------------
  // objects
  int pick_object_revision_lt(ghobject_t& oid) {
    return 0;
  }
  using ObjectStore::exists;
  bool exists(CollectionHandle& c, const ghobject_t& oid) override;
  using ObjectStore::stat;
  int stat(
    CollectionHandle& c,
    const ghobject_t& oid,
    struct stat *st,
    bool allow_eio = false) override;
  using ObjectStore::set_collection_opts;
  int set_collection_opts(
    CollectionHandle& c,
    const pool_opts_t& opts) override;
  using ObjectStore::read;
  int read(
    CollectionHandle& c,
    const ghobject_t& oid,
    uint64_t offset,
    size_t len,
    ceph::buffer::list& bl,
    uint32_t op_flags = 0) override;
  int _do_fiemap(int fd, uint64_t offset, size_t len,
		 std::map<uint64_t, uint64_t> *m);
  int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
			 std::map<uint64_t, uint64_t> *m);
  using ObjectStore::fiemap;
  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, ceph::buffer::list& bl) override;
  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override;

  // object mutations (underscore-prefixed: called during transaction apply)
  int _touch(const coll_t& cid, const ghobject_t& oid);
  int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
	     const ceph::buffer::list& bl, uint32_t fadvise_flags = 0);
  int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
  int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
  int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
	     const SequencerPosition& spos);
  int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
		   uint64_t srcoff, uint64_t len, uint64_t dstoff,
		   const SequencerPosition& spos);
  int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
  int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
  int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos);

  // raw xattr helpers on an open fd
  int _fgetattr(int fd, const char *name, ceph::bufferptr& bp);
  int _fgetattrs(int fd, std::map<std::string, ceph::bufferptr,std::less<>>& aset);
  int _fsetattrs(int fd, std::map<std::string, ceph::bufferptr,std::less<>>& aset);

  void do_force_sync();
  void start_sync(Context *onsafe);
  void sync();
  void _flush_op_queue();
  void flush();
  void sync_and_flush();

  int flush_journal() override;
  int dump_journal(std::ostream& out) override;

  void set_fsid(uuid_d u) override {
    fsid = u;
  }
  uuid_d get_fsid() override { return fsid; }

  uint64_t estimate_objects_overhead(uint64_t num_objects) override;

  // DEBUG read error injection, an object is removed from both on delete()
  ceph::mutex read_error_lock = ceph::make_mutex("FileStore::read_error_lock");
  std::set<ghobject_t> data_error_set; // read() will return -EIO
  std::set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
  void inject_data_error(const ghobject_t &oid) override;
  void inject_mdata_error(const ghobject_t &oid) override;

  void compact() override {
    ceph_assert(object_map);
    object_map->compact();
  }

  bool has_builtin_csum() const override {
    return false;
  }

  void debug_obj_on_delete(const ghobject_t &oid);
  bool debug_data_eio(const ghobject_t &oid);
  bool debug_mdata_eio(const ghobject_t &oid);

  int snapshot(const std::string& name) override;

  // attrs
  using ObjectStore::getattr;
  using ObjectStore::getattrs;
  int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, ceph::bufferptr &bp) override;
  int getattrs(CollectionHandle& c, const ghobject_t& oid, std::map<std::string,ceph::bufferptr,std::less<>>& aset) override;

  int _setattrs(const coll_t& cid, const ghobject_t& oid, std::map<std::string,ceph::bufferptr>& aset,
		const SequencerPosition &spos);
  int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
	      const SequencerPosition &spos);
  int _rmattrs(const coll_t& cid, const ghobject_t& oid,
	       const SequencerPosition &spos);

  int _collection_remove_recursive(const coll_t &cid,
				   const SequencerPosition &spos);

  int _collection_set_bits(const coll_t& cid, int bits);

  // collections
  using ObjectStore::collection_list;
  int collection_bits(CollectionHandle& c) override;
  int collection_list(CollectionHandle& c,
		      const ghobject_t& start, const ghobject_t& end, int max,
		      std::vector<ghobject_t> *ls, ghobject_t *next) override {
    c->flush();   // drain in-flight ops so the listing is coherent
    return collection_list(c->cid, start, end, max, ls, next);
  }
  int collection_list(const coll_t& cid,
		      const ghobject_t& start, const ghobject_t& end, int max,
		      std::vector<ghobject_t> *ls, ghobject_t *next);
  int list_collections(std::vector<coll_t>& ls) override;
  int list_collections(std::vector<coll_t>& ls, bool include_temp);
  int collection_stat(const coll_t& c, struct stat *st);
  bool collection_exists(const coll_t& c) override;
  int collection_empty(CollectionHandle& c, bool *empty) override {
    c->flush();   // drain in-flight ops before checking emptiness
    return collection_empty(c->cid, empty);
  }
  int collection_empty(const coll_t& cid, bool *empty);

  // omap (see ObjectStore.h for documentation)
  using ObjectStore::omap_get;
  int omap_get(CollectionHandle& c, const ghobject_t &oid, ceph::buffer::list *header,
	       std::map<std::string, ceph::buffer::list> *out) override;
  using ObjectStore::omap_get_header;
  int omap_get_header(
    CollectionHandle& c,
    const ghobject_t &oid,
    ceph::buffer::list *out,
    bool allow_eio = false) override;
  using ObjectStore::omap_get_keys;
  int omap_get_keys(CollectionHandle& c, const ghobject_t &oid, std::set<std::string> *keys) override;
  using ObjectStore::omap_get_values;
  int omap_get_values(CollectionHandle& c, const ghobject_t &oid, const std::set<std::string> &keys,
		      std::map<std::string, ceph::buffer::list> *out) override;
  using ObjectStore::omap_check_keys;
  int omap_check_keys(CollectionHandle& c, const ghobject_t &oid, const std::set<std::string> &keys,
		      std::set<std::string> *out) override;
  using ObjectStore::get_omap_iterator;
  ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c, const ghobject_t &oid) override;
  ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid, const ghobject_t &oid);

  int _create_collection(const coll_t& c, int bits,
			 const SequencerPosition &spos);
  int _destroy_collection(const coll_t& c);
  /**
   * Give an expected number of objects hint to the collection.
   *
   * @param c - collection id.
   * @param pg_num - pg number of the pool this collection belongs to
   * @param expected_num_objs - expected number of objects in this collection
   * @param spos - sequence position
   *
   * @return 0 on success, an error code otherwise
   */
  int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
      uint64_t expected_num_objs,
      const SequencerPosition &spos);
  int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid,
		      const SequencerPosition& spos);
  int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
			      coll_t c, const ghobject_t& o,
			      const SequencerPosition& spos,
			      bool ignore_enoent = false);

  int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
                      uint64_t expected_object_size,
                      uint64_t expected_write_size);

  // transaction dump (debug) support
  void dump_start(const std::string& file);
  void dump_stop();
  void dump_transactions(std::vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);

  virtual int apply_layout_settings(const coll_t &cid, int target_level);

  void get_db_statistics(ceph::Formatter* f) override;

private:
  void _inject_failure();

  // omap
  int _omap_clear(const coll_t& cid, const ghobject_t &oid,
		  const SequencerPosition &spos);
  int _omap_setkeys(const coll_t& cid, const ghobject_t &oid,
		    const std::map<std::string, ceph::buffer::list> &aset,
		    const SequencerPosition &spos);
  int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const std::set<std::string> &keys,
		   const SequencerPosition &spos);
  int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
		       const std::string& first, const std::string& last,
		       const SequencerPosition &spos);
  int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const ceph::buffer::list &bl,
		      const SequencerPosition &spos);
  int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest,
			const SequencerPosition &spos);
  int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest,
			const SequencerPosition &spos);

  // md_config_obs_t: config observation
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
			  const std::set <std::string> &changed) override;
  int set_throttle_params();
  // cached config values (m_* mirror filestore_* / journal options)
  float m_filestore_commit_timeout;
  bool m_filestore_journal_parallel;
  bool m_filestore_journal_trailing;
  bool m_filestore_journal_writeahead;
  int m_filestore_fiemap_threshold;
  double m_filestore_max_sync_interval;
  double m_filestore_min_sync_interval;
  bool m_filestore_fail_eio;
  bool m_filestore_fadvise;
  int do_update;
  bool m_journal_dio, m_journal_aio, m_journal_force_aio;
  std::string m_osd_rollback_to_cluster_snap;
  bool m_osd_use_stale_snap;
  bool m_filestore_do_dump;
  std::ofstream m_filestore_dump;
  ceph::JSONFormatter m_filestore_dump_fmt;
  std::atomic<int64_t> m_filestore_kill_at = { 0 };
  bool m_filestore_sloppy_crc;
  int m_filestore_sloppy_crc_block_size;
  uint64_t m_filestore_max_alloc_hint_size;
  unsigned long m_fs_type;

  //Determined xattr handling based on fs type
  void set_xattr_limits_via_conf();
  uint32_t m_filestore_max_inline_xattr_size;
  uint32_t m_filestore_max_inline_xattrs;
  uint32_t m_filestore_max_xattr_value_size;

  FSSuperblock superblock;

  /**
   * write_superblock()
   *
   * Write superblock to persisent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int write_superblock();

  /**
   * read_superblock()
   *
   * Fill in FileStore::superblock by reading persistent storage
   *
   * return value: 0 on success, otherwise negative errno
   */
  int read_superblock();

  friend class FileStoreBackend;
  friend class TestFileStore;
};
864
/// debug stream output for an OpSequencer (defined out-of-line)
std::ostream& operator<<(std::ostream& out, const FileStore::OpSequencer& s);

// forward declaration for FileStoreBackend::do_fiemap below;
// presumably the Linux fiemap ioctl struct (linux/fiemap.h) — see _do_fiemap
struct fiemap;
868
/**
 * FileStoreBackend - filesystem-specific behavior behind FileStore.
 *
 * Concrete subclasses (selected by FileStoreBackend::create() from the
 * filesystem magic number) implement feature detection, checkpointing,
 * sync, fiemap/clone-range, and sloppy-CRC hooks.  The protected
 * accessors expose the owning FileStore's fds, paths, and sizes to
 * subclasses without friending each one individually.
 */
class FileStoreBackend {
private:
  FileStore *filestore;   ///< owning store (non-owning back-pointer)
protected:
  int get_basedir_fd() {
    return filestore->basedir_fd;
  }
  int get_current_fd() {
    return filestore->current_fd;
  }
  int get_op_fd() {
    return filestore->op_fd;
  }
  size_t get_blksize() {
    return filestore->blk_size;
  }
  const std::string& get_basedir_path() {
    return filestore->basedir;
  }
  const std::string& get_journal_path() {
    return filestore->journalpath;
  }
  const std::string& get_current_path() {
    return filestore->current_fn;
  }
  /// copy a byte range, using the sparse-aware path when the fs supports
  /// extent mapping (fiemap or SEEK_DATA/SEEK_HOLE), else a plain copy
  int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
    if (has_fiemap() || has_seek_data_hole()) {
      return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
    } else {
      return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
    }
  }
  int get_crc_block_size() {
    return filestore->m_filestore_sloppy_crc_block_size;
  }

public:
  explicit FileStoreBackend(FileStore *fs) : filestore(fs) {}
  virtual ~FileStoreBackend() {}

  CephContext* cct() const {
    return filestore->cct;
  }

  /// factory: pick the backend subclass for the given filesystem magic
  static FileStoreBackend *create(unsigned long f_type, FileStore *fs);

  virtual const char *get_name() = 0;
  virtual int detect_features() = 0;
  virtual int create_current() = 0;
  // checkpoint support (for snapshot-capable filesystems)
  virtual bool can_checkpoint() = 0;
  virtual int list_checkpoints(std::list<std::string>& ls) = 0;
  virtual int create_checkpoint(const std::string& name, uint64_t *cid) = 0;
  virtual int sync_checkpoint(uint64_t id) = 0;
  virtual int rollback_to(const std::string& name) = 0;
  virtual int destroy_checkpoint(const std::string& name) = 0;
  virtual int syncfs() = 0;
  virtual bool has_fiemap() = 0;
  virtual bool has_seek_data_hole() = 0;
  virtual bool is_rotational() = 0;
  virtual bool is_journal_rotational() = 0;
  virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
  virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
  virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
  virtual bool has_splice() const = 0;

  // hooks for (sloppy) crc tracking
  virtual int _crc_update_write(int fd, loff_t off, size_t len, const ceph::buffer::list& bl) = 0;
  virtual int _crc_update_truncate(int fd, loff_t off) = 0;
  virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
  virtual int _crc_update_clone_range(int srcfd, int destfd,
				      loff_t srcoff, size_t len, loff_t dstoff) = 0;
  virtual int _crc_verify_read(int fd, loff_t off, size_t len, const ceph::buffer::list& bl,
			       std::ostream *out) = 0;
};
943
944#endif