]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | #ifndef CEPH_FILESTORE_H | |
17 | #define CEPH_FILESTORE_H | |
18 | ||
19 | #include "include/types.h" | |
20 | ||
21 | #include <map> | |
22 | #include <deque> | |
31f18b77 | 23 | #include <atomic> |
7c673cae | 24 | #include <fstream> |
31f18b77 | 25 | |
7c673cae | 26 | |
31f18b77 FG |
27 | #include <boost/scoped_ptr.hpp> |
28 | ||
7c673cae FG |
29 | #include "include/unordered_map.h" |
30 | ||
11fdf7f2 | 31 | #include "include/ceph_assert.h" |
7c673cae FG |
32 | |
33 | #include "os/ObjectStore.h" | |
34 | #include "JournalingObjectStore.h" | |
35 | ||
36 | #include "common/Timer.h" | |
37 | #include "common/WorkQueue.h" | |
38 | #include "common/perf_counters.h" | |
39 | #include "common/zipkin_trace.h" | |
40 | ||
9f95a23c | 41 | #include "common/ceph_mutex.h" |
7c673cae FG |
42 | #include "HashIndex.h" |
43 | #include "IndexManager.h" | |
44 | #include "os/ObjectMap.h" | |
45 | #include "SequencerPosition.h" | |
46 | #include "FDCache.h" | |
47 | #include "WBThrottle.h" | |
48 | ||
49 | #include "include/uuid.h" | |
50 | ||
7c673cae FG |
51 | #if defined(__linux__) |
52 | # ifndef BTRFS_SUPER_MAGIC | |
11fdf7f2 | 53 | #define BTRFS_SUPER_MAGIC 0x9123683EUL |
7c673cae FG |
54 | # endif |
55 | # ifndef XFS_SUPER_MAGIC | |
11fdf7f2 | 56 | #define XFS_SUPER_MAGIC 0x58465342UL |
7c673cae FG |
57 | # endif |
58 | # ifndef ZFS_SUPER_MAGIC | |
11fdf7f2 | 59 | #define ZFS_SUPER_MAGIC 0x2fc12fc1UL |
7c673cae FG |
60 | # endif |
61 | #endif | |
62 | ||
63 | ||
64 | class FileStoreBackend; | |
65 | ||
66 | #define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects") | |
67 | ||
/// Perf-counter indices used by FileStore.  The enumerators are consecutive:
/// l_filestore_first is pinned to 84000 and every following entry is one
/// larger than its predecessor, ending with l_filestore_last.
enum {
  l_filestore_first = 84000,

  // journal
  l_filestore_journal_queue_ops,
  l_filestore_journal_queue_bytes,
  l_filestore_journal_ops,
  l_filestore_journal_bytes,
  l_filestore_journal_latency,
  l_filestore_journal_wr,
  l_filestore_journal_wr_bytes,
  l_filestore_journal_full,

  // commit cycle
  l_filestore_committing,
  l_filestore_commitcycle,
  l_filestore_commitcycle_interval,
  l_filestore_commitcycle_latency,

  // op queue
  l_filestore_op_queue_max_ops,
  l_filestore_op_queue_ops,
  l_filestore_ops,
  l_filestore_op_queue_max_bytes,
  l_filestore_op_queue_bytes,
  l_filestore_bytes,

  // latencies
  l_filestore_apply_latency,
  l_filestore_queue_transaction_latency_avg,
  l_filestore_sync_pause_max_lat,

  l_filestore_last,
};
93 | ||
94 | class FSSuperblock { | |
95 | public: | |
96 | CompatSet compat_features; | |
f67539c2 | 97 | std::string omap_backend; |
7c673cae FG |
98 | |
99 | FSSuperblock() { } | |
100 | ||
f67539c2 TL |
101 | void encode(ceph::buffer::list &bl) const; |
102 | void decode(ceph::buffer::list::const_iterator &bl); | |
103 | void dump(ceph::Formatter *f) const; | |
104 | static void generate_test_instances(std::list<FSSuperblock*>& o); | |
7c673cae FG |
105 | }; |
106 | WRITE_CLASS_ENCODER(FSSuperblock) | |
107 | ||
f67539c2 | 108 | inline std::ostream& operator<<(std::ostream& out, const FSSuperblock& sb) |
7c673cae FG |
109 | { |
110 | return out << "sb(" << sb.compat_features << "): " | |
111 | << sb.omap_backend; | |
112 | } | |
113 | ||
114 | class FileStore : public JournalingObjectStore, | |
115 | public md_config_obs_t | |
116 | { | |
117 | static const uint32_t target_version = 4; | |
118 | public: | |
119 | uint32_t get_target_version() { | |
120 | return target_version; | |
121 | } | |
122 | ||
f67539c2 | 123 | static int get_block_device_fsid(CephContext* cct, const std::string& path, |
7c673cae FG |
124 | uuid_d *fsid); |
125 | struct FSPerfTracker { | |
11fdf7f2 TL |
126 | PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns; |
127 | PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns; | |
7c673cae FG |
128 | |
129 | objectstore_perf_stat_t get_cur_stats() const { | |
130 | objectstore_perf_stat_t ret; | |
11fdf7f2 TL |
131 | ret.os_commit_latency_ns = os_commit_latency_ns.current_avg(); |
132 | ret.os_apply_latency_ns = os_apply_latency_ns.current_avg(); | |
7c673cae FG |
133 | return ret; |
134 | } | |
135 | ||
136 | void update_from_perfcounters(PerfCounters &logger); | |
137 | } perf_tracker; | |
138 | objectstore_perf_stat_t get_cur_stats() override { | |
139 | perf_tracker.update_from_perfcounters(*logger); | |
140 | return perf_tracker.get_cur_stats(); | |
141 | } | |
142 | const PerfCounters* get_perf_counters() const override { | |
143 | return logger; | |
144 | } | |
145 | ||
146 | private: | |
f67539c2 TL |
147 | std::string internal_name; ///< internal name, used to name the perfcounter instance |
148 | std::string basedir, journalpath; | |
7c673cae FG |
149 | osflagbits_t generic_flags; |
150 | std::string current_fn; | |
151 | std::string current_op_seq_fn; | |
152 | std::string omap_dir; | |
153 | uuid_d fsid; | |
154 | ||
155 | size_t blk_size; ///< fs block size | |
156 | ||
157 | int fsid_fd, op_fd, basedir_fd, current_fd; | |
158 | ||
159 | FileStoreBackend *backend; | |
160 | ||
11fdf7f2 TL |
161 | void create_backend(unsigned long f_type); |
162 | ||
f67539c2 | 163 | std::string devname; |
11fdf7f2 TL |
164 | |
165 | int vdo_fd = -1; | |
f67539c2 | 166 | std::string vdo_name; |
7c673cae FG |
167 | |
// Qualified like every other standard container in this header, so the
// declaration does not depend on a using-declaration pulled in elsewhere.
std::deque<uint64_t> snaps;
169 | ||
170 | // Indexed Collections | |
171 | IndexManager index_manager; | |
172 | int get_index(const coll_t& c, Index *index); | |
173 | int init_index(const coll_t& c); | |
174 | ||
175 | bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) { | |
176 | // - normal temp case: cid is pg, object is temp (pool < -1) | |
177 | // - hammer temp case: cid is pg (or already temp), object pool is -1 | |
178 | return cid.is_pg() && oid.hobj.pool <= -1; | |
179 | } | |
180 | void init_temp_collections(); | |
181 | ||
11fdf7f2 TL |
182 | void handle_eio(); |
183 | ||
7c673cae FG |
184 | // ObjectMap |
185 | boost::scoped_ptr<ObjectMap> object_map; | |
186 | ||
187 | // helper fns | |
188 | int get_cdir(const coll_t& cid, char *s, int len); | |
189 | ||
190 | /// read a uuid from fd | |
191 | int read_fsid(int fd, uuid_d *uuid); | |
192 | ||
193 | /// lock fsid_fd | |
194 | int lock_fsid(); | |
195 | ||
196 | // sync thread | |
9f95a23c | 197 | ceph::mutex lock = ceph::make_mutex("FileStore::lock"); |
7c673cae | 198 | bool force_sync; |
9f95a23c | 199 | ceph::condition_variable sync_cond; |
7c673cae | 200 | |
9f95a23c | 201 | ceph::mutex sync_entry_timeo_lock = ceph::make_mutex("FileStore::sync_entry_timeo_lock"); |
7c673cae FG |
202 | SafeTimer timer; |
203 | ||
f67539c2 | 204 | std::list<Context*> sync_waiters; |
7c673cae FG |
205 | bool stop; |
206 | void sync_entry(); | |
207 | struct SyncThread : public Thread { | |
208 | FileStore *fs; | |
209 | explicit SyncThread(FileStore *f) : fs(f) {} | |
210 | void *entry() override { | |
211 | fs->sync_entry(); | |
212 | return 0; | |
213 | } | |
214 | } sync_thread; | |
215 | ||
216 | // -- op workqueue -- | |
217 | struct Op { | |
218 | utime_t start; | |
219 | uint64_t op; | |
f67539c2 | 220 | std::vector<Transaction> tls; |
7c673cae FG |
221 | Context *onreadable, *onreadable_sync; |
222 | uint64_t ops, bytes; | |
223 | TrackedOpRef osd_op; | |
224 | ZTracer::Trace trace; | |
11fdf7f2 | 225 | bool registered_apply = false; |
7c673cae | 226 | }; |
11fdf7f2 TL |
227 | class OpSequencer : public CollectionImpl { |
228 | CephContext *cct; | |
9f95a23c TL |
229 | // to protect q, for benefit of flush (peek/dequeue also protected by lock) |
230 | ceph::mutex qlock = | |
231 | ceph::make_mutex("FileStore::OpSequencer::qlock", false); | |
f67539c2 TL |
232 | std::list<Op*> q; |
233 | std::list<uint64_t> jq; | |
234 | std::list<std::pair<uint64_t, Context*> > flush_commit_waiters; | |
9f95a23c | 235 | ceph::condition_variable cond; |
f67539c2 | 236 | std::string osr_name_str; |
11fdf7f2 | 237 | /// hash of pointers to ghobject_t's for in-flight writes |
f67539c2 | 238 | std::unordered_multimap<uint32_t,const ghobject_t*> applying; |
7c673cae | 239 | public: |
9f95a23c TL |
240 | // for apply mutual exclusion |
241 | ceph::mutex apply_lock = | |
242 | ceph::make_mutex("FileStore::OpSequencer::apply_lock", false); | |
7c673cae | 243 | int id; |
11fdf7f2 | 244 | const char *osr_name; |
7c673cae FG |
245 | |
246 | /// get_max_uncompleted | |
247 | bool _get_max_uncompleted( | |
248 | uint64_t *seq ///< [out] max uncompleted seq | |
249 | ) { | |
11fdf7f2 | 250 | ceph_assert(seq); |
7c673cae FG |
251 | *seq = 0; |
252 | if (q.empty() && jq.empty()) | |
253 | return true; | |
254 | ||
255 | if (!q.empty()) | |
256 | *seq = q.back()->op; | |
257 | if (!jq.empty() && jq.back() > *seq) | |
258 | *seq = jq.back(); | |
259 | ||
260 | return false; | |
261 | } /// @returns true if both queues are empty | |
262 | ||
263 | /// get_min_uncompleted | |
264 | bool _get_min_uncompleted( | |
265 | uint64_t *seq ///< [out] min uncompleted seq | |
266 | ) { | |
11fdf7f2 | 267 | ceph_assert(seq); |
7c673cae FG |
268 | *seq = 0; |
269 | if (q.empty() && jq.empty()) | |
270 | return true; | |
271 | ||
272 | if (!q.empty()) | |
273 | *seq = q.front()->op; | |
274 | if (!jq.empty() && jq.front() < *seq) | |
275 | *seq = jq.front(); | |
276 | ||
277 | return false; | |
278 | } /// @returns true if both queues are empty | |
279 | ||
f67539c2 | 280 | void _wake_flush_waiters(std::list<Context*> *to_queue) { |
7c673cae FG |
281 | uint64_t seq; |
282 | if (_get_min_uncompleted(&seq)) | |
283 | seq = -1; | |
284 | ||
f67539c2 | 285 | for (auto i = flush_commit_waiters.begin(); |
7c673cae FG |
286 | i != flush_commit_waiters.end() && i->first < seq; |
287 | flush_commit_waiters.erase(i++)) { | |
288 | to_queue->push_back(i->second); | |
289 | } | |
290 | } | |
291 | ||
11fdf7f2 | 292 | void queue_journal(Op *o) { |
9f95a23c | 293 | std::lock_guard l{qlock}; |
11fdf7f2 TL |
294 | jq.push_back(o->op); |
295 | _register_apply(o); | |
7c673cae | 296 | } |
f67539c2 | 297 | void dequeue_journal(std::list<Context*> *to_queue) { |
9f95a23c | 298 | std::lock_guard l{qlock}; |
7c673cae | 299 | jq.pop_front(); |
9f95a23c | 300 | cond.notify_all(); |
7c673cae FG |
301 | _wake_flush_waiters(to_queue); |
302 | } | |
303 | void queue(Op *o) { | |
9f95a23c | 304 | std::lock_guard l{qlock}; |
7c673cae | 305 | q.push_back(o); |
11fdf7f2 | 306 | _register_apply(o); |
7c673cae FG |
307 | o->trace.keyval("queue depth", q.size()); |
308 | } | |
11fdf7f2 TL |
309 | void _register_apply(Op *o); |
310 | void _unregister_apply(Op *o); | |
311 | void wait_for_apply(const ghobject_t& oid); | |
7c673cae | 312 | Op *peek_queue() { |
9f95a23c TL |
313 | std::lock_guard l{qlock}; |
314 | ceph_assert(ceph_mutex_is_locked(apply_lock)); | |
7c673cae FG |
315 | return q.front(); |
316 | } | |
317 | ||
f67539c2 | 318 | Op *dequeue(std::list<Context*> *to_queue) { |
11fdf7f2 | 319 | ceph_assert(to_queue); |
9f95a23c TL |
320 | ceph_assert(ceph_mutex_is_locked(apply_lock)); |
321 | std::lock_guard l{qlock}; | |
7c673cae FG |
322 | Op *o = q.front(); |
323 | q.pop_front(); | |
9f95a23c | 324 | cond.notify_all(); |
11fdf7f2 | 325 | _unregister_apply(o); |
7c673cae FG |
326 | _wake_flush_waiters(to_queue); |
327 | return o; | |
328 | } | |
329 | ||
330 | void flush() override { | |
9f95a23c TL |
331 | std::unique_lock l{qlock}; |
332 | // wait forever | |
333 | cond.wait(l, [this] { return !cct->_conf->filestore_blackhole; }); | |
7c673cae FG |
334 | |
335 | // get max for journal _or_ op queues | |
336 | uint64_t seq = 0; | |
337 | if (!q.empty()) | |
338 | seq = q.back()->op; | |
339 | if (!jq.empty() && jq.back() > seq) | |
340 | seq = jq.back(); | |
341 | ||
342 | if (seq) { | |
343 | // everything prior to our watermark to drain through either/both queues | |
9f95a23c TL |
344 | cond.wait(l, [seq, this] { |
345 | return ((q.empty() || q.front()->op > seq) && | |
346 | (jq.empty() || jq.front() > seq)); | |
347 | }); | |
7c673cae FG |
348 | } |
349 | } | |
350 | bool flush_commit(Context *c) override { | |
9f95a23c | 351 | std::lock_guard l{qlock}; |
7c673cae FG |
352 | uint64_t seq = 0; |
353 | if (_get_max_uncompleted(&seq)) { | |
354 | return true; | |
355 | } else { | |
f67539c2 | 356 | flush_commit_waiters.push_back(std::make_pair(seq, c)); |
7c673cae FG |
357 | return false; |
358 | } | |
359 | } | |
360 | ||
9f95a23c TL |
361 | private: |
362 | FRIEND_MAKE_REF(OpSequencer); | |
11fdf7f2 | 363 | OpSequencer(CephContext* cct, int i, coll_t cid) |
9f95a23c | 364 | : CollectionImpl(cct, cid), |
11fdf7f2 | 365 | cct(cct), |
11fdf7f2 | 366 | osr_name_str(stringify(cid)), |
11fdf7f2 TL |
367 | id(i), |
368 | osr_name(osr_name_str.c_str()) {} | |
7c673cae | 369 | ~OpSequencer() override { |
11fdf7f2 | 370 | ceph_assert(q.empty()); |
7c673cae FG |
371 | } |
372 | }; | |
11fdf7f2 TL |
373 | typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef; |
374 | ||
9f95a23c | 375 | ceph::mutex coll_lock = ceph::make_mutex("FileStore::coll_lock"); |
f67539c2 | 376 | std::map<coll_t,OpSequencerRef> coll_map; |
7c673cae | 377 | |
f67539c2 | 378 | friend std::ostream& operator<<(std::ostream& out, const OpSequencer& s); |
7c673cae FG |
379 | |
380 | FDCache fdcache; | |
381 | WBThrottle wbthrottle; | |
382 | ||
31f18b77 | 383 | std::atomic<int64_t> next_osr_id = { 0 }; |
7c673cae FG |
384 | bool m_disable_wbthrottle; |
385 | deque<OpSequencer*> op_queue; | |
386 | BackoffThrottle throttle_ops, throttle_bytes; | |
387 | const int m_ondisk_finisher_num; | |
388 | const int m_apply_finisher_num; | |
f67539c2 TL |
389 | std::vector<Finisher*> ondisk_finishers; |
390 | std::vector<Finisher*> apply_finishers; | |
7c673cae FG |
391 | |
392 | ThreadPool op_tp; | |
393 | struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> { | |
394 | FileStore *store; | |
f67539c2 TL |
395 | OpWQ(FileStore *fs, |
396 | ceph::timespan timeout, | |
397 | ceph::timespan suicide_timeout, | |
398 | ThreadPool *tp) | |
399 | : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", | |
400 | timeout, suicide_timeout, tp), | |
401 | store(fs) {} | |
7c673cae FG |
402 | |
403 | bool _enqueue(OpSequencer *osr) override { | |
404 | store->op_queue.push_back(osr); | |
405 | return true; | |
406 | } | |
407 | void _dequeue(OpSequencer *o) override { | |
408 | ceph_abort(); | |
409 | } | |
410 | bool _empty() override { | |
411 | return store->op_queue.empty(); | |
412 | } | |
413 | OpSequencer *_dequeue() override { | |
414 | if (store->op_queue.empty()) | |
11fdf7f2 | 415 | return nullptr; |
7c673cae FG |
416 | OpSequencer *osr = store->op_queue.front(); |
417 | store->op_queue.pop_front(); | |
418 | return osr; | |
419 | } | |
420 | void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override { | |
421 | store->_do_op(osr, handle); | |
422 | } | |
423 | void _process_finish(OpSequencer *osr) override { | |
424 | store->_finish_op(osr); | |
425 | } | |
426 | void _clear() override { | |
11fdf7f2 | 427 | ceph_assert(store->op_queue.empty()); |
7c673cae FG |
428 | } |
429 | } op_wq; | |
430 | ||
431 | void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle); | |
432 | void _finish_op(OpSequencer *o); | |
f67539c2 | 433 | Op *build_op(std::vector<Transaction>& tls, |
7c673cae FG |
434 | Context *onreadable, Context *onreadable_sync, |
435 | TrackedOpRef osd_op); | |
436 | void queue_op(OpSequencer *osr, Op *o); | |
437 | void op_queue_reserve_throttle(Op *o); | |
438 | void op_queue_release_throttle(Op *o); | |
439 | void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk); | |
440 | friend struct C_JournaledAhead; | |
441 | ||
442 | void new_journal(); | |
443 | ||
444 | PerfCounters *logger; | |
445 | ||
446 | ZTracer::Endpoint trace_endpoint; | |
447 | ||
448 | public: | |
449 | int lfn_find(const ghobject_t& oid, const Index& index, | |
11fdf7f2 | 450 | IndexedPath *path = nullptr); |
7c673cae FG |
451 | int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length); |
452 | int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf); | |
453 | int lfn_open( | |
454 | const coll_t& cid, | |
455 | const ghobject_t& oid, | |
456 | bool create, | |
457 | FDRef *outfd, | |
11fdf7f2 | 458 | Index *index = nullptr); |
7c673cae FG |
459 | |
460 | void lfn_close(FDRef fd); | |
461 | int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) ; | |
462 | int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos, | |
463 | bool force_clear_omap=false); | |
464 | ||
465 | public: | |
466 | FileStore(CephContext* cct, const std::string &base, const std::string &jdev, | |
467 | osflagbits_t flags = 0, | |
468 | const char *internal_name = "filestore", bool update_to=false); | |
469 | ~FileStore() override; | |
470 | ||
f67539c2 | 471 | std::string get_type() override { |
7c673cae FG |
472 | return "filestore"; |
473 | } | |
474 | ||
475 | int _detect_fs(); | |
476 | int _sanity_check_fs(); | |
477 | ||
478 | bool test_mount_in_use() override; | |
479 | int read_op_seq(uint64_t *seq); | |
480 | int write_op_seq(int, uint64_t seq); | |
481 | int mount() override; | |
482 | int umount() override; | |
483 | ||
484 | int validate_hobject_key(const hobject_t &obj) const override; | |
485 | ||
486 | unsigned get_max_attr_name_length() override { | |
487 | // xattr limit is 128; leave room for our prefixes (user.ceph._), | |
488 | // some margin, and cap at 100 | |
489 | return 100; | |
490 | } | |
491 | int mkfs() override; | |
492 | int mkjournal() override; | |
493 | bool wants_journal() override { | |
494 | return true; | |
495 | } | |
496 | bool allows_journal() override { | |
497 | return true; | |
498 | } | |
499 | bool needs_journal() override { | |
500 | return false; | |
501 | } | |
31f18b77 | 502 | |
11fdf7f2 TL |
503 | bool is_sync_onreadable() const override { |
504 | return false; | |
505 | } | |
506 | ||
31f18b77 | 507 | bool is_rotational() override; |
d2e6a577 | 508 | bool is_journal_rotational() override; |
31f18b77 | 509 | |
f67539c2 | 510 | void dump_perf_counters(ceph::Formatter *f) override { |
7c673cae FG |
511 | f->open_object_section("perf_counters"); |
512 | logger->dump_formatted(f, false); | |
513 | f->close_section(); | |
514 | } | |
515 | ||
f67539c2 | 516 | int flush_cache(std::ostream *os = NULL) override; |
7c673cae FG |
517 | int write_version_stamp(); |
518 | int version_stamp_is_valid(uint32_t *version); | |
519 | int update_version_stamp(); | |
520 | int upgrade() override; | |
521 | ||
522 | bool can_sort_nibblewise() override { | |
523 | return true; // i support legacy sort order | |
524 | } | |
525 | ||
f67539c2 TL |
526 | void collect_metadata(std::map<std::string,std::string> *pm) override; |
527 | int get_devices(std::set<std::string> *ls) override; | |
7c673cae | 528 | |
11fdf7f2 TL |
529 | int statfs(struct store_statfs_t *buf, |
530 | osd_alert_list_t* alerts = nullptr) override; | |
9f95a23c TL |
531 | int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf, |
532 | bool *per_pool_omap) override; | |
7c673cae FG |
533 | |
534 | int _do_transactions( | |
f67539c2 | 535 | std::vector<Transaction> &tls, uint64_t op_seq, |
11fdf7f2 TL |
536 | ThreadPool::TPHandle *handle, |
537 | const char *osr_name); | |
f67539c2 | 538 | int do_transactions(std::vector<Transaction> &tls, uint64_t op_seq) override { |
11fdf7f2 | 539 | return _do_transactions(tls, op_seq, nullptr, "replay"); |
7c673cae FG |
540 | } |
541 | void _do_transaction( | |
542 | Transaction& t, uint64_t op_seq, int trans_num, | |
11fdf7f2 TL |
543 | ThreadPool::TPHandle *handle, const char *osr_name); |
544 | ||
545 | CollectionHandle open_collection(const coll_t& c) override; | |
546 | CollectionHandle create_new_collection(const coll_t& c) override; | |
547 | void set_collection_commit_queue(const coll_t& cid, | |
548 | ContextQueue *commit_queue) override { | |
549 | } | |
7c673cae | 550 | |
f67539c2 | 551 | int queue_transactions(CollectionHandle& ch, std::vector<Transaction>& tls, |
7c673cae | 552 | TrackedOpRef op = TrackedOpRef(), |
11fdf7f2 | 553 | ThreadPool::TPHandle *handle = nullptr) override; |
7c673cae FG |
554 | |
555 | /** | |
556 | * set replay guard xattr on given file | |
557 | * | |
558 | * This will ensure that we will not replay this (or any previous) operation | |
559 | * against this particular inode/object. | |
560 | * | |
561 | * @param fd open file descriptor for the file/object | |
562 | * @param spos sequencer position of the last operation we should not replay | |
563 | */ | |
564 | void _set_replay_guard(int fd, | |
565 | const SequencerPosition& spos, | |
566 | const ghobject_t *oid=0, | |
567 | bool in_progress=false); | |
568 | void _set_replay_guard(const coll_t& cid, | |
569 | const SequencerPosition& spos, | |
570 | bool in_progress); | |
571 | void _set_global_replay_guard(const coll_t& cid, | |
572 | const SequencerPosition &spos); | |
573 | ||
574 | /// close a replay guard opened with in_progress=true | |
575 | void _close_replay_guard(int fd, const SequencerPosition& spos, | |
576 | const ghobject_t *oid=0); | |
577 | void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos); | |
578 | ||
579 | /** | |
580 | * check replay guard xattr on given file | |
581 | * | |
582 | * Check the current position against any marker on the file that | |
583 | * indicates which operations have already been applied. If the | |
584 | * current or a newer operation has been marked as applied, we | |
585 | * should not replay the current operation again. | |
586 | * | |
587 | * If we are not replaying the journal, we always return true. It | |
588 | * is only on replay that we might return false, indicating that the | |
589 | * operation should not be performed (again). | |
590 | * | |
591 | * @param fd open fd on the file/object in question | |
592 | * @param spos sequencerposition for an operation we could apply/replay | |
593 | * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress | |
594 | */ | |
595 | int _check_replay_guard(int fd, const SequencerPosition& spos); | |
596 | int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos); | |
597 | int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos); | |
598 | int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos); | |
599 | ||
600 | // ------------------ | |
601 | // objects | |
602 | int pick_object_revision_lt(ghobject_t& oid) { | |
603 | return 0; | |
604 | } | |
605 | using ObjectStore::exists; | |
11fdf7f2 | 606 | bool exists(CollectionHandle& c, const ghobject_t& oid) override; |
7c673cae FG |
607 | using ObjectStore::stat; |
608 | int stat( | |
11fdf7f2 | 609 | CollectionHandle& c, |
7c673cae FG |
610 | const ghobject_t& oid, |
611 | struct stat *st, | |
612 | bool allow_eio = false) override; | |
613 | using ObjectStore::set_collection_opts; | |
614 | int set_collection_opts( | |
11fdf7f2 | 615 | CollectionHandle& c, |
7c673cae FG |
616 | const pool_opts_t& opts) override; |
617 | using ObjectStore::read; | |
618 | int read( | |
11fdf7f2 | 619 | CollectionHandle& c, |
7c673cae FG |
620 | const ghobject_t& oid, |
621 | uint64_t offset, | |
622 | size_t len, | |
f67539c2 | 623 | ceph::buffer::list& bl, |
224ce89b | 624 | uint32_t op_flags = 0) override; |
7c673cae | 625 | int _do_fiemap(int fd, uint64_t offset, size_t len, |
f67539c2 | 626 | std::map<uint64_t, uint64_t> *m); |
7c673cae | 627 | int _do_seek_hole_data(int fd, uint64_t offset, size_t len, |
f67539c2 | 628 | std::map<uint64_t, uint64_t> *m); |
7c673cae | 629 | using ObjectStore::fiemap; |
f67539c2 TL |
630 | int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, ceph::buffer::list& bl) override; |
631 | int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override; | |
7c673cae FG |
632 | |
633 | int _touch(const coll_t& cid, const ghobject_t& oid); | |
634 | int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, | |
f67539c2 | 635 | const ceph::buffer::list& bl, uint32_t fadvise_flags = 0); |
7c673cae FG |
636 | int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len); |
637 | int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size); | |
638 | int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid, | |
639 | const SequencerPosition& spos); | |
640 | int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid, | |
641 | uint64_t srcoff, uint64_t len, uint64_t dstoff, | |
642 | const SequencerPosition& spos); | |
643 | int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); | |
644 | int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); | |
645 | int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false); | |
646 | int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos); | |
647 | ||
f67539c2 | 648 | int _fgetattr(int fd, const char *name, ceph::bufferptr& bp); |
20effc67 TL |
649 | int _fgetattrs(int fd, std::map<std::string, ceph::bufferptr,std::less<>>& aset); |
650 | int _fsetattrs(int fd, std::map<std::string, ceph::bufferptr,std::less<>>& aset); | |
7c673cae | 651 | |
7c673cae FG |
652 | void do_force_sync(); |
653 | void start_sync(Context *onsafe); | |
654 | void sync(); | |
655 | void _flush_op_queue(); | |
656 | void flush(); | |
657 | void sync_and_flush(); | |
658 | ||
659 | int flush_journal() override; | |
f67539c2 | 660 | int dump_journal(std::ostream& out) override; |
7c673cae FG |
661 | |
662 | void set_fsid(uuid_d u) override { | |
663 | fsid = u; | |
664 | } | |
665 | uuid_d get_fsid() override { return fsid; } | |
666 | ||
667 | uint64_t estimate_objects_overhead(uint64_t num_objects) override; | |
668 | ||
669 | // DEBUG read error injection, an object is removed from both on delete() | |
9f95a23c | 670 | ceph::mutex read_error_lock = ceph::make_mutex("FileStore::read_error_lock"); |
f67539c2 TL |
671 | std::set<ghobject_t> data_error_set; // read() will return -EIO |
672 | std::set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO | |
7c673cae FG |
673 | void inject_data_error(const ghobject_t &oid) override; |
674 | void inject_mdata_error(const ghobject_t &oid) override; | |
224ce89b WB |
675 | |
676 | void compact() override { | |
11fdf7f2 | 677 | ceph_assert(object_map); |
224ce89b WB |
678 | object_map->compact(); |
679 | } | |
680 | ||
28e407b8 AA |
681 | bool has_builtin_csum() const override { |
682 | return false; | |
683 | } | |
684 | ||
7c673cae FG |
685 | void debug_obj_on_delete(const ghobject_t &oid); |
686 | bool debug_data_eio(const ghobject_t &oid); | |
687 | bool debug_mdata_eio(const ghobject_t &oid); | |
688 | ||
f67539c2 | 689 | int snapshot(const std::string& name) override; |
7c673cae FG |
690 | |
691 | // attrs | |
692 | using ObjectStore::getattr; | |
693 | using ObjectStore::getattrs; | |
f67539c2 | 694 | int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, ceph::bufferptr &bp) override; |
20effc67 | 695 | int getattrs(CollectionHandle& c, const ghobject_t& oid, std::map<std::string,ceph::bufferptr,std::less<>>& aset) override; |
7c673cae | 696 | |
f67539c2 | 697 | int _setattrs(const coll_t& cid, const ghobject_t& oid, std::map<std::string,ceph::bufferptr>& aset, |
7c673cae FG |
698 | const SequencerPosition &spos); |
699 | int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name, | |
700 | const SequencerPosition &spos); | |
701 | int _rmattrs(const coll_t& cid, const ghobject_t& oid, | |
702 | const SequencerPosition &spos); | |
703 | ||
704 | int _collection_remove_recursive(const coll_t &cid, | |
705 | const SequencerPosition &spos); | |
706 | ||
707 | int _collection_set_bits(const coll_t& cid, int bits); | |
708 | ||
709 | // collections | |
710 | using ObjectStore::collection_list; | |
11fdf7f2 TL |
711 | int collection_bits(CollectionHandle& c) override; |
712 | int collection_list(CollectionHandle& c, | |
713 | const ghobject_t& start, const ghobject_t& end, int max, | |
f67539c2 | 714 | std::vector<ghobject_t> *ls, ghobject_t *next) override { |
11fdf7f2 TL |
715 | c->flush(); |
716 | return collection_list(c->cid, start, end, max, ls, next); | |
717 | } | |
718 | int collection_list(const coll_t& cid, | |
7c673cae | 719 | const ghobject_t& start, const ghobject_t& end, int max, |
f67539c2 TL |
720 | std::vector<ghobject_t> *ls, ghobject_t *next); |
721 | int list_collections(std::vector<coll_t>& ls) override; | |
722 | int list_collections(std::vector<coll_t>& ls, bool include_temp); | |
7c673cae FG |
723 | int collection_stat(const coll_t& c, struct stat *st); |
724 | bool collection_exists(const coll_t& c) override; | |
11fdf7f2 TL |
725 | int collection_empty(CollectionHandle& c, bool *empty) override { |
726 | c->flush(); | |
727 | return collection_empty(c->cid, empty); | |
728 | } | |
729 | int collection_empty(const coll_t& cid, bool *empty); | |
7c673cae FG |
730 | |
731 | // omap (see ObjectStore.h for documentation) | |
732 | using ObjectStore::omap_get; | |
f67539c2 TL |
733 | int omap_get(CollectionHandle& c, const ghobject_t &oid, ceph::buffer::list *header, |
734 | std::map<std::string, ceph::buffer::list> *out) override; | |
7c673cae FG |
735 | using ObjectStore::omap_get_header; |
736 | int omap_get_header( | |
11fdf7f2 | 737 | CollectionHandle& c, |
7c673cae | 738 | const ghobject_t &oid, |
f67539c2 | 739 | ceph::buffer::list *out, |
7c673cae FG |
740 | bool allow_eio = false) override; |
741 | using ObjectStore::omap_get_keys; | |
f67539c2 | 742 | int omap_get_keys(CollectionHandle& c, const ghobject_t &oid, std::set<std::string> *keys) override; |
7c673cae | 743 | using ObjectStore::omap_get_values; |
f67539c2 TL |
744 | int omap_get_values(CollectionHandle& c, const ghobject_t &oid, const std::set<std::string> &keys, |
745 | std::map<std::string, ceph::buffer::list> *out) override; | |
7c673cae | 746 | using ObjectStore::omap_check_keys; |
f67539c2 TL |
747 | int omap_check_keys(CollectionHandle& c, const ghobject_t &oid, const std::set<std::string> &keys, |
748 | std::set<std::string> *out) override; | |
7c673cae | 749 | using ObjectStore::get_omap_iterator; |
11fdf7f2 TL |
750 | ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c, const ghobject_t &oid) override; |
751 | ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid, const ghobject_t &oid); | |
7c673cae FG |
752 | |
// Collection-level operations. All but _destroy_collection and
// _set_alloc_hint take a SequencerPosition argument.
// NOTE(review): return convention is presumably "0 on success, error code
// otherwise" as documented for _collection_hint_expected_num_objs below --
// confirm in FileStore.cc.
753 | int _create_collection(const coll_t& c, int bits, | |
754 | const SequencerPosition &spos); | |
755 | int _destroy_collection(const coll_t& c); | |
756 | /** | |
757 | * Give an expected number of objects hint to the collection. | |
758 | * | |
759 | * @param c - collection id. | |
760 | * @param pg_num - pg number of the pool this collection belongs to | |
761 | * @param expected_num_objs - expected number of objects in this collection | |
762 | * @param spos - sequence position | |
763 | * | |
764 | * @return 0 on success, an error code otherwise | |
765 | */ | |
766 | int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num, | |
767 | uint64_t expected_num_objs, | |
768 | const SequencerPosition &spos); | |
// _collection_add: `c` / `ocid` are two collection ids and `oid` one object id.
// NOTE(review): which collection is source and which is destination is not
// visible from this declaration -- confirm in the definition.
769 | int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid, | |
770 | const SequencerPosition& spos); | |
// _collection_move_rename: (oldcid, oldoid) and (c, o) are the two
// collection/object pairs; `ignore_enoent` defaults to false.
// NOTE(review): `c` is taken by value here while every other coll_t
// parameter in this group is a const reference.
771 | int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, | |
772 | coll_t c, const ghobject_t& o, | |
773 | const SequencerPosition& spos, | |
774 | bool ignore_enoent = false); | |
775 | |
// _set_alloc_hint: both size hints are 64-bit unsigned values; units are
// presumably bytes -- TODO confirm.
776 | int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid, | |
777 | uint64_t expected_object_size, | |
778 | uint64_t expected_write_size); | |
779 | ||
// Transaction dump controls.
// NOTE(review): presumably backed by the m_filestore_do_dump /
// m_filestore_dump / m_filestore_dump_fmt members declared further down in
// this class -- confirm in FileStore.cc.
780 | void dump_start(const std::string& file); | |
781 | void dump_stop(); | |
f67539c2 | 782 | void dump_transactions(std::vector<Transaction>& ls, uint64_t seq, OpSequencer *osr); |
7c673cae | 783 | |
// Declared `virtual` without `override`, unlike the neighbouring overrides.
1adf2230 | 784 | virtual int apply_layout_settings(const coll_t &cid, int target_level); |
7c673cae | 785 | |
// Overrides a base-class virtual; takes the Formatter to write into.
f67539c2 | 786 | void get_db_statistics(ceph::Formatter* f) override; |
11fdf7f2 | 787 | |
7c673cae FG |
788 | private: |
789 | void _inject_failure(); | |
790 | ||
791 | // omap | |
792 | int _omap_clear(const coll_t& cid, const ghobject_t &oid, | |
793 | const SequencerPosition &spos); | |
794 | int _omap_setkeys(const coll_t& cid, const ghobject_t &oid, | |
f67539c2 | 795 | const std::map<std::string, ceph::buffer::list> &aset, |
7c673cae | 796 | const SequencerPosition &spos); |
f67539c2 | 797 | int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const std::set<std::string> &keys, |
7c673cae FG |
798 | const SequencerPosition &spos); |
799 | int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid, | |
f67539c2 | 800 | const std::string& first, const std::string& last, |
7c673cae | 801 | const SequencerPosition &spos); |
f67539c2 | 802 | int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const ceph::buffer::list &bl, |
7c673cae FG |
803 | const SequencerPosition &spos); |
804 | int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest, | |
805 | const SequencerPosition &spos); | |
11fdf7f2 TL |
806 | int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest, |
807 | const SequencerPosition &spos); | |
7c673cae FG |
808 | |
// Configuration-change hooks: both are overrides; handle_conf_change
// receives the ConfigProxy and the set of changed option names.
// NOTE(review): presumably the config-observer interface -- confirm which
// base class declares them.
809 | const char** get_tracked_conf_keys() const override; | |
11fdf7f2 | 810 | void handle_conf_change(const ConfigProxy& conf, |
7c673cae FG |
811 | const std::set <std::string> &changed) override; |
812 | int set_throttle_params(); | |
// Cached settings / state. None of these has an in-class initializer except
// m_filestore_kill_at, so the others must be initialized elsewhere
// (presumably the constructor -- confirm in FileStore.cc).
813 | float m_filestore_commit_timeout; | |
814 | bool m_filestore_journal_parallel; | |
815 | bool m_filestore_journal_trailing; | |
816 | bool m_filestore_journal_writeahead; | |
817 | int m_filestore_fiemap_threshold; | |
818 | double m_filestore_max_sync_interval; | |
819 | double m_filestore_min_sync_interval; | |
820 | bool m_filestore_fail_eio; | |
821 | bool m_filestore_fadvise; | |
822 | int do_update; | |
823 | bool m_journal_dio, m_journal_aio, m_journal_force_aio; | |
824 | std::string m_osd_rollback_to_cluster_snap; | |
825 | bool m_osd_use_stale_snap; | |
826 | bool m_filestore_do_dump; | |
827 | std::ofstream m_filestore_dump; | |
f67539c2 | 828 | ceph::JSONFormatter m_filestore_dump_fmt; |
// Atomic counter, zero-initialized in-class.
31f18b77 | 829 | std::atomic<int64_t> m_filestore_kill_at = { 0 }; |
7c673cae FG |
830 | bool m_filestore_sloppy_crc; |
// Exposed to backends through FileStoreBackend::get_crc_block_size().
831 | int m_filestore_sloppy_crc_block_size; | |
832 | uint64_t m_filestore_max_alloc_hint_size; | |
// NOTE(review): presumably a filesystem magic such as the *_SUPER_MAGIC
// values defined at the top of this header (same `unsigned long` type as the
// f_type parameter of FileStoreBackend::create) -- confirm.
11fdf7f2 | 833 | unsigned long m_fs_type; |
7c673cae FG |
834 | |
835 | // Determine xattr handling based on fs type | |
836 | void set_xattr_limits_via_conf(); | |
837 | uint32_t m_filestore_max_inline_xattr_size; | |
838 | uint32_t m_filestore_max_inline_xattrs; | |
839 | uint32_t m_filestore_max_xattr_value_size; | |
840 | ||
// In-memory superblock; read_superblock() fills it in (see its doc comment).
841 | FSSuperblock superblock; | |
842 | |
843 | /** | |
844 | * write_superblock() | |
845 | * | |
846 | * Write superblock to persistent storage | |
847 | * | |
848 | * return value: 0 on success, otherwise negative errno | |
849 | */ | |
850 | int write_superblock(); | |
851 | |
852 | /** | |
853 | * read_superblock() | |
854 | * | |
855 | * Fill in FileStore::superblock by reading persistent storage | |
856 | * | |
857 | * return value: 0 on success, otherwise negative errno | |
858 | */ | |
859 | int read_superblock(); | |
860 | |
// FileStoreBackend (declared later in this header) reads FileStore members
// such as basedir_fd, current_fd, op_fd, blk_size and
// m_filestore_sloppy_crc_block_size from its inline accessors; this friend
// declaration grants it access to the non-public ones.
861 | friend class FileStoreBackend; | |
// NOTE(review): TestFileStore is not declared in this chunk -- presumably a
// unit-test fixture; confirm.
862 | friend class TestFileStore; | |
863 | }; | |
864 | ||
// Stream-insertion operator for FileStore::OpSequencer; declared here and
// defined elsewhere. Returns the std::ostream reference type, as stream
// inserters conventionally do.
f67539c2 | 865 | std::ostream& operator<<(std::ostream& out, const FileStore::OpSequencer& s); |
7c673cae FG |
866 | |
// Forward declaration only: FileStoreBackend::do_fiemap() below takes a
// `struct fiemap **`, which does not require the complete type.
867 | struct fiemap; | |
868 | ||
869 | class FileStoreBackend { | |
870 | private: | |
871 | FileStore *filestore; | |
872 | protected: | |
873 | int get_basedir_fd() { | |
874 | return filestore->basedir_fd; | |
875 | } | |
876 | int get_current_fd() { | |
877 | return filestore->current_fd; | |
878 | } | |
879 | int get_op_fd() { | |
880 | return filestore->op_fd; | |
881 | } | |
882 | size_t get_blksize() { | |
883 | return filestore->blk_size; | |
884 | } | |
f67539c2 | 885 | const std::string& get_basedir_path() { |
7c673cae FG |
886 | return filestore->basedir; |
887 | } | |
f67539c2 | 888 | const std::string& get_journal_path() { |
d2e6a577 FG |
889 | return filestore->journalpath; |
890 | } | |
f67539c2 | 891 | const std::string& get_current_path() { |
7c673cae FG |
892 | return filestore->current_fn; |
893 | } | |
894 | int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) { | |
895 | if (has_fiemap() || has_seek_data_hole()) { | |
896 | return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff); | |
897 | } else { | |
898 | return filestore->_do_copy_range(from, to, srcoff, len, dstoff); | |
899 | } | |
900 | } | |
901 | int get_crc_block_size() { | |
902 | return filestore->m_filestore_sloppy_crc_block_size; | |
903 | } | |
904 | ||
905 | public: | |
906 | explicit FileStoreBackend(FileStore *fs) : filestore(fs) {} | |
907 | virtual ~FileStoreBackend() {} | |
908 | ||
909 | CephContext* cct() const { | |
910 | return filestore->cct; | |
911 | } | |
912 | ||
11fdf7f2 | 913 | static FileStoreBackend *create(unsigned long f_type, FileStore *fs); |
7c673cae FG |
914 | |
915 | virtual const char *get_name() = 0; | |
916 | virtual int detect_features() = 0; | |
917 | virtual int create_current() = 0; | |
918 | virtual bool can_checkpoint() = 0; | |
f67539c2 TL |
919 | virtual int list_checkpoints(std::list<std::string>& ls) = 0; |
920 | virtual int create_checkpoint(const std::string& name, uint64_t *cid) = 0; | |
7c673cae | 921 | virtual int sync_checkpoint(uint64_t id) = 0; |
f67539c2 TL |
922 | virtual int rollback_to(const std::string& name) = 0; |
923 | virtual int destroy_checkpoint(const std::string& name) = 0; | |
7c673cae FG |
924 | virtual int syncfs() = 0; |
925 | virtual bool has_fiemap() = 0; | |
926 | virtual bool has_seek_data_hole() = 0; | |
31f18b77 | 927 | virtual bool is_rotational() = 0; |
d2e6a577 | 928 | virtual bool is_journal_rotational() = 0; |
7c673cae FG |
929 | virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0; |
930 | virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0; | |
931 | virtual int set_alloc_hint(int fd, uint64_t hint) = 0; | |
932 | virtual bool has_splice() const = 0; | |
933 | ||
934 | // hooks for (sloppy) crc tracking | |
f67539c2 | 935 | virtual int _crc_update_write(int fd, loff_t off, size_t len, const ceph::buffer::list& bl) = 0; |
7c673cae FG |
936 | virtual int _crc_update_truncate(int fd, loff_t off) = 0; |
937 | virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0; | |
938 | virtual int _crc_update_clone_range(int srcfd, int destfd, | |
939 | loff_t srcoff, size_t len, loff_t dstoff) = 0; | |
f67539c2 TL |
940 | virtual int _crc_verify_read(int fd, loff_t off, size_t len, const ceph::buffer::list& bl, |
941 | std::ostream *out) = 0; | |
7c673cae FG |
942 | }; |
943 | ||
944 | #endif |