]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | #ifndef CEPH_FILESTORE_H | |
17 | #define CEPH_FILESTORE_H | |
18 | ||
19 | #include "include/types.h" | |
20 | ||
21 | #include <map> | |
22 | #include <deque> | |
31f18b77 | 23 | #include <atomic> |
7c673cae | 24 | #include <fstream> |
31f18b77 | 25 | |
7c673cae FG |
26 | using namespace std; |
27 | ||
31f18b77 FG |
28 | #include <boost/scoped_ptr.hpp> |
29 | ||
7c673cae FG |
30 | #include "include/unordered_map.h" |
31 | ||
32 | #include "include/assert.h" | |
33 | ||
34 | #include "os/ObjectStore.h" | |
35 | #include "JournalingObjectStore.h" | |
36 | ||
37 | #include "common/Timer.h" | |
38 | #include "common/WorkQueue.h" | |
39 | #include "common/perf_counters.h" | |
40 | #include "common/zipkin_trace.h" | |
41 | ||
42 | #include "common/Mutex.h" | |
43 | #include "HashIndex.h" | |
44 | #include "IndexManager.h" | |
45 | #include "os/ObjectMap.h" | |
46 | #include "SequencerPosition.h" | |
47 | #include "FDCache.h" | |
48 | #include "WBThrottle.h" | |
49 | ||
50 | #include "include/uuid.h" | |
51 | ||
52 | ||
// Fallback definitions for constants that the system headers in use may
// not provide.  Each one is only defined when it is not already visible.

// from include/linux/falloc.h:
#if !defined(FALLOC_FL_PUNCH_HOLE)
#  define FALLOC_FL_PUNCH_HOLE 0x2
#endif

// filesystem superblock magic numbers (Linux only)
#if defined(__linux__)
#  if !defined(BTRFS_SUPER_MAGIC)
#    define BTRFS_SUPER_MAGIC 0x9123683EL
#  endif
#  if !defined(XFS_SUPER_MAGIC)
#    define XFS_SUPER_MAGIC 0x58465342L
#  endif
#  if !defined(ZFS_SUPER_MAGIC)
#    define ZFS_SUPER_MAGIC 0x2fc12fc1L
#  endif
#endif
69 | ||
70 | ||
71 | class FileStoreBackend; | |
72 | ||
73 | #define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects") | |
74 | ||
/// Identifiers for the FileStore performance counters.  Values are
/// consecutive, bracketed by l_filestore_first and l_filestore_last.
/// NOTE(review): presumably these index the PerfCounters instance held in
/// FileStore::logger -- confirm against the .cc file.
enum {
  l_filestore_first = 84000,

  // journal
  l_filestore_journal_queue_ops,
  l_filestore_journal_queue_bytes,
  l_filestore_journal_ops,
  l_filestore_journal_bytes,
  l_filestore_journal_latency,
  l_filestore_journal_wr,
  l_filestore_journal_wr_bytes,
  l_filestore_journal_full,

  // commit cycle
  l_filestore_committing,
  l_filestore_commitcycle,
  l_filestore_commitcycle_interval,
  l_filestore_commitcycle_latency,

  // op queue
  l_filestore_op_queue_max_ops,
  l_filestore_op_queue_ops,
  l_filestore_ops,
  l_filestore_op_queue_max_bytes,
  l_filestore_op_queue_bytes,
  l_filestore_bytes,

  // latencies
  l_filestore_apply_latency,
  l_filestore_queue_transaction_latency_avg,

  l_filestore_last,
};
99 | ||
100 | class FSSuperblock { | |
101 | public: | |
102 | CompatSet compat_features; | |
103 | string omap_backend; | |
104 | ||
105 | FSSuperblock() { } | |
106 | ||
107 | void encode(bufferlist &bl) const; | |
108 | void decode(bufferlist::iterator &bl); | |
109 | void dump(Formatter *f) const; | |
110 | static void generate_test_instances(list<FSSuperblock*>& o); | |
111 | }; | |
112 | WRITE_CLASS_ENCODER(FSSuperblock) | |
113 | ||
114 | inline ostream& operator<<(ostream& out, const FSSuperblock& sb) | |
115 | { | |
116 | return out << "sb(" << sb.compat_features << "): " | |
117 | << sb.omap_backend; | |
118 | } | |
119 | ||
120 | class FileStore : public JournalingObjectStore, | |
121 | public md_config_obs_t | |
122 | { | |
123 | static const uint32_t target_version = 4; | |
124 | public: | |
125 | uint32_t get_target_version() { | |
126 | return target_version; | |
127 | } | |
128 | ||
129 | static int get_block_device_fsid(CephContext* cct, const string& path, | |
130 | uuid_d *fsid); | |
131 | struct FSPerfTracker { | |
132 | PerfCounters::avg_tracker<uint64_t> os_commit_latency; | |
133 | PerfCounters::avg_tracker<uint64_t> os_apply_latency; | |
134 | ||
135 | objectstore_perf_stat_t get_cur_stats() const { | |
136 | objectstore_perf_stat_t ret; | |
137 | ret.os_commit_latency = os_commit_latency.avg(); | |
138 | ret.os_apply_latency = os_apply_latency.avg(); | |
139 | return ret; | |
140 | } | |
141 | ||
142 | void update_from_perfcounters(PerfCounters &logger); | |
143 | } perf_tracker; | |
144 | objectstore_perf_stat_t get_cur_stats() override { | |
145 | perf_tracker.update_from_perfcounters(*logger); | |
146 | return perf_tracker.get_cur_stats(); | |
147 | } | |
148 | const PerfCounters* get_perf_counters() const override { | |
149 | return logger; | |
150 | } | |
151 | ||
152 | private: | |
153 | string internal_name; ///< internal name, used to name the perfcounter instance | |
154 | string basedir, journalpath; | |
155 | osflagbits_t generic_flags; | |
156 | std::string current_fn; | |
157 | std::string current_op_seq_fn; | |
158 | std::string omap_dir; | |
159 | uuid_d fsid; | |
160 | ||
161 | size_t blk_size; ///< fs block size | |
162 | ||
163 | int fsid_fd, op_fd, basedir_fd, current_fd; | |
164 | ||
165 | FileStoreBackend *backend; | |
166 | ||
167 | void create_backend(long f_type); | |
168 | ||
169 | deque<uint64_t> snaps; | |
170 | ||
171 | // Indexed Collections | |
172 | IndexManager index_manager; | |
173 | int get_index(const coll_t& c, Index *index); | |
174 | int init_index(const coll_t& c); | |
175 | ||
176 | bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) { | |
177 | // - normal temp case: cid is pg, object is temp (pool < -1) | |
178 | // - hammer temp case: cid is pg (or already temp), object pool is -1 | |
179 | return cid.is_pg() && oid.hobj.pool <= -1; | |
180 | } | |
181 | void init_temp_collections(); | |
182 | ||
183 | // ObjectMap | |
184 | boost::scoped_ptr<ObjectMap> object_map; | |
185 | ||
186 | // helper fns | |
187 | int get_cdir(const coll_t& cid, char *s, int len); | |
188 | ||
189 | /// read a uuid from fd | |
190 | int read_fsid(int fd, uuid_d *uuid); | |
191 | ||
192 | /// lock fsid_fd | |
193 | int lock_fsid(); | |
194 | ||
195 | // sync thread | |
196 | Mutex lock; | |
197 | bool force_sync; | |
198 | Cond sync_cond; | |
199 | ||
200 | Mutex sync_entry_timeo_lock; | |
201 | SafeTimer timer; | |
202 | ||
203 | list<Context*> sync_waiters; | |
204 | bool stop; | |
205 | void sync_entry(); | |
206 | struct SyncThread : public Thread { | |
207 | FileStore *fs; | |
208 | explicit SyncThread(FileStore *f) : fs(f) {} | |
209 | void *entry() override { | |
210 | fs->sync_entry(); | |
211 | return 0; | |
212 | } | |
213 | } sync_thread; | |
214 | ||
215 | // -- op workqueue -- | |
216 | struct Op { | |
217 | utime_t start; | |
218 | uint64_t op; | |
219 | vector<Transaction> tls; | |
220 | Context *onreadable, *onreadable_sync; | |
221 | uint64_t ops, bytes; | |
222 | TrackedOpRef osd_op; | |
223 | ZTracer::Trace trace; | |
224 | }; | |
225 | class OpSequencer : public Sequencer_impl { | |
226 | Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock) | |
227 | list<Op*> q; | |
228 | list<uint64_t> jq; | |
229 | list<pair<uint64_t, Context*> > flush_commit_waiters; | |
230 | Cond cond; | |
231 | public: | |
232 | Sequencer *parent; | |
233 | Mutex apply_lock; // for apply mutual exclusion | |
234 | int id; | |
235 | ||
236 | /// get_max_uncompleted | |
237 | bool _get_max_uncompleted( | |
238 | uint64_t *seq ///< [out] max uncompleted seq | |
239 | ) { | |
240 | assert(qlock.is_locked()); | |
241 | assert(seq); | |
242 | *seq = 0; | |
243 | if (q.empty() && jq.empty()) | |
244 | return true; | |
245 | ||
246 | if (!q.empty()) | |
247 | *seq = q.back()->op; | |
248 | if (!jq.empty() && jq.back() > *seq) | |
249 | *seq = jq.back(); | |
250 | ||
251 | return false; | |
252 | } /// @returns true if both queues are empty | |
253 | ||
254 | /// get_min_uncompleted | |
255 | bool _get_min_uncompleted( | |
256 | uint64_t *seq ///< [out] min uncompleted seq | |
257 | ) { | |
258 | assert(qlock.is_locked()); | |
259 | assert(seq); | |
260 | *seq = 0; | |
261 | if (q.empty() && jq.empty()) | |
262 | return true; | |
263 | ||
264 | if (!q.empty()) | |
265 | *seq = q.front()->op; | |
266 | if (!jq.empty() && jq.front() < *seq) | |
267 | *seq = jq.front(); | |
268 | ||
269 | return false; | |
270 | } /// @returns true if both queues are empty | |
271 | ||
272 | void _wake_flush_waiters(list<Context*> *to_queue) { | |
273 | uint64_t seq; | |
274 | if (_get_min_uncompleted(&seq)) | |
275 | seq = -1; | |
276 | ||
277 | for (list<pair<uint64_t, Context*> >::iterator i = | |
278 | flush_commit_waiters.begin(); | |
279 | i != flush_commit_waiters.end() && i->first < seq; | |
280 | flush_commit_waiters.erase(i++)) { | |
281 | to_queue->push_back(i->second); | |
282 | } | |
283 | } | |
284 | ||
285 | void queue_journal(uint64_t s) { | |
286 | Mutex::Locker l(qlock); | |
287 | jq.push_back(s); | |
288 | } | |
289 | void dequeue_journal(list<Context*> *to_queue) { | |
290 | Mutex::Locker l(qlock); | |
291 | jq.pop_front(); | |
292 | cond.Signal(); | |
293 | _wake_flush_waiters(to_queue); | |
294 | } | |
295 | void queue(Op *o) { | |
296 | Mutex::Locker l(qlock); | |
297 | q.push_back(o); | |
298 | o->trace.keyval("queue depth", q.size()); | |
299 | } | |
300 | Op *peek_queue() { | |
301 | Mutex::Locker l(qlock); | |
302 | assert(apply_lock.is_locked()); | |
303 | return q.front(); | |
304 | } | |
305 | ||
306 | Op *dequeue(list<Context*> *to_queue) { | |
307 | assert(to_queue); | |
308 | assert(apply_lock.is_locked()); | |
309 | Mutex::Locker l(qlock); | |
310 | Op *o = q.front(); | |
311 | q.pop_front(); | |
312 | cond.Signal(); | |
313 | ||
314 | _wake_flush_waiters(to_queue); | |
315 | return o; | |
316 | } | |
317 | ||
318 | void flush() override { | |
319 | Mutex::Locker l(qlock); | |
320 | ||
321 | while (cct->_conf->filestore_blackhole) | |
322 | cond.Wait(qlock); // wait forever | |
323 | ||
324 | ||
325 | // get max for journal _or_ op queues | |
326 | uint64_t seq = 0; | |
327 | if (!q.empty()) | |
328 | seq = q.back()->op; | |
329 | if (!jq.empty() && jq.back() > seq) | |
330 | seq = jq.back(); | |
331 | ||
332 | if (seq) { | |
333 | // everything prior to our watermark to drain through either/both queues | |
334 | while ((!q.empty() && q.front()->op <= seq) || | |
335 | (!jq.empty() && jq.front() <= seq)) | |
336 | cond.Wait(qlock); | |
337 | } | |
338 | } | |
339 | bool flush_commit(Context *c) override { | |
340 | Mutex::Locker l(qlock); | |
341 | uint64_t seq = 0; | |
342 | if (_get_max_uncompleted(&seq)) { | |
343 | return true; | |
344 | } else { | |
345 | flush_commit_waiters.push_back(make_pair(seq, c)); | |
346 | return false; | |
347 | } | |
348 | } | |
349 | ||
350 | OpSequencer(CephContext* cct, int i) | |
351 | : Sequencer_impl(cct), | |
352 | qlock("FileStore::OpSequencer::qlock", false, false), | |
353 | parent(0), | |
354 | apply_lock("FileStore::OpSequencer::apply_lock", false, false), | |
355 | id(i) {} | |
356 | ~OpSequencer() override { | |
357 | assert(q.empty()); | |
358 | } | |
359 | ||
360 | const string& get_name() const { | |
361 | return parent->get_name(); | |
362 | } | |
363 | }; | |
364 | ||
365 | friend ostream& operator<<(ostream& out, const OpSequencer& s); | |
366 | ||
367 | FDCache fdcache; | |
368 | WBThrottle wbthrottle; | |
369 | ||
31f18b77 | 370 | std::atomic<int64_t> next_osr_id = { 0 }; |
7c673cae FG |
371 | bool m_disable_wbthrottle; |
372 | deque<OpSequencer*> op_queue; | |
373 | BackoffThrottle throttle_ops, throttle_bytes; | |
374 | const int m_ondisk_finisher_num; | |
375 | const int m_apply_finisher_num; | |
376 | vector<Finisher*> ondisk_finishers; | |
377 | vector<Finisher*> apply_finishers; | |
378 | ||
379 | ThreadPool op_tp; | |
380 | struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> { | |
381 | FileStore *store; | |
382 | OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp) | |
383 | : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {} | |
384 | ||
385 | bool _enqueue(OpSequencer *osr) override { | |
386 | store->op_queue.push_back(osr); | |
387 | return true; | |
388 | } | |
389 | void _dequeue(OpSequencer *o) override { | |
390 | ceph_abort(); | |
391 | } | |
392 | bool _empty() override { | |
393 | return store->op_queue.empty(); | |
394 | } | |
395 | OpSequencer *_dequeue() override { | |
396 | if (store->op_queue.empty()) | |
397 | return NULL; | |
398 | OpSequencer *osr = store->op_queue.front(); | |
399 | store->op_queue.pop_front(); | |
400 | return osr; | |
401 | } | |
402 | void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override { | |
403 | store->_do_op(osr, handle); | |
404 | } | |
405 | void _process_finish(OpSequencer *osr) override { | |
406 | store->_finish_op(osr); | |
407 | } | |
408 | void _clear() override { | |
409 | assert(store->op_queue.empty()); | |
410 | } | |
411 | } op_wq; | |
412 | ||
413 | void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle); | |
414 | void _finish_op(OpSequencer *o); | |
415 | Op *build_op(vector<Transaction>& tls, | |
416 | Context *onreadable, Context *onreadable_sync, | |
417 | TrackedOpRef osd_op); | |
418 | void queue_op(OpSequencer *osr, Op *o); | |
419 | void op_queue_reserve_throttle(Op *o); | |
420 | void op_queue_release_throttle(Op *o); | |
421 | void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk); | |
422 | friend struct C_JournaledAhead; | |
423 | ||
424 | void new_journal(); | |
425 | ||
426 | PerfCounters *logger; | |
427 | ||
428 | ZTracer::Endpoint trace_endpoint; | |
429 | ||
430 | public: | |
431 | int lfn_find(const ghobject_t& oid, const Index& index, | |
432 | IndexedPath *path = NULL); | |
433 | int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length); | |
434 | int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf); | |
435 | int lfn_open( | |
436 | const coll_t& cid, | |
437 | const ghobject_t& oid, | |
438 | bool create, | |
439 | FDRef *outfd, | |
440 | Index *index = 0); | |
441 | ||
442 | void lfn_close(FDRef fd); | |
443 | int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) ; | |
444 | int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos, | |
445 | bool force_clear_omap=false); | |
446 | ||
447 | public: | |
448 | FileStore(CephContext* cct, const std::string &base, const std::string &jdev, | |
449 | osflagbits_t flags = 0, | |
450 | const char *internal_name = "filestore", bool update_to=false); | |
451 | ~FileStore() override; | |
452 | ||
453 | string get_type() override { | |
454 | return "filestore"; | |
455 | } | |
456 | ||
457 | int _detect_fs(); | |
458 | int _sanity_check_fs(); | |
459 | ||
460 | bool test_mount_in_use() override; | |
461 | int read_op_seq(uint64_t *seq); | |
462 | int write_op_seq(int, uint64_t seq); | |
463 | int mount() override; | |
464 | int umount() override; | |
465 | ||
466 | int validate_hobject_key(const hobject_t &obj) const override; | |
467 | ||
468 | unsigned get_max_attr_name_length() override { | |
469 | // xattr limit is 128; leave room for our prefixes (user.ceph._), | |
470 | // some margin, and cap at 100 | |
471 | return 100; | |
472 | } | |
473 | int mkfs() override; | |
474 | int mkjournal() override; | |
475 | bool wants_journal() override { | |
476 | return true; | |
477 | } | |
478 | bool allows_journal() override { | |
479 | return true; | |
480 | } | |
481 | bool needs_journal() override { | |
482 | return false; | |
483 | } | |
31f18b77 FG |
484 | |
485 | bool is_rotational() override; | |
486 | ||
7c673cae FG |
487 | void dump_perf_counters(Formatter *f) override { |
488 | f->open_object_section("perf_counters"); | |
489 | logger->dump_formatted(f, false); | |
490 | f->close_section(); | |
491 | } | |
492 | ||
493 | int write_version_stamp(); | |
494 | int version_stamp_is_valid(uint32_t *version); | |
495 | int update_version_stamp(); | |
496 | int upgrade() override; | |
497 | ||
498 | bool can_sort_nibblewise() override { | |
499 | return true; // i support legacy sort order | |
500 | } | |
501 | ||
502 | void collect_metadata(map<string,string> *pm) override; | |
503 | ||
504 | int statfs(struct store_statfs_t *buf) override; | |
505 | ||
506 | int _do_transactions( | |
507 | vector<Transaction> &tls, uint64_t op_seq, | |
508 | ThreadPool::TPHandle *handle); | |
509 | int do_transactions(vector<Transaction> &tls, uint64_t op_seq) override { | |
510 | return _do_transactions(tls, op_seq, 0); | |
511 | } | |
512 | void _do_transaction( | |
513 | Transaction& t, uint64_t op_seq, int trans_num, | |
514 | ThreadPool::TPHandle *handle); | |
515 | ||
516 | int queue_transactions(Sequencer *osr, vector<Transaction>& tls, | |
517 | TrackedOpRef op = TrackedOpRef(), | |
518 | ThreadPool::TPHandle *handle = NULL) override; | |
519 | ||
520 | /** | |
521 | * set replay guard xattr on given file | |
522 | * | |
523 | * This will ensure that we will not replay this (or any previous) operation | |
524 | * against this particular inode/object. | |
525 | * | |
526 | * @param fd open file descriptor for the file/object | |
527 | * @param spos sequencer position of the last operation we should not replay | |
528 | */ | |
529 | void _set_replay_guard(int fd, | |
530 | const SequencerPosition& spos, | |
531 | const ghobject_t *oid=0, | |
532 | bool in_progress=false); | |
533 | void _set_replay_guard(const coll_t& cid, | |
534 | const SequencerPosition& spos, | |
535 | bool in_progress); | |
536 | void _set_global_replay_guard(const coll_t& cid, | |
537 | const SequencerPosition &spos); | |
538 | ||
539 | /// close a replay guard opened with in_progress=true | |
540 | void _close_replay_guard(int fd, const SequencerPosition& spos, | |
541 | const ghobject_t *oid=0); | |
542 | void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos); | |
543 | ||
544 | /** | |
545 | * check replay guard xattr on given file | |
546 | * | |
547 | * Check the current position against any marker on the file that | |
548 | * indicates which operations have already been applied. If the | |
549 | * current or a newer operation has been marked as applied, we | |
550 | * should not replay the current operation again. | |
551 | * | |
552 | * If we are not replaying the journal, we always return true. It | |
553 | * is only on replay that we might return false, indicating that the | |
554 | * operation should not be performed (again). | |
555 | * | |
556 | * @param fd open fd on the file/object in question | |
557 | * @param spos sequencerposition for an operation we could apply/replay | |
558 | * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress | |
559 | */ | |
560 | int _check_replay_guard(int fd, const SequencerPosition& spos); | |
561 | int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos); | |
562 | int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos); | |
563 | int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos); | |
564 | ||
565 | // ------------------ | |
566 | // objects | |
567 | int pick_object_revision_lt(ghobject_t& oid) { | |
568 | return 0; | |
569 | } | |
570 | using ObjectStore::exists; | |
571 | bool exists(const coll_t& cid, const ghobject_t& oid) override; | |
572 | using ObjectStore::stat; | |
573 | int stat( | |
574 | const coll_t& cid, | |
575 | const ghobject_t& oid, | |
576 | struct stat *st, | |
577 | bool allow_eio = false) override; | |
578 | using ObjectStore::set_collection_opts; | |
579 | int set_collection_opts( | |
580 | const coll_t& cid, | |
581 | const pool_opts_t& opts) override; | |
582 | using ObjectStore::read; | |
583 | int read( | |
584 | const coll_t& cid, | |
585 | const ghobject_t& oid, | |
586 | uint64_t offset, | |
587 | size_t len, | |
588 | bufferlist& bl, | |
589 | uint32_t op_flags = 0, | |
590 | bool allow_eio = false) override; | |
591 | int _do_fiemap(int fd, uint64_t offset, size_t len, | |
592 | map<uint64_t, uint64_t> *m); | |
593 | int _do_seek_hole_data(int fd, uint64_t offset, size_t len, | |
594 | map<uint64_t, uint64_t> *m); | |
595 | using ObjectStore::fiemap; | |
596 | int fiemap(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) override; | |
597 | int fiemap(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override; | |
598 | ||
599 | int _touch(const coll_t& cid, const ghobject_t& oid); | |
600 | int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, | |
601 | const bufferlist& bl, uint32_t fadvise_flags = 0); | |
602 | int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len); | |
603 | int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size); | |
604 | int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid, | |
605 | const SequencerPosition& spos); | |
606 | int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid, | |
607 | uint64_t srcoff, uint64_t len, uint64_t dstoff, | |
608 | const SequencerPosition& spos); | |
609 | int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); | |
610 | int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); | |
611 | int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false); | |
612 | int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos); | |
613 | ||
614 | int _fgetattr(int fd, const char *name, bufferptr& bp); | |
615 | int _fgetattrs(int fd, map<string,bufferptr>& aset); | |
616 | int _fsetattrs(int fd, map<string, bufferptr> &aset); | |
617 | ||
618 | void _start_sync(); | |
619 | ||
620 | void do_force_sync(); | |
621 | void start_sync(Context *onsafe); | |
622 | void sync(); | |
623 | void _flush_op_queue(); | |
624 | void flush(); | |
625 | void sync_and_flush(); | |
626 | ||
627 | int flush_journal() override; | |
628 | int dump_journal(ostream& out) override; | |
629 | ||
630 | void set_fsid(uuid_d u) override { | |
631 | fsid = u; | |
632 | } | |
633 | uuid_d get_fsid() override { return fsid; } | |
634 | ||
635 | uint64_t estimate_objects_overhead(uint64_t num_objects) override; | |
636 | ||
637 | // DEBUG read error injection, an object is removed from both on delete() | |
638 | Mutex read_error_lock; | |
639 | set<ghobject_t> data_error_set; // read() will return -EIO | |
640 | set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO | |
641 | void inject_data_error(const ghobject_t &oid) override; | |
642 | void inject_mdata_error(const ghobject_t &oid) override; | |
643 | void debug_obj_on_delete(const ghobject_t &oid); | |
644 | bool debug_data_eio(const ghobject_t &oid); | |
645 | bool debug_mdata_eio(const ghobject_t &oid); | |
646 | ||
647 | int snapshot(const string& name) override; | |
648 | ||
649 | // attrs | |
650 | using ObjectStore::getattr; | |
651 | using ObjectStore::getattrs; | |
652 | int getattr(const coll_t& cid, const ghobject_t& oid, const char *name, bufferptr &bp) override; | |
653 | int getattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset) override; | |
654 | ||
655 | int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset, | |
656 | const SequencerPosition &spos); | |
657 | int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name, | |
658 | const SequencerPosition &spos); | |
659 | int _rmattrs(const coll_t& cid, const ghobject_t& oid, | |
660 | const SequencerPosition &spos); | |
661 | ||
662 | int _collection_remove_recursive(const coll_t &cid, | |
663 | const SequencerPosition &spos); | |
664 | ||
665 | int _collection_set_bits(const coll_t& cid, int bits); | |
666 | ||
667 | // collections | |
668 | using ObjectStore::collection_list; | |
669 | int collection_bits(const coll_t& c) override; | |
670 | int collection_list(const coll_t& c, | |
671 | const ghobject_t& start, const ghobject_t& end, int max, | |
672 | vector<ghobject_t> *ls, ghobject_t *next) override; | |
673 | int list_collections(vector<coll_t>& ls) override; | |
674 | int list_collections(vector<coll_t>& ls, bool include_temp); | |
675 | int collection_stat(const coll_t& c, struct stat *st); | |
676 | bool collection_exists(const coll_t& c) override; | |
677 | int collection_empty(const coll_t& c, bool *empty) override; | |
678 | ||
679 | // omap (see ObjectStore.h for documentation) | |
680 | using ObjectStore::omap_get; | |
681 | int omap_get(const coll_t& c, const ghobject_t &oid, bufferlist *header, | |
682 | map<string, bufferlist> *out) override; | |
683 | using ObjectStore::omap_get_header; | |
684 | int omap_get_header( | |
685 | const coll_t& c, | |
686 | const ghobject_t &oid, | |
687 | bufferlist *out, | |
688 | bool allow_eio = false) override; | |
689 | using ObjectStore::omap_get_keys; | |
690 | int omap_get_keys(const coll_t& c, const ghobject_t &oid, set<string> *keys) override; | |
691 | using ObjectStore::omap_get_values; | |
692 | int omap_get_values(const coll_t& c, const ghobject_t &oid, const set<string> &keys, | |
693 | map<string, bufferlist> *out) override; | |
694 | using ObjectStore::omap_check_keys; | |
695 | int omap_check_keys(const coll_t& c, const ghobject_t &oid, const set<string> &keys, | |
696 | set<string> *out) override; | |
697 | using ObjectStore::get_omap_iterator; | |
698 | ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& c, const ghobject_t &oid) override; | |
699 | ||
700 | int _create_collection(const coll_t& c, int bits, | |
701 | const SequencerPosition &spos); | |
702 | int _destroy_collection(const coll_t& c); | |
703 | /** | |
704 | * Give an expected number of objects hint to the collection. | |
705 | * | |
706 | * @param c - collection id. | |
707 | * @param pg_num - pg number of the pool this collection belongs to | |
708 | * @param expected_num_objs - expected number of objects in this collection | |
709 | * @param spos - sequence position | |
710 | * | |
711 | * @return 0 on success, an error code otherwise | |
712 | */ | |
713 | int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num, | |
714 | uint64_t expected_num_objs, | |
715 | const SequencerPosition &spos); | |
716 | int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid, | |
717 | const SequencerPosition& spos); | |
718 | int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, | |
719 | coll_t c, const ghobject_t& o, | |
720 | const SequencerPosition& spos, | |
721 | bool ignore_enoent = false); | |
722 | ||
723 | int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid, | |
724 | uint64_t expected_object_size, | |
725 | uint64_t expected_write_size); | |
726 | ||
727 | void dump_start(const std::string& file); | |
728 | void dump_stop(); | |
729 | void dump_transactions(vector<Transaction>& ls, uint64_t seq, OpSequencer *osr); | |
730 | ||
731 | virtual int apply_layout_settings(const coll_t &cid); | |
732 | ||
733 | private: | |
734 | void _inject_failure(); | |
735 | ||
736 | // omap | |
737 | int _omap_clear(const coll_t& cid, const ghobject_t &oid, | |
738 | const SequencerPosition &spos); | |
739 | int _omap_setkeys(const coll_t& cid, const ghobject_t &oid, | |
740 | const map<string, bufferlist> &aset, | |
741 | const SequencerPosition &spos); | |
742 | int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const set<string> &keys, | |
743 | const SequencerPosition &spos); | |
744 | int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid, | |
745 | const string& first, const string& last, | |
746 | const SequencerPosition &spos); | |
747 | int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl, | |
748 | const SequencerPosition &spos); | |
749 | int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest, | |
750 | const SequencerPosition &spos); | |
751 | int _split_collection_create(const coll_t& cid, uint32_t bits, uint32_t rem, | |
752 | coll_t dest, | |
753 | const SequencerPosition &spos); | |
754 | ||
755 | const char** get_tracked_conf_keys() const override; | |
756 | void handle_conf_change(const struct md_config_t *conf, | |
757 | const std::set <std::string> &changed) override; | |
758 | int set_throttle_params(); | |
759 | float m_filestore_commit_timeout; | |
760 | bool m_filestore_journal_parallel; | |
761 | bool m_filestore_journal_trailing; | |
762 | bool m_filestore_journal_writeahead; | |
763 | int m_filestore_fiemap_threshold; | |
764 | double m_filestore_max_sync_interval; | |
765 | double m_filestore_min_sync_interval; | |
766 | bool m_filestore_fail_eio; | |
767 | bool m_filestore_fadvise; | |
768 | int do_update; | |
769 | bool m_journal_dio, m_journal_aio, m_journal_force_aio; | |
770 | std::string m_osd_rollback_to_cluster_snap; | |
771 | bool m_osd_use_stale_snap; | |
772 | bool m_filestore_do_dump; | |
773 | std::ofstream m_filestore_dump; | |
774 | JSONFormatter m_filestore_dump_fmt; | |
31f18b77 | 775 | std::atomic<int64_t> m_filestore_kill_at = { 0 }; |
7c673cae FG |
776 | bool m_filestore_sloppy_crc; |
777 | int m_filestore_sloppy_crc_block_size; | |
778 | uint64_t m_filestore_max_alloc_hint_size; | |
779 | long m_fs_type; | |
780 | ||
781 | //Determined xattr handling based on fs type | |
782 | void set_xattr_limits_via_conf(); | |
783 | uint32_t m_filestore_max_inline_xattr_size; | |
784 | uint32_t m_filestore_max_inline_xattrs; | |
785 | uint32_t m_filestore_max_xattr_value_size; | |
786 | ||
787 | FSSuperblock superblock; | |
788 | ||
789 | /** | |
790 | * write_superblock() | |
791 | * | |
792 | * Write superblock to persistent storage | |
793 | * | |
794 | * return value: 0 on success, otherwise negative errno | |
795 | */ | |
796 | int write_superblock(); | |
797 | ||
798 | /** | |
799 | * read_superblock() | |
800 | * | |
801 | * Fill in FileStore::superblock by reading persistent storage | |
802 | * | |
803 | * return value: 0 on success, otherwise negative errno | |
804 | */ | |
805 | int read_superblock(); | |
806 | ||
807 | friend class FileStoreBackend; | |
808 | friend class TestFileStore; | |
809 | }; | |
810 | ||
811 | ostream& operator<<(ostream& out, const FileStore::OpSequencer& s); | |
812 | ||
813 | struct fiemap; | |
814 | ||
815 | class FileStoreBackend { | |
816 | private: | |
817 | FileStore *filestore; | |
818 | protected: | |
819 | int get_basedir_fd() { | |
820 | return filestore->basedir_fd; | |
821 | } | |
822 | int get_current_fd() { | |
823 | return filestore->current_fd; | |
824 | } | |
825 | int get_op_fd() { | |
826 | return filestore->op_fd; | |
827 | } | |
828 | size_t get_blksize() { | |
829 | return filestore->blk_size; | |
830 | } | |
831 | const string& get_basedir_path() { | |
832 | return filestore->basedir; | |
833 | } | |
834 | const string& get_current_path() { | |
835 | return filestore->current_fn; | |
836 | } | |
837 | int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) { | |
838 | if (has_fiemap() || has_seek_data_hole()) { | |
839 | return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff); | |
840 | } else { | |
841 | return filestore->_do_copy_range(from, to, srcoff, len, dstoff); | |
842 | } | |
843 | } | |
844 | int get_crc_block_size() { | |
845 | return filestore->m_filestore_sloppy_crc_block_size; | |
846 | } | |
847 | ||
848 | public: | |
849 | explicit FileStoreBackend(FileStore *fs) : filestore(fs) {} | |
850 | virtual ~FileStoreBackend() {} | |
851 | ||
852 | CephContext* cct() const { | |
853 | return filestore->cct; | |
854 | } | |
855 | ||
856 | static FileStoreBackend *create(long f_type, FileStore *fs); | |
857 | ||
858 | virtual const char *get_name() = 0; | |
859 | virtual int detect_features() = 0; | |
860 | virtual int create_current() = 0; | |
861 | virtual bool can_checkpoint() = 0; | |
862 | virtual int list_checkpoints(list<string>& ls) = 0; | |
863 | virtual int create_checkpoint(const string& name, uint64_t *cid) = 0; | |
864 | virtual int sync_checkpoint(uint64_t id) = 0; | |
865 | virtual int rollback_to(const string& name) = 0; | |
866 | virtual int destroy_checkpoint(const string& name) = 0; | |
867 | virtual int syncfs() = 0; | |
868 | virtual bool has_fiemap() = 0; | |
869 | virtual bool has_seek_data_hole() = 0; | |
31f18b77 | 870 | virtual bool is_rotational() = 0; |
7c673cae FG |
871 | virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0; |
872 | virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0; | |
873 | virtual int set_alloc_hint(int fd, uint64_t hint) = 0; | |
874 | virtual bool has_splice() const = 0; | |
875 | ||
876 | // hooks for (sloppy) crc tracking | |
877 | virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0; | |
878 | virtual int _crc_update_truncate(int fd, loff_t off) = 0; | |
879 | virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0; | |
880 | virtual int _crc_update_clone_range(int srcfd, int destfd, | |
881 | loff_t srcoff, size_t len, loff_t dstoff) = 0; | |
882 | virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, | |
883 | ostream *out) = 0; | |
884 | }; | |
885 | ||
886 | #endif |