]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/flush_job.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / db / flush_job.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9#pragma once
10
11#include <atomic>
12#include <deque>
13#include <limits>
f67539c2 14#include <list>
7c673cae 15#include <set>
f67539c2 16#include <string>
7c673cae
FG
17#include <utility>
18#include <vector>
7c673cae
FG
19
20#include "db/column_family.h"
21#include "db/dbformat.h"
22#include "db/flush_scheduler.h"
23#include "db/internal_stats.h"
24#include "db/job_context.h"
25#include "db/log_writer.h"
11fdf7f2 26#include "db/logs_with_prep_tracker.h"
7c673cae
FG
27#include "db/memtable_list.h"
28#include "db/snapshot_impl.h"
29#include "db/version_edit.h"
30#include "db/write_controller.h"
31#include "db/write_thread.h"
f67539c2 32#include "logging/event_logger.h"
7c673cae
FG
33#include "monitoring/instrumented_mutex.h"
34#include "options/db_options.h"
35#include "port/port.h"
36#include "rocksdb/db.h"
37#include "rocksdb/env.h"
f67539c2 38#include "rocksdb/listener.h"
7c673cae
FG
39#include "rocksdb/memtablerep.h"
40#include "rocksdb/transaction_log.h"
41#include "table/scoped_arena_iterator.h"
42#include "util/autovector.h"
7c673cae
FG
43#include "util/stop_watch.h"
44#include "util/thread_local.h"
45
f67539c2 46namespace ROCKSDB_NAMESPACE {
7c673cae 47
11fdf7f2 48class DBImpl;
7c673cae 49class MemTable;
11fdf7f2 50class SnapshotChecker;
7c673cae
FG
51class TableCache;
52class Version;
53class VersionEdit;
54class VersionSet;
55class Arena;
56
// FlushJob describes one flush job over the memtables of a single column
// family. The call order visible in this header is: construct, call
// PickMemTable() with db_mutex held, then call either Run() or Cancel().
// Only declarations live here; the constructor, destructor and all
// non-inline methods are defined elsewhere.
57class FlushJob {
58 public:
59 // TODO(icanadi) make effort to reduce number of parameters here
60 // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
61 FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
62 const ImmutableDBOptions& db_options,
63 const MutableCFOptions& mutable_cf_options,
f67539c2 64 const uint64_t* max_memtable_id, const FileOptions& file_options,
494da23a
TL
65 VersionSet* versions, InstrumentedMutex* db_mutex,
66 std::atomic<bool>* shutting_down,
7c673cae
FG
67 std::vector<SequenceNumber> existing_snapshots,
68 SequenceNumber earliest_write_conflict_snapshot,
11fdf7f2 69 SnapshotChecker* snapshot_checker, JobContext* job_context,
20effc67
TL
70 LogBuffer* log_buffer, FSDirectory* db_directory,
71 FSDirectory* output_file_directory,
72 CompressionType output_compression, Statistics* stats,
73 EventLogger* event_logger, bool measure_io_stats,
494da23a 74 const bool sync_output_directory, const bool write_manifest,
20effc67
TL
75 Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
76 const std::string& db_id = "", const std::string& db_session_id = "",
77 std::string full_history_ts_low = "");
7c673cae
FG
78
79 ~FlushJob();
80
81 // Require db_mutex held.
11fdf7f2 82 // Once PickMemTable() is called, either Run() or Cancel() has to be called.
7c673cae 83 void PickMemTable();
11fdf7f2
TL
 // Both arguments of Run() are optional and default to nullptr.
 // NOTE(review): what Run() does with a non-null prep_tracker / file_meta is
 // not visible in this header - check the definition before relying on it.
84 Status Run(LogsWithPrepTracker* prep_tracker = nullptr,
85 FileMetaData* file_meta = nullptr);
7c673cae 86 void Cancel();
 // Read-only view of mems_, which is listed below among the members set by
 // PickMemTable().
494da23a 87 const autovector<MemTable*>& GetMemTables() const { return mems_; }
7c673cae 88
f67539c2
TL
89#ifndef ROCKSDB_LITE
 // Hands out a pointer to this job's own committed_flush_jobs_info_ list, so
 // the caller can modify the list in place. Only declared when ROCKSDB_LITE
 // is not defined.
90 std::list<std::unique_ptr<FlushJobInfo>>* GetCommittedFlushJobsInfo() {
91 return &committed_flush_jobs_info_;
92 }
93#endif // !ROCKSDB_LITE
94
20effc67
TL
95 // Return the IO status
 // Returned by value, i.e. the caller receives a copy of io_status_.
96 IOStatus io_status() const { return io_status_; }
97
7c673cae
FG
98 private:
 // Internal helpers; only their declarations are present in this header.
99 void ReportStartedFlush();
100 void ReportFlushInputSize(const autovector<MemTable*>& mems);
101 void RecordFlushIOStats();
102 Status WriteLevel0Table();
f67539c2
TL
103#ifndef ROCKSDB_LITE
104 std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
105#endif // !ROCKSDB_LITE
494da23a 106
 // dbname_ is held by reference rather than copied, so the referenced string
 // has to outlive this FlushJob. db_id_ and db_session_id_ below are stored
 // by value and carry no such requirement.
7c673cae 107 const std::string& dbname_;
20effc67
TL
108 const std::string db_id_;
109 const std::string db_session_id_;
7c673cae
FG
110 ColumnFamilyData* cfd_;
 // Both option objects are reference members too; the lifetime warning on
 // the constructor (mutable_cf_options must stay alive) is a consequence of
 // that.
111 const ImmutableDBOptions& db_options_;
112 const MutableCFOptions& mutable_cf_options_;
494da23a
TL
113 // Pointer to a variable storing the largest memtable id to flush in this
114 // flush job. RocksDB uses this variable to select the memtables to flush in
115 // this job. All memtables in this column family with an ID smaller than or
116 // equal to *max_memtable_id_ will be selected for flush. If null, then all
117 // memtables in the column family will be selected.
118 const uint64_t* max_memtable_id_;
 // Stored by value (a const copy), unlike the two option references above.
f67539c2 119 const FileOptions file_options_;
7c673cae
FG
120 VersionSet* versions_;
121 InstrumentedMutex* db_mutex_;
122 std::atomic<bool>* shutting_down_;
123 std::vector<SequenceNumber> existing_snapshots_;
124 SequenceNumber earliest_write_conflict_snapshot_;
11fdf7f2 125 SnapshotChecker* snapshot_checker_;
7c673cae
FG
126 JobContext* job_context_;
127 LogBuffer* log_buffer_;
20effc67
TL
128 FSDirectory* db_directory_;
129 FSDirectory* output_file_directory_;
7c673cae
FG
130 CompressionType output_compression_;
131 Statistics* stats_;
132 EventLogger* event_logger_;
133 TableProperties table_properties_;
134 bool measure_io_stats_;
494da23a
TL
135 // True if this flush job should call fsync on the output directory. False
136 // otherwise.
137 // Usually sync_output_directory_ is true. A flush job needs to call sync on
138 // the output directory before committing to the MANIFEST.
139 // However, an individual flush job does not have to call sync on the output
140 // directory if it is part of an atomic flush. After all flush jobs in the
141 // atomic flush succeed, call sync once on each distinct output directory.
142 const bool sync_output_directory_;
143 // True if this flush job should write to MANIFEST after successfully
144 // flushing memtables. False otherwise.
145 // Usually write_manifest_ is true. A flush job commits to the MANIFEST after
146 // flushing the memtables.
147 // However, an individual flush job cannot rashly write to the MANIFEST
148 // immediately after it finishes the flush if it is part of an atomic flush.
149 // In this case, only after all flush jobs succeed in flush can RocksDB
150 // commit to the MANIFEST.
151 const bool write_manifest_;
f67539c2
TL
152 // The current flush job can commit flush result of a concurrent flush job.
153 // We collect FlushJobInfo of all jobs committed by current job and fire
154 // OnFlushCompleted for them.
155 std::list<std::unique_ptr<FlushJobInfo>> committed_flush_jobs_info_;
7c673cae
FG
156
157 // Variables below are set by PickMemTable():
158 FileMetaData meta_;
159 autovector<MemTable*> mems_;
160 VersionEdit* edit_;
161 Version* base_;
 // NOTE(review): this is the only data member without the trailing
 // underscore used everywhere else in the class. Renaming it would also
 // require changing the out-of-view definitions that use it.
162 bool pick_memtable_called;
 // NOTE(review): thread_pri_, io_tracer_ and full_history_ts_low_ share their
 // names with constructor parameters, so the "set by PickMemTable()" heading
 // above presumably does not cover the members from here on - TODO confirm
 // against the constructor definition.
494da23a 163 Env::Priority thread_pri_;
20effc67
TL
164 IOStatus io_status_;
165
166 const std::shared_ptr<IOTracer> io_tracer_;
167
168 const std::string full_history_ts_low_;
7c673cae
FG
169};
170
f67539c2 171} // namespace ROCKSDB_NAMESPACE