]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/include/rocksdb/db.h
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / rocksdb / include / rocksdb / db.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6// Use of this source code is governed by a BSD-style license that can be
7// found in the LICENSE file. See the AUTHORS file for names of contributors.
8
11fdf7f2 9#pragma once
7c673cae
FG
10
11#include <stdint.h>
12#include <stdio.h>
13#include <map>
14#include <memory>
15#include <string>
16#include <unordered_map>
17#include <vector>
18#include "rocksdb/iterator.h"
19#include "rocksdb/listener.h"
20#include "rocksdb/metadata.h"
21#include "rocksdb/options.h"
22#include "rocksdb/snapshot.h"
23#include "rocksdb/sst_file_writer.h"
24#include "rocksdb/thread_status.h"
25#include "rocksdb/transaction_log.h"
26#include "rocksdb/types.h"
27#include "rocksdb/version.h"
28
29#ifdef _WIN32
30// Windows API macro interference
31#undef DeleteFile
32#endif
33
// ROCKSDB_DEPRECATED_FUNC marks a function declaration as deprecated using
// whatever spelling the current toolchain understands.
#if defined(__GNUC__) || defined(__clang__)
#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
#elif defined(_WIN32)
#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
#else
// Unknown toolchain: expand to nothing so that declarations using the macro
// still compile, just without a deprecation warning.
#define ROCKSDB_DEPRECATED_FUNC
#endif
39
f67539c2 40namespace ROCKSDB_NAMESPACE {
7c673cae
FG
41
42struct Options;
43struct DBOptions;
44struct ColumnFamilyOptions;
45struct ReadOptions;
46struct WriteOptions;
47struct FlushOptions;
48struct CompactionOptions;
49struct CompactRangeOptions;
50struct TableProperties;
51struct ExternalSstFileInfo;
52class WriteBatch;
53class Env;
54class EventListener;
494da23a 55class StatsHistoryIterator;
11fdf7f2 56class TraceWriter;
494da23a
TL
57#ifdef ROCKSDB_LITE
58class CompactionJobInfo;
59#endif
f67539c2 60class FileSystem;
7c673cae
FG
61
62extern const std::string kDefaultColumnFamilyName;
f67539c2 63extern const std::string kPersistentStatsColumnFamilyName;
7c673cae
FG
64struct ColumnFamilyDescriptor {
65 std::string name;
66 ColumnFamilyOptions options;
67 ColumnFamilyDescriptor()
68 : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
69 ColumnFamilyDescriptor(const std::string& _name,
70 const ColumnFamilyOptions& _options)
71 : name(_name), options(_options) {}
72};
73
74class ColumnFamilyHandle {
75 public:
76 virtual ~ColumnFamilyHandle() {}
77 // Returns the name of the column family associated with the current handle.
78 virtual const std::string& GetName() const = 0;
79 // Returns the ID of the column family associated with the current handle.
80 virtual uint32_t GetID() const = 0;
81 // Fills "*desc" with the up-to-date descriptor of the column family
82 // associated with this handle. Since it fills "*desc" with the up-to-date
83 // information, this call might internally lock and release DB mutex to
84 // access the up-to-date CF options. In addition, all the pointer-typed
85 // options cannot be referenced any longer than the original options exist.
86 //
87 // Note that this function is not supported in RocksDBLite.
88 virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0;
89 // Returns the comparator of the column family associated with the
90 // current handle.
91 virtual const Comparator* GetComparator() const = 0;
92};
93
94static const int kMajorVersion = __ROCKSDB_MAJOR__;
95static const int kMinorVersion = __ROCKSDB_MINOR__;
96
97// A range of keys
98struct Range {
11fdf7f2
TL
99 Slice start;
100 Slice limit;
7c673cae 101
494da23a
TL
102 Range() {}
103 Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
7c673cae
FG
104};
105
11fdf7f2
TL
106struct RangePtr {
107 const Slice* start;
108 const Slice* limit;
109
494da23a
TL
110 RangePtr() : start(nullptr), limit(nullptr) {}
111 RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
112};
113
114struct IngestExternalFileArg {
115 ColumnFamilyHandle* column_family = nullptr;
116 std::vector<std::string> external_files;
117 IngestExternalFileOptions options;
11fdf7f2
TL
118};
119
f67539c2
TL
// Options passed to DB::GetMergeOperands().
struct GetMergeOperandsOptions {
  // Largest number of merge operands the caller expects for the key; the
  // caller's output array must have room for at least this many entries.
  // Defaults to 0.
  int expected_max_number_of_operands{0};
};
123
7c673cae
FG
124// A collection of table properties objects, where
125// key: is the table's file name.
126// value: the table properties object of the given table.
127typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
128 TablePropertiesCollection;
129
130// A DB is a persistent ordered map from keys to values.
131// A DB is safe for concurrent access from multiple threads without
132// any external synchronization.
133class DB {
134 public:
135 // Open the database with the specified "name".
136 // Stores a pointer to a heap-allocated database in *dbptr and returns
137 // OK on success.
138 // Stores nullptr in *dbptr and returns a non-OK status on error.
139 // Caller should delete *dbptr when it is no longer needed.
494da23a 140 static Status Open(const Options& options, const std::string& name,
7c673cae
FG
141 DB** dbptr);
142
143 // Open the database for read only. All DB interfaces
144 // that modify data, like put/delete, will return error.
145 // If the db is opened in read only mode, then no compactions
146 // will happen.
147 //
148 // Not supported in ROCKSDB_LITE, in which case the function will
149 // return Status::NotSupported.
494da23a
TL
150 static Status OpenForReadOnly(const Options& options, const std::string& name,
151 DB** dbptr,
152 bool error_if_log_file_exist = false);
7c673cae
FG
153
154 // Open the database for read only with column families. When opening DB with
155 // read only, you can specify only a subset of column families in the
156 // database that should be opened. However, you always need to specify default
157 // column family. The default column family name is 'default' and it's stored
f67539c2 158 // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
7c673cae
FG
159 //
160 // Not supported in ROCKSDB_LITE, in which case the function will
161 // return Status::NotSupported.
162 static Status OpenForReadOnly(
163 const DBOptions& db_options, const std::string& name,
164 const std::vector<ColumnFamilyDescriptor>& column_families,
165 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
166 bool error_if_log_file_exist = false);
167
494da23a
TL
168 // The following OpenAsSecondary functions create a secondary instance that
169 // can dynamically tail the MANIFEST of a primary that must have already been
170 // created. User can call TryCatchUpWithPrimary to make the secondary
171 // instance catch up with primary (WAL tailing is NOT supported now) whenever
172 // the user feels necessary. Column families created by the primary after the
173 // secondary instance starts are currently ignored by the secondary instance.
174 // Column families opened by secondary and dropped by the primary will be
175 // dropped by secondary as well. However the user of the secondary instance
176 // can still access the data of such dropped column family as long as they
177 // do not destroy the corresponding column family handle.
178 // WAL tailing is not supported at present, but will arrive soon.
179 //
180 // The options argument specifies the options to open the secondary instance.
181 // The name argument specifies the name of the primary db that you have used
182 // to open the primary instance.
183 // The secondary_path argument points to a directory where the secondary
184 // instance stores its info log.
185 // The dbptr is an out-arg corresponding to the opened secondary instance.
186 // The pointer points to a heap-allocated database, and the user should
187 // delete it after use.
188 // Open DB as secondary instance with only the default column family.
189 // Return OK on success, non-OK on failures.
190 static Status OpenAsSecondary(const Options& options, const std::string& name,
191 const std::string& secondary_path, DB** dbptr);
192
193 // Open DB as secondary instance with column families. You can open a subset
194 // of column families in secondary mode.
195 // The db_options specify the database specific options.
196 // The name argument specifies the name of the primary db that you have used
197 // to open the primary instance.
198 // The secondary_path argument points to a directory where the secondary
199 // instance stores its info log.
200 // The column_families argument specifies a list of column families to open.
201 // If any of the column families does not exist, the function returns non-OK
202 // status.
203 // The handles is an out-arg corresponding to the opened database column
204 // family handles.
205 // The dbptr is an out-arg corresponding to the opened secondary instance.
206 // The pointer points to a heap-allocated database, and the caller should
207 // delete it after use. Before deleting the dbptr, the user should also
208 // delete the pointers stored in handles vector.
209 // Return OK on success, non-OK on failures.
210 static Status OpenAsSecondary(
211 const DBOptions& db_options, const std::string& name,
212 const std::string& secondary_path,
213 const std::vector<ColumnFamilyDescriptor>& column_families,
214 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
215
7c673cae
FG
216 // Open DB with column families.
217 // db_options specify database specific options
218 // column_families is the vector of all column families in the database,
219 // containing column family name and options. You need to open ALL column
220 // families in the database. To get the list of column families, you can use
221 // ListColumnFamilies(). Also, you can open only a subset of column families
222 // for read-only access.
223 // The default column family name is 'default' and it's stored
f67539c2 224 // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
7c673cae
FG
225 // If everything is OK, handles will on return be the same size
226 // as column_families --- handles[i] will be a handle that you
227 // will use to operate on column family column_families[i].
228 // Before deleting the DB, you have to close all column families by calling
229 // DestroyColumnFamilyHandle() with all the handles.
230 static Status Open(const DBOptions& db_options, const std::string& name,
231 const std::vector<ColumnFamilyDescriptor>& column_families,
232 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
233
11fdf7f2
TL
234 virtual Status Resume() { return Status::NotSupported(); }
235
236 // Close the DB by releasing resources, closing files etc. This should be
237 // called before calling the destructor so that the caller can get back a
238 // status in case there are any errors. This will not fsync the WAL files.
239 // If syncing is required, the caller must first call SyncWAL(), or Write()
240 // using an empty write batch with WriteOptions.sync=true.
f67539c2
TL
241 // Regardless of the return status, the DB must be freed.
242 // If the return status is Aborted(), closing fails because there is
243 // unreleased snapshot in the system. In this case, users can release
244 // the unreleased snapshots and try again and expect it to succeed. For
245 // other status, recalling Close() will be no-op.
246 // If the return status is NotSupported(), then the DB implementation does
247 // cleanup in the destructor
11fdf7f2
TL
248 virtual Status Close() { return Status::NotSupported(); }
249
7c673cae
FG
250 // ListColumnFamilies will open the DB specified by argument name
251 // and return the list of all column families in that DB
252 // through column_families argument. The ordering of
253 // column families in column_families is unspecified.
254 static Status ListColumnFamilies(const DBOptions& db_options,
255 const std::string& name,
256 std::vector<std::string>* column_families);
257
494da23a 258 DB() {}
f67539c2
TL
259 // No copying allowed
260 DB(const DB&) = delete;
261 void operator=(const DB&) = delete;
262
7c673cae
FG
263 virtual ~DB();
264
265 // Create a column_family and return the handle of column family
266 // through the argument handle.
267 virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
268 const std::string& column_family_name,
269 ColumnFamilyHandle** handle);
270
11fdf7f2
TL
271 // Bulk create column families with the same column family options.
272 // Return the handles of the column families through the argument handles.
273 // In case of error, the request may succeed partially, and handles will
274 // contain column family handles that it managed to create, and have size
275 // equal to the number of created column families.
276 virtual Status CreateColumnFamilies(
277 const ColumnFamilyOptions& options,
278 const std::vector<std::string>& column_family_names,
279 std::vector<ColumnFamilyHandle*>* handles);
280
281 // Bulk create column families.
282 // Return the handles of the column families through the argument handles.
283 // In case of error, the request may succeed partially, and handles will
284 // contain column family handles that it managed to create, and have size
285 // equal to the number of created column families.
286 virtual Status CreateColumnFamilies(
287 const std::vector<ColumnFamilyDescriptor>& column_families,
288 std::vector<ColumnFamilyHandle*>* handles);
289
7c673cae
FG
290 // Drop a column family specified by column_family handle. This call
291 // only records a drop record in the manifest and prevents the column
292 // family from flushing and compacting.
293 virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
11fdf7f2
TL
294
295 // Bulk drop column families. This call only records drop records in the
296 // manifest and prevents the column families from flushing and compacting.
297 // In case of error, the request may succeed partially. User may call
298 // ListColumnFamilies to check the result.
299 virtual Status DropColumnFamilies(
300 const std::vector<ColumnFamilyHandle*>& column_families);
301
7c673cae
FG
302 // Close a column family specified by column_family handle and destroy
303 // the column family handle specified to avoid double deletion. This call
304 // deletes the column family handle by default. Use this method to
305 // close column family instead of deleting column family handle directly
306 virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family);
307
308 // Set the database entry for "key" to "value".
309 // If "key" already exists, it will be overwritten.
310 // Returns OK on success, and a non-OK status on error.
311 // Note: consider setting options.sync = true.
312 virtual Status Put(const WriteOptions& options,
313 ColumnFamilyHandle* column_family, const Slice& key,
314 const Slice& value) = 0;
315 virtual Status Put(const WriteOptions& options, const Slice& key,
316 const Slice& value) {
317 return Put(options, DefaultColumnFamily(), key, value);
318 }
319
320 // Remove the database entry (if any) for "key". Returns OK on
321 // success, and a non-OK status on error. It is not an error if "key"
322 // did not exist in the database.
323 // Note: consider setting options.sync = true.
324 virtual Status Delete(const WriteOptions& options,
325 ColumnFamilyHandle* column_family,
326 const Slice& key) = 0;
327 virtual Status Delete(const WriteOptions& options, const Slice& key) {
328 return Delete(options, DefaultColumnFamily(), key);
329 }
330
331 // Remove the database entry for "key". Requires that the key exists
332 // and was not overwritten. Returns OK on success, and a non-OK status
333 // on error. It is not an error if "key" did not exist in the database.
334 //
335 // If a key is overwritten (by calling Put() multiple times), then the result
336 // of calling SingleDelete() on this key is undefined. SingleDelete() only
337 // behaves correctly if there has been only one Put() for this key since the
338 // previous call to SingleDelete() for this key.
339 //
340 // This feature is currently an experimental performance optimization
341 // for a very specific workload. It is up to the caller to ensure that
342 // SingleDelete is only used for a key that is not deleted using Delete() or
343 // written using Merge(). Mixing SingleDelete operations with Deletes and
344 // Merges can result in undefined behavior.
345 //
346 // Note: consider setting options.sync = true.
347 virtual Status SingleDelete(const WriteOptions& options,
348 ColumnFamilyHandle* column_family,
349 const Slice& key) = 0;
350 virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
351 return SingleDelete(options, DefaultColumnFamily(), key);
352 }
353
354 // Removes the database entries in the range ["begin_key", "end_key"), i.e.,
355 // including "begin_key" and excluding "end_key". Returns OK on success, and
356 // a non-OK status on error. It is not an error if no keys exist in the range
357 // ["begin_key", "end_key").
358 //
494da23a
TL
359 // This feature is now usable in production, with the following caveats:
360 // 1) Accumulating many range tombstones in the memtable will degrade read
361 // performance; this can be avoided by manually flushing occasionally.
362 // 2) Limiting the maximum number of open files in the presence of range
363 // tombstones can degrade read performance. To avoid this problem, set
364 // max_open_files to -1 whenever possible.
7c673cae
FG
365 virtual Status DeleteRange(const WriteOptions& options,
366 ColumnFamilyHandle* column_family,
367 const Slice& begin_key, const Slice& end_key);
368
369 // Merge the database entry for "key" with "value". Returns OK on success,
370 // and a non-OK status on error. The semantics of this operation is
371 // determined by the user provided merge_operator when opening DB.
372 // Note: consider setting options.sync = true.
373 virtual Status Merge(const WriteOptions& options,
374 ColumnFamilyHandle* column_family, const Slice& key,
375 const Slice& value) = 0;
376 virtual Status Merge(const WriteOptions& options, const Slice& key,
377 const Slice& value) {
378 return Merge(options, DefaultColumnFamily(), key, value);
379 }
380
381 // Apply the specified updates to the database.
382 // If `updates` contains no update, WAL will still be synced if
383 // options.sync=true.
384 // Returns OK on success, non-OK on failure.
385 // Note: consider setting options.sync = true.
386 virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
387
388 // If the database contains an entry for "key" store the
389 // corresponding value in *value and return OK.
390 //
391 // If there is no entry for "key" leave *value unchanged and return
392 // a status for which Status::IsNotFound() returns true.
393 //
394 // May return some other Status on an error.
395 virtual inline Status Get(const ReadOptions& options,
396 ColumnFamilyHandle* column_family, const Slice& key,
397 std::string* value) {
398 assert(value != nullptr);
399 PinnableSlice pinnable_val(value);
400 assert(!pinnable_val.IsPinned());
401 auto s = Get(options, column_family, key, &pinnable_val);
402 if (s.ok() && pinnable_val.IsPinned()) {
403 value->assign(pinnable_val.data(), pinnable_val.size());
404 } // else value is already assigned
405 return s;
406 }
407 virtual Status Get(const ReadOptions& options,
408 ColumnFamilyHandle* column_family, const Slice& key,
409 PinnableSlice* value) = 0;
494da23a
TL
410 virtual Status Get(const ReadOptions& options, const Slice& key,
411 std::string* value) {
7c673cae
FG
412 return Get(options, DefaultColumnFamily(), key, value);
413 }
414
f67539c2
TL
415 // Returns all the merge operands corresponding to the key. If the
416 // number of merge operands in DB is greater than
417 // get_merge_operands_options->expected_max_number_of_operands,
418 // no merge operands are returned and status is Incomplete. Merge operands
419 // returned are in the order of insertion.
420 // merge_operands - Points to an array of at least
421 // get_merge_operands_options->expected_max_number_of_operands elements; the
422 // caller is responsible for allocating it. If the status
423 // returned is Incomplete then number_of_operands will contain
424 // the total number of merge operands found in DB for key.
425 virtual Status GetMergeOperands(
426 const ReadOptions& options, ColumnFamilyHandle* column_family,
427 const Slice& key, PinnableSlice* merge_operands,
428 GetMergeOperandsOptions* get_merge_operands_options,
429 int* number_of_operands) = 0;
430
7c673cae
FG
431 // If keys[i] does not exist in the database, then the i'th returned
432 // status will be one for which Status::IsNotFound() is true, and
433 // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
434 // the i'th returned status will have Status::ok() true, and (*values)[i]
435 // will store the value associated with keys[i].
436 //
437 // (*values) will always be resized to be the same size as (keys).
438 // Similarly, the number of returned statuses will be the number of keys.
439 // Note: keys will not be "de-duplicated". Duplicate keys will return
440 // duplicate values in order.
441 virtual std::vector<Status> MultiGet(
442 const ReadOptions& options,
443 const std::vector<ColumnFamilyHandle*>& column_family,
444 const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
445 virtual std::vector<Status> MultiGet(const ReadOptions& options,
446 const std::vector<Slice>& keys,
447 std::vector<std::string>* values) {
494da23a
TL
448 return MultiGet(
449 options,
450 std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
451 keys, values);
7c673cae
FG
452 }
453
f67539c2
TL
454 // Overloaded MultiGet API that improves performance by batching operations
455 // in the read path for greater efficiency. Currently, only the block based
456 // table format with full filters are supported. Other table formats such
457 // as plain table, block based table with block based filters and
458 // partitioned indexes will still work, but will not get any performance
459 // benefits.
460 // Parameters -
461 // options - ReadOptions
462 // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
463 // passed to the API are restricted to a single column family
464 // num_keys - Number of keys to lookup
465 // keys - Pointer to C style array of key Slices with num_keys elements
466 // values - Pointer to C style array of PinnableSlices with num_keys elements
467 // statuses - Pointer to C style array of Status with num_keys elements
468 // sorted_input - If true, it means the input keys are already sorted by key
469 // order, so the MultiGet() API doesn't have to sort them
470 // again. If false, the keys will be copied and sorted
471 // internally by the API - the input array will not be
472 // modified
473 virtual void MultiGet(const ReadOptions& options,
474 ColumnFamilyHandle* column_family,
475 const size_t num_keys, const Slice* keys,
476 PinnableSlice* values, Status* statuses,
477 const bool /*sorted_input*/ = false) {
478 std::vector<ColumnFamilyHandle*> cf;
479 std::vector<Slice> user_keys;
480 std::vector<Status> status;
481 std::vector<std::string> vals;
482
483 for (size_t i = 0; i < num_keys; ++i) {
484 cf.emplace_back(column_family);
485 user_keys.emplace_back(keys[i]);
486 }
487 status = MultiGet(options, cf, user_keys, &vals);
488 std::copy(status.begin(), status.end(), statuses);
489 for (auto& value : vals) {
490 values->PinSelf(value);
491 values++;
492 }
493 }
494
495 // Overloaded MultiGet API that improves performance by batching operations
496 // in the read path for greater efficiency. Currently, only the block based
497 // table format with full filters are supported. Other table formats such
498 // as plain table, block based table with block based filters and
499 // partitioned indexes will still work, but will not get any performance
500 // benefits.
501 // Parameters -
502 // options - ReadOptions
503 // column_families - Pointer to C style array of ColumnFamilyHandle* with
504 // num_keys elements; column_families[i] is used for keys[i]
505 // num_keys - Number of keys to lookup
506 // keys - Pointer to C style array of key Slices with num_keys elements
507 // values - Pointer to C style array of PinnableSlices with num_keys elements
508 // statuses - Pointer to C style array of Status with num_keys elements
509 // sorted_input - If true, it means the input keys are already sorted by key
510 // order, so the MultiGet() API doesn't have to sort them
511 // again. If false, the keys will be copied and sorted
512 // internally by the API - the input array will not be
513 // modified
514 virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
515 ColumnFamilyHandle** column_families, const Slice* keys,
516 PinnableSlice* values, Status* statuses,
517 const bool /*sorted_input*/ = false) {
518 std::vector<ColumnFamilyHandle*> cf;
519 std::vector<Slice> user_keys;
520 std::vector<Status> status;
521 std::vector<std::string> vals;
522
523 for (size_t i = 0; i < num_keys; ++i) {
524 cf.emplace_back(column_families[i]);
525 user_keys.emplace_back(keys[i]);
526 }
527 status = MultiGet(options, cf, user_keys, &vals);
528 std::copy(status.begin(), status.end(), statuses);
529 for (auto& value : vals) {
530 values->PinSelf(value);
531 values++;
532 }
533 }
534
7c673cae
FG
535 // If the key definitely does not exist in the database, then this method
536 // returns false, else true. If the caller wants to obtain value when the key
537 // is found in memory, a bool for 'value_found' must be passed. 'value_found'
538 // will be true on return if value has been set properly.
539 // This check is potentially lighter-weight than invoking DB::Get(). One way
540 // to make this lighter weight is to avoid doing any IOs.
541 // Default implementation here returns true and sets 'value_found' to false
542 virtual bool KeyMayExist(const ReadOptions& /*options*/,
543 ColumnFamilyHandle* /*column_family*/,
544 const Slice& /*key*/, std::string* /*value*/,
545 bool* value_found = nullptr) {
546 if (value_found != nullptr) {
547 *value_found = false;
548 }
549 return true;
550 }
551 virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
552 std::string* value, bool* value_found = nullptr) {
553 return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
554 }
555
556 // Return a heap-allocated iterator over the contents of the database.
557 // The result of NewIterator() is initially invalid (caller must
558 // call one of the Seek methods on the iterator before using it).
559 //
560 // Caller should delete the iterator when it is no longer needed.
561 // The returned iterator should be deleted before this db is deleted.
562 virtual Iterator* NewIterator(const ReadOptions& options,
563 ColumnFamilyHandle* column_family) = 0;
564 virtual Iterator* NewIterator(const ReadOptions& options) {
565 return NewIterator(options, DefaultColumnFamily());
566 }
567 // Returns iterators from a consistent database state across multiple
568 // column families. Iterators are heap allocated and need to be deleted
569 // before the db is deleted
570 virtual Status NewIterators(
571 const ReadOptions& options,
572 const std::vector<ColumnFamilyHandle*>& column_families,
573 std::vector<Iterator*>* iterators) = 0;
574
575 // Return a handle to the current DB state. Iterators created with
576 // this handle will all observe a stable snapshot of the current DB
577 // state. The caller must call ReleaseSnapshot(result) when the
578 // snapshot is no longer needed.
579 //
580 // nullptr will be returned if the DB fails to take a snapshot or does
581 // not support snapshot.
582 virtual const Snapshot* GetSnapshot() = 0;
583
584 // Release a previously acquired snapshot. The caller must not
585 // use "snapshot" after this call.
586 virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
587
588#ifndef ROCKSDB_LITE
589 // Contains all valid property arguments for GetProperty().
590 //
591 // NOTE: Property names cannot end in numbers since those are interpreted as
592 // arguments, e.g., see kNumFilesAtLevelPrefix.
593 struct Properties {
594 // "rocksdb.num-files-at-level<N>" - returns string containing the number
595 // of files at level <N>, where <N> is an ASCII representation of a
596 // level number (e.g., "0").
597 static const std::string kNumFilesAtLevelPrefix;
598
599 // "rocksdb.compression-ratio-at-level<N>" - returns string containing the
600 // compression ratio of data at level <N>, where <N> is an ASCII
601 // representation of a level number (e.g., "0"). Here, compression
602 // ratio is defined as uncompressed data size / compressed file size.
603 // Returns "-1.0" if no open files at level <N>.
604 static const std::string kCompressionRatioAtLevelPrefix;
605
606 // "rocksdb.stats" - returns a multi-line string containing the data
607 // described by kCFStats followed by the data described by kDBStats.
608 static const std::string kStats;
609
610 // "rocksdb.sstables" - returns a multi-line string summarizing current
611 // SST files.
612 static const std::string kSSTables;
613
614 // "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and
615 // "rocksdb.cf-file-histogram" together. See below for description
616 // of the two.
617 static const std::string kCFStats;
618
619 // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
620 // general column family stats per-level over db's lifetime ("L<n>"),
621 // aggregated over db's lifetime ("Sum"), and aggregated over the
622 // interval since the last retrieval ("Int").
623 // It could also be used to return the stats in the format of the map.
624 // In this case there will be a pair of string to array of double for
625 // each level as well as for "Sum". "Int" stats will not be affected
11fdf7f2 626 // when this form of stats are retrieved.
7c673cae
FG
627 static const std::string kCFStatsNoFileHistogram;
628
629 // "rocksdb.cf-file-histogram" - print out how many file reads to every
630 // level, as well as the histogram of latency of single requests.
631 static const std::string kCFFileHistogram;
632
633 // "rocksdb.dbstats" - returns a multi-line string with general database
634 // stats, both cumulative (over the db's lifetime) and interval (since
635 // the last retrieval of kDBStats).
636 static const std::string kDBStats;
637
638 // "rocksdb.levelstats" - returns multi-line string containing the number
639 // of files per level and total size of each level (MB).
640 static const std::string kLevelStats;
641
642 // "rocksdb.num-immutable-mem-table" - returns number of immutable
643 // memtables that have not yet been flushed.
644 static const std::string kNumImmutableMemTable;
645
646 // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
647 // memtables that have already been flushed.
648 static const std::string kNumImmutableMemTableFlushed;
649
650 // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
651 // pending; otherwise, returns 0.
652 static const std::string kMemTableFlushPending;
653
654 // "rocksdb.num-running-flushes" - returns the number of currently running
655 // flushes.
656 static const std::string kNumRunningFlushes;
657
658 // "rocksdb.compaction-pending" - returns 1 if at least one compaction is
659 // pending; otherwise, returns 0.
660 static const std::string kCompactionPending;
661
662 // "rocksdb.num-running-compactions" - returns the number of currently
663 // running compactions.
664 static const std::string kNumRunningCompactions;
665
666 // "rocksdb.background-errors" - returns accumulated number of background
667 // errors.
668 static const std::string kBackgroundErrors;
669
670 // "rocksdb.cur-size-active-mem-table" - returns approximate size of active
671 // memtable (bytes).
672 static const std::string kCurSizeActiveMemTable;
673
674 // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active
675 // and unflushed immutable memtables (bytes).
676 static const std::string kCurSizeAllMemTables;
677
678 // "rocksdb.size-all-mem-tables" - returns approximate size of active,
679 // unflushed immutable, and pinned immutable memtables (bytes).
680 static const std::string kSizeAllMemTables;
681
682 // "rocksdb.num-entries-active-mem-table" - returns total number of entries
683 // in the active memtable.
684 static const std::string kNumEntriesActiveMemTable;
685
686 // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries
687 // in the unflushed immutable memtables.
688 static const std::string kNumEntriesImmMemTables;
689
690 // "rocksdb.num-deletes-active-mem-table" - returns total number of delete
691 // entries in the active memtable.
692 static const std::string kNumDeletesActiveMemTable;
693
694 // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete
695 // entries in the unflushed immutable memtables.
696 static const std::string kNumDeletesImmMemTables;
697
698 // "rocksdb.estimate-num-keys" - returns estimated number of total keys in
699 // the active and unflushed immutable memtables and storage.
700 static const std::string kEstimateNumKeys;
701
702 // "rocksdb.estimate-table-readers-mem" - returns estimated memory used for
703 // reading SST tables, excluding memory used in block cache (e.g.,
704 // filter and index blocks).
705 static const std::string kEstimateTableReadersMem;
706
707 // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete
708 // files is enabled; otherwise, returns a non-zero number.
709 static const std::string kIsFileDeletionsEnabled;
710
711 // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the
712 // database.
713 static const std::string kNumSnapshots;
714
715 // "rocksdb.oldest-snapshot-time" - returns number representing unix
716 // timestamp of oldest unreleased snapshot.
717 static const std::string kOldestSnapshotTime;
718
f67539c2
TL
719 // "rocksdb.oldest-snapshot-sequence" - returns number representing
720 // sequence number of oldest unreleased snapshot.
721 static const std::string kOldestSnapshotSequence;
722
7c673cae
FG
723 // "rocksdb.num-live-versions" - returns number of live versions. `Version`
724 // is an internal data structure. See version_set.h for details. More
725 // live versions often mean more SST files are held from being deleted,
726 // by iterators or unfinished compactions.
727 static const std::string kNumLiveVersions;
728
11fdf7f2 729 // "rocksdb.current-super-version-number" - returns number of current LSM
7c673cae
FG
730 // version. It is a uint64_t integer number, incremented after there is
731 // any change to the LSM tree. The number is not preserved after restarting
732 // the DB. After DB restart, it will start from 0 again.
733 static const std::string kCurrentSuperVersionNumber;
734
735 // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of
736 // live data in bytes.
737 static const std::string kEstimateLiveDataSize;
738
11fdf7f2 739 // "rocksdb.min-log-number-to-keep" - return the minimum log number of the
7c673cae
FG
740 // log files that should be kept.
741 static const std::string kMinLogNumberToKeep;
742
494da23a
TL
743 // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file
744 // number for an obsolete SST to be kept. The max value of `uint64_t`
745 // will be returned if all obsolete files can be deleted.
746 static const std::string kMinObsoleteSstNumberToKeep;
747
7c673cae
FG
748 // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
749 // files.
750 // WARNING: may slow down online queries if there are too many files.
751 static const std::string kTotalSstFilesSize;
752
11fdf7f2
TL
753 // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST
754 // files belong to the latest LSM tree.
755 static const std::string kLiveSstFilesSize;
756
7c673cae
FG
757 // "rocksdb.base-level" - returns number of level to which L0 data will be
758 // compacted.
759 static const std::string kBaseLevel;
760
761 // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total
762 // number of bytes compaction needs to rewrite to get all levels down
763 // to under target size. Not valid for other compactions than level-
764 // based.
765 static const std::string kEstimatePendingCompactionBytes;
766
767 // "rocksdb.aggregated-table-properties" - returns a string representation
768 // of the aggregated table properties of the target column family.
769 static const std::string kAggregatedTableProperties;
770
771 // "rocksdb.aggregated-table-properties-at-level<N>", same as the previous
772 // one but only returns the aggregated table properties of the
773 // specified level "N" at the target column family.
774 static const std::string kAggregatedTablePropertiesAtLevel;
775
776 // "rocksdb.actual-delayed-write-rate" - returns the current actual delayed
777 // write rate. 0 means no delay.
778 static const std::string kActualDelayedWriteRate;
779
780 // "rocksdb.is-write-stopped" - Return 1 if write has been stopped.
781 static const std::string kIsWriteStopped;
11fdf7f2
TL
782
783 // "rocksdb.estimate-oldest-key-time" - returns an estimation of
784 // oldest key timestamp in the DB. Currently only available for
785 // FIFO compaction with
786 // compaction_options_fifo.allow_compaction = false.
787 static const std::string kEstimateOldestKeyTime;
788
789 // "rocksdb.block-cache-capacity" - returns block cache capacity.
790 static const std::string kBlockCacheCapacity;
791
792 // "rocksdb.block-cache-usage" - returns the memory size for the entries
793 // residing in block cache.
794 static const std::string kBlockCacheUsage;
795
796 // "rocksdb.block-cache-pinned-usage" - returns the memory size for the
797 // entries being pinned.
798 static const std::string kBlockCachePinnedUsage;
799
800 // "rocksdb.options-statistics" - returns multi-line string
801 // of options.statistics
802 static const std::string kOptionsStatistics;
7c673cae
FG
803 };
804#endif /* ROCKSDB_LITE */
805
806 // DB implementations can export properties about their state via this method.
807 // If "property" is a valid property understood by this DB implementation (see
808 // Properties struct above for valid options), fills "*value" with its current
809 // value and returns true. Otherwise, returns false.
810 virtual bool GetProperty(ColumnFamilyHandle* column_family,
811 const Slice& property, std::string* value) = 0;
812 virtual bool GetProperty(const Slice& property, std::string* value) {
813 return GetProperty(DefaultColumnFamily(), property, value);
814 }
815 virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
816 const Slice& property,
11fdf7f2 817 std::map<std::string, std::string>* value) = 0;
7c673cae 818 virtual bool GetMapProperty(const Slice& property,
11fdf7f2 819 std::map<std::string, std::string>* value) {
7c673cae
FG
820 return GetMapProperty(DefaultColumnFamily(), property, value);
821 }
822
823 // Similar to GetProperty(), but only works for a subset of properties whose
824 // return value is an integer. Return the value by integer. Supported
825 // properties:
826 // "rocksdb.num-immutable-mem-table"
827 // "rocksdb.mem-table-flush-pending"
828 // "rocksdb.compaction-pending"
829 // "rocksdb.background-errors"
830 // "rocksdb.cur-size-active-mem-table"
831 // "rocksdb.cur-size-all-mem-tables"
832 // "rocksdb.size-all-mem-tables"
833 // "rocksdb.num-entries-active-mem-table"
834 // "rocksdb.num-entries-imm-mem-tables"
835 // "rocksdb.num-deletes-active-mem-table"
836 // "rocksdb.num-deletes-imm-mem-tables"
837 // "rocksdb.estimate-num-keys"
838 // "rocksdb.estimate-table-readers-mem"
839 // "rocksdb.is-file-deletions-enabled"
840 // "rocksdb.num-snapshots"
841 // "rocksdb.oldest-snapshot-time"
842 // "rocksdb.num-live-versions"
843 // "rocksdb.current-super-version-number"
844 // "rocksdb.estimate-live-data-size"
845 // "rocksdb.min-log-number-to-keep"
494da23a 846 // "rocksdb.min-obsolete-sst-number-to-keep"
7c673cae 847 // "rocksdb.total-sst-files-size"
11fdf7f2 848 // "rocksdb.live-sst-files-size"
7c673cae
FG
849 // "rocksdb.base-level"
850 // "rocksdb.estimate-pending-compaction-bytes"
851 // "rocksdb.num-running-compactions"
852 // "rocksdb.num-running-flushes"
853 // "rocksdb.actual-delayed-write-rate"
854 // "rocksdb.is-write-stopped"
11fdf7f2
TL
855 // "rocksdb.estimate-oldest-key-time"
856 // "rocksdb.block-cache-capacity"
857 // "rocksdb.block-cache-usage"
858 // "rocksdb.block-cache-pinned-usage"
7c673cae
FG
859 virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
860 const Slice& property, uint64_t* value) = 0;
861 virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
862 return GetIntProperty(DefaultColumnFamily(), property, value);
863 }
864
865 // Reset internal stats for DB and all column families.
866 // Note this doesn't reset options.statistics as it is not owned by
867 // DB.
868 virtual Status ResetStats() {
869 return Status::NotSupported("Not implemented");
870 }
871
872 // Same as GetIntProperty(), but this one returns the aggregated int
873 // property from all column families.
874 virtual bool GetAggregatedIntProperty(const Slice& property,
875 uint64_t* value) = 0;
876
877 // Flags for DB::GetApproximateSizes that specify whether memtable
878 // stats should be included, or file stats approximation, or both.
// Bit flags that are OR-ed together into a uint8_t "include_flags" mask.
enum SizeApproximationFlags : uint8_t {
  NONE = 0x0,
  INCLUDE_MEMTABLES = 0x1,
  INCLUDE_FILES = 0x2
};
884
885 // For each i in [0,n-1], store in "sizes[i]", the approximate
886 // file system space used by keys in "[range[i].start .. range[i].limit)".
887 //
888 // Note that the returned sizes measure file system space usage, so
889 // if the user data compresses by a factor of ten, the returned
890 // sizes will be one-tenth the size of the corresponding user data size.
f67539c2
TL
891 virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
892 ColumnFamilyHandle* column_family,
893 const Range* range, int n,
894 uint64_t* sizes) = 0;
895
896 // Simpler versions of the GetApproximateSizes() method above.
897 // The include_flags argument must be of type DB::SizeApproximationFlags
898 // and can not be NONE.
7c673cae
FG
899 virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
900 const Range* range, int n, uint64_t* sizes,
f67539c2
TL
901 uint8_t include_flags = INCLUDE_FILES) {
902 SizeApproximationOptions options;
903 options.include_memtabtles =
904 (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0;
905 options.include_files =
906 (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0;
907 GetApproximateSizes(options, column_family, range, n, sizes);
908 }
7c673cae 909 virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
494da23a
TL
910 uint8_t include_flags = INCLUDE_FILES) {
911 GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
7c673cae
FG
912 }
913
914 // The method is similar to GetApproximateSizes, except it
915 // returns approximate number of records in memtables.
916 virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
917 const Range& range,
918 uint64_t* const count,
919 uint64_t* const size) = 0;
920 virtual void GetApproximateMemTableStats(const Range& range,
921 uint64_t* const count,
922 uint64_t* const size) {
923 GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size);
924 }
925
926 // Deprecated versions of GetApproximateSizes
927 ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes(
494da23a 928 const Range* range, int n, uint64_t* sizes, bool include_memtable) {
7c673cae
FG
929 uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES;
930 if (include_memtable) {
931 include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES;
932 }
933 GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
934 }
935 ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes(
494da23a
TL
936 ColumnFamilyHandle* column_family, const Range* range, int n,
937 uint64_t* sizes, bool include_memtable) {
7c673cae
FG
938 uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES;
939 if (include_memtable) {
940 include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES;
941 }
942 GetApproximateSizes(column_family, range, n, sizes, include_flags);
943 }
944
945 // Compact the underlying storage for the key range [*begin,*end].
946 // The actual compaction interval might be superset of [*begin, *end].
947 // In particular, deleted and overwritten versions are discarded,
948 // and the data is rearranged to reduce the cost of operations
949 // needed to access the data. This operation should typically only
950 // be invoked by users who understand the underlying implementation.
951 //
952 // begin==nullptr is treated as a key before all keys in the database.
953 // end==nullptr is treated as a key after all keys in the database.
954 // Therefore the following call will compact the entire database:
955 // db->CompactRange(options, nullptr, nullptr);
956 // Note that after the entire database is compacted, all data are pushed
957 // down to the last level containing any data. If the total data size after
958 // compaction is reduced, that level might not be appropriate for hosting all
959 // the files. In this case, client could set options.change_level to true, to
960 // move the files back to the minimum level capable of holding the data set
961 // or a given level (specified by non-negative options.target_level).
962 virtual Status CompactRange(const CompactRangeOptions& options,
963 ColumnFamilyHandle* column_family,
964 const Slice* begin, const Slice* end) = 0;
965 virtual Status CompactRange(const CompactRangeOptions& options,
966 const Slice* begin, const Slice* end) {
967 return CompactRange(options, DefaultColumnFamily(), begin, end);
968 }
969
970 ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
971 ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end,
972 bool change_level = false, int target_level = -1,
973 uint32_t target_path_id = 0) {
974 CompactRangeOptions options;
975 options.change_level = change_level;
976 options.target_level = target_level;
977 options.target_path_id = target_path_id;
978 return CompactRange(options, column_family, begin, end);
979 }
980
981 ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
982 const Slice* begin, const Slice* end, bool change_level = false,
983 int target_level = -1, uint32_t target_path_id = 0) {
984 CompactRangeOptions options;
985 options.change_level = change_level;
986 options.target_level = target_level;
987 options.target_path_id = target_path_id;
988 return CompactRange(options, DefaultColumnFamily(), begin, end);
989 }
990
991 virtual Status SetOptions(
992 ColumnFamilyHandle* /*column_family*/,
993 const std::unordered_map<std::string, std::string>& /*new_options*/) {
994 return Status::NotSupported("Not implemented");
995 }
996 virtual Status SetOptions(
997 const std::unordered_map<std::string, std::string>& new_options) {
998 return SetOptions(DefaultColumnFamily(), new_options);
999 }
1000
1001 virtual Status SetDBOptions(
1002 const std::unordered_map<std::string, std::string>& new_options) = 0;
1003
1004 // CompactFiles() inputs a list of files specified by file numbers and
1005 // compacts them to the specified level. Note that the behavior is different
1006 // from CompactRange() in that CompactFiles() performs the compaction job
1007 // using the CURRENT thread.
1008 //
1009 // @see GetDataBaseMetaData
1010 // @see GetColumnFamilyMetaData
1011 virtual Status CompactFiles(
1012 const CompactionOptions& compact_options,
1013 ColumnFamilyHandle* column_family,
494da23a
TL
1014 const std::vector<std::string>& input_file_names, const int output_level,
1015 const int output_path_id = -1,
1016 std::vector<std::string>* const output_file_names = nullptr,
1017 CompactionJobInfo* compaction_job_info = nullptr) = 0;
7c673cae
FG
1018
1019 virtual Status CompactFiles(
1020 const CompactionOptions& compact_options,
494da23a
TL
1021 const std::vector<std::string>& input_file_names, const int output_level,
1022 const int output_path_id = -1,
1023 std::vector<std::string>* const output_file_names = nullptr,
1024 CompactionJobInfo* compaction_job_info = nullptr) {
7c673cae 1025 return CompactFiles(compact_options, DefaultColumnFamily(),
11fdf7f2 1026 input_file_names, output_level, output_path_id,
494da23a 1027 output_file_names, compaction_job_info);
7c673cae
FG
1028 }
1029
1030 // This function will wait until all currently running background processes
1031 // finish. After it returns, no background process will be run until
11fdf7f2 1032 // ContinueBackgroundWork is called
7c673cae
FG
1033 virtual Status PauseBackgroundWork() = 0;
1034 virtual Status ContinueBackgroundWork() = 0;
1035
1036 // This function will enable automatic compactions for the given column
1037 // families if they were previously disabled. The function will first set the
1038 // disable_auto_compactions option for each column family to 'false', after
1039 // which it will schedule a flush/compaction.
1040 //
1041 // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API
1042 // does NOT schedule a flush/compaction afterwards, and only changes the
1043 // parameter itself within the column family option.
1044 //
1045 virtual Status EnableAutoCompaction(
1046 const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0;
1047
f67539c2
TL
1048 virtual void DisableManualCompaction() = 0;
1049 virtual void EnableManualCompaction() = 0;
1050
7c673cae
FG
1051 // Number of levels used for this DB.
1052 virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
1053 virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
1054
1055 // Maximum level to which a new compacted memtable is pushed if it
1056 // does not create overlap.
1057 virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
1058 virtual int MaxMemCompactionLevel() {
1059 return MaxMemCompactionLevel(DefaultColumnFamily());
1060 }
1061
1062 // Number of files in level-0 that would stop writes.
1063 virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
1064 virtual int Level0StopWriteTrigger() {
1065 return Level0StopWriteTrigger(DefaultColumnFamily());
1066 }
1067
1068 // Get DB name -- the exact same name that was provided as an argument to
1069 // DB::Open()
1070 virtual const std::string& GetName() const = 0;
1071
1072 // Get Env object from the DB
1073 virtual Env* GetEnv() const = 0;
1074
f67539c2
TL
1075 virtual FileSystem* GetFileSystem() const;
1076
7c673cae
FG
1077 // Get DB Options that we use. During the process of opening the
1078 // column family, the options provided when calling DB::Open() or
1079 // DB::CreateColumnFamily() will have been "sanitized" and transformed
1080 // in an implementation-defined manner.
1081 virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0;
1082 virtual Options GetOptions() const {
1083 return GetOptions(DefaultColumnFamily());
1084 }
1085
1086 virtual DBOptions GetDBOptions() const = 0;
1087
1088 // Flush all mem-table data.
494da23a
TL
1089 // Flush a single column family, even when atomic flush is enabled. To flush
1090 // multiple column families, use Flush(options, column_families).
7c673cae
FG
1091 virtual Status Flush(const FlushOptions& options,
1092 ColumnFamilyHandle* column_family) = 0;
1093 virtual Status Flush(const FlushOptions& options) {
1094 return Flush(options, DefaultColumnFamily());
1095 }
494da23a
TL
1096 // Flushes multiple column families.
1097 // If atomic flush is not enabled, Flush(options, column_families) is
1098 // equivalent to calling Flush(options, column_family) multiple times.
1099 // If atomic flush is enabled, Flush(options, column_families) will flush all
1100 // column families specified in 'column_families' up to the latest sequence
1101 // number at the time when flush is requested.
1102 // Note that RocksDB 5.15 and earlier may not be able to open later versions
1103 // with atomic flush enabled.
1104 virtual Status Flush(
1105 const FlushOptions& options,
1106 const std::vector<ColumnFamilyHandle*>& column_families) = 0;
7c673cae 1107
11fdf7f2
TL
1108 // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL
1109 // afterwards.
1110 virtual Status FlushWAL(bool /*sync*/) {
1111 return Status::NotSupported("FlushWAL not implemented");
1112 }
7c673cae
FG
1113 // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the
1114 // same as Write() with sync=true: in the latter case the changes won't be
1115 // visible until the sync is done.
1116 // Currently only works if allow_mmap_writes = false in Options.
1117 virtual Status SyncWAL() = 0;
1118
494da23a
TL
1119 // Lock the WAL. Also flushes the WAL after locking.
1120 virtual Status LockWAL() {
1121 return Status::NotSupported("LockWAL not implemented");
1122 }
1123
1124 // Unlock the WAL.
1125 virtual Status UnlockWAL() {
1126 return Status::NotSupported("UnlockWAL not implemented");
1127 }
1128
7c673cae
FG
1129 // The sequence number of the most recent transaction.
1130 virtual SequenceNumber GetLatestSequenceNumber() const = 0;
1131
11fdf7f2
TL
1132 // Instructs DB to preserve deletes with sequence numbers >= passed seqnum.
1133 // Has no effect if DBOptions.preserve_deletes is set to false.
1134 // This function assumes that user calls this function with monotonically
1135 // increasing seqnums (otherwise we can't guarantee that a particular delete
1136 // hasn't been already processed); returns true if the value was successfully
1137 // updated, false if user attempted to call it with seqnum <= current value.
1138 virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0;
1139
7c673cae
FG
1140#ifndef ROCKSDB_LITE
1141
1142 // Prevent file deletions. Compactions will continue to occur,
1143 // but no obsolete files will be deleted. Calling this multiple
1144 // times has the same effect as calling it once.
1145 virtual Status DisableFileDeletions() = 0;
1146
1147 // Allow compactions to delete obsolete files.
1148 // If force == true, the call to EnableFileDeletions() will guarantee that
1149 // file deletions are enabled after the call, even if DisableFileDeletions()
1150 // was called multiple times before.
1151 // If force == false, EnableFileDeletions will only enable file deletion
1152 // after it's been called at least as many times as DisableFileDeletions(),
1153 // enabling the two methods to be called by two threads concurrently without
1154 // synchronization -- i.e., file deletions will be enabled only after both
1155 // threads call EnableFileDeletions()
1156 virtual Status EnableFileDeletions(bool force = true) = 0;
1157
1158 // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
1159
1160 // Retrieve the list of all files in the database. The files are
11fdf7f2
TL
1161 // relative to the dbname and are not absolute paths. Despite being relative
1162 // paths, the file names begin with "/". The valid size of the manifest file
1163 // is returned in manifest_file_size. The manifest file is an ever growing
1164 // file, but only the portion specified by manifest_file_size is valid for
1165 // this snapshot. Setting flush_memtable to true does Flush before recording
1166 // the live files. Setting flush_memtable to false is useful when we don't
1167 // want to wait for flush which may have to wait for compaction to complete
1168 // taking an indeterminate time.
7c673cae
FG
1169 //
1170 // In case you have multiple column families, even if flush_memtable is true,
1171 // you still need to call GetSortedWalFiles after GetLiveFiles to compensate
1172 // for new data that arrived to already-flushed column families while other
1173 // column families were flushing
1174 virtual Status GetLiveFiles(std::vector<std::string>&,
1175 uint64_t* manifest_file_size,
1176 bool flush_memtable = true) = 0;
1177
1178 // Retrieve the sorted list of all wal files with earliest file first
1179 virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
1180
f67539c2
TL
1181 // Retrieve information about the current wal file
1182 //
1183 // Note that the log might have rolled after this call in which case
1184 // the current_log_file would not point to the current log file.
1185 //
1186 // Additionally, for the sake of optimization current_log_file->StartSequence
1187 // would always be set to 0
1188 virtual Status GetCurrentWalFile(
1189 std::unique_ptr<LogFile>* current_log_file) = 0;
1190
1191 // Retrieves the creation time of the oldest file in the DB.
1192 // This API only works if max_open_files = -1, if it is not then
1193 // Status returned is Status::NotSupported()
1194 // The file creation time is set using the env provided to the DB.
1195 // If the DB was created from a very old release then it's possible that
1196 // the SST files might not have file_creation_time property and even after
1197 // moving to a newer release it's possible that some files never got compacted
1198 // and may not have file_creation_time property. In both the cases
1199 // file_creation_time is considered 0 which means this API will return
1200 // creation_time = 0 as there wouldn't be a timestamp lower than 0.
1201 virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0;
1202
11fdf7f2 1203 // Note: this API is not yet consistent with WritePrepared transactions.
7c673cae
FG
1204 // Sets iter to an iterator that is positioned at a write-batch containing
1205 // seq_number. If the sequence number is non existent, it returns an iterator
1206 // at the first available seq_no after the requested seq_no
1207 // Returns Status::OK if iterator is valid
1208 // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
1209 // use this api, else the WAL files will get
1210 // cleared aggressively and the iterator might keep getting invalid before
1211 // an update is read.
1212 virtual Status GetUpdatesSince(
494da23a
TL
1213 SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
1214 const TransactionLogIterator::ReadOptions& read_options =
1215 TransactionLogIterator::ReadOptions()) = 0;
7c673cae
FG
1216
1217// Windows API macro interference
1218#undef DeleteFile
1219 // Delete the file name from the db directory and update the internal state to
1220 // reflect that. Supports deletion of sst and log files only. 'name' must be
1221 // path relative to the db directory. eg. 000001.sst, /archive/000003.log
1222 virtual Status DeleteFile(std::string name) = 0;
1223
1224 // Returns a list of all table files with their level, start key
1225 // and end key
1226 virtual void GetLiveFilesMetaData(
1227 std::vector<LiveFileMetaData>* /*metadata*/) {}
1228
1229 // Obtains the meta data of the specified column family of the DB.
7c673cae
FG
1230 virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
1231 ColumnFamilyMetaData* /*metadata*/) {}
1232
1233 // Get the metadata of the default column family.
494da23a 1234 void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
7c673cae
FG
1235 GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
1236 }
1237
1238 // IngestExternalFile() will load a list of external SST files (1) into the DB
11fdf7f2
TL
1239 // Two primary modes are supported:
1240 // - Duplicate keys in the new files will overwrite existing keys (default)
1241 // - Duplicate keys will be skipped (set ingest_behind=true)
1242 // In the first mode we will try to find the lowest possible level that
1243 // the file can fit in, and ingest the file into this level (2). A file that
1244 // has a key range that overlaps with the memtable key range will require us
1245 // to Flush the memtable first before ingesting the file.
1246 // In the second mode we will always ingest in the bottom most level (see
1247 // docs to IngestExternalFileOptions::ingest_behind).
7c673cae
FG
1248 //
1249 // (1) External SST files can be created using SstFileWriter
1250 // (2) We will try to ingest the files to the lowest possible level
11fdf7f2
TL
1251 // even if the file compression doesn't match the level compression
1252 // (3) If IngestExternalFileOptions->ingest_behind is set to true,
1253 // we always ingest at the bottommost level, which should be reserved
1254 // for this purpose (see DBOptions::allow_ingest_behind flag).
7c673cae
FG
1255 virtual Status IngestExternalFile(
1256 ColumnFamilyHandle* column_family,
1257 const std::vector<std::string>& external_files,
1258 const IngestExternalFileOptions& options) = 0;
1259
1260 virtual Status IngestExternalFile(
1261 const std::vector<std::string>& external_files,
1262 const IngestExternalFileOptions& options) {
1263 return IngestExternalFile(DefaultColumnFamily(), external_files, options);
1264 }
1265
494da23a
TL
1266 // IngestExternalFiles() will ingest files for multiple column families, and
1267 // record the result atomically to the MANIFEST.
1268 // If this function returns OK, all column families' ingestion must succeed.
1269 // If this function returns NOK, or the process crashes, then none of the
1270 // files will be ingested into the database after recovery.
1271 // Note that it is possible for application to observe a mixed state during
1272 // the execution of this function. If the user performs range scan over the
1273 // column families with iterators, iterator on one column family may return
1274 // ingested data, while iterator on other column family returns old data.
1275 // Users can use snapshot for a consistent view of data.
1276 // If your db ingests multiple SST files using this API, i.e. args.size()
1277 // > 1, then RocksDB 5.15 and earlier will not be able to open it.
1278 //
1279 // REQUIRES: each arg corresponds to a different column family: namely, for
1280 // 0 <= i < j < len(args), args[i].column_family != args[j].column_family.
1281 virtual Status IngestExternalFiles(
1282 const std::vector<IngestExternalFileArg>& args) = 0;
1283
f67539c2
TL
1284 // CreateColumnFamilyWithImport() will create a new column family with
1285 // column_family_name and import external SST files specified in metadata into
1286 // this column family.
1287 // (1) External SST files can be created using SstFileWriter.
1288 // (2) External SST files can be exported from a particular column family in
1289 // an existing DB.
1290 // Option in import_options specifies whether the external files are copied or
1291 // moved (default is copy). When option specifies copy, managing files at
1292 // external_file_path is caller's responsibility. When option specifies a
1293 // move, the call ensures that the specified files at external_file_path are
1294 // deleted on successful return and files are not modified on any error
1295 // return.
1296 // On error return, column family handle returned will be nullptr.
1297 // ColumnFamily will be present on successful return and will not be present
1298 // on error return. ColumnFamily may be present on any crash during this call.
1299 virtual Status CreateColumnFamilyWithImport(
1300 const ColumnFamilyOptions& options, const std::string& column_family_name,
1301 const ImportColumnFamilyOptions& import_options,
1302 const ExportImportFilesMetaData& metadata,
1303 ColumnFamilyHandle** handle) = 0;
1304
1305 virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
1306
1307 virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
11fdf7f2 1308
7c673cae
FG
1309 // AddFile() is deprecated, please use IngestExternalFile()
1310 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1311 ColumnFamilyHandle* column_family,
1312 const std::vector<std::string>& file_path_list, bool move_file = false,
1313 bool skip_snapshot_check = false) {
1314 IngestExternalFileOptions ifo;
1315 ifo.move_files = move_file;
1316 ifo.snapshot_consistency = !skip_snapshot_check;
1317 ifo.allow_global_seqno = false;
1318 ifo.allow_blocking_flush = false;
1319 return IngestExternalFile(column_family, file_path_list, ifo);
1320 }
1321
1322 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1323 const std::vector<std::string>& file_path_list, bool move_file = false,
1324 bool skip_snapshot_check = false) {
1325 IngestExternalFileOptions ifo;
1326 ifo.move_files = move_file;
1327 ifo.snapshot_consistency = !skip_snapshot_check;
1328 ifo.allow_global_seqno = false;
1329 ifo.allow_blocking_flush = false;
1330 return IngestExternalFile(DefaultColumnFamily(), file_path_list, ifo);
1331 }
1332
1333 // AddFile() is deprecated, please use IngestExternalFile()
1334 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1335 ColumnFamilyHandle* column_family, const std::string& file_path,
1336 bool move_file = false, bool skip_snapshot_check = false) {
1337 IngestExternalFileOptions ifo;
1338 ifo.move_files = move_file;
1339 ifo.snapshot_consistency = !skip_snapshot_check;
1340 ifo.allow_global_seqno = false;
1341 ifo.allow_blocking_flush = false;
1342 return IngestExternalFile(column_family, {file_path}, ifo);
1343 }
1344
1345 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1346 const std::string& file_path, bool move_file = false,
1347 bool skip_snapshot_check = false) {
1348 IngestExternalFileOptions ifo;
1349 ifo.move_files = move_file;
1350 ifo.snapshot_consistency = !skip_snapshot_check;
1351 ifo.allow_global_seqno = false;
1352 ifo.allow_blocking_flush = false;
1353 return IngestExternalFile(DefaultColumnFamily(), {file_path}, ifo);
1354 }
1355
1356 // Load table file with information "file_info" into "column_family"
1357 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1358 ColumnFamilyHandle* column_family,
1359 const std::vector<ExternalSstFileInfo>& file_info_list,
1360 bool move_file = false, bool skip_snapshot_check = false) {
1361 std::vector<std::string> external_files;
1362 for (const ExternalSstFileInfo& file_info : file_info_list) {
1363 external_files.push_back(file_info.file_path);
1364 }
1365 IngestExternalFileOptions ifo;
1366 ifo.move_files = move_file;
1367 ifo.snapshot_consistency = !skip_snapshot_check;
1368 ifo.allow_global_seqno = false;
1369 ifo.allow_blocking_flush = false;
1370 return IngestExternalFile(column_family, external_files, ifo);
1371 }
1372
1373 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1374 const std::vector<ExternalSstFileInfo>& file_info_list,
1375 bool move_file = false, bool skip_snapshot_check = false) {
1376 std::vector<std::string> external_files;
1377 for (const ExternalSstFileInfo& file_info : file_info_list) {
1378 external_files.push_back(file_info.file_path);
1379 }
1380 IngestExternalFileOptions ifo;
1381 ifo.move_files = move_file;
1382 ifo.snapshot_consistency = !skip_snapshot_check;
1383 ifo.allow_global_seqno = false;
1384 ifo.allow_blocking_flush = false;
1385 return IngestExternalFile(DefaultColumnFamily(), external_files, ifo);
1386 }
1387
1388 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1389 ColumnFamilyHandle* column_family, const ExternalSstFileInfo* file_info,
1390 bool move_file = false, bool skip_snapshot_check = false) {
1391 IngestExternalFileOptions ifo;
1392 ifo.move_files = move_file;
1393 ifo.snapshot_consistency = !skip_snapshot_check;
1394 ifo.allow_global_seqno = false;
1395 ifo.allow_blocking_flush = false;
1396 return IngestExternalFile(column_family, {file_info->file_path}, ifo);
1397 }
1398
1399 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1400 const ExternalSstFileInfo* file_info, bool move_file = false,
1401 bool skip_snapshot_check = false) {
1402 IngestExternalFileOptions ifo;
1403 ifo.move_files = move_file;
1404 ifo.snapshot_consistency = !skip_snapshot_check;
1405 ifo.allow_global_seqno = false;
1406 ifo.allow_blocking_flush = false;
1407 return IngestExternalFile(DefaultColumnFamily(), {file_info->file_path},
1408 ifo);
1409 }
1410
1411#endif // ROCKSDB_LITE
1412
f67539c2
TL
1413 // Stores in `identity` the database's unique ID, which is read from the
1414 // IDENTITY file while the database is being opened.
1415 // Returns Status::OK if identity could be set properly.
7c673cae
FG
1416 virtual Status GetDbIdentity(std::string& identity) const = 0;
1417
1418 // Returns the handle of the default column family.
// The convenience overloads in this class that take no explicit column
// family (e.g. AddFile() and GetPropertiesOfAllTables()) pass this handle on.
1419 virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
1420
1421#ifndef ROCKSDB_LITE
// Pure virtual overload taking an explicit column family.
// NOTE(review): presumably fills *props with the properties of every table
// in column_family and reports failure through the returned Status -- confirm.
1422 virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
1423 TablePropertiesCollection* props) = 0;
1424 virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
1425 return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
1426 }
// Pure virtual; no default implementation is provided here.
// NOTE(review): presumably `range` points to an array of `n` Range entries
// and *props receives the properties of the tables overlapping them within
// column_family -- confirm against the implementation.
1427 virtual Status GetPropertiesOfTablesInRange(
1428 ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
1429 TablePropertiesCollection* props) = 0;
11fdf7f2
TL
1430
1431 virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
1432 const Slice* /*begin*/,
1433 const Slice* /*end*/) {
1434 return Status::NotSupported("SuggestCompactRange() is not implemented.");
1435 }
1436
1437 virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/,
1438 int /*target_level*/) {
1439 return Status::NotSupported("PromoteL0() is not implemented.");
1440 }
1441
1442 // Trace DB operations. Use EndTrace() to stop tracing.
1443 virtual Status StartTrace(const TraceOptions& /*options*/,
1444 std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
1445 return Status::NotSupported("StartTrace() is not implemented.");
1446 }
1447
1448 virtual Status EndTrace() {
1449 return Status::NotSupported("EndTrace() is not implemented.");
1450 }
f67539c2
TL
1451
1452 // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
1453 virtual Status StartBlockCacheTrace(
1454 const TraceOptions& /*options*/,
1455 std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
1456 return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
1457 }
1458
1459 virtual Status EndBlockCacheTrace() {
1460 return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
1461 }
7c673cae
FG
1462#endif // ROCKSDB_LITE
1463
1464 // Needed for StackableDB. This base implementation returns this object.
// NOTE(review): wrapper classes such as StackableDB presumably override this
// to return the DB they wrap -- confirm.
1465 virtual DB* GetRootDB() { return this; }
1466
f67539c2
TL
1467 // Given a window [start_time, end_time), setup a StatsHistoryIterator
1468 // to access stats history. Note the start_time and end_time are epoch
1469 // time measured in seconds, and end_time is an exclusive bound.
494da23a
TL
1470 virtual Status GetStatsHistory(
1471 uint64_t /*start_time*/, uint64_t /*end_time*/,
1472 std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) {
1473 return Status::NotSupported("GetStatsHistory() is not implemented.");
1474 }
1475
1476#ifndef ROCKSDB_LITE
1477 // Make the secondary instance catch up with the primary by tailing and
1478 // replaying the MANIFEST and WAL of the primary.
1479 // Column families created by the primary after the secondary instance starts
1480 // will be ignored unless the secondary instance closes and restarts with the
1481 // newly created column families.
1482 // Column families that exist before secondary instance starts and dropped by
1483 // the primary afterwards will be marked as dropped. However, as long as the
1484 // secondary instance does not delete the corresponding column family
1485 // handles, the data of the column family is still accessible to the
1486 // secondary.
1487 // TODO: we will support WAL tailing soon.
1488 virtual Status TryCatchUpWithPrimary() {
1489 return Status::NotSupported("Supported only by secondary instance");
1490 }
1491#endif // !ROCKSDB_LITE
7c673cae
FG
1492};
1493
1494// Destroys the contents of the specified database.
1495// Be very careful when using this function.
// `column_families` defaults to an empty list when omitted.
11fdf7f2
TL
1496Status DestroyDB(const std::string& name, const Options& options,
1497 const std::vector<ColumnFamilyDescriptor>& column_families =
494da23a 1498 std::vector<ColumnFamilyDescriptor>());
7c673cae
FG
1499
1500#ifndef ROCKSDB_LITE
1501// If a DB cannot be opened, you may attempt to call this function to
1502// resurrect as much of the contents of the database as possible.
1503// Some data may be lost, so be careful when calling this function
1504// on a database that contains important information.
1505//
1506// With this overload, data associated with column families that are not
1507// specified in column_families is skipped with a warning.
1508//
1509// @param column_families Descriptors for known column families
1510Status RepairDB(const std::string& dbname, const DBOptions& db_options,
1511 const std::vector<ColumnFamilyDescriptor>& column_families);
1512
// RepairDB() overload that additionally accepts options for column families
// that are not listed in column_families.
1513// @param unknown_cf_opts Options for column families encountered during the
1514// repair that were not specified in column_families.
1515Status RepairDB(const std::string& dbname, const DBOptions& db_options,
1516 const std::vector<ColumnFamilyDescriptor>& column_families,
1517 const ColumnFamilyOptions& unknown_cf_opts);
1518
// RepairDB() overload that takes a single Options object instead of separate
// DB options and column family descriptors.
1519// @param options These options will be used for the database and for ALL
1520// column families encountered during the repair.
1521Status RepairDB(const std::string& dbname, const Options& options);
1522
1523#endif
1524
f67539c2 1525} // namespace ROCKSDB_NAMESPACE