]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/include/rocksdb/db.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / include / rocksdb / db.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6// Use of this source code is governed by a BSD-style license that can be
7// found in the LICENSE file. See the AUTHORS file for names of contributors.
8
11fdf7f2 9#pragma once
7c673cae
FG
10
11#include <stdint.h>
12#include <stdio.h>
1e59de90 13
7c673cae
FG
14#include <map>
15#include <memory>
16#include <string>
17#include <unordered_map>
18#include <vector>
1e59de90
TL
19
20#include "rocksdb/block_cache_trace_writer.h"
7c673cae
FG
21#include "rocksdb/iterator.h"
22#include "rocksdb/listener.h"
23#include "rocksdb/metadata.h"
24#include "rocksdb/options.h"
25#include "rocksdb/snapshot.h"
26#include "rocksdb/sst_file_writer.h"
27#include "rocksdb/thread_status.h"
28#include "rocksdb/transaction_log.h"
29#include "rocksdb/types.h"
30#include "rocksdb/version.h"
1e59de90 31#include "rocksdb/wide_columns.h"
7c673cae
FG
32
33#ifdef _WIN32
34// Windows API macro interference
35#undef DeleteFile
36#endif
37
// ROCKSDB_DEPRECATED_FUNC marks a function declaration as deprecated using
// the compiler-specific spelling. On toolchains that are neither GCC/Clang
// nor Windows it expands to nothing, so declarations that use it still
// compile there.
#if defined(__GNUC__) || defined(__clang__)
#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
#elif defined(_WIN32)
#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
#else
#define ROCKSDB_DEPRECATED_FUNC
#endif
43
f67539c2 44namespace ROCKSDB_NAMESPACE {
7c673cae 45
7c673cae 46struct ColumnFamilyOptions;
7c673cae
FG
47struct CompactionOptions;
48struct CompactRangeOptions;
1e59de90 49struct DBOptions;
7c673cae 50struct ExternalSstFileInfo;
1e59de90
TL
51struct FlushOptions;
52struct Options;
53struct ReadOptions;
54struct TableProperties;
55struct WriteOptions;
56#ifdef ROCKSDB_LITE
57class CompactionJobInfo;
58#endif
7c673cae
FG
59class Env;
60class EventListener;
1e59de90
TL
61class FileSystem;
62#ifndef ROCKSDB_LITE
63class Replayer;
64#endif
494da23a 65class StatsHistoryIterator;
1e59de90
TL
66#ifndef ROCKSDB_LITE
67class TraceReader;
11fdf7f2 68class TraceWriter;
494da23a 69#endif
1e59de90 70class WriteBatch;
7c673cae
FG
71
72extern const std::string kDefaultColumnFamilyName;
f67539c2 73extern const std::string kPersistentStatsColumnFamilyName;
7c673cae
FG
74struct ColumnFamilyDescriptor {
75 std::string name;
76 ColumnFamilyOptions options;
77 ColumnFamilyDescriptor()
78 : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
79 ColumnFamilyDescriptor(const std::string& _name,
80 const ColumnFamilyOptions& _options)
81 : name(_name), options(_options) {}
82};
83
84class ColumnFamilyHandle {
85 public:
86 virtual ~ColumnFamilyHandle() {}
87 // Returns the name of the column family associated with the current handle.
88 virtual const std::string& GetName() const = 0;
89 // Returns the ID of the column family associated with the current handle.
90 virtual uint32_t GetID() const = 0;
91 // Fills "*desc" with the up-to-date descriptor of the column family
92 // associated with this handle. Since it fills "*desc" with the up-to-date
93 // information, this call might internally lock and release DB mutex to
94 // access the up-to-date CF options. In addition, all the pointer-typed
95 // options cannot be referenced any longer than the original options exist.
96 //
97 // Note that this function is not supported in RocksDBLite.
98 virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0;
99 // Returns the comparator of the column family associated with the
100 // current handle.
101 virtual const Comparator* GetComparator() const = 0;
102};
103
104static const int kMajorVersion = __ROCKSDB_MAJOR__;
105static const int kMinorVersion = __ROCKSDB_MINOR__;
106
107// A range of keys
108struct Range {
11fdf7f2
TL
109 Slice start;
110 Slice limit;
7c673cae 111
494da23a
TL
112 Range() {}
113 Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
7c673cae
FG
114};
115
11fdf7f2
TL
116struct RangePtr {
117 const Slice* start;
118 const Slice* limit;
119
494da23a
TL
120 RangePtr() : start(nullptr), limit(nullptr) {}
121 RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
122};
123
20effc67 124// It is valid that files_checksums and files_checksum_func_names are both
1e59de90 125// empty (no checksum information is provided for ingestion). Otherwise,
20effc67
TL
126// their sizes should be the same as external_files. The file order should
127// be the same in three vectors and guaranteed by the caller.
1e59de90
TL
128// Note that, we assume the temperatures of this batch of files to be
129// ingested are the same.
494da23a
TL
130struct IngestExternalFileArg {
131 ColumnFamilyHandle* column_family = nullptr;
132 std::vector<std::string> external_files;
133 IngestExternalFileOptions options;
20effc67
TL
134 std::vector<std::string> files_checksums;
135 std::vector<std::string> files_checksum_func_names;
1e59de90 136 Temperature file_temperature = Temperature::kUnknown;
11fdf7f2
TL
137};
138
f67539c2
TL
// Options for DB::GetMergeOperands().
struct GetMergeOperandsOptions {
  // Number of merge operands the caller expects at most; defaults to 0.
  int expected_max_number_of_operands = 0;
};
142
7c673cae
FG
143// A collections of table properties objects, where
144// key: is the table's file name.
145// value: the table properties object of the given table.
1e59de90
TL
146using TablePropertiesCollection =
147 std::unordered_map<std::string, std::shared_ptr<const TableProperties>>;
7c673cae 148
20effc67 149// A DB is a persistent, versioned ordered map from keys to values.
7c673cae
FG
150// A DB is safe for concurrent access from multiple threads without
151// any external synchronization.
20effc67
TL
152// DB is an abstract base class with one primary implementation (DBImpl)
153// and a number of wrapper implementations.
7c673cae
FG
154class DB {
155 public:
1e59de90 156 // Open the database with the specified "name" for reads and writes.
7c673cae
FG
157 // Stores a pointer to a heap-allocated database in *dbptr and returns
158 // OK on success.
1e59de90
TL
159 // Stores nullptr in *dbptr and returns a non-OK status on error, including
160 // if the DB is already open (read-write) by another DB object. (This
161 // guarantee depends on options.env->LockFile(), which might not provide
162 // this guarantee in a custom Env implementation.)
163 //
164 // Caller must delete *dbptr when it is no longer needed.
494da23a 165 static Status Open(const Options& options, const std::string& name,
7c673cae
FG
166 DB** dbptr);
167
1e59de90
TL
168 // Open DB with column families.
169 // db_options specify database specific options
170 // column_families is the vector of all column families in the database,
171 // containing column family name and options. You need to open ALL column
172 // families in the database. To get the list of column families, you can use
173 // ListColumnFamilies().
174 //
175 // The default column family name is 'default' and it's stored
176 // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
177 // If everything is OK, handles will on return be the same size
178 // as column_families --- handles[i] will be a handle that you
179 // will use to operate on column family column_family[i].
180 // Before delete DB, you have to close All column families by calling
181 // DestroyColumnFamilyHandle() with all the handles.
182 static Status Open(const DBOptions& db_options, const std::string& name,
183 const std::vector<ColumnFamilyDescriptor>& column_families,
184 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
185
186 // OpenForReadOnly() creates a Read-only instance that supports reads alone.
187 //
188 // All DB interfaces that modify data, like put/delete, will return error.
189 // Automatic Flush and Compactions are disabled and any manual calls
190 // to Flush/Compaction will return error.
191 //
192 // While a given DB can be simultaneously opened via OpenForReadOnly
193 // by any number of readers, if a DB is simultaneously opened by Open
194 // and OpenForReadOnly, the read-only instance has undefined behavior
195 // (though can often succeed if quickly closed) and the read-write
196 // instance is unaffected. See also OpenAsSecondary.
197
198 // Open the database for read only.
7c673cae
FG
199 //
200 // Not supported in ROCKSDB_LITE, in which case the function will
201 // return Status::NotSupported.
494da23a
TL
202 static Status OpenForReadOnly(const Options& options, const std::string& name,
203 DB** dbptr,
20effc67 204 bool error_if_wal_file_exists = false);
7c673cae 205
1e59de90
TL
206 // Open the database for read only with column families.
207 //
208 // When opening DB with read only, you can specify only a subset of column
209 // families in the database that should be opened. However, you always need
210 // to specify default column family. The default column family name is
211 // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
7c673cae
FG
212 //
213 // Not supported in ROCKSDB_LITE, in which case the function will
214 // return Status::NotSupported.
215 static Status OpenForReadOnly(
216 const DBOptions& db_options, const std::string& name,
217 const std::vector<ColumnFamilyDescriptor>& column_families,
218 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
20effc67 219 bool error_if_wal_file_exists = false);
7c673cae 220
1e59de90
TL
221 // OpenAsSecondary() creates a secondary instance that supports read-only
222 // operations and supports dynamic catch up with the primary (through a
223 // call to TryCatchUpWithPrimary()).
224 //
225 // All DB interfaces that modify data, like put/delete, will return error.
226 // Automatic Flush and Compactions are disabled and any manual calls
227 // to Flush/Compaction will return error.
228 //
229 // Multiple secondary instances can co-exist at the same time.
230 //
231
232 // Open DB as secondary instance
494da23a
TL
233 //
234 // The options argument specifies the options to open the secondary instance.
1e59de90 235 // Options.max_open_files should be set to -1.
494da23a
TL
236 // The name argument specifies the name of the primary db that you have used
237 // to open the primary instance.
238 // The secondary_path argument points to a directory where the secondary
239 // instance stores its info log.
240 // The dbptr is an out-arg corresponding to the opened secondary instance.
1e59de90 241 // The pointer points to a heap-allocated database, and the caller should
494da23a 242 // delete it after use.
1e59de90 243 //
494da23a
TL
244 // Return OK on success, non-OK on failures.
245 static Status OpenAsSecondary(const Options& options, const std::string& name,
246 const std::string& secondary_path, DB** dbptr);
247
1e59de90
TL
248 // Open DB as secondary instance with specified column families
249 //
250 // When opening DB in secondary mode, you can specify only a subset of column
251 // families in the database that should be opened. However, you always need
252 // to specify default column family. The default column family name is
253 // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
254 //
255 // Column families created by the primary after the secondary instance starts
256 // are currently ignored by the secondary instance. Column families opened
257 // by secondary and dropped by the primary will be dropped by secondary as
258 // well (on next invocation of TryCatchUpWithPrimary()). However the user
259 // of the secondary instance can still access the data of such dropped column
260 // family as long as they do not destroy the corresponding column family
261 // handle.
262 //
263 // The options argument specifies the options to open the secondary instance.
264 // Options.max_open_files should be set to -1.
494da23a
TL
265 // The name argument specifies the name of the primary db that you have used
266 // to open the primary instance.
267 // The secondary_path argument points to a directory where the secondary
268 // instance stores its info log.
1e59de90
TL
269 // The column_families argument specifies a list of column families to open.
270 // If default column family is not specified or if any specified column
271 // families does not exist, the function returns non-OK status.
494da23a 272 // The handles is an out-arg corresponding to the opened database column
1e59de90 273 // family handles.
494da23a
TL
274 // The dbptr is an out-arg corresponding to the opened secondary instance.
275 // The pointer points to a heap-allocated database, and the caller should
276 // delete it after use. Before deleting the dbptr, the user should also
277 // delete the pointers stored in handles vector.
1e59de90
TL
278 //
279 // Return OK on success, non-OK on failures.
494da23a
TL
280 static Status OpenAsSecondary(
281 const DBOptions& db_options, const std::string& name,
282 const std::string& secondary_path,
283 const std::vector<ColumnFamilyDescriptor>& column_families,
284 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
285
1e59de90
TL
286 // Open DB and run the compaction.
287 // It's a read-only operation, the result won't be installed to the DB, it
288 // will be output to the `output_directory`. The API should only be used with
289 // `options.CompactionService` to run compaction triggered by
290 // `CompactionService`.
291 static Status OpenAndCompact(
292 const std::string& name, const std::string& output_directory,
293 const std::string& input, std::string* output,
294 const CompactionServiceOptionsOverride& override_options);
295
296 static Status OpenAndCompact(
297 const OpenAndCompactOptions& options, const std::string& name,
298 const std::string& output_directory, const std::string& input,
299 std::string* output,
300 const CompactionServiceOptionsOverride& override_options);
301
302 // Experimental and subject to change
303 // Open DB and trim data newer than specified timestamp.
304 // The trim_ts specified the user-defined timestamp trim bound.
305 // This API should only be used at timestamp enabled column families recovery.
306 // If some input column families do not support timestamp, nothing will
307 // be happened to them. The data with timestamp > trim_ts
308 // will be removed after this API returns successfully.
309 static Status OpenAndTrimHistory(
310 const DBOptions& db_options, const std::string& dbname,
311 const std::vector<ColumnFamilyDescriptor>& column_families,
312 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
313 std::string trim_ts);
7c673cae 314
11fdf7f2
TL
315 virtual Status Resume() { return Status::NotSupported(); }
316
317 // Close the DB by releasing resources, closing files etc. This should be
318 // called before calling the destructor so that the caller can get back a
319 // status in case there are any errors. This will not fsync the WAL files.
320 // If syncing is required, the caller must first call SyncWAL(), or Write()
321 // using an empty write batch with WriteOptions.sync=true.
f67539c2
TL
322 // Regardless of the return status, the DB must be freed.
323 // If the return status is Aborted(), closing fails because there is
324 // unreleased snapshot in the system. In this case, users can release
325 // the unreleased snapshots and try again and expect it to succeed. For
1e59de90
TL
326 // other status, re-calling Close() will be no-op and return the original
327 // close status. If the return status is NotSupported(), then the DB
328 // implementation does cleanup in the destructor
11fdf7f2
TL
329 virtual Status Close() { return Status::NotSupported(); }
330
7c673cae
FG
331 // ListColumnFamilies will open the DB specified by argument name
332 // and return the list of all column families in that DB
333 // through column_families argument. The ordering of
334 // column families in column_families is unspecified.
335 static Status ListColumnFamilies(const DBOptions& db_options,
336 const std::string& name,
337 std::vector<std::string>* column_families);
338
20effc67 339 // Abstract class ctor
494da23a 340 DB() {}
f67539c2
TL
341 // No copying allowed
342 DB(const DB&) = delete;
343 void operator=(const DB&) = delete;
344
7c673cae
FG
345 virtual ~DB();
346
347 // Create a column_family and return the handle of column family
348 // through the argument handle.
349 virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
350 const std::string& column_family_name,
351 ColumnFamilyHandle** handle);
352
11fdf7f2
TL
353 // Bulk create column families with the same column family options.
354 // Return the handles of the column families through the argument handles.
355 // In case of error, the request may succeed partially, and handles will
356 // contain column family handles that it managed to create, and have size
357 // equal to the number of created column families.
358 virtual Status CreateColumnFamilies(
359 const ColumnFamilyOptions& options,
360 const std::vector<std::string>& column_family_names,
361 std::vector<ColumnFamilyHandle*>* handles);
362
363 // Bulk create column families.
364 // Return the handles of the column families through the argument handles.
365 // In case of error, the request may succeed partially, and handles will
366 // contain column family handles that it managed to create, and have size
367 // equal to the number of created column families.
368 virtual Status CreateColumnFamilies(
369 const std::vector<ColumnFamilyDescriptor>& column_families,
370 std::vector<ColumnFamilyHandle*>* handles);
371
7c673cae
FG
372 // Drop a column family specified by column_family handle. This call
373 // only records a drop record in the manifest and prevents the column
374 // family from flushing and compacting.
375 virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
11fdf7f2
TL
376
377 // Bulk drop column families. This call only records drop records in the
378 // manifest and prevents the column families from flushing and compacting.
379 // In case of error, the request may succeed partially. User may call
380 // ListColumnFamilies to check the result.
381 virtual Status DropColumnFamilies(
382 const std::vector<ColumnFamilyHandle*>& column_families);
383
1e59de90
TL
384 // Release and deallocate a column family handle. A column family is only
385 // removed once it is dropped (DropColumnFamily) and all handles have been
386 // destroyed (DestroyColumnFamilyHandle). Use this method to destroy
387 // column family handles (except for DefaultColumnFamily()!) before closing
388 // a DB.
7c673cae
FG
389 virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family);
390
391 // Set the database entry for "key" to "value".
392 // If "key" already exists, it will be overwritten.
393 // Returns OK on success, and a non-OK status on error.
394 // Note: consider setting options.sync = true.
395 virtual Status Put(const WriteOptions& options,
396 ColumnFamilyHandle* column_family, const Slice& key,
397 const Slice& value) = 0;
1e59de90
TL
398 virtual Status Put(const WriteOptions& options,
399 ColumnFamilyHandle* column_family, const Slice& key,
400 const Slice& ts, const Slice& value) = 0;
7c673cae
FG
401 virtual Status Put(const WriteOptions& options, const Slice& key,
402 const Slice& value) {
403 return Put(options, DefaultColumnFamily(), key, value);
404 }
1e59de90
TL
405 virtual Status Put(const WriteOptions& options, const Slice& key,
406 const Slice& ts, const Slice& value) {
407 return Put(options, DefaultColumnFamily(), key, ts, value);
408 }
409
410 // Set the database entry for "key" in the column family specified by
411 // "column_family" to the wide-column entity defined by "columns". If the key
412 // already exists in the column family, it will be overwritten.
413 //
414 // Returns OK on success, and a non-OK status on error.
415 virtual Status PutEntity(const WriteOptions& options,
416 ColumnFamilyHandle* column_family, const Slice& key,
417 const WideColumns& columns);
7c673cae
FG
418
419 // Remove the database entry (if any) for "key". Returns OK on
420 // success, and a non-OK status on error. It is not an error if "key"
421 // did not exist in the database.
422 // Note: consider setting options.sync = true.
423 virtual Status Delete(const WriteOptions& options,
424 ColumnFamilyHandle* column_family,
425 const Slice& key) = 0;
1e59de90
TL
426 virtual Status Delete(const WriteOptions& options,
427 ColumnFamilyHandle* column_family, const Slice& key,
428 const Slice& ts) = 0;
7c673cae
FG
429 virtual Status Delete(const WriteOptions& options, const Slice& key) {
430 return Delete(options, DefaultColumnFamily(), key);
431 }
1e59de90
TL
432 virtual Status Delete(const WriteOptions& options, const Slice& key,
433 const Slice& ts) {
434 return Delete(options, DefaultColumnFamily(), key, ts);
435 }
7c673cae
FG
436
437 // Remove the database entry for "key". Requires that the key exists
438 // and was not overwritten. Returns OK on success, and a non-OK status
439 // on error. It is not an error if "key" did not exist in the database.
440 //
441 // If a key is overwritten (by calling Put() multiple times), then the result
442 // of calling SingleDelete() on this key is undefined. SingleDelete() only
443 // behaves correctly if there has been only one Put() for this key since the
444 // previous call to SingleDelete() for this key.
445 //
446 // This feature is currently an experimental performance optimization
447 // for a very specific workload. It is up to the caller to ensure that
448 // SingleDelete is only used for a key that is not deleted using Delete() or
449 // written using Merge(). Mixing SingleDelete operations with Deletes and
450 // Merges can result in undefined behavior.
451 //
452 // Note: consider setting options.sync = true.
453 virtual Status SingleDelete(const WriteOptions& options,
454 ColumnFamilyHandle* column_family,
455 const Slice& key) = 0;
1e59de90
TL
456 virtual Status SingleDelete(const WriteOptions& options,
457 ColumnFamilyHandle* column_family,
458 const Slice& key, const Slice& ts) = 0;
7c673cae
FG
459 virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
460 return SingleDelete(options, DefaultColumnFamily(), key);
461 }
1e59de90
TL
462 virtual Status SingleDelete(const WriteOptions& options, const Slice& key,
463 const Slice& ts) {
464 return SingleDelete(options, DefaultColumnFamily(), key, ts);
465 }
7c673cae
FG
466
467 // Removes the database entries in the range ["begin_key", "end_key"), i.e.,
468 // including "begin_key" and excluding "end_key". Returns OK on success, and
20effc67
TL
469 // a non-OK status on error. It is not an error if the database does not
470 // contain any existing data in the range ["begin_key", "end_key").
471 //
472 // If "end_key" comes before "start_key" according to the user's comparator,
473 // a `Status::InvalidArgument` is returned.
7c673cae 474 //
494da23a 475 // This feature is now usable in production, with the following caveats:
1e59de90 476 // 1) Accumulating too many range tombstones in the memtable will degrade read
494da23a
TL
477 // performance; this can be avoided by manually flushing occasionally.
478 // 2) Limiting the maximum number of open files in the presence of range
479 // tombstones can degrade read performance. To avoid this problem, set
480 // max_open_files to -1 whenever possible.
7c673cae
FG
481 virtual Status DeleteRange(const WriteOptions& options,
482 ColumnFamilyHandle* column_family,
483 const Slice& begin_key, const Slice& end_key);
1e59de90
TL
484 virtual Status DeleteRange(const WriteOptions& options,
485 ColumnFamilyHandle* column_family,
486 const Slice& begin_key, const Slice& end_key,
487 const Slice& ts);
7c673cae
FG
488
489 // Merge the database entry for "key" with "value". Returns OK on success,
490 // and a non-OK status on error. The semantics of this operation is
491 // determined by the user provided merge_operator when opening DB.
492 // Note: consider setting options.sync = true.
493 virtual Status Merge(const WriteOptions& options,
494 ColumnFamilyHandle* column_family, const Slice& key,
495 const Slice& value) = 0;
496 virtual Status Merge(const WriteOptions& options, const Slice& key,
497 const Slice& value) {
498 return Merge(options, DefaultColumnFamily(), key, value);
499 }
1e59de90
TL
500 virtual Status Merge(const WriteOptions& /*options*/,
501 ColumnFamilyHandle* /*column_family*/,
502 const Slice& /*key*/, const Slice& /*ts*/,
503 const Slice& /*value*/);
7c673cae
FG
504
505 // Apply the specified updates to the database.
506 // If `updates` contains no update, WAL will still be synced if
507 // options.sync=true.
508 // Returns OK on success, non-OK on failure.
509 // Note: consider setting options.sync = true.
510 virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
511
1e59de90
TL
512 // If the column family specified by "column_family" contains an entry for
513 // "key", return the corresponding value in "*value". If the entry is a plain
514 // key-value, return the value as-is; if it is a wide-column entity, return
515 // the value of its default anonymous column (see kDefaultWideColumnName) if
516 // any, or an empty value otherwise.
7c673cae 517 //
20effc67
TL
518 // If timestamp is enabled and a non-null timestamp pointer is passed in,
519 // timestamp is returned.
520 //
1e59de90
TL
521 // Returns OK on success. Returns NotFound and an empty value in "*value" if
522 // there is no entry for "key". Returns some other non-OK status on error.
7c673cae
FG
523 virtual inline Status Get(const ReadOptions& options,
524 ColumnFamilyHandle* column_family, const Slice& key,
525 std::string* value) {
526 assert(value != nullptr);
527 PinnableSlice pinnable_val(value);
528 assert(!pinnable_val.IsPinned());
529 auto s = Get(options, column_family, key, &pinnable_val);
530 if (s.ok() && pinnable_val.IsPinned()) {
531 value->assign(pinnable_val.data(), pinnable_val.size());
532 } // else value is already assigned
533 return s;
534 }
535 virtual Status Get(const ReadOptions& options,
536 ColumnFamilyHandle* column_family, const Slice& key,
537 PinnableSlice* value) = 0;
494da23a
TL
538 virtual Status Get(const ReadOptions& options, const Slice& key,
539 std::string* value) {
7c673cae
FG
540 return Get(options, DefaultColumnFamily(), key, value);
541 }
542
20effc67
TL
543 // Get() methods that return timestamp. Derived DB classes don't need to worry
544 // about this group of methods if they don't care about timestamp feature.
545 virtual inline Status Get(const ReadOptions& options,
546 ColumnFamilyHandle* column_family, const Slice& key,
547 std::string* value, std::string* timestamp) {
548 assert(value != nullptr);
549 PinnableSlice pinnable_val(value);
550 assert(!pinnable_val.IsPinned());
551 auto s = Get(options, column_family, key, &pinnable_val, timestamp);
552 if (s.ok() && pinnable_val.IsPinned()) {
553 value->assign(pinnable_val.data(), pinnable_val.size());
554 } // else value is already assigned
555 return s;
556 }
557 virtual Status Get(const ReadOptions& /*options*/,
558 ColumnFamilyHandle* /*column_family*/,
559 const Slice& /*key*/, PinnableSlice* /*value*/,
560 std::string* /*timestamp*/) {
561 return Status::NotSupported(
562 "Get() that returns timestamp is not implemented.");
563 }
564 virtual Status Get(const ReadOptions& options, const Slice& key,
565 std::string* value, std::string* timestamp) {
566 return Get(options, DefaultColumnFamily(), key, value, timestamp);
567 }
568
1e59de90
TL
569 // If the column family specified by "column_family" contains an entry for
570 // "key", return it as a wide-column entity in "*columns". If the entry is a
571 // wide-column entity, return it as-is; if it is a plain key-value, return it
572 // as an entity with a single anonymous column (see kDefaultWideColumnName)
573 // which contains the value.
574 //
575 // Returns OK on success. Returns NotFound and an empty wide-column entity in
576 // "*columns" if there is no entry for "key". Returns some other non-OK status
577 // on error.
578 virtual Status GetEntity(const ReadOptions& /* options */,
579 ColumnFamilyHandle* /* column_family */,
580 const Slice& /* key */,
581 PinnableWideColumns* /* columns */) {
582 return Status::NotSupported("GetEntity not supported");
583 }
584
585 // Populates the `merge_operands` array with all the merge operands in the DB
586 // for `key`. The `merge_operands` array will be populated in the order of
587 // insertion. The number of entries populated in `merge_operands` will be
588 // assigned to `*number_of_operands`.
589 //
590 // If the number of merge operands in DB for `key` is greater than
591 // `merge_operands_options.expected_max_number_of_operands`,
592 // `merge_operands` is not populated and the return value is
593 // `Status::Incomplete`. In that case, `*number_of_operands` will be assigned
594 // the number of merge operands found in the DB for `key`.
595 //
596 // `merge_operands`- Points to an array of at-least
f67539c2 597 // merge_operands_options.expected_max_number_of_operands and the
1e59de90
TL
598 // caller is responsible for allocating it.
599 //
600 // The caller should delete or `Reset()` the `merge_operands` entries when
601 // they are no longer needed. All `merge_operands` entries must be destroyed
602 // or `Reset()` before this DB is closed or destroyed.
f67539c2
TL
603 virtual Status GetMergeOperands(
604 const ReadOptions& options, ColumnFamilyHandle* column_family,
605 const Slice& key, PinnableSlice* merge_operands,
606 GetMergeOperandsOptions* get_merge_operands_options,
607 int* number_of_operands) = 0;
608
20effc67
TL
609 // Consistent Get of many keys across column families without the need
610 // for an explicit snapshot. NOTE: the implementation of this MultiGet API
611 // does not have the performance benefits of the void-returning MultiGet
612 // functions.
613 //
7c673cae
FG
614 // If keys[i] does not exist in the database, then the i'th returned
615 // status will be one for which Status::IsNotFound() is true, and
616 // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
617 // the i'th returned status will have Status::ok() true, and (*values)[i]
618 // will store the value associated with keys[i].
619 //
620 // (*values) will always be resized to be the same size as (keys).
621 // Similarly, the number of returned statuses will be the number of keys.
622 // Note: keys will not be "de-duplicated". Duplicate keys will return
623 // duplicate values in order.
624 virtual std::vector<Status> MultiGet(
625 const ReadOptions& options,
626 const std::vector<ColumnFamilyHandle*>& column_family,
627 const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
628 virtual std::vector<Status> MultiGet(const ReadOptions& options,
629 const std::vector<Slice>& keys,
630 std::vector<std::string>* values) {
494da23a
TL
631 return MultiGet(
632 options,
633 std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
634 keys, values);
7c673cae
FG
635 }
636
20effc67
TL
637 virtual std::vector<Status> MultiGet(
638 const ReadOptions& /*options*/,
639 const std::vector<ColumnFamilyHandle*>& /*column_family*/,
640 const std::vector<Slice>& keys, std::vector<std::string>* /*values*/,
641 std::vector<std::string>* /*timestamps*/) {
642 return std::vector<Status>(
643 keys.size(), Status::NotSupported(
644 "MultiGet() returning timestamps not implemented."));
645 }
646 virtual std::vector<Status> MultiGet(const ReadOptions& options,
647 const std::vector<Slice>& keys,
648 std::vector<std::string>* values,
649 std::vector<std::string>* timestamps) {
650 return MultiGet(
651 options,
652 std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
653 keys, values, timestamps);
654 }
655
f67539c2
TL
656 // Overloaded MultiGet API that improves performance by batching operations
657 // in the read path for greater efficiency. Currently, only the block based
658 // table format with full filters are supported. Other table formats such
659 // as plain table, block based table with block based filters and
660 // partitioned indexes will still work, but will not get any performance
661 // benefits.
662 // Parameters -
663 // options - ReadOptions
664 // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
665 // passed to the API are restricted to a single column family
666 // num_keys - Number of keys to lookup
667 // keys - Pointer to C style array of key Slices with num_keys elements
668 // values - Pointer to C style array of PinnableSlices with num_keys elements
669 // statuses - Pointer to C style array of Status with num_keys elements
670 // sorted_input - If true, it means the input keys are already sorted by key
671 // order, so the MultiGet() API doesn't have to sort them
672 // again. If false, the keys will be copied and sorted
673 // internally by the API - the input array will not be
674 // modified
675 virtual void MultiGet(const ReadOptions& options,
676 ColumnFamilyHandle* column_family,
677 const size_t num_keys, const Slice* keys,
678 PinnableSlice* values, Status* statuses,
679 const bool /*sorted_input*/ = false) {
680 std::vector<ColumnFamilyHandle*> cf;
681 std::vector<Slice> user_keys;
682 std::vector<Status> status;
683 std::vector<std::string> vals;
684
685 for (size_t i = 0; i < num_keys; ++i) {
686 cf.emplace_back(column_family);
687 user_keys.emplace_back(keys[i]);
688 }
689 status = MultiGet(options, cf, user_keys, &vals);
690 std::copy(status.begin(), status.end(), statuses);
691 for (auto& value : vals) {
692 values->PinSelf(value);
693 values++;
694 }
695 }
696
20effc67
TL
697 virtual void MultiGet(const ReadOptions& options,
698 ColumnFamilyHandle* column_family,
699 const size_t num_keys, const Slice* keys,
700 PinnableSlice* values, std::string* timestamps,
701 Status* statuses, const bool /*sorted_input*/ = false) {
702 std::vector<ColumnFamilyHandle*> cf;
703 std::vector<Slice> user_keys;
704 std::vector<Status> status;
705 std::vector<std::string> vals;
706 std::vector<std::string> tss;
707
708 for (size_t i = 0; i < num_keys; ++i) {
709 cf.emplace_back(column_family);
710 user_keys.emplace_back(keys[i]);
711 }
712 status = MultiGet(options, cf, user_keys, &vals, &tss);
713 std::copy(status.begin(), status.end(), statuses);
714 std::copy(tss.begin(), tss.end(), timestamps);
715 for (auto& value : vals) {
716 values->PinSelf(value);
717 values++;
718 }
719 }
720
f67539c2
TL
721 // Overloaded MultiGet API that improves performance by batching operations
722 // in the read path for greater efficiency. Currently, only the block based
723 // table format with full filters are supported. Other table formats such
724 // as plain table, block based table with block based filters and
725 // partitioned indexes will still work, but will not get any performance
726 // benefits.
727 // Parameters -
728 // options - ReadOptions
729 // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
730 // passed to the API are restricted to a single column family
731 // num_keys - Number of keys to lookup
732 // keys - Pointer to C style array of key Slices with num_keys elements
733 // values - Pointer to C style array of PinnableSlices with num_keys elements
734 // statuses - Pointer to C style array of Status with num_keys elements
735 // sorted_input - If true, it means the input keys are already sorted by key
736 // order, so the MultiGet() API doesn't have to sort them
737 // again. If false, the keys will be copied and sorted
738 // internally by the API - the input array will not be
739 // modified
740 virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
741 ColumnFamilyHandle** column_families, const Slice* keys,
742 PinnableSlice* values, Status* statuses,
743 const bool /*sorted_input*/ = false) {
744 std::vector<ColumnFamilyHandle*> cf;
745 std::vector<Slice> user_keys;
746 std::vector<Status> status;
747 std::vector<std::string> vals;
748
749 for (size_t i = 0; i < num_keys; ++i) {
750 cf.emplace_back(column_families[i]);
751 user_keys.emplace_back(keys[i]);
752 }
753 status = MultiGet(options, cf, user_keys, &vals);
754 std::copy(status.begin(), status.end(), statuses);
755 for (auto& value : vals) {
756 values->PinSelf(value);
757 values++;
758 }
759 }
20effc67
TL
760 virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
761 ColumnFamilyHandle** column_families, const Slice* keys,
762 PinnableSlice* values, std::string* timestamps,
763 Status* statuses, const bool /*sorted_input*/ = false) {
764 std::vector<ColumnFamilyHandle*> cf;
765 std::vector<Slice> user_keys;
766 std::vector<Status> status;
767 std::vector<std::string> vals;
768 std::vector<std::string> tss;
769
770 for (size_t i = 0; i < num_keys; ++i) {
771 cf.emplace_back(column_families[i]);
772 user_keys.emplace_back(keys[i]);
773 }
774 status = MultiGet(options, cf, user_keys, &vals, &tss);
775 std::copy(status.begin(), status.end(), statuses);
776 std::copy(tss.begin(), tss.end(), timestamps);
777 for (auto& value : vals) {
778 values->PinSelf(value);
779 values++;
780 }
781 }
f67539c2 782
7c673cae
FG
783 // If the key definitely does not exist in the database, then this method
784 // returns false, else true. If the caller wants to obtain value when the key
785 // is found in memory, a bool for 'value_found' must be passed. 'value_found'
786 // will be true on return if value has been set properly.
787 // This check is potentially lighter-weight than invoking DB::Get(). One way
788 // to make this lighter weight is to avoid doing any IOs.
789 // Default implementation here returns true and sets 'value_found' to false
790 virtual bool KeyMayExist(const ReadOptions& /*options*/,
791 ColumnFamilyHandle* /*column_family*/,
792 const Slice& /*key*/, std::string* /*value*/,
20effc67 793 std::string* /*timestamp*/,
7c673cae
FG
794 bool* value_found = nullptr) {
795 if (value_found != nullptr) {
796 *value_found = false;
797 }
798 return true;
799 }
20effc67
TL
800
801 virtual bool KeyMayExist(const ReadOptions& options,
802 ColumnFamilyHandle* column_family, const Slice& key,
803 std::string* value, bool* value_found = nullptr) {
804 return KeyMayExist(options, column_family, key, value,
805 /*timestamp=*/nullptr, value_found);
806 }
807
7c673cae
FG
808 virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
809 std::string* value, bool* value_found = nullptr) {
810 return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
811 }
812
20effc67
TL
813 virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
814 std::string* value, std::string* timestamp,
815 bool* value_found = nullptr) {
816 return KeyMayExist(options, DefaultColumnFamily(), key, value, timestamp,
817 value_found);
818 }
819
7c673cae
FG
820 // Return a heap-allocated iterator over the contents of the database.
821 // The result of NewIterator() is initially invalid (caller must
822 // call one of the Seek methods on the iterator before using it).
823 //
824 // Caller should delete the iterator when it is no longer needed.
825 // The returned iterator should be deleted before this db is deleted.
826 virtual Iterator* NewIterator(const ReadOptions& options,
827 ColumnFamilyHandle* column_family) = 0;
828 virtual Iterator* NewIterator(const ReadOptions& options) {
829 return NewIterator(options, DefaultColumnFamily());
830 }
831 // Returns iterators from a consistent database state across multiple
832 // column families. Iterators are heap allocated and need to be deleted
833 // before the db is deleted
834 virtual Status NewIterators(
835 const ReadOptions& options,
836 const std::vector<ColumnFamilyHandle*>& column_families,
837 std::vector<Iterator*>* iterators) = 0;
838
839 // Return a handle to the current DB state. Iterators created with
840 // this handle will all observe a stable snapshot of the current DB
841 // state. The caller must call ReleaseSnapshot(result) when the
842 // snapshot is no longer needed.
843 //
844 // nullptr will be returned if the DB fails to take a snapshot or does
1e59de90 845 // not support snapshot (eg: inplace_update_support enabled).
7c673cae
FG
846 virtual const Snapshot* GetSnapshot() = 0;
847
848 // Release a previously acquired snapshot. The caller must not
849 // use "snapshot" after this call.
850 virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
851
852#ifndef ROCKSDB_LITE
1e59de90
TL
853 // Contains all valid property arguments for GetProperty() or
854 // GetMapProperty(). Each is a "string" property for retrieval with
855 // GetProperty() unless noted as a "map" property, for GetMapProperty().
7c673cae
FG
856 //
857 // NOTE: Property names cannot end in numbers since those are interpreted as
858 // arguments, e.g., see kNumFilesAtLevelPrefix.
859 struct Properties {
860 // "rocksdb.num-files-at-level<N>" - returns string containing the number
861 // of files at level <N>, where <N> is an ASCII representation of a
862 // level number (e.g., "0").
863 static const std::string kNumFilesAtLevelPrefix;
864
865 // "rocksdb.compression-ratio-at-level<N>" - returns string containing the
866 // compression ratio of data at level <N>, where <N> is an ASCII
867 // representation of a level number (e.g., "0"). Here, compression
868 // ratio is defined as uncompressed data size / compressed file size.
869 // Returns "-1.0" if no open files at level <N>.
870 static const std::string kCompressionRatioAtLevelPrefix;
871
872 // "rocksdb.stats" - returns a multi-line string containing the data
873 // described by kCFStats followed by the data described by kDBStats.
874 static const std::string kStats;
875
876 // "rocksdb.sstables" - returns a multi-line string summarizing current
877 // SST files.
878 static const std::string kSSTables;
879
1e59de90
TL
880 // "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram"
881 // and "rocksdb.cf-file-histogram" as a "map" property.
7c673cae
FG
882 static const std::string kCFStats;
883
884 // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
1e59de90 885 // general column family stats per-level over db's lifetime ("L<n>"),
7c673cae
FG
886 // aggregated over db's lifetime ("Sum"), and aggregated over the
887 // interval since the last retrieval ("Int").
7c673cae
FG
888 static const std::string kCFStatsNoFileHistogram;
889
890 // "rocksdb.cf-file-histogram" - print out how many file reads to every
891 // level, as well as the histogram of latency of single requests.
892 static const std::string kCFFileHistogram;
893
1e59de90
TL
894 // "rocksdb.dbstats" - As a string property, returns a multi-line string
895 // with general database stats, both cumulative (over the db's
896 // lifetime) and interval (since the last retrieval of kDBStats).
897 // As a map property, returns cumulative stats only and does not
898 // update the baseline for the interval stats.
7c673cae
FG
899 static const std::string kDBStats;
900
901 // "rocksdb.levelstats" - returns multi-line string containing the number
902 // of files per level and total size of each level (MB).
903 static const std::string kLevelStats;
904
1e59de90
TL
905 // "rocksdb.block-cache-entry-stats" - returns a multi-line string or
906 // map with statistics on block cache usage. See
907 // `BlockCacheEntryStatsMapKeys` for structured representation of keys
908 // available in the map form.
909 static const std::string kBlockCacheEntryStats;
910
911 // "rocksdb.fast-block-cache-entry-stats" - same as above, but returns
912 // stale values more frequently to reduce overhead and latency.
913 static const std::string kFastBlockCacheEntryStats;
914
7c673cae
FG
915 // "rocksdb.num-immutable-mem-table" - returns number of immutable
916 // memtables that have not yet been flushed.
917 static const std::string kNumImmutableMemTable;
918
919 // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
920 // memtables that have already been flushed.
921 static const std::string kNumImmutableMemTableFlushed;
922
923 // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
924 // pending; otherwise, returns 0.
925 static const std::string kMemTableFlushPending;
926
927 // "rocksdb.num-running-flushes" - returns the number of currently running
928 // flushes.
929 static const std::string kNumRunningFlushes;
930
931 // "rocksdb.compaction-pending" - returns 1 if at least one compaction is
932 // pending; otherwise, returns 0.
933 static const std::string kCompactionPending;
934
935 // "rocksdb.num-running-compactions" - returns the number of currently
936 // running compactions.
937 static const std::string kNumRunningCompactions;
938
939 // "rocksdb.background-errors" - returns accumulated number of background
940 // errors.
941 static const std::string kBackgroundErrors;
942
943 // "rocksdb.cur-size-active-mem-table" - returns approximate size of active
944 // memtable (bytes).
945 static const std::string kCurSizeActiveMemTable;
946
947 // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active
948 // and unflushed immutable memtables (bytes).
949 static const std::string kCurSizeAllMemTables;
950
951 // "rocksdb.size-all-mem-tables" - returns approximate size of active,
952 // unflushed immutable, and pinned immutable memtables (bytes).
953 static const std::string kSizeAllMemTables;
954
955 // "rocksdb.num-entries-active-mem-table" - returns total number of entries
956 // in the active memtable.
957 static const std::string kNumEntriesActiveMemTable;
958
959 // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries
960 // in the unflushed immutable memtables.
961 static const std::string kNumEntriesImmMemTables;
962
963 // "rocksdb.num-deletes-active-mem-table" - returns total number of delete
964 // entries in the active memtable.
965 static const std::string kNumDeletesActiveMemTable;
966
967 // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete
968 // entries in the unflushed immutable memtables.
969 static const std::string kNumDeletesImmMemTables;
970
971 // "rocksdb.estimate-num-keys" - returns estimated number of total keys in
972 // the active and unflushed immutable memtables and storage.
973 static const std::string kEstimateNumKeys;
974
975 // "rocksdb.estimate-table-readers-mem" - returns estimated memory used for
976 // reading SST tables, excluding memory used in block cache (e.g.,
977 // filter and index blocks).
978 static const std::string kEstimateTableReadersMem;
979
980 // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete
981 // files is enabled; otherwise, returns a non-zero number.
1e59de90
TL
982 // This name may be misleading because true(non-zero) means disable,
983 // but we keep the name for backward compatibility.
7c673cae
FG
984 static const std::string kIsFileDeletionsEnabled;
985
986 // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the
987 // database.
988 static const std::string kNumSnapshots;
989
990 // "rocksdb.oldest-snapshot-time" - returns number representing unix
991 // timestamp of oldest unreleased snapshot.
992 static const std::string kOldestSnapshotTime;
993
f67539c2
TL
994 // "rocksdb.oldest-snapshot-sequence" - returns number representing
995 // sequence number of oldest unreleased snapshot.
996 static const std::string kOldestSnapshotSequence;
997
7c673cae
FG
998 // "rocksdb.num-live-versions" - returns number of live versions. `Version`
999 // is an internal data structure. See version_set.h for details. More
1000 // live versions often mean more SST files are held from being deleted,
1001 // by iterators or unfinished compactions.
1002 static const std::string kNumLiveVersions;
1003
11fdf7f2 1004 // "rocksdb.current-super-version-number" - returns number of current LSM
7c673cae
FG
1005 // version. It is a uint64_t integer number, incremented after there is
1006 // any change to the LSM tree. The number is not preserved after restarting
1007 // the DB. After DB restart, it will start from 0 again.
1008 static const std::string kCurrentSuperVersionNumber;
1009
1010 // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of
1e59de90
TL
1011 // live data in bytes. For BlobDB, it also includes the exact value of
1012 // live bytes in the blob files of the version.
7c673cae
FG
1013 static const std::string kEstimateLiveDataSize;
1014
11fdf7f2 1015 // "rocksdb.min-log-number-to-keep" - return the minimum log number of the
7c673cae
FG
1016 // log files that should be kept.
1017 static const std::string kMinLogNumberToKeep;
1018
494da23a
TL
1019 // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file
1020 // number for an obsolete SST to be kept. The max value of `uint64_t`
1021 // will be returned if all obsolete files can be deleted.
1022 static const std::string kMinObsoleteSstNumberToKeep;
1023
7c673cae
FG
1024 // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
1025 // files.
1026 // WARNING: may slow down online queries if there are too many files.
1027 static const std::string kTotalSstFilesSize;
1028
11fdf7f2
TL
1029 // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST
1030 // files belonging to the latest LSM tree.
1031 static const std::string kLiveSstFilesSize;
1032
1e59de90
TL
1033 // "rocksdb.live_sst_files_size_at_temperature" - returns total size (bytes)
1034 // of all SST files at a certain file temperature.
1035 static const std::string kLiveSstFilesSizeAtTemperature;
1036
7c673cae
FG
1037 // "rocksdb.base-level" - returns number of level to which L0 data will be
1038 // compacted.
1039 static const std::string kBaseLevel;
1040
1041 // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total
1042 // number of bytes compaction needs to rewrite to get all levels down
1043 // to under target size. Not valid for other compactions than level-
1044 // based.
1045 static const std::string kEstimatePendingCompactionBytes;
1046
1e59de90
TL
1047 // "rocksdb.aggregated-table-properties" - returns a string or map
1048 // representation of the aggregated table properties of the target
1049 // column family. Only properties that make sense for aggregation
1050 // are included.
7c673cae
FG
1051 static const std::string kAggregatedTableProperties;
1052
1053 // "rocksdb.aggregated-table-properties-at-level<N>", same as the previous
1054 // one but only returns the aggregated table properties of the
1055 // specified level "N" at the target column family.
1056 static const std::string kAggregatedTablePropertiesAtLevel;
1057
1058 // "rocksdb.actual-delayed-write-rate" - returns the current actual delayed
1059 // write rate. 0 means no delay.
1060 static const std::string kActualDelayedWriteRate;
1061
1062 // "rocksdb.is-write-stopped" - Return 1 if write has been stopped.
1063 static const std::string kIsWriteStopped;
11fdf7f2
TL
1064
1065 // "rocksdb.estimate-oldest-key-time" - returns an estimation of
1066 // oldest key timestamp in the DB. Currently only available for
1067 // FIFO compaction with
1068 // compaction_options_fifo.allow_compaction = false.
1069 static const std::string kEstimateOldestKeyTime;
1070
1071 // "rocksdb.block-cache-capacity" - returns block cache capacity.
1072 static const std::string kBlockCacheCapacity;
1073
1074 // "rocksdb.block-cache-usage" - returns the memory size for the entries
1075 // residing in block cache.
1076 static const std::string kBlockCacheUsage;
1077
1078 // "rocksdb.block-cache-pinned-usage" - returns the memory size for the
1079 // entries being pinned.
1080 static const std::string kBlockCachePinnedUsage;
1081
1082 // "rocksdb.options-statistics" - returns multi-line string
1083 // of options.statistics
1084 static const std::string kOptionsStatistics;
1e59de90
TL
1085
1086 // "rocksdb.num-blob-files" - returns number of blob files in the current
1087 // version.
1088 static const std::string kNumBlobFiles;
1089
1090 // "rocksdb.blob-stats" - return the total number and size of all blob
1091 // files, and total amount of garbage (bytes) in the blob files in
1092 // the current version.
1093 static const std::string kBlobStats;
1094
1095 // "rocksdb.total-blob-file-size" - returns the total size of all blob
1096 // files over all versions.
1097 static const std::string kTotalBlobFileSize;
1098
1099 // "rocksdb.live-blob-file-size" - returns the total size of all blob
1100 // files in the current version.
1101 static const std::string kLiveBlobFileSize;
1102
1103 // "rocksdb.live-blob-file-garbage-size" - returns the total amount of
1104 // garbage in the blob files in the current version.
1105 static const std::string kLiveBlobFileGarbageSize;
1106
1107 // "rocksdb.blob-cache-capacity" - returns blob cache capacity.
1108 static const std::string kBlobCacheCapacity;
1109
1110 // "rocksdb.blob-cache-usage" - returns the memory size for the entries
1111 // residing in blob cache.
1112 static const std::string kBlobCacheUsage;
1113
1114 // "rocksdb.blob-cache-pinned-usage" - returns the memory size for the
1115 // entries being pinned in blob cache.
1116 static const std::string kBlobCachePinnedUsage;
7c673cae
FG
1117 };
1118#endif /* ROCKSDB_LITE */
1119
1e59de90
TL
1120 // DB implementations export properties about their state via this method.
1121 // If "property" is a valid "string" property understood by this DB
1122 // implementation (see Properties struct above for valid options), fills
1123 // "*value" with its current value and returns true. Otherwise, returns
1124 // false.
7c673cae
FG
1125 virtual bool GetProperty(ColumnFamilyHandle* column_family,
1126 const Slice& property, std::string* value) = 0;
1127 virtual bool GetProperty(const Slice& property, std::string* value) {
1128 return GetProperty(DefaultColumnFamily(), property, value);
1129 }
1e59de90
TL
1130
1131 // Like GetProperty but for valid "map" properties. (Some properties can be
1132 // accessed as either "string" properties or "map" properties.)
7c673cae
FG
1133 virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
1134 const Slice& property,
11fdf7f2 1135 std::map<std::string, std::string>* value) = 0;
7c673cae 1136 virtual bool GetMapProperty(const Slice& property,
11fdf7f2 1137 std::map<std::string, std::string>* value) {
7c673cae
FG
1138 return GetMapProperty(DefaultColumnFamily(), property, value);
1139 }
1140
1141 // Similar to GetProperty(), but only works for a subset of properties whose
1142 // return value is an integer. Return the value by integer. Supported
1143 // properties:
1144 // "rocksdb.num-immutable-mem-table"
1145 // "rocksdb.mem-table-flush-pending"
1146 // "rocksdb.compaction-pending"
1147 // "rocksdb.background-errors"
1148 // "rocksdb.cur-size-active-mem-table"
1149 // "rocksdb.cur-size-all-mem-tables"
1150 // "rocksdb.size-all-mem-tables"
1151 // "rocksdb.num-entries-active-mem-table"
1152 // "rocksdb.num-entries-imm-mem-tables"
1153 // "rocksdb.num-deletes-active-mem-table"
1154 // "rocksdb.num-deletes-imm-mem-tables"
1155 // "rocksdb.estimate-num-keys"
1156 // "rocksdb.estimate-table-readers-mem"
1157 // "rocksdb.is-file-deletions-enabled"
1158 // "rocksdb.num-snapshots"
1159 // "rocksdb.oldest-snapshot-time"
1160 // "rocksdb.num-live-versions"
1161 // "rocksdb.current-super-version-number"
1162 // "rocksdb.estimate-live-data-size"
1163 // "rocksdb.min-log-number-to-keep"
494da23a 1164 // "rocksdb.min-obsolete-sst-number-to-keep"
7c673cae 1165 // "rocksdb.total-sst-files-size"
11fdf7f2 1166 // "rocksdb.live-sst-files-size"
7c673cae
FG
1167 // "rocksdb.base-level"
1168 // "rocksdb.estimate-pending-compaction-bytes"
1169 // "rocksdb.num-running-compactions"
1170 // "rocksdb.num-running-flushes"
1171 // "rocksdb.actual-delayed-write-rate"
1172 // "rocksdb.is-write-stopped"
11fdf7f2
TL
1173 // "rocksdb.estimate-oldest-key-time"
1174 // "rocksdb.block-cache-capacity"
1175 // "rocksdb.block-cache-usage"
1176 // "rocksdb.block-cache-pinned-usage"
1e59de90
TL
1177 //
1178 // Properties dedicated for BlobDB:
1179 // "rocksdb.num-blob-files"
1180 // "rocksdb.total-blob-file-size"
1181 // "rocksdb.live-blob-file-size"
1182 // "rocksdb.blob-cache-capacity"
1183 // "rocksdb.blob-cache-usage"
1184 // "rocksdb.blob-cache-pinned-usage"
7c673cae
FG
1185 virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
1186 const Slice& property, uint64_t* value) = 0;
1187 virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
1188 return GetIntProperty(DefaultColumnFamily(), property, value);
1189 }
1190
1191 // Reset internal stats for DB and all column families.
1192 // Note this doesn't reset options.statistics as it is not owned by
1193 // DB.
1194 virtual Status ResetStats() {
1195 return Status::NotSupported("Not implemented");
1196 }
1197
1198 // Same as GetIntProperty(), but this one returns the aggregated int
1199 // property from all column families.
1200 virtual bool GetAggregatedIntProperty(const Slice& property,
1201 uint64_t* value) = 0;
1202
// Flags for the simpler DB::GetApproximateSizes() overloads, specifying
// whether memtable stats should be included, or file stats approximation,
// or both. Each non-NONE enumerator occupies its own bit.
enum class SizeApproximationFlags : uint8_t {
  NONE = 0x00,
  INCLUDE_MEMTABLES = 0x01,  // bit 0
  INCLUDE_FILES = 0x02       // bit 1
};
1210
1211 // For each i in [0,n-1], store in "sizes[i]", the approximate
20effc67
TL
1212 // file system space used by keys in "[range[i].start .. range[i].limit)"
1213 // in a single column family.
7c673cae
FG
1214 //
1215 // Note that the returned sizes measure file system space usage, so
1216 // if the user data compresses by a factor of ten, the returned
1217 // sizes will be one-tenth the size of the corresponding user data size.
f67539c2
TL
1218 virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
1219 ColumnFamilyHandle* column_family,
20effc67 1220 const Range* ranges, int n,
f67539c2
TL
1221 uint64_t* sizes) = 0;
1222
1223 // Simpler versions of the GetApproximateSizes() method above.
1e59de90 1224 // The include_flags argument must be of type DB::SizeApproximationFlags
f67539c2 1225 // and can not be NONE.
1e59de90
TL
1226 virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family,
1227 const Range* ranges, int n,
1228 uint64_t* sizes,
1229 SizeApproximationFlags include_flags =
1230 SizeApproximationFlags::INCLUDE_FILES);
1231
1232 virtual Status GetApproximateSizes(
1233 const Range* ranges, int n, uint64_t* sizes,
1234 SizeApproximationFlags include_flags =
1235 SizeApproximationFlags::INCLUDE_FILES) {
1236 return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes,
1237 include_flags);
7c673cae
FG
1238 }
1239
1240 // The method is similar to GetApproximateSizes, except it
1241 // returns approximate number of records in memtables.
1242 virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
1243 const Range& range,
1244 uint64_t* const count,
1245 uint64_t* const size) = 0;
1246 virtual void GetApproximateMemTableStats(const Range& range,
1247 uint64_t* const count,
1248 uint64_t* const size) {
1249 GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size);
1250 }
1251
7c673cae
FG
1252 // Compact the underlying storage for the key range [*begin,*end].
1253 // The actual compaction interval might be a superset of [*begin, *end].
1254 // In particular, deleted and overwritten versions are discarded,
1255 // and the data is rearranged to reduce the cost of operations
1256 // needed to access the data. This operation should typically only
1257 // be invoked by users who understand the underlying implementation.
1e59de90
TL
1258 // This call blocks until the operation completes successfully, fails,
1259 // or is aborted (Status::Incomplete). See DisableManualCompaction.
7c673cae
FG
1260 //
1261 // begin==nullptr is treated as a key before all keys in the database.
1262 // end==nullptr is treated as a key after all keys in the database.
1263 // Therefore the following call will compact the entire database:
1264 // db->CompactRange(options, nullptr, nullptr);
1265 // Note that after the entire database is compacted, all data are pushed
1266 // down to the last level containing any data. If the total data size after
1267 // compaction is reduced, that level might not be appropriate for hosting all
1268 // the files. In this case, client could set options.change_level to true, to
1269 // move the files back to the minimum level capable of holding the data set
1270 // or a given level (specified by non-negative options.target_level).
1271 virtual Status CompactRange(const CompactRangeOptions& options,
1272 ColumnFamilyHandle* column_family,
1273 const Slice* begin, const Slice* end) = 0;
1274 virtual Status CompactRange(const CompactRangeOptions& options,
1275 const Slice* begin, const Slice* end) {
1276 return CompactRange(options, DefaultColumnFamily(), begin, end);
1277 }
1278
1e59de90
TL
1279 // Dynamically change column family options or table factory options in a
1280 // running DB, for the specified column family. Only options internally
1281 // marked as "mutable" can be changed. Options not listed in `opts_map` will
1282 // keep their current values. See GetColumnFamilyOptionsFromMap() in
1283 // convenience.h for the details of `opts_map`. Not supported in LITE mode.
1284 //
1285 // USABILITY NOTE: SetOptions is intended only for expert users, and does
1286 // not apply the same sanitization to options as the standard DB::Open code
1287 // path does. Use with caution.
1288 //
1289 // RELIABILITY & PERFORMANCE NOTE: SetOptions is not fully stress-tested for
1290 // reliability, and this is a slow call because a new OPTIONS file is
1291 // serialized and persisted for each call. Use only infrequently.
1292 //
1293 // EXAMPLES:
1294 // s = db->SetOptions(cfh, {{"ttl", "36000"}});
1295 // s = db->SetOptions(cfh, {{"block_based_table_factory",
1296 // "{prepopulate_block_cache=kDisable;}"}});
7c673cae
FG
1297 virtual Status SetOptions(
1298 ColumnFamilyHandle* /*column_family*/,
1e59de90 1299 const std::unordered_map<std::string, std::string>& /*opts_map*/) {
7c673cae
FG
1300 return Status::NotSupported("Not implemented");
1301 }
1e59de90 1302 // Shortcut for SetOptions on the default column family handle.
7c673cae
FG
1303 virtual Status SetOptions(
1304 const std::unordered_map<std::string, std::string>& new_options) {
1305 return SetOptions(DefaultColumnFamily(), new_options);
1306 }
1307
1e59de90
TL
1308 // Like SetOptions but for DBOptions, including the same caveats for
1309 // usability, reliability, and performance. See GetDBOptionsFromMap() (and
1310 // GetColumnFamilyOptionsFromMap()) in convenience.h for details on
1311 // `opts_map`. Not supported in LITE mode.
1312 //
1313 // EXAMPLES:
1314 // s = db->SetDBOptions({{"max_subcompactions", "2"}});
1315 // s = db->SetDBOptions({{"stats_dump_period_sec", "0"},
1316 // {"stats_persist_period_sec", "0"}});
7c673cae
FG
1317 virtual Status SetDBOptions(
1318 const std::unordered_map<std::string, std::string>& new_options) = 0;
1319
1320 // CompactFiles() inputs a list of files specified by file numbers and
1e59de90
TL
1321 // compacts them to the specified level. A small difference compared to
1322 // CompactRange() is that CompactFiles() performs the compaction job
1323 // using the CURRENT thread, so is not considered a "background" job.
7c673cae
FG
1324 //
1325 // @see GetDataBaseMetaData
1326 // @see GetColumnFamilyMetaData
1327 virtual Status CompactFiles(
1328 const CompactionOptions& compact_options,
1329 ColumnFamilyHandle* column_family,
494da23a
TL
1330 const std::vector<std::string>& input_file_names, const int output_level,
1331 const int output_path_id = -1,
1332 std::vector<std::string>* const output_file_names = nullptr,
1333 CompactionJobInfo* compaction_job_info = nullptr) = 0;
7c673cae
FG
1334
1335 virtual Status CompactFiles(
1336 const CompactionOptions& compact_options,
494da23a
TL
1337 const std::vector<std::string>& input_file_names, const int output_level,
1338 const int output_path_id = -1,
1339 std::vector<std::string>* const output_file_names = nullptr,
1340 CompactionJobInfo* compaction_job_info = nullptr) {
7c673cae 1341 return CompactFiles(compact_options, DefaultColumnFamily(),
11fdf7f2 1342 input_file_names, output_level, output_path_id,
494da23a 1343 output_file_names, compaction_job_info);
7c673cae
FG
1344 }
1345
1346 // This function will wait until all currently running background processes
1347 // finish. After it returns, no background process will be run until
20effc67
TL
1348 // ContinueBackgroundWork is called, once for each preceding OK-returning
1349 // call to PauseBackgroundWork.
7c673cae
FG
1350 virtual Status PauseBackgroundWork() = 0;
1351 virtual Status ContinueBackgroundWork() = 0;
1352
1353 // This function will enable automatic compactions for the given column
1354 // families if they were previously disabled. The function will first set the
1355 // disable_auto_compactions option for each column family to 'false', after
1356 // which it will schedule a flush/compaction.
1357 //
1358 // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API
1359 // does NOT schedule a flush/compaction afterwards, and only changes the
1360 // parameter itself within the column family option.
1361 //
1362 virtual Status EnableAutoCompaction(
1363 const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0;
1364
1e59de90
TL
1365 // After this function call, CompactRange() or CompactFiles() will not
1366 // run compactions and fail. Calling this function will tell outstanding
1367 // manual compactions to abort and will wait for them to finish or abort
1368 // before returning.
f67539c2 1369 virtual void DisableManualCompaction() = 0;
1e59de90
TL
1370 // Re-enable CompactRange() and CompactFiles() that are disabled by
1371 // DisableManualCompaction(). This function must be called as many times
1372 // as DisableManualCompaction() has been called in order to re-enable
1373 // manual compactions, and must not be called more times than
1374 // DisableManualCompaction() has been called.
f67539c2
TL
1375 virtual void EnableManualCompaction() = 0;
1376
7c673cae
FG
1377 // Number of levels used for this DB.
1378 virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
1379 virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
1380
1381 // Maximum level to which a new compacted memtable is pushed if it
1382 // does not create overlap.
1383 virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
1384 virtual int MaxMemCompactionLevel() {
1385 return MaxMemCompactionLevel(DefaultColumnFamily());
1386 }
1387
1388 // Number of files in level-0 that would stop writes.
1389 virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
1390 virtual int Level0StopWriteTrigger() {
1391 return Level0StopWriteTrigger(DefaultColumnFamily());
1392 }
1393
1394 // Get DB name -- the exact same name that was provided as an argument to
1395 // DB::Open()
1396 virtual const std::string& GetName() const = 0;
1397
1398 // Get Env object from the DB
1399 virtual Env* GetEnv() const = 0;
1400
1e59de90
TL
1401 // A shortcut for GetEnv()->GetFileSystem().get(), possibly cached for
1402 // efficiency.
f67539c2
TL
1403 virtual FileSystem* GetFileSystem() const;
1404
7c673cae
FG
1405 // Get DB Options that we use. During the process of opening the
1406 // column family, the options provided when calling DB::Open() or
1407 // DB::CreateColumnFamily() will have been "sanitized" and transformed
1408 // in an implementation-defined manner.
1409 virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0;
1410 virtual Options GetOptions() const {
1411 return GetOptions(DefaultColumnFamily());
1412 }
1413
1414 virtual DBOptions GetDBOptions() const = 0;
1415
1416 // Flush all mem-table data.
494da23a
TL
1417 // Flush a single column family, even when atomic flush is enabled. To flush
1418 // multiple column families, use Flush(options, column_families).
7c673cae
FG
1419 virtual Status Flush(const FlushOptions& options,
1420 ColumnFamilyHandle* column_family) = 0;
1421 virtual Status Flush(const FlushOptions& options) {
1422 return Flush(options, DefaultColumnFamily());
1423 }
494da23a
TL
1424 // Flushes multiple column families.
1425 // If atomic flush is not enabled, Flush(options, column_families) is
1426 // equivalent to calling Flush(options, column_family) multiple times.
1427 // If atomic flush is enabled, Flush(options, column_families) will flush all
1428 // column families specified in 'column_families' up to the latest sequence
1429 // number at the time when flush is requested.
1430 // Note that RocksDB 5.15 and earlier may not be able to open later versions
1431 // with atomic flush enabled.
1432 virtual Status Flush(
1433 const FlushOptions& options,
1434 const std::vector<ColumnFamilyHandle*>& column_families) = 0;
7c673cae 1435
11fdf7f2
TL
1436 // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL
1437 // afterwards.
1438 virtual Status FlushWAL(bool /*sync*/) {
1439 return Status::NotSupported("FlushWAL not implemented");
1440 }
7c673cae
FG
1441 // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the
1442 // same as Write() with sync=true: in the latter case the changes won't be
1443 // visible until the sync is done.
1444 // Currently only works if allow_mmap_writes = false in Options.
1445 virtual Status SyncWAL() = 0;
1446
494da23a
TL
1447 // Lock the WAL. Also flushes the WAL after locking.
1448 virtual Status LockWAL() {
1449 return Status::NotSupported("LockWAL not implemented");
1450 }
1451
1452 // Unlock the WAL.
1453 virtual Status UnlockWAL() {
1454 return Status::NotSupported("UnlockWAL not implemented");
1455 }
1456
7c673cae
FG
1457 // The sequence number of the most recent transaction.
1458 virtual SequenceNumber GetLatestSequenceNumber() const = 0;
1459
7c673cae
FG
1460 // Prevent file deletions. Compactions will continue to occur,
1461 // but no obsolete files will be deleted. Calling this multiple
1462 // times has the same effect as calling it once.
1463 virtual Status DisableFileDeletions() = 0;
1464
1e59de90
TL
1465 // Increase the full_history_ts_low of the column family. The new ts_low value
1466 // should be newer than the current full_history_ts_low value.
1467 // If another thread updates full_history_ts_low concurrently to a higher
1468 // timestamp than the requested ts_low, a try again error will be returned.
1469 virtual Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
1470 std::string ts_low) = 0;
1471
1472 // Get the current full_history_ts_low value.
1473 virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
1474 std::string* ts_low) = 0;
1475
7c673cae
FG
1476 // Allow compactions to delete obsolete files.
1477 // If force == true, the call to EnableFileDeletions() will guarantee that
1478 // file deletions are enabled after the call, even if DisableFileDeletions()
1479 // was called multiple times before.
1480 // If force == false, EnableFileDeletions will only enable file deletion
1481 // after it's been called at least as many times as DisableFileDeletions(),
1482 // enabling the two methods to be called by two threads concurrently without
1483 // synchronization -- i.e., file deletions will be enabled only after both
1484 // threads call EnableFileDeletions()
1485 virtual Status EnableFileDeletions(bool force = true) = 0;
1486
20effc67 1487#ifndef ROCKSDB_LITE
f67539c2
TL
1488 // Retrieves the creation time of the oldest file in the DB.
1489 // This API only works if max_open_files = -1, if it is not then
1490 // Status returned is Status::NotSupported()
1491 // The file creation time is set using the env provided to the DB.
1492 // If the DB was created from a very old release then it's possible that
1493 // the SST files might not have file_creation_time property and even after
1494 // moving to a newer release it's possible that some files never got compacted
1495 // and may not have file_creation_time property. In both the cases
1496 // file_creation_time is considered 0 which means this API will return
1497 // creation_time = 0 as there wouldn't be a timestamp lower than 0.
1498 virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0;
1499
11fdf7f2 1500 // Note: this API is not yet consistent with WritePrepared transactions.
1e59de90
TL
1501 //
1502 // Sets iter to an iterator that is positioned at a write-batch whose
1503 // sequence number range [start_seq, end_seq] covers seq_number. If no such
1504 // write-batch exists, then iter is positioned at the next write-batch whose
1505 // start_seq > seq_number.
1506 //
7c673cae
FG
1507 // Returns Status::OK if iterator is valid
1508 // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
1509 // use this api, else the WAL files will get
1510 // cleared aggressively and the iterator might keep getting invalid before
1511 // an update is read.
1512 virtual Status GetUpdatesSince(
494da23a
TL
1513 SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
1514 const TransactionLogIterator::ReadOptions& read_options =
1515 TransactionLogIterator::ReadOptions()) = 0;
7c673cae
FG
1516
1517// Windows API macro interference
1518#undef DeleteFile
20effc67
TL
1519 // WARNING: This API is planned for removal in RocksDB 7.0 since it does not
1520 // operate at the proper level of abstraction for a key-value store, and its
1521 // contract/restrictions are poorly documented. For example, it returns non-OK
1522 // `Status` for non-bottommost files and files undergoing compaction. Since we
1523 // do not plan to maintain it, the contract will likely remain underspecified
1524 // until its removal. Any user is encouraged to read the implementation
1525 // carefully and migrate away from it when possible.
1526 //
7c673cae
FG
1527 // Delete the file name from the db directory and update the internal state to
1528 // reflect that. Supports deletion of sst and log files only. 'name' must be
1529 // path relative to the db directory. eg. 000001.sst, /archive/000003.log
1530 virtual Status DeleteFile(std::string name) = 0;
1531
1e59de90
TL
1532 // Obtains a list of all live table (SST) files and how they fit into the
1533 // LSM-trees, such as column family, level, key range, etc.
1534 // This builds a de-normalized form of GetAllColumnFamilyMetaData().
1535 // For information about all files in a DB, use GetLiveFilesStorageInfo().
7c673cae
FG
1536 virtual void GetLiveFilesMetaData(
1537 std::vector<LiveFileMetaData>* /*metadata*/) {}
1538
1e59de90 1539 // Return a list of all table (SST) and blob files checksum info.
20effc67 1540 // Note: This function might be of limited use because it cannot be
1e59de90
TL
1541 // synchronized with other "live files" APIs. GetLiveFilesStorageInfo()
1542 // is recommended instead.
20effc67
TL
1543 virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0;
1544
1e59de90
TL
1545 // Get information about all live files that make up a DB, for making
1546 // live copies (Checkpoint, backups, etc.) or other storage-related purposes.
1547 // If creating a live copy, use DisableFileDeletions() before and
1548 // EnableFileDeletions() after to prevent deletions.
1549 // For LSM-tree metadata, use Get*MetaData() functions instead.
1550 virtual Status GetLiveFilesStorageInfo(
1551 const LiveFilesStorageInfoOptions& opts,
1552 std::vector<LiveFileStorageInfo>* files) = 0;
1553
1554 // Obtains the LSM-tree meta data of the specified column family of the DB,
1555 // including metadata for each live table (SST) file in that column family.
7c673cae
FG
1556 virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
1557 ColumnFamilyMetaData* /*metadata*/) {}
1558
1559 // Get the metadata of the default column family.
494da23a 1560 void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
7c673cae
FG
1561 GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
1562 }
1563
1e59de90
TL
1564 // Obtains the LSM-tree meta data of all column families of the DB, including
1565 // metadata for each live table (SST) file and each blob file in the DB.
1566 virtual void GetAllColumnFamilyMetaData(
1567 std::vector<ColumnFamilyMetaData>* /*metadata*/) {}
1568
1569 // Retrieve the list of all files in the database except WAL files. The files
1570 // are relative to the dbname (or db_paths/cf_paths), not absolute paths.
1571 // (Not recommended with db_paths/cf_paths because that information is not
1572 // returned.) Despite being relative paths, the file names begin with "/".
1573 // The valid size of the manifest file is returned in manifest_file_size.
1574 // The manifest file is an ever growing file, but only the portion specified
1575 // by manifest_file_size is valid for this snapshot. Setting flush_memtable
1576 // to true does Flush before recording the live files (unless DB is
1577 // read-only). Setting flush_memtable to false is useful when we don't want
1578 // to wait for flush which may have to wait for compaction to complete
1579 // taking an indeterminate time.
1580 //
1581 // NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate
1582 // a lossless backup, GetLiveFilesStorageInfo() is strongly recommended
1583 // instead, because it ensures a single consistent view of all files is
1584 // captured in one call.
1585 virtual Status GetLiveFiles(std::vector<std::string>&,
1586 uint64_t* manifest_file_size,
1587 bool flush_memtable = true) = 0;
1588
1589 // Retrieve the sorted list of all wal files with earliest file first
1590 virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
1591
1592 // Retrieve information about the current wal file
1593 //
1594 // Note that the log might have rolled after this call in which case
1595 // the current_log_file would not point to the current log file.
1596 //
1597 // Additionally, for the sake of optimization current_log_file->StartSequence
1598 // would always be set to 0
1599 virtual Status GetCurrentWalFile(
1600 std::unique_ptr<LogFile>* current_log_file) = 0;
1601
7c673cae 1602 // IngestExternalFile() will load a list of external SST files (1) into the DB
11fdf7f2
TL
1603 // Two primary modes are supported:
1604 // - Duplicate keys in the new files will overwrite existing keys (default)
1605 // - Duplicate keys will be skipped (set ingest_behind=true)
1606 // In the first mode we will try to find the lowest possible level that
1607 // the file can fit in, and ingest the file into this level (2). A file that
1608 // has a key range that overlaps with the memtable key range will require us
1609 // to Flush the memtable first before ingesting the file.
1610 // In the second mode we will always ingest in the bottom most level (see
1611 // docs to IngestExternalFileOptions::ingest_behind).
7c673cae
FG
1612 //
1613 // (1) External SST files can be created using SstFileWriter
1614 // (2) We will try to ingest the files to the lowest possible level
11fdf7f2
TL
1615 // even if the file compression doesn't match the level compression
1616 // (3) If IngestExternalFileOptions->ingest_behind is set to true,
1617 // we always ingest at the bottommost level, which should be reserved
1618 // for this purpose (see DBOptions::allow_ingest_behind flag).
1e59de90
TL
1619 // (4) If IngestExternalFileOptions->fail_if_not_bottommost_level is set to
1620 // true, then this method can return Status::TryAgain() indicating that
1621 // the files cannot be ingested to the bottommost level, and it is the
1622 // user's responsibility to clear the bottommost level in the overlapping
1623 // range before re-attempting the ingestion.
7c673cae
FG
1624 virtual Status IngestExternalFile(
1625 ColumnFamilyHandle* column_family,
1626 const std::vector<std::string>& external_files,
1627 const IngestExternalFileOptions& options) = 0;
1628
1629 virtual Status IngestExternalFile(
1630 const std::vector<std::string>& external_files,
1631 const IngestExternalFileOptions& options) {
1632 return IngestExternalFile(DefaultColumnFamily(), external_files, options);
1633 }
1634
494da23a
TL
1635 // IngestExternalFiles() will ingest files for multiple column families, and
1636 // record the result atomically to the MANIFEST.
1637 // If this function returns OK, all column families' ingestion must succeed.
1638 // If this function returns NOK, or the process crashes, then none of the
1639 // files will be ingested into the database after recovery.
1640 // Note that it is possible for application to observe a mixed state during
1641 // the execution of this function. If the user performs range scan over the
1642 // column families with iterators, iterator on one column family may return
1643 // ingested data, while iterator on other column family returns old data.
1644 // Users can use snapshot for a consistent view of data.
1645 // If your db ingests multiple SST files using this API, i.e. args.size()
1646 // > 1, then RocksDB 5.15 and earlier will not be able to open it.
1647 //
1648 // REQUIRES: each arg corresponds to a different column family: namely, for
1649 // 0 <= i < j < len(args), args[i].column_family != args[j].column_family.
1650 virtual Status IngestExternalFiles(
1651 const std::vector<IngestExternalFileArg>& args) = 0;
1652
f67539c2
TL
1653 // CreateColumnFamilyWithImport() will create a new column family with
1654 // column_family_name and import external SST files specified in metadata into
1655 // this column family.
1656 // (1) External SST files can be created using SstFileWriter.
1657 // (2) External SST files can be exported from a particular column family in
1e59de90 1658 // an existing DB using Checkpoint::ExportColumnFamily.
f67539c2
TL
1659 // Option in import_options specifies whether the external files are copied or
1660 // moved (default is copy). When option specifies copy, managing files at
1661 // external_file_path is caller's responsibility. When option specifies a
1e59de90
TL
1662 // move, the call makes a best effort to delete the specified files at
1663 // external_file_path on successful return, logging any failure to delete
1664 // rather than returning in Status. Files are not modified on any error
1665 // return, and a best effort is made to remove any newly-created files.
f67539c2
TL
1666 // On error return, column family handle returned will be nullptr.
1667 // ColumnFamily will be present on successful return and will not be present
1668 // on error return. ColumnFamily may be present on any crash during this call.
1669 virtual Status CreateColumnFamilyWithImport(
1670 const ColumnFamilyOptions& options, const std::string& column_family_name,
1671 const ImportColumnFamilyOptions& import_options,
1672 const ExportImportFilesMetaData& metadata,
1673 ColumnFamilyHandle** handle) = 0;
1674
20effc67
TL
1675 // Verify the checksums of files in db. Currently the whole-file checksum of
1676 // table files are checked.
1677 virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) {
1678 return Status::NotSupported("File verification not supported");
1679 }
1680
1681 // Verify the block checksums of files in db. The block checksums of table
1682 // files are checked.
f67539c2
TL
1683 virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
1684
1685 virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
11fdf7f2 1686
7c673cae
FG
1687#endif // ROCKSDB_LITE
1688
f67539c2
TL
1689 // Returns the unique ID which is read from IDENTITY file during the opening
1690 // of database by setting in the identity variable
1691 // Returns Status::OK if identity could be set properly
7c673cae
FG
1692 virtual Status GetDbIdentity(std::string& identity) const = 0;
1693
20effc67
TL
1694 // Return a unique identifier for each DB object that is opened
1695 // This DB session ID should be unique among all open DB instances on all
1696 // hosts, and should be unique among re-openings of the same or other DBs.
1697 // (Two open DBs have the same identity from other function GetDbIdentity when
1698 // one is physically copied from the other.)
1699 virtual Status GetDbSessionId(std::string& session_id) const = 0;
1700
7c673cae
FG
1701 // Returns default column family handle
1702 virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
1703
1704#ifndef ROCKSDB_LITE
1e59de90 1705
7c673cae
FG
1706 virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
1707 TablePropertiesCollection* props) = 0;
1708 virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
1709 return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
1710 }
1711 virtual Status GetPropertiesOfTablesInRange(
1712 ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
1713 TablePropertiesCollection* props) = 0;
11fdf7f2
TL
1714
1715 virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
1716 const Slice* /*begin*/,
1717 const Slice* /*end*/) {
1718 return Status::NotSupported("SuggestCompactRange() is not implemented.");
1719 }
1720
1721 virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/,
1722 int /*target_level*/) {
1723 return Status::NotSupported("PromoteL0() is not implemented.");
1724 }
1725
1726 // Trace DB operations. Use EndTrace() to stop tracing.
1727 virtual Status StartTrace(const TraceOptions& /*options*/,
1728 std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
1729 return Status::NotSupported("StartTrace() is not implemented.");
1730 }
1731
1732 virtual Status EndTrace() {
1733 return Status::NotSupported("EndTrace() is not implemented.");
1734 }
f67539c2 1735
20effc67 1736 // IO Tracing operations. Use EndIOTrace() to stop tracing.
1e59de90 1737 virtual Status StartIOTrace(const TraceOptions& /*options*/,
20effc67
TL
1738 std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
1739 return Status::NotSupported("StartIOTrace() is not implemented.");
1740 }
1741
1742 virtual Status EndIOTrace() {
1743 return Status::NotSupported("EndIOTrace() is not implemented.");
1744 }
1745
f67539c2
TL
1746 // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
1747 virtual Status StartBlockCacheTrace(
1e59de90 1748 const TraceOptions& /*trace_options*/,
f67539c2
TL
1749 std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
1750 return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
1751 }
1752
1e59de90
TL
1753 virtual Status StartBlockCacheTrace(
1754 const BlockCacheTraceOptions& /*options*/,
1755 std::unique_ptr<BlockCacheTraceWriter>&& /*trace_writer*/) {
1756 return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
1757 }
1758
f67539c2
TL
1759 virtual Status EndBlockCacheTrace() {
1760 return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
1761 }
1e59de90
TL
1762
1763 // Create a default trace replayer.
1764 virtual Status NewDefaultReplayer(
1765 const std::vector<ColumnFamilyHandle*>& /*handles*/,
1766 std::unique_ptr<TraceReader>&& /*reader*/,
1767 std::unique_ptr<Replayer>* /*replayer*/) {
1768 return Status::NotSupported("NewDefaultReplayer() is not implemented.");
1769 }
1770
7c673cae
FG
1771#endif // ROCKSDB_LITE
1772
1773 // Needed for StackableDB
1774 virtual DB* GetRootDB() { return this; }
1775
f67539c2
TL
1776 // Given a window [start_time, end_time), setup a StatsHistoryIterator
1777 // to access stats history. Note the start_time and end_time are epoch
1778 // time measured in seconds, and end_time is an exclusive bound.
494da23a
TL
1779 virtual Status GetStatsHistory(
1780 uint64_t /*start_time*/, uint64_t /*end_time*/,
1781 std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) {
1782 return Status::NotSupported("GetStatsHistory() is not implemented.");
1783 }
1784
1785#ifndef ROCKSDB_LITE
1786 // Make the secondary instance catch up with the primary by tailing and
1787 // replaying the MANIFEST and WAL of the primary.
1788 // Column families created by the primary after the secondary instance starts
1789 // will be ignored unless the secondary instance closes and restarts with the
1790 // newly created column families.
1791 // Column families that exist before secondary instance starts and dropped by
1792 // the primary afterwards will be marked as dropped. However, as long as the
1793 // secondary instance does not delete the corresponding column family
1794 // handles, the data of the column family is still accessible to the
1795 // secondary.
494da23a
TL
1796 virtual Status TryCatchUpWithPrimary() {
1797 return Status::NotSupported("Supported only by secondary instance");
1798 }
1799#endif // !ROCKSDB_LITE
7c673cae
FG
1800};
1801
1e59de90
TL
1802// Overloaded operators for enum class SizeApproximationFlags.
1803inline DB::SizeApproximationFlags operator&(DB::SizeApproximationFlags lhs,
1804 DB::SizeApproximationFlags rhs) {
1805 return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) &
1806 static_cast<uint8_t>(rhs));
1807}
1808inline DB::SizeApproximationFlags operator|(DB::SizeApproximationFlags lhs,
1809 DB::SizeApproximationFlags rhs) {
1810 return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) |
1811 static_cast<uint8_t>(rhs));
1812}
1813
1814inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family,
1815 const Range* ranges, int n,
1816 uint64_t* sizes,
1817 SizeApproximationFlags include_flags) {
1818 SizeApproximationOptions options;
1819 options.include_memtables =
1820 ((include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) !=
1821 SizeApproximationFlags::NONE);
1822 options.include_files =
1823 ((include_flags & SizeApproximationFlags::INCLUDE_FILES) !=
1824 SizeApproximationFlags::NONE);
1825 return GetApproximateSizes(options, column_family, ranges, n, sizes);
1826}
1827
7c673cae
FG
1828// Destroy the contents of the specified database.
1829// Be very careful using this method.
11fdf7f2
TL
1830Status DestroyDB(const std::string& name, const Options& options,
1831 const std::vector<ColumnFamilyDescriptor>& column_families =
494da23a 1832 std::vector<ColumnFamilyDescriptor>());
7c673cae
FG
1833
1834#ifndef ROCKSDB_LITE
1835// If a DB cannot be opened, you may attempt to call this method to
1836// resurrect as much of the contents of the database as possible.
1837// Some data may be lost, so be careful when calling this function
1838// on a database that contains important information.
1839//
1840// With this API, we will warn and skip data associated with column families not
1841// specified in column_families.
1842//
1843// @param column_families Descriptors for known column families
1844Status RepairDB(const std::string& dbname, const DBOptions& db_options,
1845 const std::vector<ColumnFamilyDescriptor>& column_families);
1846
1847// @param unknown_cf_opts Options for column families encountered during the
1848// repair that were not specified in column_families.
1849Status RepairDB(const std::string& dbname, const DBOptions& db_options,
1850 const std::vector<ColumnFamilyDescriptor>& column_families,
1851 const ColumnFamilyOptions& unknown_cf_opts);
1852
1853// @param options These options will be used for the database and for ALL column
1854// families encountered during the repair
1855Status RepairDB(const std::string& dbname, const Options& options);
1856
1857#endif
1858
f67539c2 1859} // namespace ROCKSDB_NAMESPACE