X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=ceph%2Fsrc%2Frocksdb%2Finclude%2Frocksdb%2Fdb.h;fp=ceph%2Fsrc%2Frocksdb%2Finclude%2Frocksdb%2Fdb.h;h=840e1b4ecf10edaeb64a590e1b828011a3b595ff;hb=20effc670b57271cb089376d6d0800990e5218d5;hp=3108003f1f5d60f37ecf6474dd82ebc1798919e0;hpb=a71831dadd1e1f3e0fa70405511f65cc33db0498;p=ceph.git diff --git a/ceph/src/rocksdb/include/rocksdb/db.h b/ceph/src/rocksdb/include/rocksdb/db.h index 3108003f1..840e1b4ec 100644 --- a/ceph/src/rocksdb/include/rocksdb/db.h +++ b/ceph/src/rocksdb/include/rocksdb/db.h @@ -111,10 +111,16 @@ struct RangePtr { RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {} }; +// It is valid that files_checksums and files_checksum_func_names are both +// empty (no checksum information is provided for ingestion). Otherwise, +// their sizes should be the same as external_files. The file order should +// be the same in three vectors and guaranteed by the caller. struct IngestExternalFileArg { ColumnFamilyHandle* column_family = nullptr; std::vector external_files; IngestExternalFileOptions options; + std::vector files_checksums; + std::vector files_checksum_func_names; }; struct GetMergeOperandsOptions { @@ -127,9 +133,11 @@ struct GetMergeOperandsOptions { typedef std::unordered_map> TablePropertiesCollection; -// A DB is a persistent ordered map from keys to values. +// A DB is a persistent, versioned ordered map from keys to values. // A DB is safe for concurrent access from multiple threads without // any external synchronization. +// DB is an abstract base class with one primary implementation (DBImpl) +// and a number of wrapper implementations. class DB { public: // Open the database with the specified "name". @@ -149,7 +157,7 @@ class DB { // return Status::NotSupported. 
static Status OpenForReadOnly(const Options& options, const std::string& name, DB** dbptr, - bool error_if_log_file_exist = false); + bool error_if_wal_file_exists = false); // Open the database for read only with column families. When opening DB with // read only, you can specify only a subset of column families in the @@ -163,7 +171,7 @@ class DB { const DBOptions& db_options, const std::string& name, const std::vector& column_families, std::vector* handles, DB** dbptr, - bool error_if_log_file_exist = false); + bool error_if_wal_file_exists = false); // The following OpenAsSecondary functions create a secondary instance that // can dynamically tail the MANIFEST of a primary that must have already been @@ -255,6 +263,7 @@ class DB { const std::string& name, std::vector* column_families); + // Abstract class ctor DB() {} // No copying allowed DB(const DB&) = delete; @@ -353,8 +362,11 @@ class DB { // Removes the database entries in the range ["begin_key", "end_key"), i.e., // including "begin_key" and excluding "end_key". Returns OK on success, and - // a non-OK status on error. It is not an error if no keys exist in the range - // ["begin_key", "end_key"). + // a non-OK status on error. It is not an error if the database does not + // contain any existing data in the range ["begin_key", "end_key"). + // + // If "end_key" comes before "begin_key" according to the user's comparator, + // a `Status::InvalidArgument` is returned. // // This feature is now usable in production, with the following caveats: // 1) Accumulating many range tombstones in the memtable will degrade read @@ -388,6 +400,9 @@ class DB { // If the database contains an entry for "key" store the // corresponding value in *value and return OK. // + // If timestamp is enabled and a non-null timestamp pointer is passed in, + // timestamp is returned. + // // If there is no entry for "key" leave *value unchanged and return // a status for which Status::IsNotFound() returns true. 
// @@ -412,6 +427,32 @@ class DB { return Get(options, DefaultColumnFamily(), key, value); } + // Get() methods that return timestamp. Derived DB classes don't need to worry + // about this group of methods if they don't care about timestamp feature. + virtual inline Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, std::string* timestamp) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(options, column_family, key, &pinnable_val, timestamp); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; + } + virtual Status Get(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, PinnableSlice* /*value*/, + std::string* /*timestamp*/) { + return Status::NotSupported( + "Get() that returns timestamp is not implemented."); + } + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value, std::string* timestamp) { + return Get(options, DefaultColumnFamily(), key, value, timestamp); + } + // Returns all the merge operands corresponding to the key. If the // number of merge operands in DB is greater than // merge_operands_options.expected_max_number_of_operands @@ -428,6 +469,11 @@ class DB { GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) = 0; + // Consistent Get of many keys across column families without the need + // for an explicit snapshot. NOTE: the implementation of this MultiGet API + // does not have the performance benefits of the void-returning MultiGet + // functions. + // // If keys[i] does not exist in the database, then the i'th returned // status will be one for which Status::IsNotFound() is true, and // (*values)[i] will be set to some arbitrary value (often ""). 
Otherwise, @@ -451,6 +497,25 @@ class DB { keys, values); } + virtual std::vector MultiGet( + const ReadOptions& /*options*/, + const std::vector& /*column_family*/, + const std::vector& keys, std::vector* /*values*/, + std::vector* /*timestamps*/) { + return std::vector( + keys.size(), Status::NotSupported( + "MultiGet() returning timestamps not implemented.")); + } + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values, + std::vector* timestamps) { + return MultiGet( + options, + std::vector(keys.size(), DefaultColumnFamily()), + keys, values, timestamps); + } + // Overloaded MultiGet API that improves performance by batching operations // in the read path for greater efficiency. Currently, only the block based // table format with full filters are supported. Other table formats such @@ -492,6 +557,30 @@ class DB { } } + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + std::vector tss; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_family); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals, &tss); + std::copy(status.begin(), status.end(), statuses); + std::copy(tss.begin(), tss.end(), timestamps); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + // Overloaded MultiGet API that improves performance by batching operations // in the read path for greater efficiency. Currently, only the block based // table format with full filters are supported. 
Other table formats such @@ -531,6 +620,28 @@ class DB { values++; } } + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + std::vector tss; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_families[i]); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals, &tss); + std::copy(status.begin(), status.end(), statuses); + std::copy(tss.begin(), tss.end(), timestamps); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } // If the key definitely does not exist in the database, then this method // returns false, else true. If the caller wants to obtain value when the key @@ -542,17 +653,33 @@ class DB { virtual bool KeyMayExist(const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, std::string* /*value*/, + std::string* /*timestamp*/, bool* value_found = nullptr) { if (value_found != nullptr) { *value_found = false; } return true; } + + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr) { + return KeyMayExist(options, column_family, key, value, + /*timestamp=*/nullptr, value_found); + } + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, std::string* value, bool* value_found = nullptr) { return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found); } + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, + std::string* value, std::string* timestamp, + bool* value_found = nullptr) { + return KeyMayExist(options, DefaultColumnFamily(), key, value, timestamp, + value_found); + } + // Return a heap-allocated iterator over the 
contents of the database. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). @@ -883,32 +1010,33 @@ class DB { }; // For each i in [0,n-1], store in "sizes[i]", the approximate - // file system space used by keys in "[range[i].start .. range[i].limit)". + // file system space used by keys in "[range[i].start .. range[i].limit)" + // in a single column family. // // Note that the returned sizes measure file system space usage, so // if the user data compresses by a factor of ten, the returned // sizes will be one-tenth the size of the corresponding user data size. virtual Status GetApproximateSizes(const SizeApproximationOptions& options, ColumnFamilyHandle* column_family, - const Range* range, int n, + const Range* ranges, int n, uint64_t* sizes) = 0; // Simpler versions of the GetApproximateSizes() method above. // The include_flags argumenbt must of type DB::SizeApproximationFlags // and can not be NONE. virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, + const Range* ranges, int n, uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) { SizeApproximationOptions options; options.include_memtabtles = (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0; options.include_files = (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0; - GetApproximateSizes(options, column_family, range, n, sizes); + GetApproximateSizes(options, column_family, ranges, n, sizes); } - virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes, + virtual void GetApproximateSizes(const Range* ranges, int n, uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) { - GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); + GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes, include_flags); } // The method is similar to GetApproximateSizes, except it @@ -1029,7 +1157,8 @@ class DB { 
// This function will wait until all currently running background processes // finish. After it returns, no background process will be run until - // ContinueBackgroundWork is called + // ContinueBackgroundWork is called, once for each preceding OK-returning + // call to PauseBackgroundWork. virtual Status PauseBackgroundWork() = 0; virtual Status ContinueBackgroundWork() = 0; @@ -1137,8 +1266,6 @@ class DB { // updated, false if user attempted to call if with seqnum <= current value. virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0; -#ifndef ROCKSDB_LITE - // Prevent file deletions. Compactions will continue to occur, // but no obsolete files will be deleted. Calling this multiple // times have the same effect as calling it once. @@ -1155,6 +1282,7 @@ class DB { // threads call EnableFileDeletions() virtual Status EnableFileDeletions(bool force = true) = 0; +#ifndef ROCKSDB_LITE // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup // Retrieve the list of all files in the database. The files are @@ -1216,6 +1344,14 @@ class DB { // Windows API macro interference #undef DeleteFile + // WARNING: This API is planned for removal in RocksDB 7.0 since it does not + // operate at the proper level of abstraction for a key-value store, and its + // contract/restrictions are poorly documented. For example, it returns non-OK + // `Status` for non-bottommost files and files undergoing compaction. Since we + // do not plan to maintain it, the contract will likely remain underspecified + // until its removal. Any user is encouraged to read the implementation + // carefully and migrate away from it when possible. + // // Delete the file name from the db directory and update the internal state to // reflect that. Supports deletion of sst and log files only. 'name' must be // path relative to the db directory. eg. 
000001.sst, /archive/000003.log @@ -1226,6 +1362,11 @@ class DB { virtual void GetLiveFilesMetaData( std::vector* /*metadata*/) {} + // Return a list of all table file checksum info. + // Note: This function might be of limited use because it cannot be + // synchronized with GetLiveFiles. + virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0; + // Obtains the meta data of the specified column family of the DB. virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, ColumnFamilyMetaData* /*metadata*/) {} @@ -1302,6 +1443,14 @@ class DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) = 0; + // Verify the checksums of files in db. Currently the whole-file checksum of + // table files are checked. + virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { + return Status::NotSupported("File verification not supported"); + } + + // Verify the block checksums of files in db. The block checksums of table + // files are checked. virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } @@ -1415,6 +1564,13 @@ class DB { // Returns Status::OK if identity could be set properly virtual Status GetDbIdentity(std::string& identity) const = 0; + // Return a unique identifier for each DB object that is opened + // This DB session ID should be unique among all open DB instances on all + // hosts, and should be unique among re-openings of the same or other DBs. + // (Two open DBs have the same identity from other function GetDbIdentity when + // one is physically copied from the other.) + virtual Status GetDbSessionId(std::string& session_id) const = 0; + // Returns default column family handle virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0; @@ -1449,6 +1605,16 @@ class DB { return Status::NotSupported("EndTrace() is not implemented."); } + // IO Tracing operations. Use EndIOTrace() to stop tracing. 
+ virtual Status StartIOTrace(Env* /*env*/, const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartIOTrace() is not implemented."); + } + + virtual Status EndIOTrace() { + return Status::NotSupported("EndIOTrace() is not implemented."); + } + // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. virtual Status StartBlockCacheTrace( const TraceOptions& /*options*/,