]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/include/rocksdb/write_batch.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / include / rocksdb / write_batch.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6// Use of this source code is governed by a BSD-style license that can be
7// found in the LICENSE file. See the AUTHORS file for names of contributors.
8//
9// WriteBatch holds a collection of updates to apply atomically to a DB.
10//
11// The updates are applied in the order in which they are added
12// to the WriteBatch. For example, the value of "key" will be "v3"
13// after the following batch is written:
14//
15// batch.Put("key", "v1");
16// batch.Delete("key");
17// batch.Put("key", "v2");
18// batch.Put("key", "v3");
19//
20// Multiple threads can invoke const methods on a WriteBatch without
21// external synchronization, but if any of the threads may call a
22// non-const method, all threads accessing the same WriteBatch must use
23// external synchronization.
24
11fdf7f2 25#pragma once
7c673cae 26
494da23a 27#include <stdint.h>
1e59de90 28
7c673cae 29#include <atomic>
1e59de90 30#include <functional>
f67539c2 31#include <memory>
7c673cae 32#include <string>
f67539c2 33#include <vector>
1e59de90 34
7c673cae
FG
35#include "rocksdb/status.h"
36#include "rocksdb/write_batch_base.h"
37
f67539c2 38namespace ROCKSDB_NAMESPACE {
7c673cae
FG
39
40class Slice;
41class ColumnFamilyHandle;
42struct SavePoints;
43struct SliceParts;
44
// Snapshot of a WriteBatch's serialized state: the byte size of rep_, the
// number of entries it holds, and the content flags at that moment.
struct SavePoint {
  size_t size;             // size of rep_
  int count;               // count of elements in rep_
  uint32_t content_flags;  // content flags recorded with this save point

  // A default-constructed save point is in the "cleared" state.
  SavePoint() : SavePoint(0, 0, 0) {}

  SavePoint(size_t _size, int _count, uint32_t _flags)
      : size(_size), count(_count), content_flags(_flags) {}

  // Reset every field back to zero.
  void clear() { *this = SavePoint(); }

  // True only when all three fields are zero.
  bool is_cleared() const {
    return size == 0 && count == 0 && content_flags == 0;
  }
};
63
64class WriteBatch : public WriteBatchBase {
65 public:
1e59de90
TL
66 explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0)
67 : WriteBatch(reserved_bytes, max_bytes, 0, 0) {}
68
69 // `protection_bytes_per_key` is the number of bytes used to store
70 // protection information for each key entry. Currently supported values are
71 // zero (disabled) and eight.
72 explicit WriteBatch(size_t reserved_bytes, size_t max_bytes,
73 size_t protection_bytes_per_key, size_t default_cf_ts_sz);
11fdf7f2 74 ~WriteBatch() override;
7c673cae
FG
75
76 using WriteBatchBase::Put;
77 // Store the mapping "key->value" in the database.
1e59de90
TL
78 // The following Put(..., const Slice& key, ...) API can also be used when
79 // user-defined timestamp is enabled as long as `key` points to a contiguous
80 // buffer with timestamp appended after user key. The caller is responsible
81 // for setting up the memory buffer pointed to by `key`.
7c673cae
FG
82 Status Put(ColumnFamilyHandle* column_family, const Slice& key,
83 const Slice& value) override;
84 Status Put(const Slice& key, const Slice& value) override {
85 return Put(nullptr, key, value);
86 }
1e59de90
TL
87 Status Put(ColumnFamilyHandle* column_family, const Slice& key,
88 const Slice& ts, const Slice& value) override;
7c673cae
FG
89
90 // Variant of Put() that gathers output like writev(2). The key and value
11fdf7f2 91 // that will be written to the database are concatenations of arrays of
7c673cae 92 // slices.
1e59de90
TL
93 // The following Put(..., const SliceParts& key, ...) API can be used when
94 // user-defined timestamp is enabled as long as the timestamp is the last
95 // Slice in `key`, a SliceParts (array of Slices). The caller is responsible
96 // for setting up the `key` SliceParts object.
7c673cae
FG
97 Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
98 const SliceParts& value) override;
99 Status Put(const SliceParts& key, const SliceParts& value) override {
100 return Put(nullptr, key, value);
101 }
102
1e59de90
TL
103 // Store the mapping "key->{column1:value1, column2:value2, ...}" in the
104 // column family specified by "column_family".
105 using WriteBatchBase::PutEntity;
106 Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key,
107 const WideColumns& columns) override;
108
7c673cae
FG
109 using WriteBatchBase::Delete;
110 // If the database contains a mapping for "key", erase it. Else do nothing.
1e59de90
TL
111 // The following Delete(..., const Slice& key) can be used when user-defined
112 // timestamp is enabled as long as `key` points to a contiguous buffer with
113 // timestamp appended after user key. The caller is responsible for setting
114 // up the memory buffer pointed to by `key`.
7c673cae
FG
115 Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
116 Status Delete(const Slice& key) override { return Delete(nullptr, key); }
1e59de90
TL
117 Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
118 const Slice& ts) override;
7c673cae
FG
119
120 // variant that takes SliceParts
1e59de90
TL
121 // These two variants of Delete(..., const SliceParts& key) can be used when
122 // user-defined timestamp is enabled as long as the timestamp is the last
123 // Slice in `key`, a SliceParts (array of Slices). The caller is responsible
124 // for setting up the `key` SliceParts object.
7c673cae
FG
125 Status Delete(ColumnFamilyHandle* column_family,
126 const SliceParts& key) override;
127 Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
128
129 using WriteBatchBase::SingleDelete;
130 // WriteBatch implementation of DB::SingleDelete(). See db.h.
131 Status SingleDelete(ColumnFamilyHandle* column_family,
132 const Slice& key) override;
133 Status SingleDelete(const Slice& key) override {
134 return SingleDelete(nullptr, key);
135 }
1e59de90
TL
136 Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
137 const Slice& ts) override;
7c673cae
FG
138
139 // variant that takes SliceParts
140 Status SingleDelete(ColumnFamilyHandle* column_family,
141 const SliceParts& key) override;
142 Status SingleDelete(const SliceParts& key) override {
143 return SingleDelete(nullptr, key);
144 }
145
146 using WriteBatchBase::DeleteRange;
147 // WriteBatch implementation of DB::DeleteRange(). See db.h.
148 Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key,
149 const Slice& end_key) override;
150 Status DeleteRange(const Slice& begin_key, const Slice& end_key) override {
151 return DeleteRange(nullptr, begin_key, end_key);
152 }
1e59de90
TL
153 // begin_key and end_key should be user keys without timestamp.
154 Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key,
155 const Slice& end_key, const Slice& ts) override;
7c673cae
FG
156
157 // variant that takes SliceParts
158 Status DeleteRange(ColumnFamilyHandle* column_family,
159 const SliceParts& begin_key,
160 const SliceParts& end_key) override;
161 Status DeleteRange(const SliceParts& begin_key,
162 const SliceParts& end_key) override {
163 return DeleteRange(nullptr, begin_key, end_key);
164 }
165
166 using WriteBatchBase::Merge;
167 // Merge "value" with the existing value of "key" in the database.
168 // "key->merge(existing, value)"
169 Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
170 const Slice& value) override;
171 Status Merge(const Slice& key, const Slice& value) override {
172 return Merge(nullptr, key, value);
173 }
1e59de90
TL
174 Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
175 const Slice& /*ts*/, const Slice& /*value*/) override;
7c673cae
FG
176
177 // variant that takes SliceParts
178 Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
179 const SliceParts& value) override;
180 Status Merge(const SliceParts& key, const SliceParts& value) override {
181 return Merge(nullptr, key, value);
182 }
183
184 using WriteBatchBase::PutLogData;
185 // Append a blob of arbitrary size to the records in this batch. The blob will
186 // be stored in the transaction log but not in any other file. In particular,
187 // it will not be persisted to the SST files. When iterating over this
188 // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
189 // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
11fdf7f2 190 // encountered in the same order in which they were inserted. The blob will
7c673cae
FG
191 // NOT consume sequence number(s) and will NOT increase the count of the batch
192 //
193 // Example application: add timestamps to the transaction log for use in
194 // replication.
195 Status PutLogData(const Slice& blob) override;
196
197 using WriteBatchBase::Clear;
198 // Clear all updates buffered in this batch.
199 void Clear() override;
200
201 // Records the state of the batch for future calls to RollbackToSavePoint().
202 // May be called multiple times to set multiple save points.
203 void SetSavePoint() override;
204
205 // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the
206 // most recent call to SetSavePoint() and removes the most recent save point.
207 // If there is no previous call to SetSavePoint(), Status::NotFound()
208 // will be returned.
209 // Otherwise returns Status::OK().
210 Status RollbackToSavePoint() override;
211
11fdf7f2
TL
212 // Pop the most recent save point.
213 // If there is no previous call to SetSavePoint(), Status::NotFound()
214 // will be returned.
215 // Otherwise returns Status::OK().
216 Status PopSavePoint() override;
217
7c673cae 218 // Support for iterating over the contents of a batch.
1e59de90 219 // Objects of subclasses of Handler will be used by WriteBatch::Iterate().
7c673cae
FG
220 class Handler {
221 public:
222 virtual ~Handler();
223 // All handler functions in this class provide default implementations so
224 // we won't break existing clients of Handler on a source code level when
225 // adding a new member function.
226
227 // default implementation will just call Put without column family for
228 // backwards compatibility. If the column family is not default,
229 // the function is noop
1e59de90 230 // If user-defined timestamp is enabled, then `key` includes timestamp.
7c673cae
FG
231 virtual Status PutCF(uint32_t column_family_id, const Slice& key,
232 const Slice& value) {
233 if (column_family_id == 0) {
234 // Put() historically doesn't return status. We didn't want to be
235 // backwards incompatible so we didn't change the return status
236 // (this is a public API). We do an ordinary get and return Status::OK()
237 Put(key, value);
238 return Status::OK();
239 }
240 return Status::InvalidArgument(
241 "non-default column family and PutCF not implemented");
242 }
1e59de90 243 // If user-defined timestamp is enabled, then `key` includes timestamp.
7c673cae
FG
244 virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {}
245
1e59de90
TL
246 // If user-defined timestamp is enabled, then `key` includes timestamp.
247 virtual Status PutEntityCF(uint32_t /* column_family_id */,
248 const Slice& /* key */,
249 const Slice& /* entity */) {
250 return Status::NotSupported("PutEntityCF not implemented");
251 }
252
253 // If user-defined timestamp is enabled, then `key` includes timestamp.
7c673cae
FG
254 virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
255 if (column_family_id == 0) {
256 Delete(key);
257 return Status::OK();
258 }
259 return Status::InvalidArgument(
260 "non-default column family and DeleteCF not implemented");
261 }
1e59de90 262 // If user-defined timestamp is enabled, then `key` includes timestamp.
7c673cae
FG
263 virtual void Delete(const Slice& /*key*/) {}
264
1e59de90 265 // If user-defined timestamp is enabled, then `key` includes timestamp.
7c673cae
FG
266 virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) {
267 if (column_family_id == 0) {
268 SingleDelete(key);
269 return Status::OK();
270 }
271 return Status::InvalidArgument(
272 "non-default column family and SingleDeleteCF not implemented");
273 }
1e59de90 274 // If user-defined timestamp is enabled, then `key` includes timestamp.
7c673cae
FG
275 virtual void SingleDelete(const Slice& /*key*/) {}
276
1e59de90
TL
277 // If user-defined timestamp is enabled, then `begin_key` and `end_key`
278 // both include timestamp.
11fdf7f2
TL
279 virtual Status DeleteRangeCF(uint32_t /*column_family_id*/,
280 const Slice& /*begin_key*/,
281 const Slice& /*end_key*/) {
7c673cae
FG
282 return Status::InvalidArgument("DeleteRangeCF not implemented");
283 }
284
1e59de90 285 // If user-defined timestamp is enabled, then `key` includes timestamp.
7c673cae
FG
286 virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
287 const Slice& value) {
288 if (column_family_id == 0) {
289 Merge(key, value);
290 return Status::OK();
291 }
292 return Status::InvalidArgument(
293 "non-default column family and MergeCF not implemented");
294 }
1e59de90 295 // If user-defined timestamp is enabled, then `key` includes timestamp.
7c673cae
FG
296 virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {}
297
1e59de90 298 // If user-defined timestamp is enabled, then `key` includes timestamp.
11fdf7f2
TL
299 virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/,
300 const Slice& /*key*/,
301 const Slice& /*value*/) {
302 return Status::InvalidArgument("PutBlobIndexCF not implemented");
303 }
304
7c673cae
FG
305 // The default implementation of LogData does nothing.
306 virtual void LogData(const Slice& blob);
307
11fdf7f2 308 virtual Status MarkBeginPrepare(bool = false) {
7c673cae
FG
309 return Status::InvalidArgument("MarkBeginPrepare() handler not defined.");
310 }
311
11fdf7f2 312 virtual Status MarkEndPrepare(const Slice& /*xid*/) {
7c673cae
FG
313 return Status::InvalidArgument("MarkEndPrepare() handler not defined.");
314 }
315
11fdf7f2
TL
316 virtual Status MarkNoop(bool /*empty_batch*/) {
317 return Status::InvalidArgument("MarkNoop() handler not defined.");
318 }
319
320 virtual Status MarkRollback(const Slice& /*xid*/) {
7c673cae
FG
321 return Status::InvalidArgument(
322 "MarkRollbackPrepare() handler not defined.");
323 }
324
11fdf7f2 325 virtual Status MarkCommit(const Slice& /*xid*/) {
7c673cae
FG
326 return Status::InvalidArgument("MarkCommit() handler not defined.");
327 }
328
1e59de90
TL
329 virtual Status MarkCommitWithTimestamp(const Slice& /*xid*/,
330 const Slice& /*commit_ts*/) {
331 return Status::InvalidArgument(
332 "MarkCommitWithTimestamp() handler not defined.");
333 }
334
7c673cae
FG
335 // Continue is called by WriteBatch::Iterate. If it returns false,
336 // iteration is halted. Otherwise, it continues iterating. The default
337 // implementation always returns true.
338 virtual bool Continue();
11fdf7f2
TL
339
340 protected:
f67539c2 341 friend class WriteBatchInternal;
1e59de90
TL
342 enum class OptionState {
343 kUnknown,
344 kDisabled,
345 kEnabled,
346 };
347 virtual OptionState WriteAfterCommit() const {
348 return OptionState::kUnknown;
349 }
350 virtual OptionState WriteBeforePrepare() const {
351 return OptionState::kUnknown;
352 }
7c673cae
FG
353 };
354 Status Iterate(Handler* handler) const;
355
356 // Retrieve the serialized version of this batch.
357 const std::string& Data() const { return rep_; }
358
359 // Retrieve data size of the batch.
360 size_t GetDataSize() const { return rep_.size(); }
361
362 // Returns the number of updates in the batch
f67539c2 363 uint32_t Count() const;
7c673cae
FG
364
365 // Returns true if PutCF will be called during Iterate
366 bool HasPut() const;
367
1e59de90
TL
368 // Returns true if PutEntityCF will be called during Iterate
369 bool HasPutEntity() const;
370
7c673cae
FG
371 // Returns true if DeleteCF will be called during Iterate
372 bool HasDelete() const;
373
374 // Returns true if SingleDeleteCF will be called during Iterate
375 bool HasSingleDelete() const;
376
377 // Returns true if DeleteRangeCF will be called during Iterate
378 bool HasDeleteRange() const;
379
380 // Returns true if MergeCF will be called during Iterate
381 bool HasMerge() const;
382
383 // Returns true if MarkBeginPrepare will be called during Iterate
384 bool HasBeginPrepare() const;
385
386 // Returns true if MarkEndPrepare will be called during Iterate
387 bool HasEndPrepare() const;
388
1e59de90 389 // Returns true if MarkCommit will be called during Iterate
7c673cae
FG
390 bool HasCommit() const;
391
1e59de90 392 // Returns true if MarkRollback will be called during Iterate
7c673cae
FG
393 bool HasRollback() const;
394
1e59de90
TL
395 // Experimental.
396 //
397 // Update timestamps of existing entries in the write batch if
398 // applicable. If a key is intended for a column family that disables
399 // timestamp, then this API won't set the timestamp for this key.
400 // This requires that all keys, if enable timestamp, (possibly from multiple
401 // column families) in the write batch have timestamps of the same format.
402 //
403 // ts_sz_func: callable object to obtain the timestamp sizes of column
404 // families. If ts_sz_func() accesses data structures, then the caller of this
405 // API must guarantee thread-safety. Like other parts of RocksDB, this API is
406 // not exception-safe. Therefore, ts_sz_func() must not throw.
407 //
408 // in: cf, the column family id.
409 // ret: timestamp size of the given column family. Return
410 // std::numeric_limits<size_t>::max() indicating "don't know or column
411 // family info not found", this will cause UpdateTimestamps() to fail.
412 // size_t ts_sz_func(uint32_t cf);
413 Status UpdateTimestamps(const Slice& ts,
414 std::function<size_t(uint32_t /*cf*/)> ts_sz_func);
415
416 // Verify the per-key-value checksums of this write batch.
417 // Corruption status will be returned if the verification fails.
418 // If this write batch does not have per-key-value checksum,
419 // OK status will be returned.
420 Status VerifyChecksum() const;
f67539c2 421
7c673cae
FG
422 using WriteBatchBase::GetWriteBatch;
423 WriteBatch* GetWriteBatch() override { return this; }
424
425 // Constructor with a serialized string object
426 explicit WriteBatch(const std::string& rep);
11fdf7f2 427 explicit WriteBatch(std::string&& rep);
7c673cae
FG
428
429 WriteBatch(const WriteBatch& src);
11fdf7f2 430 WriteBatch(WriteBatch&& src) noexcept;
7c673cae
FG
431 WriteBatch& operator=(const WriteBatch& src);
432 WriteBatch& operator=(WriteBatch&& src);
433
434 // marks this point in the WriteBatch as the last record to
435 // be inserted into the WAL, provided the WAL is enabled
436 void MarkWalTerminationPoint();
437 const SavePoint& GetWalTerminationPoint() const { return wal_term_point_; }
438
439 void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; }
440
1e59de90
TL
441 struct ProtectionInfo;
442 size_t GetProtectionBytesPerKey() const;
443
7c673cae
FG
444 private:
445 friend class WriteBatchInternal;
446 friend class LocalSavePoint;
11fdf7f2
TL
447 // TODO(myabandeh): this is needed for a hack to collapse the write batch and
448 // remove duplicate keys. Remove it when the hack is replaced with a proper
449 // solution.
450 friend class WriteBatchWithIndex;
f67539c2 451 std::unique_ptr<SavePoints> save_points_;
7c673cae
FG
452
453 // When sending a WriteBatch through WriteImpl we might want to
454 // specify that only the first x records of the batch be written to
455 // the WAL.
456 SavePoint wal_term_point_;
457
1e59de90
TL
458 // Is the content of the batch the application's latest state that meant only
459 // to be used for recovery? Refer to
460 // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for
461 // more details.
462 bool is_latest_persistent_state_ = false;
463
464 // False if all keys are from column families that disable user-defined
465 // timestamp OR UpdateTimestamps() has been called at least once.
466 // This flag will be set to true if any of the above Put(), Delete(),
467 // SingleDelete(), etc. APIs are called at least once.
468 // Calling Put(ts), Delete(ts), SingleDelete(ts), etc. will not set this flag
469 // to true because the assumption is that these APIs have already set the
470 // timestamps to desired values.
471 bool needs_in_place_update_ts_ = false;
472
473 // True if the write batch contains at least one key from a column family
474 // that enables user-defined timestamp.
475 bool has_key_with_ts_ = false;
476
7c673cae
FG
477 // For HasXYZ. Mutable to allow lazy computation of results
478 mutable std::atomic<uint32_t> content_flags_;
479
480 // Performs deferred computation of content_flags if necessary
481 uint32_t ComputeContentFlags() const;
482
483 // Maximum size of rep_.
484 size_t max_bytes_;
485
1e59de90
TL
486 std::unique_ptr<ProtectionInfo> prot_info_;
487
488 size_t default_cf_ts_sz_ = 0;
11fdf7f2 489
7c673cae
FG
490 protected:
491 std::string rep_; // See comment in write_batch.cc for the format of rep_
7c673cae
FG
492};
493
f67539c2 494} // namespace ROCKSDB_NAMESPACE