1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 // WriteBatch holds a collection of updates to apply atomically to a DB.
11 // The updates are applied in the order in which they are added
12 // to the WriteBatch. For example, the value of "key" will be "v3"
13 // after the following batch is written:
15 // batch.Put("key", "v1");
16 // batch.Delete("key");
17 // batch.Put("key", "v2");
18 // batch.Put("key", "v3");
20 // Multiple threads can invoke const methods on a WriteBatch without
21 // external synchronization, but if any of the threads may call a
22 // non-const method, all threads accessing the same WriteBatch must use
23 // external synchronization.
35 #include "rocksdb/status.h"
36 #include "rocksdb/write_batch_base.h"
38 namespace ROCKSDB_NAMESPACE
{
41 class ColumnFamilyHandle
;
// Plain record of three counters describing a batch at one point in time.
// All members are public so the owning batch can read and write them
// directly.
struct SavePoint {
  size_t size;  // size of rep_
  int count;    // count of elements in rep_
  uint32_t content_flags;

  // Creates a cleared save point: every field is zero.
  SavePoint() : size(0), count(0), content_flags(0) {}

  // Creates a save point holding exactly the given values.
  SavePoint(size_t _size, int _count, uint32_t _flags)
      : size(_size), count(_count), content_flags(_flags) {}

  // Returns true only when all three fields are zero; OR-ing them together
  // yields zero exactly in that case.
  bool is_cleared() const { return (size | count | content_flags) == 0; }
};
64 class WriteBatch
: public WriteBatchBase
{
66 explicit WriteBatch(size_t reserved_bytes
= 0, size_t max_bytes
= 0)
67 : WriteBatch(reserved_bytes
, max_bytes
, 0, 0) {}
69 // `protection_bytes_per_key` is the number of bytes used to store
70 // protection information for each key entry. Currently supported values are
71 // zero (disabled) and eight.
72 explicit WriteBatch(size_t reserved_bytes
, size_t max_bytes
,
73 size_t protection_bytes_per_key
, size_t default_cf_ts_sz
);
74 ~WriteBatch() override
;
76 using WriteBatchBase::Put
;
77 // Store the mapping "key->value" in the database.
78 // The following Put(..., const Slice& key, ...) API can also be used when
79 // user-defined timestamp is enabled as long as `key` points to a contiguous
80 // buffer with timestamp appended after user key. The caller is responsible
81 // for setting up the memory buffer pointed to by `key`.
82 Status
Put(ColumnFamilyHandle
* column_family
, const Slice
& key
,
83 const Slice
& value
) override
;
84 Status
Put(const Slice
& key
, const Slice
& value
) override
{
85 return Put(nullptr, key
, value
);
87 Status
Put(ColumnFamilyHandle
* column_family
, const Slice
& key
,
88 const Slice
& ts
, const Slice
& value
) override
;
90 // Variant of Put() that gathers output like writev(2). The key and value
91 // that will be written to the database are concatenations of arrays of
93 // The following Put(..., const SliceParts& key, ...) API can be used when
94 // user-defined timestamp is enabled as long as the timestamp is the last
95 // Slice in `key`, a SliceParts (array of Slices). The caller is responsible
96 // for setting up the `key` SliceParts object.
97 Status
Put(ColumnFamilyHandle
* column_family
, const SliceParts
& key
,
98 const SliceParts
& value
) override
;
99 Status
Put(const SliceParts
& key
, const SliceParts
& value
) override
{
100 return Put(nullptr, key
, value
);
103 // Store the mapping "key->{column1:value1, column2:value2, ...}" in the
104 // column family specified by "column_family".
105 using WriteBatchBase::PutEntity
;
106 Status
PutEntity(ColumnFamilyHandle
* column_family
, const Slice
& key
,
107 const WideColumns
& columns
) override
;
109 using WriteBatchBase::Delete
;
110 // If the database contains a mapping for "key", erase it. Else do nothing.
111 // The following Delete(..., const Slice& key) can be used when user-defined
112 // timestamp is enabled as long as `key` points to a contiguous buffer with
113 // timestamp appended after user key. The caller is responsible for setting
114 // up the memory buffer pointed to by `key`.
115 Status
Delete(ColumnFamilyHandle
* column_family
, const Slice
& key
) override
;
116 Status
Delete(const Slice
& key
) override
{ return Delete(nullptr, key
); }
117 Status
Delete(ColumnFamilyHandle
* column_family
, const Slice
& key
,
118 const Slice
& ts
) override
;
120 // variant that takes SliceParts
121 // These two variants of Delete(..., const SliceParts& key) can be used when
122 // user-defined timestamp is enabled as long as the timestamp is the last
123 // Slice in `key`, a SliceParts (array of Slices). The caller is responsible
124 // for setting up the `key` SliceParts object.
125 Status
Delete(ColumnFamilyHandle
* column_family
,
126 const SliceParts
& key
) override
;
127 Status
Delete(const SliceParts
& key
) override
{ return Delete(nullptr, key
); }
129 using WriteBatchBase::SingleDelete
;
130 // WriteBatch implementation of DB::SingleDelete(). See db.h.
131 Status
SingleDelete(ColumnFamilyHandle
* column_family
,
132 const Slice
& key
) override
;
133 Status
SingleDelete(const Slice
& key
) override
{
134 return SingleDelete(nullptr, key
);
136 Status
SingleDelete(ColumnFamilyHandle
* column_family
, const Slice
& key
,
137 const Slice
& ts
) override
;
139 // variant that takes SliceParts
140 Status
SingleDelete(ColumnFamilyHandle
* column_family
,
141 const SliceParts
& key
) override
;
142 Status
SingleDelete(const SliceParts
& key
) override
{
143 return SingleDelete(nullptr, key
);
146 using WriteBatchBase::DeleteRange
;
147 // WriteBatch implementation of DB::DeleteRange(). See db.h.
148 Status
DeleteRange(ColumnFamilyHandle
* column_family
, const Slice
& begin_key
,
149 const Slice
& end_key
) override
;
150 Status
DeleteRange(const Slice
& begin_key
, const Slice
& end_key
) override
{
151 return DeleteRange(nullptr, begin_key
, end_key
);
153 // begin_key and end_key should be user keys without timestamp.
154 Status
DeleteRange(ColumnFamilyHandle
* column_family
, const Slice
& begin_key
,
155 const Slice
& end_key
, const Slice
& ts
) override
;
157 // variant that takes SliceParts
158 Status
DeleteRange(ColumnFamilyHandle
* column_family
,
159 const SliceParts
& begin_key
,
160 const SliceParts
& end_key
) override
;
161 Status
DeleteRange(const SliceParts
& begin_key
,
162 const SliceParts
& end_key
) override
{
163 return DeleteRange(nullptr, begin_key
, end_key
);
166 using WriteBatchBase::Merge
;
167 // Merge "value" with the existing value of "key" in the database.
168 // "key->merge(existing, value)"
169 Status
Merge(ColumnFamilyHandle
* column_family
, const Slice
& key
,
170 const Slice
& value
) override
;
171 Status
Merge(const Slice
& key
, const Slice
& value
) override
{
172 return Merge(nullptr, key
, value
);
174 Status
Merge(ColumnFamilyHandle
* /*column_family*/, const Slice
& /*key*/,
175 const Slice
& /*ts*/, const Slice
& /*value*/) override
;
177 // variant that takes SliceParts
178 Status
Merge(ColumnFamilyHandle
* column_family
, const SliceParts
& key
,
179 const SliceParts
& value
) override
;
180 Status
Merge(const SliceParts
& key
, const SliceParts
& value
) override
{
181 return Merge(nullptr, key
, value
);
184 using WriteBatchBase::PutLogData
;
185 // Append a blob of arbitrary size to the records in this batch. The blob will
186 // be stored in the transaction log but not in any other file. In particular,
187 // it will not be persisted to the SST files. When iterating over this
188 // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
189 // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
190 // encountered in the same order in which they were inserted. The blob will
191 // NOT consume sequence number(s) and will NOT increase the count of the batch
193 // Example application: add timestamps to the transaction log for use in
195 Status
PutLogData(const Slice
& blob
) override
;
197 using WriteBatchBase::Clear
;
198 // Clear all updates buffered in this batch.
199 void Clear() override
;
201 // Records the state of the batch for future calls to RollbackToSavePoint().
202 // May be called multiple times to set multiple save points.
203 void SetSavePoint() override
;
205 // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the
206 // most recent call to SetSavePoint() and removes the most recent save point.
207 // If there is no previous call to SetSavePoint(), Status::NotFound()
209 // Otherwise returns Status::OK().
210 Status
RollbackToSavePoint() override
;
212 // Pop the most recent save point.
213 // If there is no previous call to SetSavePoint(), Status::NotFound()
215 // Otherwise returns Status::OK().
216 Status
PopSavePoint() override
;
218 // Support for iterating over the contents of a batch.
219 // Objects of subclasses of Handler will be used by WriteBatch::Iterate().
223 // All handler functions in this class provide default implementations so
224 // we won't break existing clients of Handler on a source code level when
225 // adding a new member function.
227 // default implementation will just call Put without column family for
228 // backwards compatibility. If the column family is not default,
229 // the function is noop
230 // If user-defined timestamp is enabled, then `key` includes timestamp.
231 virtual Status
PutCF(uint32_t column_family_id
, const Slice
& key
,
232 const Slice
& value
) {
233 if (column_family_id
== 0) {
234 // Put() historically doesn't return status. We didn't want to be
235 // backwards incompatible so we didn't change the return status
236 // (this is a public API). We do an ordinary get and return Status::OK()
240 return Status::InvalidArgument(
241 "non-default column family and PutCF not implemented");
243 // If user-defined timestamp is enabled, then `key` includes timestamp.
244 virtual void Put(const Slice
& /*key*/, const Slice
& /*value*/) {}
246 // If user-defined timestamp is enabled, then `key` includes timestamp.
247 virtual Status
PutEntityCF(uint32_t /* column_family_id */,
248 const Slice
& /* key */,
249 const Slice
& /* entity */) {
250 return Status::NotSupported("PutEntityCF not implemented");
253 // If user-defined timestamp is enabled, then `key` includes timestamp.
254 virtual Status
DeleteCF(uint32_t column_family_id
, const Slice
& key
) {
255 if (column_family_id
== 0) {
259 return Status::InvalidArgument(
260 "non-default column family and DeleteCF not implemented");
262 // If user-defined timestamp is enabled, then `key` includes timestamp.
263 virtual void Delete(const Slice
& /*key*/) {}
265 // If user-defined timestamp is enabled, then `key` includes timestamp.
266 virtual Status
SingleDeleteCF(uint32_t column_family_id
, const Slice
& key
) {
267 if (column_family_id
== 0) {
271 return Status::InvalidArgument(
272 "non-default column family and SingleDeleteCF not implemented");
274 // If user-defined timestamp is enabled, then `key` includes timestamp.
275 virtual void SingleDelete(const Slice
& /*key*/) {}
277 // If user-defined timestamp is enabled, then `begin_key` and `end_key`
278 // both include timestamp.
279 virtual Status
DeleteRangeCF(uint32_t /*column_family_id*/,
280 const Slice
& /*begin_key*/,
281 const Slice
& /*end_key*/) {
282 return Status::InvalidArgument("DeleteRangeCF not implemented");
285 // If user-defined timestamp is enabled, then `key` includes timestamp.
286 virtual Status
MergeCF(uint32_t column_family_id
, const Slice
& key
,
287 const Slice
& value
) {
288 if (column_family_id
== 0) {
292 return Status::InvalidArgument(
293 "non-default column family and MergeCF not implemented");
295 // If user-defined timestamp is enabled, then `key` includes timestamp.
296 virtual void Merge(const Slice
& /*key*/, const Slice
& /*value*/) {}
298 // If user-defined timestamp is enabled, then `key` includes timestamp.
299 virtual Status
PutBlobIndexCF(uint32_t /*column_family_id*/,
300 const Slice
& /*key*/,
301 const Slice
& /*value*/) {
302 return Status::InvalidArgument("PutBlobIndexCF not implemented");
305 // The default implementation of LogData does nothing.
306 virtual void LogData(const Slice
& blob
);
308 virtual Status
MarkBeginPrepare(bool = false) {
309 return Status::InvalidArgument("MarkBeginPrepare() handler not defined.");
312 virtual Status
MarkEndPrepare(const Slice
& /*xid*/) {
313 return Status::InvalidArgument("MarkEndPrepare() handler not defined.");
316 virtual Status
MarkNoop(bool /*empty_batch*/) {
317 return Status::InvalidArgument("MarkNoop() handler not defined.");
320 virtual Status
MarkRollback(const Slice
& /*xid*/) {
321 return Status::InvalidArgument(
322 "MarkRollbackPrepare() handler not defined.");
325 virtual Status
MarkCommit(const Slice
& /*xid*/) {
326 return Status::InvalidArgument("MarkCommit() handler not defined.");
329 virtual Status
MarkCommitWithTimestamp(const Slice
& /*xid*/,
330 const Slice
& /*commit_ts*/) {
331 return Status::InvalidArgument(
332 "MarkCommitWithTimestamp() handler not defined.");
335 // Continue is called by WriteBatch::Iterate. If it returns false,
336 // iteration is halted. Otherwise, it continues iterating. The default
337 // implementation always returns true.
338 virtual bool Continue();
341 friend class WriteBatchInternal
;
342 enum class OptionState
{
347 virtual OptionState
WriteAfterCommit() const {
348 return OptionState::kUnknown
;
350 virtual OptionState
WriteBeforePrepare() const {
351 return OptionState::kUnknown
;
354 Status
Iterate(Handler
* handler
) const;
356 // Retrieve the serialized version of this batch.
357 const std::string
& Data() const { return rep_
; }
359 // Retrieve data size of the batch.
360 size_t GetDataSize() const { return rep_
.size(); }
362 // Returns the number of updates in the batch
363 uint32_t Count() const;
365 // Returns true if PutCF will be called during Iterate
368 // Returns true if PutEntityCF will be called during Iterate
369 bool HasPutEntity() const;
371 // Returns true if DeleteCF will be called during Iterate
372 bool HasDelete() const;
374 // Returns true if SingleDeleteCF will be called during Iterate
375 bool HasSingleDelete() const;
377 // Returns true if DeleteRangeCF will be called during Iterate
378 bool HasDeleteRange() const;
380 // Returns true if MergeCF will be called during Iterate
381 bool HasMerge() const;
383 // Returns true if MarkBeginPrepare will be called during Iterate
384 bool HasBeginPrepare() const;
386 // Returns true if MarkEndPrepare will be called during Iterate
387 bool HasEndPrepare() const;
389 // Returns true if MarkCommit will be called during Iterate
390 bool HasCommit() const;
392 // Returns true if MarkRollback will be called during Iterate
393 bool HasRollback() const;
397 // Update timestamps of existing entries in the write batch if
398 // applicable. If a key is intended for a column family that disables
399 // timestamp, then this API won't set the timestamp for this key.
400 // This requires that all keys, if enable timestamp, (possibly from multiple
401 // column families) in the write batch have timestamps of the same format.
403 // ts_sz_func: callable object to obtain the timestamp sizes of column
404 // families. If ts_sz_func() accesses data structures, then the caller of this
405 // API must guarantee thread-safety. Like other parts of RocksDB, this API is
406 // not exception-safe. Therefore, ts_sz_func() must not throw.
408 // in: cf, the column family id.
409 // ret: timestamp size of the given column family. Return
410 // std::numeric_limits<size_t>::max() indicating "don't know or column
411 // family info not found", this will cause UpdateTimestamps() to fail.
412 // size_t ts_sz_func(uint32_t cf);
413 Status
UpdateTimestamps(const Slice
& ts
,
414 std::function
<size_t(uint32_t /*cf*/)> ts_sz_func
);
416 // Verify the per-key-value checksums of this write batch.
417 // Corruption status will be returned if the verification fails.
418 // If this write batch does not have per-key-value checksum,
419 // OK status will be returned.
420 Status
VerifyChecksum() const;
422 using WriteBatchBase::GetWriteBatch
;
423 WriteBatch
* GetWriteBatch() override
{ return this; }
425 // Constructor with a serialized string object
426 explicit WriteBatch(const std::string
& rep
);
427 explicit WriteBatch(std::string
&& rep
);
429 WriteBatch(const WriteBatch
& src
);
430 WriteBatch(WriteBatch
&& src
) noexcept
;
431 WriteBatch
& operator=(const WriteBatch
& src
);
432 WriteBatch
& operator=(WriteBatch
&& src
);
434 // marks this point in the WriteBatch as the last record to
435 // be inserted into the WAL, provided the WAL is enabled
436 void MarkWalTerminationPoint();
437 const SavePoint
& GetWalTerminationPoint() const { return wal_term_point_
; }
439 void SetMaxBytes(size_t max_bytes
) override
{ max_bytes_
= max_bytes
; }
441 struct ProtectionInfo
;
442 size_t GetProtectionBytesPerKey() const;
445 friend class WriteBatchInternal
;
446 friend class LocalSavePoint
;
447 // TODO(myabandeh): this is needed for a hack to collapse the write batch and
448 // remove duplicate keys. Remove it when the hack is replaced with a proper
450 friend class WriteBatchWithIndex
;
451 std::unique_ptr
<SavePoints
> save_points_
;
453 // When sending a WriteBatch through WriteImpl we might want to
454 // specify that only the first x records of the batch be written to
456 SavePoint wal_term_point_
;
458 // Is the content of the batch the application's latest state that meant only
459 // to be used for recovery? Refer to
460 // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for
462 bool is_latest_persistent_state_
= false;
464 // False if all keys are from column families that disable user-defined
465 // timestamp OR UpdateTimestamps() has been called at least once.
466 // This flag will be set to true if any of the above Put(), Delete(),
467 // SingleDelete(), etc. APIs are called at least once.
468 // Calling Put(ts), Delete(ts), SingleDelete(ts), etc. will not set this flag
469 // to true because the assumption is that these APIs have already set the
470 // timestamps to desired values.
471 bool needs_in_place_update_ts_
= false;
473 // True if the write batch contains at least one key from a column family
474 // that enables user-defined timestamp.
475 bool has_key_with_ts_
= false;
477 // For HasXYZ. Mutable to allow lazy computation of results
478 mutable std::atomic
<uint32_t> content_flags_
;
480 // Performs deferred computation of content_flags if necessary
481 uint32_t ComputeContentFlags() const;
483 // Maximum size of rep_.
486 std::unique_ptr
<ProtectionInfo
> prot_info_
;
488 size_t default_cf_ts_sz_
= 0;
491 std::string rep_
; // See comment in write_batch.cc for the format of rep_
494 } // namespace ROCKSDB_NAMESPACE