1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
13 #include "rocksdb/comparator.h"
14 #include "rocksdb/db.h"
15 #include "rocksdb/status.h"
17 namespace ROCKSDB_NAMESPACE
{
// Forward declaration; only pointers to WriteBatchWithIndex appear in this
// header.
class WriteBatchWithIndex;

// Name attached to a transaction (see Transaction::SetName/GetName).
using TransactionName = std::string;

// Numeric identifier of a transaction (see Transaction::GetID).
using TransactionID = uint64_t;
27 // Provides notification to the caller of SetSnapshotOnNextOperation when
28 // the actual snapshot gets created
29 class TransactionNotifier
{
31 virtual ~TransactionNotifier() {}
33 // Implement this method to receive notification when a snapshot is
34 // requested via SetSnapshotOnNextOperation.
35 virtual void SnapshotCreated(const Snapshot
* newSnapshot
) = 0;
38 // Provides BEGIN/COMMIT/ROLLBACK transactions.
40 // To use transactions, you must first create either an OptimisticTransactionDB
41 // or a TransactionDB. See examples/[optimistic_]transaction_example.cc for
44 // To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
46 // It is up to the caller to synchronize access to this object.
48 // See examples/transaction_example.cc for some simple examples.
50 // TODO(agiardullo): Not yet implemented
51 // -PerfContext statistics
52 // -Support for using Transactions with DBWithTTL
56 Transaction(const Transaction
&) = delete;
57 void operator=(const Transaction
&) = delete;
59 virtual ~Transaction() {}
61 // If a transaction has a snapshot set, the transaction will ensure that
62 // any keys successfully written (or fetched via GetForUpdate()) have not
63 // been modified outside of this transaction since the time the snapshot was
65 // If a snapshot has not been set, the transaction guarantees that keys have
66 // not been modified since the time each key was first written (or fetched via
69 // Using SetSnapshot() will provide stricter isolation guarantees at the
70 // expense of potentially more transaction failures due to conflicts with
73 // Calling SetSnapshot() has no effect on keys written before this function
76 // SetSnapshot() may be called multiple times if you would like to change
77 // the snapshot used for different operations in this transaction.
79 // Calling SetSnapshot will not affect the version of Data returned by Get()
80 // methods. See Transaction::Get() for more details.
81 virtual void SetSnapshot() = 0;
83 // Similar to SetSnapshot(), but will not change the current snapshot
84 // until Put/Merge/Delete/GetForUpdate/MultiGetForUpdate is called.
85 // By calling this function, the transaction will essentially call
86 // SetSnapshot() for you right before performing the next write/GetForUpdate.
88 // Calling SetSnapshotOnNextOperation() will not affect what snapshot is
89 // returned by GetSnapshot() until the next write/GetForUpdate is executed.
91 // When the snapshot is created the notifier's SnapshotCreated method will
92 // be called so that the caller can get access to the snapshot.
94 // This is an optimization to reduce the likelihood of conflicts that
95 // could occur in between the time SetSnapshot() is called and the first
96 // write/GetForUpdate operation. Eg, this prevents the following
99 // txn1->SetSnapshot();
100 // txn2->Put("A", ...);
102 // txn1->GetForUpdate(opts, "A", ...); // FAIL!
103 virtual void SetSnapshotOnNextOperation(
104 std::shared_ptr
<TransactionNotifier
> notifier
= nullptr) = 0;
106 // Returns the Snapshot created by the last call to SetSnapshot().
108 // REQUIRED: The returned Snapshot is only valid up until the next time
109 // SetSnapshot()/SetSnapshotOnNextOperation() is called, ClearSnapshot()
110 // is called, or the Transaction is deleted.
111 virtual const Snapshot
* GetSnapshot() const = 0;
113 // Clears the current snapshot (i.e. no snapshot will be 'set')
115 // This removes any snapshot that currently exists or is set to be created
116 // on the next update operation (SetSnapshotOnNextOperation).
118 // Calling ClearSnapshot() has no effect on keys written before this function
121 // If a reference to a snapshot was retrieved via GetSnapshot(), it will no
122 // longer be valid and should be discarded after a call to ClearSnapshot().
123 virtual void ClearSnapshot() = 0;
125 // Prepare the current transaction for 2PC
126 virtual Status
Prepare() = 0;
128 // Write all batched keys to the db atomically.
130 // Returns OK on success.
132 // May return any error status that could be returned by DB::Write().
134 // If this transaction was created by an OptimisticTransactionDB(),
135 // Status::Busy() may be returned if the transaction could not guarantee
136 // that there are no write conflicts. Status::TryAgain() may be returned
137 // if the memtable history size is not large enough
138 // (See max_write_buffer_size_to_maintain).
140 // If this transaction was created by a TransactionDB(), Status::Expired()
141 // may be returned if this transaction has lived for longer than
142 // TransactionOptions.expiration.
143 virtual Status
Commit() = 0;
145 // Discard all batched writes in this transaction.
146 virtual Status
Rollback() = 0;
148 // Records the state of the transaction for future calls to
149 // RollbackToSavePoint(). May be called multiple times to set multiple save
151 virtual void SetSavePoint() = 0;
153 // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
154 // since the most recent call to SetSavePoint() and removes the most recent
156 // If there is no previous call to SetSavePoint(), returns Status::NotFound()
157 virtual Status
RollbackToSavePoint() = 0;
159 // Pop the most recent save point.
160 // If there is no previous call to SetSavePoint(), Status::NotFound()
162 // Otherwise returns Status::OK().
163 virtual Status
PopSavePoint() = 0;
165 // This function is similar to DB::Get() except it will also read pending
166 // changes in this transaction. Currently, this function will return
167 // Status::MergeInProgress if the most recent write to the queried key in
168 // this batch is a Merge.
170 // If read_options.snapshot is not set, the current version of the key will
171 // be read. Calling SetSnapshot() does not affect the version of the data
174 // Note that setting read_options.snapshot will affect what is read from the
175 // DB but will NOT change which keys are read from this transaction (the keys
176 // in this transaction do not yet belong to any snapshot and will be fetched
178 virtual Status
Get(const ReadOptions
& options
,
179 ColumnFamilyHandle
* column_family
, const Slice
& key
,
180 std::string
* value
) = 0;
182 // An overload of the above method that receives a PinnableSlice
183 // For backward compatibility a default implementation is provided
184 virtual Status
Get(const ReadOptions
& options
,
185 ColumnFamilyHandle
* column_family
, const Slice
& key
,
186 PinnableSlice
* pinnable_val
) {
187 assert(pinnable_val
!= nullptr);
188 auto s
= Get(options
, column_family
, key
, pinnable_val
->GetSelf());
189 pinnable_val
->PinSelf();
193 virtual Status
Get(const ReadOptions
& options
, const Slice
& key
,
194 std::string
* value
) = 0;
195 virtual Status
Get(const ReadOptions
& options
, const Slice
& key
,
196 PinnableSlice
* pinnable_val
) {
197 assert(pinnable_val
!= nullptr);
198 auto s
= Get(options
, key
, pinnable_val
->GetSelf());
199 pinnable_val
->PinSelf();
203 virtual std::vector
<Status
> MultiGet(
204 const ReadOptions
& options
,
205 const std::vector
<ColumnFamilyHandle
*>& column_family
,
206 const std::vector
<Slice
>& keys
, std::vector
<std::string
>* values
) = 0;
208 virtual std::vector
<Status
> MultiGet(const ReadOptions
& options
,
209 const std::vector
<Slice
>& keys
,
210 std::vector
<std::string
>* values
) = 0;
212 // Batched version of MultiGet - see DBImpl::MultiGet(). Sub-classes are
213 // expected to override this with an implementation that calls
214 // DBImpl::MultiGet()
215 virtual void MultiGet(const ReadOptions
& options
,
216 ColumnFamilyHandle
* column_family
,
217 const size_t num_keys
, const Slice
* keys
,
218 PinnableSlice
* values
, Status
* statuses
,
219 const bool /*sorted_input*/ = false) {
220 for (size_t i
= 0; i
< num_keys
; ++i
) {
221 statuses
[i
] = Get(options
, column_family
, keys
[i
], &values
[i
]);
225 // Read this key and ensure that this transaction will only
226 // be able to be committed if this key is not written outside this
227 // transaction after it has first been read (or after the snapshot if a
228 // snapshot is set in this transaction and do_validate is true). If
229 // do_validate is false, ReadOptions::snapshot is expected to be nullptr so
230 // that GetForUpdate returns the latest committed value. The transaction
231 // behavior is the same regardless of whether the key exists or not.
233 // Note: Currently, this function will return Status::MergeInProgress
234 // if the most recent write to the queried key in this batch is a Merge.
236 // The values returned by this function are similar to Transaction::Get().
237 // If value==nullptr, then this function will not read any data, but will
238 // still ensure that this key cannot be written to by outside of this
241 // If this transaction was created by an OptimisticTransactionDB, GetForUpdate()
242 // could cause Commit() to fail. Otherwise, it could return any error
243 // that could be returned by DB::Get().
245 // If this transaction was created by a TransactionDB, it can return
246 // Status::OK() on success,
247 // Status::Busy() if there is a write conflict,
248 // Status::TimedOut() if a lock could not be acquired,
249 // Status::TryAgain() if the memtable history size is not large enough
250 // (See max_write_buffer_size_to_maintain)
251 // Status::MergeInProgress() if merge operations cannot be resolved,
252 // or other errors if this key could not be read.
253 virtual Status
GetForUpdate(const ReadOptions
& options
,
254 ColumnFamilyHandle
* column_family
,
255 const Slice
& key
, std::string
* value
,
256 bool exclusive
= true,
257 const bool do_validate
= true) = 0;
259 // An overload of the above method that receives a PinnableSlice
260 // For backward compatibility a default implementation is provided
261 virtual Status
GetForUpdate(const ReadOptions
& options
,
262 ColumnFamilyHandle
* column_family
,
263 const Slice
& key
, PinnableSlice
* pinnable_val
,
264 bool exclusive
= true,
265 const bool do_validate
= true) {
266 if (pinnable_val
== nullptr) {
267 std::string
* null_str
= nullptr;
268 return GetForUpdate(options
, column_family
, key
, null_str
, exclusive
,
271 auto s
= GetForUpdate(options
, column_family
, key
,
272 pinnable_val
->GetSelf(), exclusive
, do_validate
);
273 pinnable_val
->PinSelf();
278 virtual Status
GetForUpdate(const ReadOptions
& options
, const Slice
& key
,
279 std::string
* value
, bool exclusive
= true,
280 const bool do_validate
= true) = 0;
282 virtual std::vector
<Status
> MultiGetForUpdate(
283 const ReadOptions
& options
,
284 const std::vector
<ColumnFamilyHandle
*>& column_family
,
285 const std::vector
<Slice
>& keys
, std::vector
<std::string
>* values
) = 0;
287 virtual std::vector
<Status
> MultiGetForUpdate(
288 const ReadOptions
& options
, const std::vector
<Slice
>& keys
,
289 std::vector
<std::string
>* values
) = 0;
291 // Returns an iterator that will iterate on all keys in the default
292 // column family including both keys in the DB and uncommitted keys in this
295 // Setting read_options.snapshot will affect what is read from the
296 // DB but will NOT change which keys are read from this transaction (the keys
297 // in this transaction do not yet belong to any snapshot and will be fetched
300 // Caller is responsible for deleting the returned Iterator.
302 // The returned iterator is only valid until Commit(), Rollback(), or
303 // RollbackToSavePoint() is called.
304 virtual Iterator
* GetIterator(const ReadOptions
& read_options
) = 0;
306 virtual Iterator
* GetIterator(const ReadOptions
& read_options
,
307 ColumnFamilyHandle
* column_family
) = 0;
309 // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
310 // functions in WriteBatch, but will also do conflict checking on the
311 // keys being written.
313 // assume_tracked=true expects the key be already tracked. More
314 // specifically, it means the key was previously tracked in the same
315 // savepoint, with the same exclusive flag, and at a lower sequence number.
316 // If valid then it skips ValidateSnapshot. Returns error otherwise.
318 // If this Transaction was created on an OptimisticTransactionDB, these
319 // functions should always return Status::OK().
321 // If this Transaction was created on a TransactionDB, the status returned
323 // Status::OK() on success,
324 // Status::Busy() if there is a write conflict,
325 // Status::TimedOut() if a lock could not be acquired,
326 // Status::TryAgain() if the memtable history size is not large enough
327 // (See max_write_buffer_size_to_maintain)
328 // or other errors on unexpected failures.
329 virtual Status
Put(ColumnFamilyHandle
* column_family
, const Slice
& key
,
330 const Slice
& value
, const bool assume_tracked
= false) = 0;
331 virtual Status
Put(const Slice
& key
, const Slice
& value
) = 0;
332 virtual Status
Put(ColumnFamilyHandle
* column_family
, const SliceParts
& key
,
333 const SliceParts
& value
,
334 const bool assume_tracked
= false) = 0;
335 virtual Status
Put(const SliceParts
& key
, const SliceParts
& value
) = 0;
337 virtual Status
Merge(ColumnFamilyHandle
* column_family
, const Slice
& key
,
339 const bool assume_tracked
= false) = 0;
340 virtual Status
Merge(const Slice
& key
, const Slice
& value
) = 0;
342 virtual Status
Delete(ColumnFamilyHandle
* column_family
, const Slice
& key
,
343 const bool assume_tracked
= false) = 0;
344 virtual Status
Delete(const Slice
& key
) = 0;
345 virtual Status
Delete(ColumnFamilyHandle
* column_family
,
346 const SliceParts
& key
,
347 const bool assume_tracked
= false) = 0;
348 virtual Status
Delete(const SliceParts
& key
) = 0;
350 virtual Status
SingleDelete(ColumnFamilyHandle
* column_family
,
352 const bool assume_tracked
= false) = 0;
353 virtual Status
SingleDelete(const Slice
& key
) = 0;
354 virtual Status
SingleDelete(ColumnFamilyHandle
* column_family
,
355 const SliceParts
& key
,
356 const bool assume_tracked
= false) = 0;
357 virtual Status
SingleDelete(const SliceParts
& key
) = 0;
359 // PutUntracked() will write a Put to the batch of operations to be committed
360 // in this transaction. This write will only happen if this transaction
361 // gets committed successfully. But unlike Transaction::Put(),
362 // no conflict checking will be done for this key.
364 // If this Transaction was created on a PessimisticTransactionDB, this
365 // function will still acquire locks necessary to make sure this write doesn't
366 // cause conflicts in other transactions and may return Status::Busy().
367 virtual Status
PutUntracked(ColumnFamilyHandle
* column_family
,
368 const Slice
& key
, const Slice
& value
) = 0;
369 virtual Status
PutUntracked(const Slice
& key
, const Slice
& value
) = 0;
370 virtual Status
PutUntracked(ColumnFamilyHandle
* column_family
,
371 const SliceParts
& key
,
372 const SliceParts
& value
) = 0;
373 virtual Status
PutUntracked(const SliceParts
& key
,
374 const SliceParts
& value
) = 0;
376 virtual Status
MergeUntracked(ColumnFamilyHandle
* column_family
,
377 const Slice
& key
, const Slice
& value
) = 0;
378 virtual Status
MergeUntracked(const Slice
& key
, const Slice
& value
) = 0;
380 virtual Status
DeleteUntracked(ColumnFamilyHandle
* column_family
,
381 const Slice
& key
) = 0;
383 virtual Status
DeleteUntracked(const Slice
& key
) = 0;
384 virtual Status
DeleteUntracked(ColumnFamilyHandle
* column_family
,
385 const SliceParts
& key
) = 0;
386 virtual Status
DeleteUntracked(const SliceParts
& key
) = 0;
387 virtual Status
SingleDeleteUntracked(ColumnFamilyHandle
* column_family
,
388 const Slice
& key
) = 0;
390 virtual Status
SingleDeleteUntracked(const Slice
& key
) = 0;
392 // Similar to WriteBatch::PutLogData
393 virtual void PutLogData(const Slice
& blob
) = 0;
395 // By default, all Put/Merge/Delete operations will be indexed in the
396 // transaction so that Get/GetForUpdate/GetIterator can search for these
399 // If the caller does not want to fetch the keys about to be written,
400 // they may want to avoid indexing as a performance optimization.
401 // Calling DisableIndexing() will turn off indexing for all future
402 // Put/Merge/Delete operations until EnableIndexing() is called.
404 // If a key is Put/Merge/Deleted after DisableIndexing is called and then
405 // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
407 virtual void DisableIndexing() = 0;
408 virtual void EnableIndexing() = 0;
410 // Returns the number of distinct Keys being tracked by this transaction.
411 // If this transaction was created by a TransactionDB, this is the number of
412 // keys that are currently locked by this transaction.
413 // If this transaction was created by an OptimisticTransactionDB, this is the
414 // number of keys that need to be checked for conflicts at commit time.
415 virtual uint64_t GetNumKeys() const = 0;
417 // Returns the number of Puts/Deletes/Merges that have been applied to this
418 // transaction so far.
419 virtual uint64_t GetNumPuts() const = 0;
420 virtual uint64_t GetNumDeletes() const = 0;
421 virtual uint64_t GetNumMerges() const = 0;
423 // Returns the elapsed time in milliseconds since this Transaction began.
424 virtual uint64_t GetElapsedTime() const = 0;
426 // Fetch the underlying write batch that contains all pending changes to be
429 // Note: You should not write or delete anything from the batch directly and
430 // should only use the functions in the Transaction class to
431 // write to this transaction.
432 virtual WriteBatchWithIndex
* GetWriteBatch() = 0;
434 // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
436 // Has no effect on OptimisticTransactions.
437 virtual void SetLockTimeout(int64_t timeout
) = 0;
439 // Return the WriteOptions that will be used during Commit()
440 virtual WriteOptions
* GetWriteOptions() = 0;
442 // Reset the WriteOptions that will be used during Commit().
443 virtual void SetWriteOptions(const WriteOptions
& write_options
) = 0;
445 // If this key was previously fetched in this transaction using
446 // GetForUpdate/MultiGetForUpdate(), calling UndoGetForUpdate will tell
447 // the transaction that it no longer needs to do any conflict checking
450 // If a key has been fetched N times via GetForUpdate/MultiGetForUpdate(),
451 // then UndoGetForUpdate will only have an effect if it is also called N
452 // times. If this key has been written to in this transaction,
453 // UndoGetForUpdate() will have no effect.
455 // If SetSavePoint() has been called after the GetForUpdate(),
456 // UndoGetForUpdate() will not have any effect.
458 // If this Transaction was created by an OptimisticTransactionDB,
459 // calling UndoGetForUpdate can affect whether this key is conflict checked
461 // If this Transaction was created by a TransactionDB,
462 // calling UndoGetForUpdate may release any held locks for this key.
463 virtual void UndoGetForUpdate(ColumnFamilyHandle
* column_family
,
464 const Slice
& key
) = 0;
465 virtual void UndoGetForUpdate(const Slice
& key
) = 0;
467 virtual Status
RebuildFromWriteBatch(WriteBatch
* src_batch
) = 0;
469 virtual WriteBatch
* GetCommitTimeWriteBatch() = 0;
471 virtual void SetLogNumber(uint64_t log
) { log_number_
= log
; }
473 virtual uint64_t GetLogNumber() const { return log_number_
; }
475 virtual Status
SetName(const TransactionName
& name
) = 0;
477 virtual TransactionName
GetName() const { return name_
; }
479 virtual TransactionID
GetID() const { return 0; }
481 virtual bool IsDeadlockDetect() const { return false; }
483 virtual std::vector
<TransactionID
> GetWaitingTxns(
484 uint32_t* /*column_family_id*/, std::string
* /*key*/) const {
486 return std::vector
<TransactionID
>();
489 enum TransactionState
{
491 AWAITING_PREPARE
= 1,
495 AWAITING_ROLLBACK
= 5,
500 TransactionState
GetState() const { return txn_state_
; }
501 void SetState(TransactionState state
) { txn_state_
= state
; }
503 // NOTE: Experimental feature
504 // The globally unique id with which the transaction is identified. This id
505 // might or might not be set depending on the implementation. Similarly the
506 // implementation decides the point in lifetime of a transaction at which it
507 // assigns the id. Although currently it is the case, the id is not guaranteed
508 // to remain the same across restarts.
509 uint64_t GetId() { return id_
; }
512 explicit Transaction(const TransactionDB
* /*db*/) {}
513 Transaction() : log_number_(0), txn_state_(STARTED
) {}
515 // the log in which the prepared section for this txn resides
516 // (for two phase commit)
517 uint64_t log_number_
;
518 TransactionName name_
;
520 // Execution status of the transaction.
521 std::atomic
<TransactionState
> txn_state_
;
524 virtual void SetId(uint64_t id
) {
529 virtual uint64_t GetLastLogNumber() const { return log_number_
; }
532 friend class PessimisticTransactionDB
;
533 friend class WriteUnpreparedTxnDB
;
534 friend class TransactionTest_TwoPhaseLogRollingTest_Test
;
535 friend class TransactionTest_TwoPhaseLogRollingTest2_Test
;
538 } // namespace ROCKSDB_NAMESPACE
540 #endif // ROCKSDB_LITE