]>
git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/include/rocksdb/utilities/transaction.h
86627d4f4580dacd31191deebfc15f30b34df7e2
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
13 #include "rocksdb/comparator.h"
14 #include "rocksdb/db.h"
15 #include "rocksdb/status.h"
// Forward declaration; only pointers to this type appear in this header.
class WriteBatchWithIndex;

// Name assigned to a transaction via Transaction::SetName().
using TransactionName = std::string;

// Numeric transaction identifier, as returned by Transaction::GetID().
using TransactionID = uint64_t;
27 // Provides notification to the caller of SetSnapshotOnNextOperation when
28 // the actual snapshot gets created
29 class TransactionNotifier
{
31 virtual ~TransactionNotifier() {}
33 // Implement this method to receive notification when a snapshot is
34 // requested via SetSnapshotOnNextOperation.
35 virtual void SnapshotCreated(const Snapshot
* newSnapshot
) = 0;
// Provides BEGIN/COMMIT/ROLLBACK transactions.
//
// To use transactions, you must first create either an OptimisticTransactionDB
// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for
// more information.
//
// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
//
// It is up to the caller to synchronize access to this object.
//
// See examples/transaction_example.cc for some simple examples.
//
// TODO(agiardullo): Not yet implemented
//  -PerfContext statistics
//  -Support for using Transactions with DBWithTTL
55 virtual ~Transaction() {}
57 // If a transaction has a snapshot set, the transaction will ensure that
58 // any keys successfully written(or fetched via GetForUpdate()) have not
59 // been modified outside of this transaction since the time the snapshot was
61 // If a snapshot has not been set, the transaction guarantees that keys have
62 // not been modified since the time each key was first written (or fetched via
65 // Using SetSnapshot() will provide stricter isolation guarantees at the
66 // expense of potentially more transaction failures due to conflicts with
69 // Calling SetSnapshot() has no effect on keys written before this function
72 // SetSnapshot() may be called multiple times if you would like to change
73 // the snapshot used for different operations in this transaction.
75 // Calling SetSnapshot will not affect the version of Data returned by Get()
76 // methods. See Transaction::Get() for more details.
77 virtual void SetSnapshot() = 0;
79 // Similar to SetSnapshot(), but will not change the current snapshot
80 // until Put/Merge/Delete/GetForUpdate/MultigetForUpdate is called.
81 // By calling this function, the transaction will essentially call
82 // SetSnapshot() for you right before performing the next write/GetForUpdate.
84 // Calling SetSnapshotOnNextOperation() will not affect what snapshot is
85 // returned by GetSnapshot() until the next write/GetForUpdate is executed.
87 // When the snapshot is created the notifier's SnapshotCreated method will
88 // be called so that the caller can get access to the snapshot.
90 // This is an optimization to reduce the likelihood of conflicts that
91 // could occur in between the time SetSnapshot() is called and the first
92 // write/GetForUpdate operation. Eg, this prevents the following
95 // txn1->SetSnapshot();
96 // txn2->Put("A", ...);
98 // txn1->GetForUpdate(opts, "A", ...); // FAIL!
99 virtual void SetSnapshotOnNextOperation(
100 std::shared_ptr
<TransactionNotifier
> notifier
= nullptr) = 0;
102 // Returns the Snapshot created by the last call to SetSnapshot().
104 // REQUIRED: The returned Snapshot is only valid up until the next time
105 // SetSnapshot()/SetSnapshotOnNextSavePoint() is called, ClearSnapshot()
106 // is called, or the Transaction is deleted.
107 virtual const Snapshot
* GetSnapshot() const = 0;
109 // Clears the current snapshot (i.e. no snapshot will be 'set')
111 // This removes any snapshot that currently exists or is set to be created
112 // on the next update operation (SetSnapshotOnNextOperation).
114 // Calling ClearSnapshot() has no effect on keys written before this function
117 // If a reference to a snapshot was retrieved via GetSnapshot(), it will no
118 // longer be valid and should be discarded after a call to ClearSnapshot().
119 virtual void ClearSnapshot() = 0;
121 // Prepare the current transaction for 2PC
122 virtual Status
Prepare() = 0;
124 // Write all batched keys to the db atomically.
126 // Returns OK on success.
128 // May return any error status that could be returned by DB:Write().
130 // If this transaction was created by an OptimisticTransactionDB(),
131 // Status::Busy() may be returned if the transaction could not guarantee
132 // that there are no write conflicts. Status::TryAgain() may be returned
133 // if the memtable history size is not large enough
134 // (See max_write_buffer_number_to_maintain).
136 // If this transaction was created by a TransactionDB(), Status::Expired()
137 // may be returned if this transaction has lived for longer than
138 // TransactionOptions.expiration.
139 virtual Status
Commit() = 0;
141 // Discard all batched writes in this transaction.
142 virtual Status
Rollback() = 0;
144 // Records the state of the transaction for future calls to
145 // RollbackToSavePoint(). May be called multiple times to set multiple save
147 virtual void SetSavePoint() = 0;
149 // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
150 // since the most recent call to SetSavePoint() and removes the most recent
152 // If there is no previous call to SetSavePoint(), returns Status::NotFound()
153 virtual Status
RollbackToSavePoint() = 0;
155 // Pop the most recent save point.
156 // If there is no previous call to SetSavePoint(), Status::NotFound()
158 // Otherwise returns Status::OK().
159 virtual Status
PopSavePoint() = 0;
161 // This function is similar to DB::Get() except it will also read pending
162 // changes in this transaction. Currently, this function will return
163 // Status::MergeInProgress if the most recent write to the queried key in
164 // this batch is a Merge.
166 // If read_options.snapshot is not set, the current version of the key will
167 // be read. Calling SetSnapshot() does not affect the version of the data
170 // Note that setting read_options.snapshot will affect what is read from the
171 // DB but will NOT change which keys are read from this transaction (the keys
172 // in this transaction do not yet belong to any snapshot and will be fetched
174 virtual Status
Get(const ReadOptions
& options
,
175 ColumnFamilyHandle
* column_family
, const Slice
& key
,
176 std::string
* value
) = 0;
178 // An overload of the above method that receives a PinnableSlice
179 // For backward compatibility a default implementation is provided
180 virtual Status
Get(const ReadOptions
& options
,
181 ColumnFamilyHandle
* column_family
, const Slice
& key
,
182 PinnableSlice
* pinnable_val
) {
183 assert(pinnable_val
!= nullptr);
184 auto s
= Get(options
, column_family
, key
, pinnable_val
->GetSelf());
185 pinnable_val
->PinSelf();
189 virtual Status
Get(const ReadOptions
& options
, const Slice
& key
,
190 std::string
* value
) = 0;
191 virtual Status
Get(const ReadOptions
& options
, const Slice
& key
,
192 PinnableSlice
* pinnable_val
) {
193 assert(pinnable_val
!= nullptr);
194 auto s
= Get(options
, key
, pinnable_val
->GetSelf());
195 pinnable_val
->PinSelf();
199 virtual std::vector
<Status
> MultiGet(
200 const ReadOptions
& options
,
201 const std::vector
<ColumnFamilyHandle
*>& column_family
,
202 const std::vector
<Slice
>& keys
, std::vector
<std::string
>* values
) = 0;
204 virtual std::vector
<Status
> MultiGet(const ReadOptions
& options
,
205 const std::vector
<Slice
>& keys
,
206 std::vector
<std::string
>* values
) = 0;
208 // Read this key and ensure that this transaction will only
209 // be able to be committed if this key is not written outside this
210 // transaction after it has first been read (or after the snapshot if a
211 // snapshot is set in this transaction). The transaction behavior is the
212 // same regardless of whether the key exists or not.
214 // Note: Currently, this function will return Status::MergeInProgress
215 // if the most recent write to the queried key in this batch is a Merge.
217 // The values returned by this function are similar to Transaction::Get().
218 // If value==nullptr, then this function will not read any data, but will
219 // still ensure that this key cannot be written to by outside of this
222 // If this transaction was created by an OptimisticTransaction, GetForUpdate()
223 // could cause commit() to fail. Otherwise, it could return any error
224 // that could be returned by DB::Get().
226 // If this transaction was created by a TransactionDB, it can return
227 // Status::OK() on success,
228 // Status::Busy() if there is a write conflict,
229 // Status::TimedOut() if a lock could not be acquired,
230 // Status::TryAgain() if the memtable history size is not large enough
231 // (See max_write_buffer_number_to_maintain)
232 // Status::MergeInProgress() if merge operations cannot be resolved.
233 // or other errors if this key could not be read.
234 virtual Status
GetForUpdate(const ReadOptions
& options
,
235 ColumnFamilyHandle
* column_family
,
236 const Slice
& key
, std::string
* value
,
237 bool exclusive
= true) = 0;
239 // An overload of the above method that receives a PinnableSlice
240 // For backward compatibility a default implementation is provided
241 virtual Status
GetForUpdate(const ReadOptions
& options
,
242 ColumnFamilyHandle
* /*column_family*/,
243 const Slice
& key
, PinnableSlice
* pinnable_val
,
244 bool /*exclusive*/ = true) {
245 if (pinnable_val
== nullptr) {
246 std::string
* null_str
= nullptr;
247 return GetForUpdate(options
, key
, null_str
);
249 auto s
= GetForUpdate(options
, key
, pinnable_val
->GetSelf());
250 pinnable_val
->PinSelf();
255 virtual Status
GetForUpdate(const ReadOptions
& options
, const Slice
& key
,
256 std::string
* value
, bool exclusive
= true) = 0;
258 virtual std::vector
<Status
> MultiGetForUpdate(
259 const ReadOptions
& options
,
260 const std::vector
<ColumnFamilyHandle
*>& column_family
,
261 const std::vector
<Slice
>& keys
, std::vector
<std::string
>* values
) = 0;
263 virtual std::vector
<Status
> MultiGetForUpdate(
264 const ReadOptions
& options
, const std::vector
<Slice
>& keys
,
265 std::vector
<std::string
>* values
) = 0;
267 // Returns an iterator that will iterate on all keys in the default
268 // column family including both keys in the DB and uncommitted keys in this
271 // Setting read_options.snapshot will affect what is read from the
272 // DB but will NOT change which keys are read from this transaction (the keys
273 // in this transaction do not yet belong to any snapshot and will be fetched
276 // Caller is responsible for deleting the returned Iterator.
278 // The returned iterator is only valid until Commit(), Rollback(), or
279 // RollbackToSavePoint() is called.
280 virtual Iterator
* GetIterator(const ReadOptions
& read_options
) = 0;
282 virtual Iterator
* GetIterator(const ReadOptions
& read_options
,
283 ColumnFamilyHandle
* column_family
) = 0;
285 // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
286 // functions in WriteBatch, but will also do conflict checking on the
287 // keys being written.
289 // If this Transaction was created on an OptimisticTransactionDB, these
290 // functions should always return Status::OK().
292 // If this Transaction was created on a TransactionDB, the status returned
294 // Status::OK() on success,
295 // Status::Busy() if there is a write conflict,
296 // Status::TimedOut() if a lock could not be acquired,
297 // Status::TryAgain() if the memtable history size is not large enough
298 // (See max_write_buffer_number_to_maintain)
299 // or other errors on unexpected failures.
300 virtual Status
Put(ColumnFamilyHandle
* column_family
, const Slice
& key
,
301 const Slice
& value
) = 0;
302 virtual Status
Put(const Slice
& key
, const Slice
& value
) = 0;
303 virtual Status
Put(ColumnFamilyHandle
* column_family
, const SliceParts
& key
,
304 const SliceParts
& value
) = 0;
305 virtual Status
Put(const SliceParts
& key
, const SliceParts
& value
) = 0;
307 virtual Status
Merge(ColumnFamilyHandle
* column_family
, const Slice
& key
,
308 const Slice
& value
) = 0;
309 virtual Status
Merge(const Slice
& key
, const Slice
& value
) = 0;
311 virtual Status
Delete(ColumnFamilyHandle
* column_family
,
312 const Slice
& key
) = 0;
313 virtual Status
Delete(const Slice
& key
) = 0;
314 virtual Status
Delete(ColumnFamilyHandle
* column_family
,
315 const SliceParts
& key
) = 0;
316 virtual Status
Delete(const SliceParts
& key
) = 0;
318 virtual Status
SingleDelete(ColumnFamilyHandle
* column_family
,
319 const Slice
& key
) = 0;
320 virtual Status
SingleDelete(const Slice
& key
) = 0;
321 virtual Status
SingleDelete(ColumnFamilyHandle
* column_family
,
322 const SliceParts
& key
) = 0;
323 virtual Status
SingleDelete(const SliceParts
& key
) = 0;
325 // PutUntracked() will write a Put to the batch of operations to be committed
326 // in this transaction. This write will only happen if this transaction
327 // gets committed successfully. But unlike Transaction::Put(),
328 // no conflict checking will be done for this key.
330 // If this Transaction was created on a PessimisticTransactionDB, this
331 // function will still acquire locks necessary to make sure this write doesn't
332 // cause conflicts in other transactions and may return Status::Busy().
333 virtual Status
PutUntracked(ColumnFamilyHandle
* column_family
,
334 const Slice
& key
, const Slice
& value
) = 0;
335 virtual Status
PutUntracked(const Slice
& key
, const Slice
& value
) = 0;
336 virtual Status
PutUntracked(ColumnFamilyHandle
* column_family
,
337 const SliceParts
& key
,
338 const SliceParts
& value
) = 0;
339 virtual Status
PutUntracked(const SliceParts
& key
,
340 const SliceParts
& value
) = 0;
342 virtual Status
MergeUntracked(ColumnFamilyHandle
* column_family
,
343 const Slice
& key
, const Slice
& value
) = 0;
344 virtual Status
MergeUntracked(const Slice
& key
, const Slice
& value
) = 0;
346 virtual Status
DeleteUntracked(ColumnFamilyHandle
* column_family
,
347 const Slice
& key
) = 0;
349 virtual Status
DeleteUntracked(const Slice
& key
) = 0;
350 virtual Status
DeleteUntracked(ColumnFamilyHandle
* column_family
,
351 const SliceParts
& key
) = 0;
352 virtual Status
DeleteUntracked(const SliceParts
& key
) = 0;
353 virtual Status
SingleDeleteUntracked(ColumnFamilyHandle
* column_family
,
354 const Slice
& key
) = 0;
356 virtual Status
SingleDeleteUntracked(const Slice
& key
) = 0;
358 // Similar to WriteBatch::PutLogData
359 virtual void PutLogData(const Slice
& blob
) = 0;
361 // By default, all Put/Merge/Delete operations will be indexed in the
362 // transaction so that Get/GetForUpdate/GetIterator can search for these
365 // If the caller does not want to fetch the keys about to be written,
366 // they may want to avoid indexing as a performance optimization.
367 // Calling DisableIndexing() will turn off indexing for all future
368 // Put/Merge/Delete operations until EnableIndexing() is called.
370 // If a key is Put/Merge/Deleted after DisableIndexing is called and then
371 // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
373 virtual void DisableIndexing() = 0;
374 virtual void EnableIndexing() = 0;
376 // Returns the number of distinct Keys being tracked by this transaction.
377 // If this transaction was created by a TransactionDB, this is the number of
378 // keys that are currently locked by this transaction.
379 // If this transaction was created by an OptimisticTransactionDB, this is the
380 // number of keys that need to be checked for conflicts at commit time.
381 virtual uint64_t GetNumKeys() const = 0;
383 // Returns the number of Puts/Deletes/Merges that have been applied to this
384 // transaction so far.
385 virtual uint64_t GetNumPuts() const = 0;
386 virtual uint64_t GetNumDeletes() const = 0;
387 virtual uint64_t GetNumMerges() const = 0;
389 // Returns the elapsed time in milliseconds since this Transaction began.
390 virtual uint64_t GetElapsedTime() const = 0;
392 // Fetch the underlying write batch that contains all pending changes to be
395 // Note: You should not write or delete anything from the batch directly and
396 // should only use the functions in the Transaction class to
397 // write to this transaction.
398 virtual WriteBatchWithIndex
* GetWriteBatch() = 0;
400 // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
402 // Has no effect on OptimisticTransactions.
403 virtual void SetLockTimeout(int64_t timeout
) = 0;
405 // Return the WriteOptions that will be used during Commit()
406 virtual WriteOptions
* GetWriteOptions() = 0;
408 // Reset the WriteOptions that will be used during Commit().
409 virtual void SetWriteOptions(const WriteOptions
& write_options
) = 0;
411 // If this key was previously fetched in this transaction using
412 // GetForUpdate/MultigetForUpdate(), calling UndoGetForUpdate will tell
413 // the transaction that it no longer needs to do any conflict checking
416 // If a key has been fetched N times via GetForUpdate/MultigetForUpdate(),
417 // then UndoGetForUpdate will only have an effect if it is also called N
418 // times. If this key has been written to in this transaction,
419 // UndoGetForUpdate() will have no effect.
421 // If SetSavePoint() has been called after the GetForUpdate(),
422 // UndoGetForUpdate() will not have any effect.
424 // If this Transaction was created by an OptimisticTransactionDB,
425 // calling UndoGetForUpdate can affect whether this key is conflict checked
427 // If this Transaction was created by a TransactionDB,
428 // calling UndoGetForUpdate may release any held locks for this key.
429 virtual void UndoGetForUpdate(ColumnFamilyHandle
* column_family
,
430 const Slice
& key
) = 0;
431 virtual void UndoGetForUpdate(const Slice
& key
) = 0;
433 virtual Status
RebuildFromWriteBatch(WriteBatch
* src_batch
) = 0;
435 virtual WriteBatch
* GetCommitTimeWriteBatch() = 0;
437 virtual void SetLogNumber(uint64_t log
) { log_number_
= log
; }
439 virtual uint64_t GetLogNumber() const { return log_number_
; }
441 virtual Status
SetName(const TransactionName
& name
) = 0;
443 virtual TransactionName
GetName() const { return name_
; }
445 virtual TransactionID
GetID() const { return 0; }
447 virtual bool IsDeadlockDetect() const { return false; }
449 virtual std::vector
<TransactionID
> GetWaitingTxns(
450 uint32_t* /*column_family_id*/, std::string
* /*key*/) const {
452 return std::vector
<TransactionID
>();
455 enum TransactionState
{
457 AWAITING_PREPARE
= 1,
461 AWAITING_ROLLBACK
= 5,
466 TransactionState
GetState() const { return txn_state_
; }
467 void SetState(TransactionState state
) { txn_state_
= state
; }
469 // NOTE: Experimental feature
470 // The globally unique id with which the transaction is identified. This id
471 // might or might not be set depending on the implementation. Similarly the
472 // implementation decides the point in lifetime of a transaction at which it
473 // assigns the id. Although currently it is the case, the id is not guaranteed
474 // to remain the same across restarts.
475 uint64_t GetId() { return id_
; }
478 explicit Transaction(const TransactionDB
* /*db*/) {}
479 Transaction() : log_number_(0), txn_state_(STARTED
) {}
481 // the log in which the prepared section for this txn resides
482 // (for two phase commit)
483 uint64_t log_number_
;
484 TransactionName name_
;
486 // Execution status of the transaction.
487 std::atomic
<TransactionState
> txn_state_
;
490 virtual void SetId(uint64_t id
) {
496 friend class PessimisticTransactionDB
;
497 friend class WriteUnpreparedTxnDB
;
498 // No copying allowed
499 Transaction(const Transaction
&);
500 void operator=(const Transaction
&);
503 } // namespace rocksdb
505 #endif // ROCKSDB_LITE