]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/include/rocksdb/utilities/transaction.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / include / rocksdb / utilities / transaction.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6#pragma once
7
8#ifndef ROCKSDB_LITE
9
10#include <string>
11#include <vector>
12
13#include "rocksdb/comparator.h"
14#include "rocksdb/db.h"
15#include "rocksdb/status.h"
16
f67539c2 17namespace ROCKSDB_NAMESPACE {
7c673cae
FG
18
19class Iterator;
20class TransactionDB;
21class WriteBatchWithIndex;
22
23using TransactionName = std::string;
24
25using TransactionID = uint64_t;
26
20effc67
TL
27// An endpoint for a range of keys.
28class Endpoint {
29 // TODO
30};
31
7c673cae
FG
32// Provides notification to the caller of SetSnapshotOnNextOperation when
33// the actual snapshot gets created
34class TransactionNotifier {
35 public:
36 virtual ~TransactionNotifier() {}
37
38 // Implement this method to receive notification when a snapshot is
39 // requested via SetSnapshotOnNextOperation.
40 virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0;
41};
42
43// Provides BEGIN/COMMIT/ROLLBACK transactions.
44//
45// To use transactions, you must first create either an OptimisticTransactionDB
46// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for
47// more information.
48//
49// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
50//
51// It is up to the caller to synchronize access to this object.
52//
53// See examples/transaction_example.cc for some simple examples.
54//
55// TODO(agiardullo): Not yet implemented
56// -PerfContext statistics
57// -Support for using Transactions with DBWithTTL
58class Transaction {
59 public:
f67539c2
TL
60 // No copying allowed
61 Transaction(const Transaction&) = delete;
62 void operator=(const Transaction&) = delete;
63
7c673cae
FG
64 virtual ~Transaction() {}
65
66 // If a transaction has a snapshot set, the transaction will ensure that
67 // any keys successfully written (or fetched via GetForUpdate()) have not
68 // been modified outside of this transaction since the time the snapshot was
69 // set.
70 // If a snapshot has not been set, the transaction guarantees that keys have
71 // not been modified since the time each key was first written (or fetched via
72 // GetForUpdate()).
73 //
74 // Using SetSnapshot() will provide stricter isolation guarantees at the
75 // expense of potentially more transaction failures due to conflicts with
76 // other writes.
77 //
78 // Calling SetSnapshot() has no effect on keys written before this function
79 // has been called.
80 //
81 // SetSnapshot() may be called multiple times if you would like to change
82 // the snapshot used for different operations in this transaction.
83 //
84 // Calling SetSnapshot will not affect the version of Data returned by Get()
85 // methods. See Transaction::Get() for more details.
86 virtual void SetSnapshot() = 0;
87
88 // Similar to SetSnapshot(), but will not change the current snapshot
89 // until Put/Merge/Delete/GetForUpdate/MultiGetForUpdate is called.
90 // By calling this function, the transaction will essentially call
91 // SetSnapshot() for you right before performing the next write/GetForUpdate.
92 //
93 // Calling SetSnapshotOnNextOperation() will not affect what snapshot is
94 // returned by GetSnapshot() until the next write/GetForUpdate is executed.
95 //
96 // When the snapshot is created the notifier's SnapshotCreated method will
97 // be called so that the caller can get access to the snapshot.
98 //
99 // This is an optimization to reduce the likelihood of conflicts that
100 // could occur in between the time SetSnapshot() is called and the first
101 // write/GetForUpdate operation. Eg, this prevents the following
102 // race-condition:
103 //
104 // txn1->SetSnapshot();
105 // txn2->Put("A", ...);
106 // txn2->Commit();
107 // txn1->GetForUpdate(opts, "A", ...); // FAIL!
108 virtual void SetSnapshotOnNextOperation(
109 std::shared_ptr<TransactionNotifier> notifier = nullptr) = 0;
110
111 // Returns the Snapshot created by the last call to SetSnapshot().
112 //
113 // REQUIRED: The returned Snapshot is only valid up until the next time
114 // SetSnapshot()/SetSnapshotOnNextOperation() is called, ClearSnapshot()
115 // is called, or the Transaction is deleted.
116 virtual const Snapshot* GetSnapshot() const = 0;
117
118 // Clears the current snapshot (i.e. no snapshot will be 'set')
119 //
120 // This removes any snapshot that currently exists or is set to be created
121 // on the next update operation (SetSnapshotOnNextOperation).
122 //
123 // Calling ClearSnapshot() has no effect on keys written before this function
124 // has been called.
125 //
126 // If a reference to a snapshot was retrieved via GetSnapshot(), it will no
127 // longer be valid and should be discarded after a call to ClearSnapshot().
128 virtual void ClearSnapshot() = 0;
129
11fdf7f2 130 // Prepare the current transaction for 2PC
7c673cae
FG
131 virtual Status Prepare() = 0;
132
133 // Write all batched keys to the db atomically.
134 //
135 // Returns OK on success.
136 //
137 // May return any error status that could be returned by DB::Write().
138 //
139 // If this transaction was created by an OptimisticTransactionDB(),
140 // Status::Busy() may be returned if the transaction could not guarantee
141 // that there are no write conflicts. Status::TryAgain() may be returned
142 // if the memtable history size is not large enough
f67539c2 143 // (See max_write_buffer_size_to_maintain).
7c673cae
FG
144 //
145 // If this transaction was created by a TransactionDB(), Status::Expired()
146 // may be returned if this transaction has lived for longer than
20effc67
TL
147 // TransactionOptions.expiration. Status::TxnNotPrepared() may be returned if
148 // TransactionOptions.skip_prepare is false and Prepare is not called on this
149 // transaction before Commit.
7c673cae
FG
150 virtual Status Commit() = 0;
151
152 // Discard all batched writes in this transaction.
153 virtual Status Rollback() = 0;
154
155 // Records the state of the transaction for future calls to
156 // RollbackToSavePoint(). May be called multiple times to set multiple save
157 // points.
158 virtual void SetSavePoint() = 0;
159
160 // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
161 // since the most recent call to SetSavePoint() and remove the most recent
162 // SetSavePoint().
163 // If there is no previous call to SetSavePoint(), returns Status::NotFound()
164 virtual Status RollbackToSavePoint() = 0;
165
11fdf7f2
TL
166 // Pop the most recent save point.
167 // If there is no previous call to SetSavePoint(), Status::NotFound()
168 // will be returned.
169 // Otherwise returns Status::OK().
170 virtual Status PopSavePoint() = 0;
171
7c673cae
FG
172 // This function is similar to DB::Get() except it will also read pending
173 // changes in this transaction. Currently, this function will return
174 // Status::MergeInProgress if the most recent write to the queried key in
175 // this batch is a Merge.
176 //
177 // If read_options.snapshot is not set, the current version of the key will
178 // be read. Calling SetSnapshot() does not affect the version of the data
179 // returned.
180 //
181 // Note that setting read_options.snapshot will affect what is read from the
182 // DB but will NOT change which keys are read from this transaction (the keys
183 // in this transaction do not yet belong to any snapshot and will be fetched
184 // regardless).
185 virtual Status Get(const ReadOptions& options,
186 ColumnFamilyHandle* column_family, const Slice& key,
187 std::string* value) = 0;
188
11fdf7f2
TL
189 // An overload of the above method that receives a PinnableSlice
190 // For backward compatibility a default implementation is provided
191 virtual Status Get(const ReadOptions& options,
192 ColumnFamilyHandle* column_family, const Slice& key,
193 PinnableSlice* pinnable_val) {
194 assert(pinnable_val != nullptr);
195 auto s = Get(options, column_family, key, pinnable_val->GetSelf());
196 pinnable_val->PinSelf();
197 return s;
198 }
199
7c673cae
FG
200 virtual Status Get(const ReadOptions& options, const Slice& key,
201 std::string* value) = 0;
11fdf7f2
TL
202 virtual Status Get(const ReadOptions& options, const Slice& key,
203 PinnableSlice* pinnable_val) {
204 assert(pinnable_val != nullptr);
205 auto s = Get(options, key, pinnable_val->GetSelf());
206 pinnable_val->PinSelf();
207 return s;
208 }
7c673cae
FG
209
210 virtual std::vector<Status> MultiGet(
211 const ReadOptions& options,
212 const std::vector<ColumnFamilyHandle*>& column_family,
213 const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
214
215 virtual std::vector<Status> MultiGet(const ReadOptions& options,
216 const std::vector<Slice>& keys,
217 std::vector<std::string>* values) = 0;
218
f67539c2
TL
219 // Batched version of MultiGet - see DBImpl::MultiGet(). Sub-classes are
220 // expected to override this with an implementation that calls
221 // DBImpl::MultiGet()
222 virtual void MultiGet(const ReadOptions& options,
223 ColumnFamilyHandle* column_family,
224 const size_t num_keys, const Slice* keys,
225 PinnableSlice* values, Status* statuses,
226 const bool /*sorted_input*/ = false) {
227 for (size_t i = 0; i < num_keys; ++i) {
228 statuses[i] = Get(options, column_family, keys[i], &values[i]);
229 }
230 }
231
7c673cae
FG
232 // Read this key and ensure that this transaction will only
233 // be able to be committed if this key is not written outside this
234 // transaction after it has first been read (or after the snapshot if a
494da23a
TL
235 // snapshot is set in this transaction and do_validate is true). If
236 // do_validate is false, ReadOptions::snapshot is expected to be nullptr so
237 // that GetForUpdate returns the latest committed value. The transaction
238 // behavior is the same regardless of whether the key exists or not.
7c673cae
FG
239 //
240 // Note: Currently, this function will return Status::MergeInProgress
241 // if the most recent write to the queried key in this batch is a Merge.
242 //
243 // The values returned by this function are similar to Transaction::Get().
244 // If value==nullptr, then this function will not read any data, but will
245 // still ensure that this key cannot be written to outside of this
246 // transaction.
247 //
248 // If this transaction was created by an OptimisticTransactionDB,
249 // GetForUpdate() could cause Commit() to fail. Otherwise, it could return any
250 // error that could be returned by DB::Get().
251 //
252 // If this transaction was created by a TransactionDB, it can return
253 // Status::OK() on success,
254 // Status::Busy() if there is a write conflict,
255 // Status::TimedOut() if a lock could not be acquired,
256 // Status::TryAgain() if the memtable history size is not large enough
f67539c2 257 // (See max_write_buffer_size_to_maintain)
7c673cae
FG
258 // Status::MergeInProgress() if merge operations cannot be resolved.
259 // or other errors if this key could not be read.
260 virtual Status GetForUpdate(const ReadOptions& options,
261 ColumnFamilyHandle* column_family,
262 const Slice& key, std::string* value,
494da23a
TL
263 bool exclusive = true,
264 const bool do_validate = true) = 0;
7c673cae 265
11fdf7f2
TL
266 // An overload of the above method that receives a PinnableSlice
267 // For backward compatibility a default implementation is provided
268 virtual Status GetForUpdate(const ReadOptions& options,
494da23a 269 ColumnFamilyHandle* column_family,
11fdf7f2 270 const Slice& key, PinnableSlice* pinnable_val,
494da23a
TL
271 bool exclusive = true,
272 const bool do_validate = true) {
11fdf7f2
TL
273 if (pinnable_val == nullptr) {
274 std::string* null_str = nullptr;
494da23a
TL
275 return GetForUpdate(options, column_family, key, null_str, exclusive,
276 do_validate);
11fdf7f2 277 } else {
494da23a
TL
278 auto s = GetForUpdate(options, column_family, key,
279 pinnable_val->GetSelf(), exclusive, do_validate);
11fdf7f2
TL
280 pinnable_val->PinSelf();
281 return s;
282 }
283 }
284
7c673cae 285 virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
494da23a
TL
286 std::string* value, bool exclusive = true,
287 const bool do_validate = true) = 0;
7c673cae
FG
288
289 virtual std::vector<Status> MultiGetForUpdate(
290 const ReadOptions& options,
291 const std::vector<ColumnFamilyHandle*>& column_family,
292 const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
293
294 virtual std::vector<Status> MultiGetForUpdate(
295 const ReadOptions& options, const std::vector<Slice>& keys,
296 std::vector<std::string>* values) = 0;
297
298 // Returns an iterator that will iterate on all keys in the default
299 // column family including both keys in the DB and uncommitted keys in this
300 // transaction.
301 //
302 // Setting read_options.snapshot will affect what is read from the
303 // DB but will NOT change which keys are read from this transaction (the keys
304 // in this transaction do not yet belong to any snapshot and will be fetched
305 // regardless).
306 //
307 // Caller is responsible for deleting the returned Iterator.
308 //
309 // The returned iterator is only valid until Commit(), Rollback(), or
310 // RollbackToSavePoint() is called.
311 virtual Iterator* GetIterator(const ReadOptions& read_options) = 0;
312
313 virtual Iterator* GetIterator(const ReadOptions& read_options,
314 ColumnFamilyHandle* column_family) = 0;
315
316 // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
317 // functions in WriteBatch, but will also do conflict checking on the
318 // keys being written.
319 //
f67539c2
TL
320 // assume_tracked=true expects the key be already tracked. More
321 // specifically, it means the key was previously tracked in the same
322 // savepoint, with the same exclusive flag, and at a lower sequence number.
323 // If valid then it skips ValidateSnapshot. Returns error otherwise.
494da23a 324 //
7c673cae
FG
325 // If this Transaction was created on an OptimisticTransactionDB, these
326 // functions should always return Status::OK().
327 //
328 // If this Transaction was created on a TransactionDB, the status returned
329 // can be:
330 // Status::OK() on success,
331 // Status::Busy() if there is a write conflict,
332 // Status::TimedOut() if a lock could not be acquired,
333 // Status::TryAgain() if the memtable history size is not large enough
f67539c2 334 // (See max_write_buffer_size_to_maintain)
7c673cae
FG
335 // or other errors on unexpected failures.
336 virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
494da23a 337 const Slice& value, const bool assume_tracked = false) = 0;
7c673cae
FG
338 virtual Status Put(const Slice& key, const Slice& value) = 0;
339 virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
494da23a
TL
340 const SliceParts& value,
341 const bool assume_tracked = false) = 0;
7c673cae
FG
342 virtual Status Put(const SliceParts& key, const SliceParts& value) = 0;
343
344 virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
494da23a
TL
345 const Slice& value,
346 const bool assume_tracked = false) = 0;
7c673cae
FG
347 virtual Status Merge(const Slice& key, const Slice& value) = 0;
348
494da23a
TL
349 virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
350 const bool assume_tracked = false) = 0;
7c673cae
FG
351 virtual Status Delete(const Slice& key) = 0;
352 virtual Status Delete(ColumnFamilyHandle* column_family,
494da23a
TL
353 const SliceParts& key,
354 const bool assume_tracked = false) = 0;
7c673cae
FG
355 virtual Status Delete(const SliceParts& key) = 0;
356
357 virtual Status SingleDelete(ColumnFamilyHandle* column_family,
494da23a
TL
358 const Slice& key,
359 const bool assume_tracked = false) = 0;
7c673cae
FG
360 virtual Status SingleDelete(const Slice& key) = 0;
361 virtual Status SingleDelete(ColumnFamilyHandle* column_family,
494da23a
TL
362 const SliceParts& key,
363 const bool assume_tracked = false) = 0;
7c673cae
FG
364 virtual Status SingleDelete(const SliceParts& key) = 0;
365
366 // PutUntracked() will write a Put to the batch of operations to be committed
367 // in this transaction. This write will only happen if this transaction
368 // gets committed successfully. But unlike Transaction::Put(),
369 // no conflict checking will be done for this key.
370 //
11fdf7f2
TL
371 // If this Transaction was created on a PessimisticTransactionDB, this
372 // function will still acquire locks necessary to make sure this write doesn't
373 // cause conflicts in other transactions and may return Status::Busy().
7c673cae
FG
374 virtual Status PutUntracked(ColumnFamilyHandle* column_family,
375 const Slice& key, const Slice& value) = 0;
376 virtual Status PutUntracked(const Slice& key, const Slice& value) = 0;
377 virtual Status PutUntracked(ColumnFamilyHandle* column_family,
378 const SliceParts& key,
379 const SliceParts& value) = 0;
380 virtual Status PutUntracked(const SliceParts& key,
381 const SliceParts& value) = 0;
382
383 virtual Status MergeUntracked(ColumnFamilyHandle* column_family,
384 const Slice& key, const Slice& value) = 0;
385 virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0;
386
387 virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
388 const Slice& key) = 0;
389
390 virtual Status DeleteUntracked(const Slice& key) = 0;
391 virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
392 const SliceParts& key) = 0;
393 virtual Status DeleteUntracked(const SliceParts& key) = 0;
11fdf7f2
TL
394 virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
395 const Slice& key) = 0;
396
397 virtual Status SingleDeleteUntracked(const Slice& key) = 0;
7c673cae
FG
398
399 // Similar to WriteBatch::PutLogData
400 virtual void PutLogData(const Slice& blob) = 0;
401
402 // By default, all Put/Merge/Delete operations will be indexed in the
403 // transaction so that Get/GetForUpdate/GetIterator can search for these
404 // keys.
405 //
406 // If the caller does not want to fetch the keys about to be written,
407 // they may want to avoid indexing as a performance optimization.
408 // Calling DisableIndexing() will turn off indexing for all future
409 // Put/Merge/Delete operations until EnableIndexing() is called.
410 //
411 // If a key is Put/Merge/Deleted after DisableIndexing is called and then
412 // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
413 // undefined.
414 virtual void DisableIndexing() = 0;
415 virtual void EnableIndexing() = 0;
416
417 // Returns the number of distinct Keys being tracked by this transaction.
11fdf7f2 418 // If this transaction was created by a TransactionDB, this is the number of
7c673cae
FG
419 // keys that are currently locked by this transaction.
420 // If this transaction was created by an OptimisticTransactionDB, this is the
421 // number of keys that need to be checked for conflicts at commit time.
422 virtual uint64_t GetNumKeys() const = 0;
423
424 // Returns the number of Puts/Deletes/Merges that have been applied to this
425 // transaction so far.
426 virtual uint64_t GetNumPuts() const = 0;
427 virtual uint64_t GetNumDeletes() const = 0;
428 virtual uint64_t GetNumMerges() const = 0;
429
430 // Returns the elapsed time in milliseconds since this Transaction began.
431 virtual uint64_t GetElapsedTime() const = 0;
432
433 // Fetch the underlying write batch that contains all pending changes to be
434 // committed.
435 //
436 // Note: You should not write or delete anything from the batch directly and
437 // should only use the functions in the Transaction class to
438 // write to this transaction.
439 virtual WriteBatchWithIndex* GetWriteBatch() = 0;
440
441 // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
442 // this transaction.
443 // Has no effect on OptimisticTransactions.
444 virtual void SetLockTimeout(int64_t timeout) = 0;
445
446 // Return the WriteOptions that will be used during Commit()
447 virtual WriteOptions* GetWriteOptions() = 0;
448
449 // Reset the WriteOptions that will be used during Commit().
450 virtual void SetWriteOptions(const WriteOptions& write_options) = 0;
451
452 // If this key was previously fetched in this transaction using
453 // GetForUpdate/MultiGetForUpdate(), calling UndoGetForUpdate will tell
454 // the transaction that it no longer needs to do any conflict checking
455 // for this key.
456 //
457 // If a key has been fetched N times via GetForUpdate/MultiGetForUpdate(),
458 // then UndoGetForUpdate will only have an effect if it is also called N
459 // times. If this key has been written to in this transaction,
460 // UndoGetForUpdate() will have no effect.
461 //
462 // If SetSavePoint() has been called after the GetForUpdate(),
463 // UndoGetForUpdate() will not have any effect.
464 //
465 // If this Transaction was created by an OptimisticTransactionDB,
466 // calling UndoGetForUpdate can affect whether this key is conflict checked
467 // at commit time.
468 // If this Transaction was created by a TransactionDB,
469 // calling UndoGetForUpdate may release any held locks for this key.
470 virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family,
471 const Slice& key) = 0;
472 virtual void UndoGetForUpdate(const Slice& key) = 0;
473
474 virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0;
475
476 virtual WriteBatch* GetCommitTimeWriteBatch() = 0;
477
478 virtual void SetLogNumber(uint64_t log) { log_number_ = log; }
479
480 virtual uint64_t GetLogNumber() const { return log_number_; }
481
482 virtual Status SetName(const TransactionName& name) = 0;
483
484 virtual TransactionName GetName() const { return name_; }
485
486 virtual TransactionID GetID() const { return 0; }
487
488 virtual bool IsDeadlockDetect() const { return false; }
489
11fdf7f2
TL
490 virtual std::vector<TransactionID> GetWaitingTxns(
491 uint32_t* /*column_family_id*/, std::string* /*key*/) const {
7c673cae
FG
492 assert(false);
493 return std::vector<TransactionID>();
494 }
495
496 enum TransactionState {
497 STARTED = 0,
498 AWAITING_PREPARE = 1,
499 PREPARED = 2,
500 AWAITING_COMMIT = 3,
20effc67
TL
501 COMMITTED = 4,
502 COMMITED = COMMITTED, // old misspelled name
7c673cae
FG
503 AWAITING_ROLLBACK = 5,
504 ROLLEDBACK = 6,
505 LOCKS_STOLEN = 7,
506 };
507
508 TransactionState GetState() const { return txn_state_; }
509 void SetState(TransactionState state) { txn_state_ = state; }
510
11fdf7f2
TL
511 // NOTE: Experimental feature
512 // The globally unique id with which the transaction is identified. This id
513 // might or might not be set depending on the implementation. Similarly the
514 // implementation decides the point in lifetime of a transaction at which it
515 // assigns the id. Although currently it is the case, the id is not guaranteed
516 // to remain the same across restarts.
517 uint64_t GetId() { return id_; }
518
7c673cae 519 protected:
11fdf7f2
TL
520 explicit Transaction(const TransactionDB* /*db*/) {}
521 Transaction() : log_number_(0), txn_state_(STARTED) {}
7c673cae
FG
522
523 // the log in which the prepared section for this txn resides
524 // (for two phase commit)
525 uint64_t log_number_;
526 TransactionName name_;
527
528 // Execution status of the transaction.
529 std::atomic<TransactionState> txn_state_;
530
11fdf7f2
TL
531 uint64_t id_ = 0;
532 virtual void SetId(uint64_t id) {
533 assert(id_ == 0);
534 id_ = id;
535 }
536
f67539c2
TL
537 virtual uint64_t GetLastLogNumber() const { return log_number_; }
538
7c673cae 539 private:
11fdf7f2
TL
540 friend class PessimisticTransactionDB;
541 friend class WriteUnpreparedTxnDB;
f67539c2
TL
542 friend class TransactionTest_TwoPhaseLogRollingTest_Test;
543 friend class TransactionTest_TwoPhaseLogRollingTest2_Test;
7c673cae
FG
544};
545
f67539c2 546} // namespace ROCKSDB_NAMESPACE
7c673cae
FG
547
548#endif // ROCKSDB_LITE