]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/include/rocksdb/utilities/transaction.h
d6c6722c8eece3d08763c3cc98c7747cf91eb389
[ceph.git] / ceph / src / rocksdb / include / rocksdb / utilities / transaction.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5
6 #pragma once
7
8 #ifndef ROCKSDB_LITE
9
10 #include <string>
11 #include <vector>
12
13 #include "rocksdb/comparator.h"
14 #include "rocksdb/db.h"
15 #include "rocksdb/status.h"
16
17 namespace ROCKSDB_NAMESPACE {
18
19 class Iterator;
20 class TransactionDB;
21 class WriteBatchWithIndex;
22
23 using TransactionName = std::string;
24
25 using TransactionID = uint64_t;
26
27 // Provides notification to the caller of SetSnapshotOnNextOperation when
28 // the actual snapshot gets created
29 class TransactionNotifier {
30 public:
31 virtual ~TransactionNotifier() {}
32
33 // Implement this method to receive notification when a snapshot is
34 // requested via SetSnapshotOnNextOperation.
35 virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0;
36 };
37
38 // Provides BEGIN/COMMIT/ROLLBACK transactions.
39 //
40 // To use transactions, you must first create either an OptimisticTransactionDB
41 // or a TransactionDB. See examples/[optimistic_]transaction_example.cc for
42 // more information.
43 //
44 // To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
45 //
46 // It is up to the caller to synchronize access to this object.
47 //
48 // See examples/transaction_example.cc for some simple examples.
49 //
50 // TODO(agiardullo): Not yet implemented
51 // -PerfContext statistics
52 // -Support for using Transactions with DBWithTTL
53 class Transaction {
54 public:
55 // No copying allowed
56 Transaction(const Transaction&) = delete;
57 void operator=(const Transaction&) = delete;
58
59 virtual ~Transaction() {}
60
61 // If a transaction has a snapshot set, the transaction will ensure that
62 // any keys successfully written (or fetched via GetForUpdate()) have not
63 // been modified outside of this transaction since the time the snapshot was
64 // set.
65 // If a snapshot has not been set, the transaction guarantees that keys have
66 // not been modified since the time each key was first written (or fetched via
67 // GetForUpdate()).
68 //
69 // Using SetSnapshot() will provide stricter isolation guarantees at the
70 // expense of potentially more transaction failures due to conflicts with
71 // other writes.
72 //
73 // Calling SetSnapshot() has no effect on keys written before this function
74 // has been called.
75 //
76 // SetSnapshot() may be called multiple times if you would like to change
77 // the snapshot used for different operations in this transaction.
78 //
79 // Calling SetSnapshot will not affect the version of data returned by Get()
80 // methods. See Transaction::Get() for more details.
81 virtual void SetSnapshot() = 0;
82
83 // Similar to SetSnapshot(), but will not change the current snapshot
84 // until Put/Merge/Delete/GetForUpdate/MultiGetForUpdate is called.
85 // By calling this function, the transaction will essentially call
86 // SetSnapshot() for you right before performing the next write/GetForUpdate.
87 //
88 // Calling SetSnapshotOnNextOperation() will not affect what snapshot is
89 // returned by GetSnapshot() until the next write/GetForUpdate is executed.
90 //
91 // When the snapshot is created the notifier's SnapshotCreated method will
92 // be called so that the caller can get access to the snapshot.
93 //
94 // This is an optimization to reduce the likelihood of conflicts that
95 // could occur in between the time SetSnapshot() is called and the first
96 // write/GetForUpdate operation. Eg, this prevents the following
97 // race-condition:
98 //
99 // txn1->SetSnapshot();
100 // txn2->Put("A", ...);
101 // txn2->Commit();
102 // txn1->GetForUpdate(opts, "A", ...); // FAIL!
103 virtual void SetSnapshotOnNextOperation(
104 std::shared_ptr<TransactionNotifier> notifier = nullptr) = 0;
105
106 // Returns the Snapshot created by the last call to SetSnapshot().
107 //
108 // REQUIRED: The returned Snapshot is only valid up until the next time
109 // SetSnapshot()/SetSnapshotOnNextOperation() is called, ClearSnapshot()
110 // is called, or the Transaction is deleted.
111 virtual const Snapshot* GetSnapshot() const = 0;
112
113 // Clears the current snapshot (i.e. no snapshot will be 'set')
114 //
115 // This removes any snapshot that currently exists or is set to be created
116 // on the next update operation (SetSnapshotOnNextOperation).
117 //
118 // Calling ClearSnapshot() has no effect on keys written before this function
119 // has been called.
120 //
121 // If a reference to a snapshot was retrieved via GetSnapshot(), it will no
122 // longer be valid and should be discarded after a call to ClearSnapshot().
123 virtual void ClearSnapshot() = 0;
124
125 // Prepare the current transaction for 2PC
126 virtual Status Prepare() = 0;
127
128 // Write all batched keys to the db atomically.
129 //
130 // Returns OK on success.
131 //
132 // May return any error status that could be returned by DB::Write().
133 //
134 // If this transaction was created by an OptimisticTransactionDB(),
135 // Status::Busy() may be returned if the transaction could not guarantee
136 // that there are no write conflicts. Status::TryAgain() may be returned
137 // if the memtable history size is not large enough
138 // (See max_write_buffer_size_to_maintain).
139 //
140 // If this transaction was created by a TransactionDB(), Status::Expired()
141 // may be returned if this transaction has lived for longer than
142 // TransactionOptions.expiration.
143 virtual Status Commit() = 0;
144
145 // Discard all batched writes in this transaction.
146 virtual Status Rollback() = 0;
147
148 // Records the state of the transaction for future calls to
149 // RollbackToSavePoint(). May be called multiple times to set multiple save
150 // points.
151 virtual void SetSavePoint() = 0;
152
153 // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
154 // since the most recent call to SetSavePoint() and removes the most recent
155 // SetSavePoint().
156 // If there is no previous call to SetSavePoint(), returns Status::NotFound()
157 virtual Status RollbackToSavePoint() = 0;
158
159 // Pop the most recent save point.
160 // If there is no previous call to SetSavePoint(), Status::NotFound()
161 // will be returned.
162 // Otherwise returns Status::OK().
163 virtual Status PopSavePoint() = 0;
164
165 // This function is similar to DB::Get() except it will also read pending
166 // changes in this transaction. Currently, this function will return
167 // Status::MergeInProgress if the most recent write to the queried key in
168 // this batch is a Merge.
169 //
170 // If read_options.snapshot is not set, the current version of the key will
171 // be read. Calling SetSnapshot() does not affect the version of the data
172 // returned.
173 //
174 // Note that setting read_options.snapshot will affect what is read from the
175 // DB but will NOT change which keys are read from this transaction (the keys
176 // in this transaction do not yet belong to any snapshot and will be fetched
177 // regardless).
178 virtual Status Get(const ReadOptions& options,
179 ColumnFamilyHandle* column_family, const Slice& key,
180 std::string* value) = 0;
181
182 // An overload of the above method that receives a PinnableSlice
183 // For backward compatibility a default implementation is provided
184 virtual Status Get(const ReadOptions& options,
185 ColumnFamilyHandle* column_family, const Slice& key,
186 PinnableSlice* pinnable_val) {
187 assert(pinnable_val != nullptr);
188 auto s = Get(options, column_family, key, pinnable_val->GetSelf());
189 pinnable_val->PinSelf();
190 return s;
191 }
192
193 virtual Status Get(const ReadOptions& options, const Slice& key,
194 std::string* value) = 0;
195 virtual Status Get(const ReadOptions& options, const Slice& key,
196 PinnableSlice* pinnable_val) {
197 assert(pinnable_val != nullptr);
198 auto s = Get(options, key, pinnable_val->GetSelf());
199 pinnable_val->PinSelf();
200 return s;
201 }
202
203 virtual std::vector<Status> MultiGet(
204 const ReadOptions& options,
205 const std::vector<ColumnFamilyHandle*>& column_family,
206 const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
207
208 virtual std::vector<Status> MultiGet(const ReadOptions& options,
209 const std::vector<Slice>& keys,
210 std::vector<std::string>* values) = 0;
211
212 // Batched version of MultiGet - see DBImpl::MultiGet(). Sub-classes are
213 // expected to override this with an implementation that calls
214 // DBImpl::MultiGet()
215 virtual void MultiGet(const ReadOptions& options,
216 ColumnFamilyHandle* column_family,
217 const size_t num_keys, const Slice* keys,
218 PinnableSlice* values, Status* statuses,
219 const bool /*sorted_input*/ = false) {
220 for (size_t i = 0; i < num_keys; ++i) {
221 statuses[i] = Get(options, column_family, keys[i], &values[i]);
222 }
223 }
224
225 // Read this key and ensure that this transaction will only
226 // be able to be committed if this key is not written outside this
227 // transaction after it has first been read (or after the snapshot if a
228 // snapshot is set in this transaction and do_validate is true). If
229 // do_validate is false, ReadOptions::snapshot is expected to be nullptr so
230 // that GetForUpdate returns the latest committed value. The transaction
231 // behavior is the same regardless of whether the key exists or not.
232 //
233 // Note: Currently, this function will return Status::MergeInProgress
234 // if the most recent write to the queried key in this batch is a Merge.
235 //
236 // The values returned by this function are similar to Transaction::Get().
237 // If value==nullptr, then this function will not read any data, but will
238 // still ensure that this key cannot be written to outside of this
239 // transaction.
240 //
241 // If this transaction was created by an OptimisticTransactionDB,
242 // GetForUpdate() could cause Commit() to fail. Otherwise, it could return
243 // any error that could be returned by DB::Get().
244 //
245 // If this transaction was created by a TransactionDB, it can return
246 // Status::OK() on success,
247 // Status::Busy() if there is a write conflict,
248 // Status::TimedOut() if a lock could not be acquired,
249 // Status::TryAgain() if the memtable history size is not large enough
250 // (See max_write_buffer_size_to_maintain)
251 // Status::MergeInProgress() if merge operations cannot be resolved,
252 // or other errors if this key could not be read.
253 virtual Status GetForUpdate(const ReadOptions& options,
254 ColumnFamilyHandle* column_family,
255 const Slice& key, std::string* value,
256 bool exclusive = true,
257 const bool do_validate = true) = 0;
258
259 // An overload of the above method that receives a PinnableSlice
260 // For backward compatibility a default implementation is provided
261 virtual Status GetForUpdate(const ReadOptions& options,
262 ColumnFamilyHandle* column_family,
263 const Slice& key, PinnableSlice* pinnable_val,
264 bool exclusive = true,
265 const bool do_validate = true) {
266 if (pinnable_val == nullptr) {
267 std::string* null_str = nullptr;
268 return GetForUpdate(options, column_family, key, null_str, exclusive,
269 do_validate);
270 } else {
271 auto s = GetForUpdate(options, column_family, key,
272 pinnable_val->GetSelf(), exclusive, do_validate);
273 pinnable_val->PinSelf();
274 return s;
275 }
276 }
277
278 virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
279 std::string* value, bool exclusive = true,
280 const bool do_validate = true) = 0;
281
282 virtual std::vector<Status> MultiGetForUpdate(
283 const ReadOptions& options,
284 const std::vector<ColumnFamilyHandle*>& column_family,
285 const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
286
287 virtual std::vector<Status> MultiGetForUpdate(
288 const ReadOptions& options, const std::vector<Slice>& keys,
289 std::vector<std::string>* values) = 0;
290
291 // Returns an iterator that will iterate on all keys in the default
292 // column family including both keys in the DB and uncommitted keys in this
293 // transaction.
294 //
295 // Setting read_options.snapshot will affect what is read from the
296 // DB but will NOT change which keys are read from this transaction (the keys
297 // in this transaction do not yet belong to any snapshot and will be fetched
298 // regardless).
299 //
300 // Caller is responsible for deleting the returned Iterator.
301 //
302 // The returned iterator is only valid until Commit(), Rollback(), or
303 // RollbackToSavePoint() is called.
304 virtual Iterator* GetIterator(const ReadOptions& read_options) = 0;
305
306 virtual Iterator* GetIterator(const ReadOptions& read_options,
307 ColumnFamilyHandle* column_family) = 0;
308
309 // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
310 // functions in WriteBatch, but will also do conflict checking on the
311 // keys being written.
312 //
313 // assume_tracked=true expects the key be already tracked. More
314 // specifically, it means that the key was previously tracked in the same
315 // savepoint, with the same exclusive flag, and at a lower sequence number.
316 // If valid then it skips ValidateSnapshot. Returns error otherwise.
317 //
318 // If this Transaction was created on an OptimisticTransactionDB, these
319 // functions should always return Status::OK().
320 //
321 // If this Transaction was created on a TransactionDB, the status returned
322 // can be:
323 // Status::OK() on success,
324 // Status::Busy() if there is a write conflict,
325 // Status::TimedOut() if a lock could not be acquired,
326 // Status::TryAgain() if the memtable history size is not large enough
327 // (See max_write_buffer_size_to_maintain)
328 // or other errors on unexpected failures.
329 virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
330 const Slice& value, const bool assume_tracked = false) = 0;
331 virtual Status Put(const Slice& key, const Slice& value) = 0;
332 virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
333 const SliceParts& value,
334 const bool assume_tracked = false) = 0;
335 virtual Status Put(const SliceParts& key, const SliceParts& value) = 0;
336
337 virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
338 const Slice& value,
339 const bool assume_tracked = false) = 0;
340 virtual Status Merge(const Slice& key, const Slice& value) = 0;
341
342 virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
343 const bool assume_tracked = false) = 0;
344 virtual Status Delete(const Slice& key) = 0;
345 virtual Status Delete(ColumnFamilyHandle* column_family,
346 const SliceParts& key,
347 const bool assume_tracked = false) = 0;
348 virtual Status Delete(const SliceParts& key) = 0;
349
350 virtual Status SingleDelete(ColumnFamilyHandle* column_family,
351 const Slice& key,
352 const bool assume_tracked = false) = 0;
353 virtual Status SingleDelete(const Slice& key) = 0;
354 virtual Status SingleDelete(ColumnFamilyHandle* column_family,
355 const SliceParts& key,
356 const bool assume_tracked = false) = 0;
357 virtual Status SingleDelete(const SliceParts& key) = 0;
358
359 // PutUntracked() will write a Put to the batch of operations to be committed
360 // in this transaction. This write will only happen if this transaction
361 // gets committed successfully. But unlike Transaction::Put(),
362 // no conflict checking will be done for this key.
363 //
364 // If this Transaction was created on a PessimisticTransactionDB, this
365 // function will still acquire locks necessary to make sure this write doesn't
366 // cause conflicts in other transactions and may return Status::Busy().
367 virtual Status PutUntracked(ColumnFamilyHandle* column_family,
368 const Slice& key, const Slice& value) = 0;
369 virtual Status PutUntracked(const Slice& key, const Slice& value) = 0;
370 virtual Status PutUntracked(ColumnFamilyHandle* column_family,
371 const SliceParts& key,
372 const SliceParts& value) = 0;
373 virtual Status PutUntracked(const SliceParts& key,
374 const SliceParts& value) = 0;
375
376 virtual Status MergeUntracked(ColumnFamilyHandle* column_family,
377 const Slice& key, const Slice& value) = 0;
378 virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0;
379
380 virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
381 const Slice& key) = 0;
382
383 virtual Status DeleteUntracked(const Slice& key) = 0;
384 virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
385 const SliceParts& key) = 0;
386 virtual Status DeleteUntracked(const SliceParts& key) = 0;
387 virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
388 const Slice& key) = 0;
389
390 virtual Status SingleDeleteUntracked(const Slice& key) = 0;
391
392 // Similar to WriteBatch::PutLogData
393 virtual void PutLogData(const Slice& blob) = 0;
394
395 // By default, all Put/Merge/Delete operations will be indexed in the
396 // transaction so that Get/GetForUpdate/GetIterator can search for these
397 // keys.
398 //
399 // If the caller does not want to fetch the keys about to be written,
400 // they may want to avoid indexing as a performance optimization.
401 // Calling DisableIndexing() will turn off indexing for all future
402 // Put/Merge/Delete operations until EnableIndexing() is called.
403 //
404 // If a key is Put/Merge/Deleted after DisableIndexing is called and then
405 // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
406 // undefined.
407 virtual void DisableIndexing() = 0;
408 virtual void EnableIndexing() = 0;
409
410 // Returns the number of distinct Keys being tracked by this transaction.
411 // If this transaction was created by a TransactionDB, this is the number of
412 // keys that are currently locked by this transaction.
413 // If this transaction was created by an OptimisticTransactionDB, this is the
414 // number of keys that need to be checked for conflicts at commit time.
415 virtual uint64_t GetNumKeys() const = 0;
416
417 // Returns the number of Puts/Deletes/Merges that have been applied to this
418 // transaction so far.
419 virtual uint64_t GetNumPuts() const = 0;
420 virtual uint64_t GetNumDeletes() const = 0;
421 virtual uint64_t GetNumMerges() const = 0;
422
423 // Returns the elapsed time in milliseconds since this Transaction began.
424 virtual uint64_t GetElapsedTime() const = 0;
425
426 // Fetch the underlying write batch that contains all pending changes to be
427 // committed.
428 //
429 // Note: You should not write or delete anything from the batch directly and
430 // should only use the functions in the Transaction class to
431 // write to this transaction.
432 virtual WriteBatchWithIndex* GetWriteBatch() = 0;
433
434 // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
435 // this transaction.
436 // Has no effect on OptimisticTransactions.
437 virtual void SetLockTimeout(int64_t timeout) = 0;
438
439 // Return the WriteOptions that will be used during Commit()
440 virtual WriteOptions* GetWriteOptions() = 0;
441
442 // Reset the WriteOptions that will be used during Commit().
443 virtual void SetWriteOptions(const WriteOptions& write_options) = 0;
444
445 // If this key was previously fetched in this transaction using
446 // GetForUpdate/MultiGetForUpdate(), calling UndoGetForUpdate will tell
447 // the transaction that it no longer needs to do any conflict checking
448 // for this key.
449 //
450 // If a key has been fetched N times via GetForUpdate/MultiGetForUpdate(),
451 // then UndoGetForUpdate will only have an effect if it is also called N
452 // times. If this key has been written to in this transaction,
453 // UndoGetForUpdate() will have no effect.
454 //
455 // If SetSavePoint() has been called after the GetForUpdate(),
456 // UndoGetForUpdate() will not have any effect.
457 //
458 // If this Transaction was created by an OptimisticTransactionDB,
459 // calling UndoGetForUpdate can affect whether this key is conflict checked
460 // at commit time.
461 // If this Transaction was created by a TransactionDB,
462 // calling UndoGetForUpdate may release any held locks for this key.
463 virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family,
464 const Slice& key) = 0;
465 virtual void UndoGetForUpdate(const Slice& key) = 0;
466
467 virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0;
468
469 virtual WriteBatch* GetCommitTimeWriteBatch() = 0;
470
471 virtual void SetLogNumber(uint64_t log) { log_number_ = log; }
472
473 virtual uint64_t GetLogNumber() const { return log_number_; }
474
475 virtual Status SetName(const TransactionName& name) = 0;
476
477 virtual TransactionName GetName() const { return name_; }
478
479 virtual TransactionID GetID() const { return 0; }
480
481 virtual bool IsDeadlockDetect() const { return false; }
482
483 virtual std::vector<TransactionID> GetWaitingTxns(
484 uint32_t* /*column_family_id*/, std::string* /*key*/) const {
485 assert(false);
486 return std::vector<TransactionID>();
487 }
488
// States a transaction can be in; stored in txn_state_ and accessed via
// GetState()/SetState(). The explicit numeric values are unchanged.
enum TransactionState {
  STARTED = 0,
  AWAITING_PREPARE = 1,
  PREPARED = 2,
  AWAITING_COMMIT = 3,
  COMMITTED = 4,
  // Original misspelled name, kept as an alias so existing callers keep
  // compiling. Prefer COMMITTED in new code.
  COMMITED = COMMITTED,
  AWAITING_ROLLBACK = 5,
  ROLLEDBACK = 6,
  LOCKS_STOLEN = 7,
};
499
500 TransactionState GetState() const { return txn_state_; }
501 void SetState(TransactionState state) { txn_state_ = state; }
502
503 // NOTE: Experimental feature
504 // The globally unique id with which the transaction is identified. This id
505 // might or might not be set depending on the implementation. Similarly the
506 // implementation decides the point in lifetime of a transaction at which it
507 // assigns the id. Although currently it is the case, the id is not guaranteed
508 // to remain the same across restarts.
509 uint64_t GetId() { return id_; }
510
511 protected:
512 explicit Transaction(const TransactionDB* /*db*/) {}
513 Transaction() : log_number_(0), txn_state_(STARTED) {}
514
515 // the log in which the prepared section for this txn resides
516 // (for two phase commit)
517 uint64_t log_number_;
518 TransactionName name_;
519
520 // Execution status of the transaction.
521 std::atomic<TransactionState> txn_state_;
522
523 uint64_t id_ = 0;
524 virtual void SetId(uint64_t id) {
525 assert(id_ == 0);
526 id_ = id;
527 }
528
529 virtual uint64_t GetLastLogNumber() const { return log_number_; }
530
531 private:
532 friend class PessimisticTransactionDB;
533 friend class WriteUnpreparedTxnDB;
534 friend class TransactionTest_TwoPhaseLogRollingTest_Test;
535 friend class TransactionTest_TwoPhaseLogRollingTest2_Test;
536 };
537
538 } // namespace ROCKSDB_NAMESPACE
539
540 #endif // ROCKSDB_LITE