ceph/src/rocksdb/include/rocksdb/utilities/transaction_db.h

   1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
   2 //  This source code is licensed under both the GPLv2 (found in the
   3 //  COPYING file in the root directory) and Apache 2.0 License
   4 //  (found in the LICENSE.Apache file in the root directory).
   5
   6 #pragma once
   7 #ifndef ROCKSDB_LITE
   8
   9 #include <string>
  10 #include <utility>
  11 #include <vector>
  12
  13 #include "rocksdb/comparator.h"
  14 #include "rocksdb/db.h"
  15 #include "rocksdb/utilities/stackable_db.h"
  16 #include "rocksdb/utilities/transaction.h"
  17
  18 // Database with Transaction support.
  19 //
  20 // See transaction.h and examples/transaction_example.cc
  21
  22 namespace rocksdb {
  23
  24 class TransactionDBMutexFactory;
  25
  26 enum TxnDBWritePolicy {
  27   WRITE_COMMITTED = 0,  // write only the committed data
  28   // TODO(myabandeh): Not implemented yet
  29   WRITE_PREPARED,  // write data after the prepare phase of 2pc
  30   // TODO(myabandeh): Not implemented yet
  31   WRITE_UNPREPARED  // write data before the prepare phase of 2pc
  32 };
  33
  34 const uint32_t kInitialMaxDeadlocks = 5;
  35
  36 struct TransactionDBOptions {
  37   // Specifies the maximum number of keys that can be locked at the same time
  38   // per column family.
  39   // If the number of locked keys is greater than max_num_locks, transaction
  40   // writes (or GetForUpdate) will return an error.
  41   // If this value is not positive, no limit will be enforced.
  42   int64_t max_num_locks = -1;
  43
  44   // Stores the number of latest deadlocks to track
  45   uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
  46
  47   // Increasing this value will increase the concurrency by dividing the lock
  48   // table (per column family) into more sub-tables, each with their own
  49   // separate
  50   // mutex.
  51   size_t num_stripes = 16;
  52
  53   // If positive, specifies the default wait timeout in milliseconds when
  54   // a transaction attempts to lock a key if not specified by
  55   // TransactionOptions::lock_timeout.
  56   //
  57   // If 0, no waiting is done if a lock cannot instantly be acquired.
  58   // If negative, there is no timeout.  Not using a timeout is not recommended
  59   // as it can lead to deadlocks.  Currently, there is no deadlock-detection to
  60   // recover
  61   // from a deadlock.
  62   int64_t transaction_lock_timeout = 1000;  // 1 second
  63
  64   // If positive, specifies the wait timeout in milliseconds when writing a key
  65   // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
  66   // directly).
  67   // If 0, no waiting is done if a lock cannot instantly be acquired.
  68   // If negative, there is no timeout and will block indefinitely when acquiring
  69   // a lock.
  70   //
  71   // Not using a timeout can lead to deadlocks.  Currently, there
  72   // is no deadlock-detection to recover from a deadlock.  While DB writes
  73   // cannot deadlock with other DB writes, they can deadlock with a transaction.
  74   // A negative timeout should only be used if all transactions have a small
  75   // expiration set.
  76   int64_t default_lock_timeout = 1000;  // 1 second
  77
  78   // If set, the TransactionDB will use this implementation of a mutex and
  79   // condition variable for all transaction locking instead of the default
  80   // mutex/condvar implementation.
  81   std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
  82
  83   // The policy for when to write the data into the DB. The default policy is to
  84   // write only the committed data (WRITE_COMMITTED). The data could be written
  85   // before the commit phase. The DB then needs to provide the mechanisms to
  86   // tell apart committed from uncommitted data.
  87   TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
  88
  89   // TODO(myabandeh): remove this option
  90   // Note: this is a temporary option as a hot fix in rollback of writeprepared
  91   // txns in myrocks. MyRocks uses merge operands for autoinc column id without
  92   // however obtaining locks. This breaks the assumption behind the rollback
  93   // logic in myrocks. This hack of simply not rolling back merge operands works
  94   // for the special way that myrocks uses this operands.
  95   bool rollback_merge_operands = false;
  96
  97  private:
  98   // 128 entries
  99   size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
 100   // 8m entry, 64MB size
 101   size_t wp_commit_cache_bits = static_cast<size_t>(23);
 102
 103   friend class WritePreparedTxnDB;
 104   friend class WritePreparedTransactionTestBase;
 105   friend class MySQLStyleTransactionTest;
 106 };
 107
 108 struct TransactionOptions {
 109   // Setting set_snapshot=true is the same as calling
 110   // Transaction::SetSnapshot().
 111   bool set_snapshot = false;
 112
 113   // Setting to true means that before acquiring locks, this transaction will
 114   // check if doing so will cause a deadlock. If so, it will return with
 115   // Status::Busy.  The user should retry their transaction.
 116   bool deadlock_detect = false;
 117
 118   // If set, it states that the CommitTimeWriteBatch represents the latest state
 119   // of the application, has only one sub-batch, i.e., no duplicate keys,  and
 120   // meant to be used later during recovery. It enables an optimization to
 121   // postpone updating the memtable with CommitTimeWriteBatch to only
 122   // SwitchMemtable or recovery.
 123   bool use_only_the_last_commit_time_batch_for_recovery = false;
 124
 125   // TODO(agiardullo): TransactionDB does not yet support comparators that allow
 126   // two non-equal keys to be equivalent.  Ie, cmp->Compare(a,b) should only
 127   // return 0 if
 128   // a.compare(b) returns 0.
 129
 130   // If positive, specifies the wait timeout in milliseconds when
 131   // a transaction attempts to lock a key.
 132   //
 133   // If 0, no waiting is done if a lock cannot instantly be acquired.
 134   // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
 135   int64_t lock_timeout = -1;
 136
 137   // Expiration duration in milliseconds.  If non-negative, transactions that
 138   // last longer than this many milliseconds will fail to commit.  If not set,
 139   // a forgotten transaction that is never committed, rolled back, or deleted
 140   // will never relinquish any locks it holds.  This could prevent keys from
 141   // being written by other writers.
 142   int64_t expiration = -1;
 143
 144   // The number of traversals to make during deadlock detection.
 145   int64_t deadlock_detect_depth = 50;
 146
 147   // The maximum number of bytes used for the write batch. 0 means no limit.
 148   size_t max_write_batch_size = 0;
 149
 150   // Skip Concurrency Control. This could be as an optimization if the
 151   // application knows that the transaction would not have any conflict with
 152   // concurrent transactions. It could also be used during recovery if (i)
 153   // application guarantees no conflict between prepared transactions in the WAL
 154   // (ii) application guarantees that recovered transactions will be rolled
 155   // back/commit before new transactions start.
 156   // Default: false
 157   bool skip_concurrency_control = false;
 158 };
 159
 160 // The per-write optimizations that do not involve transactions. TransactionDB
 161 // implementation might or might not make use of the specified optimizations.
 162 struct TransactionDBWriteOptimizations {
 163   // If it is true it means that the application guarantees that the
 164   // key-set in the write batch do not conflict with any concurrent transaction
 165   // and hence the concurrency control mechanism could be skipped for this
 166   // write.
 167   bool skip_concurrency_control = false;
 168   // If true, the application guarantees that there is no duplicate <column
 169   // family, key> in the write batch and any employed mechanism to handle
 170   // duplicate keys could be skipped.
 171   bool skip_duplicate_key_check = false;
 172 };
 173
// Value type of the map returned by TransactionDB::GetLockStatusData(), which
// is keyed by column family id.
//
// Plain aggregate: there are no default member initializers, so `exclusive`
// is indeterminate in a default-initialized instance until it is assigned.
struct KeyLockInfo {
  // Presumably the key the lock is held on -- confirm against the producer.
  std::string key;
  // Presumably the ids of the transactions holding the lock -- TODO confirm.
  std::vector<TransactionID> ids;
  // Presumably true for an exclusive lock, false for a shared one -- TODO
  // confirm.
  bool exclusive;
};
 179
// Element type of DeadlockPath::path.
//
// Plain aggregate: no member has a default initializer, so the scalar fields
// are indeterminate in a default-initialized instance until assigned.
struct DeadlockInfo {
  // Id of a transaction on the path.
  TransactionID m_txn_id;
  // Presumably the id of the column family of m_waiting_key -- TODO confirm.
  uint32_t m_cf_id;
  // Presumably whether the lock involved is exclusive -- TODO confirm.
  bool m_exclusive;
  // Presumably the key this transaction was waiting to lock -- TODO confirm.
  std::string m_waiting_key;
};
 186
 187 struct DeadlockPath {
 188   std::vector<DeadlockInfo> path;
 189   bool limit_exceeded;
 190   int64_t deadlock_time;
 191
 192   explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
 193                         const int64_t& dl_time)
 194       : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
 195
 196   // empty path, limit exceeded constructor and default constructor
 197   explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
 198       : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
 199
 200   bool empty() { return path.empty() && !limit_exceeded; }
 201 };
 202
 203 class TransactionDB : public StackableDB {
 204  public:
 205   // Optimized version of ::Write that receives more optimization request such
 206   // as skip_concurrency_control.
 207   using StackableDB::Write;
 208   virtual Status Write(const WriteOptions& opts,
 209                        const TransactionDBWriteOptimizations&,
 210                        WriteBatch* updates) {
 211     // The default implementation ignores TransactionDBWriteOptimizations and
 212     // falls back to the un-optimized version of ::Write
 213     return Write(opts, updates);
 214   }
 215   // Open a TransactionDB similar to DB::Open().
 216   // Internally call PrepareWrap() and WrapDB()
 217   // If the return status is not ok, then dbptr is set to nullptr.
 218   static Status Open(const Options& options,
 219                      const TransactionDBOptions& txn_db_options,
 220                      const std::string& dbname, TransactionDB** dbptr);
 221
 222   static Status Open(const DBOptions& db_options,
 223                      const TransactionDBOptions& txn_db_options,
 224                      const std::string& dbname,
 225                      const std::vector<ColumnFamilyDescriptor>& column_families,
 226                      std::vector<ColumnFamilyHandle*>* handles,
 227                      TransactionDB** dbptr);
 228   // Note: PrepareWrap() may change parameters, make copies before the
 229   // invocation if needed.
 230   static void PrepareWrap(DBOptions* db_options,
 231                           std::vector<ColumnFamilyDescriptor>* column_families,
 232                           std::vector<size_t>* compaction_enabled_cf_indices);
 233   // If the return status is not ok, then dbptr will bet set to nullptr. The
 234   // input db parameter might or might not be deleted as a result of the
 235   // failure. If it is properly deleted it will be set to nullptr. If the return
 236   // status is ok, the ownership of db is transferred to dbptr.
 237   static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
 238                        const std::vector<size_t>& compaction_enabled_cf_indices,
 239                        const std::vector<ColumnFamilyHandle*>& handles,
 240                        TransactionDB** dbptr);
 241   // If the return status is not ok, then dbptr will bet set to nullptr. The
 242   // input db parameter might or might not be deleted as a result of the
 243   // failure. If it is properly deleted it will be set to nullptr. If the return
 244   // status is ok, the ownership of db is transferred to dbptr.
 245   static Status WrapStackableDB(
 246       StackableDB* db, const TransactionDBOptions& txn_db_options,
 247       const std::vector<size_t>& compaction_enabled_cf_indices,
 248       const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
 249   // Since the destructor in StackableDB is virtual, this destructor is virtual
 250   // too. The root db will be deleted by the base's destructor.
 251   ~TransactionDB() override {}
 252
 253   // Starts a new Transaction.
 254   //
 255   // Caller is responsible for deleting the returned transaction when no
 256   // longer needed.
 257   //
 258   // If old_txn is not null, BeginTransaction will reuse this Transaction
 259   // handle instead of allocating a new one.  This is an optimization to avoid
 260   // extra allocations when repeatedly creating transactions.
 261   virtual Transaction* BeginTransaction(
 262       const WriteOptions& write_options,
 263       const TransactionOptions& txn_options = TransactionOptions(),
 264       Transaction* old_txn = nullptr) = 0;
 265
 266   virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
 267   virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
 268
 269   // Returns set of all locks held.
 270   //
 271   // The mapping is column family id -> KeyLockInfo
 272   virtual std::unordered_multimap<uint32_t, KeyLockInfo>
 273   GetLockStatusData() = 0;
 274   virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
 275   virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
 276
 277  protected:
 278   // To Create an TransactionDB, call Open()
 279   // The ownership of db is transferred to the base StackableDB
 280   explicit TransactionDB(DB* db) : StackableDB(db) {}
 281
 282  private:
 283   // No copying allowed
 284   TransactionDB(const TransactionDB&);
 285   void operator=(const TransactionDB&);
 286 };
 287
 288 }  // namespace rocksdb
 289
 290 #endif  // ROCKSDB_LITE