// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#ifndef ROCKSDB_LITE

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include "utilities/transactions/write_prepared_txn.h"
#include "utilities/transactions/write_unprepared_txn_db.h"

namespace ROCKSDB_NAMESPACE {

class WriteUnpreparedTxnDB;
class WriteUnpreparedTxn;

// WriteUnprepared transactions need to be able to read their own uncommitted
// writes, and supporting this requires some careful consideration. Because
// writes in the current transaction may already have been flushed to the DB,
// we cannot rely on the contents of WriteBatchWithIndex to determine whether a
// key should be visible or not, so we have to remember to check the DB for any
// uncommitted keys that should be visible to us. First, we will need to change
// the seek-to-snapshot logic to seek to
// max_visible_seq = max(snap_seq, max_unprep_seq).
// Any key greater than max_visible_seq should not be visible, because it
// cannot be unprepared by the current transaction and it is not in its
// snapshot.
//
// When we seek to max_visible_seq, one of these cases will happen:
// 1. We hit an unprepared key from the current transaction.
// 2. We hit an unprepared key from another transaction.
// 3. We hit a committed key with snap_seq < seq < max_unprep_seq.
// 4. We hit a committed key with seq <= snap_seq.
//
// IsVisibleFullCheck handles all cases correctly.
//
// Other notes:
// Note that max_visible_seq is only calculated once at iterator construction
// time, meaning that if the same transaction is adding more unprep seqs
// through writes during iteration, these newer writes may not be visible. This
// is not a problem for MySQL though, because it avoids modifying the index as
// it is scanning through it, in order to avoid the Halloween Problem. Instead,
// it scans the index once up front, and modifies based on a temporary copy.
//
// In DBIter, there is a "reseek" optimization if the iterator skips over too
// many keys. However, this assumes that the reseek seeks exactly to the
// required key. In write unprepared, even after seeking directly to
// max_visible_seq, some iteration may be required before hitting a visible
// key, and special precautions must be taken to avoid performing another
// reseek, leading to an infinite loop.
//
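// To make the cases above concrete, here is an illustrative example with
// hypothetical sequence numbers (not taken from any test):
//
//   snap_seq = 8, unprep_seqs = {10 => 2}    // batch covers seqs 10..11
//   max_unprep_seq = 10 + 2 - 1 = 11
//   max_visible_seq = max(8, 11) = 11
//
// The transaction's own unprepared key at seq 10 is visible (case 1); another
// transaction's committed key at seq 9 is not (case 3); a committed key at
// seq 7 is visible through the snapshot (case 4).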
class WriteUnpreparedTxnReadCallback : public ReadCallback {
 public:
  WriteUnpreparedTxnReadCallback(
      WritePreparedTxnDB* db, SequenceNumber snapshot,
      SequenceNumber min_uncommitted,
      const std::map<SequenceNumber, size_t>& unprep_seqs,
      SnapshotBackup backed_by_snapshot)
      // Pass our last uncommitted seq as the snapshot to the parent class to
      // ensure that the parent will not prematurely filter out our own writes.
      // We will do the exact comparison against snapshots in the
      // IsVisibleFullCheck override.
      : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted),
        db_(db),
        unprep_seqs_(unprep_seqs),
        wup_snapshot_(snapshot),
        backed_by_snapshot_(backed_by_snapshot) {
    (void)backed_by_snapshot_;  // to silence unused private field warning
  }

  virtual ~WriteUnpreparedTxnReadCallback() {
    // If it is not backed by a snapshot, the caller must check validity.
    assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
  }

  virtual bool IsVisibleFullCheck(SequenceNumber seq) override;

  inline bool valid() {
    valid_checked_ = true;
    return snap_released_ == false;
  }

  void Refresh(SequenceNumber seq) override {
    max_visible_seq_ = std::max(max_visible_seq_, seq);
    wup_snapshot_ = seq;
  }

  static SequenceNumber CalcMaxVisibleSeq(
      const std::map<SequenceNumber, size_t>& unprep_seqs,
      SequenceNumber snapshot_seq) {
    SequenceNumber max_unprepared = 0;
    if (unprep_seqs.size()) {
      max_unprepared =
          unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1;
    }
    return std::max(max_unprepared, snapshot_seq);
  }

 private:
  WritePreparedTxnDB* db_;
  const std::map<SequenceNumber, size_t>& unprep_seqs_;
  SequenceNumber wup_snapshot_;
  // Whether max_visible_seq_ is backed by a snapshot
  const SnapshotBackup backed_by_snapshot_;
  bool snap_released_ = false;
  // Safety check to ensure that the caller has checked invalid statuses
  bool valid_checked_ = false;
};
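
// Expected caller pattern (a hedged sketch; the member names and the exact
// GetFromBatchAndDB overload are assumptions, see WriteUnpreparedTxn::Get in
// write_unprepared_txn.cc for the authoritative usage). When the callback is
// not backed by a DB snapshot, the caller must consult valid() after the read:
//
//   WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq,
//                                           min_uncommitted, unprep_seqs_,
//                                           backed_by_snapshot);
//   Status s = write_batch_.GetFromBatchAndDB(db_, options, column_family,
//                                             key, value, &callback);
//   if (!callback.valid()) {
//     // The snapshot this read depended on was released mid-read; retry.
//     s = Status::TryAgain();
//   }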

class WriteUnpreparedTxn : public WritePreparedTxn {
 public:
  WriteUnpreparedTxn(WriteUnpreparedTxnDB* db,
                     const WriteOptions& write_options,
                     const TransactionOptions& txn_options);

  virtual ~WriteUnpreparedTxn();

  using TransactionBaseImpl::Put;
  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
                     const Slice& value,
                     const bool assume_tracked = false) override;
  virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
                     const SliceParts& value,
                     const bool assume_tracked = false) override;

  using TransactionBaseImpl::Merge;
  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
                       const Slice& value,
                       const bool assume_tracked = false) override;

  using TransactionBaseImpl::Delete;
  virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
                        const bool assume_tracked = false) override;
  virtual Status Delete(ColumnFamilyHandle* column_family,
                        const SliceParts& key,
                        const bool assume_tracked = false) override;

  using TransactionBaseImpl::SingleDelete;
  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
                              const Slice& key,
                              const bool assume_tracked = false) override;
  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
                              const SliceParts& key,
                              const bool assume_tracked = false) override;

  // In WriteUnprepared, untracked writes will break snapshot validation logic.
  // Snapshot validation will only check the largest sequence number of a key
  // to see if it was committed or not. However, an untracked unprepared write
  // will hide smaller committed sequence numbers.
  //
  // TODO(lth): Investigate whether it is worth having snapshot validation
  // validate all values larger than snap_seq. Otherwise, we should return
  // Status::NotSupported for untracked writes.

  virtual Status RebuildFromWriteBatch(WriteBatch*) override;

  virtual uint64_t GetLastLogNumber() const override {
    return last_log_number_;
  }

  void RemoveActiveIterator(Iterator* iter) {
    active_iterators_.erase(
        std::remove(active_iterators_.begin(), active_iterators_.end(), iter),
        active_iterators_.end());
  }

 protected:
  void Initialize(const TransactionOptions& txn_options) override;

  Status PrepareInternal() override;

  Status CommitWithoutPrepareInternal() override;
  Status CommitInternal() override;

  Status RollbackInternal() override;

  void Clear() override;

  void SetSavePoint() override;
  Status RollbackToSavePoint() override;
  Status PopSavePoint() override;

  // Get and GetIterator need to be overridden so that a ReadCallback is used
  // to handle read-your-own-writes.
  using Transaction::Get;
  virtual Status Get(const ReadOptions& options,
                     ColumnFamilyHandle* column_family, const Slice& key,
                     PinnableSlice* value) override;

  using Transaction::MultiGet;
  virtual void MultiGet(const ReadOptions& options,
                        ColumnFamilyHandle* column_family,
                        const size_t num_keys, const Slice* keys,
                        PinnableSlice* values, Status* statuses,
                        const bool sorted_input = false) override;

  using Transaction::GetIterator;
  virtual Iterator* GetIterator(const ReadOptions& options) override;
  virtual Iterator* GetIterator(const ReadOptions& options,
                                ColumnFamilyHandle* column_family) override;

  virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
                                  const Slice& key,
                                  SequenceNumber* tracked_at_seq) override;

 private:
  friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test;
  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
  friend class WriteUnpreparedTransactionTest_UnpreparedBatch_Test;
  friend class WriteUnpreparedTxnDB;

  const std::map<SequenceNumber, size_t>& GetUnpreparedSequenceNumbers();
  Status WriteRollbackKeys(const LockTracker& tracked_keys,
                           WriteBatchWithIndex* rollback_batch,
                           ReadCallback* callback, const ReadOptions& roptions);

  Status MaybeFlushWriteBatchToDB();
  Status FlushWriteBatchToDB(bool prepared);
  Status FlushWriteBatchToDBInternal(bool prepared);
  Status FlushWriteBatchWithSavePointToDB();
  Status RollbackToSavePointInternal();
  Status HandleWrite(std::function<Status()> do_write);

  // For write unprepared, we check on every write batch append whether
  // write_batch_flush_threshold_ has been exceeded, and then call
  // FlushWriteBatchToDB if so. This logic is encapsulated in
  // MaybeFlushWriteBatchToDB, sketched below.
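  //
  // A minimal sketch of that check (an assumption-level illustration; the
  // authoritative logic lives in write_unprepared_txn.cc):
  //
  //   Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() {
  //     Status s;
  //     if (write_batch_flush_threshold_ > 0 &&
  //         write_batch_.GetWriteBatch()->Count() > 0 &&
  //         write_batch_.GetWriteBatch()->GetDataSize() >
  //             static_cast<size_t>(write_batch_flush_threshold_)) {
  //       s = FlushWriteBatchToDB(/*prepared=*/false);
  //     }
  //     return s;
  //   }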
  int64_t write_batch_flush_threshold_;
  WriteUnpreparedTxnDB* wupt_db_;

  // Ordered list of unprep_seq sequence numbers that we have already written
  // to DB.
  //
  // This maps unprep_seq => prepare_batch_cnt for each unprepared batch
  // written by this transaction.
  //
  // Note that this contains both prepared and unprepared batches, since they
  // are treated similarly in the prepare heap/commit map, so it simplifies
  // the commit callbacks.
  std::map<SequenceNumber, size_t> unprep_seqs_;

  uint64_t last_log_number_;

  // Recovered transactions have tracked_keys_ populated, but are not actually
  // locked for efficiency reasons. For recovered transactions, skip unlocking
  // keys when the transaction ends.
  bool recovered_txn_;

  // Track the largest sequence number at which we performed snapshot
  // validation. If snapshot validation was skipped because no snapshot was
  // set, then this is set to GetLastPublishedSequence. This value is useful
  // because it means that for keys that have unprepared seqnos, we can
  // guarantee that no committed keys by other transactions can exist between
  // largest_validated_seq_ and max_unprep_seq. See
  // WriteUnpreparedTxnDB::NewIterator for an explanation of why this is
  // necessary for iterator Prev().
  //
  // Currently this value only increases during the lifetime of a transaction,
  // but in some cases, we should be able to restore the previously largest
  // value when calling RollbackToSavepoint.
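  //
  // Hypothetical numeric illustration (numbers are assumptions for clarity,
  // not from the source): if largest_validated_seq_ = 20 and this transaction
  // owns unprepared seqs {21 => 5} (so max_unprep_seq = 25), then for a key
  // this transaction wrote, no other transaction's committed version can sit
  // at a seq in (20, 25]: validation at seq 20 proved the key unmodified, and
  // this transaction holds the lock on the key afterwards.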
  SequenceNumber largest_validated_seq_;

  struct SavePoint {
    // Record of unprep_seqs_ at this savepoint. The set of unprep_seq is
    // used during RollbackToSavepoint to determine visibility when restoring
    // old values.
    //
    // TODO(lth): Since all unprep_seqs_ sets further down the stack must be
    // subsets, this can potentially be deduplicated by just storing the set
    // difference. Investigate whether this is worth it.
    std::map<SequenceNumber, size_t> unprep_seqs_;

    // This snapshot will be used to read keys at this savepoint if we call
    // RollbackToSavePoint.
    std::unique_ptr<ManagedSnapshot> snapshot_;

    SavePoint(const std::map<SequenceNumber, size_t>& seqs,
              ManagedSnapshot* snapshot)
        : unprep_seqs_(seqs), snapshot_(snapshot) {}
  };

  // We have 3 data structures holding savepoint information:
  // 1. TransactionBaseImpl::save_points_
  // 2. WriteUnpreparedTxn::flushed_save_points_
  // 3. WriteUnpreparedTxn::unflushed_save_points_
  //
  // TransactionBaseImpl::save_points_ holds information about all write
  // batches, including the current in-memory write_batch_, or unprepared
  // batches that have been written out. Its responsibility is just to track
  // which keys have been modified in every savepoint.
  //
  // WriteUnpreparedTxn::flushed_save_points_ holds information about
  // savepoints set on unprepared batches that have already been flushed. It
  // holds the snapshot and unprep_seqs at that savepoint, so that the rollback
  // process can determine which keys were visible at that point in time.
  //
  // WriteUnpreparedTxn::unflushed_save_points_ holds information about
  // savepoints on the current in-memory write_batch_. It simply records the
  // size of the write batch at every savepoint.
  //
  // TODO(lth): Remove the redundancy between save_point_boundaries_ and
  // write_batch_.save_points_.
  //
  // Based on this information, here are some invariants (illustrated by the
  // example below):
  //   size(unflushed_save_points_) = size(write_batch_.save_points_)
  //   size(flushed_save_points_) + size(unflushed_save_points_)
  //       = size(save_points_)
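  //
  // Hypothetical example: set savepoints s1 and s2 while writing to the
  // in-memory write_batch_, flush the batch to the DB (moving s1 and s2 from
  // unflushed_save_points_ to flushed_save_points_), then set s3 on the new
  // in-memory batch. At that point:
  //   size(flushed_save_points_)   = 2   // s1, s2
  //   size(unflushed_save_points_) = 1   // s3
  //   size(save_points_)           = 3   // s1, s2, s3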
  std::unique_ptr<autovector<WriteUnpreparedTxn::SavePoint>>
      flushed_save_points_;
  std::unique_ptr<autovector<size_t>> unflushed_save_points_;

  // It is currently unsafe to flush a write batch if there are active
  // iterators created from this transaction. This is because we use
  // WriteBatchWithIndex to do merging reads from the DB and the write batch.
  // If we flush the write batch, it is possible that the delta iterator on the
  // iterator will point to invalid memory.
  std::vector<Iterator*> active_iterators_;

  // Untracked keys that we have to roll back.
  //
  // TODO(lth): Currently we do not record untracked keys per-savepoint. This
  // means that when rolling back to savepoints, we have to check all keys in
  // the current transaction for rollback. Note that this is merely
  // inefficient, not incorrect, because we take a snapshot at every savepoint,
  // and we will use that snapshot to construct the rollback batch. The
  // rollback batch will then contain a reissue of the same marker.
  //
  // A more optimal solution would be to only check keys changed since the
  // last savepoint. Also, it may make sense to merge this into tracked_keys_
  // and differentiate between tracked but not locked keys to avoid having two
  // very similar data structures.
  using KeySet = std::unordered_map<uint32_t, std::vector<std::string>>;
  KeySet untracked_keys_;
};

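// A minimal usage sketch (hedged; the option names below are real RocksDB
// APIs, but the values are illustrative assumptions): write-unprepared is
// selected via TransactionDBOptions::write_policy, and
// TransactionOptions::write_batch_flush_threshold controls how large the
// in-memory write batch may grow before it is flushed as an unprepared batch.
//
//   TransactionDBOptions txn_db_options;
//   txn_db_options.write_policy = TxnDBWritePolicy::WRITE_UNPREPARED;
//   TransactionDB* txn_db = nullptr;
//   Status s = TransactionDB::Open(options, txn_db_options, dbname, &txn_db);
//
//   TransactionOptions txn_options;
//   txn_options.write_batch_flush_threshold = 1 << 20;  // hypothetical: 1 MB
//   Transaction* txn = txn_db->BeginTransaction(WriteOptions(), txn_options);
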
}  // namespace ROCKSDB_NAMESPACE

#endif  // ROCKSDB_LITE