[ceph.git] / ceph / src / rocksdb / utilities / write_batch_with_index / write_batch_with_index_internal.h

// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
#pragma once

#ifndef ROCKSDB_LITE

#include <limits>
#include <string>
#include <vector>

#include "options/db_options.h"
#include "port/port.h"
#include "rocksdb/comparator.h"
#include "rocksdb/iterator.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/write_batch_with_index.h"

namespace ROCKSDB_NAMESPACE {

class MergeContext;
struct Options;

// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
struct WriteBatchIndexEntry {
  WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz)
      : offset(o),
        column_family(c),
        key_offset(ko),
        key_size(ksz),
        search_key(nullptr) {}
  // Create a dummy entry as the search key. This index entry won't be backed
  // by an entry from the write batch, but a pointer to the search key. Or a
  // special flag of offset can indicate we are seek to first.
  // @_search_key: the search key
  // @_column_family: column family
  // @is_forward_direction: true for Seek(). False for SeekForPrev()
  // @is_seek_to_first: true if we seek to the beginning of the column family
  //                    _search_key should be null in this case.
  WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
                       bool is_forward_direction, bool is_seek_to_first)
      // For SeekForPrev(), we need to make the dummy entry larger than any
      // entry who has the same search key. Otherwise, we'll miss those entries.
      : offset(is_forward_direction ? 0 : port::kMaxSizet),
        column_family(_column_family),
        key_offset(0),
        key_size(is_seek_to_first ? kFlagMinInCf : 0),
        search_key(_search_key) {
    assert(_search_key != nullptr || is_seek_to_first);
  }

  // If this flag appears in the key_size, it indicates a
  // key that is smaller than any other entry for the same column family.
  static const size_t kFlagMinInCf = port::kMaxSizet;

  bool is_min_in_cf() const {
    assert(key_size != kFlagMinInCf ||
           (key_offset == 0 && search_key == nullptr));
    return key_size == kFlagMinInCf;
  }

  // offset of an entry in write batch's string buffer. If this is a dummy
  // lookup key, in which case search_key != nullptr, offset is set to either
  // 0 or max, only for comparison purpose. Because when entries have the same
  // key, the entry with larger offset is larger, offset = 0 will make a seek
  // key small or equal than all the entries with the seek key, so that Seek()
  // will find all the entries of the same key. Similarly, offset = MAX will
  // make the entry just larger than all entries with the search key so
  // SeekForPrev() will see all the keys with the same key.
  size_t offset;
  uint32_t column_family;  // c1olumn family of the entry.
  size_t key_offset;       // offset of the key in write batch's string buffer.
  size_t key_size;         // size of the key. kFlagMinInCf indicates
                           // that this is a dummy look up entry for
                           // SeekToFirst() to the beginning of the column
                           // family. We use the flag here to save a boolean
                           // in the struct.

  const Slice* search_key;  // if not null, instead of reading keys from
                            // write batch, use it to compare. This is used
                            // for lookup key.
};

class ReadableWriteBatch : public WriteBatch {
 public:
  explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0)
      : WriteBatch(reserved_bytes, max_bytes) {}
  // Retrieve some information from a write entry in the write batch, given
  // the start offset of the write entry.
  Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
                                Slice* value, Slice* blob, Slice* xid) const;
};

class WriteBatchEntryComparator {
 public:
  WriteBatchEntryComparator(const Comparator* _default_comparator,
                            const ReadableWriteBatch* write_batch)
      : default_comparator_(_default_comparator), write_batch_(write_batch) {}
  // Compare a and b. Return a negative value if a is less than b, 0 if they
  // are equal, and a positive value if a is greater than b
  int operator()(const WriteBatchIndexEntry* entry1,
                 const WriteBatchIndexEntry* entry2) const;

  int CompareKey(uint32_t column_family, const Slice& key1,
                 const Slice& key2) const;

  void SetComparatorForCF(uint32_t column_family_id,
                          const Comparator* comparator) {
    if (column_family_id >= cf_comparators_.size()) {
      cf_comparators_.resize(column_family_id + 1, nullptr);
    }
    cf_comparators_[column_family_id] = comparator;
  }

  const Comparator* default_comparator() { return default_comparator_; }

 private:
  const Comparator* default_comparator_;
  std::vector<const Comparator*> cf_comparators_;
  const ReadableWriteBatch* write_batch_;
};

class WriteBatchWithIndexInternal {
 public:
  enum Result { kFound, kDeleted, kNotFound, kMergeInProgress, kError };

  // If batch contains a value for key, store it in *value and return kFound.
  // If batch contains a deletion for key, return Deleted.
  // If batch contains Merge operations as the most recent entry for a key,
  //   and the merge process does not stop (not reaching a value or delete),
  //   prepend the current merge operands to *operands,
  //   and return kMergeInProgress
  // If batch does not contain this key, return kNotFound
  // Else, return kError on error with error Status stored in *s.
  static WriteBatchWithIndexInternal::Result GetFromBatch(
      const ImmutableDBOptions& ioptions, WriteBatchWithIndex* batch,
      ColumnFamilyHandle* column_family, const Slice& key,
      MergeContext* merge_context, WriteBatchEntryComparator* cmp,
      std::string* value, bool overwrite_key, Status* s);
};

}  // namespace ROCKSDB_NAMESPACE
#endif  // !ROCKSDB_LITE
Commit	Line	Data
7c673cae	1	// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2 TL	2	// This source code is licensed under both the GPLv2 (found in the
	3	// COPYING file in the root directory) and Apache 2.0 License
	4	// (found in the LICENSE.Apache file in the root directory).
7c673cae FG	5	#pragma once
	6
	7	#ifndef ROCKSDB_LITE
	8
	9	#include <limits>
	10	#include <string>
	11	#include <vector>
	12
	13	#include "options/db_options.h"
	14	#include "port/port.h"
	15	#include "rocksdb/comparator.h"
	16	#include "rocksdb/iterator.h"
	17	#include "rocksdb/slice.h"
	18	#include "rocksdb/status.h"
	19	#include "rocksdb/utilities/write_batch_with_index.h"
	20
f67539c2	21	namespace ROCKSDB_NAMESPACE {
7c673cae FG	22
	23	class MergeContext;
	24	struct Options;
	25
	26	// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
	27	struct WriteBatchIndexEntry {
	28	WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz)
	29	: offset(o),
	30	column_family(c),
	31	key_offset(ko),
	32	key_size(ksz),
	33	search_key(nullptr) {}
11fdf7f2 TL	34	// Create a dummy entry as the search key. This index entry won't be backed
	35	// by an entry from the write batch, but a pointer to the search key. Or a
	36	// special flag of offset can indicate we are seek to first.
	37	// @_search_key: the search key
	38	// @_column_family: column family
	39	// @is_forward_direction: true for Seek(). False for SeekForPrev()
	40	// @is_seek_to_first: true if we seek to the beginning of the column family
	41	// _search_key should be null in this case.
	42	WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
	43	bool is_forward_direction, bool is_seek_to_first)
	44	// For SeekForPrev(), we need to make the dummy entry larger than any
	45	// entry who has the same search key. Otherwise, we'll miss those entries.
	46	: offset(is_forward_direction ? 0 : port::kMaxSizet),
	47	column_family(_column_family),
7c673cae	48	key_offset(0),
11fdf7f2 TL	49	key_size(is_seek_to_first ? kFlagMinInCf : 0),
	50	search_key(_search_key) {
	51	assert(_search_key != nullptr \|\| is_seek_to_first);
	52	}
	53
	54	// If this flag appears in the key_size, it indicates a
	55	// key that is smaller than any other entry for the same column family.
	56	static const size_t kFlagMinInCf = port::kMaxSizet;
7c673cae	57
11fdf7f2 TL	58	bool is_min_in_cf() const {
	59	assert(key_size != kFlagMinInCf \|\|
	60	(key_offset == 0 && search_key == nullptr));
	61	return key_size == kFlagMinInCf;
	62	}
7c673cae	63
11fdf7f2 TL	64	// offset of an entry in write batch's string buffer. If this is a dummy
	65	// lookup key, in which case search_key != nullptr, offset is set to either
	66	// 0 or max, only for comparison purpose. Because when entries have the same
	67	// key, the entry with larger offset is larger, offset = 0 will make a seek
	68	// key small or equal than all the entries with the seek key, so that Seek()
	69	// will find all the entries of the same key. Similarly, offset = MAX will
	70	// make the entry just larger than all entries with the search key so
	71	// SeekForPrev() will see all the keys with the same key.
	72	size_t offset;
	73	uint32_t column_family; // c1olumn family of the entry.
7c673cae	74	size_t key_offset; // offset of the key in write batch's string buffer.
11fdf7f2 TL	75	size_t key_size; // size of the key. kFlagMinInCf indicates
	76	// that this is a dummy look up entry for
	77	// SeekToFirst() to the beginning of the column
	78	// family. We use the flag here to save a boolean
	79	// in the struct.
7c673cae FG	80
	81	const Slice* search_key; // if not null, instead of reading keys from
	82	// write batch, use it to compare. This is used
	83	// for lookup key.
	84	};
	85
	86	class ReadableWriteBatch : public WriteBatch {
	87	public:
	88	explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0)
	89	: WriteBatch(reserved_bytes, max_bytes) {}
	90	// Retrieve some information from a write entry in the write batch, given
	91	// the start offset of the write entry.
	92	Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
	93	Slice* value, Slice* blob, Slice* xid) const;
	94	};
	95
	96	class WriteBatchEntryComparator {
	97	public:
	98	WriteBatchEntryComparator(const Comparator* _default_comparator,
	99	const ReadableWriteBatch* write_batch)
	100	: default_comparator_(_default_comparator), write_batch_(write_batch) {}
	101	// Compare a and b. Return a negative value if a is less than b, 0 if they
	102	// are equal, and a positive value if a is greater than b
	103	int operator()(const WriteBatchIndexEntry* entry1,
	104	const WriteBatchIndexEntry* entry2) const;
	105
	106	int CompareKey(uint32_t column_family, const Slice& key1,
	107	const Slice& key2) const;
	108
	109	void SetComparatorForCF(uint32_t column_family_id,
	110	const Comparator* comparator) {
	111	if (column_family_id >= cf_comparators_.size()) {
	112	cf_comparators_.resize(column_family_id + 1, nullptr);
	113	}
	114	cf_comparators_[column_family_id] = comparator;
	115	}
	116
	117	const Comparator* default_comparator() { return default_comparator_; }
	118
	119	private:
	120	const Comparator* default_comparator_;
	121	std::vector<const Comparator*> cf_comparators_;
	122	const ReadableWriteBatch* write_batch_;
	123	};
	124
	125	class WriteBatchWithIndexInternal {
	126	public:
	127	enum Result { kFound, kDeleted, kNotFound, kMergeInProgress, kError };
	128
	129	// If batch contains a value for key, store it in *value and return kFound.
	130	// If batch contains a deletion for key, return Deleted.
	131	// If batch contains Merge operations as the most recent entry for a key,
	132	// and the merge process does not stop (not reaching a value or delete),
	133	// prepend the current merge operands to *operands,
	134	// and return kMergeInProgress
	135	// If batch does not contain this key, return kNotFound
	136	// Else, return kError on error with error Status stored in *s.
	137	static WriteBatchWithIndexInternal::Result GetFromBatch(
	138	const ImmutableDBOptions& ioptions, WriteBatchWithIndex* batch,
	139	ColumnFamilyHandle* column_family, const Slice& key,
	140	MergeContext* merge_context, WriteBatchEntryComparator* cmp,
	141	std::string* value, bool overwrite_key, Status* s);
	142	};
	143
f67539c2	144	} // namespace ROCKSDB_NAMESPACE
7c673cae	145	#endif // !ROCKSDB_LITE