]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / utilities / write_batch_with_index / write_batch_with_index_internal.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5#pragma once
6
7#ifndef ROCKSDB_LITE
8
9#include <limits>
10#include <string>
11#include <vector>
12
1e59de90
TL
13#include "db/merge_context.h"
14#include "memtable/skiplist.h"
7c673cae
FG
15#include "options/db_options.h"
16#include "port/port.h"
17#include "rocksdb/comparator.h"
18#include "rocksdb/iterator.h"
19#include "rocksdb/slice.h"
20#include "rocksdb/status.h"
21#include "rocksdb/utilities/write_batch_with_index.h"
22
f67539c2 23namespace ROCKSDB_NAMESPACE {
7c673cae
FG
24
25class MergeContext;
1e59de90
TL
26class WBWIIteratorImpl;
27class WriteBatchWithIndexInternal;
7c673cae
FG
28struct Options;
29
1e59de90
TL
30// when direction == forward
31// * current_at_base_ <=> base_iterator > delta_iterator
32// when direction == backwards
33// * current_at_base_ <=> base_iterator < delta_iterator
34// always:
35// * equal_keys_ <=> base_iterator == delta_iterator
36class BaseDeltaIterator : public Iterator {
37 public:
38 BaseDeltaIterator(ColumnFamilyHandle* column_family, Iterator* base_iterator,
39 WBWIIteratorImpl* delta_iterator,
40 const Comparator* comparator,
41 const ReadOptions* read_options = nullptr);
42
43 ~BaseDeltaIterator() override {}
44
45 bool Valid() const override;
46 void SeekToFirst() override;
47 void SeekToLast() override;
48 void Seek(const Slice& k) override;
49 void SeekForPrev(const Slice& k) override;
50 void Next() override;
51 void Prev() override;
52 Slice key() const override;
53 Slice value() const override;
54 Status status() const override;
55 void Invalidate(Status s);
56
57 private:
58 void AssertInvariants();
59 void Advance();
60 void AdvanceDelta();
61 void AdvanceBase();
62 bool BaseValid() const;
63 bool DeltaValid() const;
64 void UpdateCurrent();
65
66 std::unique_ptr<WriteBatchWithIndexInternal> wbwii_;
67 bool forward_;
68 bool current_at_base_;
69 bool equal_keys_;
70 mutable Status status_;
71 std::unique_ptr<Iterator> base_iterator_;
72 std::unique_ptr<WBWIIteratorImpl> delta_iterator_;
73 const Comparator* comparator_; // not owned
74 const Slice* iterate_upper_bound_;
75 mutable PinnableSlice merge_result_;
76};
77
7c673cae
FG
78// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
79struct WriteBatchIndexEntry {
80 WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz)
81 : offset(o),
82 column_family(c),
83 key_offset(ko),
84 key_size(ksz),
85 search_key(nullptr) {}
11fdf7f2
TL
86 // Create a dummy entry as the search key. This index entry won't be backed
87 // by an entry from the write batch, but a pointer to the search key. Or a
88 // special flag of offset can indicate we are seek to first.
89 // @_search_key: the search key
90 // @_column_family: column family
91 // @is_forward_direction: true for Seek(). False for SeekForPrev()
92 // @is_seek_to_first: true if we seek to the beginning of the column family
93 // _search_key should be null in this case.
94 WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
95 bool is_forward_direction, bool is_seek_to_first)
96 // For SeekForPrev(), we need to make the dummy entry larger than any
97 // entry who has the same search key. Otherwise, we'll miss those entries.
1e59de90 98 : offset(is_forward_direction ? 0 : std::numeric_limits<size_t>::max()),
11fdf7f2 99 column_family(_column_family),
7c673cae 100 key_offset(0),
11fdf7f2
TL
101 key_size(is_seek_to_first ? kFlagMinInCf : 0),
102 search_key(_search_key) {
103 assert(_search_key != nullptr || is_seek_to_first);
104 }
105
106 // If this flag appears in the key_size, it indicates a
107 // key that is smaller than any other entry for the same column family.
1e59de90 108 static const size_t kFlagMinInCf = std::numeric_limits<size_t>::max();
7c673cae 109
11fdf7f2
TL
110 bool is_min_in_cf() const {
111 assert(key_size != kFlagMinInCf ||
112 (key_offset == 0 && search_key == nullptr));
113 return key_size == kFlagMinInCf;
114 }
7c673cae 115
11fdf7f2
TL
116 // offset of an entry in write batch's string buffer. If this is a dummy
117 // lookup key, in which case search_key != nullptr, offset is set to either
118 // 0 or max, only for comparison purpose. Because when entries have the same
119 // key, the entry with larger offset is larger, offset = 0 will make a seek
120 // key small or equal than all the entries with the seek key, so that Seek()
121 // will find all the entries of the same key. Similarly, offset = MAX will
122 // make the entry just larger than all entries with the search key so
123 // SeekForPrev() will see all the keys with the same key.
124 size_t offset;
1e59de90 125 uint32_t column_family; // column family of the entry.
7c673cae 126 size_t key_offset; // offset of the key in write batch's string buffer.
11fdf7f2
TL
127 size_t key_size; // size of the key. kFlagMinInCf indicates
128 // that this is a dummy look up entry for
129 // SeekToFirst() to the beginning of the column
130 // family. We use the flag here to save a boolean
131 // in the struct.
7c673cae
FG
132
133 const Slice* search_key; // if not null, instead of reading keys from
134 // write batch, use it to compare. This is used
135 // for lookup key.
136};
137
138class ReadableWriteBatch : public WriteBatch {
139 public:
1e59de90
TL
140 explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0,
141 size_t protection_bytes_per_key = 0,
142 size_t default_cf_ts_sz = 0)
143 : WriteBatch(reserved_bytes, max_bytes, protection_bytes_per_key,
144 default_cf_ts_sz) {}
7c673cae
FG
145 // Retrieve some information from a write entry in the write batch, given
146 // the start offset of the write entry.
147 Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
148 Slice* value, Slice* blob, Slice* xid) const;
149};
150
151class WriteBatchEntryComparator {
152 public:
153 WriteBatchEntryComparator(const Comparator* _default_comparator,
154 const ReadableWriteBatch* write_batch)
155 : default_comparator_(_default_comparator), write_batch_(write_batch) {}
156 // Compare a and b. Return a negative value if a is less than b, 0 if they
157 // are equal, and a positive value if a is greater than b
158 int operator()(const WriteBatchIndexEntry* entry1,
159 const WriteBatchIndexEntry* entry2) const;
160
161 int CompareKey(uint32_t column_family, const Slice& key1,
162 const Slice& key2) const;
163
164 void SetComparatorForCF(uint32_t column_family_id,
165 const Comparator* comparator) {
166 if (column_family_id >= cf_comparators_.size()) {
167 cf_comparators_.resize(column_family_id + 1, nullptr);
168 }
169 cf_comparators_[column_family_id] = comparator;
170 }
171
172 const Comparator* default_comparator() { return default_comparator_; }
173
1e59de90
TL
174 const Comparator* GetComparator(
175 const ColumnFamilyHandle* column_family) const;
176
177 const Comparator* GetComparator(uint32_t column_family) const;
178
7c673cae 179 private:
1e59de90 180 const Comparator* const default_comparator_;
7c673cae 181 std::vector<const Comparator*> cf_comparators_;
1e59de90
TL
182 const ReadableWriteBatch* const write_batch_;
183};
184
185using WriteBatchEntrySkipList =
186 SkipList<WriteBatchIndexEntry*, const WriteBatchEntryComparator&>;
187
188class WBWIIteratorImpl : public WBWIIterator {
189 public:
190 enum Result : uint8_t {
191 kFound,
192 kDeleted,
193 kNotFound,
194 kMergeInProgress,
195 kError
196 };
197 WBWIIteratorImpl(uint32_t column_family_id,
198 WriteBatchEntrySkipList* skip_list,
199 const ReadableWriteBatch* write_batch,
200 WriteBatchEntryComparator* comparator)
201 : column_family_id_(column_family_id),
202 skip_list_iter_(skip_list),
203 write_batch_(write_batch),
204 comparator_(comparator) {}
205
206 ~WBWIIteratorImpl() override {}
207
208 bool Valid() const override {
209 if (!skip_list_iter_.Valid()) {
210 return false;
211 }
212 const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
213 return (iter_entry != nullptr &&
214 iter_entry->column_family == column_family_id_);
215 }
216
217 void SeekToFirst() override {
218 WriteBatchIndexEntry search_entry(
219 nullptr /* search_key */, column_family_id_,
220 true /* is_forward_direction */, true /* is_seek_to_first */);
221 skip_list_iter_.Seek(&search_entry);
222 }
223
224 void SeekToLast() override {
225 WriteBatchIndexEntry search_entry(
226 nullptr /* search_key */, column_family_id_ + 1,
227 true /* is_forward_direction */, true /* is_seek_to_first */);
228 skip_list_iter_.Seek(&search_entry);
229 if (!skip_list_iter_.Valid()) {
230 skip_list_iter_.SeekToLast();
231 } else {
232 skip_list_iter_.Prev();
233 }
234 }
235
236 void Seek(const Slice& key) override {
237 WriteBatchIndexEntry search_entry(&key, column_family_id_,
238 true /* is_forward_direction */,
239 false /* is_seek_to_first */);
240 skip_list_iter_.Seek(&search_entry);
241 }
242
243 void SeekForPrev(const Slice& key) override {
244 WriteBatchIndexEntry search_entry(&key, column_family_id_,
245 false /* is_forward_direction */,
246 false /* is_seek_to_first */);
247 skip_list_iter_.SeekForPrev(&search_entry);
248 }
249
250 void Next() override { skip_list_iter_.Next(); }
251
252 void Prev() override { skip_list_iter_.Prev(); }
253
254 WriteEntry Entry() const override;
255
256 Status status() const override {
257 // this is in-memory data structure, so the only way status can be non-ok is
258 // through memory corruption
259 return Status::OK();
260 }
261
262 const WriteBatchIndexEntry* GetRawEntry() const {
263 return skip_list_iter_.key();
264 }
265
266 bool MatchesKey(uint32_t cf_id, const Slice& key);
267
268 // Moves the iterator to first entry of the previous key.
269 void PrevKey();
270 // Moves the iterator to first entry of the next key.
271 void NextKey();
272
273 // Moves the iterator to the Update (Put or Delete) for the current key
274 // If there are no Put/Delete, the Iterator will point to the first entry for
275 // this key
276 // @return kFound if a Put was found for the key
277 // @return kDeleted if a delete was found for the key
278 // @return kMergeInProgress if only merges were fouund for the key
279 // @return kError if an unsupported operation was found for the key
280 // @return kNotFound if no operations were found for this key
281 //
282 Result FindLatestUpdate(const Slice& key, MergeContext* merge_context);
283 Result FindLatestUpdate(MergeContext* merge_context);
284
285 protected:
286 void AdvanceKey(bool forward);
287
288 private:
289 uint32_t column_family_id_;
290 WriteBatchEntrySkipList::Iterator skip_list_iter_;
7c673cae 291 const ReadableWriteBatch* write_batch_;
1e59de90 292 WriteBatchEntryComparator* comparator_;
7c673cae
FG
293};
294
295class WriteBatchWithIndexInternal {
296 public:
1e59de90
TL
297 static const Comparator* GetUserComparator(const WriteBatchWithIndex& wbwi,
298 uint32_t cf_id);
299
300 // For GetFromBatchAndDB or similar
301 explicit WriteBatchWithIndexInternal(DB* db,
302 ColumnFamilyHandle* column_family);
303 // For GetFromBatchAndDB or similar
304 explicit WriteBatchWithIndexInternal(ColumnFamilyHandle* column_family);
305 // For GetFromBatch or similar
306 explicit WriteBatchWithIndexInternal(const DBOptions* db_options,
307 ColumnFamilyHandle* column_family);
7c673cae
FG
308
309 // If batch contains a value for key, store it in *value and return kFound.
310 // If batch contains a deletion for key, return Deleted.
311 // If batch contains Merge operations as the most recent entry for a key,
312 // and the merge process does not stop (not reaching a value or delete),
313 // prepend the current merge operands to *operands,
314 // and return kMergeInProgress
315 // If batch does not contain this key, return kNotFound
316 // Else, return kError on error with error Status stored in *s.
1e59de90
TL
317 WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch,
318 const Slice& key, std::string* value,
319 Status* s) {
320 return GetFromBatch(batch, key, &merge_context_, value, s);
321 }
322 WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch,
323 const Slice& key,
324 MergeContext* merge_context,
325 std::string* value, Status* s);
326 Status MergeKey(const Slice& key, const Slice* value,
327 std::string* result) const {
328 return MergeKey(key, value, merge_context_, result);
329 }
330 Status MergeKey(const Slice& key, const Slice* value,
331 const MergeContext& context, std::string* result) const;
332 size_t GetNumOperands() const { return merge_context_.GetNumOperands(); }
333 MergeContext* GetMergeContext() { return &merge_context_; }
334 Slice GetOperand(int index) const { return merge_context_.GetOperand(index); }
335
336 private:
337 DB* db_;
338 const DBOptions* db_options_;
339 ColumnFamilyHandle* column_family_;
340 MergeContext merge_context_;
7c673cae
FG
341};
342
f67539c2 343} // namespace ROCKSDB_NAMESPACE
7c673cae 344#endif // !ROCKSDB_LITE