]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | #pragma once |
6 | ||
7 | #ifndef ROCKSDB_LITE | |
8 | ||
9 | #include <limits> | |
10 | #include <string> | |
11 | #include <vector> | |
12 | ||
1e59de90 TL |
13 | #include "db/merge_context.h" |
14 | #include "memtable/skiplist.h" | |
7c673cae FG |
15 | #include "options/db_options.h" |
16 | #include "port/port.h" | |
17 | #include "rocksdb/comparator.h" | |
18 | #include "rocksdb/iterator.h" | |
19 | #include "rocksdb/slice.h" | |
20 | #include "rocksdb/status.h" | |
21 | #include "rocksdb/utilities/write_batch_with_index.h" | |
22 | ||
f67539c2 | 23 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
24 | |
25 | class MergeContext; | |
1e59de90 TL |
26 | class WBWIIteratorImpl; |
27 | class WriteBatchWithIndexInternal; | |
7c673cae FG |
28 | struct Options; |
29 | ||
1e59de90 TL |
30 | // when direction == forward |
31 | // * current_at_base_ <=> base_iterator > delta_iterator | |
32 | // when direction == backwards | |
33 | // * current_at_base_ <=> base_iterator < delta_iterator | |
34 | // always: | |
35 | // * equal_keys_ <=> base_iterator == delta_iterator | |
36 | class BaseDeltaIterator : public Iterator { | |
37 | public: | |
38 | BaseDeltaIterator(ColumnFamilyHandle* column_family, Iterator* base_iterator, | |
39 | WBWIIteratorImpl* delta_iterator, | |
40 | const Comparator* comparator, | |
41 | const ReadOptions* read_options = nullptr); | |
42 | ||
43 | ~BaseDeltaIterator() override {} | |
44 | ||
45 | bool Valid() const override; | |
46 | void SeekToFirst() override; | |
47 | void SeekToLast() override; | |
48 | void Seek(const Slice& k) override; | |
49 | void SeekForPrev(const Slice& k) override; | |
50 | void Next() override; | |
51 | void Prev() override; | |
52 | Slice key() const override; | |
53 | Slice value() const override; | |
54 | Status status() const override; | |
55 | void Invalidate(Status s); | |
56 | ||
57 | private: | |
58 | void AssertInvariants(); | |
59 | void Advance(); | |
60 | void AdvanceDelta(); | |
61 | void AdvanceBase(); | |
62 | bool BaseValid() const; | |
63 | bool DeltaValid() const; | |
64 | void UpdateCurrent(); | |
65 | ||
66 | std::unique_ptr<WriteBatchWithIndexInternal> wbwii_; | |
67 | bool forward_; | |
68 | bool current_at_base_; | |
69 | bool equal_keys_; | |
70 | mutable Status status_; | |
71 | std::unique_ptr<Iterator> base_iterator_; | |
72 | std::unique_ptr<WBWIIteratorImpl> delta_iterator_; | |
73 | const Comparator* comparator_; // not owned | |
74 | const Slice* iterate_upper_bound_; | |
75 | mutable PinnableSlice merge_result_; | |
76 | }; | |
77 | ||
7c673cae FG |
78 | // Key used by skip list, as the binary searchable index of WriteBatchWithIndex. |
79 | struct WriteBatchIndexEntry { | |
80 | WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz) | |
81 | : offset(o), | |
82 | column_family(c), | |
83 | key_offset(ko), | |
84 | key_size(ksz), | |
85 | search_key(nullptr) {} | |
11fdf7f2 TL |
86 | // Create a dummy entry as the search key. This index entry won't be backed |
87 | // by an entry from the write batch, but a pointer to the search key. Or a | |
88 | // special flag of offset can indicate we are seek to first. | |
89 | // @_search_key: the search key | |
90 | // @_column_family: column family | |
91 | // @is_forward_direction: true for Seek(). False for SeekForPrev() | |
92 | // @is_seek_to_first: true if we seek to the beginning of the column family | |
93 | // _search_key should be null in this case. | |
94 | WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family, | |
95 | bool is_forward_direction, bool is_seek_to_first) | |
96 | // For SeekForPrev(), we need to make the dummy entry larger than any | |
97 | // entry who has the same search key. Otherwise, we'll miss those entries. | |
1e59de90 | 98 | : offset(is_forward_direction ? 0 : std::numeric_limits<size_t>::max()), |
11fdf7f2 | 99 | column_family(_column_family), |
7c673cae | 100 | key_offset(0), |
11fdf7f2 TL |
101 | key_size(is_seek_to_first ? kFlagMinInCf : 0), |
102 | search_key(_search_key) { | |
103 | assert(_search_key != nullptr || is_seek_to_first); | |
104 | } | |
105 | ||
106 | // If this flag appears in the key_size, it indicates a | |
107 | // key that is smaller than any other entry for the same column family. | |
1e59de90 | 108 | static const size_t kFlagMinInCf = std::numeric_limits<size_t>::max(); |
7c673cae | 109 | |
11fdf7f2 TL |
110 | bool is_min_in_cf() const { |
111 | assert(key_size != kFlagMinInCf || | |
112 | (key_offset == 0 && search_key == nullptr)); | |
113 | return key_size == kFlagMinInCf; | |
114 | } | |
7c673cae | 115 | |
11fdf7f2 TL |
116 | // offset of an entry in write batch's string buffer. If this is a dummy |
117 | // lookup key, in which case search_key != nullptr, offset is set to either | |
118 | // 0 or max, only for comparison purpose. Because when entries have the same | |
119 | // key, the entry with larger offset is larger, offset = 0 will make a seek | |
120 | // key small or equal than all the entries with the seek key, so that Seek() | |
121 | // will find all the entries of the same key. Similarly, offset = MAX will | |
122 | // make the entry just larger than all entries with the search key so | |
123 | // SeekForPrev() will see all the keys with the same key. | |
124 | size_t offset; | |
1e59de90 | 125 | uint32_t column_family; // column family of the entry. |
7c673cae | 126 | size_t key_offset; // offset of the key in write batch's string buffer. |
11fdf7f2 TL |
127 | size_t key_size; // size of the key. kFlagMinInCf indicates |
128 | // that this is a dummy look up entry for | |
129 | // SeekToFirst() to the beginning of the column | |
130 | // family. We use the flag here to save a boolean | |
131 | // in the struct. | |
7c673cae FG |
132 | |
133 | const Slice* search_key; // if not null, instead of reading keys from | |
134 | // write batch, use it to compare. This is used | |
135 | // for lookup key. | |
136 | }; | |
137 | ||
138 | class ReadableWriteBatch : public WriteBatch { | |
139 | public: | |
1e59de90 TL |
140 | explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0, |
141 | size_t protection_bytes_per_key = 0, | |
142 | size_t default_cf_ts_sz = 0) | |
143 | : WriteBatch(reserved_bytes, max_bytes, protection_bytes_per_key, | |
144 | default_cf_ts_sz) {} | |
7c673cae FG |
145 | // Retrieve some information from a write entry in the write batch, given |
146 | // the start offset of the write entry. | |
147 | Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, | |
148 | Slice* value, Slice* blob, Slice* xid) const; | |
149 | }; | |
150 | ||
151 | class WriteBatchEntryComparator { | |
152 | public: | |
153 | WriteBatchEntryComparator(const Comparator* _default_comparator, | |
154 | const ReadableWriteBatch* write_batch) | |
155 | : default_comparator_(_default_comparator), write_batch_(write_batch) {} | |
156 | // Compare a and b. Return a negative value if a is less than b, 0 if they | |
157 | // are equal, and a positive value if a is greater than b | |
158 | int operator()(const WriteBatchIndexEntry* entry1, | |
159 | const WriteBatchIndexEntry* entry2) const; | |
160 | ||
161 | int CompareKey(uint32_t column_family, const Slice& key1, | |
162 | const Slice& key2) const; | |
163 | ||
164 | void SetComparatorForCF(uint32_t column_family_id, | |
165 | const Comparator* comparator) { | |
166 | if (column_family_id >= cf_comparators_.size()) { | |
167 | cf_comparators_.resize(column_family_id + 1, nullptr); | |
168 | } | |
169 | cf_comparators_[column_family_id] = comparator; | |
170 | } | |
171 | ||
172 | const Comparator* default_comparator() { return default_comparator_; } | |
173 | ||
1e59de90 TL |
174 | const Comparator* GetComparator( |
175 | const ColumnFamilyHandle* column_family) const; | |
176 | ||
177 | const Comparator* GetComparator(uint32_t column_family) const; | |
178 | ||
7c673cae | 179 | private: |
1e59de90 | 180 | const Comparator* const default_comparator_; |
7c673cae | 181 | std::vector<const Comparator*> cf_comparators_; |
1e59de90 TL |
182 | const ReadableWriteBatch* const write_batch_; |
183 | }; | |
184 | ||
185 | using WriteBatchEntrySkipList = | |
186 | SkipList<WriteBatchIndexEntry*, const WriteBatchEntryComparator&>; | |
187 | ||
188 | class WBWIIteratorImpl : public WBWIIterator { | |
189 | public: | |
190 | enum Result : uint8_t { | |
191 | kFound, | |
192 | kDeleted, | |
193 | kNotFound, | |
194 | kMergeInProgress, | |
195 | kError | |
196 | }; | |
197 | WBWIIteratorImpl(uint32_t column_family_id, | |
198 | WriteBatchEntrySkipList* skip_list, | |
199 | const ReadableWriteBatch* write_batch, | |
200 | WriteBatchEntryComparator* comparator) | |
201 | : column_family_id_(column_family_id), | |
202 | skip_list_iter_(skip_list), | |
203 | write_batch_(write_batch), | |
204 | comparator_(comparator) {} | |
205 | ||
206 | ~WBWIIteratorImpl() override {} | |
207 | ||
208 | bool Valid() const override { | |
209 | if (!skip_list_iter_.Valid()) { | |
210 | return false; | |
211 | } | |
212 | const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key(); | |
213 | return (iter_entry != nullptr && | |
214 | iter_entry->column_family == column_family_id_); | |
215 | } | |
216 | ||
217 | void SeekToFirst() override { | |
218 | WriteBatchIndexEntry search_entry( | |
219 | nullptr /* search_key */, column_family_id_, | |
220 | true /* is_forward_direction */, true /* is_seek_to_first */); | |
221 | skip_list_iter_.Seek(&search_entry); | |
222 | } | |
223 | ||
224 | void SeekToLast() override { | |
225 | WriteBatchIndexEntry search_entry( | |
226 | nullptr /* search_key */, column_family_id_ + 1, | |
227 | true /* is_forward_direction */, true /* is_seek_to_first */); | |
228 | skip_list_iter_.Seek(&search_entry); | |
229 | if (!skip_list_iter_.Valid()) { | |
230 | skip_list_iter_.SeekToLast(); | |
231 | } else { | |
232 | skip_list_iter_.Prev(); | |
233 | } | |
234 | } | |
235 | ||
236 | void Seek(const Slice& key) override { | |
237 | WriteBatchIndexEntry search_entry(&key, column_family_id_, | |
238 | true /* is_forward_direction */, | |
239 | false /* is_seek_to_first */); | |
240 | skip_list_iter_.Seek(&search_entry); | |
241 | } | |
242 | ||
243 | void SeekForPrev(const Slice& key) override { | |
244 | WriteBatchIndexEntry search_entry(&key, column_family_id_, | |
245 | false /* is_forward_direction */, | |
246 | false /* is_seek_to_first */); | |
247 | skip_list_iter_.SeekForPrev(&search_entry); | |
248 | } | |
249 | ||
250 | void Next() override { skip_list_iter_.Next(); } | |
251 | ||
252 | void Prev() override { skip_list_iter_.Prev(); } | |
253 | ||
254 | WriteEntry Entry() const override; | |
255 | ||
256 | Status status() const override { | |
257 | // this is in-memory data structure, so the only way status can be non-ok is | |
258 | // through memory corruption | |
259 | return Status::OK(); | |
260 | } | |
261 | ||
262 | const WriteBatchIndexEntry* GetRawEntry() const { | |
263 | return skip_list_iter_.key(); | |
264 | } | |
265 | ||
266 | bool MatchesKey(uint32_t cf_id, const Slice& key); | |
267 | ||
268 | // Moves the iterator to first entry of the previous key. | |
269 | void PrevKey(); | |
270 | // Moves the iterator to first entry of the next key. | |
271 | void NextKey(); | |
272 | ||
273 | // Moves the iterator to the Update (Put or Delete) for the current key | |
274 | // If there are no Put/Delete, the Iterator will point to the first entry for | |
275 | // this key | |
276 | // @return kFound if a Put was found for the key | |
277 | // @return kDeleted if a delete was found for the key | |
278 | // @return kMergeInProgress if only merges were fouund for the key | |
279 | // @return kError if an unsupported operation was found for the key | |
280 | // @return kNotFound if no operations were found for this key | |
281 | // | |
282 | Result FindLatestUpdate(const Slice& key, MergeContext* merge_context); | |
283 | Result FindLatestUpdate(MergeContext* merge_context); | |
284 | ||
285 | protected: | |
286 | void AdvanceKey(bool forward); | |
287 | ||
288 | private: | |
289 | uint32_t column_family_id_; | |
290 | WriteBatchEntrySkipList::Iterator skip_list_iter_; | |
7c673cae | 291 | const ReadableWriteBatch* write_batch_; |
1e59de90 | 292 | WriteBatchEntryComparator* comparator_; |
7c673cae FG |
293 | }; |
294 | ||
295 | class WriteBatchWithIndexInternal { | |
296 | public: | |
1e59de90 TL |
297 | static const Comparator* GetUserComparator(const WriteBatchWithIndex& wbwi, |
298 | uint32_t cf_id); | |
299 | ||
300 | // For GetFromBatchAndDB or similar | |
301 | explicit WriteBatchWithIndexInternal(DB* db, | |
302 | ColumnFamilyHandle* column_family); | |
303 | // For GetFromBatchAndDB or similar | |
304 | explicit WriteBatchWithIndexInternal(ColumnFamilyHandle* column_family); | |
305 | // For GetFromBatch or similar | |
306 | explicit WriteBatchWithIndexInternal(const DBOptions* db_options, | |
307 | ColumnFamilyHandle* column_family); | |
7c673cae FG |
308 | |
309 | // If batch contains a value for key, store it in *value and return kFound. | |
310 | // If batch contains a deletion for key, return Deleted. | |
311 | // If batch contains Merge operations as the most recent entry for a key, | |
312 | // and the merge process does not stop (not reaching a value or delete), | |
313 | // prepend the current merge operands to *operands, | |
314 | // and return kMergeInProgress | |
315 | // If batch does not contain this key, return kNotFound | |
316 | // Else, return kError on error with error Status stored in *s. | |
1e59de90 TL |
317 | WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch, |
318 | const Slice& key, std::string* value, | |
319 | Status* s) { | |
320 | return GetFromBatch(batch, key, &merge_context_, value, s); | |
321 | } | |
322 | WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch, | |
323 | const Slice& key, | |
324 | MergeContext* merge_context, | |
325 | std::string* value, Status* s); | |
326 | Status MergeKey(const Slice& key, const Slice* value, | |
327 | std::string* result) const { | |
328 | return MergeKey(key, value, merge_context_, result); | |
329 | } | |
330 | Status MergeKey(const Slice& key, const Slice* value, | |
331 | const MergeContext& context, std::string* result) const; | |
332 | size_t GetNumOperands() const { return merge_context_.GetNumOperands(); } | |
333 | MergeContext* GetMergeContext() { return &merge_context_; } | |
334 | Slice GetOperand(int index) const { return merge_context_.GetOperand(index); } | |
335 | ||
336 | private: | |
337 | DB* db_; | |
338 | const DBOptions* db_options_; | |
339 | ColumnFamilyHandle* column_family_; | |
340 | MergeContext merge_context_; | |
7c673cae FG |
341 | }; |
342 | ||
f67539c2 | 343 | } // namespace ROCKSDB_NAMESPACE |
7c673cae | 344 | #endif // !ROCKSDB_LITE |