]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | |
6 | #pragma once | |
7 | #ifndef ROCKSDB_LITE | |
8 | ||
#include <array>
#include <cassert>
#include <cstdint>
#include <memory>

#include "db/dbformat.h"
#include "rocksdb/slice.h"
#include "table/plain_table_reader.h"
13 | ||
14 | namespace rocksdb { | |
15 | ||
// Forward declarations for types that this header only names through
// pointers or references.
class WritableFile;
// Used by PlainTableKeyEncoder::AppendKey(); declared here so this header
// does not depend on another header happening to define it first.
class WritableFileWriter;
struct ParsedInternalKey;
struct PlainTableReaderFileInfo;
// Opaque declaration; the underlying type is fixed to unsigned char.
enum PlainTableEntryType : unsigned char;
20 | ||
21 | // Helper class to write out a key to an output file | |
22 | // Actual data format of the key is documented in plain_table_factory.h | |
23 | class PlainTableKeyEncoder { | |
24 | public: | |
25 | explicit PlainTableKeyEncoder(EncodingType encoding_type, | |
26 | uint32_t user_key_len, | |
27 | const SliceTransform* prefix_extractor, | |
28 | size_t index_sparseness) | |
29 | : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain), | |
30 | fixed_user_key_len_(user_key_len), | |
31 | prefix_extractor_(prefix_extractor), | |
32 | index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), | |
33 | key_count_for_prefix_(0) {} | |
34 | // key: the key to write out, in the format of internal key. | |
35 | // file: the output file to write out | |
36 | // offset: offset in the file. Needs to be updated after appending bytes | |
37 | // for the key | |
38 | // meta_bytes_buf: buffer for extra meta bytes | |
39 | // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated | |
40 | // if meta_bytes_buf is updated. | |
41 | Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset, | |
42 | char* meta_bytes_buf, size_t* meta_bytes_buf_size); | |
43 | ||
44 | // Return actual encoding type to be picked | |
45 | EncodingType GetEncodingType() { return encoding_type_; } | |
46 | ||
47 | private: | |
48 | EncodingType encoding_type_; | |
49 | uint32_t fixed_user_key_len_; | |
50 | const SliceTransform* prefix_extractor_; | |
51 | const size_t index_sparseness_; | |
52 | size_t key_count_for_prefix_; | |
53 | IterKey pre_prefix_; | |
54 | }; | |
55 | ||
56 | class PlainTableFileReader { | |
57 | public: | |
58 | explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) | |
59 | : file_info_(_file_info), num_buf_(0) {} | |
60 | // In mmaped mode, the results point to mmaped area of the file, which | |
61 | // means it is always valid before closing the file. | |
62 | // In non-mmap mode, the results point to an internal buffer. If the caller | |
63 | // makes another read call, the results may not be valid. So callers should | |
64 | // make a copy when needed. | |
65 | // In order to save read calls to files, we keep two internal buffers: | |
66 | // the first read and the most recent read. This is efficient because it | |
67 | // columns these two common use cases: | |
68 | // (1) hash index only identify one location, we read the key to verify | |
69 | // the location, and read key and value if it is the right location. | |
70 | // (2) after hash index checking, we identify two locations (because of | |
71 | // hash bucket conflicts), we binary search the two location to see | |
72 | // which one is what we need and start to read from the location. | |
73 | // These two most common use cases will be covered by the two buffers | |
74 | // so that we don't need to re-read the same location. | |
75 | // Currently we keep a fixed size buffer. If a read doesn't exactly fit | |
76 | // the buffer, we replace the second buffer with the location user reads. | |
77 | // | |
78 | // If return false, status code is stored in status_. | |
79 | bool Read(uint32_t file_offset, uint32_t len, Slice* out) { | |
80 | if (file_info_->is_mmap_mode) { | |
81 | assert(file_offset + len <= file_info_->data_end_offset); | |
82 | *out = Slice(file_info_->file_data.data() + file_offset, len); | |
83 | return true; | |
84 | } else { | |
85 | return ReadNonMmap(file_offset, len, out); | |
86 | } | |
87 | } | |
88 | ||
89 | // If return false, status code is stored in status_. | |
90 | bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output); | |
91 | ||
92 | // *bytes_read = 0 means eof. false means failure and status is saved | |
93 | // in status_. Not directly returning Status to save copying status | |
94 | // object to map previous performance of mmap mode. | |
95 | inline bool ReadVarint32(uint32_t offset, uint32_t* output, | |
96 | uint32_t* bytes_read); | |
97 | ||
98 | bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output, | |
99 | uint32_t* bytes_read); | |
100 | ||
101 | Status status() const { return status_; } | |
102 | ||
103 | const PlainTableReaderFileInfo* file_info() { return file_info_; } | |
104 | ||
105 | private: | |
106 | const PlainTableReaderFileInfo* file_info_; | |
107 | ||
108 | struct Buffer { | |
109 | Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {} | |
110 | std::unique_ptr<char[]> buf; | |
111 | uint32_t buf_start_offset; | |
112 | uint32_t buf_len; | |
113 | uint32_t buf_capacity; | |
114 | }; | |
115 | ||
116 | // Keep buffers for two recent reads. | |
117 | std::array<unique_ptr<Buffer>, 2> buffers_; | |
118 | uint32_t num_buf_; | |
119 | Status status_; | |
120 | ||
121 | Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len); | |
122 | }; | |
123 | ||
124 | // A helper class to decode keys from input buffer | |
125 | // Actual data format of the key is documented in plain_table_factory.h | |
126 | class PlainTableKeyDecoder { | |
127 | public: | |
128 | explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, | |
129 | EncodingType encoding_type, | |
130 | uint32_t user_key_len, | |
131 | const SliceTransform* prefix_extractor) | |
132 | : file_reader_(file_info), | |
133 | encoding_type_(encoding_type), | |
134 | prefix_len_(0), | |
135 | fixed_user_key_len_(user_key_len), | |
136 | prefix_extractor_(prefix_extractor), | |
137 | in_prefix_(false) {} | |
138 | // Find the next key. | |
139 | // start: char array where the key starts. | |
140 | // limit: boundary of the char array | |
141 | // parsed_key: the output of the result key | |
142 | // internal_key: if not null, fill with the output of the result key in | |
143 | // un-parsed format | |
144 | // bytes_read: how many bytes read from start. Output | |
145 | // seekable: whether key can be read from this place. Used when building | |
146 | // indexes. Output. | |
147 | Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key, | |
148 | Slice* internal_key, Slice* value, uint32_t* bytes_read, | |
149 | bool* seekable = nullptr); | |
150 | ||
151 | Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key, | |
152 | Slice* internal_key, uint32_t* bytes_read, | |
153 | bool* seekable = nullptr); | |
154 | ||
155 | PlainTableFileReader file_reader_; | |
156 | EncodingType encoding_type_; | |
157 | uint32_t prefix_len_; | |
158 | uint32_t fixed_user_key_len_; | |
159 | Slice saved_user_key_; | |
160 | IterKey cur_key_; | |
161 | const SliceTransform* prefix_extractor_; | |
162 | bool in_prefix_; | |
163 | ||
164 | private: | |
165 | Status NextPlainEncodingKey(uint32_t start_offset, | |
166 | ParsedInternalKey* parsed_key, | |
167 | Slice* internal_key, uint32_t* bytes_read, | |
168 | bool* seekable = nullptr); | |
169 | Status NextPrefixEncodingKey(uint32_t start_offset, | |
170 | ParsedInternalKey* parsed_key, | |
171 | Slice* internal_key, uint32_t* bytes_read, | |
172 | bool* seekable = nullptr); | |
173 | Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size, | |
174 | ParsedInternalKey* parsed_key, uint32_t* bytes_read, | |
175 | bool* internal_key_valid, Slice* internal_key); | |
176 | inline Status DecodeSize(uint32_t start_offset, | |
177 | PlainTableEntryType* entry_type, uint32_t* key_size, | |
178 | uint32_t* bytes_read); | |
179 | }; | |
180 | ||
181 | } // namespace rocksdb | |
182 | ||
183 | #endif // ROCKSDB_LITE |