]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/plain_table_key_coding.h
321e0aed594cb80a2563bad5342843a250e679ae
[ceph.git] / ceph / src / rocksdb / table / plain_table_key_coding.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5
6 #pragma once
7 #ifndef ROCKSDB_LITE
8
#include <array>
#include <cassert>
#include <cstdint>
#include <memory>

#include "rocksdb/slice.h"
#include "db/dbformat.h"
#include "table/plain_table_reader.h"
13
14 namespace rocksdb {
15
// Forward declarations of types named by the declarations in this header.
class WritableFile;
// PlainTableKeyEncoder::AppendKey() takes a WritableFileWriter*, so declare
// it here instead of depending on an include to supply the name.
class WritableFileWriter;
struct ParsedInternalKey;
struct PlainTableReaderFileInfo;
// Opaque enum declaration; the enumerators are defined elsewhere.
enum PlainTableEntryType : unsigned char;
20
21 // Helper class to write out a key to an output file
22 // Actual data format of the key is documented in plain_table_factory.h
23 class PlainTableKeyEncoder {
24 public:
25 explicit PlainTableKeyEncoder(EncodingType encoding_type,
26 uint32_t user_key_len,
27 const SliceTransform* prefix_extractor,
28 size_t index_sparseness)
29 : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
30 fixed_user_key_len_(user_key_len),
31 prefix_extractor_(prefix_extractor),
32 index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
33 key_count_for_prefix_(0) {}
34 // key: the key to write out, in the format of internal key.
35 // file: the output file to write out
36 // offset: offset in the file. Needs to be updated after appending bytes
37 // for the key
38 // meta_bytes_buf: buffer for extra meta bytes
39 // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
40 // if meta_bytes_buf is updated.
41 Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset,
42 char* meta_bytes_buf, size_t* meta_bytes_buf_size);
43
44 // Return actual encoding type to be picked
45 EncodingType GetEncodingType() { return encoding_type_; }
46
47 private:
48 EncodingType encoding_type_;
49 uint32_t fixed_user_key_len_;
50 const SliceTransform* prefix_extractor_;
51 const size_t index_sparseness_;
52 size_t key_count_for_prefix_;
53 IterKey pre_prefix_;
54 };
55
56 class PlainTableFileReader {
57 public:
58 explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
59 : file_info_(_file_info), num_buf_(0) {}
60 // In mmaped mode, the results point to mmaped area of the file, which
61 // means it is always valid before closing the file.
62 // In non-mmap mode, the results point to an internal buffer. If the caller
63 // makes another read call, the results may not be valid. So callers should
64 // make a copy when needed.
65 // In order to save read calls to files, we keep two internal buffers:
66 // the first read and the most recent read. This is efficient because it
67 // columns these two common use cases:
68 // (1) hash index only identify one location, we read the key to verify
69 // the location, and read key and value if it is the right location.
70 // (2) after hash index checking, we identify two locations (because of
71 // hash bucket conflicts), we binary search the two location to see
72 // which one is what we need and start to read from the location.
73 // These two most common use cases will be covered by the two buffers
74 // so that we don't need to re-read the same location.
75 // Currently we keep a fixed size buffer. If a read doesn't exactly fit
76 // the buffer, we replace the second buffer with the location user reads.
77 //
78 // If return false, status code is stored in status_.
79 bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
80 if (file_info_->is_mmap_mode) {
81 assert(file_offset + len <= file_info_->data_end_offset);
82 *out = Slice(file_info_->file_data.data() + file_offset, len);
83 return true;
84 } else {
85 return ReadNonMmap(file_offset, len, out);
86 }
87 }
88
89 // If return false, status code is stored in status_.
90 bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
91
92 // *bytes_read = 0 means eof. false means failure and status is saved
93 // in status_. Not directly returning Status to save copying status
94 // object to map previous performance of mmap mode.
95 inline bool ReadVarint32(uint32_t offset, uint32_t* output,
96 uint32_t* bytes_read);
97
98 bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
99 uint32_t* bytes_read);
100
101 Status status() const { return status_; }
102
103 const PlainTableReaderFileInfo* file_info() { return file_info_; }
104
105 private:
106 const PlainTableReaderFileInfo* file_info_;
107
108 struct Buffer {
109 Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
110 std::unique_ptr<char[]> buf;
111 uint32_t buf_start_offset;
112 uint32_t buf_len;
113 uint32_t buf_capacity;
114 };
115
116 // Keep buffers for two recent reads.
117 std::array<unique_ptr<Buffer>, 2> buffers_;
118 uint32_t num_buf_;
119 Status status_;
120
121 Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
122 };
123
124 // A helper class to decode keys from input buffer
125 // Actual data format of the key is documented in plain_table_factory.h
126 class PlainTableKeyDecoder {
127 public:
128 explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
129 EncodingType encoding_type,
130 uint32_t user_key_len,
131 const SliceTransform* prefix_extractor)
132 : file_reader_(file_info),
133 encoding_type_(encoding_type),
134 prefix_len_(0),
135 fixed_user_key_len_(user_key_len),
136 prefix_extractor_(prefix_extractor),
137 in_prefix_(false) {}
138 // Find the next key.
139 // start: char array where the key starts.
140 // limit: boundary of the char array
141 // parsed_key: the output of the result key
142 // internal_key: if not null, fill with the output of the result key in
143 // un-parsed format
144 // bytes_read: how many bytes read from start. Output
145 // seekable: whether key can be read from this place. Used when building
146 // indexes. Output.
147 Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
148 Slice* internal_key, Slice* value, uint32_t* bytes_read,
149 bool* seekable = nullptr);
150
151 Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
152 Slice* internal_key, uint32_t* bytes_read,
153 bool* seekable = nullptr);
154
155 PlainTableFileReader file_reader_;
156 EncodingType encoding_type_;
157 uint32_t prefix_len_;
158 uint32_t fixed_user_key_len_;
159 Slice saved_user_key_;
160 IterKey cur_key_;
161 const SliceTransform* prefix_extractor_;
162 bool in_prefix_;
163
164 private:
165 Status NextPlainEncodingKey(uint32_t start_offset,
166 ParsedInternalKey* parsed_key,
167 Slice* internal_key, uint32_t* bytes_read,
168 bool* seekable = nullptr);
169 Status NextPrefixEncodingKey(uint32_t start_offset,
170 ParsedInternalKey* parsed_key,
171 Slice* internal_key, uint32_t* bytes_read,
172 bool* seekable = nullptr);
173 Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
174 ParsedInternalKey* parsed_key, uint32_t* bytes_read,
175 bool* internal_key_valid, Slice* internal_key);
176 inline Status DecodeSize(uint32_t start_offset,
177 PlainTableEntryType* entry_type, uint32_t* key_size,
178 uint32_t* bytes_read);
179 };
180
181 } // namespace rocksdb
182
183 #endif // ROCKSDB_LITE