]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #pragma once | |
19 | ||
20 | #include <cstdint> | |
21 | #include <cstring> | |
22 | ||
23 | #include "arrow/buffer.h" | |
24 | #include "arrow/util/bit_util.h" | |
25 | #include "arrow/util/endian.h" | |
26 | #include "arrow/util/macros.h" | |
27 | ||
28 | namespace arrow { | |
29 | namespace internal { | |
30 | ||
31 | class BitmapReader { | |
32 | public: | |
33 | BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length) | |
34 | : bitmap_(bitmap), position_(0), length_(length) { | |
35 | current_byte_ = 0; | |
36 | byte_offset_ = start_offset / 8; | |
37 | bit_offset_ = start_offset % 8; | |
38 | if (length > 0) { | |
39 | current_byte_ = bitmap[byte_offset_]; | |
40 | } | |
41 | } | |
42 | ||
43 | bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; } | |
44 | ||
45 | bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; } | |
46 | ||
47 | void Next() { | |
48 | ++bit_offset_; | |
49 | ++position_; | |
50 | if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) { | |
51 | bit_offset_ = 0; | |
52 | ++byte_offset_; | |
53 | if (ARROW_PREDICT_TRUE(position_ < length_)) { | |
54 | current_byte_ = bitmap_[byte_offset_]; | |
55 | } | |
56 | } | |
57 | } | |
58 | ||
59 | int64_t position() const { return position_; } | |
60 | ||
61 | int64_t length() const { return length_; } | |
62 | ||
63 | private: | |
64 | const uint8_t* bitmap_; | |
65 | int64_t position_; | |
66 | int64_t length_; | |
67 | ||
68 | uint8_t current_byte_; | |
69 | int64_t byte_offset_; | |
70 | int64_t bit_offset_; | |
71 | }; | |
72 | ||
73 | // XXX Cannot name it BitmapWordReader because the name is already used | |
74 | // in bitmap_ops.cc | |
75 | ||
76 | class BitmapUInt64Reader { | |
77 | public: | |
78 | BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length) | |
79 | : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8), | |
80 | num_carry_bits_(8 - start_offset % 8), | |
81 | length_(length), | |
82 | remaining_length_(length_) { | |
83 | if (length_ > 0) { | |
84 | // Load carry bits from the first byte's MSBs | |
85 | if (length_ >= num_carry_bits_) { | |
86 | carry_bits_ = | |
87 | LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_); | |
88 | } else { | |
89 | carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_); | |
90 | } | |
91 | } | |
92 | } | |
93 | ||
94 | uint64_t NextWord() { | |
95 | if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) { | |
96 | // We can load a full word | |
97 | uint64_t next_word = LoadFullWord(); | |
98 | // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word | |
99 | uint64_t word = carry_bits_ | (next_word << num_carry_bits_); | |
100 | carry_bits_ = next_word >> (64 - num_carry_bits_); | |
101 | remaining_length_ -= 64; | |
102 | return word; | |
103 | } else if (remaining_length_ > num_carry_bits_) { | |
104 | // We can load a partial word | |
105 | uint64_t next_word = | |
106 | LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_); | |
107 | uint64_t word = carry_bits_ | (next_word << num_carry_bits_); | |
108 | carry_bits_ = next_word >> (64 - num_carry_bits_); | |
109 | remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0); | |
110 | return word; | |
111 | } else { | |
112 | remaining_length_ = 0; | |
113 | return carry_bits_; | |
114 | } | |
115 | } | |
116 | ||
117 | int64_t position() const { return length_ - remaining_length_; } | |
118 | ||
119 | int64_t length() const { return length_; } | |
120 | ||
121 | private: | |
122 | uint64_t LoadFullWord() { | |
123 | uint64_t word; | |
124 | memcpy(&word, bitmap_, 8); | |
125 | bitmap_ += 8; | |
126 | return BitUtil::ToLittleEndian(word); | |
127 | } | |
128 | ||
129 | uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) { | |
130 | uint64_t word = 0; | |
131 | const int64_t num_bytes = BitUtil::BytesForBits(num_bits); | |
132 | memcpy(&word, bitmap_, num_bytes); | |
133 | bitmap_ += num_bytes; | |
134 | return (BitUtil::ToLittleEndian(word) >> bit_offset) & | |
135 | BitUtil::LeastSignificantBitMask(num_bits); | |
136 | } | |
137 | ||
138 | const uint8_t* bitmap_; | |
139 | const int64_t num_carry_bits_; // in [1, 8] | |
140 | const int64_t length_; | |
141 | int64_t remaining_length_; | |
142 | uint64_t carry_bits_; | |
143 | }; | |
144 | ||
145 | // BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h) | |
146 | // on sufficiently large inputs. However, it has a larger prolog / epilog overhead | |
147 | // and should probably not be used for small bitmaps. | |
148 | ||
149 | template <typename Word, bool may_have_byte_offset = true> | |
150 | class BitmapWordReader { | |
151 | public: | |
152 | BitmapWordReader() = default; | |
153 | BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length) | |
154 | : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)), | |
155 | bitmap_(bitmap + offset / 8), | |
156 | bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) { | |
157 | // decrement word count by one as we may touch two adjacent words in one iteration | |
158 | nwords_ = length / (sizeof(Word) * 8) - 1; | |
159 | if (nwords_ < 0) { | |
160 | nwords_ = 0; | |
161 | } | |
162 | trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8); | |
163 | trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_)); | |
164 | ||
165 | if (nwords_ > 0) { | |
166 | current_data.word_ = load<Word>(bitmap_); | |
167 | } else if (length > 0) { | |
168 | current_data.epi.byte_ = load<uint8_t>(bitmap_); | |
169 | } | |
170 | } | |
171 | ||
172 | Word NextWord() { | |
173 | bitmap_ += sizeof(Word); | |
174 | const Word next_word = load<Word>(bitmap_); | |
175 | Word word = current_data.word_; | |
176 | if (may_have_byte_offset && offset_) { | |
177 | // combine two adjacent words into one word | |
178 | // |<------ next ----->|<---- current ---->| | |
179 | // +-------------+-----+-------------+-----+ | |
180 | // | --- | A | B | --- | | |
181 | // +-------------+-----+-------------+-----+ | |
182 | // | | offset | |
183 | // v v | |
184 | // +-----+-------------+ | |
185 | // | A | B | | |
186 | // +-----+-------------+ | |
187 | // |<------ word ----->| | |
188 | word >>= offset_; | |
189 | word |= next_word << (sizeof(Word) * 8 - offset_); | |
190 | } | |
191 | current_data.word_ = next_word; | |
192 | return word; | |
193 | } | |
194 | ||
195 | uint8_t NextTrailingByte(int& valid_bits) { | |
196 | uint8_t byte; | |
197 | assert(trailing_bits_ > 0); | |
198 | ||
199 | if (trailing_bits_ <= 8) { | |
200 | // last byte | |
201 | valid_bits = trailing_bits_; | |
202 | trailing_bits_ = 0; | |
203 | byte = 0; | |
204 | internal::BitmapReader reader(bitmap_, offset_, valid_bits); | |
205 | for (int i = 0; i < valid_bits; ++i) { | |
206 | byte >>= 1; | |
207 | if (reader.IsSet()) { | |
208 | byte |= 0x80; | |
209 | } | |
210 | reader.Next(); | |
211 | } | |
212 | byte >>= (8 - valid_bits); | |
213 | } else { | |
214 | ++bitmap_; | |
215 | const uint8_t next_byte = load<uint8_t>(bitmap_); | |
216 | byte = current_data.epi.byte_; | |
217 | if (may_have_byte_offset && offset_) { | |
218 | byte >>= offset_; | |
219 | byte |= next_byte << (8 - offset_); | |
220 | } | |
221 | current_data.epi.byte_ = next_byte; | |
222 | trailing_bits_ -= 8; | |
223 | trailing_bytes_--; | |
224 | valid_bits = 8; | |
225 | } | |
226 | return byte; | |
227 | } | |
228 | ||
229 | int64_t words() const { return nwords_; } | |
230 | int trailing_bytes() const { return trailing_bytes_; } | |
231 | ||
232 | private: | |
233 | int64_t offset_; | |
234 | const uint8_t* bitmap_; | |
235 | ||
236 | const uint8_t* bitmap_end_; | |
237 | int64_t nwords_; | |
238 | int trailing_bits_; | |
239 | int trailing_bytes_; | |
240 | union { | |
241 | Word word_; | |
242 | struct { | |
243 | #if ARROW_LITTLE_ENDIAN == 0 | |
244 | uint8_t padding_bytes_[sizeof(Word) - 1]; | |
245 | #endif | |
246 | uint8_t byte_; | |
247 | } epi; | |
248 | } current_data; | |
249 | ||
250 | template <typename DType> | |
251 | DType load(const uint8_t* bitmap) { | |
252 | assert(bitmap + sizeof(DType) <= bitmap_end_); | |
253 | return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap)); | |
254 | } | |
255 | }; | |
256 | ||
257 | /// \brief Index into a possibly non-existent bitmap | |
258 | struct OptionalBitIndexer { | |
259 | const uint8_t* bitmap; | |
260 | const int64_t offset; | |
261 | ||
262 | explicit OptionalBitIndexer(const std::shared_ptr<Buffer>& buffer, int64_t offset = 0) | |
263 | : bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {} | |
264 | ||
265 | bool operator[](int64_t i) const { | |
266 | return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i); | |
267 | } | |
268 | }; | |
269 | ||
270 | } // namespace internal | |
271 | } // namespace arrow |