]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/arrow/util/bitmap_reader.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / util / bitmap_reader.h
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
20#include <cstdint>
21#include <cstring>
22
23#include "arrow/buffer.h"
24#include "arrow/util/bit_util.h"
25#include "arrow/util/endian.h"
26#include "arrow/util/macros.h"
27
28namespace arrow {
29namespace internal {
30
31class BitmapReader {
32 public:
33 BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
34 : bitmap_(bitmap), position_(0), length_(length) {
35 current_byte_ = 0;
36 byte_offset_ = start_offset / 8;
37 bit_offset_ = start_offset % 8;
38 if (length > 0) {
39 current_byte_ = bitmap[byte_offset_];
40 }
41 }
42
43 bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
44
45 bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
46
47 void Next() {
48 ++bit_offset_;
49 ++position_;
50 if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
51 bit_offset_ = 0;
52 ++byte_offset_;
53 if (ARROW_PREDICT_TRUE(position_ < length_)) {
54 current_byte_ = bitmap_[byte_offset_];
55 }
56 }
57 }
58
59 int64_t position() const { return position_; }
60
61 int64_t length() const { return length_; }
62
63 private:
64 const uint8_t* bitmap_;
65 int64_t position_;
66 int64_t length_;
67
68 uint8_t current_byte_;
69 int64_t byte_offset_;
70 int64_t bit_offset_;
71};
72
73// XXX Cannot name it BitmapWordReader because the name is already used
74// in bitmap_ops.cc
75
76class BitmapUInt64Reader {
77 public:
78 BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
79 : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
80 num_carry_bits_(8 - start_offset % 8),
81 length_(length),
82 remaining_length_(length_) {
83 if (length_ > 0) {
84 // Load carry bits from the first byte's MSBs
85 if (length_ >= num_carry_bits_) {
86 carry_bits_ =
87 LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
88 } else {
89 carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
90 }
91 }
92 }
93
94 uint64_t NextWord() {
95 if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
96 // We can load a full word
97 uint64_t next_word = LoadFullWord();
98 // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
99 uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
100 carry_bits_ = next_word >> (64 - num_carry_bits_);
101 remaining_length_ -= 64;
102 return word;
103 } else if (remaining_length_ > num_carry_bits_) {
104 // We can load a partial word
105 uint64_t next_word =
106 LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
107 uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
108 carry_bits_ = next_word >> (64 - num_carry_bits_);
109 remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
110 return word;
111 } else {
112 remaining_length_ = 0;
113 return carry_bits_;
114 }
115 }
116
117 int64_t position() const { return length_ - remaining_length_; }
118
119 int64_t length() const { return length_; }
120
121 private:
122 uint64_t LoadFullWord() {
123 uint64_t word;
124 memcpy(&word, bitmap_, 8);
125 bitmap_ += 8;
126 return BitUtil::ToLittleEndian(word);
127 }
128
129 uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
130 uint64_t word = 0;
131 const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
132 memcpy(&word, bitmap_, num_bytes);
133 bitmap_ += num_bytes;
134 return (BitUtil::ToLittleEndian(word) >> bit_offset) &
135 BitUtil::LeastSignificantBitMask(num_bits);
136 }
137
138 const uint8_t* bitmap_;
139 const int64_t num_carry_bits_; // in [1, 8]
140 const int64_t length_;
141 int64_t remaining_length_;
142 uint64_t carry_bits_;
143};
144
145// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
146// on sufficiently large inputs. However, it has a larger prolog / epilog overhead
147// and should probably not be used for small bitmaps.
148
149template <typename Word, bool may_have_byte_offset = true>
150class BitmapWordReader {
151 public:
152 BitmapWordReader() = default;
153 BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
154 : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
155 bitmap_(bitmap + offset / 8),
156 bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
157 // decrement word count by one as we may touch two adjacent words in one iteration
158 nwords_ = length / (sizeof(Word) * 8) - 1;
159 if (nwords_ < 0) {
160 nwords_ = 0;
161 }
162 trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
163 trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
164
165 if (nwords_ > 0) {
166 current_data.word_ = load<Word>(bitmap_);
167 } else if (length > 0) {
168 current_data.epi.byte_ = load<uint8_t>(bitmap_);
169 }
170 }
171
172 Word NextWord() {
173 bitmap_ += sizeof(Word);
174 const Word next_word = load<Word>(bitmap_);
175 Word word = current_data.word_;
176 if (may_have_byte_offset && offset_) {
177 // combine two adjacent words into one word
178 // |<------ next ----->|<---- current ---->|
179 // +-------------+-----+-------------+-----+
180 // | --- | A | B | --- |
181 // +-------------+-----+-------------+-----+
182 // | | offset
183 // v v
184 // +-----+-------------+
185 // | A | B |
186 // +-----+-------------+
187 // |<------ word ----->|
188 word >>= offset_;
189 word |= next_word << (sizeof(Word) * 8 - offset_);
190 }
191 current_data.word_ = next_word;
192 return word;
193 }
194
195 uint8_t NextTrailingByte(int& valid_bits) {
196 uint8_t byte;
197 assert(trailing_bits_ > 0);
198
199 if (trailing_bits_ <= 8) {
200 // last byte
201 valid_bits = trailing_bits_;
202 trailing_bits_ = 0;
203 byte = 0;
204 internal::BitmapReader reader(bitmap_, offset_, valid_bits);
205 for (int i = 0; i < valid_bits; ++i) {
206 byte >>= 1;
207 if (reader.IsSet()) {
208 byte |= 0x80;
209 }
210 reader.Next();
211 }
212 byte >>= (8 - valid_bits);
213 } else {
214 ++bitmap_;
215 const uint8_t next_byte = load<uint8_t>(bitmap_);
216 byte = current_data.epi.byte_;
217 if (may_have_byte_offset && offset_) {
218 byte >>= offset_;
219 byte |= next_byte << (8 - offset_);
220 }
221 current_data.epi.byte_ = next_byte;
222 trailing_bits_ -= 8;
223 trailing_bytes_--;
224 valid_bits = 8;
225 }
226 return byte;
227 }
228
229 int64_t words() const { return nwords_; }
230 int trailing_bytes() const { return trailing_bytes_; }
231
232 private:
233 int64_t offset_;
234 const uint8_t* bitmap_;
235
236 const uint8_t* bitmap_end_;
237 int64_t nwords_;
238 int trailing_bits_;
239 int trailing_bytes_;
240 union {
241 Word word_;
242 struct {
243#if ARROW_LITTLE_ENDIAN == 0
244 uint8_t padding_bytes_[sizeof(Word) - 1];
245#endif
246 uint8_t byte_;
247 } epi;
248 } current_data;
249
250 template <typename DType>
251 DType load(const uint8_t* bitmap) {
252 assert(bitmap + sizeof(DType) <= bitmap_end_);
253 return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
254 }
255};
256
257/// \brief Index into a possibly non-existent bitmap
258struct OptionalBitIndexer {
259 const uint8_t* bitmap;
260 const int64_t offset;
261
262 explicit OptionalBitIndexer(const std::shared_ptr<Buffer>& buffer, int64_t offset = 0)
263 : bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {}
264
265 bool operator[](int64_t i) const {
266 return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i);
267 }
268};
269
270} // namespace internal
271} // namespace arrow