]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/util/bitmap_writer.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / util / bitmap_writer.h
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #pragma once
19
20 #include <cstdint>
21 #include <cstring>
22
23 #include "arrow/util/bit_util.h"
24 #include "arrow/util/endian.h"
25 #include "arrow/util/macros.h"
26
27 namespace arrow {
28 namespace internal {
29
30 class BitmapWriter {
31 // A sequential bitwise writer that preserves surrounding bit values.
32
33 public:
34 BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
35 : bitmap_(bitmap), position_(0), length_(length) {
36 byte_offset_ = start_offset / 8;
37 bit_mask_ = BitUtil::kBitmask[start_offset % 8];
38 if (length > 0) {
39 current_byte_ = bitmap[byte_offset_];
40 } else {
41 current_byte_ = 0;
42 }
43 }
44
45 void Set() { current_byte_ |= bit_mask_; }
46
47 void Clear() { current_byte_ &= bit_mask_ ^ 0xFF; }
48
49 void Next() {
50 bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
51 ++position_;
52 if (bit_mask_ == 0) {
53 // Finished this byte, need advancing
54 bit_mask_ = 0x01;
55 bitmap_[byte_offset_++] = current_byte_;
56 if (ARROW_PREDICT_TRUE(position_ < length_)) {
57 current_byte_ = bitmap_[byte_offset_];
58 }
59 }
60 }
61
62 void Finish() {
63 // Store current byte if we didn't went past bitmap storage
64 if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
65 bitmap_[byte_offset_] = current_byte_;
66 }
67 }
68
69 int64_t position() const { return position_; }
70
71 private:
72 uint8_t* bitmap_;
73 int64_t position_;
74 int64_t length_;
75
76 uint8_t current_byte_;
77 uint8_t bit_mask_;
78 int64_t byte_offset_;
79 };
80
81 class FirstTimeBitmapWriter {
82 // Like BitmapWriter, but any bit values *following* the bits written
83 // might be clobbered. It is hence faster than BitmapWriter, and can
84 // also avoid false positives with Valgrind.
85
86 public:
87 FirstTimeBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
88 : bitmap_(bitmap), position_(0), length_(length) {
89 current_byte_ = 0;
90 byte_offset_ = start_offset / 8;
91 bit_mask_ = BitUtil::kBitmask[start_offset % 8];
92 if (length > 0) {
93 current_byte_ = bitmap[byte_offset_] & BitUtil::kPrecedingBitmask[start_offset % 8];
94 } else {
95 current_byte_ = 0;
96 }
97 }
98
99 /// Appends number_of_bits from word to valid_bits and valid_bits_offset.
100 ///
101 /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed
102 /// to be unset (i.e. 0).
103 /// \param[in] number_of_bits The number of bits to append from word.
104 void AppendWord(uint64_t word, int64_t number_of_bits) {
105 if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
106 return;
107 }
108
109 // Location that the first byte needs to be written to.
110 uint8_t* append_position = bitmap_ + byte_offset_;
111
112 // Update state variables except for current_byte_ here.
113 position_ += number_of_bits;
114 int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
115 bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8];
116 byte_offset_ += (bit_offset + number_of_bits) / 8;
117
118 if (bit_offset != 0) {
119 // We are in the middle of the byte. This code updates the byte and shifts
120 // bits appropriately within word so it can be memcpy'd below.
121 int64_t bits_to_carry = 8 - bit_offset;
122 // Carry over bits from word to current_byte_. We assume any extra bits in word
123 // unset so no additional accounting is needed for when number_of_bits <
124 // bits_to_carry.
125 current_byte_ |= (word & BitUtil::kPrecedingBitmask[bits_to_carry]) << bit_offset;
126 // Check if everything is transfered into current_byte_.
127 if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
128 return;
129 }
130 *append_position = current_byte_;
131 append_position++;
132 // Move the carry bits off of word.
133 word = word >> bits_to_carry;
134 number_of_bits -= bits_to_carry;
135 }
136 word = BitUtil::ToLittleEndian(word);
137 int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits);
138 std::memcpy(append_position, &word, bytes_for_word);
139 // At this point, the previous current_byte_ has been written to bitmap_.
140 // The new current_byte_ is either the last relevant byte in 'word'
141 // or cleared if the new position is byte aligned (i.e. a fresh byte).
142 if (bit_mask_ == 0x1) {
143 current_byte_ = 0;
144 } else {
145 current_byte_ = *(append_position + bytes_for_word - 1);
146 }
147 }
148
149 void Set() { current_byte_ |= bit_mask_; }
150
151 void Clear() {}
152
153 void Next() {
154 bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
155 ++position_;
156 if (bit_mask_ == 0) {
157 // Finished this byte, need advancing
158 bit_mask_ = 0x01;
159 bitmap_[byte_offset_++] = current_byte_;
160 current_byte_ = 0;
161 }
162 }
163
164 void Finish() {
165 // Store current byte if we didn't went go bitmap storage
166 if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
167 bitmap_[byte_offset_] = current_byte_;
168 }
169 }
170
171 int64_t position() const { return position_; }
172
173 private:
174 uint8_t* bitmap_;
175 int64_t position_;
176 int64_t length_;
177
178 uint8_t current_byte_;
179 uint8_t bit_mask_;
180 int64_t byte_offset_;
181 };
182
183 template <typename Word, bool may_have_byte_offset = true>
184 class BitmapWordWriter {
185 public:
186 BitmapWordWriter() = default;
187 BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
188 : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
189 bitmap_(bitmap + offset / 8),
190 bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)),
191 mask_((1U << offset_) - 1) {
192 if (offset_) {
193 if (length >= static_cast<int>(sizeof(Word) * 8)) {
194 current_data.word_ = load<Word>(bitmap_);
195 } else if (length > 0) {
196 current_data.epi.byte_ = load<uint8_t>(bitmap_);
197 }
198 }
199 }
200
201 void PutNextWord(Word word) {
202 if (may_have_byte_offset && offset_) {
203 // split one word into two adjacent words, don't touch unused bits
204 // |<------ word ----->|
205 // +-----+-------------+
206 // | A | B |
207 // +-----+-------------+
208 // | |
209 // v v offset
210 // +-------------+-----+-------------+-----+
211 // | --- | A | B | --- |
212 // +-------------+-----+-------------+-----+
213 // |<------ next ----->|<---- current ---->|
214 word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
215 Word next_word = load<Word>(bitmap_ + sizeof(Word));
216 current_data.word_ = (current_data.word_ & mask_) | (word & ~mask_);
217 next_word = (next_word & ~mask_) | (word & mask_);
218 store<Word>(bitmap_, current_data.word_);
219 store<Word>(bitmap_ + sizeof(Word), next_word);
220 current_data.word_ = next_word;
221 } else {
222 store<Word>(bitmap_, word);
223 }
224 bitmap_ += sizeof(Word);
225 }
226
227 void PutNextTrailingByte(uint8_t byte, int valid_bits) {
228 if (valid_bits == 8) {
229 if (may_have_byte_offset && offset_) {
230 byte = (byte << offset_) | (byte >> (8 - offset_));
231 uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
232 current_data.epi.byte_ = (current_data.epi.byte_ & mask_) | (byte & ~mask_);
233 next_byte = (next_byte & ~mask_) | (byte & mask_);
234 store<uint8_t>(bitmap_, current_data.epi.byte_);
235 store<uint8_t>(bitmap_ + 1, next_byte);
236 current_data.epi.byte_ = next_byte;
237 } else {
238 store<uint8_t>(bitmap_, byte);
239 }
240 ++bitmap_;
241 } else {
242 assert(valid_bits > 0);
243 assert(valid_bits < 8);
244 assert(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
245 internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
246 for (int i = 0; i < valid_bits; ++i) {
247 (byte & 0x01) ? writer.Set() : writer.Clear();
248 writer.Next();
249 byte >>= 1;
250 }
251 writer.Finish();
252 }
253 }
254
255 private:
256 int64_t offset_;
257 uint8_t* bitmap_;
258
259 const uint8_t* bitmap_end_;
260 uint64_t mask_;
261 union {
262 Word word_;
263 struct {
264 #if ARROW_LITTLE_ENDIAN == 0
265 uint8_t padding_bytes_[sizeof(Word) - 1];
266 #endif
267 uint8_t byte_;
268 } epi;
269 } current_data;
270
271 template <typename DType>
272 DType load(const uint8_t* bitmap) {
273 assert(bitmap + sizeof(DType) <= bitmap_end_);
274 return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
275 }
276
277 template <typename DType>
278 void store(uint8_t* bitmap, DType data) {
279 assert(bitmap + sizeof(DType) <= bitmap_end_);
280 util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
281 }
282 };
283
284 } // namespace internal
285 } // namespace arrow