[ceph.git] / ceph / src / arrow / cpp / src / arrow / util / bitmap_reader.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <cstring>

#include "arrow/buffer.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/endian.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace internal {

class BitmapReader {
 public:
  BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
      : bitmap_(bitmap), position_(0), length_(length) {
    current_byte_ = 0;
    byte_offset_ = start_offset / 8;
    bit_offset_ = start_offset % 8;
    if (length > 0) {
      current_byte_ = bitmap[byte_offset_];
    }
  }

  bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }

  bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }

  void Next() {
    ++bit_offset_;
    ++position_;
    if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
      bit_offset_ = 0;
      ++byte_offset_;
      if (ARROW_PREDICT_TRUE(position_ < length_)) {
        current_byte_ = bitmap_[byte_offset_];
      }
    }
  }

  int64_t position() const { return position_; }

  int64_t length() const { return length_; }

 private:
  const uint8_t* bitmap_;
  int64_t position_;
  int64_t length_;

  uint8_t current_byte_;
  int64_t byte_offset_;
  int64_t bit_offset_;
};

// XXX Cannot name it BitmapWordReader because the name is already used
// in bitmap_ops.cc

class BitmapUInt64Reader {
 public:
  BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
      : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
        num_carry_bits_(8 - start_offset % 8),
        length_(length),
        remaining_length_(length_) {
    if (length_ > 0) {
      // Load carry bits from the first byte's MSBs
      if (length_ >= num_carry_bits_) {
        carry_bits_ =
            LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
      } else {
        carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
      }
    }
  }

  uint64_t NextWord() {
    if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
      // We can load a full word
      uint64_t next_word = LoadFullWord();
      // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
      uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
      carry_bits_ = next_word >> (64 - num_carry_bits_);
      remaining_length_ -= 64;
      return word;
    } else if (remaining_length_ > num_carry_bits_) {
      // We can load a partial word
      uint64_t next_word =
          LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
      uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
      carry_bits_ = next_word >> (64 - num_carry_bits_);
      remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
      return word;
    } else {
      remaining_length_ = 0;
      return carry_bits_;
    }
  }

  int64_t position() const { return length_ - remaining_length_; }

  int64_t length() const { return length_; }

 private:
  uint64_t LoadFullWord() {
    uint64_t word;
    memcpy(&word, bitmap_, 8);
    bitmap_ += 8;
    return BitUtil::ToLittleEndian(word);
  }

  uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
    uint64_t word = 0;
    const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
    memcpy(&word, bitmap_, num_bytes);
    bitmap_ += num_bytes;
    return (BitUtil::ToLittleEndian(word) >> bit_offset) &
           BitUtil::LeastSignificantBitMask(num_bits);
  }

  const uint8_t* bitmap_;
  const int64_t num_carry_bits_;  // in [1, 8]
  const int64_t length_;
  int64_t remaining_length_;
  uint64_t carry_bits_;
};

// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
// on sufficiently large inputs.  However, it has a larger prolog / epilog overhead
// and should probably not be used for small bitmaps.

template <typename Word, bool may_have_byte_offset = true>
class BitmapWordReader {
 public:
  BitmapWordReader() = default;
  BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
      : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
        bitmap_(bitmap + offset / 8),
        bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
    // decrement word count by one as we may touch two adjacent words in one iteration
    nwords_ = length / (sizeof(Word) * 8) - 1;
    if (nwords_ < 0) {
      nwords_ = 0;
    }
    trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
    trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));

    if (nwords_ > 0) {
      current_data.word_ = load<Word>(bitmap_);
    } else if (length > 0) {
      current_data.epi.byte_ = load<uint8_t>(bitmap_);
    }
  }

  Word NextWord() {
    bitmap_ += sizeof(Word);
    const Word next_word = load<Word>(bitmap_);
    Word word = current_data.word_;
    if (may_have_byte_offset && offset_) {
      // combine two adjacent words into one word
      // |<------ next ----->|<---- current ---->|
      // +-------------+-----+-------------+-----+
      // |     ---     |  A  |      B      | --- |
      // +-------------+-----+-------------+-----+
      //                  |         |       offset
      //                  v         v
      //               +-----+-------------+
      //               |  A  |      B      |
      //               +-----+-------------+
      //               |<------ word ----->|
      word >>= offset_;
      word |= next_word << (sizeof(Word) * 8 - offset_);
    }
    current_data.word_ = next_word;
    return word;
  }

  uint8_t NextTrailingByte(int& valid_bits) {
    uint8_t byte;
    assert(trailing_bits_ > 0);

    if (trailing_bits_ <= 8) {
      // last byte
      valid_bits = trailing_bits_;
      trailing_bits_ = 0;
      byte = 0;
      internal::BitmapReader reader(bitmap_, offset_, valid_bits);
      for (int i = 0; i < valid_bits; ++i) {
        byte >>= 1;
        if (reader.IsSet()) {
          byte |= 0x80;
        }
        reader.Next();
      }
      byte >>= (8 - valid_bits);
    } else {
      ++bitmap_;
      const uint8_t next_byte = load<uint8_t>(bitmap_);
      byte = current_data.epi.byte_;
      if (may_have_byte_offset && offset_) {
        byte >>= offset_;
        byte |= next_byte << (8 - offset_);
      }
      current_data.epi.byte_ = next_byte;
      trailing_bits_ -= 8;
      trailing_bytes_--;
      valid_bits = 8;
    }
    return byte;
  }

  int64_t words() const { return nwords_; }
  int trailing_bytes() const { return trailing_bytes_; }

 private:
  int64_t offset_;
  const uint8_t* bitmap_;

  const uint8_t* bitmap_end_;
  int64_t nwords_;
  int trailing_bits_;
  int trailing_bytes_;
  union {
    Word word_;
    struct {
#if ARROW_LITTLE_ENDIAN == 0
      uint8_t padding_bytes_[sizeof(Word) - 1];
#endif
      uint8_t byte_;
    } epi;
  } current_data;

  template <typename DType>
  DType load(const uint8_t* bitmap) {
    assert(bitmap + sizeof(DType) <= bitmap_end_);
    return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
  }
};

/// \brief Index into a possibly non-existent bitmap
struct OptionalBitIndexer {
  const uint8_t* bitmap;
  const int64_t offset;

  explicit OptionalBitIndexer(const std::shared_ptr<Buffer>& buffer, int64_t offset = 0)
      : bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {}

  bool operator[](int64_t i) const {
    return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i);
  }
};

}  // namespace internal
}  // namespace arrow
Commit	Line	Data
1d09f67e TL	1	// Licensed to the Apache Software Foundation (ASF) under one
	2	// or more contributor license agreements. See the NOTICE file
	3	// distributed with this work for additional information
	4	// regarding copyright ownership. The ASF licenses this file
	5	// to you under the Apache License, Version 2.0 (the
	6	// "License"); you may not use this file except in compliance
	7	// with the License. You may obtain a copy of the License at
	8	//
	9	// http://www.apache.org/licenses/LICENSE-2.0
	10	//
	11	// Unless required by applicable law or agreed to in writing,
	12	// software distributed under the License is distributed on an
	13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	14	// KIND, either express or implied. See the License for the
	15	// specific language governing permissions and limitations
	16	// under the License.
	17
	18	#pragma once
	19
	20	#include <cstdint>
	21	#include <cstring>
	22
	23	#include "arrow/buffer.h"
	24	#include "arrow/util/bit_util.h"
	25	#include "arrow/util/endian.h"
	26	#include "arrow/util/macros.h"
	27
	28	namespace arrow {
	29	namespace internal {
	30
	31	class BitmapReader {
	32	public:
	33	BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
	34	: bitmap_(bitmap), position_(0), length_(length) {
	35	current_byte_ = 0;
	36	byte_offset_ = start_offset / 8;
	37	bit_offset_ = start_offset % 8;
	38	if (length > 0) {
	39	current_byte_ = bitmap[byte_offset_];
	40	}
	41	}
	42
	43	bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
	44
	45	bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
	46
	47	void Next() {
	48	++bit_offset_;
	49	++position_;
	50	if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
	51	bit_offset_ = 0;
	52	++byte_offset_;
	53	if (ARROW_PREDICT_TRUE(position_ < length_)) {
	54	current_byte_ = bitmap_[byte_offset_];
	55	}
	56	}
	57	}
	58
	59	int64_t position() const { return position_; }
	60
	61	int64_t length() const { return length_; }
	62
	63	private:
	64	const uint8_t* bitmap_;
65	int64_t position_;
66	int64_t length_;
67
68	uint8_t current_byte_;
69	int64_t byte_offset_;
70	int64_t bit_offset_;
71	};
72
73	// XXX Cannot name it BitmapWordReader because the name is already used
74	// in bitmap_ops.cc
75
76	class BitmapUInt64Reader {
77	public:
78	BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
79	: bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
80	num_carry_bits_(8 - start_offset % 8),
81	length_(length),
82	remaining_length_(length_) {
83	if (length_ > 0) {
84	// Load carry bits from the first byte's MSBs
85	if (length_ >= num_carry_bits_) {
86	carry_bits_ =
87	LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
88	} else {
89	carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
90	}
91	}
92	}
93
94	uint64_t NextWord() {
95	if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
96	// We can load a full word
97	uint64_t next_word = LoadFullWord();
98	// Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
99	uint64_t word = carry_bits_ \| (next_word << num_carry_bits_);
100	carry_bits_ = next_word >> (64 - num_carry_bits_);
101	remaining_length_ -= 64;
102	return word;
103	} else if (remaining_length_ > num_carry_bits_) {
104	// We can load a partial word
105	uint64_t next_word =
106	LoadPartialWord(/bit_offset=/0, remaining_length_ - num_carry_bits_);
107	uint64_t word = carry_bits_ \| (next_word << num_carry_bits_);
108	carry_bits_ = next_word >> (64 - num_carry_bits_);
109	remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
110	return word;
111	} else {
112	remaining_length_ = 0;
113	return carry_bits_;
114	}
115	}
116
117	int64_t position() const { return length_ - remaining_length_; }
118
119	int64_t length() const { return length_; }
120
121	private:
122	uint64_t LoadFullWord() {
123	uint64_t word;
124	memcpy(&word, bitmap_, 8);
125	bitmap_ += 8;
126	return BitUtil::ToLittleEndian(word);
127	}
128
129	uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
130	uint64_t word = 0;
131	const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
132	memcpy(&word, bitmap_, num_bytes);
133	bitmap_ += num_bytes;
134	return (BitUtil::ToLittleEndian(word) >> bit_offset) &
135	BitUtil::LeastSignificantBitMask(num_bits);
136	}
137
138	const uint8_t* bitmap_;
139	const int64_t num_carry_bits_; // in [1, 8]
140	const int64_t length_;
141	int64_t remaining_length_;
142	uint64_t carry_bits_;
143	};
144
145	// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
146	// on sufficiently large inputs. However, it has a larger prolog / epilog overhead
147	// and should probably not be used for small bitmaps.
148
149	template <typename Word, bool may_have_byte_offset = true>
150	class BitmapWordReader {
151	public:
152	BitmapWordReader() = default;
153	BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
154	: offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
155	bitmap_(bitmap + offset / 8),
156	bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
157	// decrement word count by one as we may touch two adjacent words in one iteration
158	nwords_ = length / (sizeof(Word) * 8) - 1;
159	if (nwords_ < 0) {
160	nwords_ = 0;
161	}
162	trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
163	trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
164
165	if (nwords_ > 0) {
166	current_data.word_ = load<Word>(bitmap_);
167	} else if (length > 0) {
168	current_data.epi.byte_ = load<uint8_t>(bitmap_);
169	}
170	}
171
172	Word NextWord() {
173	bitmap_ += sizeof(Word);
174	const Word next_word = load<Word>(bitmap_);
175	Word word = current_data.word_;
176	if (may_have_byte_offset && offset_) {
177	// combine two adjacent words into one word
178	// \|<------ next ----->\|<---- current ---->\|
179	// +-------------+-----+-------------+-----+
180	// \| --- \| A \| B \| --- \|
181	// +-------------+-----+-------------+-----+
182	// \| \| offset
183	// v v
184	// +-----+-------------+
185	// \| A \| B \|
186	// +-----+-------------+
187	// \|<------ word ----->\|
188	word >>= offset_;
189	word \|= next_word << (sizeof(Word) * 8 - offset_);
190	}
191	current_data.word_ = next_word;
192	return word;
193	}
194
195	uint8_t NextTrailingByte(int& valid_bits) {
196	uint8_t byte;
197	assert(trailing_bits_ > 0);
198
199	if (trailing_bits_ <= 8) {
200	// last byte
201	valid_bits = trailing_bits_;
202	trailing_bits_ = 0;
203	byte = 0;
204	internal::BitmapReader reader(bitmap_, offset_, valid_bits);
205	for (int i = 0; i < valid_bits; ++i) {
206	byte >>= 1;
207	if (reader.IsSet()) {
208	byte \|= 0x80;
209	}
210	reader.Next();
211	}
212	byte >>= (8 - valid_bits);
213	} else {
214	++bitmap_;
215	const uint8_t next_byte = load<uint8_t>(bitmap_);
216	byte = current_data.epi.byte_;
217	if (may_have_byte_offset && offset_) {
218	byte >>= offset_;
219	byte \|= next_byte << (8 - offset_);
220	}
221	current_data.epi.byte_ = next_byte;
222	trailing_bits_ -= 8;
223	trailing_bytes_--;
224	valid_bits = 8;
225	}
226	return byte;
227	}
228
229	int64_t words() const { return nwords_; }
230	int trailing_bytes() const { return trailing_bytes_; }
231
232	private:
233	int64_t offset_;
234	const uint8_t* bitmap_;
235
236	const uint8_t* bitmap_end_;
237	int64_t nwords_;
238	int trailing_bits_;
239	int trailing_bytes_;
240	union {
241	Word word_;
242	struct {
243	#if ARROW_LITTLE_ENDIAN == 0
244	uint8_t padding_bytes_[sizeof(Word) - 1];
245	#endif
246	uint8_t byte_;
247	} epi;
248	} current_data;
249
250	template <typename DType>
251	DType load(const uint8_t* bitmap) {
252	assert(bitmap + sizeof(DType) <= bitmap_end_);
253	return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
254	}
255	};
256
257	/// \brief Index into a possibly non-existent bitmap
258	struct OptionalBitIndexer {
259	const uint8_t* bitmap;
260	const int64_t offset;
261
262	explicit OptionalBitIndexer(const std::shared_ptr<Buffer>& buffer, int64_t offset = 0)
263	: bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {}
264
265	bool operator[](int64_t i) const {
266	return bitmap == NULLPTR \|\| BitUtil::GetBit(bitmap, offset + i);
267	}
268	};
269
270	} // namespace internal
271	} // namespace arrow