--- /dev/null
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/int_util.h"
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+
+#include "arrow/array/data.h"
+#include "arrow/datum.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+using internal::checked_cast;
+
+static constexpr uint64_t max_uint8 =
+ static_cast<uint64_t>(std::numeric_limits<uint8_t>::max());
+static constexpr uint64_t max_uint16 =
+ static_cast<uint64_t>(std::numeric_limits<uint16_t>::max());
+static constexpr uint64_t max_uint32 =
+ static_cast<uint64_t>(std::numeric_limits<uint32_t>::max());
+static constexpr uint64_t max_uint64 = std::numeric_limits<uint64_t>::max();
+
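+// A nonzero result after AND-ing a value with one of these masks means the
+// value does not fit in 8, 16 or 32 bits, respectively.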
+static constexpr uint64_t mask_uint8 = ~0xffULL;
+static constexpr uint64_t mask_uint16 = ~0xffffULL;
+static constexpr uint64_t mask_uint32 = ~0xffffffffULL;
+
+//
+// Unsigned integer width detection
+//
+
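+// Maximum representable value, indexed by byte width (1, 2, 4 and 8); the
+// entries at unused indices are zero.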
+static const uint64_t max_uints[] = {0, max_uint8, max_uint16, 0, max_uint32,
+ 0, 0, 0, max_uint64};
+
+// Return the byte width needed to represent `val`, expanding `current_width`
+// if necessary.
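+// For example, ExpandedUIntWidth(300, /*current_width=*/1) returns 2, since
+// 300 exceeds max_uint8 but fits in a uint16.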
+static inline uint8_t ExpandedUIntWidth(uint64_t val, uint8_t current_width) {
+ // Optimize for the common case where width doesn't change
+ if (ARROW_PREDICT_TRUE(val <= max_uints[current_width])) {
+ return current_width;
+ }
+ if (current_width == 1 && val <= max_uint8) {
+ return 1;
+ } else if (current_width <= 2 && val <= max_uint16) {
+ return 2;
+ } else if (current_width <= 4 && val <= max_uint32) {
+ return 4;
+ } else {
+ return 8;
+ }
+}
+
+uint8_t DetectUIntWidth(const uint64_t* values, int64_t length, uint8_t min_width) {
+ uint8_t width = min_width;
+ if (min_width < 8) {
+ auto p = values;
+ const auto end = p + length;
+ while (p <= end - 16) {
+ // This is probably SIMD-izable
+ auto u = p[0];
+ auto v = p[1];
+ auto w = p[2];
+ auto x = p[3];
+ u |= p[4];
+ v |= p[5];
+ w |= p[6];
+ x |= p[7];
+ u |= p[8];
+ v |= p[9];
+ w |= p[10];
+ x |= p[11];
+ u |= p[12];
+ v |= p[13];
+ w |= p[14];
+ x |= p[15];
+ p += 16;
+ width = ExpandedUIntWidth(u | v | w | x, width);
+ if (ARROW_PREDICT_FALSE(width == 8)) {
+ break;
+ }
+ }
+ if (p <= end - 8) {
+ auto u = p[0];
+ auto v = p[1];
+ auto w = p[2];
+ auto x = p[3];
+ u |= p[4];
+ v |= p[5];
+ w |= p[6];
+ x |= p[7];
+ p += 8;
+ width = ExpandedUIntWidth(u | v | w | x, width);
+ }
+ while (p < end) {
+ width = ExpandedUIntWidth(*p++, width);
+ }
+ }
+ return width;
+}
+
+uint8_t DetectUIntWidth(const uint64_t* values, const uint8_t* valid_bytes,
+ int64_t length, uint8_t min_width) {
+ if (valid_bytes == nullptr) {
+ return DetectUIntWidth(values, length, min_width);
+ }
+ uint8_t width = min_width;
+ if (min_width < 8) {
+ auto p = values;
+ const auto end = p + length;
+ auto b = valid_bytes;
+
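+// MASK zeroes the i-th value when its validity byte is 0, so null slots
+// cannot widen the detected width.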
+#define MASK(p, b, i) p[i] * (b[i] != 0)
+
+ while (p <= end - 8) {
+      // This is probably SIMD-izable
+ auto u = MASK(p, b, 0);
+ auto v = MASK(p, b, 1);
+ auto w = MASK(p, b, 2);
+ auto x = MASK(p, b, 3);
+ u |= MASK(p, b, 4);
+ v |= MASK(p, b, 5);
+ w |= MASK(p, b, 6);
+ x |= MASK(p, b, 7);
+ b += 8;
+ p += 8;
+ width = ExpandedUIntWidth(u | v | w | x, width);
+ if (ARROW_PREDICT_FALSE(width == 8)) {
+ break;
+ }
+ }
+ uint64_t mask = 0;
+ while (p < end) {
+ mask |= MASK(p, b, 0);
+ ++b;
+ ++p;
+ }
+ width = ExpandedUIntWidth(mask, width);
+
+#undef MASK
+ }
+ return width;
+}
+
+//
+// Signed integer width detection
+//
+
+uint8_t DetectIntWidth(const int64_t* values, int64_t length, uint8_t min_width) {
+ if (min_width == 8) {
+ return min_width;
+ }
+ uint8_t width = min_width;
+
+ auto p = values;
+ const auto end = p + length;
+ // Strategy: to determine whether `x` is between -0x80 and 0x7f,
+ // we determine whether `x + 0x80` is between 0x00 and 0xff. The
+ // latter can be done with a simple AND mask with ~0xff and, more
+ // importantly, can be computed in a single step over multiple ORed
+ // values (so we can branch once every N items instead of once every item).
+ // This strategy could probably lend itself to explicit SIMD-ization,
+ // if more performance is needed.
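+  // For example (arithmetic done on 64-bit unsigned values):
+  //   x = 0x7f  -> x + 0x80 = 0xff   -> (& ~0xff) == 0  (fits in 8 bits)
+  //   x = -0x80 -> x + 0x80 = 0x00   -> (& ~0xff) == 0  (fits in 8 bits)
+  //   x = 0x80  -> x + 0x80 = 0x100  -> (& ~0xff) != 0  (needs 16 bits)
+  //   x = -0x81 -> x + 0x80 wraps to all ones -> (& ~0xff) != 0 (needs 16 bits)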
+ constexpr uint64_t addend8 = 0x80ULL;
+ constexpr uint64_t addend16 = 0x8000ULL;
+ constexpr uint64_t addend32 = 0x80000000ULL;
+
+ auto test_one_item = [&](uint64_t addend, uint64_t test_mask) -> bool {
+ auto v = *p++;
+ if (ARROW_PREDICT_FALSE(((v + addend) & test_mask) != 0)) {
+ --p;
+ return false;
+ } else {
+ return true;
+ }
+ };
+
+ auto test_four_items = [&](uint64_t addend, uint64_t test_mask) -> bool {
+ auto mask = (p[0] + addend) | (p[1] + addend) | (p[2] + addend) | (p[3] + addend);
+ p += 4;
+ if (ARROW_PREDICT_FALSE((mask & test_mask) != 0)) {
+ p -= 4;
+ return false;
+ } else {
+ return true;
+ }
+ };
+
+ if (width == 1) {
+ while (p <= end - 4) {
+ if (!test_four_items(addend8, mask_uint8)) {
+ width = 2;
+ goto width2;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend8, mask_uint8)) {
+ width = 2;
+ goto width2;
+ }
+ }
+ return 1;
+ }
+width2:
+ if (width == 2) {
+ while (p <= end - 4) {
+ if (!test_four_items(addend16, mask_uint16)) {
+ width = 4;
+ goto width4;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend16, mask_uint16)) {
+ width = 4;
+ goto width4;
+ }
+ }
+ return 2;
+ }
+width4:
+ if (width == 4) {
+ while (p <= end - 4) {
+ if (!test_four_items(addend32, mask_uint32)) {
+ width = 8;
+ goto width8;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend32, mask_uint32)) {
+ width = 8;
+ goto width8;
+ }
+ }
+ return 4;
+ }
+width8:
+ return 8;
+}
+
+uint8_t DetectIntWidth(const int64_t* values, const uint8_t* valid_bytes, int64_t length,
+ uint8_t min_width) {
+ if (valid_bytes == nullptr) {
+ return DetectIntWidth(values, length, min_width);
+ }
+
+ if (min_width == 8) {
+ return min_width;
+ }
+ uint8_t width = min_width;
+
+ auto p = values;
+ const auto end = p + length;
+ auto b = valid_bytes;
+ // Strategy is similar to the no-nulls case above, but we also
+ // have to zero any incoming items that have a zero validity byte.
+ constexpr uint64_t addend8 = 0x80ULL;
+ constexpr uint64_t addend16 = 0x8000ULL;
+ constexpr uint64_t addend32 = 0x80000000ULL;
+
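+// MASK biases the i-th value by `addend` and zeroes it when its validity byte
+// is 0, so null slots cannot widen the detected width.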
+#define MASK(p, b, addend, i) (p[i] + addend) * (b[i] != 0)
+
+ auto test_one_item = [&](uint64_t addend, uint64_t test_mask) -> bool {
+ auto v = MASK(p, b, addend, 0);
+ ++b;
+ ++p;
+ if (ARROW_PREDICT_FALSE((v & test_mask) != 0)) {
+ --b;
+ --p;
+ return false;
+ } else {
+ return true;
+ }
+ };
+
+ auto test_eight_items = [&](uint64_t addend, uint64_t test_mask) -> bool {
+ auto mask1 = MASK(p, b, addend, 0) | MASK(p, b, addend, 1) | MASK(p, b, addend, 2) |
+ MASK(p, b, addend, 3);
+ auto mask2 = MASK(p, b, addend, 4) | MASK(p, b, addend, 5) | MASK(p, b, addend, 6) |
+ MASK(p, b, addend, 7);
+ b += 8;
+ p += 8;
+ if (ARROW_PREDICT_FALSE(((mask1 | mask2) & test_mask) != 0)) {
+ b -= 8;
+ p -= 8;
+ return false;
+ } else {
+ return true;
+ }
+ };
+
+#undef MASK
+
+ if (width == 1) {
+ while (p <= end - 8) {
+ if (!test_eight_items(addend8, mask_uint8)) {
+ width = 2;
+ goto width2;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend8, mask_uint8)) {
+ width = 2;
+ goto width2;
+ }
+ }
+ return 1;
+ }
+width2:
+ if (width == 2) {
+ while (p <= end - 8) {
+ if (!test_eight_items(addend16, mask_uint16)) {
+ width = 4;
+ goto width4;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend16, mask_uint16)) {
+ width = 4;
+ goto width4;
+ }
+ }
+ return 2;
+ }
+width4:
+ if (width == 4) {
+ while (p <= end - 8) {
+ if (!test_eight_items(addend32, mask_uint32)) {
+ width = 8;
+ goto width8;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend32, mask_uint32)) {
+ width = 8;
+ goto width8;
+ }
+ }
+ return 4;
+ }
+width8:
+ return 8;
+}
+
+template <typename Source, typename Dest>
+static inline void CastIntsInternal(const Source* src, Dest* dest, int64_t length) {
+ while (length >= 4) {
+ dest[0] = static_cast<Dest>(src[0]);
+ dest[1] = static_cast<Dest>(src[1]);
+ dest[2] = static_cast<Dest>(src[2]);
+ dest[3] = static_cast<Dest>(src[3]);
+ length -= 4;
+ src += 4;
+ dest += 4;
+ }
+ while (length > 0) {
+ *dest++ = static_cast<Dest>(*src++);
+ --length;
+ }
+}
+
+void DowncastInts(const int64_t* source, int8_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastInts(const int64_t* source, int16_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastInts(const int64_t* source, int32_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastInts(const int64_t* source, int64_t* dest, int64_t length) {
+ memcpy(dest, source, length * sizeof(int64_t));
+}
+
+void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) {
+  memcpy(dest, source, length * sizeof(uint64_t));
+}
+
+void UpcastInts(const int32_t* source, int64_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
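+// Remap each integer through `transpose_map`: dest[i] = transpose_map[src[i]].
+// For example, with transpose_map = {2, 0, 1}, src = {0, 1, 2, 1} produces
+// dest = {2, 0, 1, 0}.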
+template <typename InputInt, typename OutputInt>
+void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length,
+ const int32_t* transpose_map) {
+ while (length >= 4) {
+ dest[0] = static_cast<OutputInt>(transpose_map[src[0]]);
+ dest[1] = static_cast<OutputInt>(transpose_map[src[1]]);
+ dest[2] = static_cast<OutputInt>(transpose_map[src[2]]);
+ dest[3] = static_cast<OutputInt>(transpose_map[src[3]]);
+ length -= 4;
+ src += 4;
+ dest += 4;
+ }
+ while (length > 0) {
+ *dest++ = static_cast<OutputInt>(transpose_map[*src++]);
+ --length;
+ }
+}
+
+#define INSTANTIATE(SRC, DEST) \
+ template ARROW_EXPORT void TransposeInts( \
+ const SRC* source, DEST* dest, int64_t length, const int32_t* transpose_map);
+
+#define INSTANTIATE_ALL_DEST(DEST) \
+ INSTANTIATE(uint8_t, DEST) \
+ INSTANTIATE(int8_t, DEST) \
+ INSTANTIATE(uint16_t, DEST) \
+ INSTANTIATE(int16_t, DEST) \
+ INSTANTIATE(uint32_t, DEST) \
+ INSTANTIATE(int32_t, DEST) \
+ INSTANTIATE(uint64_t, DEST) \
+ INSTANTIATE(int64_t, DEST)
+
+#define INSTANTIATE_ALL() \
+ INSTANTIATE_ALL_DEST(uint8_t) \
+ INSTANTIATE_ALL_DEST(int8_t) \
+ INSTANTIATE_ALL_DEST(uint16_t) \
+ INSTANTIATE_ALL_DEST(int16_t) \
+ INSTANTIATE_ALL_DEST(uint32_t) \
+ INSTANTIATE_ALL_DEST(int32_t) \
+ INSTANTIATE_ALL_DEST(uint64_t) \
+ INSTANTIATE_ALL_DEST(int64_t)
+
+INSTANTIATE_ALL()
+
+#undef INSTANTIATE
+#undef INSTANTIATE_ALL
+#undef INSTANTIATE_ALL_DEST
+
+namespace {
+
+template <typename SrcType>
+struct TransposeIntsDest {
+ const SrcType* src;
+ uint8_t* dest;
+ int64_t dest_offset;
+ int64_t length;
+ const int32_t* transpose_map;
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T&) {
+ using DestType = typename T::c_type;
+ TransposeInts(src, reinterpret_cast<DestType*>(dest) + dest_offset, length,
+ transpose_map);
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("TransposeInts received non-integer dest_type");
+ }
+
+ Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
+};
+
+struct TransposeIntsSrc {
+ const uint8_t* src;
+ uint8_t* dest;
+ int64_t src_offset;
+ int64_t dest_offset;
+ int64_t length;
+ const int32_t* transpose_map;
+ const DataType& dest_type;
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T&) {
+ using SrcType = typename T::c_type;
+ return TransposeIntsDest<SrcType>{reinterpret_cast<const SrcType*>(src) + src_offset,
+ dest, dest_offset, length,
+ transpose_map}(dest_type);
+ }
+
+ Status Visit(const DataType& type) {
+    return Status::TypeError("TransposeInts received non-integer src_type");
+ }
+
+ Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
+};
+
+}  // namespace
+
+Status TransposeInts(const DataType& src_type, const DataType& dest_type,
+ const uint8_t* src, uint8_t* dest, int64_t src_offset,
+ int64_t dest_offset, int64_t length, const int32_t* transpose_map) {
+ TransposeIntsSrc transposer{src, dest, src_offset, dest_offset,
+ length, transpose_map, dest_type};
+ return transposer(src_type);
+}
+
+template <typename T>
+static std::string FormatInt(T val) {
+ return std::to_string(val);
+}
+
+template <typename IndexCType, bool IsSigned = std::is_signed<IndexCType>::value>
+static Status CheckIndexBoundsImpl(const ArrayData& indices, uint64_t upper_limit) {
+  // For unsigned integers, if the length of the values array exceeds the
+  // maximum representable index value (especially for UINT8 / UINT16), then
+  // there is no need to boundscheck.
+ if (!IsSigned &&
+ upper_limit > static_cast<uint64_t>(std::numeric_limits<IndexCType>::max())) {
+ return Status::OK();
+ }
+
+ const IndexCType* indices_data = indices.GetValues<IndexCType>(1);
+ const uint8_t* bitmap = nullptr;
+ if (indices.buffers[0]) {
+ bitmap = indices.buffers[0]->data();
+ }
+ auto IsOutOfBounds = [&](IndexCType val) -> bool {
+ return ((IsSigned && val < 0) ||
+ (val >= 0 && static_cast<uint64_t>(val) >= upper_limit));
+ };
+ return VisitSetBitRuns(
+ bitmap, indices.offset, indices.length, [&](int64_t offset, int64_t length) {
+ bool block_out_of_bounds = false;
+ for (int64_t i = 0; i < length; ++i) {
+ block_out_of_bounds |= IsOutOfBounds(indices_data[offset + i]);
+ }
+ if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
+ for (int64_t i = 0; i < length; ++i) {
+ if (IsOutOfBounds(indices_data[offset + i])) {
+ return Status::IndexError("Index ", FormatInt(indices_data[offset + i]),
+ " out of bounds");
+ }
+ }
+ }
+ return Status::OK();
+ });
+}
+
+/// \brief Branchless boundschecking of the indices. Processes batches of
+/// indices at a time and short-circuits when encountering an out-of-bounds
+/// index in a batch.
+Status CheckIndexBounds(const ArrayData& indices, uint64_t upper_limit) {
+ switch (indices.type->id()) {
+ case Type::INT8:
+ return CheckIndexBoundsImpl<int8_t>(indices, upper_limit);
+ case Type::INT16:
+ return CheckIndexBoundsImpl<int16_t>(indices, upper_limit);
+ case Type::INT32:
+ return CheckIndexBoundsImpl<int32_t>(indices, upper_limit);
+ case Type::INT64:
+ return CheckIndexBoundsImpl<int64_t>(indices, upper_limit);
+ case Type::UINT8:
+ return CheckIndexBoundsImpl<uint8_t>(indices, upper_limit);
+ case Type::UINT16:
+ return CheckIndexBoundsImpl<uint16_t>(indices, upper_limit);
+ case Type::UINT32:
+ return CheckIndexBoundsImpl<uint32_t>(indices, upper_limit);
+ case Type::UINT64:
+ return CheckIndexBoundsImpl<uint64_t>(indices, upper_limit);
+ default:
+ return Status::Invalid("Invalid index type for boundschecking");
+ }
+}
+
+// ----------------------------------------------------------------------
+// Utilities for casting from one integer type to another
+
+namespace {
+
+template <typename InType, typename CType = typename InType::c_type>
+Status IntegersInRange(const Datum& datum, CType bound_lower, CType bound_upper) {
+ if (std::numeric_limits<CType>::lowest() >= bound_lower &&
+ std::numeric_limits<CType>::max() <= bound_upper) {
+ return Status::OK();
+ }
+
+ auto IsOutOfBounds = [&](CType val) -> bool {
+ return val < bound_lower || val > bound_upper;
+ };
+ auto IsOutOfBoundsMaybeNull = [&](CType val, bool is_valid) -> bool {
+ return is_valid && (val < bound_lower || val > bound_upper);
+ };
+ auto GetErrorMessage = [&](CType val) {
+ return Status::Invalid("Integer value ", FormatInt(val),
+ " not in range: ", FormatInt(bound_lower), " to ",
+ FormatInt(bound_upper));
+ };
+
+ if (datum.kind() == Datum::SCALAR) {
+ const auto& scalar = datum.scalar_as<typename TypeTraits<InType>::ScalarType>();
+ if (IsOutOfBoundsMaybeNull(scalar.value, scalar.is_valid)) {
+ return GetErrorMessage(scalar.value);
+ }
+ return Status::OK();
+ }
+
+ const ArrayData& indices = *datum.array();
+ const CType* indices_data = indices.GetValues<CType>(1);
+ const uint8_t* bitmap = nullptr;
+ if (indices.buffers[0]) {
+ bitmap = indices.buffers[0]->data();
+ }
+ OptionalBitBlockCounter indices_bit_counter(bitmap, indices.offset, indices.length);
+ int64_t position = 0;
+ int64_t offset_position = indices.offset;
+ while (position < indices.length) {
+ BitBlockCount block = indices_bit_counter.NextBlock();
+ bool block_out_of_bounds = false;
+ if (block.popcount == block.length) {
+ // Fast path: branchless
+ int64_t i = 0;
+ for (int64_t chunk = 0; chunk < block.length / 8; ++chunk) {
+ // Let the compiler unroll this
+ for (int j = 0; j < 8; ++j) {
+ block_out_of_bounds |= IsOutOfBounds(indices_data[i++]);
+ }
+ }
+ for (; i < block.length; ++i) {
+ block_out_of_bounds |= IsOutOfBounds(indices_data[i]);
+ }
+ } else if (block.popcount > 0) {
+ // Indices have nulls, must only boundscheck non-null values
+ int64_t i = 0;
+ for (int64_t chunk = 0; chunk < block.length / 8; ++chunk) {
+ // Let the compiler unroll this
+ for (int j = 0; j < 8; ++j) {
+ block_out_of_bounds |= IsOutOfBoundsMaybeNull(
+ indices_data[i], BitUtil::GetBit(bitmap, offset_position + i));
+ ++i;
+ }
+ }
+ for (; i < block.length; ++i) {
+ block_out_of_bounds |= IsOutOfBoundsMaybeNull(
+ indices_data[i], BitUtil::GetBit(bitmap, offset_position + i));
+ }
+ }
+ if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
+ if (indices.GetNullCount() > 0) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (IsOutOfBoundsMaybeNull(indices_data[i],
+ BitUtil::GetBit(bitmap, offset_position + i))) {
+ return GetErrorMessage(indices_data[i]);
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (IsOutOfBounds(indices_data[i])) {
+ return GetErrorMessage(indices_data[i]);
+ }
+ }
+ }
+ }
+ indices_data += block.length;
+ position += block.length;
+ offset_position += block.length;
+ }
+ return Status::OK();
+}
+
+template <typename Type>
+Status CheckIntegersInRangeImpl(const Datum& datum, const Scalar& bound_lower,
+ const Scalar& bound_upper) {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ return IntegersInRange<Type>(datum, checked_cast<const ScalarType&>(bound_lower).value,
+ checked_cast<const ScalarType&>(bound_upper).value);
+}
+
+} // namespace
+
+Status CheckIntegersInRange(const Datum& datum, const Scalar& bound_lower,
+ const Scalar& bound_upper) {
+ Type::type type_id = datum.type()->id();
+
+ if (bound_lower.type->id() != type_id || bound_upper.type->id() != type_id ||
+ !bound_lower.is_valid || !bound_upper.is_valid) {
+ return Status::Invalid("Scalar bound types must be non-null and same type as data");
+ }
+
+ switch (type_id) {
+ case Type::INT8:
+ return CheckIntegersInRangeImpl<Int8Type>(datum, bound_lower, bound_upper);
+ case Type::INT16:
+ return CheckIntegersInRangeImpl<Int16Type>(datum, bound_lower, bound_upper);
+ case Type::INT32:
+ return CheckIntegersInRangeImpl<Int32Type>(datum, bound_lower, bound_upper);
+ case Type::INT64:
+ return CheckIntegersInRangeImpl<Int64Type>(datum, bound_lower, bound_upper);
+ case Type::UINT8:
+ return CheckIntegersInRangeImpl<UInt8Type>(datum, bound_lower, bound_upper);
+ case Type::UINT16:
+ return CheckIntegersInRangeImpl<UInt16Type>(datum, bound_lower, bound_upper);
+ case Type::UINT32:
+ return CheckIntegersInRangeImpl<UInt32Type>(datum, bound_lower, bound_upper);
+ case Type::UINT64:
+ return CheckIntegersInRangeImpl<UInt64Type>(datum, bound_lower, bound_upper);
+ default:
+ return Status::TypeError("Invalid index type for boundschecking");
+ }
+}
+
+namespace {
+
+template <typename O, typename I, typename Enable = void>
+struct is_number_downcast {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_number_downcast<
+ O, I, enable_if_t<is_number_type<O>::value && is_number_type<I>::value>> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ ((!std::is_same<O, I>::value) &&
+ // Both types are of the same sign-ness.
+ ((std::is_signed<O_T>::value == std::is_signed<I_T>::value) &&
+ // Both types are of the same integral-ness.
+ (std::is_floating_point<O_T>::value == std::is_floating_point<I_T>::value)) &&
+ // Smaller output size
+ (sizeof(O_T) < sizeof(I_T)));
+};
+
+template <typename O, typename I, typename Enable = void>
+struct is_number_upcast {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_number_upcast<
+ O, I, enable_if_t<is_number_type<O>::value && is_number_type<I>::value>> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ ((!std::is_same<O, I>::value) &&
+ // Both types are of the same sign-ness.
+ ((std::is_signed<O_T>::value == std::is_signed<I_T>::value) &&
+ // Both types are of the same integral-ness.
+ (std::is_floating_point<O_T>::value == std::is_floating_point<I_T>::value)) &&
+ // Larger output size
+ (sizeof(O_T) > sizeof(I_T)));
+};
+
+template <typename O, typename I, typename Enable = void>
+struct is_integral_signed_to_unsigned {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_integral_signed_to_unsigned<
+ O, I, enable_if_t<is_integer_type<O>::value && is_integer_type<I>::value>> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ ((!std::is_same<O, I>::value) &&
+ ((std::is_unsigned<O_T>::value && std::is_signed<I_T>::value)));
+};
+
+template <typename O, typename I, typename Enable = void>
+struct is_integral_unsigned_to_signed {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_integral_unsigned_to_signed<
+ O, I, enable_if_t<is_integer_type<O>::value && is_integer_type<I>::value>> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ ((!std::is_same<O, I>::value) &&
+ ((std::is_signed<O_T>::value && std::is_unsigned<I_T>::value)));
+};
+
+// This set of functions SafeMinimum/SafeMaximum would be simplified with
+// C++17 and `if constexpr`.
+
+// clang-format doesn't handle this construct properly, hence the macro; it
+// also improves readability.
+//
+// The effective return type of these functions is always `I::c_type`; this is
+// just how enable_if works with function templates.
+#define RET_TYPE(TRAIT) enable_if_t<TRAIT<O, I>::value, typename I::c_type>
+
+template <typename O, typename I>
+constexpr RET_TYPE(std::is_same) SafeMinimum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::lowest();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(std::is_same) SafeMaximum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::max();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_number_downcast) SafeMinimum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::lowest();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_number_downcast) SafeMaximum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::max();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_number_upcast) SafeMinimum() {
+ using in_type = typename I::c_type;
+ return std::numeric_limits<in_type>::lowest();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_number_upcast) SafeMaximum() {
+ using in_type = typename I::c_type;
+ return std::numeric_limits<in_type>::max();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_unsigned_to_signed) SafeMinimum() {
+ return 0;
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_unsigned_to_signed) SafeMaximum() {
+ using in_type = typename I::c_type;
+ using out_type = typename O::c_type;
+
+ // Equality is missing because in_type::max() > out_type::max() when types
+ // are of the same width.
+ return static_cast<in_type>(sizeof(in_type) < sizeof(out_type)
+ ? std::numeric_limits<in_type>::max()
+ : std::numeric_limits<out_type>::max());
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_signed_to_unsigned) SafeMinimum() {
+ return 0;
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_signed_to_unsigned) SafeMaximum() {
+ using in_type = typename I::c_type;
+ using out_type = typename O::c_type;
+
+ return static_cast<in_type>(sizeof(in_type) <= sizeof(out_type)
+ ? std::numeric_limits<in_type>::max()
+ : std::numeric_limits<out_type>::max());
+}
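+
+// For instance, SafeMaximum<Int8Type, UInt64Type>() == 127 and
+// SafeMinimum<Int8Type, UInt64Type>() == 0, while
+// SafeMaximum<Int64Type, Int32Type>() keeps the full int32_t range since an
+// upcast can never overflow.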
+
+#undef RET_TYPE
+
+#define GET_MIN_MAX_CASE(TYPE, OUT_TYPE) \
+ case Type::TYPE: \
+ *min = SafeMinimum<OUT_TYPE, InType>(); \
+ *max = SafeMaximum<OUT_TYPE, InType>(); \
+ break
+
+template <typename InType, typename T = typename InType::c_type>
+void GetSafeMinMax(Type::type out_type, T* min, T* max) {
+ switch (out_type) {
+ GET_MIN_MAX_CASE(INT8, Int8Type);
+ GET_MIN_MAX_CASE(INT16, Int16Type);
+ GET_MIN_MAX_CASE(INT32, Int32Type);
+ GET_MIN_MAX_CASE(INT64, Int64Type);
+ GET_MIN_MAX_CASE(UINT8, UInt8Type);
+ GET_MIN_MAX_CASE(UINT16, UInt16Type);
+ GET_MIN_MAX_CASE(UINT32, UInt32Type);
+ GET_MIN_MAX_CASE(UINT64, UInt64Type);
+ default:
+ break;
+ }
+}
+
+template <typename Type, typename CType = typename Type::c_type,
+ typename ScalarType = typename TypeTraits<Type>::ScalarType>
+Status IntegersCanFitImpl(const Datum& datum, const DataType& target_type) {
+ CType bound_min{}, bound_max{};
+ GetSafeMinMax<Type>(target_type.id(), &bound_min, &bound_max);
+ return CheckIntegersInRange(datum, ScalarType(bound_min), ScalarType(bound_max));
+}
+
+} // namespace
+
+Status IntegersCanFit(const Datum& datum, const DataType& target_type) {
+ if (!is_integer(target_type.id())) {
+ return Status::Invalid("Target type is not an integer type: ", target_type);
+ }
+
+ switch (datum.type()->id()) {
+ case Type::INT8:
+ return IntegersCanFitImpl<Int8Type>(datum, target_type);
+ case Type::INT16:
+ return IntegersCanFitImpl<Int16Type>(datum, target_type);
+ case Type::INT32:
+ return IntegersCanFitImpl<Int32Type>(datum, target_type);
+ case Type::INT64:
+ return IntegersCanFitImpl<Int64Type>(datum, target_type);
+ case Type::UINT8:
+ return IntegersCanFitImpl<UInt8Type>(datum, target_type);
+ case Type::UINT16:
+ return IntegersCanFitImpl<UInt16Type>(datum, target_type);
+ case Type::UINT32:
+ return IntegersCanFitImpl<UInt32Type>(datum, target_type);
+ case Type::UINT64:
+ return IntegersCanFitImpl<UInt64Type>(datum, target_type);
+ default:
+ return Status::TypeError("Invalid index type for boundschecking");
+ }
+}
+
+} // namespace internal
+} // namespace arrow