[ceph.git] / ceph / src / arrow / cpp / src / arrow / compute / kernels / scalar_validity.cc

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include <cmath>

#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"

#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"

namespace arrow {

using internal::CopyBitmap;
using internal::InvertBitmap;

namespace compute {
namespace internal {
namespace {

// Kernel operator behind "is_valid": for every input value, reports whether
// the value is non-null.
struct IsValidOperator {
  // Scalar case: the result is the input's validity flag.
  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
    auto* result = checked_cast<BooleanScalar*>(out);
    result->value = in.is_valid;
    return Status::OK();
  }

  // Array case: produces out->buffers[1] holding one bit per input value.
  // The output is expected to start at offset 0 and to be no longer than
  // the input (both checked in debug builds).
  static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
    DCHECK_EQ(out->offset, 0);
    DCHECK_LE(out->length, arr.length);

    if (!arr.MayHaveNulls()) {
      // No nulls in the input => allocate a fresh bitmap and set every bit.
      ARROW_ASSIGN_OR_RAISE(out->buffers[1],
                            ctx->AllocateBitmap(out->length + out->offset));
      BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length,
                         true);
      return Status::OK();
    }

    // The input may contain nulls => its validity bitmap *is* the answer.
    // Rather than copying it, share the buffer: slice at the byte that holds
    // the first relevant bit and carry the leftover bit position as the
    // output offset.
    out->offset = arr.offset % 8;
    if (arr.offset == 0) {
      out->buffers[1] = arr.buffers[0];
    } else {
      const int64_t first_byte = arr.offset / 8;
      const int64_t byte_count = BitUtil::BytesForBits(out->length + out->offset);
      out->buffers[1] = SliceBuffer(arr.buffers[0], first_byte, byte_count);
    }
    return Status::OK();
  }
};

// Element-wise predicate for "is_finite": yields the result of
// std::isfinite on the input value, converted to OutType.
struct IsFiniteOperator {
  template <typename OutType, typename InType>
  static constexpr OutType Call(KernelContext* /*ctx*/, const InType& value,
                                Status* /*st*/) {
    return static_cast<OutType>(std::isfinite(value));
  }
};

// Element-wise predicate for "is_inf": yields the result of std::isinf on
// the input value, converted to OutType.
struct IsInfOperator {
  template <typename OutType, typename InType>
  static constexpr OutType Call(KernelContext* /*ctx*/, const InType& value,
                                Status* /*st*/) {
    return static_cast<OutType>(std::isinf(value));
  }
};

// Kernel state type used by "is_null". Its Init function is installed as the
// kernel's init callback at registration, and IsNullOperator reads the
// NullOptions back through NanOptionsState::Get(ctx) to consult `nan_is_null`.
using NanOptionsState = OptionsWrapper<NullOptions>;

// Kernel operator behind "is_null": true for null values and, when the
// `nan_is_null` option is set, also for floating-point NaN values.
struct IsNullOperator {
  // Scalar case: writes the answer into the BooleanScalar `out`.
  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
    const auto& options = NanOptionsState::Get(ctx);
    bool& result = checked_cast<BooleanScalar*>(out)->value;

    // A null input is always reported as null.
    if (!in.is_valid) {
      result = true;
      return Status::OK();
    }

    // A valid input can only count as null through the NaN rule, which
    // applies to floating-point types when the option is enabled.
    if (!options.nan_is_null || !is_floating(in.type->id())) {
      result = false;
      return Status::OK();
    }

    switch (in.type->id()) {
      case Type::FLOAT:
        result = std::isnan(internal::UnboxScalar<FloatType>::Unbox(in));
        break;
      case Type::DOUBLE:
        result = std::isnan(internal::UnboxScalar<DoubleType>::Unbox(in));
        break;
      default:
        return Status::NotImplemented("NaN detection not implemented for type ",
                                      in.type->ToString());
    }
    return Status::OK();
  }

  // Sets the output bit (at `out_offset` + index) for every element of `arr`
  // whose value, read as T from buffer 1, is NaN. Bits of non-NaN elements
  // are left untouched.
  template <typename T>
  static void SetNanBits(const ArrayData& arr, uint8_t* out_bitmap, int64_t out_offset) {
    const T* values = arr.GetValues<T>(1);
    const int64_t count = arr.length;
    for (int64_t pos = 0; pos < count; ++pos) {
      if (!std::isnan(values[pos])) {
        continue;
      }
      BitUtil::SetBit(out_bitmap, out_offset + pos);
    }
  }

  // Array case: writes into the already-present out->buffers[1].
  static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
    const auto& options = NanOptionsState::Get(ctx);
    uint8_t* bits = out->buffers[1]->mutable_data();

    if (arr.GetNullCount() <= 0) {
      // No nulls in the input => start from an all-false output.
      BitUtil::SetBitsTo(bits, out->offset, out->length, false);
    } else {
      // Nulls present => the output is the bitwise inverse of the validity bitmap.
      InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, bits, out->offset);
    }

    // Without the NaN rule (or for non-floating types) the result is complete.
    if (!is_floating(arr.type->id()) || !options.nan_is_null) {
      return Status::OK();
    }

    switch (arr.type->id()) {
      case Type::FLOAT:
        SetNanBits<float>(arr, bits, out->offset);
        break;
      case Type::DOUBLE:
        SetNanBits<double>(arr, bits, out->offset);
        break;
      default:
        return Status::NotImplemented("NaN detection not implemented for type ",
                                      arr.type->ToString());
    }
    return Status::OK();
  }
};

// Element-wise predicate for "is_nan": yields the result of std::isnan on
// the input value, converted to OutType.
struct IsNanOperator {
  template <typename OutType, typename InType>
  static constexpr OutType Call(KernelContext* /*ctx*/, const InType& value,
                                Status* /*st*/) {
    return static_cast<OutType>(std::isnan(value));
  }
};

// Builds a ScalarFunction with a single kernel and adds it to `registry`.
//
// \param name                   function name; consumed (moved) by this call
// \param doc                    documentation attached to the function
// \param in_types               kernel input types; their count is the arity
// \param out_type               kernel output type
// \param exec                   kernel execution callback
// \param registry               registry receiving the new function
// \param mem_allocation         value stored in the kernel's mem_allocation
// \param can_write_into_slices  value stored in the kernel's can_write_into_slices
// \param default_options        optional default options for the function
// \param init                   optional kernel init callback
//
// The kernel is always configured with NullHandling::OUTPUT_NOT_NULL.
// Failures from AddKernel/AddFunction are only checked via DCHECK_OK.
void MakeFunction(std::string name, const FunctionDoc* doc,
                  std::vector<InputType> in_types, OutputType out_type,
                  ArrayKernelExec exec, FunctionRegistry* registry,
                  MemAllocation::type mem_allocation, bool can_write_into_slices,
                  const FunctionOptions* default_options = NULLPTR,
                  KernelInit init = NULLPTR) {
  // The arity must be read before `in_types` is moved into the kernel below.
  Arity arity{static_cast<int>(in_types.size())};
  // `name` is a by-value sink parameter: move it instead of copying it again.
  auto func = std::make_shared<ScalarFunction>(std::move(name), arity, doc,
                                               default_options);

  ScalarKernel kernel(std::move(in_types), out_type, exec, init);
  kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
  kernel.can_write_into_slices = can_write_into_slices;
  kernel.mem_allocation = mem_allocation;

  DCHECK_OK(func->AddKernel(std::move(kernel)));
  DCHECK_OK(registry->AddFunction(std::move(func)));
}

template <typename InType, typename Op>
void AddFloatValidityKernel(const std::shared_ptr<DataType>& ty, ScalarFunction* func) {
  DCHECK_OK(func->AddKernel({ty}, boolean(),
                            applicator::ScalarUnary<BooleanType, InType, Op>::Exec));
}

// Creates the unary "is finite" scalar function with kernels for float32 and
// float64 inputs, both backed by IsFiniteOperator.
//
// \param name  function name; consumed (moved) by this call
// \param doc   documentation attached to the function
// \return the newly created function (not yet registered anywhere)
std::shared_ptr<ScalarFunction> MakeIsFiniteFunction(std::string name,
                                                     const FunctionDoc* doc) {
  // `name` is a by-value sink parameter: move it instead of copying it again.
  auto func = std::make_shared<ScalarFunction>(std::move(name), Arity::Unary(), doc);

  AddFloatValidityKernel<FloatType, IsFiniteOperator>(float32(), func.get());
  AddFloatValidityKernel<DoubleType, IsFiniteOperator>(float64(), func.get());

  return func;
}

// Creates the unary "is infinite" scalar function with kernels for float32
// and float64 inputs, both backed by IsInfOperator.
//
// \param name  function name; consumed (moved) by this call
// \param doc   documentation attached to the function
// \return the newly created function (not yet registered anywhere)
std::shared_ptr<ScalarFunction> MakeIsInfFunction(std::string name,
                                                  const FunctionDoc* doc) {
  // `name` is a by-value sink parameter: move it instead of copying it again.
  auto func = std::make_shared<ScalarFunction>(std::move(name), Arity::Unary(), doc);

  AddFloatValidityKernel<FloatType, IsInfOperator>(float32(), func.get());
  AddFloatValidityKernel<DoubleType, IsInfOperator>(float64(), func.get());

  return func;
}

// Creates the unary "is NaN" scalar function with kernels for float32 and
// float64 inputs, both backed by IsNanOperator.
//
// \param name  function name; consumed (moved) by this call
// \param doc   documentation attached to the function
// \return the newly created function (not yet registered anywhere)
std::shared_ptr<ScalarFunction> MakeIsNanFunction(std::string name,
                                                  const FunctionDoc* doc) {
  // `name` is a by-value sink parameter: move it instead of copying it again.
  auto func = std::make_shared<ScalarFunction>(std::move(name), Arity::Unary(), doc);

  AddFloatValidityKernel<FloatType, IsNanOperator>(float32(), func.get());
  AddFloatValidityKernel<DoubleType, IsNanOperator>(float64(), func.get());

  return func;
}

// Exec callback for "is_valid".
//
// Inputs of the null type (Type::NA) are answered directly with `false`
// (a scalar, or an array of out->length() false values); every other type
// is delegated to IsValidOperator through applicator::SimpleUnary.
Status IsValidExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
  const Datum& input = batch[0];
  if (input.type()->id() != Type::NA) {
    return applicator::SimpleUnary<IsValidOperator>(ctx, batch, out);
  }

  auto false_value = std::make_shared<BooleanScalar>(false);
  if (input.kind() == Datum::SCALAR) {
    out->value = false_value;
    return Status::OK();
  }

  // Array input: materialize an all-false boolean array of the output length.
  ARROW_ASSIGN_OR_RAISE(
      std::shared_ptr<Array> all_false,
      MakeArrayFromScalar(*false_value, out->length(), ctx->memory_pool()));
  out->value = all_false->data();
  return Status::OK();
}

// Exec callback for "is_null".
//
// Inputs of the null type (Type::NA) are answered directly with `true`;
// every other type is delegated to IsNullOperator through
// applicator::SimpleUnary.
Status IsNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
  const Datum& input = batch[0];
  if (input.type()->id() != Type::NA) {
    return applicator::SimpleUnary<IsNullOperator>(ctx, batch, out);
  }

  if (input.kind() == Datum::SCALAR) {
    out->value = std::make_shared<BooleanScalar>(true);
    return Status::OK();
  }

  // Array input: the output bitmap is preallocated, so just set every bit.
  ArrayData* result = out->mutable_array();
  uint8_t* bits = result->buffers[1]->mutable_data();
  BitUtil::SetBitsTo(bits, result->offset, result->length, true);
  return Status::OK();
}

// User-facing documentation for the functions registered by this file.
// Each entry is: summary, description, argument names and, for is_null,
// the name of its options class.

// Documentation for "is_valid".
const FunctionDoc is_valid_doc(
    "Return true if non-null",
    "For each input value, emit true iff the value is valid (non-null).",
    {"values"});

// Documentation for "is_finite".
const FunctionDoc is_finite_doc(
    "Return true if value is finite",
    "For each input value, emit true iff the value is finite (not NaN, inf, or -inf).",
    {"values"});

// Documentation for "is_inf".
const FunctionDoc is_inf_doc(
    "Return true if infinity",
    "For each input value, emit true iff the value is infinite (inf or -inf).",
    {"values"});

// Documentation for "is_null"; the only entry that names an options class.
const FunctionDoc is_null_doc(
    "Return true if null (and optionally NaN)",
    "For each input value, emit true iff the value is null.\n"
    "True may also be emitted for NaN values by setting the `nan_is_null` flag.",
    {"values"}, "NullOptions");

// Documentation for "is_nan".
const FunctionDoc is_nan_doc(
    "Return true if NaN",
    "For each input value, emit true iff the value is NaN.",
    {"values"});

}  // namespace

// Registers the validity-related scalar functions with `registry`:
// "is_valid", "is_null", "is_finite", "is_inf" and "is_nan" (in that order).
void RegisterScalarValidity(FunctionRegistry* registry) {
  // Function-local static: its address is handed to "is_null" as the
  // function's default options below.
  static auto kNullOptions = NullOptions::Defaults();

  // "is_valid" allocates (or shares) its own output buffer.
  MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec,
               registry, MemAllocation::NO_PREALLOCATE,
               /*can_write_into_slices=*/false);

  // "is_null" writes into a preallocated output and takes NullOptions.
  MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec,
               registry, MemAllocation::PREALLOCATE,
               /*can_write_into_slices=*/true, &kNullOptions, NanOptionsState::Init);

  // Floating-point predicates.
  auto is_finite = MakeIsFiniteFunction("is_finite", &is_finite_doc);
  DCHECK_OK(registry->AddFunction(std::move(is_finite)));

  auto is_inf = MakeIsInfFunction("is_inf", &is_inf_doc);
  DCHECK_OK(registry->AddFunction(std::move(is_inf)));

  auto is_nan = MakeIsNanFunction("is_nan", &is_nan_doc);
  DCHECK_OK(registry->AddFunction(std::move(is_nan)));
}

}  // namespace internal
}  // namespace compute
}  // namespace arrow
Commit	Line	Data
1d09f67e TL	1	// Licensed to the Apache Software Foundation (ASF) under one
	2	// or more contributor license agreements. See the NOTICE file
	3	// distributed with this work for additional information
	4	// regarding copyright ownership. The ASF licenses this file
	5	// to you under the Apache License, Version 2.0 (the
	6	// "License"); you may not use this file except in compliance
	7	// with the License. You may obtain a copy of the License at
	8	//
	9	// http://www.apache.org/licenses/LICENSE-2.0
	10	//
	11	// Unless required by applicable law or agreed to in writing,
	12	// software distributed under the License is distributed on an
	13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	14	// KIND, either express or implied. See the License for the
	15	// specific language governing permissions and limitations
	16	// under the License.
	17
	18	#include <cmath>
	19
	20	#include "arrow/compute/api_scalar.h"
	21	#include "arrow/compute/kernels/common.h"
	22
	23	#include "arrow/util/bit_util.h"
	24	#include "arrow/util/bitmap_ops.h"
	25
	26	namespace arrow {
	27
	28	using internal::CopyBitmap;
	29	using internal::InvertBitmap;
	30
	31	namespace compute {
	32	namespace internal {
	33	namespace {
	34
	35	struct IsValidOperator {
	36	static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
	37	checked_cast<BooleanScalar*>(out)->value = in.is_valid;
	38	return Status::OK();
	39	}
	40
	41	static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
	42	DCHECK_EQ(out->offset, 0);
	43	DCHECK_LE(out->length, arr.length);
	44	if (arr.MayHaveNulls()) {
	45	// Input has nulls => output is the null (validity) bitmap.
	46	// To avoid copying the null bitmap, slice from the starting byte offset
	47	// and set the offset to the remaining bit offset.
	48	out->offset = arr.offset % 8;
	49	out->buffers[1] =
	50	arr.offset == 0 ? arr.buffers[0]
	51	: SliceBuffer(arr.buffers[0], arr.offset / 8,
	52	BitUtil::BytesForBits(out->length + out->offset));
	53	return Status::OK();
	54	}
	55
	56	// Input has no nulls => output is entirely true.
	57	ARROW_ASSIGN_OR_RAISE(out->buffers[1],
	58	ctx->AllocateBitmap(out->length + out->offset));
	59	BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, true);
	60	return Status::OK();
	61	}
	62	};
	63
	64	struct IsFiniteOperator {
65	template <typename OutType, typename InType>
66	static constexpr OutType Call(KernelContext, const InType& value, Status) {
67	return std::isfinite(value);
68	}
69	};
70
71	struct IsInfOperator {
72	template <typename OutType, typename InType>
73	static constexpr OutType Call(KernelContext, const InType& value, Status) {
74	return std::isinf(value);
75	}
76	};
77
78	using NanOptionsState = OptionsWrapper<NullOptions>;
79
80	struct IsNullOperator {
81	static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
82	const auto& options = NanOptionsState::Get(ctx);
83	bool* out_value = &checked_cast<BooleanScalar*>(out)->value;
84
85	if (in.is_valid) {
86	if (options.nan_is_null && is_floating(in.type->id())) {
87	switch (in.type->id()) {
88	case Type::FLOAT:
89	*out_value = std::isnan(internal::UnboxScalar<FloatType>::Unbox(in));
90	break;
91	case Type::DOUBLE:
92	*out_value = std::isnan(internal::UnboxScalar<DoubleType>::Unbox(in));
93	break;
94	default:
95	return Status::NotImplemented("NaN detection not implemented for type ",
96	in.type->ToString());
97	}
98	} else {
99	*out_value = false;
100	}
101	} else {
102	*out_value = true;
103	}
104
105	return Status::OK();
106	}
107
108	template <typename T>
109	static void SetNanBits(const ArrayData& arr, uint8_t* out_bitmap, int64_t out_offset) {
110	const T* data = arr.GetValues<T>(1);
111	for (int64_t i = 0; i < arr.length; ++i) {
112	if (std::isnan(data[i])) {
113	BitUtil::SetBit(out_bitmap, i + out_offset);
114	}
115	}
116	}
117
118	static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
119	const auto& options = NanOptionsState::Get(ctx);
120
121	uint8_t* out_bitmap = out->buffers[1]->mutable_data();
122	if (arr.GetNullCount() > 0) {
123	// Input has nulls => output is the inverted null (validity) bitmap.
124	InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, out_bitmap,
125	out->offset);
126	} else {
127	// Input has no nulls => output is entirely false.
128	BitUtil::SetBitsTo(out_bitmap, out->offset, out->length, false);
129	}
130
131	if (is_floating(arr.type->id()) && options.nan_is_null) {
132	switch (arr.type->id()) {
133	case Type::FLOAT:
134	SetNanBits<float>(arr, out_bitmap, out->offset);
135	break;
136	case Type::DOUBLE:
137	SetNanBits<double>(arr, out_bitmap, out->offset);
138	break;
139	default:
140	return Status::NotImplemented("NaN detection not implemented for type ",
141	arr.type->ToString());
142	}
143	}
144	return Status::OK();
145	}
146	};
147
148	struct IsNanOperator {
149	template <typename OutType, typename InType>
150	static constexpr OutType Call(KernelContext, const InType& value, Status) {
151	return std::isnan(value);
152	}
153	};
154
155	void MakeFunction(std::string name, const FunctionDoc* doc,
156	std::vector<InputType> in_types, OutputType out_type,
157	ArrayKernelExec exec, FunctionRegistry* registry,
158	MemAllocation::type mem_allocation, bool can_write_into_slices,
159	const FunctionOptions* default_options = NULLPTR,
160	KernelInit init = NULLPTR) {
161	Arity arity{static_cast<int>(in_types.size())};
162	auto func = std::make_shared<ScalarFunction>(name, arity, doc, default_options);
163
164	ScalarKernel kernel(std::move(in_types), out_type, exec, init);
165	kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
166	kernel.can_write_into_slices = can_write_into_slices;
167	kernel.mem_allocation = mem_allocation;
168
169	DCHECK_OK(func->AddKernel(std::move(kernel)));
170	DCHECK_OK(registry->AddFunction(std::move(func)));
171	}
172
173	template <typename InType, typename Op>
174	void AddFloatValidityKernel(const std::shared_ptr<DataType>& ty, ScalarFunction* func) {
175	DCHECK_OK(func->AddKernel({ty}, boolean(),
176	applicator::ScalarUnary<BooleanType, InType, Op>::Exec));
177	}
178
179	std::shared_ptr<ScalarFunction> MakeIsFiniteFunction(std::string name,
180	const FunctionDoc* doc) {
181	auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
182
183	AddFloatValidityKernel<FloatType, IsFiniteOperator>(float32(), func.get());
184	AddFloatValidityKernel<DoubleType, IsFiniteOperator>(float64(), func.get());
185
186	return func;
187	}
188
189	std::shared_ptr<ScalarFunction> MakeIsInfFunction(std::string name,
190	const FunctionDoc* doc) {
191	auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
192
193	AddFloatValidityKernel<FloatType, IsInfOperator>(float32(), func.get());
194	AddFloatValidityKernel<DoubleType, IsInfOperator>(float64(), func.get());
195
196	return func;
197	}
198
199	std::shared_ptr<ScalarFunction> MakeIsNanFunction(std::string name,
200	const FunctionDoc* doc) {
201	auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
202
203	AddFloatValidityKernel<FloatType, IsNanOperator>(float32(), func.get());
204	AddFloatValidityKernel<DoubleType, IsNanOperator>(float64(), func.get());
205
206	return func;
207	}
208
209	Status IsValidExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
210	const Datum& arg0 = batch[0];
211	if (arg0.type()->id() == Type::NA) {
212	auto false_value = std::make_shared<BooleanScalar>(false);
213	if (arg0.kind() == Datum::SCALAR) {
214	out->value = false_value;
215	} else {
216	std::shared_ptr<Array> false_values;
217	RETURN_NOT_OK(MakeArrayFromScalar(*false_value, out->length(), ctx->memory_pool())
218	.Value(&false_values));
219	out->value = false_values->data();
220	}
221	return Status::OK();
222	} else {
223	return applicator::SimpleUnary<IsValidOperator>(ctx, batch, out);
224	}
225	}
226
227	Status IsNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
228	const Datum& arg0 = batch[0];
229	if (arg0.type()->id() == Type::NA) {
230	if (arg0.kind() == Datum::SCALAR) {
231	out->value = std::make_shared<BooleanScalar>(true);
232	} else {
233	// Data is preallocated
234	ArrayData* out_arr = out->mutable_array();
235	BitUtil::SetBitsTo(out_arr->buffers[1]->mutable_data(), out_arr->offset,
236	out_arr->length, true);
237	}
238	return Status::OK();
239	} else {
240	return applicator::SimpleUnary<IsNullOperator>(ctx, batch, out);
241	}
242	}
243
244	const FunctionDoc is_valid_doc(
245	"Return true if non-null",
246	("For each input value, emit true iff the value is valid (non-null)."), {"values"});
247
248	const FunctionDoc is_finite_doc(
249	"Return true if value is finite",
250	("For each input value, emit true iff the value is finite (not NaN, inf, or -inf)."),
251	{"values"});
252
253	const FunctionDoc is_inf_doc(
254	"Return true if infinity",
255	("For each input value, emit true iff the value is infinite (inf or -inf)."),
256	{"values"});
257
258	const FunctionDoc is_null_doc(
259	"Return true if null (and optionally NaN)",
260	("For each input value, emit true iff the value is null.\n"
261	"True may also be emitted for NaN values by setting the `nan_is_null` flag."),
262	{"values"}, "NullOptions");
263
264	const FunctionDoc is_nan_doc("Return true if NaN",
265	("For each input value, emit true iff the value is NaN."),
266	{"values"});
267
268	} // namespace
269
270	void RegisterScalarValidity(FunctionRegistry* registry) {
271	static auto kNullOptions = NullOptions::Defaults();
272	MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec,
273	registry, MemAllocation::NO_PREALLOCATE, /can_write_into_slices=/false);
274
275	MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec,
276	registry, MemAllocation::PREALLOCATE,
277	/can_write_into_slices=/true, &kNullOptions, NanOptionsState::Init);
278
279	DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc)));
280	DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc)));
281	DCHECK_OK(registry->AddFunction(MakeIsNanFunction("is_nan", &is_nan_doc)));
282	}
283
284	} // namespace internal
285	} // namespace compute
286	} // namespace arrow