diff --git a/ceph/src/arrow/cpp/src/arrow/sparse_tensor.cc b/ceph/src/arrow/cpp/src/arrow/sparse_tensor.cc
new file mode 100644 (file)
index 0000000..03d59c3
--- /dev/null
+++ b/ceph/src/arrow/cpp/src/arrow/sparse_tensor.cc
@@ -0,0 +1,478 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/sparse_tensor.h"
+#include "arrow/tensor/converter.h"
+
+#include <algorithm>
+#include <climits>
+#include <functional>
+#include <memory>
+#include <numeric>
+
+#include "arrow/compare.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+// ----------------------------------------------------------------------
+// SparseIndex
+
+Status SparseIndex::ValidateShape(const std::vector<int64_t>& shape) const {
+  if (!std::all_of(shape.begin(), shape.end(), [](int64_t x) { return x >= 0; })) {
+    return Status::Invalid("Shape elements must be positive");
+  }
+
+  return Status::OK();
+}
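+
+// e.g. {2, 3, 4} validates, while {2, -1} is rejected as Invalid.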
+
+namespace internal {
+namespace {
+
+template <typename IndexValueType>
+Status CheckSparseIndexMaximumValue(const std::vector<int64_t>& shape) {
+  using c_index_value_type = typename IndexValueType::c_type;
+  constexpr int64_t type_max =
+      static_cast<int64_t>(std::numeric_limits<c_index_value_type>::max());
+  auto greater_than_type_max = [&](int64_t x) { return x > type_max; };
+  if (std::any_of(shape.begin(), shape.end(), greater_than_type_max)) {
+    return Status::Invalid("The bit width of the index value type is too small");
+  }
+  return Status::OK();
+}
+
+template <>
+Status CheckSparseIndexMaximumValue<Int64Type>(const std::vector<int64_t>& shape) {
+  return Status::OK();
+}
+
+template <>
+Status CheckSparseIndexMaximumValue<UInt64Type>(const std::vector<int64_t>& shape) {
+  return Status::Invalid("UInt64Type cannot be used as IndexValueType of SparseIndex");
+}
+
+}  // namespace
+
+#define CALL_CHECK_MAXIMUM_VALUE(TYPE_CLASS) \
+  case TYPE_CLASS##Type::type_id:            \
+    return CheckSparseIndexMaximumValue<TYPE_CLASS##Type>(shape);
+
+Status CheckSparseIndexMaximumValue(const std::shared_ptr<DataType>& index_value_type,
+                                    const std::vector<int64_t>& shape) {
+  switch (index_value_type->id()) {
+    ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_CHECK_MAXIMUM_VALUE);
+    default:
+      return Status::TypeError("Unsupported SparseTensor index value type");
+  }
+}
+
+#undef CALL_CHECK_MAXIMUM_VALUE
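+
+// A minimal illustration of the check above (hypothetical shape values): an
+// Int8 index can only address coordinates up to 127, so any larger extent is
+// rejected.
+//
+//   auto st = CheckSparseIndexMaximumValue(int8(), {1000, 10});
+//   // -> Status::Invalid("The bit width of the index value type is too small")
+//   st = CheckSparseIndexMaximumValue(int32(), {1000, 10});
+//   // -> Status::OK()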
+
+Status MakeSparseTensorFromTensor(const Tensor& tensor,
+                                  SparseTensorFormat::type sparse_format_id,
+                                  const std::shared_ptr<DataType>& index_value_type,
+                                  MemoryPool* pool,
+                                  std::shared_ptr<SparseIndex>* out_sparse_index,
+                                  std::shared_ptr<Buffer>* out_data) {
+  switch (sparse_format_id) {
+    case SparseTensorFormat::COO:
+      return MakeSparseCOOTensorFromTensor(tensor, index_value_type, pool,
+                                           out_sparse_index, out_data);
+    case SparseTensorFormat::CSR:
+      return MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis::ROW, tensor,
+                                           index_value_type, pool, out_sparse_index,
+                                           out_data);
+    case SparseTensorFormat::CSC:
+      return MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis::COLUMN, tensor,
+                                           index_value_type, pool, out_sparse_index,
+                                           out_data);
+    case SparseTensorFormat::CSF:
+      return MakeSparseCSFTensorFromTensor(tensor, index_value_type, pool,
+                                           out_sparse_index, out_data);
+
+    // LCOV_EXCL_START: ignore program failure
+    default:
+      return Status::Invalid("Invalid sparse tensor format");
+      // LCOV_EXCL_STOP
+  }
+}
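+
+// A sketch of how this dispatcher is typically reached (assuming the
+// SparseCOOTensor::Make factory declared in sparse_tensor.h; the dense input
+// is illustrative):
+//
+//   std::shared_ptr<Tensor> dense = ...;  // any fixed-width dense tensor
+//   ARROW_ASSIGN_OR_RAISE(auto sparse, SparseCOOTensor::Make(*dense, int64()));
+//   // internally dispatches here with SparseTensorFormat::COO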
+
+}  // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseCOOIndex
+
+namespace {
+
+inline Status CheckSparseCOOIndexValidity(const std::shared_ptr<DataType>& type,
+                                          const std::vector<int64_t>& shape,
+                                          const std::vector<int64_t>& strides) {
+  if (!is_integer(type->id())) {
+    return Status::TypeError("Type of SparseCOOIndex indices must be integer");
+  }
+  if (shape.size() != 2) {
+    return Status::Invalid("SparseCOOIndex indices must be a matrix");
+  }
+
+  RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(type, shape));
+
+  if (!internal::IsTensorStridesContiguous(type, shape, strides)) {
+    return Status::Invalid("SparseCOOIndex indices must be contiguous");
+  }
+  return Status::OK();
+}
+
+void GetCOOIndexTensorRow(const std::shared_ptr<Tensor>& coords, const int64_t row,
+                          std::vector<int64_t>* out_index) {
+  const auto& fw_index_value_type =
+      internal::checked_cast<const FixedWidthType&>(*coords->type());
+  const size_t indices_elsize = fw_index_value_type.bit_width() / CHAR_BIT;
+
+  const auto& shape = coords->shape();
+  const int64_t non_zero_length = shape[0];
+  DCHECK(0 <= row && row < non_zero_length);
+
+  const int64_t ndim = shape[1];
+  out_index->resize(ndim);
+
+  switch (indices_elsize) {
+    case 1:  // Int8, UInt8
+      for (int64_t i = 0; i < ndim; ++i) {
+        (*out_index)[i] = static_cast<int64_t>(coords->Value<UInt8Type>({row, i}));
+      }
+      break;
+    case 2:  // Int16, UInt16
+      for (int64_t i = 0; i < ndim; ++i) {
+        (*out_index)[i] = static_cast<int64_t>(coords->Value<UInt16Type>({row, i}));
+      }
+      break;
+    case 4:  // Int32, UInt32
+      for (int64_t i = 0; i < ndim; ++i) {
+        (*out_index)[i] = static_cast<int64_t>(coords->Value<UInt32Type>({row, i}));
+      }
+      break;
+    case 8:  // Int64
+      for (int64_t i = 0; i < ndim; ++i) {
+        (*out_index)[i] = coords->Value<Int64Type>({row, i});
+      }
+      break;
+    default:
+      DCHECK(false) << "Must not reach here";
+      break;
+  }
+}
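+
+// For example (illustrative data): given a coords matrix whose rows are
+//   row 0: {0, 0, 1}
+//   row 1: {0, 2, 0}
+// GetCOOIndexTensorRow(coords, 1, &index) fills index with {0, 2, 0},
+// widening each element to int64_t regardless of the stored index type.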
+
+bool DetectSparseCOOIndexCanonicality(const std::shared_ptr<Tensor>& coords) {
+  DCHECK_EQ(coords->ndim(), 2);
+
+  const auto& shape = coords->shape();
+  const int64_t non_zero_length = shape[0];
+  if (non_zero_length <= 1) return true;
+
+  const int64_t ndim = shape[1];
+  std::vector<int64_t> last_index, index;
+  GetCOOIndexTensorRow(coords, 0, &last_index);
+  for (int64_t i = 1; i < non_zero_length; ++i) {
+    GetCOOIndexTensorRow(coords, i, &index);
+    int64_t j = 0;
+    while (j < ndim) {
+      if (last_index[j] > index[j]) {
+        // last_index > index, so we can detect non-canonical here
+        return false;
+      }
+      if (last_index[j] < index[j]) {
+        // last_index < index, so we can skip the remaining dimensions
+        break;
+      }
+      ++j;
+    }
+    if (j == ndim) {
+      // last_index == index: duplicate coordinates, hence non-canonical
+      return false;
+    }
+    swap(last_index, index);
+  }
+
+  return true;
+}
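+
+// A worked example (illustrative coordinates): rows must be sorted in
+// lexicographic order with no duplicates to be canonical.
+//
+//   {0, 0}, {0, 2}, {1, 1}  -> canonical
+//   {0, 2}, {0, 0}, {1, 1}  -> non-canonical (rows out of order)
+//   {0, 0}, {0, 0}, {1, 1}  -> non-canonical (duplicate coordinates)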
+
+}  // namespace
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+    const std::shared_ptr<Tensor>& coords, bool is_canonical) {
+  RETURN_NOT_OK(
+      CheckSparseCOOIndexValidity(coords->type(), coords->shape(), coords->strides()));
+  return std::make_shared<SparseCOOIndex>(coords, is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+    const std::shared_ptr<Tensor>& coords) {
+  RETURN_NOT_OK(
+      CheckSparseCOOIndexValidity(coords->type(), coords->shape(), coords->strides()));
+  auto is_canonical = DetectSparseCOOIndexCanonicality(coords);
+  return std::make_shared<SparseCOOIndex>(coords, is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+    const std::shared_ptr<DataType>& indices_type,
+    const std::vector<int64_t>& indices_shape,
+    const std::vector<int64_t>& indices_strides, std::shared_ptr<Buffer> indices_data,
+    bool is_canonical) {
+  RETURN_NOT_OK(
+      CheckSparseCOOIndexValidity(indices_type, indices_shape, indices_strides));
+  return std::make_shared<SparseCOOIndex>(
+      std::make_shared<Tensor>(indices_type, indices_data, indices_shape,
+                               indices_strides),
+      is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+    const std::shared_ptr<DataType>& indices_type,
+    const std::vector<int64_t>& indices_shape,
+    const std::vector<int64_t>& indices_strides, std::shared_ptr<Buffer> indices_data) {
+  RETURN_NOT_OK(
+      CheckSparseCOOIndexValidity(indices_type, indices_shape, indices_strides));
+  auto coords = std::make_shared<Tensor>(indices_type, indices_data, indices_shape,
+                                         indices_strides);
+  auto is_canonical = DetectSparseCOOIndexCanonicality(coords);
+  return std::make_shared<SparseCOOIndex>(coords, is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+    const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+    int64_t non_zero_length, std::shared_ptr<Buffer> indices_data, bool is_canonical) {
+  auto ndim = static_cast<int64_t>(shape.size());
+  if (!is_integer(indices_type->id())) {
+    return Status::TypeError("Type of SparseCOOIndex indices must be integer");
+  }
+  const int64_t elsize = internal::GetByteWidth(*indices_type);
+  std::vector<int64_t> indices_shape({non_zero_length, ndim});
+  std::vector<int64_t> indices_strides({elsize * ndim, elsize});
+  return Make(indices_type, indices_shape, indices_strides, indices_data, is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+    const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+    int64_t non_zero_length, std::shared_ptr<Buffer> indices_data) {
+  auto ndim = static_cast<int64_t>(shape.size());
+  if (!is_integer(indices_type->id())) {
+    return Status::TypeError("Type of SparseCOOIndex indices must be integer");
+  }
+  const int64_t elsize = internal::GetByteWidth(*indices_type);
+  std::vector<int64_t> indices_shape({non_zero_length, ndim});
+  std::vector<int64_t> indices_strides({elsize * ndim, elsize});
+  return Make(indices_type, indices_shape, indices_strides, indices_data);
+}
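+
+// A minimal construction sketch (buffer contents are illustrative):
+//
+//   // Three non-zeros of a 2x3 tensor, stored as row-major coordinate pairs.
+//   std::vector<int64_t> coords = {0, 0,  0, 2,  1, 1};
+//   auto buffer = Buffer::Wrap(coords);
+//   ARROW_ASSIGN_OR_RAISE(auto index,
+//                         SparseCOOIndex::Make(int64(), {2, 3},
+//                                              /*non_zero_length=*/3, buffer));
+//   // DetectSparseCOOIndexCanonicality finds these rows sorted and unique,
+//   // so index->is_canonical() is true.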
+
+// Constructor with a contiguous coords Tensor
+SparseCOOIndex::SparseCOOIndex(const std::shared_ptr<Tensor>& coords, bool is_canonical)
+    : SparseIndexBase(), coords_(coords), is_canonical_(is_canonical) {
+  ARROW_CHECK_OK(
+      CheckSparseCOOIndexValidity(coords_->type(), coords_->shape(), coords_->strides()));
+}
+
+std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); }
+
+// ----------------------------------------------------------------------
+// SparseCSXIndex
+
+namespace internal {
+
+Status ValidateSparseCSXIndex(const std::shared_ptr<DataType>& indptr_type,
+                              const std::shared_ptr<DataType>& indices_type,
+                              const std::vector<int64_t>& indptr_shape,
+                              const std::vector<int64_t>& indices_shape,
+                              char const* type_name) {
+  if (!is_integer(indptr_type->id())) {
+    return Status::TypeError("Type of ", type_name, " indptr must be integer");
+  }
+  if (indptr_shape.size() != 1) {
+    return Status::Invalid(type_name, " indptr must be a vector");
+  }
+  if (!is_integer(indices_type->id())) {
+    return Status::Invalid("Type of ", type_name, " indices must be integer");
+  }
+  if (indices_shape.size() != 1) {
+    return Status::Invalid(type_name, " indices must be a vector");
+  }
+
+  RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indptr_type, indptr_shape));
+  RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indices_type, indices_shape));
+
+  return Status::OK();
+}
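+
+// For a CSR matrix with M rows and k stored values (shapes are illustrative):
+// indptr must be a length M + 1 vector and indices a length k vector, e.g. a
+// 3x4 matrix with 4 non-zeros has indptr_shape {4} and indices_shape {4}.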
+
+void CheckSparseCSXIndexValidity(const std::shared_ptr<DataType>& indptr_type,
+                                 const std::shared_ptr<DataType>& indices_type,
+                                 const std::vector<int64_t>& indptr_shape,
+                                 const std::vector<int64_t>& indices_shape,
+                                 char const* type_name) {
+  ARROW_CHECK_OK(ValidateSparseCSXIndex(indptr_type, indices_type, indptr_shape,
+                                        indices_shape, type_name));
+}
+
+}  // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseCSFIndex
+
+namespace {
+
+inline Status CheckSparseCSFIndexValidity(const std::shared_ptr<DataType>& indptr_type,
+                                          const std::shared_ptr<DataType>& indices_type,
+                                          const int64_t num_indptrs,
+                                          const int64_t num_indices,
+                                          const int64_t axis_order_size) {
+  if (!is_integer(indptr_type->id())) {
+    return Status::TypeError("Type of SparseCSFIndex indptr must be integer");
+  }
+  if (!is_integer(indices_type->id())) {
+    return Status::TypeError("Type of SparseCSFIndex indices must be integer");
+  }
+  if (num_indptrs + 1 != num_indices) {
+    return Status::Invalid(
+        "Length of indices must be equal to length of indptrs + 1 for SparseCSFIndex.");
+  }
+  if (axis_order_size != num_indices) {
+    return Status::Invalid(
+        "Length of indices must be equal to number of dimensions for SparseCSFIndex.");
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Result<std::shared_ptr<SparseCSFIndex>> SparseCSFIndex::Make(
+    const std::shared_ptr<DataType>& indptr_type,
+    const std::shared_ptr<DataType>& indices_type,
+    const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+    const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+    const std::vector<std::shared_ptr<Buffer>>& indices_data) {
+  int64_t ndim = axis_order.size();
+  std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1);
+  std::vector<std::shared_ptr<Tensor>> indices(ndim);
+
+  for (int64_t i = 0; i < ndim - 1; ++i)
+    indptr[i] = std::make_shared<Tensor>(indptr_type, indptr_data[i],
+                                         std::vector<int64_t>({indices_shapes[i] + 1}));
+  for (int64_t i = 0; i < ndim; ++i)
+    indices[i] = std::make_shared<Tensor>(indices_type, indices_data[i],
+                                          std::vector<int64_t>({indices_shapes[i]}));
+
+  RETURN_NOT_OK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(),
+                                            indices.size(), axis_order.size()));
+
+  for (const auto& tensor : indptr) {
+    RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indptr_type, tensor->shape()));
+  }
+
+  for (const auto& tensor : indices) {
+    RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indices_type, tensor->shape()));
+  }
+
+  return std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
+}
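+
+// An illustrative CSF layout for a 2x2 tensor with non-zeros at {0, 1} and
+// {1, 1}, axis_order {0, 1} (values are an example, not a format guarantee):
+//
+//   indices[0] = {0, 1}     // distinct coordinates on axis 0
+//   indptr[0]  = {0, 1, 2}  // ranges delimiting each node's children below
+//   indices[1] = {1, 1}     // axis-1 coordinate of each stored value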
+
+// Constructor with two index vectors
+SparseCSFIndex::SparseCSFIndex(const std::vector<std::shared_ptr<Tensor>>& indptr,
+                               const std::vector<std::shared_ptr<Tensor>>& indices,
+                               const std::vector<int64_t>& axis_order)
+    : SparseIndexBase(), indptr_(indptr), indices_(indices), axis_order_(axis_order) {
+  ARROW_CHECK_OK(CheckSparseCSFIndexValidity(indptr_.front()->type(),
+                                             indices_.front()->type(), indptr_.size(),
+                                             indices_.size(), axis_order_.size()));
+}
+
+std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); }
+
+bool SparseCSFIndex::Equals(const SparseCSFIndex& other) const {
+  for (int64_t i = 0; i < static_cast<int64_t>(indices().size()); ++i) {
+    if (!indices()[i]->Equals(*other.indices()[i])) return false;
+  }
+  for (int64_t i = 0; i < static_cast<int64_t>(indptr().size()); ++i) {
+    if (!indptr()[i]->Equals(*other.indptr()[i])) return false;
+  }
+  return axis_order() == other.axis_order();
+}
+
+// ----------------------------------------------------------------------
+// SparseTensor
+
+// Constructor with all attributes
+SparseTensor::SparseTensor(const std::shared_ptr<DataType>& type,
+                           const std::shared_ptr<Buffer>& data,
+                           const std::vector<int64_t>& shape,
+                           const std::shared_ptr<SparseIndex>& sparse_index,
+                           const std::vector<std::string>& dim_names)
+    : type_(type),
+      data_(data),
+      shape_(shape),
+      sparse_index_(sparse_index),
+      dim_names_(dim_names) {
+  ARROW_CHECK(is_tensor_supported(type->id()));
+}
+
+const std::string& SparseTensor::dim_name(int i) const {
+  static const std::string kEmpty = "";
+  if (dim_names_.size() == 0) {
+    return kEmpty;
+  } else {
+    ARROW_CHECK_LT(i, static_cast<int>(dim_names_.size()));
+    return dim_names_[i];
+  }
+}
+
+int64_t SparseTensor::size() const {
+  return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies<int64_t>());
+}
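+
+// e.g. a shape of {2, 3, 4} yields size() == 24, the dense element count,
+// independent of how many non-zero values are actually stored.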
+
+bool SparseTensor::Equals(const SparseTensor& other, const EqualOptions& opts) const {
+  return SparseTensorEquals(*this, other, opts);
+}
+
+Result<std::shared_ptr<Tensor>> SparseTensor::ToTensor(MemoryPool* pool) const {
+  switch (format_id()) {
+    case SparseTensorFormat::COO:
+      return MakeTensorFromSparseCOOTensor(
+          pool, internal::checked_cast<const SparseCOOTensor*>(this));
+
+    case SparseTensorFormat::CSR:
+      return MakeTensorFromSparseCSRMatrix(
+          pool, internal::checked_cast<const SparseCSRMatrix*>(this));
+
+    case SparseTensorFormat::CSC:
+      return MakeTensorFromSparseCSCMatrix(
+          pool, internal::checked_cast<const SparseCSCMatrix*>(this));
+
+    case SparseTensorFormat::CSF:
+      return MakeTensorFromSparseCSFTensor(
+          pool, internal::checked_cast<const SparseCSFTensor*>(this));
+
+    default:
+      return Status::NotImplemented("Unsupported SparseIndex format type");
+  }
+}
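+
+// A round-trip usage sketch (the dense input is illustrative):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto sparse, SparseCOOTensor::Make(*dense, int64()));
+//   ARROW_ASSIGN_OR_RAISE(auto dense2, sparse->ToTensor(default_memory_pool()));
+//   DCHECK(dense->Equals(*dense2));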
+
+}  // namespace arrow