import quincy 17.2.0
diff --git a/ceph/src/arrow/cpp/src/arrow/tensor/coo_converter.cc b/ceph/src/arrow/cpp/src/arrow/tensor/coo_converter.cc
new file mode 100644 (file)
index 0000000..2124d0a
--- /dev/null
@@ -0,0 +1,333 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/tensor/converter_internal.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace internal {
+namespace {
+
+// Advance `coord` to the next multi-index in row-major order (the last dimension
+// varies fastest), carrying into higher dimensions on overflow.
+template <typename c_index_type>
+inline void IncrementRowMajorIndex(std::vector<c_index_type>& coord,
+                                   const std::vector<int64_t>& shape) {
+  const int64_t ndim = shape.size();
+  ++coord[ndim - 1];
+  if (coord[ndim - 1] == shape[ndim - 1]) {
+    int64_t d = ndim - 1;
+    while (d > 0 && coord[d] == shape[d]) {
+      coord[d] = 0;
+      ++coord[d - 1];
+      --d;
+    }
+  }
+}
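For reference, a minimal standalone sketch of the same odometer logic, with the carry step unrolled for a fixed 2x3 shape (the shape and values here are illustrative only):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      const std::vector<int64_t> shape = {2, 3};
      std::vector<int64_t> coord(shape.size(), 0);
      for (int64_t n = shape[0] * shape[1]; n > 0; --n) {
        std::cout << "(" << coord[0] << ", " << coord[1] << ")\n";
        // the same carry that IncrementRowMajorIndex performs, specialized to 2-D
        ++coord[1];
        if (coord[1] == shape[1]) {
          coord[1] = 0;
          ++coord[0];
        }
      }
      return 0;  // prints (0, 0) (0, 1) (0, 2) (1, 0) (1, 1) (1, 2)
    }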
+
+// Convert a contiguous row-major tensor in a single linear scan of its buffer,
+// appending a value and a coordinate tuple for each nonzero element.
+template <typename c_index_type, typename c_value_type>
+void ConvertRowMajorTensor(const Tensor& tensor, c_index_type* indices,
+                           c_value_type* values, const int64_t size) {
+  const auto ndim = tensor.ndim();
+  const auto& shape = tensor.shape();
+  const c_value_type* tensor_data =
+      reinterpret_cast<const c_value_type*>(tensor.raw_data());
+
+  constexpr c_value_type zero = 0;
+  std::vector<c_index_type> coord(ndim, 0);
+  for (int64_t n = tensor.size(); n > 0; --n) {
+    const c_value_type x = *tensor_data;
+    if (ARROW_PREDICT_FALSE(x != zero)) {
+      std::copy(coord.begin(), coord.end(), indices);
+      *values++ = x;
+      indices += ndim;
+    }
+
+    IncrementRowMajorIndex(coord, shape);
+    ++tensor_data;
+  }
+}
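A worked example of the row-major pass (values hypothetical; indices shown flattened, exactly as the function writes them):

    // input: 2x3 row-major tensor, buffer = {0, 1, 0, 2, 0, 3}
    // scan:   p=0 (0,0)=0   p=1 (0,1)=1   p=2 (0,2)=0
    //         p=3 (1,0)=2   p=4 (1,1)=0   p=5 (1,2)=3
    // output: values  = {1, 2, 3}
    //         indices = {0,1,  1,0,  1,2}   // nonzero_count rows of ndim entries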
+
+// Convert a contiguous column-major tensor: run the row-major pass over the raw
+// buffer, then reverse each coordinate tuple and sort the entries into row-major
+// (lexicographic) order.
+template <typename c_index_type, typename c_value_type>
+void ConvertColumnMajorTensor(const Tensor& tensor, c_index_type* out_indices,
+                              c_value_type* out_values, const int64_t size) {
+  const auto ndim = tensor.ndim();
+  std::vector<c_index_type> indices(ndim * size);
+  std::vector<c_value_type> values(size);
+  ConvertRowMajorTensor(tensor, indices.data(), values.data(), size);
+
+  // reverse each ndim-length coordinate tuple in place
+  for (int64_t i = 0; i < size; ++i) {
+    for (int j = 0; j < ndim / 2; ++j) {
+      std::swap(indices[i * ndim + j], indices[i * ndim + ndim - j - 1]);
+    }
+  }
+
+  // compute the permutation that sorts the coordinate tuples lexicographically
+  std::vector<int64_t> order(size);
+  std::iota(order.begin(), order.end(), 0);
+  std::sort(order.begin(), order.end(), [&](const int64_t xi, const int64_t yi) {
+    const int64_t x_offset = xi * ndim;
+    const int64_t y_offset = yi * ndim;
+    for (int j = 0; j < ndim; ++j) {
+      const auto x = indices[x_offset + j];
+      const auto y = indices[y_offset + j];
+      if (x < y) return true;
+      if (x > y) return false;
+    }
+    return false;
+  });
+
+  // transfer the result, applying the sort order computed above
+  for (int64_t i = 0; i < size; ++i) {
+    out_values[i] = values[order[i]];
+
+    std::copy_n(&indices[order[i] * ndim], ndim, out_indices);
+    out_indices += ndim;
+  }
+}
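A small worked example of the column-major path (a symmetric 2x2 case with hypothetical values, chosen so the reversal is easy to follow):

    // input:  2x2 column-major tensor with logical contents {{1, 0}, {0, 2}},
    //         i.e. buffer = {1, 0, 0, 2}
    // row-major pass: emits value 1 at scan coordinate (0,0), value 2 at (1,1)
    // after reversal and sort: values = {1, 2}, indices = {0,0,  1,1}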
+
+// Convert a non-contiguous (strided) tensor; each element is fetched individually
+// through Tensor::Value, which applies the tensor's byte strides.
+template <typename c_index_type, typename c_value_type>
+void ConvertStridedTensor(const Tensor& tensor, c_index_type* indices,
+                          c_value_type* values, const int64_t size) {
+  using ValueType = typename CTypeTraits<c_value_type>::ArrowType;
+  const auto& shape = tensor.shape();
+  const auto ndim = tensor.ndim();
+  std::vector<int64_t> coord(ndim, 0);
+
+  constexpr c_value_type zero = 0;
+  for (int64_t n = tensor.size(); n > 0; --n) {
+    const c_value_type x = tensor.Value<ValueType>(coord);
+    if (ARROW_PREDICT_FALSE(x != zero)) {
+      *values++ = x;
+      for (int64_t i = 0; i < ndim; ++i) {
+        *indices++ = static_cast<c_index_type>(coord[i]);
+      }
+    }
+
+    IncrementRowMajorIndex(coord, shape);
+  }
+}
+
+#define CONVERT_TENSOR(func, index_type, value_type, indices, values, size)     \
+  func<index_type, value_type>(tensor_, reinterpret_cast<index_type*>(indices), \
+                               reinterpret_cast<value_type*>(values), size)
+
+// Using ARROW_EXPAND is necessary to expand __VA_ARGS__ correctly on VC++.
+#define CONVERT_ROW_MAJOR_TENSOR(index_type, value_type, ...) \
+  ARROW_EXPAND(CONVERT_TENSOR(ConvertRowMajorTensor, index_type, value_type, __VA_ARGS__))
+
+#define CONVERT_COLUMN_MAJOR_TENSOR(index_type, value_type, ...) \
+  ARROW_EXPAND(                                                  \
+      CONVERT_TENSOR(ConvertColumnMajorTensor, index_type, value_type, __VA_ARGS__))
+
+#define CONVERT_STRIDED_TENSOR(index_type, value_type, ...) \
+  ARROW_EXPAND(CONVERT_TENSOR(ConvertStridedTensor, index_type, value_type, __VA_ARGS__))
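For concreteness, one instantiation of the dispatch path (the int32_t/double type pair here is illustrative): CONVERT_ROW_MAJOR_TENSOR(int32_t, double, indices, values, size) expands, via CONVERT_TENSOR, to

    ConvertRowMajorTensor<int32_t, double>(tensor_, reinterpret_cast<int32_t*>(indices),
                                           reinterpret_cast<double*>(values), size);

Note that the macros reference `tensor_` implicitly, so they are only usable inside the converter class below.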
+
+// ----------------------------------------------------------------------
+// SparseTensorConverter for SparseCOOIndex
+
+class SparseCOOTensorConverter : private SparseTensorConverterMixin {
+  using SparseTensorConverterMixin::AssignIndex;
+  using SparseTensorConverterMixin::IsNonZero;
+
+ public:
+  SparseCOOTensorConverter(const Tensor& tensor,
+                           const std::shared_ptr<DataType>& index_value_type,
+                           MemoryPool* pool)
+      : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
+
+  Status Convert() {
+    RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
+                                                                  tensor_.shape()));
+
+    const int index_elsize = GetByteWidth(*index_value_type_);
+    const int value_elsize = GetByteWidth(*tensor_.type());
+
+    const int64_t ndim = tensor_.ndim();
+    ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
+
+    ARROW_ASSIGN_OR_RAISE(auto indices_buffer,
+                          AllocateBuffer(index_elsize * ndim * nonzero_count, pool_));
+    uint8_t* indices = indices_buffer->mutable_data();
+
+    ARROW_ASSIGN_OR_RAISE(auto values_buffer,
+                          AllocateBuffer(value_elsize * nonzero_count, pool_));
+    uint8_t* values = values_buffer->mutable_data();
+
+    const uint8_t* tensor_data = tensor_.raw_data();
+    if (ndim <= 1) {
+      const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0];
+      for (int64_t i = 0; i < count; ++i) {
+        if (std::any_of(tensor_data, tensor_data + value_elsize, IsNonZero)) {
+          AssignIndex(indices, i, index_elsize);
+          std::copy_n(tensor_data, value_elsize, values);
+
+          indices += index_elsize;
+          values += value_elsize;
+        }
+        tensor_data += value_elsize;
+      }
+    } else if (tensor_.is_row_major()) {
+      DISPATCH(CONVERT_ROW_MAJOR_TENSOR, index_elsize, value_elsize, indices, values,
+               nonzero_count);
+    } else if (tensor_.is_column_major()) {
+      DISPATCH(CONVERT_COLUMN_MAJOR_TENSOR, index_elsize, value_elsize, indices, values,
+               nonzero_count);
+    } else {
+      DISPATCH(CONVERT_STRIDED_TENSOR, index_elsize, value_elsize, indices, values,
+               nonzero_count);
+    }
+
+    // assemble the COO index tensor and the value buffer
+    const std::vector<int64_t> indices_shape = {nonzero_count, ndim};
+    std::vector<int64_t> indices_strides;
+    RETURN_NOT_OK(internal::ComputeRowMajorStrides(
+        checked_cast<const FixedWidthType&>(*index_value_type_), indices_shape,
+        &indices_strides));
+    auto coords = std::make_shared<Tensor>(index_value_type_, std::move(indices_buffer),
+                                           indices_shape, indices_strides);
+    ARROW_ASSIGN_OR_RAISE(sparse_index, SparseCOOIndex::Make(coords, true));
+    data = std::move(values_buffer);
+
+    return Status::OK();
+  }
+
+  std::shared_ptr<SparseCOOIndex> sparse_index;
+  std::shared_ptr<Buffer> data;
+
+ private:
+  const Tensor& tensor_;
+  const std::shared_ptr<DataType>& index_value_type_;
+  MemoryPool* pool_;
+};
+
+}  // namespace
+
+void SparseTensorConverterMixin::AssignIndex(uint8_t* indices, int64_t val,
+                                             const int elsize) {
+  switch (elsize) {
+    case 1:
+      *indices = static_cast<uint8_t>(val);
+      break;
+    case 2:
+      *reinterpret_cast<uint16_t*>(indices) = static_cast<uint16_t>(val);
+      break;
+    case 4:
+      *reinterpret_cast<uint32_t*>(indices) = static_cast<uint32_t>(val);
+      break;
+    case 8:
+      *reinterpret_cast<int64_t*>(indices) = val;
+      break;
+    default:
+      break;
+  }
+}
+
+int64_t SparseTensorConverterMixin::GetIndexValue(const uint8_t* value_ptr,
+                                                  const int elsize) {
+  switch (elsize) {
+    case 1:
+      return *value_ptr;
+
+    case 2:
+      return *reinterpret_cast<const uint16_t*>(value_ptr);
+
+    case 4:
+      return *reinterpret_cast<const uint32_t*>(value_ptr);
+
+    case 8:
+      return *reinterpret_cast<const int64_t*>(value_ptr);
+
+    default:
+      return 0;
+  }
+}
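A quick round-trip sketch for the two helpers above (assuming the mixin's static helpers are accessible at the call site, and little-endian layout, as the reinterpret_cast loads imply):

    uint8_t buf[8] = {0};
    SparseTensorConverterMixin::AssignIndex(buf, 300, 2);  // buf = {0x2C, 0x01, ...}
    int64_t v = SparseTensorConverterMixin::GetIndexValue(buf, 2);  // v == 300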
+
+Status MakeSparseCOOTensorFromTensor(const Tensor& tensor,
+                                     const std::shared_ptr<DataType>& index_value_type,
+                                     MemoryPool* pool,
+                                     std::shared_ptr<SparseIndex>* out_sparse_index,
+                                     std::shared_ptr<Buffer>* out_data) {
+  SparseCOOTensorConverter converter(tensor, index_value_type, pool);
+  RETURN_NOT_OK(converter.Convert());
+
+  *out_sparse_index = checked_pointer_cast<SparseIndex>(converter.sparse_index);
+  *out_data = converter.data;
+  return Status::OK();
+}
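A minimal usage sketch for this entry point (the 2x3 input values are hypothetical; it uses Tensor::Make and Buffer::Wrap from Arrow's public API, with includes elided and error handling abbreviated):

    arrow::Status SparseCooExample() {
      const std::vector<double> dense = {0, 1, 0, 2, 0, 3};
      ARROW_ASSIGN_OR_RAISE(
          auto tensor,
          arrow::Tensor::Make(arrow::float64(), arrow::Buffer::Wrap(dense), {2, 3}));
      std::shared_ptr<arrow::SparseIndex> sparse_index;
      std::shared_ptr<arrow::Buffer> values;
      ARROW_RETURN_NOT_OK(arrow::internal::MakeSparseCOOTensorFromTensor(
          *tensor, arrow::int64(), arrow::default_memory_pool(), &sparse_index,
          &values));
      // sparse_index holds coordinates {0,1}, {1,0}, {1,2}; values holds {1, 2, 3}
      return arrow::Status::OK();
    }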
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCOOTensor(
+    MemoryPool* pool, const SparseCOOTensor* sparse_tensor) {
+  const auto& sparse_index =
+      checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
+  const auto& coords = sparse_index.indices();
+  const auto* coords_data = coords->raw_data();
+
+  const int index_elsize = GetByteWidth(*coords->type());
+
+  const auto& value_type = checked_cast<const FixedWidthType&>(*sparse_tensor->type());
+  const int value_elsize = GetByteWidth(value_type);
+  ARROW_ASSIGN_OR_RAISE(auto values_buffer,
+                        AllocateBuffer(value_elsize * sparse_tensor->size(), pool));
+  auto values = values_buffer->mutable_data();
+  std::fill_n(values, value_elsize * sparse_tensor->size(), 0);
+
+  std::vector<int64_t> strides;
+  RETURN_NOT_OK(ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides));
+
+  const auto* raw_data = sparse_tensor->raw_data();
+  const int ndim = sparse_tensor->ndim();
+
+  for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) {
+    int64_t offset = 0;
+
+    for (int j = 0; j < ndim; ++j) {
+      auto index = static_cast<int64_t>(
+          SparseTensorConverterMixin::GetIndexValue(coords_data, index_elsize));
+      offset += index * strides[j];
+      coords_data += index_elsize;
+    }
+
+    std::copy_n(raw_data, value_elsize, values + offset);
+    raw_data += value_elsize;
+  }
+
+  return std::make_shared<Tensor>(sparse_tensor->type(), std::move(values_buffer),
+                                  sparse_tensor->shape(), strides,
+                                  sparse_tensor->dim_names());
+}
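The offset arithmetic in the loop above is in bytes, since ComputeRowMajorStrides returns byte strides. A worked instance for a hypothetical 2x3 float64 tensor:

    // strides = {3 * 8, 8} = {24, 8} bytes
    // coordinate (1, 2) -> offset = 1*24 + 2*8 = 40 bytes, i.e. element index 5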
+
+}  // namespace internal
+}  // namespace arrow