--- /dev/null
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/tensor/converter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/sort.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace internal {
+namespace {
+
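+// Advance `coord` to the next element in traversal order, odometer-style:
+// the axis axis_order[ndim - 1] varies fastest, and when a digit reaches its
+// extent in `shape` it is reset to zero and the carry propagates to the
+// preceding axis in axis_order.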
+inline void IncrementIndex(std::vector<int64_t>& coord, const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& axis_order) {
+ const int64_t ndim = shape.size();
+ const int64_t last_axis = axis_order[ndim - 1];
+ ++coord[last_axis];
+ if (coord[last_axis] == shape[last_axis]) {
+ int64_t d = ndim - 1;
+ while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) {
+ coord[axis_order[d]] = 0;
+ ++coord[axis_order[d - 1]];
+ --d;
+ }
+ }
+}
+
+// ----------------------------------------------------------------------
+// SparseTensorConverter for SparseCSFIndex
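+//
+// CSF (compressed sparse fiber) stores the nonzero elements of an
+// N-dimensional tensor as a prefix tree over their coordinates, traversed in
+// the order given by axis_order: indices[i] holds the coordinate (along
+// dimension axis_order[i]) of every node at level i, and indptr[i] holds, for
+// each node at level i, the start of its range of children at level i + 1.
+// The leaves at level N - 1 correspond one-to-one to the nonzero values.
+//
+// Illustrative example: the row-major 2x3 tensor
+//
+//   | 1 0 2 |
+//   | 0 0 3 |
+//
+// with axis_order {0, 1} is encoded as
+//
+//   values     = {1, 2, 3}
+//   indices[0] = {0, 1}        (rows that contain a nonzero)
+//   indices[1] = {0, 2, 2}     (column of each nonzero)
+//   indptr[0]  = {0, 2, 3}     (row k's nonzeros span [indptr[0][k], indptr[0][k+1]))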
+
+class SparseCSFTensorConverter : private SparseTensorConverterMixin {
+ using SparseTensorConverterMixin::AssignIndex;
+ using SparseTensorConverterMixin::IsNonZero;
+
+ public:
+ SparseCSFTensorConverter(const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool)
+ : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
+
+ Status Convert() {
+ RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
+ tensor_.shape()));
+
+ const int index_elsize = GetByteWidth(*index_value_type_);
+ const int value_elsize = GetByteWidth(*tensor_.type());
+
+ const int64_t ndim = tensor_.ndim();
+    // Sorting the axes in ascending order of dimension size is a good
+    // heuristic for choosing the axis order, but it is not necessarily optimal.
+ std::vector<int64_t> axis_order = internal::ArgSort(tensor_.shape());
+ ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
+
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer,
+ AllocateBuffer(value_elsize * nonzero_count, pool_));
+ auto* values = values_buffer->mutable_data();
+
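+    // counts[i]: number of nodes emitted so far at level i of the CSF tree.
+    // coord: coordinate of the element currently being visited.
+    // previous_coord: coordinate of the previously visited nonzero element,
+    // used to detect the level at which the tree must branch.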
+ std::vector<int64_t> counts(ndim, 0);
+ std::vector<int64_t> coord(ndim, 0);
+ std::vector<int64_t> previous_coord(ndim, -1);
+ std::vector<BufferBuilder> indptr_buffer_builders(ndim - 1);
+ std::vector<BufferBuilder> indices_buffer_builders(ndim);
+
+ const auto* tensor_data = tensor_.raw_data();
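+    // Scratch space wide enough to hold one index of any supported width.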
+ uint8_t index_buffer[sizeof(int64_t)];
+
+ if (ndim <= 1) {
+      return Status::NotImplemented(
+          "Conversion to SparseCSFTensor is not supported for tensors with ndim <= 1");
+ } else {
+ const auto& shape = tensor_.shape();
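+      // Visit every element in the traversal order defined by axis_order
+      // (axis_order[0] varies slowest), appending each nonzero value and
+      // updating the per-level index builders.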
+ for (int64_t n = tensor_.size(); n > 0; n--) {
+ const auto offset = tensor_.CalculateValueOffset(coord);
+ const auto xp = tensor_data + offset;
+
+ if (std::any_of(xp, xp + value_elsize, IsNonZero)) {
+ bool tree_split = false;
+
+ std::copy_n(xp, value_elsize, values);
+ values += value_elsize;
+
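+          // Once the coordinate diverges from the previous nonzero's
+          // coordinate at some level, a new node must be emitted at that
+          // level and at every deeper level.  Emitting a node at level i
+          // also records in indptr[i] where its children start in level i + 1.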
+ for (int64_t i = 0; i < ndim; ++i) {
+ int64_t dimension = axis_order[i];
+
+ tree_split = tree_split || (coord[dimension] != previous_coord[dimension]);
+ if (tree_split) {
+ if (i < ndim - 1) {
+ AssignIndex(index_buffer, counts[i + 1], index_elsize);
+ RETURN_NOT_OK(
+ indptr_buffer_builders[i].Append(index_buffer, index_elsize));
+ }
+
+ AssignIndex(index_buffer, coord[dimension], index_elsize);
+ RETURN_NOT_OK(
+ indices_buffer_builders[i].Append(index_buffer, index_elsize));
+
+ ++counts[i];
+ }
+ }
+
+ previous_coord = coord;
+ }
+
+ IncrementIndex(coord, shape, axis_order);
+ }
+ }
+
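+    // Close the last child range of each non-leaf level by appending the
+    // total number of nodes at the next level to its indptr.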
+ for (int64_t column = 0; column < ndim - 1; ++column) {
+ AssignIndex(index_buffer, counts[column + 1], index_elsize);
+ RETURN_NOT_OK(indptr_buffer_builders[column].Append(index_buffer, index_elsize));
+ }
+
+    // Finalize the value buffer and the index buffers, then build the
+    // SparseCSFIndex.
+ data = std::move(values_buffer);
+
+ std::vector<std::shared_ptr<Buffer>> indptr_buffers(ndim - 1);
+ std::vector<std::shared_ptr<Buffer>> indices_buffers(ndim);
+ std::vector<int64_t> indptr_shapes(counts.begin(), counts.end() - 1);
+ std::vector<int64_t> indices_shapes = counts;
+
+ for (int64_t column = 0; column < ndim; ++column) {
+ RETURN_NOT_OK(
+ indices_buffer_builders[column].Finish(&indices_buffers[column], true));
+ }
+ for (int64_t column = 0; column < ndim - 1; ++column) {
+ RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true));
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order,
+ indptr_buffers, indices_buffers));
+ return Status::OK();
+ }
+
+ std::shared_ptr<SparseCSFIndex> sparse_index;
+ std::shared_ptr<Buffer> data;
+
+ private:
+ const Tensor& tensor_;
+ const std::shared_ptr<DataType>& index_value_type_;
+ MemoryPool* pool_;
+};
+
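+// Converter that expands a SparseCSFTensor back into a dense (row-major)
+// Tensor.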
+class TensorBuilderFromSparseCSFTensor : private SparseTensorConverterMixin {
+ using SparseTensorConverterMixin::GetIndexValue;
+
+ MemoryPool* pool_;
+ const SparseCSFTensor* sparse_tensor_;
+ const SparseCSFIndex* sparse_index_;
+ const std::vector<std::shared_ptr<Tensor>>& indptr_;
+ const std::vector<std::shared_ptr<Tensor>>& indices_;
+ const std::vector<int64_t>& axis_order_;
+ const std::vector<int64_t>& shape_;
+ const int64_t non_zero_length_;
+ const int ndim_;
+ const int64_t tensor_size_;
+ const FixedWidthType& value_type_;
+ const int value_elsize_;
+ const uint8_t* raw_data_;
+ std::vector<int64_t> strides_;
+ std::shared_ptr<Buffer> values_buffer_;
+ uint8_t* values_;
+
+ public:
+ TensorBuilderFromSparseCSFTensor(const SparseCSFTensor* sparse_tensor, MemoryPool* pool)
+ : pool_(pool),
+ sparse_tensor_(sparse_tensor),
+ sparse_index_(
+ checked_cast<const SparseCSFIndex*>(sparse_tensor->sparse_index().get())),
+ indptr_(sparse_index_->indptr()),
+ indices_(sparse_index_->indices()),
+ axis_order_(sparse_index_->axis_order()),
+ shape_(sparse_tensor->shape()),
+ non_zero_length_(sparse_tensor->non_zero_length()),
+ ndim_(sparse_tensor->ndim()),
+ tensor_size_(sparse_tensor->size()),
+ value_type_(checked_cast<const FixedWidthType&>(*sparse_tensor->type())),
+ value_elsize_(GetByteWidth(value_type_)),
+ raw_data_(sparse_tensor->raw_data()) {}
+
+ int ElementSize(const std::shared_ptr<Tensor>& tensor) const {
+ return GetByteWidth(*tensor->type());
+ }
+
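+  // Allocate a zero-initialized row-major buffer and scatter each nonzero
+  // value into it by walking the CSF tree from the root level.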
+ Result<std::shared_ptr<Tensor>> Build() {
+ RETURN_NOT_OK(internal::ComputeRowMajorStrides(value_type_, shape_, &strides_));
+
+ ARROW_ASSIGN_OR_RAISE(values_buffer_,
+ AllocateBuffer(value_elsize_ * tensor_size_, pool_));
+ values_ = values_buffer_->mutable_data();
+ std::fill_n(values_, value_elsize_ * tensor_size_, 0);
+
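+    // The root level contains indptr_[0]->size() - 1 nodes; expand all of them.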
+ const int64_t start = 0;
+ const int64_t stop = indptr_[0]->size() - 1;
+ ExpandValues(0, 0, start, stop);
+
+ return std::make_shared<Tensor>(sparse_tensor_->type(), std::move(values_buffer_),
+ shape_, strides_, sparse_tensor_->dim_names());
+ }
+
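+  // Recursively expand one level of the CSF tree.  [start, stop) is the range
+  // of nodes at this level and dim_offset is the byte offset accumulated from
+  // the outer dimensions.  At the leaf level the value is copied to its dense
+  // position; otherwise indptr_[dim] gives each node's child range at the
+  // next level.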
+ void ExpandValues(const int64_t dim, const int64_t dim_offset, const int64_t start,
+ const int64_t stop) {
+ const auto& cur_indices = indices_[dim];
+ const int indices_elsize = ElementSize(cur_indices);
+ const auto* indices_data = cur_indices->raw_data() + start * indices_elsize;
+
+ if (dim == ndim_ - 1) {
+ for (auto i = start; i < stop; ++i) {
+ const int64_t index =
+ SparseTensorConverterMixin::GetIndexValue(indices_data, indices_elsize);
+ const int64_t offset = dim_offset + index * strides_[axis_order_[dim]];
+
+ std::copy_n(raw_data_ + i * value_elsize_, value_elsize_, values_ + offset);
+
+ indices_data += indices_elsize;
+ }
+ } else {
+ const auto& cur_indptr = indptr_[dim];
+ const int indptr_elsize = ElementSize(cur_indptr);
+ const auto* indptr_data = cur_indptr->raw_data() + start * indptr_elsize;
+
+ for (int64_t i = start; i < stop; ++i) {
+ const int64_t index =
+ SparseTensorConverterMixin::GetIndexValue(indices_data, indices_elsize);
+ const int64_t offset = dim_offset + index * strides_[axis_order_[dim]];
+ const int64_t next_start = GetIndexValue(indptr_data, indptr_elsize);
+ const int64_t next_stop =
+ GetIndexValue(indptr_data + indptr_elsize, indptr_elsize);
+
+ ExpandValues(dim + 1, offset, next_start, next_stop);
+
+ indices_data += indices_elsize;
+ indptr_data += indptr_elsize;
+ }
+ }
+ }
+};
+
+} // namespace
+
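+// Convert a dense tensor into the sparse index and values buffer of a CSF
+// sparse tensor.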
+Status MakeSparseCSFTensorFromTensor(const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool,
+ std::shared_ptr<SparseIndex>* out_sparse_index,
+ std::shared_ptr<Buffer>* out_data) {
+ SparseCSFTensorConverter converter(tensor, index_value_type, pool);
+ RETURN_NOT_OK(converter.Convert());
+
+ *out_sparse_index = checked_pointer_cast<SparseIndex>(converter.sparse_index);
+ *out_data = converter.data;
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
+ MemoryPool* pool, const SparseCSFTensor* sparse_tensor) {
+ TensorBuilderFromSparseCSFTensor builder(sparse_tensor, pool);
+ return builder.Build();
+}
+
+} // namespace internal
+} // namespace arrow