--- /dev/null
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// DictionaryArray
+
+/// \brief Array type for dictionary-encoded data with a
+/// data-dependent dictionary
+///
+/// A dictionary array contains an array of non-negative integers (the
+/// "dictionary indices") along with a "dictionary" array holding the
+/// distinct values represented in the data.
+///
+/// For example, the array
+///
+/// ["foo", "bar", "foo", "bar", "foo", "bar"]
+///
+/// with dictionary ["bar", "foo"], would have dictionary array representation
+///
+/// indices: [1, 0, 1, 0, 1, 0]
+/// dictionary: ["bar", "foo"]
+///
+/// The indices in principle may be any integer type.
+class ARROW_EXPORT DictionaryArray : public Array {
+ public:
+ using TypeClass = DictionaryType;
+
+ explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);
+
+ DictionaryArray(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& dictionary);
+
+ /// \brief Construct and validate a DictionaryArray from indices and a dictionary
+ ///
+ /// This function does the validation of the indices and input type. It checks if
+ /// all indices are non-negative and smaller than the size of the dictionary.
+ /// The overload without `type` builds it from the types of the two arrays.
+ ///
+ /// \param[in] type a dictionary type
+ /// \param[in] indices an array of non-negative integers smaller than the
+ /// size of the dictionary
+ /// \param[in] dictionary the dictionary with same value type as the
+ /// type object
+ static Result<std::shared_ptr<Array>> FromArrays(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& dictionary);
+
+ static Result<std::shared_ptr<Array>> FromArrays(
+ const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
+ return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
+ dictionary);
+ }
+
+ /// \brief Transpose this DictionaryArray
+ ///
+ /// This method constructs a new dictionary array with the given dictionary
+ /// type, transposing indices using the transpose map. The type and the
+ /// transpose map are typically computed using DictionaryUnifier.
+ ///
+ /// \param[in] type the new type object
+ /// \param[in] dictionary the new dictionary
+ /// \param[in] transpose_map transposition array of this array's indices
+ /// into the target array's indices
+ /// \param[in] pool a pool to allocate the array data from
+ Result<std::shared_ptr<Array>> Transpose(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
+ const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;
+
+ /// \brief Determine whether dictionary arrays may be compared without unification
+ bool CanCompareIndices(const DictionaryArray& other) const;
+
+ /// \brief Return the dictionary for this array, which is stored as
+ /// a member of the ArrayData internal structure
+ std::shared_ptr<Array> dictionary() const;
+ std::shared_ptr<Array> indices() const;  ///< The dictionary indices as an Array
+
+ /// \brief Return the ith value of indices, cast to int64_t. Not recommended
+ /// for use in performance-sensitive code. Does not validate whether the
+ /// value is null or out-of-bounds.
+ int64_t GetValueIndex(int64_t i) const;
+
+ const DictionaryType* dict_type() const { return dict_type_; }  ///< Stored type pointer
+
+ private:
+ void SetData(const std::shared_ptr<ArrayData>& data);
+ const DictionaryType* dict_type_;
+ std::shared_ptr<Array> indices_;
+
+ // Lazily initialized when invoking dictionary(); mutable because dictionary() is const
+ mutable std::shared_ptr<Array> dictionary_;
+};
+
+/// \brief Helper class for incremental dictionary unification
+class ARROW_EXPORT DictionaryUnifier {
+ public:
+ virtual ~DictionaryUnifier() = default;
+
+ /// \brief Construct a DictionaryUnifier
+ /// \param[in] value_type the data type of the dictionaries
+ /// \param[in] pool MemoryPool to use for memory allocations
+ static Result<std::unique_ptr<DictionaryUnifier>> Make(
+ std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
+
+ /// \brief Unify dictionaries across array chunks
+ ///
+ /// The dictionaries in the array chunks will be unified, their indices
+ /// accordingly transposed.
+ ///
+ /// Only dictionaries with a primitive value type are currently supported.
+ /// However, dictionaries nested inside a more complex type are correctly unified.
+ static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
+ const std::shared_ptr<ChunkedArray>& array,
+ MemoryPool* pool = default_memory_pool());
+
+ /// \brief Unify dictionaries across the chunks of each table column
+ ///
+ /// The dictionaries in each table column will be unified, their indices
+ /// accordingly transposed.
+ ///
+ /// Only dictionaries with a primitive value type are currently supported.
+ /// However, dictionaries nested inside a more complex type are correctly unified.
+ static Result<std::shared_ptr<Table>> UnifyTable(
+ const Table& table, MemoryPool* pool = default_memory_pool());
+
+ /// \brief Append dictionary to the internal memo
+ virtual Status Unify(const Array& dictionary) = 0;
+
+ /// \brief Append dictionary and compute transpose indices
+ /// \param[in] dictionary the dictionary values to unify
+ /// \param[out] out_transpose a Buffer containing computed transpose indices
+ /// as int32_t values equal in length to the passed dictionary. The value in
+ /// each slot corresponds to the new index value for each original index
+ /// for a DictionaryArray with the old dictionary
+ virtual Status Unify(const Array& dictionary,
+ std::shared_ptr<Buffer>* out_transpose) = 0;
+
+ /// \brief Return a result DictionaryType with the smallest possible index
+ /// type to accommodate the unified dictionary. The unifier cannot be used
+ /// after this is called
+ virtual Status GetResult(std::shared_ptr<DataType>* out_type,
+ std::shared_ptr<Array>* out_dict) = 0;
+
+ /// \brief Return a unified dictionary with the given index type. If
+ /// the index type is not large enough then an invalid status will be returned.
+ /// The unifier cannot be used after this is called
+ virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
+ std::shared_ptr<Array>* out_dict) = 0;
+};
+
+} // namespace arrow