]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include "arrow/array/builder_dict.h" | |
19 | ||
20 | #include <cstdint> | |
21 | #include <utility> | |
22 | ||
23 | #include "arrow/array/dict_internal.h" | |
24 | #include "arrow/status.h" | |
25 | #include "arrow/type.h" | |
26 | #include "arrow/type_traits.h" | |
27 | #include "arrow/util/checked_cast.h" | |
28 | #include "arrow/util/hashing.h" | |
29 | #include "arrow/util/logging.h" | |
30 | #include "arrow/visitor_inline.h" | |
31 | ||
32 | namespace arrow { | |
33 | ||
34 | // ---------------------------------------------------------------------- | |
35 | // DictionaryBuilder | |
36 | ||
37 | namespace internal { | |
38 | ||
39 | class DictionaryMemoTable::DictionaryMemoTableImpl { | |
40 | // Type-dependent visitor for memo table initialization | |
41 | struct MemoTableInitializer { | |
42 | std::shared_ptr<DataType> value_type_; | |
43 | MemoryPool* pool_; | |
44 | std::unique_ptr<MemoTable>* memo_table_; | |
45 | ||
46 | template <typename T> | |
47 | enable_if_no_memoize<T, Status> Visit(const T&) { | |
48 | return Status::NotImplemented("Initialization of ", value_type_->ToString(), | |
49 | " memo table is not implemented"); | |
50 | } | |
51 | ||
52 | template <typename T> | |
53 | enable_if_memoize<T, Status> Visit(const T&) { | |
54 | using MemoTable = typename DictionaryTraits<T>::MemoTableType; | |
55 | memo_table_->reset(new MemoTable(pool_, 0)); | |
56 | return Status::OK(); | |
57 | } | |
58 | }; | |
59 | ||
60 | // Type-dependent visitor for memo table insertion | |
61 | struct ArrayValuesInserter { | |
62 | DictionaryMemoTableImpl* impl_; | |
63 | const Array& values_; | |
64 | ||
65 | template <typename T> | |
66 | Status Visit(const T& type) { | |
67 | using ArrayType = typename TypeTraits<T>::ArrayType; | |
68 | return InsertValues(type, checked_cast<const ArrayType&>(values_)); | |
69 | } | |
70 | ||
71 | private: | |
72 | template <typename T, typename ArrayType> | |
73 | enable_if_no_memoize<T, Status> InsertValues(const T& type, const ArrayType&) { | |
74 | return Status::NotImplemented("Inserting array values of ", type, | |
75 | " is not implemented"); | |
76 | } | |
77 | ||
78 | template <typename T, typename ArrayType> | |
79 | enable_if_memoize<T, Status> InsertValues(const T&, const ArrayType& array) { | |
80 | if (array.null_count() > 0) { | |
81 | return Status::Invalid("Cannot insert dictionary values containing nulls"); | |
82 | } | |
83 | for (int64_t i = 0; i < array.length(); ++i) { | |
84 | int32_t unused_memo_index; | |
85 | RETURN_NOT_OK(impl_->GetOrInsert<T>(array.GetView(i), &unused_memo_index)); | |
86 | } | |
87 | return Status::OK(); | |
88 | } | |
89 | }; | |
90 | ||
91 | // Type-dependent visitor for building ArrayData from memo table | |
92 | struct ArrayDataGetter { | |
93 | std::shared_ptr<DataType> value_type_; | |
94 | MemoTable* memo_table_; | |
95 | MemoryPool* pool_; | |
96 | int64_t start_offset_; | |
97 | std::shared_ptr<ArrayData>* out_; | |
98 | ||
99 | template <typename T> | |
100 | enable_if_no_memoize<T, Status> Visit(const T&) { | |
101 | return Status::NotImplemented("Getting array data of ", value_type_, | |
102 | " is not implemented"); | |
103 | } | |
104 | ||
105 | template <typename T> | |
106 | enable_if_memoize<T, Status> Visit(const T&) { | |
107 | using ConcreteMemoTable = typename DictionaryTraits<T>::MemoTableType; | |
108 | auto memo_table = checked_cast<ConcreteMemoTable*>(memo_table_); | |
109 | return DictionaryTraits<T>::GetDictionaryArrayData(pool_, value_type_, *memo_table, | |
110 | start_offset_, out_); | |
111 | } | |
112 | }; | |
113 | ||
114 | public: | |
115 | DictionaryMemoTableImpl(MemoryPool* pool, std::shared_ptr<DataType> type) | |
116 | : pool_(pool), type_(std::move(type)), memo_table_(nullptr) { | |
117 | MemoTableInitializer visitor{type_, pool_, &memo_table_}; | |
118 | ARROW_CHECK_OK(VisitTypeInline(*type_, &visitor)); | |
119 | } | |
120 | ||
121 | Status InsertValues(const Array& array) { | |
122 | if (!array.type()->Equals(*type_)) { | |
123 | return Status::Invalid("Array value type does not match memo type: ", | |
124 | array.type()->ToString()); | |
125 | } | |
126 | ArrayValuesInserter visitor{this, array}; | |
127 | return VisitTypeInline(*array.type(), &visitor); | |
128 | } | |
129 | ||
130 | template <typename PhysicalType, | |
131 | typename CType = typename DictionaryValue<PhysicalType>::type> | |
132 | Status GetOrInsert(CType value, int32_t* out) { | |
133 | using ConcreteMemoTable = typename DictionaryTraits<PhysicalType>::MemoTableType; | |
134 | return checked_cast<ConcreteMemoTable*>(memo_table_.get())->GetOrInsert(value, out); | |
135 | } | |
136 | ||
137 | Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out) { | |
138 | ArrayDataGetter visitor{type_, memo_table_.get(), pool_, start_offset, out}; | |
139 | return VisitTypeInline(*type_, &visitor); | |
140 | } | |
141 | ||
142 | int32_t size() const { return memo_table_->size(); } | |
143 | ||
144 | private: | |
145 | MemoryPool* pool_; | |
146 | std::shared_ptr<DataType> type_; | |
147 | std::unique_ptr<MemoTable> memo_table_; | |
148 | }; | |
149 | ||
150 | DictionaryMemoTable::DictionaryMemoTable(MemoryPool* pool, | |
151 | const std::shared_ptr<DataType>& type) | |
152 | : impl_(new DictionaryMemoTableImpl(pool, type)) {} | |
153 | ||
154 | DictionaryMemoTable::DictionaryMemoTable(MemoryPool* pool, | |
155 | const std::shared_ptr<Array>& dictionary) | |
156 | : impl_(new DictionaryMemoTableImpl(pool, dictionary->type())) { | |
157 | ARROW_CHECK_OK(impl_->InsertValues(*dictionary)); | |
158 | } | |
159 | ||
160 | DictionaryMemoTable::~DictionaryMemoTable() = default; | |
161 | ||
162 | #define GET_OR_INSERT(ARROW_TYPE) \ | |
163 | Status DictionaryMemoTable::GetOrInsert( \ | |
164 | const ARROW_TYPE*, typename ARROW_TYPE::c_type value, int32_t* out) { \ | |
165 | return impl_->GetOrInsert<ARROW_TYPE>(value, out); \ | |
166 | } | |
167 | ||
168 | GET_OR_INSERT(BooleanType) | |
169 | GET_OR_INSERT(Int8Type) | |
170 | GET_OR_INSERT(Int16Type) | |
171 | GET_OR_INSERT(Int32Type) | |
172 | GET_OR_INSERT(Int64Type) | |
173 | GET_OR_INSERT(UInt8Type) | |
174 | GET_OR_INSERT(UInt16Type) | |
175 | GET_OR_INSERT(UInt32Type) | |
176 | GET_OR_INSERT(UInt64Type) | |
177 | GET_OR_INSERT(FloatType) | |
178 | GET_OR_INSERT(DoubleType) | |
179 | GET_OR_INSERT(DurationType); | |
180 | GET_OR_INSERT(TimestampType); | |
181 | GET_OR_INSERT(Date32Type); | |
182 | GET_OR_INSERT(Date64Type); | |
183 | GET_OR_INSERT(Time32Type); | |
184 | GET_OR_INSERT(Time64Type); | |
185 | GET_OR_INSERT(MonthDayNanoIntervalType); | |
186 | GET_OR_INSERT(DayTimeIntervalType); | |
187 | GET_OR_INSERT(MonthIntervalType); | |
188 | ||
189 | #undef GET_OR_INSERT | |
190 | ||
191 | Status DictionaryMemoTable::GetOrInsert(const BinaryType*, util::string_view value, | |
192 | int32_t* out) { | |
193 | return impl_->GetOrInsert<BinaryType>(value, out); | |
194 | } | |
195 | ||
196 | Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, util::string_view value, | |
197 | int32_t* out) { | |
198 | return impl_->GetOrInsert<LargeBinaryType>(value, out); | |
199 | } | |
200 | ||
201 | Status DictionaryMemoTable::GetArrayData(int64_t start_offset, | |
202 | std::shared_ptr<ArrayData>* out) { | |
203 | return impl_->GetArrayData(start_offset, out); | |
204 | } | |
205 | ||
206 | Status DictionaryMemoTable::InsertValues(const Array& array) { | |
207 | return impl_->InsertValues(array); | |
208 | } | |
209 | ||
210 | int32_t DictionaryMemoTable::size() const { return impl_->size(); } | |
211 | ||
212 | } // namespace internal | |
213 | } // namespace arrow |