]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include <algorithm> | |
19 | #include <cmath> | |
20 | ||
21 | #include "feather_reader.h" | |
22 | ||
23 | #include <arrow/array/array_base.h> | |
24 | #include <arrow/array/builder_base.h> | |
25 | #include <arrow/array/builder_primitive.h> | |
26 | #include <arrow/io/file.h> | |
27 | #include <arrow/ipc/feather.h> | |
28 | #include <arrow/result.h> | |
29 | #include <arrow/status.h> | |
30 | #include <arrow/table.h> | |
31 | #include <arrow/type.h> | |
32 | #include <arrow/type_traits.h> | |
33 | #include <arrow/util/bitmap_visit.h> | |
34 | #include <mex.h> | |
35 | ||
36 | #include "matlab_traits.h" | |
37 | #include "util/handle_status.h" | |
38 | #include "util/unicode_conversion.h" | |
39 | ||
40 | namespace arrow { | |
41 | namespace matlab { | |
42 | namespace internal { | |
43 | ||
44 | // Read the name of variable i from the Feather file as a mxArray*. | |
45 | mxArray* ReadVariableName(const std::string& column_name) { | |
46 | return matlab::util::ConvertUTF8StringToUTF16CharMatrix(column_name); | |
47 | } | |
48 | ||
49 | template <typename ArrowDataType> | |
50 | mxArray* ReadNumericVariableData(const std::shared_ptr<Array>& column) { | |
51 | using MatlabType = typename MatlabTraits<ArrowDataType>::MatlabType; | |
52 | using ArrowArrayType = typename TypeTraits<ArrowDataType>::ArrayType; | |
53 | ||
54 | const mxClassID matlab_class_id = MatlabTraits<ArrowDataType>::matlab_class_id; | |
55 | // Allocate a numeric mxArray* with the correct mxClassID based on the type of the | |
56 | // arrow::Array. | |
57 | mxArray* variable_data = | |
58 | mxCreateNumericMatrix(column->length(), 1, matlab_class_id, mxREAL); | |
59 | ||
60 | auto arrow_numeric_array = | |
61 | std::static_pointer_cast<ArrowArrayType>(column); | |
62 | ||
63 | // Get a raw pointer to the Arrow array data. | |
64 | const MatlabType* source = arrow_numeric_array->raw_values(); | |
65 | ||
66 | // Get a mutable pointer to the MATLAB array data and std::copy the | |
67 | // Arrow array data into it. | |
68 | MatlabType* destination = MatlabTraits<ArrowDataType>::GetData(variable_data); | |
69 | std::copy(source, source + column->length(), destination); | |
70 | ||
71 | return variable_data; | |
72 | } | |
73 | ||
74 | // Read the data of variable i from the Feather file as a mxArray*. | |
75 | mxArray* ReadVariableData(const std::shared_ptr<Array>& column, | |
76 | const std::string& column_name) { | |
77 | std::shared_ptr<DataType> type = column->type(); | |
78 | ||
79 | switch (type->id()) { | |
80 | case Type::FLOAT: | |
81 | return ReadNumericVariableData<FloatType>(column); | |
82 | case Type::DOUBLE: | |
83 | return ReadNumericVariableData<DoubleType>(column); | |
84 | case Type::UINT8: | |
85 | return ReadNumericVariableData<UInt8Type>(column); | |
86 | case Type::UINT16: | |
87 | return ReadNumericVariableData<UInt16Type>(column); | |
88 | case Type::UINT32: | |
89 | return ReadNumericVariableData<UInt32Type>(column); | |
90 | case Type::UINT64: | |
91 | return ReadNumericVariableData<UInt64Type>(column); | |
92 | case Type::INT8: | |
93 | return ReadNumericVariableData<Int8Type>(column); | |
94 | case Type::INT16: | |
95 | return ReadNumericVariableData<Int16Type>(column); | |
96 | case Type::INT32: | |
97 | return ReadNumericVariableData<Int32Type>(column); | |
98 | case Type::INT64: | |
99 | return ReadNumericVariableData<Int64Type>(column); | |
100 | default: { | |
101 | mexErrMsgIdAndTxt("MATLAB:arrow:UnsupportedArrowType", | |
102 | "Unsupported arrow::Type '%s' for variable '%s'", | |
103 | type->name().c_str(), column_name.c_str()); | |
104 | break; | |
105 | } | |
106 | } | |
107 | ||
108 | return nullptr; | |
109 | } | |
110 | ||
111 | // arrow::Buffers are bit-packed, while mxLogical arrays aren't. This utility | |
112 | // uses an Arrow utility to copy each bit of an arrow::Buffer into each byte | |
113 | // of an mxLogical array. | |
114 | void BitUnpackBuffer(const std::shared_ptr<Buffer>& source, int64_t length, | |
115 | mxLogical* destination) { | |
116 | const uint8_t* source_data = source->data(); | |
117 | ||
118 | // Call into an Arrow utility to visit each bit in the bitmap. | |
119 | auto visitFcn = [&](mxLogical is_valid) { *destination++ = is_valid; }; | |
120 | ||
121 | const int64_t start_offset = 0; | |
122 | arrow::internal::VisitBitsUnrolled(source_data, start_offset, length, visitFcn); | |
123 | } | |
124 | ||
125 | // Populates the validity bitmap from an arrow::Array. | |
126 | // writes to a zero-initialized destination buffer. | |
127 | // Implements a fast path for the fully-valid and fully-invalid cases. | |
128 | // Returns true if the destination buffer was successfully populated. | |
129 | bool TryBitUnpackFastPath(const std::shared_ptr<Array>& array, mxLogical* destination) { | |
130 | const int64_t null_count = array->null_count(); | |
131 | const int64_t length = array->length(); | |
132 | ||
133 | if (null_count == length) { | |
134 | // The source array is filled with invalid values. Since mxCreateLogicalMatrix | |
135 | // zero-initializes the destination buffer, we can return without changing anything | |
136 | // in the destination buffer. | |
137 | return true; | |
138 | } else if (null_count == 0) { | |
139 | // The source array contains only valid values. Fill the destination buffer | |
140 | // with 'true'. | |
141 | std::fill(destination, destination + length, true); | |
142 | return true; | |
143 | } | |
144 | ||
145 | // Return false to indicate that we couldn't fill the entire validity bitmap. | |
146 | return false; | |
147 | } | |
148 | ||
149 | // Read the validity (null) bitmap of variable i from the Feather | |
150 | // file as an mxArray*. | |
151 | mxArray* ReadVariableValidityBitmap(const std::shared_ptr<Array>& column) { | |
152 | // Allocate an mxLogical array to store the validity (null) bitmap values. | |
153 | // Note: All Arrow arrays can have an associated validity (null) bitmap. | |
154 | // The Apache Arrow specification defines 0 (false) to represent an | |
155 | // invalid (null) array entry and 1 (true) to represent a valid | |
156 | // (non-null) array entry. | |
157 | mxArray* validity_bitmap = mxCreateLogicalMatrix(column->length(), 1); | |
158 | mxLogical* validity_bitmap_unpacked = mxGetLogicals(validity_bitmap); | |
159 | ||
160 | if (!TryBitUnpackFastPath(column, validity_bitmap_unpacked)) { | |
161 | // Couldn't fill the full validity bitmap at once. Call an optimized loop-unrolled | |
162 | // implementation instead that goes byte-by-byte and populates the validity bitmap. | |
163 | BitUnpackBuffer(column->null_bitmap(), column->length(), validity_bitmap_unpacked); | |
164 | } | |
165 | ||
166 | return validity_bitmap; | |
167 | } | |
168 | ||
169 | // Read the type name of an arrow::Array as an mxChar array. | |
170 | mxArray* ReadVariableType(const std::shared_ptr<Array>& column) { | |
171 | return util::ConvertUTF8StringToUTF16CharMatrix(column->type()->name()); | |
172 | } | |
173 | ||
// Upper bound (2^48) used to reject tables whose dimensions are too large to
// be represented as MATLAB arrays.
static constexpr uint64_t MAX_MATLAB_SIZE = uint64_t{1} << 48;
176 | ||
177 | } // namespace internal | |
178 | ||
179 | Status FeatherReader::Open(const std::string& filename, | |
180 | std::shared_ptr<FeatherReader>* feather_reader) { | |
181 | *feather_reader = std::shared_ptr<FeatherReader>(new FeatherReader()); | |
182 | ||
183 | // Open file with given filename as a ReadableFile. | |
184 | ARROW_ASSIGN_OR_RAISE(auto readable_file, io::ReadableFile::Open(filename)); | |
185 | ||
186 | // Open the Feather file for reading with a TableReader. | |
187 | ARROW_ASSIGN_OR_RAISE(auto reader, ipc::feather::Reader::Open(readable_file)); | |
188 | ||
189 | // Set the internal reader_ object. | |
190 | (*feather_reader)->reader_ = reader; | |
191 | ||
192 | // Check the feather file version | |
193 | auto version = reader->version(); | |
194 | if (version == ipc::feather::kFeatherV2Version) { | |
195 | return Status::NotImplemented("Support for Feather V2 has not been implemented."); | |
196 | } else if (version != ipc::feather::kFeatherV1Version) { | |
197 | return Status::Invalid("Unknown Feather format version."); | |
198 | } | |
199 | ||
200 | // read the table metadata from the Feather file | |
201 | (*feather_reader)->num_variables_ = reader->schema()->num_fields(); | |
202 | return Status::OK(); | |
203 | } | |
204 | ||
205 | // Read the table metadata from the Feather file as a mxArray*. | |
206 | mxArray* FeatherReader::ReadMetadata() const { | |
207 | const int32_t num_metadata_fields = 3; | |
208 | const char* fieldnames[] = {"NumRows", "NumVariables", "Description"}; | |
209 | ||
210 | // Create a mxArray struct array containing the table metadata to be passed back to | |
211 | // MATLAB. | |
212 | mxArray* metadata = mxCreateStructMatrix(1, 1, num_metadata_fields, fieldnames); | |
213 | ||
214 | // Returning double values to MATLAB since that is the default type. | |
215 | ||
216 | // Set the number of rows. | |
217 | mxSetField(metadata, 0, "NumRows", | |
218 | mxCreateDoubleScalar(static_cast<double>(num_rows_))); | |
219 | ||
220 | // Set the number of variables. | |
221 | mxSetField(metadata, 0, "NumVariables", | |
222 | mxCreateDoubleScalar(static_cast<double>(num_variables_))); | |
223 | ||
224 | return metadata; | |
225 | } | |
226 | ||
227 | // Read the table variables from the Feather file as a mxArray*. | |
228 | mxArray* FeatherReader::ReadVariables() { | |
229 | const int32_t num_variable_fields = 4; | |
230 | const char* fieldnames[] = {"Name", "Type", "Data", "Valid"}; | |
231 | ||
232 | // Create an mxArray* struct array containing the table variables to be passed back to | |
233 | // MATLAB. | |
234 | mxArray* variables = | |
235 | mxCreateStructMatrix(1, num_variables_, num_variable_fields, fieldnames); | |
236 | ||
237 | std::shared_ptr<arrow::Table> table; | |
238 | auto status = reader_->Read(&table); | |
239 | if (!status.ok()) { | |
240 | mexErrMsgIdAndTxt("MATLAB:arrow:FeatherReader::FailedToReadTable", | |
241 | "Failed to read arrow::Table from Feather file. Reason: %s", | |
242 | status.message().c_str()); | |
243 | } | |
244 | ||
245 | // Set the number of rows | |
246 | num_rows_ = table->num_rows(); | |
247 | ||
248 | if (num_rows_ > internal::MAX_MATLAB_SIZE || | |
249 | num_variables_ > internal::MAX_MATLAB_SIZE) { | |
250 | mexErrMsgIdAndTxt("MATLAB:arrow:SizeTooLarge", | |
251 | "The table size exceeds MATLAB limits: %u x %u", num_rows_, | |
252 | num_variables_); | |
253 | } | |
254 | ||
255 | auto column_names = table->ColumnNames(); | |
256 | ||
257 | for (int64_t i = 0; i < num_variables_; ++i) { | |
258 | auto column = table->column(i); | |
259 | if (column->num_chunks() != 1) { | |
260 | mexErrMsgIdAndTxt("MATLAB:arrow:FeatherReader::ReadVariables", | |
261 | "Chunked columns not yet supported"); | |
262 | } | |
263 | std::shared_ptr<Array> chunk = column->chunk(0); | |
264 | const std::string column_name = column_names[i]; | |
265 | ||
266 | // set the struct fields data | |
267 | mxSetField(variables, i, "Name", internal::ReadVariableName(column_name)); | |
268 | mxSetField(variables, i, "Type", internal::ReadVariableType(chunk)); | |
269 | mxSetField(variables, i, "Data", internal::ReadVariableData(chunk, column_name)); | |
270 | mxSetField(variables, i, "Valid", internal::ReadVariableValidityBitmap(chunk)); | |
271 | } | |
272 | ||
273 | return variables; | |
274 | } | |
275 | ||
276 | } // namespace matlab | |
277 | } // namespace arrow |