]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/matlab/src/feather_reader.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / matlab / src / feather_reader.cc
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <algorithm>
19#include <cmath>
20
21#include "feather_reader.h"
22
23#include <arrow/array/array_base.h>
24#include <arrow/array/builder_base.h>
25#include <arrow/array/builder_primitive.h>
26#include <arrow/io/file.h>
27#include <arrow/ipc/feather.h>
28#include <arrow/result.h>
29#include <arrow/status.h>
30#include <arrow/table.h>
31#include <arrow/type.h>
32#include <arrow/type_traits.h>
33#include <arrow/util/bitmap_visit.h>
34#include <mex.h>
35
36#include "matlab_traits.h"
37#include "util/handle_status.h"
38#include "util/unicode_conversion.h"
39
40namespace arrow {
41namespace matlab {
42namespace internal {
43
44// Read the name of variable i from the Feather file as a mxArray*.
45mxArray* ReadVariableName(const std::string& column_name) {
46 return matlab::util::ConvertUTF8StringToUTF16CharMatrix(column_name);
47}
48
49template <typename ArrowDataType>
50mxArray* ReadNumericVariableData(const std::shared_ptr<Array>& column) {
51 using MatlabType = typename MatlabTraits<ArrowDataType>::MatlabType;
52 using ArrowArrayType = typename TypeTraits<ArrowDataType>::ArrayType;
53
54 const mxClassID matlab_class_id = MatlabTraits<ArrowDataType>::matlab_class_id;
55 // Allocate a numeric mxArray* with the correct mxClassID based on the type of the
56 // arrow::Array.
57 mxArray* variable_data =
58 mxCreateNumericMatrix(column->length(), 1, matlab_class_id, mxREAL);
59
60 auto arrow_numeric_array =
61 std::static_pointer_cast<ArrowArrayType>(column);
62
63 // Get a raw pointer to the Arrow array data.
64 const MatlabType* source = arrow_numeric_array->raw_values();
65
66 // Get a mutable pointer to the MATLAB array data and std::copy the
67 // Arrow array data into it.
68 MatlabType* destination = MatlabTraits<ArrowDataType>::GetData(variable_data);
69 std::copy(source, source + column->length(), destination);
70
71 return variable_data;
72}
73
74// Read the data of variable i from the Feather file as a mxArray*.
75mxArray* ReadVariableData(const std::shared_ptr<Array>& column,
76 const std::string& column_name) {
77 std::shared_ptr<DataType> type = column->type();
78
79 switch (type->id()) {
80 case Type::FLOAT:
81 return ReadNumericVariableData<FloatType>(column);
82 case Type::DOUBLE:
83 return ReadNumericVariableData<DoubleType>(column);
84 case Type::UINT8:
85 return ReadNumericVariableData<UInt8Type>(column);
86 case Type::UINT16:
87 return ReadNumericVariableData<UInt16Type>(column);
88 case Type::UINT32:
89 return ReadNumericVariableData<UInt32Type>(column);
90 case Type::UINT64:
91 return ReadNumericVariableData<UInt64Type>(column);
92 case Type::INT8:
93 return ReadNumericVariableData<Int8Type>(column);
94 case Type::INT16:
95 return ReadNumericVariableData<Int16Type>(column);
96 case Type::INT32:
97 return ReadNumericVariableData<Int32Type>(column);
98 case Type::INT64:
99 return ReadNumericVariableData<Int64Type>(column);
100 default: {
101 mexErrMsgIdAndTxt("MATLAB:arrow:UnsupportedArrowType",
102 "Unsupported arrow::Type '%s' for variable '%s'",
103 type->name().c_str(), column_name.c_str());
104 break;
105 }
106 }
107
108 return nullptr;
109}
110
111// arrow::Buffers are bit-packed, while mxLogical arrays aren't. This utility
112// uses an Arrow utility to copy each bit of an arrow::Buffer into each byte
113// of an mxLogical array.
114void BitUnpackBuffer(const std::shared_ptr<Buffer>& source, int64_t length,
115 mxLogical* destination) {
116 const uint8_t* source_data = source->data();
117
118 // Call into an Arrow utility to visit each bit in the bitmap.
119 auto visitFcn = [&](mxLogical is_valid) { *destination++ = is_valid; };
120
121 const int64_t start_offset = 0;
122 arrow::internal::VisitBitsUnrolled(source_data, start_offset, length, visitFcn);
123}
124
125// Populates the validity bitmap from an arrow::Array.
126// writes to a zero-initialized destination buffer.
127// Implements a fast path for the fully-valid and fully-invalid cases.
128// Returns true if the destination buffer was successfully populated.
129bool TryBitUnpackFastPath(const std::shared_ptr<Array>& array, mxLogical* destination) {
130 const int64_t null_count = array->null_count();
131 const int64_t length = array->length();
132
133 if (null_count == length) {
134 // The source array is filled with invalid values. Since mxCreateLogicalMatrix
135 // zero-initializes the destination buffer, we can return without changing anything
136 // in the destination buffer.
137 return true;
138 } else if (null_count == 0) {
139 // The source array contains only valid values. Fill the destination buffer
140 // with 'true'.
141 std::fill(destination, destination + length, true);
142 return true;
143 }
144
145 // Return false to indicate that we couldn't fill the entire validity bitmap.
146 return false;
147}
148
149// Read the validity (null) bitmap of variable i from the Feather
150// file as an mxArray*.
151mxArray* ReadVariableValidityBitmap(const std::shared_ptr<Array>& column) {
152 // Allocate an mxLogical array to store the validity (null) bitmap values.
153 // Note: All Arrow arrays can have an associated validity (null) bitmap.
154 // The Apache Arrow specification defines 0 (false) to represent an
155 // invalid (null) array entry and 1 (true) to represent a valid
156 // (non-null) array entry.
157 mxArray* validity_bitmap = mxCreateLogicalMatrix(column->length(), 1);
158 mxLogical* validity_bitmap_unpacked = mxGetLogicals(validity_bitmap);
159
160 if (!TryBitUnpackFastPath(column, validity_bitmap_unpacked)) {
161 // Couldn't fill the full validity bitmap at once. Call an optimized loop-unrolled
162 // implementation instead that goes byte-by-byte and populates the validity bitmap.
163 BitUnpackBuffer(column->null_bitmap(), column->length(), validity_bitmap_unpacked);
164 }
165
166 return validity_bitmap;
167}
168
169// Read the type name of an arrow::Array as an mxChar array.
170mxArray* ReadVariableType(const std::shared_ptr<Array>& column) {
171 return util::ConvertUTF8StringToUTF16CharMatrix(column->type()->name());
172}
173
// Upper bound (2^48) used when checking table dimensions against the maximum
// number of elements a MATLAB array can hold.
static constexpr uint64_t MAX_MATLAB_SIZE = uint64_t{1} << 48;
176
177} // namespace internal
178
179Status FeatherReader::Open(const std::string& filename,
180 std::shared_ptr<FeatherReader>* feather_reader) {
181 *feather_reader = std::shared_ptr<FeatherReader>(new FeatherReader());
182
183 // Open file with given filename as a ReadableFile.
184 ARROW_ASSIGN_OR_RAISE(auto readable_file, io::ReadableFile::Open(filename));
185
186 // Open the Feather file for reading with a TableReader.
187 ARROW_ASSIGN_OR_RAISE(auto reader, ipc::feather::Reader::Open(readable_file));
188
189 // Set the internal reader_ object.
190 (*feather_reader)->reader_ = reader;
191
192 // Check the feather file version
193 auto version = reader->version();
194 if (version == ipc::feather::kFeatherV2Version) {
195 return Status::NotImplemented("Support for Feather V2 has not been implemented.");
196 } else if (version != ipc::feather::kFeatherV1Version) {
197 return Status::Invalid("Unknown Feather format version.");
198 }
199
200 // read the table metadata from the Feather file
201 (*feather_reader)->num_variables_ = reader->schema()->num_fields();
202 return Status::OK();
203}
204
205// Read the table metadata from the Feather file as a mxArray*.
206mxArray* FeatherReader::ReadMetadata() const {
207 const int32_t num_metadata_fields = 3;
208 const char* fieldnames[] = {"NumRows", "NumVariables", "Description"};
209
210 // Create a mxArray struct array containing the table metadata to be passed back to
211 // MATLAB.
212 mxArray* metadata = mxCreateStructMatrix(1, 1, num_metadata_fields, fieldnames);
213
214 // Returning double values to MATLAB since that is the default type.
215
216 // Set the number of rows.
217 mxSetField(metadata, 0, "NumRows",
218 mxCreateDoubleScalar(static_cast<double>(num_rows_)));
219
220 // Set the number of variables.
221 mxSetField(metadata, 0, "NumVariables",
222 mxCreateDoubleScalar(static_cast<double>(num_variables_)));
223
224 return metadata;
225}
226
227// Read the table variables from the Feather file as a mxArray*.
228mxArray* FeatherReader::ReadVariables() {
229 const int32_t num_variable_fields = 4;
230 const char* fieldnames[] = {"Name", "Type", "Data", "Valid"};
231
232 // Create an mxArray* struct array containing the table variables to be passed back to
233 // MATLAB.
234 mxArray* variables =
235 mxCreateStructMatrix(1, num_variables_, num_variable_fields, fieldnames);
236
237 std::shared_ptr<arrow::Table> table;
238 auto status = reader_->Read(&table);
239 if (!status.ok()) {
240 mexErrMsgIdAndTxt("MATLAB:arrow:FeatherReader::FailedToReadTable",
241 "Failed to read arrow::Table from Feather file. Reason: %s",
242 status.message().c_str());
243 }
244
245 // Set the number of rows
246 num_rows_ = table->num_rows();
247
248 if (num_rows_ > internal::MAX_MATLAB_SIZE ||
249 num_variables_ > internal::MAX_MATLAB_SIZE) {
250 mexErrMsgIdAndTxt("MATLAB:arrow:SizeTooLarge",
251 "The table size exceeds MATLAB limits: %u x %u", num_rows_,
252 num_variables_);
253 }
254
255 auto column_names = table->ColumnNames();
256
257 for (int64_t i = 0; i < num_variables_; ++i) {
258 auto column = table->column(i);
259 if (column->num_chunks() != 1) {
260 mexErrMsgIdAndTxt("MATLAB:arrow:FeatherReader::ReadVariables",
261 "Chunked columns not yet supported");
262 }
263 std::shared_ptr<Array> chunk = column->chunk(0);
264 const std::string column_name = column_names[i];
265
266 // set the struct fields data
267 mxSetField(variables, i, "Name", internal::ReadVariableName(column_name));
268 mxSetField(variables, i, "Type", internal::ReadVariableType(chunk));
269 mxSetField(variables, i, "Data", internal::ReadVariableData(chunk, column_name));
270 mxSetField(variables, i, "Valid", internal::ReadVariableValidityBitmap(chunk));
271 }
272
273 return variables;
274}
275
276} // namespace matlab
277} // namespace arrow