]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/json/reader_test.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / json / reader_test.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include <string>
19 #include <utility>
20 #include <vector>
21
22 #include <gtest/gtest.h>
23
24 #include "arrow/io/interfaces.h"
25 #include "arrow/json/options.h"
26 #include "arrow/json/reader.h"
27 #include "arrow/json/test_common.h"
28 #include "arrow/table.h"
29 #include "arrow/testing/gtest_util.h"
30
31 namespace arrow {
32 namespace json {
33
34 using util::string_view;
35
36 using internal::checked_cast;
37
38 class ReaderTest : public ::testing::TestWithParam<bool> {
39 public:
40 void SetUpReader() {
41 read_options_.use_threads = GetParam();
42 ASSERT_OK_AND_ASSIGN(reader_, TableReader::Make(default_memory_pool(), input_,
43 read_options_, parse_options_));
44 }
45
46 void SetUpReader(util::string_view input) {
47 ASSERT_OK(MakeStream(input, &input_));
48 SetUpReader();
49 }
50
51 std::shared_ptr<ChunkedArray> ChunkedFromJSON(const std::shared_ptr<Field>& field,
52 const std::vector<std::string>& data) {
53 ArrayVector chunks(data.size());
54 for (size_t i = 0; i < chunks.size(); ++i) {
55 chunks[i] = ArrayFromJSON(field->type(), data[i]);
56 }
57 return std::make_shared<ChunkedArray>(std::move(chunks));
58 }
59
60 ParseOptions parse_options_ = ParseOptions::Defaults();
61 ReadOptions read_options_ = ReadOptions::Defaults();
62 std::shared_ptr<io::InputStream> input_;
63 std::shared_ptr<TableReader> reader_;
64 std::shared_ptr<Table> table_;
65 };
66
67 INSTANTIATE_TEST_SUITE_P(ReaderTest, ReaderTest, ::testing::Values(false, true));
68
69 TEST_P(ReaderTest, Empty) {
70 SetUpReader("{}\n{}\n");
71 ASSERT_OK_AND_ASSIGN(table_, reader_->Read());
72
73 auto expected_table = Table::Make(schema({}), ArrayVector(), 2);
74 AssertTablesEqual(*expected_table, *table_);
75 }
76
77 TEST_P(ReaderTest, EmptyNoNewlineAtEnd) {
78 SetUpReader("{}\n{}");
79 ASSERT_OK_AND_ASSIGN(table_, reader_->Read());
80
81 auto expected_table = Table::Make(schema({}), ArrayVector(), 2);
82 AssertTablesEqual(*expected_table, *table_);
83 }
84
85 TEST_P(ReaderTest, EmptyManyNewlines) {
86 SetUpReader("{}\n\r\n{}\n\r\n");
87 ASSERT_OK_AND_ASSIGN(table_, reader_->Read());
88
89 auto expected_table = Table::Make(schema({}), ArrayVector(), 2);
90 AssertTablesEqual(*expected_table, *table_);
91 }
92
93 TEST_P(ReaderTest, Basics) {
94 parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
95 auto src = scalars_only_src();
96 SetUpReader(src);
97 ASSERT_OK_AND_ASSIGN(table_, reader_->Read());
98
99 auto schema = ::arrow::schema(
100 {field("hello", float64()), field("world", boolean()), field("yo", utf8())});
101
102 auto expected_table = Table::Make(
103 schema, {
104 ArrayFromJSON(schema->field(0)->type(), "[3.5, 3.25, 3.125, 0.0]"),
105 ArrayFromJSON(schema->field(1)->type(), "[false, null, null, true]"),
106 ArrayFromJSON(schema->field(2)->type(),
107 "[\"thing\", null, \"\xe5\xbf\x8d\", null]"),
108 });
109 AssertTablesEqual(*expected_table, *table_);
110 }
111
112 TEST_P(ReaderTest, Nested) {
113 parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
114 auto src = nested_src();
115 SetUpReader(src);
116 ASSERT_OK_AND_ASSIGN(table_, reader_->Read());
117
118 auto schema = ::arrow::schema({field("hello", float64()), field("world", boolean()),
119 field("yo", utf8()), field("arr", list(int64())),
120 field("nuf", struct_({field("ps", int64())}))});
121
122 auto a0 = ArrayFromJSON(schema->field(0)->type(), "[3.5, 3.25, 3.125, 0.0]");
123 auto a1 = ArrayFromJSON(schema->field(1)->type(), "[false, null, null, true]");
124 auto a2 = ArrayFromJSON(schema->field(2)->type(),
125 "[\"thing\", null, \"\xe5\xbf\x8d\", null]");
126 auto a3 = ArrayFromJSON(schema->field(3)->type(), "[[1, 2, 3], [2], [], null]");
127 auto a4 = ArrayFromJSON(schema->field(4)->type(),
128 R"([{"ps":null}, null, {"ps":78}, {"ps":90}])");
129 auto expected_table = Table::Make(schema, {a0, a1, a2, a3, a4});
130 AssertTablesEqual(*expected_table, *table_);
131 }
132
133 TEST_P(ReaderTest, PartialSchema) {
134 parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
135 parse_options_.explicit_schema =
136 schema({field("nuf", struct_({field("absent", date32())})),
137 field("arr", list(float32()))});
138 auto src = nested_src();
139 SetUpReader(src);
140 ASSERT_OK_AND_ASSIGN(table_, reader_->Read());
141
142 auto schema = ::arrow::schema(
143 {field("nuf", struct_({field("absent", date32()), field("ps", int64())})),
144 field("arr", list(float32())), field("hello", float64()),
145 field("world", boolean()), field("yo", utf8())});
146
147 auto expected_table = Table::Make(
148 schema,
149 {
150 // NB: explicitly declared fields will appear first
151 ArrayFromJSON(
152 schema->field(0)->type(),
153 R"([{"absent":null,"ps":null}, null, {"absent":null,"ps":78}, {"absent":null,"ps":90}])"),
154 ArrayFromJSON(schema->field(1)->type(), R"([[1, 2, 3], [2], [], null])"),
155 // ...followed by undeclared fields
156 ArrayFromJSON(schema->field(2)->type(), "[3.5, 3.25, 3.125, 0.0]"),
157 ArrayFromJSON(schema->field(3)->type(), "[false, null, null, true]"),
158 ArrayFromJSON(schema->field(4)->type(),
159 "[\"thing\", null, \"\xe5\xbf\x8d\", null]"),
160 });
161 AssertTablesEqual(*expected_table, *table_);
162 }
163
164 TEST_P(ReaderTest, TypeInference) {
165 parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
166 SetUpReader(R"(
167 {"ts":null, "f": null}
168 {"ts":"1970-01-01", "f": 3}
169 {"ts":"2018-11-13 17:11:10", "f":3.125}
170 )");
171 ASSERT_OK_AND_ASSIGN(table_, reader_->Read());
172
173 auto schema =
174 ::arrow::schema({field("ts", timestamp(TimeUnit::SECOND)), field("f", float64())});
175 auto expected_table = Table::Make(
176 schema, {ArrayFromJSON(schema->field(0)->type(),
177 R"([null, "1970-01-01", "2018-11-13 17:11:10"])"),
178 ArrayFromJSON(schema->field(1)->type(), R"([null, 3, 3.125])")});
179 AssertTablesEqual(*expected_table, *table_);
180 }
181
182 TEST_P(ReaderTest, MultipleChunks) {
183 parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
184
185 auto src = scalars_only_src();
186 read_options_.block_size = static_cast<int>(src.length() / 3);
187
188 SetUpReader(src);
189 ASSERT_OK_AND_ASSIGN(table_, reader_->Read());
190
191 auto schema = ::arrow::schema(
192 {field("hello", float64()), field("world", boolean()), field("yo", utf8())});
193
194 // there is an empty chunk because the last block of the file is " "
195 auto expected_table = Table::Make(
196 schema,
197 {
198 ChunkedFromJSON(schema->field(0), {"[3.5]", "[3.25]", "[3.125, 0.0]", "[]"}),
199 ChunkedFromJSON(schema->field(1), {"[false]", "[null]", "[null, true]", "[]"}),
200 ChunkedFromJSON(schema->field(2),
201 {"[\"thing\"]", "[null]", "[\"\xe5\xbf\x8d\", null]", "[]"}),
202 });
203 AssertTablesEqual(*expected_table, *table_);
204 }
205
206 TEST(ReaderTest, MultipleChunksParallel) {
207 int64_t count = 1 << 10;
208
209 ParseOptions parse_options;
210 parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
211 ReadOptions read_options;
212 read_options.block_size =
213 static_cast<int>(count / 2); // there will be about two dozen blocks
214
215 std::string json;
216 for (int i = 0; i < count; ++i) {
217 json += "{\"a\":" + std::to_string(i) + "}\n";
218 }
219 std::shared_ptr<io::InputStream> input;
220 std::shared_ptr<TableReader> reader;
221
222 read_options.use_threads = true;
223 ASSERT_OK(MakeStream(json, &input));
224 ASSERT_OK_AND_ASSIGN(reader, TableReader::Make(default_memory_pool(), input,
225 read_options, parse_options));
226 ASSERT_OK_AND_ASSIGN(auto threaded, reader->Read());
227
228 read_options.use_threads = false;
229 ASSERT_OK(MakeStream(json, &input));
230 ASSERT_OK_AND_ASSIGN(reader, TableReader::Make(default_memory_pool(), input,
231 read_options, parse_options));
232 ASSERT_OK_AND_ASSIGN(auto serial, reader->Read());
233
234 ASSERT_EQ(serial->column(0)->type()->id(), Type::INT64);
235 int expected = 0;
236 for (auto chunk : serial->column(0)->chunks()) {
237 for (int64_t i = 0; i < chunk->length(); ++i) {
238 ASSERT_EQ(checked_cast<const Int64Array*>(chunk.get())->GetView(i), expected)
239 << " at index " << i;
240 ++expected;
241 }
242 }
243
244 AssertTablesEqual(*serial, *threaded);
245 }
246
247 TEST(ReaderTest, ListArrayWithFewValues) {
248 // ARROW-7647
249 ParseOptions parse_options;
250 parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
251 ReadOptions read_options;
252
253 auto expected_batch = RecordBatchFromJSON(
254 schema({field("a", list(int64())),
255 field("b", struct_({field("c", boolean()),
256 field("d", timestamp(TimeUnit::SECOND))}))}),
257 R"([
258 {"a": [1], "b": {"c": true, "d": "1991-02-03"}},
259 {"a": [], "b": {"c": false, "d": "2019-04-01"}}
260 ])");
261 ASSERT_OK_AND_ASSIGN(auto expected_table, Table::FromRecordBatches({expected_batch}));
262
263 std::string json = R"({"a": [1], "b": {"c": true, "d": "1991-02-03"}}
264 {"a": [], "b": {"c": false, "d": "2019-04-01"}}
265 )";
266 std::shared_ptr<io::InputStream> input;
267 ASSERT_OK(MakeStream(json, &input));
268
269 read_options.use_threads = false;
270 ASSERT_OK_AND_ASSIGN(auto reader, TableReader::Make(default_memory_pool(), input,
271 read_options, parse_options));
272
273 ASSERT_OK_AND_ASSIGN(auto actual_table, reader->Read());
274 AssertTablesEqual(*actual_table, *expected_table);
275 }
276
277 } // namespace json
278 } // namespace arrow