1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
22 #include <gtest/gtest.h>
24 #include "arrow/io/interfaces.h"
25 #include "arrow/json/options.h"
26 #include "arrow/json/reader.h"
27 #include "arrow/json/test_common.h"
28 #include "arrow/table.h"
29 #include "arrow/testing/gtest_util.h"
34 using util::string_view
;
36 using internal::checked_cast
;
38 class ReaderTest
: public ::testing::TestWithParam
<bool> {
41 read_options_
.use_threads
= GetParam();
42 ASSERT_OK_AND_ASSIGN(reader_
, TableReader::Make(default_memory_pool(), input_
,
43 read_options_
, parse_options_
));
46 void SetUpReader(util::string_view input
) {
47 ASSERT_OK(MakeStream(input
, &input_
));
51 std::shared_ptr
<ChunkedArray
> ChunkedFromJSON(const std::shared_ptr
<Field
>& field
,
52 const std::vector
<std::string
>& data
) {
53 ArrayVector
chunks(data
.size());
54 for (size_t i
= 0; i
< chunks
.size(); ++i
) {
55 chunks
[i
] = ArrayFromJSON(field
->type(), data
[i
]);
57 return std::make_shared
<ChunkedArray
>(std::move(chunks
));
60 ParseOptions parse_options_
= ParseOptions::Defaults();
61 ReadOptions read_options_
= ReadOptions::Defaults();
62 std::shared_ptr
<io::InputStream
> input_
;
63 std::shared_ptr
<TableReader
> reader_
;
64 std::shared_ptr
<Table
> table_
;
67 INSTANTIATE_TEST_SUITE_P(ReaderTest
, ReaderTest
, ::testing::Values(false, true));
69 TEST_P(ReaderTest
, Empty
) {
70 SetUpReader("{}\n{}\n");
71 ASSERT_OK_AND_ASSIGN(table_
, reader_
->Read());
73 auto expected_table
= Table::Make(schema({}), ArrayVector(), 2);
74 AssertTablesEqual(*expected_table
, *table_
);
77 TEST_P(ReaderTest
, EmptyNoNewlineAtEnd
) {
78 SetUpReader("{}\n{}");
79 ASSERT_OK_AND_ASSIGN(table_
, reader_
->Read());
81 auto expected_table
= Table::Make(schema({}), ArrayVector(), 2);
82 AssertTablesEqual(*expected_table
, *table_
);
85 TEST_P(ReaderTest
, EmptyManyNewlines
) {
86 SetUpReader("{}\n\r\n{}\n\r\n");
87 ASSERT_OK_AND_ASSIGN(table_
, reader_
->Read());
89 auto expected_table
= Table::Make(schema({}), ArrayVector(), 2);
90 AssertTablesEqual(*expected_table
, *table_
);
93 TEST_P(ReaderTest
, Basics
) {
94 parse_options_
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
95 auto src
= scalars_only_src();
97 ASSERT_OK_AND_ASSIGN(table_
, reader_
->Read());
99 auto schema
= ::arrow::schema(
100 {field("hello", float64()), field("world", boolean()), field("yo", utf8())});
102 auto expected_table
= Table::Make(
104 ArrayFromJSON(schema
->field(0)->type(), "[3.5, 3.25, 3.125, 0.0]"),
105 ArrayFromJSON(schema
->field(1)->type(), "[false, null, null, true]"),
106 ArrayFromJSON(schema
->field(2)->type(),
107 "[\"thing\", null, \"\xe5\xbf\x8d\", null]"),
109 AssertTablesEqual(*expected_table
, *table_
);
112 TEST_P(ReaderTest
, Nested
) {
113 parse_options_
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
114 auto src
= nested_src();
116 ASSERT_OK_AND_ASSIGN(table_
, reader_
->Read());
118 auto schema
= ::arrow::schema({field("hello", float64()), field("world", boolean()),
119 field("yo", utf8()), field("arr", list(int64())),
120 field("nuf", struct_({field("ps", int64())}))});
122 auto a0
= ArrayFromJSON(schema
->field(0)->type(), "[3.5, 3.25, 3.125, 0.0]");
123 auto a1
= ArrayFromJSON(schema
->field(1)->type(), "[false, null, null, true]");
124 auto a2
= ArrayFromJSON(schema
->field(2)->type(),
125 "[\"thing\", null, \"\xe5\xbf\x8d\", null]");
126 auto a3
= ArrayFromJSON(schema
->field(3)->type(), "[[1, 2, 3], [2], [], null]");
127 auto a4
= ArrayFromJSON(schema
->field(4)->type(),
128 R
"([{"ps
":null}, null, {"ps
":78}, {"ps
":90}])");
129 auto expected_table
= Table::Make(schema
, {a0
, a1
, a2
, a3
, a4
});
130 AssertTablesEqual(*expected_table
, *table_
);
133 TEST_P(ReaderTest
, PartialSchema
) {
134 parse_options_
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
135 parse_options_
.explicit_schema
=
136 schema({field("nuf", struct_({field("absent", date32())})),
137 field("arr", list(float32()))});
138 auto src
= nested_src();
140 ASSERT_OK_AND_ASSIGN(table_
, reader_
->Read());
142 auto schema
= ::arrow::schema(
143 {field("nuf", struct_({field("absent", date32()), field("ps", int64())})),
144 field("arr", list(float32())), field("hello", float64()),
145 field("world", boolean()), field("yo", utf8())});
147 auto expected_table
= Table::Make(
150 // NB: explicitly declared fields will appear first
152 schema
->field(0)->type(),
153 R
"([{"absent
":null,"ps
":null}, null, {"absent
":null,"ps
":78}, {"absent
":null,"ps
":90}])"),
154 ArrayFromJSON(schema
->field(1)->type(), R
"([[1, 2, 3], [2], [], null])"),
155 // ...followed by undeclared fields
156 ArrayFromJSON(schema
->field(2)->type(), "[3.5, 3.25, 3.125, 0.0]"),
157 ArrayFromJSON(schema
->field(3)->type(), "[false, null, null, true]"),
158 ArrayFromJSON(schema
->field(4)->type(),
159 "[\"thing\", null, \"\xe5\xbf\x8d\", null]"),
161 AssertTablesEqual(*expected_table
, *table_
);
164 TEST_P(ReaderTest
, TypeInference
) {
165 parse_options_
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
167 {"ts
":null, "f
": null}
168 {"ts
":"1970-01-01", "f
": 3}
169 {"ts
":"2018-11-13 17:11:10", "f
":3.125}
171 ASSERT_OK_AND_ASSIGN(table_
, reader_
->Read());
174 ::arrow::schema({field("ts", timestamp(TimeUnit::SECOND
)), field("f", float64())});
175 auto expected_table
= Table::Make(
176 schema
, {ArrayFromJSON(schema
->field(0)->type(),
177 R
"([null, "1970-01-01", "2018-11-13 17:11:10"])"),
178 ArrayFromJSON(schema
->field(1)->type(), R
"([null, 3, 3.125])")});
179 AssertTablesEqual(*expected_table
, *table_
);
182 TEST_P(ReaderTest
, MultipleChunks
) {
183 parse_options_
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
185 auto src
= scalars_only_src();
186 read_options_
.block_size
= static_cast<int>(src
.length() / 3);
189 ASSERT_OK_AND_ASSIGN(table_
, reader_
->Read());
191 auto schema
= ::arrow::schema(
192 {field("hello", float64()), field("world", boolean()), field("yo", utf8())});
194 // there is an empty chunk because the last block of the file is " "
195 auto expected_table
= Table::Make(
198 ChunkedFromJSON(schema
->field(0), {"[3.5]", "[3.25]", "[3.125, 0.0]", "[]"}),
199 ChunkedFromJSON(schema
->field(1), {"[false]", "[null]", "[null, true]", "[]"}),
200 ChunkedFromJSON(schema
->field(2),
201 {"[\"thing\"]", "[null]", "[\"\xe5\xbf\x8d\", null]", "[]"}),
203 AssertTablesEqual(*expected_table
, *table_
);
206 TEST(ReaderTest
, MultipleChunksParallel
) {
207 int64_t count
= 1 << 10;
209 ParseOptions parse_options
;
210 parse_options
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
211 ReadOptions read_options
;
212 read_options
.block_size
=
213 static_cast<int>(count
/ 2); // there will be about two dozen blocks
216 for (int i
= 0; i
< count
; ++i
) {
217 json
+= "{\"a\":" + std::to_string(i
) + "}\n";
219 std::shared_ptr
<io::InputStream
> input
;
220 std::shared_ptr
<TableReader
> reader
;
222 read_options
.use_threads
= true;
223 ASSERT_OK(MakeStream(json
, &input
));
224 ASSERT_OK_AND_ASSIGN(reader
, TableReader::Make(default_memory_pool(), input
,
225 read_options
, parse_options
));
226 ASSERT_OK_AND_ASSIGN(auto threaded
, reader
->Read());
228 read_options
.use_threads
= false;
229 ASSERT_OK(MakeStream(json
, &input
));
230 ASSERT_OK_AND_ASSIGN(reader
, TableReader::Make(default_memory_pool(), input
,
231 read_options
, parse_options
));
232 ASSERT_OK_AND_ASSIGN(auto serial
, reader
->Read());
234 ASSERT_EQ(serial
->column(0)->type()->id(), Type::INT64
);
236 for (auto chunk
: serial
->column(0)->chunks()) {
237 for (int64_t i
= 0; i
< chunk
->length(); ++i
) {
238 ASSERT_EQ(checked_cast
<const Int64Array
*>(chunk
.get())->GetView(i
), expected
)
239 << " at index " << i
;
244 AssertTablesEqual(*serial
, *threaded
);
247 TEST(ReaderTest
, ListArrayWithFewValues
) {
249 ParseOptions parse_options
;
250 parse_options
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
251 ReadOptions read_options
;
253 auto expected_batch
= RecordBatchFromJSON(
254 schema({field("a", list(int64())),
255 field("b", struct_({field("c", boolean()),
256 field("d", timestamp(TimeUnit::SECOND
))}))}),
258 {"a
": [1], "b
": {"c
": true, "d
": "1991-02-03"}},
259 {"a
": [], "b
": {"c
": false, "d
": "2019-04-01"}}
261 ASSERT_OK_AND_ASSIGN(auto expected_table
, Table::FromRecordBatches({expected_batch
}));
263 std::string json
= R
"({"a
": [1], "b
": {"c
": true, "d
": "1991-02-03"}}
264 {"a
": [], "b
": {"c
": false, "d
": "2019-04-01"}}
266 std::shared_ptr
<io::InputStream
> input
;
267 ASSERT_OK(MakeStream(json
, &input
));
269 read_options
.use_threads
= false;
270 ASSERT_OK_AND_ASSIGN(auto reader
, TableReader::Make(default_memory_pool(), input
,
271 read_options
, parse_options
));
273 ASSERT_OK_AND_ASSIGN(auto actual_table
, reader
->Read());
274 AssertTablesEqual(*actual_table
, *expected_table
);