1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
// under the License.
18 #include "arrow/json/parser.h"
20 #include <gmock/gmock-matchers.h>
21 #include <gtest/gtest.h>
27 #include "arrow/json/options.h"
28 #include "arrow/json/test_common.h"
29 #include "arrow/status.h"
30 #include "arrow/testing/gtest_util.h"
31 #include "arrow/util/checked_cast.h"
32 #include "arrow/util/string_view.h"
36 using internal::checked_cast
;
40 using util::string_view
;
// Forward declaration. AssertUnconvertedArraysEqual (defined next) calls this
// helper for struct arrays, and this helper's definition calls
// AssertUnconvertedArraysEqual on each child field, so one of the two has to be
// declared before it is defined.
42 void AssertUnconvertedStructArraysEqual(const StructArray
& expected
,
43 const StructArray
& actual
);
// Asserts (via gtest assertions) that `actual` matches `expected`, choosing the
// comparison strategy from actual.type_id().
//
// Strategies visible in this function:
//   * direct comparison with AssertArraysEqual(expected, actual);
//   * DICTIONARY: `expected` must be a STRING array; `actual` is decoded with
//     DecodeStringDictionary and the decoded array is compared to `expected`;
//   * list: `expected` must be a LIST; null counts, validity bitmaps (only when
//     there are nulls) and buffers[1] are compared, then the child values are
//     compared recursively;
//   * struct: `expected` must be a STRUCT; comparison is delegated to
//     AssertUnconvertedStructArraysEqual.
//
// NOTE(review): presumably `actual` is the parser's raw ("unconverted") output
// and `expected` is built by the test; confirm against the callers.
45 void AssertUnconvertedArraysEqual(const Array
& expected
, const Array
& actual
) {
46 switch (actual
.type_id()) {
// Plain comparison: no decoding step is applied to `actual` here.
49 return AssertArraysEqual(expected
, actual
);
// Dictionary-typed `actual` is checked against a plain string `expected`.
50 case Type::DICTIONARY
: {
51 ASSERT_EQ(expected
.type_id(), Type::STRING
);
52 std::shared_ptr
<Array
> actual_decoded
;
53 ASSERT_OK(DecodeStringDictionary(checked_cast
<const DictionaryArray
&>(actual
),
55 return AssertArraysEqual(expected
, *actual_decoded
);
// List comparison: type id, null count, validity bitmap (skipped when there
// are no nulls), then buffers[1] (named "offsets" below), then the child values.
58 ASSERT_EQ(expected
.type_id(), Type::LIST
);
59 ASSERT_EQ(expected
.null_count(), actual
.null_count());
60 if (expected
.null_count() != 0) {
61 AssertBufferEqual(*expected
.null_bitmap(), *actual
.null_bitmap());
63 const auto& expected_offsets
= expected
.data()->buffers
[1];
64 const auto& actual_offsets
= actual
.data()->buffers
[1];
65 AssertBufferEqual(*expected_offsets
, *actual_offsets
);
66 auto expected_values
= checked_cast
<const ListArray
&>(expected
).values();
67 auto actual_values
= checked_cast
<const ListArray
&>(actual
).values();
68 return AssertUnconvertedArraysEqual(*expected_values
, *actual_values
);
// Struct comparison: both sides are downcast and compared field by field.
71 ASSERT_EQ(expected
.type_id(), Type::STRUCT
);
72 return AssertUnconvertedStructArraysEqual(
73 checked_cast
<const StructArray
&>(expected
),
74 checked_cast
<const StructArray
&>(actual
));
// Asserts that two struct arrays agree field by field: the number of fields
// must match, and for every index i the field names must be equal and the
// child arrays must pass AssertUnconvertedArraysEqual.
// Fields are matched by position, so field order matters.
80 void AssertUnconvertedStructArraysEqual(const StructArray
& expected
,
81 const StructArray
& actual
) {
82 ASSERT_EQ(expected
.num_fields(), actual
.num_fields());
83 for (int i
= 0; i
< expected
.num_fields(); ++i
) {
84 auto expected_name
= expected
.type()->field(i
)->name();
85 auto actual_name
= actual
.type()->field(i
)->name();
86 ASSERT_EQ(expected_name
, actual_name
);
87 AssertUnconvertedArraysEqual(*expected
.field(i
), *actual
.field(i
));
// Parses `src_str` with `options` using ParseFromString and checks the parsed
// columns against expectations.
//
//   options      - parse options forwarded to ParseFromString.
//   src_str      - JSON text to parse.
//   fields       - expected columns; each field's name selects the parsed
//                  column and its type is used to build the expected array.
//   columns_json - JSON text for each expected column, indexed in parallel
//                  with `fields` (columns_json[i] is read for every i below
//                  fields.size()).
//
// Each expected column is built with ArrayFromJSON and compared to the parsed
// column with AssertUnconvertedArraysEqual.
91 void AssertParseColumns(ParseOptions options
, string_view src_str
,
92 const std::vector
<std::shared_ptr
<Field
>>& fields
,
93 const std::vector
<std::string
>& columns_json
) {
94 std::shared_ptr
<Array
> parsed
;
95 ASSERT_OK(ParseFromString(options
, src_str
, &parsed
));
// static_pointer_cast does no runtime type check; this relies on the parse
// result being a StructArray.
96 auto struct_array
= std::static_pointer_cast
<StructArray
>(parsed
);
// Columns are looked up by name, so only the columns listed in `fields` are
// checked.
97 for (size_t i
= 0; i
< fields
.size(); ++i
) {
98 auto column_expected
= ArrayFromJSON(fields
[i
]->type(), columns_json
[i
]);
99 auto column
= struct_array
->GetFieldByName(fields
[i
]->name());
100 AssertUnconvertedArraysEqual(*column_expected
, *column
);
104 // TODO(bkietz) parameterize (at least some of) these tests over UnexpectedFieldBehavior
// Parses scalars_only_src() with an explicit schema (hello: float64,
// world: boolean, yo: utf8) and UnexpectedFieldBehavior::Ignore.
// The expected "hello" column is given as utf8 strings ("3.5", "3.25", ...)
// even though the schema says float64.
// NOTE(review): presumably numbers stay in string form until a later
// conversion step; confirm.
106 TEST(BlockParserWithSchema
, Basics
) {
107 auto options
= ParseOptions::Defaults();
108 options
.explicit_schema
=
109 schema({field("hello", float64()), field("world", boolean()), field("yo", utf8())});
110 options
.unexpected_field_behavior
= UnexpectedFieldBehavior::Ignore
;
112 options
, scalars_only_src(),
113 {field("hello", utf8()), field("world", boolean()), field("yo", utf8())},
114 {"[\"3.5\", \"3.25\", \"3.125\", \"0.0\"]", "[false, null, null, true]",
115 "[\"thing\", null, \"\xe5\xbf\x8d\", null]"});
// Uses an explicit schema (hello: float64, world: boolean, yo: utf8) with
// UnexpectedFieldBehavior::Ignore. The expected field list names the same
// three columns, with "hello" typed as utf8.
// NOTE(review): going by the test name, the input is presumably empty; the
// source text and expected column contents are not visible here, so confirm.
118 TEST(BlockParserWithSchema
, Empty
) {
119 auto options
= ParseOptions::Defaults();
120 options
.explicit_schema
=
121 schema({field("hello", float64()), field("world", boolean()), field("yo", utf8())});
122 options
.unexpected_field_behavior
= UnexpectedFieldBehavior::Ignore
;
125 {field("hello", utf8()), field("world", boolean()), field("yo", utf8())},
// Parses scalars_only_src() with an explicit schema containing only "hello"
// and "yo", and UnexpectedFieldBehavior::Ignore. Only those two columns are
// checked; AssertParseColumns looks columns up by name, so any other parsed
// column is not examined by this test.
129 TEST(BlockParserWithSchema
, SkipFieldsOutsideSchema
) {
130 auto options
= ParseOptions::Defaults();
131 options
.explicit_schema
= schema({field("hello", float64()), field("yo", utf8())});
132 options
.unexpected_field_behavior
= UnexpectedFieldBehavior::Ignore
;
133 AssertParseColumns(options
, scalars_only_src(),
134 {field("hello", utf8()), field("yo", utf8())},
135 {"[\"3.5\", \"3.25\", \"3.125\", \"0.0\"]",
136 "[\"thing\", null, \"\xe5\xbf\x8d\", null]"});
// Value-parameterized fixture: each test runs with one UnexpectedFieldBehavior
// value, available through GetParam().
//
// Options(explicit_schema) starts from ParseOptions::Defaults(), moves the
// given schema into explicit_schema, and sets unexpected_field_behavior to the
// current test parameter.
139 class BlockParserTypeError
: public ::testing::TestWithParam
<UnexpectedFieldBehavior
> {
141 ParseOptions
Options(std::shared_ptr
<Schema
> explicit_schema
) {
142 auto options
= ParseOptions::Defaults();
143 options
.explicit_schema
= std::move(explicit_schema
);
144 options
.unexpected_field_behavior
= GetParam();
// With schema {a: int32}, the input has a number for "a" in the first row and
// a boolean in the second. Parsing must raise Invalid. The expected message
// text names Column(/a) and row 1, i.e. the second row counting from zero.
149 TEST_P(BlockParserTypeError
, FailOnInconvertible
) {
150 auto options
= Options(schema({field("a", int32())}));
151 std::shared_ptr
<Array
> parsed
;
152 Status error
= ParseFromString(options
, "{\"a\":0}\n{\"a\":true}", &parsed
);
153 ASSERT_RAISES(Invalid
, error
);
157 "JSON parse error: Column(/a) changed from number to boolean in row 1"));
// Same kind-mismatch check for a nested column: schema
// {a: list<struct<b: int32>>}, where "b" is a number in the first row and a
// boolean in the second. Parsing must raise Invalid. The expected message text
// gives the nested path as Column(/a/[]/b).
160 TEST_P(BlockParserTypeError
, FailOnNestedInconvertible
) {
161 auto options
= Options(schema({field("a", list(struct_({field("b", int32())})))}));
162 std::shared_ptr
<Array
> parsed
;
164 ParseFromString(options
, "{\"a\":[{\"b\":0}]}\n{\"a\":[{\"b\":true}]}", &parsed
);
165 ASSERT_RAISES(Invalid
, error
);
169 "JSON parse error: Column(/a/[]/b) changed from number to boolean in row 1"));
// A single JSON object that repeats key "a" must be rejected with Invalid when
// parsed with schema {a: int32}. The message is matched by prefix (StartsWith)
// and reports the only row as row 0.
172 TEST_P(BlockParserTypeError
, FailOnDuplicateKeys
) {
173 std::shared_ptr
<Array
> parsed
;
174 Status error
= ParseFromString(Options(schema({field("a", int32())})),
175 "{\"a\":0, \"a\":1}\n", &parsed
);
176 ASSERT_RAISES(Invalid
, error
);
179 testing::StartsWith("JSON parse error: Column(/a) was specified twice in row 0"));
// Duplicate-key rejection without an explicit schema: the same repeated-"a"
// object must raise Invalid with the same message prefix.
// NOTE(review): this test passes ParseOptions::Defaults() directly instead of
// calling Options(...), so the fixture parameter is not used by the code
// visible here; confirm whether that is intended.
182 TEST_P(BlockParserTypeError
, FailOnDuplicateKeysNoSchema
) {
183 std::shared_ptr
<Array
> parsed
;
185 ParseFromString(ParseOptions::Defaults(), "{\"a\":0, \"a\":1}\n", &parsed
);
187 ASSERT_RAISES(Invalid
, error
);
190 testing::StartsWith("JSON parse error: Column(/a) was specified twice in row 0"));
// Instantiates the BlockParserTypeError tests once for each of the three
// UnexpectedFieldBehavior values listed: Ignore, Error and InferType.
193 INSTANTIATE_TEST_SUITE_P(BlockParserTypeError
, BlockParserTypeError
,
194 ::testing::Values(UnexpectedFieldBehavior::Ignore
,
195 UnexpectedFieldBehavior::Error
,
196 UnexpectedFieldBehavior::InferType
));
// Parses nested_src() with an explicit schema (yo: utf8, arr: list<int32>,
// nuf: struct<ps: int32>) and UnexpectedFieldBehavior::Ignore.
// The expected columns keep the list/struct nesting but give the numeric
// leaves as utf8 strings ("1", "78", ...).
// NOTE(review): presumably leaf numbers are left unconverted at this stage;
// confirm.
198 TEST(BlockParserWithSchema
, Nested
) {
199 auto options
= ParseOptions::Defaults();
200 options
.explicit_schema
= schema({field("yo", utf8()), field("arr", list(int32())),
201 field("nuf", struct_({field("ps", int32())}))});
202 options
.unexpected_field_behavior
= UnexpectedFieldBehavior::Ignore
;
203 AssertParseColumns(options
, nested_src(),
204 {field("yo", utf8()), field("arr", list(utf8())),
205 field("nuf", struct_({field("ps", utf8())}))},
206 {"[\"thing\", null, \"\xe5\xbf\x8d\", null]",
207 R
"([["1", "2", "3"], ["2"], [], null])",
208 R
"([{"ps
":null}, {}, {"ps
":"78"}, {"ps
":"90"}])"});
// The input object is cut off right after the key "b" (no value, no closing
// brace). With schema {a: int32} and UnexpectedFieldBehavior::Ignore, parsing
// must raise Invalid.
211 TEST(BlockParserWithSchema
, FailOnIncompleteJson
) {
212 auto options
= ParseOptions::Defaults();
213 options
.explicit_schema
= schema({field("a", int32())});
214 options
.unexpected_field_behavior
= UnexpectedFieldBehavior::Ignore
;
215 std::shared_ptr
<Array
> parsed
;
216 ASSERT_RAISES(Invalid
, ParseFromString(options
, "{\"a\":0, \"b\"", &parsed
));
// Parses scalars_only_src() with UnexpectedFieldBehavior::InferType; no
// explicit schema is assigned in this test. The expected columns are
// hello (utf8 strings of the numbers), world (boolean) and yo (utf8).
219 TEST(BlockParser
, Basics
) {
220 auto options
= ParseOptions::Defaults();
221 options
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
223 options
, scalars_only_src(),
224 {field("hello", utf8()), field("world", boolean()), field("yo", utf8())},
225 {"[\"3.5\", \"3.25\", \"3.125\", \"0.0\"]", "[false, null, null, true]",
226 "[\"thing\", null, \"\xe5\xbf\x8d\", null]"});
// Parses nested_src() with UnexpectedFieldBehavior::InferType; no explicit
// schema is assigned in this test. The expected columns are yo (utf8),
// arr (list<utf8>) and nuf (struct<ps: utf8>).
229 TEST(BlockParser
, Nested
) {
230 auto options
= ParseOptions::Defaults();
231 options
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
232 AssertParseColumns(options
, nested_src(),
233 {field("yo", utf8()), field("arr", list(utf8())),
234 field("nuf", struct_({field("ps", utf8())}))},
235 {"[\"thing\", null, \"\xe5\xbf\x8d\", null]",
236 R
"([["1", "2", "3"], ["2"], [], null])",
237 R
"([{"ps
":null}, {}, {"ps
":"78"}, {"ps
":"90"}])"});
// Uses UnexpectedFieldBehavior::InferType. Every expected column is built on
// the null type: a plain null column, two list<null> columns (one expecting
// only empty lists, one expecting an empty list followed by a list holding a
// single null), and a struct whose only field is null-typed. Each expected
// column has two rows.
240 TEST(BlockParser
, Null
) {
241 auto options
= ParseOptions::Defaults();
242 options
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
245 {field("plain", null()), field("list1", list(null())), field("list2", list(null())),
246 field("struct", struct_({field("plain", null())}))},
247 {"[null, null]", "[[], []]", "[[], [null]]",
248 R
"([{"plain
": null}, {"plain
": null}])"});
// Uses UnexpectedFieldBehavior::InferType with a JSON source written inline
// as a raw string literal: two objects, each with a list "a" and a nested
// object "b" holding a boolean "c" and a date-like string "d".
// The expected fields are a (list<utf8>) and b (struct<c: boolean, d: utf8>),
// so "d" is expected as a utf8 string in this comparison.
251 TEST(BlockParser
, AdHoc
) {
252 auto options
= ParseOptions::Defaults();
253 options
.unexpected_field_behavior
= UnexpectedFieldBehavior::InferType
;
255 options
, R
"({"a
": [1], "b
": {"c
": true, "d
": "1991-02-03"}}
256 {"a
": [], "b
": {"c
": false, "d
": "2019-04-01"}}
258 {field("a", list(utf8())),
259 field("b", struct_({field("c", boolean()), field("d", utf8())}))},
261 R
"([{"c
":true, "d
": "1991-02-03"}, {"c
":false, "d
":"2019-04-01"}])"});