]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include "arrow/json/parser.h" | |
19 | ||
20 | #include <gmock/gmock-matchers.h> | |
21 | #include <gtest/gtest.h> | |
22 | ||
23 | #include <string> | |
24 | #include <utility> | |
25 | #include <vector> | |
26 | ||
27 | #include "arrow/json/options.h" | |
28 | #include "arrow/json/test_common.h" | |
29 | #include "arrow/status.h" | |
30 | #include "arrow/testing/gtest_util.h" | |
31 | #include "arrow/util/checked_cast.h" | |
32 | #include "arrow/util/string_view.h" | |
33 | ||
34 | namespace arrow { | |
35 | ||
36 | using internal::checked_cast; | |
37 | ||
38 | namespace json { | |
39 | ||
40 | using util::string_view; | |
41 | ||
// Forward declaration: AssertUnconvertedArraysEqual (defined next) hands STRUCT arrays
// to this function before its definition appears, and this function in turn calls
// AssertUnconvertedArraysEqual on every child field.
void AssertUnconvertedStructArraysEqual(const StructArray& expected,
                                        const StructArray& actual);
44 | ||
45 | void AssertUnconvertedArraysEqual(const Array& expected, const Array& actual) { | |
46 | switch (actual.type_id()) { | |
47 | case Type::BOOL: | |
48 | case Type::NA: | |
49 | return AssertArraysEqual(expected, actual); | |
50 | case Type::DICTIONARY: { | |
51 | ASSERT_EQ(expected.type_id(), Type::STRING); | |
52 | std::shared_ptr<Array> actual_decoded; | |
53 | ASSERT_OK(DecodeStringDictionary(checked_cast<const DictionaryArray&>(actual), | |
54 | &actual_decoded)); | |
55 | return AssertArraysEqual(expected, *actual_decoded); | |
56 | } | |
57 | case Type::LIST: { | |
58 | ASSERT_EQ(expected.type_id(), Type::LIST); | |
59 | ASSERT_EQ(expected.null_count(), actual.null_count()); | |
60 | if (expected.null_count() != 0) { | |
61 | AssertBufferEqual(*expected.null_bitmap(), *actual.null_bitmap()); | |
62 | } | |
63 | const auto& expected_offsets = expected.data()->buffers[1]; | |
64 | const auto& actual_offsets = actual.data()->buffers[1]; | |
65 | AssertBufferEqual(*expected_offsets, *actual_offsets); | |
66 | auto expected_values = checked_cast<const ListArray&>(expected).values(); | |
67 | auto actual_values = checked_cast<const ListArray&>(actual).values(); | |
68 | return AssertUnconvertedArraysEqual(*expected_values, *actual_values); | |
69 | } | |
70 | case Type::STRUCT: | |
71 | ASSERT_EQ(expected.type_id(), Type::STRUCT); | |
72 | return AssertUnconvertedStructArraysEqual( | |
73 | checked_cast<const StructArray&>(expected), | |
74 | checked_cast<const StructArray&>(actual)); | |
75 | default: | |
76 | FAIL(); | |
77 | } | |
78 | } | |
79 | ||
80 | void AssertUnconvertedStructArraysEqual(const StructArray& expected, | |
81 | const StructArray& actual) { | |
82 | ASSERT_EQ(expected.num_fields(), actual.num_fields()); | |
83 | for (int i = 0; i < expected.num_fields(); ++i) { | |
84 | auto expected_name = expected.type()->field(i)->name(); | |
85 | auto actual_name = actual.type()->field(i)->name(); | |
86 | ASSERT_EQ(expected_name, actual_name); | |
87 | AssertUnconvertedArraysEqual(*expected.field(i), *actual.field(i)); | |
88 | } | |
89 | } | |
90 | ||
91 | void AssertParseColumns(ParseOptions options, string_view src_str, | |
92 | const std::vector<std::shared_ptr<Field>>& fields, | |
93 | const std::vector<std::string>& columns_json) { | |
94 | std::shared_ptr<Array> parsed; | |
95 | ASSERT_OK(ParseFromString(options, src_str, &parsed)); | |
96 | auto struct_array = std::static_pointer_cast<StructArray>(parsed); | |
97 | for (size_t i = 0; i < fields.size(); ++i) { | |
98 | auto column_expected = ArrayFromJSON(fields[i]->type(), columns_json[i]); | |
99 | auto column = struct_array->GetFieldByName(fields[i]->name()); | |
100 | AssertUnconvertedArraysEqual(*column_expected, *column); | |
101 | } | |
102 | } | |
103 | ||
104 | // TODO(bkietz) parameterize (at least some of) these tests over UnexpectedFieldBehavior | |
105 | ||
106 | TEST(BlockParserWithSchema, Basics) { | |
107 | auto options = ParseOptions::Defaults(); | |
108 | options.explicit_schema = | |
109 | schema({field("hello", float64()), field("world", boolean()), field("yo", utf8())}); | |
110 | options.unexpected_field_behavior = UnexpectedFieldBehavior::Ignore; | |
111 | AssertParseColumns( | |
112 | options, scalars_only_src(), | |
113 | {field("hello", utf8()), field("world", boolean()), field("yo", utf8())}, | |
114 | {"[\"3.5\", \"3.25\", \"3.125\", \"0.0\"]", "[false, null, null, true]", | |
115 | "[\"thing\", null, \"\xe5\xbf\x8d\", null]"}); | |
116 | } | |
117 | ||
118 | TEST(BlockParserWithSchema, Empty) { | |
119 | auto options = ParseOptions::Defaults(); | |
120 | options.explicit_schema = | |
121 | schema({field("hello", float64()), field("world", boolean()), field("yo", utf8())}); | |
122 | options.unexpected_field_behavior = UnexpectedFieldBehavior::Ignore; | |
123 | AssertParseColumns( | |
124 | options, "", | |
125 | {field("hello", utf8()), field("world", boolean()), field("yo", utf8())}, | |
126 | {"[]", "[]", "[]"}); | |
127 | } | |
128 | ||
129 | TEST(BlockParserWithSchema, SkipFieldsOutsideSchema) { | |
130 | auto options = ParseOptions::Defaults(); | |
131 | options.explicit_schema = schema({field("hello", float64()), field("yo", utf8())}); | |
132 | options.unexpected_field_behavior = UnexpectedFieldBehavior::Ignore; | |
133 | AssertParseColumns(options, scalars_only_src(), | |
134 | {field("hello", utf8()), field("yo", utf8())}, | |
135 | {"[\"3.5\", \"3.25\", \"3.125\", \"0.0\"]", | |
136 | "[\"thing\", null, \"\xe5\xbf\x8d\", null]"}); | |
137 | } | |
138 | ||
139 | class BlockParserTypeError : public ::testing::TestWithParam<UnexpectedFieldBehavior> { | |
140 | public: | |
141 | ParseOptions Options(std::shared_ptr<Schema> explicit_schema) { | |
142 | auto options = ParseOptions::Defaults(); | |
143 | options.explicit_schema = std::move(explicit_schema); | |
144 | options.unexpected_field_behavior = GetParam(); | |
145 | return options; | |
146 | } | |
147 | }; | |
148 | ||
149 | TEST_P(BlockParserTypeError, FailOnInconvertible) { | |
150 | auto options = Options(schema({field("a", int32())})); | |
151 | std::shared_ptr<Array> parsed; | |
152 | Status error = ParseFromString(options, "{\"a\":0}\n{\"a\":true}", &parsed); | |
153 | ASSERT_RAISES(Invalid, error); | |
154 | EXPECT_THAT( | |
155 | error.message(), | |
156 | testing::StartsWith( | |
157 | "JSON parse error: Column(/a) changed from number to boolean in row 1")); | |
158 | } | |
159 | ||
160 | TEST_P(BlockParserTypeError, FailOnNestedInconvertible) { | |
161 | auto options = Options(schema({field("a", list(struct_({field("b", int32())})))})); | |
162 | std::shared_ptr<Array> parsed; | |
163 | Status error = | |
164 | ParseFromString(options, "{\"a\":[{\"b\":0}]}\n{\"a\":[{\"b\":true}]}", &parsed); | |
165 | ASSERT_RAISES(Invalid, error); | |
166 | EXPECT_THAT( | |
167 | error.message(), | |
168 | testing::StartsWith( | |
169 | "JSON parse error: Column(/a/[]/b) changed from number to boolean in row 1")); | |
170 | } | |
171 | ||
172 | TEST_P(BlockParserTypeError, FailOnDuplicateKeys) { | |
173 | std::shared_ptr<Array> parsed; | |
174 | Status error = ParseFromString(Options(schema({field("a", int32())})), | |
175 | "{\"a\":0, \"a\":1}\n", &parsed); | |
176 | ASSERT_RAISES(Invalid, error); | |
177 | EXPECT_THAT( | |
178 | error.message(), | |
179 | testing::StartsWith("JSON parse error: Column(/a) was specified twice in row 0")); | |
180 | } | |
181 | ||
182 | TEST_P(BlockParserTypeError, FailOnDuplicateKeysNoSchema) { | |
183 | std::shared_ptr<Array> parsed; | |
184 | Status error = | |
185 | ParseFromString(ParseOptions::Defaults(), "{\"a\":0, \"a\":1}\n", &parsed); | |
186 | ||
187 | ASSERT_RAISES(Invalid, error); | |
188 | EXPECT_THAT( | |
189 | error.message(), | |
190 | testing::StartsWith("JSON parse error: Column(/a) was specified twice in row 0")); | |
191 | } | |
192 | ||
// Run every BlockParserTypeError test once per UnexpectedFieldBehavior value listed
// here (Ignore, Error and InferType).
INSTANTIATE_TEST_SUITE_P(BlockParserTypeError, BlockParserTypeError,
                         ::testing::Values(UnexpectedFieldBehavior::Ignore,
                                           UnexpectedFieldBehavior::Error,
                                           UnexpectedFieldBehavior::InferType));
197 | ||
198 | TEST(BlockParserWithSchema, Nested) { | |
199 | auto options = ParseOptions::Defaults(); | |
200 | options.explicit_schema = schema({field("yo", utf8()), field("arr", list(int32())), | |
201 | field("nuf", struct_({field("ps", int32())}))}); | |
202 | options.unexpected_field_behavior = UnexpectedFieldBehavior::Ignore; | |
203 | AssertParseColumns(options, nested_src(), | |
204 | {field("yo", utf8()), field("arr", list(utf8())), | |
205 | field("nuf", struct_({field("ps", utf8())}))}, | |
206 | {"[\"thing\", null, \"\xe5\xbf\x8d\", null]", | |
207 | R"([["1", "2", "3"], ["2"], [], null])", | |
208 | R"([{"ps":null}, {}, {"ps":"78"}, {"ps":"90"}])"}); | |
209 | } | |
210 | ||
211 | TEST(BlockParserWithSchema, FailOnIncompleteJson) { | |
212 | auto options = ParseOptions::Defaults(); | |
213 | options.explicit_schema = schema({field("a", int32())}); | |
214 | options.unexpected_field_behavior = UnexpectedFieldBehavior::Ignore; | |
215 | std::shared_ptr<Array> parsed; | |
216 | ASSERT_RAISES(Invalid, ParseFromString(options, "{\"a\":0, \"b\"", &parsed)); | |
217 | } | |
218 | ||
219 | TEST(BlockParser, Basics) { | |
220 | auto options = ParseOptions::Defaults(); | |
221 | options.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; | |
222 | AssertParseColumns( | |
223 | options, scalars_only_src(), | |
224 | {field("hello", utf8()), field("world", boolean()), field("yo", utf8())}, | |
225 | {"[\"3.5\", \"3.25\", \"3.125\", \"0.0\"]", "[false, null, null, true]", | |
226 | "[\"thing\", null, \"\xe5\xbf\x8d\", null]"}); | |
227 | } | |
228 | ||
229 | TEST(BlockParser, Nested) { | |
230 | auto options = ParseOptions::Defaults(); | |
231 | options.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; | |
232 | AssertParseColumns(options, nested_src(), | |
233 | {field("yo", utf8()), field("arr", list(utf8())), | |
234 | field("nuf", struct_({field("ps", utf8())}))}, | |
235 | {"[\"thing\", null, \"\xe5\xbf\x8d\", null]", | |
236 | R"([["1", "2", "3"], ["2"], [], null])", | |
237 | R"([{"ps":null}, {}, {"ps":"78"}, {"ps":"90"}])"}); | |
238 | } | |
239 | ||
240 | TEST(BlockParser, Null) { | |
241 | auto options = ParseOptions::Defaults(); | |
242 | options.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; | |
243 | AssertParseColumns( | |
244 | options, null_src(), | |
245 | {field("plain", null()), field("list1", list(null())), field("list2", list(null())), | |
246 | field("struct", struct_({field("plain", null())}))}, | |
247 | {"[null, null]", "[[], []]", "[[], [null]]", | |
248 | R"([{"plain": null}, {"plain": null}])"}); | |
249 | } | |
250 | ||
251 | TEST(BlockParser, AdHoc) { | |
252 | auto options = ParseOptions::Defaults(); | |
253 | options.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; | |
254 | AssertParseColumns( | |
255 | options, R"({"a": [1], "b": {"c": true, "d": "1991-02-03"}} | |
256 | {"a": [], "b": {"c": false, "d": "2019-04-01"}} | |
257 | )", | |
258 | {field("a", list(utf8())), | |
259 | field("b", struct_({field("c", boolean()), field("d", utf8())}))}, | |
260 | {R"([["1"], []])", | |
261 | R"([{"c":true, "d": "1991-02-03"}, {"c":false, "d":"2019-04-01"}])"}); | |
262 | } | |
263 | ||
264 | } // namespace json | |
265 | } // namespace arrow |