// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
21 #include <fstream> // IWYU pragma: keep
28 #include <gflags/gflags.h>
29 #include <gtest/gtest.h>
31 #include "arrow/array.h"
32 #include "arrow/array/builder_binary.h"
33 #include "arrow/array/builder_primitive.h"
34 #include "arrow/io/file.h"
35 #include "arrow/ipc/dictionary.h"
36 #include "arrow/ipc/reader.h"
37 #include "arrow/ipc/test_common.h"
38 #include "arrow/ipc/writer.h"
39 #include "arrow/pretty_print.h"
40 #include "arrow/status.h"
41 #include "arrow/testing/extension_type.h"
42 #include "arrow/testing/gtest_util.h"
43 #include "arrow/testing/json_integration.h"
44 #include "arrow/testing/json_internal.h"
45 #include "arrow/testing/random.h"
46 #include "arrow/type.h"
47 #include "arrow/type_fwd.h"
48 #include "arrow/util/io_util.h"
50 DEFINE_string(arrow
, "", "Arrow file name");
51 DEFINE_string(json
, "", "JSON file name");
54 "Mode of integration testing tool (ARROW_TO_JSON, JSON_TO_ARROW, VALIDATE)");
55 DEFINE_bool(integration
, false, "Run in integration test mode");
56 DEFINE_bool(verbose
, true, "Verbose output");
60 using internal::TemporaryDir
;
61 using ipc::DictionaryFieldMapper
;
62 using ipc::DictionaryMemo
;
63 using ipc::IpcWriteOptions
;
64 using ipc::MetadataVersion
;
68 using namespace ::arrow::ipc::test
; // NOLINT
70 // Convert JSON file to IPC binary format
71 static Status
ConvertJsonToArrow(const std::string
& json_path
,
72 const std::string
& arrow_path
) {
73 ARROW_ASSIGN_OR_RAISE(auto in_file
, io::ReadableFile::Open(json_path
));
74 ARROW_ASSIGN_OR_RAISE(auto out_file
, io::FileOutputStream::Open(arrow_path
));
76 ARROW_ASSIGN_OR_RAISE(int64_t file_size
, in_file
->GetSize());
77 ARROW_ASSIGN_OR_RAISE(auto json_buffer
, in_file
->Read(file_size
));
79 std::unique_ptr
<IntegrationJsonReader
> reader
;
80 RETURN_NOT_OK(IntegrationJsonReader::Open(json_buffer
, &reader
));
83 std::cout
<< "Found schema:\n"
84 << reader
->schema()->ToString(/* show_metadata = */ true) << std::endl
;
87 ARROW_ASSIGN_OR_RAISE(auto writer
, ipc::MakeFileWriter(out_file
, reader
->schema(),
88 IpcWriteOptions::Defaults()));
89 for (int i
= 0; i
< reader
->num_record_batches(); ++i
) {
90 std::shared_ptr
<RecordBatch
> batch
;
91 RETURN_NOT_OK(reader
->ReadRecordBatch(i
, &batch
));
92 RETURN_NOT_OK(writer
->WriteRecordBatch(*batch
));
94 return writer
->Close();
97 // Convert IPC binary format to JSON
98 static Status
ConvertArrowToJson(const std::string
& arrow_path
,
99 const std::string
& json_path
) {
100 ARROW_ASSIGN_OR_RAISE(auto in_file
, io::ReadableFile::Open(arrow_path
));
101 ARROW_ASSIGN_OR_RAISE(auto out_file
, io::FileOutputStream::Open(json_path
));
103 std::shared_ptr
<ipc::RecordBatchFileReader
> reader
;
104 ARROW_ASSIGN_OR_RAISE(reader
, ipc::RecordBatchFileReader::Open(in_file
.get()));
107 std::cout
<< "Found schema:\n" << reader
->schema()->ToString() << std::endl
;
110 std::unique_ptr
<IntegrationJsonWriter
> writer
;
111 RETURN_NOT_OK(IntegrationJsonWriter::Open(reader
->schema(), &writer
));
113 for (int i
= 0; i
< reader
->num_record_batches(); ++i
) {
114 ARROW_ASSIGN_OR_RAISE(std::shared_ptr
<RecordBatch
> batch
, reader
->ReadRecordBatch(i
));
115 RETURN_NOT_OK(writer
->WriteRecordBatch(*batch
));
119 RETURN_NOT_OK(writer
->Finish(&result
));
120 return out_file
->Write(result
.c_str(), static_cast<int64_t>(result
.size()));
123 static Status
ValidateArrowVsJson(const std::string
& arrow_path
,
124 const std::string
& json_path
) {
125 // Construct JSON reader
126 ARROW_ASSIGN_OR_RAISE(auto json_file
, io::ReadableFile::Open(json_path
));
128 ARROW_ASSIGN_OR_RAISE(int64_t file_size
, json_file
->GetSize());
129 ARROW_ASSIGN_OR_RAISE(auto json_buffer
, json_file
->Read(file_size
));
131 std::unique_ptr
<IntegrationJsonReader
> json_reader
;
132 RETURN_NOT_OK(IntegrationJsonReader::Open(json_buffer
, &json_reader
));
134 // Construct Arrow reader
135 ARROW_ASSIGN_OR_RAISE(auto arrow_file
, io::ReadableFile::Open(arrow_path
));
137 std::shared_ptr
<ipc::RecordBatchFileReader
> arrow_reader
;
138 ARROW_ASSIGN_OR_RAISE(arrow_reader
, ipc::RecordBatchFileReader::Open(arrow_file
.get()));
140 auto json_schema
= json_reader
->schema();
141 auto arrow_schema
= arrow_reader
->schema();
143 if (!json_schema
->Equals(*arrow_schema
)) {
144 std::stringstream ss
;
145 ss
<< "JSON schema: \n"
146 << json_schema
->ToString(/* show_metadata = */ true) << "\n\n"
147 << "Arrow schema: \n"
148 << arrow_schema
->ToString(/* show_metadata = */ true) << "\n";
151 std::cout
<< ss
.str() << std::endl
;
153 return Status::Invalid("Schemas did not match");
156 const int json_nbatches
= json_reader
->num_record_batches();
157 const int arrow_nbatches
= arrow_reader
->num_record_batches();
159 if (json_nbatches
!= arrow_nbatches
) {
160 return Status::Invalid("Different number of record batches: ", json_nbatches
,
161 " (JSON) vs ", arrow_nbatches
, " (Arrow)");
164 std::shared_ptr
<RecordBatch
> arrow_batch
;
165 std::shared_ptr
<RecordBatch
> json_batch
;
166 for (int i
= 0; i
< json_nbatches
; ++i
) {
167 RETURN_NOT_OK(json_reader
->ReadRecordBatch(i
, &json_batch
));
168 ARROW_ASSIGN_OR_RAISE(arrow_batch
, arrow_reader
->ReadRecordBatch(i
));
169 Status valid_st
= json_batch
->ValidateFull();
170 if (!valid_st
.ok()) {
171 return Status::Invalid("JSON record batch ", i
, " did not validate:\n",
172 valid_st
.ToString());
174 valid_st
= arrow_batch
->ValidateFull();
175 if (!valid_st
.ok()) {
176 return Status::Invalid("Arrow record batch ", i
, " did not validate:\n",
177 valid_st
.ToString());
180 if (!json_batch
->ApproxEquals(*arrow_batch
)) {
181 std::stringstream ss
;
182 ss
<< "Record batch " << i
<< " did not match";
185 RETURN_NOT_OK(PrettyPrint(*json_batch
, 0, &ss
));
188 RETURN_NOT_OK(PrettyPrint(*arrow_batch
, 0, &ss
));
189 return Status::Invalid(ss
.str());
196 Status
RunCommand(const std::string
& json_path
, const std::string
& arrow_path
,
197 const std::string
& command
) {
198 // Make sure the required extension types are registered, as they will be
199 // referenced in test data.
200 ExtensionTypeGuard
ext_guard({uuid(), dict_extension_type()});
202 if (json_path
== "") {
203 return Status::Invalid("Must specify json file name");
206 if (arrow_path
== "") {
207 return Status::Invalid("Must specify arrow file name");
210 auto file_exists
= [](const char* path
) { return std::ifstream(path
).good(); };
212 if (command
== "ARROW_TO_JSON") {
213 if (!file_exists(arrow_path
.c_str())) {
214 return Status::Invalid("Input file does not exist");
217 return ConvertArrowToJson(arrow_path
, json_path
);
218 } else if (command
== "JSON_TO_ARROW") {
219 if (!file_exists(json_path
.c_str())) {
220 return Status::Invalid("Input file does not exist");
223 return ConvertJsonToArrow(json_path
, arrow_path
);
224 } else if (command
== "VALIDATE") {
225 if (!file_exists(json_path
.c_str())) {
226 return Status::Invalid("JSON file does not exist");
229 if (!file_exists(arrow_path
.c_str())) {
230 return Status::Invalid("Arrow file does not exist");
233 return ValidateArrowVsJson(arrow_path
, json_path
);
235 return Status::Invalid("Unknown command: ", command
);
239 class TestJSONIntegration
: public ::testing::Test
{
242 ASSERT_OK_AND_ASSIGN(temp_dir_
, TemporaryDir::Make("json-integration-test-"));
245 std::string
mkstemp() {
246 std::stringstream ss
;
247 ss
<< temp_dir_
->path().ToString();
248 ss
<< "file" << ntemp_
++;
252 Status
WriteJson(const char* data
, const std::string
& path
) {
253 ARROW_ASSIGN_OR_RAISE(auto out_file
, io::FileOutputStream::Open(path
));
254 return out_file
->Write(data
, static_cast<int64_t>(strlen(data
)));
257 void TearDown() { temp_dir_
.reset(); }
260 std::unique_ptr
<TemporaryDir
> temp_dir_
;
264 static const char* JSON_EXAMPLE
= R
"example(
270 "type
": {"name
": "int", "isSigned
": true, "bitWidth
": 64},
271 "nullable
": true, "children
": []
275 "type
": {"name
": "floatingpoint
", "precision
": "DOUBLE
"},
276 "nullable
": true, "children
": []
287 "DATA
": ["1", "2", "3", "4", "5"],
288 "VALIDITY
": [1, 0, 1, 1, 1]
293 "DATA
": [1.0, 2.0, 3.0, 4.0, 5.0],
294 "VALIDITY
": [1, 0, 0, 1, 1]
304 "DATA
": ["-1", "0", "9223372036854775807", "-9223372036854775808"],
305 "VALIDITY
": [1, 0, 1, 1]
310 "DATA
": [1.0, 2.0, 3.0, 4.0],
311 "VALIDITY
": [1, 0, 0, 1]
319 static const char* JSON_EXAMPLE2
= R
"example(
325 "type
": {"name
": "int", "isSigned
": true, "bitWidth
": 32},
326 "nullable
": true, "children
": [],
328 {"key
": "converted_from_time32
", "value
": "true"}
333 {"key
": "schema_custom_0
", "value
": "eh
"}
343 "DATA
": [1, 2, 3, 4, 5],
344 "VALIDITY
": [1, 0, 1, 1, 1]
352 TEST_F(TestJSONIntegration
, ConvertAndValidate
) {
353 std::string json_path
= this->mkstemp();
354 std::string arrow_path
= this->mkstemp();
356 ASSERT_OK(WriteJson(JSON_EXAMPLE
, json_path
));
358 ASSERT_OK(RunCommand(json_path
, arrow_path
, "JSON_TO_ARROW"));
359 ASSERT_OK(RunCommand(json_path
, arrow_path
, "VALIDATE"));
361 // Convert and overwrite
362 ASSERT_OK(RunCommand(json_path
, arrow_path
, "ARROW_TO_JSON"));
364 // Convert back to arrow, and validate
365 ASSERT_OK(RunCommand(json_path
, arrow_path
, "JSON_TO_ARROW"));
366 ASSERT_OK(RunCommand(json_path
, arrow_path
, "VALIDATE"));
369 TEST_F(TestJSONIntegration
, ErrorStates
) {
370 std::string json_path
= this->mkstemp();
371 std::string json_path2
= this->mkstemp();
372 std::string arrow_path
= this->mkstemp();
374 ASSERT_OK(WriteJson(JSON_EXAMPLE
, json_path
));
375 ASSERT_OK(WriteJson(JSON_EXAMPLE2
, json_path2
));
377 ASSERT_OK(ConvertJsonToArrow(json_path
, arrow_path
));
378 ASSERT_RAISES(Invalid
, ValidateArrowVsJson(arrow_path
, json_path2
));
380 ASSERT_RAISES(IOError
, ValidateArrowVsJson("does_not_exist-1234", json_path2
));
381 ASSERT_RAISES(IOError
, ValidateArrowVsJson(arrow_path
, "does_not_exist-1234"));
383 ASSERT_RAISES(Invalid
, RunCommand("", arrow_path
, "VALIDATE"));
384 ASSERT_RAISES(Invalid
, RunCommand(json_path
, "", "VALIDATE"));
387 // A batch with primitive types
388 static const char* json_example1
= R
"example(
394 "type
": {"name
": "int", "isSigned
": true, "bitWidth
": 32},
395 "nullable
": true, "children
": []
399 "type
": {"name
": "floatingpoint
", "precision
": "DOUBLE
"},
400 "nullable
": true, "children
": []
411 "DATA
": [1, 2, 3, 4, 5],
412 "VALIDITY
": [1, 0, 1, 1, 1]
417 "DATA
": [1.0, 2.0, 3.0, 4.0, 5.0],
418 "VALIDITY
": [1, 0, 0, 1, 1]
426 // A batch with extension types
427 static const char* json_example2
= R
"example(
434 "name
" : "fixedsizebinary
",
440 {"key
": "ARROW
:extension
:name
", "value
": "uuid
"},
441 {"key
": "ARROW
:extension
:metadata
", "value
": "uuid
-serialized
"}
452 {"key
": "ARROW
:extension
:name
", "value
": "!does
not exist
!"},
453 {"key
": "ARROW
:extension
:metadata
", "value
": ""},
454 {"key
": "ARROW
:integration
:allow_unregistered_extension
", "value
": "true"}
466 "DATA
": ["30313233343536373839616263646566",
467 "00000000000000000000000000000000"],
480 // A batch with dict-extension types
481 static const char* json_example3
= R
"example(
486 "name
": "dict
-extensions
",
502 {"key
": "ARROW
:extension
:name
", "value
": "dict
-extension
"},
503 {"key
": "ARROW
:extension
:metadata
", "value
": "dict
-extension
-serialized
"}
543 "name
": "dict
-extensions
",
545 "DATA
": [2, 0, 1, 1, 2],
546 "VALIDITY
": [1, 1, 0, 1, 1]
554 // A batch with a map type with non-canonical field names
555 static const char* json_example4
= R
"example(
568 "name
": "some_entries
",
585 "name
": "some_value
",
605 "name
": "map_other_names
",
607 "VALIDITY
": [1, 0, 1],
608 "OFFSET
": [0, 3, 3, 5],
611 "name
": "some_entries
",
613 "VALIDITY
": [1, 1, 1, 1, 1],
618 "VALIDITY
": [1, 1, 1, 1, 1],
619 "DATA
": [11, 22, 33, 44, 55]
622 "name
": "some_value
",
624 "VALIDITY
": [1, 1, 0, 1, 1],
625 "DATA
": [111, 222, 0, 444, 555]
637 // An empty struct type, with "children" member in batches
638 static const char* json_example5
= R
"example(
643 "name
": "empty_struct
",
657 "name
": "empty_struct
",
659 "VALIDITY
": [1, 0, 1],
668 // An empty struct type, without "children" member in batches
669 static const char* json_example6
= R
"example(
674 "name
": "empty_struct
",
688 "name
": "empty_struct
",
698 void TestSchemaRoundTrip(const Schema
& schema
) {
700 rj::Writer
<rj::StringBuffer
> writer(sb
);
702 DictionaryFieldMapper
mapper(schema
);
704 writer
.StartObject();
705 ASSERT_OK(json::WriteSchema(schema
, mapper
, &writer
));
708 std::string json_schema
= sb
.GetString();
711 // Pass explicit size to avoid ASAN issues with
712 // SIMD loads in RapidJson.
713 d
.Parse(json_schema
.data(), json_schema
.size());
715 DictionaryMemo in_memo
;
716 std::shared_ptr
<Schema
> out
;
717 if (!json::ReadSchema(d
, default_memory_pool(), &in_memo
, &out
).ok()) {
718 FAIL() << "Unable to read JSON schema: " << json_schema
;
721 if (!schema
.Equals(*out
)) {
722 FAIL() << "In schema: " << schema
.ToString() << "\nOut schema: " << out
->ToString();
726 void TestArrayRoundTrip(const Array
& array
) {
727 static std::string name
= "dummy";
730 rj::Writer
<rj::StringBuffer
> writer(sb
);
732 ASSERT_OK(json::WriteArray(name
, array
, &writer
));
734 std::string array_as_json
= sb
.GetString();
737 // Pass explicit size to avoid ASAN issues with
738 // SIMD loads in RapidJson.
739 d
.Parse(array_as_json
.data(), array_as_json
.size());
741 if (d
.HasParseError()) {
742 FAIL() << "JSON parsing failed";
745 std::shared_ptr
<Array
> out
;
746 ASSERT_OK(json::ReadArray(default_memory_pool(), d
, ::arrow::field(name
, array
.type()),
749 // std::cout << array_as_json << std::endl;
750 CompareArraysDetailed(0, *out
, array
);
753 template <typename T
, typename ValueType
>
754 void CheckPrimitive(const std::shared_ptr
<DataType
>& type
,
755 const std::vector
<bool>& is_valid
,
756 const std::vector
<ValueType
>& values
) {
757 MemoryPool
* pool
= default_memory_pool();
758 typename TypeTraits
<T
>::BuilderType
builder(pool
);
760 for (size_t i
= 0; i
< values
.size(); ++i
) {
762 ASSERT_OK(builder
.Append(values
[i
]));
764 ASSERT_OK(builder
.AppendNull());
768 std::shared_ptr
<Array
> array
;
769 ASSERT_OK(builder
.Finish(&array
));
770 TestArrayRoundTrip(*array
);
773 TEST(TestJsonSchemaWriter
, FlatTypes
) {
775 // field("f14", date32())
776 std::vector
<std::shared_ptr
<Field
>> fields
= {
778 field("f1", int16(), false),
779 field("f2", int32()),
780 field("f3", int64(), false),
781 field("f4", uint8()),
782 field("f5", uint16()),
783 field("f6", uint32()),
784 field("f7", uint64()),
785 field("f8", float32()),
786 field("f9", float64()),
787 field("f10", utf8()),
788 field("f11", binary()),
789 field("f12", list(int32())),
790 field("f13", struct_({field("s1", int32()), field("s2", utf8())})),
791 field("f15", date64()),
792 field("f16", timestamp(TimeUnit::NANO
)),
793 field("f17", time64(TimeUnit::MICRO
)),
795 dense_union({field("u1", int8()), field("u2", time32(TimeUnit::MILLI
))},
797 field("f19", large_list(uint8())),
798 field("f20", null()),
801 Schema
schema(fields
);
802 TestSchemaRoundTrip(schema
);
805 template <typename T
>
806 void PrimitiveTypesCheckOne() {
807 using c_type
= typename
T::c_type
;
809 std::vector
<bool> is_valid
= {true, false, true, true, true, false, true, true};
810 std::vector
<c_type
> values
= {0, 1, 2, 3, 4, 5, 6, 7};
811 CheckPrimitive
<T
, c_type
>(std::make_shared
<T
>(), is_valid
, values
);
814 TEST(TestJsonArrayWriter
, NullType
) {
815 auto arr
= std::make_shared
<NullArray
>(10);
816 TestArrayRoundTrip(*arr
);
819 TEST(TestJsonArrayWriter
, PrimitiveTypes
) {
820 PrimitiveTypesCheckOne
<Int8Type
>();
821 PrimitiveTypesCheckOne
<Int16Type
>();
822 PrimitiveTypesCheckOne
<Int32Type
>();
823 PrimitiveTypesCheckOne
<Int64Type
>();
824 PrimitiveTypesCheckOne
<UInt8Type
>();
825 PrimitiveTypesCheckOne
<UInt16Type
>();
826 PrimitiveTypesCheckOne
<UInt32Type
>();
827 PrimitiveTypesCheckOne
<UInt64Type
>();
828 PrimitiveTypesCheckOne
<FloatType
>();
829 PrimitiveTypesCheckOne
<DoubleType
>();
831 std::vector
<bool> is_valid
= {true, false, true, true, true, false, true, true};
832 std::vector
<std::string
> values
= {"foo", "bar", "", "baz", "qux", "foo", "a", "1"};
834 CheckPrimitive
<StringType
, std::string
>(utf8(), is_valid
, values
);
835 CheckPrimitive
<BinaryType
, std::string
>(binary(), is_valid
, values
);
838 TEST(TestJsonArrayWriter
, NestedTypes
) {
839 auto value_type
= int32();
841 std::vector
<bool> values_is_valid
= {true, false, true, true, false, true, true};
843 std::vector
<int32_t> values
= {0, 1, 2, 3, 4, 5, 6};
844 std::shared_ptr
<Array
> values_array
;
845 ArrayFromVector
<Int32Type
, int32_t>(values_is_valid
, values
, &values_array
);
847 std::vector
<int16_t> i16_values
= {0, 1, 2, 3, 4, 5, 6};
848 std::shared_ptr
<Array
> i16_values_array
;
849 ArrayFromVector
<Int16Type
, int16_t>(values_is_valid
, i16_values
, &i16_values_array
);
852 std::vector
<bool> list_is_valid
= {true, false, true, true, true};
853 std::shared_ptr
<Buffer
> list_bitmap
;
854 ASSERT_OK(GetBitmapFromVector(list_is_valid
, &list_bitmap
));
855 std::vector
<int32_t> offsets
= {0, 0, 0, 1, 4, 7};
856 std::shared_ptr
<Buffer
> offsets_buffer
= Buffer::Wrap(offsets
);
858 ListArray
list_array(list(value_type
), 5, offsets_buffer
, values_array
, list_bitmap
,
860 TestArrayRoundTrip(list_array
);
864 std::vector
<int64_t> large_offsets
= {0, 0, 0, 1, 4, 7};
865 std::shared_ptr
<Buffer
> large_offsets_buffer
= Buffer::Wrap(large_offsets
);
867 LargeListArray
list_array(large_list(value_type
), 5, large_offsets_buffer
,
868 values_array
, list_bitmap
, 1);
869 TestArrayRoundTrip(list_array
);
873 auto map_type
= map(utf8(), int32());
874 auto keys_array
= ArrayFromJSON(utf8(), R
"(["a
", "b
", "c
", "d
", "a
", "b
", "c
"])");
876 MapArray
map_array(map_type
, 5, offsets_buffer
, keys_array
, values_array
, list_bitmap
,
879 TestArrayRoundTrip(map_array
);
882 FixedSizeListArray
fixed_size_list_array(fixed_size_list(value_type
, 2), 3,
883 values_array
->Slice(1), list_bitmap
, 1);
885 TestArrayRoundTrip(fixed_size_list_array
);
888 std::vector
<bool> struct_is_valid
= {true, false, true, true, true, false, true};
889 std::shared_ptr
<Buffer
> struct_bitmap
;
890 ASSERT_OK(GetBitmapFromVector(struct_is_valid
, &struct_bitmap
));
893 struct_({field("f1", int32()), field("f2", int32()), field("f3", int32())});
895 std::vector
<std::shared_ptr
<Array
>> fields
= {values_array
, values_array
, values_array
};
896 StructArray
struct_array(struct_type
, static_cast<int>(struct_is_valid
.size()), fields
,
898 TestArrayRoundTrip(struct_array
);
901 TEST(TestJsonArrayWriter
, Unions
) {
902 std::shared_ptr
<RecordBatch
> batch
;
903 ASSERT_OK(MakeUnion(&batch
));
905 for (int i
= 0; i
< batch
->num_columns(); ++i
) {
906 TestArrayRoundTrip(*batch
->column(i
));
910 // Data generation for test case below
911 void MakeBatchArrays(const std::shared_ptr
<Schema
>& schema
, const int num_rows
,
912 std::vector
<std::shared_ptr
<Array
>>* arrays
) {
913 const float null_prob
= 0.25f
;
914 random::RandomArrayGenerator
rand(0x564a3bf0);
916 *arrays
= {rand
.Boolean(num_rows
, 0.75, null_prob
),
917 rand
.Int8(num_rows
, 0, 100, null_prob
),
918 rand
.Int32(num_rows
, -1000, 1000, null_prob
),
919 rand
.UInt64(num_rows
, 0, 1UL << 16, null_prob
)};
921 static const int kBufferSize
= 10;
922 static uint8_t buffer
[kBufferSize
];
923 static uint32_t seed
= 0;
924 StringBuilder string_builder
;
925 for (int i
= 0; i
< num_rows
; ++i
) {
926 random_ascii(kBufferSize
, seed
++, buffer
);
927 ASSERT_OK(string_builder
.Append(buffer
, kBufferSize
));
929 std::shared_ptr
<Array
> v3
;
930 ASSERT_OK(string_builder
.Finish(&v3
));
932 arrays
->emplace_back(v3
);
935 TEST(TestJsonFileReadWrite
, BasicRoundTrip
) {
936 auto v1_type
= boolean();
937 auto v2_type
= int8();
938 auto v3_type
= int32();
939 auto v4_type
= uint64();
940 auto v5_type
= utf8();
943 ::arrow::schema({field("f1", v1_type
), field("f2", v2_type
), field("f3", v3_type
),
944 field("f4", v4_type
), field("f5", v5_type
)});
946 std::unique_ptr
<IntegrationJsonWriter
> writer
;
947 ASSERT_OK(IntegrationJsonWriter::Open(schema
, &writer
));
949 const int nbatches
= 3;
950 std::vector
<std::shared_ptr
<RecordBatch
>> batches
;
951 for (int i
= 0; i
< nbatches
; ++i
) {
952 int num_rows
= 5 + i
* 5;
953 std::vector
<std::shared_ptr
<Array
>> arrays
;
955 MakeBatchArrays(schema
, num_rows
, &arrays
);
956 auto batch
= RecordBatch::Make(schema
, num_rows
, arrays
);
957 batches
.push_back(batch
);
958 ASSERT_OK(writer
->WriteRecordBatch(*batch
));
962 ASSERT_OK(writer
->Finish(&result
));
964 std::unique_ptr
<IntegrationJsonReader
> reader
;
966 auto buffer
= std::make_shared
<Buffer
>(result
);
968 ASSERT_OK(IntegrationJsonReader::Open(buffer
, &reader
));
969 ASSERT_TRUE(reader
->schema()->Equals(*schema
));
971 ASSERT_EQ(nbatches
, reader
->num_record_batches());
973 for (int i
= 0; i
< nbatches
; ++i
) {
974 std::shared_ptr
<RecordBatch
> batch
;
975 ASSERT_OK(reader
->ReadRecordBatch(i
, &batch
));
976 ASSERT_BATCHES_EQUAL(*batch
, *batches
[i
]);
980 static void ReadOneBatchJson(const char* json
, const Schema
& expected_schema
,
981 std::shared_ptr
<RecordBatch
>* out
) {
982 auto buffer
= Buffer::Wrap(json
, strlen(json
));
984 std::unique_ptr
<IntegrationJsonReader
> reader
;
985 ASSERT_OK(IntegrationJsonReader::Open(buffer
, &reader
));
987 AssertSchemaEqual(*reader
->schema(), expected_schema
, /*check_metadata=*/true);
988 ASSERT_EQ(1, reader
->num_record_batches());
990 ASSERT_OK(reader
->ReadRecordBatch(0, out
));
993 TEST(TestJsonFileReadWrite
, JsonExample1
) {
994 Schema
ex_schema({field("foo", int32()), field("bar", float64())});
996 std::shared_ptr
<RecordBatch
> batch
;
997 ReadOneBatchJson(json_example1
, ex_schema
, &batch
);
999 std::vector
<bool> foo_valid
= {true, false, true, true, true};
1000 std::vector
<int32_t> foo_values
= {1, 2, 3, 4, 5};
1001 std::shared_ptr
<Array
> foo
;
1002 ArrayFromVector
<Int32Type
, int32_t>(foo_valid
, foo_values
, &foo
);
1003 ASSERT_TRUE(batch
->column(0)->Equals(foo
));
1005 std::vector
<bool> bar_valid
= {true, false, false, true, true};
1006 std::vector
<double> bar_values
= {1, 2, 3, 4, 5};
1007 std::shared_ptr
<Array
> bar
;
1008 ArrayFromVector
<DoubleType
, double>(bar_valid
, bar_values
, &bar
);
1009 ASSERT_TRUE(batch
->column(1)->Equals(bar
));
1012 TEST(TestJsonFileReadWrite
, JsonExample2
) {
1013 // Example 2: two extension types (one registered, one unregistered)
1014 auto uuid_type
= uuid();
1015 auto buffer
= Buffer::Wrap(json_example2
, strlen(json_example2
));
1017 std::unique_ptr
<IntegrationJsonReader
> reader
;
1019 ExtensionTypeGuard
ext_guard(uuid_type
);
1021 ASSERT_OK(IntegrationJsonReader::Open(buffer
, &reader
));
1022 // The second field is an unregistered extension and will be read as
1023 // its underlying storage.
1024 Schema
ex_schema({field("uuids", uuid_type
), field("things", null())});
1026 AssertSchemaEqual(ex_schema
, *reader
->schema());
1027 ASSERT_EQ(1, reader
->num_record_batches());
1029 std::shared_ptr
<RecordBatch
> batch
;
1030 ASSERT_OK(reader
->ReadRecordBatch(0, &batch
));
1032 auto storage_array
=
1033 ArrayFromJSON(fixed_size_binary(16), R
"(["0123456789abcdef
", null])");
1034 AssertArraysEqual(*batch
->column(0), UuidArray(uuid_type
, storage_array
));
1036 AssertArraysEqual(*batch
->column(1), NullArray(2));
1039 // Should fail now that the Uuid extension is unregistered
1040 ASSERT_RAISES(KeyError
, IntegrationJsonReader::Open(buffer
, &reader
));
1043 TEST(TestJsonFileReadWrite
, JsonExample3
) {
1044 // Example 3: An extension type with a dictionary storage type
1045 auto dict_ext_type
= std::make_shared
<DictExtensionType
>();
1046 ExtensionTypeGuard
ext_guard(dict_ext_type
);
1047 Schema
ex_schema({field("dict-extensions", dict_ext_type
)});
1049 std::shared_ptr
<RecordBatch
> batch
;
1050 ReadOneBatchJson(json_example3
, ex_schema
, &batch
);
1051 auto storage_array
= std::make_shared
<DictionaryArray
>(
1052 dict_ext_type
->storage_type(), ArrayFromJSON(int8(), "[2, 0, null, 1, 2]"),
1053 ArrayFromJSON(utf8(), R
"(["foo
", "bar
", "quux
"])"));
1054 AssertArraysEqual(*batch
->column(0), ExtensionArray(dict_ext_type
, storage_array
),
1058 TEST(TestJsonFileReadWrite
, JsonExample4
) {
1059 // Example 4: A map type with non-canonical field names
1060 ASSERT_OK_AND_ASSIGN(auto map_type
,
1061 MapType::Make(field("some_entries",
1062 struct_({field("some_key", int16(), false),
1063 field("some_value", int32())}),
1065 Schema
ex_schema({field("maps", map_type
)});
1067 std::shared_ptr
<RecordBatch
> batch
;
1068 ReadOneBatchJson(json_example4
, ex_schema
, &batch
);
1070 auto expected_array
= ArrayFromJSON(
1071 map(int16(), int32()),
1072 R
"([[[11, 111], [22, 222], [33, null]], null, [[44, 444], [55, 555]]])");
1073 AssertArraysEqual(*batch
->column(0), *expected_array
);
1076 TEST(TestJsonFileReadWrite
, JsonExample5
) {
1077 // Example 5: An empty struct
1078 auto struct_type
= struct_(FieldVector
{});
1079 Schema
ex_schema({field("empty_struct", struct_type
)});
1081 std::shared_ptr
<RecordBatch
> batch
;
1082 ReadOneBatchJson(json_example5
, ex_schema
, &batch
);
1084 auto expected_array
= ArrayFromJSON(struct_type
, "[{}, null, {}]");
1085 AssertArraysEqual(*batch
->column(0), *expected_array
);
1088 TEST(TestJsonFileReadWrite
, JsonExample6
) {
1089 // Example 6: An empty struct
1090 auto struct_type
= struct_(FieldVector
{});
1091 Schema
ex_schema({field("empty_struct", struct_type
)});
1093 std::shared_ptr
<RecordBatch
> batch
;
1094 ReadOneBatchJson(json_example6
, ex_schema
, &batch
);
1096 auto expected_array
= ArrayFromJSON(struct_type
, "[{}, null]");
1097 AssertArraysEqual(*batch
->column(0), *expected_array
);
1100 class TestJsonRoundTrip
: public ::testing::TestWithParam
<MakeRecordBatch
*> {
1106 void CheckRoundtrip(const RecordBatch
& batch
) {
1107 ExtensionTypeGuard
guard({uuid(), dict_extension_type(), complex128()});
1109 TestSchemaRoundTrip(*batch
.schema());
1111 std::unique_ptr
<IntegrationJsonWriter
> writer
;
1112 ASSERT_OK(IntegrationJsonWriter::Open(batch
.schema(), &writer
));
1113 ASSERT_OK(writer
->WriteRecordBatch(batch
));
1116 ASSERT_OK(writer
->Finish(&result
));
1118 auto buffer
= std::make_shared
<Buffer
>(result
);
1120 std::unique_ptr
<IntegrationJsonReader
> reader
;
1121 ASSERT_OK(IntegrationJsonReader::Open(buffer
, &reader
));
1123 std::shared_ptr
<RecordBatch
> result_batch
;
1124 ASSERT_OK(reader
->ReadRecordBatch(0, &result_batch
));
1126 // take care of float rounding error in the text representation
1127 ApproxCompareBatch(batch
, *result_batch
);
1130 TEST_P(TestJsonRoundTrip
, RoundTrip
) {
1131 std::shared_ptr
<RecordBatch
> batch
;
1132 ASSERT_OK((*GetParam())(&batch
)); // NOLINT clang-tidy gtest issue
1134 CheckRoundtrip(*batch
);
1137 const std::vector
<ipc::test::MakeRecordBatch
*> kBatchCases
= {
1138 &MakeIntRecordBatch
,
1139 &MakeListRecordBatch
,
1140 &MakeFixedSizeListRecordBatch
,
1141 &MakeNonNullRecordBatch
,
1142 &MakeZeroLengthRecordBatch
,
1143 &MakeDeeplyNestedList
,
1144 &MakeStringTypesRecordBatchWithNulls
,
1148 &MakeNestedDictionary
,
1150 &MakeMapOfDictionary
,
1162 &MakeDictExtension
};
// Instantiate the TestJsonRoundTrip parameterized tests once for each batch
// generator listed in kBatchCases.
INSTANTIATE_TEST_SUITE_P(TestJsonRoundTrip, TestJsonRoundTrip,
                         ::testing::ValuesIn(kBatchCases));
1167 } // namespace testing
1168 } // namespace arrow
1170 int main(int argc
, char** argv
) {
1171 gflags::ParseCommandLineFlags(&argc
, &argv
, true);
1175 if (FLAGS_integration
) {
1176 arrow::Status result
=
1177 arrow::testing::RunCommand(FLAGS_json
, FLAGS_arrow
, FLAGS_mode
);
1179 std::cout
<< "Error message: " << result
.ToString() << std::endl
;
1183 ::testing::InitGoogleTest(&argc
, argv
);
1184 ret
= RUN_ALL_TESTS();
1186 gflags::ShutDownCommandLineFlags();