]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/testing/json_integration_test.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / testing / json_integration_test.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include <cstdint>
19 #include <cstdio>
20 #include <cstring>
21 #include <fstream> // IWYU pragma: keep
22 #include <iostream>
23 #include <memory>
24 #include <sstream>
25 #include <string>
26 #include <vector>
27
28 #include <gflags/gflags.h>
29 #include <gtest/gtest.h>
30
31 #include "arrow/array.h"
32 #include "arrow/array/builder_binary.h"
33 #include "arrow/array/builder_primitive.h"
34 #include "arrow/io/file.h"
35 #include "arrow/ipc/dictionary.h"
36 #include "arrow/ipc/reader.h"
37 #include "arrow/ipc/test_common.h"
38 #include "arrow/ipc/writer.h"
39 #include "arrow/pretty_print.h"
40 #include "arrow/status.h"
41 #include "arrow/testing/extension_type.h"
42 #include "arrow/testing/gtest_util.h"
43 #include "arrow/testing/json_integration.h"
44 #include "arrow/testing/json_internal.h"
45 #include "arrow/testing/random.h"
46 #include "arrow/type.h"
47 #include "arrow/type_fwd.h"
48 #include "arrow/util/io_util.h"
49
50 DEFINE_string(arrow, "", "Arrow file name");
51 DEFINE_string(json, "", "JSON file name");
52 DEFINE_string(
53 mode, "VALIDATE",
54 "Mode of integration testing tool (ARROW_TO_JSON, JSON_TO_ARROW, VALIDATE)");
55 DEFINE_bool(integration, false, "Run in integration test mode");
56 DEFINE_bool(verbose, true, "Verbose output");
57
58 namespace arrow {
59
60 using internal::TemporaryDir;
61 using ipc::DictionaryFieldMapper;
62 using ipc::DictionaryMemo;
63 using ipc::IpcWriteOptions;
64 using ipc::MetadataVersion;
65
66 namespace testing {
67
68 using namespace ::arrow::ipc::test; // NOLINT
69
70 // Convert JSON file to IPC binary format
71 static Status ConvertJsonToArrow(const std::string& json_path,
72 const std::string& arrow_path) {
73 ARROW_ASSIGN_OR_RAISE(auto in_file, io::ReadableFile::Open(json_path));
74 ARROW_ASSIGN_OR_RAISE(auto out_file, io::FileOutputStream::Open(arrow_path));
75
76 ARROW_ASSIGN_OR_RAISE(int64_t file_size, in_file->GetSize());
77 ARROW_ASSIGN_OR_RAISE(auto json_buffer, in_file->Read(file_size));
78
79 std::unique_ptr<IntegrationJsonReader> reader;
80 RETURN_NOT_OK(IntegrationJsonReader::Open(json_buffer, &reader));
81
82 if (FLAGS_verbose) {
83 std::cout << "Found schema:\n"
84 << reader->schema()->ToString(/* show_metadata = */ true) << std::endl;
85 }
86
87 ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(out_file, reader->schema(),
88 IpcWriteOptions::Defaults()));
89 for (int i = 0; i < reader->num_record_batches(); ++i) {
90 std::shared_ptr<RecordBatch> batch;
91 RETURN_NOT_OK(reader->ReadRecordBatch(i, &batch));
92 RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
93 }
94 return writer->Close();
95 }
96
97 // Convert IPC binary format to JSON
98 static Status ConvertArrowToJson(const std::string& arrow_path,
99 const std::string& json_path) {
100 ARROW_ASSIGN_OR_RAISE(auto in_file, io::ReadableFile::Open(arrow_path));
101 ARROW_ASSIGN_OR_RAISE(auto out_file, io::FileOutputStream::Open(json_path));
102
103 std::shared_ptr<ipc::RecordBatchFileReader> reader;
104 ARROW_ASSIGN_OR_RAISE(reader, ipc::RecordBatchFileReader::Open(in_file.get()));
105
106 if (FLAGS_verbose) {
107 std::cout << "Found schema:\n" << reader->schema()->ToString() << std::endl;
108 }
109
110 std::unique_ptr<IntegrationJsonWriter> writer;
111 RETURN_NOT_OK(IntegrationJsonWriter::Open(reader->schema(), &writer));
112
113 for (int i = 0; i < reader->num_record_batches(); ++i) {
114 ARROW_ASSIGN_OR_RAISE(std::shared_ptr<RecordBatch> batch, reader->ReadRecordBatch(i));
115 RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
116 }
117
118 std::string result;
119 RETURN_NOT_OK(writer->Finish(&result));
120 return out_file->Write(result.c_str(), static_cast<int64_t>(result.size()));
121 }
122
123 static Status ValidateArrowVsJson(const std::string& arrow_path,
124 const std::string& json_path) {
125 // Construct JSON reader
126 ARROW_ASSIGN_OR_RAISE(auto json_file, io::ReadableFile::Open(json_path));
127
128 ARROW_ASSIGN_OR_RAISE(int64_t file_size, json_file->GetSize());
129 ARROW_ASSIGN_OR_RAISE(auto json_buffer, json_file->Read(file_size));
130
131 std::unique_ptr<IntegrationJsonReader> json_reader;
132 RETURN_NOT_OK(IntegrationJsonReader::Open(json_buffer, &json_reader));
133
134 // Construct Arrow reader
135 ARROW_ASSIGN_OR_RAISE(auto arrow_file, io::ReadableFile::Open(arrow_path));
136
137 std::shared_ptr<ipc::RecordBatchFileReader> arrow_reader;
138 ARROW_ASSIGN_OR_RAISE(arrow_reader, ipc::RecordBatchFileReader::Open(arrow_file.get()));
139
140 auto json_schema = json_reader->schema();
141 auto arrow_schema = arrow_reader->schema();
142
143 if (!json_schema->Equals(*arrow_schema)) {
144 std::stringstream ss;
145 ss << "JSON schema: \n"
146 << json_schema->ToString(/* show_metadata = */ true) << "\n\n"
147 << "Arrow schema: \n"
148 << arrow_schema->ToString(/* show_metadata = */ true) << "\n";
149
150 if (FLAGS_verbose) {
151 std::cout << ss.str() << std::endl;
152 }
153 return Status::Invalid("Schemas did not match");
154 }
155
156 const int json_nbatches = json_reader->num_record_batches();
157 const int arrow_nbatches = arrow_reader->num_record_batches();
158
159 if (json_nbatches != arrow_nbatches) {
160 return Status::Invalid("Different number of record batches: ", json_nbatches,
161 " (JSON) vs ", arrow_nbatches, " (Arrow)");
162 }
163
164 std::shared_ptr<RecordBatch> arrow_batch;
165 std::shared_ptr<RecordBatch> json_batch;
166 for (int i = 0; i < json_nbatches; ++i) {
167 RETURN_NOT_OK(json_reader->ReadRecordBatch(i, &json_batch));
168 ARROW_ASSIGN_OR_RAISE(arrow_batch, arrow_reader->ReadRecordBatch(i));
169 Status valid_st = json_batch->ValidateFull();
170 if (!valid_st.ok()) {
171 return Status::Invalid("JSON record batch ", i, " did not validate:\n",
172 valid_st.ToString());
173 }
174 valid_st = arrow_batch->ValidateFull();
175 if (!valid_st.ok()) {
176 return Status::Invalid("Arrow record batch ", i, " did not validate:\n",
177 valid_st.ToString());
178 }
179
180 if (!json_batch->ApproxEquals(*arrow_batch)) {
181 std::stringstream ss;
182 ss << "Record batch " << i << " did not match";
183
184 ss << "\nJSON:\n";
185 RETURN_NOT_OK(PrettyPrint(*json_batch, 0, &ss));
186
187 ss << "\nArrow:\n";
188 RETURN_NOT_OK(PrettyPrint(*arrow_batch, 0, &ss));
189 return Status::Invalid(ss.str());
190 }
191 }
192
193 return Status::OK();
194 }
195
196 Status RunCommand(const std::string& json_path, const std::string& arrow_path,
197 const std::string& command) {
198 // Make sure the required extension types are registered, as they will be
199 // referenced in test data.
200 ExtensionTypeGuard ext_guard({uuid(), dict_extension_type()});
201
202 if (json_path == "") {
203 return Status::Invalid("Must specify json file name");
204 }
205
206 if (arrow_path == "") {
207 return Status::Invalid("Must specify arrow file name");
208 }
209
210 auto file_exists = [](const char* path) { return std::ifstream(path).good(); };
211
212 if (command == "ARROW_TO_JSON") {
213 if (!file_exists(arrow_path.c_str())) {
214 return Status::Invalid("Input file does not exist");
215 }
216
217 return ConvertArrowToJson(arrow_path, json_path);
218 } else if (command == "JSON_TO_ARROW") {
219 if (!file_exists(json_path.c_str())) {
220 return Status::Invalid("Input file does not exist");
221 }
222
223 return ConvertJsonToArrow(json_path, arrow_path);
224 } else if (command == "VALIDATE") {
225 if (!file_exists(json_path.c_str())) {
226 return Status::Invalid("JSON file does not exist");
227 }
228
229 if (!file_exists(arrow_path.c_str())) {
230 return Status::Invalid("Arrow file does not exist");
231 }
232
233 return ValidateArrowVsJson(arrow_path, json_path);
234 } else {
235 return Status::Invalid("Unknown command: ", command);
236 }
237 }
238
239 class TestJSONIntegration : public ::testing::Test {
240 public:
241 void SetUp() {
242 ASSERT_OK_AND_ASSIGN(temp_dir_, TemporaryDir::Make("json-integration-test-"));
243 }
244
245 std::string mkstemp() {
246 std::stringstream ss;
247 ss << temp_dir_->path().ToString();
248 ss << "file" << ntemp_++;
249 return ss.str();
250 }
251
252 Status WriteJson(const char* data, const std::string& path) {
253 ARROW_ASSIGN_OR_RAISE(auto out_file, io::FileOutputStream::Open(path));
254 return out_file->Write(data, static_cast<int64_t>(strlen(data)));
255 }
256
257 void TearDown() { temp_dir_.reset(); }
258
259 protected:
260 std::unique_ptr<TemporaryDir> temp_dir_;
261 int ntemp_ = 1;
262 };
263
264 static const char* JSON_EXAMPLE = R"example(
265 {
266 "schema": {
267 "fields": [
268 {
269 "name": "foo",
270 "type": {"name": "int", "isSigned": true, "bitWidth": 64},
271 "nullable": true, "children": []
272 },
273 {
274 "name": "bar",
275 "type": {"name": "floatingpoint", "precision": "DOUBLE"},
276 "nullable": true, "children": []
277 }
278 ]
279 },
280 "batches": [
281 {
282 "count": 5,
283 "columns": [
284 {
285 "name": "foo",
286 "count": 5,
287 "DATA": ["1", "2", "3", "4", "5"],
288 "VALIDITY": [1, 0, 1, 1, 1]
289 },
290 {
291 "name": "bar",
292 "count": 5,
293 "DATA": [1.0, 2.0, 3.0, 4.0, 5.0],
294 "VALIDITY": [1, 0, 0, 1, 1]
295 }
296 ]
297 },
298 {
299 "count": 4,
300 "columns": [
301 {
302 "name": "foo",
303 "count": 4,
304 "DATA": ["-1", "0", "9223372036854775807", "-9223372036854775808"],
305 "VALIDITY": [1, 0, 1, 1]
306 },
307 {
308 "name": "bar",
309 "count": 4,
310 "DATA": [1.0, 2.0, 3.0, 4.0],
311 "VALIDITY": [1, 0, 0, 1]
312 }
313 ]
314 }
315 ]
316 }
317 )example";
318
319 static const char* JSON_EXAMPLE2 = R"example(
320 {
321 "schema": {
322 "fields": [
323 {
324 "name": "foo",
325 "type": {"name": "int", "isSigned": true, "bitWidth": 32},
326 "nullable": true, "children": [],
327 "metadata": [
328 {"key": "converted_from_time32", "value": "true"}
329 ]
330 }
331 ],
332 "metadata": [
333 {"key": "schema_custom_0", "value": "eh"}
334 ]
335 },
336 "batches": [
337 {
338 "count": 5,
339 "columns": [
340 {
341 "name": "foo",
342 "count": 5,
343 "DATA": [1, 2, 3, 4, 5],
344 "VALIDITY": [1, 0, 1, 1, 1]
345 }
346 ]
347 }
348 ]
349 }
350 )example";
351
352 TEST_F(TestJSONIntegration, ConvertAndValidate) {
353 std::string json_path = this->mkstemp();
354 std::string arrow_path = this->mkstemp();
355
356 ASSERT_OK(WriteJson(JSON_EXAMPLE, json_path));
357
358 ASSERT_OK(RunCommand(json_path, arrow_path, "JSON_TO_ARROW"));
359 ASSERT_OK(RunCommand(json_path, arrow_path, "VALIDATE"));
360
361 // Convert and overwrite
362 ASSERT_OK(RunCommand(json_path, arrow_path, "ARROW_TO_JSON"));
363
364 // Convert back to arrow, and validate
365 ASSERT_OK(RunCommand(json_path, arrow_path, "JSON_TO_ARROW"));
366 ASSERT_OK(RunCommand(json_path, arrow_path, "VALIDATE"));
367 }
368
369 TEST_F(TestJSONIntegration, ErrorStates) {
370 std::string json_path = this->mkstemp();
371 std::string json_path2 = this->mkstemp();
372 std::string arrow_path = this->mkstemp();
373
374 ASSERT_OK(WriteJson(JSON_EXAMPLE, json_path));
375 ASSERT_OK(WriteJson(JSON_EXAMPLE2, json_path2));
376
377 ASSERT_OK(ConvertJsonToArrow(json_path, arrow_path));
378 ASSERT_RAISES(Invalid, ValidateArrowVsJson(arrow_path, json_path2));
379
380 ASSERT_RAISES(IOError, ValidateArrowVsJson("does_not_exist-1234", json_path2));
381 ASSERT_RAISES(IOError, ValidateArrowVsJson(arrow_path, "does_not_exist-1234"));
382
383 ASSERT_RAISES(Invalid, RunCommand("", arrow_path, "VALIDATE"));
384 ASSERT_RAISES(Invalid, RunCommand(json_path, "", "VALIDATE"));
385 }
386
387 // A batch with primitive types
388 static const char* json_example1 = R"example(
389 {
390 "schema": {
391 "fields": [
392 {
393 "name": "foo",
394 "type": {"name": "int", "isSigned": true, "bitWidth": 32},
395 "nullable": true, "children": []
396 },
397 {
398 "name": "bar",
399 "type": {"name": "floatingpoint", "precision": "DOUBLE"},
400 "nullable": true, "children": []
401 }
402 ]
403 },
404 "batches": [
405 {
406 "count": 5,
407 "columns": [
408 {
409 "name": "foo",
410 "count": 5,
411 "DATA": [1, 2, 3, 4, 5],
412 "VALIDITY": [1, 0, 1, 1, 1]
413 },
414 {
415 "name": "bar",
416 "count": 5,
417 "DATA": [1.0, 2.0, 3.0, 4.0, 5.0],
418 "VALIDITY": [1, 0, 0, 1, 1]
419 }
420 ]
421 }
422 ]
423 }
424 )example";
425
426 // A batch with extension types
427 static const char* json_example2 = R"example(
428 {
429 "schema": {
430 "fields": [
431 {
432 "name": "uuids",
433 "type" : {
434 "name" : "fixedsizebinary",
435 "byteWidth" : 16
436 },
437 "nullable": true,
438 "children" : [],
439 "metadata" : [
440 {"key": "ARROW:extension:name", "value": "uuid"},
441 {"key": "ARROW:extension:metadata", "value": "uuid-serialized"}
442 ]
443 },
444 {
445 "name": "things",
446 "type" : {
447 "name" : "null"
448 },
449 "nullable": true,
450 "children" : [],
451 "metadata" : [
452 {"key": "ARROW:extension:name", "value": "!does not exist!"},
453 {"key": "ARROW:extension:metadata", "value": ""},
454 {"key": "ARROW:integration:allow_unregistered_extension", "value": "true"}
455 ]
456 }
457 ]
458 },
459 "batches": [
460 {
461 "count": 2,
462 "columns": [
463 {
464 "name": "uuids",
465 "count": 2,
466 "DATA": ["30313233343536373839616263646566",
467 "00000000000000000000000000000000"],
468 "VALIDITY": [1, 0]
469 },
470 {
471 "name": "things",
472 "count": 2
473 }
474 ]
475 }
476 ]
477 }
478 )example";
479
480 // A batch with dict-extension types
481 static const char* json_example3 = R"example(
482 {
483 "schema": {
484 "fields": [
485 {
486 "name": "dict-extensions",
487 "type" : {
488 "name" : "utf8"
489 },
490 "nullable": true,
491 "children" : [],
492 "dictionary": {
493 "id": 0,
494 "indexType": {
495 "name": "int",
496 "isSigned": true,
497 "bitWidth": 8
498 },
499 "isOrdered": false
500 },
501 "metadata" : [
502 {"key": "ARROW:extension:name", "value": "dict-extension"},
503 {"key": "ARROW:extension:metadata", "value": "dict-extension-serialized"}
504 ]
505 }
506 ]
507 },
508 "dictionaries": [
509 {
510 "id": 0,
511 "data": {
512 "count": 3,
513 "columns": [
514 {
515 "name": "DICT0",
516 "count": 3,
517 "VALIDITY": [
518 1,
519 1,
520 1
521 ],
522 "OFFSET": [
523 0,
524 3,
525 6,
526 10
527 ],
528 "DATA": [
529 "foo",
530 "bar",
531 "quux"
532 ]
533 }
534 ]
535 }
536 }
537 ],
538 "batches": [
539 {
540 "count": 5,
541 "columns": [
542 {
543 "name": "dict-extensions",
544 "count": 5,
545 "DATA": [2, 0, 1, 1, 2],
546 "VALIDITY": [1, 1, 0, 1, 1]
547 }
548 ]
549 }
550 ]
551 }
552 )example";
553
554 // A batch with a map type with non-canonical field names
555 static const char* json_example4 = R"example(
556 {
557 "schema": {
558 "fields": [
559 {
560 "name": "maps",
561 "type": {
562 "name": "map",
563 "keysSorted": false
564 },
565 "nullable": true,
566 "children": [
567 {
568 "name": "some_entries",
569 "type": {
570 "name": "struct"
571 },
572 "nullable": false,
573 "children": [
574 {
575 "name": "some_key",
576 "type": {
577 "name": "int",
578 "isSigned": true,
579 "bitWidth": 16
580 },
581 "nullable": false,
582 "children": []
583 },
584 {
585 "name": "some_value",
586 "type": {
587 "name": "int",
588 "isSigned": true,
589 "bitWidth": 32
590 },
591 "nullable": true,
592 "children": []
593 }
594 ]
595 }
596 ]
597 }
598 ]
599 },
600 "batches": [
601 {
602 "count": 3,
603 "columns": [
604 {
605 "name": "map_other_names",
606 "count": 3,
607 "VALIDITY": [1, 0, 1],
608 "OFFSET": [0, 3, 3, 5],
609 "children": [
610 {
611 "name": "some_entries",
612 "count": 5,
613 "VALIDITY": [1, 1, 1, 1, 1],
614 "children": [
615 {
616 "name": "some_key",
617 "count": 5,
618 "VALIDITY": [1, 1, 1, 1, 1],
619 "DATA": [11, 22, 33, 44, 55]
620 },
621 {
622 "name": "some_value",
623 "count": 5,
624 "VALIDITY": [1, 1, 0, 1, 1],
625 "DATA": [111, 222, 0, 444, 555]
626 }
627 ]
628 }
629 ]
630 }
631 ]
632 }
633 ]
634 }
635 )example";
636
637 // An empty struct type, with "children" member in batches
638 static const char* json_example5 = R"example(
639 {
640 "schema": {
641 "fields": [
642 {
643 "name": "empty_struct",
644 "nullable": true,
645 "type": {
646 "name": "struct"
647 },
648 "children": []
649 }
650 ]
651 },
652 "batches": [
653 {
654 "count": 3,
655 "columns": [
656 {
657 "name": "empty_struct",
658 "count": 3,
659 "VALIDITY": [1, 0, 1],
660 "children": []
661 }
662 ]
663 }
664 ]
665 }
666 )example";
667
668 // An empty struct type, without "children" member in batches
669 static const char* json_example6 = R"example(
670 {
671 "schema": {
672 "fields": [
673 {
674 "name": "empty_struct",
675 "nullable": true,
676 "type": {
677 "name": "struct"
678 },
679 "children": []
680 }
681 ]
682 },
683 "batches": [
684 {
685 "count": 2,
686 "columns": [
687 {
688 "name": "empty_struct",
689 "count": 2,
690 "VALIDITY": [1, 0]
691 }
692 ]
693 }
694 ]
695 }
696 )example";
697
698 void TestSchemaRoundTrip(const Schema& schema) {
699 rj::StringBuffer sb;
700 rj::Writer<rj::StringBuffer> writer(sb);
701
702 DictionaryFieldMapper mapper(schema);
703
704 writer.StartObject();
705 ASSERT_OK(json::WriteSchema(schema, mapper, &writer));
706 writer.EndObject();
707
708 std::string json_schema = sb.GetString();
709
710 rj::Document d;
711 // Pass explicit size to avoid ASAN issues with
712 // SIMD loads in RapidJson.
713 d.Parse(json_schema.data(), json_schema.size());
714
715 DictionaryMemo in_memo;
716 std::shared_ptr<Schema> out;
717 if (!json::ReadSchema(d, default_memory_pool(), &in_memo, &out).ok()) {
718 FAIL() << "Unable to read JSON schema: " << json_schema;
719 }
720
721 if (!schema.Equals(*out)) {
722 FAIL() << "In schema: " << schema.ToString() << "\nOut schema: " << out->ToString();
723 }
724 }
725
726 void TestArrayRoundTrip(const Array& array) {
727 static std::string name = "dummy";
728
729 rj::StringBuffer sb;
730 rj::Writer<rj::StringBuffer> writer(sb);
731
732 ASSERT_OK(json::WriteArray(name, array, &writer));
733
734 std::string array_as_json = sb.GetString();
735
736 rj::Document d;
737 // Pass explicit size to avoid ASAN issues with
738 // SIMD loads in RapidJson.
739 d.Parse(array_as_json.data(), array_as_json.size());
740
741 if (d.HasParseError()) {
742 FAIL() << "JSON parsing failed";
743 }
744
745 std::shared_ptr<Array> out;
746 ASSERT_OK(json::ReadArray(default_memory_pool(), d, ::arrow::field(name, array.type()),
747 &out));
748
749 // std::cout << array_as_json << std::endl;
750 CompareArraysDetailed(0, *out, array);
751 }
752
753 template <typename T, typename ValueType>
754 void CheckPrimitive(const std::shared_ptr<DataType>& type,
755 const std::vector<bool>& is_valid,
756 const std::vector<ValueType>& values) {
757 MemoryPool* pool = default_memory_pool();
758 typename TypeTraits<T>::BuilderType builder(pool);
759
760 for (size_t i = 0; i < values.size(); ++i) {
761 if (is_valid[i]) {
762 ASSERT_OK(builder.Append(values[i]));
763 } else {
764 ASSERT_OK(builder.AppendNull());
765 }
766 }
767
768 std::shared_ptr<Array> array;
769 ASSERT_OK(builder.Finish(&array));
770 TestArrayRoundTrip(*array);
771 }
772
773 TEST(TestJsonSchemaWriter, FlatTypes) {
774 // TODO
775 // field("f14", date32())
776 std::vector<std::shared_ptr<Field>> fields = {
777 field("f0", int8()),
778 field("f1", int16(), false),
779 field("f2", int32()),
780 field("f3", int64(), false),
781 field("f4", uint8()),
782 field("f5", uint16()),
783 field("f6", uint32()),
784 field("f7", uint64()),
785 field("f8", float32()),
786 field("f9", float64()),
787 field("f10", utf8()),
788 field("f11", binary()),
789 field("f12", list(int32())),
790 field("f13", struct_({field("s1", int32()), field("s2", utf8())})),
791 field("f15", date64()),
792 field("f16", timestamp(TimeUnit::NANO)),
793 field("f17", time64(TimeUnit::MICRO)),
794 field("f18",
795 dense_union({field("u1", int8()), field("u2", time32(TimeUnit::MILLI))},
796 {0, 1})),
797 field("f19", large_list(uint8())),
798 field("f20", null()),
799 };
800
801 Schema schema(fields);
802 TestSchemaRoundTrip(schema);
803 }
804
805 template <typename T>
806 void PrimitiveTypesCheckOne() {
807 using c_type = typename T::c_type;
808
809 std::vector<bool> is_valid = {true, false, true, true, true, false, true, true};
810 std::vector<c_type> values = {0, 1, 2, 3, 4, 5, 6, 7};
811 CheckPrimitive<T, c_type>(std::make_shared<T>(), is_valid, values);
812 }
813
814 TEST(TestJsonArrayWriter, NullType) {
815 auto arr = std::make_shared<NullArray>(10);
816 TestArrayRoundTrip(*arr);
817 }
818
819 TEST(TestJsonArrayWriter, PrimitiveTypes) {
820 PrimitiveTypesCheckOne<Int8Type>();
821 PrimitiveTypesCheckOne<Int16Type>();
822 PrimitiveTypesCheckOne<Int32Type>();
823 PrimitiveTypesCheckOne<Int64Type>();
824 PrimitiveTypesCheckOne<UInt8Type>();
825 PrimitiveTypesCheckOne<UInt16Type>();
826 PrimitiveTypesCheckOne<UInt32Type>();
827 PrimitiveTypesCheckOne<UInt64Type>();
828 PrimitiveTypesCheckOne<FloatType>();
829 PrimitiveTypesCheckOne<DoubleType>();
830
831 std::vector<bool> is_valid = {true, false, true, true, true, false, true, true};
832 std::vector<std::string> values = {"foo", "bar", "", "baz", "qux", "foo", "a", "1"};
833
834 CheckPrimitive<StringType, std::string>(utf8(), is_valid, values);
835 CheckPrimitive<BinaryType, std::string>(binary(), is_valid, values);
836 }
837
838 TEST(TestJsonArrayWriter, NestedTypes) {
839 auto value_type = int32();
840
841 std::vector<bool> values_is_valid = {true, false, true, true, false, true, true};
842
843 std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
844 std::shared_ptr<Array> values_array;
845 ArrayFromVector<Int32Type, int32_t>(values_is_valid, values, &values_array);
846
847 std::vector<int16_t> i16_values = {0, 1, 2, 3, 4, 5, 6};
848 std::shared_ptr<Array> i16_values_array;
849 ArrayFromVector<Int16Type, int16_t>(values_is_valid, i16_values, &i16_values_array);
850
851 // List
852 std::vector<bool> list_is_valid = {true, false, true, true, true};
853 std::shared_ptr<Buffer> list_bitmap;
854 ASSERT_OK(GetBitmapFromVector(list_is_valid, &list_bitmap));
855 std::vector<int32_t> offsets = {0, 0, 0, 1, 4, 7};
856 std::shared_ptr<Buffer> offsets_buffer = Buffer::Wrap(offsets);
857 {
858 ListArray list_array(list(value_type), 5, offsets_buffer, values_array, list_bitmap,
859 1);
860 TestArrayRoundTrip(list_array);
861 }
862
863 // LargeList
864 std::vector<int64_t> large_offsets = {0, 0, 0, 1, 4, 7};
865 std::shared_ptr<Buffer> large_offsets_buffer = Buffer::Wrap(large_offsets);
866 {
867 LargeListArray list_array(large_list(value_type), 5, large_offsets_buffer,
868 values_array, list_bitmap, 1);
869 TestArrayRoundTrip(list_array);
870 }
871
872 // Map
873 auto map_type = map(utf8(), int32());
874 auto keys_array = ArrayFromJSON(utf8(), R"(["a", "b", "c", "d", "a", "b", "c"])");
875
876 MapArray map_array(map_type, 5, offsets_buffer, keys_array, values_array, list_bitmap,
877 1);
878
879 TestArrayRoundTrip(map_array);
880
881 // FixedSizeList
882 FixedSizeListArray fixed_size_list_array(fixed_size_list(value_type, 2), 3,
883 values_array->Slice(1), list_bitmap, 1);
884
885 TestArrayRoundTrip(fixed_size_list_array);
886
887 // Struct
888 std::vector<bool> struct_is_valid = {true, false, true, true, true, false, true};
889 std::shared_ptr<Buffer> struct_bitmap;
890 ASSERT_OK(GetBitmapFromVector(struct_is_valid, &struct_bitmap));
891
892 auto struct_type =
893 struct_({field("f1", int32()), field("f2", int32()), field("f3", int32())});
894
895 std::vector<std::shared_ptr<Array>> fields = {values_array, values_array, values_array};
896 StructArray struct_array(struct_type, static_cast<int>(struct_is_valid.size()), fields,
897 struct_bitmap, 2);
898 TestArrayRoundTrip(struct_array);
899 }
900
901 TEST(TestJsonArrayWriter, Unions) {
902 std::shared_ptr<RecordBatch> batch;
903 ASSERT_OK(MakeUnion(&batch));
904
905 for (int i = 0; i < batch->num_columns(); ++i) {
906 TestArrayRoundTrip(*batch->column(i));
907 }
908 }
909
910 // Data generation for test case below
911 void MakeBatchArrays(const std::shared_ptr<Schema>& schema, const int num_rows,
912 std::vector<std::shared_ptr<Array>>* arrays) {
913 const float null_prob = 0.25f;
914 random::RandomArrayGenerator rand(0x564a3bf0);
915
916 *arrays = {rand.Boolean(num_rows, 0.75, null_prob),
917 rand.Int8(num_rows, 0, 100, null_prob),
918 rand.Int32(num_rows, -1000, 1000, null_prob),
919 rand.UInt64(num_rows, 0, 1UL << 16, null_prob)};
920
921 static const int kBufferSize = 10;
922 static uint8_t buffer[kBufferSize];
923 static uint32_t seed = 0;
924 StringBuilder string_builder;
925 for (int i = 0; i < num_rows; ++i) {
926 random_ascii(kBufferSize, seed++, buffer);
927 ASSERT_OK(string_builder.Append(buffer, kBufferSize));
928 }
929 std::shared_ptr<Array> v3;
930 ASSERT_OK(string_builder.Finish(&v3));
931
932 arrays->emplace_back(v3);
933 }
934
935 TEST(TestJsonFileReadWrite, BasicRoundTrip) {
936 auto v1_type = boolean();
937 auto v2_type = int8();
938 auto v3_type = int32();
939 auto v4_type = uint64();
940 auto v5_type = utf8();
941
942 auto schema =
943 ::arrow::schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type),
944 field("f4", v4_type), field("f5", v5_type)});
945
946 std::unique_ptr<IntegrationJsonWriter> writer;
947 ASSERT_OK(IntegrationJsonWriter::Open(schema, &writer));
948
949 const int nbatches = 3;
950 std::vector<std::shared_ptr<RecordBatch>> batches;
951 for (int i = 0; i < nbatches; ++i) {
952 int num_rows = 5 + i * 5;
953 std::vector<std::shared_ptr<Array>> arrays;
954
955 MakeBatchArrays(schema, num_rows, &arrays);
956 auto batch = RecordBatch::Make(schema, num_rows, arrays);
957 batches.push_back(batch);
958 ASSERT_OK(writer->WriteRecordBatch(*batch));
959 }
960
961 std::string result;
962 ASSERT_OK(writer->Finish(&result));
963
964 std::unique_ptr<IntegrationJsonReader> reader;
965
966 auto buffer = std::make_shared<Buffer>(result);
967
968 ASSERT_OK(IntegrationJsonReader::Open(buffer, &reader));
969 ASSERT_TRUE(reader->schema()->Equals(*schema));
970
971 ASSERT_EQ(nbatches, reader->num_record_batches());
972
973 for (int i = 0; i < nbatches; ++i) {
974 std::shared_ptr<RecordBatch> batch;
975 ASSERT_OK(reader->ReadRecordBatch(i, &batch));
976 ASSERT_BATCHES_EQUAL(*batch, *batches[i]);
977 }
978 }
979
980 static void ReadOneBatchJson(const char* json, const Schema& expected_schema,
981 std::shared_ptr<RecordBatch>* out) {
982 auto buffer = Buffer::Wrap(json, strlen(json));
983
984 std::unique_ptr<IntegrationJsonReader> reader;
985 ASSERT_OK(IntegrationJsonReader::Open(buffer, &reader));
986
987 AssertSchemaEqual(*reader->schema(), expected_schema, /*check_metadata=*/true);
988 ASSERT_EQ(1, reader->num_record_batches());
989
990 ASSERT_OK(reader->ReadRecordBatch(0, out));
991 }
992
993 TEST(TestJsonFileReadWrite, JsonExample1) {
994 Schema ex_schema({field("foo", int32()), field("bar", float64())});
995
996 std::shared_ptr<RecordBatch> batch;
997 ReadOneBatchJson(json_example1, ex_schema, &batch);
998
999 std::vector<bool> foo_valid = {true, false, true, true, true};
1000 std::vector<int32_t> foo_values = {1, 2, 3, 4, 5};
1001 std::shared_ptr<Array> foo;
1002 ArrayFromVector<Int32Type, int32_t>(foo_valid, foo_values, &foo);
1003 ASSERT_TRUE(batch->column(0)->Equals(foo));
1004
1005 std::vector<bool> bar_valid = {true, false, false, true, true};
1006 std::vector<double> bar_values = {1, 2, 3, 4, 5};
1007 std::shared_ptr<Array> bar;
1008 ArrayFromVector<DoubleType, double>(bar_valid, bar_values, &bar);
1009 ASSERT_TRUE(batch->column(1)->Equals(bar));
1010 }
1011
1012 TEST(TestJsonFileReadWrite, JsonExample2) {
1013 // Example 2: two extension types (one registered, one unregistered)
1014 auto uuid_type = uuid();
1015 auto buffer = Buffer::Wrap(json_example2, strlen(json_example2));
1016
1017 std::unique_ptr<IntegrationJsonReader> reader;
1018 {
1019 ExtensionTypeGuard ext_guard(uuid_type);
1020
1021 ASSERT_OK(IntegrationJsonReader::Open(buffer, &reader));
1022 // The second field is an unregistered extension and will be read as
1023 // its underlying storage.
1024 Schema ex_schema({field("uuids", uuid_type), field("things", null())});
1025
1026 AssertSchemaEqual(ex_schema, *reader->schema());
1027 ASSERT_EQ(1, reader->num_record_batches());
1028
1029 std::shared_ptr<RecordBatch> batch;
1030 ASSERT_OK(reader->ReadRecordBatch(0, &batch));
1031
1032 auto storage_array =
1033 ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])");
1034 AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array));
1035
1036 AssertArraysEqual(*batch->column(1), NullArray(2));
1037 }
1038
1039 // Should fail now that the Uuid extension is unregistered
1040 ASSERT_RAISES(KeyError, IntegrationJsonReader::Open(buffer, &reader));
1041 }
1042
1043 TEST(TestJsonFileReadWrite, JsonExample3) {
1044 // Example 3: An extension type with a dictionary storage type
1045 auto dict_ext_type = std::make_shared<DictExtensionType>();
1046 ExtensionTypeGuard ext_guard(dict_ext_type);
1047 Schema ex_schema({field("dict-extensions", dict_ext_type)});
1048
1049 std::shared_ptr<RecordBatch> batch;
1050 ReadOneBatchJson(json_example3, ex_schema, &batch);
1051 auto storage_array = std::make_shared<DictionaryArray>(
1052 dict_ext_type->storage_type(), ArrayFromJSON(int8(), "[2, 0, null, 1, 2]"),
1053 ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"));
1054 AssertArraysEqual(*batch->column(0), ExtensionArray(dict_ext_type, storage_array),
1055 /*verbose=*/true);
1056 }
1057
1058 TEST(TestJsonFileReadWrite, JsonExample4) {
1059 // Example 4: A map type with non-canonical field names
1060 ASSERT_OK_AND_ASSIGN(auto map_type,
1061 MapType::Make(field("some_entries",
1062 struct_({field("some_key", int16(), false),
1063 field("some_value", int32())}),
1064 false)));
1065 Schema ex_schema({field("maps", map_type)});
1066
1067 std::shared_ptr<RecordBatch> batch;
1068 ReadOneBatchJson(json_example4, ex_schema, &batch);
1069
1070 auto expected_array = ArrayFromJSON(
1071 map(int16(), int32()),
1072 R"([[[11, 111], [22, 222], [33, null]], null, [[44, 444], [55, 555]]])");
1073 AssertArraysEqual(*batch->column(0), *expected_array);
1074 }
1075
1076 TEST(TestJsonFileReadWrite, JsonExample5) {
1077 // Example 5: An empty struct
1078 auto struct_type = struct_(FieldVector{});
1079 Schema ex_schema({field("empty_struct", struct_type)});
1080
1081 std::shared_ptr<RecordBatch> batch;
1082 ReadOneBatchJson(json_example5, ex_schema, &batch);
1083
1084 auto expected_array = ArrayFromJSON(struct_type, "[{}, null, {}]");
1085 AssertArraysEqual(*batch->column(0), *expected_array);
1086 }
1087
1088 TEST(TestJsonFileReadWrite, JsonExample6) {
1089 // Example 6: An empty struct
1090 auto struct_type = struct_(FieldVector{});
1091 Schema ex_schema({field("empty_struct", struct_type)});
1092
1093 std::shared_ptr<RecordBatch> batch;
1094 ReadOneBatchJson(json_example6, ex_schema, &batch);
1095
1096 auto expected_array = ArrayFromJSON(struct_type, "[{}, null]");
1097 AssertArraysEqual(*batch->column(0), *expected_array);
1098 }
1099
1100 class TestJsonRoundTrip : public ::testing::TestWithParam<MakeRecordBatch*> {
1101 public:
1102 void SetUp() {}
1103 void TearDown() {}
1104 };
1105
1106 void CheckRoundtrip(const RecordBatch& batch) {
1107 ExtensionTypeGuard guard({uuid(), dict_extension_type(), complex128()});
1108
1109 TestSchemaRoundTrip(*batch.schema());
1110
1111 std::unique_ptr<IntegrationJsonWriter> writer;
1112 ASSERT_OK(IntegrationJsonWriter::Open(batch.schema(), &writer));
1113 ASSERT_OK(writer->WriteRecordBatch(batch));
1114
1115 std::string result;
1116 ASSERT_OK(writer->Finish(&result));
1117
1118 auto buffer = std::make_shared<Buffer>(result);
1119
1120 std::unique_ptr<IntegrationJsonReader> reader;
1121 ASSERT_OK(IntegrationJsonReader::Open(buffer, &reader));
1122
1123 std::shared_ptr<RecordBatch> result_batch;
1124 ASSERT_OK(reader->ReadRecordBatch(0, &result_batch));
1125
1126 // take care of float rounding error in the text representation
1127 ApproxCompareBatch(batch, *result_batch);
1128 }
1129
1130 TEST_P(TestJsonRoundTrip, RoundTrip) {
1131 std::shared_ptr<RecordBatch> batch;
1132 ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue
1133
1134 CheckRoundtrip(*batch);
1135 }
1136
1137 const std::vector<ipc::test::MakeRecordBatch*> kBatchCases = {
1138 &MakeIntRecordBatch,
1139 &MakeListRecordBatch,
1140 &MakeFixedSizeListRecordBatch,
1141 &MakeNonNullRecordBatch,
1142 &MakeZeroLengthRecordBatch,
1143 &MakeDeeplyNestedList,
1144 &MakeStringTypesRecordBatchWithNulls,
1145 &MakeStruct,
1146 &MakeUnion,
1147 &MakeDictionary,
1148 &MakeNestedDictionary,
1149 &MakeMap,
1150 &MakeMapOfDictionary,
1151 &MakeDates,
1152 &MakeTimestamps,
1153 &MakeTimes,
1154 &MakeFWBinary,
1155 &MakeNull,
1156 &MakeDecimal,
1157 &MakeBooleanBatch,
1158 &MakeFloatBatch,
1159 &MakeIntervals,
1160 &MakeUuid,
1161 &MakeComplex128,
1162 &MakeDictExtension};
1163
1164 INSTANTIATE_TEST_SUITE_P(TestJsonRoundTrip, TestJsonRoundTrip,
1165 ::testing::ValuesIn(kBatchCases));
1166
1167 } // namespace testing
1168 } // namespace arrow
1169
1170 int main(int argc, char** argv) {
1171 gflags::ParseCommandLineFlags(&argc, &argv, true);
1172
1173 int ret = 0;
1174
1175 if (FLAGS_integration) {
1176 arrow::Status result =
1177 arrow::testing::RunCommand(FLAGS_json, FLAGS_arrow, FLAGS_mode);
1178 if (!result.ok()) {
1179 std::cout << "Error message: " << result.ToString() << std::endl;
1180 ret = 1;
1181 }
1182 } else {
1183 ::testing::InitGoogleTest(&argc, argv);
1184 ret = RUN_ALL_TESTS();
1185 }
1186 gflags::ShutDownCommandLineFlags();
1187 return ret;
1188 }