1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
18 #include "gtest/gtest.h"
23 #include "arrow/buffer.h"
24 #include "arrow/csv/writer.h"
25 #include "arrow/io/memory.h"
26 #include "arrow/ipc/writer.h"
27 #include "arrow/record_batch.h"
28 #include "arrow/result_internal.h"
29 #include "arrow/testing/gtest_util.h"
30 #include "arrow/type.h"
31 #include "arrow/type_fwd.h"
36 struct WriterTestParams
{
37 std::shared_ptr
<Schema
> schema
;
38 std::string batch_data
;
40 std::string expected_output
;
43 // Avoid Valgrind failures with GTest trying to represent a WriterTestParams
44 void PrintTo(const WriterTestParams
& p
, std::ostream
* os
) {
45 *os
<< "WriterTestParams(" << reinterpret_cast<const void*>(&p
) << ")";
48 WriteOptions
DefaultTestOptions(bool include_header
) {
50 options
.batch_size
= 5;
51 options
.include_header
= include_header
;
55 std::vector
<WriterTestParams
> GenerateTestCases() {
56 auto abc_schema
= schema({
57 {field("a", uint64())},
58 {field("b\"", utf8())},
59 {field("c ", int32())},
60 {field("d", date32())},
61 {field("e", date64())},
63 auto populated_batch
= R
"([{"a
": 1, "c
": -1},
64 { "a
": 1, "b
\"": "abc
\"efg
", "c
": 2324},
65 { "b
\"": "abcd
", "c
": 5467},
67 { "a
": 546, "b
\"": "", "c
": 517 },
68 { "a
": 124, "b
\"": "a
\"\"b
\"" },
71 std::string expected_without_header
= std::string("1,,-1,,") + "\n" + // line 1
72 R
"(1,"abc
""efg
",2324,,)" + "\n" + // line 2
73 R
"(,"abcd
",5467,,)" + "\n" + // line 3
74 R
"(,,,,)" + "\n" + // line 4
75 R
"(546,"",517,,)" + "\n" + // line 5
76 R
"(124,"a
""""b
""",,,)" + "\n" + // line 6
77 R
"(,,,1970-01-01,)" + "\n" + // line 7
78 R
"(,,,,1970-01-02)" + "\n"; // line 8
79 std::string expected_header
= std::string(R
"("a
","b
""","c
","d
","e
")") + "\n";
81 return std::vector
<WriterTestParams
>{
82 {abc_schema
, "[]", DefaultTestOptions(/*header=*/false), ""},
83 {abc_schema
, "[]", DefaultTestOptions(/*header=*/true), expected_header
},
84 {abc_schema
, populated_batch
, DefaultTestOptions(/*header=*/false),
85 expected_without_header
},
86 {abc_schema
, populated_batch
, DefaultTestOptions(/*header=*/true),
87 expected_header
+ expected_without_header
}};
90 class TestWriteCSV
: public ::testing::TestWithParam
<WriterTestParams
> {
92 template <typename Data
>
93 Result
<std::string
> ToCsvString(const Data
& data
, const WriteOptions
& options
) {
94 std::shared_ptr
<io::BufferOutputStream
> out
;
95 ASSIGN_OR_RAISE(out
, io::BufferOutputStream::Create());
97 RETURN_NOT_OK(WriteCSV(data
, options
, out
.get()));
98 ASSIGN_OR_RAISE(std::shared_ptr
<Buffer
> buffer
, out
->Finish());
99 return std::string(reinterpret_cast<const char*>(buffer
->data()), buffer
->size());
102 Result
<std::string
> ToCsvStringUsingWriter(const Table
& data
,
103 const WriteOptions
& options
) {
104 std::shared_ptr
<io::BufferOutputStream
> out
;
105 ASSIGN_OR_RAISE(out
, io::BufferOutputStream::Create());
107 ASSIGN_OR_RAISE(auto writer
, MakeCSVWriter(out
, data
.schema(), options
));
108 TableBatchReader
reader(data
);
109 reader
.set_chunksize(1);
110 std::shared_ptr
<RecordBatch
> batch
;
111 RETURN_NOT_OK(reader
.ReadNext(&batch
));
112 while (batch
!= nullptr) {
113 RETURN_NOT_OK(writer
->WriteRecordBatch(*batch
));
114 RETURN_NOT_OK(reader
.ReadNext(&batch
));
116 RETURN_NOT_OK(writer
->Close());
117 EXPECT_EQ(data
.num_rows(), writer
->stats().num_record_batches
);
118 ASSIGN_OR_RAISE(std::shared_ptr
<Buffer
> buffer
, out
->Finish());
119 return std::string(reinterpret_cast<const char*>(buffer
->data()), buffer
->size());
123 TEST_P(TestWriteCSV
, TestWrite
) {
124 ASSERT_OK_AND_ASSIGN(std::shared_ptr
<io::BufferOutputStream
> out
,
125 io::BufferOutputStream::Create());
126 WriteOptions options
= GetParam().options
;
128 auto record_batch
= RecordBatchFromJSON(GetParam().schema
, GetParam().batch_data
);
129 ASSERT_OK_AND_ASSIGN(csv
, ToCsvString(*record_batch
, options
));
130 EXPECT_EQ(csv
, GetParam().expected_output
);
132 // Batch size shouldn't matter.
133 options
.batch_size
/= 2;
134 ASSERT_OK_AND_ASSIGN(csv
, ToCsvString(*record_batch
, options
));
135 EXPECT_EQ(csv
, GetParam().expected_output
);
137 // Table and Record batch should work identically.
138 ASSERT_OK_AND_ASSIGN(std::shared_ptr
<Table
> table
,
139 Table::FromRecordBatches({record_batch
}));
140 ASSERT_OK_AND_ASSIGN(csv
, ToCsvString(*table
, options
));
141 EXPECT_EQ(csv
, GetParam().expected_output
);
143 // The writer should work identically.
144 ASSERT_OK_AND_ASSIGN(csv
, ToCsvStringUsingWriter(*table
, options
));
145 EXPECT_EQ(csv
, GetParam().expected_output
);
148 INSTANTIATE_TEST_SUITE_P(MultiColumnWriteCSVTest
, TestWriteCSV
,
149 ::testing::ValuesIn(GenerateTestCases()));
151 INSTANTIATE_TEST_SUITE_P(SingleColumnWriteCSVTest
, TestWriteCSV
,
152 ::testing::Values(WriterTestParams
{
153 schema({field("int64", int64())}),
154 R
"([{ "int64
": 9999}, {}, { "int64
": -15}])", WriteOptions(),
156 "\n9999\n\n-15\n"}));