]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/arrow/cpp/src/arrow/csv/column_builder_test.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / csv / column_builder_test.cc
diff --git a/ceph/src/arrow/cpp/src/arrow/csv/column_builder_test.cc b/ceph/src/arrow/cpp/src/arrow/csv/column_builder_test.cc
new file mode 100644 (file)
index 0000000..7577c88
--- /dev/null
@@ -0,0 +1,608 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/csv/column_builder.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/csv/options.h"
+#include "arrow/csv/test_common.h"
+#include "arrow/memory_pool.h"
+#include "arrow/table.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/util.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/task_group.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+namespace csv {
+
+class BlockParser;
+
+using internal::checked_cast;
+using internal::GetCpuThreadPool;
+using internal::TaskGroup;
+
+using ChunkData = std::vector<std::vector<std::string>>;
+
+class ColumnBuilderTest : public ::testing::Test {
+ public:
+  void AssertBuilding(const std::shared_ptr<ColumnBuilder>& builder,
+                      const ChunkData& chunks, bool validate_full,
+                      std::shared_ptr<ChunkedArray>* out) {
+    for (const auto& chunk : chunks) {
+      std::shared_ptr<BlockParser> parser;
+      MakeColumnParser(chunk, &parser);
+      builder->Append(parser);
+    }
+    ASSERT_OK(builder->task_group()->Finish());
+    ASSERT_OK_AND_ASSIGN(*out, builder->Finish());
+    if (validate_full) {
+      ASSERT_OK((*out)->ValidateFull());
+    } else {
+      ASSERT_OK((*out)->Validate());
+    }
+  }
+
+  void AssertBuilding(const std::shared_ptr<ColumnBuilder>& builder,
+                      const ChunkData& chunks, std::shared_ptr<ChunkedArray>* out) {
+    AssertBuilding(builder, chunks, /*validate_full=*/true, out);
+  }
+
+  void CheckInferred(const std::shared_ptr<TaskGroup>& tg, const ChunkData& csv_data,
+                     const ConvertOptions& options,
+                     std::shared_ptr<ChunkedArray> expected, bool validate_full = true) {
+    std::shared_ptr<ColumnBuilder> builder;
+    std::shared_ptr<ChunkedArray> actual;
+    ASSERT_OK_AND_ASSIGN(builder,
+                         ColumnBuilder::Make(default_memory_pool(), 0, options, tg));
+    AssertBuilding(builder, csv_data, validate_full, &actual);
+    AssertChunkedEqual(*actual, *expected);
+  }
+
+  void CheckInferred(const std::shared_ptr<TaskGroup>& tg, const ChunkData& csv_data,
+                     const ConvertOptions& options,
+                     std::vector<std::shared_ptr<Array>> expected_chunks,
+                     bool validate_full = true) {
+    CheckInferred(tg, csv_data, options, std::make_shared<ChunkedArray>(expected_chunks),
+                  validate_full);
+  }
+
+  void CheckFixedType(const std::shared_ptr<TaskGroup>& tg,
+                      const std::shared_ptr<DataType>& type, const ChunkData& csv_data,
+                      const ConvertOptions& options,
+                      std::shared_ptr<ChunkedArray> expected) {
+    std::shared_ptr<ColumnBuilder> builder;
+    std::shared_ptr<ChunkedArray> actual;
+    ASSERT_OK_AND_ASSIGN(
+        builder, ColumnBuilder::Make(default_memory_pool(), type, 0, options, tg));
+    AssertBuilding(builder, csv_data, &actual);
+    AssertChunkedEqual(*actual, *expected);
+  }
+
+  void CheckFixedType(const std::shared_ptr<TaskGroup>& tg,
+                      const std::shared_ptr<DataType>& type, const ChunkData& csv_data,
+                      const ConvertOptions& options,
+                      std::vector<std::shared_ptr<Array>> expected_chunks) {
+    CheckFixedType(tg, type, csv_data, options,
+                   std::make_shared<ChunkedArray>(expected_chunks));
+  }
+
+ protected:
+  ConvertOptions default_options = ConvertOptions::Defaults();
+};
+
+//////////////////////////////////////////////////////////////////////////
+// Tests for null column builder
+
+class NullColumnBuilderTest : public ColumnBuilderTest {};
+
+TEST_F(NullColumnBuilderTest, Empty) {
+  std::shared_ptr<DataType> type = null();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ColumnBuilder> builder;
+  ASSERT_OK_AND_ASSIGN(builder, ColumnBuilder::MakeNull(default_memory_pool(), type, tg));
+
+  std::shared_ptr<ChunkedArray> actual;
+  AssertBuilding(builder, {}, &actual);
+
+  ChunkedArray expected({}, type);
+  AssertChunkedEqual(*actual, expected);
+}
+
+TEST_F(NullColumnBuilderTest, InsertNull) {
+  // Building a column of nulls with type null()
+  std::shared_ptr<DataType> type = null();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ColumnBuilder> builder;
+  ASSERT_OK_AND_ASSIGN(builder, ColumnBuilder::MakeNull(default_memory_pool(), type, tg));
+
+  std::shared_ptr<BlockParser> parser;
+  std::shared_ptr<ChunkedArray> actual, expected;
+  // Those values are indifferent, only the number of rows is used
+  MakeColumnParser({"456", "789"}, &parser);
+  builder->Insert(1, parser);
+  MakeColumnParser({"123"}, &parser);
+  builder->Insert(0, parser);
+  ASSERT_OK(builder->task_group()->Finish());
+  ASSERT_OK_AND_ASSIGN(actual, builder->Finish());
+  ASSERT_OK(actual->ValidateFull());
+
+  auto chunks =
+      ArrayVector{std::make_shared<NullArray>(1), std::make_shared<NullArray>(2)};
+  expected = std::make_shared<ChunkedArray>(chunks);
+  AssertChunkedEqual(*actual, *expected);
+}
+
+TEST_F(NullColumnBuilderTest, InsertTyped) {
+  // Building a column of nulls with another type
+  std::shared_ptr<DataType> type = int16();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ColumnBuilder> builder;
+  ASSERT_OK_AND_ASSIGN(builder, ColumnBuilder::MakeNull(default_memory_pool(), type, tg));
+
+  std::shared_ptr<BlockParser> parser;
+  std::shared_ptr<ChunkedArray> actual, expected;
+  // Those values are indifferent, only the number of rows is used
+  MakeColumnParser({"abc", "def", "ghi"}, &parser);
+  builder->Insert(1, parser);
+  MakeColumnParser({"jkl"}, &parser);
+  builder->Insert(0, parser);
+  ASSERT_OK(builder->task_group()->Finish());
+  ASSERT_OK_AND_ASSIGN(actual, builder->Finish());
+  ASSERT_OK(actual->ValidateFull());
+
+  auto chunks = ArrayVector{ArrayFromJSON(type, "[null]"),
+                            ArrayFromJSON(type, "[null, null, null]")};
+  expected = std::make_shared<ChunkedArray>(chunks);
+  AssertChunkedEqual(*actual, *expected);
+}
+
+TEST_F(NullColumnBuilderTest, EmptyChunks) {
+  std::shared_ptr<DataType> type = int16();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ColumnBuilder> builder;
+  ASSERT_OK_AND_ASSIGN(builder, ColumnBuilder::MakeNull(default_memory_pool(), type, tg));
+
+  std::shared_ptr<BlockParser> parser;
+  std::shared_ptr<ChunkedArray> actual, expected;
+  // Those values are indifferent, only the number of rows is used
+  MakeColumnParser({}, &parser);
+  builder->Insert(0, parser);
+  MakeColumnParser({"abc", "def"}, &parser);
+  builder->Insert(1, parser);
+  MakeColumnParser({}, &parser);
+  builder->Insert(2, parser);
+  ASSERT_OK(builder->task_group()->Finish());
+  ASSERT_OK_AND_ASSIGN(actual, builder->Finish());
+  ASSERT_OK(actual->ValidateFull());
+
+  auto chunks =
+      ArrayVector{ArrayFromJSON(type, "[]"), ArrayFromJSON(type, "[null, null]"),
+                  ArrayFromJSON(type, "[]")};
+  expected = std::make_shared<ChunkedArray>(chunks);
+  AssertChunkedEqual(*actual, *expected);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Tests for fixed-type column builder
+
+class TypedColumnBuilderTest : public ColumnBuilderTest {};
+
+TEST_F(TypedColumnBuilderTest, Empty) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+  std::shared_ptr<ColumnBuilder> builder;
+  ASSERT_OK_AND_ASSIGN(
+      builder, ColumnBuilder::Make(default_memory_pool(), int32(), 0, options, tg));
+
+  std::shared_ptr<ChunkedArray> actual;
+  AssertBuilding(builder, {}, &actual);
+
+  ChunkedArray expected({}, int32());
+  AssertChunkedEqual(*actual, expected);
+}
+
+TEST_F(TypedColumnBuilderTest, Basics) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckFixedType(tg, int32(), {{"123", "-456"}}, options,
+                 {ArrayFromJSON(int32(), "[123, -456]")});
+}
+
+TEST_F(TypedColumnBuilderTest, Insert) {
+  // Test ColumnBuilder::Insert()
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+  std::shared_ptr<ColumnBuilder> builder;
+  ASSERT_OK_AND_ASSIGN(
+      builder, ColumnBuilder::Make(default_memory_pool(), int32(), 0, options, tg));
+
+  std::shared_ptr<BlockParser> parser;
+  std::shared_ptr<ChunkedArray> actual, expected;
+  MakeColumnParser({"456"}, &parser);
+  builder->Insert(1, parser);
+  MakeColumnParser({"123"}, &parser);
+  builder->Insert(0, parser);
+  ASSERT_OK(builder->task_group()->Finish());
+  ASSERT_OK_AND_ASSIGN(actual, builder->Finish());
+  ASSERT_OK(actual->ValidateFull());
+
+  ChunkedArrayFromVector<Int32Type>({{123}, {456}}, &expected);
+  AssertChunkedEqual(*actual, *expected);
+}
+
+TEST_F(TypedColumnBuilderTest, MultipleChunks) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckFixedType(tg, int16(), {{"1", "2", "3"}, {"4", "5"}}, options,
+                 {ArrayFromJSON(int16(), "[1, 2, 3]"), ArrayFromJSON(int16(), "[4, 5]")});
+}
+
+TEST_F(TypedColumnBuilderTest, MultipleChunksParallel) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool());
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<Int32Type>({{1, 2}, {3}, {4, 5}, {6, 7}}, &expected);
+  CheckFixedType(tg, int32(), {{"1", "2"}, {"3"}, {"4", "5"}, {"6", "7"}}, options,
+                 expected);
+}
+
+TEST_F(TypedColumnBuilderTest, EmptyChunks) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckFixedType(tg, int16(), {{}, {"1", "2"}, {}}, options,
+                 {ArrayFromJSON(int16(), "[]"), ArrayFromJSON(int16(), "[1, 2]"),
+                  ArrayFromJSON(int16(), "[]")});
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Tests for type-inferring column builder
+
+class InferringColumnBuilderTest : public ColumnBuilderTest {
+ public:
+  void CheckAutoDictEncoded(const std::shared_ptr<TaskGroup>& tg,
+                            const ChunkData& csv_data, const ConvertOptions& options,
+                            std::vector<std::shared_ptr<Array>> expected_indices,
+                            std::vector<std::shared_ptr<Array>> expected_dictionaries,
+                            bool validate_full = true) {
+    std::shared_ptr<ColumnBuilder> builder;
+    std::shared_ptr<ChunkedArray> actual;
+    ASSERT_OK_AND_ASSIGN(builder,
+                         ColumnBuilder::Make(default_memory_pool(), 0, options, tg));
+    AssertBuilding(builder, csv_data, validate_full, &actual);
+    ASSERT_EQ(actual->num_chunks(), static_cast<int>(csv_data.size()));
+    for (int i = 0; i < actual->num_chunks(); ++i) {
+      ASSERT_EQ(actual->chunk(i)->type_id(), Type::DICTIONARY);
+      const auto& dict_array = checked_cast<const DictionaryArray&>(*actual->chunk(i));
+      AssertArraysEqual(*dict_array.dictionary(), *expected_dictionaries[i]);
+      AssertArraysEqual(*dict_array.indices(), *expected_indices[i]);
+    }
+  }
+};
+
+TEST_F(InferringColumnBuilderTest, Empty) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {}, options, std::make_shared<ChunkedArray>(ArrayVector(), null()));
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkNull) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"", "NA"}}, options, {std::make_shared<NullArray>(2)});
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkNull) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"", "NA"}, {""}, {"NaN"}}, options,
+                {std::make_shared<NullArray>(2), std::make_shared<NullArray>(1),
+                 std::make_shared<NullArray>(1)});
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkInteger) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"", "123", "456"}}, options,
+                {ArrayFromJSON(int64(), "[null, 123, 456]")});
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkInteger) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(
+      tg, {{""}, {"NA", "123", "456"}}, options,
+      {ArrayFromJSON(int64(), "[null]"), ArrayFromJSON(int64(), "[null, 123, 456]")});
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkBoolean) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"", "0", "FALSE", "TRUE"}}, options,
+                {ArrayFromJSON(boolean(), "[null, false, false, true]")});
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkBoolean) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{""}, {"1", "True", "0"}}, options,
+                {ArrayFromJSON(boolean(), "[null]"),
+                 ArrayFromJSON(boolean(), "[true, true, false]")});
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkReal) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"", "0.0", "12.5"}}, options,
+                {ArrayFromJSON(float64(), "[null, 0.0, 12.5]")});
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkReal) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{""}, {"008"}, {"NaN", "12.5"}}, options,
+                {ArrayFromJSON(float64(), "[null]"), ArrayFromJSON(float64(), "[8.0]"),
+                 ArrayFromJSON(float64(), "[null, 12.5]")});
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkDate) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"", "1970-01-04", "NA"}}, options,
+                {ArrayFromJSON(date32(), "[null, 3, null]")});
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkDate) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{""}, {"1970-01-04"}, {"NA"}}, options,
+                {ArrayFromJSON(date32(), "[null]"), ArrayFromJSON(date32(), "[3]"),
+                 ArrayFromJSON(date32(), "[null]")});
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkTime) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"", "01:23:45", "NA"}}, options,
+                {ArrayFromJSON(time32(TimeUnit::SECOND), "[null, 5025, null]")});
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkTime) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+  auto type = time32(TimeUnit::SECOND);
+
+  CheckInferred(tg, {{""}, {"01:23:45"}, {"NA"}}, options,
+                {ArrayFromJSON(type, "[null]"), ArrayFromJSON(type, "[5025]"),
+                 ArrayFromJSON(type, "[null]")});
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkTimestamp) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND),
+                                        {{false, true, true}}, {{0, 0, 1542129070}},
+                                        &expected);
+  CheckInferred(tg, {{"", "1970-01-01", "2018-11-13 17:11:10"}}, options, expected);
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkTimestamp) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND),
+                                        {{false}, {true}, {true}},
+                                        {{0}, {0}, {1542129070}}, &expected);
+  CheckInferred(tg, {{""}, {"1970-01-01"}, {"2018-11-13 17:11:10"}}, options, expected);
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkTimestampNS) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<TimestampType>(
+      timestamp(TimeUnit::NANO), {{false, true, true, true, true}},
+      {{0, 0, 1542129070123000000, 1542129070123456000, 1542129070123456789}}, &expected);
+  CheckInferred(tg,
+                {{"", "1970-01-01", "2018-11-13 17:11:10.123",
+                  "2018-11-13 17:11:10.123456", "2018-11-13 17:11:10.123456789"}},
+                options, expected);
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampNS) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<TimestampType>(
+      timestamp(TimeUnit::NANO), {{false}, {true}, {true, true, true}},
+      {{0}, {0}, {1542129070123000000, 1542129070123456000, 1542129070123456789}},
+      &expected);
+  CheckInferred(tg,
+                {{""},
+                 {"1970-01-01"},
+                 {"2018-11-13 17:11:10.123", "2018-11-13 17:11:10.123456",
+                  "2018-11-13 17:11:10.123456789"}},
+                options, expected);
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkIntegerAndTime) {
+  // Fallback to utf-8
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"", "99", "01:23:45", "NA"}}, options,
+                {ArrayFromJSON(utf8(), R"(["", "99", "01:23:45", "NA"])")});
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkIntegerAndTime) {
+  // Fallback to utf-8
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+  auto type = utf8();
+
+  CheckInferred(tg, {{""}, {"99"}, {"01:23:45", "NA"}}, options,
+                {ArrayFromJSON(type, R"([""])"), ArrayFromJSON(type, R"(["99"])"),
+                 ArrayFromJSON(type, R"(["01:23:45", "NA"])")});
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkDateAndTime) {
+  // Fallback to utf-8
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"", "01:23:45", "1998-04-05"}}, options,
+                {ArrayFromJSON(utf8(), R"(["", "01:23:45", "1998-04-05"])")});
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkDateAndTime) {
+  // Fallback to utf-8
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+  auto type = utf8();
+
+  CheckInferred(tg, {{""}, {"01:23:45"}, {"1998-04-05"}}, options,
+                {ArrayFromJSON(type, R"([""])"), ArrayFromJSON(type, R"(["01:23:45"])"),
+                 ArrayFromJSON(type, R"(["1998-04-05"])")});
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkString) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+  std::shared_ptr<ChunkedArray> expected;
+
+  // With valid UTF8
+  CheckInferred(tg, {{"", "foo", "baré"}}, options,
+                {ArrayFromJSON(utf8(), R"(["", "foo", "baré"])")});
+
+  // With invalid UTF8, non-checking
+  options.check_utf8 = false;
+  tg = TaskGroup::MakeSerial();
+  ChunkedArrayFromVector<StringType, std::string>({{true, true, true}},
+                                                  {{"", "foo\xff", "baré"}}, &expected);
+  CheckInferred(tg, {{"", "foo\xff", "baré"}}, options, expected,
+                /*validate_full=*/false);
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkBinary) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+  std::shared_ptr<ChunkedArray> expected;
+
+  // With invalid UTF8, checking
+  tg = TaskGroup::MakeSerial();
+  ChunkedArrayFromVector<BinaryType, std::string>({{true, true, true}},
+                                                  {{"", "foo\xff", "baré"}}, &expected);
+  CheckInferred(tg, {{"", "foo\xff", "baré"}}, options, expected);
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkString) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<StringType, std::string>(
+      {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "baré"}}, &expected);
+
+  CheckInferred(tg, {{""}, {"008"}, {"NaN", "baré"}}, options, expected);
+}
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkBinary) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeSerial();
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<BinaryType, std::string>(
+      {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "baré\xff"}}, &expected);
+
+  CheckInferred(tg, {{""}, {"008"}, {"NaN", "baré\xff"}}, options, expected);
+}
+
+// Parallel parsing is tested more comprehensively on the Python side
+// (see python/pyarrow/tests/test_csv.py)
+
+TEST_F(InferringColumnBuilderTest, MultipleChunkIntegerParallel) {
+  auto options = ConvertOptions::Defaults();
+  auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool());
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<Int64Type>({{1, 2}, {3}, {4, 5}, {6, 7}}, &expected);
+  CheckInferred(tg, {{"1", "2"}, {"3"}, {"4", "5"}, {"6", "7"}}, options, expected);
+}
+
+TEST_F(InferringColumnBuilderTest, SingleChunkBinaryAutoDict) {
+  auto options = ConvertOptions::Defaults();
+  options.auto_dict_encode = true;
+  options.auto_dict_max_cardinality = 3;
+
+  // With valid UTF8
+  auto expected_indices = ArrayFromJSON(int32(), "[0, 1, 0]");
+  auto expected_dictionary = ArrayFromJSON(utf8(), R"(["abé", "cd"])");
+  ChunkData csv_data = {{"abé", "cd", "abé"}};
+
+  CheckAutoDictEncoded(TaskGroup::MakeSerial(), csv_data, options, {expected_indices},
+                       {expected_dictionary});
+
+  // With invalid UTF8, non-checking
+  csv_data = {{"ab", "cd\xff", "ab"}};
+  options.check_utf8 = false;
+  ArrayFromVector<StringType, std::string>({"ab", "cd\xff"}, &expected_dictionary);
+
+  CheckAutoDictEncoded(TaskGroup::MakeSerial(), csv_data, options, {expected_indices},
+                       {expected_dictionary}, /*validate_full=*/false);
+
+  // With invalid UTF8, checking
+  options.check_utf8 = true;
+  ArrayFromVector<BinaryType, std::string>({"ab", "cd\xff"}, &expected_dictionary);
+
+  CheckAutoDictEncoded(TaskGroup::MakeSerial(), csv_data, options, {expected_indices},
+                       {expected_dictionary});
+}
+
+}  // namespace csv
+}  // namespace arrow