// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "arrow/csv/invalid_row.h"
#include "arrow/csv/type_fwd.h"
#include "arrow/io/interfaces.h"
#include "arrow/status.h"
#include "arrow/util/visibility.h"
// Forward declaration: this file only names TimestampParser through
// std::shared_ptr (see ConvertOptions::timestamp_parsers), so the full
// definition is not needed here.
class TimestampParser;
// Silly workaround for https://github.com/michaeljones/breathe/issues/453
// The backslash is kept in a named constant, which ParseOptions::escape_char
// uses as its default value.
constexpr char kDefaultEscapeChar = '\\';
42 struct ARROW_EXPORT ParseOptions
{
47 /// Whether quoting is used
49 /// Quoting character (if `quoting` is true)
50 char quote_char
= '"';
51 /// Whether a quote inside a value is double-quoted
52 bool double_quote
= true;
53 /// Whether escaping is used
54 bool escaping
= false;
55 /// Escaping character (if `escaping` is true)
56 char escape_char
= kDefaultEscapeChar
;
57 /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
58 bool newlines_in_values
= false;
59 /// Whether empty lines are ignored. If false, an empty line represents
60 /// a single empty value (assuming a one-column CSV file).
61 bool ignore_empty_lines
= true;
62 /// A handler function for rows which do not have the correct number of columns
63 InvalidRowHandler invalid_row_handler
;
65 /// Create parsing options with default values
66 static ParseOptions
Defaults();
68 /// \brief Test that all set options are valid
69 Status
Validate() const;
72 struct ARROW_EXPORT ConvertOptions
{
75 /// Whether to check UTF8 validity of string columns
76 bool check_utf8
= true;
77 /// Optional per-column types (disabling type inference on those columns)
78 std::unordered_map
<std::string
, std::shared_ptr
<DataType
>> column_types
;
79 /// Recognized spellings for null values
80 std::vector
<std::string
> null_values
;
81 /// Recognized spellings for boolean true values
82 std::vector
<std::string
> true_values
;
83 /// Recognized spellings for boolean false values
84 std::vector
<std::string
> false_values
;
86 /// Whether string / binary columns can have null values.
88 /// If true, then strings in "null_values" are considered null for string columns.
89 /// If false, then all strings are valid string values.
90 bool strings_can_be_null
= false;
92 /// Whether quoted values can be null.
94 /// If true, then strings in "null_values" are also considered null when they
95 /// appear quoted in the CSV file. Otherwise, quoted values are never considered null.
96 bool quoted_strings_can_be_null
= true;
98 /// Whether to try to automatically dict-encode string / binary data.
99 /// If true, then when type inference detects a string or binary column,
100 /// it is dict-encoded up to `auto_dict_max_cardinality` distinct values
101 /// (per chunk), after which it switches to regular encoding.
103 /// This setting is ignored for non-inferred columns (those in `column_types`).
104 bool auto_dict_encode
= false;
105 int32_t auto_dict_max_cardinality
= 50;
107 /// Decimal point character for floating-point and decimal data
108 char decimal_point
= '.';
110 // XXX Should we have a separate FilterOptions?
112 /// If non-empty, indicates the names of columns from the CSV file that should
113 /// be actually read and converted (in the vector's order).
114 /// Columns not in this vector will be ignored.
115 std::vector
<std::string
> include_columns
;
116 /// If false, columns in `include_columns` but not in the CSV file will error out.
117 /// If true, columns in `include_columns` but not in the CSV file will produce
118 /// a column of nulls (whose type is selected using `column_types`,
119 /// or null by default)
120 /// This option is ignored if `include_columns` is empty.
121 bool include_missing_columns
= false;
123 /// User-defined timestamp parsers, using the virtual parser interface in
124 /// arrow/util/value_parsing.h. More than one parser can be specified, and
125 /// the CSV conversion logic will try parsing values starting from the
126 /// beginning of this vector. If no parsers are specified, we use the default
127 /// built-in ISO-8601 parser.
128 std::vector
<std::shared_ptr
<TimestampParser
>> timestamp_parsers
;
130 /// Create conversion options with default values, including conventional
131 /// values for `null_values`, `true_values` and `false_values`
132 static ConvertOptions
Defaults();
134 /// \brief Test that all set options are valid
135 Status
Validate() const;
138 struct ARROW_EXPORT ReadOptions
{
141 /// Whether to use the global CPU thread pool
142 bool use_threads
= true;
144 /// \brief Block size we request from the IO layer.
146 /// This will determine multi-threading granularity as well as
147 /// the size of individual record batches.
148 /// Minimum valid value for block size is 1
149 int32_t block_size
= 1 << 20; // 1 MB
151 /// Number of header rows to skip (not including the row of column names, if any)
152 int32_t skip_rows
= 0;
154 /// Number of rows to skip after the column names are read, if any
155 int32_t skip_rows_after_names
= 0;
157 /// Column names for the target table.
158 /// If empty, fall back on autogenerate_column_names.
159 std::vector
<std::string
> column_names
;
161 /// Whether to autogenerate column names if `column_names` is empty.
162 /// If true, column names will be of the form "f0", "f1"...
163 /// If false, column names will be read from the first CSV row after `skip_rows`.
164 bool autogenerate_column_names
= false;
166 /// Create read options with default values
167 static ReadOptions
Defaults();
169 /// \brief Test that all set options are valid
170 Status
Validate() const;
173 struct ARROW_EXPORT WriteOptions
{
174 /// Whether to write an initial header line with column names
175 bool include_header
= true;
177 /// \brief Maximum number of rows processed at a time
179 /// The CSV writer converts and writes data in batches of N rows.
180 /// This number can impact performance.
181 int32_t batch_size
= 1024;
183 /// \brief IO context for writing.
184 io::IOContext io_context
;
186 /// Create write options with default values
187 static WriteOptions
Defaults();
189 /// \brief Test that all set options are valid
190 Status
Validate() const;