// Source: ceph/src/arrow/cpp/src/arrow/csv/options.h
// (Apache Arrow C++ CSV options, bundled with Ceph "quincy" 17.2.0)
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #pragma once
19
20 #include <cstdint>
21 #include <memory>
22 #include <string>
23 #include <unordered_map>
24 #include <vector>
25
26 #include "arrow/csv/invalid_row.h"
27 #include "arrow/csv/type_fwd.h"
28 #include "arrow/io/interfaces.h"
29 #include "arrow/status.h"
30 #include "arrow/util/visibility.h"
31
32 namespace arrow {
33
34 class DataType;
35 class TimestampParser;
36
37 namespace csv {
38
// Workaround for https://github.com/michaeljones/breathe/issues/453:
// the Breathe/Doxygen documentation generator cannot render a '\\'
// character literal used directly as a default member initializer,
// so the default escape character is given a name here and referenced
// from ParseOptions::escape_char below.
constexpr char kDefaultEscapeChar = '\\';
41
/// \brief Options controlling the lexical parsing of CSV data.
///
/// These options govern how raw CSV text is split into rows and fields
/// (delimiters, quoting, escaping). Type conversion of the resulting
/// fields is controlled separately by ConvertOptions.
struct ARROW_EXPORT ParseOptions {
  // Parsing options

  /// Field delimiter
  char delimiter = ',';
  /// Whether quoting is used
  bool quoting = true;
  /// Quoting character (if `quoting` is true)
  char quote_char = '"';
  /// Whether a quote inside a quoted value is represented by doubling it
  /// (e.g. `""` inside a quoted field denotes a literal `"`)
  bool double_quote = true;
  /// Whether escaping is used
  bool escaping = false;
  /// Escaping character (if `escaping` is true)
  char escape_char = kDefaultEscapeChar;
  /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
  bool newlines_in_values = false;
  /// Whether empty lines are ignored. If false, an empty line represents
  /// a single empty value (assuming a one-column CSV file).
  bool ignore_empty_lines = true;
  /// A handler function for rows which do not have the correct number of columns
  InvalidRowHandler invalid_row_handler;

  /// Create parsing options with default values
  static ParseOptions Defaults();

  /// \brief Test that all set options are valid
  Status Validate() const;
};
71
/// \brief Options controlling the conversion of parsed CSV fields
/// into Arrow data types.
///
/// These options apply after lexical parsing (see ParseOptions): they
/// determine column selection, type inference overrides, null/boolean
/// spellings, dictionary encoding and timestamp parsing.
struct ARROW_EXPORT ConvertOptions {
  // Conversion options

  /// Whether to check UTF8 validity of string columns
  bool check_utf8 = true;
  /// Optional per-column types (disabling type inference on those columns)
  std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
  /// Recognized spellings for null values
  std::vector<std::string> null_values;
  /// Recognized spellings for boolean true values
  std::vector<std::string> true_values;
  /// Recognized spellings for boolean false values
  std::vector<std::string> false_values;

  /// Whether string / binary columns can have null values.
  ///
  /// If true, then strings in "null_values" are considered null for string columns.
  /// If false, then all strings are valid string values.
  bool strings_can_be_null = false;

  /// Whether quoted values can be null.
  ///
  /// If true, then strings in "null_values" are also considered null when they
  /// appear quoted in the CSV file. Otherwise, quoted values are never considered null.
  /// This option only has an effect together with `strings_can_be_null` for
  /// string columns (see above); NOTE(review): confirm against the conversion
  /// logic, which is not visible in this header.
  bool quoted_strings_can_be_null = true;

  /// Whether to try to automatically dict-encode string / binary data.
  /// If true, then when type inference detects a string or binary column,
  /// it is dict-encoded up to `auto_dict_max_cardinality` distinct values
  /// (per chunk), after which it switches to regular encoding.
  ///
  /// This setting is ignored for non-inferred columns (those in `column_types`).
  bool auto_dict_encode = false;
  /// Maximum number of distinct values per chunk before dict-encoding is
  /// abandoned (only meaningful if `auto_dict_encode` is true)
  int32_t auto_dict_max_cardinality = 50;

  /// Decimal point character for floating-point and decimal data
  char decimal_point = '.';

  // XXX Should we have a separate FilterOptions?

  /// If non-empty, indicates the names of columns from the CSV file that should
  /// be actually read and converted (in the vector's order).
  /// Columns not in this vector will be ignored.
  std::vector<std::string> include_columns;
  /// If false, columns in `include_columns` but not in the CSV file will error out.
  /// If true, columns in `include_columns` but not in the CSV file will produce
  /// a column of nulls (whose type is selected using `column_types`,
  /// or null by default)
  /// This option is ignored if `include_columns` is empty.
  bool include_missing_columns = false;

  /// User-defined timestamp parsers, using the virtual parser interface in
  /// arrow/util/value_parsing.h. More than one parser can be specified, and
  /// the CSV conversion logic will try parsing values starting from the
  /// beginning of this vector. If no parsers are specified, we use the default
  /// built-in ISO-8601 parser.
  std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;

  /// Create conversion options with default values, including conventional
  /// values for `null_values`, `true_values` and `false_values`
  static ConvertOptions Defaults();

  /// \brief Test that all set options are valid
  Status Validate() const;
};
137
/// \brief Options controlling how a CSV input stream is read.
///
/// These options govern I/O chunking, threading and header handling,
/// before parsing (ParseOptions) and conversion (ConvertOptions) occur.
struct ARROW_EXPORT ReadOptions {
  // Reader options

  /// Whether to use the global CPU thread pool
  bool use_threads = true;

  /// \brief Block size we request from the IO layer.
  ///
  /// This will determine multi-threading granularity as well as
  /// the size of individual record batches.
  /// Minimum valid value for block size is 1
  int32_t block_size = 1 << 20;  // 1 MB

  /// Number of header rows to skip (not including the row of column names, if any)
  int32_t skip_rows = 0;

  /// Number of rows to skip after the column names are read, if any
  int32_t skip_rows_after_names = 0;

  /// Column names for the target table.
  /// If empty, fall back on autogenerate_column_names.
  std::vector<std::string> column_names;

  /// Whether to autogenerate column names if `column_names` is empty.
  /// If true, column names will be of the form "f0", "f1"...
  /// If false, column names will be read from the first CSV row after `skip_rows`.
  bool autogenerate_column_names = false;

  /// Create read options with default values
  static ReadOptions Defaults();

  /// \brief Test that all set options are valid
  Status Validate() const;
};
172
/// \brief Options controlling how a table is written out as CSV.
struct ARROW_EXPORT WriteOptions {
  /// Whether to write an initial header line with column names
  bool include_header = true;

  /// \brief Maximum number of rows processed at a time
  ///
  /// The CSV writer converts and writes data in batches of N rows.
  /// This number can impact performance.
  int32_t batch_size = 1024;

  /// \brief IO context for writing.
  io::IOContext io_context;

  /// Create write options with default values
  static WriteOptions Defaults();

  /// \brief Test that all set options are valid
  Status Validate() const;
};
192
193 } // namespace csv
194 } // namespace arrow