// Source: ceph/src/arrow/cpp/src/arrow/csv/options.h
// (Apache Arrow C++ CSV options, bundled with Ceph "quincy" 17.2.0)
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #pragma once
19
20 #include <cstdint>
21 #include <memory>
22 #include <string>
23 #include <unordered_map>
24 #include <vector>
25
26 #include "arrow/csv/invalid_row.h"
27 #include "arrow/csv/type_fwd.h"
28 #include "arrow/io/interfaces.h"
29 #include "arrow/status.h"
30 #include "arrow/util/visibility.h"
31
32 namespace arrow {
33
34 class DataType;
35 class TimestampParser;
36
37 namespace csv {
38
// Workaround for https://github.com/michaeljones/breathe/issues/453:
// the Breathe/Doxygen documentation generator cannot render a '\\'
// character literal used directly as a default member initializer,
// so the default escape character is given a name here and referenced
// from ParseOptions::escape_char below.
constexpr char kDefaultEscapeChar = '\\';
41
/// \brief Options controlling the lexical parsing of CSV data.
///
/// These options govern how raw CSV text is split into rows and fields
/// (delimiters, quoting, escaping). Type conversion of the resulting
/// fields is controlled separately by ConvertOptions.
struct ARROW_EXPORT ParseOptions {
  // Parsing options

  /// Field delimiter
  char delimiter = ',';
  /// Whether quoting is used
  bool quoting = true;
  /// Quoting character (if `quoting` is true)
  char quote_char = '"';
  /// Whether a quote inside a quoted value is represented by doubling it
  /// (e.g. `""` inside a quoted field denotes a literal `"`)
  bool double_quote = true;
  /// Whether escaping is used
  bool escaping = false;
  /// Escaping character (if `escaping` is true)
  char escape_char = kDefaultEscapeChar;
  /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
  bool newlines_in_values = false;
  /// Whether empty lines are ignored. If false, an empty line represents
  /// a single empty value (assuming a one-column CSV file).
  bool ignore_empty_lines = true;
  /// A handler function for rows which do not have the correct number of columns
  InvalidRowHandler invalid_row_handler;

  /// Create parsing options with default values
  static ParseOptions Defaults();

  /// \brief Test that all set options are valid
  Status Validate() const;
};
71
/// \brief Options controlling the conversion of parsed CSV fields
/// into Arrow data types.
///
/// These options apply after lexical parsing (see ParseOptions): they
/// determine column selection, type inference overrides, null/boolean
/// spellings, dictionary encoding and timestamp parsing.
struct ARROW_EXPORT ConvertOptions {
  // Conversion options

  /// Whether to check UTF8 validity of string columns
  bool check_utf8 = true;
  /// Optional per-column types (disabling type inference on those columns)
  std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
  /// Recognized spellings for null values
  std::vector<std::string> null_values;
  /// Recognized spellings for boolean true values
  std::vector<std::string> true_values;
  /// Recognized spellings for boolean false values
  std::vector<std::string> false_values;

  /// Whether string / binary columns can have null values.
  ///
  /// If true, then strings in "null_values" are considered null for string columns.
  /// If false, then all strings are valid string values.
  bool strings_can_be_null = false;

  /// Whether quoted values can be null.
  ///
  /// If true, then strings in "null_values" are also considered null when they
  /// appear quoted in the CSV file. Otherwise, quoted values are never considered null.
  /// This option only has an effect together with `strings_can_be_null` for
  /// string columns (see above); NOTE(review): confirm against the conversion
  /// logic, which is not visible in this header.
  bool quoted_strings_can_be_null = true;

  /// Whether to try to automatically dict-encode string / binary data.
  /// If true, then when type inference detects a string or binary column,
  /// it is dict-encoded up to `auto_dict_max_cardinality` distinct values
  /// (per chunk), after which it switches to regular encoding.
  ///
  /// This setting is ignored for non-inferred columns (those in `column_types`).
  bool auto_dict_encode = false;
  /// Maximum number of distinct values per chunk before dict-encoding is
  /// abandoned (only meaningful if `auto_dict_encode` is true)
  int32_t auto_dict_max_cardinality = 50;

  /// Decimal point character for floating-point and decimal data
  char decimal_point = '.';

  // XXX Should we have a separate FilterOptions?

  /// If non-empty, indicates the names of columns from the CSV file that should
  /// be actually read and converted (in the vector's order).
  /// Columns not in this vector will be ignored.
  std::vector<std::string> include_columns;
  /// If false, columns in `include_columns` but not in the CSV file will error out.
  /// If true, columns in `include_columns` but not in the CSV file will produce
  /// a column of nulls (whose type is selected using `column_types`,
  /// or null by default)
  /// This option is ignored if `include_columns` is empty.
  bool include_missing_columns = false;

  /// User-defined timestamp parsers, using the virtual parser interface in
  /// arrow/util/value_parsing.h. More than one parser can be specified, and
  /// the CSV conversion logic will try parsing values starting from the
  /// beginning of this vector. If no parsers are specified, we use the default
  /// built-in ISO-8601 parser.
  std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;

  /// Create conversion options with default values, including conventional
  /// values for `null_values`, `true_values` and `false_values`
  static ConvertOptions Defaults();

  /// \brief Test that all set options are valid
  Status Validate() const;
};
137
/// \brief Options controlling how a CSV input stream is read.
///
/// These options govern I/O chunking, threading and header handling,
/// before parsing (ParseOptions) and conversion (ConvertOptions) occur.
struct ARROW_EXPORT ReadOptions {
  // Reader options

  /// Whether to use the global CPU thread pool
  bool use_threads = true;

  /// \brief Block size we request from the IO layer.
  ///
  /// This will determine multi-threading granularity as well as
  /// the size of individual record batches.
  /// Minimum valid value for block size is 1
  int32_t block_size = 1 << 20;  // 1 MB

  /// Number of header rows to skip (not including the row of column names, if any)
  int32_t skip_rows = 0;

  /// Number of rows to skip after the column names are read, if any
  int32_t skip_rows_after_names = 0;

  /// Column names for the target table.
  /// If empty, fall back on autogenerate_column_names.
  std::vector<std::string> column_names;

  /// Whether to autogenerate column names if `column_names` is empty.
  /// If true, column names will be of the form "f0", "f1"...
  /// If false, column names will be read from the first CSV row after `skip_rows`.
  bool autogenerate_column_names = false;

  /// Create read options with default values
  static ReadOptions Defaults();

  /// \brief Test that all set options are valid
  Status Validate() const;
};
172
/// \brief Options controlling how a table is written out as CSV.
struct ARROW_EXPORT WriteOptions {
  /// Whether to write an initial header line with column names
  bool include_header = true;

  /// \brief Maximum number of rows processed at a time
  ///
  /// The CSV writer converts and writes data in batches of N rows.
  /// This number can impact performance.
  int32_t batch_size = 1024;

  /// \brief IO context for writing.
  io::IOContext io_context;

  /// Create write options with default values
  static WriteOptions Defaults();

  /// \brief Test that all set options are valid
  Status Validate() const;
};
192
193 } // namespace csv
194 } // namespace arrow