]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | % Generated by roxygen2: do not edit by hand |
2 | % Please edit documentation in R/csv.R | |
3 | \name{read_delim_arrow} | |
4 | \alias{read_delim_arrow} | |
5 | \alias{read_csv_arrow} | |
6 | \alias{read_tsv_arrow} | |
7 | \title{Read a CSV or other delimited file with Arrow} | |
8 | \usage{ | |
9 | read_delim_arrow( | |
10 | file, | |
11 | delim = ",", | |
12 | quote = "\\"", | |
13 | escape_double = TRUE, | |
14 | escape_backslash = FALSE, | |
15 | schema = NULL, | |
16 | col_names = TRUE, | |
17 | col_types = NULL, | |
18 | col_select = NULL, | |
19 | na = c("", "NA"), | |
20 | quoted_na = TRUE, | |
21 | skip_empty_rows = TRUE, | |
22 | skip = 0L, | |
23 | parse_options = NULL, | |
24 | convert_options = NULL, | |
25 | read_options = NULL, | |
26 | as_data_frame = TRUE, | |
27 | timestamp_parsers = NULL | |
28 | ) | |
29 | ||
30 | read_csv_arrow( | |
31 | file, | |
32 | quote = "\\"", | |
33 | escape_double = TRUE, | |
34 | escape_backslash = FALSE, | |
35 | schema = NULL, | |
36 | col_names = TRUE, | |
37 | col_types = NULL, | |
38 | col_select = NULL, | |
39 | na = c("", "NA"), | |
40 | quoted_na = TRUE, | |
41 | skip_empty_rows = TRUE, | |
42 | skip = 0L, | |
43 | parse_options = NULL, | |
44 | convert_options = NULL, | |
45 | read_options = NULL, | |
46 | as_data_frame = TRUE, | |
47 | timestamp_parsers = NULL | |
48 | ) | |
49 | ||
50 | read_tsv_arrow( | |
51 | file, | |
52 | quote = "\\"", | |
53 | escape_double = TRUE, | |
54 | escape_backslash = FALSE, | |
55 | schema = NULL, | |
56 | col_names = TRUE, | |
57 | col_types = NULL, | |
58 | col_select = NULL, | |
59 | na = c("", "NA"), | |
60 | quoted_na = TRUE, | |
61 | skip_empty_rows = TRUE, | |
62 | skip = 0L, | |
63 | parse_options = NULL, | |
64 | convert_options = NULL, | |
65 | read_options = NULL, | |
66 | as_data_frame = TRUE, | |
67 | timestamp_parsers = NULL | |
68 | ) | |
69 | } | |
70 | \arguments{ | |
71 | \item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, | |
72 | or a \code{FileSystem} with path (\code{SubTreeFileSystem}). | |
73 | If a file name, a memory-mapped Arrow \link{InputStream} will be opened and | |
74 | closed when finished; compression will be detected from the file extension | |
75 | and handled automatically. If an input stream is provided, it will be left | |
76 | open.} | |
77 | ||
78 | \item{delim}{Single character used to separate fields within a record.} | |
79 | ||
80 | \item{quote}{Single character used to quote strings.} | |
81 | ||
82 | \item{escape_double}{Does the file escape quotes by doubling them? | |
83 | i.e. If this option is \code{TRUE}, the value \verb{""""} represents | |
84 | a single quote, \verb{\\"}.} | |
85 | ||
86 | \item{escape_backslash}{Does the file use backslashes to escape special | |
87 | characters? This is more general than \code{escape_double} as backslashes | |
88 | can be used to escape the delimiter character, the quote character, or | |
89 | to add special characters like \verb{\\\\n}.} | |
90 | ||
91 | \item{schema}{\link{Schema} that describes the table. If provided, it will be | |
92 | used to satisfy both \code{col_names} and \code{col_types}.} | |
93 | ||
94 | \item{col_names}{If \code{TRUE}, the first row of the input will be used as the | |
95 | column names and will not be included in the data frame. If \code{FALSE}, column | |
96 | names will be generated by Arrow, starting with "f0", "f1", ..., "fN". | |
97 | Alternatively, you can specify a character vector of column names.} | |
98 | ||
99 | \item{col_types}{A compact string representation of the column types, or | |
100 | \code{NULL} (the default) to infer types from the data.} | |
101 | ||
102 | \item{col_select}{A character vector of column names to keep, as in the | |
103 | "select" argument to \code{data.table::fread()}, or a | |
104 | \link[tidyselect:vars_select]{tidy selection specification} | |
105 | of columns, as used in \code{dplyr::select()}.} | |
106 | ||
107 | \item{na}{A character vector of strings to interpret as missing values.} | |
108 | ||
109 | \item{quoted_na}{Should missing values inside quotes be treated as missing | |
110 | values (the default) or strings. (Note that this is different from the | |
111 | Arrow C++ default for the corresponding convert option, | |
112 | \code{strings_can_be_null}.)} | |
113 | ||
114 | \item{skip_empty_rows}{Should blank rows be ignored altogether? If | |
115 | \code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be | |
116 | filled with missings.} | |
117 | ||
118 | \item{skip}{Number of lines to skip before reading data.} | |
119 | ||
120 | \item{parse_options}{see \link[=CsvReadOptions]{file reader options}. | |
121 | If given, this overrides any | |
122 | parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).} | |
123 | ||
124 | \item{convert_options}{see \link[=CsvReadOptions]{file reader options}} | |
125 | ||
126 | \item{read_options}{see \link[=CsvReadOptions]{file reader options}} | |
127 | ||
128 | \item{as_data_frame}{Should the function return a \code{data.frame} (default) or | |
129 | an Arrow \link{Table}?} | |
130 | ||
131 | \item{timestamp_parsers}{User-defined timestamp parsers. If more than one | |
132 | parser is specified, the CSV conversion logic will try parsing values | |
133 | starting from the beginning of this vector. Possible values are: | |
134 | \itemize{ | |
135 | \item \code{NULL}: the default, which uses the ISO-8601 parser | |
136 | \item a character vector of \link[base:strptime]{strptime} parse strings | |
137 | \item a list of \link{TimestampParser} objects | |
138 | }} | |
139 | } | |
140 | \value{ | |
141 | A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}. | |
142 | } | |
143 | \description{ | |
144 | These functions use the Arrow C++ CSV reader to read into a \code{data.frame}. | |
145 | Arrow C++ options have been mapped to argument names that follow those of | |
146 | \code{readr::read_delim()}, and \code{col_select} was inspired by \code{vroom::vroom()}. | |
147 | } | |
148 | \details{ | |
149 | \code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around | |
150 | \code{read_delim_arrow()} that specify a delimiter. | |
151 | ||
152 | Note that not all \code{readr} options are currently implemented here. Please file | |
153 | an issue if you encounter one that \code{arrow} should support. | |
154 | ||
155 | If you need to control Arrow-specific reader parameters that don't have an | |
156 | equivalent in \code{readr::read_csv()}, you can either provide them in the | |
157 | \code{parse_options}, \code{convert_options}, or \code{read_options} arguments, or you can | |
158 | use \link{CsvTableReader} directly for lower-level access. | |
159 | } | |
160 | \section{Specifying column types and names}{ | |
161 | ||
162 | ||
163 | By default, the CSV reader will infer the column names and data types from the file, but there | |
164 | are a few ways you can specify them directly. | |
165 | ||
166 | One way is to provide an Arrow \link{Schema} in the \code{schema} argument, | |
167 | which is an ordered map of column name to type. | |
168 | When provided, it satisfies both the \code{col_names} and \code{col_types} arguments. | |
169 | This is good if you know all of this information up front. | |
170 | ||
171 | You can also pass a \code{Schema} to the \code{col_types} argument. If you do this, | |
172 | column names will still be inferred from the file unless you also specify | |
173 | \code{col_names}. In either case, the column names in the \code{Schema} must match the | |
174 | data's column names, whether they are explicitly provided or inferred. That | |
175 | said, this \code{Schema} does not have to reference all columns: those omitted | |
176 | will have their types inferred. | |
177 | ||
178 | Alternatively, you can declare column types by providing the compact string representation | |
179 | that \code{readr} uses to the \code{col_types} argument. This means you provide a | |
180 | single string, one character per column, where the characters map to Arrow | |
181 | types analogously to the \code{readr} type mapping: | |
182 | \itemize{ | |
183 | \item "c": \code{utf8()} | |
184 | \item "i": \code{int32()} | |
185 | \item "n": \code{float64()} | |
186 | \item "d": \code{float64()} | |
187 | \item "l": \code{bool()} | |
188 | \item "f": \code{dictionary()} | |
189 | \item "D": \code{date32()} | |
190 | \item "T": \code{timestamp()} | |
191 | \item "t": \code{time32()} | |
192 | \item "_": \code{null()} | |
193 | \item "-": \code{null()} | |
194 | \item "?": infer the type from the data | |
195 | } | |
196 | ||
197 | If you use the compact string representation for \code{col_types}, you must also | |
198 | specify \code{col_names}. | |
199 | ||
200 | Regardless of how types are specified, all columns with a \code{null()} type will | |
201 | be dropped. | |
202 | ||
203 | Note that if you are specifying column names, whether by \code{schema} or | |
204 | \code{col_names}, and the CSV file has a header row that would otherwise be used | |
205 | to identify column names, you'll need to add \code{skip = 1} to skip that row. | |
206 | } | |
207 | ||
208 | \examples{ | |
209 | \dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} | |
210 | tf <- tempfile() | |
211 | on.exit(unlink(tf)) | |
212 | write.csv(mtcars, file = tf) | |
213 | df <- read_csv_arrow(tf) | |
214 | dim(df) | |
215 | # Can select columns | |
216 | df <- read_csv_arrow(tf, col_select = starts_with("d")) | |
217 | \dontshow{\}) # examplesIf} | |
218 | } |