% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/csv.R
\name{read_delim_arrow}
\alias{read_delim_arrow}
\alias{read_csv_arrow}
\alias{read_tsv_arrow}
\title{Read a CSV or other delimited file with Arrow}
\usage{
read_delim_arrow(
  file,
  delim = ",",
  quote = "\\"",
  escape_double = TRUE,
  escape_backslash = FALSE,
  schema = NULL,
  col_names = TRUE,
  col_types = NULL,
  col_select = NULL,
  na = c("", "NA"),
  quoted_na = TRUE,
  skip_empty_rows = TRUE,
  skip = 0L,
  parse_options = NULL,
  convert_options = NULL,
  read_options = NULL,
  as_data_frame = TRUE,
  timestamp_parsers = NULL
)

read_csv_arrow(
  file,
  quote = "\\"",
  escape_double = TRUE,
  escape_backslash = FALSE,
  schema = NULL,
  col_names = TRUE,
  col_types = NULL,
  col_select = NULL,
  na = c("", "NA"),
  quoted_na = TRUE,
  skip_empty_rows = TRUE,
  skip = 0L,
  parse_options = NULL,
  convert_options = NULL,
  read_options = NULL,
  as_data_frame = TRUE,
  timestamp_parsers = NULL
)

read_tsv_arrow(
  file,
  quote = "\\"",
  escape_double = TRUE,
  escape_backslash = FALSE,
  schema = NULL,
  col_names = TRUE,
  col_types = NULL,
  col_select = NULL,
  na = c("", "NA"),
  quoted_na = TRUE,
  skip_empty_rows = TRUE,
  skip = 0L,
  parse_options = NULL,
  convert_options = NULL,
  read_options = NULL,
  as_data_frame = TRUE,
  timestamp_parsers = NULL
)
}
\arguments{
\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream,
or a \code{FileSystem} with path (\code{SubTreeFileSystem}).
If a file name, a memory-mapped Arrow \link{InputStream} will be opened and
closed when finished; compression will be detected from the file extension
and handled automatically. If an input stream is provided, it will be left
open.}

\item{delim}{Single character used to separate fields within a record.}

\item{quote}{Single character used to quote strings.}

\item{escape_double}{Does the file escape quotes by doubling them?
i.e. If this option is \code{TRUE}, the value \verb{""""} represents
a single quote, \verb{\\"}.}

\item{escape_backslash}{Does the file use backslashes to escape special
characters? This is more general than \code{escape_double} as backslashes
can be used to escape the delimiter character, the quote character, or
to add special characters like \verb{\\\\n}.}

\item{schema}{\link{Schema} that describes the table. If provided, it will be
used to satisfy both \code{col_names} and \code{col_types}.}

\item{col_names}{If \code{TRUE}, the first row of the input will be used as the
column names and will not be included in the data frame. If \code{FALSE}, column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.}

\item{col_types}{A compact string representation of the column types, or
\code{NULL} (the default) to infer types from the data.}

\item{col_select}{A character vector of column names to keep, as in the
"select" argument to \code{data.table::fread()}, or a
\link[tidyselect:vars_select]{tidy selection specification}
of columns, as used in \code{dplyr::select()}.}

\item{na}{A character vector of strings to interpret as missing values.}

\item{quoted_na}{Should missing values inside quotes be treated as missing
values (the default) or strings? (Note that this is different from the
Arrow C++ default for the corresponding convert option,
\code{strings_can_be_null}.)}

\item{skip_empty_rows}{Should blank rows be ignored altogether? If
\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be
filled with missing values.}

\item{skip}{Number of lines to skip before reading data.}

\item{parse_options}{see \link[=CsvReadOptions]{file reader options}.
If given, this overrides any
parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).}

\item{convert_options}{see \link[=CsvReadOptions]{file reader options}}

\item{read_options}{see \link[=CsvReadOptions]{file reader options}}

\item{as_data_frame}{Should the function return a \code{data.frame} (default) or
an Arrow \link{Table}?}

\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
parser is specified, the CSV conversion logic will try parsing values
starting from the beginning of this vector. Possible values are:
\itemize{
\item \code{NULL}: the default, which uses the ISO-8601 parser
\item a character vector of \link[base:strptime]{strptime} parse strings
\item a list of \link{TimestampParser} objects
}}
}
\value{
A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}.
}
\description{
These functions use the Arrow C++ CSV reader to read into a \code{data.frame}.
Arrow C++ options have been mapped to argument names that follow those of
\code{readr::read_delim()}, and \code{col_select} was inspired by \code{vroom::vroom()}.
}
\details{
\code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around
\code{read_delim_arrow()} that specify a delimiter.

Note that not all \code{readr} options are currently implemented here. Please file
an issue if you encounter one that \code{arrow} should support.

If you need to control Arrow-specific reader parameters that don't have an
equivalent in \code{readr::read_csv()}, you can either provide them in the
\code{parse_options}, \code{convert_options}, or \code{read_options} arguments, or you can
use \link{CsvTableReader} directly for lower-level access.
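
For example, a minimal sketch of passing extra convert options (the input
\code{tf} is a hypothetical file path, and the \code{true_values}/\code{false_values}
arguments are assumed to be accepted by \code{CsvConvertOptions$create()}):

\preformatted{# Sketch: recognize "yes"/"no" as logical values via Arrow convert options
df <- read_csv_arrow(
  tf,
  convert_options = CsvConvertOptions$create(
    true_values = "yes",
    false_values = "no"
  )
)
}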
}
\section{Specifying column types and names}{


By default, the CSV reader will infer the column names and data types from the file, but there
are a few ways you can specify them directly.

One way is to provide an Arrow \link{Schema} in the \code{schema} argument,
which is an ordered map of column name to type.
When provided, it satisfies both the \code{col_names} and \code{col_types} arguments.
This is good if you know all of this information up front.
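
For example, a minimal sketch (the file name and columns are hypothetical,
and the file is assumed to have no header row):

\preformatted{# Read a headerless CSV, taking both names and types from the schema
df <- read_csv_arrow(
  "measurements.csv",
  schema = schema(station = utf8(), temp = float64(), count = int32())
)
}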

You can also pass a \code{Schema} to the \code{col_types} argument. If you do this,
column names will still be inferred from the file unless you also specify
\code{col_names}. In either case, the column names in the \code{Schema} must match the
data's column names, whether they are explicitly provided or inferred. That
said, this \code{Schema} does not have to reference all columns: those omitted
will have their types inferred.
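
For example, a minimal sketch (hypothetical file; column names come from its
header row, and only one column's type is pinned down):

\preformatted{# Force the count column to int32(); all other names and types are inferred
df <- read_csv_arrow("measurements.csv", col_types = schema(count = int32()))
}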

Alternatively, you can declare column types by providing the compact string representation
that \code{readr} uses to the \code{col_types} argument. This means you provide a
single string, one character per column, where the characters map to Arrow
types analogously to the \code{readr} type mapping:
\itemize{
\item "c": \code{utf8()}
\item "i": \code{int32()}
\item "n": \code{float64()}
\item "d": \code{float64()}
\item "l": \code{bool()}
\item "f": \code{dictionary()}
\item "D": \code{date32()}
\item "T": \code{timestamp()}
\item "t": \code{time32()}
\item "_": \code{null()}
\item "-": \code{null()}
\item "?": infer the type from the data
}

If you use the compact string representation for \code{col_types}, you must also
specify \code{col_names}.
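
For example, a minimal sketch of the compact form (a hypothetical headerless,
tab-delimited file with a character, an integer, and a double column):

\preformatted{# "cid" assigns one type letter per column, in order
df <- read_delim_arrow(
  "measurements.tsv",
  delim = "\\t",
  col_names = c("station", "count", "temp"),
  col_types = "cid"
)
}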

Regardless of how types are specified, all columns with a \code{null()} type will
be dropped.

Note that if you are specifying column names, whether by \code{schema} or
\code{col_names}, and the CSV file has a header row that would otherwise be used
to identify column names, you'll need to add \code{skip = 1} to skip that row.
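
For example, a minimal sketch (hypothetical file whose first row is a header):

\preformatted{# Names are supplied explicitly, so skip the file's own header row
df <- read_csv_arrow(
  "measurements.csv",
  col_names = c("station", "count", "temp"),
  skip = 1
)
}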
}

\examples{
\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
tf <- tempfile()
on.exit(unlink(tf))
write.csv(mtcars, file = tf)
df <- read_csv_arrow(tf)
dim(df)
# Can select columns
df <- read_csv_arrow(tf, col_select = starts_with("d"))
\dontshow{\}) # examplesIf}
}