// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
#include <chrono>
#include <cstdint>
#include <functional>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/filesystem/type_fwd.h"
#include "arrow/io/interfaces.h"
#include "arrow/type_fwd.h"
#include "arrow/util/compare.h"
#include "arrow/util/macros.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
#include "arrow/util/windows_fixup.h"
// A system clock time point expressed as a 64-bit (or more) number of
// nanoseconds since the epoch.
//
// Used throughout this header for file modification times (see kNoTime and
// FileInfo::mtime()).
using TimePoint =
    std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
46 ARROW_EXPORT
std::string
ToString(FileType
);
48 ARROW_EXPORT
std::ostream
& operator<<(std::ostream
& os
, FileType
);
/// Sentinel meaning "size not available"; FileInfo initializes its size to
/// this value until set_size() is called.
static constexpr int64_t kNoSize = -1;
51 static const TimePoint kNoTime
= TimePoint(TimePoint::duration(-1));
53 /// \brief FileSystem entry info
54 struct ARROW_EXPORT FileInfo
: public util::EqualityComparable
<FileInfo
> {
56 FileInfo(FileInfo
&&) = default;
57 FileInfo
& operator=(FileInfo
&&) = default;
58 FileInfo(const FileInfo
&) = default;
59 FileInfo
& operator=(const FileInfo
&) = default;
61 explicit FileInfo(std::string path
, FileType type
= FileType::Unknown
)
62 : path_(std::move(path
)), type_(type
) {}
65 FileType
type() const { return type_
; }
66 void set_type(FileType type
) { type_
= type
; }
68 /// The full file path in the filesystem
69 const std::string
& path() const { return path_
; }
70 void set_path(std::string path
) { path_
= std::move(path
); }
72 /// The file base name (component after the last directory separator)
73 std::string
base_name() const;
75 // The directory base name (component before the file base name).
76 std::string
dir_name() const;
78 /// The size in bytes, if available
80 /// Only regular files are guaranteed to have a size.
81 int64_t size() const { return size_
; }
82 void set_size(int64_t size
) { size_
= size
; }
84 /// The file extension (excluding the dot)
85 std::string
extension() const;
87 /// The time of last modification, if available
88 TimePoint
mtime() const { return mtime_
; }
89 void set_mtime(TimePoint mtime
) { mtime_
= mtime
; }
91 bool IsFile() const { return type_
== FileType::File
; }
92 bool IsDirectory() const { return type_
== FileType::Directory
; }
94 bool Equals(const FileInfo
& other
) const {
95 return type() == other
.type() && path() == other
.path() && size() == other
.size() &&
96 mtime() == other
.mtime();
99 std::string
ToString() const;
101 /// Function object implementing less-than comparison and hashing by
102 /// path, to support sorting infos, using them as keys, and other
103 /// interactions with the STL.
105 bool operator()(const FileInfo
& l
, const FileInfo
& r
) const {
106 return l
.path() < r
.path();
109 size_t operator()(const FileInfo
& i
) const {
110 return std::hash
<std::string
>{}(i
.path());
116 FileType type_
= FileType::Unknown
;
117 int64_t size_
= kNoSize
;
118 TimePoint mtime_
= kNoTime
;
121 ARROW_EXPORT
std::ostream
& operator<<(std::ostream
& os
, const FileInfo
&);
/// \brief File selector for filesystem APIs
///
/// Plain option bundle passed to the selector-based GetFileInfo overloads.
struct ARROW_EXPORT FileSelector {
  /// The directory in which to select files.
  /// If the path exists but doesn't point to a directory, this should be an error.
  std::string base_dir;
  /// The behavior if `base_dir` isn't found in the filesystem.  If false,
  /// an error is returned.  If true, an empty selection is returned.
  bool allow_not_found;
  /// Whether to recurse into subdirectories.
  // The constructor initializes this member, so it must be declared here
  // (its declaration line was missing).
  bool recursive;
  /// The maximum number of subdirectories to recurse into.
  int32_t max_recursion;

  /// Defaults: empty base_dir, missing base_dir is an error, no recursion,
  /// and an effectively unlimited recursion depth (INT32_MAX).
  FileSelector() : allow_not_found(false), recursive(false), max_recursion(INT32_MAX) {}
};
139 /// \brief FileSystem, path pair
140 struct ARROW_EXPORT FileLocator
{
141 std::shared_ptr
<FileSystem
> filesystem
;
145 using FileInfoVector
= std::vector
<FileInfo
>;
146 using FileInfoGenerator
= std::function
<Future
<FileInfoVector
>()>;
151 struct IterationTraits
<fs::FileInfoVector
> {
152 static fs::FileInfoVector
End() { return {}; }
153 static bool IsEnd(const fs::FileInfoVector
& val
) { return val
.empty(); }
158 /// \brief Abstract file system API
159 class ARROW_EXPORT FileSystem
: public std::enable_shared_from_this
<FileSystem
> {
161 virtual ~FileSystem();
163 virtual std::string
type_name() const = 0;
165 /// EXPERIMENTAL: The IOContext associated with this filesystem.
166 const io::IOContext
& io_context() const { return io_context_
; }
168 /// Normalize path for the given filesystem
170 /// The default implementation of this method is a no-op, but subclasses
171 /// may allow normalizing irregular path forms (such as Windows local paths).
172 virtual Result
<std::string
> NormalizePath(std::string path
);
174 virtual bool Equals(const FileSystem
& other
) const = 0;
176 virtual bool Equals(const std::shared_ptr
<FileSystem
>& other
) const {
177 return Equals(*other
);
180 /// Get info for the given target.
182 /// Any symlink is automatically dereferenced, recursively.
183 /// A nonexistent or unreachable file returns an Ok status and
184 /// has a FileType of value NotFound. An error status indicates
185 /// a truly exceptional condition (low-level I/O error, etc.).
186 virtual Result
<FileInfo
> GetFileInfo(const std::string
& path
) = 0;
187 /// Same, for many targets at once.
188 virtual Result
<FileInfoVector
> GetFileInfo(const std::vector
<std::string
>& paths
);
189 /// Same, according to a selector.
191 /// The selector's base directory will not be part of the results, even if
193 /// If it doesn't exist, see `FileSelector::allow_not_found`.
194 virtual Result
<FileInfoVector
> GetFileInfo(const FileSelector
& select
) = 0;
196 /// Async version of GetFileInfo
197 virtual Future
<FileInfoVector
> GetFileInfoAsync(const std::vector
<std::string
>& paths
);
199 /// Streaming async version of GetFileInfo
201 /// The returned generator is not async-reentrant, i.e. you need to wait for
202 /// the returned future to complete before calling the generator again.
203 virtual FileInfoGenerator
GetFileInfoGenerator(const FileSelector
& select
);
205 /// Create a directory and subdirectories.
207 /// This function succeeds if the directory already exists.
208 virtual Status
CreateDir(const std::string
& path
, bool recursive
= true) = 0;
210 /// Delete a directory and its contents, recursively.
211 virtual Status
DeleteDir(const std::string
& path
) = 0;
213 /// Delete a directory's contents, recursively.
215 /// Like DeleteDir, but doesn't delete the directory itself.
216 /// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents.
217 virtual Status
DeleteDirContents(const std::string
& path
) = 0;
219 /// EXPERIMENTAL: Delete the root directory's contents, recursively.
221 /// Implementations may decide to raise an error if this operation is
223 // NOTE: may decide to remove this if it's deemed not useful
224 virtual Status
DeleteRootDirContents() = 0;
227 virtual Status
DeleteFile(const std::string
& path
) = 0;
228 /// Delete many files.
230 /// The default implementation issues individual delete operations in sequence.
231 virtual Status
DeleteFiles(const std::vector
<std::string
>& paths
);
233 /// Move / rename a file or directory.
235 /// If the destination exists:
236 /// - if it is a non-empty directory, an error is returned
237 /// - otherwise, if it has the same type as the source, it is replaced
238 /// - otherwise, behavior is unspecified (implementation-dependent).
239 virtual Status
Move(const std::string
& src
, const std::string
& dest
) = 0;
243 /// If the destination exists and is a directory, an error is returned.
244 /// Otherwise, it is replaced.
245 virtual Status
CopyFile(const std::string
& src
, const std::string
& dest
) = 0;
247 /// Open an input stream for sequential reading.
248 virtual Result
<std::shared_ptr
<io::InputStream
>> OpenInputStream(
249 const std::string
& path
) = 0;
250 /// Open an input stream for sequential reading.
252 /// This override assumes the given FileInfo validly represents the file's
253 /// characteristics, and may optimize access depending on them (for example
254 /// avoid querying the file size or its existence).
255 virtual Result
<std::shared_ptr
<io::InputStream
>> OpenInputStream(const FileInfo
& info
);
257 /// Open an input file for random access reading.
258 virtual Result
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFile(
259 const std::string
& path
) = 0;
260 /// Open an input file for random access reading.
262 /// This override assumes the given FileInfo validly represents the file's
263 /// characteristics, and may optimize access depending on them (for example
264 /// avoid querying the file size or its existence).
265 virtual Result
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFile(
266 const FileInfo
& info
);
268 /// Async version of OpenInputStream
269 virtual Future
<std::shared_ptr
<io::InputStream
>> OpenInputStreamAsync(
270 const std::string
& path
);
271 /// Async version of OpenInputStream
272 virtual Future
<std::shared_ptr
<io::InputStream
>> OpenInputStreamAsync(
273 const FileInfo
& info
);
275 /// Async version of OpenInputFile
276 virtual Future
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFileAsync(
277 const std::string
& path
);
278 /// Async version of OpenInputFile
279 virtual Future
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFileAsync(
280 const FileInfo
& info
);
282 /// Open an output stream for sequential writing.
284 /// If the target already exists, existing data is truncated.
285 virtual Result
<std::shared_ptr
<io::OutputStream
>> OpenOutputStream(
286 const std::string
& path
,
287 const std::shared_ptr
<const KeyValueMetadata
>& metadata
) = 0;
288 Result
<std::shared_ptr
<io::OutputStream
>> OpenOutputStream(const std::string
& path
);
290 /// Open an output stream for appending.
292 /// If the target doesn't exist, a new empty file is created.
294 "Deprecated in 6.0.0. "
295 "OpenAppendStream is unsupported on several filesystems and will be later removed.")
296 virtual Result
<std::shared_ptr
<io::OutputStream
>> OpenAppendStream(
297 const std::string
& path
,
298 const std::shared_ptr
<const KeyValueMetadata
>& metadata
) = 0;
299 Result
<std::shared_ptr
<io::OutputStream
>> OpenAppendStream(const std::string
& path
);
302 explicit FileSystem(const io::IOContext
& io_context
= io::default_io_context())
303 : io_context_(io_context
) {}
305 io::IOContext io_context_
;
306 // Whether metadata operations (such as GetFileInfo or OpenInputStream)
307 // are cheap enough that the default async variants don't bother with
309 bool default_async_is_sync_
= true;
312 /// \brief A FileSystem implementation that delegates to another
313 /// implementation after prepending a fixed base path.
315 /// This is useful to expose a logical view of a subtree of a filesystem,
316 /// for example a directory in a LocalFileSystem.
317 /// This works on abstract paths, i.e. paths using forward slashes and
318 /// and a single root "/". Windows paths are not guaranteed to work.
319 /// This makes no security guarantee. For example, symlinks may allow to
320 /// "escape" the subtree and access other parts of the underlying filesystem.
321 class ARROW_EXPORT SubTreeFileSystem
: public FileSystem
{
323 // This constructor may abort if base_path is invalid.
324 explicit SubTreeFileSystem(const std::string
& base_path
,
325 std::shared_ptr
<FileSystem
> base_fs
);
326 ~SubTreeFileSystem() override
;
328 std::string
type_name() const override
{ return "subtree"; }
329 std::string
base_path() const { return base_path_
; }
330 std::shared_ptr
<FileSystem
> base_fs() const { return base_fs_
; }
332 Result
<std::string
> NormalizePath(std::string path
) override
;
334 bool Equals(const FileSystem
& other
) const override
;
337 using FileSystem::GetFileInfo
;
339 Result
<FileInfo
> GetFileInfo(const std::string
& path
) override
;
340 Result
<FileInfoVector
> GetFileInfo(const FileSelector
& select
) override
;
342 FileInfoGenerator
GetFileInfoGenerator(const FileSelector
& select
) override
;
344 Status
CreateDir(const std::string
& path
, bool recursive
= true) override
;
346 Status
DeleteDir(const std::string
& path
) override
;
347 Status
DeleteDirContents(const std::string
& path
) override
;
348 Status
DeleteRootDirContents() override
;
350 Status
DeleteFile(const std::string
& path
) override
;
352 Status
Move(const std::string
& src
, const std::string
& dest
) override
;
354 Status
CopyFile(const std::string
& src
, const std::string
& dest
) override
;
356 Result
<std::shared_ptr
<io::InputStream
>> OpenInputStream(
357 const std::string
& path
) override
;
358 Result
<std::shared_ptr
<io::InputStream
>> OpenInputStream(const FileInfo
& info
) override
;
359 Result
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFile(
360 const std::string
& path
) override
;
361 Result
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFile(
362 const FileInfo
& info
) override
;
364 Future
<std::shared_ptr
<io::InputStream
>> OpenInputStreamAsync(
365 const std::string
& path
) override
;
366 Future
<std::shared_ptr
<io::InputStream
>> OpenInputStreamAsync(
367 const FileInfo
& info
) override
;
368 Future
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFileAsync(
369 const std::string
& path
) override
;
370 Future
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFileAsync(
371 const FileInfo
& info
) override
;
373 Result
<std::shared_ptr
<io::OutputStream
>> OpenOutputStream(
374 const std::string
& path
,
375 const std::shared_ptr
<const KeyValueMetadata
>& metadata
= {}) override
;
376 Result
<std::shared_ptr
<io::OutputStream
>> OpenAppendStream(
377 const std::string
& path
,
378 const std::shared_ptr
<const KeyValueMetadata
>& metadata
= {}) override
;
381 SubTreeFileSystem() {}
383 const std::string base_path_
;
384 std::shared_ptr
<FileSystem
> base_fs_
;
386 std::string
PrependBase(const std::string
& s
) const;
387 Status
PrependBaseNonEmpty(std::string
* s
) const;
388 Result
<std::string
> StripBase(const std::string
& s
) const;
389 Status
FixInfo(FileInfo
* info
) const;
391 static Result
<std::string
> NormalizeBasePath(
392 std::string base_path
, const std::shared_ptr
<FileSystem
>& base_fs
);
395 /// \brief A FileSystem implementation that delegates to another
396 /// implementation but inserts latencies at various points.
397 class ARROW_EXPORT SlowFileSystem
: public FileSystem
{
399 SlowFileSystem(std::shared_ptr
<FileSystem
> base_fs
,
400 std::shared_ptr
<io::LatencyGenerator
> latencies
);
401 SlowFileSystem(std::shared_ptr
<FileSystem
> base_fs
, double average_latency
);
402 SlowFileSystem(std::shared_ptr
<FileSystem
> base_fs
, double average_latency
,
405 std::string
type_name() const override
{ return "slow"; }
406 bool Equals(const FileSystem
& other
) const override
;
408 using FileSystem::GetFileInfo
;
409 Result
<FileInfo
> GetFileInfo(const std::string
& path
) override
;
410 Result
<FileInfoVector
> GetFileInfo(const FileSelector
& select
) override
;
412 Status
CreateDir(const std::string
& path
, bool recursive
= true) override
;
414 Status
DeleteDir(const std::string
& path
) override
;
415 Status
DeleteDirContents(const std::string
& path
) override
;
416 Status
DeleteRootDirContents() override
;
418 Status
DeleteFile(const std::string
& path
) override
;
420 Status
Move(const std::string
& src
, const std::string
& dest
) override
;
422 Status
CopyFile(const std::string
& src
, const std::string
& dest
) override
;
424 Result
<std::shared_ptr
<io::InputStream
>> OpenInputStream(
425 const std::string
& path
) override
;
426 Result
<std::shared_ptr
<io::InputStream
>> OpenInputStream(const FileInfo
& info
) override
;
427 Result
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFile(
428 const std::string
& path
) override
;
429 Result
<std::shared_ptr
<io::RandomAccessFile
>> OpenInputFile(
430 const FileInfo
& info
) override
;
431 Result
<std::shared_ptr
<io::OutputStream
>> OpenOutputStream(
432 const std::string
& path
,
433 const std::shared_ptr
<const KeyValueMetadata
>& metadata
= {}) override
;
434 Result
<std::shared_ptr
<io::OutputStream
>> OpenAppendStream(
435 const std::string
& path
,
436 const std::shared_ptr
<const KeyValueMetadata
>& metadata
= {}) override
;
439 std::shared_ptr
<FileSystem
> base_fs_
;
440 std::shared_ptr
<io::LatencyGenerator
> latencies_
;
443 /// \defgroup filesystem-factories Functions for creating FileSystem instances
447 /// \brief Create a new FileSystem by URI
449 /// Recognized schemes are "file", "mock", "hdfs" and "s3fs".
451 /// \param[in] uri a URI-based path, ex: file:///some/local/path
452 /// \param[out] out_path (optional) Path inside the filesystem.
453 /// \return out_fs FileSystem instance.
455 Result
<std::shared_ptr
<FileSystem
>> FileSystemFromUri(const std::string
& uri
,
456 std::string
* out_path
= NULLPTR
);
458 /// \brief Create a new FileSystem by URI with a custom IO context
460 /// Recognized schemes are "file", "mock", "hdfs" and "s3fs".
462 /// \param[in] uri a URI-based path, ex: file:///some/local/path
463 /// \param[in] io_context an IOContext which will be associated with the filesystem
464 /// \param[out] out_path (optional) Path inside the filesystem.
465 /// \return out_fs FileSystem instance.
467 Result
<std::shared_ptr
<FileSystem
>> FileSystemFromUri(const std::string
& uri
,
468 const io::IOContext
& io_context
,
469 std::string
* out_path
= NULLPTR
);
471 /// \brief Create a new FileSystem by URI
473 /// Same as FileSystemFromUri, but in addition also recognize non-URIs
474 /// and treat them as local filesystem paths. Only absolute local filesystem
475 /// paths are allowed.
477 Result
<std::shared_ptr
<FileSystem
>> FileSystemFromUriOrPath(
478 const std::string
& uri
, std::string
* out_path
= NULLPTR
);
480 /// \brief Create a new FileSystem by URI with a custom IO context
482 /// Same as FileSystemFromUri, but in addition also recognize non-URIs
483 /// and treat them as local filesystem paths. Only absolute local filesystem
484 /// paths are allowed.
486 Result
<std::shared_ptr
<FileSystem
>> FileSystemFromUriOrPath(
487 const std::string
& uri
, const io::IOContext
& io_context
,
488 std::string
* out_path
= NULLPTR
);
492 /// \brief Copy files, including from one FileSystem to another
494 /// If a source and destination are resident in the same FileSystem FileSystem::CopyFile
495 /// will be used, otherwise the file will be opened as a stream in both FileSystems and
496 /// chunks copied from the source to the destination. No directories will be created.
498 Status
CopyFiles(const std::vector
<FileLocator
>& sources
,
499 const std::vector
<FileLocator
>& destinations
,
500 const io::IOContext
& io_context
= io::default_io_context(),
501 int64_t chunk_size
= 1024 * 1024, bool use_threads
= true);
503 /// \brief Copy selected files, including from one FileSystem to another
505 /// Directories will be created under the destination base directory as needed.
507 Status
CopyFiles(const std::shared_ptr
<FileSystem
>& source_fs
,
508 const FileSelector
& source_sel
,
509 const std::shared_ptr
<FileSystem
>& destination_fs
,
510 const std::string
& destination_base_dir
,
511 const io::IOContext
& io_context
= io::default_io_context(),
512 int64_t chunk_size
= 1024 * 1024, bool use_threads
= true);
/// Options accepted by the global Initialize() routine declared in this
/// header.  Both fields default to empty strings.
struct FileSystemGlobalOptions {
  /// Path to a single PEM file holding all TLS CA certificates
  ///
  /// If empty, the underlying TLS library's defaults will be used.
  std::string tls_ca_file_path;

  /// Path to a directory holding TLS CA certificates in individual PEM files
  /// named along the OpenSSL "hashed" format.
  ///
  /// If empty, the underlying TLS library's defaults will be used.
  std::string tls_ca_dir_path;
};
527 /// EXPERIMENTAL: optional global initialization routine
529 /// This is for environments (such as manylinux) where the path
530 /// to TLS CA certificates needs to be configured at runtime.
532 Status
Initialize(const FileSystemGlobalOptions
& options
);