]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/arrow/filesystem/filesystem.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / filesystem / filesystem.h
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
20#include <chrono>
21#include <cstdint>
22#include <functional>
23#include <iosfwd>
24#include <memory>
25#include <string>
26#include <utility>
27#include <vector>
28
29#include "arrow/filesystem/type_fwd.h"
30#include "arrow/io/interfaces.h"
31#include "arrow/type_fwd.h"
32#include "arrow/util/compare.h"
33#include "arrow/util/macros.h"
34#include "arrow/util/type_fwd.h"
35#include "arrow/util/visibility.h"
36#include "arrow/util/windows_fixup.h"
37
38namespace arrow {
39namespace fs {
40
/// \brief Time point type used for filesystem timestamps
///
/// This is a std::chrono::system_clock time point whose duration is
/// std::chrono::nanoseconds, i.e. a 64-bit (or more) count of nanoseconds
/// since the clock's epoch.
using TimePoint =
    std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
45
46ARROW_EXPORT std::string ToString(FileType);
47
48ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType);
49
50static const int64_t kNoSize = -1;
51static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1));
52
53/// \brief FileSystem entry info
54struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> {
55 FileInfo() = default;
56 FileInfo(FileInfo&&) = default;
57 FileInfo& operator=(FileInfo&&) = default;
58 FileInfo(const FileInfo&) = default;
59 FileInfo& operator=(const FileInfo&) = default;
60
61 explicit FileInfo(std::string path, FileType type = FileType::Unknown)
62 : path_(std::move(path)), type_(type) {}
63
64 /// The file type
65 FileType type() const { return type_; }
66 void set_type(FileType type) { type_ = type; }
67
68 /// The full file path in the filesystem
69 const std::string& path() const { return path_; }
70 void set_path(std::string path) { path_ = std::move(path); }
71
72 /// The file base name (component after the last directory separator)
73 std::string base_name() const;
74
75 // The directory base name (component before the file base name).
76 std::string dir_name() const;
77
78 /// The size in bytes, if available
79 ///
80 /// Only regular files are guaranteed to have a size.
81 int64_t size() const { return size_; }
82 void set_size(int64_t size) { size_ = size; }
83
84 /// The file extension (excluding the dot)
85 std::string extension() const;
86
87 /// The time of last modification, if available
88 TimePoint mtime() const { return mtime_; }
89 void set_mtime(TimePoint mtime) { mtime_ = mtime; }
90
91 bool IsFile() const { return type_ == FileType::File; }
92 bool IsDirectory() const { return type_ == FileType::Directory; }
93
94 bool Equals(const FileInfo& other) const {
95 return type() == other.type() && path() == other.path() && size() == other.size() &&
96 mtime() == other.mtime();
97 }
98
99 std::string ToString() const;
100
101 /// Function object implementing less-than comparison and hashing by
102 /// path, to support sorting infos, using them as keys, and other
103 /// interactions with the STL.
104 struct ByPath {
105 bool operator()(const FileInfo& l, const FileInfo& r) const {
106 return l.path() < r.path();
107 }
108
109 size_t operator()(const FileInfo& i) const {
110 return std::hash<std::string>{}(i.path());
111 }
112 };
113
114 protected:
115 std::string path_;
116 FileType type_ = FileType::Unknown;
117 int64_t size_ = kNoSize;
118 TimePoint mtime_ = kNoTime;
119};
120
121ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const FileInfo&);
122
/// \brief File selector for filesystem APIs
///
/// A default-constructed selector has an empty base directory, errors out
/// on a missing base directory, and does not recurse.
struct ARROW_EXPORT FileSelector {
  /// The directory in which to select files.
  /// If the path exists but doesn't point to a directory, this should be an error.
  std::string base_dir;
  /// The behavior if `base_dir` isn't found in the filesystem.  If false,
  /// an error is returned.  If true, an empty selection is returned.
  bool allow_not_found = false;
  /// Whether to recurse into subdirectories.
  bool recursive = false;
  /// The maximum number of subdirectories to recurse into.
  int32_t max_recursion = INT32_MAX;

  // Kept user-provided (rather than defaulted) so that the type remains a
  // non-aggregate; the defaults come from the member initializers above.
  FileSelector() {}
};
138
139/// \brief FileSystem, path pair
140struct ARROW_EXPORT FileLocator {
141 std::shared_ptr<FileSystem> filesystem;
142 std::string path;
143};
144
145using FileInfoVector = std::vector<FileInfo>;
146using FileInfoGenerator = std::function<Future<FileInfoVector>()>;
147
148} // namespace fs
149
150template <>
151struct IterationTraits<fs::FileInfoVector> {
152 static fs::FileInfoVector End() { return {}; }
153 static bool IsEnd(const fs::FileInfoVector& val) { return val.empty(); }
154};
155
156namespace fs {
157
158/// \brief Abstract file system API
159class ARROW_EXPORT FileSystem : public std::enable_shared_from_this<FileSystem> {
160 public:
161 virtual ~FileSystem();
162
163 virtual std::string type_name() const = 0;
164
165 /// EXPERIMENTAL: The IOContext associated with this filesystem.
166 const io::IOContext& io_context() const { return io_context_; }
167
168 /// Normalize path for the given filesystem
169 ///
170 /// The default implementation of this method is a no-op, but subclasses
171 /// may allow normalizing irregular path forms (such as Windows local paths).
172 virtual Result<std::string> NormalizePath(std::string path);
173
174 virtual bool Equals(const FileSystem& other) const = 0;
175
176 virtual bool Equals(const std::shared_ptr<FileSystem>& other) const {
177 return Equals(*other);
178 }
179
180 /// Get info for the given target.
181 ///
182 /// Any symlink is automatically dereferenced, recursively.
183 /// A nonexistent or unreachable file returns an Ok status and
184 /// has a FileType of value NotFound. An error status indicates
185 /// a truly exceptional condition (low-level I/O error, etc.).
186 virtual Result<FileInfo> GetFileInfo(const std::string& path) = 0;
187 /// Same, for many targets at once.
188 virtual Result<FileInfoVector> GetFileInfo(const std::vector<std::string>& paths);
189 /// Same, according to a selector.
190 ///
191 /// The selector's base directory will not be part of the results, even if
192 /// it exists.
193 /// If it doesn't exist, see `FileSelector::allow_not_found`.
194 virtual Result<FileInfoVector> GetFileInfo(const FileSelector& select) = 0;
195
196 /// Async version of GetFileInfo
197 virtual Future<FileInfoVector> GetFileInfoAsync(const std::vector<std::string>& paths);
198
199 /// Streaming async version of GetFileInfo
200 ///
201 /// The returned generator is not async-reentrant, i.e. you need to wait for
202 /// the returned future to complete before calling the generator again.
203 virtual FileInfoGenerator GetFileInfoGenerator(const FileSelector& select);
204
205 /// Create a directory and subdirectories.
206 ///
207 /// This function succeeds if the directory already exists.
208 virtual Status CreateDir(const std::string& path, bool recursive = true) = 0;
209
210 /// Delete a directory and its contents, recursively.
211 virtual Status DeleteDir(const std::string& path) = 0;
212
213 /// Delete a directory's contents, recursively.
214 ///
215 /// Like DeleteDir, but doesn't delete the directory itself.
216 /// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents.
217 virtual Status DeleteDirContents(const std::string& path) = 0;
218
219 /// EXPERIMENTAL: Delete the root directory's contents, recursively.
220 ///
221 /// Implementations may decide to raise an error if this operation is
222 /// too dangerous.
223 // NOTE: may decide to remove this if it's deemed not useful
224 virtual Status DeleteRootDirContents() = 0;
225
226 /// Delete a file.
227 virtual Status DeleteFile(const std::string& path) = 0;
228 /// Delete many files.
229 ///
230 /// The default implementation issues individual delete operations in sequence.
231 virtual Status DeleteFiles(const std::vector<std::string>& paths);
232
233 /// Move / rename a file or directory.
234 ///
235 /// If the destination exists:
236 /// - if it is a non-empty directory, an error is returned
237 /// - otherwise, if it has the same type as the source, it is replaced
238 /// - otherwise, behavior is unspecified (implementation-dependent).
239 virtual Status Move(const std::string& src, const std::string& dest) = 0;
240
241 /// Copy a file.
242 ///
243 /// If the destination exists and is a directory, an error is returned.
244 /// Otherwise, it is replaced.
245 virtual Status CopyFile(const std::string& src, const std::string& dest) = 0;
246
247 /// Open an input stream for sequential reading.
248 virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(
249 const std::string& path) = 0;
250 /// Open an input stream for sequential reading.
251 ///
252 /// This override assumes the given FileInfo validly represents the file's
253 /// characteristics, and may optimize access depending on them (for example
254 /// avoid querying the file size or its existence).
255 virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info);
256
257 /// Open an input file for random access reading.
258 virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
259 const std::string& path) = 0;
260 /// Open an input file for random access reading.
261 ///
262 /// This override assumes the given FileInfo validly represents the file's
263 /// characteristics, and may optimize access depending on them (for example
264 /// avoid querying the file size or its existence).
265 virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
266 const FileInfo& info);
267
268 /// Async version of OpenInputStream
269 virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
270 const std::string& path);
271 /// Async version of OpenInputStream
272 virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
273 const FileInfo& info);
274
275 /// Async version of OpenInputFile
276 virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
277 const std::string& path);
278 /// Async version of OpenInputFile
279 virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
280 const FileInfo& info);
281
282 /// Open an output stream for sequential writing.
283 ///
284 /// If the target already exists, existing data is truncated.
285 virtual Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
286 const std::string& path,
287 const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;
288 Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(const std::string& path);
289
290 /// Open an output stream for appending.
291 ///
292 /// If the target doesn't exist, a new empty file is created.
293 ARROW_DEPRECATED(
294 "Deprecated in 6.0.0. "
295 "OpenAppendStream is unsupported on several filesystems and will be later removed.")
296 virtual Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
297 const std::string& path,
298 const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;
299 Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(const std::string& path);
300
301 protected:
302 explicit FileSystem(const io::IOContext& io_context = io::default_io_context())
303 : io_context_(io_context) {}
304
305 io::IOContext io_context_;
306 // Whether metadata operations (such as GetFileInfo or OpenInputStream)
307 // are cheap enough that the default async variants don't bother with
308 // a thread pool.
309 bool default_async_is_sync_ = true;
310};
311
312/// \brief A FileSystem implementation that delegates to another
313/// implementation after prepending a fixed base path.
314///
315/// This is useful to expose a logical view of a subtree of a filesystem,
316/// for example a directory in a LocalFileSystem.
317/// This works on abstract paths, i.e. paths using forward slashes and
318/// and a single root "/". Windows paths are not guaranteed to work.
319/// This makes no security guarantee. For example, symlinks may allow to
320/// "escape" the subtree and access other parts of the underlying filesystem.
321class ARROW_EXPORT SubTreeFileSystem : public FileSystem {
322 public:
323 // This constructor may abort if base_path is invalid.
324 explicit SubTreeFileSystem(const std::string& base_path,
325 std::shared_ptr<FileSystem> base_fs);
326 ~SubTreeFileSystem() override;
327
328 std::string type_name() const override { return "subtree"; }
329 std::string base_path() const { return base_path_; }
330 std::shared_ptr<FileSystem> base_fs() const { return base_fs_; }
331
332 Result<std::string> NormalizePath(std::string path) override;
333
334 bool Equals(const FileSystem& other) const override;
335
336 /// \cond FALSE
337 using FileSystem::GetFileInfo;
338 /// \endcond
339 Result<FileInfo> GetFileInfo(const std::string& path) override;
340 Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
341
342 FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
343
344 Status CreateDir(const std::string& path, bool recursive = true) override;
345
346 Status DeleteDir(const std::string& path) override;
347 Status DeleteDirContents(const std::string& path) override;
348 Status DeleteRootDirContents() override;
349
350 Status DeleteFile(const std::string& path) override;
351
352 Status Move(const std::string& src, const std::string& dest) override;
353
354 Status CopyFile(const std::string& src, const std::string& dest) override;
355
356 Result<std::shared_ptr<io::InputStream>> OpenInputStream(
357 const std::string& path) override;
358 Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
359 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
360 const std::string& path) override;
361 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
362 const FileInfo& info) override;
363
364 Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
365 const std::string& path) override;
366 Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
367 const FileInfo& info) override;
368 Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
369 const std::string& path) override;
370 Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
371 const FileInfo& info) override;
372
373 Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
374 const std::string& path,
375 const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
376 Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
377 const std::string& path,
378 const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
379
380 protected:
381 SubTreeFileSystem() {}
382
383 const std::string base_path_;
384 std::shared_ptr<FileSystem> base_fs_;
385
386 std::string PrependBase(const std::string& s) const;
387 Status PrependBaseNonEmpty(std::string* s) const;
388 Result<std::string> StripBase(const std::string& s) const;
389 Status FixInfo(FileInfo* info) const;
390
391 static Result<std::string> NormalizeBasePath(
392 std::string base_path, const std::shared_ptr<FileSystem>& base_fs);
393};
394
395/// \brief A FileSystem implementation that delegates to another
396/// implementation but inserts latencies at various points.
397class ARROW_EXPORT SlowFileSystem : public FileSystem {
398 public:
399 SlowFileSystem(std::shared_ptr<FileSystem> base_fs,
400 std::shared_ptr<io::LatencyGenerator> latencies);
401 SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency);
402 SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency,
403 int32_t seed);
404
405 std::string type_name() const override { return "slow"; }
406 bool Equals(const FileSystem& other) const override;
407
408 using FileSystem::GetFileInfo;
409 Result<FileInfo> GetFileInfo(const std::string& path) override;
410 Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
411
412 Status CreateDir(const std::string& path, bool recursive = true) override;
413
414 Status DeleteDir(const std::string& path) override;
415 Status DeleteDirContents(const std::string& path) override;
416 Status DeleteRootDirContents() override;
417
418 Status DeleteFile(const std::string& path) override;
419
420 Status Move(const std::string& src, const std::string& dest) override;
421
422 Status CopyFile(const std::string& src, const std::string& dest) override;
423
424 Result<std::shared_ptr<io::InputStream>> OpenInputStream(
425 const std::string& path) override;
426 Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
427 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
428 const std::string& path) override;
429 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
430 const FileInfo& info) override;
431 Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
432 const std::string& path,
433 const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
434 Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
435 const std::string& path,
436 const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
437
438 protected:
439 std::shared_ptr<FileSystem> base_fs_;
440 std::shared_ptr<io::LatencyGenerator> latencies_;
441};
442
443/// \defgroup filesystem-factories Functions for creating FileSystem instances
444///
445/// @{
446
447/// \brief Create a new FileSystem by URI
448///
449/// Recognized schemes are "file", "mock", "hdfs" and "s3fs".
450///
451/// \param[in] uri a URI-based path, ex: file:///some/local/path
452/// \param[out] out_path (optional) Path inside the filesystem.
453/// \return out_fs FileSystem instance.
454ARROW_EXPORT
455Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
456 std::string* out_path = NULLPTR);
457
458/// \brief Create a new FileSystem by URI with a custom IO context
459///
460/// Recognized schemes are "file", "mock", "hdfs" and "s3fs".
461///
462/// \param[in] uri a URI-based path, ex: file:///some/local/path
463/// \param[in] io_context an IOContext which will be associated with the filesystem
464/// \param[out] out_path (optional) Path inside the filesystem.
465/// \return out_fs FileSystem instance.
466ARROW_EXPORT
467Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
468 const io::IOContext& io_context,
469 std::string* out_path = NULLPTR);
470
471/// \brief Create a new FileSystem by URI
472///
473/// Same as FileSystemFromUri, but in addition also recognize non-URIs
474/// and treat them as local filesystem paths. Only absolute local filesystem
475/// paths are allowed.
476ARROW_EXPORT
477Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
478 const std::string& uri, std::string* out_path = NULLPTR);
479
480/// \brief Create a new FileSystem by URI with a custom IO context
481///
482/// Same as FileSystemFromUri, but in addition also recognize non-URIs
483/// and treat them as local filesystem paths. Only absolute local filesystem
484/// paths are allowed.
485ARROW_EXPORT
486Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
487 const std::string& uri, const io::IOContext& io_context,
488 std::string* out_path = NULLPTR);
489
490/// @}
491
492/// \brief Copy files, including from one FileSystem to another
493///
494/// If a source and destination are resident in the same FileSystem FileSystem::CopyFile
495/// will be used, otherwise the file will be opened as a stream in both FileSystems and
496/// chunks copied from the source to the destination. No directories will be created.
497ARROW_EXPORT
498Status CopyFiles(const std::vector<FileLocator>& sources,
499 const std::vector<FileLocator>& destinations,
500 const io::IOContext& io_context = io::default_io_context(),
501 int64_t chunk_size = 1024 * 1024, bool use_threads = true);
502
503/// \brief Copy selected files, including from one FileSystem to another
504///
505/// Directories will be created under the destination base directory as needed.
506ARROW_EXPORT
507Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs,
508 const FileSelector& source_sel,
509 const std::shared_ptr<FileSystem>& destination_fs,
510 const std::string& destination_base_dir,
511 const io::IOContext& io_context = io::default_io_context(),
512 int64_t chunk_size = 1024 * 1024, bool use_threads = true);
513
/// \brief Options accepted by the global Initialize() routine
///
/// Both paths default to empty strings.
struct FileSystemGlobalOptions {
  /// Path to a single PEM file holding all TLS CA certificates
  ///
  /// If empty, the underlying TLS library's defaults will be used.
  std::string tls_ca_file_path;

  /// Path to a directory holding TLS CA certificates in individual PEM files
  /// named along the OpenSSL "hashed" format.
  ///
  /// If empty, the underlying TLS library's defaults will be used.
  std::string tls_ca_dir_path;
};
526
527/// EXPERIMENTAL: optional global initialization routine
528///
529/// This is for environments (such as manylinux) where the path
530/// to TLS CA certificates needs to be configured at runtime.
531ARROW_EXPORT
532Status Initialize(const FileSystemGlobalOptions& options);
533
534} // namespace fs
535} // namespace arrow