1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
23 #include <unordered_map>
26 #include "arrow/io/interfaces.h"
27 #include "arrow/util/macros.h"
28 #include "arrow/util/visibility.h"
// Forward declaration; the class body appears further down in this file.
class HdfsReadableFile;
// Forward declaration; the class body appears further down in this file.
class HdfsOutputStream;
41 /// DEPRECATED. Use the FileSystem API in arrow::fs instead.
/// Tag with two enumerators: FILE and DIRECTORY.
enum type {
  FILE,
  DIRECTORY
};
46 /// DEPRECATED. Use the FileSystem API in arrow::fs instead.
47 struct ARROW_EXPORT FileStatistics
{
48 /// Size of file, -1 if finding length is unsupported
50 ObjectType::type kind
;
53 class ARROW_EXPORT FileSystem
{
55 virtual ~FileSystem() = default;
57 virtual Status
MakeDirectory(const std::string
& path
) = 0;
59 virtual Status
DeleteDirectory(const std::string
& path
) = 0;
61 virtual Status
GetChildren(const std::string
& path
,
62 std::vector
<std::string
>* listing
) = 0;
64 virtual Status
Rename(const std::string
& src
, const std::string
& dst
) = 0;
66 virtual Status
Stat(const std::string
& path
, FileStatistics
* stat
) = 0;
70 ObjectType::type kind
;
// Access times in UNIX timestamps (seconds)
// NOTE(review): a signed 32-bit count of seconds cannot represent instants
// after 2038-01-19; widening would change the struct layout, so it is only
// flagged here.
int32_t last_modified_time;
int32_t last_access_time;
87 struct HdfsConnectionConfig
{
91 std::string kerb_ticket
;
92 std::unordered_map
<std::string
, std::string
> extra_conf
;
95 class ARROW_EXPORT HadoopFileSystem
: public FileSystem
{
97 ~HadoopFileSystem() override
;
99 // Connect to an HDFS cluster given a configuration
101 // @param config (in): configuration for connecting
102 // @param fs (out): the created client
104 static Status
Connect(const HdfsConnectionConfig
* config
,
105 std::shared_ptr
<HadoopFileSystem
>* fs
);
107 // Create directory and all parents
109 // @param path (in): absolute HDFS path
111 Status
MakeDirectory(const std::string
& path
) override
;
113 // Delete file or directory
114 // @param path absolute path to data
115 // @param recursive if path is a directory, delete contents as well
116 // @returns error status on failure
117 Status
Delete(const std::string
& path
, bool recursive
= false);
119 Status
DeleteDirectory(const std::string
& path
) override
;
121 // Disconnect from cluster
/// Test whether a path exists.
///
/// @param path (in): absolute HDFS path
/// @returns bool, true if the path exists, false if not (or on error)
bool Exists(const std::string& path);
130 // @param path (in): absolute HDFS path
133 Status
GetPathInfo(const std::string
& path
, HdfsPathInfo
* info
);
135 // @param nbytes (out): total capacity of the filesystem
137 Status
GetCapacity(int64_t* nbytes
);
139 // @param nbytes (out): total bytes used of the filesystem
141 Status
GetUsed(int64_t* nbytes
);
143 Status
GetChildren(const std::string
& path
, std::vector
<std::string
>* listing
) override
;
145 /// List directory contents
147 /// If path is a relative path, returned values will be absolute paths or URIs
148 /// starting from the current working directory.
149 Status
ListDirectory(const std::string
& path
, std::vector
<HdfsPathInfo
>* listing
);
151 /// Return the filesystem's current working directory.
153 /// The working directory is the base path for all relative paths given to
155 /// NOTE: this actually returns a URI.
156 Status
GetWorkingDirectory(std::string
* out
);
160 /// @param path file path to change
161 /// @param owner pass null for no change
162 /// @param group pass null for no change
163 Status
Chown(const std::string
& path
, const char* owner
, const char* group
);
165 /// Change path permissions
167 /// \param path Absolute path in file system
168 /// \param mode Mode bitset
170 Status
Chmod(const std::string
& path
, int mode
);
172 // Move file or directory from source path to destination path within the
173 // current filesystem
174 Status
Rename(const std::string
& src
, const std::string
& dst
) override
;
176 Status
Copy(const std::string
& src
, const std::string
& dst
);
178 Status
Move(const std::string
& src
, const std::string
& dst
);
180 Status
Stat(const std::string
& path
, FileStatistics
* stat
) override
;
// TODO(wesm): SetWorkingDirectory (GetWorkingDirectory is already declared above)
184 // Open an HDFS file in READ mode. Returns error
185 // status if the file is not found.
187 // @param path complete file path
188 Status
OpenReadable(const std::string
& path
, int32_t buffer_size
,
189 std::shared_ptr
<HdfsReadableFile
>* file
);
191 Status
OpenReadable(const std::string
& path
, int32_t buffer_size
,
192 const io::IOContext
& io_context
,
193 std::shared_ptr
<HdfsReadableFile
>* file
);
195 Status
OpenReadable(const std::string
& path
, std::shared_ptr
<HdfsReadableFile
>* file
);
197 Status
OpenReadable(const std::string
& path
, const io::IOContext
& io_context
,
198 std::shared_ptr
<HdfsReadableFile
>* file
);
200 // FileMode::WRITE options
201 // @param path complete file path
202 // @param buffer_size 0 by default
203 // @param replication 0 by default
204 // @param default_block_size 0 by default
205 Status
OpenWritable(const std::string
& path
, bool append
, int32_t buffer_size
,
206 int16_t replication
, int64_t default_block_size
,
207 std::shared_ptr
<HdfsOutputStream
>* file
);
209 Status
OpenWritable(const std::string
& path
, bool append
,
210 std::shared_ptr
<HdfsOutputStream
>* file
);
213 friend class HdfsReadableFile
;
214 friend class HdfsOutputStream
;
216 class ARROW_NO_EXPORT HadoopFileSystemImpl
;
217 std::unique_ptr
<HadoopFileSystemImpl
> impl_
;
220 ARROW_DISALLOW_COPY_AND_ASSIGN(HadoopFileSystem
);
223 class ARROW_EXPORT HdfsReadableFile
: public RandomAccessFile
{
225 ~HdfsReadableFile() override
;
227 Status
Close() override
;
229 bool closed() const override
;
231 // NOTE: If you wish to read a particular range of a file in a multithreaded
232 // context, you may prefer to use ReadAt to avoid locking issues
233 Result
<int64_t> Read(int64_t nbytes
, void* out
) override
;
234 Result
<std::shared_ptr
<Buffer
>> Read(int64_t nbytes
) override
;
235 Result
<int64_t> ReadAt(int64_t position
, int64_t nbytes
, void* out
) override
;
236 Result
<std::shared_ptr
<Buffer
>> ReadAt(int64_t position
, int64_t nbytes
) override
;
238 Status
Seek(int64_t position
) override
;
239 Result
<int64_t> Tell() const override
;
240 Result
<int64_t> GetSize() override
;
243 explicit HdfsReadableFile(const io::IOContext
&);
245 class ARROW_NO_EXPORT HdfsReadableFileImpl
;
246 std::unique_ptr
<HdfsReadableFileImpl
> impl_
;
248 friend class HadoopFileSystem::HadoopFileSystemImpl
;
250 ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile
);
253 // Naming this file OutputStream because it does not support seeking (like the
254 // WritableFile interface)
255 class ARROW_EXPORT HdfsOutputStream
: public OutputStream
{
257 ~HdfsOutputStream() override
;
259 Status
Close() override
;
261 bool closed() const override
;
263 using OutputStream::Write
;
264 Status
Write(const void* buffer
, int64_t nbytes
) override
;
266 Status
Flush() override
;
268 Result
<int64_t> Tell() const override
;
271 class ARROW_NO_EXPORT HdfsOutputStreamImpl
;
272 std::unique_ptr
<HdfsOutputStreamImpl
> impl_
;
274 friend class HadoopFileSystem::HadoopFileSystemImpl
;
278 ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream
);
281 Status ARROW_EXPORT
HaveLibHdfs();