2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
19 * Copyright 2015 Cloudius Systems
24 #include <seastar/core/stream.hh>
25 #include <seastar/core/sstring.hh>
26 #include <seastar/core/shared_ptr.hh>
27 #include <seastar/core/align.hh>
28 #include <seastar/core/future-util.hh>
29 #include <seastar/core/fair_queue.hh>
30 #include <seastar/util/std-compat.hh>
31 #include <system_error>
33 #include <sys/statvfs.h>
34 #include <sys/ioctl.h>
41 /// \addtogroup fileio-module
44 /// Enumeration describing the type of a directory entry being listed.
46 /// \see file::list_directory()
47 enum class directory_entry_type {
57 /// Enumeration describing the type of a particular filesystem
69 /// A directory entry being listed.
70 struct directory_entry {
71 /// Name of the file in a directory entry. Will never be "." or "..". Only the last component is included.
73 /// Type of the directory entry, if known.
74 compat::optional<directory_entry_type> type;
79 /// Options used to configure an open file.
82 struct file_open_options {
83 uint64_t extent_allocation_size_hint = 1 << 20; ///< Allocate this much disk space when extending the file
84 bool sloppy_size = false; ///< Allow the file size not to track the amount of data written until a flush
85 uint64_t sloppy_size_hint = 1 << 20; ///< Hint as to what the eventual file size will be
90 class io_priority_class {
99 const io_priority_class& default_priority_class();
106 // A handle that can be transported across shards and used to
107 // create a dup(2)-like `file` object referring to the same underlying file
108 class file_handle_impl {
110 virtual ~file_handle_impl() = default;
111 virtual std::unique_ptr<file_handle_impl> clone() const = 0;
112 virtual shared_ptr<file_impl> to_file() && = 0;
117 static file_impl* get_file_impl(file& f);
// DMA alignment requirements reported to users through
// file::memory_dma_alignment(), file::disk_read_dma_alignment() and
// file::disk_write_dma_alignment() respectively. Each defaults to 4096.
119 unsigned _memory_dma_alignment = 4096;
120 unsigned _disk_read_dma_alignment = 4096;
121 unsigned _disk_write_dma_alignment = 4096;
123 virtual ~file_impl() {}
125 virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) = 0;
126 virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
127 virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) = 0;
128 virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
129 virtual future<> flush(void) = 0;
130 virtual future<struct stat> stat(void) = 0;
131 virtual future<> truncate(uint64_t length) = 0;
132 virtual future<> discard(uint64_t offset, uint64_t length) = 0;
133 virtual future<> allocate(uint64_t position, uint64_t length) = 0;
134 virtual future<uint64_t> size(void) = 0;
135 virtual future<> close() = 0;
136 virtual std::unique_ptr<file_handle_impl> dup();
137 virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) = 0;
138 virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) = 0;
140 friend class reactor;
145 /// A data file on persistent storage.
147 /// File objects represent uncached, unbuffered files. As such great care
148 /// must be taken to cache data at the application layer; neither seastar
149 /// nor the OS will cache these files.
151 /// Data is transferred using direct memory access (DMA). This imposes
152 /// restrictions on file offsets and data pointers. The former must be aligned
153 /// on a 4096 byte boundary, while a 512 byte boundary suffices for the latter.
155 shared_ptr<file_impl> _file_impl;
157 explicit file(int fd, file_open_options options);
159 /// Default constructor constructs an uninitialized file object.
161 /// A default constructor is useful for the common practice of declaring
162 /// a variable, and only assigning to it later. The uninitialized file
163 /// must not be used, or undefined behavior will result (currently, a null
164 /// pointer dereference).
166 /// One can check whether a file object is in uninitialized state with
167 /// \ref operator bool(); One can reset a file back to uninitialized state
168 /// by assigning file() to it.
169 file() : _file_impl(nullptr) {}
/// Constructs a file object that refers to the given implementation object.
///
/// \param impl shared pointer to the implementation; it is moved into the
///             newly constructed file object.
171 file(shared_ptr<file_impl> impl)
172 : _file_impl(std::move(impl)) {}
174 /// Constructs a file object from a \ref file_handle obtained from another shard
175 explicit file(file_handle&& handle);
177 /// Checks whether the file object was initialized.
179 /// \return false if the file object is uninitialized (default
180 /// constructed), true if the file object refers to an actual file.
181 explicit operator bool() const noexcept { return bool(_file_impl); }
183 /// Copies a file object. The new and old objects refer to the
184 /// same underlying file.
186 /// \param x file object to be copied
187 file(const file& x) = default;
188 /// Moves a file object.
189 file(file&& x) noexcept : _file_impl(std::move(x._file_impl)) {}
190 /// Assigns a file object. After assignment, the destination and source refer
191 /// to the same underlying file.
193 /// \param x file object to assign to `this`.
194 file& operator=(const file& x) noexcept = default;
195 /// Move-assigns a file object.
196 file& operator=(file&& x) noexcept = default;
198 // O_DIRECT reading requires that buffer, offset, and read length, are
199 // all aligned. Alignment of 4096 was necessary in the past, but no longer
200 // is - 512 is usually enough; But we'll need to use BLKSSZGET ioctl to
201 // be sure it is really enough on this filesystem. 4096 is always safe.
202 // In addition, if we start reading in things outside page boundaries,
203 // we will end up with various pages around, some of them with
204 // overlapping ranges. Those would be very challenging to cache.
206 /// Alignment requirement for file offsets (for reads)
207 uint64_t disk_read_dma_alignment() const {
208 return _file_impl->_disk_read_dma_alignment;
211 /// Alignment requirement for file offsets (for writes)
212 uint64_t disk_write_dma_alignment() const {
213 return _file_impl->_disk_write_dma_alignment;
216 /// Alignment requirement for data buffers
217 uint64_t memory_dma_alignment() const {
218 return _file_impl->_memory_dma_alignment;
223 * Perform a single DMA read operation.
225 * @param aligned_pos offset to begin reading at (should be aligned)
226 * @param aligned_buffer output buffer (should be aligned)
227 * @param aligned_len number of bytes to read (should be aligned)
228 * @param pc the IO priority class under which to queue this operation
230 * Alignment is HW dependent but use 4KB alignment to be on the safe side as
233 * @return number of bytes actually read
234 * @throw exception in case of I/O error
236 template <typename CharType>
238 dma_read(uint64_t aligned_pos, CharType* aligned_buffer, size_t aligned_len, const io_priority_class& pc = default_priority_class()) {
239 return _file_impl->read_dma(aligned_pos, aligned_buffer, aligned_len, pc);
243 * Read the requested amount of bytes starting from the given offset.
245 * @param pos offset to begin reading from
246 * @param len number of bytes to read
247 * @param pc the IO priority class under which to queue this operation
249 * @return temporary buffer containing the requested data.
250 * @throw exception in case of I/O error
252 * This function doesn't require any alignment for both "pos" and "len"
254 * @note size of the returned buffer may be smaller than "len" if EOF is
255 * reached or in case of I/O error.
257 template <typename CharType>
258 future<temporary_buffer<CharType>> dma_read(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) {
259 return dma_read_bulk<CharType>(pos, len, pc).then(
260 [len] (temporary_buffer<CharType> buf) {
261 if (len < buf.size()) {
265 return std::move(buf);
/// Error thrown when attempting to read past end-of-file
/// with \ref dma_read_exactly().
///
/// Carries a fixed descriptive message so that the failure is identifiable
/// when the exception is logged or reported through a std::exception
/// reference, instead of the implementation-defined generic text.
class eof_error : public std::exception {
public:
    /// \return a static, null-terminated description of the error.
    const char* what() const noexcept override {
        return "attempt to read past end of file";
    }
};
274 * Read the exact amount of bytes.
276 * @param pos offset in a file to begin reading from
277 * @param len number of bytes to read
278 * @param pc the IO priority class under which to queue this operation
280 * @return temporary buffer containing the read data
281 * @throw eof_error if EOF is reached, file_io_error or
282 * std::system_error in case of I/O error.
284 template <typename CharType>
285 future<temporary_buffer<CharType>>
286 dma_read_exactly(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) {
287 return dma_read<CharType>(pos, len, pc).then(
288 [pos, len] (auto buf) {
289 if (buf.size() < len) {
293 return std::move(buf);
297 /// Performs a DMA read into the specified iovec.
299 /// \param pos offset to read from. Must be aligned to \ref dma_alignment.
300 /// \param iov vector of address/size pairs to read into. Addresses must be
302 /// \param pc the IO priority class under which to queue this operation
304 /// \return a future representing the number of bytes actually read. A short
305 /// read may happen due to end-of-file or an I/O error.
306 future<size_t> dma_read(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class()) {
307 return _file_impl->read_dma(pos, std::move(iov), pc);
310 /// Performs a DMA write from the specified buffer.
312 /// \param pos offset to write into. Must be aligned to \ref dma_alignment.
313 /// \param buffer aligned address of buffer to read from. Buffer must exist
314 /// until the future is made ready.
315 /// \param len number of bytes to write. Must be aligned.
316 /// \param pc the IO priority class under which to queue this operation
318 /// \return a future representing the number of bytes actually written. A short
319 /// write may happen due to an I/O error.
320 template <typename CharType>
321 future<size_t> dma_write(uint64_t pos, const CharType* buffer, size_t len, const io_priority_class& pc = default_priority_class()) {
322 return _file_impl->write_dma(pos, buffer, len, pc);
325 /// Performs a DMA write from the specified iovec.
327 /// \param pos offset to write into. Must be aligned to \ref dma_alignment.
328 /// \param iov vector of address/size pairs to write from. Addresses must be
330 /// \param pc the IO priority class under which to queue this operation
332 /// \return a future representing the number of bytes actually written. A short
333 /// write may happen due to an I/O error.
334 future<size_t> dma_write(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class()) {
335 return _file_impl->write_dma(pos, std::move(iov), pc);
338 /// Causes any previously written data to be made stable on persistent storage.
340 /// Prior to a flush, written data may or may not survive a power failure. After
341 /// a flush, data is guaranteed to be on disk.
343 return _file_impl->flush();
346 /// Returns \c stat information about the file.
347 future<struct stat> stat() {
348 return _file_impl->stat();
351 /// Truncates the file to a specified length.
352 future<> truncate(uint64_t length) {
353 return _file_impl->truncate(length);
356 /// Preallocate disk blocks for a specified byte range.
358 /// Requests the file system to allocate disk blocks to
359 /// back the specified range (\c length bytes starting at
360 /// \c position). The range may be outside the current file
361 /// size; the blocks can then be used when appending to the
364 /// \param position beginning of the range at which to allocate
366 /// \param length length of range to allocate.
367 /// \return future that becomes ready when the operation completes.
368 future<> allocate(uint64_t position, uint64_t length) {
369 return _file_impl->allocate(position, length);
372 /// Discard unneeded data from the file.
374 /// The discard operation tells the file system that a range of offsets
375 /// (which must be aligned) is no longer needed and can be reused.
376 future<> discard(uint64_t offset, uint64_t length) {
377 return _file_impl->discard(offset, length);
380 /// Gets the file size.
381 future<uint64_t> size() const {
382 return _file_impl->size();
387 /// Flushes any pending operations and release any resources associated with
388 /// the file (except for stable storage).
391 /// to ensure file data reaches stable storage, you must call \ref flush()
392 /// before calling \c close().
394 return _file_impl->close();
397 /// Returns a directory listing, given that this file object is a directory.
398 subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) {
399 return _file_impl->list_directory(std::move(next));
403 * Read a data bulk containing the provided addresses range that starts at
404 * the given offset and ends at either the address aligned to
405 * dma_alignment (4KB) or at the file end.
407 * @param offset starting address of the range the read bulk should contain
408 * @param range_size size of the addresses range
409 * @param pc the IO priority class under which to queue this operation
411 * @return temporary buffer containing the read data bulk.
412 * @throw system_error exception in case of I/O error or eof_error when
413 * "offset" is beyond EOF.
415 template <typename CharType>
416 future<temporary_buffer<CharType>>
417 dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc = default_priority_class()) {
418 return _file_impl->dma_read_bulk(offset, range_size, pc).then([] (temporary_buffer<uint8_t> t) {
419 return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
423 /// \brief Creates a handle that can be transported across shards.
425 /// Creates a handle that can be transported across shards, and then
426 /// used to create a new shard-local \ref file object that refers to
427 /// the same on-disk file.
429 /// \note Use on read-only files.
433 template <typename CharType>
436 friend class reactor;
437 friend class file_impl;
440 /// \brief A shard-transportable handle to a file
442 /// If you need to access a file (for reads only) across multiple shards,
443 /// you can use the file::dup() method to create a `file_handle`, transport
444 /// this file handle to another shard, and use the handle to create \ref file
445 /// object on that shard. This is more efficient than calling open_file_dma()
448 std::unique_ptr<file_handle_impl> _impl;
/// Constructs a file handle that takes ownership of the given
/// implementation object (moved into the handle).
450 explicit file_handle(std::unique_ptr<file_handle_impl> impl) : _impl(std::move(impl)) {}
452 /// Copies a file handle object
453 file_handle(const file_handle&);
454 /// Moves a file handle object
455 file_handle(file_handle&&) noexcept;
456 /// Assigns a file handle object
457 file_handle& operator=(const file_handle&);
458 /// Move-assigns a file handle object
459 file_handle& operator=(file_handle&&) noexcept;
460 /// Converts the file handle object to a \ref file.
461 file to_file() const &;
462 /// Converts the file handle object to a \ref file.
470 template <typename CharType>
471 struct file::read_state {
472 typedef temporary_buffer<CharType> tmp_buf_type;
474 read_state(uint64_t offset, uint64_t front, size_t to_read,
475 size_t memory_alignment, size_t disk_alignment)
476 : buf(tmp_buf_type::aligned(memory_alignment,
477 align_up(to_read, disk_alignment)))
483 return eof || pos >= _to_read;
487 * Trim the buffer to the actual number of read bytes and cut the
488 * bytes from offset 0 till "_front".
490 * @note this function has to be called only if we read bytes beyond
493 void trim_buf_before_ret() {
494 if (have_good_bytes()) {
496 buf.trim_front(_front);
502 uint64_t cur_offset() const {
503 return _offset + pos;
506 size_t left_space() const {
507 return buf.size() - pos;
510 size_t left_to_read() const {
511 // positive as long as (done() == false)
512 return _to_read - pos;
515 void append_new_data(tmp_buf_type& new_data) {
516 auto to_copy = std::min(left_space(), new_data.size());
518 std::memcpy(buf.get_write() + pos, new_data.get(), to_copy);
522 bool have_good_bytes() const {