]> git.proxmox.com Git - ceph.git/blame - ceph/src/seastar/include/seastar/core/file.hh
buildsys: switch source download to quincy
[ceph.git] / ceph / src / seastar / include / seastar / core / file.hh
CommitLineData
11fdf7f2
TL
1/*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18/*
19 * Copyright 2015 Cloudius Systems
20 */
21
22#pragma once
23
f67539c2 24#include <seastar/core/do_with.hh>
11fdf7f2
TL
25#include <seastar/core/stream.hh>
26#include <seastar/core/sstring.hh>
27#include <seastar/core/shared_ptr.hh>
28#include <seastar/core/align.hh>
11fdf7f2 29#include <seastar/core/fair_queue.hh>
9f95a23c 30#include <seastar/core/file-types.hh>
11fdf7f2
TL
31#include <seastar/util/std-compat.hh>
32#include <system_error>
11fdf7f2
TL
33#include <sys/statvfs.h>
34#include <sys/ioctl.h>
35#include <linux/fs.h>
36#include <sys/uio.h>
37#include <unistd.h>
38
39namespace seastar {
40
41/// \addtogroup fileio-module
42/// @{
43
11fdf7f2
TL
44/// A directory entry being listed.
45struct directory_entry {
46 /// Name of the file in a directory entry. Will never be "." or "..". Only the last component is included.
47 sstring name;
48 /// Type of the directory entry, if known.
f67539c2 49 std::optional<directory_entry_type> type;
11fdf7f2
TL
50};
51
9f95a23c
TL
52/// Filesystem object stat information
53struct stat_data {
54 uint64_t device_id; // ID of device containing file
55 uint64_t inode_number; // Inode number
56 uint64_t mode; // File type and mode
57 directory_entry_type type;
58 uint64_t number_of_links;// Number of hard links
59 uint64_t uid; // User ID of owner
60 uint64_t gid; // Group ID of owner
61 uint64_t rdev; // Device ID (if special file)
62 uint64_t size; // Total size, in bytes
63 uint64_t block_size; // Block size for filesystem I/O
64 uint64_t allocated_size; // Total size of allocated storage, in bytes
65
66 std::chrono::system_clock::time_point time_accessed; // Time of last content access
67 std::chrono::system_clock::time_point time_modified; // Time of last content modification
68 std::chrono::system_clock::time_point time_changed; // Time of last status change (either content or attributes)
69};
70
11fdf7f2
TL
71/// File open options
72///
73/// Options used to configure an open file.
74///
75/// \ref file
76struct file_open_options {
77 uint64_t extent_allocation_size_hint = 1 << 20; ///< Allocate this much disk space when extending the file
78 bool sloppy_size = false; ///< Allow the file size not to track the amount of data written until a flush
79 uint64_t sloppy_size_hint = 1 << 20; ///< Hint as to what the eventual file size will be
9f95a23c 80 file_permissions create_permissions = file_permissions::default_file_permissions; ///< File permissions to use when creating a file
11fdf7f2
TL
81};
82
83/// \cond internal
84class io_queue;
9f95a23c 85using io_priority_class_id = unsigned;
11fdf7f2 86class io_priority_class {
9f95a23c 87 io_priority_class_id _id;
11fdf7f2 88 friend io_queue;
9f95a23c 89
f67539c2
TL
90 io_priority_class() = delete;
91 explicit io_priority_class(io_priority_class_id id) noexcept
9f95a23c
TL
92 : _id(id)
93 { }
94
11fdf7f2 95public:
9f95a23c
TL
96 io_priority_class_id id() const {
97 return _id;
11fdf7f2
TL
98 }
99};
100
101const io_priority_class& default_priority_class();
102
103class file;
104class file_impl;
105
106class file_handle;
107
108// A handle that can be transported across shards and used to
109// create a dup(2)-like `file` object referring to the same underlying file
110class file_handle_impl {
111public:
112 virtual ~file_handle_impl() = default;
113 virtual std::unique_ptr<file_handle_impl> clone() const = 0;
114 virtual shared_ptr<file_impl> to_file() && = 0;
115};
116
117class file_impl {
118protected:
119 static file_impl* get_file_impl(file& f);
120public:
121 unsigned _memory_dma_alignment = 4096;
122 unsigned _disk_read_dma_alignment = 4096;
123 unsigned _disk_write_dma_alignment = 4096;
124public:
125 virtual ~file_impl() {}
126
127 virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) = 0;
128 virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
129 virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) = 0;
130 virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
131 virtual future<> flush(void) = 0;
132 virtual future<struct stat> stat(void) = 0;
133 virtual future<> truncate(uint64_t length) = 0;
134 virtual future<> discard(uint64_t offset, uint64_t length) = 0;
135 virtual future<> allocate(uint64_t position, uint64_t length) = 0;
136 virtual future<uint64_t> size(void) = 0;
137 virtual future<> close() = 0;
138 virtual std::unique_ptr<file_handle_impl> dup();
139 virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) = 0;
140 virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) = 0;
141
142 friend class reactor;
143};
144
f67539c2
TL
145future<shared_ptr<file_impl>> make_file_impl(int fd, file_open_options options, int oflags) noexcept;
146
11fdf7f2
TL
147/// \endcond
148
149/// A data file on persistent storage.
150///
151/// File objects represent uncached, unbuffered files. As such great care
152/// must be taken to cache data at the application layer; neither seastar
153/// nor the OS will cache these file.
154///
155/// Data is transferred using direct memory access (DMA). This imposes
156/// restrictions on file offsets and data pointers. The former must be aligned
157/// on a 4096 byte boundary, while a 512 byte boundary suffices for the latter.
158class file {
159 shared_ptr<file_impl> _file_impl;
11fdf7f2
TL
160public:
161 /// Default constructor constructs an uninitialized file object.
162 ///
163 /// A default constructor is useful for the common practice of declaring
164 /// a variable, and only assigning to it later. The uninitialized file
165 /// must not be used, or undefined behavior will result (currently, a null
166 /// pointer dereference).
167 ///
168 /// One can check whether a file object is in uninitialized state with
169 /// \ref operator bool(); One can reset a file back to uninitialized state
170 /// by assigning file() to it.
f67539c2 171 file() noexcept : _file_impl(nullptr) {}
11fdf7f2 172
f67539c2 173 file(shared_ptr<file_impl> impl) noexcept
11fdf7f2
TL
174 : _file_impl(std::move(impl)) {}
175
176 /// Constructs a file object from a \ref file_handle obtained from another shard
f67539c2 177 explicit file(file_handle&& handle) noexcept;
11fdf7f2
TL
178
179 /// Checks whether the file object was initialized.
180 ///
181 /// \return false if the file object is uninitialized (default
182 /// constructed), true if the file object refers to an actual file.
183 explicit operator bool() const noexcept { return bool(_file_impl); }
184
185 /// Copies a file object. The new and old objects refer to the
186 /// same underlying file.
187 ///
188 /// \param x file object to be copied
189 file(const file& x) = default;
190 /// Moves a file object.
191 file(file&& x) noexcept : _file_impl(std::move(x._file_impl)) {}
192 /// Assigns a file object. After assignent, the destination and source refer
193 /// to the same underlying file.
194 ///
195 /// \param x file object to assign to `this`.
196 file& operator=(const file& x) noexcept = default;
197 /// Moves assigns a file object.
198 file& operator=(file&& x) noexcept = default;
199
200 // O_DIRECT reading requires that buffer, offset, and read length, are
201 // all aligned. Alignment of 4096 was necessary in the past, but no longer
202 // is - 512 is usually enough; But we'll need to use BLKSSZGET ioctl to
203 // be sure it is really enough on this filesystem. 4096 is always safe.
204 // In addition, if we start reading in things outside page boundaries,
205 // we will end up with various pages around, some of them with
206 // overlapping ranges. Those would be very challenging to cache.
207
208 /// Alignment requirement for file offsets (for reads)
f67539c2 209 uint64_t disk_read_dma_alignment() const noexcept {
11fdf7f2
TL
210 return _file_impl->_disk_read_dma_alignment;
211 }
212
213 /// Alignment requirement for file offsets (for writes)
f67539c2 214 uint64_t disk_write_dma_alignment() const noexcept {
11fdf7f2
TL
215 return _file_impl->_disk_write_dma_alignment;
216 }
217
218 /// Alignment requirement for data buffers
f67539c2 219 uint64_t memory_dma_alignment() const noexcept {
11fdf7f2
TL
220 return _file_impl->_memory_dma_alignment;
221 }
222
223
224 /**
225 * Perform a single DMA read operation.
226 *
227 * @param aligned_pos offset to begin reading at (should be aligned)
228 * @param aligned_buffer output buffer (should be aligned)
229 * @param aligned_len number of bytes to read (should be aligned)
230 * @param pc the IO priority class under which to queue this operation
231 *
232 * Alignment is HW dependent but use 4KB alignment to be on the safe side as
233 * explained above.
234 *
235 * @return number of bytes actually read
f67539c2 236 * or exceptional future in case of I/O error
11fdf7f2
TL
237 */
238 template <typename CharType>
239 future<size_t>
f67539c2
TL
240 dma_read(uint64_t aligned_pos, CharType* aligned_buffer, size_t aligned_len, const io_priority_class& pc = default_priority_class()) noexcept {
241 return dma_read_impl(aligned_pos, reinterpret_cast<uint8_t*>(aligned_buffer), aligned_len, pc);
11fdf7f2
TL
242 }
243
244 /**
245 * Read the requested amount of bytes starting from the given offset.
246 *
247 * @param pos offset to begin reading from
248 * @param len number of bytes to read
249 * @param pc the IO priority class under which to queue this operation
250 *
251 * @return temporary buffer containing the requested data.
f67539c2 252 * or exceptional future in case of I/O error
11fdf7f2
TL
253 *
254 * This function doesn't require any alignment for both "pos" and "len"
255 *
256 * @note size of the returned buffer may be smaller than "len" if EOF is
f67539c2 257 * reached or in case of I/O error.
11fdf7f2
TL
258 */
259 template <typename CharType>
f67539c2
TL
260 future<temporary_buffer<CharType>> dma_read(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) noexcept {
261 return dma_read_impl(pos, len, pc).then([] (temporary_buffer<uint8_t> t) {
262 return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
11fdf7f2
TL
263 });
264 }
265
266 /// Error thrown when attempting to read past end-of-file
267 /// with \ref dma_read_exactly().
268 class eof_error : public std::exception {};
269
270 /**
271 * Read the exact amount of bytes.
272 *
273 * @param pos offset in a file to begin reading from
274 * @param len number of bytes to read
275 * @param pc the IO priority class under which to queue this operation
276 *
277 * @return temporary buffer containing the read data
f67539c2
TL
278 * or exceptional future in case an error, holding:
279 * end_of_file_error if EOF is reached, file_io_error or
11fdf7f2
TL
280 * std::system_error in case of I/O error.
281 */
282 template <typename CharType>
283 future<temporary_buffer<CharType>>
f67539c2
TL
284 dma_read_exactly(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) noexcept {
285 return dma_read_exactly_impl(pos, len, pc).then([] (temporary_buffer<uint8_t> t) {
286 return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
11fdf7f2
TL
287 });
288 }
289
290 /// Performs a DMA read into the specified iovec.
291 ///
f67539c2 292 /// \param pos offset to read from. Must be aligned to \ref disk_read_dma_alignment.
11fdf7f2
TL
293 /// \param iov vector of address/size pairs to read into. Addresses must be
294 /// aligned.
295 /// \param pc the IO priority class under which to queue this operation
296 ///
297 /// \return a future representing the number of bytes actually read. A short
298 /// read may happen due to end-of-file or an I/O error.
f67539c2 299 future<size_t> dma_read(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class()) noexcept;
11fdf7f2
TL
300
301 /// Performs a DMA write from the specified buffer.
302 ///
f67539c2 303 /// \param pos offset to write into. Must be aligned to \ref disk_write_dma_alignment.
11fdf7f2
TL
304 /// \param buffer aligned address of buffer to read from. Buffer must exists
305 /// until the future is made ready.
306 /// \param len number of bytes to write. Must be aligned.
307 /// \param pc the IO priority class under which to queue this operation
308 ///
309 /// \return a future representing the number of bytes actually written. A short
310 /// write may happen due to an I/O error.
311 template <typename CharType>
f67539c2
TL
312 future<size_t> dma_write(uint64_t pos, const CharType* buffer, size_t len, const io_priority_class& pc = default_priority_class()) noexcept {
313 return dma_write_impl(pos, reinterpret_cast<const uint8_t*>(buffer), len, pc);
11fdf7f2
TL
314 }
315
316 /// Performs a DMA write to the specified iovec.
317 ///
f67539c2 318 /// \param pos offset to write into. Must be aligned to \ref disk_write_dma_alignment.
11fdf7f2
TL
319 /// \param iov vector of address/size pairs to write from. Addresses must be
320 /// aligned.
321 /// \param pc the IO priority class under which to queue this operation
322 ///
323 /// \return a future representing the number of bytes actually written. A short
324 /// write may happen due to an I/O error.
f67539c2 325 future<size_t> dma_write(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class()) noexcept;
11fdf7f2
TL
326
327 /// Causes any previously written data to be made stable on persistent storage.
328 ///
329 /// Prior to a flush, written data may or may not survive a power failure. After
330 /// a flush, data is guaranteed to be on disk.
f67539c2 331 future<> flush() noexcept;
11fdf7f2
TL
332
333 /// Returns \c stat information about the file.
f67539c2 334 future<struct stat> stat() noexcept;
11fdf7f2
TL
335
336 /// Truncates the file to a specified length.
f67539c2 337 future<> truncate(uint64_t length) noexcept;
11fdf7f2
TL
338
339 /// Preallocate disk blocks for a specified byte range.
340 ///
341 /// Requests the file system to allocate disk blocks to
342 /// back the specified range (\c length bytes starting at
343 /// \c position). The range may be outside the current file
344 /// size; the blocks can then be used when appending to the
345 /// file.
346 ///
347 /// \param position beginning of the range at which to allocate
348 /// blocks.
f67539c2 349 /// \param length length of range to allocate.
11fdf7f2 350 /// \return future that becomes ready when the operation completes.
f67539c2 351 future<> allocate(uint64_t position, uint64_t length) noexcept;
11fdf7f2
TL
352
353 /// Discard unneeded data from the file.
354 ///
355 /// The discard operation tells the file system that a range of offsets
356 /// (which be aligned) is no longer needed and can be reused.
f67539c2 357 future<> discard(uint64_t offset, uint64_t length) noexcept;
11fdf7f2
TL
358
359 /// Gets the file size.
f67539c2 360 future<uint64_t> size() const noexcept;
11fdf7f2
TL
361
362 /// Closes the file.
363 ///
364 /// Flushes any pending operations and release any resources associated with
365 /// the file (except for stable storage).
366 ///
367 /// \note
368 /// to ensure file data reaches stable storage, you must call \ref flush()
369 /// before calling \c close().
f67539c2 370 future<> close() noexcept;
11fdf7f2
TL
371
372 /// Returns a directory listing, given that this file object is a directory.
f67539c2 373 subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next);
11fdf7f2
TL
374
375 /**
376 * Read a data bulk containing the provided addresses range that starts at
377 * the given offset and ends at either the address aligned to
378 * dma_alignment (4KB) or at the file end.
379 *
380 * @param offset starting address of the range the read bulk should contain
381 * @param range_size size of the addresses range
382 * @param pc the IO priority class under which to queue this operation
383 *
384 * @return temporary buffer containing the read data bulk.
f67539c2
TL
385 * or exceptional future holding:
386 * system_error exception in case of I/O error or eof_error when
11fdf7f2
TL
387 * "offset" is beyond EOF.
388 */
389 template <typename CharType>
390 future<temporary_buffer<CharType>>
f67539c2
TL
391 dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc = default_priority_class()) noexcept {
392 return dma_read_bulk_impl(offset, range_size, pc).then([] (temporary_buffer<uint8_t> t) {
11fdf7f2
TL
393 return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
394 });
395 }
396
397 /// \brief Creates a handle that can be transported across shards.
398 ///
399 /// Creates a handle that can be transported across shards, and then
400 /// used to create a new shard-local \ref file object that refers to
401 /// the same on-disk file.
402 ///
403 /// \note Use on read-only files.
404 ///
405 file_handle dup();
406
407 template <typename CharType>
408 struct read_state;
409private:
f67539c2
TL
410 future<temporary_buffer<uint8_t>>
411 dma_read_bulk_impl(uint64_t offset, size_t range_size, const io_priority_class& pc) noexcept;
412
413 future<size_t>
414 dma_write_impl(uint64_t pos, const uint8_t* buffer, size_t len, const io_priority_class& pc) noexcept;
415
416 future<temporary_buffer<uint8_t>>
417 dma_read_impl(uint64_t pos, size_t len, const io_priority_class& pc) noexcept;
418
419 future<size_t>
420 dma_read_impl(uint64_t aligned_pos, uint8_t* aligned_buffer, size_t aligned_len, const io_priority_class& pc) noexcept;
421
422 future<temporary_buffer<uint8_t>>
423 dma_read_exactly_impl(uint64_t pos, size_t len, const io_priority_class& pc) noexcept;
424
11fdf7f2
TL
425 friend class reactor;
426 friend class file_impl;
427};
428
f67539c2
TL
429/// \brief Helper for ensuring a file is closed after \c func is called.
430///
431/// The file provided by the \c file_fut future is passed to \c func.
432///
433/// \param file_fut A future that produces a file
434/// \param func A function that uses a file
435/// \returns the future returned by \c func, or an exceptional future if either \c file_fut or closing the file failed.
436template <typename Func>
437SEASTAR_CONCEPT( requires std::invocable<Func, file&> && std::is_nothrow_move_constructible_v<Func> )
438auto with_file(future<file> file_fut, Func func) noexcept {
439 static_assert(std::is_nothrow_move_constructible_v<Func>, "Func's move constructor must not throw");
440 return file_fut.then([func = std::move(func)] (file f) mutable {
441 return do_with(std::move(f), [func = std::move(func)] (file& f) mutable {
442 return futurize_invoke(func, f).finally([&f] {
443 return f.close();
444 });
445 });
446 });
447}
448
449/// \brief Helper for ensuring a file is closed if \c func fails.
450///
451/// The file provided by the \c file_fut future is passed to \c func.
452/// * If func throws an exception E, the file is closed and we return
453/// a failed future with E.
454/// * If func returns a value V, the file is not closed and we return
455/// a future with V.
456/// Note that when an exception is not thrown, it is the
457/// responsibility of func to make sure the file will be closed. It
458/// can close the file itself, return it, or store it somewhere.
459///
460/// \param file_fut A future that produces a file
461/// \param func A function that uses a file
462/// \returns the future returned by \c func, or an exceptional future if \c file_fut failed or a nested exception if closing the file failed.
463template <typename Func>
464SEASTAR_CONCEPT( requires std::invocable<Func, file&> && std::is_nothrow_move_constructible_v<Func> )
465auto with_file_close_on_failure(future<file> file_fut, Func func) noexcept {
466 static_assert(std::is_nothrow_move_constructible_v<Func>, "Func's move constructor must not throw");
467 return file_fut.then([func = std::move(func)] (file f) mutable {
468 return do_with(std::move(f), [func = std::move(func)] (file& f) mutable {
469 return futurize_invoke(std::move(func), f).then_wrapped([&f] (auto ret) mutable {
470 if (!ret.failed()) {
471 return ret;
472 }
473 return ret.finally([&f] {
474 // If f.close() fails, return that as nested exception.
475 return f.close();
476 });
477 });
478 });
479 });
480}
481
482/// \example file_demo.cc
483/// A program demonstrating the use of \ref seastar::with_file
484/// and \ref seastar::with_file_close_on_failure
485
11fdf7f2
TL
486/// \brief A shard-transportable handle to a file
487///
488/// If you need to access a file (for reads only) across multiple shards,
489/// you can use the file::dup() method to create a `file_handle`, transport
490/// this file handle to another shard, and use the handle to create \ref file
491/// object on that shard. This is more efficient than calling open_file_dma()
492/// again.
493class file_handle {
494 std::unique_ptr<file_handle_impl> _impl;
495private:
496 explicit file_handle(std::unique_ptr<file_handle_impl> impl) : _impl(std::move(impl)) {}
497public:
498 /// Copies a file handle object
499 file_handle(const file_handle&);
500 /// Moves a file handle object
501 file_handle(file_handle&&) noexcept;
502 /// Assigns a file handle object
503 file_handle& operator=(const file_handle&);
504 /// Move-assigns a file handle object
505 file_handle& operator=(file_handle&&) noexcept;
506 /// Converts the file handle object to a \ref file.
507 file to_file() const &;
508 /// Converts the file handle object to a \ref file.
509 file to_file() &&;
510
511 friend class file;
512};
513
514/// \cond internal
515
516template <typename CharType>
517struct file::read_state {
518 typedef temporary_buffer<CharType> tmp_buf_type;
519
520 read_state(uint64_t offset, uint64_t front, size_t to_read,
521 size_t memory_alignment, size_t disk_alignment)
522 : buf(tmp_buf_type::aligned(memory_alignment,
523 align_up(to_read, disk_alignment)))
524 , _offset(offset)
525 , _to_read(to_read)
526 , _front(front) {}
527
528 bool done() const {
529 return eof || pos >= _to_read;
530 }
531
532 /**
533 * Trim the buffer to the actual number of read bytes and cut the
534 * bytes from offset 0 till "_front".
535 *
536 * @note this function has to be called only if we read bytes beyond
537 * "_front".
538 */
539 void trim_buf_before_ret() {
540 if (have_good_bytes()) {
541 buf.trim(pos);
542 buf.trim_front(_front);
543 } else {
544 buf.trim(0);
545 }
546 }
547
548 uint64_t cur_offset() const {
549 return _offset + pos;
550 }
551
552 size_t left_space() const {
553 return buf.size() - pos;
554 }
555
556 size_t left_to_read() const {
557 // positive as long as (done() == false)
558 return _to_read - pos;
559 }
560
561 void append_new_data(tmp_buf_type& new_data) {
562 auto to_copy = std::min(left_space(), new_data.size());
563
564 std::memcpy(buf.get_write() + pos, new_data.get(), to_copy);
565 pos += to_copy;
566 }
567
568 bool have_good_bytes() const {
569 return pos > _front;
570 }
571
572public:
573 bool eof = false;
574 tmp_buf_type buf;
575 size_t pos = 0;
576private:
577 uint64_t _offset;
578 size_t _to_read;
579 uint64_t _front;
580};
581
582/// \endcond
583
584/// @}
585
586}