]> git.proxmox.com Git - ceph.git/blame - ceph/src/seastar/include/seastar/core/file.hh
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / seastar / include / seastar / core / file.hh
CommitLineData
11fdf7f2
TL
1/*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18/*
19 * Copyright 2015 Cloudius Systems
20 */
21
22#pragma once
23
f67539c2 24#include <seastar/core/do_with.hh>
11fdf7f2
TL
25#include <seastar/core/stream.hh>
26#include <seastar/core/sstring.hh>
27#include <seastar/core/shared_ptr.hh>
28#include <seastar/core/align.hh>
20effc67 29#include <seastar/core/io_priority_class.hh>
9f95a23c 30#include <seastar/core/file-types.hh>
11fdf7f2
TL
31#include <seastar/util/std-compat.hh>
32#include <system_error>
11fdf7f2
TL
33#include <sys/statvfs.h>
34#include <sys/ioctl.h>
35#include <linux/fs.h>
36#include <sys/uio.h>
37#include <unistd.h>
38
39namespace seastar {
40
41/// \addtogroup fileio-module
42/// @{
43
11fdf7f2
TL
44/// A directory entry being listed.
45struct directory_entry {
46 /// Name of the file in a directory entry. Will never be "." or "..". Only the last component is included.
47 sstring name;
48 /// Type of the directory entry, if known.
f67539c2 49 std::optional<directory_entry_type> type;
11fdf7f2
TL
50};
51
9f95a23c
TL
52/// Filesystem object stat information
53struct stat_data {
54 uint64_t device_id; // ID of device containing file
55 uint64_t inode_number; // Inode number
56 uint64_t mode; // File type and mode
57 directory_entry_type type;
58 uint64_t number_of_links;// Number of hard links
59 uint64_t uid; // User ID of owner
60 uint64_t gid; // Group ID of owner
61 uint64_t rdev; // Device ID (if special file)
62 uint64_t size; // Total size, in bytes
63 uint64_t block_size; // Block size for filesystem I/O
64 uint64_t allocated_size; // Total size of allocated storage, in bytes
65
66 std::chrono::system_clock::time_point time_accessed; // Time of last content access
67 std::chrono::system_clock::time_point time_modified; // Time of last content modification
68 std::chrono::system_clock::time_point time_changed; // Time of last status change (either content or attributes)
69};
70
11fdf7f2
TL
71/// File open options
72///
73/// Options used to configure an open file.
74///
75/// \ref file
76struct file_open_options {
77 uint64_t extent_allocation_size_hint = 1 << 20; ///< Allocate this much disk space when extending the file
78 bool sloppy_size = false; ///< Allow the file size not to track the amount of data written until a flush
79 uint64_t sloppy_size_hint = 1 << 20; ///< Hint as to what the eventual file size will be
9f95a23c 80 file_permissions create_permissions = file_permissions::default_file_permissions; ///< File permissions to use when creating a file
20effc67 81 bool append_is_unlikely = false; ///< Hint that user promises (or at least tries hard) not to write behind file size
9f95a23c 82
20effc67
TL
83 // The fsxattr.fsx_extsize is 32-bit
84 static constexpr uint64_t max_extent_allocation_size_hint = 1 << 31;
11fdf7f2
TL
85};
86
11fdf7f2
TL
87class file;
88class file_impl;
20effc67 89class io_intent;
11fdf7f2
TL
90class file_handle;
91
92// A handle that can be transported across shards and used to
93// create a dup(2)-like `file` object referring to the same underlying file
94class file_handle_impl {
95public:
96 virtual ~file_handle_impl() = default;
97 virtual std::unique_ptr<file_handle_impl> clone() const = 0;
98 virtual shared_ptr<file_impl> to_file() && = 0;
99};
100
101class file_impl {
20effc67 102 friend class file;
11fdf7f2
TL
103protected:
104 static file_impl* get_file_impl(file& f);
11fdf7f2
TL
105 unsigned _memory_dma_alignment = 4096;
106 unsigned _disk_read_dma_alignment = 4096;
107 unsigned _disk_write_dma_alignment = 4096;
20effc67
TL
108 unsigned _disk_overwrite_dma_alignment = 4096;
109 unsigned _read_max_length = 1u << 30;
110 unsigned _write_max_length = 1u << 30;
11fdf7f2
TL
111public:
112 virtual ~file_impl() {}
113
114 virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) = 0;
115 virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
116 virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) = 0;
117 virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
20effc67
TL
118
119 virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent*) {
120 return write_dma(pos, buffer, len, pc);
121 }
122 virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent*) {
123 return write_dma(pos, std::move(iov), pc);
124 }
125 virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent*) {
126 return read_dma(pos, buffer, len, pc);
127 }
128 virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent*) {
129 return read_dma(pos, std::move(iov), pc);
130 }
131
11fdf7f2
TL
132 virtual future<> flush(void) = 0;
133 virtual future<struct stat> stat(void) = 0;
134 virtual future<> truncate(uint64_t length) = 0;
135 virtual future<> discard(uint64_t offset, uint64_t length) = 0;
20effc67
TL
136 virtual future<int> ioctl(uint64_t cmd, void* argp) noexcept;
137 virtual future<int> ioctl_short(uint64_t cmd, void* argp) noexcept;
138 virtual future<int> fcntl(int op, uintptr_t arg) noexcept;
139 virtual future<int> fcntl_short(int op, uintptr_t arg) noexcept;
11fdf7f2
TL
140 virtual future<> allocate(uint64_t position, uint64_t length) = 0;
141 virtual future<uint64_t> size(void) = 0;
142 virtual future<> close() = 0;
143 virtual std::unique_ptr<file_handle_impl> dup();
144 virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) = 0;
145 virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) = 0;
20effc67
TL
146 virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent*) {
147 return dma_read_bulk(offset, range_size, pc);
148 }
11fdf7f2
TL
149
150 friend class reactor;
151};
152
f67539c2
TL
153future<shared_ptr<file_impl>> make_file_impl(int fd, file_open_options options, int oflags) noexcept;
154
11fdf7f2
TL
155/// \endcond
156
157/// A data file on persistent storage.
158///
159/// File objects represent uncached, unbuffered files. As such great care
160/// must be taken to cache data at the application layer; neither seastar
161 /// nor the OS will cache these files.
162///
163/// Data is transferred using direct memory access (DMA). This imposes
164/// restrictions on file offsets and data pointers. The former must be aligned
165/// on a 4096 byte boundary, while a 512 byte boundary suffices for the latter.
166class file {
167 shared_ptr<file_impl> _file_impl;
11fdf7f2
TL
168public:
169 /// Default constructor constructs an uninitialized file object.
170 ///
171 /// A default constructor is useful for the common practice of declaring
172 /// a variable, and only assigning to it later. The uninitialized file
173 /// must not be used, or undefined behavior will result (currently, a null
174 /// pointer dereference).
175 ///
176 /// One can check whether a file object is in uninitialized state with
177 /// \ref operator bool(); One can reset a file back to uninitialized state
178 /// by assigning file() to it.
f67539c2 179 file() noexcept : _file_impl(nullptr) {}
11fdf7f2 180
f67539c2 181 file(shared_ptr<file_impl> impl) noexcept
11fdf7f2
TL
182 : _file_impl(std::move(impl)) {}
183
184 /// Constructs a file object from a \ref file_handle obtained from another shard
f67539c2 185 explicit file(file_handle&& handle) noexcept;
11fdf7f2
TL
186
187 /// Checks whether the file object was initialized.
188 ///
189 /// \return false if the file object is uninitialized (default
190 /// constructed), true if the file object refers to an actual file.
191 explicit operator bool() const noexcept { return bool(_file_impl); }
192
193 /// Copies a file object. The new and old objects refer to the
194 /// same underlying file.
195 ///
196 /// \param x file object to be copied
197 file(const file& x) = default;
198 /// Moves a file object.
199 file(file&& x) noexcept : _file_impl(std::move(x._file_impl)) {}
200 /// Assigns a file object. After assignment, the destination and source refer
201 /// to the same underlying file.
202 ///
203 /// \param x file object to assign to `this`.
204 file& operator=(const file& x) noexcept = default;
205 /// Move-assigns a file object.
206 file& operator=(file&& x) noexcept = default;
207
208 // O_DIRECT reading requires that buffer, offset, and read length, are
209 // all aligned. Alignment of 4096 was necessary in the past, but no longer
210 // is - 512 is usually enough; But we'll need to use BLKSSZGET ioctl to
211 // be sure it is really enough on this filesystem. 4096 is always safe.
212 // In addition, if we start reading in things outside page boundaries,
213 // we will end up with various pages around, some of them with
214 // overlapping ranges. Those would be very challenging to cache.
215
216 /// Alignment requirement for file offsets (for reads)
f67539c2 217 uint64_t disk_read_dma_alignment() const noexcept {
11fdf7f2
TL
218 return _file_impl->_disk_read_dma_alignment;
219 }
220
221 /// Alignment requirement for file offsets (for writes)
f67539c2 222 uint64_t disk_write_dma_alignment() const noexcept {
11fdf7f2
TL
223 return _file_impl->_disk_write_dma_alignment;
224 }
225
20effc67
TL
226 /// Alignment requirement for file offsets (for overwrites).
227 ///
228 /// Specifies the minimum alignment for disk offsets for
229 /// overwrites (writes to a location that was previously written).
230 /// This can be smaller than \ref disk_write_dma_alignment(), allowing
231 /// a reduction in disk bandwidth used.
232 uint64_t disk_overwrite_dma_alignment() const noexcept {
233 return _file_impl->_disk_overwrite_dma_alignment;
234 }
235
11fdf7f2 236 /// Alignment requirement for data buffers
f67539c2 237 uint64_t memory_dma_alignment() const noexcept {
11fdf7f2
TL
238 return _file_impl->_memory_dma_alignment;
239 }
240
20effc67
TL
241 /// Recommended limit for read request size.
242 /// Submitting a larger request will not cause any error,
243 /// but may result in poor latencies for this and any other
244 /// concurrent requests
245 size_t disk_read_max_length() const noexcept {
246 return _file_impl->_read_max_length;
247 }
248
249 /// Recommended limit for write request size.
250 /// Submitting a larger request will not cause any error,
251 /// but may result in poor latencies for this and any other
252 /// concurrent requests
253 size_t disk_write_max_length() const noexcept {
254 return _file_impl->_write_max_length;
255 }
11fdf7f2
TL
256
257 /**
258 * Perform a single DMA read operation.
259 *
260 * @param aligned_pos offset to begin reading at (should be aligned)
261 * @param aligned_buffer output buffer (should be aligned)
262 * @param aligned_len number of bytes to read (should be aligned)
263 * @param pc the IO priority class under which to queue this operation
20effc67 264 * @param intent the IO intention confirmation (\ref seastar::io_intent)
11fdf7f2
TL
265 *
266 * Alignment is HW dependent but use 4KB alignment to be on the safe side as
267 * explained above.
268 *
269 * @return number of bytes actually read
f67539c2 270 * or exceptional future in case of I/O error
11fdf7f2
TL
271 */
272 template <typename CharType>
273 future<size_t>
20effc67
TL
274 dma_read(uint64_t aligned_pos, CharType* aligned_buffer, size_t aligned_len, const io_priority_class& pc = default_priority_class(), io_intent* intent = nullptr) noexcept {
275 return dma_read_impl(aligned_pos, reinterpret_cast<uint8_t*>(aligned_buffer), aligned_len, pc, intent);
11fdf7f2
TL
276 }
277
278 /**
279 * Read the requested amount of bytes starting from the given offset.
280 *
281 * @param pos offset to begin reading from
282 * @param len number of bytes to read
283 * @param pc the IO priority class under which to queue this operation
20effc67 284 * @param intent the IO intention confirmation (\ref seastar::io_intent)
11fdf7f2
TL
285 *
286 * @return temporary buffer containing the requested data.
f67539c2 287 * or exceptional future in case of I/O error
11fdf7f2
TL
288 *
289 * This function doesn't require any alignment for both "pos" and "len"
290 *
291 * @note size of the returned buffer may be smaller than "len" if EOF is
f67539c2 292 * reached or in case of I/O error.
11fdf7f2
TL
293 */
294 template <typename CharType>
20effc67
TL
295 future<temporary_buffer<CharType>> dma_read(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class(), io_intent* intent = nullptr) noexcept {
296 return dma_read_impl(pos, len, pc, intent).then([] (temporary_buffer<uint8_t> t) {
f67539c2 297 return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
11fdf7f2
TL
298 });
299 }
300
301 /// Error thrown when attempting to read past end-of-file
302 /// with \ref dma_read_exactly().
303 class eof_error : public std::exception {};
304
305 /**
306 * Read the exact amount of bytes.
307 *
308 * @param pos offset in a file to begin reading from
309 * @param len number of bytes to read
310 * @param pc the IO priority class under which to queue this operation
20effc67 311 * @param intent the IO intention confirmation (\ref seastar::io_intent)
11fdf7f2
TL
312 *
313 * @return temporary buffer containing the read data
f67539c2
TL
314 * or exceptional future in case an error, holding:
315 * eof_error if EOF is reached, file_io_error or
11fdf7f2
TL
316 * std::system_error in case of I/O error.
317 */
318 template <typename CharType>
319 future<temporary_buffer<CharType>>
20effc67
TL
320 dma_read_exactly(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class(), io_intent* intent = nullptr) noexcept {
321 return dma_read_exactly_impl(pos, len, pc, intent).then([] (temporary_buffer<uint8_t> t) {
f67539c2 322 return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
11fdf7f2
TL
323 });
324 }
325
326 /// Performs a DMA read into the specified iovec.
327 ///
f67539c2 328 /// \param pos offset to read from. Must be aligned to \ref disk_read_dma_alignment.
11fdf7f2
TL
329 /// \param iov vector of address/size pairs to read into. Addresses must be
330 /// aligned.
331 /// \param pc the IO priority class under which to queue this operation
20effc67 332 /// \param intent the IO intention confirmation (\ref seastar::io_intent)
11fdf7f2
TL
333 ///
334 /// \return a future representing the number of bytes actually read. A short
335 /// read may happen due to end-of-file or an I/O error.
20effc67 336 future<size_t> dma_read(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class(), io_intent* intent = nullptr) noexcept;
11fdf7f2
TL
337
338 /// Performs a DMA write from the specified buffer.
339 ///
f67539c2 340 /// \param pos offset to write into. Must be aligned to \ref disk_write_dma_alignment.
11fdf7f2
TL
341 /// \param buffer aligned address of buffer to read from. Buffer must exist
342 /// until the future is made ready.
343 /// \param len number of bytes to write. Must be aligned.
344 /// \param pc the IO priority class under which to queue this operation
20effc67 345 /// \param intent the IO intention confirmation (\ref seastar::io_intent)
11fdf7f2
TL
346 ///
347 /// \return a future representing the number of bytes actually written. A short
348 /// write may happen due to an I/O error.
349 template <typename CharType>
20effc67
TL
350 future<size_t> dma_write(uint64_t pos, const CharType* buffer, size_t len, const io_priority_class& pc = default_priority_class(), io_intent* intent = nullptr) noexcept {
351 return dma_write_impl(pos, reinterpret_cast<const uint8_t*>(buffer), len, pc, intent);
11fdf7f2
TL
352 }
353
354 /// Performs a DMA write from the specified iovec.
355 ///
f67539c2 356 /// \param pos offset to write into. Must be aligned to \ref disk_write_dma_alignment.
11fdf7f2
TL
357 /// \param iov vector of address/size pairs to write from. Addresses must be
358 /// aligned.
359 /// \param pc the IO priority class under which to queue this operation
20effc67 360 /// \param intent the IO intention confirmation (\ref seastar::io_intent)
11fdf7f2
TL
361 ///
362 /// \return a future representing the number of bytes actually written. A short
363 /// write may happen due to an I/O error.
20effc67 364 future<size_t> dma_write(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class(), io_intent* intent = nullptr) noexcept;
11fdf7f2
TL
365
366 /// Causes any previously written data to be made stable on persistent storage.
367 ///
368 /// Prior to a flush, written data may or may not survive a power failure. After
369 /// a flush, data is guaranteed to be on disk.
f67539c2 370 future<> flush() noexcept;
11fdf7f2
TL
371
372 /// Returns \c stat information about the file.
f67539c2 373 future<struct stat> stat() noexcept;
11fdf7f2
TL
374
375 /// Truncates the file to a specified length.
f67539c2 376 future<> truncate(uint64_t length) noexcept;
11fdf7f2
TL
377
378 /// Preallocate disk blocks for a specified byte range.
379 ///
380 /// Requests the file system to allocate disk blocks to
381 /// back the specified range (\c length bytes starting at
382 /// \c position). The range may be outside the current file
383 /// size; the blocks can then be used when appending to the
384 /// file.
385 ///
386 /// \param position beginning of the range at which to allocate
387 /// blocks.
f67539c2 388 /// \param length length of range to allocate.
11fdf7f2 389 /// \return future that becomes ready when the operation completes.
f67539c2 390 future<> allocate(uint64_t position, uint64_t length) noexcept;
11fdf7f2
TL
391
392 /// Discard unneeded data from the file.
393 ///
394 /// The discard operation tells the file system that a range of offsets
395 /// (which should be aligned) is no longer needed and can be reused.
f67539c2 396 future<> discard(uint64_t offset, uint64_t length) noexcept;
11fdf7f2 397
20effc67
TL
398 /// Generic ioctl syscall support for special file handling.
399 ///
400 /// This interface is useful for many non-standard operations on seastar::file.
401 /// The examples can be - querying device or file system capabilities,
402 /// configuring special performance or access modes on devices etc.
403 /// Refer ioctl(2) man page for more details.
404 ///
405 /// \param cmd ioctl command to be executed
406 /// \param argp pointer to the buffer which holds the argument
407 ///
408 /// \return a future containing the return value if any, or an exceptional future
409 /// if the operation has failed.
410 future<int> ioctl(uint64_t cmd, void* argp) noexcept;
411
412 /// Performs a short ioctl syscall on seastar::file
413 ///
414 /// This is similar to generic \c ioctl; the difference is, here user indicates
415 /// that this operation is a short one, and does not involve any i/o or locking.
416 /// The \c file module will process this differently from the normal \ref ioctl().
417 /// Use this method only if the user is sure that the operation does not involve any
418 /// blocking operation. If unsure, use the default \ref ioctl() method.
419 /// Refer ioctl(2) man page for more details on ioctl operation.
420 ///
421 /// \param cmd ioctl command to be executed
422 /// \param argp pointer to the buffer which holds the argument
423 ///
424 /// \return a future containing the return value if any, or an exceptional future
425 /// if the operation has failed.
426 future<int> ioctl_short(uint64_t cmd, void* argp) noexcept;
427
428 /// Generic fcntl syscall support for special file handling.
429 ///
430 /// fcntl performs the operation specified by 'op' field on the file.
431 /// Some of the use cases can be - setting file status flags, advisory record locking,
432 /// managing signals, managing file leases or write hints etc.
433 /// Refer fcntl(2) man page for more details.
434 ///
435 /// \param op the operation to be executed
436 /// \param arg the optional argument
437 /// \return a future containing the return value if any, or an exceptional future
438 /// if the operation has failed
439 future<int> fcntl(int op, uintptr_t arg = 0UL) noexcept;
440
441 /// Performs a 'short' fcntl syscall on seastar::file
442 ///
443 /// This is similar to generic \c fcntl; the difference is, here user indicates
444 /// that this operation is a short one, and does not involve any i/o or locking.
445 /// The \c file module will process this differently from normal \ref fcntl().
446 /// Use this only if the user is sure that the operation does not involve any
447 /// blocking operation. If unsure, use the default \ref fcntl() method.
448 /// Refer fcntl(2) man page for more details on fcntl operation.
449 ///
450 /// \param op the operation to be executed
451 /// \param arg the optional argument
452 /// \return a future containing the return value if any, or an exceptional future
453 /// if the operation has failed
454 future<int> fcntl_short(int op, uintptr_t arg = 0UL) noexcept;
455
456 /// Set a lifetime hint for the open file descriptor corresponding to seastar::file
457 ///
458 /// Write lifetime hints can be used to inform the kernel about the relative
459 /// expected lifetime of writes on a given inode or via open file descriptor.
460 /// An application may use the different hint values to separate writes into different
461 /// write classes, so that multiple users or applications running on a single storage back-end
462 /// can aggregate their I/O patterns in a consistent manner.
463 /// Refer fcntl(2) man page for more details on write lifetime hints.
464 ///
465 /// \param hint the hint value of the stream
466 /// \return future indicating success or failure
1e59de90 467 [[deprecated("This API was removed from the kernel")]]
20effc67
TL
468 future<> set_file_lifetime_hint(uint64_t hint) noexcept;
469
470 /// Set a lifetime hint for the inode corresponding to seastar::file
471 ///
472 /// Write lifetime hints can be used to inform the kernel about the relative
473 /// expected lifetime of writes on a given inode or via open file descriptor.
474 /// An application may use the different hint values to separate writes into different
475 /// write classes, so that multiple users or applications running on a single storage back-end
476 /// can aggregate their I/O patterns in a consistent manner.
477 /// Refer fcntl(2) man page for more details on write lifetime hints.
478 ///
479 /// \param hint the hint value of the stream
480 /// \return future indicating success or failure
481 future<> set_inode_lifetime_hint(uint64_t hint) noexcept;
482
483 /// Get the lifetime hint of the open file descriptor of seastar::file which was set by
484 /// \ref set_file_lifetime_hint()
485 ///
486 /// Write lifetime hints can be used to inform the kernel about the relative
487 /// expected lifetime of writes on a given inode or via open file descriptor.
488 /// An application may use the different hint values to separate writes into different
489 /// write classes, so that multiple users or applications running on a single storage back-end
490 /// can aggregate their I/O patterns in a consistent manner.
491 /// Refer fcntl(2) man page for more details on write lifetime hints.
492 ///
493 /// \return the hint value of the open file descriptor
1e59de90 494 [[deprecated("This API was removed from the kernel")]]
20effc67
TL
495 future<uint64_t> get_file_lifetime_hint() noexcept;
496
497 /// Get the lifetime hint of the inode of seastar::file which was set by
498 /// \ref set_inode_lifetime_hint()
499 ///
500 /// Write lifetime hints can be used to inform the kernel about the relative
501 /// expected lifetime of writes on a given inode or via open file descriptor.
502 /// An application may use the different hint values to separate writes into different
503 /// write classes, so that multiple users or applications running on a single storage back-end
504 /// can aggregate their I/O patterns in a consistent manner.
505 /// Refer fcntl(2) man page for more details on write lifetime hints.
506 ///
507 /// \return the hint value of the inode
508 future<uint64_t> get_inode_lifetime_hint() noexcept;
509
11fdf7f2 510 /// Gets the file size.
f67539c2 511 future<uint64_t> size() const noexcept;
11fdf7f2
TL
512
513 /// Closes the file.
514 ///
515 /// Flushes any pending operations and releases any resources associated with
516 /// the file (except for stable storage).
517 ///
518 /// \note
20effc67
TL
519 /// \c close() never fails. It just reports errors and swallows them.
520 /// To ensure file data reaches stable storage, you must call \ref flush()
11fdf7f2 521 /// before calling \c close().
f67539c2 522 future<> close() noexcept;
11fdf7f2
TL
523
524 /// Returns a directory listing, given that this file object is a directory.
f67539c2 525 subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next);
11fdf7f2
TL
526
527 /**
528 * Read a data bulk containing the provided addresses range that starts at
529 * the given offset and ends at either the address aligned to
530 * dma_alignment (4KB) or at the file end.
531 *
532 * @param offset starting address of the range the read bulk should contain
533 * @param range_size size of the addresses range
534 * @param pc the IO priority class under which to queue this operation
20effc67 535 * @param intent the IO intention confirmation (\ref seastar::io_intent)
11fdf7f2
TL
536 *
537 * @return temporary buffer containing the read data bulk.
f67539c2
TL
538 * or exceptional future holding:
539 * system_error exception in case of I/O error or eof_error when
11fdf7f2
TL
540 * "offset" is beyond EOF.
541 */
542 template <typename CharType>
543 future<temporary_buffer<CharType>>
20effc67
TL
544 dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc = default_priority_class(), io_intent* intent = nullptr) noexcept {
545 return dma_read_bulk_impl(offset, range_size, pc, intent).then([] (temporary_buffer<uint8_t> t) {
11fdf7f2
TL
546 return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
547 });
548 }
549
550 /// \brief Creates a handle that can be transported across shards.
551 ///
552 /// Creates a handle that can be transported across shards, and then
553 /// used to create a new shard-local \ref file object that refers to
554 /// the same on-disk file.
555 ///
556 /// \note Use on read-only files.
557 ///
558 file_handle dup();
11fdf7f2 559private:
f67539c2 560 future<temporary_buffer<uint8_t>>
20effc67 561 dma_read_bulk_impl(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept;
f67539c2
TL
562
563 future<size_t>
20effc67 564 dma_write_impl(uint64_t pos, const uint8_t* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept;
f67539c2
TL
565
566 future<temporary_buffer<uint8_t>>
20effc67 567 dma_read_impl(uint64_t pos, size_t len, const io_priority_class& pc, io_intent* intent) noexcept;
f67539c2
TL
568
569 future<size_t>
20effc67 570 dma_read_impl(uint64_t aligned_pos, uint8_t* aligned_buffer, size_t aligned_len, const io_priority_class& pc, io_intent* intent) noexcept;
f67539c2
TL
571
572 future<temporary_buffer<uint8_t>>
20effc67
TL
573 dma_read_exactly_impl(uint64_t pos, size_t len, const io_priority_class& pc, io_intent* intent) noexcept;
574
575 future<uint64_t> get_lifetime_hint_impl(int op) noexcept;
576 future<> set_lifetime_hint_impl(int op, uint64_t hint) noexcept;
f67539c2 577
11fdf7f2
TL
578 friend class reactor;
579 friend class file_impl;
580};
581
f67539c2
TL
582/// \brief Helper for ensuring a file is closed after \c func is called.
583///
584/// The file provided by the \c file_fut future is passed to \c func.
585///
586/// \param file_fut A future that produces a file
587/// \param func A function that uses a file
588/// \returns the future returned by \c func, or an exceptional future if either \c file_fut or closing the file failed.
589template <typename Func>
590SEASTAR_CONCEPT( requires std::invocable<Func, file&> && std::is_nothrow_move_constructible_v<Func> )
591auto with_file(future<file> file_fut, Func func) noexcept {
592 static_assert(std::is_nothrow_move_constructible_v<Func>, "Func's move constructor must not throw");
593 return file_fut.then([func = std::move(func)] (file f) mutable {
594 return do_with(std::move(f), [func = std::move(func)] (file& f) mutable {
595 return futurize_invoke(func, f).finally([&f] {
596 return f.close();
597 });
598 });
599 });
600}
601
602/// \brief Helper for ensuring a file is closed if \c func fails.
603///
604/// The file provided by the \c file_fut future is passed to \c func.
605/// * If func throws an exception E, the file is closed and we return
606/// a failed future with E.
607/// * If func returns a value V, the file is not closed and we return
608/// a future with V.
609/// Note that when an exception is not thrown, it is the
610/// responsibility of func to make sure the file will be closed. It
611/// can close the file itself, return it, or store it somewhere.
612///
613/// \param file_fut A future that produces a file
614/// \param func A function that uses a file
615/// \returns the future returned by \c func, or an exceptional future if \c file_fut failed or a nested exception if closing the file failed.
616template <typename Func>
617SEASTAR_CONCEPT( requires std::invocable<Func, file&> && std::is_nothrow_move_constructible_v<Func> )
618auto with_file_close_on_failure(future<file> file_fut, Func func) noexcept {
619 static_assert(std::is_nothrow_move_constructible_v<Func>, "Func's move constructor must not throw");
620 return file_fut.then([func = std::move(func)] (file f) mutable {
621 return do_with(std::move(f), [func = std::move(func)] (file& f) mutable {
622 return futurize_invoke(std::move(func), f).then_wrapped([&f] (auto ret) mutable {
623 if (!ret.failed()) {
624 return ret;
625 }
626 return ret.finally([&f] {
627 // If f.close() fails, return that as nested exception.
628 return f.close();
629 });
630 });
631 });
632 });
633}
634
635/// \example file_demo.cc
636/// A program demonstrating the use of \ref seastar::with_file
637/// and \ref seastar::with_file_close_on_failure
638
11fdf7f2
TL
639/// \brief A shard-transportable handle to a file
640///
641/// If you need to access a file (for reads only) across multiple shards,
642/// you can use the file::dup() method to create a `file_handle`, transport
643/// this file handle to another shard, and use the handle to create \ref file
644/// object on that shard. This is more efficient than calling open_file_dma()
645/// again.
646class file_handle {
647 std::unique_ptr<file_handle_impl> _impl;
648private:
649 explicit file_handle(std::unique_ptr<file_handle_impl> impl) : _impl(std::move(impl)) {}
650public:
651 /// Copies a file handle object
652 file_handle(const file_handle&);
653 /// Moves a file handle object
654 file_handle(file_handle&&) noexcept;
655 /// Assigns a file handle object
656 file_handle& operator=(const file_handle&);
657 /// Move-assigns a file handle object
658 file_handle& operator=(file_handle&&) noexcept;
659 /// Converts the file handle object to a \ref file.
660 file to_file() const &;
661 /// Converts the file handle object to a \ref file.
662 file to_file() &&;
663
664 friend class file;
665};
666
20effc67 667/// @}
11fdf7f2 668
20effc67
TL
/// Exception that cancelled IOs resolve their future into (see \ref io_intent "io_intent")
class cancelled_error : public std::exception {
public:
    /// Returns the fixed description "cancelled".
    ///
    /// Marked \c override so the compiler verifies that this really replaces
    /// std::exception::what().
    const char* what() const noexcept override {
        return "cancelled";
    }
};
676
11fdf7f2 677}