]> git.proxmox.com Git - ceph.git/blob - ceph/src/seastar/include/seastar/core/file.hh
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / seastar / include / seastar / core / file.hh
1 /*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18 /*
19 * Copyright 2015 Cloudius Systems
20 */
21
22 #pragma once
23
24 #include <seastar/core/stream.hh>
25 #include <seastar/core/sstring.hh>
26 #include <seastar/core/shared_ptr.hh>
27 #include <seastar/core/align.hh>
28 #include <seastar/core/future-util.hh>
29 #include <seastar/core/fair_queue.hh>
30 #include <seastar/util/std-compat.hh>
31 #include <system_error>
32 #include <sys/stat.h>
33 #include <sys/statvfs.h>
34 #include <sys/ioctl.h>
35 #include <linux/fs.h>
36 #include <sys/uio.h>
37 #include <unistd.h>
38
39 namespace seastar {
40
41 /// \addtogroup fileio-module
42 /// @{
43
/// Kind of filesystem object that a listed directory entry refers to.
///
/// \see file::list_directory()
enum class directory_entry_type {
    block_device,   ///< block device
    char_device,    ///< character device
    directory,      ///< directory
    fifo,           ///< FIFO
    link,           ///< link
    regular,        ///< regular file
    socket,         ///< socket
};
56
/// Identifies the type of a particular filesystem.
///
/// \c other is the first enumerator (value 0); the remaining enumerators
/// name specific filesystems.
enum class fs_type {
    other,
    xfs,
    ext2,
    ext3,
    ext4,
    btrfs,
    hfs,
    tmpfs,
};
68
69 /// A directory entry being listed.
70 struct directory_entry {
71 /// Name of the file in a directory entry. Will never be "." or "..". Only the last component is included.
72 sstring name;
73 /// Type of the directory entry, if known.
74 compat::optional<directory_entry_type> type;
75 };
76
/// File open options
///
/// Options used to configure an open file. Every field has a default, so a
/// default-constructed object is a complete set of options.
///
/// \ref file
struct file_open_options {
    /// Allocate this much disk space when extending the file (default: 1 MiB).
    uint64_t extent_allocation_size_hint = uint64_t(1) << 20;
    /// Allow the file size not to track the amount of data written until a flush.
    bool sloppy_size = false;
    /// Hint as to what the eventual file size will be (default: 1 MiB).
    uint64_t sloppy_size_hint = uint64_t(1) << 20;
};
87
88 /// \cond internal
class io_queue;

// Wraps the numeric identifier of an I/O priority class.
//
// The identifier is private and the class declares no constructors, so
// only the befriended io_queue can assign a specific value to it; other
// code can merely read it back through id().
class io_priority_class {
    friend class io_queue;
    unsigned val;
public:
    // Returns the numeric identifier stored in this object.
    unsigned id() const {
        return val;
    }
};
98
99 const io_priority_class& default_priority_class();
100
101 class file;
102 class file_impl;
103
104 class file_handle;
105
106 // A handle that can be transported across shards and used to
107 // create a dup(2)-like `file` object referring to the same underlying file
108 class file_handle_impl {
109 public:
110 virtual ~file_handle_impl() = default;
111 virtual std::unique_ptr<file_handle_impl> clone() const = 0;
112 virtual shared_ptr<file_impl> to_file() && = 0;
113 };
114
115 class file_impl {
116 protected:
117 static file_impl* get_file_impl(file& f);
118 public:
119 unsigned _memory_dma_alignment = 4096;
120 unsigned _disk_read_dma_alignment = 4096;
121 unsigned _disk_write_dma_alignment = 4096;
122 public:
123 virtual ~file_impl() {}
124
125 virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) = 0;
126 virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
127 virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) = 0;
128 virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
129 virtual future<> flush(void) = 0;
130 virtual future<struct stat> stat(void) = 0;
131 virtual future<> truncate(uint64_t length) = 0;
132 virtual future<> discard(uint64_t offset, uint64_t length) = 0;
133 virtual future<> allocate(uint64_t position, uint64_t length) = 0;
134 virtual future<uint64_t> size(void) = 0;
135 virtual future<> close() = 0;
136 virtual std::unique_ptr<file_handle_impl> dup();
137 virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) = 0;
138 virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) = 0;
139
140 friend class reactor;
141 };
142
143 /// \endcond
144
145 /// A data file on persistent storage.
146 ///
147 /// File objects represent uncached, unbuffered files. As such great care
148 /// must be taken to cache data at the application layer; neither seastar
149 /// nor the OS will cache these file.
150 ///
151 /// Data is transferred using direct memory access (DMA). This imposes
152 /// restrictions on file offsets and data pointers. The former must be aligned
153 /// on a 4096 byte boundary, while a 512 byte boundary suffices for the latter.
154 class file {
155 shared_ptr<file_impl> _file_impl;
156 private:
157 explicit file(int fd, file_open_options options);
158 public:
159 /// Default constructor constructs an uninitialized file object.
160 ///
161 /// A default constructor is useful for the common practice of declaring
162 /// a variable, and only assigning to it later. The uninitialized file
163 /// must not be used, or undefined behavior will result (currently, a null
164 /// pointer dereference).
165 ///
166 /// One can check whether a file object is in uninitialized state with
167 /// \ref operator bool(); One can reset a file back to uninitialized state
168 /// by assigning file() to it.
169 file() : _file_impl(nullptr) {}
170
171 file(shared_ptr<file_impl> impl)
172 : _file_impl(std::move(impl)) {}
173
174 /// Constructs a file object from a \ref file_handle obtained from another shard
175 explicit file(file_handle&& handle);
176
177 /// Checks whether the file object was initialized.
178 ///
179 /// \return false if the file object is uninitialized (default
180 /// constructed), true if the file object refers to an actual file.
181 explicit operator bool() const noexcept { return bool(_file_impl); }
182
183 /// Copies a file object. The new and old objects refer to the
184 /// same underlying file.
185 ///
186 /// \param x file object to be copied
187 file(const file& x) = default;
188 /// Moves a file object.
189 file(file&& x) noexcept : _file_impl(std::move(x._file_impl)) {}
190 /// Assigns a file object. After assignent, the destination and source refer
191 /// to the same underlying file.
192 ///
193 /// \param x file object to assign to `this`.
194 file& operator=(const file& x) noexcept = default;
195 /// Moves assigns a file object.
196 file& operator=(file&& x) noexcept = default;
197
198 // O_DIRECT reading requires that buffer, offset, and read length, are
199 // all aligned. Alignment of 4096 was necessary in the past, but no longer
200 // is - 512 is usually enough; But we'll need to use BLKSSZGET ioctl to
201 // be sure it is really enough on this filesystem. 4096 is always safe.
202 // In addition, if we start reading in things outside page boundaries,
203 // we will end up with various pages around, some of them with
204 // overlapping ranges. Those would be very challenging to cache.
205
206 /// Alignment requirement for file offsets (for reads)
207 uint64_t disk_read_dma_alignment() const {
208 return _file_impl->_disk_read_dma_alignment;
209 }
210
211 /// Alignment requirement for file offsets (for writes)
212 uint64_t disk_write_dma_alignment() const {
213 return _file_impl->_disk_write_dma_alignment;
214 }
215
216 /// Alignment requirement for data buffers
217 uint64_t memory_dma_alignment() const {
218 return _file_impl->_memory_dma_alignment;
219 }
220
221
222 /**
223 * Perform a single DMA read operation.
224 *
225 * @param aligned_pos offset to begin reading at (should be aligned)
226 * @param aligned_buffer output buffer (should be aligned)
227 * @param aligned_len number of bytes to read (should be aligned)
228 * @param pc the IO priority class under which to queue this operation
229 *
230 * Alignment is HW dependent but use 4KB alignment to be on the safe side as
231 * explained above.
232 *
233 * @return number of bytes actually read
234 * @throw exception in case of I/O error
235 */
236 template <typename CharType>
237 future<size_t>
238 dma_read(uint64_t aligned_pos, CharType* aligned_buffer, size_t aligned_len, const io_priority_class& pc = default_priority_class()) {
239 return _file_impl->read_dma(aligned_pos, aligned_buffer, aligned_len, pc);
240 }
241
242 /**
243 * Read the requested amount of bytes starting from the given offset.
244 *
245 * @param pos offset to begin reading from
246 * @param len number of bytes to read
247 * @param pc the IO priority class under which to queue this operation
248 *
249 * @return temporary buffer containing the requested data.
250 * @throw exception in case of I/O error
251 *
252 * This function doesn't require any alignment for both "pos" and "len"
253 *
254 * @note size of the returned buffer may be smaller than "len" if EOF is
255 * reached of in case of I/O error.
256 */
257 template <typename CharType>
258 future<temporary_buffer<CharType>> dma_read(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) {
259 return dma_read_bulk<CharType>(pos, len, pc).then(
260 [len] (temporary_buffer<CharType> buf) {
261 if (len < buf.size()) {
262 buf.trim(len);
263 }
264
265 return std::move(buf);
266 });
267 }
268
269 /// Error thrown when attempting to read past end-of-file
270 /// with \ref dma_read_exactly().
271 class eof_error : public std::exception {};
272
273 /**
274 * Read the exact amount of bytes.
275 *
276 * @param pos offset in a file to begin reading from
277 * @param len number of bytes to read
278 * @param pc the IO priority class under which to queue this operation
279 *
280 * @return temporary buffer containing the read data
281 * @throw end_of_file_error if EOF is reached, file_io_error or
282 * std::system_error in case of I/O error.
283 */
284 template <typename CharType>
285 future<temporary_buffer<CharType>>
286 dma_read_exactly(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) {
287 return dma_read<CharType>(pos, len, pc).then(
288 [pos, len] (auto buf) {
289 if (buf.size() < len) {
290 throw eof_error();
291 }
292
293 return std::move(buf);
294 });
295 }
296
297 /// Performs a DMA read into the specified iovec.
298 ///
299 /// \param pos offset to read from. Must be aligned to \ref dma_alignment.
300 /// \param iov vector of address/size pairs to read into. Addresses must be
301 /// aligned.
302 /// \param pc the IO priority class under which to queue this operation
303 ///
304 /// \return a future representing the number of bytes actually read. A short
305 /// read may happen due to end-of-file or an I/O error.
306 future<size_t> dma_read(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class()) {
307 return _file_impl->read_dma(pos, std::move(iov), pc);
308 }
309
310 /// Performs a DMA write from the specified buffer.
311 ///
312 /// \param pos offset to write into. Must be aligned to \ref dma_alignment.
313 /// \param buffer aligned address of buffer to read from. Buffer must exists
314 /// until the future is made ready.
315 /// \param len number of bytes to write. Must be aligned.
316 /// \param pc the IO priority class under which to queue this operation
317 ///
318 /// \return a future representing the number of bytes actually written. A short
319 /// write may happen due to an I/O error.
320 template <typename CharType>
321 future<size_t> dma_write(uint64_t pos, const CharType* buffer, size_t len, const io_priority_class& pc = default_priority_class()) {
322 return _file_impl->write_dma(pos, buffer, len, pc);
323 }
324
325 /// Performs a DMA write to the specified iovec.
326 ///
327 /// \param pos offset to write into. Must be aligned to \ref dma_alignment.
328 /// \param iov vector of address/size pairs to write from. Addresses must be
329 /// aligned.
330 /// \param pc the IO priority class under which to queue this operation
331 ///
332 /// \return a future representing the number of bytes actually written. A short
333 /// write may happen due to an I/O error.
334 future<size_t> dma_write(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class()) {
335 return _file_impl->write_dma(pos, std::move(iov), pc);
336 }
337
338 /// Causes any previously written data to be made stable on persistent storage.
339 ///
340 /// Prior to a flush, written data may or may not survive a power failure. After
341 /// a flush, data is guaranteed to be on disk.
342 future<> flush() {
343 return _file_impl->flush();
344 }
345
346 /// Returns \c stat information about the file.
347 future<struct stat> stat() {
348 return _file_impl->stat();
349 }
350
351 /// Truncates the file to a specified length.
352 future<> truncate(uint64_t length) {
353 return _file_impl->truncate(length);
354 }
355
356 /// Preallocate disk blocks for a specified byte range.
357 ///
358 /// Requests the file system to allocate disk blocks to
359 /// back the specified range (\c length bytes starting at
360 /// \c position). The range may be outside the current file
361 /// size; the blocks can then be used when appending to the
362 /// file.
363 ///
364 /// \param position beginning of the range at which to allocate
365 /// blocks.
366 /// \parm length length of range to allocate.
367 /// \return future that becomes ready when the operation completes.
368 future<> allocate(uint64_t position, uint64_t length) {
369 return _file_impl->allocate(position, length);
370 }
371
372 /// Discard unneeded data from the file.
373 ///
374 /// The discard operation tells the file system that a range of offsets
375 /// (which be aligned) is no longer needed and can be reused.
376 future<> discard(uint64_t offset, uint64_t length) {
377 return _file_impl->discard(offset, length);
378 }
379
380 /// Gets the file size.
381 future<uint64_t> size() const {
382 return _file_impl->size();
383 }
384
385 /// Closes the file.
386 ///
387 /// Flushes any pending operations and release any resources associated with
388 /// the file (except for stable storage).
389 ///
390 /// \note
391 /// to ensure file data reaches stable storage, you must call \ref flush()
392 /// before calling \c close().
393 future<> close() {
394 return _file_impl->close();
395 }
396
397 /// Returns a directory listing, given that this file object is a directory.
398 subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) {
399 return _file_impl->list_directory(std::move(next));
400 }
401
402 /**
403 * Read a data bulk containing the provided addresses range that starts at
404 * the given offset and ends at either the address aligned to
405 * dma_alignment (4KB) or at the file end.
406 *
407 * @param offset starting address of the range the read bulk should contain
408 * @param range_size size of the addresses range
409 * @param pc the IO priority class under which to queue this operation
410 *
411 * @return temporary buffer containing the read data bulk.
412 * @throw system_error exception in case of I/O error or eof_error when
413 * "offset" is beyond EOF.
414 */
415 template <typename CharType>
416 future<temporary_buffer<CharType>>
417 dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc = default_priority_class()) {
418 return _file_impl->dma_read_bulk(offset, range_size, pc).then([] (temporary_buffer<uint8_t> t) {
419 return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
420 });
421 }
422
423 /// \brief Creates a handle that can be transported across shards.
424 ///
425 /// Creates a handle that can be transported across shards, and then
426 /// used to create a new shard-local \ref file object that refers to
427 /// the same on-disk file.
428 ///
429 /// \note Use on read-only files.
430 ///
431 file_handle dup();
432
433 template <typename CharType>
434 struct read_state;
435 private:
436 friend class reactor;
437 friend class file_impl;
438 };
439
440 /// \brief A shard-transportable handle to a file
441 ///
442 /// If you need to access a file (for reads only) across multiple shards,
443 /// you can use the file::dup() method to create a `file_handle`, transport
444 /// this file handle to another shard, and use the handle to create \ref file
445 /// object on that shard. This is more efficient than calling open_file_dma()
446 /// again.
447 class file_handle {
448 std::unique_ptr<file_handle_impl> _impl;
449 private:
450 explicit file_handle(std::unique_ptr<file_handle_impl> impl) : _impl(std::move(impl)) {}
451 public:
452 /// Copies a file handle object
453 file_handle(const file_handle&);
454 /// Moves a file handle object
455 file_handle(file_handle&&) noexcept;
456 /// Assigns a file handle object
457 file_handle& operator=(const file_handle&);
458 /// Move-assigns a file handle object
459 file_handle& operator=(file_handle&&) noexcept;
460 /// Converts the file handle object to a \ref file.
461 file to_file() const &;
462 /// Converts the file handle object to a \ref file.
463 file to_file() &&;
464
465 friend class file;
466 };
467
468 /// \cond internal
469
470 template <typename CharType>
471 struct file::read_state {
472 typedef temporary_buffer<CharType> tmp_buf_type;
473
474 read_state(uint64_t offset, uint64_t front, size_t to_read,
475 size_t memory_alignment, size_t disk_alignment)
476 : buf(tmp_buf_type::aligned(memory_alignment,
477 align_up(to_read, disk_alignment)))
478 , _offset(offset)
479 , _to_read(to_read)
480 , _front(front) {}
481
482 bool done() const {
483 return eof || pos >= _to_read;
484 }
485
486 /**
487 * Trim the buffer to the actual number of read bytes and cut the
488 * bytes from offset 0 till "_front".
489 *
490 * @note this function has to be called only if we read bytes beyond
491 * "_front".
492 */
493 void trim_buf_before_ret() {
494 if (have_good_bytes()) {
495 buf.trim(pos);
496 buf.trim_front(_front);
497 } else {
498 buf.trim(0);
499 }
500 }
501
502 uint64_t cur_offset() const {
503 return _offset + pos;
504 }
505
506 size_t left_space() const {
507 return buf.size() - pos;
508 }
509
510 size_t left_to_read() const {
511 // positive as long as (done() == false)
512 return _to_read - pos;
513 }
514
515 void append_new_data(tmp_buf_type& new_data) {
516 auto to_copy = std::min(left_space(), new_data.size());
517
518 std::memcpy(buf.get_write() + pos, new_data.get(), to_copy);
519 pos += to_copy;
520 }
521
522 bool have_good_bytes() const {
523 return pos > _front;
524 }
525
526 public:
527 bool eof = false;
528 tmp_buf_type buf;
529 size_t pos = 0;
530 private:
531 uint64_t _offset;
532 size_t _to_read;
533 uint64_t _front;
534 };
535
536 /// \endcond
537
538 /// @}
539
540 }