]> git.proxmox.com Git - ceph.git/blame - ceph/src/seastar/src/core/file-impl.hh
import 15.2.0 Octopus source
[ceph.git] / ceph / src / seastar / src / core / file-impl.hh
CommitLineData
11fdf7f2
TL
1/*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18/*
19 * Copyright 2016 ScyllaDB
20 */
21
22#pragma once
23
24#include <seastar/core/file.hh>
25#include <seastar/core/shared_ptr.hh>
26
27#include <deque>
28#include <atomic>
29
30namespace seastar {
31class io_queue;
32
9f95a23c
TL
33namespace internal {
34
35// Given a properly aligned vector of iovecs, ensures that it respects the
36// IOV_MAX limit, by trimming if necessary. The modified vector still satisfies
37// the alignment requirements.
38// Returns the final total length of all iovecs.
//
// The vector is taken by non-const reference, so any trimming is applied to
// the caller's vector. The function is declared noexcept.
// NOTE(review): disk_alignment is presumably the alignment that the trimmed
// total length must keep respecting -- confirm against the definition, which
// is not part of this header.
39size_t sanitize_iovecs(std::vector<iovec>& iov, size_t disk_alignment) noexcept;
40
41} // namespace internal
42
11fdf7f2
TL
// file_handle_impl implementation that carries a POSIX file descriptor
// together with a pointer to a reference counter, a pointer to an io_queue
// and the open flags. Objects of this type can neither be copied nor moved
// (both constructors are deleted); duplication goes through clone().
43class posix_file_handle_impl : public seastar::file_handle_impl {
// The wrapped file descriptor.
44 int _fd;
// NOTE(review): presumably the same counter that posix_file_impl::_refcount
// points to, tracking how many objects refer to _fd -- confirm in the
// definitions, which are not visible here.
45 std::atomic<unsigned>* _refcount;
// Stored as a raw pointer; this class does not show who owns the queue.
46 io_queue* _io_queue;
// Flags passed to the constructor as `f`.
9f95a23c 47 open_flags _open_flags;
11fdf7f2 48public:
9f95a23c
TL
// Stores the four arguments verbatim; the constructor body is empty.
49 posix_file_handle_impl(int fd, open_flags f, std::atomic<unsigned>* refcount, io_queue *ioq)
50 : _fd(fd), _refcount(refcount), _io_queue(ioq), _open_flags(f) {
11fdf7f2
TL
51 }
// Defined out of line.
52 virtual ~posix_file_handle_impl();
53 posix_file_handle_impl(const posix_file_handle_impl&) = delete;
54 posix_file_handle_impl(posix_file_handle_impl&&) = delete;
// Rvalue-qualified: may only be called on an expiring handle, and yields a
// shared_ptr<file_impl>.
55 virtual shared_ptr<file_impl> to_file() && override;
// Const-qualified: produces another file_handle_impl without modifying
// this one.
56 virtual std::unique_ptr<seastar::file_handle_impl> clone() const override;
57};
58
// file_impl implementation on top of a POSIX file descriptor. Only the
// declarations live here; all member functions except flags() are defined
// out of line.
59class posix_file_impl : public file_impl {
// Defaults to nullptr; one of the constructors takes a refcount pointer.
// NOTE(review): presumably shared with posix_file_handle_impl so that the
// descriptor is closed only by its last user -- confirm in the definition.
60 std::atomic<unsigned>* _refcount = nullptr;
// Raw pointer received through the constructors; ownership is not shown here.
61 io_queue* _io_queue;
// Returned unchanged by flags().
9f95a23c 62 open_flags _open_flags;
11fdf7f2
TL
63public:
// The descriptor is a public data member.
64 int _fd;
9f95a23c
TL
// Two ways to construct: with file_open_options, or with an existing
// refcount pointer.
65 posix_file_impl(int fd, open_flags, file_open_options options, io_queue* ioq);
66 posix_file_impl(int fd, open_flags, std::atomic<unsigned>* refcount, io_queue *ioq);
11fdf7f2
TL
67 virtual ~posix_file_impl() override;
// DMA write/read entry points. Each direction has a single-buffer overload
// and an overload taking a vector of iovecs by value; all of them take an
// io_priority_class and resolve to a size_t.
68 future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override;
69 future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override;
70 future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override;
71 future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override;
72 future<> flush(void) override;
73 future<struct stat> stat(void) override;
74 future<> truncate(uint64_t length) override;
75 future<> discard(uint64_t offset, uint64_t length) override;
76 virtual future<> allocate(uint64_t position, uint64_t length) override;
77 future<uint64_t> size() override;
// close() is the only operation in this list declared noexcept.
78 virtual future<> close() noexcept override;
// Produces a file_handle_impl for this file.
79 virtual std::unique_ptr<seastar::file_handle_impl> dup() override;
// `next` is invoked with a directory_entry and returns a future<>.
80 virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override;
81 virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override;
9f95a23c
TL
82
// Accessor for the flags stored in _open_flags.
83 open_flags flags() const {
84 return _open_flags;
85 }
11fdf7f2
TL
86private:
// NOTE(review): judging by the name, this determines the alignment
// constraints for DMA on this file -- confirm in the definition.
87 void query_dma_alignment();
88
89 /**
90 * Try to read from the given position where the previous short read has
91 * stopped. Check the EOF condition.
92 *
93 * The below code assumes the following: short reads due to I/O errors
94 * always end at address aligned to HW block boundary. Therefore if we issue
95 * a new read operation from the next position we are promised to get an
96 * error (different from EINVAL). If we've got a short read because we have
97 * reached EOF then the above read would either return a zero-length success
98 * (if the file size is aligned to HW block size) or an EINVAL error (if
99 * file length is not aligned to HW block size).
100 *
101 * @param pos offset to read from
102 * @param len number of bytes to read
103 * @param pc the IO priority class under which to queue this operation
104 *
105 * @return temporary buffer with read data or zero-sized temporary buffer if
106 * pos is at or beyond EOF.
107 * @throw appropriate exception in case of I/O error.
108 */
109 future<temporary_buffer<uint8_t>>
110 read_maybe_eof(uint64_t pos, size_t len, const io_priority_class& pc);
111};
112
113// The Linux XFS implementation is challenged wrt. append: a write that changes
114// eof will be blocked by any other concurrent AIO operation to the same file, whether
115// it changes file size or not. Furthermore, ftruncate() will also block and be blocked
116// by AIO, so attempts to game the system and call ftruncate() have to be done very carefully.
117//
118// Other Linux filesystems may have different locking rules, so this may need to be
119// adjusted for them.
//
// This class layers a queue of pending operations (_q) on top of
// posix_file_impl and keeps two notions of file size (_committed_size and
// _logical_size). It overrides read_dma, write_dma, flush, stat, truncate,
// size and close; discard, allocate, dup, list_directory and dma_read_bulk
// are not overridden here.
120class append_challenged_posix_file_impl : public posix_file_impl, public enable_shared_from_this<append_challenged_posix_file_impl> {
121 // File size as a result of completed kernel operations (writes and truncates)
122 uint64_t _committed_size;
123 // File size as a result of seastar API calls
124 uint64_t _logical_size;
125 // Pending operations
126 enum class opcode {
127 invalid,
128 read,
129 write,
130 truncate,
131 flush,
132 };
// One queued operation: its kind, a position and length, and a deferred
// callable returning future<>.
// NOTE(review): pos/len are presumably the file offset and byte count the
// operation touches -- confirm where ops are constructed.
133 struct op {
134 opcode type;
135 uint64_t pos;
136 size_t len;
137 std::function<future<> ()> run;
138 };
139 // Queue of pending operations; processed from front to end to avoid
140 // starvation, but can issue concurrent operations.
141 std::deque<op> _q;
// Limit and in-flight counters, all starting at zero.
// NOTE(review): _max_size_changing_ops and _fsync_is_exclusive are presumably
// set from the constructor parameters of the same names -- confirm in the
// definition.
142 unsigned _max_size_changing_ops = 0;
143 unsigned _current_non_size_changing_ops = 0;
144 unsigned _current_size_changing_ops = 0;
145 bool _fsync_is_exclusive = true;
9f95a23c
TL
146
147 // Set when the user is closing the file
148 enum class state { open, draining, closing, closed };
149 state _closing_state = state::open;
150
11fdf7f2
TL
151 bool _sloppy_size = false;
152 // Fulfilled when _done and I/O is complete
// NOTE(review): no `_done` member is declared in this class; the comment above
// presumably predates _closing_state and refers to the close sequence --
// confirm and reword.
153 promise<> _completed;
154private:
// Queue bookkeeping helpers. All are declared noexcept except enqueue(),
// which takes the operation by rvalue reference.
155 void commit_size(uint64_t size) noexcept;
156 bool must_run_alone(const op& candidate) const noexcept;
157 bool size_changing(const op& candidate) const noexcept;
158 bool may_dispatch(const op& candidate) const noexcept;
159 void dispatch(op& candidate) noexcept;
160 void optimize_queue() noexcept;
161 void process_queue() noexcept;
162 bool may_quit() const noexcept;
163 void enqueue(op&& op);
164public:
// Same leading parameters as posix_file_impl's options-taking constructor,
// plus max_size_changing_ops and fsync_is_exclusive.
9f95a23c 165 append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, io_queue* ioq);
11fdf7f2
TL
166 ~append_challenged_posix_file_impl() override;
167 future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override;
168 future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override;
169 future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override;
170 future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override;
171 future<> flush() override;
172 future<struct stat> stat() override;
173 future<> truncate(uint64_t length) override;
174 future<uint64_t> size() override;
175 future<> close() noexcept override;
176};
177
// posix_file_impl specialisation that overrides only truncate, discard, size
// and allocate; every other operation is inherited unchanged.
// NOTE(review): the name indicates this is used for block devices -- the
// selection logic is not visible in this header.
178class blockdev_file_impl : public posix_file_impl {
179public:
// Takes the same parameters as posix_file_impl's options-taking constructor.
9f95a23c 180 blockdev_file_impl(int fd, open_flags, file_open_options options, io_queue* ioq);
11fdf7f2
TL
181 future<> truncate(uint64_t length) override;
182 future<> discard(uint64_t offset, uint64_t length) override;
183 future<uint64_t> size() override;
184 virtual future<> allocate(uint64_t position, uint64_t length) override;
185};
186
187}