]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* |
2 | * This file is open source software, licensed to you under the terms | |
3 | * of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
4 | * distributed with this work for additional information regarding copyright | |
5 | * ownership. You may not use this file except in compliance with the License. | |
6 | * | |
7 | * You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, | |
12 | * software distributed under the License is distributed on an | |
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | * KIND, either express or implied. See the License for the | |
15 | * specific language governing permissions and limitations | |
16 | * under the License. | |
17 | */ | |
18 | /* | |
19 | * Copyright 2016 ScyllaDB | |
20 | */ | |
21 | ||
22 | #pragma once | |
23 | ||
24 | #include <seastar/core/file.hh> | |
25 | #include <seastar/core/shared_ptr.hh> | |
26 | ||
27 | #include <deque> | |
28 | #include <atomic> | |
29 | ||
30 | namespace seastar { | |
31 | class io_queue; | |
32 | ||
9f95a23c TL |
33 | namespace internal { |
34 | ||
35 | // Given a properly aligned vector of iovecs, ensures that it respects the | |
36 | // IOV_MAX limit, by trimming if necessary. The modified vector still satisfies | |
37 | // the alignment requirements. | |
38 | // Returns the final total length of all iovecs. | |
// The vector is taken by non-const reference and is modified in place; the
// function is declared noexcept.
// NOTE(review): disk_alignment is presumably expressed in bytes -- confirm
// against the out-of-line definition.
39 | size_t sanitize_iovecs(std::vector<iovec>& iov, size_t disk_alignment) noexcept; | |
40 | ||
41 | } | |
42 | ||
// file_handle_impl implementation for POSIX file descriptors. It stores the
// descriptor, the open flags, a pointer to an atomic reference count and an
// io_queue pointer, exactly as passed to the constructor.
// NOTE(review): the reference count is presumably shared with the file objects
// that refer to the same descriptor -- confirm against the out-of-line
// definitions of the destructor, to_file() and clone().
11fdf7f2 TL |
43 | class posix_file_handle_impl : public seastar::file_handle_impl { |
44 | int _fd; | |
45 | std::atomic<unsigned>* _refcount; | |
46 | io_queue* _io_queue; | |
9f95a23c | 47 | open_flags _open_flags; |
11fdf7f2 | 48 | public: |
// The constructor only records its arguments; its body is empty.
9f95a23c TL |
49 | posix_file_handle_impl(int fd, open_flags f, std::atomic<unsigned>* refcount, io_queue *ioq) |
50 | : _fd(fd), _refcount(refcount), _io_queue(ioq), _open_flags(f) { | |
11fdf7f2 TL |
51 | } |
52 | virtual ~posix_file_handle_impl(); | |
// Copy and move construction are both deleted.
53 | posix_file_handle_impl(const posix_file_handle_impl&) = delete; | |
54 | posix_file_handle_impl(posix_file_handle_impl&&) = delete; | |
// to_file() is rvalue-qualified, so it can only be called on an expiring
// handle; clone() is const and returns a separately owned handle.
55 | virtual shared_ptr<file_impl> to_file() && override; | |
56 | virtual std::unique_ptr<seastar::file_handle_impl> clone() const override; | |
57 | }; | |
58 | ||
// file_impl implementation backed by a POSIX file descriptor. It overrides
// the DMA read/write, flush, stat, truncate, discard, allocate, size, close,
// dup, list_directory and dma_read_bulk operations; all of them are defined
// out of line.
59 | class posix_file_impl : public file_impl { |
// _refcount defaults to nullptr.
// NOTE(review): presumably it is only populated once the descriptor is shared
// through a handle -- confirm against the definitions.
60 | std::atomic<unsigned>* _refcount = nullptr; | |
61 | io_queue* _io_queue; | |
9f95a23c | 62 | open_flags _open_flags; |
11fdf7f2 TL |
63 | public: |
// The descriptor is a public data member.
64 | int _fd; | |
// Two constructors: one takes file_open_options, the other an existing
// reference-count pointer.
// NOTE(review): the latter is presumably used when a file is re-created from
// a posix_file_handle_impl -- confirm.
9f95a23c TL |
65 | posix_file_impl(int fd, open_flags, file_open_options options, io_queue* ioq); |
66 | posix_file_impl(int fd, open_flags, std::atomic<unsigned>* refcount, io_queue *ioq); | |
11fdf7f2 TL |
67 | virtual ~posix_file_impl() override; |
68 | future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override; | |
69 | future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override; | |
70 | future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override; | |
71 | future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override; | |
72 | future<> flush(void) override; | |
73 | future<struct stat> stat(void) override; | |
74 | future<> truncate(uint64_t length) override; | |
75 | future<> discard(uint64_t offset, uint64_t length) override; | |
76 | virtual future<> allocate(uint64_t position, uint64_t length) override; | |
77 | future<uint64_t> size() override; | |
78 | virtual future<> close() noexcept override; | |
79 | virtual std::unique_ptr<seastar::file_handle_impl> dup() override; | |
80 | virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override; | |
81 | virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override; | |
// Accessor returning the open flags stored in this object.
9f95a23c TL |
82 |
83 | open_flags flags() const { | |
84 | return _open_flags; | |
85 | } | |
11fdf7f2 TL |
86 | private: |
// NOTE(review): defined elsewhere; presumably queries and records the DMA
// alignment requirements of the underlying file -- confirm.
87 | void query_dma_alignment(); | |
88 | ||
89 | /** | |
90 | * Try to read from the given position where the previous short read has | |
91 | * stopped. Check the EOF condition. | |
92 | * | |
93 | * The below code assumes the following: short reads due to I/O errors | |
94 | * always end at address aligned to HW block boundary. Therefore if we issue | |
95 | * a new read operation from the next position we are promised to get an | |
96 | * error (different from EINVAL). If we've got a short read because we have | |
97 | * reached EOF then the above read would either return a zero-length success | |
98 | * (if the file size is aligned to HW block size) or an EINVAL error (if | |
99 | * file length is not aligned to HW block size). | |
100 | * | |
101 | * @param pos offset to read from | |
102 | * @param len number of bytes to read | |
103 | * @param pc the IO priority class under which to queue this operation | |
104 | * | |
105 | * @return temporary buffer with read data or zero-sized temporary buffer if | |
106 | * pos is at or beyond EOF. | |
107 | * @throw appropriate exception in case of I/O error. | |
108 | */ | |
109 | future<temporary_buffer<uint8_t>> | |
110 | read_maybe_eof(uint64_t pos, size_t len, const io_priority_class& pc); | |
111 | }; | |
112 | ||
113 | // The Linux XFS implementation is challenged wrt. append: a write that changes | |
114 | // eof will be blocked by any other concurrent AIO operation to the same file, whether | |
115 | // it changes file size or not. Furthermore, ftruncate() will also block and be blocked | |
116 | // by AIO, so attempts to game the system and call ftruncate() have to be done very carefully. | |
117 | // | |
118 | // Other Linux filesystems may have different locking rules, so this may need to be | |
119 | // adjusted for them. | |
//
// This class derives from posix_file_impl and keeps a queue of pending
// operations (_q) together with counters of in-flight operations. The
// scheduling helpers (may_dispatch, dispatch, process_queue, ...) are declared
// below and defined out of line.
120 | class append_challenged_posix_file_impl : public posix_file_impl, public enable_shared_from_this<append_challenged_posix_file_impl> { | |
121 | // File size as a result of completed kernel operations (writes and truncates) | |
122 | uint64_t _committed_size; | |
123 | // File size as a result of seastar API calls | |
124 | uint64_t _logical_size; | |
125 | // Pending operations | |
126 | enum class opcode { | |
127 | invalid, | |
128 | read, | |
129 | write, | |
130 | truncate, | |
131 | flush, | |
132 | }; | |
// One queued operation: its kind, a position and a length, and a callable
// returning future<>.
// NOTE(review): `run` presumably performs the deferred operation itself when
// it is dispatched -- confirm against dispatch().
133 | struct op { | |
134 | opcode type; | |
135 | uint64_t pos; | |
136 | size_t len; | |
137 | std::function<future<> ()> run; | |
138 | }; | |
139 | // Queue of pending operations; processed from front to end to avoid | |
140 | // starvation, but can issue concurrent operations. | |
141 | std::deque<op> _q; | |
// The counters below default to zero and _fsync_is_exclusive defaults to true.
// NOTE(review): _max_size_changing_ops and _fsync_is_exclusive presumably take
// their values from the identically named constructor parameters -- confirm.
142 | unsigned _max_size_changing_ops = 0; | |
143 | unsigned _current_non_size_changing_ops = 0; | |
144 | unsigned _current_size_changing_ops = 0; | |
145 | bool _fsync_is_exclusive = true; | |
9f95a23c TL |
146 |
147 | // Set when the user is closing the file | |
148 | enum class state { open, draining, closing, closed }; | |
149 | state _closing_state = state::open; | |
150 | ||
11fdf7f2 TL |
151 | bool _sloppy_size = false; |
152 | // Fulfilled when _done and I/O is complete | |
// NOTE(review): no member named _done is declared in this class; the comment
// above presumably predates _closing_state -- confirm and reword.
153 | promise<> _completed; | |
154 | private: | |
// Internal scheduling helpers; all are noexcept except enqueue().
155 | void commit_size(uint64_t size) noexcept; | |
156 | bool must_run_alone(const op& candidate) const noexcept; | |
157 | bool size_changing(const op& candidate) const noexcept; | |
158 | bool may_dispatch(const op& candidate) const noexcept; | |
159 | void dispatch(op& candidate) noexcept; | |
160 | void optimize_queue() noexcept; | |
161 | void process_queue() noexcept; | |
162 | bool may_quit() const noexcept; | |
163 | void enqueue(op&& op); | |
164 | public: | |
9f95a23c | 165 | append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, io_queue* ioq); |
11fdf7f2 TL |
166 | ~append_challenged_posix_file_impl() override; |
// Overrides of the posix_file_impl operations listed below.
167 | future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override; | |
168 | future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override; | |
169 | future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override; | |
170 | future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override; | |
171 | future<> flush() override; | |
172 | future<struct stat> stat() override; | |
173 | future<> truncate(uint64_t length) override; | |
174 | future<uint64_t> size() override; | |
175 | future<> close() noexcept override; | |
176 | }; | |
177 | ||
// posix_file_impl variant that overrides truncate, discard, size and allocate;
// every other operation is inherited unchanged.
// NOTE(review): judging by the name this is the implementation used for block
// devices -- confirm against the out-of-line definitions.
178 | class blockdev_file_impl : public posix_file_impl { | |
179 | public: | |
9f95a23c | 180 | blockdev_file_impl(int fd, open_flags, file_open_options options, io_queue* ioq); |
11fdf7f2 TL |
181 | future<> truncate(uint64_t length) override; |
182 | future<> discard(uint64_t offset, uint64_t length) override; | |
183 | future<uint64_t> size() override; | |
184 | virtual future<> allocate(uint64_t position, uint64_t length) override; | |
185 | }; | |
186 | ||
187 | } |