]>
Commit | Line | Data |
---|---|---|
9f95a23c TL |
1 | /* |
2 | * This file is open source software, licensed to you under the terms | |
3 | * of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
4 | * distributed with this work for additional information regarding copyright | |
5 | * ownership. You may not use this file except in compliance with the License. | |
6 | * | |
7 | * You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, | |
12 | * software distributed under the License is distributed on an | |
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | * KIND, either express or implied. See the License for the | |
15 | * specific language governing permissions and limitations | |
16 | * under the License. | |
17 | */ | |
18 | /* | |
19 | * Copyright 2019 ScyllaDB | |
20 | */ | |
21 | ||
22 | #define __user /* empty */ // for xfs includes, below | |
23 | ||
24 | #include <sys/syscall.h> | |
25 | #include <dirent.h> | |
26 | #include <linux/types.h> // for xfs, below | |
20effc67 | 27 | #include <linux/fs.h> // BLKBSZGET |
9f95a23c | 28 | #include <sys/ioctl.h> |
20effc67 TL |
29 | #include <unistd.h> |
30 | #include <fcntl.h> | |
9f95a23c TL |
31 | #include <xfs/linux.h> |
32 | #define min min /* prevent xfs.h from defining min() as a macro */ | |
33 | #include <xfs/xfs.h> | |
34 | #undef min | |
35 | #include <boost/range/numeric.hpp> | |
36 | #include <boost/range/adaptor/transformed.hpp> | |
37 | #include <seastar/core/reactor.hh> | |
38 | #include <seastar/core/file.hh> | |
39 | #include <seastar/core/report_exception.hh> | |
40 | #include <seastar/core/linux-aio.hh> | |
f67539c2 | 41 | #include <seastar/util/later.hh> |
20effc67 | 42 | #include <seastar/core/io_queue.hh> |
9f95a23c TL |
43 | #include "core/file-impl.hh" |
44 | #include "core/syscall_result.hh" | |
45 | #include "core/thread_pool.hh" | |
46 | #include "core/uname.hh" | |
20effc67 | 47 | #include "seastar/core/internal/read_state.hh" |
9f95a23c TL |
48 | |
49 | namespace seastar { | |
50 | ||
f67539c2 TL |
51 | static_assert(std::is_nothrow_copy_constructible_v<io_priority_class>); |
52 | static_assert(std::is_nothrow_move_constructible_v<io_priority_class>); | |
53 | ||
20effc67 TL |
54 | namespace internal { |
55 | ||
56 | struct fs_info { | |
57 | uint32_t block_size; | |
58 | bool append_challenged; | |
59 | unsigned append_concurrency; | |
60 | bool fsync_is_exclusive; | |
61 | bool nowait_works; | |
62 | std::optional<dioattr> dioinfo; | |
63 | }; | |
64 | ||
65 | }; | |
66 | ||
9f95a23c TL |
67 | using namespace internal; |
68 | using namespace internal::linux_abi; | |
69 | ||
70 | file_handle::file_handle(const file_handle& x) | |
71 | : _impl(x._impl ? x._impl->clone() : std::unique_ptr<file_handle_impl>()) { | |
72 | } | |
73 | ||
74 | file_handle::file_handle(file_handle&& x) noexcept = default; | |
75 | ||
76 | file_handle& | |
77 | file_handle::operator=(const file_handle& x) { | |
78 | return operator=(file_handle(x)); | |
79 | } | |
80 | ||
81 | file_handle& | |
82 | file_handle::operator=(file_handle&&) noexcept = default; | |
83 | ||
84 | file | |
85 | file_handle::to_file() const & { | |
86 | return file_handle(*this).to_file(); | |
87 | } | |
88 | ||
89 | file | |
90 | file_handle::to_file() && { | |
91 | return file(std::move(*_impl).to_file()); | |
92 | } | |
93 | ||
20effc67 | 94 | posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, bool nowait_works) |
f67539c2 | 95 | : _device_id(device_id) |
20effc67 TL |
96 | , _nowait_works(nowait_works) |
97 | , _io_queue(engine().get_io_queue(_device_id)) | |
9f95a23c TL |
98 | , _open_flags(f) |
99 | , _fd(fd) | |
100 | { | |
20effc67 TL |
101 | configure_io_lengths(); |
102 | } | |
103 | ||
104 | posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, const internal::fs_info& fsi) | |
105 | : posix_file_impl(fd, f, options, device_id, fsi.nowait_works) | |
106 | { | |
107 | configure_dma_alignment(fsi); | |
9f95a23c TL |
108 | } |
109 | ||
110 | posix_file_impl::~posix_file_impl() { | |
111 | if (_refcount && _refcount->fetch_add(-1, std::memory_order_relaxed) != 1) { | |
112 | return; | |
113 | } | |
114 | delete _refcount; | |
115 | if (_fd != -1) { | |
116 | // Note: close() can be a blocking operation on NFS | |
117 | ::close(_fd); | |
118 | } | |
119 | } | |
120 | ||
121 | void | |
20effc67 TL |
122 | posix_file_impl::configure_dma_alignment(const internal::fs_info& fsi) { |
123 | if (fsi.dioinfo) { | |
124 | const dioattr& da = *fsi.dioinfo; | |
9f95a23c TL |
125 | _memory_dma_alignment = da.d_mem; |
126 | _disk_read_dma_alignment = da.d_miniosz; | |
127 | // xfs wants at least the block size for writes | |
128 | // FIXME: really read the block size | |
20effc67 TL |
129 | _disk_write_dma_alignment = std::max<unsigned>(da.d_miniosz, fsi.block_size); |
130 | static bool xfs_with_relaxed_overwrite_alignment = kernel_uname().whitelisted({"5.12"}); | |
131 | _disk_overwrite_dma_alignment = xfs_with_relaxed_overwrite_alignment ? da.d_miniosz : _disk_write_dma_alignment; | |
9f95a23c TL |
132 | } |
133 | } | |
134 | ||
20effc67 TL |
135 | void posix_file_impl::configure_io_lengths() noexcept { |
136 | auto limits = _io_queue.get_request_limits(); | |
137 | _read_max_length = std::min<size_t>(_read_max_length, limits.max_read); | |
138 | _write_max_length = std::min<size_t>(_write_max_length, limits.max_write); | |
139 | } | |
140 | ||
9f95a23c TL |
141 | std::unique_ptr<seastar::file_handle_impl> |
142 | posix_file_impl::dup() { | |
143 | if (!_refcount) { | |
144 | _refcount = new std::atomic<unsigned>(1u); | |
145 | } | |
20effc67 TL |
146 | auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id, |
147 | _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, | |
148 | _nowait_works); | |
9f95a23c TL |
149 | _refcount->fetch_add(1, std::memory_order_relaxed); |
150 | return ret; | |
151 | } | |
152 | ||
20effc67 TL |
153 | posix_file_impl::posix_file_impl(int fd, open_flags f, std::atomic<unsigned>* refcount, dev_t device_id, |
154 | uint32_t memory_dma_alignment, | |
155 | uint32_t disk_read_dma_alignment, | |
156 | uint32_t disk_write_dma_alignment, | |
157 | uint32_t disk_overwrite_dma_alignment, | |
158 | bool nowait_works) | |
f67539c2 TL |
159 | : _refcount(refcount) |
160 | , _device_id(device_id) | |
20effc67 TL |
161 | , _nowait_works(nowait_works) |
162 | , _io_queue(engine().get_io_queue(_device_id)) | |
f67539c2 TL |
163 | , _open_flags(f) |
164 | , _fd(fd) { | |
20effc67 TL |
165 | _memory_dma_alignment = memory_dma_alignment; |
166 | _disk_read_dma_alignment = disk_read_dma_alignment; | |
167 | _disk_write_dma_alignment = disk_write_dma_alignment; | |
168 | _disk_overwrite_dma_alignment = disk_overwrite_dma_alignment; | |
169 | configure_io_lengths(); | |
9f95a23c TL |
170 | } |
171 | ||
172 | future<> | |
f67539c2 | 173 | posix_file_impl::flush(void) noexcept { |
9f95a23c TL |
174 | if ((_open_flags & open_flags::dsync) != open_flags{}) { |
175 | return make_ready_future<>(); | |
176 | } | |
177 | return engine().fdatasync(_fd); | |
178 | } | |
179 | ||
180 | future<struct stat> | |
f67539c2 TL |
181 | posix_file_impl::stat() noexcept { |
182 | return engine().fstat(_fd); | |
9f95a23c TL |
183 | } |
184 | ||
185 | future<> | |
f67539c2 | 186 | posix_file_impl::truncate(uint64_t length) noexcept { |
9f95a23c TL |
187 | return engine()._thread_pool->submit<syscall_result<int>>([this, length] { |
188 | return wrap_syscall<int>(::ftruncate(_fd, length)); | |
189 | }).then([] (syscall_result<int> sr) { | |
190 | sr.throw_if_error(); | |
191 | return make_ready_future<>(); | |
192 | }); | |
193 | } | |
194 | ||
20effc67 TL |
195 | future<int> |
196 | posix_file_impl::ioctl(uint64_t cmd, void* argp) noexcept { | |
197 | return engine()._thread_pool->submit<syscall_result<int>>([this, cmd, argp] () mutable { | |
198 | return wrap_syscall<int>(::ioctl(_fd, cmd, argp)); | |
199 | }).then([] (syscall_result<int> sr) { | |
200 | sr.throw_if_error(); | |
201 | // Some ioctls require to return a positive integer back. | |
202 | return make_ready_future<int>(sr.result); | |
203 | }); | |
204 | } | |
205 | ||
206 | future<int> | |
207 | posix_file_impl::ioctl_short(uint64_t cmd, void* argp) noexcept { | |
208 | int ret = ::ioctl(_fd, cmd, argp); | |
209 | if (ret == -1) { | |
210 | return make_exception_future<int>( | |
211 | std::system_error(errno, std::system_category(), "ioctl failed")); | |
212 | } | |
213 | return make_ready_future<int>(ret); | |
214 | } | |
215 | ||
216 | future<int> | |
217 | posix_file_impl::fcntl(int op, uintptr_t arg) noexcept { | |
218 | return engine()._thread_pool->submit<syscall_result<int>>([this, op, arg] () mutable { | |
219 | return wrap_syscall<int>(::fcntl(_fd, op, arg)); | |
220 | }).then([] (syscall_result<int> sr) { | |
221 | sr.throw_if_error(); | |
222 | // Some fcntls require to return a positive integer back. | |
223 | return make_ready_future<int>(sr.result); | |
224 | }); | |
225 | } | |
226 | ||
227 | future<int> | |
228 | posix_file_impl::fcntl_short(int op, uintptr_t arg) noexcept { | |
229 | int ret = ::fcntl(_fd, op, arg); | |
230 | if (ret == -1) { | |
231 | return make_exception_future<int>( | |
232 | std::system_error(errno, std::system_category(), "fcntl failed")); | |
233 | } | |
234 | return make_ready_future<int>(ret); | |
235 | } | |
236 | ||
9f95a23c | 237 | future<> |
f67539c2 | 238 | posix_file_impl::discard(uint64_t offset, uint64_t length) noexcept { |
9f95a23c TL |
239 | return engine()._thread_pool->submit<syscall_result<int>>([this, offset, length] () mutable { |
240 | return wrap_syscall<int>(::fallocate(_fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, | |
241 | offset, length)); | |
242 | }).then([] (syscall_result<int> sr) { | |
243 | sr.throw_if_error(); | |
244 | return make_ready_future<>(); | |
245 | }); | |
246 | } | |
247 | ||
248 | future<> | |
f67539c2 | 249 | posix_file_impl::allocate(uint64_t position, uint64_t length) noexcept { |
9f95a23c TL |
250 | #ifdef FALLOC_FL_ZERO_RANGE |
251 | // FALLOC_FL_ZERO_RANGE is fairly new, so don't fail if it's not supported. | |
252 | static bool supported = true; | |
253 | if (!supported) { | |
254 | return make_ready_future<>(); | |
255 | } | |
256 | return engine()._thread_pool->submit<syscall_result<int>>([this, position, length] () mutable { | |
257 | auto ret = ::fallocate(_fd, FALLOC_FL_ZERO_RANGE|FALLOC_FL_KEEP_SIZE, position, length); | |
258 | if (ret == -1 && errno == EOPNOTSUPP) { | |
259 | ret = 0; | |
260 | supported = false; // Racy, but harmless. At most we issue an extra call or two. | |
261 | } | |
262 | return wrap_syscall<int>(ret); | |
263 | }).then([] (syscall_result<int> sr) { | |
264 | sr.throw_if_error(); | |
265 | return make_ready_future<>(); | |
266 | }); | |
267 | #else | |
268 | return make_ready_future<>(); | |
269 | #endif | |
270 | } | |
271 | ||
272 | future<uint64_t> | |
f67539c2 | 273 | posix_file_impl::size() noexcept { |
9f95a23c TL |
274 | auto r = ::lseek(_fd, 0, SEEK_END); |
275 | if (r == -1) { | |
276 | return make_exception_future<uint64_t>(std::system_error(errno, std::system_category())); | |
277 | } | |
278 | return make_ready_future<uint64_t>(r); | |
279 | } | |
280 | ||
281 | future<> | |
282 | posix_file_impl::close() noexcept { | |
283 | if (_fd == -1) { | |
284 | seastar_logger.warn("double close() detected, contact support"); | |
285 | return make_ready_future<>(); | |
286 | } | |
287 | auto fd = _fd; | |
288 | _fd = -1; // Prevent a concurrent close (which is illegal) from closing another file's fd | |
289 | if (_refcount && _refcount->fetch_add(-1, std::memory_order_relaxed) != 1) { | |
290 | _refcount = nullptr; | |
291 | return make_ready_future<>(); | |
292 | } | |
293 | delete _refcount; | |
294 | _refcount = nullptr; | |
295 | auto closed = [fd] () noexcept { | |
296 | try { | |
297 | return engine()._thread_pool->submit<syscall_result<int>>([fd] { | |
298 | return wrap_syscall<int>(::close(fd)); | |
299 | }); | |
300 | } catch (...) { | |
301 | report_exception("Running ::close() in reactor thread, submission failed with exception", std::current_exception()); | |
302 | return make_ready_future<syscall_result<int>>(wrap_syscall<int>(::close(fd))); | |
303 | } | |
304 | }(); | |
305 | return closed.then([] (syscall_result<int> sr) { | |
20effc67 | 306 | try { |
9f95a23c | 307 | sr.throw_if_error(); |
20effc67 TL |
308 | } catch (...) { |
309 | report_exception("close() syscall failed", std::current_exception()); | |
310 | } | |
9f95a23c TL |
311 | }); |
312 | } | |
313 | ||
314 | future<uint64_t> | |
f67539c2 | 315 | blockdev_file_impl::size(void) noexcept { |
9f95a23c TL |
316 | return engine()._thread_pool->submit<syscall_result_extra<size_t>>([this] { |
317 | uint64_t size; | |
318 | int ret = ::ioctl(_fd, BLKGETSIZE64, &size); | |
319 | return wrap_syscall(ret, size); | |
320 | }).then([] (syscall_result_extra<uint64_t> ret) { | |
321 | ret.throw_if_error(); | |
322 | return make_ready_future<uint64_t>(ret.extra); | |
323 | }); | |
324 | } | |
325 | ||
326 | subscription<directory_entry> | |
327 | posix_file_impl::list_directory(std::function<future<> (directory_entry de)> next) { | |
328 | static constexpr size_t buffer_size = 8192; | |
329 | struct work { | |
330 | stream<directory_entry> s; | |
331 | unsigned current = 0; | |
332 | unsigned total = 0; | |
333 | bool eof = false; | |
334 | int error = 0; | |
335 | char buffer[buffer_size]; | |
336 | }; | |
337 | ||
338 | // While it would be natural to use fdopendir()/readdir(), | |
339 | // our syscall thread pool doesn't support malloc(), which is | |
340 | // required for this to work. So resort to using getdents() | |
341 | // instead. | |
342 | ||
343 | // From getdents(2): | |
344 | struct linux_dirent64 { | |
345 | ino64_t d_ino; /* 64-bit inode number */ | |
346 | off64_t d_off; /* 64-bit offset to next structure */ | |
347 | unsigned short d_reclen; /* Size of this dirent */ | |
348 | unsigned char d_type; /* File type */ | |
349 | char d_name[]; /* Filename (null-terminated) */ | |
350 | }; | |
351 | ||
352 | auto w = make_lw_shared<work>(); | |
353 | auto ret = w->s.listen(std::move(next)); | |
354 | // List the directory asynchronously in the background. | |
355 | // Caller synchronizes using the returned subscription. | |
356 | (void)w->s.started().then([w, this] { | |
357 | auto eofcond = [w] { return w->eof; }; | |
358 | return do_until(eofcond, [w, this] { | |
359 | if (w->current == w->total) { | |
360 | return engine()._thread_pool->submit<syscall_result<long>>([w , this] () { | |
361 | auto ret = ::syscall(__NR_getdents64, _fd, reinterpret_cast<linux_dirent64*>(w->buffer), buffer_size); | |
362 | return wrap_syscall(ret); | |
363 | }).then([w] (syscall_result<long> ret) { | |
364 | ret.throw_if_error(); | |
365 | if (ret.result == 0) { | |
366 | w->eof = true; | |
367 | } else { | |
368 | w->current = 0; | |
369 | w->total = ret.result; | |
370 | } | |
371 | }); | |
372 | } | |
373 | auto start = w->buffer + w->current; | |
374 | auto de = reinterpret_cast<linux_dirent64*>(start); | |
f67539c2 | 375 | std::optional<directory_entry_type> type; |
9f95a23c TL |
376 | switch (de->d_type) { |
377 | case DT_BLK: | |
378 | type = directory_entry_type::block_device; | |
379 | break; | |
380 | case DT_CHR: | |
381 | type = directory_entry_type::char_device; | |
382 | break; | |
383 | case DT_DIR: | |
384 | type = directory_entry_type::directory; | |
385 | break; | |
386 | case DT_FIFO: | |
387 | type = directory_entry_type::fifo; | |
388 | break; | |
389 | case DT_REG: | |
390 | type = directory_entry_type::regular; | |
391 | break; | |
392 | case DT_LNK: | |
393 | type = directory_entry_type::link; | |
394 | break; | |
395 | case DT_SOCK: | |
396 | type = directory_entry_type::socket; | |
397 | break; | |
398 | default: | |
399 | // unknown, ignore | |
400 | ; | |
401 | } | |
402 | w->current += de->d_reclen; | |
403 | sstring name = de->d_name; | |
404 | if (name == "." || name == "..") { | |
405 | return make_ready_future<>(); | |
406 | } | |
407 | return w->s.produce({std::move(name), type}); | |
408 | }); | |
409 | }).then([w] { | |
410 | w->s.close(); | |
411 | }).handle_exception([] (std::exception_ptr ignored) {}); | |
412 | return ret; | |
413 | } | |
414 | ||
415 | future<size_t> | |
20effc67 TL |
416 | posix_file_impl::do_write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept { |
417 | auto req = internal::io_request::make_write(_fd, pos, buffer, len, _nowait_works); | |
418 | return engine().submit_io_write(&_io_queue, io_priority_class, len, std::move(req), intent); | |
9f95a23c TL |
419 | } |
420 | ||
421 | future<size_t> | |
20effc67 | 422 | posix_file_impl::do_write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept { |
9f95a23c | 423 | auto len = internal::sanitize_iovecs(iov, _disk_write_dma_alignment); |
20effc67 TL |
424 | auto req = internal::io_request::make_writev(_fd, pos, iov, _nowait_works); |
425 | return engine().submit_io_write(&_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {}); | |
9f95a23c TL |
426 | } |
427 | ||
428 | future<size_t> | |
20effc67 TL |
429 | posix_file_impl::do_read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept { |
430 | auto req = internal::io_request::make_read(_fd, pos, buffer, len, _nowait_works); | |
431 | return engine().submit_io_read(&_io_queue, io_priority_class, len, std::move(req), intent); | |
9f95a23c TL |
432 | } |
433 | ||
434 | future<size_t> | |
20effc67 | 435 | posix_file_impl::do_read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept { |
9f95a23c | 436 | auto len = internal::sanitize_iovecs(iov, _disk_read_dma_alignment); |
20effc67 TL |
437 | auto req = internal::io_request::make_readv(_fd, pos, iov, _nowait_works); |
438 | return engine().submit_io_read(&_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {}); | |
439 | } | |
440 | ||
441 | future<size_t> | |
442 | posix_file_real_impl::write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept { | |
443 | return posix_file_impl::do_write_dma(pos, buffer, len, pc, intent); | |
444 | } | |
445 | ||
446 | future<size_t> | |
447 | posix_file_real_impl::write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept { | |
448 | return posix_file_impl::do_write_dma(pos, std::move(iov), pc, intent); | |
449 | } | |
450 | ||
451 | future<size_t> | |
452 | posix_file_real_impl::read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept { | |
453 | return posix_file_impl::do_read_dma(pos, buffer, len, pc, intent); | |
454 | } | |
455 | ||
456 | future<size_t> | |
457 | posix_file_real_impl::read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept { | |
458 | return posix_file_impl::do_read_dma(pos, std::move(iov), pc, intent); | |
9f95a23c TL |
459 | } |
460 | ||
461 | future<temporary_buffer<uint8_t>> | |
20effc67 TL |
462 | posix_file_real_impl::dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept { |
463 | return posix_file_impl::do_dma_read_bulk(offset, range_size, pc, intent); | |
464 | } | |
465 | ||
466 | future<temporary_buffer<uint8_t>> | |
467 | posix_file_impl::do_dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept { | |
468 | using tmp_buf_type = typename internal::file_read_state<uint8_t>::tmp_buf_type; | |
9f95a23c | 469 | |
f67539c2 | 470 | try { |
9f95a23c TL |
471 | auto front = offset & (_disk_read_dma_alignment - 1); |
472 | offset -= front; | |
473 | range_size += front; | |
474 | ||
20effc67 | 475 | auto rstate = make_lw_shared<internal::file_read_state<uint8_t>>(offset, front, |
9f95a23c TL |
476 | range_size, |
477 | _memory_dma_alignment, | |
20effc67 TL |
478 | _disk_read_dma_alignment, |
479 | intent); | |
9f95a23c TL |
480 | |
481 | // | |
482 | // First, try to read directly into the buffer. Most of the reads will | |
483 | // end here. | |
484 | // | |
485 | auto read = read_dma(offset, rstate->buf.get_write(), | |
20effc67 | 486 | rstate->buf.size(), pc, intent); |
9f95a23c TL |
487 | |
488 | return read.then([rstate, this, &pc] (size_t size) mutable { | |
489 | rstate->pos = size; | |
490 | ||
491 | // | |
492 | // If we haven't read all required data at once - | |
493 | // start read-copy sequence. We can't continue with direct reads | |
494 | // into the previously allocated buffer here since we have to ensure | |
495 | // the aligned read length and thus the aligned destination buffer | |
496 | // size. | |
497 | // | |
498 | // The copying will actually take place only if there was a HW glitch. | |
499 | // In EOF case or in case of a persistent I/O error the only overhead is | |
500 | // an extra allocation. | |
501 | // | |
502 | return do_until( | |
503 | [rstate] { return rstate->done(); }, | |
504 | [rstate, this, &pc] () mutable { | |
505 | return read_maybe_eof( | |
20effc67 | 506 | rstate->cur_offset(), rstate->left_to_read(), pc, rstate->get_intent()).then( |
9f95a23c TL |
507 | [rstate] (auto buf1) mutable { |
508 | if (buf1.size()) { | |
509 | rstate->append_new_data(buf1); | |
510 | } else { | |
511 | rstate->eof = true; | |
512 | } | |
513 | ||
514 | return make_ready_future<>(); | |
515 | }); | |
516 | }).then([rstate] () mutable { | |
517 | // | |
518 | // If we are here we are promised to have read some bytes beyond | |
519 | // "front" so we may trim straight away. | |
520 | // | |
521 | rstate->trim_buf_before_ret(); | |
522 | return make_ready_future<tmp_buf_type>(std::move(rstate->buf)); | |
523 | }); | |
524 | }); | |
f67539c2 TL |
525 | } catch (...) { |
526 | return make_exception_future<tmp_buf_type>(std::current_exception()); | |
527 | } | |
9f95a23c TL |
528 | } |
529 | ||
530 | future<temporary_buffer<uint8_t>> | |
20effc67 | 531 | posix_file_impl::read_maybe_eof(uint64_t pos, size_t len, const io_priority_class& pc, io_intent* intent) { |
9f95a23c TL |
532 | // |
533 | // We have to allocate a new aligned buffer to make sure we don't get | |
534 | // an EINVAL error due to unaligned destination buffer. | |
535 | // | |
536 | temporary_buffer<uint8_t> buf = temporary_buffer<uint8_t>::aligned( | |
537 | _memory_dma_alignment, align_up(len, size_t(_disk_read_dma_alignment))); | |
538 | ||
539 | // try to read a single bulk from the given position | |
540 | auto dst = buf.get_write(); | |
541 | auto buf_size = buf.size(); | |
20effc67 | 542 | return read_dma(pos, dst, buf_size, pc, intent).then_wrapped( |
9f95a23c TL |
543 | [buf = std::move(buf)](future<size_t> f) mutable { |
544 | try { | |
f67539c2 | 545 | size_t size = f.get0(); |
9f95a23c TL |
546 | |
547 | buf.trim(size); | |
548 | ||
549 | return std::move(buf); | |
550 | } catch (std::system_error& e) { | |
551 | // | |
552 | // TODO: implement a non-trowing file_impl::dma_read() interface to | |
553 | // avoid the exceptions throwing in a good flow completely. | |
554 | // Otherwise for users that don't want to care about the | |
555 | // underlying file size and preventing the attempts to read | |
556 | // bytes beyond EOF there will always be at least one | |
557 | // exception throwing at the file end for files with unaligned | |
558 | // length. | |
559 | // | |
560 | if (e.code().value() == EINVAL) { | |
561 | buf.trim(0); | |
562 | return std::move(buf); | |
563 | } else { | |
564 | throw; | |
565 | } | |
566 | } | |
567 | }); | |
568 | } | |
569 | ||
20effc67 TL |
570 | static bool blockdev_nowait_works = kernel_uname().whitelisted({"4.13"}); |
571 | ||
572 | blockdev_file_impl::blockdev_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, size_t block_size) | |
573 | : posix_file_impl(fd, f, options, device_id, blockdev_nowait_works) { | |
574 | // FIXME -- configure file_impl::_..._dma_alignment's from block_size | |
9f95a23c TL |
575 | } |
576 | ||
577 | future<> | |
f67539c2 | 578 | blockdev_file_impl::truncate(uint64_t length) noexcept { |
9f95a23c TL |
579 | return make_ready_future<>(); |
580 | } | |
581 | ||
582 | future<> | |
f67539c2 | 583 | blockdev_file_impl::discard(uint64_t offset, uint64_t length) noexcept { |
9f95a23c TL |
584 | return engine()._thread_pool->submit<syscall_result<int>>([this, offset, length] () mutable { |
585 | uint64_t range[2] { offset, length }; | |
586 | return wrap_syscall<int>(::ioctl(_fd, BLKDISCARD, &range)); | |
587 | }).then([] (syscall_result<int> sr) { | |
588 | sr.throw_if_error(); | |
589 | return make_ready_future<>(); | |
590 | }); | |
591 | } | |
592 | ||
593 | future<> | |
f67539c2 | 594 | blockdev_file_impl::allocate(uint64_t position, uint64_t length) noexcept { |
9f95a23c TL |
595 | // nothing to do for block device |
596 | return make_ready_future<>(); | |
597 | } | |
598 | ||
20effc67 TL |
599 | future<size_t> |
600 | blockdev_file_impl::write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept { | |
601 | return posix_file_impl::do_write_dma(pos, buffer, len, pc, intent); | |
602 | } | |
603 | ||
604 | future<size_t> | |
605 | blockdev_file_impl::write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept { | |
606 | return posix_file_impl::do_write_dma(pos, std::move(iov), pc, intent); | |
607 | } | |
608 | ||
609 | future<size_t> | |
610 | blockdev_file_impl::read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept { | |
611 | return posix_file_impl::do_read_dma(pos, buffer, len, pc, intent); | |
612 | } | |
613 | ||
614 | future<size_t> | |
615 | blockdev_file_impl::read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept { | |
616 | return posix_file_impl::do_read_dma(pos, std::move(iov), pc, intent); | |
617 | } | |
618 | ||
619 | future<temporary_buffer<uint8_t>> | |
620 | blockdev_file_impl::dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept { | |
621 | return posix_file_impl::do_dma_read_bulk(offset, range_size, pc, intent); | |
622 | } | |
623 | ||
624 | append_challenged_posix_file_impl::append_challenged_posix_file_impl(int fd, open_flags f, file_open_options options, const fs_info& fsi, dev_t device_id) | |
625 | : posix_file_impl(fd, f, options, device_id, fsi) | |
626 | , _max_size_changing_ops(fsi.append_concurrency) | |
627 | , _fsync_is_exclusive(fsi.fsync_is_exclusive) | |
628 | , _sloppy_size(options.sloppy_size) | |
629 | , _sloppy_size_hint(align_up<uint64_t>(options.sloppy_size_hint, _disk_write_dma_alignment)) | |
630 | { | |
9f95a23c TL |
631 | auto r = ::lseek(fd, 0, SEEK_END); |
632 | throw_system_error_on(r == -1); | |
633 | _committed_size = _logical_size = r; | |
9f95a23c TL |
634 | } |
635 | ||
636 | append_challenged_posix_file_impl::~append_challenged_posix_file_impl() { | |
637 | // If the file has not been closed we risk having running tasks | |
638 | // that will try to access freed memory. | |
f67539c2 TL |
639 | // |
640 | // It is safe to destory it if nothing is queued. | |
641 | // Note that posix_file_impl::~posix_file_impl auto-closes the file descriptor. | |
20effc67 | 642 | assert(_q.empty() && (_logical_size == _committed_size || _closing_state == state::closed)); |
9f95a23c TL |
643 | } |
644 | ||
645 | bool | |
646 | append_challenged_posix_file_impl::must_run_alone(const op& candidate) const noexcept { | |
647 | // checks if candidate is a non-write, size-changing operation. | |
648 | return (candidate.type == opcode::truncate) | |
649 | || (candidate.type == opcode::flush && (_fsync_is_exclusive || _sloppy_size)); | |
650 | } | |
651 | ||
20effc67 TL |
652 | bool |
653 | append_challenged_posix_file_impl::appending_write(const op& candidate) const noexcept { | |
654 | return (candidate.type == opcode::write) && | |
655 | (candidate.pos + candidate.len > _committed_size); | |
656 | } | |
657 | ||
9f95a23c TL |
658 | bool |
659 | append_challenged_posix_file_impl::size_changing(const op& candidate) const noexcept { | |
20effc67 | 660 | return appending_write(candidate) || must_run_alone(candidate); |
9f95a23c TL |
661 | } |
662 | ||
663 | bool | |
664 | append_challenged_posix_file_impl::may_dispatch(const op& candidate) const noexcept { | |
665 | if (size_changing(candidate)) { | |
666 | return !_current_size_changing_ops && !_current_non_size_changing_ops; | |
667 | } else { | |
668 | return !_current_size_changing_ops; | |
669 | } | |
670 | } | |
671 | ||
672 | void | |
673 | append_challenged_posix_file_impl::dispatch(op& candidate) noexcept { | |
674 | unsigned* op_counter = size_changing(candidate) | |
675 | ? &_current_size_changing_ops : &_current_non_size_changing_ops; | |
676 | ++*op_counter; | |
677 | // FIXME: future is discarded | |
678 | (void)candidate.run().then([me = shared_from_this(), op_counter] { | |
679 | --*op_counter; | |
680 | me->process_queue(); | |
681 | }); | |
682 | } | |
683 | ||
20effc67 TL |
684 | int append_challenged_posix_file_impl::truncate_sync(uint64_t length) noexcept { |
685 | int r = ::ftruncate(_fd, length); | |
686 | if (r != -1) { | |
687 | _committed_size = length; | |
688 | } | |
689 | return r; | |
690 | } | |
691 | ||
692 | void append_challenged_posix_file_impl::truncate_to_logical_size() { | |
693 | auto r = truncate_sync(_logical_size); | |
694 | if (r == -1) { | |
695 | throw std::system_error(errno, std::system_category(), "truncate"); | |
696 | } | |
697 | } | |
698 | ||
9f95a23c TL |
699 | // If we have a bunch of size-extending writes in the queue, |
700 | // issue an ftruncate() extending the file size, so they can | |
701 | // be issued concurrently. | |
702 | void | |
703 | append_challenged_posix_file_impl::optimize_queue() noexcept { | |
704 | if (_current_non_size_changing_ops || _current_size_changing_ops) { | |
705 | // Can't issue an ftruncate() if something is going on | |
706 | return; | |
707 | } | |
708 | auto speculative_size = _committed_size; | |
709 | unsigned n_appending_writes = 0; | |
710 | for (const auto& op : _q) { | |
711 | // stop calculating speculative size after a non-write, size-changing | |
712 | // operation is found to prevent an useless truncate from being issued. | |
713 | if (must_run_alone(op)) { | |
714 | break; | |
715 | } | |
20effc67 TL |
716 | |
717 | if (appending_write(op)) { | |
9f95a23c TL |
718 | speculative_size = std::max(speculative_size, op.pos + op.len); |
719 | ++n_appending_writes; | |
720 | } | |
721 | } | |
722 | if (n_appending_writes > _max_size_changing_ops | |
723 | || (n_appending_writes && _sloppy_size)) { | |
20effc67 TL |
724 | if (_sloppy_size) { |
725 | if (!_committed_size) { | |
726 | speculative_size = std::max(speculative_size, _sloppy_size_hint); | |
727 | } else if (speculative_size < 2 * _committed_size) { | |
9f95a23c | 728 | speculative_size = align_up<uint64_t>(2 * _committed_size, _disk_write_dma_alignment); |
20effc67 | 729 | } |
9f95a23c TL |
730 | } |
731 | // We're all alone, so issuing the ftruncate() in the reactor | |
732 | // thread won't block us. | |
733 | // | |
734 | // Issuing it in the syscall thread is too slow; this can happen | |
735 | // every several ops, and the syscall thread latency can be very | |
736 | // high. | |
20effc67 TL |
737 | |
738 | truncate_sync(speculative_size); | |
739 | // If we failed, the next write will pick it up. | |
9f95a23c TL |
740 | } |
741 | } | |
742 | ||
743 | void | |
744 | append_challenged_posix_file_impl::process_queue() noexcept { | |
745 | optimize_queue(); | |
746 | while (!_q.empty() && may_dispatch(_q.front())) { | |
747 | op candidate = std::move(_q.front()); | |
748 | _q.pop_front(); | |
749 | dispatch(candidate); | |
750 | } | |
751 | if (may_quit()) { | |
752 | _completed.set_value(); | |
753 | _closing_state = state::closing; // prevents _completed to be signaled again in case of recursion | |
754 | } | |
755 | } | |
756 | ||
757 | void | |
f67539c2 | 758 | append_challenged_posix_file_impl::enqueue_op(op&& op) { |
9f95a23c TL |
759 | _q.push_back(std::move(op)); |
760 | process_queue(); | |
761 | } | |
762 | ||
763 | bool | |
764 | append_challenged_posix_file_impl::may_quit() const noexcept { | |
765 | return _closing_state == state::draining && _q.empty() && !_current_non_size_changing_ops && | |
766 | !_current_size_changing_ops; | |
767 | } | |
768 | ||
769 | void | |
770 | append_challenged_posix_file_impl::commit_size(uint64_t size) noexcept { | |
771 | _committed_size = std::max(size, _committed_size); | |
772 | _logical_size = std::max(size, _logical_size); | |
773 | } | |
774 | ||
775 | future<size_t> | |
20effc67 | 776 | append_challenged_posix_file_impl::read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept { |
9f95a23c TL |
777 | if (pos >= _logical_size) { |
778 | // later() avoids tail recursion | |
779 | return later().then([] { | |
780 | return size_t(0); | |
781 | }); | |
782 | } | |
783 | len = std::min(pos + len, align_up<uint64_t>(_logical_size, _disk_read_dma_alignment)) - pos; | |
20effc67 | 784 | internal::intent_reference iref(intent); |
f67539c2 | 785 | return enqueue<size_t>( |
9f95a23c TL |
786 | opcode::read, |
787 | pos, | |
788 | len, | |
20effc67 TL |
789 | [this, pos, buffer, len, &pc, iref = std::move(iref)] () mutable { |
790 | return posix_file_impl::do_read_dma(pos, buffer, len, pc, iref.retrieve()); | |
9f95a23c | 791 | } |
f67539c2 | 792 | ); |
9f95a23c TL |
793 | } |
794 | ||
795 | future<size_t> | |
20effc67 | 796 | append_challenged_posix_file_impl::read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept { |
9f95a23c TL |
797 | if (pos >= _logical_size) { |
798 | // later() avoids tail recursion | |
799 | return later().then([] { | |
800 | return size_t(0); | |
801 | }); | |
802 | } | |
803 | size_t len = 0; | |
804 | auto i = iov.begin(); | |
f67539c2 TL |
805 | auto aligned_logical_size = align_up<uint64_t>(_logical_size, _disk_read_dma_alignment); |
806 | while (i != iov.end() && pos + len + i->iov_len <= aligned_logical_size) { | |
9f95a23c TL |
807 | len += i++->iov_len; |
808 | } | |
9f95a23c TL |
809 | if (i != iov.end()) { |
810 | auto last_len = pos + len + i->iov_len - aligned_logical_size; | |
811 | if (last_len) { | |
812 | i++->iov_len = last_len; | |
813 | } | |
814 | iov.erase(i, iov.end()); | |
815 | } | |
20effc67 | 816 | internal::intent_reference iref(intent); |
f67539c2 | 817 | return enqueue<size_t>( |
9f95a23c TL |
818 | opcode::read, |
819 | pos, | |
820 | len, | |
20effc67 TL |
821 | [this, pos, iov = std::move(iov), &pc, iref = std::move(iref)] () mutable { |
822 | return posix_file_impl::do_read_dma(pos, std::move(iov), pc, iref.retrieve()); | |
9f95a23c | 823 | } |
f67539c2 | 824 | ); |
9f95a23c TL |
825 | } |
826 | ||
827 | future<size_t> | |
20effc67 TL |
828 | append_challenged_posix_file_impl::write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept { |
829 | internal::intent_reference iref(intent); | |
f67539c2 | 830 | return enqueue<size_t>( |
9f95a23c TL |
831 | opcode::write, |
832 | pos, | |
833 | len, | |
20effc67 TL |
834 | [this, pos, buffer, len, &pc, iref = std::move(iref)] { |
835 | return posix_file_impl::do_write_dma(pos, buffer, len, pc, iref.retrieve()).then([this, pos] (size_t ret) { | |
f67539c2 TL |
836 | commit_size(pos + ret); |
837 | return make_ready_future<size_t>(ret); | |
9f95a23c TL |
838 | }); |
839 | } | |
f67539c2 | 840 | ); |
9f95a23c TL |
841 | } |
842 | ||
843 | future<size_t> | |
20effc67 | 844 | append_challenged_posix_file_impl::write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept { |
9f95a23c | 845 | auto len = boost::accumulate(iov | boost::adaptors::transformed(std::mem_fn(&iovec::iov_len)), size_t(0)); |
20effc67 | 846 | internal::intent_reference iref(intent); |
f67539c2 | 847 | return enqueue<size_t>( |
9f95a23c TL |
848 | opcode::write, |
849 | pos, | |
850 | len, | |
20effc67 TL |
851 | [this, pos, iov = std::move(iov), &pc, iref = std::move(iref)] () mutable { |
852 | return posix_file_impl::do_write_dma(pos, std::move(iov), pc, iref.retrieve()).then([this, pos] (size_t ret) { | |
f67539c2 TL |
853 | commit_size(pos + ret); |
854 | return make_ready_future<size_t>(ret); | |
9f95a23c TL |
855 | }); |
856 | } | |
f67539c2 | 857 | ); |
9f95a23c TL |
858 | } |
859 | ||
20effc67 TL |
860 | future<temporary_buffer<uint8_t>> |
861 | append_challenged_posix_file_impl::dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept { | |
862 | return posix_file_impl::do_dma_read_bulk(offset, range_size, pc, intent); | |
863 | } | |
864 | ||
9f95a23c | 865 | future<> |
f67539c2 | 866 | append_challenged_posix_file_impl::flush() noexcept { |
9f95a23c TL |
867 | if ((!_sloppy_size || _logical_size == _committed_size) && !_fsync_is_exclusive) { |
868 | // FIXME: determine if flush can block concurrent reads or writes | |
869 | return posix_file_impl::flush(); | |
870 | } else { | |
f67539c2 | 871 | return enqueue<>( |
9f95a23c TL |
872 | opcode::flush, |
873 | 0, | |
874 | 0, | |
f67539c2 TL |
875 | [this] () { |
876 | if (_logical_size != _committed_size) { | |
877 | // We're all alone, so can truncate in reactor thread | |
20effc67 | 878 | truncate_to_logical_size(); |
f67539c2 TL |
879 | } |
880 | return posix_file_impl::flush(); | |
9f95a23c | 881 | } |
f67539c2 | 882 | ); |
9f95a23c TL |
883 | } |
884 | } | |
885 | ||
886 | future<struct stat> | |
f67539c2 | 887 | append_challenged_posix_file_impl::stat() noexcept { |
9f95a23c TL |
888 | // FIXME: can this conflict with anything? |
889 | return posix_file_impl::stat().then([this] (struct stat stat) { | |
890 | stat.st_size = _logical_size; | |
891 | return stat; | |
892 | }); | |
893 | } | |
894 | ||
895 | future<> | |
f67539c2 TL |
896 | append_challenged_posix_file_impl::truncate(uint64_t length) noexcept { |
897 | return enqueue<>( | |
9f95a23c TL |
898 | opcode::truncate, |
899 | length, | |
900 | 0, | |
f67539c2 TL |
901 | [this, length] () mutable { |
902 | return posix_file_impl::truncate(length).then([this, length] () mutable { | |
903 | _committed_size = _logical_size = length; | |
9f95a23c TL |
904 | }); |
905 | } | |
f67539c2 | 906 | ); |
9f95a23c TL |
907 | } |
908 | ||
909 | future<uint64_t> | |
f67539c2 | 910 | append_challenged_posix_file_impl::size() noexcept { |
9f95a23c TL |
911 | return make_ready_future<size_t>(_logical_size); |
912 | } | |
913 | ||
914 | future<> | |
915 | append_challenged_posix_file_impl::close() noexcept { | |
916 | // Caller should have drained all pending I/O | |
917 | _closing_state = state::draining; | |
918 | process_queue(); | |
919 | return _completed.get_future().then([this] { | |
920 | if (_logical_size != _committed_size) { | |
20effc67 TL |
921 | truncate_to_logical_size(); |
922 | } | |
923 | }).then_wrapped([this] (future<> f) { | |
924 | if (f.failed()) { | |
925 | report_exception("Closing append_challenged_posix_file_impl failed.", f.get_exception()); | |
9f95a23c TL |
926 | } |
927 | return posix_file_impl::close().finally([this] { _closing_state = state::closed; }); | |
928 | }); | |
929 | } | |
930 | ||
931 | posix_file_handle_impl::~posix_file_handle_impl() { | |
932 | if (_refcount && _refcount->fetch_add(-1, std::memory_order_relaxed) == 1) { | |
933 | ::close(_fd); | |
934 | delete _refcount; | |
935 | } | |
936 | } | |
937 | ||
938 | std::unique_ptr<seastar::file_handle_impl> | |
939 | posix_file_handle_impl::clone() const { | |
20effc67 TL |
940 | auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id, |
941 | _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works); | |
9f95a23c TL |
942 | if (_refcount) { |
943 | _refcount->fetch_add(1, std::memory_order_relaxed); | |
944 | } | |
945 | return ret; | |
946 | } | |
947 | ||
948 | shared_ptr<file_impl> | |
949 | posix_file_handle_impl::to_file() && { | |
20effc67 TL |
950 | auto ret = ::seastar::make_shared<posix_file_real_impl>(_fd, _open_flags, _refcount, _device_id, |
951 | _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works); | |
9f95a23c TL |
952 | _fd = -1; |
953 | _refcount = nullptr; | |
954 | return ret; | |
955 | } | |
956 | ||
957 | // Some kernels can append to xfs filesystems, some cannot; determine | |
958 | // from kernel version. | |
959 | static | |
960 | unsigned | |
961 | xfs_concurrency_from_kernel_version() { | |
962 | // try to see if this is a mainline kernel with xfs append fixed (3.15+) | |
963 | // or a RHEL kernel with the backported fix (3.10.0-325.el7+) | |
964 | if (kernel_uname().whitelisted({"3.15", "3.10.0-325.el7"})) { | |
965 | // Can append, but not concurrently | |
966 | return 1; | |
967 | } | |
968 | // Cannot append at all; need ftrucnate(). | |
969 | return 0; | |
970 | } | |
971 | ||
f67539c2 TL |
972 | future<shared_ptr<file_impl>> |
973 | make_file_impl(int fd, file_open_options options, int flags) noexcept { | |
974 | return engine().fstat(fd).then([fd, options = std::move(options), flags] (struct stat st) mutable { | |
975 | auto st_dev = st.st_dev; | |
9f95a23c | 976 | |
f67539c2 | 977 | if (S_ISBLK(st.st_mode)) { |
20effc67 TL |
978 | size_t block_size; |
979 | auto ret = ::ioctl(fd, BLKBSZGET, &block_size); | |
980 | if (ret == -1) { | |
981 | return make_exception_future<shared_ptr<file_impl>>( | |
982 | std::system_error(errno, std::system_category(), "ioctl(BLKBSZGET) failed")); | |
983 | } | |
984 | return make_ready_future<shared_ptr<file_impl>>(make_shared<blockdev_file_impl>(fd, open_flags(flags), options, st_dev, block_size)); | |
f67539c2 | 985 | } else { |
20effc67 TL |
986 | if (S_ISDIR(st.st_mode)) { |
987 | // Directories don't care about block size, so we need not | |
988 | // query it here. Just provide something reasonable. | |
989 | internal::fs_info fsi; | |
990 | fsi.block_size = 4096; | |
991 | fsi.nowait_works = false; | |
992 | return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, fsi, st_dev)); | |
9f95a23c | 993 | } |
20effc67 TL |
994 | static thread_local std::unordered_map<decltype(st_dev), fs_info> s_fstype; |
995 | future<> get_fs_info = s_fstype.count(st_dev) ? make_ready_future<>() : | |
996 | engine().fstatfs(fd).then([fd, st_dev] (struct statfs sfs) { | |
997 | internal::fs_info fsi; | |
998 | fsi.block_size = sfs.f_bsize; | |
f67539c2 TL |
999 | switch (sfs.f_type) { |
1000 | case 0x58465342: /* XFS */ | |
20effc67 TL |
1001 | dioattr da; |
1002 | if (::ioctl(fd, XFS_IOC_DIOINFO, &da) == 0) { | |
1003 | fsi.dioinfo = std::move(da); | |
1004 | } | |
1005 | ||
1006 | fsi.append_challenged = true; | |
f67539c2 | 1007 | static auto xc = xfs_concurrency_from_kernel_version(); |
20effc67 TL |
1008 | fsi.append_concurrency = xc; |
1009 | fsi.fsync_is_exclusive = true; | |
1010 | fsi.nowait_works = kernel_uname().whitelisted({"4.13"}); | |
f67539c2 TL |
1011 | break; |
1012 | case 0x6969: /* NFS */ | |
20effc67 TL |
1013 | fsi.append_challenged = false; |
1014 | fsi.append_concurrency = 0; | |
1015 | fsi.fsync_is_exclusive = false; | |
1016 | fsi.nowait_works = kernel_uname().whitelisted({"4.13"}); | |
f67539c2 TL |
1017 | break; |
1018 | case 0xEF53: /* EXT4 */ | |
20effc67 TL |
1019 | fsi.append_challenged = true; |
1020 | fsi.append_concurrency = 0; | |
1021 | fsi.fsync_is_exclusive = false; | |
1022 | fsi.nowait_works = kernel_uname().whitelisted({"5.5"}); | |
1023 | break; | |
1024 | case 0x9123683E: /* BTRFS */ | |
1025 | fsi.append_challenged = true; | |
1026 | fsi.append_concurrency = 0; | |
1027 | fsi.fsync_is_exclusive = true; | |
1028 | fsi.nowait_works = kernel_uname().whitelisted({"5.9"}); | |
1029 | break; | |
1030 | case 0x01021994: /* TMPFS */ | |
1031 | case 0x65735546: /* FUSE */ | |
1032 | fsi.append_challenged = false; | |
1033 | fsi.append_concurrency = 999; | |
1034 | fsi.fsync_is_exclusive = false; | |
1035 | fsi.nowait_works = false; | |
f67539c2 TL |
1036 | break; |
1037 | default: | |
20effc67 TL |
1038 | fsi.append_challenged = true; |
1039 | fsi.append_concurrency = 0; | |
1040 | fsi.fsync_is_exclusive = true; | |
1041 | fsi.nowait_works = false; | |
f67539c2 | 1042 | } |
20effc67 | 1043 | s_fstype[st_dev] = std::move(fsi); |
f67539c2 | 1044 | }); |
20effc67 TL |
1045 | return get_fs_info.then([st_dev, fd, flags, options = std::move(options)] () mutable { |
1046 | const fs_info& fsi = s_fstype[st_dev]; | |
1047 | if (!fsi.append_challenged || options.append_is_unlikely || ((flags & O_ACCMODE) == O_RDONLY)) { | |
1048 | return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), std::move(options), fsi, st_dev)); | |
f67539c2 | 1049 | } |
20effc67 | 1050 | return make_ready_future<shared_ptr<file_impl>>(make_shared<append_challenged_posix_file_impl>(fd, open_flags(flags), std::move(options), fsi, st_dev)); |
f67539c2 | 1051 | }); |
9f95a23c | 1052 | } |
f67539c2 TL |
1053 | }); |
1054 | } | |
1055 | ||
1056 | file::file(seastar::file_handle&& handle) noexcept | |
1057 | : _file_impl(std::move(std::move(handle).to_file()._file_impl)) { | |
1058 | } | |
1059 | ||
1060 | future<uint64_t> file::size() const noexcept { | |
1061 | try { | |
1062 | return _file_impl->size(); | |
1063 | } catch (...) { | |
1064 | return current_exception_as_future<uint64_t>(); | |
1065 | } | |
1066 | } | |
1067 | ||
1068 | future<> file::close() noexcept { | |
20effc67 TL |
1069 | auto f = std::move(_file_impl); |
1070 | return f->close().handle_exception([f = std::move(f)] (std::exception_ptr ex) { | |
1071 | report_exception("Closing the file failed unexpectedly", std::move(ex)); | |
f67539c2 TL |
1072 | }); |
1073 | } | |
1074 | ||
1075 | subscription<directory_entry> | |
1076 | file::list_directory(std::function<future<>(directory_entry de)> next) { | |
1077 | return _file_impl->list_directory(std::move(next)); | |
1078 | } | |
1079 | ||
20effc67 TL |
1080 | future<int> file::ioctl(uint64_t cmd, void* argp) noexcept { |
1081 | try { | |
1082 | return _file_impl->ioctl(cmd, argp); | |
1083 | } catch (...) { | |
1084 | return current_exception_as_future<int>(); | |
1085 | } | |
1086 | } | |
1087 | ||
1088 | future<int> file::ioctl_short(uint64_t cmd, void* argp) noexcept { | |
1089 | try { | |
1090 | return _file_impl->ioctl_short(cmd, argp); | |
1091 | } catch (...) { | |
1092 | return current_exception_as_future<int>(); | |
1093 | } | |
1094 | } | |
1095 | ||
1096 | future<int> file::fcntl(int op, uintptr_t arg) noexcept { | |
1097 | try { | |
1098 | return _file_impl->fcntl(op, arg); | |
1099 | } catch (...) { | |
1100 | return current_exception_as_future<int>(); | |
1101 | } | |
1102 | } | |
1103 | ||
1104 | future<int> file::fcntl_short(int op, uintptr_t arg) noexcept { | |
1105 | try { | |
1106 | return _file_impl->fcntl_short(op, arg); | |
1107 | } catch (...) { | |
1108 | return current_exception_as_future<int>(); | |
1109 | } | |
1110 | } | |
1111 | ||
1112 | future<> file::set_lifetime_hint_impl(int op, uint64_t hint) noexcept { | |
1113 | return do_with(hint, [op, this] (uint64_t& arg) { | |
1114 | try { | |
1115 | return _file_impl->fcntl(op, (uintptr_t)&arg).then_wrapped([] (future<int> f) { | |
1116 | // Need to handle return value differently from that of fcntl | |
1117 | if (f.failed()) { | |
1118 | return make_exception_future<>(f.get_exception()); | |
1119 | } | |
1120 | return make_ready_future<>(); | |
1121 | }); | |
1122 | } catch (...) { | |
1123 | return current_exception_as_future<>(); | |
1124 | } | |
1125 | }); | |
1126 | } | |
1127 | ||
1128 | future<> file::set_file_lifetime_hint(uint64_t hint) noexcept { | |
1129 | return set_lifetime_hint_impl(F_SET_FILE_RW_HINT, hint); | |
1130 | } | |
1131 | ||
1132 | future<> file::set_inode_lifetime_hint(uint64_t hint) noexcept { | |
1133 | return set_lifetime_hint_impl(F_SET_RW_HINT, hint); | |
1134 | } | |
1135 | ||
1136 | future<uint64_t> file::get_lifetime_hint_impl(int op) noexcept { | |
1137 | return do_with(uint64_t(0), [op, this] (uint64_t& arg) { | |
1138 | try { | |
1139 | return _file_impl->fcntl(op, (uintptr_t)&arg).then_wrapped([&arg] (future<int> f) { | |
1140 | // Need to handle return value differently from that of fcntl | |
1141 | if (f.failed()) { | |
1142 | return make_exception_future<uint64_t>(f.get_exception()); | |
1143 | } | |
1144 | return make_ready_future<uint64_t>(arg); | |
1145 | }); | |
1146 | } catch (...) { | |
1147 | return current_exception_as_future<uint64_t>(); | |
1148 | } | |
1149 | }); | |
1150 | } | |
1151 | ||
1152 | future<uint64_t> file::get_file_lifetime_hint() noexcept { | |
1153 | return get_lifetime_hint_impl(F_GET_FILE_RW_HINT); | |
1154 | } | |
1155 | ||
1156 | future<uint64_t> file::get_inode_lifetime_hint() noexcept { | |
1157 | return get_lifetime_hint_impl(F_GET_RW_HINT); | |
1158 | } | |
1159 | ||
f67539c2 | 1160 | future<temporary_buffer<uint8_t>> |
20effc67 | 1161 | file::dma_read_bulk_impl(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept { |
f67539c2 | 1162 | try { |
20effc67 | 1163 | return _file_impl->dma_read_bulk(offset, range_size, pc, intent); |
f67539c2 TL |
1164 | } catch (...) { |
1165 | return current_exception_as_future<temporary_buffer<uint8_t>>(); | |
1166 | } | |
1167 | } | |
1168 | ||
1169 | future<> file::discard(uint64_t offset, uint64_t length) noexcept { | |
1170 | try { | |
1171 | return _file_impl->discard(offset, length); | |
1172 | } catch (...) { | |
1173 | return current_exception_as_future(); | |
1174 | } | |
1175 | } | |
1176 | ||
1177 | future<> file::allocate(uint64_t position, uint64_t length) noexcept { | |
1178 | try { | |
1179 | return _file_impl->allocate(position, length); | |
1180 | } catch (...) { | |
1181 | return current_exception_as_future(); | |
1182 | } | |
1183 | } | |
1184 | ||
1185 | future<> file::truncate(uint64_t length) noexcept { | |
1186 | try { | |
1187 | return _file_impl->truncate(length); | |
1188 | } catch (...) { | |
1189 | return current_exception_as_future(); | |
1190 | } | |
1191 | } | |
1192 | ||
1193 | future<struct stat> file::stat() noexcept { | |
1194 | try { | |
1195 | return _file_impl->stat(); | |
1196 | } catch (...) { | |
1197 | return current_exception_as_future<struct stat>(); | |
1198 | } | |
1199 | } | |
1200 | ||
1201 | future<> file::flush() noexcept { | |
1202 | try { | |
1203 | return _file_impl->flush(); | |
1204 | } catch (...) { | |
1205 | return current_exception_as_future(); | |
1206 | } | |
1207 | } | |
1208 | ||
20effc67 | 1209 | future<size_t> file::dma_write(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept { |
f67539c2 | 1210 | try { |
20effc67 | 1211 | return _file_impl->write_dma(pos, std::move(iov), pc, intent); |
f67539c2 TL |
1212 | } catch (...) { |
1213 | return current_exception_as_future<size_t>(); | |
1214 | } | |
1215 | } | |
1216 | ||
1217 | future<size_t> | |
20effc67 | 1218 | file::dma_write_impl(uint64_t pos, const uint8_t* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept { |
f67539c2 | 1219 | try { |
20effc67 | 1220 | return _file_impl->write_dma(pos, buffer, len, pc, intent); |
f67539c2 TL |
1221 | } catch (...) { |
1222 | return current_exception_as_future<size_t>(); | |
1223 | } | |
1224 | } | |
1225 | ||
20effc67 | 1226 | future<size_t> file::dma_read(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept { |
f67539c2 | 1227 | try { |
20effc67 | 1228 | return _file_impl->read_dma(pos, std::move(iov), pc, intent); |
f67539c2 TL |
1229 | } catch (...) { |
1230 | return current_exception_as_future<size_t>(); | |
1231 | } | |
1232 | } | |
1233 | ||
1234 | future<temporary_buffer<uint8_t>> | |
20effc67 TL |
1235 | file::dma_read_exactly_impl(uint64_t pos, size_t len, const io_priority_class& pc, io_intent* intent) noexcept { |
1236 | return dma_read<uint8_t>(pos, len, pc, intent).then([len](auto buf) { | |
f67539c2 TL |
1237 | if (buf.size() < len) { |
1238 | throw eof_error(); | |
9f95a23c | 1239 | } |
f67539c2 TL |
1240 | |
1241 | return buf; | |
1242 | }); | |
9f95a23c TL |
1243 | } |
1244 | ||
f67539c2 | 1245 | future<temporary_buffer<uint8_t>> |
20effc67 TL |
1246 | file::dma_read_impl(uint64_t pos, size_t len, const io_priority_class& pc, io_intent* intent) noexcept { |
1247 | return dma_read_bulk<uint8_t>(pos, len, pc, intent).then([len](temporary_buffer<uint8_t> buf) { | |
f67539c2 TL |
1248 | if (len < buf.size()) { |
1249 | buf.trim(len); | |
1250 | } | |
1251 | ||
1252 | return buf; | |
1253 | }); | |
9f95a23c TL |
1254 | } |
1255 | ||
f67539c2 | 1256 | future<size_t> |
20effc67 | 1257 | file::dma_read_impl(uint64_t aligned_pos, uint8_t* aligned_buffer, size_t aligned_len, const io_priority_class& pc, io_intent* intent) noexcept { |
f67539c2 | 1258 | try { |
20effc67 | 1259 | return _file_impl->read_dma(aligned_pos, aligned_buffer, aligned_len, pc, intent); |
f67539c2 TL |
1260 | } catch (...) { |
1261 | return current_exception_as_future<size_t>(); | |
1262 | } | |
9f95a23c TL |
1263 | } |
1264 | ||
1265 | seastar::file_handle | |
1266 | file::dup() { | |
1267 | return seastar::file_handle(_file_impl->dup()); | |
1268 | } | |
1269 | ||
1270 | file_impl* file_impl::get_file_impl(file& f) { | |
1271 | return f._file_impl.get(); | |
1272 | } | |
1273 | ||
1274 | std::unique_ptr<seastar::file_handle_impl> | |
1275 | file_impl::dup() { | |
1276 | throw std::runtime_error("this file type cannot be duplicated"); | |
1277 | } | |
1278 | ||
20effc67 TL |
1279 | future<int> file_impl::ioctl(uint64_t cmd, void* argp) noexcept { |
1280 | return make_exception_future<int>(std::runtime_error("this file type does not support ioctl")); | |
1281 | } | |
1282 | ||
1283 | future<int> file_impl::ioctl_short(uint64_t cmd, void* argp) noexcept { | |
1284 | return make_exception_future<int>(std::runtime_error("this file type does not support ioctl_short")); | |
1285 | } | |
1286 | ||
1287 | future<int> file_impl::fcntl(int op, uintptr_t arg) noexcept { | |
1288 | return make_exception_future<int>(std::runtime_error("this file type does not support fcntl")); | |
1289 | } | |
1290 | ||
1291 | future<int> file_impl::fcntl_short(int op, uintptr_t arg) noexcept { | |
1292 | return make_exception_future<int>(std::runtime_error("this file type does not support fcntl_short")); | |
1293 | } | |
1294 | ||
f67539c2 TL |
1295 | future<file> open_file_dma(std::string_view name, open_flags flags) noexcept { |
1296 | return engine().open_file_dma(name, flags, file_open_options()); | |
1297 | } | |
1298 | ||
1299 | future<file> open_file_dma(std::string_view name, open_flags flags, file_open_options options) noexcept { | |
1300 | return engine().open_file_dma(name, flags, std::move(options)); | |
1301 | } | |
1302 | ||
1303 | future<file> open_directory(std::string_view name) noexcept { | |
1304 | return engine().open_directory(name); | |
1305 | } | |
1306 | ||
9f95a23c | 1307 | } |