]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/env/io_posix.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / env / io_posix.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #ifdef ROCKSDB_LIB_IO_POSIX
11 #include "env/io_posix.h"
12 #include <errno.h>
13 #include <fcntl.h>
14 #include <algorithm>
15 #if defined(OS_LINUX)
16 #include <linux/fs.h>
17 #ifndef FALLOC_FL_KEEP_SIZE
18 #include <linux/falloc.h>
19 #endif
20 #endif
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/ioctl.h>
25 #include <sys/mman.h>
26 #include <sys/stat.h>
27 #include <sys/types.h>
28 #ifdef OS_LINUX
29 #include <sys/statfs.h>
30 #include <sys/sysmacros.h>
31 #endif
32 #include "monitoring/iostats_context_imp.h"
33 #include "port/port.h"
34 #include "rocksdb/slice.h"
35 #include "test_util/sync_point.h"
36 #include "util/autovector.h"
37 #include "util/coding.h"
38 #include "util/string_util.h"
39
40 #if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
41 #define F_LINUX_SPECIFIC_BASE 1024
42 #define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
43 #endif
44
45 namespace ROCKSDB_NAMESPACE {
46
// Builds an error-message prefix of the form "context: file_name";
// the file part is omitted when no file name is known.
std::string IOErrorMsg(const std::string& context,
                       const std::string& file_name) {
  return file_name.empty() ? context : context + ": " + file_name;
}
54
// file_name can be left empty if it is unknown.
// Maps a POSIX errno value to the matching IOStatus subtype, with a
// human-readable message built from `context` and (optionally) `file_name`.
IOStatus IOError(const std::string& context, const std::string& file_name,
                 int err_number) {
  switch (err_number) {
    case ENOSPC: {
      // Out-of-space is marked retryable: the condition may clear once
      // space is freed, so callers can attempt the operation again.
      IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name),
                                     strerror(err_number));
      s.SetRetryable(true);
      return s;
    }
    case ESTALE:
      // Stale file handle (e.g. NFS): reported via a dedicated subcode.
      // Note: context/file_name are intentionally not included here.
      return IOStatus::IOError(IOStatus::kStaleFile);
    case ENOENT:
      return IOStatus::PathNotFound(IOErrorMsg(context, file_name),
                                    strerror(err_number));
    default:
      return IOStatus::IOError(IOErrorMsg(context, file_name),
                               strerror(err_number));
  }
}
75
76 // A wrapper for fadvise, if the platform doesn't support fadvise,
77 // it will simply return 0.
// Thin portability shim over posix_fadvise(). On platforms without
// fadvise support it is a no-op that reports success (returns 0).
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifdef OS_LINUX
  return posix_fadvise(fd, offset, len, advice);
#else
  // Silence unused-parameter warnings on the no-op path.
  (void)fd;
  (void)offset;
  (void)len;
  (void)advice;
  return 0;
#endif
}
89
90 namespace {
91
92 // On MacOS (and probably *BSD), the posix write and pwrite calls do not support
93 // buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
94 // cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
95 // the writes aligned.
96
// Writes `nbyte` bytes from `buf` to `fd`, retrying after EINTR and
// splitting the I/O into 1GB chunks: macOS (and probably *BSD) write()
// rejects buffers of 2^31 bytes or more, and the 1GB chunk size keeps
// writes aligned. Returns false on the first unrecoverable error.
bool PosixWrite(int fd, const char* buf, size_t nbyte) {
  constexpr size_t kMaxChunkBytes = 1UL << 30;  // 1GB per syscall

  const char* cursor = buf;
  size_t remaining = nbyte;

  while (remaining > 0) {
    const size_t chunk = std::min(remaining, kMaxChunkBytes);
    const ssize_t written = write(fd, cursor, chunk);
    if (written < 0) {
      if (errno == EINTR) {
        continue;  // interrupted by a signal before any data moved; retry
      }
      return false;
    }
    cursor += written;
    remaining -= written;
  }
  return true;
}
118
// pwrite() counterpart of PosixWrite: writes `nbyte` bytes from `buf`
// at file position `offset`, retrying after EINTR and chunking the I/O
// at 1GB (macOS/*BSD pwrite cannot take buffers >= 2^31 bytes; 1GB
// chunks also keep the writes aligned). Returns false on error.
bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
  constexpr size_t kMaxChunkBytes = 1UL << 30;  // 1GB per syscall

  const char* cursor = buf;
  size_t remaining = nbyte;

  while (remaining > 0) {
    const size_t chunk = std::min(remaining, kMaxChunkBytes);
    const ssize_t written = pwrite(fd, cursor, chunk, offset);
    if (written < 0) {
      if (errno == EINTR) {
        continue;  // interrupted by a signal; retry at the same offset
      }
      return false;
    }
    cursor += written;
    offset += written;
    remaining -= written;
  }

  return true;
}
142
#ifdef ROCKSDB_RANGESYNC_PRESENT

#if !defined(ZFS_SUPER_MAGIC)
// The magic number for ZFS was not exposed until recently. It should be fixed
// forever so we can just copy the magic number here.
#define ZFS_SUPER_MAGIC 0x2fc12fc1
#endif

// Returns true unless we positively know that `sync_file_range` misbehaves
// on the filesystem backing `fd`.
bool IsSyncFileRangeSupported(int fd) {
  // This function tracks and checks for cases where we know `sync_file_range`
  // definitely will not work properly despite passing the compile-time check
  // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks
  // fail in unexpected ways, we allow `sync_file_range` to be used. This way
  // should minimize risk of impacting existing use cases.
  struct statfs buf;
  int ret = fstatfs(fd, &buf);
  assert(ret == 0);
  if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) {
    // Testing on ZFS showed the writeback did not happen asynchronously when
    // `sync_file_range` was called, even though it returned success. Avoid it
    // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
    // even though this'll incur extra I/O for metadata.
    return false;
  }

  // Zero-length, zero-flags probe: a no-op call that only tests whether the
  // syscall exists on this platform.
  ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */);
  assert(!(ret == -1 && errno != ENOSYS));
  if (ret == -1 && errno == ENOSYS) {
    // `sync_file_range` is not implemented on all platforms even if
    // compile-time checks pass and a supported filesystem is in-use. For
    // example, using ext4 on WSL (Windows Subsystem for Linux),
    // `sync_file_range()` returns `ENOSYS`
    // ("Function not implemented").
    return false;
  }
  // None of the known cases matched, so allow `sync_file_range` use.
  return true;
}

#undef ZFS_SUPER_MAGIC

#endif  // ROCKSDB_RANGESYNC_PRESENT
185
186 } // anonymous namespace
187
188 /*
189 * DirectIOHelper
190 */
191 namespace {
192
193 bool IsSectorAligned(const size_t off, size_t sector_size) {
194 assert((sector_size & (sector_size - 1)) == 0);
195 return (off & (sector_size - 1)) == 0;
196 }
197
#ifndef NDEBUG
// True iff the pointer's address is a multiple of `sector_size`.
// Debug-only: used exclusively inside assert() checks.
bool IsSectorAligned(const void* ptr, size_t sector_size) {
  const uintptr_t addr = uintptr_t(ptr);
  return addr % sector_size == 0;
}
#endif
203 } // namespace
204
205 /*
206 * PosixSequentialFile
207 */
// A sequential file is backed either by a buffered stdio stream (`file`,
// used for non-direct reads) or by a raw descriptor (`fd`, used when
// options.use_direct_reads is set). The destructor closes whichever
// handle is in use.
PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
                                         int fd, size_t logical_block_size,
                                         const EnvOptions& options)
    : filename_(fname),
      file_(file),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(logical_block_size) {
  // Direct reads and mmap reads are mutually exclusive.
  assert(!options.use_direct_reads || !options.use_mmap_reads);
}
218
219 PosixSequentialFile::~PosixSequentialFile() {
220 if (!use_direct_io()) {
221 assert(file_);
222 fclose(file_);
223 } else {
224 assert(fd_);
225 close(fd_);
226 }
227 }
228
// Buffered (non-direct) sequential read of up to `n` bytes into `scratch`.
// On success, `*result` points into `scratch` with the bytes actually read;
// a short read at EOF is still OK (empty result once EOF is reached).
IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
                                   Slice* result, char* scratch,
                                   IODebugContext* /*dbg*/) {
  assert(result != nullptr && !use_direct_io());
  IOStatus s;
  size_t r = 0;
  do {
    // Retry when a signal interrupted the read before any data arrived
    // (fread returns 0 with the stream error flag set and errno == EINTR).
    clearerr(file_);
    r = fread_unlocked(scratch, 1, n, file_);
  } while (r == 0 && ferror(file_) && errno == EINTR);
  *result = Slice(scratch, r);
  if (r < n) {
    if (feof(file_)) {
      // We leave status as ok if we hit the end of the file
      // We also clear the error so that the reads can continue
      // if a new data is written to the file
      clearerr(file_);
    } else {
      // A partial read with an error: return a non-ok status
      s = IOError("While reading file sequentially", filename_, errno);
    }
  }
  return s;
}
253
// Direct-I/O positioned read: offset, length and buffer must all be
// aligned to the logical sector size. Loops pread() until `n` bytes are
// read, EOF is hit, or an error occurs.
IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
                                             const IOOptions& /*opts*/,
                                             Slice* result, char* scratch,
                                             IODebugContext* /*dbg*/) {
  assert(use_direct_io());
  assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));

  IOStatus s;
  ssize_t r = -1;
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
    if (r <= 0) {
      if (r == -1 && errno == EINTR) {
        // Interrupted by a signal before any data transferred; retry.
        continue;
      }
      // r == 0 means EOF; r == -1 is a real error (reported below).
      break;
    }
    ptr += r;
    offset += r;
    left -= r;
    if (!IsSectorAligned(r, GetRequiredBufferAlignment())) {
      // Bytes reads don't fill sectors. Should only happen at the end
      // of the file.
      break;
    }
  }
  if (r < 0) {
    // An error: return a non-ok status
    // NOTE: `offset` has been advanced past the successful reads, so the
    // message names the position of the failing pread, not the original
    // argument.
    s = IOError(
        "While pread " + ToString(n) + " bytes from offset " + ToString(offset),
        filename_, errno);
  }
  *result = Slice(scratch, (r < 0) ? 0 : n - left);
  return s;
}
293
294 IOStatus PosixSequentialFile::Skip(uint64_t n) {
295 if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
296 return IOError("While fseek to skip " + ToString(n) + " bytes", filename_,
297 errno);
298 }
299 return IOStatus::OK();
300 }
301
302 IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
303 #ifndef OS_LINUX
304 (void)offset;
305 (void)length;
306 return IOStatus::OK();
307 #else
308 if (!use_direct_io()) {
309 // free OS pages
310 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
311 if (ret != 0) {
312 return IOError("While fadvise NotNeeded offset " + ToString(offset) +
313 " len " + ToString(length),
314 filename_, errno);
315 }
316 }
317 return IOStatus::OK();
318 #endif
319 }
320
321 /*
322 * PosixRandomAccessFile
323 */
324 #if defined(OS_LINUX)
// Builds a unique file id from (device, inode, inode generation), each
// encoded as a varint64 into `id`. Returns the encoded length, or 0 when
// the id cannot be determined or `max_size` is too small.
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return 0;
  }

  // The inode generation ("version") distinguishes recycled inode numbers.
  long version = 0;
  result = ioctl(fd, FS_IOC_GETVERSION, &version);
  TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
  if (result == -1) {
    return 0;
  }
  uint64_t uversion = (uint64_t)version;

  char* rid = id;
  rid = EncodeVarint64(rid, buf.st_dev);
  rid = EncodeVarint64(rid, buf.st_ino);
  rid = EncodeVarint64(rid, uversion);
  assert(rid >= id);
  return static_cast<size_t>(rid - id);
}
351 #endif
352
353 #if defined(OS_MACOSX) || defined(OS_AIX)
// macOS/AIX variant: builds a unique file id from (device, inode, st_gen),
// each encoded as a varint64. Returns the encoded length, or 0 on failure
// or when `max_size` cannot hold three varint64s.
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return 0;
  }

  char* rid = id;
  rid = EncodeVarint64(rid, buf.st_dev);
  rid = EncodeVarint64(rid, buf.st_ino);
  // st_gen plays the role FS_IOC_GETVERSION plays on Linux.
  rid = EncodeVarint64(rid, buf.st_gen);
  assert(rid >= id);
  return static_cast<size_t>(rid - id);
}
372 #endif
373
374 #ifdef OS_LINUX
// Returns `path` with at most one trailing '/' removed. The root path
// "/" (and single-character paths in general) is left untouched.
std::string RemoveTrailingSlash(const std::string& path) {
  std::string normalized(path);
  const bool has_trailing_slash =
      normalized.size() > 1 && normalized.back() == '/';
  if (has_trailing_slash) {
    normalized.pop_back();
  }
  return normalized;
}
382
// Takes one cache reference per directory and caches each directory's
// logical block size. Only directories missing from the cache are queried
// from the filesystem, and the query runs outside any lock.
Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize(
    const std::vector<std::string>& directories) {
  std::vector<std::string> dirs;
  dirs.reserve(directories.size());
  for (auto& d : directories) {
    dirs.emplace_back(RemoveTrailingSlash(d));
  }

  // Under a read lock, collect the directories that are not cached yet.
  std::map<std::string, size_t> dir_sizes;
  {
    ReadLock lock(&cache_mutex_);
    for (const auto& dir : dirs) {
      if (cache_.find(dir) == cache_.end()) {
        dir_sizes.emplace(dir, 0);
      }
    }
  }

  // Query block sizes lock-free; this may perform filesystem I/O.
  Status s;
  for (auto& dir_size : dir_sizes) {
    s = get_logical_block_size_of_directory_(dir_size.first, &dir_size.second);
    if (!s.ok()) {
      return s;
    }
  }

  // Publish the results and bump the reference count for every directory
  // (including ones that were already cached).
  WriteLock lock(&cache_mutex_);
  for (const auto& dir : dirs) {
    auto& v = cache_[dir];
    v.ref++;
    auto dir_size = dir_sizes.find(dir);
    if (dir_size != dir_sizes.end()) {
      v.size = dir_size->second;
    }
  }
  return s;
}
420
421 void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize(
422 const std::vector<std::string>& directories) {
423 std::vector<std::string> dirs;
424 dirs.reserve(directories.size());
425 for (auto& dir : directories) {
426 dirs.emplace_back(RemoveTrailingSlash(dir));
427 }
428
429 WriteLock lock(&cache_mutex_);
430 for (const auto& dir : dirs) {
431 auto it = cache_.find(dir);
432 if (it != cache_.end() && !(--(it->second.ref))) {
433 cache_.erase(it);
434 }
435 }
436 }
437
// Returns the logical block size for `fname`, keyed by its containing
// directory. Falls back to querying the fd directly on a cache miss
// (the miss result is not inserted into the cache here).
size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
                                                  int fd) {
  // Use the containing directory as the cache key.
  std::string dir = fname.substr(0, fname.find_last_of("/"));
  if (dir.empty()) {
    dir = "/";
  }
  {
    ReadLock lock(&cache_mutex_);
    auto it = cache_.find(dir);
    if (it != cache_.end()) {
      return it->second.size;
    }
  }
  return get_logical_block_size_of_fd_(fd);
}
453 #endif
454
455 Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
456 size_t* size) {
457 int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
458 if (fd == -1) {
459 close(fd);
460 return Status::IOError("Cannot open directory " + directory);
461 }
462 *size = PosixHelper::GetLogicalBlockSizeOfFd(fd);
463 close(fd);
464 return Status::OK();
465 }
466
// Determines the logical block size of the block device backing `fd` by
// reading /sys/dev/block/<major>:<minor>/.../queue/logical_block_size.
// Returns kDefaultPageSize whenever any step fails or the value read is
// not a positive power of two. Non-Linux builds always return the default.
size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
#ifdef OS_LINUX
  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return kDefaultPageSize;
  }
  if (major(buf.st_dev) == 0) {
    // Unnamed devices (e.g. non-device mounts), reserved as null device number.
    // These don't have an entry in /sys/dev/block/. Return a sensible default.
    return kDefaultPageSize;
  }

  // Reading queue/logical_block_size does not require special permissions.
  const int kBufferSize = 100;
  char path[kBufferSize];
  char real_path[PATH_MAX + 1];
  snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
           minor(buf.st_dev));
  // Resolve the sysfs symlink to the device's canonical directory.
  if (realpath(path, real_path) == nullptr) {
    return kDefaultPageSize;
  }
  std::string device_dir(real_path);
  if (!device_dir.empty() && device_dir.back() == '/') {
    device_dir.pop_back();
  }
  // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
  // and nvme0n1 have it.
  // $ ls -al '/sys/dev/block/8:3'
  // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
  // ../../block/sda/sda3
  // $ ls -al '/sys/dev/block/259:4'
  // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
  // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
  size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
  if (parent_end == std::string::npos) {
    return kDefaultPageSize;
  }
  size_t parent_begin = device_dir.rfind('/', parent_end - 1);
  if (parent_begin == std::string::npos) {
    return kDefaultPageSize;
  }
  std::string parent =
      device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
  std::string child = device_dir.substr(parent_end + 1, std::string::npos);
  // Step up to the parent device directory when the leaf looks like a
  // partition (a non-"nvme"-prefixed name, or an nvme name containing 'p').
  if (parent != "block" &&
      (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
    device_dir = device_dir.substr(0, parent_end);
  }
  std::string fname = device_dir + "/queue/logical_block_size";
  FILE* fp;
  size_t size = 0;
  fp = fopen(fname.c_str(), "r");
  if (fp != nullptr) {
    char* line = nullptr;
    size_t len = 0;
    if (getline(&line, &len, fp) != -1) {
      sscanf(line, "%zu", &size);
    }
    free(line);
    fclose(fp);
  }
  // Only trust a non-zero power-of-two result.
  if (size != 0 && (size & (size - 1)) == 0) {
    return size;
  }
#endif
  (void)fd;
  return kDefaultPageSize;
}
536
537 /*
538 * PosixRandomAccessFile
539 *
540 * pread() based random-access
541 */
// Owns `fd` (closed in the destructor). When io_uring support is compiled
// in, `thread_local_io_urings` supplies a lazily-created per-thread ring
// used by MultiRead.
PosixRandomAccessFile::PosixRandomAccessFile(
    const std::string& fname, int fd, size_t logical_block_size,
    const EnvOptions& options
#if defined(ROCKSDB_IOURING_PRESENT)
    ,
    ThreadLocalPtr* thread_local_io_urings
#endif
    )
    : filename_(fname),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(logical_block_size)
#if defined(ROCKSDB_IOURING_PRESENT)
      ,
      thread_local_io_urings_(thread_local_io_urings)
#endif
{
  // Direct reads and mmap reads are mutually exclusive.
  assert(!options.use_direct_reads || !options.use_mmap_reads);
  // mmap-based random reads are only expected on 32-bit address spaces.
  assert(!options.use_mmap_reads || sizeof(void*) < 8);
}
562
// Closes the owned file descriptor.
PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
564
// Random-access read of `n` bytes at `offset` into `scratch` via pread().
// Loops until the request is satisfied, EOF is reached, or a non-EINTR
// error occurs. Under direct I/O all of offset/length/buffer must be
// sector-aligned.
IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
                                     const IOOptions& /*opts*/, Slice* result,
                                     char* scratch,
                                     IODebugContext* /*dbg*/) const {
  if (use_direct_io()) {
    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
  }
  IOStatus s;
  ssize_t r = -1;
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
    if (r <= 0) {
      if (r == -1 && errno == EINTR) {
        // Interrupted by a signal before any data transferred; retry.
        continue;
      }
      // r == 0 means EOF; r == -1 is a real error (reported below).
      break;
    }
    ptr += r;
    offset += r;
    left -= r;
    if (use_direct_io() &&
        r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
      // Bytes reads don't fill sectors. Should only happen at the end
      // of the file.
      break;
    }
  }
  if (r < 0) {
    // An error: return a non-ok status
    // NOTE: `offset` has been advanced past the successful reads, so it
    // names the failing position rather than the original argument.
    s = IOError(
        "While pread offset " + ToString(offset) + " len " + ToString(n),
        filename_, errno);
  }
  *result = Slice(scratch, (r < 0) ? 0 : n - left);
  return s;
}
605
// Serves a batch of read requests. When io_uring is available, requests
// are submitted in batches of up to kIoUringDepth; short reads are
// resubmitted for their remaining tail in later batches. Without io_uring
// (or if ring creation fails) it falls back to the base class, which
// issues the reads serially.
IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
                                          size_t num_reqs,
                                          const IOOptions& options,
                                          IODebugContext* dbg) {
  if (use_direct_io()) {
    // Direct I/O requires every request to be fully sector-aligned.
    for (size_t i = 0; i < num_reqs; i++) {
      assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment()));
      assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment()));
      assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment()));
    }
  }

#if defined(ROCKSDB_IOURING_PRESENT)
  // Lazily create one io_uring instance per thread and cache it.
  struct io_uring* iu = nullptr;
  if (thread_local_io_urings_) {
    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
    if (iu == nullptr) {
      iu = CreateIOUring();
      if (iu != nullptr) {
        thread_local_io_urings_->Reset(iu);
      }
    }
  }

  // Init failed, platform doesn't support io_uring. Fall back to
  // serialized reads
  if (iu == nullptr) {
    return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
  }

  // Pairs each request with its iovec and a count of bytes completed so
  // far, so partially-served requests can be resubmitted for their tail.
  struct WrappedReadRequest {
    FSReadRequest* req;
    struct iovec iov;
    size_t finished_len;
    explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
  };

  autovector<WrappedReadRequest, 32> req_wraps;
  autovector<WrappedReadRequest*, 4> incomplete_rq_list;

  for (size_t i = 0; i < num_reqs; i++) {
    req_wraps.emplace_back(&reqs[i]);
  }

  size_t reqs_off = 0;
  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();

    // If requests exceed depth, split it into batches
    if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth;

    // Previously-incomplete requests go ahead of not-yet-submitted ones.
    assert(incomplete_rq_list.size() <= this_reqs);
    for (size_t i = 0; i < this_reqs; i++) {
      WrappedReadRequest* rep_to_submit;
      if (i < incomplete_rq_list.size()) {
        rep_to_submit = incomplete_rq_list[i];
      } else {
        rep_to_submit = &req_wraps[reqs_off++];
      }
      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
      // Only read the remaining (unfinished) tail of the request.
      rep_to_submit->iov.iov_base =
          rep_to_submit->req->scratch + rep_to_submit->finished_len;
      rep_to_submit->iov.iov_len =
          rep_to_submit->req->len - rep_to_submit->finished_len;

      struct io_uring_sqe* sqe;
      sqe = io_uring_get_sqe(iu);
      io_uring_prep_readv(
          sqe, fd_, &rep_to_submit->iov, 1,
          rep_to_submit->req->offset + rep_to_submit->finished_len);
      io_uring_sqe_set_data(sqe, rep_to_submit);
    }
    incomplete_rq_list.clear();

    ssize_t ret =
        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
    if (static_cast<size_t>(ret) != this_reqs) {
      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
    }
    assert(static_cast<size_t>(ret) == this_reqs);

    for (size_t i = 0; i < this_reqs; i++) {
      struct io_uring_cqe* cqe;
      WrappedReadRequest* req_wrap;

      // We could use the peek variant here, but this seems safer in terms
      // of our initial wait not reaping all completions
      ret = io_uring_wait_cqe(iu, &cqe);
      assert(!ret);

      req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
      FSReadRequest* req = req_wrap->req;
      if (cqe->res < 0) {
        // Negative res is a negated errno from the kernel.
        req->result = Slice(req->scratch, 0);
        req->status = IOError("Req failed", filename_, cqe->res);
      } else {
        size_t bytes_read = static_cast<size_t>(cqe->res);
        TEST_SYNC_POINT_CALLBACK(
            "PosixRandomAccessFile::MultiRead:io_uring_result", &bytes_read);
        if (bytes_read == req_wrap->iov.iov_len) {
          // Fully satisfied.
          req->result = Slice(req->scratch, req->len);
          req->status = IOStatus::OK();
        } else if (bytes_read == 0) {
          // cqe->res == 0 can means EOF, or can mean partial results. See
          // comment
          // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435
          // Fall back to pread in this case.
          if (use_direct_io() &&
              !IsSectorAligned(req_wrap->finished_len,
                               GetRequiredBufferAlignment())) {
            // Bytes reads don't fill sectors. Should only happen at the end
            // of the file.
            req->result = Slice(req->scratch, req_wrap->finished_len);
            req->status = IOStatus::OK();
          } else {
            Slice tmp_slice;
            req->status =
                Read(req->offset + req_wrap->finished_len,
                     req->len - req_wrap->finished_len, options, &tmp_slice,
                     req->scratch + req_wrap->finished_len, dbg);
            req->result =
                Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
          }
        } else if (bytes_read < req_wrap->iov.iov_len) {
          // Short read: record the progress and resubmit the remainder in
          // a later batch.
          assert(bytes_read > 0);
          assert(bytes_read + req_wrap->finished_len < req->len);
          req_wrap->finished_len += bytes_read;
          incomplete_rq_list.push_back(req_wrap);
        } else {
          req->result = Slice(req->scratch, 0);
          req->status = IOError("Req returned more bytes than requested",
                                filename_, cqe->res);
        }
      }
      io_uring_cqe_seen(iu, cqe);
    }
  }
  return IOStatus::OK();
#else
  return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
#endif
}
748
// Hints the OS to populate the page cache for [offset, offset + n).
// A no-op under direct I/O (page cache is bypassed) and on platforms
// with neither readahead() nor F_RDADVISE.
IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n,
                                         const IOOptions& /*opts*/,
                                         IODebugContext* /*dbg*/) {
  IOStatus s;
  if (!use_direct_io()) {
    ssize_t r = 0;
#ifdef OS_LINUX
    r = readahead(fd_, offset, n);
#endif
#ifdef OS_MACOSX
    // macOS equivalent: issue an F_RDADVISE read advisory.
    radvisory advice;
    advice.ra_offset = static_cast<off_t>(offset);
    advice.ra_count = static_cast<int>(n);
    r = fcntl(fd_, F_RDADVISE, &advice);
#endif
    if (r == -1) {
      s = IOError("While prefetching offset " + ToString(offset) + " len " +
                      ToString(n),
                  filename_, errno);
    }
  }
  return s;
}
772
773 #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
// Delegates to the platform-specific (device, inode, generation) encoder.
size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
}
777 #endif
778
// Translates an access-pattern hint into the corresponding fadvise call
// over the whole file. Ignored under direct I/O, where the page cache
// (the target of the advice) is bypassed.
void PosixRandomAccessFile::Hint(AccessPattern pattern) {
  if (use_direct_io()) {
    return;
  }
  switch (pattern) {
    case kNormal:
      Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
      break;
    case kRandom:
      Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
      break;
    case kSequential:
      Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
      break;
    case kWillNeed:
      Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
      break;
    case kWontNeed:
      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
      break;
    default:
      // Unknown pattern values are a programming error.
      assert(false);
      break;
  }
}
804
805 IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
806 if (use_direct_io()) {
807 return IOStatus::OK();
808 }
809 #ifndef OS_LINUX
810 (void)offset;
811 (void)length;
812 return IOStatus::OK();
813 #else
814 // free OS pages
815 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
816 if (ret == 0) {
817 return IOStatus::OK();
818 }
819 return IOError("While fadvise NotNeeded offset " + ToString(offset) +
820 " len " + ToString(length),
821 filename_, errno);
822 #endif
823 }
824
825 /*
826 * PosixMmapReadableFile
827 *
828 * mmap() based random-access
829 */
830 // base[0,length-1] contains the mmapped contents of the file.
// Takes ownership of both `fd` and the mapped region [base, base+length);
// the destructor munmaps the region and closes the descriptor.
PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
                                             const std::string& fname,
                                             void* base, size_t length,
                                             const EnvOptions& options)
    : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
#ifdef NDEBUG
  // `options` is only consulted by the asserts below.
  (void)options;
#endif
  fd_ = fd_ + 0;  // suppress the warning for used variables
  assert(options.use_mmap_reads);
  assert(!options.use_direct_reads);
}
843
// Releases the mapping, then the descriptor. A munmap failure cannot be
// reported from a destructor, so it is only logged.
PosixMmapReadableFile::~PosixMmapReadableFile() {
  int ret = munmap(mmapped_region_, length_);
  if (ret != 0) {
    // NOTE(review): this diagnostic goes to stdout, not stderr — confirm
    // whether that is intentional before changing it.
    fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
            mmapped_region_, length_);
  }
  close(fd_);
}
852
853 IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
854 const IOOptions& /*opts*/, Slice* result,
855 char* /*scratch*/,
856 IODebugContext* /*dbg*/) const {
857 IOStatus s;
858 if (offset > length_) {
859 *result = Slice();
860 return IOError("While mmap read offset " + ToString(offset) +
861 " larger than file length " + ToString(length_),
862 filename_, EINVAL);
863 } else if (offset + n > length_) {
864 n = static_cast<size_t>(length_ - offset);
865 }
866 *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
867 return s;
868 }
869
870 IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
871 #ifndef OS_LINUX
872 (void)offset;
873 (void)length;
874 return IOStatus::OK();
875 #else
876 // free OS pages
877 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
878 if (ret == 0) {
879 return IOStatus::OK();
880 }
881 return IOError("While fadvise not needed. Offset " + ToString(offset) +
882 " len" + ToString(length),
883 filename_, errno);
884 #endif
885 }
886
887 /*
888 * PosixMmapFile
889 *
890 * We preallocate up to an extra megabyte and use memcpy to append new
891 * data to the file. This is safe since we either properly close the
892 * file before reading from it, or for log files, the reading code
893 * knows enough to skip zero suffixes.
894 */
895 IOStatus PosixMmapFile::UnmapCurrentRegion() {
896 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
897 if (base_ != nullptr) {
898 int munmap_status = munmap(base_, limit_ - base_);
899 if (munmap_status != 0) {
900 return IOError("While munmap", filename_, munmap_status);
901 }
902 file_offset_ += limit_ - base_;
903 base_ = nullptr;
904 limit_ = nullptr;
905 last_sync_ = nullptr;
906 dst_ = nullptr;
907
908 // Increase the amount we map the next time, but capped at 1MB
909 if (map_size_ < (1 << 20)) {
910 map_size_ *= 2;
911 }
912 }
913 return IOStatus::OK();
914 }
915
916 IOStatus PosixMmapFile::MapNewRegion() {
917 #ifdef ROCKSDB_FALLOCATE_PRESENT
918 assert(base_ == nullptr);
919 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
920 // we can't fallocate with FALLOC_FL_KEEP_SIZE here
921 if (allow_fallocate_) {
922 IOSTATS_TIMER_GUARD(allocate_nanos);
923 int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
924 if (alloc_status != 0) {
925 // fallback to posix_fallocate
926 alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
927 }
928 if (alloc_status != 0) {
929 return IOStatus::IOError("Error allocating space to file : " + filename_ +
930 "Error : " + strerror(alloc_status));
931 }
932 }
933
934 TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds);
935 void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
936 file_offset_);
937 if (ptr == MAP_FAILED) {
938 return IOStatus::IOError("MMap failed on " + filename_);
939 }
940 TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds);
941
942 base_ = reinterpret_cast<char*>(ptr);
943 limit_ = base_ + map_size_;
944 dst_ = base_;
945 last_sync_ = base_;
946 return IOStatus::OK();
947 #else
948 return IOStatus::NotSupported("This platform doesn't support fallocate()");
949 #endif
950 }
951
// msync()s the dirty range of the current mapping, i.e. the whole pages
// covering [last_sync_, dst_). Cheap no-op when nothing was appended
// since the last sync.
IOStatus PosixMmapFile::Msync() {
  if (dst_ == last_sync_) {
    return IOStatus::OK();
  }
  // Find the beginnings of the pages that contain the first and last
  // bytes to be synced.
  size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
  size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
  last_sync_ = dst_;
  TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);
  // Sync whole pages: the range [p1, p2 + page_size_).
  if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
    return IOError("While msync", filename_, errno);
  }
  return IOStatus::OK();
}
967
// mmap-based writable file. No region is mapped yet; the first Append
// triggers MapNewRegion.
PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                             const EnvOptions& options)
    : filename_(fname),
      fd_(fd),
      page_size_(page_size),
      // Initial mapping of 64KB rounded up to the page size; doubled on
      // each remap up to 1MB (see UnmapCurrentRegion).
      map_size_(Roundup(65536, page_size)),
      base_(nullptr),
      limit_(nullptr),
      dst_(nullptr),
      last_sync_(nullptr),
      file_offset_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#else
  (void)options;
#endif
  // Page size must be a power of two (required by the page-boundary math
  // in Msync).
  assert((page_size & (page_size - 1)) == 0);
  assert(options.use_mmap_writes);
  assert(!options.use_direct_writes);
}
989
// Closes the file if the caller has not already done so; any close error
// is intentionally ignored since destructors cannot report failures.
PosixMmapFile::~PosixMmapFile() {
  if (fd_ >= 0) {
    IOStatus s = PosixMmapFile::Close(IOOptions(), nullptr);
    s.PermitUncheckedError();
  }
}
996
997 IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/,
998 IODebugContext* /*dbg*/) {
999 const char* src = data.data();
1000 size_t left = data.size();
1001 while (left > 0) {
1002 assert(base_ <= dst_);
1003 assert(dst_ <= limit_);
1004 size_t avail = limit_ - dst_;
1005 if (avail == 0) {
1006 IOStatus s = UnmapCurrentRegion();
1007 if (!s.ok()) {
1008 return s;
1009 }
1010 s = MapNewRegion();
1011 if (!s.ok()) {
1012 return s;
1013 }
1014 TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds);
1015 }
1016
1017 size_t n = (left <= avail) ? left : avail;
1018 assert(dst_);
1019 memcpy(dst_, src, n);
1020 dst_ += n;
1021 src += n;
1022 left -= n;
1023 }
1024 return IOStatus::OK();
1025 }
1026
// Unmaps the current region, trims the file back to the bytes actually
// written (the mapping is pre-extended in map_size_ steps), and closes
// the descriptor. The first error encountered is the one reported.
IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  IOStatus s;
  // Bytes mapped but never written; the file must shrink by this amount.
  size_t unused = limit_ - dst_;

  s = UnmapCurrentRegion();
  if (!s.ok()) {
    s = IOError("While closing mmapped file", filename_, errno);
  } else if (unused > 0) {
    // Trim the extra space at the end of the file
    if (ftruncate(fd_, file_offset_ - unused) < 0) {
      s = IOError("While ftruncating mmaped file", filename_, errno);
    }
  }

  if (close(fd_) < 0) {
    // Do not overwrite an earlier unmap/truncate error.
    if (s.ok()) {
      s = IOError("While closing mmapped file", filename_, errno);
    }
  }

  fd_ = -1;
  base_ = nullptr;
  limit_ = nullptr;
  return s;
}
1053
// Appends write straight into the mapped region, so there is no
// user-space buffer to flush.
IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
1058
// Flushes file data (fdatasync skips non-essential metadata), then msyncs
// the dirty pages of the current mapping.
IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
                             IODebugContext* /*dbg*/) {
  if (fdatasync(fd_) < 0) {
    return IOError("While fdatasync mmapped file", filename_, errno);
  }

  return Msync();
}
1067
1068 /**
1069 * Flush data as well as metadata to stable storage.
1070 */
// Like Sync(), but uses fsync so file metadata is flushed as well, then
// msyncs the dirty pages of the current mapping.
IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  if (fsync(fd_) < 0) {
    return IOError("While fsync mmaped file", filename_, errno);
  }

  return Msync();
}
1079
1080 /**
1081 * Get the size of valid data in the file. This will not match the
1082 * size that is returned from the filesystem because we use mmap
1083 * to extend file by map_size every time.
1084 */
1085 uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/,
1086 IODebugContext* /*dbg*/) {
1087 size_t used = dst_ - base_;
1088 return file_offset_ + used;
1089 }
1090
1091 IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
1092 #ifndef OS_LINUX
1093 (void)offset;
1094 (void)length;
1095 return IOStatus::OK();
1096 #else
1097 // free OS pages
1098 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
1099 if (ret == 0) {
1100 return IOStatus::OK();
1101 }
1102 return IOError("While fadvise NotNeeded mmapped file", filename_, errno);
1103 #endif
1104 }
1105
#ifdef ROCKSDB_FALLOCATE_PRESENT
IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
                                 const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
  // Preallocation can be disabled via options; that counts as success.
  if (!allow_fallocate_) {
    return IOStatus::OK();
  }
  const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
  if (fallocate(fd_, mode, static_cast<off_t>(offset),
                static_cast<off_t>(len)) == 0) {
    return IOStatus::OK();
  }
  return IOError(
      "While fallocate offset " + ToString(offset) + " len " + ToString(len),
      filename_, errno);
}
#endif
1128
1129 /*
1130 * PosixWritableFile
1131 *
1132 * Use posix write to write data to a file.
1133 */
1134 PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
1135 size_t logical_block_size,
1136 const EnvOptions& options)
1137 : FSWritableFile(options),
1138 filename_(fname),
1139 use_direct_io_(options.use_direct_writes),
1140 fd_(fd),
1141 filesize_(0),
1142 logical_sector_size_(logical_block_size) {
1143 #ifdef ROCKSDB_FALLOCATE_PRESENT
1144 allow_fallocate_ = options.allow_fallocate;
1145 fallocate_with_keep_size_ = options.fallocate_with_keep_size;
1146 #endif
1147 #ifdef ROCKSDB_RANGESYNC_PRESENT
1148 sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
1149 #endif // ROCKSDB_RANGESYNC_PRESENT
1150 assert(!options.use_mmap_writes);
1151 }
1152
PosixWritableFile::~PosixWritableFile() {
  // Best-effort close when the caller never invoked Close(); destructors
  // cannot report failure, so the status is explicitly ignored.
  if (fd_ >= 0) {
    IOStatus s = PosixWritableFile::Close(IOOptions(), nullptr);
    s.PermitUncheckedError();
  }
}
1159
1160 IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/,
1161 IODebugContext* /*dbg*/) {
1162 if (use_direct_io()) {
1163 assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
1164 assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
1165 }
1166 const char* src = data.data();
1167 size_t nbytes = data.size();
1168
1169 if (!PosixWrite(fd_, src, nbytes)) {
1170 return IOError("While appending to file", filename_, errno);
1171 }
1172
1173 filesize_ += nbytes;
1174 return IOStatus::OK();
1175 }
1176
1177 IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
1178 const IOOptions& /*opts*/,
1179 IODebugContext* /*dbg*/) {
1180 if (use_direct_io()) {
1181 assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
1182 assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
1183 assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
1184 }
1185 assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
1186 const char* src = data.data();
1187 size_t nbytes = data.size();
1188 if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
1189 return IOError("While pwrite to file at offset " + ToString(offset),
1190 filename_, errno);
1191 }
1192 filesize_ = offset + nbytes;
1193 return IOStatus::OK();
1194 }
1195
1196 IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
1197 IODebugContext* /*dbg*/) {
1198 IOStatus s;
1199 int r = ftruncate(fd_, size);
1200 if (r < 0) {
1201 s = IOError("While ftruncate file to size " + ToString(size), filename_,
1202 errno);
1203 } else {
1204 filesize_ = size;
1205 }
1206 return s;
1207 }
1208
IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  IOStatus s;

  // If space was preallocated beyond what was actually written, trim the
  // file back to its logical size before closing.
  size_t block_size;
  size_t last_allocated_block;
  GetPreallocationStatus(&block_size, &last_allocated_block);
  if (last_allocated_block > 0) {
    // trim the extra space preallocated at the end of the file
    // NOTE(ljin): we probably don't want to surface failure as an IOError,
    // but it will be nice to log these errors.
    int dummy __attribute__((__unused__));
    dummy = ftruncate(fd_, filesize_);
#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE) && \
    !defined(TRAVIS)
    // in some file systems, ftruncate only trims trailing space if the
    // new file size is smaller than the current size. Calling fallocate
    // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
    // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
    // filesystems:
    //   XFS (since Linux 2.6.38)
    //   ext4 (since Linux 3.0)
    //   Btrfs (since Linux 3.7)
    //   tmpfs (since Linux 3.5)
    // We ignore error since failure of this operation does not affect
    // correctness.
    // TRAVIS - this code does not work on TRAVIS filesystems.
    // the FALLOC_FL_KEEP_SIZE option is expected to not change the size
    // of the file, but it does. Simple strace report will show that.
    // While we work with Travis-CI team to figure out if this is a
    // quirk of Docker/AUFS, we will comment this out.
    struct stat file_stats;
    int result = fstat(fd_, &file_stats);
    // After ftruncate, we check whether ftruncate has the correct behavior.
    // If not, we should hack it with FALLOC_FL_PUNCH_HOLE
    // Heuristic: compare the file size rounded up to whole blocks against
    // the blocks actually allocated (st_blocks is in 512-byte units per
    // POSIX); a mismatch means allocated space still extends past EOF.
    if (result == 0 &&
        (file_stats.st_size + file_stats.st_blksize - 1) /
                file_stats.st_blksize !=
            file_stats.st_blocks / (file_stats.st_blksize / 512)) {
      IOSTATS_TIMER_GUARD(allocate_nanos);
      if (allow_fallocate_) {
        // Release the preallocated-but-unwritten tail without changing the
        // file size; errors are intentionally ignored (see note above).
        fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
                  block_size * last_allocated_block - filesize_);
      }
    }
#endif
  }

  if (close(fd_) < 0) {
    s = IOError("While closing file after writing", filename_, errno);
  }
  fd_ = -1;
  return s;
}
1263
// write out the cached data to the OS cache
IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  // No-op: Append()/PositionedAppend() hand data straight to the kernel,
  // so there is no user-space buffer to flush.
  return IOStatus::OK();
}
1269
1270 IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/,
1271 IODebugContext* /*dbg*/) {
1272 if (fdatasync(fd_) < 0) {
1273 return IOError("While fdatasync", filename_, errno);
1274 }
1275 return IOStatus::OK();
1276 }
1277
1278 IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/,
1279 IODebugContext* /*dbg*/) {
1280 if (fsync(fd_) < 0) {
1281 return IOError("While fsync", filename_, errno);
1282 }
1283 return IOStatus::OK();
1284 }
1285
1286 bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
1287
// Returns the logical size tracked by Append/PositionedAppend/Truncate,
// not the on-disk size (which may include preallocated space).
uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/,
                                        IODebugContext* /*dbg*/) {
  return filesize_;
}
1292
// Passes a write-lifetime hint to the kernel (Linux F_SET_RW_HINT) so the
// storage stack can group data with similar lifetimes; a no-op elsewhere.
void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
#ifdef OS_LINUX
  // Suppress Valgrind "Unimplemented functionality" error.
#ifndef ROCKSDB_VALGRIND_RUN
  // Skip the syscall if the hint is already in effect.
  if (hint == write_hint_) {
    return;
  }
  // NOTE(review): fcntl(F_SET_RW_HINT) reads a 64-bit value, while `hint`
  // is an enum whose width is implementation-defined — confirm the pointer
  // passed here is wide enough on all targets.
  if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) {
    // Remember the hint only if the kernel accepted it.
    write_hint_ = hint;
  }
#else
  (void)hint;
#endif  // ROCKSDB_VALGRIND_RUN
#else
  (void)hint;
#endif  // OS_LINUX
}
1310
1311 IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
1312 if (use_direct_io()) {
1313 return IOStatus::OK();
1314 }
1315 #ifndef OS_LINUX
1316 (void)offset;
1317 (void)length;
1318 return IOStatus::OK();
1319 #else
1320 // free OS pages
1321 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
1322 if (ret == 0) {
1323 return IOStatus::OK();
1324 }
1325 return IOError("While fadvise NotNeeded", filename_, errno);
1326 #endif
1327 }
1328
#ifdef ROCKSDB_FALLOCATE_PRESENT
IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
                                     const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
  IOSTATS_TIMER_GUARD(allocate_nanos);
  // Preallocation can be disabled via options; that counts as success.
  if (!allow_fallocate_) {
    return IOStatus::OK();
  }
  const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
  if (fallocate(fd_, mode, static_cast<off_t>(offset),
                static_cast<off_t>(len)) == 0) {
    return IOStatus::OK();
  }
  return IOError(
      "While fallocate offset " + ToString(offset) + " len " + ToString(len),
      filename_, errno);
}
#endif
1352
// Initiate writeback of the byte range [offset, offset + nbytes) without
// blocking on a full fsync; falls back to the generic implementation when
// sync_file_range() is unavailable or unsupported by the filesystem.
IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
                                      const IOOptions& opts,
                                      IODebugContext* dbg) {
#ifdef ROCKSDB_RANGESYNC_PRESENT
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  if (sync_file_range_supported_) {
    int ret;
    if (strict_bytes_per_sync_) {
      // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
      // that spans all bytes written so far tells `sync_file_range` to wait for
      // any outstanding writeback requests to finish before issuing a new one.
      ret =
          sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
                          SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
    } else {
      // Non-strict mode: just kick off writeback for the requested range.
      ret = sync_file_range(fd_, static_cast<off_t>(offset),
                            static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
    }
    if (ret != 0) {
      return IOError("While sync_file_range returned " + ToString(ret),
                     filename_, errno);
    }
    return IOStatus::OK();
  }
#endif  // ROCKSDB_RANGESYNC_PRESENT
  // Fall back to the base-class behavior when sync_file_range is not usable.
  return FSWritableFile::RangeSync(offset, nbytes, opts, dbg);
}
1381
#ifdef OS_LINUX
// Delegates unique-id generation for this file to
// PosixHelper::GetUniqueIdFromFile; returns the id length written.
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
}
#endif
1387
1388 /*
1389 * PosixRandomRWFile
1390 */
1391
1392 PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
1393 const EnvOptions& /*options*/)
1394 : filename_(fname), fd_(fd) {}
1395
PosixRandomRWFile::~PosixRandomRWFile() {
  // Best-effort close when the caller never invoked Close(); destructors
  // cannot report failure, so the status is explicitly ignored.
  if (fd_ >= 0) {
    IOStatus s = Close(IOOptions(), nullptr);
    s.PermitUncheckedError();
  }
}
1402
1403 IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data,
1404 const IOOptions& /*opts*/,
1405 IODebugContext* /*dbg*/) {
1406 const char* src = data.data();
1407 size_t nbytes = data.size();
1408 if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
1409 return IOError(
1410 "While write random read/write file at offset " + ToString(offset),
1411 filename_, errno);
1412 }
1413
1414 return IOStatus::OK();
1415 }
1416
1417 IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n,
1418 const IOOptions& /*opts*/, Slice* result,
1419 char* scratch, IODebugContext* /*dbg*/) const {
1420 size_t left = n;
1421 char* ptr = scratch;
1422 while (left > 0) {
1423 ssize_t done = pread(fd_, ptr, left, offset);
1424 if (done < 0) {
1425 // error while reading from file
1426 if (errno == EINTR) {
1427 // read was interrupted, try again.
1428 continue;
1429 }
1430 return IOError("While reading random read/write file offset " +
1431 ToString(offset) + " len " + ToString(n),
1432 filename_, errno);
1433 } else if (done == 0) {
1434 // Nothing more to read
1435 break;
1436 }
1437
1438 // Read `done` bytes
1439 ptr += done;
1440 offset += done;
1441 left -= done;
1442 }
1443
1444 *result = Slice(scratch, n - left);
1445 return IOStatus::OK();
1446 }
1447
IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  // No-op: there is no user-space buffering; writes go directly to the fd.
  return IOStatus::OK();
}
1452
1453 IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
1454 IODebugContext* /*dbg*/) {
1455 if (fdatasync(fd_) < 0) {
1456 return IOError("While fdatasync random read/write file", filename_, errno);
1457 }
1458 return IOStatus::OK();
1459 }
1460
1461 IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
1462 IODebugContext* /*dbg*/) {
1463 if (fsync(fd_) < 0) {
1464 return IOError("While fsync random read/write file", filename_, errno);
1465 }
1466 return IOStatus::OK();
1467 }
1468
1469 IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/,
1470 IODebugContext* /*dbg*/) {
1471 if (close(fd_) < 0) {
1472 return IOError("While close random read/write file", filename_, errno);
1473 }
1474 fd_ = -1;
1475 return IOStatus::OK();
1476 }
1477
// Releases the mapping established for this buffer.
PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
  // TODO should have error handling though not much we can do...
  munmap(this->base_, length_);
}
1482
1483 /*
1484 * PosixDirectory
1485 */
1486
1487 PosixDirectory::~PosixDirectory() { close(fd_); }
1488
1489 IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/,
1490 IODebugContext* /*dbg*/) {
1491 #ifndef OS_AIX
1492 if (fsync(fd_) == -1) {
1493 return IOError("While fsync", "a directory", errno);
1494 }
1495 #endif
1496 return IOStatus::OK();
1497 }
1498 } // namespace ROCKSDB_NAMESPACE
1499 #endif