1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 #ifdef ROCKSDB_LIB_IO_POSIX
11 #include "env/io_posix.h"
19 #ifndef FALLOC_FL_KEEP_SIZE
20 #include <linux/falloc.h>
26 #include <sys/ioctl.h>
29 #include <sys/types.h>
31 #include <sys/statfs.h>
32 #include <sys/sysmacros.h>
34 #include "monitoring/iostats_context_imp.h"
35 #include "port/port.h"
36 #include "port/stack_trace.h"
37 #include "rocksdb/slice.h"
38 #include "test_util/sync_point.h"
39 #include "util/autovector.h"
40 #include "util/coding.h"
41 #include "util/string_util.h"
43 #if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
44 #define F_LINUX_SPECIFIC_BASE 1024
45 #define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
48 namespace ROCKSDB_NAMESPACE
{
50 std::string
IOErrorMsg(const std::string
& context
,
51 const std::string
& file_name
) {
52 if (file_name
.empty()) {
55 return context
+ ": " + file_name
;
58 // file_name can be left empty if it is not unkown.
59 IOStatus
IOError(const std::string
& context
, const std::string
& file_name
,
63 IOStatus s
= IOStatus::NoSpace(IOErrorMsg(context
, file_name
),
64 errnoStr(err_number
).c_str());
69 return IOStatus::IOError(IOStatus::kStaleFile
);
71 return IOStatus::PathNotFound(IOErrorMsg(context
, file_name
),
72 errnoStr(err_number
).c_str());
74 return IOStatus::IOError(IOErrorMsg(context
, file_name
),
75 errnoStr(err_number
).c_str());
79 // A wrapper for fadvise, if the platform doesn't support fadvise,
80 // it will simply return 0.
// NOTE(review): this extraction is missing original lines 82 and 84-88
// (presumably the platform #ifdef/#else guards separating the
// posix_fadvise call from the "do nothing" fallback) -- confirm against
// the upstream file before reasoning about control flow here.
81 int Fadvise(int fd
, off_t offset
, size_t len
, int advice
) {
83 return posix_fadvise(fd
, offset
, len
, advice
);
89 return 0; // simply do nothing.
93 // A wrapper for madvise, if the platform doesn't support madvise,
94 // it will simply return 0.
// NOTE(review): comment above originally said "fadvise" but this wraps
// posix_madvise; fixed. This extraction is also missing the original
// platform #ifdef/#else guards between the two return statements --
// confirm against the upstream file.
95 int Madvise(void* addr
, size_t len
, int advice
) {
97 return posix_madvise(addr
, len
, advice
);
102 return 0; // simply do nothing.
108 // On MacOS (and probably *BSD), the posix write and pwrite calls do not support
109 // buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
110 // cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
111 // the writes aligned.
// Writes `nbyte` bytes from `buf` to `fd` in chunks of at most 1GB.
// NOTE(review): the surrounding while-loop, the `left` bookkeeping, and
// the error/return paths (original lines 115-134) are missing from this
// extraction; only the per-iteration chunking and the EINTR check are
// visible. Do not infer the retry/failure contract from this fragment.
113 bool PosixWrite(int fd
, const char* buf
, size_t nbyte
) {
// Chunk cap chosen to keep individual write() calls under 2^31-1 bytes
// while preserving alignment (see header comment).
114 const size_t kLimit1Gb
= 1UL << 30;
116 const char* src
= buf
;
120 size_t bytes_to_write
= std::min(left
, kLimit1Gb
);
122 ssize_t done
= write(fd
, src
, bytes_to_write
);
// Interrupted by a signal: presumably retried by the (missing) loop.
124 if (errno
== EINTR
) {
// Positioned (pwrite-based) variant of PosixWrite: writes `nbyte` bytes
// from `buf` at `offset` in chunks of at most 1GB.
// NOTE(review): the enclosing loop, `left`/`offset` advancement, and the
// return paths (original lines 137-158) are missing from this
// extraction; only the chunk computation and the EINTR check survive.
135 bool PosixPositionedWrite(int fd
, const char* buf
, size_t nbyte
, off_t offset
) {
136 const size_t kLimit1Gb
= 1UL << 30;
138 const char* src
= buf
;
142 size_t bytes_to_write
= std::min(left
, kLimit1Gb
);
144 ssize_t done
= pwrite(fd
, src
, bytes_to_write
, offset
);
// Interrupted by a signal: presumably retried by the (missing) loop.
146 if (errno
== EINTR
) {
159 #ifdef ROCKSDB_RANGESYNC_PRESENT
161 #if !defined(ZFS_SUPER_MAGIC)
162 // The magic number for ZFS was not exposed until recently. It should be fixed
163 // forever so we can just copy the magic number here.
164 #define ZFS_SUPER_MAGIC 0x2fc12fc1
// Returns whether `sync_file_range` can be trusted on the filesystem
// backing `fd`. Known-bad cases: ZFS (writeback not actually async) and
// platforms where the syscall returns ENOSYS despite compiling.
// NOTE(review): the `struct statfs buf` declaration and the early
// `return false;` bodies (original lines 173, 181-183, 192-193, 195)
// are missing from this extraction.
167 bool IsSyncFileRangeSupported(int fd
) {
168 // This function tracks and checks for cases where we know `sync_file_range`
169 // definitely will not work properly despite passing the compile-time check
170 // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks
171 // fail in unexpected ways, we allow `sync_file_range` to be used. This way
172 // should minimize risk of impacting existing use cases.
174 int ret
= fstatfs(fd
, &buf
);
176 if (ret
== 0 && buf
.f_type
== ZFS_SUPER_MAGIC
) {
177 // Testing on ZFS showed the writeback did not happen asynchronously when
178 // `sync_file_range` was called, even though it returned success. Avoid it
179 // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
180 // even though this'll incur extra I/O for metadata.
// Probe call: zero offset/nbytes/flags is a no-op that still reveals
// whether the syscall is implemented on this platform.
184 ret
= sync_file_range(fd
, 0 /* offset */, 0 /* nbytes */, 0 /* flags */);
185 assert(!(ret
== -1 && errno
!= ENOSYS
));
186 if (ret
== -1 && errno
== ENOSYS
) {
187 // `sync_file_range` is not implemented on all platforms even if
188 // compile-time checks pass and a supported filesystem is in-use. For
189 // example, using ext4 on WSL (Windows Subsystem for Linux),
190 // `sync_file_range()` returns `ENOSYS`
191 // ("Function not implemented").
194 // None of the known cases matched, so allow `sync_file_range` use.
198 #undef ZFS_SUPER_MAGIC
200 #endif // ROCKSDB_RANGESYNC_PRESENT
202 } // anonymous namespace
205 * PosixSequentialFile
207 PosixSequentialFile::PosixSequentialFile(const std::string
& fname
, FILE* file
,
208 int fd
, size_t logical_block_size
,
209 const EnvOptions
& options
)
213 use_direct_io_(options
.use_direct_reads
),
214 logical_sector_size_(logical_block_size
) {
215 assert(!options
.use_direct_reads
|| !options
.use_mmap_reads
);
218 PosixSequentialFile::~PosixSequentialFile() {
219 if (!use_direct_io()) {
228 IOStatus
PosixSequentialFile::Read(size_t n
, const IOOptions
& /*opts*/,
229 Slice
* result
, char* scratch
,
230 IODebugContext
* /*dbg*/) {
231 assert(result
!= nullptr && !use_direct_io());
236 r
= fread_unlocked(scratch
, 1, n
, file_
);
237 } while (r
== 0 && ferror(file_
) && errno
== EINTR
);
238 *result
= Slice(scratch
, r
);
241 // We leave status as ok if we hit the end of the file
242 // We also clear the error so that the reads can continue
243 // if a new data is written to the file
246 // A partial read with an error: return a non-ok status
247 s
= IOError("While reading file sequentially", filename_
, errno
);
253 IOStatus
PosixSequentialFile::PositionedRead(uint64_t offset
, size_t n
,
254 const IOOptions
& /*opts*/,
255 Slice
* result
, char* scratch
,
256 IODebugContext
* /*dbg*/) {
257 assert(use_direct_io());
258 assert(IsSectorAligned(offset
, GetRequiredBufferAlignment()));
259 assert(IsSectorAligned(n
, GetRequiredBufferAlignment()));
260 assert(IsSectorAligned(scratch
, GetRequiredBufferAlignment()));
267 r
= pread(fd_
, ptr
, left
, static_cast<off_t
>(offset
));
269 if (r
== -1 && errno
== EINTR
) {
277 if (!IsSectorAligned(r
, GetRequiredBufferAlignment())) {
278 // Bytes reads don't fill sectors. Should only happen at the end
284 // An error: return a non-ok status
285 s
= IOError("While pread " + std::to_string(n
) + " bytes from offset " +
286 std::to_string(offset
),
289 *result
= Slice(scratch
, (r
< 0) ? 0 : n
- left
);
// Advances the buffered-read position by `n` bytes via fseek(SEEK_CUR).
// Returns an IOError (with errno context) on fseek failure, OK otherwise.
// NOTE(review): the error-return continuation (original lines 296-297,
// passing filename_/errno) is missing from this extraction.
293 IOStatus
PosixSequentialFile::Skip(uint64_t n
) {
294 if (fseek(file_
, static_cast<long int>(n
), SEEK_CUR
)) {
295 return IOError("While fseek to skip " + std::to_string(n
) + " bytes",
298 return IOStatus::OK();
301 IOStatus
PosixSequentialFile::InvalidateCache(size_t offset
, size_t length
) {
305 return IOStatus::OK();
307 if (!use_direct_io()) {
309 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
311 return IOError("While fadvise NotNeeded offset " +
312 std::to_string(offset
) + " len " +
313 std::to_string(length
),
317 return IOStatus::OK();
322 * PosixRandomAccessFile
324 #if defined(OS_LINUX)
325 size_t PosixHelper::GetUniqueIdFromFile(int fd
, char* id
, size_t max_size
) {
326 if (max_size
< kMaxVarint64Length
* 3) {
331 int result
= fstat(fd
, &buf
);
337 result
= ioctl(fd
, FS_IOC_GETVERSION
, &version
);
338 TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result
);
342 uint64_t uversion
= (uint64_t)version
;
345 rid
= EncodeVarint64(rid
, buf
.st_dev
);
346 rid
= EncodeVarint64(rid
, buf
.st_ino
);
347 rid
= EncodeVarint64(rid
, uversion
);
349 return static_cast<size_t>(rid
- id
);
353 #if defined(OS_MACOSX) || defined(OS_AIX)
354 size_t PosixHelper::GetUniqueIdFromFile(int fd
, char* id
, size_t max_size
) {
355 if (max_size
< kMaxVarint64Length
* 3) {
360 int result
= fstat(fd
, &buf
);
366 rid
= EncodeVarint64(rid
, buf
.st_dev
);
367 rid
= EncodeVarint64(rid
, buf
.st_ino
);
368 rid
= EncodeVarint64(rid
, buf
.st_gen
);
370 return static_cast<size_t>(rid
- id
);
// Returns `path` with a single trailing '/' removed, leaving a bare "/"
// untouched (the `size() > 1` guard). Used to normalize directory keys
// for the logical-block-size cache below.
// NOTE(review): the pop_back()/return body (original lines 378-381) is
// missing from this extraction.
375 std::string
RemoveTrailingSlash(const std::string
& path
) {
376 std::string p
= path
;
377 if (p
.size() > 1 && p
.back() == '/') {
383 Status
LogicalBlockSizeCache::RefAndCacheLogicalBlockSize(
384 const std::vector
<std::string
>& directories
) {
385 std::vector
<std::string
> dirs
;
386 dirs
.reserve(directories
.size());
387 for (auto& d
: directories
) {
388 dirs
.emplace_back(RemoveTrailingSlash(d
));
391 std::map
<std::string
, size_t> dir_sizes
;
393 ReadLock
lock(&cache_mutex_
);
394 for (const auto& dir
: dirs
) {
395 if (cache_
.find(dir
) == cache_
.end()) {
396 dir_sizes
.emplace(dir
, 0);
402 for (auto& dir_size
: dir_sizes
) {
403 s
= get_logical_block_size_of_directory_(dir_size
.first
, &dir_size
.second
);
409 WriteLock
lock(&cache_mutex_
);
410 for (const auto& dir
: dirs
) {
411 auto& v
= cache_
[dir
];
413 auto dir_size
= dir_sizes
.find(dir
);
414 if (dir_size
!= dir_sizes
.end()) {
415 v
.size
= dir_size
->second
;
421 void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize(
422 const std::vector
<std::string
>& directories
) {
423 std::vector
<std::string
> dirs
;
424 dirs
.reserve(directories
.size());
425 for (auto& dir
: directories
) {
426 dirs
.emplace_back(RemoveTrailingSlash(dir
));
429 WriteLock
lock(&cache_mutex_
);
430 for (const auto& dir
: dirs
) {
431 auto it
= cache_
.find(dir
);
432 if (it
!= cache_
.end() && !(--(it
->second
.ref
))) {
438 size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string
& fname
,
440 std::string dir
= fname
.substr(0, fname
.find_last_of("/"));
445 ReadLock
lock(&cache_mutex_
);
446 auto it
= cache_
.find(dir
);
447 if (it
!= cache_
.end()) {
448 return it
->second
.size
;
451 return get_logical_block_size_of_fd_(fd
);
455 Status
PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string
& directory
,
457 int fd
= open(directory
.c_str(), O_DIRECTORY
| O_RDONLY
);
460 return Status::IOError("Cannot open directory " + directory
);
462 *size
= PosixHelper::GetLogicalBlockSizeOfFd(fd
);
467 size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd
) {
470 int result
= fstat(fd
, &buf
);
472 return kDefaultPageSize
;
474 if (major(buf
.st_dev
) == 0) {
475 // Unnamed devices (e.g. non-device mounts), reserved as null device number.
476 // These don't have an entry in /sys/dev/block/. Return a sensible default.
477 return kDefaultPageSize
;
480 // Reading queue/logical_block_size does not require special permissions.
481 const int kBufferSize
= 100;
482 char path
[kBufferSize
];
483 char real_path
[PATH_MAX
+ 1];
484 snprintf(path
, kBufferSize
, "/sys/dev/block/%u:%u", major(buf
.st_dev
),
486 if (realpath(path
, real_path
) == nullptr) {
487 return kDefaultPageSize
;
489 std::string
device_dir(real_path
);
490 if (!device_dir
.empty() && device_dir
.back() == '/') {
491 device_dir
.pop_back();
493 // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
494 // and nvme0n1 have it.
495 // $ ls -al '/sys/dev/block/8:3'
496 // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
497 // ../../block/sda/sda3
498 // $ ls -al '/sys/dev/block/259:4'
499 // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
500 // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
501 size_t parent_end
= device_dir
.rfind('/', device_dir
.length() - 1);
502 if (parent_end
== std::string::npos
) {
503 return kDefaultPageSize
;
505 size_t parent_begin
= device_dir
.rfind('/', parent_end
- 1);
506 if (parent_begin
== std::string::npos
) {
507 return kDefaultPageSize
;
510 device_dir
.substr(parent_begin
+ 1, parent_end
- parent_begin
- 1);
511 std::string child
= device_dir
.substr(parent_end
+ 1, std::string::npos
);
512 if (parent
!= "block" &&
513 (child
.compare(0, 4, "nvme") || child
.find('p') != std::string::npos
)) {
514 device_dir
= device_dir
.substr(0, parent_end
);
516 std::string fname
= device_dir
+ "/queue/logical_block_size";
519 fp
= fopen(fname
.c_str(), "r");
521 char* line
= nullptr;
523 if (getline(&line
, &len
, fp
) != -1) {
524 sscanf(line
, "%zu", &size
);
529 if (size
!= 0 && (size
& (size
- 1)) == 0) {
534 return kDefaultPageSize
;
538 * PosixRandomAccessFile
540 * pread() based random-access
542 PosixRandomAccessFile::PosixRandomAccessFile(
543 const std::string
& fname
, int fd
, size_t logical_block_size
,
544 const EnvOptions
& options
545 #if defined(ROCKSDB_IOURING_PRESENT)
547 ThreadLocalPtr
* thread_local_io_urings
552 use_direct_io_(options
.use_direct_reads
),
553 logical_sector_size_(logical_block_size
)
554 #if defined(ROCKSDB_IOURING_PRESENT)
556 thread_local_io_urings_(thread_local_io_urings
)
559 assert(!options
.use_direct_reads
|| !options
.use_mmap_reads
);
560 assert(!options
.use_mmap_reads
);
// Destructor: closes the owned file descriptor. The close() return value
// is intentionally ignored; there is no useful recovery at destruction.
563 PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_
); }
565 IOStatus
PosixRandomAccessFile::Read(uint64_t offset
, size_t n
,
566 const IOOptions
& /*opts*/, Slice
* result
,
568 IODebugContext
* /*dbg*/) const {
569 if (use_direct_io()) {
570 assert(IsSectorAligned(offset
, GetRequiredBufferAlignment()));
571 assert(IsSectorAligned(n
, GetRequiredBufferAlignment()));
572 assert(IsSectorAligned(scratch
, GetRequiredBufferAlignment()));
579 r
= pread(fd_
, ptr
, left
, static_cast<off_t
>(offset
));
581 if (r
== -1 && errno
== EINTR
) {
589 if (use_direct_io() &&
590 r
% static_cast<ssize_t
>(GetRequiredBufferAlignment()) != 0) {
591 // Bytes reads don't fill sectors. Should only happen at the end
597 // An error: return a non-ok status
598 s
= IOError("While pread offset " + std::to_string(offset
) + " len " +
602 *result
= Slice(scratch
, (r
< 0) ? 0 : n
- left
);
606 IOStatus
PosixRandomAccessFile::MultiRead(FSReadRequest
* reqs
, size_t num_reqs
,
607 const IOOptions
& options
,
608 IODebugContext
* dbg
) {
609 if (use_direct_io()) {
610 for (size_t i
= 0; i
< num_reqs
; i
++) {
611 assert(IsSectorAligned(reqs
[i
].offset
, GetRequiredBufferAlignment()));
612 assert(IsSectorAligned(reqs
[i
].len
, GetRequiredBufferAlignment()));
613 assert(IsSectorAligned(reqs
[i
].scratch
, GetRequiredBufferAlignment()));
617 #if defined(ROCKSDB_IOURING_PRESENT)
618 struct io_uring
* iu
= nullptr;
619 if (thread_local_io_urings_
) {
620 iu
= static_cast<struct io_uring
*>(thread_local_io_urings_
->Get());
622 iu
= CreateIOUring();
624 thread_local_io_urings_
->Reset(iu
);
629 // Init failed, platform doesn't support io_uring. Fall back to
632 return FSRandomAccessFile::MultiRead(reqs
, num_reqs
, options
, dbg
);
635 IOStatus ios
= IOStatus::OK();
637 struct WrappedReadRequest
{
641 explicit WrappedReadRequest(FSReadRequest
* r
) : req(r
), finished_len(0) {}
644 autovector
<WrappedReadRequest
, 32> req_wraps
;
645 autovector
<WrappedReadRequest
*, 4> incomplete_rq_list
;
646 std::unordered_set
<WrappedReadRequest
*> wrap_cache
;
648 for (size_t i
= 0; i
< num_reqs
; i
++) {
649 req_wraps
.emplace_back(&reqs
[i
]);
653 while (num_reqs
> reqs_off
|| !incomplete_rq_list
.empty()) {
654 size_t this_reqs
= (num_reqs
- reqs_off
) + incomplete_rq_list
.size();
656 // If requests exceed depth, split it into batches
657 if (this_reqs
> kIoUringDepth
) this_reqs
= kIoUringDepth
;
659 assert(incomplete_rq_list
.size() <= this_reqs
);
660 for (size_t i
= 0; i
< this_reqs
; i
++) {
661 WrappedReadRequest
* rep_to_submit
;
662 if (i
< incomplete_rq_list
.size()) {
663 rep_to_submit
= incomplete_rq_list
[i
];
665 rep_to_submit
= &req_wraps
[reqs_off
++];
667 assert(rep_to_submit
->req
->len
> rep_to_submit
->finished_len
);
668 rep_to_submit
->iov
.iov_base
=
669 rep_to_submit
->req
->scratch
+ rep_to_submit
->finished_len
;
670 rep_to_submit
->iov
.iov_len
=
671 rep_to_submit
->req
->len
- rep_to_submit
->finished_len
;
673 struct io_uring_sqe
* sqe
;
674 sqe
= io_uring_get_sqe(iu
);
676 sqe
, fd_
, &rep_to_submit
->iov
, 1,
677 rep_to_submit
->req
->offset
+ rep_to_submit
->finished_len
);
678 io_uring_sqe_set_data(sqe
, rep_to_submit
);
679 wrap_cache
.emplace(rep_to_submit
);
681 incomplete_rq_list
.clear();
684 io_uring_submit_and_wait(iu
, static_cast<unsigned int>(this_reqs
));
685 TEST_SYNC_POINT_CALLBACK(
686 "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
688 TEST_SYNC_POINT_CALLBACK(
689 "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
692 if (static_cast<size_t>(ret
) != this_reqs
) {
693 fprintf(stderr
, "ret = %ld this_reqs: %ld\n", (long)ret
, (long)this_reqs
);
694 // If error happens and we submitted fewer than expected, it is an
695 // exception case and we don't retry here. We should still consume
696 // what is is submitted in the ring.
697 for (ssize_t i
= 0; i
< ret
; i
++) {
698 struct io_uring_cqe
* cqe
= nullptr;
699 io_uring_wait_cqe(iu
, &cqe
);
700 if (cqe
!= nullptr) {
701 io_uring_cqe_seen(iu
, cqe
);
704 return IOStatus::IOError("io_uring_submit_and_wait() requested " +
705 std::to_string(this_reqs
) + " but returned " +
706 std::to_string(ret
));
709 for (size_t i
= 0; i
< this_reqs
; i
++) {
710 struct io_uring_cqe
* cqe
= nullptr;
711 WrappedReadRequest
* req_wrap
;
713 // We could use the peek variant here, but this seems safer in terms
714 // of our initial wait not reaping all completions
715 ret
= io_uring_wait_cqe(iu
, &cqe
);
716 TEST_SYNC_POINT_CALLBACK(
717 "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret
);
719 ios
= IOStatus::IOError("io_uring_wait_cqe() returns " +
720 std::to_string(ret
));
722 if (cqe
!= nullptr) {
723 io_uring_cqe_seen(iu
, cqe
);
728 req_wrap
= static_cast<WrappedReadRequest
*>(io_uring_cqe_get_data(cqe
));
729 // Reset cqe data to catch any stray reuse of it
730 static_cast<struct io_uring_cqe
*>(cqe
)->user_data
= 0xd5d5d5d5d5d5d5d5;
731 // Check that we got a valid unique cqe data
732 auto wrap_check
= wrap_cache
.find(req_wrap
);
733 if (wrap_check
== wrap_cache
.end()) {
735 "PosixRandomAccessFile::MultiRead: "
736 "Bad cqe data from IO uring - %p\n",
739 ios
= IOStatus::IOError("io_uring_cqe_get_data() returned " +
740 std::to_string((uint64_t)req_wrap
));
743 wrap_cache
.erase(wrap_check
);
745 FSReadRequest
* req
= req_wrap
->req
;
746 size_t bytes_read
= 0;
747 bool read_again
= false;
748 UpdateResult(cqe
, filename_
, req
->len
, req_wrap
->iov
.iov_len
,
749 false /*async_read*/, use_direct_io(),
750 GetRequiredBufferAlignment(), req_wrap
->finished_len
, req
,
751 bytes_read
, read_again
);
752 int32_t res
= cqe
->res
;
754 if (bytes_read
== 0) {
758 Read(req
->offset
+ req_wrap
->finished_len
,
759 req
->len
- req_wrap
->finished_len
, options
, &tmp_slice
,
760 req
->scratch
+ req_wrap
->finished_len
, dbg
);
762 Slice(req
->scratch
, req_wrap
->finished_len
+ tmp_slice
.size());
764 // else It means EOF so no need to do anything.
765 } else if (bytes_read
< req_wrap
->iov
.iov_len
) {
766 incomplete_rq_list
.push_back(req_wrap
);
769 io_uring_cqe_seen(iu
, cqe
);
775 return FSRandomAccessFile::MultiRead(reqs
, num_reqs
, options
, dbg
);
779 IOStatus
PosixRandomAccessFile::Prefetch(uint64_t offset
, size_t n
,
780 const IOOptions
& /*opts*/,
781 IODebugContext
* /*dbg*/) {
783 if (!use_direct_io()) {
786 r
= readahead(fd_
, offset
, n
);
790 advice
.ra_offset
= static_cast<off_t
>(offset
);
791 advice
.ra_count
= static_cast<int>(n
);
792 r
= fcntl(fd_
, F_RDADVISE
, &advice
);
795 s
= IOError("While prefetching offset " + std::to_string(offset
) +
796 " len " + std::to_string(n
),
803 #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
804 size_t PosixRandomAccessFile::GetUniqueId(char* id
, size_t max_size
) const {
805 return PosixHelper::GetUniqueIdFromFile(fd_
, id
, max_size
);
// Translates an AccessPattern hint into the corresponding posix_fadvise
// advice over the whole file (offset 0, len 0). No-op when direct I/O is
// in use, since the page cache is bypassed.
// NOTE(review): the switch/case labels and break statements (original
// lines 811-834) are missing from this extraction; only the Fadvise
// calls for each advice value are visible.
809 void PosixRandomAccessFile::Hint(AccessPattern pattern
) {
810 if (use_direct_io()) {
815 Fadvise(fd_
, 0, 0, POSIX_FADV_NORMAL
);
818 Fadvise(fd_
, 0, 0, POSIX_FADV_RANDOM
);
821 Fadvise(fd_
, 0, 0, POSIX_FADV_SEQUENTIAL
);
824 Fadvise(fd_
, 0, 0, POSIX_FADV_WILLNEED
);
827 Fadvise(fd_
, 0, 0, POSIX_FADV_DONTNEED
);
835 IOStatus
PosixRandomAccessFile::InvalidateCache(size_t offset
, size_t length
) {
836 if (use_direct_io()) {
837 return IOStatus::OK();
842 return IOStatus::OK();
845 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
847 return IOStatus::OK();
849 return IOError("While fadvise NotNeeded offset " + std::to_string(offset
) +
850 " len " + std::to_string(length
),
855 IOStatus
PosixRandomAccessFile::ReadAsync(
856 FSReadRequest
& req
, const IOOptions
& /*opts*/,
857 std::function
<void(const FSReadRequest
&, void*)> cb
, void* cb_arg
,
858 void** io_handle
, IOHandleDeleter
* del_fn
, IODebugContext
* /*dbg*/) {
859 if (use_direct_io()) {
860 assert(IsSectorAligned(req
.offset
, GetRequiredBufferAlignment()));
861 assert(IsSectorAligned(req
.len
, GetRequiredBufferAlignment()));
862 assert(IsSectorAligned(req
.scratch
, GetRequiredBufferAlignment()));
865 #if defined(ROCKSDB_IOURING_PRESENT)
866 // io_uring_queue_init.
867 struct io_uring
* iu
= nullptr;
868 if (thread_local_io_urings_
) {
869 iu
= static_cast<struct io_uring
*>(thread_local_io_urings_
->Get());
871 iu
= CreateIOUring();
873 thread_local_io_urings_
->Reset(iu
);
878 // Init failed, platform doesn't support io_uring.
880 return IOStatus::NotSupported("ReadAsync");
883 // Allocate io_handle.
884 IOHandleDeleter deletefn
= [](void* args
) -> void {
885 delete (static_cast<Posix_IOHandle
*>(args
));
889 // Initialize Posix_IOHandle.
890 Posix_IOHandle
* posix_handle
=
891 new Posix_IOHandle(iu
, cb
, cb_arg
, req
.offset
, req
.len
, req
.scratch
,
892 use_direct_io(), GetRequiredBufferAlignment());
893 posix_handle
->iov
.iov_base
= req
.scratch
;
894 posix_handle
->iov
.iov_len
= req
.len
;
896 *io_handle
= static_cast<void*>(posix_handle
);
899 // Step 3: io_uring_sqe_set_data
900 struct io_uring_sqe
* sqe
;
901 sqe
= io_uring_get_sqe(iu
);
903 io_uring_prep_readv(sqe
, fd_
, /*sqe->addr=*/&posix_handle
->iov
,
904 /*sqe->len=*/1, /*sqe->offset=*/posix_handle
->offset
);
906 // Sets sqe->user_data to posix_handle.
907 io_uring_sqe_set_data(sqe
, posix_handle
);
909 // Step 4: io_uring_submit
910 ssize_t ret
= io_uring_submit(iu
);
912 fprintf(stderr
, "io_uring_submit error: %ld\n", long(ret
));
913 return IOStatus::IOError("io_uring_submit() requested but returned " +
914 std::to_string(ret
));
916 return IOStatus::OK();
923 return IOStatus::NotSupported("ReadAsync");
928 * PosixMmapReadableFile
930 * mmap() based random-access
932 // base[0,length-1] contains the mmapped contents of the file.
933 PosixMmapReadableFile::PosixMmapReadableFile(const int fd
,
934 const std::string
& fname
,
935 void* base
, size_t length
,
936 const EnvOptions
& options
)
937 : fd_(fd
), filename_(fname
), mmapped_region_(base
), length_(length
) {
941 fd_
= fd_
+ 0; // suppress the warning for used variables
942 assert(options
.use_mmap_reads
);
943 assert(!options
.use_direct_reads
);
946 PosixMmapReadableFile::~PosixMmapReadableFile() {
947 int ret
= munmap(mmapped_region_
, length_
);
949 fprintf(stdout
, "failed to munmap %p length %" ROCKSDB_PRIszt
" \n",
950 mmapped_region_
, length_
);
// Reads `n` bytes at `offset` by pointing `result` directly into the
// mmapped region (zero-copy; no scratch buffer used). Out-of-range
// offset yields an IOError; a read crossing EOF is clamped to the
// remaining length.
// NOTE(review): the `IOStatus s` declaration, the error-return
// continuation (filename_, EINVAL), and the final return (original
// lines 959, 961, 964, 967, 969-970) are missing from this extraction.
955 IOStatus
PosixMmapReadableFile::Read(uint64_t offset
, size_t n
,
956 const IOOptions
& /*opts*/, Slice
* result
,
958 IODebugContext
* /*dbg*/) const {
960 if (offset
> length_
) {
962 return IOError("While mmap read offset " + std::to_string(offset
) +
963 " larger than file length " + std::to_string(length_
),
// Clamp reads that start in range but extend past EOF.
965 } else if (offset
+ n
> length_
) {
966 n
= static_cast<size_t>(length_
- offset
);
968 *result
= Slice(reinterpret_cast<char*>(mmapped_region_
) + offset
, n
);
972 void PosixMmapReadableFile::Hint(AccessPattern pattern
) {
975 Madvise(mmapped_region_
, length_
, POSIX_MADV_NORMAL
);
978 Madvise(mmapped_region_
, length_
, POSIX_MADV_RANDOM
);
981 Madvise(mmapped_region_
, length_
, POSIX_MADV_SEQUENTIAL
);
984 Madvise(mmapped_region_
, length_
, POSIX_MADV_WILLNEED
);
987 Madvise(mmapped_region_
, length_
, POSIX_MADV_DONTNEED
);
995 IOStatus
PosixMmapReadableFile::InvalidateCache(size_t offset
, size_t length
) {
999 return IOStatus::OK();
1002 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
1004 return IOStatus::OK();
1006 return IOError("While fadvise not needed. Offset " + std::to_string(offset
) +
1007 " len" + std::to_string(length
),
1015 * We preallocate up to an extra megabyte and use memcpy to append new
1016 * data to the file. This is safe since we either properly close the
1017 * file before reading from it, or for log files, the reading code
1018 * knows enough to skip zero suffixes.
1020 IOStatus
PosixMmapFile::UnmapCurrentRegion() {
1021 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
1022 if (base_
!= nullptr) {
1023 int munmap_status
= munmap(base_
, limit_
- base_
);
1024 if (munmap_status
!= 0) {
1025 return IOError("While munmap", filename_
, munmap_status
);
1027 file_offset_
+= limit_
- base_
;
1030 last_sync_
= nullptr;
1033 // Increase the amount we map the next time, but capped at 1MB
1034 if (map_size_
< (1 << 20)) {
1038 return IOStatus::OK();
1041 IOStatus
PosixMmapFile::MapNewRegion() {
1042 #ifdef ROCKSDB_FALLOCATE_PRESENT
1043 assert(base_
== nullptr);
1044 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
1045 // we can't fallocate with FALLOC_FL_KEEP_SIZE here
1046 if (allow_fallocate_
) {
1047 IOSTATS_TIMER_GUARD(allocate_nanos
);
1048 int alloc_status
= fallocate(fd_
, 0, file_offset_
, map_size_
);
1049 if (alloc_status
!= 0) {
1050 // fallback to posix_fallocate
1051 alloc_status
= posix_fallocate(fd_
, file_offset_
, map_size_
);
1053 if (alloc_status
!= 0) {
1054 return IOStatus::IOError("Error allocating space to file : " + filename_
+
1055 "Error : " + errnoStr(alloc_status
).c_str());
1059 TEST_KILL_RANDOM("PosixMmapFile::Append:1");
1060 void* ptr
= mmap(nullptr, map_size_
, PROT_READ
| PROT_WRITE
, MAP_SHARED
, fd_
,
1062 if (ptr
== MAP_FAILED
) {
1063 return IOStatus::IOError("MMap failed on " + filename_
);
1065 TEST_KILL_RANDOM("PosixMmapFile::Append:2");
1067 base_
= reinterpret_cast<char*>(ptr
);
1068 limit_
= base_
+ map_size_
;
1071 return IOStatus::OK();
1073 return IOStatus::NotSupported("This platform doesn't support fallocate()");
// Flushes the dirty portion of the current mmapped region [last_sync_,
// dst_) to disk with msync(MS_SYNC), rounding both ends out to page
// boundaries. Fast-path returns OK when nothing was appended since the
// last sync.
// NOTE(review): the `last_sync_ = dst_;` update (original line 1085 or
// 1089) is missing from this extraction -- confirm upstream where the
// watermark is advanced.
1077 IOStatus
PosixMmapFile::Msync() {
1078 if (dst_
== last_sync_
) {
1079 return IOStatus::OK();
1081 // Find the beginnings of the pages that contain the first and last
1082 // bytes to be synced.
1083 size_t p1
= TruncateToPageBoundary(last_sync_
- base_
);
1084 size_t p2
= TruncateToPageBoundary(dst_
- base_
- 1);
1086 TEST_KILL_RANDOM("PosixMmapFile::Msync:0");
// Sync [p1, p2 + page_size_): p2 is the start of the last dirty page,
// so one extra page_size_ covers it fully.
1087 if (msync(base_
+ p1
, p2
- p1
+ page_size_
, MS_SYNC
) < 0) {
1088 return IOError("While msync", filename_
, errno
);
1090 return IOStatus::OK();
1093 PosixMmapFile::PosixMmapFile(const std::string
& fname
, int fd
, size_t page_size
,
1094 const EnvOptions
& options
)
1097 page_size_(page_size
),
1098 map_size_(Roundup(65536, page_size
)),
1102 last_sync_(nullptr),
1104 #ifdef ROCKSDB_FALLOCATE_PRESENT
1105 allow_fallocate_
= options
.allow_fallocate
;
1106 fallocate_with_keep_size_
= options
.fallocate_with_keep_size
;
1110 assert((page_size
& (page_size
- 1)) == 0);
1111 assert(options
.use_mmap_writes
);
1112 assert(!options
.use_direct_writes
);
1115 PosixMmapFile::~PosixMmapFile() {
1117 IOStatus s
= PosixMmapFile::Close(IOOptions(), nullptr);
1118 s
.PermitUncheckedError();
1122 IOStatus
PosixMmapFile::Append(const Slice
& data
, const IOOptions
& /*opts*/,
1123 IODebugContext
* /*dbg*/) {
1124 const char* src
= data
.data();
1125 size_t left
= data
.size();
1127 assert(base_
<= dst_
);
1128 assert(dst_
<= limit_
);
1129 size_t avail
= limit_
- dst_
;
1131 IOStatus s
= UnmapCurrentRegion();
1139 TEST_KILL_RANDOM("PosixMmapFile::Append:0");
1142 size_t n
= (left
<= avail
) ? left
: avail
;
1144 memcpy(dst_
, src
, n
);
1149 return IOStatus::OK();
1152 IOStatus
PosixMmapFile::Close(const IOOptions
& /*opts*/,
1153 IODebugContext
* /*dbg*/) {
1155 size_t unused
= limit_
- dst_
;
1157 s
= UnmapCurrentRegion();
1159 s
= IOError("While closing mmapped file", filename_
, errno
);
1160 } else if (unused
> 0) {
1161 // Trim the extra space at the end of the file
1162 if (ftruncate(fd_
, file_offset_
- unused
) < 0) {
1163 s
= IOError("While ftruncating mmaped file", filename_
, errno
);
1167 if (close(fd_
) < 0) {
1169 s
= IOError("While closing mmapped file", filename_
, errno
);
1179 IOStatus
PosixMmapFile::Flush(const IOOptions
& /*opts*/,
1180 IODebugContext
* /*dbg*/) {
1181 return IOStatus::OK();
1184 IOStatus
PosixMmapFile::Sync(const IOOptions
& /*opts*/,
1185 IODebugContext
* /*dbg*/) {
1186 #ifdef HAVE_FULLFSYNC
1187 if (::fcntl(fd_
, F_FULLFSYNC
) < 0) {
1188 return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_
, errno
);
1190 #else // HAVE_FULLFSYNC
1191 if (fdatasync(fd_
) < 0) {
1192 return IOError("While fdatasync mmapped file", filename_
, errno
);
1194 #endif // HAVE_FULLFSYNC
1200 * Flush data as well as metadata to stable storage.
1202 IOStatus
PosixMmapFile::Fsync(const IOOptions
& /*opts*/,
1203 IODebugContext
* /*dbg*/) {
1204 #ifdef HAVE_FULLFSYNC
1205 if (::fcntl(fd_
, F_FULLFSYNC
) < 0) {
1206 return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_
, errno
);
1208 #else // HAVE_FULLFSYNC
1209 if (fsync(fd_
) < 0) {
1210 return IOError("While fsync mmaped file", filename_
, errno
);
1212 #endif // HAVE_FULLFSYNC
1218 * Get the size of valid data in the file. This will not match the
1219 * size that is returned from the filesystem because we use mmap
1220 * to extend file by map_size every time.
// Valid-data size = bytes already unmapped to disk (file_offset_) plus
// bytes appended into the current mapped region (dst_ - base_).
1222 uint64_t PosixMmapFile::GetFileSize(const IOOptions
& /*opts*/,
1223 IODebugContext
* /*dbg*/) {
1224 size_t used
= dst_
- base_
;
1225 return file_offset_
+ used
;
1228 IOStatus
PosixMmapFile::InvalidateCache(size_t offset
, size_t length
) {
1232 return IOStatus::OK();
1235 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
1237 return IOStatus::OK();
1239 return IOError("While fadvise NotNeeded mmapped file", filename_
, errno
);
1243 #ifdef ROCKSDB_FALLOCATE_PRESENT
1244 IOStatus
PosixMmapFile::Allocate(uint64_t offset
, uint64_t len
,
1245 const IOOptions
& /*opts*/,
1246 IODebugContext
* /*dbg*/) {
1247 assert(offset
<= static_cast<uint64_t>(std::numeric_limits
<off_t
>::max()));
1248 assert(len
<= static_cast<uint64_t>(std::numeric_limits
<off_t
>::max()));
1249 TEST_KILL_RANDOM("PosixMmapFile::Allocate:0");
1250 int alloc_status
= 0;
1251 if (allow_fallocate_
) {
1253 fallocate(fd_
, fallocate_with_keep_size_
? FALLOC_FL_KEEP_SIZE
: 0,
1254 static_cast<off_t
>(offset
), static_cast<off_t
>(len
));
1256 if (alloc_status
== 0) {
1257 return IOStatus::OK();
1259 return IOError("While fallocate offset " + std::to_string(offset
) +
1260 " len " + std::to_string(len
),
1269 * Use posix write to write data to a file.
1271 PosixWritableFile::PosixWritableFile(const std::string
& fname
, int fd
,
1272 size_t logical_block_size
,
1273 const EnvOptions
& options
)
1274 : FSWritableFile(options
),
1276 use_direct_io_(options
.use_direct_writes
),
1279 logical_sector_size_(logical_block_size
) {
1280 #ifdef ROCKSDB_FALLOCATE_PRESENT
1281 allow_fallocate_
= options
.allow_fallocate
;
1282 fallocate_with_keep_size_
= options
.fallocate_with_keep_size
;
1284 #ifdef ROCKSDB_RANGESYNC_PRESENT
1285 sync_file_range_supported_
= IsSyncFileRangeSupported(fd_
);
1286 #endif // ROCKSDB_RANGESYNC_PRESENT
1287 assert(!options
.use_mmap_writes
);
1290 PosixWritableFile::~PosixWritableFile() {
1292 IOStatus s
= PosixWritableFile::Close(IOOptions(), nullptr);
1293 s
.PermitUncheckedError();
1297 IOStatus
PosixWritableFile::Append(const Slice
& data
, const IOOptions
& /*opts*/,
1298 IODebugContext
* /*dbg*/) {
1299 if (use_direct_io()) {
1300 assert(IsSectorAligned(data
.size(), GetRequiredBufferAlignment()));
1301 assert(IsSectorAligned(data
.data(), GetRequiredBufferAlignment()));
1303 const char* src
= data
.data();
1304 size_t nbytes
= data
.size();
1306 if (!PosixWrite(fd_
, src
, nbytes
)) {
1307 return IOError("While appending to file", filename_
, errno
);
1310 filesize_
+= nbytes
;
1311 return IOStatus::OK();
1314 IOStatus
PosixWritableFile::PositionedAppend(const Slice
& data
, uint64_t offset
,
1315 const IOOptions
& /*opts*/,
1316 IODebugContext
* /*dbg*/) {
1317 if (use_direct_io()) {
1318 assert(IsSectorAligned(offset
, GetRequiredBufferAlignment()));
1319 assert(IsSectorAligned(data
.size(), GetRequiredBufferAlignment()));
1320 assert(IsSectorAligned(data
.data(), GetRequiredBufferAlignment()));
1322 assert(offset
<= static_cast<uint64_t>(std::numeric_limits
<off_t
>::max()));
1323 const char* src
= data
.data();
1324 size_t nbytes
= data
.size();
1325 if (!PosixPositionedWrite(fd_
, src
, nbytes
, static_cast<off_t
>(offset
))) {
1326 return IOError("While pwrite to file at offset " + std::to_string(offset
),
1329 filesize_
= offset
+ nbytes
;
1330 return IOStatus::OK();
1333 IOStatus
PosixWritableFile::Truncate(uint64_t size
, const IOOptions
& /*opts*/,
1334 IODebugContext
* /*dbg*/) {
1336 int r
= ftruncate(fd_
, size
);
1338 s
= IOError("While ftruncate file to size " + std::to_string(size
),
1346 IOStatus
PosixWritableFile::Close(const IOOptions
& /*opts*/,
1347 IODebugContext
* /*dbg*/) {
1351 size_t last_allocated_block
;
1352 GetPreallocationStatus(&block_size
, &last_allocated_block
);
1353 TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block
);
1354 if (last_allocated_block
> 0) {
1355 // trim the extra space preallocated at the end of the file
1356 // NOTE(ljin): we probably don't want to surface failure as an IOError,
1357 // but it will be nice to log these errors.
1358 int dummy
__attribute__((__unused__
));
1359 dummy
= ftruncate(fd_
, filesize_
);
1360 #if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE)
1361 // in some file systems, ftruncate only trims trailing space if the
1362 // new file size is smaller than the current size. Calling fallocate
1363 // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
1364 // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
1366 // XFS (since Linux 2.6.38)
1367 // ext4 (since Linux 3.0)
1368 // Btrfs (since Linux 3.7)
1369 // tmpfs (since Linux 3.5)
1370 // We ignore error since failure of this operation does not affect
1372 struct stat file_stats
;
1373 int result
= fstat(fd_
, &file_stats
);
1374 // After ftruncate, we check whether ftruncate has the correct behavior.
1375 // If not, we should hack it with FALLOC_FL_PUNCH_HOLE
1377 (file_stats
.st_size
+ file_stats
.st_blksize
- 1) /
1378 file_stats
.st_blksize
!=
1379 file_stats
.st_blocks
/ (file_stats
.st_blksize
/ 512)) {
1380 IOSTATS_TIMER_GUARD(allocate_nanos
);
1381 if (allow_fallocate_
) {
1382 fallocate(fd_
, FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
, filesize_
,
1383 block_size
* last_allocated_block
- filesize_
);
1389 if (close(fd_
) < 0) {
1390 s
= IOError("While closing file after writing", filename_
, errno
);
1396 // write out the cached data to the OS cache
1397 IOStatus
PosixWritableFile::Flush(const IOOptions
& /*opts*/,
1398 IODebugContext
* /*dbg*/) {
1399 return IOStatus::OK();
1402 IOStatus
PosixWritableFile::Sync(const IOOptions
& /*opts*/,
1403 IODebugContext
* /*dbg*/) {
1404 #ifdef HAVE_FULLFSYNC
1405 if (::fcntl(fd_
, F_FULLFSYNC
) < 0) {
1406 return IOError("while fcntl(F_FULLFSYNC)", filename_
, errno
);
1408 #else // HAVE_FULLFSYNC
1409 if (fdatasync(fd_
) < 0) {
1410 return IOError("While fdatasync", filename_
, errno
);
1412 #endif // HAVE_FULLFSYNC
1413 return IOStatus::OK();
1416 IOStatus
PosixWritableFile::Fsync(const IOOptions
& /*opts*/,
1417 IODebugContext
* /*dbg*/) {
1418 #ifdef HAVE_FULLFSYNC
1419 if (::fcntl(fd_
, F_FULLFSYNC
) < 0) {
1420 return IOError("while fcntl(F_FULLFSYNC)", filename_
, errno
);
1422 #else // HAVE_FULLFSYNC
1423 if (fsync(fd_
) < 0) {
1424 return IOError("While fsync", filename_
, errno
);
1426 #endif // HAVE_FULLFSYNC
1427 return IOStatus::OK();
1430 bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
1432 uint64_t PosixWritableFile::GetFileSize(const IOOptions
& /*opts*/,
1433 IODebugContext
* /*dbg*/) {
1437 void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint
) {
1439 // Suppress Valgrind "Unimplemented functionality" error.
1440 #ifndef ROCKSDB_VALGRIND_RUN
1441 if (hint
== write_hint_
) {
1444 if (fcntl(fd_
, F_SET_RW_HINT
, &hint
) == 0) {
1449 #endif // ROCKSDB_VALGRIND_RUN
1455 IOStatus
PosixWritableFile::InvalidateCache(size_t offset
, size_t length
) {
1456 if (use_direct_io()) {
1457 return IOStatus::OK();
1462 return IOStatus::OK();
1465 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
1467 return IOStatus::OK();
1469 return IOError("While fadvise NotNeeded", filename_
, errno
);
#ifdef ROCKSDB_FALLOCATE_PRESENT
// Pre-allocates [offset, offset+len) on disk via fallocate(2) so subsequent
// appends do not repeatedly extend the file. Honors allow_fallocate_ and
// fallocate_with_keep_size_ from the construction-time options.
IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
                                     const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  // off_t is signed; guard against silently wrapping on the casts below.
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0");
  IOSTATS_TIMER_GUARD(allocate_nanos);
  int alloc_status = 0;
  if (allow_fallocate_) {
    // FALLOC_FL_KEEP_SIZE reserves blocks without changing the reported
    // file size; mode 0 extends the file size as well.
    alloc_status =
        fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
                  static_cast<off_t>(offset), static_cast<off_t>(len));
  }
  if (alloc_status == 0) {
    return IOStatus::OK();
  } else {
    return IOError("While fallocate offset " + std::to_string(offset) +
                       " len " + std::to_string(len),
                   filename_, errno);
  }
}
#endif  // ROCKSDB_FALLOCATE_PRESENT
1497 IOStatus
PosixWritableFile::RangeSync(uint64_t offset
, uint64_t nbytes
,
1498 const IOOptions
& opts
,
1499 IODebugContext
* dbg
) {
1500 #ifdef ROCKSDB_RANGESYNC_PRESENT
1501 assert(offset
<= static_cast<uint64_t>(std::numeric_limits
<off_t
>::max()));
1502 assert(nbytes
<= static_cast<uint64_t>(std::numeric_limits
<off_t
>::max()));
1503 if (sync_file_range_supported_
) {
1505 if (strict_bytes_per_sync_
) {
1506 // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
1507 // that spans all bytes written so far tells `sync_file_range` to wait for
1508 // any outstanding writeback requests to finish before issuing a new one.
1510 sync_file_range(fd_
, 0, static_cast<off_t
>(offset
+ nbytes
),
1511 SYNC_FILE_RANGE_WAIT_BEFORE
| SYNC_FILE_RANGE_WRITE
);
1513 ret
= sync_file_range(fd_
, static_cast<off_t
>(offset
),
1514 static_cast<off_t
>(nbytes
), SYNC_FILE_RANGE_WRITE
);
1517 return IOError("While sync_file_range returned " + std::to_string(ret
),
1520 return IOStatus::OK();
1522 #endif // ROCKSDB_RANGESYNC_PRESENT
1523 return FSWritableFile::RangeSync(offset
, nbytes
, opts
, dbg
);
// NOTE(review): upstream guards this with OS_LINUX (PosixHelper reads the
// inode-based id) — restored here; verify against the declaration in the
// header.
#ifdef OS_LINUX
// Fills `id` with a filesystem-unique identifier for this file; returns the
// number of bytes written (0 on failure).
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
}
#endif  // OS_LINUX
1536 PosixRandomRWFile::PosixRandomRWFile(const std::string
& fname
, int fd
,
1537 const EnvOptions
& /*options*/)
1538 : filename_(fname
), fd_(fd
) {}
1540 PosixRandomRWFile::~PosixRandomRWFile() {
1542 IOStatus s
= Close(IOOptions(), nullptr);
1543 s
.PermitUncheckedError();
1547 IOStatus
PosixRandomRWFile::Write(uint64_t offset
, const Slice
& data
,
1548 const IOOptions
& /*opts*/,
1549 IODebugContext
* /*dbg*/) {
1550 const char* src
= data
.data();
1551 size_t nbytes
= data
.size();
1552 if (!PosixPositionedWrite(fd_
, src
, nbytes
, static_cast<off_t
>(offset
))) {
1553 return IOError("While write random read/write file at offset " +
1554 std::to_string(offset
),
1558 return IOStatus::OK();
1561 IOStatus
PosixRandomRWFile::Read(uint64_t offset
, size_t n
,
1562 const IOOptions
& /*opts*/, Slice
* result
,
1563 char* scratch
, IODebugContext
* /*dbg*/) const {
1565 char* ptr
= scratch
;
1567 ssize_t done
= pread(fd_
, ptr
, left
, offset
);
1569 // error while reading from file
1570 if (errno
== EINTR
) {
1571 // read was interrupted, try again.
1574 return IOError("While reading random read/write file offset " +
1575 std::to_string(offset
) + " len " + std::to_string(n
),
1577 } else if (done
== 0) {
1578 // Nothing more to read
1582 // Read `done` bytes
1588 *result
= Slice(scratch
, n
- left
);
1589 return IOStatus::OK();
1592 IOStatus
PosixRandomRWFile::Flush(const IOOptions
& /*opts*/,
1593 IODebugContext
* /*dbg*/) {
1594 return IOStatus::OK();
1597 IOStatus
PosixRandomRWFile::Sync(const IOOptions
& /*opts*/,
1598 IODebugContext
* /*dbg*/) {
1599 #ifdef HAVE_FULLFSYNC
1600 if (::fcntl(fd_
, F_FULLFSYNC
) < 0) {
1601 return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_
, errno
);
1603 #else // HAVE_FULLFSYNC
1604 if (fdatasync(fd_
) < 0) {
1605 return IOError("While fdatasync random read/write file", filename_
, errno
);
1607 #endif // HAVE_FULLFSYNC
1608 return IOStatus::OK();
1611 IOStatus
PosixRandomRWFile::Fsync(const IOOptions
& /*opts*/,
1612 IODebugContext
* /*dbg*/) {
1613 #ifdef HAVE_FULLFSYNC
1614 if (::fcntl(fd_
, F_FULLFSYNC
) < 0) {
1615 return IOError("While fcntl(F_FULLSYNC) random rw file", filename_
, errno
);
1617 #else // HAVE_FULLFSYNC
1618 if (fsync(fd_
) < 0) {
1619 return IOError("While fsync random read/write file", filename_
, errno
);
1621 #endif // HAVE_FULLFSYNC
1622 return IOStatus::OK();
1625 IOStatus
PosixRandomRWFile::Close(const IOOptions
& /*opts*/,
1626 IODebugContext
* /*dbg*/) {
1627 if (close(fd_
) < 0) {
1628 return IOError("While close random read/write file", filename_
, errno
);
1631 return IOStatus::OK();
1634 PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
1635 // TODO should have error handling though not much we can do...
1636 munmap(this->base_
, length_
);
1642 #if !defined(BTRFS_SUPER_MAGIC)
1643 // The magic number for BTRFS is fixed, if it's not defined, define it here
1644 #define BTRFS_SUPER_MAGIC 0x9123683E
1646 PosixDirectory::PosixDirectory(int fd
, const std::string
& directory_name
)
1647 : fd_(fd
), directory_name_(directory_name
) {
1651 int ret
= fstatfs(fd
, &buf
);
1652 is_btrfs_
= (ret
== 0 && buf
.f_type
== static_cast<decltype(buf
.f_type
)>(
1653 BTRFS_SUPER_MAGIC
));
1657 PosixDirectory::~PosixDirectory() {
1659 IOStatus s
= PosixDirectory::Close(IOOptions(), nullptr);
1660 s
.PermitUncheckedError();
1664 IOStatus
PosixDirectory::Fsync(const IOOptions
& opts
, IODebugContext
* dbg
) {
1665 return FsyncWithDirOptions(opts
, dbg
, DirFsyncOptions());
1668 // Users who want the file entries synced in Directory project must call a
1669 // Fsync or FsyncWithDirOptions function before Close
1670 IOStatus
PosixDirectory::Close(const IOOptions
& /*opts*/,
1671 IODebugContext
* /*dbg*/) {
1672 IOStatus s
= IOStatus::OK();
1673 if (close(fd_
) < 0) {
1674 s
= IOError("While closing directory ", directory_name_
, errno
);
1681 IOStatus
PosixDirectory::FsyncWithDirOptions(
1682 const IOOptions
& /*opts*/, IODebugContext
* /*dbg*/,
1683 const DirFsyncOptions
& dir_fsync_options
) {
1684 assert(fd_
>= 0); // Check use after close
1685 IOStatus s
= IOStatus::OK();
1688 // skip dir fsync for new file creation, which is not needed for btrfs
1689 if (dir_fsync_options
.reason
== DirFsyncOptions::kNewFileSynced
) {
1692 // skip dir fsync for renaming file, only need to sync new file
1693 if (dir_fsync_options
.reason
== DirFsyncOptions::kFileRenamed
) {
1694 std::string new_name
= dir_fsync_options
.renamed_new_name
;
1695 assert(!new_name
.empty());
1698 IOSTATS_TIMER_GUARD(open_nanos
);
1699 fd
= open(new_name
.c_str(), O_RDONLY
);
1700 } while (fd
< 0 && errno
== EINTR
);
1702 s
= IOError("While open renaming file", new_name
, errno
);
1703 } else if (fsync(fd
) < 0) {
1704 s
= IOError("While fsync renaming file", new_name
, errno
);
1706 if (close(fd
) < 0) {
1707 s
= IOError("While closing file after fsync", new_name
, errno
);
1711 // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted
1714 // skip fsync/fcntl when fd_ == -1 since this file descriptor has been closed
1715 // in either the de-construction or the close function, data must have been
1716 // fsync-ed before de-construction and close is called
1717 #ifdef HAVE_FULLFSYNC
1718 // btrfs is a Linux file system, while currently F_FULLFSYNC is available on
1721 if (fd_
!= -1 && ::fcntl(fd_
, F_FULLFSYNC
) < 0) {
1722 return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno
);
1724 #else // HAVE_FULLFSYNC
1725 if (fd_
!= -1 && fsync(fd_
) == -1) {
1726 s
= IOError("While fsync", "a directory", errno
);
1728 #endif // HAVE_FULLFSYNC
1732 } // namespace ROCKSDB_NAMESPACE