1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 #ifdef ROCKSDB_LIB_IO_POSIX
11 #include "env/io_posix.h"
21 #include <sys/ioctl.h>
24 #include <sys/types.h>
26 #include <sys/statfs.h>
27 #include <sys/syscall.h>
28 #include <sys/sysmacros.h>
30 #include "env/posix_logger.h"
31 #include "monitoring/iostats_context_imp.h"
32 #include "port/port.h"
33 #include "rocksdb/slice.h"
34 #include "util/coding.h"
35 #include "util/string_util.h"
36 #include "util/sync_point.h"
38 #if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
39 #define F_LINUX_SPECIFIC_BASE 1024
40 #define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
// A wrapper for fadvise. If the platform doesn't support fadvise, it simply
// returns 0.
47 int Fadvise(int fd
, off_t offset
, size_t len
, int advice
) {
49 return posix_fadvise(fd
, offset
, len
, advice
);
55 return 0; // simply do nothing.
60 size_t GetLogicalBufferSize(int __attribute__((__unused__
)) fd
) {
63 int result
= fstat(fd
, &buf
);
65 return kDefaultPageSize
;
67 if (major(buf
.st_dev
) == 0) {
68 // Unnamed devices (e.g. non-device mounts), reserved as null device number.
69 // These don't have an entry in /sys/dev/block/. Return a sensible default.
70 return kDefaultPageSize
;
73 // Reading queue/logical_block_size does not require special permissions.
74 const int kBufferSize
= 100;
75 char path
[kBufferSize
];
76 char real_path
[PATH_MAX
+ 1];
77 snprintf(path
, kBufferSize
, "/sys/dev/block/%u:%u", major(buf
.st_dev
),
79 if (realpath(path
, real_path
) == nullptr) {
80 return kDefaultPageSize
;
82 std::string
device_dir(real_path
);
83 if (!device_dir
.empty() && device_dir
.back() == '/') {
84 device_dir
.pop_back();
86 // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
87 // and nvme0n1 have it.
88 // $ ls -al '/sys/dev/block/8:3'
89 // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
90 // ../../block/sda/sda3
91 // $ ls -al '/sys/dev/block/259:4'
92 // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
93 // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
94 size_t parent_end
= device_dir
.rfind('/', device_dir
.length() - 1);
95 if (parent_end
== std::string::npos
) {
96 return kDefaultPageSize
;
98 size_t parent_begin
= device_dir
.rfind('/', parent_end
- 1);
99 if (parent_begin
== std::string::npos
) {
100 return kDefaultPageSize
;
103 device_dir
.substr(parent_begin
+ 1, parent_end
- parent_begin
- 1);
104 std::string child
= device_dir
.substr(parent_end
+ 1, std::string::npos
);
105 if (parent
!= "block" &&
106 (child
.compare(0, 4, "nvme") || child
.find('p') != std::string::npos
)) {
107 device_dir
= device_dir
.substr(0, parent_end
);
109 std::string fname
= device_dir
+ "/queue/logical_block_size";
112 fp
= fopen(fname
.c_str(), "r");
114 char* line
= nullptr;
116 if (getline(&line
, &len
, fp
) != -1) {
117 sscanf(line
, "%zu", &size
);
122 if (size
!= 0 && (size
& (size
- 1)) == 0) {
126 return kDefaultPageSize
;
// Reports whether the byte offset `off` is an exact multiple of
// `sector_size`. `sector_size` must be non-zero (it is used as a divisor).
bool IsSectorAligned(const size_t off, size_t sector_size) {
  const size_t remainder = off % sector_size;
  return remainder == 0;
}
// Reports whether the address stored in `ptr` is an exact multiple of
// `sector_size`. The pointer is only inspected numerically, never
// dereferenced. `sector_size` must be non-zero (it is used as a divisor).
bool IsSectorAligned(const void* ptr, size_t sector_size) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
  return address % sector_size == 0;
}
148 * PosixSequentialFile
150 PosixSequentialFile::PosixSequentialFile(const std::string
& fname
, FILE* file
,
151 int fd
, const EnvOptions
& options
)
155 use_direct_io_(options
.use_direct_reads
),
156 logical_sector_size_(GetLogicalBufferSize(fd_
)) {
157 assert(!options
.use_direct_reads
|| !options
.use_mmap_reads
);
160 PosixSequentialFile::~PosixSequentialFile() {
161 if (!use_direct_io()) {
170 Status
PosixSequentialFile::Read(size_t n
, Slice
* result
, char* scratch
) {
171 assert(result
!= nullptr && !use_direct_io());
175 r
= fread_unlocked(scratch
, 1, n
, file_
);
176 } while (r
== 0 && ferror(file_
) && errno
== EINTR
);
177 *result
= Slice(scratch
, r
);
180 // We leave status as ok if we hit the end of the file
181 // We also clear the error so that the reads can continue
182 // if a new data is written to the file
185 // A partial read with an error: return a non-ok status
186 s
= IOError("While reading file sequentially", filename_
, errno
);
192 Status
PosixSequentialFile::PositionedRead(uint64_t offset
, size_t n
,
193 Slice
* result
, char* scratch
) {
194 assert(use_direct_io());
195 assert(IsSectorAligned(offset
, GetRequiredBufferAlignment()));
196 assert(IsSectorAligned(n
, GetRequiredBufferAlignment()));
197 assert(IsSectorAligned(scratch
, GetRequiredBufferAlignment()));
204 r
= pread(fd_
, ptr
, left
, static_cast<off_t
>(offset
));
206 if (r
== -1 && errno
== EINTR
) {
214 if (r
% static_cast<ssize_t
>(GetRequiredBufferAlignment()) != 0) {
215 // Bytes reads don't fill sectors. Should only happen at the end
221 // An error: return a non-ok status
223 "While pread " + ToString(n
) + " bytes from offset " + ToString(offset
),
226 *result
= Slice(scratch
, (r
< 0) ? 0 : n
- left
);
230 Status
PosixSequentialFile::Skip(uint64_t n
) {
231 if (fseek(file_
, static_cast<long int>(n
), SEEK_CUR
)) {
232 return IOError("While fseek to skip " + ToString(n
) + " bytes", filename_
,
238 Status
PosixSequentialFile::InvalidateCache(size_t offset
, size_t length
) {
244 if (!use_direct_io()) {
246 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
248 return IOError("While fadvise NotNeeded offset " + ToString(offset
) +
249 " len " + ToString(length
),
258 * PosixRandomAccessFile
260 #if defined(OS_LINUX)
261 size_t PosixHelper::GetUniqueIdFromFile(int fd
, char* id
, size_t max_size
) {
262 if (max_size
< kMaxVarint64Length
* 3) {
267 int result
= fstat(fd
, &buf
);
273 result
= ioctl(fd
, FS_IOC_GETVERSION
, &version
);
274 TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result
);
278 uint64_t uversion
= (uint64_t)version
;
281 rid
= EncodeVarint64(rid
, buf
.st_dev
);
282 rid
= EncodeVarint64(rid
, buf
.st_ino
);
283 rid
= EncodeVarint64(rid
, uversion
);
285 return static_cast<size_t>(rid
- id
);
289 #if defined(OS_MACOSX) || defined(OS_AIX)
290 size_t PosixHelper::GetUniqueIdFromFile(int fd
, char* id
, size_t max_size
) {
291 if (max_size
< kMaxVarint64Length
* 3) {
296 int result
= fstat(fd
, &buf
);
302 rid
= EncodeVarint64(rid
, buf
.st_dev
);
303 rid
= EncodeVarint64(rid
, buf
.st_ino
);
304 rid
= EncodeVarint64(rid
, buf
.st_gen
);
306 return static_cast<size_t>(rid
- id
);
310 * PosixRandomAccessFile
312 * pread() based random-access
314 PosixRandomAccessFile::PosixRandomAccessFile(const std::string
& fname
, int fd
,
315 const EnvOptions
& options
)
318 use_direct_io_(options
.use_direct_reads
),
319 logical_sector_size_(GetLogicalBufferSize(fd_
)) {
320 assert(!options
.use_direct_reads
|| !options
.use_mmap_reads
);
321 assert(!options
.use_mmap_reads
|| sizeof(void*) < 8);
324 PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_
); }
326 Status
PosixRandomAccessFile::Read(uint64_t offset
, size_t n
, Slice
* result
,
327 char* scratch
) const {
328 if (use_direct_io()) {
329 assert(IsSectorAligned(offset
, GetRequiredBufferAlignment()));
330 assert(IsSectorAligned(n
, GetRequiredBufferAlignment()));
331 assert(IsSectorAligned(scratch
, GetRequiredBufferAlignment()));
338 r
= pread(fd_
, ptr
, left
, static_cast<off_t
>(offset
));
340 if (r
== -1 && errno
== EINTR
) {
348 if (use_direct_io() &&
349 r
% static_cast<ssize_t
>(GetRequiredBufferAlignment()) != 0) {
350 // Bytes reads don't fill sectors. Should only happen at the end
356 // An error: return a non-ok status
358 "While pread offset " + ToString(offset
) + " len " + ToString(n
),
361 *result
= Slice(scratch
, (r
< 0) ? 0 : n
- left
);
365 Status
PosixRandomAccessFile::Prefetch(uint64_t offset
, size_t n
) {
367 if (!use_direct_io()) {
370 r
= readahead(fd_
, offset
, n
);
374 advice
.ra_offset
= static_cast<off_t
>(offset
);
375 advice
.ra_count
= static_cast<int>(n
);
376 r
= fcntl(fd_
, F_RDADVISE
, &advice
);
379 s
= IOError("While prefetching offset " + ToString(offset
) + " len " +
387 #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
388 size_t PosixRandomAccessFile::GetUniqueId(char* id
, size_t max_size
) const {
389 return PosixHelper::GetUniqueIdFromFile(fd_
, id
, max_size
);
393 void PosixRandomAccessFile::Hint(AccessPattern pattern
) {
394 if (use_direct_io()) {
399 Fadvise(fd_
, 0, 0, POSIX_FADV_NORMAL
);
402 Fadvise(fd_
, 0, 0, POSIX_FADV_RANDOM
);
405 Fadvise(fd_
, 0, 0, POSIX_FADV_SEQUENTIAL
);
408 Fadvise(fd_
, 0, 0, POSIX_FADV_WILLNEED
);
411 Fadvise(fd_
, 0, 0, POSIX_FADV_DONTNEED
);
419 Status
PosixRandomAccessFile::InvalidateCache(size_t offset
, size_t length
) {
420 if (use_direct_io()) {
429 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
433 return IOError("While fadvise NotNeeded offset " + ToString(offset
) +
434 " len " + ToString(length
),
440 * PosixMmapReadableFile
442 * mmap() based random-access
444 // base[0,length-1] contains the mmapped contents of the file.
445 PosixMmapReadableFile::PosixMmapReadableFile(const int fd
,
446 const std::string
& fname
,
447 void* base
, size_t length
,
448 const EnvOptions
& options
)
449 : fd_(fd
), filename_(fname
), mmapped_region_(base
), length_(length
) {
453 fd_
= fd_
+ 0; // suppress the warning for used variables
454 assert(options
.use_mmap_reads
);
455 assert(!options
.use_direct_reads
);
458 PosixMmapReadableFile::~PosixMmapReadableFile() {
459 int ret
= munmap(mmapped_region_
, length_
);
461 fprintf(stdout
, "failed to munmap %p length %" ROCKSDB_PRIszt
" \n",
462 mmapped_region_
, length_
);
467 Status
PosixMmapReadableFile::Read(uint64_t offset
, size_t n
, Slice
* result
,
468 char* /*scratch*/) const {
470 if (offset
> length_
) {
472 return IOError("While mmap read offset " + ToString(offset
) +
473 " larger than file length " + ToString(length_
),
475 } else if (offset
+ n
> length_
) {
476 n
= static_cast<size_t>(length_
- offset
);
478 *result
= Slice(reinterpret_cast<char*>(mmapped_region_
) + offset
, n
);
482 Status
PosixMmapReadableFile::InvalidateCache(size_t offset
, size_t length
) {
489 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
493 return IOError("While fadvise not needed. Offset " + ToString(offset
) +
494 " len" + ToString(length
),
502 * We preallocate up to an extra megabyte and use memcpy to append new
503 * data to the file. This is safe since we either properly close the
504 * file before reading from it, or for log files, the reading code
505 * knows enough to skip zero suffixes.
507 Status
PosixMmapFile::UnmapCurrentRegion() {
508 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds
);
509 if (base_
!= nullptr) {
510 int munmap_status
= munmap(base_
, limit_
- base_
);
511 if (munmap_status
!= 0) {
512 return IOError("While munmap", filename_
, munmap_status
);
514 file_offset_
+= limit_
- base_
;
517 last_sync_
= nullptr;
520 // Increase the amount we map the next time, but capped at 1MB
521 if (map_size_
< (1 << 20)) {
528 Status
PosixMmapFile::MapNewRegion() {
529 #ifdef ROCKSDB_FALLOCATE_PRESENT
530 assert(base_
== nullptr);
531 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds
);
532 // we can't fallocate with FALLOC_FL_KEEP_SIZE here
533 if (allow_fallocate_
) {
534 IOSTATS_TIMER_GUARD(allocate_nanos
);
535 int alloc_status
= fallocate(fd_
, 0, file_offset_
, map_size_
);
536 if (alloc_status
!= 0) {
537 // fallback to posix_fallocate
538 alloc_status
= posix_fallocate(fd_
, file_offset_
, map_size_
);
540 if (alloc_status
!= 0) {
541 return Status::IOError("Error allocating space to file : " + filename_
+
542 "Error : " + strerror(alloc_status
));
546 TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds
);
547 void* ptr
= mmap(nullptr, map_size_
, PROT_READ
| PROT_WRITE
, MAP_SHARED
, fd_
,
549 if (ptr
== MAP_FAILED
) {
550 return Status::IOError("MMap failed on " + filename_
);
552 TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds
);
554 base_
= reinterpret_cast<char*>(ptr
);
555 limit_
= base_
+ map_size_
;
560 return Status::NotSupported("This platform doesn't support fallocate()");
564 Status
PosixMmapFile::Msync() {
565 if (dst_
== last_sync_
) {
568 // Find the beginnings of the pages that contain the first and last
569 // bytes to be synced.
570 size_t p1
= TruncateToPageBoundary(last_sync_
- base_
);
571 size_t p2
= TruncateToPageBoundary(dst_
- base_
- 1);
573 TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds
);
574 if (msync(base_
+ p1
, p2
- p1
+ page_size_
, MS_SYNC
) < 0) {
575 return IOError("While msync", filename_
, errno
);
580 PosixMmapFile::PosixMmapFile(const std::string
& fname
, int fd
, size_t page_size
,
581 const EnvOptions
& options
)
584 page_size_(page_size
),
585 map_size_(Roundup(65536, page_size
)),
591 #ifdef ROCKSDB_FALLOCATE_PRESENT
592 allow_fallocate_
= options
.allow_fallocate
;
593 fallocate_with_keep_size_
= options
.fallocate_with_keep_size
;
597 assert((page_size
& (page_size
- 1)) == 0);
598 assert(options
.use_mmap_writes
);
599 assert(!options
.use_direct_writes
);
602 PosixMmapFile::~PosixMmapFile() {
604 PosixMmapFile::Close();
608 Status
PosixMmapFile::Append(const Slice
& data
) {
609 const char* src
= data
.data();
610 size_t left
= data
.size();
612 assert(base_
<= dst_
);
613 assert(dst_
<= limit_
);
614 size_t avail
= limit_
- dst_
;
616 Status s
= UnmapCurrentRegion();
624 TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds
);
627 size_t n
= (left
<= avail
) ? left
: avail
;
629 memcpy(dst_
, src
, n
);
637 Status
PosixMmapFile::Close() {
639 size_t unused
= limit_
- dst_
;
641 s
= UnmapCurrentRegion();
643 s
= IOError("While closing mmapped file", filename_
, errno
);
644 } else if (unused
> 0) {
645 // Trim the extra space at the end of the file
646 if (ftruncate(fd_
, file_offset_
- unused
) < 0) {
647 s
= IOError("While ftruncating mmaped file", filename_
, errno
);
651 if (close(fd_
) < 0) {
653 s
= IOError("While closing mmapped file", filename_
, errno
);
663 Status
PosixMmapFile::Flush() { return Status::OK(); }
665 Status
PosixMmapFile::Sync() {
666 if (fdatasync(fd_
) < 0) {
667 return IOError("While fdatasync mmapped file", filename_
, errno
);
674 * Flush data as well as metadata to stable storage.
676 Status
PosixMmapFile::Fsync() {
677 if (fsync(fd_
) < 0) {
678 return IOError("While fsync mmaped file", filename_
, errno
);
685 * Get the size of valid data in the file. This will not match the
686 * size that is returned from the filesystem because we use mmap
687 * to extend file by map_size every time.
689 uint64_t PosixMmapFile::GetFileSize() {
690 size_t used
= dst_
- base_
;
691 return file_offset_
+ used
;
694 Status
PosixMmapFile::InvalidateCache(size_t offset
, size_t length
) {
701 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
705 return IOError("While fadvise NotNeeded mmapped file", filename_
, errno
);
709 #ifdef ROCKSDB_FALLOCATE_PRESENT
710 Status
PosixMmapFile::Allocate(uint64_t offset
, uint64_t len
) {
711 assert(offset
<= std::numeric_limits
<off_t
>::max());
712 assert(len
<= std::numeric_limits
<off_t
>::max());
713 TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds
);
714 int alloc_status
= 0;
715 if (allow_fallocate_
) {
716 alloc_status
= fallocate(
717 fd_
, fallocate_with_keep_size_
? FALLOC_FL_KEEP_SIZE
: 0,
718 static_cast<off_t
>(offset
), static_cast<off_t
>(len
));
720 if (alloc_status
== 0) {
724 "While fallocate offset " + ToString(offset
) + " len " + ToString(len
),
733 * Use posix write to write data to a file.
735 PosixWritableFile::PosixWritableFile(const std::string
& fname
, int fd
,
736 const EnvOptions
& options
)
738 use_direct_io_(options
.use_direct_writes
),
741 logical_sector_size_(GetLogicalBufferSize(fd_
)) {
742 #ifdef ROCKSDB_FALLOCATE_PRESENT
743 allow_fallocate_
= options
.allow_fallocate
;
744 fallocate_with_keep_size_
= options
.fallocate_with_keep_size
;
746 assert(!options
.use_mmap_writes
);
749 PosixWritableFile::~PosixWritableFile() {
751 PosixWritableFile::Close();
755 Status
PosixWritableFile::Append(const Slice
& data
) {
756 if (use_direct_io()) {
757 assert(IsSectorAligned(data
.size(), GetRequiredBufferAlignment()));
758 assert(IsSectorAligned(data
.data(), GetRequiredBufferAlignment()));
760 const char* src
= data
.data();
761 size_t left
= data
.size();
763 ssize_t done
= write(fd_
, src
, left
);
765 if (errno
== EINTR
) {
768 return IOError("While appending to file", filename_
, errno
);
773 filesize_
+= data
.size();
777 Status
PosixWritableFile::PositionedAppend(const Slice
& data
, uint64_t offset
) {
778 if (use_direct_io()) {
779 assert(IsSectorAligned(offset
, GetRequiredBufferAlignment()));
780 assert(IsSectorAligned(data
.size(), GetRequiredBufferAlignment()));
781 assert(IsSectorAligned(data
.data(), GetRequiredBufferAlignment()));
783 assert(offset
<= std::numeric_limits
<off_t
>::max());
784 const char* src
= data
.data();
785 size_t left
= data
.size();
787 ssize_t done
= pwrite(fd_
, src
, left
, static_cast<off_t
>(offset
));
789 if (errno
== EINTR
) {
792 return IOError("While pwrite to file at offset " + ToString(offset
),
803 Status
PosixWritableFile::Truncate(uint64_t size
) {
805 int r
= ftruncate(fd_
, size
);
807 s
= IOError("While ftruncate file to size " + ToString(size
), filename_
,
815 Status
PosixWritableFile::Close() {
819 size_t last_allocated_block
;
820 GetPreallocationStatus(&block_size
, &last_allocated_block
);
821 if (last_allocated_block
> 0) {
822 // trim the extra space preallocated at the end of the file
823 // NOTE(ljin): we probably don't want to surface failure as an IOError,
824 // but it will be nice to log these errors.
825 int dummy
__attribute__((__unused__
));
826 dummy
= ftruncate(fd_
, filesize_
);
827 #if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE) && \
829 // in some file systems, ftruncate only trims trailing space if the
830 // new file size is smaller than the current size. Calling fallocate
831 // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
832 // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
834 // XFS (since Linux 2.6.38)
835 // ext4 (since Linux 3.0)
836 // Btrfs (since Linux 3.7)
837 // tmpfs (since Linux 3.5)
838 // We ignore error since failure of this operation does not affect
840 // TRAVIS - this code does not work on TRAVIS filesystems.
841 // the FALLOC_FL_KEEP_SIZE option is expected to not change the size
842 // of the file, but it does. Simple strace report will show that.
843 // While we work with Travis-CI team to figure out if this is a
844 // quirk of Docker/AUFS, we will comment this out.
845 struct stat file_stats
;
846 int result
= fstat(fd_
, &file_stats
);
847 // After ftruncate, we check whether ftruncate has the correct behavior.
848 // If not, we should hack it with FALLOC_FL_PUNCH_HOLE
850 (file_stats
.st_size
+ file_stats
.st_blksize
- 1) /
851 file_stats
.st_blksize
!=
852 file_stats
.st_blocks
/ (file_stats
.st_blksize
/ 512)) {
853 IOSTATS_TIMER_GUARD(allocate_nanos
);
854 if (allow_fallocate_
) {
855 fallocate(fd_
, FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
, filesize_
,
856 block_size
* last_allocated_block
- filesize_
);
862 if (close(fd_
) < 0) {
863 s
= IOError("While closing file after writing", filename_
, errno
);
869 // write out the cached data to the OS cache
870 Status
PosixWritableFile::Flush() { return Status::OK(); }
872 Status
PosixWritableFile::Sync() {
873 if (fdatasync(fd_
) < 0) {
874 return IOError("While fdatasync", filename_
, errno
);
879 Status
PosixWritableFile::Fsync() {
880 if (fsync(fd_
) < 0) {
881 return IOError("While fsync", filename_
, errno
);
886 bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
888 uint64_t PosixWritableFile::GetFileSize() { return filesize_
; }
890 void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint
) {
892 // Suppress Valgrind "Unimplemented functionality" error.
893 #ifndef ROCKSDB_VALGRIND_RUN
894 if (hint
== write_hint_
) {
897 if (fcntl(fd_
, F_SET_RW_HINT
, &hint
) == 0) {
902 #endif // ROCKSDB_VALGRIND_RUN
908 Status
PosixWritableFile::InvalidateCache(size_t offset
, size_t length
) {
909 if (use_direct_io()) {
918 int ret
= Fadvise(fd_
, offset
, length
, POSIX_FADV_DONTNEED
);
922 return IOError("While fadvise NotNeeded", filename_
, errno
);
926 #ifdef ROCKSDB_FALLOCATE_PRESENT
927 Status
PosixWritableFile::Allocate(uint64_t offset
, uint64_t len
) {
928 assert(offset
<= std::numeric_limits
<off_t
>::max());
929 assert(len
<= std::numeric_limits
<off_t
>::max());
930 TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds
);
931 IOSTATS_TIMER_GUARD(allocate_nanos
);
932 int alloc_status
= 0;
933 if (allow_fallocate_
) {
934 alloc_status
= fallocate(
935 fd_
, fallocate_with_keep_size_
? FALLOC_FL_KEEP_SIZE
: 0,
936 static_cast<off_t
>(offset
), static_cast<off_t
>(len
));
938 if (alloc_status
== 0) {
942 "While fallocate offset " + ToString(offset
) + " len " + ToString(len
),
948 #ifdef ROCKSDB_RANGESYNC_PRESENT
949 Status
PosixWritableFile::RangeSync(uint64_t offset
, uint64_t nbytes
) {
950 assert(offset
<= std::numeric_limits
<off_t
>::max());
951 assert(nbytes
<= std::numeric_limits
<off_t
>::max());
952 if (sync_file_range(fd_
, static_cast<off_t
>(offset
),
953 static_cast<off_t
>(nbytes
), SYNC_FILE_RANGE_WRITE
) == 0) {
956 return IOError("While sync_file_range offset " + ToString(offset
) +
957 " bytes " + ToString(nbytes
),
964 size_t PosixWritableFile::GetUniqueId(char* id
, size_t max_size
) const {
965 return PosixHelper::GetUniqueIdFromFile(fd_
, id
, max_size
);
973 PosixRandomRWFile::PosixRandomRWFile(const std::string
& fname
, int fd
,
974 const EnvOptions
& /*options*/)
975 : filename_(fname
), fd_(fd
) {}
977 PosixRandomRWFile::~PosixRandomRWFile() {
983 Status
PosixRandomRWFile::Write(uint64_t offset
, const Slice
& data
) {
984 const char* src
= data
.data();
985 size_t left
= data
.size();
987 ssize_t done
= pwrite(fd_
, src
, left
, offset
);
989 // error while writing to file
990 if (errno
== EINTR
) {
991 // write was interrupted, try again.
995 "While write random read/write file at offset " + ToString(offset
),
999 // Wrote `done` bytes
1005 return Status::OK();
1008 Status
PosixRandomRWFile::Read(uint64_t offset
, size_t n
, Slice
* result
,
1009 char* scratch
) const {
1011 char* ptr
= scratch
;
1013 ssize_t done
= pread(fd_
, ptr
, left
, offset
);
1015 // error while reading from file
1016 if (errno
== EINTR
) {
1017 // read was interrupted, try again.
1020 return IOError("While reading random read/write file offset " +
1021 ToString(offset
) + " len " + ToString(n
),
1023 } else if (done
== 0) {
1024 // Nothing more to read
1028 // Read `done` bytes
1034 *result
= Slice(scratch
, n
- left
);
1035 return Status::OK();
1038 Status
PosixRandomRWFile::Flush() { return Status::OK(); }
1040 Status
PosixRandomRWFile::Sync() {
1041 if (fdatasync(fd_
) < 0) {
1042 return IOError("While fdatasync random read/write file", filename_
, errno
);
1044 return Status::OK();
1047 Status
PosixRandomRWFile::Fsync() {
1048 if (fsync(fd_
) < 0) {
1049 return IOError("While fsync random read/write file", filename_
, errno
);
1051 return Status::OK();
1054 Status
PosixRandomRWFile::Close() {
1055 if (close(fd_
) < 0) {
1056 return IOError("While close random read/write file", filename_
, errno
);
1059 return Status::OK();
1062 PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
1063 // TODO should have error handling though not much we can do...
1064 munmap(this->base_
, length_
);
1071 PosixDirectory::~PosixDirectory() { close(fd_
); }
1073 Status
PosixDirectory::Fsync() {
1075 if (fsync(fd_
) == -1) {
1076 return IOError("While fsync", "a directory", errno
);
1079 return Status::OK();
1081 } // namespace rocksdb