1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors
13 #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
24 #include <sys/ioctl.h>
27 #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
28 #include <sys/statfs.h>
29 #include <sys/sysmacros.h>
31 #include <sys/statvfs.h>
33 #include <sys/types.h>
36 // Get nano time includes
37 #if defined(OS_LINUX) || defined(OS_FREEBSD)
38 #elif defined(__MACH__)
39 #include <Availability.h>
40 #include <mach/clock.h>
41 #include <mach/mach.h>
49 #include "env/composite_env_wrapper.h"
50 #include "env/io_posix.h"
51 #include "logging/posix_logger.h"
52 #include "monitoring/iostats_context_imp.h"
53 #include "monitoring/thread_status_updater.h"
54 #include "port/port.h"
55 #include "rocksdb/options.h"
56 #include "rocksdb/slice.h"
57 #include "rocksdb/utilities/object_registry.h"
58 #include "test_util/sync_point.h"
59 #include "util/coding.h"
60 #include "util/compression_context_cache.h"
61 #include "util/random.h"
62 #include "util/string_util.h"
63 #include "util/thread_local.h"
64 #include "util/threadpool_imp.h"
66 #if !defined(TMPFS_MAGIC)
67 #define TMPFS_MAGIC 0x01021994
69 #if !defined(XFS_SUPER_MAGIC)
70 #define XFS_SUPER_MAGIC 0x58465342
72 #if !defined(EXT4_SUPER_MAGIC)
73 #define EXT4_SUPER_MAGIC 0xEF53
76 namespace ROCKSDB_NAMESPACE
{
// Selects the permission bits passed to open() for DB files: 0644 when
// non-owner access is allowed, 0600 otherwise.
80 inline mode_t
GetDBFileMode(bool allow_non_owner_access
) {
81 return allow_non_owner_access
? 0644 : 0600;
// Thread-id callback: forwards to Env::Default()->GetThreadID(). NewLogger()
// passes the address of this function to the PosixLogger it constructs.
84 static uint64_t gettid() {
85 return Env::Default()->GetThreadID();
88 // list of pathnames that are locked
89 // Only used for error message.
// Per-file record stored as the mapped value of locked_files. LockFile()
// fills it in before inserting, and reads the existing entry back to build
// the error message when the same path is already locked by this process.
90 struct LockHoldingInfo
{
// Thread ID (Env::Default()->GetThreadID()) of the LockFile() caller.
92 uint64_t acquiring_thread
;
94 static std::map
<std::string
, LockHoldingInfo
> locked_files
;
95 static port::Mutex mutex_locked_files
;
// Applies (lock == true, F_WRLCK) or clears (lock == false, F_UNLCK) an
// fcntl() record lock on `fd` using the non-blocking F_SETLK command.
// Callers in this file treat a result of -1 as failure.
97 static int LockOrUnlock(int fd
, bool lock
) {
// Zero the request first so every field not assigned below is 0.
100 memset(&f
, 0, sizeof(f
));
101 f
.l_type
= (lock
? F_WRLCK
: F_UNLCK
);
102 f
.l_whence
= SEEK_SET
;
104 f
.l_len
= 0; // Lock/unlock entire file
105 int value
= fcntl(fd
, F_SETLK
, &f
);
// FileLock implementation created by LockFile() and consumed by UnlockFile().
110 class PosixFileLock
: public FileLock
{
// Path that was locked; UnlockFile() uses it to erase the locked_files entry.
113 std::string filename
;
// Computes the flags value that callers in this file pass on to open().
// A null `options` is handled the same as options->set_fd_cloexec being set.
116 int cloexec_flags(int flags
, const EnvOptions
* options
) {
117 // If the system supports opening the file with cloexec enabled,
118 // do so, as this avoids a race condition if a db is opened around
119 // the same time that a child process is forked
121 if (options
== nullptr || options
->set_fd_cloexec
) {
130 class PosixFileSystem
: public FileSystem
{
134 const char* Name() const override
{ return "Posix File System"; }
136 ~PosixFileSystem() override
{}
// Sets FD_CLOEXEC on `fd` with an F_GETFD / F_SETFD round trip when `options`
// is null or options->set_fd_cloexec is true. Neither fcntl() result is
// checked.
// NOTE(review): the `fd > 0` guard also skips descriptor 0, which is a valid
// descriptor -- confirm that excluding it is intentional.
138 void SetFD_CLOEXEC(int fd
, const EnvOptions
* options
) {
139 if ((options
== nullptr || options
->set_fd_cloexec
) && fd
> 0) {
// Read the current descriptor flags and OR in FD_CLOEXEC so other flags
// already set on the descriptor are preserved.
140 fcntl(fd
, F_SETFD
, fcntl(fd
, F_GETFD
) | FD_CLOEXEC
);
144 IOStatus
NewSequentialFile(const std::string
& fname
,
145 const FileOptions
& options
,
146 std::unique_ptr
<FSSequentialFile
>* result
,
147 IODebugContext
* /*dbg*/) override
{
150 int flags
= cloexec_flags(O_RDONLY
, &options
);
151 FILE* file
= nullptr;
153 if (options
.use_direct_reads
&& !options
.use_mmap_reads
) {
155 return IOStatus::IOError(fname
,
156 "Direct I/O not supported in RocksDB lite");
157 #endif // !ROCKSDB_LITE
158 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
160 TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags
);
165 IOSTATS_TIMER_GUARD(open_nanos
);
166 fd
= open(fname
.c_str(), flags
, GetDBFileMode(allow_non_owner_access_
));
167 } while (fd
< 0 && errno
== EINTR
);
169 return IOError("While opening a file for sequentially reading", fname
,
173 SetFD_CLOEXEC(fd
, &options
);
175 if (options
.use_direct_reads
&& !options
.use_mmap_reads
) {
177 if (fcntl(fd
, F_NOCACHE
, 1) == -1) {
179 return IOError("While fcntl NoCache", fname
, errno
);
184 IOSTATS_TIMER_GUARD(open_nanos
);
185 file
= fdopen(fd
, "r");
186 } while (file
== nullptr && errno
== EINTR
);
187 if (file
== nullptr) {
189 return IOError("While opening file for sequentially read", fname
,
193 result
->reset(new PosixSequentialFile(
194 fname
, file
, fd
, GetLogicalBlockSizeForReadIfNeeded(options
, fname
, fd
),
196 return IOStatus::OK();
199 IOStatus
NewRandomAccessFile(const std::string
& fname
,
200 const FileOptions
& options
,
201 std::unique_ptr
<FSRandomAccessFile
>* result
,
202 IODebugContext
* /*dbg*/) override
{
204 IOStatus s
= IOStatus::OK();
206 int flags
= cloexec_flags(O_RDONLY
, &options
);
208 if (options
.use_direct_reads
&& !options
.use_mmap_reads
) {
210 return IOStatus::IOError(fname
,
211 "Direct I/O not supported in RocksDB lite");
212 #endif // !ROCKSDB_LITE
213 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
215 TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags
);
220 IOSTATS_TIMER_GUARD(open_nanos
);
221 fd
= open(fname
.c_str(), flags
, GetDBFileMode(allow_non_owner_access_
));
222 } while (fd
< 0 && errno
== EINTR
);
224 s
= IOError("While open a file for random read", fname
, errno
);
227 SetFD_CLOEXEC(fd
, &options
);
229 if (options
.use_mmap_reads
&& sizeof(void*) >= 8) {
230 // Use of mmap for random reads has been removed because it
231 // kills performance when storage is fast.
232 // Use mmap when virtual address-space is plentiful.
235 s
= GetFileSize(fname
, opts
, &size
, nullptr);
237 void* base
= mmap(nullptr, size
, PROT_READ
, MAP_SHARED
, fd
, 0);
238 if (base
!= MAP_FAILED
) {
240 new PosixMmapReadableFile(fd
, fname
, base
, size
, options
));
242 s
= IOError("while mmap file for read", fname
, errno
);
249 if (options
.use_direct_reads
&& !options
.use_mmap_reads
) {
251 if (fcntl(fd
, F_NOCACHE
, 1) == -1) {
253 return IOError("while fcntl NoCache", fname
, errno
);
257 result
->reset(new PosixRandomAccessFile(
258 fname
, fd
, GetLogicalBlockSizeForReadIfNeeded(options
, fname
, fd
),
260 #if defined(ROCKSDB_IOURING_PRESENT)
262 thread_local_io_urings_
.get()
269 virtual IOStatus
OpenWritableFile(const std::string
& fname
,
270 const FileOptions
& options
,
272 std::unique_ptr
<FSWritableFile
>* result
,
273 IODebugContext
* /*dbg*/) {
277 int flags
= (reopen
) ? (O_CREAT
| O_APPEND
) : (O_CREAT
| O_TRUNC
);
278 // Direct IO mode with O_DIRECT flag or F_NOCACHE (MAC OSX)
279 if (options
.use_direct_writes
&& !options
.use_mmap_writes
) {
280 // Note: we should avoid O_APPEND here due to the following bug:
281 // POSIX requires that opening a file with the O_APPEND flag should
282 // have no effect on the location at which pwrite() writes data.
283 // However, on Linux, if a file is opened with O_APPEND, pwrite()
284 // appends data to the end of the file, regardless of the value of
286 // More info here: https://linux.die.net/man/2/pwrite
288 return IOStatus::IOError(fname
,
289 "Direct I/O not supported in RocksDB lite");
290 #endif // ROCKSDB_LITE
292 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
295 TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags
);
296 } else if (options
.use_mmap_writes
) {
303 flags
= cloexec_flags(flags
, &options
);
306 IOSTATS_TIMER_GUARD(open_nanos
);
307 fd
= open(fname
.c_str(), flags
, GetDBFileMode(allow_non_owner_access_
));
308 } while (fd
< 0 && errno
== EINTR
);
311 s
= IOError("While open a file for appending", fname
, errno
);
314 SetFD_CLOEXEC(fd
, &options
);
316 if (options
.use_mmap_writes
) {
317 if (!checkedDiskForMmap_
) {
318 // this will be executed once in the program's lifetime.
319 // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
320 if (!SupportsFastAllocate(fname
)) {
321 forceMmapOff_
= true;
323 checkedDiskForMmap_
= true;
326 if (options
.use_mmap_writes
&& !forceMmapOff_
) {
327 result
->reset(new PosixMmapFile(fname
, fd
, page_size_
, options
));
328 } else if (options
.use_direct_writes
&& !options
.use_mmap_writes
) {
330 if (fcntl(fd
, F_NOCACHE
, 1) == -1) {
332 s
= IOError("While fcntl NoCache an opened file for appending", fname
,
336 #elif defined(OS_SOLARIS)
337 if (directio(fd
, DIRECTIO_ON
) == -1) {
338 if (errno
!= ENOTTY
) { // ZFS filesystems don't support DIRECTIO_ON
340 s
= IOError("While calling directio()", fname
, errno
);
345 result
->reset(new PosixWritableFile(
346 fname
, fd
, GetLogicalBlockSizeForWriteIfNeeded(options
, fname
, fd
),
349 // disable mmap writes
350 EnvOptions no_mmap_writes_options
= options
;
351 no_mmap_writes_options
.use_mmap_writes
= false;
353 new PosixWritableFile(fname
, fd
,
354 GetLogicalBlockSizeForWriteIfNeeded(
355 no_mmap_writes_options
, fname
, fd
),
356 no_mmap_writes_options
));
// Thin wrapper: forwards every argument to OpenWritableFile(), passing
// `false` for its boolean argument, and returns that call's IOStatus.
361 IOStatus
NewWritableFile(const std::string
& fname
, const FileOptions
& options
,
362 std::unique_ptr
<FSWritableFile
>* result
,
363 IODebugContext
* dbg
) override
{
364 return OpenWritableFile(fname
, options
, false, result
, dbg
);
// Thin wrapper: forwards every argument to OpenWritableFile(), passing
// `true` for its boolean argument, and returns that call's IOStatus.
367 IOStatus
ReopenWritableFile(const std::string
& fname
,
368 const FileOptions
& options
,
369 std::unique_ptr
<FSWritableFile
>* result
,
370 IODebugContext
* dbg
) override
{
371 return OpenWritableFile(fname
, options
, true, result
, dbg
);
374 IOStatus
ReuseWritableFile(const std::string
& fname
,
375 const std::string
& old_fname
,
376 const FileOptions
& options
,
377 std::unique_ptr
<FSWritableFile
>* result
,
378 IODebugContext
* /*dbg*/) override
{
384 // Direct IO mode with O_DIRECT flag or F_NOCACHE (MAC OSX)
385 if (options
.use_direct_writes
&& !options
.use_mmap_writes
) {
387 return IOStatus::IOError(fname
,
388 "Direct I/O not supported in RocksDB lite");
389 #endif // !ROCKSDB_LITE
391 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
394 TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags
);
395 } else if (options
.use_mmap_writes
) {
396 // mmap needs O_RDWR mode
402 flags
= cloexec_flags(flags
, &options
);
405 IOSTATS_TIMER_GUARD(open_nanos
);
406 fd
= open(old_fname
.c_str(), flags
,
407 GetDBFileMode(allow_non_owner_access_
));
408 } while (fd
< 0 && errno
== EINTR
);
410 s
= IOError("while reopen file for write", fname
, errno
);
414 SetFD_CLOEXEC(fd
, &options
);
416 if (rename(old_fname
.c_str(), fname
.c_str()) != 0) {
417 s
= IOError("while rename file to " + fname
, old_fname
, errno
);
422 if (options
.use_mmap_writes
) {
423 if (!checkedDiskForMmap_
) {
424 // this will be executed once in the program's lifetime.
425 // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
426 if (!SupportsFastAllocate(fname
)) {
427 forceMmapOff_
= true;
429 checkedDiskForMmap_
= true;
432 if (options
.use_mmap_writes
&& !forceMmapOff_
) {
433 result
->reset(new PosixMmapFile(fname
, fd
, page_size_
, options
));
434 } else if (options
.use_direct_writes
&& !options
.use_mmap_writes
) {
436 if (fcntl(fd
, F_NOCACHE
, 1) == -1) {
438 s
= IOError("while fcntl NoCache for reopened file for append", fname
,
442 #elif defined(OS_SOLARIS)
443 if (directio(fd
, DIRECTIO_ON
) == -1) {
444 if (errno
!= ENOTTY
) { // ZFS filesystems don't support DIRECTIO_ON
446 s
= IOError("while calling directio()", fname
, errno
);
451 result
->reset(new PosixWritableFile(
452 fname
, fd
, GetLogicalBlockSizeForWriteIfNeeded(options
, fname
, fd
),
455 // disable mmap writes
456 FileOptions no_mmap_writes_options
= options
;
457 no_mmap_writes_options
.use_mmap_writes
= false;
459 new PosixWritableFile(fname
, fd
,
460 GetLogicalBlockSizeForWriteIfNeeded(
461 no_mmap_writes_options
, fname
, fd
),
462 no_mmap_writes_options
));
467 IOStatus
NewRandomRWFile(const std::string
& fname
, const FileOptions
& options
,
468 std::unique_ptr
<FSRandomRWFile
>* result
,
469 IODebugContext
* /*dbg*/) override
{
471 int flags
= cloexec_flags(O_RDWR
, &options
);
474 IOSTATS_TIMER_GUARD(open_nanos
);
476 fd
= open(fname
.c_str(), flags
, GetDBFileMode(allow_non_owner_access_
));
478 // Error while opening the file
479 if (errno
== EINTR
) {
482 return IOError("While open file for random read/write", fname
, errno
);
486 SetFD_CLOEXEC(fd
, &options
);
487 result
->reset(new PosixRandomRWFile(fname
, fd
, options
));
488 return IOStatus::OK();
491 IOStatus
NewMemoryMappedFileBuffer(
492 const std::string
& fname
,
493 std::unique_ptr
<MemoryMappedFileBuffer
>* result
) override
{
496 int flags
= cloexec_flags(O_RDWR
, nullptr);
499 IOSTATS_TIMER_GUARD(open_nanos
);
500 fd
= open(fname
.c_str(), flags
, 0644);
502 // Error while opening the file
503 if (errno
== EINTR
) {
507 IOError("While open file for raw mmap buffer access", fname
, errno
);
514 status
= GetFileSize(fname
, opts
, &size
, nullptr);
516 void* base
= nullptr;
518 base
= mmap(nullptr, static_cast<size_t>(size
), PROT_READ
| PROT_WRITE
,
520 if (base
== MAP_FAILED
) {
521 status
= IOError("while mmap file for read", fname
, errno
);
526 new PosixMemoryMappedFileBuffer(base
, static_cast<size_t>(size
)));
529 // don't need to keep it open after mmap has been called
535 IOStatus
NewDirectory(const std::string
& name
, const IOOptions
& /*opts*/,
536 std::unique_ptr
<FSDirectory
>* result
,
537 IODebugContext
* /*dbg*/) override
{
540 int flags
= cloexec_flags(0, nullptr);
542 IOSTATS_TIMER_GUARD(open_nanos
);
543 fd
= open(name
.c_str(), flags
);
546 return IOError("While open directory", name
, errno
);
548 result
->reset(new PosixDirectory(fd
));
550 return IOStatus::OK();
553 IOStatus
NewLogger(const std::string
& fname
, const IOOptions
& /*opts*/,
554 std::shared_ptr
<Logger
>* result
,
555 IODebugContext
* /*dbg*/) override
{
558 IOSTATS_TIMER_GUARD(open_nanos
);
559 f
= fopen(fname
.c_str(),
561 #ifdef __GLIBC_PREREQ
562 #if __GLIBC_PREREQ(2, 7)
563 "e" // glibc extension to enable O_CLOEXEC
570 return status_to_io_status(
571 IOError("when fopen a file for new logger", fname
, errno
));
574 #ifdef ROCKSDB_FALLOCATE_PRESENT
575 fallocate(fd
, FALLOC_FL_KEEP_SIZE
, 0, 4 * 1024);
577 SetFD_CLOEXEC(fd
, nullptr);
578 result
->reset(new PosixLogger(f
, &gettid
, Env::Default()));
579 return IOStatus::OK();
583 IOStatus
FileExists(const std::string
& fname
, const IOOptions
& /*opts*/,
584 IODebugContext
* /*dbg*/) override
{
585 int result
= access(fname
.c_str(), F_OK
);
588 return IOStatus::OK();
598 return IOStatus::NotFound();
600 assert(err
== EIO
|| err
== ENOMEM
);
601 return IOStatus::IOError("Unexpected error(" + ToString(err
) +
602 ") accessing file `" + fname
+ "' ");
606 IOStatus
GetChildren(const std::string
& dir
, const IOOptions
& /*opts*/,
607 std::vector
<std::string
>* result
,
608 IODebugContext
* /*dbg*/) override
{
610 DIR* d
= opendir(dir
.c_str());
616 return IOStatus::NotFound();
618 return IOError("While opendir", dir
, errno
);
621 struct dirent
* entry
;
622 while ((entry
= readdir(d
)) != nullptr) {
623 result
->push_back(entry
->d_name
);
626 return IOStatus::OK();
629 IOStatus
DeleteFile(const std::string
& fname
, const IOOptions
& /*opts*/,
630 IODebugContext
* /*dbg*/) override
{
632 if (unlink(fname
.c_str()) != 0) {
633 result
= IOError("while unlink() file", fname
, errno
);
// Creates directory `name` with mode 0755. Any mkdir() failure is returned
// as an errno-based IOError; otherwise the result is OK.
638 IOStatus
CreateDir(const std::string
& name
, const IOOptions
& /*opts*/,
639 IODebugContext
* /*dbg*/) override
{
640 if (mkdir(name
.c_str(), 0755) != 0) {
641 return IOError("While mkdir", name
, errno
);
643 return IOStatus::OK();
// Creates directory `name` (mode 0755) unless it already exists. An mkdir()
// failure other than EEXIST is returned as an errno-based IOError; on EEXIST
// the call still fails if DirExists(name) reports that the existing path is
// not a directory.
646 IOStatus
CreateDirIfMissing(const std::string
& name
,
647 const IOOptions
& /*opts*/,
648 IODebugContext
* /*dbg*/) override
{
649 if (mkdir(name
.c_str(), 0755) != 0) {
650 if (errno
!= EEXIST
) {
651 return IOError("While mkdir if missing", name
, errno
);
652 } else if (!DirExists(name
)) { // Check that name is actually a
654 // Message is taken from mkdir
655 return IOStatus::IOError("`" + name
+
656 "' exists but is not a directory");
659 return IOStatus::OK();
// Removes directory `name` with rmdir(). A failure is returned as an
// errno-based IOError; otherwise the result is OK.
662 IOStatus
DeleteDir(const std::string
& name
, const IOOptions
& /*opts*/,
663 IODebugContext
* /*dbg*/) override
{
664 if (rmdir(name
.c_str()) != 0) {
665 return IOError("file rmdir", name
, errno
);
667 return IOStatus::OK();
// Looks up `fname` with stat() and stores the reported st_size in *size.
// If stat() fails, the errno-based IOError is returned instead of OK.
670 IOStatus
GetFileSize(const std::string
& fname
, const IOOptions
& /*opts*/,
671 uint64_t* size
, IODebugContext
* /*dbg*/) override
{
673 if (stat(fname
.c_str(), &sbuf
) != 0) {
675 return IOError("while stat a file for size", fname
, errno
);
677 *size
= sbuf
.st_size
;
679 return IOStatus::OK();
682 IOStatus
GetFileModificationTime(const std::string
& fname
,
683 const IOOptions
& /*opts*/,
684 uint64_t* file_mtime
,
685 IODebugContext
* /*dbg*/) override
{
687 if (stat(fname
.c_str(), &s
) != 0) {
688 return IOError("while stat a file for modification time", fname
, errno
);
690 *file_mtime
= static_cast<uint64_t>(s
.st_mtime
);
691 return IOStatus::OK();
// Renames `src` to `target` with rename(). A failure is returned as an
// errno-based IOError whose context names the target; otherwise OK.
694 IOStatus
RenameFile(const std::string
& src
, const std::string
& target
,
695 const IOOptions
& /*opts*/,
696 IODebugContext
* /*dbg*/) override
{
697 if (rename(src
.c_str(), target
.c_str()) != 0) {
698 return IOError("While renaming a file to " + target
, src
, errno
);
700 return IOStatus::OK();
// Creates `target` as a hard link to `src` with link(). EXDEV is reported as
// NotSupported so callers can tell a cross-filesystem link apart from other
// failures, which are returned as errno-based IOErrors.
703 IOStatus
LinkFile(const std::string
& src
, const std::string
& target
,
704 const IOOptions
& /*opts*/,
705 IODebugContext
* /*dbg*/) override
{
706 if (link(src
.c_str(), target
.c_str()) != 0) {
707 if (errno
== EXDEV
) {
708 return IOStatus::NotSupported("No cross FS links allowed");
710 return IOError("while link file to " + target
, src
, errno
);
712 return IOStatus::OK();
715 IOStatus
NumFileLinks(const std::string
& fname
, const IOOptions
& /*opts*/,
716 uint64_t* count
, IODebugContext
* /*dbg*/) override
{
718 if (stat(fname
.c_str(), &s
) != 0) {
719 return IOError("while stat a file for num file links", fname
, errno
);
721 *count
= static_cast<uint64_t>(s
.st_nlink
);
722 return IOStatus::OK();
725 IOStatus
AreFilesSame(const std::string
& first
, const std::string
& second
,
726 const IOOptions
& /*opts*/, bool* res
,
727 IODebugContext
* /*dbg*/) override
{
728 struct stat statbuf
[2];
729 if (stat(first
.c_str(), &statbuf
[0]) != 0) {
730 return IOError("stat file", first
, errno
);
732 if (stat(second
.c_str(), &statbuf
[1]) != 0) {
733 return IOError("stat file", second
, errno
);
736 if (major(statbuf
[0].st_dev
) != major(statbuf
[1].st_dev
) ||
737 minor(statbuf
[0].st_dev
) != minor(statbuf
[1].st_dev
) ||
738 statbuf
[0].st_ino
!= statbuf
[1].st_ino
) {
743 return IOStatus::OK();
746 IOStatus
LockFile(const std::string
& fname
, const IOOptions
& /*opts*/,
747 FileLock
** lock
, IODebugContext
* /*dbg*/) override
{
751 int64_t current_time
= 0;
752 // Ignore status code as the time is only used for error message.
753 Env::Default()->GetCurrentTime(¤t_time
).PermitUncheckedError();
754 lhi
.acquire_time
= current_time
;
755 lhi
.acquiring_thread
= Env::Default()->GetThreadID();
757 mutex_locked_files
.Lock();
758 // If it already exists in the locked_files set, then it is already locked,
759 // and fail this lock attempt. Otherwise, insert it into locked_files.
760 // This check is needed because fcntl() does not detect lock conflict
761 // if the fcntl is issued by the same thread that earlier acquired
763 // We must do this check *before* opening the file:
764 // Otherwise, we will open a new file descriptor. Locks are associated with
765 // a process, not a file descriptor and when *any* file descriptor is
766 // closed, all locks the process holds for that *file* are released
767 const auto it_success
= locked_files
.insert({fname
, lhi
});
768 if (it_success
.second
== false) {
769 mutex_locked_files
.Unlock();
771 LockHoldingInfo
& prev_info
= it_success
.first
->second
;
772 // Note that the thread ID printed is the same one as the one in
773 // posix logger, but posix logger prints it in hex format.
774 return IOError("lock hold by current process, acquire time " +
775 ToString(prev_info
.acquire_time
) +
776 " acquiring thread " +
777 ToString(prev_info
.acquiring_thread
),
781 IOStatus result
= IOStatus::OK();
783 int flags
= cloexec_flags(O_RDWR
| O_CREAT
, nullptr);
786 IOSTATS_TIMER_GUARD(open_nanos
);
787 fd
= open(fname
.c_str(), flags
, 0644);
790 result
= IOError("while open a file for lock", fname
, errno
);
791 } else if (LockOrUnlock(fd
, true) == -1) {
792 // if there is an error in locking, then remove the pathname from
794 locked_files
.erase(fname
);
795 result
= IOError("While lock file", fname
, errno
);
798 SetFD_CLOEXEC(fd
, nullptr);
799 PosixFileLock
* my_lock
= new PosixFileLock
;
801 my_lock
->filename
= fname
;
805 mutex_locked_files
.Unlock();
809 IOStatus
UnlockFile(FileLock
* lock
, const IOOptions
& /*opts*/,
810 IODebugContext
* /*dbg*/) override
{
811 PosixFileLock
* my_lock
= reinterpret_cast<PosixFileLock
*>(lock
);
813 mutex_locked_files
.Lock();
814 // If we are unlocking, then verify that we had locked it earlier,
815 // it should already exist in locked_files. Remove it from locked_files.
816 if (locked_files
.erase(my_lock
->filename
) != 1) {
818 result
= IOError("unlock", my_lock
->filename
, errno
);
819 } else if (LockOrUnlock(my_lock
->fd_
, false) == -1) {
820 result
= IOError("unlock", my_lock
->filename
, errno
);
824 mutex_locked_files
.Unlock();
828 IOStatus
GetAbsolutePath(const std::string
& db_path
,
829 const IOOptions
& /*opts*/, std::string
* output_path
,
830 IODebugContext
* /*dbg*/) override
{
831 if (!db_path
.empty() && db_path
[0] == '/') {
832 *output_path
= db_path
;
833 return IOStatus::OK();
837 char* ret
= getcwd(the_path
, 256);
838 if (ret
== nullptr) {
839 return IOStatus::IOError(strerror(errno
));
843 return IOStatus::OK();
846 IOStatus
GetTestDirectory(const IOOptions
& /*opts*/, std::string
* result
,
847 IODebugContext
* /*dbg*/) override
{
848 const char* env
= getenv("TEST_TMPDIR");
849 if (env
&& env
[0] != '\0') {
853 snprintf(buf
, sizeof(buf
), "/tmp/rocksdbtest-%d", int(geteuid()));
856 // Directory may already exist
859 return CreateDirIfMissing(*result
, opts
, nullptr);
861 return IOStatus::OK();
// Reports free space for the filesystem containing `fname`, computed as
// f_bsize * f_bfree from statvfs(). A statvfs() failure is returned as an
// errno-based IOError.
// NOTE(review): POSIX expresses f_bfree in units of f_frsize, and f_bfree
// includes blocks that may be reserved for privileged users (f_bavail is the
// unprivileged figure) -- confirm that f_bsize * f_bfree is the intended value.
864 IOStatus
GetFreeSpace(const std::string
& fname
, const IOOptions
& /*opts*/,
865 uint64_t* free_space
,
866 IODebugContext
* /*dbg*/) override
{
869 if (statvfs(fname
.c_str(), &sbuf
) < 0) {
870 return IOError("While doing statvfs", fname
, errno
);
// Widen to 64 bits before multiplying.
873 *free_space
= ((uint64_t)sbuf
.f_bsize
* sbuf
.f_bfree
);
874 return IOStatus::OK();
877 IOStatus
IsDirectory(const std::string
& path
, const IOOptions
& /*opts*/,
878 bool* is_dir
, IODebugContext
* /*dbg*/) override
{
881 int flags
= cloexec_flags(O_RDONLY
, nullptr);
883 IOSTATS_TIMER_GUARD(open_nanos
);
884 fd
= open(path
.c_str(), flags
);
887 return IOError("While open for IsDirectory()", path
, errno
);
891 if (fstat(fd
, &sbuf
) < 0) {
892 io_s
= IOError("While doing stat for IsDirectory()", path
, errno
);
895 if (io_s
.ok() && nullptr != is_dir
) {
896 *is_dir
= S_ISDIR(sbuf
.st_mode
);
// Derives log-write FileOptions from a copy of `file_options`: mmap and
// direct writes are turned off, bytes_per_sync is taken from
// db_options.wal_bytes_per_sync, fallocate_with_keep_size is forced on, and
// writable_file_max_buffer_size is taken from `db_options`.
901 FileOptions
OptimizeForLogWrite(const FileOptions
& file_options
,
902 const DBOptions
& db_options
) const override
{
903 FileOptions optimized
= file_options
;
904 optimized
.use_mmap_writes
= false;
905 optimized
.use_direct_writes
= false;
906 optimized
.bytes_per_sync
= db_options
.wal_bytes_per_sync
;
907 // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
908 // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
909 // test and make this false
910 optimized
.fallocate_with_keep_size
= true;
911 optimized
.writable_file_max_buffer_size
=
912 db_options
.writable_file_max_buffer_size
;
// Derives manifest-write FileOptions from a copy of `file_options`: mmap and
// direct writes are turned off and fallocate_with_keep_size is forced on.
// All other fields keep the values copied from `file_options`.
916 FileOptions
OptimizeForManifestWrite(
917 const FileOptions
& file_options
) const override
{
918 FileOptions optimized
= file_options
;
919 optimized
.use_mmap_writes
= false;
920 optimized
.use_direct_writes
= false;
921 optimized
.fallocate_with_keep_size
= true;
// Forwards `paths` to logical_block_size_cache_.RefAndCacheLogicalBlockSize()
// and returns the Status it produces.
925 Status
RegisterDbPaths(const std::vector
<std::string
>& paths
) override
{
926 return logical_block_size_cache_
.RefAndCacheLogicalBlockSize(paths
);
// Counterpart of RegisterDbPaths(): forwards `paths` to
// logical_block_size_cache_.UnrefAndTryRemoveCachedLogicalBlockSize().
928 Status
UnregisterDbPaths(const std::vector
<std::string
>& paths
) override
{
929 logical_block_size_cache_
.UnrefAndTryRemoveCachedLogicalBlockSize(paths
);
934 bool checkedDiskForMmap_
;
935 bool forceMmapOff_
; // do we override Env options?
937 // Returns true iff the named directory exists and is a directory.
// Any stat() failure is reported as false. CreateDirIfMissing() calls this
// after mkdir() fails with EEXIST to check what already occupies the path.
938 virtual bool DirExists(const std::string
& dname
) {
940 if (stat(dname
.c_str(), &statbuf
) == 0) {
941 return S_ISDIR(statbuf
.st_mode
);
943 return false; // stat() failed return false
946 bool SupportsFastAllocate(const std::string
& path
) {
947 #ifdef ROCKSDB_FALLOCATE_PRESENT
949 if (statfs(path
.c_str(), &s
)) {
953 case EXT4_SUPER_MAGIC
:
955 case XFS_SUPER_MAGIC
:
968 #if defined(ROCKSDB_IOURING_PRESENT)
970 std::unique_ptr
<ThreadLocalPtr
> thread_local_io_urings_
;
975 // If true, allow non owner read access for db files. Otherwise, non-owner
976 // has no access to db files.
977 bool allow_non_owner_access_
;
980 static LogicalBlockSizeCache logical_block_size_cache_
;
982 static size_t GetLogicalBlockSize(const std::string
& fname
, int fd
);
983 // In non-direct IO mode, this directly returns kDefaultPageSize.
984 // Otherwise call GetLogicalBlockSize.
985 static size_t GetLogicalBlockSizeForReadIfNeeded(const EnvOptions
& options
,
986 const std::string
& fname
,
988 static size_t GetLogicalBlockSizeForWriteIfNeeded(const EnvOptions
& options
,
989 const std::string
& fname
,
994 LogicalBlockSizeCache
PosixFileSystem::logical_block_size_cache_
;
997 size_t PosixFileSystem::GetLogicalBlockSize(const std::string
& fname
, int fd
) {
999 return logical_block_size_cache_
.GetLogicalBlockSize(fname
, fd
);
1002 return PosixHelper::GetLogicalBlockSizeOfFd(fd
);
1006 size_t PosixFileSystem::GetLogicalBlockSizeForReadIfNeeded(
1007 const EnvOptions
& options
, const std::string
& fname
, int fd
) {
1008 return options
.use_direct_reads
1009 ? PosixFileSystem::GetLogicalBlockSize(fname
, fd
)
1013 size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded(
1014 const EnvOptions
& options
, const std::string
& fname
, int fd
) {
1015 return options
.use_direct_writes
1016 ? PosixFileSystem::GetLogicalBlockSize(fname
, fd
)
1020 PosixFileSystem::PosixFileSystem()
1021 : checkedDiskForMmap_(false),
1022 forceMmapOff_(false),
1023 page_size_(getpagesize()),
1024 allow_non_owner_access_(true) {
1025 #if defined(ROCKSDB_IOURING_PRESENT)
1026 // Test whether io_uring is supported and, if it is, create a managing
1027 // object for the thread-local pointer so that a thread-local io_uring
1028 // can be created later.
1029 struct io_uring
* new_io_uring
= CreateIOUring();
1030 if (new_io_uring
!= nullptr) {
1031 thread_local_io_urings_
.reset(new ThreadLocalPtr(DeleteIOUring
));
1032 delete new_io_uring
;
1040 // Default Posix FileSystem
// Returns a shared_ptr to a function-local static PosixFileSystem. The
// shared_ptr is itself a function-local static, so every call returns a copy
// of the same pointer.
1042 std::shared_ptr
<FileSystem
> FileSystem::Default() {
1043 static PosixFileSystem default_fs
;
// The deleter is a no-op: default_fs has static storage duration and must
// never be deleted through the shared_ptr.
1044 static std::shared_ptr
<PosixFileSystem
> default_fs_ptr(
1045 &default_fs
, [](PosixFileSystem
*) {});
1046 return default_fs_ptr
;
1049 #ifndef ROCKSDB_LITE
1050 static FactoryFunc
<FileSystem
> posix_filesystem_reg
=
1051 ObjectLibrary::Default()->Register
<FileSystem
>(
1053 [](const std::string
& /* uri */, std::unique_ptr
<FileSystem
>* f
,
1054 std::string
* /* errmsg */) {
1055 f
->reset(new PosixFileSystem());
1060 } // namespace ROCKSDB_NAMESPACE