1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
11 #if defined(ROCKSDB_IOURING_PRESENT)
20 #include "port/port.h"
21 #include "rocksdb/env.h"
22 #include "rocksdb/file_system.h"
23 #include "rocksdb/io_status.h"
24 #include "util/mutexlock.h"
25 #include "util/thread_local.h"
27 // For non-Linux platforms, the following macros are used only as placeholders.
29 #if !(defined OS_LINUX) && !(defined CYGWIN) && !(defined OS_AIX)
30 #define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
31 #define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
32 #define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
33 #define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
34 #define POSIX_FADV_DONTNEED 4 /* [MC1] don't need these pages */
37 namespace ROCKSDB_NAMESPACE
{
38 std::string
IOErrorMsg(const std::string
& context
,
39 const std::string
& file_name
);
40 // file_name can be left empty if it is unknown.
41 IOStatus
IOError(const std::string
& context
, const std::string
& file_name
,
46 static size_t GetUniqueIdFromFile(int fd
, char* id
, size_t max_size
);
47 static size_t GetLogicalBlockSizeOfFd(int fd
);
48 static Status
GetLogicalBlockSizeOfDirectory(const std::string
& directory
,
53 // Files under a specific directory have the same logical block size.
54 // This class caches the logical block size for the specified directories to
55 // save the CPU cost of computing the size.
56 // Safe for concurrent access from multiple threads without any external synchronization.
58 class LogicalBlockSizeCache
{
60 LogicalBlockSizeCache(
61 std::function
<size_t(int)> get_logical_block_size_of_fd
=
62 PosixHelper::GetLogicalBlockSizeOfFd
,
63 std::function
<Status(const std::string
&, size_t*)>
64 get_logical_block_size_of_directory
=
65 PosixHelper::GetLogicalBlockSizeOfDirectory
)
66 : get_logical_block_size_of_fd_(get_logical_block_size_of_fd
),
67 get_logical_block_size_of_directory_(
68 get_logical_block_size_of_directory
) {}
70 // Takes the following actions:
71 // 1. Increases reference count of the directories;
72 // 2. If the directory's logical block size is not cached,
73 //    compute the logical block size and cache the result.
74 Status
RefAndCacheLogicalBlockSize(
75 const std::vector
<std::string
>& directories
);
77 // Takes the following actions:
78 // 1. Decreases reference count of the directories;
79 // 2. If the reference count of a directory reaches 0, remove the directory from the cache.
81 void UnrefAndTryRemoveCachedLogicalBlockSize(
82 const std::vector
<std::string
>& directories
);
84 // Returns the logical block size for the file.
86 // If the file is under a cached directory, return the cached size.
87 // Otherwise, the size is computed.
88 size_t GetLogicalBlockSize(const std::string
& fname
, int fd
);
90 int GetRefCount(const std::string
& dir
) {
91 ReadLock
lock(&cache_mutex_
);
92 auto it
= cache_
.find(dir
);
93 if (it
== cache_
.end()) {
96 return it
->second
.ref
;
99 size_t Size() const { return cache_
.size(); }
101 bool Contains(const std::string
& dir
) {
102 ReadLock
lock(&cache_mutex_
);
103 return cache_
.find(dir
) != cache_
.end();
108 CacheValue() : size(0), ref(0) {}
110 // Logical block size of the directory.
112 // Reference count of the directory.
116 std::function
<size_t(int)> get_logical_block_size_of_fd_
;
117 std::function
<Status(const std::string
&, size_t*)>
118 get_logical_block_size_of_directory_
;
120 std::map
<std::string
, CacheValue
> cache_
;
121 port::RWMutex cache_mutex_
;
125 class PosixSequentialFile
: public FSSequentialFile
{
127 std::string filename_
;
131 size_t logical_sector_size_
;
134 PosixSequentialFile(const std::string
& fname
, FILE* file
, int fd
,
135 size_t logical_block_size
,
136 const EnvOptions
& options
);
137 virtual ~PosixSequentialFile();
139 virtual IOStatus
Read(size_t n
, const IOOptions
& opts
, Slice
* result
,
140 char* scratch
, IODebugContext
* dbg
) override
;
141 virtual IOStatus
PositionedRead(uint64_t offset
, size_t n
,
142 const IOOptions
& opts
, Slice
* result
,
143 char* scratch
, IODebugContext
* dbg
) override
;
144 virtual IOStatus
Skip(uint64_t n
) override
;
145 virtual IOStatus
InvalidateCache(size_t offset
, size_t length
) override
;
146 virtual bool use_direct_io() const override
{ return use_direct_io_
; }
147 virtual size_t GetRequiredBufferAlignment() const override
{
148 return logical_sector_size_
;
152 #if defined(ROCKSDB_IOURING_PRESENT)
// Queue depth handed to io_uring_queue_init() when an io_uring instance is
// created.
constexpr unsigned int kIoUringDepth = 256;
156 inline void DeleteIOUring(void* p
) {
157 struct io_uring
* iu
= static_cast<struct io_uring
*>(p
);
161 inline struct io_uring
* CreateIOUring() {
162 struct io_uring
* new_io_uring
= new struct io_uring
;
163 int ret
= io_uring_queue_init(kIoUringDepth
, new_io_uring
, 0);
166 new_io_uring
= nullptr;
170 #endif // defined(ROCKSDB_IOURING_PRESENT)
172 class PosixRandomAccessFile
: public FSRandomAccessFile
{
174 std::string filename_
;
177 size_t logical_sector_size_
;
178 #if defined(ROCKSDB_IOURING_PRESENT)
179 ThreadLocalPtr
* thread_local_io_urings_
;
183 PosixRandomAccessFile(const std::string
& fname
, int fd
,
184 size_t logical_block_size
,
185 const EnvOptions
& options
186 #if defined(ROCKSDB_IOURING_PRESENT)
188 ThreadLocalPtr
* thread_local_io_urings
191 virtual ~PosixRandomAccessFile();
193 virtual IOStatus
Read(uint64_t offset
, size_t n
, const IOOptions
& opts
,
194 Slice
* result
, char* scratch
,
195 IODebugContext
* dbg
) const override
;
197 virtual IOStatus
MultiRead(FSReadRequest
* reqs
, size_t num_reqs
,
198 const IOOptions
& options
,
199 IODebugContext
* dbg
) override
;
201 virtual IOStatus
Prefetch(uint64_t offset
, size_t n
, const IOOptions
& opts
,
202 IODebugContext
* dbg
) override
;
204 #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
205 virtual size_t GetUniqueId(char* id
, size_t max_size
) const override
;
207 virtual void Hint(AccessPattern pattern
) override
;
208 virtual IOStatus
InvalidateCache(size_t offset
, size_t length
) override
;
209 virtual bool use_direct_io() const override
{ return use_direct_io_
; }
210 virtual size_t GetRequiredBufferAlignment() const override
{
211 return logical_sector_size_
;
215 class PosixWritableFile
: public FSWritableFile
{
217 const std::string filename_
;
218 const bool use_direct_io_
;
221 size_t logical_sector_size_
;
222 #ifdef ROCKSDB_FALLOCATE_PRESENT
223 bool allow_fallocate_
;
224 bool fallocate_with_keep_size_
;
226 #ifdef ROCKSDB_RANGESYNC_PRESENT
227 // Even if the syscall is present, the filesystem may still not properly
228 // support it, so we need to do a dynamic check too.
229 bool sync_file_range_supported_
;
230 #endif // ROCKSDB_RANGESYNC_PRESENT
233 explicit PosixWritableFile(const std::string
& fname
, int fd
,
234 size_t logical_block_size
,
235 const EnvOptions
& options
);
236 virtual ~PosixWritableFile();
238 // Need to implement this so the file is truncated correctly
240 virtual IOStatus
Truncate(uint64_t size
, const IOOptions
& opts
,
241 IODebugContext
* dbg
) override
;
242 virtual IOStatus
Close(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
243 virtual IOStatus
Append(const Slice
& data
, const IOOptions
& opts
,
244 IODebugContext
* dbg
) override
;
245 virtual IOStatus
Append(const Slice
& data
, const IOOptions
& opts
,
246 const DataVerificationInfo
& /* verification_info */,
247 IODebugContext
* dbg
) override
{
248 return Append(data
, opts
, dbg
);
250 virtual IOStatus
PositionedAppend(const Slice
& data
, uint64_t offset
,
251 const IOOptions
& opts
,
252 IODebugContext
* dbg
) override
;
253 virtual IOStatus
PositionedAppend(
254 const Slice
& data
, uint64_t offset
, const IOOptions
& opts
,
255 const DataVerificationInfo
& /* verification_info */,
256 IODebugContext
* dbg
) override
{
257 return PositionedAppend(data
, offset
, opts
, dbg
);
259 virtual IOStatus
Flush(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
260 virtual IOStatus
Sync(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
261 virtual IOStatus
Fsync(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
262 virtual bool IsSyncThreadSafe() const override
;
263 virtual bool use_direct_io() const override
{ return use_direct_io_
; }
264 virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint
) override
;
265 virtual uint64_t GetFileSize(const IOOptions
& opts
,
266 IODebugContext
* dbg
) override
;
267 virtual IOStatus
InvalidateCache(size_t offset
, size_t length
) override
;
268 virtual size_t GetRequiredBufferAlignment() const override
{
269 return logical_sector_size_
;
271 #ifdef ROCKSDB_FALLOCATE_PRESENT
272 virtual IOStatus
Allocate(uint64_t offset
, uint64_t len
,
273 const IOOptions
& opts
,
274 IODebugContext
* dbg
) override
;
276 virtual IOStatus
RangeSync(uint64_t offset
, uint64_t nbytes
,
277 const IOOptions
& opts
,
278 IODebugContext
* dbg
) override
;
280 virtual size_t GetUniqueId(char* id
, size_t max_size
) const override
;
284 // mmap() based random-access
285 class PosixMmapReadableFile
: public FSRandomAccessFile
{
288 std::string filename_
;
289 void* mmapped_region_
;
293 PosixMmapReadableFile(const int fd
, const std::string
& fname
, void* base
,
294 size_t length
, const EnvOptions
& options
);
295 virtual ~PosixMmapReadableFile();
296 virtual IOStatus
Read(uint64_t offset
, size_t n
, const IOOptions
& opts
,
297 Slice
* result
, char* scratch
,
298 IODebugContext
* dbg
) const override
;
299 virtual IOStatus
InvalidateCache(size_t offset
, size_t length
) override
;
302 class PosixMmapFile
: public FSWritableFile
{
304 std::string filename_
;
307 size_t map_size_
; // How much extra memory to map at a time
308 char* base_
; // The mapped region
309 char* limit_
; // Limit of the mapped region
310 char* dst_
; // Where to write next (in range [base_,limit_])
311 char* last_sync_
; // Where have we synced up to
312 uint64_t file_offset_
; // Offset of base_ in file
313 #ifdef ROCKSDB_FALLOCATE_PRESENT
314 bool allow_fallocate_
; // If false, fallocate calls are bypassed
315 bool fallocate_with_keep_size_
;
// Rounds x up to the nearest multiple of y and returns it; x is returned
// unchanged when it is already a multiple of y.
//
// Precondition: y must be non-zero (the computation takes x modulo y).
//
// The remainder-based form never computes x + y - 1, so it cannot wrap
// around for inputs whose rounded-up value is representable in size_t
// (for example when x is the largest multiple of y that fits).
static size_t Roundup(size_t x, size_t y) {
  const size_t remainder = x % y;
  return remainder == 0 ? x : x + (y - remainder);
}
321 size_t TruncateToPageBoundary(size_t s
) {
322 s
-= (s
& (page_size_
- 1));
323 assert((s
% page_size_
) == 0);
327 IOStatus
MapNewRegion();
328 IOStatus
UnmapCurrentRegion();
332 PosixMmapFile(const std::string
& fname
, int fd
, size_t page_size
,
333 const EnvOptions
& options
);
336 // Means Close() will properly take care of truncate
337 // and it does not need any additional information
338 virtual IOStatus
Truncate(uint64_t /*size*/, const IOOptions
& /*opts*/,
339 IODebugContext
* /*dbg*/) override
{
340 return IOStatus::OK();
342 virtual IOStatus
Close(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
343 virtual IOStatus
Append(const Slice
& data
, const IOOptions
& opts
,
344 IODebugContext
* dbg
) override
;
345 virtual IOStatus
Append(const Slice
& data
, const IOOptions
& opts
,
346 const DataVerificationInfo
& /* verification_info */,
347 IODebugContext
* dbg
) override
{
348 return Append(data
, opts
, dbg
);
350 virtual IOStatus
Flush(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
351 virtual IOStatus
Sync(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
352 virtual IOStatus
Fsync(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
353 virtual uint64_t GetFileSize(const IOOptions
& opts
,
354 IODebugContext
* dbg
) override
;
355 virtual IOStatus
InvalidateCache(size_t offset
, size_t length
) override
;
356 #ifdef ROCKSDB_FALLOCATE_PRESENT
357 virtual IOStatus
Allocate(uint64_t offset
, uint64_t len
,
358 const IOOptions
& opts
,
359 IODebugContext
* dbg
) override
;
363 class PosixRandomRWFile
: public FSRandomRWFile
{
365 explicit PosixRandomRWFile(const std::string
& fname
, int fd
,
366 const EnvOptions
& options
);
367 virtual ~PosixRandomRWFile();
369 virtual IOStatus
Write(uint64_t offset
, const Slice
& data
,
370 const IOOptions
& opts
, IODebugContext
* dbg
) override
;
372 virtual IOStatus
Read(uint64_t offset
, size_t n
, const IOOptions
& opts
,
373 Slice
* result
, char* scratch
,
374 IODebugContext
* dbg
) const override
;
376 virtual IOStatus
Flush(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
377 virtual IOStatus
Sync(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
378 virtual IOStatus
Fsync(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
379 virtual IOStatus
Close(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
382 const std::string filename_
;
386 struct PosixMemoryMappedFileBuffer
: public MemoryMappedFileBuffer
{
387 PosixMemoryMappedFileBuffer(void* _base
, size_t _length
)
388 : MemoryMappedFileBuffer(_base
, _length
) {}
389 virtual ~PosixMemoryMappedFileBuffer();
392 class PosixDirectory
: public FSDirectory
{
394 explicit PosixDirectory(int fd
) : fd_(fd
) {}
396 virtual IOStatus
Fsync(const IOOptions
& opts
, IODebugContext
* dbg
) override
;
402 } // namespace ROCKSDB_NAMESPACE