]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/env/io_posix.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / env / io_posix.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 #pragma once
10 #include <errno.h>
11 #if defined(ROCKSDB_IOURING_PRESENT)
12 #include <liburing.h>
13 #include <sys/uio.h>
14 #endif
15 #include <unistd.h>
16 #include <atomic>
17 #include <functional>
18 #include <map>
19 #include <string>
20 #include "port/port.h"
21 #include "rocksdb/env.h"
22 #include "rocksdb/file_system.h"
23 #include "rocksdb/io_status.h"
24 #include "util/mutexlock.h"
25 #include "util/thread_local.h"
26
27 // On platforms other than Linux, Cygwin and AIX, the following macros are
28 // defined only as placeholders.
29 #if !(defined OS_LINUX) && !(defined CYGWIN) && !(defined OS_AIX)
30 #define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
31 #define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
32 #define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
33 #define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
34 #define POSIX_FADV_DONTNEED 4 /* [MC1] don't need these pages */
35 #endif
36
37 namespace ROCKSDB_NAMESPACE {
38 std::string IOErrorMsg(const std::string& context,
39 const std::string& file_name);
40 // file_name can be left empty if it is not unkown.
41 IOStatus IOError(const std::string& context, const std::string& file_name,
42 int err_number);
43
44 class PosixHelper {
45 public:
46 static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
47 static size_t GetLogicalBlockSizeOfFd(int fd);
48 static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
49 size_t* size);
50 };
51
52 #ifdef OS_LINUX
53 // Files under a specific directory have the same logical block size.
54 // This class caches the logical block size for the specified directories to
55 // save the CPU cost of computing the size.
56 // Safe for concurrent access from multiple threads without any external
57 // synchronization.
58 class LogicalBlockSizeCache {
59 public:
60 LogicalBlockSizeCache(
61 std::function<size_t(int)> get_logical_block_size_of_fd =
62 PosixHelper::GetLogicalBlockSizeOfFd,
63 std::function<Status(const std::string&, size_t*)>
64 get_logical_block_size_of_directory =
65 PosixHelper::GetLogicalBlockSizeOfDirectory)
66 : get_logical_block_size_of_fd_(get_logical_block_size_of_fd),
67 get_logical_block_size_of_directory_(
68 get_logical_block_size_of_directory) {}
69
70 // Takes the following actions:
71 // 1. Increases reference count of the directories;
72 // 2. If the directory's logical block size is not cached,
73 // compute the buffer size and cache the result.
74 Status RefAndCacheLogicalBlockSize(
75 const std::vector<std::string>& directories);
76
77 // Takes the following actions:
78 // 1. Decreases reference count of the directories;
79 // 2. If the reference count of a directory reaches 0, remove the directory
80 // from the cache.
81 void UnrefAndTryRemoveCachedLogicalBlockSize(
82 const std::vector<std::string>& directories);
83
84 // Returns the logical block size for the file.
85 //
86 // If the file is under a cached directory, return the cached size.
87 // Otherwise, the size is computed.
88 size_t GetLogicalBlockSize(const std::string& fname, int fd);
89
90 int GetRefCount(const std::string& dir) {
91 ReadLock lock(&cache_mutex_);
92 auto it = cache_.find(dir);
93 if (it == cache_.end()) {
94 return 0;
95 }
96 return it->second.ref;
97 }
98
99 size_t Size() const { return cache_.size(); }
100
101 bool Contains(const std::string& dir) {
102 ReadLock lock(&cache_mutex_);
103 return cache_.find(dir) != cache_.end();
104 }
105
106 private:
107 struct CacheValue {
108 CacheValue() : size(0), ref(0) {}
109
110 // Logical block size of the directory.
111 size_t size;
112 // Reference count of the directory.
113 int ref;
114 };
115
116 std::function<size_t(int)> get_logical_block_size_of_fd_;
117 std::function<Status(const std::string&, size_t*)>
118 get_logical_block_size_of_directory_;
119
120 std::map<std::string, CacheValue> cache_;
121 port::RWMutex cache_mutex_;
122 };
123 #endif
124
125 class PosixSequentialFile : public FSSequentialFile {
126 private:
127 std::string filename_;
128 FILE* file_;
129 int fd_;
130 bool use_direct_io_;
131 size_t logical_sector_size_;
132
133 public:
134 PosixSequentialFile(const std::string& fname, FILE* file, int fd,
135 size_t logical_block_size,
136 const EnvOptions& options);
137 virtual ~PosixSequentialFile();
138
139 virtual IOStatus Read(size_t n, const IOOptions& opts, Slice* result,
140 char* scratch, IODebugContext* dbg) override;
141 virtual IOStatus PositionedRead(uint64_t offset, size_t n,
142 const IOOptions& opts, Slice* result,
143 char* scratch, IODebugContext* dbg) override;
144 virtual IOStatus Skip(uint64_t n) override;
145 virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
146 virtual bool use_direct_io() const override { return use_direct_io_; }
147 virtual size_t GetRequiredBufferAlignment() const override {
148 return logical_sector_size_;
149 }
150 };
151
152 #if defined(ROCKSDB_IOURING_PRESENT)
// Queue depth (number of entries) requested for every io_uring instance.
constexpr unsigned int kIoUringDepth = 256;
155
156 inline void DeleteIOUring(void* p) {
157 struct io_uring* iu = static_cast<struct io_uring*>(p);
158 delete iu;
159 }
160
161 inline struct io_uring* CreateIOUring() {
162 struct io_uring* new_io_uring = new struct io_uring;
163 int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, 0);
164 if (ret) {
165 delete new_io_uring;
166 new_io_uring = nullptr;
167 }
168 return new_io_uring;
169 }
170 #endif // defined(ROCKSDB_IOURING_PRESENT)
171
172 class PosixRandomAccessFile : public FSRandomAccessFile {
173 protected:
174 std::string filename_;
175 int fd_;
176 bool use_direct_io_;
177 size_t logical_sector_size_;
178 #if defined(ROCKSDB_IOURING_PRESENT)
179 ThreadLocalPtr* thread_local_io_urings_;
180 #endif
181
182 public:
183 PosixRandomAccessFile(const std::string& fname, int fd,
184 size_t logical_block_size,
185 const EnvOptions& options
186 #if defined(ROCKSDB_IOURING_PRESENT)
187 ,
188 ThreadLocalPtr* thread_local_io_urings
189 #endif
190 );
191 virtual ~PosixRandomAccessFile();
192
193 virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
194 Slice* result, char* scratch,
195 IODebugContext* dbg) const override;
196
197 virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
198 const IOOptions& options,
199 IODebugContext* dbg) override;
200
201 virtual IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts,
202 IODebugContext* dbg) override;
203
204 #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
205 virtual size_t GetUniqueId(char* id, size_t max_size) const override;
206 #endif
207 virtual void Hint(AccessPattern pattern) override;
208 virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
209 virtual bool use_direct_io() const override { return use_direct_io_; }
210 virtual size_t GetRequiredBufferAlignment() const override {
211 return logical_sector_size_;
212 }
213 };
214
215 class PosixWritableFile : public FSWritableFile {
216 protected:
217 const std::string filename_;
218 const bool use_direct_io_;
219 int fd_;
220 uint64_t filesize_;
221 size_t logical_sector_size_;
222 #ifdef ROCKSDB_FALLOCATE_PRESENT
223 bool allow_fallocate_;
224 bool fallocate_with_keep_size_;
225 #endif
226 #ifdef ROCKSDB_RANGESYNC_PRESENT
227 // Even if the syscall is present, the filesystem may still not properly
228 // support it, so we need to do a dynamic check too.
229 bool sync_file_range_supported_;
230 #endif // ROCKSDB_RANGESYNC_PRESENT
231
232 public:
233 explicit PosixWritableFile(const std::string& fname, int fd,
234 size_t logical_block_size,
235 const EnvOptions& options);
236 virtual ~PosixWritableFile();
237
238 // Need to implement this so the file is truncated correctly
239 // with direct I/O
240 virtual IOStatus Truncate(uint64_t size, const IOOptions& opts,
241 IODebugContext* dbg) override;
242 virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
243 virtual IOStatus Append(const Slice& data, const IOOptions& opts,
244 IODebugContext* dbg) override;
245 virtual IOStatus Append(const Slice& data, const IOOptions& opts,
246 const DataVerificationInfo& /* verification_info */,
247 IODebugContext* dbg) override {
248 return Append(data, opts, dbg);
249 }
250 virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset,
251 const IOOptions& opts,
252 IODebugContext* dbg) override;
253 virtual IOStatus PositionedAppend(
254 const Slice& data, uint64_t offset, const IOOptions& opts,
255 const DataVerificationInfo& /* verification_info */,
256 IODebugContext* dbg) override {
257 return PositionedAppend(data, offset, opts, dbg);
258 }
259 virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
260 virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
261 virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
262 virtual bool IsSyncThreadSafe() const override;
263 virtual bool use_direct_io() const override { return use_direct_io_; }
264 virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override;
265 virtual uint64_t GetFileSize(const IOOptions& opts,
266 IODebugContext* dbg) override;
267 virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
268 virtual size_t GetRequiredBufferAlignment() const override {
269 return logical_sector_size_;
270 }
271 #ifdef ROCKSDB_FALLOCATE_PRESENT
272 virtual IOStatus Allocate(uint64_t offset, uint64_t len,
273 const IOOptions& opts,
274 IODebugContext* dbg) override;
275 #endif
276 virtual IOStatus RangeSync(uint64_t offset, uint64_t nbytes,
277 const IOOptions& opts,
278 IODebugContext* dbg) override;
279 #ifdef OS_LINUX
280 virtual size_t GetUniqueId(char* id, size_t max_size) const override;
281 #endif
282 };
283
284 // mmap() based random-access
285 class PosixMmapReadableFile : public FSRandomAccessFile {
286 private:
287 int fd_;
288 std::string filename_;
289 void* mmapped_region_;
290 size_t length_;
291
292 public:
293 PosixMmapReadableFile(const int fd, const std::string& fname, void* base,
294 size_t length, const EnvOptions& options);
295 virtual ~PosixMmapReadableFile();
296 virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
297 Slice* result, char* scratch,
298 IODebugContext* dbg) const override;
299 virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
300 };
301
302 class PosixMmapFile : public FSWritableFile {
303 private:
304 std::string filename_;
305 int fd_;
306 size_t page_size_;
307 size_t map_size_; // How much extra memory to map at a time
308 char* base_; // The mapped region
309 char* limit_; // Limit of the mapped region
310 char* dst_; // Where to write next (in range [base_,limit_])
311 char* last_sync_; // Where have we synced up to
312 uint64_t file_offset_; // Offset of base_ in file
313 #ifdef ROCKSDB_FALLOCATE_PRESENT
314 bool allow_fallocate_; // If false, fallocate calls are bypassed
315 bool fallocate_with_keep_size_;
316 #endif
317
318 // Roundup x to a multiple of y
319 static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
320
321 size_t TruncateToPageBoundary(size_t s) {
322 s -= (s & (page_size_ - 1));
323 assert((s % page_size_) == 0);
324 return s;
325 }
326
327 IOStatus MapNewRegion();
328 IOStatus UnmapCurrentRegion();
329 IOStatus Msync();
330
331 public:
332 PosixMmapFile(const std::string& fname, int fd, size_t page_size,
333 const EnvOptions& options);
334 ~PosixMmapFile();
335
336 // Means Close() will properly take care of truncate
337 // and it does not need any additional information
338 virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/,
339 IODebugContext* /*dbg*/) override {
340 return IOStatus::OK();
341 }
342 virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
343 virtual IOStatus Append(const Slice& data, const IOOptions& opts,
344 IODebugContext* dbg) override;
345 virtual IOStatus Append(const Slice& data, const IOOptions& opts,
346 const DataVerificationInfo& /* verification_info */,
347 IODebugContext* dbg) override {
348 return Append(data, opts, dbg);
349 }
350 virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
351 virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
352 virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
353 virtual uint64_t GetFileSize(const IOOptions& opts,
354 IODebugContext* dbg) override;
355 virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
356 #ifdef ROCKSDB_FALLOCATE_PRESENT
357 virtual IOStatus Allocate(uint64_t offset, uint64_t len,
358 const IOOptions& opts,
359 IODebugContext* dbg) override;
360 #endif
361 };
362
363 class PosixRandomRWFile : public FSRandomRWFile {
364 public:
365 explicit PosixRandomRWFile(const std::string& fname, int fd,
366 const EnvOptions& options);
367 virtual ~PosixRandomRWFile();
368
369 virtual IOStatus Write(uint64_t offset, const Slice& data,
370 const IOOptions& opts, IODebugContext* dbg) override;
371
372 virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
373 Slice* result, char* scratch,
374 IODebugContext* dbg) const override;
375
376 virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
377 virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
378 virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
379 virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
380
381 private:
382 const std::string filename_;
383 int fd_;
384 };
385
386 struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer {
387 PosixMemoryMappedFileBuffer(void* _base, size_t _length)
388 : MemoryMappedFileBuffer(_base, _length) {}
389 virtual ~PosixMemoryMappedFileBuffer();
390 };
391
392 class PosixDirectory : public FSDirectory {
393 public:
394 explicit PosixDirectory(int fd) : fd_(fd) {}
395 ~PosixDirectory();
396 virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
397
398 private:
399 int fd_;
400 };
401
402 } // namespace ROCKSDB_NAMESPACE