]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/env/fs_posix.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / env / fs_posix.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors
9
10 #if !defined(OS_WIN)
11
12 #include <dirent.h>
13 #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
14 #include <dlfcn.h>
15 #endif
16 #include <errno.h>
17 #include <fcntl.h>
18
19 #include <pthread.h>
20 #include <signal.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/ioctl.h>
25 #include <sys/mman.h>
26 #include <sys/stat.h>
27 #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
28 #include <sys/statfs.h>
29 #include <sys/sysmacros.h>
30 #endif
31 #include <sys/statvfs.h>
32 #include <sys/time.h>
33 #include <sys/types.h>
34 #include <time.h>
35 #include <algorithm>
36 // Get nano time includes
37 #if defined(OS_LINUX) || defined(OS_FREEBSD)
38 #elif defined(__MACH__)
39 #include <Availability.h>
40 #include <mach/clock.h>
41 #include <mach/mach.h>
42 #else
43 #include <chrono>
44 #endif
45 #include <deque>
46 #include <set>
47 #include <vector>
48
49 #include "env/composite_env_wrapper.h"
50 #include "env/io_posix.h"
51 #include "logging/posix_logger.h"
52 #include "monitoring/iostats_context_imp.h"
53 #include "monitoring/thread_status_updater.h"
54 #include "port/port.h"
55 #include "rocksdb/options.h"
56 #include "rocksdb/slice.h"
57 #include "rocksdb/utilities/object_registry.h"
58 #include "test_util/sync_point.h"
59 #include "util/coding.h"
60 #include "util/compression_context_cache.h"
61 #include "util/random.h"
62 #include "util/string_util.h"
63 #include "util/thread_local.h"
64 #include "util/threadpool_imp.h"
65
66 #if !defined(TMPFS_MAGIC)
67 #define TMPFS_MAGIC 0x01021994
68 #endif
69 #if !defined(XFS_SUPER_MAGIC)
70 #define XFS_SUPER_MAGIC 0x58465342
71 #endif
72 #if !defined(EXT4_SUPER_MAGIC)
73 #define EXT4_SUPER_MAGIC 0xEF53
74 #endif
75
76 namespace ROCKSDB_NAMESPACE {
77
78 namespace {
79
// Picks the permission bits passed to open() for DB files: 0644 when
// non-owners are allowed access, 0600 otherwise.
inline mode_t GetDBFileMode(bool allow_non_owner_access) {
  if (allow_non_owner_access) {
    return 0644;
  }
  return 0600;
}
83
84 static uint64_t gettid() {
85 return Env::Default()->GetThreadID();
86 }
87
// Pathnames currently locked through LockFile(), keyed by file name.
// The per-entry holder information is only used to build error messages.
// Describes who holds an entry in locked_files. LockFile() fills it in and
// echoes it in the error returned when the same path is locked again.
struct LockHoldingInfo {
  int64_t acquire_time;       // time reported by Env::GetCurrentTime() in LockFile()
  uint64_t acquiring_thread;  // Env::GetThreadID() of the thread that called LockFile()
};
94 static std::map<std::string, LockHoldingInfo> locked_files;
95 static port::Mutex mutex_locked_files;
96
// Places (lock == true) or removes (lock == false) a whole-file fcntl()
// record lock on `fd` without blocking. errno is cleared first; the return
// value is whatever fcntl(F_SETLK) returned.
static int LockOrUnlock(int fd, bool lock) {
  errno = 0;

  struct flock request;
  memset(&request, 0, sizeof(request));
  if (lock) {
    request.l_type = F_WRLCK;
  } else {
    request.l_type = F_UNLCK;
  }
  request.l_whence = SEEK_SET;
  request.l_start = 0;
  request.l_len = 0;  // Lock/unlock entire file

  return fcntl(fd, F_SETLK, &request);
}
109
110 class PosixFileLock : public FileLock {
111 public:
112 int fd_;
113 std::string filename;
114 };
115
116 int cloexec_flags(int flags, const EnvOptions* options) {
117 // If the system supports opening the file with cloexec enabled,
118 // do so, as this avoids a race condition if a db is opened around
119 // the same time that a child process is forked
120 #ifdef O_CLOEXEC
121 if (options == nullptr || options->set_fd_cloexec) {
122 flags |= O_CLOEXEC;
123 }
124 #else
125 (void)options;
126 #endif
127 return flags;
128 }
129
130 class PosixFileSystem : public FileSystem {
131 public:
132 PosixFileSystem();
133
134 const char* Name() const override { return "Posix File System"; }
135
136 ~PosixFileSystem() override {}
137
138 void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
139 if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
140 fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
141 }
142 }
143
144 IOStatus NewSequentialFile(const std::string& fname,
145 const FileOptions& options,
146 std::unique_ptr<FSSequentialFile>* result,
147 IODebugContext* /*dbg*/) override {
148 result->reset();
149 int fd = -1;
150 int flags = cloexec_flags(O_RDONLY, &options);
151 FILE* file = nullptr;
152
153 if (options.use_direct_reads && !options.use_mmap_reads) {
154 #ifdef ROCKSDB_LITE
155 return IOStatus::IOError(fname,
156 "Direct I/O not supported in RocksDB lite");
157 #endif // !ROCKSDB_LITE
158 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
159 flags |= O_DIRECT;
160 TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
161 #endif
162 }
163
164 do {
165 IOSTATS_TIMER_GUARD(open_nanos);
166 fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
167 } while (fd < 0 && errno == EINTR);
168 if (fd < 0) {
169 return IOError("While opening a file for sequentially reading", fname,
170 errno);
171 }
172
173 SetFD_CLOEXEC(fd, &options);
174
175 if (options.use_direct_reads && !options.use_mmap_reads) {
176 #ifdef OS_MACOSX
177 if (fcntl(fd, F_NOCACHE, 1) == -1) {
178 close(fd);
179 return IOError("While fcntl NoCache", fname, errno);
180 }
181 #endif
182 } else {
183 do {
184 IOSTATS_TIMER_GUARD(open_nanos);
185 file = fdopen(fd, "r");
186 } while (file == nullptr && errno == EINTR);
187 if (file == nullptr) {
188 close(fd);
189 return IOError("While opening file for sequentially read", fname,
190 errno);
191 }
192 }
193 result->reset(new PosixSequentialFile(
194 fname, file, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
195 options));
196 return IOStatus::OK();
197 }
198
199 IOStatus NewRandomAccessFile(const std::string& fname,
200 const FileOptions& options,
201 std::unique_ptr<FSRandomAccessFile>* result,
202 IODebugContext* /*dbg*/) override {
203 result->reset();
204 IOStatus s = IOStatus::OK();
205 int fd;
206 int flags = cloexec_flags(O_RDONLY, &options);
207
208 if (options.use_direct_reads && !options.use_mmap_reads) {
209 #ifdef ROCKSDB_LITE
210 return IOStatus::IOError(fname,
211 "Direct I/O not supported in RocksDB lite");
212 #endif // !ROCKSDB_LITE
213 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
214 flags |= O_DIRECT;
215 TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags);
216 #endif
217 }
218
219 do {
220 IOSTATS_TIMER_GUARD(open_nanos);
221 fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
222 } while (fd < 0 && errno == EINTR);
223 if (fd < 0) {
224 s = IOError("While open a file for random read", fname, errno);
225 return s;
226 }
227 SetFD_CLOEXEC(fd, &options);
228
229 if (options.use_mmap_reads && sizeof(void*) >= 8) {
230 // Use of mmap for random reads has been removed because it
231 // kills performance when storage is fast.
232 // Use mmap when virtual address-space is plentiful.
233 uint64_t size;
234 IOOptions opts;
235 s = GetFileSize(fname, opts, &size, nullptr);
236 if (s.ok()) {
237 void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
238 if (base != MAP_FAILED) {
239 result->reset(
240 new PosixMmapReadableFile(fd, fname, base, size, options));
241 } else {
242 s = IOError("while mmap file for read", fname, errno);
243 close(fd);
244 }
245 } else {
246 close(fd);
247 }
248 } else {
249 if (options.use_direct_reads && !options.use_mmap_reads) {
250 #ifdef OS_MACOSX
251 if (fcntl(fd, F_NOCACHE, 1) == -1) {
252 close(fd);
253 return IOError("while fcntl NoCache", fname, errno);
254 }
255 #endif
256 }
257 result->reset(new PosixRandomAccessFile(
258 fname, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
259 options
260 #if defined(ROCKSDB_IOURING_PRESENT)
261 ,
262 thread_local_io_urings_.get()
263 #endif
264 ));
265 }
266 return s;
267 }
268
269 virtual IOStatus OpenWritableFile(const std::string& fname,
270 const FileOptions& options,
271 bool reopen,
272 std::unique_ptr<FSWritableFile>* result,
273 IODebugContext* /*dbg*/) {
274 result->reset();
275 IOStatus s;
276 int fd = -1;
277 int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC);
278 // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
279 if (options.use_direct_writes && !options.use_mmap_writes) {
280 // Note: we should avoid O_APPEND here due to ta the following bug:
281 // POSIX requires that opening a file with the O_APPEND flag should
282 // have no affect on the location at which pwrite() writes data.
283 // However, on Linux, if a file is opened with O_APPEND, pwrite()
284 // appends data to the end of the file, regardless of the value of
285 // offset.
286 // More info here: https://linux.die.net/man/2/pwrite
287 #ifdef ROCKSDB_LITE
288 return IOStatus::IOError(fname,
289 "Direct I/O not supported in RocksDB lite");
290 #endif // ROCKSDB_LITE
291 flags |= O_WRONLY;
292 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
293 flags |= O_DIRECT;
294 #endif
295 TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
296 } else if (options.use_mmap_writes) {
297 // non-direct I/O
298 flags |= O_RDWR;
299 } else {
300 flags |= O_WRONLY;
301 }
302
303 flags = cloexec_flags(flags, &options);
304
305 do {
306 IOSTATS_TIMER_GUARD(open_nanos);
307 fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
308 } while (fd < 0 && errno == EINTR);
309
310 if (fd < 0) {
311 s = IOError("While open a file for appending", fname, errno);
312 return s;
313 }
314 SetFD_CLOEXEC(fd, &options);
315
316 if (options.use_mmap_writes) {
317 if (!checkedDiskForMmap_) {
318 // this will be executed once in the program's lifetime.
319 // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
320 if (!SupportsFastAllocate(fname)) {
321 forceMmapOff_ = true;
322 }
323 checkedDiskForMmap_ = true;
324 }
325 }
326 if (options.use_mmap_writes && !forceMmapOff_) {
327 result->reset(new PosixMmapFile(fname, fd, page_size_, options));
328 } else if (options.use_direct_writes && !options.use_mmap_writes) {
329 #ifdef OS_MACOSX
330 if (fcntl(fd, F_NOCACHE, 1) == -1) {
331 close(fd);
332 s = IOError("While fcntl NoCache an opened file for appending", fname,
333 errno);
334 return s;
335 }
336 #elif defined(OS_SOLARIS)
337 if (directio(fd, DIRECTIO_ON) == -1) {
338 if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON
339 close(fd);
340 s = IOError("While calling directio()", fname, errno);
341 return s;
342 }
343 }
344 #endif
345 result->reset(new PosixWritableFile(
346 fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
347 options));
348 } else {
349 // disable mmap writes
350 EnvOptions no_mmap_writes_options = options;
351 no_mmap_writes_options.use_mmap_writes = false;
352 result->reset(
353 new PosixWritableFile(fname, fd,
354 GetLogicalBlockSizeForWriteIfNeeded(
355 no_mmap_writes_options, fname, fd),
356 no_mmap_writes_options));
357 }
358 return s;
359 }
360
361 IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
362 std::unique_ptr<FSWritableFile>* result,
363 IODebugContext* dbg) override {
364 return OpenWritableFile(fname, options, false, result, dbg);
365 }
366
367 IOStatus ReopenWritableFile(const std::string& fname,
368 const FileOptions& options,
369 std::unique_ptr<FSWritableFile>* result,
370 IODebugContext* dbg) override {
371 return OpenWritableFile(fname, options, true, result, dbg);
372 }
373
374 IOStatus ReuseWritableFile(const std::string& fname,
375 const std::string& old_fname,
376 const FileOptions& options,
377 std::unique_ptr<FSWritableFile>* result,
378 IODebugContext* /*dbg*/) override {
379 result->reset();
380 IOStatus s;
381 int fd = -1;
382
383 int flags = 0;
384 // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
385 if (options.use_direct_writes && !options.use_mmap_writes) {
386 #ifdef ROCKSDB_LITE
387 return IOStatus::IOError(fname,
388 "Direct I/O not supported in RocksDB lite");
389 #endif // !ROCKSDB_LITE
390 flags |= O_WRONLY;
391 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
392 flags |= O_DIRECT;
393 #endif
394 TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
395 } else if (options.use_mmap_writes) {
396 // mmap needs O_RDWR mode
397 flags |= O_RDWR;
398 } else {
399 flags |= O_WRONLY;
400 }
401
402 flags = cloexec_flags(flags, &options);
403
404 do {
405 IOSTATS_TIMER_GUARD(open_nanos);
406 fd = open(old_fname.c_str(), flags,
407 GetDBFileMode(allow_non_owner_access_));
408 } while (fd < 0 && errno == EINTR);
409 if (fd < 0) {
410 s = IOError("while reopen file for write", fname, errno);
411 return s;
412 }
413
414 SetFD_CLOEXEC(fd, &options);
415 // rename into place
416 if (rename(old_fname.c_str(), fname.c_str()) != 0) {
417 s = IOError("while rename file to " + fname, old_fname, errno);
418 close(fd);
419 return s;
420 }
421
422 if (options.use_mmap_writes) {
423 if (!checkedDiskForMmap_) {
424 // this will be executed once in the program's lifetime.
425 // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
426 if (!SupportsFastAllocate(fname)) {
427 forceMmapOff_ = true;
428 }
429 checkedDiskForMmap_ = true;
430 }
431 }
432 if (options.use_mmap_writes && !forceMmapOff_) {
433 result->reset(new PosixMmapFile(fname, fd, page_size_, options));
434 } else if (options.use_direct_writes && !options.use_mmap_writes) {
435 #ifdef OS_MACOSX
436 if (fcntl(fd, F_NOCACHE, 1) == -1) {
437 close(fd);
438 s = IOError("while fcntl NoCache for reopened file for append", fname,
439 errno);
440 return s;
441 }
442 #elif defined(OS_SOLARIS)
443 if (directio(fd, DIRECTIO_ON) == -1) {
444 if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON
445 close(fd);
446 s = IOError("while calling directio()", fname, errno);
447 return s;
448 }
449 }
450 #endif
451 result->reset(new PosixWritableFile(
452 fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
453 options));
454 } else {
455 // disable mmap writes
456 FileOptions no_mmap_writes_options = options;
457 no_mmap_writes_options.use_mmap_writes = false;
458 result->reset(
459 new PosixWritableFile(fname, fd,
460 GetLogicalBlockSizeForWriteIfNeeded(
461 no_mmap_writes_options, fname, fd),
462 no_mmap_writes_options));
463 }
464 return s;
465 }
466
467 IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
468 std::unique_ptr<FSRandomRWFile>* result,
469 IODebugContext* /*dbg*/) override {
470 int fd = -1;
471 int flags = cloexec_flags(O_RDWR, &options);
472
473 while (fd < 0) {
474 IOSTATS_TIMER_GUARD(open_nanos);
475
476 fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
477 if (fd < 0) {
478 // Error while opening the file
479 if (errno == EINTR) {
480 continue;
481 }
482 return IOError("While open file for random read/write", fname, errno);
483 }
484 }
485
486 SetFD_CLOEXEC(fd, &options);
487 result->reset(new PosixRandomRWFile(fname, fd, options));
488 return IOStatus::OK();
489 }
490
491 IOStatus NewMemoryMappedFileBuffer(
492 const std::string& fname,
493 std::unique_ptr<MemoryMappedFileBuffer>* result) override {
494 int fd = -1;
495 IOStatus status;
496 int flags = cloexec_flags(O_RDWR, nullptr);
497
498 while (fd < 0) {
499 IOSTATS_TIMER_GUARD(open_nanos);
500 fd = open(fname.c_str(), flags, 0644);
501 if (fd < 0) {
502 // Error while opening the file
503 if (errno == EINTR) {
504 continue;
505 }
506 status =
507 IOError("While open file for raw mmap buffer access", fname, errno);
508 break;
509 }
510 }
511 uint64_t size;
512 if (status.ok()) {
513 IOOptions opts;
514 status = GetFileSize(fname, opts, &size, nullptr);
515 }
516 void* base = nullptr;
517 if (status.ok()) {
518 base = mmap(nullptr, static_cast<size_t>(size), PROT_READ | PROT_WRITE,
519 MAP_SHARED, fd, 0);
520 if (base == MAP_FAILED) {
521 status = IOError("while mmap file for read", fname, errno);
522 }
523 }
524 if (status.ok()) {
525 result->reset(
526 new PosixMemoryMappedFileBuffer(base, static_cast<size_t>(size)));
527 }
528 if (fd >= 0) {
529 // don't need to keep it open after mmap has been called
530 close(fd);
531 }
532 return status;
533 }
534
535 IOStatus NewDirectory(const std::string& name, const IOOptions& /*opts*/,
536 std::unique_ptr<FSDirectory>* result,
537 IODebugContext* /*dbg*/) override {
538 result->reset();
539 int fd;
540 int flags = cloexec_flags(0, nullptr);
541 {
542 IOSTATS_TIMER_GUARD(open_nanos);
543 fd = open(name.c_str(), flags);
544 }
545 if (fd < 0) {
546 return IOError("While open directory", name, errno);
547 } else {
548 result->reset(new PosixDirectory(fd));
549 }
550 return IOStatus::OK();
551 }
552
553 IOStatus NewLogger(const std::string& fname, const IOOptions& /*opts*/,
554 std::shared_ptr<Logger>* result,
555 IODebugContext* /*dbg*/) override {
556 FILE* f;
557 {
558 IOSTATS_TIMER_GUARD(open_nanos);
559 f = fopen(fname.c_str(),
560 "w"
561 #ifdef __GLIBC_PREREQ
562 #if __GLIBC_PREREQ(2, 7)
563 "e" // glibc extension to enable O_CLOEXEC
564 #endif
565 #endif
566 );
567 }
568 if (f == nullptr) {
569 result->reset();
570 return status_to_io_status(
571 IOError("when fopen a file for new logger", fname, errno));
572 } else {
573 int fd = fileno(f);
574 #ifdef ROCKSDB_FALLOCATE_PRESENT
575 fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024);
576 #endif
577 SetFD_CLOEXEC(fd, nullptr);
578 result->reset(new PosixLogger(f, &gettid, Env::Default()));
579 return IOStatus::OK();
580 }
581 }
582
583 IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/,
584 IODebugContext* /*dbg*/) override {
585 int result = access(fname.c_str(), F_OK);
586
587 if (result == 0) {
588 return IOStatus::OK();
589 }
590
591 int err = errno;
592 switch (err) {
593 case EACCES:
594 case ELOOP:
595 case ENAMETOOLONG:
596 case ENOENT:
597 case ENOTDIR:
598 return IOStatus::NotFound();
599 default:
600 assert(err == EIO || err == ENOMEM);
601 return IOStatus::IOError("Unexpected error(" + ToString(err) +
602 ") accessing file `" + fname + "' ");
603 }
604 }
605
606 IOStatus GetChildren(const std::string& dir, const IOOptions& /*opts*/,
607 std::vector<std::string>* result,
608 IODebugContext* /*dbg*/) override {
609 result->clear();
610 DIR* d = opendir(dir.c_str());
611 if (d == nullptr) {
612 switch (errno) {
613 case EACCES:
614 case ENOENT:
615 case ENOTDIR:
616 return IOStatus::NotFound();
617 default:
618 return IOError("While opendir", dir, errno);
619 }
620 }
621 struct dirent* entry;
622 while ((entry = readdir(d)) != nullptr) {
623 result->push_back(entry->d_name);
624 }
625 closedir(d);
626 return IOStatus::OK();
627 }
628
629 IOStatus DeleteFile(const std::string& fname, const IOOptions& /*opts*/,
630 IODebugContext* /*dbg*/) override {
631 IOStatus result;
632 if (unlink(fname.c_str()) != 0) {
633 result = IOError("while unlink() file", fname, errno);
634 }
635 return result;
636 }
637
638 IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/,
639 IODebugContext* /*dbg*/) override {
640 if (mkdir(name.c_str(), 0755) != 0) {
641 return IOError("While mkdir", name, errno);
642 }
643 return IOStatus::OK();
644 }
645
646 IOStatus CreateDirIfMissing(const std::string& name,
647 const IOOptions& /*opts*/,
648 IODebugContext* /*dbg*/) override {
649 if (mkdir(name.c_str(), 0755) != 0) {
650 if (errno != EEXIST) {
651 return IOError("While mkdir if missing", name, errno);
652 } else if (!DirExists(name)) { // Check that name is actually a
653 // directory.
654 // Message is taken from mkdir
655 return IOStatus::IOError("`" + name +
656 "' exists but is not a directory");
657 }
658 }
659 return IOStatus::OK();
660 }
661
662 IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/,
663 IODebugContext* /*dbg*/) override {
664 if (rmdir(name.c_str()) != 0) {
665 return IOError("file rmdir", name, errno);
666 }
667 return IOStatus::OK();
668 }
669
670 IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
671 uint64_t* size, IODebugContext* /*dbg*/) override {
672 struct stat sbuf;
673 if (stat(fname.c_str(), &sbuf) != 0) {
674 *size = 0;
675 return IOError("while stat a file for size", fname, errno);
676 } else {
677 *size = sbuf.st_size;
678 }
679 return IOStatus::OK();
680 }
681
682 IOStatus GetFileModificationTime(const std::string& fname,
683 const IOOptions& /*opts*/,
684 uint64_t* file_mtime,
685 IODebugContext* /*dbg*/) override {
686 struct stat s;
687 if (stat(fname.c_str(), &s) != 0) {
688 return IOError("while stat a file for modification time", fname, errno);
689 }
690 *file_mtime = static_cast<uint64_t>(s.st_mtime);
691 return IOStatus::OK();
692 }
693
694 IOStatus RenameFile(const std::string& src, const std::string& target,
695 const IOOptions& /*opts*/,
696 IODebugContext* /*dbg*/) override {
697 if (rename(src.c_str(), target.c_str()) != 0) {
698 return IOError("While renaming a file to " + target, src, errno);
699 }
700 return IOStatus::OK();
701 }
702
703 IOStatus LinkFile(const std::string& src, const std::string& target,
704 const IOOptions& /*opts*/,
705 IODebugContext* /*dbg*/) override {
706 if (link(src.c_str(), target.c_str()) != 0) {
707 if (errno == EXDEV) {
708 return IOStatus::NotSupported("No cross FS links allowed");
709 }
710 return IOError("while link file to " + target, src, errno);
711 }
712 return IOStatus::OK();
713 }
714
715 IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/,
716 uint64_t* count, IODebugContext* /*dbg*/) override {
717 struct stat s;
718 if (stat(fname.c_str(), &s) != 0) {
719 return IOError("while stat a file for num file links", fname, errno);
720 }
721 *count = static_cast<uint64_t>(s.st_nlink);
722 return IOStatus::OK();
723 }
724
725 IOStatus AreFilesSame(const std::string& first, const std::string& second,
726 const IOOptions& /*opts*/, bool* res,
727 IODebugContext* /*dbg*/) override {
728 struct stat statbuf[2];
729 if (stat(first.c_str(), &statbuf[0]) != 0) {
730 return IOError("stat file", first, errno);
731 }
732 if (stat(second.c_str(), &statbuf[1]) != 0) {
733 return IOError("stat file", second, errno);
734 }
735
736 if (major(statbuf[0].st_dev) != major(statbuf[1].st_dev) ||
737 minor(statbuf[0].st_dev) != minor(statbuf[1].st_dev) ||
738 statbuf[0].st_ino != statbuf[1].st_ino) {
739 *res = false;
740 } else {
741 *res = true;
742 }
743 return IOStatus::OK();
744 }
745
746 IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/,
747 FileLock** lock, IODebugContext* /*dbg*/) override {
748 *lock = nullptr;
749
750 LockHoldingInfo lhi;
751 int64_t current_time = 0;
752 // Ignore status code as the time is only used for error message.
753 Env::Default()->GetCurrentTime(&current_time).PermitUncheckedError();
754 lhi.acquire_time = current_time;
755 lhi.acquiring_thread = Env::Default()->GetThreadID();
756
757 mutex_locked_files.Lock();
758 // If it already exists in the locked_files set, then it is already locked,
759 // and fail this lock attempt. Otherwise, insert it into locked_files.
760 // This check is needed because fcntl() does not detect lock conflict
761 // if the fcntl is issued by the same thread that earlier acquired
762 // this lock.
763 // We must do this check *before* opening the file:
764 // Otherwise, we will open a new file descriptor. Locks are associated with
765 // a process, not a file descriptor and when *any* file descriptor is
766 // closed, all locks the process holds for that *file* are released
767 const auto it_success = locked_files.insert({fname, lhi});
768 if (it_success.second == false) {
769 mutex_locked_files.Unlock();
770 errno = ENOLCK;
771 LockHoldingInfo& prev_info = it_success.first->second;
772 // Note that the thread ID printed is the same one as the one in
773 // posix logger, but posix logger prints it hex format.
774 return IOError("lock hold by current process, acquire time " +
775 ToString(prev_info.acquire_time) +
776 " acquiring thread " +
777 ToString(prev_info.acquiring_thread),
778 fname, errno);
779 }
780
781 IOStatus result = IOStatus::OK();
782 int fd;
783 int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr);
784
785 {
786 IOSTATS_TIMER_GUARD(open_nanos);
787 fd = open(fname.c_str(), flags, 0644);
788 }
789 if (fd < 0) {
790 result = IOError("while open a file for lock", fname, errno);
791 } else if (LockOrUnlock(fd, true) == -1) {
792 // if there is an error in locking, then remove the pathname from
793 // lockedfiles
794 locked_files.erase(fname);
795 result = IOError("While lock file", fname, errno);
796 close(fd);
797 } else {
798 SetFD_CLOEXEC(fd, nullptr);
799 PosixFileLock* my_lock = new PosixFileLock;
800 my_lock->fd_ = fd;
801 my_lock->filename = fname;
802 *lock = my_lock;
803 }
804
805 mutex_locked_files.Unlock();
806 return result;
807 }
808
809 IOStatus UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
810 IODebugContext* /*dbg*/) override {
811 PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
812 IOStatus result;
813 mutex_locked_files.Lock();
814 // If we are unlocking, then verify that we had locked it earlier,
815 // it should already exist in locked_files. Remove it from locked_files.
816 if (locked_files.erase(my_lock->filename) != 1) {
817 errno = ENOLCK;
818 result = IOError("unlock", my_lock->filename, errno);
819 } else if (LockOrUnlock(my_lock->fd_, false) == -1) {
820 result = IOError("unlock", my_lock->filename, errno);
821 }
822 close(my_lock->fd_);
823 delete my_lock;
824 mutex_locked_files.Unlock();
825 return result;
826 }
827
828 IOStatus GetAbsolutePath(const std::string& db_path,
829 const IOOptions& /*opts*/, std::string* output_path,
830 IODebugContext* /*dbg*/) override {
831 if (!db_path.empty() && db_path[0] == '/') {
832 *output_path = db_path;
833 return IOStatus::OK();
834 }
835
836 char the_path[256];
837 char* ret = getcwd(the_path, 256);
838 if (ret == nullptr) {
839 return IOStatus::IOError(strerror(errno));
840 }
841
842 *output_path = ret;
843 return IOStatus::OK();
844 }
845
846 IOStatus GetTestDirectory(const IOOptions& /*opts*/, std::string* result,
847 IODebugContext* /*dbg*/) override {
848 const char* env = getenv("TEST_TMPDIR");
849 if (env && env[0] != '\0') {
850 *result = env;
851 } else {
852 char buf[100];
853 snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
854 *result = buf;
855 }
856 // Directory may already exist
857 {
858 IOOptions opts;
859 return CreateDirIfMissing(*result, opts, nullptr);
860 }
861 return IOStatus::OK();
862 }
863
864 IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/,
865 uint64_t* free_space,
866 IODebugContext* /*dbg*/) override {
867 struct statvfs sbuf;
868
869 if (statvfs(fname.c_str(), &sbuf) < 0) {
870 return IOError("While doing statvfs", fname, errno);
871 }
872
873 *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree);
874 return IOStatus::OK();
875 }
876
877 IOStatus IsDirectory(const std::string& path, const IOOptions& /*opts*/,
878 bool* is_dir, IODebugContext* /*dbg*/) override {
879 // First open
880 int fd = -1;
881 int flags = cloexec_flags(O_RDONLY, nullptr);
882 {
883 IOSTATS_TIMER_GUARD(open_nanos);
884 fd = open(path.c_str(), flags);
885 }
886 if (fd < 0) {
887 return IOError("While open for IsDirectory()", path, errno);
888 }
889 IOStatus io_s;
890 struct stat sbuf;
891 if (fstat(fd, &sbuf) < 0) {
892 io_s = IOError("While doing stat for IsDirectory()", path, errno);
893 }
894 close(fd);
895 if (io_s.ok() && nullptr != is_dir) {
896 *is_dir = S_ISDIR(sbuf.st_mode);
897 }
898 return io_s;
899 }
900
901 FileOptions OptimizeForLogWrite(const FileOptions& file_options,
902 const DBOptions& db_options) const override {
903 FileOptions optimized = file_options;
904 optimized.use_mmap_writes = false;
905 optimized.use_direct_writes = false;
906 optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
907 // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
908 // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
909 // test and make this false
910 optimized.fallocate_with_keep_size = true;
911 optimized.writable_file_max_buffer_size =
912 db_options.writable_file_max_buffer_size;
913 return optimized;
914 }
915
916 FileOptions OptimizeForManifestWrite(
917 const FileOptions& file_options) const override {
918 FileOptions optimized = file_options;
919 optimized.use_mmap_writes = false;
920 optimized.use_direct_writes = false;
921 optimized.fallocate_with_keep_size = true;
922 return optimized;
923 }
924 #ifdef OS_LINUX
925 Status RegisterDbPaths(const std::vector<std::string>& paths) override {
926 return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths);
927 }
928 Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
929 logical_block_size_cache_.UnrefAndTryRemoveCachedLogicalBlockSize(paths);
930 return Status::OK();
931 }
932 #endif
933 private:
934 bool checkedDiskForMmap_;
935 bool forceMmapOff_; // do we override Env options?
936
937 // Returns true iff the named directory exists and is a directory.
938 virtual bool DirExists(const std::string& dname) {
939 struct stat statbuf;
940 if (stat(dname.c_str(), &statbuf) == 0) {
941 return S_ISDIR(statbuf.st_mode);
942 }
943 return false; // stat() failed return false
944 }
945
// Reports whether the file system holding `path` is one of ext4, XFS or
// tmpfs according to statfs(). Always false when ROCKSDB_FALLOCATE_PRESENT
// is not defined or statfs() fails.
bool SupportsFastAllocate(const std::string& path) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  struct statfs fs_info;
  if (statfs(path.c_str(), &fs_info) != 0) {
    return false;
  }
  switch (fs_info.f_type) {
    case EXT4_SUPER_MAGIC:
    case XFS_SUPER_MAGIC:
    case TMPFS_MAGIC:
      return true;
    default:
      return false;
  }
#else
  (void)path;
  return false;
#endif
}
967
968 #if defined(ROCKSDB_IOURING_PRESENT)
969 // io_uring instance
970 std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
971 #endif
972
973 size_t page_size_;
974
975 // If true, allow non owner read access for db files. Otherwise, non-owner
976 // has no access to db files.
977 bool allow_non_owner_access_;
978
979 #ifdef OS_LINUX
980 static LogicalBlockSizeCache logical_block_size_cache_;
981 #endif
982 static size_t GetLogicalBlockSize(const std::string& fname, int fd);
983 // In non-direct IO mode, this directly returns kDefaultPageSize.
984 // Otherwise call GetLogicalBlockSize.
985 static size_t GetLogicalBlockSizeForReadIfNeeded(const EnvOptions& options,
986 const std::string& fname,
987 int fd);
988 static size_t GetLogicalBlockSizeForWriteIfNeeded(const EnvOptions& options,
989 const std::string& fname,
990 int fd);
991 };
992
993 #ifdef OS_LINUX
994 LogicalBlockSizeCache PosixFileSystem::logical_block_size_cache_;
995 #endif
996
997 size_t PosixFileSystem::GetLogicalBlockSize(const std::string& fname, int fd) {
998 #ifdef OS_LINUX
999 return logical_block_size_cache_.GetLogicalBlockSize(fname, fd);
1000 #else
1001 (void)fname;
1002 return PosixHelper::GetLogicalBlockSizeOfFd(fd);
1003 #endif
1004 }
1005
1006 size_t PosixFileSystem::GetLogicalBlockSizeForReadIfNeeded(
1007 const EnvOptions& options, const std::string& fname, int fd) {
1008 return options.use_direct_reads
1009 ? PosixFileSystem::GetLogicalBlockSize(fname, fd)
1010 : kDefaultPageSize;
1011 }
1012
1013 size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded(
1014 const EnvOptions& options, const std::string& fname, int fd) {
1015 return options.use_direct_writes
1016 ? PosixFileSystem::GetLogicalBlockSize(fname, fd)
1017 : kDefaultPageSize;
1018 }
1019
1020 PosixFileSystem::PosixFileSystem()
1021 : checkedDiskForMmap_(false),
1022 forceMmapOff_(false),
1023 page_size_(getpagesize()),
1024 allow_non_owner_access_(true) {
1025 #if defined(ROCKSDB_IOURING_PRESENT)
1026 // Test whether IOUring is supported, and if it does, create a managing
1027 // object for thread local point so that in the future thread-local
1028 // io_uring can be created.
1029 struct io_uring* new_io_uring = CreateIOUring();
1030 if (new_io_uring != nullptr) {
1031 thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
1032 delete new_io_uring;
1033 }
1034 #endif
1035 }
1036
1037 } // namespace
1038
1039 //
1040 // Default Posix FileSystem
1041 //
1042 std::shared_ptr<FileSystem> FileSystem::Default() {
1043 static PosixFileSystem default_fs;
1044 static std::shared_ptr<PosixFileSystem> default_fs_ptr(
1045 &default_fs, [](PosixFileSystem*) {});
1046 return default_fs_ptr;
1047 }
1048
1049 #ifndef ROCKSDB_LITE
1050 static FactoryFunc<FileSystem> posix_filesystem_reg =
1051 ObjectLibrary::Default()->Register<FileSystem>(
1052 "posix://.*",
1053 [](const std::string& /* uri */, std::unique_ptr<FileSystem>* f,
1054 std::string* /* errmsg */) {
1055 f->reset(new PosixFileSystem());
1056 return f->get();
1057 });
1058 #endif
1059
1060 } // namespace ROCKSDB_NAMESPACE
1061
1062 #endif