// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
#include "util/string_util.h"
#include "util/sync_point.h"
+#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
+#define F_LINUX_SPECIFIC_BASE 1024
+#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
+#endif
+
namespace rocksdb {
// A wrapper for fadvise, if the platform doesn't support fadvise,
#ifdef OS_LINUX
return posix_fadvise(fd, offset, len, advice);
#else
+ (void)fd;
+ (void)offset;
+ (void)len;
+ (void)advice;
return 0; // simply do nothing.
#endif
}
*/
#ifndef NDEBUG
namespace {
-#ifdef OS_LINUX
-const size_t kPageSize = sysconf(_SC_PAGESIZE);
-#else
-const size_t kPageSize = 4 * 1024;
-#endif
bool IsSectorAligned(const size_t off, size_t sector_size) {
return off % sector_size == 0;
}
-static bool IsPageAligned(const void* ptr) {
- return uintptr_t(ptr) % (kPageSize) == 0;
+bool IsSectorAligned(const void* ptr, size_t sector_size) {
+ return uintptr_t(ptr) % sector_size == 0;
}
}
clearerr(file_);
} else {
// A partial read with an error: return a non-ok status
- s = IOError(filename_, errno);
+ s = IOError("While reading file sequentially", filename_, errno);
}
}
- // we need to fadvise away the entire range of pages because
- // we do not want readahead pages to be cached under buffered io
- Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
return s;
}
Status PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
Slice* result, char* scratch) {
+ assert(use_direct_io());
+ assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
+ assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
+ assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
+
Status s;
ssize_t r = -1;
size_t left = n;
char* ptr = scratch;
- assert(use_direct_io());
while (left > 0) {
r = pread(fd_, ptr, left, static_cast<off_t>(offset));
if (r <= 0) {
}
if (r < 0) {
// An error: return a non-ok status
- s = IOError(filename_, errno);
+ s = IOError(
+ "While pread " + ToString(n) + " bytes from offset " + ToString(offset),
+ filename_, errno);
}
*result = Slice(scratch, (r < 0) ? 0 : n - left);
return s;
Status PosixSequentialFile::Skip(uint64_t n) {
if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
- return IOError(filename_, errno);
+ return IOError("While fseek to skip " + ToString(n) + " bytes", filename_,
+ errno);
}
return Status::OK();
}
Status PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
+ (void)offset;
+ (void)length;
return Status::OK();
#else
if (!use_direct_io()) {
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret != 0) {
- return IOError(filename_, errno);
+ return IOError("While fadvise NotNeeded offset " + ToString(offset) +
+ " len " + ToString(length),
+ filename_, errno);
}
}
return Status::OK();
Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
+ if (use_direct_io()) {
+ assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
+ assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
+ assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
+ }
Status s;
ssize_t r = -1;
size_t left = n;
}
if (r < 0) {
// An error: return a non-ok status
- s = IOError(filename_, errno);
+ s = IOError(
+ "While pread offset " + ToString(offset) + " len " + ToString(n),
+ filename_, errno);
}
*result = Slice(scratch, (r < 0) ? 0 : n - left);
return s;
r = fcntl(fd_, F_RDADVISE, &advice);
#endif
if (r == -1) {
- s = IOError(filename_, errno);
+ s = IOError("While prefetching offset " + ToString(offset) + " len " +
+ ToString(n),
+ filename_, errno);
}
}
return s;
return Status::OK();
}
#ifndef OS_LINUX
+ (void)offset;
+ (void)length;
return Status::OK();
#else
// free OS pages
if (ret == 0) {
return Status::OK();
}
- return IOError(filename_, errno);
+ return IOError("While fadvise NotNeeded offset " + ToString(offset) +
+ " len " + ToString(length),
+ filename_, errno);
#endif
}
void* base, size_t length,
const EnvOptions& options)
: fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
+#ifdef NDEBUG
+ (void)options;
+#endif
fd_ = fd_ + 0; // suppress the warning for used variables
assert(options.use_mmap_reads);
assert(!options.use_direct_reads);
fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
mmapped_region_, length_);
}
+ close(fd_);
}
Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
- char* scratch) const {
+ char* /*scratch*/) const {
Status s;
if (offset > length_) {
*result = Slice();
- return IOError(filename_, EINVAL);
+ return IOError("While mmap read offset " + ToString(offset) +
+ " larger than file length " + ToString(length_),
+ filename_, EINVAL);
} else if (offset + n > length_) {
n = static_cast<size_t>(length_ - offset);
}
Status PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
+ (void)offset;
+ (void)length;
return Status::OK();
#else
// free OS pages
if (ret == 0) {
return Status::OK();
}
- return IOError(filename_, errno);
+ return IOError("While fadvise not needed. Offset " + ToString(offset) +
+ " len" + ToString(length),
+ filename_, errno);
#endif
}
if (base_ != nullptr) {
int munmap_status = munmap(base_, limit_ - base_);
if (munmap_status != 0) {
- return IOError(filename_, munmap_status);
+ return IOError("While munmap", filename_, munmap_status);
}
file_offset_ += limit_ - base_;
base_ = nullptr;
last_sync_ = dst_;
TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);
if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
- return IOError(filename_, errno);
+ return IOError("While msync", filename_, errno);
}
return Status::OK();
}
#ifdef ROCKSDB_FALLOCATE_PRESENT
allow_fallocate_ = options.allow_fallocate;
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#else
+ (void)options;
#endif
assert((page_size & (page_size - 1)) == 0);
assert(options.use_mmap_writes);
}
size_t n = (left <= avail) ? left : avail;
+ assert(dst_);
memcpy(dst_, src, n);
dst_ += n;
src += n;
s = UnmapCurrentRegion();
if (!s.ok()) {
- s = IOError(filename_, errno);
+ s = IOError("While closing mmapped file", filename_, errno);
} else if (unused > 0) {
// Trim the extra space at the end of the file
if (ftruncate(fd_, file_offset_ - unused) < 0) {
- s = IOError(filename_, errno);
+ s = IOError("While ftruncating mmaped file", filename_, errno);
}
}
if (close(fd_) < 0) {
if (s.ok()) {
- s = IOError(filename_, errno);
+ s = IOError("While closing mmapped file", filename_, errno);
}
}
Status PosixMmapFile::Sync() {
if (fdatasync(fd_) < 0) {
- return IOError(filename_, errno);
+ return IOError("While fdatasync mmapped file", filename_, errno);
}
return Msync();
*/
Status PosixMmapFile::Fsync() {
if (fsync(fd_) < 0) {
- return IOError(filename_, errno);
+ return IOError("While fsync mmaped file", filename_, errno);
}
return Msync();
Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
+ (void)offset;
+ (void)length;
return Status::OK();
#else
// free OS pages
if (ret == 0) {
return Status::OK();
}
- return IOError(filename_, errno);
+ return IOError("While fadvise NotNeeded mmapped file", filename_, errno);
#endif
}
if (alloc_status == 0) {
return Status::OK();
} else {
- return IOError(filename_, errno);
+ return IOError(
+ "While fallocate offset " + ToString(offset) + " len " + ToString(len),
+ filename_, errno);
}
}
#endif
}
Status PosixWritableFile::Append(const Slice& data) {
- assert(!use_direct_io() ||
- (IsSectorAligned(data.size(), GetRequiredBufferAlignment()) &&
- IsPageAligned(data.data())));
+ if (use_direct_io()) {
+ assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
+ assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
+ }
const char* src = data.data();
size_t left = data.size();
while (left != 0) {
if (errno == EINTR) {
continue;
}
- return IOError(filename_, errno);
+ return IOError("While appending to file", filename_, errno);
}
left -= done;
src += done;
}
Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
- assert(use_direct_io() &&
- IsSectorAligned(offset, GetRequiredBufferAlignment()) &&
- IsSectorAligned(data.size(), GetRequiredBufferAlignment()) &&
- IsPageAligned(data.data()));
+ if (use_direct_io()) {
+ assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
+ assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
+ assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
+ }
assert(offset <= std::numeric_limits<off_t>::max());
const char* src = data.data();
size_t left = data.size();
if (errno == EINTR) {
continue;
}
- return IOError(filename_, errno);
+ return IOError("While pwrite to file at offset " + ToString(offset),
+ filename_, errno);
}
left -= done;
offset += done;
Status s;
int r = ftruncate(fd_, size);
if (r < 0) {
- s = IOError(filename_, errno);
+ s = IOError("While ftruncate file to size " + ToString(size), filename_,
+ errno);
} else {
filesize_ = size;
}
// trim the extra space preallocated at the end of the file
// NOTE(ljin): we probably don't want to surface failure as an IOError,
// but it will be nice to log these errors.
- int dummy __attribute__((unused));
+ int dummy __attribute__((__unused__));
dummy = ftruncate(fd_, filesize_);
#if defined(ROCKSDB_FALLOCATE_PRESENT) && !defined(TRAVIS)
// in some file systems, ftruncate only trims trailing space if the
// While we work with Travis-CI team to figure out if this is a
// quirk of Docker/AUFS, we will comment this out.
struct stat file_stats;
- fstat(fd_, &file_stats);
+ int result = fstat(fd_, &file_stats);
// After ftruncate, we check whether ftruncate has the correct behavior.
// If not, we should hack it with FALLOC_FL_PUNCH_HOLE
- if ((file_stats.st_size + file_stats.st_blksize - 1) /
+ if (result == 0 &&
+ (file_stats.st_size + file_stats.st_blksize - 1) /
file_stats.st_blksize !=
file_stats.st_blocks / (file_stats.st_blksize / 512)) {
IOSTATS_TIMER_GUARD(allocate_nanos);
}
if (close(fd_) < 0) {
- s = IOError(filename_, errno);
+ s = IOError("While closing file after writing", filename_, errno);
}
fd_ = -1;
return s;
Status PosixWritableFile::Sync() {
if (fdatasync(fd_) < 0) {
- return IOError(filename_, errno);
+ return IOError("While fdatasync", filename_, errno);
}
return Status::OK();
}
Status PosixWritableFile::Fsync() {
if (fsync(fd_) < 0) {
- return IOError(filename_, errno);
+ return IOError("While fsync", filename_, errno);
}
return Status::OK();
}
uint64_t PosixWritableFile::GetFileSize() { return filesize_; }
+void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+#ifdef OS_LINUX
+// Suppress Valgrind "Unimplemented functionality" error.
+#ifndef ROCKSDB_VALGRIND_RUN
+ if (hint == write_hint_) {
+ return;
+ }
+ if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) {
+ write_hint_ = hint;
+ }
+#else
+ (void)hint;
+#endif // ROCKSDB_VALGRIND_RUN
+#else
+ (void)hint;
+#endif // OS_LINUX
+}
+
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
if (use_direct_io()) {
return Status::OK();
}
#ifndef OS_LINUX
+ (void)offset;
+ (void)length;
return Status::OK();
#else
// free OS pages
if (ret == 0) {
return Status::OK();
}
- return IOError(filename_, errno);
+ return IOError("While fadvise NotNeeded", filename_, errno);
#endif
}
if (alloc_status == 0) {
return Status::OK();
} else {
- return IOError(filename_, errno);
+ return IOError(
+ "While fallocate offset " + ToString(offset) + " len " + ToString(len),
+ filename_, errno);
}
}
#endif
static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE) == 0) {
return Status::OK();
} else {
- return IOError(filename_, errno);
+ return IOError("While sync_file_range offset " + ToString(offset) +
+ " bytes " + ToString(nbytes),
+ filename_, errno);
}
}
#endif
*/
PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
- const EnvOptions& options)
+ const EnvOptions& /*options*/)
: filename_(fname), fd_(fd) {}
PosixRandomRWFile::~PosixRandomRWFile() {
// write was interrupted, try again.
continue;
}
- return IOError(filename_, errno);
+ return IOError(
+ "While write random read/write file at offset " + ToString(offset),
+ filename_, errno);
}
// Wrote `done` bytes
// read was interrupted, try again.
continue;
}
- return IOError(filename_, errno);
+ return IOError("While reading random read/write file offset " +
+ ToString(offset) + " len " + ToString(n),
+ filename_, errno);
} else if (done == 0) {
// Nothing more to read
break;
Status PosixRandomRWFile::Sync() {
if (fdatasync(fd_) < 0) {
- return IOError(filename_, errno);
+ return IOError("While fdatasync random read/write file", filename_, errno);
}
return Status::OK();
}
Status PosixRandomRWFile::Fsync() {
if (fsync(fd_) < 0) {
- return IOError(filename_, errno);
+ return IOError("While fsync random read/write file", filename_, errno);
}
return Status::OK();
}
Status PosixRandomRWFile::Close() {
if (close(fd_) < 0) {
- return IOError(filename_, errno);
+ return IOError("While close random read/write file", filename_, errno);
}
fd_ = -1;
return Status::OK();
}
+PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
+ // TODO should have error handling though not much we can do...
+ munmap(this->base_, length_);
+}
+
/*
* PosixDirectory
*/
Status PosixDirectory::Fsync() {
#ifndef OS_AIX
if (fsync(fd_) == -1) {
- return IOError("directory", errno);
+ return IOError("While fsync", "a directory", errno);
}
#endif
return Status::OK();