// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "file/writable_file_writer.h"

#include <algorithm>
#include <mutex>

#include "db/version_edit.h"
#include "monitoring/histogram.h"
#include "monitoring/iostats_context_imp.h"
#include "port/port.h"
#include "test_util/sync_point.h"
#include "util/random.h"
#include "util/rate_limiter.h"

namespace ROCKSDB_NAMESPACE {
Status WritableFileWriter::Append(const Slice& data) {
  const char* src = data.data();
  size_t left = data.size();
  Status s;
  pending_sync_ = true;

  TEST_KILL_RANDOM("WritableFileWriter::Append:0",
                   rocksdb_kill_odds * REDUCE_ODDS2);

  {
    IOSTATS_TIMER_GUARD(prepare_write_nanos);
    TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
    writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left,
                                 IOOptions(), nullptr);
  }

  // See whether we need to enlarge the buffer to avoid the flush
  if (buf_.Capacity() - buf_.CurrentSize() < left) {
    for (size_t cap = buf_.Capacity();
         cap < max_buffer_size_;  // There is still room to increase
         cap *= 2) {
      // See whether the next available size is large enough.
      // Buffer will never be increased to more than max_buffer_size_.
      size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
      if (desired_capacity - buf_.CurrentSize() >= left ||
          (use_direct_io() && desired_capacity == max_buffer_size_)) {
        buf_.AllocateNewBuffer(desired_capacity, true);
        break;
      }
    }
  }
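  // Illustration of the growth above (hypothetical numbers, not from the
  // source): with buf_.Capacity() == 64 KB, buf_.CurrentSize() == 60 KB,
  // left == 100 KB and max_buffer_size_ == 1 MB, the loop doubles the
  // candidate capacity (128 KB, 256 KB, ...) until
  // desired_capacity - CurrentSize() >= left, settling on 256 KB so the
  // incoming chunk fits without a flush.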

  // Flush only when buffered I/O
  if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
    if (buf_.CurrentSize() > 0) {
      s = Flush();
      if (!s.ok()) {
        return s;
      }
    }
    assert(buf_.CurrentSize() == 0);
  }

  // With direct I/O we never bypass the buffer; otherwise we use the buffer
  // for its original purpose: to accumulate many small chunks.
  if (use_direct_io() || (buf_.Capacity() >= left)) {
    while (left > 0) {
      size_t appended = buf_.Append(src, left);
      left -= appended;
      src += appended;

      if (left > 0) {
        s = Flush();
        if (!s.ok()) {
          break;
        }
      }
    }
  } else {
    // Writing directly to the file, bypassing the buffer
    assert(buf_.CurrentSize() == 0);
    s = WriteBuffered(src, left);
  }

  TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds);
  if (s.ok()) {
    filesize_ += data.size();
    CalculateFileChecksum(data);
  }
  return s;
}
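
// Hedged usage sketch (illustrative only, not part of the original file):
// callers typically wrap an FSWritableFile and drive the writer roughly as
//
//   std::unique_ptr<WritableFileWriter> writer(...);
//   writer->Append(Slice("payload"));    // accumulated in buf_
//   writer->Flush();                     // push to the OS cache / storage
//   writer->Sync(false /* use_fsync */);
//   writer->Close();
//
// The Status returned by each call should be checked, and Close() should
// still be called after a failure so the underlying file is released.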

Status WritableFileWriter::Pad(const size_t pad_bytes) {
  assert(pad_bytes < kDefaultPageSize);
  size_t left = pad_bytes;
  size_t cap = buf_.Capacity() - buf_.CurrentSize();

  // Assume pad_bytes is small compared to buf_ capacity, so we always go
  // through buf_ rather than writing directly to the file as Append() does in
  // certain cases.
  while (left) {
    size_t append_bytes = std::min(cap, left);
    buf_.PadWith(append_bytes, 0);
    left -= append_bytes;
    if (left > 0) {
      Status s = Flush();
      if (!s.ok()) {
        return s;
      }
    }
    cap = buf_.Capacity() - buf_.CurrentSize();
  }
  pending_sync_ = true;
  filesize_ += pad_bytes;
  return Status::OK();
}
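
// Illustration of Pad() (hypothetical numbers): if 100 bytes of padding are
// requested but only 40 bytes of buffer capacity remain, the loop pads 40
// bytes, calls Flush() to empty the buffer, and then pads the remaining 60.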

Status WritableFileWriter::Close() {
  // Do not quit immediately on failure; the file MUST be closed.
  Status s;

  // It is possible to close the file twice now, as we MUST close it in the
  // destructor; simply flushing is not enough. On Windows, pre-allocation
  // does not fill with zeros, and with unbuffered access we also have to set
  // the end of the data.
  if (!writable_file_) {
    return s;
  }

  s = Flush();  // flush cache to OS

  Status interim;
  // In direct I/O mode we write whole pages, so we need to let the file know
  // where the data ends.
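  // Illustration (hypothetical numbers): if filesize_ == 10,000 bytes but
  // direct I/O has written whole 4 KB pages, the physical file is 12,288
  // bytes; the Truncate() below trims it back to the logical 10,000 bytes.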
  if (use_direct_io()) {
    interim = writable_file_->Truncate(filesize_, IOOptions(), nullptr);
    if (interim.ok()) {
      interim = writable_file_->Fsync(IOOptions(), nullptr);
    }
    if (!interim.ok() && s.ok()) {
      s = interim;
    }
  }

  TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds);
  interim = writable_file_->Close(IOOptions(), nullptr);
  if (!interim.ok() && s.ok()) {
    s = interim;
  }

  writable_file_.reset();
  TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds);

  return s;
}

// Write out the cached data to the OS cache, or to storage if direct I/O is
// enabled.
Status WritableFileWriter::Flush() {
  Status s;
  TEST_KILL_RANDOM("WritableFileWriter::Flush:0",
                   rocksdb_kill_odds * REDUCE_ODDS2);

  if (buf_.CurrentSize() > 0) {
    if (use_direct_io()) {
#ifndef ROCKSDB_LITE
      if (pending_sync_) {
        s = WriteDirect();
      }
#endif  // !ROCKSDB_LITE
    } else {
      s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
    }
    if (!s.ok()) {
      return s;
    }
  }

  s = writable_file_->Flush(IOOptions(), nullptr);

  if (!s.ok()) {
    return s;
  }

  // Sync the OS cache to disk every bytes_per_sync_ bytes.
  // TODO: give log files and SST files different options (log files could
  // potentially be cached in the OS for their whole lifetime, so we might not
  // want to flush them at all).

  // We try to avoid syncing the last 1MB of data, for two reasons:
  // (1) to avoid rewriting the same page that is modified later;
  // (2) on older versions of the OS, the write can block while writing out
  //     the page.
  // XFS does neighbor-page flushing outside of the specified ranges, so we
  // need to make sure the sync range is far from the write offset.
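  // Worked example (hypothetical numbers): with bytes_per_sync_ == 1 MB,
  // filesize_ == 5 MB and last_sync_size_ == 2 MB, offset_sync_to becomes
  // 4 MB (already 4 KB aligned); since 4 MB - 2 MB >= 1 MB, RangeSync() is
  // issued for the 2 MB..4 MB range and last_sync_size_ advances to 4 MB.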
  if (!use_direct_io() && bytes_per_sync_) {
    const uint64_t kBytesNotSyncRange =
        1024 * 1024;  // recent 1MB is not synced.
    const uint64_t kBytesAlignWhenSync = 4 * 1024;  // Align 4KB.
    if (filesize_ > kBytesNotSyncRange) {
      uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
      offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
      assert(offset_sync_to >= last_sync_size_);
      if (offset_sync_to > 0 &&
          offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
        s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
        last_sync_size_ = offset_sync_to;
      }
    }
  }

  return s;
}

const char* WritableFileWriter::GetFileChecksumFuncName() const {
  if (checksum_func_ != nullptr) {
    return checksum_func_->Name();
  } else {
    return kUnknownFileChecksumFuncName.c_str();
  }
}

Status WritableFileWriter::Sync(bool use_fsync) {
  Status s = Flush();
  if (!s.ok()) {
    return s;
  }
  TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds);
  if (!use_direct_io() && pending_sync_) {
    s = SyncInternal(use_fsync);
    if (!s.ok()) {
      return s;
    }
  }
  TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds);
  pending_sync_ = false;
  return Status::OK();
}

Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) {
  if (!writable_file_->IsSyncThreadSafe()) {
    return Status::NotSupported(
        "Can't WritableFileWriter::SyncWithoutFlush() because "
        "WritableFile::IsSyncThreadSafe() is false");
  }
  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
  Status s = SyncInternal(use_fsync);
  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
  return s;
}

Status WritableFileWriter::SyncInternal(bool use_fsync) {
  Status s;
  IOSTATS_TIMER_GUARD(fsync_nanos);
  TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
  auto prev_perf_level = GetPerfLevel();
  IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_);
  if (use_fsync) {
    s = writable_file_->Fsync(IOOptions(), nullptr);
  } else {
    s = writable_file_->Sync(IOOptions(), nullptr);
  }
  SetPerfLevel(prev_perf_level);
  return s;
}

Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) {
  IOSTATS_TIMER_GUARD(range_sync_nanos);
  TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
  return writable_file_->RangeSync(offset, nbytes, IOOptions(), nullptr);
}

// This method writes the specified data to disk, making use of the rate
// limiter if one is available.
Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
  Status s;
  assert(!use_direct_io());
  const char* src = data;
  size_t left = size;

  while (left > 0) {
    size_t allowed;
    if (rate_limiter_ != nullptr) {
      allowed = rate_limiter_->RequestToken(
          left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_,
          RateLimiter::OpType::kWrite);
    } else {
      allowed = left;
    }
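    // Illustration (hypothetical numbers): with a rate limiter refilling
    // roughly 1 MB at a time and a 4 MB request, RequestToken() may grant
    // only part of `left`, so the outer loop issues several smaller
    // Append() calls until the whole payload has been written.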

    {
      IOSTATS_TIMER_GUARD(write_nanos);
      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");

#ifndef ROCKSDB_LITE
      FileOperationInfo::TimePoint start_ts;
      uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr);
      if (ShouldNotifyListeners()) {
        start_ts = std::chrono::system_clock::now();
        old_size = next_write_offset_;
      }
#endif
      {
        auto prev_perf_level = GetPerfLevel();
        IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_);
        s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr);
        SetPerfLevel(prev_perf_level);
      }
#ifndef ROCKSDB_LITE
      if (ShouldNotifyListeners()) {
        auto finish_ts = std::chrono::system_clock::now();
        NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s);
      }
#endif
      if (!s.ok()) {
        return s;
      }
    }

    IOSTATS_ADD(bytes_written, allowed);
    TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds);

    left -= allowed;
    src += allowed;
  }
  buf_.Size(0);
  return s;
}

void WritableFileWriter::CalculateFileChecksum(const Slice& data) {
  if (checksum_func_ != nullptr) {
    if (is_first_checksum_) {
      file_checksum_ = checksum_func_->Value(data.data(), data.size());
      is_first_checksum_ = false;
    } else {
      file_checksum_ =
          checksum_func_->Extend(file_checksum_, data.data(), data.size());
    }
  }
}
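
// Sketch of the intent above (hedged reading of the code, not a contract from
// the original file): for a file appended as chunks c1, c2, c3 the running
// checksum is
//   Value(c1), then Extend(Value(c1), c2), then Extend(..., c3),
// so file_checksum_ incrementally covers every appended chunk without
// buffering the whole file.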

// This flushes the accumulated data in the buffer. We pad the data with zeros
// to a whole page if necessary; during automatic flushes the padding is not
// needed. We always use the RateLimiter if one is available. Any buffer bytes
// left over beyond the whole number of pages are moved (refit) to the front of
// the buffer and written again on the next flush, because we can only write at
// aligned offsets.
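//
// Worked example (hypothetical numbers): with a 4096-byte alignment and
// 10,000 bytes in the buffer, file_advance = TruncateToPageBoundary(4096,
// 10000) = 8192 and leftover_tail = 1808; the buffer is padded to 12,288
// bytes for the positioned write, and afterwards the 1808-byte tail is refit
// to the front of the buffer to be rewritten by a later flush.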
#ifndef ROCKSDB_LITE
Status WritableFileWriter::WriteDirect() {
  assert(use_direct_io());
  Status s;
  const size_t alignment = buf_.Alignment();
  assert((next_write_offset_ % alignment) == 0);

  // Calculate whole page final file advance if all writes succeed
  size_t file_advance = TruncateToPageBoundary(alignment, buf_.CurrentSize());

  // Calculate the leftover tail. We write it here padded with zeros, BUT we
  // will write it again in the future, either on Close() or when the current
  // whole page fills out.
  size_t leftover_tail = buf_.CurrentSize() - file_advance;

  // Round up and pad
  buf_.PadToAlignmentWith(0);

  const char* src = buf_.BufferStart();
  uint64_t write_offset = next_write_offset_;
  size_t left = buf_.CurrentSize();

  while (left > 0) {
    // Check how much is allowed
    size_t size;
    if (rate_limiter_ != nullptr) {
      size = rate_limiter_->RequestToken(left, buf_.Alignment(),
                                         writable_file_->GetIOPriority(),
                                         stats_, RateLimiter::OpType::kWrite);
    } else {
      size = left;
    }

    {
      IOSTATS_TIMER_GUARD(write_nanos);
      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
      FileOperationInfo::TimePoint start_ts;
      if (ShouldNotifyListeners()) {
        start_ts = std::chrono::system_clock::now();
      }
      // direct writes must be positional
      s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
                                           IOOptions(), nullptr);
      if (ShouldNotifyListeners()) {
        auto finish_ts = std::chrono::system_clock::now();
        NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s);
      }
      if (!s.ok()) {
        buf_.Size(file_advance + leftover_tail);
        return s;
      }
    }

    IOSTATS_ADD(bytes_written, size);
    left -= size;
    src += size;
    write_offset += size;
    assert((next_write_offset_ % alignment) == 0);
  }

  if (s.ok()) {
    // Move the tail to the beginning of the buffer. This never happens during
    // a normal Append, only during an explicit call to Flush()/Sync() or
    // Close().
    buf_.RefitTail(file_advance, leftover_tail);
    // This is where we start writing next time, which may or may not be the
    // actual file size on disk. They match if the buffer size is a multiple of
    // whole pages; otherwise filesize_ is leftover_tail bytes behind.
    next_write_offset_ += file_advance;
  }
  return s;
}
#endif  // !ROCKSDB_LITE
}  // namespace ROCKSDB_NAMESPACE