]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/file/delete_scheduler.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / file / delete_scheduler.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6#ifndef ROCKSDB_LITE
7
f67539c2 8#include "file/delete_scheduler.h"
7c673cae 9
20effc67 10#include <cinttypes>
7c673cae
FG
11#include <thread>
12#include <vector>
13
f67539c2
TL
14#include "file/sst_file_manager_impl.h"
15#include "logging/logging.h"
7c673cae
FG
16#include "port/port.h"
17#include "rocksdb/env.h"
1e59de90
TL
18#include "rocksdb/file_system.h"
19#include "rocksdb/system_clock.h"
f67539c2 20#include "test_util/sync_point.h"
7c673cae 21#include "util/mutexlock.h"
7c673cae 22
f67539c2 23namespace ROCKSDB_NAMESPACE {
7c673cae 24
1e59de90 25DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs,
f67539c2 26 int64_t rate_bytes_per_sec, Logger* info_log,
11fdf7f2
TL
27 SstFileManagerImpl* sst_file_manager,
28 double max_trash_db_ratio,
29 uint64_t bytes_max_delete_chunk)
1e59de90 30 : clock_(clock),
f67539c2 31 fs_(fs),
11fdf7f2 32 total_trash_size_(0),
7c673cae
FG
33 rate_bytes_per_sec_(rate_bytes_per_sec),
34 pending_files_(0),
11fdf7f2 35 bytes_max_delete_chunk_(bytes_max_delete_chunk),
7c673cae
FG
36 closing_(false),
37 cv_(&mu_),
20effc67 38 bg_thread_(nullptr),
7c673cae 39 info_log_(info_log),
11fdf7f2
TL
40 sst_file_manager_(sst_file_manager),
41 max_trash_db_ratio_(max_trash_db_ratio) {
42 assert(sst_file_manager != nullptr);
43 assert(max_trash_db_ratio >= 0);
20effc67 44 MaybeCreateBackgroundThread();
7c673cae
FG
45}
46
47DeleteScheduler::~DeleteScheduler() {
48 {
49 InstrumentedMutexLock l(&mu_);
50 closing_ = true;
51 cv_.SignalAll();
52 }
53 if (bg_thread_) {
54 bg_thread_->join();
55 }
1e59de90
TL
56 for (const auto& it : bg_errors_) {
57 it.second.PermitUncheckedError();
58 }
7c673cae
FG
59}
60
11fdf7f2 61Status DeleteScheduler::DeleteFile(const std::string& file_path,
494da23a
TL
62 const std::string& dir_to_sync,
63 const bool force_bg) {
1e59de90
TL
64 if (rate_bytes_per_sec_.load() <= 0 ||
65 (!force_bg &&
66 total_trash_size_.load() >
67 sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) {
11fdf7f2
TL
68 // Rate limiting is disabled or trash size makes up more than
69 // max_trash_db_ratio_ (default 25%) of the total DB size
7c673cae 70 TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
1e59de90 71 Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
11fdf7f2 72 if (s.ok()) {
20effc67
TL
73 s = sst_file_manager_->OnDeleteFile(file_path);
74 ROCKS_LOG_INFO(info_log_,
75 "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64
76 ", total_trash_size %" PRIu64 " max_trash_db_ratio %lf",
77 file_path.c_str(), rate_bytes_per_sec_.load(),
78 total_trash_size_.load(), max_trash_db_ratio_.load());
79 InstrumentedMutexLock l(&mu_);
80 RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
7c673cae
FG
81 }
82 return s;
83 }
84
85 // Move file to trash
11fdf7f2 86 std::string trash_file;
1e59de90 87 Status s = MarkAsTrash(file_path, &trash_file);
20effc67
TL
88 ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(),
89 s.ToString().c_str());
11fdf7f2 90
7c673cae 91 if (!s.ok()) {
f67539c2
TL
92 ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s",
93 file_path.c_str(), s.ToString().c_str());
94 s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
11fdf7f2 95 if (s.ok()) {
20effc67
TL
96 s = sst_file_manager_->OnDeleteFile(file_path);
97 ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately",
98 trash_file.c_str());
99 InstrumentedMutexLock l(&mu_);
100 RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
7c673cae
FG
101 }
102 return s;
103 }
104
11fdf7f2
TL
105 // Update the total trash size
106 uint64_t trash_file_size = 0;
1e59de90
TL
107 IOStatus io_s =
108 fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
109 if (io_s.ok()) {
110 total_trash_size_.fetch_add(trash_file_size);
111 }
112 //**TODO: What should we do if we failed to
113 // get the file size?
11fdf7f2 114
7c673cae
FG
115 // Add file to delete queue
116 {
117 InstrumentedMutexLock l(&mu_);
20effc67 118 RecordTick(stats_.get(), FILES_MARKED_TRASH);
11fdf7f2 119 queue_.emplace(trash_file, dir_to_sync);
7c673cae
FG
120 pending_files_++;
121 if (pending_files_ == 1) {
122 cv_.SignalAll();
123 }
124 }
125 return s;
126}
127
128std::map<std::string, Status> DeleteScheduler::GetBackgroundErrors() {
129 InstrumentedMutexLock l(&mu_);
130 return bg_errors_;
131}
132
11fdf7f2
TL
133const std::string DeleteScheduler::kTrashExtension = ".trash";
134bool DeleteScheduler::IsTrashFile(const std::string& file_path) {
135 return (file_path.size() >= kTrashExtension.size() &&
136 file_path.rfind(kTrashExtension) ==
137 file_path.size() - kTrashExtension.size());
138}
139
140Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
141 const std::string& path) {
7c673cae 142 Status s;
11fdf7f2
TL
143 // Check if there are any files marked as trash in this path
144 std::vector<std::string> files_in_path;
1e59de90
TL
145 const auto& fs = env->GetFileSystem();
146 IOOptions io_opts;
147 io_opts.do_not_recurse = true;
148 s = fs->GetChildren(path, io_opts, &files_in_path,
149 /*IODebugContext*=*/nullptr);
11fdf7f2
TL
150 if (!s.ok()) {
151 return s;
152 }
153 for (const std::string& current_file : files_in_path) {
154 if (!DeleteScheduler::IsTrashFile(current_file)) {
155 // not a trash file, skip
156 continue;
157 }
158
159 Status file_delete;
160 std::string trash_file = path + "/" + current_file;
161 if (sfm) {
162 // We have an SstFileManager that will schedule the file delete
20effc67 163 s = sfm->OnAddFile(trash_file);
11fdf7f2
TL
164 file_delete = sfm->ScheduleFileDeletion(trash_file, path);
165 } else {
166 // Delete the file immediately
167 file_delete = env->DeleteFile(trash_file);
168 }
169
170 if (s.ok() && !file_delete.ok()) {
171 s = file_delete;
172 }
173 }
174
175 return s;
176}
177
178Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
179 std::string* trash_file) {
180 // Sanity check of the path
7c673cae
FG
181 size_t idx = file_path.rfind("/");
182 if (idx == std::string::npos || idx == file_path.size() - 1) {
183 return Status::InvalidArgument("file_path is corrupted");
184 }
7c673cae 185
11fdf7f2
TL
186 if (DeleteScheduler::IsTrashFile(file_path)) {
187 // This is already a trash file
188 *trash_file = file_path;
1e59de90 189 return Status::OK();
7c673cae
FG
190 }
191
11fdf7f2 192 *trash_file = file_path + kTrashExtension;
7c673cae
FG
193 // TODO(tec) : Implement Env::RenameFileIfNotExist and remove
194 // file_move_mu mutex.
11fdf7f2 195 int cnt = 0;
1e59de90 196 Status s;
7c673cae
FG
197 InstrumentedMutexLock l(&file_move_mu_);
198 while (true) {
f67539c2 199 s = fs_->FileExists(*trash_file, IOOptions(), nullptr);
7c673cae
FG
200 if (s.IsNotFound()) {
201 // We found a path for our file in trash
f67539c2 202 s = fs_->RenameFile(file_path, *trash_file, IOOptions(), nullptr);
7c673cae
FG
203 break;
204 } else if (s.ok()) {
205 // Name conflict, generate new random suffix
11fdf7f2 206 *trash_file = file_path + std::to_string(cnt) + kTrashExtension;
7c673cae
FG
207 } else {
208 // Error during FileExists call, we cannot continue
209 break;
210 }
11fdf7f2 211 cnt++;
7c673cae 212 }
11fdf7f2 213 if (s.ok()) {
1e59de90 214 s = sst_file_manager_->OnMoveFile(file_path, *trash_file);
7c673cae
FG
215 }
216 return s;
217}
218
219void DeleteScheduler::BackgroundEmptyTrash() {
220 TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash");
221
222 while (true) {
223 InstrumentedMutexLock l(&mu_);
224 while (queue_.empty() && !closing_) {
225 cv_.Wait();
226 }
227
228 if (closing_) {
229 return;
230 }
231
232 // Delete all files in queue_
1e59de90 233 uint64_t start_time = clock_->NowMicros();
7c673cae
FG
234 uint64_t total_deleted_bytes = 0;
235 int64_t current_delete_rate = rate_bytes_per_sec_.load();
236 while (!queue_.empty() && !closing_) {
237 if (current_delete_rate != rate_bytes_per_sec_.load()) {
238 // User changed the delete rate
239 current_delete_rate = rate_bytes_per_sec_.load();
1e59de90 240 start_time = clock_->NowMicros();
7c673cae 241 total_deleted_bytes = 0;
20effc67
TL
242 ROCKS_LOG_INFO(info_log_, "rate_bytes_per_sec is changed to %" PRIi64,
243 current_delete_rate);
7c673cae
FG
244 }
245
246 // Get new file to delete
11fdf7f2
TL
247 const FileAndDir& fad = queue_.front();
248 std::string path_in_trash = fad.fname;
7c673cae 249
20effc67 250 // We don't need to hold the lock while deleting the file
7c673cae
FG
251 mu_.Unlock();
252 uint64_t deleted_bytes = 0;
11fdf7f2 253 bool is_complete = true;
7c673cae 254 // Delete file from trash and update total_penlty value
11fdf7f2
TL
255 Status s =
256 DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
7c673cae
FG
257 total_deleted_bytes += deleted_bytes;
258 mu_.Lock();
11fdf7f2
TL
259 if (is_complete) {
260 queue_.pop();
261 }
7c673cae
FG
262
263 if (!s.ok()) {
264 bg_errors_[path_in_trash] = s;
265 }
266
20effc67
TL
267 // Apply penalty if necessary
268 uint64_t total_penalty;
7c673cae
FG
269 if (current_delete_rate > 0) {
270 // rate limiting is enabled
20effc67 271 total_penalty =
7c673cae 272 ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate);
20effc67
TL
273 ROCKS_LOG_INFO(info_log_,
274 "Rate limiting is enabled with penalty %" PRIu64
275 " after deleting file %s",
276 total_penalty, path_in_trash.c_str());
277 while (!closing_ && !cv_.TimedWait(start_time + total_penalty)) {
278 }
7c673cae
FG
279 } else {
280 // rate limiting is disabled
20effc67
TL
281 total_penalty = 0;
282 ROCKS_LOG_INFO(info_log_,
283 "Rate limiting is disabled after deleting file %s",
284 path_in_trash.c_str());
7c673cae
FG
285 }
286 TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
20effc67 287 &total_penalty);
7c673cae 288
11fdf7f2
TL
289 if (is_complete) {
290 pending_files_--;
291 }
7c673cae
FG
292 if (pending_files_ == 0) {
293 // Unblock WaitForEmptyTrash since there are no more files waiting
294 // to be deleted
295 cv_.SignalAll();
296 }
297 }
298 }
299}
300
301Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
11fdf7f2
TL
302 const std::string& dir_to_sync,
303 uint64_t* deleted_bytes,
304 bool* is_complete) {
7c673cae 305 uint64_t file_size;
f67539c2 306 Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
11fdf7f2
TL
307 *is_complete = true;
308 TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
7c673cae 309 if (s.ok()) {
11fdf7f2
TL
310 bool need_full_delete = true;
311 if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
312 uint64_t num_hard_links = 2;
313 // We don't have to worry aobut data race between linking a new
314 // file after the number of file link check and ftruncte because
315 // the file is now in trash and no hardlink is supposed to create
316 // to trash files by RocksDB.
f67539c2
TL
317 Status my_status = fs_->NumFileLinks(path_in_trash, IOOptions(),
318 &num_hard_links, nullptr);
11fdf7f2
TL
319 if (my_status.ok()) {
320 if (num_hard_links == 1) {
f67539c2 321 std::unique_ptr<FSWritableFile> wf;
1e59de90
TL
322 my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(), &wf,
323 nullptr);
11fdf7f2 324 if (my_status.ok()) {
f67539c2
TL
325 my_status = wf->Truncate(file_size - bytes_max_delete_chunk_,
326 IOOptions(), nullptr);
11fdf7f2
TL
327 if (my_status.ok()) {
328 TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
f67539c2 329 my_status = wf->Fsync(IOOptions(), nullptr);
11fdf7f2
TL
330 }
331 }
332 if (my_status.ok()) {
333 *deleted_bytes = bytes_max_delete_chunk_;
334 need_full_delete = false;
335 *is_complete = false;
336 } else {
337 ROCKS_LOG_WARN(info_log_,
338 "Failed to partially delete %s from trash -- %s",
339 path_in_trash.c_str(), my_status.ToString().c_str());
340 }
341 } else {
342 ROCKS_LOG_INFO(info_log_,
343 "Cannot delete %s slowly through ftruncate from trash "
344 "as it has other links",
345 path_in_trash.c_str());
346 }
347 } else if (!num_link_error_printed_) {
348 ROCKS_LOG_INFO(
349 info_log_,
350 "Cannot delete files slowly through ftruncate from trash "
351 "as Env::NumFileLinks() returns error: %s",
352 my_status.ToString().c_str());
353 num_link_error_printed_ = true;
354 }
355 }
7c673cae 356
11fdf7f2 357 if (need_full_delete) {
f67539c2 358 s = fs_->DeleteFile(path_in_trash, IOOptions(), nullptr);
11fdf7f2 359 if (!dir_to_sync.empty()) {
f67539c2 360 std::unique_ptr<FSDirectory> dir_obj;
11fdf7f2 361 if (s.ok()) {
f67539c2 362 s = fs_->NewDirectory(dir_to_sync, IOOptions(), &dir_obj, nullptr);
11fdf7f2
TL
363 }
364 if (s.ok()) {
1e59de90
TL
365 s = dir_obj->FsyncWithDirOptions(
366 IOOptions(), nullptr,
367 DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted));
11fdf7f2
TL
368 TEST_SYNC_POINT_CALLBACK(
369 "DeleteScheduler::DeleteTrashFile::AfterSyncDir",
370 reinterpret_cast<void*>(const_cast<std::string*>(&dir_to_sync)));
371 }
372 }
20effc67
TL
373 if (s.ok()) {
374 *deleted_bytes = file_size;
375 s = sst_file_manager_->OnDeleteFile(path_in_trash);
376 }
11fdf7f2
TL
377 }
378 }
7c673cae
FG
379 if (!s.ok()) {
380 // Error while getting file size or while deleting
381 ROCKS_LOG_ERROR(info_log_, "Failed to delete %s from trash -- %s",
382 path_in_trash.c_str(), s.ToString().c_str());
383 *deleted_bytes = 0;
384 } else {
11fdf7f2 385 total_trash_size_.fetch_sub(*deleted_bytes);
7c673cae
FG
386 }
387
388 return s;
389}
390
391void DeleteScheduler::WaitForEmptyTrash() {
392 InstrumentedMutexLock l(&mu_);
393 while (pending_files_ > 0 && !closing_) {
394 cv_.Wait();
395 }
396}
397
20effc67
TL
398void DeleteScheduler::MaybeCreateBackgroundThread() {
399 if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) {
400 bg_thread_.reset(
401 new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this));
402 ROCKS_LOG_INFO(info_log_,
403 "Created background thread for deletion scheduler with "
404 "rate_bytes_per_sec: %" PRIi64,
405 rate_bytes_per_sec_.load());
406 }
407}
408
f67539c2 409} // namespace ROCKSDB_NAMESPACE
7c673cae
FG
410
411#endif // ROCKSDB_LITE