]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/util/sst_file_manager_impl.cc
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / util / sst_file_manager_impl.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6#include "util/sst_file_manager_impl.h"
7
494da23a
TL
8#ifndef __STDC_FORMAT_MACROS
9#define __STDC_FORMAT_MACROS
10#endif
11
12#include <inttypes.h>
7c673cae
FG
13#include <vector>
14
11fdf7f2 15#include "db/db_impl.h"
7c673cae
FG
16#include "port/port.h"
17#include "rocksdb/env.h"
18#include "rocksdb/sst_file_manager.h"
19#include "util/mutexlock.h"
20#include "util/sync_point.h"
21
22namespace rocksdb {
23
24#ifndef ROCKSDB_LITE
25SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr<Logger> logger,
11fdf7f2
TL
26 int64_t rate_bytes_per_sec,
27 double max_trash_db_ratio,
28 uint64_t bytes_max_delete_chunk)
7c673cae
FG
29 : env_(env),
30 logger_(logger),
31 total_files_size_(0),
11fdf7f2
TL
32 in_progress_files_size_(0),
33 compaction_buffer_size_(0),
34 cur_compactions_reserved_size_(0),
7c673cae 35 max_allowed_space_(0),
11fdf7f2
TL
36 delete_scheduler_(env, rate_bytes_per_sec, logger.get(), this,
37 max_trash_db_ratio, bytes_max_delete_chunk),
38 cv_(&mu_),
39 closing_(false),
40 bg_thread_(nullptr),
41 reserved_disk_buffer_(0),
42 free_space_trigger_(0),
43 cur_instance_(nullptr) {
44}
7c673cae 45
11fdf7f2
TL
46SstFileManagerImpl::~SstFileManagerImpl() {
47 Close();
48}
7c673cae 49
11fdf7f2
TL
50void SstFileManagerImpl::Close() {
51 {
52 MutexLock l(&mu_);
53 if (closing_) {
54 return;
55 }
56 closing_ = true;
57 cv_.SignalAll();
58 }
59 if (bg_thread_) {
60 bg_thread_->join();
61 }
62}
63
64Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
65 bool compaction) {
7c673cae
FG
66 uint64_t file_size;
67 Status s = env_->GetFileSize(file_path, &file_size);
68 if (s.ok()) {
69 MutexLock l(&mu_);
11fdf7f2 70 OnAddFileImpl(file_path, file_size, compaction);
7c673cae
FG
71 }
72 TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile");
73 return s;
74}
75
76Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) {
77 {
78 MutexLock l(&mu_);
79 OnDeleteFileImpl(file_path);
80 }
81 TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile");
82 return Status::OK();
83}
84
11fdf7f2
TL
85void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) {
86 MutexLock l(&mu_);
87 uint64_t size_added_by_compaction = 0;
88 for (size_t i = 0; i < c->num_input_levels(); i++) {
89 for (size_t j = 0; j < c->num_input_files(i); j++) {
90 FileMetaData* filemeta = c->input(i, j);
91 size_added_by_compaction += filemeta->fd.GetFileSize();
92 }
93 }
94 cur_compactions_reserved_size_ -= size_added_by_compaction;
95
96 auto new_files = c->edit()->GetNewFiles();
97 for (auto& new_file : new_files) {
98 auto fn = TableFileName(c->immutable_cf_options()->cf_paths,
99 new_file.second.fd.GetNumber(),
100 new_file.second.fd.GetPathId());
101 if (in_progress_files_.find(fn) != in_progress_files_.end()) {
102 auto tracked_file = tracked_files_.find(fn);
103 assert(tracked_file != tracked_files_.end());
104 in_progress_files_size_ -= tracked_file->second;
105 in_progress_files_.erase(fn);
106 }
107 }
108}
109
7c673cae 110Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
11fdf7f2
TL
111 const std::string& new_path,
112 uint64_t* file_size) {
7c673cae
FG
113 {
114 MutexLock l(&mu_);
11fdf7f2
TL
115 if (file_size != nullptr) {
116 *file_size = tracked_files_[old_path];
117 }
118 OnAddFileImpl(new_path, tracked_files_[old_path], false);
7c673cae
FG
119 OnDeleteFileImpl(old_path);
120 }
121 TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
122 return Status::OK();
123}
124
125void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) {
126 MutexLock l(&mu_);
127 max_allowed_space_ = max_allowed_space;
128}
129
11fdf7f2
TL
130void SstFileManagerImpl::SetCompactionBufferSize(
131 uint64_t compaction_buffer_size) {
132 MutexLock l(&mu_);
133 compaction_buffer_size_ = compaction_buffer_size;
134}
135
7c673cae
FG
136bool SstFileManagerImpl::IsMaxAllowedSpaceReached() {
137 MutexLock l(&mu_);
138 if (max_allowed_space_ <= 0) {
139 return false;
140 }
141 return total_files_size_ >= max_allowed_space_;
142}
143
11fdf7f2
TL
144bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() {
145 MutexLock l(&mu_);
146 if (max_allowed_space_ <= 0) {
147 return false;
148 }
149 return total_files_size_ + cur_compactions_reserved_size_ >=
150 max_allowed_space_;
151}
152
153bool SstFileManagerImpl::EnoughRoomForCompaction(
154 ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
155 Status bg_error) {
156 MutexLock l(&mu_);
157 uint64_t size_added_by_compaction = 0;
158 // First check if we even have the space to do the compaction
159 for (size_t i = 0; i < inputs.size(); i++) {
160 for (size_t j = 0; j < inputs[i].size(); j++) {
161 FileMetaData* filemeta = inputs[i][j];
162 size_added_by_compaction += filemeta->fd.GetFileSize();
163 }
164 }
165
166 // Update cur_compactions_reserved_size_ so concurrent compaction
167 // don't max out space
168 size_t needed_headroom =
169 cur_compactions_reserved_size_ + size_added_by_compaction +
170 compaction_buffer_size_;
171 if (max_allowed_space_ != 0 &&
172 (needed_headroom + total_files_size_ > max_allowed_space_)) {
173 return false;
174 }
175
176 // Implement more aggressive checks only if this DB instance has already
177 // seen a NoSpace() error. This is tin order to contain a single potentially
178 // misbehaving DB instance and prevent it from slowing down compactions of
179 // other DB instances
180 if (CheckFreeSpace() && bg_error == Status::NoSpace()) {
181 auto fn =
182 TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(),
183 inputs[0][0]->fd.GetPathId());
184 uint64_t free_space = 0;
185 env_->GetFreeSpace(fn, &free_space);
186 // needed_headroom is based on current size reserved by compactions,
187 // minus any files created by running compactions as they would count
188 // against the reserved size. If user didn't specify any compaction
189 // buffer, add reserved_disk_buffer_ that's calculated by default so the
190 // compaction doesn't end up leaving nothing for logs and flush SSTs
191 if (compaction_buffer_size_ == 0) {
192 needed_headroom += reserved_disk_buffer_;
193 }
194 needed_headroom -= in_progress_files_size_;
195 if (free_space < needed_headroom + size_added_by_compaction) {
196 // We hit the condition of not enough disk space
494da23a
TL
197 ROCKS_LOG_ERROR(logger_,
198 "free space [%" PRIu64
199 " bytes] is less than "
200 "needed headroom [%" ROCKSDB_PRIszt " bytes]\n",
201 free_space, needed_headroom);
11fdf7f2
TL
202 return false;
203 }
204 }
205
206 cur_compactions_reserved_size_ += size_added_by_compaction;
207 // Take a snapshot of cur_compactions_reserved_size_ for when we encounter
208 // a NoSpace error.
209 free_space_trigger_ = cur_compactions_reserved_size_;
210 return true;
211}
212
213uint64_t SstFileManagerImpl::GetCompactionsReservedSize() {
214 MutexLock l(&mu_);
215 return cur_compactions_reserved_size_;
216}
217
7c673cae
FG
218uint64_t SstFileManagerImpl::GetTotalSize() {
219 MutexLock l(&mu_);
220 return total_files_size_;
221}
222
223std::unordered_map<std::string, uint64_t>
224SstFileManagerImpl::GetTrackedFiles() {
225 MutexLock l(&mu_);
226 return tracked_files_;
227}
228
229int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
230 return delete_scheduler_.GetRateBytesPerSecond();
231}
232
233void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) {
234 return delete_scheduler_.SetRateBytesPerSecond(delete_rate);
235}
236
11fdf7f2
TL
237double SstFileManagerImpl::GetMaxTrashDBRatio() {
238 return delete_scheduler_.GetMaxTrashDBRatio();
239}
240
241void SstFileManagerImpl::SetMaxTrashDBRatio(double r) {
242 return delete_scheduler_.SetMaxTrashDBRatio(r);
243}
244
245uint64_t SstFileManagerImpl::GetTotalTrashSize() {
246 return delete_scheduler_.GetTotalTrashSize();
247}
248
249void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size,
250 const std::string& path) {
251 MutexLock l(&mu_);
252
253 reserved_disk_buffer_ += size;
254 if (path_.empty()) {
255 path_ = path;
256 }
257}
258
259void SstFileManagerImpl::ClearError() {
260 while (true) {
261 MutexLock l(&mu_);
262
263 if (closing_) {
264 return;
265 }
266
267 uint64_t free_space;
268 Status s = env_->GetFreeSpace(path_, &free_space);
269 if (s.ok()) {
270 // In case of multi-DB instances, some of them may have experienced a
271 // soft error and some a hard error. In the SstFileManagerImpl, a hard
272 // error will basically override previously reported soft errors. Once
273 // we clear the hard error, we don't keep track of previous errors for
274 // now
275 if (bg_err_.severity() == Status::Severity::kHardError) {
276 if (free_space < reserved_disk_buffer_) {
494da23a
TL
277 ROCKS_LOG_ERROR(logger_,
278 "free space [%" PRIu64
279 " bytes] is less than "
280 "required disk buffer [%" PRIu64 " bytes]\n",
281 free_space, reserved_disk_buffer_);
11fdf7f2
TL
282 ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n");
283 s = Status::NoSpace();
284 }
285 } else if (bg_err_.severity() == Status::Severity::kSoftError) {
286 if (free_space < free_space_trigger_) {
494da23a
TL
287 ROCKS_LOG_WARN(logger_,
288 "free space [%" PRIu64
289 " bytes] is less than "
290 "free space for compaction trigger [%" PRIu64
291 " bytes]\n",
292 free_space, free_space_trigger_);
11fdf7f2
TL
293 ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n");
294 s = Status::NoSpace();
295 }
296 }
297 }
298
299 // Someone could have called CancelErrorRecovery() and the list could have
300 // become empty, so check again here
301 if (s.ok() && !error_handler_list_.empty()) {
302 auto error_handler = error_handler_list_.front();
303 // Since we will release the mutex, set cur_instance_ to signal to the
304 // shutdown thread, if it calls // CancelErrorRecovery() the meantime,
305 // to indicate that this DB instance is busy. The DB instance is
306 // guaranteed to not be deleted before RecoverFromBGError() returns,
307 // since the ErrorHandler::recovery_in_prog_ flag would be true
308 cur_instance_ = error_handler;
309 mu_.Unlock();
310 s = error_handler->RecoverFromBGError();
311 mu_.Lock();
312 // The DB instance might have been deleted while we were
313 // waiting for the mutex, so check cur_instance_ to make sure its
314 // still non-null
315 if (cur_instance_) {
316 // Check for error again, since the instance may have recovered but
317 // immediately got another error. If that's the case, and the new
318 // error is also a NoSpace() non-fatal error, leave the instance in
319 // the list
320 Status err = cur_instance_->GetBGError();
321 if (s.ok() && err == Status::NoSpace() &&
322 err.severity() < Status::Severity::kFatalError) {
323 s = err;
324 }
325 cur_instance_ = nullptr;
326 }
327
328 if (s.ok() || s.IsShutdownInProgress() ||
329 (!s.ok() && s.severity() >= Status::Severity::kFatalError)) {
330 // If shutdown is in progress, abandon this handler instance
331 // and continue with the others
332 error_handler_list_.pop_front();
333 }
334 }
335
336 if (!error_handler_list_.empty()) {
337 // If there are more instances to be recovered, reschedule after 5
338 // seconds
339 int64_t wait_until = env_->NowMicros() + 5000000;
340 cv_.TimedWait(wait_until);
341 }
342
343 // Check again for error_handler_list_ empty, as a DB instance shutdown
344 // could have removed it from the queue while we were in timed wait
345 if (error_handler_list_.empty()) {
346 ROCKS_LOG_INFO(logger_, "Clearing error\n");
347 bg_err_ = Status::OK();
348 return;
349 }
350 }
351}
352
353void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler,
354 Status bg_error) {
355 MutexLock l(&mu_);
356 if (bg_error.severity() == Status::Severity::kSoftError) {
357 if (bg_err_.ok()) {
358 // Setting bg_err_ basically means we're in degraded mode
359 // Assume that all pending compactions will fail similarly. The trigger
360 // for clearing this condition is set to current compaction reserved
361 // size, so we stop checking disk space available in
362 // EnoughRoomForCompaction once this much free space is available
363 bg_err_ = bg_error;
364 }
365 } else if (bg_error.severity() == Status::Severity::kHardError) {
366 bg_err_ = bg_error;
367 } else {
368 assert(false);
369 }
370
371 // If this is the first instance of this error, kick of a thread to poll
372 // and recover from this condition
373 if (error_handler_list_.empty()) {
374 error_handler_list_.push_back(handler);
375 // Release lock before calling join. Its ok to do so because
376 // error_handler_list_ is now non-empty, so no other invocation of this
377 // function will execute this piece of code
378 mu_.Unlock();
379 if (bg_thread_) {
380 bg_thread_->join();
381 }
382 // Start a new thread. The previous one would have exited.
383 bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this));
384 mu_.Lock();
385 } else {
386 // Check if this DB instance is already in the list
387 for (auto iter = error_handler_list_.begin();
388 iter != error_handler_list_.end(); ++iter) {
389 if ((*iter) == handler) {
390 return;
391 }
392 }
393 error_handler_list_.push_back(handler);
394 }
395}
396
397bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) {
398 MutexLock l(&mu_);
399
400 if (cur_instance_ == handler) {
401 // This instance is currently busy attempting to recover
402 // Nullify it so the recovery thread doesn't attempt to access it again
403 cur_instance_ = nullptr;
404 return false;
405 }
406
407 for (auto iter = error_handler_list_.begin();
408 iter != error_handler_list_.end(); ++iter) {
409 if ((*iter) == handler) {
410 error_handler_list_.erase(iter);
411 return true;
412 }
413 }
414 return false;
415}
416
417Status SstFileManagerImpl::ScheduleFileDeletion(
494da23a
TL
418 const std::string& file_path, const std::string& path_to_sync,
419 const bool force_bg) {
420 TEST_SYNC_POINT("SstFileManagerImpl::ScheduleFileDeletion");
421 return delete_scheduler_.DeleteFile(file_path, path_to_sync,
422 force_bg);
7c673cae
FG
423}
424
425void SstFileManagerImpl::WaitForEmptyTrash() {
426 delete_scheduler_.WaitForEmptyTrash();
427}
428
429void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
11fdf7f2 430 uint64_t file_size, bool compaction) {
7c673cae
FG
431 auto tracked_file = tracked_files_.find(file_path);
432 if (tracked_file != tracked_files_.end()) {
433 // File was added before, we will just update the size
11fdf7f2 434 assert(!compaction);
7c673cae
FG
435 total_files_size_ -= tracked_file->second;
436 total_files_size_ += file_size;
11fdf7f2 437 cur_compactions_reserved_size_ -= file_size;
7c673cae
FG
438 } else {
439 total_files_size_ += file_size;
11fdf7f2
TL
440 if (compaction) {
441 // Keep track of the size of files created by in-progress compactions.
442 // When calculating whether there's enough headroom for new compactions,
443 // this will be subtracted from cur_compactions_reserved_size_.
444 // Otherwise, compactions will be double counted.
445 in_progress_files_size_ += file_size;
446 in_progress_files_.insert(file_path);
447 }
7c673cae
FG
448 }
449 tracked_files_[file_path] = file_size;
450}
451
452void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) {
453 auto tracked_file = tracked_files_.find(file_path);
454 if (tracked_file == tracked_files_.end()) {
455 // File is not tracked
11fdf7f2 456 assert(in_progress_files_.find(file_path) == in_progress_files_.end());
7c673cae
FG
457 return;
458 }
459
460 total_files_size_ -= tracked_file->second;
11fdf7f2
TL
461 // Check if it belonged to an in-progress compaction
462 if (in_progress_files_.find(file_path) != in_progress_files_.end()) {
463 in_progress_files_size_ -= tracked_file->second;
464 in_progress_files_.erase(file_path);
465 }
7c673cae
FG
466 tracked_files_.erase(tracked_file);
467}
468
469SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log,
470 std::string trash_dir,
471 int64_t rate_bytes_per_sec,
11fdf7f2
TL
472 bool delete_existing_trash, Status* status,
473 double max_trash_db_ratio,
474 uint64_t bytes_max_delete_chunk) {
7c673cae 475 SstFileManagerImpl* res =
11fdf7f2
TL
476 new SstFileManagerImpl(env, info_log, rate_bytes_per_sec,
477 max_trash_db_ratio, bytes_max_delete_chunk);
7c673cae 478
11fdf7f2
TL
479 // trash_dir is deprecated and not needed anymore, but if user passed it
480 // we will still remove files in it.
7c673cae 481 Status s;
11fdf7f2
TL
482 if (delete_existing_trash && trash_dir != "") {
483 std::vector<std::string> files_in_trash;
484 s = env->GetChildren(trash_dir, &files_in_trash);
485 if (s.ok()) {
486 for (const std::string& trash_file : files_in_trash) {
487 if (trash_file == "." || trash_file == "..") {
488 continue;
489 }
490
491 std::string path_in_trash = trash_dir + "/" + trash_file;
492 res->OnAddFile(path_in_trash);
493 Status file_delete =
494 res->ScheduleFileDeletion(path_in_trash, trash_dir);
495 if (s.ok() && !file_delete.ok()) {
496 s = file_delete;
7c673cae
FG
497 }
498 }
499 }
500 }
501
502 if (status) {
503 *status = s;
504 }
505
506 return res;
507}
508
509#else
510
11fdf7f2
TL
511SstFileManager* NewSstFileManager(Env* /*env*/,
512 std::shared_ptr<Logger> /*info_log*/,
513 std::string /*trash_dir*/,
514 int64_t /*rate_bytes_per_sec*/,
515 bool /*delete_existing_trash*/,
516 Status* status, double /*max_trash_db_ratio*/,
517 uint64_t /*bytes_max_delete_chunk*/) {
7c673cae
FG
518 if (status) {
519 *status =
520 Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE");
521 }
522 return nullptr;
523}
524
525#endif // ROCKSDB_LITE
526
527} // namespace rocksdb