1 // Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 #include "db/error_handler.h"
7 #include "db/db_impl.h"
8 #include "db/event_helpers.h"
9 #include "util/sst_file_manager_impl.h"
13 // Maps to help decide the severity of an error based on the
14 // BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
15 // is set or not. There are 3 maps, going from most specific to least specific
16 // (i.e., from all 4 fields in a tuple to only the BackgroundErrorReason and
17 // paranoid_checks). The less specific maps serve as a catch-all in case we miss
18 // a specific error code or subcode.
// Most specific severity lookup table: the key is a
// (BackgroundErrorReason, Status::Code, Status::SubCode, bool) tuple and every
// entry pairs such a key with a Status::Severity value.
// All visible entries are for Status::Code::kIOError with a kNoSpace or
// kSpaceLimit subcode.
// NOTE(review): presumably this is the ErrorSeverityMap that
// ErrorHandler::SetBGError consults first, with the bool being
// db_options.paranoid_checks. The map's name, its mapped type, the trailing
// bool of some keys and the closing brace are not visible in this view --
// confirm against the full file.
19 std::map
<std::tuple
<BackgroundErrorReason
, Status::Code
, Status::SubCode
, bool>,
22 // Errors during BG compaction
23 {std::make_tuple(BackgroundErrorReason::kCompaction
,
24 Status::Code::kIOError
, Status::SubCode::kNoSpace
,
26 Status::Severity::kSoftError
},
27 {std::make_tuple(BackgroundErrorReason::kCompaction
,
28 Status::Code::kIOError
, Status::SubCode::kNoSpace
,
30 Status::Severity::kNoError
},
31 {std::make_tuple(BackgroundErrorReason::kCompaction
,
32 Status::Code::kIOError
, Status::SubCode::kSpaceLimit
,
34 Status::Severity::kHardError
},
35 // Errors during BG flush
36 {std::make_tuple(BackgroundErrorReason::kFlush
, Status::Code::kIOError
,
37 Status::SubCode::kNoSpace
, true),
38 Status::Severity::kHardError
},
39 {std::make_tuple(BackgroundErrorReason::kFlush
, Status::Code::kIOError
,
40 Status::SubCode::kNoSpace
, false),
41 Status::Severity::kNoError
},
42 {std::make_tuple(BackgroundErrorReason::kFlush
, Status::Code::kIOError
,
43 Status::SubCode::kSpaceLimit
, true),
44 Status::Severity::kHardError
},
45 // Errors during Write
// Both visible kWriteCallback entries map to kHardError.
46 {std::make_tuple(BackgroundErrorReason::kWriteCallback
,
47 Status::Code::kIOError
, Status::SubCode::kNoSpace
,
49 Status::Severity::kHardError
},
50 {std::make_tuple(BackgroundErrorReason::kWriteCallback
,
51 Status::Code::kIOError
, Status::SubCode::kNoSpace
,
53 Status::Severity::kHardError
},
// Second-level severity lookup table, keyed on
// (BackgroundErrorReason, Status::Code, bool) -- i.e. without the subcode.
// ErrorHandler::SetBGError looks up (reason, bg_err.code(), paranoid_checks)
// here.
// In every visible entry a `true` flag gives kUnrecoverableError for
// kCorruption and kFatalError for kIOError, while a `false` flag gives
// kNoError.
// NOTE(review): the closing brace of the initializer is not visible in this
// view.
56 std::map
<std::tuple
<BackgroundErrorReason
, Status::Code
, bool>, Status::Severity
>
57 DefaultErrorSeverityMap
= {
58 // Errors during BG compaction
59 {std::make_tuple(BackgroundErrorReason::kCompaction
,
60 Status::Code::kCorruption
, true),
61 Status::Severity::kUnrecoverableError
},
62 {std::make_tuple(BackgroundErrorReason::kCompaction
,
63 Status::Code::kCorruption
, false),
64 Status::Severity::kNoError
},
65 {std::make_tuple(BackgroundErrorReason::kCompaction
,
66 Status::Code::kIOError
, true),
67 Status::Severity::kFatalError
},
68 {std::make_tuple(BackgroundErrorReason::kCompaction
,
69 Status::Code::kIOError
, false),
70 Status::Severity::kNoError
},
71 // Errors during BG flush
72 {std::make_tuple(BackgroundErrorReason::kFlush
,
73 Status::Code::kCorruption
, true),
74 Status::Severity::kUnrecoverableError
},
75 {std::make_tuple(BackgroundErrorReason::kFlush
,
76 Status::Code::kCorruption
, false),
77 Status::Severity::kNoError
},
78 {std::make_tuple(BackgroundErrorReason::kFlush
,
79 Status::Code::kIOError
, true),
80 Status::Severity::kFatalError
},
81 {std::make_tuple(BackgroundErrorReason::kFlush
,
82 Status::Code::kIOError
, false),
83 Status::Severity::kNoError
},
84 // Errors during Write
85 {std::make_tuple(BackgroundErrorReason::kWriteCallback
,
86 Status::Code::kCorruption
, true),
87 Status::Severity::kUnrecoverableError
},
88 {std::make_tuple(BackgroundErrorReason::kWriteCallback
,
89 Status::Code::kCorruption
, false),
90 Status::Severity::kNoError
},
91 {std::make_tuple(BackgroundErrorReason::kWriteCallback
,
92 Status::Code::kIOError
, true),
93 Status::Severity::kFatalError
},
94 {std::make_tuple(BackgroundErrorReason::kWriteCallback
,
95 Status::Code::kIOError
, false),
96 Status::Severity::kNoError
},
// Least specific severity lookup table, keyed only on
// (BackgroundErrorReason, bool).
// For kCompaction and kFlush the severity is kFatalError when the flag is true
// and kNoError when it is false; kWriteCallback and kMemTable are kFatalError
// for both flag values.
// NOTE(review): presumably this is the DefaultReasonMap that
// ErrorHandler::SetBGError consults last with (reason, paranoid_checks). The
// map's name and the closing brace are not visible in this view -- confirm.
99 std::map
<std::tuple
<BackgroundErrorReason
, bool>, Status::Severity
>
101 // Errors during BG compaction
102 {std::make_tuple(BackgroundErrorReason::kCompaction
, true),
103 Status::Severity::kFatalError
},
104 {std::make_tuple(BackgroundErrorReason::kCompaction
, false),
105 Status::Severity::kNoError
},
106 // Errors during BG flush
107 {std::make_tuple(BackgroundErrorReason::kFlush
, true),
108 Status::Severity::kFatalError
},
109 {std::make_tuple(BackgroundErrorReason::kFlush
, false),
110 Status::Severity::kNoError
},
111 // Errors during Write
112 {std::make_tuple(BackgroundErrorReason::kWriteCallback
, true),
113 Status::Severity::kFatalError
},
114 {std::make_tuple(BackgroundErrorReason::kWriteCallback
, false),
115 Status::Severity::kFatalError
},
116 // Errors during Memtable update
117 {std::make_tuple(BackgroundErrorReason::kMemTable
, true),
118 Status::Severity::kFatalError
},
119 {std::make_tuple(BackgroundErrorReason::kMemTable
, false),
120 Status::Severity::kFatalError
},
// Cancels automatic error recovery for this handler.
// Asserts that db_mutex_ is held, turns auto_recovery_ off, asks the
// SstFileManagerImpl taken from db_options_.sst_file_manager to cancel any
// recovery registered for this handler, and clears recovery_in_prog_.
// NOTE(review): the guards around the sfm call and around the
// recovery_in_prog_ reset (a null check on sfm, the use of `cancelled`, the
// mutex unlock/lock the comment below alludes to) are not visible in this
// view -- confirm against the full file.
123 void ErrorHandler::CancelErrorRecovery() {
125 db_mutex_
->AssertHeld();
127 // We'll release the lock before calling sfm, so make sure no new
128 // recovery gets scheduled at that point
129 auto_recovery_
= false;
// NOTE(review): the reinterpret_cast assumes the configured sst_file_manager
// really is an SstFileManagerImpl -- confirm; a checked or static downcast
// would state that intent more precisely.
130 SstFileManagerImpl
* sfm
= reinterpret_cast<SstFileManagerImpl
*>(
131 db_options_
.sst_file_manager
.get());
133 // This may or may not cancel a pending recovery
135 bool cancelled
= sfm
->CancelErrorRecovery(this);
138 recovery_in_prog_
= false;
144 // This is the main function for looking at an error during a background
145 // operation and deciding the severity, and error recovery strategy. The high
146 // level algorithm is as follows -
147 // 1. Classify the severity of the error based on the ErrorSeverityMap,
148 // DefaultErrorSeverityMap and DefaultReasonMap defined earlier
149 // 2. Call a Status code specific override function to adjust the severity
150 // if needed. The reason for this is our ability to recover may depend on
151 // the exact options enabled in DBOptions
152 // 3. Determine if auto recovery is possible. A listener notification callback
153 // is called, which can disable the auto recovery even if we decide it is
// possible
155 // 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
156 // the actual recovery. If no sst file manager is specified in DBOptions,
157 // a default one is allocated during DB::Open(), so there will always be
// an sst file manager to delegate to
159 // This can also get called as part of a recovery operation. In that case, we
160 // also track the error separately in recovery_error_ so we can tell in the
161 // end whether recovery succeeded or not
//
// Parameters:
//   bg_err - the status reported by the failing operation
//   reason - the kind of operation that produced bg_err (the maps above use
//            kCompaction, kFlush, kWriteCallback and kMemTable); it is part
//            of every severity-map key
// Asserts that db_mutex_ is held.
// NOTE(review): several statements of this function (the assignments to
// `sev`, the declaration of `new_bg_err`, the return statements and the
// closing braces) are not visible in this view; the comments below describe
// only the visible lines.
162 Status
ErrorHandler::SetBGError(const Status
& bg_err
, BackgroundErrorReason reason
) {
163 db_mutex_
->AssertHeld();
169 // Check if recovery is currently in progress. If it is, we will save this
170 // error so we can check it at the end to see if recovery succeeded or not
171 if (recovery_in_prog_
&& recovery_error_
.ok()) {
172 recovery_error_
= bg_err
;
// paranoid_checks is the bool component of every severity-map key.
175 bool paranoid
= db_options_
.paranoid_checks
;
// Start from kFatalError. NOTE(review): presumably replaced by the value of
// the first matching map entry below -- the assignment is not visible here.
176 Status::Severity sev
= Status::Severity::kFatalError
;
// Lookup 1: most specific key (reason, code, subcode, paranoid).
181 auto entry
= ErrorSeverityMap
.find(std::make_tuple(reason
, bg_err
.code(),
182 bg_err
.subcode(), paranoid
));
183 if (entry
!= ErrorSeverityMap
.end()) {
// Lookup 2: same key without the subcode.
190 auto entry
= DefaultErrorSeverityMap
.find(std::make_tuple(reason
,
191 bg_err
.code(), paranoid
));
192 if (entry
!= DefaultErrorSeverityMap
.end()) {
// Lookup 3: reason and paranoid flag only.
199 auto entry
= DefaultReasonMap
.find(std::make_tuple(reason
, paranoid
));
200 if (entry
!= DefaultReasonMap
.end()) {
// Re-wrap the incoming status together with the chosen severity.
205 new_bg_err
= Status(bg_err
, sev
);
// Work on a local copy of auto_recovery_; an error of kFatalError severity or
// above switches auto recovery off for this error.
207 bool auto_recovery
= auto_recovery_
;
208 if (new_bg_err
.severity() >= Status::Severity::kFatalError
&& auto_recovery
) {
209 auto_recovery
= false;
213 // Allow some error specific overrides
214 if (new_bg_err
== Status::NoSpace()) {
215 new_bg_err
= OverrideNoSpaceError(new_bg_err
, &auto_recovery
);
// Listeners receive a copy of the status and the auto_recovery flag, both by
// pointer. NOTE(review): presumably so they can modify them, as step 3 of the
// header comment describes.
218 if (!new_bg_err
.ok()) {
219 Status s
= new_bg_err
;
220 EventHelpers::NotifyOnBackgroundError(db_options_
.listeners
, reason
, &s
,
221 db_mutex_
, &auto_recovery
);
// Only a non-OK status that is strictly more severe than the current
// bg_error_ enters this branch.
222 if (!s
.ok() && (s
.severity() > bg_error_
.severity())) {
225 // This error is less severe than previously encountered error. Don't
226 // take any further action
// NOTE(review): the condition guarding the recovery kick-off below is not
// visible in this view.
232 recovery_in_prog_
= true;
234 // Kick-off error specific recovery
235 if (bg_error_
== Status::NoSpace()) {
236 RecoverFromNoSpace();
// Applies NoSpace-specific adjustments to a background error.
//
// Parameters:
//   bg_error      - the error to adjust, already carrying a severity
//   auto_recovery - in/out flag; set to false on each visible path where
//                   automatic recovery is ruled out
// Returns a Status; both visible return statements re-wrap bg_error with
// kFatalError severity.
// NOTE(review): other return paths, the declaration of `free_space` and any
// preprocessor guards are not visible in this view -- confirm against the
// full file.
242 Status
ErrorHandler::OverrideNoSpaceError(Status bg_error
,
243 bool* auto_recovery
) {
// Error is already kFatalError or worse. NOTE(review): the body of this
// branch is not visible in this view.
245 if (bg_error
.severity() >= Status::Severity::kFatalError
) {
249 if (db_options_
.sst_file_manager
.get() == nullptr) {
250 // We rely on SFM to poll for enough disk space and recover
251 *auto_recovery
= false;
255 if (db_options_
.allow_2pc
&&
256 (bg_error
.severity() <= Status::Severity::kSoftError
)) {
257 // Don't know how to recover, as the contents of the current WAL file may
258 // be inconsistent, and it may be needed for 2PC. If 2PC is not enabled,
259 // we can just flush the memtable and discard the log
260 *auto_recovery
= false;
261 return Status(bg_error
, Status::Severity::kFatalError
);
// Query free space for the first db_path; if the Env answers NotSupported,
// auto recovery is disabled.
266 if (db_options_
.env
->GetFreeSpace(db_options_
.db_paths
[0].path
,
267 &free_space
) == Status::NotSupported()) {
268 *auto_recovery
= false;
// NOTE(review): in this view this is the only trailing return; confirm which
// code path it belongs to in the full file.
275 return Status(bg_error
, Status::Severity::kFatalError
);
// Starts recovery from a NoSpace error by handing the current bg_error_ to
// the SstFileManagerImpl obtained from db_options_.sst_file_manager.
// NOTE(review): any null check on sfm and the function's closing lines are
// not visible in this view -- confirm against the full file.
279 void ErrorHandler::RecoverFromNoSpace() {
281 SstFileManagerImpl
* sfm
=
282 reinterpret_cast<SstFileManagerImpl
*>(db_options_
.sst_file_manager
.get());
284 // Inform SFM of the error, so it can kick-off the recovery
286 sfm
->StartErrorRecovery(this, bg_error_
);
// Clears the stored background error if the recovery attempt recorded no
// error of its own.
// Asserts that db_mutex_ is held. When recovery_error_ is OK, bg_error_ is
// reset to OK, recovery_in_prog_ is cleared and listeners are notified that
// recovery completed, receiving the previous background error.
// Returns recovery_error_.
// NOTE(review): closing braces and any preprocessor guards are not visible in
// this view.
291 Status
ErrorHandler::ClearBGError() {
293 db_mutex_
->AssertHeld();
295 // Signal that recovery succeeded
296 if (recovery_error_
.ok()) {
// Keep a copy of the old error so it can be passed to the listeners after
// bg_error_ has been reset.
297 Status old_bg_error
= bg_error_
;
298 bg_error_
= Status::OK();
299 recovery_in_prog_
= false;
300 EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_
.listeners
,
301 old_bg_error
, db_mutex_
);
303 return recovery_error_
;
309 Status
ErrorHandler::RecoverFromBGError(bool is_manual
) {
311 InstrumentedMutexLock
l(db_mutex_
);
313 // If its a manual recovery and there's a background recovery in progress
314 // return busy status
315 if (recovery_in_prog_
) {
316 return Status::Busy();
318 recovery_in_prog_
= true;
321 if (bg_error_
.severity() == Status::Severity::kSoftError
) {
322 // Simply clear the background error and return
323 recovery_error_
= Status::OK();
324 return ClearBGError();
327 // Reset recovery_error_. We will use this to record any errors that happen
328 // during the recovery process. While recovering, the only operations that
329 // can generate background errors should be the flush operations
330 recovery_error_
= Status::OK();
331 Status s
= db_
->ResumeImpl();
332 // For manual recover, shutdown, and fatal error cases, set
333 // recovery_in_prog_ to false. For automatic background recovery, leave it
334 // as is regardless of success or failure as it will be retried
335 if (is_manual
|| s
.IsShutdownInProgress() ||
336 bg_error_
.severity() >= Status::Severity::kFatalError
) {
337 recovery_in_prog_
= false;