]>
Commit | Line | Data |
---|---|---|
2aaae970 | 1 | use std::collections::HashSet; |
6b809ff5 DM |
2 | use std::sync::{Arc, Mutex}; |
3 | use std::sync::atomic::{Ordering, AtomicUsize}; | |
4 | use std::time::Instant; | |
2aaae970 | 5 | |
3b2046d2 | 6 | use anyhow::{bail, format_err, Error}; |
c2009e53 | 7 | |
ee7a308d | 8 | use crate::{ |
ee7a308d DM |
9 | api2::types::*, |
10 | backup::{ | |
11 | DataStore, | |
12 | DataBlob, | |
13 | BackupGroup, | |
14 | BackupDir, | |
15 | BackupInfo, | |
16 | IndexFile, | |
17 | CryptMode, | |
18 | FileInfo, | |
19 | ArchiveType, | |
20 | archive_type, | |
21 | }, | |
f6b1d1cc WB |
22 | server::UPID, |
23 | task::TaskState, | |
24 | task_log, | |
25 | tools::ParallelHandler, | |
bfa54f2e | 26 | tools::fs::lock_dir_noblock_shared, |
c2009e53 DM |
27 | }; |
28 | ||
6b809ff5 | 29 | fn verify_blob(datastore: Arc<DataStore>, backup_dir: &BackupDir, info: &FileInfo) -> Result<(), Error> { |
c2009e53 | 30 | |
39f18b30 | 31 | let blob = datastore.load_blob(backup_dir, &info.filename)?; |
c2009e53 | 32 | |
2aaae970 | 33 | let raw_size = blob.raw_size(); |
c2009e53 DM |
34 | if raw_size != info.size { |
35 | bail!("wrong size ({} != {})", info.size, raw_size); | |
36 | } | |
37 | ||
39f18b30 | 38 | let csum = openssl::sha::sha256(blob.raw_data()); |
c2009e53 DM |
39 | if csum != info.csum { |
40 | bail!("wrong index checksum"); | |
41 | } | |
42 | ||
8819d1f2 FG |
43 | match blob.crypt_mode()? { |
44 | CryptMode::Encrypt => Ok(()), | |
45 | CryptMode::None => { | |
46 | // digest already verified above | |
47 | blob.decode(None, None)?; | |
48 | Ok(()) | |
49 | }, | |
50 | CryptMode::SignOnly => bail!("Invalid CryptMode for blob"), | |
c2009e53 | 51 | } |
c2009e53 DM |
52 | } |
53 | ||
0f3b7efa SR |
54 | fn rename_corrupted_chunk( |
55 | datastore: Arc<DataStore>, | |
56 | digest: &[u8;32], | |
f6b1d1cc | 57 | worker: &dyn TaskState, |
0f3b7efa SR |
58 | ) { |
59 | let (path, digest_str) = datastore.chunk_path(digest); | |
60 | ||
61 | let mut counter = 0; | |
62 | let mut new_path = path.clone(); | |
aadcc281 | 63 | loop { |
0f3b7efa | 64 | new_path.set_file_name(format!("{}.{}.bad", digest_str, counter)); |
aadcc281 | 65 | if new_path.exists() && counter < 9 { counter += 1; } else { break; } |
0f3b7efa SR |
66 | } |
67 | ||
68 | match std::fs::rename(&path, &new_path) { | |
69 | Ok(_) => { | |
f6b1d1cc | 70 | task_log!(worker, "corrupted chunk renamed to {:?}", &new_path); |
0f3b7efa SR |
71 | }, |
72 | Err(err) => { | |
73 | match err.kind() { | |
74 | std::io::ErrorKind::NotFound => { /* ignored */ }, | |
f6b1d1cc | 75 | _ => task_log!(worker, "could not rename corrupted chunk {:?} - {}", &path, err) |
0f3b7efa SR |
76 | } |
77 | } | |
78 | }; | |
79 | } | |
80 | ||
fdaab0df | 81 | fn verify_index_chunks( |
6b809ff5 DM |
82 | datastore: Arc<DataStore>, |
83 | index: Box<dyn IndexFile + Send>, | |
84 | verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
85 | corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>, | |
9a38fa29 | 86 | crypt_mode: CryptMode, |
f6b1d1cc | 87 | worker: Arc<dyn TaskState + Send + Sync>, |
fdaab0df DM |
88 | ) -> Result<(), Error> { |
89 | ||
a71bc08f | 90 | let errors = Arc::new(AtomicUsize::new(0)); |
fdaab0df | 91 | |
6b809ff5 | 92 | let start_time = Instant::now(); |
fdaab0df | 93 | |
6b809ff5 DM |
94 | let mut read_bytes = 0; |
95 | let mut decoded_bytes = 0; | |
7ae571e7 | 96 | |
f21508b9 DM |
97 | let worker2 = Arc::clone(&worker); |
98 | let datastore2 = Arc::clone(&datastore); | |
99 | let corrupt_chunks2 = Arc::clone(&corrupt_chunks); | |
100 | let verified_chunks2 = Arc::clone(&verified_chunks); | |
a71bc08f | 101 | let errors2 = Arc::clone(&errors); |
f21508b9 DM |
102 | |
103 | let decoder_pool = ParallelHandler::new( | |
104 | "verify chunk decoder", 4, | |
105 | move |(chunk, digest, size): (DataBlob, [u8;32], u64)| { | |
106 | let chunk_crypt_mode = match chunk.crypt_mode() { | |
107 | Err(err) => { | |
108 | corrupt_chunks2.lock().unwrap().insert(digest); | |
f6b1d1cc | 109 | task_log!(worker2, "can't verify chunk, unknown CryptMode - {}", err); |
f21508b9 DM |
110 | errors2.fetch_add(1, Ordering::SeqCst); |
111 | return Ok(()); | |
112 | }, | |
113 | Ok(mode) => mode, | |
114 | }; | |
115 | ||
116 | if chunk_crypt_mode != crypt_mode { | |
f6b1d1cc WB |
117 | task_log!( |
118 | worker2, | |
f21508b9 DM |
119 | "chunk CryptMode {:?} does not match index CryptMode {:?}", |
120 | chunk_crypt_mode, | |
121 | crypt_mode | |
f6b1d1cc | 122 | ); |
f21508b9 DM |
123 | errors2.fetch_add(1, Ordering::SeqCst); |
124 | } | |
125 | ||
126 | if let Err(err) = chunk.verify_unencrypted(size as usize, &digest) { | |
127 | corrupt_chunks2.lock().unwrap().insert(digest); | |
f6b1d1cc | 128 | task_log!(worker2, "{}", err); |
f21508b9 | 129 | errors2.fetch_add(1, Ordering::SeqCst); |
f6b1d1cc | 130 | rename_corrupted_chunk(datastore2.clone(), &digest, &worker2); |
f21508b9 DM |
131 | } else { |
132 | verified_chunks2.lock().unwrap().insert(digest); | |
133 | } | |
134 | ||
135 | Ok(()) | |
136 | } | |
137 | ); | |
138 | ||
139 | for pos in 0..index.index_count() { | |
2aaae970 | 140 | |
f6b1d1cc | 141 | worker.check_abort()?; |
deef6369 | 142 | crate::tools::fail_on_shutdown()?; |
6b809ff5 | 143 | |
f21508b9 DM |
144 | let info = index.chunk_info(pos).unwrap(); |
145 | let size = info.size(); | |
9a38fa29 | 146 | |
f21508b9 DM |
147 | if verified_chunks.lock().unwrap().contains(&info.digest) { |
148 | continue; // already verified | |
149 | } | |
6b809ff5 | 150 | |
f21508b9 DM |
151 | if corrupt_chunks.lock().unwrap().contains(&info.digest) { |
152 | let digest_str = proxmox::tools::digest_to_hex(&info.digest); | |
f6b1d1cc | 153 | task_log!(worker, "chunk {} was marked as corrupt", digest_str); |
6b809ff5 | 154 | errors.fetch_add(1, Ordering::SeqCst); |
f21508b9 | 155 | continue; |
9a38fa29 FG |
156 | } |
157 | ||
f21508b9 DM |
158 | match datastore.load_chunk(&info.digest) { |
159 | Err(err) => { | |
160 | corrupt_chunks.lock().unwrap().insert(info.digest); | |
f6b1d1cc | 161 | task_log!(worker, "can't verify chunk, load failed - {}", err); |
f21508b9 | 162 | errors.fetch_add(1, Ordering::SeqCst); |
f6b1d1cc | 163 | rename_corrupted_chunk(datastore.clone(), &info.digest, &worker); |
f21508b9 DM |
164 | continue; |
165 | } | |
166 | Ok(chunk) => { | |
167 | read_bytes += chunk.raw_size(); | |
168 | decoder_pool.send((chunk, info.digest, size))?; | |
169 | decoded_bytes += size; | |
170 | } | |
2aaae970 | 171 | } |
fdaab0df DM |
172 | } |
173 | ||
f21508b9 DM |
174 | decoder_pool.complete()?; |
175 | ||
6b809ff5 DM |
176 | let elapsed = start_time.elapsed().as_secs_f64(); |
177 | ||
178 | let read_bytes_mib = (read_bytes as f64)/(1024.0*1024.0); | |
179 | let decoded_bytes_mib = (decoded_bytes as f64)/(1024.0*1024.0); | |
180 | ||
181 | let read_speed = read_bytes_mib/elapsed; | |
182 | let decode_speed = decoded_bytes_mib/elapsed; | |
183 | ||
184 | let error_count = errors.load(Ordering::SeqCst); | |
185 | ||
f6b1d1cc WB |
186 | task_log!( |
187 | worker, | |
188 | " verified {:.2}/{:.2} MiB in {:.2} seconds, speed {:.2}/{:.2} MiB/s ({} errors)", | |
189 | read_bytes_mib, | |
190 | decoded_bytes_mib, | |
191 | elapsed, | |
192 | read_speed, | |
193 | decode_speed, | |
194 | error_count, | |
195 | ); | |
6b809ff5 DM |
196 | |
197 | if errors.load(Ordering::SeqCst) > 0 { | |
f66f537d DC |
198 | bail!("chunks could not be verified"); |
199 | } | |
200 | ||
fdaab0df DM |
201 | Ok(()) |
202 | } | |
203 | ||
2aaae970 | 204 | fn verify_fixed_index( |
6b809ff5 | 205 | datastore: Arc<DataStore>, |
2aaae970 DM |
206 | backup_dir: &BackupDir, |
207 | info: &FileInfo, | |
6b809ff5 DM |
208 | verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>, |
209 | corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
f6b1d1cc | 210 | worker: Arc<dyn TaskState + Send + Sync>, |
2aaae970 | 211 | ) -> Result<(), Error> { |
c2009e53 DM |
212 | |
213 | let mut path = backup_dir.relative_path(); | |
214 | path.push(&info.filename); | |
215 | ||
216 | let index = datastore.open_fixed_reader(&path)?; | |
217 | ||
218 | let (csum, size) = index.compute_csum(); | |
219 | if size != info.size { | |
220 | bail!("wrong size ({} != {})", info.size, size); | |
221 | } | |
222 | ||
223 | if csum != info.csum { | |
224 | bail!("wrong index checksum"); | |
225 | } | |
226 | ||
f6b1d1cc WB |
227 | verify_index_chunks( |
228 | datastore, | |
229 | Box::new(index), | |
230 | verified_chunks, | |
231 | corrupt_chunks, | |
232 | info.chunk_crypt_mode(), | |
233 | worker, | |
234 | ) | |
c2009e53 DM |
235 | } |
236 | ||
2aaae970 | 237 | fn verify_dynamic_index( |
6b809ff5 | 238 | datastore: Arc<DataStore>, |
2aaae970 DM |
239 | backup_dir: &BackupDir, |
240 | info: &FileInfo, | |
6b809ff5 DM |
241 | verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>, |
242 | corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
f6b1d1cc | 243 | worker: Arc<dyn TaskState + Send + Sync>, |
2aaae970 DM |
244 | ) -> Result<(), Error> { |
245 | ||
c2009e53 DM |
246 | let mut path = backup_dir.relative_path(); |
247 | path.push(&info.filename); | |
248 | ||
249 | let index = datastore.open_dynamic_reader(&path)?; | |
250 | ||
251 | let (csum, size) = index.compute_csum(); | |
252 | if size != info.size { | |
253 | bail!("wrong size ({} != {})", info.size, size); | |
254 | } | |
255 | ||
256 | if csum != info.csum { | |
257 | bail!("wrong index checksum"); | |
258 | } | |
259 | ||
f6b1d1cc WB |
260 | verify_index_chunks( |
261 | datastore, | |
262 | Box::new(index), | |
263 | verified_chunks, | |
264 | corrupt_chunks, | |
265 | info.chunk_crypt_mode(), | |
266 | worker, | |
267 | ) | |
c2009e53 DM |
268 | } |
269 | ||
270 | /// Verify a single backup snapshot | |
271 | /// | |
272 | /// This checks all archives inside a backup snapshot. | |
273 | /// Errors are logged to the worker log. | |
274 | /// | |
8ea00f6e DM |
275 | /// Returns |
276 | /// - Ok(true) if verify is successful | |
277 | /// - Ok(false) if there were verification errors | |
278 | /// - Err(_) if task was aborted | |
2aaae970 | 279 | pub fn verify_backup_dir( |
6b809ff5 | 280 | datastore: Arc<DataStore>, |
2aaae970 | 281 | backup_dir: &BackupDir, |
6b809ff5 DM |
282 | verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>, |
283 | corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
f6b1d1cc WB |
284 | worker: Arc<dyn TaskState + Send + Sync>, |
285 | upid: UPID, | |
2aaae970 | 286 | ) -> Result<bool, Error> { |
c2009e53 | 287 | |
bfa54f2e SR |
288 | let _guard_res = lock_dir_noblock_shared( |
289 | &datastore.snapshot_path(&backup_dir), | |
290 | "snapshot", | |
291 | "locked by another operation"); | |
292 | if let Err(err) = _guard_res { | |
293 | task_log!( | |
294 | worker, | |
295 | "SKIPPED: verify {}:{} - could not acquire snapshot lock: {}", | |
296 | datastore.name(), | |
297 | backup_dir, | |
298 | err, | |
299 | ); | |
300 | return Ok(true); | |
301 | } | |
302 | ||
3b2046d2 | 303 | let mut manifest = match datastore.load_manifest(&backup_dir) { |
ff86ef00 | 304 | Ok((manifest, _)) => manifest, |
c2009e53 | 305 | Err(err) => { |
f6b1d1cc WB |
306 | task_log!( |
307 | worker, | |
308 | "verify {}:{} - manifest load error: {}", | |
309 | datastore.name(), | |
310 | backup_dir, | |
311 | err, | |
312 | ); | |
8ea00f6e | 313 | return Ok(false); |
c2009e53 DM |
314 | } |
315 | }; | |
316 | ||
f6b1d1cc | 317 | task_log!(worker, "verify {}:{}", datastore.name(), backup_dir); |
c2009e53 DM |
318 | |
319 | let mut error_count = 0; | |
320 | ||
d10332a1 | 321 | let mut verify_result = VerifyState::Ok; |
c2009e53 DM |
322 | for info in manifest.files() { |
323 | let result = proxmox::try_block!({ | |
f6b1d1cc | 324 | task_log!(worker, " check {}", info.filename); |
c2009e53 | 325 | match archive_type(&info.filename)? { |
d8594d87 DC |
326 | ArchiveType::FixedIndex => |
327 | verify_fixed_index( | |
6b809ff5 | 328 | datastore.clone(), |
d8594d87 DC |
329 | &backup_dir, |
330 | info, | |
6b809ff5 DM |
331 | verified_chunks.clone(), |
332 | corrupt_chunks.clone(), | |
333 | worker.clone(), | |
d8594d87 DC |
334 | ), |
335 | ArchiveType::DynamicIndex => | |
336 | verify_dynamic_index( | |
6b809ff5 | 337 | datastore.clone(), |
d8594d87 DC |
338 | &backup_dir, |
339 | info, | |
6b809ff5 DM |
340 | verified_chunks.clone(), |
341 | corrupt_chunks.clone(), | |
342 | worker.clone(), | |
d8594d87 | 343 | ), |
6b809ff5 | 344 | ArchiveType::Blob => verify_blob(datastore.clone(), &backup_dir, info), |
c2009e53 DM |
345 | } |
346 | }); | |
8ea00f6e | 347 | |
f6b1d1cc | 348 | worker.check_abort()?; |
deef6369 | 349 | crate::tools::fail_on_shutdown()?; |
8ea00f6e | 350 | |
c2009e53 | 351 | if let Err(err) = result { |
f6b1d1cc WB |
352 | task_log!( |
353 | worker, | |
354 | "verify {}:{}/{} failed: {}", | |
355 | datastore.name(), | |
356 | backup_dir, | |
357 | info.filename, | |
358 | err, | |
359 | ); | |
c2009e53 | 360 | error_count += 1; |
d10332a1 | 361 | verify_result = VerifyState::Failed; |
c2009e53 | 362 | } |
3b2046d2 | 363 | |
c2009e53 DM |
364 | } |
365 | ||
3b2046d2 | 366 | let verify_state = SnapshotVerifyState { |
d10332a1 | 367 | state: verify_result, |
f6b1d1cc | 368 | upid, |
3b2046d2 TL |
369 | }; |
370 | manifest.unprotected["verify_state"] = serde_json::to_value(verify_state)?; | |
371 | datastore.store_manifest(&backup_dir, serde_json::to_value(manifest)?) | |
372 | .map_err(|err| format_err!("unable to store manifest blob - {}", err))?; | |
373 | ||
8ea00f6e | 374 | Ok(error_count == 0) |
c2009e53 DM |
375 | } |
376 | ||
8ea00f6e DM |
377 | /// Verify all backups inside a backup group |
378 | /// | |
379 | /// Errors are logged to the worker log. | |
380 | /// | |
381 | /// Returns | |
63d9aca9 | 382 | /// - Ok((count, failed_dirs)) where failed_dirs had verification errors |
8ea00f6e | 383 | /// - Err(_) if task was aborted |
4f09d310 DM |
384 | pub fn verify_backup_group( |
385 | datastore: Arc<DataStore>, | |
386 | group: &BackupGroup, | |
387 | verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
388 | corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
63d9aca9 | 389 | progress: Option<(usize, usize)>, // (done, snapshot_count) |
f6b1d1cc WB |
390 | worker: Arc<dyn TaskState + Send + Sync>, |
391 | upid: &UPID, | |
63d9aca9 | 392 | ) -> Result<(usize, Vec<String>), Error> { |
c2009e53 | 393 | |
adfdc369 | 394 | let mut errors = Vec::new(); |
c2009e53 DM |
395 | let mut list = match group.list_backups(&datastore.base_path()) { |
396 | Ok(list) => list, | |
397 | Err(err) => { | |
f6b1d1cc WB |
398 | task_log!( |
399 | worker, | |
400 | "verify group {}:{} - unable to list backups: {}", | |
401 | datastore.name(), | |
402 | group, | |
403 | err, | |
404 | ); | |
63d9aca9 | 405 | return Ok((0, errors)); |
c2009e53 DM |
406 | } |
407 | }; | |
408 | ||
f6b1d1cc | 409 | task_log!(worker, "verify group {}:{}", datastore.name(), group); |
c2009e53 | 410 | |
63d9aca9 DM |
411 | let (done, snapshot_count) = progress.unwrap_or((0, list.len())); |
412 | ||
413 | let mut count = 0; | |
c2009e53 DM |
414 | BackupInfo::sort_list(&mut list, false); // newest first |
415 | for info in list { | |
63d9aca9 | 416 | count += 1; |
f6b1d1cc WB |
417 | if !verify_backup_dir( |
418 | datastore.clone(), | |
419 | &info.backup_dir, | |
420 | verified_chunks.clone(), | |
421 | corrupt_chunks.clone(), | |
422 | worker.clone(), | |
423 | upid.clone(), | |
424 | )? { | |
adfdc369 | 425 | errors.push(info.backup_dir.to_string()); |
c2009e53 | 426 | } |
63d9aca9 DM |
427 | if snapshot_count != 0 { |
428 | let pos = done + count; | |
429 | let percentage = ((pos as f64) * 100.0)/(snapshot_count as f64); | |
f6b1d1cc WB |
430 | task_log!( |
431 | worker, | |
432 | "percentage done: {:.2}% ({} of {} snapshots)", | |
433 | percentage, | |
434 | pos, | |
435 | snapshot_count, | |
436 | ); | |
63d9aca9 | 437 | } |
c2009e53 DM |
438 | } |
439 | ||
63d9aca9 | 440 | Ok((count, errors)) |
c2009e53 DM |
441 | } |
442 | ||
8ea00f6e DM |
443 | /// Verify all backups inside a datastore |
444 | /// | |
445 | /// Errors are logged to the worker log. | |
446 | /// | |
447 | /// Returns | |
adfdc369 | 448 | /// - Ok(failed_dirs) where failed_dirs had verification errors |
8ea00f6e | 449 | /// - Err(_) if task was aborted |
f6b1d1cc WB |
450 | pub fn verify_all_backups( |
451 | datastore: Arc<DataStore>, | |
452 | worker: Arc<dyn TaskState + Send + Sync>, | |
453 | upid: &UPID, | |
454 | ) -> Result<Vec<String>, Error> { | |
adfdc369 | 455 | let mut errors = Vec::new(); |
c2009e53 | 456 | |
4264c502 | 457 | let mut list = match BackupGroup::list_groups(&datastore.base_path()) { |
5656888c DM |
458 | Ok(list) => list |
459 | .into_iter() | |
460 | .filter(|group| !(group.backup_type() == "host" && group.backup_id() == "benchmark")) | |
461 | .collect::<Vec<BackupGroup>>(), | |
c2009e53 | 462 | Err(err) => { |
f6b1d1cc WB |
463 | task_log!( |
464 | worker, | |
465 | "verify datastore {} - unable to list backups: {}", | |
466 | datastore.name(), | |
467 | err, | |
468 | ); | |
adfdc369 | 469 | return Ok(errors); |
c2009e53 DM |
470 | } |
471 | }; | |
472 | ||
4264c502 DM |
473 | list.sort_unstable(); |
474 | ||
63d9aca9 DM |
475 | let mut snapshot_count = 0; |
476 | for group in list.iter() { | |
477 | snapshot_count += group.list_backups(&datastore.base_path())?.len(); | |
478 | } | |
479 | ||
4f09d310 DM |
480 | // start with 16384 chunks (up to 65GB) |
481 | let verified_chunks = Arc::new(Mutex::new(HashSet::with_capacity(1024*16))); | |
482 | ||
483 | // start with 64 chunks since we assume there are few corrupt ones | |
484 | let corrupt_chunks = Arc::new(Mutex::new(HashSet::with_capacity(64))); | |
485 | ||
f6b1d1cc | 486 | task_log!(worker, "verify datastore {} ({} snapshots)", datastore.name(), snapshot_count); |
c2009e53 | 487 | |
63d9aca9 | 488 | let mut done = 0; |
c2009e53 | 489 | for group in list { |
63d9aca9 | 490 | let (count, mut group_errors) = verify_backup_group( |
4f09d310 DM |
491 | datastore.clone(), |
492 | &group, | |
493 | verified_chunks.clone(), | |
494 | corrupt_chunks.clone(), | |
63d9aca9 | 495 | Some((done, snapshot_count)), |
4f09d310 | 496 | worker.clone(), |
f6b1d1cc | 497 | upid, |
4f09d310 | 498 | )?; |
adfdc369 | 499 | errors.append(&mut group_errors); |
63d9aca9 DM |
500 | |
501 | done += count; | |
c2009e53 DM |
502 | } |
503 | ||
adfdc369 | 504 | Ok(errors) |
c2009e53 | 505 | } |