git.proxmox.com Git - proxmox-backup.git/blame - src/backup/verify.rs
verify: acquire shared snapshot flock and skip on error
[proxmox-backup.git] / src / backup / verify.rs
CommitLineData
2aaae970 1use std::collections::HashSet;
6b809ff5
DM
2use std::sync::{Arc, Mutex};
3use std::sync::atomic::{Ordering, AtomicUsize};
4use std::time::Instant;
2aaae970 5
3b2046d2 6use anyhow::{bail, format_err, Error};
c2009e53 7
ee7a308d 8use crate::{
ee7a308d
DM
9 api2::types::*,
10 backup::{
11 DataStore,
12 DataBlob,
13 BackupGroup,
14 BackupDir,
15 BackupInfo,
16 IndexFile,
17 CryptMode,
18 FileInfo,
19 ArchiveType,
20 archive_type,
21 },
f6b1d1cc
WB
22 server::UPID,
23 task::TaskState,
24 task_log,
25 tools::ParallelHandler,
bfa54f2e 26 tools::fs::lock_dir_noblock_shared,
c2009e53
DM
27};
28
6b809ff5 29fn verify_blob(datastore: Arc<DataStore>, backup_dir: &BackupDir, info: &FileInfo) -> Result<(), Error> {
c2009e53 30
39f18b30 31 let blob = datastore.load_blob(backup_dir, &info.filename)?;
c2009e53 32
2aaae970 33 let raw_size = blob.raw_size();
c2009e53
DM
34 if raw_size != info.size {
35 bail!("wrong size ({} != {})", info.size, raw_size);
36 }
37
39f18b30 38 let csum = openssl::sha::sha256(blob.raw_data());
c2009e53
DM
39 if csum != info.csum {
40 bail!("wrong index checksum");
41 }
42
8819d1f2
FG
43 match blob.crypt_mode()? {
44 CryptMode::Encrypt => Ok(()),
45 CryptMode::None => {
46 // digest already verified above
47 blob.decode(None, None)?;
48 Ok(())
49 },
50 CryptMode::SignOnly => bail!("Invalid CryptMode for blob"),
c2009e53 51 }
c2009e53
DM
52}
53
0f3b7efa
SR
54fn rename_corrupted_chunk(
55 datastore: Arc<DataStore>,
56 digest: &[u8;32],
f6b1d1cc 57 worker: &dyn TaskState,
0f3b7efa
SR
58) {
59 let (path, digest_str) = datastore.chunk_path(digest);
60
61 let mut counter = 0;
62 let mut new_path = path.clone();
aadcc281 63 loop {
0f3b7efa 64 new_path.set_file_name(format!("{}.{}.bad", digest_str, counter));
aadcc281 65 if new_path.exists() && counter < 9 { counter += 1; } else { break; }
0f3b7efa
SR
66 }
67
68 match std::fs::rename(&path, &new_path) {
69 Ok(_) => {
f6b1d1cc 70 task_log!(worker, "corrupted chunk renamed to {:?}", &new_path);
0f3b7efa
SR
71 },
72 Err(err) => {
73 match err.kind() {
74 std::io::ErrorKind::NotFound => { /* ignored */ },
f6b1d1cc 75 _ => task_log!(worker, "could not rename corrupted chunk {:?} - {}", &path, err)
0f3b7efa
SR
76 }
77 }
78 };
79}
80
fdaab0df 81fn verify_index_chunks(
6b809ff5
DM
82 datastore: Arc<DataStore>,
83 index: Box<dyn IndexFile + Send>,
84 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
85 corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
9a38fa29 86 crypt_mode: CryptMode,
f6b1d1cc 87 worker: Arc<dyn TaskState + Send + Sync>,
fdaab0df
DM
88) -> Result<(), Error> {
89
a71bc08f 90 let errors = Arc::new(AtomicUsize::new(0));
fdaab0df 91
6b809ff5 92 let start_time = Instant::now();
fdaab0df 93
6b809ff5
DM
94 let mut read_bytes = 0;
95 let mut decoded_bytes = 0;
7ae571e7 96
f21508b9
DM
97 let worker2 = Arc::clone(&worker);
98 let datastore2 = Arc::clone(&datastore);
99 let corrupt_chunks2 = Arc::clone(&corrupt_chunks);
100 let verified_chunks2 = Arc::clone(&verified_chunks);
a71bc08f 101 let errors2 = Arc::clone(&errors);
f21508b9
DM
102
103 let decoder_pool = ParallelHandler::new(
104 "verify chunk decoder", 4,
105 move |(chunk, digest, size): (DataBlob, [u8;32], u64)| {
106 let chunk_crypt_mode = match chunk.crypt_mode() {
107 Err(err) => {
108 corrupt_chunks2.lock().unwrap().insert(digest);
f6b1d1cc 109 task_log!(worker2, "can't verify chunk, unknown CryptMode - {}", err);
f21508b9
DM
110 errors2.fetch_add(1, Ordering::SeqCst);
111 return Ok(());
112 },
113 Ok(mode) => mode,
114 };
115
116 if chunk_crypt_mode != crypt_mode {
f6b1d1cc
WB
117 task_log!(
118 worker2,
f21508b9
DM
119 "chunk CryptMode {:?} does not match index CryptMode {:?}",
120 chunk_crypt_mode,
121 crypt_mode
f6b1d1cc 122 );
f21508b9
DM
123 errors2.fetch_add(1, Ordering::SeqCst);
124 }
125
126 if let Err(err) = chunk.verify_unencrypted(size as usize, &digest) {
127 corrupt_chunks2.lock().unwrap().insert(digest);
f6b1d1cc 128 task_log!(worker2, "{}", err);
f21508b9 129 errors2.fetch_add(1, Ordering::SeqCst);
f6b1d1cc 130 rename_corrupted_chunk(datastore2.clone(), &digest, &worker2);
f21508b9
DM
131 } else {
132 verified_chunks2.lock().unwrap().insert(digest);
133 }
134
135 Ok(())
136 }
137 );
138
139 for pos in 0..index.index_count() {
2aaae970 140
f6b1d1cc 141 worker.check_abort()?;
deef6369 142 crate::tools::fail_on_shutdown()?;
6b809ff5 143
f21508b9
DM
144 let info = index.chunk_info(pos).unwrap();
145 let size = info.size();
9a38fa29 146
f21508b9
DM
147 if verified_chunks.lock().unwrap().contains(&info.digest) {
148 continue; // already verified
149 }
6b809ff5 150
f21508b9
DM
151 if corrupt_chunks.lock().unwrap().contains(&info.digest) {
152 let digest_str = proxmox::tools::digest_to_hex(&info.digest);
f6b1d1cc 153 task_log!(worker, "chunk {} was marked as corrupt", digest_str);
6b809ff5 154 errors.fetch_add(1, Ordering::SeqCst);
f21508b9 155 continue;
9a38fa29
FG
156 }
157
f21508b9
DM
158 match datastore.load_chunk(&info.digest) {
159 Err(err) => {
160 corrupt_chunks.lock().unwrap().insert(info.digest);
f6b1d1cc 161 task_log!(worker, "can't verify chunk, load failed - {}", err);
f21508b9 162 errors.fetch_add(1, Ordering::SeqCst);
f6b1d1cc 163 rename_corrupted_chunk(datastore.clone(), &info.digest, &worker);
f21508b9
DM
164 continue;
165 }
166 Ok(chunk) => {
167 read_bytes += chunk.raw_size();
168 decoder_pool.send((chunk, info.digest, size))?;
169 decoded_bytes += size;
170 }
2aaae970 171 }
fdaab0df
DM
172 }
173
f21508b9
DM
174 decoder_pool.complete()?;
175
6b809ff5
DM
176 let elapsed = start_time.elapsed().as_secs_f64();
177
178 let read_bytes_mib = (read_bytes as f64)/(1024.0*1024.0);
179 let decoded_bytes_mib = (decoded_bytes as f64)/(1024.0*1024.0);
180
181 let read_speed = read_bytes_mib/elapsed;
182 let decode_speed = decoded_bytes_mib/elapsed;
183
184 let error_count = errors.load(Ordering::SeqCst);
185
f6b1d1cc
WB
186 task_log!(
187 worker,
188 " verified {:.2}/{:.2} MiB in {:.2} seconds, speed {:.2}/{:.2} MiB/s ({} errors)",
189 read_bytes_mib,
190 decoded_bytes_mib,
191 elapsed,
192 read_speed,
193 decode_speed,
194 error_count,
195 );
6b809ff5
DM
196
197 if errors.load(Ordering::SeqCst) > 0 {
f66f537d
DC
198 bail!("chunks could not be verified");
199 }
200
fdaab0df
DM
201 Ok(())
202}
203
2aaae970 204fn verify_fixed_index(
6b809ff5 205 datastore: Arc<DataStore>,
2aaae970
DM
206 backup_dir: &BackupDir,
207 info: &FileInfo,
6b809ff5
DM
208 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
209 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
f6b1d1cc 210 worker: Arc<dyn TaskState + Send + Sync>,
2aaae970 211) -> Result<(), Error> {
c2009e53
DM
212
213 let mut path = backup_dir.relative_path();
214 path.push(&info.filename);
215
216 let index = datastore.open_fixed_reader(&path)?;
217
218 let (csum, size) = index.compute_csum();
219 if size != info.size {
220 bail!("wrong size ({} != {})", info.size, size);
221 }
222
223 if csum != info.csum {
224 bail!("wrong index checksum");
225 }
226
f6b1d1cc
WB
227 verify_index_chunks(
228 datastore,
229 Box::new(index),
230 verified_chunks,
231 corrupt_chunks,
232 info.chunk_crypt_mode(),
233 worker,
234 )
c2009e53
DM
235}
236
2aaae970 237fn verify_dynamic_index(
6b809ff5 238 datastore: Arc<DataStore>,
2aaae970
DM
239 backup_dir: &BackupDir,
240 info: &FileInfo,
6b809ff5
DM
241 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
242 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
f6b1d1cc 243 worker: Arc<dyn TaskState + Send + Sync>,
2aaae970
DM
244) -> Result<(), Error> {
245
c2009e53
DM
246 let mut path = backup_dir.relative_path();
247 path.push(&info.filename);
248
249 let index = datastore.open_dynamic_reader(&path)?;
250
251 let (csum, size) = index.compute_csum();
252 if size != info.size {
253 bail!("wrong size ({} != {})", info.size, size);
254 }
255
256 if csum != info.csum {
257 bail!("wrong index checksum");
258 }
259
f6b1d1cc
WB
260 verify_index_chunks(
261 datastore,
262 Box::new(index),
263 verified_chunks,
264 corrupt_chunks,
265 info.chunk_crypt_mode(),
266 worker,
267 )
c2009e53
DM
268}
269
270/// Verify a single backup snapshot
271///
272/// This checks all archives inside a backup snapshot.
273/// Errors are logged to the worker log.
274///
8ea00f6e
DM
275/// Returns
276/// - Ok(true) if verify is successful
277/// - Ok(false) if there were verification errors
278/// - Err(_) if task was aborted
2aaae970 279pub fn verify_backup_dir(
6b809ff5 280 datastore: Arc<DataStore>,
2aaae970 281 backup_dir: &BackupDir,
6b809ff5
DM
282 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
283 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
f6b1d1cc
WB
284 worker: Arc<dyn TaskState + Send + Sync>,
285 upid: UPID,
2aaae970 286) -> Result<bool, Error> {
c2009e53 287
bfa54f2e
SR
288 let _guard_res = lock_dir_noblock_shared(
289 &datastore.snapshot_path(&backup_dir),
290 "snapshot",
291 "locked by another operation");
292 if let Err(err) = _guard_res {
293 task_log!(
294 worker,
295 "SKIPPED: verify {}:{} - could not acquire snapshot lock: {}",
296 datastore.name(),
297 backup_dir,
298 err,
299 );
300 return Ok(true);
301 }
302
3b2046d2 303 let mut manifest = match datastore.load_manifest(&backup_dir) {
ff86ef00 304 Ok((manifest, _)) => manifest,
c2009e53 305 Err(err) => {
f6b1d1cc
WB
306 task_log!(
307 worker,
308 "verify {}:{} - manifest load error: {}",
309 datastore.name(),
310 backup_dir,
311 err,
312 );
8ea00f6e 313 return Ok(false);
c2009e53
DM
314 }
315 };
316
f6b1d1cc 317 task_log!(worker, "verify {}:{}", datastore.name(), backup_dir);
c2009e53
DM
318
319 let mut error_count = 0;
320
d10332a1 321 let mut verify_result = VerifyState::Ok;
c2009e53
DM
322 for info in manifest.files() {
323 let result = proxmox::try_block!({
f6b1d1cc 324 task_log!(worker, " check {}", info.filename);
c2009e53 325 match archive_type(&info.filename)? {
d8594d87
DC
326 ArchiveType::FixedIndex =>
327 verify_fixed_index(
6b809ff5 328 datastore.clone(),
d8594d87
DC
329 &backup_dir,
330 info,
6b809ff5
DM
331 verified_chunks.clone(),
332 corrupt_chunks.clone(),
333 worker.clone(),
d8594d87
DC
334 ),
335 ArchiveType::DynamicIndex =>
336 verify_dynamic_index(
6b809ff5 337 datastore.clone(),
d8594d87
DC
338 &backup_dir,
339 info,
6b809ff5
DM
340 verified_chunks.clone(),
341 corrupt_chunks.clone(),
342 worker.clone(),
d8594d87 343 ),
6b809ff5 344 ArchiveType::Blob => verify_blob(datastore.clone(), &backup_dir, info),
c2009e53
DM
345 }
346 });
8ea00f6e 347
f6b1d1cc 348 worker.check_abort()?;
deef6369 349 crate::tools::fail_on_shutdown()?;
8ea00f6e 350
c2009e53 351 if let Err(err) = result {
f6b1d1cc
WB
352 task_log!(
353 worker,
354 "verify {}:{}/{} failed: {}",
355 datastore.name(),
356 backup_dir,
357 info.filename,
358 err,
359 );
c2009e53 360 error_count += 1;
d10332a1 361 verify_result = VerifyState::Failed;
c2009e53 362 }
3b2046d2 363
c2009e53
DM
364 }
365
3b2046d2 366 let verify_state = SnapshotVerifyState {
d10332a1 367 state: verify_result,
f6b1d1cc 368 upid,
3b2046d2
TL
369 };
370 manifest.unprotected["verify_state"] = serde_json::to_value(verify_state)?;
371 datastore.store_manifest(&backup_dir, serde_json::to_value(manifest)?)
372 .map_err(|err| format_err!("unable to store manifest blob - {}", err))?;
373
8ea00f6e 374 Ok(error_count == 0)
c2009e53
DM
375}
376
8ea00f6e
DM
377/// Verify all backups inside a backup group
378///
379/// Errors are logged to the worker log.
380///
381/// Returns
63d9aca9 382/// - Ok((count, failed_dirs)) where failed_dirs had verification errors
8ea00f6e 383/// - Err(_) if task was aborted
4f09d310
DM
384pub fn verify_backup_group(
385 datastore: Arc<DataStore>,
386 group: &BackupGroup,
387 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
388 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
63d9aca9 389 progress: Option<(usize, usize)>, // (done, snapshot_count)
f6b1d1cc
WB
390 worker: Arc<dyn TaskState + Send + Sync>,
391 upid: &UPID,
63d9aca9 392) -> Result<(usize, Vec<String>), Error> {
c2009e53 393
adfdc369 394 let mut errors = Vec::new();
c2009e53
DM
395 let mut list = match group.list_backups(&datastore.base_path()) {
396 Ok(list) => list,
397 Err(err) => {
f6b1d1cc
WB
398 task_log!(
399 worker,
400 "verify group {}:{} - unable to list backups: {}",
401 datastore.name(),
402 group,
403 err,
404 );
63d9aca9 405 return Ok((0, errors));
c2009e53
DM
406 }
407 };
408
f6b1d1cc 409 task_log!(worker, "verify group {}:{}", datastore.name(), group);
c2009e53 410
63d9aca9
DM
411 let (done, snapshot_count) = progress.unwrap_or((0, list.len()));
412
413 let mut count = 0;
c2009e53
DM
414 BackupInfo::sort_list(&mut list, false); // newest first
415 for info in list {
63d9aca9 416 count += 1;
f6b1d1cc
WB
417 if !verify_backup_dir(
418 datastore.clone(),
419 &info.backup_dir,
420 verified_chunks.clone(),
421 corrupt_chunks.clone(),
422 worker.clone(),
423 upid.clone(),
424 )? {
adfdc369 425 errors.push(info.backup_dir.to_string());
c2009e53 426 }
63d9aca9
DM
427 if snapshot_count != 0 {
428 let pos = done + count;
429 let percentage = ((pos as f64) * 100.0)/(snapshot_count as f64);
f6b1d1cc
WB
430 task_log!(
431 worker,
432 "percentage done: {:.2}% ({} of {} snapshots)",
433 percentage,
434 pos,
435 snapshot_count,
436 );
63d9aca9 437 }
c2009e53
DM
438 }
439
63d9aca9 440 Ok((count, errors))
c2009e53
DM
441}
442
8ea00f6e
DM
443/// Verify all backups inside a datastore
444///
445/// Errors are logged to the worker log.
446///
447/// Returns
adfdc369 448/// - Ok(failed_dirs) where failed_dirs had verification errors
8ea00f6e 449/// - Err(_) if task was aborted
f6b1d1cc
WB
450pub fn verify_all_backups(
451 datastore: Arc<DataStore>,
452 worker: Arc<dyn TaskState + Send + Sync>,
453 upid: &UPID,
454) -> Result<Vec<String>, Error> {
adfdc369 455 let mut errors = Vec::new();
c2009e53 456
4264c502 457 let mut list = match BackupGroup::list_groups(&datastore.base_path()) {
5656888c
DM
458 Ok(list) => list
459 .into_iter()
460 .filter(|group| !(group.backup_type() == "host" && group.backup_id() == "benchmark"))
461 .collect::<Vec<BackupGroup>>(),
c2009e53 462 Err(err) => {
f6b1d1cc
WB
463 task_log!(
464 worker,
465 "verify datastore {} - unable to list backups: {}",
466 datastore.name(),
467 err,
468 );
adfdc369 469 return Ok(errors);
c2009e53
DM
470 }
471 };
472
4264c502
DM
473 list.sort_unstable();
474
63d9aca9
DM
475 let mut snapshot_count = 0;
476 for group in list.iter() {
477 snapshot_count += group.list_backups(&datastore.base_path())?.len();
478 }
479
4f09d310
DM
480 // start with 16384 chunks (up to 65GB)
481 let verified_chunks = Arc::new(Mutex::new(HashSet::with_capacity(1024*16)));
482
483 // start with 64 chunks since we assume there are few corrupt ones
484 let corrupt_chunks = Arc::new(Mutex::new(HashSet::with_capacity(64)));
485
f6b1d1cc 486 task_log!(worker, "verify datastore {} ({} snapshots)", datastore.name(), snapshot_count);
c2009e53 487
63d9aca9 488 let mut done = 0;
c2009e53 489 for group in list {
63d9aca9 490 let (count, mut group_errors) = verify_backup_group(
4f09d310
DM
491 datastore.clone(),
492 &group,
493 verified_chunks.clone(),
494 corrupt_chunks.clone(),
63d9aca9 495 Some((done, snapshot_count)),
4f09d310 496 worker.clone(),
f6b1d1cc 497 upid,
4f09d310 498 )?;
adfdc369 499 errors.append(&mut group_errors);
63d9aca9
DM
500
501 done += count;
c2009e53
DM
502 }
503
adfdc369 504 Ok(errors)
c2009e53 505}