]> git.proxmox.com Git - proxmox-backup.git/blame - src/backup/verify.rs
reader: acquire shared flock on open snapshot
[proxmox-backup.git] / src / backup / verify.rs
CommitLineData
2aaae970 1use std::collections::HashSet;
6b809ff5
DM
2use std::sync::{Arc, Mutex};
3use std::sync::atomic::{Ordering, AtomicUsize};
4use std::time::Instant;
2aaae970 5
3b2046d2 6use anyhow::{bail, format_err, Error};
c2009e53 7
ee7a308d 8use crate::{
ee7a308d
DM
9 api2::types::*,
10 backup::{
11 DataStore,
12 DataBlob,
13 BackupGroup,
14 BackupDir,
15 BackupInfo,
16 IndexFile,
17 CryptMode,
18 FileInfo,
19 ArchiveType,
20 archive_type,
21 },
f6b1d1cc
WB
22 server::UPID,
23 task::TaskState,
24 task_log,
25 tools::ParallelHandler,
c2009e53
DM
26};
27
6b809ff5 28fn verify_blob(datastore: Arc<DataStore>, backup_dir: &BackupDir, info: &FileInfo) -> Result<(), Error> {
c2009e53 29
39f18b30 30 let blob = datastore.load_blob(backup_dir, &info.filename)?;
c2009e53 31
2aaae970 32 let raw_size = blob.raw_size();
c2009e53
DM
33 if raw_size != info.size {
34 bail!("wrong size ({} != {})", info.size, raw_size);
35 }
36
39f18b30 37 let csum = openssl::sha::sha256(blob.raw_data());
c2009e53
DM
38 if csum != info.csum {
39 bail!("wrong index checksum");
40 }
41
8819d1f2
FG
42 match blob.crypt_mode()? {
43 CryptMode::Encrypt => Ok(()),
44 CryptMode::None => {
45 // digest already verified above
46 blob.decode(None, None)?;
47 Ok(())
48 },
49 CryptMode::SignOnly => bail!("Invalid CryptMode for blob"),
c2009e53 50 }
c2009e53
DM
51}
52
0f3b7efa
SR
53fn rename_corrupted_chunk(
54 datastore: Arc<DataStore>,
55 digest: &[u8;32],
f6b1d1cc 56 worker: &dyn TaskState,
0f3b7efa
SR
57) {
58 let (path, digest_str) = datastore.chunk_path(digest);
59
60 let mut counter = 0;
61 let mut new_path = path.clone();
aadcc281 62 loop {
0f3b7efa 63 new_path.set_file_name(format!("{}.{}.bad", digest_str, counter));
aadcc281 64 if new_path.exists() && counter < 9 { counter += 1; } else { break; }
0f3b7efa
SR
65 }
66
67 match std::fs::rename(&path, &new_path) {
68 Ok(_) => {
f6b1d1cc 69 task_log!(worker, "corrupted chunk renamed to {:?}", &new_path);
0f3b7efa
SR
70 },
71 Err(err) => {
72 match err.kind() {
73 std::io::ErrorKind::NotFound => { /* ignored */ },
f6b1d1cc 74 _ => task_log!(worker, "could not rename corrupted chunk {:?} - {}", &path, err)
0f3b7efa
SR
75 }
76 }
77 };
78}
79
/// Verify every chunk referenced by an index file.
///
/// Chunks are loaded sequentially on the calling thread and handed to a
/// 4-worker `ParallelHandler` pool which checks each chunk's CryptMode,
/// digest and size. Chunks already in `verified_chunks` are skipped;
/// chunks already in `corrupt_chunks` are counted as errors without
/// being re-read. Chunks failing verification are renamed to `*.bad`
/// via `rename_corrupted_chunk`.
///
/// Returns `Err` if the task was aborted / shut down, if sending to the
/// pool failed, or — after processing everything — if any chunk error
/// was recorded ("chunks could not be verified").
fn verify_index_chunks(
    datastore: Arc<DataStore>,
    index: Box<dyn IndexFile + Send>,
    verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
    corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
    crypt_mode: CryptMode,
    worker: Arc<dyn TaskState + Send + Sync>,
) -> Result<(), Error> {

    // shared error counter, bumped from this thread and from pool workers
    let errors = Arc::new(AtomicUsize::new(0));

    let start_time = Instant::now();

    let mut read_bytes = 0;
    let mut decoded_bytes = 0;

    // clones moved into the decoder closure below
    let worker2 = Arc::clone(&worker);
    let datastore2 = Arc::clone(&datastore);
    let corrupt_chunks2 = Arc::clone(&corrupt_chunks);
    let verified_chunks2 = Arc::clone(&verified_chunks);
    let errors2 = Arc::clone(&errors);

    let decoder_pool = ParallelHandler::new(
        "verify chunk decoder", 4,
        move |(chunk, digest, size): (DataBlob, [u8;32], u64)| {
            // an unreadable CryptMode marks the chunk corrupt but does not
            // abort the pool — return Ok(()) so other chunks keep flowing
            let chunk_crypt_mode = match chunk.crypt_mode() {
                Err(err) => {
                    corrupt_chunks2.lock().unwrap().insert(digest);
                    task_log!(worker2, "can't verify chunk, unknown CryptMode - {}", err);
                    errors2.fetch_add(1, Ordering::SeqCst);
                    return Ok(());
                },
                Ok(mode) => mode,
            };

            // a mode mismatch is logged and counted, but the chunk data
            // itself is still verified below
            if chunk_crypt_mode != crypt_mode {
                task_log!(
                    worker2,
                    "chunk CryptMode {:?} does not match index CryptMode {:?}",
                    chunk_crypt_mode,
                    crypt_mode
                );
                errors2.fetch_add(1, Ordering::SeqCst);
            }

            if let Err(err) = chunk.verify_unencrypted(size as usize, &digest) {
                corrupt_chunks2.lock().unwrap().insert(digest);
                task_log!(worker2, "{}", err);
                errors2.fetch_add(1, Ordering::SeqCst);
                rename_corrupted_chunk(datastore2.clone(), &digest, &worker2);
            } else {
                verified_chunks2.lock().unwrap().insert(digest);
            }

            Ok(())
        }
    );

    for pos in 0..index.index_count() {

        // honor task aborts and daemon shutdown between chunks
        worker.check_abort()?;
        crate::tools::fail_on_shutdown()?;

        let info = index.chunk_info(pos).unwrap();
        let size = info.size();

        if verified_chunks.lock().unwrap().contains(&info.digest) {
            continue; // already verified
        }

        // known-corrupt chunks are counted again per referencing index,
        // but not re-read from disk
        if corrupt_chunks.lock().unwrap().contains(&info.digest) {
            let digest_str = proxmox::tools::digest_to_hex(&info.digest);
            task_log!(worker, "chunk {} was marked as corrupt", digest_str);
            errors.fetch_add(1, Ordering::SeqCst);
            continue;
        }

        // load here (sequential I/O), verify on the pool (parallel CPU work)
        match datastore.load_chunk(&info.digest) {
            Err(err) => {
                corrupt_chunks.lock().unwrap().insert(info.digest);
                task_log!(worker, "can't verify chunk, load failed - {}", err);
                errors.fetch_add(1, Ordering::SeqCst);
                rename_corrupted_chunk(datastore.clone(), &info.digest, &worker);
                continue;
            }
            Ok(chunk) => {
                read_bytes += chunk.raw_size();
                decoder_pool.send((chunk, info.digest, size))?;
                decoded_bytes += size;
            }
        }
    }

    // wait for all outstanding verifications before reading the counters
    decoder_pool.complete()?;

    let elapsed = start_time.elapsed().as_secs_f64();

    let read_bytes_mib = (read_bytes as f64)/(1024.0*1024.0);
    let decoded_bytes_mib = (decoded_bytes as f64)/(1024.0*1024.0);

    let read_speed = read_bytes_mib/elapsed;
    let decode_speed = decoded_bytes_mib/elapsed;

    let error_count = errors.load(Ordering::SeqCst);

    task_log!(
        worker,
        "  verified {:.2}/{:.2} MiB in {:.2} seconds, speed {:.2}/{:.2} MiB/s ({} errors)",
        read_bytes_mib,
        decoded_bytes_mib,
        elapsed,
        read_speed,
        decode_speed,
        error_count,
    );

    if errors.load(Ordering::SeqCst) > 0 {
        bail!("chunks could not be verified");
    }

    Ok(())
}
202
2aaae970 203fn verify_fixed_index(
6b809ff5 204 datastore: Arc<DataStore>,
2aaae970
DM
205 backup_dir: &BackupDir,
206 info: &FileInfo,
6b809ff5
DM
207 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
208 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
f6b1d1cc 209 worker: Arc<dyn TaskState + Send + Sync>,
2aaae970 210) -> Result<(), Error> {
c2009e53
DM
211
212 let mut path = backup_dir.relative_path();
213 path.push(&info.filename);
214
215 let index = datastore.open_fixed_reader(&path)?;
216
217 let (csum, size) = index.compute_csum();
218 if size != info.size {
219 bail!("wrong size ({} != {})", info.size, size);
220 }
221
222 if csum != info.csum {
223 bail!("wrong index checksum");
224 }
225
f6b1d1cc
WB
226 verify_index_chunks(
227 datastore,
228 Box::new(index),
229 verified_chunks,
230 corrupt_chunks,
231 info.chunk_crypt_mode(),
232 worker,
233 )
c2009e53
DM
234}
235
2aaae970 236fn verify_dynamic_index(
6b809ff5 237 datastore: Arc<DataStore>,
2aaae970
DM
238 backup_dir: &BackupDir,
239 info: &FileInfo,
6b809ff5
DM
240 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
241 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
f6b1d1cc 242 worker: Arc<dyn TaskState + Send + Sync>,
2aaae970
DM
243) -> Result<(), Error> {
244
c2009e53
DM
245 let mut path = backup_dir.relative_path();
246 path.push(&info.filename);
247
248 let index = datastore.open_dynamic_reader(&path)?;
249
250 let (csum, size) = index.compute_csum();
251 if size != info.size {
252 bail!("wrong size ({} != {})", info.size, size);
253 }
254
255 if csum != info.csum {
256 bail!("wrong index checksum");
257 }
258
f6b1d1cc
WB
259 verify_index_chunks(
260 datastore,
261 Box::new(index),
262 verified_chunks,
263 corrupt_chunks,
264 info.chunk_crypt_mode(),
265 worker,
266 )
c2009e53
DM
267}
268
/// Verify a single backup snapshot
///
/// This checks all archives inside a backup snapshot.
/// Errors are logged to the worker log.
///
/// Returns
/// - Ok(true) if verify is successful
/// - Ok(false) if there were verification errors
/// - Err(_) if task was aborted
pub fn verify_backup_dir(
    datastore: Arc<DataStore>,
    backup_dir: &BackupDir,
    verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
    corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
    worker: Arc<dyn TaskState + Send + Sync>,
    upid: UPID,
) -> Result<bool, Error> {

    // without a readable manifest the snapshot cannot be verified at all;
    // log the problem and report verification failure (not a task error)
    let mut manifest = match datastore.load_manifest(&backup_dir) {
        Ok((manifest, _)) => manifest,
        Err(err) => {
            task_log!(
                worker,
                "verify {}:{} - manifest load error: {}",
                datastore.name(),
                backup_dir,
                err,
            );
            return Ok(false);
        }
    };

    task_log!(worker, "verify {}:{}", datastore.name(), backup_dir);

    let mut error_count = 0;

    let mut verify_result = VerifyState::Ok;
    for info in manifest.files() {
        let result = proxmox::try_block!({
            task_log!(worker, "  check {}", info.filename);
            // dispatch on the archive type encoded in the file name suffix
            match archive_type(&info.filename)? {
                ArchiveType::FixedIndex =>
                    verify_fixed_index(
                        datastore.clone(),
                        &backup_dir,
                        info,
                        verified_chunks.clone(),
                        corrupt_chunks.clone(),
                        worker.clone(),
                    ),
                ArchiveType::DynamicIndex =>
                    verify_dynamic_index(
                        datastore.clone(),
                        &backup_dir,
                        info,
                        verified_chunks.clone(),
                        corrupt_chunks.clone(),
                        worker.clone(),
                    ),
                ArchiveType::Blob => verify_blob(datastore.clone(), &backup_dir, info),
            }
        });

        // abort/shutdown is checked between archives, not only between snapshots
        worker.check_abort()?;
        crate::tools::fail_on_shutdown()?;

        // per-archive errors are collected; verification continues with the
        // remaining files of this snapshot
        if let Err(err) = result {
            task_log!(
                worker,
                "verify {}:{}/{} failed: {}",
                datastore.name(),
                backup_dir,
                info.filename,
                err,
            );
            error_count += 1;
            verify_result = VerifyState::Failed;
        }

    }

    // persist the outcome (plus the verifying task's UPID) in the
    // manifest's unprotected section so the GUI/API can display it
    let verify_state = SnapshotVerifyState {
        state: verify_result,
        upid,
    };
    manifest.unprotected["verify_state"] = serde_json::to_value(verify_state)?;
    datastore.store_manifest(&backup_dir, serde_json::to_value(manifest)?)
        .map_err(|err| format_err!("unable to store manifest blob - {}", err))?;

    Ok(error_count == 0)
}
360
8ea00f6e
DM
361/// Verify all backups inside a backup group
362///
363/// Errors are logged to the worker log.
364///
365/// Returns
63d9aca9 366/// - Ok((count, failed_dirs)) where failed_dirs had verification errors
8ea00f6e 367/// - Err(_) if task was aborted
4f09d310
DM
368pub fn verify_backup_group(
369 datastore: Arc<DataStore>,
370 group: &BackupGroup,
371 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
372 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
63d9aca9 373 progress: Option<(usize, usize)>, // (done, snapshot_count)
f6b1d1cc
WB
374 worker: Arc<dyn TaskState + Send + Sync>,
375 upid: &UPID,
63d9aca9 376) -> Result<(usize, Vec<String>), Error> {
c2009e53 377
adfdc369 378 let mut errors = Vec::new();
c2009e53
DM
379 let mut list = match group.list_backups(&datastore.base_path()) {
380 Ok(list) => list,
381 Err(err) => {
f6b1d1cc
WB
382 task_log!(
383 worker,
384 "verify group {}:{} - unable to list backups: {}",
385 datastore.name(),
386 group,
387 err,
388 );
63d9aca9 389 return Ok((0, errors));
c2009e53
DM
390 }
391 };
392
f6b1d1cc 393 task_log!(worker, "verify group {}:{}", datastore.name(), group);
c2009e53 394
63d9aca9
DM
395 let (done, snapshot_count) = progress.unwrap_or((0, list.len()));
396
397 let mut count = 0;
c2009e53
DM
398 BackupInfo::sort_list(&mut list, false); // newest first
399 for info in list {
63d9aca9 400 count += 1;
f6b1d1cc
WB
401 if !verify_backup_dir(
402 datastore.clone(),
403 &info.backup_dir,
404 verified_chunks.clone(),
405 corrupt_chunks.clone(),
406 worker.clone(),
407 upid.clone(),
408 )? {
adfdc369 409 errors.push(info.backup_dir.to_string());
c2009e53 410 }
63d9aca9
DM
411 if snapshot_count != 0 {
412 let pos = done + count;
413 let percentage = ((pos as f64) * 100.0)/(snapshot_count as f64);
f6b1d1cc
WB
414 task_log!(
415 worker,
416 "percentage done: {:.2}% ({} of {} snapshots)",
417 percentage,
418 pos,
419 snapshot_count,
420 );
63d9aca9 421 }
c2009e53
DM
422 }
423
63d9aca9 424 Ok((count, errors))
c2009e53
DM
425}
426
8ea00f6e
DM
427/// Verify all backups inside a datastore
428///
429/// Errors are logged to the worker log.
430///
431/// Returns
adfdc369 432/// - Ok(failed_dirs) where failed_dirs had verification errors
8ea00f6e 433/// - Err(_) if task was aborted
f6b1d1cc
WB
434pub fn verify_all_backups(
435 datastore: Arc<DataStore>,
436 worker: Arc<dyn TaskState + Send + Sync>,
437 upid: &UPID,
438) -> Result<Vec<String>, Error> {
adfdc369 439 let mut errors = Vec::new();
c2009e53 440
4264c502 441 let mut list = match BackupGroup::list_groups(&datastore.base_path()) {
5656888c
DM
442 Ok(list) => list
443 .into_iter()
444 .filter(|group| !(group.backup_type() == "host" && group.backup_id() == "benchmark"))
445 .collect::<Vec<BackupGroup>>(),
c2009e53 446 Err(err) => {
f6b1d1cc
WB
447 task_log!(
448 worker,
449 "verify datastore {} - unable to list backups: {}",
450 datastore.name(),
451 err,
452 );
adfdc369 453 return Ok(errors);
c2009e53
DM
454 }
455 };
456
4264c502
DM
457 list.sort_unstable();
458
63d9aca9
DM
459 let mut snapshot_count = 0;
460 for group in list.iter() {
461 snapshot_count += group.list_backups(&datastore.base_path())?.len();
462 }
463
4f09d310
DM
464 // start with 16384 chunks (up to 65GB)
465 let verified_chunks = Arc::new(Mutex::new(HashSet::with_capacity(1024*16)));
466
467 // start with 64 chunks since we assume there are few corrupt ones
468 let corrupt_chunks = Arc::new(Mutex::new(HashSet::with_capacity(64)));
469
f6b1d1cc 470 task_log!(worker, "verify datastore {} ({} snapshots)", datastore.name(), snapshot_count);
c2009e53 471
63d9aca9 472 let mut done = 0;
c2009e53 473 for group in list {
63d9aca9 474 let (count, mut group_errors) = verify_backup_group(
4f09d310
DM
475 datastore.clone(),
476 &group,
477 verified_chunks.clone(),
478 corrupt_chunks.clone(),
63d9aca9 479 Some((done, snapshot_count)),
4f09d310 480 worker.clone(),
f6b1d1cc 481 upid,
4f09d310 482 )?;
adfdc369 483 errors.append(&mut group_errors);
63d9aca9
DM
484
485 done += count;
c2009e53
DM
486 }
487
adfdc369 488 Ok(errors)
c2009e53 489}