]> git.proxmox.com Git - proxmox-backup.git/blame - src/backup/verify.rs
introduce TaskState trait
[proxmox-backup.git] / src / backup / verify.rs
CommitLineData
2aaae970 1use std::collections::HashSet;
6b809ff5
DM
2use std::sync::{Arc, Mutex};
3use std::sync::atomic::{Ordering, AtomicUsize};
4use std::time::Instant;
2aaae970 5
3b2046d2 6use anyhow::{bail, format_err, Error};
c2009e53 7
ee7a308d
DM
8use crate::{
9 server::WorkerTask,
10 api2::types::*,
f21508b9 11 tools::ParallelHandler,
ee7a308d
DM
12 backup::{
13 DataStore,
14 DataBlob,
15 BackupGroup,
16 BackupDir,
17 BackupInfo,
18 IndexFile,
19 CryptMode,
20 FileInfo,
21 ArchiveType,
22 archive_type,
23 },
c2009e53
DM
24};
25
6b809ff5 26fn verify_blob(datastore: Arc<DataStore>, backup_dir: &BackupDir, info: &FileInfo) -> Result<(), Error> {
c2009e53 27
39f18b30 28 let blob = datastore.load_blob(backup_dir, &info.filename)?;
c2009e53 29
2aaae970 30 let raw_size = blob.raw_size();
c2009e53
DM
31 if raw_size != info.size {
32 bail!("wrong size ({} != {})", info.size, raw_size);
33 }
34
39f18b30 35 let csum = openssl::sha::sha256(blob.raw_data());
c2009e53
DM
36 if csum != info.csum {
37 bail!("wrong index checksum");
38 }
39
8819d1f2
FG
40 match blob.crypt_mode()? {
41 CryptMode::Encrypt => Ok(()),
42 CryptMode::None => {
43 // digest already verified above
44 blob.decode(None, None)?;
45 Ok(())
46 },
47 CryptMode::SignOnly => bail!("Invalid CryptMode for blob"),
c2009e53 48 }
c2009e53
DM
49}
50
0f3b7efa
SR
51fn rename_corrupted_chunk(
52 datastore: Arc<DataStore>,
53 digest: &[u8;32],
54 worker: Arc<WorkerTask>,
55) {
56 let (path, digest_str) = datastore.chunk_path(digest);
57
58 let mut counter = 0;
59 let mut new_path = path.clone();
aadcc281 60 loop {
0f3b7efa 61 new_path.set_file_name(format!("{}.{}.bad", digest_str, counter));
aadcc281 62 if new_path.exists() && counter < 9 { counter += 1; } else { break; }
0f3b7efa
SR
63 }
64
65 match std::fs::rename(&path, &new_path) {
66 Ok(_) => {
67 worker.log(format!("corrupted chunk renamed to {:?}", &new_path));
68 },
69 Err(err) => {
70 match err.kind() {
71 std::io::ErrorKind::NotFound => { /* ignored */ },
72 _ => worker.log(format!("could not rename corrupted chunk {:?} - {}", &path, err))
73 }
74 }
75 };
76}
77
/// Verify every chunk referenced by an index file.
///
/// Chunks are loaded sequentially on this thread (I/O) and handed to a
/// 4-worker `ParallelHandler` pool that does the CPU-heavy digest/CRC
/// verification. `verified_chunks` / `corrupt_chunks` are shared across
/// archives of the whole verify task so each chunk is only checked once.
///
/// Returns `Err(_)` on task abort/shutdown or when any chunk failed to
/// verify; per-chunk problems are logged and counted first.
fn verify_index_chunks(
    datastore: Arc<DataStore>,
    index: Box<dyn IndexFile + Send>,
    verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
    corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
    crypt_mode: CryptMode,
    worker: Arc<WorkerTask>,
) -> Result<(), Error> {

    // shared error counter - incremented from both this thread and pool workers
    let errors = Arc::new(AtomicUsize::new(0));

    let start_time = Instant::now();

    let mut read_bytes = 0;
    let mut decoded_bytes = 0;

    // clones moved into the decoder-pool closure below
    let worker2 = Arc::clone(&worker);
    let datastore2 = Arc::clone(&datastore);
    let corrupt_chunks2 = Arc::clone(&corrupt_chunks);
    let verified_chunks2 = Arc::clone(&verified_chunks);
    let errors2 = Arc::clone(&errors);

    let decoder_pool = ParallelHandler::new(
        "verify chunk decoder", 4,
        move |(chunk, digest, size): (DataBlob, [u8;32], u64)| {
            // an unreadable crypt mode means the chunk header is damaged
            let chunk_crypt_mode = match chunk.crypt_mode() {
                Err(err) => {
                    corrupt_chunks2.lock().unwrap().insert(digest);
                    worker2.log(format!("can't verify chunk, unknown CryptMode - {}", err));
                    errors2.fetch_add(1, Ordering::SeqCst);
                    return Ok(());
                },
                Ok(mode) => mode,
            };

            // mismatch is logged and counted, but verification still proceeds
            if chunk_crypt_mode != crypt_mode {
                worker2.log(format!(
                    "chunk CryptMode {:?} does not match index CryptMode {:?}",
                    chunk_crypt_mode,
                    crypt_mode
                ));
                errors2.fetch_add(1, Ordering::SeqCst);
            }

            if let Err(err) = chunk.verify_unencrypted(size as usize, &digest) {
                corrupt_chunks2.lock().unwrap().insert(digest);
                worker2.log(format!("{}", err));
                errors2.fetch_add(1, Ordering::SeqCst);
                rename_corrupted_chunk(datastore2.clone(), &digest, worker2.clone());
            } else {
                verified_chunks2.lock().unwrap().insert(digest);
            }

            Ok(())
        }
    );

    for pos in 0..index.index_count() {

        // abort/shutdown checks once per chunk so long verifies stay responsive
        worker.fail_on_abort()?;
        crate::tools::fail_on_shutdown()?;

        let info = index.chunk_info(pos).unwrap();
        let size = info.size();

        if verified_chunks.lock().unwrap().contains(&info.digest) {
            continue; // already verified
        }

        // previously failed chunks count as an error again for this index
        if corrupt_chunks.lock().unwrap().contains(&info.digest) {
            let digest_str = proxmox::tools::digest_to_hex(&info.digest);
            worker.log(format!("chunk {} was marked as corrupt", digest_str));
            errors.fetch_add(1, Ordering::SeqCst);
            continue;
        }

        match datastore.load_chunk(&info.digest) {
            Err(err) => {
                corrupt_chunks.lock().unwrap().insert(info.digest);
                worker.log(format!("can't verify chunk, load failed - {}", err));
                errors.fetch_add(1, Ordering::SeqCst);
                rename_corrupted_chunk(datastore.clone(), &info.digest, worker.clone());
                continue;
            }
            Ok(chunk) => {
                read_bytes += chunk.raw_size();
                decoder_pool.send((chunk, info.digest, size))?;
                decoded_bytes += size;
            }
        }
    }

    // wait for all pool workers to finish before reading the error counter
    decoder_pool.complete()?;

    let elapsed = start_time.elapsed().as_secs_f64();

    let read_bytes_mib = (read_bytes as f64)/(1024.0*1024.0);
    let decoded_bytes_mib = (decoded_bytes as f64)/(1024.0*1024.0);

    let read_speed = read_bytes_mib/elapsed;
    let decode_speed = decoded_bytes_mib/elapsed;

    let error_count = errors.load(Ordering::SeqCst);

    worker.log(format!(" verified {:.2}/{:.2} MiB in {:.2} seconds, speed {:.2}/{:.2} MiB/s ({} errors)",
        read_bytes_mib, decoded_bytes_mib, elapsed, read_speed, decode_speed, error_count));

    if errors.load(Ordering::SeqCst) > 0 {
        bail!("chunks could not be verified");
    }

    Ok(())
}
191
2aaae970 192fn verify_fixed_index(
6b809ff5 193 datastore: Arc<DataStore>,
2aaae970
DM
194 backup_dir: &BackupDir,
195 info: &FileInfo,
6b809ff5
DM
196 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
197 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
198 worker: Arc<WorkerTask>,
2aaae970 199) -> Result<(), Error> {
c2009e53
DM
200
201 let mut path = backup_dir.relative_path();
202 path.push(&info.filename);
203
204 let index = datastore.open_fixed_reader(&path)?;
205
206 let (csum, size) = index.compute_csum();
207 if size != info.size {
208 bail!("wrong size ({} != {})", info.size, size);
209 }
210
211 if csum != info.csum {
212 bail!("wrong index checksum");
213 }
214
9a38fa29 215 verify_index_chunks(datastore, Box::new(index), verified_chunks, corrupt_chunks, info.chunk_crypt_mode(), worker)
c2009e53
DM
216}
217
2aaae970 218fn verify_dynamic_index(
6b809ff5 219 datastore: Arc<DataStore>,
2aaae970
DM
220 backup_dir: &BackupDir,
221 info: &FileInfo,
6b809ff5
DM
222 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
223 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
224 worker: Arc<WorkerTask>,
2aaae970
DM
225) -> Result<(), Error> {
226
c2009e53
DM
227 let mut path = backup_dir.relative_path();
228 path.push(&info.filename);
229
230 let index = datastore.open_dynamic_reader(&path)?;
231
232 let (csum, size) = index.compute_csum();
233 if size != info.size {
234 bail!("wrong size ({} != {})", info.size, size);
235 }
236
237 if csum != info.csum {
238 bail!("wrong index checksum");
239 }
240
9a38fa29 241 verify_index_chunks(datastore, Box::new(index), verified_chunks, corrupt_chunks, info.chunk_crypt_mode(), worker)
c2009e53
DM
242}
243
/// Verify a single backup snapshot
///
/// This checks all archives inside a backup snapshot.
/// Errors are logged to the worker log.
///
/// Returns
/// - Ok(true) if verify is successful
/// - Ok(false) if there were verification errors
/// - Err(_) if task was aborted
pub fn verify_backup_dir(
    datastore: Arc<DataStore>,
    backup_dir: &BackupDir,
    verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
    corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
    worker: Arc<WorkerTask>
) -> Result<bool, Error> {

    // an unreadable manifest is a verification failure, not a task error
    let mut manifest = match datastore.load_manifest(&backup_dir) {
        Ok((manifest, _)) => manifest,
        Err(err) => {
            worker.log(format!("verify {}:{} - manifest load error: {}", datastore.name(), backup_dir, err));
            return Ok(false);
        }
    };

    worker.log(format!("verify {}:{}", datastore.name(), backup_dir));

    let mut error_count = 0;

    let mut verify_result = VerifyState::Ok;
    for info in manifest.files() {
        // try_block so one failed archive doesn't stop the rest
        let result = proxmox::try_block!({
            worker.log(format!(" check {}", info.filename));
            // dispatch on archive type; each verifier checks size/csum/chunks
            match archive_type(&info.filename)? {
                ArchiveType::FixedIndex =>
                    verify_fixed_index(
                        datastore.clone(),
                        &backup_dir,
                        info,
                        verified_chunks.clone(),
                        corrupt_chunks.clone(),
                        worker.clone(),
                    ),
                ArchiveType::DynamicIndex =>
                    verify_dynamic_index(
                        datastore.clone(),
                        &backup_dir,
                        info,
                        verified_chunks.clone(),
                        corrupt_chunks.clone(),
                        worker.clone(),
                    ),
                ArchiveType::Blob => verify_blob(datastore.clone(), &backup_dir, info),
            }
        });

        // abort/shutdown checked between archives; propagates as Err(_)
        worker.fail_on_abort()?;
        crate::tools::fail_on_shutdown()?;

        if let Err(err) = result {
            worker.log(format!("verify {}:{}/{} failed: {}", datastore.name(), backup_dir, info.filename, err));
            error_count += 1;
            verify_result = VerifyState::Failed;
        }

    }

    // persist the verify outcome in the snapshot manifest so the UI/API
    // can show the last verification state
    let verify_state = SnapshotVerifyState {
        state: verify_result,
        upid: worker.upid().clone(),
    };
    manifest.unprotected["verify_state"] = serde_json::to_value(verify_state)?;
    datastore.store_manifest(&backup_dir, serde_json::to_value(manifest)?)
        .map_err(|err| format_err!("unable to store manifest blob - {}", err))?;

    Ok(error_count == 0)
}
321
8ea00f6e
DM
322/// Verify all backups inside a backup group
323///
324/// Errors are logged to the worker log.
325///
326/// Returns
63d9aca9 327/// - Ok((count, failed_dirs)) where failed_dirs had verification errors
8ea00f6e 328/// - Err(_) if task was aborted
4f09d310
DM
329pub fn verify_backup_group(
330 datastore: Arc<DataStore>,
331 group: &BackupGroup,
332 verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
333 corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
63d9aca9 334 progress: Option<(usize, usize)>, // (done, snapshot_count)
4f09d310 335 worker: Arc<WorkerTask>,
63d9aca9 336) -> Result<(usize, Vec<String>), Error> {
c2009e53 337
adfdc369 338 let mut errors = Vec::new();
c2009e53
DM
339 let mut list = match group.list_backups(&datastore.base_path()) {
340 Ok(list) => list,
341 Err(err) => {
342 worker.log(format!("verify group {}:{} - unable to list backups: {}", datastore.name(), group, err));
63d9aca9 343 return Ok((0, errors));
c2009e53
DM
344 }
345 };
346
347 worker.log(format!("verify group {}:{}", datastore.name(), group));
348
63d9aca9
DM
349 let (done, snapshot_count) = progress.unwrap_or((0, list.len()));
350
351 let mut count = 0;
c2009e53
DM
352 BackupInfo::sort_list(&mut list, false); // newest first
353 for info in list {
63d9aca9 354 count += 1;
6b809ff5 355 if !verify_backup_dir(datastore.clone(), &info.backup_dir, verified_chunks.clone(), corrupt_chunks.clone(), worker.clone())?{
adfdc369 356 errors.push(info.backup_dir.to_string());
c2009e53 357 }
63d9aca9
DM
358 if snapshot_count != 0 {
359 let pos = done + count;
360 let percentage = ((pos as f64) * 100.0)/(snapshot_count as f64);
361 worker.log(format!("percentage done: {:.2}% ({} of {} snapshots)", percentage, pos, snapshot_count));
362 }
c2009e53
DM
363 }
364
63d9aca9 365 Ok((count, errors))
c2009e53
DM
366}
367
8ea00f6e
DM
368/// Verify all backups inside a datastore
369///
370/// Errors are logged to the worker log.
371///
372/// Returns
adfdc369 373/// - Ok(failed_dirs) where failed_dirs had verification errors
8ea00f6e 374/// - Err(_) if task was aborted
6b809ff5 375pub fn verify_all_backups(datastore: Arc<DataStore>, worker: Arc<WorkerTask>) -> Result<Vec<String>, Error> {
adfdc369
DC
376
377 let mut errors = Vec::new();
c2009e53 378
4264c502 379 let mut list = match BackupGroup::list_groups(&datastore.base_path()) {
5656888c
DM
380 Ok(list) => list
381 .into_iter()
382 .filter(|group| !(group.backup_type() == "host" && group.backup_id() == "benchmark"))
383 .collect::<Vec<BackupGroup>>(),
c2009e53
DM
384 Err(err) => {
385 worker.log(format!("verify datastore {} - unable to list backups: {}", datastore.name(), err));
adfdc369 386 return Ok(errors);
c2009e53
DM
387 }
388 };
389
4264c502
DM
390 list.sort_unstable();
391
63d9aca9
DM
392 let mut snapshot_count = 0;
393 for group in list.iter() {
394 snapshot_count += group.list_backups(&datastore.base_path())?.len();
395 }
396
4f09d310
DM
397 // start with 16384 chunks (up to 65GB)
398 let verified_chunks = Arc::new(Mutex::new(HashSet::with_capacity(1024*16)));
399
400 // start with 64 chunks since we assume there are few corrupt ones
401 let corrupt_chunks = Arc::new(Mutex::new(HashSet::with_capacity(64)));
402
63d9aca9 403 worker.log(format!("verify datastore {} ({} snapshots)", datastore.name(), snapshot_count));
c2009e53 404
63d9aca9 405 let mut done = 0;
c2009e53 406 for group in list {
63d9aca9 407 let (count, mut group_errors) = verify_backup_group(
4f09d310
DM
408 datastore.clone(),
409 &group,
410 verified_chunks.clone(),
411 corrupt_chunks.clone(),
63d9aca9 412 Some((done, snapshot_count)),
4f09d310
DM
413 worker.clone(),
414 )?;
adfdc369 415 errors.append(&mut group_errors);
63d9aca9
DM
416
417 done += count;
c2009e53
DM
418 }
419
adfdc369 420 Ok(errors)
c2009e53 421}