]>
Commit | Line | Data |
---|---|---|
2aaae970 | 1 | use std::collections::HashSet; |
6b809ff5 DM |
2 | use std::sync::{Arc, Mutex}; |
3 | use std::sync::atomic::{Ordering, AtomicUsize}; | |
4 | use std::time::Instant; | |
2aaae970 | 5 | |
3b2046d2 | 6 | use anyhow::{bail, format_err, Error}; |
c2009e53 | 7 | |
ee7a308d DM |
8 | use crate::{ |
9 | server::WorkerTask, | |
10 | api2::types::*, | |
f21508b9 | 11 | tools::ParallelHandler, |
ee7a308d DM |
12 | backup::{ |
13 | DataStore, | |
14 | DataBlob, | |
15 | BackupGroup, | |
16 | BackupDir, | |
17 | BackupInfo, | |
18 | IndexFile, | |
19 | CryptMode, | |
20 | FileInfo, | |
21 | ArchiveType, | |
22 | archive_type, | |
23 | }, | |
c2009e53 DM |
24 | }; |
25 | ||
6b809ff5 | 26 | fn verify_blob(datastore: Arc<DataStore>, backup_dir: &BackupDir, info: &FileInfo) -> Result<(), Error> { |
c2009e53 | 27 | |
39f18b30 | 28 | let blob = datastore.load_blob(backup_dir, &info.filename)?; |
c2009e53 | 29 | |
2aaae970 | 30 | let raw_size = blob.raw_size(); |
c2009e53 DM |
31 | if raw_size != info.size { |
32 | bail!("wrong size ({} != {})", info.size, raw_size); | |
33 | } | |
34 | ||
39f18b30 | 35 | let csum = openssl::sha::sha256(blob.raw_data()); |
c2009e53 DM |
36 | if csum != info.csum { |
37 | bail!("wrong index checksum"); | |
38 | } | |
39 | ||
8819d1f2 FG |
40 | match blob.crypt_mode()? { |
41 | CryptMode::Encrypt => Ok(()), | |
42 | CryptMode::None => { | |
43 | // digest already verified above | |
44 | blob.decode(None, None)?; | |
45 | Ok(()) | |
46 | }, | |
47 | CryptMode::SignOnly => bail!("Invalid CryptMode for blob"), | |
c2009e53 | 48 | } |
c2009e53 DM |
49 | } |
50 | ||
0f3b7efa SR |
51 | fn rename_corrupted_chunk( |
52 | datastore: Arc<DataStore>, | |
53 | digest: &[u8;32], | |
54 | worker: Arc<WorkerTask>, | |
55 | ) { | |
56 | let (path, digest_str) = datastore.chunk_path(digest); | |
57 | ||
58 | let mut counter = 0; | |
59 | let mut new_path = path.clone(); | |
aadcc281 | 60 | loop { |
0f3b7efa | 61 | new_path.set_file_name(format!("{}.{}.bad", digest_str, counter)); |
aadcc281 | 62 | if new_path.exists() && counter < 9 { counter += 1; } else { break; } |
0f3b7efa SR |
63 | } |
64 | ||
65 | match std::fs::rename(&path, &new_path) { | |
66 | Ok(_) => { | |
67 | worker.log(format!("corrupted chunk renamed to {:?}", &new_path)); | |
68 | }, | |
69 | Err(err) => { | |
70 | match err.kind() { | |
71 | std::io::ErrorKind::NotFound => { /* ignored */ }, | |
72 | _ => worker.log(format!("could not rename corrupted chunk {:?} - {}", &path, err)) | |
73 | } | |
74 | } | |
75 | }; | |
76 | } | |
77 | ||
/// Verify every chunk referenced by `index` against its digest.
///
/// Chunks are read sequentially from `datastore` and handed to a
/// 4-thread `ParallelHandler` pool for decode/verification. The shared
/// `verified_chunks` / `corrupt_chunks` sets are consulted and updated so
/// each chunk is fully checked at most once per verification run, even
/// when referenced by multiple index files.
///
/// Returns Ok(()) only when no chunk error was counted; individual chunk
/// failures are logged on `worker` and tallied in an atomic counter.
/// Err(_) means the task was aborted / shut down, or the decoder pool failed.
fn verify_index_chunks(
    datastore: Arc<DataStore>,
    index: Box<dyn IndexFile + Send>,
    verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
    corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
    crypt_mode: CryptMode,
    worker: Arc<WorkerTask>,
) -> Result<(), Error> {

    // error counter shared between this reader loop and the decoder threads
    let errors = Arc::new(AtomicUsize::new(0));

    let start_time = Instant::now();

    let mut read_bytes = 0;
    let mut decoded_bytes = 0;

    // clones moved into the decoder-pool closure below
    let worker2 = Arc::clone(&worker);
    let datastore2 = Arc::clone(&datastore);
    let corrupt_chunks2 = Arc::clone(&corrupt_chunks);
    let verified_chunks2 = Arc::clone(&verified_chunks);
    let errors2 = Arc::clone(&errors);

    let decoder_pool = ParallelHandler::new(
        "verify chunk decoder", 4,
        move |(chunk, digest, size): (DataBlob, [u8;32], u64)| {
            // an undecodable crypt mode marks the chunk corrupt and counts as error
            let chunk_crypt_mode = match chunk.crypt_mode() {
                Err(err) => {
                    corrupt_chunks2.lock().unwrap().insert(digest);
                    worker2.log(format!("can't verify chunk, unknown CryptMode - {}", err));
                    errors2.fetch_add(1, Ordering::SeqCst);
                    return Ok(());
                },
                Ok(mode) => mode,
            };

            // a mode mismatch is logged and counted, but verification still proceeds
            if chunk_crypt_mode != crypt_mode {
                worker2.log(format!(
                    "chunk CryptMode {:?} does not match index CryptMode {:?}",
                    chunk_crypt_mode,
                    crypt_mode
                ));
                errors2.fetch_add(1, Ordering::SeqCst);
            }

            if let Err(err) = chunk.verify_unencrypted(size as usize, &digest) {
                corrupt_chunks2.lock().unwrap().insert(digest);
                worker2.log(format!("{}", err));
                errors2.fetch_add(1, Ordering::SeqCst);
                rename_corrupted_chunk(datastore2.clone(), &digest, worker2.clone());
            } else {
                verified_chunks2.lock().unwrap().insert(digest);
            }

            Ok(())
        }
    );

    for pos in 0..index.index_count() {

        // react to task abort / daemon shutdown between chunks
        worker.fail_on_abort()?;
        crate::tools::fail_on_shutdown()?;

        let info = index.chunk_info(pos).unwrap();
        let size = info.size();

        if verified_chunks.lock().unwrap().contains(&info.digest) {
            continue; // already verified
        }

        // chunk already found corrupt (possibly via another index):
        // count the error again for this index, but skip re-checking
        if corrupt_chunks.lock().unwrap().contains(&info.digest) {
            let digest_str = proxmox::tools::digest_to_hex(&info.digest);
            worker.log(format!("chunk {} was marked as corrupt", digest_str));
            errors.fetch_add(1, Ordering::SeqCst);
            continue;
        }

        match datastore.load_chunk(&info.digest) {
            Err(err) => {
                corrupt_chunks.lock().unwrap().insert(info.digest);
                worker.log(format!("can't verify chunk, load failed - {}", err));
                errors.fetch_add(1, Ordering::SeqCst);
                rename_corrupted_chunk(datastore.clone(), &info.digest, worker.clone());
                continue;
            }
            Ok(chunk) => {
                read_bytes += chunk.raw_size();
                decoder_pool.send((chunk, info.digest, size))?;
                // counts bytes handed to the pool; decoding happens asynchronously
                decoded_bytes += size;
            }
        }
    }

    // wait for all in-flight chunks; propagates the first decoder thread error
    decoder_pool.complete()?;

    let elapsed = start_time.elapsed().as_secs_f64();

    let read_bytes_mib = (read_bytes as f64)/(1024.0*1024.0);
    let decoded_bytes_mib = (decoded_bytes as f64)/(1024.0*1024.0);

    let read_speed = read_bytes_mib/elapsed;
    let decode_speed = decoded_bytes_mib/elapsed;

    let error_count = errors.load(Ordering::SeqCst);

    worker.log(format!(" verified {:.2}/{:.2} MiB in {:.2} seconds, speed {:.2}/{:.2} MiB/s ({} errors)",
        read_bytes_mib, decoded_bytes_mib, elapsed, read_speed, decode_speed, error_count));

    if errors.load(Ordering::SeqCst) > 0 {
        bail!("chunks could not be verified");
    }

    Ok(())
}
191 | ||
2aaae970 | 192 | fn verify_fixed_index( |
6b809ff5 | 193 | datastore: Arc<DataStore>, |
2aaae970 DM |
194 | backup_dir: &BackupDir, |
195 | info: &FileInfo, | |
6b809ff5 DM |
196 | verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>, |
197 | corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
198 | worker: Arc<WorkerTask>, | |
2aaae970 | 199 | ) -> Result<(), Error> { |
c2009e53 DM |
200 | |
201 | let mut path = backup_dir.relative_path(); | |
202 | path.push(&info.filename); | |
203 | ||
204 | let index = datastore.open_fixed_reader(&path)?; | |
205 | ||
206 | let (csum, size) = index.compute_csum(); | |
207 | if size != info.size { | |
208 | bail!("wrong size ({} != {})", info.size, size); | |
209 | } | |
210 | ||
211 | if csum != info.csum { | |
212 | bail!("wrong index checksum"); | |
213 | } | |
214 | ||
9a38fa29 | 215 | verify_index_chunks(datastore, Box::new(index), verified_chunks, corrupt_chunks, info.chunk_crypt_mode(), worker) |
c2009e53 DM |
216 | } |
217 | ||
2aaae970 | 218 | fn verify_dynamic_index( |
6b809ff5 | 219 | datastore: Arc<DataStore>, |
2aaae970 DM |
220 | backup_dir: &BackupDir, |
221 | info: &FileInfo, | |
6b809ff5 DM |
222 | verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>, |
223 | corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
224 | worker: Arc<WorkerTask>, | |
2aaae970 DM |
225 | ) -> Result<(), Error> { |
226 | ||
c2009e53 DM |
227 | let mut path = backup_dir.relative_path(); |
228 | path.push(&info.filename); | |
229 | ||
230 | let index = datastore.open_dynamic_reader(&path)?; | |
231 | ||
232 | let (csum, size) = index.compute_csum(); | |
233 | if size != info.size { | |
234 | bail!("wrong size ({} != {})", info.size, size); | |
235 | } | |
236 | ||
237 | if csum != info.csum { | |
238 | bail!("wrong index checksum"); | |
239 | } | |
240 | ||
9a38fa29 | 241 | verify_index_chunks(datastore, Box::new(index), verified_chunks, corrupt_chunks, info.chunk_crypt_mode(), worker) |
c2009e53 DM |
242 | } |
243 | ||
/// Verify a single backup snapshot
///
/// This checks all archives inside a backup snapshot.
/// Errors are logged to the worker log.
///
/// Returns
/// - Ok(true) if verify is successful
/// - Ok(false) if there were verification errors
/// - Err(_) if task was aborted
pub fn verify_backup_dir(
    datastore: Arc<DataStore>,
    backup_dir: &BackupDir,
    verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
    corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
    worker: Arc<WorkerTask>
) -> Result<bool, Error> {

    // a snapshot without a readable manifest counts as failed, not as a task error
    let mut manifest = match datastore.load_manifest(&backup_dir) {
        Ok((manifest, _)) => manifest,
        Err(err) => {
            worker.log(format!("verify {}:{} - manifest load error: {}", datastore.name(), backup_dir, err));
            return Ok(false);
        }
    };

    worker.log(format!("verify {}:{}", datastore.name(), backup_dir));

    let mut error_count = 0;

    // overall outcome, persisted into the manifest's "verify_state" below
    let mut verify_result = VerifyState::Ok;
    for info in manifest.files() {
        // check one archive; the checker is picked by archive type
        let result = proxmox::try_block!({
            worker.log(format!(" check {}", info.filename));
            match archive_type(&info.filename)? {
                ArchiveType::FixedIndex =>
                    verify_fixed_index(
                        datastore.clone(),
                        &backup_dir,
                        info,
                        verified_chunks.clone(),
                        corrupt_chunks.clone(),
                        worker.clone(),
                    ),
                ArchiveType::DynamicIndex =>
                    verify_dynamic_index(
                        datastore.clone(),
                        &backup_dir,
                        info,
                        verified_chunks.clone(),
                        corrupt_chunks.clone(),
                        worker.clone(),
                    ),
                ArchiveType::Blob => verify_blob(datastore.clone(), &backup_dir, info),
            }
        });

        // abort/shutdown propagates as Err(_) and skips the manifest update below
        worker.fail_on_abort()?;
        crate::tools::fail_on_shutdown()?;

        // a failed archive is logged and counted but does not stop the snapshot check
        if let Err(err) = result {
            worker.log(format!("verify {}:{}/{} failed: {}", datastore.name(), backup_dir, info.filename, err));
            error_count += 1;
            verify_result = VerifyState::Failed;
        }

    }

    // record the verification outcome in the unprotected manifest section
    let verify_state = SnapshotVerifyState {
        state: verify_result,
        upid: worker.upid().clone(),
    };
    manifest.unprotected["verify_state"] = serde_json::to_value(verify_state)?;
    datastore.store_manifest(&backup_dir, serde_json::to_value(manifest)?)
        .map_err(|err| format_err!("unable to store manifest blob - {}", err))?;

    Ok(error_count == 0)
}
321 | ||
8ea00f6e DM |
322 | /// Verify all backups inside a backup group |
323 | /// | |
324 | /// Errors are logged to the worker log. | |
325 | /// | |
326 | /// Returns | |
63d9aca9 | 327 | /// - Ok((count, failed_dirs)) where failed_dirs had verification errors |
8ea00f6e | 328 | /// - Err(_) if task was aborted |
4f09d310 DM |
329 | pub fn verify_backup_group( |
330 | datastore: Arc<DataStore>, | |
331 | group: &BackupGroup, | |
332 | verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
333 | corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>, | |
63d9aca9 | 334 | progress: Option<(usize, usize)>, // (done, snapshot_count) |
4f09d310 | 335 | worker: Arc<WorkerTask>, |
63d9aca9 | 336 | ) -> Result<(usize, Vec<String>), Error> { |
c2009e53 | 337 | |
adfdc369 | 338 | let mut errors = Vec::new(); |
c2009e53 DM |
339 | let mut list = match group.list_backups(&datastore.base_path()) { |
340 | Ok(list) => list, | |
341 | Err(err) => { | |
342 | worker.log(format!("verify group {}:{} - unable to list backups: {}", datastore.name(), group, err)); | |
63d9aca9 | 343 | return Ok((0, errors)); |
c2009e53 DM |
344 | } |
345 | }; | |
346 | ||
347 | worker.log(format!("verify group {}:{}", datastore.name(), group)); | |
348 | ||
63d9aca9 DM |
349 | let (done, snapshot_count) = progress.unwrap_or((0, list.len())); |
350 | ||
351 | let mut count = 0; | |
c2009e53 DM |
352 | BackupInfo::sort_list(&mut list, false); // newest first |
353 | for info in list { | |
63d9aca9 | 354 | count += 1; |
6b809ff5 | 355 | if !verify_backup_dir(datastore.clone(), &info.backup_dir, verified_chunks.clone(), corrupt_chunks.clone(), worker.clone())?{ |
adfdc369 | 356 | errors.push(info.backup_dir.to_string()); |
c2009e53 | 357 | } |
63d9aca9 DM |
358 | if snapshot_count != 0 { |
359 | let pos = done + count; | |
360 | let percentage = ((pos as f64) * 100.0)/(snapshot_count as f64); | |
361 | worker.log(format!("percentage done: {:.2}% ({} of {} snapshots)", percentage, pos, snapshot_count)); | |
362 | } | |
c2009e53 DM |
363 | } |
364 | ||
63d9aca9 | 365 | Ok((count, errors)) |
c2009e53 DM |
366 | } |
367 | ||
8ea00f6e DM |
368 | /// Verify all backups inside a datastore |
369 | /// | |
370 | /// Errors are logged to the worker log. | |
371 | /// | |
372 | /// Returns | |
adfdc369 | 373 | /// - Ok(failed_dirs) where failed_dirs had verification errors |
8ea00f6e | 374 | /// - Err(_) if task was aborted |
6b809ff5 | 375 | pub fn verify_all_backups(datastore: Arc<DataStore>, worker: Arc<WorkerTask>) -> Result<Vec<String>, Error> { |
adfdc369 DC |
376 | |
377 | let mut errors = Vec::new(); | |
c2009e53 | 378 | |
4264c502 | 379 | let mut list = match BackupGroup::list_groups(&datastore.base_path()) { |
5656888c DM |
380 | Ok(list) => list |
381 | .into_iter() | |
382 | .filter(|group| !(group.backup_type() == "host" && group.backup_id() == "benchmark")) | |
383 | .collect::<Vec<BackupGroup>>(), | |
c2009e53 DM |
384 | Err(err) => { |
385 | worker.log(format!("verify datastore {} - unable to list backups: {}", datastore.name(), err)); | |
adfdc369 | 386 | return Ok(errors); |
c2009e53 DM |
387 | } |
388 | }; | |
389 | ||
4264c502 DM |
390 | list.sort_unstable(); |
391 | ||
63d9aca9 DM |
392 | let mut snapshot_count = 0; |
393 | for group in list.iter() { | |
394 | snapshot_count += group.list_backups(&datastore.base_path())?.len(); | |
395 | } | |
396 | ||
4f09d310 DM |
397 | // start with 16384 chunks (up to 65GB) |
398 | let verified_chunks = Arc::new(Mutex::new(HashSet::with_capacity(1024*16))); | |
399 | ||
400 | // start with 64 chunks since we assume there are few corrupt ones | |
401 | let corrupt_chunks = Arc::new(Mutex::new(HashSet::with_capacity(64))); | |
402 | ||
63d9aca9 | 403 | worker.log(format!("verify datastore {} ({} snapshots)", datastore.name(), snapshot_count)); |
c2009e53 | 404 | |
63d9aca9 | 405 | let mut done = 0; |
c2009e53 | 406 | for group in list { |
63d9aca9 | 407 | let (count, mut group_errors) = verify_backup_group( |
4f09d310 DM |
408 | datastore.clone(), |
409 | &group, | |
410 | verified_chunks.clone(), | |
411 | corrupt_chunks.clone(), | |
63d9aca9 | 412 | Some((done, snapshot_count)), |
4f09d310 DM |
413 | worker.clone(), |
414 | )?; | |
adfdc369 | 415 | errors.append(&mut group_errors); |
63d9aca9 DM |
416 | |
417 | done += count; | |
c2009e53 DM |
418 | } |
419 | ||
adfdc369 | 420 | Ok(errors) |
c2009e53 | 421 | } |