]> git.proxmox.com Git - proxmox-backup.git/blob - src/server/worker_task.rs
eb47890697879798584d243d08e0c6a868d666ff
[proxmox-backup.git] / src / server / worker_task.rs
1 use std::collections::HashMap;
2 use std::fs::File;
3 use std::io::{BufRead, BufReader};
4 use std::panic::UnwindSafe;
5 use std::sync::atomic::{AtomicBool, Ordering};
6 use std::sync::{Arc, Mutex};
7
8 use chrono::Local;
9 use anyhow::{bail, format_err, Error};
10 use futures::*;
11 use lazy_static::lazy_static;
12 use nix::unistd::Pid;
13 use serde_json::{json, Value};
14 use tokio::sync::oneshot;
15
16 use proxmox::sys::linux::procfs;
17 use proxmox::try_block;
18 use proxmox::tools::fs::{create_path, replace_file, CreateOptions};
19
20 use super::UPID;
21
22 use crate::tools::FileLogger;
23
// The path fragments are kept as macros because `concat!` only accepts
// literals (or macros expanding to literals), not `const` items.

/// Expands to the runtime directory literal.
macro_rules! PROXMOX_BACKUP_VAR_RUN_DIR_M {
    () => {
        "/run/proxmox-backup"
    };
}

/// Expands to the log directory literal.
macro_rules! PROXMOX_BACKUP_LOG_DIR_M {
    () => {
        "/var/log/proxmox-backup"
    };
}

/// Expands to the task directory literal (a `tasks` sub-directory of the log directory).
macro_rules! PROXMOX_BACKUP_TASK_DIR_M {
    () => {
        concat!(PROXMOX_BACKUP_LOG_DIR_M!(), "/tasks")
    };
}

/// Runtime directory; also used as prefix for the task control socket name.
pub const PROXMOX_BACKUP_VAR_RUN_DIR: &str = PROXMOX_BACKUP_VAR_RUN_DIR_M!();

/// Base directory for log files.
pub const PROXMOX_BACKUP_LOG_DIR: &str = PROXMOX_BACKUP_LOG_DIR_M!();

/// Directory holding the task logs and the task index.
pub const PROXMOX_BACKUP_TASK_DIR: &str = PROXMOX_BACKUP_TASK_DIR_M!();

/// Lock file guarding updates of the task index.
pub const PROXMOX_BACKUP_TASK_LOCK_FN: &str =
    concat!(PROXMOX_BACKUP_TASK_DIR_M!(), "/.active.lock");

/// Task index file (running tasks plus recently finished ones).
pub const PROXMOX_BACKUP_ACTIVE_TASK_FN: &str =
    concat!(PROXMOX_BACKUP_TASK_DIR_M!(), "/active");
33
lazy_static! {
    /// Worker tasks registered by this process, keyed by the `task_id` of their UPID.
    static ref WORKER_TASK_LIST: Mutex<HashMap<usize, Arc<WorkerTask>>> = Mutex::new(HashMap::new());

    /// PID of the current process.
    // SAFETY: `getpid()` takes no arguments and touches no memory owned by us.
    static ref MY_PID: i32 = unsafe { libc::getpid() };
    /// `starttime` of the current process as read via procfs.
    ///
    /// Initialization panics (on first access) if the stat entry of the own
    /// process cannot be read.
    static ref MY_PID_PSTART: u64 = procfs::PidStat::read_from_pid(Pid::from_raw(*MY_PID))
        .unwrap()
        .starttime;
}
42
43 /// Test if the task is still running
44 pub async fn worker_is_active(upid: &UPID) -> Result<bool, Error> {
45 if (upid.pid == *MY_PID) && (upid.pstart == *MY_PID_PSTART) {
46 return Ok(WORKER_TASK_LIST.lock().unwrap().contains_key(&upid.task_id));
47 }
48
49 if !procfs::check_process_running_pstart(upid.pid, upid.pstart).is_some() {
50 return Ok(false);
51 }
52
53 let socketname = format!(
54 "\0{}/proxmox-task-control-{}.sock", PROXMOX_BACKUP_VAR_RUN_DIR, upid.pid);
55
56 let cmd = json!({
57 "command": "status",
58 "upid": upid.to_string(),
59 });
60
61 let status = super::send_command(socketname, cmd).await?;
62
63 if let Some(active) = status.as_bool() {
64 Ok(active)
65 } else {
66 bail!("got unexpected result {:?} (expected bool)", status);
67 }
68 }
69
70 /// Test if the task is still running (fast but inaccurate implementation)
71 ///
72 /// If the task is spanned from a different process, we simply return if
73 /// that process is still running. This information is good enough to detect
74 /// stale tasks...
75 pub fn worker_is_active_local(upid: &UPID) -> bool {
76 if (upid.pid == *MY_PID) && (upid.pstart == *MY_PID_PSTART) {
77 WORKER_TASK_LIST.lock().unwrap().contains_key(&upid.task_id)
78 } else {
79 procfs::check_process_running_pstart(upid.pid, upid.pstart).is_some()
80 }
81 }
82
83 pub fn create_task_control_socket() -> Result<(), Error> {
84
85 let socketname = format!(
86 "\0{}/proxmox-task-control-{}.sock", PROXMOX_BACKUP_VAR_RUN_DIR, *MY_PID);
87
88 let control_future = super::create_control_socket(socketname, |param| {
89 let param = param.as_object()
90 .ok_or_else(|| format_err!("unable to parse parameters (expected json object)"))?;
91 if param.keys().count() != 2 { bail!("wrong number of parameters"); }
92
93 let command = param["command"].as_str()
94 .ok_or_else(|| format_err!("unable to parse parameters (missing command)"))?;
95
96 // we have only two commands for now
97 if !(command == "abort-task" || command == "status") { bail!("got unknown command '{}'", command); }
98
99 let upid_str = param["upid"].as_str()
100 .ok_or_else(|| format_err!("unable to parse parameters (missing upid)"))?;
101
102 let upid = upid_str.parse::<UPID>()?;
103
104 if !((upid.pid == *MY_PID) && (upid.pstart == *MY_PID_PSTART)) {
105 bail!("upid does not belong to this process");
106 }
107
108 let hash = WORKER_TASK_LIST.lock().unwrap();
109
110 match command {
111 "abort-task" => {
112 if let Some(ref worker) = hash.get(&upid.task_id) {
113 worker.request_abort();
114 } else {
115 // assume task is already stopped
116 }
117 Ok(Value::Null)
118 }
119 "status" => {
120 let active = hash.contains_key(&upid.task_id);
121 Ok(active.into())
122 }
123 _ => {
124 bail!("got unknown command '{}'", command);
125 }
126 }
127 })?;
128
129 tokio::spawn(control_future);
130
131 Ok(())
132 }
133
134 pub fn abort_worker_async(upid: UPID) {
135 tokio::spawn(async move {
136 if let Err(err) = abort_worker(upid).await {
137 eprintln!("abort worker failed - {}", err);
138 }
139 });
140 }
141
142 pub async fn abort_worker(upid: UPID) -> Result<(), Error> {
143
144 let target_pid = upid.pid;
145
146 let socketname = format!(
147 "\0{}/proxmox-task-control-{}.sock", PROXMOX_BACKUP_VAR_RUN_DIR, target_pid);
148
149 let cmd = json!({
150 "command": "abort-task",
151 "upid": upid.to_string(),
152 });
153
154 super::send_command(socketname, cmd).map_ok(|_| ()).await
155 }
156
157 fn parse_worker_status_line(line: &str) -> Result<(String, UPID, Option<(i64, String)>), Error> {
158
159 let data = line.splitn(3, ' ').collect::<Vec<&str>>();
160
161 let len = data.len();
162
163 match len {
164 1 => Ok((data[0].to_owned(), data[0].parse::<UPID>()?, None)),
165 3 => {
166 let endtime = i64::from_str_radix(data[1], 16)?;
167 Ok((data[0].to_owned(), data[0].parse::<UPID>()?, Some((endtime, data[2].to_owned()))))
168 }
169 _ => bail!("wrong number of components"),
170 }
171 }
172
173 /// Create task log directory with correct permissions
174 pub fn create_task_log_dirs() -> Result<(), Error> {
175
176 try_block!({
177 let backup_user = crate::backup::backup_user()?;
178 let opts = CreateOptions::new()
179 .owner(backup_user.uid)
180 .group(backup_user.gid);
181
182 create_path(PROXMOX_BACKUP_LOG_DIR, None, Some(opts.clone()))?;
183 create_path(PROXMOX_BACKUP_TASK_DIR, None, Some(opts.clone()))?;
184 create_path(PROXMOX_BACKUP_VAR_RUN_DIR, None, Some(opts))?;
185 Ok(())
186 }).map_err(|err: Error| format_err!("unable to create task log dir - {}", err))?;
187
188 Ok(())
189 }
190
191 /// Read exits status from task log file
192 pub fn upid_read_status(upid: &UPID) -> Result<String, Error> {
193 let mut status = String::from("unknown");
194
195 let path = upid.log_path();
196
197 let mut file = File::open(path)?;
198
199 /// speedup - only read tail
200 use std::io::Seek;
201 use std::io::SeekFrom;
202 let _ = file.seek(SeekFrom::End(-8192)); // ignore errors
203
204 let reader = BufReader::new(file);
205
206 for line in reader.lines() {
207 let line = line?;
208
209 let mut iter = line.splitn(2, ": TASK ");
210 if iter.next() == None { continue; }
211 match iter.next() {
212 None => continue,
213 Some(rest) => {
214 if rest == "OK" {
215 status = String::from(rest);
216 } else if rest.starts_with("ERROR: ") {
217 status = String::from(&rest[7..]);
218 }
219 }
220 }
221 }
222
223 Ok(status)
224 }
225
/// Task details including parsed UPID
///
/// If there is no `state`, the task is still running.
#[derive(Debug)]
pub struct TaskListInfo {
    /// The parsed UPID
    pub upid: UPID,
    /// UPID string representation
    pub upid_str: String,
    /// Task `(endtime, status)` if already finished
    ///
    /// When taken from the task log, the `status` is either `unknown`, `OK`,
    /// or the error message of the failed task (the text following
    /// `ERROR: ` in the log).
    pub state: Option<(i64, String)>, // endtime, status
}
240
241 // atomically read/update the task list, update status of finished tasks
242 // new_upid is added to the list when specified.
243 // Returns a sorted list of known tasks,
244 fn update_active_workers(new_upid: Option<&UPID>) -> Result<Vec<TaskListInfo>, Error> {
245
246 let backup_user = crate::backup::backup_user()?;
247
248 let lock = crate::tools::open_file_locked(PROXMOX_BACKUP_TASK_LOCK_FN, std::time::Duration::new(10, 0))?;
249 nix::unistd::chown(PROXMOX_BACKUP_TASK_LOCK_FN, Some(backup_user.uid), Some(backup_user.gid))?;
250
251 let reader = match File::open(PROXMOX_BACKUP_ACTIVE_TASK_FN) {
252 Ok(f) => Some(BufReader::new(f)),
253 Err(err) => {
254 if err.kind() == std::io::ErrorKind::NotFound {
255 None
256 } else {
257 bail!("unable to open active worker {:?} - {}", PROXMOX_BACKUP_ACTIVE_TASK_FN, err);
258 }
259 }
260 };
261
262 let mut active_list = vec![];
263 let mut finish_list = vec![];
264
265 if let Some(lines) = reader.map(|r| r.lines()) {
266
267 for line in lines {
268 let line = line?;
269 match parse_worker_status_line(&line) {
270 Err(err) => bail!("unable to parse active worker status '{}' - {}", line, err),
271 Ok((upid_str, upid, state)) => {
272
273 let running = worker_is_active_local(&upid);
274
275 if running {
276 active_list.push(TaskListInfo { upid, upid_str, state: None });
277 } else {
278 match state {
279 None => {
280 println!("Detected stoped UPID {}", upid_str);
281 let status = upid_read_status(&upid)
282 .unwrap_or_else(|_| String::from("unknown"));
283 finish_list.push(TaskListInfo {
284 upid, upid_str, state: Some((Local::now().timestamp(), status))
285 });
286 }
287 Some((endtime, status)) => {
288 finish_list.push(TaskListInfo {
289 upid, upid_str, state: Some((endtime, status))
290 })
291 }
292 }
293 }
294 }
295 }
296 }
297 }
298
299 if let Some(upid) = new_upid {
300 active_list.push(TaskListInfo { upid: upid.clone(), upid_str: upid.to_string(), state: None });
301 }
302
303 // assemble list without duplicates
304 // we include all active tasks,
305 // and fill up to 1000 entries with finished tasks
306
307 let max = 1000;
308
309 let mut task_hash = HashMap::new();
310
311 for info in active_list {
312 task_hash.insert(info.upid_str.clone(), info);
313 }
314
315 for info in finish_list {
316 if task_hash.len() > max { break; }
317 if !task_hash.contains_key(&info.upid_str) {
318 task_hash.insert(info.upid_str.clone(), info);
319 }
320 }
321
322 let mut task_list: Vec<TaskListInfo> = vec![];
323 for (_, info) in task_hash { task_list.push(info); }
324
325 task_list.sort_unstable_by(|b, a| { // lastest on top
326 match (&a.state, &b.state) {
327 (Some(s1), Some(s2)) => s1.0.cmp(&s2.0),
328 (Some(_), None) => std::cmp::Ordering::Less,
329 (None, Some(_)) => std::cmp::Ordering::Greater,
330 _ => a.upid.starttime.cmp(&b.upid.starttime),
331 }
332 });
333
334 let mut raw = String::new();
335 for info in &task_list {
336 if let Some((endtime, status)) = &info.state {
337 raw.push_str(&format!("{} {:08X} {}\n", info.upid_str, endtime, status));
338 } else {
339 raw.push_str(&info.upid_str);
340 raw.push('\n');
341 }
342 }
343
344 replace_file(
345 PROXMOX_BACKUP_ACTIVE_TASK_FN,
346 raw.as_bytes(),
347 CreateOptions::new()
348 .owner(backup_user.uid)
349 .group(backup_user.gid),
350 )?;
351
352 drop(lock);
353
354 Ok(task_list)
355 }
356
/// Returns a sorted list of known tasks
///
/// Running tasks are listed first (latest start time first), followed by
/// finished tasks (latest end time first).
///
/// Note that this also refreshes the task list file on disk, because it
/// goes through `update_active_workers()`.
pub fn read_task_list() -> Result<Vec<TaskListInfo>, Error> {
    update_active_workers(None)
}
363
/// Launch long running worker tasks.
///
/// A worker task can either be a whole thread, or simply a tokio
/// task/future. Each task can `log()` messages, which are stored
/// persistently to files. Tasks should poll the `abort_requested`
/// flag, and stop execution when requested.
#[derive(Debug)]
pub struct WorkerTask {
    // unique ID of this task
    upid: UPID,
    // mutable task state (logger, progress, abort listeners)
    data: Mutex<WorkerTaskData>,
    // set by `request_abort()`, read by `abort_requested()`
    abort_requested: AtomicBool,
}
376
impl std::fmt::Display for WorkerTask {

    /// Format the worker task by delegating to the `fmt` method of its UPID.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        self.upid.fmt(f)
    }
}
383
/// Mutable state of a `WorkerTask`, kept behind the task's mutex.
#[derive(Debug)]
struct WorkerTaskData {
    // receives the messages passed to `WorkerTask::log()`
    logger: FileLogger,
    progress: f64, // 0..1
    // senders notified (and removed) by `WorkerTask::request_abort()`
    pub abort_listeners: Vec<oneshot::Sender<()>>,
}
390
impl Drop for WorkerTask {

    /// Only prints a debug message; dropping does not touch the task list.
    fn drop(&mut self) {
        println!("unregister worker");
    }
}
397
398 impl WorkerTask {
399
400 pub fn new(worker_type: &str, worker_id: Option<String>, username: &str, to_stdout: bool) -> Result<Arc<Self>, Error> {
401 println!("register worker");
402
403 let upid = UPID::new(worker_type, worker_id, username)?;
404 let task_id = upid.task_id;
405
406 let mut path = std::path::PathBuf::from(PROXMOX_BACKUP_TASK_DIR);
407
408 path.push(format!("{:02X}", upid.pstart % 256));
409
410 let backup_user = crate::backup::backup_user()?;
411
412 create_path(&path, None, Some(CreateOptions::new().owner(backup_user.uid).group(backup_user.gid)))?;
413
414 path.push(upid.to_string());
415
416 println!("FILE: {:?}", path);
417
418 let logger = FileLogger::new(&path, to_stdout)?;
419 nix::unistd::chown(&path, Some(backup_user.uid), Some(backup_user.gid))?;
420
421 let worker = Arc::new(Self {
422 upid: upid.clone(),
423 abort_requested: AtomicBool::new(false),
424 data: Mutex::new(WorkerTaskData {
425 logger,
426 progress: 0.0,
427 abort_listeners: vec![],
428 }),
429 });
430
431 // scope to drop the lock again after inserting
432 {
433 let mut hash = WORKER_TASK_LIST.lock().unwrap();
434 hash.insert(task_id, worker.clone());
435 super::set_worker_count(hash.len());
436 }
437
438 update_active_workers(Some(&upid))?;
439
440 Ok(worker)
441 }
442
443 /// Spawn a new tokio task/future.
444 pub fn spawn<F, T>(
445 worker_type: &str,
446 worker_id: Option<String>,
447 username: &str,
448 to_stdout: bool,
449 f: F,
450 ) -> Result<String, Error>
451 where F: Send + 'static + FnOnce(Arc<WorkerTask>) -> T,
452 T: Send + 'static + Future<Output = Result<(), Error>>,
453 {
454 let worker = WorkerTask::new(worker_type, worker_id, username, to_stdout)?;
455 let upid_str = worker.upid.to_string();
456 let f = f(worker.clone());
457 tokio::spawn(async move {
458 let result = f.await;
459 worker.log_result(&result);
460 });
461
462 Ok(upid_str)
463 }
464
465 /// Create a new worker thread.
466 pub fn new_thread<F>(
467 worker_type: &str,
468 worker_id: Option<String>,
469 username: &str,
470 to_stdout: bool,
471 f: F,
472 ) -> Result<String, Error>
473 where F: Send + UnwindSafe + 'static + FnOnce(Arc<WorkerTask>) -> Result<(), Error>
474 {
475 println!("register worker thread");
476
477 let (p, c) = oneshot::channel::<()>();
478
479 let worker = WorkerTask::new(worker_type, worker_id, username, to_stdout)?;
480 let upid_str = worker.upid.to_string();
481
482 let _child = std::thread::Builder::new().name(upid_str.clone()).spawn(move || {
483 let worker1 = worker.clone();
484 let result = match std::panic::catch_unwind(move || f(worker1)) {
485 Ok(r) => r,
486 Err(panic) => {
487 match panic.downcast::<&str>() {
488 Ok(panic_msg) => {
489 Err(format_err!("worker panicked: {}", panic_msg))
490 }
491 Err(_) => {
492 Err(format_err!("worker panicked: unknown type."))
493 }
494 }
495 }
496 };
497
498 worker.log_result(&result);
499 p.send(()).unwrap();
500 });
501
502 tokio::spawn(c.map(|_| ()));
503
504 Ok(upid_str)
505 }
506
507 /// Log task result, remove task from running list
508 pub fn log_result(&self, result: &Result<(), Error>) {
509
510 if let Err(err) = result {
511 self.log(&format!("TASK ERROR: {}", err));
512 } else {
513 self.log("TASK OK");
514 }
515
516 WORKER_TASK_LIST.lock().unwrap().remove(&self.upid.task_id);
517 let _ = update_active_workers(None);
518 super::set_worker_count(WORKER_TASK_LIST.lock().unwrap().len());
519 }
520
521 /// Log a message.
522 pub fn log<S: AsRef<str>>(&self, msg: S) {
523 let mut data = self.data.lock().unwrap();
524 data.logger.log(msg);
525 }
526
527 /// Set progress indicator
528 pub fn progress(&self, progress: f64) {
529 if progress >= 0.0 && progress <= 1.0 {
530 let mut data = self.data.lock().unwrap();
531 data.progress = progress;
532 } else {
533 // fixme: log!("task '{}': ignoring strange value for progress '{}'", self.upid, progress);
534 }
535 }
536
537 /// Request abort
538 pub fn request_abort(&self) {
539 eprintln!("set abort flag for worker {}", self.upid);
540 self.abort_requested.store(true, Ordering::SeqCst);
541 // noitify listeners
542 let mut data = self.data.lock().unwrap();
543 loop {
544 match data.abort_listeners.pop() {
545 None => { break; },
546 Some(ch) => {
547 let _ = ch.send(()); // ignore erros here
548 },
549 }
550 }
551 }
552
553 /// Test if abort was requested.
554 pub fn abort_requested(&self) -> bool {
555 self.abort_requested.load(Ordering::SeqCst)
556 }
557
558 /// Fail if abort was requested.
559 pub fn fail_on_abort(&self) -> Result<(), Error> {
560 if self.abort_requested() {
561 bail!("abort requested - aborting task");
562 }
563 Ok(())
564 }
565
566 /// Get a future which resolves on task abort
567 pub fn abort_future(&self) -> oneshot::Receiver<()> {
568 let (tx, rx) = oneshot::channel::<()>();
569
570 let mut data = self.data.lock().unwrap();
571 if self.abort_requested() {
572 let _ = tx.send(());
573 } else {
574 data.abort_listeners.push(tx);
575 }
576 rx
577 }
578 }