]> git.proxmox.com Git - proxmox-backup.git/blob - src/server/worker_task.rs
cleanup: merge endtime into TaskState
[proxmox-backup.git] / src / server / worker_task.rs
1 use std::collections::HashMap;
2 use std::fs::File;
3 use std::io::{BufRead, BufReader};
4 use std::panic::UnwindSafe;
5 use std::sync::atomic::{AtomicBool, Ordering};
6 use std::sync::{Arc, Mutex};
7
8 use chrono::Local;
9 use anyhow::{bail, format_err, Error};
10 use futures::*;
11 use lazy_static::lazy_static;
12 use nix::unistd::Pid;
13 use serde_json::{json, Value};
14 use serde::{Serialize, Deserialize};
15 use tokio::sync::oneshot;
16
17 use proxmox::sys::linux::procfs;
18 use proxmox::try_block;
19 use proxmox::tools::fs::{create_path, open_file_locked, replace_file, CreateOptions};
20
21 use super::UPID;
22
23 use crate::tools::FileLogger;
24 use crate::api2::types::Userid;
25
// Base directories as macros so they can be combined via `concat!()`
// into further compile-time constant paths below.
macro_rules! PROXMOX_BACKUP_VAR_RUN_DIR_M { () => ("/run/proxmox-backup") }
macro_rules! PROXMOX_BACKUP_LOG_DIR_M { () => ("/var/log/proxmox-backup") }
macro_rules! PROXMOX_BACKUP_TASK_DIR_M { () => (concat!( PROXMOX_BACKUP_LOG_DIR_M!(), "/tasks")) }

/// Runtime state directory (control sockets etc.).
pub const PROXMOX_BACKUP_VAR_RUN_DIR: &str = PROXMOX_BACKUP_VAR_RUN_DIR_M!();
/// Base log directory.
pub const PROXMOX_BACKUP_LOG_DIR: &str = PROXMOX_BACKUP_LOG_DIR_M!();
/// Directory holding the per-task log files.
pub const PROXMOX_BACKUP_TASK_DIR: &str = PROXMOX_BACKUP_TASK_DIR_M!();
/// Lock file serializing access to the active task list file.
pub const PROXMOX_BACKUP_TASK_LOCK_FN: &str = concat!(PROXMOX_BACKUP_TASK_DIR_M!(), "/.active.lock");
/// File listing active (and recently finished) tasks.
pub const PROXMOX_BACKUP_ACTIVE_TASK_FN: &str = concat!(PROXMOX_BACKUP_TASK_DIR_M!(), "/active");
35
lazy_static! {
    // All worker tasks started by this process, indexed by task id.
    static ref WORKER_TASK_LIST: Mutex<HashMap<usize, Arc<WorkerTask>>> = Mutex::new(HashMap::new());

    // SAFETY: getpid() has no preconditions and cannot fail.
    static ref MY_PID: i32 = unsafe { libc::getpid() };
    // Process start time of this process. The (pid, pstart) pair is used
    // throughout this module to identify a unique process instance,
    // since pids alone can be reused.
    // NOTE(review): unwrap() panics if /proc is unreadable — presumably
    // acceptable at daemon startup, but verify.
    static ref MY_PID_PSTART: u64 = procfs::PidStat::read_from_pid(Pid::from_raw(*MY_PID))
        .unwrap()
        .starttime;
}
44
45 /// Test if the task is still running
46 pub async fn worker_is_active(upid: &UPID) -> Result<bool, Error> {
47 if (upid.pid == *MY_PID) && (upid.pstart == *MY_PID_PSTART) {
48 return Ok(WORKER_TASK_LIST.lock().unwrap().contains_key(&upid.task_id));
49 }
50
51 if !procfs::check_process_running_pstart(upid.pid, upid.pstart).is_some() {
52 return Ok(false);
53 }
54
55 let socketname = format!(
56 "\0{}/proxmox-task-control-{}.sock", PROXMOX_BACKUP_VAR_RUN_DIR, upid.pid);
57
58 let cmd = json!({
59 "command": "status",
60 "upid": upid.to_string(),
61 });
62
63 let status = super::send_command(socketname, cmd).await?;
64
65 if let Some(active) = status.as_bool() {
66 Ok(active)
67 } else {
68 bail!("got unexpected result {:?} (expected bool)", status);
69 }
70 }
71
72 /// Test if the task is still running (fast but inaccurate implementation)
73 ///
74 /// If the task is spanned from a different process, we simply return if
75 /// that process is still running. This information is good enough to detect
76 /// stale tasks...
77 pub fn worker_is_active_local(upid: &UPID) -> bool {
78 if (upid.pid == *MY_PID) && (upid.pstart == *MY_PID_PSTART) {
79 WORKER_TASK_LIST.lock().unwrap().contains_key(&upid.task_id)
80 } else {
81 procfs::check_process_running_pstart(upid.pid, upid.pstart).is_some()
82 }
83 }
84
85 pub fn create_task_control_socket() -> Result<(), Error> {
86
87 let socketname = format!(
88 "\0{}/proxmox-task-control-{}.sock", PROXMOX_BACKUP_VAR_RUN_DIR, *MY_PID);
89
90 let control_future = super::create_control_socket(socketname, |param| {
91 let param = param.as_object()
92 .ok_or_else(|| format_err!("unable to parse parameters (expected json object)"))?;
93 if param.keys().count() != 2 { bail!("wrong number of parameters"); }
94
95 let command = param["command"].as_str()
96 .ok_or_else(|| format_err!("unable to parse parameters (missing command)"))?;
97
98 // we have only two commands for now
99 if !(command == "abort-task" || command == "status") { bail!("got unknown command '{}'", command); }
100
101 let upid_str = param["upid"].as_str()
102 .ok_or_else(|| format_err!("unable to parse parameters (missing upid)"))?;
103
104 let upid = upid_str.parse::<UPID>()?;
105
106 if !((upid.pid == *MY_PID) && (upid.pstart == *MY_PID_PSTART)) {
107 bail!("upid does not belong to this process");
108 }
109
110 let hash = WORKER_TASK_LIST.lock().unwrap();
111
112 match command {
113 "abort-task" => {
114 if let Some(ref worker) = hash.get(&upid.task_id) {
115 worker.request_abort();
116 } else {
117 // assume task is already stopped
118 }
119 Ok(Value::Null)
120 }
121 "status" => {
122 let active = hash.contains_key(&upid.task_id);
123 Ok(active.into())
124 }
125 _ => {
126 bail!("got unknown command '{}'", command);
127 }
128 }
129 })?;
130
131 tokio::spawn(control_future);
132
133 Ok(())
134 }
135
136 pub fn abort_worker_async(upid: UPID) {
137 tokio::spawn(async move {
138 if let Err(err) = abort_worker(upid).await {
139 eprintln!("abort worker failed - {}", err);
140 }
141 });
142 }
143
144 pub async fn abort_worker(upid: UPID) -> Result<(), Error> {
145
146 let target_pid = upid.pid;
147
148 let socketname = format!(
149 "\0{}/proxmox-task-control-{}.sock", PROXMOX_BACKUP_VAR_RUN_DIR, target_pid);
150
151 let cmd = json!({
152 "command": "abort-task",
153 "upid": upid.to_string(),
154 });
155
156 super::send_command(socketname, cmd).map_ok(|_| ()).await
157 }
158
159 fn parse_worker_status_line(line: &str) -> Result<(String, UPID, Option<TaskState>), Error> {
160
161 let data = line.splitn(3, ' ').collect::<Vec<&str>>();
162
163 let len = data.len();
164
165 match len {
166 1 => Ok((data[0].to_owned(), data[0].parse::<UPID>()?, None)),
167 3 => {
168 let endtime = i64::from_str_radix(data[1], 16)?;
169 let state = TaskState::from_endtime_and_message(endtime, data[2])?;
170 Ok((data[0].to_owned(), data[0].parse::<UPID>()?, Some(state)))
171 }
172 _ => bail!("wrong number of components"),
173 }
174 }
175
/// Create task log directory with correct permissions
///
/// Creates the log, task and run directories, all owned by the
/// backup user/group.
pub fn create_task_log_dirs() -> Result<(), Error> {

    try_block!({
        let backup_user = crate::backup::backup_user()?;
        let opts = CreateOptions::new()
            .owner(backup_user.uid)
            .group(backup_user.gid);

        create_path(PROXMOX_BACKUP_LOG_DIR, None, Some(opts.clone()))?;
        create_path(PROXMOX_BACKUP_TASK_DIR, None, Some(opts.clone()))?;
        create_path(PROXMOX_BACKUP_VAR_RUN_DIR, None, Some(opts))?;
        Ok(())
    }).map_err(|err: Error| format_err!("unable to create task log dir - {}", err))?;

    Ok(())
}
193
/// Read endtime (time of last log line) and exitstatus from task log file
///
/// If there is not a single line with a valid datetime, we assume the
/// starttime to be the endtime.
pub fn upid_read_status(upid: &UPID) -> Result<TaskState, Error> {
    let mut endtime = upid.starttime;
    let mut status = TaskState::Unknown { endtime };

    let path = upid.log_path();

    let mut file = File::open(path)?;

    // speedup - only read tail
    use std::io::Seek;
    use std::io::SeekFrom;
    let _ = file.seek(SeekFrom::End(-8192)); // ignore errors (file may be shorter)

    let reader = BufReader::new(file);

    for line in reader.lines() {
        let line = line?;

        // each log line starts with "<rfc3339-timestamp>: <message>"
        let mut iter = line.splitn(2, ": ");
        if let Some(time_str) = iter.next() {
            // NOTE(review): any line whose prefix does not parse as an
            // RFC3339 timestamp aborts the whole read with an error —
            // this includes a partial first line when the tail seek
            // above lands mid-line. Verify log lines always start with
            // a timestamp, or consider skipping unparsable lines.
            endtime = chrono::DateTime::parse_from_rfc3339(time_str)
                .map_err(|err| format_err!("cannot parse '{}': {}", time_str, err))?
                .timestamp();
        } else {
            // unreachable in practice: splitn always yields at least one item
            continue;
        }
        // the final state is logged as "<timestamp>: TASK <status>"
        match iter.next().and_then(|rest| rest.strip_prefix("TASK ")) {
            None => continue,
            Some(rest) => {
                if let Ok(state) = TaskState::from_endtime_and_message(endtime, rest) {
                    status = state;
                }
            }
        }
    }

    Ok(status)
}
235
/// Task State
///
/// Every variant carries the `endtime` (unix epoch) at which the
/// task reached that state.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum TaskState {
    /// The Task ended with an undefined state
    Unknown { endtime: i64 },
    /// The Task ended and there were no errors or warnings
    OK { endtime: i64 },
    /// The Task had 'count' amount of warnings and no errors
    Warning { count: u64, endtime: i64 },
    /// The Task ended with the error described in 'message'
    Error { message: String, endtime: i64 },
}
248
249 impl TaskState {
250 pub fn endtime(&self) -> i64 {
251 match *self {
252 TaskState::Unknown { endtime } => endtime,
253 TaskState::OK { endtime } => endtime,
254 TaskState::Warning { endtime, .. } => endtime,
255 TaskState::Error { endtime, .. } => endtime,
256 }
257 }
258
259 fn result_text(&self) -> String {
260 match self {
261 TaskState::Error { message, .. } => format!("TASK ERROR: {}", message),
262 other => format!("TASK {}", other),
263 }
264 }
265
266 fn from_endtime_and_message(endtime: i64, s: &str) -> Result<Self, Error> {
267 if s == "unknown" {
268 Ok(TaskState::Unknown { endtime })
269 } else if s == "OK" {
270 Ok(TaskState::OK { endtime })
271 } else if s.starts_with("WARNINGS: ") {
272 let count: u64 = s[10..].parse()?;
273 Ok(TaskState::Warning{ count, endtime })
274 } else if s.len() > 0 {
275 let message = if s.starts_with("ERROR: ") { &s[7..] } else { s }.to_string();
276 Ok(TaskState::Error{ message, endtime })
277 } else {
278 bail!("unable to parse Task Status '{}'", s);
279 }
280 }
281 }
282
// NOTE(review): ordering compares only `endtime`, while `PartialEq` is
// derived (structural). `partial_cmp` can therefore return `Equal` for
// values that are not `==` — fine for sorting task lists by end time,
// but do not rely on this ordering for deduplication.
impl std::cmp::PartialOrd for TaskState {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.endtime().cmp(&other.endtime()))
    }
}
288
// Total order on `endtime` only (see the consistency note on the
// `PartialOrd` impl).
impl std::cmp::Ord for TaskState {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.endtime().cmp(&other.endtime())
    }
}
294
295 impl std::fmt::Display for TaskState {
296 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
297 match self {
298 TaskState::Unknown { .. } => write!(f, "unknown"),
299 TaskState::OK { .. }=> write!(f, "OK"),
300 TaskState::Warning { count, .. } => write!(f, "WARNINGS: {}", count),
301 TaskState::Error { message, .. } => write!(f, "{}", message),
302 }
303 }
304 }
305
/// Task details including parsed UPID
///
/// If there is no `state`, the task is still running.
#[derive(Debug)]
pub struct TaskListInfo {
    /// The parsed UPID
    pub upid: UPID,
    /// UPID string representation
    pub upid_str: String,
    /// Final task state (carries the end time); `None` while running
    pub state: Option<TaskState>,
}
318
319 // atomically read/update the task list, update status of finished tasks
320 // new_upid is added to the list when specified.
321 // Returns a sorted list of known tasks,
322 fn update_active_workers(new_upid: Option<&UPID>) -> Result<Vec<TaskListInfo>, Error> {
323
324 let backup_user = crate::backup::backup_user()?;
325
326 let lock = open_file_locked(PROXMOX_BACKUP_TASK_LOCK_FN, std::time::Duration::new(10, 0))?;
327 nix::unistd::chown(PROXMOX_BACKUP_TASK_LOCK_FN, Some(backup_user.uid), Some(backup_user.gid))?;
328
329 let reader = match File::open(PROXMOX_BACKUP_ACTIVE_TASK_FN) {
330 Ok(f) => Some(BufReader::new(f)),
331 Err(err) => {
332 if err.kind() == std::io::ErrorKind::NotFound {
333 None
334 } else {
335 bail!("unable to open active worker {:?} - {}", PROXMOX_BACKUP_ACTIVE_TASK_FN, err);
336 }
337 }
338 };
339
340 let mut active_list = vec![];
341 let mut finish_list = vec![];
342
343 if let Some(lines) = reader.map(|r| r.lines()) {
344
345 for line in lines {
346 let line = line?;
347 match parse_worker_status_line(&line) {
348 Err(err) => bail!("unable to parse active worker status '{}' - {}", line, err),
349 Ok((upid_str, upid, state)) => match state {
350 None if worker_is_active_local(&upid) => {
351 active_list.push(TaskListInfo { upid, upid_str, state: None });
352 },
353 None => {
354 println!("Detected stopped UPID {}", upid_str);
355 let status = upid_read_status(&upid)
356 .unwrap_or_else(|_| TaskState::Unknown { endtime: Local::now().timestamp() });
357 finish_list.push(TaskListInfo {
358 upid, upid_str, state: Some(status)
359 });
360 },
361 Some(status) => {
362 finish_list.push(TaskListInfo {
363 upid, upid_str, state: Some(status)
364 })
365 }
366 }
367 }
368 }
369 }
370
371 if let Some(upid) = new_upid {
372 active_list.push(TaskListInfo { upid: upid.clone(), upid_str: upid.to_string(), state: None });
373 }
374
375 // assemble list without duplicates
376 // we include all active tasks,
377 // and fill up to 1000 entries with finished tasks
378
379 let max = 1000;
380
381 let mut task_hash = HashMap::new();
382
383 for info in active_list {
384 task_hash.insert(info.upid_str.clone(), info);
385 }
386
387 for info in finish_list {
388 if task_hash.len() > max { break; }
389 if !task_hash.contains_key(&info.upid_str) {
390 task_hash.insert(info.upid_str.clone(), info);
391 }
392 }
393
394 let mut task_list: Vec<TaskListInfo> = vec![];
395 for (_, info) in task_hash { task_list.push(info); }
396
397 task_list.sort_unstable_by(|b, a| { // lastest on top
398 match (&a.state, &b.state) {
399 (Some(s1), Some(s2)) => s1.cmp(&s2),
400 (Some(_), None) => std::cmp::Ordering::Less,
401 (None, Some(_)) => std::cmp::Ordering::Greater,
402 _ => a.upid.starttime.cmp(&b.upid.starttime),
403 }
404 });
405
406 let mut raw = String::new();
407 for info in &task_list {
408 if let Some(status) = &info.state {
409 raw.push_str(&format!("{} {:08X} {}\n", info.upid_str, status.endtime(), status));
410 } else {
411 raw.push_str(&info.upid_str);
412 raw.push('\n');
413 }
414 }
415
416 replace_file(
417 PROXMOX_BACKUP_ACTIVE_TASK_FN,
418 raw.as_bytes(),
419 CreateOptions::new()
420 .owner(backup_user.uid)
421 .group(backup_user.gid),
422 )?;
423
424 drop(lock);
425
426 Ok(task_list)
427 }
428
/// Returns a sorted list of known tasks
///
/// Running tasks come first; finished tasks follow, latest on top
/// (see the sort in `update_active_workers`). As a side effect this
/// also updates the on-disk active task file.
pub fn read_task_list() -> Result<Vec<TaskListInfo>, Error> {
    update_active_workers(None)
}
435
/// Launch long running worker tasks.
///
/// A worker task can either be a whole thread, or a simply tokio
/// task/future. Each task can `log()` messages, which are stored
/// persistently to files. Task should poll the `abort_requested`
/// flag, and stop execution when requested.
#[derive(Debug)]
pub struct WorkerTask {
    // unique identifier of this task
    upid: UPID,
    // mutable task state (logger, progress, abort listeners)
    data: Mutex<WorkerTaskData>,
    // set by request_abort(); polled by the task itself
    abort_requested: AtomicBool,
}
448
449 impl std::fmt::Display for WorkerTask {
450
451 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
452 self.upid.fmt(f)
453 }
454 }
455
456 #[derive(Debug)]
457 struct WorkerTaskData {
458 logger: FileLogger,
459 progress: f64, // 0..1
460 warn_count: u64,
461 pub abort_listeners: Vec<oneshot::Sender<()>>,
462 }
463
impl Drop for WorkerTask {

    fn drop(&mut self) {
        // debug output only — removal from WORKER_TASK_LIST happens in
        // log_result(), not here
        println!("unregister worker");
    }
}
470
impl WorkerTask {

    /// Create and register a new worker task.
    ///
    /// Creates the task log file (owned by the backup user), inserts the
    /// task into `WORKER_TASK_LIST` and records it in the active task
    /// file. Returns the task wrapped in an `Arc` for sharing with the
    /// actual worker thread/future.
    pub fn new(worker_type: &str, worker_id: Option<String>, userid: Userid, to_stdout: bool) -> Result<Arc<Self>, Error> {
        println!("register worker"); // debug output

        let upid = UPID::new(worker_type, worker_id, userid)?;
        let task_id = upid.task_id;

        let mut path = std::path::PathBuf::from(PROXMOX_BACKUP_TASK_DIR);

        // spread log files over 256 subdirectories (low byte of pstart)
        path.push(format!("{:02X}", upid.pstart % 256));

        let backup_user = crate::backup::backup_user()?;

        create_path(&path, None, Some(CreateOptions::new().owner(backup_user.uid).group(backup_user.gid)))?;

        path.push(upid.to_string());

        println!("FILE: {:?}", path); // debug output

        let logger = FileLogger::new(&path, to_stdout)?;
        nix::unistd::chown(&path, Some(backup_user.uid), Some(backup_user.gid))?;

        let worker = Arc::new(Self {
            upid: upid.clone(),
            abort_requested: AtomicBool::new(false),
            data: Mutex::new(WorkerTaskData {
                logger,
                progress: 0.0,
                warn_count: 0,
                abort_listeners: vec![],
            }),
        });

        // scope to drop the lock again after inserting
        {
            let mut hash = WORKER_TASK_LIST.lock().unwrap();
            hash.insert(task_id, worker.clone());
            super::set_worker_count(hash.len());
        }

        update_active_workers(Some(&upid))?;

        Ok(worker)
    }

    /// Spawn a new tokio task/future.
    ///
    /// Returns the UPID string of the new task. The result of the
    /// future `f` produces is recorded via `log_result` on completion.
    pub fn spawn<F, T>(
        worker_type: &str,
        worker_id: Option<String>,
        userid: Userid,
        to_stdout: bool,
        f: F,
    ) -> Result<String, Error>
        where F: Send + 'static + FnOnce(Arc<WorkerTask>) -> T,
              T: Send + 'static + Future<Output = Result<(), Error>>,
    {
        let worker = WorkerTask::new(worker_type, worker_id, userid, to_stdout)?;
        let upid_str = worker.upid.to_string();
        let f = f(worker.clone());
        tokio::spawn(async move {
            let result = f.await;
            worker.log_result(&result);
        });

        Ok(upid_str)
    }

    /// Create a new worker thread.
    ///
    /// Like `spawn`, but runs `f` on a dedicated OS thread (named after
    /// the UPID). Panics inside `f` are caught and recorded as task
    /// errors so the final state is always written.
    pub fn new_thread<F>(
        worker_type: &str,
        worker_id: Option<String>,
        userid: Userid,
        to_stdout: bool,
        f: F,
    ) -> Result<String, Error>
        where F: Send + UnwindSafe + 'static + FnOnce(Arc<WorkerTask>) -> Result<(), Error>
    {
        println!("register worker thread"); // debug output

        let worker = WorkerTask::new(worker_type, worker_id, userid, to_stdout)?;
        let upid_str = worker.upid.to_string();

        let _child = std::thread::Builder::new().name(upid_str.clone()).spawn(move || {
            let worker1 = worker.clone();
            // catch panics so log_result() below always runs
            let result = match std::panic::catch_unwind(move || f(worker1)) {
                Ok(r) => r,
                Err(panic) => {
                    match panic.downcast::<&str>() {
                        Ok(panic_msg) => {
                            Err(format_err!("worker panicked: {}", panic_msg))
                        }
                        Err(_) => {
                            Err(format_err!("worker panicked: unknown type."))
                        }
                    }
                }
            };

            worker.log_result(&result);
        });

        Ok(upid_str)
    }

    /// create state from self and a result
    ///
    /// Errors take precedence over warnings; the end time is "now".
    pub fn create_state(&self, result: &Result<(), Error>) -> TaskState {
        let warn_count = self.data.lock().unwrap().warn_count;

        let endtime = Local::now().timestamp();

        if let Err(err) = result {
            TaskState::Error { message: err.to_string(), endtime }
        } else if warn_count > 0 {
            TaskState::Warning { count: warn_count, endtime }
        } else {
            TaskState::OK { endtime }
        }
    }

    /// Log task result, remove task from running list
    pub fn log_result(&self, result: &Result<(), Error>) {
        let state = self.create_state(result);
        // write the final "TASK <status>" line to the task log
        self.log(state.result_text());

        WORKER_TASK_LIST.lock().unwrap().remove(&self.upid.task_id);
        // best effort — errors updating the active file are ignored here
        let _ = update_active_workers(None);
        super::set_worker_count(WORKER_TASK_LIST.lock().unwrap().len());
    }

    /// Log a message.
    pub fn log<S: AsRef<str>>(&self, msg: S) {
        let mut data = self.data.lock().unwrap();
        data.logger.log(msg);
    }

    /// Log a message as warning.
    ///
    /// Also increments the warning counter used by `create_state`.
    pub fn warn<S: AsRef<str>>(&self, msg: S) {
        let mut data = self.data.lock().unwrap();
        data.logger.log(format!("WARN: {}", msg.as_ref()));
        data.warn_count += 1;
    }

    /// Set progress indicator
    ///
    /// Values outside 0.0..=1.0 are silently ignored.
    pub fn progress(&self, progress: f64) {
        if progress >= 0.0 && progress <= 1.0 {
            let mut data = self.data.lock().unwrap();
            data.progress = progress;
        } else {
            // fixme: log!("task '{}': ignoring strange value for progress '{}'", self.upid, progress);
        }
    }

    /// Request abort
    ///
    /// Sets the abort flag and wakes all registered abort listeners.
    pub fn request_abort(&self) {
        eprintln!("set abort flag for worker {}", self.upid);
        self.abort_requested.store(true, Ordering::SeqCst);
        // notify listeners
        let mut data = self.data.lock().unwrap();
        loop {
            match data.abort_listeners.pop() {
                None => { break; },
                Some(ch) => {
                    let _ = ch.send(()); // ignore errors here
                },
            }
        }
    }

    /// Test if abort was requested.
    pub fn abort_requested(&self) -> bool {
        self.abort_requested.load(Ordering::SeqCst)
    }

    /// Fail if abort was requested.
    pub fn fail_on_abort(&self) -> Result<(), Error> {
        if self.abort_requested() {
            bail!("abort requested - aborting task");
        }
        Ok(())
    }

    /// Get a future which resolves on task abort
    ///
    /// If abort was already requested, the returned receiver resolves
    /// immediately.
    pub fn abort_future(&self) -> oneshot::Receiver<()> {
        let (tx, rx) = oneshot::channel::<()>();

        let mut data = self.data.lock().unwrap();
        if self.abort_requested() {
            let _ = tx.send(());
        } else {
            data.abort_listeners.push(tx);
        }
        rx
    }

    /// The task's unique process ID (UPID).
    pub fn upid(&self) -> &UPID {
        &self.upid
    }
}