]>
Commit | Line | Data |
---|---|---|
892821fd DM |
1 | package PVE::API2::Replication; |
2 | ||
3 | use warnings; | |
4 | use strict; | |
5 | ||
6 | use PVE::JSONSchema qw(get_standard_option); | |
7 | use PVE::RPCEnvironment; | |
5ac1eaa0 | 8 | use PVE::Format qw(render_timestamp); |
483f89dd | 9 | use PVE::ProcFSTools; |
5ac1eaa0 | 10 | |
892821fd | 11 | use PVE::ReplicationConfig; |
d092dc4f | 12 | use PVE::ReplicationState; |
892821fd | 13 | use PVE::Replication; |
810c6776 DM |
14 | use PVE::QemuConfig; |
15 | use PVE::QemuServer; | |
16 | use PVE::LXC::Config; | |
17 | use PVE::LXC; | |
892821fd DM |
18 | |
19 | use PVE::RESTHandler; | |
20 | ||
21 | use base qw(PVE::RESTHandler); | |
22 | ||
5b358450 | 23 | our $pvesr_lock_path = "/var/lock/pvesr.lck"; |
810c6776 | 24 | |
# Map a guest type ('qemu' or 'lxc') to the name of the config class handling it.
# Dies for any other type, as that indicates an internal error.
our $lookup_guest_class = sub {
    my ($guest_type) = @_;

    return 'PVE::QemuConfig' if $guest_type eq 'qemu';
    return 'PVE::LXC::Config' if $guest_type eq 'lxc';

    die "unknown guest type '$guest_type' - internal error";
};
36 | ||
# Run a single replication job right away while holding the pvesr lock file.
# Passing $now is useful for regression testing.
#
# Parameters:
#   $jobid   - ID of the job in the replication config
#   $now     - optional timestamp, defaults to time() once the lock is held
#   $logfunc - passed through to PVE::Replication::run_replication()
#
# Dies if the job is unknown, disabled or not of type 'local', or if its guest is
# unknown, not located on this node, or would be replicated to this node.
sub run_single_job {
    my ($jobid, $now, $logfunc) = @_;

    my $nodename = PVE::INotify::nodename();

    my $replicate = sub {
        $now //= time();

        my $cfg = PVE::ReplicationConfig->new();
        my $jobcfg = $cfg->{ids}->{$jobid};

        die "no such job '$jobid'\n" if !$jobcfg;
        die "internal error - not implemented" if $jobcfg->{type} ne 'local';
        die "job '$jobid' is disabled\n" if $jobcfg->{disable};

        my $vmid = $jobcfg->{guest};
        my $vms = PVE::Cluster::get_vmlist();
        my $guest = $vms->{ids}->{$vmid};

        die "no such guest '$vmid'\n" if !$guest;
        die "guest '$vmid' is not on local node\n" if $guest->{node} ne $nodename;
        die "unable to sync to local node\n" if $jobcfg->{target} eq $nodename;

        my $guest_class = $lookup_guest_class->($guest->{type});

        # $now is used for both timestamp arguments here
        PVE::Replication::run_replication($guest_class, $jobcfg, $now, $now, $logfunc);
    };

    # errors from inside the locked section are re-raised from $@ afterwards
    PVE::Tools::lock_file($pvesr_lock_path, 60, $replicate);
    die $@ if $@;
}
74 | ||
810c6776 | 75 | |
# TODO: the two helpers below should probably be part of the general job framework/plugin system

# Decide whether a failure mail should be sent for the given failure count.
# Returns a true value if a mail should be sent, a false value otherwise.
my sub _should_mail_at_failcount {
    my ($fail_count) = @_;

    # avoid spam during migration (bug #4111): when failing to obtain the guest's migration lock,
    # fail_count will be 0
    return 0 if $fail_count == 0;

    # always send the first few for better visibility of the issue
    return 1 if $fail_count <= 3;

    # failing job is re-tried every half hour, try to send one mail after 1, 2, 4, 8, etc. days
    # (48 tries per day), so find the first such threshold that is not below the fail count
    my $due_at = 48;
    $due_at *= 2 while $due_at < $fail_count;

    return $due_at == $fail_count;
}
93 | ||
5ac1eaa0 TL |
# Report a failed replication job: always emit a warning and optionally mail root.
#
# Parameters:
#   $job  - job config hash; id, target, schedule and next_sync are read from it
#   $err  - error text of the failed run
#   $mail - if false, only the warning is emitted
#
# Whether a mail is actually sent depends on the fail_count found in the job's
# replication state, see _should_mail_at_failcount().
my sub _handle_job_err {
    my ($job, $err, $mail) = @_;

    warn "$job->{id}: got unexpected replication job error - $err";
    return if !$mail;

    my $state = PVE::ReplicationState::read_state();
    my $jobstate = PVE::ReplicationState::extract_job_state($state, $job);
    my $fail_count = $jobstate->{fail_count};

    return if !_should_mail_at_failcount($fail_count);

    my $schedule = $job->{schedule} // '*/15';

    my $msg = "Replication job $job->{id} with target '$job->{target}' and schedule";
    $msg .= " '$schedule' failed!\n";

    $msg .= " Last successful sync: ";
    if (my $last_sync = $jobstate->{last_sync}) {
        $msg .= render_timestamp($last_sync) ."\n";
    } else {
        $msg .= "None/Unknown\n";
    }
    # not yet updated, so $job->{next_sync} here is actually the current one.
    # NOTE: Copied from PVE::ReplicationState::job_status()
    # The retry delay is 5 minutes per failure for the first three failures, 30 minutes after.
    my $next_sync = $job->{next_sync} + 60 * ($fail_count <= 3 ? 5 * $fail_count : 30);
    $msg .= " Next sync try: " . render_timestamp($next_sync) ."\n";
    $msg .= " Failure count: $fail_count\n";

    if ($fail_count == 3) {
        $msg .= "\nNote: The system will now reduce the frequency of error reports,";
        $msg .= " as the job appears to be stuck.\n";
    }

    $msg .= "\nError:\n$err";

    # sending the notification is best effort, a mail problem must not abort the caller
    eval {
        PVE::Tools::sendmail('root', "Replication Job: $job->{id} failed", $msg);
    };
    # name the job and the failed action, so the log line is understandable on its own
    warn "$job->{id}: unable to send replication error notification mail - $@" if $@;
}
136 | ||
fa4bb659 TL |
# Run the replication jobs handed out by PVE::ReplicationState::get_next_job() while
# holding the pvesr lock file.
# Passing $now and $verbose is useful for regression testing.
#
# Parameters:
#   $now     - optional fixed timestamp, time() is used when undefined
#   $logfunc - passed through to PVE::Replication::run_replication()
#   $verbose - passed through to PVE::Replication::run_replication()
#   $mail    - passed to _handle_job_err() to allow failure mails
sub run_jobs {
    my ($now, $logfunc, $verbose, $mail) = @_;

    my $iteration = $now // time();

    my $run_queue = sub {
        my $start_time = $now // time();

        PVE::ReplicationState::purge_old_states();

        while (my $jobcfg = PVE::ReplicationState::get_next_job($iteration, $start_time)) {
            my $guest_class = $lookup_guest_class->($jobcfg->{vmtype});

            # a failing job must not prevent the remaining ones from running
            eval {
                PVE::Replication::run_replication(
                    $guest_class, $jobcfg, $iteration, $start_time, $logfunc, $verbose);
            };
            my $err = $@;
            _handle_job_err($jobcfg, $err, $mail) if $err;

            # refresh the start time for the next job, unless a fixed $now was given
            $start_time = $now // time();
        }
    };

    # errors from inside the locked section are re-raised from $@ afterwards
    PVE::Tools::lock_file($pvesr_lock_path, 60, $run_queue);
    die $@ if $@;
}
165 | ||
fc527b4d DM |
# Merge the state information of a job into its config hash and return that hash.
#
# Note: this modifies the passed $jobcfg - the 'state' entry is removed, 'id' is set to
# $jobid, and the defined state values are copied to the top level. 'pid' is only set
# while PVE::ProcFSTools::check_process_running() confirms the recorded process.
my $extract_job_status = sub {
    my ($jobcfg, $jobid) = @_;

    my $state = delete $jobcfg->{state};

    $jobcfg->{id} = $jobid;

    my @defined_keys = grep { defined($state->{$_}) }
        qw(last_sync last_try fail_count error duration);
    for my $key (@defined_keys) {
        $jobcfg->{$key} = $state->{$key};
    }

    my ($pid, $ptime) = ($state->{pid}, $state->{ptime});
    if ($pid && $ptime && PVE::ProcFSTools::check_process_running($pid, $ptime)) {
        $jobcfg->{pid} = $pid;
    }

    return $jobcfg;
};
187 | ||
# GET <base path>: list the status of the replication jobs known on this node.
# The result can be limited to a single guest, and only contains jobs of guests the
# calling user has the VM.Audit privilege on.
__PACKAGE__->register_method ({
    name => 'status',
    path => '',
    method => 'GET',
    description => "List status of all replication jobs on this node.",
    permissions => {
        description => "Requires the VM.Audit permission on /vms/<vmid>.",
        user => 'all',
    },
    protected => 1,
    proxyto => 'node',
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            guest => get_standard_option('pve-vmid', {
                optional => 1,
                description => "Only list replication jobs for this guest.",
            }),
        },
    },
    returns => {
        type => 'array',
        items => {
            type => "object",
            properties => {
                id => { type => 'string' },
            },
        },
        links => [ { rel => 'child', href => "{id}" } ],
    },
    code => sub {
        my ($param) = @_;

        my $rpcenv = PVE::RPCEnvironment::get();
        my $authuser = $rpcenv->get_user();

        my $jobs = PVE::ReplicationState::job_status(1);

        my $res = [];
        foreach my $id (sort keys %$jobs) {
            my $data = $extract_job_status->($jobs->{$id}, $id);
            my $guest = $data->{guest};
            next if defined($param->{guest}) && $guest != $param->{guest};
            # Pass a true $noerr (4th argument), so that a missing privilege makes check()
            # return false and the job is skipped. Without it check() raises a permission
            # exception, failing the whole listing because of a single guest.
            # NOTE(review): relies on PVE::RPCEnvironment::check() semantics - confirm.
            next if !$rpcenv->check($authuser, "/vms/$guest", [ 'VM.Audit' ], 1);
            push @$res, $data;
        }

        return $res;
    }});
238 | ||
# GET {id}: directory index listing the child entries available below a replication job.
__PACKAGE__->register_method({
    name => 'index',
    path => '{id}',
    method => 'GET',
    description => "Directory index.",
    permissions => { user => 'all' },
    parameters => {
        additionalProperties => 0,
        properties => {
            id => get_standard_option('pve-replication-id'),
            node => get_standard_option('pve-node'),
        },
    },
    returns => {
        type => 'array',
        items => {
            type => "object",
            properties => {},
        },
        links => [ { rel => 'child', href => "{name}" } ],
    },
    code => sub {
        my ($param) = @_;

        # one entry per child, in this fixed order
        my @children = qw(schedule_now log status);

        return [ map { +{ name => $_ } } @children ];
    },
});
269 | ||
270 | ||
# GET {id}/status: return the status of a single replication job.
# Dies if the job does not exist; requires VM.Audit on the job's guest.
__PACKAGE__->register_method ({
    name => 'job_status',
    path => '{id}/status',
    method => 'GET',
    description => "Get replication job status.",
    permissions => {
        description => "Requires the VM.Audit permission on /vms/<vmid>.",
        user => 'all',
    },
    protected => 1,
    proxyto => 'node',
    parameters => {
        additionalProperties => 0,
        properties => {
            id => get_standard_option('pve-replication-id'),
            node => get_standard_option('pve-node'),
        },
    },
    returns => {
        type => "object",
        properties => {},
    },
    code => sub {
        my ($param) = @_;

        my $rpcenv = PVE::RPCEnvironment::get();
        my $authuser = $rpcenv->get_user();

        my $jobs = PVE::ReplicationState::job_status(1);
        my $jobid = $param->{id};
        my $jobcfg = $jobs->{$jobid};

        die "no such replication job '$jobid'\n" if !defined($jobcfg);

        my $data = $extract_job_status->($jobcfg, $jobid);
        my $guest = $data->{guest};

        # raise_perm_exc() is not imported into this package, so it has to be called by
        # its full name to be resolvable.
        # NOTE(review): assumes PVE::Exception is already loaded by one of the modules
        # used above, and that check() normally raises on its own here - confirm.
        PVE::Exception::raise_perm_exc()
            if !$rpcenv->check($authuser, "/vms/$guest", [ 'VM.Audit' ]);

        return $data;
    }});
312 | ||
# GET {id}/log: return lines of a replication job's log file.
# Dies if the job does not exist. Access is granted with either VM.Audit on the job's
# guest or Sys.Audit on the node. The line count returned by dump_logfile() is exposed
# as the 'total' result attribute.
__PACKAGE__->register_method({
    name => 'read_job_log',
    path => '{id}/log',
    method => 'GET',
    permissions => {
        description => "Requires the VM.Audit permission on /vms/<vmid>, or 'Sys.Audit' on '/nodes/<node>'",
        user => 'all',
    },
    protected => 1,
    description => "Read replication job log.",
    proxyto => 'node',
    parameters => {
        additionalProperties => 0,
        properties => {
            id => get_standard_option('pve-replication-id'),
            node => get_standard_option('pve-node'),
            start => {
                type => 'integer',
                minimum => 0,
                optional => 1,
            },
            limit => {
                type => 'integer',
                minimum => 0,
                optional => 1,
            },
        },
    },
    returns => {
        type => 'array',
        items => {
            type => "object",
            properties => {
                n => {
                    description=> "Line number",
                    type=> 'integer',
                },
                t => {
                    description=> "Line text",
                    type => 'string',
                }
            }
        }
    },
    code => sub {
        my ($param) = @_;

        my $rpcenv = PVE::RPCEnvironment::get();
        my $authuser = $rpcenv->get_user();

        my $jobid = $param->{id};
        my $filename = PVE::ReplicationState::job_logfile_name($jobid);

        my $cfg = PVE::ReplicationConfig->new();
        my $data = $cfg->{ids}->{$jobid};

        die "no such replication job '$jobid'\n" if !defined($data);

        my $node = $param->{node};

        my $vmid = $data->{guest};

        # Both checks pass a true $noerr (4th argument), so that they return false instead
        # of raising. Otherwise a missing VM.Audit privilege aborts the request before the
        # Sys.Audit alternative is even evaluated.
        # NOTE(review): relies on PVE::RPCEnvironment::check() semantics - confirm.
        my $allowed = $rpcenv->check($authuser, "/vms/$vmid", [ 'VM.Audit' ], 1)
            || $rpcenv->check($authuser, "/nodes/$node", [ 'Sys.Audit' ], 1);

        # raise_perm_exc() is not imported into this package, so it has to be called by
        # its full name to be resolvable.
        # NOTE(review): assumes PVE::Exception is already loaded by one of the modules
        # used above - confirm.
        PVE::Exception::raise_perm_exc() if !$allowed;

        my ($count, $lines) = PVE::Tools::dump_logfile($filename, $param->{start}, $param->{limit});

        $rpcenv->set_result_attrib('total', $count);

        return $lines;
    }});
383 | ||
88ea8e67 DM |
# POST {id}/schedule_now: ask for a replication job to be started as soon as possible.
# Dies if the job does not exist. The result is whatever
# PVE::ReplicationState::schedule_job_now() returns.
__PACKAGE__->register_method({
    name => 'schedule_now',
    path => '{id}/schedule_now',
    method => 'POST',
    description => "Schedule replication job to start as soon as possible.",
    protected => 1,
    proxyto => 'node',
    permissions => {
        check => ['perm', '/storage', ['Datastore.Allocate']],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            id => get_standard_option('pve-replication-id'),
            node => get_standard_option('pve-node'),
        },
    },
    returns => {
        type => 'string',
    },
    code => sub {
        my ($param) = @_;

        my $id = $param->{id};

        my $replication_cfg = PVE::ReplicationConfig->new();
        my $job = $replication_cfg->{ids}->{$id};

        die "no such replication job '$id'\n" if !defined($job);

        # last statement on purpose: its value is the result of this handler
        PVE::ReplicationState::schedule_job_now($job);
    },
});
417 | ||
892821fd | 418 | 1; |