]> git.proxmox.com Git - pve-manager.git/blob - bin/ocf/pvevm
update shipped appliance info index
[pve-manager.git] / bin / ocf / pvevm
1 #!/usr/bin/perl -w
2
3 # Resource Agent for managing PVE VMs (openvz and qemu-kvm)
4 #
5 # License: GNU Affero General Public License (AGPL3)
6 # Copyright (C) 2011 Proxmox Server Solutions GmbH
7
8 use strict;
9 use PVE::Tools;
10 use PVE::ProcFSTools;
11 use PVE::Cluster;
12 use PVE::INotify;
13 use PVE::RPCEnvironment;
14 use PVE::OpenVZ;
15 use PVE::API2::OpenVZ;
16 use PVE::QemuServer;
17 use PVE::API2::Qemu;
18
19 use constant OCF_SUCCESS => 0;
20 use constant OCF_ERR_GENERIC => 1;
21 use constant OCF_ERR_ARGS => 2;
22 use constant OCF_ERR_UNIMPLEMENTED => 3;
23 use constant OCF_ERR_PERM => 4;
24 use constant OCF_ERR_INSTALLED => 5;
25 use constant OCF_ERR_CONFIGURED => 6;
26 use constant OCF_NOT_RUNNING => 7;
27 use constant OCF_RUNNING_MASTER => 8;
28 use constant OCF_FAILED_MASTER => 9;
29
# Use a fixed search path; this agent insists on running as root (see the
# check below) and spawns external helpers such as 'clulog'.
$ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

# Resource type name handed to 'clulog' via its '-m' option.
my $ocf_ressource_type = 'pvevm';

# Maps the symbolic log levels accepted by ocf_log() to the numeric
# priorities handed to 'clulog' via its '-s' option.
my $prio_hash = {
    err => 3,
    note => 5,
    info => 6,
    debug => 7,
};

# Turn any uncaught die() into a logged error and a generic OCF failure.
$SIG{__DIE__} = sub {
    die @_ if $^S; # skip if inside eval
    ocf_log('err', @_);
    # Exit with the constant rather than $!: ocf_log() runs an external
    # command, which may overwrite $! and thereby change the exit status.
    exit(OCF_ERR_GENERIC);
};

# Refuse to run unless the effective user is root.
if ($> != 0) {
    print STDERR "Cannot control VMs. as non-root user.\n";
    exit(OCF_ERR_PERM);
}

PVE::INotify::inotify_init();

# All API calls below are executed as 'root@pam' inside an 'ha' environment.
my $rpcenv = PVE::RPCEnvironment->init('ha');

$rpcenv->init_request();
$rpcenv->set_language($ENV{LANG});
$rpcenv->set_user('root@pam');

# Name of the local cluster node.
my $nodename = PVE::INotify::nodename();

# NOTE(review): these two arrays are not referenced anywhere else in this
# file - confirm whether they can be dropped.
my @ssh_opts = ('-o', 'BatchMode=yes');
my @ssh_cmd = ('ssh', @ssh_opts);
65
# Write a log message to STDOUT and forward it to the 'clulog' command.
#
# Arguments:
#   $level - symbolic priority ('err', 'note', 'info' or 'debug'); any
#            other value is forwarded with the 'note' priority
#   $msg   - message text; a trailing newline is stripped before output
#
# Errors raised while running 'clulog' are deliberately ignored.
sub ocf_log {
    my ($level, $text) = @_;

    chomp $text;
    print "$level: $text\n";

    my $priority = $prio_hash->{$level};
    if (!defined($priority)) {
        $priority = $prio_hash->{note};
    }

    # best effort only - ignore errors
    eval {
        PVE::Tools::run_command(
            ['clulog', '-m', $ocf_ressource_type, '-s', $priority, $text]);
    };
}
79
# Determine the timeout for the current action.
#
# The value is read from OCF_RESKEY_RGMANAGER_meta_timeout or, if that is
# unset/empty/'0', from OCF_RESKEY_CRM_meta_timeout.
#
# Returns the configured value when it is a positive integer, otherwise the
# built-in default of 60. Non-numeric values (for example '75s') are
# rejected instead of being compared numerically and passed on verbatim.
# NOTE(review): the unit is presumably seconds - confirm against the callers.
sub get_timeout {
    my $default_timeout = 60;

    # first true value wins, matching the original if/elsif precedence
    my $tout = $ENV{OCF_RESKEY_RGMANAGER_meta_timeout}
        || $ENV{OCF_RESKEY_CRM_meta_timeout};

    return $default_timeout
        if !defined($tout) || $tout !~ m/^\s*(\d+)\s*$/;

    my $value = int($1);

    return $value > 0 ? $value : $default_timeout;
}
94
# Refresh $status->{running} for the guest described by $status.
#
# Arguments:
#   $status - hash reference with at least the keys 'type' ('qemu' or
#             'openvz') and 'vmid'; the result is stored under 'running'
#
# Any further argument is accepted but ignored.
# Dies if the type is neither 'qemu' nor 'openvz'.
sub check_running {
    my ($status) = @_;

    my $kind = $status->{type};

    if ($kind eq 'qemu') {
        my $vmid = $status->{vmid};
        $status->{running} = PVE::QemuServer::check_running($vmid, 1);
    } elsif ($kind eq 'openvz') {
        my $vmid = $status->{vmid};
        $status->{running} = PVE::OpenVZ::check_running($vmid);
    } else {
        die "got strange VM type '$kind'\n";
    }
}
106
# Validate the resource configuration and collect the guest's state.
#
# Reads the VMID from OCF_RESKEY_vmid, looks it up in the cluster VM list
# and returns a hash reference with the keys:
#   vmid, type, node - taken from the environment / cluster VM list
#   name             - "VM <vmid>" for type 'qemu', "CT <vmid>" otherwise
#   running          - filled in by check_running()
#
# On any error the message is logged and the process exits with
# OCF_ERR_ARGS, so this function only returns on success.
sub validate_all {
    my $status = {};

    eval {
        my $vmid = $ENV{OCF_RESKEY_vmid};
        defined($vmid) or die "no VMID specified\n";
        $vmid =~ m/^[1-9]\d*$/ or die "got invalid VMID '$vmid'\n";

        my $vmlist = PVE::Cluster::get_vmlist();
        die "got empty cluster VM list\n"
            unless $vmlist && $vmlist->{ids};

        my $entry = $vmlist->{ids}->{$vmid}
            or die "VM $vmid does not exist\n";

        my $label = $entry->{type} eq 'qemu' ? 'VM' : 'CT';

        %$status = (
            vmid => $vmid,
            type => $entry->{type},
            node => $entry->{node},
            name => "$label $vmid",
        );

        check_running($status);
    };

    my $err = $@;
    if ($err) {
        ocf_log('err', $err);
        exit(OCF_ERR_ARGS);
    }

    return $status;
}
140
# Block until the task identified by $upid is no longer running.
#
# Arguments:
#   $upid - task identifier as returned by the PVE API calls; it is decoded
#           with PVE::Tools::upid_decode() to obtain 'pid' and 'pstart'
#
# Polls once per second and logs a debug message while the task is alive.
sub upid_wait {
    my ($upid) = @_;

    my $task = PVE::Tools::upid_decode($upid);
    my ($pid, $pstart) = ($task->{pid}, $task->{pstart});

    # always sleep once before the first check
    for (;;) {
        sleep(1);
        last if !PVE::ProcFSTools::check_process_running($pid, $pstart);
        ocf_log('debug', "Task still active, waiting");
    }
}
152
# Main: dispatch on the requested resource agent action (first argument).

my $cmd = shift || '';

# Only 'migrate' takes a second argument (the target node). The variable is
# declared unconditionally because 'my $x = ... if COND' has undefined
# behaviour in Perl.
my $migratetarget;
$migratetarget = shift if $cmd eq 'migrate';

die "too many arguments\n" if scalar (@ARGV) != 0;

if ($cmd eq 'start') {
    my $status = validate_all();
    if ($status->{running}) {
        ocf_log('info', "$status->{name} is already running");
        exit(OCF_SUCCESS);
    }

    # The guest is currently assigned to another node: move its config
    # file to the local node before starting it here.
    if ($status->{node} ne $nodename) {
        ocf_log('info', "Move config for $status->{name} to local node");
        my ($oldconfig, $newconfig);
        if ($status->{type} eq 'qemu') {
            $oldconfig = PVE::QemuServer::config_file($status->{vmid}, $status->{node});
            $newconfig = PVE::QemuServer::config_file($status->{vmid}, $nodename);
        } else {
            $oldconfig = PVE::OpenVZ::config_file($status->{vmid}, $status->{node});
            $newconfig = PVE::OpenVZ::config_file($status->{vmid}, $nodename);
        }
        if (!rename($oldconfig, $newconfig)) {
            ocf_log('err', "unable to move config file from '$oldconfig' to '$newconfig' - $!");
            exit(OCF_ERR_GENERIC);
        }
    }

    my $upid;

    if ($status->{type} eq 'qemu') {
        $upid = PVE::API2::Qemu->vm_start({node => $nodename, vmid => $status->{vmid}});
    } else {
        $upid = PVE::API2::OpenVZ->vm_start({node => $nodename, vmid => $status->{vmid}});
    }

    upid_wait($upid);

    check_running($status);

    if (!$status->{running}) {
        # do not fail silently
        ocf_log('err', "Start of $status->{name} has failed");
        exit(OCF_ERR_GENERIC);
    }

    # Optionally wait until the configured status program succeeds,
    # retrying every 3 seconds until the action timeout expires.
    if (my $testprog = $ENV{OCF_RESKEY_status_program}) {

        my $timeout = get_timeout();

        my $wait_func = sub {
            while (system($testprog) != 0) { sleep(3); }
        };

        eval { PVE::Tools::run_with_timeout($timeout, $wait_func); };
        if (my $err = $@) {
            ocf_log('err', "Start of $status->{name} has failed");
            ocf_log('err', "error while waiting for '$testprog' - $err");
            exit(OCF_ERR_GENERIC);
        }
    }

    exit(OCF_SUCCESS);

} elsif ($cmd eq 'stop') {
    my $status = validate_all();

    if (!$status->{running}) {
        ocf_log('info', "$status->{name} is already stopped");
        exit(OCF_SUCCESS);
    }

    my $timeout = get_timeout();

    my $upid;

    my $param = {
        node => $nodename,
        vmid => $status->{vmid},
        timeout => $timeout,
        forceStop => 1,
    };

    if ($status->{type} eq 'qemu') {
        $upid = PVE::API2::Qemu->vm_shutdown($param);
    } else {
        $upid = PVE::API2::OpenVZ->vm_shutdown($param);
    }

    upid_wait($upid);

    check_running($status);

    exit($status->{running} ? OCF_ERR_GENERIC : OCF_SUCCESS);

    # TODO: the original file had a second, unreachable 'stop' branch
    # further down (a hard vm_stop stub ending in die "implement me").
    # It could never run because this branch matches first, so it was
    # removed.

} elsif ($cmd eq 'recover' || $cmd eq 'restart' || $cmd eq 'reload') {

    # accepted, but nothing to do
    exit(OCF_SUCCESS);

} elsif ($cmd eq 'status' || $cmd eq 'monitor') {

    my $status = validate_all();

    if (!$status->{running}) {
        ocf_log('debug', "$status->{name} is not running");
        exit(OCF_NOT_RUNNING);
    }

    ocf_log('debug', "$status->{name} is running");

    my $testprog = $ENV{OCF_RESKEY_status_program};
    my $checklevel = $ENV{OCF_CHECK_LEVEL};

    # the status program is only consulted for check levels >= 10
    if ($testprog && $checklevel && $checklevel >= 10) {
        if (system($testprog) != 0) {
            exit(OCF_NOT_RUNNING);
        }
    }

    exit(OCF_SUCCESS);

} elsif ($cmd eq 'migrate') {
    my $status = validate_all();
    if (!$status->{running}) {
        ocf_log('err', "$status->{name} is not running");
        exit(OCF_ERR_GENERIC);
    }

    if (!$migratetarget) {
        ocf_log('err', "No target specified");
        exit(OCF_ERR_ARGS);
    }

    my $upid;
    my $params = {
        node => $nodename,
        vmid => $status->{vmid},
        target => $migratetarget,
        online => 1,
    };

    my $oldconfig;
    if ($status->{type} eq 'qemu') {
        $oldconfig = PVE::QemuServer::config_file($status->{vmid}, $status->{node});
        $upid = PVE::API2::Qemu->migrate_vm($params);
    } else {
        $oldconfig = PVE::OpenVZ::config_file($status->{vmid}, $status->{node});
        $upid = PVE::API2::OpenVZ->migrate_vm($params);
    }

    upid_wait($upid);

    # something went wrong if old config file is still there
    exit((-f $oldconfig) ? OCF_ERR_GENERIC : OCF_SUCCESS);

} elsif ($cmd eq 'reconfig') {
    # Reconfigure a running VM
    my $status = validate_all();

    # we do nothing here

} elsif ($cmd eq 'meta-data') {
    # print the resource agent metadata stored after __DATA__
    while (<DATA>) {
        print;
    }
} elsif ($cmd eq 'validate-all') {
    my $status = validate_all();
} else {
    die "usage: $0 {start|stop|recover|restart|reload|status|monitor|migrate|reconfig|meta-data|validate-all}\n";
}

exit(OCF_SUCCESS);
342
343 __DATA__
344 <?xml version="1.0"?>
345 <resource-agent version="rgmanager 2.0" name="pvevm">
346 <version>1.0</version>
347
348 <longdesc lang="en">
349 Defines a PVE Virtual Machine
350 </longdesc>
351 <shortdesc lang="en">
352 Defines a PVE Virtual Machine
353 </shortdesc>
354
355 <parameters>
356 <parameter name="vmid" primary="1">
357 <longdesc lang="en">
358 This is the VMID of the virtual machine.
359 </longdesc>
360 <shortdesc lang="en">
361 VMID
362 </shortdesc>
363 <content type="string"/>
364 </parameter>
365
366 <parameter name="domain" reconfig="1">
367 <longdesc lang="en">
368 Failover domains define lists of cluster members
369 to try in the event that the host of the virtual machine
370 fails.
371 </longdesc>
372 <shortdesc lang="en">
373 Cluster failover Domain
374 </shortdesc>
375 <content type="string"/>
376 </parameter>
377
378 <parameter name="autostart" reconfig="1">
379 <longdesc lang="en">
380 If set to yes, this resource group will automatically be started
381 after the cluster forms a quorum. If set to no, this virtual
382 machine will start in the 'disabled' state after the cluster
383 forms a quorum.
384 </longdesc>
385 <shortdesc lang="en">
386 Automatic start after quorum formation
387 </shortdesc>
388 <content type="boolean" default="1"/>
389 </parameter>
390
391 <parameter name="exclusive" reconfig="1">
392 <longdesc lang="en">
393 If set, this resource group will only relocate to
394 nodes which have no other resource groups running in the
395 event of a failure. If no empty nodes are available,
396 this resource group will not be restarted after a failure.
397 Additionally, resource groups will not automatically
398 relocate to the node running this resource group. This
399 option can be overridden by manual start and/or relocate
400 operations.
401 </longdesc>
402 <shortdesc lang="en">
403 Exclusive resource group
404 </shortdesc>
405 <content type="boolean" default="0"/>
406 </parameter>
407
408 <parameter name="recovery" reconfig="1">
409 <longdesc lang="en">
410 This currently has three possible options: "restart" tries
411 to restart this virtual machine locally before
412 attempting to relocate (default); "relocate" does not bother
413 trying to restart the VM locally; "disable" disables
414 the VM if it fails.
415 </longdesc>
416 <shortdesc lang="en">
417 Failure recovery policy
418 </shortdesc>
419 <content type="string"/>
420 </parameter>
421
422 <parameter name="migrate">
423 <longdesc lang="en">
424 Migration type (live or pause, default = live).
425 </longdesc>
426 <shortdesc lang="en">
427 Migration type (live or pause, default = live).
428 </shortdesc>
429 <content type="string" default="live"/>
430 </parameter>
431
432 <parameter name="depend">
433 <longdesc lang="en">
434 Service dependency; will not start without the specified
435 service running.
436 </longdesc>
437 <shortdesc lang="en">
438 Top-level service this depends on, in service:name format.
439 </shortdesc>
440 <content type="string"/>
441 </parameter>
442
443 <parameter name="depend_mode">
444 <longdesc lang="en">
445 Service dependency mode.
446 hard - This service is stopped/started if its dependency
447 is stopped/started
448 soft - This service only depends on the other service for
449 initial startup. If the other service stops, this
450 service is not stopped.
451 </longdesc>
452 <shortdesc lang="en">
453 Service dependency mode (soft or hard).
454 </shortdesc>
455 <content type="string" default="hard"/>
456 </parameter>
457
458 <parameter name="max_restarts" reconfig="1">
459 <longdesc lang="en">
460 Maximum restarts for this service.
461 </longdesc>
462 <shortdesc lang="en">
463 Maximum restarts for this service.
464 </shortdesc>
465 <content type="string" default="0"/>
466 </parameter>
467
468 <parameter name="restart_expire_time" reconfig="1">
469 <longdesc lang="en">
470 Restart expiration time. A restart is forgotten
471 after this time. When combined with the max_restarts
472 option, this lets administrators specify a threshold
473 for when to fail over services. If max_restarts
474 is exceeded in this given expiration time, the service
475 is relocated instead of restarted again.
476 </longdesc>
477 <shortdesc lang="en">
478 Restart expiration time; amount of time before a restart
479 is forgotten.
480 </shortdesc>
481 <content type="string" default="0"/>
482 </parameter>
483
484 <parameter name="status_program" reconfig="1">
485 <longdesc lang="en">
486 Ordinarily, only the presence/health of a virtual machine
487 is checked. If specified, the status_program value is
488 executed during a depth 10 check. The intent of this
489 program is to ascertain the status of critical services
490 within a virtual machine.
491 </longdesc>
492 <shortdesc lang="en">
493 Additional status check program
494 </shortdesc>
495 <content type="string" default=""/>
496 </parameter>
497 </parameters>
498
499 <actions>
500 <action name="start" timeout="75"/>
501 <action name="stop" timeout="75"/>
502
503 <action name="status" timeout="10" interval="30"/>
504 <action name="monitor" timeout="10" interval="30"/>
505
506 <!-- depth 10 calls the status_program -->
507 <action name="status" depth="10" timeout="20" interval="60"/>
508 <action name="monitor" depth="10" timeout="20" interval="60"/>
509
510 <!-- reconfigure - reconfigure with new OCF parameters.
511 NOT OCF COMPATIBLE AT ALL -->
512 <action name="reconfig" timeout="10"/>
513
514 <action name="migrate" timeout="10m"/>
515
516 <action name="meta-data" timeout="5"/>
517 <action name="validate-all" timeout="5"/>
518
519 </actions>
520
521 <special tag="rgmanager">
522 <!-- Destroy_on_delete / init_on_add are currently only
523 supported for migratory resources (no children
524 and the 'migrate' action; see above). Do not try this
525 with normal services -->
526 <attributes maxinstances="1" destroy_on_delete="0" init_on_add="0"/>
527 </special>
528 </resource-agent>
529