git.proxmox.com Git - pve-manager.git/commitdiff
add HA resource agent
author    Dietmar Maurer <dietmar@proxmox.com>
          Tue, 13 Dec 2011 09:19:02 +0000 (10:19 +0100)
committer Dietmar Maurer <dietmar@proxmox.com>
          Tue, 13 Dec 2011 09:19:02 +0000 (10:19 +0100)
bin/Makefile
bin/ocf/Makefile [new file with mode: 0644]
bin/ocf/pvevm [new file with mode: 0755]
debian/changelog.Debian
defines.mk

diff --git a/bin/Makefile b/bin/Makefile
index f2070553d6eba2c3b1aedb5291137465c48248e1..cda55d79df92026c216ba29414b342a2750c6893 100644 (file)
@@ -1,6 +1,6 @@
 include ../defines.mk
 
-SUBDIRS = init.d cron test
+SUBDIRS = init.d cron ocf test
 
 SCRIPTS =                      \
        vzdump                  \
diff --git a/bin/ocf/Makefile b/bin/ocf/Makefile
new file mode 100644 (file)
index 0000000..543d218
--- /dev/null
@@ -0,0 +1,17 @@
+include ../../defines.mk
+
+all:
+
+SCRIPTS = pvevm
+
+.PHONY: install 
+install: ${SCRIPTS}
+       install -d ${HARADIR}
+       install -m 0755 ${SCRIPTS} ${HARADIR}
+
+.PHONY: distclean
+distclean: clean
+
+.PHONY: clean
+clean:
+       rm -rf *~
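This Makefile stages the agent into ${HARADIR}, which the defines.mk hunk at the end of this commit sets to ${DESTDIR}/usr/share/cluster - the directory rgmanager scans for resource agents. A minimal sketch of a staging install from the repository root (the /tmp/stage path is hypothetical):

    # stage the agent without touching the live system
    make -C bin/ocf install DESTDIR=/tmp/stage
    ls -l /tmp/stage/usr/share/cluster/pvevm   # installed with mode 0755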
diff --git a/bin/ocf/pvevm b/bin/ocf/pvevm
new file mode 100755 (executable)
index 0000000..aa9cce0
--- /dev/null
@@ -0,0 +1,465 @@
+#!/usr/bin/perl -w
+
+# Resource Agent for managing PVE VMs (openvz and qemu-kvm)
+#
+# License: GNU Affero General Public License (AGPL3)
+# Copyright (C) 2011 Proxmox Server Solutions GmbH
+
+use strict;
+use PVE::Tools;
+use PVE::ProcFSTools;
+use PVE::Cluster;
+use PVE::INotify;
+use PVE::RPCEnvironment;
+use PVE::OpenVZ;
+use PVE::API2::OpenVZ;
+use PVE::QemuServer;
+use PVE::API2::Qemu;
+
+use constant OCF_SUCCESS => 0;
+use constant OCF_ERR_GENERIC => 1;
+use constant OCF_ERR_ARGS => 2;
+use constant OCF_ERR_UNIMPLEMENTED => 3;
+use constant OCF_ERR_PERM => 4;
+use constant OCF_ERR_INSTALLED => 5;
+use constant OCF_ERR_CONFIGURED => 6;
+use constant OCF_NOT_RUNNING => 7;
+use constant OCF_RUNNING_MASTER => 8;
+use constant OCF_FAILED_MASTER => 9;
+
+$ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';
+
+$SIG{__DIE__} = sub {
+    die @_ if $^S; # skip if inside eval
+    $! = OCF_ERR_GENERIC;
+};
+
+if ($> != 0) {
+    print STDERR "Cannot control VMs as non-root user.\n";
+    exit(OCF_ERR_PERM);
+}
+
+PVE::INotify::inotify_init();
+
+my $rpcenv = PVE::RPCEnvironment->init('ha');
+
+$rpcenv->init_request();
+$rpcenv->set_language($ENV{LANG});
+$rpcenv->set_user('root@pam'); 
+
+my $nodename = PVE::INotify::nodename();
+
+my @ssh_opts = ('-o', 'BatchMode=yes');
+my @ssh_cmd = ('ssh', @ssh_opts);
+
+sub ocf_log {
+    my ($level, $msg) = @_;
+
+    # fixme:
+    
+    chomp $msg;
+    
+    print "$level: $msg\n";
+}
+
+sub check_running {
+    my ($status, $verbose) = @_;
+
+    if ($status->{type} eq 'qemu') {
+       $status->{running} = PVE::QemuServer::check_running($status->{vmid}, 1);
+    } elsif ($status->{type} eq 'openvz') {
+       $status->{running} = PVE::OpenVZ::check_running($status->{vmid});
+    } else {
+       die "got strange VM type '$status->{type}'\n";
+    }
+}
+
+sub validate_all {
+    my $status = {};
+
+    eval {
+
+       my $vmid = $ENV{OCF_RESKEY_vmid};
+       die "no VMID specified\n" if !defined($vmid);
+       die "got invalid VMID '$vmid'\n" if $vmid !~ m/^[1-9]\d*$/;
+
+       my $vmlist = PVE::Cluster::get_vmlist();
+       die "got empty cluster VM list\n" if !$vmlist || !$vmlist->{ids};
+       my $data = $vmlist->{ids}->{$vmid};
+       die "VM $vmid does not exist\n" if !$data;
+
+       $status->{vmid} = $vmid;
+       $status->{type} = $data->{type};
+       $status->{node} = $data->{node};
+
+       ocf_log('debug', "VM $vmid ($status->{type}) on node $status->{node}\n");
+
+       check_running($status);
+    };
+    if (my $err = $@) {
+       ocf_log('err', $err);
+       exit(OCF_ERR_ARGS);
+    }
+
+    return $status;
+}
+
+sub upid_wait {
+    my ($upid) = @_;
+
+    my $task = PVE::Tools::upid_decode($upid);
+
+    sleep(1);
+    while (PVE::ProcFSTools::check_process_running($task->{pid}, $task->{pstart})) {
+       ocf_log('debug', "Task still active, waiting");
+       sleep(1);
+    }
+}
+
+my $cmd = shift || '';
+my $migratetarget;
+$migratetarget = shift if $cmd eq 'migrate';
+
+die "too many arguments\n" if scalar (@ARGV) != 0;
+
+if ($cmd eq 'start') {
+    my $status = validate_all();
+    if ($status->{running}) {
+       ocf_log('info', "Resource is already running");
+       exit(OCF_SUCCESS);
+    }
+
+    if ($status->{node} ne $nodename) {
+       ocf_log('info', "Move config to local node");
+       my ($oldconfig, $newconfig);
+       if ($status->{type} eq 'qemu') {
+           $oldconfig = PVE::QemuServer::config_file($status->{vmid}, $status->{node});
+           $newconfig = PVE::QemuServer::config_file($status->{vmid}, $nodename);
+       } else {
+           $oldconfig = PVE::OpenVZ::config_file($status->{vmid}, $status->{node});
+           $newconfig = PVE::OpenVZ::config_file($status->{vmid}, $nodename);
+       }
+       if (!rename($oldconfig, $newconfig)) {
+           ocf_log('err', "unable to move config file from '$oldconfig' to '$newconfig' - $!");
+           exit(OCF_ERR_GENERIC);
+       }
+    }
+
+    my $upid;
+    
+    if ($status->{type} eq 'qemu') {
+       $upid = PVE::API2::Qemu->vm_start({node => $nodename, vmid => $status->{vmid}});
+    } else {
+       $upid = PVE::API2::OpenVZ->vm_start({node => $nodename, vmid => $status->{vmid}});
+    }
+
+    upid_wait($upid);
+
+    check_running($status);
+
+    exit($status->{running} ? OCF_SUCCESS : OCF_ERR_GENERIC);
+
+} elsif($cmd eq 'stop') {
+    my $status = validate_all();
+
+    if (!$status->{running}) {
+       ocf_log('info', "Resource is already stopped");
+       exit(OCF_SUCCESS);
+    }
+
+    my $upid;
+    
+    if ($status->{type} eq 'qemu') {
+       $upid = PVE::API2::Qemu->vm_stop({node => $nodename, vmid => $status->{vmid}});
+    } else {
+       $upid = PVE::API2::OpenVZ->vm_stop({node => $nodename, vmid => $status->{vmid}, fast => 1});
+    }
+
+    upid_wait($upid);
+
+    check_running($status);
+
+    exit($status->{running} ? OCF_ERR_GENERIC : OCF_SUCCESS);
+
+} elsif($cmd eq 'recover' || $cmd eq 'restart' || $cmd eq 'reload') {
+
+    exit(OCF_SUCCESS);
+
+} elsif($cmd eq 'status' || $cmd eq 'monitor') {
+
+    my $status = validate_all();
+    if ($status->{running}) {
+       ocf_log('debug', "Resource is running");
+       exit(OCF_SUCCESS);
+    } else {
+       ocf_log('debug', "Resource is not running");
+       exit(OCF_NOT_RUNNING);
+    }
+
+} elsif($cmd eq 'migrate') {
+    my $status = validate_all();
+    if (!$status->{running}) {
+       ocf_log('err', "Resource is not running");
+       exit(OCF_ERR_GENERIC);
+    }
+
+    if (!$migratetarget) {
+       ocf_log('err', "No target specified");
+       exit(OCF_ERR_ARGS);
+    }
+
+    # test ssh connection and try to detect node name
+    my @rem_ssh = (@ssh_cmd, "root\@$migratetarget");
+    my $cmd = [ @rem_ssh, '/bin/hostname' ];
+    my $targetnode = '';
+    eval {
+       PVE::Tools::run_command($cmd, outfunc => sub {
+           $targetnode = shift if !$targetnode; 
+       });
+    };
+    if (my $err = $@) {
+       ocf_log('err', "can't connect to target '$migratetarget' - $err");
+       exit(OCF_ERR_GENERIC);  
+    }
+    if (!PVE::Cluster::check_node_exists($targetnode, 1)) {
+       ocf_log('err', "target hostname '$targetnode' is no cluster member");
+       exit(OCF_ERR_GENERIC);
+    }
+
+    my $upid;
+    my $params = {
+       node => $nodename, 
+       vmid => $status->{vmid},
+       target => $targetnode,
+       online => 1,
+    };
+
+    my $oldconfig;
+    if ($status->{type} eq 'qemu') {
+       $oldconfig = PVE::QemuServer::config_file($status->{vmid}, $status->{node});
+       $upid = PVE::API2::Qemu->migrate_vm($params);
+    } else {
+       $oldconfig = PVE::OpenVZ::config_file($status->{vmid}, $status->{node});
+       $upid = PVE::API2::OpenVZ->migrate_vm($params);
+    }
+
+    upid_wait($upid);
+
+    # something went wrong if old config file is still there
+    exit((-f $oldconfig) ? OCF_ERR_GENERIC : OCF_SUCCESS);
+
+} elsif($cmd eq 'reconfig') {
+    # Reconfigure a running VM
+    my $status = validate_all();
+
+    # we do nothing here
+
+} elsif($cmd eq 'meta-data') {
+    while(<DATA>) {
+       print;
+    }
+} elsif($cmd eq 'validate-all') {
+    my $status = validate_all();
+} else {
+    die "usage: $0 {start|stop|restart|status|reload|reconfig|meta-data|validate-all}\n";
+}
+
+exit(OCF_SUCCESS);
+
+__DATA__
+<?xml version="1.0"?>
+<resource-agent version="rgmanager 2.0" name="pvevm">
+    <version>1.0</version>
+
+    <longdesc lang="en">
+       Defines a PVE Virtual Machine
+    </longdesc>
+    <shortdesc lang="en">
+        Defines a PVE Virtual Machine
+    </shortdesc>
+
+    <parameters>
+        <parameter name="vmid" primary="1">
+            <longdesc lang="en">
+                This is the VMID of the virtual machine.
+            </longdesc>
+            <shortdesc lang="en">
+                VMID
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+    
+        <parameter name="domain" reconfig="1">
+            <longdesc lang="en">
+                Failover domains define lists of cluster members
+                to try in the event that the host of the virtual machine
+               fails.
+            </longdesc>
+            <shortdesc lang="en">
+                Cluster failover domain
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
+        <parameter name="autostart" reconfig="1">
+            <longdesc lang="en">
+               If set to yes, this resource group will automatically be started
+               after the cluster forms a quorum.  If set to no, this virtual
+               machine will start in the 'disabled' state after the cluster
+               forms a quorum.
+            </longdesc>
+            <shortdesc lang="en">
+               Automatic start after quorum formation
+            </shortdesc>
+            <content type="boolean" default="1"/>
+        </parameter>
+
+        <parameter name="exclusive" reconfig="1">
+            <longdesc lang="en">
+               If set, this resource group will only relocate to
+               nodes which have no other resource groups running in the
+               event of a failure.  If no empty nodes are available,
+               this resource group will not be restarted after a failure.
+               Additionally, resource groups will not automatically
+               relocate to the node running this resource group.  This
+               option can be overridden by manual start and/or relocate
+               operations.
+            </longdesc>
+            <shortdesc lang="en">
+               Exclusive resource group
+            </shortdesc>
+            <content type="boolean" default="0"/>
+        </parameter>
+
+        <parameter name="recovery" reconfig="1">
+            <longdesc lang="en">
+               This currently has three possible options: "restart" tries
+               to restart this virtual machine locally before
+               attempting to relocate (default); "relocate" does not bother
+               trying to restart the VM locally; "disable" disables
+               the VM if it fails.
+            </longdesc>
+            <shortdesc lang="en">
+               Failure recovery policy
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
+        <parameter name="depend">
+            <longdesc lang="en">
+               Service dependency; will not start without the specified
+               service running.
+            </longdesc>
+            <shortdesc lang="en">
+               Top-level service this depends on, in service:name format.
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
+        <parameter name="depend_mode">
+            <longdesc lang="en">
+               Service dependency mode.
+               hard - This service is stopped/started if its dependency
+                      is stopped/started
+               soft - This service only depends on the other service for
+                      initial startup.  If the other service stops, this
+                      service is not stopped.
+            </longdesc>
+            <shortdesc lang="en">
+               Service dependency mode (soft or hard).
+            </shortdesc>
+            <content type="string" default="hard"/>
+        </parameter>
+
+        <parameter name="max_restarts" reconfig="1">
+            <longdesc lang="en">
+               Maximum restarts for this service.
+            </longdesc>
+            <shortdesc lang="en">
+               Maximum restarts for this service.
+            </shortdesc>
+            <content type="string" default="0"/>
+        </parameter>
+
+        <parameter name="restart_expire_time" reconfig="1">
+            <longdesc lang="en">
+               Restart expiration time.  A restart is forgotten
+               after this time.  When combined with the max_restarts
+               option, this lets administrators specify a threshold
+               for when to fail over services.  If max_restarts
+               is exceeded within this expiration time, the service
+               is relocated instead of restarted again.
+            </longdesc>
+            <shortdesc lang="en">
+               Restart expiration time; amount of time before a restart
+               is forgotten.
+            </shortdesc>
+            <content type="string" default="0"/>
+        </parameter>
+
+        <parameter name="status_program" reconfig="1">
+            <longdesc lang="en">
+               Ordinarily, only the presence/health of a virtual machine
+               is checked.  If specified, the status_program value is
+               executed during a depth 10 check.  The intent of this 
+               program is to ascertain the status of critical services
+               within a virtual machine.
+            </longdesc>
+            <shortdesc lang="en">
+               Additional status check program
+            </shortdesc>
+            <content type="string" default=""/>
+        </parameter>
+    </parameters>
+
+    <actions>
+        <action name="start" timeout="75"/>
+        <action name="stop" timeout="75"/>
+       
+        <action name="status" timeout="10" interval="30"/>
+        <action name="monitor" timeout="10" interval="30"/>
+
+       <!-- depth 10 calls the status_program -->
+        <action name="status" depth="10" timeout="20" interval="60"/>
+        <action name="monitor" depth="10" timeout="20" interval="60"/>
+
+       <!-- reconfigure - reconfigure with new OCF parameters.
+            NOT OCF COMPATIBLE AT ALL -->
+       <action name="reconfig" timeout="10"/>
+
+       <action name="migrate" timeout="10m"/>
+
+        <action name="meta-data" timeout="5"/>
+        <action name="validate-all" timeout="5"/>
+
+    </actions>
+    
+    <special tag="rgmanager">
+       <!-- destroy_on_delete / init_on_add are currently only
+            supported for migratory resources (no children
+            and the 'migrate' action; see above).  Do not try this
+            with normal services -->
+        <attributes maxinstances="1" destroy_on_delete="0" init_on_add="0"/>
+    </special>
+</resource-agent>
+
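For manual testing it helps to know that rgmanager passes the action as the first argument and resource parameters through OCF_RESKEY_* environment variables, which is why the script reads $ENV{OCF_RESKEY_vmid} and exits with the OCF_* codes defined at the top. A minimal smoke test, assuming a VM with the (hypothetical) ID 101 exists on the local node:

    # query status directly; rgmanager normally sets OCF_RESKEY_* itself
    OCF_RESKEY_vmid=101 /usr/share/cluster/pvevm status
    echo $?    # 0 = OCF_SUCCESS (running), 7 = OCF_NOT_RUNNING

    # print the resource metadata (the XML from the __DATA__ section)
    /usr/share/cluster/pvevm meta-data

In cluster.conf the agent is then referenced by its file name, e.g. as a <pvevm vmid="101"/> resource element.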
diff --git a/debian/changelog.Debian b/debian/changelog.Debian
index 125da37d449be735edf08d622bf77b889858d9d5..b01503c723abd5603fa4dc82ccf42fff1289d7d8 100644 (file)
@@ -1,3 +1,9 @@
+pve-manager (2.0-15) unstable; urgency=low
+
+  * add HA resource agent
+
+ -- Proxmox Support Team <support@proxmox.com>  Tue, 13 Dec 2011 10:18:20 +0100
+
 pve-manager (2.0-14) unstable; urgency=low
 
   * add Japanese translation (many thanks to Koichi!)
diff --git a/defines.mk b/defines.mk
index d1944a47d7ac4158951ee9b92d1045d4c766bd98..ec4d22810f840100af7f8df4e1f6493f303fa068 100644 (file)
@@ -2,13 +2,14 @@ RELEASE=2.0
 
 VERSION=2.0
 PACKAGE=pve-manager
-PACKAGERELEASE=14
+PACKAGERELEASE=15
 
 BINDIR=${DESTDIR}/usr/bin
 PERLLIBDIR=${DESTDIR}/usr/share/perl5
 MAN1DIR=${DESTDIR}/usr/share/man/man1
 CRONDAILYDIR=${DESTDIR}/etc/cron.daily
-INITDBINDIR=${DESTDIR}/etc/init.d/
+INITDBINDIR=${DESTDIR}/etc/init.d
+HARADIR=${DESTDIR}/usr/share/cluster
 DOCDIR=${DESTDIR}/usr/share/doc/${PACKAGE}
 PODDIR=${DESTDIR}/usr/share/doc/${PACKAGE}/pod
 WWWBASEDIR=${DESTDIR}/usr/share/${PACKAGE}