" to '${new_state}'$text_state");
};
+# clean up a possible bad state from a recovered service to allow its start
+my $fence_recovery_cleanup = sub {
+ my ($self, $sid, $fenced_node) = @_;
+
+ my $haenv = $self->{haenv};
+
+ # resolve the service's resource plugin from its "type:id" service ID
+ my (undef, $type, $id) = PVE::HA::Tools::parse_sid($sid);
+ my $plugin = PVE::HA::Resources->lookup($type);
+
+ # should not happen
+ die "unknown resource type '$type'" if !$plugin;
+
+ # locks may block recovery, cleanup those which are safe to remove after fencing
+ my $removable_locks = ['backup', 'mounted'];
+ if (my $removed_lock = $plugin->remove_locks($haenv, $id, $removable_locks, $fenced_node)) {
+ $haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
+ "service '$sid' to allow its start.");
+ }
+};
+
# after a node was fenced this recovers the service to a new node
my $recover_fenced_service = sub {
my ($self, $sid, $cd) = @_;
$haenv->log('info', "recover service '$sid' from fenced node " .
"'$fenced_node' to node '$recovery_node'");
+ &$fence_recovery_cleanup($self, $sid, $fenced_node);
+
$haenv->steal_service($sid, $sd->{node}, $recovery_node);
# $sd *is normally read-only*, fencing is the exception
die "implement in subclass";
}
+# Remove one of the given locks from the service's config on $service_node.
+# Base-class stub: each resource plugin must override this; returns the name
+# of the removed lock, or undef/false when nothing was removed.
+sub remove_locks {
+ my ($self, $haenv, $id, $locks, $service_node) = @_;
+
+ die "implement in subclass";
+}
+
# package PVE::HA::Resources::IPAddr;
return PVE::LXC::check_running($vmid);
}
+# Remove the first lock from @$locks that is currently set in the container's
+# config; returns the removed lock name, or undef when none of them was set.
+sub remove_locks {
+ my ($self, $haenv, $id, $locks, $service_node) = @_;
+
+ # fall back to the local node when no explicit node was given
+ $service_node = $service_node || $haenv->nodename();
+
+ my $conf = PVE::LXC::Config->load_config($id, $service_node);
+
+ # nothing to do if the container holds no lock at all
+ return undef if !defined($conf->{lock});
+
+ foreach my $lock (@$locks) {
+ if ($conf->{lock} eq $lock) {
+ delete $conf->{lock};
+
+ # persist the updated config through the cluster filesystem
+ my $cfspath = PVE::LXC::Config->cfs_config_path($id, $service_node);
+ PVE::Cluster::cfs_write_file($cfspath, $conf);
+
+ return $lock;
+ }
+ }
+
+ return undef;
+}
+
1;
return PVE::QemuServer::check_running($vmid, 1, $nodename);
}
+# Remove the first lock from @$locks that is currently set in the VM's
+# config; returns the removed lock name, or undef when none of them was set.
+sub remove_locks {
+ my ($self, $haenv, $id, $locks, $service_node) = @_;
+
+ # fall back to the local node when no explicit node was given
+ $service_node = $service_node || $haenv->nodename();
+
+ my $conf = PVE::QemuConfig->load_config($id, $service_node);
+
+ # nothing to do if the VM holds no lock at all
+ return undef if !defined($conf->{lock});
+
+ foreach my $lock (@$locks) {
+ if ($conf->{lock} eq $lock) {
+ delete $conf->{lock};
+
+ # persist the updated config through the cluster filesystem
+ my $cfspath = PVE::QemuConfig->cfs_config_path($id, $service_node);
+ PVE::Cluster::cfs_write_file($cfspath, $conf);
+
+ return $lock;
+ }
+ }
+
+ return undef;
+}
+
1;
}
+# Simulator implementation: delegate lock removal to the test hardware,
+# trying each candidate lock in order; returns the first lock the hardware
+# actually removed, or undef when none matched.
+sub remove_locks {
+ my ($self, $haenv, $id, $locks, $service_node) = @_;
+
+ # rebuild the full "type:id" service ID expected by the test hardware
+ my $sid = $self->type() . ":$id";
+ my $hardware = $haenv->hardware();
+
+ foreach my $lock (@$locks) {
+ if (my $removed_lock = $hardware->unlock_service($sid, $lock)) {
+ return $removed_lock;
+ }
+ }
+
+ return undef;
+}
+
1;
--- /dev/null
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 120 cmdlist: execute service vm:103 lock
+info 220 cmdlist: execute network node3 off
+info 220 node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info 224 node3/crm: status change slave => wait_for_quorum
+info 225 node3/lrm: status change active => lost_agent_lock
+info 260 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info 260 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info 266 watchdog: execute power node3 off
+info 265 node3/crm: killed by poweroff
+info 266 node3/lrm: killed by poweroff
+info 266 hardware: server 'node3' stopped by poweroff (watchdog)
+info 340 node1/crm: got lock 'ha_agent_node3_lock'
+info 340 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info 340 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info 340 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
+warn 340 node1/crm: removed leftover lock 'backup' from recovered service 'vm:103' to allow its start.
+info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
+info 341 node1/lrm: got lock 'ha_agent_node1_lock'
+info 341 node1/lrm: status change wait_for_agent_lock => active
+info 341 node1/lrm: starting service vm:103
+info 341 node1/lrm: service status vm:103 started
+info 820 hardware: exit simulation - done