1 package PVE
::HA
::Sim
::Hardware
;
3 # Simulate Hardware resources
5 # power supply for nodes: on/off
6 # network connection to nodes: on/off
7 # watchdog devices for nodes
12 use Fcntl
qw(:DEFAULT :flock);
14 use File
::Path
qw(make_path remove_tree);
17 use POSIX
qw(strftime EINTR);
19 use PVE
::HA
::FenceConfig
;
22 my $watchdog_timeout = 60;
24 # Status directory layout
28 # $testdir/cmdlist Command list for simulation
29 # $testdir/hardware_status Hardware description (number of nodes, ...)
30 # $testdir/manager_status CRM status (start with {})
31 # $testdir/service_config Service configuration
32 # $testdir/groups HA groups configuration
33 # $testdir/service_status_<node> Service status
34 # $testdir/datacenter.cfg Datacenter wide HA configuration
37 # runtime status for simulation system
39 # $testdir/status/cluster_locks Cluster locks
40 # $testdir/status/hardware_status Hardware status (power/network on/off)
41 # $testdir/status/watchdog_status Watchdog status
45 # $testdir/status/lrm_status_<node> LRM status
46 # $testdir/status/manager_status CRM status
47 # $testdir/status/crm_commands CRM command queue
48 # $testdir/status/service_config Service configuration
49 # $testdir/status/service_status_<node> Service status
50 # $testdir/status/groups HA groups configuration
53 my ($self, $node) = @_;
55 my $filename = "$self->{statusdir}/lrm_status_$node";
57 return PVE
::HA
::Tools
::read_json_from_file
($filename, {});
# Store the LRM status of one node as JSON.
#
# Parameters:
#   $node       - node name; selects the file "<statusdir>/lrm_status_<node>"
#   $status_obj - data structure handed unchanged to write_json_to_file()
#
# Returns whatever PVE::HA::Tools::write_json_to_file() returns, as that call
# is the last statement of the sub.
sub write_lrm_status {
    my ($self, $node, $status_obj) = @_;

    my $status_path = $self->{statusdir} . "/lrm_status_$node";

    PVE::HA::Tools::write_json_to_file($status_path, $status_obj);
}
68 sub read_hardware_status_nolock
{
71 my $filename = "$self->{statusdir}/hardware_status";
73 my $raw = PVE
::Tools
::file_get_contents
($filename);
74 my $cstatus = decode_json
($raw);
# Serialize the hardware status to JSON and write it to
# "<statusdir>/hardware_status".
#
# As the "_nolock" suffix indicates, this sub takes no lock itself.
#
# Parameters:
#   $cstatus - data structure passed to encode_json()
#
# Returns whatever PVE::Tools::file_set_contents() returns.
sub write_hardware_status_nolock {
    my ($self, $cstatus) = @_;

    my $status_path = $self->{statusdir} . '/hardware_status';
    my $json = encode_json($cstatus);

    PVE::Tools::file_set_contents($status_path, $json);
}
87 sub read_service_config
{
90 my $filename = "$self->{statusdir}/service_config";
91 my $conf = PVE
::HA
::Tools
::read_json_from_file
($filename);
93 foreach my $sid (keys %$conf) {
94 my $d = $conf->{$sid};
96 die "service '$sid' without assigned node!" if !$d->{node
};
98 if ($sid =~ m/^(vm|ct|fa):(\d+)$/) {
104 $d->{state} = 'disabled' if !$d->{state};
105 $d->{state} = 'started' if $d->{state} eq 'enabled'; # backward compatibility
106 $d->{max_restart
} = 1 if !defined($d->{max_restart
});
107 $d->{max_relocate
} = 1 if !defined($d->{max_relocate
});
# Merge a set of options into the configuration of an existing service and
# write the complete service configuration back.
#
# Parameters:
#   $sid   - service ID; must already exist in the service configuration
#   $param - hash reference of option => value pairs to set on the service
#
# Dies with "no such resource '<sid>'" if the service is not configured.
# Returns whatever write_service_config() returns.
sub update_service_config {
    my ($self, $sid, $param) = @_;

    my $conf = $self->read_service_config();

    my $sconf = $conf->{$sid} || die "no such resource '$sid'\n";

    # Iterate over the keys only. Flattening the hash (%$param) would also
    # visit every value and store a bogus '<value>' => undef entry.
    foreach my $k (keys %$param) {
        $sconf->{$k} = $param->{$k};
    }

    $self->write_service_config($conf);
}
# Remember the given service configuration on the object and write it to
# "<statusdir>/service_config" as JSON.
#
# Parameters:
#   $conf - service configuration; stored in $self->{service_config} and
#           passed unchanged to write_json_to_file()
#
# Returns the result of PVE::HA::Tools::write_json_to_file().
sub write_service_config {
    my ($self, $conf) = @_;

    my $config_path = $self->{statusdir} . '/service_config';

    # keep the in-memory copy in sync with what is written to disk
    $self->{service_config} = $conf;

    return PVE::HA::Tools::write_json_to_file($config_path, $conf);
}
136 sub read_fence_config
{
141 my $filename = "$self->{statusdir}/fence.cfg";
143 $raw = PVE
::Tools
::file_get_contents
($filename);
146 return PVE
::HA
::FenceConfig
::parse_config
($filename, $raw);
# Simulated execution of a fence agent.
#
# Every agent behaves the same for now: the node is powered off through
# sim_hardware_cmd(), with the agent name passed as its second argument
# (the log id), and the call always reports success.
#
# Parameters:
#   $agent - agent name, passed to sim_hardware_cmd() as log id
#   $node  - node to power off
#   @param - additional agent parameters; currently ignored
#
# Returns 0 (EXIT_SUCCESS).
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    my $poweroff_cmd = "power $node off";
    $self->sim_hardware_cmd($poweroff_cmd, $agent);

    # EXIT_SUCCESS
    return 0;
}
158 sub set_service_state
{
159 my ($self, $sid, $state) = @_;
161 my $conf = $self->read_service_config();
162 die "no such service '$sid'" if !$conf->{$sid};
164 $conf->{$sid}->{state} = $state;
166 $self->write_service_config($conf);
172 my ($self, $sid, $opts) = @_;
174 my $conf = $self->read_service_config();
175 die "resource ID '$sid' already defined\n" if $conf->{$sid};
177 $conf->{$sid} = $opts;
179 $self->write_service_config($conf);
185 my ($self, $sid) = @_;
187 my $conf = $self->read_service_config();
189 die "no such service '$sid'" if !$conf->{$sid};
191 delete $conf->{$sid};
193 $self->write_service_config($conf);
# Move a service from one node to another in the service configuration.
#
# Parameters:
#   $sid          - service ID; must exist in the service configuration
#   $current_node - node the caller believes the service is on; must equal
#                   the configured node
#   $new_node     - node to record as the new location
#
# Dies if the service is unknown or if $current_node does not match the
# configured node. Returns whatever write_service_config() returns.
sub change_service_location {
    my ($self, $sid, $current_node, $new_node) = @_;

    my $conf = $self->read_service_config();

    my $entry = $conf->{$sid};
    die "no such service '$sid'\n" unless $entry;

    if ($current_node ne $entry->{node}) {
        die "current_node for '$sid' does not match ($current_node != $entry->{node})\n";
    }

    $entry->{node} = $new_node;

    $self->write_service_config($conf);
}
# Look up the lock currently recorded for a service.
#
# Parameters:
#   $sid - service ID; must exist in the service configuration
#
# Dies with "no such service '<sid>'" if the service is not configured.
# Returns the value of the service's 'lock' entry (undef if none is set).
sub service_has_lock {
    my ($self, $sid) = @_;

    my $conf = $self->read_service_config();

    my $entry = $conf->{$sid};
    die "no such service '$sid'\n" unless $entry;

    return $entry->{lock};
}
224 my ($self, $sid, $lock) = @_;
226 my $conf = $self->read_service_config();
228 die "no such service '$sid'\n" if !$conf->{$sid};
230 $conf->{$sid}->{lock} = $lock || 'backup';
232 $self->write_service_config($conf);
238 my ($self, $sid, $lock) = @_;
240 my $conf = $self->read_service_config();
242 die "no such service '$sid'\n" if !$conf->{$sid};
244 if (!defined($conf->{$sid}->{lock})) {
248 if (defined($lock) && $conf->{$sid}->{lock} ne $lock) {
249 warn "found lock '$conf->{$sid}->{lock}' trying to remove '$lock' lock\n";
253 my $removed_lock = delete $conf->{$sid}->{lock};
255 $self->write_service_config($conf);
257 return $removed_lock;
260 sub queue_crm_commands_nolock
{
261 my ($self, $cmd) = @_;
266 my $filename = "$self->{statusdir}/crm_commands";
268 $data = PVE
::Tools
::file_get_contents
($filename);
271 PVE
::Tools
::file_set_contents
($filename, $data);
276 sub queue_crm_commands
{
277 my ($self, $cmd) = @_;
279 my $code = sub { $self->queue_crm_commands_nolock($cmd); };
281 $self->global_lock($code);
286 sub read_crm_commands
{
292 my $filename = "$self->{statusdir}/crm_commands";
294 $data = PVE
::Tools
::file_get_contents
($filename);
296 PVE
::Tools
::file_set_contents
($filename, '');
301 return $self->global_lock($code);
304 sub read_group_config
{
307 my $filename = "$self->{statusdir}/groups";
309 $raw = PVE
::Tools
::file_get_contents
($filename) if -f
$filename;
311 return PVE
::HA
::Groups-
>parse_config($filename, $raw);
# Read the service status of one node from
# "<statusdir>/service_status_<node>".
#
# Parameters:
#   $node - node name used to build the file name
#
# Returns the result of PVE::HA::Tools::read_json_from_file(), which is
# called with the file name as its only argument.
sub read_service_status {
    my ($self, $node) = @_;

    my $status_path = $self->{statusdir} . "/service_status_$node";

    return PVE::HA::Tools::read_json_from_file($status_path);
}
321 sub write_service_status
{
322 my ($self, $node, $data) = @_;
324 my $filename = "$self->{statusdir}/service_status_$node";
325 my $res = PVE
::HA
::Tools
::write_json_to_file
($filename, $data);
327 # fixme: add test if a service runs on two nodes!!!
332 my $default_group_config = <<__EOD;
347 my ($this, $testdir) = @_;
349 die "missing testdir" if !$testdir;
351 die "testdir '$testdir' does not exist or is not a directory!\n"
354 my $class = ref($this) || $this;
356 my $self = bless {}, $class;
358 my $statusdir = $self->{statusdir
} = "$testdir/status";
360 remove_tree
($statusdir);
363 # copy initial configuration
364 copy
("$testdir/manager_status", "$statusdir/manager_status"); # optional
366 if (-f
"$testdir/groups") {
367 copy
("$testdir/groups", "$statusdir/groups");
369 PVE
::Tools
::file_set_contents
("$statusdir/groups", $default_group_config);
372 if (-f
"$testdir/service_config") {
373 copy
("$testdir/service_config", "$statusdir/service_config");
376 'vm:101' => { node
=> 'node1', group
=> 'prefer_node1' },
377 'vm:102' => { node
=> 'node2', group
=> 'prefer_node2' },
378 'vm:103' => { node
=> 'node3', group
=> 'prefer_node3' },
379 'vm:104' => { node
=> 'node1', group
=> 'prefer_node1' },
380 'vm:105' => { node
=> 'node2', group
=> 'prefer_node2' },
381 'vm:106' => { node
=> 'node3', group
=> 'prefer_node3' },
383 $self->write_service_config($conf);
386 if (-f
"$testdir/hardware_status") {
387 copy
("$testdir/hardware_status", "$statusdir/hardware_status") ||
388 die "Copy failed: $!\n";
391 node1
=> { power
=> 'off', network
=> 'off' },
392 node2
=> { power
=> 'off', network
=> 'off' },
393 node3
=> { power
=> 'off', network
=> 'off' },
395 $self->write_hardware_status_nolock($cstatus);
398 if (-f
"$testdir/fence.cfg") {
399 copy
("$testdir/fence.cfg", "$statusdir/fence.cfg");
402 if (-f
"$testdir/datacenter.cfg") {
403 copy
("$testdir/datacenter.cfg", "$statusdir/datacenter.cfg");
406 my $cstatus = $self->read_hardware_status_nolock();
408 foreach my $node (sort keys %$cstatus) {
409 $self->{nodes
}->{$node} = {};
411 if (-f
"$testdir/service_status_$node") {
412 copy
("$testdir/service_status_$node", "$statusdir/service_status_$node");
414 $self->write_service_status($node, {});
418 $self->{service_config
} = $self->read_service_config();
426 die "implement in subclass";
430 my ($self, $level, $msg, $id) = @_;
434 my $time = $self->get_time();
436 $id = 'hardware' if !$id;
438 printf("%-5s %5d %12s: $msg\n", $level, $time, $id);
442 my ($self, $node) = @_;
444 return $self->{statusdir
};
# Read the datacenter configuration from "<statusdir>/datacenter.cfg".
#
# Parameters:
#   $node - accepted but not used by this implementation
#
# Returns the result of PVE::HA::Tools::read_json_from_file(), which is
# called with the file name and an empty hash reference as second argument
# (presumably the default for a missing file - confirm in PVE::HA::Tools).
sub read_datacenter_conf {
    my ($self, $node) = @_;

    my $config_path = $self->{statusdir} . '/datacenter.cfg';

    return PVE::HA::Tools::read_json_from_file($config_path, {});
}
455 my ($self, $code, @param) = @_;
457 my $lockfile = "$self->{statusdir}/hardware.lck";
458 my $fh = IO
::File-
>new(">>$lockfile") ||
459 die "unable to open '$lockfile'\n";
463 $success = flock($fh, LOCK_EX
);
464 if ($success || ($! != EINTR
)) {
469 die "can't acquire lock '$lockfile' - $!\n";
475 eval { $res = &$code($fh, @param) };
485 my $compute_node_info = sub {
486 my ($self, $cstatus) = @_;
491 my $online_count = 0;
493 foreach my $node (keys %$cstatus) {
494 my $d = $cstatus->{$node};
496 my $online = ($d->{power
} eq 'on' && $d->{network
} eq 'on') ?
1 : 0;
497 $node_info->{$node}->{online
} = $online;
500 $online_count++ if $online;
503 my $quorate = ($online_count > int($node_count/2)) ?
1 : 0;
506 foreach my $node (keys %$cstatus) {
507 my $d = $cstatus->{$node};
508 $node_info->{$node}->{online
} = 0;
512 return ($node_info, $quorate);
518 my $cstatus = $self->read_hardware_status_nolock();
519 my ($node_info, $quorate) = &$compute_node_info($self, $cstatus);
521 return ($node_info, $quorate);
524 # helper for Sim/ only
526 my ($self, $node, $state) = @_;
528 # TODO: ensure nolock is OK when adding this to RTSim
529 my $cstatus = $self->read_hardware_status_nolock();
530 my $res = $cstatus->{$node}->{cfs
}->{$state};
532 # we assume default true if not defined
533 return !defined($res) || $res;
536 # simulate hardware commands, the following commands are available:
537 # power <node> <on|off>
538 # network <node> <on|off>
540 # cfs <node> <rw|update> <work|fail>
544 # service <sid> <started|disabled|stopped|ignored>
545 # service <sid> <migrate|relocate> <target>
546 # service <sid> stop <timeout>
547 # service <sid> lock/unlock [lockname]
548 # service <sid> <add|delete>
549 sub sim_hardware_cmd
{
550 my ($self, $cmdstr, $logid) = @_;
555 my $cstatus = $self->read_hardware_status_nolock();
557 my ($cmd, $objid, $action, $param) = split(/\s+/, $cmdstr);
559 die "sim_hardware_cmd: no node or service for command specified"
562 my ($node, $sid, $d);
564 if ($cmd eq 'service') {
565 $sid = PVE
::HA
::Tools
::pve_verify_ha_resource_id
($objid);
568 $d = $self->{nodes
}->{$node} ||
569 die "sim_hardware_cmd: no such node '$node'\n";
572 $self->log('info', "execute $cmdstr", $logid);
574 if ($cmd eq 'power') {
575 die "sim_hardware_cmd: unknown action '$action'\n"
576 if $action !~ m/^(on|off)$/;
578 if ($cstatus->{$node}->{power
} ne $action) {
579 if ($action eq 'on') {
581 $d->{crm
} = $self->crm_control('start', $d, $lock_fh) if !defined($d->{crm
});
582 $d->{lrm
} = $self->lrm_control('start', $d, $lock_fh) if !defined($d->{lrm
});
583 $d->{lrm_restart
} = undef;
584 $cstatus->{$node}->{cfs
} = {};
589 $d->{crm_env
}->log('info', "killed by poweroff");
590 $self->crm_control('stop', $d, $lock_fh);
594 $d->{lrm_env
}->log('info', "killed by poweroff");
595 $self->lrm_control('stop', $d, $lock_fh);
597 $d->{lrm_restart
} = undef;
600 $self->watchdog_reset_nolock($node);
601 $self->write_service_status($node, {});
605 $cstatus->{$node}->{power
} = $action;
606 $cstatus->{$node}->{network
} = $action;
607 $cstatus->{$node}->{shutdown} = undef;
609 $self->write_hardware_status_nolock($cstatus);
611 } elsif ($cmd eq 'network') {
612 die "sim_hardware_cmd: unknown network action '$action'"
613 if $action !~ m/^(on|off)$/;
614 $cstatus->{$node}->{network
} = $action;
616 $self->write_hardware_status_nolock($cstatus);
618 } elsif ($cmd eq 'cfs') {
619 die "sim_hardware_cmd: unknown cfs action '$action' for node '$node'"
620 if $action !~ m/^(rw|update)$/;
621 die "sim_hardware_cmd: unknown cfs command '$param' for '$action' on node '$node'"
622 if $param !~ m/^(work|fail)$/;
624 $cstatus->{$node}->{cfs
}->{$action} = $param eq 'work';
625 $self->write_hardware_status_nolock($cstatus);
627 } elsif ($cmd eq 'reboot' || $cmd eq 'shutdown') {
628 $cstatus->{$node}->{shutdown} = $cmd;
630 $self->write_hardware_status_nolock($cstatus);
632 $self->lrm_control('shutdown', $d, $lock_fh) if defined($d->{lrm
});
633 } elsif ($cmd eq 'restart-lrm') {
635 $d->{lrm_restart
} = 1;
636 $self->lrm_control('shutdown', $d, $lock_fh);
638 } elsif ($cmd eq 'crm') {
640 if ($action eq 'stop') {
643 $self->crm_control('shutdown', $d, $lock_fh);
645 } elsif ($action eq 'start') {
646 $d->{crm
} = $self->crm_control('start', $d, $lock_fh) if !defined($d->{crm
});
648 die "sim_hardware_cmd: unknown action '$action'";
651 } elsif ($cmd eq 'service') {
652 if ($action eq 'started' || $action eq 'disabled' ||
653 $action eq 'stopped' || $action eq 'ignored') {
655 $self->set_service_state($sid, $action);
657 } elsif ($action eq 'migrate' || $action eq 'relocate') {
659 die "sim_hardware_cmd: missing target node for '$action' command"
662 $self->queue_crm_commands_nolock("$action $sid $param");
664 } elsif ($action eq 'stop') {
666 die "sim_hardware_cmd: missing timeout for '$action' command"
669 $self->queue_crm_commands_nolock("$action $sid $param");
671 } elsif ($action eq 'add') {
673 $self->add_service($sid, {state => 'started', node
=> $param});
675 } elsif ($action eq 'delete') {
677 $self->delete_service($sid);
679 } elsif ($action eq 'lock') {
681 $self->lock_service($sid, $param);
683 } elsif ($action eq 'unlock') {
685 $self->unlock_service($sid, $param);
688 die "sim_hardware_cmd: unknown service action '$action' " .
689 "- not implemented\n"
692 die "sim_hardware_cmd: unknown command '$cmdstr'\n";
698 return $self->global_lock($code);
701 # for controlling the resource manager services
703 my ($self, $action, $data, $lock_fh) = @_;
705 die "implement in subclass";
709 my ($self, $action, $data, $lock_fh) = @_;
711 die "implement in subclass";
717 die "implement in subclass";
720 my $modify_watchog = sub {
721 my ($self, $code) = @_;
723 my $update_cmd = sub {
725 my $filename = "$self->{statusdir}/watchdog_status";
727 my ($res, $wdstatus);
730 my $raw = PVE
::Tools
::file_get_contents
($filename);
731 $wdstatus = decode_json
($raw);
736 ($wdstatus, $res) = &$code($wdstatus);
738 PVE
::Tools
::file_set_contents
($filename, encode_json
($wdstatus));
743 return $self->global_lock($update_cmd);
746 sub watchdog_reset_nolock
{
747 my ($self, $node) = @_;
749 my $filename = "$self->{statusdir}/watchdog_status";
752 my $raw = PVE
::Tools
::file_get_contents
($filename);
753 my $wdstatus = decode_json
($raw);
755 foreach my $id (keys %$wdstatus) {
756 delete $wdstatus->{$id} if $wdstatus->{$id}->{node
} eq $node;
759 PVE
::Tools
::file_set_contents
($filename, encode_json
($wdstatus));
764 my ($self, $node) = @_;
771 foreach my $wfh (keys %$wdstatus) {
772 my $wd = $wdstatus->{$wfh};
773 next if $wd->{node
} ne $node;
775 my $ctime = $self->get_time();
776 my $tdiff = $ctime - $wd->{update_time
};
778 if ($tdiff > $watchdog_timeout) { # expired
780 delete $wdstatus->{$wfh};
784 return ($wdstatus, $res);
787 return &$modify_watchog($self, $code);
793 my ($self, $node) = @_;
800 my $id = "WD:$node:$$:$wdcounter";
802 die "internal error" if defined($wdstatus->{$id});
806 update_time
=> $self->get_time(),
809 return ($wdstatus, $id);
812 return &$modify_watchog($self, $code);
816 my ($self, $wfh) = @_;
821 my $wd = $wdstatus->{$wfh};
822 die "no such watchdog handle '$wfh'\n" if !defined($wd);
824 my $tdiff = $self->get_time() - $wd->{update_time
};
825 die "watchdog expired" if $tdiff > $watchdog_timeout;
827 delete $wdstatus->{$wfh};
832 return &$modify_watchog($self, $code);
835 sub watchdog_update
{
836 my ($self, $wfh) = @_;
841 my $wd = $wdstatus->{$wfh};
843 die "no such watchdog handle '$wfh'\n" if !defined($wd);
845 my $ctime = $self->get_time();
846 my $tdiff = $ctime - $wd->{update_time
};
848 die "watchdog expired" if $tdiff > $watchdog_timeout;
850 $wd->{update_time
} = $ctime;
855 return &$modify_watchog($self, $code);