1 package PVE
::HA
::Sim
::Hardware
;
3 # Simulate Hardware resources
5 # power supply for nodes: on/off
6 # network connection to nodes: on/off
7 # watchdog devices for nodes
12 use Fcntl
qw(:DEFAULT :flock);
14 use File
::Path
qw(make_path remove_tree);
17 use POSIX
qw(strftime EINTR);
19 use PVE
::HA
::FenceConfig
;
22 my $watchdog_timeout = 60;
24 # Status directory layout
28 # $testdir/cmdlist Command list for simulation
29 # $testdir/hardware_status Hardware description (number of nodes, ...)
30 # $testdir/manager_status CRM status (start with {})
31 # $testdir/service_config Service configuration
32 # $testdir/static_service_stats Static service usage information (cpu, memory)
33 # $testdir/groups HA groups configuration
34 # $testdir/service_status_<node> Service status
35 # $testdir/datacenter.cfg Datacenter wide HA configuration
38 # runtime status for simulation system
40 # $testdir/status/cluster_locks Cluster locks
41 # $testdir/status/hardware_status Hardware status (power/network on/off)
42 # $testdir/status/static_service_stats Static service usage information (cpu, memory)
43 # $testdir/status/watchdog_status Watchdog status
47 # $testdir/status/lrm_status_<node> LRM status
48 # $testdir/status/manager_status CRM status
49 # $testdir/status/crm_commands CRM command queue
50 # $testdir/status/service_config Service configuration
51 # $testdir/status/service_status_<node> Service status
52 # $testdir/status/groups HA groups configuration
55 my ($self, $node) = @_;
57 my $filename = "$self->{statusdir}/lrm_status_$node";
59 return PVE
::HA
::Tools
::read_json_from_file
($filename, {});
# Persist the LRM status of a single node.
#
# $node       - node name, used as suffix of the status file name
# $status_obj - data structure to store
#
# The data is handed to PVE::HA::Tools::write_json_to_file() together with
# the path "<statusdir>/lrm_status_<node>"; whatever that call returns is
# passed on to the caller.
sub write_lrm_status {
    my ($self, $node, $status_obj) = @_;

    my $path = join('/', $self->{statusdir}, "lrm_status_$node");

    return PVE::HA::Tools::write_json_to_file($path, $status_obj);
}
# Load the simulated hardware status from disk.
#
# Reads "<statusdir>/hardware_status", decodes its JSON content and returns
# the resulting data structure. This sub itself does not take any lock.
sub read_hardware_status_nolock {
    my ($self) = @_;

    my $path = "$self->{statusdir}/hardware_status";
    my $content = PVE::Tools::file_get_contents($path);

    my $hw_status = decode_json($content);
    return $hw_status;
}
# Store the simulated hardware status on disk.
#
# Encodes $cstatus as JSON and writes the result to
# "<statusdir>/hardware_status" via PVE::Tools::file_set_contents().
# This sub itself does not take any lock.
sub write_hardware_status_nolock {
    my ($self, $cstatus) = @_;

    my $path = "$self->{statusdir}/hardware_status";
    my $json = encode_json($cstatus);

    return PVE::Tools::file_set_contents($path, $json);
}
89 sub read_service_config
{
92 my $filename = "$self->{statusdir}/service_config";
93 my $conf = PVE
::HA
::Tools
::read_json_from_file
($filename);
95 foreach my $sid (keys %$conf) {
96 my $d = $conf->{$sid};
98 die "service '$sid' without assigned node!" if !$d->{node
};
100 if ($sid =~ m/^(vm|ct|fa):(\d+)$/) {
106 $d->{state} = 'disabled' if !$d->{state};
107 $d->{state} = 'started' if $d->{state} eq 'enabled'; # backward compatibility
108 $d->{max_restart
} = 1 if !defined($d->{max_restart
});
109 $d->{max_relocate
} = 1 if !defined($d->{max_relocate
});
# Merge the given parameters into the configuration of an existing service.
#
# $sid   - ID of the service whose configuration entry gets updated
# $param - hash reference; every key/value pair overwrites the entry of the
#          same name in the service's configuration
#
# Dies with "no such resource '<sid>'" if the service is not configured.
# The modified configuration is stored through write_service_config().
sub update_service_config {
    my ($self, $sid, $param) = @_;

    my $conf = $self->read_service_config();

    my $sconf = $conf->{$sid} || die "no such resource '$sid'\n";

    # Iterate over the keys only: a hash flattened in list context also
    # yields its values, which would create bogus (undef) entries named
    # after every parameter value.
    foreach my $k (keys %$param) {
        $sconf->{$k} = $param->{$k};
    }

    return $self->write_service_config($conf);
}
# Store the service configuration.
#
# Keeps a reference to $conf in $self->{service_config} and writes it to
# "<statusdir>/service_config" via PVE::HA::Tools::write_json_to_file(),
# returning the result of that call.
sub write_service_config {
    my ($self, $conf) = @_;

    my $target = "$self->{statusdir}/service_config";

    # remember the configuration on the object as well
    $self->{service_config} = $conf;

    return PVE::HA::Tools::write_json_to_file($target, $conf);
}
138 sub read_fence_config
{
143 my $filename = "$self->{statusdir}/fence.cfg";
145 $raw = PVE
::Tools
::file_get_contents
($filename);
148 return PVE
::HA
::FenceConfig
::parse_config
($filename, $raw);
# Simulate the execution of a fence agent against a node.
#
# All agents succeed and behave the same for now: the node gets powered off
# through sim_hardware_cmd(), with the agent name passed along as second
# argument. Additional agent parameters (@param) are not used.
#
# Always returns 0 (EXIT_SUCCESS).
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    my $poweroff_cmd = "power $node off";
    $self->sim_hardware_cmd($poweroff_cmd, $agent);

    my $exit_success = 0;
    return $exit_success;
}
160 sub set_service_state
{
161 my ($self, $sid, $state) = @_;
163 my $conf = $self->read_service_config();
164 die "no such service '$sid'" if !$conf->{$sid};
166 $conf->{$sid}->{state} = $state;
168 $self->write_service_config($conf);
174 my ($self, $sid, $opts, $running) = @_;
176 my $conf = $self->read_service_config();
177 die "resource ID '$sid' already defined\n" if $conf->{$sid};
179 $conf->{$sid} = $opts;
180 $conf->{$sid}->@{qw(type name)} = split(/:/, $sid);
182 $self->write_service_config($conf);
184 my $ss = $self->read_service_status($opts->{node
});
185 $ss->{$sid} = $running;
186 $self->write_service_status($opts->{node
}, $ss);
192 my ($self, $sid) = @_;
194 my $conf = $self->read_service_config();
196 die "no such service '$sid'" if !$conf->{$sid};
198 delete $conf->{$sid};
200 $self->write_service_config($conf);
# Change the node a service is configured on.
#
# $sid          - service ID
# $current_node - node the caller believes the service is currently on
# $new_node     - node to record for the service
#
# Dies if the service is unknown, or if $current_node differs from the node
# recorded in the service configuration. Otherwise the node is replaced and
# the configuration is stored through write_service_config().
sub change_service_location {
    my ($self, $sid, $current_node, $new_node) = @_;

    my $conf = $self->read_service_config();

    my $service = $conf->{$sid};
    die "no such service '$sid'\n" if !$service;

    if ($current_node ne $service->{node}) {
        die "current_node for '$sid' does not match ($current_node != $service->{node})\n";
    }

    $service->{node} = $new_node;

    return $self->write_service_config($conf);
}
# Return the lock currently recorded for a service.
#
# Looks up $sid in the service configuration and returns the value of its
# 'lock' entry, which is undef when no such entry exists.
# Dies with "no such service '<sid>'" if the service is not configured.
sub service_has_lock {
    my ($self, $sid) = @_;

    my $conf = $self->read_service_config();

    my $service = $conf->{$sid};
    die "no such service '$sid'\n" if !$service;

    return $service->{lock};
}
231 my ($self, $sid, $lock) = @_;
233 my $conf = $self->read_service_config();
235 die "no such service '$sid'\n" if !$conf->{$sid};
237 $conf->{$sid}->{lock} = $lock || 'backup';
239 $self->write_service_config($conf);
245 my ($self, $sid, $lock) = @_;
247 my $conf = $self->read_service_config();
249 die "no such service '$sid'\n" if !$conf->{$sid};
251 if (!defined($conf->{$sid}->{lock})) {
255 if (defined($lock) && $conf->{$sid}->{lock} ne $lock) {
256 warn "found lock '$conf->{$sid}->{lock}' trying to remove '$lock' lock\n";
260 my $removed_lock = delete $conf->{$sid}->{lock};
262 $self->write_service_config($conf);
264 return $removed_lock;
267 sub queue_crm_commands_nolock
{
268 my ($self, $cmd) = @_;
273 my $filename = "$self->{statusdir}/crm_commands";
275 $data = PVE
::Tools
::file_get_contents
($filename);
278 PVE
::Tools
::file_set_contents
($filename, $data);
283 sub queue_crm_commands
{
284 my ($self, $cmd) = @_;
286 my $code = sub { $self->queue_crm_commands_nolock($cmd); };
288 $self->global_lock($code);
293 sub read_crm_commands
{
299 my $filename = "$self->{statusdir}/crm_commands";
301 $data = PVE
::Tools
::file_get_contents
($filename);
303 PVE
::Tools
::file_set_contents
($filename, '');
308 return $self->global_lock($code);
311 sub read_group_config
{
314 my $filename = "$self->{statusdir}/groups";
316 $raw = PVE
::Tools
::file_get_contents
($filename) if -f
$filename;
318 return PVE
::HA
::Groups-
>parse_config($filename, $raw);
# Load the service status of a node.
#
# Returns whatever PVE::HA::Tools::read_json_from_file() yields for the file
# "<statusdir>/service_status_<node>".
sub read_service_status {
    my ($self, $node) = @_;

    my $path = join('/', $self->{statusdir}, "service_status_$node");

    return PVE::HA::Tools::read_json_from_file($path);
}
# Store the service status of a node.
#
# Writes $data to "<statusdir>/service_status_<node>" through
# PVE::HA::Tools::write_json_to_file() and returns the result of that call.
sub write_service_status {
    my ($self, $node, $data) = @_;

    my $path = join('/', $self->{statusdir}, "service_status_$node");
    my $result = PVE::HA::Tools::write_json_to_file($path, $data);

    # fixme: add test if a service runs on two nodes!!!

    return $result;
}
339 sub read_static_service_stats
{
342 my $filename = "$self->{statusdir}/static_service_stats";
343 my $stats = eval { PVE
::HA
::Tools
::read_json_from_file
($filename) };
344 $self->log('error', "loading static service stats failed - $@") if $@;
349 my $default_group_config = <<__EOD;
364 my ($this, $testdir) = @_;
366 die "missing testdir" if !$testdir;
368 die "testdir '$testdir' does not exist or is not a directory!\n"
371 my $class = ref($this) || $this;
373 my $self = bless {}, $class;
375 my $statusdir = $self->{statusdir
} = "$testdir/status";
377 remove_tree
($statusdir);
    # copy initial configuration
381 copy
("$testdir/manager_status", "$statusdir/manager_status"); # optional
383 if (-f
"$testdir/groups") {
384 copy
("$testdir/groups", "$statusdir/groups");
386 PVE
::Tools
::file_set_contents
("$statusdir/groups", $default_group_config);
389 if (-f
"$testdir/service_config") {
390 copy
("$testdir/service_config", "$statusdir/service_config");
393 'vm:101' => { node
=> 'node1', group
=> 'prefer_node1' },
394 'vm:102' => { node
=> 'node2', group
=> 'prefer_node2' },
395 'vm:103' => { node
=> 'node3', group
=> 'prefer_node3' },
396 'vm:104' => { node
=> 'node1', group
=> 'prefer_node1' },
397 'vm:105' => { node
=> 'node2', group
=> 'prefer_node2' },
398 'vm:106' => { node
=> 'node3', group
=> 'prefer_node3' },
400 $self->write_service_config($conf);
403 if (-f
"$testdir/hardware_status") {
404 copy
("$testdir/hardware_status", "$statusdir/hardware_status") ||
405 die "Copy failed: $!\n";
408 node1
=> { power
=> 'off', network
=> 'off' },
409 node2
=> { power
=> 'off', network
=> 'off' },
410 node3
=> { power
=> 'off', network
=> 'off' },
412 $self->write_hardware_status_nolock($cstatus);
415 if (-f
"$testdir/fence.cfg") {
416 copy
("$testdir/fence.cfg", "$statusdir/fence.cfg");
419 if (-f
"$testdir/datacenter.cfg") {
420 copy
("$testdir/datacenter.cfg", "$statusdir/datacenter.cfg");
423 if (-f
"$testdir/static_service_stats") {
424 copy
("$testdir/static_service_stats", "$statusdir/static_service_stats");
427 my $cstatus = $self->read_hardware_status_nolock();
429 foreach my $node (sort keys %$cstatus) {
430 $self->{nodes
}->{$node} = {};
432 if (-f
"$testdir/service_status_$node") {
433 copy
("$testdir/service_status_$node", "$statusdir/service_status_$node");
435 $self->write_service_status($node, {});
439 $self->{service_config
} = $self->read_service_config();
447 die "implement in subclass";
451 my ($self, $level, $msg, $id) = @_;
455 my $time = $self->get_time();
457 $id = 'hardware' if !$id;
459 printf("%-5s %5d %12s: $msg\n", $level, $time, $id);
463 my ($self, $node) = @_;
465 return $self->{statusdir
};
# Load the datacenter-wide configuration of the simulation.
#
# Reads "<statusdir>/datacenter.cfg" through
# PVE::HA::Tools::read_json_from_file(), passing an empty hash as second
# argument (presumably the fallback for a missing file - confirm against
# PVE::HA::Tools). The $node parameter is accepted but not used.
sub read_datacenter_conf {
    my ($self, $node) = @_;

    my $path = join('/', $self->{statusdir}, 'datacenter.cfg');
    my $fallback = {};

    return PVE::HA::Tools::read_json_from_file($path, $fallback);
}
476 my ($self, $code, @param) = @_;
478 my $lockfile = "$self->{statusdir}/hardware.lck";
479 my $fh = IO
::File-
>new(">>$lockfile") ||
480 die "unable to open '$lockfile'\n";
484 $success = flock($fh, LOCK_EX
);
485 if ($success || ($! != EINTR
)) {
490 die "can't acquire lock '$lockfile' - $!\n";
496 eval { $res = &$code($fh, @param) };
506 my $compute_node_info = sub {
507 my ($self, $cstatus) = @_;
512 my $online_count = 0;
514 foreach my $node (keys %$cstatus) {
515 my $d = $cstatus->{$node};
517 my $online = ($d->{power
} eq 'on' && $d->{network
} eq 'on') ?
1 : 0;
518 $node_info->{$node}->{online
} = $online;
521 $online_count++ if $online;
524 my $quorate = ($online_count > int($node_count/2)) ?
1 : 0;
527 foreach my $node (keys %$cstatus) {
528 my $d = $cstatus->{$node};
529 $node_info->{$node}->{online
} = 0;
533 return ($node_info, $quorate);
539 my $cstatus = $self->read_hardware_status_nolock();
540 my ($node_info, $quorate) = &$compute_node_info($self, $cstatus);
542 return ($node_info, $quorate);
545 # helper for Sim/ only
547 my ($self, $node, $state) = @_;
549 # TODO: ensure nolock is OK when adding this to RTSim
550 my $cstatus = $self->read_hardware_status_nolock();
551 my $res = $cstatus->{$node}->{cfs
}->{$state};
553 # we assume default true if not defined
554 return !defined($res) || $res;
557 # simulate hardware commands, the following commands are available:
558 # power <node> <on|off>
559 # network <node> <on|off>
561 # skip-round <crm|lrm> [<rounds=1>]
562 # cfs <node> <rw|update> <work|fail>
566 # service <sid> <started|disabled|stopped|ignored>
567 # service <sid> <migrate|relocate> <target>
568 # service <sid> stop <timeout>
569 # service <sid> lock/unlock [lockname]
570 # service <sid> add <node> [<request-state=started>] [<running=0>]
571 # service <sid> delete
572 sub sim_hardware_cmd
{
573 my ($self, $cmdstr, $logid) = @_;
578 my $cstatus = $self->read_hardware_status_nolock();
580 my ($cmd, $objid, $action, @params) = split(/\s+/, $cmdstr);
581 my $param = $params[0]; # for convenience/legacy
583 die "sim_hardware_cmd: no node or service for command specified"
586 my ($node, $sid, $d);
588 if ($cmd eq 'service') {
589 $sid = PVE
::HA
::Tools
::pve_verify_ha_resource_id
($objid);
592 $d = $self->{nodes
}->{$node} ||
593 die "sim_hardware_cmd: no such node '$node'\n";
596 $self->log('info', "execute $cmdstr", $logid);
598 if ($cmd eq 'power') {
599 die "sim_hardware_cmd: unknown action '$action'\n"
600 if $action !~ m/^(on|off)$/;
602 if ($cstatus->{$node}->{power
} ne $action) {
603 if ($action eq 'on') {
605 $d->{crm
} = $self->crm_control('start', $d, $lock_fh) if !defined($d->{crm
});
606 $d->{lrm
} = $self->lrm_control('start', $d, $lock_fh) if !defined($d->{lrm
});
607 $d->{lrm_restart
} = undef;
608 $cstatus->{$node}->{cfs
} = {};
613 $d->{crm_env
}->log('info', "killed by poweroff");
614 $self->crm_control('stop', $d, $lock_fh);
618 $d->{lrm_env
}->log('info', "killed by poweroff");
619 $self->lrm_control('stop', $d, $lock_fh);
621 $d->{lrm_restart
} = undef;
624 $self->watchdog_reset_nolock($node);
625 $self->write_service_status($node, {});
629 $cstatus->{$node}->{power
} = $action;
630 $cstatus->{$node}->{network
} = $action;
631 $cstatus->{$node}->{shutdown} = undef;
633 $self->write_hardware_status_nolock($cstatus);
635 } elsif ($cmd eq 'network') {
636 die "sim_hardware_cmd: unknown network action '$action'"
637 if $action !~ m/^(on|off)$/;
638 $cstatus->{$node}->{network
} = $action;
640 $self->write_hardware_status_nolock($cstatus);
642 } elsif ($cmd eq 'cfs') {
643 die "sim_hardware_cmd: unknown cfs action '$action' for node '$node'"
644 if $action !~ m/^(rw|update)$/;
645 die "sim_hardware_cmd: unknown cfs command '$param' for '$action' on node '$node'"
646 if $param !~ m/^(work|fail)$/;
648 $cstatus->{$node}->{cfs
}->{$action} = $param eq 'work';
649 $self->write_hardware_status_nolock($cstatus);
651 } elsif ($cmd eq 'reboot' || $cmd eq 'shutdown') {
652 $cstatus->{$node}->{shutdown} = $cmd;
654 $self->write_hardware_status_nolock($cstatus);
656 $self->lrm_control('shutdown', $d, $lock_fh) if defined($d->{lrm
});
657 } elsif ($cmd eq 'restart-lrm') {
659 $d->{lrm_restart
} = 1;
660 $self->lrm_control('shutdown', $d, $lock_fh);
662 } elsif ($cmd eq 'crm') {
664 if ($action eq 'stop') {
667 $self->crm_control('shutdown', $d, $lock_fh);
669 } elsif ($action eq 'start') {
670 $d->{crm
} = $self->crm_control('start', $d, $lock_fh) if !defined($d->{crm
});
671 } elsif ($action eq 'enable-node-maintenance' || $action eq 'disable-node-maintenance') {
672 $self->queue_crm_commands_nolock("$action $node");
674 die "sim_hardware_cmd: unknown action '$action'";
677 } elsif ($cmd eq 'service') {
678 if ($action eq 'started' || $action eq 'disabled' ||
679 $action eq 'stopped' || $action eq 'ignored') {
681 $self->set_service_state($sid, $action);
683 } elsif ($action eq 'migrate' || $action eq 'relocate') {
685 die "sim_hardware_cmd: missing target node for '$action' command"
688 $self->queue_crm_commands_nolock("$action $sid $param");
690 } elsif ($action eq 'stop') {
692 die "sim_hardware_cmd: missing timeout for '$action' command"
695 $self->queue_crm_commands_nolock("$action $sid $param");
697 } elsif ($action eq 'add') {
700 {state => $params[1] || 'started', node
=> $param},
704 } elsif ($action eq 'delete') {
706 $self->delete_service($sid);
708 } elsif ($action eq 'lock') {
710 $self->lock_service($sid, $param);
712 } elsif ($action eq 'unlock') {
714 $self->unlock_service($sid, $param);
717 die "sim_hardware_cmd: unknown service action '$action' " .
718 "- not implemented\n"
721 die "sim_hardware_cmd: unknown command '$cmdstr'\n";
727 return $self->global_lock($code);
730 # for controlling the resource manager services
732 my ($self, $action, $data, $lock_fh) = @_;
734 die "implement in subclass";
738 my ($self, $action, $data, $lock_fh) = @_;
740 die "implement in subclass";
746 die "implement in subclass";
749 my $modify_watchog = sub {
750 my ($self, $code) = @_;
752 my $update_cmd = sub {
754 my $filename = "$self->{statusdir}/watchdog_status";
756 my ($res, $wdstatus);
759 my $raw = PVE
::Tools
::file_get_contents
($filename);
760 $wdstatus = decode_json
($raw);
765 ($wdstatus, $res) = &$code($wdstatus);
767 PVE
::Tools
::file_set_contents
($filename, encode_json
($wdstatus));
772 return $self->global_lock($update_cmd);
775 sub watchdog_reset_nolock
{
776 my ($self, $node) = @_;
778 my $filename = "$self->{statusdir}/watchdog_status";
781 my $raw = PVE
::Tools
::file_get_contents
($filename);
782 my $wdstatus = decode_json
($raw);
784 foreach my $id (keys %$wdstatus) {
785 delete $wdstatus->{$id} if $wdstatus->{$id}->{node
} eq $node;
788 PVE
::Tools
::file_set_contents
($filename, encode_json
($wdstatus));
793 my ($self, $node) = @_;
800 foreach my $wfh (keys %$wdstatus) {
801 my $wd = $wdstatus->{$wfh};
802 next if $wd->{node
} ne $node;
804 my $ctime = $self->get_time();
805 my $tdiff = $ctime - $wd->{update_time
};
807 if ($tdiff > $watchdog_timeout) { # expired
809 delete $wdstatus->{$wfh};
813 return ($wdstatus, $res);
816 return &$modify_watchog($self, $code);
822 my ($self, $node) = @_;
829 my $id = "WD:$node:$$:$wdcounter";
831 die "internal error" if defined($wdstatus->{$id});
835 update_time
=> $self->get_time(),
838 return ($wdstatus, $id);
841 return &$modify_watchog($self, $code);
845 my ($self, $wfh) = @_;
850 my $wd = $wdstatus->{$wfh};
851 die "no such watchdog handle '$wfh'\n" if !defined($wd);
853 my $tdiff = $self->get_time() - $wd->{update_time
};
854 die "watchdog expired" if $tdiff > $watchdog_timeout;
856 delete $wdstatus->{$wfh};
861 return &$modify_watchog($self, $code);
864 sub watchdog_update
{
865 my ($self, $wfh) = @_;
870 my $wd = $wdstatus->{$wfh};
872 die "no such watchdog handle '$wfh'\n" if !defined($wd);
874 my $ctime = $self->get_time();
875 my $tdiff = $ctime - $wd->{update_time
};
877 die "watchdog expired" if $tdiff > $watchdog_timeout;
879 $wd->{update_time
} = $ctime;
884 return &$modify_watchog($self, $code);
887 sub get_static_node_stats
{
890 my $cstatus = $self->read_hardware_status_nolock();
893 for my $node (keys $cstatus->%*) {
894 $stats->{$node} = { $cstatus->{$node}->%{qw(cpus memory)} };