1 package PVE
::HA
::Sim
::Hardware
;
3 # Simulate Hardware resources
5 # power supply for nodes: on/off
6 # network connection to nodes: on/off
7 # watchdog devices for nodes
11 use POSIX
qw(strftime EINTR);
14 use Fcntl
qw(:DEFAULT :flock);
16 use File
::Path
qw(make_path remove_tree);
18 use PVE
::HA
::FenceConfig
;
20 my $watchdog_timeout = 60;
23 # Status directory layout
27 # $testdir/cmdlist Command list for simulation
28 # $testdir/hardware_status Hardware description (number of nodes, ...)
29 # $testdir/manager_status CRM status (start with {})
30 # $testdir/service_config Service configuration
31 # $testdir/groups HA groups configuration
32 # $testdir/service_status_<node> Service status
35 # runtime status for simulation system
37 # $testdir/status/cluster_locks Cluster locks
38 # $testdir/status/hardware_status Hardware status (power/network on/off)
39 # $testdir/status/watchdog_status Watchdog status
43 # $testdir/status/lrm_status_<node> LRM status
44 # $testdir/status/manager_status CRM status
45 # $testdir/status/crm_commands CRM command queue
46 # $testdir/status/service_config Service configuration
47 # $testdir/status/service_status_<node> Service status
48 # $testdir/status/groups HA groups configuration
51 my ($self, $node) = @_;
53 my $filename = "$self->{statusdir}/lrm_status_$node";
55 return PVE
::HA
::Tools
::read_json_from_file
($filename, {});
# Store the LRM status of a single node.
#
# The status object is written to "<statusdir>/lrm_status_<node>" through
# PVE::HA::Tools::write_json_to_file().
#
# $node       - node name, used as suffix of the status file name
# $status_obj - data passed through unchanged to the JSON writer
#
# Returns whatever the JSON writer returns.
sub write_lrm_status {
    my $self = shift;
    my ($node, $status_obj) = @_;

    my $status_file = "$self->{statusdir}/lrm_status_$node";

    return PVE::HA::Tools::write_json_to_file($status_file, $status_obj);
}
66 sub read_hardware_status_nolock
{
69 my $filename = "$self->{statusdir}/hardware_status";
71 my $raw = PVE
::Tools
::file_get_contents
($filename);
72 my $cstatus = decode_json
($raw);
# Serialize the hardware status to "<statusdir>/hardware_status".
#
# $cstatus - data structure that is JSON encoded and written to the file
#
# This sub takes no lock itself.
# NOTE(review): presumably callers are expected to hold the global lock
# already - confirm against the call sites.
sub write_hardware_status_nolock {
    my $self    = shift;
    my $cstatus = shift;

    my $status_file = "$self->{statusdir}/hardware_status";
    my $json        = encode_json($cstatus);

    return PVE::Tools::file_set_contents($status_file, $json);
}
85 sub read_service_config
{
88 my $filename = "$self->{statusdir}/service_config";
89 my $conf = PVE
::HA
::Tools
::read_json_from_file
($filename);
91 foreach my $sid (keys %$conf) {
92 my $d = $conf->{$sid};
94 die "service '$sid' without assigned node!" if !$d->{node
};
96 if ($sid =~ m/^(vm|ct|fa):(\d+)$/) {
102 $d->{state} = 'disabled' if !$d->{state};
103 $d->{state} = 'started' if $d->{state} eq 'enabled'; # backward compatibility
104 $d->{max_restart
} = 1 if !defined($d->{max_restart
});
105 $d->{max_relocate
} = 1 if !defined($d->{max_relocate
});
# Replace the service configuration.
#
# The new configuration is remembered in $self->{service_config} and then
# written as JSON to "<statusdir>/service_config".
#
# $conf - service configuration, stored and written as passed in
#
# Returns the result of PVE::HA::Tools::write_json_to_file().
sub write_service_config {
    my $self = shift;
    my ($conf) = @_;

    my $config_file = "$self->{statusdir}/service_config";

    # keep the in-memory copy in sync with what goes to disk
    $self->{service_config} = $conf;

    return PVE::HA::Tools::write_json_to_file($config_file, $conf);
}
120 sub read_fence_config
{
125 my $filename = "$self->{statusdir}/fence.cfg";
127 $raw = PVE
::Tools
::file_get_contents
($filename);
130 return PVE
::HA
::FenceConfig
::parse_config
($filename, $raw);
# Simulated fence agent execution.
#
# Every agent behaves the same for now: the node is powered off through
# sim_hardware_cmd(), using the agent name as log id, and success is
# reported. Any additional agent parameters are accepted but ignored.
#
# $agent - agent name, passed as log id to sim_hardware_cmd()
# $node  - node that gets powered off
#
# Returns 0 (EXIT_SUCCESS).
sub exec_fence_agent {
    my ($self, $agent, $node) = @_;

    my $power_off_cmd = "power $node off";
    $self->sim_hardware_cmd($power_off_cmd, $agent);

    my $exit_success = 0;
    return $exit_success;
}
# Set the requested state of an existing service.
#
# Reads the service configuration, updates the 'state' entry of $sid and
# writes the configuration back.
#
# $sid   - service ID, must already exist in the configuration
# $state - new value for the service's 'state' key
#
# Dies if the service is not configured.
# Returns the result of write_service_config().
sub set_service_state {
    my $self = shift;
    my ($sid, $state) = @_;

    my $conf = $self->read_service_config();

    my $service = $conf->{$sid};
    unless ($service) {
        die "no such service '$sid'";
    }

    $service->{state} = $state;

    return $self->write_service_config($conf);
}
156 my ($self, $sid, $opts) = @_;
158 my $conf = $self->read_service_config();
159 die "resource ID '$sid' already defined\n" if $conf->{$sid};
161 $conf->{$sid} = $opts;
163 $self->write_service_config($conf);
169 my ($self, $sid) = @_;
171 my $conf = $self->read_service_config();
173 die "no such service '$sid'" if !$conf->{$sid};
175 delete $conf->{$sid};
177 $self->write_service_config($conf);
# Move a service from one node to another in the service configuration.
#
# $sid          - service ID, must already exist in the configuration
# $current_node - node the caller believes the service is on; has to match
#                 the configured node
# $new_node     - node name stored as the new location
#
# Dies if the service is unknown or if $current_node does not match the
# configured node. Returns the result of write_service_config().
sub change_service_location {
    my $self = shift;
    my ($sid, $current_node, $new_node) = @_;

    my $conf = $self->read_service_config();

    my $service = $conf->{$sid};
    unless ($service) {
        die "no such service '$sid'\n";
    }

    # refuse to move a service the caller has a stale view of
    if ($current_node ne $service->{node}) {
        die "current_node for '$sid' does not match ($current_node != $conf->{$sid}->{node})\n";
    }

    $service->{node} = $new_node;

    return $self->write_service_config($conf);
}
# Look up the lock of a service.
#
# $sid - service ID, must already exist in the configuration
#
# Dies if the service is unknown. Returns the value stored under the
# service's 'lock' key, which is undef when no such key is set.
sub service_has_lock {
    my $self = shift;
    my ($sid) = @_;

    my $conf = $self->read_service_config();

    my $service = $conf->{$sid};
    unless ($service) {
        die "no such service '$sid'\n";
    }

    return $service->{lock};
}
208 my ($self, $sid, $lock) = @_;
210 my $conf = $self->read_service_config();
212 die "no such service '$sid'\n" if !$conf->{$sid};
214 $conf->{$sid}->{lock} = $lock || 'backup';
216 $self->write_service_config($conf);
222 my ($self, $sid, $lock) = @_;
224 my $conf = $self->read_service_config();
226 die "no such service '$sid'\n" if !$conf->{$sid};
228 if (!defined($conf->{$sid}->{lock})) {
232 if (defined($lock) && $conf->{$sid}->{lock} ne $lock) {
233 warn "found lock '$conf->{$sid}->{lock}' trying to remove '$lock' lock\n";
237 my $removed_lock = delete $conf->{$sid}->{lock};
239 $self->write_service_config($conf);
241 return $removed_lock;
244 sub queue_crm_commands_nolock
{
245 my ($self, $cmd) = @_;
250 my $filename = "$self->{statusdir}/crm_commands";
252 $data = PVE
::Tools
::file_get_contents
($filename);
255 PVE
::Tools
::file_set_contents
($filename, $data);
260 sub queue_crm_commands
{
261 my ($self, $cmd) = @_;
263 my $code = sub { $self->queue_crm_commands_nolock($cmd); };
265 $self->global_lock($code);
270 sub read_crm_commands
{
276 my $filename = "$self->{statusdir}/crm_commands";
278 $data = PVE
::Tools
::file_get_contents
($filename);
280 PVE
::Tools
::file_set_contents
($filename, '');
285 return $self->global_lock($code);
288 sub read_group_config
{
291 my $filename = "$self->{statusdir}/groups";
293 $raw = PVE
::Tools
::file_get_contents
($filename) if -f
$filename;
295 return PVE
::HA
::Config
::parse_groups_config
($filename, $raw);
# Load the service status of a single node.
#
# The data is read from "<statusdir>/service_status_<node>" through
# PVE::HA::Tools::read_json_from_file(); no default value is passed to
# the reader.
#
# $node - node name, used as suffix of the status file name
#
# Returns whatever the JSON reader returns.
sub read_service_status {
    my $self = shift;
    my ($node) = @_;

    my $status_file = "$self->{statusdir}/service_status_$node";

    return PVE::HA::Tools::read_json_from_file($status_file);
}
305 sub write_service_status
{
306 my ($self, $node, $data) = @_;
308 my $filename = "$self->{statusdir}/service_status_$node";
309 my $res = PVE
::HA
::Tools
::write_json_to_file
($filename, $data);
311 # fixme: add test if a service runs on two nodes!!!
316 my $default_group_config = <<__EOD;
331 my ($this, $testdir) = @_;
333 die "missing testdir" if !$testdir;
335 die "testdir '$testdir' does not exist or is not a directory!\n"
338 my $class = ref($this) || $this;
340 my $self = bless {}, $class;
342 my $statusdir = $self->{statusdir
} = "$testdir/status";
344 remove_tree
($statusdir);
# copy initial configuration
348 copy
("$testdir/manager_status", "$statusdir/manager_status"); # optional
350 if (-f
"$testdir/groups") {
351 copy
("$testdir/groups", "$statusdir/groups");
353 PVE
::Tools
::file_set_contents
("$statusdir/groups", $default_group_config);
356 if (-f
"$testdir/service_config") {
357 copy
("$testdir/service_config", "$statusdir/service_config");
360 'vm:101' => { node
=> 'node1', group
=> 'prefer_node1' },
361 'vm:102' => { node
=> 'node2', group
=> 'prefer_node2' },
362 'vm:103' => { node
=> 'node3', group
=> 'prefer_node3' },
363 'vm:104' => { node
=> 'node1', group
=> 'prefer_node1' },
364 'vm:105' => { node
=> 'node2', group
=> 'prefer_node2' },
365 'vm:106' => { node
=> 'node3', group
=> 'prefer_node3' },
367 $self->write_service_config($conf);
370 if (-f
"$testdir/hardware_status") {
371 copy
("$testdir/hardware_status", "$statusdir/hardware_status") ||
372 die "Copy failed: $!\n";
375 node1
=> { power
=> 'off', network
=> 'off' },
376 node2
=> { power
=> 'off', network
=> 'off' },
377 node3
=> { power
=> 'off', network
=> 'off' },
379 $self->write_hardware_status_nolock($cstatus);
382 if (-f
"$testdir/fence.cfg") {
383 copy
("$testdir/fence.cfg", "$statusdir/fence.cfg");
386 my $cstatus = $self->read_hardware_status_nolock();
388 foreach my $node (sort keys %$cstatus) {
389 $self->{nodes
}->{$node} = {};
391 if (-f
"$testdir/service_status_$node") {
392 copy
("$testdir/service_status_$node", "$statusdir/service_status_$node");
394 $self->write_service_status($node, {});
398 $self->{service_config
} = $self->read_service_config();
406 die "implement in subclass";
410 my ($self, $level, $msg, $id) = @_;
414 my $time = $self->get_time();
416 $id = 'hardware' if !$id;
418 printf("%-5s %5d %12s: $msg\n", $level, $time, $id);
422 my ($self, $node) = @_;
424 return $self->{statusdir
};
428 my ($self, $code, @param) = @_;
430 my $lockfile = "$self->{statusdir}/hardware.lck";
431 my $fh = IO
::File-
>new(">>$lockfile") ||
432 die "unable to open '$lockfile'\n";
436 $success = flock($fh, LOCK_EX
);
437 if ($success || ($! != EINTR
)) {
442 die "can't acquire lock '$lockfile' - $!\n";
448 eval { $res = &$code($fh, @param) };
458 my $compute_node_info = sub {
459 my ($self, $cstatus) = @_;
464 my $online_count = 0;
466 foreach my $node (keys %$cstatus) {
467 my $d = $cstatus->{$node};
469 my $online = ($d->{power
} eq 'on' && $d->{network
} eq 'on') ?
1 : 0;
470 $node_info->{$node}->{online
} = $online;
473 $online_count++ if $online;
476 my $quorate = ($online_count > int($node_count/2)) ?
1 : 0;
479 foreach my $node (keys %$cstatus) {
480 my $d = $cstatus->{$node};
481 $node_info->{$node}->{online
} = 0;
485 return ($node_info, $quorate);
491 my $cstatus = $self->read_hardware_status_nolock();
492 my ($node_info, $quorate) = &$compute_node_info($self, $cstatus);
494 return ($node_info, $quorate);
497 # simulate hardware commands
498 # power <node> <on|off>
499 # network <node> <on|off>
503 # service <sid> <started|disabled|stopped>
504 # service <sid> <migrate|relocate> <target>
505 # service <sid> lock/unlock [lockname]
507 sub sim_hardware_cmd
{
508 my ($self, $cmdstr, $logid) = @_;
513 my $cstatus = $self->read_hardware_status_nolock();
515 my ($cmd, $objid, $action, $target) = split(/\s+/, $cmdstr);
517 die "sim_hardware_cmd: no node or service for command specified"
520 my ($node, $sid, $d);
522 if ($cmd eq 'service') {
523 $sid = PVE
::HA
::Tools
::pve_verify_ha_resource_id
($objid);
526 $d = $self->{nodes
}->{$node} ||
527 die "sim_hardware_cmd: no such node '$node'\n";
530 $self->log('info', "execute $cmdstr", $logid);
532 if ($cmd eq 'power') {
533 die "sim_hardware_cmd: unknown action '$action'\n"
534 if $action !~ m/^(on|off)$/;
536 if ($cstatus->{$node}->{power
} ne $action) {
537 if ($action eq 'on') {
539 $d->{crm
} = $self->crm_control('start', $d, $lock_fh) if !defined($d->{crm
});
540 $d->{lrm
} = $self->lrm_control('start', $d, $lock_fh) if !defined($d->{lrm
});
541 $d->{lrm_restart
} = undef;
546 $d->{crm_env
}->log('info', "killed by poweroff");
547 $self->crm_control('stop', $d, $lock_fh);
551 $d->{lrm_env
}->log('info', "killed by poweroff");
552 $self->lrm_control('stop', $d, $lock_fh);
554 $d->{lrm_restart
} = undef;
557 $self->watchdog_reset_nolock($node);
558 $self->write_service_status($node, {});
562 $cstatus->{$node}->{power
} = $action;
563 $cstatus->{$node}->{network
} = $action;
564 $cstatus->{$node}->{shutdown} = undef;
566 $self->write_hardware_status_nolock($cstatus);
568 } elsif ($cmd eq 'network') {
569 die "sim_hardware_cmd: unknown network action '$action'"
570 if $action !~ m/^(on|off)$/;
571 $cstatus->{$node}->{network
} = $action;
573 $self->write_hardware_status_nolock($cstatus);
575 } elsif ($cmd eq 'reboot' || $cmd eq 'shutdown') {
576 $cstatus->{$node}->{shutdown} = $cmd;
578 $self->write_hardware_status_nolock($cstatus);
580 $self->lrm_control('shutdown', $d, $lock_fh) if defined($d->{lrm
});
581 } elsif ($cmd eq 'restart-lrm') {
583 $d->{lrm_restart
} = 1;
584 $self->lrm_control('shutdown', $d, $lock_fh);
586 } elsif ($cmd eq 'crm') {
588 if ($action eq 'stop') {
591 $self->crm_control('shutdown', $d, $lock_fh);
593 } elsif ($action eq 'start') {
594 $d->{crm
} = $self->crm_control('start', $d, $lock_fh) if !defined($d->{crm
});
596 die "sim_hardware_cmd: unknown action '$action'";
599 } elsif ($cmd eq 'service') {
600 if ($action eq 'started' || $action eq 'disabled' || $action eq 'stopped') {
602 $self->set_service_state($sid, $action);
604 } elsif ($action eq 'migrate' || $action eq 'relocate') {
606 die "sim_hardware_cmd: missing target node for '$action' command"
609 $self->queue_crm_commands_nolock("$action $sid $target");
611 } elsif ($action eq 'add') {
613 $self->add_service($sid, {state => 'started', node
=> $target});
615 } elsif ($action eq 'delete') {
617 $self->delete_service($sid);
619 } elsif ($action eq 'lock') {
621 $self->lock_service($sid, $target);
623 } elsif ($action eq 'unlock') {
625 $self->unlock_service($sid, $target);
628 die "sim_hardware_cmd: unknown service action '$action' " .
629 "- not implemented\n"
632 die "sim_hardware_cmd: unknown command '$cmdstr'\n";
638 return $self->global_lock($code);
641 # for controlling the resource manager services
643 my ($self, $action, $data, $lock_fh) = @_;
645 die "implement in subclass";
649 my ($self, $action, $data, $lock_fh) = @_;
651 die "implement in subclass";
657 die "implement in subclass";
660 my $modify_watchog = sub {
661 my ($self, $code) = @_;
663 my $update_cmd = sub {
665 my $filename = "$self->{statusdir}/watchdog_status";
667 my ($res, $wdstatus);
670 my $raw = PVE
::Tools
::file_get_contents
($filename);
671 $wdstatus = decode_json
($raw);
676 ($wdstatus, $res) = &$code($wdstatus);
678 PVE
::Tools
::file_set_contents
($filename, encode_json
($wdstatus));
683 return $self->global_lock($update_cmd);
686 sub watchdog_reset_nolock
{
687 my ($self, $node) = @_;
689 my $filename = "$self->{statusdir}/watchdog_status";
692 my $raw = PVE
::Tools
::file_get_contents
($filename);
693 my $wdstatus = decode_json
($raw);
695 foreach my $id (keys %$wdstatus) {
696 delete $wdstatus->{$id} if $wdstatus->{$id}->{node
} eq $node;
699 PVE
::Tools
::file_set_contents
($filename, encode_json
($wdstatus));
704 my ($self, $node) = @_;
711 foreach my $wfh (keys %$wdstatus) {
712 my $wd = $wdstatus->{$wfh};
713 next if $wd->{node
} ne $node;
715 my $ctime = $self->get_time();
716 my $tdiff = $ctime - $wd->{update_time
};
718 if ($tdiff > $watchdog_timeout) { # expired
720 delete $wdstatus->{$wfh};
724 return ($wdstatus, $res);
727 return &$modify_watchog($self, $code);
733 my ($self, $node) = @_;
740 my $id = "WD:$node:$$:$wdcounter";
742 die "internal error" if defined($wdstatus->{$id});
746 update_time
=> $self->get_time(),
749 return ($wdstatus, $id);
752 return &$modify_watchog($self, $code);
756 my ($self, $wfh) = @_;
761 my $wd = $wdstatus->{$wfh};
762 die "no such watchdog handle '$wfh'\n" if !defined($wd);
764 my $tdiff = $self->get_time() - $wd->{update_time
};
765 die "watchdog expired" if $tdiff > $watchdog_timeout;
767 delete $wdstatus->{$wfh};
772 return &$modify_watchog($self, $code);
775 sub watchdog_update
{
776 my ($self, $wfh) = @_;
781 my $wd = $wdstatus->{$wfh};
783 die "no such watchdog handle '$wfh'\n" if !defined($wd);
785 my $ctime = $self->get_time();
786 my $tdiff = $ctime - $wd->{update_time
};
788 die "watchdog expired" if $tdiff > $watchdog_timeout;
790 $wd->{update_time
} = $ctime;
795 return &$modify_watchog($self, $code);