]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/Sim/TestHardware.pm
TestHardware: use more informative error message
[pve-ha-manager.git] / src / PVE / HA / Sim / TestHardware.pm
CommitLineData
bf93e2a2
DM
1package PVE::HA::Sim::TestHardware;
2
# Simulate Hardware resources

# power supply for nodes: on/off
# network connection to nodes: on/off
# watchdog devices for nodes
8
9use strict;
10use warnings;
11use POSIX qw(strftime EINTR);
12use Data::Dumper;
13use JSON;
14use IO::File;
15use Fcntl qw(:DEFAULT :flock);
16use File::Copy;
17use File::Path qw(make_path remove_tree);
18
bf93e2a2
DM
19use PVE::HA::CRM;
20use PVE::HA::LRM;
21
533d82ca 22use PVE::HA::Sim::TestEnv;
bf93e2a2
DM
23use base qw(PVE::HA::Sim::Hardware);
24
# Upper bound for the simulated clock: run() dies with "simulation end"
# as soon as the current simulated time exceeds this value.
my $max_sim_time = 10_000;
26
# Constructor: set up a scripted hardware simulation rooted at $testdir.
#
# Parameters:
#   $this    - class name, or an existing object whose class is reused
#   $testdir - test directory; "$testdir/cmdlist" is read and decoded as JSON
#
# Returns the new object. Dies if the log file below the status directory
# cannot be opened for appending (errors raised by the parent constructor or
# while reading/decoding the command list propagate as well).
sub new {
    my ($this, $testdir) = @_;

    my $class = ref($this) || $this;

    my $self = $class->SUPER::new($testdir);

    # scripted command list, consumed batch by batch in run()
    my $raw = PVE::Tools::file_get_contents("$testdir/cmdlist");
    $self->{cmdlist} = decode_json($raw);

    $self->{loop_count} = 0;
    $self->{cur_time} = 0;

    my $statusdir = $self->statusdir();
    my $logfile = "$statusdir/log";

    # Pass the mode as a separate argument: IO::File then opens the file
    # without interpreting any character of the path as part of the mode
    # (which the single-argument ">>$logfile" form would do).
    $self->{logfh} = IO::File->new($logfile, '>>') ||
        die "unable to open '$logfile' - $!";

    # NOTE(review): $self->{nodes} is presumably filled in by the parent
    # constructor - confirm against PVE::HA::Sim::Hardware.
    foreach my $node (sort keys %{$self->{nodes}}) {

        my $d = $self->{nodes}->{$node};

        # one environment per node and daemon type
        $d->{crm_env} =
            PVE::HA::Env->new('PVE::HA::Sim::TestEnv', $node, $self, 'crm');

        $d->{lrm_env} =
            PVE::HA::Env->new('PVE::HA::Sim::TestEnv', $node, $self, 'lrm');

        $d->{crm} = undef; # create on power on
        $d->{lrm} = undef; # create on power on
    }

    return $self;
}
61
# Return the current simulated time stored in this object.
sub get_time {
    my $self = shift;

    my $now = $self->{cur_time};

    return $now;
}
67
8a6e5294
DM
# Write one formatted log line to STDOUT and to the simulation log file.
#
# Parameters:
#   $level - log level string (left aligned, padded to 5 characters)
#   $msg   - message text; a trailing newline is removed
#   $id    - optional tag of the log source, defaults to 'hardware'
#
# The line is stamped with the current simulated time from get_time().
sub log {
    my ($self, $level, $msg, $id) = @_;

    chomp $msg;

    my $time = $self->get_time();

    $id = 'hardware' if !$id;

    # The message is passed as an argument instead of being interpolated into
    # the format string, so a literal '%' inside $msg is printed verbatim and
    # is not mistaken for a conversion specification.
    my $line = sprintf("%-5s %5d %12s: %s\n", $level, $time, $id, $msg);
    print $line;

    # flush right away so the log file is complete even if the run dies
    $self->{logfh}->print($line);
    $self->{logfh}->flush();
}
83
bf93e2a2
DM
# simulate hardware commands
# power <node> <on|off>
# network <node> <on|off>
# reboot <node>
# shutdown <node>
# restart-lrm <node>
90
# Execute one simulated hardware command while holding the global lock.
#
# Parameters:
#   $cmdstr - whitespace separated command: "<cmd> <node> [<action>]", where
#             <cmd> is one of power, network, reboot, shutdown, restart-lrm;
#             power and network require <action> to be 'on' or 'off'
#   $logid  - optional log tag passed on to log()
#
# Returns whatever global_lock() returns for the wrapped code.
# Dies on a missing or unknown node, an unknown command, or an invalid action.
sub sim_hardware_cmd {
    my ($self, $cmdstr, $logid) = @_;

    my $code = sub {

        my $cstatus = $self->read_hardware_status_nolock();

        # split(' ', ...) ignores leading whitespace; splitting on /\s+/
        # would produce an empty first field and shift command and node.
        my ($cmd, $node, $action) = split(' ', $cmdstr);

        die "sim_hardware_cmd: no node specified" if !$node;

        my $d = $self->{nodes}->{$node};
        die "sim_hardware_cmd: no such node '$node'\n" if !$d;

        $self->log('info', "execute $cmdstr", $logid);

        # A missing action is treated as an empty one, so the checks below
        # reject it with their normal message and without 'uninitialized
        # value' warnings.
        $action = '' if !defined($action);

        if ($cmd eq 'power') {
            die "sim_hardware_cmd: unknown action '$action'" if $action !~ m/^(on|off)$/;
            # only act on a real state change
            if ($cstatus->{$node}->{power} ne $action) {
                if ($action eq 'on') {
                    $d->{crm} = PVE::HA::CRM->new($d->{crm_env}) if !$d->{crm};
                    $d->{lrm} = PVE::HA::LRM->new($d->{lrm_env}) if !$d->{lrm};
                    $d->{lrm_restart} = undef;
                } else {
                    if ($d->{crm}) {
                        $d->{crm_env}->log('info', "killed by poweroff");
                        $d->{crm} = undef;
                    }
                    if ($d->{lrm}) {
                        $d->{lrm_env}->log('info', "killed by poweroff");
                        $d->{lrm} = undef;
                        $d->{lrm_restart} = undef;
                    }
                    $self->watchdog_reset_nolock($node);
                    $self->write_service_status($node, {});
                }
            }

            # the network state follows the power state, and any pending
            # shutdown/reboot request is cleared
            $cstatus->{$node}->{power} = $action;
            $cstatus->{$node}->{network} = $action;
            $cstatus->{$node}->{shutdown} = undef;

            $self->write_hardware_status_nolock($cstatus);

        } elsif ($cmd eq 'network') {
            die "sim_hardware_cmd: unknown network action '$action'"
                if $action !~ m/^(on|off)$/;
            $cstatus->{$node}->{network} = $action;

            $self->write_hardware_status_nolock($cstatus);

        } elsif ($cmd eq 'reboot' || $cmd eq 'shutdown') {
            # remember which kind of shutdown was requested; run() reads it
            # back once the LRM has exited
            $cstatus->{$node}->{shutdown} = $cmd;

            $self->write_hardware_status_nolock($cstatus);

            $d->{lrm}->shutdown_request() if $d->{lrm};
        } elsif ($cmd eq 'restart-lrm') {
            if ($d->{lrm}) {
                # flag checked by run() to create a fresh LRM after exit
                $d->{lrm_restart} = 1;
                $d->{lrm}->shutdown_request();
            }
        } else {
            die "sim_hardware_cmd: unknown command '$cmdstr'\n";
        }

    };

    return $self->global_lock($code);
}
161
# Main simulation loop.
#
# Each round runs one CRM and one LRM iteration on every node that has them,
# checks the watchdogs, advances the simulated clock and, on every fifth
# counted round, applies the next batch of commands from the command list.
#
# Returns (without a value) once the command list is exhausted and more than
# 500 time units have passed since the last command. Dies with
# "simulation end\n" when the simulated time exceeds $max_sim_time, and on
# the situations marked as not implemented below.
sub run {
    my ($self) = @_;

    my $last_command_time = 0;
    my $next_cmd_at = 0;

    for (;;) {

        my $starttime = $self->get_time();

        my @nodes = sort keys %{$self->{nodes}};

        my $nodecount = scalar(@nodes);

        # each round takes at least 20 time units, or 2 per node if more
        my $looptime = $nodecount*2;
        $looptime = 20 if $looptime < 20;

        die "unable to simulate so many nodes. You need to increase watchdog/lock timeouts.\n"
            if $looptime >= 60;

        foreach my $node (@nodes) {

            my $d = $self->{nodes}->{$node};

            if (my $crm = $d->{crm}) {

                $d->{crm_env}->loop_start_hook($self->get_time());

                die "implement me (CRM exit)" if !$crm->do_one_iteration();

                $d->{crm_env}->loop_end_hook();

                # the global clock never runs backwards
                my $nodetime = $d->{crm_env}->get_time();
                $self->{cur_time} = $nodetime if $nodetime > $self->{cur_time};
            }

            if (my $lrm = $d->{lrm}) {

                $d->{lrm_env}->loop_start_hook($self->get_time());

                my $exit_lrm = !$lrm->do_one_iteration();

                $d->{lrm_env}->loop_end_hook();

                my $nodetime = $d->{lrm_env}->get_time();
                $self->{cur_time} = $nodetime if $nodetime > $self->{cur_time};

                if ($exit_lrm) {
                    $d->{lrm_env}->log('info', "exit (loop end)");
                    $d->{lrm} = undef;
                    my $cstatus = $self->read_hardware_status_nolock();
                    my $nstatus = $cstatus->{$node} || die "no node status for node '$node'";
                    my $shutdown = $nstatus->{shutdown} || '';
                    if ($d->{lrm_restart}) {
                        die "lrm restart during shutdown - not implemented" if $shutdown;
                        $d->{lrm_restart} = undef;
                        $d->{lrm} = PVE::HA::LRM->new($d->{lrm_env});
                    } elsif ($shutdown eq 'reboot') {
                        # a reboot is a power cycle
                        $self->sim_hardware_cmd("power $node off", 'reboot');
                        $self->sim_hardware_cmd("power $node on", 'reboot');
                    } elsif ($shutdown eq 'shutdown') {
                        $self->sim_hardware_cmd("power $node off", 'shutdown');
                    } else {
                        die "unexpected LRM exit - not implemented";
                    }
                }
            }

            # watchdogs of all nodes are checked after every single node
            # iteration, not only once per round
            foreach my $n (@nodes) {
                if (!$self->watchdog_check($n)) {
                    $self->sim_hardware_cmd("power $n off", 'watchdog');
                    $self->log('info', "server '$n' stopped by poweroff (watchdog)");
                    $self->{nodes}->{$n}->{crm} = undef;
                    $self->{nodes}->{$n}->{lrm} = undef;
                }
            }
        }

        # make sure at least $looptime passes per round
        $self->{cur_time} = $starttime + $looptime
            if ($self->{cur_time} - $starttime) < $looptime;

        die "simulation end\n" if $self->{cur_time} > $max_sim_time;

        foreach my $node (@nodes) {
            my $d = $self->{nodes}->{$node};
            # forced time update
            $d->{lrm_env}->loop_start_hook($self->get_time());
            $d->{crm_env}->loop_start_hook($self->get_time());
        }

        # a pending 'delay' postpones further commands; note that the loop
        # counter is not incremented while waiting
        next if $self->{cur_time} < $next_cmd_at;

        # apply new command after 5 loop iterations

        if (($self->{loop_count} % 5) == 0) {
            my $list = shift @{$self->{cmdlist}};
            if (!$list) {
                # end simulation (500 seconds after last command)
                return if (($self->{cur_time} - $last_command_time) > 500);
                # nothing to execute in this round; made explicit instead of
                # relying on autovivification in the loop below
                $list = [];
            }

            foreach my $cmd (@$list) {
                $last_command_time = $self->{cur_time};

                if ($cmd =~ m/^delay\s+(\d+)\s*$/) {
                    $next_cmd_at = $self->{cur_time} + $1;
                } else {
                    $self->sim_hardware_cmd($cmd, 'cmdlist');
                }
            }
        }

        ++$self->{loop_count};
    }
}
278
2791;