]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Sim/TestHardware.pm
bump version to 4.0.5
[pve-ha-manager.git] / src / PVE / HA / Sim / TestHardware.pm
1 package PVE::HA::Sim::TestHardware;
2
3 # Simulate Hardware resources
4
5 # power supply for nodes: on/off
6 # network connection to nodes: on/off
7 # watchdog devices for nodes
8
9 use strict;
10 use warnings;
11 use POSIX qw(strftime EINTR);
12 use JSON;
13 use IO::File;
14 use Fcntl qw(:DEFAULT :flock);
15 use File::Copy;
16 use File::Path qw(make_path remove_tree);
17
18 use PVE::HA::CRM;
19 use PVE::HA::LRM;
20
21 use PVE::HA::Sim::TestEnv;
22 use base qw(PVE::HA::Sim::Hardware);
23
# Upper bound for the simulated clock: run() dies with "simulation exceeded
# maximum time" as soon as $self->{cur_time} grows beyond this value.
24 my $max_sim_time = 10000;
25
# Create the hardware simulator for the regression test stored in $testdir.
#
# Arguments:
#   $this    - class name, or an existing object whose class gets reused
#   $testdir - test directory, must contain a JSON encoded 'cmdlist' file
#
# On top of the base class constructor this
#   - loads the command list from "$testdir/cmdlist",
#   - resets the loop counter and the simulated clock to 0,
#   - opens "<statusdir>/log" for appending,
#   - creates a CRM and a LRM test environment for each entry in
#     $self->{nodes} (the CRM/LRM objects themselves are left undefined).
#
# Returns the new object, dies if the log file cannot be opened.
sub new {
    my ($this, $testdir) = @_;

    my $class = ref($this) || $this;

    my $self = $class->SUPER::new($testdir);

    my $raw = PVE::Tools::file_get_contents("$testdir/cmdlist");
    $self->{cmdlist} = decode_json($raw);

    $self->{loop_count} = 0;
    $self->{cur_time} = 0;

    my $statusdir = $self->statusdir();
    my $logfile = "$statusdir/log";

    # Hand over the open mode as separate argument. With the mode glued to
    # the file name, special characters or surrounding whitespace in the path
    # would be interpreted by open() instead of being taken literally.
    $self->{logfh} = IO::File->new($logfile, '>>')
        || die "unable to open '$logfile' - $!";

    foreach my $node (sort keys %{$self->{nodes}}) {
        my $d = $self->{nodes}->{$node};

        $d->{crm_env} =
            PVE::HA::Env->new('PVE::HA::Sim::TestEnv', $node, $self, 'crm');

        $d->{lrm_env} =
            PVE::HA::Env->new('PVE::HA::Sim::TestEnv', $node, $self, 'lrm');

        $d->{crm} = undef; # create on power on
        $d->{lrm} = undef; # create on power on
    }

    return $self;
}
60
# Accessor for the simulated clock: hands back whatever is currently stored
# in $self->{cur_time}.
sub get_time {
    my $self = shift;

    return $self->{cur_time};
}
66
# Emit one log line on the currently selected output handle (normally
# STDOUT) and append the same line to the simulator log file.
#
# Arguments:
#   $level - log level, printed left aligned in a 5 character wide column
#   $msg   - message text, a trailing newline gets removed
#   $id    - optional origin of the message, defaults to 'hardware'
#
# Each line has the form "<level> <time> <id>: <msg>", where <time> is the
# value returned by get_time().
sub log {
    my ($self, $level, $msg, $id) = @_;

    chomp $msg;

    my $time = $self->get_time();

    $id = 'hardware' if !$id;

    # The message is passed as an argument and not interpolated into the
    # format string - a literal '%' inside $msg would otherwise be parsed as
    # a conversion and garble the line.
    my $line = sprintf("%-5s %5d %12s: %s\n", $level, $time, $id, $msg);
    print $line;

    $self->{logfh}->print($line);
    $self->{logfh}->flush();
}
82
83 # for controlling the resource manager services (CRM and LRM)
# Control hook for the simulated CRM service of one node.
#
# Arguments:
#   $action  - 'start', 'stop' or 'shutdown', anything else dies
#   $data    - per node data hash (uses the 'crm_env' and 'crm' entries)
#   $lock_fh - accepted, but not used by this implementation
#
# 'start' returns a freshly created PVE::HA::CRM object bound to
# $data->{crm_env}; 'shutdown' forwards a shutdown request to $data->{crm}.
sub crm_control {
    my ($self, $action, $data, $lock_fh) = @_;

    return PVE::HA::CRM->new($data->{crm_env}) if $action eq 'start';

    if ($action eq 'stop') {
        # nothing to do - sim_hardware_cmd sets the CRM to undef, that is enough
    } elsif ($action eq 'shutdown') {
        $data->{crm}->shutdown_request();
    } else {
        die "unknown CRM control action: '$action'\n";
    }
}
97
# Control hook for the simulated LRM service of one node.
#
# Arguments:
#   $action  - 'start', 'stop' or 'shutdown', anything else dies
#   $data    - per node data hash (uses the 'lrm_env' and 'lrm' entries)
#   $lock_fh - accepted, but not used by this implementation
#
# 'start' returns a freshly created PVE::HA::LRM object bound to
# $data->{lrm_env}; 'shutdown' forwards a shutdown request to $data->{lrm}.
sub lrm_control {
    my ($self, $action, $data, $lock_fh) = @_;

    return PVE::HA::LRM->new($data->{lrm_env}) if $action eq 'start';

    if ($action eq 'stop') {
        # nothing to do - sim_hardware_cmd sets the LRM to undef, that is enough
    } elsif ($action eq 'shutdown') {
        $data->{lrm}->shutdown_request();
    } else {
        die "unknown LRM control action: '$action'\n";
    }
}
112
# Run the simulation main loop.
#
# Each pass of the loop
#   - runs one iteration of the CRM and of the LRM on every node that
#     currently has such an object, unless a 'skip-round' command is pending
#     for that service type,
#   - reacts on services whose iteration requested an exit (node reboot or
#     shutdown, requested CRM stop, requested LRM restart),
#   - powers off every node failing the watchdog check,
#   - advances the simulated clock by at least $looptime (twice the node
#     count, but never less than 20),
#   - executes the next entry of $self->{cmdlist} when the loop counter is a
#     multiple of 5 and no 'delay' is pending.
#
# Command list entries are lists of commands. 'delay N' suspends command
# processing until N time units passed, 'skip-round lrm|crm [COUNT]' skips
# COUNT (default 1) rounds of that service type, everything else is passed on
# to sim_hardware_cmd().
#
# Returns once the command list is used up and more than 500 time units
# passed since the last command got executed.
#
# Dies if
#   - there are so many nodes that $looptime reaches 60,
#   - the simulated clock exceeds $max_sim_time,
#   - a CRM or LRM exits without a known reason,
#   - the hardware status has no entry for a node whose service exited.
sub run {
    my ($self) = @_;

    my ($last_command_time, $next_cmd_at, $skip_service_round) = (0, 0, {});

    # Returns the 'shutdown' entry of the node's hardware status, or '' if it
    # is not set. Dies if there is no status for the node at all.
    my $get_shutdown_mode = sub {
        my ($node) = @_;

        my $cstatus = $self->read_hardware_status_nolock();
        my $nstatus = $cstatus->{$node} || die "no node status for node '$node'";

        return $nstatus->{shutdown} || '';
    };

    for (;;) {
        my $starttime = $self->get_time();

        my @nodes = sort keys %{$self->{nodes}};

        my $looptime = scalar(@nodes) * 2; # twice the node count
        $looptime = 20 if $looptime < 20;

        die "unable to simulate so many nodes. You need to increase watchdog/lock timeouts.\n"
            if $looptime >= 60;

        my $first_loop = 1; # log skipped rounds only once, not for each node
        foreach my $node (@nodes) {
            my $d = $self->{nodes}->{$node};

            if (my $crm = $d->{crm}) {
                my $exit_crm;

                if (!$skip_service_round->{crm}) {
                    $exit_crm = !$crm->do_one_iteration();
                } else {
                    $self->log('info', "skipping CRM round", 'run-loop') if $first_loop;
                }

                # the global clock follows the most advanced node clock
                my $nodetime = $d->{crm_env}->get_time();
                $self->{cur_time} = $nodetime if $nodetime > $self->{cur_time};

                if ($exit_crm) {
                    $d->{crm_env}->log('info', "exit (loop end)");
                    $d->{crm} = undef;

                    my $shutdown = $get_shutdown_mode->($node);
                    if ($shutdown eq 'reboot') {
                        $self->sim_hardware_cmd("power $node off", 'reboot');
                        $self->sim_hardware_cmd("power $node on", 'reboot');
                    } elsif ($shutdown eq 'shutdown') {
                        $self->sim_hardware_cmd("power $node off", 'shutdown');
                    } elsif (!$d->{crm_stop}) {
                        die "unexpected CRM exit - not implemented"
                    }
                    $d->{crm_stop} = undef;
                }
            }

            if (my $lrm = $d->{lrm}) {
                my $exit_lrm;

                if (!$skip_service_round->{lrm}) {
                    $exit_lrm = !$lrm->do_one_iteration();
                } else {
                    $self->log('info', "skipping LRM round", 'run-loop') if $first_loop;
                }

                my $nodetime = $d->{lrm_env}->get_time();
                $self->{cur_time} = $nodetime if $nodetime > $self->{cur_time};

                if ($exit_lrm) {
                    $d->{lrm_env}->log('info', "exit (loop end)");
                    $d->{lrm} = undef;

                    my $shutdown = $get_shutdown_mode->($node);
                    if ($d->{lrm_restart}) {
                        die "lrm restart during shutdown - not implemented" if $shutdown;
                        $d->{lrm_restart} = undef;
                        $d->{lrm} = PVE::HA::LRM->new($d->{lrm_env});
                    } elsif ($shutdown eq 'reboot' || $shutdown eq 'shutdown') {
                        # exit the LRM before the CRM to reflect real world behaviour
                        $self->sim_hardware_cmd("crm $node stop", $shutdown);
                    } else {
                        die "unexpected LRM exit - not implemented"
                    }
                }
            }

            # all watchdogs get checked after each single node ran its services
            foreach my $n (@nodes) {
                if (!$self->watchdog_check($n)) {
                    $self->sim_hardware_cmd("power $n off", 'watchdog');
                    $self->log('info', "server '$n' stopped by poweroff (watchdog)");
                    $self->{nodes}->{$n}->{crm} = undef;
                    $self->{nodes}->{$n}->{lrm} = undef;
                }
            }
            $first_loop = 0;
        }

        $skip_service_round->{crm}-- if $skip_service_round->{crm};
        $skip_service_round->{lrm}-- if $skip_service_round->{lrm};

        # a loop iteration takes at least $looptime
        $self->{cur_time} = $starttime + $looptime
            if ($self->{cur_time} - $starttime) < $looptime;

        die "simulation exceeded maximum time ($max_sim_time) - force end\n"
            if $self->{cur_time} > $max_sim_time;

        foreach my $node (@nodes) {
            my $d = $self->{nodes}->{$node};
            # forced time update
            $d->{lrm_env}->loop_start_hook();
            $d->{crm_env}->loop_start_hook();
        }

        # a pending 'delay' also keeps the loop counter from advancing
        next if $self->{cur_time} < $next_cmd_at;

        if (($self->{loop_count} % 5) == 0) { # apply new command every 5 loop iterations
            my $list = shift @{$self->{cmdlist}};
            if (!$list) {
                # end simulation 500 seconds after the last command was executed
                return if ($self->{cur_time} - $last_command_time) > 500;
            }

            # no entry left: iterate over an explicit empty list
            foreach my $cmd (@{ $list // [] }) {
                $last_command_time = $self->{cur_time};

                # capture the match results right away, before calling anything else
                if (my ($delay) = $cmd =~ m/^delay\s+(\d+)\s*$/) {
                    $self->log('info', "execute $cmd", 'cmdlist');
                    $next_cmd_at = $self->{cur_time} + $delay;
                } elsif (my ($what, $rounds) = $cmd =~ m/^skip-round\s+(lrm|crm)(?:\s+(\d+))?\s*$/) {
                    $self->log('info', "execute $cmd", 'cmdlist');
                    $rounds //= 1;
                    $skip_service_round->{$what} //= 0;
                    $skip_service_round->{$what} += $rounds;
                } else {
                    $self->sim_hardware_cmd($cmd, 'cmdlist');
                }
            }
        }

        ++$self->{loop_count};
    }
}
250
251 1;