]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Sim/TestHardware.pm
8b2ab71f0e08b8179d67f1e77dac64f9e93e5a0e
[pve-ha-manager.git] / src / PVE / HA / Sim / TestHardware.pm
1 package PVE::HA::Sim::TestHardware;
2
3 # Simulate Hardware resources
4
5 # power supply for nodes: on/off
6 # network connection to nodes: on/off
7 # watchdog devices for nodes
8
9 use strict;
10 use warnings;
11 use POSIX qw(strftime EINTR);
12 use JSON;
13 use IO::File;
14 use Fcntl qw(:DEFAULT :flock);
15 use File::Copy;
16 use File::Path qw(make_path remove_tree);
17
18 use PVE::HA::CRM;
19 use PVE::HA::LRM;
20
21 use PVE::HA::Sim::TestEnv;
22 use base qw(PVE::HA::Sim::Hardware);
23
# upper bound for the simulated time - run() dies with "simulation end"
# as soon as the simulated clock gets past this value
my $max_sim_time = 10_000;
25
# Create the test hardware for the test case stored in $testdir.
#
# On top of the base class constructor this
#  - loads the JSON encoded command list from "$testdir/cmdlist",
#  - resets the loop counter and the simulated clock to 0,
#  - opens the file 'log' inside the status directory for appending,
#  - creates one CRM and one LRM test environment per node.
#
# The CRM/LRM objects themselves are left undefined here.
#
# Dies if the log file cannot be opened.
sub new {
    my ($this, $testdir) = @_;

    my $class = ref($this) || $this;

    my $self = $class->SUPER::new($testdir);

    my $raw = PVE::Tools::file_get_contents("$testdir/cmdlist");
    $self->{cmdlist} = decode_json($raw);

    $self->{loop_count} = 0;
    $self->{cur_time} = 0;

    my $statusdir = $self->statusdir();
    my $logfile = "$statusdir/log";

    # pass the open mode separately - gluing it to the file name
    # (">>$logfile") lets special characters in the path be re-interpreted
    $self->{logfh} = IO::File->new($logfile, '>>')
        or die "unable to open '$logfile' - $!";

    foreach my $node (sort keys %{$self->{nodes}}) {

        my $d = $self->{nodes}->{$node};

        $d->{crm_env} =
            PVE::HA::Env->new('PVE::HA::Sim::TestEnv', $node, $self, 'crm');

        $d->{lrm_env} =
            PVE::HA::Env->new('PVE::HA::Sim::TestEnv', $node, $self, 'lrm');

        $d->{crm} = undef; # create on power on
        $d->{lrm} = undef; # create on power on
    }

    return $self;
}
60
# Return the current simulated time, i.e. the value of $self->{cur_time}.
sub get_time {
    my $self = shift;

    my $now = $self->{cur_time};

    return $now;
}
66
# Write one formatted log line to STDOUT and to the log file handle
# stored in $self->{logfh}.
#
# $level - severity tag, printed left aligned in a 5 character column
# $msg   - message text, a trailing newline gets removed
# $id    - optional name of the originator, 'hardware' if not set
#
# The line is prefixed with the current simulated time.
sub log {
    my ($self, $level, $msg, $id) = @_;

    chomp $msg;

    my $time = $self->get_time();

    $id = 'hardware' if !$id;

    # hand $msg over as argument instead of interpolating it into the format
    # string - otherwise a literal '%' inside the message is parsed as a
    # conversion, which garbles the line and triggers warnings
    my $line = sprintf("%-5s %5d %12s: %s\n", $level, $time, $id, $msg);
    print $line;

    $self->{logfh}->print($line);
    $self->{logfh}->flush();
}
82
83 # for controlling the resource manager services (CRM and LRM)
# Control the simulated cluster resource manager (CRM) of one node.
#
# $action  - 'start', 'stop' or 'shutdown'
# $data    - per node data hash, the 'crm_env' and 'crm' entries are read
# $lock_fh - not used by this implementation
#
# Returns the new PVE::HA::CRM instance for 'start' and nothing for all
# other actions. Dies on an unknown action, or if 'shutdown' is requested
# while no CRM instance exists.
sub crm_control {
    my ($self, $action, $data, $lock_fh) = @_;

    if ($action eq 'start') {
        return PVE::HA::CRM->new($data->{crm_env});
    } elsif ($action eq 'stop') {
        # nothing to do - the original note says sim_hardware_cmd sets the
        # instance to undef, which is enough
    } elsif ($action eq 'shutdown') {
        my $crm = $data->{crm};
        # fail with a readable message instead of the generic
        # "Can't call method ... on an undefined value"
        die "unable to shutdown CRM - service not running\n" if !$crm;
        $crm->shutdown_request();
    } else {
        die "unknown CRM control action: '$action'\n";
    }

    # do not leak the value of the last evaluated expression to the caller
    return;
}
97
# Control the simulated local resource manager (LRM) of one node.
#
# $action  - 'start', 'stop' or 'shutdown'
# $data    - per node data hash, the 'lrm_env' and 'lrm' entries are read
# $lock_fh - not used by this implementation
#
# Returns the new PVE::HA::LRM instance for 'start' and nothing for all
# other actions. Dies on an unknown action, or if 'shutdown' is requested
# while no LRM instance exists.
sub lrm_control {
    my ($self, $action, $data, $lock_fh) = @_;

    if ($action eq 'start') {
        return PVE::HA::LRM->new($data->{lrm_env});
    } elsif ($action eq 'stop') {
        # nothing to do - the original note says sim_hardware_cmd sets the
        # instance to undef, which is enough
    } elsif ($action eq 'shutdown') {
        my $lrm = $data->{lrm};
        # fail with a readable message instead of the generic
        # "Can't call method ... on an undefined value"
        die "unable to shutdown LRM - service not running\n" if !$lrm;
        $lrm->shutdown_request();
    } else {
        die "unknown LRM control action: '$action'\n";
    }

    # do not leak the value of the last evaluated expression to the caller
    return;
}
112
# Main loop of the simulation.
#
# Each round lets the CRM and the LRM of every node execute one iteration,
# checks the watchdogs of all nodes, advances the simulated clock and - on
# every fifth counted round - executes the next entry of the command list.
#
# Returns once the command list is used up and more than 500 seconds have
# passed since the last command. Dies with "simulation end\n" when the
# simulated time exceeds $max_sim_time, and on service exits which are not
# implemented by the simulation.
sub run {
    my ($self) = @_;

    my $last_command_time = 0;
    my $next_cmd_at = 0;

    while (1) {

        my $round_start = $self->get_time();

        my @node_names = sort keys %{$self->{nodes}};

        # a round takes two seconds per node, but never less than 20
        my $round_length = scalar(@node_names) * 2;
        if ($round_length < 20) {
            $round_length = 20;
        }

        if ($round_length >= 60) {
            die "unable to simulate so many nodes. You need to increate watchdog/lock timeouts.\n";
        }

        for my $node (@node_names) {

            my $d = $self->{nodes}->{$node};

            if (my $crm = $d->{crm}) {

                my $crm_continues = $crm->do_one_iteration();

                # the simulated clock only ever moves forward
                my $crm_time = $d->{crm_env}->get_time();
                if ($crm_time > $self->{cur_time}) {
                    $self->{cur_time} = $crm_time;
                }

                if (!$crm_continues) {
                    $d->{crm_env}->log('info', "exit (loop end)");
                    $d->{crm} = undef;

                    my $hw_status = $self->read_hardware_status_nolock();
                    my $node_status = $hw_status->{$node} || die "no node status for node '$node'";
                    my $pending = $node_status->{shutdown} || '';

                    if ($pending eq 'reboot') {
                        $self->sim_hardware_cmd("power $node off", 'reboot');
                        $self->sim_hardware_cmd("power $node on", 'reboot');
                    } elsif ($pending eq 'shutdown') {
                        $self->sim_hardware_cmd("power $node off", 'shutdown');
                    } elsif (!$d->{crm_stop}) {
                        die "unexpected CRM exit - not implemented"
                    }

                    $d->{crm_stop} = undef;
                }
            }

            if (my $lrm = $d->{lrm}) {

                my $lrm_continues = $lrm->do_one_iteration();

                # the simulated clock only ever moves forward
                my $lrm_time = $d->{lrm_env}->get_time();
                if ($lrm_time > $self->{cur_time}) {
                    $self->{cur_time} = $lrm_time;
                }

                if (!$lrm_continues) {
                    $d->{lrm_env}->log('info', "exit (loop end)");
                    $d->{lrm} = undef;

                    my $hw_status = $self->read_hardware_status_nolock();
                    my $node_status = $hw_status->{$node} || die "no node status for node '$node'";
                    my $pending = $node_status->{shutdown} || '';

                    if ($d->{lrm_restart}) {
                        die "lrm restart during shutdown - not implemented" if $pending;
                        $d->{lrm_restart} = undef;
                        $d->{lrm} = PVE::HA::LRM->new($d->{lrm_env});
                    } elsif ($pending eq 'reboot' || $pending eq 'shutdown') {
                        # exit the LRM before the CRM to reflect real world behaviour
                        $self->sim_hardware_cmd("crm $node stop", $pending);
                    } else {
                        die "unexpected LRM exit - not implemented"
                    }
                }
            }

            # the watchdogs of all nodes get checked after each single node ran
            for my $n (@node_names) {
                next if $self->watchdog_check($n);

                $self->sim_hardware_cmd("power $n off", 'watchdog');
                $self->log('info', "server '$n' stopped by poweroff (watchdog)");

                my $fenced = $self->{nodes}->{$n};
                $fenced->{crm} = undef;
                $fenced->{lrm} = undef;
            }
        }

        # make sure a round consumes at least $round_length of simulated time
        my $elapsed = $self->{cur_time} - $round_start;
        if ($elapsed < $round_length) {
            $self->{cur_time} = $round_start + $round_length;
        }

        if ($self->{cur_time} > $max_sim_time) {
            die "simulation end\n";
        }

        for my $node (@node_names) {
            my $d = $self->{nodes}->{$node};
            # forced time update
            $d->{lrm_env}->loop_start_hook();
            $d->{crm_env}->loop_start_hook();
        }

        # a 'delay' command is still pending (this round is not counted)
        next if $self->{cur_time} < $next_cmd_at;

        # apply new command after 5 loop iterations
        if (($self->{loop_count} % 5) == 0) {
            my $cmds = shift @{$self->{cmdlist}};

            # end simulation (500 seconds after last command)
            if (!$cmds && ($self->{cur_time} - $last_command_time) > 500) {
                return;
            }

            for my $cmd (@$cmds) {
                $last_command_time = $self->{cur_time};

                if ($cmd =~ m/^delay\s+(\d+)\s*$/) {
                    $next_cmd_at = $self->{cur_time} + $1;
                } else {
                    $self->sim_hardware_cmd($cmd, 'cmdlist');
                }
            }
        }

        ++$self->{loop_count};
    }
}
237
238 1;