]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Sim/TestHardware.pm
lrm: implement different behavior for shutdown and restart
[pve-ha-manager.git] / src / PVE / HA / Sim / TestHardware.pm
1 package PVE::HA::Sim::TestHardware;
2
3 # Simulate Hardware resources
4
5 # power supply for nodes: on/off
6 # network connection to nodes: on/off
7 # watchdog devices for nodes
8
9 use strict;
10 use warnings;
11 use POSIX qw(strftime EINTR);
12 use Data::Dumper;
13 use JSON;
14 use IO::File;
15 use Fcntl qw(:DEFAULT :flock);
16 use File::Copy;
17 use File::Path qw(make_path remove_tree);
18
19 use PVE::HA::CRM;
20 use PVE::HA::LRM;
21
22 use PVE::HA::Sim::TestEnv;
23 use base qw(PVE::HA::Sim::Hardware);
24
# Upper bound for the simulated clock: run() dies with "simulation end"
# as soon as cur_time grows beyond this value.
my $max_sim_time = 10_000;
26
# Constructor for the simulated test hardware.
#
# Parameters:
#   $testdir - test case directory; must contain a JSON file named 'cmdlist'
#
# On top of the base class constructor this loads the command list, resets
# the loop counter and the simulated clock, opens "<statusdir>/log" for
# appending and creates a CRM and an LRM test environment for every node.
# The CRM/LRM objects themselves start out undefined; sim_hardware_cmd()
# instantiates them when a node is powered on.
#
# Dies if the log file cannot be opened.
sub new {
    my ($this, $testdir) = @_;

    my $class = ref($this) || $this;
    my $self = $class->SUPER::new($testdir);

    $self->{cmdlist} = decode_json(PVE::Tools::file_get_contents("$testdir/cmdlist"));

    $self->{loop_count} = 0;
    $self->{cur_time} = 0;

    my $logfile = $self->statusdir() . "/log";
    my $logfh = IO::File->new(">>$logfile");
    die "unable to open '$logfile' - $!" if !$logfh;
    $self->{logfh} = $logfh;

    my $nodes = $self->{nodes};
    foreach my $node (sort keys %$nodes) {
        my $d = $nodes->{$node};

        # one environment per service type, CRM first
        foreach my $type (qw(crm lrm)) {
            $d->{"${type}_env"} =
                PVE::HA::Env->new('PVE::HA::Sim::TestEnv', $node, $self, $type);
        }

        # the services themselves are created on power on
        $d->{crm} = undef;
        $d->{lrm} = undef;
    }

    return $self;
}
61
# Return the current simulated time, i.e. the object's 'cur_time' value.
sub get_time {
    my $self = shift;

    return $self->{cur_time};
}
67
# Write one formatted log line to STDOUT and to the simulation log file.
#
# Parameters:
#   $level - log level label (e.g. 'info')
#   $msg   - message text; one trailing newline is stripped
#   $id    - optional originator tag, defaults to 'hardware' when false
#
# The line layout is "<level> <time> <id>: <msg>\n", where <time> is the
# current simulated time as returned by get_time().
sub log {
    my ($self, $level, $msg, $id) = @_;

    chomp $msg;

    my $time = $self->get_time();

    $id = 'hardware' if !$id;

    # The message is passed as a sprintf argument, never as part of the
    # format string: a '%' inside $msg would otherwise be parsed as a
    # conversion and garble the line (and trigger warnings).
    my $line = sprintf("%-5s %5d %12s: %s\n", $level, $time, $id, $msg);
    print $line;

    $self->{logfh}->print($line);
    $self->{logfh}->flush();
}
83
# Simulate a hardware command. Accepted command strings:
#
#   power <node> <on|off>
#   network <node> <on|off>
#   reboot <node>
#   shutdown <node>
#
# Parameters:
#   $cmdstr - the command string, split on whitespace
#   $logid  - id passed to log() for the "execute ..." message
#
# The command is executed through $self->global_lock(); whatever that call
# returns is returned to the caller. Dies on a missing or unknown node, an
# invalid on/off action or an unknown command.
sub sim_hardware_cmd {
    my ($self, $cmdstr, $logid) = @_;

    my $execute = sub {

        my $hwstatus = $self->read_hardware_status_nolock();

        my ($cmd, $node, $action) = split(/\s+/, $cmdstr);

        die "sim_hardware_cmd: no node specified" if !$node;

        my $nodeinfo = $self->{nodes}->{$node};
        die "sim_hardware_cmd: no such node '$node'\n" if !$nodeinfo;

        $self->log('info', "execute $cmdstr", $logid);

        if ($cmd eq 'power') {
            die "sim_hardware_cmd: unknown action '$action'" if $action !~ m/^(on|off)$/;

            # services are only touched when the power state really changes
            my $state_changes = $hwstatus->{$node}->{power} ne $action;

            if ($state_changes && $action eq 'on') {
                # start services that are not running yet
                $nodeinfo->{crm} ||= PVE::HA::CRM->new($nodeinfo->{crm_env});
                $nodeinfo->{lrm} ||= PVE::HA::LRM->new($nodeinfo->{lrm_env});
            } elsif ($state_changes) {
                # power off kills running services, CRM first
                foreach my $service (qw(crm lrm)) {
                    next if !$nodeinfo->{$service};
                    $nodeinfo->{"${service}_env"}->log('info', "killed by poweroff");
                    $nodeinfo->{$service} = undef;
                }
            }

            # the network follows the power state, a pending
            # shutdown/reboot request is cleared
            my $nodestatus = $hwstatus->{$node};
            $nodestatus->{power} = $action;
            $nodestatus->{network} = $action;
            $nodestatus->{shutdown} = undef;

            $self->write_hardware_status_nolock($hwstatus);

        } elsif ($cmd eq 'network') {
            die "sim_hardware_cmd: unknown action '$action'" if $action !~ m/^(on|off)$/;

            $hwstatus->{$node}->{network} = $action;

            $self->write_hardware_status_nolock($hwstatus);

        } elsif ($cmd eq 'reboot' || $cmd eq 'shutdown') {
            # record which kind of shutdown was requested, then notify the LRM
            $hwstatus->{$node}->{shutdown} = $cmd;

            $self->write_hardware_status_nolock($hwstatus);

            $nodeinfo->{lrm}->shutdown_request() if $nodeinfo->{lrm};

        } else {
            die "sim_hardware_cmd: unknown command '$cmdstr'\n";
        }

    };

    return $self->global_lock($execute);
}
151
# Main simulation loop.
#
# Every pass:
#   1. runs one CRM and one LRM iteration for each node that currently has
#      such an instance, moving the simulated clock forward to the latest
#      time reported by the node environments,
#   2. checks all watchdogs after each node and powers off nodes whose
#      watchdog check failed,
#   3. makes sure the clock advanced by at least $looptime and pushes the
#      new time to all node environments,
#   4. on every fifth counted pass takes the next entry from the command
#      list and executes its commands ('delay <n>' postpones further
#      command processing, everything else goes to sim_hardware_cmd()).
#
# An LRM that ends its loop is handled according to the node's recorded
# shutdown request: 'reboot' power cycles the node, 'shutdown' powers it
# off; anything else is fatal.
#
# Returns once the command list is exhausted and more than 500 seconds of
# simulated time passed since the last command. Dies with "simulation end"
# when the clock exceeds $max_sim_time, and on unimplemented situations
# (CRM exit, unexpected LRM exit, too many nodes).
sub run {
    my ($self) = @_;

    my $last_command_time = 0;
    my $next_cmd_at = 0;

    for (;;) {

        my $starttime = $self->get_time();

        my @nodes = sort keys %{$self->{nodes}};

        my $nodecount = scalar(@nodes);

        # two time units per node, but never less than 20 per pass
        my $looptime = $nodecount*2;
        $looptime = 20 if $looptime < 20;

        die "unable to simulate so many nodes. You need to increate watchdog/lock timeouts.\n"
            if $looptime >= 60;

        foreach my $node (@nodes) {

            my $d = $self->{nodes}->{$node};

            if (my $crm = $d->{crm}) {

                $d->{crm_env}->loop_start_hook($self->get_time());

                die "implement me (CRM exit)" if !$crm->do_one_iteration();

                $d->{crm_env}->loop_end_hook();

                my $nodetime = $d->{crm_env}->get_time();
                $self->{cur_time} = $nodetime if $nodetime > $self->{cur_time};
            }

            if (my $lrm = $d->{lrm}) {

                $d->{lrm_env}->loop_start_hook($self->get_time());

                my $exit_lrm = !$lrm->do_one_iteration();

                $d->{lrm_env}->loop_end_hook();

                my $nodetime = $d->{lrm_env}->get_time();
                $self->{cur_time} = $nodetime if $nodetime > $self->{cur_time};

                if ($exit_lrm) {
                    $d->{lrm_env}->log('info', "exit (loop end)");
                    $d->{lrm} = undef;
                    my $cstatus = $self->read_hardware_status_nolock();
                    my $nstatus = $cstatus->{$node} || die "no node status for node '$node'";

                    # No shutdown request recorded: compare against an empty
                    # string so that we reach the die below without
                    # "uninitialized value" warnings.
                    my $shutdown = $nstatus->{shutdown};
                    $shutdown = '' if !defined($shutdown);

                    if ($shutdown eq 'reboot') {
                        $self->sim_hardware_cmd("power $node off", 'reboot');
                        $self->sim_hardware_cmd("power $node on", 'reboot');
                    } elsif ($shutdown eq 'shutdown') {
                        $self->sim_hardware_cmd("power $node off", 'shutdown');
                    } else {
                        die "unexpected LRM exit - not implemented"
                    }
                }
            }

            # check the watchdogs of all nodes after each node's iteration
            foreach my $n (@nodes) {
                if (!$self->watchdog_check($n)) {
                    $self->sim_hardware_cmd("power $n off", 'watchdog');
                    $self->log('info', "server '$n' stopped by poweroff (watchdog)");
                    $self->{nodes}->{$n}->{crm} = undef;
                    $self->{nodes}->{$n}->{lrm} = undef;
                }
            }
        }

        # a pass always takes at least $looptime
        $self->{cur_time} = $starttime + $looptime
            if ($self->{cur_time} - $starttime) < $looptime;

        die "simulation end\n" if $self->{cur_time} > $max_sim_time;

        foreach my $node (@nodes) {
            my $d = $self->{nodes}->{$node};
            # forced time update
            $d->{lrm_env}->loop_start_hook($self->get_time());
            $d->{crm_env}->loop_start_hook($self->get_time());
        }

        # a 'delay' command is still pending; note that such passes do not
        # increment loop_count
        next if $self->{cur_time} < $next_cmd_at;

        # apply new command after 5 loop iterations

        if (($self->{loop_count} % 5) == 0) {
            my $list = shift @{$self->{cmdlist}};
            if (!$list) {
                # end simulation (500 seconds after last command)
                return if (($self->{cur_time} - $last_command_time) > 500);
            }

            # an exhausted command list is treated as an empty entry
            foreach my $cmd (@{$list || []}) {
                $last_command_time = $self->{cur_time};

                if ($cmd =~ m/^delay\s+(\d+)\s*$/) {
                    $next_cmd_at = $self->{cur_time} + $1;
                } else {
                    $self->sim_hardware_cmd($cmd, 'cmdlist');
                }
            }
        }

        ++$self->{loop_count};
    }
}
264
265 1;