]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Sim/Env.pm
move cfs update to common code
[pve-ha-manager.git] / src / PVE / HA / Sim / Env.pm
1 package PVE::HA::Sim::Env;
2
3 use strict;
4 use warnings;
5 use POSIX qw(strftime EINTR);
6 use JSON;
7 use IO::File;
8 use Fcntl qw(:DEFAULT :flock);
9
10 use PVE::HA::Tools;
11 use PVE::HA::Env;
12 use PVE::HA::Resources;
13 use PVE::HA::Sim::Resources::VirtVM;
14 use PVE::HA::Sim::Resources::VirtCT;
15 use PVE::HA::Sim::Resources::VirtFail;
16
17 PVE::HA::Sim::Resources::VirtVM->register();
18 PVE::HA::Sim::Resources::VirtCT->register();
19 PVE::HA::Sim::Resources::VirtFail->register();
20
21 PVE::HA::Resources->init();
22
23 sub new {
24 my ($this, $nodename, $hardware, $log_id) = @_;
25
26 die "missing nodename" if !$nodename;
27 die "missing log_id" if !$log_id;
28
29 my $class = ref($this) || $this;
30
31 my $self = bless {}, $class;
32
33 $self->{statusdir} = $hardware->statusdir();
34 $self->{nodename} = $nodename;
35
36 $self->{hardware} = $hardware;
37 $self->{lock_timeout} = 120;
38
39 $self->{log_id} = $log_id;
40
41 return $self;
42 }
43
44 sub nodename {
45 my ($self) = @_;
46
47 return $self->{nodename};
48 }
49
50 sub hardware {
51 my ($self) = @_;
52
53 return $self->{hardware};
54 }
55
56 my $assert_cfs_can_rw = sub {
57 my ($self, $emsg) = @_;
58
59 $emsg //= 'cfs connection refused - not mounted?';
60
61 die "$emsg\n"
62 if !$self->{hardware}->get_cfs_state($self->{nodename}, 'rw');
63 };
64
65 sub sim_get_lock {
66 my ($self, $lock_name, $unlock) = @_;
67
68 return 0 if !$self->quorate();
69
70 my $filename = "$self->{statusdir}/cluster_locks";
71
72 my $code = sub {
73
74 my $data = PVE::HA::Tools::read_json_from_file($filename, {});
75
76 my $res;
77
78 my $nodename = $self->nodename();
79 my $ctime = $self->get_time();
80
81 if ($unlock) {
82
83 if (my $d = $data->{$lock_name}) {
84 my $tdiff = $ctime - $d->{time};
85
86 if ($tdiff > $self->{lock_timeout}) {
87 $res = 1;
88 } elsif (($tdiff <= $self->{lock_timeout}) && ($d->{node} eq $nodename)) {
89 delete $data->{$lock_name};
90 $res = 1;
91 } else {
92 $res = 0;
93 }
94 }
95
96 } else {
97
98 if (my $d = $data->{$lock_name}) {
99
100 my $tdiff = $ctime - $d->{time};
101
102 if ($tdiff <= $self->{lock_timeout}) {
103 if ($d->{node} eq $nodename) {
104 $d->{time} = $ctime;
105 $res = 1;
106 } else {
107 $res = 0;
108 }
109 } else {
110 $self->log('info', "got lock '$lock_name'");
111 $d->{node} = $nodename;
112 $d->{time} = $ctime;
113 $res = 1;
114 }
115
116 } else {
117 $data->{$lock_name} = {
118 time => $ctime,
119 node => $nodename,
120 };
121 $self->log('info', "got lock '$lock_name'");
122 $res = 1;
123 }
124 }
125
126 PVE::HA::Tools::write_json_to_file($filename, $data);
127
128 return $res;
129 };
130
131 return $self->{hardware}->global_lock($code);
132 }
133
134 sub read_manager_status {
135 my ($self) = @_;
136
137 $assert_cfs_can_rw->($self);
138
139 my $filename = "$self->{statusdir}/manager_status";
140
141 return PVE::HA::Tools::read_json_from_file($filename, {});
142 }
143
144 sub write_manager_status {
145 my ($self, $status_obj) = @_;
146
147 $assert_cfs_can_rw->($self);
148
149 my $filename = "$self->{statusdir}/manager_status";
150
151 PVE::HA::Tools::write_json_to_file($filename, $status_obj);
152 }
153
154 sub read_lrm_status {
155 my ($self, $node) = @_;
156
157 $node = $self->{nodename} if !defined($node);
158
159 $assert_cfs_can_rw->($self);
160
161 return $self->{hardware}->read_lrm_status($node);
162 }
163
164 sub write_lrm_status {
165 my ($self, $status_obj) = @_;
166
167 my $node = $self->{nodename};
168
169 $assert_cfs_can_rw->($self);
170
171 return $self->{hardware}->write_lrm_status($node, $status_obj);
172 }
173
174 sub is_node_shutdown {
175 my ($self) = @_;
176
177 my $node = $self->{nodename};
178 my $cstatus = $self->{hardware}->read_hardware_status_nolock();
179
180 die "undefined node status for node '$node'" if !defined($cstatus->{$node});
181
182 my ($shutdown, $reboot) = (0, 0);
183
184 if (my $target = $cstatus->{$node}->{shutdown}) {
185 if ($target eq 'shutdown') {
186 $shutdown = 1;
187 } elsif ($target eq 'reboot') {
188 $shutdown = 1;
189 $reboot = 1;
190 } else {
191 die "unknown shutdown target '$target'";
192 }
193 }
194
195 return ($shutdown, $reboot);
196 }
197
198 sub read_service_config {
199 my ($self) = @_;
200
201 $assert_cfs_can_rw->($self);
202
203 return $self->{hardware}->read_service_config();
204 }
205
206 sub read_fence_config {
207 my ($self) = @_;
208
209 $assert_cfs_can_rw->($self);
210
211 return $self->{hardware}->read_fence_config();
212 }
213
214 # the test/sim framework has hardware enabled fencing if
215 # it has devices configured
216 sub fencing_mode {
217 my ($self) = @_;
218
219 my $cfg = $self->read_fence_config();
220
221 return (defined($cfg) && keys %{$cfg}) ? 'hardware' : 'watchdog';
222 }
223
224 sub exec_fence_agent {
225 my ($self, $agent, $node, @param) = @_;
226
227 return $self->{hardware}->exec_fence_agent($agent, $node, @param);
228 }
229
230 sub read_group_config {
231 my ($self) = @_;
232
233 $assert_cfs_can_rw->($self);
234
235 return $self->{hardware}->read_group_config();
236 }
237
238 # this is normally only allowed by the master to recover a _fenced_ service
239 sub steal_service {
240 my ($self, $sid, $current_node, $new_node) = @_;
241
242 $assert_cfs_can_rw->($self);
243
244 return $self->{hardware}->change_service_location($sid, $current_node, $new_node);
245 }
246
247 sub queue_crm_commands {
248 my ($self, $cmd) = @_;
249
250 $assert_cfs_can_rw->($self);
251
252 return $self->{hardware}->queue_crm_commands($cmd);
253 }
254
255 sub read_crm_commands {
256 my ($self) = @_;
257
258 $assert_cfs_can_rw->($self);
259
260 return $self->{hardware}->read_crm_commands();
261 }
262
263 sub log {
264 my ($self, $level, $msg) = @_;
265
266 chomp $msg;
267
268 my $time = $self->get_time();
269
270 printf("%-5s %5d %12s: $msg\n", $level, $time, "$self->{nodename}/$self->{log_id}");
271 }
272
273 sub sendmail {
274 my ($self, $subject, $text) = @_;
275
276 # only log subject, do not spam the logs
277 $self->log('email', $subject);
278 }
279
280 sub get_time {
281 my ($self) = @_;
282
283 die "implement in subclass";
284 }
285
286 sub sleep {
287 my ($self, $delay) = @_;
288
289 die "implement in subclass";
290 }
291
292 sub sleep_until {
293 my ($self, $end_time) = @_;
294
295 die "implement in subclass";
296 }
297
298 sub get_ha_manager_lock {
299 my ($self) = @_;
300
301 return $self->sim_get_lock('ha_manager_lock');
302 }
303
304 # release the cluster wide manager lock.
305 # when released another CRM may step up and get the lock, thus this should only
306 # get called when shutting down/deactivating the current master
307 sub release_ha_manager_lock {
308 my ($self) = @_;
309
310 return $self->sim_get_lock('ha_manager_lock', 1);
311 }
312
313 sub get_ha_agent_lock_name {
314 my ($self, $node) = @_;
315
316 $node = $self->nodename() if !$node;
317
318 return "ha_agent_${node}_lock";
319 }
320
321 sub get_ha_agent_lock {
322 my ($self, $node) = @_;
323
324 my $lck = $self->get_ha_agent_lock_name($node);
325 return $self->sim_get_lock($lck);
326 }
327
328
329 # release the respective node agent lock.
330 # this should only get called if the nodes LRM gracefully shuts down with
331 # all services already cleanly stopped!
332 sub release_ha_agent_lock {
333 my ($self) = @_;
334
335 my $node = $self->nodename();
336
337 my $lock = $self->get_ha_agent_lock_name($node);
338 return $self->sim_get_lock($lock, 1);
339 }
340
341 # return true when cluster is quorate
342 sub quorate {
343 my ($self) = @_;
344
345 my ($node_info, $quorate) = $self->{hardware}->get_node_info();
346 my $node = $self->nodename();
347 return 0 if !$node_info->{$node}->{online};
348 return $quorate;
349 }
350
351 sub get_node_info {
352 my ($self) = @_;
353
354 return $self->{hardware}->get_node_info();
355 }
356
357 sub loop_start_hook {
358 my ($self) = @_;
359
360 # do nothing, overwrite in subclass
361 }
362
363 sub loop_end_hook {
364 my ($self) = @_;
365
366 # do nothing, overwrite in subclass
367 }
368
369
370 sub cluster_state_update {
371 my ($self) = @_;
372
373 return $self->{hardware}->get_cfs_state($self->{nodename}, 'update');
374 }
375
376 sub watchdog_open {
377 my ($self) = @_;
378
379 my $node = $self->nodename();
380
381 return $self->{hardware}->watchdog_open($node);
382 }
383
384 sub watchdog_update {
385 my ($self, $wfh) = @_;
386
387 return $self->{hardware}->watchdog_update($wfh);
388 }
389
390 sub watchdog_close {
391 my ($self, $wfh) = @_;
392
393 return $self->{hardware}->watchdog_close($wfh);
394 }
395
396 sub after_fork {
397 my ($self) = @_;
398
399 # nothing to clean up in the simulation environment
400 }
401
402
403 sub get_max_workers {
404 my ($self) = @_;
405
406 return 4;
407 }
408
409 1;