]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Sim/Hardware.pm
Env, HW: add HW fencing related functions
[pve-ha-manager.git] / src / PVE / HA / Sim / Hardware.pm
1 package PVE::HA::Sim::Hardware;
2
3 # Simulate Hardware resources
4
5 # power supply for nodes: on/off
6 # network connection to nodes: on/off
7 # watchdog devices for nodes
8
9 use strict;
10 use warnings;
11 use POSIX qw(strftime EINTR);
12 use Data::Dumper;
13 use JSON;
14 use IO::File;
15 use Fcntl qw(:DEFAULT :flock);
16 use File::Copy;
17 use File::Path qw(make_path remove_tree);
18 use PVE::HA::Config;
19 use PVE::HA::FenceConfig;
20
# Maximum age of a simulated watchdog entry, in the time units returned by
# get_time(); the watchdog_* functions treat older entries as expired.
21 my $watchdog_timeout = 60;
22
23
24 # Status directory layout
25 #
26 # configuration
27 #
28 # $testdir/cmdlist Command list for simulation
29 # $testdir/hardware_status Hardware description (number of nodes, ...)
30 # $testdir/manager_status CRM status (start with {})
31 # $testdir/service_config Service configuration
32 # $testdir/groups HA groups configuration
33 # $testdir/service_status_<node> Service status
34
35 #
36 # runtime status for simulation system
37 #
38 # $testdir/status/cluster_locks Cluster locks
39 # $testdir/status/hardware_status Hardware status (power/network on/off)
40 # $testdir/status/watchdog_status Watchdog status
41 #
42 # runtime status
43 #
44 # $testdir/status/lrm_status_<node> LRM status
45 # $testdir/status/manager_status CRM status
46 # $testdir/status/crm_commands CRM command queue
47 # $testdir/status/service_config Service configuration
48 # $testdir/status/service_status_<node> Service status
49 # $testdir/status/groups HA groups configuration
50
# Load the LRM status of $node from "$statusdir/lrm_status_$node".
# An empty hash ref is handed to read_json_from_file() as second argument
# (presumably the default for a missing file - confirm in PVE::HA::Tools).
sub read_lrm_status {
    my $self = shift;
    my $node = shift;

    my $path = join('/', $self->{statusdir}, "lrm_status_$node");

    return PVE::HA::Tools::read_json_from_file($path, {});
}
58
# Store $status_obj as the LRM status of $node in
# "$statusdir/lrm_status_$node" (JSON, via PVE::HA::Tools).
sub write_lrm_status {
    my $self = shift;
    my ($node, $status_obj) = @_;

    PVE::HA::Tools::write_json_to_file(
        "$self->{statusdir}/lrm_status_$node", $status_obj);
}
66
# Read "$statusdir/hardware_status" and return the decoded JSON structure.
# No lock is taken here (hence the _nolock suffix).
sub read_hardware_status_nolock {
    my ($self) = @_;

    my $json = PVE::Tools::file_get_contents("$self->{statusdir}/hardware_status");

    return decode_json($json);
}
77
# JSON-encode $cstatus and write it to "$statusdir/hardware_status".
# No lock is taken here (hence the _nolock suffix).
sub write_hardware_status_nolock {
    my $self = shift;
    my ($cstatus) = @_;

    my $json = encode_json($cstatus);

    PVE::Tools::file_set_contents("$self->{statusdir}/hardware_status", $json);
}
85
# Read "$statusdir/service_config" and normalize every entry:
#  - dies if a service has no node assigned
#  - splits the service ID "<vm|ct|fa>:<number>" into 'type' and 'name'
#    (any other ID format dies with "implement me")
#  - fills in defaults: state 'disabled', max_restart 1, max_relocate 1
# Returns the (modified) configuration hash.
sub read_service_config {
    my ($self) = @_;

    my $conf = PVE::HA::Tools::read_json_from_file("$self->{statusdir}/service_config");

    for my $sid (keys %$conf) {
        my $d = $conf->{$sid};

        die "service '$sid' without assigned node!" unless $d->{node};

        my ($type, $name) = $sid =~ m/^(vm|ct|fa):(\d+)$/
            or die "implement me";
        @{$d}{qw(type name)} = ($type, $name);

        $d->{state} ||= 'disabled';
        $d->{max_restart} //= 1;
        $d->{max_relocate} //= 1;
    }

    return $conf;
}
110
# Remember $conf in $self->{service_config} and write it to
# "$statusdir/service_config". Returns whatever write_json_to_file() returns.
sub write_service_config {
    my ($self, $conf) = @_;

    my $filename = "$self->{statusdir}/service_config";

    $self->{service_config} = $conf;

    return PVE::HA::Tools::write_json_to_file($filename, $conf);
}
119
# Parse "$statusdir/fence.cfg" with PVE::HA::FenceConfig::parse_config().
# When the file does not exist, undef is passed as raw content.
sub read_fence_config {
    my ($self) = @_;

    my $filename = "$self->{statusdir}/fence.cfg";

    my $raw = (-e $filename) ? PVE::Tools::file_get_contents($filename) : undef;

    return PVE::HA::FenceConfig::parse_config($filename, $raw);
}
132
# Simulated fence agent execution: issues the hardware command
# "power <node> off" with $agent as log id and reports success.
# Extra agent parameters (@param) are accepted but ignored.
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # let all agent succeed and behave the same for now
    my $cmd = join(' ', 'power', $node, 'off');
    $self->sim_hardware_cmd($cmd, $agent);

    return 0; # EXIT_SUCCESS
}
141
# Set the 'state' of service $sid to $state and write the configuration
# back. Dies if $sid is not configured. Returns the updated configuration.
sub set_service_state {
    my ($self, $sid, $state) = @_;

    my $conf = $self->read_service_config();

    my $entry = $conf->{$sid} or die "no such service '$sid'";
    $entry->{state} = $state;

    $self->write_service_config($conf);

    return $conf;
}
154
# Add service $sid with the settings in $opts and write the configuration
# back. Dies if $sid already exists. Returns the updated configuration.
sub add_service {
    my ($self, $sid, $opts) = @_;

    my $conf = $self->read_service_config();

    if ($conf->{$sid}) {
        die "resource ID '$sid' already defined\n";
    }

    $conf->{$sid} = $opts;
    $self->write_service_config($conf);

    return $conf;
}
167
# Remove service $sid and write the configuration back.
# Dies if $sid is not configured. Returns the updated configuration.
sub delete_service {
    my ($self, $sid) = @_;

    my $conf = $self->read_service_config();

    die "no such service '$sid'" unless $conf->{$sid};

    delete $conf->{$sid};
    $self->write_service_config($conf);

    return $conf;
}
181
# Move service $sid from $current_node to $new_node in the configuration.
# Dies if $sid is unknown or if the configured node differs from
# $current_node; otherwise writes the changed configuration back.
sub change_service_location {
    my ($self, $sid, $current_node, $new_node) = @_;

    my $conf = $self->read_service_config();

    my $entry = $conf->{$sid};
    die "no such service '$sid'\n" unless $entry;

    if ($current_node ne $entry->{node}) {
        die "current_node for '$sid' does not match ($current_node != $conf->{$sid}->{node})\n";
    }

    $entry->{node} = $new_node;

    $self->write_service_config($conf);
}
196
# Append $cmd (with exactly one trailing newline) to
# "$statusdir/crm_commands", creating the file if needed.
# No lock is taken here. Always returns undef.
sub queue_crm_commands_nolock {
    my ($self, $cmd) = @_;

    chomp $cmd;

    my $filename = "$self->{statusdir}/crm_commands";

    my $queued = (-f $filename) ? PVE::Tools::file_get_contents($filename) : '';

    PVE::Tools::file_set_contents($filename, $queued . "$cmd\n");

    return undef;
}
212
# Append $cmd to the CRM command queue while holding the global lock.
# Always returns undef.
sub queue_crm_commands {
    my ($self, $cmd) = @_;

    $self->global_lock(sub { $self->queue_crm_commands_nolock($cmd); });

    return undef;
}
222
# Fetch and clear the CRM command queue while holding the global lock.
# Returns the previous content of "$statusdir/crm_commands" ('' if the
# file did not exist); the file is (re)written empty in any case.
sub read_crm_commands {
    my ($self) = @_;

    return $self->global_lock(sub {
        my $filename = "$self->{statusdir}/crm_commands";

        my $data = (-f $filename) ? PVE::Tools::file_get_contents($filename) : '';

        PVE::Tools::file_set_contents($filename, '');

        return $data;
    });
}
240
# Parse "$statusdir/groups" with PVE::HA::Config::parse_groups_config().
# A missing file is parsed as empty content.
sub read_group_config {
    my ($self) = @_;

    my $filename = "$self->{statusdir}/groups";

    my $raw = (-f $filename) ? PVE::Tools::file_get_contents($filename) : '';

    return PVE::HA::Config::parse_groups_config($filename, $raw);
}
250
# Load the service status of $node from "$statusdir/service_status_$node".
sub read_service_status {
    my $self = shift;
    my ($node) = @_;

    return PVE::HA::Tools::read_json_from_file(
        "$self->{statusdir}/service_status_$node");
}
257
# Store $data as the service status of $node in
# "$statusdir/service_status_$node". Returns the result of
# write_json_to_file().
sub write_service_status {
    my ($self, $node, $data) = @_;

    my $res = PVE::HA::Tools::write_json_to_file(
        "$self->{statusdir}/service_status_$node", $data);

    # fixme: add test if a service runs on two nodes!!!

    return $res;
}
268
# Default HA group configuration. new() writes this text to
# "$statusdir/groups" when the test directory has no 'groups' file.
269 my $default_group_config = <<__EOD;
270 group: prefer_node1
271 nodes node1
272 nofailback 1
273
274 group: prefer_node2
275 nodes node2
276 nofailback 1
277
278 group: prefer_node3
279 nodes node3
280 nofailback 1
281 __EOD
282
# Constructor: build a simulated hardware object for the test in $testdir.
#
# Recreates "$testdir/status" from scratch and seeds it from the files in
# $testdir. For 'groups', 'service_config' and 'hardware_status' built-in
# defaults (three nodes, six VMs, three groups) are used when $testdir has
# no such file. 'manager_status' and 'fence.cfg' are optional.
#
# Dies if $testdir is missing, if the status directory cannot be created,
# or if a configuration file that exists in $testdir cannot be copied.
# Returns the blessed object with {statusdir}, {nodes} and {service_config}.
sub new {
    my ($this, $testdir) = @_;

    die "missing testdir" if !$testdir;

    my $class = ref($this) || $this;

    my $self = bless {}, $class;

    my $statusdir = $self->{statusdir} = "$testdir/status";

    # always start with a clean status directory
    remove_tree($statusdir);
    if (!mkdir($statusdir)) {
        my $err = $!; # save it, the -d test below may overwrite $!
        die "unable to create status directory '$statusdir' - $err\n"
            if !-d $statusdir;
    }

    # a file we just found must be copyable - do not continue with
    # half-initialized state if that fails
    my $copy_or_die = sub {
        my ($name) = @_;
        copy("$testdir/$name", "$statusdir/$name") ||
            die "Copy failed: $!\n";
    };

    # copy initial configuration
    copy("$testdir/manager_status", "$statusdir/manager_status"); # optional

    if (-f "$testdir/groups") {
        $copy_or_die->('groups');
    } else {
        PVE::Tools::file_set_contents("$statusdir/groups", $default_group_config);
    }

    if (-f "$testdir/service_config") {
        $copy_or_die->('service_config');
    } else {
        my $conf = {
            'vm:101' => { node => 'node1', group => 'prefer_node1' },
            'vm:102' => { node => 'node2', group => 'prefer_node2' },
            'vm:103' => { node => 'node3', group => 'prefer_node3' },
            'vm:104' => { node => 'node1', group => 'prefer_node1' },
            'vm:105' => { node => 'node2', group => 'prefer_node2' },
            'vm:106' => { node => 'node3', group => 'prefer_node3' },
        };
        $self->write_service_config($conf);
    }

    if (-f "$testdir/hardware_status") {
        $copy_or_die->('hardware_status');
    } else {
        my $cstatus = {
            node1 => { power => 'off', network => 'off' },
            node2 => { power => 'off', network => 'off' },
            node3 => { power => 'off', network => 'off' },
        };
        $self->write_hardware_status_nolock($cstatus);
    }

    if (-f "$testdir/fence.cfg") {
        $copy_or_die->('fence.cfg');
    }

    my $cstatus = $self->read_hardware_status_nolock();

    # every node listed in the hardware status gets a service status file
    foreach my $node (sort keys %$cstatus) {
        $self->{nodes}->{$node} = {};

        if (-f "$testdir/service_status_$node") {
            $copy_or_die->("service_status_$node");
        } else {
            $self->write_service_status($node, {});
        }
    }

    $self->{service_config} = $self->read_service_config();

    return $self;
}
352
# Abstract method: return the current simulation time.
# The base class only dies; subclasses have to provide it.
sub get_time {
    my $self = shift;

    die "implement in subclass";
}
358
# Print one log line to STDOUT in the form
#   "<level> <time> <id>: <message>"
# $msg has its trailing newline removed; $id defaults to 'hardware'.
# The time stamp comes from $self->get_time().
sub log {
    my ($self, $level, $msg, $id) = @_;

    chomp $msg;

    my $time = $self->get_time();

    $id = 'hardware' if !$id;

    # pass the message as argument, never as part of the format string -
    # otherwise a '%' inside $msg is interpreted as conversion
    printf("%-5s %5d %12s: %s\n", $level, $time, $id, $msg);
}
370
# Accessor for the status directory path.
# An optional $node argument is accepted but not used.
sub statusdir {
    my $self = shift;

    return $self->{statusdir};
}
376
# Run $code while holding an exclusive flock on "$statusdir/hardware.lck".
#
# $code is called as $code->($fh, @param), where $fh is the locked file
# handle. The lock is released (handle closed) afterwards, also when
# $code dies; in that case the error is re-thrown.
#
# Returns the (scalar) result of $code. Dies if the lock file cannot be
# opened or the lock cannot be acquired.
sub global_lock {
    my ($self, $code, @param) = @_;

    my $lockfile = "$self->{statusdir}/hardware.lck";
    my $fh = IO::File->new($lockfile, '>>') ||
        die "unable to open '$lockfile'\n";

    # retry only if flock() got interrupted by a signal
    my $success;
    for (;;) {
        $success = flock($fh, LOCK_EX);
        last if $success || ($! != EINTR);
    }

    # any other failure is fatal - never run $code without the lock
    if (!$success) {
        my $err = $!; # close() may overwrite $!
        close($fh);
        die "can't acquire lock '$lockfile' - $err\n";
    }

    my $res;

    eval { $res = $code->($fh, @param) };
    my $err = $@;

    close($fh);

    die $err if $err;

    return $res;
}
407
# Derive per-node online state and cluster quorum from the hardware status.
#
# A node counts as online when both 'power' and 'network' are 'on'. The
# cluster is quorate when more than half of all nodes are online; without
# quorum every node is reported as offline.
# Returns ($node_info, $quorate) with $node_info->{<node>}->{online} = 0|1.
my $compute_node_info = sub {
    my ($self, $cstatus) = @_;

    my @nodes = keys %$cstatus;

    my %is_up;
    for my $name (@nodes) {
        my $hw = $cstatus->{$name};
        $is_up{$name} = ($hw->{power} eq 'on' && $hw->{network} eq 'on') ? 1 : 0;
    }

    my $total = scalar(@nodes);
    my $up_count = grep { $is_up{$_} } @nodes;

    my $quorate = ($up_count > int($total/2)) ? 1 : 0;

    my $node_info = {};
    for my $name (@nodes) {
        $node_info->{$name}->{online} = $quorate ? $is_up{$name} : 0;
    }

    return ($node_info, $quorate);
};
437
# Read the current hardware status (without locking) and return
# ($node_info, $quorate) as computed by $compute_node_info.
sub get_node_info {
    my ($self) = @_;

    my $hw_status = $self->read_hardware_status_nolock();

    my ($node_info, $quorate) = $compute_node_info->($self, $hw_status);

    return ($node_info, $quorate);
}
446
447 # simulate hardware commands
448 # power <node> <on|off>
449 # network <node> <on|off>
450
# Abstract method: execute the simulated hardware command $cmdstr,
# $logid being the id used for logging.
# The base class only dies; subclasses have to provide it.
sub sim_hardware_cmd {
    my $self = shift;
    my ($cmdstr, $logid) = @_;

    die "implement in subclass";
}
456
# Abstract method: run the simulation.
# The base class only dies; subclasses have to provide it.
sub run {
    my $self = shift;

    die "implement in subclass";
}
462
# Read-modify-write helper for "$statusdir/watchdog_status", executed
# under the global lock.
#
# $code is called with the decoded watchdog status ({} if the file does
# not exist yet) and has to return ($wdstatus, $res). The returned status
# is written back as JSON, $res is handed to the caller.
my $modify_watchog = sub {
    my ($self, $code) = @_;

    return $self->global_lock(sub {
        my $filename = "$self->{statusdir}/watchdog_status";

        my $wdstatus = (-f $filename)
            ? decode_json(PVE::Tools::file_get_contents($filename))
            : {};

        my $res;
        ($wdstatus, $res) = $code->($wdstatus);

        PVE::Tools::file_set_contents($filename, encode_json($wdstatus));

        return $res;
    });
};
488
# Drop all watchdog entries that belong to $node from
# "$statusdir/watchdog_status". Does nothing if the file does not exist.
# No lock is taken here.
sub watchdog_reset_nolock {
    my ($self, $node) = @_;

    my $filename = "$self->{statusdir}/watchdog_status";

    if (-f $filename) {
        my $wdstatus = decode_json(PVE::Tools::file_get_contents($filename));

        my @owned = grep { $wdstatus->{$_}->{node} eq $node } keys %$wdstatus;
        delete @{$wdstatus}{@owned};

        PVE::Tools::file_set_contents($filename, encode_json($wdstatus));
    }
}
505
# Check all watchdogs of $node.
# Every entry of $node whose last update is older than $watchdog_timeout
# is removed. Returns 1 if none of them expired, 0 otherwise.
sub watchdog_check {
    my ($self, $node) = @_;

    return $modify_watchog->($self, sub {
        my ($wdstatus) = @_;

        my $alive = 1;

        my @owned = grep { $wdstatus->{$_}->{node} eq $node } keys %$wdstatus;

        for my $wfh (@owned) {
            my $age = $self->get_time() - $wdstatus->{$wfh}->{update_time};

            next unless $age > $watchdog_timeout;

            # expired
            $alive = 0;
            delete $wdstatus->{$wfh};
        }

        return ($wdstatus, $alive);
    });
}
532
# Counter used to give every watchdog opened by this process its own id.
my $wdcounter = 0;

# Open a new watchdog for $node.
# Registers an entry "WD:<node>:<pid>:<counter>" with the current time as
# update time and returns that id as watchdog handle.
sub watchdog_open {
    my ($self, $node) = @_;

    return $modify_watchog->($self, sub {
        my ($wdstatus) = @_;

        $wdcounter++;

        my $id = join(':', 'WD', $node, $$, $wdcounter);

        die "internal error" if defined($wdstatus->{$id});

        $wdstatus->{$id} = { node => $node, update_time => $self->get_time() };

        return ($wdstatus, $id);
    });
}
557
# Close the watchdog identified by handle $wfh and remove its entry.
# Dies if the handle is unknown or if the watchdog already expired
# (last update older than $watchdog_timeout).
sub watchdog_close {
    my ($self, $wfh) = @_;

    return $modify_watchog->($self, sub {
        my ($wdstatus) = @_;

        my $wd = $wdstatus->{$wfh};
        if (!defined($wd)) {
            die "no such watchdog handle '$wfh'\n";
        }

        my $age = $self->get_time() - $wd->{update_time};
        if ($age > $watchdog_timeout) {
            die "watchdog expired";
        }

        delete $wdstatus->{$wfh};

        return ($wdstatus);
    });
}
577
# Refresh the watchdog identified by handle $wfh: its update time is set
# to the current time. Dies if the handle is unknown or if the watchdog
# already expired (last update older than $watchdog_timeout).
sub watchdog_update {
    my ($self, $wfh) = @_;

    return $modify_watchog->($self, sub {
        my ($wdstatus) = @_;

        my $wd = $wdstatus->{$wfh};
        if (!defined($wd)) {
            die "no such watchdog handle '$wfh'\n";
        }

        my $now = $self->get_time();
        if (($now - $wd->{update_time}) > $watchdog_timeout) {
            die "watchdog expired";
        }

        $wd->{update_time} = $now;

        return ($wdstatus);
    });
}
600
601 1;