]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Sim/Hardware.pm
clean up 'Data::Dumper' usage tree wide
[pve-ha-manager.git] / src / PVE / HA / Sim / Hardware.pm
1 package PVE::HA::Sim::Hardware;
2
3 # Simulate Hardware resources
4
5 # power supply for nodes: on/off
6 # network connection to nodes: on/off
7 # watchdog devices for nodes
8
9 use strict;
10 use warnings;
11 use POSIX qw(strftime EINTR);
12 use JSON;
13 use IO::File;
14 use Fcntl qw(:DEFAULT :flock);
15 use File::Copy;
16 use File::Path qw(make_path remove_tree);
17 use PVE::HA::Config;
18 use PVE::HA::FenceConfig;
19
20 my $watchdog_timeout = 60;
21
22
23 # Status directory layout
24 #
25 # configuration
26 #
27 # $testdir/cmdlist Command list for simulation
28 # $testdir/hardware_status Hardware description (number of nodes, ...)
29 # $testdir/manager_status CRM status (start with {})
30 # $testdir/service_config Service configuration
31 # $testdir/groups HA groups configuration
32 # $testdir/service_status_<node> Service status
33
34 #
35 # runtime status for simulation system
36 #
37 # $testdir/status/cluster_locks Cluster locks
38 # $testdir/status/hardware_status Hardware status (power/network on/off)
39 # $testdir/status/watchdog_status Watchdog status
40 #
41 # runtime status
42 #
43 # $testdir/status/lrm_status_<node> LRM status
44 # $testdir/status/manager_status CRM status
45 # $testdir/status/crm_commands CRM command queue
46 # $testdir/status/service_config Service configuration
47 # $testdir/status/service_status_<node> Service status
48 # $testdir/status/groups HA groups configuration
49
50 sub read_lrm_status {
51 my ($self, $node) = @_;
52
53 my $filename = "$self->{statusdir}/lrm_status_$node";
54
55 return PVE::HA::Tools::read_json_from_file($filename, {});
56 }
57
58 sub write_lrm_status {
59 my ($self, $node, $status_obj) = @_;
60
61 my $filename = "$self->{statusdir}/lrm_status_$node";
62
63 PVE::HA::Tools::write_json_to_file($filename, $status_obj);
64 }
65
66 sub read_hardware_status_nolock {
67 my ($self) = @_;
68
69 my $filename = "$self->{statusdir}/hardware_status";
70
71 my $raw = PVE::Tools::file_get_contents($filename);
72 my $cstatus = decode_json($raw);
73
74 return $cstatus;
75 }
76
77 sub write_hardware_status_nolock {
78 my ($self, $cstatus) = @_;
79
80 my $filename = "$self->{statusdir}/hardware_status";
81
82 PVE::Tools::file_set_contents($filename, encode_json($cstatus));
83 };
84
85 sub read_service_config {
86 my ($self) = @_;
87
88 my $filename = "$self->{statusdir}/service_config";
89 my $conf = PVE::HA::Tools::read_json_from_file($filename);
90
91 foreach my $sid (keys %$conf) {
92 my $d = $conf->{$sid};
93
94 die "service '$sid' without assigned node!" if !$d->{node};
95
96 if ($sid =~ m/^(vm|ct|fa):(\d+)$/) {
97 $d->{type} = $1;
98 $d->{name} = $2;
99 } else {
100 die "implement me";
101 }
102 $d->{state} = 'disabled' if !$d->{state};
103 $d->{state} = 'started' if $d->{state} eq 'enabled'; # backward compatibility
104 $d->{max_restart} = 1 if !defined($d->{max_restart});
105 $d->{max_relocate} = 1 if !defined($d->{max_relocate});
106 }
107
108 return $conf;
109 }
110
111 sub write_service_config {
112 my ($self, $conf) = @_;
113
114 $self->{service_config} = $conf;
115
116 my $filename = "$self->{statusdir}/service_config";
117 return PVE::HA::Tools::write_json_to_file($filename, $conf);
118 }
119
120 sub read_fence_config {
121 my ($self) = @_;
122
123 my $raw = undef;
124
125 my $filename = "$self->{statusdir}/fence.cfg";
126 if (-e $filename) {
127 $raw = PVE::Tools::file_get_contents($filename);
128 }
129
130 return PVE::HA::FenceConfig::parse_config($filename, $raw);
131 }
132
133 sub exec_fence_agent {
134 my ($self, $agent, $node, @param) = @_;
135
136 # let all agent succeed and behave the same for now
137 $self->sim_hardware_cmd("power $node off", $agent);
138
139 return 0; # EXIT_SUCCESS
140 }
141
142 sub set_service_state {
143 my ($self, $sid, $state) = @_;
144
145 my $conf = $self->read_service_config();
146 die "no such service '$sid'" if !$conf->{$sid};
147
148 $conf->{$sid}->{state} = $state;
149
150 $self->write_service_config($conf);
151
152 return $conf;
153 }
154
155 sub add_service {
156 my ($self, $sid, $opts) = @_;
157
158 my $conf = $self->read_service_config();
159 die "resource ID '$sid' already defined\n" if $conf->{$sid};
160
161 $conf->{$sid} = $opts;
162
163 $self->write_service_config($conf);
164
165 return $conf;
166 }
167
168 sub delete_service {
169 my ($self, $sid) = @_;
170
171 my $conf = $self->read_service_config();
172
173 die "no such service '$sid'" if !$conf->{$sid};
174
175 delete $conf->{$sid};
176
177 $self->write_service_config($conf);
178
179 return $conf;
180 }
181
182 sub change_service_location {
183 my ($self, $sid, $current_node, $new_node) = @_;
184
185 my $conf = $self->read_service_config();
186
187 die "no such service '$sid'\n" if !$conf->{$sid};
188
189 die "current_node for '$sid' does not match ($current_node != $conf->{$sid}->{node})\n"
190 if $current_node ne $conf->{$sid}->{node};
191
192 $conf->{$sid}->{node} = $new_node;
193
194 $self->write_service_config($conf);
195 }
196
197 sub service_has_lock {
198 my ($self, $sid) = @_;
199
200 my $conf = $self->read_service_config();
201
202 die "no such service '$sid'\n" if !$conf->{$sid};
203
204 return $conf->{$sid}->{lock};
205 }
206
207 sub lock_service {
208 my ($self, $sid, $lock) = @_;
209
210 my $conf = $self->read_service_config();
211
212 die "no such service '$sid'\n" if !$conf->{$sid};
213
214 $conf->{$sid}->{lock} = $lock || 'backup';
215
216 $self->write_service_config($conf);
217
218 return $conf;
219 }
220
221 sub unlock_service {
222 my ($self, $sid, $lock) = @_;
223
224 my $conf = $self->read_service_config();
225
226 die "no such service '$sid'\n" if !$conf->{$sid};
227
228 if (!defined($conf->{$sid}->{lock})) {
229 return undef;
230 }
231
232 if (defined($lock) && $conf->{$sid}->{lock} ne $lock) {
233 warn "found lock '$conf->{$sid}->{lock}' trying to remove '$lock' lock\n";
234 return undef;
235 }
236
237 my $removed_lock = delete $conf->{$sid}->{lock};
238
239 $self->write_service_config($conf);
240
241 return $removed_lock;
242 }
243
244 sub queue_crm_commands_nolock {
245 my ($self, $cmd) = @_;
246
247 chomp $cmd;
248
249 my $data = '';
250 my $filename = "$self->{statusdir}/crm_commands";
251 if (-f $filename) {
252 $data = PVE::Tools::file_get_contents($filename);
253 }
254 $data .= "$cmd\n";
255 PVE::Tools::file_set_contents($filename, $data);
256
257 return undef;
258 }
259
260 sub queue_crm_commands {
261 my ($self, $cmd) = @_;
262
263 my $code = sub { $self->queue_crm_commands_nolock($cmd); };
264
265 $self->global_lock($code);
266
267 return undef;
268 }
269
270 sub read_crm_commands {
271 my ($self) = @_;
272
273 my $code = sub {
274 my $data = '';
275
276 my $filename = "$self->{statusdir}/crm_commands";
277 if (-f $filename) {
278 $data = PVE::Tools::file_get_contents($filename);
279 }
280 PVE::Tools::file_set_contents($filename, '');
281
282 return $data;
283 };
284
285 return $self->global_lock($code);
286 }
287
288 sub read_group_config {
289 my ($self) = @_;
290
291 my $filename = "$self->{statusdir}/groups";
292 my $raw = '';
293 $raw = PVE::Tools::file_get_contents($filename) if -f $filename;
294
295 return PVE::HA::Config::parse_groups_config($filename, $raw);
296 }
297
298 sub read_service_status {
299 my ($self, $node) = @_;
300
301 my $filename = "$self->{statusdir}/service_status_$node";
302 return PVE::HA::Tools::read_json_from_file($filename);
303 }
304
305 sub write_service_status {
306 my ($self, $node, $data) = @_;
307
308 my $filename = "$self->{statusdir}/service_status_$node";
309 my $res = PVE::HA::Tools::write_json_to_file($filename, $data);
310
311 # fixme: add test if a service runs on two nodes!!!
312
313 return $res;
314 }
315
316 my $default_group_config = <<__EOD;
317 group: prefer_node1
318 nodes node1
319 nofailback 1
320
321 group: prefer_node2
322 nodes node2
323 nofailback 1
324
325 group: prefer_node3
326 nodes node3
327 nofailback 1
328 __EOD
329
330 sub new {
331 my ($this, $testdir) = @_;
332
333 die "missing testdir" if !$testdir;
334
335 die "testdir '$testdir' does not exist or is not a directory!\n"
336 if !-d $testdir;
337
338 my $class = ref($this) || $this;
339
340 my $self = bless {}, $class;
341
342 my $statusdir = $self->{statusdir} = "$testdir/status";
343
344 remove_tree($statusdir);
345 mkdir $statusdir;
346
347 # copy initial configuartion
348 copy("$testdir/manager_status", "$statusdir/manager_status"); # optional
349
350 if (-f "$testdir/groups") {
351 copy("$testdir/groups", "$statusdir/groups");
352 } else {
353 PVE::Tools::file_set_contents("$statusdir/groups", $default_group_config);
354 }
355
356 if (-f "$testdir/service_config") {
357 copy("$testdir/service_config", "$statusdir/service_config");
358 } else {
359 my $conf = {
360 'vm:101' => { node => 'node1', group => 'prefer_node1' },
361 'vm:102' => { node => 'node2', group => 'prefer_node2' },
362 'vm:103' => { node => 'node3', group => 'prefer_node3' },
363 'vm:104' => { node => 'node1', group => 'prefer_node1' },
364 'vm:105' => { node => 'node2', group => 'prefer_node2' },
365 'vm:106' => { node => 'node3', group => 'prefer_node3' },
366 };
367 $self->write_service_config($conf);
368 }
369
370 if (-f "$testdir/hardware_status") {
371 copy("$testdir/hardware_status", "$statusdir/hardware_status") ||
372 die "Copy failed: $!\n";
373 } else {
374 my $cstatus = {
375 node1 => { power => 'off', network => 'off' },
376 node2 => { power => 'off', network => 'off' },
377 node3 => { power => 'off', network => 'off' },
378 };
379 $self->write_hardware_status_nolock($cstatus);
380 }
381
382 if (-f "$testdir/fence.cfg") {
383 copy("$testdir/fence.cfg", "$statusdir/fence.cfg");
384 }
385
386 my $cstatus = $self->read_hardware_status_nolock();
387
388 foreach my $node (sort keys %$cstatus) {
389 $self->{nodes}->{$node} = {};
390
391 if (-f "$testdir/service_status_$node") {
392 copy("$testdir/service_status_$node", "$statusdir/service_status_$node");
393 } else {
394 $self->write_service_status($node, {});
395 }
396 }
397
398 $self->{service_config} = $self->read_service_config();
399
400 return $self;
401 }
402
403 sub get_time {
404 my ($self) = @_;
405
406 die "implement in subclass";
407 }
408
409 sub log {
410 my ($self, $level, $msg, $id) = @_;
411
412 chomp $msg;
413
414 my $time = $self->get_time();
415
416 $id = 'hardware' if !$id;
417
418 printf("%-5s %5d %12s: $msg\n", $level, $time, $id);
419 }
420
421 sub statusdir {
422 my ($self, $node) = @_;
423
424 return $self->{statusdir};
425 }
426
427 sub global_lock {
428 my ($self, $code, @param) = @_;
429
430 my $lockfile = "$self->{statusdir}/hardware.lck";
431 my $fh = IO::File->new(">>$lockfile") ||
432 die "unable to open '$lockfile'\n";
433
434 my $success;
435 for (;;) {
436 $success = flock($fh, LOCK_EX);
437 if ($success || ($! != EINTR)) {
438 last;
439 }
440 if (!$success) {
441 close($fh);
442 die "can't acquire lock '$lockfile' - $!\n";
443 }
444 }
445
446 my $res;
447
448 eval { $res = &$code($fh, @param) };
449 my $err = $@;
450
451 close($fh);
452
453 die $err if $err;
454
455 return $res;
456 }
457
458 my $compute_node_info = sub {
459 my ($self, $cstatus) = @_;
460
461 my $node_info = {};
462
463 my $node_count = 0;
464 my $online_count = 0;
465
466 foreach my $node (keys %$cstatus) {
467 my $d = $cstatus->{$node};
468
469 my $online = ($d->{power} eq 'on' && $d->{network} eq 'on') ? 1 : 0;
470 $node_info->{$node}->{online} = $online;
471
472 $node_count++;
473 $online_count++ if $online;
474 }
475
476 my $quorate = ($online_count > int($node_count/2)) ? 1 : 0;
477
478 if (!$quorate) {
479 foreach my $node (keys %$cstatus) {
480 my $d = $cstatus->{$node};
481 $node_info->{$node}->{online} = 0;
482 }
483 }
484
485 return ($node_info, $quorate);
486 };
487
488 sub get_node_info {
489 my ($self) = @_;
490
491 my $cstatus = $self->read_hardware_status_nolock();
492 my ($node_info, $quorate) = &$compute_node_info($self, $cstatus);
493
494 return ($node_info, $quorate);
495 }
496
497 # simulate hardware commands
498 # power <node> <on|off>
499 # network <node> <on|off>
500 # reboot <node>
501 # shutdown <node>
502 # restart-lrm <node>
503 # service <sid> <started|disabled|stopped>
504 # service <sid> <migrate|relocate> <target>
505 # service <sid> lock/unlock [lockname]
506
507 sub sim_hardware_cmd {
508 my ($self, $cmdstr, $logid) = @_;
509
510 my $code = sub {
511 my ($lock_fh) = @_;
512
513 my $cstatus = $self->read_hardware_status_nolock();
514
515 my ($cmd, $objid, $action, $target) = split(/\s+/, $cmdstr);
516
517 die "sim_hardware_cmd: no node or service for command specified"
518 if !$objid;
519
520 my ($node, $sid, $d);
521
522 if ($cmd eq 'service') {
523 $sid = PVE::HA::Tools::pve_verify_ha_resource_id($objid);
524 } else {
525 $node = $objid;
526 $d = $self->{nodes}->{$node} ||
527 die "sim_hardware_cmd: no such node '$node'\n";
528 }
529
530 $self->log('info', "execute $cmdstr", $logid);
531
532 if ($cmd eq 'power') {
533 die "sim_hardware_cmd: unknown action '$action'\n"
534 if $action !~ m/^(on|off)$/;
535
536 if ($cstatus->{$node}->{power} ne $action) {
537 if ($action eq 'on') {
538
539 $d->{crm} = $self->crm_control('start', $d, $lock_fh) if !defined($d->{crm});
540 $d->{lrm} = $self->lrm_control('start', $d, $lock_fh) if !defined($d->{lrm});
541 $d->{lrm_restart} = undef;
542
543 } else {
544
545 if ($d->{crm}) {
546 $d->{crm_env}->log('info', "killed by poweroff");
547 $self->crm_control('stop', $d, $lock_fh);
548 $d->{crm} = undef;
549 }
550 if ($d->{lrm}) {
551 $d->{lrm_env}->log('info', "killed by poweroff");
552 $self->lrm_control('stop', $d, $lock_fh);
553 $d->{lrm} = undef;
554 $d->{lrm_restart} = undef;
555 }
556
557 $self->watchdog_reset_nolock($node);
558 $self->write_service_status($node, {});
559 }
560 }
561
562 $cstatus->{$node}->{power} = $action;
563 $cstatus->{$node}->{network} = $action;
564 $cstatus->{$node}->{shutdown} = undef;
565
566 $self->write_hardware_status_nolock($cstatus);
567
568 } elsif ($cmd eq 'network') {
569 die "sim_hardware_cmd: unknown network action '$action'"
570 if $action !~ m/^(on|off)$/;
571 $cstatus->{$node}->{network} = $action;
572
573 $self->write_hardware_status_nolock($cstatus);
574
575 } elsif ($cmd eq 'reboot' || $cmd eq 'shutdown') {
576 $cstatus->{$node}->{shutdown} = $cmd;
577
578 $self->write_hardware_status_nolock($cstatus);
579
580 $self->lrm_control('shutdown', $d, $lock_fh) if defined($d->{lrm});
581 } elsif ($cmd eq 'restart-lrm') {
582 if ($d->{lrm}) {
583 $d->{lrm_restart} = 1;
584 $self->lrm_control('shutdown', $d, $lock_fh);
585 }
586 } elsif ($cmd eq 'crm') {
587
588 if ($action eq 'stop') {
589 if ($d->{crm}) {
590 $d->{crm_stop} = 1;
591 $self->crm_control('shutdown', $d, $lock_fh);
592 }
593 } elsif ($action eq 'start') {
594 $d->{crm} = $self->crm_control('start', $d, $lock_fh) if !defined($d->{crm});
595 } else {
596 die "sim_hardware_cmd: unknown action '$action'";
597 }
598
599 } elsif ($cmd eq 'service') {
600 if ($action eq 'started' || $action eq 'disabled' || $action eq 'stopped') {
601
602 $self->set_service_state($sid, $action);
603
604 } elsif ($action eq 'migrate' || $action eq 'relocate') {
605
606 die "sim_hardware_cmd: missing target node for '$action' command"
607 if !$target;
608
609 $self->queue_crm_commands_nolock("$action $sid $target");
610
611 } elsif ($action eq 'add') {
612
613 $self->add_service($sid, {state => 'started', node => $target});
614
615 } elsif ($action eq 'delete') {
616
617 $self->delete_service($sid);
618
619 } elsif ($action eq 'lock') {
620
621 $self->lock_service($sid, $target);
622
623 } elsif ($action eq 'unlock') {
624
625 $self->unlock_service($sid, $target);
626
627 } else {
628 die "sim_hardware_cmd: unknown service action '$action' " .
629 "- not implemented\n"
630 }
631 } else {
632 die "sim_hardware_cmd: unknown command '$cmdstr'\n";
633 }
634
635 return $cstatus;
636 };
637
638 return $self->global_lock($code);
639 }
640
641 # for controlling the resource manager services
642 sub crm_control {
643 my ($self, $action, $data, $lock_fh) = @_;
644
645 die "implement in subclass";
646 }
647
648 sub lrm_control {
649 my ($self, $action, $data, $lock_fh) = @_;
650
651 die "implement in subclass";
652 }
653
654 sub run {
655 my ($self) = @_;
656
657 die "implement in subclass";
658 }
659
660 my $modify_watchog = sub {
661 my ($self, $code) = @_;
662
663 my $update_cmd = sub {
664
665 my $filename = "$self->{statusdir}/watchdog_status";
666
667 my ($res, $wdstatus);
668
669 if (-f $filename) {
670 my $raw = PVE::Tools::file_get_contents($filename);
671 $wdstatus = decode_json($raw);
672 } else {
673 $wdstatus = {};
674 }
675
676 ($wdstatus, $res) = &$code($wdstatus);
677
678 PVE::Tools::file_set_contents($filename, encode_json($wdstatus));
679
680 return $res;
681 };
682
683 return $self->global_lock($update_cmd);
684 };
685
686 sub watchdog_reset_nolock {
687 my ($self, $node) = @_;
688
689 my $filename = "$self->{statusdir}/watchdog_status";
690
691 if (-f $filename) {
692 my $raw = PVE::Tools::file_get_contents($filename);
693 my $wdstatus = decode_json($raw);
694
695 foreach my $id (keys %$wdstatus) {
696 delete $wdstatus->{$id} if $wdstatus->{$id}->{node} eq $node;
697 }
698
699 PVE::Tools::file_set_contents($filename, encode_json($wdstatus));
700 }
701 }
702
703 sub watchdog_check {
704 my ($self, $node) = @_;
705
706 my $code = sub {
707 my ($wdstatus) = @_;
708
709 my $res = 1;
710
711 foreach my $wfh (keys %$wdstatus) {
712 my $wd = $wdstatus->{$wfh};
713 next if $wd->{node} ne $node;
714
715 my $ctime = $self->get_time();
716 my $tdiff = $ctime - $wd->{update_time};
717
718 if ($tdiff > $watchdog_timeout) { # expired
719 $res = 0;
720 delete $wdstatus->{$wfh};
721 }
722 }
723
724 return ($wdstatus, $res);
725 };
726
727 return &$modify_watchog($self, $code);
728 }
729
730 my $wdcounter = 0;
731
732 sub watchdog_open {
733 my ($self, $node) = @_;
734
735 my $code = sub {
736 my ($wdstatus) = @_;
737
738 ++$wdcounter;
739
740 my $id = "WD:$node:$$:$wdcounter";
741
742 die "internal error" if defined($wdstatus->{$id});
743
744 $wdstatus->{$id} = {
745 node => $node,
746 update_time => $self->get_time(),
747 };
748
749 return ($wdstatus, $id);
750 };
751
752 return &$modify_watchog($self, $code);
753 }
754
755 sub watchdog_close {
756 my ($self, $wfh) = @_;
757
758 my $code = sub {
759 my ($wdstatus) = @_;
760
761 my $wd = $wdstatus->{$wfh};
762 die "no such watchdog handle '$wfh'\n" if !defined($wd);
763
764 my $tdiff = $self->get_time() - $wd->{update_time};
765 die "watchdog expired" if $tdiff > $watchdog_timeout;
766
767 delete $wdstatus->{$wfh};
768
769 return ($wdstatus);
770 };
771
772 return &$modify_watchog($self, $code);
773 }
774
775 sub watchdog_update {
776 my ($self, $wfh) = @_;
777
778 my $code = sub {
779 my ($wdstatus) = @_;
780
781 my $wd = $wdstatus->{$wfh};
782
783 die "no such watchdog handle '$wfh'\n" if !defined($wd);
784
785 my $ctime = $self->get_time();
786 my $tdiff = $ctime - $wd->{update_time};
787
788 die "watchdog expired" if $tdiff > $watchdog_timeout;
789
790 $wd->{update_time} = $ctime;
791
792 return ($wdstatus);
793 };
794
795 return &$modify_watchog($self, $code);
796 }
797
798 1;