]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Sim/Hardware.pm
rename request state 'enabled' to 'started'
[pve-ha-manager.git] / src / PVE / HA / Sim / Hardware.pm
1 package PVE::HA::Sim::Hardware;
2
3 # Simulate Hardware resources
4
5 # power supply for nodes: on/off
6 # network connection to nodes: on/off
7 # watchdog devices for nodes
8
9 use strict;
10 use warnings;
11 use POSIX qw(strftime EINTR);
12 use Data::Dumper;
13 use JSON;
14 use IO::File;
15 use Fcntl qw(:DEFAULT :flock);
16 use File::Copy;
17 use File::Path qw(make_path remove_tree);
18 use PVE::HA::Config;
19 use PVE::HA::FenceConfig;
20
# Watchdog expiry threshold: a watchdog entry is treated as expired once the
# time since its last update (as reported by get_time()) exceeds this value.
# NOTE(review): unit is whatever get_time() returns - presumably seconds.
my $watchdog_timeout = 60;
22
23
24 # Status directory layout
25 #
26 # configuration
27 #
28 # $testdir/cmdlist Command list for simulation
29 # $testdir/hardware_status Hardware description (number of nodes, ...)
30 # $testdir/manager_status CRM status (start with {})
31 # $testdir/service_config Service configuration
32 # $testdir/groups HA groups configuration
33 # $testdir/service_status_<node> Service status
34
35 #
36 # runtime status for simulation system
37 #
38 # $testdir/status/cluster_locks Cluster locks
39 # $testdir/status/hardware_status Hardware status (power/network on/off)
40 # $testdir/status/watchdog_status Watchdog status
41 #
42 # runtime status
43 #
44 # $testdir/status/lrm_status_<node> LRM status
45 # $testdir/status/manager_status CRM status
46 # $testdir/status/crm_commands CRM command queue
47 # $testdir/status/service_config Service configuration
48 # $testdir/status/service_status_<node> Service status
49 # $testdir/status/groups HA groups configuration
50
# Read the LRM status of $node from "<statusdir>/lrm_status_<node>".
#
# The empty hash passed as second argument is forwarded unchanged to
# PVE::HA::Tools::read_json_from_file (presumably the default value used
# when the file does not exist - confirm in PVE::HA::Tools).
sub read_lrm_status {
    my ($self, $node) = @_;

    return PVE::HA::Tools::read_json_from_file(
        "$self->{statusdir}/lrm_status_$node",
        {},
    );
}
58
# Store $status_obj as the LRM status of $node in
# "<statusdir>/lrm_status_<node>" (JSON, via PVE::HA::Tools).
sub write_lrm_status {
    my ($self, $node, $status_obj) = @_;

    my $target = "$self->{statusdir}/lrm_status_" . $node;

    PVE::HA::Tools::write_json_to_file($target, $status_obj);
}
66
# Read and decode "<statusdir>/hardware_status" (JSON).
#
# As the name says, no lock is taken here; callers needing a consistent
# view must hold the global lock themselves.
# Dies if the file cannot be read or does not contain valid JSON.
sub read_hardware_status_nolock {
    my ($self) = @_;

    my $content = PVE::Tools::file_get_contents("$self->{statusdir}/hardware_status");
    my $hw_status = decode_json($content);

    return $hw_status;
}
77
# Serialize $cstatus to JSON and write it to "<statusdir>/hardware_status".
#
# No lock is taken here; see global_lock() for serialized access.
sub write_hardware_status_nolock {
    my ($self, $cstatus) = @_;

    my $target = "$self->{statusdir}/hardware_status";
    my $json = encode_json($cstatus);

    PVE::Tools::file_set_contents($target, $json);
}
85
# Read "<statusdir>/service_config" and normalize every service entry.
#
# For each service ID the entry gets:
#   type / name   - split from the SID, which must look like (vm|ct|fa):<digits>
#   state         - 'disabled' when unset; legacy 'enabled' is mapped to 'started'
#   max_restart   - defaults to 1
#   max_relocate  - defaults to 1
#
# Dies if a service has no node assigned or if the SID has an unsupported form.
# Returns the (modified) config hash.
sub read_service_config {
    my ($self) = @_;

    my $conf = PVE::HA::Tools::read_json_from_file("$self->{statusdir}/service_config");

    for my $sid (keys %$conf) {
        my $entry = $conf->{$sid};

        die "service '$sid' without assigned node!" if !$entry->{node};

        # list assignment in scalar context yields 0 when the pattern fails
        my ($type, $name) = $sid =~ m/^(vm|ct|fa):(\d+)$/
            or die "implement me";

        $entry->{type} = $type;
        $entry->{name} = $name;

        $entry->{state} ||= 'disabled';
        $entry->{state} = 'started' if $entry->{state} eq 'enabled'; # backward compatibility

        $entry->{max_restart} //= 1;
        $entry->{max_relocate} //= 1;
    }

    return $conf;
}
111
# Replace the service configuration: caches $conf in $self->{service_config}
# and writes it to "<statusdir>/service_config".
# Returns whatever PVE::HA::Tools::write_json_to_file returns.
sub write_service_config {
    my ($self, $conf) = @_;

    # keep the in-memory copy in step with the file
    $self->{service_config} = $conf;

    return PVE::HA::Tools::write_json_to_file(
        "$self->{statusdir}/service_config",
        $conf,
    );
}
120
# Parse "<statusdir>/fence.cfg".
#
# When the file does not exist, undef is handed to the parser as raw content.
sub read_fence_config {
    my ($self) = @_;

    my $filename = "$self->{statusdir}/fence.cfg";

    my $raw = (-e $filename)
        ? PVE::Tools::file_get_contents($filename)
        : undef;

    return PVE::HA::FenceConfig::parse_config($filename, $raw);
}
133
# Simulate running fence agent $agent against $node.
#
# Every agent behaves the same for now: the node is powered off through
# sim_hardware_cmd(), with the agent name used as log id. @param is ignored.
# Always returns 0 (EXIT_SUCCESS).
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    my $power_off = "power $node off";
    $self->sim_hardware_cmd($power_off, $agent);

    return 0; # EXIT_SUCCESS
}
142
# Set the requested state of service $sid and persist the configuration.
#
# Dies if the service does not exist. Returns the updated config hash.
sub set_service_state {
    my ($self, $sid, $state) = @_;

    my $conf = $self->read_service_config();

    my $service = $conf->{$sid}
        or die "no such service '$sid'";

    $service->{state} = $state;
    $self->write_service_config($conf);

    return $conf;
}
155
# Add a new service $sid with options $opts and persist the configuration.
#
# Dies if the ID is already in use. Returns the updated config hash.
sub add_service {
    my ($self, $sid, $opts) = @_;

    my $conf = $self->read_service_config();

    if ($conf->{$sid}) {
        die "resource ID '$sid' already defined\n";
    }

    $conf->{$sid} = $opts;
    $self->write_service_config($conf);

    return $conf;
}
168
# Remove service $sid from the configuration and persist the result.
#
# Dies if the service does not exist. Returns the updated config hash.
sub delete_service {
    my ($self, $sid) = @_;

    my $conf = $self->read_service_config();

    if (!$conf->{$sid}) {
        die "no such service '$sid'";
    }

    delete $conf->{$sid};
    $self->write_service_config($conf);

    return $conf;
}
182
# Move service $sid from $current_node to $new_node in the configuration.
#
# Dies if the service is unknown or if the configured node differs from
# $current_node. The result of write_service_config() is the return value.
sub change_service_location {
    my ($self, $sid, $current_node, $new_node) = @_;

    my $conf = $self->read_service_config();

    my $service = $conf->{$sid};
    die "no such service '$sid'\n" if !$service;

    my $configured_node = $service->{node};
    if ($current_node ne $configured_node) {
        die "current_node for '$sid' does not match ($current_node != $configured_node)\n";
    }

    $service->{node} = $new_node;

    $self->write_service_config($conf);
}
197
# Return the lock currently recorded for service $sid (undef if none).
#
# Dies if the service does not exist.
sub service_has_lock {
    my ($self, $sid) = @_;

    my $conf = $self->read_service_config();
    my $service = $conf->{$sid};

    if (!$service) {
        die "no such service '$sid'\n";
    }

    return $service->{lock};
}
207
# Record lock $lock on service $sid and persist the configuration.
#
# A false/omitted $lock falls back to 'backup'. An existing lock is
# overwritten without checking. Dies if the service does not exist.
# Returns the updated config hash.
sub lock_service {
    my ($self, $sid, $lock) = @_;

    my $conf = $self->read_service_config();
    my $service = $conf->{$sid};

    die "no such service '$sid'\n" if !$service;

    $service->{lock} = $lock ? $lock : 'backup';

    $self->write_service_config($conf);

    return $conf;
}
221
# Remove the lock from service $sid and persist the configuration.
#
# If $lock is given, the lock is only removed when it matches the recorded
# one; otherwise a warning is emitted and nothing changes.
# Dies if the service does not exist.
# Returns the removed lock name, or undef when nothing was removed.
sub unlock_service {
    my ($self, $sid, $lock) = @_;

    my $conf = $self->read_service_config();
    my $service = $conf->{$sid};

    die "no such service '$sid'\n" if !$service;

    my $current = $service->{lock};

    # nothing to unlock
    return undef if !defined($current);

    if (defined($lock) && $current ne $lock) {
        warn "found lock '$current' trying to remove '$lock' lock\n";
        return undef;
    }

    my $removed_lock = delete $service->{lock};

    $self->write_service_config($conf);

    return $removed_lock;
}
244
# Append $cmd as one line to "<statusdir>/crm_commands".
#
# The existing content is read (if the file exists), the command is added
# with exactly one trailing newline, and the file is rewritten.
# No lock is taken here. Always returns undef.
sub queue_crm_commands_nolock {
    my ($self, $cmd) = @_;

    chomp $cmd;

    my $filename = "$self->{statusdir}/crm_commands";

    my $queued = (-f $filename)
        ? PVE::Tools::file_get_contents($filename)
        : '';

    PVE::Tools::file_set_contents($filename, $queued . "$cmd\n");

    return undef;
}
260
# Append $cmd to the CRM command queue while holding the global lock.
# Always returns undef.
sub queue_crm_commands {
    my ($self, $cmd) = @_;

    $self->global_lock(sub {
        $self->queue_crm_commands_nolock($cmd);
    });

    return undef;
}
270
# Fetch and clear the CRM command queue under the global lock.
#
# Returns the previous content of "<statusdir>/crm_commands" ('' when the
# file did not exist); the file is left empty afterwards.
sub read_crm_commands {
    my ($self) = @_;

    return $self->global_lock(sub {
        my $filename = "$self->{statusdir}/crm_commands";

        my $pending = (-f $filename)
            ? PVE::Tools::file_get_contents($filename)
            : '';

        # the queue is consumed by reading it
        PVE::Tools::file_set_contents($filename, '');

        return $pending;
    });
}
288
# Parse the HA groups configuration from "<statusdir>/groups".
#
# A missing file is parsed as empty content.
sub read_group_config {
    my ($self) = @_;

    my $filename = "$self->{statusdir}/groups";

    my $raw = (-f $filename)
        ? PVE::Tools::file_get_contents($filename)
        : '';

    return PVE::HA::Config::parse_groups_config($filename, $raw);
}
298
# Read the service status of $node from "<statusdir>/service_status_<node>".
sub read_service_status {
    my ($self, $node) = @_;

    return PVE::HA::Tools::read_json_from_file(
        "$self->{statusdir}/service_status_$node",
    );
}
305
# Write the service status $data of $node to
# "<statusdir>/service_status_<node>".
# Returns whatever PVE::HA::Tools::write_json_to_file returns.
sub write_service_status {
    my ($self, $node, $data) = @_;

    my $target = "$self->{statusdir}/service_status_" . $node;

    my $written = PVE::HA::Tools::write_json_to_file($target, $data);

    # fixme: add test if a service runs on two nodes!!!

    return $written;
}
316
# Fallback HA groups configuration. new() writes it to "<statusdir>/groups"
# when the test directory does not provide its own "groups" file. It defines
# one group per default node (node1..node3), each with nofailback set.
# NOTE(review): the property lines carry no leading whitespace in this view;
# confirm against PVE::HA::Config::parse_groups_config whether they need to
# be indented.
my $default_group_config = <<__EOD;
group: prefer_node1
nodes node1
nofailback 1

group: prefer_node2
nodes node2
nofailback 1

group: prefer_node3
nodes node3
nofailback 1
__EOD
330
# Constructor: set up a fresh simulation environment below $testdir.
#
# "$testdir/status" is wiped and recreated, then populated from the initial
# configuration found in $testdir:
#   manager_status         copied if present (optional)
#   groups                 copied, else the built-in default groups are written
#   service_config         copied, else six default VMs (vm:101..vm:106)
#   hardware_status        copied, else three powered-off nodes (node1..node3)
#   fence.cfg              copied if present
#   service_status_<node>  copied per node, else initialized to {}
#
# Dies if $testdir is missing or not a directory, if the status directory
# cannot be created, or if copying an existing configuration file fails.
# Returns the new object.
sub new {
    my ($this, $testdir) = @_;

    die "missing testdir" if !$testdir;

    die "testdir '$testdir' does not exist or is not a directory!\n"
        if !-d $testdir;

    my $class = ref($this) || $this;

    my $self = bless {}, $class;

    my $statusdir = $self->{statusdir} = "$testdir/status";

    # always start from a clean runtime state
    remove_tree($statusdir);
    if (!mkdir($statusdir)) {
        my $err = $!; # save before -d can clobber it
        die "unable to create status directory '$statusdir' - $err\n"
            if !-d $statusdir;
    }

    # Copy a configuration file that is known to exist; a failure here would
    # otherwise silently leave the simulation without that file.
    my $copy_or_die = sub {
        my ($name) = @_;

        copy("$testdir/$name", "$statusdir/$name") ||
            die "Copy failed: $!\n";
    };

    # copy initial configuration
    copy("$testdir/manager_status", "$statusdir/manager_status"); # optional

    if (-f "$testdir/groups") {
        $copy_or_die->('groups');
    } else {
        PVE::Tools::file_set_contents("$statusdir/groups", $default_group_config);
    }

    if (-f "$testdir/service_config") {
        $copy_or_die->('service_config');
    } else {
        my $conf = {
            'vm:101' => { node => 'node1', group => 'prefer_node1' },
            'vm:102' => { node => 'node2', group => 'prefer_node2' },
            'vm:103' => { node => 'node3', group => 'prefer_node3' },
            'vm:104' => { node => 'node1', group => 'prefer_node1' },
            'vm:105' => { node => 'node2', group => 'prefer_node2' },
            'vm:106' => { node => 'node3', group => 'prefer_node3' },
        };
        $self->write_service_config($conf);
    }

    if (-f "$testdir/hardware_status") {
        $copy_or_die->('hardware_status');
    } else {
        my $cstatus = {
            node1 => { power => 'off', network => 'off' },
            node2 => { power => 'off', network => 'off' },
            node3 => { power => 'off', network => 'off' },
        };
        $self->write_hardware_status_nolock($cstatus);
    }

    if (-f "$testdir/fence.cfg") {
        $copy_or_die->('fence.cfg');
    }

    my $cstatus = $self->read_hardware_status_nolock();

    # the node list is derived from the hardware status
    foreach my $node (sort keys %$cstatus) {
        $self->{nodes}->{$node} = {};

        if (-f "$testdir/service_status_$node") {
            $copy_or_die->("service_status_$node");
        } else {
            $self->write_service_status($node, {});
        }
    }

    $self->{service_config} = $self->read_service_config();

    return $self;
}
403
# Abstract method: return the current simulation time.
# The base class only dies; subclasses have to provide the implementation.
sub get_time {
    my $self = shift;

    die "implement in subclass";
}
409
# Print one log line to STDOUT in the form
#   "<level> <time> <id>: <msg>"
# with fixed-width level/time/id columns.
#
#   $level  log level string
#   $msg    message text; one trailing newline is stripped
#   $id     origin of the message, defaults to 'hardware'
#
# The time stamp comes from get_time().
# Note: this sub shares its name with the builtin log(), so it has to be
# invoked as a method.
sub log {
    my ($self, $level, $msg, $id) = @_;

    chomp $msg;

    my $time = $self->get_time();

    $id = 'hardware' if !$id;

    # $msg is passed as an argument, not as part of the format string, so
    # that a '%' inside the message is printed literally.
    printf("%-5s %5d %12s: %s\n", $level, $time, $id, $msg);
}
421
# Accessor for the runtime status directory.
# Any further argument (callers may pass a node name) is ignored.
sub statusdir {
    my $self = shift;

    return $self->{statusdir};
}
427
# Run $code while holding an exclusive flock on "<statusdir>/hardware.lck".
#
#   $code   code reference, called as $code->($fh, @param) where $fh is the
#           handle of the lock file
#   @param  extra arguments handed through to $code
#
# The lock is released (by closing the handle) after $code returns or dies.
# Returns the scalar result of $code; an exception raised by $code is
# re-thrown after the lock has been released.
# Dies if the lock file cannot be opened or the lock cannot be acquired.
sub global_lock {
    my ($self, $code, @param) = @_;

    my $lockfile = "$self->{statusdir}/hardware.lck";

    # explicit mode argument instead of a ">>$file" string, so the file
    # name is never interpreted as part of the open mode
    my $fh = IO::File->new($lockfile, '>>') ||
        die "unable to open '$lockfile'\n";

    # flock() can be interrupted by a signal: retry in that case only,
    # every other failure is fatal - $code must never run unlocked
    while (!flock($fh, LOCK_EX)) {
        next if $! == EINTR;

        my $lock_err = $!; # save before close() can clobber it
        close($fh);
        die "can't acquire lock '$lockfile' - $lock_err\n";
    }

    my $res;

    eval { $res = $code->($fh, @param) };
    my $err = $@;

    close($fh);

    die $err if $err;

    return $res;
}
458
# Derive per-node online state and cluster quorum from the hardware status.
#
#   $cstatus  hash: node name => { power => 'on'|'off', network => 'on'|'off' }
#
# A node is online when both power and network are 'on'. The cluster is
# quorate when more than half of all nodes are online; without quorum every
# node is reported as offline.
# Returns ($node_info, $quorate), $node_info being node => { online => 0|1 }.
my $compute_node_info = sub {
    my ($self, $cstatus) = @_;

    my @nodes = keys %$cstatus;
    my %info;

    for my $node (@nodes) {
        my $hw = $cstatus->{$node};
        my $up = $hw->{power} eq 'on' && $hw->{network} eq 'on';

        $info{$node} = { online => $up ? 1 : 0 };
    }

    my $online_count = grep { $_->{online} } values %info;
    my $node_count = scalar(@nodes);

    my $quorate = ($online_count > int($node_count/2)) ? 1 : 0;

    if (!$quorate) {
        $_->{online} = 0 for values %info;
    }

    return (\%info, $quorate);
};
488
# Read the current hardware status (without locking) and return
# ($node_info, $quorate) as computed from it: per-node online flags plus
# the cluster quorum flag.
sub get_node_info {
    my ($self) = @_;

    my $hw_status = $self->read_hardware_status_nolock();

    my ($info, $has_quorum) = $compute_node_info->($self, $hw_status);

    return ($info, $has_quorum);
}
497
498 # simulate hardware commands
499 # power <node> <on|off>
500 # network <node> <on|off>
501
# Abstract method: execute a simulated hardware command string such as
# "power <node> <on|off>" or "network <node> <on|off>"; $logid names the
# originator. The base class only dies; subclasses provide the behavior.
sub sim_hardware_cmd {
    my $self = shift;
    my ($cmdstr, $logid) = @_;

    die "implement in subclass";
}
507
# Abstract method: entry point of the simulation.
# The base class only dies; subclasses provide the implementation.
sub run {
    my $self = shift;

    die "implement in subclass";
}
513
# Read-modify-write helper for "<statusdir>/watchdog_status", executed
# under the global lock.
#
#   $code  called with the decoded watchdog status hash ({} when the file
#          does not exist); must return ($new_status, $result)
#
# The returned status is written back as JSON, $result is passed on to the
# caller. If $code dies, nothing is written.
my $modify_watchog = sub {
    my ($self, $code) = @_;

    return $self->global_lock(sub {
        my $filename = "$self->{statusdir}/watchdog_status";

        my $wdstatus = {};
        if (-f $filename) {
            my $raw = PVE::Tools::file_get_contents($filename);
            $wdstatus = decode_json($raw);
        }

        my $res;
        ($wdstatus, $res) = $code->($wdstatus);

        PVE::Tools::file_set_contents($filename, encode_json($wdstatus));

        return $res;
    });
};
539
# Drop every watchdog entry that belongs to $node from
# "<statusdir>/watchdog_status". Does nothing when the file does not exist.
# No lock is taken here.
sub watchdog_reset_nolock {
    my ($self, $node) = @_;

    my $filename = "$self->{statusdir}/watchdog_status";

    if (-f $filename) {
        my $raw = PVE::Tools::file_get_contents($filename);
        my $wdstatus = decode_json($raw);

        # collect first, then remove all matching handles in one go
        my @node_handles = grep { $wdstatus->{$_}->{node} eq $node } keys %$wdstatus;
        delete @{$wdstatus}{@node_handles};

        PVE::Tools::file_set_contents($filename, encode_json($wdstatus));
    }
}
556
# Check all watchdogs of $node.
#
# Every watchdog of that node whose last update is more than
# $watchdog_timeout in the past is removed from the watchdog status.
# Returns 1 when none of them had expired, 0 otherwise.
sub watchdog_check {
    my ($self, $node) = @_;

    return $modify_watchog->($self, sub {
        my ($wdstatus) = @_;

        my $all_alive = 1;

        for my $handle (keys %$wdstatus) {
            my $entry = $wdstatus->{$handle};
            next unless $entry->{node} eq $node;

            my $age = $self->get_time() - $entry->{update_time};
            next unless $age > $watchdog_timeout;

            # expired
            $all_alive = 0;
            delete $wdstatus->{$handle};
        }

        return ($wdstatus, $all_alive);
    });
}
583
# Per-process sequence number, part of every generated watchdog handle.
my $wdcounter = 0;

# Register a new watchdog for $node.
#
# The handle has the form "WD:<node>:<pid>:<counter>" and is stored in the
# watchdog status together with the current time as last update.
# Dies with "internal error" if the generated handle already exists.
# Returns the new handle.
sub watchdog_open {
    my ($self, $node) = @_;

    return $modify_watchog->($self, sub {
        my ($wdstatus) = @_;

        $wdcounter += 1;

        my $handle = join(':', 'WD', $node, $$, $wdcounter);

        die "internal error" if defined($wdstatus->{$handle});

        $wdstatus->{$handle} = {
            node => $node,
            update_time => $self->get_time(),
        };

        return ($wdstatus, $handle);
    });
}
608
# Close (remove) the watchdog identified by handle $wfh.
#
# Dies if the handle is unknown or if the watchdog has already expired,
# i.e. its last update is more than $watchdog_timeout in the past.
# Returns undef.
sub watchdog_close {
    my ($self, $wfh) = @_;

    return $modify_watchog->($self, sub {
        my ($wdstatus) = @_;

        my $entry = $wdstatus->{$wfh};
        if (!defined($entry)) {
            die "no such watchdog handle '$wfh'\n";
        }

        my $age = $self->get_time() - $entry->{update_time};
        if ($age > $watchdog_timeout) {
            die "watchdog expired";
        }

        delete $wdstatus->{$wfh};

        # no result value, only the new status
        return ($wdstatus);
    });
}
628
# Refresh the watchdog identified by handle $wfh: its last update time is
# set to the current time.
#
# Dies if the handle is unknown or if the watchdog has already expired,
# i.e. its last update is more than $watchdog_timeout in the past.
# Returns undef.
sub watchdog_update {
    my ($self, $wfh) = @_;

    return $modify_watchog->($self, sub {
        my ($wdstatus) = @_;

        my $entry = $wdstatus->{$wfh};
        if (!defined($entry)) {
            die "no such watchdog handle '$wfh'\n";
        }

        my $now = $self->get_time();
        if ($now - $entry->{update_time} > $watchdog_timeout) {
            die "watchdog expired";
        }

        $entry->{update_time} = $now;

        # no result value, only the new status
        return ($wdstatus);
    });
}
651
652 1;