package PVE::HA::Sim::Hardware;

# Simulate Hardware resources

# power supply for nodes: on/off
# network connection to nodes: on/off
# watchdog devices for nodes

use strict;
use warnings;
use POSIX qw(strftime EINTR);
use Data::Dumper;
use JSON;
use IO::File;
use Fcntl qw(:DEFAULT :flock);
use File::Copy;
use File::Path qw(make_path remove_tree);

# used directly below for file and JSON status helpers
use PVE::Tools;
use PVE::HA::Tools;
use PVE::HA::Config;

# virtual resource classes
use PVE::HA::Sim::Resources::VirtVM;
use PVE::HA::Sim::Resources::VirtCT;

PVE::HA::Sim::Resources::VirtVM->register();
PVE::HA::Sim::Resources::VirtCT->register();

PVE::HA::Sim::Resources->init();

my $watchdog_timeout = 60;

# Status directory layout
#
# configuration
#
# $testdir/cmdlist                        Command list for simulation
# $testdir/hardware_status                Hardware description (number of nodes, ...)
# $testdir/manager_status                 CRM status (start with {})
# $testdir/service_config                 Service configuration
# $testdir/groups                         HA groups configuration
# $testdir/service_status_<node>          Service status

#
# runtime status for simulation system
#
# $testdir/status/cluster_locks           Cluster locks
# $testdir/status/hardware_status         Hardware status (power/network on/off)
# $testdir/status/watchdog_status         Watchdog status
#
# runtime status
#
# $testdir/status/lrm_status_<node>       LRM status
# $testdir/status/manager_status          CRM status
# $testdir/status/crm_commands            CRM command queue
# $testdir/status/service_config          Service configuration
# $testdir/status/service_status_<node>   Service status
# $testdir/status/groups                  HA groups configuration
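#
# Illustrative hardware_status contents (JSON; matches the defaults that
# new() writes below when $testdir/hardware_status is missing):
#   { "node1": { "power": "off", "network": "off" },
#     "node2": { "power": "off", "network": "off" },
#     "node3": { "power": "off", "network": "off" } }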

sub read_lrm_status {
    my ($self, $node) = @_;

    my $filename = "$self->{statusdir}/lrm_status_$node";

    return PVE::HA::Tools::read_json_from_file($filename, {});
}

sub write_lrm_status {
    my ($self, $node, $status_obj) = @_;

    my $filename = "$self->{statusdir}/lrm_status_$node";

    PVE::HA::Tools::write_json_to_file($filename, $status_obj);
}

sub read_hardware_status_nolock {
    my ($self) = @_;

    my $filename = "$self->{statusdir}/hardware_status";

    my $raw = PVE::Tools::file_get_contents($filename);
    my $cstatus = decode_json($raw);

    return $cstatus;
}

sub write_hardware_status_nolock {
    my ($self, $cstatus) = @_;

    my $filename = "$self->{statusdir}/hardware_status";

    PVE::Tools::file_set_contents($filename, encode_json($cstatus));
}

sub read_service_config {
    my ($self) = @_;

    my $filename = "$self->{statusdir}/service_config";
    my $conf = PVE::HA::Tools::read_json_from_file($filename);

    foreach my $sid (keys %$conf) {
        my $d = $conf->{$sid};

        die "service '$sid' without assigned node!" if !$d->{node};

        if ($sid =~ m/^(vm|ct):(\d+)$/) {
            $d->{type} = $1;
            $d->{name} = $2;
        } else {
            die "implement me";
        }
        $d->{state} = 'disabled' if !$d->{state};
    }

    return $conf;
}
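
# Illustrative service_config entry as stored on disk (JSON; matches the
# defaults written by new() below):
#   "vm:101": { "node": "node1", "group": "prefer_node1" }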

sub write_service_config {
    my ($self, $conf) = @_;

    $self->{service_config} = $conf;

    my $filename = "$self->{statusdir}/service_config";
    return PVE::HA::Tools::write_json_to_file($filename, $conf);
}

sub set_service_state {
    my ($self, $sid, $state) = @_;

    my $conf = $self->read_service_config();
    die "no such service '$sid'" if !$conf->{$sid};

    $conf->{$sid}->{state} = $state;

    $self->write_service_config($conf);

    return $conf;
}

sub add_service {
    my ($self, $sid, $opts) = @_;

    my $conf = $self->read_service_config();
    die "resource ID '$sid' already defined\n" if $conf->{$sid};

    $conf->{$sid} = $opts;

    $self->write_service_config($conf);

    return $conf;
}

sub delete_service {
    my ($self, $sid) = @_;

    my $conf = $self->read_service_config();

    die "no such service '$sid'" if !$conf->{$sid};

    delete $conf->{$sid};

    $self->write_service_config($conf);

    return $conf;
}

sub change_service_location {
    my ($self, $sid, $current_node, $new_node) = @_;

    my $conf = $self->read_service_config();

    die "no such service '$sid'\n" if !$conf->{$sid};

    die "current_node for '$sid' does not match ($current_node != $conf->{$sid}->{node})\n"
        if $current_node ne $conf->{$sid}->{node};

    $conf->{$sid}->{node} = $new_node;

    $self->write_service_config($conf);
}

sub queue_crm_commands_nolock {
    my ($self, $cmd) = @_;

    chomp $cmd;

    my $data = '';
    my $filename = "$self->{statusdir}/crm_commands";
    if (-f $filename) {
        $data = PVE::Tools::file_get_contents($filename);
    }
    $data .= "$cmd\n";
    PVE::Tools::file_set_contents($filename, $data);

    return undef;
}

sub queue_crm_commands {
    my ($self, $cmd) = @_;

    my $code = sub { $self->queue_crm_commands_nolock($cmd); };

    $self->global_lock($code);

    return undef;
}

sub read_crm_commands {
    my ($self) = @_;

    my $code = sub {
        my $data = '';

        my $filename = "$self->{statusdir}/crm_commands";
        if (-f $filename) {
            $data = PVE::Tools::file_get_contents($filename);
        }
        PVE::Tools::file_set_contents($filename, '');

        return $data;
    };

    return $self->global_lock($code);
}

sub read_group_config {
    my ($self) = @_;

    my $filename = "$self->{statusdir}/groups";
    my $raw = '';
    $raw = PVE::Tools::file_get_contents($filename) if -f $filename;

    return PVE::HA::Config::parse_groups_config($filename, $raw);
}

sub read_service_status {
    my ($self, $node) = @_;

    my $filename = "$self->{statusdir}/service_status_$node";
    return PVE::HA::Tools::read_json_from_file($filename);
}

sub write_service_status {
    my ($self, $node, $data) = @_;

    my $filename = "$self->{statusdir}/service_status_$node";
    my $res = PVE::HA::Tools::write_json_to_file($filename, $data);

    # fixme: add test if a service runs on two nodes!!!

    return $res;
}

my $default_group_config = <<__EOD;
group: prefer_node1
    nodes node1
    nofailback 1

group: prefer_node2
    nodes node2
    nofailback 1

group: prefer_node3
    nodes node3
    nofailback 1
__EOD

sub new {
    my ($this, $testdir) = @_;

    die "missing testdir" if !$testdir;

    my $class = ref($this) || $this;

    my $self = bless {}, $class;

    my $statusdir = $self->{statusdir} = "$testdir/status";

    remove_tree($statusdir);
    mkdir $statusdir;

    # copy initial configuration
    copy("$testdir/manager_status", "$statusdir/manager_status"); # optional

    if (-f "$testdir/groups") {
        copy("$testdir/groups", "$statusdir/groups");
    } else {
        PVE::Tools::file_set_contents("$statusdir/groups", $default_group_config);
    }

    if (-f "$testdir/service_config") {
        copy("$testdir/service_config", "$statusdir/service_config");
    } else {
        my $conf = {
            'vm:101' => { node => 'node1', group => 'prefer_node1' },
            'vm:102' => { node => 'node2', group => 'prefer_node2' },
            'vm:103' => { node => 'node3', group => 'prefer_node3' },
            'vm:104' => { node => 'node1', group => 'prefer_node1' },
            'vm:105' => { node => 'node2', group => 'prefer_node2' },
            'vm:106' => { node => 'node3', group => 'prefer_node3' },
        };
        $self->write_service_config($conf);
    }

    if (-f "$testdir/hardware_status") {
        copy("$testdir/hardware_status", "$statusdir/hardware_status") ||
            die "Copy failed: $!\n";
    } else {
        my $cstatus = {
            node1 => { power => 'off', network => 'off' },
            node2 => { power => 'off', network => 'off' },
            node3 => { power => 'off', network => 'off' },
        };
        $self->write_hardware_status_nolock($cstatus);
    }

    my $cstatus = $self->read_hardware_status_nolock();

    foreach my $node (sort keys %$cstatus) {
        $self->{nodes}->{$node} = {};

        if (-f "$testdir/service_status_$node") {
            copy("$testdir/service_status_$node", "$statusdir/service_status_$node");
        } else {
            $self->write_service_status($node, {});
        }
    }

    $self->{service_config} = $self->read_service_config();

    return $self;
}
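
# Usage sketch: a subclass providing get_time(), sim_hardware_cmd() and
# run() is constructed with a test directory; new() recreates
# "$testdir/status" and seeds it from the files in $testdir, falling back
# to the defaults above:
#   my $hardware = $subclass->new($testdir);   # $subclass: your Sim subclass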

sub get_time {
    my ($self) = @_;

    die "implement in subclass";
}

sub log {
    my ($self, $level, $msg, $id) = @_;

    chomp $msg;

    my $time = $self->get_time();

    $id = 'hardware' if !$id;

    printf("%-5s %5d %12s: %s\n", $level, $time, $id, $msg);
}

sub statusdir {
    my ($self, $node) = @_;

    return $self->{statusdir};
}

sub global_lock {
    my ($self, $code, @param) = @_;

    my $lockfile = "$self->{statusdir}/hardware.lck";
    my $fh = IO::File->new(">>$lockfile") ||
        die "unable to open '$lockfile'\n";

    # retry if flock() is interrupted by a signal, fail on any other error
    my $success;
    for (;;) {
        $success = flock($fh, LOCK_EX);
        last if $success || ($! != EINTR);
    }
    if (!$success) {
        close($fh);
        die "can't acquire lock '$lockfile' - $!\n";
    }

    my $res;

    eval { $res = &$code($fh, @param) };
    my $err = $@;

    close($fh);

    die $err if $err;

    return $res;
}
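
# Usage sketch: callers wrap updates to the shared simulation state in a
# code reference which runs while "$statusdir/hardware.lck" is held, e.g.
#   $self->global_lock(sub { ... read/write files below $statusdir ... });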

my $compute_node_info = sub {
    my ($self, $cstatus) = @_;

    my $node_info = {};

    my $node_count = 0;
    my $online_count = 0;

    foreach my $node (keys %$cstatus) {
        my $d = $cstatus->{$node};

        my $online = ($d->{power} eq 'on' && $d->{network} eq 'on') ? 1 : 0;
        $node_info->{$node}->{online} = $online;

        $node_count++;
        $online_count++ if $online;
    }

    my $quorate = ($online_count > int($node_count/2)) ? 1 : 0;

    if (!$quorate) {
        foreach my $node (keys %$cstatus) {
            my $d = $cstatus->{$node};
            $node_info->{$node}->{online} = 0;
        }
    }

    return ($node_info, $quorate);
};

sub get_node_info {
    my ($self) = @_;

    my $cstatus = $self->read_hardware_status_nolock();
    my ($node_info, $quorate) = &$compute_node_info($self, $cstatus);

    return ($node_info, $quorate);
}

# simulate hardware commands
#   power <node> <on|off>
#   network <node> <on|off>
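#
# Illustrative command strings following that format (using the node
# names from the default test setup):
#   power node1 off
#   network node2 on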

sub sim_hardware_cmd {
    my ($self, $cmdstr, $logid) = @_;

    die "implement in subclass";
}

sub run {
    my ($self) = @_;

    die "implement in subclass";
}

my $modify_watchdog = sub {
    my ($self, $code) = @_;

    my $update_cmd = sub {

        my $filename = "$self->{statusdir}/watchdog_status";

        my ($res, $wdstatus);

        if (-f $filename) {
            my $raw = PVE::Tools::file_get_contents($filename);
            $wdstatus = decode_json($raw);
        } else {
            $wdstatus = {};
        }

        ($wdstatus, $res) = &$code($wdstatus);

        PVE::Tools::file_set_contents($filename, encode_json($wdstatus));

        return $res;
    };

    return $self->global_lock($update_cmd);
};

sub watchdog_reset_nolock {
    my ($self, $node) = @_;

    my $filename = "$self->{statusdir}/watchdog_status";

    if (-f $filename) {
        my $raw = PVE::Tools::file_get_contents($filename);
        my $wdstatus = decode_json($raw);

        foreach my $id (keys %$wdstatus) {
            delete $wdstatus->{$id} if $wdstatus->{$id}->{node} eq $node;
        }

        PVE::Tools::file_set_contents($filename, encode_json($wdstatus));
    }
}

sub watchdog_check {
    my ($self, $node) = @_;

    my $code = sub {
        my ($wdstatus) = @_;

        my $res = 1;

        foreach my $wfh (keys %$wdstatus) {
            my $wd = $wdstatus->{$wfh};
            next if $wd->{node} ne $node;

            my $ctime = $self->get_time();
            my $tdiff = $ctime - $wd->{update_time};

            if ($tdiff > $watchdog_timeout) { # expired
                $res = 0;
                delete $wdstatus->{$wfh};
            }
        }

        return ($wdstatus, $res);
    };

    return &$modify_watchdog($self, $code);
}

my $wdcounter = 0;

sub watchdog_open {
    my ($self, $node) = @_;

    my $code = sub {
        my ($wdstatus) = @_;

        ++$wdcounter;

        my $id = "WD:$node:$$:$wdcounter";

        die "internal error" if defined($wdstatus->{$id});

        $wdstatus->{$id} = {
            node => $node,
            update_time => $self->get_time(),
        };

        return ($wdstatus, $id);
    };

    return &$modify_watchdog($self, $code);
}

sub watchdog_close {
    my ($self, $wfh) = @_;

    my $code = sub {
        my ($wdstatus) = @_;

        my $wd = $wdstatus->{$wfh};
        die "no such watchdog handle '$wfh'\n" if !defined($wd);

        my $tdiff = $self->get_time() - $wd->{update_time};
        die "watchdog expired" if $tdiff > $watchdog_timeout;

        delete $wdstatus->{$wfh};

        return ($wdstatus);
    };

    return &$modify_watchdog($self, $code);
}

sub watchdog_update {
    my ($self, $wfh) = @_;

    my $code = sub {
        my ($wdstatus) = @_;

        my $wd = $wdstatus->{$wfh};

        die "no such watchdog handle '$wfh'\n" if !defined($wd);

        my $ctime = $self->get_time();
        my $tdiff = $ctime - $wd->{update_time};

        die "watchdog expired" if $tdiff > $watchdog_timeout;

        $wd->{update_time} = $ctime;

        return ($wdstatus);
    };

    return &$modify_watchdog($self, $code);
}
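
# Typical watchdog lifecycle (sketch; $hw stands for a Hardware subclass
# instance): open a per-node watchdog, keep updating it within
# $watchdog_timeout seconds, and close it again on clean shutdown.
#   my $wfh = $hw->watchdog_open($node);
#   $hw->watchdog_update($wfh);    # repeat periodically
#   $hw->watchdog_close($wfh);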

1;