package PVE::HA::Env::PVE2;

use strict;
use warnings;
use POSIX qw(:errno_h :fcntl_h);
use IO::File;
use IO::Socket::UNIX;

use PVE::SafeSyslog;
use PVE::Tools;
use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
use PVE::INotify;
use PVE::RPCEnvironment;

use PVE::HA::Tools ':exit_codes';
use PVE::HA::Env;
use PVE::HA::Config;
use PVE::HA::FenceConfig;
use PVE::HA::Resources;
use PVE::HA::Resources::PVEVM;
use PVE::HA::Resources::PVECT;

PVE::HA::Resources::PVEVM->register();
PVE::HA::Resources::PVECT->register();

PVE::HA::Resources->init();

my $lockdir = "/etc/pve/priv/lock";

sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    my $self = bless {}, $class;

    $self->{nodename} = $nodename;

    return $self;
}
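
# Usage sketch (illustrative, not part of the original module): callers such
# as the CRM/LRM daemons are expected to construct the environment with the
# local node name, e.g.
#
#   my $haenv = PVE::HA::Env::PVE2->new(PVE::INotify::nodename());
#   my $nodename = $haenv->nodename();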

sub nodename {
    my ($self) = @_;

    return $self->{nodename};
}

sub hardware {
    my ($self) = @_;

    die "hardware is for testing and simulation only";
}

sub read_manager_status {
    my ($self) = @_;

    return PVE::HA::Config::read_manager_status();
}

sub write_manager_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_manager_status($status_obj);
}

sub read_lrm_status {
    my ($self, $node) = @_;

    $node = $self->{nodename} if !defined($node);

    return PVE::HA::Config::read_lrm_status($node);
}

sub write_lrm_status {
    my ($self, $status_obj) = @_;

    my $node = $self->{nodename};

    PVE::HA::Config::write_lrm_status($node, $status_obj);
}

sub is_node_shutdown {
    my ($self) = @_;

    my $shutdown = 0;

    my $code = sub {
        my $line = shift;

        $shutdown = 1 if ($line =~ m/shutdown\.target/);
    };

    my $cmd = ['/bin/systemctl', 'list-jobs'];
    eval { PVE::Tools::run_command($cmd, outfunc => $code, noerr => 1); };

    return $shutdown;
}
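
# Illustrative note: `systemctl list-jobs` prints one queued job per line; on
# a node that is shutting down, one of those lines mentions shutdown.target,
# which is all the regex above relies on. A (schematic) example line:
#
#   1234 shutdown.target start waiting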

sub queue_crm_commands {
    my ($self, $cmd) = @_;

    return PVE::HA::Config::queue_crm_commands($cmd);
}

sub read_crm_commands {
    my ($self) = @_;

    return PVE::HA::Config::read_crm_commands();
}

sub read_service_config {
    my ($self) = @_;

    my $res = PVE::HA::Config::read_resources_config();

    my $vmlist = PVE::Cluster::get_vmlist();
    my $conf = {};

    foreach my $sid (keys %{$res->{ids}}) {
        my $d = $res->{ids}->{$sid};
        my (undef, undef, $name) = PVE::HA::Tools::parse_sid($sid);
        $d->{state} = 'enabled' if !defined($d->{state});
        $d->{max_restart} = 1 if !defined($d->{max_restart});
        $d->{max_relocate} = 1 if !defined($d->{max_relocate});
        if (PVE::HA::Resources->lookup($d->{type})) {
            if (my $vmd = $vmlist->{ids}->{$name}) {
                # guest exists in the cluster - track it on its current node
                $d->{node} = $vmd->{node};
                $conf->{$sid} = $d;
            } else {
                # guest not in the vmlist - keep it only if a node is configured
                if (defined($d->{node})) {
                    $conf->{$sid} = $d;
                } else {
                    warn "service '$sid' without node\n";
                }
            }
        }
    }

    return $conf;
}
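
# Rough sketch of the hash returned above (service IDs and node names are
# made up; further keys from resources.cfg pass through unchanged):
#
#   {
#       'vm:100' => { state => 'enabled', node => 'node1',
#                     max_restart => 1, max_relocate => 1, ... },
#       'ct:101' => { ... },
#   }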

sub read_fence_config {
    my ($self) = @_;

    return PVE::HA::Config::read_fence_config();
}

sub fencing_mode {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return 'watchdog' if !$datacenterconfig->{fencing};

    return $datacenterconfig->{fencing};
}
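
# Illustrative only: the mode falls back to 'watchdog' unless a fencing
# option is set in /etc/pve/datacenter.cfg, e.g. a line like
#
#   fencing: hardware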

sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # set up a minimal execution environment
    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    my $cmd = "$agent " . PVE::HA::FenceConfig::gen_arg_str(@param);

    exec($cmd);
    exit -1;
}
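
# Note (sketch): $agent and @param come from the parsed fence config, so the
# executed command ends up roughly as
#
#   <agent> <argument string built by PVE::HA::FenceConfig::gen_arg_str()>
#
# exec() replaces the current process image, so exit(-1) is only reached if
# exec() itself fails.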

# only the current master is allowed to do this, to recover a _fenced_ service
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);

    if (my $plugin = PVE::HA::Resources->lookup($type)) {
        my $old = $plugin->config_file($name, $current_node);
        my $new = $plugin->config_file($name, $new_node);
        rename($old, $new) ||
            die "rename '$old' to '$new' failed - $!\n";
    } else {
        die "implement me";
    }
}
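
# Illustrative only: for a VM resource the rename above effectively moves the
# guest config between per-node directories on pmxcfs, roughly
#
#   /etc/pve/nodes/<current_node>/qemu-server/<vmid>.conf
#     -> /etc/pve/nodes/<new_node>/qemu-server/<vmid>.conf
#
# the exact path comes from the resource plugin's config_file() method.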

sub read_group_config {
    my ($self) = @_;

    return PVE::HA::Config::read_group_config();
}

# returns a hash describing which nodes are cluster members and online
sub get_node_info {
    my ($self) = @_;

    my ($node_info, $quorate) = ({}, 0);

    my $nodename = $self->{nodename};

    $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    foreach my $node (keys %$members) {
        my $d = $members->{$node};
        $node_info->{$node}->{online} = $d->{online};
    }

    $node_info->{$nodename}->{online} = 1; # local node is always up

    return ($node_info, $quorate);
}
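
# Illustrative return value (node names made up):
#
#   ({ node1 => { online => 1 }, node2 => { online => 0 } }, 1)
#
# i.e. a per-node online flag plus the local quorum state.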

sub log {
    my ($self, $level, $msg) = @_;

    chomp $msg;

    syslog($level, $msg);
}

my $last_lock_status = {};

sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    my $last = $last_lock_status->{$lockid} || 0;

    my $ctime = time();

    my $retry = 0;
    my $retry_timeout = 100; # fixme: what timeout

    eval {

        mkdir $lockdir;

        # pve cluster filesystem not online
        die "can't create '$lockdir' (pmxcfs not mounted?)\n" if ! -d $lockdir;

        if ($last && (($ctime - $last) < $retry_timeout)) {
            # send cfs lock update request (utime)
            if (!utime(0, $ctime, $filename)) {
                $retry = 1;
                die "cfs lock update failed - $!\n";
            }
        } else {

            # fixme: wait some time?
            if (!(mkdir $filename)) {
                utime 0, 0, $filename; # cfs unlock request
                die "can't get cfs lock\n";
            }
        }

        $got_lock = 1;
    };

    my $err = $@;

    if ($retry) {
        # $self->log('err', $err) if $err; # for debugging
        return 0;
    }

    $last_lock_status->{$lockid} = $got_lock ? $ctime : 0;

    if (!!$got_lock != !!$last) {
        if ($got_lock) {
            $self->log('info', "successfully acquired lock '$lockid'");
        } else {
            my $msg = "lost lock '$lockid'";
            $msg .= " - $err" if $err;
            $self->log('err', $msg);
        }
    } else {
        # $self->log('err', $err) if $err; # for debugging
    }

    return $got_lock;
}
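
# Note (assumption-based summary): pmxcfs implements these locks as
# directories below /etc/pve/priv/lock; mkdir acquires a lock, utime() with a
# current timestamp refreshes it, and utime(0, 0, ...) asks the cluster
# filesystem to drop it. A lock that is no longer refreshed is expected to
# expire on the pmxcfs side after a timeout.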

sub get_ha_manager_lock {
    my ($self) = @_;

    return $self->get_pve_lock("ha_manager_lock");
}

# release the cluster wide manager lock.
# when released, another CRM may step up and get the lock, thus this should
# only get called when shutting down/deactivating the current master.
sub release_ha_manager_lock {
    my ($self) = @_;

    return rmdir("$lockdir/ha_manager_lock");
}

sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node = $self->nodename() if !defined($node);

    return $self->get_pve_lock("ha_agent_${node}_lock");
}

# release the respective node agent lock.
# this should only get called if the node's LRM gracefully shuts down with
# all services already cleanly stopped!
sub release_ha_agent_lock {
    my ($self) = @_;

    my $node = $self->nodename();

    return rmdir("$lockdir/ha_agent_${node}_lock");
}
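
# Usage sketch (illustrative only): a CRM main loop would try to become or
# stay master by (re)acquiring the manager lock every round, e.g.
#
#   if ($haenv->get_ha_manager_lock()) {
#       ... # act as the active master
#   }
#   ...
#   $haenv->release_ha_manager_lock(); # only on clean shutdown/deactivation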

sub quorate {
    my ($self) = @_;

    my $quorate = 0;
    eval {
        $quorate = PVE::Cluster::check_cfs_quorum();
    };

    return $quorate;
}

sub get_time {
    my ($self) = @_;

    return time();
}

sub sleep {
    my ($self, $delay) = @_;

    CORE::sleep($delay);
}

sub sleep_until {
    my ($self, $end_time) = @_;

    for (;;) {
        my $cur_time = time();

        last if $cur_time >= $end_time;

        $self->sleep(1);
    }
}

sub loop_start_hook {
    my ($self) = @_;

    PVE::Cluster::cfs_update();

    $self->{loop_start} = $self->get_time();
}

sub loop_end_hook {
    my ($self) = @_;

    my $delay = $self->get_time() - $self->{loop_start};

    warn "loop took too long ($delay seconds)\n" if $delay > 30;
}

my $watchdog_fh;

sub watchdog_open {
    my ($self) = @_;

    die "watchdog already open\n" if defined($watchdog_fh);

    $watchdog_fh = IO::Socket::UNIX->new(
        Type => SOCK_STREAM(),
        Peer => "/run/watchdog-mux.sock") ||
        die "unable to open watchdog socket - $!\n";

    $self->log('info', "watchdog active");
}

sub watchdog_update {
    my ($self, $wfh) = @_;

    # note: $wfh is unused here, we keep a single module level socket
    my $res = $watchdog_fh->syswrite("\0", 1);
    if (!defined($res)) {
        $self->log('err', "watchdog update failed - $!\n");
        return 0;
    }
    if ($res != 1) {
        $self->log('err', "watchdog update failed - wrote $res bytes\n");
        return 0;
    }

    return 1;
}

sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close
    if (!$watchdog_fh->close()) {
        $self->log('err', "watchdog close failed - $!");
    } else {
        $watchdog_fh = undef;
        $self->log('info', "watchdog closed (disabled)");
    }
}
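
# Usage sketch (illustrative only; the $wfh parameter is ignored by this
# implementation, which keeps a single internal socket): open the connection
# once, keep feeding it from the main loop, and close it cleanly on shutdown.
# If updates stop, watchdog-mux is expected to let the hardware watchdog
# expire and reset the node.
#
#   $haenv->watchdog_open();
#   while (...) {
#       ... # one round of work
#       $haenv->watchdog_update();
#   }
#   $haenv->watchdog_close();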

sub after_fork {
    my ($self) = @_;

    # close inherited inotify FD from parent and reopen our own
    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    PVE::Cluster::cfs_update();
}

sub get_max_workers {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return $datacenterconfig->{max_workers} || 4;
}
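
# Illustrative only: the worker limit honors an optional max_workers setting
# in /etc/pve/datacenter.cfg (read via cfs_read_file above), falling back to
# 4, e.g.
#
#   max_workers: 8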

1;