]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Env/PVE2.pm
Adding error state behaviour
[pve-ha-manager.git] / src / PVE / HA / Env / PVE2.pm
1 package PVE::HA::Env::PVE2;
2
3 use strict;
4 use warnings;
5 use POSIX qw(:errno_h :fcntl_h);
6 use IO::File;
7 use IO::Socket::UNIX;
8
9 use PVE::SafeSyslog;
10 use PVE::Tools;
11 use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
12 use PVE::INotify;
13 use PVE::RPCEnvironment;
14
15 use PVE::HA::Tools;
16 use PVE::HA::Env;
17 use PVE::HA::Config;
18
19
20 my $lockdir = "/etc/pve/priv/lock";
21
# Constructor: create an HA environment object bound to one cluster node.
# Dies when no node name is supplied.
sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    # store the node name directly at bless time
    my $self = bless { nodename => $nodename }, $class;

    return $self;
}
35
# Accessor: the node name this environment was created for.
sub nodename {
    my $self = shift;

    return $self->{nodename};
}
41
# Fetch the CRM manager status from the cluster configuration layer.
sub read_manager_status {
    my $self = shift;

    return PVE::HA::Config::read_manager_status();
}
47
# Persist the given CRM manager status object.
sub write_manager_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_manager_status($status_obj);
}
53
# Read the LRM (local resource manager) status for $node; defaults
# to the local node when no node is given.
sub read_lrm_status {
    my ($self, $node) = @_;

    $node //= $self->{nodename};

    return PVE::HA::Config::read_lrm_status($node);
}
61
# Persist the LRM status object for the local node.
sub write_lrm_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_lrm_status($self->{nodename}, $status_obj);
}
69
# Append $cmd to the cluster-wide CRM command queue.
sub queue_crm_commands {
    my ($self, $cmd) = @_;

    return PVE::HA::Config::queue_crm_commands($cmd);
}
75
# Return the queued CRM commands from the cluster configuration layer.
sub read_crm_commands {
    my $self = shift;

    return PVE::HA::Config::read_crm_commands();
}
81
# True when an HA resources configuration exists at all.
sub service_config_exists {
    my $self = shift;

    return PVE::HA::Config::resources_config_exists();
}
87
# Assemble the effective HA service configuration.
#
# Combines the HA resources configuration with the cluster-wide VM
# list: for each configured service the current node is taken from the
# VM list when the guest is known there, otherwise from the resource
# config itself. Services of unknown type are skipped silently;
# services without any known node are skipped with a warning.
# A missing 'state' defaults to 'enabled'.
#
# Returns a hash ref mapping SID => service configuration.
sub read_service_config {
    my ($self) = @_;

    my $res = PVE::HA::Config::read_resources_config();

    my $vmlist = PVE::Cluster::get_vmlist();
    my $conf = {};

    foreach my $sid (keys %{$res->{ids}}) {
	my $d = $res->{ids}->{$sid};
	my (undef, undef, $name) = PVE::HA::Tools::parse_sid($sid);
	$d->{state} = 'enabled' if !defined($d->{state});

	# skip services whose type has no registered resource plugin
	next if !PVE::HA::Resources->lookup($d->{type});

	# NOTE: the original code had an unreachable `if (!$vmd)` branch
	# inside the truthy guard below; it has been removed.
	if (my $vmd = $vmlist->{ids}->{$name}) {
	    # guest known in the cluster - use its current node
	    $d->{node} = $vmd->{node};
	    $conf->{$sid} = $d;
	} elsif (defined($d->{node})) {
	    # guest not in vmlist, but the resource config names a node
	    $conf->{$sid} = $d;
	} else {
	    warn "service '$sid' without node\n";
	}
    }

    return $conf;
}
120
# Move a service's configuration file from $current_node to $new_node
# by renaming it on the (cluster) file system. Dies on rename failure,
# or with "implement me" when the service type has no resource plugin.
sub change_service_location {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);

    my $plugin = PVE::HA::Resources->lookup($type)
	or die "implement me";

    my $oldpath = $plugin->config_file($name, $current_node);
    my $newpath = $plugin->config_file($name, $new_node);

    rename($oldpath, $newpath)
	or die "rename '$oldpath' to '$newpath' failed - $!\n";
}
135
# Read the HA group configuration.
sub read_group_config {
    my $self = shift;

    return PVE::HA::Config::read_group_config();
}
141
# Return the cluster membership view: a hash ref with each member
# node's online state, plus the local quorum flag. The local node is
# always reported as online.
sub get_node_info {
    my ($self) = @_;

    my $node_info = {};

    my $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    while (my ($node, $d) = each %$members) {
	$node_info->{$node}->{online} = $d->{online};
    }

    # local node is always up
    $node_info->{$self->{nodename}}->{online} = 1;

    return ($node_info, $quorate);
}
164
# Write $msg to syslog at the given $level (trailing newline stripped).
sub log {
    my ($self, $level, $msg) = @_;

    chomp $msg;
    syslog($level, $msg);
}
172
# Per lock id: epoch time of the last successful acquisition
# (0 = lock not held).
my $last_lock_status = {};

# Acquire or refresh the pmxcfs-based cluster lock $lockid.
#
# While the last successful acquisition is younger than the retry
# timeout, the lock is refreshed with a utime() request on the lock
# path; a refresh failure is retried silently (returns 0 without
# resetting lock state). Otherwise a fresh mkdir-based acquisition is
# attempted. Lock state transitions (gained/lost) are logged.
#
# Returns 1 when the lock is held, 0 otherwise.
#
# Fixes vs. previous version: log message "lost lock '$lockid" was
# missing its closing quote, and "aquired" was misspelled.
sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    my $last = $last_lock_status->{$lockid} || 0;

    my $ctime = time();

    my $retry = 0;
    my $retry_timeout = 100; # fixme: what timeout

    eval {

	mkdir $lockdir;

	# pve cluster filesystem not online
	die "can't create '$lockdir' (pmxcfs not mounted?)\n" if ! -d $lockdir;

	if ($last && (($ctime - $last) < $retry_timeout)) {
	    # send cfs lock update request (utime)
	    if (!utime(0, $ctime, $filename)) {
		$retry = 1;
		die "cfs lock update failed - $!\n";
	    }
	} else {

	    # fixme: wait some time?
	    if (!(mkdir $filename)) {
		utime 0, 0, $filename; # cfs unlock request
		die "can't get cfs lock\n";
	    }
	}

	$got_lock = 1;
    };

    my $err = $@;

    if ($retry) {
	# still inside the retry window - treat as transient failure
	# $self->log('err', $err) if $err; # for debugging
	return 0;
    }

    $last_lock_status->{$lockid} = $got_lock ? $ctime : 0;

    # log only on state transitions (held <-> not held)
    if (!!$got_lock != !!$last) {
	if ($got_lock) {
	    $self->log('info', "successfully acquired lock '$lockid'");
	} else {
	    my $msg = "lost lock '$lockid'";
	    $msg .= " - $err" if $err;
	    $self->log('err', $msg);
	}
    } else {
	# $self->log('err', $err) if $err; # for debugging
    }

    return $got_lock;
}
237
# Try to take the cluster-wide HA manager (CRM master) lock.
sub get_ha_manager_lock {
    my $self = shift;

    return $self->get_pve_lock("ha_manager_lock");
}
243
# Try to take the per-node HA agent (LRM) lock; defaults to the
# local node when no node is given.
sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node //= $self->nodename();

    return $self->get_pve_lock("ha_agent_${node}_lock");
}
251
# Return the cluster quorum state; any error raised by the check is
# swallowed and reported as "not quorate" (0).
sub quorate {
    my ($self) = @_;

    my $quorate = 0;

    # the assignment happens inside eval so a thrown error leaves 0
    eval { $quorate = PVE::Cluster::check_cfs_quorum(); };

    return $quorate;
}
262
# Current wall-clock time in epoch seconds.
sub get_time {
    my $self = shift;

    return time();
}
268
# Block for $delay seconds. CORE::sleep is used explicitly so this
# method does not recurse into itself.
sub sleep {
    my ($self, $delay) = @_;

    CORE::sleep($delay);
}
274
# Sleep in one-second steps until the wall clock reaches $end_time
# (epoch seconds). Returns immediately when $end_time is in the past.
sub sleep_until {
    my ($self, $end_time) = @_;

    while (time() < $end_time) {
	$self->sleep(1);
    }
}
286
# Called at the start of each work loop: refresh the pmxcfs state
# cache and remember when this iteration started.
sub loop_start_hook {
    my ($self) = @_;

    PVE::Cluster::cfs_update();

    $self->{loop_start} = $self->get_time();
}
294
# Called at the end of each work loop; warn when a single iteration
# took suspiciously long (more than 30 seconds).
# Fix: warning text was grammatically broken ("loop take too long").
sub loop_end_hook {
    my ($self) = @_;

    my $delay = $self->get_time() - $self->{loop_start};

    warn "loop took too long ($delay seconds)\n" if $delay > 30;
}
302
# File-scoped handle to the watchdog-mux socket, shared by the
# watchdog_* methods below; undef while no watchdog is active.
my $watchdog_fh;

# Connect to the watchdog multiplexer socket, arming the watchdog.
# Dies when already open or when the socket cannot be reached.
sub watchdog_open {
    my ($self) = @_;

    die "watchdog already open\n" if defined($watchdog_fh);

    $watchdog_fh = IO::Socket::UNIX->new(
	Type => SOCK_STREAM(),
	Peer => "/run/watchdog-mux.sock") ||
	die "unable to open watchdog socket - $!\n";

    $self->log('info', "watchdog active");
}
317
# Send one keep-alive byte to the watchdog multiplexer.
# Returns 1 on success, 0 on any write failure (which is logged).
# NOTE(review): the $wfh argument is unused; the file-scoped handle
# opened by watchdog_open() is written instead - confirm intent.
sub watchdog_update {
    my ($self, $wfh) = @_;

    my $written = $watchdog_fh->syswrite("\0", 1);

    if (!defined($written)) {
	$self->log('err', "watchdog update failed - $!\n");
	return 0;
    }

    if ($written != 1) {
	$self->log('err', "watchdog update failed - write $written bytes\n");
	return 0;
    }

    return 1;
}
333
# Disarm and close the watchdog socket. The magic 'V' byte tells the
# multiplexer this is a clean close. On close failure the handle is
# kept and an error is logged.
sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close

    if ($watchdog_fh->close()) {
	$watchdog_fh = undef;
	$self->log('info', "watchdog closed (disabled)");
    } else {
	$self->log('err', "watchdog close failed - $!");
    }
}
345
# Block until the worker task identified by $upid has finished,
# polling its process once per second via /proc.
# NOTE(review): PVE::ProcFSTools is used but not explicitly loaded in
# this file - presumably pulled in indirectly; confirm.
sub upid_wait {
    my ($self, $upid) = @_;

    # decode pid/pstart from the UPID string
    my $task = PVE::Tools::upid_decode($upid);

    CORE::sleep(1);
    while (PVE::ProcFSTools::check_process_running($task->{pid}, $task->{pstart})) {
	$self->log('debug', "Task still active, waiting");
	CORE::sleep(1);
    }
}
357
# This environment supports forking worker processes.
sub can_fork {
    my $self = shift;

    return 1;
}
363
# Execute a resource agent command for service $sid on the local node.
#
# Parameters:
#   $sid            - service id (e.g. 'vm:100')
#   $service_config - service config hash (must name the local node)
#   $cmd            - one of: started, request_stop, stopped,
#                     migrate, relocate, error
#   @params         - extra args; for migrate/relocate: target node
#
# Returns 0 on success, 1 on failure; dies on setup errors or for
# commands that are not implemented.
sub exec_resource_agent {
    my ($self, $sid, $service_config, $cmd, @params) = @_;

    # setup execution environment

    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    # re-initialize inotify and refresh the pmxcfs state cache;
    # NOTE(review): presumably this runs in a forked worker - confirm
    PVE::INotify::inotify_close();

    PVE::INotify::inotify_init();

    PVE::Cluster::cfs_update();

    my $nodename = $self->{nodename};

    # fixme: return valid_exit code (instead of using die) ?

    my (undef, $service_type, $service_name) = PVE::HA::Tools::parse_sid($sid);

    my $plugin = PVE::HA::Resources->lookup($service_type);
    die "service type '$service_type' not implemented" if !$plugin;

    # fixme: return valid_exit code
    die "service '$sid' not on this node" if $service_config->{node} ne $nodename;

    # for VM/CT resources the service name is the numeric guest id
    my $vmid = $service_name;

    my $running = $plugin->check_running($vmid);

    if ($cmd eq 'started') {

	# fixme: count failures

	# already running - nothing to do
	return 0 if $running;

	$self->log("info", "starting service $sid");

	my $params = {
	    node => $nodename,
	    vmid => $vmid
	};

	$plugin->start($self, $params);

	# verify the start actually took effect
	$running = $plugin->check_running($vmid);

	if ($running) {
	    $self->log("info", "service status $sid started");
	    return 0;
	} else {
	    $self->log("info", "unable to start service $sid");
	    return 1;
	}

    } elsif ($cmd eq 'request_stop' || $cmd eq 'stopped') {

	# already stopped - nothing to do
	return 0 if !$running;

	$self->log("info", "stopping service $sid");

	my $timeout = 60; # fixme: make this configurable

	my $params = {
	    node => $nodename,
	    vmid => $vmid,
	    timeout => $timeout,
	    forceStop => 1,
	};

	$plugin->shutdown($self, $params);

	# verify the shutdown actually took effect
	$running = $plugin->check_running($vmid);

	if (!$running) {
	    $self->log("info", "service status $sid stopped");
	    return 0;
	} else {
	    return 1;
	}

    } elsif ($cmd eq 'migrate' || $cmd eq 'relocate') {

	my $target = $params[0];
	die "$cmd '$sid' failed - missing target\n" if !defined($target);

	if ($service_config->{node} eq $target) {
	    # already there
	    return 0;
	}

	# we always do (live) migration
	my $params = {
	    node => $nodename,
	    vmid => $vmid,
	    target => $target,
	    online => 1,
	};

	my $oldconfig = $plugin->config_file($vmid, $nodename);

	$plugin->migrate($self, $params);

	# something went wrong if old config file is still there
	if (-f $oldconfig) {
	    $self->log("err", "service $sid not moved (migration error)");
	    return 1;
	}

	return 0;

    } elsif ($cmd eq 'error') {

	# only report the error state; recovery is decided elsewhere
	if($running) {
	    $self->log("err", "service $sid is in an error state while running");
	} else {
	    $self->log("info", "service $sid is not running and in an error state");
	}
	return 0;

    }

    die "implement me (cmd '$cmd')";
}
488
489 1;