package PVE::HA::Sim::Env;

use strict;
use warnings;
use POSIX qw(strftime EINTR);
use Data::Dumper;
use JSON;
use IO::File;
use Fcntl qw(:DEFAULT :flock);

use PVE::HA::Tools;
use PVE::HA::Env;

sub new {
    my ($this, $nodename, $hardware, $log_id) = @_;

    die "missing nodename" if !$nodename;
    die "missing log_id" if !$log_id;

    my $class = ref($this) || $this;

    my $self = bless {}, $class;

    $self->{statusdir} = $hardware->statusdir();
    $self->{nodename} = $nodename;

    $self->{hardware} = $hardware;
    $self->{lock_timeout} = 120;

    $self->{log_id} = $log_id;

    return $self;
}
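
# Illustrative usage (a sketch, not a documented API; $hardware here is
# assumed to be a PVE::HA::Sim::Hardware instance as used by the simulator):
#
#   my $env = PVE::HA::Sim::Env->new('node1', $hardware, 'crm');
#   print $env->nodename(), "\n";   # -> node1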

sub nodename {
    my ($self) = @_;

    return $self->{nodename};
}

sub sim_get_lock {
    my ($self, $lock_name, $unlock) = @_;

    return 0 if !$self->quorate();

    my $filename = "$self->{statusdir}/cluster_locks";

    my $code = sub {

        my $data = PVE::HA::Tools::read_json_from_file($filename, {});

        my $res;

        my $nodename = $self->nodename();
        my $ctime = $self->get_time();

        if ($unlock) {

            if (my $d = $data->{$lock_name}) {
                my $tdiff = $ctime - $d->{time};

                if ($tdiff > $self->{lock_timeout}) {
                    $res = 1;
                } elsif (($tdiff <= $self->{lock_timeout}) && ($d->{node} eq $nodename)) {
                    delete $data->{$lock_name};
                    $res = 1;
                } else {
                    $res = 0;
                }
            }

        } else {

            if (my $d = $data->{$lock_name}) {

                my $tdiff = $ctime - $d->{time};

                if ($tdiff <= $self->{lock_timeout}) {
                    if ($d->{node} eq $nodename) {
                        $d->{time} = $ctime;
                        $res = 1;
                    } else {
                        $res = 0;
                    }
                } else {
                    $self->log('info', "got lock '$lock_name'");
                    $d->{node} = $nodename;
                    $d->{time} = $ctime;
                    $res = 1;
                }

            } else {
                $data->{$lock_name} = {
                    time => $ctime,
                    node => $nodename,
                };
                $self->log('info', "got lock '$lock_name'");
                $res = 1;
            }
        }

        PVE::HA::Tools::write_json_to_file($filename, $data);

        return $res;
    };

    return $self->{hardware}->global_lock($code);
}
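
# Lock-semantics sketch (illustrative timings, assuming the default 120 second
# lock_timeout): a holder keeps its lock by re-acquiring it, which refreshes
# the stored time; another node can only take the lock over once the holder
# has not renewed it for longer than the timeout.
#
#   $env->sim_get_lock('ha_manager_lock');     # t=0:   acquired, time recorded
#   $env->sim_get_lock('ha_manager_lock');     # t=60:  renewed by the holder
#   # a different node asking at t=100 gets 0 (lock still within the timeout);
#   # at t=200 without renewal it would take the lock and log "got lock ..."
#   $env->sim_get_lock('ha_manager_lock', 1);  # explicit release by the holder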

sub read_manager_status {
    my ($self) = @_;

    my $filename = "$self->{statusdir}/manager_status";

    return PVE::HA::Tools::read_json_from_file($filename, {});
}

sub write_manager_status {
    my ($self, $status_obj) = @_;

    my $filename = "$self->{statusdir}/manager_status";

    PVE::HA::Tools::write_json_to_file($filename, $status_obj);
}

sub read_lrm_status {
    my ($self, $node) = @_;

    $node = $self->{nodename} if !defined($node);

    return $self->{hardware}->read_lrm_status($node);
}

sub write_lrm_status {
    my ($self, $status_obj) = @_;

    my $node = $self->{nodename};

    return $self->{hardware}->write_lrm_status($node, $status_obj);
}

sub is_node_shutdown {
    my ($self) = @_;

    return 0; # default to freezing services if not overridden by a subclass
}
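
# A subclass override might look like the following sketch (illustrative only;
# whether the hardware layer exposes such a query is an assumption here):
#
#   sub is_node_shutdown {
#       my ($self) = @_;
#       # report a pending shutdown so the LRM can react instead of freezing
#       return $self->{hardware}->is_node_shutdown($self->{nodename});
#   }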

sub service_config_exists {
    my ($self) = @_;

    return 1;
}

sub read_service_config {
    my ($self) = @_;

    return $self->{hardware}->read_service_config();
}

sub read_group_config {
    my ($self) = @_;

    return $self->{hardware}->read_group_config();
}

sub change_service_location {
    my ($self, $sid, $current_node, $new_node) = @_;

    return $self->{hardware}->change_service_location($sid, $current_node, $new_node);
}

sub queue_crm_commands {
    my ($self, $cmd) = @_;

    return $self->{hardware}->queue_crm_commands($cmd);
}

sub read_crm_commands {
    my ($self) = @_;

    return $self->{hardware}->read_crm_commands();
}

sub log {
    my ($self, $level, $msg) = @_;

    chomp $msg;

    my $time = $self->get_time();

    # pass $msg as a printf argument instead of interpolating it into the
    # format string, so a literal '%' in a message cannot be misparsed
    printf("%-5s %5d %12s: %s\n", $level, $time, "$self->{nodename}/$self->{log_id}", $msg);
}
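
# Example output line (values are illustrative):
#
#   info     20    node1/crm: got lock 'ha_manager_lock'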

sub get_time {
    my ($self) = @_;

    die "implement in subclass";
}

sub sleep {
    my ($self, $delay) = @_;

    die "implement in subclass";
}

sub sleep_until {
    my ($self, $end_time) = @_;

    die "implement in subclass";
}

sub get_ha_manager_lock {
    my ($self) = @_;

    return $self->sim_get_lock('ha_manager_lock');
}

# Release the cluster-wide manager lock.
# Once released, another CRM may step up and acquire the lock, so this should
# only be called when shutting down/deactivating the current master.
sub release_ha_manager_lock {
    my ($self) = @_;

    return $self->sim_get_lock('ha_manager_lock', 1);
}
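
# Shutdown sketch (hedged; drawn from the comment above rather than from CRM
# code in this file): a master that received a shutdown request would release
# the lock so a peer CRM can take over without waiting for the lock timeout:
#
#   if ($shutdown_request) {   # $shutdown_request is a hypothetical flag
#       $env->release_ha_manager_lock();
#   }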

sub get_ha_agent_lock_name {
    my ($self, $node) = @_;

    $node = $self->nodename() if !$node;

    return "ha_agent_${node}_lock";
}
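
# For example, get_ha_agent_lock_name('node1') yields "ha_agent_node1_lock".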

sub get_ha_agent_lock {
    my ($self, $node) = @_;

    my $lck = $self->get_ha_agent_lock_name($node);
    return $self->sim_get_lock($lck);
}

# return true when cluster is quorate
sub quorate {
    my ($self) = @_;

    my ($node_info, $quorate) = $self->{hardware}->get_node_info();
    my $node = $self->nodename();
    return 0 if !$node_info->{$node}->{online};
    return $quorate;
}

sub get_node_info {
    my ($self) = @_;

    return $self->{hardware}->get_node_info();
}

sub loop_start_hook {
    my ($self, $starttime) = @_;

    # do nothing by default, override in subclass
}

sub loop_end_hook {
    my ($self) = @_;

    # do nothing by default, override in subclass
}

sub watchdog_open {
    my ($self) = @_;

    my $node = $self->nodename();

    return $self->{hardware}->watchdog_open($node);
}

sub watchdog_update {
    my ($self, $wfh) = @_;

    return $self->{hardware}->watchdog_update($wfh);
}

sub watchdog_close {
    my ($self, $wfh) = @_;

    return $self->{hardware}->watchdog_close($wfh);
}

sub can_fork {
    my ($self) = @_;

    return 1;
}

sub exec_resource_agent {
    my ($self, $sid, $cd, $cmd, @params) = @_;

    my $hardware = $self->{hardware};

    my $nodename = $self->{nodename};

    # fixme: return valid exit code (instead of using die)

    my $ss = $hardware->read_service_status($nodename);

    if ($cmd eq 'started') {

        # fixme: return valid exit code
        die "service '$sid' not on this node" if $cd->{node} ne $nodename;

        if ($ss->{$sid}) {
            return 0;
        }
        $self->log("info", "starting service $sid");

        $self->sleep(2);

        $ss->{$sid} = 1;
        $hardware->write_service_status($nodename, $ss);

        $self->log("info", "service status $sid started");

        return 0;

    } elsif ($cmd eq 'request_stop' || $cmd eq 'stopped') {

        # fixme: return valid exit code
        die "service '$sid' not on this node" if $cd->{node} ne $nodename;

        if (!$ss->{$sid}) {
            return 0;
        }
        $self->log("info", "stopping service $sid");

        $self->sleep(2);

        $ss->{$sid} = 0;
        $hardware->write_service_status($nodename, $ss);

        $self->log("info", "service status $sid stopped");

        return 0;

    } elsif ($cmd eq 'migrate' || $cmd eq 'relocate') {

        if ($cd->{type} eq 'ct' && $cmd eq 'migrate' && $ss->{$sid}) {
            $self->log('err', "unable to live migrate a running container");
            return 1;
        }

        my $target = $params[0];
        die "$cmd '$sid' failed - missing target\n" if !defined($target);

        if ($cd->{node} eq $target) {
            # already migrated
            return 0;
        } elsif ($cd->{node} eq $nodename) {

            $self->log("info", "service $sid - start $cmd to node '$target'");

            if ($cmd eq 'relocate') {

                if ($ss->{$sid}) {
                    $self->log("info", "stopping service $sid (relocate)");
                    $self->sleep(1); # time to stop service
                    $ss->{$sid} = 0;
                    $hardware->write_service_status($nodename, $ss);
                }

                $self->log("info", "service status $sid stopped");

            } else {
                $self->sleep(2); # (live) migration time
            }

            $self->change_service_location($sid, $nodename, $target);
            $self->log("info", "service $sid - end $cmd to node '$target'");
            # ensure the old node no longer has the service
            $ss->{$sid} = 0;
            $hardware->write_service_status($nodename, $ss);

            return 0;

        } else {
            die "migrate '$sid' failed - service is not on this node\n";
        }
    }

    die "implement me (cmd '$cmd')";
}
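
# Dispatch sketch (illustrative; the sid 'vm:100' and node names are made up):
#
#   $env->exec_resource_agent('vm:100', { node => 'node1', type => 'vm' }, 'started');
#   $env->exec_resource_agent('vm:100', { node => 'node1', type => 'vm' }, 'migrate', 'node2');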

1;