1 package PVE
::QemuMigrate
;
5 use POSIX
qw(strftime);
8 use PVE
::Tools
qw(run_command);
15 my $delayed_interrupt = 0;
17 # blowfish is a fast block cipher, much faster then 3des
18 my @ssh_opts = ('-c', 'blowfish', '-o', 'BatchMode=yes');
19 my @ssh_cmd = ('/usr/bin/ssh', @ssh_opts);
20 my @scp_cmd = ('/usr/bin/scp', @ssh_opts);
21 my $qm_cmd = '/usr/sbin/qm';
24 my ($level, $msg) = @_;
30 my $tstr = strftime
("%b %d %H:%M:%S", localtime);
34 foreach my $line (split (/\n/, $msg)) {
35 print STDOUT
"$tstr $line\n";
44 local $SIG{INT
} = $SIG{TERM
} = $SIG{QUIT
} = $SIG{HUP
} = sub {
45 $delayed_interrupt = 0;
46 logmsg
('err', "received interrupt");
47 die "interrupted by signal\n";
49 local $SIG{PIPE
} = sub {
50 $delayed_interrupt = 0;
51 logmsg
('err', "received broken pipe interrupt");
52 die "interrupted by signal\n";
55 my $di = $delayed_interrupt;
56 $delayed_interrupt = 0;
58 die "interrupted by signal\n" if $di;
64 sub fork_command_pipe
{
67 my $reader = IO
::File-
>new();
68 my $writer = IO
::File-
>new();
74 eval { $cpid = open2
($reader, $writer, @$cmd); };
79 if ($orig_pid != $$) {
80 logmsg
('err', "can't fork command pipe\n");
87 return { writer
=> $writer, reader
=> $reader, pid
=> $cpid };
90 sub finish_command_pipe
{
93 my $writer = $cmdpipe->{writer
};
94 my $reader = $cmdpipe->{reader
};
99 my $cpid = $cmdpipe->{pid
};
101 kill(15, $cpid) if kill(0, $cpid);
106 sub run_with_timeout
{
107 my ($timeout, $code, @param) = @_;
109 die "got timeout\n" if $timeout <= 0;
118 local $SIG{ALRM
} = sub { $sigcount++; die "got timeout\n"; };
119 local $SIG{PIPE
} = sub { $sigcount++; die "broken pipe\n" };
120 local $SIG{__DIE__
}; # see SA bug 4631
122 $prev_alarm = alarm($timeout);
124 $res = &$code(@param);
126 alarm(0); # avoid race conditions
131 alarm($prev_alarm) if defined($prev_alarm);
133 die "unknown error" if $sigcount && !$err; # seems to happen sometimes
141 my ($nodeip, $lport, $rport) = @_;
143 my $cmd = [@ssh_cmd, '-o', 'BatchMode=yes',
144 '-L', "$lport:localhost:$rport", $nodeip,
147 my $tunnel = fork_command_pipe
($cmd);
149 my $reader = $tunnel->{reader
};
153 run_with_timeout
(60, sub { $helo = <$reader>; });
154 die "no reply\n" if !$helo;
155 die "no quorum on target node\n" if $helo =~ m/^no quorum$/;
156 die "got strange reply from mtunnel ('$helo')\n"
157 if $helo !~ m/^tunnel online$/;
162 finish_command_pipe
($tunnel);
163 die "can't open migration tunnel - $err";
171 my $writer = $tunnel->{writer
};
174 run_with_timeout
(30, sub {
175 print $writer "quit\n";
181 finish_command_pipe
($tunnel);
187 my ($node, $nodeip, $vmid, $online, $force) = @_;
189 my $starttime = time();
191 my $rem_ssh = [@ssh_cmd, "root\@$nodeip"];
193 local $SIG{INT
} = $SIG{TERM
} = $SIG{QUIT
} = $SIG{HUP
} = $SIG{PIPE
} = sub {
194 logmsg
('err', "received interrupt - delayed");
195 $delayed_interrupt = 1;
198 local $ENV{RSYNC_RSH
} = join(' ', @ssh_cmd);
205 storecfg
=> PVE
::Storage
::config
(),
211 # lock config during migration
212 eval { PVE
::QemuServer
::lock_config
($vmid, sub {
214 eval_int
(sub { prepare
($session); });
217 my $conf = PVE
::QemuServer
::load_config
($vmid);
219 PVE
::QemuServer
::check_lock
($conf);
222 if (my $pid = PVE
::QemuServer
::check_running
($vmid)) {
223 die "cant migrate running VM without --online\n" if !$online;
228 eval_int
(sub { phase1
($session, $conf, $rhash, $running); });
232 if ($rhash->{clearlock
}) {
233 my $unset = { lock => 1 };
234 eval { PVE
::QemuServer
::change_config_nolock
($session->{vmid
}, {}, $unset, 1) };
235 logmsg
('err', $@) if $@;
237 if ($rhash->{volumes
}) {
238 foreach my $volid (@{$rhash->{volumes
}}) {
239 logmsg
('err', "found stale volume copy '$volid' on node '$session->{node}'");
245 # vm is now owned by other node
246 my $volids = $rhash->{volumes
};
251 eval_int
(sub { phase2
($session, $conf, $rhash); });
255 if ($rhash->{tunnel
}) {
256 eval_int
(sub { finish_tunnel
($rhash->{tunnel
}) });
258 logmsg
('err', "stopping tunnel failed - $@");
263 # fixme: ther is no config file, so this will never work
264 # fixme: use kill(9, $running) to make sure it is stopped
265 # always stop local VM - no interrupts possible
266 eval { PVE
::QemuServer
::vm_stop
($session->{vmid
}, 1); };
268 logmsg
('err', "stopping vm failed - $@");
274 logmsg
('err', "online migrate failure - $err");
278 # finalize -- clear migrate lock
280 my $cmd = [ @{$session->{rem_ssh
}}, $qm_cmd, 'unlock', $session->{vmid
} ];
284 logmsg
('err', "failed to clear migrate lock - $@");
288 # destroy local copies
289 foreach my $volid (@$volids) {
290 eval_int
(sub { PVE
::Storage
::vdisk_free
($session->{storecfg
}, $volid); });
294 logmsg
('err', "removing local copy of '$volid' failed - $err");
297 last if $err =~ /^interrupted by signal$/;
304 my $delay = time() - $starttime;
305 my $mins = int($delay/60);
306 my $secs = $delay - $mins*60;
307 my $hours = int($mins/60);
308 $mins = $mins - $hours*60;
310 my $duration = sprintf "%02d:%02d:%02d", $hours, $mins, $secs;
313 my $msg = "migration aborted (duration $duration): $err\n";
319 my $msg = "migration finished with problems (duration $duration)\n";
324 logmsg
('info', "migration finished successfuly (duration $duration)");
330 my $conffile = PVE
::QemuServer
::config_file
($session->{vmid
});
331 die "VM $session->{vmid} does not exist on this node\n" if ! -f
$conffile;
333 # test ssh connection
334 my $cmd = [ @{$session->{rem_ssh
}}, '/bin/true' ];
335 eval { run_command
($cmd); };
336 die "Can't connect to destination address using public key\n" if $@;
340 my ($session, $conf, $rhash, $running) = @_;
342 logmsg
('info', "copying disk images");
351 # get list from PVE::Storage (for unused volumes)
352 my $dl = PVE
::Storage
::vdisk_list
($session->{storecfg
}, undef, $session->{vmid
});
353 PVE
::Storage
::foreach_volid
($dl, sub {
354 my ($volid, $sid, $volname) = @_;
356 my $scfg = PVE
::Storage
::storage_config
($session->{storecfg
}, $sid);
358 return if $scfg->{shared
};
360 $volhash->{$volid} = 1;
363 # and add used,owned/non-shared disks (just to be sure we have all)
366 PVE
::QemuServer
::foreach_drive
($conf, sub {
367 my ($ds, $drive) = @_;
369 my $volid = $drive->{file
};
372 die "cant migrate local file/device '$volid'\n" if $volid =~ m
|^/|;
374 if (PVE
::QemuServer
::drive_is_cdrom
($drive)) {
375 die "cant migrate local cdrom drive\n" if $volid eq 'cdrom';
376 return if $volid eq 'none';
377 $cdromhash->{$volid} = 1;
380 my ($sid, $volname) = PVE
::Storage
::parse_volume_id
($volid);
382 my $scfg = PVE
::Storage
::storage_config
($session->{storecfg
}, $sid);
384 return if $scfg->{shared
};
386 die "can't migrate local cdrom '$volid'\n" if $cdromhash->{$volid};
390 my ($path, $owner) = PVE
::Storage
::path
($session->{storecfg
}, $volid);
392 die "can't migrate volume '$volid' - owned by other VM (owner = VM $owner)\n"
393 if !$owner || ($owner != $session->{vmid
});
395 $volhash->{$volid} = 1;
398 if ($running && !$sharedvm) {
399 die "can't do online migration - VM uses local disks\n";
402 # do some checks first
403 foreach my $volid (keys %$volhash) {
404 my ($sid, $volname) = PVE
::Storage
::parse_volume_id
($volid);
405 my $scfg = PVE
::Storage
::storage_config
($session->{storecfg
}, $sid);
407 die "can't migrate '$volid' - storagy type '$scfg->{type}' not supported\n"
408 if $scfg->{type
} ne 'dir';
411 foreach my $volid (keys %$volhash) {
412 my ($sid, $volname) = PVE
::Storage
::parse_volume_id
($volid);
413 push @{$rhash->{volumes
}}, $volid;
414 PVE
::Storage
::storage_migrate
($session->{storecfg
}, $volid, $session->{nodeip
}, $sid);
417 die "Failed to sync data - $@" if $@;
421 my ($session, $conf, $rhash, $running) = @_;
423 logmsg
('info', "starting migration of VM $session->{vmid} to node '$session->{node}' ($session->{nodeip})");
425 if (my $loc_res = PVE
::QemuServer
::check_local_resources
($conf, 1)) {
426 if ($running || !$session->{force
}) {
427 die "can't migrate VM which uses local devices\n";
429 logmsg
('info', "migrating VM which uses local devices");
433 # set migrate lock in config file
434 $rhash->{clearlock
} = 1;
436 PVE
::QemuServer
::change_config_nolock
($session->{vmid
}, { lock => 'migrate' }, {}, 1);
438 sync_disks
($session, $conf, $rhash, $running);
440 # move config to remote node
441 my $conffile = PVE
::QemuServer
::config_file
($session->{vmid
});
442 my $newconffile = PVE
::QemuServer
::config_file
($session->{vmid
}, $session->{node
});
444 die "Failed to move config to node '$session->{node}' - rename failed: $!\n"
445 if !rename($conffile, $newconffile);
449 my ($session, $conf, $rhash) = shift;
451 logmsg
('info', "starting VM on remote node '$session->{node}'");
455 ## start on remote node
456 my $cmd = [@{$session->{rem_ssh
}}, $qm_cmd, 'start',
457 $session->{vmid
}, '--stateuri', 'tcp', '--skiplock'];
459 run_command
($cmd, outfunc
=> sub {
462 if ($line =~ m/^migration listens on port (\d+)$/) {
467 die "unable to detect remote migration port\n" if !$rport;
469 logmsg
('info', "starting migration tunnel");
471 ## create tunnel to remote port
472 my $lport = PVE
::QemuServer
::next_migrate_port
();
473 $rhash->{tunnel
} = fork_tunnel
($session->{nodeip
}, $lport, $rport);
475 logmsg
('info', "starting online/live migration");
480 PVE
::QemuServer
::vm_monitor_command
($session->{vmid
}, "migrate -d \"tcp:localhost:$lport\"");
485 my $stat = PVE::QemuServer::vm_monitor_command($session->{vmid}, "info migrate
", 1);
486 if ($stat =~ m/^Migration status: (active|completed|failed|cancelled)$/im) {
489 if ($stat ne $lstat) {
490 if ($ms eq 'active') {
491 my ($trans, $rem, $total) = (0, 0, 0);
492 $trans = $1 if $stat =~ m/^transferred ram: (\d+) kbytes$/im;
493 $rem = $1 if $stat =~ m/^remaining ram: (\d+) kbytes$/im;
494 $total = $1 if $stat =~ m/^total ram: (\d+) kbytes$/im;
496 logmsg('info', "migration status
: $ms (transferred
${trans
}KB
, " .
497 "remaining
${rem
}KB
), total
${total
}KB
)");
499 logmsg('info', "migration status
: $ms");
503 if ($ms eq 'completed') {
504 my $delay = time() - $start;
506 my $mbps = sprintf "%.2f", $conf->{memory}/$delay;
507 logmsg('info', "migration speed
: $mbps MB
/s
");
511 if ($ms eq 'failed' || $ms eq 'cancelled') {
515 last if $ms ne 'active';
517 die "unable to parse migration status
'$stat' - aborting
\n";