migration/colo.c

   1 /*
   2  * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
   3  * (a.k.a. Fault Tolerance or Continuous Replication)
   4  *
   5  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
   6  * Copyright (c) 2016 FUJITSU LIMITED
   7  * Copyright (c) 2016 Intel Corporation
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2 or
  10  * later.  See the COPYING file in the top-level directory.
  11  */
  12
  13 #include "qemu/osdep.h"
  14 #include "qemu/timer.h"
  15 #include "sysemu/sysemu.h"
  16 #include "migration/colo.h"
  17 #include "io/channel-buffer.h"
  18 #include "trace.h"
  19 #include "qemu/error-report.h"
  20 #include "qapi/error.h"
  21 #include "migration/failover.h"
  22 #include "replication.h"
  23 #include "qmp-commands.h"
  24
/*
 * True while the COLO incoming thread is inside qemu_loadvm_state().
 * secondary_vm_do_failover() checks it and postpones the failover
 * (FAILOVER_STATUS_RELAUNCH) instead of interrupting the load.
 */
static bool vmstate_loading;

/* Size passed to qio_channel_buffer_new() for the VM state buffers. */
#define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
  28
  29 bool colo_supported(void)
  30 {
  31     return true;
  32 }
  33
  34 bool migration_in_colo_state(void)
  35 {
  36     MigrationState *s = migrate_get_current();
  37
  38     return (s->state == MIGRATION_STATUS_COLO);
  39 }
  40
  41 bool migration_incoming_in_colo_state(void)
  42 {
  43     MigrationIncomingState *mis = migration_incoming_get_current();
  44
  45     return mis && (mis->state == MIGRATION_STATUS_COLO);
  46 }
  47
  48 static bool colo_runstate_is_stopped(void)
  49 {
  50     return runstate_check(RUN_STATE_COLO) || !runstate_is_running();
  51 }
  52
  53 static void secondary_vm_do_failover(void)
  54 {
  55     int old_state;
  56     MigrationIncomingState *mis = migration_incoming_get_current();
  57
  58     /* Can not do failover during the process of VM's loading VMstate, Or
  59      * it will break the secondary VM.
  60      */
  61     if (vmstate_loading) {
  62         old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
  63                         FAILOVER_STATUS_RELAUNCH);
  64         if (old_state != FAILOVER_STATUS_ACTIVE) {
  65             error_report("Unknown error while do failover for secondary VM,"
  66                          "old_state: %s", FailoverStatus_lookup[old_state]);
  67         }
  68         return;
  69     }
  70
  71     migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
  72                       MIGRATION_STATUS_COMPLETED);
  73
  74     if (!autostart) {
  75         error_report("\"-S\" qemu option will be ignored in secondary side");
  76         /* recover runstate to normal migration finish state */
  77         autostart = true;
  78     }
  79     /*
  80      * Make sure COLO incoming thread not block in recv or send,
  81      * If mis->from_src_file and mis->to_src_file use the same fd,
  82      * The second shutdown() will return -1, we ignore this value,
  83      * It is harmless.
  84      */
  85     if (mis->from_src_file) {
  86         qemu_file_shutdown(mis->from_src_file);
  87     }
  88     if (mis->to_src_file) {
  89         qemu_file_shutdown(mis->to_src_file);
  90     }
  91
  92     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
  93                                    FAILOVER_STATUS_COMPLETED);
  94     if (old_state != FAILOVER_STATUS_ACTIVE) {
  95         error_report("Incorrect state (%s) while doing failover for "
  96                      "secondary VM", FailoverStatus_lookup[old_state]);
  97         return;
  98     }
  99     /* Notify COLO incoming thread that failover work is finished */
 100     qemu_sem_post(&mis->colo_incoming_sem);
 101     /* For Secondary VM, jump to incoming co */
 102     if (mis->migration_incoming_co) {
 103         qemu_coroutine_enter(mis->migration_incoming_co);
 104     }
 105 }
 106
 107 static void primary_vm_do_failover(void)
 108 {
 109     MigrationState *s = migrate_get_current();
 110     int old_state;
 111
 112     migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
 113                       MIGRATION_STATUS_COMPLETED);
 114
 115     /*
 116      * Wake up COLO thread which may blocked in recv() or send(),
 117      * The s->rp_state.from_dst_file and s->to_dst_file may use the
 118      * same fd, but we still shutdown the fd for twice, it is harmless.
 119      */
 120     if (s->to_dst_file) {
 121         qemu_file_shutdown(s->to_dst_file);
 122     }
 123     if (s->rp_state.from_dst_file) {
 124         qemu_file_shutdown(s->rp_state.from_dst_file);
 125     }
 126
 127     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
 128                                    FAILOVER_STATUS_COMPLETED);
 129     if (old_state != FAILOVER_STATUS_ACTIVE) {
 130         error_report("Incorrect state (%s) while doing failover for Primary VM",
 131                      FailoverStatus_lookup[old_state]);
 132         return;
 133     }
 134     /* Notify COLO thread that failover work is finished */
 135     qemu_sem_post(&s->colo_exit_sem);
 136 }
 137
 138 void colo_do_failover(MigrationState *s)
 139 {
 140     /* Make sure VM stopped while failover happened. */
 141     if (!colo_runstate_is_stopped()) {
 142         vm_stop_force_state(RUN_STATE_COLO);
 143     }
 144
 145     if (get_colo_mode() == COLO_MODE_PRIMARY) {
 146         primary_vm_do_failover();
 147     } else {
 148         secondary_vm_do_failover();
 149     }
 150 }
 151
 152 void qmp_xen_set_replication(bool enable, bool primary,
 153                              bool has_failover, bool failover,
 154                              Error **errp)
 155 {
 156     ReplicationMode mode = primary ?
 157                            REPLICATION_MODE_PRIMARY :
 158                            REPLICATION_MODE_SECONDARY;
 159
 160     if (has_failover && enable) {
 161         error_setg(errp, "Parameter 'failover' is only for"
 162                    " stopping replication");
 163         return;
 164     }
 165
 166     if (enable) {
 167         replication_start_all(mode, errp);
 168     } else {
 169         if (!has_failover) {
 170             failover = NULL;
 171         }
 172         replication_stop_all(failover, failover ? NULL : errp);
 173     }
 174 }
 175
 176 static void colo_send_message(QEMUFile *f, COLOMessage msg,
 177                               Error **errp)
 178 {
 179     int ret;
 180
 181     if (msg >= COLO_MESSAGE__MAX) {
 182         error_setg(errp, "%s: Invalid message", __func__);
 183         return;
 184     }
 185     qemu_put_be32(f, msg);
 186     qemu_fflush(f);
 187
 188     ret = qemu_file_get_error(f);
 189     if (ret < 0) {
 190         error_setg_errno(errp, -ret, "Can't send COLO message");
 191     }
 192     trace_colo_send_message(COLOMessage_lookup[msg]);
 193 }
 194
 195 static void colo_send_message_value(QEMUFile *f, COLOMessage msg,
 196                                     uint64_t value, Error **errp)
 197 {
 198     Error *local_err = NULL;
 199     int ret;
 200
 201     colo_send_message(f, msg, &local_err);
 202     if (local_err) {
 203         error_propagate(errp, local_err);
 204         return;
 205     }
 206     qemu_put_be64(f, value);
 207     qemu_fflush(f);
 208
 209     ret = qemu_file_get_error(f);
 210     if (ret < 0) {
 211         error_setg_errno(errp, -ret, "Failed to send value for message:%s",
 212                          COLOMessage_lookup[msg]);
 213     }
 214 }
 215
 216 static COLOMessage colo_receive_message(QEMUFile *f, Error **errp)
 217 {
 218     COLOMessage msg;
 219     int ret;
 220
 221     msg = qemu_get_be32(f);
 222     ret = qemu_file_get_error(f);
 223     if (ret < 0) {
 224         error_setg_errno(errp, -ret, "Can't receive COLO message");
 225         return msg;
 226     }
 227     if (msg >= COLO_MESSAGE__MAX) {
 228         error_setg(errp, "%s: Invalid message", __func__);
 229         return msg;
 230     }
 231     trace_colo_receive_message(COLOMessage_lookup[msg]);
 232     return msg;
 233 }
 234
 235 static void colo_receive_check_message(QEMUFile *f, COLOMessage expect_msg,
 236                                        Error **errp)
 237 {
 238     COLOMessage msg;
 239     Error *local_err = NULL;
 240
 241     msg = colo_receive_message(f, &local_err);
 242     if (local_err) {
 243         error_propagate(errp, local_err);
 244         return;
 245     }
 246     if (msg != expect_msg) {
 247         error_setg(errp, "Unexpected COLO message %d, expected %d",
 248                           msg, expect_msg);
 249     }
 250 }
 251
 252 static uint64_t colo_receive_message_value(QEMUFile *f, uint32_t expect_msg,
 253                                            Error **errp)
 254 {
 255     Error *local_err = NULL;
 256     uint64_t value;
 257     int ret;
 258
 259     colo_receive_check_message(f, expect_msg, &local_err);
 260     if (local_err) {
 261         error_propagate(errp, local_err);
 262         return 0;
 263     }
 264
 265     value = qemu_get_be64(f);
 266     ret = qemu_file_get_error(f);
 267     if (ret < 0) {
 268         error_setg_errno(errp, -ret, "Failed to get value for COLO message: %s",
 269                          COLOMessage_lookup[expect_msg]);
 270     }
 271     return value;
 272 }
 273
 274 static int colo_do_checkpoint_transaction(MigrationState *s,
 275                                           QIOChannelBuffer *bioc,
 276                                           QEMUFile *fb)
 277 {
 278     Error *local_err = NULL;
 279     int ret = -1;
 280
 281     colo_send_message(s->to_dst_file, COLO_MESSAGE_CHECKPOINT_REQUEST,
 282                       &local_err);
 283     if (local_err) {
 284         goto out;
 285     }
 286
 287     colo_receive_check_message(s->rp_state.from_dst_file,
 288                     COLO_MESSAGE_CHECKPOINT_REPLY, &local_err);
 289     if (local_err) {
 290         goto out;
 291     }
 292     /* Reset channel-buffer directly */
 293     qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
 294     bioc->usage = 0;
 295
 296     qemu_mutex_lock_iothread();
 297     if (failover_get_state() != FAILOVER_STATUS_NONE) {
 298         qemu_mutex_unlock_iothread();
 299         goto out;
 300     }
 301     vm_stop_force_state(RUN_STATE_COLO);
 302     qemu_mutex_unlock_iothread();
 303     trace_colo_vm_state_change("run", "stop");
 304     /*
 305      * Failover request bh could be called after vm_stop_force_state(),
 306      * So we need check failover_request_is_active() again.
 307      */
 308     if (failover_get_state() != FAILOVER_STATUS_NONE) {
 309         goto out;
 310     }
 311
 312     /* Disable block migration */
 313     s->params.blk = 0;
 314     s->params.shared = 0;
 315     qemu_savevm_state_header(fb);
 316     qemu_savevm_state_begin(fb, &s->params);
 317     qemu_mutex_lock_iothread();
 318     qemu_savevm_state_complete_precopy(fb, false);
 319     qemu_mutex_unlock_iothread();
 320
 321     qemu_fflush(fb);
 322
 323     colo_send_message(s->to_dst_file, COLO_MESSAGE_VMSTATE_SEND, &local_err);
 324     if (local_err) {
 325         goto out;
 326     }
 327     /*
 328      * We need the size of the VMstate data in Secondary side,
 329      * With which we can decide how much data should be read.
 330      */
 331     colo_send_message_value(s->to_dst_file, COLO_MESSAGE_VMSTATE_SIZE,
 332                             bioc->usage, &local_err);
 333     if (local_err) {
 334         goto out;
 335     }
 336
 337     qemu_put_buffer(s->to_dst_file, bioc->data, bioc->usage);
 338     qemu_fflush(s->to_dst_file);
 339     ret = qemu_file_get_error(s->to_dst_file);
 340     if (ret < 0) {
 341         goto out;
 342     }
 343
 344     colo_receive_check_message(s->rp_state.from_dst_file,
 345                        COLO_MESSAGE_VMSTATE_RECEIVED, &local_err);
 346     if (local_err) {
 347         goto out;
 348     }
 349
 350     colo_receive_check_message(s->rp_state.from_dst_file,
 351                        COLO_MESSAGE_VMSTATE_LOADED, &local_err);
 352     if (local_err) {
 353         goto out;
 354     }
 355
 356     ret = 0;
 357
 358     qemu_mutex_lock_iothread();
 359     vm_start();
 360     qemu_mutex_unlock_iothread();
 361     trace_colo_vm_state_change("stop", "run");
 362
 363 out:
 364     if (local_err) {
 365         error_report_err(local_err);
 366     }
 367     return ret;
 368 }
 369
 370 static void colo_process_checkpoint(MigrationState *s)
 371 {
 372     QIOChannelBuffer *bioc;
 373     QEMUFile *fb = NULL;
 374     int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 375     Error *local_err = NULL;
 376     int ret;
 377
 378     failover_init_state();
 379
 380     s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
 381     if (!s->rp_state.from_dst_file) {
 382         error_report("Open QEMUFile from_dst_file failed");
 383         goto out;
 384     }
 385
 386     /*
 387      * Wait for Secondary finish loading VM states and enter COLO
 388      * restore.
 389      */
 390     colo_receive_check_message(s->rp_state.from_dst_file,
 391                        COLO_MESSAGE_CHECKPOINT_READY, &local_err);
 392     if (local_err) {
 393         goto out;
 394     }
 395     bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
 396     fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
 397     object_unref(OBJECT(bioc));
 398
 399     qemu_mutex_lock_iothread();
 400     vm_start();
 401     qemu_mutex_unlock_iothread();
 402     trace_colo_vm_state_change("stop", "run");
 403
 404     timer_mod(s->colo_delay_timer,
 405             current_time + s->parameters.x_checkpoint_delay);
 406
 407     while (s->state == MIGRATION_STATUS_COLO) {
 408         if (failover_get_state() != FAILOVER_STATUS_NONE) {
 409             error_report("failover request");
 410             goto out;
 411         }
 412
 413         qemu_sem_wait(&s->colo_checkpoint_sem);
 414
 415         ret = colo_do_checkpoint_transaction(s, bioc, fb);
 416         if (ret < 0) {
 417             goto out;
 418         }
 419     }
 420
 421 out:
 422     /* Throw the unreported error message after exited from loop */
 423     if (local_err) {
 424         error_report_err(local_err);
 425     }
 426
 427     if (fb) {
 428         qemu_fclose(fb);
 429     }
 430
 431     timer_del(s->colo_delay_timer);
 432
 433     /* Hope this not to be too long to wait here */
 434     qemu_sem_wait(&s->colo_exit_sem);
 435     qemu_sem_destroy(&s->colo_exit_sem);
 436     /*
 437      * Must be called after failover BH is completed,
 438      * Or the failover BH may shutdown the wrong fd that
 439      * re-used by other threads after we release here.
 440      */
 441     if (s->rp_state.from_dst_file) {
 442         qemu_fclose(s->rp_state.from_dst_file);
 443     }
 444 }
 445
 446 void colo_checkpoint_notify(void *opaque)
 447 {
 448     MigrationState *s = opaque;
 449     int64_t next_notify_time;
 450
 451     qemu_sem_post(&s->colo_checkpoint_sem);
 452     s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 453     next_notify_time = s->colo_checkpoint_time +
 454                     s->parameters.x_checkpoint_delay;
 455     timer_mod(s->colo_delay_timer, next_notify_time);
 456 }
 457
 458 void migrate_start_colo_process(MigrationState *s)
 459 {
 460     qemu_mutex_unlock_iothread();
 461     qemu_sem_init(&s->colo_checkpoint_sem, 0);
 462     s->colo_delay_timer =  timer_new_ms(QEMU_CLOCK_HOST,
 463                                 colo_checkpoint_notify, s);
 464
 465     qemu_sem_init(&s->colo_exit_sem, 0);
 466     migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
 467                       MIGRATION_STATUS_COLO);
 468     colo_process_checkpoint(s);
 469     qemu_mutex_lock_iothread();
 470 }
 471
 472 static void colo_wait_handle_message(QEMUFile *f, int *checkpoint_request,
 473                                      Error **errp)
 474 {
 475     COLOMessage msg;
 476     Error *local_err = NULL;
 477
 478     msg = colo_receive_message(f, &local_err);
 479     if (local_err) {
 480         error_propagate(errp, local_err);
 481         return;
 482     }
 483
 484     switch (msg) {
 485     case COLO_MESSAGE_CHECKPOINT_REQUEST:
 486         *checkpoint_request = 1;
 487         break;
 488     default:
 489         *checkpoint_request = 0;
 490         error_setg(errp, "Got unknown COLO message: %d", msg);
 491         break;
 492     }
 493 }
 494
 495 void *colo_process_incoming_thread(void *opaque)
 496 {
 497     MigrationIncomingState *mis = opaque;
 498     QEMUFile *fb = NULL;
 499     QIOChannelBuffer *bioc = NULL; /* Cache incoming device state */
 500     uint64_t total_size;
 501     uint64_t value;
 502     Error *local_err = NULL;
 503
 504     qemu_sem_init(&mis->colo_incoming_sem, 0);
 505
 506     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
 507                       MIGRATION_STATUS_COLO);
 508
 509     failover_init_state();
 510
 511     mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
 512     if (!mis->to_src_file) {
 513         error_report("COLO incoming thread: Open QEMUFile to_src_file failed");
 514         goto out;
 515     }
 516     /*
 517      * Note: the communication between Primary side and Secondary side
 518      * should be sequential, we set the fd to unblocked in migration incoming
 519      * coroutine, and here we are in the COLO incoming thread, so it is ok to
 520      * set the fd back to blocked.
 521      */
 522     qemu_file_set_blocking(mis->from_src_file, true);
 523
 524     bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
 525     fb = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
 526     object_unref(OBJECT(bioc));
 527
 528     colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY,
 529                       &local_err);
 530     if (local_err) {
 531         goto out;
 532     }
 533
 534     while (mis->state == MIGRATION_STATUS_COLO) {
 535         int request = 0;
 536
 537         colo_wait_handle_message(mis->from_src_file, &request, &local_err);
 538         if (local_err) {
 539             goto out;
 540         }
 541         assert(request);
 542         if (failover_get_state() != FAILOVER_STATUS_NONE) {
 543             error_report("failover request");
 544             goto out;
 545         }
 546
 547         /* FIXME: This is unnecessary for periodic checkpoint mode */
 548         colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY,
 549                      &local_err);
 550         if (local_err) {
 551             goto out;
 552         }
 553
 554         colo_receive_check_message(mis->from_src_file,
 555                            COLO_MESSAGE_VMSTATE_SEND, &local_err);
 556         if (local_err) {
 557             goto out;
 558         }
 559
 560         value = colo_receive_message_value(mis->from_src_file,
 561                                  COLO_MESSAGE_VMSTATE_SIZE, &local_err);
 562         if (local_err) {
 563             goto out;
 564         }
 565
 566         /*
 567          * Read VM device state data into channel buffer,
 568          * It's better to re-use the memory allocated.
 569          * Here we need to handle the channel buffer directly.
 570          */
 571         if (value > bioc->capacity) {
 572             bioc->capacity = value;
 573             bioc->data = g_realloc(bioc->data, bioc->capacity);
 574         }
 575         total_size = qemu_get_buffer(mis->from_src_file, bioc->data, value);
 576         if (total_size != value) {
 577             error_report("Got %" PRIu64 " VMState data, less than expected"
 578                         " %" PRIu64, total_size, value);
 579             goto out;
 580         }
 581         bioc->usage = total_size;
 582         qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
 583
 584         colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_RECEIVED,
 585                      &local_err);
 586         if (local_err) {
 587             goto out;
 588         }
 589
 590         qemu_mutex_lock_iothread();
 591         qemu_system_reset(VMRESET_SILENT);
 592         vmstate_loading = true;
 593         if (qemu_loadvm_state(fb) < 0) {
 594             error_report("COLO: loadvm failed");
 595             qemu_mutex_unlock_iothread();
 596             goto out;
 597         }
 598
 599         vmstate_loading = false;
 600         qemu_mutex_unlock_iothread();
 601
 602         if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
 603             failover_set_state(FAILOVER_STATUS_RELAUNCH,
 604                             FAILOVER_STATUS_NONE);
 605             failover_request_active(NULL);
 606             goto out;
 607         }
 608
 609         colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED,
 610                      &local_err);
 611         if (local_err) {
 612             goto out;
 613         }
 614     }
 615
 616 out:
 617     vmstate_loading = false;
 618     /* Throw the unreported error message after exited from loop */
 619     if (local_err) {
 620         error_report_err(local_err);
 621     }
 622
 623     if (fb) {
 624         qemu_fclose(fb);
 625     }
 626
 627     /* Hope this not to be too long to loop here */
 628     qemu_sem_wait(&mis->colo_incoming_sem);
 629     qemu_sem_destroy(&mis->colo_incoming_sem);
 630     /* Must be called after failover BH is completed */
 631     if (mis->to_src_file) {
 632         qemu_fclose(mis->to_src_file);
 633     }
 634     migration_incoming_exit_colo();
 635
 636     return NULL;
 637 }