migration/postcopy-ram.c

   1 /*
   2  * Postcopy migration for RAM
   3  *
   4  * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
   5  *
   6  * Authors:
   7  *  Dave Gilbert  <dgilbert@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  10  * See the COPYING file in the top-level directory.
  11  *
  12  */
  13
  14 /*
  15  * Postcopy is a migration technique where the execution flips from the
  16  * source to the destination before all the data has been copied.
  17  */
  18
  19 #include <glib.h>
  20 #include <stdio.h>
  21 #include <unistd.h>
  22
  23 #include "qemu-common.h"
  24 #include "migration/migration.h"
  25 #include "migration/postcopy-ram.h"
  26 #include "sysemu/sysemu.h"
  27 #include "qemu/error-report.h"
  28 #include "trace.h"
  29
  30 /* Arbitrary limit on size of each discard command,
  31  * keeps them around ~200 bytes
  32  */
  33 #define MAX_DISCARDS_PER_COMMAND 12
  34
  35 struct PostcopyDiscardState {
  36     const char *ramblock_name;
  37     uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */
  38     uint16_t cur_entry;
  39     /*
  40      * Start and length of a discard range (bytes)
  41      */
  42     uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
  43     uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
  44     unsigned int nsentwords;
  45     unsigned int nsentcmds;
  46 };
  47
/* Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and to efficiently map new pages in; the techniques for doing this
 * are target OS specific.
 */
  52 #if defined(__linux__)
  53
  54 #include <sys/mman.h>
  55 #include <sys/ioctl.h>
  56 #include <sys/syscall.h>
  57 #include <sys/types.h>
  58 #include <asm/types.h> /* for __u64 */
  59 #endif
  60
  61 #if defined(__linux__) && defined(__NR_userfaultfd)
  62 #include <linux/userfaultfd.h>
  63
  64 static bool ufd_version_check(int ufd)
  65 {
  66     struct uffdio_api api_struct;
  67     uint64_t ioctl_mask;
  68
  69     api_struct.api = UFFD_API;
  70     api_struct.features = 0;
  71     if (ioctl(ufd, UFFDIO_API, &api_struct)) {
  72         error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
  73                      strerror(errno));
  74         return false;
  75     }
  76
  77     ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
  78                  (__u64)1 << _UFFDIO_UNREGISTER;
  79     if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
  80         error_report("Missing userfault features: %" PRIx64,
  81                      (uint64_t)(~api_struct.ioctls & ioctl_mask));
  82         return false;
  83     }
  84
  85     return true;
  86 }
  87
  88 bool postcopy_ram_supported_by_host(void)
  89 {
  90     long pagesize = getpagesize();
  91     int ufd = -1;
  92     bool ret = false; /* Error unless we change it */
  93     void *testarea = NULL;
  94     struct uffdio_register reg_struct;
  95     struct uffdio_range range_struct;
  96     uint64_t feature_mask;
  97
  98     if ((1ul << qemu_target_page_bits()) > pagesize) {
  99         error_report("Target page size bigger than host page size");
 100         goto out;
 101     }
 102
 103     ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
 104     if (ufd == -1) {
 105         error_report("%s: userfaultfd not available: %s", __func__,
 106                      strerror(errno));
 107         goto out;
 108     }
 109
 110     /* Version and features check */
 111     if (!ufd_version_check(ufd)) {
 112         goto out;
 113     }
 114
 115     /*
 116      *  We need to check that the ops we need are supported on anon memory
 117      *  To do that we need to register a chunk and see the flags that
 118      *  are returned.
 119      */
 120     testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
 121                                     MAP_ANONYMOUS, -1, 0);
 122     if (testarea == MAP_FAILED) {
 123         error_report("%s: Failed to map test area: %s", __func__,
 124                      strerror(errno));
 125         goto out;
 126     }
 127     g_assert(((size_t)testarea & (pagesize-1)) == 0);
 128
 129     reg_struct.range.start = (uintptr_t)testarea;
 130     reg_struct.range.len = pagesize;
 131     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
 132
 133     if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
 134         error_report("%s userfault register: %s", __func__, strerror(errno));
 135         goto out;
 136     }
 137
 138     range_struct.start = (uintptr_t)testarea;
 139     range_struct.len = pagesize;
 140     if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
 141         error_report("%s userfault unregister: %s", __func__, strerror(errno));
 142         goto out;
 143     }
 144
 145     feature_mask = (__u64)1 << _UFFDIO_WAKE |
 146                    (__u64)1 << _UFFDIO_COPY |
 147                    (__u64)1 << _UFFDIO_ZEROPAGE;
 148     if ((reg_struct.ioctls & feature_mask) != feature_mask) {
 149         error_report("Missing userfault map features: %" PRIx64,
 150                      (uint64_t)(~reg_struct.ioctls & feature_mask));
 151         goto out;
 152     }
 153
 154     /* Success! */
 155     ret = true;
 156 out:
 157     if (testarea) {
 158         munmap(testarea, pagesize);
 159     }
 160     if (ufd != -1) {
 161         close(ufd);
 162     }
 163     return ret;
 164 }
 165
 166 /**
 167  * postcopy_ram_discard_range: Discard a range of memory.
 168  * We can assume that if we've been called postcopy_ram_hosttest returned true.
 169  *
 170  * @mis: Current incoming migration state.
 171  * @start, @length: range of memory to discard.
 172  *
 173  * returns: 0 on success.
 174  */
 175 int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
 176                                size_t length)
 177 {
 178     trace_postcopy_ram_discard_range(start, length);
 179     if (madvise(start, length, MADV_DONTNEED)) {
 180         error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno));
 181         return -1;
 182     }
 183
 184     return 0;
 185 }
 186
 187 /*
 188  * Setup an area of RAM so that it *can* be used for postcopy later; this
 189  * must be done right at the start prior to pre-copy.
 190  * opaque should be the MIS.
 191  */
 192 static int init_range(const char *block_name, void *host_addr,
 193                       ram_addr_t offset, ram_addr_t length, void *opaque)
 194 {
 195     MigrationIncomingState *mis = opaque;
 196
 197     trace_postcopy_init_range(block_name, host_addr, offset, length);
 198
 199     /*
 200      * We need the whole of RAM to be truly empty for postcopy, so things
 201      * like ROMs and any data tables built during init must be zero'd
 202      * - we're going to get the copy from the source anyway.
 203      * (Precopy will just overwrite this data, so doesn't need the discard)
 204      */
 205     if (postcopy_ram_discard_range(mis, host_addr, length)) {
 206         return -1;
 207     }
 208
 209     return 0;
 210 }
 211
 212 /*
 213  * At the end of migration, undo the effects of init_range
 214  * opaque should be the MIS.
 215  */
 216 static int cleanup_range(const char *block_name, void *host_addr,
 217                         ram_addr_t offset, ram_addr_t length, void *opaque)
 218 {
 219     MigrationIncomingState *mis = opaque;
 220     struct uffdio_range range_struct;
 221     trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
 222
 223     /*
 224      * We turned off hugepage for the precopy stage with postcopy enabled
 225      * we can turn it back on now.
 226      */
 227 #ifdef MADV_HUGEPAGE
 228     if (madvise(host_addr, length, MADV_HUGEPAGE)) {
 229         error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
 230         return -1;
 231     }
 232 #endif
 233
 234     /*
 235      * We can also turn off userfault now since we should have all the
 236      * pages.   It can be useful to leave it on to debug postcopy
 237      * if you're not sure it's always getting every page.
 238      */
 239     range_struct.start = (uintptr_t)host_addr;
 240     range_struct.len = length;
 241
 242     if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
 243         error_report("%s: userfault unregister %s", __func__, strerror(errno));
 244
 245         return -1;
 246     }
 247
 248     return 0;
 249 }
 250
 251 /*
 252  * Initialise postcopy-ram, setting the RAM to a state where we can go into
 253  * postcopy later; must be called prior to any precopy.
 254  * called from arch_init's similarly named ram_postcopy_incoming_init
 255  */
 256 int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
 257 {
 258     if (qemu_ram_foreach_block(init_range, mis)) {
 259         return -1;
 260     }
 261
 262     return 0;
 263 }
 264
 265 /*
 266  * At the end of a migration where postcopy_ram_incoming_init was called.
 267  */
 268 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
 269 {
 270     /* TODO: Join the fault thread once we're sure it will exit */
 271     if (qemu_ram_foreach_block(cleanup_range, mis)) {
 272         return -1;
 273     }
 274
 275     if (mis->postcopy_tmp_page) {
 276         munmap(mis->postcopy_tmp_page, getpagesize());
 277         mis->postcopy_tmp_page = NULL;
 278     }
 279     return 0;
 280 }
 281
 282 /*
 283  * Mark the given area of RAM as requiring notification to unwritten areas
 284  * Used as a  callback on qemu_ram_foreach_block.
 285  *   host_addr: Base of area to mark
 286  *   offset: Offset in the whole ram arena
 287  *   length: Length of the section
 288  *   opaque: MigrationIncomingState pointer
 289  * Returns 0 on success
 290  */
 291 static int ram_block_enable_notify(const char *block_name, void *host_addr,
 292                                    ram_addr_t offset, ram_addr_t length,
 293                                    void *opaque)
 294 {
 295     MigrationIncomingState *mis = opaque;
 296     struct uffdio_register reg_struct;
 297
 298     reg_struct.range.start = (uintptr_t)host_addr;
 299     reg_struct.range.len = length;
 300     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
 301
 302     /* Now tell our userfault_fd that it's responsible for this area */
 303     if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
 304         error_report("%s userfault register: %s", __func__, strerror(errno));
 305         return -1;
 306     }
 307
 308     return 0;
 309 }
 310
 311 /*
 312  * Handle faults detected by the USERFAULT markings
 313  */
 314 static void *postcopy_ram_fault_thread(void *opaque)
 315 {
 316     MigrationIncomingState *mis = opaque;
 317
 318     fprintf(stderr, "postcopy_ram_fault_thread\n");
 319     /* TODO: In later patch */
 320     qemu_sem_post(&mis->fault_thread_sem);
 321     while (1) {
 322         /* TODO: In later patch */
 323     }
 324
 325     return NULL;
 326 }
 327
 328 int postcopy_ram_enable_notify(MigrationIncomingState *mis)
 329 {
 330     /* Create the fault handler thread and wait for it to be ready */
 331     qemu_sem_init(&mis->fault_thread_sem, 0);
 332     qemu_thread_create(&mis->fault_thread, "postcopy/fault",
 333                        postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
 334     qemu_sem_wait(&mis->fault_thread_sem);
 335     qemu_sem_destroy(&mis->fault_thread_sem);
 336
 337     /* Mark so that we get notified of accesses to unwritten areas */
 338     if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
 339         return -1;
 340     }
 341
 342     return 0;
 343 }
 344
 345 /*
 346  * Place a host page (from) at (host) atomically
 347  * returns 0 on success
 348  */
 349 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
 350 {
 351     struct uffdio_copy copy_struct;
 352
 353     copy_struct.dst = (uint64_t)(uintptr_t)host;
 354     copy_struct.src = (uint64_t)(uintptr_t)from;
 355     copy_struct.len = getpagesize();
 356     copy_struct.mode = 0;
 357
 358     /* copy also acks to the kernel waking the stalled thread up
 359      * TODO: We can inhibit that ack and only do it if it was requested
 360      * which would be slightly cheaper, but we'd have to be careful
 361      * of the order of updating our page state.
 362      */
 363     if (ioctl(mis->userfault_fd, UFFDIO_COPY, &copy_struct)) {
 364         int e = errno;
 365         error_report("%s: %s copy host: %p from: %p",
 366                      __func__, strerror(e), host, from);
 367
 368         return -e;
 369     }
 370
 371     trace_postcopy_place_page(host);
 372     return 0;
 373 }
 374
 375 /*
 376  * Place a zero page at (host) atomically
 377  * returns 0 on success
 378  */
 379 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
 380 {
 381     struct uffdio_zeropage zero_struct;
 382
 383     zero_struct.range.start = (uint64_t)(uintptr_t)host;
 384     zero_struct.range.len = getpagesize();
 385     zero_struct.mode = 0;
 386
 387     if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) {
 388         int e = errno;
 389         error_report("%s: %s zero host: %p",
 390                      __func__, strerror(e), host);
 391
 392         return -e;
 393     }
 394
 395     trace_postcopy_place_page_zero(host);
 396     return 0;
 397 }
 398
 399 /*
 400  * Returns a target page of memory that can be mapped at a later point in time
 401  * using postcopy_place_page
 402  * The same address is used repeatedly, postcopy_place_page just takes the
 403  * backing page away.
 404  * Returns: Pointer to allocated page
 405  *
 406  */
 407 void *postcopy_get_tmp_page(MigrationIncomingState *mis)
 408 {
 409     if (!mis->postcopy_tmp_page) {
 410         mis->postcopy_tmp_page = mmap(NULL, getpagesize(),
 411                              PROT_READ | PROT_WRITE, MAP_PRIVATE |
 412                              MAP_ANONYMOUS, -1, 0);
 413         if (!mis->postcopy_tmp_page) {
 414             error_report("%s: %s", __func__, strerror(errno));
 415             return NULL;
 416         }
 417     }
 418
 419     return mis->postcopy_tmp_page;
 420 }
 421
 422 #else
 423 /* No target OS support, stubs just fail */
 424 bool postcopy_ram_supported_by_host(void)
 425 {
 426     error_report("%s: No OS support", __func__);
 427     return false;
 428 }
 429
 430 int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
 431 {
 432     error_report("postcopy_ram_incoming_init: No OS support");
 433     return -1;
 434 }
 435
 436 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
 437 {
 438     assert(0);
 439     return -1;
 440 }
 441
 442 int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
 443                                size_t length)
 444 {
 445     assert(0);
 446     return -1;
 447 }
 448
 449 int postcopy_ram_enable_notify(MigrationIncomingState *mis)
 450 {
 451     assert(0);
 452     return -1;
 453 }
 454
 455 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
 456 {
 457     assert(0);
 458     return -1;
 459 }
 460
 461 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
 462 {
 463     assert(0);
 464     return -1;
 465 }
 466
 467 void *postcopy_get_tmp_page(MigrationIncomingState *mis)
 468 {
 469     assert(0);
 470     return NULL;
 471 }
 472
 473 #endif
 474
 475 /* ------------------------------------------------------------------------- */
 476
 477 /**
 478  * postcopy_discard_send_init: Called at the start of each RAMBlock before
 479  *   asking to discard individual ranges.
 480  *
 481  * @ms: The current migration state.
 482  * @offset: the bitmap offset of the named RAMBlock in the migration
 483  *   bitmap.
 484  * @name: RAMBlock that discards will operate on.
 485  *
 486  * returns: a new PDS.
 487  */
 488 PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
 489                                                  unsigned long offset,
 490                                                  const char *name)
 491 {
 492     PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
 493
 494     if (res) {
 495         res->ramblock_name = name;
 496         res->offset = offset;
 497     }
 498
 499     return res;
 500 }
 501
 502 /**
 503  * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 504  *   discard. May send a discard message, may just leave it queued to
 505  *   be sent later.
 506  *
 507  * @ms: Current migration state.
 508  * @pds: Structure initialised by postcopy_discard_send_init().
 509  * @start,@length: a range of pages in the migration bitmap in the
 510  *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 511  */
 512 void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
 513                                 unsigned long start, unsigned long length)
 514 {
 515     size_t tp_bits = qemu_target_page_bits();
 516     /* Convert to byte offsets within the RAM block */
 517     pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits;
 518     pds->length_list[pds->cur_entry] = length << tp_bits;
 519     trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
 520     pds->cur_entry++;
 521     pds->nsentwords++;
 522
 523     if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
 524         /* Full set, ship it! */
 525         qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
 526                                               pds->cur_entry,
 527                                               pds->start_list,
 528                                               pds->length_list);
 529         pds->nsentcmds++;
 530         pds->cur_entry = 0;
 531     }
 532 }
 533
 534 /**
 535  * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 536  * bitmap code. Sends any outstanding discard messages, frees the PDS
 537  *
 538  * @ms: Current migration state.
 539  * @pds: Structure initialised by postcopy_discard_send_init().
 540  */
 541 void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
 542 {
 543     /* Anything unsent? */
 544     if (pds->cur_entry) {
 545         qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
 546                                               pds->cur_entry,
 547                                               pds->start_list,
 548                                               pds->length_list);
 549         pds->nsentcmds++;
 550     }
 551
 552     trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
 553                                        pds->nsentcmds);
 554
 555     g_free(pds);
 556 }