migration/postcopy-ram.c

   1 /*
   2  * Postcopy migration for RAM
   3  *
   4  * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
   5  *
   6  * Authors:
   7  *  Dave Gilbert  <dgilbert@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  10  * See the COPYING file in the top-level directory.
  11  *
  12  */
  13
  14 /*
  15  * Postcopy is a migration technique where the execution flips from the
  16  * source to the destination before all the data has been copied.
  17  */
  18
  19 #include <glib.h>
  20 #include <stdio.h>
  21 #include <unistd.h>
  22
  23 #include "qemu-common.h"
  24 #include "migration/migration.h"
  25 #include "migration/postcopy-ram.h"
  26 #include "sysemu/sysemu.h"
  27 #include "qemu/error-report.h"
  28 #include "trace.h"
  29
/* Arbitrary limit on the number of discard ranges carried by each discard
 * command; keeps each command at around ~200 bytes.
 */
#define MAX_DISCARDS_PER_COMMAND 12
  34
/*
 * Per-RAMBlock accumulator for discard ranges: ranges are queued in the
 * two lists below and sent in batches of up to MAX_DISCARDS_PER_COMMAND.
 */
struct PostcopyDiscardState {
    /* Name given to postcopy_discard_send_init(); the pointer is stored,
     * the string is not copied.
     */
    const char *ramblock_name;
    uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */
    /* Number of entries currently queued in the lists and not yet sent */
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    /* Count of ranges queued so far (one per postcopy_discard_send_range) */
    unsigned int nsentwords;
    /* Count of discard commands actually sent so far */
    unsigned int nsentcmds;
};
  47
/* Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and to map new pages in efficiently; the techniques for doing
 * this are target OS specific.
 */
  52 #if defined(__linux__)
  53
  54 #include <sys/mman.h>
  55 #include <sys/ioctl.h>
  56 #include <sys/syscall.h>
  57 #include <sys/types.h>
  58 #include <asm/types.h> /* for __u64 */
  59 #endif
  60
  61 #if defined(__linux__) && defined(__NR_userfaultfd)
  62 #include <linux/userfaultfd.h>
  63
  64 static bool ufd_version_check(int ufd)
  65 {
  66     struct uffdio_api api_struct;
  67     uint64_t ioctl_mask;
  68
  69     api_struct.api = UFFD_API;
  70     api_struct.features = 0;
  71     if (ioctl(ufd, UFFDIO_API, &api_struct)) {
  72         error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
  73                      strerror(errno));
  74         return false;
  75     }
  76
  77     ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
  78                  (__u64)1 << _UFFDIO_UNREGISTER;
  79     if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
  80         error_report("Missing userfault features: %" PRIx64,
  81                      (uint64_t)(~api_struct.ioctls & ioctl_mask));
  82         return false;
  83     }
  84
  85     return true;
  86 }
  87
  88 bool postcopy_ram_supported_by_host(void)
  89 {
  90     long pagesize = getpagesize();
  91     int ufd = -1;
  92     bool ret = false; /* Error unless we change it */
  93     void *testarea = NULL;
  94     struct uffdio_register reg_struct;
  95     struct uffdio_range range_struct;
  96     uint64_t feature_mask;
  97
  98     if ((1ul << qemu_target_page_bits()) > pagesize) {
  99         error_report("Target page size bigger than host page size");
 100         goto out;
 101     }
 102
 103     ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
 104     if (ufd == -1) {
 105         error_report("%s: userfaultfd not available: %s", __func__,
 106                      strerror(errno));
 107         goto out;
 108     }
 109
 110     /* Version and features check */
 111     if (!ufd_version_check(ufd)) {
 112         goto out;
 113     }
 114
 115     /*
 116      *  We need to check that the ops we need are supported on anon memory
 117      *  To do that we need to register a chunk and see the flags that
 118      *  are returned.
 119      */
 120     testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
 121                                     MAP_ANONYMOUS, -1, 0);
 122     if (testarea == MAP_FAILED) {
 123         error_report("%s: Failed to map test area: %s", __func__,
 124                      strerror(errno));
 125         goto out;
 126     }
 127     g_assert(((size_t)testarea & (pagesize-1)) == 0);
 128
 129     reg_struct.range.start = (uintptr_t)testarea;
 130     reg_struct.range.len = pagesize;
 131     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
 132
 133     if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
 134         error_report("%s userfault register: %s", __func__, strerror(errno));
 135         goto out;
 136     }
 137
 138     range_struct.start = (uintptr_t)testarea;
 139     range_struct.len = pagesize;
 140     if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
 141         error_report("%s userfault unregister: %s", __func__, strerror(errno));
 142         goto out;
 143     }
 144
 145     feature_mask = (__u64)1 << _UFFDIO_WAKE |
 146                    (__u64)1 << _UFFDIO_COPY |
 147                    (__u64)1 << _UFFDIO_ZEROPAGE;
 148     if ((reg_struct.ioctls & feature_mask) != feature_mask) {
 149         error_report("Missing userfault map features: %" PRIx64,
 150                      (uint64_t)(~reg_struct.ioctls & feature_mask));
 151         goto out;
 152     }
 153
 154     /* Success! */
 155     ret = true;
 156 out:
 157     if (testarea) {
 158         munmap(testarea, pagesize);
 159     }
 160     if (ufd != -1) {
 161         close(ufd);
 162     }
 163     return ret;
 164 }
 165
 166 /**
 167  * postcopy_ram_discard_range: Discard a range of memory.
 168  * We can assume that if we've been called postcopy_ram_hosttest returned true.
 169  *
 170  * @mis: Current incoming migration state.
 171  * @start, @length: range of memory to discard.
 172  *
 173  * returns: 0 on success.
 174  */
 175 int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
 176                                size_t length)
 177 {
 178     trace_postcopy_ram_discard_range(start, length);
 179     if (madvise(start, length, MADV_DONTNEED)) {
 180         error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno));
 181         return -1;
 182     }
 183
 184     return 0;
 185 }
 186
 187 /*
 188  * Setup an area of RAM so that it *can* be used for postcopy later; this
 189  * must be done right at the start prior to pre-copy.
 190  * opaque should be the MIS.
 191  */
 192 static int init_range(const char *block_name, void *host_addr,
 193                       ram_addr_t offset, ram_addr_t length, void *opaque)
 194 {
 195     MigrationIncomingState *mis = opaque;
 196
 197     trace_postcopy_init_range(block_name, host_addr, offset, length);
 198
 199     /*
 200      * We need the whole of RAM to be truly empty for postcopy, so things
 201      * like ROMs and any data tables built during init must be zero'd
 202      * - we're going to get the copy from the source anyway.
 203      * (Precopy will just overwrite this data, so doesn't need the discard)
 204      */
 205     if (postcopy_ram_discard_range(mis, host_addr, length)) {
 206         return -1;
 207     }
 208
 209     return 0;
 210 }
 211
 212 /*
 213  * At the end of migration, undo the effects of init_range
 214  * opaque should be the MIS.
 215  */
 216 static int cleanup_range(const char *block_name, void *host_addr,
 217                         ram_addr_t offset, ram_addr_t length, void *opaque)
 218 {
 219     MigrationIncomingState *mis = opaque;
 220     struct uffdio_range range_struct;
 221     trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
 222
 223     /*
 224      * We turned off hugepage for the precopy stage with postcopy enabled
 225      * we can turn it back on now.
 226      */
 227 #ifdef MADV_HUGEPAGE
 228     if (madvise(host_addr, length, MADV_HUGEPAGE)) {
 229         error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
 230         return -1;
 231     }
 232 #endif
 233
 234     /*
 235      * We can also turn off userfault now since we should have all the
 236      * pages.   It can be useful to leave it on to debug postcopy
 237      * if you're not sure it's always getting every page.
 238      */
 239     range_struct.start = (uintptr_t)host_addr;
 240     range_struct.len = length;
 241
 242     if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
 243         error_report("%s: userfault unregister %s", __func__, strerror(errno));
 244
 245         return -1;
 246     }
 247
 248     return 0;
 249 }
 250
 251 /*
 252  * Initialise postcopy-ram, setting the RAM to a state where we can go into
 253  * postcopy later; must be called prior to any precopy.
 254  * called from arch_init's similarly named ram_postcopy_incoming_init
 255  */
 256 int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
 257 {
 258     if (qemu_ram_foreach_block(init_range, mis)) {
 259         return -1;
 260     }
 261
 262     return 0;
 263 }
 264
 265 /*
 266  * At the end of a migration where postcopy_ram_incoming_init was called.
 267  */
 268 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
 269 {
 270     /* TODO: Join the fault thread once we're sure it will exit */
 271     if (qemu_ram_foreach_block(cleanup_range, mis)) {
 272         return -1;
 273     }
 274
 275     return 0;
 276 }
 277
 278 /*
 279  * Mark the given area of RAM as requiring notification to unwritten areas
 280  * Used as a  callback on qemu_ram_foreach_block.
 281  *   host_addr: Base of area to mark
 282  *   offset: Offset in the whole ram arena
 283  *   length: Length of the section
 284  *   opaque: MigrationIncomingState pointer
 285  * Returns 0 on success
 286  */
 287 static int ram_block_enable_notify(const char *block_name, void *host_addr,
 288                                    ram_addr_t offset, ram_addr_t length,
 289                                    void *opaque)
 290 {
 291     MigrationIncomingState *mis = opaque;
 292     struct uffdio_register reg_struct;
 293
 294     reg_struct.range.start = (uintptr_t)host_addr;
 295     reg_struct.range.len = length;
 296     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
 297
 298     /* Now tell our userfault_fd that it's responsible for this area */
 299     if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
 300         error_report("%s userfault register: %s", __func__, strerror(errno));
 301         return -1;
 302     }
 303
 304     return 0;
 305 }
 306
 307 /*
 308  * Handle faults detected by the USERFAULT markings
 309  */
 310 static void *postcopy_ram_fault_thread(void *opaque)
 311 {
 312     MigrationIncomingState *mis = opaque;
 313
 314     fprintf(stderr, "postcopy_ram_fault_thread\n");
 315     /* TODO: In later patch */
 316     qemu_sem_post(&mis->fault_thread_sem);
 317     while (1) {
 318         /* TODO: In later patch */
 319     }
 320
 321     return NULL;
 322 }
 323
 324 int postcopy_ram_enable_notify(MigrationIncomingState *mis)
 325 {
 326     /* Create the fault handler thread and wait for it to be ready */
 327     qemu_sem_init(&mis->fault_thread_sem, 0);
 328     qemu_thread_create(&mis->fault_thread, "postcopy/fault",
 329                        postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
 330     qemu_sem_wait(&mis->fault_thread_sem);
 331     qemu_sem_destroy(&mis->fault_thread_sem);
 332
 333     /* Mark so that we get notified of accesses to unwritten areas */
 334     if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
 335         return -1;
 336     }
 337
 338     return 0;
 339 }
 340
 341 #else
 342 /* No target OS support, stubs just fail */
/*
 * Stub used when the userfaultfd-based implementation is not compiled in:
 * reports the lack of OS support and always returns false.
 */
bool postcopy_ram_supported_by_host(void)
{
    error_report("%s: No OS support", __func__);
    return false;
}
 348
 349 int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
 350 {
 351     error_report("postcopy_ram_incoming_init: No OS support");
 352     return -1;
 353 }
 354
/*
 * Stub used when the userfaultfd-based implementation is not compiled in.
 * Asserts if it is ever called; returns -1 if the assertion is compiled out.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
 360
/*
 * Stub used when the userfaultfd-based implementation is not compiled in.
 * Asserts if it is ever called; returns -1 if the assertion is compiled out.
 */
int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
                               size_t length)
{
    assert(0);
    return -1;
}
 367
/*
 * Stub used when the userfaultfd-based implementation is not compiled in.
 * Asserts if it is ever called; returns -1 if the assertion is compiled out.
 */
int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
 373 #endif
 374
 375 /* ------------------------------------------------------------------------- */
 376
 377 /**
 378  * postcopy_discard_send_init: Called at the start of each RAMBlock before
 379  *   asking to discard individual ranges.
 380  *
 381  * @ms: The current migration state.
 382  * @offset: the bitmap offset of the named RAMBlock in the migration
 383  *   bitmap.
 384  * @name: RAMBlock that discards will operate on.
 385  *
 386  * returns: a new PDS.
 387  */
 388 PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
 389                                                  unsigned long offset,
 390                                                  const char *name)
 391 {
 392     PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
 393
 394     if (res) {
 395         res->ramblock_name = name;
 396         res->offset = offset;
 397     }
 398
 399     return res;
 400 }
 401
 402 /**
 403  * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 404  *   discard. May send a discard message, may just leave it queued to
 405  *   be sent later.
 406  *
 407  * @ms: Current migration state.
 408  * @pds: Structure initialised by postcopy_discard_send_init().
 409  * @start,@length: a range of pages in the migration bitmap in the
 410  *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 411  */
 412 void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
 413                                 unsigned long start, unsigned long length)
 414 {
 415     size_t tp_bits = qemu_target_page_bits();
 416     /* Convert to byte offsets within the RAM block */
 417     pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits;
 418     pds->length_list[pds->cur_entry] = length << tp_bits;
 419     trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
 420     pds->cur_entry++;
 421     pds->nsentwords++;
 422
 423     if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
 424         /* Full set, ship it! */
 425         qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
 426                                               pds->cur_entry,
 427                                               pds->start_list,
 428                                               pds->length_list);
 429         pds->nsentcmds++;
 430         pds->cur_entry = 0;
 431     }
 432 }
 433
 434 /**
 435  * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 436  * bitmap code. Sends any outstanding discard messages, frees the PDS
 437  *
 438  * @ms: Current migration state.
 439  * @pds: Structure initialised by postcopy_discard_send_init().
 440  */
 441 void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
 442 {
 443     /* Anything unsent? */
 444     if (pds->cur_entry) {
 445         qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
 446                                               pds->cur_entry,
 447                                               pds->start_list,
 448                                               pds->length_list);
 449         pds->nsentcmds++;
 450     }
 451
 452     trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
 453                                        pds->nsentcmds);
 454
 455     g_free(pds);
 456 }