diff --git a/arch_init.c b/arch_init.c
index 0e553c928..e0acbc566 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -104,6 +104,9 @@ int graphic_depth = 32;
 #endif
 
 const uint32_t arch_type = QEMU_ARCH;
+static bool mig_throttle_on;
+static int dirty_rate_high_cnt;
+static void check_guest_throttling(void);
 
 /***********************************************************/
 /* ram save/restore */
@@ -115,6 +118,7 @@ const uint32_t arch_type = QEMU_ARCH;
 #define RAM_SAVE_FLAG_EOS      0x10
 #define RAM_SAVE_FLAG_CONTINUE 0x20
 #define RAM_SAVE_FLAG_XBZRLE   0x40
+/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
 
 
 static struct defconfig_file {
@@ -146,10 +150,9 @@ int qemu_read_default_config_files(bool userconfig)
     return 0;
 }
 
-static inline bool is_zero_page(uint8_t *p)
+static inline bool is_zero_range(uint8_t *p, uint64_t size)
 {
-    return buffer_find_nonzero_offset(p, TARGET_PAGE_SIZE) ==
-        TARGET_PAGE_SIZE;
+    return buffer_find_nonzero_offset(p, size) == size;
 }
 
 /* struct contains XBZRLE cache and a static page
@@ -338,7 +341,8 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
 {
     unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
-    unsigned long size = base + (int128_get64(mr->size) >> TARGET_PAGE_BITS);
+    uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr));
+    unsigned long size = base + (mr_size >> TARGET_PAGE_BITS);
 
     unsigned long next;
 
@@ -378,11 +382,17 @@ static void migration_bitmap_sync(void)
     uint64_t num_dirty_pages_init = migration_dirty_pages;
     MigrationState *s = migrate_get_current();
     static int64_t start_time;
+    static int64_t bytes_xfer_prev;
     static int64_t num_dirty_pages_period;
     int64_t end_time;
+    int64_t bytes_xfer_now;
+
+    if (!bytes_xfer_prev) {
+        bytes_xfer_prev = ram_bytes_transferred();
+    }
 
     if (!start_time) {
-        start_time = qemu_get_clock_ms(rt_clock);
+        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     }
 
     trace_migration_bitmap_sync_start();
@@ -400,10 +410,29 @@ static void migration_bitmap_sync(void)
     trace_migration_bitmap_sync_end(migration_dirty_pages
                                     - num_dirty_pages_init);
     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
-    end_time = qemu_get_clock_ms(rt_clock);
+    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 
     /* more than 1 second = 1000 milliseconds */
     if (end_time > start_time + 1000) {
+        if (migrate_auto_converge()) {
+            /* The following detection logic can be refined later. For now:
+               check whether the bytes dirtied in this period exceed half of
+               the bytes transferred since the last time we were in this
+               routine. If that happens more than N times in a row (for now
+               N == 4), turn on the throttle-down logic. */
+            bytes_xfer_now = ram_bytes_transferred();
+            if (s->dirty_pages_rate &&
+                (num_dirty_pages_period * TARGET_PAGE_SIZE >
+                 (bytes_xfer_now - bytes_xfer_prev) / 2) &&
+                (dirty_rate_high_cnt++ > 4)) {
+                trace_migration_throttle();
+                mig_throttle_on = true;
+                dirty_rate_high_cnt = 0;
+            }
+            bytes_xfer_prev = bytes_xfer_now;
+        } else {
+            mig_throttle_on = false;
+        }
         s->dirty_pages_rate = num_dirty_pages_period * 1000
             / (end_time - start_time);
         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
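
[Annotation] A note on the auto-converge hunk above: the condition fires once the bytes dirtied in a bitmap-sync period exceed half of the bytes transferred in that same period, several periods in a row. Below is a minimal, self-contained C sketch of just that check. It is not QEMU code: should_throttle() and the workload numbers are invented for illustration, and the s->dirty_pages_rate guard from the patch is omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TARGET_PAGE_SIZE 4096

static int dirty_rate_high_cnt;

/* Mirrors the patch's condition: true once the bytes dirtied in a period
 * have exceeded half the bytes transferred in that period more than four
 * consecutive times.  Short-circuit && means the counter only advances
 * while the dirty rate stays high, exactly as in the patch. */
static bool should_throttle(uint64_t dirty_pages_period,
                            uint64_t bytes_xfer_now,
                            uint64_t bytes_xfer_prev)
{
    if (dirty_pages_period * TARGET_PAGE_SIZE >
        (bytes_xfer_now - bytes_xfer_prev) / 2 &&
        dirty_rate_high_cnt++ > 4) {
        dirty_rate_high_cnt = 0;
        return true;
    }
    return false;
}

int main(void)
{
    uint64_t xfer_prev = 0, xfer_now = 0;

    /* Guest dirties 64 MiB per period while only 32 MiB gets transferred:
     * the dirty rate outruns the link, so the check trips after a few
     * consecutive periods. */
    for (int period = 0; period < 8; period++) {
        xfer_now += 32ull << 20;
        printf("period %d: throttle=%d\n", period,
               should_throttle((64ull << 20) / TARGET_PAGE_SIZE,
                               xfer_now, xfer_prev));
        xfer_prev = xfer_now;
    }
    return 0;
}

With this workload the counter trips at the sixth consecutive high-dirty-rate period, which is when the real patch would set mig_throttle_on.
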
@@ -447,6 +476,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
                 ram_bulk_stage = false;
             }
         } else {
+            int ret;
             uint8_t *p;
             int cont = (block == last_sent_block) ?
                 RAM_SAVE_FLAG_CONTINUE : 0;
@@ -455,7 +485,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 
             /* When in doubt, send the page as normal */
             bytes_sent = -1;
-            if (is_zero_page(p)) {
+            ret = ram_control_save_page(f, block->offset,
+                               offset, TARGET_PAGE_SIZE, &bytes_sent);
+
+            if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
+                if (ret != RAM_SAVE_CONTROL_DELAYED) {
+                    if (bytes_sent > 0) {
+                        acct_info.norm_pages++;
+                    } else if (bytes_sent == 0) {
+                        acct_info.dup_pages++;
+                    }
+                }
+            } else if (is_zero_range(p, TARGET_PAGE_SIZE)) {
                 acct_info.dup_pages++;
                 bytes_sent = save_block_hdr(f, block, offset, cont,
                                             RAM_SAVE_FLAG_COMPRESS);
@@ -573,6 +614,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap = bitmap_new(ram_pages);
     bitmap_set(migration_bitmap, 0, ram_pages);
     migration_dirty_pages = ram_pages;
+    mig_throttle_on = false;
+    dirty_rate_high_cnt = 0;
 
     if (migrate_use_xbzrle()) {
         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
@@ -605,6 +648,10 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     }
 
     qemu_mutex_unlock_ramlist();
+
+    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
+    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
+
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
 
     return 0;
@@ -623,7 +670,9 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         reset_ram_globals();
     }
 
-    t0 = qemu_get_clock_ns(rt_clock);
+    ram_control_before_iterate(f, RAM_CONTROL_ROUND);
+
+    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     i = 0;
     while ((ret = qemu_file_rate_limit(f)) == 0) {
         int bytes_sent;
@@ -635,13 +684,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         }
         total_sent += bytes_sent;
         acct_info.iterations++;
+        check_guest_throttling();
         /* we want to check in the 1st loop, just in case it was the 1st time
            and we had to sync the dirty bitmap.
            qemu_get_clock_ns() is a bit expensive, so we only check once
            every so many iterations
         */
         if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
+            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
             if (t1 > MAX_WAIT) {
                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
@@ -653,15 +703,26 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 
     qemu_mutex_unlock_ramlist();
 
+    /*
+     * Must occur before EOS (or any QEMUFile operation)
+     * because of RDMA protocol.
+     */
+    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
+
+    bytes_transferred += total_sent;
+
+    /*
+     * Do not count these 8 bytes into total_sent, so that we can
+     * return 0 if no page had been dirtied.
+     */
+    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+    bytes_transferred += 8;
+
+    ret = qemu_file_get_error(f);
     if (ret < 0) {
-        bytes_transferred += total_sent;
         return ret;
     }
 
-    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
-    total_sent += 8;
-    bytes_transferred += total_sent;
-
     return total_sent;
 }
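
[Annotation] For readers following the new ram_control hooks above: after this patch, ram_save_block() offers each page to the transport-specific override first (the RDMA backend can send it directly or mark it delayed), and only falls back to zero-page compression or a full copy when the transport declines. Here is a reduced, self-contained sketch of that decision ladder; the stub, the placeholder return-code values, and the byte counts are assumptions for illustration, not the real QEMU API.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TARGET_PAGE_SIZE 4096

/* Stand-ins for the patch's control codes (values are placeholders). */
enum { RAM_SAVE_CONTROL_NOT_SUPP = -1000, RAM_SAVE_CONTROL_DELAYED = -2000 };

/* Hypothetical transport hook: a plain-TCP build has no override. */
static int ram_control_save_page_stub(int64_t *bytes_sent)
{
    (void)bytes_sent;
    return RAM_SAVE_CONTROL_NOT_SUPP;
}

/* Naive stand-in for is_zero_range(); QEMU uses a vectorised scan. */
static int page_is_zero(const uint8_t *p, uint64_t size)
{
    for (uint64_t i = 0; i < size; i++) {
        if (p[i]) {
            return 0;
        }
    }
    return 1;
}

/* Mirrors the ladder: 1) let the transport take the page, 2) else
 * compress an all-zero page down to a header plus one fill byte,
 * 3) else send the page in full. */
static int64_t save_one_page(const uint8_t *p)
{
    int64_t bytes_sent = -1;
    int ret = ram_control_save_page_stub(&bytes_sent);

    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        return bytes_sent;       /* transport handled (or delayed) it */
    }
    if (page_is_zero(p, TARGET_PAGE_SIZE)) {
        return 1;                /* roughly: header + single fill byte */
    }
    return TARGET_PAGE_SIZE;     /* full page payload */
}

int main(void)
{
    static uint8_t zero_page[TARGET_PAGE_SIZE];
    static uint8_t data_page[TARGET_PAGE_SIZE] = { [42] = 7 };

    printf("zero page -> %lld bytes\n", (long long)save_one_page(zero_page));
    printf("data page -> %lld bytes\n", (long long)save_one_page(data_page));
    return 0;
}

The same ordering explains the accounting in the hunk: a transport-sent page counts as norm_pages, a zero-byte result as dup_pages, and delayed pages are counted later when the transport completes them.
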
@@ -670,6 +731,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
     qemu_mutex_lock_ramlist();
     migration_bitmap_sync();
 
+    ram_control_before_iterate(f, RAM_CONTROL_FINISH);
+
     /* try transferring iterative blocks of memory */
 
     /* flush all remaining blocks regardless of rate limiting */
@@ -683,6 +746,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
         }
         bytes_transferred += bytes_sent;
     }
+
+    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
     migration_end();
 
     qemu_mutex_unlock_ramlist();
@@ -777,6 +842,17 @@ static inline void *host_from_stream_offset(QEMUFile *f,
 
     return NULL;
 }
 
+/*
+ * If a page (or a whole RDMA chunk) has been
+ * determined to be zero, then zap it.
+ */
+void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
+{
+    if (ch != 0 || !is_zero_range(host, size)) {
+        memset(host, ch, size);
+    }
+}
+
 static int ram_load(QEMUFile *f, void *opaque, int version_id)
 {
     ram_addr_t addr;
@@ -848,16 +924,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
         }
 
         ch = qemu_get_byte(f);
-        if (ch != 0 || !is_zero_page(host)) {
-            memset(host, ch, TARGET_PAGE_SIZE);
-#ifndef _WIN32
-            if (ch == 0 &&
-                (!kvm_enabled() || kvm_has_sync_mmu()) &&
-                getpagesize() <= TARGET_PAGE_SIZE) {
-                qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
-            }
-#endif
-        }
+        ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
     } else if (flags & RAM_SAVE_FLAG_PAGE) {
         void *host;
 
@@ -877,6 +944,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
             ret = -EINVAL;
             goto done;
         }
+    } else if (flags & RAM_SAVE_FLAG_HOOK) {
+        ram_control_load_hook(f, flags);
     }
     error = qemu_file_get_error(f);
     if (error) {
@@ -1041,9 +1110,6 @@ int qemu_uuid_parse(const char *str, uint8_t *uuid)
     if (ret != 16) {
         return -1;
     }
-#ifdef TARGET_I386
-    smbios_add_field(1, offsetof(struct smbios_type_1, uuid), uuid, 16);
-#endif
     return 0;
 }
 
@@ -1054,20 +1120,18 @@ void do_acpitable_option(const QemuOpts *opts)
 
     acpi_table_add(opts, &err);
     if (err) {
-        fprintf(stderr, "Wrong acpi table provided: %s\n",
-                error_get_pretty(err));
+        error_report("Wrong acpi table provided: %s",
+                     error_get_pretty(err));
         error_free(err);
         exit(1);
     }
 #endif
 }
 
-void do_smbios_option(const char *optarg)
+void do_smbios_option(QemuOpts *opts)
 {
 #ifdef TARGET_I386
-    if (smbios_entry_add(optarg) < 0) {
-        exit(1);
-    }
+    smbios_entry_add(opts);
 #endif
 }
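
[Annotation] The ram_handle_compressed() helper introduced above replaces the open-coded zero-page handling in ram_load(), including the old madvise(DONTNEED) branch. Its key property is that an incoming zero page never writes to destination memory that is already zero, so untouched pages stay clean and unallocated. A standalone sketch of that behavior; the loop-based zero check is a naive stand-in for QEMU's buffer_find_nonzero_offset():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Naive zero scan; a stand-in for the real vectorised helper. */
static int range_is_zero(const uint8_t *p, uint64_t size)
{
    for (uint64_t i = 0; i < size; i++) {
        if (p[i]) {
            return 0;
        }
    }
    return 1;
}

/* Same shape as ram_handle_compressed(): only touch the memory when the
 * fill would actually change it. */
static void handle_compressed(uint8_t *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !range_is_zero(host, size)) {
        memset(host, ch, size);
    }
}

int main(void)
{
    static uint8_t page[4096];                   /* zero-initialised */

    handle_compressed(page, 0, sizeof(page));    /* no write happens */
    handle_compressed(page, 0xab, sizeof(page)); /* page gets filled */
    printf("page[0] = 0x%02x\n", page[0]);
    return 0;
}

Taking a size argument rather than assuming TARGET_PAGE_SIZE is what lets the same helper zap a whole RDMA chunk, as the comment in the patch notes.
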
@@ -1110,3 +1174,52 @@ TargetInfo *qmp_query_target(Error **errp)
 
     return info;
 }
+
+/* Stub function that gets run on the vCPU when it is brought out of the
+   VM to run inside qemu via async_run_on_cpu(). */
+static void mig_sleep_cpu(void *opq)
+{
+    qemu_mutex_unlock_iothread();
+    g_usleep(30*1000);
+    qemu_mutex_lock_iothread();
+}
+
+/* To reduce the dirty rate, explicitly disallow the vCPUs from spending
+   much time in the VM. The migration thread will try to catch up;
+   the workload will experience a performance drop.
+*/
+static void mig_throttle_guest_down(void)
+{
+    CPUState *cpu;
+
+    qemu_mutex_lock_iothread();
+    CPU_FOREACH(cpu) {
+        async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
+    }
+    qemu_mutex_unlock_iothread();
+}
+
+static void check_guest_throttling(void)
+{
+    static int64_t t0;
+    int64_t t1;
+
+    if (!mig_throttle_on) {
+        return;
+    }
+
+    if (!t0) {
+        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+        return;
+    }
+
+    t1 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+
+    /* If it has been more than 40 ms since the last time the guest
+     * was throttled then do it again.
+     */
+    if (40 < (t1 - t0) / 1000000) {
+        mig_throttle_guest_down();
+        t0 = t1;
+    }
+}
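
[Annotation] Rough numbers for the throttle added above: each trigger queues a 30 ms sleep on every vCPU via async_run_on_cpu(), and check_guest_throttling() re-triggers at most once per 40 ms, so a throttled vCPU runs at roughly a quarter of its normal duty cycle. A back-of-the-envelope check in plain C, using the two hard-coded constants from the patch:

#include <stdio.h>

int main(void)
{
    const double sleep_ms = 30.0;   /* g_usleep(30 * 1000) per trigger   */
    const double period_ms = 40.0;  /* minimum interval between triggers */

    /* Fraction of wall time a vCPU can still execute while throttled. */
    printf("max vCPU duty cycle while throttled: %.0f%%\n",
           100.0 * (1.0 - sleep_ms / period_ms));
    return 0;
}

Both constants are fixed in this patch; later QEMU releases replaced this sleep-based scheme with a tunable percentage-based CPU throttle.
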