]> git.proxmox.com Git - mirror_qemu.git/blob - hw/ppc/spapr.c
Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging
[mirror_qemu.git] / hw / ppc / spapr.c
1 /*
2 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
3 *
4 * Copyright (c) 2004-2007 Fabrice Bellard
5 * Copyright (c) 2007 Jocelyn Mayer
6 * Copyright (c) 2010 David Gibson, IBM Corporation.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
27 #include "qemu/osdep.h"
28 #include "qemu-common.h"
29 #include "qemu/datadir.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-events-machine.h"
32 #include "qapi/visitor.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/hostmem.h"
35 #include "sysemu/numa.h"
36 #include "sysemu/qtest.h"
37 #include "sysemu/reset.h"
38 #include "sysemu/runstate.h"
39 #include "qemu/log.h"
40 #include "hw/fw-path-provider.h"
41 #include "elf.h"
42 #include "net/net.h"
43 #include "sysemu/device_tree.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/hw_accel.h"
46 #include "kvm_ppc.h"
47 #include "migration/misc.h"
48 #include "migration/qemu-file-types.h"
49 #include "migration/global_state.h"
50 #include "migration/register.h"
51 #include "migration/blocker.h"
52 #include "mmu-hash64.h"
53 #include "mmu-book3s-v3.h"
54 #include "cpu-models.h"
55 #include "hw/core/cpu.h"
56
57 #include "hw/ppc/ppc.h"
58 #include "hw/loader.h"
59
60 #include "hw/ppc/fdt.h"
61 #include "hw/ppc/spapr.h"
62 #include "hw/ppc/spapr_vio.h"
63 #include "hw/qdev-properties.h"
64 #include "hw/pci-host/spapr.h"
65 #include "hw/pci/msi.h"
66
67 #include "hw/pci/pci.h"
68 #include "hw/scsi/scsi.h"
69 #include "hw/virtio/virtio-scsi.h"
70 #include "hw/virtio/vhost-scsi-common.h"
71
72 #include "exec/ram_addr.h"
73 #include "hw/usb.h"
74 #include "qemu/config-file.h"
75 #include "qemu/error-report.h"
76 #include "trace.h"
77 #include "hw/nmi.h"
78 #include "hw/intc/intc.h"
79
80 #include "hw/ppc/spapr_cpu_core.h"
81 #include "hw/mem/memory-device.h"
82 #include "hw/ppc/spapr_tpm_proxy.h"
83 #include "hw/ppc/spapr_nvdimm.h"
84 #include "hw/ppc/spapr_numa.h"
85 #include "hw/ppc/pef.h"
86
87 #include "monitor/monitor.h"
88
89 #include <libfdt.h>
90
/*
 * SLOF memory layout:
 *
 * The raw SLOF image is loaded at address 0.  SLOF copies its romfs right
 * below the flat device tree and then positions itself 31M below that.
 *
 * FW_OVERHEAD is therefore set to 40MB, which should account for all of
 * that and more.
 *
 * The kernel is loaded at 4M (KERNEL_LOAD_ADDR == FW_MAX_SIZE), which
 * leaves room for the initial SLOF image.
 */
#define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
#define FW_MAX_SIZE             0x400000
#define FW_FILE_NAME            "slof.bin"
#define FW_FILE_NAME_VOF        "vof.bin"
#define FW_OVERHEAD             0x2800000
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE

#define MIN_RMA_SLOF            (128 * MiB)

#define PHANDLE_INTC            0x00001111
111
112 /* These two functions implement the VCPU id numbering: one to compute them
113 * all and one to identify thread 0 of a VCORE. Any change to the first one
114 * is likely to have an impact on the second one, so let's keep them close.
115 */
116 static int spapr_vcpu_id(SpaprMachineState *spapr, int cpu_index)
117 {
118 MachineState *ms = MACHINE(spapr);
119 unsigned int smp_threads = ms->smp.threads;
120
121 assert(spapr->vsmt);
122 return
123 (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
124 }
125 static bool spapr_is_thread0_in_vcore(SpaprMachineState *spapr,
126 PowerPCCPU *cpu)
127 {
128 assert(spapr->vsmt);
129 return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
130 }
131
/*
 * .needed callback of the dummy "icp/server" vmstate description.
 *
 * The dummy entries stand for ICPState objects that older QEMUs left
 * unused and that newer QEMUs do not have at all.  Either way nothing
 * must be put on the wire, so this always answers "not needed".
 */
static bool pre_2_10_vmstate_dummy_icp_needed(void *opaque)
{
    return false;
}
140
141 static const VMStateDescription pre_2_10_vmstate_dummy_icp = {
142 .name = "icp/server",
143 .version_id = 1,
144 .minimum_version_id = 1,
145 .needed = pre_2_10_vmstate_dummy_icp_needed,
146 .fields = (VMStateField[]) {
147 VMSTATE_UNUSED(4), /* uint32_t xirr */
148 VMSTATE_UNUSED(1), /* uint8_t pending_priority */
149 VMSTATE_UNUSED(1), /* uint8_t mfrr */
150 VMSTATE_END_OF_LIST()
151 },
152 };
153
154 static void pre_2_10_vmstate_register_dummy_icp(int i)
155 {
156 vmstate_register(NULL, i, &pre_2_10_vmstate_dummy_icp,
157 (void *)(uintptr_t) i);
158 }
159
160 static void pre_2_10_vmstate_unregister_dummy_icp(int i)
161 {
162 vmstate_unregister(NULL, &pre_2_10_vmstate_dummy_icp,
163 (void *)(uintptr_t) i);
164 }
165
166 int spapr_max_server_number(SpaprMachineState *spapr)
167 {
168 MachineState *ms = MACHINE(spapr);
169
170 assert(spapr->vsmt);
171 return DIV_ROUND_UP(ms->smp.max_cpus * spapr->vsmt, ms->smp.threads);
172 }
173
174 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
175 int smt_threads)
176 {
177 int i, ret = 0;
178 uint32_t servers_prop[smt_threads];
179 uint32_t gservers_prop[smt_threads * 2];
180 int index = spapr_get_vcpu_id(cpu);
181
182 if (cpu->compat_pvr) {
183 ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
184 if (ret < 0) {
185 return ret;
186 }
187 }
188
189 /* Build interrupt servers and gservers properties */
190 for (i = 0; i < smt_threads; i++) {
191 servers_prop[i] = cpu_to_be32(index + i);
192 /* Hack, direct the group queues back to cpu 0 */
193 gservers_prop[i*2] = cpu_to_be32(index + i);
194 gservers_prop[i*2 + 1] = 0;
195 }
196 ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
197 servers_prop, sizeof(servers_prop));
198 if (ret < 0) {
199 return ret;
200 }
201 ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
202 gservers_prop, sizeof(gservers_prop));
203
204 return ret;
205 }
206
207 static void spapr_dt_pa_features(SpaprMachineState *spapr,
208 PowerPCCPU *cpu,
209 void *fdt, int offset)
210 {
211 uint8_t pa_features_206[] = { 6, 0,
212 0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
213 uint8_t pa_features_207[] = { 24, 0,
214 0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
215 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
216 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
217 0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
218 uint8_t pa_features_300[] = { 66, 0,
219 /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
220 /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, SSO, 5: LE|CFAR|EB|LSQ */
221 0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0, /* 0 - 5 */
222 /* 6: DS207 */
223 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
224 /* 16: Vector */
225 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
226 /* 18: Vec. Scalar, 20: Vec. XOR, 22: HTM */
227 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
228 /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
229 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
230 /* 30: MMR, 32: LE atomic, 34: EBB + ext EBB */
231 0x80, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
232 /* 36: SPR SO, 38: Copy/Paste, 40: Radix MMU */
233 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 36 - 41 */
234 /* 42: PM, 44: PC RA, 46: SC vec'd */
235 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
236 /* 48: SIMD, 50: QP BFP, 52: String */
237 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
238 /* 54: DecFP, 56: DecI, 58: SHA */
239 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
240 /* 60: NM atomic, 62: RNG */
241 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
242 };
243 uint8_t *pa_features = NULL;
244 size_t pa_size;
245
246 if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06, 0, cpu->compat_pvr)) {
247 pa_features = pa_features_206;
248 pa_size = sizeof(pa_features_206);
249 }
250 if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07, 0, cpu->compat_pvr)) {
251 pa_features = pa_features_207;
252 pa_size = sizeof(pa_features_207);
253 }
254 if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, cpu->compat_pvr)) {
255 pa_features = pa_features_300;
256 pa_size = sizeof(pa_features_300);
257 }
258 if (!pa_features) {
259 return;
260 }
261
262 if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) {
263 /*
264 * Note: we keep CI large pages off by default because a 64K capable
265 * guest provisioned with large pages might otherwise try to map a qemu
266 * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
267 * even if that qemu runs on a 4k host.
268 * We dd this bit back here if we are confident this is not an issue
269 */
270 pa_features[3] |= 0x20;
271 }
272 if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
273 pa_features[24] |= 0x80; /* Transactional memory support */
274 }
275 if (spapr->cas_pre_isa3_guest && pa_size > 40) {
276 /* Workaround for broken kernels that attempt (guest) radix
277 * mode when they can't handle it, if they see the radix bit set
278 * in pa-features. So hide it from them. */
279 pa_features[40 + 2] &= ~0x80; /* Radix MMU */
280 }
281
282 _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
283 }
284
285 static hwaddr spapr_node0_size(MachineState *machine)
286 {
287 if (machine->numa_state->num_nodes) {
288 int i;
289 for (i = 0; i < machine->numa_state->num_nodes; ++i) {
290 if (machine->numa_state->nodes[i].node_mem) {
291 return MIN(pow2floor(machine->numa_state->nodes[i].node_mem),
292 machine->ram_size);
293 }
294 }
295 }
296 return machine->ram_size;
297 }
298
299 static void add_str(GString *s, const gchar *s1)
300 {
301 g_string_append_len(s, s1, strlen(s1) + 1);
302 }
303
304 static int spapr_dt_memory_node(SpaprMachineState *spapr, void *fdt, int nodeid,
305 hwaddr start, hwaddr size)
306 {
307 char mem_name[32];
308 uint64_t mem_reg_property[2];
309 int off;
310
311 mem_reg_property[0] = cpu_to_be64(start);
312 mem_reg_property[1] = cpu_to_be64(size);
313
314 sprintf(mem_name, "memory@%" HWADDR_PRIx, start);
315 off = fdt_add_subnode(fdt, 0, mem_name);
316 _FDT(off);
317 _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
318 _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
319 sizeof(mem_reg_property))));
320 spapr_numa_write_associativity_dt(spapr, fdt, off, nodeid);
321 return off;
322 }
323
324 static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
325 {
326 MemoryDeviceInfoList *info;
327
328 for (info = list; info; info = info->next) {
329 MemoryDeviceInfo *value = info->value;
330
331 if (value && value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
332 PCDIMMDeviceInfo *pcdimm_info = value->u.dimm.data;
333
334 if (addr >= pcdimm_info->addr &&
335 addr < (pcdimm_info->addr + pcdimm_info->size)) {
336 return pcdimm_info->node;
337 }
338 }
339 }
340
341 return -1;
342 }
343
/*
 * One entry of the "ibm,dynamic-memory-v2" property.  The structure is
 * packed because entries are copied verbatim into the property buffer;
 * spapr_get_drconf_cell() stores every field in big-endian byte order.
 */
struct sPAPRDrconfCellV2 {
    uint32_t seq_lmbs;      /* number of LMBs covered by this entry */
    uint64_t base_addr;     /* address of the first LMB */
    uint32_t drc_index;     /* DRC index of the first LMB */
    uint32_t aa_index;      /* node of the DIMM, or -1 when there is none */
    uint32_t flags;         /* SPAPR_LMB_FLAGS_* bits */
} QEMU_PACKED;
351
352 typedef struct DrconfCellQueue {
353 struct sPAPRDrconfCellV2 cell;
354 QSIMPLEQ_ENTRY(DrconfCellQueue) entry;
355 } DrconfCellQueue;
356
357 static DrconfCellQueue *
358 spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr,
359 uint32_t drc_index, uint32_t aa_index,
360 uint32_t flags)
361 {
362 DrconfCellQueue *elem;
363
364 elem = g_malloc0(sizeof(*elem));
365 elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs);
366 elem->cell.base_addr = cpu_to_be64(base_addr);
367 elem->cell.drc_index = cpu_to_be32(drc_index);
368 elem->cell.aa_index = cpu_to_be32(aa_index);
369 elem->cell.flags = cpu_to_be32(flags);
370
371 return elem;
372 }
373
374 static int spapr_dt_dynamic_memory_v2(SpaprMachineState *spapr, void *fdt,
375 int offset, MemoryDeviceInfoList *dimms)
376 {
377 MachineState *machine = MACHINE(spapr);
378 uint8_t *int_buf, *cur_index;
379 int ret;
380 uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
381 uint64_t addr, cur_addr, size;
382 uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
383 uint64_t mem_end = machine->device_memory->base +
384 memory_region_size(&machine->device_memory->mr);
385 uint32_t node, buf_len, nr_entries = 0;
386 SpaprDrc *drc;
387 DrconfCellQueue *elem, *next;
388 MemoryDeviceInfoList *info;
389 QSIMPLEQ_HEAD(, DrconfCellQueue) drconf_queue
390 = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
391
392 /* Entry to cover RAM and the gap area */
393 elem = spapr_get_drconf_cell(nr_boot_lmbs, 0, 0, -1,
394 SPAPR_LMB_FLAGS_RESERVED |
395 SPAPR_LMB_FLAGS_DRC_INVALID);
396 QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
397 nr_entries++;
398
399 cur_addr = machine->device_memory->base;
400 for (info = dimms; info; info = info->next) {
401 PCDIMMDeviceInfo *di = info->value->u.dimm.data;
402
403 addr = di->addr;
404 size = di->size;
405 node = di->node;
406
407 /*
408 * The NVDIMM area is hotpluggable after the NVDIMM is unplugged. The
409 * area is marked hotpluggable in the next iteration for the bigger
410 * chunk including the NVDIMM occupied area.
411 */
412 if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM)
413 continue;
414
415 /* Entry for hot-pluggable area */
416 if (cur_addr < addr) {
417 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
418 g_assert(drc);
419 elem = spapr_get_drconf_cell((addr - cur_addr) / lmb_size,
420 cur_addr, spapr_drc_index(drc), -1, 0);
421 QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
422 nr_entries++;
423 }
424
425 /* Entry for DIMM */
426 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
427 g_assert(drc);
428 elem = spapr_get_drconf_cell(size / lmb_size, addr,
429 spapr_drc_index(drc), node,
430 (SPAPR_LMB_FLAGS_ASSIGNED |
431 SPAPR_LMB_FLAGS_HOTREMOVABLE));
432 QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
433 nr_entries++;
434 cur_addr = addr + size;
435 }
436
437 /* Entry for remaining hotpluggable area */
438 if (cur_addr < mem_end) {
439 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
440 g_assert(drc);
441 elem = spapr_get_drconf_cell((mem_end - cur_addr) / lmb_size,
442 cur_addr, spapr_drc_index(drc), -1, 0);
443 QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
444 nr_entries++;
445 }
446
447 buf_len = nr_entries * sizeof(struct sPAPRDrconfCellV2) + sizeof(uint32_t);
448 int_buf = cur_index = g_malloc0(buf_len);
449 *(uint32_t *)int_buf = cpu_to_be32(nr_entries);
450 cur_index += sizeof(nr_entries);
451
452 QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
453 memcpy(cur_index, &elem->cell, sizeof(elem->cell));
454 cur_index += sizeof(elem->cell);
455 QSIMPLEQ_REMOVE(&drconf_queue, elem, DrconfCellQueue, entry);
456 g_free(elem);
457 }
458
459 ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
460 g_free(int_buf);
461 if (ret < 0) {
462 return -1;
463 }
464 return 0;
465 }
466
467 static int spapr_dt_dynamic_memory(SpaprMachineState *spapr, void *fdt,
468 int offset, MemoryDeviceInfoList *dimms)
469 {
470 MachineState *machine = MACHINE(spapr);
471 int i, ret;
472 uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
473 uint32_t device_lmb_start = machine->device_memory->base / lmb_size;
474 uint32_t nr_lmbs = (machine->device_memory->base +
475 memory_region_size(&machine->device_memory->mr)) /
476 lmb_size;
477 uint32_t *int_buf, *cur_index, buf_len;
478
479 /*
480 * Allocate enough buffer size to fit in ibm,dynamic-memory
481 */
482 buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
483 cur_index = int_buf = g_malloc0(buf_len);
484 int_buf[0] = cpu_to_be32(nr_lmbs);
485 cur_index++;
486 for (i = 0; i < nr_lmbs; i++) {
487 uint64_t addr = i * lmb_size;
488 uint32_t *dynamic_memory = cur_index;
489
490 if (i >= device_lmb_start) {
491 SpaprDrc *drc;
492
493 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, i);
494 g_assert(drc);
495
496 dynamic_memory[0] = cpu_to_be32(addr >> 32);
497 dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
498 dynamic_memory[2] = cpu_to_be32(spapr_drc_index(drc));
499 dynamic_memory[3] = cpu_to_be32(0); /* reserved */
500 dynamic_memory[4] = cpu_to_be32(spapr_pc_dimm_node(dimms, addr));
501 if (memory_region_present(get_system_memory(), addr)) {
502 dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
503 } else {
504 dynamic_memory[5] = cpu_to_be32(0);
505 }
506 } else {
507 /*
508 * LMB information for RMA, boot time RAM and gap b/n RAM and
509 * device memory region -- all these are marked as reserved
510 * and as having no valid DRC.
511 */
512 dynamic_memory[0] = cpu_to_be32(addr >> 32);
513 dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
514 dynamic_memory[2] = cpu_to_be32(0);
515 dynamic_memory[3] = cpu_to_be32(0); /* reserved */
516 dynamic_memory[4] = cpu_to_be32(-1);
517 dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
518 SPAPR_LMB_FLAGS_DRC_INVALID);
519 }
520
521 cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
522 }
523 ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
524 g_free(int_buf);
525 if (ret < 0) {
526 return -1;
527 }
528 return 0;
529 }
530
531 /*
532 * Adds ibm,dynamic-reconfiguration-memory node.
533 * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
534 * of this device tree node.
535 */
536 static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr,
537 void *fdt)
538 {
539 MachineState *machine = MACHINE(spapr);
540 int ret, offset;
541 uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
542 uint32_t prop_lmb_size[] = {cpu_to_be32(lmb_size >> 32),
543 cpu_to_be32(lmb_size & 0xffffffff)};
544 MemoryDeviceInfoList *dimms = NULL;
545
546 /*
547 * Don't create the node if there is no device memory
548 */
549 if (machine->ram_size == machine->maxram_size) {
550 return 0;
551 }
552
553 offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
554
555 ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
556 sizeof(prop_lmb_size));
557 if (ret < 0) {
558 return ret;
559 }
560
561 ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
562 if (ret < 0) {
563 return ret;
564 }
565
566 ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
567 if (ret < 0) {
568 return ret;
569 }
570
571 /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
572 dimms = qmp_memory_device_list();
573 if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
574 ret = spapr_dt_dynamic_memory_v2(spapr, fdt, offset, dimms);
575 } else {
576 ret = spapr_dt_dynamic_memory(spapr, fdt, offset, dimms);
577 }
578 qapi_free_MemoryDeviceInfoList(dimms);
579
580 if (ret < 0) {
581 return ret;
582 }
583
584 ret = spapr_numa_write_assoc_lookup_arrays(spapr, fdt, offset);
585
586 return ret;
587 }
588
589 static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt)
590 {
591 MachineState *machine = MACHINE(spapr);
592 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
593 hwaddr mem_start, node_size;
594 int i, nb_nodes = machine->numa_state->num_nodes;
595 NodeInfo *nodes = machine->numa_state->nodes;
596
597 for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
598 if (!nodes[i].node_mem) {
599 continue;
600 }
601 if (mem_start >= machine->ram_size) {
602 node_size = 0;
603 } else {
604 node_size = nodes[i].node_mem;
605 if (node_size > machine->ram_size - mem_start) {
606 node_size = machine->ram_size - mem_start;
607 }
608 }
609 if (!mem_start) {
610 /* spapr_machine_init() checks for rma_size <= node0_size
611 * already */
612 spapr_dt_memory_node(spapr, fdt, i, 0, spapr->rma_size);
613 mem_start += spapr->rma_size;
614 node_size -= spapr->rma_size;
615 }
616 for ( ; node_size; ) {
617 hwaddr sizetmp = pow2floor(node_size);
618
619 /* mem_start != 0 here */
620 if (ctzl(mem_start) < ctzl(sizetmp)) {
621 sizetmp = 1ULL << ctzl(mem_start);
622 }
623
624 spapr_dt_memory_node(spapr, fdt, i, mem_start, sizetmp);
625 node_size -= sizetmp;
626 mem_start += sizetmp;
627 }
628 }
629
630 /* Generate ibm,dynamic-reconfiguration-memory node if required */
631 if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
632 int ret;
633
634 g_assert(smc->dr_lmb_enabled);
635 ret = spapr_dt_dynamic_reconfiguration_memory(spapr, fdt);
636 if (ret) {
637 return ret;
638 }
639 }
640
641 return 0;
642 }
643
644 static void spapr_dt_cpu(CPUState *cs, void *fdt, int offset,
645 SpaprMachineState *spapr)
646 {
647 MachineState *ms = MACHINE(spapr);
648 PowerPCCPU *cpu = POWERPC_CPU(cs);
649 CPUPPCState *env = &cpu->env;
650 PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
651 int index = spapr_get_vcpu_id(cpu);
652 uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
653 0xffffffff, 0xffffffff};
654 uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
655 : SPAPR_TIMEBASE_FREQ;
656 uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
657 uint32_t page_sizes_prop[64];
658 size_t page_sizes_prop_size;
659 unsigned int smp_threads = ms->smp.threads;
660 uint32_t vcpus_per_socket = smp_threads * ms->smp.cores;
661 uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
662 int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
663 SpaprDrc *drc;
664 int drc_index;
665 uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
666 int i;
667
668 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index);
669 if (drc) {
670 drc_index = spapr_drc_index(drc);
671 _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
672 }
673
674 _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
675 _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
676
677 _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
678 _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
679 env->dcache_line_size)));
680 _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
681 env->dcache_line_size)));
682 _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
683 env->icache_line_size)));
684 _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
685 env->icache_line_size)));
686
687 if (pcc->l1_dcache_size) {
688 _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
689 pcc->l1_dcache_size)));
690 } else {
691 warn_report("Unknown L1 dcache size for cpu");
692 }
693 if (pcc->l1_icache_size) {
694 _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
695 pcc->l1_icache_size)));
696 } else {
697 warn_report("Unknown L1 icache size for cpu");
698 }
699
700 _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
701 _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
702 _FDT((fdt_setprop_cell(fdt, offset, "slb-size", cpu->hash64_opts->slb_size)));
703 _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", cpu->hash64_opts->slb_size)));
704 _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
705 _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));
706
707 if (ppc_has_spr(cpu, SPR_PURR)) {
708 _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1)));
709 }
710 if (ppc_has_spr(cpu, SPR_PURR)) {
711 _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1)));
712 }
713
714 if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) {
715 _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
716 segs, sizeof(segs))));
717 }
718
719 /* Advertise VSX (vector extensions) if available
720 * 1 == VMX / Altivec available
721 * 2 == VSX available
722 *
723 * Only CPUs for which we create core types in spapr_cpu_core.c
724 * are possible, and all of those have VMX */
725 if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) {
726 _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2)));
727 } else {
728 _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1)));
729 }
730
731 /* Advertise DFP (Decimal Floating Point) if available
732 * 0 / no property == no DFP
733 * 1 == DFP available */
734 if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) {
735 _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
736 }
737
738 page_sizes_prop_size = ppc_create_page_sizes_prop(cpu, page_sizes_prop,
739 sizeof(page_sizes_prop));
740 if (page_sizes_prop_size) {
741 _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
742 page_sizes_prop, page_sizes_prop_size)));
743 }
744
745 spapr_dt_pa_features(spapr, cpu, fdt, offset);
746
747 _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
748 cs->cpu_index / vcpus_per_socket)));
749
750 _FDT((fdt_setprop(fdt, offset, "ibm,pft-size",
751 pft_size_prop, sizeof(pft_size_prop))));
752
753 if (ms->numa_state->num_nodes > 1) {
754 _FDT(spapr_numa_fixup_cpu_dt(spapr, fdt, offset, cpu));
755 }
756
757 _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt));
758
759 if (pcc->radix_page_info) {
760 for (i = 0; i < pcc->radix_page_info->count; i++) {
761 radix_AP_encodings[i] =
762 cpu_to_be32(pcc->radix_page_info->entries[i]);
763 }
764 _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings",
765 radix_AP_encodings,
766 pcc->radix_page_info->count *
767 sizeof(radix_AP_encodings[0]))));
768 }
769
770 /*
771 * We set this property to let the guest know that it can use the large
772 * decrementer and its width in bits.
773 */
774 if (spapr_get_cap(spapr, SPAPR_CAP_LARGE_DECREMENTER) != SPAPR_CAP_OFF)
775 _FDT((fdt_setprop_u32(fdt, offset, "ibm,dec-bits",
776 pcc->lrg_decr_bits)));
777 }
778
779 static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr)
780 {
781 CPUState **rev;
782 CPUState *cs;
783 int n_cpus;
784 int cpus_offset;
785 int i;
786
787 cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
788 _FDT(cpus_offset);
789 _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1)));
790 _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0)));
791
792 /*
793 * We walk the CPUs in reverse order to ensure that CPU DT nodes
794 * created by fdt_add_subnode() end up in the right order in FDT
795 * for the guest kernel the enumerate the CPUs correctly.
796 *
797 * The CPU list cannot be traversed in reverse order, so we need
798 * to do extra work.
799 */
800 n_cpus = 0;
801 rev = NULL;
802 CPU_FOREACH(cs) {
803 rev = g_renew(CPUState *, rev, n_cpus + 1);
804 rev[n_cpus++] = cs;
805 }
806
807 for (i = n_cpus - 1; i >= 0; i--) {
808 CPUState *cs = rev[i];
809 PowerPCCPU *cpu = POWERPC_CPU(cs);
810 int index = spapr_get_vcpu_id(cpu);
811 DeviceClass *dc = DEVICE_GET_CLASS(cs);
812 g_autofree char *nodename = NULL;
813 int offset;
814
815 if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
816 continue;
817 }
818
819 nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
820 offset = fdt_add_subnode(fdt, cpus_offset, nodename);
821 _FDT(offset);
822 spapr_dt_cpu(cs, fdt, offset, spapr);
823 }
824
825 g_free(rev);
826 }
827
/*
 * Add the /ibm,platform-facilities node and its "ibm,random-v1" child
 * (compatible "ibm,random").
 *
 * Returns 0 on success, -1 if a node could not be created or any property
 * could not be set.
 */
static int spapr_dt_rng(void *fdt)
{
    int facilities, rng;
    int err;

    facilities = qemu_fdt_add_subnode(fdt, "/ibm,platform-facilities");
    if (facilities <= 0) {
        return -1;
    }

    /* Accumulate the property errors; only zero/non-zero matters below */
    err = fdt_setprop_string(fdt, facilities, "device_type",
                             "ibm,platform-facilities");
    err |= fdt_setprop_cell(fdt, facilities, "#address-cells", 0x1);
    err |= fdt_setprop_cell(fdt, facilities, "#size-cells", 0x0);

    rng = fdt_add_subnode(fdt, facilities, "ibm,random-v1");
    if (rng <= 0) {
        return -1;
    }
    err |= fdt_setprop_string(fdt, rng, "compatible", "ibm,random");

    if (err) {
        return -1;
    }
    return 0;
}
850
851 static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
852 {
853 MachineState *ms = MACHINE(spapr);
854 int rtas;
855 GString *hypertas = g_string_sized_new(256);
856 GString *qemu_hypertas = g_string_sized_new(256);
857 uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
858 memory_region_size(&MACHINE(spapr)->device_memory->mr);
859 uint32_t lrdr_capacity[] = {
860 cpu_to_be32(max_device_addr >> 32),
861 cpu_to_be32(max_device_addr & 0xffffffff),
862 cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE >> 32),
863 cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE & 0xffffffff),
864 cpu_to_be32(ms->smp.max_cpus / ms->smp.threads),
865 };
866
867 _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));
868
869 /* hypertas */
870 add_str(hypertas, "hcall-pft");
871 add_str(hypertas, "hcall-term");
872 add_str(hypertas, "hcall-dabr");
873 add_str(hypertas, "hcall-interrupt");
874 add_str(hypertas, "hcall-tce");
875 add_str(hypertas, "hcall-vio");
876 add_str(hypertas, "hcall-splpar");
877 add_str(hypertas, "hcall-join");
878 add_str(hypertas, "hcall-bulk");
879 add_str(hypertas, "hcall-set-mode");
880 add_str(hypertas, "hcall-sprg0");
881 add_str(hypertas, "hcall-copy");
882 add_str(hypertas, "hcall-debug");
883 add_str(hypertas, "hcall-vphn");
884 if (spapr_get_cap(spapr, SPAPR_CAP_RPT_INVALIDATE) == SPAPR_CAP_ON) {
885 add_str(hypertas, "hcall-rpt-invalidate");
886 }
887
888 add_str(qemu_hypertas, "hcall-memop1");
889
890 if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
891 add_str(hypertas, "hcall-multi-tce");
892 }
893
894 if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
895 add_str(hypertas, "hcall-hpt-resize");
896 }
897
898 _FDT(fdt_setprop(fdt, rtas, "ibm,hypertas-functions",
899 hypertas->str, hypertas->len));
900 g_string_free(hypertas, TRUE);
901 _FDT(fdt_setprop(fdt, rtas, "qemu,hypertas-functions",
902 qemu_hypertas->str, qemu_hypertas->len));
903 g_string_free(qemu_hypertas, TRUE);
904
905 spapr_numa_write_rtas_dt(spapr, fdt, rtas);
906
907 /*
908 * FWNMI reserves RTAS_ERROR_LOG_MAX for the machine check error log,
909 * and 16 bytes per CPU for system reset error log plus an extra 8 bytes.
910 *
911 * The system reset requirements are driven by existing Linux and PowerVM
912 * implementation which (contrary to PAPR) saves r3 in the error log
913 * structure like machine check, so Linux expects to find the saved r3
914 * value at the address in r3 upon FWNMI-enabled sreset interrupt (and
915 * does not look at the error value).
916 *
917 * System reset interrupts are not subject to interlock like machine
918 * check, so this memory area could be corrupted if the sreset is
919 * interrupted by a machine check (or vice versa) if it was shared. To
920 * prevent this, system reset uses per-CPU areas for the sreset save
921 * area. A system reset that interrupts a system reset handler could
922 * still overwrite this area, but Linux doesn't try to recover in that
923 * case anyway.
924 *
925 * The extra 8 bytes is required because Linux's FWNMI error log check
926 * is off-by-one.
927 *
928 * RTAS_MIN_SIZE is required for the RTAS blob itself.
929 */
930 _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_MIN_SIZE +
931 RTAS_ERROR_LOG_MAX +
932 ms->smp.max_cpus * sizeof(uint64_t) * 2 +
933 sizeof(uint64_t)));
934 _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max",
935 RTAS_ERROR_LOG_MAX));
936 _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate",
937 RTAS_EVENT_SCAN_RATE));
938
939 g_assert(msi_nonbroken);
940 _FDT(fdt_setprop(fdt, rtas, "ibm,change-msix-capable", NULL, 0));
941
942 /*
943 * According to PAPR, rtas ibm,os-term does not guarantee a return
944 * back to the guest cpu.
945 *
946 * While an additional ibm,extended-os-term property indicates
947 * that rtas call return will always occur. Set this property.
948 */
949 _FDT(fdt_setprop(fdt, rtas, "ibm,extended-os-term", NULL, 0));
950
951 _FDT(fdt_setprop(fdt, rtas, "ibm,lrdr-capacity",
952 lrdr_capacity, sizeof(lrdr_capacity)));
953
954 spapr_dt_rtas_tokens(fdt, rtas);
955 }
956
957 /*
958 * Prepare ibm,arch-vec-5-platform-support, which indicates the MMU
959 * and the XIVE features that the guest may request and thus the valid
960 * values for bytes 23..26 of option vector 5:
961 */
962 static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
963 int chosen)
964 {
965 PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
966
967 char val[2 * 4] = {
968 23, 0x00, /* XICS / XIVE mode */
969 24, 0x00, /* Hash/Radix, filled in below. */
970 25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
971 26, 0x40, /* Radix options: GTSE == yes. */
972 };
973
974 if (spapr->irq->xics && spapr->irq->xive) {
975 val[1] = SPAPR_OV5_XIVE_BOTH;
976 } else if (spapr->irq->xive) {
977 val[1] = SPAPR_OV5_XIVE_EXPLOIT;
978 } else {
979 assert(spapr->irq->xics);
980 val[1] = SPAPR_OV5_XIVE_LEGACY;
981 }
982
983 if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
984 first_ppc_cpu->compat_pvr)) {
985 /*
986 * If we're in a pre POWER9 compat mode then the guest should
987 * do hash and use the legacy interrupt mode
988 */
989 val[1] = SPAPR_OV5_XIVE_LEGACY; /* XICS */
990 val[3] = 0x00; /* Hash */
991 spapr_check_mmu_mode(false);
992 } else if (kvm_enabled()) {
993 if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
994 val[3] = 0x80; /* OV5_MMU_BOTH */
995 } else if (kvmppc_has_cap_mmu_radix()) {
996 val[3] = 0x40; /* OV5_MMU_RADIX_300 */
997 } else {
998 val[3] = 0x00; /* Hash */
999 }
1000 } else {
1001 /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
1002 val[3] = 0xC0;
1003 }
1004 _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
1005 val, sizeof(val)));
1006 }
1007
1008 static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
1009 {
1010 MachineState *machine = MACHINE(spapr);
1011 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
1012 int chosen;
1013
1014 _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
1015
1016 if (reset) {
1017 const char *boot_device = spapr->boot_device;
1018 char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
1019 size_t cb = 0;
1020 char *bootlist = get_boot_devices_list(&cb);
1021
1022 if (machine->kernel_cmdline && machine->kernel_cmdline[0]) {
1023 _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
1024 machine->kernel_cmdline));
1025 }
1026
1027 if (spapr->initrd_size) {
1028 _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
1029 spapr->initrd_base));
1030 _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
1031 spapr->initrd_base + spapr->initrd_size));
1032 }
1033
1034 if (spapr->kernel_size) {
1035 uint64_t kprop[2] = { cpu_to_be64(spapr->kernel_addr),
1036 cpu_to_be64(spapr->kernel_size) };
1037
1038 _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel",
1039 &kprop, sizeof(kprop)));
1040 if (spapr->kernel_le) {
1041 _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel-le", NULL, 0));
1042 }
1043 }
1044 if (boot_menu) {
1045 _FDT((fdt_setprop_cell(fdt, chosen, "qemu,boot-menu", boot_menu)));
1046 }
1047 _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-width", graphic_width));
1048 _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-height", graphic_height));
1049 _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-depth", graphic_depth));
1050
1051 if (cb && bootlist) {
1052 int i;
1053
1054 for (i = 0; i < cb; i++) {
1055 if (bootlist[i] == '\n') {
1056 bootlist[i] = ' ';
1057 }
1058 }
1059 _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-list", bootlist));
1060 }
1061
1062 if (boot_device && strlen(boot_device)) {
1063 _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-device", boot_device));
1064 }
1065
1066 if (!spapr->has_graphics && stdout_path) {
1067 /*
1068 * "linux,stdout-path" and "stdout" properties are
1069 * deprecated by linux kernel. New platforms should only
1070 * use the "stdout-path" property. Set the new property
1071 * and continue using older property to remain compatible
1072 * with the existing firmware.
1073 */
1074 _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path));
1075 _FDT(fdt_setprop_string(fdt, chosen, "stdout-path", stdout_path));
1076 }
1077
1078 /*
1079 * We can deal with BAR reallocation just fine, advertise it
1080 * to the guest
1081 */
1082 if (smc->linux_pci_probe) {
1083 _FDT(fdt_setprop_cell(fdt, chosen, "linux,pci-probe-only", 0));
1084 }
1085
1086 spapr_dt_ov5_platform_support(spapr, fdt, chosen);
1087
1088 g_free(stdout_path);
1089 g_free(bootlist);
1090 }
1091
1092 _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5"));
1093 }
1094
1095 static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt)
1096 {
1097 /* The /hypervisor node isn't in PAPR - this is a hack to allow PR
1098 * KVM to work under pHyp with some guest co-operation */
1099 int hypervisor;
1100 uint8_t hypercall[16];
1101
1102 _FDT(hypervisor = fdt_add_subnode(fdt, 0, "hypervisor"));
1103 /* indicate KVM hypercall interface */
1104 _FDT(fdt_setprop_string(fdt, hypervisor, "compatible", "linux,kvm"));
1105 if (kvmppc_has_cap_fixup_hcalls()) {
1106 /*
1107 * Older KVM versions with older guest kernels were broken
1108 * with the magic page, don't allow the guest to map it.
1109 */
1110 if (!kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
1111 sizeof(hypercall))) {
1112 _FDT(fdt_setprop(fdt, hypervisor, "hcall-instructions",
1113 hypercall, sizeof(hypercall)));
1114 }
1115 }
1116 }
1117
/*
 * Build a device tree for the machine in a freshly allocated buffer.
 *
 * @spapr: the machine state the tree describes
 * @reset: when true, the boot-time /chosen properties and the memory
 *         reserve map entries for kernel and initrd are emitted as well
 * @space: size in bytes of the buffer allocated for the tree
 *
 * Returns the buffer obtained from g_malloc0(); it is handed to the
 * caller, which is responsible for it from then on.
 *
 * Failures reported by the helpers whose return value is checked here
 * terminate the process with exit(1).
 */
void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space)
{
    MachineState *machine = MACHINE(spapr);
    MachineClass *mc = MACHINE_GET_CLASS(machine);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    uint32_t root_drc_type_mask = 0;
    int ret;
    void *fdt;
    SpaprPhbState *phb;
    char *buf;

    fdt = g_malloc0(space);
    _FDT((fdt_create_empty_tree(fdt, space)));

    /* Root node */
    _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp"));
    _FDT(fdt_setprop_string(fdt, 0, "model", "IBM pSeries (emulated by qemu)"));
    _FDT(fdt_setprop_string(fdt, 0, "compatible", "qemu,pseries"));

    /* Guest UUID & Name */
    buf = qemu_uuid_unparse_strdup(&qemu_uuid);
    _FDT(fdt_setprop_string(fdt, 0, "vm,uuid", buf));
    /* "system-id" is only set when qemu_uuid_set is true */
    if (qemu_uuid_set) {
        _FDT(fdt_setprop_string(fdt, 0, "system-id", buf));
    }
    g_free(buf);

    if (qemu_get_vm_name()) {
        _FDT(fdt_setprop_string(fdt, 0, "ibm,partition-name",
                                qemu_get_vm_name()));
    }

    /*
     * Host Model & Serial Number: a value configured on the machine takes
     * precedence; otherwise machine classes with broken_host_serial_model
     * set use the value obtained through KVM.
     */
    if (spapr->host_model) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", spapr->host_model));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_model(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", buf));
        g_free(buf);
    }

    if (spapr->host_serial) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", spapr->host_serial));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_serial(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", buf));
        g_free(buf);
    }

    _FDT(fdt_setprop_cell(fdt, 0, "#address-cells", 2));
    _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));

    /* /interrupt controller */
    spapr_irq_dt(spapr, spapr_max_server_number(spapr), fdt, PHANDLE_INTC);

    /* memory nodes */
    ret = spapr_dt_memory(spapr, fdt);
    if (ret < 0) {
        error_report("couldn't setup memory nodes in fdt");
        exit(1);
    }

    /* /vdevice */
    spapr_dt_vdevice(spapr->vio_bus, fdt);

    /* RNG node, only when an object of type TYPE_SPAPR_RNG resolves */
    if (object_resolve_path_type("", TYPE_SPAPR_RNG, NULL)) {
        ret = spapr_dt_rng(fdt);
        if (ret < 0) {
            error_report("could not set up rng device in the fdt");
            exit(1);
        }
    }

    /* One spapr_dt_phb() call per PHB on the machine's list */
    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_dt_phb(spapr, phb, PHANDLE_INTC, fdt, NULL);
        if (ret < 0) {
            error_report("couldn't setup PCI devices in fdt");
            exit(1);
        }
    }

    spapr_dt_cpus(fdt, spapr);

    /* ibm,drc-indexes and friends */
    if (smc->dr_lmb_enabled) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_LMB;
    }
    if (smc->dr_phb_enabled) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PHB;
    }
    if (mc->nvdimm_supported) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PMEM;
    }
    if (root_drc_type_mask) {
        _FDT(spapr_dt_drc(fdt, 0, NULL, root_drc_type_mask));
    }

    /* CPU DR connectors are attached to /cpus rather than to the root */
    if (mc->has_hotpluggable_cpus) {
        int offset = fdt_path_offset(fdt, "/cpus");
        ret = spapr_dt_drc(fdt, offset, NULL, SPAPR_DR_CONNECTOR_TYPE_CPU);
        if (ret < 0) {
            error_report("Couldn't set up CPU DR device tree properties");
            exit(1);
        }
    }

    /* /event-sources */
    spapr_dt_events(spapr, fdt);

    /* /rtas */
    spapr_dt_rtas(spapr, fdt);

    /* /chosen */
    spapr_dt_chosen(spapr, fdt, reset);

    /* /hypervisor */
    if (kvm_enabled()) {
        spapr_dt_hypervisor(spapr, fdt);
    }

    /* Build memory reserve map */
    if (reset) {
        if (spapr->kernel_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->kernel_addr,
                                  spapr->kernel_size)));
        }
        if (spapr->initrd_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->initrd_base,
                                  spapr->initrd_size)));
        }
    }

    /* NVDIMM devices */
    if (mc->nvdimm_supported) {
        spapr_dt_persistent_memory(spapr, fdt);
    }

    return fdt;
}
1254
1255 static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
1256 {
1257 SpaprMachineState *spapr = opaque;
1258
1259 return (addr & 0x0fffffff) + spapr->kernel_addr;
1260 }
1261
1262 static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
1263 PowerPCCPU *cpu)
1264 {
1265 CPUPPCState *env = &cpu->env;
1266
1267 /* The TCG path should also be holding the BQL at this point */
1268 g_assert(qemu_mutex_iothread_locked());
1269
1270 if (msr_pr) {
1271 hcall_dprintf("Hypercall made with MSR[PR]=1\n");
1272 env->gpr[3] = H_PRIVILEGE;
1273 } else {
1274 env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
1275 }
1276 }
1277
/*
 * Argument block passed to do_lpcr_sync(): the LPCR bits selected by
 * @mask are cleared, then @value is ORed in.
 */
struct LPCRSyncState {
    target_ulong value;
    target_ulong mask;
};
1282
1283 static void do_lpcr_sync(CPUState *cs, run_on_cpu_data arg)
1284 {
1285 struct LPCRSyncState *s = arg.host_ptr;
1286 PowerPCCPU *cpu = POWERPC_CPU(cs);
1287 CPUPPCState *env = &cpu->env;
1288 target_ulong lpcr;
1289
1290 cpu_synchronize_state(cs);
1291 lpcr = env->spr[SPR_LPCR];
1292 lpcr &= ~s->mask;
1293 lpcr |= s->value;
1294 ppc_store_lpcr(cpu, lpcr);
1295 }
1296
1297 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask)
1298 {
1299 CPUState *cs;
1300 struct LPCRSyncState s = {
1301 .value = value,
1302 .mask = mask
1303 };
1304 CPU_FOREACH(cs) {
1305 run_on_cpu(cs, do_lpcr_sync, RUN_ON_CPU_HOST_PTR(&s));
1306 }
1307 }
1308
1309 static void spapr_get_pate(PPCVirtualHypervisor *vhyp, ppc_v3_pate_t *entry)
1310 {
1311 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1312
1313 /* Copy PATE1:GR into PATE0:HR */
1314 entry->dw0 = spapr->patb_entry & PATE0_HR;
1315 entry->dw1 = spapr->patb_entry;
1316 }
1317
/*
 * Helpers for a hash page table stored as an array of 64-bit words, two
 * words per entry:
 *  HPTE(table, i)  - pointer to entry i
 *  HPTE_VALID(p)   - non-zero if HPTE64_V_VALID is set in the first word
 *  HPTE_DIRTY(p)   - non-zero if HPTE64_V_HPTE_DIRTY is set in the first word
 *  CLEAN_HPTE(p)   - clear HPTE64_V_HPTE_DIRTY in the first word
 *  DIRTY_HPTE(p)   - set HPTE64_V_HPTE_DIRTY in the first word
 * The first word is accessed through tswap64().
 */
#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
#define DIRTY_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) |= tswap64(HPTE64_V_HPTE_DIRTY))
1323
1324 /*
1325 * Get the fd to access the kernel htab, re-opening it if necessary
1326 */
1327 static int get_htab_fd(SpaprMachineState *spapr)
1328 {
1329 Error *local_err = NULL;
1330
1331 if (spapr->htab_fd >= 0) {
1332 return spapr->htab_fd;
1333 }
1334
1335 spapr->htab_fd = kvmppc_get_htab_fd(false, 0, &local_err);
1336 if (spapr->htab_fd < 0) {
1337 error_report_err(local_err);
1338 }
1339
1340 return spapr->htab_fd;
1341 }
1342
1343 void close_htab_fd(SpaprMachineState *spapr)
1344 {
1345 if (spapr->htab_fd >= 0) {
1346 close(spapr->htab_fd);
1347 }
1348 spapr->htab_fd = -1;
1349 }
1350
1351 static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp)
1352 {
1353 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1354
1355 return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1;
1356 }
1357
1358 static target_ulong spapr_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp)
1359 {
1360 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1361
1362 assert(kvm_enabled());
1363
1364 if (!spapr->htab) {
1365 return 0;
1366 }
1367
1368 return (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18);
1369 }
1370
1371 static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp,
1372 hwaddr ptex, int n)
1373 {
1374 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1375 hwaddr pte_offset = ptex * HASH_PTE_SIZE_64;
1376
1377 if (!spapr->htab) {
1378 /*
1379 * HTAB is controlled by KVM. Fetch into temporary buffer
1380 */
1381 ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64);
1382 kvmppc_read_hptes(hptes, ptex, n);
1383 return hptes;
1384 }
1385
1386 /*
1387 * HTAB is controlled by QEMU. Just point to the internally
1388 * accessible PTEG.
1389 */
1390 return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset);
1391 }
1392
1393 static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp,
1394 const ppc_hash_pte64_t *hptes,
1395 hwaddr ptex, int n)
1396 {
1397 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1398
1399 if (!spapr->htab) {
1400 g_free((void *)hptes);
1401 }
1402
1403 /* Nothing to do for qemu managed HPT */
1404 }
1405
/*
 * Store the hash PTE at index @ptex.
 *
 * @pte0: first doubleword of the entry (carries HPTE64_V_VALID)
 * @pte1: second doubleword of the entry
 *
 * Without a QEMU-managed table the write is delegated to
 * kvmppc_write_hpte(). Otherwise both doublewords are stored into
 * spapr->htab, in an order that depends on whether the entry is being
 * made valid, with a write barrier between the two stores.
 */
void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
                      uint64_t pte0, uint64_t pte1)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp);
    hwaddr offset = ptex * HASH_PTE_SIZE_64;

    if (!spapr->htab) {
        kvmppc_write_hpte(ptex, pte0, pte1);
    } else {
        if (pte0 & HPTE64_V_VALID) {
            /* Second doubleword lives at half the entry size */
            stq_p(spapr->htab + offset + HASH_PTE_SIZE_64 / 2, pte1);
            /*
             * When setting valid, we write PTE1 first. This ensures
             * proper synchronization with the reading code in
             * ppc_hash64_pteg_search()
             */
            smp_wmb();
            stq_p(spapr->htab + offset, pte0);
        } else {
            stq_p(spapr->htab + offset, pte0);
            /*
             * When clearing it we set PTE0 first. This ensures proper
             * synchronization with the reading code in
             * ppc_hash64_pteg_search()
             */
            smp_wmb();
            stq_p(spapr->htab + offset + HASH_PTE_SIZE_64 / 2, pte1);
        }
    }
}
1436
1437 static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1438 uint64_t pte1)
1439 {
1440 hwaddr offset = ptex * HASH_PTE_SIZE_64 + 15;
1441 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1442
1443 if (!spapr->htab) {
1444 /* There should always be a hash table when this is called */
1445 error_report("spapr_hpte_set_c called with no hash table !");
1446 return;
1447 }
1448
1449 /* The HW performs a non-atomic byte update */
1450 stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80);
1451 }
1452
1453 static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1454 uint64_t pte1)
1455 {
1456 hwaddr offset = ptex * HASH_PTE_SIZE_64 + 14;
1457 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1458
1459 if (!spapr->htab) {
1460 /* There should always be a hash table when this is called */
1461 error_report("spapr_hpte_set_r called with no hash table !");
1462 return;
1463 }
1464
1465 /* The HW performs a non-atomic byte update */
1466 stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01);
1467 }
1468
/*
 * Return the log2 size of the hash table to use for @ramsize bytes of
 * RAM, clamped to the range [18, 46].
 */
int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
{
    /*
     * We aim for a hash table of size 1/128 the size of RAM (rounded
     * up). The PAPR recommendation is actually 1/64 of RAM size, but
     * that's much more than is needed for Linux guests
     */
    int shift = ctz64(pow2ceil(ramsize)) - 7;

    if (shift < 18) {
        /* Minimum architected size */
        shift = 18;
    }
    if (shift > 46) {
        /* Maximum architected size */
        shift = 46;
    }
    return shift;
}
1481
1482 void spapr_free_hpt(SpaprMachineState *spapr)
1483 {
1484 g_free(spapr->htab);
1485 spapr->htab = NULL;
1486 spapr->htab_shift = 0;
1487 close_htab_fd(spapr);
1488 }
1489
1490 int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp)
1491 {
1492 ERRP_GUARD();
1493 long rc;
1494
1495 /* Clean up any HPT info from a previous boot */
1496 spapr_free_hpt(spapr);
1497
1498 rc = kvmppc_reset_htab(shift);
1499
1500 if (rc == -EOPNOTSUPP) {
1501 error_setg(errp, "HPT not supported in nested guests");
1502 return -EOPNOTSUPP;
1503 }
1504
1505 if (rc < 0) {
1506 /* kernel-side HPT needed, but couldn't allocate one */
1507 error_setg_errno(errp, errno, "Failed to allocate KVM HPT of order %d",
1508 shift);
1509 error_append_hint(errp, "Try smaller maxmem?\n");
1510 return -errno;
1511 } else if (rc > 0) {
1512 /* kernel-side HPT allocated */
1513 if (rc != shift) {
1514 error_setg(errp,
1515 "Requested order %d HPT, but kernel allocated order %ld",
1516 shift, rc);
1517 error_append_hint(errp, "Try smaller maxmem?\n");
1518 return -ENOSPC;
1519 }
1520
1521 spapr->htab_shift = shift;
1522 spapr->htab = NULL;
1523 } else {
1524 /* kernel-side HPT not needed, allocate in userspace instead */
1525 size_t size = 1ULL << shift;
1526 int i;
1527
1528 spapr->htab = qemu_memalign(size, size);
1529 memset(spapr->htab, 0, size);
1530 spapr->htab_shift = shift;
1531
1532 for (i = 0; i < size / HASH_PTE_SIZE_64; i++) {
1533 DIRTY_HPTE(HPTE(spapr->htab, i));
1534 }
1535 }
1536 /* We're setting up a hash table, so that means we're not radix */
1537 spapr->patb_entry = 0;
1538 spapr_set_all_lpcrs(0, LPCR_HR | LPCR_UPRT);
1539 return 0;
1540 }
1541
1542 void spapr_setup_hpt(SpaprMachineState *spapr)
1543 {
1544 int hpt_shift;
1545
1546 if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
1547 hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
1548 } else {
1549 uint64_t current_ram_size;
1550
1551 current_ram_size = MACHINE(spapr)->ram_size + get_plugged_memory_size();
1552 hpt_shift = spapr_hpt_shift_for_ramsize(current_ram_size);
1553 }
1554 spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal);
1555
1556 if (kvm_enabled()) {
1557 hwaddr vrma_limit = kvmppc_vrma_limit(spapr->htab_shift);
1558
1559 /* Check our RMA fits in the possible VRMA */
1560 if (vrma_limit < spapr->rma_size) {
1561 error_report("Unable to create %" HWADDR_PRIu
1562 "MiB RMA (VRMA only allows %" HWADDR_PRIu "MiB",
1563 spapr->rma_size / MiB, vrma_limit / MiB);
1564 exit(EXIT_FAILURE);
1565 }
1566 }
1567 }
1568
/*
 * Exit with an error if, under KVM, the MMU mode the guest asked for is
 * not available: radix when @guest_radix is true, hash otherwise.
 * Nothing is checked when KVM is not enabled.
 */
void spapr_check_mmu_mode(bool guest_radix)
{
    if (!kvm_enabled()) {
        return;
    }

    if (guest_radix) {
        if (!kvmppc_has_cap_mmu_radix()) {
            error_report("Guest requested unavailable MMU mode (radix).");
            exit(EXIT_FAILURE);
        }
        return;
    }

    if (kvmppc_has_cap_mmu_radix() && !kvmppc_has_cap_mmu_hash_v3()) {
        error_report("Guest requested unavailable MMU mode (hash).");
        exit(EXIT_FAILURE);
    }
}
1584
/*
 * Machine reset handler.
 *
 * In order: resets PEF and applies the capabilities, selects radix or
 * sets up the hash table, resets all devices, starts from an empty
 * ov5_cas, restores the maximum compat mode, resets the interrupt
 * backend and the DRCs, clears pending events, rebuilds the device tree
 * and finally resets the FWNMI state.
 */
static void spapr_machine_reset(MachineState *machine)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
    PowerPCCPU *first_ppc_cpu;
    hwaddr fdt_addr;
    void *fdt;
    int rc;

    pef_kvm_reset(machine->cgs, &error_fatal);
    spapr_caps_apply(spapr);

    first_ppc_cpu = POWERPC_CPU(first_cpu);
    if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
                              spapr->max_compat_pvr)) {
        /*
         * If using KVM with radix mode available, VCPUs can be started
         * without a HPT because KVM will start them in radix mode.
         * Set the GR bit in PATE so that we know there is no HPT.
         */
        spapr->patb_entry = PATE1_GR;
        spapr_set_all_lpcrs(LPCR_HR | LPCR_UPRT, LPCR_HR | LPCR_UPRT);
    } else {
        spapr_setup_hpt(spapr);
    }

    qemu_devices_reset();

    /* Replace the CAS-negotiated option vector with a new, empty one */
    spapr_ovec_cleanup(spapr->ov5_cas);
    spapr->ov5_cas = spapr_ovec_new();

    ppc_set_compat_all(spapr->max_compat_pvr, &error_fatal);

    /*
     * This is fixing some of the default configuration of the XIVE
     * devices. To be called after the reset of the machine devices.
     */
    spapr_irq_reset(spapr, &error_fatal);

    /*
     * There is no CAS under qtest. Simulate one to please the code that
     * depends on spapr->ov5_cas. This is especially needed to test device
     * unplug, so we do that before resetting the DRCs.
     */
    if (qtest_enabled()) {
        spapr_ovec_cleanup(spapr->ov5_cas);
        spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
    }

    /*
     * DRC reset may cause a device to be unplugged. This will cause troubles
     * if this device is used by another device (eg, a running vhost backend
     * will crash QEMU if the DIMM holding the vring goes away). To avoid such
     * situations, we reset DRCs after all devices have been reset.
     */
    spapr_drc_reset_all(spapr);

    spapr_clear_pending_events(spapr);

    /*
     * We place the device tree just below either the top of the RMA,
     * or just below 2GB, whichever is lower, so that it can be
     * processed with 32-bit real mode code if necessary
     */
    fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;

    fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
    if (spapr->vof) {
        spapr_vof_reset(spapr, fdt, &error_fatal);
        /*
         * Do not pack the FDT as the client may change properties.
         * VOF client does not expect the FDT so we do not load it to the VM.
         */
    } else {
        rc = fdt_pack(fdt);
        /* Should only fail if we've built a corrupted tree */
        assert(rc == 0);

        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
                                  0, fdt_addr, 0);
        cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
    }
    qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));

    /* The machine keeps the new tree; the previous blob is released */
    g_free(spapr->fdt_blob);
    spapr->fdt_size = fdt_totalsize(fdt);
    spapr->fdt_initial_size = spapr->fdt_size;
    spapr->fdt_blob = fdt;

    /* Set up the entry state */
    first_ppc_cpu->env.gpr[5] = 0;

    /* Reset the FWNMI addresses and interlock to -1 */
    spapr->fwnmi_system_reset_addr = -1;
    spapr->fwnmi_machine_check_addr = -1;
    spapr->fwnmi_machine_check_interlock = -1;

    /* Signal all vCPUs waiting on this condition */
    qemu_cond_broadcast(&spapr->fwnmi_machine_check_interlock_cond);

    migrate_del_blocker(spapr->fwnmi_migration_blocker);
}
1685
1686 static void spapr_create_nvram(SpaprMachineState *spapr)
1687 {
1688 DeviceState *dev = qdev_new("spapr-nvram");
1689 DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
1690
1691 if (dinfo) {
1692 qdev_prop_set_drive_err(dev, "drive", blk_by_legacy_dinfo(dinfo),
1693 &error_fatal);
1694 }
1695
1696 qdev_realize_and_unref(dev, &spapr->vio_bus->bus, &error_fatal);
1697
1698 spapr->nvram = (struct SpaprNvram *)dev;
1699 }
1700
1701 static void spapr_rtc_create(SpaprMachineState *spapr)
1702 {
1703 object_initialize_child_with_props(OBJECT(spapr), "rtc", &spapr->rtc,
1704 sizeof(spapr->rtc), TYPE_SPAPR_RTC,
1705 &error_fatal, NULL);
1706 qdev_realize(DEVICE(&spapr->rtc), NULL, &error_fatal);
1707 object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc),
1708 "date");
1709 }
1710
1711 /* Returns whether we want to use VGA or not */
1712 static bool spapr_vga_init(PCIBus *pci_bus, Error **errp)
1713 {
1714 switch (vga_interface_type) {
1715 case VGA_NONE:
1716 return false;
1717 case VGA_DEVICE:
1718 return true;
1719 case VGA_STD:
1720 case VGA_VIRTIO:
1721 case VGA_CIRRUS:
1722 return pci_vga_init(pci_bus) != NULL;
1723 default:
1724 error_setg(errp,
1725 "Unsupported VGA mode, only -vga std or -vga virtio is supported");
1726 return false;
1727 }
1728 }
1729
/*
 * vmstate pre_load hook: forwards to spapr_caps_pre_load() and returns
 * its result.
 */
static int spapr_pre_load(void *opaque)
{
    return spapr_caps_pre_load(opaque);
}
1741
1742 static int spapr_post_load(void *opaque, int version_id)
1743 {
1744 SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1745 int err = 0;
1746
1747 err = spapr_caps_post_migration(spapr);
1748 if (err) {
1749 return err;
1750 }
1751
1752 /*
1753 * In earlier versions, there was no separate qdev for the PAPR
1754 * RTC, so the RTC offset was stored directly in sPAPREnvironment.
1755 * So when migrating from those versions, poke the incoming offset
1756 * value into the RTC device
1757 */
1758 if (version_id < 3) {
1759 err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset);
1760 if (err) {
1761 return err;
1762 }
1763 }
1764
1765 if (kvm_enabled() && spapr->patb_entry) {
1766 PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
1767 bool radix = !!(spapr->patb_entry & PATE1_GR);
1768 bool gtse = !!(cpu->env.spr[SPR_LPCR] & LPCR_GTSE);
1769
1770 /*
1771 * Update LPCR:HR and UPRT as they may not be set properly in
1772 * the stream
1773 */
1774 spapr_set_all_lpcrs(radix ? (LPCR_HR | LPCR_UPRT) : 0,
1775 LPCR_HR | LPCR_UPRT);
1776
1777 err = kvmppc_configure_v3_mmu(cpu, radix, gtse, spapr->patb_entry);
1778 if (err) {
1779 error_report("Process table config unsupported by the host");
1780 return -EINVAL;
1781 }
1782 }
1783
1784 err = spapr_irq_post_load(spapr, version_id);
1785 if (err) {
1786 return err;
1787 }
1788
1789 return err;
1790 }
1791
/*
 * vmstate pre_save hook: forwards to spapr_caps_pre_save() and returns
 * its result.
 */
static int spapr_pre_save(void *opaque)
{
    return spapr_caps_pre_save(opaque);
}
1803
/*
 * vmstate field test: true for stream versions older than 3.
 * @opaque is unused.
 */
static bool version_before_3(void *opaque, int version_id)
{
    return !(version_id >= 3);
}
1808
1809 static bool spapr_pending_events_needed(void *opaque)
1810 {
1811 SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1812 return !QTAILQ_EMPTY(&spapr->pending_events);
1813 }
1814
/*
 * Migration description of one SpaprEventLogEntry: the summary word,
 * the length of the extended log, and the extended log itself as a
 * buffer of extended_length bytes allocated on load.
 */
static const VMStateDescription vmstate_spapr_event_entry = {
    .name = "spapr_event_log_entry",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(summary, SpaprEventLogEntry),
        VMSTATE_UINT32(extended_length, SpaprEventLogEntry),
        VMSTATE_VBUFFER_ALLOC_UINT32(extended_log, SpaprEventLogEntry, 0,
                                     NULL, extended_length),
        VMSTATE_END_OF_LIST()
    },
};
1827
/*
 * Subsection carrying the machine's pending_events queue, one
 * vmstate_spapr_event_entry per element. Only sent when
 * spapr_pending_events_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_pending_events = {
    .name = "spapr_pending_events",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_pending_events_needed,
    .fields = (VMStateField[]) {
        VMSTATE_QTAILQ_V(pending_events, SpaprMachineState, 1,
                         vmstate_spapr_event_entry, SpaprEventLogEntry, next),
        VMSTATE_END_OF_LIST()
    },
};
1839
1840 static bool spapr_ov5_cas_needed(void *opaque)
1841 {
1842 SpaprMachineState *spapr = opaque;
1843 SpaprOptionVector *ov5_mask = spapr_ovec_new();
1844 bool cas_needed;
1845
1846 /* Prior to the introduction of SpaprOptionVector, we had two option
1847 * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY.
1848 * Both of these options encode machine topology into the device-tree
1849 * in such a way that the now-booted OS should still be able to interact
1850 * appropriately with QEMU regardless of what options were actually
1851 * negotiatied on the source side.
1852 *
1853 * As such, we can avoid migrating the CAS-negotiated options if these
1854 * are the only options available on the current machine/platform.
1855 * Since these are the only options available for pseries-2.7 and
1856 * earlier, this allows us to maintain old->new/new->old migration
1857 * compatibility.
1858 *
1859 * For QEMU 2.8+, there are additional CAS-negotiatable options available
1860 * via default pseries-2.8 machines and explicit command-line parameters.
1861 * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware
1862 * of the actual CAS-negotiated values to continue working properly. For
1863 * example, availability of memory unplug depends on knowing whether
1864 * OV5_HP_EVT was negotiated via CAS.
1865 *
1866 * Thus, for any cases where the set of available CAS-negotiatable
1867 * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we
1868 * include the CAS-negotiated options in the migration stream, unless
1869 * if they affect boot time behaviour only.
1870 */
1871 spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY);
1872 spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY);
1873 spapr_ovec_set(ov5_mask, OV5_DRMEM_V2);
1874
1875 /* We need extra information if we have any bits outside the mask
1876 * defined above */
1877 cas_needed = !spapr_ovec_subset(spapr->ov5, ov5_mask);
1878
1879 spapr_ovec_cleanup(ov5_mask);
1880
1881 return cas_needed;
1882 }
1883
/*
 * Subsection carrying the option vector pointed to by ov5_cas, described
 * by vmstate_spapr_ovec. Only sent when spapr_ov5_cas_needed() returns
 * true.
 */
static const VMStateDescription vmstate_spapr_ov5_cas = {
    .name = "spapr_option_vector_ov5_cas",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_ov5_cas_needed,
    .fields = (VMStateField[]) {
        VMSTATE_STRUCT_POINTER_V(ov5_cas, SpaprMachineState, 1,
                                 vmstate_spapr_ovec, SpaprOptionVector),
        VMSTATE_END_OF_LIST()
    },
};
1895
1896 static bool spapr_patb_entry_needed(void *opaque)
1897 {
1898 SpaprMachineState *spapr = opaque;
1899
1900 return !!spapr->patb_entry;
1901 }
1902
/*
 * Subsection carrying the 64-bit patb_entry. Only sent when
 * spapr_patb_entry_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_patb_entry = {
    .name = "spapr_patb_entry",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_patb_entry_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(patb_entry, SpaprMachineState),
        VMSTATE_END_OF_LIST()
    },
};
1913
1914 static bool spapr_irq_map_needed(void *opaque)
1915 {
1916 SpaprMachineState *spapr = opaque;
1917
1918 return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
1919 }
1920
/*
 * Subsection carrying the irq_map bitmap, whose size is given by
 * irq_map_nr. Only sent when spapr_irq_map_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_irq_map = {
    .name = "spapr_irq_map",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_irq_map_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BITMAP(irq_map, SpaprMachineState, 0, irq_map_nr),
        VMSTATE_END_OF_LIST()
    },
};
1931
1932 static bool spapr_dtb_needed(void *opaque)
1933 {
1934 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
1935
1936 return smc->update_dt_enabled;
1937 }
1938
1939 static int spapr_dtb_pre_load(void *opaque)
1940 {
1941 SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1942
1943 g_free(spapr->fdt_blob);
1944 spapr->fdt_blob = NULL;
1945 spapr->fdt_size = 0;
1946
1947 return 0;
1948 }
1949
/*
 * Subsection carrying the device tree blob: its initial size, its
 * current size, and fdt_size bytes of blob data allocated on load.
 * spapr_dtb_pre_load() discards the existing blob first. Only sent when
 * spapr_dtb_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_dtb = {
    .name = "spapr_dtb",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_dtb_needed,
    .pre_load = spapr_dtb_pre_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(fdt_initial_size, SpaprMachineState),
        VMSTATE_UINT32(fdt_size, SpaprMachineState),
        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, SpaprMachineState, 0, NULL,
                                     fdt_size),
        VMSTATE_END_OF_LIST()
    },
};
1964
1965 static bool spapr_fwnmi_needed(void *opaque)
1966 {
1967 SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1968
1969 return spapr->fwnmi_machine_check_addr != -1;
1970 }
1971
1972 static int spapr_fwnmi_pre_save(void *opaque)
1973 {
1974 SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1975
1976 /*
1977 * Check if machine check handling is in progress and print a
1978 * warning message.
1979 */
1980 if (spapr->fwnmi_machine_check_interlock != -1) {
1981 warn_report("A machine check is being handled during migration. The"
1982 "handler may run and log hardware error on the destination");
1983 }
1984
1985 return 0;
1986 }
1987
/*
 * Subsection carrying the FWNMI state: the system reset and machine
 * check addresses and the machine check interlock value. Only sent when
 * spapr_fwnmi_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_fwnmi = {
    .name = "spapr_fwnmi",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_fwnmi_needed,
    .pre_save = spapr_fwnmi_pre_save,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(fwnmi_system_reset_addr, SpaprMachineState),
        VMSTATE_UINT64(fwnmi_machine_check_addr, SpaprMachineState),
        VMSTATE_INT32(fwnmi_machine_check_interlock, SpaprMachineState),
        VMSTATE_END_OF_LIST()
    },
};
2001
/*
 * Top-level migration description of the machine state (version 3,
 * accepting streams back to version 1).
 *
 * Streams older than version 3 carry 4 unused bytes followed by the RTC
 * offset; the timebase is present from version 2 on. Everything else
 * travels in the optional subsections listed below.
 */
static const VMStateDescription vmstate_spapr = {
    .name = "spapr",
    .version_id = 3,
    .minimum_version_id = 1,
    .pre_load = spapr_pre_load,
    .post_load = spapr_post_load,
    .pre_save = spapr_pre_save,
    .fields = (VMStateField[]) {
        /* used to be @next_irq */
        VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),

        /* RTC offset */
        VMSTATE_UINT64_TEST(rtc_offset, SpaprMachineState, version_before_3),

        VMSTATE_PPC_TIMEBASE_V(tb, SpaprMachineState, 2),
        VMSTATE_END_OF_LIST()
    },
    /* NULL-terminated list of subsections */
    .subsections = (const VMStateDescription*[]) {
        &vmstate_spapr_ov5_cas,
        &vmstate_spapr_patb_entry,
        &vmstate_spapr_pending_events,
        &vmstate_spapr_cap_htm,
        &vmstate_spapr_cap_vsx,
        &vmstate_spapr_cap_dfp,
        &vmstate_spapr_cap_cfpc,
        &vmstate_spapr_cap_sbbc,
        &vmstate_spapr_cap_ibs,
        &vmstate_spapr_cap_hpt_maxpagesize,
        &vmstate_spapr_irq_map,
        &vmstate_spapr_cap_nested_kvm_hv,
        &vmstate_spapr_dtb,
        &vmstate_spapr_cap_large_decr,
        &vmstate_spapr_cap_ccf_assist,
        &vmstate_spapr_cap_fwnmi,
        &vmstate_spapr_fwnmi,
        &vmstate_spapr_cap_rpt_invalidate,
        NULL
    }
};
2041
2042 static int htab_save_setup(QEMUFile *f, void *opaque)
2043 {
2044 SpaprMachineState *spapr = opaque;
2045
2046 /* "Iteration" header */
2047 if (!spapr->htab_shift) {
2048 qemu_put_be32(f, -1);
2049 } else {
2050 qemu_put_be32(f, spapr->htab_shift);
2051 }
2052
2053 if (spapr->htab) {
2054 spapr->htab_save_index = 0;
2055 spapr->htab_first_pass = true;
2056 } else {
2057 if (spapr->htab_shift) {
2058 assert(kvm_enabled());
2059 }
2060 }
2061
2062
2063 return 0;
2064 }
2065
2066 static void htab_save_chunk(QEMUFile *f, SpaprMachineState *spapr,
2067 int chunkstart, int n_valid, int n_invalid)
2068 {
2069 qemu_put_be32(f, chunkstart);
2070 qemu_put_be16(f, n_valid);
2071 qemu_put_be16(f, n_invalid);
2072 qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
2073 HASH_PTE_SIZE_64 * n_valid);
2074 }
2075
2076 static void htab_save_end_marker(QEMUFile *f)
2077 {
2078 qemu_put_be32(f, 0);
2079 qemu_put_be16(f, 0);
2080 qemu_put_be16(f, 0);
2081 }
2082
2083 static void htab_save_first_pass(QEMUFile *f, SpaprMachineState *spapr,
2084 int64_t max_ns)
2085 {
2086 bool has_timeout = max_ns != -1;
2087 int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
2088 int index = spapr->htab_save_index;
2089 int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2090
2091 assert(spapr->htab_first_pass);
2092
2093 do {
2094 int chunkstart;
2095
2096 /* Consume invalid HPTEs */
2097 while ((index < htabslots)
2098 && !HPTE_VALID(HPTE(spapr->htab, index))) {
2099 CLEAN_HPTE(HPTE(spapr->htab, index));
2100 index++;
2101 }
2102
2103 /* Consume valid HPTEs */
2104 chunkstart = index;
2105 while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
2106 && HPTE_VALID(HPTE(spapr->htab, index))) {
2107 CLEAN_HPTE(HPTE(spapr->htab, index));
2108 index++;
2109 }
2110
2111 if (index > chunkstart) {
2112 int n_valid = index - chunkstart;
2113
2114 htab_save_chunk(f, spapr, chunkstart, n_valid, 0);
2115
2116 if (has_timeout &&
2117 (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
2118 break;
2119 }
2120 }
2121 } while ((index < htabslots) && !qemu_file_rate_limit(f));
2122
2123 if (index >= htabslots) {
2124 assert(index == htabslots);
2125 index = 0;
2126 spapr->htab_first_pass = false;
2127 }
2128 spapr->htab_save_index = index;
2129 }
2130
/*
 * Subsequent pass over a QEMU-resident hash table: send only entries that
 * are marked dirty, clearing the mark as they are sent.
 *
 * @max_ns: time budget in nanoseconds; a negative value selects the
 *          "final" mode, in which neither the time budget nor the file's
 *          rate limit stops the walk.
 *
 * The walk starts at the saved cursor and wraps around to slot 0, visiting
 * at most one full table's worth of slots per call.
 *
 * Returns 1 when every slot was examined in this call and nothing had to
 * be sent, 0 otherwise.
 */
static int htab_save_later_pass(QEMUFile *f, SpaprMachineState *spapr,
                                int64_t max_ns)
{
    bool final = max_ns < 0;
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    /* examined: slots looked at so far; sent: slots written to the stream */
    int examined = 0, sent = 0;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(!spapr->htab_first_pass);

    do {
        int chunkstart, invalidstart;

        /* Consume non-dirty HPTEs */
        while ((index < htabslots)
               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
            index++;
            examined++;
        }

        chunkstart = index;
        /* Consume valid dirty HPTEs */
        while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        invalidstart = index;
        /* Consume invalid dirty HPTEs */
        while ((index < htabslots) && (index - invalidstart < USHRT_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        /* One chunk covers the valid run followed by the invalid run */
        if (index > chunkstart) {
            int n_valid = invalidstart - chunkstart;
            int n_invalid = index - invalidstart;

            htab_save_chunk(f, spapr, chunkstart, n_valid, n_invalid);
            sent += index - chunkstart;

            /* The time budget is only checked right after sending a chunk */
            if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }

        /* A whole table's worth of slots has been looked at: done */
        if (examined >= htabslots) {
            break;
        }

        /* Wrap around to the beginning of the table */
        if (index >= htabslots) {
            assert(index == htabslots);
            index = 0;
        }
    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));

    /* The loop may have been left with the cursor one past the last slot */
    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
    }

    spapr->htab_save_index = index;

    return (examined >= htabslots) && (sent == 0) ? 1 : 0;
}
2203
2204 #define MAX_ITERATION_NS 5000000 /* 5 ms */
2205 #define MAX_KVM_BUF_SIZE 2048
2206
2207 static int htab_save_iterate(QEMUFile *f, void *opaque)
2208 {
2209 SpaprMachineState *spapr = opaque;
2210 int fd;
2211 int rc = 0;
2212
2213 /* Iteration header */
2214 if (!spapr->htab_shift) {
2215 qemu_put_be32(f, -1);
2216 return 1;
2217 } else {
2218 qemu_put_be32(f, 0);
2219 }
2220
2221 if (!spapr->htab) {
2222 assert(kvm_enabled());
2223
2224 fd = get_htab_fd(spapr);
2225 if (fd < 0) {
2226 return fd;
2227 }
2228
2229 rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
2230 if (rc < 0) {
2231 return rc;
2232 }
2233 } else if (spapr->htab_first_pass) {
2234 htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
2235 } else {
2236 rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
2237 }
2238
2239 htab_save_end_marker(f);
2240
2241 return rc;
2242 }
2243
2244 static int htab_save_complete(QEMUFile *f, void *opaque)
2245 {
2246 SpaprMachineState *spapr = opaque;
2247 int fd;
2248
2249 /* Iteration header */
2250 if (!spapr->htab_shift) {
2251 qemu_put_be32(f, -1);
2252 return 0;
2253 } else {
2254 qemu_put_be32(f, 0);
2255 }
2256
2257 if (!spapr->htab) {
2258 int rc;
2259
2260 assert(kvm_enabled());
2261
2262 fd = get_htab_fd(spapr);
2263 if (fd < 0) {
2264 return fd;
2265 }
2266
2267 rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, -1);
2268 if (rc < 0) {
2269 return rc;
2270 }
2271 } else {
2272 if (spapr->htab_first_pass) {
2273 htab_save_first_pass(f, spapr, -1);
2274 }
2275 htab_save_later_pass(f, spapr, -1);
2276 }
2277
2278 /* End marker */
2279 htab_save_end_marker(f);
2280
2281 return 0;
2282 }
2283
2284 static int htab_load(QEMUFile *f, void *opaque, int version_id)
2285 {
2286 SpaprMachineState *spapr = opaque;
2287 uint32_t section_hdr;
2288 int fd = -1;
2289 Error *local_err = NULL;
2290
2291 if (version_id < 1 || version_id > 1) {
2292 error_report("htab_load() bad version");
2293 return -EINVAL;
2294 }
2295
2296 section_hdr = qemu_get_be32(f);
2297
2298 if (section_hdr == -1) {
2299 spapr_free_hpt(spapr);
2300 return 0;
2301 }
2302
2303 if (section_hdr) {
2304 int ret;
2305
2306 /* First section gives the htab size */
2307 ret = spapr_reallocate_hpt(spapr, section_hdr, &local_err);
2308 if (ret < 0) {
2309 error_report_err(local_err);
2310 return ret;
2311 }
2312 return 0;
2313 }
2314
2315 if (!spapr->htab) {
2316 assert(kvm_enabled());
2317
2318 fd = kvmppc_get_htab_fd(true, 0, &local_err);
2319 if (fd < 0) {
2320 error_report_err(local_err);
2321 return fd;
2322 }
2323 }
2324
2325 while (true) {
2326 uint32_t index;
2327 uint16_t n_valid, n_invalid;
2328
2329 index = qemu_get_be32(f);
2330 n_valid = qemu_get_be16(f);
2331 n_invalid = qemu_get_be16(f);
2332
2333 if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
2334 /* End of Stream */
2335 break;
2336 }
2337
2338 if ((index + n_valid + n_invalid) >
2339 (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
2340 /* Bad index in stream */
2341 error_report(
2342 "htab_load() bad index %d (%hd+%hd entries) in htab stream (htab_shift=%d)",
2343 index, n_valid, n_invalid, spapr->htab_shift);
2344 return -EINVAL;
2345 }
2346
2347 if (spapr->htab) {
2348 if (n_valid) {
2349 qemu_get_buffer(f, HPTE(spapr->htab, index),
2350 HASH_PTE_SIZE_64 * n_valid);
2351 }
2352 if (n_invalid) {
2353 memset(HPTE(spapr->htab, index + n_valid), 0,
2354 HASH_PTE_SIZE_64 * n_invalid);
2355 }
2356 } else {
2357 int rc;
2358
2359 assert(fd >= 0);
2360
2361 rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid,
2362 &local_err);
2363 if (rc < 0) {
2364 error_report_err(local_err);
2365 return rc;
2366 }
2367 }
2368 }
2369
2370 if (!spapr->htab) {
2371 assert(fd >= 0);
2372 close(fd);
2373 }
2374
2375 return 0;
2376 }
2377
2378 static void htab_save_cleanup(void *opaque)
2379 {
2380 SpaprMachineState *spapr = opaque;
2381
2382 close_htab_fd(spapr);
2383 }
2384
/*
 * Live-migration callbacks for the hash page table; registered under the
 * name "spapr/htab" by spapr_machine_init().
 */
static SaveVMHandlers savevm_htab_handlers = {
    .save_setup = htab_save_setup,
    .save_live_iterate = htab_save_iterate,
    .save_live_complete_precopy = htab_save_complete,
    .save_cleanup = htab_save_cleanup,
    .load_state = htab_load,
};
2392
2393 static void spapr_boot_set(void *opaque, const char *boot_device,
2394 Error **errp)
2395 {
2396 SpaprMachineState *spapr = SPAPR_MACHINE(opaque);
2397
2398 g_free(spapr->boot_device);
2399 spapr->boot_device = g_strdup(boot_device);
2400 }
2401
2402 static void spapr_create_lmb_dr_connectors(SpaprMachineState *spapr)
2403 {
2404 MachineState *machine = MACHINE(spapr);
2405 uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
2406 uint32_t nr_lmbs = (machine->maxram_size - machine->ram_size)/lmb_size;
2407 int i;
2408
2409 for (i = 0; i < nr_lmbs; i++) {
2410 uint64_t addr;
2411
2412 addr = i * lmb_size + machine->device_memory->base;
2413 spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_LMB,
2414 addr / lmb_size);
2415 }
2416 }
2417
2418 /*
2419 * If RAM size, maxmem size and individual node mem sizes aren't aligned
2420 * to SPAPR_MEMORY_BLOCK_SIZE(256MB), then refuse to start the guest
2421 * since we can't support such unaligned sizes with DRCONF_MEMORY.
2422 */
2423 static void spapr_validate_node_memory(MachineState *machine, Error **errp)
2424 {
2425 int i;
2426
2427 if (machine->ram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2428 error_setg(errp, "Memory size 0x" RAM_ADDR_FMT
2429 " is not aligned to %" PRIu64 " MiB",
2430 machine->ram_size,
2431 SPAPR_MEMORY_BLOCK_SIZE / MiB);
2432 return;
2433 }
2434
2435 if (machine->maxram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2436 error_setg(errp, "Maximum memory size 0x" RAM_ADDR_FMT
2437 " is not aligned to %" PRIu64 " MiB",
2438 machine->ram_size,
2439 SPAPR_MEMORY_BLOCK_SIZE / MiB);
2440 return;
2441 }
2442
2443 for (i = 0; i < machine->numa_state->num_nodes; i++) {
2444 if (machine->numa_state->nodes[i].node_mem % SPAPR_MEMORY_BLOCK_SIZE) {
2445 error_setg(errp,
2446 "Node %d memory size 0x%" PRIx64
2447 " is not aligned to %" PRIu64 " MiB",
2448 i, machine->numa_state->nodes[i].node_mem,
2449 SPAPR_MEMORY_BLOCK_SIZE / MiB);
2450 return;
2451 }
2452 }
2453 }
2454
2455 /* find cpu slot in machine->possible_cpus by core_id */
2456 static CPUArchId *spapr_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
2457 {
2458 int index = id / ms->smp.threads;
2459
2460 if (index >= ms->possible_cpus->len) {
2461 return NULL;
2462 }
2463 if (idx) {
2464 *idx = index;
2465 }
2466 return &ms->possible_cpus->cpus[index];
2467 }
2468
2469 static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
2470 {
2471 MachineState *ms = MACHINE(spapr);
2472 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2473 Error *local_err = NULL;
2474 bool vsmt_user = !!spapr->vsmt;
2475 int kvm_smt = kvmppc_smt_threads();
2476 int ret;
2477 unsigned int smp_threads = ms->smp.threads;
2478
2479 if (!kvm_enabled() && (smp_threads > 1)) {
2480 error_setg(errp, "TCG cannot support more than 1 thread/core "
2481 "on a pseries machine");
2482 return;
2483 }
2484 if (!is_power_of_2(smp_threads)) {
2485 error_setg(errp, "Cannot support %d threads/core on a pseries "
2486 "machine because it must be a power of 2", smp_threads);
2487 return;
2488 }
2489
2490 /* Detemine the VSMT mode to use: */
2491 if (vsmt_user) {
2492 if (spapr->vsmt < smp_threads) {
2493 error_setg(errp, "Cannot support VSMT mode %d"
2494 " because it must be >= threads/core (%d)",
2495 spapr->vsmt, smp_threads);
2496 return;
2497 }
2498 /* In this case, spapr->vsmt has been set by the command line */
2499 } else if (!smc->smp_threads_vsmt) {
2500 /*
2501 * Default VSMT value is tricky, because we need it to be as
2502 * consistent as possible (for migration), but this requires
2503 * changing it for at least some existing cases. We pick 8 as
2504 * the value that we'd get with KVM on POWER8, the
2505 * overwhelmingly common case in production systems.
2506 */
2507 spapr->vsmt = MAX(8, smp_threads);
2508 } else {
2509 spapr->vsmt = smp_threads;
2510 }
2511
2512 /* KVM: If necessary, set the SMT mode: */
2513 if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
2514 ret = kvmppc_set_smt_threads(spapr->vsmt);
2515 if (ret) {
2516 /* Looks like KVM isn't able to change VSMT mode */
2517 error_setg(&local_err,
2518 "Failed to set KVM's VSMT mode to %d (errno %d)",
2519 spapr->vsmt, ret);
2520 /* We can live with that if the default one is big enough
2521 * for the number of threads, and a submultiple of the one
2522 * we want. In this case we'll waste some vcpu ids, but
2523 * behaviour will be correct */
2524 if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
2525 warn_report_err(local_err);
2526 } else {
2527 if (!vsmt_user) {
2528 error_append_hint(&local_err,
2529 "On PPC, a VM with %d threads/core"
2530 " on a host with %d threads/core"
2531 " requires the use of VSMT mode %d.\n",
2532 smp_threads, kvm_smt, spapr->vsmt);
2533 }
2534 kvmppc_error_append_smt_possible_hint(&local_err);
2535 error_propagate(errp, local_err);
2536 }
2537 }
2538 }
2539 /* else TCG: nothing to do currently */
2540 }
2541
2542 static void spapr_init_cpus(SpaprMachineState *spapr)
2543 {
2544 MachineState *machine = MACHINE(spapr);
2545 MachineClass *mc = MACHINE_GET_CLASS(machine);
2546 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2547 const char *type = spapr_get_cpu_core_type(machine->cpu_type);
2548 const CPUArchIdList *possible_cpus;
2549 unsigned int smp_cpus = machine->smp.cpus;
2550 unsigned int smp_threads = machine->smp.threads;
2551 unsigned int max_cpus = machine->smp.max_cpus;
2552 int boot_cores_nr = smp_cpus / smp_threads;
2553 int i;
2554
2555 possible_cpus = mc->possible_cpu_arch_ids(machine);
2556 if (mc->has_hotpluggable_cpus) {
2557 if (smp_cpus % smp_threads) {
2558 error_report("smp_cpus (%u) must be multiple of threads (%u)",
2559 smp_cpus, smp_threads);
2560 exit(1);
2561 }
2562 if (max_cpus % smp_threads) {
2563 error_report("max_cpus (%u) must be multiple of threads (%u)",
2564 max_cpus, smp_threads);
2565 exit(1);
2566 }
2567 } else {
2568 if (max_cpus != smp_cpus) {
2569 error_report("This machine version does not support CPU hotplug");
2570 exit(1);
2571 }
2572 boot_cores_nr = possible_cpus->len;
2573 }
2574
2575 if (smc->pre_2_10_has_unused_icps) {
2576 int i;
2577
2578 for (i = 0; i < spapr_max_server_number(spapr); i++) {
2579 /* Dummy entries get deregistered when real ICPState objects
2580 * are registered during CPU core hotplug.
2581 */
2582 pre_2_10_vmstate_register_dummy_icp(i);
2583 }
2584 }
2585
2586 for (i = 0; i < possible_cpus->len; i++) {
2587 int core_id = i * smp_threads;
2588
2589 if (mc->has_hotpluggable_cpus) {
2590 spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
2591 spapr_vcpu_id(spapr, core_id));
2592 }
2593
2594 if (i < boot_cores_nr) {
2595 Object *core = object_new(type);
2596 int nr_threads = smp_threads;
2597
2598 /* Handle the partially filled core for older machine types */
2599 if ((i + 1) * smp_threads >= smp_cpus) {
2600 nr_threads = smp_cpus - i * smp_threads;
2601 }
2602
2603 object_property_set_int(core, "nr-threads", nr_threads,
2604 &error_fatal);
2605 object_property_set_int(core, CPU_CORE_PROP_CORE_ID, core_id,
2606 &error_fatal);
2607 qdev_realize(DEVICE(core), NULL, &error_fatal);
2608
2609 object_unref(core);
2610 }
2611 }
2612 }
2613
2614 static PCIHostState *spapr_create_default_phb(void)
2615 {
2616 DeviceState *dev;
2617
2618 dev = qdev_new(TYPE_SPAPR_PCI_HOST_BRIDGE);
2619 qdev_prop_set_uint32(dev, "index", 0);
2620 sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
2621
2622 return PCI_HOST_BRIDGE(dev);
2623 }
2624
2625 static hwaddr spapr_rma_size(SpaprMachineState *spapr, Error **errp)
2626 {
2627 MachineState *machine = MACHINE(spapr);
2628 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2629 hwaddr rma_size = machine->ram_size;
2630 hwaddr node0_size = spapr_node0_size(machine);
2631
2632 /* RMA has to fit in the first NUMA node */
2633 rma_size = MIN(rma_size, node0_size);
2634
2635 /*
2636 * VRMA access is via a special 1TiB SLB mapping, so the RMA can
2637 * never exceed that
2638 */
2639 rma_size = MIN(rma_size, 1 * TiB);
2640
2641 /*
2642 * Clamp the RMA size based on machine type. This is for
2643 * migration compatibility with older qemu versions, which limited
2644 * the RMA size for complicated and mostly bad reasons.
2645 */
2646 if (smc->rma_limit) {
2647 rma_size = MIN(rma_size, smc->rma_limit);
2648 }
2649
2650 if (rma_size < MIN_RMA_SLOF) {
2651 error_setg(errp,
2652 "pSeries SLOF firmware requires >= %" HWADDR_PRIx
2653 "ldMiB guest RMA (Real Mode Area memory)",
2654 MIN_RMA_SLOF / MiB);
2655 return 0;
2656 }
2657
2658 return rma_size;
2659 }
2660
2661 static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr)
2662 {
2663 MachineState *machine = MACHINE(spapr);
2664 int i;
2665
2666 for (i = 0; i < machine->ram_slots; i++) {
2667 spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i);
2668 }
2669 }
2670
2671 /* pSeries LPAR / sPAPR hardware init */
2672 static void spapr_machine_init(MachineState *machine)
2673 {
2674 SpaprMachineState *spapr = SPAPR_MACHINE(machine);
2675 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2676 MachineClass *mc = MACHINE_GET_CLASS(machine);
2677 const char *bios_default = spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME;
2678 const char *bios_name = machine->firmware ?: bios_default;
2679 const char *kernel_filename = machine->kernel_filename;
2680 const char *initrd_filename = machine->initrd_filename;
2681 PCIHostState *phb;
2682 int i;
2683 MemoryRegion *sysmem = get_system_memory();
2684 long load_limit, fw_size;
2685 char *filename;
2686 Error *resize_hpt_err = NULL;
2687
2688 /*
2689 * if Secure VM (PEF) support is configured, then initialize it
2690 */
2691 pef_kvm_init(machine->cgs, &error_fatal);
2692
2693 msi_nonbroken = true;
2694
2695 QLIST_INIT(&spapr->phbs);
2696 QTAILQ_INIT(&spapr->pending_dimm_unplugs);
2697
2698 /* Determine capabilities to run with */
2699 spapr_caps_init(spapr);
2700
2701 kvmppc_check_papr_resize_hpt(&resize_hpt_err);
2702 if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) {
2703 /*
2704 * If the user explicitly requested a mode we should either
2705 * supply it, or fail completely (which we do below). But if
2706 * it's not set explicitly, we reset our mode to something
2707 * that works
2708 */
2709 if (resize_hpt_err) {
2710 spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
2711 error_free(resize_hpt_err);
2712 resize_hpt_err = NULL;
2713 } else {
2714 spapr->resize_hpt = smc->resize_hpt_default;
2715 }
2716 }
2717
2718 assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT);
2719
2720 if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) {
2721 /*
2722 * User requested HPT resize, but this host can't supply it. Bail out
2723 */
2724 error_report_err(resize_hpt_err);
2725 exit(1);
2726 }
2727 error_free(resize_hpt_err);
2728
2729 spapr->rma_size = spapr_rma_size(spapr, &error_fatal);
2730
2731 /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
2732 load_limit = MIN(spapr->rma_size, FDT_MAX_ADDR) - FW_OVERHEAD;
2733
2734 /*
2735 * VSMT must be set in order to be able to compute VCPU ids, ie to
2736 * call spapr_max_server_number() or spapr_vcpu_id().
2737 */
2738 spapr_set_vsmt_mode(spapr, &error_fatal);
2739
2740 /* Set up Interrupt Controller before we create the VCPUs */
2741 spapr_irq_init(spapr, &error_fatal);
2742
2743 /* Set up containers for ibm,client-architecture-support negotiated options
2744 */
2745 spapr->ov5 = spapr_ovec_new();
2746 spapr->ov5_cas = spapr_ovec_new();
2747
2748 if (smc->dr_lmb_enabled) {
2749 spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
2750 spapr_validate_node_memory(machine, &error_fatal);
2751 }
2752
2753 spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
2754
2755 /* advertise support for dedicated HP event source to guests */
2756 if (spapr->use_hotplug_event_source) {
2757 spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
2758 }
2759
2760 /* advertise support for HPT resizing */
2761 if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
2762 spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
2763 }
2764
2765 /* advertise support for ibm,dyamic-memory-v2 */
2766 spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
2767
2768 /* advertise XIVE on POWER9 machines */
2769 if (spapr->irq->xive) {
2770 spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT);
2771 }
2772
2773 /* init CPUs */
2774 spapr_init_cpus(spapr);
2775
2776 /*
2777 * check we don't have a memory-less/cpu-less NUMA node
2778 * Firmware relies on the existing memory/cpu topology to provide the
2779 * NUMA topology to the kernel.
2780 * And the linux kernel needs to know the NUMA topology at start
2781 * to be able to hotplug CPUs later.
2782 */
2783 if (machine->numa_state->num_nodes) {
2784 for (i = 0; i < machine->numa_state->num_nodes; ++i) {
2785 /* check for memory-less node */
2786 if (machine->numa_state->nodes[i].node_mem == 0) {
2787 CPUState *cs;
2788 int found = 0;
2789 /* check for cpu-less node */
2790 CPU_FOREACH(cs) {
2791 PowerPCCPU *cpu = POWERPC_CPU(cs);
2792 if (cpu->node_id == i) {
2793 found = 1;
2794 break;
2795 }
2796 }
2797 /* memory-less and cpu-less node */
2798 if (!found) {
2799 error_report(
2800 "Memory-less/cpu-less nodes are not supported (node %d)",
2801 i);
2802 exit(1);
2803 }
2804 }
2805 }
2806
2807 }
2808
2809 spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine);
2810
2811 /* Init numa_assoc_array */
2812 spapr_numa_associativity_init(spapr, machine);
2813
2814 if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
2815 ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
2816 spapr->max_compat_pvr)) {
2817 spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
2818 /* KVM and TCG always allow GTSE with radix... */
2819 spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
2820 }
2821 /* ... but not with hash (currently). */
2822
2823 if (kvm_enabled()) {
2824 /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
2825 kvmppc_enable_logical_ci_hcalls();
2826 kvmppc_enable_set_mode_hcall();
2827
2828 /* H_CLEAR_MOD/_REF are mandatory in PAPR, but off by default */
2829 kvmppc_enable_clear_ref_mod_hcalls();
2830
2831 /* Enable H_PAGE_INIT */
2832 kvmppc_enable_h_page_init();
2833 }
2834
2835 /* map RAM */
2836 memory_region_add_subregion(sysmem, 0, machine->ram);
2837
2838 /* always allocate the device memory information */
2839 machine->device_memory = g_malloc0(sizeof(*machine->device_memory));
2840
2841 /* initialize hotplug memory address space */
2842 if (machine->ram_size < machine->maxram_size) {
2843 ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
2844 /*
2845 * Limit the number of hotpluggable memory slots to half the number
2846 * slots that KVM supports, leaving the other half for PCI and other
2847 * devices. However ensure that number of slots doesn't drop below 32.
2848 */
2849 int max_memslots = kvm_enabled() ? kvm_get_max_memslots() / 2 :
2850 SPAPR_MAX_RAM_SLOTS;
2851
2852 if (max_memslots < SPAPR_MAX_RAM_SLOTS) {
2853 max_memslots = SPAPR_MAX_RAM_SLOTS;
2854 }
2855 if (machine->ram_slots > max_memslots) {
2856 error_report("Specified number of memory slots %"
2857 PRIu64" exceeds max supported %d",
2858 machine->ram_slots, max_memslots);
2859 exit(1);
2860 }
2861
2862 machine->device_memory->base = ROUND_UP(machine->ram_size,
2863 SPAPR_DEVICE_MEM_ALIGN);
2864 memory_region_init(&machine->device_memory->mr, OBJECT(spapr),
2865 "device-memory", device_mem_size);
2866 memory_region_add_subregion(sysmem, machine->device_memory->base,
2867 &machine->device_memory->mr);
2868 }
2869
2870 if (smc->dr_lmb_enabled) {
2871 spapr_create_lmb_dr_connectors(spapr);
2872 }
2873
2874 if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI) == SPAPR_CAP_ON) {
2875 /* Create the error string for live migration blocker */
2876 error_setg(&spapr->fwnmi_migration_blocker,
2877 "A machine check is being handled during migration. The handler"
2878 "may run and log hardware error on the destination");
2879 }
2880
2881 if (mc->nvdimm_supported) {
2882 spapr_create_nvdimm_dr_connectors(spapr);
2883 }
2884
2885 /* Set up RTAS event infrastructure */
2886 spapr_events_init(spapr);
2887
2888 /* Set up the RTC RTAS interfaces */
2889 spapr_rtc_create(spapr);
2890
2891 /* Set up VIO bus */
2892 spapr->vio_bus = spapr_vio_bus_init();
2893
2894 for (i = 0; serial_hd(i); i++) {
2895 spapr_vty_create(spapr->vio_bus, serial_hd(i));
2896 }
2897
2898 /* We always have at least the nvram device on VIO */
2899 spapr_create_nvram(spapr);
2900
2901 /*
2902 * Setup hotplug / dynamic-reconfiguration connectors. top-level
2903 * connectors (described in root DT node's "ibm,drc-types" property)
2904 * are pre-initialized here. additional child connectors (such as
2905 * connectors for a PHBs PCI slots) are added as needed during their
2906 * parent's realization.
2907 */
2908 if (smc->dr_phb_enabled) {
2909 for (i = 0; i < SPAPR_MAX_PHBS; i++) {
2910 spapr_dr_connector_new(OBJECT(machine), TYPE_SPAPR_DRC_PHB, i);
2911 }
2912 }
2913
2914 /* Set up PCI */
2915 spapr_pci_rtas_init();
2916
2917 phb = spapr_create_default_phb();
2918
2919 for (i = 0; i < nb_nics; i++) {
2920 NICInfo *nd = &nd_table[i];
2921
2922 if (!nd->model) {
2923 nd->model = g_strdup("spapr-vlan");
2924 }
2925
2926 if (g_str_equal(nd->model, "spapr-vlan") ||
2927 g_str_equal(nd->model, "ibmveth")) {
2928 spapr_vlan_create(spapr->vio_bus, nd);
2929 } else {
2930 pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
2931 }
2932 }
2933
2934 for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
2935 spapr_vscsi_create(spapr->vio_bus);
2936 }
2937
2938 /* Graphics */
2939 if (spapr_vga_init(phb->bus, &error_fatal)) {
2940 spapr->has_graphics = true;
2941 machine->usb |= defaults_enabled() && !machine->usb_disabled;
2942 }
2943
2944 if (machine->usb) {
2945 if (smc->use_ohci_by_default) {
2946 pci_create_simple(phb->bus, -1, "pci-ohci");
2947 } else {
2948 pci_create_simple(phb->bus, -1, "nec-usb-xhci");
2949 }
2950
2951 if (spapr->has_graphics) {
2952 USBBus *usb_bus = usb_bus_find(-1);
2953
2954 usb_create_simple(usb_bus, "usb-kbd");
2955 usb_create_simple(usb_bus, "usb-mouse");
2956 }
2957 }
2958
2959 if (kernel_filename) {
2960 spapr->kernel_size = load_elf(kernel_filename, NULL,
2961 translate_kernel_address, spapr,
2962 NULL, NULL, NULL, NULL, 1,
2963 PPC_ELF_MACHINE, 0, 0);
2964 if (spapr->kernel_size == ELF_LOAD_WRONG_ENDIAN) {
2965 spapr->kernel_size = load_elf(kernel_filename, NULL,
2966 translate_kernel_address, spapr,
2967 NULL, NULL, NULL, NULL, 0,
2968 PPC_ELF_MACHINE, 0, 0);
2969 spapr->kernel_le = spapr->kernel_size > 0;
2970 }
2971 if (spapr->kernel_size < 0) {
2972 error_report("error loading %s: %s", kernel_filename,
2973 load_elf_strerror(spapr->kernel_size));
2974 exit(1);
2975 }
2976
2977 /* load initrd */
2978 if (initrd_filename) {
2979 /* Try to locate the initrd in the gap between the kernel
2980 * and the firmware. Add a bit of space just in case
2981 */
2982 spapr->initrd_base = (spapr->kernel_addr + spapr->kernel_size
2983 + 0x1ffff) & ~0xffff;
2984 spapr->initrd_size = load_image_targphys(initrd_filename,
2985 spapr->initrd_base,
2986 load_limit
2987 - spapr->initrd_base);
2988 if (spapr->initrd_size < 0) {
2989 error_report("could not load initial ram disk '%s'",
2990 initrd_filename);
2991 exit(1);
2992 }
2993 }
2994 }
2995
2996 filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
2997 if (!filename) {
2998 error_report("Could not find LPAR firmware '%s'", bios_name);
2999 exit(1);
3000 }
3001 fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
3002 if (fw_size <= 0) {
3003 error_report("Could not load LPAR firmware '%s'", filename);
3004 exit(1);
3005 }
3006 g_free(filename);
3007
3008 /* FIXME: Should register things through the MachineState's qdev
3009 * interface, this is a legacy from the sPAPREnvironment structure
3010 * which predated MachineState but had a similar function */
3011 vmstate_register(NULL, 0, &vmstate_spapr, spapr);
3012 register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1,
3013 &savevm_htab_handlers, spapr);
3014
3015 qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine));
3016
3017 qemu_register_boot_set(spapr_boot_set, spapr);
3018
3019 /*
3020 * Nothing needs to be done to resume a suspended guest because
3021 * suspending does not change the machine state, so no need for
3022 * a ->wakeup method.
3023 */
3024 qemu_register_wakeup_support();
3025
3026 if (kvm_enabled()) {
3027 /* to stop and start vmclock */
3028 qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
3029 &spapr->tb);
3030
3031 kvmppc_spapr_enable_inkernel_multitce();
3032 }
3033
3034 qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
3035 if (spapr->vof) {
3036 spapr->vof->fw_size = fw_size; /* for claim() on itself */
3037 spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
3038 }
3039 }
3040
3041 #define DEFAULT_KVM_TYPE "auto"
3042 static int spapr_kvm_type(MachineState *machine, const char *vm_type)
3043 {
3044 /*
3045 * The use of g_ascii_strcasecmp() for 'hv' and 'pr' is to
3046 * accomodate the 'HV' and 'PV' formats that exists in the
3047 * wild. The 'auto' mode is being introduced already as
3048 * lower-case, thus we don't need to bother checking for
3049 * "AUTO".
3050 */
3051 if (!vm_type || !strcmp(vm_type, DEFAULT_KVM_TYPE)) {
3052 return 0;
3053 }
3054
3055 if (!g_ascii_strcasecmp(vm_type, "hv")) {
3056 return 1;
3057 }
3058
3059 if (!g_ascii_strcasecmp(vm_type, "pr")) {
3060 return 2;
3061 }
3062
3063 error_report("Unknown kvm-type specified '%s'", vm_type);
3064 exit(1);
3065 }
3066
3067 /*
3068 * Implementation of an interface to adjust firmware path
3069 * for the bootindex property handling.
3070 */
3071 static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
3072 DeviceState *dev)
3073 {
3074 #define CAST(type, obj, name) \
3075 ((type *)object_dynamic_cast(OBJECT(obj), (name)))
3076 SCSIDevice *d = CAST(SCSIDevice, dev, TYPE_SCSI_DEVICE);
3077 SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
3078 VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
3079 PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3080
3081 if (d) {
3082 void *spapr = CAST(void, bus->parent, "spapr-vscsi");
3083 VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
3084 USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
3085
3086 if (spapr) {
3087 /*
3088 * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
3089 * In the top 16 bits of the 64-bit LUN, we use SRP luns of the form
3090 * 0x8000 | (target << 8) | (bus << 5) | lun
3091 * (see the "Logical unit addressing format" table in SAM5)
3092 */
3093 unsigned id = 0x8000 | (d->id << 8) | (d->channel << 5) | d->lun;
3094 return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3095 (uint64_t)id << 48);
3096 } else if (virtio) {
3097 /*
3098 * We use SRP luns of the form 01000000 | (target << 8) | lun
3099 * in the top 32 bits of the 64-bit LUN
3100 * Note: the quote above is from SLOF and it is wrong,
3101 * the actual binding is:
3102 * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
3103 */
3104 unsigned id = 0x1000000 | (d->id << 16) | d->lun;
3105 if (d->lun >= 256) {
3106 /* Use the LUN "flat space addressing method" */
3107 id |= 0x4000;
3108 }
3109 return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3110 (uint64_t)id << 32);
3111 } else if (usb) {
3112 /*
3113 * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
3114 * in the top 32 bits of the 64-bit LUN
3115 */
3116 unsigned usb_port = atoi(usb->port->path);
3117 unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
3118 return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3119 (uint64_t)id << 32);
3120 }
3121 }
3122
3123 /*
3124 * SLOF probes the USB devices, and if it recognizes that the device is a
3125 * storage device, it changes its name to "storage" instead of "usb-host",
3126 * and additionally adds a child node for the SCSI LUN, so the correct
3127 * boot path in SLOF is something like .../storage@1/disk@xxx" instead.
3128 */
3129 if (strcmp("usb-host", qdev_fw_name(dev)) == 0) {
3130 USBDevice *usbdev = CAST(USBDevice, dev, TYPE_USB_DEVICE);
3131 if (usb_host_dev_is_scsi_storage(usbdev)) {
3132 return g_strdup_printf("storage@%s/disk", usbdev->port->path);
3133 }
3134 }
3135
3136 if (phb) {
3137 /* Replace "pci" with "pci@800000020000000" */
3138 return g_strdup_printf("pci@%"PRIX64, phb->buid);
3139 }
3140
3141 if (vsc) {
3142 /* Same logic as virtio above */
3143 unsigned id = 0x1000000 | (vsc->target << 16) | vsc->lun;
3144 return g_strdup_printf("disk@%"PRIX64, (uint64_t)id << 32);
3145 }
3146
3147 if (g_str_equal("pci-bridge", qdev_fw_name(dev))) {
3148 /* SLOF uses "pci" instead of "pci-bridge" for PCI bridges */
3149 PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3150 return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn));
3151 }
3152
3153 if (pcidev) {
3154 return spapr_pci_fw_dev_name(pcidev);
3155 }
3156
3157 return NULL;
3158 }
3159
3160 static char *spapr_get_kvm_type(Object *obj, Error **errp)
3161 {
3162 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3163
3164 return g_strdup(spapr->kvm_type);
3165 }
3166
3167 static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
3168 {
3169 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3170
3171 g_free(spapr->kvm_type);
3172 spapr->kvm_type = g_strdup(value);
3173 }
3174
3175 static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp)
3176 {
3177 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3178
3179 return spapr->use_hotplug_event_source;
3180 }
3181
3182 static void spapr_set_modern_hotplug_events(Object *obj, bool value,
3183 Error **errp)
3184 {
3185 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3186
3187 spapr->use_hotplug_event_source = value;
3188 }
3189
3190 static bool spapr_get_msix_emulation(Object *obj, Error **errp)
3191 {
3192 return true;
3193 }
3194
3195 static char *spapr_get_resize_hpt(Object *obj, Error **errp)
3196 {
3197 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3198
3199 switch (spapr->resize_hpt) {
3200 case SPAPR_RESIZE_HPT_DEFAULT:
3201 return g_strdup("default");
3202 case SPAPR_RESIZE_HPT_DISABLED:
3203 return g_strdup("disabled");
3204 case SPAPR_RESIZE_HPT_ENABLED:
3205 return g_strdup("enabled");
3206 case SPAPR_RESIZE_HPT_REQUIRED:
3207 return g_strdup("required");
3208 }
3209 g_assert_not_reached();
3210 }
3211
3212 static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
3213 {
3214 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3215
3216 if (strcmp(value, "default") == 0) {
3217 spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT;
3218 } else if (strcmp(value, "disabled") == 0) {
3219 spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
3220 } else if (strcmp(value, "enabled") == 0) {
3221 spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED;
3222 } else if (strcmp(value, "required") == 0) {
3223 spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED;
3224 } else {
3225 error_setg(errp, "Bad value for \"resize-hpt\" property");
3226 }
3227 }
3228
3229 static bool spapr_get_vof(Object *obj, Error **errp)
3230 {
3231 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3232
3233 return spapr->vof != NULL;
3234 }
3235
3236 static void spapr_set_vof(Object *obj, bool value, Error **errp)
3237 {
3238 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3239
3240 if (spapr->vof) {
3241 vof_cleanup(spapr->vof);
3242 g_free(spapr->vof);
3243 spapr->vof = NULL;
3244 }
3245 if (!value) {
3246 return;
3247 }
3248 spapr->vof = g_malloc0(sizeof(*spapr->vof));
3249 }
3250
3251 static char *spapr_get_ic_mode(Object *obj, Error **errp)
3252 {
3253 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3254
3255 if (spapr->irq == &spapr_irq_xics_legacy) {
3256 return g_strdup("legacy");
3257 } else if (spapr->irq == &spapr_irq_xics) {
3258 return g_strdup("xics");
3259 } else if (spapr->irq == &spapr_irq_xive) {
3260 return g_strdup("xive");
3261 } else if (spapr->irq == &spapr_irq_dual) {
3262 return g_strdup("dual");
3263 }
3264 g_assert_not_reached();
3265 }
3266
3267 static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
3268 {
3269 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3270
3271 if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
3272 error_setg(errp, "This machine only uses the legacy XICS backend, don't pass ic-mode");
3273 return;
3274 }
3275
3276 /* The legacy IRQ backend can not be set */
3277 if (strcmp(value, "xics") == 0) {
3278 spapr->irq = &spapr_irq_xics;
3279 } else if (strcmp(value, "xive") == 0) {
3280 spapr->irq = &spapr_irq_xive;
3281 } else if (strcmp(value, "dual") == 0) {
3282 spapr->irq = &spapr_irq_dual;
3283 } else {
3284 error_setg(errp, "Bad value for \"ic-mode\" property");
3285 }
3286 }
3287
3288 static char *spapr_get_host_model(Object *obj, Error **errp)
3289 {
3290 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3291
3292 return g_strdup(spapr->host_model);
3293 }
3294
3295 static void spapr_set_host_model(Object *obj, const char *value, Error **errp)
3296 {
3297 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3298
3299 g_free(spapr->host_model);
3300 spapr->host_model = g_strdup(value);
3301 }
3302
3303 static char *spapr_get_host_serial(Object *obj, Error **errp)
3304 {
3305 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3306
3307 return g_strdup(spapr->host_serial);
3308 }
3309
3310 static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
3311 {
3312 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3313
3314 g_free(spapr->host_serial);
3315 spapr->host_serial = g_strdup(value);
3316 }
3317
3318 static void spapr_instance_init(Object *obj)
3319 {
3320 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3321 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3322 MachineState *ms = MACHINE(spapr);
3323 MachineClass *mc = MACHINE_GET_CLASS(ms);
3324
3325 /*
3326 * NVDIMM support went live in 5.1 without considering that, in
3327 * other archs, the user needs to enable NVDIMM support with the
3328 * 'nvdimm' machine option and the default behavior is NVDIMM
3329 * support disabled. It is too late to roll back to the standard
3330 * behavior without breaking 5.1 guests.
3331 */
3332 if (mc->nvdimm_supported) {
3333 ms->nvdimms_state->is_enabled = true;
3334 }
3335
3336 spapr->htab_fd = -1;
3337 spapr->use_hotplug_event_source = true;
3338 spapr->kvm_type = g_strdup(DEFAULT_KVM_TYPE);
3339 object_property_add_str(obj, "kvm-type",
3340 spapr_get_kvm_type, spapr_set_kvm_type);
3341 object_property_set_description(obj, "kvm-type",
3342 "Specifies the KVM virtualization mode (auto,"
3343 " hv, pr). Defaults to 'auto'. This mode will use"
3344 " any available KVM module loaded in the host,"
3345 " where kvm_hv takes precedence if both kvm_hv and"
3346 " kvm_pr are loaded.");
3347 object_property_add_bool(obj, "modern-hotplug-events",
3348 spapr_get_modern_hotplug_events,
3349 spapr_set_modern_hotplug_events);
3350 object_property_set_description(obj, "modern-hotplug-events",
3351 "Use dedicated hotplug event mechanism in"
3352 " place of standard EPOW events when possible"
3353 " (required for memory hot-unplug support)");
3354 ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
3355 "Maximum permitted CPU compatibility mode");
3356
3357 object_property_add_str(obj, "resize-hpt",
3358 spapr_get_resize_hpt, spapr_set_resize_hpt);
3359 object_property_set_description(obj, "resize-hpt",
3360 "Resizing of the Hash Page Table (enabled, disabled, required)");
3361 object_property_add_uint32_ptr(obj, "vsmt",
3362 &spapr->vsmt, OBJ_PROP_FLAG_READWRITE);
3363 object_property_set_description(obj, "vsmt",
3364 "Virtual SMT: KVM behaves as if this were"
3365 " the host's SMT mode");
3366
3367 object_property_add_bool(obj, "vfio-no-msix-emulation",
3368 spapr_get_msix_emulation, NULL);
3369
3370 object_property_add_uint64_ptr(obj, "kernel-addr",
3371 &spapr->kernel_addr, OBJ_PROP_FLAG_READWRITE);
3372 object_property_set_description(obj, "kernel-addr",
3373 stringify(KERNEL_LOAD_ADDR)
3374 " for -kernel is the default");
3375 spapr->kernel_addr = KERNEL_LOAD_ADDR;
3376
3377 object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
3378 object_property_set_description(obj, "x-vof",
3379 "Enable Virtual Open Firmware (experimental)");
3380
3381 /* The machine class defines the default interrupt controller mode */
3382 spapr->irq = smc->irq;
3383 object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
3384 spapr_set_ic_mode);
3385 object_property_set_description(obj, "ic-mode",
3386 "Specifies the interrupt controller mode (xics, xive, dual)");
3387
3388 object_property_add_str(obj, "host-model",
3389 spapr_get_host_model, spapr_set_host_model);
3390 object_property_set_description(obj, "host-model",
3391 "Host model to advertise in guest device tree");
3392 object_property_add_str(obj, "host-serial",
3393 spapr_get_host_serial, spapr_set_host_serial);
3394 object_property_set_description(obj, "host-serial",
3395 "Host serial number to advertise in guest device tree");
3396 }
3397
3398 static void spapr_machine_finalizefn(Object *obj)
3399 {
3400 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3401
3402 g_free(spapr->kvm_type);
3403 }
3404
3405 void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg)
3406 {
3407 SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
3408 PowerPCCPU *cpu = POWERPC_CPU(cs);
3409 CPUPPCState *env = &cpu->env;
3410
3411 cpu_synchronize_state(cs);
3412 /* If FWNMI is inactive, addr will be -1, which will deliver to 0x100 */
3413 if (spapr->fwnmi_system_reset_addr != -1) {
3414 uint64_t rtas_addr, addr;
3415
3416 /* get rtas addr from fdt */
3417 rtas_addr = spapr_get_rtas_addr();
3418 if (!rtas_addr) {
3419 qemu_system_guest_panicked(NULL);
3420 return;
3421 }
3422
3423 addr = rtas_addr + RTAS_ERROR_LOG_MAX + cs->cpu_index * sizeof(uint64_t)*2;
3424 stq_be_phys(&address_space_memory, addr, env->gpr[3]);
3425 stq_be_phys(&address_space_memory, addr + sizeof(uint64_t), 0);
3426 env->gpr[3] = addr;
3427 }
3428 ppc_cpu_do_system_reset(cs);
3429 if (spapr->fwnmi_system_reset_addr != -1) {
3430 env->nip = spapr->fwnmi_system_reset_addr;
3431 }
3432 }
3433
3434 static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
3435 {
3436 CPUState *cs;
3437
3438 CPU_FOREACH(cs) {
3439 async_run_on_cpu(cs, spapr_do_system_reset_on_cpu, RUN_ON_CPU_NULL);
3440 }
3441 }
3442
3443 int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3444 void *fdt, int *fdt_start_offset, Error **errp)
3445 {
3446 uint64_t addr;
3447 uint32_t node;
3448
3449 addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE;
3450 node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP,
3451 &error_abort);
3452 *fdt_start_offset = spapr_dt_memory_node(spapr, fdt, node, addr,
3453 SPAPR_MEMORY_BLOCK_SIZE);
3454 return 0;
3455 }
3456
3457 static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
3458 bool dedicated_hp_event_source)
3459 {
3460 SpaprDrc *drc;
3461 uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
3462 int i;
3463 uint64_t addr = addr_start;
3464 bool hotplugged = spapr_drc_hotplugged(dev);
3465
3466 for (i = 0; i < nr_lmbs; i++) {
3467 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3468 addr / SPAPR_MEMORY_BLOCK_SIZE);
3469 g_assert(drc);
3470
3471 /*
3472 * memory_device_get_free_addr() provided a range of free addresses
3473 * that doesn't overlap with any existing mapping at pre-plug. The
3474 * corresponding LMB DRCs are thus assumed to be all attachable.
3475 */
3476 spapr_drc_attach(drc, dev);
3477 if (!hotplugged) {
3478 spapr_drc_reset(drc);
3479 }
3480 addr += SPAPR_MEMORY_BLOCK_SIZE;
3481 }
3482 /* send hotplug notification to the
3483 * guest only in case of hotplugged memory
3484 */
3485 if (hotplugged) {
3486 if (dedicated_hp_event_source) {
3487 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3488 addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3489 g_assert(drc);
3490 spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3491 nr_lmbs,
3492 spapr_drc_index(drc));
3493 } else {
3494 spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB,
3495 nr_lmbs);
3496 }
3497 }
3498 }
3499
3500 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3501 {
3502 SpaprMachineState *ms = SPAPR_MACHINE(hotplug_dev);
3503 PCDIMMDevice *dimm = PC_DIMM(dev);
3504 uint64_t size, addr;
3505 int64_t slot;
3506 bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3507
3508 size = memory_device_get_region_size(MEMORY_DEVICE(dev), &error_abort);
3509
3510 pc_dimm_plug(dimm, MACHINE(ms));
3511
3512 if (!is_nvdimm) {
3513 addr = object_property_get_uint(OBJECT(dimm),
3514 PC_DIMM_ADDR_PROP, &error_abort);
3515 spapr_add_lmbs(dev, addr, size,
3516 spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT));
3517 } else {
3518 slot = object_property_get_int(OBJECT(dimm),
3519 PC_DIMM_SLOT_PROP, &error_abort);
3520 /* We should have valid slot number at this point */
3521 g_assert(slot >= 0);
3522 spapr_add_nvdimm(dev, slot);
3523 }
3524 }
3525
3526 static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3527 Error **errp)
3528 {
3529 const SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
3530 SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3531 bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3532 PCDIMMDevice *dimm = PC_DIMM(dev);
3533 Error *local_err = NULL;
3534 uint64_t size;
3535 Object *memdev;
3536 hwaddr pagesize;
3537
3538 if (!smc->dr_lmb_enabled) {
3539 error_setg(errp, "Memory hotplug not supported for this machine");
3540 return;
3541 }
3542
3543 size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err);
3544 if (local_err) {
3545 error_propagate(errp, local_err);
3546 return;
3547 }
3548
3549 if (is_nvdimm) {
3550 if (!spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, errp)) {
3551 return;
3552 }
3553 } else if (size % SPAPR_MEMORY_BLOCK_SIZE) {
3554 error_setg(errp, "Hotplugged memory size must be a multiple of "
3555 "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
3556 return;
3557 }
3558
3559 memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
3560 &error_abort);
3561 pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev));
3562 if (!spapr_check_pagesize(spapr, pagesize, errp)) {
3563 return;
3564 }
3565
3566 pc_dimm_pre_plug(dimm, MACHINE(hotplug_dev), NULL, errp);
3567 }
3568
3569 struct SpaprDimmState {
3570 PCDIMMDevice *dimm;
3571 uint32_t nr_lmbs;
3572 QTAILQ_ENTRY(SpaprDimmState) next;
3573 };
3574
3575 static SpaprDimmState *spapr_pending_dimm_unplugs_find(SpaprMachineState *s,
3576 PCDIMMDevice *dimm)
3577 {
3578 SpaprDimmState *dimm_state = NULL;
3579
3580 QTAILQ_FOREACH(dimm_state, &s->pending_dimm_unplugs, next) {
3581 if (dimm_state->dimm == dimm) {
3582 break;
3583 }
3584 }
3585 return dimm_state;
3586 }
3587
3588 static SpaprDimmState *spapr_pending_dimm_unplugs_add(SpaprMachineState *spapr,
3589 uint32_t nr_lmbs,
3590 PCDIMMDevice *dimm)
3591 {
3592 SpaprDimmState *ds = NULL;
3593
3594 /*
3595 * If this request is for a DIMM whose removal had failed earlier
3596 * (due to guest's refusal to remove the LMBs), we would have this
3597 * dimm already in the pending_dimm_unplugs list. In that
3598 * case don't add again.
3599 */
3600 ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3601 if (!ds) {
3602 ds = g_malloc0(sizeof(SpaprDimmState));
3603 ds->nr_lmbs = nr_lmbs;
3604 ds->dimm = dimm;
3605 QTAILQ_INSERT_HEAD(&spapr->pending_dimm_unplugs, ds, next);
3606 }
3607 return ds;
3608 }
3609
3610 static void spapr_pending_dimm_unplugs_remove(SpaprMachineState *spapr,
3611 SpaprDimmState *dimm_state)
3612 {
3613 QTAILQ_REMOVE(&spapr->pending_dimm_unplugs, dimm_state, next);
3614 g_free(dimm_state);
3615 }
3616
3617 static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms,
3618 PCDIMMDevice *dimm)
3619 {
3620 SpaprDrc *drc;
3621 uint64_t size = memory_device_get_region_size(MEMORY_DEVICE(dimm),
3622 &error_abort);
3623 uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3624 uint32_t avail_lmbs = 0;
3625 uint64_t addr_start, addr;
3626 int i;
3627
3628 addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3629 &error_abort);
3630
3631 addr = addr_start;
3632 for (i = 0; i < nr_lmbs; i++) {
3633 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3634 addr / SPAPR_MEMORY_BLOCK_SIZE);
3635 g_assert(drc);
3636 if (drc->dev) {
3637 avail_lmbs++;
3638 }
3639 addr += SPAPR_MEMORY_BLOCK_SIZE;
3640 }
3641
3642 return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm);
3643 }
3644
3645 void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev)
3646 {
3647 SpaprDimmState *ds;
3648 PCDIMMDevice *dimm;
3649 SpaprDrc *drc;
3650 uint32_t nr_lmbs;
3651 uint64_t size, addr_start, addr;
3652 g_autofree char *qapi_error = NULL;
3653 int i;
3654
3655 if (!dev) {
3656 return;
3657 }
3658
3659 dimm = PC_DIMM(dev);
3660 ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3661
3662 /*
3663 * 'ds == NULL' would mean that the DIMM doesn't have a pending
3664 * unplug state, but one of its DRC is marked as unplug_requested.
3665 * This is bad and weird enough to g_assert() out.
3666 */
3667 g_assert(ds);
3668
3669 spapr_pending_dimm_unplugs_remove(spapr, ds);
3670
3671 size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3672 nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3673
3674 addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3675 &error_abort);
3676
3677 addr = addr_start;
3678 for (i = 0; i < nr_lmbs; i++) {
3679 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3680 addr / SPAPR_MEMORY_BLOCK_SIZE);
3681 g_assert(drc);
3682
3683 drc->unplug_requested = false;
3684 addr += SPAPR_MEMORY_BLOCK_SIZE;
3685 }
3686
3687 /*
3688 * Tell QAPI that something happened and the memory
3689 * hotunplug wasn't successful.
3690 */
3691 qapi_error = g_strdup_printf("Memory hotunplug rejected by the guest "
3692 "for device %s", dev->id);
3693 qapi_event_send_mem_unplug_error(dev->id, qapi_error);
3694 }
3695
3696 /* Callback to be called during DRC release. */
3697 void spapr_lmb_release(DeviceState *dev)
3698 {
3699 HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3700 SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_ctrl);
3701 SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3702
3703 /* This information will get lost if a migration occurs
3704 * during the unplug process. In this case recover it. */
3705 if (ds == NULL) {
3706 ds = spapr_recover_pending_dimm_state(spapr, PC_DIMM(dev));
3707 g_assert(ds);
3708 /* The DRC being examined by the caller at least must be counted */
3709 g_assert(ds->nr_lmbs);
3710 }
3711
3712 if (--ds->nr_lmbs) {
3713 return;
3714 }
3715
3716 /*
3717 * Now that all the LMBs have been removed by the guest, call the
3718 * unplug handler chain. This can never fail.
3719 */
3720 hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3721 object_unparent(OBJECT(dev));
3722 }
3723
3724 static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3725 {
3726 SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3727 SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3728
3729 /* We really shouldn't get this far without anything to unplug */
3730 g_assert(ds);
3731
3732 pc_dimm_unplug(PC_DIMM(dev), MACHINE(hotplug_dev));
3733 qdev_unrealize(dev);
3734 spapr_pending_dimm_unplugs_remove(spapr, ds);
3735 }
3736
3737 static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
3738 DeviceState *dev, Error **errp)
3739 {
3740 SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3741 PCDIMMDevice *dimm = PC_DIMM(dev);
3742 uint32_t nr_lmbs;
3743 uint64_t size, addr_start, addr;
3744 int i;
3745 SpaprDrc *drc;
3746
3747 if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
3748 error_setg(errp, "nvdimm device hot unplug is not supported yet.");
3749 return;
3750 }
3751
3752 size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3753 nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3754
3755 addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3756 &error_abort);
3757
3758 /*
3759 * An existing pending dimm state for this DIMM means that there is an
3760 * unplug operation in progress, waiting for the spapr_lmb_release
3761 * callback to complete the job (BQL can't cover that far). In this case,
3762 * bail out to avoid detaching DRCs that were already released.
3763 */
3764 if (spapr_pending_dimm_unplugs_find(spapr, dimm)) {
3765 error_setg(errp, "Memory unplug already in progress for device %s",
3766 dev->id);
3767 return;
3768 }
3769
3770 spapr_pending_dimm_unplugs_add(spapr, nr_lmbs, dimm);
3771
3772 addr = addr_start;
3773 for (i = 0; i < nr_lmbs; i++) {
3774 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3775 addr / SPAPR_MEMORY_BLOCK_SIZE);
3776 g_assert(drc);
3777
3778 spapr_drc_unplug_request(drc);
3779 addr += SPAPR_MEMORY_BLOCK_SIZE;
3780 }
3781
3782 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3783 addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3784 spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3785 nr_lmbs, spapr_drc_index(drc));
3786 }
3787
3788 /* Callback to be called during DRC release. */
3789 void spapr_core_release(DeviceState *dev)
3790 {
3791 HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3792
3793 /* Call the unplug handler chain. This can never fail. */
3794 hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3795 object_unparent(OBJECT(dev));
3796 }
3797
3798 static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3799 {
3800 MachineState *ms = MACHINE(hotplug_dev);
3801 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
3802 CPUCore *cc = CPU_CORE(dev);
3803 CPUArchId *core_slot = spapr_find_cpu_slot(ms, cc->core_id, NULL);
3804
3805 if (smc->pre_2_10_has_unused_icps) {
3806 SpaprCpuCore *sc = SPAPR_CPU_CORE(OBJECT(dev));
3807 int i;
3808
3809 for (i = 0; i < cc->nr_threads; i++) {
3810 CPUState *cs = CPU(sc->threads[i]);
3811
3812 pre_2_10_vmstate_register_dummy_icp(cs->cpu_index);
3813 }
3814 }
3815
3816 assert(core_slot);
3817 core_slot->cpu = NULL;
3818 qdev_unrealize(dev);
3819 }
3820
3821 static
3822 void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
3823 Error **errp)
3824 {
3825 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3826 int index;
3827 SpaprDrc *drc;
3828 CPUCore *cc = CPU_CORE(dev);
3829
3830 if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
3831 error_setg(errp, "Unable to find CPU core with core-id: %d",
3832 cc->core_id);
3833 return;
3834 }
3835 if (index == 0) {
3836 error_setg(errp, "Boot CPU core may not be unplugged");
3837 return;
3838 }
3839
3840 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3841 spapr_vcpu_id(spapr, cc->core_id));
3842 g_assert(drc);
3843
3844 if (!spapr_drc_unplug_requested(drc)) {
3845 spapr_drc_unplug_request(drc);
3846 }
3847
3848 /*
3849 * spapr_hotplug_req_remove_by_index is left unguarded, out of the
3850 * "!spapr_drc_unplug_requested" check, to allow for multiple IRQ
3851 * pulses removing the same CPU. Otherwise, in an failed hotunplug
3852 * attempt (e.g. the kernel will refuse to remove the last online
3853 * CPU), we will never attempt it again because unplug_requested
3854 * will still be 'true' in that case.
3855 */
3856 spapr_hotplug_req_remove_by_index(drc);
3857 }
3858
3859 int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3860 void *fdt, int *fdt_start_offset, Error **errp)
3861 {
3862 SpaprCpuCore *core = SPAPR_CPU_CORE(drc->dev);
3863 CPUState *cs = CPU(core->threads[0]);
3864 PowerPCCPU *cpu = POWERPC_CPU(cs);
3865 DeviceClass *dc = DEVICE_GET_CLASS(cs);
3866 int id = spapr_get_vcpu_id(cpu);
3867 g_autofree char *nodename = NULL;
3868 int offset;
3869
3870 nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
3871 offset = fdt_add_subnode(fdt, 0, nodename);
3872
3873 spapr_dt_cpu(cs, fdt, offset, spapr);
3874
3875 /*
3876 * spapr_dt_cpu() does not fill the 'name' property in the
3877 * CPU node. The function is called during boot process, before
3878 * and after CAS, and overwriting the 'name' property written
3879 * by SLOF is not allowed.
3880 *
3881 * Write it manually after spapr_dt_cpu(). This makes the hotplug
3882 * CPUs more compatible with the coldplugged ones, which have
3883 * the 'name' property. Linux Kernel also relies on this
3884 * property to identify CPU nodes.
3885 */
3886 _FDT((fdt_setprop_string(fdt, offset, "name", nodename)));
3887
3888 *fdt_start_offset = offset;
3889 return 0;
3890 }
3891
3892 static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3893 {
3894 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3895 MachineClass *mc = MACHINE_GET_CLASS(spapr);
3896 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
3897 SpaprCpuCore *core = SPAPR_CPU_CORE(OBJECT(dev));
3898 CPUCore *cc = CPU_CORE(dev);
3899 CPUState *cs;
3900 SpaprDrc *drc;
3901 CPUArchId *core_slot;
3902 int index;
3903 bool hotplugged = spapr_drc_hotplugged(dev);
3904 int i;
3905
3906 core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3907 g_assert(core_slot); /* Already checked in spapr_core_pre_plug() */
3908
3909 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3910 spapr_vcpu_id(spapr, cc->core_id));
3911
3912 g_assert(drc || !mc->has_hotpluggable_cpus);
3913
3914 if (drc) {
3915 /*
3916 * spapr_core_pre_plug() already buys us this is a brand new
3917 * core being plugged into a free slot. Nothing should already
3918 * be attached to the corresponding DRC.
3919 */
3920 spapr_drc_attach(drc, dev);
3921
3922 if (hotplugged) {
3923 /*
3924 * Send hotplug notification interrupt to the guest only
3925 * in case of hotplugged CPUs.
3926 */
3927 spapr_hotplug_req_add_by_index(drc);
3928 } else {
3929 spapr_drc_reset(drc);
3930 }
3931 }
3932
3933 core_slot->cpu = OBJECT(dev);
3934
3935 /*
3936 * Set compatibility mode to match the boot CPU, which was either set
3937 * by the machine reset code or by CAS. This really shouldn't fail at
3938 * this point.
3939 */
3940 if (hotplugged) {
3941 for (i = 0; i < cc->nr_threads; i++) {
3942 ppc_set_compat(core->threads[i], POWERPC_CPU(first_cpu)->compat_pvr,
3943 &error_abort);
3944 }
3945 }
3946
3947 if (smc->pre_2_10_has_unused_icps) {
3948 for (i = 0; i < cc->nr_threads; i++) {
3949 cs = CPU(core->threads[i]);
3950 pre_2_10_vmstate_unregister_dummy_icp(cs->cpu_index);
3951 }
3952 }
3953 }
3954
3955 static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3956 Error **errp)
3957 {
3958 MachineState *machine = MACHINE(OBJECT(hotplug_dev));
3959 MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
3960 CPUCore *cc = CPU_CORE(dev);
3961 const char *base_core_type = spapr_get_cpu_core_type(machine->cpu_type);
3962 const char *type = object_get_typename(OBJECT(dev));
3963 CPUArchId *core_slot;
3964 int index;
3965 unsigned int smp_threads = machine->smp.threads;
3966
3967 if (dev->hotplugged && !mc->has_hotpluggable_cpus) {
3968 error_setg(errp, "CPU hotplug not supported for this machine");
3969 return;
3970 }
3971
3972 if (strcmp(base_core_type, type)) {
3973 error_setg(errp, "CPU core type should be %s", base_core_type);
3974 return;
3975 }
3976
3977 if (cc->core_id % smp_threads) {
3978 error_setg(errp, "invalid core id %d", cc->core_id);
3979 return;
3980 }
3981
3982 /*
3983 * In general we should have homogeneous threads-per-core, but old
3984 * (pre hotplug support) machine types allow the last core to have
3985 * reduced threads as a compatibility hack for when we allowed
3986 * total vcpus not a multiple of threads-per-core.
3987 */
3988 if (mc->has_hotpluggable_cpus && (cc->nr_threads != smp_threads)) {
3989 error_setg(errp, "invalid nr-threads %d, must be %d", cc->nr_threads,
3990 smp_threads);
3991 return;
3992 }
3993
3994 core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3995 if (!core_slot) {
3996 error_setg(errp, "core id %d out of range", cc->core_id);
3997 return;
3998 }
3999
4000 if (core_slot->cpu) {
4001 error_setg(errp, "core %d already populated", cc->core_id);
4002 return;
4003 }
4004
4005 numa_cpu_pre_plug(core_slot, dev, errp);
4006 }
4007
4008 int spapr_phb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
4009 void *fdt, int *fdt_start_offset, Error **errp)
4010 {
4011 SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(drc->dev);
4012 int intc_phandle;
4013
4014 intc_phandle = spapr_irq_get_phandle(spapr, spapr->fdt_blob, errp);
4015 if (intc_phandle <= 0) {
4016 return -1;
4017 }
4018
4019 if (spapr_dt_phb(spapr, sphb, intc_phandle, fdt, fdt_start_offset)) {
4020 error_setg(errp, "unable to create FDT node for PHB %d", sphb->index);
4021 return -1;
4022 }
4023
4024 /* generally SLOF creates these, for hotplug it's up to QEMU */
4025 _FDT(fdt_setprop_string(fdt, *fdt_start_offset, "name", "pci"));
4026
4027 return 0;
4028 }
4029
4030 static bool spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4031 Error **errp)
4032 {
4033 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4034 SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4035 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4036 const unsigned windows_supported = spapr_phb_windows_supported(sphb);
4037 SpaprDrc *drc;
4038
4039 if (dev->hotplugged && !smc->dr_phb_enabled) {
4040 error_setg(errp, "PHB hotplug not supported for this machine");
4041 return false;
4042 }
4043
4044 if (sphb->index == (uint32_t)-1) {
4045 error_setg(errp, "\"index\" for PAPR PHB is mandatory");
4046 return false;
4047 }
4048
4049 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4050 if (drc && drc->dev) {
4051 error_setg(errp, "PHB %d already attached", sphb->index);
4052 return false;
4053 }
4054
4055 /*
4056 * This will check that sphb->index doesn't exceed the maximum number of
4057 * PHBs for the current machine type.
4058 */
4059 return
4060 smc->phb_placement(spapr, sphb->index,
4061 &sphb->buid, &sphb->io_win_addr,
4062 &sphb->mem_win_addr, &sphb->mem64_win_addr,
4063 windows_supported, sphb->dma_liobn,
4064 &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr,
4065 errp);
4066 }
4067
4068 static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4069 {
4070 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4071 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4072 SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4073 SpaprDrc *drc;
4074 bool hotplugged = spapr_drc_hotplugged(dev);
4075
4076 if (!smc->dr_phb_enabled) {
4077 return;
4078 }
4079
4080 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4081 /* hotplug hooks should check it's enabled before getting this far */
4082 assert(drc);
4083
4084 /* spapr_phb_pre_plug() already checked the DRC is attachable */
4085 spapr_drc_attach(drc, dev);
4086
4087 if (hotplugged) {
4088 spapr_hotplug_req_add_by_index(drc);
4089 } else {
4090 spapr_drc_reset(drc);
4091 }
4092 }
4093
4094 void spapr_phb_release(DeviceState *dev)
4095 {
4096 HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
4097
4098 hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
4099 object_unparent(OBJECT(dev));
4100 }
4101
4102 static void spapr_phb_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4103 {
4104 qdev_unrealize(dev);
4105 }
4106
4107 static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev,
4108 DeviceState *dev, Error **errp)
4109 {
4110 SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4111 SpaprDrc *drc;
4112
4113 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4114 assert(drc);
4115
4116 if (!spapr_drc_unplug_requested(drc)) {
4117 spapr_drc_unplug_request(drc);
4118 spapr_hotplug_req_remove_by_index(drc);
4119 } else {
4120 error_setg(errp,
4121 "PCI Host Bridge unplug already in progress for device %s",
4122 dev->id);
4123 }
4124 }
4125
4126 static
4127 bool spapr_tpm_proxy_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4128 Error **errp)
4129 {
4130 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4131
4132 if (spapr->tpm_proxy != NULL) {
4133 error_setg(errp, "Only one TPM proxy can be specified for this machine");
4134 return false;
4135 }
4136
4137 return true;
4138 }
4139
4140 static void spapr_tpm_proxy_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4141 {
4142 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4143 SpaprTpmProxy *tpm_proxy = SPAPR_TPM_PROXY(dev);
4144
4145 /* Already checked in spapr_tpm_proxy_pre_plug() */
4146 g_assert(spapr->tpm_proxy == NULL);
4147
4148 spapr->tpm_proxy = tpm_proxy;
4149 }
4150
4151 static void spapr_tpm_proxy_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4152 {
4153 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4154
4155 qdev_unrealize(dev);
4156 object_unparent(OBJECT(dev));
4157 spapr->tpm_proxy = NULL;
4158 }
4159
4160 static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
4161 DeviceState *dev, Error **errp)
4162 {
4163 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4164 spapr_memory_plug(hotplug_dev, dev);
4165 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4166 spapr_core_plug(hotplug_dev, dev);
4167 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4168 spapr_phb_plug(hotplug_dev, dev);
4169 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4170 spapr_tpm_proxy_plug(hotplug_dev, dev);
4171 }
4172 }
4173
4174 static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
4175 DeviceState *dev, Error **errp)
4176 {
4177 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4178 spapr_memory_unplug(hotplug_dev, dev);
4179 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4180 spapr_core_unplug(hotplug_dev, dev);
4181 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4182 spapr_phb_unplug(hotplug_dev, dev);
4183 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4184 spapr_tpm_proxy_unplug(hotplug_dev, dev);
4185 }
4186 }
4187
4188 bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr)
4189 {
4190 return spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT) ||
4191 /*
4192 * CAS will process all pending unplug requests.
4193 *
4194 * HACK: a guest could theoretically have cleared all bits in OV5,
4195 * but none of the guests we care for do.
4196 */
4197 spapr_ovec_empty(spapr->ov5_cas);
4198 }
4199
4200 static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
4201 DeviceState *dev, Error **errp)
4202 {
4203 SpaprMachineState *sms = SPAPR_MACHINE(OBJECT(hotplug_dev));
4204 MachineClass *mc = MACHINE_GET_CLASS(sms);
4205 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4206
4207 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4208 if (spapr_memory_hot_unplug_supported(sms)) {
4209 spapr_memory_unplug_request(hotplug_dev, dev, errp);
4210 } else {
4211 error_setg(errp, "Memory hot unplug not supported for this guest");
4212 }
4213 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4214 if (!mc->has_hotpluggable_cpus) {
4215 error_setg(errp, "CPU hot unplug not supported on this machine");
4216 return;
4217 }
4218 spapr_core_unplug_request(hotplug_dev, dev, errp);
4219 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4220 if (!smc->dr_phb_enabled) {
4221 error_setg(errp, "PHB hot unplug not supported on this machine");
4222 return;
4223 }
4224 spapr_phb_unplug_request(hotplug_dev, dev, errp);
4225 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4226 spapr_tpm_proxy_unplug(hotplug_dev, dev);
4227 }
4228 }
4229
4230 static void spapr_machine_device_pre_plug(HotplugHandler *hotplug_dev,
4231 DeviceState *dev, Error **errp)
4232 {
4233 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4234 spapr_memory_pre_plug(hotplug_dev, dev, errp);
4235 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4236 spapr_core_pre_plug(hotplug_dev, dev, errp);
4237 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4238 spapr_phb_pre_plug(hotplug_dev, dev, errp);
4239 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4240 spapr_tpm_proxy_pre_plug(hotplug_dev, dev, errp);
4241 }
4242 }
4243
4244 static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine,
4245 DeviceState *dev)
4246 {
4247 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
4248 object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE) ||
4249 object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE) ||
4250 object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4251 return HOTPLUG_HANDLER(machine);
4252 }
4253 if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
4254 PCIDevice *pcidev = PCI_DEVICE(dev);
4255 PCIBus *root = pci_device_root_bus(pcidev);
4256 SpaprPhbState *phb =
4257 (SpaprPhbState *)object_dynamic_cast(OBJECT(BUS(root)->parent),
4258 TYPE_SPAPR_PCI_HOST_BRIDGE);
4259
4260 if (phb) {
4261 return HOTPLUG_HANDLER(phb);
4262 }
4263 }
4264 return NULL;
4265 }
4266
4267 static CpuInstanceProperties
4268 spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
4269 {
4270 CPUArchId *core_slot;
4271 MachineClass *mc = MACHINE_GET_CLASS(machine);
4272
4273 /* make sure possible_cpu are intialized */
4274 mc->possible_cpu_arch_ids(machine);
4275 /* get CPU core slot containing thread that matches cpu_index */
4276 core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
4277 assert(core_slot);
4278 return core_slot->props;
4279 }
4280
4281 static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
4282 {
4283 return idx / ms->smp.cores % ms->numa_state->num_nodes;
4284 }
4285
4286 static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
4287 {
4288 int i;
4289 unsigned int smp_threads = machine->smp.threads;
4290 unsigned int smp_cpus = machine->smp.cpus;
4291 const char *core_type;
4292 int spapr_max_cores = machine->smp.max_cpus / smp_threads;
4293 MachineClass *mc = MACHINE_GET_CLASS(machine);
4294
4295 if (!mc->has_hotpluggable_cpus) {
4296 spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
4297 }
4298 if (machine->possible_cpus) {
4299 assert(machine->possible_cpus->len == spapr_max_cores);
4300 return machine->possible_cpus;
4301 }
4302
4303 core_type = spapr_get_cpu_core_type(machine->cpu_type);
4304 if (!core_type) {
4305 error_report("Unable to find sPAPR CPU Core definition");
4306 exit(1);
4307 }
4308
4309 machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
4310 sizeof(CPUArchId) * spapr_max_cores);
4311 machine->possible_cpus->len = spapr_max_cores;
4312 for (i = 0; i < machine->possible_cpus->len; i++) {
4313 int core_id = i * smp_threads;
4314
4315 machine->possible_cpus->cpus[i].type = core_type;
4316 machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
4317 machine->possible_cpus->cpus[i].arch_id = core_id;
4318 machine->possible_cpus->cpus[i].props.has_core_id = true;
4319 machine->possible_cpus->cpus[i].props.core_id = core_id;
4320 }
4321 return machine->possible_cpus;
4322 }
4323
4324 static bool spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
4325 uint64_t *buid, hwaddr *pio,
4326 hwaddr *mmio32, hwaddr *mmio64,
4327 unsigned n_dma, uint32_t *liobns,
4328 hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4329 {
4330 /*
4331 * New-style PHB window placement.
4332 *
4333 * Goals: Gives large (1TiB), naturally aligned 64-bit MMIO window
4334 * for each PHB, in addition to 2GiB 32-bit MMIO and 64kiB PIO
4335 * windows.
4336 *
4337 * Some guest kernels can't work with MMIO windows above 1<<46
4338 * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB
4339 *
4340 * 32TiB..(33TiB+1984kiB) contains the 64kiB PIO windows for each
4341 * PHB stacked together. (32TiB+2GiB)..(32TiB+64GiB) contains the
4342 * 2GiB 32-bit MMIO windows for each PHB. Then 33..64TiB has the
4343 * 1TiB 64-bit MMIO windows for each PHB.
4344 */
4345 const uint64_t base_buid = 0x800000020000000ULL;
4346 int i;
4347
4348 /* Sanity check natural alignments */
4349 QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4350 QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4351 QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0);
4352 QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0);
4353 /* Sanity check bounds */
4354 QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_IO_WIN_SIZE) >
4355 SPAPR_PCI_MEM32_WIN_SIZE);
4356 QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_MEM32_WIN_SIZE) >
4357 SPAPR_PCI_MEM64_WIN_SIZE);
4358
4359 if (index >= SPAPR_MAX_PHBS) {
4360 error_setg(errp, "\"index\" for PAPR PHB is too large (max %llu)",
4361 SPAPR_MAX_PHBS - 1);
4362 return false;
4363 }
4364
4365 *buid = base_buid + index;
4366 for (i = 0; i < n_dma; ++i) {
4367 liobns[i] = SPAPR_PCI_LIOBN(index, i);
4368 }
4369
4370 *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
4371 *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
4372 *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
4373
4374 *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
4375 *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
4376 return true;
4377 }
4378
4379 static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
4380 {
4381 SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4382
4383 return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL;
4384 }
4385
4386 static void spapr_ics_resend(XICSFabric *dev)
4387 {
4388 SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4389
4390 ics_resend(spapr->ics);
4391 }
4392
4393 static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id)
4394 {
4395 PowerPCCPU *cpu = spapr_find_cpu(vcpu_id);
4396
4397 return cpu ? spapr_cpu_state(cpu)->icp : NULL;
4398 }
4399
4400 static void spapr_pic_print_info(InterruptStatsProvider *obj,
4401 Monitor *mon)
4402 {
4403 SpaprMachineState *spapr = SPAPR_MACHINE(obj);
4404
4405 spapr_irq_print_info(spapr, mon);
4406 monitor_printf(mon, "irqchip: %s\n",
4407 kvm_irqchip_in_kernel() ? "in-kernel" : "emulated");
4408 }
4409
4410 /*
4411 * This is a XIVE only operation
4412 */
4413 static int spapr_match_nvt(XiveFabric *xfb, uint8_t format,
4414 uint8_t nvt_blk, uint32_t nvt_idx,
4415 bool cam_ignore, uint8_t priority,
4416 uint32_t logic_serv, XiveTCTXMatch *match)
4417 {
4418 SpaprMachineState *spapr = SPAPR_MACHINE(xfb);
4419 XivePresenter *xptr = XIVE_PRESENTER(spapr->active_intc);
4420 XivePresenterClass *xpc = XIVE_PRESENTER_GET_CLASS(xptr);
4421 int count;
4422
4423 count = xpc->match_nvt(xptr, format, nvt_blk, nvt_idx, cam_ignore,
4424 priority, logic_serv, match);
4425 if (count < 0) {
4426 return count;
4427 }
4428
4429 /*
4430 * When we implement the save and restore of the thread interrupt
4431 * contexts in the enter/exit CPU handlers of the machine and the
4432 * escalations in QEMU, we should be able to handle non dispatched
4433 * vCPUs.
4434 *
4435 * Until this is done, the sPAPR machine should find at least one
4436 * matching context always.
4437 */
4438 if (count == 0) {
4439 qemu_log_mask(LOG_GUEST_ERROR, "XIVE: NVT %x/%x is not dispatched\n",
4440 nvt_blk, nvt_idx);
4441 }
4442
4443 return count;
4444 }
4445
4446 int spapr_get_vcpu_id(PowerPCCPU *cpu)
4447 {
4448 return cpu->vcpu_id;
4449 }
4450
4451 bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp)
4452 {
4453 SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
4454 MachineState *ms = MACHINE(spapr);
4455 int vcpu_id;
4456
4457 vcpu_id = spapr_vcpu_id(spapr, cpu_index);
4458
4459 if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) {
4460 error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id);
4461 error_append_hint(errp, "Adjust the number of cpus to %d "
4462 "or try to raise the number of threads per core\n",
4463 vcpu_id * ms->smp.threads / spapr->vsmt);
4464 return false;
4465 }
4466
4467 cpu->vcpu_id = vcpu_id;
4468 return true;
4469 }
4470
4471 PowerPCCPU *spapr_find_cpu(int vcpu_id)
4472 {
4473 CPUState *cs;
4474
4475 CPU_FOREACH(cs) {
4476 PowerPCCPU *cpu = POWERPC_CPU(cs);
4477
4478 if (spapr_get_vcpu_id(cpu) == vcpu_id) {
4479 return cpu;
4480 }
4481 }
4482
4483 return NULL;
4484 }
4485
4486 static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
4487 {
4488 SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4489
4490 /* These are only called by TCG, KVM maintains dispatch state */
4491
4492 spapr_cpu->prod = false;
4493 if (spapr_cpu->vpa_addr) {
4494 CPUState *cs = CPU(cpu);
4495 uint32_t dispatch;
4496
4497 dispatch = ldl_be_phys(cs->as,
4498 spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
4499 dispatch++;
4500 if ((dispatch & 1) != 0) {
4501 qemu_log_mask(LOG_GUEST_ERROR,
4502 "VPA: incorrect dispatch counter value for "
4503 "dispatched partition %u, correcting.\n", dispatch);
4504 dispatch++;
4505 }
4506 stl_be_phys(cs->as,
4507 spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
4508 }
4509 }
4510
4511 static void spapr_cpu_exec_exit(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
4512 {
4513 SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4514
4515 if (spapr_cpu->vpa_addr) {
4516 CPUState *cs = CPU(cpu);
4517 uint32_t dispatch;
4518
4519 dispatch = ldl_be_phys(cs->as,
4520 spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
4521 dispatch++;
4522 if ((dispatch & 1) != 1) {
4523 qemu_log_mask(LOG_GUEST_ERROR,
4524 "VPA: incorrect dispatch counter value for "
4525 "preempted partition %u, correcting.\n", dispatch);
4526 dispatch++;
4527 }
4528 stl_be_phys(cs->as,
4529 spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
4530 }
4531 }
4532
4533 static void spapr_machine_class_init(ObjectClass *oc, void *data)
4534 {
4535 MachineClass *mc = MACHINE_CLASS(oc);
4536 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(oc);
4537 FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
4538 NMIClass *nc = NMI_CLASS(oc);
4539 HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
4540 PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);
4541 XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
4542 InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
4543 XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc);
4544 VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
4545
4546 mc->desc = "pSeries Logical Partition (PAPR compliant)";
4547 mc->ignore_boot_device_suffixes = true;
4548
4549 /*
4550 * We set up the default / latest behaviour here. The class_init
4551 * functions for the specific versioned machine types can override
4552 * these details for backwards compatibility
4553 */
4554 mc->init = spapr_machine_init;
4555 mc->reset = spapr_machine_reset;
4556 mc->block_default_type = IF_SCSI;
4557
4558 /*
4559 * Setting max_cpus to INT32_MAX. Both KVM and TCG max_cpus values
4560 * should be limited by the host capability instead of hardcoded.
4561 * max_cpus for KVM guests will be checked in kvm_init(), and TCG
4562 * guests are welcome to have as many CPUs as the host are capable
4563 * of emulate.
4564 */
4565 mc->max_cpus = INT32_MAX;
4566
4567 mc->no_parallel = 1;
4568 mc->default_boot_order = "";
4569 mc->default_ram_size = 512 * MiB;
4570 mc->default_ram_id = "ppc_spapr.ram";
4571 mc->default_display = "std";
4572 mc->kvm_type = spapr_kvm_type;
4573 machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE);
4574 mc->pci_allow_0_address = true;
4575 assert(!mc->get_hotplug_handler);
4576 mc->get_hotplug_handler = spapr_get_hotplug_handler;
4577 hc->pre_plug = spapr_machine_device_pre_plug;
4578 hc->plug = spapr_machine_device_plug;
4579 mc->cpu_index_to_instance_props = spapr_cpu_index_to_props;
4580 mc->get_default_cpu_node_id = spapr_get_default_cpu_node_id;
4581 mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids;
4582 hc->unplug_request = spapr_machine_device_unplug_request;
4583 hc->unplug = spapr_machine_device_unplug;
4584
4585 smc->dr_lmb_enabled = true;
4586 smc->update_dt_enabled = true;
4587 mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power9_v2.0");
4588 mc->has_hotpluggable_cpus = true;
4589 mc->nvdimm_supported = true;
4590 smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
4591 fwc->get_dev_path = spapr_get_fw_dev_path;
4592 nc->nmi_monitor_handler = spapr_nmi;
4593 smc->phb_placement = spapr_phb_placement;
4594 vhc->hypercall = emulate_spapr_hypercall;
4595 vhc->hpt_mask = spapr_hpt_mask;
4596 vhc->map_hptes = spapr_map_hptes;
4597 vhc->unmap_hptes = spapr_unmap_hptes;
4598 vhc->hpte_set_c = spapr_hpte_set_c;
4599 vhc->hpte_set_r = spapr_hpte_set_r;
4600 vhc->get_pate = spapr_get_pate;
4601 vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr;
4602 vhc->cpu_exec_enter = spapr_cpu_exec_enter;
4603 vhc->cpu_exec_exit = spapr_cpu_exec_exit;
4604 xic->ics_get = spapr_ics_get;
4605 xic->ics_resend = spapr_ics_resend;
4606 xic->icp_get = spapr_icp_get;
4607 ispc->print_info = spapr_pic_print_info;
4608 /* Force NUMA node memory size to be a multiple of
4609 * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity
4610 * in which LMBs are represented and hot-added
4611 */
4612 mc->numa_mem_align_shift = 28;
4613 mc->auto_enable_numa = true;
4614
4615 smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
4616 smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
4617 smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
4618 smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
4619 smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
4620 smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_WORKAROUND;
4621 smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */
4622 smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
4623 smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
4624 smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
4625 smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
4626 smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
4627 spapr_caps_add_properties(smc);
4628 smc->irq = &spapr_irq_dual;
4629 smc->dr_phb_enabled = true;
4630 smc->linux_pci_probe = true;
4631 smc->smp_threads_vsmt = true;
4632 smc->nr_xirqs = SPAPR_NR_XIRQS;
4633 xfc->match_nvt = spapr_match_nvt;
4634 vmc->client_architecture_support = spapr_vof_client_architecture_support;
4635 vmc->quiesce = spapr_vof_quiesce;
4636 vmc->setprop = spapr_vof_setprop;
4637 }
4638
4639 static const TypeInfo spapr_machine_info = {
4640 .name = TYPE_SPAPR_MACHINE,
4641 .parent = TYPE_MACHINE,
4642 .abstract = true,
4643 .instance_size = sizeof(SpaprMachineState),
4644 .instance_init = spapr_instance_init,
4645 .instance_finalize = spapr_machine_finalizefn,
4646 .class_size = sizeof(SpaprMachineClass),
4647 .class_init = spapr_machine_class_init,
4648 .interfaces = (InterfaceInfo[]) {
4649 { TYPE_FW_PATH_PROVIDER },
4650 { TYPE_NMI },
4651 { TYPE_HOTPLUG_HANDLER },
4652 { TYPE_PPC_VIRTUAL_HYPERVISOR },
4653 { TYPE_XICS_FABRIC },
4654 { TYPE_INTERRUPT_STATS_PROVIDER },
4655 { TYPE_XIVE_FABRIC },
4656 { TYPE_VOF_MACHINE_IF },
4657 { }
4658 },
4659 };
4660
4661 static void spapr_machine_latest_class_options(MachineClass *mc)
4662 {
4663 mc->alias = "pseries";
4664 mc->is_default = true;
4665 }
4666
/*
 * DEFINE_SPAPR_MACHINE(suffix, verstr, latest)
 *
 * Defines and registers the versioned machine type "pseries-<verstr>" as a
 * child of TYPE_SPAPR_MACHINE.  Its class_init calls
 * spapr_machine_<suffix>_class_options(), which must already be defined, and
 * additionally spapr_machine_latest_class_options() when @latest is true.
 * The invocation must be followed by a semicolon.
 */
#define DEFINE_SPAPR_MACHINE(suffix, verstr, latest)                    \
    static void spapr_machine_##suffix##_class_init(ObjectClass *oc,    \
                                                    void *data)         \
    {                                                                   \
        MachineClass *mc = MACHINE_CLASS(oc);                           \
        spapr_machine_##suffix##_class_options(mc);                     \
        if (latest) {                                                   \
            spapr_machine_latest_class_options(mc);                     \
        }                                                               \
    }                                                                   \
    static const TypeInfo spapr_machine_##suffix##_info = {             \
        .name = MACHINE_TYPE_NAME("pseries-" verstr),                   \
        .parent = TYPE_SPAPR_MACHINE,                                   \
        .class_init = spapr_machine_##suffix##_class_init,              \
    };                                                                  \
    static void spapr_machine_register_##suffix(void)                   \
    {                                                                   \
        type_register(&spapr_machine_##suffix##_info);                  \
    }                                                                   \
    type_init(spapr_machine_register_##suffix)
4687
4688 /*
4689 * pseries-6.1
4690 */
4691 static void spapr_machine_6_1_class_options(MachineClass *mc)
4692 {
4693 /* Defaults for the latest behaviour inherited from the base class */
4694 }
4695
4696 DEFINE_SPAPR_MACHINE(6_1, "6.1", true);
4697
4698 /*
4699 * pseries-6.0
4700 */
4701 static void spapr_machine_6_0_class_options(MachineClass *mc)
4702 {
4703 spapr_machine_6_1_class_options(mc);
4704 compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
4705 }
4706
4707 DEFINE_SPAPR_MACHINE(6_0, "6.0", false);
4708
4709 /*
4710 * pseries-5.2
4711 */
4712 static void spapr_machine_5_2_class_options(MachineClass *mc)
4713 {
4714 spapr_machine_6_0_class_options(mc);
4715 compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len);
4716 }
4717
4718 DEFINE_SPAPR_MACHINE(5_2, "5.2", false);
4719
4720 /*
4721 * pseries-5.1
4722 */
4723 static void spapr_machine_5_1_class_options(MachineClass *mc)
4724 {
4725 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4726
4727 spapr_machine_5_2_class_options(mc);
4728 compat_props_add(mc->compat_props, hw_compat_5_1, hw_compat_5_1_len);
4729 smc->pre_5_2_numa_associativity = true;
4730 }
4731
4732 DEFINE_SPAPR_MACHINE(5_1, "5.1", false);
4733
4734 /*
4735 * pseries-5.0
4736 */
4737 static void spapr_machine_5_0_class_options(MachineClass *mc)
4738 {
4739 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4740 static GlobalProperty compat[] = {
4741 { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
4742 };
4743
4744 spapr_machine_5_1_class_options(mc);
4745 compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
4746 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4747 mc->numa_mem_supported = true;
4748 smc->pre_5_1_assoc_refpoints = true;
4749 }
4750
4751 DEFINE_SPAPR_MACHINE(5_0, "5.0", false);
4752
4753 /*
4754 * pseries-4.2
4755 */
4756 static void spapr_machine_4_2_class_options(MachineClass *mc)
4757 {
4758 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4759
4760 spapr_machine_5_0_class_options(mc);
4761 compat_props_add(mc->compat_props, hw_compat_4_2, hw_compat_4_2_len);
4762 smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
4763 smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_OFF;
4764 smc->rma_limit = 16 * GiB;
4765 mc->nvdimm_supported = false;
4766 }
4767
4768 DEFINE_SPAPR_MACHINE(4_2, "4.2", false);
4769
4770 /*
4771 * pseries-4.1
4772 */
4773 static void spapr_machine_4_1_class_options(MachineClass *mc)
4774 {
4775 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4776 static GlobalProperty compat[] = {
4777 /* Only allow 4kiB and 64kiB IOMMU pagesizes */
4778 { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" },
4779 };
4780
4781 spapr_machine_4_2_class_options(mc);
4782 smc->linux_pci_probe = false;
4783 smc->smp_threads_vsmt = false;
4784 compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
4785 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4786 }
4787
4788 DEFINE_SPAPR_MACHINE(4_1, "4.1", false);
4789
4790 /*
4791 * pseries-4.0
4792 */
4793 static bool phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
4794 uint64_t *buid, hwaddr *pio,
4795 hwaddr *mmio32, hwaddr *mmio64,
4796 unsigned n_dma, uint32_t *liobns,
4797 hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4798 {
4799 if (!spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma,
4800 liobns, nv2gpa, nv2atsd, errp)) {
4801 return false;
4802 }
4803
4804 *nv2gpa = 0;
4805 *nv2atsd = 0;
4806 return true;
4807 }
4808 static void spapr_machine_4_0_class_options(MachineClass *mc)
4809 {
4810 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4811
4812 spapr_machine_4_1_class_options(mc);
4813 compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len);
4814 smc->phb_placement = phb_placement_4_0;
4815 smc->irq = &spapr_irq_xics;
4816 smc->pre_4_1_migration = true;
4817 }
4818
4819 DEFINE_SPAPR_MACHINE(4_0, "4.0", false);
4820
4821 /*
4822 * pseries-3.1
4823 */
4824 static void spapr_machine_3_1_class_options(MachineClass *mc)
4825 {
4826 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4827
4828 spapr_machine_4_0_class_options(mc);
4829 compat_props_add(mc->compat_props, hw_compat_3_1, hw_compat_3_1_len);
4830
4831 mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
4832 smc->update_dt_enabled = false;
4833 smc->dr_phb_enabled = false;
4834 smc->broken_host_serial_model = true;
4835 smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
4836 smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
4837 smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
4838 smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF;
4839 }
4840
4841 DEFINE_SPAPR_MACHINE(3_1, "3.1", false);
4842
4843 /*
4844 * pseries-3.0
4845 */
4846
4847 static void spapr_machine_3_0_class_options(MachineClass *mc)
4848 {
4849 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4850
4851 spapr_machine_3_1_class_options(mc);
4852 compat_props_add(mc->compat_props, hw_compat_3_0, hw_compat_3_0_len);
4853
4854 smc->legacy_irq_allocation = true;
4855 smc->nr_xirqs = 0x400;
4856 smc->irq = &spapr_irq_xics_legacy;
4857 }
4858
4859 DEFINE_SPAPR_MACHINE(3_0, "3.0", false);
4860
4861 /*
4862 * pseries-2.12
4863 */
4864 static void spapr_machine_2_12_class_options(MachineClass *mc)
4865 {
4866 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4867 static GlobalProperty compat[] = {
4868 { TYPE_POWERPC_CPU, "pre-3.0-migration", "on" },
4869 { TYPE_SPAPR_CPU_CORE, "pre-3.0-migration", "on" },
4870 };
4871
4872 spapr_machine_3_0_class_options(mc);
4873 compat_props_add(mc->compat_props, hw_compat_2_12, hw_compat_2_12_len);
4874 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4875
4876 /* We depend on kvm_enabled() to choose a default value for the
4877 * hpt-max-page-size capability. Of course we can't do it here
4878 * because this is too early and the HW accelerator isn't initialzed
4879 * yet. Postpone this to machine init (see default_caps_with_cpu()).
4880 */
4881 smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 0;
4882 }
4883
4884 DEFINE_SPAPR_MACHINE(2_12, "2.12", false);
4885
4886 static void spapr_machine_2_12_sxxm_class_options(MachineClass *mc)
4887 {
4888 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4889
4890 spapr_machine_2_12_class_options(mc);
4891 smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
4892 smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
4893 smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD;
4894 }
4895
4896 DEFINE_SPAPR_MACHINE(2_12_sxxm, "2.12-sxxm", false);
4897
4898 /*
4899 * pseries-2.11
4900 */
4901
4902 static void spapr_machine_2_11_class_options(MachineClass *mc)
4903 {
4904 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4905
4906 spapr_machine_2_12_class_options(mc);
4907 smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_ON;
4908 compat_props_add(mc->compat_props, hw_compat_2_11, hw_compat_2_11_len);
4909 }
4910
4911 DEFINE_SPAPR_MACHINE(2_11, "2.11", false);
4912
4913 /*
4914 * pseries-2.10
4915 */
4916
4917 static void spapr_machine_2_10_class_options(MachineClass *mc)
4918 {
4919 spapr_machine_2_11_class_options(mc);
4920 compat_props_add(mc->compat_props, hw_compat_2_10, hw_compat_2_10_len);
4921 }
4922
4923 DEFINE_SPAPR_MACHINE(2_10, "2.10", false);
4924
4925 /*
4926 * pseries-2.9
4927 */
4928
4929 static void spapr_machine_2_9_class_options(MachineClass *mc)
4930 {
4931 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4932 static GlobalProperty compat[] = {
4933 { TYPE_POWERPC_CPU, "pre-2.10-migration", "on" },
4934 };
4935
4936 spapr_machine_2_10_class_options(mc);
4937 compat_props_add(mc->compat_props, hw_compat_2_9, hw_compat_2_9_len);
4938 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4939 smc->pre_2_10_has_unused_icps = true;
4940 smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED;
4941 }
4942
4943 DEFINE_SPAPR_MACHINE(2_9, "2.9", false);
4944
4945 /*
4946 * pseries-2.8
4947 */
4948
4949 static void spapr_machine_2_8_class_options(MachineClass *mc)
4950 {
4951 static GlobalProperty compat[] = {
4952 { TYPE_SPAPR_PCI_HOST_BRIDGE, "pcie-extended-configuration-space", "off" },
4953 };
4954
4955 spapr_machine_2_9_class_options(mc);
4956 compat_props_add(mc->compat_props, hw_compat_2_8, hw_compat_2_8_len);
4957 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4958 mc->numa_mem_align_shift = 23;
4959 }
4960
4961 DEFINE_SPAPR_MACHINE(2_8, "2.8", false);
4962
4963 /*
4964 * pseries-2.7
4965 */
4966
4967 static bool phb_placement_2_7(SpaprMachineState *spapr, uint32_t index,
4968 uint64_t *buid, hwaddr *pio,
4969 hwaddr *mmio32, hwaddr *mmio64,
4970 unsigned n_dma, uint32_t *liobns,
4971 hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4972 {
4973 /* Legacy PHB placement for pseries-2.7 and earlier machine types */
4974 const uint64_t base_buid = 0x800000020000000ULL;
4975 const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */
4976 const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */
4977 const hwaddr pio_offset = 0x80000000; /* 2 GiB */
4978 const uint32_t max_index = 255;
4979 const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */
4980
4981 uint64_t ram_top = MACHINE(spapr)->ram_size;
4982 hwaddr phb0_base, phb_base;
4983 int i;
4984
4985 /* Do we have device memory? */
4986 if (MACHINE(spapr)->maxram_size > ram_top) {
4987 /* Can't just use maxram_size, because there may be an
4988 * alignment gap between normal and device memory regions
4989 */
4990 ram_top = MACHINE(spapr)->device_memory->base +
4991 memory_region_size(&MACHINE(spapr)->device_memory->mr);
4992 }
4993
4994 phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment);
4995
4996 if (index > max_index) {
4997 error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
4998 max_index);
4999 return false;
5000 }
5001
5002 *buid = base_buid + index;
5003 for (i = 0; i < n_dma; ++i) {
5004 liobns[i] = SPAPR_PCI_LIOBN(index, i);
5005 }
5006
5007 phb_base = phb0_base + index * phb_spacing;
5008 *pio = phb_base + pio_offset;
5009 *mmio32 = phb_base + mmio_offset;
5010 /*
5011 * We don't set the 64-bit MMIO window, relying on the PHB's
5012 * fallback behaviour of automatically splitting a large "32-bit"
5013 * window into contiguous 32-bit and 64-bit windows
5014 */
5015
5016 *nv2gpa = 0;
5017 *nv2atsd = 0;
5018 return true;
5019 }
5020
5021 static void spapr_machine_2_7_class_options(MachineClass *mc)
5022 {
5023 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
5024 static GlobalProperty compat[] = {
5025 { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0xf80000000", },
5026 { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem64_win_size", "0", },
5027 { TYPE_POWERPC_CPU, "pre-2.8-migration", "on", },
5028 { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-2.8-migration", "on", },
5029 };
5030
5031 spapr_machine_2_8_class_options(mc);
5032 mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power7_v2.3");
5033 mc->default_machine_opts = "modern-hotplug-events=off";
5034 compat_props_add(mc->compat_props, hw_compat_2_7, hw_compat_2_7_len);
5035 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
5036 smc->phb_placement = phb_placement_2_7;
5037 }
5038
5039 DEFINE_SPAPR_MACHINE(2_7, "2.7", false);
5040
5041 /*
5042 * pseries-2.6
5043 */
5044
5045 static void spapr_machine_2_6_class_options(MachineClass *mc)
5046 {
5047 static GlobalProperty compat[] = {
5048 { TYPE_SPAPR_PCI_HOST_BRIDGE, "ddw", "off" },
5049 };
5050
5051 spapr_machine_2_7_class_options(mc);
5052 mc->has_hotpluggable_cpus = false;
5053 compat_props_add(mc->compat_props, hw_compat_2_6, hw_compat_2_6_len);
5054 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
5055 }
5056
5057 DEFINE_SPAPR_MACHINE(2_6, "2.6", false);
5058
5059 /*
5060 * pseries-2.5
5061 */
5062
5063 static void spapr_machine_2_5_class_options(MachineClass *mc)
5064 {
5065 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
5066 static GlobalProperty compat[] = {
5067 { "spapr-vlan", "use-rx-buffer-pools", "off" },
5068 };
5069
5070 spapr_machine_2_6_class_options(mc);
5071 smc->use_ohci_by_default = true;
5072 compat_props_add(mc->compat_props, hw_compat_2_5, hw_compat_2_5_len);
5073 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
5074 }
5075
5076 DEFINE_SPAPR_MACHINE(2_5, "2.5", false);
5077
5078 /*
5079 * pseries-2.4
5080 */
5081
5082 static void spapr_machine_2_4_class_options(MachineClass *mc)
5083 {
5084 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
5085
5086 spapr_machine_2_5_class_options(mc);
5087 smc->dr_lmb_enabled = false;
5088 compat_props_add(mc->compat_props, hw_compat_2_4, hw_compat_2_4_len);
5089 }
5090
5091 DEFINE_SPAPR_MACHINE(2_4, "2.4", false);
5092
5093 /*
5094 * pseries-2.3
5095 */
5096
5097 static void spapr_machine_2_3_class_options(MachineClass *mc)
5098 {
5099 static GlobalProperty compat[] = {
5100 { "spapr-pci-host-bridge", "dynamic-reconfiguration", "off" },
5101 };
5102 spapr_machine_2_4_class_options(mc);
5103 compat_props_add(mc->compat_props, hw_compat_2_3, hw_compat_2_3_len);
5104 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
5105 }
5106 DEFINE_SPAPR_MACHINE(2_3, "2.3", false);
5107
5108 /*
5109 * pseries-2.2
5110 */
5111
5112 static void spapr_machine_2_2_class_options(MachineClass *mc)
5113 {
5114 static GlobalProperty compat[] = {
5115 { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0x20000000" },
5116 };
5117
5118 spapr_machine_2_3_class_options(mc);
5119 compat_props_add(mc->compat_props, hw_compat_2_2, hw_compat_2_2_len);
5120 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
5121 mc->default_machine_opts = "modern-hotplug-events=off,suppress-vmdesc=on";
5122 }
5123 DEFINE_SPAPR_MACHINE(2_2, "2.2", false);
5124
5125 /*
5126 * pseries-2.1
5127 */
5128
5129 static void spapr_machine_2_1_class_options(MachineClass *mc)
5130 {
5131 spapr_machine_2_2_class_options(mc);
5132 compat_props_add(mc->compat_props, hw_compat_2_1, hw_compat_2_1_len);
5133 }
5134 DEFINE_SPAPR_MACHINE(2_1, "2.1", false);
5135
5136 static void spapr_machine_register_types(void)
5137 {
5138 type_register_static(&spapr_machine_info);
5139 }
5140
5141 type_init(spapr_machine_register_types)