]>
Commit | Line | Data |
---|---|---|
1eee9950 DHB |
1 | /* |
2 | * QEMU PowerPC pSeries Logical Partition NUMA associativity handling | |
3 | * | |
4 | * Copyright IBM Corp. 2020 | |
5 | * | |
6 | * Authors: | |
7 | * Daniel Henrique Barboza <danielhb413@gmail.com> | |
8 | * | |
9 | * This work is licensed under the terms of the GNU GPL, version 2 or later. | |
10 | * See the COPYING file in the top-level directory. | |
11 | */ | |
12 | ||
13 | #include "qemu/osdep.h" | |
1eee9950 | 14 | #include "hw/ppc/spapr_numa.h" |
dd7e1d7a | 15 | #include "hw/pci-host/spapr.h" |
1eee9950 DHB |
16 | #include "hw/ppc/fdt.h" |
17 | ||
dd7e1d7a DHB |
18 | /* Moved from hw/ppc/spapr_pci_nvlink2.c */ |
19 | #define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) | |
f1aa45ff | 20 | |
3a6e4ce6 DHB |
21 | /* |
22 | * Retrieves max_dist_ref_points of the current NUMA affinity. | |
23 | */ | |
24 | static int get_max_dist_ref_points(SpaprMachineState *spapr) | |
25 | { | |
e0eb84d4 DHB |
26 | if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { |
27 | return FORM2_DIST_REF_POINTS; | |
28 | } | |
29 | ||
3a6e4ce6 DHB |
30 | return FORM1_DIST_REF_POINTS; |
31 | } | |
32 | ||
33 | /* | |
34 | * Retrieves numa_assoc_size of the current NUMA affinity. | |
35 | */ | |
36 | static int get_numa_assoc_size(SpaprMachineState *spapr) | |
37 | { | |
e0eb84d4 DHB |
38 | if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { |
39 | return FORM2_NUMA_ASSOC_SIZE; | |
40 | } | |
41 | ||
3a6e4ce6 DHB |
42 | return FORM1_NUMA_ASSOC_SIZE; |
43 | } | |
44 | ||
45 | /* | |
46 | * Retrieves vcpu_assoc_size of the current NUMA affinity. | |
47 | * | |
48 | * vcpu_assoc_size is the size of ibm,associativity array | |
49 | * for CPUs, which has an extra element (vcpu_id) in the end. | |
50 | */ | |
51 | static int get_vcpu_assoc_size(SpaprMachineState *spapr) | |
52 | { | |
53 | return get_numa_assoc_size(spapr) + 1; | |
54 | } | |
55 | ||
a165ac67 DHB |
56 | /* |
57 | * Retrieves the ibm,associativity array of NUMA node 'node_id' | |
58 | * for the current NUMA affinity. | |
59 | */ | |
60 | static const uint32_t *get_associativity(SpaprMachineState *spapr, int node_id) | |
61 | { | |
e0eb84d4 DHB |
62 | if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { |
63 | return spapr->FORM2_assoc_array[node_id]; | |
64 | } | |
a165ac67 DHB |
65 | return spapr->FORM1_assoc_array[node_id]; |
66 | } | |
67 | ||
1fde73bc DHB |
68 | /* |
69 | * Wrapper that returns node distance from ms->numa_state->nodes | |
70 | * after handling edge cases where the distance might be absent. | |
71 | */ | |
72 | static int get_numa_distance(MachineState *ms, int src, int dst) | |
73 | { | |
74 | NodeInfo *numa_info = ms->numa_state->nodes; | |
75 | int ret = numa_info[src].distance[dst]; | |
76 | ||
77 | if (ret != 0) { | |
78 | return ret; | |
79 | } | |
80 | ||
81 | /* | |
82 | * In case QEMU adds a default NUMA single node when the user | |
83 | * did not add any, or where the user did not supply distances, | |
84 | * the distance will be absent (zero). Return local/remote | |
85 | * distance in this case. | |
86 | */ | |
87 | if (src == dst) { | |
88 | return NUMA_DISTANCE_MIN; | |
89 | } | |
90 | ||
91 | return NUMA_DISTANCE_DEFAULT; | |
92 | } | |
93 | ||
ee6635b2 DHB |
94 | static bool spapr_numa_is_symmetrical(MachineState *ms) |
95 | { | |
ee6635b2 | 96 | int nb_numa_nodes = ms->numa_state->num_nodes; |
1fde73bc | 97 | int src, dst; |
ee6635b2 DHB |
98 | |
99 | for (src = 0; src < nb_numa_nodes; src++) { | |
100 | for (dst = src; dst < nb_numa_nodes; dst++) { | |
1fde73bc DHB |
101 | if (get_numa_distance(ms, src, dst) != |
102 | get_numa_distance(ms, dst, src)) { | |
ee6635b2 DHB |
103 | return false; |
104 | } | |
105 | } | |
106 | } | |
107 | ||
108 | return true; | |
109 | } | |
110 | ||
66407069 DHB |
111 | /* |
112 | * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node. | |
113 | * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is | |
114 | * called from vPHB reset handler so we initialize the counter here. | |
115 | * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM | |
116 | * must be equally distant from any other node. | |
117 | * The final value of spapr->gpu_numa_id is going to be written to | |
118 | * max-associativity-domains in spapr_build_fdt(). | |
119 | */ | |
120 | unsigned int spapr_numa_initial_nvgpu_numa_id(MachineState *machine) | |
121 | { | |
122 | return MAX(1, machine->numa_state->num_nodes); | |
123 | } | |
124 | ||
690fbe42 DHB |
/*
 * Translates a user supplied NUMA distance into the FORM1 NUMA level
 * used to build the associativity domains.
 *
 * The kernel only understands the distances 10 (local distance), 20,
 * 40, 80 and 160, so every user distance is rounded to one of them.
 * Current heuristic, with all ranges inclusive:
 *
 *   distance     rounded to    numa_level
 *   10           10 (local)    0x4 (no rounding for local distance)
 *   11 .. 30     20            0x3
 *   31 .. 60     40            0x2
 *   61 .. 120    80            0x1
 *
 * Everything above 120 returns numa_level = 0 to indicate that there
 * is no match. This will be calculated as distance = 160 by the kernel
 * (as of v5.9). Values below 10 match none of the ranges and return 0
 * as well.
 */
static uint8_t spapr_numa_get_numa_level(uint8_t distance)
{
    if (distance == 10) {
        return 0x4;
    }

    /*
     * The lower bounds are inclusive: 11, 31 and 61 belong to the
     * range they open, as documented above.
     */
    if (distance >= 11 && distance <= 30) {
        return 0x3;
    }

    if (distance >= 31 && distance <= 60) {
        return 0x2;
    }

    if (distance >= 61 && distance <= 120) {
        return 0x1;
    }

    return 0;
}
156 | ||
d98dbe2a | 157 | static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr) |
690fbe42 DHB |
158 | { |
159 | MachineState *ms = MACHINE(spapr); | |
690fbe42 | 160 | int nb_numa_nodes = ms->numa_state->num_nodes; |
afa3b3c9 DHB |
161 | int src, dst, i, j; |
162 | ||
163 | /* | |
164 | * Fill all associativity domains of non-zero NUMA nodes with | |
165 | * node_id. This is required because the default value (0) is | |
166 | * considered a match with associativity domains of node 0. | |
167 | */ | |
168 | for (i = 1; i < nb_numa_nodes; i++) { | |
3a6e4ce6 | 169 | for (j = 1; j < FORM1_DIST_REF_POINTS; j++) { |
a165ac67 | 170 | spapr->FORM1_assoc_array[i][j] = cpu_to_be32(i); |
afa3b3c9 DHB |
171 | } |
172 | } | |
690fbe42 DHB |
173 | |
174 | for (src = 0; src < nb_numa_nodes; src++) { | |
175 | for (dst = src; dst < nb_numa_nodes; dst++) { | |
176 | /* | |
177 | * This is how the associativity domain between A and B | |
178 | * is calculated: | |
179 | * | |
180 | * - get the distance D between them | |
181 | * - get the correspondent NUMA level 'n_level' for D | |
182 | * - all associativity arrays were initialized with their own | |
183 | * numa_ids, and we're calculating the distance in node_id | |
184 | * ascending order, starting from node id 0 (the first node | |
185 | * retrieved by numa_state). This will have a cascade effect in | |
186 | * the algorithm because the associativity domains that node 0 | |
187 | * defines will be carried over to other nodes, and node 1 | |
188 | * associativities will be carried over after taking node 0 | |
189 | * associativities into account, and so on. This happens because | |
190 | * we'll assign assoc_src as the associativity domain of dst | |
191 | * as well, for all NUMA levels beyond and including n_level. | |
192 | * | |
193 | * The PPC kernel expects the associativity domains of node 0 to | |
194 | * be always 0, and this algorithm will grant that by default. | |
195 | */ | |
1fde73bc | 196 | uint8_t distance = get_numa_distance(ms, src, dst); |
690fbe42 DHB |
197 | uint8_t n_level = spapr_numa_get_numa_level(distance); |
198 | uint32_t assoc_src; | |
199 | ||
200 | /* | |
201 | * n_level = 0 means that the distance is greater than our last | |
202 | * rounded value (120). In this case there is no NUMA level match | |
203 | * between src and dst and we can skip the remaining of the loop. | |
204 | * | |
205 | * The Linux kernel will assume that the distance between src and | |
206 | * dst, in this case of no match, is 10 (local distance) doubled | |
3a6e4ce6 | 207 | * for each NUMA it didn't match. We have FORM1_DIST_REF_POINTS |
690fbe42 DHB |
208 | * levels (4), so this gives us 10*2*2*2*2 = 160. |
209 | * | |
210 | * This logic can be seen in the Linux kernel source code, as of | |
211 | * v5.9, in arch/powerpc/mm/numa.c, function __node_distance(). | |
212 | */ | |
213 | if (n_level == 0) { | |
214 | continue; | |
215 | } | |
216 | ||
217 | /* | |
218 | * We must assign all assoc_src to dst, starting from n_level | |
219 | * and going up to 0x1. | |
220 | */ | |
221 | for (i = n_level; i > 0; i--) { | |
a165ac67 DHB |
222 | assoc_src = spapr->FORM1_assoc_array[src][i]; |
223 | spapr->FORM1_assoc_array[dst][i] = assoc_src; | |
690fbe42 DHB |
224 | } |
225 | } | |
226 | } | |
227 | ||
228 | } | |
229 | ||
5dab5abe DHB |
230 | static void spapr_numa_FORM1_affinity_check(MachineState *machine) |
231 | { | |
232 | int i; | |
233 | ||
234 | /* | |
235 | * Check we don't have a memory-less/cpu-less NUMA node | |
236 | * Firmware relies on the existing memory/cpu topology to provide the | |
237 | * NUMA topology to the kernel. | |
238 | * And the linux kernel needs to know the NUMA topology at start | |
239 | * to be able to hotplug CPUs later. | |
240 | */ | |
241 | if (machine->numa_state->num_nodes) { | |
242 | for (i = 0; i < machine->numa_state->num_nodes; ++i) { | |
243 | /* check for memory-less node */ | |
244 | if (machine->numa_state->nodes[i].node_mem == 0) { | |
245 | CPUState *cs; | |
246 | int found = 0; | |
247 | /* check for cpu-less node */ | |
248 | CPU_FOREACH(cs) { | |
249 | PowerPCCPU *cpu = POWERPC_CPU(cs); | |
250 | if (cpu->node_id == i) { | |
251 | found = 1; | |
252 | break; | |
253 | } | |
254 | } | |
255 | /* memory-less and cpu-less node */ | |
256 | if (!found) { | |
257 | error_report( | |
258 | "Memory-less/cpu-less nodes are not supported with FORM1 NUMA (node %d)", i); | |
259 | exit(EXIT_FAILURE); | |
260 | } | |
261 | } | |
262 | } | |
263 | } | |
264 | ||
265 | if (!spapr_numa_is_symmetrical(machine)) { | |
266 | error_report( | |
267 | "Asymmetrical NUMA topologies aren't supported in the pSeries machine using FORM1 NUMA"); | |
268 | exit(EXIT_FAILURE); | |
269 | } | |
270 | } | |
271 | ||
d98dbe2a DHB |
272 | /* |
273 | * Set NUMA machine state data based on FORM1 affinity semantics. | |
274 | */ | |
275 | static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, | |
276 | MachineState *machine) | |
f1aa45ff | 277 | { |
dd7e1d7a | 278 | SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); |
f1aa45ff | 279 | int nb_numa_nodes = machine->numa_state->num_nodes; |
dd7e1d7a | 280 | int i, j, max_nodes_with_gpus; |
f1aa45ff DHB |
281 | |
282 | /* | |
283 | * For all associativity arrays: first position is the size, | |
3a6e4ce6 | 284 | * position FORM1_DIST_REF_POINTS is always the numa_id, |
f1aa45ff DHB |
285 | * represented by the index 'i'. |
286 | * | |
287 | * This will break on sparse NUMA setups, when/if QEMU starts | |
288 | * to support it, because there will be no more guarantee that | |
289 | * 'i' will be a valid node_id set by the user. | |
290 | */ | |
291 | for (i = 0; i < nb_numa_nodes; i++) { | |
a165ac67 DHB |
292 | spapr->FORM1_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); |
293 | spapr->FORM1_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); | |
f1aa45ff | 294 | } |
dd7e1d7a DHB |
295 | |
296 | /* | |
297 | * Initialize NVLink GPU associativity arrays. We know that | |
298 | * the first GPU will take the first available NUMA id, and | |
299 | * we'll have a maximum of NVGPU_MAX_NUM GPUs in the machine. | |
300 | * At this point we're not sure if there are GPUs or not, but | |
301 | * let's initialize the associativity arrays and allow NVLink | |
302 | * GPUs to be handled like regular NUMA nodes later on. | |
303 | */ | |
304 | max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM; | |
305 | ||
306 | for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) { | |
a165ac67 | 307 | spapr->FORM1_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); |
dd7e1d7a | 308 | |
3a6e4ce6 | 309 | for (j = 1; j < FORM1_DIST_REF_POINTS; j++) { |
dd7e1d7a DHB |
310 | uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ? |
311 | SPAPR_GPU_NUMA_ID : cpu_to_be32(i); | |
a165ac67 | 312 | spapr->FORM1_assoc_array[i][j] = gpu_assoc; |
dd7e1d7a DHB |
313 | } |
314 | ||
a165ac67 | 315 | spapr->FORM1_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); |
dd7e1d7a | 316 | } |
ee6635b2 DHB |
317 | |
318 | /* | |
afa3b3c9 DHB |
319 | * Guests pseries-5.1 and older uses zeroed associativity domains, |
320 | * i.e. no domain definition based on NUMA distance input. | |
321 | * | |
322 | * Same thing with guests that have only one NUMA node. | |
ee6635b2 | 323 | */ |
afa3b3c9 DHB |
324 | if (smc->pre_5_2_numa_associativity || |
325 | machine->numa_state->num_nodes <= 1) { | |
ee6635b2 DHB |
326 | return; |
327 | } | |
328 | ||
d98dbe2a DHB |
329 | spapr_numa_define_FORM1_domains(spapr); |
330 | } | |
331 | ||
e0eb84d4 DHB |
332 | /* |
333 | * Init NUMA FORM2 machine state data | |
334 | */ | |
335 | static void spapr_numa_FORM2_affinity_init(SpaprMachineState *spapr) | |
336 | { | |
337 | int i; | |
338 | ||
339 | /* | |
340 | * For all resources but CPUs, FORM2 associativity arrays will | |
341 | * be a size 2 array with the following format: | |
342 | * | |
343 | * ibm,associativity = {1, numa_id} | |
344 | * | |
345 | * CPUs will write an additional 'vcpu_id' on top of the arrays | |
346 | * being initialized here. 'numa_id' is represented by the | |
347 | * index 'i' of the loop. | |
348 | * | |
349 | * Given that this initialization is also valid for GPU associativity | |
350 | * arrays, handle everything in one single step by populating the | |
351 | * arrays up to NUMA_NODES_MAX_NUM. | |
352 | */ | |
353 | for (i = 0; i < NUMA_NODES_MAX_NUM; i++) { | |
354 | spapr->FORM2_assoc_array[i][0] = cpu_to_be32(1); | |
355 | spapr->FORM2_assoc_array[i][1] = cpu_to_be32(i); | |
356 | } | |
357 | } | |
358 | ||
d98dbe2a DHB |
359 | void spapr_numa_associativity_init(SpaprMachineState *spapr, |
360 | MachineState *machine) | |
361 | { | |
362 | spapr_numa_FORM1_affinity_init(spapr, machine); | |
e0eb84d4 | 363 | spapr_numa_FORM2_affinity_init(spapr); |
f1aa45ff DHB |
364 | } |
365 | ||
5dab5abe DHB |
366 | void spapr_numa_associativity_check(SpaprMachineState *spapr) |
367 | { | |
e0eb84d4 DHB |
368 | /* |
369 | * FORM2 does not have any restrictions we need to handle | |
370 | * at CAS time, for now. | |
371 | */ | |
372 | if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { | |
373 | return; | |
374 | } | |
375 | ||
5dab5abe DHB |
376 | spapr_numa_FORM1_affinity_check(MACHINE(spapr)); |
377 | } | |
378 | ||
f1aa45ff DHB |
379 | void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, |
380 | int offset, int nodeid) | |
381 | { | |
a165ac67 DHB |
382 | const uint32_t *associativity = get_associativity(spapr, nodeid); |
383 | ||
f1aa45ff | 384 | _FDT((fdt_setprop(fdt, offset, "ibm,associativity", |
a165ac67 | 385 | associativity, |
3a6e4ce6 | 386 | get_numa_assoc_size(spapr) * sizeof(uint32_t)))); |
8f86a408 DHB |
387 | } |
388 | ||
d370f9cf DHB |
389 | static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr, |
390 | PowerPCCPU *cpu) | |
8f86a408 | 391 | { |
a165ac67 | 392 | const uint32_t *associativity = get_associativity(spapr, cpu->node_id); |
3a6e4ce6 DHB |
393 | int max_distance_ref_points = get_max_dist_ref_points(spapr); |
394 | int vcpu_assoc_size = get_vcpu_assoc_size(spapr); | |
395 | uint32_t *vcpu_assoc = g_new(uint32_t, vcpu_assoc_size); | |
8f86a408 | 396 | int index = spapr_get_vcpu_id(cpu); |
8f86a408 DHB |
397 | |
398 | /* | |
399 | * VCPUs have an extra 'cpu_id' value in ibm,associativity | |
400 | * compared to other resources. Increment the size at index | |
d370f9cf DHB |
401 | * 0, put cpu_id last, then copy the remaining associativity |
402 | * domains. | |
8f86a408 | 403 | */ |
3a6e4ce6 DHB |
404 | vcpu_assoc[0] = cpu_to_be32(max_distance_ref_points + 1); |
405 | vcpu_assoc[vcpu_assoc_size - 1] = cpu_to_be32(index); | |
a165ac67 | 406 | memcpy(vcpu_assoc + 1, associativity + 1, |
3a6e4ce6 | 407 | (vcpu_assoc_size - 2) * sizeof(uint32_t)); |
8f86a408 | 408 | |
d370f9cf DHB |
409 | return vcpu_assoc; |
410 | } | |
411 | ||
412 | int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, | |
413 | int offset, PowerPCCPU *cpu) | |
414 | { | |
415 | g_autofree uint32_t *vcpu_assoc = NULL; | |
3a6e4ce6 | 416 | int vcpu_assoc_size = get_vcpu_assoc_size(spapr); |
8f86a408 | 417 | |
d370f9cf | 418 | vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, cpu); |
8f86a408 DHB |
419 | |
420 | /* Advertise NUMA via ibm,associativity */ | |
d370f9cf | 421 | return fdt_setprop(fdt, offset, "ibm,associativity", vcpu_assoc, |
3a6e4ce6 | 422 | vcpu_assoc_size * sizeof(uint32_t)); |
f1aa45ff DHB |
423 | } |
424 | ||
0ee52012 DHB |
425 | |
426 | int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, | |
427 | int offset) | |
428 | { | |
429 | MachineState *machine = MACHINE(spapr); | |
3a6e4ce6 | 430 | int max_distance_ref_points = get_max_dist_ref_points(spapr); |
0ee52012 DHB |
431 | int nb_numa_nodes = machine->numa_state->num_nodes; |
432 | int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1; | |
16282937 DHB |
433 | g_autofree uint32_t *int_buf = NULL; |
434 | uint32_t *cur_index; | |
435 | int i; | |
0ee52012 DHB |
436 | |
437 | /* ibm,associativity-lookup-arrays */ | |
b21e2380 | 438 | int_buf = g_new0(uint32_t, nr_nodes * max_distance_ref_points + 2); |
16282937 | 439 | cur_index = int_buf; |
0ee52012 DHB |
440 | int_buf[0] = cpu_to_be32(nr_nodes); |
441 | /* Number of entries per associativity list */ | |
3a6e4ce6 | 442 | int_buf[1] = cpu_to_be32(max_distance_ref_points); |
0ee52012 DHB |
443 | cur_index += 2; |
444 | for (i = 0; i < nr_nodes; i++) { | |
445 | /* | |
a165ac67 DHB |
446 | * For the lookup-array we use the ibm,associativity array of the |
447 | * current NUMA affinity, without the first element (size). | |
0ee52012 | 448 | */ |
a165ac67 | 449 | const uint32_t *associativity = get_associativity(spapr, i); |
0ee52012 | 450 | memcpy(cur_index, ++associativity, |
3a6e4ce6 DHB |
451 | sizeof(uint32_t) * max_distance_ref_points); |
452 | cur_index += max_distance_ref_points; | |
0ee52012 | 453 | } |
0ee52012 | 454 | |
16282937 DHB |
455 | return fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", |
456 | int_buf, (cur_index - int_buf) * sizeof(uint32_t)); | |
0ee52012 DHB |
457 | } |
458 | ||
d98dbe2a DHB |
459 | static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr, |
460 | void *fdt, int rtas) | |
1eee9950 | 461 | { |
491e884e | 462 | MachineState *ms = MACHINE(spapr); |
1eee9950 | 463 | SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); |
b01fec36 DHB |
464 | uint32_t number_nvgpus_nodes = spapr->gpu_numa_id - |
465 | spapr_numa_initial_nvgpu_numa_id(ms); | |
1eee9950 DHB |
466 | uint32_t refpoints[] = { |
467 | cpu_to_be32(0x4), | |
491e884e | 468 | cpu_to_be32(0x3), |
1eee9950 | 469 | cpu_to_be32(0x2), |
491e884e | 470 | cpu_to_be32(0x1), |
1eee9950 DHB |
471 | }; |
472 | uint32_t nr_refpoints = ARRAY_SIZE(refpoints); | |
b01fec36 | 473 | uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes; |
1eee9950 DHB |
474 | uint32_t maxdomains[] = { |
475 | cpu_to_be32(4), | |
491e884e DHB |
476 | cpu_to_be32(maxdomain), |
477 | cpu_to_be32(maxdomain), | |
478 | cpu_to_be32(maxdomain), | |
479 | cpu_to_be32(maxdomain) | |
1eee9950 DHB |
480 | }; |
481 | ||
afa3b3c9 DHB |
482 | if (smc->pre_5_2_numa_associativity || |
483 | ms->numa_state->num_nodes <= 1) { | |
491e884e DHB |
484 | uint32_t legacy_refpoints[] = { |
485 | cpu_to_be32(0x4), | |
486 | cpu_to_be32(0x4), | |
487 | cpu_to_be32(0x2), | |
488 | }; | |
489 | uint32_t legacy_maxdomain = spapr->gpu_numa_id > 1 ? 1 : 0; | |
490 | uint32_t legacy_maxdomains[] = { | |
491 | cpu_to_be32(4), | |
492 | cpu_to_be32(legacy_maxdomain), | |
493 | cpu_to_be32(legacy_maxdomain), | |
494 | cpu_to_be32(legacy_maxdomain), | |
495 | cpu_to_be32(spapr->gpu_numa_id), | |
496 | }; | |
497 | ||
498 | G_STATIC_ASSERT(sizeof(legacy_refpoints) <= sizeof(refpoints)); | |
499 | G_STATIC_ASSERT(sizeof(legacy_maxdomains) <= sizeof(maxdomains)); | |
500 | ||
501 | nr_refpoints = 3; | |
502 | ||
503 | memcpy(refpoints, legacy_refpoints, sizeof(legacy_refpoints)); | |
504 | memcpy(maxdomains, legacy_maxdomains, sizeof(legacy_maxdomains)); | |
505 | ||
506 | /* pseries-5.0 and older reference-points array is {0x4, 0x4} */ | |
507 | if (smc->pre_5_1_assoc_refpoints) { | |
508 | nr_refpoints = 2; | |
509 | } | |
1eee9950 DHB |
510 | } |
511 | ||
512 | _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", | |
513 | refpoints, nr_refpoints * sizeof(refpoints[0]))); | |
514 | ||
515 | _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", | |
516 | maxdomains, sizeof(maxdomains))); | |
517 | } | |
f8a13fc3 | 518 | |
e0eb84d4 DHB |
519 | static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr, |
520 | void *fdt, int rtas) | |
521 | { | |
522 | MachineState *ms = MACHINE(spapr); | |
e0eb84d4 DHB |
523 | int nb_numa_nodes = ms->numa_state->num_nodes; |
524 | int distance_table_entries = nb_numa_nodes * nb_numa_nodes; | |
525 | g_autofree uint32_t *lookup_index_table = NULL; | |
28d86252 | 526 | g_autofree uint8_t *distance_table = NULL; |
e0eb84d4 | 527 | int src, dst, i, distance_table_size; |
e0eb84d4 DHB |
528 | |
529 | /* | |
530 | * ibm,numa-lookup-index-table: array with length and a | |
531 | * list of NUMA ids present in the guest. | |
532 | */ | |
533 | lookup_index_table = g_new0(uint32_t, nb_numa_nodes + 1); | |
534 | lookup_index_table[0] = cpu_to_be32(nb_numa_nodes); | |
535 | ||
536 | for (i = 0; i < nb_numa_nodes; i++) { | |
537 | lookup_index_table[i + 1] = cpu_to_be32(i); | |
538 | } | |
539 | ||
540 | _FDT(fdt_setprop(fdt, rtas, "ibm,numa-lookup-index-table", | |
541 | lookup_index_table, | |
542 | (nb_numa_nodes + 1) * sizeof(uint32_t))); | |
543 | ||
544 | /* | |
545 | * ibm,numa-distance-table: contains all node distances. First | |
546 | * element is the size of the table as uint32, followed up | |
547 | * by all the uint8 distances from the first NUMA node, then all | |
548 | * distances from the second NUMA node and so on. | |
549 | * | |
550 | * ibm,numa-lookup-index-table is used by guest to navigate this | |
551 | * array because NUMA ids can be sparse (node 0 is the first, | |
552 | * node 8 is the second ...). | |
553 | */ | |
28d86252 DHB |
554 | distance_table_size = distance_table_entries * sizeof(uint8_t) + |
555 | sizeof(uint32_t); | |
556 | distance_table = g_new0(uint8_t, distance_table_size); | |
557 | stl_be_p(distance_table, distance_table_entries); | |
e0eb84d4 | 558 | |
28d86252 DHB |
559 | /* Skip the uint32_t array length at the start */ |
560 | i = sizeof(uint32_t); | |
e0eb84d4 DHB |
561 | |
562 | for (src = 0; src < nb_numa_nodes; src++) { | |
563 | for (dst = 0; dst < nb_numa_nodes; dst++) { | |
1fde73bc | 564 | distance_table[i++] = get_numa_distance(ms, src, dst); |
e0eb84d4 DHB |
565 | } |
566 | } | |
567 | ||
e0eb84d4 DHB |
568 | _FDT(fdt_setprop(fdt, rtas, "ibm,numa-distance-table", |
569 | distance_table, distance_table_size)); | |
570 | } | |
571 | ||
572 | /* | |
573 | * This helper could be compressed in a single function with | |
574 | * FORM1 logic since we're setting the same DT values, with the | |
575 | * difference being a call to spapr_numa_FORM2_write_rtas_tables() | |
576 | * in the end. The separation was made to avoid clogging FORM1 code | |
577 | * which already has to deal with compat modes from previous | |
578 | * QEMU machine types. | |
579 | */ | |
580 | static void spapr_numa_FORM2_write_rtas_dt(SpaprMachineState *spapr, | |
581 | void *fdt, int rtas) | |
582 | { | |
583 | MachineState *ms = MACHINE(spapr); | |
584 | uint32_t number_nvgpus_nodes = spapr->gpu_numa_id - | |
585 | spapr_numa_initial_nvgpu_numa_id(ms); | |
586 | ||
587 | /* | |
588 | * In FORM2, ibm,associativity-reference-points will point to | |
589 | * the element in the ibm,associativity array that contains the | |
590 | * primary domain index (for FORM2, the first element). | |
591 | * | |
592 | * This value (in our case, the numa-id) is then used as an index | |
593 | * to retrieve all other attributes of the node (distance, | |
594 | * bandwidth, latency) via ibm,numa-lookup-index-table and other | |
595 | * ibm,numa-*-table properties. | |
596 | */ | |
597 | uint32_t refpoints[] = { cpu_to_be32(1) }; | |
598 | ||
599 | uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes; | |
600 | uint32_t maxdomains[] = { cpu_to_be32(1), cpu_to_be32(maxdomain) }; | |
601 | ||
602 | _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", | |
603 | refpoints, sizeof(refpoints))); | |
604 | ||
605 | _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", | |
606 | maxdomains, sizeof(maxdomains))); | |
607 | ||
608 | spapr_numa_FORM2_write_rtas_tables(spapr, fdt, rtas); | |
609 | } | |
610 | ||
d98dbe2a DHB |
611 | /* |
612 | * Helper that writes ibm,associativity-reference-points and | |
613 | * max-associativity-domains in the RTAS pointed by @rtas | |
614 | * in the DT @fdt. | |
615 | */ | |
616 | void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) | |
617 | { | |
e0eb84d4 DHB |
618 | if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { |
619 | spapr_numa_FORM2_write_rtas_dt(spapr, fdt, rtas); | |
620 | return; | |
621 | } | |
622 | ||
d98dbe2a DHB |
623 | spapr_numa_FORM1_write_rtas_dt(spapr, fdt, rtas); |
624 | } | |
625 | ||
f8a13fc3 DHB |
626 | static target_ulong h_home_node_associativity(PowerPCCPU *cpu, |
627 | SpaprMachineState *spapr, | |
628 | target_ulong opcode, | |
629 | target_ulong *args) | |
630 | { | |
876ab8d8 | 631 | g_autofree uint32_t *vcpu_assoc = NULL; |
f8a13fc3 DHB |
632 | target_ulong flags = args[0]; |
633 | target_ulong procno = args[1]; | |
634 | PowerPCCPU *tcpu; | |
876ab8d8 | 635 | int idx, assoc_idx; |
3a6e4ce6 | 636 | int vcpu_assoc_size = get_vcpu_assoc_size(spapr); |
f8a13fc3 DHB |
637 | |
638 | /* only support procno from H_REGISTER_VPA */ | |
639 | if (flags != 0x1) { | |
640 | return H_FUNCTION; | |
641 | } | |
642 | ||
643 | tcpu = spapr_find_cpu(procno); | |
644 | if (tcpu == NULL) { | |
645 | return H_P2; | |
646 | } | |
647 | ||
876ab8d8 DHB |
648 | /* |
649 | * Given that we want to be flexible with the sizes and indexes, | |
650 | * we must consider that there is a hard limit of how many | |
651 | * associativities domain we can fit in R4 up to R9, which would be | |
652 | * 12 associativity domains for vcpus. Assert and bail if that's | |
653 | * not the case. | |
654 | */ | |
3a6e4ce6 | 655 | g_assert((vcpu_assoc_size - 1) <= 12); |
876ab8d8 DHB |
656 | |
657 | vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, tcpu); | |
658 | /* assoc_idx starts at 1 to skip associativity size */ | |
659 | assoc_idx = 1; | |
f8a13fc3 | 660 | |
f8a13fc3 DHB |
661 | #define ASSOCIATIVITY(a, b) (((uint64_t)(a) << 32) | \ |
662 | ((uint64_t)(b) & 0xffffffff)) | |
876ab8d8 DHB |
663 | |
664 | for (idx = 0; idx < 6; idx++) { | |
665 | int32_t a, b; | |
666 | ||
667 | /* | |
668 | * vcpu_assoc[] will contain the associativity domains for tcpu, | |
669 | * including tcpu->node_id and procno, meaning that we don't | |
670 | * need to use these variables here. | |
671 | * | |
672 | * We'll read 2 values at a time to fill up the ASSOCIATIVITY() | |
673 | * macro. The ternary will fill the remaining registers with -1 | |
674 | * after we went through vcpu_assoc[]. | |
675 | */ | |
3a6e4ce6 | 676 | a = assoc_idx < vcpu_assoc_size ? |
876ab8d8 | 677 | be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; |
3a6e4ce6 | 678 | b = assoc_idx < vcpu_assoc_size ? |
876ab8d8 DHB |
679 | be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; |
680 | ||
681 | args[idx] = ASSOCIATIVITY(a, b); | |
f8a13fc3 DHB |
682 | } |
683 | #undef ASSOCIATIVITY | |
684 | ||
685 | return H_SUCCESS; | |
686 | } | |
687 | ||
688 | static void spapr_numa_register_types(void) | |
689 | { | |
690 | /* Virtual Processor Home Node */ | |
691 | spapr_register_hypercall(H_HOME_NODE_ASSOCIATIVITY, | |
692 | h_home_node_associativity); | |
693 | } | |
694 | ||
695 | type_init(spapr_numa_register_types) |