/*
 * QEMU PowerPC pSeries Logical Partition NUMA associativity handling
 *
 * Copyright IBM Corp. 2020
 *
 * Authors:
 *  Daniel Henrique Barboza <danielhb413@gmail.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "hw/ppc/spapr_numa.h"
#include "hw/pci-host/spapr.h"
#include "hw/ppc/fdt.h"

/* Moved from hw/ppc/spapr_pci_nvlink2.c */
#define SPAPR_GPU_NUMA_ID (cpu_to_be32(1))

static bool spapr_numa_is_symmetrical(MachineState *ms)
{
    int src, dst;
    int nb_numa_nodes = ms->numa_state->num_nodes;
    NodeInfo *numa_info = ms->numa_state->nodes;

    for (src = 0; src < nb_numa_nodes; src++) {
        for (dst = src; dst < nb_numa_nodes; dst++) {
            if (numa_info[src].distance[dst] !=
                numa_info[dst].distance[src]) {
                return false;
            }
        }
    }

    return true;
}

/*
 * This function will translate the user distances into
 * what the kernel understands as possible values: 10
 * (local distance), 20, 40, 80 and 160, and return the equivalent
 * NUMA level for each. Current heuristic is:
 *  - local distance (10) returns numa_level = 0x4, meaning there is
 *    no rounding for local distance
 *  - distances between 11 and 30 inclusive -> rounded to 20,
 *    numa_level = 0x3
 *  - distances between 31 and 60 inclusive -> rounded to 40,
 *    numa_level = 0x2
 *  - distances between 61 and 120 inclusive -> rounded to 80,
 *    numa_level = 0x1
 *  - everything above 120 returns numa_level = 0 to indicate that
 *    there is no match. This will be calculated as distance = 160
 *    by the kernel (as of v5.9)
 */
static uint8_t spapr_numa_get_numa_level(uint8_t distance)
{
    if (distance == 10) {
        return 0x4;
    } else if (distance >= 11 && distance <= 30) {
        return 0x3;
    } else if (distance >= 31 && distance <= 60) {
        return 0x2;
    } else if (distance >= 61 && distance <= 120) {
        return 0x1;
    }

    return 0;
}
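
/*
 * Worked example (illustrative, not part of the original code): a
 * user-provided distance of 25 is rounded to 20 and maps to numa_level
 * 0x3, 45 is rounded to 40 (0x2), 100 to 80 (0x1), and anything above
 * 120 (say, 200) yields 0, which the guest kernel will resolve to a
 * distance of 160.
 */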

static void spapr_numa_define_associativity_domains(SpaprMachineState *spapr)
{
    MachineState *ms = MACHINE(spapr);
    NodeInfo *numa_info = ms->numa_state->nodes;
    int nb_numa_nodes = ms->numa_state->num_nodes;
    int src, dst, i;

    for (src = 0; src < nb_numa_nodes; src++) {
        for (dst = src; dst < nb_numa_nodes; dst++) {
            /*
             * This is how the associativity domain between A and B
             * is calculated:
             *
             * - get the distance D between them
             * - get the corresponding NUMA level 'n_level' for D
             * - all associativity arrays were initialized with their own
             * numa_ids, and we're calculating the distance in node_id
             * ascending order, starting from node id 0 (the first node
             * retrieved by numa_state). This will have a cascade effect in
             * the algorithm because the associativity domains that node 0
             * defines will be carried over to other nodes, and node 1
             * associativities will be carried over after taking node 0
             * associativities into account, and so on. This happens because
             * we'll assign assoc_src as the associativity domain of dst
             * as well, for all NUMA levels beyond and including n_level.
             *
             * The PPC kernel expects the associativity domains of node 0 to
             * always be 0, and this algorithm will guarantee that by default.
             */
            uint8_t distance = numa_info[src].distance[dst];
            uint8_t n_level = spapr_numa_get_numa_level(distance);
            uint32_t assoc_src;

            /*
             * n_level = 0 means that the distance is greater than our last
             * rounded value (120). In this case there is no NUMA level match
             * between src and dst, and we can skip the rest of this loop
             * iteration.
             *
             * The Linux kernel will assume that the distance between src and
             * dst, in this case of no match, is 10 (local distance) doubled
             * for each NUMA level it didn't match. We have
             * MAX_DISTANCE_REF_POINTS levels (4), so this gives us
             * 10*2*2*2*2 = 160.
             *
             * This logic can be seen in the Linux kernel source code, as of
             * v5.9, in arch/powerpc/mm/numa.c, function __node_distance().
             */
            if (n_level == 0) {
                continue;
            }

            /*
             * We must assign all assoc_src to dst, starting from n_level
             * and going up to 0x1.
             */
            for (i = n_level; i > 0; i--) {
                assoc_src = spapr->numa_assoc_array[src][i];
                spapr->numa_assoc_array[dst][i] = assoc_src;
            }
        }
    }
}
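
/*
 * Worked example (illustrative, not part of the original code): take two
 * user nodes with a symmetrical distance of 40 between node 0 and node 1.
 * spapr_numa_get_numa_level(40) returns 0x2, so the loop above copies
 * node 0's domains into positions 2 and 1 of node 1's array, while
 * positions 3 and 4 keep node 1's own id. The guest kernel then matches
 * the two nodes only at the third reference point and computes the
 * distance back as 10 * 2 * 2 = 40.
 */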

void spapr_numa_associativity_init(SpaprMachineState *spapr,
                                   MachineState *machine)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
    int nb_numa_nodes = machine->numa_state->num_nodes;
    int i, j, max_nodes_with_gpus;
    bool using_legacy_numa = spapr_machine_using_legacy_numa(spapr);

    /*
     * For all associativity arrays: first position is the size,
     * position MAX_DISTANCE_REF_POINTS is always the numa_id,
     * represented by the index 'i'.
     *
     * This will break on sparse NUMA setups, when/if QEMU starts
     * to support them, because there will no longer be a guarantee
     * that 'i' will be a valid node_id set by the user.
     */
    for (i = 0; i < nb_numa_nodes; i++) {
        spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
        spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);

        /*
         * Fill all associativity domains of non-zero NUMA nodes with
         * node_id. This is required because the default value (0) is
         * considered a match with associativity domains of node 0.
         */
        if (!using_legacy_numa && i != 0) {
            for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) {
                spapr->numa_assoc_array[i][j] = cpu_to_be32(i);
            }
        }
    }
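
    /*
     * Illustrative note (not part of the original code): assuming two user
     * NUMA nodes on a non-legacy machine, and MAX_DISTANCE_REF_POINTS = 4,
     * the arrays at this point look like (host byte order):
     *     node 0: { 4, 0, 0, 0, 0 }    size, three domains, numa_id
     *     node 1: { 4, 1, 1, 1, 1 }
     * The stored values are actually big-endian (cpu_to_be32).
     */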

    /*
     * Initialize NVLink GPU associativity arrays. We know that
     * the first GPU will take the first available NUMA id, and
     * we'll have a maximum of NVGPU_MAX_NUM GPUs in the machine.
     * At this point we're not sure if there are GPUs or not, but
     * let's initialize the associativity arrays and allow NVLink
     * GPUs to be handled like regular NUMA nodes later on.
     */
    max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM;

    for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) {
        spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);

        for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) {
            uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ?
                                 SPAPR_GPU_NUMA_ID : cpu_to_be32(i);
            spapr->numa_assoc_array[i][j] = gpu_assoc;
        }

        spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
    }

    /*
     * Legacy NUMA guests (pseries-5.1 and older, or guests with only
     * 1 NUMA node) will not benefit from anything we're going to do
     * after this point.
     */
    if (using_legacy_numa) {
        return;
    }

    if (!spapr_numa_is_symmetrical(machine)) {
        error_report("Asymmetrical NUMA topologies aren't supported "
                     "in the pSeries machine");
        exit(EXIT_FAILURE);
    }

    spapr_numa_define_associativity_domains(spapr);
}

void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,
                                       int offset, int nodeid)
{
    _FDT((fdt_setprop(fdt, offset, "ibm,associativity",
                      spapr->numa_assoc_array[nodeid],
                      sizeof(spapr->numa_assoc_array[nodeid]))));
}

static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr,
                                           PowerPCCPU *cpu)
{
    uint32_t *vcpu_assoc = g_new(uint32_t, VCPU_ASSOC_SIZE);
    int index = spapr_get_vcpu_id(cpu);

    /*
     * VCPUs have an extra 'cpu_id' value in ibm,associativity
     * compared to other resources. Increment the size at index
     * 0, put cpu_id last, then copy the remaining associativity
     * domains.
     */
    vcpu_assoc[0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS + 1);
    vcpu_assoc[VCPU_ASSOC_SIZE - 1] = cpu_to_be32(index);
    memcpy(vcpu_assoc + 1, spapr->numa_assoc_array[cpu->node_id] + 1,
           (VCPU_ASSOC_SIZE - 2) * sizeof(uint32_t));

    return vcpu_assoc;
}
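
/*
 * Illustrative note (not part of the original code): for a vCPU with
 * cpu_id 2 sitting on NUMA node 1, the returned array is, in host byte
 * order, { 5, d1, d2, d3, node_id, cpu_id }: the incremented length
 * (MAX_DISTANCE_REF_POINTS + 1), node 1's associativity domains copied
 * from numa_assoc_array[1], and the vcpu id appended last. With
 * MAX_DISTANCE_REF_POINTS = 4 this makes VCPU_ASSOC_SIZE = 6 entries.
 */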

int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt,
                            int offset, PowerPCCPU *cpu)
{
    g_autofree uint32_t *vcpu_assoc = NULL;

    vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, cpu);

    /* Advertise NUMA via ibm,associativity */
    return fdt_setprop(fdt, offset, "ibm,associativity", vcpu_assoc,
                       VCPU_ASSOC_SIZE * sizeof(uint32_t));
}


int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt,
                                         int offset)
{
    MachineState *machine = MACHINE(spapr);
    int nb_numa_nodes = machine->numa_state->num_nodes;
    int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
    uint32_t *int_buf, *cur_index, buf_len;
    int ret, i;

    /* ibm,associativity-lookup-arrays */
    buf_len = (nr_nodes * MAX_DISTANCE_REF_POINTS + 2) * sizeof(uint32_t);
    cur_index = int_buf = g_malloc0(buf_len);
    int_buf[0] = cpu_to_be32(nr_nodes);
    /* Number of entries per associativity list */
    int_buf[1] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
    cur_index += 2;
    for (i = 0; i < nr_nodes; i++) {
        /*
         * For the lookup-array we use the ibm,associativity array,
         * from numa_assoc_array, without the first element (size).
         */
        uint32_t *associativity = spapr->numa_assoc_array[i];
        memcpy(cur_index, ++associativity,
               sizeof(uint32_t) * MAX_DISTANCE_REF_POINTS);
        cur_index += MAX_DISTANCE_REF_POINTS;
    }
    ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf,
                      (cur_index - int_buf) * sizeof(uint32_t));
    g_free(int_buf);

    return ret;
}
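
/*
 * Illustrative note (not part of the original code): with two user NUMA
 * nodes, the property payload would be, in host byte order,
 * { 2, 4, <node 0 domains>, <node 1 domains> }: the number of
 * associativity lists, the number of entries per list
 * (MAX_DISTANCE_REF_POINTS), and then each node's four domains without
 * the leading size element.
 */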

/*
 * Helper that writes ibm,associativity-reference-points and
 * max-associativity-domains in the RTAS node pointed to by @rtas
 * in the DT @fdt.
 */
void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas)
{
    MachineState *ms = MACHINE(spapr);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
    uint32_t refpoints[] = {
        cpu_to_be32(0x4),
        cpu_to_be32(0x3),
        cpu_to_be32(0x2),
        cpu_to_be32(0x1),
    };
    uint32_t nr_refpoints = ARRAY_SIZE(refpoints);
    uint32_t maxdomain = ms->numa_state->num_nodes + spapr->gpu_numa_id;
    uint32_t maxdomains[] = {
        cpu_to_be32(4),
        cpu_to_be32(maxdomain),
        cpu_to_be32(maxdomain),
        cpu_to_be32(maxdomain),
        cpu_to_be32(maxdomain)
    };

    if (spapr_machine_using_legacy_numa(spapr)) {
        uint32_t legacy_refpoints[] = {
            cpu_to_be32(0x4),
            cpu_to_be32(0x4),
            cpu_to_be32(0x2),
        };
        uint32_t legacy_maxdomain = spapr->gpu_numa_id > 1 ? 1 : 0;
        uint32_t legacy_maxdomains[] = {
            cpu_to_be32(4),
            cpu_to_be32(legacy_maxdomain),
            cpu_to_be32(legacy_maxdomain),
            cpu_to_be32(legacy_maxdomain),
            cpu_to_be32(spapr->gpu_numa_id),
        };

        G_STATIC_ASSERT(sizeof(legacy_refpoints) <= sizeof(refpoints));
        G_STATIC_ASSERT(sizeof(legacy_maxdomains) <= sizeof(maxdomains));

        nr_refpoints = 3;

        memcpy(refpoints, legacy_refpoints, sizeof(legacy_refpoints));
        memcpy(maxdomains, legacy_maxdomains, sizeof(legacy_maxdomains));

        /* pseries-5.0 and older reference-points array is {0x4, 0x4} */
        if (smc->pre_5_1_assoc_refpoints) {
            nr_refpoints = 2;
        }
    }

    _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
                     refpoints, nr_refpoints * sizeof(refpoints[0])));

    _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
                     maxdomains, sizeof(maxdomains)));
}

static target_ulong h_home_node_associativity(PowerPCCPU *cpu,
                                              SpaprMachineState *spapr,
                                              target_ulong opcode,
                                              target_ulong *args)
{
    g_autofree uint32_t *vcpu_assoc = NULL;
    target_ulong flags = args[0];
    target_ulong procno = args[1];
    PowerPCCPU *tcpu;
    int idx, assoc_idx;

    /* only support procno from H_REGISTER_VPA */
    if (flags != 0x1) {
        return H_FUNCTION;
    }

    tcpu = spapr_find_cpu(procno);
    if (tcpu == NULL) {
        return H_P2;
    }

    /*
     * Given that we want to be flexible with the sizes and indexes,
     * we must consider that there is a hard limit on how many
     * associativity domains we can fit in R4 up to R9, which would be
     * 12 associativity domains for vcpus. Assert and bail if that's
     * not the case.
     */
    G_STATIC_ASSERT((VCPU_ASSOC_SIZE - 1) <= 12);

    vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, tcpu);
    /* assoc_idx starts at 1 to skip the associativity size */
    assoc_idx = 1;

#define ASSOCIATIVITY(a, b) (((uint64_t)(a) << 32) | \
                             ((uint64_t)(b) & 0xffffffff))

    for (idx = 0; idx < 6; idx++) {
        int32_t a, b;

        /*
         * vcpu_assoc[] will contain the associativity domains for tcpu,
         * including tcpu->node_id and procno, meaning that we don't
         * need to use these variables here.
         *
         * We'll read 2 values at a time to fill up the ASSOCIATIVITY()
         * macro. The ternary will fill the remaining registers with -1
         * once we have gone through vcpu_assoc[].
         */
        a = assoc_idx < VCPU_ASSOC_SIZE ?
            be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1;
        b = assoc_idx < VCPU_ASSOC_SIZE ?
            be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1;

        args[idx] = ASSOCIATIVITY(a, b);
    }
#undef ASSOCIATIVITY
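
    /*
     * Illustrative note (not part of the original code): with
     * VCPU_ASSOC_SIZE = 6, the five entries after the size cell are
     * packed two per return register, so R4 = (d1, d2), R5 = (d3,
     * node_id), R6 = (cpu_id, -1), and R7 through R9 hold (-1, -1).
     */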

    return H_SUCCESS;
}

static void spapr_numa_register_types(void)
{
    /* Virtual Processor Home Node */
    spapr_register_hypercall(H_HOME_NODE_ASSOCIATIVITY,
                             h_home_node_associativity);
}

type_init(spapr_numa_register_types)