/*
 * Routines common to user and system emulation of load/store.
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "host/load-extract-al16-al8.h"
#include "host/store-insert-al16.h"

#ifdef CONFIG_ATOMIC64
# define HAVE_al8          true
#else
# define HAVE_al8          false
#endif
#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)

/**
 * required_atomicity:
 *
 * Return the lg2 bytes of atomicity required by @memop for @p.
 * If the operation must be split into two operations to be
 * examined separately for atomicity, return -lg2.
 */
static int required_atomicity(CPUState *cpu, uintptr_t p, MemOp memop)
{
    MemOp atom = memop & MO_ATOM_MASK;
    MemOp size = memop & MO_SIZE;
    MemOp half = size ? size - 1 : 0;
    unsigned tmp;
    int atmax;

    switch (atom) {
    case MO_ATOM_NONE:
        atmax = MO_8;
        break;

    case MO_ATOM_IFALIGN_PAIR:
        size = half;
        /* fall through */

    case MO_ATOM_IFALIGN:
        tmp = (1 << size) - 1;
        atmax = p & tmp ? MO_8 : size;
        break;

    case MO_ATOM_WITHIN16:
        tmp = p & 15;
        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
        break;

    case MO_ATOM_WITHIN16_PAIR:
        tmp = p & 15;
        if (tmp + (1 << size) <= 16) {
            atmax = size;
        } else if (tmp + (1 << half) == 16) {
            /*
             * The pair exactly straddles the boundary.
             * Both halves are naturally aligned and atomic.
             */
            atmax = half;
        } else {
            /*
             * One of the pair crosses the boundary, and is non-atomic.
             * The other of the pair does not cross, and is atomic.
             */
            atmax = -half;
        }
        break;

    case MO_ATOM_SUBALIGN:
        /*
         * Examine the alignment of p to determine if there are subobjects
         * that must be aligned.  Note that we only really need ctz4() --
         * any more significant bits are discarded by the immediately
         * following comparison.
         */
        tmp = ctz32(p);
        atmax = MIN(size, tmp);
        break;

    default:
        g_assert_not_reached();
    }

    /*
     * Here we have the architectural atomicity of the operation.
     * However, when executing in a serial context, we need no extra
     * host atomicity in order to avoid racing.  This reduction
     * avoids looping with cpu_loop_exit_atomic.
     */
    if (cpu_in_serial_context(cpu)) {
        return MO_8;
    }
    return atmax;
}
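
/*
 * For example, MO_ATOM_WITHIN16_PAIR with a 16-byte operation
 * (size == MO_128, half == MO_64) yields:
 *   p % 16 == 0: atmax = MO_128  (entirely within 16 bytes)
 *   p % 16 == 8: atmax = MO_64   (the pair exactly straddles the
 *                                 boundary; each half is aligned)
 *   p % 16 == 4: atmax = -MO_64  (one half crosses the boundary
 *                                 and need not be atomic)
 */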

/**
 * load_atomic2:
 * @pv: host address
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    return qatomic_read(p);
}

/**
 * load_atomic4:
 * @pv: host address
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    return qatomic_read(p);
}

/**
 * load_atomic8:
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 */
static inline uint64_t load_atomic8(void *pv)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    return qatomic_read__nocheck(p);
}

/**
 * load_atomic8_or_exit:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
{
    if (HAVE_al8) {
        return load_atomic8(pv);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    WITH_MMAP_LOCK_GUARD() {
        if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
            uint64_t *p = __builtin_assume_aligned(pv, 8);
            return *p;
        }
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(cpu, ra);
}

/**
 * load_atomic16_or_exit:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
{
    Int128 *p = __builtin_assume_aligned(pv, 16);

    if (HAVE_ATOMIC128_RO) {
        return atomic16_read_ro(p);
    }

    /*
     * We can only use cmpxchg to emulate a load if the page is writable.
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     *
     * In system mode all guest pages are writable.  For user mode,
     * we must take mmap_lock so that the query remains valid until
     * the write is complete -- tests/tcg/multiarch/munmap-pthread.c
     * is an example that can race.
     */
    WITH_MMAP_LOCK_GUARD() {
#ifdef CONFIG_USER_ONLY
        if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
            return *p;
        }
#endif
        if (HAVE_ATOMIC128_RW) {
            return atomic16_read_rw(p);
        }
    }

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(cpu, ra);
}
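
/*
 * A generic atomic16_read_rw can be implemented with a 16-byte
 * compare-and-swap that replaces zero with zero, returning the old
 * value -- a minimal sketch, assuming such a cmpxchg primitive:
 *
 *     Int128 z = int128_make64(0);
 *     return atomic16_cmpxchg(p, z, z);
 *
 * The store half of the cmpxchg is why write access is required
 * above even though no data is modified.
 */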

/**
 * load_atom_extract_al4x2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, from two sequential atomic 4-byte loads.
 */
static uint32_t load_atom_extract_al4x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 3) * 8;
    uint32_t a, b;

    pv = (void *)(pi & ~3);
    a = load_atomic4(pv);
    b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 31));
    } else {
        return (a >> sh) | (b << (-sh & 31));
    }
}
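
/*
 * For example, with pv % 4 == 1 on a little-endian host, sh == 8 and
 * the result is (a >> 8) | (b << 24): a's bytes 1..3 joined with b's
 * byte 0.  Callers only use this for misaligned pointers, so sh is
 * never 0; "-sh & 31" computes 32 - sh while keeping the shift count
 * in range for a 32-bit value.
 */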

/**
 * load_atom_extract_al8x2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, from two sequential atomic 8-byte loads.
 */
static uint64_t load_atom_extract_al8x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 7) * 8;
    uint64_t a, b;

    pv = (void *)(pi & ~7);
    a = load_atomic8(pv);
    b = load_atomic8(pv + 8);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 63));
    } else {
        return (a >> sh) | (b << (-sh & 63));
    }
}

/**
 * load_atom_extract_al8_or_exit:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 4.
 *
 * Atomically load @s bytes from @pv, when pv % s != 0, and [pv, pv+s-1]
 * does not cross an 8-byte boundary.  This means that we can perform an
 * atomic 8-byte load and extract.
 * The value is returned in the low bits of a uint32_t.
 */
static uint32_t load_atom_extract_al8_or_exit(CPUState *cpu, uintptr_t ra,
                                              void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    pv = (void *)(pi & ~7);
    return load_atomic8_or_exit(cpu, ra, pv) >> shr;
}
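
/*
 * For example, a 2-byte load at offset 5 within an aligned 8-byte word
 * (o == 5): a little-endian host shifts right by 40 bits, a big-endian
 * host by (8 - 2 - 5) * 8 == 8 bits; either way the two wanted bytes
 * end up in the low 16 bits of the result.
 */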

/**
 * load_atom_extract_al16_or_exit:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Atomically load @s bytes from @pv, when pv % 16 < 8
 * and pv % 16 + s > 8.  I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
 * This is the slow version, so we must have eliminated
 * any faster load_atom_extract_al8_or_exit case.
 *
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atom_extract_al16_or_exit(CPUState *cpu, uintptr_t ra,
                                               void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    /*
     * Note constraints above: p & 8 must be clear.
     * Provoke SIGBUS if possible otherwise.
     */
    pv = (void *)(pi & ~7);
    r = load_atomic16_or_exit(cpu, ra, pv);

    r = int128_urshift(r, shr);
    return int128_getlo(r);
}

/**
 * load_atom_4_by_2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two 2-byte atomic loads.
 */
static inline uint32_t load_atom_4_by_2(void *pv)
{
    uint32_t a = load_atomic2(pv);
    uint32_t b = load_atomic2(pv + 2);

    if (HOST_BIG_ENDIAN) {
        return (a << 16) | b;
    } else {
        return (b << 16) | a;
    }
}

/**
 * load_atom_8_by_2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with four 2-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_2(void *pv)
{
    uint32_t a = load_atom_4_by_2(pv);
    uint32_t b = load_atom_4_by_2(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_4:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two 4-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_4(void *pv)
{
    uint32_t a = load_atomic4(pv);
    uint32_t b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_8_or_4:
 * @pv: host address
 *
 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
 */
static inline uint64_t load_atom_8_by_8_or_4(void *pv)
{
    if (HAVE_al8_fast) {
        return load_atomic8(pv);
    } else {
        return load_atom_8_by_4(pv);
    }
}

/**
 * load_atom_2:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 2 bytes from @pv, honoring the atomicity of @memop.
 */
static uint16_t load_atom_2(CPUState *cpu, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        return load_atomic2(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
        if (likely(left_in_page > 8)) {
            return load_atom_extract_al16_or_al8(pv, 2);
        }
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
        return lduw_he_p(pv);
    case MO_16:
        /* The only case remaining is MO_ATOM_WITHIN16. */
        if (!HAVE_al8_fast && (pi & 3) == 1) {
            /* Big or little endian, we want the middle two bytes. */
            return load_atomic4(pv - 1) >> 8;
        }
        if ((pi & 15) != 7) {
            return load_atom_extract_al8_or_exit(cpu, ra, pv, 2);
        }
        return load_atom_extract_al16_or_exit(cpu, ra, pv, 2);
    default:
        g_assert_not_reached();
    }
}
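
/*
 * In the (pi & 3) == 1 case above, the two bytes sit in the middle of
 * the aligned 4-byte word at pv - 1.  On either endianness they are
 * the middle two bytes of the 32-bit value, so ">> 8" moves them into
 * the low 16 bits and the implicit truncation to uint16_t finishes
 * the extraction.
 */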

/**
 * load_atom_4:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 4 bytes from @pv, honoring the atomicity of @memop.
 */
static uint32_t load_atom_4(CPUState *cpu, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        return load_atomic4(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
        if (likely(left_in_page > 8)) {
            return load_atom_extract_al16_or_al8(pv, 4);
        }
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
    case MO_16:
    case -MO_16:
        /*
         * For MO_ATOM_IFALIGN, this is more atomicity than required,
         * but it's trivially supported on all hosts, better than 4
         * individual byte loads (when the host requires alignment),
         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
         */
        return load_atom_extract_al4x2(pv);
    case MO_32:
        if (!(pi & 4)) {
            return load_atom_extract_al8_or_exit(cpu, ra, pv, 4);
        }
        return load_atom_extract_al16_or_exit(cpu, ra, pv, 4);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_8:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 8 bytes from @pv, honoring the atomicity of @memop.
 */
static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    /*
     * If the host does not support 8-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al8 && likely((pi & 7) == 0)) {
        return load_atomic8(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 8);
    }

    atmax = required_atomicity(cpu, pi, memop);
    if (atmax == MO_64) {
        if (!HAVE_al8 && (pi & 7) == 0) {
            return load_atomic8_or_exit(cpu, ra, pv);
        }
        return load_atom_extract_al16_or_exit(cpu, ra, pv, 8);
    }
    if (HAVE_al8_fast) {
        return load_atom_extract_al8x2(pv);
    }
    switch (atmax) {
    case MO_8:
        return ldq_he_p(pv);
    case MO_16:
        return load_atom_8_by_2(pv);
    case MO_32:
        return load_atom_8_by_4(pv);
    case -MO_32:
        if (HAVE_al8) {
            return load_atom_extract_al8x2(pv);
        }
        cpu_loop_exit_atomic(cpu, ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_16:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 16 bytes from @pv, honoring the atomicity of @memop.
 */
static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
                           void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;
    Int128 r;
    uint64_t a, b;

    /*
     * If the host does not support 16-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
        return atomic16_read_ro(pv);
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
        memcpy(&r, pv, 16);
        return r;
    case MO_16:
        a = load_atom_8_by_2(pv);
        b = load_atom_8_by_2(pv + 8);
        break;
    case MO_32:
        a = load_atom_8_by_4(pv);
        b = load_atom_8_by_4(pv + 8);
        break;
    case MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(cpu, ra);
        }
        a = load_atomic8(pv);
        b = load_atomic8(pv + 8);
        break;
    case -MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(cpu, ra);
        }
        a = load_atom_extract_al8x2(pv);
        b = load_atom_extract_al8x2(pv + 8);
        break;
    case MO_128:
        return load_atomic16_or_exit(cpu, ra, pv);
    default:
        g_assert_not_reached();
    }
    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
}

/**
 * store_atomic2:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    qatomic_set(p, val);
}

/**
 * store_atomic4:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    qatomic_set(p, val);
}

/**
 * store_atomic8:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 8 aligned bytes to @pv.
 */
static inline void store_atomic8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    qatomic_set__nocheck(p, val);
}

/**
 * store_atom_4_by_2
 */
static inline void store_atom_4_by_2(void *pv, uint32_t val)
{
    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
}

/**
 * store_atom_8_by_2
 */
static inline void store_atom_8_by_2(void *pv, uint64_t val)
{
    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_8_by_4
 */
static inline void store_atom_8_by_4(void *pv, uint64_t val)
{
    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}
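
/*
 * For example, on a little-endian host store_atom_8_by_4 stores the
 * low half of @val at @pv and the high half at @pv + 4; a big-endian
 * host does the reverse.  The memory image matches a single host-endian
 * 8-byte store, just with 4-byte rather than 8-byte atomicity.
 */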

/**
 * store_atom_insert_al4:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
 */
static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
{
    uint32_t old, new;

    p = __builtin_assume_aligned(p, 4);
    old = qatomic_read(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al8:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
 */
static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old, new;

    qemu_build_assert(HAVE_al8);
    p = __builtin_assume_aligned(p, 8);
    old = qatomic_read__nocheck(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}
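
/*
 * The compare-and-swap loop re-folds the unchanged neighboring bytes
 * of *p into each attempt, so only the bytes under @msk change.  For
 * example, store_atom_2 below inserts a 2-byte value at p % 4 == 1 with
 *
 *     store_atom_insert_al4(pv - 1, (uint32_t)val << 8,
 *                           MAKE_64BIT_MASK(8, 16));
 *
 * which replaces only bits 8..23 of the aligned word.
 */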

/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @pv.  The bytes to store are extracted in
 * little-endian order from @val_le; return the bytes of @val_le
 * beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;

    for (int i = 0; i < size; i++, val_le >>= 8) {
        p[i] = val_le;
    }
    return val_le;
}
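
/*
 * For example, store_bytes_leN(p, 3, 0x44332211) writes 0x11, 0x22,
 * 0x33 to p[0], p[1], p[2] and returns 0x44, the unstored remainder.
 */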

/**
 * store_parts_leN
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically on each aligned part.
 */
G_GNUC_UNUSED
static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
{
    do {
        int n;

        /* Find minimum of alignment and size */
        switch (((uintptr_t)pv | size) & 7) {
        case 4:
            store_atomic4(pv, le32_to_cpu(val_le));
            val_le >>= 32;
            n = 4;
            break;
        case 2:
        case 6:
            store_atomic2(pv, le16_to_cpu(val_le));
            val_le >>= 16;
            n = 2;
            break;
        default:
            *(uint8_t *)pv = val_le;
            val_le >>= 8;
            n = 1;
            break;
        case 0:
            g_assert_not_reached();
        }
        pv += n;
        size -= n;
    } while (size != 0);

    return val_le;
}
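
/*
 * For example, with pv % 8 == 2 and size == 6: the first iteration
 * sees (2 | 6) & 7 == 6 and stores 2 bytes, leaving pv 4-aligned; the
 * second sees (4 | 4) & 7 == 4 and stores the remaining 4 bytes with
 * 4-byte atomicity.
 */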

/**
 * store_whole_le4
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Four aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 3;
    int sh = o * 8;
    uint32_t m = MAKE_64BIT_MASK(0, sz);
    uint32_t v;

    if (HOST_BIG_ENDIAN) {
        v = bswap32(val_le) >> sh;
        m = bswap32(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al4(pv - o, v, m);
    return val_le >> sz;
}
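
/*
 * For example, storing size == 3 bytes at pv % 4 == 1 on a
 * little-endian host: sz == 24 and sh == 8, so the mask becomes
 * 0x00ffffff << 8 == 0xffffff00 and the three bytes are inserted into
 * bits 8..31 of the aligned word at pv - 1.  The return value
 * val_le >> 24 is the one byte left for the caller to store.
 */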

/**
 * store_whole_le8
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Eight aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 7;
    int sh = o * 8;
    uint64_t m = MAKE_64BIT_MASK(0, sz);
    uint64_t v;

    qemu_build_assert(HAVE_al8);
    if (HOST_BIG_ENDIAN) {
        v = bswap64(val_le) >> sh;
        m = bswap64(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al8(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le16
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * 16 aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 15;
    int sh = o * 8;
    Int128 m, v;

    qemu_build_assert(HAVE_ATOMIC128_RW);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
        m = int128_make64(MAKE_64BIT_MASK(0, sz));
    } else {
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    }

    if (HOST_BIG_ENDIAN) {
        v = int128_urshift(bswap128(val_le), sh);
        m = int128_urshift(bswap128(m), sh);
    } else {
        v = int128_lshift(val_le, sh);
        m = int128_lshift(m, sh);
    }
    store_atom_insert_al16(pv - o, v, m);

    if (sz <= 64) {
        return 0;
    }
    return int128_gethi(val_le) >> (sz - 64);
}

/**
 * store_atom_2:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 * @val: the value to store
 *
 * Store 2 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_2(CPUState *cpu, uintptr_t ra,
                         void *pv, MemOp memop, uint16_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        store_atomic2(pv, val);
        return;
    }

    atmax = required_atomicity(cpu, pi, memop);
    if (atmax == MO_8) {
        stw_he_p(pv, val);
        return;
    }

    /*
     * The only case remaining is MO_ATOM_WITHIN16.
     * Big or little endian, we want the middle two bytes in each test.
     */
    if ((pi & 3) == 1) {
        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        return;
    } else if ((pi & 7) == 3) {
        if (HAVE_al8) {
            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
            return;
        }
    } else if ((pi & 15) == 7) {
        if (HAVE_ATOMIC128_RW) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
            return;
        }
    } else {
        g_assert_not_reached();
    }

    cpu_loop_exit_atomic(cpu, ra);
}

/**
 * store_atom_4:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 * @val: the value to store
 *
 * Store 4 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_4(CPUState *cpu, uintptr_t ra,
                         void *pv, MemOp memop, uint32_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        store_atomic4(pv, val);
        return;
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
        stl_he_p(pv, val);
        return;
    case MO_16:
        store_atom_4_by_2(pv, val);
        return;
    case -MO_16:
        {
            uint32_t val_le = cpu_to_le32(val);
            int s2 = pi & 3;
            int s1 = 4 - s2;

            switch (s2) {
            case 1:
                val_le = store_whole_le4(pv, s1, val_le);
                *(uint8_t *)(pv + 3) = val_le;
                break;
            case 3:
                *(uint8_t *)pv = val_le;
                store_whole_le4(pv + 1, s2, val_le >> 8);
                break;
            case 0: /* aligned */
            case 2: /* atmax MO_16 */
            default:
                g_assert_not_reached();
            }
        }
        return;
    case MO_32:
        if ((pi & 7) < 4) {
            if (HAVE_al8) {
                store_whole_le8(pv, 4, cpu_to_le32(val));
                return;
            }
        } else {
            if (HAVE_ATOMIC128_RW) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
        }
        cpu_loop_exit_atomic(cpu, ra);
    default:
        g_assert_not_reached();
    }
}
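
/*
 * In the -MO_16 case above, a 4-byte store at pv % 4 == 1 becomes a
 * 3-byte atomic insert into the aligned word at pv - 1 plus one plain
 * byte at pv + 3; at pv % 4 == 3 it is one plain byte followed by a
 * 3-byte atomic insert at the now-aligned pv + 1.  The half of the
 * pair that must remain atomic never crosses its aligned 4-byte word.
 */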

/**
 * store_atom_8:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 * @val: the value to store
 *
 * Store 8 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_8(CPUState *cpu, uintptr_t ra,
                         void *pv, MemOp memop, uint64_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (HAVE_al8 && likely((pi & 7) == 0)) {
        store_atomic8(pv, val);
        return;
    }

    atmax = required_atomicity(cpu, pi, memop);
    switch (atmax) {
    case MO_8:
        stq_he_p(pv, val);
        return;
    case MO_16:
        store_atom_8_by_2(pv, val);
        return;
    case MO_32:
        store_atom_8_by_4(pv, val);
        return;
    case -MO_32:
        if (HAVE_al8) {
            uint64_t val_le = cpu_to_le64(val);
            int s2 = pi & 7;
            int s1 = 8 - s2;

            switch (s2) {
            case 1 ... 3:
                val_le = store_whole_le8(pv, s1, val_le);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 5 ... 7:
                val_le = store_bytes_leN(pv, s1, val_le);
                store_whole_le8(pv + s1, s2, val_le);
                break;
            case 0: /* aligned */
            case 4: /* atmax MO_32 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_64:
        if (HAVE_ATOMIC128_RW) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(cpu, ra);
}

/**
 * store_atom_16:
 * @cpu: generic cpu state
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 * @val: the value to store
 *
 * Store 16 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_16(CPUState *cpu, uintptr_t ra,
                          void *pv, MemOp memop, Int128 val)
{
    uintptr_t pi = (uintptr_t)pv;
    uint64_t a, b;
    int atmax;

    if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
        atomic16_set(pv, val);
        return;
    }

    atmax = required_atomicity(cpu, pi, memop);

    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
    switch (atmax) {
    case MO_8:
        memcpy(pv, &val, 16);
        return;
    case MO_16:
        store_atom_8_by_2(pv, a);
        store_atom_8_by_2(pv + 8, b);
        return;
    case MO_32:
        store_atom_8_by_4(pv, a);
        store_atom_8_by_4(pv + 8, b);
        return;
    case MO_64:
        if (HAVE_al8) {
            store_atomic8(pv, a);
            store_atomic8(pv + 8, b);
            return;
        }
        break;
    case -MO_64:
        if (HAVE_ATOMIC128_RW) {
            uint64_t val_le;
            int s2 = pi & 15;
            int s1 = 16 - s2;

            if (HOST_BIG_ENDIAN) {
                val = bswap128(val);
            }
            switch (s2) {
            case 1 ... 7:
                val_le = store_whole_le16(pv, s1, val);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 9 ... 15:
                store_bytes_leN(pv, s1, int128_getlo(val));
                val = int128_urshift(val, s1 * 8);
                store_whole_le16(pv + s1, s2, val);
                break;
            case 0: /* aligned */
            case 8: /* atmax MO_64 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_128:
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(cpu, ra);
}