]> git.proxmox.com Git - mirror_qemu.git/blame - accel/tcg/ldst_atomicity.c.inc
accel/tcg: Remove redundant case in store_atom_16
[mirror_qemu.git] / accel / tcg / ldst_atomicity.c.inc
CommitLineData
cdfac37b
RH
1/*
2 * Routines common to user and system emulation of load/store.
3 *
4 * Copyright (c) 2022 Linaro, Ltd.
5 *
6 * SPDX-License-Identifier: GPL-2.0-or-later
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
10 */
11
af844a11 12#include "host/load-extract-al16-al8.h"
b3f4144f 13#include "host/store-insert-al16.h"
af844a11 14
cdfac37b
RH
/* HAVE_al8 is true exactly when CONFIG_ATOMIC64 is defined. */
#if defined(CONFIG_ATOMIC64)
# define HAVE_al8 true
#else
# define HAVE_al8 false
#endif
/* HAVE_al8_fast is true when ATOMIC_REG_SIZE is at least 8. */
#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8)
21
cdfac37b
RH
22/**
23 * required_atomicity:
24 *
25 * Return the lg2 bytes of atomicity required by @memop for @p.
26 * If the operation must be split into two operations to be
27 * examined separately for atomicity, return -lg2.
28 */
73fda56f 29static int required_atomicity(CPUState *cpu, uintptr_t p, MemOp memop)
cdfac37b
RH
30{
31 MemOp atom = memop & MO_ATOM_MASK;
32 MemOp size = memop & MO_SIZE;
33 MemOp half = size ? size - 1 : 0;
34 unsigned tmp;
35 int atmax;
36
37 switch (atom) {
38 case MO_ATOM_NONE:
39 atmax = MO_8;
40 break;
41
42 case MO_ATOM_IFALIGN_PAIR:
43 size = half;
44 /* fall through */
45
46 case MO_ATOM_IFALIGN:
47 tmp = (1 << size) - 1;
48 atmax = p & tmp ? MO_8 : size;
49 break;
50
51 case MO_ATOM_WITHIN16:
52 tmp = p & 15;
53 atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
54 break;
55
56 case MO_ATOM_WITHIN16_PAIR:
57 tmp = p & 15;
58 if (tmp + (1 << size) <= 16) {
59 atmax = size;
60 } else if (tmp + (1 << half) == 16) {
61 /*
62 * The pair exactly straddles the boundary.
63 * Both halves are naturally aligned and atomic.
64 */
65 atmax = half;
66 } else {
67 /*
68 * One of the pair crosses the boundary, and is non-atomic.
69 * The other of the pair does not cross, and is atomic.
70 */
71 atmax = -half;
72 }
73 break;
74
75 case MO_ATOM_SUBALIGN:
76 /*
77 * Examine the alignment of p to determine if there are subobjects
78 * that must be aligned. Note that we only really need ctz4() --
79 * any more sigificant bits are discarded by the immediately
80 * following comparison.
81 */
82 tmp = ctz32(p);
83 atmax = MIN(size, tmp);
84 break;
85
86 default:
87 g_assert_not_reached();
88 }
89
90 /*
91 * Here we have the architectural atomicity of the operation.
92 * However, when executing in a serial context, we need no extra
93 * host atomicity in order to avoid racing. This reduction
94 * avoids looping with cpu_loop_exit_atomic.
95 */
73fda56f 96 if (cpu_in_serial_context(cpu)) {
cdfac37b
RH
97 return MO_8;
98 }
99 return atmax;
100}
101
/**
 * load_atomic2:
 * @pv: host address, 2-byte aligned
 *
 * Atomically load 2 aligned bytes from @pv and return them.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *src = __builtin_assume_aligned(pv, 2);
    uint16_t val = qatomic_read(src);

    return val;
}
113
/**
 * load_atomic4:
 * @pv: host address, 4-byte aligned
 *
 * Atomically load 4 aligned bytes from @pv and return them.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *src = __builtin_assume_aligned(pv, 4);
    uint32_t val = qatomic_read(src);

    return val;
}
125
126/**
127 * load_atomic8:
128 * @pv: host address
129 *
130 * Atomically load 8 aligned bytes from @pv.
131 */
132static inline uint64_t load_atomic8(void *pv)
133{
134 uint64_t *p = __builtin_assume_aligned(pv, 8);
135
136 qemu_build_assert(HAVE_al8);
137 return qatomic_read__nocheck(p);
138}
139
cdfac37b
RH
140/**
141 * load_atomic8_or_exit:
73fda56f 142 * @cpu: generic cpu state
cdfac37b
RH
143 * @ra: host unwind address
144 * @pv: host address
145 *
146 * Atomically load 8 aligned bytes from @pv.
147 * If this is not possible, longjmp out to restart serially.
148 */
73fda56f 149static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
cdfac37b
RH
150{
151 if (HAVE_al8) {
152 return load_atomic8(pv);
153 }
154
155#ifdef CONFIG_USER_ONLY
156 /*
157 * If the page is not writable, then assume the value is immutable
158 * and requires no locking. This ignores the case of MAP_SHARED with
159 * another process, because the fallback start_exclusive solution
160 * provides no protection across processes.
161 */
2c8412d4
RH
162 WITH_MMAP_LOCK_GUARD() {
163 if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
164 uint64_t *p = __builtin_assume_aligned(pv, 8);
165 return *p;
166 }
cdfac37b
RH
167 }
168#endif
169
170 /* Ultimate fallback: re-execute in serial context. */
73fda56f 171 cpu_loop_exit_atomic(cpu, ra);
cdfac37b
RH
172}
173
174/**
175 * load_atomic16_or_exit:
73fda56f 176 * @cpu: generic cpu state
cdfac37b
RH
177 * @ra: host unwind address
178 * @pv: host address
179 *
180 * Atomically load 16 aligned bytes from @pv.
181 * If this is not possible, longjmp out to restart serially.
182 */
73fda56f 183static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
cdfac37b
RH
184{
185 Int128 *p = __builtin_assume_aligned(pv, 16);
186
8dc24ff4
RH
187 if (HAVE_ATOMIC128_RO) {
188 return atomic16_read_ro(p);
cdfac37b
RH
189 }
190
cdfac37b
RH
191 /*
192 * We can only use cmpxchg to emulate a load if the page is writable.
193 * If the page is not writable, then assume the value is immutable
194 * and requires no locking. This ignores the case of MAP_SHARED with
195 * another process, because the fallback start_exclusive solution
196 * provides no protection across processes.
2c8412d4
RH
197 *
198 * In system mode all guest pages are writable. For user mode,
199 * we must take mmap_lock so that the query remains valid until
200 * the write is complete -- tests/tcg/multiarch/munmap-pthread.c
201 * is an example that can race.
cdfac37b 202 */
2c8412d4
RH
203 WITH_MMAP_LOCK_GUARD() {
204#ifdef CONFIG_USER_ONLY
205 if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
206 return *p;
207 }
cdfac37b 208#endif
2c8412d4
RH
209 if (HAVE_ATOMIC128_RW) {
210 return atomic16_read_rw(p);
211 }
cdfac37b 212 }
cdfac37b
RH
213
214 /* Ultimate fallback: re-execute in serial context. */
73fda56f 215 cpu_loop_exit_atomic(cpu, ra);
cdfac37b
RH
216}
217
218/**
219 * load_atom_extract_al4x2:
220 * @pv: host address
221 *
222 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
223 */
224static uint32_t load_atom_extract_al4x2(void *pv)
225{
226 uintptr_t pi = (uintptr_t)pv;
227 int sh = (pi & 3) * 8;
228 uint32_t a, b;
229
230 pv = (void *)(pi & ~3);
231 a = load_atomic4(pv);
232 b = load_atomic4(pv + 4);
233
234 if (HOST_BIG_ENDIAN) {
235 return (a << sh) | (b >> (-sh & 31));
236 } else {
237 return (a >> sh) | (b << (-sh & 31));
238 }
239}
240
241/**
242 * load_atom_extract_al8x2:
243 * @pv: host address
244 *
245 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
246 */
247static uint64_t load_atom_extract_al8x2(void *pv)
248{
249 uintptr_t pi = (uintptr_t)pv;
250 int sh = (pi & 7) * 8;
251 uint64_t a, b;
252
253 pv = (void *)(pi & ~7);
254 a = load_atomic8(pv);
255 b = load_atomic8(pv + 8);
256
257 if (HOST_BIG_ENDIAN) {
258 return (a << sh) | (b >> (-sh & 63));
259 } else {
260 return (a >> sh) | (b << (-sh & 63));
261 }
262}
263
264/**
265 * load_atom_extract_al8_or_exit:
73fda56f 266 * @cpu: generic cpu state
cdfac37b
RH
267 * @ra: host unwind address
268 * @pv: host address
269 * @s: object size in bytes, @s <= 4.
270 *
271 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
272 * not cross an 8-byte boundary. This means that we can perform an atomic
273 * 8-byte load and extract.
274 * The value is returned in the low bits of a uint32_t.
275 */
73fda56f 276static uint32_t load_atom_extract_al8_or_exit(CPUState *cpu, uintptr_t ra,
cdfac37b
RH
277 void *pv, int s)
278{
279 uintptr_t pi = (uintptr_t)pv;
280 int o = pi & 7;
281 int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;
282
283 pv = (void *)(pi & ~7);
73fda56f 284 return load_atomic8_or_exit(cpu, ra, pv) >> shr;
cdfac37b
RH
285}
286
287/**
288 * load_atom_extract_al16_or_exit:
73fda56f 289 * @cpu: generic cpu state
cdfac37b
RH
290 * @ra: host unwind address
291 * @p: host address
292 * @s: object size in bytes, @s <= 8.
293 *
294 * Atomically load @s bytes from @p, when p % 16 < 8
295 * and p % 16 + s > 8. I.e. does not cross a 16-byte
296 * boundary, but *does* cross an 8-byte boundary.
297 * This is the slow version, so we must have eliminated
298 * any faster load_atom_extract_al8_or_exit case.
299 *
300 * If this is not possible, longjmp out to restart serially.
301 */
73fda56f 302static uint64_t load_atom_extract_al16_or_exit(CPUState *cpu, uintptr_t ra,
cdfac37b
RH
303 void *pv, int s)
304{
305 uintptr_t pi = (uintptr_t)pv;
306 int o = pi & 7;
307 int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
308 Int128 r;
309
310 /*
311 * Note constraints above: p & 8 must be clear.
312 * Provoke SIGBUS if possible otherwise.
313 */
314 pv = (void *)(pi & ~7);
73fda56f 315 r = load_atomic16_or_exit(cpu, ra, pv);
cdfac37b
RH
316
317 r = int128_urshift(r, shr);
318 return int128_getlo(r);
319}
320
cdfac37b
RH
321/**
322 * load_atom_4_by_2:
323 * @pv: host address
324 *
325 * Load 4 bytes from @pv, with two 2-byte atomic loads.
326 */
327static inline uint32_t load_atom_4_by_2(void *pv)
328{
329 uint32_t a = load_atomic2(pv);
330 uint32_t b = load_atomic2(pv + 2);
331
332 if (HOST_BIG_ENDIAN) {
333 return (a << 16) | b;
334 } else {
335 return (b << 16) | a;
336 }
337}
338
339/**
340 * load_atom_8_by_2:
341 * @pv: host address
342 *
343 * Load 8 bytes from @pv, with four 2-byte atomic loads.
344 */
345static inline uint64_t load_atom_8_by_2(void *pv)
346{
347 uint32_t a = load_atom_4_by_2(pv);
348 uint32_t b = load_atom_4_by_2(pv + 4);
349
350 if (HOST_BIG_ENDIAN) {
351 return ((uint64_t)a << 32) | b;
352 } else {
353 return ((uint64_t)b << 32) | a;
354 }
355}
356
357/**
358 * load_atom_8_by_4:
359 * @pv: host address
360 *
361 * Load 8 bytes from @pv, with two 4-byte atomic loads.
362 */
363static inline uint64_t load_atom_8_by_4(void *pv)
364{
365 uint32_t a = load_atomic4(pv);
366 uint32_t b = load_atomic4(pv + 4);
367
368 if (HOST_BIG_ENDIAN) {
369 return ((uint64_t)a << 32) | b;
370 } else {
371 return ((uint64_t)b << 32) | a;
372 }
373}
374
35c653c4
RH
375/**
376 * load_atom_8_by_8_or_4:
377 * @pv: host address
378 *
379 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
380 */
381static inline uint64_t load_atom_8_by_8_or_4(void *pv)
382{
383 if (HAVE_al8_fast) {
384 return load_atomic8(pv);
385 } else {
386 return load_atom_8_by_4(pv);
387 }
388}
389
cdfac37b
RH
390/**
391 * load_atom_2:
392 * @p: host address
393 * @memop: the full memory op
394 *
395 * Load 2 bytes from @p, honoring the atomicity of @memop.
396 */
73fda56f 397static uint16_t load_atom_2(CPUState *cpu, uintptr_t ra,
cdfac37b
RH
398 void *pv, MemOp memop)
399{
400 uintptr_t pi = (uintptr_t)pv;
401 int atmax;
402
403 if (likely((pi & 1) == 0)) {
404 return load_atomic2(pv);
405 }
8dc24ff4 406 if (HAVE_ATOMIC128_RO) {
6a2c23dd
RH
407 intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
408 if (likely(left_in_page > 8)) {
409 return load_atom_extract_al16_or_al8(pv, 2);
410 }
cdfac37b
RH
411 }
412
73fda56f 413 atmax = required_atomicity(cpu, pi, memop);
cdfac37b
RH
414 switch (atmax) {
415 case MO_8:
416 return lduw_he_p(pv);
417 case MO_16:
418 /* The only case remaining is MO_ATOM_WITHIN16. */
419 if (!HAVE_al8_fast && (pi & 3) == 1) {
420 /* Big or little endian, we want the middle two bytes. */
421 return load_atomic4(pv - 1) >> 8;
422 }
423 if ((pi & 15) != 7) {
73fda56f 424 return load_atom_extract_al8_or_exit(cpu, ra, pv, 2);
cdfac37b 425 }
73fda56f 426 return load_atom_extract_al16_or_exit(cpu, ra, pv, 2);
cdfac37b
RH
427 default:
428 g_assert_not_reached();
429 }
430}
431
432/**
433 * load_atom_4:
434 * @p: host address
435 * @memop: the full memory op
436 *
437 * Load 4 bytes from @p, honoring the atomicity of @memop.
438 */
73fda56f 439static uint32_t load_atom_4(CPUState *cpu, uintptr_t ra,
cdfac37b
RH
440 void *pv, MemOp memop)
441{
442 uintptr_t pi = (uintptr_t)pv;
443 int atmax;
444
445 if (likely((pi & 3) == 0)) {
446 return load_atomic4(pv);
447 }
8dc24ff4 448 if (HAVE_ATOMIC128_RO) {
6a2c23dd
RH
449 intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
450 if (likely(left_in_page > 8)) {
451 return load_atom_extract_al16_or_al8(pv, 4);
452 }
cdfac37b
RH
453 }
454
73fda56f 455 atmax = required_atomicity(cpu, pi, memop);
cdfac37b
RH
456 switch (atmax) {
457 case MO_8:
458 case MO_16:
459 case -MO_16:
460 /*
461 * For MO_ATOM_IFALIGN, this is more atomicity than required,
462 * but it's trivially supported on all hosts, better than 4
463 * individual byte loads (when the host requires alignment),
464 * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
465 */
466 return load_atom_extract_al4x2(pv);
467 case MO_32:
468 if (!(pi & 4)) {
73fda56f 469 return load_atom_extract_al8_or_exit(cpu, ra, pv, 4);
cdfac37b 470 }
73fda56f 471 return load_atom_extract_al16_or_exit(cpu, ra, pv, 4);
cdfac37b
RH
472 default:
473 g_assert_not_reached();
474 }
475}
476
477/**
478 * load_atom_8:
479 * @p: host address
480 * @memop: the full memory op
481 *
482 * Load 8 bytes from @p, honoring the atomicity of @memop.
483 */
73fda56f 484static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra,
cdfac37b
RH
485 void *pv, MemOp memop)
486{
487 uintptr_t pi = (uintptr_t)pv;
488 int atmax;
489
490 /*
491 * If the host does not support 8-byte atomics, wait until we have
492 * examined the atomicity parameters below.
493 */
494 if (HAVE_al8 && likely((pi & 7) == 0)) {
495 return load_atomic8(pv);
496 }
8dc24ff4 497 if (HAVE_ATOMIC128_RO) {
cdfac37b
RH
498 return load_atom_extract_al16_or_al8(pv, 8);
499 }
500
73fda56f 501 atmax = required_atomicity(cpu, pi, memop);
cdfac37b
RH
502 if (atmax == MO_64) {
503 if (!HAVE_al8 && (pi & 7) == 0) {
73fda56f 504 load_atomic8_or_exit(cpu, ra, pv);
cdfac37b 505 }
73fda56f 506 return load_atom_extract_al16_or_exit(cpu, ra, pv, 8);
cdfac37b
RH
507 }
508 if (HAVE_al8_fast) {
509 return load_atom_extract_al8x2(pv);
510 }
511 switch (atmax) {
512 case MO_8:
513 return ldq_he_p(pv);
514 case MO_16:
515 return load_atom_8_by_2(pv);
516 case MO_32:
517 return load_atom_8_by_4(pv);
518 case -MO_32:
519 if (HAVE_al8) {
520 return load_atom_extract_al8x2(pv);
521 }
73fda56f 522 cpu_loop_exit_atomic(cpu, ra);
cdfac37b
RH
523 default:
524 g_assert_not_reached();
525 }
526}
5b36f268 527
35c653c4
RH
528/**
529 * load_atom_16:
530 * @p: host address
531 * @memop: the full memory op
532 *
533 * Load 16 bytes from @p, honoring the atomicity of @memop.
534 */
73fda56f 535static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
35c653c4
RH
536 void *pv, MemOp memop)
537{
538 uintptr_t pi = (uintptr_t)pv;
539 int atmax;
540 Int128 r;
541 uint64_t a, b;
542
543 /*
544 * If the host does not support 16-byte atomics, wait until we have
545 * examined the atomicity parameters below.
546 */
8dc24ff4
RH
547 if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
548 return atomic16_read_ro(pv);
35c653c4
RH
549 }
550
73fda56f 551 atmax = required_atomicity(cpu, pi, memop);
35c653c4
RH
552 switch (atmax) {
553 case MO_8:
554 memcpy(&r, pv, 16);
555 return r;
556 case MO_16:
557 a = load_atom_8_by_2(pv);
558 b = load_atom_8_by_2(pv + 8);
559 break;
560 case MO_32:
561 a = load_atom_8_by_4(pv);
562 b = load_atom_8_by_4(pv + 8);
563 break;
564 case MO_64:
565 if (!HAVE_al8) {
73fda56f 566 cpu_loop_exit_atomic(cpu, ra);
35c653c4
RH
567 }
568 a = load_atomic8(pv);
569 b = load_atomic8(pv + 8);
570 break;
571 case -MO_64:
572 if (!HAVE_al8) {
73fda56f 573 cpu_loop_exit_atomic(cpu, ra);
35c653c4
RH
574 }
575 a = load_atom_extract_al8x2(pv);
576 b = load_atom_extract_al8x2(pv + 8);
577 break;
578 case MO_128:
73fda56f 579 return load_atomic16_or_exit(cpu, ra, pv);
35c653c4
RH
580 default:
581 g_assert_not_reached();
582 }
583 return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
584}
585
5b36f268
RH
/**
 * store_atomic2:
 * @pv: host address, 2-byte aligned
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *dst = __builtin_assume_aligned(pv, 2);

    qatomic_set(dst, val);
}
598
/**
 * store_atomic4:
 * @pv: host address, 4-byte aligned
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *dst = __builtin_assume_aligned(pv, 4);

    qatomic_set(dst, val);
}
611
612/**
613 * store_atomic8:
614 * @pv: host address
615 * @val: value to store
616 *
617 * Atomically store 8 aligned bytes to @pv.
618 */
619static inline void store_atomic8(void *pv, uint64_t val)
620{
621 uint64_t *p = __builtin_assume_aligned(pv, 8);
622
623 qemu_build_assert(HAVE_al8);
624 qatomic_set__nocheck(p, val);
625}
626
627/**
628 * store_atom_4x2
629 */
630static inline void store_atom_4_by_2(void *pv, uint32_t val)
631{
632 store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
633 store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
634}
635
636/**
637 * store_atom_8_by_2
638 */
639static inline void store_atom_8_by_2(void *pv, uint64_t val)
640{
641 store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
642 store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
643}
644
645/**
646 * store_atom_8_by_4
647 */
648static inline void store_atom_8_by_4(void *pv, uint64_t val)
649{
650 store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
651 store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
652}
653
654/**
655 * store_atom_insert_al4:
656 * @p: host address
657 * @val: shifted value to store
658 * @msk: mask for value to store
659 *
660 * Atomically store @val to @p, masked by @msk.
661 */
662static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
663{
664 uint32_t old, new;
665
666 p = __builtin_assume_aligned(p, 4);
667 old = qatomic_read(p);
668 do {
669 new = (old & ~msk) | val;
670 } while (!__atomic_compare_exchange_n(p, &old, new, true,
671 __ATOMIC_RELAXED, __ATOMIC_RELAXED));
672}
673
674/**
675 * store_atom_insert_al8:
676 * @p: host address
677 * @val: shifted value to store
678 * @msk: mask for value to store
679 *
680 * Atomically store @val to @p masked by @msk.
681 */
682static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
683{
684 uint64_t old, new;
685
686 qemu_build_assert(HAVE_al8);
687 p = __builtin_assume_aligned(p, 8);
688 old = qatomic_read__nocheck(p);
689 do {
690 new = (old & ~msk) | val;
691 } while (!__atomic_compare_exchange_n(p, &old, new, true,
692 __ATOMIC_RELAXED, __ATOMIC_RELAXED));
693}
694
5b36f268
RH
/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @pv, one byte at a time.  The bytes to store are
 * extracted in little-endian order from @val_le; return the bytes of
 * @val_le beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *dst = pv;
    int remaining = size;

    while (remaining > 0) {
        /* The lowest byte goes out first; the rest shifts down. */
        *dst++ = val_le;
        val_le >>= 8;
        remaining--;
    }
    return val_le;
}
712
713/**
714 * store_parts_leN
715 * @pv: host address
716 * @size: number of bytes to store
717 * @val_le: data to store
718 *
719 * As store_bytes_leN, but atomically on each aligned part.
720 */
721G_GNUC_UNUSED
722static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
723{
724 do {
725 int n;
726
727 /* Find minimum of alignment and size */
728 switch (((uintptr_t)pv | size) & 7) {
729 case 4:
730 store_atomic4(pv, le32_to_cpu(val_le));
731 val_le >>= 32;
732 n = 4;
733 break;
734 case 2:
735 case 6:
736 store_atomic2(pv, le16_to_cpu(val_le));
737 val_le >>= 16;
738 n = 2;
739 break;
740 default:
741 *(uint8_t *)pv = val_le;
742 val_le >>= 8;
743 n = 1;
744 break;
745 case 0:
746 g_assert_not_reached();
747 }
748 pv += n;
749 size -= n;
750 } while (size != 0);
751
752 return val_le;
753}
754
755/**
756 * store_whole_le4
757 * @pv: host address
758 * @size: number of bytes to store
759 * @val_le: data to store
760 *
761 * As store_bytes_leN, but atomically as a whole.
762 * Four aligned bytes are guaranteed to cover the store.
763 */
764static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
765{
766 int sz = size * 8;
767 int o = (uintptr_t)pv & 3;
768 int sh = o * 8;
769 uint32_t m = MAKE_64BIT_MASK(0, sz);
770 uint32_t v;
771
772 if (HOST_BIG_ENDIAN) {
773 v = bswap32(val_le) >> sh;
774 m = bswap32(m) >> sh;
775 } else {
776 v = val_le << sh;
777 m <<= sh;
778 }
779 store_atom_insert_al4(pv - o, v, m);
780 return val_le >> sz;
781}
782
783/**
784 * store_whole_le8
785 * @pv: host address
786 * @size: number of bytes to store
787 * @val_le: data to store
788 *
789 * As store_bytes_leN, but atomically as a whole.
790 * Eight aligned bytes are guaranteed to cover the store.
791 */
792static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
793{
794 int sz = size * 8;
795 int o = (uintptr_t)pv & 7;
796 int sh = o * 8;
797 uint64_t m = MAKE_64BIT_MASK(0, sz);
798 uint64_t v;
799
800 qemu_build_assert(HAVE_al8);
801 if (HOST_BIG_ENDIAN) {
802 v = bswap64(val_le) >> sh;
803 m = bswap64(m) >> sh;
804 } else {
805 v = val_le << sh;
806 m <<= sh;
807 }
808 store_atom_insert_al8(pv - o, v, m);
809 return val_le >> sz;
810}
811
812/**
813 * store_whole_le16
814 * @pv: host address
815 * @size: number of bytes to store
816 * @val_le: data to store
817 *
818 * As store_bytes_leN, but atomically as a whole.
819 * 16 aligned bytes are guaranteed to cover the store.
820 */
821static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
822{
823 int sz = size * 8;
824 int o = (uintptr_t)pv & 15;
825 int sh = o * 8;
826 Int128 m, v;
827
8dc24ff4 828 qemu_build_assert(HAVE_ATOMIC128_RW);
5b36f268
RH
829
830 /* Like MAKE_64BIT_MASK(0, sz), but larger. */
831 if (sz <= 64) {
832 m = int128_make64(MAKE_64BIT_MASK(0, sz));
833 } else {
834 m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
835 }
836
837 if (HOST_BIG_ENDIAN) {
838 v = int128_urshift(bswap128(val_le), sh);
839 m = int128_urshift(bswap128(m), sh);
840 } else {
841 v = int128_lshift(val_le, sh);
842 m = int128_lshift(m, sh);
843 }
844 store_atom_insert_al16(pv - o, v, m);
845
c0dde5fc
RH
846 if (sz <= 64) {
847 return 0;
848 }
5b36f268
RH
849 return int128_gethi(val_le) >> (sz - 64);
850}
851
852/**
853 * store_atom_2:
854 * @p: host address
855 * @val: the value to store
856 * @memop: the full memory op
857 *
858 * Store 2 bytes to @p, honoring the atomicity of @memop.
859 */
73fda56f 860static void store_atom_2(CPUState *cpu, uintptr_t ra,
5b36f268
RH
861 void *pv, MemOp memop, uint16_t val)
862{
863 uintptr_t pi = (uintptr_t)pv;
864 int atmax;
865
866 if (likely((pi & 1) == 0)) {
867 store_atomic2(pv, val);
868 return;
869 }
870
73fda56f 871 atmax = required_atomicity(cpu, pi, memop);
5b36f268
RH
872 if (atmax == MO_8) {
873 stw_he_p(pv, val);
874 return;
875 }
876
877 /*
878 * The only case remaining is MO_ATOM_WITHIN16.
879 * Big or little endian, we want the middle two bytes in each test.
880 */
881 if ((pi & 3) == 1) {
882 store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
883 return;
884 } else if ((pi & 7) == 3) {
885 if (HAVE_al8) {
886 store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
887 return;
888 }
889 } else if ((pi & 15) == 7) {
8dc24ff4 890 if (HAVE_ATOMIC128_RW) {
5b36f268
RH
891 Int128 v = int128_lshift(int128_make64(val), 56);
892 Int128 m = int128_lshift(int128_make64(0xffff), 56);
893 store_atom_insert_al16(pv - 7, v, m);
894 return;
895 }
896 } else {
897 g_assert_not_reached();
898 }
899
73fda56f 900 cpu_loop_exit_atomic(cpu, ra);
5b36f268
RH
901}
902
903/**
904 * store_atom_4:
905 * @p: host address
906 * @val: the value to store
907 * @memop: the full memory op
908 *
909 * Store 4 bytes to @p, honoring the atomicity of @memop.
910 */
73fda56f 911static void store_atom_4(CPUState *cpu, uintptr_t ra,
5b36f268
RH
912 void *pv, MemOp memop, uint32_t val)
913{
914 uintptr_t pi = (uintptr_t)pv;
915 int atmax;
916
917 if (likely((pi & 3) == 0)) {
918 store_atomic4(pv, val);
919 return;
920 }
921
73fda56f 922 atmax = required_atomicity(cpu, pi, memop);
5b36f268
RH
923 switch (atmax) {
924 case MO_8:
925 stl_he_p(pv, val);
926 return;
927 case MO_16:
928 store_atom_4_by_2(pv, val);
929 return;
930 case -MO_16:
931 {
932 uint32_t val_le = cpu_to_le32(val);
933 int s2 = pi & 3;
934 int s1 = 4 - s2;
935
936 switch (s2) {
937 case 1:
938 val_le = store_whole_le4(pv, s1, val_le);
939 *(uint8_t *)(pv + 3) = val_le;
940 break;
941 case 3:
942 *(uint8_t *)pv = val_le;
943 store_whole_le4(pv + 1, s2, val_le >> 8);
944 break;
945 case 0: /* aligned */
946 case 2: /* atmax MO_16 */
947 default:
948 g_assert_not_reached();
949 }
950 }
951 return;
952 case MO_32:
953 if ((pi & 7) < 4) {
954 if (HAVE_al8) {
955 store_whole_le8(pv, 4, cpu_to_le32(val));
956 return;
957 }
958 } else {
8dc24ff4 959 if (HAVE_ATOMIC128_RW) {
5b36f268
RH
960 store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
961 return;
962 }
963 }
73fda56f 964 cpu_loop_exit_atomic(cpu, ra);
5b36f268
RH
965 default:
966 g_assert_not_reached();
967 }
968}
969
970/**
971 * store_atom_8:
972 * @p: host address
973 * @val: the value to store
974 * @memop: the full memory op
975 *
976 * Store 8 bytes to @p, honoring the atomicity of @memop.
977 */
73fda56f 978static void store_atom_8(CPUState *cpu, uintptr_t ra,
5b36f268
RH
979 void *pv, MemOp memop, uint64_t val)
980{
981 uintptr_t pi = (uintptr_t)pv;
982 int atmax;
983
984 if (HAVE_al8 && likely((pi & 7) == 0)) {
985 store_atomic8(pv, val);
986 return;
987 }
988
73fda56f 989 atmax = required_atomicity(cpu, pi, memop);
5b36f268
RH
990 switch (atmax) {
991 case MO_8:
992 stq_he_p(pv, val);
993 return;
994 case MO_16:
995 store_atom_8_by_2(pv, val);
996 return;
997 case MO_32:
998 store_atom_8_by_4(pv, val);
999 return;
1000 case -MO_32:
1001 if (HAVE_al8) {
1002 uint64_t val_le = cpu_to_le64(val);
1003 int s2 = pi & 7;
1004 int s1 = 8 - s2;
1005
1006 switch (s2) {
1007 case 1 ... 3:
1008 val_le = store_whole_le8(pv, s1, val_le);
1009 store_bytes_leN(pv + s1, s2, val_le);
1010 break;
1011 case 5 ... 7:
1012 val_le = store_bytes_leN(pv, s1, val_le);
1013 store_whole_le8(pv + s1, s2, val_le);
1014 break;
1015 case 0: /* aligned */
1016 case 4: /* atmax MO_32 */
1017 default:
1018 g_assert_not_reached();
1019 }
1020 return;
1021 }
1022 break;
1023 case MO_64:
8dc24ff4 1024 if (HAVE_ATOMIC128_RW) {
5b36f268
RH
1025 store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
1026 return;
1027 }
1028 break;
1029 default:
1030 g_assert_not_reached();
1031 }
73fda56f 1032 cpu_loop_exit_atomic(cpu, ra);
5b36f268 1033}
35c653c4
RH
1034
1035/**
1036 * store_atom_16:
1037 * @p: host address
1038 * @val: the value to store
1039 * @memop: the full memory op
1040 *
1041 * Store 16 bytes to @p, honoring the atomicity of @memop.
1042 */
73fda56f 1043static void store_atom_16(CPUState *cpu, uintptr_t ra,
35c653c4
RH
1044 void *pv, MemOp memop, Int128 val)
1045{
1046 uintptr_t pi = (uintptr_t)pv;
1047 uint64_t a, b;
1048 int atmax;
1049
8dc24ff4
RH
1050 if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
1051 atomic16_set(pv, val);
35c653c4
RH
1052 return;
1053 }
1054
73fda56f 1055 atmax = required_atomicity(cpu, pi, memop);
35c653c4
RH
1056
1057 a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
1058 b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
1059 switch (atmax) {
1060 case MO_8:
1061 memcpy(pv, &val, 16);
1062 return;
1063 case MO_16:
1064 store_atom_8_by_2(pv, a);
1065 store_atom_8_by_2(pv + 8, b);
1066 return;
1067 case MO_32:
1068 store_atom_8_by_4(pv, a);
1069 store_atom_8_by_4(pv + 8, b);
1070 return;
1071 case MO_64:
1072 if (HAVE_al8) {
1073 store_atomic8(pv, a);
1074 store_atomic8(pv + 8, b);
1075 return;
1076 }
1077 break;
1078 case -MO_64:
8dc24ff4 1079 if (HAVE_ATOMIC128_RW) {
35c653c4
RH
1080 uint64_t val_le;
1081 int s2 = pi & 15;
1082 int s1 = 16 - s2;
1083
1084 if (HOST_BIG_ENDIAN) {
1085 val = bswap128(val);
1086 }
1087 switch (s2) {
1088 case 1 ... 7:
1089 val_le = store_whole_le16(pv, s1, val);
1090 store_bytes_leN(pv + s1, s2, val_le);
1091 break;
1092 case 9 ... 15:
1093 store_bytes_leN(pv, s1, int128_getlo(val));
1094 val = int128_urshift(val, s1 * 8);
1095 store_whole_le16(pv + s1, s2, val);
1096 break;
1097 case 0: /* aligned */
1098 case 8: /* atmax MO_64 */
1099 default:
1100 g_assert_not_reached();
1101 }
1102 return;
1103 }
1104 break;
1105 case MO_128:
35c653c4
RH
1106 break;
1107 default:
1108 g_assert_not_reached();
1109 }
73fda56f 1110 cpu_loop_exit_atomic(cpu, ra);
35c653c4 1111}