accel/tcg/ldst_atomicity.c.inc
/*
 * Routines common to user and system emulation of load/store.
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "host/load-extract-al16-al8.h"
#include "host/store-insert-al16.h"

#ifdef CONFIG_ATOMIC64
# define HAVE_al8 true
#else
# define HAVE_al8 false
#endif
#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8)

/**
 * required_atomicity:
 *
 * Return the lg2 bytes of atomicity required by @memop for @p.
 * If the operation must be split into two operations to be
 * examined separately for atomicity, return -lg2.
 */
static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
{
    MemOp atom = memop & MO_ATOM_MASK;
    MemOp size = memop & MO_SIZE;
    MemOp half = size ? size - 1 : 0;
    unsigned tmp;
    int atmax;

    switch (atom) {
    case MO_ATOM_NONE:
        atmax = MO_8;
        break;

    case MO_ATOM_IFALIGN_PAIR:
        size = half;
        /* fall through */

    case MO_ATOM_IFALIGN:
        tmp = (1 << size) - 1;
        atmax = p & tmp ? MO_8 : size;
        break;

    case MO_ATOM_WITHIN16:
        tmp = p & 15;
        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
        break;

    case MO_ATOM_WITHIN16_PAIR:
        tmp = p & 15;
        if (tmp + (1 << size) <= 16) {
            atmax = size;
        } else if (tmp + (1 << half) == 16) {
            /*
             * The pair exactly straddles the boundary.
             * Both halves are naturally aligned and atomic.
             */
            atmax = half;
        } else {
            /*
             * One of the pair crosses the boundary, and is non-atomic.
             * The other of the pair does not cross, and is atomic.
             */
            atmax = -half;
        }
        break;

    case MO_ATOM_SUBALIGN:
        /*
         * Examine the alignment of p to determine if there are subobjects
         * that must be aligned.  Note that we only really need ctz4() --
         * any more significant bits are discarded by the immediately
         * following comparison.
         */
        tmp = ctz32(p);
        atmax = MIN(size, tmp);
        break;

    default:
        g_assert_not_reached();
    }

    /*
     * Here we have the architectural atomicity of the operation.
     * However, when executing in a serial context, we need no extra
     * host atomicity in order to avoid racing.  This reduction
     * avoids looping with cpu_loop_exit_atomic.
     */
    if (cpu_in_serial_context(env_cpu(env))) {
        return MO_8;
    }
    return atmax;
}
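
/*
 * A worked example of the MO_ATOM_WITHIN16_PAIR case above, for an
 * 8-byte access (size == MO_64, half == MO_32):
 *   p % 16 == 12: tmp + 8 > 16 but tmp + 4 == 16, so the two 4-byte
 *                 halves are both naturally aligned: atmax = MO_32.
 *   p % 16 == 13: tmp + 8 > 16 and tmp + 4 != 16, so one 4-byte half
 *                 crosses the 16-byte boundary: atmax = -MO_32, and the
 *                 caller must treat the two halves separately.
 */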

/**
 * load_atomic2:
 * @pv: host address
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    return qatomic_read(p);
}

/**
 * load_atomic4:
 * @pv: host address
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    return qatomic_read(p);
}

/**
 * load_atomic8:
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 */
static inline uint64_t load_atomic8(void *pv)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    return qatomic_read__nocheck(p);
}

/**
 * load_atomic8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    if (HAVE_al8) {
        return load_atomic8(pv);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
        uint64_t *p = __builtin_assume_aligned(pv, 8);
        return *p;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atomic16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    Int128 *p = __builtin_assume_aligned(pv, 16);

    if (HAVE_ATOMIC128_RO) {
        return atomic16_read_ro(p);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * We can only use cmpxchg to emulate a load if the page is writable.
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
        return *p;
    }
#endif

    /*
     * In system mode all guest pages are writable, and for user-only
     * we have just checked writability.  Try cmpxchg.
     */
    if (HAVE_ATOMIC128_RW) {
        return atomic16_read_rw(p);
    }

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atom_extract_al4x2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, from two sequential atomic 4-byte loads.
 */
static uint32_t load_atom_extract_al4x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 3) * 8;
    uint32_t a, b;

    pv = (void *)(pi & ~3);
    a = load_atomic4(pv);
    b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 31));
    } else {
        return (a >> sh) | (b << (-sh & 31));
    }
}
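
/*
 * For example, on a little-endian host with pv % 4 == 1 (sh == 8), the
 * extraction above computes (a >> 8) | (b << 24): the upper three bytes
 * of the first aligned word and the lowest byte of the second, which
 * are exactly the four bytes starting at pv.
 */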

/**
 * load_atom_extract_al8x2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, from two sequential atomic 8-byte loads.
 */
static uint64_t load_atom_extract_al8x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 7) * 8;
    uint64_t a, b;

    pv = (void *)(pi & ~7);
    a = load_atomic8(pv);
    b = load_atomic8(pv + 8);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 63));
    } else {
        return (a >> sh) | (b << (-sh & 63));
    }
}

/**
 * load_atom_extract_al8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 4.
 *
 * Atomically load @s bytes from @pv, when pv % @s != 0, and [pv, pv+@s-1]
 * does not cross an 8-byte boundary.  This means that we can perform an
 * atomic 8-byte load and extract.
 * The value is returned in the low bits of a uint32_t.
 */
static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
                                              void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    pv = (void *)(pi & ~7);
    return load_atomic8_or_exit(env, ra, pv) >> shr;
}
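
/*
 * For example, a 4-byte load with pv % 8 == 1 extracts from the aligned
 * quadword with shr == 8 on a little-endian host, or with
 * shr == (8 - 4 - 1) * 8 == 24 on a big-endian host; either way the
 * result is bytes 1..4 of the aligned quadword, i.e. the object at pv.
 */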

/**
 * load_atom_extract_al16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Atomically load @s bytes from @pv, when pv % 16 < 8
 * and pv % 16 + @s > 8.  I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
 * This is the slow version, so we must have eliminated
 * any faster load_atom_extract_al8_or_exit case.
 *
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
                                               void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    /*
     * Note constraints above: pv & 8 must be clear.
     * Provoke SIGBUS if possible otherwise.
     */
    pv = (void *)(pi & ~7);
    r = load_atomic16_or_exit(env, ra, pv);

    r = int128_urshift(r, shr);
    return int128_getlo(r);
}
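
/*
 * For example, an 8-byte load at pv % 16 == 3 reads the aligned 16-byte
 * quantity and, on a little-endian host, shifts right by o * 8 == 24 so
 * that bytes 3..10 land in the low 64 bits of the result.
 */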

/**
 * load_atom_4_by_2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two 2-byte atomic loads.
 */
static inline uint32_t load_atom_4_by_2(void *pv)
{
    uint32_t a = load_atomic2(pv);
    uint32_t b = load_atomic2(pv + 2);

    if (HOST_BIG_ENDIAN) {
        return (a << 16) | b;
    } else {
        return (b << 16) | a;
    }
}

/**
 * load_atom_8_by_2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with four 2-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_2(void *pv)
{
    uint32_t a = load_atom_4_by_2(pv);
    uint32_t b = load_atom_4_by_2(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_4:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two 4-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_4(void *pv)
{
    uint32_t a = load_atomic4(pv);
    uint32_t b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_8_or_4:
 * @pv: host address
 *
 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
 */
static inline uint64_t load_atom_8_by_8_or_4(void *pv)
{
    if (HAVE_al8_fast) {
        return load_atomic8(pv);
    } else {
        return load_atom_8_by_4(pv);
    }
}

/**
 * load_atom_2:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 2 bytes from @pv, honoring the atomicity of @memop.
 */
static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        return load_atomic2(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 2);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        return lduw_he_p(pv);
    case MO_16:
        /* The only case remaining is MO_ATOM_WITHIN16. */
        if (!HAVE_al8_fast && (pi & 3) == 1) {
            /* Big or little endian, we want the middle two bytes. */
            return load_atomic4(pv - 1) >> 8;
        }
        if ((pi & 15) != 7) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
    default:
        g_assert_not_reached();
    }
}
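
/*
 * In the MO_16 case above, only pi % 16 == 7 takes the 16-byte path:
 * there the two bytes straddle the 8-byte boundary in the middle of an
 * aligned 16-byte quantity.  Every other misaligned offset that still
 * requires MO_16 atomicity lies within a single aligned 8-byte quantity.
 */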

/**
 * load_atom_4:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 4 bytes from @pv, honoring the atomicity of @memop.
 */
static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        return load_atomic4(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 4);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
    case MO_16:
    case -MO_16:
        /*
         * For MO_ATOM_IFALIGN, this is more atomicity than required,
         * but it's trivially supported on all hosts, better than 4
         * individual byte loads (when the host requires alignment),
         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
         */
        return load_atom_extract_al4x2(pv);
    case MO_32:
        if (!(pi & 4)) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_8:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 8 bytes from @pv, honoring the atomicity of @memop.
 */
static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    /*
     * If the host does not support 8-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al8 && likely((pi & 7) == 0)) {
        return load_atomic8(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 8);
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_64) {
        if (!HAVE_al8 && (pi & 7) == 0) {
            return load_atomic8_or_exit(env, ra, pv);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
    }
    if (HAVE_al8_fast) {
        return load_atom_extract_al8x2(pv);
    }
    switch (atmax) {
    case MO_8:
        return ldq_he_p(pv);
    case MO_16:
        return load_atom_8_by_2(pv);
    case MO_32:
        return load_atom_8_by_4(pv);
    case -MO_32:
        if (HAVE_al8) {
            return load_atom_extract_al8x2(pv);
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_16:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 16 bytes from @pv, honoring the atomicity of @memop.
 */
static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
                           void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;
    Int128 r;
    uint64_t a, b;

    /*
     * If the host does not support 16-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
        return atomic16_read_ro(pv);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        memcpy(&r, pv, 16);
        return r;
    case MO_16:
        a = load_atom_8_by_2(pv);
        b = load_atom_8_by_2(pv + 8);
        break;
    case MO_32:
        a = load_atom_8_by_4(pv);
        b = load_atom_8_by_4(pv + 8);
        break;
    case MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atomic8(pv);
        b = load_atomic8(pv + 8);
        break;
    case -MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atom_extract_al8x2(pv);
        b = load_atom_extract_al8x2(pv + 8);
        break;
    case MO_128:
        return load_atomic16_or_exit(env, ra, pv);
    default:
        g_assert_not_reached();
    }
    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
}

/**
 * store_atomic2:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    qatomic_set(p, val);
}

/**
 * store_atomic4:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    qatomic_set(p, val);
}

/**
 * store_atomic8:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 8 aligned bytes to @pv.
 */
static inline void store_atomic8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    qatomic_set__nocheck(p, val);
}

/**
 * store_atom_4_by_2
 */
static inline void store_atom_4_by_2(void *pv, uint32_t val)
{
    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
}

/**
 * store_atom_8_by_2
 */
static inline void store_atom_8_by_2(void *pv, uint64_t val)
{
    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_8_by_4
 */
static inline void store_atom_8_by_4(void *pv, uint64_t val)
{
    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_insert_al4:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
 */
static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
{
    uint32_t old, new;

    p = __builtin_assume_aligned(p, 4);
    old = qatomic_read(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al8:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old, new;

    qemu_build_assert(HAVE_al8);
    p = __builtin_assume_aligned(p, 8);
    old = qatomic_read__nocheck(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @pv.  The bytes to store are extracted in
 * little-endian order from @val_le; return the bytes of @val_le
 * beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;

    for (int i = 0; i < size; i++, val_le >>= 8) {
        p[i] = val_le;
    }
    return val_le;
}
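
/*
 * For example, store_bytes_leN(p, 3, 0x8877665544332211) writes 0x11,
 * 0x22, 0x33 to p[0..2] and returns 0x0000008877665544, so the caller
 * can continue with the five bytes not yet stored.
 */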

/**
 * store_parts_leN
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically on each aligned part.
 */
G_GNUC_UNUSED
static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
{
    do {
        int n;

        /* Find minimum of alignment and size */
        switch (((uintptr_t)pv | size) & 7) {
        case 4:
            store_atomic4(pv, le32_to_cpu(val_le));
            val_le >>= 32;
            n = 4;
            break;
        case 2:
        case 6:
            store_atomic2(pv, le16_to_cpu(val_le));
            val_le >>= 16;
            n = 2;
            break;
        default:
            *(uint8_t *)pv = val_le;
            val_le >>= 8;
            n = 1;
            break;
        case 0:
            g_assert_not_reached();
        }
        pv += n;
        size -= n;
    } while (size != 0);

    return val_le;
}
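
/*
 * For example, with pv % 8 == 1 and size == 7, the loop above stores
 * one byte at the odd address, then two bytes at pv % 8 == 2, then four
 * bytes at pv % 8 == 4: each part atomically at its natural alignment.
 */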

/**
 * store_whole_le4
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Four aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 3;
    int sh = o * 8;
    uint32_t m = MAKE_64BIT_MASK(0, sz);
    uint32_t v;

    if (HOST_BIG_ENDIAN) {
        v = bswap32(val_le) >> sh;
        m = bswap32(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al4(pv - o, v, m);
    return val_le >> sz;
}
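
/*
 * For example, storing two bytes at pv % 4 == 1 on a little-endian host
 * yields sh == 8 and m == 0x00ffff00: the insert rewrites only bytes 1
 * and 2 of the aligned word at pv - 1, leaving bytes 0 and 3 intact.
 */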

/**
 * store_whole_le8
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Eight aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 7;
    int sh = o * 8;
    uint64_t m = MAKE_64BIT_MASK(0, sz);
    uint64_t v;

    qemu_build_assert(HAVE_al8);
    if (HOST_BIG_ENDIAN) {
        v = bswap64(val_le) >> sh;
        m = bswap64(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al8(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le16
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * 16 aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 15;
    int sh = o * 8;
    Int128 m, v;

    qemu_build_assert(HAVE_ATOMIC128_RW);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
        m = int128_make64(MAKE_64BIT_MASK(0, sz));
    } else {
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    }

    if (HOST_BIG_ENDIAN) {
        v = int128_urshift(bswap128(val_le), sh);
        m = int128_urshift(bswap128(m), sh);
    } else {
        v = int128_lshift(val_le, sh);
        m = int128_lshift(m, sh);
    }
    store_atom_insert_al16(pv - o, v, m);

    /* Unused if sz <= 64. */
    return int128_gethi(val_le) >> (sz - 64);
}
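
/*
 * For example, a 10-byte store has sz == 80, so the mask above becomes
 * int128_make128(-1, 0xffff): all of the low quadword plus the low two
 * bytes of the high quadword, before being shifted into position.
 */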

/**
 * store_atom_2:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 2 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_2(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint16_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        store_atomic2(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_8) {
        stw_he_p(pv, val);
        return;
    }

    /*
     * The only case remaining is MO_ATOM_WITHIN16.
     * Big or little endian, we want the middle two bytes in each test.
     */
    if ((pi & 3) == 1) {
        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        return;
    } else if ((pi & 7) == 3) {
        if (HAVE_al8) {
            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
            return;
        }
    } else if ((pi & 15) == 7) {
        if (HAVE_ATOMIC128_RW) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
            return;
        }
    } else {
        g_assert_not_reached();
    }

    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * store_atom_4:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 4 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_4(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint32_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        store_atomic4(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stl_he_p(pv, val);
        return;
    case MO_16:
        store_atom_4_by_2(pv, val);
        return;
    case -MO_16:
        {
            uint32_t val_le = cpu_to_le32(val);
            int s2 = pi & 3;
            int s1 = 4 - s2;

            switch (s2) {
            case 1:
                val_le = store_whole_le4(pv, s1, val_le);
                *(uint8_t *)(pv + 3) = val_le;
                break;
            case 3:
                *(uint8_t *)pv = val_le;
                store_whole_le4(pv + 1, s2, val_le >> 8);
                break;
            case 0: /* aligned */
            case 2: /* atmax MO_16 */
            default:
                g_assert_not_reached();
            }
        }
        return;
    case MO_32:
        if ((pi & 7) < 4) {
            if (HAVE_al8) {
                store_whole_le8(pv, 4, cpu_to_le32(val));
                return;
            }
        } else {
            if (HAVE_ATOMIC128_RW) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * store_atom_8:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 8 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_8(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint64_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (HAVE_al8 && likely((pi & 7) == 0)) {
        store_atomic8(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stq_he_p(pv, val);
        return;
    case MO_16:
        store_atom_8_by_2(pv, val);
        return;
    case MO_32:
        store_atom_8_by_4(pv, val);
        return;
    case -MO_32:
        if (HAVE_al8) {
            uint64_t val_le = cpu_to_le64(val);
            int s2 = pi & 7;
            int s1 = 8 - s2;

            switch (s2) {
            case 1 ... 3:
                val_le = store_whole_le8(pv, s1, val_le);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 5 ... 7:
                val_le = store_bytes_leN(pv, s1, val_le);
                store_whole_le8(pv + s1, s2, val_le);
                break;
            case 0: /* aligned */
            case 4: /* atmax MO_32 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_64:
        if (HAVE_ATOMIC128_RW) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}
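
/*
 * In the -MO_32 case above, consider pi % 16 == 11 (s2 == 3, s1 == 5):
 * the aligned 8-byte insert covers the five bytes up to the 16-byte
 * boundary, which include the 4-byte half that must remain atomic,
 * while the three bytes crossing the boundary are stored one at a time.
 */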

/**
 * store_atom_16:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 16 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_16(CPUArchState *env, uintptr_t ra,
                          void *pv, MemOp memop, Int128 val)
{
    uintptr_t pi = (uintptr_t)pv;
    uint64_t a, b;
    int atmax;

    if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
        atomic16_set(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);

    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
    switch (atmax) {
    case MO_8:
        memcpy(pv, &val, 16);
        return;
    case MO_16:
        store_atom_8_by_2(pv, a);
        store_atom_8_by_2(pv + 8, b);
        return;
    case MO_32:
        store_atom_8_by_4(pv, a);
        store_atom_8_by_4(pv + 8, b);
        return;
    case MO_64:
        if (HAVE_al8) {
            store_atomic8(pv, a);
            store_atomic8(pv + 8, b);
            return;
        }
        break;
    case -MO_64:
        if (HAVE_ATOMIC128_RW) {
            uint64_t val_le;
            int s2 = pi & 15;
            int s1 = 16 - s2;

            if (HOST_BIG_ENDIAN) {
                val = bswap128(val);
            }
            switch (s2) {
            case 1 ... 7:
                val_le = store_whole_le16(pv, s1, val);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 9 ... 15:
                store_bytes_leN(pv, s1, int128_getlo(val));
                val = int128_urshift(val, s1 * 8);
                store_whole_le16(pv + s1, s2, val);
                break;
            case 0: /* aligned */
            case 8: /* atmax MO_64 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_128:
        if (HAVE_ATOMIC128_RW) {
            atomic16_set(pv, val);
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}