// SPDX-License-Identifier: Apache-2.0 OR MIT

// Atomic{I,U}128 implementation on AArch64.
//
// There are a few ways to implement 128-bit atomic operations in AArch64.
//
// - LDXP/STXP loop (DW LL/SC)
// - CASP (DWCAS) added as FEAT_LSE (mandatory from armv8.1-a)
// - LDP/STP (DW load/store) if FEAT_LSE2 (optional from armv8.2-a, mandatory from armv8.4-a) is available
// - LDIAPP/STILP (DW acquire-load/release-store) added as FEAT_LRCPC3 (optional from armv8.9-a/armv9.4-a) (if FEAT_LSE2 is also available)
// - LDCLRP/LDSETP/SWPP (DW RMW) added as FEAT_LSE128 (optional from armv9.4-a)
//
// If outline-atomics is not enabled and FEAT_LSE is not available at
// compile-time, we use LDXP/STXP loop.
// If outline-atomics is enabled and FEAT_LSE is not available at
// compile-time, we use CASP for CAS if FEAT_LSE is available
// at run-time, otherwise, use LDXP/STXP loop.
// If FEAT_LSE is available at compile-time, we use CASP for load/store/CAS/RMW.
// However, when portable_atomic_ll_sc_rmw cfg is set, use LDXP/STXP loop instead of CASP
// loop for RMW (by default, it is set on Apple hardware; see build script for details).
// If FEAT_LSE2 is available at compile-time, we use LDP/STP for load/store.
// If FEAT_LSE128 is available at compile-time, we use LDCLRP/LDSETP/SWPP for fetch_and/fetch_or/swap/{release,seqcst}-store.
// If FEAT_LSE2 and FEAT_LRCPC3 are available at compile-time, we use LDIAPP/STILP for acquire-load/release-store.
//
// Note: FEAT_LSE2 doesn't imply FEAT_LSE. FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2.
//
// Note that we do not separate LL and SC into separate functions, but handle
// them within a single asm block. This is because it is theoretically possible
// for the compiler to insert operations that might clear the reservation between
// LL and SC. Considering the type of operations we are providing and the fact
// that [progress64](https://github.com/ARM-software/progress64) uses such code,
// this is probably not a problem for aarch64, but it seems that aarch64 doesn't
// guarantee it and hexagon is the only architecture with hardware guarantees
// that such code works. See also:
//
// - https://yarchive.net/comp/linux/cmpxchg_ll_sc_portability.html
// - https://lists.llvm.org/pipermail/llvm-dev/2016-May/099490.html
// - https://lists.llvm.org/pipermail/llvm-dev/2018-June/123993.html
//
// Also, even when using a CAS loop to implement atomic RMW, include the loop itself
// in the asm block because it is more efficient for some codegen backends.
// https://github.com/rust-lang/compiler-builtins/issues/339#issuecomment-1191260474
//
// Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use
// this module and use intrinsics.rs instead.
//
// Refs:
// - ARM Compiler armasm User Guide
//   https://developer.arm.com/documentation/dui0801/latest
// - Arm A-profile A64 Instruction Set Architecture
//   https://developer.arm.com/documentation/ddi0602/latest
// - Arm Architecture Reference Manual for A-profile architecture
//   https://developer.arm.com/documentation/ddi0487/latest
// - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit
//
// Generated asm:
// - aarch64 https://godbolt.org/z/5Mz1E33vz
// - aarch64 msvc https://godbolt.org/z/P53d1MsGY
// - aarch64 (+lse) https://godbolt.org/z/qvaE8n79K
// - aarch64 msvc (+lse) https://godbolt.org/z/dj4aYerfr
// - aarch64 (+lse,+lse2) https://godbolt.org/z/1E15jjxah
// - aarch64 (+lse,+lse2,+rcpc3) https://godbolt.org/z/YreM4n84o
// - aarch64 (+lse2,+lse128) https://godbolt.org/z/Kfeqs54ox
// - aarch64 (+lse2,+lse128,+rcpc3) https://godbolt.org/z/n6zhjE77s

66 | include!("macros.rs"); | |
67 | ||
68 | // On musl with static linking, it seems that getauxval is not always available. | |
69 | // See detect/auxv.rs for more. | |
70 | #[cfg(not(portable_atomic_no_outline_atomics))] | |
ed00b5ec FG |
71 | #[cfg(any( |
72 | test, | |
73 | not(all( | |
74 | any(target_feature = "lse2", portable_atomic_target_feature = "lse2"), | |
75 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
76 | )), | |
77 | ))] | |
781aab86 FG |
78 | #[cfg(any( |
79 | all( | |
80 | target_os = "linux", | |
81 | any( | |
82 | target_env = "gnu", | |
83 | all(any(target_env = "musl", target_env = "ohos"), not(target_feature = "crt-static")), | |
84 | portable_atomic_outline_atomics, | |
85 | ), | |
86 | ), | |
87 | target_os = "android", | |
88 | target_os = "freebsd", | |
89 | ))] | |
90 | #[path = "detect/auxv.rs"] | |
91 | mod detect; | |
92 | #[cfg(not(portable_atomic_no_outline_atomics))] | |
ed00b5ec FG |
93 | #[cfg_attr( |
94 | target_os = "netbsd", | |
95 | cfg(any( | |
96 | test, | |
97 | not(all( | |
98 | any(target_feature = "lse2", portable_atomic_target_feature = "lse2"), | |
99 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
100 | )), | |
101 | )) | |
102 | )] | |
103 | #[cfg_attr( | |
104 | target_os = "openbsd", | |
105 | cfg(any(test, not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))) | |
106 | )] | |
107 | #[cfg(any(target_os = "netbsd", target_os = "openbsd"))] | |
781aab86 FG |
108 | #[path = "detect/aarch64_aa64reg.rs"] |
109 | mod detect; | |
110 | #[cfg(not(portable_atomic_no_outline_atomics))] | |
111 | #[cfg(any(test, not(any(target_feature = "lse", portable_atomic_target_feature = "lse"))))] | |
112 | #[cfg(target_os = "fuchsia")] | |
113 | #[path = "detect/aarch64_fuchsia.rs"] | |
114 | mod detect; | |
115 | #[cfg(not(portable_atomic_no_outline_atomics))] | |
116 | #[cfg(any(test, not(any(target_feature = "lse", portable_atomic_target_feature = "lse"))))] | |
117 | #[cfg(target_os = "windows")] | |
118 | #[path = "detect/aarch64_windows.rs"] | |
119 | mod detect; | |
120 | ||
121 | // test only | |
122 | #[cfg(test)] | |
123 | #[cfg(not(qemu))] | |
124 | #[cfg(not(valgrind))] | |
125 | #[cfg(not(portable_atomic_no_outline_atomics))] | |
126 | #[cfg(any(target_os = "linux", target_os = "android", target_os = "freebsd"))] | |
127 | #[path = "detect/aarch64_aa64reg.rs"] | |
128 | mod detect_aa64reg; | |
129 | #[cfg(test)] | |
130 | #[cfg(not(portable_atomic_no_outline_atomics))] | |
131 | #[cfg(target_os = "macos")] | |
132 | #[path = "detect/aarch64_macos.rs"] | |
133 | mod detect_macos; | |
134 | ||
135 | #[cfg(not(portable_atomic_no_asm))] | |
136 | use core::arch::asm; | |
137 | use core::sync::atomic::Ordering; | |
138 | ||
ed00b5ec FG |
139 | use crate::utils::{Pair, U128}; |
140 | ||
// Asserts (in debug builds only) that FEAT_LSE has actually been detected at
// run time, on targets where outline-atomics detection is available and FEAT_LSE
// is not already guaranteed at compile time.
#[cfg(any(
    target_feature = "lse",
    portable_atomic_target_feature = "lse",
    not(portable_atomic_no_outline_atomics),
))]
macro_rules! debug_assert_lse {
    () => {
        #[cfg(all(
            not(portable_atomic_no_outline_atomics),
            any(
                all(
                    target_os = "linux",
                    any(
                        target_env = "gnu",
                        all(
                            any(target_env = "musl", target_env = "ohos"),
                            not(target_feature = "crt-static"),
                        ),
                        portable_atomic_outline_atomics,
                    ),
                ),
                target_os = "android",
                target_os = "freebsd",
                target_os = "netbsd",
                target_os = "openbsd",
                target_os = "fuchsia",
                target_os = "windows",
            ),
        ))]
        #[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))]
        {
            debug_assert!(detect::detect().has_lse());
        }
    };
}
// Asserts (in debug builds only) that FEAT_LSE2 has actually been detected at
// run time, on targets where FEAT_LSE2 detection is supported and FEAT_LSE2 is
// not already guaranteed at compile time.
#[rustfmt::skip]
#[cfg(any(
    target_feature = "lse2",
    portable_atomic_target_feature = "lse2",
    not(portable_atomic_no_outline_atomics),
))]
macro_rules! debug_assert_lse2 {
    () => {
        #[cfg(all(
            not(portable_atomic_no_outline_atomics),
            any(
                all(
                    target_os = "linux",
                    any(
                        target_env = "gnu",
                        all(
                            any(target_env = "musl", target_env = "ohos"),
                            not(target_feature = "crt-static"),
                        ),
                        portable_atomic_outline_atomics,
                    ),
                ),
                target_os = "android",
                target_os = "freebsd",
                target_os = "netbsd",
                // These don't support detection of FEAT_LSE2.
                // target_os = "openbsd",
                // target_os = "fuchsia",
                // target_os = "windows",
            ),
        ))]
        #[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))]
        {
            debug_assert!(detect::detect().has_lse2());
        }
    };
}
// Refs: https://developer.arm.com/documentation/100067/0612/armclang-Integrated-Assembler/AArch32-Target-selection-directives?lang=en
//
// This is similar to #[target_feature(enable = "lse")], except that there are
// no compiler guarantees regarding (un)inlining, and the scope is within an asm
// block rather than a function. We use this directive to support outline-atomics
// on pre-1.61 rustc (aarch64_target_feature stabilized in Rust 1.61).
//
// The .arch_extension directive is effective until the end of the assembly block and
// is not propagated to subsequent code, so the end_lse macro is unneeded.
// https://godbolt.org/z/4oMEW8vWc
// https://github.com/torvalds/linux/commit/e0d5896bd356cd577f9710a02d7a474cdf58426b
// https://github.com/torvalds/linux/commit/dd1f6308b28edf0452dd5dc7877992903ec61e69
// (It seems GCC effectively ignores this directive and always allow FEAT_LSE instructions: https://godbolt.org/z/W9W6rensG)
//
// The .arch directive has a similar effect, but we don't use it due to the following issue:
// https://github.com/torvalds/linux/commit/dd1f6308b28edf0452dd5dc7877992903ec61e69
//
// This is also needed for compatibility with rustc_codegen_cranelift:
// https://github.com/rust-lang/rustc_codegen_cranelift/issues/1400#issuecomment-1774599775
//
// Note: If FEAT_LSE is not available at compile-time, we must guarantee that
// the function that uses it is not inlined into a function where it is not
// clear whether FEAT_LSE is available. Otherwise, (even if we checked whether
// FEAT_LSE is available at run-time) optimizations that reorder its
// instructions across the if condition might introduce undefined behavior.
// (see also https://rust-lang.github.io/rfcs/2045-target-feature.html#safely-inlining-target_feature-functions-on-more-contexts)
// However, our code uses the ifunc helper macro that works with function pointers,
// so we don't have to worry about this unless calling without helper macro.
#[cfg(any(
    target_feature = "lse",
    portable_atomic_target_feature = "lse",
    not(portable_atomic_no_outline_atomics),
))]
macro_rules! start_lse {
    () => {
        ".arch_extension lse"
    };
}
// Expands to the first argument on little-endian targets and to the second
// argument on big-endian targets. Used to pick the correct half of a 128-bit
// value without run-time cost.
#[cfg(target_endian = "little")]
macro_rules! select_le_or_be {
    ($le:expr, $be:expr) => {
        $le
    };
}
#[cfg(target_endian = "big")]
macro_rules! select_le_or_be {
    ($le:expr, $be:expr) => {
        $be
    };
}
// Expands an RMW asm macro `$op!(acquire, release, fence)` for the given
// memory ordering. The optional `write = $write` form lets callers pass the
// ordering of the *write* part separately (used for the MSVC SeqCst fence rule).
macro_rules! atomic_rmw {
    ($op:ident, $order:ident) => {
        atomic_rmw!($op, $order, write = $order)
    };
    ($op:ident, $order:ident, write = $write:ident) => {
        match $order {
            Ordering::Relaxed => $op!("", "", ""),
            Ordering::Acquire => $op!("a", "", ""),
            Ordering::Release => $op!("", "l", ""),
            Ordering::AcqRel => $op!("a", "l", ""),
            // In MSVC environments, SeqCst stores/writes needs fences after writes.
            // https://reviews.llvm.org/D141748
            #[cfg(target_env = "msvc")]
            Ordering::SeqCst if $write == Ordering::SeqCst => $op!("a", "l", "dmb ish"),
            // AcqRel and SeqCst RMWs are equivalent in non-MSVC environments.
            Ordering::SeqCst => $op!("a", "l", ""),
            _ => unreachable!("{:?}", $order),
        }
    };
}
286 | ||
ed00b5ec FG |
287 | // cfg guarantee that the CPU supports FEAT_LSE2. |
288 | #[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2"))] | |
289 | use _atomic_load_ldp as atomic_load; | |
290 | #[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))] | |
781aab86 FG |
291 | #[inline] |
292 | unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { | |
ed00b5ec FG |
293 | #[inline] |
294 | unsafe fn atomic_load_no_lse2(src: *mut u128, order: Ordering) -> u128 { | |
781aab86 FG |
295 | #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] |
296 | // SAFETY: the caller must uphold the safety contract. | |
297 | // cfg guarantee that the CPU supports FEAT_LSE. | |
298 | unsafe { | |
299 | _atomic_load_casp(src, order) | |
300 | } | |
301 | #[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))] | |
302 | // SAFETY: the caller must uphold the safety contract. | |
303 | unsafe { | |
304 | _atomic_load_ldxp_stxp(src, order) | |
305 | } | |
306 | } | |
ed00b5ec FG |
307 | #[cfg(not(all( |
308 | not(portable_atomic_no_outline_atomics), | |
309 | any( | |
310 | all( | |
311 | target_os = "linux", | |
312 | any( | |
313 | target_env = "gnu", | |
314 | all( | |
315 | any(target_env = "musl", target_env = "ohos"), | |
316 | not(target_feature = "crt-static"), | |
317 | ), | |
318 | portable_atomic_outline_atomics, | |
319 | ), | |
320 | ), | |
321 | target_os = "android", | |
322 | target_os = "freebsd", | |
323 | target_os = "netbsd", | |
324 | // These don't support detection of FEAT_LSE2. | |
325 | // target_os = "openbsd", | |
326 | // target_os = "fuchsia", | |
327 | // target_os = "windows", | |
328 | ), | |
329 | )))] | |
330 | // SAFETY: the caller must uphold the safety contract. | |
331 | unsafe { | |
332 | atomic_load_no_lse2(src, order) | |
333 | } | |
334 | #[cfg(all( | |
335 | not(portable_atomic_no_outline_atomics), | |
336 | any( | |
337 | all( | |
338 | target_os = "linux", | |
339 | any( | |
340 | target_env = "gnu", | |
341 | all( | |
342 | any(target_env = "musl", target_env = "ohos"), | |
343 | not(target_feature = "crt-static"), | |
344 | ), | |
345 | portable_atomic_outline_atomics, | |
346 | ), | |
347 | ), | |
348 | target_os = "android", | |
349 | target_os = "freebsd", | |
350 | target_os = "netbsd", | |
351 | // These don't support detection of FEAT_LSE2. | |
352 | // target_os = "openbsd", | |
353 | // target_os = "fuchsia", | |
354 | // target_os = "windows", | |
355 | ), | |
356 | ))] | |
357 | { | |
358 | fn_alias! { | |
359 | // inline(never) is just a hint and also not strictly necessary | |
360 | // because we use ifunc helper macro, but used for clarity. | |
361 | #[inline(never)] | |
362 | unsafe fn(src: *mut u128) -> u128; | |
363 | atomic_load_lse2_relaxed = _atomic_load_ldp(Ordering::Relaxed); | |
364 | atomic_load_lse2_acquire = _atomic_load_ldp(Ordering::Acquire); | |
365 | atomic_load_lse2_seqcst = _atomic_load_ldp(Ordering::SeqCst); | |
366 | } | |
367 | fn_alias! { | |
368 | unsafe fn(src: *mut u128) -> u128; | |
369 | atomic_load_no_lse2_relaxed = atomic_load_no_lse2(Ordering::Relaxed); | |
370 | atomic_load_no_lse2_acquire = atomic_load_no_lse2(Ordering::Acquire); | |
371 | atomic_load_no_lse2_seqcst = atomic_load_no_lse2(Ordering::SeqCst); | |
372 | } | |
373 | // SAFETY: the caller must uphold the safety contract. | |
374 | // and we've checked if FEAT_LSE2 is available. | |
375 | unsafe { | |
376 | match order { | |
377 | Ordering::Relaxed => { | |
378 | ifunc!(unsafe fn(src: *mut u128) -> u128 { | |
379 | let cpuinfo = detect::detect(); | |
380 | if cpuinfo.has_lse2() { | |
381 | atomic_load_lse2_relaxed | |
382 | } else { | |
383 | atomic_load_no_lse2_relaxed | |
384 | } | |
385 | }) | |
386 | } | |
387 | Ordering::Acquire => { | |
388 | ifunc!(unsafe fn(src: *mut u128) -> u128 { | |
389 | let cpuinfo = detect::detect(); | |
390 | if cpuinfo.has_lse2() { | |
391 | atomic_load_lse2_acquire | |
392 | } else { | |
393 | atomic_load_no_lse2_acquire | |
394 | } | |
395 | }) | |
396 | } | |
397 | Ordering::SeqCst => { | |
398 | ifunc!(unsafe fn(src: *mut u128) -> u128 { | |
399 | let cpuinfo = detect::detect(); | |
400 | if cpuinfo.has_lse2() { | |
401 | atomic_load_lse2_seqcst | |
402 | } else { | |
403 | atomic_load_no_lse2_seqcst | |
404 | } | |
405 | }) | |
406 | } | |
407 | _ => unreachable!("{:?}", order), | |
408 | } | |
409 | } | |
410 | } | |
781aab86 | 411 | } |
ed00b5ec | 412 | // If CPU supports FEAT_LSE2, LDP/LDIAPP is single-copy atomic reads, |
781aab86 FG |
413 | // otherwise it is two single-copy atomic reads. |
414 | // Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile | |
ed00b5ec FG |
415 | #[cfg(any( |
416 | target_feature = "lse2", | |
417 | portable_atomic_target_feature = "lse2", | |
418 | not(portable_atomic_no_outline_atomics), | |
419 | ))] | |
781aab86 | 420 | #[inline] |
ed00b5ec | 421 | unsafe fn _atomic_load_ldp(src: *mut u128, order: Ordering) -> u128 { |
781aab86 | 422 | debug_assert!(src as usize % 16 == 0); |
ed00b5ec | 423 | debug_assert_lse2!(); |
781aab86 FG |
424 | |
425 | // SAFETY: the caller must guarantee that `dst` is valid for reads, | |
426 | // 16-byte aligned, that there are no concurrent non-atomic operations. | |
427 | // | |
428 | // Refs: | |
429 | // - LDP: https://developer.arm.com/documentation/dui0801/l/A64-Data-Transfer-Instructions/LDP--A64- | |
430 | unsafe { | |
ed00b5ec | 431 | let (out_lo, out_hi); |
781aab86 FG |
432 | macro_rules! atomic_load_relaxed { |
433 | ($acquire:tt $(, $readonly:tt)?) => { | |
434 | asm!( | |
ed00b5ec | 435 | "ldp {out_lo}, {out_hi}, [{src}]", |
781aab86 FG |
436 | $acquire, |
437 | src = in(reg) ptr_reg!(src), | |
ed00b5ec FG |
438 | out_hi = lateout(reg) out_hi, |
439 | out_lo = lateout(reg) out_lo, | |
781aab86 FG |
440 | options(nostack, preserves_flags $(, $readonly)?), |
441 | ) | |
442 | }; | |
443 | } | |
444 | match order { | |
445 | Ordering::Relaxed => atomic_load_relaxed!("", readonly), | |
ed00b5ec FG |
446 | #[cfg(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3"))] |
447 | Ordering::Acquire => { | |
448 | // SAFETY: cfg guarantee that the CPU supports FEAT_LRCPC3. | |
449 | // Refs: https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDIAPP--Load-Acquire-RCpc-ordered-Pair-of-registers- | |
450 | asm!( | |
451 | "ldiapp {out_lo}, {out_hi}, [{src}]", | |
452 | src = in(reg) ptr_reg!(src), | |
453 | out_hi = lateout(reg) out_hi, | |
454 | out_lo = lateout(reg) out_lo, | |
455 | options(nostack, preserves_flags), | |
456 | ); | |
457 | } | |
458 | #[cfg(not(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3")))] | |
781aab86 FG |
459 | Ordering::Acquire => atomic_load_relaxed!("dmb ishld"), |
460 | Ordering::SeqCst => { | |
461 | asm!( | |
462 | // ldar (or dmb ishld) is required to prevent reordering with preceding stlxp. | |
463 | // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108891 for details. | |
464 | "ldar {tmp}, [{src}]", | |
ed00b5ec | 465 | "ldp {out_lo}, {out_hi}, [{src}]", |
781aab86 FG |
466 | "dmb ishld", |
467 | src = in(reg) ptr_reg!(src), | |
ed00b5ec FG |
468 | out_hi = lateout(reg) out_hi, |
469 | out_lo = lateout(reg) out_lo, | |
781aab86 FG |
470 | tmp = out(reg) _, |
471 | options(nostack, preserves_flags), | |
472 | ); | |
473 | } | |
474 | _ => unreachable!("{:?}", order), | |
475 | } | |
ed00b5ec | 476 | U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole |
781aab86 FG |
477 | } |
478 | } | |
// Do not use _atomic_compare_exchange_casp because it needs extra MOV to implement load.
#[cfg(any(test, not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2"))))]
#[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))]
#[inline]
unsafe fn _atomic_load_casp(src: *mut u128, order: Ordering) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_lse!();

    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantee that the CPU supports FEAT_LSE.
    unsafe {
        let (out_lo, out_hi);
        macro_rules! atomic_load {
            ($acquire:tt, $release:tt) => {
                asm!(
                    start_lse!(),
                    // CASP with expected == desired == 0 performs a pure load of the old value.
                    concat!("casp", $acquire, $release, " x2, x3, x2, x3, [{src}]"),
                    src = in(reg) ptr_reg!(src),
                    // must be allocated to even/odd register pair
                    inout("x2") 0_u64 => out_lo,
                    inout("x3") 0_u64 => out_hi,
                    options(nostack, preserves_flags),
                )
            };
        }
        match order {
            Ordering::Relaxed => atomic_load!("", ""),
            Ordering::Acquire => atomic_load!("a", ""),
            Ordering::SeqCst => atomic_load!("a", "l"),
            _ => unreachable!("{:?}", order),
        }
        U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
    }
}
513 | #[cfg(any( | |
514 | test, | |
515 | all( | |
516 | not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")), | |
517 | not(any(target_feature = "lse", portable_atomic_target_feature = "lse")), | |
518 | ), | |
519 | ))] | |
520 | #[inline] | |
521 | unsafe fn _atomic_load_ldxp_stxp(src: *mut u128, order: Ordering) -> u128 { | |
522 | debug_assert!(src as usize % 16 == 0); | |
523 | ||
524 | // SAFETY: the caller must uphold the safety contract. | |
525 | unsafe { | |
ed00b5ec | 526 | let (mut out_lo, mut out_hi); |
781aab86 FG |
527 | macro_rules! atomic_load { |
528 | ($acquire:tt, $release:tt) => { | |
529 | asm!( | |
530 | "2:", | |
ed00b5ec FG |
531 | concat!("ld", $acquire, "xp {out_lo}, {out_hi}, [{src}]"), |
532 | concat!("st", $release, "xp {r:w}, {out_lo}, {out_hi}, [{src}]"), | |
781aab86 FG |
533 | // 0 if the store was successful, 1 if no store was performed |
534 | "cbnz {r:w}, 2b", | |
535 | src = in(reg) ptr_reg!(src), | |
ed00b5ec FG |
536 | out_lo = out(reg) out_lo, |
537 | out_hi = out(reg) out_hi, | |
781aab86 FG |
538 | r = out(reg) _, |
539 | options(nostack, preserves_flags), | |
540 | ) | |
541 | }; | |
542 | } | |
543 | match order { | |
544 | Ordering::Relaxed => atomic_load!("", ""), | |
545 | Ordering::Acquire => atomic_load!("a", ""), | |
546 | Ordering::SeqCst => atomic_load!("a", "l"), | |
547 | _ => unreachable!("{:?}", order), | |
548 | } | |
ed00b5ec | 549 | U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole |
781aab86 FG |
550 | } |
551 | } | |
552 | ||
ed00b5ec FG |
553 | // cfg guarantee that the CPU supports FEAT_LSE2. |
554 | #[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2"))] | |
555 | use _atomic_store_stp as atomic_store; | |
556 | #[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))] | |
781aab86 FG |
557 | #[inline] |
558 | unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { | |
ed00b5ec FG |
559 | #[inline] |
560 | unsafe fn atomic_store_no_lse2(dst: *mut u128, val: u128, order: Ordering) { | |
561 | // If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set, | |
562 | // we use CAS-based atomic RMW. | |
563 | #[cfg(all( | |
564 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
565 | not(portable_atomic_ll_sc_rmw), | |
566 | ))] | |
567 | // SAFETY: the caller must uphold the safety contract. | |
568 | // cfg guarantee that the CPU supports FEAT_LSE. | |
569 | unsafe { | |
570 | _atomic_swap_casp(dst, val, order); | |
571 | } | |
572 | #[cfg(not(all( | |
573 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
574 | not(portable_atomic_ll_sc_rmw), | |
575 | )))] | |
576 | // SAFETY: the caller must uphold the safety contract. | |
577 | unsafe { | |
578 | _atomic_store_ldxp_stxp(dst, val, order); | |
579 | } | |
781aab86 | 580 | } |
ed00b5ec FG |
581 | #[cfg(not(all( |
582 | not(portable_atomic_no_outline_atomics), | |
583 | any( | |
584 | all( | |
585 | target_os = "linux", | |
586 | any( | |
587 | target_env = "gnu", | |
588 | all( | |
589 | any(target_env = "musl", target_env = "ohos"), | |
590 | not(target_feature = "crt-static"), | |
591 | ), | |
592 | portable_atomic_outline_atomics, | |
593 | ), | |
594 | ), | |
595 | target_os = "android", | |
596 | target_os = "freebsd", | |
597 | target_os = "netbsd", | |
598 | // These don't support detection of FEAT_LSE2. | |
599 | // target_os = "openbsd", | |
600 | // target_os = "fuchsia", | |
601 | // target_os = "windows", | |
602 | ), | |
603 | )))] | |
781aab86 FG |
604 | // SAFETY: the caller must uphold the safety contract. |
605 | unsafe { | |
ed00b5ec FG |
606 | atomic_store_no_lse2(dst, val, order); |
607 | } | |
608 | #[cfg(all( | |
609 | not(portable_atomic_no_outline_atomics), | |
610 | any( | |
611 | all( | |
612 | target_os = "linux", | |
613 | any( | |
614 | target_env = "gnu", | |
615 | all( | |
616 | any(target_env = "musl", target_env = "ohos"), | |
617 | not(target_feature = "crt-static"), | |
618 | ), | |
619 | portable_atomic_outline_atomics, | |
620 | ), | |
621 | ), | |
622 | target_os = "android", | |
623 | target_os = "freebsd", | |
624 | target_os = "netbsd", | |
625 | // These don't support detection of FEAT_LSE2. | |
626 | // target_os = "openbsd", | |
627 | // target_os = "fuchsia", | |
628 | // target_os = "windows", | |
629 | ), | |
630 | ))] | |
631 | { | |
632 | fn_alias! { | |
633 | // inline(never) is just a hint and also not strictly necessary | |
634 | // because we use ifunc helper macro, but used for clarity. | |
635 | #[inline(never)] | |
636 | unsafe fn(dst: *mut u128, val: u128); | |
637 | atomic_store_lse2_relaxed = _atomic_store_stp(Ordering::Relaxed); | |
638 | atomic_store_lse2_release = _atomic_store_stp(Ordering::Release); | |
639 | atomic_store_lse2_seqcst = _atomic_store_stp(Ordering::SeqCst); | |
640 | } | |
641 | fn_alias! { | |
642 | unsafe fn(dst: *mut u128, val: u128); | |
643 | atomic_store_no_lse2_relaxed = atomic_store_no_lse2(Ordering::Relaxed); | |
644 | atomic_store_no_lse2_release = atomic_store_no_lse2(Ordering::Release); | |
645 | atomic_store_no_lse2_seqcst = atomic_store_no_lse2(Ordering::SeqCst); | |
646 | } | |
647 | // SAFETY: the caller must uphold the safety contract. | |
648 | // and we've checked if FEAT_LSE2 is available. | |
649 | unsafe { | |
650 | match order { | |
651 | Ordering::Relaxed => { | |
652 | ifunc!(unsafe fn(dst: *mut u128, val: u128) { | |
653 | let cpuinfo = detect::detect(); | |
654 | if cpuinfo.has_lse2() { | |
655 | atomic_store_lse2_relaxed | |
656 | } else { | |
657 | atomic_store_no_lse2_relaxed | |
658 | } | |
659 | }); | |
660 | } | |
661 | Ordering::Release => { | |
662 | ifunc!(unsafe fn(dst: *mut u128, val: u128) { | |
663 | let cpuinfo = detect::detect(); | |
664 | if cpuinfo.has_lse2() { | |
665 | atomic_store_lse2_release | |
666 | } else { | |
667 | atomic_store_no_lse2_release | |
668 | } | |
669 | }); | |
670 | } | |
671 | Ordering::SeqCst => { | |
672 | ifunc!(unsafe fn(dst: *mut u128, val: u128) { | |
673 | let cpuinfo = detect::detect(); | |
674 | if cpuinfo.has_lse2() { | |
675 | atomic_store_lse2_seqcst | |
676 | } else { | |
677 | atomic_store_no_lse2_seqcst | |
678 | } | |
679 | }); | |
680 | } | |
681 | _ => unreachable!("{:?}", order), | |
682 | } | |
683 | } | |
781aab86 FG |
684 | } |
685 | } | |
ed00b5ec | 686 | // If CPU supports FEAT_LSE2, STP/STILP is single-copy atomic writes, |
781aab86 FG |
687 | // otherwise it is two single-copy atomic writes. |
688 | // Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile | |
ed00b5ec FG |
689 | #[cfg(any( |
690 | target_feature = "lse2", | |
691 | portable_atomic_target_feature = "lse2", | |
692 | not(portable_atomic_no_outline_atomics), | |
693 | ))] | |
781aab86 | 694 | #[inline] |
ed00b5ec | 695 | unsafe fn _atomic_store_stp(dst: *mut u128, val: u128, order: Ordering) { |
781aab86 | 696 | debug_assert!(dst as usize % 16 == 0); |
ed00b5ec | 697 | debug_assert_lse2!(); |
781aab86 FG |
698 | |
699 | // SAFETY: the caller must guarantee that `dst` is valid for writes, | |
700 | // 16-byte aligned, that there are no concurrent non-atomic operations. | |
701 | // | |
702 | // Refs: | |
703 | // - STP: https://developer.arm.com/documentation/dui0801/l/A64-Data-Transfer-Instructions/STP--A64- | |
704 | unsafe { | |
ed00b5ec | 705 | #[rustfmt::skip] |
781aab86 | 706 | macro_rules! atomic_store { |
ed00b5ec FG |
707 | ($acquire:tt, $release:tt) => {{ |
708 | let val = U128 { whole: val }; | |
781aab86 FG |
709 | asm!( |
710 | $release, | |
711 | "stp {val_lo}, {val_hi}, [{dst}]", | |
712 | $acquire, | |
713 | dst = in(reg) ptr_reg!(dst), | |
714 | val_lo = in(reg) val.pair.lo, | |
715 | val_hi = in(reg) val.pair.hi, | |
716 | options(nostack, preserves_flags), | |
ed00b5ec FG |
717 | ); |
718 | }}; | |
781aab86 FG |
719 | } |
720 | match order { | |
721 | Ordering::Relaxed => atomic_store!("", ""), | |
ed00b5ec FG |
722 | #[cfg(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3"))] |
723 | Ordering::Release => { | |
724 | let val = U128 { whole: val }; | |
725 | // SAFETY: cfg guarantee that the CPU supports FEAT_LRCPC3. | |
726 | // Refs: https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/STILP--Store-Release-ordered-Pair-of-registers- | |
727 | asm!( | |
728 | "stilp {val_lo}, {val_hi}, [{dst}]", | |
729 | dst = in(reg) ptr_reg!(dst), | |
730 | val_lo = in(reg) val.pair.lo, | |
731 | val_hi = in(reg) val.pair.hi, | |
732 | options(nostack, preserves_flags), | |
733 | ); | |
734 | } | |
735 | #[cfg(not(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3")))] | |
736 | #[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] | |
737 | Ordering::Release => { | |
738 | // Use swpp if stp requires fences. | |
739 | // https://reviews.llvm.org/D143506 | |
740 | // SAFETY: cfg guarantee that the CPU supports FEAT_LSE128. | |
741 | _atomic_swap_swpp(dst, val, order); | |
742 | } | |
743 | #[cfg(not(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3")))] | |
744 | #[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] | |
781aab86 | 745 | Ordering::Release => atomic_store!("", "dmb ish"), |
ed00b5ec FG |
746 | #[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] |
747 | Ordering::SeqCst => { | |
748 | // Use swpp if stp requires fences. | |
749 | // https://reviews.llvm.org/D143506 | |
750 | // SAFETY: cfg guarantee that the CPU supports FEAT_LSE128. | |
751 | _atomic_swap_swpp(dst, val, order); | |
752 | } | |
753 | #[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] | |
781aab86 FG |
754 | Ordering::SeqCst => atomic_store!("dmb ish", "dmb ish"), |
755 | _ => unreachable!("{:?}", order), | |
756 | } | |
757 | } | |
758 | } | |
ed00b5ec FG |
759 | // Do not use _atomic_swap_ldxp_stxp because it needs extra registers to implement store. |
760 | #[cfg(any( | |
761 | test, | |
762 | not(all( | |
763 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
764 | not(portable_atomic_ll_sc_rmw), | |
765 | )) | |
766 | ))] | |
767 | #[inline] | |
768 | unsafe fn _atomic_store_ldxp_stxp(dst: *mut u128, val: u128, order: Ordering) { | |
769 | debug_assert!(dst as usize % 16 == 0); | |
770 | ||
771 | // SAFETY: the caller must uphold the safety contract. | |
772 | unsafe { | |
773 | let val = U128 { whole: val }; | |
774 | macro_rules! store { | |
775 | ($acquire:tt, $release:tt, $fence:tt) => { | |
776 | asm!( | |
777 | "2:", | |
778 | concat!("ld", $acquire, "xp xzr, {tmp}, [{dst}]"), | |
779 | concat!("st", $release, "xp {tmp:w}, {val_lo}, {val_hi}, [{dst}]"), | |
780 | // 0 if the store was successful, 1 if no store was performed | |
781 | "cbnz {tmp:w}, 2b", | |
782 | $fence, | |
783 | dst = in(reg) ptr_reg!(dst), | |
784 | val_lo = in(reg) val.pair.lo, | |
785 | val_hi = in(reg) val.pair.hi, | |
786 | tmp = out(reg) _, | |
787 | options(nostack, preserves_flags), | |
788 | ) | |
789 | }; | |
790 | } | |
791 | atomic_rmw!(store, order); | |
792 | } | |
793 | } | |
781aab86 FG |
794 | |
795 | #[inline] | |
796 | unsafe fn atomic_compare_exchange( | |
797 | dst: *mut u128, | |
798 | old: u128, | |
799 | new: u128, | |
800 | success: Ordering, | |
801 | failure: Ordering, | |
802 | ) -> Result<u128, u128> { | |
803 | #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] | |
804 | // SAFETY: the caller must uphold the safety contract. | |
805 | // cfg guarantee that the CPU supports FEAT_LSE. | |
ed00b5ec | 806 | let prev = unsafe { _atomic_compare_exchange_casp(dst, old, new, success, failure) }; |
781aab86 FG |
807 | #[cfg(not(all( |
808 | not(portable_atomic_no_outline_atomics), | |
809 | any( | |
810 | all( | |
811 | target_os = "linux", | |
812 | any( | |
813 | target_env = "gnu", | |
814 | all( | |
815 | any(target_env = "musl", target_env = "ohos"), | |
816 | not(target_feature = "crt-static"), | |
817 | ), | |
818 | portable_atomic_outline_atomics, | |
819 | ), | |
820 | ), | |
821 | target_os = "android", | |
822 | target_os = "freebsd", | |
ed00b5ec | 823 | target_os = "netbsd", |
781aab86 FG |
824 | target_os = "openbsd", |
825 | target_os = "fuchsia", | |
826 | target_os = "windows", | |
827 | ), | |
828 | )))] | |
829 | #[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))] | |
830 | // SAFETY: the caller must uphold the safety contract. | |
ed00b5ec | 831 | let prev = unsafe { _atomic_compare_exchange_ldxp_stxp(dst, old, new, success, failure) }; |
781aab86 FG |
832 | #[cfg(all( |
833 | not(portable_atomic_no_outline_atomics), | |
834 | any( | |
835 | all( | |
836 | target_os = "linux", | |
837 | any( | |
838 | target_env = "gnu", | |
839 | all( | |
840 | any(target_env = "musl", target_env = "ohos"), | |
841 | not(target_feature = "crt-static"), | |
842 | ), | |
843 | portable_atomic_outline_atomics, | |
844 | ), | |
845 | ), | |
846 | target_os = "android", | |
847 | target_os = "freebsd", | |
ed00b5ec | 848 | target_os = "netbsd", |
781aab86 FG |
849 | target_os = "openbsd", |
850 | target_os = "fuchsia", | |
851 | target_os = "windows", | |
852 | ), | |
853 | ))] | |
854 | #[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))] | |
ed00b5ec | 855 | let prev = { |
781aab86 | 856 | fn_alias! { |
ed00b5ec FG |
857 | // inline(never) is just a hint and also not strictly necessary |
858 | // because we use ifunc helper macro, but used for clarity. | |
859 | #[inline(never)] | |
781aab86 FG |
860 | unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128; |
861 | atomic_compare_exchange_casp_relaxed | |
862 | = _atomic_compare_exchange_casp(Ordering::Relaxed, Ordering::Relaxed); | |
863 | atomic_compare_exchange_casp_acquire | |
864 | = _atomic_compare_exchange_casp(Ordering::Acquire, Ordering::Acquire); | |
865 | atomic_compare_exchange_casp_release | |
866 | = _atomic_compare_exchange_casp(Ordering::Release, Ordering::Relaxed); | |
867 | atomic_compare_exchange_casp_acqrel | |
868 | = _atomic_compare_exchange_casp(Ordering::AcqRel, Ordering::Acquire); | |
869 | // AcqRel and SeqCst RMWs are equivalent in non-MSVC environments. | |
870 | #[cfg(target_env = "msvc")] | |
871 | atomic_compare_exchange_casp_seqcst | |
872 | = _atomic_compare_exchange_casp(Ordering::SeqCst, Ordering::SeqCst); | |
873 | } | |
874 | fn_alias! { | |
875 | unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128; | |
876 | atomic_compare_exchange_ldxp_stxp_relaxed | |
877 | = _atomic_compare_exchange_ldxp_stxp(Ordering::Relaxed, Ordering::Relaxed); | |
878 | atomic_compare_exchange_ldxp_stxp_acquire | |
879 | = _atomic_compare_exchange_ldxp_stxp(Ordering::Acquire, Ordering::Acquire); | |
880 | atomic_compare_exchange_ldxp_stxp_release | |
881 | = _atomic_compare_exchange_ldxp_stxp(Ordering::Release, Ordering::Relaxed); | |
882 | atomic_compare_exchange_ldxp_stxp_acqrel | |
883 | = _atomic_compare_exchange_ldxp_stxp(Ordering::AcqRel, Ordering::Acquire); | |
884 | // AcqRel and SeqCst RMWs are equivalent in non-MSVC environments. | |
885 | #[cfg(target_env = "msvc")] | |
886 | atomic_compare_exchange_ldxp_stxp_seqcst | |
887 | = _atomic_compare_exchange_ldxp_stxp(Ordering::SeqCst, Ordering::SeqCst); | |
888 | } | |
889 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and | |
890 | // reads, 16-byte aligned, that there are no concurrent non-atomic operations, | |
891 | // and we've checked if FEAT_LSE is available. | |
892 | unsafe { | |
893 | let success = crate::utils::upgrade_success_ordering(success, failure); | |
894 | match success { | |
895 | Ordering::Relaxed => { | |
896 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { | |
897 | if detect::detect().has_lse() { | |
898 | atomic_compare_exchange_casp_relaxed | |
899 | } else { | |
900 | atomic_compare_exchange_ldxp_stxp_relaxed | |
901 | } | |
902 | }) | |
903 | } | |
904 | Ordering::Acquire => { | |
905 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { | |
906 | if detect::detect().has_lse() { | |
907 | atomic_compare_exchange_casp_acquire | |
908 | } else { | |
909 | atomic_compare_exchange_ldxp_stxp_acquire | |
910 | } | |
911 | }) | |
912 | } | |
913 | Ordering::Release => { | |
914 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { | |
915 | if detect::detect().has_lse() { | |
916 | atomic_compare_exchange_casp_release | |
917 | } else { | |
918 | atomic_compare_exchange_ldxp_stxp_release | |
919 | } | |
920 | }) | |
921 | } | |
922 | // AcqRel and SeqCst RMWs are equivalent in both implementations in non-MSVC environments. | |
923 | #[cfg(not(target_env = "msvc"))] | |
924 | Ordering::AcqRel | Ordering::SeqCst => { | |
925 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { | |
926 | if detect::detect().has_lse() { | |
927 | atomic_compare_exchange_casp_acqrel | |
928 | } else { | |
929 | atomic_compare_exchange_ldxp_stxp_acqrel | |
930 | } | |
931 | }) | |
932 | } | |
933 | #[cfg(target_env = "msvc")] | |
934 | Ordering::AcqRel => { | |
935 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { | |
936 | if detect::detect().has_lse() { | |
937 | atomic_compare_exchange_casp_acqrel | |
938 | } else { | |
939 | atomic_compare_exchange_ldxp_stxp_acqrel | |
940 | } | |
941 | }) | |
942 | } | |
943 | #[cfg(target_env = "msvc")] | |
944 | Ordering::SeqCst => { | |
945 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { | |
946 | if detect::detect().has_lse() { | |
947 | atomic_compare_exchange_casp_seqcst | |
948 | } else { | |
949 | atomic_compare_exchange_ldxp_stxp_seqcst | |
950 | } | |
951 | }) | |
952 | } | |
953 | _ => unreachable!("{:?}", success), | |
954 | } | |
955 | } | |
956 | }; | |
ed00b5ec FG |
957 | if prev == old { |
958 | Ok(prev) | |
781aab86 | 959 | } else { |
ed00b5ec | 960 | Err(prev) |
781aab86 FG |
961 | } |
962 | } | |
963 | #[cfg(any( | |
964 | target_feature = "lse", | |
965 | portable_atomic_target_feature = "lse", | |
966 | not(portable_atomic_no_outline_atomics), | |
967 | ))] | |
781aab86 FG |
968 | #[inline] |
969 | unsafe fn _atomic_compare_exchange_casp( | |
970 | dst: *mut u128, | |
971 | old: u128, | |
972 | new: u128, | |
973 | success: Ordering, | |
974 | failure: Ordering, | |
975 | ) -> u128 { | |
976 | debug_assert!(dst as usize % 16 == 0); | |
977 | debug_assert_lse!(); | |
978 | let order = crate::utils::upgrade_success_ordering(success, failure); | |
979 | ||
980 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and | |
981 | // reads, 16-byte aligned, that there are no concurrent non-atomic operations, | |
982 | // and the CPU supports FEAT_LSE. | |
983 | // | |
984 | // Refs: | |
985 | // - https://developer.arm.com/documentation/dui0801/l/A64-Data-Transfer-Instructions/CASPA--CASPAL--CASP--CASPL--CASPAL--CASP--CASPL--A64- | |
986 | // - https://developer.arm.com/documentation/ddi0602/2023-06/Base-Instructions/CASP--CASPA--CASPAL--CASPL--Compare-and-Swap-Pair-of-words-or-doublewords-in-memory- | |
987 | unsafe { | |
988 | let old = U128 { whole: old }; | |
989 | let new = U128 { whole: new }; | |
990 | let (prev_lo, prev_hi); | |
991 | macro_rules! cmpxchg { | |
992 | ($acquire:tt, $release:tt, $fence:tt) => { | |
993 | asm!( | |
ed00b5ec | 994 | start_lse!(), |
781aab86 FG |
995 | concat!("casp", $acquire, $release, " x6, x7, x4, x5, [{dst}]"), |
996 | $fence, | |
997 | dst = in(reg) ptr_reg!(dst), | |
998 | // must be allocated to even/odd register pair | |
999 | inout("x6") old.pair.lo => prev_lo, | |
1000 | inout("x7") old.pair.hi => prev_hi, | |
1001 | // must be allocated to even/odd register pair | |
1002 | in("x4") new.pair.lo, | |
1003 | in("x5") new.pair.hi, | |
1004 | options(nostack, preserves_flags), | |
1005 | ) | |
1006 | }; | |
1007 | } | |
1008 | atomic_rmw!(cmpxchg, order, write = success); | |
1009 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1010 | } | |
1011 | } | |
1012 | #[cfg(any(test, not(any(target_feature = "lse", portable_atomic_target_feature = "lse"))))] | |
1013 | #[inline] | |
1014 | unsafe fn _atomic_compare_exchange_ldxp_stxp( | |
1015 | dst: *mut u128, | |
1016 | old: u128, | |
1017 | new: u128, | |
1018 | success: Ordering, | |
1019 | failure: Ordering, | |
1020 | ) -> u128 { | |
1021 | debug_assert!(dst as usize % 16 == 0); | |
1022 | let order = crate::utils::upgrade_success_ordering(success, failure); | |
1023 | ||
1024 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and | |
1025 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. | |
1026 | // | |
1027 | // Refs: | |
1028 | // - LDXP: https://developer.arm.com/documentation/dui0801/l/A64-Data-Transfer-Instructions/LDXP--A64- | |
1029 | // - LDAXP: https://developer.arm.com/documentation/dui0801/l/A64-Data-Transfer-Instructions/LDAXP--A64- | |
1030 | // - STXP: https://developer.arm.com/documentation/dui0801/l/A64-Data-Transfer-Instructions/STXP--A64- | |
1031 | // - STLXP: https://developer.arm.com/documentation/dui0801/l/A64-Data-Transfer-Instructions/STLXP--A64- | |
1032 | // | |
1033 | // Note: Load-Exclusive pair (by itself) does not guarantee atomicity; to complete an atomic | |
1034 | // operation (even load/store), a corresponding Store-Exclusive pair must succeed. | |
1035 | // See Arm Architecture Reference Manual for A-profile architecture | |
1036 | // Section B2.2.1 "Requirements for single-copy atomicity", and | |
1037 | // Section B2.9 "Synchronization and semaphores" for more. | |
1038 | unsafe { | |
1039 | let old = U128 { whole: old }; | |
1040 | let new = U128 { whole: new }; | |
1041 | let (mut prev_lo, mut prev_hi); | |
1042 | macro_rules! cmpxchg { | |
1043 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1044 | asm!( | |
1045 | "2:", | |
ed00b5ec FG |
1046 | concat!("ld", $acquire, "xp {prev_lo}, {prev_hi}, [{dst}]"), |
1047 | "cmp {prev_lo}, {old_lo}", | |
781aab86 | 1048 | "cset {r:w}, ne", |
ed00b5ec | 1049 | "cmp {prev_hi}, {old_hi}", |
781aab86 FG |
1050 | "cinc {r:w}, {r:w}, ne", |
1051 | "cbz {r:w}, 3f", | |
ed00b5ec | 1052 | concat!("st", $release, "xp {r:w}, {prev_lo}, {prev_hi}, [{dst}]"), |
781aab86 FG |
1053 | // 0 if the store was successful, 1 if no store was performed |
1054 | "cbnz {r:w}, 2b", | |
1055 | "b 4f", | |
1056 | "3:", | |
1057 | concat!("st", $release, "xp {r:w}, {new_lo}, {new_hi}, [{dst}]"), | |
1058 | // 0 if the store was successful, 1 if no store was performed | |
1059 | "cbnz {r:w}, 2b", | |
1060 | "4:", | |
1061 | $fence, | |
1062 | dst = in(reg) ptr_reg!(dst), | |
1063 | old_lo = in(reg) old.pair.lo, | |
1064 | old_hi = in(reg) old.pair.hi, | |
1065 | new_lo = in(reg) new.pair.lo, | |
1066 | new_hi = in(reg) new.pair.hi, | |
ed00b5ec FG |
1067 | prev_lo = out(reg) prev_lo, |
1068 | prev_hi = out(reg) prev_hi, | |
781aab86 FG |
1069 | r = out(reg) _, |
1070 | // Do not use `preserves_flags` because CMP modifies the condition flags. | |
1071 | options(nostack), | |
1072 | ) | |
1073 | }; | |
1074 | } | |
1075 | atomic_rmw!(cmpxchg, order, write = success); | |
1076 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1077 | } | |
1078 | } | |
1079 | ||
1080 | // casp is always strong, and ldxp requires a corresponding (succeed) stxp for | |
1081 | // its atomicity (see code comment in _atomic_compare_exchange_ldxp_stxp). | |
1082 | // (i.e., aarch64 doesn't have 128-bit weak CAS) | |
1083 | use self::atomic_compare_exchange as atomic_compare_exchange_weak; | |
1084 | ||
1085 | // If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set, | |
1086 | // we use CAS-based atomic RMW. | |
ed00b5ec | 1087 | #[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] |
781aab86 FG |
1088 | #[cfg(all( |
1089 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
1090 | not(portable_atomic_ll_sc_rmw), | |
1091 | ))] | |
1092 | use _atomic_swap_casp as atomic_swap; | |
ed00b5ec | 1093 | #[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] |
781aab86 FG |
1094 | #[cfg(not(all( |
1095 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
1096 | not(portable_atomic_ll_sc_rmw), | |
1097 | )))] | |
1098 | use _atomic_swap_ldxp_stxp as atomic_swap; | |
ed00b5ec FG |
1099 | #[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] |
1100 | use _atomic_swap_swpp as atomic_swap; | |
1101 | #[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] | |
1102 | #[inline] | |
1103 | unsafe fn _atomic_swap_swpp(dst: *mut u128, val: u128, order: Ordering) -> u128 { | |
1104 | debug_assert!(dst as usize % 16 == 0); | |
1105 | ||
1106 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and | |
1107 | // reads, 16-byte aligned, that there are no concurrent non-atomic operations, | |
1108 | // and the CPU supports FEAT_LSE128. | |
1109 | // | |
1110 | // Refs: | |
1111 | // - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/SWPP--SWPPA--SWPPAL--SWPPL--Swap-quadword-in-memory-?lang=en | |
1112 | unsafe { | |
1113 | let val = U128 { whole: val }; | |
1114 | let (prev_lo, prev_hi); | |
1115 | macro_rules! swap { | |
1116 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1117 | asm!( | |
1118 | concat!("swpp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"), | |
1119 | $fence, | |
1120 | dst = in(reg) ptr_reg!(dst), | |
1121 | val_lo = inout(reg) val.pair.lo => prev_lo, | |
1122 | val_hi = inout(reg) val.pair.hi => prev_hi, | |
1123 | options(nostack, preserves_flags), | |
1124 | ) | |
1125 | }; | |
1126 | } | |
1127 | atomic_rmw!(swap, order); | |
1128 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1129 | } | |
1130 | } | |
781aab86 | 1131 | // Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap. |
ed00b5ec | 1132 | #[cfg(any(test, not(portable_atomic_ll_sc_rmw)))] |
781aab86 FG |
1133 | #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] |
1134 | #[inline] | |
1135 | unsafe fn _atomic_swap_casp(dst: *mut u128, val: u128, order: Ordering) -> u128 { | |
1136 | debug_assert!(dst as usize % 16 == 0); | |
1137 | debug_assert_lse!(); | |
1138 | ||
1139 | // SAFETY: the caller must uphold the safety contract. | |
1140 | // cfg guarantee that the CPU supports FEAT_LSE. | |
1141 | unsafe { | |
1142 | let val = U128 { whole: val }; | |
1143 | let (mut prev_lo, mut prev_hi); | |
1144 | macro_rules! swap { | |
1145 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1146 | asm!( | |
ed00b5ec | 1147 | start_lse!(), |
781aab86 FG |
1148 | // If FEAT_LSE2 is not supported, this works like byte-wise atomic. |
1149 | // This is not single-copy atomic reads, but this is ok because subsequent | |
1150 | // CAS will check for consistency. | |
1151 | "ldp x4, x5, [{dst}]", | |
1152 | "2:", | |
1153 | // casp writes the current value to the first register pair, | |
1154 | // so copy the `out`'s value for later comparison. | |
1155 | "mov {tmp_lo}, x4", | |
1156 | "mov {tmp_hi}, x5", | |
1157 | concat!("casp", $acquire, $release, " x4, x5, x2, x3, [{dst}]"), | |
1158 | "cmp {tmp_hi}, x5", | |
1159 | "ccmp {tmp_lo}, x4, #0, eq", | |
1160 | "b.ne 2b", | |
1161 | $fence, | |
1162 | dst = in(reg) ptr_reg!(dst), | |
1163 | tmp_lo = out(reg) _, | |
1164 | tmp_hi = out(reg) _, | |
1165 | // must be allocated to even/odd register pair | |
1166 | out("x4") prev_lo, | |
1167 | out("x5") prev_hi, | |
1168 | // must be allocated to even/odd register pair | |
1169 | in("x2") val.pair.lo, | |
1170 | in("x3") val.pair.hi, | |
1171 | // Do not use `preserves_flags` because CMP and CCMP modify the condition flags. | |
1172 | options(nostack), | |
1173 | ) | |
1174 | }; | |
1175 | } | |
1176 | atomic_rmw!(swap, order); | |
1177 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1178 | } | |
1179 | } | |
1180 | // Do not use atomic_rmw_ll_sc_3 because it needs extra MOV to implement swap. | |
1181 | #[cfg(any( | |
1182 | test, | |
1183 | not(all( | |
1184 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
1185 | not(portable_atomic_ll_sc_rmw), | |
1186 | )) | |
1187 | ))] | |
1188 | #[inline] | |
1189 | unsafe fn _atomic_swap_ldxp_stxp(dst: *mut u128, val: u128, order: Ordering) -> u128 { | |
1190 | debug_assert!(dst as usize % 16 == 0); | |
1191 | ||
1192 | // SAFETY: the caller must uphold the safety contract. | |
1193 | unsafe { | |
1194 | let val = U128 { whole: val }; | |
1195 | let (mut prev_lo, mut prev_hi); | |
1196 | macro_rules! swap { | |
1197 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1198 | asm!( | |
1199 | "2:", | |
1200 | concat!("ld", $acquire, "xp {prev_lo}, {prev_hi}, [{dst}]"), | |
1201 | concat!("st", $release, "xp {r:w}, {val_lo}, {val_hi}, [{dst}]"), | |
1202 | // 0 if the store was successful, 1 if no store was performed | |
1203 | "cbnz {r:w}, 2b", | |
1204 | $fence, | |
1205 | dst = in(reg) ptr_reg!(dst), | |
1206 | val_lo = in(reg) val.pair.lo, | |
1207 | val_hi = in(reg) val.pair.hi, | |
1208 | prev_lo = out(reg) prev_lo, | |
1209 | prev_hi = out(reg) prev_hi, | |
1210 | r = out(reg) _, | |
1211 | options(nostack, preserves_flags), | |
1212 | ) | |
1213 | }; | |
1214 | } | |
1215 | atomic_rmw!(swap, order); | |
1216 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1217 | } | |
1218 | } | |
1219 | ||
1220 | /// Atomic RMW by LL/SC loop (3 arguments) | |
1221 | /// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;` | |
1222 | /// | |
1223 | /// `$op` can use the following registers: | |
1224 | /// - val_lo/val_hi pair: val argument (read-only for `$op`) | |
1225 | /// - prev_lo/prev_hi pair: previous value loaded by ll (read-only for `$op`) | |
ed00b5ec | 1226 | /// - new_lo/new_hi pair: new value that will be stored by sc |
781aab86 FG |
1227 | macro_rules! atomic_rmw_ll_sc_3 { |
1228 | ($name:ident as $reexport_name:ident $(($preserves_flags:tt))?, $($op:tt)*) => { | |
1229 | // If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set, | |
1230 | // we use CAS-based atomic RMW generated by atomic_rmw_cas_3! macro instead. | |
1231 | #[cfg(not(all( | |
1232 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
1233 | not(portable_atomic_ll_sc_rmw), | |
1234 | )))] | |
1235 | use $name as $reexport_name; | |
1236 | #[cfg(any( | |
1237 | test, | |
1238 | not(all( | |
1239 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
1240 | not(portable_atomic_ll_sc_rmw), | |
1241 | )) | |
1242 | ))] | |
1243 | #[inline] | |
1244 | unsafe fn $name(dst: *mut u128, val: u128, order: Ordering) -> u128 { | |
1245 | debug_assert!(dst as usize % 16 == 0); | |
1246 | // SAFETY: the caller must uphold the safety contract. | |
1247 | unsafe { | |
1248 | let val = U128 { whole: val }; | |
1249 | let (mut prev_lo, mut prev_hi); | |
1250 | macro_rules! op { | |
1251 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1252 | asm!( | |
1253 | "2:", | |
1254 | concat!("ld", $acquire, "xp {prev_lo}, {prev_hi}, [{dst}]"), | |
1255 | $($op)* | |
1256 | concat!("st", $release, "xp {r:w}, {new_lo}, {new_hi}, [{dst}]"), | |
1257 | // 0 if the store was successful, 1 if no store was performed | |
1258 | "cbnz {r:w}, 2b", | |
1259 | $fence, | |
1260 | dst = in(reg) ptr_reg!(dst), | |
1261 | val_lo = in(reg) val.pair.lo, | |
1262 | val_hi = in(reg) val.pair.hi, | |
1263 | prev_lo = out(reg) prev_lo, | |
1264 | prev_hi = out(reg) prev_hi, | |
1265 | new_lo = out(reg) _, | |
1266 | new_hi = out(reg) _, | |
1267 | r = out(reg) _, | |
1268 | options(nostack $(, $preserves_flags)?), | |
1269 | ) | |
1270 | }; | |
1271 | } | |
1272 | atomic_rmw!(op, order); | |
1273 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1274 | } | |
1275 | } | |
1276 | }; | |
1277 | } | |
1278 | /// Atomic RMW by CAS loop (3 arguments) | |
1279 | /// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;` | |
1280 | /// | |
1281 | /// `$op` can use the following registers: | |
1282 | /// - val_lo/val_hi pair: val argument (read-only for `$op`) | |
1283 | /// - x6/x7 pair: previous value loaded (read-only for `$op`) | |
ed00b5ec | 1284 | /// - x4/x5 pair: new value that will be stored |
781aab86 FG |
1285 | macro_rules! atomic_rmw_cas_3 { |
1286 | ($name:ident as $reexport_name:ident, $($op:tt)*) => { | |
1287 | // If FEAT_LSE is not available at compile-time or portable_atomic_ll_sc_rmw cfg is set, | |
1288 | // we use LL/SC-based atomic RMW generated by atomic_rmw_ll_sc_3! macro instead. | |
1289 | #[cfg(all( | |
1290 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
1291 | not(portable_atomic_ll_sc_rmw), | |
1292 | ))] | |
1293 | use $name as $reexport_name; | |
ed00b5ec | 1294 | #[cfg(any(test, not(portable_atomic_ll_sc_rmw)))] |
781aab86 FG |
1295 | #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] |
1296 | #[inline] | |
1297 | unsafe fn $name(dst: *mut u128, val: u128, order: Ordering) -> u128 { | |
1298 | debug_assert!(dst as usize % 16 == 0); | |
1299 | debug_assert_lse!(); | |
1300 | // SAFETY: the caller must uphold the safety contract. | |
1301 | // cfg guarantee that the CPU supports FEAT_LSE. | |
1302 | unsafe { | |
1303 | let val = U128 { whole: val }; | |
1304 | let (mut prev_lo, mut prev_hi); | |
1305 | macro_rules! op { | |
1306 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1307 | asm!( | |
ed00b5ec | 1308 | start_lse!(), |
781aab86 FG |
1309 | // If FEAT_LSE2 is not supported, this works like byte-wise atomic. |
1310 | // This is not single-copy atomic reads, but this is ok because subsequent | |
1311 | // CAS will check for consistency. | |
1312 | "ldp x6, x7, [{dst}]", | |
1313 | "2:", | |
1314 | // casp writes the current value to the first register pair, | |
1315 | // so copy the `out`'s value for later comparison. | |
1316 | "mov {tmp_lo}, x6", | |
1317 | "mov {tmp_hi}, x7", | |
1318 | $($op)* | |
1319 | concat!("casp", $acquire, $release, " x6, x7, x4, x5, [{dst}]"), | |
1320 | "cmp {tmp_hi}, x7", | |
1321 | "ccmp {tmp_lo}, x6, #0, eq", | |
1322 | "b.ne 2b", | |
1323 | $fence, | |
1324 | dst = in(reg) ptr_reg!(dst), | |
1325 | val_lo = in(reg) val.pair.lo, | |
1326 | val_hi = in(reg) val.pair.hi, | |
1327 | tmp_lo = out(reg) _, | |
1328 | tmp_hi = out(reg) _, | |
1329 | // must be allocated to even/odd register pair | |
1330 | out("x6") prev_lo, | |
1331 | out("x7") prev_hi, | |
1332 | // must be allocated to even/odd register pair | |
1333 | out("x4") _, | |
1334 | out("x5") _, | |
1335 | // Do not use `preserves_flags` because CMP and CCMP modify the condition flags. | |
1336 | options(nostack), | |
1337 | ) | |
1338 | }; | |
1339 | } | |
1340 | atomic_rmw!(op, order); | |
1341 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1342 | } | |
1343 | } | |
1344 | }; | |
1345 | } | |
1346 | ||
1347 | /// Atomic RMW by LL/SC loop (2 arguments) | |
1348 | /// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;` | |
1349 | /// | |
1350 | /// `$op` can use the following registers: | |
1351 | /// - prev_lo/prev_hi pair: previous value loaded by ll (read-only for `$op`) | |
ed00b5ec | 1352 | /// - new_lo/new_hi pair: new value that will be stored by sc |
781aab86 FG |
1353 | macro_rules! atomic_rmw_ll_sc_2 { |
1354 | ($name:ident as $reexport_name:ident $(($preserves_flags:tt))?, $($op:tt)*) => { | |
1355 | // If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set, | |
1356 | // we use CAS-based atomic RMW generated by atomic_rmw_cas_2! macro instead. | |
1357 | #[cfg(not(all( | |
1358 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
1359 | not(portable_atomic_ll_sc_rmw), | |
1360 | )))] | |
1361 | use $name as $reexport_name; | |
1362 | #[cfg(any( | |
1363 | test, | |
1364 | not(all( | |
1365 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
1366 | not(portable_atomic_ll_sc_rmw), | |
1367 | )) | |
1368 | ))] | |
1369 | #[inline] | |
1370 | unsafe fn $name(dst: *mut u128, order: Ordering) -> u128 { | |
1371 | debug_assert!(dst as usize % 16 == 0); | |
1372 | // SAFETY: the caller must uphold the safety contract. | |
1373 | unsafe { | |
1374 | let (mut prev_lo, mut prev_hi); | |
1375 | macro_rules! op { | |
1376 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1377 | asm!( | |
1378 | "2:", | |
1379 | concat!("ld", $acquire, "xp {prev_lo}, {prev_hi}, [{dst}]"), | |
1380 | $($op)* | |
1381 | concat!("st", $release, "xp {r:w}, {new_lo}, {new_hi}, [{dst}]"), | |
1382 | // 0 if the store was successful, 1 if no store was performed | |
1383 | "cbnz {r:w}, 2b", | |
1384 | $fence, | |
1385 | dst = in(reg) ptr_reg!(dst), | |
1386 | prev_lo = out(reg) prev_lo, | |
1387 | prev_hi = out(reg) prev_hi, | |
1388 | new_lo = out(reg) _, | |
1389 | new_hi = out(reg) _, | |
1390 | r = out(reg) _, | |
1391 | options(nostack $(, $preserves_flags)?), | |
1392 | ) | |
1393 | }; | |
1394 | } | |
1395 | atomic_rmw!(op, order); | |
1396 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1397 | } | |
1398 | } | |
1399 | }; | |
1400 | } | |
1401 | /// Atomic RMW by CAS loop (2 arguments) | |
1402 | /// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;` | |
1403 | /// | |
1404 | /// `$op` can use the following registers: | |
1405 | /// - x6/x7 pair: previous value loaded (read-only for `$op`) | |
ed00b5ec | 1406 | /// - x4/x5 pair: new value that will be stored |
781aab86 FG |
1407 | macro_rules! atomic_rmw_cas_2 { |
1408 | ($name:ident as $reexport_name:ident, $($op:tt)*) => { | |
1409 | // If FEAT_LSE is not available at compile-time or portable_atomic_ll_sc_rmw cfg is set, | |
1410 | // we use LL/SC-based atomic RMW generated by atomic_rmw_ll_sc_3! macro instead. | |
1411 | #[cfg(all( | |
1412 | any(target_feature = "lse", portable_atomic_target_feature = "lse"), | |
1413 | not(portable_atomic_ll_sc_rmw), | |
1414 | ))] | |
1415 | use $name as $reexport_name; | |
ed00b5ec | 1416 | #[cfg(any(test, not(portable_atomic_ll_sc_rmw)))] |
781aab86 FG |
1417 | #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] |
1418 | #[inline] | |
1419 | unsafe fn $name(dst: *mut u128, order: Ordering) -> u128 { | |
1420 | debug_assert!(dst as usize % 16 == 0); | |
1421 | debug_assert_lse!(); | |
1422 | // SAFETY: the caller must uphold the safety contract. | |
1423 | // cfg guarantee that the CPU supports FEAT_LSE. | |
1424 | unsafe { | |
1425 | let (mut prev_lo, mut prev_hi); | |
1426 | macro_rules! op { | |
1427 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1428 | asm!( | |
ed00b5ec | 1429 | start_lse!(), |
781aab86 FG |
1430 | // If FEAT_LSE2 is not supported, this works like byte-wise atomic. |
1431 | // This is not single-copy atomic reads, but this is ok because subsequent | |
1432 | // CAS will check for consistency. | |
1433 | "ldp x6, x7, [{dst}]", | |
1434 | "2:", | |
1435 | // casp writes the current value to the first register pair, | |
1436 | // so copy the `out`'s value for later comparison. | |
1437 | "mov {tmp_lo}, x6", | |
1438 | "mov {tmp_hi}, x7", | |
1439 | $($op)* | |
1440 | concat!("casp", $acquire, $release, " x6, x7, x4, x5, [{dst}]"), | |
1441 | "cmp {tmp_hi}, x7", | |
1442 | "ccmp {tmp_lo}, x6, #0, eq", | |
1443 | "b.ne 2b", | |
1444 | $fence, | |
1445 | dst = in(reg) ptr_reg!(dst), | |
1446 | tmp_lo = out(reg) _, | |
1447 | tmp_hi = out(reg) _, | |
1448 | // must be allocated to even/odd register pair | |
1449 | out("x6") prev_lo, | |
1450 | out("x7") prev_hi, | |
1451 | // must be allocated to even/odd register pair | |
1452 | out("x4") _, | |
1453 | out("x5") _, | |
1454 | // Do not use `preserves_flags` because CMP and CCMP modify the condition flags. | |
1455 | options(nostack), | |
1456 | ) | |
1457 | }; | |
1458 | } | |
1459 | atomic_rmw!(op, order); | |
1460 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1461 | } | |
1462 | } | |
1463 | }; | |
1464 | } | |
1465 | ||
1466 | // Do not use `preserves_flags` because ADDS modifies the condition flags. | |
1467 | atomic_rmw_ll_sc_3! { | |
1468 | _atomic_add_ldxp_stxp as atomic_add, | |
1469 | select_le_or_be!("adds {new_lo}, {prev_lo}, {val_lo}", "adds {new_hi}, {prev_hi}, {val_hi}"), | |
1470 | select_le_or_be!("adc {new_hi}, {prev_hi}, {val_hi}", "adc {new_lo}, {prev_lo}, {val_lo}"), | |
1471 | } | |
1472 | atomic_rmw_cas_3! { | |
1473 | _atomic_add_casp as atomic_add, | |
1474 | select_le_or_be!("adds x4, x6, {val_lo}", "adds x5, x7, {val_hi}"), | |
1475 | select_le_or_be!("adc x5, x7, {val_hi}", "adc x4, x6, {val_lo}"), | |
1476 | } | |
1477 | ||
1478 | // Do not use `preserves_flags` because SUBS modifies the condition flags. | |
1479 | atomic_rmw_ll_sc_3! { | |
1480 | _atomic_sub_ldxp_stxp as atomic_sub, | |
1481 | select_le_or_be!("subs {new_lo}, {prev_lo}, {val_lo}", "subs {new_hi}, {prev_hi}, {val_hi}"), | |
1482 | select_le_or_be!("sbc {new_hi}, {prev_hi}, {val_hi}", "sbc {new_lo}, {prev_lo}, {val_lo}"), | |
1483 | } | |
1484 | atomic_rmw_cas_3! { | |
1485 | _atomic_sub_casp as atomic_sub, | |
1486 | select_le_or_be!("subs x4, x6, {val_lo}", "subs x5, x7, {val_hi}"), | |
1487 | select_le_or_be!("sbc x5, x7, {val_hi}", "sbc x4, x6, {val_lo}"), | |
1488 | } | |
1489 | ||
ed00b5ec | 1490 | #[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] |
781aab86 FG |
1491 | atomic_rmw_ll_sc_3! { |
1492 | _atomic_and_ldxp_stxp as atomic_and (preserves_flags), | |
1493 | "and {new_lo}, {prev_lo}, {val_lo}", | |
1494 | "and {new_hi}, {prev_hi}, {val_hi}", | |
1495 | } | |
ed00b5ec | 1496 | #[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] |
781aab86 FG |
1497 | atomic_rmw_cas_3! { |
1498 | _atomic_and_casp as atomic_and, | |
1499 | "and x4, x6, {val_lo}", | |
1500 | "and x5, x7, {val_hi}", | |
1501 | } | |
ed00b5ec FG |
1502 | #[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] |
1503 | #[inline] | |
1504 | unsafe fn atomic_and(dst: *mut u128, val: u128, order: Ordering) -> u128 { | |
1505 | debug_assert!(dst as usize % 16 == 0); | |
1506 | ||
1507 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and | |
1508 | // reads, 16-byte aligned, that there are no concurrent non-atomic operations, | |
1509 | // and the CPU supports FEAT_LSE128. | |
1510 | // | |
1511 | // Refs: | |
1512 | // - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDCLRP--LDCLRPA--LDCLRPAL--LDCLRPL--Atomic-bit-clear-on-quadword-in-memory-?lang=en | |
1513 | unsafe { | |
1514 | let val = U128 { whole: !val }; | |
1515 | let (prev_lo, prev_hi); | |
1516 | macro_rules! and { | |
1517 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1518 | asm!( | |
1519 | concat!("ldclrp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"), | |
1520 | $fence, | |
1521 | dst = in(reg) ptr_reg!(dst), | |
1522 | val_lo = inout(reg) val.pair.lo => prev_lo, | |
1523 | val_hi = inout(reg) val.pair.hi => prev_hi, | |
1524 | options(nostack, preserves_flags), | |
1525 | ) | |
1526 | }; | |
1527 | } | |
1528 | atomic_rmw!(and, order); | |
1529 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1530 | } | |
1531 | } | |
781aab86 FG |
1532 | |
1533 | atomic_rmw_ll_sc_3! { | |
1534 | _atomic_nand_ldxp_stxp as atomic_nand (preserves_flags), | |
1535 | "and {new_lo}, {prev_lo}, {val_lo}", | |
1536 | "mvn {new_lo}, {new_lo}", | |
1537 | "and {new_hi}, {prev_hi}, {val_hi}", | |
1538 | "mvn {new_hi}, {new_hi}", | |
1539 | } | |
1540 | atomic_rmw_cas_3! { | |
1541 | _atomic_nand_casp as atomic_nand, | |
1542 | "and x4, x6, {val_lo}", | |
1543 | "mvn x4, x4", | |
1544 | "and x5, x7, {val_hi}", | |
1545 | "mvn x5, x5", | |
1546 | } | |
1547 | ||
ed00b5ec | 1548 | #[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] |
781aab86 FG |
1549 | atomic_rmw_ll_sc_3! { |
1550 | _atomic_or_ldxp_stxp as atomic_or (preserves_flags), | |
1551 | "orr {new_lo}, {prev_lo}, {val_lo}", | |
1552 | "orr {new_hi}, {prev_hi}, {val_hi}", | |
1553 | } | |
ed00b5ec | 1554 | #[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] |
781aab86 FG |
1555 | atomic_rmw_cas_3! { |
1556 | _atomic_or_casp as atomic_or, | |
1557 | "orr x4, x6, {val_lo}", | |
1558 | "orr x5, x7, {val_hi}", | |
1559 | } | |
ed00b5ec FG |
1560 | #[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] |
1561 | #[inline] | |
1562 | unsafe fn atomic_or(dst: *mut u128, val: u128, order: Ordering) -> u128 { | |
1563 | debug_assert!(dst as usize % 16 == 0); | |
1564 | ||
1565 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and | |
1566 | // reads, 16-byte aligned, that there are no concurrent non-atomic operations, | |
1567 | // and the CPU supports FEAT_LSE128. | |
1568 | // | |
1569 | // Refs: | |
1570 | // - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDSETP--LDSETPA--LDSETPAL--LDSETPL--Atomic-bit-set-on-quadword-in-memory-?lang=en | |
1571 | unsafe { | |
1572 | let val = U128 { whole: val }; | |
1573 | let (prev_lo, prev_hi); | |
1574 | macro_rules! or { | |
1575 | ($acquire:tt, $release:tt, $fence:tt) => { | |
1576 | asm!( | |
1577 | concat!("ldsetp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"), | |
1578 | $fence, | |
1579 | dst = in(reg) ptr_reg!(dst), | |
1580 | val_lo = inout(reg) val.pair.lo => prev_lo, | |
1581 | val_hi = inout(reg) val.pair.hi => prev_hi, | |
1582 | options(nostack, preserves_flags), | |
1583 | ) | |
1584 | }; | |
1585 | } | |
1586 | atomic_rmw!(or, order); | |
1587 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole | |
1588 | } | |
1589 | } | |
781aab86 FG |
1590 | |
1591 | atomic_rmw_ll_sc_3! { | |
1592 | _atomic_xor_ldxp_stxp as atomic_xor (preserves_flags), | |
1593 | "eor {new_lo}, {prev_lo}, {val_lo}", | |
1594 | "eor {new_hi}, {prev_hi}, {val_hi}", | |
1595 | } | |
1596 | atomic_rmw_cas_3! { | |
1597 | _atomic_xor_casp as atomic_xor, | |
1598 | "eor x4, x6, {val_lo}", | |
1599 | "eor x5, x7, {val_hi}", | |
1600 | } | |
1601 | ||
1602 | atomic_rmw_ll_sc_2! { | |
1603 | _atomic_not_ldxp_stxp as atomic_not (preserves_flags), | |
1604 | "mvn {new_lo}, {prev_lo}", | |
1605 | "mvn {new_hi}, {prev_hi}", | |
1606 | } | |
1607 | atomic_rmw_cas_2! { | |
1608 | _atomic_not_casp as atomic_not, | |
1609 | "mvn x4, x6", | |
1610 | "mvn x5, x7", | |
1611 | } | |
1612 | ||
1613 | // Do not use `preserves_flags` because NEGS modifies the condition flags. | |
1614 | atomic_rmw_ll_sc_2! { | |
1615 | _atomic_neg_ldxp_stxp as atomic_neg, | |
1616 | select_le_or_be!("negs {new_lo}, {prev_lo}", "negs {new_hi}, {prev_hi}"), | |
1617 | select_le_or_be!("ngc {new_hi}, {prev_hi}", "ngc {new_lo}, {prev_lo}"), | |
1618 | } | |
1619 | atomic_rmw_cas_2! { | |
1620 | _atomic_neg_casp as atomic_neg, | |
1621 | select_le_or_be!("negs x4, x6", "negs x5, x7"), | |
1622 | select_le_or_be!("ngc x5, x7", "ngc x4, x6"), | |
1623 | } | |
1624 | ||
1625 | // Do not use `preserves_flags` because CMP and SBCS modify the condition flags. | |
1626 | atomic_rmw_ll_sc_3! { | |
1627 | _atomic_max_ldxp_stxp as atomic_max, | |
1628 | select_le_or_be!("cmp {val_lo}, {prev_lo}", "cmp {val_hi}, {prev_hi}"), | |
1629 | select_le_or_be!("sbcs xzr, {val_hi}, {prev_hi}", "sbcs xzr, {val_lo}, {prev_lo}"), | |
1630 | "csel {new_hi}, {prev_hi}, {val_hi}, lt", // select hi 64-bit | |
1631 | "csel {new_lo}, {prev_lo}, {val_lo}, lt", // select lo 64-bit | |
1632 | } | |
1633 | atomic_rmw_cas_3! { | |
1634 | _atomic_max_casp as atomic_max, | |
1635 | select_le_or_be!("cmp {val_lo}, x6", "cmp {val_hi}, x7"), | |
1636 | select_le_or_be!("sbcs xzr, {val_hi}, x7", "sbcs xzr, {val_lo}, x6"), | |
1637 | "csel x5, x7, {val_hi}, lt", // select hi 64-bit | |
1638 | "csel x4, x6, {val_lo}, lt", // select lo 64-bit | |
1639 | } | |
1640 | ||
1641 | // Do not use `preserves_flags` because CMP and SBCS modify the condition flags. | |
1642 | atomic_rmw_ll_sc_3! { | |
1643 | _atomic_umax_ldxp_stxp as atomic_umax, | |
1644 | select_le_or_be!("cmp {val_lo}, {prev_lo}", "cmp {val_hi}, {prev_hi}"), | |
1645 | select_le_or_be!("sbcs xzr, {val_hi}, {prev_hi}", "sbcs xzr, {val_lo}, {prev_lo}"), | |
1646 | "csel {new_hi}, {prev_hi}, {val_hi}, lo", // select hi 64-bit | |
1647 | "csel {new_lo}, {prev_lo}, {val_lo}, lo", // select lo 64-bit | |
1648 | } | |
1649 | atomic_rmw_cas_3! { | |
1650 | _atomic_umax_casp as atomic_umax, | |
1651 | select_le_or_be!("cmp {val_lo}, x6", "cmp {val_hi}, x7"), | |
1652 | select_le_or_be!("sbcs xzr, {val_hi}, x7", "sbcs xzr, {val_lo}, x6"), | |
1653 | "csel x5, x7, {val_hi}, lo", // select hi 64-bit | |
1654 | "csel x4, x6, {val_lo}, lo", // select lo 64-bit | |
1655 | } | |
1656 | ||
1657 | // Do not use `preserves_flags` because CMP and SBCS modify the condition flags. | |
1658 | atomic_rmw_ll_sc_3! { | |
1659 | _atomic_min_ldxp_stxp as atomic_min, | |
1660 | select_le_or_be!("cmp {val_lo}, {prev_lo}", "cmp {val_hi}, {prev_hi}"), | |
1661 | select_le_or_be!("sbcs xzr, {val_hi}, {prev_hi}", "sbcs xzr, {val_lo}, {prev_lo}"), | |
1662 | "csel {new_hi}, {prev_hi}, {val_hi}, ge", // select hi 64-bit | |
1663 | "csel {new_lo}, {prev_lo}, {val_lo}, ge", // select lo 64-bit | |
1664 | } | |
1665 | atomic_rmw_cas_3! { | |
1666 | _atomic_min_casp as atomic_min, | |
1667 | select_le_or_be!("cmp {val_lo}, x6", "cmp {val_hi}, x7"), | |
1668 | select_le_or_be!("sbcs xzr, {val_hi}, x7", "sbcs xzr, {val_lo}, x6"), | |
1669 | "csel x5, x7, {val_hi}, ge", // select hi 64-bit | |
1670 | "csel x4, x6, {val_lo}, ge", // select lo 64-bit | |
1671 | } | |
1672 | ||
1673 | // Do not use `preserves_flags` because CMP and SBCS modify the condition flags. | |
1674 | atomic_rmw_ll_sc_3! { | |
1675 | _atomic_umin_ldxp_stxp as atomic_umin, | |
1676 | select_le_or_be!("cmp {val_lo}, {prev_lo}", "cmp {val_hi}, {prev_hi}"), | |
1677 | select_le_or_be!("sbcs xzr, {val_hi}, {prev_hi}", "sbcs xzr, {val_lo}, {prev_lo}"), | |
1678 | "csel {new_hi}, {prev_hi}, {val_hi}, hs", // select hi 64-bit | |
1679 | "csel {new_lo}, {prev_lo}, {val_lo}, hs", // select lo 64-bit | |
1680 | } | |
1681 | atomic_rmw_cas_3! { | |
1682 | _atomic_umin_casp as atomic_umin, | |
1683 | select_le_or_be!("cmp {val_lo}, x6", "cmp {val_hi}, x7"), | |
1684 | select_le_or_be!("sbcs xzr, {val_hi}, x7", "sbcs xzr, {val_lo}, x6"), | |
1685 | "csel x5, x7, {val_hi}, hs", // select hi 64-bit | |
1686 | "csel x4, x6, {val_lo}, hs", // select lo 64-bit | |
1687 | } | |
1688 | ||
1689 | #[inline] | |
1690 | const fn is_lock_free() -> bool { | |
1691 | IS_ALWAYS_LOCK_FREE | |
1692 | } | |
1693 | const IS_ALWAYS_LOCK_FREE: bool = true; | |
1694 | ||
1695 | atomic128!(AtomicI128, i128, atomic_max, atomic_min); | |
1696 | atomic128!(AtomicU128, u128, atomic_umax, atomic_umin); | |
1697 | ||
1698 | #[cfg(test)] | |
1699 | mod tests { | |
1700 | use super::*; | |
1701 | ||
1702 | test_atomic_int!(i128); | |
1703 | test_atomic_int!(u128); | |
1704 | ||
1705 | // load/store/swap implementation is not affected by signedness, so it is | |
1706 | // enough to test only unsigned types. | |
1707 | stress_test!(u128); | |
1708 | } |