// compile-flags: -O
// only-x86_64
// ignore-debug: the debug assertions get in the way

#![crate_type = "lib"]

use std::mem::swap;
use std::ptr::{read, copy_nonoverlapping, write};

// A 5×5 grid of u64 lanes (200 bytes total) — named for the Keccak/SHA-3
// state shape. Used here as a representative "large but u64-aligned" type.
type KeccakBuffer = [[u64; 5]; 5];

// A basic read+copy+write swap implementation ends up copying one of the values
// to stack for large types, which is completely unnecessary as the lack of
// overlap means we can just do whatever fits in registers at a time.

// CHECK-LABEL: @swap_basic
#[no_mangle]
pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
    // This naive pattern is *expected* to materialize a full stack temporary,
    // so the CHECK asserts the alloca IS present (contrast with `swap_std`).
    // CHECK: alloca [5 x [5 x i64]]

    // SAFETY: exclusive references are always valid to read/write,
    // are non-overlapping, and nothing here panics so it's drop-safe.
    unsafe {
        let z = read(x);
        copy_nonoverlapping(y, x, 1);
        write(y, z);
    }
}

// This test verifies that the library does something smarter, and thus
// doesn't need any scratch space on the stack.

// CHECK-LABEL: @swap_std
#[no_mangle]
pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
    // No stack temporary at all, and the data should move via vector-width
    // i64 loads/stores rather than a whole-value copy.
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i64>
    // CHECK: store <{{[0-9]+}} x i64>
    swap(x, y)
}

// Verify that types with usize alignment are swapped via vectored usizes,
// not falling back to byte-level code.

// CHECK-LABEL: @swap_slice
#[no_mangle]
pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
    // Same expectation as `swap_std`, but through the slice API:
    // no scratch alloca, and i64-element vector loads/stores.
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i64>
    // CHECK: store <{{[0-9]+}} x i64>
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}

// But for a large align-1 type, vectorized byte copying is what we want.

// 1 KiB of u8: large, but with alignment 1, so u64-sized chunking is
// not available and the swap must happen at byte granularity.
type OneKilobyteBuffer = [u8; 1024];

// CHECK-LABEL: @swap_1kb_slices
#[no_mangle]
pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
    // Align-1 element type: still no stack temporary, but the vector ops
    // operate on i8 elements rather than i64.
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i8>
    // CHECK: store <{{[0-9]+}} x i8>
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}

// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
// for an unusual type like this. It's not clear whether we should do anything
// smarter in Rust for these, so for now it's fine to leave these up to the backend.
// That's not as bad as it might seem, as for example, LLVM will lower the
// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
// Eventually we'll be able to pass `align_of::<T>` to a const generic and
// thus pick a smarter chunk size ourselves without huge code duplication.

// 192 bytes of payload, over-aligned to 64 bytes so the backend is free to
// use wide aligned memory operations when lowering the swap's memcpys.
#[repr(align(64))]
pub struct BigButHighlyAligned([u8; 64 * 3]);

// CHECK-LABEL: @swap_big_aligned
#[no_mangle]
pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
    // Exactly three memcpy calls — the bracketing CHECK-NOTs rule out any
    // extra ones — each covering the whole 192-byte value
    // (dereferenceable(192)) at the full 64-byte alignment.
    // CHECK-NOT: call void @llvm.memcpy
    // CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192)
    // CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192)
    // CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192)
    // CHECK-NOT: call void @llvm.memcpy
    swap(x, y)
}