//! Code generation for the sum reduction.
use coresimd::simd::*;

/// LLVM intrinsics used in the sum reduction
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.experimental.vector.reduce.add.i8.v2i8"]
    fn reduce_add_i8x2(x: i8x2) -> i8;
    #[link_name = "llvm.experimental.vector.reduce.add.u8.v2u8"]
    fn reduce_add_u8x2(x: u8x2) -> u8;
    #[link_name = "llvm.experimental.vector.reduce.add.i16.v2i16"]
    fn reduce_add_i16x2(x: i16x2) -> i16;
    #[link_name = "llvm.experimental.vector.reduce.add.u16.v2u16"]
    fn reduce_add_u16x2(x: u16x2) -> u16;
    #[link_name = "llvm.experimental.vector.reduce.add.i32.v2i32"]
    fn reduce_add_i32x2(x: i32x2) -> i32;
    #[link_name = "llvm.experimental.vector.reduce.add.u32.v2u32"]
    fn reduce_add_u32x2(x: u32x2) -> u32;
    #[link_name = "llvm.experimental.vector.reduce.add.i64.v2i64"]
    fn reduce_add_i64x2(x: i64x2) -> i64;
    #[link_name = "llvm.experimental.vector.reduce.add.u64.v2u64"]
    fn reduce_add_u64x2(x: u64x2) -> u64;
    #[link_name = "llvm.experimental.vector.reduce.add.i8.v4i8"]
    fn reduce_add_i8x4(x: i8x4) -> i8;
    #[link_name = "llvm.experimental.vector.reduce.add.u8.v4u8"]
    fn reduce_add_u8x4(x: u8x4) -> u8;
    #[link_name = "llvm.experimental.vector.reduce.add.i16.v4i16"]
    fn reduce_add_i16x4(x: i16x4) -> i16;
    #[link_name = "llvm.experimental.vector.reduce.add.u16.v4u16"]
    fn reduce_add_u16x4(x: u16x4) -> u16;
    #[link_name = "llvm.experimental.vector.reduce.add.i32.v4i32"]
    fn reduce_add_i32x4(x: i32x4) -> i32;
    #[link_name = "llvm.experimental.vector.reduce.add.u32.v4u32"]
    fn reduce_add_u32x4(x: u32x4) -> u32;
    #[link_name = "llvm.experimental.vector.reduce.add.i64.v4i64"]
    fn reduce_add_i64x4(x: i64x4) -> i64;
    #[link_name = "llvm.experimental.vector.reduce.add.u64.v4u64"]
    fn reduce_add_u64x4(x: u64x4) -> u64;
    #[link_name = "llvm.experimental.vector.reduce.add.i8.v8i8"]
    fn reduce_add_i8x8(x: i8x8) -> i8;
    #[link_name = "llvm.experimental.vector.reduce.add.u8.v8u8"]
    fn reduce_add_u8x8(x: u8x8) -> u8;
    #[link_name = "llvm.experimental.vector.reduce.add.i16.v8i16"]
    fn reduce_add_i16x8(x: i16x8) -> i16;
    #[link_name = "llvm.experimental.vector.reduce.add.u16.v8u16"]
    fn reduce_add_u16x8(x: u16x8) -> u16;
    #[link_name = "llvm.experimental.vector.reduce.add.i32.v8i32"]
    fn reduce_add_i32x8(x: i32x8) -> i32;
    #[link_name = "llvm.experimental.vector.reduce.add.u32.v8u32"]
    fn reduce_add_u32x8(x: u32x8) -> u32;
    #[link_name = "llvm.experimental.vector.reduce.add.i64.v8i64"]
    fn reduce_add_i64x8(x: i64x8) -> i64;
    #[link_name = "llvm.experimental.vector.reduce.add.u64.v8u64"]
    fn reduce_add_u64x8(x: u64x8) -> u64;
    #[link_name = "llvm.experimental.vector.reduce.add.i8.v16i8"]
    fn reduce_add_i8x16(x: i8x16) -> i8;
    #[link_name = "llvm.experimental.vector.reduce.add.u8.v16u8"]
    fn reduce_add_u8x16(x: u8x16) -> u8;
    #[link_name = "llvm.experimental.vector.reduce.add.i16.v16i16"]
    fn reduce_add_i16x16(x: i16x16) -> i16;
    #[link_name = "llvm.experimental.vector.reduce.add.u16.v16u16"]
    fn reduce_add_u16x16(x: u16x16) -> u16;
    #[link_name = "llvm.experimental.vector.reduce.add.i32.v16i32"]
    fn reduce_add_i32x16(x: i32x16) -> i32;
    #[link_name = "llvm.experimental.vector.reduce.add.u32.v16u32"]
    fn reduce_add_u32x16(x: u32x16) -> u32;
    #[link_name = "llvm.experimental.vector.reduce.add.i8.v32i8"]
    fn reduce_add_i8x32(x: i8x32) -> i8;
    #[link_name = "llvm.experimental.vector.reduce.add.u8.v32u8"]
    fn reduce_add_u8x32(x: u8x32) -> u8;
    #[link_name = "llvm.experimental.vector.reduce.add.i16.v32i16"]
    fn reduce_add_i16x32(x: i16x32) -> i16;
    #[link_name = "llvm.experimental.vector.reduce.add.u16.v32u16"]
    fn reduce_add_u16x32(x: u16x32) -> u16;
    #[link_name = "llvm.experimental.vector.reduce.add.i8.v64i8"]
    fn reduce_add_i8x64(x: i8x64) -> i8;
    #[link_name = "llvm.experimental.vector.reduce.add.u8.v64u8"]
    fn reduce_add_u8x64(x: u8x64) -> u8;
    #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v2f32"]
    fn reduce_fadd_f32x2(acc: f32, x: f32x2) -> f32;
    #[link_name = "llvm.experimental.vector.reduce.fadd.f64.v2f64"]
    fn reduce_fadd_f64x2(acc: f64, x: f64x2) -> f64;
    #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v4f32"]
    fn reduce_fadd_f32x4(acc: f32, x: f32x4) -> f32;
    #[link_name = "llvm.experimental.vector.reduce.fadd.f64.v4f64"]
    fn reduce_fadd_f64x4(acc: f64, x: f64x4) -> f64;
    #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v8f32"]
    fn reduce_fadd_f32x8(acc: f32, x: f32x8) -> f32;
    #[link_name = "llvm.experimental.vector.reduce.fadd.f64.v8f64"]
    fn reduce_fadd_f64x8(acc: f64, x: f64x8) -> f64;
    #[link_name = "llvm.experimental.vector.reduce.fadd.f32.v16f32"]
    fn reduce_fadd_f32x16(acc: f32, x: f32x16) -> f32;
}

/// Reduction: horizontal sum of the vector elements.
pub trait ReduceAdd {
    /// Result type of the reduction.
    type Acc;
    /// Computes the horizontal sum of the vector elements.
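    ///
    /// A minimal usage sketch (ignored doc-test: the portable vector
    /// types are unstable; assumes `i32x4` from `coresimd::simd` and
    /// this trait in scope):
    ///
    /// ```ignore
    /// let v = i32x4::new(1, 2, 3, 4);
    /// assert_eq!(v.reduce_add(), 10);
    /// ```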
    fn reduce_add(self) -> Self::Acc;
}

macro_rules! red_add {
    ($id:ident, $elem_ty:ident, $llvm_intr:ident) => {
        impl ReduceAdd for $id {
            type Acc = $elem_ty;
            #[cfg(not(target_arch = "aarch64"))]
            #[inline]
            fn reduce_add(self) -> Self::Acc {
                unsafe { $llvm_intr(self) }
            }
            // FIXME: the LLVM intrinsic is broken on AArch64, so fall
            // back to a scalar loop over the lanes there.
            #[cfg(target_arch = "aarch64")]
            #[inline]
            fn reduce_add(self) -> Self::Acc {
                let mut x = self.extract(0) as Self::Acc;
                for i in 1..$id::lanes() {
                    x += self.extract(i) as Self::Acc;
                }
                x
            }
        }
    };
}
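
// For illustration (an assumed expansion, not code from this module):
// `red_add!(i32x4, i32, reduce_add_i32x4)` below generates roughly the
// following impl on non-AArch64 targets:
//
//     impl ReduceAdd for i32x4 {
//         type Acc = i32;
//         #[inline]
//         fn reduce_add(self) -> Self::Acc {
//             unsafe { reduce_add_i32x4(self) }
//         }
//     }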
red_add!(i8x2, i8, reduce_add_i8x2);
red_add!(u8x2, u8, reduce_add_u8x2);
red_add!(i16x2, i16, reduce_add_i16x2);
red_add!(u16x2, u16, reduce_add_u16x2);
red_add!(i32x2, i32, reduce_add_i32x2);
red_add!(u32x2, u32, reduce_add_u32x2);
red_add!(i64x2, i64, reduce_add_i64x2);
red_add!(u64x2, u64, reduce_add_u64x2);
red_add!(i8x4, i8, reduce_add_i8x4);
red_add!(u8x4, u8, reduce_add_u8x4);
red_add!(i16x4, i16, reduce_add_i16x4);
red_add!(u16x4, u16, reduce_add_u16x4);
red_add!(i32x4, i32, reduce_add_i32x4);
red_add!(u32x4, u32, reduce_add_u32x4);
red_add!(i64x4, i64, reduce_add_i64x4);
red_add!(u64x4, u64, reduce_add_u64x4);
red_add!(i8x8, i8, reduce_add_i8x8);
red_add!(u8x8, u8, reduce_add_u8x8);
red_add!(i16x8, i16, reduce_add_i16x8);
red_add!(u16x8, u16, reduce_add_u16x8);
red_add!(i32x8, i32, reduce_add_i32x8);
red_add!(u32x8, u32, reduce_add_u32x8);
red_add!(i64x8, i64, reduce_add_i64x8);
red_add!(u64x8, u64, reduce_add_u64x8);
red_add!(i8x16, i8, reduce_add_i8x16);
red_add!(u8x16, u8, reduce_add_u8x16);
red_add!(i16x16, i16, reduce_add_i16x16);
red_add!(u16x16, u16, reduce_add_u16x16);
red_add!(i32x16, i32, reduce_add_i32x16);
red_add!(u32x16, u32, reduce_add_u32x16);
red_add!(i8x32, i8, reduce_add_i8x32);
red_add!(u8x32, u8, reduce_add_u8x32);
red_add!(i16x32, i16, reduce_add_i16x32);
red_add!(u16x32, u16, reduce_add_u16x32);
red_add!(i8x64, i8, reduce_add_i8x64);
red_add!(u8x64, u8, reduce_add_u8x64);

macro_rules! red_fadd {
    ($id:ident, $elem_ty:ident, $llvm_intr:ident) => {
        impl ReduceAdd for $id {
            type Acc = $elem_ty;
            #[inline]
            fn reduce_add(self) -> Self::Acc {
                // FIXME: this should forward to the LLVM intrinsic
                // instead of the scalar loop below:
                // unsafe { $llvm_intr(0. as $elem_ty, self) }
                let mut x = self.extract(0);
                for i in 1..$id::lanes() {
                    x += self.extract(i);
                }
                x
            }
        }
    };
}
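
// NOTE (a clarifying assumption, not from the original source): the
// scalar fallback above adds the lanes in order, which matches the
// sequential, accumulator-based semantics of the commented-out
// `llvm.experimental.vector.reduce.fadd` call; floating-point addition
// is not associative, so a tree-shaped reduction could produce
// slightly different results.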

red_fadd!(f32x2, f32, reduce_fadd_f32x2);
red_fadd!(f64x2, f64, reduce_fadd_f64x2);
red_fadd!(f32x4, f32, reduce_fadd_f32x4);
red_fadd!(f64x4, f64, reduce_fadd_f64x4);
red_fadd!(f32x8, f32, reduce_fadd_f32x8);
red_fadd!(f64x8, f64, reduce_fadd_f64x8);
red_fadd!(f32x16, f32, reduce_fadd_f32x16);

#[cfg(test)]
mod tests {
    use super::ReduceAdd;
    use coresimd::simd::*;

    // NOTE: these are smoke tests; the portable vector API tests
    // exercise all of the vector types.

    #[test]
    fn reduce_add_i32x4() {
        let v = i32x4::splat(1);
        assert_eq!(v.reduce_add(), 4_i32);
    }
    #[test]
    fn reduce_add_u32x4() {
        let v = u32x4::splat(1);
        assert_eq!(v.reduce_add(), 4_u32);
    }
    #[test]
    fn reduce_add_f32x4() {
        let v = f32x4::splat(1.);
        assert_eq!(v.reduce_add(), 4.);
    }
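    // Hedged consistency checks (added sketches, not from the original
    // test suite): `reduce_add` should agree with a scalar
    // left-to-right sum over the lanes, using `extract` and `lanes()`
    // as the reference, mirroring the fallback loops above.
    #[test]
    fn reduce_add_matches_scalar_loop_u8x16() {
        let v = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let mut expected = 0_u8;
        for i in 0..u8x16::lanes() {
            expected += v.extract(i);
        }
        assert_eq!(v.reduce_add(), expected); // 120, no wrap-around
    }
    #[test]
    fn reduce_add_matches_scalar_loop_f64x4() {
        let v = f64x4::new(0.5, 1.5, 2.5, 3.5);
        let mut expected = 0_f64;
        for i in 0..f64x4::lanes() {
            expected += v.extract(i);
        }
        assert_eq!(v.reduce_add(), expected); // ordered sum: exactly 8.0
    }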
}