#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::guts::{
    assemble_count, count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize,
    Job, Stride,
};
use crate::{Word, BLOCKBYTES, IV, SIGMA};
use arrayref::{array_refs, mut_array_refs};
use core::cmp;
use core::mem;

pub const DEGREE: usize = 2;

#[inline(always)]
unsafe fn loadu(src: *const [Word; DEGREE]) -> __m128i {
    // This is an unaligned load, so the pointer cast is allowed.
    _mm_loadu_si128(src as *const __m128i)
}

#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut [Word; DEGREE]) {
    // This is an unaligned store, so the pointer cast is allowed.
    _mm_storeu_si128(dest as *mut __m128i, src)
}

#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
    _mm_add_epi64(a, b)
}

#[inline(always)]
unsafe fn eq(a: __m128i, b: __m128i) -> __m128i {
    _mm_cmpeq_epi64(a, b)
}

#[inline(always)]
unsafe fn and(a: __m128i, b: __m128i) -> __m128i {
    _mm_and_si128(a, b)
}

#[inline(always)]
unsafe fn negate_and(a: __m128i, b: __m128i) -> __m128i {
    // Note that "and not" implies the reverse of the actual arg order.
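    // In other words, negate_and(a, b) computes (!a) & b.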
    _mm_andnot_si128(a, b)
}

#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
    _mm_xor_si128(a, b)
}

#[inline(always)]
unsafe fn set1(x: u64) -> __m128i {
    _mm_set1_epi64x(x as i64)
}

#[inline(always)]
unsafe fn set2(a: u64, b: u64) -> __m128i {
    // There's no _mm_setr_epi64x, so note the arg order is backwards.
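    // The result holds `a` in lane 0 (the low 64 bits) and `b` in lane 1.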
    _mm_set_epi64x(b as i64, a as i64)
}

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}

// These rotations are the "simple version". For the "complicated version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2b-common.h#L43-L46.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. In short:
// - This version performs better on modern x86 chips, Skylake and later.
// - LLVM is able to optimize this version to AVX-512 rotation instructions
//   when those are enabled.
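// Each rotN helper below rotates both u64 lanes right by N bits; a small
// sanity-check sketch against u64::rotate_right appears at the bottom of
// this file.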

#[inline(always)]
unsafe fn rot32(x: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi64(x, 32), _mm_slli_epi64(x, 64 - 32))
}

#[inline(always)]
unsafe fn rot24(x: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi64(x, 24), _mm_slli_epi64(x, 64 - 24))
}

#[inline(always)]
unsafe fn rot16(x: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi64(x, 16), _mm_slli_epi64(x, 64 - 16))
}

#[inline(always)]
unsafe fn rot63(x: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi64(x, 63), _mm_slli_epi64(x, 64 - 63))
}

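// One full BLAKE2b round over the transposed state. Each v[i] holds word i of
// the working state for both jobs, one word per 64-bit lane, so every
// operation below advances both hashes at once. The first half is the
// "column step" of the G function; the second half is the "diagonal step".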
#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
    v[0] = add(v[0], m[SIGMA[r][0] as usize]);
    v[1] = add(v[1], m[SIGMA[r][2] as usize]);
    v[2] = add(v[2], m[SIGMA[r][4] as usize]);
    v[3] = add(v[3], m[SIGMA[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot32(v[12]);
    v[13] = rot32(v[13]);
    v[14] = rot32(v[14]);
    v[15] = rot32(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot24(v[4]);
    v[5] = rot24(v[5]);
    v[6] = rot24(v[6]);
    v[7] = rot24(v[7]);
    v[0] = add(v[0], m[SIGMA[r][1] as usize]);
    v[1] = add(v[1], m[SIGMA[r][3] as usize]);
    v[2] = add(v[2], m[SIGMA[r][5] as usize]);
    v[3] = add(v[3], m[SIGMA[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot63(v[4]);
    v[5] = rot63(v[5]);
    v[6] = rot63(v[6]);
    v[7] = rot63(v[7]);

    v[0] = add(v[0], m[SIGMA[r][8] as usize]);
    v[1] = add(v[1], m[SIGMA[r][10] as usize]);
    v[2] = add(v[2], m[SIGMA[r][12] as usize]);
    v[3] = add(v[3], m[SIGMA[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot32(v[15]);
    v[12] = rot32(v[12]);
    v[13] = rot32(v[13]);
    v[14] = rot32(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot24(v[5]);
    v[6] = rot24(v[6]);
    v[7] = rot24(v[7]);
    v[4] = rot24(v[4]);
    v[0] = add(v[0], m[SIGMA[r][9] as usize]);
    v[1] = add(v[1], m[SIGMA[r][11] as usize]);
    v[2] = add(v[2], m[SIGMA[r][13] as usize]);
    v[3] = add(v[3], m[SIGMA[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot63(v[5]);
    v[6] = rot63(v[6]);
    v[7] = rot63(v[7]);
    v[4] = rot63(v[4]);
}

// We'd rather make this a regular function with #[inline(always)], but for
// some reason that blows up compile times by about 10 seconds, at least in
// some cases (BLAKE2b avx2.rs). This macro seems to get the same performance
// result, without the compile time issue.
macro_rules! compress2_transposed {
    (
        $h_vecs:expr,
        $msg_vecs:expr,
        $count_low:expr,
        $count_high:expr,
        $lastblock:expr,
        $lastnode:expr,
    ) => {
        let h_vecs: &mut [__m128i; 8] = $h_vecs;
        let msg_vecs: &[__m128i; 16] = $msg_vecs;
        let count_low: __m128i = $count_low;
        let count_high: __m128i = $count_high;
        let lastblock: __m128i = $lastblock;
        let lastnode: __m128i = $lastnode;
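        // Set up the 16-word working state: rows 0-7 come from the current
        // hash state, rows 8-15 from the IV, with the block counter and the
        // finalization flags XORed into rows 12-15, as BLAKE2b specifies.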
        let mut v = [
            h_vecs[0],
            h_vecs[1],
            h_vecs[2],
            h_vecs[3],
            h_vecs[4],
            h_vecs[5],
            h_vecs[6],
            h_vecs[7],
            set1(IV[0]),
            set1(IV[1]),
            set1(IV[2]),
            set1(IV[3]),
            xor(set1(IV[4]), count_low),
            xor(set1(IV[5]), count_high),
            xor(set1(IV[6]), lastblock),
            xor(set1(IV[7]), lastnode),
        ];

        round(&mut v, &msg_vecs, 0);
        round(&mut v, &msg_vecs, 1);
        round(&mut v, &msg_vecs, 2);
        round(&mut v, &msg_vecs, 3);
        round(&mut v, &msg_vecs, 4);
        round(&mut v, &msg_vecs, 5);
        round(&mut v, &msg_vecs, 6);
        round(&mut v, &msg_vecs, 7);
        round(&mut v, &msg_vecs, 8);
        round(&mut v, &msg_vecs, 9);
        round(&mut v, &msg_vecs, 10);
        round(&mut v, &msg_vecs, 11);

        h_vecs[0] = xor(xor(h_vecs[0], v[0]), v[8]);
        h_vecs[1] = xor(xor(h_vecs[1], v[1]), v[9]);
        h_vecs[2] = xor(xor(h_vecs[2], v[2]), v[10]);
        h_vecs[3] = xor(xor(h_vecs[3], v[3]), v[11]);
        h_vecs[4] = xor(xor(h_vecs[4], v[4]), v[12]);
        h_vecs[5] = xor(xor(h_vecs[5], v[5]), v[13]);
        h_vecs[6] = xor(xor(h_vecs[6], v[6]), v[14]);
        h_vecs[7] = xor(xor(h_vecs[7], v[7]), v[15]);
    };
}

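// Interleave the lanes of two vectors: given a = [a0, a1] and b = [b0, b1],
// return [[a0, b0], [a1, b1]]. This is the 2x2 transpose used to move between
// the per-job layout and the per-word (vectorized) layout.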
#[inline(always)]
unsafe fn transpose_vecs(a: __m128i, b: __m128i) -> [__m128i; DEGREE] {
    let a_words: [Word; DEGREE] = mem::transmute(a);
    let b_words: [Word; DEGREE] = mem::transmute(b);
    [set2(a_words[0], b_words[0]), set2(a_words[1], b_words[1])]
}

#[inline(always)]
unsafe fn transpose_state_vecs(jobs: &[Job; DEGREE]) -> [__m128i; 8] {
    // Load all the state words into transposed vectors, where the first vector
    // has the first word of each state, etc. Transposing once at the beginning
    // and once at the end is more efficient than repeating it for each block.
    let words0 = array_refs!(&jobs[0].words, DEGREE, DEGREE, DEGREE, DEGREE);
    let words1 = array_refs!(&jobs[1].words, DEGREE, DEGREE, DEGREE, DEGREE);
    let [h0, h1] = transpose_vecs(loadu(words0.0), loadu(words1.0));
    let [h2, h3] = transpose_vecs(loadu(words0.1), loadu(words1.1));
    let [h4, h5] = transpose_vecs(loadu(words0.2), loadu(words1.2));
    let [h6, h7] = transpose_vecs(loadu(words0.3), loadu(words1.3));
    [h0, h1, h2, h3, h4, h5, h6, h7]
}

#[inline(always)]
unsafe fn untranspose_state_vecs(h_vecs: &[__m128i; 8], jobs: &mut [Job; DEGREE]) {
    // Un-transpose the updated state vectors back into the caller's arrays.
    let [job0, job1] = jobs;
    let words0 = mut_array_refs!(&mut job0.words, DEGREE, DEGREE, DEGREE, DEGREE);
    let words1 = mut_array_refs!(&mut job1.words, DEGREE, DEGREE, DEGREE, DEGREE);

    let out = transpose_vecs(h_vecs[0], h_vecs[1]);
    storeu(out[0], words0.0);
    storeu(out[1], words1.0);
    let out = transpose_vecs(h_vecs[2], h_vecs[3]);
    storeu(out[0], words0.1);
    storeu(out[1], words1.1);
    let out = transpose_vecs(h_vecs[4], h_vecs[5]);
    storeu(out[0], words0.2);
    storeu(out[1], words1.2);
    let out = transpose_vecs(h_vecs[6], h_vecs[7]);
    storeu(out[0], words0.3);
    storeu(out[1], words1.3);
}

#[inline(always)]
unsafe fn transpose_msg_vecs(blocks: [*const [u8; BLOCKBYTES]; DEGREE]) -> [__m128i; 16] {
    // These input arrays have no particular alignment, so we use unaligned
    // loads to read from them.
    let block0 = blocks[0] as *const [Word; DEGREE];
    let block1 = blocks[1] as *const [Word; DEGREE];
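    // Each 128-byte block is read as eight 16-byte [Word; DEGREE] chunks, and
    // the corresponding chunks of the two blocks are interleaved into the
    // sixteen transposed message vectors m0..m15.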
    let [m0, m1] = transpose_vecs(loadu(block0.add(0)), loadu(block1.add(0)));
    let [m2, m3] = transpose_vecs(loadu(block0.add(1)), loadu(block1.add(1)));
    let [m4, m5] = transpose_vecs(loadu(block0.add(2)), loadu(block1.add(2)));
    let [m6, m7] = transpose_vecs(loadu(block0.add(3)), loadu(block1.add(3)));
    let [m8, m9] = transpose_vecs(loadu(block0.add(4)), loadu(block1.add(4)));
    let [m10, m11] = transpose_vecs(loadu(block0.add(5)), loadu(block1.add(5)));
    let [m12, m13] = transpose_vecs(loadu(block0.add(6)), loadu(block1.add(6)));
    let [m14, m15] = transpose_vecs(loadu(block0.add(7)), loadu(block1.add(7)));
    [
        m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15,
    ]
}

#[inline(always)]
unsafe fn load_counts(jobs: &[Job; DEGREE]) -> (__m128i, __m128i) {
    (
        set2(count_low(jobs[0].count), count_low(jobs[1].count)),
        set2(count_high(jobs[0].count), count_high(jobs[1].count)),
    )
}

#[inline(always)]
unsafe fn store_counts(jobs: &mut [Job; DEGREE], low: __m128i, high: __m128i) {
    let low_ints: [Word; DEGREE] = mem::transmute(low);
    let high_ints: [Word; DEGREE] = mem::transmute(high);
    for i in 0..DEGREE {
        jobs[i].count = assemble_count(low_ints[i], high_ints[i]);
    }
}

#[inline(always)]
unsafe fn add_to_counts(lo: &mut __m128i, hi: &mut __m128i, delta: __m128i) {
    // If the low counts reach zero, that means they wrapped, unless the delta
    // was also zero.
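    // Per 64-bit lane, this computes: hi += 1 exactly when the new lo is zero
    // and delta was nonzero, and hi += 0 otherwise.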
    *lo = add(*lo, delta);
    let lo_reached_zero = eq(*lo, set1(0));
    let delta_was_zero = eq(delta, set1(0));
    let hi_inc = and(set1(1), negate_and(delta_was_zero, lo_reached_zero));
    *hi = add(*hi, hi_inc);
}

#[inline(always)]
unsafe fn flags_vec(flags: [bool; DEGREE]) -> __m128i {
    set2(flag_word(flags[0]), flag_word(flags[1]))
}

#[target_feature(enable = "sse4.1")]
pub unsafe fn compress2_loop(jobs: &mut [Job; DEGREE], finalize: Finalize, stride: Stride) {
    // If we're not finalizing, there can't be a partial block at the end.
    for job in jobs.iter() {
        input_debug_asserts(job.input, finalize);
    }

    let msg_ptrs = [jobs[0].input.as_ptr(), jobs[1].input.as_ptr()];
    let mut h_vecs = transpose_state_vecs(&jobs);
    let (mut counts_lo, mut counts_hi) = load_counts(&jobs);

    // Prepare the final blocks (note that these could be empty, if the input
    // itself is empty). Do all this before entering the main loop.
    let min_len = jobs.iter().map(|job| job.input.len()).min().unwrap();
    let mut fin_offset = min_len.saturating_sub(1);
    fin_offset -= fin_offset % stride.padded_blockbytes();
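    // fin_offset is now the stride-aligned offset at which the shortest
    // input's final block (possibly partial, possibly empty) begins.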
    // Performance note: making these buffers mem::uninitialized() seems to
    // cause problems in the optimizer.
    let mut buf0: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
    let mut buf1: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
    let (block0, len0, finalize0) = final_block(jobs[0].input, fin_offset, &mut buf0, stride);
    let (block1, len1, finalize1) = final_block(jobs[1].input, fin_offset, &mut buf1, stride);
    let fin_blocks: [*const [u8; BLOCKBYTES]; DEGREE] = [block0, block1];
    let fin_counts_delta = set2(len0 as Word, len1 as Word);
    let fin_last_block;
    let fin_last_node;
    if finalize.yes() {
        fin_last_block = flags_vec([finalize0, finalize1]);
        fin_last_node = flags_vec([
            finalize0 && jobs[0].last_node.yes(),
            finalize1 && jobs[1].last_node.yes(),
        ]);
    } else {
        fin_last_block = set1(0);
        fin_last_node = set1(0);
    }

    // The main loop.
    let mut offset = 0;
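    // Each iteration compresses one block from each job in parallel. The last
    // iteration (offset == fin_offset) substitutes the prepared final blocks,
    // byte lengths, and finalization flags.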
    loop {
        let blocks;
        let counts_delta;
        let last_block;
        let last_node;
        if offset == fin_offset {
            blocks = fin_blocks;
            counts_delta = fin_counts_delta;
            last_block = fin_last_block;
            last_node = fin_last_node;
        } else {
            blocks = [
                msg_ptrs[0].add(offset) as *const [u8; BLOCKBYTES],
                msg_ptrs[1].add(offset) as *const [u8; BLOCKBYTES],
            ];
            counts_delta = set1(BLOCKBYTES as Word);
            last_block = set1(0);
            last_node = set1(0);
        };

        let m_vecs = transpose_msg_vecs(blocks);
        add_to_counts(&mut counts_lo, &mut counts_hi, counts_delta);
        compress2_transposed!(
            &mut h_vecs,
            &m_vecs,
            counts_lo,
            counts_hi,
            last_block,
            last_node,
        );

        // Check for termination before bumping the offset, to avoid overflow.
        if offset == fin_offset {
            break;
        }

        offset += stride.padded_blockbytes();
    }

    // Write out the results.
    untranspose_state_vecs(&h_vecs, &mut *jobs);
    store_counts(&mut *jobs, counts_lo, counts_hi);
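    // Advance each job's input past what was consumed. The min() accounts for
    // jobs whose final block at fin_offset was partial or empty.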
    let max_consumed = offset.saturating_add(stride.padded_blockbytes());
    for job in jobs.iter_mut() {
        let consumed = cmp::min(max_consumed, job.input.len());
        job.input = &job.input[consumed..];
    }
}
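
// A minimal sanity-check sketch (not part of the upstream test suite),
// assuming the build target supports the SSE2 intrinsics used above: it
// checks the lane-wise rotations against u64::rotate_right and the 2x2
// transpose against a hand-written expectation.
#[cfg(test)]
mod sketch_tests {
    use super::*;

    #[test]
    fn rotations_match_scalar_rotate_right() {
        unsafe {
            let a: Word = 0x0123_4567_89ab_cdef;
            let b: Word = 0xfedc_ba98_7654_3210;
            let x = set2(a, b);
            let cases: [(__m128i, u32); 4] =
                [(rot32(x), 32), (rot24(x), 24), (rot16(x), 16), (rot63(x), 63)];
            for &(rotated, n) in cases.iter() {
                // Both lanes should match the scalar rotation of their input.
                let lanes: [Word; DEGREE] = mem::transmute(rotated);
                assert_eq!(lanes[0], a.rotate_right(n));
                assert_eq!(lanes[1], b.rotate_right(n));
            }
        }
    }

    #[test]
    fn transpose_vecs_interleaves_lanes() {
        unsafe {
            // transpose_vecs([1, 2], [3, 4]) should yield [1, 3] and [2, 4].
            let [t0, t1] = transpose_vecs(set2(1, 2), set2(3, 4));
            let t0_lanes: [Word; DEGREE] = mem::transmute(t0);
            let t1_lanes: [Word; DEGREE] = mem::transmute(t1);
            assert_eq!(t0_lanes, [1, 3]);
            assert_eq!(t1_lanes, [2, 4]);
        }
    }
}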