]> git.proxmox.com Git - ceph.git/blob - ceph/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / mh_sha1_murmur3_x64_128 / mh_sha1_murmur3_x64_128_block_avx2.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
;; code to compute mh_sha1 (16 interleaved SHA-1 segments) using AVX2,
;; with murmur3_x64_128 computation interleaved into the scalar pipeline
;;
32
33 %include "reg_sizes.asm"
34 default rel
35
36 ;; Magic functions defined in FIPS 180-1
37 ;;
;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (D ^ (B & (C ^ D)))
; SHA-1 "Ch" function for rounds 0-19, computed per 32-bit lane across a ymm.
; regT is accepted for interface parity with MAGIC_F2 but is not used here.
%macro MAGIC_F0 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
	vpxor  %%regF, %%regC,%%regD		; F = C ^ D
	vpand  %%regF, %%regF,%%regB		; F = B & (C ^ D)
	vpxor  %%regF, %%regF,%%regD		; F = D ^ (B & (C ^ D))
%endmacro
49
;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
; SHA-1 "Parity" function for rounds 20-39 (and reused for 60-79 via MAGIC_F3).
; regT is accepted for interface parity with MAGIC_F2 but is not used here.
%macro MAGIC_F1 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
	vpxor  %%regF,%%regD,%%regC		; F = C ^ D
	vpxor  %%regF,%%regF,%%regB		; F = B ^ C ^ D
%endmacro
60
61
62
;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
; SHA-1 "Maj" function for rounds 40-59, using the equivalent form
; ((B | C) & D) | (B & C).  regT is clobbered as scratch.
%macro MAGIC_F2 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
	vpor   %%regF,%%regB,%%regC		; F = B | C
	vpand  %%regT,%%regB,%%regC		; T = B & C
	vpand  %%regF,%%regF,%%regD		; F = (B | C) & D
	vpor   %%regF,%%regF,%%regT		; F = ((B | C) & D) | (B & C)
%endmacro
75
;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
; SHA-1 function for rounds 60-79: identical to the Parity function of
; rounds 20-39, so simply delegate to MAGIC_F1.
%macro MAGIC_F3 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
	MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
%endmacro
85
; PROLD reg, imm, tmp
; In-place rotate-left of each 32-bit lane of reg by imm bits; tmp is clobbered.
%macro PROLD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
	vpsrld %%tmp, %%reg, (32-%%imm)		; tmp = reg >> (32-imm)
	vpslld %%reg, %%reg, %%imm		; reg = reg << imm
	vpor   %%reg, %%reg, %%tmp		; reg = ROL32(reg, imm)
%endmacro
95
; PROLD_nd reg, imm, tmp, src
; Non-destructive rotate-left: reg = ROL32(src, imm) per 32-bit lane.
; src is preserved; tmp is clobbered.
%macro PROLD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
	vpsrld %%tmp, %%src, (32-%%imm)		; tmp = src >> (32-imm)
	vpslld %%reg, %%src, %%imm		; reg = src << imm
	vpor   %%reg, %%reg, %%tmp		; reg = ROL32(src, imm)
%endmacro
106
; One SHA-1 round for rounds 0..15, performed on 8 segments in parallel
; (one 32-bit lane per segment).  Computes:
;   E += K + W[i] + ROL(A,5) + MAGIC(B,C,D);  B = ROL(B,30)
; Caller invokes ROTATE_ARGS afterwards to rename A..E for the next round.
%macro SHA1_STEP_00_15 11
%define %%regA	%1
%define %%regB	%2
%define %%regC	%3
%define %%regD	%4
%define %%regE	%5
%define %%regT	%6
%define %%regF	%7
%define %%memW	%8
%define %%immCNT %9
%define %%MAGIC	%10
%define %%data	%11
	vpaddd	%%regE, %%regE,%%immCNT			; E += K round constant
	vpaddd	%%regE, %%regE,[%%data + (%%memW * 32)]	; E += W[i] (32 bytes = 8 segments)
	PROLD_nd	%%regT,5, %%regF,%%regA		; T = ROL(A,5)
	vpaddd	%%regE, %%regE,%%regT			; E += ROL(A,5)
	%%MAGIC	%%regF,%%regB,%%regC,%%regD,%%regT	;; FUN = MAGIC_Fi(B,C,D)
	PROLD	%%regB,30, %%regT			; B = ROL(B,30)
	vpaddd	%%regE, %%regE,%%regF			; E += F(B,C,D)
%endmacro
127 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
128 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; One SHA-1 round for rounds 16..79 on 8 parallel segments, including the
; message-schedule expansion kept in a 16-entry circular buffer at %%data:
;   W[i] = ROL(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
; W16/W15/W14 are ymm registers holding the most recent schedule words and
; are renamed each round via ROTATE_W.  Round update is the same as
; SHA1_STEP_00_15; caller invokes ROTATE_ARGS afterwards.
%macro SHA1_STEP_16_79 11
%define %%regA	%1
%define %%regB	%2
%define %%regC	%3
%define %%regD	%4
%define %%regE	%5
%define %%regT	%6
%define %%regF	%7
%define %%memW	%8
%define %%immCNT %9
%define %%MAGIC	%10
%define %%data	%11
	vpaddd	%%regE, %%regE,%%immCNT			; E += K round constant

	vmovdqa	W14, [%%data + ((%%memW - 14) & 15) * 32]
	vpxor	W16, W16, W14				; W[i-16] ^ W[i-14]
	vpxor	W16, W16, [%%data + ((%%memW - 8) & 15) * 32]	; ^ W[i-8]
	vpxor	W16, W16, [%%data + ((%%memW - 3) & 15) * 32]	; ^ W[i-3]

	vpsrld	%%regF, W16, (32-1)			; open-coded ROL by 1
	vpslld	W16, W16, 1
	vpor	%%regF, %%regF, W16			; F = new W[i]
	ROTATE_W					; rename W14/W15/W16

	vmovdqa	[%%data + ((%%memW - 0) & 15) * 32],%%regF	; store W[i] in ring
	vpaddd	%%regE, %%regE,%%regF			; E += W[i]

	PROLD_nd	%%regT,5, %%regF, %%regA	; T = ROL(A,5)
	vpaddd	%%regE, %%regE,%%regT			; E += ROL(A,5)
	%%MAGIC	%%regF,%%regB,%%regC,%%regD,%%regT	;; FUN = MAGIC_Fi(B,C,D)
	PROLD	%%regB,30, %%regT			; B = ROL(B,30)
	vpaddd	%%regE,%%regE,%%regF			; E += F(B,C,D)
%endmacro
162
;; Insert murmur's instructions into this macro.
;; Every section_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 512Byte.
;; So insert 1 murmur block into every 2 SHA1_STEP_16_79.
;; SHA1_STEP_16_79(J) dispatches to SHA1_STEP_16_79_0 (even rounds) or
;; SHA1_STEP_16_79_1 (odd rounds); together one pair consumes one 16-byte
;; murmur3_x64_128 input block on the scalar execution ports while the
;; vector ports run SHA-1.
%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J

; Same SHA-1 round as SHA1_STEP_16_79, with the first half of a
; murmur3_x64_128 block interleaved: load k1/k2 and apply
; k1 = ROL(k1*C1, R1)*C2 and k2 = ROL(k2*C2, R2)*C1.
%macro SHA1_STEP_16_79_0 11
%define %%regA	%1
%define %%regB	%2
%define %%regC	%3
%define %%regD	%4
%define %%regE	%5
%define %%regT	%6
%define %%regF	%7
%define %%memW	%8
%define %%immCNT %9
%define %%MAGIC	%10
%define %%data	%11
	vpaddd	%%regE, %%regE,%%immCNT			; E += K round constant

	vmovdqa	W14, [%%data + ((%%memW - 14) & 15) * 32]
	vpxor	W16, W16, W14
	vpxor	W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
	vpxor	W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
	mov	mur_data1, [mur_in_p]			; murmur k1 = next 8 input bytes
	mov	mur_data2, [mur_in_p + 8]		; murmur k2 = following 8 bytes

	vpsrld	%%regF, W16, (32-1)
	imul	mur_data1, mur_c1_r			; k1 *= C1
	vpslld	W16, W16, 1
	vpor	%%regF, %%regF, W16
	imul	mur_data2, mur_c2_r			; k2 *= C2
	ROTATE_W

	vmovdqa	[%%data + ((%%memW - 0) & 15) * 32],%%regF
	rol	mur_data1, R1				; k1 = ROL(k1, 31)
	vpaddd	%%regE, %%regE,%%regF
	rol	mur_data2, R2				; k2 = ROL(k2, 33)
	PROLD_nd	%%regT,5, %%regF, %%regA
	vpaddd	%%regE, %%regE,%%regT
	imul	mur_data1, mur_c2_r			; k1 *= C2
	%%MAGIC	%%regF,%%regB,%%regC,%%regD,%%regT	;; FUN = MAGIC_Fi(B,C,D)
	PROLD	%%regB,30, %%regT
	imul	mur_data2, mur_c1_r			; k2 *= C1
	vpaddd	%%regE,%%regE,%%regF
%endmacro
208
209
; Same SHA-1 round as SHA1_STEP_16_79, with the second half of a
; murmur3_x64_128 block interleaved (k1/k2 were prepared by
; SHA1_STEP_16_79_0):
;   h1 = (ROL(h1 ^ k1, R3) + h2)*5 + N1
;   h2 = (ROL(h2 ^ k2, R4) + h1)*5 + N2
; and finally advances mur_in_p past the consumed 16-byte block.
%macro SHA1_STEP_16_79_1 11
%define %%regA	%1
%define %%regB	%2
%define %%regC	%3
%define %%regD	%4
%define %%regE	%5
%define %%regT	%6
%define %%regF	%7
%define %%memW	%8
%define %%immCNT %9
%define %%MAGIC	%10
%define %%data	%11
	vpaddd	%%regE, %%regE,%%immCNT
	xor	mur_hash1, mur_data1			; h1 ^= k1
	vmovdqa	W14, [%%data + ((%%memW - 14) & 15) * 32]
	rol	mur_hash1, R3				; h1 = ROL(h1, 27)
	vpxor	W16, W16, W14
	add	mur_hash1, mur_hash2			; h1 += h2
	vpxor	W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
	vpxor	W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
	lea	mur_hash1, [mur_hash1 + mur_hash1*4 + N1]	; h1 = h1*5 + N1
	vpsrld	%%regF, W16, (32-1)
	vpslld	W16, W16, 1
	xor	mur_hash2, mur_data2			; h2 ^= k2
	vpor	%%regF, %%regF, W16
	rol	mur_hash2, R4				; h2 = ROL(h2, 31)
	ROTATE_W

	vmovdqa	[%%data + ((%%memW - 0) & 15) * 32],%%regF
	vpaddd	%%regE, %%regE,%%regF
	add	mur_hash2, mur_hash1			; h2 += h1
	PROLD_nd	%%regT,5, %%regF, %%regA
	vpaddd	%%regE, %%regE,%%regT
	lea	mur_hash2, [mur_hash2 + mur_hash2*4 + N2]	; h2 = h2*5 + N2
	%%MAGIC	%%regF,%%regB,%%regC,%%regD,%%regT	;; FUN = MAGIC_Fi(B,C,D)
	PROLD	%%regB,30, %%regT
	add	mur_in_p, 16				; consumed one murmur block
	vpaddd	%%regE,%%regE,%%regF
%endmacro
249
250 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
251 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
252 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; ABI selection: map logical arg/tmp names onto the platform's registers.
%ifidn __OUTPUT_FORMAT__, elf64
; Linux (System V AMD64): args in rdi,rsi,rdx,rcx,r8,r9; rbx,rbp,r12-r15
; are callee-saved.
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx

 %define arg4  r8d			; num_blocks is a uint32_t, use 32-bit reg
 %define arg5  r9

 %define tmp1  r10
 %define tmp2  r11
 %define tmp3  r12		; must be saved and restored
 %define tmp4  r13		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define tmp7  rbx		; must be saved and restored
 %define tmp8  rbp		; must be saved and restored
 %define return rax

 %define func(x) x:
 %macro FUNC_SAVE 0
	; preserve every callee-saved GPR this routine uses as tmp3..tmp8
	push	r12
	push	r13
	push	r14
	push	r15
	push	rbx
	push	rbp
 %endmacro
 %macro FUNC_RESTORE 0
	pop	rbp
	pop	rbx
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%else
; Windows x64: args in rcx,rdx,r8,r9; arg4 arrives on the stack;
; rbx,rbp,rdi,rsi,r12-r15 and xmm6-xmm15 are callee-saved.
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r10d			; loaded from the stack in FUNC_SAVE
 %define arg5   r11
 %define tmp1   r12		; must be saved and restored
 %define tmp2   r13		; must be saved and restored
 %define tmp3   r14		; must be saved and restored
 %define tmp4   r15		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define tmp7   rbx		; must be saved and restored
 %define tmp8   rbp		; must be saved and restored
 %define return rax

 %define stack_size  10*16 + 9*8		; must be an odd multiple of 8
 %define PS 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	; xmm6-xmm15 are callee-saved on Win64 and clobbered by this routine
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_xmm128	xmm15, 9*16
	save_reg	r12,  10*16 + 0*8
	save_reg	r13,  10*16 + 1*8
	save_reg	r14,  10*16 + 2*8
	save_reg	r15,  10*16 + 3*8
	save_reg	rdi,  10*16 + 4*8
	save_reg	rsi,  10*16 + 5*8
	save_reg	rbx,  10*16 + 6*8
	save_reg	rbp,  10*16 + 7*8
	end_prolog
	mov	arg4, arg(4)			; fetch 5th argument (num_blocks)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp + 0*16]
	movdqa	xmm7, [rsp + 1*16]
	movdqa	xmm8, [rsp + 2*16]
	movdqa	xmm9, [rsp + 3*16]
	movdqa	xmm10, [rsp + 4*16]
	movdqa	xmm11, [rsp + 5*16]
	movdqa	xmm12, [rsp + 6*16]
	movdqa	xmm13, [rsp + 7*16]
	movdqa	xmm14, [rsp + 8*16]
	movdqa	xmm15, [rsp + 9*16]
	mov	r12,  [rsp + 10*16 + 0*8]
	mov	r13,  [rsp + 10*16 + 1*8]
	mov	r14,  [rsp + 10*16 + 2*8]
	mov	r15,  [rsp + 10*16 + 3*8]
	mov	rdi,  [rsp + 10*16 + 4*8]
	mov	rsi,  [rsp + 10*16 + 5*8]
	mov	rbx,  [rsp + 10*16 + 6*8]
	mov	rbp,  [rsp + 10*16 + 7*8]
	add	rsp, stack_size
 %endmacro
%endif
359 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Logical names for the registers used by the two interleaved hashes.
%define loops 		arg4
;variables of mh_sha1
%define mh_in_p  	arg0		; input data pointer
%define mh_digests_p 	arg1		; uint32_t digests[5][16]
%define mh_data_p	arg2		; 1KB aligned big-endian frame buffer
%define mh_segs  	tmp1		; segment-group offset: 0 or 32
;variables of murmur3
%define mur_in_p  	tmp2		; murmur input pointer (trails mh_in_p)
%define mur_digest_p 	arg3		; uint64_t h1/h2 digest pointer
%define mur_hash1 	tmp3		; h1
%define mur_hash2 	tmp4		; h2
%define mur_data1 	tmp5		; k1
%define mur_data2 	return		; k2 (rax is free until the final return)
%define mur_c1_r  	tmp6		; C1 kept in a register for imul
%define mur_c2_r  	arg5		; C2 kept in a register for imul
; constants of murmur3_x64_128 (rotate amounts, finalization adds, multipliers)
%define R1  31
%define R2  33
%define R3  27
%define R4  31
%define M   5
%define N1  0x52dce729;DWORD
%define N2  0x38495ab5;DWORD
%define C1  QWORD(0x87c37b91114253d5)
%define C2  QWORD(0x4cf5ad432745937f)
;variables used by storing segs_digests on stack
%define RSP_SAVE	tmp7
%define FRAMESZ	4*5*16 ;BYTES*DWORDS*SEGS

%define pref	tmp8			; rolling prefetch offset into the input
%macro PREFETCH_X 1
%define %%mem  %1
	; non-temporal prefetch: input is streamed once, keep it out of the caches
	prefetchnta %%mem
%endmacro
394 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define VMOVPS	vmovups			; unaligned loads/stores of caller memory

; SHA-1 working state, one 32-bit lane per segment (8 segments per ymm).
%define A	ymm0
%define B	ymm1
%define C	ymm2
%define D	ymm3
%define E	ymm4

; Scratch names used during the endian-swap copy phase ...
%define F	ymm5
%define T0	ymm6
%define T1	ymm7
%define T2	ymm8
%define T3	ymm9
%define T4	ymm10
%define T5	ymm11
%define T6	ymm12
%define T7	ymm13
%define T8	ymm14
%define T9	ymm15

; ... and their aliases during the round phase: AA..EE hold the previous
; digest for the final feed-forward add; W14/W15/W16 hold the newest
; message-schedule words (renamed each round by ROTATE_W).
%define AA	ymm5
%define BB	ymm6
%define CC	ymm7
%define DD	ymm8
%define EE	ymm9
%define TMP	ymm10
%define FUN	ymm11
%define K	ymm12
%define W14	ymm13
%define W15	ymm14
%define W16	ymm15


; Rename the five state registers after each round (compile-time register
; rotation instead of moving data): A->B->C->D->E->A.
%macro ROTATE_ARGS 0
%xdefine TMP_ E
%xdefine E D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm

; Rename the schedule registers after each round: W16->W15->W14->W16.
%macro ROTATE_W 0
%xdefine TMP_ W16
%xdefine W16 W15
%xdefine W15 W14
%xdefine W14 TMP_
%endm
443
444
445 ;init hash digests
446 ; segs_digests:low addr-> high_addr
447 ; a | b | c | ...| p | (16)
448 ; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
449 ; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
450 ; ....
451 ; h5 | h5 | h5 | ...| h5 | | Ea| Eb | Ec |...| Ep |
452
align 32

;void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data,
;		uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
;		uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
;		uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
;		uint32_t num_blocks);
; arg 0 pointer to input data
; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
; arg 3 pointer to murmur3 digest
; arg 4 number of 1KB blocks
;
; Each 1KB block is hashed twice: 16 interleaved SHA-1 segments (8 per ymm,
; two passes of .segs_loop) and murmur3_x64_128 on the same bytes, with the
; murmur scalar ops woven into the SHA-1 rounds 16..79 (one 16-byte murmur
; block per SHA1_STEP_16_79_0/_1 pair; 64 pairs = 1KB per .block_loop).
global mh_sha1_murmur3_x64_128_block_avx2:function internal
func(mh_sha1_murmur3_x64_128_block_avx2)
	FUNC_SAVE

	; save rsp
	mov	RSP_SAVE, rsp

	cmp	loops, 0
	jle	.return

	; leave enough space to store segs_digests
	sub     rsp, FRAMESZ
	; align rsp to 32 Bytes needed by avx2
	and	rsp, ~0x1F

 %assign I 0					; copy segs_digests into stack
 %rep 2
	VMOVPS  A, [mh_digests_p + I*32*5 + 32*0]
	VMOVPS  B, [mh_digests_p + I*32*5 + 32*1]
	VMOVPS  C, [mh_digests_p + I*32*5 + 32*2]
	VMOVPS  D, [mh_digests_p + I*32*5 + 32*3]
	VMOVPS  E, [mh_digests_p + I*32*5 + 32*4]

	vmovdqa [rsp + I*32*5 + 32*0], A
	vmovdqa [rsp + I*32*5 + 32*1], B
	vmovdqa [rsp + I*32*5 + 32*2], C
	vmovdqa [rsp + I*32*5 + 32*3], D
	vmovdqa [rsp + I*32*5 + 32*4], E
 %assign I (I+1)
 %endrep

	;init murmur variables
	mov	mur_in_p, mh_in_p		;different steps between murmur and mh_sha1
	;load murmur hash digests and multiplier
	mov	mur_hash1, [mur_digest_p]
	mov	mur_hash2, [mur_digest_p + 8]
	mov	mur_c1_r, C1
	mov	mur_c2_r, C2

.block_loop:
	;transform to big-endian data and store on aligned_frame
	vmovdqa  F, [PSHUFFLE_BYTE_FLIP_MASK]
	;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
 %assign I 0
 %rep 16
	VMOVPS   T0,[mh_in_p + I*64+0*32]
	VMOVPS   T1,[mh_in_p + I*64+1*32]

	vpshufb  T0, T0, F			; byte-swap each dword to big-endian
	vmovdqa  [mh_data_p +I*32+0*512],T0
	vpshufb  T1, T1, F
	vmovdqa  [mh_data_p +I*32+1*512],T1
 %assign I (I+1)
 %endrep

	mov	mh_segs, 0			;start from the first 8 segments
	mov	pref, 1024			;avoid prefetching repeatedly
.segs_loop:
	;; Initialize digests for this group of 8 segments
	vmovdqa  A, [rsp + 0*64 + mh_segs]
	vmovdqa  B, [rsp + 1*64 + mh_segs]
	vmovdqa  C, [rsp + 2*64 + mh_segs]
	vmovdqa  D, [rsp + 3*64 + mh_segs]
	vmovdqa  E, [rsp + 4*64 + mh_segs]

	; keep a copy for the final feed-forward addition
	vmovdqa  AA, A
	vmovdqa  BB, B
	vmovdqa  CC, C
	vmovdqa  DD, D
	vmovdqa  EE, E
;;
;; perform 0-79 steps
;;
	vmovdqa  K, [K00_19]
;; do rounds 0...15
 %assign I 0
 %rep 16
	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
	ROTATE_ARGS
 %assign I (I+1)
 %endrep

;; do rounds 16...19
	; seed the rolling schedule registers: W16 = W[0], W15 = W[1]
	vmovdqa  W16, [mh_data_p + ((16 - 16) & 15) * 32]
	vmovdqa  W15, [mh_data_p + ((16 - 15) & 15) * 32]
 %rep 4
 %assign J (I % 2)
	SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
	ROTATE_ARGS
 %assign I (I+1)
 %endrep
	PREFETCH_X [mh_in_p + pref+128*0]
	PREFETCH_X [mh_in_p + pref+128*1]
;; do rounds 20...39
	vmovdqa  K, [K20_39]
 %rep 20
 %assign J (I % 2)
	SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
	ROTATE_ARGS
 %assign I (I+1)
 %endrep
;; do rounds 40...59
	vmovdqa  K, [K40_59]
 %rep 20
 %assign J (I % 2)
	SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
	ROTATE_ARGS
 %assign I (I+1)
 %endrep
	PREFETCH_X [mh_in_p + pref+128*2]
	PREFETCH_X [mh_in_p + pref+128*3]
;; do rounds 60...79
	vmovdqa  K, [K60_79]
 %rep 20
 %assign J (I % 2)
	SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
	ROTATE_ARGS
 %assign I (I+1)
 %endrep

	; feed-forward: add the starting digest back in
	vpaddd   A,A, AA
	vpaddd   B,B, BB
	vpaddd   C,C, CC
	vpaddd   D,D, DD
	vpaddd   E,E, EE

	; write out digests
	vmovdqa  [rsp + 0*64 + mh_segs], A
	vmovdqa  [rsp + 1*64 + mh_segs], B
	vmovdqa  [rsp + 2*64 + mh_segs], C
	vmovdqa  [rsp + 3*64 + mh_segs], D
	vmovdqa  [rsp + 4*64 + mh_segs], E

	add	pref, 512

	add	mh_data_p, 512			; second half of the frame buffer
	add	mh_segs, 32			; next group of 8 segments
	cmp	mh_segs, 64
	jc	.segs_loop

	sub	mh_data_p, (1024)
	add	mh_in_p,   (1024)
	sub	loops, 1
	jne	.block_loop

	;store murmur-hash digest
	mov	[mur_digest_p], mur_hash1
	mov	[mur_digest_p + 8], mur_hash2

 %assign I 0					; copy segs_digests back to mh_digests_p
 %rep 2
	vmovdqa  A, [rsp + I*32*5 + 32*0]
	vmovdqa  B, [rsp + I*32*5 + 32*1]
	vmovdqa  C, [rsp + I*32*5 + 32*2]
	vmovdqa  D, [rsp + I*32*5 + 32*3]
	vmovdqa  E, [rsp + I*32*5 + 32*4]

	VMOVPS  [mh_digests_p + I*32*5 + 32*0], A
	VMOVPS  [mh_digests_p + I*32*5 + 32*1], B
	VMOVPS  [mh_digests_p + I*32*5 + 32*2], C
	VMOVPS  [mh_digests_p + I*32*5 + 32*3], D
	VMOVPS  [mh_digests_p + I*32*5 + 32*4], E
 %assign I (I+1)
 %endrep
	mov	rsp, RSP_SAVE			; restore rsp

.return:
	FUNC_RESTORE
	ret

endproc_frame
636
section .data align=32

align 32
; dword-granularity byte-reverse shuffle mask for vpshufb (little- to big-endian)
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
			 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; SHA-1 round constants K, broadcast to all eight 32-bit lanes
K00_19:			dq 0x5A8279995A827999, 0x5A8279995A827999
			dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39:			dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
			dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59:			dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
			dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79:			dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
			dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6