;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; code to compute 16 SHA256 digests using AVX2
;;

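;; Overview (a reading aid for the code below): each 1KB input block holds
;; one 64-byte SHA256 block for every one of the 16 interleaved segments.
;; After a byte-swap pass into the frame buffer, the 64 SHA256 rounds are
;; run twice per block, 8 segments at a time, with one 32-bit ymm lane per
;; segment.
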
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifidn __OUTPUT_FORMAT__, elf64
 ; Linux
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx

 %define arg4   r8
 %define arg5   r9

 %define tmp1   r10
 %define tmp2   r11
 %define tmp3   r12            ; must be saved and restored
 %define tmp4   r13            ; must be saved and restored
 %define tmp5   r14            ; must be saved and restored
 %define tmp6   r15            ; must be saved and restored
 %define return rax

 %define func(x) x:
 %macro FUNC_SAVE 0
        push    r12
        push    r13
        push    r14
        push    r15
 %endmacro
 %macro FUNC_RESTORE 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
 %endmacro
%else
 ; Windows
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r10
 %define arg5   r11
 %define tmp1   r12            ; must be saved and restored
 %define tmp2   r13            ; must be saved and restored
 %define tmp3   r14            ; must be saved and restored
 %define tmp4   r15            ; must be saved and restored
 %define tmp5   rdi            ; must be saved and restored
 %define tmp6   rsi            ; must be saved and restored
 %define return rax

 %define stack_size 10*16 + 7*8         ; must be an odd multiple of 8, so the
                                        ; alloc_stack re-aligns rsp to 16 bytes
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
        alloc_stack     stack_size
        save_xmm128     xmm6,  0*16
        save_xmm128     xmm7,  1*16
        save_xmm128     xmm8,  2*16
        save_xmm128     xmm9,  3*16
        save_xmm128     xmm10, 4*16
        save_xmm128     xmm11, 5*16
        save_xmm128     xmm12, 6*16
        save_xmm128     xmm13, 7*16
        save_xmm128     xmm14, 8*16
        save_xmm128     xmm15, 9*16
        save_reg        r12, 10*16 + 0*8
        save_reg        r13, 10*16 + 1*8
        save_reg        r14, 10*16 + 2*8
        save_reg        r15, 10*16 + 3*8
        save_reg        rdi, 10*16 + 4*8
        save_reg        rsi, 10*16 + 5*8
        end_prolog
 %endmacro

 %macro FUNC_RESTORE 0
        movdqa  xmm6,  [rsp + 0*16]
        movdqa  xmm7,  [rsp + 1*16]
        movdqa  xmm8,  [rsp + 2*16]
        movdqa  xmm9,  [rsp + 3*16]
        movdqa  xmm10, [rsp + 4*16]
        movdqa  xmm11, [rsp + 5*16]
        movdqa  xmm12, [rsp + 6*16]
        movdqa  xmm13, [rsp + 7*16]
        movdqa  xmm14, [rsp + 8*16]
        movdqa  xmm15, [rsp + 9*16]
        mov     r12, [rsp + 10*16 + 0*8]
        mov     r13, [rsp + 10*16 + 1*8]
        mov     r14, [rsp + 10*16 + 2*8]
        mov     r15, [rsp + 10*16 + 3*8]
        mov     rdi, [rsp + 10*16 + 4*8]
        mov     rsi, [rsp + 10*16 + 5*8]
        add     rsp, stack_size
 %endmacro
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define loops           arg3
; variables of mh_sha256
%define mh_in_p         arg0
%define mh_digests_p    arg1
%define mh_data_p       arg2
%define mh_segs         tmp1
; variables used for storing segs_digests on the stack
%define RSP_SAVE        tmp2
%define FRAMESZ         4*8*16          ; BYTES*DWORDS*SEGS
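; FRAMESZ works out to 4 bytes/word * 8 digest words * 16 segments = 512
; bytes: one full copy of segs_digests kept on the stack.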

; Common definitions
%define ROUND   tmp4
%define TBL     tmp5

%define pref    tmp3
%macro PREFETCH_X 1
 %define %%mem  %1
        prefetchnta %%mem
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define VMOVPS  vmovups

%define SZ      4
%define SZ8     8*SZ
%define ROUNDS  64*SZ8
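; SZ8 = 32 bytes: one ymm register holding the same word for 8 segments.
; The TABLE of round constants below replicates each 4-byte K value 8
; times, so ROUND advances by SZ8 per round and [TBL + ROUND] broadcasts
; K across all 8 lanes.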

%define a ymm0
%define b ymm1
%define c ymm2
%define d ymm3
%define e ymm4
%define f ymm5
%define g ymm6
%define h ymm7

%define a0 ymm8
%define a1 ymm9
%define a2 ymm10

%define TT0 ymm14
%define TT1 ymm13
%define TT2 ymm12
%define TT3 ymm11
%define TT4 ymm10
%define TT5 ymm9

%define T1  ymm14
%define TMP ymm15

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpslld  %%tmp, %%reg, (32-(%%imm))
        vpsrld  %%reg, %%reg, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro
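; PRORD is a rotate-right of each 32-bit lane, built from two shifts and an
; OR because AVX2 has no packed dword rotate:
;       ROTR(x, n) = (x << (32-n)) | (x >> n)
; For example, with %%imm = 6 a lane holding 0x00000040 becomes 0x00000001.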

; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
        vpslld  %%tmp, %%src, (32-(%%imm))
        vpsrld  %%reg, %%src, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; PRORD dst/src, amt
%macro PRORD 2
        PRORD   %1, %2, TMP
%endmacro

; PRORD_nd dst, src, amt
%macro PRORD_nd 3
        PRORD_nd %1, %3, TMP, %2
%endmacro
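; NASM selects between same-named multi-line macros by argument count, so
; the 2-operand "PRORD dst, amt" and 3-operand "PRORD_nd dst, src, amt"
; wrappers above coexist with the core 3- and 4-operand definitions; the
; wrappers simply supply TMP as the scratch register.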

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15_R 3
%define %%T1   %1
%define %%i    %2
%define %%data %3

        PRORD_nd a0, e, (11-6)          ; sig1: a0 = (e >> 5)

        vpxor   a2, f, g                ; ch: a2 = f^g
        vpand   a2, a2, e               ; ch: a2 = (f^g)&e
        vpxor   a2, a2, g               ; a2 = ch

        PRORD_nd a1, e, 25              ; sig1: a1 = (e >> 25)
        vmovdqa %%T1, [SZ8*(%%i&0xf) + %%data]
        vpaddd  %%T1, %%T1, [TBL + ROUND]       ; T1 = W + K
        vpxor   a0, a0, e               ; sig1: a0 = e ^ (e >> 5)
        PRORD   a0, 6                   ; sig1: a0 = (e >> 6) ^ (e >> 11)
        vpaddd  h, h, a2                ; h = h + ch
        PRORD_nd a2, a, (13-2)          ; sig0: a2 = (a >> 11)
        vpaddd  h, h, %%T1              ; h = h + ch + W + K
        vpxor   a0, a0, a1              ; a0 = sigma1
        PRORD_nd a1, a, 22              ; sig0: a1 = (a >> 22)
        vpxor   %%T1, a, c              ; maj: T1 = a^c
        add     ROUND, SZ8              ; ROUND++
        vpand   %%T1, %%T1, b           ; maj: T1 = (a^c)&b
        vpaddd  h, h, a0

        vpaddd  d, d, h

        vpxor   a2, a2, a               ; sig0: a2 = a ^ (a >> 11)
        PRORD   a2, 2                   ; sig0: a2 = (a >> 2) ^ (a >> 13)
        vpxor   a2, a2, a1              ; a2 = sig0
        vpand   a1, a, c                ; maj: a1 = a&c
        vpor    a1, a1, %%T1            ; a1 = maj
        vpaddd  h, h, a1                ; h = h + ch + W + K + maj
        vpaddd  h, h, a2                ; h = h + ch + W + K + maj + sigma0

        ROTATE_ARGS
%endm
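; Taken together, one ROUND_00_15_* invocation computes the standard
; FIPS 180-4 SHA256 round for 8 segments at once:
;       T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
;       T2 = Sigma0(a) + Maj(a,b,c)
;       d += T1;  h = T1 + T2
; with ROTATE_ARGS renaming the register symbols instead of moving data.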

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15_W 3
%define %%T1   %1
%define %%i    %2
%define %%data %3

        PRORD_nd a0, e, (11-6)          ; sig1: a0 = (e >> 5)

        vpxor   a2, f, g                ; ch: a2 = f^g
        vpand   a2, a2, e               ; ch: a2 = (f^g)&e
        vpxor   a2, a2, g               ; a2 = ch

        PRORD_nd a1, e, 25              ; sig1: a1 = (e >> 25)
        vmovdqa [SZ8*(%%i&0xf) + %%data], %%T1
        vpaddd  %%T1, %%T1, [TBL + ROUND]       ; T1 = W + K
        vpxor   a0, a0, e               ; sig1: a0 = e ^ (e >> 5)
        PRORD   a0, 6                   ; sig1: a0 = (e >> 6) ^ (e >> 11)
        vpaddd  h, h, a2                ; h = h + ch
        PRORD_nd a2, a, (13-2)          ; sig0: a2 = (a >> 11)
        vpaddd  h, h, %%T1              ; h = h + ch + W + K
        vpxor   a0, a0, a1              ; a0 = sigma1
        PRORD_nd a1, a, 22              ; sig0: a1 = (a >> 22)
        vpxor   %%T1, a, c              ; maj: T1 = a^c
        add     ROUND, SZ8              ; ROUND++
        vpand   %%T1, %%T1, b           ; maj: T1 = (a^c)&b
        vpaddd  h, h, a0

        vpaddd  d, d, h

        vpxor   a2, a2, a               ; sig0: a2 = a ^ (a >> 11)
        PRORD   a2, 2                   ; sig0: a2 = (a >> 2) ^ (a >> 13)
        vpxor   a2, a2, a1              ; a2 = sig0
        vpand   a1, a, c                ; maj: a1 = a&c
        vpor    a1, a1, %%T1            ; a1 = maj
        vpaddd  h, h, a1                ; h = h + ch + W + K + maj
        vpaddd  h, h, a2                ; h = h + ch + W + K + maj + sigma0

        ROTATE_ARGS
%endm
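; ROUND_00_15_W is identical to ROUND_00_15_R except that it stores the
; freshly scheduled %%T1 (produced by ROUND_16_XX) back into the frame
; buffer, whereas the _R variant loads W[t] from it.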

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 3
%define %%T1   %1
%define %%i    %2
%define %%data %3

        vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + %%data]
        vmovdqa a1, [SZ8*((%%i-2)&0xf) + %%data]
        vmovdqa a0, %%T1
        PRORD   %%T1, 18-7
        vmovdqa a2, a1
        PRORD   a1, 19-17
        vpxor   %%T1, %%T1, a0
        PRORD   %%T1, 7
        vpxor   a1, a1, a2
        PRORD   a1, 17
        vpsrld  a0, a0, 3
        vpxor   %%T1, %%T1, a0
        vpsrld  a2, a2, 10
        vpxor   a1, a1, a2
        vpaddd  %%T1, %%T1, [SZ8*((%%i-16)&0xf) + %%data]
        vpaddd  a1, a1, [SZ8*((%%i-7)&0xf) + %%data]
        vpaddd  %%T1, %%T1, a1

        ROUND_00_15_W %%T1, %%i, %%data
%endm
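; ROUND_16_XX extends the message schedule before running the round:
;       W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
; where sigma0(x) = ROTR7(x) ^ ROTR18(x) ^ (x >> 3)
;       sigma1(x) = ROTR17(x) ^ ROTR19(x) ^ (x >> 10)
; Only a 16-entry window of W is kept in %%data, indexed with (t & 0xf).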

; init hash digests
; segs_digests: low addr -> high addr
;               a  | b  | c  | ...| p  | (16)
;               h0 | h0 | h0 | ...| h0 |    | Aa| Ab | Ac |...| Ap |
;               h1 | h1 | h1 | ...| h1 |    | Ba| Bb | Bc |...| Bp |
;               ....
;               h7 | h7 | h7 | ...| h7 |    | Ha| Hb | Hc |...| Hp |

align 32

;void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
;                          uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
; arg 0 pointer to input data
; arg 1 pointer to digests, including the segment digests (uint32_t digests[8][16])
; arg 2 pointer to aligned_frame_buffer, used to save the big-endian data
; arg 3 number of 1KB blocks
;
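; A minimal C-side call sketch (a reading aid, not part of this file; the
; buffer sizes follow the digest layout above and the 1KB block size, i.e.
; SHA256_DIGEST_WORDS = 8, HASH_SEGS = 16, MH_SHA256_BLOCK_SIZE = 1024):
;       uint32_t digests[8][16];        /* h0..h7, one row per digest word */
;       uint8_t  frame[1024];           /* scratch for byte-swapped data   */
;       mh_sha256_block_avx2(data, digests, frame, n_1kb_blocks);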
mk_global mh_sha256_block_avx2, function, internal
func(mh_sha256_block_avx2)
        endbranch
        FUNC_SAVE
        ; save rsp
        mov     RSP_SAVE, rsp

        cmp     loops, 0
        jle     .return

        ; leave enough space to store segs_digests
        sub     rsp, FRAMESZ
        ; align rsp to the 32 bytes required by AVX2 aligned loads/stores
        and     rsp, ~0x1F
        lea     TBL, [TABLE]

%assign I 0                             ; copy segs_digests onto the stack
%rep 4
        VMOVPS  a, [mh_digests_p + I*64*2 + 32*0]
        VMOVPS  b, [mh_digests_p + I*64*2 + 32*1]
        VMOVPS  c, [mh_digests_p + I*64*2 + 32*2]
        VMOVPS  d, [mh_digests_p + I*64*2 + 32*3]

        vmovdqa [rsp + I*64*2 + 32*0], a
        vmovdqa [rsp + I*64*2 + 32*1], b
        vmovdqa [rsp + I*64*2 + 32*2], c
        vmovdqa [rsp + I*64*2 + 32*3], d
%assign I (I+1)
%endrep
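; Each %rep iteration above moves 128 bytes: two 64-byte digest rows
; (16 segments x 4 bytes each), so 4 iterations cover all 8 rows h0..h7.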

.block_loop:
        ; transform the input data to big-endian and store it in aligned_frame
        vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
        ; transform input data from DWORD*16_SEGS*8 to DWORD*8_SEGS*8*2
%assign I 0
%rep 16
        VMOVPS  TT0, [mh_in_p + I*64 + 0*32]
        VMOVPS  TT1, [mh_in_p + I*64 + 1*32]

        vpshufb TT0, TT0, TMP
        vmovdqa [mh_data_p + I*32 + 0*512], TT0
        vpshufb TT1, TT1, TMP
        vmovdqa [mh_data_p + I*32 + 1*512], TT1
%assign I (I+1)
%endrep
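; Each 64-byte row of the 1KB block carries one dword from each of the 16
; interleaved segments; the byte-swapped halves land in two 512-byte banks
; of the frame buffer (segments 0-7, then 8-15) so that .segs_loop below
; can run the 8-lane rounds twice.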

        mov     mh_segs, 0              ; start from the first 8 segments
        mov     pref, 1024              ; avoid prefetching the same lines repeatedly
.segs_loop:
        xor     ROUND, ROUND
        ;; Initialize digests
        vmovdqa a, [rsp + 0*64 + mh_segs]
        vmovdqa b, [rsp + 1*64 + mh_segs]
        vmovdqa c, [rsp + 2*64 + mh_segs]
        vmovdqa d, [rsp + 3*64 + mh_segs]
        vmovdqa e, [rsp + 4*64 + mh_segs]
        vmovdqa f, [rsp + 5*64 + mh_segs]
        vmovdqa g, [rsp + 6*64 + mh_segs]
        vmovdqa h, [rsp + 7*64 + mh_segs]

%assign i 0
%rep 4
        ROUND_00_15_R   TT0, (i*4+0), mh_data_p
        ROUND_00_15_R   TT1, (i*4+1), mh_data_p
        ROUND_00_15_R   TT2, (i*4+2), mh_data_p
        ROUND_00_15_R   TT3, (i*4+3), mh_data_p
%assign i (i+1)
%endrep
        PREFETCH_X [mh_in_p + pref + 128*0]

%assign i 16
%rep 48
        ROUND_16_XX     T1, i, mh_data_p
 %if i % 16 = 8
        PREFETCH_X [mh_in_p + pref + 128*(i/16)]
 %endif
%assign i (i+1)
%endrep

        ;; add old digest
        vpaddd  a, a, [rsp + 0*64 + mh_segs]
        vpaddd  b, b, [rsp + 1*64 + mh_segs]
        vpaddd  c, c, [rsp + 2*64 + mh_segs]
        vpaddd  d, d, [rsp + 3*64 + mh_segs]
        vpaddd  e, e, [rsp + 4*64 + mh_segs]
        vpaddd  f, f, [rsp + 5*64 + mh_segs]
        vpaddd  g, g, [rsp + 6*64 + mh_segs]
        vpaddd  h, h, [rsp + 7*64 + mh_segs]

        ; write out digests
        vmovdqa [rsp + 0*64 + mh_segs], a
        vmovdqa [rsp + 1*64 + mh_segs], b
        vmovdqa [rsp + 2*64 + mh_segs], c
        vmovdqa [rsp + 3*64 + mh_segs], d
        vmovdqa [rsp + 4*64 + mh_segs], e
        vmovdqa [rsp + 5*64 + mh_segs], f
        vmovdqa [rsp + 6*64 + mh_segs], g
        vmovdqa [rsp + 7*64 + mh_segs], h

        add     pref, 512
        add     mh_data_p, 512
        add     mh_segs, 32
        cmp     mh_segs, 64
        jc      .segs_loop
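; .segs_loop runs exactly twice per block (mh_segs = 0, then 32): once for
; segments 0-7 and once for segments 8-15. [rsp + k*64 + mh_segs] selects
; the matching 32-byte half of each 64-byte digest row, and mh_data_p is
; advanced by 512 to switch to the second data bank.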

        sub     mh_data_p, (1024)
        add     mh_in_p, (1024)
        sub     loops, 1
        jne     .block_loop

%assign I 0                             ; copy segs_digests back to mh_digests_p
%rep 4
        vmovdqa a, [rsp + I*64*2 + 32*0]
        vmovdqa b, [rsp + I*64*2 + 32*1]
        vmovdqa c, [rsp + I*64*2 + 32*2]
        vmovdqa d, [rsp + I*64*2 + 32*3]

        VMOVPS  [mh_digests_p + I*64*2 + 32*0], a
        VMOVPS  [mh_digests_p + I*64*2 + 32*1], b
        VMOVPS  [mh_digests_p + I*64*2 + 32*2], c
        VMOVPS  [mh_digests_p + I*64*2 + 32*3], d
%assign I (I+1)
%endrep
        mov     rsp, RSP_SAVE           ; restore rsp

.return:
        FUNC_RESTORE
        ret

endproc_frame

section .data align=64

align 64
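; SHA256 round constants K[0..63]; each 32-bit constant is replicated 8
; times (32 bytes, one ymm-width table row per round).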
TABLE:
        dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
        dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
        dq 0x7137449171374491, 0x7137449171374491
        dq 0x7137449171374491, 0x7137449171374491
        dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
        dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
        dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
        dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
        dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
        dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
        dq 0x59f111f159f111f1, 0x59f111f159f111f1
        dq 0x59f111f159f111f1, 0x59f111f159f111f1
        dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
        dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
        dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
        dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
        dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
        dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
        dq 0x12835b0112835b01, 0x12835b0112835b01
        dq 0x12835b0112835b01, 0x12835b0112835b01
        dq 0x243185be243185be, 0x243185be243185be
        dq 0x243185be243185be, 0x243185be243185be
        dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
        dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
        dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
        dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
        dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
        dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
        dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
        dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
        dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
        dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
        dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
        dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
        dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
        dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
        dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
        dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
        dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
        dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
        dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
        dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
        dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
        dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
        dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
        dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
        dq 0x76f988da76f988da, 0x76f988da76f988da
        dq 0x76f988da76f988da, 0x76f988da76f988da
        dq 0x983e5152983e5152, 0x983e5152983e5152
        dq 0x983e5152983e5152, 0x983e5152983e5152
        dq 0xa831c66da831c66d, 0xa831c66da831c66d
        dq 0xa831c66da831c66d, 0xa831c66da831c66d
        dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
        dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
        dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
        dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
        dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
        dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
        dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
        dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
        dq 0x06ca635106ca6351, 0x06ca635106ca6351
        dq 0x06ca635106ca6351, 0x06ca635106ca6351
        dq 0x1429296714292967, 0x1429296714292967
        dq 0x1429296714292967, 0x1429296714292967
        dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
        dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
        dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
        dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
        dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
        dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
        dq 0x53380d1353380d13, 0x53380d1353380d13
        dq 0x53380d1353380d13, 0x53380d1353380d13
        dq 0x650a7354650a7354, 0x650a7354650a7354
        dq 0x650a7354650a7354, 0x650a7354650a7354
        dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
        dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
        dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
        dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
        dq 0x92722c8592722c85, 0x92722c8592722c85
        dq 0x92722c8592722c85, 0x92722c8592722c85
        dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
        dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
        dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
        dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
        dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
        dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
        dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
        dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
        dq 0xd192e819d192e819, 0xd192e819d192e819
        dq 0xd192e819d192e819, 0xd192e819d192e819
        dq 0xd6990624d6990624, 0xd6990624d6990624
        dq 0xd6990624d6990624, 0xd6990624d6990624
        dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
        dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
        dq 0x106aa070106aa070, 0x106aa070106aa070
        dq 0x106aa070106aa070, 0x106aa070106aa070
        dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
        dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
        dq 0x1e376c081e376c08, 0x1e376c081e376c08
        dq 0x1e376c081e376c08, 0x1e376c081e376c08
        dq 0x2748774c2748774c, 0x2748774c2748774c
        dq 0x2748774c2748774c, 0x2748774c2748774c
        dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
        dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
        dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
        dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
        dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
        dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
        dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
        dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
        dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
        dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
        dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
        dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
        dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
        dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
        dq 0x84c8781484c87814, 0x84c8781484c87814
        dq 0x84c8781484c87814, 0x84c8781484c87814
        dq 0x8cc702088cc70208, 0x8cc702088cc70208
        dq 0x8cc702088cc70208, 0x8cc702088cc70208
        dq 0x90befffa90befffa, 0x90befffa90befffa
        dq 0x90befffa90befffa, 0x90befffa90befffa
        dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
        dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
        dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
        dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
        dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
        dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
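; vpshufb mask that reverses the byte order of every dword (little- to
; big-endian) in both 128-bit lanes.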
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
                         dq 0x0405060700010203, 0x0c0d0e0f08090a0b