]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;; |
2 | ;; Copyright (c) 2012-2018, Intel Corporation | |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
28 | ;; code to compute octal MD5 using AVX | |
29 | ||
30 | ;; Stack must be aligned to 16 bytes before call | |
31 | ;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 | |
32 | ;; Windows preserves: rcx rbp | |
33 | ;; | |
34 | ;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 | |
35 | ;; Linux preserves: rdi rbp | |
36 | ;; | |
37 | ;; clobbers xmm0-15 | |
38 | ||
f67539c2 | 39 | %include "include/os.asm" |
11fdf7f2 TL |
40 | %include "mb_mgr_datastruct.asm" |
41 | ||
;; Table of MD5 step constants. The code below reads one 16-byte entry per
;; step as [TBL + step*16]; the table itself is defined outside this file.
extern MD5_TABLE

section .data
default rel

;; 128 bits of ones. XORing a register with this gives its bitwise NOT,
;; which MAGIC_I needs because there is no vector NOT instruction.
align 64
ONES:
        times 4 dd 0xffffffff
49 | ||
section .text

;; ---------------------------------------------------------------------------
;; General-purpose register roles
;;
;; arg1/arg2 follow the calling convention of the target OS; mem1/mem2 are
;; taken from the two registers the *other* convention would have used for
;; its arguments, so the same four registers are used on both platforms.
;; rbp is never touched.
;; ---------------------------------------------------------------------------
%ifndef LINUX
;; non-Linux build (Windows register assignment)
%define arg1    rcx
%define arg2    rdx
%define mem1    rdi
%define mem2    rsi
%else
;; Linux register assignment
%define arg1    rdi
%define arg2    rsi
%define mem1    rcx
%define mem2    rdx
%endif

%define state           arg1    ; pointer passed as first argument
%define num_blks        arg2    ; block counter, decremented in the main loop

;; One input data pointer per lane
%define inp0    r8
%define inp1    r9
%define inp2    r10
%define inp3    r11
%define inp4    r12
%define inp5    r13
%define inp6    r14
%define inp7    r15

%define TBL     rax             ; base address of MD5_TABLE
%define IDX     rbx             ; byte offset into every lane's input

;; ---------------------------------------------------------------------------
;; Vector register roles
;; ---------------------------------------------------------------------------
;; Working digest words for the first set of four lanes ...
%define A       xmm0
%define B       xmm1
%define C       xmm2
%define D       xmm3
%define E       xmm4 ; tmp
%define F       xmm5 ; tmp

;; ... and for the second set of four lanes
%define A2      xmm6
%define B2      xmm7
%define C2      xmm8
%define D2      xmm9

;; Scratch for the step macros. FUN/TMP are the only scratch registers
;; MD5_STEP1 needs; MD5_STEP additionally uses FUN2/TMP2.
%define FUN     E
%define TMP     F
%define FUN2    xmm10
%define TMP2    xmm11

;; Scratch for loading and transposing input data.
;; NOTE: T0/T1 are the same physical registers as FUN2/TMP2, so the data
;; loads may only be interleaved with MD5_STEP1, never with MD5_STEP.
%define T0      xmm10
%define T1      xmm11
%define T2      xmm12
%define T3      xmm13
%define T4      xmm14
%define T5      xmm15
106 | ||
;; ---------------------------------------------------------------------------
;; Stack frame layout (hex byte offsets from rsp once the frame is allocated)
;;
;;   470   DD2   \
;;   460   CC2    \  saved digest words,
;;   450   BB2     > second set of lanes
;;   440   AA2    /
;;   430   DD    \
;;   420   CC     \  saved digest words,
;;   410   BB      > first set of lanes
;;   400   AA     /
;;
;;   3F0   data2[15] for lanes 7...4   \
;;   ...                                \
;;   300   data2[0]  for lanes 7...4     \
;;   2F0   data2[15] for lanes 3...0      > mem block 2
;;   ...                                 /
;;   210   data2[1]  for lanes 3...0    /
;;   200   data2[0]  for lanes 3...0   /
;;
;;   1F0   data1[15] for lanes 7...4   \
;;   ...                                \
;;   100   data1[0]  for lanes 7...4     \
;;    F0   data1[15] for lanes 3...0      > mem block 1
;;   ...                                 /
;;    10   data1[1]  for lanes 3...0    /
;;     0   data1[0]  for lanes 3...0   /
;; ---------------------------------------------------------------------------

;; The frame size must be an odd multiple of 8 bytes: the caller keeps the
;; stack 16-byte aligned before the call, so the pushed return address leaves
;; rsp 8 bytes off, and subtracting an odd multiple of 8 restores the 16-byte
;; alignment that the vmovdqa accesses to this frame require.
struc STACK
_DATA:          reso    2*2*16  ; 2 blocks * 2 sets of lanes * 16 message words
_DIGEST:        reso    8       ; AA, BB, CC, DD, AA2, BB2, CC2, DD2
                resb    8       ; padding that makes the size an odd multiple of 8
endstruc
%define STACK_SIZE      STACK_size

;; Addresses of the saved digest words (meant to be used inside [ ])
%define AA      rsp + _DIGEST + 0*16
%define BB      rsp + _DIGEST + 1*16
%define CC      rsp + _DIGEST + 2*16
%define DD      rsp + _DIGEST + 3*16
%define AA2     rsp + _DIGEST + 4*16
%define BB2     rsp + _DIGEST + 5*16
%define CC2     rsp + _DIGEST + 6*16
%define DD2     rsp + _DIGEST + 7*16
150 | ||
;; ---------------------------------------------------------------------------
;; MD5 left-rotation amounts, in bits.
;; rot<R><S>: R = round (1..4), S = position of the step within each group
;; of four consecutive steps of that round (1..4).
;; ---------------------------------------------------------------------------

;; round 1 (used with MAGIC_F)
rot11   equ      7
rot12   equ     12
rot13   equ     17
rot14   equ     22

;; round 2 (used with MAGIC_G)
rot21   equ      5
rot22   equ      9
rot23   equ     14
rot24   equ     20

;; round 3 (used with MAGIC_H)
rot31   equ      4
rot32   equ     11
rot33   equ     16
rot34   equ     23

;; round 4 (used with MAGIC_I)
rot41   equ      6
rot42   equ     10
rot43   equ     15
rot44   equ     21
170 | ||
;; ---------------------------------------------------------------------------
;; TRANSPOSE r0, r1, r2, r3, t0, t1
;;
;; Transposes the 4x4 matrix of dwords held in {r0..r3}, using {t0, t1} as
;; scratch. All six operands must be distinct xmm registers.
;;
;; Input:
;;      r0 = {a3 a2 a1 a0}
;;      r1 = {b3 b2 b1 b0}
;;      r2 = {c3 c2 c1 c0}
;;      r3 = {d3 d2 d1 d0}
;;
;; Output (note the register order: t0, r1, r0, r3):
;;      t0 = {d0 c0 b0 a0}
;;      r1 = {d1 c1 b1 a1}
;;      r0 = {d2 c2 b2 a2}
;;      r3 = {d3 c3 b3 a3}
;;
;; r2 and t1 are left holding intermediate values.
;; ---------------------------------------------------------------------------
%macro TRANSPOSE 6
%define %%row0  %1
%define %%row1  %2
%define %%row2  %3
%define %%row3  %4
%define %%tmp0  %5
%define %%tmp1  %6
        ;; Step 1: gather the low and high dword pairs of rows a/b ...
        vshufps %%tmp0, %%row0, %%row1, 0x44    ; tmp0 = {b1 b0 a1 a0}
        vshufps %%row0, %%row0, %%row1, 0xEE    ; row0 = {b3 b2 a3 a2}

        ;; ... and of rows c/d
        vshufps %%tmp1, %%row2, %%row3, 0x44    ; tmp1 = {d1 d0 c1 c0}
        vshufps %%row2, %%row2, %%row3, 0xEE    ; row2 = {d3 d2 c3 c2}

        ;; Step 2: pick the odd elements of each pair of intermediates ...
        vshufps %%row1, %%tmp0, %%tmp1, 0xDD    ; row1 = {d1 c1 b1 a1}
        vshufps %%row3, %%row0, %%row2, 0xDD    ; row3 = {d3 c3 b3 a3}

        ;; ... then the even elements (done last: overwrites the intermediates)
        vshufps %%row0, %%row0, %%row2, 0x88    ; row0 = {d2 c2 b2 a2}
        vshufps %%tmp0, %%tmp0, %%tmp1, 0x88    ; tmp0 = {d0 c0 b0 a0}
%endmacro
204 | ||
;; ---------------------------------------------------------------------------
;; Magic functions defined in RFC 1321
;;
;; Every MAGIC_* macro has the form  MAGIC_x out, X, Y, Z  and writes only
;; `out`, which must be a register distinct from X, Y and Z.
;; ---------------------------------------------------------------------------

;; MAGIC_F out, X, Y, Z
;;      out = Z ^ (X & (Y ^ Z))
;; i.e. a bitwise select: take Y where X is 1 and Z where X is 0.
%macro MAGIC_F 4
%define %%out   %1
%define %%x     %2
%define %%y     %3
%define %%z     %4
        vpxor   %%out, %%z, %%y         ; out = Y ^ Z
        vpand   %%out, %%out, %%x       ; out = X & (Y ^ Z)
        vpxor   %%out, %%out, %%z       ; out = Z ^ (X & (Y ^ Z))
%endmacro
218 | ||
;; MAGIC_G out, X, Y, Z
;;      out = F(Z, X, Y) = Y ^ (Z & (X ^ Y))
;; The same select as MAGIC_F with the operands rotated: take X where Z is 1
;; and Y where Z is 0. Written out directly; the three instructions are
;; exactly what `MAGIC_F out, Z, X, Y` would produce.
%macro MAGIC_G 4
%define %%out   %1
%define %%x     %2
%define %%y     %3
%define %%z     %4
        vpxor   %%out, %%y, %%x         ; out = X ^ Y
        vpand   %%out, %%out, %%z       ; out = Z & (X ^ Y)
        vpxor   %%out, %%out, %%y       ; out = Y ^ (Z & (X ^ Y))
%endmacro
227 | ||
;; MAGIC_H out, X, Y, Z
;;      out = X ^ Y ^ Z
%macro MAGIC_H 4
%define %%out   %1
%define %%x     %2
%define %%y     %3
%define %%z     %4
        vpxor   %%out, %%z, %%y         ; out = Y ^ Z
        vpxor   %%out, %%out, %%x       ; out = X ^ Y ^ Z
%endmacro
237 | ||
;; MAGIC_I out, X, Y, Z
;;      out = Y ^ (X | ~Z)
;; The NOT is done by XORing with the all-ones constant ONES.
%macro MAGIC_I 4
%define %%out   %1
%define %%x     %2
%define %%y     %3
%define %%z     %4
        vpxor   %%out, %%z, [rel ONES]  ; out = ~Z
        vpor    %%out, %%out, %%x       ; out = X | ~Z
        vpxor   %%out, %%out, %%y       ; out = Y ^ (X | ~Z)
%endmacro
248 | ||
;; PROLD reg, imm, tmp
;; Rotates every dword of `reg` left by `imm` bits, in place.
;; `tmp` is overwritten; `imm` must be an assembly-time constant.
%macro PROLD 3
%define %%val   %1
%define %%bits  %2
%define %%scr   %3
        vpsrld  %%scr, %%val, (32-%%bits)       ; bits that wrap around to the bottom
        vpslld  %%val, %%val, %%bits            ; bits that stay, moved up
        vpor    %%val, %%val, %%scr             ; merge both parts
%endmacro
258 | ||
;; ---------------------------------------------------------------------------
;; MD5_STEP1 - one MD5 step on both sets of lanes, sharing one FUN/TMP pair
;;
;;      A  = B  + ROL32(A  + MAGIC(B, C, D)    + data                + const, nrot)
;;      A2 = B2 + ROL32(A2 + MAGIC(B2, C2, D2) + data[+16*16 bytes]  + const, nrot)
;;
;; MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
;;
;;   MAGIC_FUN    name of the MAGIC_* macro to apply
;;   A..D         state registers of the first set of lanes  (A is updated)
;;   A2..D2       state registers of the second set of lanes (A2 is updated)
;;   FUN, TMP     scratch registers, reused for both sets of lanes
;;   data         address expression (without brackets) of the message word
;;                for the first set; the second set's word is 16*16 bytes on
;;   MD5const     operand holding the step constant (register or [memory])
;;   nrot         rotation amount, assembly-time constant
;;
;; Only A, A2, FUN and TMP are written. Because just two scratch registers
;; are needed, T0-T5 stay free for loading the next block between steps.
;; ---------------------------------------------------------------------------
%macro MD5_STEP1 14
%define %%fn    %1
%define %%a     %2
%define %%b     %3
%define %%c     %4
%define %%d     %5
%define %%a2    %6
%define %%b2    %7
%define %%c2    %8
%define %%d2    %9
%define %%f     %10
%define %%t     %11
%define %%msg   %12
%define %%k     %13
%define %%rot   %14

        ;; accumulate constant and message word
        vpaddd  %%a,  %%a,  %%k
        vpaddd  %%a2, %%a2, %%k
        vpaddd  %%a,  %%a,  [%%msg]
        vpaddd  %%a2, %%a2, [%%msg + 16*16]

        ;; add the round function; %%f is recomputed for each set of lanes
        %%fn    %%f, %%b, %%c, %%d
        vpaddd  %%a,  %%a,  %%f
        %%fn    %%f, %%b2, %%c2, %%d2
        vpaddd  %%a2, %%a2, %%f

        ;; rotate, then add B
        PROLD   %%a,  %%rot, %%t
        PROLD   %%a2, %%rot, %%t
        vpaddd  %%a,  %%a,  %%b
        vpaddd  %%a2, %%a2, %%b2
%endmacro
294 | ||
;; ---------------------------------------------------------------------------
;; MD5_STEP - one MD5 step on both sets of lanes, with separate scratch
;; registers for each set
;;
;;      A  = B  + ROL32(A  + MAGIC(B, C, D)    + data                + const, nrot)
;;      A2 = B2 + ROL32(A2 + MAGIC(B2, C2, D2) + data[+16*16 bytes]  + const, nrot)
;;
;; MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
;;          MD5const, nrot
;;
;;   MAGIC_FUN    name of the MAGIC_* macro to apply
;;   A..D         state registers of the first set of lanes  (A is updated)
;;   A2..D2       state registers of the second set of lanes (A2 is updated)
;;   FUN, TMP     scratch registers for the first set
;;   FUN2, TMP2   scratch registers for the second set
;;   data         address expression (without brackets) of the message word
;;                for the first set; the second set's word is 16*16 bytes on.
;;                Both addresses are read with vmovdqa.
;;   MD5const     operand holding the step constant (register or [memory])
;;   nrot         rotation amount, assembly-time constant
;;
;; Writes A, A2 and all four scratch registers.
;; ---------------------------------------------------------------------------
%macro MD5_STEP 16
%define %%fn    %1
%define %%a     %2
%define %%b     %3
%define %%c     %4
%define %%d     %5
%define %%a2    %6
%define %%b2    %7
%define %%c2    %8
%define %%d2    %9
%define %%f     %10
%define %%t     %11
%define %%f2    %12
%define %%t2    %13
%define %%msg   %14
%define %%k     %15
%define %%rot   %16

        ;; fetch the message words for both sets of lanes
        vmovdqa %%t,  [%%msg]
        vmovdqa %%t2, [%%msg + 16*16]

        ;; accumulate constant and message word
        vpaddd  %%a,  %%a,  %%k
        vpaddd  %%a2, %%a2, %%k
        vpaddd  %%a,  %%a,  %%t
        vpaddd  %%a2, %%a2, %%t2

        ;; add the round function
        %%fn    %%f,  %%b,  %%c,  %%d
        %%fn    %%f2, %%b2, %%c2, %%d2
        vpaddd  %%a,  %%a,  %%f
        vpaddd  %%a2, %%a2, %%f2

        ;; rotate, then add B
        PROLD   %%a,  %%rot, %%t
        PROLD   %%a2, %%rot, %%t2
        vpaddd  %%a,  %%a,  %%b
        vpaddd  %%a2, %%a2, %%b2
%endmacro
335 | ||
;; ---------------------------------------------------------------------------
;; MD5_LOAD_XPOSE_4LANES in0, in1, in2, in3, grp, dst
;;
;; Private helper of md5_x4x2_avx. Reads 16 bytes (four message words) from
;; each of four lanes at [inN + IDX + grp*16], transposes them so that each
;; xmm register holds one message word for all four lanes, and stores message
;; words grp*4 .. grp*4+3 at [dst + (grp*4 + k)*16], k = 0..3.
;;
;;   in0..in3   input pointer registers of the four lanes
;;   grp        which group of four message words (0..3), assembly-time value
;;   dst        address expression (without brackets) of word 0 for these
;;              lanes; must be 16-byte aligned (vmovdqa stores)
;;
;; Clobbers T0-T5.
;; ---------------------------------------------------------------------------
%macro MD5_LOAD_XPOSE_4LANES 6
%define %%in0   %1
%define %%in1   %2
%define %%in2   %3
%define %%in3   %4
%define %%grp   %5
%define %%dst   %6
        vmovdqu T2, [%%in0 + IDX + (%%grp)*16]
        vmovdqu T1, [%%in1 + IDX + (%%grp)*16]
        vmovdqu T4, [%%in2 + IDX + (%%grp)*16]
        vmovdqu T3, [%%in3 + IDX + (%%grp)*16]
        TRANSPOSE T2, T1, T4, T3, T0, T5        ; result order: T0, T1, T2, T3
        vmovdqa [%%dst + ((%%grp)*4 + 0)*16], T0
        vmovdqa [%%dst + ((%%grp)*4 + 1)*16], T1
        vmovdqa [%%dst + ((%%grp)*4 + 2)*16], T2
        vmovdqa [%%dst + ((%%grp)*4 + 3)*16], T3
%endmacro

;; ---------------------------------------------------------------------------
;; void md5_x4x2_avx(MD5_ARGS *args, UINT64 num_blks)
;;   arg 1 : pointer to MD5_ARGS structure
;;   arg 2 : number of blocks (>=1)
;;
;; Runs the MD5 block function over num_blks 64-byte blocks for eight lanes
;; at once, organised as two sets of four lanes (A..D and A2..D2).
;;
;; Reads  : the transposed digests at [state + row*MD5_DIGEST_ROW_SIZE]
;;          (first set of lanes at +0*16, second set at +1*16) and the eight
;;          lane data pointers at [state + _data_ptr_md5].
;; Writes : the updated digests back to the same place, and every data
;;          pointer advanced by the number of bytes consumed.
;;
;; The digest rows are loaded with vmovdqa, so they must be 16-byte aligned.
;; The lane data is read with vmovdqu and may be unaligned.
;;
;; While one block is being hashed out of `mem1`, the next block is loaded and
;; transposed into `mem2`; the two pointers are swapped after every block.
;; The final block is hashed by a separate copy of the 64 steps that does no
;; look-ahead load, so nothing is read beyond num_blks blocks of input.
;;
;; num_blks must not be 0: the counter is decremented before it is tested.
;; ---------------------------------------------------------------------------
align 32
MKGLOBAL(md5_x4x2_avx,function,internal)
md5_x4x2_avx:

        sub     rsp, STACK_SIZE

        ;; Each row of transposed digests is split into 2 parts: the right
        ;; half is kept in A..D, the left half in A2..D2.
        vmovdqa A,  [state + 0*MD5_DIGEST_ROW_SIZE + 0*16]
        vmovdqa B,  [state + 1*MD5_DIGEST_ROW_SIZE + 0*16]
        vmovdqa C,  [state + 2*MD5_DIGEST_ROW_SIZE + 0*16]
        vmovdqa D,  [state + 3*MD5_DIGEST_ROW_SIZE + 0*16]

        vmovdqa A2, [state + 0*MD5_DIGEST_ROW_SIZE + 1*16]
        vmovdqa B2, [state + 1*MD5_DIGEST_ROW_SIZE + 1*16]
        vmovdqa C2, [state + 2*MD5_DIGEST_ROW_SIZE + 1*16]
        vmovdqa D2, [state + 3*MD5_DIGEST_ROW_SIZE + 1*16]

        lea     TBL, [rel MD5_TABLE]

        ;; Fetch the per-lane input pointers; IDX is the common byte offset
        mov     inp0, [state + _data_ptr_md5 + 0*PTR_SZ]
        mov     inp1, [state + _data_ptr_md5 + 1*PTR_SZ]
        mov     inp2, [state + _data_ptr_md5 + 2*PTR_SZ]
        mov     inp3, [state + _data_ptr_md5 + 3*PTR_SZ]
        mov     inp4, [state + _data_ptr_md5 + 4*PTR_SZ]
        mov     inp5, [state + _data_ptr_md5 + 5*PTR_SZ]
        mov     inp6, [state + _data_ptr_md5 + 6*PTR_SZ]
        mov     inp7, [state + _data_ptr_md5 + 7*PTR_SZ]
        xor     IDX, IDX

        ;; Ping-pong pointers to the two transposed-message buffers
        mov     mem1, rsp
        lea     mem2, [rsp + 16*16*2]

        ;; Load and transpose the whole first block into mem1. The second set
        ;; of lanes lives 16*16 bytes above the first in each buffer.
%assign I 0
%rep 4
        MD5_LOAD_XPOSE_4LANES inp0, inp1, inp2, inp3, I, mem1
        MD5_LOAD_XPOSE_4LANES inp4, inp5, inp6, inp7, I, mem1 + 16*16
%assign I (I+1)
%endrep

.next_block:
        ;; Remember the digests as they were before this block
        vmovdqa [AA],  A
        vmovdqa [BB],  B
        vmovdqa [CC],  C
        vmovdqa [DD],  D
        vmovdqa [AA2], A2
        vmovdqa [BB2], B2
        vmovdqa [CC2], C2
        vmovdqa [DD2], D2

        add     IDX, 4*16               ; IDX now points at the *next* block
        sub     num_blks, 1
        je      .last_block             ; no next block: skip the look-ahead loads

        ;; ---- round 1, steps 0-7 ----
        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  0*16, [TBL +  0*16], rot11
        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  1*16, [TBL +  1*16], rot12
        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +  2*16, [TBL +  2*16], rot13
        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  3*16, [TBL +  3*16], rot14
        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  4*16, [TBL +  4*16], rot11
        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  5*16, [TBL +  5*16], rot12
        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +  6*16, [TBL +  6*16], rot13
        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  7*16, [TBL +  7*16], rot14

        ;; next block, words 0-3, lanes 0-3
        MD5_LOAD_XPOSE_4LANES inp0, inp1, inp2, inp3, 0, mem2

        ;; ---- round 1, steps 8-15 ----
        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  8*16, [TBL +  8*16], rot11
        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  9*16, [TBL +  9*16], rot12
        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 10*16, [TBL + 10*16], rot13
        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 11*16, [TBL + 11*16], rot14
        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 12*16, [TBL + 12*16], rot11
        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 13*16, [TBL + 13*16], rot12
        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 14*16, [TBL + 14*16], rot13
        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 15*16, [TBL + 15*16], rot14

        ;; next block, words 0-3, lanes 4-7
        MD5_LOAD_XPOSE_4LANES inp4, inp5, inp6, inp7, 0, mem2 + 16*16

        ;; ---- round 2, steps 16-23 ----
        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  1*16, [TBL + 16*16], rot21
        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  6*16, [TBL + 17*16], rot22
        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 11*16, [TBL + 18*16], rot23
        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  0*16, [TBL + 19*16], rot24
        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  5*16, [TBL + 20*16], rot21
        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 10*16, [TBL + 21*16], rot22
        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 15*16, [TBL + 22*16], rot23
        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  4*16, [TBL + 23*16], rot24

        ;; next block, words 4-7, lanes 0-3
        MD5_LOAD_XPOSE_4LANES inp0, inp1, inp2, inp3, 1, mem2

        ;; ---- round 2, steps 24-31 ----
        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  9*16, [TBL + 24*16], rot21
        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 14*16, [TBL + 25*16], rot22
        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +  3*16, [TBL + 26*16], rot23
        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  8*16, [TBL + 27*16], rot24
        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 13*16, [TBL + 28*16], rot21
        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  2*16, [TBL + 29*16], rot22
        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +  7*16, [TBL + 30*16], rot23
        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 12*16, [TBL + 31*16], rot24

        ;; next block, words 4-7, lanes 4-7
        MD5_LOAD_XPOSE_4LANES inp4, inp5, inp6, inp7, 1, mem2 + 16*16

        ;; ---- round 3, steps 32-39 ----
        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  5*16, [TBL + 32*16], rot31
        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  8*16, [TBL + 33*16], rot32
        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 11*16, [TBL + 34*16], rot33
        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 14*16, [TBL + 35*16], rot34
        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  1*16, [TBL + 36*16], rot31
        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  4*16, [TBL + 37*16], rot32
        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +  7*16, [TBL + 38*16], rot33
        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 10*16, [TBL + 39*16], rot34

        ;; next block, words 8-11, lanes 0-3
        MD5_LOAD_XPOSE_4LANES inp0, inp1, inp2, inp3, 2, mem2

        ;; ---- round 3, steps 40-47 ----
        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 13*16, [TBL + 40*16], rot31
        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  0*16, [TBL + 41*16], rot32
        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +  3*16, [TBL + 42*16], rot33
        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  6*16, [TBL + 43*16], rot34
        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  9*16, [TBL + 44*16], rot31
        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 12*16, [TBL + 45*16], rot32
        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 15*16, [TBL + 46*16], rot33
        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  2*16, [TBL + 47*16], rot34

        ;; next block, words 8-11, lanes 4-7
        MD5_LOAD_XPOSE_4LANES inp4, inp5, inp6, inp7, 2, mem2 + 16*16

        ;; ---- round 4, steps 48-55 ----
        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  0*16, [TBL + 48*16], rot41
        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  7*16, [TBL + 49*16], rot42
        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 14*16, [TBL + 50*16], rot43
        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  5*16, [TBL + 51*16], rot44
        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 12*16, [TBL + 52*16], rot41
        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +  3*16, [TBL + 53*16], rot42
        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 10*16, [TBL + 54*16], rot43
        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  1*16, [TBL + 55*16], rot44

        ;; next block, words 12-15, lanes 0-3
        MD5_LOAD_XPOSE_4LANES inp0, inp1, inp2, inp3, 3, mem2

        ;; ---- round 4, steps 56-63 ----
        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  8*16, [TBL + 56*16], rot41
        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 15*16, [TBL + 57*16], rot42
        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +  6*16, [TBL + 58*16], rot43
        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 13*16, [TBL + 59*16], rot44
        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +  4*16, [TBL + 60*16], rot41
        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 11*16, [TBL + 61*16], rot42
        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +  2*16, [TBL + 62*16], rot43
        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +  9*16, [TBL + 63*16], rot44

        ;; next block, words 12-15, lanes 4-7
        MD5_LOAD_XPOSE_4LANES inp4, inp5, inp6, inp7, 3, mem2 + 16*16

        ;; Add the digests saved at the top of the loop
        vpaddd  A,  A,  [AA]
        vpaddd  B,  B,  [BB]
        vpaddd  C,  C,  [CC]
        vpaddd  D,  D,  [DD]

        vpaddd  A2, A2, [AA2]
        vpaddd  B2, B2, [BB2]
        vpaddd  C2, C2, [CC2]
        vpaddd  D2, D2, [DD2]

        ;; The block just loaded into mem2 becomes the current one
        xchg    mem1, mem2

        jmp     .next_block

.last_block:
        ;; Final block: same 64 steps, but with nothing to load in between,
        ;; so MD5_STEP (separate scratch per set of lanes) is used instead.

        ;; ---- round 1 ----
        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  0*16, [TBL +  0*16], rot11
        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  1*16, [TBL +  1*16], rot12
        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +  2*16, [TBL +  2*16], rot13
        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  3*16, [TBL +  3*16], rot14
        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  4*16, [TBL +  4*16], rot11
        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  5*16, [TBL +  5*16], rot12
        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +  6*16, [TBL +  6*16], rot13
        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  7*16, [TBL +  7*16], rot14
        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  8*16, [TBL +  8*16], rot11
        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  9*16, [TBL +  9*16], rot12
        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 10*16, [TBL + 10*16], rot13
        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 11*16, [TBL + 11*16], rot14
        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 12*16, [TBL + 12*16], rot11
        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 13*16, [TBL + 13*16], rot12
        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 14*16, [TBL + 14*16], rot13
        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 15*16, [TBL + 15*16], rot14

        ;; ---- round 2 ----
        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  1*16, [TBL + 16*16], rot21
        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  6*16, [TBL + 17*16], rot22
        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 11*16, [TBL + 18*16], rot23
        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  0*16, [TBL + 19*16], rot24
        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  5*16, [TBL + 20*16], rot21
        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 10*16, [TBL + 21*16], rot22
        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 15*16, [TBL + 22*16], rot23
        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  4*16, [TBL + 23*16], rot24
        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  9*16, [TBL + 24*16], rot21
        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 14*16, [TBL + 25*16], rot22
        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +  3*16, [TBL + 26*16], rot23
        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  8*16, [TBL + 27*16], rot24
        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 13*16, [TBL + 28*16], rot21
        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  2*16, [TBL + 29*16], rot22
        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +  7*16, [TBL + 30*16], rot23
        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 12*16, [TBL + 31*16], rot24

        ;; ---- round 3 ----
        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  5*16, [TBL + 32*16], rot31
        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  8*16, [TBL + 33*16], rot32
        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 11*16, [TBL + 34*16], rot33
        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 14*16, [TBL + 35*16], rot34
        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  1*16, [TBL + 36*16], rot31
        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  4*16, [TBL + 37*16], rot32
        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +  7*16, [TBL + 38*16], rot33
        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 10*16, [TBL + 39*16], rot34
        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 13*16, [TBL + 40*16], rot31
        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  0*16, [TBL + 41*16], rot32
        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +  3*16, [TBL + 42*16], rot33
        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  6*16, [TBL + 43*16], rot34
        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  9*16, [TBL + 44*16], rot31
        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 12*16, [TBL + 45*16], rot32
        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 15*16, [TBL + 46*16], rot33
        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  2*16, [TBL + 47*16], rot34

        ;; ---- round 4 ----
        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  0*16, [TBL + 48*16], rot41
        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  7*16, [TBL + 49*16], rot42
        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 14*16, [TBL + 50*16], rot43
        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  5*16, [TBL + 51*16], rot44
        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 12*16, [TBL + 52*16], rot41
        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +  3*16, [TBL + 53*16], rot42
        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 10*16, [TBL + 54*16], rot43
        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  1*16, [TBL + 55*16], rot44
        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  8*16, [TBL + 56*16], rot41
        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 15*16, [TBL + 57*16], rot42
        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +  6*16, [TBL + 58*16], rot43
        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 13*16, [TBL + 59*16], rot44
        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +  4*16, [TBL + 60*16], rot41
        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 11*16, [TBL + 61*16], rot42
        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +  2*16, [TBL + 62*16], rot43
        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +  9*16, [TBL + 63*16], rot44

        ;; Add the digests saved at the top of the loop
        vpaddd  A,  A,  [AA]
        vpaddd  B,  B,  [BB]
        vpaddd  C,  C,  [CC]
        vpaddd  D,  D,  [DD]

        vpaddd  A2, A2, [AA2]
        vpaddd  B2, B2, [BB2]
        vpaddd  C2, C2, [CC2]
        vpaddd  D2, D2, [DD2]

        ;; Write the digests back
        vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 0*16], A
        vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 0*16], B
        vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 0*16], C
        vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 0*16], D
        vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 1*16], A2
        vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 1*16], B2
        vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 1*16], C2
        vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 1*16], D2

        ;; Advance every lane's data pointer by the bytes consumed
        ;; (IDX = 64 * number of blocks processed) and store it back
        add     inp0, IDX
        add     inp1, IDX
        add     inp2, IDX
        add     inp3, IDX
        add     inp4, IDX
        add     inp5, IDX
        add     inp6, IDX
        add     inp7, IDX
        mov     [state + _data_ptr_md5 + 0*PTR_SZ], inp0
        mov     [state + _data_ptr_md5 + 1*PTR_SZ], inp1
        mov     [state + _data_ptr_md5 + 2*PTR_SZ], inp2
        mov     [state + _data_ptr_md5 + 3*PTR_SZ], inp3
        mov     [state + _data_ptr_md5 + 4*PTR_SZ], inp4
        mov     [state + _data_ptr_md5 + 5*PTR_SZ], inp5
        mov     [state + _data_ptr_md5 + 6*PTR_SZ], inp6
        mov     [state + _data_ptr_md5 + 7*PTR_SZ], inp7

        ;; With SAFE_DATA, zero the message buffers and saved digests on the
        ;; stack (72 * 16 bytes) so no message-derived data is left behind.
        ;; The digests were already written out above, so xmm0 (A) is free.
%ifdef SAFE_DATA
        vpxor   xmm0, xmm0, xmm0
%assign i 0
%rep (2*2*16+8)
        vmovdqa [rsp + i*16], xmm0
%assign i (i+1)
%endrep
%endif

        ;; Release the stack frame
        add     rsp, STACK_SIZE

        ret
713 | ||
;; Linux builds only: emit an empty .note.GNU-stack section, declared
;; without the exec attribute.
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif