]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / sse / md5_x4x2_sse.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28;; code to compute octal MD5 (8 independent buffers at once, as 2 sets of 4 lanes) using SSE
29
30;; Stack must be aligned to 16 bytes before call
31;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
32;; Windows preserves: rcx rbp
33;;
34;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
35;; Linux preserves: rdi rbp
36;;
37;; clobbers xmm0-15
38
f67539c2 39%include "include/os.asm"
11fdf7f2
TL
40%include "mb_mgr_datastruct.asm"
41
42section .data align=64
43default rel
44
;; MD5_TABLE: the 64 per-step additive constants of MD5 (the T[i] table of
;; RFC 1321), one 16-byte row per step.  Each row holds the same 32-bit
;; constant four times so a single `paddd xmm, [TBL + step*16]` adds it to
;; all four dword lanes at once.  Rows are in step order: 0-15 are used with
;; MAGIC_F, 16-31 with MAGIC_G, 32-47 with MAGIC_H and 48-63 with MAGIC_I.
;; The table is read with legacy-SSE memory operands, which require 16-byte
;; alignment; `align 64` below provides it.
;; MKGLOBAL is not defined in this file (presumably it comes from
;; include/os.asm) -- it is used here to export the symbol.
45align 64
46MKGLOBAL(MD5_TABLE,data,internal)
47MD5_TABLE:
;; steps 0-15 (MAGIC_F)
48 dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
49 dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
50 dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
51 dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
52 dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
53 dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
54 dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
55 dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
56 dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
57 dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
58 dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
59 dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
60 dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
61 dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
62 dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
63 dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
;; steps 16-31 (MAGIC_G)
64 dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
65 dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
66 dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
67 dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
68 dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
69 dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
70 dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
71 dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
72 dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
73 dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
74 dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
75 dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
76 dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
77 dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
78 dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
79 dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
;; steps 32-47 (MAGIC_H)
80 dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
81 dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
82 dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
83 dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
84 dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
85 dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
86 dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
87 dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
88 dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
89 dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
90 dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
91 dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
92 dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
93 dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
94 dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
95 dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
;; steps 48-63 (MAGIC_I)
96 dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
97 dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
98 dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
99 dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
100 dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
101 dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
102 dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
103 dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
104 dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
105 dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
106 dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
107 dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
108 dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
109 dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
110 dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
111 dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
112
;; ONES: 128-bit all-ones mask.  MAGIC_I XORs a register with it to obtain a
;; bitwise NOT (SSE has no vector NOT instruction).
;; It is used as a `pxor` memory operand, so it must be 16-byte aligned; it
;; gets that from directly following the 64-row x 16-byte MD5_TABLE, which is
;; itself 64-byte aligned.  Keep that in mind before inserting data above it.
113ONES:
114 dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
115
116section .text
117
;; ---------------------------------------------------------------------------
;; Register role assignments.
;;   arg1/arg2 : incoming arguments (MD5_ARGS pointer, block count); which
;;               physical registers they are depends on the platform ABI.
;;   mem1/mem2 : "ping-pong" pointers to the two transposed-message buffers on
;;               the stack; md5_x4x2_sse swaps them after every block.
;; NOTE(review): in the non-LINUX branch mem1/mem2 are rdi/rsi, and the routine
;; also uses rbx, r12-r15 and xmm6-xmm15 without saving them (see the
;; "clobbers" list in the file header).  Presumably the internal caller
;; preserves whatever the platform ABI treats as callee-saved -- confirm.
;; ---------------------------------------------------------------------------
118%ifdef LINUX
119;; Linux Registers
120%define arg1 rdi
121%define arg2 rsi
122%define mem1 rcx
123%define mem2 rdx
124%else
;; non-Linux (Windows per the file header) registers
125%define arg1 rcx
126%define arg2 rdx
127%define mem1 rdi
128%define mem2 rsi
129%endif
130
131;; rbp is not clobbered
132
;; One input data pointer per lane (8 lanes); loaded from the MD5_ARGS
;; structure at function entry and written back, advanced, on exit.
133%define inp0 r8
134%define inp1 r9
135%define inp2 r10
136%define inp3 r11
137%define inp4 r12
138%define inp5 r13
139%define inp6 r14
140%define inp7 r15
141
;; TBL = base address of MD5_TABLE; IDX = byte offset into every input buffer.
142%define TBL rax
143%define IDX rbx
144
;; Working digest words for lanes 0-3 (one dword per lane in each register).
145%define A xmm0
146%define B xmm1
147%define C xmm2
148%define D xmm3
149%define E xmm4 ; tmp
150%define F xmm5 ; tmp
151
;; Working digest words for lanes 4-7.
152%define A2 xmm6
153%define B2 xmm7
154%define C2 xmm8
155%define D2 xmm9
156
157
;; Scratch registers for the step macros.  FUN/TMP alias E/F.
;; FUN2/TMP2 are the SAME physical registers as T0/T1 below (xmm10/xmm11), so
;; MD5_STEP (which takes FUN2/TMP2) must not be interleaved with code that
;; keeps live data in T0/T1.  The main loop therefore uses MD5_STEP1, which
;; only needs FUN/TMP, while T0-T5 are busy loading the next block.
158%define FUN E
159%define TMP F
160%define FUN2 xmm10
161%define TMP2 xmm11
162
;; Temporaries used to load and transpose input data.
163%define T0 xmm10
164%define T1 xmm11
165%define T2 xmm12
166%define T3 xmm13
167%define T4 xmm14
168%define T5 xmm15
169
170; Stack Layout
171;
; (offsets below are in hexadecimal bytes from rsp after the frame is allocated)
172; 470 DD2
173; 460 CC2
174; 450 BB2
175; 440 AA2
176; 430 DD
177; 420 CC
178; 410 BB
179; 400 AA
180;
181; 3F0 data2[15] for lanes 7...4 \
182; ... \
183; 300 data2[0] for lanes 7...4 \
184; 2F0 data2[15] for lanes 3...0 > mem block 2
185; ... /
186; 210 data2[1] for lanes 3...0 /
187; 200 data2[0] for lanes 3...0 /
188;
189; 1F0 data1[15] for lanes 7...4 \
190; ... \
191; 100 data1[0] for lanes 7...4 \
192; F0 data1[15] for lanes 3...0 > mem block 1
193; ... /
194; 10 data1[1] for lanes 3...0 /
195; 0 data1[0] for lanes 3...0 /
196
197; stack size must be an odd multiple of 8 bytes in size
; Why: the file header requires the stack to be 16-byte aligned before the
; call, so on entry (return address pushed) rsp is 8 mod 16.  Subtracting an
; odd multiple of 8 makes rsp 16-byte aligned again, which the movdqa
; accesses to this frame require.  The trailing `resb 8` supplies that
; extra 8 bytes: 64*16 + 8*16 + 8 = 1160 bytes in total.
198struc STACK
199_DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
200_DIGEST: reso 8 ; stores AA-DD, AA2-DD2
201 resb 8 ; for alignment
202endstruc
203%define STACK_SIZE STACK_size
204
; Addresses of the saved (pre-block) digest words; used without brackets here
; and wrapped in [] at the point of use, e.g. `movdqa [AA], A`.
; AA..DD hold lanes 0-3, AA2..DD2 hold lanes 4-7.
205%define AA rsp + _DIGEST + 16*0
206%define BB rsp + _DIGEST + 16*1
207%define CC rsp + _DIGEST + 16*2
208%define DD rsp + _DIGEST + 16*3
209%define AA2 rsp + _DIGEST + 16*4
210%define BB2 rsp + _DIGEST + 16*5
211%define CC2 rsp + _DIGEST + 16*6
212%define DD2 rsp + _DIGEST + 16*7
213
214;;
215;; MD5 left rotations (number of bits)
216;;
;; Naming: rot<R><P>, where R is the round (1 = MAGIC_F steps, 2 = MAGIC_G,
;; 3 = MAGIC_H, 4 = MAGIC_I) and P is the position (1..4) of the step within
;; each group of four steps of that round.  The same four amounts repeat for
;; all 16 steps of a round.
217rot11 equ 7
218rot12 equ 12
219rot13 equ 17
220rot14 equ 22
221rot21 equ 5
222rot22 equ 9
223rot23 equ 14
224rot24 equ 20
225rot31 equ 4
226rot32 equ 11
227rot33 equ 16
228rot34 equ 23
229rot41 equ 6
230rot42 equ 10
231rot43 equ 15
232rot44 equ 21
233
234; transpose r0, r1, r2, r3, t0, t1
235; "transpose" a 4x4 matrix of dwords held in {r0..r3} using temps {t0, t1}
236; Input looks like: {r0 r1 r2 r3}
237; r0 = {a3 a2 a1 a0}
238; r1 = {b3 b2 b1 b0}
239; r2 = {c3 c2 c1 c0}
240; r3 = {d3 d2 d1 d0}
241;
242; output looks like: {t0 r1 r0 r3}
243; t0 = {d0 c0 b0 a0}
244; r1 = {d1 c1 b1 a1}
245; r0 = {d2 c2 b2 a2}
246; r3 = {d3 c3 b3 a3}
247;
; Note the output rows do NOT come back in r0..r3 order: row 0 is in t0 and
; row 2 is in r0.  r2 and t1 are clobbered (they hold intermediate shuffles).
; Callers in this file invoke it as `TRANSPOSE T2, T1, T4, T3, T0, T5` so that
; the four result rows land in T0, T1, T2, T3 in order.
248%macro TRANSPOSE 6
249%define %%r0 %1
250%define %%r1 %2
251%define %%r2 %3
252%define %%r3 %4
253%define %%t0 %5
254%define %%t1 %6
255 movdqa %%t0, %%r0
256 shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
257 shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
258
259 movdqa %%t1, %%r2
260 shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
261 shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
262
263 movdqa %%r1, %%t0
264 shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
265
266 movdqa %%r3, %%r0
267 shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
268
269 shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
270 shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
271%endmacro
272
273;;
274;; Magic functions defined in RFC 1321
275;;
276; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
; Bitwise select: each result bit is taken from Y where the X bit is 1 and
; from Z where it is 0, i.e. (X & Y) | (~X & Z).
; F is the output register and is written first, so it must be distinct
; from X, Y and Z; X, Y and Z are left unmodified.
277%macro MAGIC_F 4
278%define %%F %1
279%define %%X %2
280%define %%Y %3
281%define %%Z %4
282 movdqa %%F,%%Z
283 pxor %%F,%%Y
284 pand %%F,%%X
285 pxor %%F,%%Z
286%endmacro
287
288; macro MAGIC_G F,X,Y,Z ;; F = MAGIC_F((Z),(X),(Y))
; Implemented by reusing MAGIC_F with rotated arguments, which yields
; Y ^ (Z & (X ^ Y)), i.e. (X & Z) | (Y & ~Z).
; F is the output register and must be distinct from X, Y and Z.
289%macro MAGIC_G 4
290%define %%F %1
291%define %%X %2
292%define %%Y %3
293%define %%Z %4
294 MAGIC_F %%F,%%Z,%%X,%%Y
295%endmacro
296
297; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
; Three-way XOR (parity).  F is the output register and is written first,
; so it must be distinct from X and Y; X, Y and Z are left unmodified.
298%macro MAGIC_H 4
299%define %%F %1
300%define %%X %2
301%define %%Y %3
302%define %%Z %4
303 movdqa %%F,%%Z
304 pxor %%F,%%Y
305 pxor %%F,%%X
306%endmacro
307
308; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
; The NOT is done by XORing with the all-ones constant ONES, read from
; memory on every expansion.  F is the output register and is written
; first, so it must be distinct from X and Y; X, Y and Z are left unmodified.
309%macro MAGIC_I 4
310%define %%F %1
311%define %%X %2
312%define %%Y %3
313%define %%Z %4
314 movdqa %%F,%%Z
315 pxor %%F,[rel ONES] ; pnot %%F
316 por %%F,%%X
317 pxor %%F,%%Y
318%endmacro
319
320; PROLD reg, imm, tmp
; Rotate each 32-bit lane of `reg` left by `imm` bits, in place.
; SSE has no packed rotate, so it is built from two shifts and an OR:
;   reg = (reg << imm) | (reg >> (32 - imm))
; `imm` is used as an immediate shift count, so it must be an assemble-time
; constant.  `tmp` is clobbered.
321%macro PROLD 3
322%define %%reg %1
323%define %%imm %2
324%define %%tmp %3
325 movdqa %%tmp, %%reg
326 psrld %%tmp, (32-%%imm)
327 pslld %%reg, %%imm
328 por %%reg, %%tmp
329%endmacro
330
331;;
332;; single MD5 step
333;;
334;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
335;;
;; The step is applied to two independent register sets: (A,B,C,D) and
;; (A2,B2,C2,D2).  The second set reads its message word 16*16 bytes after
;; the first one's.
;; This variant uses ONE shared pair of scratch registers (FUN, TMP) for both
;; sets, so the two computations are serialized on them; in exchange it needs
;; no further scratch registers.
;; Arguments:
;;   MAGIC_FUN : name of the macro to apply (MAGIC_F/G/H/I)
;;   data      : address expression WITHOUT brackets (the macro adds them)
;;   MD5const  : operand holding the step constant, passed WITH brackets by
;;               the callers in this file, e.g. [TBL+ 0*16]
;;   nrot      : left-rotate amount (assemble-time constant)
;; Modifies A and A2 (results), FUN and TMP (clobbered).
336; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
337%macro MD5_STEP1 14
338%define %%MAGIC_FUN %1
339%define %%A %2
340%define %%B %3
341%define %%C %4
342%define %%D %5
343%define %%A2 %6
344%define %%B2 %7
345%define %%C2 %8
346%define %%D2 %9
347%define %%FUN %10
348%define %%TMP %11
349%define %%data %12
350%define %%MD5const %13
351%define %%nrot %14
352
353 paddd %%A, %%MD5const
354 paddd %%A2, %%MD5const
355 paddd %%A, [%%data]
356 paddd %%A2, [%%data + 16*16]
357 %%MAGIC_FUN %%FUN, %%B,%%C,%%D
358 paddd %%A, %%FUN
359 %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
360 paddd %%A2, %%FUN
361 PROLD %%A,%%nrot, %%TMP
362 PROLD %%A2,%%nrot, %%TMP
363 paddd %%A, %%B
364 paddd %%A2, %%B2
365%endmacro
366
367;;
368;; single MD5 step
369;;
370;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
371;;
;; Same computation as MD5_STEP1, applied to the register sets (A,B,C,D) and
;; (A2,B2,C2,D2), but each set gets its OWN scratch registers (FUN/TMP and
;; FUN2/TMP2), so the two computations do not depend on each other.
;; Arguments:
;;   MAGIC_FUN : name of the macro to apply (MAGIC_F/G/H/I)
;;   data      : address expression WITHOUT brackets (the macro adds them);
;;               the second set reads from [data + 16*16]
;;   MD5const  : operand holding the step constant, passed WITH brackets by
;;               the callers in this file
;;   nrot      : left-rotate amount (assemble-time constant)
;; Modifies A and A2 (results); FUN, TMP, FUN2 and TMP2 are clobbered.
372; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
373; MD5const, nrot
374%macro MD5_STEP 16
375%define %%MAGIC_FUN %1
376%define %%A %2
377%define %%B %3
378%define %%C %4
379%define %%D %5
380%define %%A2 %6
381%define %%B2 %7
382%define %%C2 %8
383%define %%D2 %9
384%define %%FUN %10
385%define %%TMP %11
386%define %%FUN2 %12
387%define %%TMP2 %13
388%define %%data %14
389%define %%MD5const %15
390%define %%nrot %16
391
392 paddd %%A, %%MD5const
393 paddd %%A2, %%MD5const
394 paddd %%A, [%%data]
395 paddd %%A2, [%%data + 16*16]
396 %%MAGIC_FUN %%FUN, %%B,%%C,%%D
397 %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
398 paddd %%A, %%FUN
399 paddd %%A2, %%FUN2
400 PROLD %%A,%%nrot, %%TMP
401 PROLD %%A2,%%nrot, %%TMP2
402 paddd %%A, %%B
403 paddd %%A2, %%B2
404%endmacro
405
406; void md5_x4x2_sse(MD5_ARGS *args, UINT64 num_blks)
407; arg 1 : pointer to MD5_ARGS structure
408; arg 2 : number of blocks (>=1)
409;
; Processes num_blks 64-byte blocks from each of 8 input buffers, updating
; the 8 digests held in *args and advancing the 8 data pointers stored there.
; Overall flow:
;   entry     : allocate frame, load digests and pointers, load + transpose
;               the first block into stack buffer mem1
;   lloop     : for every block except the last, run the 64 MD5 steps on mem1
;               while loading + transposing the following block into mem2,
;               then swap mem1/mem2
;   lastblock : run the 64 steps on the final block, store digests and
;               updated pointers, optionally wipe the frame, return
; num_blks == 0 is not handled: the counter is decremented before it is
; tested, so it would wrap around instead of terminating.
410align 32
411MKGLOBAL(md5_x4x2_sse,function,internal)
412md5_x4x2_sse:
413
 ;; Allocate the frame described by STACK (leaves rsp 16-byte aligned given
 ;; the entry alignment stated in the file header).
414 sub rsp, STACK_SIZE
415
416 ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2
417 ;; Initialize digests
 ;; (lanes 0-3: first 16 bytes of each digest row; movdqa requires these
 ;; addresses to be 16-byte aligned -- assumes the MD5_ARGS layout, defined
 ;; outside this file, guarantees that)
418 movdqa A,[arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE]
419 movdqa B,[arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE]
420 movdqa C,[arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE]
421 movdqa D,[arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE]
422
423 ;; Initialize digests
 ;; (lanes 4-7: second 16 bytes of each digest row)
424 movdqa A2,[arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE]
425 movdqa B2,[arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE]
426 movdqa C2,[arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE]
427 movdqa D2,[arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE]
428
 ;; TBL = base of the per-step constant table
429 lea TBL, [rel MD5_TABLE]
430
431 ;; load input pointers
432 mov inp0,[arg1+_data_ptr_md5 +0*PTR_SZ]
433 mov inp1,[arg1+_data_ptr_md5 +1*PTR_SZ]
434 mov inp2,[arg1+_data_ptr_md5 +2*PTR_SZ]
435 mov inp3,[arg1+_data_ptr_md5 +3*PTR_SZ]
436 mov inp4,[arg1+_data_ptr_md5 +4*PTR_SZ]
437 mov inp5,[arg1+_data_ptr_md5 +5*PTR_SZ]
438 mov inp6,[arg1+_data_ptr_md5 +6*PTR_SZ]
439 mov inp7,[arg1+_data_ptr_md5 +7*PTR_SZ]
 ;; IDX = byte offset of the current block within every input buffer
440 xor IDX, IDX
441
442 ; Make ping-pong pointers to the two memory blocks
443 mov mem1, rsp
444 lea mem2, [rsp + 16*16*2]
445
446
447;; Load first block of data and save back to stack
;; Each %rep iteration reads 16 bytes (4 message words) from every lane with
;; unaligned loads, transposes each group of four lanes so that one register
;; holds the SAME message word for 4 lanes, and stores the 4 resulting rows.
;; Lanes 0-3 go to mem1 + word*16, lanes 4-7 to mem1 + 16*16 + word*16.
;; After TRANSPOSE T2,T1,T4,T3,T0,T5 the rows are in T0,T1,T2,T3 in order.
448%assign I 0
449%rep 4
450 movdqu T2,[inp0+IDX+I*16]
451 movdqu T1,[inp1+IDX+I*16]
452 movdqu T4,[inp2+IDX+I*16]
453 movdqu T3,[inp3+IDX+I*16]
454 TRANSPOSE T2, T1, T4, T3, T0, T5
455 movdqa [mem1+(I*4+0)*16],T0
456 movdqa [mem1+(I*4+1)*16],T1
457 movdqa [mem1+(I*4+2)*16],T2
458 movdqa [mem1+(I*4+3)*16],T3
459
460 movdqu T2,[inp4+IDX+I*16]
461 movdqu T1,[inp5+IDX+I*16]
462 movdqu T4,[inp6+IDX+I*16]
463 movdqu T3,[inp7+IDX+I*16]
464 TRANSPOSE T2, T1, T4, T3, T0, T5
465 movdqa [mem1+(I*4+0)*16 + 16*16],T0
466 movdqa [mem1+(I*4+1)*16 + 16*16],T1
467 movdqa [mem1+(I*4+2)*16 + 16*16],T2
468 movdqa [mem1+(I*4+3)*16 + 16*16],T3
469%assign I (I+1)
470%endrep
471
;; Main loop: one iteration per input block except the last.
;; On entry to each iteration: A..D / A2..D2 hold the running digests, mem1
;; points at the already-transposed current block, IDX is that block's byte
;; offset in the input buffers and arg2 is the number of blocks not yet hashed
;; (including the current one).
472lloop:
473 ; save old digests
474 movdqa [AA], A
475 movdqa [BB], B
476 movdqa [CC], C
477 movdqa [DD], D
478 ; save old digests
479 movdqa [AA2], A2
480 movdqa [BB2], B2
481 movdqa [CC2], C2
482 movdqa [DD2], D2
483
 ; IDX now addresses the block AFTER the current one (64 bytes per block).
 ; If the decremented count is zero, the current block is the final one and
 ; there is nothing further to load: handle it in lastblock.
484 add IDX, 4*16
485 sub arg2, 1
486 je lastblock
487
 ; 64 MD5 steps on the current block (mem1), interleaved with loading and
 ; transposing the next block into mem2: after every 8 steps, 16 bytes from
 ; each of four lanes are fetched.  MD5_STEP1 is used because it only needs
 ; FUN/TMP, leaving T0-T5 free for the loads.  The assemble-time counter I
 ; selects which 16-byte quarter of the next block is being loaded.
 ; Steps 0-15: MAGIC_F
488 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
489 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
490 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
491 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
492 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
493 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
494 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
495 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
496
 ; next block, words 0-3, lanes 0-3
497%assign I 0
498 movdqu T2,[inp0+IDX+I*16]
499 movdqu T1,[inp1+IDX+I*16]
500 movdqu T4,[inp2+IDX+I*16]
501 movdqu T3,[inp3+IDX+I*16]
502 TRANSPOSE T2, T1, T4, T3, T0, T5
503 movdqa [mem2+(I*4+0)*16],T0
504 movdqa [mem2+(I*4+1)*16],T1
505 movdqa [mem2+(I*4+2)*16],T2
506 movdqa [mem2+(I*4+3)*16],T3
507
508 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
509 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
510 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
511 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
512 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
513 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
514 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
515 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
516
517
 ; next block, words 0-3, lanes 4-7
518 movdqu T2,[inp4+IDX+I*16]
519 movdqu T1,[inp5+IDX+I*16]
520 movdqu T4,[inp6+IDX+I*16]
521 movdqu T3,[inp7+IDX+I*16]
522 TRANSPOSE T2, T1, T4, T3, T0, T5
523 movdqa [mem2+(I*4+0)*16 + 16*16],T0
524 movdqa [mem2+(I*4+1)*16 + 16*16],T1
525 movdqa [mem2+(I*4+2)*16 + 16*16],T2
526 movdqa [mem2+(I*4+3)*16 + 16*16],T3
527%assign I (I+1)
528
 ; Steps 16-31: MAGIC_G
529 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
530 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
531 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
532 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
533 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
534 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
535 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
536 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
537
 ; next block, words 4-7, lanes 0-3
538 movdqu T2,[inp0+IDX+I*16]
539 movdqu T1,[inp1+IDX+I*16]
540 movdqu T4,[inp2+IDX+I*16]
541 movdqu T3,[inp3+IDX+I*16]
542 TRANSPOSE T2, T1, T4, T3, T0, T5
543 movdqa [mem2+(I*4+0)*16],T0
544 movdqa [mem2+(I*4+1)*16],T1
545 movdqa [mem2+(I*4+2)*16],T2
546 movdqa [mem2+(I*4+3)*16],T3
547
548 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
549 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
550 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
551 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
552 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
553 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
554 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
555 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
556
 ; next block, words 4-7, lanes 4-7
557 movdqu T2,[inp4+IDX+I*16]
558 movdqu T1,[inp5+IDX+I*16]
559 movdqu T4,[inp6+IDX+I*16]
560 movdqu T3,[inp7+IDX+I*16]
561 TRANSPOSE T2, T1, T4, T3, T0, T5
562 movdqa [mem2+(I*4+0)*16 + 16*16],T0
563 movdqa [mem2+(I*4+1)*16 + 16*16],T1
564 movdqa [mem2+(I*4+2)*16 + 16*16],T2
565 movdqa [mem2+(I*4+3)*16 + 16*16],T3
566%assign I (I+1)
567
 ; Steps 32-47: MAGIC_H
568 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
569 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
570 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
571 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
572 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
573 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
574 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
575 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
576
 ; next block, words 8-11, lanes 0-3
577 movdqu T2,[inp0+IDX+I*16]
578 movdqu T1,[inp1+IDX+I*16]
579 movdqu T4,[inp2+IDX+I*16]
580 movdqu T3,[inp3+IDX+I*16]
581 TRANSPOSE T2, T1, T4, T3, T0, T5
582 movdqa [mem2+(I*4+0)*16],T0
583 movdqa [mem2+(I*4+1)*16],T1
584 movdqa [mem2+(I*4+2)*16],T2
585 movdqa [mem2+(I*4+3)*16],T3
586
587 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
588 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
589 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
590 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
591 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
592 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
593 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
594 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
595
 ; next block, words 8-11, lanes 4-7
596 movdqu T2,[inp4+IDX+I*16]
597 movdqu T1,[inp5+IDX+I*16]
598 movdqu T4,[inp6+IDX+I*16]
599 movdqu T3,[inp7+IDX+I*16]
600 TRANSPOSE T2, T1, T4, T3, T0, T5
601 movdqa [mem2+(I*4+0)*16 + 16*16],T0
602 movdqa [mem2+(I*4+1)*16 + 16*16],T1
603 movdqa [mem2+(I*4+2)*16 + 16*16],T2
604 movdqa [mem2+(I*4+3)*16 + 16*16],T3
605%assign I (I+1)
606
 ; Steps 48-63: MAGIC_I
607 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
608 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
609 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
610 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
611 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
612 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
613 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
614 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
615
 ; next block, words 12-15, lanes 0-3
616 movdqu T2,[inp0+IDX+I*16]
617 movdqu T1,[inp1+IDX+I*16]
618 movdqu T4,[inp2+IDX+I*16]
619 movdqu T3,[inp3+IDX+I*16]
620 TRANSPOSE T2, T1, T4, T3, T0, T5
621 movdqa [mem2+(I*4+0)*16],T0
622 movdqa [mem2+(I*4+1)*16],T1
623 movdqa [mem2+(I*4+2)*16],T2
624 movdqa [mem2+(I*4+3)*16],T3
625
626 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
627 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
628 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
629 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
630 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
631 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
632 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
633 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
634
 ; next block, words 12-15, lanes 4-7
635 movdqu T2,[inp4+IDX+I*16]
636 movdqu T1,[inp5+IDX+I*16]
637 movdqu T4,[inp6+IDX+I*16]
638 movdqu T3,[inp7+IDX+I*16]
639 TRANSPOSE T2, T1, T4, T3, T0, T5
640 movdqa [mem2+(I*4+0)*16 + 16*16],T0
641 movdqa [mem2+(I*4+1)*16 + 16*16],T1
642 movdqa [mem2+(I*4+2)*16 + 16*16],T2
643 movdqa [mem2+(I*4+3)*16 + 16*16],T3
644%assign I (I+1)
645
646
 ; add the digests saved at the top of the iteration to the step results
647 paddd A,[AA]
648 paddd B,[BB]
649 paddd C,[CC]
650 paddd D,[DD]
651
652 paddd A2,[AA2]
653 paddd B2,[BB2]
654 paddd C2,[CC2]
655 paddd D2,[DD2]
656
657 ; swap mem1 and mem2
 ; (the block just loaded into mem2 becomes the current block)
658 xchg mem1, mem2
659
660 jmp lloop
661
;; Final block.  Reached from lloop with the old digests already saved in
;; AA..DD2 and IDX already advanced past this block.  No further input is
;; loaded here, so xmm10/xmm11 are free and MD5_STEP (separate FUN2/TMP2
;; scratch registers for the second lane set) is used instead of MD5_STEP1.
662lastblock:
663
 ; Steps 0-15: MAGIC_F
664 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
665 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
666 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
667 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
668 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
669 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
670 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
671 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
672 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
673 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
674 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
675 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
676 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
677 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
678 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
679 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
680
 ; Steps 16-31: MAGIC_G
681 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
682 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
683 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
684 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
685 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
686 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
687 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
688 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
689 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
690 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
691 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
692 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
693 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
694 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
695 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
696 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
697
 ; Steps 32-47: MAGIC_H
698 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
699 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
700 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
701 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
702 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
703 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
704 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
705 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
706 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
707 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
708 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
709 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
710 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
711 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
712 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
713 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
714
 ; Steps 48-63: MAGIC_I
715 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
716 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
717 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
718 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
719 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
720 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
721 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
722 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
723 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
724 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
725 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
726 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
727 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
728 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
729 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
730 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
731
 ; add the digests saved before this block to the step results
732 paddd A,[AA]
733 paddd B,[BB]
734 paddd C,[CC]
735 paddd D,[DD]
736
737 paddd A2,[AA2]
738 paddd B2,[BB2]
739 paddd C2,[CC2]
740 paddd D2,[DD2]
741
742 ; write out digests
 ; (unaligned stores, to the same addresses the digests were loaded from)
743 movdqu [arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE], A
744 movdqu [arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE], B
745 movdqu [arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE], C
746 movdqu [arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE], D
747 movdqu [arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2
748 movdqu [arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2
749 movdqu [arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2
750 movdqu [arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2
751
752 ;; update input pointers
 ;; IDX was advanced by 64 once per processed block (including this one),
 ;; so it equals the total number of bytes consumed from each buffer.
753 add inp0, IDX
754 add inp1, IDX
755 add inp2, IDX
756 add inp3, IDX
757 add inp4, IDX
758 add inp5, IDX
759 add inp6, IDX
760 add inp7, IDX
761 mov [arg1 +_data_ptr_md5 + 0*PTR_SZ], inp0
762 mov [arg1 +_data_ptr_md5 + 1*PTR_SZ], inp1
763 mov [arg1 +_data_ptr_md5 + 2*PTR_SZ], inp2
764 mov [arg1 +_data_ptr_md5 + 3*PTR_SZ], inp3
765 mov [arg1 +_data_ptr_md5 + 4*PTR_SZ], inp4
766 mov [arg1 +_data_ptr_md5 + 5*PTR_SZ], inp5
767 mov [arg1 +_data_ptr_md5 + 6*PTR_SZ], inp6
768 mov [arg1 +_data_ptr_md5 + 7*PTR_SZ], inp7
769
f67539c2
TL
770 ;; Clear stack frame (72*16 bytes)
 ;; Only when built with SAFE_DATA: overwrite the transposed message data
 ;; and saved digests (_DATA + _DIGEST = 64 + 8 owords) with zeros.  The
 ;; trailing 8 alignment bytes of the frame are never written and are not
 ;; cleared.  xmm0 (A) is free to zero here because the digests have
 ;; already been written out above.
771%ifdef SAFE_DATA
772 pxor xmm0, xmm0
773%assign i 0
774%rep (2*2*16+8)
775 movdqa [rsp + i*16], xmm0
776%assign i (i+1)
777%endrep
778%endif
779
11fdf7f2
TL
780 ;;;;;;;;;;;;;;;;
781 ;; Postamble
 ;; Release the frame; no general-purpose or xmm registers are restored.
782 add rsp, STACK_SIZE
783 ret
784
785%ifdef LINUX
786section .note.GNU-stack noalloc noexec nowrite progbits
787%endif