]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx / md5_x4x2_avx.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28;; code to compute MD5 over eight buffers in parallel ("octal": two sets of four lanes) using AVX
29
30;; Stack must be aligned to 16 bytes before call
31;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
32;; Windows preserves: rcx rbp
33;;
34;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
35;; Linux preserves: rdi rbp
36;;
37;; clobbers xmm0-15
38
f67539c2 39%include "include/os.asm"
11fdf7f2
TL
40%include "mb_mgr_datastruct.asm"
41
;; Table of MD5 additive constants, defined in another module.
;; This file reads it as 64 consecutive 16-byte entries (one per MD5 step).
extern MD5_TABLE

section .data
default rel
align 64
;; 128-bit all-ones mask: XOR with it yields a bitwise NOT (see MAGIC_I).
ONES:   times 4 dd 0xffffffff

section .text
51
;; ---------------------------------------------------------------------
;; Register aliases
;; ---------------------------------------------------------------------

;; arg1/arg2 are the two incoming arguments; mem1/mem2 are the two
;; "ping-pong" pointers into the stack frame's message-block buffers.
%ifndef LINUX
;; Windows registers
%define arg1    rcx
%define arg2    rdx
%define mem1    rdi
%define mem2    rsi
%else
;; Linux registers
%define arg1    rdi
%define arg2    rsi
%define mem1    rcx
%define mem2    rdx
%endif

;; rbp is never used, so it is not clobbered

%define state    arg1           ; MD5_ARGS pointer
%define num_blks arg2           ; blocks still to process

;; One input data pointer per lane
%define inp0    r8
%define inp1    r9
%define inp2    r10
%define inp3    r11
%define inp4    r12
%define inp5    r13
%define inp6    r14
%define inp7    r15

%define TBL     rax             ; base address of MD5_TABLE
%define IDX     rbx             ; byte offset into every input buffer

;; Working digest words, first set of lanes
%define A       xmm0
%define B       xmm1
%define C       xmm2
%define D       xmm3
%define E       xmm4            ; tmp
%define F       xmm5            ; tmp

;; Working digest words, second set of lanes
%define A2      xmm6
%define B2      xmm7
%define C2      xmm8
%define D2      xmm9


;; Scratch registers for the step macros.
;; Note: FUN2/TMP2 occupy the same registers as T0/T1 below.
%define FUN     E
%define TMP     F
%define FUN2    xmm10
%define TMP2    xmm11

;; Scratch registers for loading and transposing message data
%define T0      xmm10
%define T1      xmm11
%define T2      xmm12
%define T3      xmm13
%define T4      xmm14
%define T5      xmm15
106
107; Stack Layout
108;
109; 470 DD2
110; 460 CC2
111; 450 BB2
112; 440 AA2
113; 430 DD
114; 420 CC
115; 410 BB
116; 400 AA
117;
118; 3F0 data2[15] for lanes 7...4 \
119; ... \
120; 300 data2[0] for lanes 7...4 \
121; 2F0 data2[15] for lanes 3...0 > mem block 2
122; ... /
123; 210 data2[1] for lanes 3...0 /
124; 200 data2[0] for lanes 3...0 /
125;
126; 1F0 data1[15] for lanes 7...4 \
127; ... \
128; 100 data1[0] for lanes 7...4 \
129; F0 data1[15] for lanes 3...0 > mem block 1
130; ... /
131; 10 data1[1] for lanes 3...0 /
132; 0 data1[0] for lanes 3...0 /
133
;; Stack frame.
;; Its size must be an odd multiple of 8 bytes: the stack is 16-byte aligned
;; before the call (see the file header), so the pushed return address leaves
;; rsp at 8 mod 16 on entry, and subtracting an odd multiple of 8 makes rsp
;; 16-byte aligned again for the aligned (vmovdqa) accesses to this frame.
struc STACK
_DATA:          reso    2*2*16  ; 2 blocks * 2 sets of lanes * 16 regs
_DIGEST:        reso    8       ; stores AA-DD, AA2-DD2
                resb    8       ; for alignment
endstruc
%define STACK_SIZE STACK_size

;; Addresses of the saved-digest slots; use inside brackets, e.g. [AA]
%define AA      rsp + _DIGEST + 0*16
%define BB      rsp + _DIGEST + 1*16
%define CC      rsp + _DIGEST + 2*16
%define DD      rsp + _DIGEST + 3*16
%define AA2     rsp + _DIGEST + 4*16
%define BB2     rsp + _DIGEST + 5*16
%define CC2     rsp + _DIGEST + 6*16
%define DD2     rsp + _DIGEST + 7*16
150
;;
;; MD5 left-rotation amounts in bits; rotRS = round R, step S within
;; each group of four steps
;;

;; round 1 (MAGIC_F)
rot11   equ     7
rot12   equ     12
rot13   equ     17
rot14   equ     22

;; round 2 (MAGIC_G)
rot21   equ     5
rot22   equ     9
rot23   equ     14
rot24   equ     20

;; round 3 (MAGIC_H)
rot31   equ     4
rot32   equ     11
rot33   equ     16
rot34   equ     23

;; round 4 (MAGIC_I)
rot41   equ     6
rot42   equ     10
rot43   equ     15
rot44   equ     21
170
;; TRANSPOSE r0, r1, r2, r3, t0, t1
;;
;; 4x4 transpose of the 32-bit elements held in {r0..r3}, using the two
;; temporaries {t0, t1}. All six operands must be distinct xmm registers.
;;
;; Input:
;;   r0 = {a3 a2 a1 a0}
;;   r1 = {b3 b2 b1 b0}
;;   r2 = {c3 c2 c1 c0}
;;   r3 = {d3 d2 d1 d0}
;;
;; Output (note the register order: t0, r1, r0, r3):
;;   t0 = {d0 c0 b0 a0}
;;   r1 = {d1 c1 b1 a1}
;;   r0 = {d2 c2 b2 a2}
;;   r3 = {d3 c3 b3 a3}
;;
;; r2 and t1 are left holding intermediate values.
%macro TRANSPOSE 6
%define %%row_a         %1
%define %%row_b         %2
%define %%row_c         %3
%define %%row_d         %4
%define %%tmp_ab        %5
%define %%tmp_cd        %6
;; vshufps selectors (two elements from each source)
%define %%LOW_PAIRS     0x44    ; elements 1:0 of both sources
%define %%HIGH_PAIRS    0xEE    ; elements 3:2 of both sources
%define %%ODD_ELEMS     0xDD    ; elements 3,1 of both sources
%define %%EVEN_ELEMS    0x88    ; elements 2,0 of both sources

        ;; stage 1: pair up rows a/b and rows c/d
        vshufps %%tmp_ab, %%row_a, %%row_b, %%LOW_PAIRS         ; tmp_ab = {b1 b0 a1 a0}
        vshufps %%tmp_cd, %%row_c, %%row_d, %%LOW_PAIRS         ; tmp_cd = {d1 d0 c1 c0}
        vshufps %%row_a, %%row_a, %%row_b, %%HIGH_PAIRS         ; row_a  = {b3 b2 a3 a2}
        vshufps %%row_c, %%row_c, %%row_d, %%HIGH_PAIRS         ; row_c  = {d3 d2 c3 c2}

        ;; stage 2: pick odd/even elements to form the columns
        vshufps %%row_b, %%tmp_ab, %%tmp_cd, %%ODD_ELEMS        ; row_b  = {d1 c1 b1 a1}
        vshufps %%tmp_ab, %%tmp_ab, %%tmp_cd, %%EVEN_ELEMS      ; tmp_ab = {d0 c0 b0 a0}
        vshufps %%row_d, %%row_a, %%row_c, %%ODD_ELEMS          ; row_d  = {d3 c3 b3 a3}
        vshufps %%row_a, %%row_a, %%row_c, %%EVEN_ELEMS         ; row_a  = {d2 c2 b2 a2}
%endmacro
204
;;
;; Magic functions defined in RFC 1321
;;

;; MAGIC_F dst, X, Y, Z
;;   dst = Z ^ (X & (Y ^ Z))    -- bitwise select: Y where X is set, else Z
;; dst is written by the first instruction and X/Z are read afterwards,
;; so dst must not be the same register as X or Z.
%macro MAGIC_F 4
%define %%dst   %1
%define %%x     %2
%define %%y     %3
%define %%z     %4
        vpxor   %%dst, %%y, %%z         ; dst = Y ^ Z
        vpand   %%dst, %%x, %%dst       ; dst = X & (Y ^ Z)
        vpxor   %%dst, %%z, %%dst       ; dst = Z ^ (X & (Y ^ Z))
%endmacro
218
;; MAGIC_G dst, X, Y, Z
;;   dst = F(Z, X, Y) = Y ^ (Z & (X ^ Y))   -- X where Z is set, else Y
;; dst is written by the first instruction and Z/Y are read afterwards,
;; so dst must not be the same register as Y or Z.
%macro MAGIC_G 4
%define %%dst   %1
%define %%x     %2
%define %%y     %3
%define %%z     %4
        vpxor   %%dst, %%y, %%x         ; dst = X ^ Y
        vpand   %%dst, %%dst, %%z       ; dst = Z & (X ^ Y)
        vpxor   %%dst, %%dst, %%y       ; dst = Y ^ (Z & (X ^ Y))
%endmacro
227
;; MAGIC_H dst, X, Y, Z
;;   dst = X ^ Y ^ Z
;; dst is written first and X is read afterwards, so dst must not be
;; the same register as X.
%macro MAGIC_H 4
%define %%dst   %1
%define %%x     %2
%define %%y     %3
%define %%z     %4
        vpxor   %%dst, %%y, %%z         ; dst = Y ^ Z
        vpxor   %%dst, %%x, %%dst       ; dst = X ^ Y ^ Z
%endmacro
237
;; MAGIC_I dst, X, Y, Z
;;   dst = Y ^ (X | ~Z)
;; The complement of Z is formed by XOR-ing with the all-ones constant ONES,
;; which is loaded from memory on every expansion.
;; dst is written first and X/Y are read afterwards, so dst must not be
;; the same register as X or Y.
%macro MAGIC_I 4
%define %%dst   %1
%define %%x     %2
%define %%y     %3
%define %%z     %4
        vpxor   %%dst, %%z, [rel ONES]  ; dst = ~Z
        vpor    %%dst, %%x, %%dst       ; dst = X | ~Z
        vpxor   %%dst, %%y, %%dst       ; dst = Y ^ (X | ~Z)
%endmacro
248
;; PROLD reg, imm, tmp
;;   Rotate every 32-bit element of reg left by imm bits (imm is an
;;   assemble-time constant). tmp is clobbered and must differ from reg.
%macro PROLD 3
%define %%val   %1
%define %%bits  %2
%define %%spill %3
        vpsrld  %%spill, %%val, (32-%%bits)     ; bits that wrap around to the bottom
        vpslld  %%val, %%val, %%bits            ; bits that stay, moved up
        vpor    %%val, %%spill, %%val           ; merge the two parts
%endmacro
258
;;
;; Single MD5 step applied to both sets of lanes, sharing one FUN/TMP pair
;;
;;   A  = B  + ROL32(A  + MAGIC(B, C, D)    + data       + const, nrot)
;;   A2 = B2 + ROL32(A2 + MAGIC(B2, C2, D2) + data_upper + const, nrot)
;;
;; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
;;
;;   MAGIC_FUN  name of the round function macro (MAGIC_F/G/H/I)
;;   A..D       working registers for the first set of lanes (A is updated)
;;   A2..D2     working registers for the second set of lanes (A2 is updated)
;;   FUN, TMP   scratch registers, each used for both sets in turn
;;   data       address expression (no brackets) of the message word for the
;;              first set; the second set's word is read 16*16 bytes above it
;;   MD5const   bracketed memory operand holding the step constant
;;   nrot       rotation amount in bits
%macro MD5_STEP1 14
%define %%ROUND_FN      %1
%define %%a_lo          %2
%define %%b_lo          %3
%define %%c_lo          %4
%define %%d_lo          %5
%define %%a_hi          %6
%define %%b_hi          %7
%define %%c_hi          %8
%define %%d_hi          %9
%define %%fn            %10
%define %%scratch       %11
%define %%msg           %12
%define %%kconst        %13
%define %%rotbits       %14
%define %%UPPER_LANES   (16*16)         ; byte offset of the second set's words

        vpaddd  %%a_lo, %%a_lo, %%kconst
        vpaddd  %%a_hi, %%a_hi, %%kconst
        vpaddd  %%a_lo, %%a_lo, [%%msg]
        vpaddd  %%a_hi, %%a_hi, [%%msg + %%UPPER_LANES]
        %%ROUND_FN %%fn, %%b_lo, %%c_lo, %%d_lo
        vpaddd  %%a_lo, %%fn, %%a_lo
        %%ROUND_FN %%fn, %%b_hi, %%c_hi, %%d_hi         ; fn is reused for the second set
        vpaddd  %%a_hi, %%fn, %%a_hi
        PROLD   %%a_lo, %%rotbits, %%scratch
        PROLD   %%a_hi, %%rotbits, %%scratch
        vpaddd  %%a_lo, %%b_lo, %%a_lo
        vpaddd  %%a_hi, %%b_hi, %%a_hi
%endmacro
294
;;
;; Single MD5 step applied to both sets of lanes, with a separate FUN/TMP
;; pair for each set
;;
;;   A  = B  + ROL32(A  + MAGIC(B, C, D)    + data       + const, nrot)
;;   A2 = B2 + ROL32(A2 + MAGIC(B2, C2, D2) + data_upper + const, nrot)
;;
;; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
;;                MD5const, nrot
;;
;;   MAGIC_FUN   name of the round function macro (MAGIC_F/G/H/I)
;;   A..D        working registers for the first set of lanes (A is updated)
;;   A2..D2      working registers for the second set of lanes (A2 is updated)
;;   FUN, TMP    scratch registers for the first set
;;   FUN2, TMP2  scratch registers for the second set
;;   data        address expression (no brackets) of the message word for the
;;               first set; the second set's word is read 16*16 bytes above it.
;;               Both are fetched with aligned loads.
;;   MD5const    bracketed memory operand holding the step constant
;;   nrot        rotation amount in bits
%macro MD5_STEP 16
%define %%ROUND_FN      %1
%define %%a_lo          %2
%define %%b_lo          %3
%define %%c_lo          %4
%define %%d_lo          %5
%define %%a_hi          %6
%define %%b_hi          %7
%define %%c_hi          %8
%define %%d_hi          %9
%define %%fn_lo         %10
%define %%tmp_lo        %11
%define %%fn_hi         %12
%define %%tmp_hi        %13
%define %%msg           %14
%define %%kconst        %15
%define %%rotbits       %16
%define %%UPPER_LANES   (16*16)         ; byte offset of the second set's words

        vmovdqa %%tmp_lo, [%%msg]
        vmovdqa %%tmp_hi, [%%msg + %%UPPER_LANES]
        vpaddd  %%a_lo, %%a_lo, %%kconst
        vpaddd  %%a_hi, %%a_hi, %%kconst
        vpaddd  %%a_lo, %%tmp_lo, %%a_lo
        vpaddd  %%a_hi, %%tmp_hi, %%a_hi
        %%ROUND_FN %%fn_lo, %%b_lo, %%c_lo, %%d_lo
        %%ROUND_FN %%fn_hi, %%b_hi, %%c_hi, %%d_hi
        vpaddd  %%a_lo, %%fn_lo, %%a_lo
        vpaddd  %%a_hi, %%fn_hi, %%a_hi
        PROLD   %%a_lo, %%rotbits, %%tmp_lo     ; tmp_* are free again after the adds
        PROLD   %%a_hi, %%rotbits, %%tmp_hi
        vpaddd  %%a_lo, %%b_lo, %%a_lo
        vpaddd  %%a_hi, %%b_hi, %%a_hi
%endmacro
335
336; void md5_x4x2_avx(MD5_ARGS *args, UINT64 num_blks)
337; arg 1 : pointer to MD5_ARGS structure
338; arg 2 : number of blocks (>=1)
339;
; Hashes num_blks 64-byte blocks from each of eight input buffers, updating
; the eight digests and the eight data pointers held in the MD5_ARGS structure.
; No registers are saved or restored by this routine; see the clobber lists at
; the top of the file. num_blks == 0 is not handled: the counter is decremented
; before it is tested, so it would wrap around.
340align 32
341MKGLOBAL(md5_x4x2_avx,function,internal)
342md5_x4x2_avx:
343
 ; Reserve the STACK frame: two message-block buffers plus the saved digests
344 sub rsp, STACK_SIZE
345
346 ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2
347 ;; Initialize digests
 ;; (first 16 bytes of each digest row -> A..D, next 16 bytes -> A2..D2)
348 vmovdqa A,[state + 0*16 + 0*MD5_DIGEST_ROW_SIZE]
349 vmovdqa B,[state + 0*16 + 1*MD5_DIGEST_ROW_SIZE]
350 vmovdqa C,[state + 0*16 + 2*MD5_DIGEST_ROW_SIZE]
351 vmovdqa D,[state + 0*16 + 3*MD5_DIGEST_ROW_SIZE]
352
353 vmovdqa A2,[state + 1*16 + 0*MD5_DIGEST_ROW_SIZE]
354 vmovdqa B2,[state + 1*16 + 1*MD5_DIGEST_ROW_SIZE]
355 vmovdqa C2,[state + 1*16 + 2*MD5_DIGEST_ROW_SIZE]
356 vmovdqa D2,[state + 1*16 + 3*MD5_DIGEST_ROW_SIZE]
357
 ; TBL = base of the constant table; step n reads the 16 bytes at [TBL + n*16]
358 lea TBL, [rel MD5_TABLE]
359
360 ;; load input pointers
361 mov inp0,[state+_data_ptr_md5 +0*PTR_SZ]
362 mov inp1,[state+_data_ptr_md5 +1*PTR_SZ]
363 mov inp2,[state+_data_ptr_md5 +2*PTR_SZ]
364 mov inp3,[state+_data_ptr_md5 +3*PTR_SZ]
365 mov inp4,[state+_data_ptr_md5 +4*PTR_SZ]
366 mov inp5,[state+_data_ptr_md5 +5*PTR_SZ]
367 mov inp6,[state+_data_ptr_md5 +6*PTR_SZ]
368 mov inp7,[state+_data_ptr_md5 +7*PTR_SZ]
 ; IDX = byte offset into every input buffer, starting at 0
369 xor IDX, IDX
370
371 ; Make ping-pong pointers to the two memory blocks
 ; mem1 -> block being hashed, mem2 -> buffer the next block is loaded into
372 mov mem1, rsp
373 lea mem2, [rsp + 16*16*2]
374
375;; Load first block of data and save back to stack
;; Each of the 4 iterations reads 16 bytes from every lane, transposes the
;; 4x4 dwords of lanes 0-3 and of lanes 4-7, and stores message words
;; I*4 .. I*4+3 (TRANSPOSE returns them in T0, T1, T2, T3). Lanes 0-3 go to
;; mem1, lanes 4-7 go 16*16 bytes above. Input loads are unaligned (vmovdqu);
;; stack stores are aligned (vmovdqa).
376%assign I 0
377%rep 4
378 vmovdqu T2,[inp0+IDX+I*16]
379 vmovdqu T1,[inp1+IDX+I*16]
380 vmovdqu T4,[inp2+IDX+I*16]
381 vmovdqu T3,[inp3+IDX+I*16]
382 TRANSPOSE T2, T1, T4, T3, T0, T5
383 vmovdqa [mem1+(I*4+0)*16],T0
384 vmovdqa [mem1+(I*4+1)*16],T1
385 vmovdqa [mem1+(I*4+2)*16],T2
386 vmovdqa [mem1+(I*4+3)*16],T3
387
388 vmovdqu T2,[inp4+IDX+I*16]
389 vmovdqu T1,[inp5+IDX+I*16]
390 vmovdqu T4,[inp6+IDX+I*16]
391 vmovdqu T3,[inp7+IDX+I*16]
392 TRANSPOSE T2, T1, T4, T3, T0, T5
393 vmovdqa [mem1+(I*4+0)*16 + 16*16],T0
394 vmovdqa [mem1+(I*4+1)*16 + 16*16],T1
395 vmovdqa [mem1+(I*4+2)*16 + 16*16],T2
396 vmovdqa [mem1+(I*4+3)*16 + 16*16],T3
397%assign I (I+1)
398%endrep
399
; Main loop: one iteration per block except the last one.
; Hashes the block at mem1 while loading and transposing the following block
; into mem2, 16 bytes per lane after every group of eight steps.
400lloop:
401 ; save old digests (first set of lanes)
402 vmovdqa [AA], A
403 vmovdqa [BB], B
404 vmovdqa [CC], C
405 vmovdqa [DD], D
406 ; save old digests (second set of lanes)
407 vmovdqa [AA2], A2
408 vmovdqa [BB2], B2
409 vmovdqa [CC2], C2
410 vmovdqa [DD2], D2
411
 ; Advance IDX by one 64-byte block so the loads below fetch the NEXT block.
 ; If the block at mem1 is the final one there is no next block to load:
 ; take the load-free path at lastblock instead.
412 add IDX, 4*16
413 sub num_blks, 1
414 je lastblock
415
 ; Round 1 (MAGIC_F), steps 0-7
416 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
417 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
418 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
419 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
420 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
421 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
422 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
423 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
424
 ; Next block, bytes 0-15 of lanes 0-3 -> words 0-3 at mem2
425%assign I 0
426 vmovdqu T2,[inp0+IDX+I*16]
427 vmovdqu T1,[inp1+IDX+I*16]
428 vmovdqu T4,[inp2+IDX+I*16]
429 vmovdqu T3,[inp3+IDX+I*16]
430 TRANSPOSE T2, T1, T4, T3, T0, T5
431 vmovdqa [mem2+(I*4+0)*16],T0
432 vmovdqa [mem2+(I*4+1)*16],T1
433 vmovdqa [mem2+(I*4+2)*16],T2
434 vmovdqa [mem2+(I*4+3)*16],T3
435
 ; Round 1 (MAGIC_F), steps 8-15
436 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
437 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
438 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
439 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
440 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
441 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
442 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
443 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
444
445
 ; Next block, bytes 0-15 of lanes 4-7 -> words 0-3 at mem2 + 16*16
446 vmovdqu T2,[inp4+IDX+I*16]
447 vmovdqu T1,[inp5+IDX+I*16]
448 vmovdqu T4,[inp6+IDX+I*16]
449 vmovdqu T3,[inp7+IDX+I*16]
450 TRANSPOSE T2, T1, T4, T3, T0, T5
451 vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
452 vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
453 vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
454 vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
455%assign I (I+1)
456
 ; Round 2 (MAGIC_G), steps 16-23
457 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
458 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
459 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
460 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
461 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
462 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
463 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
464 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
465
 ; Next block, bytes 16-31 of lanes 0-3 -> words 4-7 at mem2
466 vmovdqu T2,[inp0+IDX+I*16]
467 vmovdqu T1,[inp1+IDX+I*16]
468 vmovdqu T4,[inp2+IDX+I*16]
469 vmovdqu T3,[inp3+IDX+I*16]
470 TRANSPOSE T2, T1, T4, T3, T0, T5
471 vmovdqa [mem2+(I*4+0)*16],T0
472 vmovdqa [mem2+(I*4+1)*16],T1
473 vmovdqa [mem2+(I*4+2)*16],T2
474 vmovdqa [mem2+(I*4+3)*16],T3
475
 ; Round 2 (MAGIC_G), steps 24-31
476 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
477 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
478 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
479 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
480 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
481 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
482 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
483 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
484
 ; Next block, bytes 16-31 of lanes 4-7 -> words 4-7 at mem2 + 16*16
485 vmovdqu T2,[inp4+IDX+I*16]
486 vmovdqu T1,[inp5+IDX+I*16]
487 vmovdqu T4,[inp6+IDX+I*16]
488 vmovdqu T3,[inp7+IDX+I*16]
489 TRANSPOSE T2, T1, T4, T3, T0, T5
490 vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
491 vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
492 vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
493 vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
494%assign I (I+1)
495
 ; Round 3 (MAGIC_H), steps 32-39
496 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
497 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
498 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
499 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
500 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
501 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
502 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
503 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
504
 ; Next block, bytes 32-47 of lanes 0-3 -> words 8-11 at mem2
505 vmovdqu T2,[inp0+IDX+I*16]
506 vmovdqu T1,[inp1+IDX+I*16]
507 vmovdqu T4,[inp2+IDX+I*16]
508 vmovdqu T3,[inp3+IDX+I*16]
509 TRANSPOSE T2, T1, T4, T3, T0, T5
510 vmovdqa [mem2+(I*4+0)*16],T0
511 vmovdqa [mem2+(I*4+1)*16],T1
512 vmovdqa [mem2+(I*4+2)*16],T2
513 vmovdqa [mem2+(I*4+3)*16],T3
514
 ; Round 3 (MAGIC_H), steps 40-47
515 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
516 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
517 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
518 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
519 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
520 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
521 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
522 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
523
 ; Next block, bytes 32-47 of lanes 4-7 -> words 8-11 at mem2 + 16*16
524 vmovdqu T2,[inp4+IDX+I*16]
525 vmovdqu T1,[inp5+IDX+I*16]
526 vmovdqu T4,[inp6+IDX+I*16]
527 vmovdqu T3,[inp7+IDX+I*16]
528 TRANSPOSE T2, T1, T4, T3, T0, T5
529 vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
530 vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
531 vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
532 vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
533%assign I (I+1)
534
 ; Round 4 (MAGIC_I), steps 48-55
535 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
536 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
537 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
538 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
539 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
540 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
541 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
542 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
543
 ; Next block, bytes 48-63 of lanes 0-3 -> words 12-15 at mem2
544 vmovdqu T2,[inp0+IDX+I*16]
545 vmovdqu T1,[inp1+IDX+I*16]
546 vmovdqu T4,[inp2+IDX+I*16]
547 vmovdqu T3,[inp3+IDX+I*16]
548 TRANSPOSE T2, T1, T4, T3, T0, T5
549 vmovdqa [mem2+(I*4+0)*16],T0
550 vmovdqa [mem2+(I*4+1)*16],T1
551 vmovdqa [mem2+(I*4+2)*16],T2
552 vmovdqa [mem2+(I*4+3)*16],T3
553
 ; Round 4 (MAGIC_I), steps 56-63
554 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
555 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
556 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
557 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
558 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
559 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
560 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
561 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
562
 ; Next block, bytes 48-63 of lanes 4-7 -> words 12-15 at mem2 + 16*16
563 vmovdqu T2,[inp4+IDX+I*16]
564 vmovdqu T1,[inp5+IDX+I*16]
565 vmovdqu T4,[inp6+IDX+I*16]
566 vmovdqu T3,[inp7+IDX+I*16]
567 TRANSPOSE T2, T1, T4, T3, T0, T5
568 vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
569 vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
570 vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
571 vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
572%assign I (I+1)
573
574
 ; Add the digests saved at the top of the loop to the new working values
575 vpaddd A,A,[AA]
576 vpaddd B,B,[BB]
577 vpaddd C,C,[CC]
578 vpaddd D,D,[DD]
579
580 vpaddd A2,A2,[AA2]
581 vpaddd B2,B2,[BB2]
582 vpaddd C2,C2,[CC2]
583 vpaddd D2,D2,[DD2]
584
585 ; swap mem1 and mem2
 ; (the block just loaded into mem2 becomes the one hashed next)
586 xchg mem1, mem2
587
588 jmp lloop
589
; Final block: hash the block at mem1 without loading any further input.
; Uses MD5_STEP, which takes a separate FUN2/TMP2 scratch pair for the second
; set of lanes; those share registers with T0/T1, which are not needed here
; because no more message data is loaded.
590lastblock:
591
 ; Round 1 (MAGIC_F), steps 0-15
592 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
593 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
594 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
595 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
596 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
597 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
598 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
599 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
600 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
601 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
602 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
603 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
604 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
605 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
606 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
607 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
608
 ; Round 2 (MAGIC_G), steps 16-31
609 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
610 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
611 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
612 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
613 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
614 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
615 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
616 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
617 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
618 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
619 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
620 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
621 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
622 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
623 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
624 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
625
 ; Round 3 (MAGIC_H), steps 32-47
626 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
627 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
628 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
629 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
630 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
631 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
632 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
633 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
634 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
635 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
636 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
637 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
638 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
639 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
640 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
641 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
642
 ; Round 4 (MAGIC_I), steps 48-63
643 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
644 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
645 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
646 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
647 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
648 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
649 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
650 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
651 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
652 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
653 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
654 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
655 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
656 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
657 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
658 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
659
 ; Add the digests saved at the top of lloop to the new working values
660 vpaddd A,A,[AA]
661 vpaddd B,B,[BB]
662 vpaddd C,C,[CC]
663 vpaddd D,D,[DD]
664
665 vpaddd A2,A2,[AA2]
666 vpaddd B2,B2,[BB2]
667 vpaddd C2,C2,[CC2]
668 vpaddd D2,D2,[DD2]
669
670 ; write out digests
 ; NOTE(review): these stores use vmovdqu although the same locations were
 ; read with vmovdqa at function entry.
671 vmovdqu [state + 0*16 + 0*MD5_DIGEST_ROW_SIZE ], A
672 vmovdqu [state + 0*16 + 1*MD5_DIGEST_ROW_SIZE ], B
673 vmovdqu [state + 0*16 + 2*MD5_DIGEST_ROW_SIZE ], C
674 vmovdqu [state + 0*16 + 3*MD5_DIGEST_ROW_SIZE ], D
675 vmovdqu [state + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2
676 vmovdqu [state + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2
677 vmovdqu [state + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2
678 vmovdqu [state + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2
679
680 ;; update input pointers
 ;; IDX was advanced by 64 once per block (including this last one), so it
 ;; now equals the total number of bytes consumed from each buffer.
681 add inp0, IDX
682 add inp1, IDX
683 add inp2, IDX
684 add inp3, IDX
685 add inp4, IDX
686 add inp5, IDX
687 add inp6, IDX
688 add inp7, IDX
689 mov [state +_data_ptr_md5 + 0*PTR_SZ], inp0
690 mov [state +_data_ptr_md5 + 1*PTR_SZ], inp1
691 mov [state +_data_ptr_md5 + 2*PTR_SZ], inp2
692 mov [state +_data_ptr_md5 + 3*PTR_SZ], inp3
693 mov [state +_data_ptr_md5 + 4*PTR_SZ], inp4
694 mov [state +_data_ptr_md5 + 5*PTR_SZ], inp5
695 mov [state +_data_ptr_md5 + 6*PTR_SZ], inp6
696 mov [state +_data_ptr_md5 + 7*PTR_SZ], inp7
697
f67539c2
TL
698 ;; Clear stack frame (72*16 bytes)
 ;; 72 = 2*2*16 + 8 sixteen-byte stores: this covers _DATA and _DIGEST.
 ;; The trailing 8 alignment bytes of STACK are not written.
 ;; xmm0 is register A, whose digest value has already been stored above.
699%ifdef SAFE_DATA
700 vpxor xmm0, xmm0
701%assign i 0
702%rep (2*2*16+8)
703 vmovdqa [rsp + i*16], xmm0
704%assign i (i+1)
705%endrep
706%endif
707
11fdf7f2
TL
708 ;;;;;;;;;;;;;;;;
709 ;; Postamble
 ;; Release the stack frame; there are no saved registers to restore.
710 add rsp, STACK_SIZE
711
712 ret
713
; On Linux builds, emit an empty .note.GNU-stack section (noalloc, noexec,
; nowrite, progbits). By convention this tells the linker that the object
; does not need an executable stack.
714%ifdef LINUX
715section .note.GNU-stack noalloc noexec nowrite progbits
716%endif