1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "md5_mb_mgr_datastruct.asm"
31 %include "reg_sizes.asm"
37 ; clobbers all XMM registers
38 ; clobbers all GPRs except arg1 and r8
40 ;; code to compute octal MD5 using SSE
42 ; transpose r0, r1, r2, r3, t0, t1
43 ; "transpose" data in {r0..r3} using temps {t0, t1}
44 ; Input looks like: {r0 r1 r2 r3}
50 ; output looks like: {t0 r1 r0 r3}
; NOTE: the transposed rows do NOT come back in r0..r3 order. Per the
; per-instruction comments below, row 0 ends up in t0, row 1 in r1,
; row 2 in r0 and row 3 in r3; r2 and t1 are left holding intermediates.
; shufps imm8 used here (dst gets two dwords from dst, then two from src):
;   0x44 = dwords 0,1 of each    0xEE = dwords 2,3 of each
;   0x88 = dwords 0,2 of each    0xDD = dwords 1,3 of each
64 shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
65 shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
68 shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
69 shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
72 shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
75 shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
77 shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
78 shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
82 ;; Magic functions defined in RFC 1321
84 ; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
96 ; macro MAGIC_G F,X,Y,Z ;; F = MAGIC_F((Z),(X),(Y)) = ((Y) ^ ((Z) & ((X) ^ (Y))))
102 MAGIC_F %%F,%%Z,%%X,%%Y
105 ; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
116 ; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
123 pxor %%F,%%Z ; pnot %%Z
128 ; PROLD reg, imm, tmp
134 psrld %%tmp, (32-%%imm)
142 ;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
144 ; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
; One MD5 step applied to two independent register sets: (A,B,C,D) and
; (A2,B2,C2,D2). Both sets add the same MD5const; the second set reads its
; message word 16*16 bytes above the first set's word at %%data.
; FUN and TMP are shared scratch registers: each is reused for the second
; set after the first set has consumed it.
146 %define %%MAGIC_FUN %1
158 %define %%MD5const %13
161 paddd %%A, %%MD5const
162 paddd %%A2, %%MD5const
164 paddd %%A2, [%%data + 16*16]
165 %%MAGIC_FUN %%FUN, %%B,%%C,%%D
167 %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
; rotate both accumulators left by nrot bits (TMP is PROLD's scratch)
169 PROLD %%A,%%nrot, %%TMP
170 PROLD %%A2,%%nrot, %%TMP
178 ;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
180 ; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
; Same step as MD5_STEP1, applied to the register sets (A,B,C,D) and
; (A2,B2,C2,D2), but each set has its own scratch pair: FUN/TMP for the
; first set and FUN2/TMP2 for the second, so the two computations do not
; share temporaries. The second set's message word is read 16*16 bytes
; above the first set's word at %%data.
183 %define %%MAGIC_FUN %1
197 %define %%MD5const %15
200 paddd %%A, %%MD5const
201 paddd %%A2, %%MD5const
203 paddd %%A2, [%%data + 16*16]
204 %%MAGIC_FUN %%FUN, %%B,%%C,%%D
205 %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
; rotate both accumulators left by nrot bits, each with its own scratch
208 PROLD %%A,%%nrot, %%TMP
209 PROLD %%A2,%%nrot, %%TMP2
215 ;; MD5 left rotations (number of bits)
259 %ifidn __OUTPUT_FORMAT__, elf64
299 ; 3F0 data2[15] for lanes 7...4 \
301 ; 300 data2[0] for lanes 7...4 \
302 ; 2F0 data2[15] for lanes 3...0 > mem block 2
304 ; 210 data2[1] for lanes 3...0 /
305 ; 200 data2[0] for lanes 3...0 /
307 ; 1F0 data1[15] for lanes 7...4 \
309 ; 100 data1[0] for lanes 7...4 \
310 ; F0 data1[15] for lanes 3...0 > mem block 1
312 ; 10 data1[1] for lanes 3...0 /
313 ; 0 data1[0] for lanes 3...0 /
; Stack frame layout constants.
; MEM = 16 bytes * 16 data words * 2 lane groups * 2 blocks = 1024 bytes,
; matching the memory map in the comment above (offsets 0x000..0x3F0).
315 MEM equ 16*16*2*2 ; two blocks of data stored in stack
316 ; STACK_SIZE must be an odd multiple of 8 bytes in size
; NOTE(review): presumably this makes rsp 16-byte aligned after the frame is
; allocated (entry rsp is 8 mod 16), which the movdqa stores to the stack
; data area require -- confirm against the prologue.
; STACK_SIZE = data area + eight 16-byte slots (AA..DD2 below) + 8.
317 STACK_SIZE equ MEM + 16*8 + 8
; Eight 16-byte slots located directly above the data area, one per
; working register A,B,C,D / A2,B2,C2,D2.
; NOTE(review): presumably used to save the incoming digest for the
; end-of-block add -- confirm at the use sites.
319 %define AA rsp + MEM + 16*0
320 %define BB rsp + MEM + 16*1
321 %define CC rsp + MEM + 16*2
322 %define DD rsp + MEM + 16*3
323 %define AA2 rsp + MEM + 16*4
324 %define BB2 rsp + MEM + 16*5
325 %define CC2 rsp + MEM + 16*6
326 %define DD2 rsp + MEM + 16*7
328 ;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
330 ;#define NUM_MD5_DIGEST_WORDS 4
332 ;#define MD5_BLOCK_SIZE 64
334 ;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
337 ; DECLARE_ALIGNED(digest_array digest, 16);
338 ; UINT8* data_ptr[NUM_LANES];
341 ; void md5_mb_x4x2_sse(MD5_ARGS_X8 *args, UINT64 size)
342 ; arg 1 : pointer to MD5_ARGS_X8 structure
343 ; arg 2 : size (in blocks) ;; assumed to be >= 1
345 ; arg1 and r8 are maintained by this function
348 mk_global md5_mb_x4x2_sse, function, internal
353 ;; Initialize digests
359 ;; Initialize digests
; Second register set: A2..D2 come from the odd-numbered 16-byte slots of
; the digest area at arg1 (offsets 1,3,5,7 * 16).
; NOTE(review): with the digest_array[word][lane] layout described above
; and 8 lanes, these would be digest words A..D for lanes 4..7 -- confirm
; against md5_mb_mgr_datastruct.asm.
360 movdqu A2,[arg1+1*16]
361 movdqu B2,[arg1+3*16]
362 movdqu C2,[arg1+5*16]
363 movdqu D2,[arg1+7*16]
367 ;; load input pointers
; one 8-byte data pointer per lane, read from the _data_ptr array in *arg1
368 mov inp0,[arg1 + _data_ptr + 0*8]
369 mov inp1,[arg1 + _data_ptr + 1*8]
370 mov inp2,[arg1 + _data_ptr + 2*8]
371 mov inp3,[arg1 + _data_ptr + 3*8]
372 mov inp4,[arg1 + _data_ptr + 4*8]
373 mov inp5,[arg1 + _data_ptr + 5*8]
374 mov inp6,[arg1 + _data_ptr + 6*8]
375 mov inp7,[arg1 + _data_ptr + 7*8]
378 ; Make ping-pong pointers to the two memory blocks
; mem2 points at the second 16*16*2-byte block of the stack data area
380 lea mem2, [rsp + 16*16*2]
383 ;; Load first block of data and save back to stack
; For one value of I: read 16 bytes (four 32-bit words) from each of
; inp0..inp3 at offset IDX+I*16, transpose, and store the four resulting
; rows as data words I*4+0 .. I*4+3 of mem1.
; TRANSPOSE returns its rows in T0, T1, T2, T3 (t0, r1, r0, r3), which is
; why the registers are loaded in the order T2, T1, T4, T3.
386 movdqu T2,[inp0+IDX+I*16]
387 movdqu T1,[inp1+IDX+I*16]
388 movdqu T4,[inp2+IDX+I*16]
389 movdqu T3,[inp3+IDX+I*16]
390 TRANSPOSE T2, T1, T4, T3, T0, T5
391 movdqa [mem1+(I*4+0)*16],T0
392 movdqa [mem1+(I*4+1)*16],T1
393 movdqa [mem1+(I*4+2)*16],T2
394 movdqa [mem1+(I*4+3)*16],T3
; Same for inp4..inp7; the rows go 16*16 bytes higher, i.e. into the
; "lanes 7...4" half of the block in the memory map above.
396 movdqu T2,[inp4+IDX+I*16]
397 movdqu T1,[inp5+IDX+I*16]
398 movdqu T4,[inp6+IDX+I*16]
399 movdqu T3,[inp7+IDX+I*16]
400 TRANSPOSE T2, T1, T4, T3, T0, T5
401 movdqa [mem1+(I*4+0)*16 + 16*16],T0
402 movdqa [mem1+(I*4+1)*16 + 16*16],T1
403 movdqa [mem1+(I*4+2)*16 + 16*16],T2
404 movdqa [mem1+(I*4+3)*16 + 16*16],T3
; 64 MD5 steps on the block at mem1, using MD5_STEP1 (shared FUN/TMP).
; After every 8 steps, 16 more bytes per lane are loaded, transposed and
; stored into mem2 (alternating inp0..inp3 and inp4..inp7).
; NOTE(review): presumably this pre-loads the next input block while the
; current one is being hashed -- confirm against the loop control code.
; The working registers rotate roles each step:
; (A,B,C,D) -> (D,A,B,C) -> (C,D,A,B) -> (B,C,D,A).
;; Round 1 (MAGIC_F), steps 0-7: data words 0..7 in order
424 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
425 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
426 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
427 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
428 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
429 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
430 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
431 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
; transpose 16 bytes per lane from inp0..inp3 into mem2, words I*4+0..3
434 movdqu T2,[inp0+IDX+I*16]
435 movdqu T1,[inp1+IDX+I*16]
436 movdqu T4,[inp2+IDX+I*16]
437 movdqu T3,[inp3+IDX+I*16]
438 TRANSPOSE T2, T1, T4, T3, T0, T5
439 movdqa [mem2+(I*4+0)*16],T0
440 movdqa [mem2+(I*4+1)*16],T1
441 movdqa [mem2+(I*4+2)*16],T2
442 movdqa [mem2+(I*4+3)*16],T3
;; Round 1 (MAGIC_F), steps 8-15: data words 8..15 in order
444 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
445 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
446 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
447 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
448 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
449 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
450 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
451 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
; transpose 16 bytes per lane from inp4..inp7 into the upper half of mem2
454 movdqu T2,[inp4+IDX+I*16]
455 movdqu T1,[inp5+IDX+I*16]
456 movdqu T4,[inp6+IDX+I*16]
457 movdqu T3,[inp7+IDX+I*16]
458 TRANSPOSE T2, T1, T4, T3, T0, T5
459 movdqa [mem2+(I*4+0)*16 + 16*16],T0
460 movdqa [mem2+(I*4+1)*16 + 16*16],T1
461 movdqa [mem2+(I*4+2)*16 + 16*16],T2
462 movdqa [mem2+(I*4+3)*16 + 16*16],T3
;; Round 2 (MAGIC_G), steps 16-23: data words 1,6,11,0,5,10,15,4
465 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
466 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
467 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
468 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
469 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
470 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
471 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
472 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
; transpose 16 bytes per lane from inp0..inp3 into mem2, words I*4+0..3
474 movdqu T2,[inp0+IDX+I*16]
475 movdqu T1,[inp1+IDX+I*16]
476 movdqu T4,[inp2+IDX+I*16]
477 movdqu T3,[inp3+IDX+I*16]
478 TRANSPOSE T2, T1, T4, T3, T0, T5
479 movdqa [mem2+(I*4+0)*16],T0
480 movdqa [mem2+(I*4+1)*16],T1
481 movdqa [mem2+(I*4+2)*16],T2
482 movdqa [mem2+(I*4+3)*16],T3
;; Round 2 (MAGIC_G), steps 24-31: data words 9,14,3,8,13,2,7,12
484 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
485 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
486 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
487 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
488 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
489 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
490 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
491 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
; transpose 16 bytes per lane from inp4..inp7 into the upper half of mem2
493 movdqu T2,[inp4+IDX+I*16]
494 movdqu T1,[inp5+IDX+I*16]
495 movdqu T4,[inp6+IDX+I*16]
496 movdqu T3,[inp7+IDX+I*16]
497 TRANSPOSE T2, T1, T4, T3, T0, T5
498 movdqa [mem2+(I*4+0)*16 + 16*16],T0
499 movdqa [mem2+(I*4+1)*16 + 16*16],T1
500 movdqa [mem2+(I*4+2)*16 + 16*16],T2
501 movdqa [mem2+(I*4+3)*16 + 16*16],T3
;; Round 3 (MAGIC_H), steps 32-39: data words 5,8,11,14,1,4,7,10
504 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
505 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
506 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
507 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
508 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
509 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
510 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
511 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
; transpose 16 bytes per lane from inp0..inp3 into mem2, words I*4+0..3
513 movdqu T2,[inp0+IDX+I*16]
514 movdqu T1,[inp1+IDX+I*16]
515 movdqu T4,[inp2+IDX+I*16]
516 movdqu T3,[inp3+IDX+I*16]
517 TRANSPOSE T2, T1, T4, T3, T0, T5
518 movdqa [mem2+(I*4+0)*16],T0
519 movdqa [mem2+(I*4+1)*16],T1
520 movdqa [mem2+(I*4+2)*16],T2
521 movdqa [mem2+(I*4+3)*16],T3
;; Round 3 (MAGIC_H), steps 40-47: data words 13,0,3,6,9,12,15,2
523 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
524 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
525 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
526 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
527 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
528 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
529 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
530 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
; transpose 16 bytes per lane from inp4..inp7 into the upper half of mem2
532 movdqu T2,[inp4+IDX+I*16]
533 movdqu T1,[inp5+IDX+I*16]
534 movdqu T4,[inp6+IDX+I*16]
535 movdqu T3,[inp7+IDX+I*16]
536 TRANSPOSE T2, T1, T4, T3, T0, T5
537 movdqa [mem2+(I*4+0)*16 + 16*16],T0
538 movdqa [mem2+(I*4+1)*16 + 16*16],T1
539 movdqa [mem2+(I*4+2)*16 + 16*16],T2
540 movdqa [mem2+(I*4+3)*16 + 16*16],T3
;; Round 4 (MAGIC_I), steps 48-55: data words 0,7,14,5,12,3,10,1
543 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
544 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
545 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
546 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
547 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
548 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
549 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
550 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
; transpose 16 bytes per lane from inp0..inp3 into mem2, words I*4+0..3
552 movdqu T2,[inp0+IDX+I*16]
553 movdqu T1,[inp1+IDX+I*16]
554 movdqu T4,[inp2+IDX+I*16]
555 movdqu T3,[inp3+IDX+I*16]
556 TRANSPOSE T2, T1, T4, T3, T0, T5
557 movdqa [mem2+(I*4+0)*16],T0
558 movdqa [mem2+(I*4+1)*16],T1
559 movdqa [mem2+(I*4+2)*16],T2
560 movdqa [mem2+(I*4+3)*16],T3
;; Round 4 (MAGIC_I), steps 56-63: data words 8,15,6,13,4,11,2,9
562 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
563 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
564 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
565 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
566 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
567 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
568 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
569 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
; transpose 16 bytes per lane from inp4..inp7 into the upper half of mem2
571 movdqu T2,[inp4+IDX+I*16]
572 movdqu T1,[inp5+IDX+I*16]
573 movdqu T4,[inp6+IDX+I*16]
574 movdqu T3,[inp7+IDX+I*16]
575 TRANSPOSE T2, T1, T4, T3, T0, T5
576 movdqa [mem2+(I*4+0)*16 + 16*16],T0
577 movdqa [mem2+(I*4+1)*16 + 16*16],T1
578 movdqa [mem2+(I*4+2)*16 + 16*16],T2
579 movdqa [mem2+(I*4+3)*16 + 16*16],T3
; 64 MD5 steps on the block at mem1 with no interleaved input loads.
; Uses MD5_STEP, which gives each register set its own scratch pair
; (FUN/TMP and FUN2/TMP2). The data-word order, constants and rotations
; are the same as in the MD5_STEP1 sequence above.
; NOTE(review): presumably the path for the final block, where there is no
; further input to pre-load -- confirm against the loop control code.
;; Round 1 (MAGIC_F), steps 0-15: data words 0..15 in order
600 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
601 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
602 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
603 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
604 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
605 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
606 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
607 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
608 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
609 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
610 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
611 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
612 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
613 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
614 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
615 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
;; Round 2 (MAGIC_G), steps 16-31: data words 1,6,11,0,5,10,15,4,9,14,3,8,13,2,7,12
617 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
618 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
619 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
620 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
621 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
622 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
623 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
624 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
625 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
626 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
627 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
628 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
629 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
630 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
631 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
632 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
;; Round 3 (MAGIC_H), steps 32-47: data words 5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2
634 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
635 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
636 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
637 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
638 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
639 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
640 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
641 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
642 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
643 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
644 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
645 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
646 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
647 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
648 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
649 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
;; Round 4 (MAGIC_I), steps 48-63: data words 0,7,14,5,12,3,10,1,8,15,6,13,4,11,2,9
651 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
652 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
653 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
654 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
655 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
656 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
657 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
658 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
659 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
660 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
661 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
662 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
663 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
664 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
665 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
666 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
; Write the working state back to the digest area at arg1.
; A,B,C,D go to the even 16-byte slots (0,2,4,6) and A2,B2,C2,D2 to the
; odd slots (1,3,5,7) -- the same slots A2..D2 were loaded from at entry.
679 movdqu [arg1+0*16], A
680 movdqu [arg1+2*16], B
681 movdqu [arg1+4*16], C
682 movdqu [arg1+6*16], D
683 movdqu [arg1+1*16], A2
684 movdqu [arg1+3*16], B2
685 movdqu [arg1+5*16], C2
686 movdqu [arg1+7*16], D2
688 ;; update input pointers
; store the eight per-lane data pointers back into the _data_ptr array
697 mov [arg1 + _data_ptr + 0*8], inp0
698 mov [arg1 + _data_ptr + 1*8], inp1
699 mov [arg1 + _data_ptr + 2*8], inp2
700 mov [arg1 + _data_ptr + 3*8], inp3
701 mov [arg1 + _data_ptr + 4*8], inp4
702 mov [arg1 + _data_ptr + 5*8], inp5
703 mov [arg1 + _data_ptr + 6*8], inp6
704 mov [arg1 + _data_ptr + 7*8], inp7
; MD5 additive constants: 64 rows, one per MD5 step, in step order.
; Each row repeats one 32-bit constant four times (16 bytes), so a row can
; be added to all four dword lanes of an XMM register with a single paddd.
; The values match the T[1..64] table of RFC 1321.
; NOTE(review): presumably this is the table addressed as [TBL + i*16] by
; the MD5_STEP/MD5_STEP1 invocations -- confirm where TBL is set up.
712 section .data align=64
;; Round 1 constants (steps 0-15)
716 dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
717 dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
718 dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
719 dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
720 dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
721 dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
722 dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
723 dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
724 dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
725 dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
726 dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
727 dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
728 dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
729 dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
730 dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
731 dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
;; Round 2 constants (steps 16-31)
732 dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
733 dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
734 dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
735 dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
736 dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
737 dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
738 dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
739 dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
740 dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
741 dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
742 dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
743 dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
744 dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
745 dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
746 dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
747 dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
;; Round 3 constants (steps 32-47)
748 dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
749 dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
750 dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
751 dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
752 dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
753 dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
754 dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
755 dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
756 dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
757 dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
758 dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
759 dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
760 dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
761 dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
762 dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
763 dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
;; Round 4 constants (steps 48-63)
764 dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
765 dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
766 dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
767 dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
768 dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
769 dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
770 dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
771 dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
772 dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
773 dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
774 dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
775 dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
776 dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
777 dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
778 dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
779 dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391