2 * ====================================================================
3 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4 * project. Rights for redistribution and usage in source and binary
5 * forms are granted according to the OpenSSL license.
6 * ====================================================================
8 * sha256/512_block procedure for x86_64.
10 * 40% improvement over compiler-generated code on Opteron. On EM64T
11 * sha256 was observed to run >80% faster and sha512 - >40%. No magical
12 * tricks, just straight implementation... I really wonder why gcc
13 * [being armed with inline assembler] fails to generate as fast code.
14 * The only thing which is cool about this module is that it's the
15 * very same instruction sequence used for both SHA-256 and SHA-512. In
16 * former case the instructions operate on 32-bit operands, while in
17 * latter - on 64-bit ones. All I had to do is to get one flavor right,
18 * the other one passed the test right away:-)
20 * sha256_block runs in ~1005 cycles on Opteron, which gives you
21 * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
22 * frequency in GHz. sha512_block runs in ~1275 cycles, which results
23 * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
24 * Well, if you compare it to IA-64 implementation, which maintains
25 * X[16] in register bank[!], tends to 4 instructions per CPU clock
26 * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
27 * issue Opteron pipeline and X[16] maintained in memory. So that *if*
28 * there is a way to improve it, *then* the only way would be to try to
29 * offload X[16] updates to SSE unit, but that would require "deeper"
30 * loop unroll, which in turn would naturally cause size blow-up, not
31 * to mention increased complexity! And once again, only *if* it's
32 * actually possible to noticeably improve overall ILP, instruction
33 * level parallelism, on a given CPU implementation in this case.
35 * Special note on Intel EM64T. While Opteron CPU exhibits perfect
36 * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
37 * [currently available] EM64T CPUs apparently are far from it. On the
38 * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
39 * sha256_block:-( This is presumably because 64-bit shifts/rotates
40 * apparently are not atomic instructions, but implemented in microcode.
44 * OpenSolaris OS modifications
46 * Sun elects to use this software under the BSD license.
48 * This source originates from OpenSSL file sha512-x86_64.pl at
49 * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
50 * (presumably for future OpenSSL release 0.9.8h), with these changes:
52 * 1. Added perl "use strict" and declared variables.
54 * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
55 * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
57 * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
58 * assemblers). Replaced the .picmeup macro with assembler code.
60 * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
61 * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
65 * This file was generated by a perl script (sha512-x86_64.pl) that was
66 * used to generate sha256 and sha512 variants from the same code base.
67 * The comments from the original file have been pasted above.
70 #if defined(lint) || defined(__lint)
71 #include <sys/stdint.h>
72 #include <sha2/sha2.h>
76 SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
83 #include <sys/asm_linkage.h>
# void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
#   SysV AMD64: %rdi = ctx, %rsi = in (64-byte input blocks), %rdx = num.
#   Prologue: saves caller %rsp in %rbp, aligns the stack frame to 64,
#   spills the three arguments above the 16-word X[] scratch area at
#   16*4(%rsp), and loads %rbp with the PIC address of the K256 table.
#   NOTE(review): this listing is a sampled excerpt -- instructions
#   between the embedded line numbers are elided.
85 ENTRY_NP(SHA256TransformBlocks)
92 mov %rsp,%rbp # copy %rsp
# NOTE(review): per the comment below, the end-pointer math relies on a
# preceding "shl $4,%rdx" (num*16) elided from this excerpt -- confirm
# against the full sha512-x86_64.pl output.
95 lea (%rsi,%rdx,4),%rdx # inp+num*16*4
96 and $-64,%rsp # align stack frame
97 add $8,%rdi # Skip OpenSolaris field, "algotype"
98 mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg
99 mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg
100 mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg
101 mov %rbp,16*4+3*8(%rsp) # save copy of %rsp
104 / The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
105 / the address of the "next" instruction into the target register
106 / (%rbp). This generates these 2 instructions:
108 /nop / .picmeup generates a nop for mod 8 alignment--not needed here
111 lea K256-.(%rbp),%rbp # %rbp = PIC base of K256 constant table
# Unrolled SHA-256 rounds 0..15 (sampled excerpt; loads of X[i] and the
# rotate sequences computing Sigma0/Sigma1 are elided between the
# embedded line numbers). Each round computes, per FIPS 180-4:
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[round] + X[i]
#   T2 = Sigma0(a) + Maj(a,b,c);  d += T1;  h = T1 + T2
# with the working variables a..h rotated through eax,ebx,ecx,edx,
# r8d..r11d each round instead of being moved; %r12d holds T1, %r13d/
# %r14d/%r15d are scratch, and %rdi is the round counter indexing K256
# via (%rbp,%rdi,4).
134 xor %r10d,%r15d # f^g
138 and %r8d,%r15d # (f^g)&e
141 xor %r14d,%r13d # Sigma1(e)
142 xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
143 add %r11d,%r12d # T1+=h
146 add %r13d,%r12d # T1+=Sigma1(e)
148 add %r15d,%r12d # T1+=Ch(e,f,g)
155 add (%rbp,%rdi,4),%r12d # T1+=K[round]
161 xor %r13d,%r11d # h=Sigma0(a)
163 add %r12d,%edx # d+=T1
165 and %ebx,%r14d # (a|c)&b
166 add %r12d,%r11d # h+=T1
168 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
169 lea 1(%rdi),%rdi # round++
171 add %r14d,%r11d # h+=Maj(a,b,c)
184 and %edx,%r15d # (f^g)&e
187 xor %r14d,%r13d # Sigma1(e)
188 xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
189 add %r10d,%r12d # T1+=h
192 add %r13d,%r12d # T1+=Sigma1(e)
194 add %r15d,%r12d # T1+=Ch(e,f,g)
201 add (%rbp,%rdi,4),%r12d # T1+=K[round]
207 xor %r13d,%r10d # h=Sigma0(a)
209 add %r12d,%ecx # d+=T1
211 and %eax,%r14d # (a|c)&b
212 add %r12d,%r10d # h+=T1
214 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
215 lea 1(%rdi),%rdi # round++
217 add %r14d,%r10d # h+=Maj(a,b,c)
230 and %ecx,%r15d # (f^g)&e
233 xor %r14d,%r13d # Sigma1(e)
234 xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
235 add %r9d,%r12d # T1+=h
238 add %r13d,%r12d # T1+=Sigma1(e)
240 add %r15d,%r12d # T1+=Ch(e,f,g)
247 add (%rbp,%rdi,4),%r12d # T1+=K[round]
253 xor %r13d,%r9d # h=Sigma0(a)
255 add %r12d,%ebx # d+=T1
257 and %r11d,%r14d # (a|c)&b
258 add %r12d,%r9d # h+=T1
260 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
261 lea 1(%rdi),%rdi # round++
263 add %r14d,%r9d # h+=Maj(a,b,c)
276 and %ebx,%r15d # (f^g)&e
279 xor %r14d,%r13d # Sigma1(e)
280 xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
281 add %r8d,%r12d # T1+=h
284 add %r13d,%r12d # T1+=Sigma1(e)
286 add %r15d,%r12d # T1+=Ch(e,f,g)
293 add (%rbp,%rdi,4),%r12d # T1+=K[round]
299 xor %r13d,%r8d # h=Sigma0(a)
300 and %r11d,%r15d # a&c
301 add %r12d,%eax # d+=T1
303 and %r10d,%r14d # (a|c)&b
304 add %r12d,%r8d # h+=T1
306 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
307 lea 1(%rdi),%rdi # round++
309 add %r14d,%r8d # h+=Maj(a,b,c)
322 and %eax,%r15d # (f^g)&e
325 xor %r14d,%r13d # Sigma1(e)
326 xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
327 add %edx,%r12d # T1+=h
330 add %r13d,%r12d # T1+=Sigma1(e)
332 add %r15d,%r12d # T1+=Ch(e,f,g)
339 add (%rbp,%rdi,4),%r12d # T1+=K[round]
345 xor %r13d,%edx # h=Sigma0(a)
346 and %r10d,%r15d # a&c
347 add %r12d,%r11d # d+=T1
349 and %r9d,%r14d # (a|c)&b
350 add %r12d,%edx # h+=T1
352 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
353 lea 1(%rdi),%rdi # round++
355 add %r14d,%edx # h+=Maj(a,b,c)
368 and %r11d,%r15d # (f^g)&e
371 xor %r14d,%r13d # Sigma1(e)
372 xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
373 add %ecx,%r12d # T1+=h
376 add %r13d,%r12d # T1+=Sigma1(e)
378 add %r15d,%r12d # T1+=Ch(e,f,g)
385 add (%rbp,%rdi,4),%r12d # T1+=K[round]
391 xor %r13d,%ecx # h=Sigma0(a)
393 add %r12d,%r10d # d+=T1
395 and %r8d,%r14d # (a|c)&b
396 add %r12d,%ecx # h+=T1
398 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
399 lea 1(%rdi),%rdi # round++
401 add %r14d,%ecx # h+=Maj(a,b,c)
414 and %r10d,%r15d # (f^g)&e
417 xor %r14d,%r13d # Sigma1(e)
418 xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
419 add %ebx,%r12d # T1+=h
422 add %r13d,%r12d # T1+=Sigma1(e)
424 add %r15d,%r12d # T1+=Ch(e,f,g)
431 add (%rbp,%rdi,4),%r12d # T1+=K[round]
437 xor %r13d,%ebx # h=Sigma0(a)
439 add %r12d,%r9d # d+=T1
441 and %edx,%r14d # (a|c)&b
442 add %r12d,%ebx # h+=T1
444 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
445 lea 1(%rdi),%rdi # round++
447 add %r14d,%ebx # h+=Maj(a,b,c)
456 xor %r11d,%r15d # f^g
460 and %r9d,%r15d # (f^g)&e
463 xor %r14d,%r13d # Sigma1(e)
464 xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
465 add %eax,%r12d # T1+=h
468 add %r13d,%r12d # T1+=Sigma1(e)
470 add %r15d,%r12d # T1+=Ch(e,f,g)
477 add (%rbp,%rdi,4),%r12d # T1+=K[round]
483 xor %r13d,%eax # h=Sigma0(a)
485 add %r12d,%r8d # d+=T1
487 and %ecx,%r14d # (a|c)&b
488 add %r12d,%eax # h+=T1
490 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
491 lea 1(%rdi),%rdi # round++
493 add %r14d,%eax # h+=Maj(a,b,c)
502 xor %r10d,%r15d # f^g
506 and %r8d,%r15d # (f^g)&e
509 xor %r14d,%r13d # Sigma1(e)
510 xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
511 add %r11d,%r12d # T1+=h
514 add %r13d,%r12d # T1+=Sigma1(e)
516 add %r15d,%r12d # T1+=Ch(e,f,g)
523 add (%rbp,%rdi,4),%r12d # T1+=K[round]
529 xor %r13d,%r11d # h=Sigma0(a)
531 add %r12d,%edx # d+=T1
533 and %ebx,%r14d # (a|c)&b
534 add %r12d,%r11d # h+=T1
536 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
537 lea 1(%rdi),%rdi # round++
539 add %r14d,%r11d # h+=Maj(a,b,c)
552 and %edx,%r15d # (f^g)&e
555 xor %r14d,%r13d # Sigma1(e)
556 xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
557 add %r10d,%r12d # T1+=h
560 add %r13d,%r12d # T1+=Sigma1(e)
562 add %r15d,%r12d # T1+=Ch(e,f,g)
569 add (%rbp,%rdi,4),%r12d # T1+=K[round]
575 xor %r13d,%r10d # h=Sigma0(a)
577 add %r12d,%ecx # d+=T1
579 and %eax,%r14d # (a|c)&b
580 add %r12d,%r10d # h+=T1
582 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
583 lea 1(%rdi),%rdi # round++
585 add %r14d,%r10d # h+=Maj(a,b,c)
598 and %ecx,%r15d # (f^g)&e
601 xor %r14d,%r13d # Sigma1(e)
602 xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
603 add %r9d,%r12d # T1+=h
606 add %r13d,%r12d # T1+=Sigma1(e)
608 add %r15d,%r12d # T1+=Ch(e,f,g)
615 add (%rbp,%rdi,4),%r12d # T1+=K[round]
621 xor %r13d,%r9d # h=Sigma0(a)
623 add %r12d,%ebx # d+=T1
625 and %r11d,%r14d # (a|c)&b
626 add %r12d,%r9d # h+=T1
628 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
629 lea 1(%rdi),%rdi # round++
631 add %r14d,%r9d # h+=Maj(a,b,c)
644 and %ebx,%r15d # (f^g)&e
647 xor %r14d,%r13d # Sigma1(e)
648 xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
649 add %r8d,%r12d # T1+=h
652 add %r13d,%r12d # T1+=Sigma1(e)
654 add %r15d,%r12d # T1+=Ch(e,f,g)
661 add (%rbp,%rdi,4),%r12d # T1+=K[round]
667 xor %r13d,%r8d # h=Sigma0(a)
668 and %r11d,%r15d # a&c
669 add %r12d,%eax # d+=T1
671 and %r10d,%r14d # (a|c)&b
672 add %r12d,%r8d # h+=T1
674 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
675 lea 1(%rdi),%rdi # round++
677 add %r14d,%r8d # h+=Maj(a,b,c)
690 and %eax,%r15d # (f^g)&e
693 xor %r14d,%r13d # Sigma1(e)
694 xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
695 add %edx,%r12d # T1+=h
698 add %r13d,%r12d # T1+=Sigma1(e)
700 add %r15d,%r12d # T1+=Ch(e,f,g)
707 add (%rbp,%rdi,4),%r12d # T1+=K[round]
713 xor %r13d,%edx # h=Sigma0(a)
714 and %r10d,%r15d # a&c
715 add %r12d,%r11d # d+=T1
717 and %r9d,%r14d # (a|c)&b
718 add %r12d,%edx # h+=T1
720 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
721 lea 1(%rdi),%rdi # round++
723 add %r14d,%edx # h+=Maj(a,b,c)
736 and %r11d,%r15d # (f^g)&e
739 xor %r14d,%r13d # Sigma1(e)
740 xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
741 add %ecx,%r12d # T1+=h
744 add %r13d,%r12d # T1+=Sigma1(e)
746 add %r15d,%r12d # T1+=Ch(e,f,g)
753 add (%rbp,%rdi,4),%r12d # T1+=K[round]
759 xor %r13d,%ecx # h=Sigma0(a)
761 add %r12d,%r10d # d+=T1
763 and %r8d,%r14d # (a|c)&b
764 add %r12d,%ecx # h+=T1
766 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
767 lea 1(%rdi),%rdi # round++
769 add %r14d,%ecx # h+=Maj(a,b,c)
782 and %r10d,%r15d # (f^g)&e
785 xor %r14d,%r13d # Sigma1(e)
786 xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
787 add %ebx,%r12d # T1+=h
790 add %r13d,%r12d # T1+=Sigma1(e)
792 add %r15d,%r12d # T1+=Ch(e,f,g)
799 add (%rbp,%rdi,4),%r12d # T1+=K[round]
805 xor %r13d,%ebx # h=Sigma0(a)
807 add %r12d,%r9d # d+=T1
809 and %edx,%r14d # (a|c)&b
810 add %r12d,%ebx # h+=T1
812 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
813 lea 1(%rdi),%rdi # round++
815 add %r14d,%ebx # h+=Maj(a,b,c)
824 xor %r11d,%r15d # f^g
828 and %r9d,%r15d # (f^g)&e
831 xor %r14d,%r13d # Sigma1(e)
832 xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
833 add %eax,%r12d # T1+=h
836 add %r13d,%r12d # T1+=Sigma1(e)
838 add %r15d,%r12d # T1+=Ch(e,f,g)
845 add (%rbp,%rdi,4),%r12d # T1+=K[round]
851 xor %r13d,%eax # h=Sigma0(a)
853 add %r12d,%r8d # d+=T1
855 and %ecx,%r14d # (a|c)&b
856 add %r12d,%eax # h+=T1
858 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
859 lea 1(%rdi),%rdi # round++
861 add %r14d,%eax # h+=Maj(a,b,c)
# Unrolled SHA-256 rounds 16..63 (sampled excerpt). Same round function
# as above, but each round first extends the message schedule in the
# stack-resident X[16] ring buffer:
#   X[i&0xf] += sigma0(X[(i+1)&0xf]) + sigma1(X[(i+14)&0xf]) + X[(i+9)&0xf]
# (the loads, rotates, and X[] store of that update are elided between
# the embedded line numbers; only the final sigma0/sigma1 xors survive).
876 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
885 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
898 xor %r10d,%r15d # f^g
902 and %r8d,%r15d # (f^g)&e
905 xor %r14d,%r13d # Sigma1(e)
906 xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
907 add %r11d,%r12d # T1+=h
910 add %r13d,%r12d # T1+=Sigma1(e)
912 add %r15d,%r12d # T1+=Ch(e,f,g)
919 add (%rbp,%rdi,4),%r12d # T1+=K[round]
925 xor %r13d,%r11d # h=Sigma0(a)
927 add %r12d,%edx # d+=T1
929 and %ebx,%r14d # (a|c)&b
930 add %r12d,%r11d # h+=T1
932 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
933 lea 1(%rdi),%rdi # round++
935 add %r14d,%r11d # h+=Maj(a,b,c)
947 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
956 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
973 and %edx,%r15d # (f^g)&e
976 xor %r14d,%r13d # Sigma1(e)
977 xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
978 add %r10d,%r12d # T1+=h
981 add %r13d,%r12d # T1+=Sigma1(e)
983 add %r15d,%r12d # T1+=Ch(e,f,g)
990 add (%rbp,%rdi,4),%r12d # T1+=K[round]
996 xor %r13d,%r10d # h=Sigma0(a)
998 add %r12d,%ecx # d+=T1
1000 and %eax,%r14d # (a|c)&b
1001 add %r12d,%r10d # h+=T1
1003 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1004 lea 1(%rdi),%rdi # round++
1006 add %r14d,%r10d # h+=Maj(a,b,c)
1018 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1027 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1040 xor %r8d,%r15d # f^g
1044 and %ecx,%r15d # (f^g)&e
1047 xor %r14d,%r13d # Sigma1(e)
1048 xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1049 add %r9d,%r12d # T1+=h
1052 add %r13d,%r12d # T1+=Sigma1(e)
1054 add %r15d,%r12d # T1+=Ch(e,f,g)
1061 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1067 xor %r13d,%r9d # h=Sigma0(a)
1068 and %eax,%r15d # a&c
1069 add %r12d,%ebx # d+=T1
1071 and %r11d,%r14d # (a|c)&b
1072 add %r12d,%r9d # h+=T1
1074 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1075 lea 1(%rdi),%rdi # round++
1077 add %r14d,%r9d # h+=Maj(a,b,c)
1089 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1098 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1111 xor %edx,%r15d # f^g
1115 and %ebx,%r15d # (f^g)&e
1118 xor %r14d,%r13d # Sigma1(e)
1119 xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1120 add %r8d,%r12d # T1+=h
1123 add %r13d,%r12d # T1+=Sigma1(e)
1125 add %r15d,%r12d # T1+=Ch(e,f,g)
1132 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1136 or %r11d,%r14d # a|c
1138 xor %r13d,%r8d # h=Sigma0(a)
1139 and %r11d,%r15d # a&c
1140 add %r12d,%eax # d+=T1
1142 and %r10d,%r14d # (a|c)&b
1143 add %r12d,%r8d # h+=T1
1145 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1146 lea 1(%rdi),%rdi # round++
1148 add %r14d,%r8d # h+=Maj(a,b,c)
1160 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1169 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1182 xor %ecx,%r15d # f^g
1186 and %eax,%r15d # (f^g)&e
1189 xor %r14d,%r13d # Sigma1(e)
1190 xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1191 add %edx,%r12d # T1+=h
1194 add %r13d,%r12d # T1+=Sigma1(e)
1196 add %r15d,%r12d # T1+=Ch(e,f,g)
1203 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1207 or %r10d,%r14d # a|c
1209 xor %r13d,%edx # h=Sigma0(a)
1210 and %r10d,%r15d # a&c
1211 add %r12d,%r11d # d+=T1
1213 and %r9d,%r14d # (a|c)&b
1214 add %r12d,%edx # h+=T1
1216 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1217 lea 1(%rdi),%rdi # round++
1219 add %r14d,%edx # h+=Maj(a,b,c)
1231 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1240 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1253 xor %ebx,%r15d # f^g
1257 and %r11d,%r15d # (f^g)&e
1260 xor %r14d,%r13d # Sigma1(e)
1261 xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1262 add %ecx,%r12d # T1+=h
1265 add %r13d,%r12d # T1+=Sigma1(e)
1267 add %r15d,%r12d # T1+=Ch(e,f,g)
1274 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1280 xor %r13d,%ecx # h=Sigma0(a)
1281 and %r9d,%r15d # a&c
1282 add %r12d,%r10d # d+=T1
1284 and %r8d,%r14d # (a|c)&b
1285 add %r12d,%ecx # h+=T1
1287 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1288 lea 1(%rdi),%rdi # round++
1290 add %r14d,%ecx # h+=Maj(a,b,c)
1302 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1311 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1324 xor %eax,%r15d # f^g
1328 and %r10d,%r15d # (f^g)&e
1331 xor %r14d,%r13d # Sigma1(e)
1332 xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
1333 add %ebx,%r12d # T1+=h
1336 add %r13d,%r12d # T1+=Sigma1(e)
1338 add %r15d,%r12d # T1+=Ch(e,f,g)
1345 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1351 xor %r13d,%ebx # h=Sigma0(a)
1352 and %r8d,%r15d # a&c
1353 add %r12d,%r9d # d+=T1
1355 and %edx,%r14d # (a|c)&b
1356 add %r12d,%ebx # h+=T1
1358 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1359 lea 1(%rdi),%rdi # round++
1361 add %r14d,%ebx # h+=Maj(a,b,c)
1373 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1382 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1395 xor %r11d,%r15d # f^g
1399 and %r9d,%r15d # (f^g)&e
1402 xor %r14d,%r13d # Sigma1(e)
1403 xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1404 add %eax,%r12d # T1+=h
1407 add %r13d,%r12d # T1+=Sigma1(e)
1409 add %r15d,%r12d # T1+=Ch(e,f,g)
1416 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1422 xor %r13d,%eax # h=Sigma0(a)
1423 and %edx,%r15d # a&c
1424 add %r12d,%r8d # d+=T1
1426 and %ecx,%r14d # (a|c)&b
1427 add %r12d,%eax # h+=T1
1429 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1430 lea 1(%rdi),%rdi # round++
1432 add %r14d,%eax # h+=Maj(a,b,c)
1444 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1453 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1466 xor %r10d,%r15d # f^g
1470 and %r8d,%r15d # (f^g)&e
1473 xor %r14d,%r13d # Sigma1(e)
1474 xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1475 add %r11d,%r12d # T1+=h
1478 add %r13d,%r12d # T1+=Sigma1(e)
1480 add %r15d,%r12d # T1+=Ch(e,f,g)
1487 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1493 xor %r13d,%r11d # h=Sigma0(a)
1494 and %ecx,%r15d # a&c
1495 add %r12d,%edx # d+=T1
1497 and %ebx,%r14d # (a|c)&b
1498 add %r12d,%r11d # h+=T1
1500 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1501 lea 1(%rdi),%rdi # round++
1503 add %r14d,%r11d # h+=Maj(a,b,c)
1515 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1524 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1537 xor %r9d,%r15d # f^g
1541 and %edx,%r15d # (f^g)&e
1544 xor %r14d,%r13d # Sigma1(e)
1545 xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1546 add %r10d,%r12d # T1+=h
1549 add %r13d,%r12d # T1+=Sigma1(e)
1551 add %r15d,%r12d # T1+=Ch(e,f,g)
1558 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1564 xor %r13d,%r10d # h=Sigma0(a)
1565 and %ebx,%r15d # a&c
1566 add %r12d,%ecx # d+=T1
1568 and %eax,%r14d # (a|c)&b
1569 add %r12d,%r10d # h+=T1
1571 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1572 lea 1(%rdi),%rdi # round++
1574 add %r14d,%r10d # h+=Maj(a,b,c)
1586 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1595 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1608 xor %r8d,%r15d # f^g
1612 and %ecx,%r15d # (f^g)&e
1615 xor %r14d,%r13d # Sigma1(e)
1616 xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1617 add %r9d,%r12d # T1+=h
1620 add %r13d,%r12d # T1+=Sigma1(e)
1622 add %r15d,%r12d # T1+=Ch(e,f,g)
1629 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1635 xor %r13d,%r9d # h=Sigma0(a)
1636 and %eax,%r15d # a&c
1637 add %r12d,%ebx # d+=T1
1639 and %r11d,%r14d # (a|c)&b
1640 add %r12d,%r9d # h+=T1
1642 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1643 lea 1(%rdi),%rdi # round++
1645 add %r14d,%r9d # h+=Maj(a,b,c)
1657 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1666 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1679 xor %edx,%r15d # f^g
1683 and %ebx,%r15d # (f^g)&e
1686 xor %r14d,%r13d # Sigma1(e)
1687 xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1688 add %r8d,%r12d # T1+=h
1691 add %r13d,%r12d # T1+=Sigma1(e)
1693 add %r15d,%r12d # T1+=Ch(e,f,g)
1700 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1704 or %r11d,%r14d # a|c
1706 xor %r13d,%r8d # h=Sigma0(a)
1707 and %r11d,%r15d # a&c
1708 add %r12d,%eax # d+=T1
1710 and %r10d,%r14d # (a|c)&b
1711 add %r12d,%r8d # h+=T1
1713 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1714 lea 1(%rdi),%rdi # round++
1716 add %r14d,%r8d # h+=Maj(a,b,c)
1728 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1737 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1750 xor %ecx,%r15d # f^g
1754 and %eax,%r15d # (f^g)&e
1757 xor %r14d,%r13d # Sigma1(e)
1758 xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1759 add %edx,%r12d # T1+=h
1762 add %r13d,%r12d # T1+=Sigma1(e)
1764 add %r15d,%r12d # T1+=Ch(e,f,g)
1771 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1775 or %r10d,%r14d # a|c
1777 xor %r13d,%edx # h=Sigma0(a)
1778 and %r10d,%r15d # a&c
1779 add %r12d,%r11d # d+=T1
1781 and %r9d,%r14d # (a|c)&b
1782 add %r12d,%edx # h+=T1
1784 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1785 lea 1(%rdi),%rdi # round++
1787 add %r14d,%edx # h+=Maj(a,b,c)
1799 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1808 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1821 xor %ebx,%r15d # f^g
1825 and %r11d,%r15d # (f^g)&e
1828 xor %r14d,%r13d # Sigma1(e)
1829 xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1830 add %ecx,%r12d # T1+=h
1833 add %r13d,%r12d # T1+=Sigma1(e)
1835 add %r15d,%r12d # T1+=Ch(e,f,g)
1842 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1848 xor %r13d,%ecx # h=Sigma0(a)
1849 and %r9d,%r15d # a&c
1850 add %r12d,%r10d # d+=T1
1852 and %r8d,%r14d # (a|c)&b
1853 add %r12d,%ecx # h+=T1
1855 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1856 lea 1(%rdi),%rdi # round++
1858 add %r14d,%ecx # h+=Maj(a,b,c)
1870 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1879 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1892 xor %eax,%r15d # f^g
1896 and %r10d,%r15d # (f^g)&e
1899 xor %r14d,%r13d # Sigma1(e)
1900 xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
1901 add %ebx,%r12d # T1+=h
1904 add %r13d,%r12d # T1+=Sigma1(e)
1906 add %r15d,%r12d # T1+=Ch(e,f,g)
1913 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1919 xor %r13d,%ebx # h=Sigma0(a)
1920 and %r8d,%r15d # a&c
1921 add %r12d,%r9d # d+=T1
1923 and %edx,%r14d # (a|c)&b
1924 add %r12d,%ebx # h+=T1
1926 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1927 lea 1(%rdi),%rdi # round++
1929 add %r14d,%ebx # h+=Maj(a,b,c)
1941 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1950 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1963 xor %r11d,%r15d # f^g
1967 and %r9d,%r15d # (f^g)&e
1970 xor %r14d,%r13d # Sigma1(e)
1971 xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1972 add %eax,%r12d # T1+=h
1975 add %r13d,%r12d # T1+=Sigma1(e)
1977 add %r15d,%r12d # T1+=Ch(e,f,g)
1984 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1990 xor %r13d,%eax # h=Sigma0(a)
1991 and %edx,%r15d # a&c
1992 add %r12d,%r8d # d+=T1
1994 and %ecx,%r14d # (a|c)&b
1995 add %r12d,%eax # h+=T1
1997 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1998 lea 1(%rdi),%rdi # round++
2000 add %r14d,%eax # h+=Maj(a,b,c)
# Epilogue (sampled excerpt): reload the saved ctx pointer, test the
# per-64-byte-block loop condition, and restore the caller's stack
# pointer. The state store-back into ctx, the branch back to the round
# loop, and the final ret are elided between the embedded line numbers.
2004 mov 16*4+0*8(%rsp),%rdi # reload ctx pointer saved in prologue
2016 cmp 16*4+2*8(%rsp),%rsi # done? compare inp against saved end pointer
2028 mov 16*4+3*8(%rsp),%rsp # restore caller's %rsp saved in prologue
2037 SET_SIZE(SHA256TransformBlocks)
# K256: the 64 SHA-256 round constants K[0..63] from FIPS 180-4 sec.
# 4.2.2 (first 32 bits of the fractional parts of the cube roots of the
# first 64 primes). Indexed by the round code above via (%rbp,%rdi,4),
# with %rbp set to this table's PIC address in the prologue.
2042 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
2043 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
2044 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
2045 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
2046 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
2047 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
2048 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
2049 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
2050 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
2051 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
2052 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
2053 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
2054 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
2055 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
2056 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
2057 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
2058 #endif /* !lint && !__lint */
2061 .section .note.GNU-stack,"",%progbits