2 * ====================================================================
3 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4 * project. Rights for redistribution and usage in source and binary
5 * forms are granted according to the OpenSSL license.
6 * ====================================================================
8 * sha256/512_block procedure for x86_64.
10 * 40% improvement over compiler-generated code on Opteron. On EM64T
11 * sha256 was observed to run >80% faster and sha512 - >40%. No magical
12 * tricks, just straight implementation... I really wonder why gcc
13 * [being armed with inline assembler] fails to generate as fast code.
14 * The only thing which is cool about this module is that it's very
15 * same instruction sequence used for both SHA-256 and SHA-512. In
16 * former case the instructions operate on 32-bit operands, while in
17 * latter - on 64-bit ones. All I had to do is to get one flavor right,
18 * the other one passed the test right away:-)
20 * sha256_block runs in ~1005 cycles on Opteron, which gives you
21 * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
22 * frequency in GHz. sha512_block runs in ~1275 cycles, which results
23 * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
24 * Well, if you compare it to IA-64 implementation, which maintains
25 * X[16] in register bank[!], tends to 4 instructions per CPU clock
26 * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
27 * issue Opteron pipeline and X[16] maintained in memory. So that *if*
28 * there is a way to improve it, *then* the only way would be to try to
29 * offload X[16] updates to SSE unit, but that would require "deeper"
30 * loop unroll, which in turn would naturally cause size blow-up, not
31 * to mention increased complexity! And once again, only *if* it's
32 * actually possible to noticeably improve overall ILP, instruction
33 * level parallelism, on a given CPU implementation in this case.
35 * Special note on Intel EM64T. While Opteron CPU exhibits perfect
36 * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
37 * [currently available] EM64T CPUs apparently are far from it. On the
38 * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
39 * sha256_block:-( This is presumably because 64-bit shifts/rotates
40 * apparently are not atomic instructions, but implemented in microcode.
44 * OpenSolaris OS modifications
46 * Sun elects to use this software under the BSD license.
48 * This source originates from OpenSSL file sha512-x86_64.pl at
49 * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
50 * (presumably for future OpenSSL release 0.9.8h), with these changes:
52 * 1. Added perl "use strict" and declared variables.
54 * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
55 * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
57 * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
58 * assemblers). Replaced the .picmeup macro with assembler code.
60 * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
61 * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
65 * This file was generated by a perl script (sha512-x86_64.pl) that was
66 * used to generate sha256 and sha512 variants from the same code base.
67 * The comments from the original file have been pasted above.
71 #if defined(lint) || defined(__lint)
72 #include <sys/stdint.h>
73 #include <sha2/sha2.h>
76 SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
78 (void) ctx, (void) in, (void) num;
84 #include <sys/asm_linkage.h>
#-----------------------------------------------------------------------
# SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
# ABI:   SysV AMD64 — %rdi = ctx, %rsi = in, %rdx = num (number of
#        128-byte input blocks; end pointer = in + num*16*8).
# State: the eight SHA-512 working variables a..h start in
#        %rax,%rbx,%rcx,%rdx,%r8,%r9,%r10,%r11 and their roles rotate
#        by one register each unrolled round.  Scratch: %r12 = T1,
#        %r13/%r14 = Sigma/sigma temporaries, %r15 = Ch/Maj temporary,
#        %rdi = round counter, %rbp = &K512 constant table.
# Frame: X[16] message schedule occupies 0..15*8(%rsp); saved ctx,
#        inp, end pointer and caller %rsp live at 16*8+{0,1,2,3}*8(%rsp).
# NOTE(review): this numbered listing is an excerpt — instructions
#        between the shown line numbers (loads/rotates, callee-saved
#        pushes/pops, loop labels) are omitted; confirm against the
#        full source before editing.
#-----------------------------------------------------------------------
86 ENTRY_NP(SHA512TransformBlocks)
# Unwind info: the CFA is tracked via %rax across the prologue
# (the prologue push instructions themselves are not shown here).
89 .cfi_def_cfa_register %rax
102 mov %rsp,%rbp # copy %rsp
105 lea (%rsi,%rdx,8),%rdx # inp+num*16*8
106 and $-64,%rsp # align stack frame
107 add $8,%rdi # Skip OpenSolaris field, "algotype"
108 mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg
109 mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg
110 mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg
111 mov %rbp,16*8+3*8(%rsp) # save copy of %rsp
112 # echo ".cfi_cfa_expression %rsp+152,deref,+56" |
113 # openssl/crypto/perlasm/x86_64-xlate.pl
114 .cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x38
117 # The .picmeup pseudo-directive, from perlasm/x86_64-xlate.pl, puts
118 # the address of the "next" instruction into the target register
119 # (%rbp). This generates these 2 instructions:
121 #nop # .picmeup generates a nop for mod 8 alignment--not needed here
# %rbp now holds the address of K512 (computed PC-relative, so the
# code stays position-independent).
124 lea K512-.(%rbp),%rbp
# ---- Rounds 0..15 (unrolled): message words X[t] come straight from
# the input block (the loads/byte-swaps are among the omitted lines).
# Each round computes
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[round] + X[t];  d += T1;
#   h  = T1 + Sigma0(a) + Maj(a,b,c)
# with the a..h register roles rotating by one for the next round.
151 and %r8,%r15 # (f^g)&e
154 xor %r14,%r13 # Sigma1(e)
155 xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
156 add %r11,%r12 # T1+=h
159 add %r13,%r12 # T1+=Sigma1(e)
161 add %r15,%r12 # T1+=Ch(e,f,g)
168 add (%rbp,%rdi,8),%r12 # T1+=K[round]
174 xor %r13,%r11 # h=Sigma0(a)
176 add %r12,%rdx # d+=T1
178 and %rbx,%r14 # (a|c)&b
179 add %r12,%r11 # h+=T1
181 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
182 lea 1(%rdi),%rdi # round++
184 add %r14,%r11 # h+=Maj(a,b,c)
197 and %rdx,%r15 # (f^g)&e
200 xor %r14,%r13 # Sigma1(e)
201 xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
202 add %r10,%r12 # T1+=h
205 add %r13,%r12 # T1+=Sigma1(e)
207 add %r15,%r12 # T1+=Ch(e,f,g)
214 add (%rbp,%rdi,8),%r12 # T1+=K[round]
220 xor %r13,%r10 # h=Sigma0(a)
222 add %r12,%rcx # d+=T1
224 and %rax,%r14 # (a|c)&b
225 add %r12,%r10 # h+=T1
227 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
228 lea 1(%rdi),%rdi # round++
230 add %r14,%r10 # h+=Maj(a,b,c)
243 and %rcx,%r15 # (f^g)&e
246 xor %r14,%r13 # Sigma1(e)
247 xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
251 add %r13,%r12 # T1+=Sigma1(e)
253 add %r15,%r12 # T1+=Ch(e,f,g)
260 add (%rbp,%rdi,8),%r12 # T1+=K[round]
266 xor %r13,%r9 # h=Sigma0(a)
268 add %r12,%rbx # d+=T1
270 and %r11,%r14 # (a|c)&b
273 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
274 lea 1(%rdi),%rdi # round++
276 add %r14,%r9 # h+=Maj(a,b,c)
289 and %rbx,%r15 # (f^g)&e
292 xor %r14,%r13 # Sigma1(e)
293 xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
297 add %r13,%r12 # T1+=Sigma1(e)
299 add %r15,%r12 # T1+=Ch(e,f,g)
306 add (%rbp,%rdi,8),%r12 # T1+=K[round]
312 xor %r13,%r8 # h=Sigma0(a)
314 add %r12,%rax # d+=T1
316 and %r10,%r14 # (a|c)&b
319 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
320 lea 1(%rdi),%rdi # round++
322 add %r14,%r8 # h+=Maj(a,b,c)
335 and %rax,%r15 # (f^g)&e
338 xor %r14,%r13 # Sigma1(e)
339 xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
340 add %rdx,%r12 # T1+=h
343 add %r13,%r12 # T1+=Sigma1(e)
345 add %r15,%r12 # T1+=Ch(e,f,g)
352 add (%rbp,%rdi,8),%r12 # T1+=K[round]
358 xor %r13,%rdx # h=Sigma0(a)
360 add %r12,%r11 # d+=T1
362 and %r9,%r14 # (a|c)&b
363 add %r12,%rdx # h+=T1
365 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
366 lea 1(%rdi),%rdi # round++
368 add %r14,%rdx # h+=Maj(a,b,c)
381 and %r11,%r15 # (f^g)&e
384 xor %r14,%r13 # Sigma1(e)
385 xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
386 add %rcx,%r12 # T1+=h
389 add %r13,%r12 # T1+=Sigma1(e)
391 add %r15,%r12 # T1+=Ch(e,f,g)
398 add (%rbp,%rdi,8),%r12 # T1+=K[round]
404 xor %r13,%rcx # h=Sigma0(a)
406 add %r12,%r10 # d+=T1
408 and %r8,%r14 # (a|c)&b
409 add %r12,%rcx # h+=T1
411 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
412 lea 1(%rdi),%rdi # round++
414 add %r14,%rcx # h+=Maj(a,b,c)
427 and %r10,%r15 # (f^g)&e
430 xor %r14,%r13 # Sigma1(e)
431 xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
432 add %rbx,%r12 # T1+=h
435 add %r13,%r12 # T1+=Sigma1(e)
437 add %r15,%r12 # T1+=Ch(e,f,g)
444 add (%rbp,%rdi,8),%r12 # T1+=K[round]
450 xor %r13,%rbx # h=Sigma0(a)
454 and %rdx,%r14 # (a|c)&b
455 add %r12,%rbx # h+=T1
457 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
458 lea 1(%rdi),%rdi # round++
460 add %r14,%rbx # h+=Maj(a,b,c)
473 and %r9,%r15 # (f^g)&e
476 xor %r14,%r13 # Sigma1(e)
477 xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
478 add %rax,%r12 # T1+=h
481 add %r13,%r12 # T1+=Sigma1(e)
483 add %r15,%r12 # T1+=Ch(e,f,g)
490 add (%rbp,%rdi,8),%r12 # T1+=K[round]
496 xor %r13,%rax # h=Sigma0(a)
500 and %rcx,%r14 # (a|c)&b
501 add %r12,%rax # h+=T1
503 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
504 lea 1(%rdi),%rdi # round++
506 add %r14,%rax # h+=Maj(a,b,c)
519 and %r8,%r15 # (f^g)&e
522 xor %r14,%r13 # Sigma1(e)
523 xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
524 add %r11,%r12 # T1+=h
527 add %r13,%r12 # T1+=Sigma1(e)
529 add %r15,%r12 # T1+=Ch(e,f,g)
536 add (%rbp,%rdi,8),%r12 # T1+=K[round]
542 xor %r13,%r11 # h=Sigma0(a)
544 add %r12,%rdx # d+=T1
546 and %rbx,%r14 # (a|c)&b
547 add %r12,%r11 # h+=T1
549 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
550 lea 1(%rdi),%rdi # round++
552 add %r14,%r11 # h+=Maj(a,b,c)
565 and %rdx,%r15 # (f^g)&e
568 xor %r14,%r13 # Sigma1(e)
569 xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
570 add %r10,%r12 # T1+=h
573 add %r13,%r12 # T1+=Sigma1(e)
575 add %r15,%r12 # T1+=Ch(e,f,g)
582 add (%rbp,%rdi,8),%r12 # T1+=K[round]
588 xor %r13,%r10 # h=Sigma0(a)
590 add %r12,%rcx # d+=T1
592 and %rax,%r14 # (a|c)&b
593 add %r12,%r10 # h+=T1
595 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
596 lea 1(%rdi),%rdi # round++
598 add %r14,%r10 # h+=Maj(a,b,c)
611 and %rcx,%r15 # (f^g)&e
614 xor %r14,%r13 # Sigma1(e)
615 xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
619 add %r13,%r12 # T1+=Sigma1(e)
621 add %r15,%r12 # T1+=Ch(e,f,g)
628 add (%rbp,%rdi,8),%r12 # T1+=K[round]
634 xor %r13,%r9 # h=Sigma0(a)
636 add %r12,%rbx # d+=T1
638 and %r11,%r14 # (a|c)&b
641 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
642 lea 1(%rdi),%rdi # round++
644 add %r14,%r9 # h+=Maj(a,b,c)
657 and %rbx,%r15 # (f^g)&e
660 xor %r14,%r13 # Sigma1(e)
661 xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
665 add %r13,%r12 # T1+=Sigma1(e)
667 add %r15,%r12 # T1+=Ch(e,f,g)
674 add (%rbp,%rdi,8),%r12 # T1+=K[round]
680 xor %r13,%r8 # h=Sigma0(a)
682 add %r12,%rax # d+=T1
684 and %r10,%r14 # (a|c)&b
687 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
688 lea 1(%rdi),%rdi # round++
690 add %r14,%r8 # h+=Maj(a,b,c)
703 and %rax,%r15 # (f^g)&e
706 xor %r14,%r13 # Sigma1(e)
707 xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
708 add %rdx,%r12 # T1+=h
711 add %r13,%r12 # T1+=Sigma1(e)
713 add %r15,%r12 # T1+=Ch(e,f,g)
720 add (%rbp,%rdi,8),%r12 # T1+=K[round]
726 xor %r13,%rdx # h=Sigma0(a)
728 add %r12,%r11 # d+=T1
730 and %r9,%r14 # (a|c)&b
731 add %r12,%rdx # h+=T1
733 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
734 lea 1(%rdi),%rdi # round++
736 add %r14,%rdx # h+=Maj(a,b,c)
749 and %r11,%r15 # (f^g)&e
752 xor %r14,%r13 # Sigma1(e)
753 xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
754 add %rcx,%r12 # T1+=h
757 add %r13,%r12 # T1+=Sigma1(e)
759 add %r15,%r12 # T1+=Ch(e,f,g)
766 add (%rbp,%rdi,8),%r12 # T1+=K[round]
772 xor %r13,%rcx # h=Sigma0(a)
774 add %r12,%r10 # d+=T1
776 and %r8,%r14 # (a|c)&b
777 add %r12,%rcx # h+=T1
779 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
780 lea 1(%rdi),%rdi # round++
782 add %r14,%rcx # h+=Maj(a,b,c)
795 and %r10,%r15 # (f^g)&e
798 xor %r14,%r13 # Sigma1(e)
799 xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
800 add %rbx,%r12 # T1+=h
803 add %r13,%r12 # T1+=Sigma1(e)
805 add %r15,%r12 # T1+=Ch(e,f,g)
812 add (%rbp,%rdi,8),%r12 # T1+=K[round]
818 xor %r13,%rbx # h=Sigma0(a)
822 and %rdx,%r14 # (a|c)&b
823 add %r12,%rbx # h+=T1
825 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
826 lea 1(%rdi),%rdi # round++
828 add %r14,%rbx # h+=Maj(a,b,c)
841 and %r9,%r15 # (f^g)&e
844 xor %r14,%r13 # Sigma1(e)
845 xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
846 add %rax,%r12 # T1+=h
849 add %r13,%r12 # T1+=Sigma1(e)
851 add %r15,%r12 # T1+=Ch(e,f,g)
858 add (%rbp,%rdi,8),%r12 # T1+=K[round]
864 xor %r13,%rax # h=Sigma0(a)
868 and %rcx,%r14 # (a|c)&b
869 add %r12,%rax # h+=T1
871 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
872 lea 1(%rdi),%rdi # round++
874 add %r14,%rax # h+=Maj(a,b,c)
# ---- Rounds 16..79: the 16-word schedule is extended in place before
# each round — the sigma0(X[(i+1)&0xf]) and sigma1(X[(i+14)&0xf]) terms
# are computed below; the remaining X[(i+9)&0xf] / X[i&0xf] additions
# of the FIPS 180-4 recurrence are among the omitted lines (confirm
# against the full source).  The round function itself is unchanged.
889 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
898 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
915 and %r8,%r15 # (f^g)&e
918 xor %r14,%r13 # Sigma1(e)
919 xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
920 add %r11,%r12 # T1+=h
923 add %r13,%r12 # T1+=Sigma1(e)
925 add %r15,%r12 # T1+=Ch(e,f,g)
932 add (%rbp,%rdi,8),%r12 # T1+=K[round]
938 xor %r13,%r11 # h=Sigma0(a)
940 add %r12,%rdx # d+=T1
942 and %rbx,%r14 # (a|c)&b
943 add %r12,%r11 # h+=T1
945 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
946 lea 1(%rdi),%rdi # round++
948 add %r14,%r11 # h+=Maj(a,b,c)
960 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
969 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
986 and %rdx,%r15 # (f^g)&e
989 xor %r14,%r13 # Sigma1(e)
990 xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
991 add %r10,%r12 # T1+=h
994 add %r13,%r12 # T1+=Sigma1(e)
996 add %r15,%r12 # T1+=Ch(e,f,g)
1003 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1009 xor %r13,%r10 # h=Sigma0(a)
1011 add %r12,%rcx # d+=T1
1013 and %rax,%r14 # (a|c)&b
1014 add %r12,%r10 # h+=T1
1016 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1017 lea 1(%rdi),%rdi # round++
1019 add %r14,%r10 # h+=Maj(a,b,c)
1031 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1040 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1057 and %rcx,%r15 # (f^g)&e
1060 xor %r14,%r13 # Sigma1(e)
1061 xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
1062 add %r9,%r12 # T1+=h
1065 add %r13,%r12 # T1+=Sigma1(e)
1067 add %r15,%r12 # T1+=Ch(e,f,g)
1074 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1080 xor %r13,%r9 # h=Sigma0(a)
1082 add %r12,%rbx # d+=T1
1084 and %r11,%r14 # (a|c)&b
1085 add %r12,%r9 # h+=T1
1087 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1088 lea 1(%rdi),%rdi # round++
1090 add %r14,%r9 # h+=Maj(a,b,c)
1102 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1111 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1128 and %rbx,%r15 # (f^g)&e
1131 xor %r14,%r13 # Sigma1(e)
1132 xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
1133 add %r8,%r12 # T1+=h
1136 add %r13,%r12 # T1+=Sigma1(e)
1138 add %r15,%r12 # T1+=Ch(e,f,g)
1145 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1151 xor %r13,%r8 # h=Sigma0(a)
1153 add %r12,%rax # d+=T1
1155 and %r10,%r14 # (a|c)&b
1156 add %r12,%r8 # h+=T1
1158 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1159 lea 1(%rdi),%rdi # round++
1161 add %r14,%r8 # h+=Maj(a,b,c)
1173 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1182 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1199 and %rax,%r15 # (f^g)&e
1202 xor %r14,%r13 # Sigma1(e)
1203 xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
1204 add %rdx,%r12 # T1+=h
1207 add %r13,%r12 # T1+=Sigma1(e)
1209 add %r15,%r12 # T1+=Ch(e,f,g)
1216 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1222 xor %r13,%rdx # h=Sigma0(a)
1224 add %r12,%r11 # d+=T1
1226 and %r9,%r14 # (a|c)&b
1227 add %r12,%rdx # h+=T1
1229 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1230 lea 1(%rdi),%rdi # round++
1232 add %r14,%rdx # h+=Maj(a,b,c)
1244 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1253 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1270 and %r11,%r15 # (f^g)&e
1273 xor %r14,%r13 # Sigma1(e)
1274 xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
1275 add %rcx,%r12 # T1+=h
1278 add %r13,%r12 # T1+=Sigma1(e)
1280 add %r15,%r12 # T1+=Ch(e,f,g)
1287 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1293 xor %r13,%rcx # h=Sigma0(a)
1295 add %r12,%r10 # d+=T1
1297 and %r8,%r14 # (a|c)&b
1298 add %r12,%rcx # h+=T1
1300 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1301 lea 1(%rdi),%rdi # round++
1303 add %r14,%rcx # h+=Maj(a,b,c)
1315 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1324 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1341 and %r10,%r15 # (f^g)&e
1344 xor %r14,%r13 # Sigma1(e)
1345 xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
1346 add %rbx,%r12 # T1+=h
1349 add %r13,%r12 # T1+=Sigma1(e)
1351 add %r15,%r12 # T1+=Ch(e,f,g)
1358 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1364 xor %r13,%rbx # h=Sigma0(a)
1366 add %r12,%r9 # d+=T1
1368 and %rdx,%r14 # (a|c)&b
1369 add %r12,%rbx # h+=T1
1371 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1372 lea 1(%rdi),%rdi # round++
1374 add %r14,%rbx # h+=Maj(a,b,c)
1386 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1395 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1412 and %r9,%r15 # (f^g)&e
1415 xor %r14,%r13 # Sigma1(e)
1416 xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
1417 add %rax,%r12 # T1+=h
1420 add %r13,%r12 # T1+=Sigma1(e)
1422 add %r15,%r12 # T1+=Ch(e,f,g)
1429 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1435 xor %r13,%rax # h=Sigma0(a)
1437 add %r12,%r8 # d+=T1
1439 and %rcx,%r14 # (a|c)&b
1440 add %r12,%rax # h+=T1
1442 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1443 lea 1(%rdi),%rdi # round++
1445 add %r14,%rax # h+=Maj(a,b,c)
1457 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1466 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1483 and %r8,%r15 # (f^g)&e
1486 xor %r14,%r13 # Sigma1(e)
1487 xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
1488 add %r11,%r12 # T1+=h
1491 add %r13,%r12 # T1+=Sigma1(e)
1493 add %r15,%r12 # T1+=Ch(e,f,g)
1500 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1506 xor %r13,%r11 # h=Sigma0(a)
1508 add %r12,%rdx # d+=T1
1510 and %rbx,%r14 # (a|c)&b
1511 add %r12,%r11 # h+=T1
1513 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1514 lea 1(%rdi),%rdi # round++
1516 add %r14,%r11 # h+=Maj(a,b,c)
1528 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1537 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1554 and %rdx,%r15 # (f^g)&e
1557 xor %r14,%r13 # Sigma1(e)
1558 xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
1559 add %r10,%r12 # T1+=h
1562 add %r13,%r12 # T1+=Sigma1(e)
1564 add %r15,%r12 # T1+=Ch(e,f,g)
1571 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1577 xor %r13,%r10 # h=Sigma0(a)
1579 add %r12,%rcx # d+=T1
1581 and %rax,%r14 # (a|c)&b
1582 add %r12,%r10 # h+=T1
1584 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1585 lea 1(%rdi),%rdi # round++
1587 add %r14,%r10 # h+=Maj(a,b,c)
1599 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1608 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1625 and %rcx,%r15 # (f^g)&e
1628 xor %r14,%r13 # Sigma1(e)
1629 xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
1630 add %r9,%r12 # T1+=h
1633 add %r13,%r12 # T1+=Sigma1(e)
1635 add %r15,%r12 # T1+=Ch(e,f,g)
1642 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1648 xor %r13,%r9 # h=Sigma0(a)
1650 add %r12,%rbx # d+=T1
1652 and %r11,%r14 # (a|c)&b
1653 add %r12,%r9 # h+=T1
1655 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1656 lea 1(%rdi),%rdi # round++
1658 add %r14,%r9 # h+=Maj(a,b,c)
1670 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1679 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1696 and %rbx,%r15 # (f^g)&e
1699 xor %r14,%r13 # Sigma1(e)
1700 xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
1701 add %r8,%r12 # T1+=h
1704 add %r13,%r12 # T1+=Sigma1(e)
1706 add %r15,%r12 # T1+=Ch(e,f,g)
1713 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1719 xor %r13,%r8 # h=Sigma0(a)
1721 add %r12,%rax # d+=T1
1723 and %r10,%r14 # (a|c)&b
1724 add %r12,%r8 # h+=T1
1726 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1727 lea 1(%rdi),%rdi # round++
1729 add %r14,%r8 # h+=Maj(a,b,c)
1741 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1750 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1767 and %rax,%r15 # (f^g)&e
1770 xor %r14,%r13 # Sigma1(e)
1771 xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
1772 add %rdx,%r12 # T1+=h
1775 add %r13,%r12 # T1+=Sigma1(e)
1777 add %r15,%r12 # T1+=Ch(e,f,g)
1784 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1790 xor %r13,%rdx # h=Sigma0(a)
1792 add %r12,%r11 # d+=T1
1794 and %r9,%r14 # (a|c)&b
1795 add %r12,%rdx # h+=T1
1797 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1798 lea 1(%rdi),%rdi # round++
1800 add %r14,%rdx # h+=Maj(a,b,c)
1812 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1821 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1838 and %r11,%r15 # (f^g)&e
1841 xor %r14,%r13 # Sigma1(e)
1842 xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
1843 add %rcx,%r12 # T1+=h
1846 add %r13,%r12 # T1+=Sigma1(e)
1848 add %r15,%r12 # T1+=Ch(e,f,g)
1855 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1861 xor %r13,%rcx # h=Sigma0(a)
1863 add %r12,%r10 # d+=T1
1865 and %r8,%r14 # (a|c)&b
1866 add %r12,%rcx # h+=T1
1868 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1869 lea 1(%rdi),%rdi # round++
1871 add %r14,%rcx # h+=Maj(a,b,c)
1883 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1892 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1909 and %r10,%r15 # (f^g)&e
1912 xor %r14,%r13 # Sigma1(e)
1913 xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
1914 add %rbx,%r12 # T1+=h
1917 add %r13,%r12 # T1+=Sigma1(e)
1919 add %r15,%r12 # T1+=Ch(e,f,g)
1926 add (%rbp,%rdi,8),%r12 # T1+=K[round]
1932 xor %r13,%rbx # h=Sigma0(a)
1934 add %r12,%r9 # d+=T1
1936 and %rdx,%r14 # (a|c)&b
1937 add %r12,%rbx # h+=T1
1939 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
1940 lea 1(%rdi),%rdi # round++
1942 add %r14,%rbx # h+=Maj(a,b,c)
1954 xor %r15,%r13 # sigma0(X[(i+1)&0xf])
1963 xor %r14,%r12 # sigma1(X[(i+14)&0xf])
1980 and %r9,%r15 # (f^g)&e
1983 xor %r14,%r13 # Sigma1(e)
1984 xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
1985 add %rax,%r12 # T1+=h
1988 add %r13,%r12 # T1+=Sigma1(e)
1990 add %r15,%r12 # T1+=Ch(e,f,g)
1997 add (%rbp,%rdi,8),%r12 # T1+=K[round]
2003 xor %r13,%rax # h=Sigma0(a)
2005 add %r12,%r8 # d+=T1
2007 and %rcx,%r14 # (a|c)&b
2008 add %r12,%rax # h+=T1
2010 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
2011 lea 1(%rdi),%rdi # round++
2013 add %r14,%rax # h+=Maj(a,b,c)
# ---- End of block: reload the context pointer, fold the working
# variables back into the hash state (those adds are among the omitted
# lines), then loop until the input end pointer is reached.
2017 mov 16*8+0*8(%rsp),%rdi # reload ctx, 1st arg (hash state)
2029 cmp 16*8+2*8(%rsp),%rsi # reached end of input?
2041 mov 16*8+3*8(%rsp),%rsp # restore caller's %rsp
2042 .cfi_def_cfa %rsp,56
# Epilogue: each CFA adjustment below pairs with an 8-byte pop of a
# callee-saved register (the pop instructions are not shown in this
# excerpt — confirm against the full source).
2044 .cfi_adjust_cfa_offset -8
2047 .cfi_adjust_cfa_offset -8
2050 .cfi_adjust_cfa_offset -8
2053 .cfi_adjust_cfa_offset -8
2056 .cfi_adjust_cfa_offset -8
2059 .cfi_adjust_cfa_offset -8
2064 SET_SIZE(SHA512TransformBlocks)
2070 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
2071 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
2072 .quad 0x3956c25bf348b538,0x59f111f1b605d019
2073 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
2074 .quad 0xd807aa98a3030242,0x12835b0145706fbe
2075 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
2076 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
2077 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
2078 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
2079 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
2080 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
2081 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
2082 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
2083 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
2084 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
2085 .quad 0x06ca6351e003826f,0x142929670a0e6e70
2086 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
2087 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
2088 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
2089 .quad 0x81c2c92e47edaee6,0x92722c851482353b
2090 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
2091 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
2092 .quad 0xd192e819d6ef5218,0xd69906245565a910
2093 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
2094 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
2095 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
2096 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
2097 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
2098 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
2099 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
2100 .quad 0x90befffa23631e28,0xa4506cebde82bde9
2101 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
2102 .quad 0xca273eceea26619c,0xd186b8c721c0c207
2103 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
2104 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
2105 .quad 0x113f9804bef90dae,0x1b710b35131c471b
2106 .quad 0x28db77f523047d84,0x32caab7b40c72493
2107 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
2108 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
2109 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
2110 #endif /* !lint && !__lint */
2113 .section .note.GNU-stack,"",%progbits