]> git.proxmox.com Git - mirror_zfs.git/blob - module/icp/asm-x86_64/sha2/sha256_impl.S
OpenZFS 4185 - add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R
[mirror_zfs.git] / module / icp / asm-x86_64 / sha2 / sha256_impl.S
1 /*
2 * ====================================================================
3 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4 * project. Rights for redistribution and usage in source and binary
5 * forms are granted according to the OpenSSL license.
6 * ====================================================================
7 *
8 * sha256/512_block procedure for x86_64.
9 *
10 * 40% improvement over compiler-generated code on Opteron. On EM64T
11 * sha256 was observed to run >80% faster and sha512 - >40%. No magical
12 * tricks, just straight implementation... I really wonder why gcc
13 * [being armed with inline assembler] fails to generate as fast code.
14 * The only thing which is cool about this module is that it's very
15 * same instruction sequence used for both SHA-256 and SHA-512. In
16 * former case the instructions operate on 32-bit operands, while in
17 * latter - on 64-bit ones. All I had to do is to get one flavor right,
18 * the other one passed the test right away:-)
19 *
20 * sha256_block runs in ~1005 cycles on Opteron, which gives you
21 * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
22 * frequency in GHz. sha512_block runs in ~1275 cycles, which results
23 * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
24 * Well, if you compare it to IA-64 implementation, which maintains
25 * X[16] in register bank[!], tends to 4 instructions per CPU clock
26 * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
27 * issue Opteron pipeline and X[16] maintained in memory. So that *if*
28 * there is a way to improve it, *then* the only way would be to try to
29 * offload X[16] updates to SSE unit, but that would require "deeper"
30 * loop unroll, which in turn would naturally cause size blow-up, not
31 * to mention increased complexity! And once again, only *if* it's
32 * actually possible to noticeably improve overall ILP, instruction
33 * level parallelism, on a given CPU implementation in this case.
34 *
35 * Special note on Intel EM64T. While Opteron CPU exhibits perfect
36 * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
37 * [currently available] EM64T CPUs apparently are far from it. On the
38 * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
39 * sha256_block:-( This is presumably because 64-bit shifts/rotates
40 * apparently are not atomic instructions, but implemented in microcode.
41 */
42
43 /*
44 * OpenSolaris OS modifications
45 *
46 * Sun elects to use this software under the BSD license.
47 *
48 * This source originates from OpenSSL file sha512-x86_64.pl at
49 * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
50 * (presumably for future OpenSSL release 0.9.8h), with these changes:
51 *
52 * 1. Added perl "use strict" and declared variables.
53 *
54 * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
55 * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
56 *
57 * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
58 * assemblers). Replaced the .picmeup macro with assembler code.
59 *
60 * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
61 * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
62 */
63
64 /*
65 * This file was generated by a perl script (sha512-x86_64.pl) that was
66 * used to generate sha256 and sha512 variants from the same code base.
67 * The comments from the original file have been pasted above.
68 */
69
70 #if defined(lint) || defined(__lint)
71 #include <sys/stdint.h>
72 #include <sha2/sha2.h>
73
74 /* ARGSUSED */
75 void
76 SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
77 {
/*
 * Intentionally empty: this C definition is compiled only when lint or
 * __lint is defined. Otherwise the #else branch below supplies the
 * assembly routine of the same name.
 */
78 }
79
80
81 #else
82 #define _ASM
83 #include <sys/asm_linkage.h>
84
85 ENTRY_NP(SHA256TransformBlocks)
86 push %rbx
87 push %rbp
88 push %r12
89 push %r13
90 push %r14
91 push %r15
92 mov %rsp,%rbp # copy %rsp
93 shl $4,%rdx # num*16
94 sub $16*4+4*8,%rsp
95 lea (%rsi,%rdx,4),%rdx # inp+num*16*4
96 and $-64,%rsp # align stack frame
97 add $8,%rdi # Skip OpenSolaris field, "algotype"
98 mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg
99 mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg
100 mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg
101 mov %rbp,16*4+3*8(%rsp) # save copy of %rsp
102
103 /.picmeup %rbp
104 / The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
105 / the address of the "next" instruction into the target register
106 / (%rbp). This generates these 2 instructions:
107 lea .Llea(%rip),%rbp
108 /nop / .picmeup generates a nop for mod 8 alignment--not needed here
109
110 .Llea:
111 lea K256-.(%rbp),%rbp
112
113 mov 4*0(%rdi),%eax
114 mov 4*1(%rdi),%ebx
115 mov 4*2(%rdi),%ecx
116 mov 4*3(%rdi),%edx
117 mov 4*4(%rdi),%r8d
118 mov 4*5(%rdi),%r9d
119 mov 4*6(%rdi),%r10d
120 mov 4*7(%rdi),%r11d
121 jmp .Lloop
122
123 .align 16
124 .Lloop:
125 xor %rdi,%rdi
126 mov 4*0(%rsi),%r12d
127 bswap %r12d
128 mov %r8d,%r13d
129 mov %r8d,%r14d
130 mov %r9d,%r15d
131
132 ror $6,%r13d
133 ror $11,%r14d
134 xor %r10d,%r15d # f^g
135
136 xor %r14d,%r13d
137 ror $14,%r14d
138 and %r8d,%r15d # (f^g)&e
139 mov %r12d,0(%rsp)
140
141 xor %r14d,%r13d # Sigma1(e)
142 xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
143 add %r11d,%r12d # T1+=h
144
145 mov %eax,%r11d
146 add %r13d,%r12d # T1+=Sigma1(e)
147
148 add %r15d,%r12d # T1+=Ch(e,f,g)
149 mov %eax,%r13d
150 mov %eax,%r14d
151
152 ror $2,%r11d
153 ror $13,%r13d
154 mov %eax,%r15d
155 add (%rbp,%rdi,4),%r12d # T1+=K[round]
156
157 xor %r13d,%r11d
158 ror $9,%r13d
159 or %ecx,%r14d # a|c
160
161 xor %r13d,%r11d # h=Sigma0(a)
162 and %ecx,%r15d # a&c
163 add %r12d,%edx # d+=T1
164
165 and %ebx,%r14d # (a|c)&b
166 add %r12d,%r11d # h+=T1
167
168 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
169 lea 1(%rdi),%rdi # round++
170
171 add %r14d,%r11d # h+=Maj(a,b,c)
172 mov 4*1(%rsi),%r12d
173 bswap %r12d
174 mov %edx,%r13d
175 mov %edx,%r14d
176 mov %r8d,%r15d
177
178 ror $6,%r13d
179 ror $11,%r14d
180 xor %r9d,%r15d # f^g
181
182 xor %r14d,%r13d
183 ror $14,%r14d
184 and %edx,%r15d # (f^g)&e
185 mov %r12d,4(%rsp)
186
187 xor %r14d,%r13d # Sigma1(e)
188 xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
189 add %r10d,%r12d # T1+=h
190
191 mov %r11d,%r10d
192 add %r13d,%r12d # T1+=Sigma1(e)
193
194 add %r15d,%r12d # T1+=Ch(e,f,g)
195 mov %r11d,%r13d
196 mov %r11d,%r14d
197
198 ror $2,%r10d
199 ror $13,%r13d
200 mov %r11d,%r15d
201 add (%rbp,%rdi,4),%r12d # T1+=K[round]
202
203 xor %r13d,%r10d
204 ror $9,%r13d
205 or %ebx,%r14d # a|c
206
207 xor %r13d,%r10d # h=Sigma0(a)
208 and %ebx,%r15d # a&c
209 add %r12d,%ecx # d+=T1
210
211 and %eax,%r14d # (a|c)&b
212 add %r12d,%r10d # h+=T1
213
214 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
215 lea 1(%rdi),%rdi # round++
216
217 add %r14d,%r10d # h+=Maj(a,b,c)
218 mov 4*2(%rsi),%r12d
219 bswap %r12d
220 mov %ecx,%r13d
221 mov %ecx,%r14d
222 mov %edx,%r15d
223
224 ror $6,%r13d
225 ror $11,%r14d
226 xor %r8d,%r15d # f^g
227
228 xor %r14d,%r13d
229 ror $14,%r14d
230 and %ecx,%r15d # (f^g)&e
231 mov %r12d,8(%rsp)
232
233 xor %r14d,%r13d # Sigma1(e)
234 xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
235 add %r9d,%r12d # T1+=h
236
237 mov %r10d,%r9d
238 add %r13d,%r12d # T1+=Sigma1(e)
239
240 add %r15d,%r12d # T1+=Ch(e,f,g)
241 mov %r10d,%r13d
242 mov %r10d,%r14d
243
244 ror $2,%r9d
245 ror $13,%r13d
246 mov %r10d,%r15d
247 add (%rbp,%rdi,4),%r12d # T1+=K[round]
248
249 xor %r13d,%r9d
250 ror $9,%r13d
251 or %eax,%r14d # a|c
252
253 xor %r13d,%r9d # h=Sigma0(a)
254 and %eax,%r15d # a&c
255 add %r12d,%ebx # d+=T1
256
257 and %r11d,%r14d # (a|c)&b
258 add %r12d,%r9d # h+=T1
259
260 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
261 lea 1(%rdi),%rdi # round++
262
263 add %r14d,%r9d # h+=Maj(a,b,c)
264 mov 4*3(%rsi),%r12d
265 bswap %r12d
266 mov %ebx,%r13d
267 mov %ebx,%r14d
268 mov %ecx,%r15d
269
270 ror $6,%r13d
271 ror $11,%r14d
272 xor %edx,%r15d # f^g
273
274 xor %r14d,%r13d
275 ror $14,%r14d
276 and %ebx,%r15d # (f^g)&e
277 mov %r12d,12(%rsp)
278
279 xor %r14d,%r13d # Sigma1(e)
280 xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
281 add %r8d,%r12d # T1+=h
282
283 mov %r9d,%r8d
284 add %r13d,%r12d # T1+=Sigma1(e)
285
286 add %r15d,%r12d # T1+=Ch(e,f,g)
287 mov %r9d,%r13d
288 mov %r9d,%r14d
289
290 ror $2,%r8d
291 ror $13,%r13d
292 mov %r9d,%r15d
293 add (%rbp,%rdi,4),%r12d # T1+=K[round]
294
295 xor %r13d,%r8d
296 ror $9,%r13d
297 or %r11d,%r14d # a|c
298
299 xor %r13d,%r8d # h=Sigma0(a)
300 and %r11d,%r15d # a&c
301 add %r12d,%eax # d+=T1
302
303 and %r10d,%r14d # (a|c)&b
304 add %r12d,%r8d # h+=T1
305
306 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
307 lea 1(%rdi),%rdi # round++
308
309 add %r14d,%r8d # h+=Maj(a,b,c)
310 mov 4*4(%rsi),%r12d
311 bswap %r12d
312 mov %eax,%r13d
313 mov %eax,%r14d
314 mov %ebx,%r15d
315
316 ror $6,%r13d
317 ror $11,%r14d
318 xor %ecx,%r15d # f^g
319
320 xor %r14d,%r13d
321 ror $14,%r14d
322 and %eax,%r15d # (f^g)&e
323 mov %r12d,16(%rsp)
324
325 xor %r14d,%r13d # Sigma1(e)
326 xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
327 add %edx,%r12d # T1+=h
328
329 mov %r8d,%edx
330 add %r13d,%r12d # T1+=Sigma1(e)
331
332 add %r15d,%r12d # T1+=Ch(e,f,g)
333 mov %r8d,%r13d
334 mov %r8d,%r14d
335
336 ror $2,%edx
337 ror $13,%r13d
338 mov %r8d,%r15d
339 add (%rbp,%rdi,4),%r12d # T1+=K[round]
340
341 xor %r13d,%edx
342 ror $9,%r13d
343 or %r10d,%r14d # a|c
344
345 xor %r13d,%edx # h=Sigma0(a)
346 and %r10d,%r15d # a&c
347 add %r12d,%r11d # d+=T1
348
349 and %r9d,%r14d # (a|c)&b
350 add %r12d,%edx # h+=T1
351
352 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
353 lea 1(%rdi),%rdi # round++
354
355 add %r14d,%edx # h+=Maj(a,b,c)
356 mov 4*5(%rsi),%r12d
357 bswap %r12d
358 mov %r11d,%r13d
359 mov %r11d,%r14d
360 mov %eax,%r15d
361
362 ror $6,%r13d
363 ror $11,%r14d
364 xor %ebx,%r15d # f^g
365
366 xor %r14d,%r13d
367 ror $14,%r14d
368 and %r11d,%r15d # (f^g)&e
369 mov %r12d,20(%rsp)
370
371 xor %r14d,%r13d # Sigma1(e)
372 xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
373 add %ecx,%r12d # T1+=h
374
375 mov %edx,%ecx
376 add %r13d,%r12d # T1+=Sigma1(e)
377
378 add %r15d,%r12d # T1+=Ch(e,f,g)
379 mov %edx,%r13d
380 mov %edx,%r14d
381
382 ror $2,%ecx
383 ror $13,%r13d
384 mov %edx,%r15d
385 add (%rbp,%rdi,4),%r12d # T1+=K[round]
386
387 xor %r13d,%ecx
388 ror $9,%r13d
389 or %r9d,%r14d # a|c
390
391 xor %r13d,%ecx # h=Sigma0(a)
392 and %r9d,%r15d # a&c
393 add %r12d,%r10d # d+=T1
394
395 and %r8d,%r14d # (a|c)&b
396 add %r12d,%ecx # h+=T1
397
398 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
399 lea 1(%rdi),%rdi # round++
400
401 add %r14d,%ecx # h+=Maj(a,b,c)
402 mov 4*6(%rsi),%r12d
403 bswap %r12d
404 mov %r10d,%r13d
405 mov %r10d,%r14d
406 mov %r11d,%r15d
407
408 ror $6,%r13d
409 ror $11,%r14d
410 xor %eax,%r15d # f^g
411
412 xor %r14d,%r13d
413 ror $14,%r14d
414 and %r10d,%r15d # (f^g)&e
415 mov %r12d,24(%rsp)
416
417 xor %r14d,%r13d # Sigma1(e)
418 xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
419 add %ebx,%r12d # T1+=h
420
421 mov %ecx,%ebx
422 add %r13d,%r12d # T1+=Sigma1(e)
423
424 add %r15d,%r12d # T1+=Ch(e,f,g)
425 mov %ecx,%r13d
426 mov %ecx,%r14d
427
428 ror $2,%ebx
429 ror $13,%r13d
430 mov %ecx,%r15d
431 add (%rbp,%rdi,4),%r12d # T1+=K[round]
432
433 xor %r13d,%ebx
434 ror $9,%r13d
435 or %r8d,%r14d # a|c
436
437 xor %r13d,%ebx # h=Sigma0(a)
438 and %r8d,%r15d # a&c
439 add %r12d,%r9d # d+=T1
440
441 and %edx,%r14d # (a|c)&b
442 add %r12d,%ebx # h+=T1
443
444 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
445 lea 1(%rdi),%rdi # round++
446
447 add %r14d,%ebx # h+=Maj(a,b,c)
448 mov 4*7(%rsi),%r12d
449 bswap %r12d
450 mov %r9d,%r13d
451 mov %r9d,%r14d
452 mov %r10d,%r15d
453
454 ror $6,%r13d
455 ror $11,%r14d
456 xor %r11d,%r15d # f^g
457
458 xor %r14d,%r13d
459 ror $14,%r14d
460 and %r9d,%r15d # (f^g)&e
461 mov %r12d,28(%rsp)
462
463 xor %r14d,%r13d # Sigma1(e)
464 xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
465 add %eax,%r12d # T1+=h
466
467 mov %ebx,%eax
468 add %r13d,%r12d # T1+=Sigma1(e)
469
470 add %r15d,%r12d # T1+=Ch(e,f,g)
471 mov %ebx,%r13d
472 mov %ebx,%r14d
473
474 ror $2,%eax
475 ror $13,%r13d
476 mov %ebx,%r15d
477 add (%rbp,%rdi,4),%r12d # T1+=K[round]
478
479 xor %r13d,%eax
480 ror $9,%r13d
481 or %edx,%r14d # a|c
482
483 xor %r13d,%eax # h=Sigma0(a)
484 and %edx,%r15d # a&c
485 add %r12d,%r8d # d+=T1
486
487 and %ecx,%r14d # (a|c)&b
488 add %r12d,%eax # h+=T1
489
490 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
491 lea 1(%rdi),%rdi # round++
492
493 add %r14d,%eax # h+=Maj(a,b,c)
494 mov 4*8(%rsi),%r12d
495 bswap %r12d
496 mov %r8d,%r13d
497 mov %r8d,%r14d
498 mov %r9d,%r15d
499
500 ror $6,%r13d
501 ror $11,%r14d
502 xor %r10d,%r15d # f^g
503
504 xor %r14d,%r13d
505 ror $14,%r14d
506 and %r8d,%r15d # (f^g)&e
507 mov %r12d,32(%rsp)
508
509 xor %r14d,%r13d # Sigma1(e)
510 xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
511 add %r11d,%r12d # T1+=h
512
513 mov %eax,%r11d
514 add %r13d,%r12d # T1+=Sigma1(e)
515
516 add %r15d,%r12d # T1+=Ch(e,f,g)
517 mov %eax,%r13d
518 mov %eax,%r14d
519
520 ror $2,%r11d
521 ror $13,%r13d
522 mov %eax,%r15d
523 add (%rbp,%rdi,4),%r12d # T1+=K[round]
524
525 xor %r13d,%r11d
526 ror $9,%r13d
527 or %ecx,%r14d # a|c
528
529 xor %r13d,%r11d # h=Sigma0(a)
530 and %ecx,%r15d # a&c
531 add %r12d,%edx # d+=T1
532
533 and %ebx,%r14d # (a|c)&b
534 add %r12d,%r11d # h+=T1
535
536 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
537 lea 1(%rdi),%rdi # round++
538
539 add %r14d,%r11d # h+=Maj(a,b,c)
540 mov 4*9(%rsi),%r12d
541 bswap %r12d
542 mov %edx,%r13d
543 mov %edx,%r14d
544 mov %r8d,%r15d
545
546 ror $6,%r13d
547 ror $11,%r14d
548 xor %r9d,%r15d # f^g
549
550 xor %r14d,%r13d
551 ror $14,%r14d
552 and %edx,%r15d # (f^g)&e
553 mov %r12d,36(%rsp)
554
555 xor %r14d,%r13d # Sigma1(e)
556 xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
557 add %r10d,%r12d # T1+=h
558
559 mov %r11d,%r10d
560 add %r13d,%r12d # T1+=Sigma1(e)
561
562 add %r15d,%r12d # T1+=Ch(e,f,g)
563 mov %r11d,%r13d
564 mov %r11d,%r14d
565
566 ror $2,%r10d
567 ror $13,%r13d
568 mov %r11d,%r15d
569 add (%rbp,%rdi,4),%r12d # T1+=K[round]
570
571 xor %r13d,%r10d
572 ror $9,%r13d
573 or %ebx,%r14d # a|c
574
575 xor %r13d,%r10d # h=Sigma0(a)
576 and %ebx,%r15d # a&c
577 add %r12d,%ecx # d+=T1
578
579 and %eax,%r14d # (a|c)&b
580 add %r12d,%r10d # h+=T1
581
582 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
583 lea 1(%rdi),%rdi # round++
584
585 add %r14d,%r10d # h+=Maj(a,b,c)
586 mov 4*10(%rsi),%r12d
587 bswap %r12d
588 mov %ecx,%r13d
589 mov %ecx,%r14d
590 mov %edx,%r15d
591
592 ror $6,%r13d
593 ror $11,%r14d
594 xor %r8d,%r15d # f^g
595
596 xor %r14d,%r13d
597 ror $14,%r14d
598 and %ecx,%r15d # (f^g)&e
599 mov %r12d,40(%rsp)
600
601 xor %r14d,%r13d # Sigma1(e)
602 xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
603 add %r9d,%r12d # T1+=h
604
605 mov %r10d,%r9d
606 add %r13d,%r12d # T1+=Sigma1(e)
607
608 add %r15d,%r12d # T1+=Ch(e,f,g)
609 mov %r10d,%r13d
610 mov %r10d,%r14d
611
612 ror $2,%r9d
613 ror $13,%r13d
614 mov %r10d,%r15d
615 add (%rbp,%rdi,4),%r12d # T1+=K[round]
616
617 xor %r13d,%r9d
618 ror $9,%r13d
619 or %eax,%r14d # a|c
620
621 xor %r13d,%r9d # h=Sigma0(a)
622 and %eax,%r15d # a&c
623 add %r12d,%ebx # d+=T1
624
625 and %r11d,%r14d # (a|c)&b
626 add %r12d,%r9d # h+=T1
627
628 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
629 lea 1(%rdi),%rdi # round++
630
631 add %r14d,%r9d # h+=Maj(a,b,c)
632 mov 4*11(%rsi),%r12d
633 bswap %r12d
634 mov %ebx,%r13d
635 mov %ebx,%r14d
636 mov %ecx,%r15d
637
638 ror $6,%r13d
639 ror $11,%r14d
640 xor %edx,%r15d # f^g
641
642 xor %r14d,%r13d
643 ror $14,%r14d
644 and %ebx,%r15d # (f^g)&e
645 mov %r12d,44(%rsp)
646
647 xor %r14d,%r13d # Sigma1(e)
648 xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
649 add %r8d,%r12d # T1+=h
650
651 mov %r9d,%r8d
652 add %r13d,%r12d # T1+=Sigma1(e)
653
654 add %r15d,%r12d # T1+=Ch(e,f,g)
655 mov %r9d,%r13d
656 mov %r9d,%r14d
657
658 ror $2,%r8d
659 ror $13,%r13d
660 mov %r9d,%r15d
661 add (%rbp,%rdi,4),%r12d # T1+=K[round]
662
663 xor %r13d,%r8d
664 ror $9,%r13d
665 or %r11d,%r14d # a|c
666
667 xor %r13d,%r8d # h=Sigma0(a)
668 and %r11d,%r15d # a&c
669 add %r12d,%eax # d+=T1
670
671 and %r10d,%r14d # (a|c)&b
672 add %r12d,%r8d # h+=T1
673
674 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
675 lea 1(%rdi),%rdi # round++
676
677 add %r14d,%r8d # h+=Maj(a,b,c)
678 mov 4*12(%rsi),%r12d
679 bswap %r12d
680 mov %eax,%r13d
681 mov %eax,%r14d
682 mov %ebx,%r15d
683
684 ror $6,%r13d
685 ror $11,%r14d
686 xor %ecx,%r15d # f^g
687
688 xor %r14d,%r13d
689 ror $14,%r14d
690 and %eax,%r15d # (f^g)&e
691 mov %r12d,48(%rsp)
692
693 xor %r14d,%r13d # Sigma1(e)
694 xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
695 add %edx,%r12d # T1+=h
696
697 mov %r8d,%edx
698 add %r13d,%r12d # T1+=Sigma1(e)
699
700 add %r15d,%r12d # T1+=Ch(e,f,g)
701 mov %r8d,%r13d
702 mov %r8d,%r14d
703
704 ror $2,%edx
705 ror $13,%r13d
706 mov %r8d,%r15d
707 add (%rbp,%rdi,4),%r12d # T1+=K[round]
708
709 xor %r13d,%edx
710 ror $9,%r13d
711 or %r10d,%r14d # a|c
712
713 xor %r13d,%edx # h=Sigma0(a)
714 and %r10d,%r15d # a&c
715 add %r12d,%r11d # d+=T1
716
717 and %r9d,%r14d # (a|c)&b
718 add %r12d,%edx # h+=T1
719
720 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
721 lea 1(%rdi),%rdi # round++
722
723 add %r14d,%edx # h+=Maj(a,b,c)
724 mov 4*13(%rsi),%r12d
725 bswap %r12d
726 mov %r11d,%r13d
727 mov %r11d,%r14d
728 mov %eax,%r15d
729
730 ror $6,%r13d
731 ror $11,%r14d
732 xor %ebx,%r15d # f^g
733
734 xor %r14d,%r13d
735 ror $14,%r14d
736 and %r11d,%r15d # (f^g)&e
737 mov %r12d,52(%rsp)
738
739 xor %r14d,%r13d # Sigma1(e)
740 xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
741 add %ecx,%r12d # T1+=h
742
743 mov %edx,%ecx
744 add %r13d,%r12d # T1+=Sigma1(e)
745
746 add %r15d,%r12d # T1+=Ch(e,f,g)
747 mov %edx,%r13d
748 mov %edx,%r14d
749
750 ror $2,%ecx
751 ror $13,%r13d
752 mov %edx,%r15d
753 add (%rbp,%rdi,4),%r12d # T1+=K[round]
754
755 xor %r13d,%ecx
756 ror $9,%r13d
757 or %r9d,%r14d # a|c
758
759 xor %r13d,%ecx # h=Sigma0(a)
760 and %r9d,%r15d # a&c
761 add %r12d,%r10d # d+=T1
762
763 and %r8d,%r14d # (a|c)&b
764 add %r12d,%ecx # h+=T1
765
766 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
767 lea 1(%rdi),%rdi # round++
768
769 add %r14d,%ecx # h+=Maj(a,b,c)
770 mov 4*14(%rsi),%r12d
771 bswap %r12d
772 mov %r10d,%r13d
773 mov %r10d,%r14d
774 mov %r11d,%r15d
775
776 ror $6,%r13d
777 ror $11,%r14d
778 xor %eax,%r15d # f^g
779
780 xor %r14d,%r13d
781 ror $14,%r14d
782 and %r10d,%r15d # (f^g)&e
783 mov %r12d,56(%rsp)
784
785 xor %r14d,%r13d # Sigma1(e)
786 xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
787 add %ebx,%r12d # T1+=h
788
789 mov %ecx,%ebx
790 add %r13d,%r12d # T1+=Sigma1(e)
791
792 add %r15d,%r12d # T1+=Ch(e,f,g)
793 mov %ecx,%r13d
794 mov %ecx,%r14d
795
796 ror $2,%ebx
797 ror $13,%r13d
798 mov %ecx,%r15d
799 add (%rbp,%rdi,4),%r12d # T1+=K[round]
800
801 xor %r13d,%ebx
802 ror $9,%r13d
803 or %r8d,%r14d # a|c
804
805 xor %r13d,%ebx # h=Sigma0(a)
806 and %r8d,%r15d # a&c
807 add %r12d,%r9d # d+=T1
808
809 and %edx,%r14d # (a|c)&b
810 add %r12d,%ebx # h+=T1
811
812 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
813 lea 1(%rdi),%rdi # round++
814
815 add %r14d,%ebx # h+=Maj(a,b,c)
816 mov 4*15(%rsi),%r12d
817 bswap %r12d
818 mov %r9d,%r13d
819 mov %r9d,%r14d
820 mov %r10d,%r15d
821
822 ror $6,%r13d
823 ror $11,%r14d
824 xor %r11d,%r15d # f^g
825
826 xor %r14d,%r13d
827 ror $14,%r14d
828 and %r9d,%r15d # (f^g)&e
829 mov %r12d,60(%rsp)
830
831 xor %r14d,%r13d # Sigma1(e)
832 xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
833 add %eax,%r12d # T1+=h
834
835 mov %ebx,%eax
836 add %r13d,%r12d # T1+=Sigma1(e)
837
838 add %r15d,%r12d # T1+=Ch(e,f,g)
839 mov %ebx,%r13d
840 mov %ebx,%r14d
841
842 ror $2,%eax
843 ror $13,%r13d
844 mov %ebx,%r15d
845 add (%rbp,%rdi,4),%r12d # T1+=K[round]
846
847 xor %r13d,%eax
848 ror $9,%r13d
849 or %edx,%r14d # a|c
850
851 xor %r13d,%eax # h=Sigma0(a)
852 and %edx,%r15d # a&c
853 add %r12d,%r8d # d+=T1
854
855 and %ecx,%r14d # (a|c)&b
856 add %r12d,%eax # h+=T1
857
858 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
859 lea 1(%rdi),%rdi # round++
860
861 add %r14d,%eax # h+=Maj(a,b,c)
862 jmp .Lrounds_16_xx
863 .align 16
864 .Lrounds_16_xx:
865 mov 4(%rsp),%r13d
866 mov 56(%rsp),%r12d
867
868 mov %r13d,%r15d
869
870 shr $3,%r13d
871 ror $7,%r15d
872
873 xor %r15d,%r13d
874 ror $11,%r15d
875
876 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
877 mov %r12d,%r14d
878
879 shr $10,%r12d
880 ror $17,%r14d
881
882 xor %r14d,%r12d
883 ror $2,%r14d
884
885 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
886
887 add %r13d,%r12d
888
889 add 36(%rsp),%r12d
890
891 add 0(%rsp),%r12d
892 mov %r8d,%r13d
893 mov %r8d,%r14d
894 mov %r9d,%r15d
895
896 ror $6,%r13d
897 ror $11,%r14d
898 xor %r10d,%r15d # f^g
899
900 xor %r14d,%r13d
901 ror $14,%r14d
902 and %r8d,%r15d # (f^g)&e
903 mov %r12d,0(%rsp)
904
905 xor %r14d,%r13d # Sigma1(e)
906 xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
907 add %r11d,%r12d # T1+=h
908
909 mov %eax,%r11d
910 add %r13d,%r12d # T1+=Sigma1(e)
911
912 add %r15d,%r12d # T1+=Ch(e,f,g)
913 mov %eax,%r13d
914 mov %eax,%r14d
915
916 ror $2,%r11d
917 ror $13,%r13d
918 mov %eax,%r15d
919 add (%rbp,%rdi,4),%r12d # T1+=K[round]
920
921 xor %r13d,%r11d
922 ror $9,%r13d
923 or %ecx,%r14d # a|c
924
925 xor %r13d,%r11d # h=Sigma0(a)
926 and %ecx,%r15d # a&c
927 add %r12d,%edx # d+=T1
928
929 and %ebx,%r14d # (a|c)&b
930 add %r12d,%r11d # h+=T1
931
932 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
933 lea 1(%rdi),%rdi # round++
934
935 add %r14d,%r11d # h+=Maj(a,b,c)
936 mov 8(%rsp),%r13d
937 mov 60(%rsp),%r12d
938
939 mov %r13d,%r15d
940
941 shr $3,%r13d
942 ror $7,%r15d
943
944 xor %r15d,%r13d
945 ror $11,%r15d
946
947 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
948 mov %r12d,%r14d
949
950 shr $10,%r12d
951 ror $17,%r14d
952
953 xor %r14d,%r12d
954 ror $2,%r14d
955
956 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
957
958 add %r13d,%r12d
959
960 add 40(%rsp),%r12d
961
962 add 4(%rsp),%r12d
963 mov %edx,%r13d
964 mov %edx,%r14d
965 mov %r8d,%r15d
966
967 ror $6,%r13d
968 ror $11,%r14d
969 xor %r9d,%r15d # f^g
970
971 xor %r14d,%r13d
972 ror $14,%r14d
973 and %edx,%r15d # (f^g)&e
974 mov %r12d,4(%rsp)
975
976 xor %r14d,%r13d # Sigma1(e)
977 xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
978 add %r10d,%r12d # T1+=h
979
980 mov %r11d,%r10d
981 add %r13d,%r12d # T1+=Sigma1(e)
982
983 add %r15d,%r12d # T1+=Ch(e,f,g)
984 mov %r11d,%r13d
985 mov %r11d,%r14d
986
987 ror $2,%r10d
988 ror $13,%r13d
989 mov %r11d,%r15d
990 add (%rbp,%rdi,4),%r12d # T1+=K[round]
991
992 xor %r13d,%r10d
993 ror $9,%r13d
994 or %ebx,%r14d # a|c
995
996 xor %r13d,%r10d # h=Sigma0(a)
997 and %ebx,%r15d # a&c
998 add %r12d,%ecx # d+=T1
999
1000 and %eax,%r14d # (a|c)&b
1001 add %r12d,%r10d # h+=T1
1002
1003 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1004 lea 1(%rdi),%rdi # round++
1005
1006 add %r14d,%r10d # h+=Maj(a,b,c)
1007 mov 12(%rsp),%r13d
1008 mov 0(%rsp),%r12d
1009
1010 mov %r13d,%r15d
1011
1012 shr $3,%r13d
1013 ror $7,%r15d
1014
1015 xor %r15d,%r13d
1016 ror $11,%r15d
1017
1018 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1019 mov %r12d,%r14d
1020
1021 shr $10,%r12d
1022 ror $17,%r14d
1023
1024 xor %r14d,%r12d
1025 ror $2,%r14d
1026
1027 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1028
1029 add %r13d,%r12d
1030
1031 add 44(%rsp),%r12d
1032
1033 add 8(%rsp),%r12d
1034 mov %ecx,%r13d
1035 mov %ecx,%r14d
1036 mov %edx,%r15d
1037
1038 ror $6,%r13d
1039 ror $11,%r14d
1040 xor %r8d,%r15d # f^g
1041
1042 xor %r14d,%r13d
1043 ror $14,%r14d
1044 and %ecx,%r15d # (f^g)&e
1045 mov %r12d,8(%rsp)
1046
1047 xor %r14d,%r13d # Sigma1(e)
1048 xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1049 add %r9d,%r12d # T1+=h
1050
1051 mov %r10d,%r9d
1052 add %r13d,%r12d # T1+=Sigma1(e)
1053
1054 add %r15d,%r12d # T1+=Ch(e,f,g)
1055 mov %r10d,%r13d
1056 mov %r10d,%r14d
1057
1058 ror $2,%r9d
1059 ror $13,%r13d
1060 mov %r10d,%r15d
1061 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1062
1063 xor %r13d,%r9d
1064 ror $9,%r13d
1065 or %eax,%r14d # a|c
1066
1067 xor %r13d,%r9d # h=Sigma0(a)
1068 and %eax,%r15d # a&c
1069 add %r12d,%ebx # d+=T1
1070
1071 and %r11d,%r14d # (a|c)&b
1072 add %r12d,%r9d # h+=T1
1073
1074 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1075 lea 1(%rdi),%rdi # round++
1076
1077 add %r14d,%r9d # h+=Maj(a,b,c)
1078 mov 16(%rsp),%r13d
1079 mov 4(%rsp),%r12d
1080
1081 mov %r13d,%r15d
1082
1083 shr $3,%r13d
1084 ror $7,%r15d
1085
1086 xor %r15d,%r13d
1087 ror $11,%r15d
1088
1089 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1090 mov %r12d,%r14d
1091
1092 shr $10,%r12d
1093 ror $17,%r14d
1094
1095 xor %r14d,%r12d
1096 ror $2,%r14d
1097
1098 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1099
1100 add %r13d,%r12d
1101
1102 add 48(%rsp),%r12d
1103
1104 add 12(%rsp),%r12d
1105 mov %ebx,%r13d
1106 mov %ebx,%r14d
1107 mov %ecx,%r15d
1108
1109 ror $6,%r13d
1110 ror $11,%r14d
1111 xor %edx,%r15d # f^g
1112
1113 xor %r14d,%r13d
1114 ror $14,%r14d
1115 and %ebx,%r15d # (f^g)&e
1116 mov %r12d,12(%rsp)
1117
1118 xor %r14d,%r13d # Sigma1(e)
1119 xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1120 add %r8d,%r12d # T1+=h
1121
1122 mov %r9d,%r8d
1123 add %r13d,%r12d # T1+=Sigma1(e)
1124
1125 add %r15d,%r12d # T1+=Ch(e,f,g)
1126 mov %r9d,%r13d
1127 mov %r9d,%r14d
1128
1129 ror $2,%r8d
1130 ror $13,%r13d
1131 mov %r9d,%r15d
1132 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1133
1134 xor %r13d,%r8d
1135 ror $9,%r13d
1136 or %r11d,%r14d # a|c
1137
1138 xor %r13d,%r8d # h=Sigma0(a)
1139 and %r11d,%r15d # a&c
1140 add %r12d,%eax # d+=T1
1141
1142 and %r10d,%r14d # (a|c)&b
1143 add %r12d,%r8d # h+=T1
1144
1145 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1146 lea 1(%rdi),%rdi # round++
1147
1148 add %r14d,%r8d # h+=Maj(a,b,c)
1149 mov 20(%rsp),%r13d
1150 mov 8(%rsp),%r12d
1151
1152 mov %r13d,%r15d
1153
1154 shr $3,%r13d
1155 ror $7,%r15d
1156
1157 xor %r15d,%r13d
1158 ror $11,%r15d
1159
1160 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1161 mov %r12d,%r14d
1162
1163 shr $10,%r12d
1164 ror $17,%r14d
1165
1166 xor %r14d,%r12d
1167 ror $2,%r14d
1168
1169 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1170
1171 add %r13d,%r12d
1172
1173 add 52(%rsp),%r12d
1174
1175 add 16(%rsp),%r12d
1176 mov %eax,%r13d
1177 mov %eax,%r14d
1178 mov %ebx,%r15d
1179
1180 ror $6,%r13d
1181 ror $11,%r14d
1182 xor %ecx,%r15d # f^g
1183
1184 xor %r14d,%r13d
1185 ror $14,%r14d
1186 and %eax,%r15d # (f^g)&e
1187 mov %r12d,16(%rsp)
1188
1189 xor %r14d,%r13d # Sigma1(e)
1190 xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1191 add %edx,%r12d # T1+=h
1192
1193 mov %r8d,%edx
1194 add %r13d,%r12d # T1+=Sigma1(e)
1195
1196 add %r15d,%r12d # T1+=Ch(e,f,g)
1197 mov %r8d,%r13d
1198 mov %r8d,%r14d
1199
1200 ror $2,%edx
1201 ror $13,%r13d
1202 mov %r8d,%r15d
1203 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1204
1205 xor %r13d,%edx
1206 ror $9,%r13d
1207 or %r10d,%r14d # a|c
1208
1209 xor %r13d,%edx # h=Sigma0(a)
1210 and %r10d,%r15d # a&c
1211 add %r12d,%r11d # d+=T1
1212
1213 and %r9d,%r14d # (a|c)&b
1214 add %r12d,%edx # h+=T1
1215
1216 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1217 lea 1(%rdi),%rdi # round++
1218
1219 add %r14d,%edx # h+=Maj(a,b,c)
1220 mov 24(%rsp),%r13d
1221 mov 12(%rsp),%r12d
1222
1223 mov %r13d,%r15d
1224
1225 shr $3,%r13d
1226 ror $7,%r15d
1227
1228 xor %r15d,%r13d
1229 ror $11,%r15d
1230
1231 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1232 mov %r12d,%r14d
1233
1234 shr $10,%r12d
1235 ror $17,%r14d
1236
1237 xor %r14d,%r12d
1238 ror $2,%r14d
1239
1240 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1241
1242 add %r13d,%r12d
1243
1244 add 56(%rsp),%r12d
1245
1246 add 20(%rsp),%r12d
1247 mov %r11d,%r13d
1248 mov %r11d,%r14d
1249 mov %eax,%r15d
1250
1251 ror $6,%r13d
1252 ror $11,%r14d
1253 xor %ebx,%r15d # f^g
1254
1255 xor %r14d,%r13d
1256 ror $14,%r14d
1257 and %r11d,%r15d # (f^g)&e
1258 mov %r12d,20(%rsp)
1259
1260 xor %r14d,%r13d # Sigma1(e)
1261 xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1262 add %ecx,%r12d # T1+=h
1263
1264 mov %edx,%ecx
1265 add %r13d,%r12d # T1+=Sigma1(e)
1266
1267 add %r15d,%r12d # T1+=Ch(e,f,g)
1268 mov %edx,%r13d
1269 mov %edx,%r14d
1270
1271 ror $2,%ecx
1272 ror $13,%r13d
1273 mov %edx,%r15d
1274 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1275
1276 xor %r13d,%ecx
1277 ror $9,%r13d
1278 or %r9d,%r14d # a|c
1279
1280 xor %r13d,%ecx # h=Sigma0(a)
1281 and %r9d,%r15d # a&c
1282 add %r12d,%r10d # d+=T1
1283
1284 and %r8d,%r14d # (a|c)&b
1285 add %r12d,%ecx # h+=T1
1286
1287 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1288 lea 1(%rdi),%rdi # round++
1289
1290 add %r14d,%ecx # h+=Maj(a,b,c)
1291 mov 28(%rsp),%r13d
1292 mov 16(%rsp),%r12d
1293
1294 mov %r13d,%r15d
1295
1296 shr $3,%r13d
1297 ror $7,%r15d
1298
1299 xor %r15d,%r13d
1300 ror $11,%r15d
1301
1302 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1303 mov %r12d,%r14d
1304
1305 shr $10,%r12d
1306 ror $17,%r14d
1307
1308 xor %r14d,%r12d
1309 ror $2,%r14d
1310
1311 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1312
1313 add %r13d,%r12d
1314
1315 add 60(%rsp),%r12d
1316
1317 add 24(%rsp),%r12d
1318 mov %r10d,%r13d
1319 mov %r10d,%r14d
1320 mov %r11d,%r15d
1321
1322 ror $6,%r13d
1323 ror $11,%r14d
1324 xor %eax,%r15d # f^g
1325
1326 xor %r14d,%r13d
1327 ror $14,%r14d
1328 and %r10d,%r15d # (f^g)&e
1329 mov %r12d,24(%rsp)
1330
1331 xor %r14d,%r13d # Sigma1(e)
1332 xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
1333 add %ebx,%r12d # T1+=h
1334
1335 mov %ecx,%ebx
1336 add %r13d,%r12d # T1+=Sigma1(e)
1337
1338 add %r15d,%r12d # T1+=Ch(e,f,g)
1339 mov %ecx,%r13d
1340 mov %ecx,%r14d
1341
1342 ror $2,%ebx
1343 ror $13,%r13d
1344 mov %ecx,%r15d
1345 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1346
1347 xor %r13d,%ebx
1348 ror $9,%r13d
1349 or %r8d,%r14d # a|c
1350
1351 xor %r13d,%ebx # h=Sigma0(a)
1352 and %r8d,%r15d # a&c
1353 add %r12d,%r9d # d+=T1
1354
1355 and %edx,%r14d # (a|c)&b
1356 add %r12d,%ebx # h+=T1
1357
1358 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1359 lea 1(%rdi),%rdi # round++
1360
1361 add %r14d,%ebx # h+=Maj(a,b,c)
1362 mov 32(%rsp),%r13d
1363 mov 20(%rsp),%r12d
1364
1365 mov %r13d,%r15d
1366
1367 shr $3,%r13d
1368 ror $7,%r15d
1369
1370 xor %r15d,%r13d
1371 ror $11,%r15d
1372
1373 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1374 mov %r12d,%r14d
1375
1376 shr $10,%r12d
1377 ror $17,%r14d
1378
1379 xor %r14d,%r12d
1380 ror $2,%r14d
1381
1382 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1383
1384 add %r13d,%r12d
1385
1386 add 0(%rsp),%r12d
1387
1388 add 28(%rsp),%r12d
1389 mov %r9d,%r13d
1390 mov %r9d,%r14d
1391 mov %r10d,%r15d
1392
1393 ror $6,%r13d
1394 ror $11,%r14d
1395 xor %r11d,%r15d # f^g
1396
1397 xor %r14d,%r13d
1398 ror $14,%r14d
1399 and %r9d,%r15d # (f^g)&e
1400 mov %r12d,28(%rsp)
1401
1402 xor %r14d,%r13d # Sigma1(e)
1403 xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1404 add %eax,%r12d # T1+=h
1405
1406 mov %ebx,%eax
1407 add %r13d,%r12d # T1+=Sigma1(e)
1408
1409 add %r15d,%r12d # T1+=Ch(e,f,g)
1410 mov %ebx,%r13d
1411 mov %ebx,%r14d
1412
1413 ror $2,%eax
1414 ror $13,%r13d
1415 mov %ebx,%r15d
1416 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1417
1418 xor %r13d,%eax
1419 ror $9,%r13d
1420 or %edx,%r14d # a|c
1421
1422 xor %r13d,%eax # h=Sigma0(a)
1423 and %edx,%r15d # a&c
1424 add %r12d,%r8d # d+=T1
1425
1426 and %ecx,%r14d # (a|c)&b
1427 add %r12d,%eax # h+=T1
1428
1429 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1430 lea 1(%rdi),%rdi # round++
1431
1432 add %r14d,%eax # h+=Maj(a,b,c)
1433 mov 36(%rsp),%r13d
1434 mov 24(%rsp),%r12d
1435
1436 mov %r13d,%r15d
1437
1438 shr $3,%r13d
1439 ror $7,%r15d
1440
1441 xor %r15d,%r13d
1442 ror $11,%r15d
1443
1444 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1445 mov %r12d,%r14d
1446
1447 shr $10,%r12d
1448 ror $17,%r14d
1449
1450 xor %r14d,%r12d
1451 ror $2,%r14d
1452
1453 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1454
1455 add %r13d,%r12d
1456
1457 add 4(%rsp),%r12d
1458
1459 add 32(%rsp),%r12d
1460 mov %r8d,%r13d
1461 mov %r8d,%r14d
1462 mov %r9d,%r15d
1463
1464 ror $6,%r13d
1465 ror $11,%r14d
1466 xor %r10d,%r15d # f^g
1467
1468 xor %r14d,%r13d
1469 ror $14,%r14d
1470 and %r8d,%r15d # (f^g)&e
1471 mov %r12d,32(%rsp)
1472
1473 xor %r14d,%r13d # Sigma1(e)
1474 xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1475 add %r11d,%r12d # T1+=h
1476
1477 mov %eax,%r11d
1478 add %r13d,%r12d # T1+=Sigma1(e)
1479
1480 add %r15d,%r12d # T1+=Ch(e,f,g)
1481 mov %eax,%r13d
1482 mov %eax,%r14d
1483
1484 ror $2,%r11d
1485 ror $13,%r13d
1486 mov %eax,%r15d
1487 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1488
1489 xor %r13d,%r11d
1490 ror $9,%r13d
1491 or %ecx,%r14d # a|c
1492
1493 xor %r13d,%r11d # h=Sigma0(a)
1494 and %ecx,%r15d # a&c
1495 add %r12d,%edx # d+=T1
1496
1497 and %ebx,%r14d # (a|c)&b
1498 add %r12d,%r11d # h+=T1
1499
1500 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1501 lea 1(%rdi),%rdi # round++
1502
1503 add %r14d,%r11d # h+=Maj(a,b,c)
1504 mov 40(%rsp),%r13d
1505 mov 28(%rsp),%r12d
1506
1507 mov %r13d,%r15d
1508
1509 shr $3,%r13d
1510 ror $7,%r15d
1511
1512 xor %r15d,%r13d
1513 ror $11,%r15d
1514
1515 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1516 mov %r12d,%r14d
1517
1518 shr $10,%r12d
1519 ror $17,%r14d
1520
1521 xor %r14d,%r12d
1522 ror $2,%r14d
1523
1524 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1525
1526 add %r13d,%r12d
1527
1528 add 8(%rsp),%r12d
1529
1530 add 36(%rsp),%r12d
1531 mov %edx,%r13d
1532 mov %edx,%r14d
1533 mov %r8d,%r15d
1534
1535 ror $6,%r13d
1536 ror $11,%r14d
1537 xor %r9d,%r15d # f^g
1538
1539 xor %r14d,%r13d
1540 ror $14,%r14d
1541 and %edx,%r15d # (f^g)&e
1542 mov %r12d,36(%rsp)
1543
1544 xor %r14d,%r13d # Sigma1(e)
1545 xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1546 add %r10d,%r12d # T1+=h
1547
1548 mov %r11d,%r10d
1549 add %r13d,%r12d # T1+=Sigma1(e)
1550
1551 add %r15d,%r12d # T1+=Ch(e,f,g)
1552 mov %r11d,%r13d
1553 mov %r11d,%r14d
1554
1555 ror $2,%r10d
1556 ror $13,%r13d
1557 mov %r11d,%r15d
1558 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1559
1560 xor %r13d,%r10d
1561 ror $9,%r13d
1562 or %ebx,%r14d # a|c
1563
1564 xor %r13d,%r10d # h=Sigma0(a)
1565 and %ebx,%r15d # a&c
1566 add %r12d,%ecx # d+=T1
1567
1568 and %eax,%r14d # (a|c)&b
1569 add %r12d,%r10d # h+=T1
1570
1571 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1572 lea 1(%rdi),%rdi # round++
1573
1574 add %r14d,%r10d # h+=Maj(a,b,c)
1575 mov 44(%rsp),%r13d
1576 mov 32(%rsp),%r12d
1577
1578 mov %r13d,%r15d
1579
1580 shr $3,%r13d
1581 ror $7,%r15d
1582
1583 xor %r15d,%r13d
1584 ror $11,%r15d
1585
1586 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1587 mov %r12d,%r14d
1588
1589 shr $10,%r12d
1590 ror $17,%r14d
1591
1592 xor %r14d,%r12d
1593 ror $2,%r14d
1594
1595 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1596
1597 add %r13d,%r12d
1598
1599 add 12(%rsp),%r12d
1600
1601 add 40(%rsp),%r12d
1602 mov %ecx,%r13d
1603 mov %ecx,%r14d
1604 mov %edx,%r15d
1605
1606 ror $6,%r13d
1607 ror $11,%r14d
1608 xor %r8d,%r15d # f^g
1609
1610 xor %r14d,%r13d
1611 ror $14,%r14d
1612 and %ecx,%r15d # (f^g)&e
1613 mov %r12d,40(%rsp)
1614
1615 xor %r14d,%r13d # Sigma1(e)
1616 xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1617 add %r9d,%r12d # T1+=h
1618
1619 mov %r10d,%r9d
1620 add %r13d,%r12d # T1+=Sigma1(e)
1621
1622 add %r15d,%r12d # T1+=Ch(e,f,g)
1623 mov %r10d,%r13d
1624 mov %r10d,%r14d
1625
1626 ror $2,%r9d
1627 ror $13,%r13d
1628 mov %r10d,%r15d
1629 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1630
1631 xor %r13d,%r9d
1632 ror $9,%r13d
1633 or %eax,%r14d # a|c
1634
1635 xor %r13d,%r9d # h=Sigma0(a)
1636 and %eax,%r15d # a&c
1637 add %r12d,%ebx # d+=T1
1638
1639 and %r11d,%r14d # (a|c)&b
1640 add %r12d,%r9d # h+=T1
1641
1642 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1643 lea 1(%rdi),%rdi # round++
1644
1645 add %r14d,%r9d # h+=Maj(a,b,c)
1646 mov 48(%rsp),%r13d
1647 mov 36(%rsp),%r12d
1648
1649 mov %r13d,%r15d
1650
1651 shr $3,%r13d
1652 ror $7,%r15d
1653
1654 xor %r15d,%r13d
1655 ror $11,%r15d
1656
1657 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1658 mov %r12d,%r14d
1659
1660 shr $10,%r12d
1661 ror $17,%r14d
1662
1663 xor %r14d,%r12d
1664 ror $2,%r14d
1665
1666 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1667
1668 add %r13d,%r12d
1669
1670 add 16(%rsp),%r12d
1671
1672 add 44(%rsp),%r12d
1673 mov %ebx,%r13d
1674 mov %ebx,%r14d
1675 mov %ecx,%r15d
1676
1677 ror $6,%r13d
1678 ror $11,%r14d
1679 xor %edx,%r15d # f^g
1680
1681 xor %r14d,%r13d
1682 ror $14,%r14d
1683 and %ebx,%r15d # (f^g)&e
1684 mov %r12d,44(%rsp)
1685
1686 xor %r14d,%r13d # Sigma1(e)
1687 xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1688 add %r8d,%r12d # T1+=h
1689
1690 mov %r9d,%r8d
1691 add %r13d,%r12d # T1+=Sigma1(e)
1692
1693 add %r15d,%r12d # T1+=Ch(e,f,g)
1694 mov %r9d,%r13d
1695 mov %r9d,%r14d
1696
1697 ror $2,%r8d
1698 ror $13,%r13d
1699 mov %r9d,%r15d
1700 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1701
1702 xor %r13d,%r8d
1703 ror $9,%r13d
1704 or %r11d,%r14d # a|c
1705
1706 xor %r13d,%r8d # h=Sigma0(a)
1707 and %r11d,%r15d # a&c
1708 add %r12d,%eax # d+=T1
1709
1710 and %r10d,%r14d # (a|c)&b
1711 add %r12d,%r8d # h+=T1
1712
1713 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1714 lea 1(%rdi),%rdi # round++
1715
1716 add %r14d,%r8d # h+=Maj(a,b,c)
1717 mov 52(%rsp),%r13d
1718 mov 40(%rsp),%r12d
1719
1720 mov %r13d,%r15d
1721
1722 shr $3,%r13d
1723 ror $7,%r15d
1724
1725 xor %r15d,%r13d
1726 ror $11,%r15d
1727
1728 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1729 mov %r12d,%r14d
1730
1731 shr $10,%r12d
1732 ror $17,%r14d
1733
1734 xor %r14d,%r12d
1735 ror $2,%r14d
1736
1737 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1738
1739 add %r13d,%r12d
1740
1741 add 20(%rsp),%r12d
1742
1743 add 48(%rsp),%r12d
1744 mov %eax,%r13d
1745 mov %eax,%r14d
1746 mov %ebx,%r15d
1747
1748 ror $6,%r13d
1749 ror $11,%r14d
1750 xor %ecx,%r15d # f^g
1751
1752 xor %r14d,%r13d
1753 ror $14,%r14d
1754 and %eax,%r15d # (f^g)&e
1755 mov %r12d,48(%rsp)
1756
1757 xor %r14d,%r13d # Sigma1(e)
1758 xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1759 add %edx,%r12d # T1+=h
1760
1761 mov %r8d,%edx
1762 add %r13d,%r12d # T1+=Sigma1(e)
1763
1764 add %r15d,%r12d # T1+=Ch(e,f,g)
1765 mov %r8d,%r13d
1766 mov %r8d,%r14d
1767
1768 ror $2,%edx
1769 ror $13,%r13d
1770 mov %r8d,%r15d
1771 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1772
1773 xor %r13d,%edx
1774 ror $9,%r13d
1775 or %r10d,%r14d # a|c
1776
1777 xor %r13d,%edx # h=Sigma0(a)
1778 and %r10d,%r15d # a&c
1779 add %r12d,%r11d # d+=T1
1780
1781 and %r9d,%r14d # (a|c)&b
1782 add %r12d,%edx # h+=T1
1783
1784 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1785 lea 1(%rdi),%rdi # round++
1786
1787 add %r14d,%edx # h+=Maj(a,b,c)
1788 mov 56(%rsp),%r13d
1789 mov 44(%rsp),%r12d
1790
1791 mov %r13d,%r15d
1792
1793 shr $3,%r13d
1794 ror $7,%r15d
1795
1796 xor %r15d,%r13d
1797 ror $11,%r15d
1798
1799 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1800 mov %r12d,%r14d
1801
1802 shr $10,%r12d
1803 ror $17,%r14d
1804
1805 xor %r14d,%r12d
1806 ror $2,%r14d
1807
1808 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1809
1810 add %r13d,%r12d
1811
1812 add 24(%rsp),%r12d
1813
1814 add 52(%rsp),%r12d
1815 mov %r11d,%r13d
1816 mov %r11d,%r14d
1817 mov %eax,%r15d
1818
1819 ror $6,%r13d
1820 ror $11,%r14d
1821 xor %ebx,%r15d # f^g
1822
1823 xor %r14d,%r13d
1824 ror $14,%r14d
1825 and %r11d,%r15d # (f^g)&e
1826 mov %r12d,52(%rsp)
1827
1828 xor %r14d,%r13d # Sigma1(e)
1829 xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
1830 add %ecx,%r12d # T1+=h
1831
1832 mov %edx,%ecx
1833 add %r13d,%r12d # T1+=Sigma1(e)
1834
1835 add %r15d,%r12d # T1+=Ch(e,f,g)
1836 mov %edx,%r13d
1837 mov %edx,%r14d
1838
1839 ror $2,%ecx
1840 ror $13,%r13d
1841 mov %edx,%r15d
1842 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1843
1844 xor %r13d,%ecx
1845 ror $9,%r13d
1846 or %r9d,%r14d # a|c
1847
1848 xor %r13d,%ecx # h=Sigma0(a)
1849 and %r9d,%r15d # a&c
1850 add %r12d,%r10d # d+=T1
1851
1852 and %r8d,%r14d # (a|c)&b
1853 add %r12d,%ecx # h+=T1
1854
1855 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1856 lea 1(%rdi),%rdi # round++
1857
1858 add %r14d,%ecx # h+=Maj(a,b,c)
1859 mov 60(%rsp),%r13d
1860 mov 48(%rsp),%r12d
1861
1862 mov %r13d,%r15d
1863
1864 shr $3,%r13d
1865 ror $7,%r15d
1866
1867 xor %r15d,%r13d
1868 ror $11,%r15d
1869
1870 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1871 mov %r12d,%r14d
1872
1873 shr $10,%r12d
1874 ror $17,%r14d
1875
1876 xor %r14d,%r12d
1877 ror $2,%r14d
1878
1879 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1880
1881 add %r13d,%r12d
1882
1883 add 28(%rsp),%r12d
1884
1885 add 56(%rsp),%r12d
1886 mov %r10d,%r13d
1887 mov %r10d,%r14d
1888 mov %r11d,%r15d
1889
1890 ror $6,%r13d
1891 ror $11,%r14d
1892 xor %eax,%r15d # f^g
1893
1894 xor %r14d,%r13d
1895 ror $14,%r14d
1896 and %r10d,%r15d # (f^g)&e
1897 mov %r12d,56(%rsp)
1898
1899 xor %r14d,%r13d # Sigma1(e)
1900 xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
1901 add %ebx,%r12d # T1+=h
1902
1903 mov %ecx,%ebx
1904 add %r13d,%r12d # T1+=Sigma1(e)
1905
1906 add %r15d,%r12d # T1+=Ch(e,f,g)
1907 mov %ecx,%r13d
1908 mov %ecx,%r14d
1909
1910 ror $2,%ebx
1911 ror $13,%r13d
1912 mov %ecx,%r15d
1913 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1914
1915 xor %r13d,%ebx
1916 ror $9,%r13d
1917 or %r8d,%r14d # a|c
1918
1919 xor %r13d,%ebx # h=Sigma0(a)
1920 and %r8d,%r15d # a&c
1921 add %r12d,%r9d # d+=T1
1922
1923 and %edx,%r14d # (a|c)&b
1924 add %r12d,%ebx # h+=T1
1925
1926 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1927 lea 1(%rdi),%rdi # round++
1928
1929 add %r14d,%ebx # h+=Maj(a,b,c)
1930 mov 0(%rsp),%r13d
1931 mov 52(%rsp),%r12d
1932
1933 mov %r13d,%r15d
1934
1935 shr $3,%r13d
1936 ror $7,%r15d
1937
1938 xor %r15d,%r13d
1939 ror $11,%r15d
1940
1941 xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
1942 mov %r12d,%r14d
1943
1944 shr $10,%r12d
1945 ror $17,%r14d
1946
1947 xor %r14d,%r12d
1948 ror $2,%r14d
1949
1950 xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
1951
1952 add %r13d,%r12d
1953
1954 add 32(%rsp),%r12d
1955
1956 add 60(%rsp),%r12d
1957 mov %r9d,%r13d
1958 mov %r9d,%r14d
1959 mov %r10d,%r15d
1960
1961 ror $6,%r13d
1962 ror $11,%r14d
1963 xor %r11d,%r15d # f^g
1964
1965 xor %r14d,%r13d
1966 ror $14,%r14d
1967 and %r9d,%r15d # (f^g)&e
1968 mov %r12d,60(%rsp)
1969
1970 xor %r14d,%r13d # Sigma1(e)
1971 xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
1972 add %eax,%r12d # T1+=h
1973
1974 mov %ebx,%eax
1975 add %r13d,%r12d # T1+=Sigma1(e)
1976
1977 add %r15d,%r12d # T1+=Ch(e,f,g)
1978 mov %ebx,%r13d
1979 mov %ebx,%r14d
1980
1981 ror $2,%eax
1982 ror $13,%r13d
1983 mov %ebx,%r15d
1984 add (%rbp,%rdi,4),%r12d # T1+=K[round]
1985
1986 xor %r13d,%eax
1987 ror $9,%r13d
1988 or %edx,%r14d # a|c
1989
1990 xor %r13d,%eax # h=Sigma0(a)
1991 and %edx,%r15d # a&c
1992 add %r12d,%r8d # d+=T1
1993
1994 and %ecx,%r14d # (a|c)&b
1995 add %r12d,%eax # h+=T1
1996
1997 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
1998 lea 1(%rdi),%rdi # round++
1999
2000 add %r14d,%eax # h+=Maj(a,b,c)
2001 cmp $64,%rdi
2002 jb .Lrounds_16_xx
2003
2004 mov 16*4+0*8(%rsp),%rdi
2005 lea 16*4(%rsi),%rsi
2006
2007 add 4*0(%rdi),%eax
2008 add 4*1(%rdi),%ebx
2009 add 4*2(%rdi),%ecx
2010 add 4*3(%rdi),%edx
2011 add 4*4(%rdi),%r8d
2012 add 4*5(%rdi),%r9d
2013 add 4*6(%rdi),%r10d
2014 add 4*7(%rdi),%r11d
2015
2016 cmp 16*4+2*8(%rsp),%rsi
2017
2018 mov %eax,4*0(%rdi)
2019 mov %ebx,4*1(%rdi)
2020 mov %ecx,4*2(%rdi)
2021 mov %edx,4*3(%rdi)
2022 mov %r8d,4*4(%rdi)
2023 mov %r9d,4*5(%rdi)
2024 mov %r10d,4*6(%rdi)
2025 mov %r11d,4*7(%rdi)
2026 jb .Lloop
2027
2028 mov 16*4+3*8(%rsp),%rsp
2029 pop %r15
2030 pop %r14
2031 pop %r13
2032 pop %r12
2033 pop %rbp
2034 pop %rbx
2035
2036 ret
2037 SET_SIZE(SHA256TransformBlocks)
2038
/*
 * K256: the 64 SHA-256 round constants K[0..63], stored as consecutive
 * 32-bit words (16 rows of 4 entries each). Per FIPS 180-4 these are the
 * first 32 bits of the fractional parts of the cube roots of the first
 * 64 prime numbers.
 *
 * The table is aligned to a 64-byte boundary and typed as a data object.
 * Entry order and the 4-byte entry width must not change: the round code
 * fetches K[round] with a scaled index, (%rbp,%rdi,4).
 * NOTE(review): presumably %rbp is pointed at K256 earlier in
 * SHA256TransformBlocks -- confirm against the function prologue.
 */
2039 .align 64
2040 .type K256,@object
2041 K256:
2042 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 /* K[ 0.. 3] */
2043 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 /* K[ 4.. 7] */
2044 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 /* K[ 8..11] */
2045 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 /* K[12..15] */
2046 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc /* K[16..19] */
2047 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da /* K[20..23] */
2048 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 /* K[24..27] */
2049 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 /* K[28..31] */
2050 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 /* K[32..35] */
2051 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 /* K[36..39] */
2052 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 /* K[40..43] */
2053 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 /* K[44..47] */
2054 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 /* K[48..51] */
2055 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 /* K[52..55] */
2056 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 /* K[56..59] */
2057 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 /* K[60..63] */
2058 #endif /* !lint && !__lint */
2059
/*
 * On ELF targets only, emit an empty .note.GNU-stack section with no
 * flags. GNU toolchains read this marker as a declaration that the
 * object does not require an executable stack.
 */
2060 #ifdef __ELF__
2061 .section .note.GNU-stack,"",%progbits
2062 #endif