]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/avx2/sha512_x4_avx2.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx2 / sha512_x4_avx2.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28;; code to compute quad SHA512 using AVX
29;; use YMMs to tackle the larger digest size
30;; outer calling routine takes care of save and restore of XMM registers
31;; Logic designed/laid out by JDG
32
33;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
34;; Stack must be aligned to 32 bytes before call
35;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12
36;; Windows preserves: rcx rsi rdi rbp r13 r14 r15
37;;
38;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12
39;; Linux preserves: rcx rdx rdi rbp r13 r14 r15
40;;
41;; clobbers ymm0-15
42
f67539c2 43%include "include/os.asm"
11fdf7f2 44;%define DO_DBGPRINT
f67539c2
TL
45%include "include/dbgprint.asm"
46%include "include/transpose_avx2.asm"
47%include "include/dbgprint.asm"
11fdf7f2
TL
48%include "mb_mgr_datastruct.asm"
49
50section .data
51default rel
52align 64
53K512_4:
54 dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
55 dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
56 dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
57 dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
58 dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
59 dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
60 dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
61 dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
62 dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
63 dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
64 dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
65 dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
66 dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
67 dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
68 dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
69 dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
70 dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
71 dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
72 dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
73 dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
74 dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
75 dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
76 dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
77 dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
78 dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
79 dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
80 dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
81 dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
82 dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
83 dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
84 dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
85 dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
86 dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
87 dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
88 dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
89 dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
90 dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
91 dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
92 dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
93 dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
94 dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
95 dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
96 dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
97 dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
98 dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
99 dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
100 dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
101 dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
102 dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
103 dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
104 dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
105 dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
106 dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
107 dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
108 dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
109 dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
110 dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
111 dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
112 dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
113 dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
114 dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
115 dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
116 dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
117 dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
118 dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
119 dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
120 dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
121 dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
122 dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
123 dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
124 dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
125 dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
126 dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
127 dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
128 dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
129 dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
130 dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
131 dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
132 dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
133 dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
134
135align 32
136PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
137 dq 0x0001020304050607, 0x08090a0b0c0d0e0f
138 ;ddq 0x18191a1b1c1d1e1f1011121314151617
139 dq 0x1011121314151617, 0x18191a1b1c1d1e1f
140
141section .text
142
143%ifdef LINUX
144%define arg1 rdi
145%define arg2 rsi
146%else
147%define arg1 rcx
148%define arg2 rdx
149%endif
150
151; Common definitions
152%define STATE arg1
153%define INP_SIZE arg2
154
155%define IDX rax
156%define ROUND rbx
157%define TBL r8
158
159%define inp0 r9
160%define inp1 r10
161%define inp2 r11
162%define inp3 r12
163
164%define a ymm0
165%define b ymm1
166%define c ymm2
167%define d ymm3
168%define e ymm4
169%define f ymm5
170%define g ymm6
171%define h ymm7
172
173%define a0 ymm8
174%define a1 ymm9
175%define a2 ymm10
176
177%define TT0 ymm14
178%define TT1 ymm13
179%define TT2 ymm12
180%define TT3 ymm11
181%define TT4 ymm10
182%define TT5 ymm9
183
184%define T1 ymm14
185%define TMP ymm15
186
187
188
189%define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
190%define ROUNDS 80*SZ4
191
192; Define stack usage
193
194;; Assume stack aligned to 32 bytes before call
195;; Therefore FRAMESZ mod 32 must be 32-8 = 24
196struc stack_frame
197 .data resb 16*SZ4
198 .digest resb NUM_SHA512_DIGEST_WORDS*SZ4
199 .align resb 24
200endstruc
201
202%define _DIGEST stack_frame.digest
203
11fdf7f2
TL
204%macro ROTATE_ARGS 0
205%xdefine TMP_ h
206%xdefine h g
207%xdefine g f
208%xdefine f e
209%xdefine e d
210%xdefine d c
211%xdefine c b
212%xdefine b a
213%xdefine a TMP_
214%endm
215
216; PRORQ reg, imm, tmp
217; packed-rotate-right-double
218; does a rotate by doing two shifts and an or
219%macro PRORQ 3
220%define %%reg %1
221%define %%imm %2
222%define %%tmp %3
223 vpsllq %%tmp, %%reg, (64-(%%imm))
224 vpsrlq %%reg, %%reg, %%imm
225 vpor %%reg, %%reg, %%tmp
226%endmacro
227
228; non-destructive
229; PRORQ_nd reg, imm, tmp, src
230%macro PRORQ_nd 4
231%define %%reg %1
232%define %%imm %2
233%define %%tmp %3
234%define %%src %4
235 vpsllq %%tmp, %%src, (64-(%%imm))
236 vpsrlq %%reg, %%src, %%imm
237 vpor %%reg, %%reg, %%tmp
238%endmacro
239
240; PRORQ dst/src, amt
241%macro PRORQ 2
242 PRORQ %1, %2, TMP
243%endmacro
244
245; PRORQ_nd dst, src, amt
246%macro PRORQ_nd 3
247 PRORQ_nd %1, %3, TMP, %2
248%endmacro
249
250
251
252;; arguments passed implicitly in preprocessor symbols i, a...h
253%macro ROUND_00_15 2
254%define %%T1 %1
255%define %%i %2
256 PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
257
258 vpxor a2, f, g ; ch: a2 = f^g
259 vpand a2, a2, e ; ch: a2 = (f^g)&e
260 vpxor a2, a2, g ; a2 = ch
261
262 PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
263 vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1
264 vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
265 vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
266 PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
267 vpaddq h, h, a2 ; h = h + ch
268 PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
269 vpaddq h, h, %%T1 ; h = h + ch + W + K
270 vpxor a0, a0, a1 ; a0 = sigma1
271 vmovdqa %%T1, a ; maj: T1 = a
272 PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
273 vpxor %%T1, %%T1, c ; maj: T1 = a^c
274 add ROUND, SZ4 ; ROUND++
275 vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
276 vpaddq h, h, a0
277
278 vpaddq d, d, h
279
280 vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
281 PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
282 vpxor a2, a2, a1 ; a2 = sig0
283 vpand a1, a, c ; maj: a1 = a&c
284 vpor a1, a1, %%T1 ; a1 = maj
285 vpaddq h, h, a1 ; h = h + ch + W + K + maj
286 vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
287 ROTATE_ARGS
288%endm
289
290
291;; arguments passed implicitly in preprocessor symbols i, a...h
292%macro ROUND_16_XX 2
293%define %%T1 %1
294%define %%i %2
295 vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
296 vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
297 vmovdqa a0, %%T1
298 PRORQ %%T1, 8-1
299 vmovdqa a2, a1
300 PRORQ a1, 61-19
301 vpxor %%T1, %%T1, a0
302 PRORQ %%T1, 1
303 vpxor a1, a1, a2
304 PRORQ a1, 19
305 vpsrlq a0, a0, 7
306 vpxor %%T1, %%T1, a0
307 vpsrlq a2, a2, 6
308 vpxor a1, a1, a2
309 vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
310 vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
311 vpaddq %%T1, %%T1, a1
312
313 ROUND_00_15 %%T1, %%i
314
315%endm
316
317
318;; void sha512_x4_avx2(void *STATE, const int INP_SIZE)
319;; arg 1 : STATE : pointer to input data
320;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
321MKGLOBAL(sha512_x4_avx2,function,internal)
322align 32
323sha512_x4_avx2:
324 ; general registers preserved in outer calling routine
325 ; outer calling routine saves all the XMM registers
326
327 sub rsp, stack_frame_size
328
329 ;; Load the pre-transposed incoming digest.
330 vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE]
331 vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE]
332 vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE]
333 vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE]
334 vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE]
335 vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE]
336 vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE]
337 vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE]
338
339 DBGPRINTL_YMM "sha512-avx2 Incoming digest", a, b, c, d, e, f, g, h
340 lea TBL,[K512_4]
341
342 ;; load the address of each of the MAX_LANES (4) message lanes
343 ;; getting ready to transpose input onto stack
344 mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ]
345 mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ]
346 mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ]
347 mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ]
348
349 xor IDX, IDX
350lloop:
351 xor ROUND, ROUND
352
353 ;; save old digest
354 vmovdqa [rsp + _DIGEST + 0*SZ4], a
355 vmovdqa [rsp + _DIGEST + 1*SZ4], b
356 vmovdqa [rsp + _DIGEST + 2*SZ4], c
357 vmovdqa [rsp + _DIGEST + 3*SZ4], d
358 vmovdqa [rsp + _DIGEST + 4*SZ4], e
359 vmovdqa [rsp + _DIGEST + 5*SZ4], f
360 vmovdqa [rsp + _DIGEST + 6*SZ4], g
361 vmovdqa [rsp + _DIGEST + 7*SZ4], h
362
363%assign i 0
364%rep 4
365 ;; load up the shuffler for little-endian to big-endian format
366 vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
f67539c2
TL
367
368 TRANSPOSE4_U64_LOAD4 TT4, TT1, TT5, TT3, inp0, inp1, inp2, inp3, IDX+i*32
369
370 TRANSPOSE4_U64 TT4, TT1, TT5, TT3, TT0, TT2
371 DBGPRINTL_YMM "sha512-avx2 Incoming data", TT0, TT1, TT2, TT3
11fdf7f2
TL
372 vpshufb TT0, TT0, TMP
373 vpshufb TT1, TT1, TMP
374 vpshufb TT2, TT2, TMP
375 vpshufb TT3, TT3, TMP
376 ROUND_00_15 TT0,(i*4+0)
377 ROUND_00_15 TT1,(i*4+1)
378 ROUND_00_15 TT2,(i*4+2)
379 ROUND_00_15 TT3,(i*4+3)
380%assign i (i+1)
381%endrep
382;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes)
383 add IDX, 4 * 32
384
385%assign i (i*4)
386
387 jmp Lrounds_16_xx
388align 16
389Lrounds_16_xx:
390%rep 16
391 ROUND_16_XX T1, i
392%assign i (i+1)
393%endrep
394
395 cmp ROUND,ROUNDS
396 jb Lrounds_16_xx
397
398 ;; add old digest
399 vpaddq a, a, [rsp + _DIGEST + 0*SZ4]
400 vpaddq b, b, [rsp + _DIGEST + 1*SZ4]
401 vpaddq c, c, [rsp + _DIGEST + 2*SZ4]
402 vpaddq d, d, [rsp + _DIGEST + 3*SZ4]
403 vpaddq e, e, [rsp + _DIGEST + 4*SZ4]
404 vpaddq f, f, [rsp + _DIGEST + 5*SZ4]
405 vpaddq g, g, [rsp + _DIGEST + 6*SZ4]
406 vpaddq h, h, [rsp + _DIGEST + 7*SZ4]
407
408 sub INP_SIZE, 1 ;; consumed one message block
409 jne lloop
410
411 ; write back to memory (state object) the transposed digest
412 vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a
413 vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b
414 vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c
415 vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d
416 vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e
417 vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f
418 vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g
419 vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h
420 DBGPRINTL_YMM "sha512-avx2 Outgoing digest", a, b, c, d, e, f, g, h
421
422 ;; update input data pointers
423 add inp0, IDX
424 mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
425 add inp1, IDX
426 mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
427 add inp2, IDX
428 mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2
429 add inp3, IDX
430 mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3
431
432 ;;;;;;;;;;;;;;;;
433 ;; Postamble
434
f67539c2
TL
435 ;; Clear stack frame ((16 + 8)*32 bytes)
436%ifdef SAFE_DATA
437 vpxor ymm0, ymm0
438%assign i 0
439%rep (16+NUM_SHA512_DIGEST_WORDS)
440 vmovdqa [rsp + i*SZ4], ymm0
441%assign i (i+1)
442%endrep
443%endif
444
11fdf7f2
TL
445 add rsp, stack_frame_size
446
447 ; outer calling routine restores XMM and other GP registers
448 ret
449
450%ifdef LINUX
451section .note.GNU-stack noalloc noexec nowrite progbits
452%endif