]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;; |
2 | ;; Copyright (c) 2017-2018, Intel Corporation | |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
28 | ;; In System V AMD64 ABI | |
29 | ;; callee saves: RBX, RBP, R12-R15 | |
30 | ;; Windows x64 ABI | |
31 | ;; callee saves: RBX, RBP, RDI, RSI, R12-R15 | |
32 | ;; | |
33 | ;; Clobbers ZMM0-31 | |
34 | ||
f67539c2 | 35 | %include "include/os.asm" |
11fdf7f2 TL |
36 | %include "job_aes_hmac.asm" |
37 | %include "mb_mgr_datastruct.asm" | |
f67539c2 | 38 | %include "include/reg_sizes.asm" |
11fdf7f2 TL |
39 | |
40 | extern sha512_x8_avx512 | |
41 | ||
42 | section .data | |
43 | default rel | |
44 | ||
;; dupw: vpshufb control mask. Every 16-bit word of the mask selects source
;; bytes 0 and 1, so word 0 of the shuffled register is replicated into all
;; eight word positions. Applied to the vphminposuw result so the minimum
;; length can be subtracted from every lane with a single vpsubw.
45 | align 16 | |
46 | dupw: ;ddq 0x01000100010001000100010001000100 | |
47 | dq 0x0100010001000100, 0x0100010001000100 | |
48 | ||
;; byteswap: vpshufb control mask that reverses the byte order inside each
;; 64-bit half of an xmm register (bytes 7..0 -> 0..7 and 15..8 -> 8..15).
;; Used when digest words are copied into a lane's outer block.
49 | align 16 | |
50 | byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 | |
51 | dq 0x0001020304050607, 0x08090a0b0c0d0e0f | |
52 | ||
;; len_masks: table of eight 16-byte masks, indexed as len_masks + 16*I.
;; Entry I has 0xFFFF in 16-bit word I and zero everywhere else. The flush
;; loop ORs entry I into the lane-length vector when lane I has no job, which
;; forces that lane's length to 0xFFFF (the largest unsigned word) so it is
;; not reported as the minimum while any lane holds a smaller length.
53 | align 16 | |
54 | len_masks: | |
55 | ;ddq 0x0000000000000000000000000000FFFF | |
56 | dq 0x000000000000FFFF, 0x0000000000000000 | |
57 | ;ddq 0x000000000000000000000000FFFF0000 | |
58 | dq 0x00000000FFFF0000, 0x0000000000000000 | |
59 | ;ddq 0x00000000000000000000FFFF00000000 | |
60 | dq 0x0000FFFF00000000, 0x0000000000000000 | |
61 | ;ddq 0x0000000000000000FFFF000000000000 | |
62 | dq 0xFFFF000000000000, 0x0000000000000000 | |
63 | ;ddq 0x000000000000FFFF0000000000000000 | |
64 | dq 0x0000000000000000, 0x000000000000FFFF | |
65 | ;ddq 0x00000000FFFF00000000000000000000 | |
66 | dq 0x0000000000000000, 0x00000000FFFF0000 | |
67 | ;ddq 0x0000FFFF000000000000000000000000 | |
68 | dq 0x0000000000000000, 0x0000FFFF00000000 | |
69 | ;ddq 0xFFFF0000000000000000000000000000 | |
70 | dq 0x0000000000000000, 0xFFFF000000000000 | |
71 | ||
;; lane_1 .. lane_7: the lane indices 1..7 stored as qwords in memory.
;; The lane search loads them with cmovne, which accepts a register or memory
;; source operand but not an immediate.
72 | lane_1: dq 1 | |
73 | lane_2: dq 2 | |
74 | lane_3: dq 3 | |
75 | lane_4: dq 4 | |
76 | lane_5: dq 5 | |
77 | lane_6: dq 6 | |
78 | lane_7: dq 7 | |
79 | ||
80 | section .text | |
81 | ||
;; Argument registers and symbolic register names.
;; arg1/arg2 are rdi/rsi when LINUX is defined and rcx/rdx otherwise.
;;
;; NOTE: several names below are aliases of the same physical register, so
;; only one name from each group can hold a live value at any time:
;;   arg2 : job, len2, extra_blocks, p
;;   rbx  : unused_lanes, lane_data, tmp2
;;   rax  : job_rax, size_offset, tmp, start_offset
82 | %ifdef LINUX | |
83 | %define arg1 rdi | |
84 | %define arg2 rsi | |
85 | %else | |
86 | %define arg1 rcx | |
87 | %define arg2 rdx | |
88 | %endif | |
89 | ||
;; state stays in arg1 for the whole function; arg2 is reused for several roles.
90 | %define state arg1 | |
91 | %define job arg2 | |
92 | %define len2 arg2 | |
93 | ||
94 | ||
95 | ; idx needs to be in rbp | |
96 | %define idx rbp | |
97 | ||
98 | %define unused_lanes rbx | |
99 | %define lane_data rbx | |
100 | %define tmp2 rbx | |
101 | ||
;; rax doubles as the return-value register (job_rax) and as scratch.
102 | %define job_rax rax | |
103 | ||
104 | %define size_offset rax | |
105 | %define tmp rax | |
106 | %define start_offset rax | |
107 | ||
108 | %define extra_blocks arg2 | |
109 | %define p arg2 | |
110 | ||
;; Extra scratch registers used by the digest-copy code.
111 | %define tmp4 r8 | |
112 | %define tmp5 r9 | |
113 | %define tmp6 r10 | |
114 | ||
;; STACK: layout of the local stack frame.
;;   _gpr_save : seven qword slots; the prologue stores rbx, rbp, r12, r13,
;;               r14, r15 in slots 0..5 and, when LINUX is not defined,
;;               rdi in slot 6.
;;   _rsp_save : caller's rsp, recorded before the frame is allocated and
;;               realigned, and reloaded by the epilogue.
115 | struc STACK | |
116 | _gpr_save: resq 7 ; rbx, rbp, r12-r15, rdi (windows) | |
117 | _rsp_save: resq 1 | |
118 | endstruc | |
119 | ||
;; APPEND(a,b): token-pasting helper (a %+ b). Used inside %rep loops to build
;; per-iteration identifiers such as lane_1..lane_7, skip_0..skip_7 and
;; skip_clear_0..skip_clear_7.
120 | %define APPEND(a,b) a %+ b | |
121 | ||
;; ---------------------------------------------------------------------------
;; Flush entry point for the HMAC-SHA-512 / HMAC-SHA-384 multi-buffer manager.
;; Built as flush_job_hmac_sha_512_avx512 by default, or as
;; flush_job_hmac_sha_384_avx512 when SHA384 is defined.
;;
;; In:    arg1 = state (MB_MGR_HMAC_SHA_512_OOO *)
;; Out:   rax  = pointer to a completed job, or 0 when bit 35 of the
;;               unused-lanes word is set (see return_null)
;; Saves: rbx, rbp, r12-r15 (and rdi when LINUX is not defined) in the STACK
;;        frame; rsp is realigned to 32 bytes and restored on exit.
;; Calls: sha512_x8_avx512
;; ---------------------------------------------------------------------------
122 | %ifndef SHA384 | |
123 | ; JOB* flush_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state) | |
124 | ; arg 1 : state | |
125 | %define SHA_X_DIGEST_SIZE 512 | |
126 | MKGLOBAL(flush_job_hmac_sha_512_avx512,function,internal) | |
127 | align 64 | |
128 | flush_job_hmac_sha_512_avx512: | |
129 | %else | |
130 | ; JOB* flush_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state) | |
131 | ; arg 1 : state | |
132 | %define SHA_X_DIGEST_SIZE 384 | |
133 | MKGLOBAL(flush_job_hmac_sha_384_avx512,function,internal) | |
134 | align 64 | |
135 | flush_job_hmac_sha_384_avx512: | |
136 | %endif | |
;; Prologue: remember the incoming rsp in rax, carve out the STACK frame,
;; align it down to 32 bytes, then spill the callee-saved registers.
137 | mov rax, rsp | |
138 | sub rsp, STACK_size | |
139 | and rsp, -32 | |
140 | mov [rsp + _gpr_save + 8*0], rbx | |
141 | mov [rsp + _gpr_save + 8*1], rbp | |
142 | mov [rsp + _gpr_save + 8*2], r12 | |
143 | mov [rsp + _gpr_save + 8*3], r13 | |
144 | mov [rsp + _gpr_save + 8*4], r14 | |
145 | mov [rsp + _gpr_save + 8*5], r15 | |
146 | %ifndef LINUX | |
147 | mov [rsp + _gpr_save + 8*6], rdi | |
148 | %endif | |
149 | mov [rsp + _rsp_save], rax ; original SP | |
150 | ||
;; Nothing to flush if bit 35 of the unused-lanes word is set.
;; NOTE(review): presumably that bit is set only while every lane is free -
;; confirm against the value the manager initialises _unused_lanes_sha512 to.
151 | mov unused_lanes, [state + _unused_lanes_sha512] | |
152 | bt unused_lanes, 32+3 | |
153 | jc return_null | |
154 | ||
;; idx starts at 0 and is overwritten by each lane 1..7 whose job pointer is
;; non-null, so it ends as the highest-numbered such lane (or stays 0).
155 | ; find a lane with a non-null job | |
156 | xor idx, idx | |
157 | %assign I 1 | |
158 | %rep 7 | |
159 | cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 | |
160 | cmovne idx, [rel APPEND(lane_, I)] | |
161 | %assign I (I+1) | |
162 | %endrep | |
163 | ||
;; copy_lane_data: top of the flush loop (re-entered from proc_outer and
;; proc_extra_blocks). Expects idx = a lane that holds a job.
;;  1. Every lane without a job gets lane idx's data pointer and has its
;;     length word forced to 0xFFFF through len_masks.
;;  2. vphminposuw picks the smallest length and its lane; idx becomes that lane.
;;  3. If the minimum is non-zero it is subtracted from all lanes and
;;     sha512_x8_avx512 is called with arg1 = state and arg2 = that minimum.
;; Falls through to len_is_0 afterwards.
164 | copy_lane_data: | |
165 | ; copy good lane (idx) to empty lanes | |
166 | vmovdqa xmm0, [state + _lens_sha512] | |
167 | mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] | |
168 | ||
;; Unrolled over lanes 0..7; skip_<I> is the per-lane "lane has a job" target.
169 | %assign I 0 | |
170 | %rep 8 | |
171 | cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 | |
172 | jne APPEND(skip_,I) | |
173 | mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp | |
174 | vpor xmm0, xmm0, [rel len_masks + 16*I] | |
175 | APPEND(skip_,I): | |
176 | %assign I (I+1) | |
177 | %endrep | |
178 | ||
179 | vmovdqa [state + _lens_sha512], xmm0 | |
180 | ||
;; The 32-bit destination form of vpextrw zero-extends, so the 64-bit
;; len2 and idx registers hold clean values afterwards.
181 | vphminposuw xmm1, xmm0 | |
182 | vpextrw DWORD(len2), xmm1, 0 ; min value | |
183 | vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) | |
184 | cmp len2, 0 | |
185 | je len_is_0 | |
186 | ||
187 | vpshufb xmm1, [rel dupw] ; duplicate words across all 8 lanes | |
188 | vpsubw xmm0, xmm0, xmm1 | |
189 | vmovdqa [state + _lens_sha512], xmm0 | |
190 | ||
191 | ; "state" and "args" are the same address, arg1 | |
192 | ; len is arg2 | |
193 | call sha512_x8_avx512 | |
194 | ; state and idx are intact | |
195 | ||
;; len_is_0: lane idx has no length left. Point lane_data at its lane record
;; and choose the next step:
;;   extra_blocks != 0 -> proc_extra_blocks
;;   outer_done   != 0 -> end_loop (job finished)
;;   otherwise         -> fall through to proc_outer
196 | len_is_0: | |
197 | ; process completed job "idx" | |
198 | imul lane_data, idx, _SHA512_LANE_DATA_size | |
199 | lea lane_data, [state + _ldata_sha512 + lane_data] | |
200 | mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] | |
201 | cmp extra_blocks, 0 | |
202 | jne proc_extra_blocks | |
203 | cmp dword [lane_data + _outer_done_sha512], 0 | |
204 | jne end_loop | |
205 | ||
;; proc_outer: set up the outer hash for lane idx.
;;  - marks outer_done, sets the lane's length word to 1 and points its data
;;    pointer at the lane's outer block;
;;  - copies the lane's current digest words, byte-swapped, into the outer block;
;;  - reloads the lane's digest rows from the 64 bytes at job->_auth_key_xor_opad.
;; Then loops back to copy_lane_data.
206 | proc_outer: | |
207 | mov dword [lane_data + _outer_done_sha512], 1 | |
208 | mov DWORD(size_offset), [lane_data + _size_offset_sha512] | |
;; NOTE(review): zeroes one qword of the extra block at size_offset; presumably
;; this resets part of the stored message-length field - confirm.
209 | mov qword [lane_data + _extra_block_sha512 + size_offset], 0 | |
210 | mov word [state + _lens_sha512 + 2*idx], 1 | |
;; tmp (rax) is reused here: size_offset is dead from this point on.
211 | lea tmp, [lane_data + _outer_block_sha512] | |
212 | mov job, [lane_data + _job_in_lane_sha512] | |
213 | mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp | |
214 | ||
;; Each iteration gathers two digest words (rows 2*I and 2*I+1 of lane idx),
;; byte-swaps each qword and stores 16 bytes. SHA_X_DIGEST_SIZE / 128 gives
;; 4 iterations for SHA-512 and 3 for SHA-384.
215 | ; move digest into data location | |
216 | %assign I 0 | |
217 | %rep (SHA_X_DIGEST_SIZE / (8*16)) | |
218 | vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE] | |
219 | vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 | |
220 | vpshufb xmm0, [rel byteswap] | |
221 | vmovdqa [lane_data + _outer_block_sha512 + I*2*SHA512_DIGEST_WORD_SIZE], xmm0 | |
222 | %assign I (I+1) | |
223 | %endrep | |
224 | ||
225 | ; move the opad key into digest | |
226 | mov tmp, [job + _auth_key_xor_opad] | |
227 | ||
;; Always 4 x 16 bytes (eight qwords) for both variants; each 16-byte load is
;; split across two consecutive digest rows of lane idx.
228 | %assign I 0 | |
229 | %rep 4 | |
230 | vmovdqu xmm0, [tmp + I * 16] | |
231 | vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0 | |
232 | vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 | |
233 | %assign I (I+1) | |
234 | %endrep | |
235 | ||
236 | jmp copy_lane_data | |
237 | ||
;; proc_extra_blocks: lane idx still has extra blocks pending. Set its length
;; word to the pending count, point its data pointer at
;; extra_block + start_offset, clear the pending count in the lane record and
;; loop back to copy_lane_data.
238 | align 32 | |
239 | proc_extra_blocks: | |
240 | mov DWORD(start_offset), [lane_data + _start_offset_sha512] | |
241 | mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) | |
;; tmp and start_offset are both rax: the lea consumes the offset and
;; replaces it with the resulting address.
242 | lea tmp, [lane_data + _extra_block_sha512 + start_offset] | |
243 | mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp | |
244 | mov dword [lane_data + _extra_blocks_sha512], 0 | |
245 | jmp copy_lane_data | |
246 | ||
;; return_null: reached from the entry check; return 0 (NULL) through the
;; common epilogue.
247 | return_null: | |
248 | xor job_rax, job_rax | |
249 | jmp return | |
250 | ||
;; end_loop: lane idx has finished its outer hash.
;;  - rax = the lane's job; the lane's job pointer is cleared;
;;  - STS_COMPLETED_HMAC is ORed into the job status;
;;  - idx is pushed onto the unused-lanes nibble list (shift left 4, OR in idx);
;;  - the digest is written, byte-swapped per qword, to job->_auth_tag_output.
;; NOTE: only a tag length of exactly 32 bytes (SHA-512 build) or 24 bytes
;; (SHA-384 build) takes the truncated copy below; every other length jumps to
;; copy_full_digest, which writes the full 64 / 48 bytes.
251 | align 32 | |
252 | end_loop: | |
253 | mov job_rax, [lane_data + _job_in_lane_sha512] | |
254 | mov qword [lane_data + _job_in_lane_sha512], 0 | |
255 | or dword [job_rax + _status], STS_COMPLETED_HMAC | |
;; unused_lanes shares rbx with lane_data; lane_data is not used again on
;; this path.
256 | mov unused_lanes, [state + _unused_lanes_sha512] | |
257 | shl unused_lanes, 4 | |
258 | or unused_lanes, idx | |
259 | mov [state + _unused_lanes_sha512], unused_lanes | |
260 | ||
261 | mov p, [job_rax + _auth_tag_output] | |
262 | ||
9f95a23c TL |
263 | %if (SHA_X_DIGEST_SIZE != 384) |
264 | cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32 | |
265 | jne copy_full_digest | |
266 | %else | |
267 | cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24 | |
268 | jne copy_full_digest | |
269 | %endif | |
;; Truncated tag: digest rows 0..3 (SHA-512) or 0..2 (SHA-384) of lane idx.
270 | ;; copy 32 bytes for SHA512 / 24 bytes for SHA384 | |
11fdf7f2 TL |
271 | mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] |
272 | mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] | |
273 | mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] | |
274 | %if (SHA_X_DIGEST_SIZE != 384) | |
275 | mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] | |
276 | %endif | |
277 | bswap QWORD(tmp2) | |
278 | bswap QWORD(tmp4) | |
279 | bswap QWORD(tmp6) | |
280 | %if (SHA_X_DIGEST_SIZE != 384) | |
281 | bswap QWORD(tmp5) | |
282 | %endif | |
283 | mov [p + 0*8], QWORD(tmp2) | |
284 | mov [p + 1*8], QWORD(tmp4) | |
285 | mov [p + 2*8], QWORD(tmp6) | |
286 | %if (SHA_X_DIGEST_SIZE != 384) | |
287 | mov [p + 3*8], QWORD(tmp5) | |
288 | %endif | |
f67539c2 | 289 | jmp clear_ret |
9f95a23c TL |
290 | |
;; copy_full_digest: write the whole digest of lane idx to p (the job's tag
;; output buffer), byte-swapping every qword: rows 0..7 for SHA-512 (64 bytes)
;; or rows 0..5 for SHA-384 (48 bytes). Expects rax = job and p = output
;; pointer as set up by end_loop. Falls through to clear_ret.
291 | copy_full_digest: | |
292 | ;; copy 64 bytes for SHA512 / 48 bytes for SHA384 | |
293 | mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] | |
294 | mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] | |
295 | mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] | |
296 | mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] | |
297 | bswap QWORD(tmp2) | |
298 | bswap QWORD(tmp4) | |
299 | bswap QWORD(tmp6) | |
300 | bswap QWORD(tmp5) | |
301 | mov [p + 0*8], QWORD(tmp2) | |
302 | mov [p + 1*8], QWORD(tmp4) | |
303 | mov [p + 2*8], QWORD(tmp6) | |
304 | mov [p + 3*8], QWORD(tmp5) | |
305 | ||
;; Second half: rows 4 and 5 always, rows 6 and 7 only in the SHA-512 build.
306 | mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE] | |
307 | mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE] | |
308 | %if (SHA_X_DIGEST_SIZE != 384) | |
309 | mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE] | |
310 | mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE] | |
311 | %endif | |
312 | bswap QWORD(tmp2) | |
313 | bswap QWORD(tmp4) | |
314 | %if (SHA_X_DIGEST_SIZE != 384) | |
315 | bswap QWORD(tmp6) | |
316 | bswap QWORD(tmp5) | |
317 | %endif | |
318 | mov [p + 4*8], QWORD(tmp2) | |
319 | mov [p + 5*8], QWORD(tmp4) | |
320 | %if (SHA_X_DIGEST_SIZE != 384) | |
321 | mov [p + 6*8], QWORD(tmp6) | |
322 | mov [p + 7*8], QWORD(tmp5) | |
323 | %endif | |
11fdf7f2 | 324 | |
f67539c2 TL |
325 | clear_ret: |
;; clear_ret: reached from both digest-copy paths with rax = job to return.
;; On SAFE_DATA builds, scrub residual key/digest material from every lane
;; that has no job (this includes the lane just released): its digest rows in
;; the args area, the first 128 bytes of its extra block and the digest-sized
;; head of its outer block. Only zmm0 and lane_data (rbx) are written here,
;; so rax survives for the return. Without SAFE_DATA this is a plain
;; fall-through to return.
326 | ||
327 | %ifdef SAFE_DATA | |
328 | vpxorq zmm0, zmm0 | |
329 | ||
330 | ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of every lane with no job | |
331 | %assign I 0 | |
332 | %rep 8 | |
333 | cmp qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0 | |
334 | jne APPEND(skip_clear_,I) | |
335 | ||
336 | ;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512) | |
337 | %assign J 0 | |
338 | %rep 6 | |
339 | mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + J*SHA512_DIGEST_ROW_SIZE], 0 | |
340 | %assign J (J+1) | |
341 | %endrep | |
342 | %if (SHA_X_DIGEST_SIZE != 384) | |
343 | mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 6*SHA512_DIGEST_ROW_SIZE], 0 | |
344 | mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 7*SHA512_DIGEST_ROW_SIZE], 0 | |
345 | %endif | |
346 | ||
347 | lea lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)] | |
;; lane_data points at a SHA-512 lane record, so the fields must be addressed
;; with the _sha512 offsets used everywhere else in this file. The unsuffixed
;; _extra_block / _outer_block names describe a different lane layout and
;; could leave the real outer block uncleared.
348 | ;; Clear first 128 bytes of extra_block | |
349 | vmovdqu64 [lane_data + _extra_block_sha512], zmm0 | |
350 | vmovdqu64 [lane_data + _extra_block_sha512 + 64], zmm0 | |
351 | ||
;; Unaligned stores throughout: no alignment of the outer block is assumed.
352 | ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block | |
353 | %if (SHA_X_DIGEST_SIZE == 384) | |
354 | vmovdqu64 [lane_data + _outer_block_sha512], ymm0 | |
355 | vmovdqu64 [lane_data + _outer_block_sha512 + 32], xmm0 | |
356 | %else | |
357 | vmovdqu64 [lane_data + _outer_block_sha512], zmm0 | |
358 | %endif | |
359 | ||
360 | APPEND(skip_clear_,I): | |
361 | %assign I (I+1) | |
362 | %endrep | |
363 | ||
364 | %endif ;; SAFE_DATA | |
365 | ||
11fdf7f2 TL |
;; return: common epilogue. rax already holds the return value (a job pointer,
;; or 0 when coming from return_null).
366 | return: | |
;; Zero the upper vector-register bits before handing control back to the caller.
367 | vzeroupper | |
368 | ||
;; Reload the registers spilled by the prologue, then the caller's rsp.
369 | mov rbx, [rsp + _gpr_save + 8*0] | |
370 | mov rbp, [rsp + _gpr_save + 8*1] | |
371 | mov r12, [rsp + _gpr_save + 8*2] | |
372 | mov r13, [rsp + _gpr_save + 8*3] | |
373 | mov r14, [rsp + _gpr_save + 8*4] | |
374 | mov r15, [rsp + _gpr_save + 8*5] | |
375 | %ifndef LINUX | |
376 | mov rdi, [rsp + _gpr_save + 8*6] | |
377 | %endif | |
378 | mov rsp, [rsp + _rsp_save] ; original SP | |
379 | ||
380 | ret | |
381 | ||
382 | %ifdef LINUX | |
383 | section .note.GNU-stack noalloc noexec nowrite progbits | |
384 | %endif |