]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;; |
2 | ;; Copyright (c) 2012-2018, Intel Corporation | |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
f67539c2 | 28 | %include "include/os.asm" |
11fdf7f2 TL |
29 | %include "job_aes_hmac.asm" |
30 | %include "mb_mgr_datastruct.asm" | |
f67539c2 TL |
31 | %include "include/reg_sizes.asm" |
32 | %include "include/memcpy.asm" | |
33 | %include "include/const.inc" | |
11fdf7f2 TL |
34 | |
35 | extern sha512_x4_avx2 | |
36 | ||
37 | section .data | |
38 | default rel | |
39 | align 16 | |
40 | byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 | |
41 | dq 0x0001020304050607, 0x08090a0b0c0d0e0f | |
42 | ||
43 | section .text | |
44 | ||
45 | %ifndef FUNC | |
46 | %define FUNC submit_job_hmac_sha_512_avx2 | |
47 | %define SHA_X_DIGEST_SIZE 512 | |
48 | %endif | |
49 | ||
50 | %if 1 | |
51 | %ifdef LINUX | |
52 | %define arg1 rdi | |
53 | %define arg2 rsi | |
54 | %define reg3 rcx | |
55 | %define reg4 rdx | |
56 | %else | |
57 | %define arg1 rcx | |
58 | %define arg2 rdx | |
59 | %define reg3 rdi | |
60 | %define reg4 rsi | |
61 | %endif | |
62 | ||
63 | %define state arg1 | |
64 | %define job arg2 | |
65 | %define len2 arg2 | |
66 | ||
67 | ||
68 | ; idx needs to be in rbp, r13, r14, r15 | |
69 | %define last_len rbp | |
70 | %define idx rbp | |
71 | ||
72 | %define p r11 | |
73 | %define start_offset r11 | |
74 | ||
75 | %define unused_lanes rbx | |
76 | %define tmp4 rbx | |
77 | ||
78 | %define job_rax rax | |
79 | %define len rax | |
80 | ||
81 | %define size_offset reg3 | |
82 | %define tmp2 reg3 | |
83 | ||
84 | %define lane reg4 | |
85 | %define tmp3 reg4 | |
86 | ||
87 | %define extra_blocks r8 | |
88 | ||
89 | %define tmp r9 | |
90 | %define p2 r9 | |
91 | ||
92 | %define lane_data r10 | |
93 | ||
94 | %endif | |
95 | ||
96 | ; Define stack usage: scratch frame holding the saved GPRs and the caller's rsp | |
97 | ||
98 | ; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r12 (hence 5 save slots; rsi/rdi are only stored when LINUX is not defined) | |
99 | struc STACK | |
100 | _gpr_save: resq 5 | |
101 | _rsp_save: resq 1 | |
102 | endstruc | |
103 | ||
104 | ; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job) | |
105 | ; arg 1 : rdi (LINUX) / rcx (otherwise) : state | |
106 | ; arg 2 : rsi (LINUX) / rdx (otherwise) : job | |
107 | MKGLOBAL(FUNC,function,internal) | |
108 | FUNC: | |
109 | ||
110 | mov rax, rsp | |
111 | sub rsp, STACK_size | |
112 | and rsp, -32 | |
113 | mov [rsp + _gpr_save + 8*0], rbx | |
114 | mov [rsp + _gpr_save + 8*1], rbp | |
115 | mov [rsp + _gpr_save + 8*2], r12 | |
116 | %ifndef LINUX | |
117 | mov [rsp + _gpr_save + 8*3], rsi | |
118 | mov [rsp + _gpr_save + 8*4], rdi | |
119 | %endif | |
120 | mov [rsp + _rsp_save], rax ; original SP (value before the frame was carved out and rsp aligned to 32); reloaded from this slot at "return" | |
121 | ||
122 | mov unused_lanes, [state + _unused_lanes_sha512] | |
123 | movzx lane, BYTE(unused_lanes) | |
124 | shr unused_lanes, 8 | |
125 | imul lane_data, lane, _SHA512_LANE_DATA_size | |
126 | lea lane_data, [state + _ldata_sha512 + lane_data] | |
127 | mov [state + _unused_lanes_sha512], unused_lanes | |
128 | mov len, [job + _msg_len_to_hash_in_bytes] | |
129 | mov tmp, len | |
130 | shr tmp, 7 ; tmp = len / 128: message length in whole 128-byte blocks, stored below as this lane's entry in _lens_sha512 | |
131 | ||
132 | mov [lane_data + _job_in_lane_sha512], job | |
133 | mov dword [lane_data + _outer_done_sha512], 0 | |
9f95a23c TL |
134 |
135 | vmovdqa xmm0, [state + _lens_sha512] | |
136 | XVPINSRW xmm0, xmm1, extra_blocks, lane, tmp, scale_x16 | |
137 | vmovdqa [state + _lens_sha512], xmm0 | |
138 | ||
11fdf7f2 TL |
139 |
140 | mov last_len, len | |
141 | and last_len, 127 | |
142 | lea extra_blocks, [last_len + 17 + 127] | |
143 | shr extra_blocks, 7 | |
144 | mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks) | |
145 | ||
146 | mov p, [job + _src] | |
147 | add p, [job + _hash_start_src_offset_in_bytes] | |
148 | mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p | |
149 | ||
150 | cmp len, 128 | |
151 | jb copy_lt128 | |
152 | ||
153 | fast_copy: | |
154 | add p, len | |
155 | vmovdqu ymm0, [p - 128 + 0*32] | |
156 | vmovdqu ymm1, [p - 128 + 1*32] | |
157 | vmovdqu ymm2, [p - 128 + 2*32] | |
158 | vmovdqu ymm3, [p - 128 + 3*32] | |
159 | vmovdqu [lane_data + _extra_block_sha512 + 0*32], ymm0 | |
160 | vmovdqu [lane_data + _extra_block_sha512 + 1*32], ymm1 | |
161 | vmovdqu [lane_data + _extra_block_sha512 + 2*32], ymm2 | |
162 | vmovdqu [lane_data + _extra_block_sha512 + 3*32], ymm3 | |
163 | end_fast_copy: | |
164 | ||
165 | mov size_offset, extra_blocks | |
166 | shl size_offset, 7 | |
167 | sub size_offset, last_len | |
168 | add size_offset, 128-8 | |
169 | mov [lane_data + _size_offset_sha512], DWORD(size_offset) | |
170 | mov start_offset, 128 | |
171 | sub start_offset, last_len | |
172 | mov [lane_data + _start_offset_sha512], DWORD(start_offset) | |
173 | ||
174 | lea tmp, [8*128 + 8*len] | |
175 | bswap tmp | |
176 | mov [lane_data + _extra_block_sha512 + size_offset], tmp | |
177 | ||
178 | mov tmp, [job + _auth_key_xor_ipad] | |
179 | ||
180 | %assign I 0 | |
181 | %rep 4 | |
182 | vmovdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE] | |
183 | vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0 | |
184 | vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 | |
185 | %assign I (I+1) | |
186 | %endrep | |
187 | ||
188 | test len, ~127 | |
189 | jnz ge128_bytes | |
190 | ||
191 | lt128_bytes: | |
9f95a23c TL |
192 | vmovdqa xmm0, [state + _lens_sha512] |
193 | XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 | |
194 | vmovdqa [state + _lens_sha512], xmm0 | |
195 | ||
11fdf7f2 TL |
196 | lea tmp, [lane_data + _extra_block_sha512 + start_offset] |
197 | mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; len < 128: the lane hashes straight from its extra block (lens entry was just set to extra_blocks, and the pending count is zeroed on the next line) | |
198 | mov dword [lane_data + _extra_blocks_sha512], 0 | |
199 | ||
200 | ge128_bytes: | |
201 | cmp unused_lanes, 0xff | |
202 | jne return_null | |
203 | jmp start_loop | |
204 | ||
205 | align 16 | |
206 | start_loop: | |
207 | ; Find min length among the per-lane 16-bit entries of _lens_sha512 | |
208 | vmovdqa xmm0, [state + _lens_sha512] | |
209 | vphminposuw xmm1, xmm0 | |
210 | vpextrw DWORD(len2), xmm1, 0 ; min value | |
211 | vpextrw DWORD(idx), xmm1, 1 ; min index = lane holding the min length (presumably 0...3, given the x4 routine called below) | |
212 | cmp len2, 0 | |
213 | je len_is_0 | |
214 | ||
215 | vpshuflw xmm1, xmm1, 0x00 | |
216 | vpsubw xmm0, xmm0, xmm1 | |
217 | vmovdqa [state + _lens_sha512], xmm0 | |
218 | ||
219 | ; "state" and "args" are the same address, arg1 | |
220 | ; len is arg2 (len2 aliases arg2, which held "job" on entry) | |
221 | call sha512_x4_avx2 | |
222 | ; state and idx are intact | |
223 | ||
224 | len_is_0: | |
225 | ; process completed job "idx": run its pending extra blocks first, then the outer hash once, otherwise the job is finished | |
226 | imul lane_data, idx, _SHA512_LANE_DATA_size | |
227 | lea lane_data, [state + _ldata_sha512 + lane_data] | |
228 | mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] | |
229 | cmp extra_blocks, 0 | |
230 | jne proc_extra_blocks | |
231 | cmp dword [lane_data + _outer_done_sha512], 0 | |
232 | jne end_loop | |
233 | ||
234 | proc_outer: | |
235 | mov dword [lane_data + _outer_done_sha512], 1 | |
236 | mov DWORD(size_offset), [lane_data + _size_offset_sha512] | |
237 | mov qword [lane_data + _extra_block_sha512 + size_offset], 0 | |
9f95a23c TL |
238 |
239 | vmovdqa xmm0, [state + _lens_sha512] | |
240 | XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 | |
241 | vmovdqa [state + _lens_sha512], xmm0 | |
242 | ||
11fdf7f2 TL |
243 | lea tmp, [lane_data + _outer_block_sha512] |
244 | mov job, [lane_data + _job_in_lane_sha512] | |
245 | mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp | |
246 | ||
247 | %assign I 0 | |
248 | %rep (SHA_X_DIGEST_SIZE / (8 * 16)) | |
249 | vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE] | |
250 | vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 | |
251 | vpshufb xmm0, [rel byteswap] | |
252 | vmovdqa [lane_data + _outer_block_sha512 + I * 2 * SHA512_DIGEST_WORD_SIZE], xmm0 | |
253 | %assign I (I+1) | |
254 | %endrep | |
255 | ||
256 | mov tmp, [job + _auth_key_xor_opad] | |
257 | %assign I 0 | |
258 | %rep 4 | |
259 | vmovdqu xmm0, [tmp + I * 16] | |
260 | vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I+0)*SHA512_DIGEST_ROW_SIZE], xmm0 | |
261 | vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 | |
262 | %assign I (I+1) | |
263 | %endrep | |
264 | ||
265 | jmp start_loop | |
266 | ||
267 | align 16 | |
268 | proc_extra_blocks: | |
269 | mov DWORD(start_offset), [lane_data + _start_offset_sha512] | |
9f95a23c TL |
270 |
271 | vmovdqa xmm0, [state + _lens_sha512] | |
272 | XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 | |
273 | vmovdqa [state + _lens_sha512], xmm0 | |
274 | ||
11fdf7f2 TL |
275 | lea tmp, [lane_data + _extra_block_sha512 + start_offset] |
276 | mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is the lane whose length just reached 0: point it at its extra block(s), its lens entry now being extra_blocks | |
277 | mov dword [lane_data + _extra_blocks_sha512], 0 | |
278 | jmp start_loop | |
279 | ||
280 | align 16 | |
281 | copy_lt128: | |
282 | ;; less than one message block of data (reached from the "cmp len, 128 / jb" at entry) | |
283 | ;; destination is the extra block, backed up by len bytes from offset 128, so the data ends where fast_copy would have ended it | |
284 | lea p2, [lane_data + _extra_block_sha512 + 128] ; SHA-512 lane-data field, same one fast_copy and end_fast_copy use | |
285 | sub p2, len | |
286 | memcpy_avx2_128_1 p2, p, len, tmp4, tmp2, ymm0, ymm1, ymm2, ymm3 | |
287 | mov unused_lanes, [state + _unused_lanes_sha512] ; tmp4 aliases unused_lanes (rbx) and was handed to the copy macro, so reload it | |
288 | jmp end_fast_copy | |
289 | ||
290 | return_null: | |
291 | xor job_rax, job_rax ; unused_lanes != 0xff at ge128_bytes: nothing is hashed now, return NULL | |
292 | jmp return | |
293 | ||
294 | align 16 | |
295 | end_loop: | |
296 | mov job_rax, [lane_data + _job_in_lane_sha512] | |
297 | mov unused_lanes, [state + _unused_lanes_sha512] | |
298 | mov qword [lane_data + _job_in_lane_sha512], 0 | |
299 | or dword [job_rax + _status], STS_COMPLETED_HMAC | |
300 | shl unused_lanes, 8 | |
301 | or unused_lanes, idx | |
302 | mov [state + _unused_lanes_sha512], unused_lanes | |
303 | ||
304 | mov p, [job_rax + _auth_tag_output] | |
305 | ||
9f95a23c TL |
306 | vzeroupper |
307 | ||
308 | %if (SHA_X_DIGEST_SIZE != 384) | |
309 | cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32 | |
310 | jne copy_full_digest | |
311 | %else | |
312 | cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24 | |
313 | jne copy_full_digest | |
314 | %endif | |
315 | ;; tag length matched the compare above: byte-swap and copy only the first 32 bytes for SHA512 / 24 bytes for SHA384 | |
11fdf7f2 TL |
316 | mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] |
317 | mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] | |
318 | mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] | |
319 | %if (SHA_X_DIGEST_SIZE != 384) | |
320 | mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] | |
321 | %endif | |
11fdf7f2 TL |
322 | bswap QWORD(tmp) |
323 | bswap QWORD(tmp2) | |
324 | bswap QWORD(tmp3) | |
325 | %if (SHA_X_DIGEST_SIZE != 384) | |
326 | bswap QWORD(tmp4) | |
327 | %endif | |
11fdf7f2 TL |
328 | mov [p + 0*8], QWORD(tmp) |
329 | mov [p + 1*8], QWORD(tmp2) | |
330 | mov [p + 2*8], QWORD(tmp3) | |
331 | %if (SHA_X_DIGEST_SIZE != 384) | |
332 | mov [p + 3*8], QWORD(tmp4) | |
333 | %endif | |
f67539c2 | 334 | jmp clear_ret |
9f95a23c TL |
335 | |
336 | copy_full_digest: | |
337 | ;; taken for any tag length other than 32 (SHA512) / 24 (SHA384): byte-swap and copy the whole digest, 64 bytes for SHA512 / 48 bytes for SHA384 | |
338 | mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] | |
339 | mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] | |
340 | mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] | |
341 | mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] | |
342 | bswap QWORD(tmp) | |
343 | bswap QWORD(tmp2) | |
344 | bswap QWORD(tmp3) | |
345 | bswap QWORD(tmp4) | |
346 | mov [p + 0*8], QWORD(tmp) | |
347 | mov [p + 1*8], QWORD(tmp2) | |
348 | mov [p + 2*8], QWORD(tmp3) | |
349 | mov [p + 3*8], QWORD(tmp4) | |
350 | ||
351 | mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE] | |
352 | mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE] | |
353 | %if (SHA_X_DIGEST_SIZE != 384) | |
354 | mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE] | |
355 | mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE] | |
356 | %endif | |
357 | bswap QWORD(tmp) | |
358 | bswap QWORD(tmp2) | |
359 | %if (SHA_X_DIGEST_SIZE != 384) | |
360 | bswap QWORD(tmp3) | |
361 | bswap QWORD(tmp4) | |
362 | %endif | |
363 | mov [p + 4*8], QWORD(tmp) | |
364 | mov [p + 5*8], QWORD(tmp2) | |
365 | %if (SHA_X_DIGEST_SIZE != 384) | |
366 | mov [p + 6*8], QWORD(tmp3) | |
367 | mov [p + 7*8], QWORD(tmp4) | |
368 | %endif | |
11fdf7f2 | 369 | |
f67539c2 TL |
370 | clear_ret: | |
371 | ||
372 | %ifdef SAFE_DATA | |
373 | ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job (lane "idx"), so no hash state is left behind in the manager | |
374 | %assign J 0 | |
375 | %rep 6 | |
376 | mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0 | |
377 | %assign J (J+1) | |
378 | %endrep | |
379 | %if (SHA_X_DIGEST_SIZE != 384) | |
380 | mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0 ; digest words 6-7 (non-384 build only) use the same SHA-512 row stride as words 0-5 | |
381 | mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0 | |
382 | %endif | |
383 | ||
384 | vpxor ymm0, ymm0 | |
385 | imul lane_data, idx, _SHA512_LANE_DATA_size | |
386 | lea lane_data, [state + _ldata_sha512 + lane_data] | |
387 | ;; Clear first 128 bytes of extra_block (the SHA-512 lane-data field the submit path wrote the message tail into) | |
388 | %assign offset 0 | |
389 | %rep 4 | |
390 | vmovdqa [lane_data + _extra_block_sha512 + offset], ymm0 | |
391 | %assign offset (offset + 32) | |
392 | %endrep | |
393 | ||
394 | ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block (the SHA-512 lane-data field proc_outer wrote the inner digest into) | |
395 | vmovdqu [lane_data + _outer_block_sha512], ymm0 | |
396 | %if (SHA_X_DIGEST_SIZE == 384) | |
397 | vmovdqa [lane_data + _outer_block_sha512 + 32], xmm0 | |
398 | %else | |
399 | vmovdqu [lane_data + _outer_block_sha512 + 32], ymm0 | |
400 | %endif | |
401 | %endif ;; SAFE_DATA | |
402 | ||
11fdf7f2 TL |
403 | return: | |
404 | mov rbx, [rsp + _gpr_save + 8*0] | |
405 | mov rbp, [rsp + _gpr_save + 8*1] | |
406 | mov r12, [rsp + _gpr_save + 8*2] | |
407 | %ifndef LINUX | |
408 | mov rsi, [rsp + _gpr_save + 8*3] | |
409 | mov rdi, [rsp + _gpr_save + 8*4] | |
410 | %endif | |
411 | mov rsp, [rsp + _rsp_save] ; original SP, as stored at entry before the frame was allocated and rsp aligned; must be the last load from the frame | |
412 | ret | |
413 | ||
414 | %ifdef LINUX | |
415 | section .note.GNU-stack noalloc noexec nowrite progbits | |
416 | %endif |