2 ;; Copyright (c) 2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 %include "include/os.asm"
30 %include "job_aes_hmac.asm"
31 %include "mb_mgr_datastruct.asm"
33 %include "include/reg_sizes.asm"
34 %include "include/memcpy.asm"
35 %include "include/const.inc"
37 %include "include/dbgprint.asm"
39 %ifndef AES128_CBC_MAC
;; Default (SSE) symbol names. An arch-specific wrapper file may pre-define
;; these before including this one to reuse the generic macro below with a
;; different AES-CBC-MAC core and differently named entry points.
;; NOTE(review): the matching %endif is not visible in this excerpt.
41 %define AES128_CBC_MAC aes128_cbc_mac_x4
42 %define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_sse
43 %define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_sse
;; Per-lane 16-bit length masks, one 16-byte entry per lane (0..3).
;; Entry I has 0xFFFF in the word position holding lane I's length; it is
;; OR-ed into the packed-lengths xmm (see "por xmm0, [rel len_masks + 16*I]"
;; below) so empty/null lanes read as maximum length and never win the
;; PHMINPOSUW minimum-length search.
;; NOTE(review): the label and section/alignment directives for this table
;; are elided in this excerpt.
54 ;ddq 0x0000000000000000000000000000FFFF
55 dq 0x000000000000FFFF, 0x0000000000000000
56 ;ddq 0x000000000000000000000000FFFF0000
57 dq 0x00000000FFFF0000, 0x0000000000000000
58 ;ddq 0x00000000000000000000FFFF00000000
59 dq 0x0000FFFF00000000, 0x0000000000000000
60 ;ddq 0x0000000000000000FFFF000000000000
61 dq 0xFFFF000000000000, 0x0000000000000000
;; Token-pasting helper: APPEND(skip_clear_, 2) expands to label skip_clear_2.
68 %define APPEND(a,b) a %+ b
;; Register role assignments for the macro below.
;; NOTE(review): several %define lines (job, state, lane, len, tmp*, m_last,
;; idx, ...) are elided in this excerpt; only these survive.
84 ; idx needs to be in rbp
95 %define unused_lanes rbx
102 %define good_lane r15
105 ; STACK_SPACE needs to be an odd multiple of 8
106 ; This routine and its callee clobber all GPRs
112 ;;; ===========================================================================
113 ;;; ===========================================================================
115 ;;; ===========================================================================
116 ;;; ===========================================================================
118 ;;; ===========================================================================
119 ;;; AES CMAC job submit & flush
120 ;;; ===========================================================================
121 ;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
;; ===========================================================================
;; GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE - common body for the AES-CMAC
;; submit and flush entry points (4-lane SSE out-of-order manager).
;;
;; Arg 1 (%%SUBMIT_FLUSH): SUBMIT or FLUSH - selects which variant expands.
;; In:   state (MB_MGR_CMAC_OOO *), and for SUBMIT also job (JOB_AES_HMAC *).
;; Out:  job_rax = pointer to a completed job, or NULL when none completes.
;; Clobbers: all GPRs (callee-saved ones are spilled to the frame below)
;;           and xmm scratch registers.
;;
;; NOTE(review): this listing is an excerpt - interior lines (%if guards,
;; shifts, %assign unroll loops, the AES128_CBC_MAC call, returns, %endmacro)
;; are elided, so comments describe only the visible instructions.
;; ===========================================================================
122 %macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE 1
123 %define %%SUBMIT_FLUSH %1
;; Spill callee-saved GPRs into the aligned stack frame (frame allocation /
;; rsp alignment lines are elided above this point).
129 mov [rsp + _gpr_save + 8*0], rbx
130 mov [rsp + _gpr_save + 8*1], rbp
131 mov [rsp + _gpr_save + 8*2], r12
132 mov [rsp + _gpr_save + 8*3], r13
133 mov [rsp + _gpr_save + 8*4], r14
134 mov [rsp + _gpr_save + 8*5], r15
;; rsi/rdi are callee-saved only on Windows x64; presumably these two stores
;; sit under a non-LINUX guard that is elided here - TODO confirm.
136 mov [rsp + _gpr_save + 8*6], rsi
137 mov [rsp + _gpr_save + 8*7], rdi
139 mov [rsp + _rsp_save], rax ; original SP
;; unused_lanes is a packed nibble list of free lane indices.
142 mov unused_lanes, [state + _aes_cmac_unused_lanes]
144 %ifidn %%SUBMIT_FLUSH, SUBMIT
;; --- SUBMIT: pop the next free lane from the nibble list ---
;; (the masking/shift of unused_lanes between these two lines is elided)
146 mov lane, unused_lanes
149 mov [state + _aes_cmac_unused_lanes], unused_lanes
151 ;; Copy job info into lane
152 mov [state + _aes_cmac_job_in_lane + lane*8], job
153 ;; Copy keys into lane args
154 mov tmp, [job + _key_expanded]
155 mov [state + _aes_cmac_args_keys + lane*8], tmp
;; Zero the per-lane IV slot, which doubles as the running CMAC digest.
;; tmp presumably holds lane*16 here (the shift/pxor lines are elided) -
;; TODO confirm.
159 ;; Zero IV to store digest
161 movdqa [state + _aes_cmac_args_IV + tmp], xmm0
;; m_last -> per-lane 16-byte scratch buffer used to stage the final block.
163 lea m_last, [state + _aes_cmac_scratch + tmp]
166 ;; convert bits to bytes (message length in bits for CMAC)
167 mov len, [job + _msg_len_to_hash_in_bits]
;; Round up: +7 then a shift-right-by-3 (the shift line is elided).
169 add len, 7 ; inc len if there are remainder bits
173 ;; Check at least 1 or more blocks (get n)
178 ;; Check for partial block
182 or n, n ; check one or more blocks?
185 ;; One or more blocks, potentially partial
;; init_done=0: first CBC-MAC pass over the bulk of the message still pending.
186 mov word [state + _aes_cmac_init_done + lane*2], 0
188 mov tmp2, [job + _src]
189 add tmp2, [job + _hash_start_src_offset_in_bytes]
190 mov [state + _aes_cmac_args_in + lane*8], tmp2
;; Insert this lane's byte length into the packed 16-bit lens vector.
195 movdqa xmm0, [state + _aes_cmac_lens]
196 XPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
197 movdqa [state + _aes_cmac_lens], xmm0
199 ;; check remainder bits
;; Non-byte-aligned (3GPP EIA2-style) message: bit-level padding path.
201 jnz %%_not_complete_block_3gpp
203 ;; check if complete block
207 %%_not_complete_block:
208 ;; M_last = padding(M_n) XOR K2
;; Preload 10000000... padding pattern from the table (offset +16 presumably
;; selects the all-pad row before per-length adjustment - TODO confirm).
209 lea tmp, [rel padding_0x80_tab16 + 16]
212 movdqa [m_last], xmm0
214 mov tmp, [job + _src]
215 add tmp, [job + _hash_start_src_offset_in_bytes]
;; Copy the r (< 16) remaining message bytes over the padding in m_last.
220 memcpy_sse_16 m_last, tmp, r, tmp4, iv
;; Subkey K2 is XORed into the padded last block (pxor line elided).
223 mov tmp3, [job + _skey2]
224 movdqa xmm1, [m_last]
227 movdqa [m_last], xmm0
;; Find the lane with the minimum remaining length (candidate to schedule).
231 movdqa xmm0, [state + _aes_cmac_lens]
232 phminposuw xmm1, xmm0
;; 0xF nibble list == all-but-none free; gate whether to kick off processing
;; or return NULL from SUBMIT - exact branch is elided, TODO confirm.
234 cmp byte [state + _aes_cmac_unused_lanes], 0xf
239 ;; Check at least one job
;; --- FLUSH: pick any lane that still holds a job ---
243 ;; Find a lane with a non-null job
244 xor good_lane, good_lane
245 cmp qword [state + _aes_cmac_job_in_lane + 1*8], 0
246 cmovne good_lane, [rel one]
247 cmp qword [state + _aes_cmac_job_in_lane + 2*8], 0
248 cmovne good_lane, [rel two]
249 cmp qword [state + _aes_cmac_job_in_lane + 3*8], 0
250 cmovne good_lane, [rel three]
;; Duplicate the good lane's args into every empty lane so the 4-wide CBC-MAC
;; core can run; the len_masks OR makes empty lanes lose the min-len search.
252 ; Copy good_lane to empty lanes
253 mov tmp2, [state + _aes_cmac_args_in + good_lane*8]
254 mov tmp3, [state + _aes_cmac_args_keys + good_lane*8]
255 shl good_lane, 4 ; multiply by 16
256 movdqa xmm2, [state + _aes_cmac_args_IV + good_lane]
257 movdqa xmm0, [state + _aes_cmac_lens]
;; Body of an %assign-unrolled loop over lanes I (loop lines elided).
261 cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
263 mov [state + _aes_cmac_args_in + I*8], tmp2
264 mov [state + _aes_cmac_args_keys + I*8], tmp3
265 movdqa [state + _aes_cmac_args_IV + I*16], xmm2
266 por xmm0, [rel len_masks + 16*I] ; null lane -> len 0xFFFF
271 phminposuw xmm1, xmm0
;; PHMINPOSUW packs min value in word 0 and its index in word 1.
276 pextrw len2, xmm1, 0 ; min value
277 pextrw idx, xmm1, 1 ; min index (0...3)
;; Broadcast the min length and subtract it from all lanes (psubw elided),
;; so lens hold bytes-remaining-after-this-call.
280 pshuflw xmm1, xmm1, 0
282 movdqa [state + _aes_cmac_lens], xmm0
;; The call to AES128_CBC_MAC is elided between these comments.
284 ; "state" and "args" are the same address, arg1
287 ; state and idx are intact
289 movdqa xmm0, [state + _aes_cmac_lens] ; preload lens
291 ; Check if job complete
;; init_done != 0 means the final (scratch) block has already been MACed.
292 test word [state + _aes_cmac_init_done + idx*2], 0xffff
293 jnz %%_copy_complete_digest
;; First pass done: queue the single 16-byte M_last block for this lane.
296 mov word [state + _aes_cmac_init_done + idx*2], 1
298 XPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
299 movdqa [state + _aes_cmac_lens], xmm0
301 phminposuw xmm1, xmm0 ; find min length
;; Point the lane's input at its scratch buffer holding M_last.
;; tmp3 presumably holds idx*16 here (shift elided) - TODO confirm.
305 lea m_last, [state + _aes_cmac_scratch + tmp3]
306 mov [state + _aes_cmac_args_in + idx*8], m_last
310 %%_copy_complete_digest:
311 ; Job complete, copy digest to AT output
312 mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
;; The digest lives in the lane's IV slot; tmp4 presumably = idx*16 here.
316 lea tmp3, [state + _aes_cmac_args_IV + tmp4]
317 mov tmp4, [job_rax + _auth_tag_output_len_in_bytes]
318 mov tmp2, [job_rax + _auth_tag_output]
;; Variable-length (<= 16 byte) tag copy; full-16 fast path is elided.
329 memcpy_sse_16 tmp2, tmp3, tmp4, lane, iv
;; Push the finished lane index back onto the free-lane nibble list
;; (the shift/or combining idx is elided).
332 ; Update unused lanes
333 mov unused_lanes, [state + _aes_cmac_unused_lanes]
336 mov [state + _aes_cmac_unused_lanes], unused_lanes
339 mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
341 mov qword [state + _aes_cmac_job_in_lane + idx*8], 0
342 or dword [job_rax + _status], STS_COMPLETED_HMAC
;; SAFE_DATA scrubbing (the %ifdef SAFE_DATA guard is elided): wipe secret
;; material left behind by the returned job.
346 %ifidn %%SUBMIT_FLUSH, SUBMIT
347 ;; Clear digest (in memory for IV) and scratch memory of returned job
;; NOTE(review): index lacks the *16 scaling used elsewhere for scratch
;; offsets; presumably idx was pre-scaled in an elided line - TODO confirm.
351 movdqa [state + _aes_cmac_scratch + idx], xmm0
354 ;; Clear digest and scratch memory of returned job and "NULL lanes"
;; Body of an %assign-unrolled loop over lanes I (loop lines elided).
357 cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
358 jne APPEND(skip_clear_,I)
359 movdqa [state + _aes_cmac_args_IV + I*16], xmm0
360 movdqa [state + _aes_cmac_scratch + I*16], xmm0
361 APPEND(skip_clear_,I):
;; Epilogue: restore callee-saved GPRs and the original stack pointer.
369 mov rbx, [rsp + _gpr_save + 8*0]
370 mov rbp, [rsp + _gpr_save + 8*1]
371 mov r12, [rsp + _gpr_save + 8*2]
372 mov r13, [rsp + _gpr_save + 8*3]
373 mov r14, [rsp + _gpr_save + 8*4]
374 mov r15, [rsp + _gpr_save + 8*5]
;; Windows-only restores (guard elided) - must mirror the prologue stores.
376 mov rsi, [rsp + _gpr_save + 8*6]
377 mov rdi, [rsp + _gpr_save + 8*7]
379 mov rsp, [rsp + _rsp_save] ; original SP
386 %ifidn %%SUBMIT_FLUSH, SUBMIT
;; --- SUBMIT: message is an exact multiple of the block size ---
389 ;; Block size aligned
390 mov tmp2, [job + _src]
391 add tmp2, [job + _hash_start_src_offset_in_bytes]
;; Complete final block: XOR with subkey K1 (pxor line elided).
396 ;; M_last = M_n XOR K1
397 mov tmp3, [job + _skey1]
401 movdqa [m_last], xmm0
;; --- SUBMIT: message shorter than one block ---
;; No init pass needed: mark init done and MAC the single staged block.
406 ;; Single partial block
407 mov word [state + _aes_cmac_init_done + lane*2], 1
408 mov [state + _aes_cmac_args_in + lane*8], m_last
410 movdqa xmm0, [state + _aes_cmac_lens]
411 XPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
412 movdqa [state + _aes_cmac_lens], xmm0
;; Reuse the common padding/K2 staging code above.
415 jmp %%_not_complete_block
417 %%_not_complete_block_3gpp:
418 ;; bit pad last block
422 ;; load pointer to src
423 mov tmp, [job + _src]
424 add tmp, [job + _hash_start_src_offset_in_bytes]
429 ;; check if partial block
431 jz %%_load_full_block_3gpp
;; Safe partial load of up to 15 bytes into xmm0 (no over-read).
433 simd_load_sse_15_1 xmm0, tmp, r
436 %%_update_mlast_3gpp:
437 ;; set last byte padding mask
438 ;; shift into correct xmm idx
;; XPSLLB uses cl internally; rcx is a Win64 argument register, hence the
;; save/restore noted below (the actual push/pop lines are elided).
440 ;; save and restore rcx on windows
448 XPSLLB xmm2, r, xmm1, tmp2
;; Build the single-bit OR mask that sets the 3GPP "1" padding bit just
;; after the last message bit (mask arithmetic lines elided).
455 ;; set OR mask to pad final bit
458 xor tmp2, tmp3 ; XOR to get OR mask
460 ;; xmm1 contains shift table from previous shift
463 ;; load skey2 address
464 mov tmp3, [job + _skey2]
467 ;; set final padding bit
470 ;; XOR last partial block with skey2
473 movdqa [m_last], xmm2
477 %%_load_full_block_3gpp:
;; Full 16-byte last block: load elided, then share the bit-pad tail above.
480 jmp %%_update_mlast_3gpp
;; Public SSE entry point: submit one AES-CMAC authentication job.
;; Returns a completed job pointer in rax, or NULL if no job completed yet.
486 ; JOB_AES_HMAC * submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
489 MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
490 SUBMIT_JOB_AES_CMAC_AUTH:
491 GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE SUBMIT ; expands full body incl. ret
;; Public SSE entry point: flush the CMAC out-of-order manager, forcing a
;; queued job to completion. Returns a completed job pointer or NULL.
493 ; JOB_AES_HMAC * flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state)
495 MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
496 FLUSH_JOB_AES_CMAC_AUTH:
497 GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE FLUSH ; expands full body incl. ret
501 section .note.GNU-stack noalloc noexec nowrite progbits