2 ;; Copyright (c) 2012-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 %include "include/os.asm"
29 %include "include/const.inc"
30 %include "job_aes_hmac.asm"
31 %include "mb_mgr_datastruct.asm"
33 %include "include/reg_sizes.asm"
34 %include "include/memcpy.asm"
;; Map the generic names onto the SSE 4-lane implementations:
;;  - AES_XCBC_X4 is the 4-buffer AES-XCBC-MAC-128 core loop
;;  - SUBMIT_JOB_AES_XCBC is this file's exported submit entry point
36 %define AES_XCBC_X4 aes_xcbc_mac_128_x4
37 %define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse
40 ; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
;; 16-byte constant {0x80, 0x00, ..., 0x00} (little-endian qwords below).
;; Used by the slow-copy path as the XCBC 10* padding: byte 0x80 is placed
;; immediately after the last message byte of a partial final block.
47 x80: ;ddq 0x00000000000000000000000000000080
48 dq 0x0000000000000080, 0x0000000000000000
;; Register-allocation notes for the routine below (symbolic %defines for
;; lane/len/tmp/p/p2/etc. are on elided lines — see full file).
67 ; idx needs to be in rbp
82 %define unused_lanes rbx
85 ; STACK_SPACE needs to be an odd multiple of 8
86 ; This routine and its callee clobbers all GPRs
92 ; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
;;-----------------------------------------------------------------------------
;; JOB* submit_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
;;
;; Inserts one job into a free AES-XCBC lane of the 4-lane out-of-order
;; manager. If all lanes become busy, runs AES_XCBC_X4 on the lane with the
;; shortest remaining length and returns the completed job; otherwise returns
;; without processing (return paths are on elided lines).
;;
;; NOTE(review): this is a partial view — gaps in the original line numbering
;; show that labels, conditional jumps, the AES_XCBC_X4 call, stack-frame
;; setup and several pxor/zeroing instructions are not visible here. Comments
;; marked NOTE(review) flag behavior that depends on those hidden lines.
;;-----------------------------------------------------------------------------
95 MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal)
;; Prologue: spill callee-saved GPRs into the _gpr_save area of the local
;; frame. rsi/rdi are callee-saved on Windows x64 only, hence their separate
;; slots (the %ifdef guarding them is on elided lines — confirm in full file).
;; NOTE(review): the `mov rax, rsp` / rsp alignment that makes rax the
;; original SP is hidden between lines 95 and 102.
102 mov [rsp + _gpr_save + 8*0], rbx
103 mov [rsp + _gpr_save + 8*1], rbp
104 mov [rsp + _gpr_save + 8*2], r12
105 mov [rsp + _gpr_save + 8*3], r13
106 mov [rsp + _gpr_save + 8*4], r14
107 mov [rsp + _gpr_save + 8*5], r15
109 mov [rsp + _gpr_save + 8*6], rsi
110 mov [rsp + _gpr_save + 8*7], rdi
112 mov [rsp + _rsp_save], rax ; original SP
;; Pop a free lane index from the unused_lanes byte-stack: the low byte is
;; the next free lane (the `shr unused_lanes, 8` pop is on an elided line).
114 mov unused_lanes, [state + _aes_xcbc_unused_lanes]
115 movzx lane, BYTE(unused_lanes)
;; lane_data = &state->ldata[lane] — per-lane bookkeeping structure.
117 imul lane_data, lane, _XCBC_LANE_DATA_size
118 lea lane_data, [state + _aes_xcbc_ldata + lane_data]
119 mov [state + _aes_xcbc_unused_lanes], unused_lanes
;; Record the job in the lane: message length, job pointer, clear the
;; "final block already submitted" flag, and install the expanded K1
;; round keys for the AES core.
120 mov len, [job + _msg_len_to_hash_in_bytes]
121 mov [lane_data + _xcbc_job_in_lane], job
122 mov dword [lane_data + _xcbc_final_done], 0
123 mov tmp, [job + _k1_expanded]
124 mov [state + _aes_xcbc_args_keys + lane*8], tmp
;; p = src + hash_start offset (the load of p from _src is elided).
126 add p, [job + _hash_start_src_offset_in_bytes]
133 mov [state + _aes_xcbc_args_in + lane*8], p
134 add p, len ; set point to end of data
;; Split the message: if the length is an exact multiple of 16 the final
;; block M[n] is XORed with K2 in place (fast path); otherwise the partial
;; block must be copied, 10*-padded and XORed with K3 (slow_copy, below).
;; NOTE(review): the copy of len into last_len happens on an elided line.
136 and last_len, 15 ; Check lsbs of msg len
137 jnz slow_copy ; if not 16B mult, do slow copy
;; Fast path: final_block = M[n] XOR K2 (RFC 3566 case of |M| = 16n).
140 movdqu xmm0, [p - 16] ; load last block M[n]
141 mov tmp, [job + _k2] ; load K2 address
142 movdqu xmm1, [tmp] ; load K2
143 pxor xmm0, xmm1 ; M[n] XOR K2
144 movdqa [lane_data + _xcbc_final_block], xmm0
145 sub len, 16 ; take last block off length
;; Initialize this lane's ICV (CBC-MAC chaining value).
;; NOTE(review): a `pxor xmm0, xmm0` zeroing xmm0 appears to sit on the
;; elided line 147 (slow_copy also jumps back near here) — confirm that
;; the stored ICV is zero, not the final block, against the full file.
148 shl lane, 4 ; multiply by 16
149 movdqa [state + _aes_xcbc_args_ICV + lane], xmm0
151 ;; insert len into proper lane
152 movdqa xmm0, [state + _aes_xcbc_lens]
153 XPINSRW xmm0, xmm1, tmp, lane, len, no_scale
154 movdqa [state + _aes_xcbc_lens], xmm0
;; All four lane slots popped? 0xff is the empty-stack sentinel. If lanes
;; remain free, the routine returns NULL (branch is on an elided line).
156 cmp unused_lanes, 0xff
;; All lanes busy: find the lane that finishes first. phminposuw gives
;; min 16-bit length in word 0 of xmm1 and its index in word 1.
161 phminposuw xmm1, xmm0
162 pextrw len2, xmm1, 0 ; min value
163 pextrw idx, xmm1, 1 ; min index (0...3)
;; Broadcast min length to all words and subtract it from every lane's
;; length so the core can run exactly len2 bytes on all 4 lanes.
;; NOTE(review): the `psubw xmm0, xmm1` lives on an elided line.
167 pshuflw xmm1, xmm1, 0
169 movdqa [state + _aes_xcbc_lens], xmm0
171 ; "state" and "args" are the same address, arg1
;; NOTE(review): `call AES_XCBC_X4` with len2 as arg2 is elided here.
174 ; state and idx are intact
177 ; process completed job "idx"
184 imul lane_data, idx, _XCBC_LANE_DATA_size ; sic: orig line 178
179 lea lane_data, [state + _aes_xcbc_ldata + lane_data]
;; If the lane's final block has not yet gone through the core, submit it
;; now as a 16-byte job and loop back (the jnz and jmp are elided): set the
;; flag, force the lane length to 16, and point the input at final_block.
180 cmp dword [lane_data + _xcbc_final_done], 0
183 mov dword [lane_data + _xcbc_final_done], 1
184 mov word [state + _aes_xcbc_lens + 2*idx], 16
185 lea tmp, [lane_data + _xcbc_final_block]
186 mov [state + _aes_xcbc_args_in + 8*idx], tmp
187 movdqa xmm0, [state + _aes_xcbc_lens]
191 ; process completed job "idx"
;; Job in lane idx is fully processed: return it to the caller (job_rax)
;; and push the lane back onto the unused_lanes stack (the shl/or push
;; instructions are elided).
192 mov job_rax, [lane_data + _xcbc_job_in_lane]
193 mov icv, [job_rax + _auth_tag_output]
194 mov unused_lanes, [state + _aes_xcbc_unused_lanes]
195 mov qword [lane_data + _xcbc_job_in_lane], 0
196 or dword [job_rax + _status], STS_COMPLETED_HMAC
199 shl idx, 4 ; multiply by 16
200 mov [state + _aes_xcbc_unused_lanes], unused_lanes
;; Copy the 12-byte truncated ICV to the job's auth_tag_output:
;; NOTE(review): the movq writing bytes 0-7 of the tag is on an elided
;; line; pextrd below writes bytes 8-11 (dword 2 of the ICV).
203 movdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
205 pextrd [icv + 8], xmm0, 2
;; SAFE_DATA scrubbing: overwrite the lane's key material copies.
;; NOTE(review): a `pxor xmm0, xmm0` zeroing xmm0 before these stores is
;; on an elided line — these stores are meant to write zeros.
210 movdqa [state + _aes_xcbc_args_ICV + idx], xmm0
212 ;; Clear final block (32 bytes)
213 movdqa [lane_data + _xcbc_final_block], xmm0
214 movdqa [lane_data + _xcbc_final_block + 16], xmm0
;; Epilogue: restore callee-saved GPRs (rsi/rdi are Windows-only) and the
;; original stack pointer; the `ret` follows on an elided line.
219 mov rbx, [rsp + _gpr_save + 8*0]
220 mov rbp, [rsp + _gpr_save + 8*1]
221 mov r12, [rsp + _gpr_save + 8*2]
222 mov r13, [rsp + _gpr_save + 8*3]
223 mov r14, [rsp + _gpr_save + 8*4]
224 mov r15, [rsp + _gpr_save + 8*5]
226 mov rsi, [rsp + _gpr_save + 8*6]
227 mov rdi, [rsp + _gpr_save + 8*7]
229 mov rsp, [rsp + _rsp_save] ; original SP
;; slow_copy path — message length not a multiple of 16.
;; First the <= 16-byte special case: process entirely from final_block.
234 ; For buffers <= 16 Bytes
235 ; The input data is set to final block
236 lea tmp, [lane_data + _xcbc_final_block] ; final block
237 mov [state + _aes_xcbc_args_in + lane*8], tmp
238 add p, len ; set point to end of data
;; General partial-block case: copy the trailing last_len bytes so they
;; end exactly at final_block+16, then lay the {0x80,0,...} padding block
;; at final_block+16; the 16 bytes at p2 = final_block+16-last_len are
;; then (tail of M || 0x80 || zeros) == padded M[n].
243 and len, ~15 ; take final block off len
244 sub p, last_len ; adjust data pointer
245 lea p2, [lane_data + _xcbc_final_block + 16] ; upper part of final
246 sub p2, last_len ; adjust data pointer backwards
247 memcpy_sse_16_1 p2, p, last_len, tmp, tmp2
248 movdqa xmm0, [rel x80] ; fill reg with padding
249 movdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding
250 movdqu xmm0, [p2] ; load final block to process
;; Partial final block uses K3 (not K2) per RFC 3566.
251 mov tmp, [job + _k3] ; load K3 address
252 movdqu xmm1, [tmp] ; load K3
253 pxor xmm0, xmm1 ; M[n] XOR K3
254 movdqu [lane_data + _xcbc_final_block], xmm0 ; write final block
;; NOTE(review): the `jmp end_fast_copy` rejoining the common path is on
;; an elided line.
262 section .note.GNU-stack noalloc noexec nowrite progbits