2 ;; Copyright (c) 2012-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 %include "job_aes_hmac.asm"
30 %include "mb_mgr_datastruct.asm"
32 %include "reg_sizes.asm"
;; Shuffle-control constants.
;; byteswap is commented out; its only visible reference (a pshufb in the
;; outer-hash path) is commented out as well.
40 ;byteswap: ddq 0x0c0d0e0f08090a0b0405060700010203
;; dupw: pshufb control mask whose bytes are 00,01 repeated eight times, so
;; every destination word takes source bytes 0 and 1, i.e. the low word of
;; the source is replicated into all eight word positions.  Used to
;; broadcast the minimum lane length found by phminposuw.
;; NOTE(review): pshufb with a memory operand requires a 16-byte aligned
;; address; the alignment directive for dupw is not visible in this view.
41 dupw: ;ddq 0x01000100010001000100010001000100
42 dq 0x0100010001000100, 0x0100010001000100
;; Register-role aliases used by submit_job_hmac_md5_sse.
;; NOTE(review): only part of the alias list is visible in this view; the
;; routine also uses state, job, lane, lane_data, len, last_len, tmp..tmp4,
;; p, p2, idx, len2, job_rax and reg3, which are presumably defined on
;; lines not shown.
64 ; idx needs to be in rbp
;; start_offset: value stored to / reloaded from lane_data + _start_offset
;; and added to _extra_block to form the lane's data pointer.
69 %define start_offset r11
;; unused_lanes: working copy of state + _unused_lanes_md5.  rbx is also
;; reused as p2 in the short-message copy (see the "p2 clobbers
;; unused_lanes" comment there), after which it is reloaded from state.
71 %define unused_lanes rbx
;; size_offset: offset inside _extra_block at which the 8-byte message
;; length field is written.
77 %define size_offset reg3
;; extra_blocks: count kept in lane_data + _extra_blocks and inserted as
;; the lane length when the extra block(s) are scheduled.
83 %define extra_blocks r8
92 ; This routine and/or the called routine clobbers all GPRs
;; ---------------------------------------------------------------------
;; submit_job_hmac_md5_sse
;;
;; Places `job` into a lane of the HMAC-MD5 manager `state`: records the
;; job, its block count and its data pointer, prepares the lane's
;; extra_block buffer (message tail plus 8-byte bit-length field) and
;; writes an initial digest into the lane's digest column.  The later part
;; of the routine selects the lane with the smallest length, moves it
;; through the extra-block and outer-hash stages and, for a finished job,
;; copies the digest to job + _auth_tag_output and ORs STS_COMPLETED_HMAC
;; into the job status.
;;
;; In:  state (MB_MGR_HMAC_MD5_OOO *), job (JOB_AES_HMAC *)
;; Out: JOB * per the prototype below; NOTE(review): the code that sets
;;      the return value and executes ret is not visible in this view.
;;
;; NOTE(review): this view of the routine is missing lines (labels,
;; conditional jumps, the hash-routine call, the stack-frame setup and the
;; final ret among them).  The comments below describe only the
;; instructions that are visible; the control flow between them must be
;; confirmed against the complete file.
;; ---------------------------------------------------------------------
98 ; JOB* submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job)
101 MKGLOBAL(submit_job_hmac_md5_sse,function,internal)
102 submit_job_hmac_md5_sse:
;; Save general-purpose registers into the _gpr_save area of the local
;; frame.  rsi/rdi are saved in addition to rbx, rbp and r12-r15; they are
;; callee-saved in the Microsoft x64 ABI (NOTE(review): possibly guarded
;; by a platform conditional that is not visible here).
108 mov [rsp + _gpr_save + 8*0], rbx
109 mov [rsp + _gpr_save + 8*1], rbp
110 mov [rsp + _gpr_save + 8*2], r12
111 mov [rsp + _gpr_save + 8*3], r13
112 mov [rsp + _gpr_save + 8*4], r14
113 mov [rsp + _gpr_save + 8*5], r15
115 mov [rsp + _gpr_save + 8*6], rsi
116 mov [rsp + _gpr_save + 8*7], rdi
;; rax is expected to hold the caller's stack pointer at this point; the
;; instruction that copies rsp into rax is not visible in this view.
118 mov [rsp + _rsp_save], rax ; original SP
;; Take a lane from the unused-lanes list and locate its per-lane data:
;; lane_data = state + _ldata_md5 + lane * lane-data size.
;; NOTE(review): the instructions that isolate the lane index and remove
;; it from unused_lanes are not visible between these lines.
;; NOTE(review): the stride is _HMAC_SHA1_LANE_DATA_size although this is
;; the MD5 manager - presumably the lane structure is shared; confirm.
120 mov unused_lanes, [state + _unused_lanes_md5]
121 mov lane, unused_lanes
124 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
125 lea lane_data, [state + _ldata_md5 + lane_data]
126 mov [state + _unused_lanes_md5], unused_lanes
;; len = number of message bytes to hash.  tmp (presumably a copy of len
;; made on a line not shown) becomes the number of whole 64-byte blocks.
127 mov len, [job + _msg_len_to_hash_in_bytes]
129 shr tmp, 6 ; divide by 64, len in terms of blocks
;; Bind the job to the lane and clear the lane's outer_done flag.
131 mov [lane_data + _job_in_lane], job
132 mov dword [lane_data + _outer_done], 0
134 ;; insert len into proper lane
135 movdqa xmm0, [state + _lens_md5]
136 XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
137 movdqa [state + _lens_md5], xmm0
;; extra_blocks is derived from last_len + 9 + 63; presumably 9 covers the
;; 0x80 padding byte plus the 8-byte length field and 63 rounds up to a
;; whole block (NOTE(review): the shift that turns this byte count into a
;; block count is not visible here).
141 lea extra_blocks, [last_len + 9 + 63]
143 mov [lane_data + _extra_blocks], DWORD(extra_blocks)
;; p += hash start offset within the source buffer; this becomes the
;; lane's input data pointer.
146 add p, [job + _hash_start_src_offset_in_bytes]
147 mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p
;; Copy the 64 bytes that end at p into the lane's extra_block buffer
;; (unaligned loads from the message, aligned stores to the lane buffer).
;; NOTE(review): p is presumably advanced to the end of the message on a
;; line not shown before these loads.
154 movdqu xmm0, [p - 64 + 0*16]
155 movdqu xmm1, [p - 64 + 1*16]
156 movdqu xmm2, [p - 64 + 2*16]
157 movdqu xmm3, [p - 64 + 3*16]
158 movdqa [lane_data + _extra_block + 0*16], xmm0
159 movdqa [lane_data + _extra_block + 1*16], xmm1
160 movdqa [lane_data + _extra_block + 2*16], xmm2
161 movdqa [lane_data + _extra_block + 3*16], xmm3
;; size_offset = extra_blocks - last_len + (64-8): offset in _extra_block
;; of the 8-byte length field (NOTE(review): extra_blocks is presumably
;; scaled to bytes on a line not shown between the mov and the sub).
164 mov size_offset, extra_blocks
166 sub size_offset, last_len
167 add size_offset, 64-8
168 mov [lane_data + _size_offset], DWORD(size_offset)
;; start_offset -= last_len, then remembered in the lane; its initial
;; value is set on a line not shown.
170 sub start_offset, last_len
171 mov [lane_data + _start_offset], DWORD(start_offset)
;; tmp = 8 * (64 + len): bit length of len message bytes plus one more
;; 64-byte block (presumably the ipad block of the inner hash).  It is
;; written as the 8-byte length field at size_offset within extra_block.
173 lea tmp, [8*64 + 8*len]
175 mov [lane_data + _extra_block + size_offset], tmp
;; tmp = job's ipad pointer field.  The four dwords of xmm0 are then
;; scattered into digest rows 0..3 at this lane's column.
;; NOTE(review): the load of xmm0 from [tmp] is not visible in this view.
177 mov tmp, [job + _auth_key_xor_ipad]
179 movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0
180 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
181 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
182 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
;; Schedule the lane to hash its extra block(s) directly: lane length =
;; extra_blocks, data pointer = extra_block + start_offset, and the
;; stored extra_blocks count is cleared so they are not scheduled again.
;; NOTE(review): the condition under which this path is taken is not
;; visible in this view.
188 movdqa xmm0, [state + _lens_md5]
189 XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
190 movdqa [state + _lens_md5], xmm0
192 lea tmp, [lane_data + _extra_block + start_offset]
193 mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp
194 mov dword [lane_data + _extra_blocks], 0
;; Compare the remaining unused-lanes value with 0xf.
;; NOTE(review): presumably tests whether every lane is now occupied; the
;; branch that consumes these flags is not visible.
197 cmp unused_lanes, 0xf
;; Find the smallest lane length: phminposuw leaves the minimum word in
;; word 0 of xmm1 and its index in word 1.
204 movdqa xmm0, [state + _lens_md5]
205 phminposuw xmm1, xmm0
206 pextrw len2, xmm1, 0 ; min value
207 pextrw idx, xmm1, 1 ; min index (0...3)
;; Broadcast the minimum to every word of xmm1, then store the lengths.
;; NOTE(review): the instruction that subtracts the broadcast minimum
;; from xmm0 is presumably on a line not shown before the store.
211 pshufb xmm1, [rel dupw] ; duplicate words across all lanes
213 movdqa [state + _lens_md5], xmm0
;; NOTE(review): the call to the multi-lane hash routine that the next two
;; comments refer to is not visible in this view.
215 ; "state" and "args" are the same address, arg1
218 ; state and idx are intact
221 ; process completed job "idx"
;; Locate lane idx's data and reload its pending extra-block count.
;; NOTE(review): the compare that sets the flags for the jne below is not
;; visible; presumably it tests extra_blocks against zero.
222 imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
223 lea lane_data, [state + _ldata_md5 + lane_data]
224 mov DWORD(extra_blocks), [lane_data + _extra_blocks]
226 jne proc_extra_blocks
;; Test the outer_done flag (branch not visible), then set it to 1 as the
;; outer-hash stage is being set up.
227 cmp dword [lane_data + _outer_done], 0
231 mov dword [lane_data + _outer_done], 1
;; Zero the 8-byte length field written into extra_block at submit time.
232 mov DWORD(size_offset), [lane_data + _size_offset]
233 mov qword [lane_data + _extra_block + size_offset], 0
;; Set lane idx's length to 1 block.
235 movdqa xmm0, [state + _lens_md5]
236 XPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
237 movdqa [state + _lens_md5], xmm0
;; Point the lane at its outer_block buffer and reload the job pointer.
239 lea tmp, [lane_data + _outer_block]
240 mov job, [lane_data + _job_in_lane]
241 mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
;; Gather lane idx's four digest dwords (one per digest row) into xmm0
;; and store them as the first 16 bytes of outer_block.
243 movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
244 pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
245 pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
246 pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
247 ; pshufb xmm0, [rel byteswap]
248 movdqa [lane_data + _outer_block], xmm0
;; tmp = job's opad pointer field; xmm0 is then scattered into the lane's
;; digest column as the starting state for the outer hash.
;; NOTE(review): the load of xmm0 from [tmp] is not visible in this view.
250 mov tmp, [job + _auth_key_xor_opad]
252 movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
253 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
254 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
255 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
;; Extra-block stage for lane idx (presumably the proc_extra_blocks
;; target; the label itself is not visible): lane length = extra_blocks,
;; data pointer = extra_block + start_offset, stored count cleared.
260 mov DWORD(start_offset), [lane_data + _start_offset]
262 movdqa xmm0, [state + _lens_md5]
263 XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
264 movdqa [state + _lens_md5], xmm0
266 lea tmp, [lane_data + _extra_block + start_offset]
267 mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
268 mov dword [lane_data + _extra_blocks], 0
;; Short-message copy: the message bytes are copied so that they end at
;; extra_block + 64.
;; NOTE(review): the instruction that moves p2 back by len is not visible
;; between the lea and the memcpy macro.
274 ;; less than one message block of data
275 ;; beginning of source block
276 ;; destination extrablock but backwards by len from where 0x80 pre-populated
277 ;; p2 clobbers unused_lanes, undo before exiting
278 lea p2, [lane_data + _extra_block + 64]
280 memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
;; Reload unused_lanes, whose register was used as p2 above.
281 mov unused_lanes, [state + _unused_lanes_md5]
;; Job completion: fetch the finished job, empty the lane, flag the job
;; as HMAC-complete and write back the unused-lanes list.
;; NOTE(review): the instructions that push idx back onto unused_lanes
;; are not visible before the store.
290 mov job_rax, [lane_data + _job_in_lane]
291 mov unused_lanes, [state + _unused_lanes_md5]
292 mov qword [lane_data + _job_in_lane], 0
293 or dword [job_rax + _status], STS_COMPLETED_HMAC
296 mov [state + _unused_lanes_md5], unused_lanes
;; p = destination for the authentication tag.
298 mov p, [job_rax + _auth_tag_output]
;; Copy the first three digest dwords (12 bytes) of lane idx to the tag.
301 mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
302 mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
303 mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
304 mov [p + 0*4], DWORD(tmp)
305 mov [p + 1*4], DWORD(tmp2)
306 mov [p + 2*4], DWORD(tmp3)
;; Compare the requested tag length with 12 bytes.
;; NOTE(review): the branch is not visible; presumably the fourth dword
;; below is written only when more than 12 bytes were requested.
308 cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
312 mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
313 mov [p + 3*4], DWORD(tmp3)
;; Restore the registers saved on entry, then the caller's stack pointer.
317 mov rbx, [rsp + _gpr_save + 8*0]
318 mov rbp, [rsp + _gpr_save + 8*1]
319 mov r12, [rsp + _gpr_save + 8*2]
320 mov r13, [rsp + _gpr_save + 8*3]
321 mov r14, [rsp + _gpr_save + 8*4]
322 mov r15, [rsp + _gpr_save + 8*5]
324 mov rsi, [rsp + _gpr_save + 8*6]
325 mov rdi, [rsp + _gpr_save + 8*7]
327 mov rsp, [rsp + _rsp_save] ; original SP
;; Emit an empty .note.GNU-stack section flagged noexec.  GNU toolchains
;; conventionally read this as "this object does not need an executable
;; stack".  NOTE(review): usually wrapped in an ELF/Linux-only
;; conditional; no such guard is visible in this view.
332 section .note.GNU-stack noalloc noexec nowrite progbits