]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;; |
2 | ;; Copyright (c) 2012-2018, Intel Corporation | |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
28 | %include "os.asm" | |
29 | %include "job_aes_hmac.asm" | |
30 | %include "mb_mgr_datastruct.asm" | |
31 | %include "memcpy.asm" | |
32 | %include "reg_sizes.asm" | |
9f95a23c | 33 | %include "const.inc" |
11fdf7f2 TL |
34 | |
35 | extern md5_x4x2_sse | |
36 | ||
37 | section .data | |
38 | default rel | |
39 | align 16 | |
40 | ;byteswap: ddq 0x0c0d0e0f08090a0b0405060700010203 | |
;; dupw: 16-byte PSHUFB control mask. In memory the bytes read 00 01 00 01 ...,
;; so every 16-bit word of the destination receives source bytes 0 and 1,
;; i.e. word 0 of the source is replicated into all eight word positions.
;; It is used below to broadcast the minimum lane length before subtracting it.
;; The value is written as two dq halves; the single ddq form is kept
;; commented out next to the label.
41 | dupw: ;ddq 0x01000100010001000100010001000100 | |
42 | dq 0x0100010001000100, 0x0100010001000100 | |
43 | ||
44 | section .text | |
45 | ||
;; Symbolic register names used by the routine below.
;; arg1/arg2 follow the OS selected by LINUX (rdi/rsi when defined, rcx/rdx
;; otherwise); reg3/reg4 are the two registers of {rdi, rsi, rcx, rdx} that
;; are NOT used for arguments in that configuration.
;;
;; NOTE: many names share one physical register, so the two names of a pair
;; can never hold live values at the same time:
;;   state                = arg1
;;   job / len2           = arg2
;;   last_len / idx       = rbp
;;   p / start_offset     = r11
;;   unused_lanes / tmp4  = rbx
;;   job_rax / len        = rax
;;   size_offset / tmp2   = reg3
;;   lane / tmp3          = reg4
;;   tmp / p2             = r9
46 | %if 1 | |
47 | %ifdef LINUX | |
48 | %define arg1 rdi | |
49 | %define arg2 rsi | |
50 | %define reg3 rcx | |
51 | %define reg4 rdx | |
52 | %else | |
53 | %define arg1 rcx | |
54 | %define arg2 rdx | |
55 | %define reg3 rdi | |
56 | %define reg4 rsi | |
57 | %endif | |
58 | ||
59 | %define state arg1 | |
60 | %define job arg2 | |
61 | %define len2 arg2 | |
62 | ||
63 | ||
64 | ; idx needs to be in rbp | |
65 | %define last_len rbp | |
66 | %define idx rbp | |
67 | ||
68 | %define p r11 | |
69 | %define start_offset r11 | |
70 | ||
71 | %define unused_lanes rbx | |
72 | %define tmp4 rbx | |
73 | ||
74 | %define job_rax rax | |
75 | %define len rax | |
76 | ||
77 | %define size_offset reg3 | |
78 | %define tmp2 reg3 | |
79 | ||
80 | %define lane reg4 | |
81 | %define tmp3 reg4 | |
82 | ||
83 | %define extra_blocks r8 | |
84 | ||
85 | %define tmp r9 | |
86 | %define p2 r9 | |
87 | ||
88 | %define lane_data r10 | |
89 | ||
90 | %endif | |
91 | ||
92 | ; This routine and/or the called routine clobbers all GPRs | |
;; Stack frame layout used by the prologue/epilogue of the submit routine:
;;   _gpr_save : 8 qword slots. Slots 0..5 hold rbx, rbp, r12, r13, r14, r15;
;;               slots 6..7 hold rsi, rdi and are only written when LINUX
;;               is not defined.
;;   _rsp_save : the caller's rsp, captured before the frame is carved out
;;               and rsp is aligned down to 16 bytes.
93 | struc STACK | |
94 | _gpr_save: resq 8 | |
95 | _rsp_save: resq 1 | |
96 | endstruc | |
97 | ||
;; ---------------------------------------------------------------------------
;; Submit one HMAC-MD5 job to the multi-lane out-of-order manager.
;;
;; The job is placed in the lane whose index is the low nibble of
;; state->_unused_lanes_md5. If, after taking that lane, the remaining
;; free-lane list does not equal 0xF, the routine returns 0 (NULL) straight
;; away. Otherwise it repeatedly runs md5_x4x2_sse on the lane with the
;; shortest remaining length until one job has finished both hash passes
;; (inner, then outer), writes that job's tag and returns its pointer.
;;
;; Out:  rax = completed JOB pointer, or 0 if nothing completed.
;; Lane lengths in _lens_md5 are 16-bit words counted in 64-byte blocks.
;; Saves/restores rbx, rbp, r12-r15 (plus rsi, rdi when LINUX is not defined).
;; ---------------------------------------------------------------------------
98 | ; JOB* submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job) | |
99 | ; arg 1 : rdi when LINUX is defined, rcx otherwise : state | |
100 | ; arg 2 : rsi when LINUX is defined, rdx otherwise : job | |
101 | MKGLOBAL(submit_job_hmac_md5_sse,function,internal) | |
102 | submit_job_hmac_md5_sse: | |
103 | ||
;; Prologue: remember the caller's rsp in rax, reserve the STACK frame and
;; align rsp down to 16 bytes; the original rsp is stored in _rsp_save.
104 | mov rax, rsp | |
105 | sub rsp, STACK_size | |
106 | and rsp, -16 | |
107 | ||
108 | mov [rsp + _gpr_save + 8*0], rbx | |
109 | mov [rsp + _gpr_save + 8*1], rbp | |
110 | mov [rsp + _gpr_save + 8*2], r12 | |
111 | mov [rsp + _gpr_save + 8*3], r13 | |
112 | mov [rsp + _gpr_save + 8*4], r14 | |
113 | mov [rsp + _gpr_save + 8*5], r15 | |
114 | %ifndef LINUX | |
115 | mov [rsp + _gpr_save + 8*6], rsi | |
116 | mov [rsp + _gpr_save + 8*7], rdi | |
117 | %endif | |
118 | mov [rsp + _rsp_save], rax ; original SP | |
119 | ||
;; Pop a free lane: lane = low nibble of the free list, the list is shifted
;; right by 4 and stored back. lane_data then points at this lane's record
;; inside state->_ldata_md5 (record stride _HMAC_SHA1_LANE_DATA_size).
120 | mov unused_lanes, [state + _unused_lanes_md5] | |
121 | mov lane, unused_lanes | |
122 | and lane, 0xF | |
123 | shr unused_lanes, 4 | |
124 | imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size | |
125 | lea lane_data, [state + _ldata_md5 + lane_data] | |
126 | mov [state + _unused_lanes_md5], unused_lanes | |
127 | mov len, [job + _msg_len_to_hash_in_bytes] | |
128 | mov tmp, len | |
129 | shr tmp, 6 ; divide by 64, len in terms of blocks | |
130 | ||
131 | mov [lane_data + _job_in_lane], job | |
132 | mov dword [lane_data + _outer_done], 0 | |
9f95a23c TL |
133 | |
;; NOTE(review): XPINSRW is defined in an include that is not visible here.
;; From its use it inserts a 16-bit value (5th operand) at a run-time word
;; index (4th operand); the 2nd and 3rd operands look like scratch registers
;; (p is not yet loaded at this point) - confirm against the macro.
134 | ;; insert len into proper lane | |
135 | movdqa xmm0, [state + _lens_md5] | |
136 | XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16 | |
137 | movdqa [state + _lens_md5], xmm0 | |
11fdf7f2 TL |
138 | |
;; last_len = len mod 64 (bytes in the final partial block).
;; extra_blocks = (last_len + 9 + 63) / 64 = number of 64-byte blocks needed
;; for the tail plus 9 trailer bytes (presumably the 0x80 pad byte and the
;; 8-byte length field - TODO confirm).
139 | mov last_len, len | |
140 | and last_len, 63 | |
141 | lea extra_blocks, [last_len + 9 + 63] | |
142 | shr extra_blocks, 6 | |
143 | mov [lane_data + _extra_blocks], DWORD(extra_blocks) | |
144 | ||
;; p = job->_src + job->_hash_start_src_offset_in_bytes; it becomes this
;; lane's data pointer.
145 | mov p, [job + _src] | |
146 | add p, [job + _hash_start_src_offset_in_bytes] | |
147 | mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p | |
148 | ||
149 | cmp len, 64 | |
150 | jb copy_lt64 | |
151 | ||
;; len >= 64: copy the LAST 64 bytes of the message into _extra_block[0..63]
;; (unaligned loads from the source, aligned stores into the lane record).
152 | fast_copy: | |
153 | add p, len | |
154 | movdqu xmm0, [p - 64 + 0*16] | |
155 | movdqu xmm1, [p - 64 + 1*16] | |
156 | movdqu xmm2, [p - 64 + 2*16] | |
157 | movdqu xmm3, [p - 64 + 3*16] | |
158 | movdqa [lane_data + _extra_block + 0*16], xmm0 | |
159 | movdqa [lane_data + _extra_block + 1*16], xmm1 | |
160 | movdqa [lane_data + _extra_block + 2*16], xmm2 | |
161 | movdqa [lane_data + _extra_block + 3*16], xmm3 | |
162 | end_fast_copy: | |
163 | ||
;; size_offset  = extra_blocks*64 - last_len + 64 - 8 : byte offset inside
;;                _extra_block of the 8-byte length field.
;; start_offset = 64 - last_len : byte offset inside _extra_block where the
;;                message tail begins. (start_offset reuses r11, so p is
;;                dead from here on.)
164 | mov size_offset, extra_blocks | |
165 | shl size_offset, 6 | |
166 | sub size_offset, last_len | |
167 | add size_offset, 64-8 | |
168 | mov [lane_data + _size_offset], DWORD(size_offset) | |
169 | mov start_offset, 64 | |
170 | sub start_offset, last_len | |
171 | mov [lane_data + _start_offset], DWORD(start_offset) | |
172 | ||
;; Length field = 8 * (64 + len), i.e. a bit count that includes one extra
;; 64-byte block (presumably the key^ipad block already folded into the
;; initial digest loaded below - TODO confirm). Stored without byte swap.
173 | lea tmp, [8*64 + 8*len] | |
174 | ; bswap tmp | |
175 | mov [lane_data + _extra_block + size_offset], tmp | |
176 | ||
;; Load 16 bytes through job->_auth_key_xor_ipad and scatter the four dwords
;; into this lane's slots of the digest array (word stride
;; MD5_DIGEST_WORD_SIZE per lane, row stride MD5_DIGEST_ROW_SIZE per word).
177 | mov tmp, [job + _auth_key_xor_ipad] | |
178 | movdqu xmm0, [tmp] | |
179 | movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0 | |
180 | pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 | |
181 | pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 | |
182 | pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 | |
183 | ||
;; Any bit above bit 5 set => len >= 64: whole source blocks are hashed
;; first, so the lane setup done above is kept as is.
184 | test len, ~63 | |
185 | jnz ge64_bytes | |
186 | ||
;; len < 64: there are no whole source blocks. Point the lane directly at
;; the padded tail in _extra_block, set its length to extra_blocks and clear
;; the stored _extra_blocks so the tail is not scheduled a second time.
187 | lt64_bytes: | |
9f95a23c TL |
188 | movdqa xmm0, [state + _lens_md5] |
189 | XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 | |
190 | movdqa [state + _lens_md5], xmm0 | |
191 | ||
11fdf7f2 TL |
192 | lea tmp, [lane_data + _extra_block + start_offset] |
193 | mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp | |
194 | mov dword [lane_data + _extra_blocks], 0 | |
195 | ||
;; Only start hashing when the remaining free-lane list equals 0xF.
;; NOTE(review): 0xF looks like the end-of-list marker, i.e. every lane is
;; now occupied - confirm against the manager initialisation code.
196 | ge64_bytes: | |
197 | cmp unused_lanes, 0xf | |
198 | jne return_null | |
199 | jmp start_loop | |
200 | ||
201 | align 16 | |
;; Main scheduling loop. len2 aliases the job argument register, so the
;; incoming job pointer is overwritten here; it is re-read from the lane
;; record where needed.
202 | start_loop: | |
203 | ; Find min length | |
204 | movdqa xmm0, [state + _lens_md5] | |
205 | phminposuw xmm1, xmm0 | |
206 | pextrw len2, xmm1, 0 ; min value | |
207 | pextrw idx, xmm1, 1 ; min index (0...3) | |
208 | cmp len2, 0 | |
209 | je len_is_0 | |
210 | ||
;; Broadcast the minimum to every word and subtract it from all lane
;; lengths, then hash that many blocks on all lanes.
211 | pshufb xmm1, [rel dupw] ; duplicate words across all lanes | |
212 | psubw xmm0, xmm1 | |
213 | movdqa [state + _lens_md5], xmm0 | |
214 | ||
215 | ; "state" and "args" are the same address, arg1 | |
216 | ; len is arg2 | |
217 | call md5_x4x2_sse | |
218 | ; state and idx are intact | |
219 | ||
;; Lane idx has no blocks left in its current segment. Decide what is next:
;;   pending extra blocks -> proc_extra_blocks (hash the padded tail)
;;   outer hash done      -> end_loop          (job is complete)
;;   otherwise            -> proc_outer        (start the outer hash)
220 | len_is_0: | |
221 | ; process completed job "idx" | |
222 | imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size | |
223 | lea lane_data, [state + _ldata_md5 + lane_data] | |
224 | mov DWORD(extra_blocks), [lane_data + _extra_blocks] | |
225 | cmp extra_blocks, 0 | |
226 | jne proc_extra_blocks | |
227 | cmp dword [lane_data + _outer_done], 0 | |
228 | jne end_loop | |
229 | ||
;; Inner hash finished: mark the outer pass as started, zero the 8-byte
;; length field previously written into _extra_block, schedule exactly one
;; block from _outer_block, copy the 16-byte inner digest into _outer_block
;; and load the digest state from job->_auth_key_xor_opad.
230 | proc_outer: | |
231 | mov dword [lane_data + _outer_done], 1 | |
232 | mov DWORD(size_offset), [lane_data + _size_offset] | |
233 | mov qword [lane_data + _extra_block + size_offset], 0 | |
9f95a23c TL |
234 | |
235 | movdqa xmm0, [state + _lens_md5] | |
236 | XPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 | |
237 | movdqa [state + _lens_md5], xmm0 | |
238 | ||
11fdf7f2 TL |
239 | lea tmp, [lane_data + _outer_block] |
240 | mov job, [lane_data + _job_in_lane] | |
241 | mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp | |
242 | ||
;; Gather the lane's four digest words into one xmm register.
243 | movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] | |
244 | pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 | |
245 | pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 | |
246 | pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 | |
247 | ; pshufb xmm0, [rel byteswap] | |
248 | movdqa [lane_data + _outer_block], xmm0 | |
249 | ||
;; Replace the lane's digest state with the 16 bytes read through
;; job->_auth_key_xor_opad.
250 | mov tmp, [job + _auth_key_xor_opad] | |
251 | movdqu xmm0, [tmp] | |
252 | movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 | |
253 | pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 | |
254 | pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 | |
255 | pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 | |
256 | jmp start_loop | |
257 | ||
258 | align 16 | |
;; Whole source blocks are done: schedule the padded tail. Lane length
;; becomes extra_blocks, the data pointer becomes _extra_block + start_offset
;; and the stored _extra_blocks is cleared so this happens only once.
259 | proc_extra_blocks: | |
260 | mov DWORD(start_offset), [lane_data + _start_offset] | |
9f95a23c TL |
261 | |
262 | movdqa xmm0, [state + _lens_md5] | |
263 | XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 | |
264 | movdqa [state + _lens_md5], xmm0 | |
265 | ||
11fdf7f2 TL |
266 | lea tmp, [lane_data + _extra_block + start_offset] |
267 | mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp | |
268 | mov dword [lane_data + _extra_blocks], 0 | |
269 | jmp start_loop | |
270 | ||
271 | align 16 | |
272 | ||
;; Short-message path (len < 64), entered from the submit code above.
;; The copy destination is chosen so the data ENDS at _extra_block + 64,
;; matching start_offset = 64 - last_len computed after end_fast_copy.
;; NOTE(review): memcpy_sse_64_1 comes from an include not visible here;
;; tmp4, tmp2 and xmm0-xmm3 appear to be passed as scratch - confirm.
273 | copy_lt64: | |
274 | ;; less than one message block of data | |
275 | ;; beginning of source block | |
276 | ;; destination extrablock but backwards by len from where 0x80 pre-populated | |
277 | ;; tmp4 (passed to the memcpy macro) aliases unused_lanes (both rbx); reload unused_lanes before exiting | |
278 | lea p2, [lane_data + _extra_block + 64] | |
279 | sub p2, len | |
280 | memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 | |
281 | mov unused_lanes, [state + _unused_lanes_md5] | |
282 | jmp end_fast_copy | |
283 | ||
;; Nothing completed on this call: return 0 in rax.
284 | return_null: | |
285 | xor job_rax, job_rax | |
286 | jmp return | |
287 | ||
288 | align 16 | |
;; Lane idx has finished both passes: fetch its job as the return value,
;; clear the lane's job slot, OR STS_COMPLETED_HMAC into the job status and
;; push idx back onto the free-lane list as its new lowest nibble.
289 | end_loop: | |
290 | mov job_rax, [lane_data + _job_in_lane] | |
291 | mov unused_lanes, [state + _unused_lanes_md5] | |
292 | mov qword [lane_data + _job_in_lane], 0 | |
293 | or dword [job_rax + _status], STS_COMPLETED_HMAC | |
294 | shl unused_lanes, 4 | |
295 | or unused_lanes, idx | |
296 | mov [state + _unused_lanes_md5], unused_lanes | |
297 | ||
298 | mov p, [job_rax + _auth_tag_output] | |
299 | ||
;; The first three digest words (12 bytes) are always written to the tag
;; buffer; the fourth word is added unless the requested tag length is 12.
300 | ; copy 12 bytes | |
301 | mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] | |
302 | mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] | |
303 | mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] | |
304 | mov [p + 0*4], DWORD(tmp) | |
305 | mov [p + 1*4], DWORD(tmp2) | |
306 | mov [p + 2*4], DWORD(tmp3) | |
307 | ||
9f95a23c TL |
308 | cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12 |
309 | je return | |
310 | ||
311 | ; copy 16 bytes | |
312 | mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE] | |
313 | mov [p + 3*4], DWORD(tmp3) | |
314 | ||
;; Epilogue: restore the saved registers and the caller's rsp; rax already
;; holds the return value.
11fdf7f2 TL |
315 | return: |
316 | ||
317 | mov rbx, [rsp + _gpr_save + 8*0] | |
318 | mov rbp, [rsp + _gpr_save + 8*1] | |
319 | mov r12, [rsp + _gpr_save + 8*2] | |
320 | mov r13, [rsp + _gpr_save + 8*3] | |
321 | mov r14, [rsp + _gpr_save + 8*4] | |
322 | mov r15, [rsp + _gpr_save + 8*5] | |
323 | %ifndef LINUX | |
324 | mov rsi, [rsp + _gpr_save + 8*6] | |
325 | mov rdi, [rsp + _gpr_save + 8*7] | |
326 | %endif | |
327 | mov rsp, [rsp + _rsp_save] ; original SP | |
328 | ||
329 | ret | |
330 | ||
;; On LINUX builds, emit an empty .note.GNU-stack section (noalloc, noexec,
;; nowrite) so the linker does not mark the stack executable because of
;; this object file.
331 | %ifdef LINUX | |
332 | section .note.GNU-stack noalloc noexec nowrite progbits | |
333 | %endif |