;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; In System V AMD64 ABI
;;      callee saves: RBX, RBP, R12-R15
;; Windows x64 ABI
;;      callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
;;
;; Registers:           RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
;;                      -----------------------------------------------------------
;; Windows clobbers:    RAX RCX RDX             R8 R9 R10 R11
;; Windows preserves:       RBX     RBP RSI RDI               R12 R13 R14 R15
;;                      -----------------------------------------------------------
;; Linux clobbers:      RAX RCX RDX     RSI RDI R8 R9 R10 R11
;; Linux preserves:         RBX     RBP                       R12 R13 R14 R15
;;                      -----------------------------------------------------------
;;
;; Linux/Windows clobbers: xmm0 - xmm15
;;
%include "include/os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"
%include "include/reg_sizes.asm"
%include "include/memcpy.asm"

;%define DO_DBGPRINT
%include "include/dbgprint.asm"
54 | extern sha1_ni | |
55 | ||
56 | section .data | |
57 | default rel | |
58 | ||
59 | align 16 | |
60 | byteswap: | |
61 | dq 0x0405060700010203 | |
62 | dq 0x0c0d0e0f08090a0b | |
63 | ||
64 | section .text | |
65 | ||
66 | %ifdef LINUX | |
67 | %define arg1 rdi | |
68 | %define arg2 rsi | |
69 | %define reg3 rcx | |
70 | %define reg4 rdx | |
71 | %else | |
72 | %define arg1 rcx | |
73 | %define arg2 rdx | |
74 | %define reg3 rdi | |
75 | %define reg4 rsi | |
76 | %endif | |
77 | ||
78 | %define state arg1 | |
79 | %define job arg2 | |
80 | %define len2 arg2 | |
81 | ||
82 | ; idx needs to be in rbx, rbp, r12-r15 | |
83 | %define last_len rbp | |
84 | %define idx rbp | |
85 | %define p4 rbp | |
86 | ||
87 | %define p r11 | |
88 | %define start_offset r11 | |
89 | ||
90 | %define unused_lanes rbx | |
91 | %define tmp4 rbx | |
92 | %define p3 rbx | |
93 | ||
94 | %define job_rax rax | |
95 | %define len rax | |
96 | ||
97 | %define size_offset reg3 | |
98 | %define tmp2 reg3 | |
99 | ||
100 | %define lane reg4 | |
101 | %define tmp3 reg4 | |
102 | ||
103 | %define extra_blocks r8 | |
104 | ||
105 | %define tmp r9 | |
106 | %define p2 r9 | |
107 | ||
108 | %define lane_data r10 | |
109 | ||
110 | struc STACK | |
111 | _gpr_save: resq 4 | |
112 | _rsp_save: resq 1 | |
113 | endstruc | |
114 | ||
115 | ; JOB* submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) | |
116 | ; arg 1 : rcx : state | |
117 | ; arg 2 : rdx : job | |
118 | MKGLOBAL(submit_job_hmac_ni_sse,function,internal) | |
119 | submit_job_hmac_ni_sse: | |
120 | ||
121 | mov rax, rsp | |
122 | sub rsp, STACK_size | |
123 | and rsp, -16 | |
124 | ||
125 | mov [rsp + _gpr_save + 8*0], rbx | |
126 | mov [rsp + _gpr_save + 8*1], rbp | |
127 | %ifndef LINUX | |
128 | mov [rsp + _gpr_save + 8*2], rsi | |
129 | mov [rsp + _gpr_save + 8*3], rdi | |
130 | %endif | |
131 | mov [rsp + _rsp_save], rax ; original SP | |
132 | ||
133 | DBGPRINTL "enter sha1-ni-sse submit" | |
134 | mov unused_lanes, [state + _unused_lanes] | |
135 | movzx lane, BYTE(unused_lanes) | |
136 | DBGPRINTL64 "lane: ", lane | |
137 | shr unused_lanes, 8 | |
138 | imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size | |
139 | lea lane_data, [state + _ldata + lane_data] | |
140 | mov [state + _unused_lanes], unused_lanes | |
141 | mov len, [job + _msg_len_to_hash_in_bytes] | |
142 | DBGPRINTL64 "length: ", len | |
143 | mov tmp, len | |
144 | shr tmp, 6 ; divide by 64, len in terms of blocks | |
145 | ||
146 | mov [lane_data + _job_in_lane], job | |
147 | mov dword [lane_data + _outer_done], 0 | |
148 | mov [state + _lens + 2*lane], WORD(tmp) | |
149 | ||
150 | mov last_len, len | |
151 | and last_len, 63 | |
152 | lea extra_blocks, [last_len + 9 + 63] | |
153 | shr extra_blocks, 6 | |
154 | mov [lane_data + _extra_blocks], DWORD(extra_blocks) | |
155 | ||
156 | mov p, [job + _src] | |
157 | add p, [job + _hash_start_src_offset_in_bytes] | |
158 | DBGPRINTL64 "src pointer + offset:", p | |
159 | mov [state + _args_data_ptr + PTR_SZ*lane], p | |
160 | cmp len, 64 | |
161 | jb copy_lt64 | |
162 | ||
163 | fast_copy: | |
164 | add p, len | |
165 | movdqu xmm0, [p - 64 + 0*16] | |
166 | movdqu xmm1, [p - 64 + 1*16] | |
167 | movdqu xmm2, [p - 64 + 2*16] | |
168 | movdqu xmm3, [p - 64 + 3*16] | |
169 | movdqa [lane_data + _extra_block + 0*16], xmm0 | |
170 | movdqa [lane_data + _extra_block + 1*16], xmm1 | |
171 | movdqa [lane_data + _extra_block + 2*16], xmm2 | |
172 | movdqa [lane_data + _extra_block + 3*16], xmm3 | |
173 | end_fast_copy: | |
174 | ||
175 | mov size_offset, extra_blocks | |
176 | shl size_offset, 6 | |
177 | sub size_offset, last_len | |
178 | add size_offset, 64-8 | |
179 | mov [lane_data + _size_offset], DWORD(size_offset) | |
180 | mov start_offset, 64 | |
181 | sub start_offset, last_len | |
182 | mov [lane_data + _start_offset], DWORD(start_offset) | |
183 | ||
184 | lea tmp, [8*64 + 8*len] | |
185 | bswap tmp | |
186 | mov [lane_data + _extra_block + size_offset], tmp | |
187 | ||
188 | mov tmp, [job + _auth_key_xor_ipad] | |
189 | movdqu xmm0, [tmp] | |
190 | mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] | |
191 | %if SHA1NI_DIGEST_ROW_SIZE != 20 | |
192 | %error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" | |
193 | %endif | |
194 | lea p4, [lane + lane*4] | |
195 | movdqu [state + _args_digest + p4*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0 | |
196 | mov [state + _args_digest + p4*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) | |
197 | test len, ~63 | |
198 | jnz ge64_bytes | |
199 | ||
200 | lt64_bytes: | |
201 | mov [state + _lens + 2*lane], WORD(extra_blocks) | |
202 | lea tmp, [lane_data + _extra_block + start_offset] | |
203 | mov [state + _args_data_ptr + PTR_SZ*lane], tmp | |
204 | mov dword [lane_data + _extra_blocks], 0 | |
205 | ||
206 | ge64_bytes: | |
207 | cmp unused_lanes, 0xff | |
208 | jne return_null | |
209 | jmp start_loop | |
210 | ||
211 | align 16 | |
212 | start_loop: | |
213 | ; Find min length - only two lanes available | |
214 | xor len2, len2 | |
215 | mov p3, 0x10000 | |
216 | mov WORD(len2), word [state + _lens + 0*2] ; [0:15] - lane 0 length, [16:31] - lane index (0) | |
217 | mov WORD(p3), word [state + _lens + 1*2] ; [0:15] - lane 1 length, [16:31] - lane index (1) | |
218 | cmp WORD(len2), WORD(p3) | |
219 | cmovg DWORD(len2), DWORD(p3) ; move if lane 0 length is greater than lane 1 length | |
220 | ||
221 | mov idx, len2 ; retrieve index & length from [16:31] and [0:15] bit fields | |
222 | shr DWORD(idx), 16 | |
223 | and DWORD(len2), 0xffff | |
224 | je len_is_0 | |
225 | ||
226 | sub word [state + _lens + 0*2], WORD(len2) | |
227 | sub word [state + _lens + 1*2], WORD(len2) | |
228 | ||
229 | ; "state" and "args" are the same address, arg1 | |
230 | ; len is arg2 | |
231 | call sha1_ni | |
232 | ; state is intact | |
233 | ||
234 | len_is_0: | |
235 | ; process completed job "idx" | |
236 | imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size | |
237 | lea lane_data, [state + _ldata + lane_data] | |
238 | mov DWORD(extra_blocks), [lane_data + _extra_blocks] | |
239 | cmp extra_blocks, 0 | |
240 | jne proc_extra_blocks | |
241 | cmp dword [lane_data + _outer_done], 0 | |
242 | jne end_loop | |
243 | ||
244 | proc_outer: | |
245 | mov dword [lane_data + _outer_done], 1 | |
246 | mov DWORD(size_offset), [lane_data + _size_offset] | |
247 | mov qword [lane_data + _extra_block + size_offset], 0 | |
248 | mov word [state + _lens + 2*idx], 1 | |
249 | lea tmp, [lane_data + _outer_block] | |
250 | mov job, [lane_data + _job_in_lane] | |
251 | mov [state + _args_data_ptr + PTR_SZ*idx], tmp | |
252 | ||
253 | %if SHA1NI_DIGEST_ROW_SIZE != 20 | |
254 | %error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" | |
255 | %endif | |
256 | lea p3, [idx + idx*4] | |
257 | movdqu xmm0, [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE] | |
258 | pshufb xmm0, [rel byteswap] | |
259 | mov DWORD(tmp), [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE] | |
260 | bswap DWORD(tmp) | |
261 | movdqa [lane_data + _outer_block], xmm0 | |
262 | mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) | |
263 | ||
264 | mov tmp, [job + _auth_key_xor_opad] | |
265 | movdqu xmm0, [tmp] | |
266 | mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] | |
267 | movdqu [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0 | |
268 | mov [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) | |
269 | jmp start_loop | |
270 | ||
271 | align 16 | |
272 | proc_extra_blocks: | |
273 | mov DWORD(start_offset), [lane_data + _start_offset] | |
274 | mov [state + _lens + 2*idx], WORD(extra_blocks) | |
275 | lea tmp, [lane_data + _extra_block + start_offset] | |
276 | mov [state + _args_data_ptr + PTR_SZ*idx], tmp | |
277 | mov dword [lane_data + _extra_blocks], 0 | |
278 | jmp start_loop | |
279 | ||
280 | align 16 | |
281 | copy_lt64: | |
282 | ;; less than one message block of data | |
283 | ;; beginning of source block | |
284 | ;; destination extrablock but backwards by len from where 0x80 pre-populated | |
285 | lea p2, [lane_data + _extra_block + 64] | |
286 | sub p2, len | |
287 | memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 | |
288 | mov unused_lanes, [state + _unused_lanes] | |
289 | jmp end_fast_copy | |
290 | ||
291 | return_null: | |
292 | xor job_rax, job_rax | |
293 | jmp return | |
294 | ||
295 | align 16 | |
296 | end_loop: | |
297 | mov job_rax, [lane_data + _job_in_lane] | |
298 | mov unused_lanes, [state + _unused_lanes] | |
299 | mov qword [lane_data + _job_in_lane], 0 | |
300 | or dword [job_rax + _status], STS_COMPLETED_HMAC | |
301 | shl unused_lanes, 8 | |
302 | or unused_lanes, idx | |
303 | mov [state + _unused_lanes], unused_lanes | |
304 | ||
305 | mov p, [job_rax + _auth_tag_output] | |
306 | ||
307 | ; copy 12 bytes | |
308 | %if SHA1NI_DIGEST_ROW_SIZE != 20 | |
309 | %error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" | |
310 | %endif | |
311 | lea idx, [idx + 4*idx] | |
312 | mov DWORD(tmp), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE] | |
313 | mov DWORD(tmp2), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE] | |
314 | mov DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE] | |
315 | bswap DWORD(tmp) | |
316 | bswap DWORD(tmp2) | |
317 | bswap DWORD(tmp3) | |
318 | mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) | |
319 | mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) | |
320 | mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) | |
321 | ||
9f95a23c | 322 | cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12 |
f67539c2 | 323 | je clear_ret |
9f95a23c TL |
324 | |
325 | ;; copy remaining 8 bytes to return 20 byte digest | |
326 | mov DWORD(tmp), [state + _args_digest + idx*4 + 3*SHA1_DIGEST_WORD_SIZE] | |
327 | mov DWORD(tmp2), [state + _args_digest + idx*4 + 4*SHA1_DIGEST_WORD_SIZE] | |
328 | bswap DWORD(tmp) | |
329 | bswap DWORD(tmp2) | |
330 | mov [p + 3*4], DWORD(tmp) | |
331 | mov [p + 4*4], DWORD(tmp2) | |
332 | ||
f67539c2 TL |
333 | clear_ret: |
334 | ||
335 | %ifdef SAFE_DATA | |
336 | pxor xmm0, xmm0 | |
337 | ;; Clear digest (20B), outer_block (20B) and extra_block (64B) | |
338 | ;; idx = 0 or 5 (depending on lane) | |
339 | movdqu [state + _args_digest + idx*4], xmm0 | |
340 | mov dword [state + _args_digest + idx*4 + 16], 0 | |
341 | ||
342 | shr idx, 2 ;; idx == 5 ? 1 : 0 | |
343 | imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size | |
344 | lea lane_data, [state + _ldata + lane_data] | |
345 | ;; Clear first 64 bytes of extra_block | |
346 | %assign offset 0 | |
347 | %rep 4 | |
348 | movdqa [lane_data + _extra_block + offset], xmm0 | |
349 | %assign offset (offset + 16) | |
350 | %endrep | |
351 | ||
352 | ;; Clear 20 bytes of outer_block | |
353 | movdqa [lane_data + _outer_block], xmm0 | |
354 | mov dword [lane_data + _outer_block + 16], 0 | |
355 | %endif | |
356 | ||
11fdf7f2 TL |
357 | return: |
358 | mov rbx, [rsp + _gpr_save + 8*0] | |
359 | mov rbp, [rsp + _gpr_save + 8*1] | |
360 | %ifndef LINUX | |
361 | mov rsi, [rsp + _gpr_save + 8*2] | |
362 | mov rdi, [rsp + _gpr_save + 8*3] | |
363 | %endif | |
364 | mov rsp, [rsp + _rsp_save] ; original SP | |
365 | ||
366 | ret | |
367 | ||
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif