;;
;; Copyright (c) 2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "include/os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"

%include "include/reg_sizes.asm"
%include "include/memcpy.asm"
%include "include/const.inc"
;%define DO_DBGPRINT
%include "include/dbgprint.asm"

;; Default to the SSE x4 AES-128-CBC-MAC core and SSE entry point names;
;; an including file may pre-define these to reuse the macro for another arch.
%ifndef AES128_CBC_MAC

%define AES128_CBC_MAC aes128_cbc_mac_x4
%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_sse
%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_sse

%endif

extern AES128_CBC_MAC

section .data
default rel

;; Per-lane word masks, OR-ed into the packed lens vector during FLUSH to
;; force an empty lane's length to 0xFFFF so it never wins phminposuw.
align 16
len_masks:
        ;ddq 0x0000000000000000000000000000FFFF
        dq 0x000000000000FFFF, 0x0000000000000000
        ;ddq 0x000000000000000000000000FFFF0000
        dq 0x00000000FFFF0000, 0x0000000000000000
        ;ddq 0x00000000000000000000FFFF00000000
        dq 0x0000FFFF00000000, 0x0000000000000000
        ;ddq 0x0000000000000000FFFF000000000000
        dq 0xFFFF000000000000, 0x0000000000000000
one:    dq 1
two:    dq 2
three:  dq 3

section .text

%define APPEND(a,b) a %+ b

%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%else
%define arg1    rcx
%define arg2    rdx
%endif

%define state   arg1
%define job     arg2
%define len2    arg2

%define job_rax rax

; idx needs to be in rbp
%define len     rbp
%define idx     rbp
%define tmp     rbp

%define lane    r8

%define iv      r9
%define m_last  r10
%define n       r11

%define unused_lanes    rbx
%define r               rbx

%define tmp3    r12
%define tmp4    r13
%define tmp2    r14

%define good_lane       r15
%define rbits           r15

; STACK_SPACE needs to be an odd multiple of 8
; This routine and its callee clobbers all GPRs
struc STACK
_gpr_save:      resq    8
_rsp_save:      resq    1
endstruc
111 | ||
;;; ===========================================================================
;;; ===========================================================================
;;; MACROS
;;; ===========================================================================
;;; ===========================================================================

;;; ===========================================================================
;;; AES CMAC job submit & flush
;;; ===========================================================================
;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
;;;
;;; SUBMIT: arg1 = state (MB_MGR_CMAC_OOO *), arg2 = job (JOB_AES_HMAC *)
;;; FLUSH:  arg1 = state
;;; Return: job_rax (rax) = completed job pointer, or NULL if none completed.
;;; Saves/restores all callee-saved GPRs on a private aligned stack frame;
;;; the AES core it calls is documented above as clobbering all GPRs.
%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE 1
%define %%SUBMIT_FLUSH %1

        ;; Build a 16-byte aligned stack frame and save callee-saved GPRs
        ;; (plus rsi/rdi on Windows, where they are callee-saved too).
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        ;; Find free lane
        mov     unused_lanes, [state + _aes_cmac_unused_lanes]

%ifidn %%SUBMIT_FLUSH, SUBMIT

        ;; Pop the next free lane index from the 4-bit-per-entry stack.
        mov     lane, unused_lanes
        and     lane, 0xF
        shr     unused_lanes, 4
        mov     [state + _aes_cmac_unused_lanes], unused_lanes

        ;; Copy job info into lane
        mov     [state + _aes_cmac_job_in_lane + lane*8], job
        ;; Copy keys into lane args
        mov     tmp, [job + _key_expanded]
        mov     [state + _aes_cmac_args_keys + lane*8], tmp
        mov     tmp, lane
        shl     tmp, 4                  ; lane*16

        ;; Zero IV to store digest
        pxor    xmm0, xmm0
        movdqa  [state + _aes_cmac_args_IV + tmp], xmm0

        lea     m_last, [state + _aes_cmac_scratch + tmp]

        ;; calculate len
        ;; convert bits to bytes (message length in bits for CMAC)
        mov     len, [job + _msg_len_to_hash_in_bits]
        mov     rbits, len
        add     len, 7                  ; inc len if there are remainder bits
        shr     len, 3
        and     rbits, 7                ; rbits = leftover bits (CMAC-3GPP case)

        ;; Check at least 1 or more blocks (get n = number of 16B blocks)
        mov     n, len
        add     n, 0xf
        shr     n, 4

        ;; Check for partial block (r = bytes in final partial block)
        mov     r, len
        and     r, 0xf

        or      n, n                    ; check one or more blocks?
        jz      %%_lt_one_block

        ;; One or more blocks, potentially partial
        mov     word [state + _aes_cmac_init_done + lane*2], 0

        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _aes_cmac_args_in + lane*8], tmp2

        ;; len = (n-1)*16 — the final block is processed separately as M_last
        lea     tmp2, [n - 1]
        shl     tmp2, 4
        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        ;; check remainder bits
        or      rbits, rbits
        jnz     %%_not_complete_block_3gpp

        ;; check if complete block
        or      r, r
        jz      %%_complete_block

%%_not_complete_block:
        ;; M_last = padding(M_n) XOR K2
        lea     tmp, [rel padding_0x80_tab16 + 16]
        sub     tmp, r
        movdqu  xmm0, [tmp]
        movdqa  [m_last], xmm0

        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp, tmp3               ; tmp -> start of final partial block

        memcpy_sse_16 m_last, tmp, r, tmp4, iv

        ;; src + n + r
        mov     tmp3, [job + _skey2]
        movdqa  xmm1, [m_last]
        movdqu  xmm0, [tmp3]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

%%_step_5:
        ;; Find min length
        movdqa  xmm0, [state + _aes_cmac_lens]
        phminposuw xmm1, xmm0

        ;; Only start processing when no lanes remain free (0xf marker)
        cmp     byte [state + _aes_cmac_unused_lanes], 0xf
        jne     %%_return_null

%else ; end SUBMIT

        ;; Check at least one job
        bt      unused_lanes, 19
        jc      %%_return_null

        ;; Find a lane with a non-null job
        xor     good_lane, good_lane
        cmp     qword [state + _aes_cmac_job_in_lane + 1*8], 0
        cmovne  good_lane, [rel one]
        cmp     qword [state + _aes_cmac_job_in_lane + 2*8], 0
        cmovne  good_lane, [rel two]
        cmp     qword [state + _aes_cmac_job_in_lane + 3*8], 0
        cmovne  good_lane, [rel three]

        ; Copy good_lane to empty lanes so the x4 core has valid work everywhere
        mov     tmp2, [state + _aes_cmac_args_in + good_lane*8]
        mov     tmp3, [state + _aes_cmac_args_keys + good_lane*8]
        shl     good_lane, 4            ; multiply by 16
        movdqa  xmm2, [state + _aes_cmac_args_IV + good_lane]
        movdqa  xmm0, [state + _aes_cmac_lens]

%assign I 0
%rep 4
        cmp     qword [state + _aes_cmac_job_in_lane + I*8], 0
        jne     APPEND(skip_,I)
        mov     [state + _aes_cmac_args_in + I*8], tmp2
        mov     [state + _aes_cmac_args_keys + I*8], tmp3
        movdqa  [state + _aes_cmac_args_IV + I*16], xmm2
        por     xmm0, [rel len_masks + 16*I]    ; empty lane len -> 0xFFFF
APPEND(skip_,I):
%assign I (I+1)
%endrep
        ;; Find min length
        phminposuw xmm1, xmm0

%endif ; end FLUSH

%%_cmac_round:
        pextrw  len2, xmm1, 0           ; min value
        pextrw  idx, xmm1, 1            ; min index (0...3)
        cmp     len2, 0
        je      %%_len_is_0
        ;; Subtract the min from every lane's length before running the core
        pshuflw xmm1, xmm1, 0
        psubw   xmm0, xmm1
        movdqa  [state + _aes_cmac_lens], xmm0

        ; "state" and "args" are the same address, arg1
        ; len2 is arg2
        call    AES128_CBC_MAC
        ; state and idx are intact

        movdqa  xmm0, [state + _aes_cmac_lens]  ; preload lens
%%_len_is_0:
        ; Check if job complete
        test    word [state + _aes_cmac_init_done + idx*2], 0xffff
        jnz     %%_copy_complete_digest

        ; Finish step 6: queue the 16-byte M_last block for this lane
        mov     word [state + _aes_cmac_init_done + idx*2], 1

        XPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        phminposuw xmm1, xmm0           ; find min length

        mov     tmp3, idx
        shl     tmp3, 4                 ; idx*16
        lea     m_last, [state + _aes_cmac_scratch + tmp3]
        mov     [state + _aes_cmac_args_in + idx*8], m_last

        jmp     %%_cmac_round

%%_copy_complete_digest:
        ; Job complete, copy digest to AT output
        mov     job_rax, [state + _aes_cmac_job_in_lane + idx*8]

        mov     tmp4, idx
        shl     tmp4, 4
        lea     tmp3, [state + _aes_cmac_args_IV + tmp4]
        mov     tmp4, [job_rax + _auth_tag_output_len_in_bytes]
        mov     tmp2, [job_rax + _auth_tag_output]

        cmp     tmp4, 16
        jne     %%_ne_16_copy

        ;; 16 byte AT copy
        movdqu  xmm0, [tmp3]
        movdqu  [tmp2], xmm0
        jmp     %%_update_lanes

%%_ne_16_copy:
        memcpy_sse_16 tmp2, tmp3, tmp4, lane, iv

%%_update_lanes:
        ; Update unused lanes: push idx back on the free-lane stack
        mov     unused_lanes, [state + _aes_cmac_unused_lanes]
        shl     unused_lanes, 4
        or      unused_lanes, idx
        mov     [state + _aes_cmac_unused_lanes], unused_lanes

        ; Set return job
        mov     job_rax, [state + _aes_cmac_job_in_lane + idx*8]

        mov     qword [state + _aes_cmac_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC

%ifdef SAFE_DATA
        pxor    xmm0, xmm0
%ifidn %%SUBMIT_FLUSH, SUBMIT
        ;; Clear digest (in memory for IV) and scratch memory of returned job
        movdqa  [tmp3], xmm0

        shl     idx, 4
        movdqa  [state + _aes_cmac_scratch + idx], xmm0

%else
        ;; Clear digest and scratch memory of returned job and "NULL lanes"
%assign I 0
%rep 4
        cmp     qword [state + _aes_cmac_job_in_lane + I*8], 0
        jne     APPEND(skip_clear_,I)
        movdqa  [state + _aes_cmac_args_IV + I*16], xmm0
        movdqa  [state + _aes_cmac_scratch + I*16], xmm0
APPEND(skip_clear_,I):
%assign I (I+1)
%endrep
%endif ;; SUBMIT

%endif ;; SAFE_DATA

%%_return:
        ;; Restore callee-saved GPRs and the caller's stack pointer
        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP
        ret

%%_return_null:
        xor     job_rax, job_rax
        jmp     %%_return

%ifidn %%SUBMIT_FLUSH, SUBMIT
%%_complete_block:

        ;; Block size aligned
        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp2, tmp3

        ;; M_last = M_n XOR K1
        mov     tmp3, [job + _skey1]
        movdqu  xmm0, [tmp3]
        movdqu  xmm1, [tmp2]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

        jmp     %%_step_5

%%_lt_one_block:
        ;; Single partial block: skip init pass, process M_last directly
        mov     word [state + _aes_cmac_init_done + lane*2], 1
        mov     [state + _aes_cmac_args_in + lane*8], m_last

        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        mov     n, 1
        jmp     %%_not_complete_block

%%_not_complete_block_3gpp:
        ;; bit pad last block
        ;; xor with skey2
        ;; copy to m_last

        ;; load pointer to src
        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp, tmp3

        ;; check if partial block
        or      r, r
        jz      %%_load_full_block_3gpp

        simd_load_sse_15_1 xmm0, tmp, r
        dec     r

%%_update_mlast_3gpp:
        ;; set last byte padding mask
        ;; shift into correct xmm idx

        ;; save and restore rcx on windows (rcx is arg1/state there)
%ifndef LINUX
        mov     tmp, rcx
%endif
        mov     rcx, rbits
        mov     tmp3, 0xff
        shr     tmp3, cl                ; keep only the valid high bits
        movq    xmm2, tmp3
        XPSLLB  xmm2, r, xmm1, tmp2

        ;; pad final byte
        pandn   xmm2, xmm0
%ifndef LINUX
        mov     rcx, tmp
%endif
        ;; set OR mask to pad final bit
        mov     tmp2, tmp3
        shr     tmp2, 1
        xor     tmp2, tmp3              ; XOR to get OR mask
        movq    xmm3, tmp2
        ;; xmm1 contains shift table from previous shift
        pshufb  xmm3, xmm1

        ;; load skey2 address
        mov     tmp3, [job + _skey2]
        movdqu  xmm1, [tmp3]

        ;; set final padding bit
        por     xmm2, xmm3

        ;; XOR last partial block with skey2
        ;; update mlast
        pxor    xmm2, xmm1
        movdqa  [m_last], xmm2

        jmp     %%_step_5

%%_load_full_block_3gpp:
        movdqu  xmm0, [tmp]
        mov     r, 0xf
        jmp     %%_update_mlast_3gpp
%endif
%endmacro
483 | ||
484 | ||
align 64
; JOB_AES_HMAC * submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
; arg 1 : state
; arg 2 : job
; Returns a completed job pointer in rax, or NULL if no job completed yet.
MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
SUBMIT_JOB_AES_CMAC_AUTH:
        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE SUBMIT
492 | ||
; JOB_AES_HMAC * flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state)
; arg 1 : state
; Returns a completed job pointer in rax, or NULL if all lanes are empty.
MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
FLUSH_JOB_AES_CMAC_AUTH:
        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE FLUSH
498 | ||
499 | ||
%ifdef LINUX
;; Mark the stack non-executable so the linker does not force an executable stack
section .note.GNU-stack noalloc noexec nowrite progbits
%endif