;; (extraction artifacts removed; upstream commit: f67539c2)
;;
;; Copyright (c) 2019, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
%include "include/os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"
%include "constants.asm"
%include "include/reg_sizes.asm"

;; Default configuration: AES-128 CBC encrypt flush.
;; A wrapper file may pre-define these symbols before including this file
;; to build other key-size variants (presumably AES-192/256 — confirm).
%ifndef AES_CBC_ENC_X16
%define AES_CBC_ENC_X16 aes_cbc_enc_128_vaes_avx512
%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_vaes_avx512
%define NUM_KEYS 11                     ; AES-128 uses 11 round keys
%endif

; void AES_CBC_ENC_X16(AES_ARGS *args, UINT64 len_in_bytes);
extern AES_CBC_ENC_X16
section .text

%define APPEND(a,b) a %+ b

;; Argument registers: System V AMD64 on Linux, Microsoft x64 otherwise.
%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%else
%define arg1    rcx
%define arg2    rdx
%endif

%define state   arg1
%define job     arg2
%define len2    arg2            ; aliases 'job' - live ranges do not overlap

%define job_rax rax

%if 1
;; NOTE: several symbolic names below alias the same physical register;
;; the code depends on their live ranges being disjoint.
%define unused_lanes    rbx
%define tmp1            rbx

%define good_lane       rdx
%define iv              rdx

%define tmp2            rax

; idx needs to be in rbp
%define tmp             rbp
%define idx             rbp

%define tmp3            r8
%define tmp4            r9
%endif
77 | ||
;; Copy the IV of a good (non-NULL) lane into every NULL lane.
;; The per-lane IV slots are 16 bytes apart in _aes_args_IV.
%macro COPY_IV_TO_NULL_LANES 4
%define %%IDX           %1 ; [in] GP with good lane idx (scaled x16)
%define %%NULL_MASK     %2 ; [clobbered] GP to store NULL lane mask
%define %%XTMP          %3 ; [clobbered] temp XMM reg
%define %%MASK_REG      %4 ; [in] mask register (bit i set => lane i is NULL)

        vmovdqa64       %%XTMP, [state + _aes_args_IV + %%IDX]
        kmovw           DWORD(%%NULL_MASK), %%MASK_REG
;; Unrolled at assembly time: one bit-test + conditional store per lane.
%assign i 0
%rep 16
        bt              %%NULL_MASK, i
        jnc             %%_skip_copy %+ i
        vmovdqa64       [state + _aes_args_IV + (i*16)], %%XTMP
%%_skip_copy %+ i:
%assign i (i + 1)
%endrep

%endmacro
97 | ||
;; Zero the IV slots of all NULL lanes (SAFE_DATA scrubbing).
%macro CLEAR_IV_IN_NULL_LANES 3
%define %%NULL_MASK     %1 ; [clobbered] GP to store NULL lane mask
%define %%XTMP          %2 ; [clobbered] temp XMM reg (zeroed here)
%define %%MASK_REG      %3 ; [in] mask register (bit i set => lane i is NULL)

        vpxorq          %%XTMP, %%XTMP
        kmovw           DWORD(%%NULL_MASK), %%MASK_REG
;; Unrolled at assembly time: one bit-test + conditional store per lane.
%assign i 0
%rep 16
        bt              %%NULL_MASK, i
        jnc             %%_skip_clear %+ i
        vmovdqa64       [state + _aes_args_IV + (i*16)], %%XTMP
%%_skip_clear %+ i:
%assign i (i + 1)
%endrep

%endmacro
116 | ||
;; Copy the expanded round keys of a good lane into every NULL lane.
;; Key table layout (from the addressing below): each round is a row of
;; 16 lanes x 16 bytes = 256 bytes; lane slots are 16 bytes apart.
;; NOTE(review): copies all 15 rows (max round-key count) rather than
;; NUM_KEYS; extra rows for smaller key sizes are copied harmlessly, but
;; this is asymmetric with CLEAR_KEYS_IN_NULL_LANES below - confirm intent.
%macro COPY_KEYS_TO_NULL_LANES 5
%define %%IDX           %1 ; [in] GP with good lane idx (scaled x16)
%define %%NULL_MASK     %2 ; [clobbered] GP to store NULL lane mask
%define %%KEY_TAB       %3 ; [clobbered] GP to store key table pointer
%define %%XTMP          %4 ; [clobbered] temp XMM reg
%define %%MASK_REG      %5 ; [in] mask register (bit k set => lane k is NULL)

        lea             %%KEY_TAB, [state + _aes_args_key_tab]
        kmovw           DWORD(%%NULL_MASK), %%MASK_REG
%assign j 0 ; outer loop to iterate through round keys
%rep 15
        vmovdqa64       %%XTMP, [%%KEY_TAB + j + %%IDX]
%assign k 0 ; inner loop to iterate through lanes
%rep 16
        bt              %%NULL_MASK, k
        jnc             %%_skip_copy %+ j %+ _ %+ k
        vmovdqa64       [%%KEY_TAB + j + (k*16)], %%XTMP
%%_skip_copy %+ j %+ _ %+ k:
%assign k (k + 1)
%endrep

%assign j (j + 256)
%endrep

%endmacro
143 | ||
;; Zero the expanded round keys of all NULL lanes (SAFE_DATA scrubbing).
;; Clears NUM_KEYS rows of 256 bytes (16 lanes x 16 bytes) per lane.
;; NOTE(review): uses symbol _aesarg_key_tab while the copy macro above
;; uses _aes_args_key_tab - verify both resolve to the same offset in
;; mb_mgr_datastruct.asm.
%macro CLEAR_KEYS_IN_NULL_LANES 3
%define %%NULL_MASK     %1 ; [clobbered] GP to store NULL lane mask
%define %%XTMP          %2 ; [clobbered] temp XMM reg (zeroed here)
%define %%MASK_REG      %3 ; [in] mask register (bit k set => lane k is NULL)

        vpxorq          %%XTMP, %%XTMP
        kmovw           DWORD(%%NULL_MASK), %%MASK_REG
%assign k 0 ; outer loop to iterate through lanes
%rep 16
        bt              %%NULL_MASK, k
        jnc             %%_skip_clear %+ k
%assign j 0 ; inner loop to iterate through round keys
%rep NUM_KEYS
        vmovdqa64       [state + _aesarg_key_tab + j + (k*16)], %%XTMP
%assign j (j + 256)
%endrep
%%_skip_clear %+ k:
%assign k (k + 1)
%endrep

%endmacro
166 | ||
; STACK_SPACE needs to be an odd multiple of 8
; This routine and its callee clobbers all GPRs
struc STACK
_gpr_save:      resq    8       ; rbx, rbp, r12-r15 (+ rsi, rdi on Windows)
_rsp_save:      resq    1       ; caller's rsp, restored on exit
endstruc
173 | ||
;-----------------------------------------------------------------------
; JOB_AES_HMAC *FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
; arg 1 : state - out-of-order manager; same address as the AES_ARGS block
; arg 2 : job   - not referenced by the flush path
; Return: rax   - completed job pointer, or NULL if no lanes are in use
;
; Flush strategy: pick any lane holding a real job, duplicate its in/out
; pointers, IV and round keys into every empty (NULL-job) lane, force the
; NULL lanes' lengths to UINT16_MAX so they never win the min-length
; search, then run the 16-lane cipher for the shortest remaining length
; and retire that lane's job.
;
; This routine and its callee clobber all GPRs (saved/restored via STACK).
;-----------------------------------------------------------------------
MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal)
FLUSH_JOB_AES_ENC:

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16                ; keep 16-byte alignment for the call

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        ; check for empty
        cmp     qword [state + _aes_lanes_in_use], 0
        je      return_null

        ; find a lane with a non-null job: compare the 16 job pointers
        ; (two zmm loads of 8 pointers each) against zero
        vpxord          zmm0, zmm0, zmm0
        vmovdqu64       zmm1, [state + _aes_job_in_lane + (0*PTR_SZ)]
        vmovdqu64       zmm2, [state + _aes_job_in_lane + (8*PTR_SZ)]
        vpcmpq          k1, zmm1, zmm0, 4 ; NEQ
        vpcmpq          k2, zmm2, zmm0, 4 ; NEQ
        kmovw           DWORD(tmp), k1
        kmovw           DWORD(tmp1), k2
        mov             DWORD(tmp2), DWORD(tmp1)
        shl             DWORD(tmp2), 8
        or              DWORD(tmp2), DWORD(tmp) ; mask of non-null jobs in tmp2
        not             BYTE(tmp)
        kmovw           k4, DWORD(tmp)          ; k4 = NULL lanes 0-7
        not             BYTE(tmp1)
        kmovw           k5, DWORD(tmp1)         ; k5 = NULL lanes 8-15
        mov             DWORD(tmp), DWORD(tmp2)
        not             WORD(tmp)
        kmovw           k6, DWORD(tmp)  ; mask of NULL jobs in k4, k5 and k6
        mov             DWORD(tmp), DWORD(tmp2)
        xor             tmp2, tmp2
        bsf             WORD(tmp2), WORD(tmp)   ; index of the 1st set bit in tmp2

        ;; copy good lane data into NULL lanes
        ;; - in pointer (broadcast, masked store into NULL slots only)
        mov             tmp, [state + _aes_args_in + tmp2*8]
        vpbroadcastq    zmm1, tmp
        vmovdqa64       [state + _aes_args_in + (0*PTR_SZ)]{k4}, zmm1
        vmovdqa64       [state + _aes_args_in + (8*PTR_SZ)]{k5}, zmm1
        ;; - out pointer
        mov             tmp, [state + _aes_args_out + tmp2*8]
        vpbroadcastq    zmm1, tmp
        vmovdqa64       [state + _aes_args_out + (0*PTR_SZ)]{k4}, zmm1
        vmovdqa64       [state + _aes_args_out + (8*PTR_SZ)]{k5}, zmm1

        ;; - set len to UINT16_MAX so NULL lanes never win the min search
        mov             WORD(tmp), 0xffff
        vpbroadcastw    ymm3, WORD(tmp)
        vmovdqa64       ymm0, [state + _aes_lens]
        vmovdqu16       ymm0{k6}, ymm3
        vmovdqa64       [state + _aes_lens], ymm0

        ;; Find min length for lanes 0-7
        vphminposuw     xmm2, xmm0

        ;; scale up good lane idx before copying IV and keys
        shl             tmp2, 4
        ;; - copy IV to null lanes
        COPY_IV_TO_NULL_LANES tmp2, tmp1, xmm4, k6

        ; extract min length of lanes 0-7
        vpextrw         DWORD(len2), xmm2, 0    ; min value
        vpextrw         DWORD(idx), xmm2, 1     ; min index

        ;; - copy round keys to null lanes
        COPY_KEYS_TO_NULL_LANES tmp2, tmp1, tmp3, xmm4, k6

        ;; Update lens and find min for lanes 8-15
        vextracti128    xmm1, ymm0, 1
        vphminposuw     xmm2, xmm1
        vpextrw         DWORD(tmp3), xmm2, 0    ; min value
        ; values fit in 16 bits, so a signed compare is safe here
        cmp             DWORD(len2), DWORD(tmp3)
        jle             use_min
        vpextrw         DWORD(idx), xmm2, 1     ; min index
        add             DWORD(idx), 8           ; but index +8
        mov             len2, tmp3              ; min len
use_min:
        ; subtract the common minimum from every lane's length
        vpbroadcastw    ymm3, WORD(len2)
        vpsubw          ymm0, ymm0, ymm3
        vmovdqa         [state + _aes_lens], ymm0

        ; "state" and "args" are the same address, arg1
        ; len is arg2
        call            AES_CBC_ENC_X16
        ; state and idx are intact

len_is_0:
        ; process completed job "idx"
        mov     job_rax, [state + _aes_job_in_lane + idx*8]
        mov     unused_lanes, [state + _aes_unused_lanes]
        mov     qword [state + _aes_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_AES
        ; push the freed lane onto the 4-bit-per-entry free-lane stack
        shl     unused_lanes, 4
        or      unused_lanes, idx
        mov     [state + _aes_unused_lanes], unused_lanes
        sub     qword [state + _aes_lanes_in_use], 1

%ifdef SAFE_DATA
        ; Set bit of lane of returned job
        xor     DWORD(tmp3), DWORD(tmp3)
        bts     DWORD(tmp3), DWORD(idx)
        kmovw   k1, DWORD(tmp3)
        korw    k6, k1, k6

        ;; Clear IV and expanded keys of returned job and "NULL lanes"
        ;; (k6 contains the mask of the jobs)
        CLEAR_IV_IN_NULL_LANES tmp1, xmm0, k6
        CLEAR_KEYS_IN_NULL_LANES tmp1, xmm0, k6
%endif

return:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP

        ret

return_null:
        xor     job_rax, job_rax        ; return NULL
        jmp     return
317 | ||
;; Mark the stack non-executable on Linux/ELF builds.
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif