]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;; |
2 | ;; Copyright (c) 2012-2018, Intel Corporation | |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
28 | %include "os.asm" | |
29 | %include "job_aes_hmac.asm" | |
30 | %include "mb_mgr_datastruct.asm" | |
31 | %include "reg_sizes.asm" | |
32 | ;%define DO_DBGPRINT | |
33 | %include "dbgprint.asm" | |
34 | extern md5_x8x2_avx2 | |
35 | ||
;; Constant tables used by the flush routine below.
36 | section .data | |
37 | default rel | |
38 | align 16 | |
;; dupw, x80 and x00 are not referenced by any instruction in this file.
39 | dupw: ;ddq 0x01000100010001000100010001000100 | |
40 | dq 0x0100010001000100, 0x0100010001000100 | |
41 | x80: ;ddq 0x00000000000000000000000000000080 | |
42 | dq 0x0000000000000080, 0x0000000000000000 | |
43 | x00: ;ddq 0x00000000000000000000000000000000 | |
44 | dq 0x0000000000000000, 0x0000000000000000 | |
;; len_masks: eight 16-byte entries; entry I has 0xFFFF in 16-bit word I and
;; zero elsewhere. copy_lane_data ORs entry I into a vector of eight 16-bit
;; lengths (vpor ... [rel len_masks + 16*I]) so that word I becomes 0xFFFF.
45 | len_masks: | |
46 | ;ddq 0x0000000000000000000000000000FFFF | |
47 | dq 0x000000000000FFFF, 0x0000000000000000 | |
48 | ;ddq 0x000000000000000000000000FFFF0000 | |
49 | dq 0x00000000FFFF0000, 0x0000000000000000 | |
50 | ;ddq 0x00000000000000000000FFFF00000000 | |
51 | dq 0x0000FFFF00000000, 0x0000000000000000 | |
52 | ;ddq 0x0000000000000000FFFF000000000000 | |
53 | dq 0xFFFF000000000000, 0x0000000000000000 | |
54 | ;ddq 0x000000000000FFFF0000000000000000 | |
55 | dq 0x0000000000000000, 0x000000000000FFFF | |
56 | ;ddq 0x00000000FFFF00000000000000000000 | |
57 | dq 0x0000000000000000, 0x00000000FFFF0000 | |
58 | ;ddq 0x0000FFFF000000000000000000000000 | |
59 | dq 0x0000000000000000, 0x0000FFFF00000000 | |
60 | ;ddq 0xFFFF0000000000000000000000000000 | |
61 | dq 0x0000000000000000, 0xFFFF000000000000 | |
62 | ||
;; lane_N: qword holding the value N. The lane search loads these through
;; "cmovne idx, [rel APPEND(lane_,I)]"; CMOVcc takes no immediate operand,
;; so the lane numbers have to live in memory.
63 | lane_1: dq 1 | |
64 | lane_2: dq 2 | |
65 | lane_3: dq 3 | |
66 | lane_4: dq 4 | |
67 | lane_5: dq 5 | |
68 | lane_6: dq 6 | |
69 | lane_7: dq 7 | |
70 | lane_8: dq 8 | |
71 | lane_9: dq 9 | |
72 | lane_10: dq 10 | |
73 | lane_11: dq 11 | |
74 | lane_12: dq 12 | |
75 | lane_13: dq 13 | |
76 | lane_14: dq 14 | |
77 | lane_15: dq 15 | |
78 | ||
79 | section .text | |
80 | ||
;; Register aliases for flush_job_hmac_md5_avx2.
;; Several names map to the SAME physical register, so at most one name per
;; group may hold a live value at any point:
;;   arg1 : state, tmp3
;;   arg2 : job, len2, extra_blocks, p
;;   rbx  : unused_lanes, lane_data, tmp2
;;   rax  : job_rax, tmp1, size_offset, tmp, start_offset
;; tmp1 and tmp3 are not used by any instruction in this file.
;; The "%if 1" guard is always true.
81 | %if 1 | |
;; arg1/arg2 are the first two integer argument registers, selected by
;; whether LINUX is defined.
82 | %ifdef LINUX | |
83 | %define arg1 rdi | |
84 | %define arg2 rsi | |
85 | %else | |
86 | %define arg1 rcx | |
87 | %define arg2 rdx | |
88 | %endif | |
89 | ||
90 | %define state arg1 | |
91 | %define job arg2 | |
92 | %define len2 arg2 | |
93 | ||
94 | ||
95 | ; idx needs to be in rbp | |
96 | %define idx rbp | |
97 | ||
98 | %define unused_lanes rbx | |
99 | %define lane_data rbx | |
100 | %define tmp2 rbx | |
101 | ||
102 | %define job_rax rax | |
103 | %define tmp1 rax | |
104 | %define size_offset rax | |
105 | %define tmp rax | |
106 | %define start_offset rax | |
107 | ||
108 | %define tmp3 arg1 | |
109 | ||
110 | %define extra_blocks arg2 | |
111 | %define p arg2 | |
112 | ||
113 | %define tmp4 r8 | |
9f95a23c | 114 | %define tmp5 r9 |
11fdf7f2 TL |
115 | %define num_lanes_inuse r12 |
116 | %define len_upper r13 | |
117 | %define idx_upper r14 | |
118 | %endif | |
119 | ||
120 | ; This routine and/or the called routine clobbers all GPRs | |
;; Stack frame layout used by flush_job_hmac_md5_avx2:
;;   _gpr_save : 8 qword slots. The prologue stores rbx, rbp, r12, r13, r14,
;;               r15 in slots 0-5, and rsi, rdi in slots 6-7 when LINUX is
;;               not defined.
;;   _rsp_save : the caller's rsp, captured before the frame is carved out
;;               and realigned.
;; The struc also defines STACK_size, which the prologue subtracts from rsp.
121 | struc STACK | |
122 | _gpr_save: resq 8 | |
123 | _rsp_save: resq 1 | |
124 | endstruc | |
125 | ||
;; APPEND(a,b): paste two tokens together; used inside %rep loops to build
;; per-iteration names such as lane_3, lower_skip_3 and upper_skip_3.
126 | %define APPEND(a,b) a %+ b | |
127 | ||
128 | ; JOB* flush_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state) | |
129 | ; arg 1 : arg1 (rdi when LINUX is defined, rcx otherwise) : state | |
;;
;; Forces one in-flight job to completion and returns it.
;;   In : arg1 = state
;;   Out: rax  = pointer to the finished job (STS_COMPLETED_HMAC OR-ed into
;;               its _status), or 0 when _num_lanes_inuse_md5 is zero.
;; Outline:
;;   1. Pick a lane whose _job_in_lane is non-null.
;;   2. copy_lane_data: for every lane with a null job, copy the picked
;;      lane's data pointer and force its 16-bit length to 0xFFFF.
;;   3. start_loop0: take the minimum length over all 16 lanes, subtract it
;;      from every length and call md5_x8x2_avx2 (skipped when the minimum
;;      is already 0).
;;   4. len_is_0: for the minimum lane, queue its extra blocks, or set up
;;      the outer block, and go back to step 2; when both are done, finish
;;      the job in end_loop.
;; The tag written to _auth_tag_output is 12 bytes, or 16 bytes when
;; _auth_tag_output_len_in_bytes is not 12.
;; Saves and restores rbx, rbp, r12-r15 (and rsi, rdi when LINUX is not
;; defined); executes vzeroupper before returning.
130 | MKGLOBAL(flush_job_hmac_md5_avx2,function,internal) | |
131 | flush_job_hmac_md5_avx2: | |
132 | ||
;; Prologue: remember the caller's rsp in rax, carve out the STACK frame and
;; align rsp down to a 32-byte boundary.
133 | mov rax, rsp | |
134 | sub rsp, STACK_size | |
135 | and rsp, -32 | |
136 | ||
137 | mov [rsp + _gpr_save + 8*0], rbx | |
138 | mov [rsp + _gpr_save + 8*1], rbp | |
139 | mov [rsp + _gpr_save + 8*2], r12 | |
140 | mov [rsp + _gpr_save + 8*3], r13 | |
141 | mov [rsp + _gpr_save + 8*4], r14 | |
142 | mov [rsp + _gpr_save + 8*5], r15 | |
143 | %ifndef LINUX | |
144 | mov [rsp + _gpr_save + 8*6], rsi | |
145 | mov [rsp + _gpr_save + 8*7], rdi | |
146 | %endif | |
147 | mov [rsp + _rsp_save], rax ; original SP | |
148 | ||
149 | DBGPRINTL "---------- enter md5 flush -----------" | |
150 | mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] ;; empty? | |
151 | test num_lanes_inuse, num_lanes_inuse | |
152 | jz return_null | |
153 | ||
154 | ; find a lane with a non-null job -- flush does not have to be efficient! | |
;; idx starts at 0 and is overwritten by every lane 1..15 that holds a job,
;; so it ends up as the highest-numbered occupied lane (or 0).
;; The xor clobbers flags, which is harmless: the next instruction is a cmp.
155 | xor DWORD(idx), DWORD(idx) | |
156 | %assign I 1 | |
157 | %rep 15 | |
158 | cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 | |
159 | cmovne idx, [rel APPEND(lane_,I)] | |
160 | %assign I (I+1) | |
161 | %endrep | |
162 | ||
163 | ||
164 | copy_lane_data: | |
165 | ; copy good lane (idx) to empty lanes | |
;; tmp = data pointer of lane idx. Every lane with a null job receives this
;; pointer, and its length word is forced to 0xFFFF in xmm0/xmm1 via
;; len_masks.
166 | mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx] | |
167 | ;; tackle lower 8 lanes | |
168 | vmovdqa xmm0, [state + _lens_md5 + 0*16] ;; lower 8 lengths | |
169 | %assign I 0 | |
170 | %rep 8 | |
171 | cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 | |
172 | jne APPEND(lower_skip_,I) | |
173 | mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp | |
174 | vpor xmm0, xmm0, [rel len_masks + 16*I] | |
175 | APPEND(lower_skip_,I): | |
176 | %assign I (I+1) | |
177 | %endrep | |
178 | ;; tackle upper lanes | |
179 | vmovdqa xmm1, [state + _lens_md5 + 1*16] ;; upper 8 lengths | |
180 | %assign I 0 | |
181 | %rep 8 | |
182 | cmp qword [state + _ldata_md5 + (8 + I) * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 | |
183 | jne APPEND(upper_skip_,I) | |
184 | mov [state + _args_data_ptr_md5 + PTR_SZ*(8+I)], tmp | |
185 | vpor xmm1, xmm1, [rel len_masks + 16*I] | |
186 | APPEND(upper_skip_,I): | |
187 | %assign I (I+1) | |
188 | %endrep | |
;; Jump over any padding the following "align 32" may insert.
189 | jmp start_loop0 | |
190 | ||
191 | align 32 | |
192 | start_loop0: | |
193 | ; Find min length | |
194 | vphminposuw xmm2, xmm0 | |
195 | vpextrw DWORD(len2), xmm2, 0 ; min value | |
196 | vpextrw DWORD(idx), xmm2, 1 ; min index (0...7) | |
197 | ||
198 | vphminposuw xmm3, xmm1 | |
199 | vpextrw DWORD(len_upper), xmm3, 0 ; min value | |
200 | vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F) | |
201 | ||
;; Both operands are 16-bit lengths zero-extended by vpextrw, so compare
;; them as unsigned values. On a tie the lower half wins.
202 | cmp len2, len_upper | |
203 | jbe use_min | |
204 | ||
205 | min_in_high: | |
206 | vmovdqa xmm2, xmm3 | |
207 | mov len2, len_upper | |
208 | mov idx, idx_upper | |
209 | or idx, 0x8 ; to reflect that index in 8-F | |
210 | use_min: | |
211 | test len2, len2 ; set ZF without rewriting len2 | |
212 | jz len_is_0 | |
213 | DBGPRINTL64 "min_length min_index ", len2, idx | |
214 | DBGPRINTL_XMM "FLUSH md5 lens before sub lower", xmm0 | |
;; Word 0 of xmm2 is the overall minimum; broadcast it and subtract it from
;; all 16 lane lengths, then store the updated lengths back into state.
215 | vpbroadcastw xmm2, xmm2 ; duplicate words across all lanes | |
216 | vpsubw xmm0, xmm0, xmm2 | |
217 | DBGPRINTL_XMM "FLUSH md5 lens after sub lower", xmm0 | |
218 | vmovdqa [state + _lens_md5 + 0*16], xmm0 | |
219 | ||
220 | vpsubw xmm1, xmm1, xmm2 | |
221 | DBGPRINTL_XMM "FLUSH md5 lens after sub upper", xmm1 | |
222 | vmovdqa [state + _lens_md5 + 1*16], xmm1 | |
223 | ||
224 | ; "state" and "args" are the same address, arg1 | |
225 | ; len is arg2 | |
226 | call md5_x8x2_avx2 | |
227 | ; state and idx are intact | |
228 | ||
229 | len_is_0: | |
230 | ; process completed job "idx" | |
;; lane_data = state + _ldata_md5 + idx * _HMAC_SHA1_LANE_DATA_size
231 | imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size | |
232 | lea lane_data, [state + _ldata_md5 + lane_data] | |
233 | mov DWORD(extra_blocks), [lane_data + _extra_blocks] | |
234 | test extra_blocks, extra_blocks | |
235 | jne proc_extra_blocks | |
236 | cmp dword [lane_data + _outer_done], 0 | |
237 | jne end_loop | |
238 | ||
239 | proc_outer: | |
;; Mark the outer pass as started, give the lane a length of 1 and point
;; its data pointer at _outer_block.
240 | mov dword [lane_data + _outer_done], 1 | |
241 | mov DWORD(size_offset), [lane_data + _size_offset] | |
242 | mov qword [lane_data + _extra_block + size_offset], 0 | |
243 | mov word [state + _lens_md5 + 2*idx], 1 | |
244 | lea tmp, [lane_data + _outer_block] | |
245 | mov job, [lane_data + _job_in_lane] | |
246 | mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp | |
247 | ||
;; Gather the lane's four digest words (one per digest row) into the first
;; 16 bytes of _outer_block.
248 | vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] | |
249 | vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 | |
250 | vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 | |
251 | vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 | |
252 | vmovdqa [lane_data + _outer_block], xmm0 | |
253 | ||
;; Replace the lane's digest with the 16 bytes that the job's
;; _auth_key_xor_opad pointer refers to, scattered back across the rows.
254 | mov tmp, [job + _auth_key_xor_opad] | |
255 | vmovdqu xmm0, [tmp] | |
256 | vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 | |
257 | vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 | |
258 | vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 | |
259 | vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 | |
260 | jmp copy_lane_data | |
261 | ||
262 | align 16 | |
263 | proc_extra_blocks: | |
;; Queue the lane's extra blocks: length = extra_blocks, data pointer =
;; _extra_block + start_offset, then clear the _extra_blocks counter.
264 | mov DWORD(start_offset), [lane_data + _start_offset] | |
265 | mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) | |
266 | lea tmp, [lane_data + _extra_block + start_offset] | |
267 | mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp | |
268 | mov dword [lane_data + _extra_blocks], 0 | |
269 | jmp copy_lane_data | |
270 | ||
271 | return_null: | |
272 | xor job_rax, job_rax | |
273 | jmp return | |
274 | ||
275 | align 16 | |
276 | end_loop: | |
;; Detach the job from the lane and flag it complete.
277 | mov job_rax, [lane_data + _job_in_lane] | |
278 | mov qword [lane_data + _job_in_lane], 0 | |
279 | or dword [job_rax + _status], STS_COMPLETED_HMAC | |
;; lane_data is dead from here on; rbx is reused as unused_lanes and then
;; as tmp2. idx is pushed into the low 4 bits of _unused_lanes_md5.
280 | mov unused_lanes, [state + _unused_lanes_md5] | |
281 | shl unused_lanes, 4 | |
282 | or unused_lanes, idx | |
283 | mov [state + _unused_lanes_md5], unused_lanes | |
284 | ||
285 | mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] ;; update lanes inuse | |
286 | sub num_lanes_inuse, 1 | |
287 | mov [state + _num_lanes_inuse_md5], DWORD(num_lanes_inuse) | |
288 | ||
289 | mov p, [job_rax + _auth_tag_output] | |
290 | ||
291 | ; copy 12 bytes | |
292 | mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] | |
293 | mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] | |
9f95a23c | 294 | mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] |
11fdf7f2 TL |
295 | ; bswap DWORD(tmp2) |
296 | ; bswap DWORD(tmp4) | |
297 | ; bswap DWORD(tmp3) | |
298 | mov [p + 0*4], DWORD(tmp2) | |
299 | mov [p + 1*4], DWORD(tmp4) | |
9f95a23c TL |
300 | mov [p + 2*4], DWORD(tmp5) |
301 | ||
302 | cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12 | |
303 | je return | |
304 | ||
305 | ; copy 16 bytes | |
306 | mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE] | |
307 | mov [p + 3*4], DWORD(tmp5) | |
11fdf7f2 TL |
308 |
309 | return: |
310 | DBGPRINTL "---------- exit md5 flush -----------" | |
311 | vzeroupper | |
312 | ||
;; Epilogue: restore the saved registers, then the caller's rsp.
313 | mov rbx, [rsp + _gpr_save + 8*0] | |
314 | mov rbp, [rsp + _gpr_save + 8*1] | |
315 | mov r12, [rsp + _gpr_save + 8*2] | |
316 | mov r13, [rsp + _gpr_save + 8*3] | |
317 | mov r14, [rsp + _gpr_save + 8*4] | |
318 | mov r15, [rsp + _gpr_save + 8*5] | |
319 | %ifndef LINUX | |
320 | mov rsi, [rsp + _gpr_save + 8*6] | |
321 | mov rdi, [rsp + _gpr_save + 8*7] | |
322 | %endif | |
323 | mov rsp, [rsp + _rsp_save] ; original SP | |
324 | ||
325 | ret | |
326 | ||
;; When LINUX is defined, emit an empty .note.GNU-stack section with the
;; noexec attribute (conventionally read by the linker as "this object does
;; not need an executable stack").
327 | %ifdef LINUX | |
328 | section .note.GNU-stack noalloc noexec nowrite progbits | |
329 | %endif |