]>
Commit | Line | Data |
---|---|---|
9f95a23c | 1 | ;; |
11fdf7f2 TL |
2 | ;; Copyright (c) 2012-2018, Intel Corporation |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
28 | %include "os.asm" | |
29 | %include "job_aes_hmac.asm" | |
30 | %include "mb_mgr_datastruct.asm" | |
31 | %include "reg_sizes.asm" | |
32 | ||
33 | extern sha_256_mult_sse | |
34 | ||
section .data
default rel

align 16
;; pshufb control mask that reverses the four bytes inside every 32-bit
;; element of an XMM register.
;; (as one 128-bit value: 0x0c0d0e0f08090a0b0405060700010203)
byteswap:
        db       3,  2,  1,  0
        db       7,  6,  5,  4
        db      11, 10,  9,  8
        db      15, 14, 13, 12

;; len_masks[i] (16 bytes each, i = 0..3): all ones in 16-bit word i and
;; zero everywhere else.  OR-ing entry i into the vector of lane lengths
;; forces the length of lane i to 0xFFFF.
len_masks:
        dq      0xFFFF << (16*0), 0
        dq      0xFFFF << (16*1), 0
        dq      0xFFFF << (16*2), 0
        dq      0xFFFF << (16*3), 0

;; Small integer constants kept in memory because cmovcc has no immediate
;; form; they are used as cmovne sources when choosing a lane index.
one:
        dq      1
two:
        dq      2
three:
        dq      3
53 | ||
54 | section .text | |
55 | ||
;; Name of the entry point.  The guard lets a file that includes this one
;; choose a different name before the include (presumably together with
;; SHA224 to build the SHA-224 variant -- confirm against the includer).
%ifndef FUNC
%define FUNC            flush_job_hmac_sha_256_sse
%endif

;; First two integer argument registers of the target calling convention.
%ifdef LINUX
%define arg1            rdi
%define arg2            rsi
%else
%define arg1            rcx
%define arg2            rdx
%endif

;; Symbolic register names, grouped by the physical register they map to.
;; Names that share a register overwrite each other, so they must not be
;; live at the same time.

;; --- arg1 ---
%define state           arg1
%define tmp3            arg1

;; --- arg2 ---
%define job             arg2
%define len2            arg2
%define extra_blocks    arg2
%define p               arg2

;; --- rbp ---
; idx needs to be in rbx, rbp, r13-r15
%define idx             rbp

;; --- rbx ---
%define unused_lanes    rbx
%define lane_data       rbx
%define tmp2            rbx

;; --- rax ---
%define job_rax         rax
%define tmp             rax
%define tmp1            rax
%define size_offset     rax
%define start_offset    rax

;; --- r8 / r9 / r10 ---
%define tmp4            r8
%define tmp5            r9
%define tmp6            r10
99 | ||
;; ---------------------------------------------------------------------------
;; Stack frame.  This routine overwrites rbx and rbp, and the routine it
;; calls also overwrites r12, so all three are spilled here together with
;; the stack pointer value seen on entry.
;; ---------------------------------------------------------------------------
struc STACK
_gpr_save:      resq    3       ; slot 0 = rbx, slot 1 = rbp, slot 2 = r12
_rsp_save:      resq    1       ; rsp as it was on entry
endstruc

;; Token-pasting helper.  Not used by the code below; the definition is
;; kept so the set of macros this file provides stays the same.
%define APPEND(a,b) a %+ b

;; Address expression (without brackets) of 32-bit digest word n belonging
;; to lane idx inside the state structure.
%define LANE_DIGEST(n)  state + _args_digest_sha256 + 4*idx + (n)*SHA256_DIGEST_ROW_SIZE

;; Tag length (in bytes) that selects the short output copy; every other
;; length takes the full-digest copy.
%ifdef SHA224
%define TRUNC_TAG_LEN   14
%else
%define TRUNC_TAG_LEN   16
%endif

;; FILL_IF_IDLE lane
;; If lane %1 holds no job, point its data pointer at the buffer held in
;; tmp and force its 16-bit length inside xmm0 to 0xFFFF.
;; Expects: tmp = data pointer to copy, xmm0 = vector of lane lengths.
%macro FILL_IF_IDLE 1
        cmp     qword [state + _ldata_sha256 + (%1) * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
        jne     %%occupied
        mov     [state + _args_data_ptr_sha256 + 8*(%1)], tmp
        por     xmm0, [rel len_masks + 16*(%1)]
%%occupied:
%endmacro

;; ---------------------------------------------------------------------------
;; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state)
;;
;; In:     arg1 = state (rdi when LINUX is defined, rcx otherwise)
;; Out:    rax  = job that has just been marked STS_COMPLETED_HMAC, or
;;                NULL when bit 39 of _unused_lanes_sha256 is set
;; Saved:  rbx, rbp, r12 (spilled to the STACK frame and restored)
;; Writes: rax, arg2, r8, r9, r10, xmm0, xmm1, flags, plus whatever
;;         sha_256_mult_sse modifies
;;
;; Outline:
;;   1. Pick a lane that holds a job.
;;   2. Make every job-less lane a copy of that lane's data pointer with
;;      length 0xFFFF.
;;   3. Find the lane with the smallest length; if that length is not
;;      zero, subtract it from all lanes and hash that many blocks.
;;   4. For the lane that reached zero: queue its extra blocks, or set up
;;      its outer hash, and go back to step 2; once both are done, return
;;      the job and copy out the tag.
;; ---------------------------------------------------------------------------
MKGLOBAL(FUNC,function,internal)
FUNC:

        ;; ---- prologue: build a 16-byte aligned frame, spill registers ----
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _rsp_save], rax          ; rsp on entry

        ;; Nothing to flush when bit 39 of the unused-lane list is set
        ;; (presumably the list terminator has reached byte 4 because all
        ;; four lanes are free -- confirm against the manager init code).
        mov     unused_lanes, [state + _unused_lanes_sha256]
        bt      unused_lanes, 32+7
        jc      .no_job

        ;; ---- step 1: idx = highest of lanes 1..3 holding a job, else 0 ----
        xor     idx, idx
        cmp     qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
        cmovne  idx, [rel one]
        cmp     qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
        cmovne  idx, [rel two]
        cmp     qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
        cmovne  idx, [rel three]

.fill_idle_lanes:
        ;; ---- step 2: job-less lanes mirror lane idx, length 0xFFFF ----
        movdqa  xmm0, [state + _lens_sha256]
        mov     tmp, [state + _args_data_ptr_sha256 + 8*idx]

        FILL_IF_IDLE 0
        FILL_IF_IDLE 1
        FILL_IF_IDLE 2
        FILL_IF_IDLE 3

        movdqa  [state + _lens_sha256], xmm0

        ;; ---- step 3: smallest length and the lane that owns it ----
        phminposuw xmm1, xmm0
        pextrw  len2, xmm1, 0                   ; minimum length
        pextrw  idx, xmm1, 1                    ; lane holding it (0...3)
        test    len2, len2
        jz      .lane_done                      ; nothing left to hash

        pshuflw xmm1, xmm1, 0                   ; minimum -> each of the 4 low words
        psubw   xmm0, xmm1                      ; take it off every lane length
        movdqa  [state + _lens_sha256], xmm0

        ;; arg1 is both "state" and the hash routine's args pointer,
        ;; arg2 carries the length (len2)
        call    sha_256_mult_sse
        ;; state and idx are still valid after the call

.lane_done:
        ;; ---- step 4: lane idx has consumed all of its queued data ----
        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
        lea     lane_data, [state + _ldata_sha256 + lane_data]
        mov     DWORD(extra_blocks), [lane_data + _extra_blocks]
        test    extra_blocks, extra_blocks
        jnz     .queue_extra_blocks
        cmp     dword [lane_data + _outer_done], 0
        jne     .finish_job

        ;; -- set up the outer hash: one block taken from _outer_block --
        mov     dword [lane_data + _outer_done], 1
        mov     DWORD(size_offset), [lane_data + _size_offset]
        ;; zero the 8 bytes at _extra_block + size_offset (presumably the
        ;; length field written for the inner hash -- confirm at submit)
        mov     qword [lane_data + _extra_block + size_offset], 0
        mov     word [state + _lens_sha256 + 2*idx], 1
        lea     tmp, [lane_data + _outer_block] ; note: tmp replaces size_offset (both rax)
        mov     job, [lane_data + _job_in_lane]
        mov     [state + _args_data_ptr_sha256 + 8*idx], tmp

        ;; gather the lane's current digest words, byte-swap each 32-bit
        ;; word, and store them as the data of the outer block
        movd    xmm0, [LANE_DIGEST(0)]
        pinsrd  xmm0, [LANE_DIGEST(1)], 1
        pinsrd  xmm0, [LANE_DIGEST(2)], 2
        pinsrd  xmm0, [LANE_DIGEST(3)], 3
        pshufb  xmm0, [rel byteswap]
        movd    xmm1, [LANE_DIGEST(4)]
        pinsrd  xmm1, [LANE_DIGEST(5)], 1
        pinsrd  xmm1, [LANE_DIGEST(6)], 2
%ifndef SHA224
        pinsrd  xmm1, [LANE_DIGEST(7)], 3
%endif
        pshufb  xmm1, [rel byteswap]
        movdqa  [lane_data + _outer_block], xmm0
        movdqa  [lane_data + _outer_block + 4*4], xmm1
%ifdef SHA224
        ;; only 7 digest words are used; word 7 of the block becomes 0x80
        ;; (presumably the padding byte that follows the 28-byte digest)
        mov     dword [lane_data + _outer_block + 7*4], 0x80
%endif

        ;; load the 32 bytes that the job's _auth_key_xor_opad pointer
        ;; refers to and scatter them into the lane's digest words
        mov     tmp, [job + _auth_key_xor_opad]
        movdqu  xmm0, [tmp]
        movdqu  xmm1, [tmp + 4*4]
        movd    [LANE_DIGEST(0)], xmm0
        pextrd  [LANE_DIGEST(1)], xmm0, 1
        pextrd  [LANE_DIGEST(2)], xmm0, 2
        pextrd  [LANE_DIGEST(3)], xmm0, 3
        movd    [LANE_DIGEST(4)], xmm1
        pextrd  [LANE_DIGEST(5)], xmm1, 1
        pextrd  [LANE_DIGEST(6)], xmm1, 2
        pextrd  [LANE_DIGEST(7)], xmm1, 3
        jmp     .fill_idle_lanes

        align   16
.queue_extra_blocks:
        ;; hand the lane its pending extra blocks: length = extra_blocks,
        ;; data = _extra_block + start_offset; then clear the pending count
        mov     DWORD(start_offset), [lane_data + _start_offset]
        mov     [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
        lea     tmp, [lane_data + _extra_block + start_offset]
        mov     [state + _args_data_ptr_sha256 + 8*idx], tmp
        mov     dword [lane_data + _extra_blocks], 0
        jmp     .fill_idle_lanes

.no_job:
        xor     job_rax, job_rax                ; return NULL
        jmp     .return

        align   16
.finish_job:
        ;; detach the job from its lane and flag it as completed
        mov     job_rax, [lane_data + _job_in_lane]
        mov     qword [lane_data + _job_in_lane], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC

        ;; push idx onto the byte-wide list of unused lanes
        ;; (unused_lanes reuses rbx, so lane_data is gone from here on)
        mov     unused_lanes, [state + _unused_lanes_sha256]
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _unused_lanes_sha256], unused_lanes

        mov     p, [job_rax + _auth_tag_output] ; p = tag destination

        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], TRUNC_TAG_LEN
        jne     .copy_full_digest

        ;; -- short tag: 14 bytes for SHA224 / 16 bytes for SHA256 --
        mov     DWORD(tmp2), [LANE_DIGEST(0)]
        mov     DWORD(tmp4), [LANE_DIGEST(1)]
        mov     DWORD(tmp6), [LANE_DIGEST(2)]
        mov     DWORD(tmp5), [LANE_DIGEST(3)]
        bswap   DWORD(tmp2)
        bswap   DWORD(tmp4)
        bswap   DWORD(tmp6)
        bswap   DWORD(tmp5)
        mov     [p + 0*4], DWORD(tmp2)
        mov     [p + 1*4], DWORD(tmp4)
        mov     [p + 2*4], DWORD(tmp6)
%ifdef SHA224
        mov     [p + 3*4], WORD(tmp5)           ; only 2 bytes of the 4th word
%else
        mov     [p + 3*4], DWORD(tmp5)
%endif
        jmp     .return

.copy_full_digest:
        ;; -- full tag: 28 bytes for SHA224 / 32 bytes for SHA256 --
        ;; words 0..3
        mov     DWORD(tmp2), [LANE_DIGEST(0)]
        mov     DWORD(tmp4), [LANE_DIGEST(1)]
        mov     DWORD(tmp6), [LANE_DIGEST(2)]
        mov     DWORD(tmp5), [LANE_DIGEST(3)]
        bswap   DWORD(tmp2)
        bswap   DWORD(tmp4)
        bswap   DWORD(tmp6)
        bswap   DWORD(tmp5)
        mov     [p + 0*4], DWORD(tmp2)
        mov     [p + 1*4], DWORD(tmp4)
        mov     [p + 2*4], DWORD(tmp6)
        mov     [p + 3*4], DWORD(tmp5)

        ;; words 4..6, plus word 7 unless building for SHA224
        mov     DWORD(tmp2), [LANE_DIGEST(4)]
        mov     DWORD(tmp4), [LANE_DIGEST(5)]
        mov     DWORD(tmp6), [LANE_DIGEST(6)]
%ifndef SHA224
        mov     DWORD(tmp5), [LANE_DIGEST(7)]
%endif
        bswap   DWORD(tmp2)
        bswap   DWORD(tmp4)
        bswap   DWORD(tmp6)
%ifndef SHA224
        bswap   DWORD(tmp5)
%endif
        mov     [p + 4*4], DWORD(tmp2)
        mov     [p + 5*4], DWORD(tmp4)
        mov     [p + 6*4], DWORD(tmp6)
%ifndef SHA224
        mov     [p + 7*4], DWORD(tmp5)
%endif

.return:
        ;; ---- epilogue: reload spilled registers and the entry rsp ----
        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     rsp, [rsp + _rsp_save]          ; rsp on entry
        ret
308 | ||
309 | %ifdef LINUX | |
310 | section .note.GNU-stack noalloc noexec nowrite progbits | |
311 | %endif |