]>
Commit | Line | Data |
---|---|---|
1e59de90 TL |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | %include "sha256_job.asm" | |
31 | %include "sha256_mb_mgr_datastruct.asm" | |
32 | %include "reg_sizes.asm" | |
33 | ||
34 | %ifdef HAVE_AS_KNOWS_AVX512 | |
35 | %ifdef HAVE_AS_KNOWS_SHANI | |
36 | ||
37 | extern sha256_mb_x16_avx512 | |
38 | extern sha256_ni_x1 | |
39 | ||
40 | [bits 64] | |
41 | default rel | |
42 | section .text | |
43 | ||
44 | %ifidn __OUTPUT_FORMAT__, elf64 | |
45 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
46 | ; LINUX (elf64 output format) register definitions | |
47 | %define arg1 rdi ; first-argument register (rcx in the non-elf64 branch below) | |
48 | %define arg2 rsi ; second-argument register (rdx in the non-elf64 branch below) | |
49 | ||
50 | %define tmp4 rdx | |
51 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
52 | ||
53 | %else | |
54 | ||
55 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
56 | ; WINDOWS register definitions (selected for every output format other than elf64) | |
57 | %define arg1 rcx | |
58 | %define arg2 rdx | |
59 | ||
60 | %define tmp4 rsi | |
61 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
62 | %endif | |
63 | ||
64 | ; Common register definitions. Several names below alias the same physical register (e.g. unused_lanes, lane_data and tmp2 are all rbx; job, len2, extra_blocks and p are all arg2), so only one meaning of a register is live at any point. | |
65 | ||
66 | %define state arg1 | |
67 | %define job arg2 | |
68 | %define len2 arg2 | |
69 | ||
70 | ; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_ni_x1 (the two routines this file calls); its value is used again after either call returns | |
71 | %define idx rbp | |
72 | ||
73 | %define num_lanes_inuse r9 | |
74 | %define unused_lanes rbx | |
75 | %define lane_data rbx | |
76 | %define tmp2 rbx | |
77 | ||
78 | %define job_rax rax | |
79 | %define tmp1 rax | |
80 | %define size_offset rax | |
81 | %define tmp rax | |
82 | %define start_offset rax | |
83 | ||
84 | %define tmp3 arg1 | |
85 | ||
86 | %define extra_blocks arg2 | |
87 | %define p arg2 | |
88 | ||
89 | ||
90 | ; STACK_SPACE needs to be an odd multiple of 8 (here 10*16 + 8*8 + 8 = 232 = 29*8): assuming the ABI's 16-byte stack alignment at the call site, rsp is 8 mod 16 on entry, so subtracting an odd multiple of 8 leaves rsp 16-byte aligned, which the vmovdqa saves at [rsp + _XMM_SAVE] rely on | |
91 | _XMM_SAVE_SIZE equ 10*16 | |
92 | _GPR_SAVE_SIZE equ 8*8 | |
93 | _ALIGN_SIZE equ 8 | |
94 | ||
95 | _XMM_SAVE equ 0 | |
96 | _GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE | |
97 | STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE | |
98 | ||
99 | %define APPEND(a,b) a %+ b | |
100 | ||
101 | ; SHA256_JOB* sha256_mb_mgr_flush_avx512_ni(SHA256_MB_JOB_MGR *state) | |
102 | ; arg 1 : rdi (elf64) / rcx (other output formats) : state. Returns in rax the job taken from the lane whose lens entry is smallest, after running a hash routine over its remaining length (skipped when that length is already 0) and marking it STS_COMPLETED; returns 0 (NULL) in rax when num_lanes_inuse is 0. | |
103 | mk_global sha256_mb_mgr_flush_avx512_ni, function | |
104 | sha256_mb_mgr_flush_avx512_ni: | |
105 | endbranch | |
106 | sub rsp, STACK_SPACE | |
107 | mov [rsp + _GPR_SAVE + 8*0], rbx | |
108 | mov [rsp + _GPR_SAVE + 8*3], rbp | |
109 | mov [rsp + _GPR_SAVE + 8*4], r12 | |
110 | mov [rsp + _GPR_SAVE + 8*5], r13 | |
111 | mov [rsp + _GPR_SAVE + 8*6], r14 | |
112 | mov [rsp + _GPR_SAVE + 8*7], r15 | |
113 | %ifidn __OUTPUT_FORMAT__, win64 | |
114 | mov [rsp + _GPR_SAVE + 8*1], rsi | |
115 | mov [rsp + _GPR_SAVE + 8*2], rdi | |
116 | vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 | |
117 | vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 | |
118 | vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 | |
119 | vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 | |
120 | vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 | |
121 | vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 | |
122 | vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 | |
123 | vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 | |
124 | vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 | |
125 | vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 | |
126 | %endif | |
127 | ||
128 | mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] | |
129 | cmp num_lanes_inuse, 0 | |
130 | jz return_null | |
131 | ||
132 | ; find a lane with a non-null job: idx starts at 0 and every occupied lane I in 1..15 overwrites it with I, so the highest-numbered occupied lane wins; the values come from the lane_I qwords in .data because cmov has no immediate-operand form | |
133 | xor idx, idx | |
134 | %assign I 1 | |
135 | %rep 15 | |
136 | cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 | |
137 | cmovne idx, [APPEND(lane_,I)] | |
138 | %assign I (I+1) | |
139 | %endrep | |
140 | ||
141 | ||
142 | ; copy lane idx's data pointer into every empty lane (job_in_lane == 0) and set that lane's lens entry to 0xFFFFFFFF, the largest unsigned dword, so an occupied lane is picked by the unsigned min search below | |
143 | copy_lane_data: | |
144 | mov tmp, [state + _args + _data_ptr + 8*idx] | |
145 | ||
146 | %assign I 0 | |
147 | %rep 16 | |
148 | cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 | |
149 | jne APPEND(skip_,I) | |
150 | mov [state + _args + _data_ptr + 8*I], tmp | |
151 | mov dword [state + _lens + 4*I], 0xFFFFFFFF | |
152 | APPEND(skip_,I): | |
153 | %assign I (I+1) | |
154 | %endrep | |
155 | ||
156 | ; Find min length over the 16 lens dwords; each dword is treated as (length << 4) | lane index, which is how the and/shr pair after the reduction splits the minimum | |
157 | vmovdqu ymm0, [state + _lens + 0*32] | |
158 | vmovdqu ymm1, [state + _lens + 1*32] | |
159 | ||
160 | vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} | |
161 | vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} | |
162 | vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} | |
163 | vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} | |
164 | vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} | |
165 | vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} | |
166 | vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword | |
167 | ||
168 | vmovd DWORD(idx), xmm2 | |
169 | mov len2, idx | |
170 | and idx, 0xF | |
171 | shr len2, 4 | |
172 | jz len_is_0 | |
173 | ||
174 | ; compare with the shani-sb threshold: if num_lanes_inuse <= SHA256_NI_SB_THRESHOLD_AVX512, call sha256_ni_x1 for this one lane; otherwise fall to mb_processing and call sha256_mb_x16_avx512 | |
175 | cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_AVX512 | |
176 | ja mb_processing | |
177 | ||
178 | ; lens[idx] - (len2 << 4) = idx: store only the lane index, i.e. a remaining length of 0 for this lane (the other lanes' lens entries are left unchanged on this path) | |
179 | mov [state + _lens + idx*4], DWORD(idx) | |
180 | mov r10, idx | |
181 | or r10, 0x4000 ; avx512 has 16 lanes *4, r10b is idx, r10b2 (byte 1 of r10) is 0x40 = 64 | |
182 | ; "state" and "args" are the same address, arg1 | |
183 | ; len is arg2, idx and nlane in r10 | |
184 | call sha256_ni_x1 | |
185 | ; state and idx are intact | |
186 | jmp len_is_0 | |
187 | ||
188 | mb_processing: | |
189 | ||
190 | vpand ymm2, ymm2, [rel clear_low_nibble] | |
191 | vpshufd ymm2, ymm2, 0 | |
192 | ||
193 | vpsubd ymm0, ymm0, ymm2 | |
194 | vpsubd ymm1, ymm1, ymm2 | |
195 | ||
196 | vmovdqu [state + _lens + 0*32], ymm0 | |
197 | vmovdqu [state + _lens + 1*32], ymm1 | |
198 | ||
199 | ; above: the minimum, with its lane-index nibble masked off by clear_low_nibble and broadcast to every dword, was subtracted from all 16 lens entries, so each keeps its own index nibble; "state" and "args" are the same address, arg1 | |
200 | ; len is arg2 | |
201 | call sha256_mb_x16_avx512 | |
202 | ; state and idx are intact | |
203 | ||
204 | len_is_0: | |
205 | ; process completed job "idx": clear the lane's job pointer, mark the job STS_COMPLETED, push idx onto the nibble-packed unused_lanes value (shift left 4, or in idx), decrement num_lanes_inuse, then gather the 8 digest dwords for this lane (stored 4*16 bytes apart in args_digest) into the job's result_digest | |
206 | imul lane_data, idx, _LANE_DATA_size | |
207 | lea lane_data, [state + _ldata + lane_data] | |
208 | ||
209 | mov job_rax, [lane_data + _job_in_lane] | |
210 | mov qword [lane_data + _job_in_lane], 0 | |
211 | mov dword [job_rax + _status], STS_COMPLETED | |
212 | mov unused_lanes, [state + _unused_lanes] | |
213 | shl unused_lanes, 4 | |
214 | or unused_lanes, idx | |
215 | mov [state + _unused_lanes], unused_lanes | |
216 | ||
217 | mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] | |
218 | sub num_lanes_inuse, 1 | |
219 | mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) | |
220 | ||
221 | vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16] | |
222 | vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1 | |
223 | vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2 | |
224 | vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3 | |
225 | vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16] | |
226 | vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1 | |
227 | vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2 | |
228 | vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3 | |
229 | ||
230 | vmovdqa [job_rax + _result_digest + 0*16], xmm0 | |
231 | vmovdqa [job_rax + _result_digest + 1*16], xmm1 | |
232 | ||
233 | return: | |
234 | %ifidn __OUTPUT_FORMAT__, win64 | |
235 | vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] | |
236 | vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] | |
237 | vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] | |
238 | vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] | |
239 | vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] | |
240 | vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] | |
241 | vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] | |
242 | vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] | |
243 | vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] | |
244 | vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] | |
245 | mov rsi, [rsp + _GPR_SAVE + 8*1] | |
246 | mov rdi, [rsp + _GPR_SAVE + 8*2] | |
247 | %endif | |
248 | mov rbx, [rsp + _GPR_SAVE + 8*0] | |
249 | mov rbp, [rsp + _GPR_SAVE + 8*3] | |
250 | mov r12, [rsp + _GPR_SAVE + 8*4] | |
251 | mov r13, [rsp + _GPR_SAVE + 8*5] | |
252 | mov r14, [rsp + _GPR_SAVE + 8*6] | |
253 | mov r15, [rsp + _GPR_SAVE + 8*7] | |
254 | add rsp, STACK_SPACE | |
255 | ||
256 | ret | |
257 | ||
258 | return_null: | |
259 | xor job_rax, job_rax | |
260 | jmp return | |
261 | ||
262 | section .data align=16 | |
263 | ||
264 | align 16 | |
265 | clear_low_nibble: | |
266 | dq 0x00000000FFFFFFF0, 0x0000000000000000 | |
267 | dq 0x00000000FFFFFFF0, 0x0000000000000000 | |
268 | lane_1: dq 1 | |
269 | lane_2: dq 2 | |
270 | lane_3: dq 3 | |
271 | lane_4: dq 4 | |
272 | lane_5: dq 5 | |
273 | lane_6: dq 6 | |
274 | lane_7: dq 7 | |
275 | lane_8: dq 8 | |
276 | lane_9: dq 9 | |
277 | lane_10: dq 10 | |
278 | lane_11: dq 11 | |
279 | lane_12: dq 12 | |
280 | lane_13: dq 13 | |
281 | lane_14: dq 14 | |
282 | lane_15: dq 15 | |
283 | ||
284 | %else | |
285 | %ifidn __OUTPUT_FORMAT__, win64 | |
286 | global no_sha256_mb_mgr_flush_avx512_ni | |
287 | no_sha256_mb_mgr_flush_avx512_ni: | |
288 | %endif | |
289 | %endif ; HAVE_AS_KNOWS_SHANI (when it is not defined, the only thing emitted is the empty global label no_sha256_mb_mgr_flush_avx512_ni, and only for win64 output) | |
290 | %else | |
291 | %ifidn __OUTPUT_FORMAT__, win64 | |
292 | global no_sha256_mb_mgr_flush_avx512_ni | |
293 | no_sha256_mb_mgr_flush_avx512_ni: | |
294 | %endif | |
295 | %endif ; HAVE_AS_KNOWS_AVX512 (when it is not defined, the only thing emitted is the empty global label no_sha256_mb_mgr_flush_avx512_ni, and only for win64 output) | |