]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
1e59de90 | 5 | ; modification, are permitted provided that the following conditions |
7c673cae FG |
6 | ; are met: |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | %include "sha512_job.asm" | |
31 | %include "sha512_mb_mgr_datastruct.asm" | |
32 | %include "reg_sizes.asm" | |
33 | ||
34 | %ifdef HAVE_AS_KNOWS_AVX512 | |
35 | extern sha512_mb_x8_avx512 | |
36 | ||
; Per-ABI register aliases, selected by the NASM output format: the elf64
; branch is used for elf64 output, the %else branch for everything else.
; In the elf64 branch the trailing "; reg" comments show the register the
; same name maps to in the %else (Windows) branch.
; idx/last_len share one register and size_offset/tmp2 share another, so
; only one name of each pair can hold a live value at a time.
; last_len, size_offset and tmp2 are not referenced by the code visible in
; this file.
37 | %ifidn __OUTPUT_FORMAT__, elf64 | |
38 | ; LINUX register definitions | |
39 | %define arg1 rdi ; rcx | |
40 | %define arg2 rsi ; rdx | |
41 | ||
42 | ; idx needs to be other than arg1, arg2, rbx, r12 | |
43 | %define idx rdx ; rsi | |
44 | %define last_len rdx ; rsi | |
45 | ||
46 | %define size_offset rcx ; rdi | |
47 | %define tmp2 rcx ; rdi | |
48 | ||
49 | %else | |
50 | ; WINDOWS register definitions | |
51 | %define arg1 rcx | |
52 | %define arg2 rdx | |
53 | ||
54 | ; idx needs to be other than arg1, arg2, rbx, r12 | |
; rsi and rdi are used here as scratch; the submit routine saves and
; restores both of them in its win64-only prologue/epilogue.
55 | %define last_len rsi | |
56 | %define idx rsi | |
57 | ||
58 | %define size_offset rdi | |
59 | %define tmp2 rdi | |
60 | ||
61 | %endif | |
62 | ||
63 | ; Common definitions | |
; Each group below gives several names to ONE physical register. The names
; are mutually exclusive in time: writing one destroys the others.
; arg1 carries the manager pointer for the whole routine.
64 | %define state arg1 | |
; arg2 is the submitted job on entry; after the min-length search it is
; overwritten with the block count (len2) passed to the hash core.
65 | %define job arg2 | |
66 | %define len2 arg2 | |
67 | %define p2 arg2 | |
68 | ||
; r11: scratch used to move the job's buffer pointer into the lane's data
; pointer slot (start_offset is not referenced in the visible code).
69 | %define p r11 | |
70 | %define start_offset r11 | |
71 | ||
; rbx: working copy of the manager's unused-lanes word.
72 | %define unused_lanes rbx | |
73 | ||
; rax: job length while submitting, and the returned job pointer at exit.
74 | %define job_rax rax | |
75 | %define len rax | |
76 | ||
; rbp: index of the lane taken for the submitted job.
77 | %define lane rbp | |
78 | %define tmp3 rbp | |
79 | %define lens3 rbp | |
80 | ||
; r8: not referenced under either name in the visible code.
81 | %define extra_blocks r8 | |
82 | %define lens0 r8 | |
83 | ||
; r9: occupied-lane counter loaded from / stored back to the manager.
84 | %define num_lanes_inuse r9 | |
85 | %define tmp r9 | |
86 | %define lens1 r9 | |
87 | ||
; r10: pointer to the per-lane data record inside the manager.
88 | %define lane_data r10 | |
89 | %define lens2 r10 | |
90 | ||
; Stack frame carved out by the submit routine, lowest address first:
;   .xmm  offset   0, 160 bytes : ten 16-byte slots (xmm6-xmm15, win64 only)
;   .gpr  offset 160,  64 bytes : eight 8-byte slots for saved GPRs
;   .rsp  offset 224,   8 bytes : caller's rsp, reloaded just before ret
; Total size is 232 bytes = 29 * 8.
91 | struc stack_frame | |
92 | .xmm: resb 16*10 | |
93 | .gpr: resb 8*8 | |
94 | .rsp: resb 8 | |
95 | endstruc | |
96 | ||
97 | ; STACK_SPACE needs to be an odd multiple of 8 | |
; (presumably so that rsp, which is 8 mod 16 at entry, is 16-byte aligned
; after "sub rsp, STACK_SPACE" -- confirm against the callee's requirements)
;
; NOTE(review): the two save-area names do not match what they point at.
; _XMM_SAVE resolves to stack_frame.gpr and is the base the routine uses for
; its saved general-purpose registers; _GPR_SAVE resolves to stack_frame.rsp
; and is not referenced in the visible code. The XMM registers are saved at
; plain [rsp + 16*n], i.e. in the .xmm field at offset 0.
98 | %define _XMM_SAVE stack_frame.gpr | |
99 | %define _GPR_SAVE stack_frame.rsp | |
100 | %define STACK_SPACE stack_frame_size | |
101 | ||
102 | ; SHA512_JOB* sha512_mb_mgr_submit_avx512(SHA512_MB_JOB_MGR *state, SHA512_JOB *job) | |
103 | ; arg 1 : rcx (win64) / rdi (elf64) : state | |
104 | ; arg 2 : rdx (win64) / rsi (elf64) : job | |
;
; Places "job" in the next unused lane of "state". If that does not bring
; the number of occupied lanes to 8, returns NULL (rax = 0). Otherwise it
; finds the lane with the fewest remaining blocks, runs sha512_mb_x8_avx512
; for that many blocks (skipped when the minimum is 0), retires that lane's
; job and returns a pointer to it in rax.
;
; Saves/restores rbx, rbp, r12-r15 on every target, plus rsi, rdi and
; xmm6-xmm15 when assembling for win64.
1e59de90 | 105 | mk_global sha512_mb_mgr_submit_avx512, function |
7c673cae | 106 | sha512_mb_mgr_submit_avx512: |
1e59de90 | 107 | endbranch |
7c673cae FG |
108 | |
; Prologue: remember the caller's rsp, carve the frame, then park the old
; rsp inside the frame so the epilogue can restore it with a single load.
109 | mov rax, rsp | |
110 | ||
111 | sub rsp, STACK_SPACE | |
112 | ||
113 | mov [rsp + stack_frame.rsp], rax | |
114 | ||
; Saved GPRs go to the .gpr area (named _XMM_SAVE in this file). Slots 3 and
; 4 are reserved for rsi/rdi and are only written on win64.
; r12-r15 are not used by this routine itself; presumably they are saved on
; behalf of sha512_mb_x8_avx512 -- confirm against that routine.
115 | mov [rsp + _XMM_SAVE + 8*0], rbx | |
116 | mov [rsp + _XMM_SAVE + 8*1], rbp | |
117 | mov [rsp + _XMM_SAVE + 8*2], r12 | |
118 | mov [rsp + _XMM_SAVE + 8*5], r13 | |
119 | mov [rsp + _XMM_SAVE + 8*6], r14 | |
120 | mov [rsp + _XMM_SAVE + 8*7], r15 | |
121 | %ifidn __OUTPUT_FORMAT__, win64 | |
122 | mov [rsp + _XMM_SAVE + 8*3], rsi | |
123 | mov [rsp + _XMM_SAVE + 8*4], rdi | |
; xmm6-xmm15 are stored at the bottom of the frame (.xmm field, offset 0).
124 | vmovdqu [rsp + 16*0], xmm6 | |
125 | vmovdqu [rsp + 16*1], xmm7 | |
126 | vmovdqu [rsp + 16*2], xmm8 | |
127 | vmovdqu [rsp + 16*3], xmm9 | |
128 | vmovdqu [rsp + 16*4], xmm10 | |
129 | vmovdqu [rsp + 16*5], xmm11 | |
130 | vmovdqu [rsp + 16*6], xmm12 | |
131 | vmovdqu [rsp + 16*7], xmm13 | |
132 | vmovdqu [rsp + 16*8], xmm14 | |
133 | vmovdqu [rsp + 16*9], xmm15 | |
134 | %endif | |
135 | ||
; Take a lane: the low byte of unused_lanes is the lane index to use, and
; the remaining bytes are shifted down and written back to the manager.
136 | mov unused_lanes, [state + _unused_lanes] | |
137 | movzx lane, BYTE(unused_lanes) | |
138 | shr unused_lanes, 8 | |
; lane_data = &state->ldata[lane]
139 | imul lane_data, lane, _LANE_DATA_size | |
140 | mov dword [job + _status], STS_BEING_PROCESSED | |
141 | lea lane_data, [state + _ldata + lane_data] | |
142 | mov [state + _unused_lanes], unused_lanes | |
143 | mov DWORD(len), [job + _len] | |
144 | ||
; Record the job in its lane. The 32-bit length goes into the HIGH dword of
; the lane's 8-byte lens entry (offset +4); the low dword is left untouched.
145 | mov [lane_data + _job_in_lane], job | |
146 | mov [state + _lens + 4 + 8*lane], DWORD(len) | |
147 | ||
148 | ||
149 | ; Load digest words from result_digest | |
; vmovdqa requires job->result_digest to be 16-byte aligned.
150 | vmovdqa xmm0, [job + _result_digest + 0*16] | |
151 | vmovdqa xmm1, [job + _result_digest + 1*16] | |
152 | vmovdqa xmm2, [job + _result_digest + 2*16] | |
153 | vmovdqa xmm3, [job + _result_digest + 3*16] | |
; Scatter the eight 64-bit digest words into the transposed args_digest
; array: word k of this lane lives at _args_digest + k*64 + 8*lane.
154 | vmovq [state + _args_digest + 8*lane + 0*64], xmm0 | |
155 | vpextrq [state + _args_digest + 8*lane + 1*64], xmm0, 1 | |
156 | vmovq [state + _args_digest + 8*lane + 2*64], xmm1 | |
157 | vpextrq [state + _args_digest + 8*lane + 3*64], xmm1, 1 | |
158 | vmovq [state + _args_digest + 8*lane + 4*64], xmm2 | |
159 | vpextrq [state + _args_digest + 8*lane + 5*64], xmm2, 1 | |
160 | vmovq [state + _args_digest + 8*lane + 6*64], xmm3 | |
161 | vpextrq [state + _args_digest + 8*lane + 7*64], xmm3, 1 | |
162 | ||
; Point the lane's data pointer at the job's input buffer.
163 | mov p, [job + _buffer] | |
164 | mov [state + _args_data_ptr + 8*lane], p | |
165 | ||
; One more lane is occupied. Hashing only starts once all 8 lanes are in
; use; until then the submit returns NULL.
166 | mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] | |
167 | add num_lanes_inuse, 1 | |
168 | mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) | |
169 | cmp num_lanes_inuse, 8 | |
170 | jne return_null | |
171 | ||
172 | start_loop: | |
173 | ; Find min length. Each lens entry in the sha512 mgr is 64 bits: the high 32 bits are the block count, the low bits are the lane idx (extracted below with a 0xF mask) | |
174 | vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a} | |
175 | vmovdqu ymm1, [state + _lens + 1*32] | |
176 | ||
; Unsigned 64-bit min reduction across the 8 entries. Because the block
; count is in the high dword, the smallest qword is the lane with the fewest
; blocks, and its low dword carries that lane's index along.
177 | vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i} | |
; vpalignr takes its upper result qword (per 128-bit half) from the old
; contents of ymm3; that qword is a don't-care, shown as "x" below.
178 | vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i} | |
179 | vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i} | |
180 | vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i} | |
181 | vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword | |
182 | ||
; Split the winning qword: idx = lane index, len2 = block count.
; len2 aliases arg2, so the job pointer is gone from here on.
; The jz tests the flags of the shr, i.e. "minimum block count == 0".
183 | vmovq idx, xmm2 | |
184 | mov len2, idx | |
185 | and idx, 0xF | |
186 | shr len2, 32 | |
187 | jz len_is_0 | |
188 | ||
189 | ||
; Build a vector holding the minimum block count in every high dword and 0
; in every low dword, then subtract it from all eight lens entries so that
; only the block counts decrease and the lane indices are preserved.
190 | vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i} | |
191 | vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0} | |
192 | vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0} | |
193 | ||
194 | vpsubd ymm0, ymm0, ymm2 | |
195 | vpsubd ymm1, ymm1, ymm2 | |
196 | ||
197 | vmovdqu [state + _lens + 0*32], ymm0 | |
198 | vmovdqu [state + _lens + 1*32], ymm1 | |
199 | ||
200 | ; "state" and "args" are the same address, arg1 | |
201 | ; len is arg2 | |
202 | call sha512_mb_x8_avx512 | |
203 | ; state and idx are intact | |
204 | ||
205 | len_is_0: | |
206 | ||
207 | ; process completed job "idx" | |
; lane_data = &state->ldata[idx]
208 | imul lane_data, idx, _LANE_DATA_size | |
209 | lea lane_data, [state + _ldata + lane_data] | |
210 | ||
; rax now holds the finished job and is not modified again before ret, so
; it doubles as the return value.
211 | mov job_rax, [lane_data + _job_in_lane] | |
212 | ||
213 | ||
; Release the lane: clear its job slot, mark the job completed, and push
; idx back onto the byte-packed unused_lanes word as the new low byte.
214 | mov unused_lanes, [state + _unused_lanes] | |
215 | mov qword [lane_data + _job_in_lane], 0 | |
216 | mov dword [job_rax + _status], STS_COMPLETED | |
217 | shl unused_lanes, 8 | |
218 | or unused_lanes, idx | |
219 | mov [state + _unused_lanes], unused_lanes | |
220 | ||
221 | mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] | |
222 | sub num_lanes_inuse, 1 | |
223 | mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) | |
; Gather lane idx's eight digest words out of the transposed args_digest
; array and write them back to the job (inverse of the scatter on submit).
224 | vmovq xmm0, [state + _args_digest + 8*idx + 0*64] | |
225 | vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1 | |
226 | vmovq xmm1, [state + _args_digest + 8*idx + 2*64] | |
227 | vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1 | |
228 | vmovq xmm2, [state + _args_digest + 8*idx + 4*64] | |
229 | vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1 | |
230 | vmovq xmm3, [state + _args_digest + 8*idx + 6*64] | |
231 | vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1 | |
232 | vmovdqa [job_rax + _result_digest + 0*16], xmm0 | |
233 | vmovdqa [job_rax + _result_digest + 1*16], xmm1 | |
234 | vmovdqa [job_rax + _result_digest + 2*16], xmm2 | |
235 | vmovdqa [job_rax + _result_digest + 3*16], xmm3 | |
236 | ||
; Common epilogue for both exits; rax already holds the return value
; (completed job pointer, or 0 when arriving from return_null).
237 | return: | |
238 | ||
239 | %ifidn __OUTPUT_FORMAT__, win64 | |
240 | vmovdqu xmm6, [rsp + 16*0] | |
241 | vmovdqu xmm7, [rsp + 16*1] | |
242 | vmovdqu xmm8, [rsp + 16*2] | |
243 | vmovdqu xmm9, [rsp + 16*3] | |
244 | vmovdqu xmm10, [rsp + 16*4] | |
245 | vmovdqu xmm11, [rsp + 16*5] | |
246 | vmovdqu xmm12, [rsp + 16*6] | |
247 | vmovdqu xmm13, [rsp + 16*7] | |
248 | vmovdqu xmm14, [rsp + 16*8] | |
249 | vmovdqu xmm15, [rsp + 16*9] | |
250 | mov rsi, [rsp + _XMM_SAVE + 8*3] | |
251 | mov rdi, [rsp + _XMM_SAVE + 8*4] | |
252 | %endif | |
253 | mov rbx, [rsp + _XMM_SAVE + 8*0] | |
254 | mov rbp, [rsp + _XMM_SAVE + 8*1] | |
255 | mov r12, [rsp + _XMM_SAVE + 8*2] | |
256 | mov r13, [rsp + _XMM_SAVE + 8*5] | |
257 | mov r14, [rsp + _XMM_SAVE + 8*6] | |
258 | mov r15, [rsp + _XMM_SAVE + 8*7] | |
259 | ||
; Drop the frame by reloading the caller's rsp saved in the prologue.
260 | mov rsp, [rsp + stack_frame.rsp] | |
261 | ||
262 | ret | |
263 | ||
; Fewer than 8 lanes occupied: nothing was hashed, report "no job finished".
264 | return_null: | |
265 | xor job_rax, job_rax | |
266 | jmp return | |
267 | ||
; 32-byte AND mask used by the submit routine on the broadcast minimum lens
; entry. In each 128-bit half, qword 0 keeps only its high dword (the block
; count) and qword 1 is cleared entirely.
; NOTE(review): despite the name, the mask clears the whole low dword of
; qword 0, not just a nibble.
268 | section .data align=32 | |
269 | ||
270 | align 32 | |
271 | clear_low_nibble: ; mgr len element 0xnnnnnnnn 0000000m, nnnnnnnn is blocknum, m is index | |
272 | dq 0xFFFFFFFF00000000, 0x0000000000000000 | |
273 | dq 0xFFFFFFFF00000000, 0x0000000000000000 | |
274 | ||
275 | %else | |
; Reached only when HAVE_AS_KNOWS_AVX512 is not defined, so none of the code
; above is assembled. For win64 output a single exported, empty label is
; emitted instead (presumably so the object file still defines a symbol --
; confirm against the build system).
276 | %ifidn __OUTPUT_FORMAT__, win64 | |
277 | global no_sha512_mb_mgr_submit_avx512 | |
278 | no_sha512_mb_mgr_submit_avx512: | |
279 | %endif | |
280 | %endif ; HAVE_AS_KNOWS_AVX512 |