]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
1e59de90 | 5 | ; modification, are permitted provided that the following conditions |
7c673cae FG |
6 | ; are met: |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | %include "sha256_job.asm" | |
31 | %include "sha256_mb_mgr_datastruct.asm" | |
32 | ||
33 | %include "reg_sizes.asm" | |
34 | ||
35 | extern sha256_mb_x4_avx | |
1e59de90 TL |
36 | |
37 | [bits 64] | |
7c673cae | 38 | default rel |
1e59de90 | 39 | section .text |
7c673cae FG |
40 | |
; Register aliases for this file. Several alias names map to the same physical
; register, so only one name from each group can hold a live value at a time.
41 | %ifidn __OUTPUT_FORMAT__, elf64 | |
42 | ; Linux (elf64) register definitions; the trailing comment on each line names the register the %else branch uses | |
43 | %define arg1 rdi ; rcx | |
44 | %define arg2 rsi ; rdx | |
45 | ||
46 | ; idx needs to be other than arg1, arg2, rbx, r12 (idx is read again after the call to sha256_mb_x4_avx; presumably those registers do not survive it - confirm against the callee) | |
47 | %define idx rdx ; rsi | |
48 | %define last_len rdx ; rsi | |
1e59de90 | 49 | 
7c673cae FG |
50 | %define size_offset rcx ; rdi |
51 | %define tmp2 rcx ; rdi | |
52 | ||
53 | %else | |
54 | ; WINDOWS register definitions (also selected for every output format other than elf64) | |
55 | %define arg1 rcx | |
56 | %define arg2 rdx | |
57 | ||
58 | ; idx needs to be other than arg1, arg2, rbx, r12 (same constraint as in the elf64 branch) | |
59 | %define last_len rsi | |
60 | %define idx rsi | |
1e59de90 | 61 | 
7c673cae FG |
62 | %define size_offset rdi |
63 | %define tmp2 rdi | |
1e59de90 | 64 | 
7c673cae FG |
65 | %endif |
66 | ||
67 | ; Common definitions (built on arg1/arg2 above, so they follow the selected calling convention) | |
; state is arg1; job, len2 and p2 all alias arg2.
68 | %define state arg1 | |
69 | %define job arg2 | |
70 | %define len2 arg2 | |
71 | %define p2 arg2 | |
72 | ||
73 | %define p r11 | |
74 | %define start_offset r11 | |
75 | ||
; rbx is saved and restored by the routine below before/after being used as unused_lanes.
76 | %define unused_lanes rbx | |
1e59de90 | 77 | 
7c673cae FG |
; job_rax doubles as the return-value register.
78 | %define job_rax rax |
79 | %define len rax | |
80 | ||
; rbp is saved and restored by the routine below; lane, tmp3 and lens3 share it.
81 | %define lane rbp | |
82 | %define tmp3 rbp | |
83 | %define lens3 rbp | |
1e59de90 | 84 | 
7c673cae FG |
85 | %define extra_blocks r8 |
86 | %define lens0 r8 | |
1e59de90 | 87 | 
7c673cae FG |
88 | %define tmp r9 |
89 | %define lens1 r9 | |
1e59de90 | 90 | 
7c673cae FG |
91 | %define lane_data r10 |
92 | %define lens2 r10 | |
93 | ||
94 | ||
95 | ; STACK_SPACE needs to be an odd multiple of 8 (here 8*5 + 16*10 = 200 = 25*8): rsp is 8 mod 16 on entry, so subtracting it leaves rsp 16-byte aligned for the vmovdqa saves and for the call | |
; Frame layout: [rsp, rsp + _XMM_SAVE) holds ten 16-byte xmm slots,
; followed at rsp + _XMM_SAVE by five 8-byte GPR slots.
96 | %define _XMM_SAVE 16*10 | |
97 | %define _GPR_SAVE 8*5 | |
98 | %define STACK_SPACE _GPR_SAVE + _XMM_SAVE | |
99 | ||
100 | ; SHA256_JOB* sha256_mb_mgr_submit_avx(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) | |
101 | ; arg 1 : rcx (rdi when assembled as elf64) : state | |
102 | ; arg 2 : rdx (rsi when assembled as elf64) : job | |
;
; Puts `job` into the lane named by the low nibble of state->_unused_lanes.
; If, after removing that nibble, _unused_lanes is not 0xF, returns 0 in rax.
; Otherwise finds the lane with the smallest _lens entry, calls
; sha256_mb_x4_avx for that length (skipped when it is zero), marks that
; lane's job STS_COMPLETED, copies its digest back to the job, puts the lane
; back on _unused_lanes and returns the job pointer in rax.
;
; Saves/restores: rbx, rbp, r12 always; rsi, rdi and xmm6-xmm15 only for win64.
; NOTE(review): the register aliases are selected with "elf64 / %else" while
; the save/restore blocks test for "win64"; an output format that is neither
; would get the Windows register names without the Windows saves - confirm
; that only elf64 and win64 builds are intended.
1e59de90 | 103 | mk_global sha256_mb_mgr_submit_avx, function |
7c673cae | 104 | sha256_mb_mgr_submit_avx: |
1e59de90 | 105 | endbranch |
7c673cae FG |
106 | 

; Prologue: reserve the save area and spill the callee-saved registers.
107 | sub rsp, STACK_SPACE | |
108 | mov [rsp + _XMM_SAVE + 8*0], rbx | |
109 | mov [rsp + _XMM_SAVE + 8*1], rbp | |
110 | mov [rsp + _XMM_SAVE + 8*2], r12 | |
111 | %ifidn __OUTPUT_FORMAT__, win64 | |
112 | mov [rsp + _XMM_SAVE + 8*3], rsi | |
113 | mov [rsp + _XMM_SAVE + 8*4], rdi | |
114 | vmovdqa [rsp + 16*0], xmm6 | |
115 | vmovdqa [rsp + 16*1], xmm7 | |
116 | vmovdqa [rsp + 16*2], xmm8 | |
117 | vmovdqa [rsp + 16*3], xmm9 | |
118 | vmovdqa [rsp + 16*4], xmm10 | |
119 | vmovdqa [rsp + 16*5], xmm11 | |
120 | vmovdqa [rsp + 16*6], xmm12 | |
121 | vmovdqa [rsp + 16*7], xmm13 | |
122 | vmovdqa [rsp + 16*8], xmm14 | |
123 | vmovdqa [rsp + 16*9], xmm15 | |
124 | %endif | |
125 | ||
; Take a lane: lane = low nibble of _unused_lanes, then shift that nibble out.
; lane_data = state + _ldata + lane * _LANE_DATA_size.
126 | mov unused_lanes, [state + _unused_lanes] | |
127 | movzx lane, BYTE(unused_lanes) | |
128 | and lane, 0xF | |
129 | shr unused_lanes, 4 | |
130 | imul lane_data, lane, _LANE_DATA_size | |
131 | mov dword [job + _status], STS_BEING_PROCESSED | |
132 | lea lane_data, [state + _ldata + lane_data] | |
133 | mov [state + _unused_lanes], unused_lanes | |
134 | mov DWORD(len), [job + _len] | |
1e59de90 | 135 | 
7c673cae FG |
; Pack the lens entry as (len << 4) | lane so that the minimum search below
; also yields the lane index in the low nibble.
; NOTE(review): only the low 32 bits are stored, so the top 4 bits of a
; 32-bit _len would be lost - confirm callers bound _len accordingly.
136 | shl len, 4 |
137 | or len, lane | |
138 | ||
139 | mov [lane_data + _job_in_lane], job | |
140 | mov [state + _lens + 4*lane], DWORD(len) | |
141 | ||
142 | ; Load digest words from result_digest and scatter them into _args_digest: word i of this lane goes to _args_digest + 16*i + 4*lane | |
; vmovdqa assumes job + _result_digest is 16-byte aligned - TODO confirm
; against the SHA256_JOB layout.
143 | vmovdqa xmm0, [job + _result_digest + 0*16] | |
144 | vmovdqa xmm1, [job + _result_digest + 1*16] | |
145 | vmovd [state + _args_digest + 4*lane + 0*16], xmm0 | |
146 | vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1 | |
147 | vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2 | |
148 | vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3 | |
149 | vmovd [state + _args_digest + 4*lane + 4*16], xmm1 | |
150 | vpextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1 | |
151 | vpextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2 | |
152 | vpextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3 | |
153 | ||
154 | ||
; Record the job's input buffer pointer for this lane.
155 | mov p, [job + _buffer] | |
156 | mov [state + _args_data_ptr + 8*lane], p | |
157 | ||
; One more lane in use. If the remaining _unused_lanes value is not 0xF
; (presumably the marker left when no free lane nibbles remain - confirm
; against the manager's init code), return 0 without hashing.
1e59de90 | 158 | add dword [state + _num_lanes_inuse], 1 |
7c673cae FG |
159 | cmp unused_lanes, 0xF |
160 | jne return_null | |
161 | ||
; Note: no jump to start_loop is visible in this file; it is reached by fall-through.
162 | start_loop: | |
163 | ; Find min length (unsigned minimum of the four packed lens entries) | |
164 | mov DWORD(lens0), [state + _lens + 0*4] | |
165 | mov idx, lens0 | |
166 | mov DWORD(lens1), [state + _lens + 1*4] | |
167 | cmp lens1, idx | |
168 | cmovb idx, lens1 | |
169 | mov DWORD(lens2), [state + _lens + 2*4] | |
170 | cmp lens2, idx | |
171 | cmovb idx, lens2 | |
172 | mov DWORD(lens3), [state + _lens + 3*4] | |
173 | cmp lens3, idx | |
174 | cmovb idx, lens3 | |
; Split the minimum: idx = lane index (low nibble), len2 = length part still
; shifted left by 4. A zero length part means that lane needs no hashing.
175 | mov len2, idx | |
176 | and idx, 0xF | |
177 | and len2, ~0xF | |
1e59de90 | 178 | jz len_is_0 |
7c673cae FG |
179 | 
; Subtract the minimum length from every lane (low nibbles are untouched
; because len2 has its low nibble cleared), then unshift len2 for the call.
180 | sub lens0, len2 | |
181 | sub lens1, len2 | |
182 | sub lens2, len2 | |
183 | sub lens3, len2 | |
184 | shr len2, 4 | |
185 | mov [state + _lens + 0*4], DWORD(lens0) | |
186 | mov [state + _lens + 1*4], DWORD(lens1) | |
187 | mov [state + _lens + 2*4], DWORD(lens2) | |
188 | mov [state + _lens + 3*4], DWORD(lens3) | |
189 | ||
190 | ; "state" and "args" are the same address, arg1 | |
191 | ; len is arg2 | |
192 | call sha256_mb_x4_avx | |
193 | ; state and idx are intact (relies on the callee's register usage, which is not visible in this file) | |
194 | ||
195 | len_is_0: | |
196 | ; process completed job "idx": detach it from its lane, mark it completed, and return the lane to _unused_lanes | |
197 | imul lane_data, idx, _LANE_DATA_size | |
198 | lea lane_data, [state + _ldata + lane_data] | |
1e59de90 | 199 | 
7c673cae FG |
200 | mov job_rax, [lane_data + _job_in_lane] |
201 | mov unused_lanes, [state + _unused_lanes] | |
202 | mov qword [lane_data + _job_in_lane], 0 | |
203 | mov dword [job_rax + _status], STS_COMPLETED | |
; Put idx back as the new low nibble of _unused_lanes.
204 | shl unused_lanes, 4 | |
205 | or unused_lanes, idx | |
206 | mov [state + _unused_lanes], unused_lanes | |
207 | ||
1e59de90 TL |
208 | sub dword [state + _num_lanes_inuse], 1 |
209 | ||
7c673cae FG |
; Gather the eight digest words of lane idx (word i at _args_digest + 16*i + 4*idx)
; and store them to the job's result_digest.
210 | vmovd xmm0, [state + _args_digest + 4*idx + 0*16] |
211 | vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 | |
212 | vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 | |
213 | vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 | |
214 | vmovd xmm1, [state + _args_digest + 4*idx + 4*16] | |
215 | vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1 | |
216 | vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2 | |
217 | vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3 | |
218 | ||
219 | vmovdqa [job_rax + _result_digest + 0*16], xmm0 | |
220 | vmovdqa [job_rax + _result_digest + 1*16], xmm1 | |
221 | ||
; Epilogue: rax already holds the return value (job pointer, or 0 via return_null).
222 | return: | |
223 | ||
224 | %ifidn __OUTPUT_FORMAT__, win64 | |
225 | vmovdqa xmm6, [rsp + 16*0] | |
226 | vmovdqa xmm7, [rsp + 16*1] | |
227 | vmovdqa xmm8, [rsp + 16*2] | |
228 | vmovdqa xmm9, [rsp + 16*3] | |
229 | vmovdqa xmm10, [rsp + 16*4] | |
230 | vmovdqa xmm11, [rsp + 16*5] | |
231 | vmovdqa xmm12, [rsp + 16*6] | |
232 | vmovdqa xmm13, [rsp + 16*7] | |
233 | vmovdqa xmm14, [rsp + 16*8] | |
234 | vmovdqa xmm15, [rsp + 16*9] | |
235 | mov rsi, [rsp + _XMM_SAVE + 8*3] | |
236 | mov rdi, [rsp + _XMM_SAVE + 8*4] | |
237 | %endif | |
238 | mov rbx, [rsp + _XMM_SAVE + 8*0] | |
239 | mov rbp, [rsp + _XMM_SAVE + 8*1] | |
240 | mov r12, [rsp + _XMM_SAVE + 8*2] | |
241 | add rsp, STACK_SPACE | |
242 | ||
243 | ret | |
244 | ||
; Taken when the submit did not trigger any hashing: return 0 through the shared epilogue.
245 | return_null: | |
246 | xor job_rax, job_rax | |
247 | jmp return | |
248 | ||
249 | section .data align=16 | |
250 | ||
; H0..H7: eight consecutive 32-bit words, 16-byte aligned. The values are the
; SHA-256 initial hash values defined in FIPS 180-4.
; None of these labels is referenced by the code visible in this file.
251 | align 16 | |
252 | H0: dd 0x6a09e667 | |
253 | H1: dd 0xbb67ae85 | |
254 | H2: dd 0x3c6ef372 | |
255 | H3: dd 0xa54ff53a | |
256 | H4: dd 0x510e527f | |
257 | H5: dd 0x9b05688c | |
258 | H6: dd 0x1f83d9ab | |
259 | H7: dd 0x5be0cd19 | |
260 |