]>
Commit | Line | Data |
---|---|---|
a377c6b1 MD |
1 | /* |
2 | * Flush routine for SHA256 multibuffer | |
3 | * | |
4 | * This file is provided under a dual BSD/GPLv2 license. When using or | |
5 | * redistributing this file, you may do so under either license. | |
6 | * | |
7 | * GPL LICENSE SUMMARY | |
8 | * | |
9 | * Copyright(c) 2016 Intel Corporation. | |
10 | * | |
11 | * This program is free software; you can redistribute it and/or modify | |
12 | * it under the terms of version 2 of the GNU General Public License as | |
13 | * published by the Free Software Foundation. | |
14 | * | |
15 | * This program is distributed in the hope that it will be useful, but | |
16 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 | * General Public License for more details. | |
19 | * | |
20 | * Contact Information: | |
21 | * Megha Dey <megha.dey@linux.intel.com> | |
22 | * | |
23 | * BSD LICENSE | |
24 | * | |
25 | * Copyright(c) 2016 Intel Corporation. | |
26 | * | |
27 | * Redistribution and use in source and binary forms, with or without | |
28 | * modification, are permitted provided that the following conditions | |
29 | * are met: | |
30 | * | |
31 | * * Redistributions of source code must retain the above copyright | |
32 | * notice, this list of conditions and the following disclaimer. | |
33 | * * Redistributions in binary form must reproduce the above copyright | |
34 | * notice, this list of conditions and the following disclaimer in | |
35 | * the documentation and/or other materials provided with the | |
36 | * distribution. | |
37 | * * Neither the name of Intel Corporation nor the names of its | |
38 | * contributors may be used to endorse or promote products derived | |
39 | * from this software without specific prior written permission. | |
40 | * | |
41 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
42 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
43 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
44 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
45 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
46 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
47 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
48 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
49 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
50 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
51 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
52 | */ | |
53 | #include <linux/linkage.h> | |
54 | #include <asm/frame.h> | |
55 | #include "sha256_mb_mgr_datastruct.S" | |
56 | ||
.extern sha256_x8_avx2

/*
 * Register roles (cpp aliases).  Several aliases share one physical
 * register, so only one role per register is live at any point.
 */

/* LINUX register definitions: first and second integer argument */
#define arg1		%rdi
#define arg2		%rsi

/* Common register definitions */
#define state		arg1
#define job		arg2
#define len2		arg2

/*
 * idx is read again after the call to sha256_x8_avx2, so it must be a
 * register that call leaves intact.
 */
#define idx		%r8
#define DWORD_idx	%r8d

/* %rbx: saved and restored by the functions in this file */
#define unused_lanes	%rbx
#define lane_data	%rbx
#define tmp2		%rbx
#define tmp2_w		%ebx

/* %rax: scratch, and the returned job pointer */
#define job_rax		%rax
#define tmp1		%rax
#define size_offset	%rax
#define tmp		%rax
#define start_offset	%rax

/*
 * arg1/arg2 already expand to %-prefixed register names; an extra '%'
 * here would expand to an invalid operand.
 */
#define tmp3		arg1

#define extra_blocks	arg2
#define p		arg2
87 | ||
/*
 * LABEL stem num
 * Defines the label formed by pasting num onto stem.  The empty \()
 * separates the argument name from the colon that follows it.
 */
.macro LABEL stem num
\stem\num\():
.endm

/*
 * JNE_SKIP num
 * Emits a jne to the label skip_<num>.
 */
.macro JNE_SKIP num
	jne	skip_\num
.endm

/*
 * SET_OFFSET _offset
 * Assigns its argument to the assembler symbol "offset".
 */
.altmacro
.macro SET_OFFSET _offset
	offset = \_offset
.endm
.noaltmacro
101 | ||
/*
 * FLUSH_PROBE_LANE num sym
 * Tests the job pointer of lane num; when it is non-null, idx is loaded
 * with the quadword stored at sym.
 */
.macro FLUSH_PROBE_LANE num sym
	offset = (_ldata + \num * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	\sym(%rip), idx
.endm

/*
 * JOB_SHA256 *sha256_mb_mgr_flush_avx2(MB_MGR *state)
 *
 * In:   arg1 (%rdi) = state
 * Out:  %rax = job removed from its lane, with its status set to
 *       STS_COMPLETED and its result digest filled in, or 0 when
 *       bit 32+3 of the unused-lanes word is set (all lanes empty).
 *
 * %rbx is pushed on entry and popped before returning.
 */
ENTRY(sha256_mb_mgr_flush_avx2)
	FRAME_BEGIN
	push	%rbx

	/* Bit 32+3 of the unused-lanes word set: every lane is empty. */
	mov	_unused_lanes(state), unused_lanes
	bt	$32+3, unused_lanes
	jc	.Lflush_return_null

	/*
	 * Pick a lane holding a job.  idx starts at 0 and is overwritten by
	 * each occupied lane 1..7 in turn, so the highest occupied lane is
	 * the one left in idx.
	 */
	xor	idx, idx
	FLUSH_PROBE_LANE 1, one
	FLUSH_PROBE_LANE 2, two
	FLUSH_PROBE_LANE 3, three
	FLUSH_PROBE_LANE 4, four
	FLUSH_PROBE_LANE 5, five
	FLUSH_PROBE_LANE 6, six
	FLUSH_PROBE_LANE 7, seven

	/*
	 * Every lane without a job receives the data pointer of lane idx
	 * and a length of 0xFFFFFFFF.
	 */
.Lflush_copy_lane_data:
	offset = (_args + _data_ptr)
	mov	offset(state,idx,8), tmp

	I = 0
.rep 8
	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
.altmacro
	JNE_SKIP %I
	offset = (_args + _data_ptr + 8*I)
	mov	tmp, offset(state)
	offset = (_lens + 4*I)
	movl	$0xFFFFFFFF, offset(state)
LABEL skip_ %I
	I = (I+1)
.noaltmacro
.endr

	/* Reduce the eight length dwords to their unsigned minimum. */
	vmovdqa	_lens+0*16(state), %xmm0
	vmovdqa	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# four candidates, one per dword
	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3[63:0] = xmm2[127:64]
	vpminud	%xmm3, %xmm2, %xmm2		# two candidates in dwords 0 and 1
	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3[31:0] = xmm2[63:32]
	vpminud	%xmm3, %xmm2, %xmm2		# dword 0 = overall minimum

	/*
	 * Split the minimum: its low nibble becomes the lane index and the
	 * remaining bits, shifted down, become len2.  A zero len2 skips the
	 * hashing call.
	 */
	vmovd	%xmm2, DWORD_idx
	mov	idx, len2
	and	$0xF, idx
	shr	$4, len2
	jz	.Lflush_len_is_0

	/*
	 * Broadcast the minimum with its low nibble cleared and subtract it
	 * from all eight length dwords.
	 */
	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
	vpshufd	$0, %xmm2, %xmm2

	vpsubd	%xmm2, %xmm0, %xmm0
	vpsubd	%xmm2, %xmm1, %xmm1

	vmovdqa	%xmm0, _lens+0*16(state)
	vmovdqa	%xmm1, _lens+1*16(state)

	/*
	 * "state" and "args" are the same address and are passed in arg1;
	 * the length is passed in arg2.
	 */
	call	sha256_x8_avx2
	/* state and idx are intact after the call */

.Lflush_len_is_0:
	/* Retire the job sitting in lane idx. */
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)

	/* Shift idx into the low nibble of the unused-lanes word. */
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes

	mov	unused_lanes, _unused_lanes(state)
	movl	$0xFFFFFFFF, _lens(state,idx,4)

	/*
	 * Gather the eight digest dwords of lane idx; consecutive words of
	 * one lane are 32 bytes apart.  Words 0..3 go to xmm0, 4..7 to xmm1.
	 */
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1

	vmovdqu	%xmm0, _result_digest(job_rax)
	offset = (_result_digest + 1*16)
	vmovdqu	%xmm1, offset(job_rax)

.Lflush_return:
	pop	%rbx
	FRAME_END
	ret

.Lflush_return_null:
	xor	job_rax, job_rax
	jmp	.Lflush_return
ENDPROC(sha256_mb_mgr_flush_avx2)
224 | ||
225 | ############################################################################## | |
226 | ||
/*
 * sha256_mb_mgr_get_comp_job_avx2
 *
 * In:   arg1 (%rdi) = state
 * Out:  %rax = job removed from its lane, with its status set to
 *       STS_COMPLETED and its result digest filled in, or 0 when
 *         - bit 32+3 of the unused-lanes word is set (all lanes empty), or
 *         - the minimum length dword has any bit set above its low nibble.
 *
 * Makes no calls.  %rbx is pushed on entry and popped before returning.
 */
.align 16
ENTRY(sha256_mb_mgr_get_comp_job_avx2)
	push	%rbx

	/* Bit 32+3 of the unused-lanes word set: every lane is empty. */
	mov	_unused_lanes(state), unused_lanes
	bt	$(32+3), unused_lanes
	jc	.return_null

	/* Reduce the eight length dwords to their unsigned minimum. */
	vmovdqa	_lens(state), %xmm0
	vmovdqa	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# four candidates, one per dword
	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3[63:0] = xmm2[127:64]
	vpminud	%xmm3, %xmm2, %xmm2		# two candidates in dwords 0 and 1
	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3[31:0] = xmm2[63:32]
	vpminud	%xmm3, %xmm2, %xmm2		# dword 0 = overall minimum

	/*
	 * The low nibble of the minimum is the lane index.  Any bit set
	 * above it means no lane is ready to be returned.
	 */
	vmovd	%xmm2, DWORD_idx
	test	$~0xF, idx
	jnz	.return_null

	/* Retire the job sitting in lane idx. */
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)

	/* Shift idx into the low nibble of the unused-lanes word. */
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state, idx, 4)

	/*
	 * Gather the eight digest dwords of lane idx; consecutive words of
	 * one lane are 32 bytes apart.  Words 0..3 go to xmm0, 4..7 to xmm1.
	 */
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	/*
	 * Word 4 must start xmm1.  Loading word 0 into xmm0 here instead
	 * would zero words 1..3 gathered above and leave the low dword of
	 * xmm1 unset.
	 */
	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1

	vmovdqu	%xmm0, _result_digest(job_rax)
	offset = (_result_digest + 1*16)
	vmovdqu	%xmm1, offset(job_rax)

	pop	%rbx

	ret

.return_null:
	xor	job_rax, job_rax
	pop	%rbx
	ret
ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
286 | ||
/*
 * Constant pool.  These values are only ever loaded (RIP-relative), so
 * they are placed in a read-only section.
 */
.section .rodata

.align 16
/* Mask with bits 4..31 set: keeps the low dword minus its low nibble. */
clear_low_nibble:
	.octa	0x000000000000000000000000FFFFFFF0

/* Quadword lane numbers used as cmovne memory operands. */
one:
	.quad	1
two:
	.quad	2
three:
	.quad	3
four:
	.quad	4
five:
	.quad	5
six:
	.quad	6
seven:
	.quad	7