]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
1e59de90 | 5 | ; modification, are permitted provided that the following conditions |
7c673cae FG |
6 | ; are met: |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | %include "md5_job.asm" | |
31 | %include "md5_mb_mgr_datastruct.asm" | |
32 | ||
33 | %include "reg_sizes.asm" | |
1e59de90 TL |
34 | |
[bits 64]
default rel
section .text

; Lower-level routine that advances the hashes of all lanes; called below with
; arg1 = state/args pointer and arg2 = length.
extern md5_mb_x4x2_avx

%if 1
;; Argument registers depend on the calling convention of the output format.
%ifidn __OUTPUT_FORMAT__, win64
	; WINDOWS register definitions
	%define arg1		rcx
	%define arg2		rdx

%else
	; UN*X register definitions
	%define arg1		rdi
	%define arg2		rsi

%endif

;; Common definitions (symbolic names for the registers used by the manager)
%define state			arg1	; first argument: manager state
%define job			arg2	; second argument: job being submitted
%define len2			arg2	; arg2 is reused as the length passed to md5_mb_x4x2_avx

; idx must be a register not clobbered by md5_mb_x4x2_avx
%define idx			r8

%define p			r9	; scratch: job's input buffer pointer

%define unused_lanes		rbx	; working copy of state->unused_lanes

; job_rax and len share rax: len is only live while the job is being queued,
; job_rax only once a completed job is being returned.
%define job_rax			rax
%define len			rax

%define lane			r10	; lane index taken from unused_lanes

%define lane_data		r11	; address of the lane's entry inside state->ldata

%endif ; if 1

; STACK_SPACE needs to be an odd multiple of 8:
; 8 qword slots for GPRs + 10 slots of 16 bytes for xmm6-xmm15 + 8 bytes of padding.
%define STACK_SPACE 8*8 + 16*10 + 8
77 | ||
;-----------------------------------------------------------------------
; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
; Exported as md5_mb_mgr_submit_avx.
;
; In:    arg 1 : state  (rcx on win64, rdi otherwise)
;        arg 2 : job    (rdx on win64, rsi otherwise)
; Out:   rax = NULL when free lanes remain after queuing this job,
;              otherwise pointer to the job that has just been completed.
; Saves: rbx, rbp, r12-r15 (plus rsi, rdi, xmm6-xmm15 on win64) are stored
;        in the local frame and restored before returning.
;
; Each entry of state->lens packs (length << 4) | lane_index, so a single
; unsigned-min reduction yields both the shortest length and its lane.
; NOTE(review): the unit of _len (presumably MD5 blocks) is defined by the
; included job/datastruct files and md5_mb_x4x2_avx - confirm there.
;-----------------------------------------------------------------------
mk_global md5_mb_mgr_submit_avx, function
md5_mb_mgr_submit_avx:
	endbranch

	; STACK_SPACE is an odd multiple of 8, so the frame base is 16-byte
	; aligned when rsp had the usual post-call alignment on entry; the
	; vmovdqa saves below depend on this.
	sub	rsp, STACK_SPACE
	; we need to save/restore all GPRs because lower layer clobbers them
	mov	[rsp + 8*0], rbx
	mov	[rsp + 8*1], rbp
	mov	[rsp + 8*2], r12
	mov	[rsp + 8*3], r13
	mov	[rsp + 8*4], r14
	mov	[rsp + 8*5], r15
%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + 8*6], rsi
	mov	[rsp + 8*7], rdi
	vmovdqa	[rsp + 8*8 + 16*0], xmm6
	vmovdqa	[rsp + 8*8 + 16*1], xmm7
	vmovdqa	[rsp + 8*8 + 16*2], xmm8
	vmovdqa	[rsp + 8*8 + 16*3], xmm9
	vmovdqa	[rsp + 8*8 + 16*4], xmm10
	vmovdqa	[rsp + 8*8 + 16*5], xmm11
	vmovdqa	[rsp + 8*8 + 16*6], xmm12
	vmovdqa	[rsp + 8*8 + 16*7], xmm13
	vmovdqa	[rsp + 8*8 + 16*8], xmm14
	vmovdqa	[rsp + 8*8 + 16*9], xmm15
%endif

	; Take the lane index from the low nibble of unused_lanes and shift
	; the remaining nibbles down.
	mov	unused_lanes, [state + _unused_lanes]
	mov	lane, unused_lanes
	and	lane, 0xF
	shr	unused_lanes, 4
	imul	lane_data, lane, _LANE_DATA_size
	mov	dword [job + _status], STS_BEING_PROCESSED
	lea	lane_data, [state + _ldata + lane_data]
	mov	[state + _unused_lanes], unused_lanes
	mov	DWORD(len), [job + _len]

	; lens[lane] = (len << 4) | lane
	shl	len, 4
	or	len, lane

	mov	[lane_data + _job_in_lane], job
	mov	[state + _lens + 4*lane], DWORD(len)

	; Load digest words from result_digest and scatter them into
	; args_digest: word k of this lane lives at row k (32-byte rows),
	; column 4*lane.
	vmovdqu	xmm0, [job + _result_digest + 0*16]
	vmovd	[state + _args_digest + 4*lane + 0*32], xmm0
	vpextrd	[state + _args_digest + 4*lane + 1*32], xmm0, 1
	vpextrd	[state + _args_digest + 4*lane + 2*32], xmm0, 2
	vpextrd	[state + _args_digest + 4*lane + 3*32], xmm0, 3

	mov	p, [job + _buffer]
	mov	[state + _args_data_ptr + 8*lane], p

	add	dword [state + _num_lanes_inuse], 1
	; Only run the hash when unused_lanes has been reduced to 0xF
	; NOTE(review): 0xF looks like the end-of-list marker meaning "no
	; free lane left" - confirm against the manager init code.
	cmp	unused_lanes, 0xF
	jne	return_null

start_loop:
	; Find min length
	vmovdqa	xmm0, [state + _lens + 0*16]
	vmovdqa	xmm1, [state + _lens + 1*16]

	; The upper lanes of xmm3 hold stale data; only the lanes shifted in
	; from xmm2 matter for the final low dword.
	vpminud	xmm2, xmm0, xmm1	; xmm2 has {D,C,B,A}
	vpalignr xmm3, xmm3, xmm2, 8	; xmm3 has {x,x,D,C}
	vpminud	xmm2, xmm2, xmm3	; xmm2 has {x,x,E,F}
	vpalignr xmm3, xmm3, xmm2, 4	; xmm3 has {x,x,x,E}
	vpminud	xmm2, xmm2, xmm3	; xmm2 has min value in low dword

	; Split the packed minimum: idx = lane nibble, len2 = length.
	vmovd	DWORD(idx), xmm2
	mov	len2, idx
	and	idx, 0xF
	shr	len2, 4
	jz	len_is_0		; shortest job has nothing left to hash

	; Broadcast (min_len << 4) with the lane nibble cleared and subtract
	; it from every entry, leaving each entry's lane nibble untouched.
	vpand	xmm2, xmm2, [rel clear_low_nibble]
	vpshufd	xmm2, xmm2, 0

	vpsubd	xmm0, xmm0, xmm2
	vpsubd	xmm1, xmm1, xmm2

	vmovdqa	[state + _lens + 0*16], xmm0
	vmovdqa	[state + _lens + 1*16], xmm1

	; "state" and "args" are the same address, arg1
	; len is arg2
	call	md5_mb_x4x2_avx
	; state and idx are intact

len_is_0:
	; process completed job "idx"
	imul	lane_data, idx, _LANE_DATA_size
	lea	lane_data, [state + _ldata + lane_data]

	mov	job_rax, [lane_data + _job_in_lane]
	mov	unused_lanes, [state + _unused_lanes]
	mov	qword [lane_data + _job_in_lane], 0
	mov	dword [job_rax + _status], STS_COMPLETED
	; Put lane idx back into the low nibble of unused_lanes.
	shl	unused_lanes, 4
	or	unused_lanes, idx
	mov	[state + _unused_lanes], unused_lanes

	; Largest u32 value: this entry loses every vpminud comparison
	; against a lane that still carries a job.
	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF
	sub	dword [state + _num_lanes_inuse], 1

	; Gather the four digest words of lane idx back into one register.
	vmovd	xmm0, [state + _args_digest + 4*idx + 0*32]
	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*32], 1
	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*32], 2
	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*32], 3

	; Unaligned store, matching the unaligned load used at submission:
	; nothing in this file guarantees the job structure is 16-byte aligned.
	vmovdqu	[job_rax + _result_digest + 0*16], xmm0

return:

%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa	xmm6, [rsp + 8*8 + 16*0]
	vmovdqa	xmm7, [rsp + 8*8 + 16*1]
	vmovdqa	xmm8, [rsp + 8*8 + 16*2]
	vmovdqa	xmm9, [rsp + 8*8 + 16*3]
	vmovdqa	xmm10, [rsp + 8*8 + 16*4]
	vmovdqa	xmm11, [rsp + 8*8 + 16*5]
	vmovdqa	xmm12, [rsp + 8*8 + 16*6]
	vmovdqa	xmm13, [rsp + 8*8 + 16*7]
	vmovdqa	xmm14, [rsp + 8*8 + 16*8]
	vmovdqa	xmm15, [rsp + 8*8 + 16*9]
	mov	rsi, [rsp + 8*6]
	mov	rdi, [rsp + 8*7]
%endif
	mov	rbx, [rsp + 8*0]
	mov	rbp, [rsp + 8*1]
	mov	r12, [rsp + 8*2]
	mov	r13, [rsp + 8*3]
	mov	r14, [rsp + 8*4]
	mov	r15, [rsp + 8*5]

	add	rsp, STACK_SPACE

	ret

return_null:
	; Free lanes remain: nothing was processed, report "no completed job".
	xor	job_rax, job_rax
	jmp	return
222 | ||
223 | ||
section .data align=16

align 16
; 128-bit mask: keeps bits 4..31 of the lowest dword and clears everything
; else. ANDed with the packed minimum (length << 4 | lane) to drop the lane
; nibble before the value is broadcast and subtracted from all lens entries.
clear_low_nibble:
	dd	0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000