]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
1e59de90 | 5 | ; modification, are permitted provided that the following conditions |
7c673cae FG |
6 | ; are met: |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
%include "md5_mb_mgr_datastruct.asm"    ; MD5_ARGS layout (_data_ptr, digest rows)
%include "reg_sizes.asm"                ; mk_global / endbranch helpers

; Assemble the AVX-512 path only when the assembler understands EVEX encodings.
%ifdef HAVE_AS_KNOWS_AVX512

[bits 64]
default rel                             ; RIP-relative addressing for all data refs

section .text

7c673cae FG |
39 | |
40 | ;; code to compute double octal MD5 using AVX512 | |
41 | ||
42 | ;; Stack must be aligned to 64 bytes before call | |
43 | ||
44 | ;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 | |
45 | ;; Windows preserves: rcx rbp | |
46 | ;; | |
47 | ;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 | |
48 | ;; Linux preserves: rdi rbp | |
49 | ;; | |
50 | ;; clobbers zmm0-8, 14-31 | |
51 | ||
52 | ;; clobbers all GPRs other than arg1 and rbp | |
53 | ||
54 | %ifidn __OUTPUT_FORMAT__, win64 | |
55 | %define arg1 rcx ; arg0 | |
56 | %define arg2 rdx ; arg1 | |
57 | %define reg3 r8 ; arg2 | |
58 | %define reg4 r9 ; arg3 | |
59 | %define var1 rdi | |
60 | %define var2 rsi | |
61 | %define local_func_decl(func_name) global func_name | |
62 | %else | |
63 | %define arg1 rdi ; arg0 | |
64 | %define arg2 rsi ; arg1 | |
65 | %define var1 rdx ; arg2 | |
66 | %define var2 rcx ; arg3 | |
1e59de90 | 67 | %define local_func_decl(func_name) mk_global func_name, function, internal |
7c673cae FG |
68 | %endif |
69 | ||
70 | %define state arg1 | |
71 | %define num_blks arg2 | |
72 | ||
73 | %define IN (state + _data_ptr) | |
74 | %define DIGEST state | |
75 | %define SIZE num_blks | |
76 | ;; These are pointers to data block1 and block2 in the stack | |
77 | ; which will ping pong back and forth | |
78 | %define DPTR1 rbx | |
79 | %define DPTR2 var2 | |
80 | %define IDX var1 | |
81 | %define TBL rax | |
82 | ||
83 | %define inp0 r8 | |
84 | %define inp1 r9 | |
85 | %define inp2 r10 | |
86 | %define inp3 r11 | |
87 | %define inp4 r12 | |
88 | %define inp5 r13 | |
89 | %define inp6 r14 | |
90 | %define inp7 r15 | |
91 | ||
92 | ;; Transposed Digest Storage | |
93 | %define A zmm0 | |
94 | %define B zmm1 | |
95 | %define C zmm2 | |
96 | %define D zmm3 | |
97 | %define A1 zmm4 | |
98 | %define B1 zmm5 | |
99 | %define C1 zmm6 | |
100 | %define D1 zmm7 | |
101 | ||
102 | %define md5c zmm16 | |
103 | ||
104 | %define MASK0 zmm17 | |
105 | %define MASK1 zmm18 | |
106 | ||
107 | %define TMP0 zmm20 | |
108 | %define TMP1 zmm21 | |
109 | ||
110 | ||
111 | ;; Data are stored into the Wx after transposition | |
112 | %define W0 zmm8 | |
113 | %define W1 zmm9 | |
114 | %define W2 zmm10 | |
115 | %define W3 zmm11 | |
116 | %define W4 zmm12 | |
117 | %define W5 zmm13 | |
118 | %define W6 zmm14 | |
119 | %define W7 zmm15 | |
120 | ||
121 | %define W8 zmm24 | |
122 | %define W9 zmm25 | |
123 | %define W10 zmm26 | |
124 | %define W11 zmm27 | |
125 | %define W12 zmm28 | |
126 | %define W13 zmm29 | |
127 | %define W14 zmm30 | |
128 | %define W15 zmm31 | |
129 | ||
130 | %define MD5_DIGEST_ROW_SIZE (16*4) | |
131 | %define APPEND(a,b) a %+ b | |
132 | %define APPEND3(a,b,c) a %+ b %+ c | |
133 | ||
;; Temporary registers used during data transposition

; RESZ n expands to "resb 64* n": reserve n ZMM-sized (64-byte) slots.
%define RESZ resb 64*
;; Assume stack aligned to 64 bytes before call
;; Therefore FRAMESIZE mod 64 must be 64-8 = 56
struc STACK
_DATA:          RESZ    2*2*16  ; 2 blocks * 2 sets of lanes * 16 regs
_DIGEST:        RESZ    8       ; stores Z_AA-Z_DD, Z_AA2-Z_DD2
_TMPDIGEST:     RESZ    2       ; stores Z_AA, Z_BB temporarily
_RSP_SAVE:      RESQ    1       ; original RSP
endstruc

;; Saved-digest slots (one 64-byte ZMM each) used to add the previous
;; chaining value back after each block.
%define Z_AA    rsp + _DIGEST + 64*0
%define Z_BB    rsp + _DIGEST + 64*1
%define Z_CC    rsp + _DIGEST + 64*2
%define Z_DD    rsp + _DIGEST + 64*3
%define Z_AA1   rsp + _DIGEST + 64*4
%define Z_BB1   rsp + _DIGEST + 64*5
%define Z_CC1   rsp + _DIGEST + 64*6
%define Z_DD1   rsp + _DIGEST + 64*7

; Effective row size: 32 lanes * 4 bytes (overrides the earlier 16*4 define).
%define MD5_DIGEST_ROW_SIZE (32*4)

157 | ||
158 | ;; | |
159 | ;; MD5 left rotations (number of bits) | |
160 | ;; | |
161 | %define rot11 7 | |
162 | %define rot12 12 | |
163 | %define rot13 17 | |
164 | %define rot14 22 | |
165 | %define rot21 5 | |
166 | %define rot22 9 | |
167 | %define rot23 14 | |
168 | %define rot24 20 | |
169 | %define rot31 4 | |
170 | %define rot32 11 | |
171 | %define rot33 16 | |
172 | %define rot34 23 | |
173 | %define rot41 6 | |
174 | %define rot42 10 | |
175 | %define rot43 15 | |
176 | %define rot44 21 | |
177 | ||
;; TRANSPOSE16 r0..r15, t0, t1
;; 16x16 dword transpose: on entry rN holds 16 consecutive dwords of lane N;
;; on exit rN holds dword N of all 16 lanes. t0/t1 are scratch; MASK0/MASK1
;; must already hold the PSHUFFLE_TRANSPOSE16_MASK1/2 qword selectors.
%macro TRANSPOSE16 18
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%r8 %9
%define %%r9 %10
%define %%r10 %11
%define %%r11 %12
%define %%r12 %13
%define %%r13 %14
%define %%r14 %15
%define %%r15 %16
%define %%t0 %17
%define %%t1 %18

; Input layout (lane-major):
; r0  = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
; r1  = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
; r2  = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
; r3  = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
; r4  = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
; r5  = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
; r6  = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
; r7  = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
; r8  = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
; r9  = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}

; Output layout (dword-major):
; r0  = {p0  o0  n0  m0  l0  k0  j0  i0  h0  g0  f0  e0  d0  c0  b0  a0}
; r1  = {p1  o1  n1  m1  l1  k1  j1  i1  h1  g1  f1  e1  d1  c1  b1  a1}
; r2  = {p2  o2  n2  m2  l2  k2  j2  i2  h2  g2  f2  e2  d2  c2  b2  a2}
; r3  = {p3  o3  n3  m3  l3  k3  j3  i3  h3  g3  f3  e3  d3  c3  b3  a3}
; r4  = {p4  o4  n4  m4  l4  k4  j4  i4  h4  g4  f4  e4  d4  c4  b4  a4}
; r5  = {p5  o5  n5  m5  l5  k5  j5  i5  h5  g5  f5  e5  d5  c5  b5  a5}
; r6  = {p6  o6  n6  m6  l6  k6  j6  i6  h6  g6  f6  e6  d6  c6  b6  a6}
; r7  = {p7  o7  n7  m7  l7  k7  j7  i7  h7  g7  f7  e7  d7  c7  b7  a7}
; r8  = {p8  o8  n8  m8  l8  k8  j8  i8  h8  g8  f8  e8  d8  c8  b8  a8}
; r9  = {p9  o9  n9  m9  l9  k9  j9  i9  h9  g9  f9  e9  d9  c9  b9  a9}
; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}


	; process top half (r0..r3) {a...d}
	vshufps	%%t0, %%r0, %%r1, 0x44	; t0 = {b13 b12 a13 a12   b9  b8  a9  a8   b5 b4 a5 a4   b1 b0 a1 a0}
	vshufps	%%r0, %%r0, %%r1, 0xEE	; r0 = {b15 b14 a15 a14   b11 b10 a11 a10  b7 b6 a7 a6   b3 b2 a3 a2}
	vshufps	%%t1, %%r2, %%r3, 0x44	; t1 = {d13 d12 c13 c12   d9  d8  c9  c8   d5 d4 c5 c4   d1 d0 c1 c0}
	vshufps	%%r2, %%r2, %%r3, 0xEE	; r2 = {d15 d14 c15 c14   d11 d10 c11 c10  d7 d6 c7 c6   d3 d2 c3 c2}

	vshufps	%%r3, %%t0, %%t1, 0xDD	; r3 = {d13 c13 b13 a13   d9  c9  b9  a9   d5 c5 b5 a5   d1 c1 b1 a1}
	vshufps	%%r1, %%r0, %%r2, 0x88	; r1 = {d14 c14 b14 a14   d10 c10 b10 a10  d6 c6 b6 a6   d2 c2 b2 a2}
	vshufps	%%r0, %%r0, %%r2, 0xDD	; r0 = {d15 c15 b15 a15   d11 c11 b11 a11  d7 c7 b7 a7   d3 c3 b3 a3}
	vshufps	%%t0, %%t0, %%t1, 0x88	; t0 = {d12 c12 b12 a12   d8  c8  b8  a8   d4 c4 b4 a4   d0 c0 b0 a0}

	; use r2 in place of t0
	vshufps	%%r2, %%r4, %%r5, 0x44	; r2 = {f13 f12 e13 e12   f9  f8  e9  e8   f5 f4 e5 e4   f1 f0 e1 e0}
	vshufps	%%r4, %%r4, %%r5, 0xEE	; r4 = {f15 f14 e15 e14   f11 f10 e11 e10  f7 f6 e7 e6   f3 f2 e3 e2}
	vshufps	%%t1, %%r6, %%r7, 0x44	; t1 = {h13 h12 g13 g12   h9  h8  g9  g8   h5 h4 g5 g4   h1 h0 g1 g0}
	vshufps	%%r6, %%r6, %%r7, 0xEE	; r6 = {h15 h14 g15 g14   h11 h10 g11 g10  h7 h6 g7 g6   h3 h2 g3 g2}

	vshufps	%%r7, %%r2, %%t1, 0xDD	; r7 = {h13 g13 f13 e13   h9  g9  f9  e9   h5 g5 f5 e5   h1 g1 f1 e1}
	vshufps	%%r5, %%r4, %%r6, 0x88	; r5 = {h14 g14 f14 e14   h10 g10 f10 e10  h6 g6 f6 e6   h2 g2 f2 e2}
	vshufps	%%r4, %%r4, %%r6, 0xDD	; r4 = {h15 g15 f15 e15   h11 g11 f11 e11  h7 g7 f7 e7   h3 g3 f3 e3}
	vshufps	%%r2, %%r2, %%t1, 0x88	; r2 = {h12 g12 f12 e12   h8  g8  f8  e8   h4 g4 f4 e4   h0 g0 f0 e0}

	; use r6 in place of t0
	vshufps	%%r6, %%r8, %%r9, 0x44	; r6 = {j13 j12 i13 i12   j9  j8  i9  i8   j5 j4 i5 i4   j1 j0 i1 i0}
	vshufps	%%r8, %%r8, %%r9, 0xEE	; r8 = {j15 j14 i15 i14   j11 j10 i11 i10  j7 j6 i7 i6   j3 j2 i3 i2}
	vshufps	%%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12   l9  l8  k9  k8   l5 l4 k5 k4   l1 l0 k1 k0}
	vshufps	%%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14  l11 l10 k11 k10  l7 l6 k7 k6   l3 l2 k3 k2}

	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13  l9  k9  j9  i9   l5 k5 j5 i5   l1 k1 j1 i1}
	vshufps	%%r9, %%r8, %%r10, 0x88	; r9  = {l14 k14 j14 i14  l10 k10 j10 i10  l6 k6 j6 i6   l2 k2 j2 i2}
	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8  = {l15 k15 j15 i15  l11 k11 j11 i11  l7 k7 j7 i7   l3 k3 j3 i3}
	vshufps	%%r6, %%r6, %%t1, 0x88	; r6  = {l12 k12 j12 i12  l8  k8  j8  i8   l4 k4 j4 i4   l0 k0 j0 i0}

	; use r10 in place of t0
	vshufps	%%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12  n9  n8  m9  m8   n5 n4 m5 m4   n1 n0 m1 m0}
	vshufps	%%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14  n11 n10 m11 m10  n7 n6 m7 m6   n3 n2 m3 m2}
	vshufps	%%t1, %%r14, %%r15, 0x44  ; t1  = {p13 p12 o13 o12  p9  p8  o9  o8   p5 p4 o5 o4   p1 p0 o1 o0}
	vshufps	%%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 o15 o14  p11 p10 o11 o10  p7 p6 o7 o6   p3 p2 o3 o2}

	vshufps	%%r15, %%r10, %%t1, 0xDD  ; r15 = {p13 o13 n13 m13  p9  o9  n9  m9   p5 o5 n5 m5   p1 o1 n1 m1}
	vshufps	%%r13, %%r12, %%r14, 0x88 ; r13 = {p14 o14 n14 m14  p10 o10 n10 m10  p6 o6 n6 m6   p2 o2 n2 m2}
	vshufps	%%r12, %%r12, %%r14, 0xDD ; r12 = {p15 o15 n15 m15  p11 o11 n11 m11  p7 o7 n7 m7   p3 o3 n3 m3}
	vshufps	%%r10, %%r10, %%t1, 0x88  ; r10 = {p12 o12 n12 m12  p8  o8  n8  m8   p4 o4 n4 m4   p0 o0 n0 m0}

;; At this point, the registers that contain interesting data are:
;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
;; Can use t1 and r14 as scratch registers

	; Cross-lane qword permutes: merge low/high 128-lane quarters across halves.
	vmovdqa32 %%r14, MASK0
	vpermi2q  %%r14, %%t0, %%r2		; r14 = {h8  g8  f8  e8  d8  c8  b8  a8   h0 g0 f0 e0 d0 c0 b0 a0}
	vmovdqa32 %%t1,  MASK1
	vpermi2q  %%t1,  %%t0, %%r2		; t1  = {h12 g12 f12 e12 d12 c12 b12 a12  h4 g4 f4 e4 d4 c4 b4 a4}

	vmovdqa32 %%r2, MASK0
	vpermi2q  %%r2, %%r3, %%r7		; r2  = {h9  g9  f9  e9  d9  c9  b9  a9   h1 g1 f1 e1 d1 c1 b1 a1}
	vmovdqa32 %%t0, MASK1
	vpermi2q  %%t0, %%r3, %%r7		; t0  = {h13 g13 f13 e13 d13 c13 b13 a13  h5 g5 f5 e5 d5 c5 b5 a5}

	vmovdqa32 %%r3, MASK0
	vpermi2q  %%r3, %%r1, %%r5		; r3  = {h10 g10 f10 e10 d10 c10 b10 a10  h2 g2 f2 e2 d2 c2 b2 a2}
	vmovdqa32 %%r7, MASK1
	vpermi2q  %%r7, %%r1, %%r5		; r7  = {h14 g14 f14 e14 d14 c14 b14 a14  h6 g6 f6 e6 d6 c6 b6 a6}

	vmovdqa32 %%r1, MASK0
	vpermi2q  %%r1, %%r0, %%r4		; r1  = {h11 g11 f11 e11 d11 c11 b11 a11  h3 g3 f3 e3 d3 c3 b3 a3}
	vmovdqa32 %%r5, MASK1
	vpermi2q  %%r5, %%r0, %%r4		; r5  = {h15 g15 f15 e15 d15 c15 b15 a15  h7 g7 f7 e7 d7 c7 b7 a7}

	vmovdqa32 %%r0, MASK0
	vpermi2q  %%r0, %%r6, %%r10		; r0  = {p8  o8  n8  m8  l8  k8  j8  i8   p0 o0 n0 m0 l0 k0 j0 i0}
	vmovdqa32 %%r4, MASK1
	vpermi2q  %%r4, %%r6, %%r10		; r4  = {p12 o12 n12 m12 l12 k12 j12 i12  p4 o4 n4 m4 l4 k4 j4 i4}

	vmovdqa32 %%r6, MASK0
	vpermi2q  %%r6, %%r11, %%r15		; r6  = {p9  o9  n9  m9  l9  k9  j9  i9   p1 o1 n1 m1 l1 k1 j1 i1}
	vmovdqa32 %%r10, MASK1
	vpermi2q  %%r10, %%r11, %%r15		; r10 = {p13 o13 n13 m13 l13 k13 j13 i13  p5 o5 n5 m5 l5 k5 j5 i5}

	vmovdqa32 %%r11, MASK0
	vpermi2q  %%r11, %%r9, %%r13		; r11 = {p10 o10 n10 m10 l10 k10 j10 i10  p2 o2 n2 m2 l2 k2 j2 i2}
	vmovdqa32 %%r15, MASK1
	vpermi2q  %%r15, %%r9, %%r13		; r15 = {p14 o14 n14 m14 l14 k14 j14 i14  p6 o6 n6 m6 l6 k6 j6 i6}

	vmovdqa32 %%r9, MASK0
	vpermi2q  %%r9, %%r8, %%r12		; r9  = {p11 o11 n11 m11 l11 k11 j11 i11  p3 o3 n3 m3 l3 k3 j3 i3}
	vmovdqa32 %%r13, MASK1
	vpermi2q  %%r13, %%r8, %%r12		; r13 = {p15 o15 n15 m15 l15 k15 j15 i15  p7 o7 n7 m7 l7 k7 j7 i7}

;; At this point r8 and r12 can be used as scratch registers

	; Final 256-bit half merges into dword-major order.
	vshuff64x2 %%r8, %%r14, %%r0, 0xEE	; r8  = {p8  o8  n8  m8  l8  k8  j8  i8   h8  g8  f8  e8  d8  c8  b8  a8}
	vshuff64x2 %%r0, %%r14, %%r0, 0x44	; r0  = {p0  o0  n0  m0  l0  k0  j0  i0   h0  g0  f0  e0  d0  c0  b0  a0}

	vshuff64x2 %%r12, %%t1, %%r4, 0xEE	; r12 = {p12 o12 n12 m12 l12 k12 j12 i12  h12 g12 f12 e12 d12 c12 b12 a12}
	vshuff64x2 %%r4,  %%t1, %%r4, 0x44	; r4  = {p4  o4  n4  m4  l4  k4  j4  i4   h4  g4  f4  e4  d4  c4  b4  a4}

	vshuff64x2 %%r14, %%r7, %%r15, 0xEE	; r14 = {p14 o14 n14 m14 l14 k14 j14 i14  h14 g14 f14 e14 d14 c14 b14 a14}
	vshuff64x2 %%t1,  %%r7, %%r15, 0x44	; t1  = {p6  o6  n6  m6  l6  k6  j6  i6   h6  g6  f6  e6  d6  c6  b6  a6}

	vshuff64x2 %%r15, %%r5, %%r13, 0xEE	; r15 = {p15 o15 n15 m15 l15 k15 j15 i15  h15 g15 f15 e15 d15 c15 b15 a15}
	vshuff64x2 %%r7,  %%r5, %%r13, 0x44	; r7  = {p7  o7  n7  m7  l7  k7  j7  i7   h7  g7  f7  e7  d7  c7  b7  a7}

	vshuff64x2 %%r13, %%t0, %%r10, 0xEE	; r13 = {p13 o13 n13 m13 l13 k13 j13 i13  h13 g13 f13 e13 d13 c13 b13 a13}
	vshuff64x2 %%r5,  %%t0, %%r10, 0x44	; r5  = {p5  o5  n5  m5  l5  k5  j5  i5   h5  g5  f5  e5  d5  c5  b5  a5}

	vshuff64x2 %%r10, %%r3, %%r11, 0xEE	; r10 = {p10 o10 n10 m10 l10 k10 j10 i10  h10 g10 f10 e10 d10 c10 b10 a10}
	vshuff64x2 %%t0,  %%r3, %%r11, 0x44	; t0  = {p2  o2  n2  m2  l2  k2  j2  i2   h2  g2  f2  e2  d2  c2  b2  a2}

	vshuff64x2 %%r11, %%r1, %%r9, 0xEE	; r11 = {p11 o11 n11 m11 l11 k11 j11 i11  h11 g11 f11 e11 d11 c11 b11 a11}
	vshuff64x2 %%r3,  %%r1, %%r9, 0x44	; r3  = {p3  o3  n3  m3  l3  k3  j3  i3   h3  g3  f3  e3  d3  c3  b3  a3}

	vshuff64x2 %%r9, %%r2, %%r6, 0xEE	; r9  = {p9  o9  n9  m9  l9  k9  j9  i9   h9  g9  f9  e9  d9  c9  b9  a9}
	vshuff64x2 %%r1, %%r2, %%r6, 0x44	; r1  = {p1  o1  n1  m1  l1  k1  j1  i1   h1  g1  f1  e1  d1  c1  b1  a1}

	vmovdqa32 %%r2, %%t0			; r2  = {p2  o2  n2  m2  l2  k2  j2  i2   h2  g2  f2  e2  d2  c2  b2  a2}
	vmovdqa32 %%r6, %%t1			; r6  = {p6  o6  n6  m6  l6  k6  j6  i6   h6  g6  f6  e6  d6  c6  b6  a6}

%endmacro
351 | ||
;; Rotate the lane 0-15 working-variable names: (A,B,C,D) <- (D,A,B,C).
;; Textual rotation only (via %xdefine); emits no instructions.
%macro ROTATE_ARGS 0
%xdefine TMP_ D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm
359 | ||
;; Same textual rotation for the lane 16-31 set: (A1,B1,C1,D1) <- (D1,A1,B1,C1).
%macro ROTATE_ARGS1 0
%xdefine TMP_ D1
%xdefine D1 C1
%xdefine C1 B1
%xdefine B1 A1
%xdefine A1 TMP_
%endm
367 | ||
;;
;; single MD5 step
;;
;; A = B +ROL32((A +Ft(B,C,D) +data +const), nrot)
;;eg: PROCESS_LOOP MD5constx, Mdatax, F_IMMEDx, NROTx
;;
;; Performs the step simultaneously for lanes 0-15 (A..D) and lanes 16-31
;; (A1..D1, message words at %%data + 16*64). Ends by rotating the variable
;; names so the next invocation updates the next working variable.
%macro PROCESS_LOOP 6
%define %%MD5const	%1
%define %%data		%2
%define %%F_IMMED	%3
%define %%NROT		%4
%define %%TMP_PR0	%5
%define %%TMP_PR1	%6
	; a=b+((a+Ft(b,c,d)+Mj+ti)<<s)

	; Ft — round function selected via vpternlogd immediate:
	;  0-15	Ft:F(X,Y,Z)=(X&Y)|((~X)&Z)	0xca
	; 16-31	Ft:G(X,Y,Z)=(X&Z)|(Y&(~Z))	0xe4
	; 32-47	Ft:H(X,Y,Z)=X^Y^Z		0x96
	; 48-63	Ft:I(X,Y,Z)=Y^(X|(~Z))		0x39

	vpaddd		A, A, %%MD5const
	vpaddd		A1, A1, %%MD5const
	vpaddd		A, A, [%%data]
	vpaddd		A1, A1, [%%data + 16*64]	; second 16 lanes' words follow the first 16
	vmovdqa32	%%TMP_PR0, B		; Copy B
	vmovdqa32	%%TMP_PR1, B1		; Copy B
	vpternlogd	%%TMP_PR0, C, D, %%F_IMMED
	vpternlogd	%%TMP_PR1, C1, D1, %%F_IMMED
	vpaddd		A, A, %%TMP_PR0
	vpaddd		A1, A1, %%TMP_PR1
	vprold		A, A, %%NROT
	vprold		A1, A1, %%NROT
	vpaddd		A, A, B
	vpaddd		A1, A1, B1

	ROTATE_ARGS
	ROTATE_ARGS1
%endmacro

align 64

; void md5_mb_x16x2_avx512(MD5_ARGS *args, UINT64 num_blks)
; arg 1 : pointer to MD5_ARGS structure
; arg 2 : number of blocks (>=1)
;
; Processes num_blks 64-byte blocks for 32 independent MD5 lanes at once.
; While crunching block N out of DPTR1, the next block is loaded/transposed
; into DPTR2; the two stack buffers ping-pong each iteration. On exit the
; digests are stored back and each lane's data pointer is advanced by the
; number of bytes consumed. Clobbers all GPRs except arg1/rbp and the ZMM
; registers listed in the file header; rsp is restored from _RSP_SAVE.

local_func_decl(md5_mb_x16x2_avx512)
md5_mb_x16x2_avx512:
	endbranch
	; Build a 64-byte-aligned frame; keep the original rsp for the epilogue.
	mov	rax, rsp
	sub	rsp, STACK_size
	and	rsp, -64
	mov	[rsp + _RSP_SAVE], rax

	mov	DPTR1, rsp			; block buffer A
	lea	DPTR2, [rsp + 64*32]		; block buffer B (second half of _DATA)

	;; Load MD5 constant pointer to register
	lea	TBL, [MD5_TABLE]
	vmovdqa32 MASK0, [PSHUFFLE_TRANSPOSE16_MASK1]
	vmovdqa32 MASK1, [PSHUFFLE_TRANSPOSE16_MASK2]

	;; Preload input data from 16 segments.
	xor	IDX, IDX

	;; transpose input onto stack
	;; first 16 lanes read
	mov	inp0, [IN + 0*8]
	mov	inp1, [IN + 1*8]
	mov	inp2, [IN + 2*8]
	mov	inp3, [IN + 3*8]
	mov	inp4, [IN + 4*8]
	mov	inp5, [IN + 5*8]
	mov	inp6, [IN + 6*8]
	mov	inp7, [IN + 7*8]
	vmovdqu32	W0,[inp0+IDX]
	vmovdqu32	W1,[inp1+IDX]
	vmovdqu32	W2,[inp2+IDX]
	vmovdqu32	W3,[inp3+IDX]
	vmovdqu32	W4,[inp4+IDX]
	vmovdqu32	W5,[inp5+IDX]
	vmovdqu32	W6,[inp6+IDX]
	vmovdqu32	W7,[inp7+IDX]
	mov	inp0, [IN + 8*8]
	mov	inp1, [IN + 9*8]
	mov	inp2, [IN +10*8]
	mov	inp3, [IN +11*8]
	mov	inp4, [IN +12*8]
	mov	inp5, [IN +13*8]
	mov	inp6, [IN +14*8]
	mov	inp7, [IN +15*8]
	vmovdqu32	W8, [inp0+IDX]
	vmovdqu32	W9, [inp1+IDX]
	vmovdqu32	W10,[inp2+IDX]
	vmovdqu32	W11,[inp3+IDX]
	vmovdqu32	W12,[inp4+IDX]
	vmovdqu32	W13,[inp5+IDX]
	vmovdqu32	W14,[inp6+IDX]
	vmovdqu32	W15,[inp7+IDX]
	;; first 16 lanes trans&write
	TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
	vmovdqa32	[DPTR1+_DATA+(0)*64],W0
	vmovdqa32	[DPTR1+_DATA+(1)*64],W1
	vmovdqa32	[DPTR1+_DATA+(2)*64],W2
	vmovdqa32	[DPTR1+_DATA+(3)*64],W3
	vmovdqa32	[DPTR1+_DATA+(4)*64],W4
	vmovdqa32	[DPTR1+_DATA+(5)*64],W5
	vmovdqa32	[DPTR1+_DATA+(6)*64],W6
	vmovdqa32	[DPTR1+_DATA+(7)*64],W7
	vmovdqa32	[DPTR1+_DATA+(8)*64],W8
	vmovdqa32	[DPTR1+_DATA+(9)*64],W9
	vmovdqa32	[DPTR1+_DATA+(10)*64],W10
	vmovdqa32	[DPTR1+_DATA+(11)*64],W11
	vmovdqa32	[DPTR1+_DATA+(12)*64],W12
	vmovdqa32	[DPTR1+_DATA+(13)*64],W13
	vmovdqa32	[DPTR1+_DATA+(14)*64],W14
	vmovdqa32	[DPTR1+_DATA+(15)*64],W15

	;; second 16 lanes read
	mov	inp0, [IN + 16*8]
	mov	inp1, [IN + 17*8]
	mov	inp2, [IN + 18*8]
	mov	inp3, [IN + 19*8]
	mov	inp4, [IN + 20*8]
	mov	inp5, [IN + 21*8]
	mov	inp6, [IN + 22*8]
	mov	inp7, [IN + 23*8]
	vmovdqu32	W0,[inp0+IDX]
	vmovdqu32	W1,[inp1+IDX]
	vmovdqu32	W2,[inp2+IDX]
	vmovdqu32	W3,[inp3+IDX]
	vmovdqu32	W4,[inp4+IDX]
	vmovdqu32	W5,[inp5+IDX]
	vmovdqu32	W6,[inp6+IDX]
	vmovdqu32	W7,[inp7+IDX]
	mov	inp0, [IN + 24*8]
	mov	inp1, [IN + 25*8]
	mov	inp2, [IN + 26*8]
	mov	inp3, [IN + 27*8]
	mov	inp4, [IN + 28*8]
	mov	inp5, [IN + 29*8]
	mov	inp6, [IN + 30*8]
	mov	inp7, [IN + 31*8]
	vmovdqu32	W8, [inp0+IDX]
	vmovdqu32	W9, [inp1+IDX]
	vmovdqu32	W10,[inp2+IDX]
	vmovdqu32	W11,[inp3+IDX]
	vmovdqu32	W12,[inp4+IDX]
	vmovdqu32	W13,[inp5+IDX]
	vmovdqu32	W14,[inp6+IDX]
	vmovdqu32	W15,[inp7+IDX]
	;; second 16 lanes trans&write
	TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
	vmovdqa32	[DPTR1+_DATA+(16+0)*64],W0
	vmovdqa32	[DPTR1+_DATA+(16+1)*64],W1
	vmovdqa32	[DPTR1+_DATA+(16+2)*64],W2
	vmovdqa32	[DPTR1+_DATA+(16+3)*64],W3
	vmovdqa32	[DPTR1+_DATA+(16+4)*64],W4
	vmovdqa32	[DPTR1+_DATA+(16+5)*64],W5
	vmovdqa32	[DPTR1+_DATA+(16+6)*64],W6
	vmovdqa32	[DPTR1+_DATA+(16+7)*64],W7
	vmovdqa32	[DPTR1+_DATA+(16+8)*64],W8
	vmovdqa32	[DPTR1+_DATA+(16+9)*64],W9
	vmovdqa32	[DPTR1+_DATA+(16+10)*64],W10
	vmovdqa32	[DPTR1+_DATA+(16+11)*64],W11
	vmovdqa32	[DPTR1+_DATA+(16+12)*64],W12
	vmovdqa32	[DPTR1+_DATA+(16+13)*64],W13
	vmovdqa32	[DPTR1+_DATA+(16+14)*64],W14
	vmovdqa32	[DPTR1+_DATA+(16+15)*64],W15

	;; Initialize digests
	;; vmovdqu32 replace vmovdqa32 (digest rows are not 64-byte aligned)
	vmovdqu32	A, [DIGEST + 0 * MD5_DIGEST_ROW_SIZE]
	vmovdqu32	B, [DIGEST + 1 * MD5_DIGEST_ROW_SIZE]
	vmovdqu32	C, [DIGEST + 2 * MD5_DIGEST_ROW_SIZE]
	vmovdqu32	D, [DIGEST + 3 * MD5_DIGEST_ROW_SIZE]
	; Load the digest for each stream (9-16)
	vmovdqu32	A1,[DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64]
	vmovdqu32	B1,[DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64]
	vmovdqu32	C1,[DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64]
	vmovdqu32	D1,[DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64]

.lloop:
	;; Increment IDX to point to next data block (64 bytes per block)
	add	IDX, 64

	; Save digests for later addition
	vmovdqa32	[Z_AA], A
	vmovdqa32	[Z_BB], B
	vmovdqa32	[Z_CC], C
	vmovdqa32	[Z_DD], D
	vmovdqa32	[Z_AA1], A1
	vmovdqa32	[Z_BB1], B1
	vmovdqa32	[Z_CC1], C1
	vmovdqa32	[Z_DD1], D1

	sub	SIZE, 1
	je	.LastLoop			; last block: no next-block prefetch/transpose

	;; Rounds 0-15 (F), interleaved below with loading the next block.
%assign I 0
%assign I_fimm 0xCA
%rep 16		; 0<=I<=15
 %assign I_rotX I/16+1
 %assign I_rotY (I % 4 + 1)
 %assign I_data I
	vpbroadcastd md5c, [TBL + I * 4]
	PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
 %assign I (I+1)
%endrep
	;; first 16 lanes read (next block)
	mov	inp0, [IN + 0*8]
	mov	inp1, [IN + 1*8]
	mov	inp2, [IN + 2*8]
	mov	inp3, [IN + 3*8]
	mov	inp4, [IN + 4*8]
	mov	inp5, [IN + 5*8]
	mov	inp6, [IN + 6*8]
	mov	inp7, [IN + 7*8]
	vmovdqu32	W0,[inp0+IDX]
	vmovdqu32	W1,[inp1+IDX]
	vmovdqu32	W2,[inp2+IDX]
	vmovdqu32	W3,[inp3+IDX]
	vmovdqu32	W4,[inp4+IDX]
	vmovdqu32	W5,[inp5+IDX]
	vmovdqu32	W6,[inp6+IDX]
	vmovdqu32	W7,[inp7+IDX]
	mov	inp0, [IN + 8*8]
	mov	inp1, [IN + 9*8]
	mov	inp2, [IN +10*8]
	mov	inp3, [IN +11*8]
	mov	inp4, [IN +12*8]
	mov	inp5, [IN +13*8]
	mov	inp6, [IN +14*8]
	mov	inp7, [IN +15*8]
	vmovdqu32	W8, [inp0+IDX]
	vmovdqu32	W9, [inp1+IDX]
	vmovdqu32	W10,[inp2+IDX]
	vmovdqu32	W11,[inp3+IDX]
	vmovdqu32	W12,[inp4+IDX]
	vmovdqu32	W13,[inp5+IDX]
	vmovdqu32	W14,[inp6+IDX]
	vmovdqu32	W15,[inp7+IDX]

	;; Rounds 16-31 (G); message index (5*I+1) mod 16.
%assign I 16
%assign I_fimm 0xE4
%rep 16		; 16<=I<=31
 %assign I_data ((5*I+1) % 16)
 %assign I_rotX I/16+1
 %assign I_rotY (I % 4 + 1)
	vpbroadcastd md5c, [TBL + I * 4]
	PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
 %assign I (I+1)
%endrep

	;; first 16 lanes trans&write (next block into the other buffer)
	TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
	vmovdqa32	[DPTR2+_DATA+(0)*64],W0
	vmovdqa32	[DPTR2+_DATA+(1)*64],W1
	vmovdqa32	[DPTR2+_DATA+(2)*64],W2
	vmovdqa32	[DPTR2+_DATA+(3)*64],W3
	vmovdqa32	[DPTR2+_DATA+(4)*64],W4
	vmovdqa32	[DPTR2+_DATA+(5)*64],W5
	vmovdqa32	[DPTR2+_DATA+(6)*64],W6
	vmovdqa32	[DPTR2+_DATA+(7)*64],W7
	vmovdqa32	[DPTR2+_DATA+(8)*64],W8
	vmovdqa32	[DPTR2+_DATA+(9)*64],W9
	vmovdqa32	[DPTR2+_DATA+(10)*64],W10
	vmovdqa32	[DPTR2+_DATA+(11)*64],W11
	vmovdqa32	[DPTR2+_DATA+(12)*64],W12
	vmovdqa32	[DPTR2+_DATA+(13)*64],W13
	vmovdqa32	[DPTR2+_DATA+(14)*64],W14
	vmovdqa32	[DPTR2+_DATA+(15)*64],W15

	;; Rounds 32-47 (H); message index (3*I+5) mod 16.
%assign I 32
%assign I_fimm 0x96
%rep 16		; 32<=I<=47
 %assign I_data ((3*I+5) % 16)
 %assign I_rotX I/16+1
 %assign I_rotY (I % 4 + 1)
	vpbroadcastd md5c, [TBL + I * 4]
	PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
 %assign I (I+1)
%endrep

	;; second 16 lanes read (next block)
	mov	inp0, [IN + 16*8]
	mov	inp1, [IN + 17*8]
	mov	inp2, [IN + 18*8]
	mov	inp3, [IN + 19*8]
	mov	inp4, [IN + 20*8]
	mov	inp5, [IN + 21*8]
	mov	inp6, [IN + 22*8]
	mov	inp7, [IN + 23*8]
	vmovdqu32	W0,[inp0+IDX]
	vmovdqu32	W1,[inp1+IDX]
	vmovdqu32	W2,[inp2+IDX]
	vmovdqu32	W3,[inp3+IDX]
	vmovdqu32	W4,[inp4+IDX]
	vmovdqu32	W5,[inp5+IDX]
	vmovdqu32	W6,[inp6+IDX]
	vmovdqu32	W7,[inp7+IDX]
	mov	inp0, [IN + 24*8]
	mov	inp1, [IN + 25*8]
	mov	inp2, [IN + 26*8]
	mov	inp3, [IN + 27*8]
	mov	inp4, [IN + 28*8]
	mov	inp5, [IN + 29*8]
	mov	inp6, [IN + 30*8]
	mov	inp7, [IN + 31*8]
	vmovdqu32	W8, [inp0+IDX]
	vmovdqu32	W9, [inp1+IDX]
	vmovdqu32	W10,[inp2+IDX]
	vmovdqu32	W11,[inp3+IDX]
	vmovdqu32	W12,[inp4+IDX]
	vmovdqu32	W13,[inp5+IDX]
	vmovdqu32	W14,[inp6+IDX]
	vmovdqu32	W15,[inp7+IDX]

	;; Rounds 48-63 (I); message index (7*I) mod 16.
%assign I 48
%assign I_fimm 0x39
%rep 16		; 48<=I<=63
 %assign I_rotX (I/16+1)
 %assign I_rotY (I % 4 + 1)
 %assign I_data ((7*I) % 16)
	vpbroadcastd md5c, [TBL + I * 4]
	PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
 %assign I (I+1)
%endrep

	;; second 16 lanes trans&write (next block)
	TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
	vmovdqa32	[DPTR2+_DATA+(16+0)*64],W0
	vmovdqa32	[DPTR2+_DATA+(16+1)*64],W1
	vmovdqa32	[DPTR2+_DATA+(16+2)*64],W2
	vmovdqa32	[DPTR2+_DATA+(16+3)*64],W3
	vmovdqa32	[DPTR2+_DATA+(16+4)*64],W4
	vmovdqa32	[DPTR2+_DATA+(16+5)*64],W5
	vmovdqa32	[DPTR2+_DATA+(16+6)*64],W6
	vmovdqa32	[DPTR2+_DATA+(16+7)*64],W7
	vmovdqa32	[DPTR2+_DATA+(16+8)*64],W8
	vmovdqa32	[DPTR2+_DATA+(16+9)*64],W9
	vmovdqa32	[DPTR2+_DATA+(16+10)*64],W10
	vmovdqa32	[DPTR2+_DATA+(16+11)*64],W11
	vmovdqa32	[DPTR2+_DATA+(16+12)*64],W12
	vmovdqa32	[DPTR2+_DATA+(16+13)*64],W13
	vmovdqa32	[DPTR2+_DATA+(16+14)*64],W14
	vmovdqa32	[DPTR2+_DATA+(16+15)*64],W15

	; Add old digest (Merkle-Damgard chaining)
	vpaddd	A,A,[Z_AA]
	vpaddd	B,B,[Z_BB]
	vpaddd	C,C,[Z_CC]
	vpaddd	D,D,[Z_DD]
	vpaddd	A1,A1,[Z_AA1]
	vpaddd	B1,B1,[Z_BB1]
	vpaddd	C1,C1,[Z_CC1]
	vpaddd	D1,D1,[Z_DD1]

	; Swap DPTR1 and DPTR2
	xchg	DPTR1, DPTR2
	;; Proceed to processing of next block
	jmp	.lloop

.LastLoop:
	;; Same 64 rounds, but without reading/transposing a next block.
%assign I 0
%assign I_fimm 0xCA
%rep 16		; 0<=I<=15
 %assign I_rotX I/16+1
 %assign I_rotY (I % 4 + 1)
 %assign I_data I
	vpbroadcastd md5c, [TBL + I * 4]
	PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
 %assign I (I+1)
%endrep

%assign I 16
%assign I_fimm 0xE4
%rep 16		; 16<=I<=31
 %assign I_data ((5*I+1) % 16)
 %assign I_rotX I/16+1
 %assign I_rotY (I % 4 + 1)
	vpbroadcastd md5c, [TBL + I * 4]
	PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
 %assign I (I+1)
%endrep

%assign I 32
%assign I_fimm 0x96
%rep 16		; 32<=I<=47
 %assign I_data ((3*I+5) % 16)
 %assign I_rotX I/16+1
 %assign I_rotY (I % 4 + 1)
	vpbroadcastd md5c, [TBL + I * 4]
	PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
 %assign I (I+1)
%endrep

%assign I 48
%assign I_fimm 0x39
%rep 16		; 48<=I<=63
 %assign I_rotX (I/16+1)
 %assign I_rotY (I % 4 + 1)
 %assign I_data ((7*I) % 16)
	vpbroadcastd md5c, [TBL + I * 4]
	PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
 %assign I (I+1)
%endrep

	; Add old digest
	vpaddd	A,A,[Z_AA]
	vpaddd	B,B,[Z_BB]
	vpaddd	C,C,[Z_CC]
	vpaddd	D,D,[Z_DD]
	vpaddd	A1,A1,[Z_AA1]
	vpaddd	B1,B1,[Z_BB1]
	vpaddd	C1,C1,[Z_CC1]
	vpaddd	D1,D1,[Z_DD1]

	;; update into data pointers — advance all 32 lane pointers by IDX bytes
%assign I 0
%rep 16
	mov	inp0, [IN + (2*I)*8]
	mov	inp1, [IN + (2*I +1)*8]
	add	inp0, IDX
	add	inp1, IDX
	mov	[IN + (2*I)*8], inp0
	mov	[IN + (2*I+1)*8], inp1
%assign I (I+1)
%endrep

	;; Store updated digests for lanes 0-15 ...
	vmovdqu32	[DIGEST + 0*MD5_DIGEST_ROW_SIZE  ], A
	vmovdqu32	[DIGEST + 1*MD5_DIGEST_ROW_SIZE  ], B
	vmovdqu32	[DIGEST + 2*MD5_DIGEST_ROW_SIZE  ], C
	vmovdqu32	[DIGEST + 3*MD5_DIGEST_ROW_SIZE  ], D
	; Store the digest for each stream (9-16)
	vmovdqu32	[DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64], A1
	vmovdqu32	[DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64], B1
	vmovdqu32	[DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64], C1
	vmovdqu32	[DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64], D1

	; Restore the caller's stack pointer saved in the prologue.
	mov	rsp, [rsp + _RSP_SAVE]
	ret

809 | ||
section .data
align 64
; RFC 1321 T[i] = floor(2^32 * abs(sin(i+1))), rounds 0..63 in order.
MD5_TABLE:
	dd	0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
	dd	0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
	dd	0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
	dd	0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
	dd	0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
	dd	0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
	dd	0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
	dd	0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
	dd	0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
	dd	0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
	dd	0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
	dd	0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
	dd	0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
	dd	0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
	dd	0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
	dd	0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391

; Qword selector tables for vpermi2q in TRANSPOSE16: pick alternating
; 128-bit quarters from the two source registers.
PSHUFFLE_TRANSPOSE16_MASK1:	dq 0x0000000000000000
				dq 0x0000000000000001
				dq 0x0000000000000008
				dq 0x0000000000000009
				dq 0x0000000000000004
				dq 0x0000000000000005
				dq 0x000000000000000C
				dq 0x000000000000000D

PSHUFFLE_TRANSPOSE16_MASK2:	dq 0x0000000000000002
				dq 0x0000000000000003
				dq 0x000000000000000A
				dq 0x000000000000000B
				dq 0x0000000000000006
				dq 0x0000000000000007
				dq 0x000000000000000E
				dq 0x000000000000000F

847 | ||
%else
; Assembler lacks AVX-512 support: emit a placeholder symbol on win64 so the
; object file is not empty (some linkers reject members with no symbols).
%ifidn __OUTPUT_FORMAT__, win64
global no_md5_mb_x16x2_avx512
no_md5_mb_x16x2_avx512:
%endif
%endif ; HAVE_AS_KNOWS_AVX512