;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; code to compute double octal (2 sets of 8 lanes, 16 streams) MD5 using AVX2

;; Stack must be aligned to 32 bytes before call
;; Windows clobbers:  rax rbx     rdx rsi rdi     r8 r9 r10 r11 r12 r13 r14 r15
;; Windows preserves:         rcx             rbp
;;
;; Linux clobbers:    rax rbx rcx rdx rsi         r8 r9 r10 r11 r12 r13 r14 r15
;; Linux preserves:                       rdi rbp
;;
;; clobbers ymm0-15

%include "os.asm"
%include "mb_mgr_datastruct.asm"

section .data
default rel
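;; Reference note (per RFC 1321): MD5_TABLE holds the 64 round constants
;; T[i] = floor(2^32 * abs(sin(i))), i = 1..64 with i in radians;
;; e.g. T[1] = 0xd76aa478. Each constant is stored 8 times so a single
;; 32-byte load applies it to all 8 lanes of a YMM register at once.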
align 64
MD5_TABLE:
        dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
        dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
        dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
        dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
        dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
        dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
        dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
        dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
        dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
        dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
        dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
        dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
        dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
        dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
        dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
        dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
        dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
        dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
        dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
        dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
        dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
        dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
        dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
        dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
        dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
        dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
        dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
        dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
        dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
        dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
        dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
        dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
        dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
        dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
        dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
        dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
        dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
        dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
        dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
        dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
        dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
        dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
        dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
        dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
        dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
        dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
        dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
        dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
        dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
        dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
        dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
        dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
        dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
        dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
        dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
        dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
        dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
        dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
        dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
        dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
        dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
        dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
        dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
        dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
        dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
        dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
        dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
        dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
        dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
        dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
        dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
        dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
        dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
        dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
        dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
        dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
        dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
        dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
        dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
        dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
        dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
        dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
        dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
        dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
        dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
        dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
        dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
        dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
        dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
        dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
        dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
        dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
        dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
        dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
        dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
        dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
        dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
        dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
        dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
        dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
        dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
        dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
        dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
        dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
        dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
        dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
        dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
        dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
        dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
        dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
        dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
        dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
        dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
        dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
        dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
        dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
        dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
        dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
        dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
        dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
        dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
        dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
        dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
        dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
        dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
        dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
        dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
        dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
ONES:   dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
        dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff

section .text

%ifndef LINUX
%define arg1 rcx
%define arg2 rdx
%define reg3 rdi
%define reg4 rsi
%else
%define arg1 rdi
%define arg2 rsi
%define reg3 rcx
%define reg4 rdx
%endif

;; rbp is not clobbered

%define state    arg1
%define num_blks arg2

%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11
%define inp4 r12
%define inp5 r13
%define inp6 r14
%define inp7 r15

;; These are pointers to data block1 and block2 in the stack
;; which will ping-pong back and forth
%define DPTR1 rbx
%define DPTR2 reg3

%define TBL rax
%define IDX reg4

;; Transposed Digest Storage
%define Y_A  ymm0
%define Y_B  ymm1
%define Y_C  ymm2
%define Y_D  ymm3
%define Y_A2 ymm4
%define Y_B2 ymm5
%define Y_C2 ymm6
%define Y_D2 ymm7

;; Temp YMM registers corresponding to the Temp XMM registers
;; used during the transposition of the digests
%define Y_KTMP1 ymm12
%define Y_KTMP2 ymm13
;; Temporary registers used during MD5 round operations
%define Y_FUN  ymm8
%define Y_TMP  ymm9
%define Y_FUN2 ymm10
%define Y_TMP2 ymm11

;; YMM registers used during data fetching.
;; Data are stored into the stack after transposition
%define Y_DAT0 ymm8
%define Y_DAT1 ymm9
%define Y_DAT2 ymm10
%define Y_DAT3 ymm11
%define Y_DAT4 ymm12
%define Y_DAT5 ymm13
%define Y_DAT6 ymm14
%define Y_DAT7 ymm15

;; Temporary registers used during data transposition
%define Y_DTMP1 ymm0
%define Y_DTMP2 ymm1


%define RESY resb 32*
;; Assume stack aligned to 32 bytes before call
;; Therefore FRAMESIZE mod 32 must be 32-8 = 24
struc STACK
_DATA:      RESY 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
_DIGEST:    RESY 8      ; stores Y_AA-Y_DD, Y_AA2-Y_DD2
_TMPDIGEST: RESY 2      ; stores Y_AA, Y_BB temporarily
            resb 24     ; align
endstruc
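;; Frame layout sketch, derived from the RESY counts above (one row = 32 bytes):
;;   _DATA      : 2 blocks * 2 lane-sets * 16 rows = 64 rows (2048 bytes);
;;                DPTR1/DPTR2 each address one 1024-byte half and ping-pong
;;   _DIGEST    : 8 rows saving Y_AA..Y_DD and Y_AA2..Y_DD2 (256 bytes)
;;   _TMPDIGEST : 2 rows for spilling Y_A/Y_B around TRANSPOSE8 (64 bytes)
;;   24 pad bytes keep STACK_size mod 32 == 24, as required above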

%define Y_AA  rsp + _DIGEST + 32*0
%define Y_BB  rsp + _DIGEST + 32*1
%define Y_CC  rsp + _DIGEST + 32*2
%define Y_DD  rsp + _DIGEST + 32*3
%define Y_AA2 rsp + _DIGEST + 32*4
%define Y_BB2 rsp + _DIGEST + 32*5
%define Y_CC2 rsp + _DIGEST + 32*6
%define Y_DD2 rsp + _DIGEST + 32*7

;;
;; MD5 left rotations (number of bits)
;;
rot11 equ 7
rot12 equ 12
rot13 equ 17
rot14 equ 22
rot21 equ 5
rot22 equ 9
rot23 equ 14
rot24 equ 20
rot31 equ 4
rot32 equ 11
rot33 equ 16
rot34 equ 23
rot41 equ 6
rot42 equ 10
rot43 equ 15
rot44 equ 21
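;; For reference, these are the per-round shift amounts "s" from RFC 1321:
;; round 1 (F): 7,12,17,22; round 2 (G): 5,9,14,20;
;; round 3 (H): 4,11,16,23; round 4 (I): 6,10,15,21.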

; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
; "transpose" data in {r0...r7} using temps {t0...t1}
; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
;
; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}

;
%macro TRANSPOSE8 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10

        ; process top half (r0..r3) {a...d}
        vshufps %%t0, %%r0, %%r1, 0x44  ; t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
        vshufps %%r0, %%r0, %%r1, 0xEE  ; r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
        vshufps %%t1, %%r2, %%r3, 0x44  ; t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
        vshufps %%r2, %%r2, %%r3, 0xEE  ; r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
        vshufps %%r3, %%t0, %%t1, 0xDD  ; r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
        vshufps %%r1, %%r0, %%r2, 0x88  ; r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
        vshufps %%r0, %%r0, %%r2, 0xDD  ; r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
        vshufps %%t0, %%t0, %%t1, 0x88  ; t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

        ; use r2 in place of t0
        ; process bottom half (r4..r7) {e...h}
        vshufps %%r2, %%r4, %%r5, 0x44  ; r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
        vshufps %%r4, %%r4, %%r5, 0xEE  ; r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
        vshufps %%t1, %%r6, %%r7, 0x44  ; t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
        vshufps %%r6, %%r6, %%r7, 0xEE  ; r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
        vshufps %%r7, %%r2, %%t1, 0xDD  ; r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
        vshufps %%r5, %%r4, %%r6, 0x88  ; r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
        vshufps %%r4, %%r4, %%r6, 0xDD  ; r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
        vshufps %%t1, %%r2, %%t1, 0x88  ; t1 = {h4 g4 f4 e4   h0 g0 f0 e0}


        vperm2f128 %%r6, %%r5, %%r1, 0x13  ; h6...a6
        vperm2f128 %%r2, %%r5, %%r1, 0x02  ; h2...a2
        vperm2f128 %%r5, %%r7, %%r3, 0x13  ; h5...a5
        vperm2f128 %%r1, %%r7, %%r3, 0x02  ; h1...a1
        vperm2f128 %%r7, %%r4, %%r0, 0x13  ; h7...a7
        vperm2f128 %%r3, %%r4, %%r0, 0x02  ; h3...a3
        vperm2f128 %%r4, %%t1, %%t0, 0x13  ; h4...a4
        vperm2f128 %%r0, %%t1, %%t0, 0x02  ; h0...a0
%endmacro
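;; Net effect in scalar pseudocode, viewing the 8 YMM inputs as an
;; 8x8 dword matrix: out[j][i] = in[i][j] for all i, j in 0..7.
;; The vshufps passes gather same-index dwords within each 128-bit lane;
;; the vperm2f128 pass then stitches the low/high 128-bit halves across
;; registers to complete the transpose.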


;;
;; Magic functions defined in RFC 1321
;;
; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
%macro MAGIC_F 4
%define %%F %1
%define %%X %2
%define %%Y %3
%define %%Z %4
        vpxor %%F, %%Z, %%Y
        vpand %%F, %%F, %%X
        vpxor %%F, %%F, %%Z
%endmacro

; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
%macro MAGIC_G 4
%define %%F %1
%define %%X %2
%define %%Y %3
%define %%Z %4
        MAGIC_F %%F, %%Z, %%X, %%Y
%endmacro

; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
%macro MAGIC_H 4
%define %%F %1
%define %%X %2
%define %%Y %3
%define %%Z %4
        vpxor %%F, %%Z, %%Y
        vpxor %%F, %%F, %%X
%endmacro

; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
%macro MAGIC_I 4
%define %%F %1
%define %%X %2
%define %%Y %3
%define %%Z %4
        vpxor %%F, %%Z, [rel ONES] ; pnot %%F
        vpor  %%F, %%F, %%X
        vpxor %%F, %%F, %%Y
%endmacro
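;; Scalar reference for the four functions (RFC 1321):
;;   F(x,y,z) = (x & y) | (~x & z)  -- MAGIC_F computes the equivalent
;;              folded form z ^ (x & (y ^ z)), which avoids a NOT
;;   G(x,y,z) = (x & z) | (y & ~z)  -- obtained above as F(z,x,y)
;;   H(x,y,z) = x ^ y ^ z
;;   I(x,y,z) = y ^ (x | ~z)        -- ~z is formed by XOR with ONES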

; PROLD reg, imm, tmp
%macro PROLD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpsrld %%tmp, %%reg, (32-%%imm)
        vpslld %%reg, %%reg, %%imm
        vpor   %%reg, %%reg, %%tmp
%endmacro
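;; AVX2 has no packed 32-bit rotate instruction (VPROLD arrives with
;; AVX-512), so PROLD synthesizes ROL32(reg, imm) = (reg << imm) |
;; (reg >> (32 - imm)) from two shifts and an OR, using one scratch register.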

;;
;; single MD5 step
;;
;; A = B + ROL32((A + MAGIC(B,C,D) + data + const), nrot)
;;
; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
; MD5const, nrot
%macro MD5_STEP 16
%define %%MAGIC_FUN %1
%define %%rA        %2
%define %%rB        %3
%define %%rC        %4
%define %%rD        %5
%define %%rA2       %6
%define %%rB2       %7
%define %%rC2       %8
%define %%rD2       %9
%define %%FUN       %10
%define %%TMP       %11
%define %%FUN2      %12
%define %%TMP2      %13
%define %%data      %14
%define %%MD5const  %15
%define %%nrot      %16

        vpaddd  %%rA, %%rA, %%MD5const
        vpaddd  %%rA2, %%rA2, %%MD5const
        vpaddd  %%rA, %%rA, [%%data]
        vpaddd  %%rA2, %%rA2, [%%data + 16*32]
        %%MAGIC_FUN %%FUN, %%rB, %%rC, %%rD
        %%MAGIC_FUN %%FUN2, %%rB2, %%rC2, %%rD2
        vpaddd  %%rA, %%rA, %%FUN
        vpaddd  %%rA2, %%rA2, %%FUN2
        PROLD   %%rA, %%nrot, %%TMP
        PROLD   %%rA2, %%nrot, %%TMP2
        vpaddd  %%rA, %%rA, %%rB
        vpaddd  %%rA2, %%rA2, %%rB2
%endmacro
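;; Each MD5_STEP advances two independent lane-sets in lockstep: rA..rD
;; carry streams 1-8 and rA2..rD2 carry streams 9-16, whose transposed
;; message words sit 16*32 bytes further into the stack buffer (hence
;; the [%%data + 16*32] operand). Per lane this is the scalar step
;;   A = B + ROL32(A + MAGIC(B,C,D) + X[k] + T[i], s)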

align 32

; void md5_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks)
; arg 1 : pointer to MD5_ARGS structure
; arg 2 : number of blocks (>=1)
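; Illustrative caller-side sketch (hypothetical variable names; the real
; MD5_ARGS layout comes from mb_mgr_datastruct.asm and holds the 16 data
; pointers plus the transposed digest rows):
;   md5_x8x2_avx2(&args, n_blocks);  /* hash n_blocks 64-byte blocks
;                                       from each of the 16 streams */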

MKGLOBAL(md5_x8x2_avx2,function,internal)
md5_x8x2_avx2:
        sub     rsp, STACK_size

        mov     DPTR1, rsp
        lea     DPTR2, [rsp + 32*32]

        ;; Load MD5 constant pointer to register
        lea     TBL, [rel MD5_TABLE]

        ; Initialize index for data retrieval
        xor     IDX, IDX

        ;; Fetch Pointers to Data Stream 1 to 8
        mov     inp0, [state + _data_ptr_md5 + 0*PTR_SZ]
        mov     inp1, [state + _data_ptr_md5 + 1*PTR_SZ]
        mov     inp2, [state + _data_ptr_md5 + 2*PTR_SZ]
        mov     inp3, [state + _data_ptr_md5 + 3*PTR_SZ]
        mov     inp4, [state + _data_ptr_md5 + 4*PTR_SZ]
        mov     inp5, [state + _data_ptr_md5 + 5*PTR_SZ]
        mov     inp6, [state + _data_ptr_md5 + 6*PTR_SZ]
        mov     inp7, [state + _data_ptr_md5 + 7*PTR_SZ]

%assign I 0
%rep 2
        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR1+_DATA+(I*8+0)*32], Y_DAT0
        vmovdqa [DPTR1+_DATA+(I*8+1)*32], Y_DAT1
        vmovdqa [DPTR1+_DATA+(I*8+2)*32], Y_DAT2
        vmovdqa [DPTR1+_DATA+(I*8+3)*32], Y_DAT3
        vmovdqa [DPTR1+_DATA+(I*8+4)*32], Y_DAT4
        vmovdqa [DPTR1+_DATA+(I*8+5)*32], Y_DAT5
        vmovdqa [DPTR1+_DATA+(I*8+6)*32], Y_DAT6
        vmovdqa [DPTR1+_DATA+(I*8+7)*32], Y_DAT7

%assign I (I+1)
%endrep

        ;; Fetch Pointers to Data Stream 9 to 16
        mov     inp0, [state + _data_ptr_md5 +  8*8]
        mov     inp1, [state + _data_ptr_md5 +  9*8]
        mov     inp2, [state + _data_ptr_md5 + 10*8]
        mov     inp3, [state + _data_ptr_md5 + 11*8]
        mov     inp4, [state + _data_ptr_md5 + 12*8]
        mov     inp5, [state + _data_ptr_md5 + 13*8]
        mov     inp6, [state + _data_ptr_md5 + 14*8]
        mov     inp7, [state + _data_ptr_md5 + 15*8]

%assign I 0
%rep 2

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32], Y_DAT0
        vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32], Y_DAT1
        vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32], Y_DAT2
        vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32], Y_DAT3
        vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32], Y_DAT4
        vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32], Y_DAT5
        vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32], Y_DAT6
        vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32], Y_DAT7

%assign I (I+1)
%endrep
        ;; digests are already transposed
        vmovdqu Y_A, [state + 0 * MD5_DIGEST_ROW_SIZE]
        vmovdqu Y_B, [state + 1 * MD5_DIGEST_ROW_SIZE]
        vmovdqu Y_C, [state + 2 * MD5_DIGEST_ROW_SIZE]
        vmovdqu Y_D, [state + 3 * MD5_DIGEST_ROW_SIZE]

        ; Load the digest for each stream (9-16)
        vmovdqu Y_A2, [state + 0 * MD5_DIGEST_ROW_SIZE + 32]
        vmovdqu Y_B2, [state + 1 * MD5_DIGEST_ROW_SIZE + 32]
        vmovdqu Y_C2, [state + 2 * MD5_DIGEST_ROW_SIZE + 32]
        vmovdqu Y_D2, [state + 3 * MD5_DIGEST_ROW_SIZE + 32]

lloop:
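        ;; Loop structure: the 64 rounds below consume the transposed block
        ;; at DPTR1, while the interleaved loads/TRANSPOSE8 stores stage the
        ;; next block (IDX is advanced first) into DPTR2; the xchg at the
        ;; bottom swaps the two buffers so compute and data preparation
        ;; overlap. The final block branches to lastblock, which runs the
        ;; rounds without prefetching.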

        ; save old digests to stack
        vmovdqa [Y_AA], Y_A
        vmovdqa [Y_BB], Y_B
        vmovdqa [Y_CC], Y_C
        vmovdqa [Y_DD], Y_D

        vmovdqa [Y_AA2], Y_A2
        vmovdqa [Y_BB2], Y_B2
        vmovdqa [Y_CC2], Y_C2
        vmovdqa [Y_DD2], Y_D2

        ;; Increment IDX to point to next data block (64 bytes per block)
        add     IDX, 64

        ;; Update size of remaining blocks to process
        sub     num_blks, 1
        je      lastblock

        ; Perform the 64 rounds of processing ...
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14

        ;; Fetch pointers to data streams 1 to 8 again
        ;; (inp0-inp7 currently hold the stream 9 to 16 pointers)
        mov     inp0, [state + _data_ptr_md5 + 0*8]
        mov     inp1, [state + _data_ptr_md5 + 1*8]
        mov     inp2, [state + _data_ptr_md5 + 2*8]
        mov     inp3, [state + _data_ptr_md5 + 3*8]
        mov     inp4, [state + _data_ptr_md5 + 4*8]
        mov     inp5, [state + _data_ptr_md5 + 5*8]
        mov     inp6, [state + _data_ptr_md5 + 6*8]
        mov     inp7, [state + _data_ptr_md5 + 7*8]

        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14

%assign I 0

        ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
        ; Therefore we need to save these to stack and restore after transpose
        vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
        vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR2+_DATA+(I*8+0)*32], Y_DAT0
        vmovdqa [DPTR2+_DATA+(I*8+1)*32], Y_DAT1
        vmovdqa [DPTR2+_DATA+(I*8+2)*32], Y_DAT2
        vmovdqa [DPTR2+_DATA+(I*8+3)*32], Y_DAT3
        vmovdqa [DPTR2+_DATA+(I*8+4)*32], Y_DAT4
        vmovdqa [DPTR2+_DATA+(I*8+5)*32], Y_DAT5
        vmovdqa [DPTR2+_DATA+(I*8+6)*32], Y_DAT6
        vmovdqa [DPTR2+_DATA+(I*8+7)*32], Y_DAT7

        ; Restore Y_A and Y_B
        vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
        vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]


        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24

%assign I (I+1)

        ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
        ; Therefore we need to save these to stack and restore after transpose
        vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
        vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR2+_DATA+(I*8+0)*32], Y_DAT0
        vmovdqa [DPTR2+_DATA+(I*8+1)*32], Y_DAT1
        vmovdqa [DPTR2+_DATA+(I*8+2)*32], Y_DAT2
        vmovdqa [DPTR2+_DATA+(I*8+3)*32], Y_DAT3
        vmovdqa [DPTR2+_DATA+(I*8+4)*32], Y_DAT4
        vmovdqa [DPTR2+_DATA+(I*8+5)*32], Y_DAT5
        vmovdqa [DPTR2+_DATA+(I*8+6)*32], Y_DAT6
        vmovdqa [DPTR2+_DATA+(I*8+7)*32], Y_DAT7

        ; Restore Y_A and Y_B
        vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
        vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]

        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34

        ;; Fetch Pointers to Data Stream 9 to 16
        mov     inp0, [state + _data_ptr_md5 +  8*8]
        mov     inp1, [state + _data_ptr_md5 +  9*8]
        mov     inp2, [state + _data_ptr_md5 + 10*8]
        mov     inp3, [state + _data_ptr_md5 + 11*8]
        mov     inp4, [state + _data_ptr_md5 + 12*8]
        mov     inp5, [state + _data_ptr_md5 + 13*8]
        mov     inp6, [state + _data_ptr_md5 + 14*8]
        mov     inp7, [state + _data_ptr_md5 + 15*8]

        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34

%assign I 0

        ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
        ; Therefore we need to save these to stack and restore after transpose
        vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
        vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32], Y_DAT0
        vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32], Y_DAT1
        vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32], Y_DAT2
        vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32], Y_DAT3
        vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32], Y_DAT4
        vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32], Y_DAT5
        vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32], Y_DAT6
        vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32], Y_DAT7

        ; Restore Y_A and Y_B
        vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
        vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]

        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44

%assign I (I+1)

        ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
        ; Therefore we need to save these to stack and restore after transpose
        vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
        vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32], Y_DAT0
        vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32], Y_DAT1
        vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32], Y_DAT2
        vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32], Y_DAT3
        vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32], Y_DAT4
        vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32], Y_DAT5
        vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32], Y_DAT6
        vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32], Y_DAT7

        ; Restore Y_A and Y_B
        vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
        vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]

        ; Add results to old digest values

        vpaddd  Y_A, Y_A, [Y_AA]
        vpaddd  Y_B, Y_B, [Y_BB]
        vpaddd  Y_C, Y_C, [Y_CC]
        vpaddd  Y_D, Y_D, [Y_DD]

        vpaddd  Y_A2, Y_A2, [Y_AA2]
        vpaddd  Y_B2, Y_B2, [Y_BB2]
        vpaddd  Y_C2, Y_C2, [Y_CC2]
        vpaddd  Y_D2, Y_D2, [Y_DD2]

        ; Swap DPTR1 and DPTR2
        xchg    DPTR1, DPTR2

        ;; Proceed to processing of next block
        jmp     lloop

lastblock:

        ; Perform the 64 rounds of processing ...
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14

        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24

        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34

        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44

        ;; advance the data pointers past the processed data
%assign I 0
%rep 8
        mov     inp0, [state + _data_ptr_md5 + (2*I)*8]
        mov     inp1, [state + _data_ptr_md5 + (2*I+1)*8]
        add     inp0, IDX
        add     inp1, IDX
        mov     [state + _data_ptr_md5 + (2*I)*8], inp0
        mov     [state + _data_ptr_md5 + (2*I+1)*8], inp1
%assign I (I+1)
%endrep
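        ;; At this point IDX equals the total number of bytes consumed from
        ;; every stream (64 * number of blocks), so each of the 16 pointers
        ;; now points just past the data that was hashed.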

        vpaddd  Y_A, Y_A, [Y_AA]
        vpaddd  Y_B, Y_B, [Y_BB]
        vpaddd  Y_C, Y_C, [Y_CC]
        vpaddd  Y_D, Y_D, [Y_DD]

        vpaddd  Y_A2, Y_A2, [Y_AA2]
        vpaddd  Y_B2, Y_B2, [Y_BB2]
        vpaddd  Y_C2, Y_C2, [Y_CC2]
        vpaddd  Y_D2, Y_D2, [Y_DD2]

        vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE], Y_A
        vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE], Y_B
        vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE], Y_C
        vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE], Y_D


        vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32], Y_A2 ;; 32 is YMM width
        vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32], Y_B2
        vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32], Y_C2
        vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32], Y_D2

        ;;;;;;;;;;;;;;;;
        ;; Postamble

        add     rsp, STACK_size

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif