;; ]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/intel-ipsec-mb/avx2/md5_x8x2_avx2.asm
;; update source to Ceph Pacific 16.2.2
;; [ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx2 / md5_x8x2_avx2.asm
1 ;;
2 ;; Copyright (c) 2012-2018, Intel Corporation
3 ;;
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
6 ;;
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
15 ;;
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 ;;
27
28 ;; code to compute double octal MD5 using AVX2
29
30 ;; Stack must be aligned to 32 bytes before call
31 ;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
32 ;; Windows preserves: rcx rbp
33 ;;
34 ;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
35 ;; Linux preserves: rdi rbp
36 ;;
37 ;; clobbers ymm0-15
38
39 %include "include/os.asm"
40 %include "mb_mgr_datastruct.asm"
41 %include "include/transpose_avx2.asm"
42
43 section .data
44 default rel
align 64
;; MD5 round constants T[1..64] from RFC 1321 (T[i] = floor(2^32*|sin(i)|)).
;; Each constant is replicated across all 8 dword lanes of a ymm register
;; so a single vpaddd adds it to 8 independent streams at once.
MD5_TABLE:
        ;; round 1 (steps 1-16)
        times 8 dd      0xd76aa478
        times 8 dd      0xe8c7b756
        times 8 dd      0x242070db
        times 8 dd      0xc1bdceee
        times 8 dd      0xf57c0faf
        times 8 dd      0x4787c62a
        times 8 dd      0xa8304613
        times 8 dd      0xfd469501
        times 8 dd      0x698098d8
        times 8 dd      0x8b44f7af
        times 8 dd      0xffff5bb1
        times 8 dd      0x895cd7be
        times 8 dd      0x6b901122
        times 8 dd      0xfd987193
        times 8 dd      0xa679438e
        times 8 dd      0x49b40821
        ;; round 2 (steps 17-32)
        times 8 dd      0xf61e2562
        times 8 dd      0xc040b340
        times 8 dd      0x265e5a51
        times 8 dd      0xe9b6c7aa
        times 8 dd      0xd62f105d
        times 8 dd      0x02441453
        times 8 dd      0xd8a1e681
        times 8 dd      0xe7d3fbc8
        times 8 dd      0x21e1cde6
        times 8 dd      0xc33707d6
        times 8 dd      0xf4d50d87
        times 8 dd      0x455a14ed
        times 8 dd      0xa9e3e905
        times 8 dd      0xfcefa3f8
        times 8 dd      0x676f02d9
        times 8 dd      0x8d2a4c8a
        ;; round 3 (steps 33-48)
        times 8 dd      0xfffa3942
        times 8 dd      0x8771f681
        times 8 dd      0x6d9d6122
        times 8 dd      0xfde5380c
        times 8 dd      0xa4beea44
        times 8 dd      0x4bdecfa9
        times 8 dd      0xf6bb4b60
        times 8 dd      0xbebfbc70
        times 8 dd      0x289b7ec6
        times 8 dd      0xeaa127fa
        times 8 dd      0xd4ef3085
        times 8 dd      0x04881d05
        times 8 dd      0xd9d4d039
        times 8 dd      0xe6db99e5
        times 8 dd      0x1fa27cf8
        times 8 dd      0xc4ac5665
        ;; round 4 (steps 49-64)
        times 8 dd      0xf4292244
        times 8 dd      0x432aff97
        times 8 dd      0xab9423a7
        times 8 dd      0xfc93a039
        times 8 dd      0x655b59c3
        times 8 dd      0x8f0ccc92
        times 8 dd      0xffeff47d
        times 8 dd      0x85845dd1
        times 8 dd      0x6fa87e4f
        times 8 dd      0xfe2ce6e0
        times 8 dd      0xa3014314
        times 8 dd      0x4e0811a1
        times 8 dd      0xf7537e82
        times 8 dd      0xbd3af235
        times 8 dd      0x2ad7d2bb
        times 8 dd      0xeb86d391
;; all-ones mask used by MAGIC_I to form a vector NOT via XOR
ONES:
        times 8 dd      0xffffffff
177
section .text

;; Map function arguments and scratch registers per calling convention:
;; Windows x64 passes the first two integer args in rcx/rdx, System V
;; (Linux) in rdi/rsi.  reg3/reg4 are taken from the registers the file
;; header declares clobbered on the respective platform.
%ifndef LINUX
%define arg1 rcx
%define arg2 rdx
%define reg3 rdi
%define reg4 rsi
%else
%define arg1 rdi
%define arg2 rsi
%define reg3 rcx
%define reg4 rdx
%endif

;; rbp is not clobbered

%define state    arg1           ; MD5_ARGS * (lane data pointers + digests)
%define num_blks arg2           ; number of 64-byte blocks to process (>= 1)

;; Pointers to the 16 input data streams; only 8 fit in registers at a
;; time, so streams 1-8 and 9-16 are (re)loaded in two batches.
%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11
%define inp4 r12
%define inp5 r13
%define inp6 r14
%define inp7 r15

;; These are pointers to data block1 and block2 in the stack
;; which will ping pong back and forth: one block's transposed data is
;; consumed by the rounds while the next block is transposed into the
;; other buffer, then the pointers are swapped (xchg) per iteration.
%define DPTR1 rbx
%define DPTR2 reg3

%define TBL rax                 ; base address of MD5_TABLE
%define IDX reg4                ; byte offset into the input streams

;; Transposed Digest Storage: A-D for streams 1-8, A2-D2 for streams 9-16
%define Y_A ymm0
%define Y_B ymm1
%define Y_C ymm2
%define Y_D ymm3
%define Y_A2 ymm4
%define Y_B2 ymm5
%define Y_C2 ymm6
%define Y_D2 ymm7

;; Temp YMM registers corresponding to the Temp XMM registers
;; used during the transposition of the digests
%define Y_KTMP1 ymm12
%define Y_KTMP2 ymm13
;; Temporary registers used during MD5 round operations
%define Y_FUN ymm8
%define Y_TMP ymm9
%define Y_FUN2 ymm10
%define Y_TMP2 ymm11

;; YMM registers used during data fetching.
;; Data are stored into the stack after transposition.
;; NOTE: Y_DAT0-5 alias the round scratch registers above (ymm8-13);
;; this is safe because fetching and round computation never overlap.
%define Y_DAT0 ymm8
%define Y_DAT1 ymm9
%define Y_DAT2 ymm10
%define Y_DAT3 ymm11
%define Y_DAT4 ymm12
%define Y_DAT5 ymm13
%define Y_DAT6 ymm14
%define Y_DAT7 ymm15

;; Temporary registers used during data transposition.  These alias
;; Y_A/Y_B, which is why the digests are spilled to _TMPDIGEST around
;; every transpose.
%define Y_DTMP1 ymm0
%define Y_DTMP2 ymm1
249
;; RESY n  expands to  "resb 32*n": reserve n ymm-sized (32-byte) slots
%define RESY resb 32*
;; Assume stack aligned to 32 bytes before call
;; Therefore FRAMESIZE mod 32 must be 32-8 = 24
;; (the caller's 8-byte return address plus this 24-byte pad make
;; "sub rsp, STACK_size" land rsp back on a 32-byte boundary)
struc STACK
_DATA:          RESY    2*2*16  ; 2 blocks * 2 sets of lanes * 16 regs
_DIGEST:        RESY    8       ; stores Y_AA-Y_DD, Y_AA2-Y_DD2
_TMPDIGEST:     RESY    2       ; stores Y_AA, Y_BB temporarily
                resb    24      ; align
endstruc
259
260
;; Aliases for the digest save area in the stack frame: one 32-byte slot
;; per digest word, A-D for streams 1-8 followed by A2-D2 for 9-16.
%define Y_AA rsp + _DIGEST + 32*0
%define Y_BB rsp + _DIGEST + 32*1
%define Y_CC rsp + _DIGEST + 32*2
%define Y_DD rsp + _DIGEST + 32*3
%define Y_AA2 rsp + _DIGEST + 32*4
%define Y_BB2 rsp + _DIGEST + 32*5
%define Y_CC2 rsp + _DIGEST + 32*6
%define Y_DD2 rsp + _DIGEST + 32*7

;;
;; MD5 left rotations (number of bits), RFC 1321 section 3.4:
;; rotRS = rotation for round R (1-4) at step position S (1-4)
;; within each group of four steps.
;;
rot11 equ 7
rot12 equ 12
rot13 equ 17
rot14 equ 22
rot21 equ 5
rot22 equ 9
rot23 equ 14
rot24 equ 20
rot31 equ 4
rot32 equ 11
rot33 equ 16
rot34 equ 23
rot41 equ 6
rot42 equ 10
rot43 equ 15
rot44 equ 21
289
290
291 ;;
292 ;; Magic functions defined in RFC 1321
293 ;;
; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
;; RFC 1321 round-1 function F(X,Y,Z) = (X & Y) | (~X & Z), computed in
;; the equivalent 3-instruction "bit mux" form Z ^ (X & (Y ^ Z)) so that
;; only the single scratch register F is needed and X/Y/Z stay intact.
%macro MAGIC_F 4
%define %%F %1          ; destination/scratch ymm register
%define %%X %2
%define %%Y %3
%define %%Z %4
        vpxor   %%F,%%Z, %%Y            ; F = Y ^ Z
        vpand   %%F,%%F,%%X             ; F = X & (Y ^ Z)
        vpxor   %%F,%%F,%%Z             ; F = Z ^ (X & (Y ^ Z))
%endmacro
304
; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
;; RFC 1321 round-2 function G(X,Y,Z) = (X & Z) | (Y & ~Z), obtained by
;; reusing MAGIC_F with rotated operands: G(X,Y,Z) = F(Z,X,Y).
%macro MAGIC_G 4
%define %%F %1          ; destination/scratch ymm register
%define %%X %2
%define %%Y %3
%define %%Z %4
        MAGIC_F %%F,%%Z,%%X,%%Y
%endmacro
313
; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
;; RFC 1321 round-3 parity function.  XOR is associative and
;; commutative, so the three operands may be folded in any order;
;; X/Y/Z are left unmodified.
%macro MAGIC_H 4
%define %%RES %1        ; destination/scratch ymm register
%define %%P   %2
%define %%Q   %3
%define %%R   %4
        vpxor   %%RES, %%P, %%Q         ; RES = X ^ Y
        vpxor   %%RES, %%RES, %%R       ; RES = X ^ Y ^ Z
%endmacro
323
; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
;; RFC 1321 round-4 function.  AVX2 has no vector NOT instruction, so
;; ~Z is formed by XOR-ing with the all-ones constant ONES in .data.
%macro MAGIC_I 4
%define %%F %1          ; destination/scratch ymm register
%define %%X %2
%define %%Y %3
%define %%Z %4
        vpxor   %%F,%%Z,[rel ONES]      ; F = ~Z   (pnot %%F)
        vpor    %%F,%%F,%%X             ; F = X | ~Z
        vpxor   %%F,%%F,%%Y             ; F = Y ^ (X | ~Z)
%endmacro
334
; PROLD reg, imm, tmp
;; Rotate each 32-bit lane of `reg` left by the immediate bit count,
;; emulated with two shifts and an OR (AVX2 has no vprold).
;; `tmp` is pure scratch; its contents afterwards are meaningless.
%macro PROLD 3
%define %%REG  %1
%define %%BITS %2
%define %%SCR  %3
        vpslld  %%SCR, %%REG, %%BITS            ; SCR = reg << imm
        vpsrld  %%REG, %%REG, (32 - %%BITS)     ; REG = reg >> (32 - imm)
        vpor    %%REG, %%REG, %%SCR             ; REG = rol32(reg, imm)
%endmacro
344
;;
;; single MD5 step (RFC 1321):
;;
;;   A = B + ROL32((A + MAGIC(B,C,D) + data + const), nrot)
;;
;; applied simultaneously to two independent groups of 8 lanes:
;; A..D cover streams 1-8, A2..D2 cover streams 9-16.  The two groups'
;; instructions are interleaved to expose instruction-level parallelism.
;;
; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
; MD5const, nrot
%macro MD5_STEP 16
%define %%MAGIC  %1     ; round function macro: MAGIC_F/G/H/I
%define %%A      %2     ; updated in place
%define %%B      %3
%define %%C      %4
%define %%D      %5
%define %%A2     %6     ; second lane group, updated in place
%define %%B2     %7
%define %%C2     %8
%define %%D2     %9
%define %%FN     %10    ; scratch for MAGIC result, lanes 1-8
%define %%SCR    %11    ; scratch for PROLD, lanes 1-8
%define %%FN2    %12    ; scratch for MAGIC result, lanes 9-16
%define %%SCR2   %13    ; scratch for PROLD, lanes 9-16
%define %%DATA   %14    ; address of transposed message word (lanes 1-8)
%define %%KONST  %15    ; broadcast round constant (memory operand)
%define %%NROT   %16    ; left-rotation amount for this step

        vpaddd  %%A,  %%A,  %%KONST             ; A += T[i]
        vpaddd  %%A2, %%A2, %%KONST
        vpaddd  %%A,  %%A,  [%%DATA]            ; A += X[k]; second lane
        vpaddd  %%A2, %%A2, [%%DATA + 16*32]    ;   group sits 16*32 bytes up
        %%MAGIC %%FN,  %%B,  %%C,  %%D          ; FN = MAGIC(B,C,D)
        %%MAGIC %%FN2, %%B2, %%C2, %%D2
        vpaddd  %%A,  %%A,  %%FN                ; A += MAGIC(B,C,D)
        vpaddd  %%A2, %%A2, %%FN2
        PROLD   %%A,  %%NROT, %%SCR             ; A = rol32(A, nrot)
        PROLD   %%A2, %%NROT, %%SCR2
        vpaddd  %%A,  %%A,  %%B                 ; A += B
        vpaddd  %%A2, %%A2, %%B2
%endmacro
383
384 align 32
385
; void md5_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks)
387 ; arg 1 : pointer to MD5_ARGS structure
388 ; arg 2 : number of blocks (>=1)
389
390 MKGLOBAL(md5_x8x2_avx2,function,internal)
391 md5_x8x2_avx2:
392 sub rsp, STACK_size
393
394 mov DPTR1, rsp
395 lea DPTR2, [rsp + 32*32]
396
397 ;; Load MD5 constant pointer to register
398 lea TBL, [rel MD5_TABLE]
399
400 ; Initialize index for data retrieval
401 xor IDX, IDX
402
403 ;; Fetch Pointers to Data Stream 1 to 8
404 mov inp0,[state + _data_ptr_md5+0*PTR_SZ]
405 mov inp1,[state + _data_ptr_md5+1*PTR_SZ]
406 mov inp2,[state + _data_ptr_md5+2*PTR_SZ]
407 mov inp3,[state + _data_ptr_md5+3*PTR_SZ]
408 mov inp4,[state + _data_ptr_md5+4*PTR_SZ]
409 mov inp5,[state + _data_ptr_md5+5*PTR_SZ]
410 mov inp6,[state + _data_ptr_md5+6*PTR_SZ]
411 mov inp7,[state + _data_ptr_md5+7*PTR_SZ]
412
413 %assign I 0
414 %rep 2
415 TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
416 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
417
418 TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
419 vmovdqa [DPTR1+_DATA+(I*8+0)*32],Y_DAT0
420 vmovdqa [DPTR1+_DATA+(I*8+1)*32],Y_DAT1
421 vmovdqa [DPTR1+_DATA+(I*8+2)*32],Y_DAT2
422 vmovdqa [DPTR1+_DATA+(I*8+3)*32],Y_DAT3
423 vmovdqa [DPTR1+_DATA+(I*8+4)*32],Y_DAT4
424 vmovdqa [DPTR1+_DATA+(I*8+5)*32],Y_DAT5
425 vmovdqa [DPTR1+_DATA+(I*8+6)*32],Y_DAT6
426 vmovdqa [DPTR1+_DATA+(I*8+7)*32],Y_DAT7
427
428 %assign I (I+1)
429 %endrep
430
431 ;; Fetch Pointers to Data Stream 9 to 16
432 mov inp0,[state + _data_ptr_md5 + 8*8]
433 mov inp1,[state + _data_ptr_md5 + 9*8]
434 mov inp2,[state + _data_ptr_md5 + 10*8]
435 mov inp3,[state + _data_ptr_md5 + 11*8]
436 mov inp4,[state + _data_ptr_md5 + 12*8]
437 mov inp5,[state + _data_ptr_md5 + 13*8]
438 mov inp6,[state + _data_ptr_md5 + 14*8]
439 mov inp7,[state + _data_ptr_md5 + 15*8]
440
441 %assign I 0
442 %rep 2
443 TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
444 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
445
446 TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
447 vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32],Y_DAT0
448 vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32],Y_DAT1
449 vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32],Y_DAT2
450 vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32],Y_DAT3
451 vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32],Y_DAT4
452 vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32],Y_DAT5
453 vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32],Y_DAT6
454 vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32],Y_DAT7
455
456 %assign I (I+1)
457 %endrep
458 ;; digests are already transposed
459 vmovdqu Y_A,[state + 0 * MD5_DIGEST_ROW_SIZE ]
460 vmovdqu Y_B,[state + 1 * MD5_DIGEST_ROW_SIZE ]
461 vmovdqu Y_C,[state + 2 * MD5_DIGEST_ROW_SIZE ]
462 vmovdqu Y_D,[state + 3 * MD5_DIGEST_ROW_SIZE ]
463
464 ; Load the digest for each stream (9-16)
465 vmovdqu Y_A2,[state + 0 * MD5_DIGEST_ROW_SIZE + 32]
466 vmovdqu Y_B2,[state + 1 * MD5_DIGEST_ROW_SIZE + 32]
467 vmovdqu Y_C2,[state + 2 * MD5_DIGEST_ROW_SIZE + 32]
468 vmovdqu Y_D2,[state + 3 * MD5_DIGEST_ROW_SIZE + 32]
469
470 lloop:
471
472 ; save old digests to stack
473 vmovdqa [Y_AA], Y_A
474 vmovdqa [Y_BB], Y_B
475 vmovdqa [Y_CC], Y_C
476 vmovdqa [Y_DD], Y_D
477
478 vmovdqa [Y_AA2], Y_A2
479 vmovdqa [Y_BB2], Y_B2
480 vmovdqa [Y_CC2], Y_C2
481 vmovdqa [Y_DD2], Y_D2
482
483 ;; Increment IDX to point to next data block (64 bytes per block)
484 add IDX, 64
485
486 ;; Update size of remaining blocks to process
487 sub num_blks, 1
488 je lastblock
489
490 ; Perform the 64 rounds of processing ...
491 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
492 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
493 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
494 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
495 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
496 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
497 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
498 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
499
500
;; Re-fetch pointers to data streams 1 to 8 (inp0-inp7 were last loaded
;; with the pointers for streams 9 to 16)
502 mov inp0,[state + _data_ptr_md5 + 0*8]
503 mov inp1,[state + _data_ptr_md5 + 1*8]
504 mov inp2,[state + _data_ptr_md5 + 2*8]
505 mov inp3,[state + _data_ptr_md5 + 3*8]
506 mov inp4,[state + _data_ptr_md5 + 4*8]
507 mov inp5,[state + _data_ptr_md5 + 5*8]
508 mov inp6,[state + _data_ptr_md5 + 6*8]
509 mov inp7,[state + _data_ptr_md5 + 7*8]
510
511 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
512 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
513 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
514 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
515 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
516 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
517 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
518 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
519
520 %assign I 0
521
522 ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
523 ; Therefore we need to save these to stack and restore after transpose
524 vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
525 vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
526
527 TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
528 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
529
530 TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
531 vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
532 vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
533 vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
534 vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
535 vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
536 vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
537 vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
538 vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
539
540 ; Restore Y_A and Y_B
541 vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
542 vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
543
544
545 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
546 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
547 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
548 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
549 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
550 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
551 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
552 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
553 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
554 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
555 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
556 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
557 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
558 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
559 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
560 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
561
562 %assign I (I+1)
563
564 ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
565 ; Therefore we need to save these to stack and restore after transpose
566 vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
567 vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
568
569 TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
570 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
571
572 TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
573 vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
574 vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
575 vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
576 vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
577 vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
578 vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
579 vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
580 vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
581
582 ; Restore Y_A and Y_B
583 vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
584 vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
585
586 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
587 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
588 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
589 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
590 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
591 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
592 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
593 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
594
595 ;; Fetch Pointers to Data Stream 9 to 16
596 mov inp0,[state + _data_ptr_md5 + 8*8]
597 mov inp1,[state + _data_ptr_md5 + 9*8]
598 mov inp2,[state + _data_ptr_md5 + 10*8]
599 mov inp3,[state + _data_ptr_md5 + 11*8]
600 mov inp4,[state + _data_ptr_md5 + 12*8]
601 mov inp5,[state + _data_ptr_md5 + 13*8]
602 mov inp6,[state + _data_ptr_md5 + 14*8]
603 mov inp7,[state + _data_ptr_md5 + 15*8]
604
605 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
606 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
607 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
608 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
609 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
610 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
611 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
612 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
613
614 %assign I 0
615
616 ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
617 ; Therefore we need to save these to stack and restore after transpose
618 vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
619 vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
620
621 TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
622 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
623
624 TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
625 vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
626 vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
627 vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
628 vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
629 vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
630 vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
631 vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
632 vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
633
634 ; Restore Y_A and Y_B
635 vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
636 vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
637
638 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
639 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
640 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
641 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
642 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
643 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
644 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
645 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
646 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
647 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
648 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
649 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
650 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
651 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
652 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
653 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
654
655 %assign I (I+1)
656
657 ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
658 ; Therefore we need to save these to stack and restore after transpose
659 vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
660 vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
661
662 TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
663 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
664
665 TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
666 vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
667 vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
668 vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
669 vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
670 vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
671 vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
672 vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
673 vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
674
675 ; Restore Y_A and Y_B
676 vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
677 vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
678
679 ; Add results to old digest values
680
681 vpaddd Y_A,Y_A,[Y_AA]
682 vpaddd Y_B,Y_B,[Y_BB]
683 vpaddd Y_C,Y_C,[Y_CC]
684 vpaddd Y_D,Y_D,[Y_DD]
685
686 vpaddd Y_A2,Y_A2,[Y_AA2]
687 vpaddd Y_B2,Y_B2,[Y_BB2]
688 vpaddd Y_C2,Y_C2,[Y_CC2]
689 vpaddd Y_D2,Y_D2,[Y_DD2]
690
691 ; Swap DPTR1 and DPTR2
692 xchg DPTR1, DPTR2
693
694 ;; Proceed to processing of next block
695 jmp lloop
696
;; Final block: run the 64 MD5 rounds on the data already staged at DPTR1,
;; without prefetching/transposing a following block. Each MD5_STEP operates
;; on 16 independent lanes at once (two sets of 8 dwords: Y_* and Y_*2),
;; reading one 32-byte message-word row (DPTR1 + w*32) and one broadcast
;; round constant row (TBL + r*32).
697 lastblock:
698
699 ; Perform the 64 rounds of processing ...
;; Rounds 0-15: F function; message words used sequentially (w = 0..15);
;; rotation set rot11..rot14 cycling every 4 rounds.
700 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
701 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
702 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
703 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
704 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
705 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
706 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
707 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
708 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
709 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
710 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
711 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
712 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
713 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
714 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
715 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
716
;; Rounds 16-31: G function; message-word schedule w = (1 + 5k) mod 16;
;; rotation set rot21..rot24.
717 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
718 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
719 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
720 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
721 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
722 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
723 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
724 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
725 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
726 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
727 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
728 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
729 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
730 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
731 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
732 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
733
;; Rounds 32-47: H function; message-word schedule w = (5 + 3k) mod 16;
;; rotation set rot31..rot34.
734 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
735 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
736 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
737 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
738 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
739 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
740 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
741 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
742 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
743 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
744 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
745 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
746 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
747 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
748 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
749 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
750
;; Rounds 48-63: I function; message-word schedule w = (7k) mod 16;
;; rotation set rot41..rot44.
751 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
752 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
753 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
754 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
755 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
756 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
757 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
758 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
759 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
760 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
761 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
762 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
763 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
764 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
765 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
766 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
767
768 ;; update into data pointers
;; Advance all 16 saved per-lane data pointers by IDX (the number of bytes
;; consumed from each lane's buffer), two pointers per unrolled iteration.
769 %assign I 0
770 %rep 8
771 mov inp0, [state + _data_ptr_md5 + (2*I)*8]
772 mov inp1, [state + _data_ptr_md5 + (2*I +1)*8]
773 add inp0, IDX
774 add inp1, IDX
775 mov [state + _data_ptr_md5 + (2*I)*8], inp0
776 mov [state + _data_ptr_md5 + (2*I+1)*8], inp1
777 %assign I (I+1)
778 %endrep
779
;; Fold the last block's working state into the running digest
;; (digest += state, as for every MD5 block), for both 8-lane groups.
780 vpaddd Y_A,Y_A,[Y_AA]
781 vpaddd Y_B,Y_B,[Y_BB]
782 vpaddd Y_C,Y_C,[Y_CC]
783 vpaddd Y_D,Y_D,[Y_DD]
784
785 vpaddd Y_A2,Y_A2,[Y_AA2]
786 vpaddd Y_B2,Y_B2,[Y_BB2]
787 vpaddd Y_C2,Y_C2,[Y_CC2]
788 vpaddd Y_D2,Y_D2,[Y_DD2]
789
;; Write the final digests back to the caller's state: rows A/B/C/D, with
;; lanes 0-7 at offset 0 and lanes 8-15 one ymm-width (32 bytes) further.
;; Unaligned stores (vmovdqu): the state rows are not guaranteed 32B-aligned.
790 vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE ],Y_A
791 vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE ],Y_B
792 vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE ],Y_C
793 vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE ],Y_D
794
795
796 vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32 ],Y_A2 ;; 32 is YMM width
797 vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32 ],Y_B2
798 vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32 ],Y_C2
799 vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32 ],Y_D2
800
801 ;;;;;;;;;;;;;;;;
802 ;; Postamble
803
804 ;; Clear stack frame ((64+8+2)*32 bytes)
;; SAFE_DATA hygiene: wipe the stack frame so no message/key/digest material
;; remains after return. 2*2*16 + 8 + 2 = 74 ymm-sized (32B) slots, matching
;; the (64+8+2)*32-byte figure above.
805 %ifdef SAFE_DATA
806 vpxor ymm0, ymm0
807 %assign i 0
808 %rep (2*2*16+8+2)
809 vmovdqa [rsp + i*32], ymm0
810 %assign i (i+1)
811 %endrep
812 %endif
813
814 add rsp, STACK_size
815
;; NOTE(review): no vzeroupper is emitted before ret even though ymm
;; registers were used; callers running legacy-SSE code may pay AVX->SSE
;; transition penalties — confirm the caller/manager handles this.
816 ret
817
;; On Linux/ELF, emit a .note.GNU-stack section so the linker marks the
;; stack non-executable (avoids the "missing .note.GNU-stack" exec-stack
;; default for hand-written assembly objects).
818 %ifdef LINUX
819 section .note.GNU-stack noalloc noexec nowrite progbits
820 %endif