;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "md5_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"
default rel

;; code to compute 16-lane MD5 (two sets of 8 lanes) using AVX2

;; Stack must be aligned to 32 bytes before call
;; Windows clobbers:  rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
;; Windows preserves: rcx rbp
;;
;; Linux clobbers:    rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
;; Linux preserves:   rdi rbp
;;
;; clobbers ymm0-15

;; clobbers all GPRs other than arg1 and rbp
%ifidn __OUTPUT_FORMAT__, win64
   %define arg1 rcx
   %define arg2 rdx
   %define reg3 rdi
   %define reg4 rsi
%else
   %define arg1 rdi
   %define arg2 rsi
   %define reg3 rcx
   %define reg4 rdx
%endif

;; rbp is not clobbered

%define state    arg1
%define num_blks arg2

%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11
%define inp4 r12
%define inp5 r13
%define inp6 r14
%define inp7 r15

;; These are pointers to data block1 and block2 in the stack,
;; which ping-pong back and forth during processing
%define DPTR1 rbx
%define DPTR2 reg3

%define TBL rax
%define IDX reg4

;; Transposed Digest Storage
%define Y_A  ymm0
%define Y_B  ymm1
%define Y_C  ymm2
%define Y_D  ymm3
%define Y_A2 ymm4
%define Y_B2 ymm5
%define Y_C2 ymm6
%define Y_D2 ymm7

;; Temp YMM registers corresponding to the Temp XMM registers
;; used during the transposition of the digests
%define Y_KTMP1 ymm12
%define Y_KTMP2 ymm13
;; Temporary registers used during MD5 round operations
%define Y_FUN  ymm8
%define Y_TMP  ymm9
%define Y_FUN2 ymm10
%define Y_TMP2 ymm11


;; YMM registers used during data fetching.
;; Data are stored to the stack after transposition
%define Y_DAT0 ymm8
%define Y_DAT1 ymm9
%define Y_DAT2 ymm10
%define Y_DAT3 ymm11
%define Y_DAT4 ymm12
%define Y_DAT5 ymm13
%define Y_DAT6 ymm14
%define Y_DAT7 ymm15

;; Temporary registers used during data transposition
%define Y_DTMP1 ymm0
%define Y_DTMP2 ymm1


;; RESY n reserves n 32-byte (YMM-sized) slots: it expands to resb 32*n
%define RESY resb 32*
;; Stack is assumed 32-byte aligned at entry; the prologue additionally
;; forces rsp to 32-byte alignment (and rsp, -32), so STACK_size itself
;; needs no particular alignment
struc STACK
_DATA:      RESY 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
_DIGEST:    RESY 8      ; stores Y_AA-Y_DD, Y_AA2-Y_DD2
_TMPDIGEST: RESY 2      ; stores Y_AA, Y_BB temporarily
_RSP_SAVE:  RESQ 1      ; original RSP
endstruc
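
;; Frame layout: _DATA holds two 1024-byte ping-pong buffers of
;; transposed message data (16 dwords x 32 bytes x 2 lane sets per
;; block); _DIGEST holds 256 bytes of saved digests; _TMPDIGEST, 64.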

%define Y_AA  rsp + _DIGEST + 32*0
%define Y_BB  rsp + _DIGEST + 32*1
%define Y_CC  rsp + _DIGEST + 32*2
%define Y_DD  rsp + _DIGEST + 32*3
%define Y_AA2 rsp + _DIGEST + 32*4
%define Y_BB2 rsp + _DIGEST + 32*5
%define Y_CC2 rsp + _DIGEST + 32*6
%define Y_DD2 rsp + _DIGEST + 32*7

%define MD5_DIGEST_ROW_SIZE (16*4) ; 16 lanes x 4 bytes per digest word

;;
;; MD5 left rotations (number of bits)
;;
rot11 equ  7
rot12 equ  12
rot13 equ  17
rot14 equ  22
rot21 equ  5
rot22 equ  9
rot23 equ  14
rot24 equ  20
rot31 equ  4
rot32 equ  11
rot33 equ  16
rot34 equ  23
rot41 equ  6
rot42 equ  10
rot43 equ  15
rot44 equ  21
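
;; rotNM is the left-rotate amount used in round N (1-4) at position M
;; within each group of four steps; these match the S constants of RFC 1321.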

; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
; "transpose" data in {r0...r7} using temps {t0...t1}
; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
;
; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}

%macro TRANSPOSE8 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10

        ; process top half (r0..r3) {a...d}
        vshufps %%t0, %%r0, %%r1, 0x44  ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
        vshufps %%r0, %%r0, %%r1, 0xEE  ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
        vshufps %%t1, %%r2, %%r3, 0x44  ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
        vshufps %%r2, %%r2, %%r3, 0xEE  ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
        vshufps %%r3, %%t0, %%t1, 0xDD  ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
        vshufps %%r1, %%r0, %%r2, 0x88  ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
        vshufps %%r0, %%r0, %%r2, 0xDD  ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
        vshufps %%t0, %%t0, %%t1, 0x88  ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}


        ; use r2 in place of t0
        ; process bottom half (r4..r7) {e...h}
        vshufps %%r2, %%r4, %%r5, 0x44  ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
        vshufps %%r4, %%r4, %%r5, 0xEE  ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
        vshufps %%t1, %%r6, %%r7, 0x44  ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
        vshufps %%r6, %%r6, %%r7, 0xEE  ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
        vshufps %%r7, %%r2, %%t1, 0xDD  ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
        vshufps %%r5, %%r4, %%r6, 0x88  ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
        vshufps %%r4, %%r4, %%r6, 0xDD  ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
        vshufps %%t1, %%r2, %%t1, 0x88  ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}


        vperm2f128 %%r6, %%r5, %%r1, 0x13  ; h6...a6
        vperm2f128 %%r2, %%r5, %%r1, 0x02  ; h2...a2
        vperm2f128 %%r5, %%r7, %%r3, 0x13  ; h5...a5
        vperm2f128 %%r1, %%r7, %%r3, 0x02  ; h1...a1
        vperm2f128 %%r7, %%r4, %%r0, 0x13  ; h7...a7
        vperm2f128 %%r3, %%r4, %%r0, 0x02  ; h3...a3
        vperm2f128 %%r4, %%t1, %%t0, 0x13  ; h4...a4
        vperm2f128 %%r0, %%t1, %%t0, 0x02  ; h0...a0
%endmacro
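
;; TRANSPOSE8 first shuffles dwords within each 128-bit lane (vshufps),
;; then merges the low/high 128-bit halves of register pairs with
;; vperm2f128 to produce the final row order.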


;;
;; Magic functions defined in RFC 1321
;;
; macro MAGIC_F F,X,Y,Z   ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
%macro MAGIC_F 4
%define %%F %1
%define %%X %2
%define %%Y %3
%define %%Z %4
        vpxor  %%F, %%Z, %%Y
        vpand  %%F, %%F, %%X
        vpxor  %%F, %%F, %%Z
%endmacro
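
;; Note: this three-instruction form is equivalent to the RFC 1321
;; definition F(X,Y,Z) = (X & Y) | (~X & Z); it selects Y where X is 1
;; and Z where X is 0, without needing a separate NOT.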

; macro MAGIC_G F,X,Y,Z   ;; G = F((Z),(X),(Y))
%macro MAGIC_G 4
%define %%F %1
%define %%X %2
%define %%Y %3
%define %%Z %4
        MAGIC_F %%F, %%Z, %%X, %%Y
%endmacro

; macro MAGIC_H F,X,Y,Z   ;; H = ((X) ^ (Y) ^ (Z))
%macro MAGIC_H 4
%define %%F %1
%define %%X %2
%define %%Y %3
%define %%Z %4
        vpxor  %%F, %%Z, %%Y
        vpxor  %%F, %%F, %%X
%endmacro

; macro MAGIC_I F,X,Y,Z   ;; I = ((Y) ^ ((X) | ~(Z)))
%macro MAGIC_I 4
%define %%F %1
%define %%X %2
%define %%Y %3
%define %%Z %4
        vpxor  %%F, %%Z, [ONES] ; %%F = ~%%Z
        vpor   %%F, %%F, %%X
        vpxor  %%F, %%F, %%Y
%endmacro
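
;; AVX2 has no vector NOT instruction, so ~Z is formed by XORing Z with
;; the all-ones constant at [ONES].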

; PROLD reg, imm, tmp
%macro PROLD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpsrld %%tmp, %%reg, (32-%%imm)
        vpslld %%reg, %%reg, %%imm
        vpor   %%reg, %%reg, %%tmp
%endmacro
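
;; AVX2 also has no packed rotate (vprold is AVX-512), so a left rotate
;; by imm is synthesized from a right shift by (32-imm), a left shift
;; by imm, and an OR.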

;;
;; single MD5 step
;;
;; A = B + ROL32((A + MAGIC(B,C,D) + data + const), nrot)
;;
; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
;                MD5const, nrot
%macro MD5_STEP 16
%define %%MAGIC_FUN %1
%define %%rA        %2
%define %%rB        %3
%define %%rC        %4
%define %%rD        %5
%define %%rA2       %6
%define %%rB2       %7
%define %%rC2       %8
%define %%rD2       %9
%define %%FUN       %10
%define %%TMP       %11
%define %%FUN2      %12
%define %%TMP2      %13
%define %%data      %14
%define %%MD5const  %15
%define %%nrot      %16

        vpaddd  %%rA, %%rA, %%MD5const
        vpaddd  %%rA2, %%rA2, %%MD5const
        vpaddd  %%rA, %%rA, [%%data]
        vpaddd  %%rA2, %%rA2, [%%data + 16*32] ; second lane set's words
        %%MAGIC_FUN %%FUN, %%rB, %%rC, %%rD
        %%MAGIC_FUN %%FUN2, %%rB2, %%rC2, %%rD2
        vpaddd  %%rA, %%rA, %%FUN
        vpaddd  %%rA2, %%rA2, %%FUN2
        PROLD   %%rA, %%nrot, %%TMP
        PROLD   %%rA2, %%nrot, %%TMP2
        vpaddd  %%rA, %%rA, %%rB
        vpaddd  %%rA2, %%rA2, %%rB2
%endmacro
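
;; Each invocation advances two independent sets of 8 lanes (Y_A..Y_D
;; and Y_A2..Y_D2); interleaving the two dependency chains helps hide
;; instruction latency.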

align 32

; void md5_mb_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks)
; arg 1 : pointer to MD5_ARGS structure
; arg 2 : number of blocks (>=1)

global md5_mb_x8x2_avx2:function internal
md5_mb_x8x2_avx2:
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -32
        mov     [rsp + _RSP_SAVE], rax

        mov     DPTR1, rsp
        lea     DPTR2, [rsp + 32*32]
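        ;; DPTR1 points to the first 1024-byte transposed-data buffer,
        ;; DPTR2 to the second (32*32 = 1024 bytes further up the frame)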

        ;; Load pointer to the MD5 constants table
        lea     TBL, [MD5_TABLE]

        ;; Initialize index for data retrieval
        xor     IDX, IDX

        ;; Fetch pointers to data streams 1 to 8
        mov     inp0, [state + _data_ptr + 0*8]
        mov     inp1, [state + _data_ptr + 1*8]
        mov     inp2, [state + _data_ptr + 2*8]
        mov     inp3, [state + _data_ptr + 3*8]
        mov     inp4, [state + _data_ptr + 4*8]
        mov     inp5, [state + _data_ptr + 5*8]
        mov     inp6, [state + _data_ptr + 6*8]
        mov     inp7, [state + _data_ptr + 7*8]

%assign I 0
%rep 2
        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR1+_DATA+(I*8+0)*32], Y_DAT0
        vmovdqa [DPTR1+_DATA+(I*8+1)*32], Y_DAT1
        vmovdqa [DPTR1+_DATA+(I*8+2)*32], Y_DAT2
        vmovdqa [DPTR1+_DATA+(I*8+3)*32], Y_DAT3
        vmovdqa [DPTR1+_DATA+(I*8+4)*32], Y_DAT4
        vmovdqa [DPTR1+_DATA+(I*8+5)*32], Y_DAT5
        vmovdqa [DPTR1+_DATA+(I*8+6)*32], Y_DAT6
        vmovdqa [DPTR1+_DATA+(I*8+7)*32], Y_DAT7

%assign I (I+1)
%endrep

        ;; Fetch pointers to data streams 9 to 16
        mov     inp0, [state + _data_ptr +  8*8]
        mov     inp1, [state + _data_ptr +  9*8]
        mov     inp2, [state + _data_ptr + 10*8]
        mov     inp3, [state + _data_ptr + 11*8]
        mov     inp4, [state + _data_ptr + 12*8]
        mov     inp5, [state + _data_ptr + 13*8]
        mov     inp6, [state + _data_ptr + 14*8]
        mov     inp7, [state + _data_ptr + 15*8]

%assign I 0
%rep 2

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32], Y_DAT0
        vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32], Y_DAT1
        vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32], Y_DAT2
        vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32], Y_DAT3
        vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32], Y_DAT4
        vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32], Y_DAT5
        vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32], Y_DAT6
        vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32], Y_DAT7

%assign I (I+1)
%endrep
        ;; Load the digest for each stream (1-8); digests are already transposed
        vmovdqu Y_A, [state + 0*MD5_DIGEST_ROW_SIZE]
        vmovdqu Y_B, [state + 1*MD5_DIGEST_ROW_SIZE]
        vmovdqu Y_C, [state + 2*MD5_DIGEST_ROW_SIZE]
        vmovdqu Y_D, [state + 3*MD5_DIGEST_ROW_SIZE]

        ;; Load the digest for each stream (9-16)
        vmovdqu Y_A2, [state + 0*MD5_DIGEST_ROW_SIZE + 32]
        vmovdqu Y_B2, [state + 1*MD5_DIGEST_ROW_SIZE + 32]
        vmovdqu Y_C2, [state + 2*MD5_DIGEST_ROW_SIZE + 32]
        vmovdqu Y_D2, [state + 3*MD5_DIGEST_ROW_SIZE + 32]

lloop:

        ;; Save old digests to the stack
        vmovdqa [Y_AA], Y_A
        vmovdqa [Y_BB], Y_B
        vmovdqa [Y_CC], Y_C
        vmovdqa [Y_DD], Y_D

        vmovdqa [Y_AA2], Y_A2
        vmovdqa [Y_BB2], Y_B2
        vmovdqa [Y_CC2], Y_C2
        vmovdqa [Y_DD2], Y_D2

        ;; Increment IDX to point to the next data block (64 bytes per block)
        add     IDX, 64

        ;; Decrement the count of remaining blocks; the final block is
        ;; handled separately (there is no next block to prefetch)
        sub     num_blks, 1
        je      lastblock
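
        ;; While the current block is hashed out of DPTR1, the next
        ;; block is fetched and transposed into DPTR2 between the
        ;; round groups below.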

        ; Perform the 64 rounds of processing ...
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14

        ;; Re-fetch pointers to data streams 1 to 8
        ;; (inp0-inp7 currently hold the pointers for streams 9 to 16)
        mov     inp0, [state + _data_ptr + 0*8]
        mov     inp1, [state + _data_ptr + 1*8]
        mov     inp2, [state + _data_ptr + 2*8]
        mov     inp3, [state + _data_ptr + 3*8]
        mov     inp4, [state + _data_ptr + 4*8]
        mov     inp5, [state + _data_ptr + 5*8]
        mov     inp6, [state + _data_ptr + 6*8]
        mov     inp7, [state + _data_ptr + 7*8]

        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14

%assign I 0

        ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2,
        ; so save them to the stack and restore them after the transpose
        vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
        vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR2+_DATA+(I*8+0)*32], Y_DAT0
        vmovdqa [DPTR2+_DATA+(I*8+1)*32], Y_DAT1
        vmovdqa [DPTR2+_DATA+(I*8+2)*32], Y_DAT2
        vmovdqa [DPTR2+_DATA+(I*8+3)*32], Y_DAT3
        vmovdqa [DPTR2+_DATA+(I*8+4)*32], Y_DAT4
        vmovdqa [DPTR2+_DATA+(I*8+5)*32], Y_DAT5
        vmovdqa [DPTR2+_DATA+(I*8+6)*32], Y_DAT6
        vmovdqa [DPTR2+_DATA+(I*8+7)*32], Y_DAT7

        ; Restore Y_A and Y_B
        vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
        vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]

        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24

%assign I (I+1)

        ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2,
        ; so save them to the stack and restore them after the transpose
        vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
        vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR2+_DATA+(I*8+0)*32], Y_DAT0
        vmovdqa [DPTR2+_DATA+(I*8+1)*32], Y_DAT1
        vmovdqa [DPTR2+_DATA+(I*8+2)*32], Y_DAT2
        vmovdqa [DPTR2+_DATA+(I*8+3)*32], Y_DAT3
        vmovdqa [DPTR2+_DATA+(I*8+4)*32], Y_DAT4
        vmovdqa [DPTR2+_DATA+(I*8+5)*32], Y_DAT5
        vmovdqa [DPTR2+_DATA+(I*8+6)*32], Y_DAT6
        vmovdqa [DPTR2+_DATA+(I*8+7)*32], Y_DAT7

        ; Restore Y_A and Y_B
        vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
        vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]

        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34

        ;; Fetch pointers to data streams 9 to 16
        mov     inp0, [state + _data_ptr +  8*8]
        mov     inp1, [state + _data_ptr +  9*8]
        mov     inp2, [state + _data_ptr + 10*8]
        mov     inp3, [state + _data_ptr + 11*8]
        mov     inp4, [state + _data_ptr + 12*8]
        mov     inp5, [state + _data_ptr + 13*8]
        mov     inp6, [state + _data_ptr + 14*8]
        mov     inp7, [state + _data_ptr + 15*8]

        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34

%assign I 0

        ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2,
        ; so save them to the stack and restore them after the transpose
        vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
        vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32], Y_DAT0
        vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32], Y_DAT1
        vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32], Y_DAT2
        vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32], Y_DAT3
        vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32], Y_DAT4
        vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32], Y_DAT5
        vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32], Y_DAT6
        vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32], Y_DAT7

        ; Restore Y_A and Y_B
        vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
        vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]

        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44

%assign I (I+1)

        ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2,
        ; so save them to the stack and restore them after the transpose
        vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
        vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B

        vmovdqu Y_DAT0, [inp0+IDX+I*32]
        vmovdqu Y_DAT1, [inp1+IDX+I*32]
        vmovdqu Y_DAT2, [inp2+IDX+I*32]
        vmovdqu Y_DAT3, [inp3+IDX+I*32]
        vmovdqu Y_DAT4, [inp4+IDX+I*32]
        vmovdqu Y_DAT5, [inp5+IDX+I*32]
        vmovdqu Y_DAT6, [inp6+IDX+I*32]
        vmovdqu Y_DAT7, [inp7+IDX+I*32]
        TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
        vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32], Y_DAT0
        vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32], Y_DAT1
        vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32], Y_DAT2
        vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32], Y_DAT3
        vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32], Y_DAT4
        vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32], Y_DAT5
        vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32], Y_DAT6
        vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32], Y_DAT7

        ; Restore Y_A and Y_B
        vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
        vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]

        ; Add results to old digest values
        vpaddd  Y_A, Y_A, [Y_AA]
        vpaddd  Y_B, Y_B, [Y_BB]
        vpaddd  Y_C, Y_C, [Y_CC]
        vpaddd  Y_D, Y_D, [Y_DD]

        vpaddd  Y_A2, Y_A2, [Y_AA2]
        vpaddd  Y_B2, Y_B2, [Y_BB2]
        vpaddd  Y_C2, Y_C2, [Y_CC2]
        vpaddd  Y_D2, Y_D2, [Y_DD2]

        ; Swap DPTR1 and DPTR2 so the next iteration hashes the buffer just filled
        xchg    DPTR1, DPTR2

        ;; Proceed to processing of next block
        jmp     lloop

lastblock:

        ; Perform the 64 rounds of processing ...
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
        MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
        MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
        MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
        MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14

        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
        MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
        MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
        MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
        MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24

        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
        MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
        MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
        MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
        MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34

        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
        MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
        MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
        MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
        MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44

        ;; Advance the saved data pointers past the bytes just processed
%assign I 0
%rep 8
        mov     inp0, [state + _data_ptr + (2*I)*8]
        mov     inp1, [state + _data_ptr + (2*I +1)*8]
        add     inp0, IDX
        add     inp1, IDX
        mov     [state + _data_ptr + (2*I)*8], inp0
        mov     [state + _data_ptr + (2*I+1)*8], inp1
%assign I (I+1)
%endrep

        vpaddd  Y_A, Y_A, [Y_AA]
        vpaddd  Y_B, Y_B, [Y_BB]
        vpaddd  Y_C, Y_C, [Y_CC]
        vpaddd  Y_D, Y_D, [Y_DD]

        vpaddd  Y_A2, Y_A2, [Y_AA2]
        vpaddd  Y_B2, Y_B2, [Y_BB2]
        vpaddd  Y_C2, Y_C2, [Y_CC2]
        vpaddd  Y_D2, Y_D2, [Y_DD2]

        vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE], Y_A
        vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE], Y_B
        vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE], Y_C
        vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE], Y_D

        vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32], Y_A2 ;; 32 is YMM width
        vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32], Y_B2
        vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32], Y_C2
        vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32], Y_D2

        ;;;;;;;;;;;;;;;;
        ;; Postamble

        mov     rsp, [rsp + _RSP_SAVE]

        ret

section .data
align 64
MD5_TABLE:
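;; Each round constant T[i] is stored replicated eight times (two rows
;; of four dwords) so a single 32-byte load broadcasts it to all lanes.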
        dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
        dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
        dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
        dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
        dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
        dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
        dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
        dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
        dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
        dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
        dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
        dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
        dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
        dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
        dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
        dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
        dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
        dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
        dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
        dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
        dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
        dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
        dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
        dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
        dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
        dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
        dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
        dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
        dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
        dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
        dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
        dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
        dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
        dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
        dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
        dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
        dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
        dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
        dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
        dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
        dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
        dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
        dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
        dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
        dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
        dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
        dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
        dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
        dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
        dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
        dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
        dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
        dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
        dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
        dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
        dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
        dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
        dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
        dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
        dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
        dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
        dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
        dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
        dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
        dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
        dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
        dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
        dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
        dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
        dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
        dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
        dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
        dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
        dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
        dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
        dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
        dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
        dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
        dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
        dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
        dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
        dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
        dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
        dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
        dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
        dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
        dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
        dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
        dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
        dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
        dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
        dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
        dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
        dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
        dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
        dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
        dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
        dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
        dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
        dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
        dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
        dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
        dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
        dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
        dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
        dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
        dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
        dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
        dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
        dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
        dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
        dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
        dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
        dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
        dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
        dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
        dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
        dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
        dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
        dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
        dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
        dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
        dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
        dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
        dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
        dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
        dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
        dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
ONES:   dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
        dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff